Function inside an array php - php

So I have a web crawler that I am working on. And I have a CSV file that have about one million websites that I want to pass to be crawled. My problem is that I am able to save the CSV file in an array but when I pass it to the method that crawls it; it seems that it takes the first element and crawls it not the whole array. Can someone help me?
<?php
include("classes/DomDocumentParser.php");
include("config.php");
$alreadyCrawled = array();
$crawling = array();
$alreadyFoundImages = array();
$my_list = array();
function linkExists($url){
global $con;
$query = $con->prepare("SELECT * FROM sites WHERE url = :url");
$query ->bindParam(":url",$url);
$query->execute();
return $query->rowCount() != 0;
}
function insertImage($url,$src,$title,$alt){
global $con;
$query = $con->prepare("INSERT INTO images(siteUrl, imageUrl, alt, title)
VALUES(:siteUrl,:imageUrl,:alt,:title)");
$query ->bindParam(":siteUrl",$url);
$query ->bindParam(":imageUrl",$src);
$query ->bindParam(":alt",$alt);
$query ->bindParam(":title",$title);
return $query->execute();
}
function insertLink($url,$title,$description,$keywords){
global $con;
$query = $con->prepare("INSERT INTO sites(Url, title, description, keywords)
VALUES(:url,:title,:description,:keywords)");
$query ->bindParam(":url",$url);
$query ->bindParam(":title",$title);
$query ->bindParam(":description",$description);
$query ->bindParam(":keywords",$keywords);
return $query->execute();
}
function createLink($src,$url){
$scheme = parse_url($url)["scheme"]; // http or https
$host = parse_url($url)["host"]; // www.mohamad-ahmad.com
if(substr($src,0,2) =="//"){
// //www.mohanadahmad.com
$src = $scheme . ":" . $src;
}
else if(substr($src,0,1) =="/"){
// /aboutus/about.php
$src = $scheme . "://" . $host . $src;
}
else if(substr($src,0,2) =="./"){
// ./aboutus/about.php
$src = $scheme . "://" . $host . dirname(parse_url($url)["path"]) . substr($src ,1);
}
else if(substr($src,0,3) =="../"){
// ../aboutus/about.php
$src = $scheme . "://" . $host . "/" . $src;
}
else if(substr($src,0,5) !="https" && substr($src,0,4) !="http" ){
// aboutus/about.php
$src = $scheme . "://" . $host ."/" .$src;
}
return $src;
}
function getDetails($url){
global $alreadyFoundImages;
$parser = new DomDocumentParser($url);
$titleArray = $parser->getTitletags();
if(sizeof($titleArray) == 0 || $titleArray->item(0) == NULL){
return;
}
$title = $titleArray -> item(0) -> nodeValue;
$title = str_replace("\n","",$title);
if($title == ""){
return;
}
$description="";
$keywords="";
$metasArray = $parser -> getMetatags();
foreach($metasArray as $meta){
if($meta->getAttribute("name") == "description"){
$description = $meta -> getAttribute("content");
}
if($meta->getAttribute("name") == "keywords"){
$keywords = $meta -> getAttribute("content");
}
}
$description = str_replace("\n","",$description);
$keywords = str_replace("\n","",$keywords);
if(linkExists($url)){
echo "$url already exists <br>";
}
else if(insertLink($url,$title,$description,$keywords)){
echo "SUCCESS: $url <br>";
}
else{
echo "ERROR: Failed to insert $url <br>";
}
$imageArray = $parser ->getImages();
foreach($imageArray as $image){
$src = $image->getAttribute("src");
$alt = $image->getAttribute("alt");
$title = $image->getAttribute("title");
if(!$title && !$alt){
continue;
}
$src = createLink($src,$url);
if(!in_array($src,$alreadyFoundImages)){
$alreadyFoundImages[] = $src;
insertImage($url,$src,$alt,$title);
}
}
}
function followLinks($url) {
global $crawling;
global $alreadyCrawled;
$parser = new DomDocumentParser($url);
$linkList = $parser->getLinks();
foreach($linkList as $link){
$href = $link->getAttribute("href");
if(strpos($href,"#") !==false){
// Ignore anchor url
continue;
}
else if(substr($href,0,11)== "javascript:"){
// Ignore javascript url
continue;
}
$href = createLink($href,$url);
if(!in_array($href,$alreadyCrawled)){
$alreadyCrawled[] = $href;
$crawling[] = $href;
//getDetails contain the insert into db
getDetails($href);
}
}
array_shift($crawling);
foreach($crawling as $site){
followLinks($site);
}
}
function fill_my_list(){
global $my_list;
$file = fopen('top-1m.csv', 'r');
while( ($data = fgetcsv($file)) !== false ) {
$startUrl = "https://www.".$data[1];
$my_list[] = $startUrl;
}
foreach($my_list as $key => $u){
followLinks($u);
}
}
fill_my_list();
?>

You can do something like this by php.net
$row = 1;
if (($File = fopen("test.csv", "r")) !== FALSE) {
while (($data = fgetcsv($File, 1000, ",")) !== FALSE) {
$num = count($data);
echo "<p> $num fields in line $row: <br /></p>\n";
$row++;
for ($c=0; $c < $num; $c++) {
echo $data[$c] . "<br />\n";
}
//Use $data[$c];
}
fclose($File);
}
Here more examples : https://www.php.net/manual/en/function.fgetcsv.php#refsect1-function.fgetcsv-examples

Related

Script not Iterating through all files in directory

PHP Script written for SuiteCRM improting is supposed to loop through all files in the directory and import each to Database. Having a lot of trouble.
Script reads and works on the first file only then finishes when it should loop :(
Importing works fine and data is added the the database from first file.
function MemberImportJob()
{
try
{
$config = new Configurator();
$config->loadConfig();
$xmlDataDir = 'custom/wimporter/eidimport';
$directoryContent = scandir($xmlDataDir);
//foreach ($directoryContent as $itemFile)
foreach (glob($xmlDataDir) as $itemfile)
{
var_dump($itemfile);
if (is_dir($xmlDataDir . DIRECTORY_SEPARATOR . $itemFile)) continue;
if (strcasecmp(substr($itemFile, -4), ".csv") != 0) continue;
$oFile = fopen($xmlDataDir . DIRECTORY_SEPARATOR . $itemFile, 'r');
if ($oFile !== FALSE)
{
$header = NULL;
$data = Array();
while (($data[] = fgetcsv($oFile, 90000, ',')) !== FALSE) { }
fclose($oFile);
//combine into a nice associative array:
$arow=Array();
$fields = array_shift($data);
foreach ($data as $i=>$arow)
{
array_combine " . $i);
if (is_array($arow)) {
$data[$i] = array_combine($fields, $arow);}
}
unset($arow);
$num = count($data);
for ($row=0; $row < $num - 1; $row++)
{
$Member = BeanFactory::getBean("locte_Membership");
$Member=$Member->retrieve_by_string_fields(array('last_name' => $data[$row]["LAST NAME"], 'first_name' => $data[$row]["FIRST NAME"], 'lcl_affiliate_number' => $data[$row]["AFFILIATE"]));
$MemberID = $Member->id;
if (is_null($Member)) {
$Member = BeanFactory::newBean('locte_Membership');
$delta = fillPerson($data[$row], $Member, "FULL NAME");
if(count($delta))
{
$Member_id = $Member->save();
}
} else {
v
var_dump($Member->id);
$delta = fillFoundrecord($data[$row], $Member, "FULL NAME");
echo("record Updated!");
$Member_id = $Member->save();
}
unset($data[$row]);
}
}
return true;
}
} catch (Exception $e)
{
return false;
}
}

How to refresh opencart modification by code

I need to refresh modifications after install module.
public function install() {
$this->load->controller('marketplace/modification/refresh');
}
I tried this. Its worked but the page redirected to modification listing. How can i do without redirect. I am using opencart 3.
If you don't want to edit modification.php or clone its refresh function, You can use this:
public function install(){
$data['redirect'] = 'extension/extension/module';
$this->load->controller('marketplace/modification/refresh', $data);
}
You could not controll by this way as you are doing:
You need to do this as
public function install() {
$this->refresh();
}
protected function refresh($data = array()) {
$this->load->language('marketplace/modification');
$this->document->setTitle($this->language->get('heading_title'));
$this->load->model('setting/modification');
if ($this->validate()) {
// Just before files are deleted, if config settings say maintenance mode is off then turn it on
$maintenance = $this->config->get('config_maintenance');
$this->load->model('setting/setting');
$this->model_setting_setting->editSettingValue('config', 'config_maintenance', true);
//Log
$log = array();
// Clear all modification files
$files = array();
// Make path into an array
$path = array(DIR_MODIFICATION . '*');
// While the path array is still populated keep looping through
while (count($path) != 0) {
$next = array_shift($path);
foreach (glob($next) as $file) {
// If directory add to path array
if (is_dir($file)) {
$path[] = $file . '/*';
}
// Add the file to the files to be deleted array
$files[] = $file;
}
}
// Reverse sort the file array
rsort($files);
// Clear all modification files
foreach ($files as $file) {
if ($file != DIR_MODIFICATION . 'index.html') {
// If file just delete
if (is_file($file)) {
unlink($file);
// If directory use the remove directory function
} elseif (is_dir($file)) {
rmdir($file);
}
}
}
// Begin
$xml = array();
// Load the default modification XML
$xml[] = file_get_contents(DIR_SYSTEM . 'modification.xml');
// This is purly for developers so they can run mods directly and have them run without upload after each change.
$files = glob(DIR_SYSTEM . '*.ocmod.xml');
if ($files) {
foreach ($files as $file) {
$xml[] = file_get_contents($file);
}
}
// Get the default modification file
$results = $this->model_setting_modification->getModifications();
foreach ($results as $result) {
if ($result['status']) {
$xml[] = $result['xml'];
}
}
$modification = array();
foreach ($xml as $xml) {
if (empty($xml)){
continue;
}
$dom = new DOMDocument('1.0', 'UTF-8');
$dom->preserveWhiteSpace = false;
$dom->loadXml($xml);
// Log
$log[] = 'MOD: ' . $dom->getElementsByTagName('name')->item(0)->textContent;
// Wipe the past modification store in the backup array
$recovery = array();
// Set the a recovery of the modification code in case we need to use it if an abort attribute is used.
if (isset($modification)) {
$recovery = $modification;
}
$files = $dom->getElementsByTagName('modification')->item(0)->getElementsByTagName('file');
foreach ($files as $file) {
$operations = $file->getElementsByTagName('operation');
$files = explode('|', $file->getAttribute('path'));
foreach ($files as $file) {
$path = '';
// Get the full path of the files that are going to be used for modification
if ((substr($file, 0, 7) == 'catalog')) {
$path = DIR_CATALOG . substr($file, 8);
}
if ((substr($file, 0, 5) == 'admin')) {
$path = DIR_APPLICATION . substr($file, 6);
}
if ((substr($file, 0, 6) == 'system')) {
$path = DIR_SYSTEM . substr($file, 7);
}
if ($path) {
$files = glob($path, GLOB_BRACE);
if ($files) {
foreach ($files as $file) {
// Get the key to be used for the modification cache filename.
if (substr($file, 0, strlen(DIR_CATALOG)) == DIR_CATALOG) {
$key = 'catalog/' . substr($file, strlen(DIR_CATALOG));
}
if (substr($file, 0, strlen(DIR_APPLICATION)) == DIR_APPLICATION) {
$key = 'admin/' . substr($file, strlen(DIR_APPLICATION));
}
if (substr($file, 0, strlen(DIR_SYSTEM)) == DIR_SYSTEM) {
$key = 'system/' . substr($file, strlen(DIR_SYSTEM));
}
// If file contents is not already in the modification array we need to load it.
if (!isset($modification[$key])) {
$content = file_get_contents($file);
$modification[$key] = preg_replace('~\r?\n~', "\n", $content);
$original[$key] = preg_replace('~\r?\n~', "\n", $content);
// Log
$log[] = PHP_EOL . 'FILE: ' . $key;
}
foreach ($operations as $operation) {
$error = $operation->getAttribute('error');
// Ignoreif
$ignoreif = $operation->getElementsByTagName('ignoreif')->item(0);
if ($ignoreif) {
if ($ignoreif->getAttribute('regex') != 'true') {
if (strpos($modification[$key], $ignoreif->textContent) !== false) {
continue;
}
} else {
if (preg_match($ignoreif->textContent, $modification[$key])) {
continue;
}
}
}
$status = false;
// Search and replace
if ($operation->getElementsByTagName('search')->item(0)->getAttribute('regex') != 'true') {
// Search
$search = $operation->getElementsByTagName('search')->item(0)->textContent;
$trim = $operation->getElementsByTagName('search')->item(0)->getAttribute('trim');
$index = $operation->getElementsByTagName('search')->item(0)->getAttribute('index');
// Trim line if no trim attribute is set or is set to true.
if (!$trim || $trim == 'true') {
$search = trim($search);
}
// Add
$add = $operation->getElementsByTagName('add')->item(0)->textContent;
$trim = $operation->getElementsByTagName('add')->item(0)->getAttribute('trim');
$position = $operation->getElementsByTagName('add')->item(0)->getAttribute('position');
$offset = $operation->getElementsByTagName('add')->item(0)->getAttribute('offset');
if ($offset == '') {
$offset = 0;
}
// Trim line if is set to true.
if ($trim == 'true') {
$add = trim($add);
}
// Log
$log[] = 'CODE: ' . $search;
// Check if using indexes
if ($index !== '') {
$indexes = explode(',', $index);
} else {
$indexes = array();
}
// Get all the matches
$i = 0;
$lines = explode("\n", $modification[$key]);
for ($line_id = 0; $line_id < count($lines); $line_id++) {
$line = $lines[$line_id];
// Status
$match = false;
// Check to see if the line matches the search code.
if (stripos($line, $search) !== false) {
// If indexes are not used then just set the found status to true.
if (!$indexes) {
$match = true;
} elseif (in_array($i, $indexes)) {
$match = true;
}
$i++;
}
// Now for replacing or adding to the matched elements
if ($match) {
switch ($position) {
default:
case 'replace':
$new_lines = explode("\n", $add);
if ($offset < 0) {
array_splice($lines, $line_id + $offset, abs($offset) + 1, array(str_replace($search, $add, $line)));
$line_id -= $offset;
} else {
array_splice($lines, $line_id, $offset + 1, array(str_replace($search, $add, $line)));
}
break;
case 'before':
$new_lines = explode("\n", $add);
array_splice($lines, $line_id - $offset, 0, $new_lines);
$line_id += count($new_lines);
break;
case 'after':
$new_lines = explode("\n", $add);
array_splice($lines, ($line_id + 1) + $offset, 0, $new_lines);
$line_id += count($new_lines);
break;
}
// Log
$log[] = 'LINE: ' . $line_id;
$status = true;
}
}
$modification[$key] = implode("\n", $lines);
} else {
$search = trim($operation->getElementsByTagName('search')->item(0)->textContent);
$limit = $operation->getElementsByTagName('search')->item(0)->getAttribute('limit');
$replace = trim($operation->getElementsByTagName('add')->item(0)->textContent);
// Limit
if (!$limit) {
$limit = -1;
}
// Log
$match = array();
preg_match_all($search, $modification[$key], $match, PREG_OFFSET_CAPTURE);
// Remove part of the the result if a limit is set.
if ($limit > 0) {
$match[0] = array_slice($match[0], 0, $limit);
}
if ($match[0]) {
$log[] = 'REGEX: ' . $search;
for ($i = 0; $i < count($match[0]); $i++) {
$log[] = 'LINE: ' . (substr_count(substr($modification[$key], 0, $match[0][$i][1]), "\n") + 1);
}
$status = true;
}
// Make the modification
$modification[$key] = preg_replace($search, $replace, $modification[$key], $limit);
}
if (!$status) {
// Abort applying this modification completely.
if ($error == 'abort') {
$modification = $recovery;
// Log
$log[] = 'NOT FOUND - ABORTING!';
break 5;
}
// Skip current operation or break
elseif ($error == 'skip') {
// Log
$log[] = 'NOT FOUND - OPERATION SKIPPED!';
continue;
}
// Break current operations
else {
// Log
$log[] = 'NOT FOUND - OPERATIONS ABORTED!';
break;
}
}
}
}
}
}
}
}
// Log
$log[] = '----------------------------------------------------------------';
}
// Log
$ocmod = new Log('ocmod.log');
$ocmod->write(implode("\n", $log));
// Write all modification files
foreach ($modification as $key => $value) {
// Only create a file if there are changes
if ($original[$key] != $value) {
$path = '';
$directories = explode('/', dirname($key));
foreach ($directories as $directory) {
$path = $path . '/' . $directory;
if (!is_dir(DIR_MODIFICATION . $path)) {
#mkdir(DIR_MODIFICATION . $path, 0777);
}
}
$handle = fopen(DIR_MODIFICATION . $key, 'w');
fwrite($handle, $value);
fclose($handle);
}
}
// Maintance mode back to original settings
$this->model_setting_setting->editSettingValue('config', 'config_maintenance', $maintenance);
// Do not return success message if refresh() was called with $data
$this->session->data['success'] = $this->language->get('text_success');
$url = '';
if (isset($this->request->get['sort'])) {
$url .= '&sort=' . $this->request->get['sort'];
}
if (isset($this->request->get['order'])) {
$url .= '&order=' . $this->request->get['order'];
}
if (isset($this->request->get['page'])) {
$url .= '&page=' . $this->request->get['page'];
}
}
}
I hope it shouwl work for you.
This process is used to refresh the modification when your module installing.
if you need globally this then please tell me I will update you process.

Function/filter/action to replace product title for woocommerce

I have this function
function my_product_title($title, $id)
{
if(in_the_loop() && is_product())
{
return '<span class="border">FooBar</span>';
}
return $title;
}
add_filter( 'the_title', 'my_product_title', 5, 2);
and it can replace the product title with return '<span class="border">FooBar</span>';.
I also have a custom script in "mycustomtitle.php" that can modify the products titles and my script can echo that modified title as $mycustomtitle
I want to replace the original product title with my $mycustomtitle without changing anything in the core files.
I've tried to just change return '<span class="border">FooBar</span>'; to $mycustomtitle but it only removes the original title and gives no output at all...
Thanks!
UPDATE 2016-10-20 With custom code:
<?php
include $_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/seo-engine/explode.php';
$tit1 = $boomprint[array_rand($boomprint)].' '.file_get_contents($_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/write-to/product-title-h1.php');
$tit2 = $boomprint[array_rand($boomprint)].' '.file_get_contents($_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/write-to/product-title-h1.php');
$title_keys = array($tit1,$tit2);
$title_key11 = $title_keys[array_rand($title_keys)];
if(!function_exists(spin11)){
function spin11($string11) {
while(true) {
if(!preg_match_all('/({([^\{]*?)\})/', $string11, $matches))
break;
foreach($matches[2] as $i => $match) {
$parts = explode('|', $match);
$string11 = str_replace_once11($matches[0][$i], $parts[mt_rand(0, count($parts)-1)], $string11);
}
}
return $string11;
}
}
if(!function_exists(str_replace_once11)){
function str_replace_once11($from,$to,$str)
{
$str = explode($from,$str,2);
return $str[0].$to.$str[1];
}
$title_id11 = get_the_ID();
$fileLocation11 = getenv("DOCUMENT_ROOT") . '/wp-content/plugins/seo-controlpanel/seo-cache/product-title-h1/'.$title_id11.'.txt';
if(!file_exists($fileLocation11)){
$file11 = fopen($fileLocation11,"w");
$content11 = spin11($title_key11);
fwrite($file11,$content11);
fclose($file11);
}
if(file_exists($fileLocation11)){
$myFile11 = $fileLocation11;
$fh11 = fopen($myFile11, 'r');
$theData11 = fread($fh11, filesize($myFile11));
fclose($fh11);
}
}
?>
Insert your custom code in your function.
function my_product_title($title, $id) {
include $_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/seo-engine/explode.php';
$tit1 = $boomprint[array_rand($boomprint)].' '.file_get_contents($_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/write-to/product-title-h1.php');
$tit2 = $boomprint[array_rand($boomprint)].' '.file_get_contents($_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/write-to/product-title-h1.php');
$title_keys = array($tit1,$tit2);
$title_key11 = $title_keys[array_rand($title_keys)];
if(!function_exists(spin11)){
function spin11($string11) {
while(true) {
if(!preg_match_all('/({([^\{]*?)\})/', $string11, $matches))
break;
foreach($matches[2] as $i => $match) {
$parts = explode('|', $match);
$string11 = str_replace_once11($matches[0][$i], $parts[mt_rand(0, count($parts)-1)], $string11);
}
}
return $string11;
}
}
if(!function_exists(str_replace_once11)){
function str_replace_once11($from,$to,$str)
{
$str = explode($from,$str,2);
return $str[0].$to.$str[1];
}
$title_id11 = get_the_ID();
$fileLocation11 = getenv("DOCUMENT_ROOT") . '/wp-content/plugins/seo-controlpanel/seo-cache/product-title-h1/'.$title_id11.'.txt';
if(!file_exists($fileLocation11)){
$file11 = fopen($fileLocation11,"w");
$content11 = spin11($title_key11);
fwrite($file11,$content11);
fclose($file11);
}
if(file_exists($fileLocation11)){
$myFile11 = $fileLocation11;
$fh11 = fopen($myFile11, 'r');
$theData11 = fread($fh11, filesize($myFile11));
fclose($fh11);
}
}
if(in_the_loop() && is_product()) {
return $theData11;
}
return $title;
}
add_filter( 'the_title', 'my_product_title', 5, 2);
Is $mycustomtitle global variable or it is defined inside another function/method/class?
If it is global variable this may help:
function my_product_title($title, $id)
{
global $mycustomtitle;
if(in_the_loop() && is_product())
{
return $mycustomtitle;
}
return $title;
}
<?php
include $_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/seo-engine/explode.php';
$tit1 = $boomprint[array_rand($boomprint)].' '.file_get_contents($_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/write-to/product-title-h1.php');
$tit2 = $boomprint[array_rand($boomprint)].' '.file_get_contents($_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/write-to/product-title-h1.php');
$title_keys = array($tit1,$tit2);
$title_key11 = $title_keys[array_rand($title_keys)];
if(!function_exists(spin11)){
function spin11($string11) {
while(true) {
if(!preg_match_all('/({([^\{]*?)\})/', $string11, $matches))
break;
foreach($matches[2] as $i => $match) {
$parts = explode('|', $match);
$string11 = str_replace_once11($matches[0][$i], $parts[mt_rand(0, count($parts)-1)], $string11);
}
}
return $string11;
}
}
if(!function_exists(str_replace_once11)){
function str_replace_once11($from,$to,$str)
{
$str = explode($from,$str,2);
return $str[0].$to.$str[1];
}
$title_id11 = get_the_ID();
$fileLocation11 = getenv("DOCUMENT_ROOT") . '/wp-content/plugins/seo-controlpanel/seo-cache/product-title-h1/'.$title_id11.'.txt';
if(!file_exists($fileLocation11)){
$file11 = fopen($fileLocation11,"w");
$content11 = spin11($title_key11);
fwrite($file11,$content11);
fclose($file11);
}
if(file_exists($fileLocation11)){
$myFile11 = $fileLocation11;
$fh11 = fopen($myFile11, 'r');
$theData11 = fread($fh11, filesize($myFile11));
fclose($fh11);
}
}
//echo $theData11;
?>

Change php script with variables from working in http to working in shell

I use a script from here to generate my sitemaps.
I can call it with the browser with http://www.example.com/sitemap.php?update=pages and its working fine.
I need to call it as shell script so that I can automate it with the windows task scheduler. But the script needs to be changed to get the variables ?update=pages. But I don't manage to change it correctly.
Could anybody help me so that I can execute the script from command line with
...\php C:\path\to\script\sitemap.php update=pages. It would also be fine for me to hardcode the variables into the script since I wont change them anyway.
define("BASE_URL", "http://www.example.com/");
define ('BASE_URI', $_SERVER['DOCUMENT_ROOT'] . '/');
class Sitemap {
private $compress;
private $page = 'index';
private $index = 1;
private $count = 1;
private $urls = array();
public function __construct ($compress=true) {
ini_set('memory_limit', '75M'); // 50M required per tests
$this->compress = ($compress) ? '.gz' : '';
}
public function page ($name) {
$this->save();
$this->page = $name;
$this->index = 1;
}
public function url ($url, $lastmod='', $changefreq='', $priority='') {
$url = htmlspecialchars(BASE_URL . 'xx' . $url);
$lastmod = (!empty($lastmod)) ? date('Y-m-d', strtotime($lastmod)) : false;
$changefreq = (!empty($changefreq) && in_array(strtolower($changefreq), array('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'))) ? strtolower($changefreq) : false;
$priority = (!empty($priority) && is_numeric($priority) && abs($priority) <= 1) ? round(abs($priority), 1) : false;
if (!$lastmod && !$changefreq && !$priority) {
$this->urls[] = $url;
} else {
$url = array('loc'=>$url);
if ($lastmod !== false) $url['lastmod'] = $lastmod;
if ($changefreq !== false) $url['changefreq'] = $changefreq;
if ($priority !== false) $url['priority'] = ($priority < 1) ? $priority : '1.0';
$this->urls[] = $url;
}
if ($this->count == 50000) {
$this->save();
} else {
$this->count++;
}
}
public function close() {
$this->save();
}
private function save () {
if (empty($this->urls)) return;
$file = "sitemaps/xx-sitemap-{$this->page}-{$this->index}.xml{$this->compress}";
$xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
$xml .= '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
foreach ($this->urls as $url) {
$xml .= ' <url>' . "\n";
if (is_array($url)) {
foreach ($url as $key => $value) $xml .= " <{$key}>{$value}</{$key}>\n";
} else {
$xml .= " <loc>{$url}</loc>\n";
}
$xml .= ' </url>' . "\n";
}
$xml .= '</urlset>' . "\n";
$this->urls = array();
if (!empty($this->compress)) $xml = gzencode($xml, 9);
$fp = fopen(BASE_URI . $file, 'wb');
fwrite($fp, $xml);
fclose($fp);
$this->index++;
$this->count = 1;
$num = $this->index; // should have already been incremented
while (file_exists(BASE_URI . "xxb-sitemap-{$this->page}-{$num}.xml{$this->compress}")) {
unlink(BASE_URI . "xxc-sitemap-{$this->page}-{$num}.xml{$this->compress}");
$num++;
}
$this->index($file);
}
private function index ($file) {
$sitemaps = array();
$index = "sitemaps/xx-sitemap-index.xml{$this->compress}";
if (file_exists(BASE_URI . $index)) {
$xml = (!empty($this->compress)) ? gzfile(BASE_URI . $index) : file(BASE_URI . $index);
$tags = $this->xml_tag(implode('', $xml), array('sitemap'));
foreach ($tags as $xml) {
$loc = str_replace(BASE_URL, '', $this->xml_tag($xml, 'loc'));
$lastmod = $this->xml_tag($xml, 'lastmod');
$lastmod = ($lastmod) ? date('Y-m-d', strtotime($lastmod)) : date('Y-m-d');
if (file_exists(BASE_URI . $loc)) $sitemaps[$loc] = $lastmod;
}
}
$sitemaps[$file] = date('Y-m-d');
$xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
$xml .= '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
foreach ($sitemaps as $loc => $lastmod) {
$xml .= ' <sitemap>' . "\n";
$xml .= ' <loc>' . BASE_URL . $loc . '</loc>' . "\n";
$xml .= ' <lastmod>' . $lastmod . '</lastmod>' . "\n";
$xml .= ' </sitemap>' . "\n";
}
$xml .= '</sitemapindex>' . "\n";
if (!empty($this->compress)) $xml = gzencode($xml, 9);
$fp = fopen(BASE_URI . $index, 'wb');
fwrite($fp, $xml);
fclose($fp);
}
private function xml_tag ($xml, $tag, &$end='') {
if (is_array($tag)) {
$tags = array();
while ($value = $this->xml_tag($xml, $tag[0], $end)) {
$tags[] = $value;
$xml = substr($xml, $end);
}
return $tags;
}
$pos = strpos($xml, "<{$tag}>");
if ($pos === false) return false;
$start = strpos($xml, '>', $pos) + 1;
$length = strpos($xml, "</{$tag}>", $start) - $start;
$end = strpos($xml, '>', $start + $length) + 1;
return ($end !== false) ? substr($xml, $start, $length) : false;
}
public function __destruct () {
$this->save();
}
}
// start part 2
$sitemap = new Sitemap;
if (get('pages')) {
$sitemap->page('pages');
$result = mysql_query("SELECT uri FROM app_uri");
while (list($url, $created) = mysql_fetch_row($result)) {
$sitemap->url($url, $created, 'monthly');
}
}
$sitemap->close();
unset ($sitemap);
function get ($name) {
return (isset($_GET['update']) && strpos($_GET['update'], $name) !== false) ? true : false;
}
?>
I could install wget (it's available for windows as well) and then call the url via localhost in the task scheduler script:
wget.exe "http://localhost/path/to/script.php?pages=test"
This way you wouldn't have to rewrite the php script.
Otherwise, if the script is meant for shell usage only, then pass variables via command line:
php yourscript.php variable1 variable2 ...
In the php script you can than access those variables using the $argv variable:
$variable1 = $argv[1];
$variable2 = $argv[2];
have a look on:
How to pass GET variables to php file with Shell?
which already answered the same question :).

php sitemap for large websites

I want to create a sitemap for a page with more than 30.000.000 pages. The page is daily updating, removing and adding new pages.
I found this php script which I would like to run with a cron job.
Sitemap php script
I have all URIs in the table "myuri" in the column "uri" entries are written e.g. "/this-is-a-page.html". What parameters do I need to add to the script to get it running on my table?
<?php
/*
* author: Kyle Gadd
* documentation: http://www.php-ease.com/classes/sitemap.html
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
class Sitemap {
private $compress;
private $page = 'index';
private $index = 1;
private $count = 1;
private $urls = array();
public function __construct ($compress=true) {
ini_set('memory_limit', '75M'); // 50M required per tests
$this->compress = ($compress) ? '.gz' : '';
}
public function page ($name) {
$this->save();
$this->page = $name;
$this->index = 1;
}
public function url ($url, $lastmod='', $changefreq='', $priority='') {
$url = htmlspecialchars(BASE_URL . $url);
$lastmod = (!empty($lastmod)) ? date('Y-m-d', strtotime($lastmod)) : false;
$changefreq = (!empty($changefreq) && in_array(strtolower($changefreq), array('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'))) ? strtolower($changefreq) : false;
$priority = (!empty($priority) && is_numeric($priority) && abs($priority) <= 1) ? round(abs($priority), 1) : false;
if (!$lastmod && !$changefreq && !$priority) {
$this->urls[] = $url;
} else {
$url = array('loc'=>$url);
if ($lastmod !== false) $url['lastmod'] = $lastmod;
if ($changefreq !== false) $url['changefreq'] = $changefreq;
if ($priority !== false) $url['priority'] = ($priority < 1) ? $priority : '1.0';
$this->urls[] = $url;
}
if ($this->count == 50000) {
$this->save();
} else {
$this->count++;
}
}
public function close() {
$this->save();
$this->ping_search_engines();
}
private function save () {
if (empty($this->urls)) return;
$file = "sitemap-{$this->page}-{$this->index}.xml{$this->compress}";
$xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
$xml .= '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
foreach ($this->urls as $url) {
$xml .= ' <url>' . "\n";
if (is_array($url)) {
foreach ($url as $key => $value) $xml .= " <{$key}>{$value}</{$key}>\n";
} else {
$xml .= " <loc>{$url}</loc>\n";
}
$xml .= ' </url>' . "\n";
}
$xml .= '</urlset>' . "\n";
$this->urls = array();
if (!empty($this->compress)) $xml = gzencode($xml, 9);
$fp = fopen(BASE_URI . $file, 'wb');
fwrite($fp, $xml);
fclose($fp);
$this->index++;
$this->count = 1;
$num = $this->index; // should have already been incremented
while (file_exists(BASE_URI . "sitemap-{$this->page}-{$num}.xml{$this->compress}")) {
unlink(BASE_URI . "sitemap-{$this->page}-{$num}.xml{$this->compress}");
$num++;
}
$this->index($file);
}
private function index ($file) {
$sitemaps = array();
$index = "sitemap-index.xml{$this->compress}";
if (file_exists(BASE_URI . $index)) {
$xml = (!empty($this->compress)) ? gzfile(BASE_URI . $index) : file(BASE_URI . $index);
$tags = $this->xml_tag(implode('', $xml), array('sitemap'));
foreach ($tags as $xml) {
$loc = str_replace(BASE_URL, '', $this->xml_tag($xml, 'loc'));
$lastmod = $this->xml_tag($xml, 'lastmod');
$lastmod = ($lastmod) ? date('Y-m-d', strtotime($lastmod)) : date('Y-m-d');
if (file_exists(BASE_URI . $loc)) $sitemaps[$loc] = $lastmod;
}
}
$sitemaps[$file] = date('Y-m-d');
$xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
$xml .= '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
foreach ($sitemaps as $loc => $lastmod) {
$xml .= ' <sitemap>' . "\n";
$xml .= ' <loc>' . BASE_URL . $loc . '</loc>' . "\n";
$xml .= ' <lastmod>' . $lastmod . '</lastmod>' . "\n";
$xml .= ' </sitemap>' . "\n";
}
$xml .= '</sitemapindex>' . "\n";
if (!empty($this->compress)) $xml = gzencode($xml, 9);
$fp = fopen(BASE_URI . $index, 'wb');
fwrite($fp, $xml);
fclose($fp);
}
private function xml_tag ($xml, $tag, &$end='') {
if (is_array($tag)) {
$tags = array();
while ($value = $this->xml_tag($xml, $tag[0], $end)) {
$tags[] = $value;
$xml = substr($xml, $end);
}
return $tags;
}
$pos = strpos($xml, "<{$tag}>");
if ($pos === false) return false;
$start = strpos($xml, '>', $pos) + 1;
$length = strpos($xml, "</{$tag}>", $start) - $start;
$end = strpos($xml, '>', $start + $length) + 1;
return ($end !== false) ? substr($xml, $start, $length) : false;
}
public function ping_search_engines () {
$sitemap = BASE_URL . 'sitemap-index.xml' . $this->compress;
$engines = array();
$engines['www.google.com'] = '/webmasters/tools/ping?sitemap=' . urlencode($sitemap);
$engines['www.bing.com'] = '/webmaster/ping.aspx?siteMap=' . urlencode($sitemap);
$engines['submissions.ask.com'] = '/ping?sitemap=' . urlencode($sitemap);
foreach ($engines as $host => $path) {
if ($fp = fsockopen($host, 80)) {
$send = "HEAD $path HTTP/1.1\r\n";
$send .= "HOST: $host\r\n";
$send .= "CONNECTION: Close\r\n\r\n";
fwrite($fp, $send);
$http_response = fgets($fp, 128);
fclose($fp);
list($response, $code) = explode (' ', $http_response);
if ($code != 200) trigger_error ("{$host} ping was unsuccessful.<br />Code: {$code}<br />Response: {$response}");
}
}
}
public function __destruct () {
$this->save();
}
}
?>
There is already an example of usage on the page:
<?php
require_once ('php/classes/Sitemap.php');
$sitemap = new Sitemap;
if (get('pages')) {
$sitemap->page('pages');
$result = db_query ("SELECT url, created FROM pages"); // 20 pages
while (list($url, $created) = $result->fetch_row()) {
$sitemap->url($url, $created, 'yearly');
}
}
if (get('posts')) {
$sitemap->page('posts');
$result = db_query ("SELECT url, updated FROM posts"); // 70,000 posts
while (list($url, $updated) = $result->fetch_row()) {
$sitemap->url($url, $updated, 'monthly');
}
}
$sitemap->close();
unset ($sitemap);
function get ($name) {
return (isset($_GET['update']) && strpos($_GET['update'], $name) !== false) ? true : false;
}
?>
I would change this part....
if (get('pages')) {
$sitemap->page('pages');
$result = db_query ("SELECT uri FROM myuri");
while (list($url) = mysql_fetch_row($result)) {
$sitemap->url($url,'', 'yearly');
}
}
Not sure if that $updated is needed? Looks like the function just defaults it to an empty string anyways...... But maybe you could at a timestamp column to your table to pull the last updated date as well, and feed it into the function where I put ''.
Also....remove this part...
if (get('posts')) {
$sitemap->page('posts');
$result = db_query ("SELECT url, updated FROM posts"); // 70,000 posts
while (list($url, $updated) = $result->fetch_row()) {
$sitemap->url($url, $updated, 'monthly');
}
}

Categories