So I have a web crawler that I am working on. And I have a CSV file that have about one million websites that I want to pass to be crawled. My problem is that I am able to save the CSV file in an array but when I pass it to the method that crawls it; it seems that it takes the first element and crawls it not the whole array. Can someone help me?
<?php
include("classes/DomDocumentParser.php");
include("config.php");
$alreadyCrawled = array();
$crawling = array();
$alreadyFoundImages = array();
$my_list = array();
function linkExists($url){
global $con;
$query = $con->prepare("SELECT * FROM sites WHERE url = :url");
$query ->bindParam(":url",$url);
$query->execute();
return $query->rowCount() != 0;
}
function insertImage($url,$src,$title,$alt){
global $con;
$query = $con->prepare("INSERT INTO images(siteUrl, imageUrl, alt, title)
VALUES(:siteUrl,:imageUrl,:alt,:title)");
$query ->bindParam(":siteUrl",$url);
$query ->bindParam(":imageUrl",$src);
$query ->bindParam(":alt",$alt);
$query ->bindParam(":title",$title);
return $query->execute();
}
function insertLink($url,$title,$description,$keywords){
global $con;
$query = $con->prepare("INSERT INTO sites(Url, title, description, keywords)
VALUES(:url,:title,:description,:keywords)");
$query ->bindParam(":url",$url);
$query ->bindParam(":title",$title);
$query ->bindParam(":description",$description);
$query ->bindParam(":keywords",$keywords);
return $query->execute();
}
function createLink($src,$url){
$scheme = parse_url($url)["scheme"]; // http or https
$host = parse_url($url)["host"]; // www.mohamad-ahmad.com
if(substr($src,0,2) =="//"){
// //www.mohanadahmad.com
$src = $scheme . ":" . $src;
}
else if(substr($src,0,1) =="/"){
// /aboutus/about.php
$src = $scheme . "://" . $host . $src;
}
else if(substr($src,0,2) =="./"){
// ./aboutus/about.php
$src = $scheme . "://" . $host . dirname(parse_url($url)["path"]) . substr($src ,1);
}
else if(substr($src,0,3) =="../"){
// ../aboutus/about.php
$src = $scheme . "://" . $host . "/" . $src;
}
else if(substr($src,0,5) !="https" && substr($src,0,4) !="http" ){
// aboutus/about.php
$src = $scheme . "://" . $host ."/" .$src;
}
return $src;
}
function getDetails($url){
global $alreadyFoundImages;
$parser = new DomDocumentParser($url);
$titleArray = $parser->getTitletags();
if(sizeof($titleArray) == 0 || $titleArray->item(0) == NULL){
return;
}
$title = $titleArray -> item(0) -> nodeValue;
$title = str_replace("\n","",$title);
if($title == ""){
return;
}
$description="";
$keywords="";
$metasArray = $parser -> getMetatags();
foreach($metasArray as $meta){
if($meta->getAttribute("name") == "description"){
$description = $meta -> getAttribute("content");
}
if($meta->getAttribute("name") == "keywords"){
$keywords = $meta -> getAttribute("content");
}
}
$description = str_replace("\n","",$description);
$keywords = str_replace("\n","",$keywords);
if(linkExists($url)){
echo "$url already exists <br>";
}
else if(insertLink($url,$title,$description,$keywords)){
echo "SUCCESS: $url <br>";
}
else{
echo "ERROR: Failed to insert $url <br>";
}
$imageArray = $parser ->getImages();
foreach($imageArray as $image){
$src = $image->getAttribute("src");
$alt = $image->getAttribute("alt");
$title = $image->getAttribute("title");
if(!$title && !$alt){
continue;
}
$src = createLink($src,$url);
if(!in_array($src,$alreadyFoundImages)){
$alreadyFoundImages[] = $src;
insertImage($url,$src,$alt,$title);
}
}
}
function followLinks($url) {
global $crawling;
global $alreadyCrawled;
$parser = new DomDocumentParser($url);
$linkList = $parser->getLinks();
foreach($linkList as $link){
$href = $link->getAttribute("href");
if(strpos($href,"#") !==false){
// Ignore anchor url
continue;
}
else if(substr($href,0,11)== "javascript:"){
// Ignore javascript url
continue;
}
$href = createLink($href,$url);
if(!in_array($href,$alreadyCrawled)){
$alreadyCrawled[] = $href;
$crawling[] = $href;
//getDetails contain the insert into db
getDetails($href);
}
}
array_shift($crawling);
foreach($crawling as $site){
followLinks($site);
}
}
function fill_my_list(){
global $my_list;
$file = fopen('top-1m.csv', 'r');
while( ($data = fgetcsv($file)) !== false ) {
$startUrl = "https://www.".$data[1];
$my_list[] = $startUrl;
}
foreach($my_list as $key => $u){
followLinks($u);
}
}
fill_my_list();
?>
You can do something like this by php.net
$row = 1;
if (($File = fopen("test.csv", "r")) !== FALSE) {
while (($data = fgetcsv($File, 1000, ",")) !== FALSE) {
$num = count($data);
echo "<p> $num fields in line $row: <br /></p>\n";
$row++;
for ($c=0; $c < $num; $c++) {
echo $data[$c] . "<br />\n";
}
//Use $data[$c];
}
fclose($File);
}
Here more examples : https://www.php.net/manual/en/function.fgetcsv.php#refsect1-function.fgetcsv-examples
Related
PHP Script written for SuiteCRM improting is supposed to loop through all files in the directory and import each to Database. Having a lot of trouble.
Script reads and works on the first file only then finishes when it should loop :(
Importing works fine and data is added the the database from first file.
function MemberImportJob()
{
try
{
$config = new Configurator();
$config->loadConfig();
$xmlDataDir = 'custom/wimporter/eidimport';
$directoryContent = scandir($xmlDataDir);
//foreach ($directoryContent as $itemFile)
foreach (glob($xmlDataDir) as $itemfile)
{
var_dump($itemfile);
if (is_dir($xmlDataDir . DIRECTORY_SEPARATOR . $itemFile)) continue;
if (strcasecmp(substr($itemFile, -4), ".csv") != 0) continue;
$oFile = fopen($xmlDataDir . DIRECTORY_SEPARATOR . $itemFile, 'r');
if ($oFile !== FALSE)
{
$header = NULL;
$data = Array();
while (($data[] = fgetcsv($oFile, 90000, ',')) !== FALSE) { }
fclose($oFile);
//combine into a nice associative array:
$arow=Array();
$fields = array_shift($data);
foreach ($data as $i=>$arow)
{
array_combine " . $i);
if (is_array($arow)) {
$data[$i] = array_combine($fields, $arow);}
}
unset($arow);
$num = count($data);
for ($row=0; $row < $num - 1; $row++)
{
$Member = BeanFactory::getBean("locte_Membership");
$Member=$Member->retrieve_by_string_fields(array('last_name' => $data[$row]["LAST NAME"], 'first_name' => $data[$row]["FIRST NAME"], 'lcl_affiliate_number' => $data[$row]["AFFILIATE"]));
$MemberID = $Member->id;
if (is_null($Member)) {
$Member = BeanFactory::newBean('locte_Membership');
$delta = fillPerson($data[$row], $Member, "FULL NAME");
if(count($delta))
{
$Member_id = $Member->save();
}
} else {
v
var_dump($Member->id);
$delta = fillFoundrecord($data[$row], $Member, "FULL NAME");
echo("record Updated!");
$Member_id = $Member->save();
}
unset($data[$row]);
}
}
return true;
}
} catch (Exception $e)
{
return false;
}
}
I need to refresh modifications after install module.
public function install() {
$this->load->controller('marketplace/modification/refresh');
}
I tried this. Its worked but the page redirected to modification listing. How can i do without redirect. I am using opencart 3.
If you don't want to edit modification.php or clone its refresh function, You can use this:
public function install(){
$data['redirect'] = 'extension/extension/module';
$this->load->controller('marketplace/modification/refresh', $data);
}
You could not controll by this way as you are doing:
You need to do this as
public function install() {
$this->refresh();
}
protected function refresh($data = array()) {
$this->load->language('marketplace/modification');
$this->document->setTitle($this->language->get('heading_title'));
$this->load->model('setting/modification');
if ($this->validate()) {
// Just before files are deleted, if config settings say maintenance mode is off then turn it on
$maintenance = $this->config->get('config_maintenance');
$this->load->model('setting/setting');
$this->model_setting_setting->editSettingValue('config', 'config_maintenance', true);
//Log
$log = array();
// Clear all modification files
$files = array();
// Make path into an array
$path = array(DIR_MODIFICATION . '*');
// While the path array is still populated keep looping through
while (count($path) != 0) {
$next = array_shift($path);
foreach (glob($next) as $file) {
// If directory add to path array
if (is_dir($file)) {
$path[] = $file . '/*';
}
// Add the file to the files to be deleted array
$files[] = $file;
}
}
// Reverse sort the file array
rsort($files);
// Clear all modification files
foreach ($files as $file) {
if ($file != DIR_MODIFICATION . 'index.html') {
// If file just delete
if (is_file($file)) {
unlink($file);
// If directory use the remove directory function
} elseif (is_dir($file)) {
rmdir($file);
}
}
}
// Begin
$xml = array();
// Load the default modification XML
$xml[] = file_get_contents(DIR_SYSTEM . 'modification.xml');
// This is purly for developers so they can run mods directly and have them run without upload after each change.
$files = glob(DIR_SYSTEM . '*.ocmod.xml');
if ($files) {
foreach ($files as $file) {
$xml[] = file_get_contents($file);
}
}
// Get the default modification file
$results = $this->model_setting_modification->getModifications();
foreach ($results as $result) {
if ($result['status']) {
$xml[] = $result['xml'];
}
}
$modification = array();
foreach ($xml as $xml) {
if (empty($xml)){
continue;
}
$dom = new DOMDocument('1.0', 'UTF-8');
$dom->preserveWhiteSpace = false;
$dom->loadXml($xml);
// Log
$log[] = 'MOD: ' . $dom->getElementsByTagName('name')->item(0)->textContent;
// Wipe the past modification store in the backup array
$recovery = array();
// Set the a recovery of the modification code in case we need to use it if an abort attribute is used.
if (isset($modification)) {
$recovery = $modification;
}
$files = $dom->getElementsByTagName('modification')->item(0)->getElementsByTagName('file');
foreach ($files as $file) {
$operations = $file->getElementsByTagName('operation');
$files = explode('|', $file->getAttribute('path'));
foreach ($files as $file) {
$path = '';
// Get the full path of the files that are going to be used for modification
if ((substr($file, 0, 7) == 'catalog')) {
$path = DIR_CATALOG . substr($file, 8);
}
if ((substr($file, 0, 5) == 'admin')) {
$path = DIR_APPLICATION . substr($file, 6);
}
if ((substr($file, 0, 6) == 'system')) {
$path = DIR_SYSTEM . substr($file, 7);
}
if ($path) {
$files = glob($path, GLOB_BRACE);
if ($files) {
foreach ($files as $file) {
// Get the key to be used for the modification cache filename.
if (substr($file, 0, strlen(DIR_CATALOG)) == DIR_CATALOG) {
$key = 'catalog/' . substr($file, strlen(DIR_CATALOG));
}
if (substr($file, 0, strlen(DIR_APPLICATION)) == DIR_APPLICATION) {
$key = 'admin/' . substr($file, strlen(DIR_APPLICATION));
}
if (substr($file, 0, strlen(DIR_SYSTEM)) == DIR_SYSTEM) {
$key = 'system/' . substr($file, strlen(DIR_SYSTEM));
}
// If file contents is not already in the modification array we need to load it.
if (!isset($modification[$key])) {
$content = file_get_contents($file);
$modification[$key] = preg_replace('~\r?\n~', "\n", $content);
$original[$key] = preg_replace('~\r?\n~', "\n", $content);
// Log
$log[] = PHP_EOL . 'FILE: ' . $key;
}
foreach ($operations as $operation) {
$error = $operation->getAttribute('error');
// Ignoreif
$ignoreif = $operation->getElementsByTagName('ignoreif')->item(0);
if ($ignoreif) {
if ($ignoreif->getAttribute('regex') != 'true') {
if (strpos($modification[$key], $ignoreif->textContent) !== false) {
continue;
}
} else {
if (preg_match($ignoreif->textContent, $modification[$key])) {
continue;
}
}
}
$status = false;
// Search and replace
if ($operation->getElementsByTagName('search')->item(0)->getAttribute('regex') != 'true') {
// Search
$search = $operation->getElementsByTagName('search')->item(0)->textContent;
$trim = $operation->getElementsByTagName('search')->item(0)->getAttribute('trim');
$index = $operation->getElementsByTagName('search')->item(0)->getAttribute('index');
// Trim line if no trim attribute is set or is set to true.
if (!$trim || $trim == 'true') {
$search = trim($search);
}
// Add
$add = $operation->getElementsByTagName('add')->item(0)->textContent;
$trim = $operation->getElementsByTagName('add')->item(0)->getAttribute('trim');
$position = $operation->getElementsByTagName('add')->item(0)->getAttribute('position');
$offset = $operation->getElementsByTagName('add')->item(0)->getAttribute('offset');
if ($offset == '') {
$offset = 0;
}
// Trim line if is set to true.
if ($trim == 'true') {
$add = trim($add);
}
// Log
$log[] = 'CODE: ' . $search;
// Check if using indexes
if ($index !== '') {
$indexes = explode(',', $index);
} else {
$indexes = array();
}
// Get all the matches
$i = 0;
$lines = explode("\n", $modification[$key]);
for ($line_id = 0; $line_id < count($lines); $line_id++) {
$line = $lines[$line_id];
// Status
$match = false;
// Check to see if the line matches the search code.
if (stripos($line, $search) !== false) {
// If indexes are not used then just set the found status to true.
if (!$indexes) {
$match = true;
} elseif (in_array($i, $indexes)) {
$match = true;
}
$i++;
}
// Now for replacing or adding to the matched elements
if ($match) {
switch ($position) {
default:
case 'replace':
$new_lines = explode("\n", $add);
if ($offset < 0) {
array_splice($lines, $line_id + $offset, abs($offset) + 1, array(str_replace($search, $add, $line)));
$line_id -= $offset;
} else {
array_splice($lines, $line_id, $offset + 1, array(str_replace($search, $add, $line)));
}
break;
case 'before':
$new_lines = explode("\n", $add);
array_splice($lines, $line_id - $offset, 0, $new_lines);
$line_id += count($new_lines);
break;
case 'after':
$new_lines = explode("\n", $add);
array_splice($lines, ($line_id + 1) + $offset, 0, $new_lines);
$line_id += count($new_lines);
break;
}
// Log
$log[] = 'LINE: ' . $line_id;
$status = true;
}
}
$modification[$key] = implode("\n", $lines);
} else {
$search = trim($operation->getElementsByTagName('search')->item(0)->textContent);
$limit = $operation->getElementsByTagName('search')->item(0)->getAttribute('limit');
$replace = trim($operation->getElementsByTagName('add')->item(0)->textContent);
// Limit
if (!$limit) {
$limit = -1;
}
// Log
$match = array();
preg_match_all($search, $modification[$key], $match, PREG_OFFSET_CAPTURE);
// Remove part of the the result if a limit is set.
if ($limit > 0) {
$match[0] = array_slice($match[0], 0, $limit);
}
if ($match[0]) {
$log[] = 'REGEX: ' . $search;
for ($i = 0; $i < count($match[0]); $i++) {
$log[] = 'LINE: ' . (substr_count(substr($modification[$key], 0, $match[0][$i][1]), "\n") + 1);
}
$status = true;
}
// Make the modification
$modification[$key] = preg_replace($search, $replace, $modification[$key], $limit);
}
if (!$status) {
// Abort applying this modification completely.
if ($error == 'abort') {
$modification = $recovery;
// Log
$log[] = 'NOT FOUND - ABORTING!';
break 5;
}
// Skip current operation or break
elseif ($error == 'skip') {
// Log
$log[] = 'NOT FOUND - OPERATION SKIPPED!';
continue;
}
// Break current operations
else {
// Log
$log[] = 'NOT FOUND - OPERATIONS ABORTED!';
break;
}
}
}
}
}
}
}
}
// Log
$log[] = '----------------------------------------------------------------';
}
// Log
$ocmod = new Log('ocmod.log');
$ocmod->write(implode("\n", $log));
// Write all modification files
foreach ($modification as $key => $value) {
// Only create a file if there are changes
if ($original[$key] != $value) {
$path = '';
$directories = explode('/', dirname($key));
foreach ($directories as $directory) {
$path = $path . '/' . $directory;
if (!is_dir(DIR_MODIFICATION . $path)) {
#mkdir(DIR_MODIFICATION . $path, 0777);
}
}
$handle = fopen(DIR_MODIFICATION . $key, 'w');
fwrite($handle, $value);
fclose($handle);
}
}
// Maintance mode back to original settings
$this->model_setting_setting->editSettingValue('config', 'config_maintenance', $maintenance);
// Do not return success message if refresh() was called with $data
$this->session->data['success'] = $this->language->get('text_success');
$url = '';
if (isset($this->request->get['sort'])) {
$url .= '&sort=' . $this->request->get['sort'];
}
if (isset($this->request->get['order'])) {
$url .= '&order=' . $this->request->get['order'];
}
if (isset($this->request->get['page'])) {
$url .= '&page=' . $this->request->get['page'];
}
}
}
I hope it shouwl work for you.
This process is used to refresh the modification when your module installing.
if you need globally this then please tell me I will update you process.
I have this function
function my_product_title($title, $id)
{
if(in_the_loop() && is_product())
{
return '<span class="border">FooBar</span>';
}
return $title;
}
add_filter( 'the_title', 'my_product_title', 5, 2);
and it can replace the product title with return '<span class="border">FooBar</span>';.
I also have a custom script in "mycustomtitle.php" that can modify the products titles and my script can echo that modified title as $mycustomtitle
I want to replace the original product title with my $mycustomtitle without changing anything in the core files.
I've tried to just change return '<span class="border">FooBar</span>'; to $mycustomtitle but it only removes the original title and gives no output at all...
Thanks!
UPDATE 2016-10-20 With custom code:
<?php
include $_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/seo-engine/explode.php';
$tit1 = $boomprint[array_rand($boomprint)].' '.file_get_contents($_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/write-to/product-title-h1.php');
$tit2 = $boomprint[array_rand($boomprint)].' '.file_get_contents($_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/write-to/product-title-h1.php');
$title_keys = array($tit1,$tit2);
$title_key11 = $title_keys[array_rand($title_keys)];
if(!function_exists(spin11)){
function spin11($string11) {
while(true) {
if(!preg_match_all('/({([^\{]*?)\})/', $string11, $matches))
break;
foreach($matches[2] as $i => $match) {
$parts = explode('|', $match);
$string11 = str_replace_once11($matches[0][$i], $parts[mt_rand(0, count($parts)-1)], $string11);
}
}
return $string11;
}
}
if(!function_exists(str_replace_once11)){
function str_replace_once11($from,$to,$str)
{
$str = explode($from,$str,2);
return $str[0].$to.$str[1];
}
$title_id11 = get_the_ID();
$fileLocation11 = getenv("DOCUMENT_ROOT") . '/wp-content/plugins/seo-controlpanel/seo-cache/product-title-h1/'.$title_id11.'.txt';
if(!file_exists($fileLocation11)){
$file11 = fopen($fileLocation11,"w");
$content11 = spin11($title_key11);
fwrite($file11,$content11);
fclose($file11);
}
if(file_exists($fileLocation11)){
$myFile11 = $fileLocation11;
$fh11 = fopen($myFile11, 'r');
$theData11 = fread($fh11, filesize($myFile11));
fclose($fh11);
}
}
?>
Insert your custom code in your function.
function my_product_title($title, $id) {
include $_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/seo-engine/explode.php';
$tit1 = $boomprint[array_rand($boomprint)].' '.file_get_contents($_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/write-to/product-title-h1.php');
$tit2 = $boomprint[array_rand($boomprint)].' '.file_get_contents($_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/write-to/product-title-h1.php');
$title_keys = array($tit1,$tit2);
$title_key11 = $title_keys[array_rand($title_keys)];
if(!function_exists(spin11)){
function spin11($string11) {
while(true) {
if(!preg_match_all('/({([^\{]*?)\})/', $string11, $matches))
break;
foreach($matches[2] as $i => $match) {
$parts = explode('|', $match);
$string11 = str_replace_once11($matches[0][$i], $parts[mt_rand(0, count($parts)-1)], $string11);
}
}
return $string11;
}
}
if(!function_exists(str_replace_once11)){
function str_replace_once11($from,$to,$str)
{
$str = explode($from,$str,2);
return $str[0].$to.$str[1];
}
$title_id11 = get_the_ID();
$fileLocation11 = getenv("DOCUMENT_ROOT") . '/wp-content/plugins/seo-controlpanel/seo-cache/product-title-h1/'.$title_id11.'.txt';
if(!file_exists($fileLocation11)){
$file11 = fopen($fileLocation11,"w");
$content11 = spin11($title_key11);
fwrite($file11,$content11);
fclose($file11);
}
if(file_exists($fileLocation11)){
$myFile11 = $fileLocation11;
$fh11 = fopen($myFile11, 'r');
$theData11 = fread($fh11, filesize($myFile11));
fclose($fh11);
}
}
if(in_the_loop() && is_product()) {
return $theData11;
}
return $title;
}
add_filter( 'the_title', 'my_product_title', 5, 2);
Is $mycustomtitle global variable or it is defined inside another function/method/class?
If it is global variable this may help:
function my_product_title($title, $id)
{
global $mycustomtitle;
if(in_the_loop() && is_product())
{
return $mycustomtitle;
}
return $title;
}
<?php
include $_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/seo-engine/explode.php';
$tit1 = $boomprint[array_rand($boomprint)].' '.file_get_contents($_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/write-to/product-title-h1.php');
$tit2 = $boomprint[array_rand($boomprint)].' '.file_get_contents($_SERVER['DOCUMENT_ROOT'] . '/wp-content/plugins/seo-controlpanel/write-to/product-title-h1.php');
$title_keys = array($tit1,$tit2);
$title_key11 = $title_keys[array_rand($title_keys)];
if(!function_exists(spin11)){
function spin11($string11) {
while(true) {
if(!preg_match_all('/({([^\{]*?)\})/', $string11, $matches))
break;
foreach($matches[2] as $i => $match) {
$parts = explode('|', $match);
$string11 = str_replace_once11($matches[0][$i], $parts[mt_rand(0, count($parts)-1)], $string11);
}
}
return $string11;
}
}
if(!function_exists(str_replace_once11)){
function str_replace_once11($from,$to,$str)
{
$str = explode($from,$str,2);
return $str[0].$to.$str[1];
}
$title_id11 = get_the_ID();
$fileLocation11 = getenv("DOCUMENT_ROOT") . '/wp-content/plugins/seo-controlpanel/seo-cache/product-title-h1/'.$title_id11.'.txt';
if(!file_exists($fileLocation11)){
$file11 = fopen($fileLocation11,"w");
$content11 = spin11($title_key11);
fwrite($file11,$content11);
fclose($file11);
}
if(file_exists($fileLocation11)){
$myFile11 = $fileLocation11;
$fh11 = fopen($myFile11, 'r');
$theData11 = fread($fh11, filesize($myFile11));
fclose($fh11);
}
}
//echo $theData11;
?>
I use a script from here to generate my sitemaps.
I can call it with the browser with http://www.example.com/sitemap.php?update=pages and its working fine.
I need to call it as shell script so that I can automate it with the windows task scheduler. But the script needs to be changed to get the variables ?update=pages. But I don't manage to change it correctly.
Could anybody help me so that I can execute the script from command line with
...\php C:\path\to\script\sitemap.php update=pages. It would also be fine for me to hardcode the variables into the script since I wont change them anyway.
define("BASE_URL", "http://www.example.com/");
define ('BASE_URI', $_SERVER['DOCUMENT_ROOT'] . '/');
class Sitemap {
private $compress;
private $page = 'index';
private $index = 1;
private $count = 1;
private $urls = array();
public function __construct ($compress=true) {
ini_set('memory_limit', '75M'); // 50M required per tests
$this->compress = ($compress) ? '.gz' : '';
}
public function page ($name) {
$this->save();
$this->page = $name;
$this->index = 1;
}
public function url ($url, $lastmod='', $changefreq='', $priority='') {
$url = htmlspecialchars(BASE_URL . 'xx' . $url);
$lastmod = (!empty($lastmod)) ? date('Y-m-d', strtotime($lastmod)) : false;
$changefreq = (!empty($changefreq) && in_array(strtolower($changefreq), array('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'))) ? strtolower($changefreq) : false;
$priority = (!empty($priority) && is_numeric($priority) && abs($priority) <= 1) ? round(abs($priority), 1) : false;
if (!$lastmod && !$changefreq && !$priority) {
$this->urls[] = $url;
} else {
$url = array('loc'=>$url);
if ($lastmod !== false) $url['lastmod'] = $lastmod;
if ($changefreq !== false) $url['changefreq'] = $changefreq;
if ($priority !== false) $url['priority'] = ($priority < 1) ? $priority : '1.0';
$this->urls[] = $url;
}
if ($this->count == 50000) {
$this->save();
} else {
$this->count++;
}
}
public function close() {
$this->save();
}
private function save () {
if (empty($this->urls)) return;
$file = "sitemaps/xx-sitemap-{$this->page}-{$this->index}.xml{$this->compress}";
$xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
$xml .= '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
foreach ($this->urls as $url) {
$xml .= ' <url>' . "\n";
if (is_array($url)) {
foreach ($url as $key => $value) $xml .= " <{$key}>{$value}</{$key}>\n";
} else {
$xml .= " <loc>{$url}</loc>\n";
}
$xml .= ' </url>' . "\n";
}
$xml .= '</urlset>' . "\n";
$this->urls = array();
if (!empty($this->compress)) $xml = gzencode($xml, 9);
$fp = fopen(BASE_URI . $file, 'wb');
fwrite($fp, $xml);
fclose($fp);
$this->index++;
$this->count = 1;
$num = $this->index; // should have already been incremented
while (file_exists(BASE_URI . "xxb-sitemap-{$this->page}-{$num}.xml{$this->compress}")) {
unlink(BASE_URI . "xxc-sitemap-{$this->page}-{$num}.xml{$this->compress}");
$num++;
}
$this->index($file);
}
private function index ($file) {
$sitemaps = array();
$index = "sitemaps/xx-sitemap-index.xml{$this->compress}";
if (file_exists(BASE_URI . $index)) {
$xml = (!empty($this->compress)) ? gzfile(BASE_URI . $index) : file(BASE_URI . $index);
$tags = $this->xml_tag(implode('', $xml), array('sitemap'));
foreach ($tags as $xml) {
$loc = str_replace(BASE_URL, '', $this->xml_tag($xml, 'loc'));
$lastmod = $this->xml_tag($xml, 'lastmod');
$lastmod = ($lastmod) ? date('Y-m-d', strtotime($lastmod)) : date('Y-m-d');
if (file_exists(BASE_URI . $loc)) $sitemaps[$loc] = $lastmod;
}
}
$sitemaps[$file] = date('Y-m-d');
$xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
$xml .= '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
foreach ($sitemaps as $loc => $lastmod) {
$xml .= ' <sitemap>' . "\n";
$xml .= ' <loc>' . BASE_URL . $loc . '</loc>' . "\n";
$xml .= ' <lastmod>' . $lastmod . '</lastmod>' . "\n";
$xml .= ' </sitemap>' . "\n";
}
$xml .= '</sitemapindex>' . "\n";
if (!empty($this->compress)) $xml = gzencode($xml, 9);
$fp = fopen(BASE_URI . $index, 'wb');
fwrite($fp, $xml);
fclose($fp);
}
private function xml_tag ($xml, $tag, &$end='') {
if (is_array($tag)) {
$tags = array();
while ($value = $this->xml_tag($xml, $tag[0], $end)) {
$tags[] = $value;
$xml = substr($xml, $end);
}
return $tags;
}
$pos = strpos($xml, "<{$tag}>");
if ($pos === false) return false;
$start = strpos($xml, '>', $pos) + 1;
$length = strpos($xml, "</{$tag}>", $start) - $start;
$end = strpos($xml, '>', $start + $length) + 1;
return ($end !== false) ? substr($xml, $start, $length) : false;
}
public function __destruct () {
$this->save();
}
}
// start part 2
$sitemap = new Sitemap;
if (get('pages')) {
$sitemap->page('pages');
$result = mysql_query("SELECT uri FROM app_uri");
while (list($url, $created) = mysql_fetch_row($result)) {
$sitemap->url($url, $created, 'monthly');
}
}
$sitemap->close();
unset ($sitemap);
function get ($name) {
return (isset($_GET['update']) && strpos($_GET['update'], $name) !== false) ? true : false;
}
?>
I could install wget (it's available for windows as well) and then call the url via localhost in the task scheduler script:
wget.exe "http://localhost/path/to/script.php?pages=test"
This way you wouldn't have to rewrite the php script.
Otherwise, if the script is meant for shell usage only, then pass variables via command line:
php yourscript.php variable1 variable2 ...
In the php script you can than access those variables using the $argv variable:
$variable1 = $argv[1];
$variable2 = $argv[2];
have a look on:
How to pass GET variables to php file with Shell?
which already answered the same question :).
I want to create a sitemap for a page with more than 30.000.000 pages. The page is daily updating, removing and adding new pages.
I found this php script which I would like to run with a cron job.
Sitemap php script
I have all URIs in the table "myuri" in the column "uri" entries are written e.g. "/this-is-a-page.html". What parameters do I need to add to the script to get it running on my table?
<?php
/*
* author: Kyle Gadd
* documentation: http://www.php-ease.com/classes/sitemap.html
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
class Sitemap {
private $compress;
private $page = 'index';
private $index = 1;
private $count = 1;
private $urls = array();
public function __construct ($compress=true) {
ini_set('memory_limit', '75M'); // 50M required per tests
$this->compress = ($compress) ? '.gz' : '';
}
public function page ($name) {
$this->save();
$this->page = $name;
$this->index = 1;
}
public function url ($url, $lastmod='', $changefreq='', $priority='') {
$url = htmlspecialchars(BASE_URL . $url);
$lastmod = (!empty($lastmod)) ? date('Y-m-d', strtotime($lastmod)) : false;
$changefreq = (!empty($changefreq) && in_array(strtolower($changefreq), array('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'))) ? strtolower($changefreq) : false;
$priority = (!empty($priority) && is_numeric($priority) && abs($priority) <= 1) ? round(abs($priority), 1) : false;
if (!$lastmod && !$changefreq && !$priority) {
$this->urls[] = $url;
} else {
$url = array('loc'=>$url);
if ($lastmod !== false) $url['lastmod'] = $lastmod;
if ($changefreq !== false) $url['changefreq'] = $changefreq;
if ($priority !== false) $url['priority'] = ($priority < 1) ? $priority : '1.0';
$this->urls[] = $url;
}
if ($this->count == 50000) {
$this->save();
} else {
$this->count++;
}
}
public function close() {
$this->save();
$this->ping_search_engines();
}
private function save () {
if (empty($this->urls)) return;
$file = "sitemap-{$this->page}-{$this->index}.xml{$this->compress}";
$xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
$xml .= '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
foreach ($this->urls as $url) {
$xml .= ' <url>' . "\n";
if (is_array($url)) {
foreach ($url as $key => $value) $xml .= " <{$key}>{$value}</{$key}>\n";
} else {
$xml .= " <loc>{$url}</loc>\n";
}
$xml .= ' </url>' . "\n";
}
$xml .= '</urlset>' . "\n";
$this->urls = array();
if (!empty($this->compress)) $xml = gzencode($xml, 9);
$fp = fopen(BASE_URI . $file, 'wb');
fwrite($fp, $xml);
fclose($fp);
$this->index++;
$this->count = 1;
$num = $this->index; // should have already been incremented
while (file_exists(BASE_URI . "sitemap-{$this->page}-{$num}.xml{$this->compress}")) {
unlink(BASE_URI . "sitemap-{$this->page}-{$num}.xml{$this->compress}");
$num++;
}
$this->index($file);
}
private function index ($file) {
$sitemaps = array();
$index = "sitemap-index.xml{$this->compress}";
if (file_exists(BASE_URI . $index)) {
$xml = (!empty($this->compress)) ? gzfile(BASE_URI . $index) : file(BASE_URI . $index);
$tags = $this->xml_tag(implode('', $xml), array('sitemap'));
foreach ($tags as $xml) {
$loc = str_replace(BASE_URL, '', $this->xml_tag($xml, 'loc'));
$lastmod = $this->xml_tag($xml, 'lastmod');
$lastmod = ($lastmod) ? date('Y-m-d', strtotime($lastmod)) : date('Y-m-d');
if (file_exists(BASE_URI . $loc)) $sitemaps[$loc] = $lastmod;
}
}
$sitemaps[$file] = date('Y-m-d');
$xml = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
$xml .= '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
foreach ($sitemaps as $loc => $lastmod) {
$xml .= ' <sitemap>' . "\n";
$xml .= ' <loc>' . BASE_URL . $loc . '</loc>' . "\n";
$xml .= ' <lastmod>' . $lastmod . '</lastmod>' . "\n";
$xml .= ' </sitemap>' . "\n";
}
$xml .= '</sitemapindex>' . "\n";
if (!empty($this->compress)) $xml = gzencode($xml, 9);
$fp = fopen(BASE_URI . $index, 'wb');
fwrite($fp, $xml);
fclose($fp);
}
private function xml_tag ($xml, $tag, &$end='') {
if (is_array($tag)) {
$tags = array();
while ($value = $this->xml_tag($xml, $tag[0], $end)) {
$tags[] = $value;
$xml = substr($xml, $end);
}
return $tags;
}
$pos = strpos($xml, "<{$tag}>");
if ($pos === false) return false;
$start = strpos($xml, '>', $pos) + 1;
$length = strpos($xml, "</{$tag}>", $start) - $start;
$end = strpos($xml, '>', $start + $length) + 1;
return ($end !== false) ? substr($xml, $start, $length) : false;
}
public function ping_search_engines () {
$sitemap = BASE_URL . 'sitemap-index.xml' . $this->compress;
$engines = array();
$engines['www.google.com'] = '/webmasters/tools/ping?sitemap=' . urlencode($sitemap);
$engines['www.bing.com'] = '/webmaster/ping.aspx?siteMap=' . urlencode($sitemap);
$engines['submissions.ask.com'] = '/ping?sitemap=' . urlencode($sitemap);
foreach ($engines as $host => $path) {
if ($fp = fsockopen($host, 80)) {
$send = "HEAD $path HTTP/1.1\r\n";
$send .= "HOST: $host\r\n";
$send .= "CONNECTION: Close\r\n\r\n";
fwrite($fp, $send);
$http_response = fgets($fp, 128);
fclose($fp);
list($response, $code) = explode (' ', $http_response);
if ($code != 200) trigger_error ("{$host} ping was unsuccessful.<br />Code: {$code}<br />Response: {$response}");
}
}
}
public function __destruct () {
$this->save();
}
}
?>
There is already an example of usage on the page:
<?php
require_once ('php/classes/Sitemap.php');
$sitemap = new Sitemap;
if (get('pages')) {
$sitemap->page('pages');
$result = db_query ("SELECT url, created FROM pages"); // 20 pages
while (list($url, $created) = $result->fetch_row()) {
$sitemap->url($url, $created, 'yearly');
}
}
if (get('posts')) {
$sitemap->page('posts');
$result = db_query ("SELECT url, updated FROM posts"); // 70,000 posts
while (list($url, $updated) = $result->fetch_row()) {
$sitemap->url($url, $updated, 'monthly');
}
}
$sitemap->close();
unset ($sitemap);
function get ($name) {
return (isset($_GET['update']) && strpos($_GET['update'], $name) !== false) ? true : false;
}
?>
I would change this part....
if (get('pages')) {
$sitemap->page('pages');
$result = db_query ("SELECT uri FROM myuri");
while (list($url) = mysql_fetch_row($result)) {
$sitemap->url($url,'', 'yearly');
}
}
Not sure if that $updated is needed? Looks like the function just defaults it to an empty string anyways...... But maybe you could at a timestamp column to your table to pull the last updated date as well, and feed it into the function where I put ''.
Also....remove this part...
if (get('posts')) {
$sitemap->page('posts');
$result = db_query ("SELECT url, updated FROM posts"); // 70,000 posts
while (list($url, $updated) = $result->fetch_row()) {
$sitemap->url($url, $updated, 'monthly');
}
}