PHP script is timing out - php

I am pulling data from a page and I know this is a long process depending on the date being pulled. After 132 seconds of pulling the data the page times-out.
I have set the set_time_limit(0);and ignore_user_abort(true); - I am not sure what else to do to keep the script alive and pull all the data.
I have added the code below in case there is something i can do to speed it up??
set_time_limit(0);
ignore_user_abort(true);
error_reporting(-1);
ini_set('display_errors', 'On');
include "../include/class.php";
include "../include/db.php";
//the below will get the list of id's for each race that day
function curl($url){
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
curl_setopt($ch,CURLOPT_FOLLOWLOCATION,true);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
$url = "http://form.timeform.betfair.com/daypage?date=20150516"; //WILL NEED TO PULL TOMORROWS DATE AS DD-MM-YYY
$html = curl($url);
$dom = new DOMDocument();
#$dom->loadHTML($html);
$dom->preserveWhiteSpace = false;
$xpath = new DOMXPath($dom);
//pull the individual cards for the day
//li class="rac-cardsclass="ix ixc"
$getdropdown = '//div[contains(#data-location, "RACING_COUNTRY_GB_IE")]//div[contains(#class, "course")]';
$getdropdown2 = $xpath->query($getdropdown);
//loop through each individual card
foreach($getdropdown2 as $dropresults) {
//loop through and get all the a tags
$arr = $dropresults->getElementsByTagName("a");
foreach($arr as $item) {
//only grab the links which point to the results page
if(strpos($item->getAttribute('href'), 'raceresult') !== false) {
//grab the code
$code = explode("=", $item->getAttribute('href'));
$code = end($code);
$url = "http://form.timeform.betfair.com/raceresult?raceId=" . $code; //WILL NEED TO PULL TOMORROWS DATE AS DD-MM-YYY
$html = curl($url);
$dom = new DOMDocument();
#$dom->loadHTML($html);
$dom->preserveWhiteSpace = false;
$xpath = new DOMXPath($dom);
$spanTexts = array();
//get the place name
$getplacename = '//span[contains(#class, "locality")]';
$getplacename2 = $xpath->query($getplacename);
//loop through each individual card
foreach($getplacename2 as $getplacename22) {
echo "Venue: " . $venue = $getplacename22->textContent;
} //$getplacename2 as $getplacename22
$gettime = '//abbr [contains(#class, "dtstart")]';
//get the Date and the Time
$gettime2 = $xpath->query($gettime);
foreach($gettime2 as $gettime22) {
echo "Date : " . $Dateandtime = date(trim($gettime22->getAttribute('title')), strtotime('+5 hours'));
} //$gettime2 as $gettime22
//pull the data for the race e.g going money ect
$getdropdown22 = '//div[contains(#class, "content")]/p';
$getdropdown222 = $xpath->query($getdropdown22);
foreach($getdropdown222 as $dropresults2) {
$racename = trim($dropresults2->childNodes->item(0)->textContent);
//foreach ($dropresults2->childNodes as $node) { if(is_object($node)) { echo $node->nodeType; } else { echo $node; } }
foreach($dropresults2->childNodes as $node) {
if(is_object($node) && $node->nodeType === XML_ELEMENT_NODE && strtolower($node->tagName) === 'span') {
$spanTexts[] = (string) $node->textContent;
} //is_object($node) && $node->nodeType === XML_ELEMENT_NODE && strtolower($node->tagName) === 'span'
} //$dropresults2->childNodes as $node
if(count($spanTexts) < 6)
continue;
list($going, $distance, $age, $prizemoney, $runners, $racetype) = $spanTexts;
$going = str_replace(array(
'Â',
'Going:',
'|'
), '', $going);
$distance = miletofurlong($distance = trim(GetBetween($distance, ':', 'Â')));
$age = trim(GetBetween($age, ':', 'Â'));
$prizemoney = trim(GetBetween($prizemoney, '£', 'Â'));
$runners = trim(GetBetween($runners, ':', 'Â'));
$racetype = trim(GetBetween($racetype, ':', 'Â'));
} //$getdropdown222 as $dropresults2
//pull the individual horse data
$getdropdown = '//div[contains(#class, "table-container")]//tbody//tr';
$getdropdown2 = $xpath->query($getdropdown);
//loop through each individual card
foreach($getdropdown2 as $dropresults) {
$position = $dropresults->childNodes->item(0)->childNodes->item(1)->textContent;
$draw = str_replace(array('(',')'), '', $dropresults->childNodes->item(0)->childNodes->item(3)->textContent);
$losingdist = str_replace('Â', '', trim($dropresults->childNodes->item(2)->textContent));
if(strpos($losingdist, '¾') !== false) {
$losingdist = str_replace('¾', '.75', $losingdist);
} //strpos($losingdist, '¾') !== false
if(strpos($losingdist, '½') !== false) {
$losingdist = str_replace('½', '.5', $losingdist);
} //strpos($losingdist, '½') !== false
if(strpos($losingdist, '¼') !== false) {
$losingdist = str_replace('¼', '.25', $losingdist);
} //strpos($losingdist, '¼') !== false
$losingdist;
$horse = trim(preg_replace("/\([^\)]+\)/","",str_replace("'","",trim($dropresults->childNodes->item(4)->textContent))));
$horseage = trim($dropresults->childNodes->item(6)->textContent);
$weight = trim($dropresults->childNodes->item(8)->childNodes->item(1)->textContent);
$or = str_replace(array('(',')'), '', trim($dropresults->childNodes->item(8)->childNodes->item(3)->textContent));
str_replace('-', '', $eq = trim($dropresults->childNodes->item(10)->textContent));
$jockey = trim($dropresults->childNodes->item(12)->childNodes->item(1)->textContent);
$trainer = trim($dropresults->childNodes->item(12)->childNodes->item(4)->textContent);
$highandlowinrunning = trim($dropresults->childNodes->item(14)->childNodes->item(1)->textContent);
$highandlow = explode("/", $highandlowinrunning);
str_replace('-', '', $lowodds = trim($highandlow['1']));
str_replace('-', '', $highodds = trim($highandlow['0']));
$bfsp = trim($dropresults->childNodes->item(16)->childNodes->item(1)->textContent);
$isp = trim(str_replace('/', '', $dropresults->childNodes->item(16)->childNodes->item(3)->textContent));
$placeodds = trim($dropresults->childNodes->item(18)->textContent);
$venue = mysqli_real_escape_string($db, $venue);
$Dateandtime = mysqli_real_escape_string($db,$Dateandtime);
$going = mysqli_real_escape_string($db, $going);
$distance = mysqli_real_escape_string($db,$distance);
$age = mysqli_real_escape_string($db,$age);
$prizemoney = mysqli_real_escape_string($db,$prizemoney);
$runners = mysqli_real_escape_string($db,$runners );
$racetype = mysqli_real_escape_string($db,$racetype);
$position = mysqli_real_escape_string($db,$position );
$draw = mysqli_real_escape_string($db,$draw);
$losingdist = mysqli_real_escape_string($db,$losingdist);
$horse = mysqli_real_escape_string($db,$horse );
$age = mysqli_real_escape_string($db,$age);
$weight = mysqli_real_escape_string($db,$weight);
$or = mysqli_real_escape_string($db,$or );
$eq = mysqli_real_escape_string($db,$eq );
$jockey = mysqli_real_escape_string($db,$jockey);
$trainer = mysqli_real_escape_string($db,$trainer);
$lowodds = mysqli_real_escape_string($db,$lowodds);
$highodds = mysqli_real_escape_string($db,$highodds);
$bfsp = mysqli_real_escape_string($db,$bfsp);
$isp = mysqli_real_escape_string($db,$isp);
$placeodds = mysqli_real_escape_string($db,$placeodds);
$sql = "
INSERT INTO `Race_Records`
(
`Venue`,
`DateandTime`,
`Going`,
`Distance`,
`Age`,
`PrizeMoney`,
`Runners`,
`RaceType`,
`Position`,
`Draw`,
`LosingDist`,
`Horse`,
`HorseAge`,
`Weight`,
`OR`,
`EQ`,
`Jockey`,
`Trainer`,
`InRunningLow`,
`InRunningHigh`,
`BFSP`,
`ISP`,
`PlaceOdds`,
`RaceName`
)
VALUES
(
'$venue',
'$Dateandtime',
'$going',
'$distance',
'$age',
'$prizemoney',
'$runners',
'$racetype',
'$position',
'$draw',
'$losingdist',
'$horse',
'$age',
'$weight',
'$or',
'$eq',
'$jockey',
'$trainer',
'$lowodds',
'$highodds',
'$bfsp',
'$isp',
'$placeodds',
'$racename'
)
";
$res = mysqli_query($db, $sql);
if (!$res) {
echo PHP_EOL . "FAIL: $sql";
trigger_error(mysqli_error($db), E_USER_ERROR);
}
}
}
}
}
$id = date_create($id);
$theid2 = date_format($id,"d-m-Y");
$url = "www.sportinglife.com/racing/results/".$theid2; //WILL NEED TO PULL TOMORROWS DATE AS DD-MM-YYY
$html = curl($url);
$dom = new DOMDocument();
#$dom->loadHTML($html);
$dom->preserveWhiteSpace = false;
$xpath = new DOMXPath($dom);
$getdropdown = '//li[contains(#class, "rac-cards")]//div[contains(#class, "ix ixv")]';
$getdropdown2 = $xpath->query($getdropdown);
//loop through each individual card
foreach($getdropdown2 as $dropresults) {
//loop through and get all the a tags
$arr = $dropresults->getElementsByTagName("a");
foreach($arr as $item) {
//only grab the links which point to the results page
//grab the code
$getcomments = $item->getAttribute('href');
foreach ($listofcorses as $bad) {
if (strstr( strtolower($getcomments),strtolower($bad)) !== false) {
$url = "http://www.sportinglife.com/".$getcomments; //WILL NEED TO PULL TOMORROWS DATE AS DD-MM-YYY
$html = curl($url);
$dom = new DOMDocument();
#$dom->loadHTML($html);
$dom->preserveWhiteSpace = false;
$xpath = new DOMXPath($dom);
$spanTexts = array();
//get the place name
$getplacename = '//table';
$getplacename2 = $xpath->query($getplacename);
//loop through each individual card
$loopnumber = 0;
foreach($getplacename2 as $getplacename22) {
// get how many child nodes are in the loop
$count = 0;
foreach($getplacename22 ->childNodes->item(11)->childNodes as $node)
if(!($node instanceof \DomText))
$count++;
//loop through and get the horses name and the comment
for ($i = 0; $i < $count; $i++) {
if ($i % 2 == 0)
{
if ($getplacename22 ->childNodes->item(11)->childNodes->item($i)->childNodes->item(4) != null)
{
$horse = mysqli_real_escape_string($db,trim(preg_replace("/[^A-Za-z ]+/", "", preg_replace("/\([^\)]+\)/","",trim($getplacename22 ->childNodes->item(11)->childNodes->item($i)->childNodes->item(4)->textContent)))));
$check = "ok";
}
else
{
$check = "no";
}
}
else
{
if ($check == "ok") {
$comments = mysqli_real_escape_string($db,trim($getplacename22 ->childNodes->item(11)->childNodes->item($i)->textContent));
//update the database
$results = $db->query("UPDATE Race_Records SET comments= '$comments' WHERE Horse='$horse'");
}
}
}
}
}
}
}
}
?>

You could try setting curl's timeout
curl_setopt($ch,CURLOPT_TIMEOUT,1000);
You might also want to check that the services you are accessing in the loop are rate-limited or not, and if so put in an appropriate sleep in the loop to make sure you aren't making too many requests from the service in consecutive cycles; it could well be that the code is running OK, but then timeingout after a number of HTTP requests to the remote service

Set max execution time
// Begin your php code with this
ini_set('max_execution_time',300); // 60s*5=300s 5 minutes

Related

how to scrape a webpage with pagination

i'm setting up a new server, and want to scrape some information from a website
this is my code i tried to scrape pages one by one but i only get 2 of pages
$result = array();
function scrapingAnimelist($url, $page)
{
$res = array();
$urlParsed = $url . "&page=" . $page;
$html = file_get_html($urlParsed);
$pageData = array();
foreach ($html->find('div[class=body]') as $item) {
$metaData = array();
$metaData['title'] = $item->find('h2[class=title]', 0)->innertext;
$metaData['img'] = $item->find('img[class=img]', 0)->src;
$metaData['url'] = $item->find('a', 0)->href;
array_push($pageData, $metaData);
}
$res[$page] = $pageData;
if (sizeof($pageData) == 20) {
$page++;
$res[$page] = scrapingAnimelist($url, $page);
}
global $result;
$result = $res;
return $pageData;
}
i expect the output of json object with only 2 arrays ( page datas ) to be 3 in link : https://anime-list2.cf/anime-search?s=mag
Your $result is not set on the second run
yout should make it like this
$result = array();
function scrapingAnimelist($url, $page) {
global $result;
$urlParsed = $url . "&page=" . $page;
$html = file_get_html($urlParsed);
$pageData = array();
foreach ($html->find('div[class=body]') as $item) {
$metaData = array();
$metaData['title'] = $item->find('h2[class=title]', 0)->innertext;
$metaData['img'] = $item->find('img[class=img]', 0)->src;
$metaData['url'] = $item->find('a', 0)->href;
array_push($pageData, $metaData);
}
$result[$page] = $pageData;
if (sizeof($pageData) == 20) {
return scrapingAnimelist($url, $page + 1);
}
return $result;
}

Insert element into an array in PHP

foreach($key_doc_count as $item) {
mb_language('Japanese');
$product = $item["key"];
$product_url = 'https://search.rakuten.co.jp/search/mall/'.urlencode($product) . '/';
$source = file_get_contents($product_url);
$source = mb_convert_encoding($source, 'utf8', 'auto');
$rakuten_search_html = str_get_html($source);
$count=0;
foreach ($rakuten_search_html->find('img._verticallyaligned') as $item_image) {
if(strlen($item_image->alt > 2))
{
$ss['image_url'] = $item_image->src;
$ss['title'] = $item_image->alt;
$items_kk[] = $ss;
$count++;
if($count <5)
{
break;
}
}
}
$new_item["term"] = $item["key"];
$new_item["current_count"] = $item["doc_count"];
$new_item["results"] = $terms_kk;
$new_word_array[] = $new_item;
}
var_dump($new_word_array);
I am trying to insert the url and title of the product in an array names $ss and then assign that array to $new_term["result"] .
But its not working
The error was HTTP ERROR 500
The variables $items_kk and $new_word_array seems to have not been initialized outside of the foreach loop. Try to init them as an empty array just before the loop :
$new_word_array = [];
foreach($key_doc_count as $item) {
mb_language('Japanese');
$product = $item["key"];
$product_url = 'https://search.rakuten.co.jp/search/mall/'.urlencode($product) . '/';
$source = file_get_contents($product_url);
$source = mb_convert_encoding($source, 'utf8', 'auto');
$rakuten_search_html = str_get_html($source);
$count=0;
$items_kk = [];
foreach ($rakuten_search_html->find('img._verticallyaligned') as $item_image) {
if(strlen($item_image->alt > 2))
{
$ss['image_url'] = $item_image->src;
$ss['title'] = $item_image->alt;
$items_kk[] = $ss;
$count++;
if($count <5)
{
break;
}
}
}
$new_item["term"] = $item["key"];
$new_item["current_count"] = $item["doc_count"];
$new_item["results"] = $terms_kk;
$new_word_array[] = $new_item;
}
var_dump($new_word_array);
Also, you're setting $items_kk and assigning $terms_kk to the new array. It may be an error ?

XML parsing in php

I am parsing a xml and but there is a tag which contain image and text both and i want to seprate both image and text in diffrent columns of table in my design layout but i dont know how to do it. please help me. my php file is :
<?php
$RSS_Content = array();
function RSS_Tags($item, $type)
{
$y = array();
$tnl = $item->getElementsByTagName("title");
$tnl = $tnl->item(0);
$title = $tnl->firstChild->textContent;
$tnl = $item->getElementsByTagName("link");
$tnl = $tnl->item(0);
$link = $tnl->firstChild->textContent;
$tnl = $item->getElementsByTagName("description");
$tnl = $tnl->item(0);
$img = $tnl->firstChild->textContent;
$y["title"] = $title;
$y["link"] = $link;
$y["description"] = $img;
$y["type"] = $type;
return $y;
}
function RSS_Channel($channel)
{
global $RSS_Content;
$items = $channel->getElementsByTagName("item");
// Processing channel
$y = RSS_Tags($channel, 0); // get description of channel, type 0
array_push($RSS_Content, $y);
// Processing articles
foreach($items as $item)
{
$y = RSS_Tags($item, 1); // get description of article, type 1
array_push($RSS_Content, $y);
}
}
function RSS_Retrieve($url)
{
global $RSS_Content;
$doc = new DOMDocument();
$doc->load($url);
$channels = $doc->getElementsByTagName("channel");
$RSS_Content = array();
foreach($channels as $channel)
{
RSS_Channel($channel);
}
}
function RSS_RetrieveLinks($url)
{
global $RSS_Content;
$doc = new DOMDocument();
$doc->load($url);
$channels = $doc->getElementsByTagName("channel");
$RSS_Content = array();
foreach($channels as $channel)
{
$items = $channel->getElementsByTagName("item");
foreach($items as $item)
{
$y = RSS_Tags($item, 1);
array_push($RSS_Content, $y);
}
}
}
function RSS_Links($url, $size = 15)
{
global $RSS_Content;
$page = "<ul>";
RSS_RetrieveLinks($url);
if($size > 0)
$recents = array_slice($RSS_Content, 0, $size + 1);
foreach($recents as $article)
{
$type = $article["type"];
if($type == 0) continue;
$title = $article["title"];
$link = $article["link"];
$img = $article["description"];
$page .= "$title\n";
}
$page .="</ul>\n";
return $page;
}
function RSS_Display($url, $click, $size = 8, $site = 0, $withdate = 0)
{
global $RSS_Content;
$opened = false;
$page = "";
$site = (intval($site) == 0) ? 1 : 0;
RSS_Retrieve($url);
if($size > 0)
$recents = array_slice($RSS_Content, $site, $size + 1 - $site);
foreach($recents as $article)
{
$type = $article["type"];
if($type == 0)
{
if($opened == true)
{
$page .="</ul>\n";
$opened = false;
}
$page .="<b>";
}
else
{
if($opened == false)
{
$page .= "<table width='369' border='0'>
<tr>";
$opened = true;
}
}
$title = $article["title"];
$link = $article["link"];
$img = $article["description"];
$page .= "<td width='125' align='center' valign='middle'>
<div align='center'>$img</div></td>
<td width='228' align='left' valign='middle'><div align='left'><a
href=\"$click\" target='_top'>$title</a></div></td>";
if($withdate)
{
$date = $article["date"];
$page .=' <span class="rssdate">'.$date.'</span>';
}
if($type==0)
{
$page .="<br />";
}
}
if($opened == true)
{
$page .="</tr>
</table>";
}
return $page."\n";
}
?>
To separate the image and description you need to parse the HTML that is stored inside the description element again as XML. Luckily it is valid XML inside that element, therefore you can do this straight forward with SimpleXML, the following code-example take the URL and converts each item *description* into the text only and extracts the src attribute of the image to store it as the image element:
<item>
<title>Fake encounter: BJP backs Kataria, says CBI targeting Modi</title>
<link>http://ibnlive.in.com/news/fake-encounter-bjp-backs-kataria-says-cbi-targeting-modi/391802-37-64.html</link>
<description>The BJP lashed out at the CBI and questioned its 'shoddy investigation' into the Sohrabuddin fake encounter case.</description>
<pubDate>Wed, 15 May 2013 13:48:56 +0530</pubDate>
<guid>http://ibnlive.in.com/news/fake-encounter-bjp-backs-kataria-says-cbi-targeting-modi/391802-37-64.html</guid>
<image>http://static.ibnlive.in.com/ibnlive/pix/sitepix/05_2013/bjplive_kataria3.jpg</image>
</item>
The code-example is:
$url = 'http://ibnlive.in.com/ibnrss/top.xml';
$feed = simplexml_load_file($url);
$items = $feed->xpath('(//channel/item)');
foreach ($items as $item) {
list($description, $image) =
simplexml_load_string("<r>$item->description</r>")
->xpath('(/r|/r//#src)');
$item->description = (string)$description;
$item->image = (string)$image;
}
You can then import the SimpleXML into a DOMElement with dom_import_simplexml() however honestly, I just would wrap that little HTML creation as well into a foreach of SimpleXML because you can make use of LimitIterator for the paging as well as you could with DOMDocument and the data you access is actually easily at hand with SimpleXML, it's just easy to pass along the XML elements as SimpleXMLElements instead of parsing into an array first and then processing the array. That's moot.

PHP parser to scrape data error

I'm trying to code a php parser to gather professor reviews from ratemyprofessor.com. Each professor has a page and it has all the reviews in it, I want to parse each professor's site and extract the comments into a txt file.
This is what I have so far but it doesn't excute properly when I run it because the output txt file remains empty. what can be the issue?
<?php
set_time_limit(0);
$domain = "http://www.ratemyprofessors.com";
$content = "div id=commentsection";
$content_tag = "comment";
$output_file = "reviews.txt";
$max_urls_to_check = 400;
$rounds = 0;
$reviews_stack = array();
$max_size_domain_stack = 10000;
$checked_domains = array();
while ($domain != "" && $rounds < $max_urls_to_check) {
$doc = new DOMDocument();
#$doc->loadHTMLFile($domain);
$found = false;
foreach($doc->getElementsByTagName($content_tag) as $tag) {
if (strpos($tag->nodeValue, $content)) {
$found = true;
break;
}
}
$checked_domains[$domain] = $found;
foreach($doc->getElementsByTagName('a') as $link) {
$href = $link->getAttribute('href');
if (strpos($href, 'http://') !== false && strpos($href, $domain) === false) {
$href_array = explode("/", $href);
if (count($domain_stack) < $max_size_domain_stack &&
$checked_domains["http://".$href_array[2]] === null) {
array_push($domain_stack, "http://".$href_array[2]);
}
};
}
$domain_stack = array_unique($domain_stack);
$domain = $domain_stack[0];
unset($domain_stack[0]);
$domain_stack = array_values($domain_stack);
$rounds++;
}
$found_domains = "";
foreach ($checked_domains as $key => $value) {
if ($value) {
$found_domains .= $key."\n";
}
}
file_put_contents($output_file, $found_domains);
?>
This is what I have so far but it doesn't excute properly when I run it because the output txt file remains empty. what can be the issue?
It gives empty output since there is a lack of array variable initialization.
Main part. Add an initialization of variable:
$domain_stack = array(); // before while ($domain != ...... )
Additional. Fix other warnings and notices:
// change this
$checked_domains["http://".$href_array[2]] === null
// into
!isset($checked_domains["http://".$href_array[2]])
// another line
// check if key exists
if (isset($domain_stack[0])) {
$domain = $domain_stack[0];
unset($domain_stack[0]);
}

One result array

I'm trying to add the results of a script to an array, but once I look into it there is only one item in it, probably me being silly with placement
function crawl_page($url, $depth)
{
static $seen = array();
$Linklist = array();
if (isset($seen[$url]) || $depth === 0) {
return;
}
$seen[$url] = true;
$dom = new DOMDocument('1.0');
#$dom->loadHTMLFile($url);
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $element) {
$href = $element->getAttribute('href');
if (0 !== strpos($href, 'http')) {
$href = rtrim($url, '/') . '/' . ltrim($href, '/');
}
if(shouldScrape($href)==true)
{
crawl_page($href, $depth - 1);
}
}
echo "URL:",$url;
echo http_response($url);
echo "<br/>";
$Linklist[] = $url;
$XML = new DOMDocument('1.0');
$XML->formatOutput = true;
$root = $XML->createElement('Links');
$root = $XML->appendChild($root);
foreach ($Linklist as $value)
{
$child = $XML->createElement('Linkdetails');
$child = $root->appendChild($child);
$text = $XML->createTextNode($value);
$text = $child->appendChild($text);
}
$XML->save("linkList.xml");
}
$Linklist[] = $url; will add a single item to the $Linklist array. This line needs to be in a loop I think.
static $Linklist = array(); i think, but code is awful

Categories