I am trying to extract the news headlines and the link (href) of each headline using the code below, but the link extraction is not working: it only gets the headline. Please help me find out what's wrong with the code.
Link to the page from which I want to get the headlines and links:
http://web.tmxmoney.com/news.php?qm_symbol=BCM
<?php
// Print every news headline on the page together with the link it points to.
$data = file_get_contents('http://web.tmxmoney.com/news.php?qm_symbol=BCM');
if ($data === false || $data === '') {
    exit('Error: could not download the news page');
}
$dom = new DOMDocument();
$dom->preserveWhiteSpace = true;
// Real-world HTML is rarely valid: collect the parser warnings instead of printing them
$previousErrorMode = libxml_use_internal_errors(true);
$dom->loadHTML($data);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);
$xpath = new DOMXPath($dom);
$rows = $xpath->query('//div');
foreach ($rows as $row) {
    $titleSpan = $row->getElementsByTagName('span')->item(0);
    if ($titleSpan === null) {
        continue; // this <div> contains no <span>, so there is no headline to print
    }
    $newstitle = $titleSpan->nodeValue;
    // Look for the anchor inside the headline <span> first, then anywhere in the row
    $anchor = $titleSpan->getElementsByTagName('a')->item(0);
    if ($anchor === null) {
        $anchor = $row->getElementsByTagName('a')->item(0);
    }
    $link = $anchor !== null ? $anchor->getAttribute('href') : '';
    echo $newstitle . '<br>';
    echo $link . '<br><br>';
}
?>
Thanks in advance for your help!
Try to do this:
<?php
// List every absolute, well-formed link found in the page body.
$data = file_get_contents('http://web.tmxmoney.com/news.php?qm_symbol=BCM');
if ($data === false || $data === '') {
    exit('Error: could not download the news page');
}
$dom = new DOMDocument();
// Collect the parser warnings caused by invalid markup instead of printing them
$previousErrorMode = libxml_use_internal_errors(true);
$dom->loadHTML($data);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);
$xpath = new DOMXPath($dom);
$hrefs = $xpath->query('/html/body//a');
foreach ($hrefs as $href) {
    // Strip characters that are illegal in a URL, then keep only values that validate
    $url = filter_var($href->getAttribute('href'), FILTER_SANITIZE_URL);
    if (filter_var($url, FILTER_VALIDATE_URL) !== false) {
        echo ''.$url.'<br />';
    }
}
?>
I have found the solution. Here it goes:
<?php
// Print headline, source and link for every <div> that contains a headline.
$data = file_get_contents('http://web.tmxmoney.com/news.php?qm_symbol=BCM');
if ($data === false || $data === '') {
    exit('Error: could not download the news page');
}
$dom = new DOMDocument();
$dom->preserveWhiteSpace = true;
// Collect the parser warnings caused by invalid markup instead of printing them
$previousErrorMode = libxml_use_internal_errors(true);
$dom->loadHTML($data);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);
$xpath = new DOMXPath($dom);
$rows = $xpath->query('//div');
foreach ($rows as $row) {
    $cols2 = $row->getElementsByTagName('span');
    $titleSpan = $cols2->item(0);
    if ($titleSpan === null) {
        continue; // no <span> in this <div>, so it is not a news row
    }
    $sourceSpan = $cols2->item(1);
    $anchor = $row->getElementsByTagName('a')->item(0);

    $title = $titleSpan->nodeValue;
    $source = $sourceSpan !== null ? $sourceSpan->nodeValue : '';
    $link = $anchor !== null ? $anchor->getAttribute('href') : '';
    echo $title . '<br>';
    echo $source . '<br>';
    echo $link . '<br><br>';
}
?>
I want to create an RSS feed in PHP based on other RSS feeds. That works.
But now I want to change how many items are shown in the newly created feed, based on a calculation over the items of another feed.
I use this script for the calculation, which works fine:
<?php
// Count the child tags of every <item> in the feed and derive $artikel from that total.
$url = 'http://www.nu.nl/rss/Algemeen';
$xml = simplexml_load_file($url);
$tags = array();
// simplexml_load_file() returns false when the feed cannot be loaded; $tags then stays empty
if ($xml !== false) {
    foreach ($xml->channel->item as $item) {
        foreach ($item->children() as $node) {
            $tags[] = $node->getName(); // tag name of each child of the item
        }
    }
}
$test = count($tags);
$count = array_count_values($tags); // occurrences per tag name

$mystring = count($tags);
$findme = '3';
// Position of the digit '3' in the total, or false when it does not occur
$pos = strpos((string) $mystring, $findme);

$artikel = ($mystring / 5);
echo $artikel;
?>
I use this script, based on MagpieRSS, for creating a new feed:
<?php
/**
 * Site settings for the combined feed
 */
$DOMAIN_NAME = 'http://sitename.nl/';
$FEED_URL = "{$DOMAIN_NAME}rss/full.php";
$SITE_TITLE = 'test';
$SITE_DESRIPTION = '-';
$SITE_AUTHOR = 'test2';

// Cache directory and cache lifetime handed to MagpieRSS through the constants below
$RSS_CACHE = '/tmp/rsscache';
$RSS_CACHE_EXP = 3600;

// Source feeds that are merged into the new feed
$FEED_LIST = [
    'http://www.nu.nl/rss/Economie',
    'http://www.nu.nl/rss/Internet',
];

/**
 * Nothing below this point needs to be edited
 */
define('MAGPIE_OUTPUT_ENCODING', 'utf-8');
define('MAGPIE_CACHE_DIR', $RSS_CACHE);
define('MAGPIE_CACHE_AGE', $RSS_CACHE_EXP);

// Libraries: the MagpieRSS fetcher and the FeedCreator writer
require_once 'magpierss-0.72/rss_fetch.inc';
include 'feedcreator.class.php';

/* Describe the outgoing feed */
$rss = new UniversalFeedCreator();
$rss->useCached();
$rss->encoding = 'utf8';
$rss->title = $SITE_TITLE;
$rss->description = $SITE_DESRIPTION;
$rss->link = $DOMAIN_NAME;
$rss->syndicationURL = $FEED_URL;

/* Optional feed image, currently disabled ($SITE_LOG_URL is not defined above)
$image = new FeedImage();
$image->title = $SITE_TITLE . " Logo";
$image->url = $SITE_LOG_URL;
$image->link = $DOMAIN_NAME;
$image->description = "Feed provided by " . $SITE_TITLE . ". Click to visit.";
$rss->image = $image;
*/
/**
 * Fetch one remote feed and append its first entries to the combined $rss feed.
 *
 * @param string $url          Address of the source RSS feed.
 * @param int    $num          Maximum number of entries to copy from this feed.
 * @param bool   $showfullfeed Kept for backward compatibility; it has no effect. The item
 *                             description is always taken from content:encoded, falling back
 *                             to the plain description when that is missing.
 * @return void  Prints an error line when the feed cannot be fetched.
 */
function showSummary($url, $num = 10, $showfullfeed = false) {
    global $rss, $DOMAIN_NAME, $SITE_AUTHOR;

    $rss1 = fetch_rss($url);
    if (!$rss1) {
        echo "Error: Cannot fetch feed url - " . $url;
        return;
    }

    // Cast so that a computed (possibly fractional) count can be passed in directly
    $items = array_slice($rss1->items, 0, (int) $num);
    foreach ($items as $item) {
        if (isset($item['content']['encoded'])) {
            $desc = $item['content']['encoded'];
        } elseif (isset($item['description'])) {
            $desc = $item['description'];
        } else {
            $desc = '';
        }

        $rss_item = new FeedItem();
        $rss_item->title = $item['title'];
        $rss_item->link = $item['link'];
        $rss_item->description = $desc;
        $rss_item->date = isset($item['pubdate']) ? $item['pubdate'] : null;
        $rss_item->source = $DOMAIN_NAME;
        $rss_item->author = $SITE_AUTHOR;
        $rss->addItem($rss_item);
    }
}
// Pull every configured source feed into the combined feed
foreach ($FEED_LIST as $feedUrl) {
    showSummary($feedUrl);
}

// Comparator for usort(): orders items from newest to oldest by their date
function __usort($ad, $bd) {
    $secondTimestamp = strtotime($bd->date);
    $firstTimestamp = strtotime($ad->date);
    return $secondTimestamp - $firstTimestamp;
}
usort($rss->items, '__usort');

// Write the merged feed to the cache directory
$rss->saveFeed("RSS1.0", $RSS_CACHE . "/feed.xml");
How can I let the variable $artikel decide how many items are shown in the feed?
I am pulling data from a page, and I know this is a long process depending on the date being pulled. After 132 seconds of pulling the data, the page times out.
I have set set_time_limit(0); and ignore_user_abort(true); - I am not sure what else to do to keep the script alive and pull all the data.
I have added the code below in case there is something I can do to speed it up.
// Report and display every error while the scraper runs
error_reporting(-1);
ini_set('display_errors', 'On');

// No execution time limit, and keep running when the client disconnects
ignore_user_abort(true);
set_time_limit(0);

// Project helpers and the database connection
include '../include/class.php';
include '../include/db.php';
//the below will get the list of id's for each race that day
/**
 * Download a URL and return the response body. Redirects are followed.
 *
 * @param string $url     Address to fetch.
 * @param int    $timeout Maximum number of seconds the whole transfer may take.
 * @return string|false   Response body, or false when the transfer fails (a warning is raised).
 */
function curl($url, $timeout = 60){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    // Bound every request so that one stalled server cannot hang the whole run
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    $data = curl_exec($ch);
    if ($data === false) {
        // Surface the reason instead of silently handing back false
        trigger_error('curl: ' . curl_error($ch) . ' (' . $url . ')', E_USER_WARNING);
    }
    curl_close($ch);
    return $data;
}
$url = "http://form.timeform.betfair.com/daypage?date=20150516"; //WILL NEED TO PULL TOMORROWS DATE AS DD-MM-YYY
$html = curl($url);
$dom = new DOMDocument();
$dom->preserveWhiteSpace = false;
if (is_string($html) && $html !== '') {
    // Collect the parser warnings caused by invalid markup instead of printing them
    $previousErrorMode = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);
} else {
    // Leave the document empty: the query below then matches nothing
    echo PHP_EOL . "FAIL: could not download $url";
}
$xpath = new DOMXPath($dom);
//pull the individual cards for the day
$getdropdown = '//div[contains(@data-location, "RACING_COUNTRY_GB_IE")]//div[contains(@class, "course")]';
$getdropdown2 = $xpath->query($getdropdown);
// Walks every course card in $getdropdown2, downloads each linked race result page and
// stores one Race_Records row per runner.
// NOTE(review): GetBetween() and miletofurlong() are not defined in this part of the script —
// presumably they come from ../include/class.php; confirm.

// Prepared once and re-executed for every runner, so values are never spliced into the SQL text
$insertRunner = mysqli_prepare($db, "
    INSERT INTO `Race_Records`
    (`Venue`, `DateandTime`, `Going`, `Distance`, `Age`, `PrizeMoney`, `Runners`, `RaceType`,
     `Position`, `Draw`, `LosingDist`, `Horse`, `HorseAge`, `Weight`, `OR`, `EQ`, `Jockey`,
     `Trainer`, `InRunningLow`, `InRunningHigh`, `BFSP`, `ISP`, `PlaceOdds`, `RaceName`)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
");
if (!$insertRunner) {
    trigger_error(mysqli_error($db), E_USER_ERROR);
}

// Follows a path of childNodes indexes from $node and returns the text found there,
// or '' when any step of the path is missing
$textAt = function ($node, array $path) {
    foreach ($path as $index) {
        if ($node === null || $node->childNodes === null) {
            return '';
        }
        $node = $node->childNodes->item($index);
    }
    return $node === null ? '' : $node->textContent;
};

//loop through each individual card
foreach ($getdropdown2 as $courseCard) {
    //loop through and get all the a tags
    foreach ($courseCard->getElementsByTagName("a") as $item) {
        //only grab the links which point to the results page
        $href = $item->getAttribute('href');
        if (strpos($href, 'raceresult') === false) {
            continue;
        }
        //grab the code
        $code = explode("=", $href);
        $code = end($code);
        $url = "http://form.timeform.betfair.com/raceresult?raceId=" . $code;
        $html = curl($url);
        if (!is_string($html) || $html === '') {
            echo PHP_EOL . "FAIL: could not download $url";
            continue;
        }
        $raceDom = new DOMDocument();
        $raceDom->preserveWhiteSpace = false;
        // Collect the parser warnings caused by invalid markup instead of printing them
        $previousErrorMode = libxml_use_internal_errors(true);
        $raceDom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
        $raceXpath = new DOMXPath($raceDom);

        //get the place name
        $venue = '';
        foreach ($raceXpath->query('//span[contains(@class, "locality")]') as $localityNode) {
            echo "Venue: " . $venue = $localityNode->textContent;
        }

        //get the Date and the Time
        $Dateandtime = '';
        foreach ($raceXpath->query('//abbr [contains(@class, "dtstart")]') as $startNode) {
            // NOTE(review): the title attribute is passed to date() as the FORMAT string, not as the
            // time to format — kept unchanged, but confirm this is what was intended
            echo "Date : " . $Dateandtime = date(trim($startNode->getAttribute('title')), strtotime('+5 hours'));
        }

        //pull the data for the race e.g going money ect
        // Reset per race so a page without the detail spans cannot reuse the previous race's values
        $racename = $going = $distance = $age = $prizemoney = $runners = $racetype = '';
        $spanTexts = array();
        foreach ($raceXpath->query('//div[contains(@class, "content")]/p') as $paragraph) {
            $racename = trim($textAt($paragraph, array(0)));
            foreach ($paragraph->childNodes as $node) {
                if ($node->nodeType === XML_ELEMENT_NODE && strtolower($node->tagName) === 'span') {
                    $spanTexts[] = (string) $node->textContent;
                }
            }
            // Spans accumulate across paragraphs; the details are only decoded once six are available
            if (count($spanTexts) < 6) {
                continue;
            }
            list($going, $distance, $age, $prizemoney, $runners, $racetype) = $spanTexts;
            $going = str_replace(array(
                'Â',
                'Going:',
                '|'
            ), '', $going);
            $distance = miletofurlong(trim(GetBetween($distance, ':', 'Â')));
            $age = trim(GetBetween($age, ':', 'Â'));
            $prizemoney = trim(GetBetween($prizemoney, '£', 'Â'));
            $runners = trim(GetBetween($runners, ':', 'Â'));
            $racetype = trim(GetBetween($racetype, ':', 'Â'));
        }

        //pull the individual horse data
        foreach ($raceXpath->query('//div[contains(@class, "table-container")]//tbody//tr') as $runnerRow) {
            $position = $textAt($runnerRow, array(0, 1));
            $draw = str_replace(array('(', ')'), '', $textAt($runnerRow, array(0, 3)));
            $losingdist = str_replace('Â', '', trim($textAt($runnerRow, array(2))));
            // Turn the fraction glyphs into decimals
            $losingdist = str_replace(array('¾', '½', '¼'), array('.75', '.5', '.25'), $losingdist);
            // Apostrophes and bracketed suffixes are removed from the horse name
            $horse = trim(preg_replace("/\([^\)]+\)/", "", str_replace("'", "", trim($textAt($runnerRow, array(4))))));
            $horseage = trim($textAt($runnerRow, array(6)));
            $weight = trim($textAt($runnerRow, array(8, 1)));
            $or = str_replace(array('(', ')'), '', trim($textAt($runnerRow, array(8, 3))));
            $eq = str_replace('-', '', trim($textAt($runnerRow, array(10))));
            $jockey = trim($textAt($runnerRow, array(12, 1)));
            $trainer = trim($textAt($runnerRow, array(12, 4)));
            // In-running odds are written as "high/low"
            $highandlowinrunning = trim($textAt($runnerRow, array(14, 1)));
            $highandlow = explode("/", $highandlowinrunning);
            $highodds = str_replace('-', '', trim($highandlow[0]));
            $lowodds = isset($highandlow[1]) ? str_replace('-', '', trim($highandlow[1])) : '';
            $bfsp = trim($textAt($runnerRow, array(16, 1)));
            $isp = trim(str_replace('/', '', $textAt($runnerRow, array(16, 3))));
            $placeodds = trim($textAt($runnerRow, array(18)));

            // Same order as the column list of the prepared INSERT
            $params = array(
                $venue, $Dateandtime, $going, $distance, $age, $prizemoney, $runners, $racetype,
                $position, $draw, $losingdist, $horse, $horseage, $weight, $or, $eq, $jockey,
                $trainer, $lowodds, $highodds, $bfsp, $isp, $placeodds, $racename
            );
            mysqli_stmt_bind_param($insertRunner, str_repeat('s', count($params)), ...$params);
            if (!mysqli_stmt_execute($insertRunner)) {
                echo PHP_EOL . "FAIL: could not insert runner $horse";
                trigger_error(mysqli_stmt_error($insertRunner), E_USER_ERROR);
            }
        }
    }
}
mysqli_stmt_close($insertRunner);
// Downloads the results index for the day, follows every link that mentions one of the
// known courses, and stores each runner's comment on its Race_Records row.
// NOTE(review): $id and $listofcorses are not defined in this part of the script —
// presumably they are set by an include; confirm.

// Prepared once: attaches a comment to the rows stored under the given horse name
$updateComment = $db->prepare("UPDATE Race_Records SET comments = ? WHERE Horse = ?");
if (!$updateComment) {
    trigger_error($db->error, E_USER_ERROR);
}

// Parses downloaded markup into an XPath handle; returns null when nothing was downloaded
$xpathFor = function ($markup) {
    if (!is_string($markup) || $markup === '') {
        return null;
    }
    $document = new DOMDocument();
    $document->preserveWhiteSpace = false;
    // Collect the parser warnings caused by invalid markup instead of printing them
    $previousErrorMode = libxml_use_internal_errors(true);
    $document->loadHTML($markup);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);
    return new DOMXPath($document);
};

$id = date_create($id);
if ($id === false) {
    trigger_error('Cannot build the results URL: invalid date', E_USER_ERROR);
}
$theid2 = date_format($id, "d-m-Y");
$url = "http://www.sportinglife.com/racing/results/" . $theid2; //WILL NEED TO PULL TOMORROWS DATE AS DD-MM-YYY
$resultsXpath = $xpathFor(curl($url));
$cards = $resultsXpath === null
    ? array()
    : $resultsXpath->query('//li[contains(@class, "rac-cards")]//div[contains(@class, "ix ixv")]');

//loop through each individual card
foreach ($cards as $card) {
    //loop through and get all the a tags
    foreach ($card->getElementsByTagName("a") as $item) {
        $getcomments = $item->getAttribute('href');
        //only follow links that mention one of the known courses
        foreach ($listofcorses as $bad) {
            if (strstr(strtolower($getcomments), strtolower($bad)) === false) {
                continue;
            }
            $raceXpath = $xpathFor(curl("http://www.sportinglife.com/" . $getcomments));
            if ($raceXpath === null) {
                break; // download failed; move on to the next link
            }
            foreach ($raceXpath->query('//table') as $table) {
                $rows = $table->childNodes->item(11);
                if ($rows === null) {
                    continue; // this table does not have the expected layout
                }
                // get how many non-text child nodes there are
                $count = 0;
                foreach ($rows->childNodes as $node) {
                    if (!($node instanceof \DOMText)) {
                        $count++;
                    }
                }
                //even positions hold the horse name, the following odd position holds its comment
                $check = "no";
                for ($i = 0; $i < $count; $i++) {
                    $row = $rows->childNodes->item($i);
                    if ($i % 2 == 0) {
                        $nameCell = ($row !== null && $row->childNodes !== null) ? $row->childNodes->item(4) : null;
                        if ($nameCell != null) {
                            // Bracketed suffixes and anything other than letters and spaces are removed
                            $horse = trim(preg_replace("/[^A-Za-z ]+/", "", preg_replace("/\([^\)]+\)/", "", trim($nameCell->textContent))));
                            $check = "ok";
                        } else {
                            $check = "no";
                        }
                    } elseif ($check == "ok" && $row !== null) {
                        $comments = trim($row->textContent);
                        //update the database
                        $updateComment->bind_param('ss', $comments, $horse);
                        if (!$updateComment->execute()) {
                            trigger_error($updateComment->error, E_USER_WARNING);
                        }
                    }
                }
            }
            // One download per link is enough, even when several course names match it
            break;
        }
    }
}
$updateComment->close();
?>
You could try setting curl's timeout
// Limit the whole transfer on handle $ch to 1000 seconds; set this before curl_exec() is called
curl_setopt($ch,CURLOPT_TIMEOUT,1000);
You might also want to check whether the services you are accessing in the loop are rate-limited, and if so put an appropriate sleep in the loop to make sure you aren't making too many requests to the service in consecutive cycles; it could well be that the code is running OK but then timing out after a number of HTTP requests to the remote service.
Set max execution time
// Put this at the start of your PHP code: it sets the max_execution_time ini value for this script
ini_set('max_execution_time',300); // 60 s * 5 = 300 s, i.e. 5 minutes
How can I get all the attributes of an element? In my example below I can only get one at a time; I want to pull out all of the anchor tag's attributes.
$dom = new DOMDocument();
// loadHTMLFile() accepts a path or URL; loadHTML() expects the markup itself
$previousErrorMode = libxml_use_internal_errors(true);
$dom->loadHTMLFile('http://www.example.com');
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);
// getElementsByTagName() returns a list, so take the first <a> before reading its attributes
$a = $dom->getElementsByTagName("a")->item(0);
echo $a !== null ? $a->getAttribute('href') : '';
thanks!
// Build a name => value map of every attribute on the element $a, then print it
$attrs = array();
foreach ($a->attributes as $attribute) {
    $name = $attribute->name;
    $attrs[$name] = $a->getAttribute($name);
}
print_r($attrs);
"Inspired" by Simon's answer. I think you can cut out the getAttribute call, so here's a solution without it:
// Same map as above, read straight from each attribute node instead of via getAttribute()
$attrs = array();
$attributeCount = $a->attributes->length;
$i = 0;
while ($i < $attributeCount) {
    $node = $a->attributes->item($i);
    $attrs[$node->nodeName] = $node->nodeValue;
    $i++;
}
var_dump($attrs);
// Print the href of every <a> element in the document
$a = $dom->getElementsByTagName("a");
for ($i = 0; $i < $a->length; $i++) {
    $element = $a->item($i);
    echo $element->getAttribute('href');
}
// Append the class attribute of every <input> in $data['html'] to $class, separated by spaces
$html = isset($data['html']) ? $data['html'] : '';
if (!isset($class)) {
    $class = ''; // start from an empty list when nothing has been collected yet
}
if (!empty($html)) {
    $doc = new DOMDocument();
    // HTML fragments are rarely valid documents: collect parser warnings instead of printing them
    $previousErrorMode = libxml_use_internal_errors(true);
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);
    $datadom = $doc->getElementsByTagName("input");
    foreach ($datadom as $element) {
        $class = $class . " " . $element->getAttribute('class');
    }
}