Ok, so preg_match_all won't work against Yahoo.
I'm trying to preg_match_all the results I get from Yahoo using cURL's curl_multi_getcontent method.
I have succeeded in fetching the site, but when I try to get the links out of the result, it won't match anything. When I use the regex in Notepad++ it succeeds, but apparently not in PHP.
I'm currently using:
preg_match_all(
    '#<span class="url" id="(.*?)">(.+?)</span>#si', $urlContents[2], $yahoo
);
Check the HTML at http://se.search.yahoo.com/search?p=random&toggle=1&cop=mss&ei=UTF-8&fr=yfp-t for example and you will see that all links start with <span class="url" id="something random"> and end with </span>.
Could someone possibly help me with how I should retrieve this information?
I only need the actual link address to each result.
Entire PHP Script
public function multiSearch($question)
{
    $sites['google'] = "http://www.google.com/search?q={$question}&gl=sv";
    $sites['bing']   = "http://www.bing.com/search?q={$question}";
    $sites['yahoo']  = "http://se.search.yahoo.com/search?p={$question}";

    $urlHandler = array();
    foreach ($sites as $site)
    {
        $handler = curl_init();
        curl_setopt($handler, CURLOPT_URL, $site);
        curl_setopt($handler, CURLOPT_HEADER, 0);
        curl_setopt($handler, CURLOPT_RETURNTRANSFER, 1);
        array_push($urlHandler, $handler);
    }

    $multiHandler = curl_multi_init();
    foreach ($urlHandler as $key => $url)
    {
        curl_multi_add_handle($multiHandler, $url);
    }

    $running = null;
    do
    {
        curl_multi_exec($multiHandler, $running);
    }
    while ($running > 0);

    $urlContents = array();
    foreach ($urlHandler as $key => $url)
    {
        $urlContents[$key] = curl_multi_getcontent($url);
    }

    foreach ($urlHandler as $key => $url)
    {
        curl_multi_remove_handle($multiHandler, $url);
    }

    foreach ($urlContents as $urlContent)
    {
        preg_match_all('/<li class="g">(.*?)<\/li>/si', $urlContent, $matches);
        //$this->view_data['results'][] = "Random";
    }

    preg_match_all('#<cite>(.+?)</cite>#si', $urlContents[1], $googleLinks);
    preg_match_all('#<span class="url" id="(.*)">(.+?)</span>#si', $urlContents[2], $yahoo);
    var_dump($yahoo);
    die();

    $findHtml = array('/<cite>/', '/<\/cite>/', '/<b>/', '/<\/b>/', '/ /', '/"/', '/<strong>/', '/<\/strong>/');
    $removeHtml = array('', '', '', '', '', '', '', '');
    foreach ($googleLinks as $links => $val)
    {
        foreach ($val as $link)
            $this->view_data['results'][] = preg_replace($findHtml, $removeHtml, $link);
        break;
    }
}
First off, you should not use regular expressions to process HTML. There are pretty good DOM parsers available for PHP. For example:
$d = new DOMDocument;
$d->loadHTML($s);
$x = new DOMXPath($d);
foreach ($x->query('//span[@class="url"]') as $node) {
    // process each node the way you wish
    // print the id for instance
    echo $node->getAttribute('id'), PHP_EOL;
}
Besides that, the expression should work except that id="(.*)" is greedy; that can be fixed with:
#<span class="url" id="(.*?)">(.+?)</span>#si
It's possible that there's more text between id="..." and the closing >; that would bring the expression to:
#<span class="url" id="(.*?)"[^>]*>(.+?)</span>#si
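For completeness, a usage sketch with the fixed pattern; it assumes $urlContents[2] still holds the Yahoo HTML as in the script above, and that the visible link text is what you want:
preg_match_all('#<span class="url" id="(.*?)"[^>]*>(.+?)</span>#si', $urlContents[2], $yahoo);
foreach ($yahoo[2] as $linkHtml) {
    // the span content may wrap parts of the URL in <b> tags
    echo strip_tags($linkHtml), PHP_EOL;
}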
Related
I am trying to add a comma and whitespace to some data I am scraping from a website. The data scrapes successfully, but the items are muddled up together, and the space and comma I am trying to add only get added to the last item. Here is the code I currently have:
$html = curl_exec($ch);
$dom = new DOMDocument();
@$dom->loadHTML($html);
$finder = new DomXPath($dom);
$class_ops = 'ipc-inline-list ';
$class_opp = 'ipc-inline ';
$node = $finder->query("//div[@class='$class_ops']//ul[@class='$class_opp']");
foreach ($node as $index => $t) {
    if ($index == 3) {
        $la = $t->textContent.", ";
    }
}
echo $la;
Current Result
DoyleBrainDavid,
Expected Result
Doyle, Brain, David
I am using this code
$c1 = curl_init('https://stackoverflow.com/');
curl_setopt($c1, CURLOPT_RETURNTRANSFER, true);
$html = curl_exec($c1);
if (curl_error($c1))
    die(curl_error($c1));
// Get the status code
$status = curl_getinfo($c1, CURLINFO_HTTP_CODE);
curl_close($c1);
preg_match_all('/<span(.*?)<\/span>/s', $html, $matches1);
foreach ($matches1[0] as $k => $v) {
    $enc = mb_detect_encoding($v);
    $v = mb_convert_encoding($v, $enc, "UTF-8");
    $match1[$k] = strip_tags($v);
    //$match1[$k] = preg_replace('/^[^A-Za-z0-9]+/', '', $match1[$k]);
}
var_dump($match1);
In your case you can replace the pattern like this:
preg_match_all('/<div class="ipc-inline-list">(.*?)<\/div>/s', $html, $matches1);
This returns an array with the matches.
I hope this is helpful for you.
You want each li, not the ul as one block. Try:
$node = $finder->query("//div[@class='$class_ops']//ul[@class='$class_opp']/li");
Demo: https://3v4l.org/Mvfud
If that doesn't work the actual HTML content should be added to the question.
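If the li query does work, a small follow-up sketch (assuming the same $finder, $class_ops and $class_opp from the question) would collect each name and join them with a comma:
$names = array();
foreach ($finder->query("//div[@class='$class_ops']//ul[@class='$class_opp']/li") as $li) {
    $names[] = trim($li->textContent);
}
echo implode(', ', $names); // e.g. "Doyle, Brain, David"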
I have a simple script that until yesterday had worked fine for 2 years. I'm just taking an XML feed from a WP site and formatting it to be displayed on a different website. Here is the code:
<?php
function download_page($path){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $path);
    curl_setopt($ch, CURLOPT_FAILONERROR, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 15);
    $retValue = curl_exec($ch);
    curl_close($ch);
    return $retValue;
}
$sXML = download_page('https://example.com/tradeblog/feed/atom/');
$oXML = new SimpleXMLElement($sXML);
$items = $oXML->entry;
$i = 0;
foreach ($items as $item) {
    $title = $item->title;
    $link = $item->link;
    echo '<li>';
    foreach ($link as $links) {
        $loc = $links['href'];
        $href = str_replace("/feed/atom/", "", $loc);
        echo "<a href=\"$href\" target=\"_blank\">";
    }
    echo $title;
    echo "</a>";
    echo "</li>";
    if (++$i == 3) break;
}
?>
I can echo out $sXML and it will display the entire XML contents as expected. When I try and echo $oXML I get the 500 error. Any use of $oXML causes the 500. What changed? Is there a different / better way to do this using PHP?
It seems your XML source is not valid XML. I tried to validate it using the W3Schools validator and it throws an error. Tried here too, and got the same error.
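One way to see the actual parse errors instead of a bare 500 is to let libxml collect them; this is only a diagnostic sketch, reusing the $sXML string from the question:
libxml_use_internal_errors(true);
$oXML = simplexml_load_string($sXML);
if ($oXML === false) {
    foreach (libxml_get_errors() as $error) {
        echo trim($error->message), ' (line ', $error->line, ")\n";
    }
    libxml_clear_errors();
}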
Not sure why, but this worked
<?php
$rss = new DOMDocument();
$rss->load('https://example.com/tradeblog/feed/rss2/');
$feed = array();
foreach ($rss->getElementsByTagName('item') as $node) {
    $item = array (
        'title' => $node->getElementsByTagName('title')->item(0)->nodeValue,
        'link'  => $node->getElementsByTagName('link')->item(0)->nodeValue,
    );
    array_push($feed, $item);
}
$limit = 3;
for ($x = 0; $x < $limit; $x++) {
    $title = str_replace(' &amp; ', ' & ', $feed[$x]['title']); // the entity was lost in the original post; this is the likely intent
    $link = $feed[$x]['link'];
    echo '<li>'.$title.'</li>';
}
?>
I'm working on an app that gets all the URLs from an array of sites and displays them in array form or as JSON.
I can do it using a for loop; the problem is the execution time: when I tried 10 URLs it gave me an error saying the maximum execution time was exceeded.
Upon searching I found this multi curl.
I also found this: Fast PHP CURL Multiple Requests: Retrieve the content of multiple URLs using CURL. I tried to add my code but it didn't work because I don't know how to use the function.
Hope you can help me.
Thanks.
This is my sample code.
<?php
$urls = array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/');

$mh = curl_multi_init();
foreach ($urls as $i => $url) {
    $urlContent = file_get_contents($url);
    $dom = new DOMDocument();
    @$dom->loadHTML($urlContent);
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");
    for ($i = 0; $i < $hrefs->length; $i++) {
        $href = $hrefs->item($i);
        $url = $href->getAttribute('href');
        $url = filter_var($url, FILTER_SANITIZE_URL);
        // validate url
        if (!filter_var($url, FILTER_VALIDATE_URL) === false) {
            echo ''.$url.'<br />';
        }
    }
    $conn[$i] = curl_init($url);
    $fp[$i] = fopen($g, "w");
    curl_setopt($conn[$i], CURLOPT_FILE, $fp[$i]);
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    curl_multi_add_handle($mh, $conn[$i]);
}
do {
    $n = curl_multi_exec($mh, $active);
}
while ($active);
foreach ($urls as $i => $url) {
    curl_multi_remove_handle($mh, $conn[$i]);
    curl_close($conn[$i]);
    fclose($fp[$i]);
}
curl_multi_close($mh);
?>
Here is a function that I put together that will properly utilize the curl_multi_init() function. It is more or less the same function that you will find on PHP.net with some minor tweaks. I have had great success with this.
function multi_thread_curl($urlArray, $optionArray, $nThreads) {

    // Group your urls into groups/threads.
    $curlArray = array_chunk($urlArray, $nThreads, true);

    // Iterate through each batch of urls.
    $ch = 'ch_';
    foreach ($curlArray as $threads) {

        // Create your cURL resources.
        foreach ($threads as $thread => $value) {
            ${$ch . $thread} = curl_init();
            curl_setopt_array(${$ch . $thread}, $optionArray); // Set your main curl options.
            curl_setopt(${$ch . $thread}, CURLOPT_URL, $value); // Set url.
        }

        // Create the multiple cURL handler.
        $mh = curl_multi_init();

        // Add the handles.
        foreach ($threads as $thread => $value) {
            curl_multi_add_handle($mh, ${$ch . $thread});
        }

        $active = null;

        // Execute the handles.
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        while ($active && $mrc == CURLM_OK) {
            if (curl_multi_select($mh) != -1) {
                do {
                    $mrc = curl_multi_exec($mh, $active);
                } while ($mrc == CURLM_CALL_MULTI_PERFORM);
            }
        }

        // Get your data and close the handles.
        foreach ($threads as $thread => $value) {
            $results[$thread] = curl_multi_getcontent(${$ch . $thread});
            curl_multi_remove_handle($mh, ${$ch . $thread});
        }

        // Close the multi handle exec.
        curl_multi_close($mh);
    }

    return $results;
}
//Add whatever options here. The CURLOPT_URL is left out intentionally.
//It will be added in later from the url array.
$optionArray = array(
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0', //Pick your user agent.
    CURLOPT_RETURNTRANSFER => TRUE,
    CURLOPT_TIMEOUT        => 10
);
//Create an array of your urls.
$urlArray = array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/'
);
//Play around with this number and see what works best.
//This is how many urls it will try to do at one time.
$nThreads = 20;
//To use it, run the function.
$results = multi_thread_curl($urlArray, $optionArray, $nThreads);
Once this is complete you will have an array containing all of the html from your list of websites. It is at this point where I would loop through them and pull out all of the urls.
Like so:
foreach ($results as $page) {
    $dom = new DOMDocument();
    @$dom->loadHTML($page);
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");
    for ($i = 0; $i < $hrefs->length; $i++) {
        $href = $hrefs->item($i);
        $url = $href->getAttribute('href');
        $url = filter_var($url, FILTER_SANITIZE_URL);
        // validate url
        if (!filter_var($url, FILTER_VALIDATE_URL) === false) {
            echo ''.$url.'<br />';
        }
    }
}
It is also worth keeping in the back of your head the ability to increase the run time of your script.
If you're using a hosting service, you may be restricted to something in the ballpark of two minutes regardless of what you set your max execution time to. Just food for thought.
This is done by:
ini_set('max_execution_time', 120);
You can always try more time but you'll never know till you time it.
Hope it helps.
You may be using an endless loop - if not, you can increase the maximum execution time in php.ini or with:
ini_set('max_execution_time', 600); // 600 seconds = 10 minutes
This is what I achieved after working on the code. It worked, but I'm not sure if this is the best answer. Kindly check my code.
<?php
$array = array(
    'https://www.google.com/', 'https://www.google.com/', 'https://www.google.com/',
    'https://www.google.com/', 'https://www.google.com/', 'https://www.google.com/',
    'https://www.google.com/', 'https://www.google.com/', 'https://www.google.com/',
    'https://www.google.com/');
print_r (getUrls($array));
function getUrls($array) {
    $arrUrl = array();
    $arrList = array();
    $url_count = count($array);
    $curl_array = array();
    $ch = curl_multi_init();
    foreach ($array as $count => $url) {
        $curl_array[$count] = curl_init($url);
        curl_setopt($curl_array[$count], CURLOPT_RETURNTRANSFER, true);
        curl_multi_add_handle($ch, $curl_array[$count]);
    }
    do {
        curl_multi_exec($ch, $exec);
        curl_multi_select($ch, 1);
    } while ($exec);
    foreach ($array as $count => $url) {
        $arrUrl = array();
        $urlContent = curl_multi_getcontent($curl_array[$count]);
        $dom = new DOMDocument();
        @$dom->loadHTML($urlContent);
        $xpath = new DOMXPath($dom);
        $hrefs = $xpath->evaluate("/html/body//a");
        for ($i = 0; $i < $hrefs->length; $i++) {
            $href = $hrefs->item($i);
            $url = $href->getAttribute('href');
            $url = filter_var($url, FILTER_SANITIZE_URL);
            // validate url
            if (filter_var($url, FILTER_VALIDATE_URL) !== false) {
                if (strpos($url, 'mailto') === false) {
                    $arrUrl[] = $url;
                }
            }
        }
        array_push($arrList, array_unique($arrUrl));
    }
    foreach ($array as $count => $url) {
        curl_multi_remove_handle($ch, $curl_array[$count]);
    }
    curl_multi_close($ch);
    foreach ($array as $count => $url) {
        curl_close($curl_array[$count]);
    }
    return $arrList;
}
First of all, I know the OP is asking about multi_curl, but I'm just adding another alternative in case the OP changes their mind. What I do here is split the URLs into many requests so the CPU usage will not be that big. If the OP still wants to use multi_curl, maybe the PHP masters here can give a better solution.
<?php
$num = preg_replace('/[^0-9]/', '', $_GET['num']);
$num = empty($num) ? 0 : $num;

$urls = array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/');

if (!empty($urls[$num]))
{
    /* do your single curl stuff here and store its data here */

    /* now redirect to the next url. dont use a header location redirect, it would end up in a too-many-redirects error in the browser */
    $next_url = !empty($urls[$num + 1]) ? $num + 1 : 'done';
    echo '<html>
    <head>
    <meta http-equiv="refresh" content="0;url=http://yourcodedomain.com/yourpath/yourcode.php?num='.$next_url.'" />
    </head>
    <body>
    <p>Fetching: '.($num + 1).' / '.count($urls).'</p>
    </body>
    </html>';
}
elseif ($_GET['num'] == 'done')
{
    /* if all sites have been fetched, do something here */
}
else
{
    /* throw an exception here */
}
?>
I had the same issue, and I solved it using usleep(). Try this and let me know:
do {
    usleep(10000);
    $n = curl_multi_exec($mh, $active);
} while ($active);
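An alternative sketch, if you would rather block on socket activity than sleep a fixed interval (this assumes the same $mh and $active as in the loop above):
do {
    curl_multi_exec($mh, $active);
    if ($active) {
        // wait up to one second for activity on any of the handles
        curl_multi_select($mh, 1.0);
    }
} while ($active);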
Try this simplified version:
$urls = [
'https://en.wikipedia.org/',
'https://secure.php.net/',
];
set_time_limit(0);
libxml_use_internal_errors(true);
$hrefs = [];
foreach ($urls as $url) {
    $html = file_get_contents($url);
    $doc = new DOMDocument;
    $doc->loadHTML($html);
    foreach ($doc->getElementsByTagName('a') as $link) {
        $href = filter_var($link->getAttribute('href'), FILTER_SANITIZE_URL);
        if (filter_var($href, FILTER_VALIDATE_URL)) {
            echo "<a href='{$href}'>{$href}</a><br/>\n";
        }
    }
}
I'm not sure I am going about this the right way, but I am trying to echo out individual elements of data from an array, without success. I only need to grab around 10 variables for average fuel consumption from the XML file here: https://www.fueleconomy.gov/ws/rest/ympg/shared/vehicles?make=honda&model=civic
I only need make, model, year and avgMpg (which is a child of yourMpgVehicle), etc., so I can place them within a table in the same way as you can echo out SQL data within PHP.
function download_page($path){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $path);
    curl_setopt($ch, CURLOPT_FAILONERROR, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 15);
    //curl_setopt($ch, CURLOPT_SSLVERSION, 3);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
    //curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    $retValue = curl_exec($ch);
    curl_close($ch);
    return $retValue;
}
$sXML = download_page('https://www.fueleconomy.gov/ws/rest/ympg/shared/vehicles?make=honda&model=civic');
$oXML = new SimpleXMLElement($sXML);
$dom = new DomDocument();
$dom->loadXml($sXML);
$dataElements = $dom->getElementsByTagName('vehicle');
$array = array();
foreach ($dataElements as $element) {
    $subarray = array();
    foreach ($element->childNodes as $node) {
        if (!$node instanceof DomElement) {
            continue;
        }
        $key = $node->tagName;
        $value = $node->textContent;
        $subarray[$key] = $value;
    }
    $array[] = $subarray;
    // var_dump($array); // returns the array as expected
    var_dump($array[0]["barrels08"]); //how can I get this and other variables?
}
The structure is like this: (Or you can click on the hyperlink above)
<vehicles>
<vehicle>
<atvType/>
<barrels08>10.283832</barrels08>
<barrelsA08>0.0</barrelsA08>
<charge120>0.0</charge120>
<charge240>0.0</charge240>
<city08>28</city08>
<city08U>28.0743</city08U>
<cityA08>0</cityA08>
<cityA08U>0.0</cityA08U>
<cityCD>0.0</cityCD>
<cityE>0.0</cityE>
<cityUF>0.0</cityUF>
<co2>279</co2>
<co2A>-1</co2A>
<co2TailpipeAGpm>0.0</co2TailpipeAGpm>
<co2TailpipeGpm>279.0</co2TailpipeGpm>
<comb08>32</comb08>
<comb08U>31.9768</comb08U>
<combA08>0</combA08>
<combA08U>0.0</combA08U>
<combE>0.0</combE>
<combinedCD>0.0</combinedCD>
<combinedUF>0.0</combinedUF>
<cylinders>4</cylinders>
<displ>1.8</displ>
<drive>Front-Wheel Drive</drive>
<engId>18</engId>
<eng_dscr/>
<evMotor/>
<feScore>8</feScore>
<fuelCost08>1550</fuelCost08>
<fuelCostA08>0</fuelCostA08>
<fuelType>Regular</fuelType>
<fuelType1/>
<fuelType2/>
<ghgScore>8</ghgScore>
<ghgScoreA>-1</ghgScoreA>
<guzzler/>
<highway08>39</highway08>
<highway08U>38.5216</highway08U>
<highwayA08>0</highwayA08>
<highwayA08U>0.0</highwayA08U>
<highwayCD>0.0</highwayCD>
<highwayE>0.0</highwayE>
<highwayUF>0.0</highwayUF>
<hlv>0</hlv>
<hpv>0</hpv>
<id>33504</id>
<lv2>12</lv2>
<lv4>12</lv4>
<make>Honda</make>
<mfrCode>HNX</mfrCode>
<model>Civic</model>
<mpgData>Y</mpgData>
<phevBlended>false</phevBlended>
<pv2>83</pv2>
<pv4>95</pv4>
<rangeA/>
<rangeCityA>0.0</rangeCityA>
<rangeHwyA>0.0</rangeHwyA>
<trans_dscr/>
<trany>Automatic 5-spd</trany>
<UCity>36.4794</UCity>
<UCityA>0.0</UCityA>
<UHighway>55.5375</UHighway>
<UHighwayA>0.0</UHighwayA>
<VClass>Compact Cars</VClass>
<year>2013</year>
<youSaveSpend>3000</youSaveSpend>
<yourMpgVehicle>
<avgMpg>33.612226599</avgMpg>
<!-- the remaining yourMpgVehicle child tags were lost in the paste; their values were: 45, 55, 47, 28, 16, 33504 -->
</yourMpgVehicle>
</vehicle>
</vehicles>
You don't actually need to put everything into an array if you just want to display the data. SimpleXML makes it very simple to handle XML data. If I may suggest a maybe less complex solution:
<?php
function getFuelDataAsXml($make, $model)
{
    // In most cases CURL is overkill, unless you need something more complex
    $data = file_get_contents("https://www.fueleconomy.gov/ws/rest/ympg/shared/vehicles?make={$make}&model={$model}");

    // If we got some data, return it as XML, otherwise return null
    return $data ? simplexml_load_string($data) : null;
}

// get the data for a specific make and model
$data = getFuelDataAsXml('honda', 'civic');

// iterate over all vehicle-nodes
foreach ($data->vehicle as $vehicleData)
{
    echo $vehicleData->barrels08 . '<br />';
    echo $vehicleData->yourMpgVehicle->avgMpg . '<br />';
    echo '<hr />';
}
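Since the goal was a table, the same loop can emit rows; a sketch, assuming the make, model and year fields shown in the XML dump above:
echo '<table>';
foreach ($data->vehicle as $v) {
    printf('<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>',
        $v->make, $v->model, $v->year, $v->yourMpgVehicle->avgMpg);
}
echo '</table>';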
To fetch data from a DOM, use XPath:
$url = "https://www.fueleconomy.gov/ws/rest/ympg/shared/vehicles?make=honda&model=civic";
$dom = new DomDocument();
$dom->load($url);
$xpath = new DOMXpath($dom);
foreach ($$xpath->evaluate('/*/vehicle') as $vehicle) {
var_dump(
array(
$xpath->evaluate('string(fuelType)', $vehicle),
$xpath->evaluate('number(fuelCost08)', $vehicle),
$xpath->evaluate('number(barrels08)', $vehicle)
)
);
}
Most XPath expressions return a list of nodes that can be iterated using foreach. Using number() or string() will cast the value or content of the first node into a float or string. If the list was empty you will get an empty value.
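Applied to the field the question actually asks for (avgMpg under yourMpgVehicle), the same pattern would look like this sketch:
foreach ($xpath->evaluate('/*/vehicle') as $vehicle) {
    echo $xpath->evaluate('string(make)', $vehicle), ' ',
         $xpath->evaluate('string(model)', $vehicle), ': ',
         $xpath->evaluate('number(yourMpgVehicle/avgMpg)', $vehicle), "\n";
}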
This question already has an answer here:
Closed 10 years ago.
Possible Duplicate:
Screen scraping in PHP using file_get_contents
Can anyone help me? I am trying to scrape hotel reviews from LateRooms.com. Don't tell me it's a bad idea, because I already have permission as an affiliate.
My code:
<?php
header('content-type: text/plain');
$contents = file_get_contents('http://www.laterooms.com/en/hotel-reviews/238902_the-westfield-bb-sandown.aspx');
$contents = preg_replace('/\s{1,}/', ' ', $contents);
print $contents . "\n";
$records = preg_split('/<div id="review/', $contents);
for ($ix = 1; $ix < count($records); $ix++) {
    $tmp = $records[$ix];
    preg_match('/id="review"/', $tmp, $match_reviews);
    print_r($match_reviews);
    exit();
}
?>
This works really well; the only problem is that it pulls in the whole page of code and doesn't match the div id 'review'.
Thanks in advance
function file_get_contents_curl($url){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    $data = curl_exec($ch);
    curl_close($ch);
    return $data;
}

// Returns the inner HTML of a DOM node (children serialized, the node's own tag excluded).
function DOMinnerHTML($element){
    $innerHTML = "";
    $children = $element->childNodes;
    foreach ($children as $child)
    {
        $tmp_dom = new DOMDocument();
        $tmp_dom->appendChild($tmp_dom->importNode($child, true));
        $innerHTML .= trim($tmp_dom->saveHTML());
    }
    return $innerHTML;
}

$url = 'http://www.laterooms.com/en/hotel-reviews/238902_the-westfield-bb-sandown.aspx';
$html = file_get_contents_curl($url);

//parsing begins here:
$doc = new DOMDocument();
@$doc->loadHTML($html);
$div_elements = $doc->getElementsByTagName('div');

if ($div_elements->length <> 0){
    foreach ($div_elements as $div_element) {
        if ($div_element->getAttribute('class') == 'review newReview'){
            $reviews[] = DOMinnerHTML($div_element);
        }
    }
}

print_r($reviews);
Try this; it will return all reviews. You can refine the content as per your requirement.
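For example, a short sketch to flatten each collected review to plain text (it assumes the $reviews array built above):
foreach ($reviews as $reviewHtml) {
    echo trim(strip_tags($reviewHtml)), "\n\n";
}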