Combine 2 separate foreach loops [closed] - php

So far I've managed to scrape 2 elements from an external site onto my test page:
http://mt-cloud.co.uk/nhs/
(Please do a test search on the page to view results)
$ch = curl_init('http://www.nhs.uk/service-search/GP/m410ux/Results/4/-2.35167407989502/53.4519462585449/4/0?distance=25');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$output = curl_exec($ch);
curl_close($ch);
$document = new DOMDocument;
libxml_use_internal_errors(true);
$document->loadHTML($output);
$document->encoding = 'utf-8';
$xpath = new DOMXPath($document);
$pnames = $xpath->query("//th[@class='fctitle']");
$addresses = $xpath->query("//td[@class='fcdetails fc-first']");
I have 2 foreach loops:
foreach ($pnames as $pname) {
$result1 = $document->saveHTML($pname);
}
foreach ($addresses as $address) {
$result2 = $document->saveHTML($address);
}
$result1 holds the name of a GP practice; $result2 holds the address of the practice.
As you can see on the test page, result 1 and result 2 come out separated. How do I get them together, so that I have the practice name and practice address as a pair?
UPDATE (@Tri)
for($i = 0; $i < count($pnames); $i++){
$name= $document->saveHTML($pnames[$i]);
$name=str_replace ('<a href="/Services/', '<a href="http://www.nhs.uk/Services/', $name);
$address = $document->saveHTML($addresses[$i]);
echo $name.'<br>'.$address;
}
This only returns one result rather than all of them.
Here is my full php code: http://mt-cloud.co.uk/nhs/content/code
Image of data i'm trying to scrape http://mt-cloud.co.uk/nhs/content/results.png

Your first code was OK; you just need to store your names and addresses in a two-dimensional array and then loop through that array.
This part of the code is exactly the same as yours:
$ch = curl_init('http://www.nhs.uk/service-search/GP/m410ux/Results/4/-2.35167407989502/53.4519462585449/4/0?distance=25');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$output = curl_exec($ch);
curl_close($ch);
$document = new DOMDocument;
libxml_use_internal_errors(true);
$document->loadHTML($output);
$document->encoding = 'utf-8';
$xpath = new DOMXPath($document);
$pnames = $xpath->query("//th[@class='fctitle']");
$addresses = $xpath->query("//td[@class='fcdetails fc-first']");
Now we will create an array:
$results = array();
Then use your loops and store the names and addresses as pairs at the same indexes of the array:
$iCnt = 0;
foreach ($pnames as $pname){
$results[$iCnt]['name'] = $document->saveHTML($pname);
$iCnt++;
}
$iCnt = 0;
foreach ($addresses as $address){
$results[$iCnt]['address'] = $document->saveHTML($address);
$iCnt++;
}
Now we have an array with pairs of names and addresses, and if we loop through it, we can see them together:
for($iCnt = 0, $cnt = count($results); $iCnt < $cnt; $iCnt++){
echo 'Name: '.$results[$iCnt]['name'].'<br>';
echo 'Address: '.$results[$iCnt]['address'].'<br>';
}
That's all. The complete code will look like this:
<?php
$ch = curl_init('http://www.nhs.uk/service-search/GP/m410ux/Results/4/-2.35167407989502/53.4519462585449/4/0?distance=25');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$output = curl_exec($ch);
curl_close($ch);
$document = new DOMDocument;
libxml_use_internal_errors(true);
$document->loadHTML($output);
$document->encoding = 'utf-8';
$xpath = new DOMXPath($document);
$pnames = $xpath->query("//th[@class='fctitle']");
$addresses = $xpath->query("//td[@class='fcdetails fc-first']");
$results = array();
$iCnt = 0;
foreach ($pnames as $pname){
$results[$iCnt]['name'] = $document->saveHTML($pname);
$iCnt++;
}
$iCnt = 0;
foreach ($addresses as $address){
$results[$iCnt]['address'] = $document->saveHTML($address);
$iCnt++;
}
for($iCnt = 0, $cnt = count($results); $iCnt < $cnt; $iCnt++){
echo 'Name: '.$results[$iCnt]['name'].'<br>';
echo 'Address: '.$results[$iCnt]['address'].'<br>';
}
?>

So just do not use foreach:
for($i = 0; $i < count($pnames); $i++){
$name= $document->saveHTML($pnames[$i]);
$address = $document->saveHTML($addresses[$i]);
//do something with your result
}
Note: you must make sure that the length of $addresses is equal to the length of $pnames.
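For example, here is a minimal sketch of that check using DOMNodeList's own API. ->length and ->item() work on every PHP version, while count() on a DOMNodeList only works on PHP 7.2+ (older versions just return 1, which may explain why the update above printed a single result):
$count = min($pnames->length, $addresses->length); // guard against mismatched lists
for ($i = 0; $i < $count; $i++) {
    $name = $document->saveHTML($pnames->item($i));
    $address = $document->saveHTML($addresses->item($i));
    echo $name . '<br>' . $address . '<br>';
}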

Related

Extract url meta tags using php

I want to implement a Facebook-like URL extraction system, and I am using PHP cURL to extract the data, but I am getting URL data for only a few websites, not all of them.
Here is the code:
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $_POST["url"]);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($ch);
curl_close($ch);
// Load HTML to DOM Object
$dom = new DOMDocument();
@$dom->loadHTML($data);
// Parse DOM to get Title
$nodes = $dom->getElementsByTagName('title');
$title = $nodes->item(0)->nodeValue;
// Parse DOM to get Meta Description
$metas = $dom->getElementsByTagName('meta');
$body = "";
for ($i = 0; $i < $metas->length; $i ++) {
$meta = $metas->item($i);
if ($meta->getAttribute('name') == 'description') {
$body = $meta->getAttribute('content');
}
}
// Parse DOM to get Images
$image_src = array(); // initialise the same array the loop below appends to
$images = $dom->getElementsByTagName('img');
for ($i = 0; $i < $images->length; $i ++) {
$image = $images->item($i);
$src = $image->getAttribute('src');
if(filter_var($src, FILTER_VALIDATE_URL)) {
$image_src[] = $src;
}
}
$output = array(
'title' => $title,
'image_src' => $image_src,
'body' => $body
);
echo json_encode($output);
For a few URLs I get the details like title, description, and images, but for most websites the code does not extract any details. Do I need to use a client-side language like jQuery?
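One common cause, offered as an assumption rather than a confirmed diagnosis: many sites return empty or error pages to requests that send no User-Agent header. A minimal sketch of a hardened cURL call (the UA string is hypothetical):
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $_POST["url"]);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MetaFetcher/1.0)'); // hypothetical UA
curl_setopt($ch, CURLOPT_ENCODING, ''); // accept gzip/deflate and decode automatically
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($ch, CURLOPT_TIMEOUT, 30);
$data = curl_exec($ch);
curl_close($ch);
Sites that render their meta tags with client-side JavaScript will still return nothing, since cURL only sees the raw HTML.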

Asynchronous php loop

I'm building a site where I have a JSON array of URLs; I use these URLs to parse the metadata to display on the site. I want to show a thumbnail image, title, and description. The issue I've found is that it takes almost 10 seconds to get the data ready to be displayed on the site.
The question I want to ask is this: how should I go about making it asynchronous, so that the loading time is drastically reduced? Does anyone have suggestions?
<div class="container" id="content">
<?php
$jsonData = file_get_contents("data.json");
$json = json_decode($jsonData, true);
function file_get_contents_curl($url){
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
foreach ($json['posts'] as $post){
$html = file_get_contents_curl($post['url']);
//parsing begins here:
$doc = new DOMDocument();
@$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
//get and display what you need:
$title = $nodes->item(0)->nodeValue;
$metas = $doc->getElementsByTagName('meta');
for ($i = 0; $i < $metas->length; $i++) {
$meta = $metas->item($i);
if ($meta->getAttribute('property') == 'og:image') {
$image = $meta->getAttribute('content');
}
if ($meta->getAttribute('name') == 'description') {
$description = $meta->getAttribute('content');
}
if ($meta->getAttribute('name') == 'keywords') {
$keywords = $meta->getAttribute('content');
}
}
$output = "<a href='$post[url]'><br>";
$output .= "<div class='thumbnail'><br>";
$output .= "<img src='$image'><br>";
$output .= "<p class='article-headline'>$title</p><br>";
$output .= "<p class='article-excerpt'>$description</p><br>";
$output .= "</div></a>";
echo $output;
}
?>
</div>
My data.json file looks like this, but with about 16 URLs. The idea is that it's supposed to be able to handle an array with a lot more URLs:
{
"posts": [
{
"url":"https://medium.com/s/story/the-absurdity-of-student-loan-debt-fb61fdca7d8c"
},
{
"url":"https://medium.com/s/in-defense-of-the-worst-human-emotions/jealousy-is-a-brat-eed054493965"
},
{
"url":"https://medium.com/swlh/chatbots-were-the-next-big-thing-what-happened-5fc49dd6fa61"
},
{
"url":"https://medium.com/s/trustissues/my-so-called-millennial-entitlement-9be84343c713"
}
]
}
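No answer is recorded here, but one approach, sketched under the assumption that the bottleneck is the sequential network fetches: download all the URLs in parallel with curl_multi, then parse each response afterwards. The total wait becomes roughly the slowest single request instead of the sum of all requests.
$json = json_decode(file_get_contents("data.json"), true);
$mh = curl_multi_init();
$handles = array();
foreach ($json['posts'] as $i => $post) {
    $ch = curl_init($post['url']);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_multi_add_handle($mh, $ch);
    $handles[$i] = $ch;
}
do {
    curl_multi_exec($mh, $running); // drive all transfers
    curl_multi_select($mh); // wait for activity instead of busy-looping
} while ($running > 0);
foreach ($handles as $ch) {
    $html = curl_multi_getcontent($ch);
    curl_multi_remove_handle($mh, $ch);
    curl_close($ch);
    // parse $html with DOMDocument exactly as in the loop above
}
curl_multi_close($mh);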

Append value to the URL Using curl

call multiple url sequentially with php curl
This question is also related to my previous post, so I've shared the link above.
Now, when I tried to append the session ID to my URL and execute the code below, it didn't run properly, so I echoed the URLs and got the following output:
3.0971635097876E+183.0971635097876E+18
So how do I append the sid to the URL?
Below is how I've tried:
<?php
$response=3097163509787559940;
$url1 = 'http://192.168.1.220/cgi-bin/controller.tcl?sid='+$response+'&type=inverter&inverter=318002N463';
echo "$url1";
$url2 = 'http://192.168.1.220/cgi-bin/overview.tcl?sid='+$response+'&menuParentId=3';
echo "$url2";
$nodes = array('$url1', '$url2');
$node_count = count($nodes);
$curl_arr = array();
$master = curl_multi_init();
for($i = 0; $i < $node_count; $i++){
$url =$nodes[$i];
$curl_arr[$i] = curl_init($url);
curl_setopt($curl_arr[$i], CURLOPT_RETURNTRANSFER, true);
curl_multi_add_handle($master, $curl_arr[$i]);
}
do {
curl_multi_exec($master,$running);
} while($running > 0);
echo "results: ";
for($i = 0; $i < $node_count; $i++){
$results = curl_multi_getcontent ( $curl_arr[$i] );
echo( $i . "\n" . $results . "\n");
}
echo 'done';
?>
Strings are concatenated in PHP with a dot (.).
Replace the concatenation character + with . in both $url1 and $url2:
$url1 = 'http://192.168.1.220/cgi-bin/controller.tcl?sid='.$response.'&type=inverter&inverter=318002N463';
Remove the quotes when creating the array. When a variable is enclosed in single quotes, it is rendered as a literal string, not as a variable:
$nodes = array($url1, $url2);
You need to make 3 changes, like below:
$url1 = "http://192.168.1.220/cgi-bin/controller.tcl?sid=$response&type=inverter&inverter=318002N463";
$url2 = "http://192.168.1.220/cgi-bin/overview.tcl?sid=$response&menuParentId=3";
$nodes = array($url1, $url2);//remove quotes around urls
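As a further sketch (not part of the original answers), http_build_query builds and URL-encodes the query string, which avoids manual concatenation entirely. Keeping the sid as a string also sidesteps the float rounding visible in the output above, since 3097163509787559940 overflows the integer range of 32-bit PHP builds:
$response = '3097163509787559940'; // keep as a string to preserve precision
$url1 = 'http://192.168.1.220/cgi-bin/controller.tcl?' . http_build_query(array(
    'sid' => $response,
    'type' => 'inverter',
    'inverter' => '318002N463',
));
$url2 = 'http://192.168.1.220/cgi-bin/overview.tcl?' . http_build_query(array(
    'sid' => $response,
    'menuParentId' => 3,
));
$nodes = array($url1, $url2);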

xpath query get child element

I'm currently using the following address via PHP CURL / Xpath to grab some data:
$url: http://v1.syndication.nhschoices.nhs.uk/organisations/hospitals/name/trafford?apikey=TBQBMMKR
Code used:
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$output = curl_exec($ch);
curl_close($ch);
$document = new DOMDocument;
libxml_use_internal_errors(true);
$document->loadHTML($output);
$document->encoding = 'utf-8';
$xpath = new DOMXPath($document);
$orgcodes= $xpath->query('//dd[2]');
$results = array();
$iCnt = 0;
foreach ($orgcodes as $orgcode){
$results[$iCnt]['orgcode'] = $document->saveHTML($orgcode);
$iCnt++;
}
for($iCnt = 0, $cnt = count($results); $iCnt < $cnt; $iCnt++){
echo $results[$iCnt]['orgcode'];
}
?>
I want to echo the ODS codes like so:
RM321
RW3TR
But i'm getting the following instead:
RM321
53.4540634155273
RW3TR
53.4540634155273
Any ideas? I'm guessing it's to do with the XPath query, but I'm puzzled.
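No answer is recorded here, but a likely cause, stated as an assumption since the feed's markup is not reproduced: //dd[2] matches the second <dd> of every definition list on the page, including the lists that hold the coordinates. Anchoring the query to the labelling <dt> is usually safer; the label text 'ODS' below is a guess:
$orgcodes = $xpath->query("//dt[contains(., 'ODS')]/following-sibling::dd[1]");
foreach ($orgcodes as $orgcode) {
    echo trim($orgcode->textContent) . '<br>';
}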

Crawl a website, get the links, crawl the links with PHP and XPATH

I want to crawl an entire website. I have read several threads, but I cannot manage to get data at the second level.
That is, I can return the links from a starting page, but then I cannot find a way to parse those links and get the content of each one.
The code I use is:
<?php
// SELECT STARTING PAGE
$url = 'http://mydomain.com/';
$html= file_get_contents($url);
// GET ALL THE LINKS OF EACH PAGE
// create a dom object
$dom = new DOMDocument();
@$dom->loadHTML($html);
// run xpath for the dom
$xPath = new DOMXPath($dom);
// get links from starting page
$elements = $xPath->query("//a/@href");
foreach ($elements as $e) {
echo $e->nodeValue. "<br />";
}
// Parse each page using the extracted links?
?>
Could somebody help me out for the last part with an example?
I will be really much appreciated!
Well, thanks for your answers!
I tried some things, but I haven't managed to get any results yet; I am new to programming.
Below you can find 2 of my attempts: the 1st trying to parse the links, and the 2nd trying to replace file_get_contents with cURL:
1)
<?php
// GET STARTING PAGE
$url = 'http://www.capoeira.com.gr/';
$html= file_get_contents($url);
//GET ALL THE LINKS FROM STARTING PAGE
// create a dom object
$dom = new DOMDocument();
@$dom->loadHTML($html);
// run xpath for the dom
$xPath = new DOMXPath($dom);
// get specific elements from the sites
$elements = $xPath->query("//a/@href");
//PARSE EACH LINK
foreach($elements as $e) {
$URLS= file_get_contents($e);
$dom = new DOMDocument();
@$dom->loadHTML($html);
$xPath = new DOMXPath($dom);
$output = $xPath->query("//div[@class='content-entry clearfix']");
echo $output ->nodeValue;
}
?>
For the above code I get
Warning: file_get_contents() expects parameter 1 to be string, object given in ../example.php on line 26
2)
<?php
$curl = curl_init();
curl_setopt($curl, CURLOPT_POST, 1);
curl_setopt($curl, CURLOPT_URL, "http://capoeira.com.gr");
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
$content= curl_exec($curl);
curl_close($curl);
$dom = new DOMDocument();
@$dom->loadHTML($content);
$xPath = new DOMXPath($dom);
$elements = $xPath->query("//a/@href");
foreach ($elements as $e) {
echo $e->nodeValue. "<br />";
}
?>
I get no results. I tried to echo $content, and then I get:
You don't have permission to access / on this server.
Additionally, a 413 Request Entity Too Large error was encountered while trying to use an ErrorDocument to handle the request...
Any ideas please?? :)
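The warning in the first attempt appears because $e is a DOMAttr object, not a string; the href string lives in $e->nodeValue. The inner loop also reloads the starting page's $html instead of the page it just fetched. A minimal sketch of the corrected loop (absolute URLs assumed; the div class is copied from the attempt above):
foreach ($elements as $e) {
    $link = $e->nodeValue; // the href string, not the DOMAttr object
    $page = @file_get_contents($link); // works for absolute URLs only
    if ($page === false) {
        continue; // skip unreachable links
    }
    $sub = new DOMDocument();
    @$sub->loadHTML($page); // parse the fetched page, not $html
    $subXPath = new DOMXPath($sub);
    foreach ($subXPath->query("//div[@class='content-entry clearfix']") as $div) {
        echo $sub->saveHTML($div);
    }
}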
You can try the following. See this thread for more details.
<?php
//set_time_limit (0);
function crawl_page($url, $depth = 5){
static $seen = array(); // persist the visited-URL list across recursive calls
if(($depth == 0) or (in_array($url, $seen))){
return;
}
$seen[] = $url; // remember this URL so it is not crawled twice
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_TIMEOUT, 30);
curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
$result = curl_exec ($ch);
curl_close ($ch);
if( $result ){
$stripped_file = strip_tags($result, "<a>");
preg_match_all("/<a[\s]+[^>]*?href[\s]?=[\s\"\']+"."(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $stripped_file, $matches, PREG_SET_ORDER );
foreach($matches as $match){
$href = $match[1];
if (0 !== strpos($href, 'http')) {
$path = '/' . ltrim($href, '/');
if (extension_loaded('http')) {
$href = http_build_url($href , array('path' => $path));
} else {
$parts = parse_url($href);
$href = $parts['scheme'] . '://';
if (isset($parts['user']) && isset($parts['pass'])) {
$href .= $parts['user'] . ':' . $parts['pass'] . '@';
}
$href .= $parts['host'];
if (isset($parts['port'])) {
$href .= ':' . $parts['port'];
}
$href .= $path;
}
}
crawl_page($href, $depth - 1);
}
}
echo "Crawled {$href}";
}
crawl_page("http://www.sitename.com/",3);
?>
$doc = new DOMDocument;
@$doc->loadHTMLFile('file.htm'); // loadHTMLFile() tolerates real-world HTML; load() expects well-formed XML
$items = $doc->getElementsByTagName('a');
foreach($items as $value) {
echo $value->nodeValue . "\n";
$attrs = $value->attributes;
echo $attrs->getNamedItem('href')->nodeValue . "\n";
}
Find links from a website recursively, with depth:
<?php
$depth = 1;
print_r(getList($depth));
function getList($depth)
{
$lists = getDepth($depth);
return $lists;
}
function getUrl($request_url)
{
$countValid = 0;
$brokenCount =0;
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $request_url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // We want to get the response
$result = curl_exec($ch);
$regex = '|<a.*?href="(.*?)"|';
preg_match_all($regex, $result, $parts);
$links = $parts[1];
$lists = array();
foreach ($links as $link)
{
$url = htmlentities($link);
$result =getFlag($url);
if($result == true)
{
$UrlLists["clean"][$countValid] =$url;
$countValid++;
}
else
{
$UrlLists["broken"][$brokenCount]= "broken->".$url;
$brokenCount++;
}
}
curl_close($ch);
return $UrlLists;
}
function ZeroDepth($list)
{
$request_url = $list;
$listss["0"]["0"] = getUrl($request_url);
$lists["0"]["0"]["clean"] = array_unique($listss["0"]["0"]["clean"]);
$lists["0"]["0"]["broken"] = array_unique($listss["0"]["0"]["broken"]);
return $lists;
}
function getDepth($depth)
{
// $list =OW_URL_HOME;
$list = "https://example.com";//enter the url of website
$lists =ZeroDepth($list);
for($i=1;$i<=$depth;$i++)
{
$l= $i;
$l= $l-1;
$depthArray=1;
foreach($lists[$l][$l]["clean"] as $depthUrl)
{
$request_url = $depthUrl;
$lists[$i][$depthArray] = getUrl($request_url);
$lists[$i][$depthArray]["request_url"] = $request_url; // set after getUrl() so it is not overwritten
$depthArray++; // advance the index so earlier results are kept
}
}
return $lists;
}
function getFlag($url)
{
$curl = curl_init();
$curl_options = array();
$curl_options[CURLOPT_RETURNTRANSFER] = true;
$curl_options[CURLOPT_URL] = $url;
$curl_options[CURLOPT_NOBODY] = true;
$curl_options[CURLOPT_TIMEOUT] = 60;
curl_setopt_array($curl, $curl_options);
curl_exec($curl);
$status = curl_getinfo($curl, CURLINFO_HTTP_CODE);
curl_close($curl); // close the handle before returning; placed after return, this call was unreachable
return ($status == 200);
}
?>
Please check the code below, hope it helps you.
<?php
$html = new DOMDocument();
@$html->loadHTMLFile('http://www.yourdomain.com');
$xpath = new DOMXPath( $html );
$nodelist = $xpath->query( "//div[@class='A-CLASS-Name']/h3/a/@href" );
foreach ($nodelist as $n){
echo $n->nodeValue."\n<br>";
}
?>
Thanks,
Roger
<?php
$path='http://www.hscripts.com/';
$html = file_get_contents($path);
$dom = new DOMDocument();
@$dom->loadHTML($html);
// grab all the links on the page
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a");
for ($i = 0; $i < $hrefs->length; $i++ ) {
$href = $hrefs->item($i);
$url = $href->getAttribute('href');
echo $url.'<br />';
}
?>
You can use the above code to get all possible links.
