Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 6 years ago.
Improve this question
So far I've managed to scrape two elements from an external site onto my test page:
http://mt-cloud.co.uk/nhs/
(Please do a test search on the page to view results)
// Download the NHS service-search results page as a string, following redirects.
$ch = curl_init('http://www.nhs.uk/service-search/GP/m410ux/Results/4/-2.35167407989502/53.4519462585449/4/0?distance=25');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$output = curl_exec($ch);
curl_close($ch);
// Parse the markup; libxml warnings are collected internally instead of being printed.
$document = new DOMDocument;
libxml_use_internal_errors(true);
$document->loadHTML($output);
$document->encoding = 'utf-8';
$xpath = new DOMXPath($document);
// XPath tests an attribute with "@": header cells with class "fctitle",
// and data cells whose class attribute is exactly "fcdetails fc-first".
$pnames = $xpath->query("//th[@class='fctitle']");
$addresses = $xpath->query("//td[@class='fcdetails fc-first']");
I have 2 foreach loops:
// Serialise each matched practice-name node to an HTML string.
// $result1 is reassigned on every pass, so after the loop it holds only the last name.
foreach ($pnames as $pname) {
$result1 = $document->saveHTML($pname);
}
// Same for the address cells: $result2 ends up holding only the last address.
foreach ($addresses as $address) {
$result2 = $document->saveHTML($address);
}
$result1 = Name of a GP Practice
$result2 = Address of GP Practice
As you can see on the test page, results 1 and 2 are separated. How do I get them together, so that each practice name is shown with its practice address?
UPDATE (@Tri)
// Walk both node lists by position so the name and address at the same index are printed together.
// NOTE(review): count() on a DOMNodeList is only meaningful where the class implements Countable
// (PHP 7.2+ according to the manual); ->length and ->item($i) are the portable accessors —
// confirm the PHP version in use.
for($i = 0; $i < count($pnames); $i++){
$name= $document->saveHTML($pnames[$i]);
// Rewrite the site-relative "/Services/" links into absolute nhs.uk links.
$name=str_replace ('<a href="/Services/', '<a href="http://www.nhs.uk/Services/', $name);
$address = $document->saveHTML($addresses[$i]);
echo $name.'<br>'.$address;
}
Only returns one result rather than all
Here is my full php code: http://mt-cloud.co.uk/nhs/content/code
Image of data i'm trying to scrape http://mt-cloud.co.uk/nhs/content/results.png
Your first code was OK; you just need to store your names and addresses in a two-dimensional array and then loop through that array.
This part of the code is exactly same as yours:
// Download the NHS service-search results page as a string, following redirects.
$ch = curl_init('http://www.nhs.uk/service-search/GP/m410ux/Results/4/-2.35167407989502/53.4519462585449/4/0?distance=25');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$output = curl_exec($ch);
curl_close($ch);
// Parse the markup; libxml warnings are collected internally instead of being printed.
$document = new DOMDocument;
libxml_use_internal_errors(true);
$document->loadHTML($output);
$document->encoding = 'utf-8';
$xpath = new DOMXPath($document);
// XPath tests an attribute with "@": header cells with class "fctitle",
// and data cells whose class attribute is exactly "fcdetails fc-first".
$pnames = $xpath->query("//th[@class='fctitle']");
$addresses = $xpath->query("//td[@class='fcdetails fc-first']");
Now we will create an array:
// Start with an empty list; each entry will hold one practice's 'name' and 'address'.
$results = [];
Then use your loops to store each name and its address as a pair at the same index of the array:
// Store every practice name under the 'name' key of the row at its position.
for ($iCnt = 0; $iCnt < $pnames->length; $iCnt++) {
    $results[$iCnt]['name'] = $document->saveHTML($pnames->item($iCnt));
}
// Store every address under the 'address' key of the row at the same position,
// so row N ends up holding the N-th name together with the N-th address.
for ($iCnt = 0; $iCnt < $addresses->length; $iCnt++) {
    $results[$iCnt]['address'] = $document->saveHTML($addresses->item($iCnt));
}
Now we have an array with pairs of names and addresses and if we loop through it, we can see them together:
// Print each stored pair: the name first, then the address that belongs to it.
foreach ($results as $row) {
    echo 'Name: ', $row['name'], '<br>';
    echo 'Address: ', $row['address'], '<br>';
}
That's all. The complete code will look like this:
<?php
// Scrape GP practice names and addresses from the NHS service-search results
// page and print every practice name together with its address.

// Download the results page as a string, following redirects.
$ch = curl_init('http://www.nhs.uk/service-search/GP/m410ux/Results/4/-2.35167407989502/53.4519462585449/4/0?distance=25');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$output = curl_exec($ch);
if ($output === false || $output === '') {
    // Nothing to parse: report the transfer error instead of feeding false to the parser.
    $error = curl_error($ch);
    curl_close($ch);
    exit('Could not fetch the results page: ' . $error);
}
curl_close($ch);

// Parse the markup; libxml warnings are collected internally and then discarded.
$document = new DOMDocument;
libxml_use_internal_errors(true);
$document->loadHTML($output);
libxml_clear_errors();
$document->encoding = 'utf-8';

// XPath tests an attribute with "@".
$xpath = new DOMXPath($document);
$pnames = $xpath->query("//th[@class='fctitle']");
$addresses = $xpath->query("//td[@class='fcdetails fc-first']");

// Pair the two lists by position. If one list is longer, the missing half of
// the pair is stored as an empty string so every row has both keys.
$results = array();
$rows = max($pnames->length, $addresses->length);
for ($iCnt = 0; $iCnt < $rows; $iCnt++) {
    $nameNode = $pnames->item($iCnt);
    $addressNode = $addresses->item($iCnt);
    $results[$iCnt] = array(
        'name' => $nameNode ? $document->saveHTML($nameNode) : '',
        'address' => $addressNode ? $document->saveHTML($addressNode) : '',
    );
}

foreach ($results as $result) {
    echo 'Name: '.$result['name'].'<br>';
    echo 'Address: '.$result['address'].'<br>';
}
?>
So just do not use foreach:
// Pair the two node lists by position. DOMNodeList exposes its size as ->length
// and its members through ->item($i); count() and [] only work on newer PHP versions.
// Iterating up to the shorter list guarantees both items exist for every index.
$total = min($pnames->length, $addresses->length);
for($i = 0; $i < $total; $i++){
    $name = $document->saveHTML($pnames->item($i));
    $address = $document->saveHTML($addresses->item($i));
    //do something with your result
}
Note: you must make sure that $addresses has the same number of items as $pnames.
Related
I want to implement facebook like url extract system and i am using php CURL to extract the data
But i am getting url data of only few websites not of all websites
Here is the code:-
// Fetch the page named by the submitted "url" field and report its title,
// meta description and absolute image URLs as a JSON object.
$url = isset($_POST["url"]) ? $_POST["url"] : '';
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
// The URL is user input: allow only web protocols (also after redirects) so
// schemes such as file:// cannot be requested through this script.
curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
$data = curl_exec($ch);
curl_close($ch);

// Defaults, so the JSON always has the same shape even when nothing is found.
$title = '';
$body = "";
$image_src = array();

if (is_string($data) && $data !== '') {
    // Load HTML to DOM Object; parser warnings are collected internally, not printed.
    $dom = new DOMDocument();
    $previousSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($data);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    // Parse DOM to get Title (a page may have no <title> at all).
    $nodes = $dom->getElementsByTagName('title');
    if ($nodes->length > 0) {
        $title = $nodes->item(0)->nodeValue;
    }

    // Parse DOM to get Meta Description (the last matching tag wins).
    foreach ($dom->getElementsByTagName('meta') as $meta) {
        if ($meta->getAttribute('name') == 'description') {
            $body = $meta->getAttribute('content');
        }
    }

    // Parse DOM to get Images; only absolute, well-formed URLs are kept.
    foreach ($dom->getElementsByTagName('img') as $image) {
        $src = $image->getAttribute('src');
        if (filter_var($src, FILTER_VALIDATE_URL)) {
            $image_src[] = $src;
        }
    }
}

$output = array(
    'title' => $title,
    'image_src' => $image_src,
    'body' => $body
);
echo json_encode($output);
For a few URLs I get details such as the title, description and images, but for most websites the code does not extract any details. Do I need to use a client-side language like jQuery instead?
I'm building a site where I have a json array of URL's, I use these urls to parse the meta data to display on the site. I want to show a thumbnail image, title and description. The issue that I've found is that it takes almost 10 seconds to get the data ready to be displayed in the site.
My question I want to ask is this: How should I go about to make it asynchronous, so that the loading time is drastically reduced. Does anyone have suggestions for this?
// Read data.json and decode it into nested PHP arrays (second argument true).
$jsonData = file_get_contents("data.json");
$json = json_decode($jsonData, true);
<div class="container" id="content">
<?php
// Load the list of article URLs to preview.
$jsonData = file_get_contents("data.json");
$json = json_decode($jsonData, true);

/**
 * Download a URL with cURL, following redirects.
 *
 * @param string $url Address to fetch.
 * @return string|false Response body, or false when the transfer fails.
 */
function file_get_contents_curl($url){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    $data = curl_exec($ch);
    curl_close($ch);
    return $data;
}

// Tolerate a missing or malformed data.json by rendering nothing.
$posts = (is_array($json) && isset($json['posts']) && is_array($json['posts'])) ? $json['posts'] : array();

foreach ($posts as $post){
    // Reset per post so a page that lacks a tag never shows the previous post's value.
    $title = '';
    $image = '';
    $description = '';
    $keywords = '';

    $html = file_get_contents_curl($post['url']);
    if (is_string($html) && $html !== '') {
        //parsing begins here (parser warnings are collected internally, not printed):
        $doc = new DOMDocument();
        libxml_use_internal_errors(true);
        $doc->loadHTML($html);
        libxml_clear_errors();

        //get and display what you need:
        $nodes = $doc->getElementsByTagName('title');
        if ($nodes->length > 0) {
            $title = $nodes->item(0)->nodeValue;
        }
        foreach ($doc->getElementsByTagName('meta') as $meta) {
            if ($meta->getAttribute('property') == 'og:image') {
                $image = $meta->getAttribute('content');
            }
            if ($meta->getAttribute('name') == 'description') {
                $description = $meta->getAttribute('content');
            }
            if ($meta->getAttribute('name') == 'keywords') {
                $keywords = $meta->getAttribute('content');
            }
        }
    }

    // Everything below comes from data.json or from remote pages, so escape it
    // before placing it inside the single-quoted attributes and the text nodes.
    $safeUrl = htmlspecialchars($post['url'], ENT_QUOTES, 'UTF-8');
    $safeImage = htmlspecialchars($image, ENT_QUOTES, 'UTF-8');
    $safeTitle = htmlspecialchars($title, ENT_QUOTES, 'UTF-8');
    $safeDescription = htmlspecialchars($description, ENT_QUOTES, 'UTF-8');

    $output = "<a href='$safeUrl'><br>";
    $output .= "<div class='thumbnail'><br>";
    $output .= "<img src='$safeImage'><br>";
    $output .= "<p class='article-headline'>$safeTitle</p><br>";
    $output .= "<p class='article-excerpt'>$safeDescription</p><br>";
    $output .= "</div></a>";
    echo $output;
}
?>
</div>
my data.json file looks like this but with about 16 urls. The idea is that it's supposed to be able to handle an array with alot more URL's:
{
"posts": [
{
"url":"https://medium.com/s/story/the-absurdity-of-student-loan-debt-fb61fdca7d8c"
},
{
"url":"https://medium.com/s/in-defense-of-the-worst-human-emotions/jealousy-is-a-brat-eed054493965"
},
{
"url":"https://medium.com/swlh/chatbots-were-the-next-big-thing-what-happened-5fc49dd6fa61"
},
{
"url":"https://medium.com/s/trustissues/my-so-called-millennial-entitlement-9be84343c713"
}
]
}
call multiple url sequentially with php curl
this question is also related to my previous post so ive shared the link above...
When I tried to append the session ID to my URLs and execute the code below, it didn't run properly. So I echoed the URLs and got the following output:
3.0971635097876E+183.0971635097876E+1
So how to append the sid with the url...
Below is how I've tried..
<?php
// Session id to be appended to both request URLs.
$response=3097163509787559940;
// NOTE(review): "+" is numeric addition in PHP, so these two expressions produce a
// number rather than a URL (see the echoed output quoted above); string
// concatenation uses ".".
$url1 = 'http://192.168.1.220/cgi-bin/controller.tcl?sid='+$response+'&type=inverter&inverter=318002N463';
echo "$url1";
$url2 = 'http://192.168.1.220/cgi-bin/overview.tcl?sid='+$response+'&menuParentId=3';
echo "$url2";
// NOTE(review): single-quoted strings are not interpolated, so this array holds the
// literal texts '$url1' and '$url2', not the values of the variables.
$nodes = array('$url1', '$url2');
$node_count = count($nodes);
$curl_arr = array();
$master = curl_multi_init();
// Create one easy handle per entry and register it with the multi handle.
for($i = 0; $i < $node_count; $i++){
$url =$nodes[$i];
$curl_arr[$i] = curl_init($url);
curl_setopt($curl_arr[$i], CURLOPT_RETURNTRANSFER, true);
curl_multi_add_handle($master, $curl_arr[$i]);
}
// Drive all transfers until none is still running.
do {
curl_multi_exec($master,$running);
} while($running > 0);
echo "results: ";
// Print each handle's index followed by the body it returned.
for($i = 0; $i < $node_count; $i++){
$results = curl_multi_getcontent ( $curl_arr[$i] );
echo( $i . "\n" . $results . "\n");
}
echo 'done';
?>
Strings are concatenated in PHP with dot (.)
Replace the concatenation character + with . in both $url1 and $url2:
$url1 = 'http://192.168.1.220/cgi-bin/controller.tcl?sid='.$response.'&type=inverter&inverter=318002N463';
Remove quotes while creating array. When enclosed in single quotes, it will render it as string and not as variable.
$nodes = array($url1, $url2);
You need to do 3 changes like below:-
// Double-quoted strings interpolate $response directly into each URL.
$url1 = "http://192.168.1.220/cgi-bin/controller.tcl?sid={$response}&type=inverter&inverter=318002N463";
$url2 = "http://192.168.1.220/cgi-bin/overview.tcl?sid={$response}&menuParentId=3";
// Pass the variables themselves, without quotes around them.
$nodes = [$url1, $url2];
I'm currently using the following address via PHP CURL / Xpath to grab some data:
$url: http://v1.syndication.nhschoices.nhs.uk/organisations/hospitals/name/trafford?apikey=TBQBMMKR
Code used:
// Download the syndication feed as a string, following redirects.
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$output = curl_exec($ch);
curl_close($ch);
// Parse the response; libxml warnings are collected internally instead of printed.
$document = new DOMDocument;
libxml_use_internal_errors(true);
$document->loadHTML($output);
$document->encoding = 'utf-8';
$xpath = new DOMXPath($document);
// "//dd[2]" selects every <dd> that is the second <dd> child of its parent,
// anywhere in the document, not only in one particular list.
// NOTE(review): the extra values in the output presumably come from other lists
// whose second <dd> holds a coordinate; narrowing the path needs the real markup.
$orgcodes= $xpath->query('//dd[2]');
$results = array();
$iCnt = 0;
// Store the serialised HTML of each match under the 'orgcode' key.
foreach ($orgcodes as $orgcode){
$results[$iCnt]['orgcode'] = $document->saveHTML($orgcode); $iCnt++; }
// $n is incremented in the loop below but never read in this snippet.
$n = 0;
for($iCnt = 0, $cnt = count($results);
$iCnt < $cnt; $iCnt++){
$n++;
echo $results[$iCnt]['orgcode'];
}?>
I want to echo the ODS codes like so :
RM321
RW3TR
But i'm getting the following instead:
RM321
53.4540634155273
RW3TR
53.4540634155273
Any ideas? I'm guessing its do with the xquery but I'm at a puzzle.
I want to crawl an entire website , I have read several threads but I cannot manage to get data in a 2nd level.
That is, I can return the links from a starting page but then I cannot find a way to parse the links and get the content of each link...
The code I use is:
<?php
// SELECT STARTING PAGE
$url = 'http://mydomain.com/';
$html= file_get_contents($url);
// GET ALL THE LINKS OF EACH PAGE
// create a dom object
$dom = new DOMDocument();
// collect HTML parser warnings internally instead of printing them
libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
// run xpath for the dom
$xPath = new DOMXPath($dom);
// get links from starting page ("@href" selects the href attribute of each <a>)
$elements = $xPath->query("//a/@href");
foreach ($elements as $e) {
echo $e->nodeValue. "<br />";
}
// Parse each page using the extracted links?
?>
Could somebody help me out for the last part with an example?
I will be really much appreciated!
Well , thanx for your answers!
I tried some things but I haven't managed to get any results yet - I am new to programming.
Below, you can find 2 of my attempts - the 1st trying to parse the links and in the second trying to replace file_get contents with Curl:
1)
<?php
// GET STARTING PAGE
$url = 'http://www.capoeira.com.gr/';
$html= file_get_contents($url);
//GET ALL THE LINKS FROM STARTING PAGE
// create a dom object
$dom = new DOMDocument();
// collect HTML parser warnings internally instead of printing them
libxml_use_internal_errors(true);
$dom->loadHTML($html);
// run xpath for the dom
$xPath = new DOMXPath($dom);
// get specific elements from the sites ("@href" selects the href attribute of each <a>)
$elements = $xPath->query("//a/@href");
//PARSE EACH LINK
foreach($elements as $e) {
// NOTE(review): $e is an attribute node object, not a string; this is what triggers
// the file_get_contents() warning. The URL text itself is in $e->nodeValue.
$URLS= file_get_contents($e);
$dom = new DOMDocument();
// NOTE(review): this parses the starting page ($html) again, not the page fetched into $URLS.
$dom->loadHTML($html);
$xPath = new DOMXPath($dom);
$output = $xPath->query("//div[@class='content-entry clearfix']");
// NOTE(review): query() returns a node list, which has no nodeValue of its own;
// individual nodes are reached with ->item($i) or foreach.
echo $output ->nodeValue;
}
?>
For the above code I get
Warning: file_get_contents() expects parameter 1 to be string, object given in ../example.php on line 26
2)
<?php
$curl = curl_init();
// NOTE(review): CURLOPT_POST turns this into a POST request with no body;
// presumably a plain GET is what is wanted here. Confirm against the error the server returns.
curl_setopt($curl, CURLOPT_POST, 1);
curl_setopt($curl, CURLOPT_URL, "http://capoeira.com.gr");
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
$content= curl_exec($curl);
curl_close($curl);
$dom = new DOMDocument();
// collect HTML parser warnings internally instead of printing them
libxml_use_internal_errors(true);
$dom->loadHTML($content);
$xPath = new DOMXPath($dom);
// "@href" selects the href attribute of each <a>
$elements = $xPath->query("//a/@href");
foreach ($elements as $e) {
echo $e->nodeValue. "<br />";
}
?>
I get no results. I tried to echo $content and then I get :
You don't have permission to access / on this server.
Additionally, a 413 Request Entity Too Large error was encountered while trying to use an ErrorDocument to handle the request...
Any ideas please?? :)
You can try the following. See this thread for more details
<?php
//set_time_limit (0);
/**
 * Recursively crawl pages by following <a href> links up to $depth levels.
 *
 * Every URL is fetched at most once per script run (visited URLs are kept in a
 * static set) and "Crawled <url>" is printed for each page that was downloaded.
 * Only http/https links are followed. Relative links are resolved against the
 * page they were found on; dot segments such as "../" are not collapsed here.
 *
 * @param string $url   Absolute http(s) URL of the page to fetch.
 * @param int    $depth Number of link levels to follow; 0 (or less) stops immediately.
 * @return void
 */
function crawl_page($url, $depth = 5){
    // Static, so the visited set is shared by all recursive calls.
    static $seen = array();
    if ($depth <= 0 || isset($seen[$url])) {
        return;
    }
    $seen[$url] = true;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $result = curl_exec($ch);
    curl_close($ch);
    if (!is_string($result) || $result === '') {
        return;
    }
    echo "Crawled {$url}\n";

    // Split the current page's URL once; relative links are rebuilt from these parts.
    $base = parse_url($url);
    if (!is_array($base) || empty($base['scheme']) || empty($base['host'])) {
        return;
    }
    $origin = $base['scheme'] . '://';
    if (isset($base['user']) && isset($base['pass'])) {
        $origin .= $base['user'] . ':' . $base['pass'] . '@';
    }
    $origin .= $base['host'];
    if (isset($base['port'])) {
        $origin .= ':' . $base['port'];
    }
    $basePath = isset($base['path']) ? $base['path'] : '/';
    $lastSlash = strrpos($basePath, '/');
    $baseDir = ($lastSlash === false) ? '/' : substr($basePath, 0, $lastSlash + 1);

    // Let the HTML parser find the links; its warnings are collected, then discarded.
    $dom = new DOMDocument();
    $previousSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($result);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    foreach ($dom->getElementsByTagName('a') as $anchor) {
        $href = trim($anchor->getAttribute('href'));
        if ($href === '' || $href[0] === '#') {
            continue; // no target, or a fragment of the same page
        }
        $fragmentAt = strpos($href, '#');
        if ($fragmentAt !== false) {
            $href = substr($href, 0, $fragmentAt);
        }

        if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) {
            // Already absolute: skip mailto:, javascript:, ftp: and the like.
            if (!preg_match('~^https?://~i', $href)) {
                continue;
            }
        } elseif (strpos($href, '//') === 0) {
            $href = $base['scheme'] . ':' . $href;   // protocol-relative
        } elseif ($href[0] === '/') {
            $href = $origin . $href;                 // root-relative
        } elseif ($href[0] === '?') {
            $href = $origin . $basePath . $href;     // same page, new query string
        } else {
            $href = $origin . $baseDir . $href;      // relative to the page's directory
        }
        crawl_page($href, $depth - 1);
    }
}
crawl_page("http://www.sitename.com/",3);
?>
// Print the text and the href of every link in a local HTML file.
$doc = new DOMDocument;
// loadHTMLFile() uses the HTML parser (load() expects well-formed XML);
// markup warnings are collected internally instead of being printed.
libxml_use_internal_errors(true);
$doc->loadHTMLFile('file.htm');
libxml_clear_errors();
$items = $doc->getElementsByTagName('a');
foreach($items as $value) {
echo $value->nodeValue . "\n";
// getAttribute() yields an empty string for an <a> without href instead of failing.
echo $value->getAttribute('href') . "\n";
}
find link from website recursively with depth
<?php
$depth = 1;
print_r(getList($depth));
/**
 * Return the link lists collected by getDepth() for the given crawl depth.
 *
 * @param int $depth Number of link levels to follow below the start page.
 * @return array Whatever getDepth() returns for that depth.
 */
function getList($depth)
{
    return getDepth($depth);
}
/**
 * Fetch a page and sort the links found in it into reachable and broken ones.
 *
 * Links are the double-quoted href values of <a> tags. Each one is probed with
 * getFlag(); links answering HTTP 200 go to "clean", all others to "broken"
 * (prefixed with "broken->").
 *
 * @param string $request_url Page to download and scan.
 * @return array Always array("clean" => string[], "broken" => string[]); both
 *               lists are empty when the page cannot be fetched or has no links.
 */
function getUrl($request_url)
{
    // Both keys always exist, so callers can index them without checking.
    $UrlLists = array("clean" => array(), "broken" => array());

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $request_url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // We want to get the response
    $page = curl_exec($ch);
    curl_close($ch);
    if (!is_string($page) || $page === '') {
        return $UrlLists;
    }

    preg_match_all('|<a.*?href="(.*?)"|', $page, $parts);
    foreach ($parts[1] as $link) {
        // href values in HTML source are entity-encoded ("&amp;"); decode them to get the real URL.
        $url = html_entity_decode($link, ENT_QUOTES);
        if (getFlag($url)) {
            $UrlLists["clean"][] = $url;
        } else {
            $UrlLists["broken"][] = "broken->".$url;
        }
    }
    return $UrlLists;
}
/**
 * Build level 0 of the crawl result: the de-duplicated links of the start page.
 *
 * @param string $list URL of the start page.
 * @return array Nested as [0][0]["clean"|"broken"] => list of unique URLs.
 */
function ZeroDepth($list)
{
    $found = getUrl($list);
    // getUrl() may omit a key when it found no link of that kind; treat that as an empty list.
    $clean = (is_array($found) && isset($found["clean"]) && is_array($found["clean"])) ? $found["clean"] : array();
    $broken = (is_array($found) && isset($found["broken"]) && is_array($found["broken"])) ? $found["broken"] : array();

    $lists = array();
    $lists["0"]["0"]["clean"] = array_unique($clean);
    $lists["0"]["0"]["broken"] = array_unique($broken);
    return $lists;
}
/**
 * Collect links level by level, starting from $start_url.
 *
 * Level 0 holds the start page's links (see ZeroDepth()). Each further level
 * $i holds one entry per clean URL of level $i - 1, numbered from 1, with that
 * page's "clean" and "broken" links plus the "request_url" it was fetched from.
 * URLs are not de-duplicated across pages, so the number of requests grows
 * quickly with $depth.
 *
 * @param int    $depth     Number of levels to follow below the start page.
 * @param string $start_url Page the crawl starts from.
 * @return array Nested as [level][entry]["clean"|"broken"|"request_url"].
 */
function getDepth($depth, $start_url = "https://example.com")
{
    $lists = ZeroDepth($start_url);
    for ($i = 1; $i <= $depth; $i++) {
        $lists[$i] = array();
        $depthArray = 1;
        // Visit the clean links of every page recorded on the previous level.
        foreach ($lists[$i - 1] as $previousPage) {
            $cleanUrls = (isset($previousPage["clean"]) && is_array($previousPage["clean"])) ? $previousPage["clean"] : array();
            foreach ($cleanUrls as $depthUrl) {
                $found = getUrl($depthUrl);
                if (!is_array($found)) {
                    $found = array();
                }
                // Recorded after the fetch so the result array cannot overwrite it.
                $found["request_url"] = $depthUrl;
                $lists[$i][$depthArray] = $found;
                $depthArray++;
            }
        }
    }
    return $lists;
}
/**
 * Tell whether a URL answers with HTTP status 200.
 *
 * Sends a body-less request (CURLOPT_NOBODY) with a 60 second timeout.
 *
 * @param string $url Address to probe.
 * @return bool True when the final status code is 200, false otherwise
 *              (including when the request cannot be made at all).
 */
function getFlag($url)
{
    $curl = curl_init();
    curl_setopt_array($curl, array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_URL => $url,
        CURLOPT_NOBODY => true,
        CURLOPT_TIMEOUT => 60,
    ));
    curl_exec($curl);
    $status = curl_getinfo($curl, CURLINFO_HTTP_CODE);
    // Release the handle before returning; placed after a return it would never run.
    curl_close($curl);
    return $status == 200;
}
?>`
Please check the code below, hope it helps you.
<?php
// Print the href of every link inside <h3> headings of the chosen <div> class.
$html = new DOMDocument();
// collect HTML parser warnings internally instead of printing them
libxml_use_internal_errors(true);
$html->loadHTMLFile('http://www.yourdomain.com');
libxml_clear_errors();
$xpath = new DOMXPath( $html );
// XPath addresses attributes with "@": test the div's class, then select the link's href.
$nodelist = $xpath->query( "//div[@class='A-CLASS-Name']/h3/a/@href" );
foreach ($nodelist as $n){
echo $n->nodeValue."\n<br>";
}
?>
Thanks,
Roger
<?php
// Print the href of every <a> element found in the body of a page.
$path='http://www.hscripts.com/';
$html = file_get_contents($path);
$dom = new DOMDocument();
// collect HTML parser warnings internally instead of printing them
libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
// grab all the links on the page
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a");
foreach ($hrefs as $href) {
    $url = $href->getAttribute('href');
    echo $url.'<br />';
}
?>
you can use above code to get all possible links