PHP: Call to a member function find() on a non-object

Code:
require_once('simple_html_dom.php');

$file = 'http://testwork.ru/Tempp/domains.php'; // page with the table
$SymbolsCountMin = 0;
$SymbolsCountMax = 10;
$SymbolsBackList = array('-','_','.','0','1','2','3','4','5','6','7','8','9');
$ArrTr = array();
$ArrTd = array();

$ch = curl_init($file);
curl_setopt($ch, CURLOPT_URL, $file);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_VERBOSE, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, array(
    'Content-Type: application/x-www-form-urlencoded'
));
$html = curl_exec($ch);
//$responseInfo = curl_getinfo($ch);
curl_close($ch);
//var_dump($html);
//$html = file_get_html('http://testwork.ru/Tempp/domains.php');

// Find all tr
$row = 0;
foreach ($html->find('tr') as $tr) {
    $row++;
    if ($row != 1) { // skip the header row
        $column = 0;
        foreach ($tr->find('td') as $td) {
            $column++;
            $ArrTd[$column] = $td->plaintext;
        }
    }
    if (iconv_strlen($ArrTd[1]) > $SymbolsCountMin && iconv_strlen($ArrTd[1]) < $SymbolsCountMax && !in_array($ArrTd[1], $SymbolsBackList)) {
        $ArrTr[$row] = $ArrTd;
    }
}

$c = '';
foreach ($ArrTr as $arr_tr => $ftr) {
    $c .= '<tr>';
    foreach ($ftr as $arr_td => $ftd) {
        $c .= '<td>';
        $c .= $ftd;
        $c .= '</td>';
    }
    $c .= '</tr>';
}

$row_header = '
<table style="text-align:center;">
';
$row_header .= $c;
$row_header .= '
</table>';
echo $row_header;
I get the error: Fatal error: Call to a member function find() on a non-object in /var/www/seo-main/data/www/testwork.ru/Tempp/parse_domains.php on line 34
Can you please tell me why I get this error and how to fix it?

find() is a member function of the Simple HTML DOM parser (http://simplehtmldom.sourceforge.net/), and you have commented out the line that creates the parser object:
$html = file_get_html('http://testwork.ru/Tempp/domains.php');
With cURL you get back the raw HTML content as a string, not parsed data. If you intend to use cURL, then pass the returned HTML to the parser:
$html_data = curl_exec($ch);
curl_close($ch);
$html = str_get_html($html_data); // feed the HTML returned by curl to the parser
Then do the rest of the parsing.

You need to guard the parsing code like this:
// Find all tr
$row = 0;
if ($html) { // check that $html actually exists before calling find() on it
    foreach ($html->find('tr') as $tr) {
        $row++;
        if ($row != 1) { // skip the header row
            $column = 0;
            foreach ($tr->find('td') as $td) {
                $column++;
                $ArrTd[$column] = $td->plaintext;
            }
        }
        if (iconv_strlen($ArrTd[1]) > $SymbolsCountMin && iconv_strlen($ArrTd[1]) < $SymbolsCountMax && !in_array($ArrTd[1], $SymbolsBackList)) {
            $ArrTr[$row] = $ArrTd;
        }
    }
}
or use a try/catch statement.
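Putting the two answers together, a minimal corrected sketch (assuming the Simple HTML DOM library is loaded; str_get_html() returns false when it cannot parse the input):
require_once('simple_html_dom.php');

$ch = curl_init('http://testwork.ru/Tempp/domains.php');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$html_data = curl_exec($ch);
curl_close($ch);

// feed the raw HTML string to the parser and guard the result,
// so find() is only ever called on a real parser object
$html = str_get_html($html_data);
if ($html) {
    foreach ($html->find('tr') as $tr) {
        // ... process the rows as above ...
    }
} else {
    echo 'Could not fetch or parse the page';
}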

Related

Asynchronous php loop

I'm building a site where I have a JSON array of URLs. I use these URLs to fetch the metadata to display on the site: a thumbnail image, a title and a description. The issue I've found is that it takes almost 10 seconds to get the data ready to be displayed on the site.
My question is this: how should I go about making it asynchronous, so that the loading time is drastically reduced? Does anyone have suggestions?
<div class="container" id="content">
<?php
$jsonData = file_get_contents("data.json");
$json = json_decode($jsonData, true);
function file_get_contents_curl($url){
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
foreach ($json['posts'] as $post){
$html = file_get_contents_curl($post['url']);
//parsing begins here:
$doc = new DOMDocument();
#$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
//get and display what you need:
$title = $nodes->item(0)->nodeValue;
$metas = $doc->getElementsByTagName('meta');
for ($i = 0; $i < $metas->length; $i++)
{
$meta = $metas->item($i);
if($meta->getAttribute('property')=='og:image'){
$image = $meta->getAttribute('content');}
if($meta->getAttribute('name') == 'description')
$description = $meta->getAttribute('content');
if($meta->getAttribute('name') == 'keywords')
$keywords = $meta->getAttribute('content');
}
$output = "<a href='$post[url]'><br>";
$output .= "<div class='thumbnail'><br>";
$output .= "<img src='$image'><br>";
$output .= "<p class='article-headline'>$title</p><br>";
$output .= "<p class='article-excerpt'>$description</p><br>";
$output .= "</div></a>";
echo $output;
}
?>
</div>
My data.json file looks like this, but with about 16 URLs. The idea is that it should be able to handle an array with a lot more URLs:
{
"posts": [
{
"url":"https://medium.com/s/story/the-absurdity-of-student-loan-debt-fb61fdca7d8c"
},
{
"url":"https://medium.com/s/in-defense-of-the-worst-human-emotions/jealousy-is-a-brat-eed054493965"
},
{
"url":"https://medium.com/swlh/chatbots-were-the-next-big-thing-what-happened-5fc49dd6fa61"
},
{
"url":"https://medium.com/s/trustissues/my-so-called-millennial-entitlement-9be84343c713"
}
]
}
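One way to cut the loading time (a minimal sketch, assuming the same data.json layout as above) is to fetch all the URLs in parallel with PHP's curl_multi_* functions and only then parse the responses:
<?php
$json = json_decode(file_get_contents("data.json"), true);
$urls = array_column($json['posts'], 'url');

$mh = curl_multi_init();
$handles = array();
foreach ($urls as $url) {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    curl_multi_add_handle($mh, $ch);
    $handles[] = $ch;
}

// drive all transfers at once instead of one after another
$running = null;
do {
    curl_multi_exec($mh, $running);
    curl_multi_select($mh); // wait for activity instead of busy-looping
} while ($running > 0);

foreach ($handles as $ch) {
    $html = curl_multi_getcontent($ch);
    // ... parse $html with DOMDocument exactly as in the loop above ...
    curl_multi_remove_handle($mh, $ch);
    curl_close($ch);
}
curl_multi_close($mh);
?>
The total wall-clock time then approaches that of the slowest single request rather than the sum of all of them.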

Yahoo Boss API Pagination?

I use the following PHP code to connect to the API and display the results:
<?php
ini_set('display_errors', 'On');
error_reporting(E_ALL);
require("OAuth.php");

$cc_key = "cc_key"; // cc_key
$cc_secret = "cc_secret"; // cc_secret key
$url = "https://yboss.yahooapis.com/ysearch/web";

$args = array();
$args["q"] = htmlspecialchars($_GET["q"]);
$args["format"] = "json";

$consumer = new OAuthConsumer($cc_key, $cc_secret);
$request = OAuthRequest::from_consumer_and_token($consumer, NULL, "GET", $url, $args);
$request->sign_request(new OAuthSignatureMethod_HMAC_SHA1(), $consumer, NULL);
$url = sprintf("%s?%s", $url, OAuthUtil::build_http_query($args));
//echo $url . "<br>"; // test uri

$ch = curl_init();
$headers = array($request->to_header());
curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
$rsp = curl_exec($ch);

// force to assoc-array, which will allow array access
$results = json_decode($rsp, true);
foreach ($results['bossresponse']['web']['results'] as $result) {
    // $result is an array here, but do the same stuff
    echo '<a href="'.$result['url'].'" target=\'_blank\'>'.$result['title'].'</a></br>';
    echo ''.$result['abstract'].'</br>';
    echo '<a href="'.$result['url'].'" target=\'_blank\'>'.$result['dispurl'].'</a></br>';
}
?>
Then I wrote a mini "pagination":
//$start = "&start=" . "0";
$start_val = $_GET['start'];
if ($start_val == "") $start_val = 0;
$start = "&start=" . $start_val;
// Some more code...
$count_val = 10;
$count = "&count=" . $count_val;
if ($query != "") {
if ($start_val != 0) {
echo 'previous';
echo '<span> | </span>';
}
echo 'next';
}
but "pagination" does not work =(
I can not understand why does not work
My question is how do I paginate results, since all the 50 results appear on the first web page only. I want to display ten results in every page.
Please HELP me
Thanks.
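For what it's worth, the start and count values have to be part of the OAuth-signed query, not strings appended after signing. A hedged sketch against the code above (parameter names as in the BOSS web search docs; the link markup is illustrative):
// start/count must go into $args BEFORE the request is signed
$start_val = isset($_GET['start']) ? (int)$_GET['start'] : 0;
$args["start"] = $start_val;
$args["count"] = 10; // ten results per page

// ... sign the request and fetch $results exactly as above ...

// then build previous/next links from the current offset
if ($start_val > 0) {
    echo '<a href="?q=' . urlencode($_GET["q"]) . '&start=' . ($start_val - 10) . '">previous</a>';
    echo '<span> | </span>';
}
echo '<a href="?q=' . urlencode($_GET["q"]) . '&start=' . ($start_val + 10) . '">next</a>';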

PHP curl request on remote images taking forever, what can I do to improve my code?

Here is the code I have; I cannot work out what is causing the delays.
Loading the remote URL itself doesn't take more than a second, so should I pass a user agent?
Please excuse me if this is a stupid question; I'm new to PHP. Would it be worth setting a timeout as part of the curl request?
<?php
$url = $_GET['url'];
if (!filter_var($url, FILTER_VALIDATE_URL)) {
?>
{"errors":1,"message":"The URL was not valid"}
<?php
    die();
}
$p = parse_url($url);
$baseurl = $p['scheme'] . '://' . $p['host'];
$path_parts = pathinfo($url);
$current_dir = $path_parts['dirname'];
function check_img($file) {
    $x = @getimagesize($file);
    if ($x) {
        switch ($x['mime']) {
            case "image/gif":
            case "image/jpeg":
            case "image/png":
                $response = true;
                break;
            default:
                $response = false;
                break;
        }
    } else {
        $response = false;
    }
    return $response;
}
function ranger($url){
    $headers = array(
        "Range: bytes=0-605768"
    );
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
    $data = curl_exec($curl);
    curl_close($curl);
    return $data;
}

function file_get_contents_curl($url)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    $data = curl_exec($ch);
    curl_close($ch);
    return $data;
}
$html = file_get_contents_curl($url);

// parsing begins here:
$doc = new DOMDocument();
@$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');

// Get all image tags
$imageTags = $doc->getElementsByTagName('img');
$numImages = $imageTags->length;

// get and display what you need:
$fb_image = '';
$metas = $doc->getElementsByTagName('meta');
for ($i = 0; $i < $metas->length; $i++) {
    $meta = $metas->item($i);
    if ($meta->getAttribute('property') == 'og:image' || $meta->getAttribute('name') == 'og:image') {
        $fb_image = $meta->getAttribute('content');
    }
}
?>
{
"resource_images": {
    "url" : "<?php echo $url?>",
    "baseurl" : "<?php echo $baseurl?>",
    "fb" : "<?php echo $fb_image?>",
    "images" : [<?php
    $i = 0;
    $image_results = array();
    $numItems = count($imageTags);
    if ($fb_image !== '') {
        $image_results[] = $fb_image;
    }
    foreach ($imageTags as $tag) {
        if ($i >= 25) {
            break;
        }
        if (substr($tag->getAttribute('src'), 0, 4) === 'http') {
            $img = $tag->getAttribute('src');
        } elseif (substr($tag->getAttribute('src'), 0, 1) === '/') {
            $img = $baseurl . $tag->getAttribute('src');
        } else {
            $img = $current_dir . $tag->getAttribute('src');
        }
        $exists = check_img($img);
        if ($exists) {
            $raw = ranger($img);
            $im = imagecreatefromstring($raw);
            $width = imagesx($im);
            $height = imagesy($im);
            if ($width > 300) {
                $image_results[] = str_replace('"', "", $img);
            }
            if (++$i < $numItems && $i < 25) { // increment $i only once per image
                echo ",";
            }
        }
    }
    $i = 0;
    foreach ($image_results as $img_url) {
        ?>
        {
            "url" : "<?php echo str_replace('"', "", $img_url);?>",
            "count" : <?php echo count($image_results)?>
        }
        <?php
        if (++$i < count($image_results) && $i < 15) {
            echo ",";
        }
    }?>
    ]
}
}
Use this at the beginning:
set_time_limit(0);
Yes, definitely set a timeout on the curl request, as this can go on forever.
What I would do in this case is pinpoint the code that is taking up a lot of the time, like this:
<?php
function microtime_float() {
    list($usec, $sec) = explode(" ", microtime());
    return ((float)$usec + (float)$sec);
}

$time_start = microtime_float(); // this at the top of your file

// process some code
// ...

// show results; this can be anywhere: inside a function, loop, etc.
$time_end = microtime_float();
$time = $time_end - $time_start;
echo "Did it in $time seconds\n<br>";
I wouldn't time the whole script, but go part by part and find out where the wait is.
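As for the timeout itself, a minimal sketch using standard cURL options (the limits here are illustrative):
// give up if the connection isn't established within 5 seconds,
// and abort any transfer that takes longer than 15 seconds overall
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
curl_setopt($ch, CURLOPT_TIMEOUT, 15);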

PHP Foreach loop only displaying last result

I am having a bit of trouble with some code: in an array I have a list of 2 entries, and the function only displays the last one in the list.
Here is the code:
<?php
function getKeywordPosition($theurl, $thekeywords) {
    $theurl = $theurl;
    $thekeywords = $thekeywords;
    $found = false;
    $x = 0;
    for ($x; $x < 64 && $found == false;) {
        $url = "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&"
             . "q=" . stripslashes(str_replace(' ', '%20', $thekeywords)) . '&start=' . $x;
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_REFERER, 'http://www.boo.com');
        $body = curl_exec($ch);
        curl_close($ch);
        $json = json_decode($body);
        $x4 = $x + 4;
        $old_x = $x;
        for ($x; $x < $x4 && $found == false; $x = $x + 1) {
            if (strpos($json->responseData->results[$x - $old_x]->unescapedUrl, strtolower($theurl)) !== false) {
                $found = true;
            }
        }
        // now have some fun with the results...
    }
    if ($found) {
        echo '<strong>'.$theurl.'</strong> is located as the <strong>'.$x.'</strong> result when searching for <strong>'.stripslashes($thekeywords).'</strong>';
        echo '<br>';
    }
}

$list = array('php.com'=>'php', 'php.com'=>'php');
foreach ($list as $key => $value) {
    getKeywordPosition($key, $value);
}
?>
Why is this not working properly?
Unless this is a badly contrived example, the issue is that you have duplicate keys in your array:
$list = array('php.com'=>'php', 'php.com'=>'php');
This array has a single entry, because the second 'php.com' key overwrites the first.
You could refactor like so:
$list = array(
    array('url'=>'php.net', 'keyword'=>'php'),
    array('url'=>'php.net', 'keyword'=>'arrays'),
    array('url'=>'php.net', 'keyword'=>'anotherkeyword')
);
foreach ($list as $entry) {
    getKeywordPosition($entry['url'], $entry['keyword']);
}
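You can see the key collapse directly:
$list = array('php.com'=>'php', 'php.com'=>'php');
var_dump(count($list)); // int(1): the second 'php.com' silently overwrites the first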

Crawl a website, get the links, crawl the links with PHP and XPATH

I want to crawl an entire website. I have read several threads but I cannot manage to get data at the 2nd level.
That is, I can return the links from a starting page, but then I cannot find a way to parse those links and get the content of each one...
The code I use is:
<?php
// SELECT STARTING PAGE
$url = 'http://mydomain.com/';
$html = file_get_contents($url);

// GET ALL THE LINKS OF EACH PAGE
// create a dom object
$dom = new DOMDocument();
@$dom->loadHTML($html);

// run xpath for the dom
$xPath = new DOMXPath($dom);

// get links from starting page
$elements = $xPath->query("//a/@href");
foreach ($elements as $e) {
    echo $e->nodeValue . "<br />";
}

// Parse each page using the extracted links?
?>
Could somebody help me out with an example for the last part?
I would really appreciate it!
Well, thanks for your answers! I tried some things, but I haven't managed to get any results yet; I am new to programming.
Below you can find 2 of my attempts: the 1st trying to parse the links, and the 2nd trying to replace file_get_contents with cURL:
1)
<?php
// GET STARTING PAGE
$url = 'http://www.capoeira.com.gr/';
$html = file_get_contents($url);

// GET ALL THE LINKS FROM STARTING PAGE
// create a dom object
$dom = new DOMDocument();
@$dom->loadHTML($html);

// run xpath for the dom
$xPath = new DOMXPath($dom);

// get specific elements from the sites
$elements = $xPath->query("//a/@href");

// PARSE EACH LINK
foreach ($elements as $e) {
    $URLS = file_get_contents($e);
    $dom = new DOMDocument();
    @$dom->loadHTML($html);
    $xPath = new DOMXPath($dom);
    $output = $xPath->query("//div[@class='content-entry clearfix']");
    echo $output->nodeValue;
}
?>
For the above code I get:
Warning: file_get_contents() expects parameter 1 to be string, object given in ../example.php on line 26
2)
<?php
$curl = curl_init();
curl_setopt($curl, CURLOPT_POST, 1);
curl_setopt($curl, CURLOPT_URL, "http://capoeira.com.gr");
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
$content = curl_exec($curl);
curl_close($curl);

$dom = new DOMDocument();
@$dom->loadHTML($content);
$xPath = new DOMXPath($dom);
$elements = $xPath->query("//a/@href");
foreach ($elements as $e) {
    echo $e->nodeValue . "<br />";
}
?>
I get no results. I tried to echo $content and I get:
You don't have permission to access / on this server.
Additionally, a 413 Request Entity Too Large error was encountered while trying to use an ErrorDocument to handle the request...
Any ideas, please? :)
You can try the following; see this thread for more details:
<?php
//set_time_limit(0);
function crawl_page($url, $depth = 5){
    static $seen = array();
    if ($depth == 0 or in_array($url, $seen)) {
        return;
    }
    $seen[] = $url; // remember this page so it is not crawled twice

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $result = curl_exec($ch);
    curl_close($ch);

    if ($result) {
        $stripped_file = strip_tags($result, "<a>");
        preg_match_all("/<a[\s]+[^>]*?href[\s]?=[\s\"\']+"."(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $stripped_file, $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            $href = $match[1];
            if (0 !== strpos($href, 'http')) {
                $path = '/' . ltrim($href, '/');
                if (extension_loaded('http')) {
                    $href = http_build_url($href, array('path' => $path));
                } else {
                    $parts = parse_url($href);
                    $href = $parts['scheme'] . '://';
                    if (isset($parts['user']) && isset($parts['pass'])) {
                        $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    }
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
            }
            crawl_page($href, $depth - 1);
        }
    }
    echo "Crawled {$url}";
}
crawl_page("http://www.sitename.com/", 3);
?>
$doc = new DOMDocument;
$doc->load('file.htm');
$items = $doc->getElementsByTagName('a');
foreach ($items as $value) {
    echo $value->nodeValue . "\n";
    $attrs = $value->attributes;
    echo $attrs->getNamedItem('href')->nodeValue . "\n";
}
Find links from a website recursively, with depth:
<?php
$depth = 1;
print_r(getList($depth));

function getList($depth)
{
    $lists = getDepth($depth);
    return $lists;
}

function getUrl($request_url)
{
    $countValid = 0;
    $brokenCount = 0;
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $request_url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // We want to get the response
    $result = curl_exec($ch);
    $regex = '|<a.*?href="(.*?)"|';
    preg_match_all($regex, $result, $parts);
    $links = $parts[1];
    $lists = array();
    foreach ($links as $link)
    {
        $url = htmlentities($link);
        $result = getFlag($url);
        if ($result == true)
        {
            $UrlLists["clean"][$countValid] = $url;
            $countValid++;
        }
        else
        {
            $UrlLists["broken"][$brokenCount] = "broken->" . $url;
            $brokenCount++;
        }
    }
    curl_close($ch);
    return $UrlLists;
}

function ZeroDepth($list)
{
    $request_url = $list;
    $listss["0"]["0"] = getUrl($request_url);
    $lists["0"]["0"]["clean"] = array_unique($listss["0"]["0"]["clean"]);
    $lists["0"]["0"]["broken"] = array_unique($listss["0"]["0"]["broken"]);
    return $lists;
}

function getDepth($depth)
{
    // $list = OW_URL_HOME;
    $list = "https://example.com"; // enter the url of the website
    $lists = ZeroDepth($list);
    for ($i = 1; $i <= $depth; $i++)
    {
        $l = $i;
        $l = $l - 1;
        $depthArray = 1;
        foreach ($lists[$l][$l]["clean"] as $depthUrl)
        {
            $request_url = $depthUrl;
            $lists[$i][$depthArray]["request_url"] = $request_url;
            $lists[$i][$depthArray] = getUrl($request_url);
        }
    }
    return $lists;
}

function getFlag($url)
{
    $url_response = array();
    $curl = curl_init();
    $curl_options = array();
    $curl_options[CURLOPT_RETURNTRANSFER] = true;
    $curl_options[CURLOPT_URL] = $url;
    $curl_options[CURLOPT_NOBODY] = true;
    $curl_options[CURLOPT_TIMEOUT] = 60;
    curl_setopt_array($curl, $curl_options);
    curl_exec($curl);
    $status = curl_getinfo($curl, CURLINFO_HTTP_CODE);
    curl_close($curl); // close the handle before returning
    if ($status == 200)
    {
        return true;
    }
    else
    {
        return false;
    }
}
?>
Please check the code below, hope it helps you.
<?php
$html = new DOMDocument();
@$html->loadHtmlFile('http://www.yourdomain.com');
$xpath = new DOMXPath($html);
$nodelist = $xpath->query("//div[@class='A-CLASS-Name']/h3/a/@href");
foreach ($nodelist as $n) {
    echo $n->nodeValue . "\n<br>";
}
?>
Thanks,
Roger
<?php
$path = 'http://www.hscripts.com/';
$html = file_get_contents($path);

$dom = new DOMDocument();
@$dom->loadHTML($html);

// grab all the links on the page
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a");
for ($i = 0; $i < $hrefs->length; $i++) {
    $href = $hrefs->item($i);
    $url = $href->getAttribute('href');
    echo $url . '<br />';
}
?>
You can use the above code to get all possible links.
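To close the loop on the original question (getting the content at the 2nd level), a minimal sketch: collect the href values as plain strings, then fetch and parse each linked page in turn. The div class here is borrowed from the asker's own attempt and will differ on other sites:
<?php
// fetch and parse the starting page
$dom = new DOMDocument();
@$dom->loadHTML(file_get_contents('http://mydomain.com/'));
$xPath = new DOMXPath($dom);

// collect the href attribute VALUES (strings), not the DOMAttr objects
$links = array();
foreach ($xPath->query("//a/@href") as $attr) {
    $links[] = $attr->nodeValue;
}

// second level: fetch each linked page and extract its content
foreach ($links as $link) {
    $page = @file_get_contents($link); // skip pages that fail to load
    if ($page === false) {
        continue;
    }
    $pageDom = new DOMDocument();
    @$pageDom->loadHTML($page);
    $pageXPath = new DOMXPath($pageDom);
    // class name borrowed from the question; adjust for your markup
    foreach ($pageXPath->query("//div[@class='content-entry clearfix']") as $div) {
        echo $div->nodeValue . "<br />";
    }
}
?>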
