HTML DOM code not running - PHP

I am trying to create a script that extracts text from a website's table and displays it via PHP. When I run it against this address:
http://lmvz.anofm.ro:8080/lmv/detalii.jsp?UNIQUEJVID=50/01/1150001435/1&judet=50
the output is empty. Is there something wrong with the code, and how can I fix or improve it?
<?php
include_once('simple_html_dom.php');
curl_setopt($ch, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']);
// Start a cURL resource
$ch = curl_init();
// Set options for the cURL
curl_setopt($ch, CURLOPT_URL, 'http://lmvz.anofm.ro:8080/lmv/detalii.jsp?UNIQUEJVID=50/01/1150001435/1&judet=50'); // target
curl_setopt($ch, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']); // provide a user-agent
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow any redirects
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the result
// Execute the cURL fetch
$result = curl_exec($ch);
// Close the resource
curl_close($ch);
// Output the results
echo $result;

function scraping() {
    // create HTML DOM
    $html = file_get_html('http://lmvz.anofm.ro:8080/lmv/detalii.jsp?UNIQUEJVID=50/01/1150001435/1&judet=50');
    // get article block
    if($html && is_object($html) && isset($html->nodes)) {
        foreach($html->find('/html/body/table') as $article) {
            // get title
            $item['titlu'] = trim($article->find('/html/body/table/tbody/tr[1]/td/div', 0)->plaintext);
            // get body
            $item['tr2'] = trim($article->find('/html/body/table/tbody/tr[2]', 0)->plaintext);
            $item['tr3'] = trim($article->find('/html/body/table/tbody/tr[3]', 0)->plaintext);
            $item['tr4'] = trim($article->find('/html/body/table/tbody/tr[4]', 0)->plaintext);
            $item['tr5'] = trim($article->find('/html/body/table/tbody/tr[5]', 0)->plaintext);
            $item['tr6'] = trim($article->find('/html/body/table/tbody/tr[6]', 0)->plaintext);
            $item['tr7'] = trim($article->find('/html/body/table/tbody/tr[7]', 0)->plaintext);
            $item['tr8'] = trim($article->find('/html/body/table/tbody/tr[8]', 0)->plaintext);
            $item['tr9'] = trim($article->find('/html/body/table/tbody/tr[9]', 0)->plaintext);
            $item['tr10'] = trim($article->find('/html/body/table/tbody/tr[10]', 0)->plaintext);
            $item['tr11'] = trim($article->find('/html/body/table/tbody/tr[11]', 0)->plaintext);
            $item['tr12'] = trim($article->find('/html/body/table/tbody/tr[12]', 0)->plaintext);
            $ret[] = $item;
        }
        // clean up memory
        $html->clear();
        unset($html);
        return $ret;
    }
}

// -----------------------------------------------------------------------------
// test it!
$ret = scraping();
foreach($ret as $v) {
    echo $v['titlu'].'<br>';
    echo '<ul>';
    echo '<li>'.$v['tr2'].'</li>';
    echo '<li>'.$v['tr3'].'</li>';
    echo '<li>'.$v['tr4'].'</li>';
    echo '<li>'.$v['tr5'].'</li>';
    echo '<li>'.$v['tr6'].'</li>';
    echo '<li>'.$v['tr7'].'</li>';
    echo '<li>'.$v['tr8'].'</li>';
    echo '<li>'.$v['tr9'].'</li>';
    echo '<li>'.$v['tr10'].'</li>';
    echo '<li>'.$v['tr11'].'</li>';
    echo '<li>'.$v['tr12'].'</li>';
    echo '</ul>';
}
?>

Because the foreach already iterates over the results of finding /html/body/table, you should not repeat the full path inside the loop; query relative to $article instead:
$item['titlu'] = trim($article->find('/tbody/tr[1]/td/div', 0)->plaintext);
$item['tr2'] = trim($article->find('/tbody/tr[2]', 0)->plaintext);
and so on...
For your cURL calls to work, you also need to move
$ch = curl_init();
before the first curl_setopt, since that first call currently runs on an uninitialized handle.
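
Putting both fixes together, a minimal sketch of a corrected scraping() (untested against the live site; the row indices and the file_get_html call come from the question, and the repeated tr2..tr12 lookups are compacted into a loop):

<?php
include_once('simple_html_dom.php');

function scraping() {
    // fetch and parse the page with simple_html_dom
    $html = file_get_html('http://lmvz.anofm.ro:8080/lmv/detalii.jsp?UNIQUEJVID=50/01/1150001435/1&judet=50');
    $ret = array();
    if($html && is_object($html) && isset($html->nodes)) {
        foreach($html->find('/html/body/table') as $article) {
            $item = array();
            // paths are relative to $article now, not to the document root
            $item['titlu'] = trim($article->find('/tbody/tr[1]/td/div', 0)->plaintext);
            for($i = 2; $i <= 12; $i++) {
                $item['tr'.$i] = trim($article->find('/tbody/tr['.$i.']', 0)->plaintext);
            }
            $ret[] = $item;
        }
        // clean up memory
        $html->clear();
        unset($html);
    }
    return $ret; // always an array, so the caller's foreach cannot fail
}
?>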

Related

How to extract the direct Sibnet video URL in PHP

I've been searching for a solution to this problem for a long time and haven't found one.
I managed to extract the mp4 URL, but the problem is that this link redirects to another URL, which can be seen in the response's Location header; I don't know how to get that final URL.
<?php
function tidy_html($input_string) {
    $config = array('output-html' => true, 'indent' => true, 'wrap' => 800);
    // Detect if Tidy is configured on this server
    if( function_exists('tidy_get_release') ) {
        $tidy = new tidy;
        $tidy->parseString($input_string, $config, 'raw');
        $tidy->cleanRepair();
        $cleaned_html = tidy_get_output($tidy);
    }
    else {
        # Tidy not configured for this server
        $cleaned_html = $input_string;
    }
    return $cleaned_html;
}

function getFromPage($webAddress, $path) {
    $source = file_get_contents($webAddress); // download the page
    $clean_source = tidy_html($source);
    $doc = new DOMDocument;
    // suppress errors
    libxml_use_internal_errors(true);
    // load the html source from a string
    $doc->loadHTML($clean_source);
    $xpath = new DOMXPath($doc);
    $data = "";
    $nodelist = $xpath->query($path);
    $node_counts = $nodelist->length; // count how many nodes were returned
    if ($node_counts) { // true if the count is more than 0
        foreach ($nodelist as $element) {
            $data = $data . $element->nodeValue . "\n";
        }
    }
    return $data;
}

$vidID = 4145616; // videoid : https://video.sibnet.ru/shell.php?videoid=4145616
$link1 = getFromPage("https://video.sibnet.ru/shell.php?videoid=".$vidID, "/html/body/script[21]/text()"); // use XPath
$json = urldecode($link1);
$link2 = strstr($json, "player.src");
$url = substr($link2, 0, strpos($link2, ","));
$url = str_replace('"', "", $url);
$url = substr($url, 18);
//header('Location: https://video.sibnet.ru'.$url);
echo ('https://video.sibnet.ru'.$url);
?>
<?php
$url='https://video.sibnet.ru'.$url;
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$a = curl_exec($ch);
$url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL); // this is what you need: the last effective URL after redirects
$realUrl = $url; //here you go
?>
SOURCE: https://stackoverflow.com/a/17473000/14885297
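
If you only need the redirect target and not the video body itself, an alternative (my own sketch, not from the answer above) is to issue a HEAD-style request and let cURL report where the redirects ended up:

<?php
// Sketch: resolve where a URL redirects to, without downloading the body.
function resolveRedirect($url) { // hypothetical helper name
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_NOBODY, true);         // HEAD-style request, skip the body
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow Location header(s)
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_exec($ch);
    $finalUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    curl_close($ch);
    return $finalUrl;
}

echo resolveRedirect('https://video.sibnet.ru'.$url);
?>

Note that some servers answer HEAD requests differently from GET, so treat this as a sketch rather than something guaranteed to work against Sibnet.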

How to retrieve broken links

I would like to retrieve the broken links of a given website.
I have this code, but it doesn't work.
Can you help me?
// function to check a url
function check_url($url) {
    //echo "Test broken links";
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 1);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $data = curl_exec($ch);
    $headers = curl_getinfo($ch);
    curl_close($ch);
    return $headers['http_code'];
}

if(check_url("https://www.amazon.com/") == 200){
    echo "<br> The link is valid <br>";
}else{
    echo "<br>broken link<br>";
}

// this function scans the whole page of a website and retrieves the hyperlink tags
function getLinks(){
    $html = file_get_contents('https://www.amazon.com/');
    $dom = new domDocument;
    @$dom->loadHTML($html); // '@' suppresses parse warnings on messy HTML
    $dom->preserveWhiteSpace = false;
    $links = $dom->getElementsByTagName('a');
    foreach ($links as $link) {
        $file = $link->getAttribute('href')."<br>";
        $lien = "https://www.amazon.com/".$file;
        echo $lien;
        echo linkexistence($lien);
    }
}
getLinks();

// The goal is to find the broken links in a website and warn about them
// check if a link exists and build a message for each
function linkexistence($url){
    // get the headers for the url
    $test = get_headers($url, 1);
    $message = "";
    // use the preg_match function on the status line
    if (preg_match("#HTTP/1.1 200i#", $test[0])) {
        $message = "Valid";
    }elseif (preg_match("#HTTP/1.1 404i#", $test[0])) {
        $message = "Non-existent page! (404)";
    }elseif (preg_match("#HTTP/1.1 301i#", $test[0])) {
        $message = "The page has been moved";
    }elseif (preg_match("#HTTP/1.1 403i#", $test[0])) {
        $message = "Access to the page refused! (403)";
    }else {
        $message = "Invalid link";
    }
    return $message;
}
The mask in your preg_match calls is wrong. Currently your mask is
#HTTP/1.1 200i#
but it has to be
#HTTP/1.1 200#i
so move the "i" after the closing "#" in all your preg_match calls.
The "i" is a pattern modifier that makes the match case-insensitive.

I want to print all cURL-scraped values. How do I do that?

This is my first time using cURL and selecting elements with XPath. Below is my current code.
<?php
//$curl = curl_init('https://silvergoldbull.com/');
$curl = curl_init('https://e-katalog.lkpp.go.id/backend/katalog/list_produk/77/?isSubmitted=1&orderBy=hargaAsc&list=5&manufakturId=all&penyediaId=all&page=1');
curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
$page = curl_exec($curl);
// check for execution errors
if(curl_errno($curl)) {
    echo 'Scraper error: ' . curl_error($curl);
    exit;
}
echo $page;
curl_close($curl);

$page_doc = new DOMDocument;
libxml_use_internal_errors(true);
$page_doc->loadHTML($page);
libxml_clear_errors(); // remove errors for yucky html
$page_doc_xpath = new DOMXPath($page_doc);
//$result = $page_doc_xpath->evaluate('/html/body/div[2]/div[5]/div/div/div[3]/div[3]/div/table/tbody/tr[1]/td/div/div[3]/div/div[1]/div/ol/li/a');
$result = $page_doc_xpath->evaluate('string(/html/body/div[2]/div[5]/div/div/div[3]/div[3]/div/table/tbody/tr[1]/td/div/div[3]/div/div[1]/div/ol/li/a)');
echo "----";
echo $result;
/* $silverprice = $page_doc_xpath->evaluate('string(/html/body/nav/div[3]/div/div/ul/li[1]/a/span/div/div/strong)');
echo $silverprice; */
/* $buyers = tree.xpath('//div[@title="buyer-name"]/text()') */
/* $regex = '/<div id="case_textlist">(.*?)<\/div>/s';
if ( preg_match($regex, $page, $list) )
    echo $list[0];
else
    print "Not found"; */
?>
Using this code I am able to retrieve "Computer Supplies" (the green bracket at the end of the page in my screenshot). However, how do I retrieve the rest of the red brackets?
Update:
I modified $result into the following code and it is still not working. It only returns "Networking" instead of everything in the brackets:
$result = $page_doc_xpath->evaluate('string(//div[@class="categoryPath"]//a)');
In my case I use Goutte to scrape the data:
use Goutte\Client;

$client = new Client();
$crawler = $client->request('GET', $url);
$titles = $crawler->filter('.listing--name')->extract('_text');
By filtering on a class or id you can get the text of the node...
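
If you want to stay with plain DOMXPath instead of Goutte: XPath's string() always collapses a node-set to the string value of its first node, which is why you only see "Networking". A minimal sketch (reusing $page_doc_xpath and the categoryPath class from the question) is to call query() and loop over the returned node list:

// query() returns a DOMNodeList of every matching <a>, not just the first
$nodes = $page_doc_xpath->query('//div[@class="categoryPath"]//a');
foreach ($nodes as $node) {
    echo $node->nodeValue . "<br>";
}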

PHP Simple HTML DOM parser returns 0

I use the PHP Simple HTML DOM parser to get some elements of a page. Unfortunately, I get 0 or 1 as a result... I would like to get the innerHTML instead.
Here is my code:
include('simple_html_dom.php');

// The url we want to scrape
$URL = 'https://www.legifrance.gouv.fr/affichTexte.do?cidTexte=JORFTEXT000033011065&dateTexte=20160821';

// cURL init
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $URL);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$result = curl_exec($ch);
curl_close($ch);

// We parse the html
$html = new simple_html_dom();
$html->load($result);

// Find all article blocks
foreach($html->find('div.data') as $article) {
    $item['title'] = $article->find('.titreSection', 0)->plaintext;
    $resultat[] = "<p>" + $item['title']."</p></br>";
}
include 'vue_scrap.php';
?>
Here is the code of my view:
foreach ($resultat as $result){
    echo $result;
}
Thank you for your help.
In fact, I had just made a mistake on that line:
$resultat[] = "<p>" + $item['title']."</p></br>";
The correct version uses the concatenation operator:
$resultat[] = "<p>".$item['title']."</p></br>";
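
For anyone hitting the same symptom: PHP's + is strictly arithmetic, so string operands get coerced to numbers, which is where the 0 or 1 came from, while . is the string concatenation operator. A tiny illustration:

// '.' concatenates strings
var_dump("<p>" . "Title"); // string(8) "<p>Title"
// '+' is numeric addition: non-numeric strings become 0 on PHP 5/7
// (PHP 7 also emits a warning); PHP 8 throws a TypeError instead
var_dump("<p>" + "Title"); // int(0) on PHP 7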

How can I read all the <status> tags from the Twitter timeline for a user?

I am trying to read a Twitter timeline with cURL, and for some reason I am unable to use preg_match. Here's my current code; do you see any issues?
$feed = "http://twitter.com/statuses/user_timeline/antonpug.xml?count=3";

function parse_feed($feed) {
    //$matches = Array();
    preg_match_all("/<status>(.*?)<\/status>/", $content[0], $matches);
    return $matches[0];
    //$stepOne = explode("<content type=\"html\">", $feed);
    //$stepTwo = explode("</content>", $stepOne[1]);
    //$tweet = $stepTwo[0];
    //$tweet = htmlspecialchars_decode($tweet,ENT_QUOTES);
    //return $tweet;
}

//Initialize the cURL session
$ch = curl_init();
//Set curl to return the data instead of printing it to the browser.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
//Set the URL
curl_setopt($ch, CURLOPT_URL, $feed);
//Execute the fetch
$twitterFeed = curl_exec($ch);
//Close the connection
curl_close($ch);
//$twitterFeed = file_get_contents($feed);
echo(parse_feed($twitterFeed));
A better idea would be to use SimpleXML and work with the XML as an object.
Your function would then look something like:
function parse_feed($feed) {
    $xml = simplexml_load_string($feed);
    if(isset($xml->status)) {
        return $xml->xpath('status');
    } else {
        return false;
    }
}
xpath() will return an array of SimpleXMLElement objects.
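
A hedged usage sketch, assuming the old Twitter XML format where each <status> carried child elements such as <text> (that XML endpoint has long since been retired):

// $twitterFeed is the raw XML fetched with cURL above
$statuses = parse_feed($twitterFeed);
if ($statuses !== false) {
    foreach ($statuses as $status) {
        // child tags are exposed as properties of each SimpleXMLElement
        echo $status->text . "\n";
    }
}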
