I dont parse this url: http://foldmunka.net
$ch = curl_init("http://foldmunka.net");
//curl_setopt($ch, CURLOPT_NOBODY, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
//curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); //not necessary unless the file redirects (like the PHP example we're using here)
$data = curl_exec($ch);
$info = curl_getinfo($ch);
curl_close($ch);
clearstatcache();
if ($data === false) {
echo 'cURL failed';
exit;
}
$dom = new DOMDocument();
$data = mb_convert_encoding($data, 'HTML-ENTITIES', "utf-8");
$data = preg_replace('/<\!\-\-\[if(.*)\]>/', '', $data);
$data = str_replace('<![endif]-->', '', $data);
$data = str_replace('<!--', '', $data);
$data = str_replace('-->', '', $data);
$data = preg_replace('#<script[^>]*?>.*?</script>#si', '', $data);
$data = preg_replace('#<style[^>]*?>.*?</style>#si', '', $data);
$data = mb_convert_encoding($data, 'HTML-ENTITIES', "utf-8");
#$dom->loadHTML($data);
$els = $dom->getElementsByTagName('*');
foreach($els as $el){
print $el->nodeName." | ".$el->getAttribute('content')."<hr />";
if($el->getAttribute('title'))$el->nodeValue = $el->getAttribute('title')." ".$el->nodeValue;
if($el->getAttribute('alt'))$el->nodeValue = $el->getAttribute('alt')." ".$el->nodeValue;
print $el->nodeName." | ".$el->nodeValue."<hr />";
}
I need sequentially the alt, title attributes and the simple text, but this page i cannot access the nodes within the body tag.
Here is a solution with DomDocument and DOMXPath. It is much shorter and runs much faster (~100ms against ~2300ms) than the other solution with Simple HTML DOM Parser.
<?php
function makePlainText($source)
{
$dom = new DOMDocument();
$dom->loadHtmlFile($source);
// use this instead of loadHtmlFile() to load from string:
//$dom->loadHtml('<html><title>Hello</title><body>Hello this site<img src="asdasd.jpg" alt="alt attr" title="title attr">click Some text.</body></html>');
$xpath = new DOMXPath($dom);
$plain = '';
foreach ($xpath->query('//text()|//a|//img') as $node)
{
if ($node->nodeName == '#cdata-section')
continue;
if ($node instanceof DOMElement)
{
if ($node->hasAttribute('alt'))
$plain .= $node->getAttribute('alt') . ' ';
if ($node->hasAttribute('title'))
$plain .= $node->getAttribute('title') . ' ';
}
if ($node instanceof DOMText)
$plain .= $node->textContent . ' ';
}
return $plain;
}
echo makePlainText('http://foldmunka.net');
I'm not sure I'm getting what this script does - the replace operations look like an attempt at sanitation but I'm not sure what for, if you're just extracting some parts of the code - but have you tried the Simple HTML DOM Browser? It may be able to handle the parsing part more easily. Check out the examples.
Here is a Simple Html DOM Parser solution just for comparison. It's output is similar for the DomDocument solution's, but this one is more complicated and runs much slower (~2300ms against DomDocument's ~100ms), so I don't recommend to use it:
Updated to work with <img> elements inside <a> elements.
<?php
require_once('simple_html_dom.php');
// we are needing this because Simple Html DOM Parser's callback handler
// doesn't handle arguments
static $processed_plain_text = '';
define('LOAD_FROM_URL', 'loadfromurl');
define('LOAD_FROM_STRING', 'loadfromstring');
function callback_cleanNestedAnchorContent($element)
{
if ($element->tag == 'a')
$element->innertext = makePlainText($element->innertext, LOAD_FROM_STRING);
}
function callback_buildPlainText($element)
{
global $processed_plain_text;
$excluded_tags = array('script', 'style');
switch ($element->tag)
{
case 'text':
// filter when 'text' is descendant of 'a', because we are
// processing the anchor tags with the required attributes
// separately at the 'a' tag,
// and also filter out other unneccessary tags
if (($element->parent->tag != 'a') && !in_array($element->parent->tag, $excluded_tags))
$processed_plain_text .= $element->innertext . ' ';
break;
case 'img':
$processed_plain_text .= $element->alt . ' ';
$processed_plain_text .= $element->title . ' ';
break;
case 'a':
$processed_plain_text .= $element->alt . ' ';
$processed_plain_text .= $element->title . ' ';
$processed_plain_text .= $element->innertext . ' ';
break;
}
}
function makePlainText($source, $mode = LOAD_FROM_URL)
{
global $processed_plain_text;
if ($mode == LOAD_FROM_URL)
$html = file_get_html($source);
elseif ($mode == LOAD_FROM_STRING)
$html = str_get_dom ($source);
else
return 'Wrong mode defined in makePlainText: ' . $mode;
$html->set_callback('callback_cleanNestedAnchorContent');
// processing with the first callback to clean up the anchor tags
$html = str_get_html($html->save());
$html->set_callback('callback_buildPlainText');
// processing with the second callback to build the full plain text with
// the required attributes of the 'img' and 'a' tags, and excluding the
// unneccessary ones like script and style tags
$html->save();
$return = $processed_plain_text;
// cleaning the global variable
$processed_plain_text = '';
return $return;
}
//$html = '<html><title>Hello</title><body>Hello <span>this</span> site<img src="asdasd.jpg" alt="alt attr" title="title attr">click <span><strong>HERE</strong></span><img src="image.jpg" title="IMAGE TITLE INSIDE ANCHOR" alt="ALTINACNHOR"> Some text.</body></html>';
echo makePlainText('http://foldmunka.net');
//echo makePlainText($html, LOAD_FROM_STRING);
Related
I'm trying to write a script to cURL a few pages from a password protected site.
The idea is to scrape information on submitted stock codes from their products database to generate and print out the results (eventually importing directly to my own database, but currently just printing the results on screen).
My function is as follows:
function LookupProduct($ItemCodes) {
//set a temp file name for the login cookie
$tmp_fname = "tmp/".md5(date('D F d')).".cookie";
$tmp_fname = realpath($tmp_fname);
//reset/declare the functions output
$return = '';
// build post data from form
$fields = array(
'UserName' => urlencode("username"),
'Password' => urlencode("password"),
);
$fieldString='';
foreach($fields as $key=>$value) {
$fieldString .= $key.'='.$value.'&';
}
rtrim($fieldString, '&');
//initialise the curl session
$ch = curl_init();
//set options for curl login
$loginurl = "https://suppliers-website/login/";
curl_setopt($ch,CURLOPT_URL, $loginurl);
curl_setopt($ch,CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch,CURLOPT_COOKIESESSION, true);
curl_setopt($ch,CURLOPT_POST, count($fields));
curl_setopt($ch,CURLOPT_SSL_VERIFYHOST, false);
curl_setopt($ch,CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch,CURLOPT_POSTFIELDS, $fieldString);
curl_setopt($ch,CURLOPT_COOKIEJAR, $tmp_fname);
curl_setopt($ch,CURLOPT_COOKIEFILE, $tmp_fname);
curl_setopt($ch,CURLOPT_RETURNTRANSFER, true);
//do the actual login, generate cookie
$result = curl_exec($ch);
//build array of codes to lookup
$codes=explode(",", $ItemCodes);
//lookup each code in the array
foreach($codes as $code) {
//set the product page to curl
$lookupUrl = "https://suppliers-website/product/".$code;
curl_setopt($ch,CURLOPT_URL, $lookupUrl);
//load product page html into $lookupcontent
unset($lookupcontent);
$lookupcontent = curl_exec($ch);
//if we have a valid page, then go ahead and pluck the data
if (strlen($lookupcontent) < 100) {
echo "<li>Error logging in: <blockquote>".$lookupcontent."</blockquote></li>";
} else {
//load product page html into a DOM
unset($dom);
unset($xpath);
$dom = new DOMDocument;
$dom->loadHTML($lookupcontent);
$xpath = new DOMXPath($dom);
//find the image src
unset($imgnames);
foreach($dom->getElementsByTagName('a') as $node) {
if (strpos($node->getAttribute('href'),'StockLoRes') !== false) {
$imgnames = explode("=", $node->getAttribute('href'));
$imgname = $imgnames[1];
$filelocation = $node->getAttribute('href');
}
}
//set the image to curl
$imglink = "https://suppliers-website/login/".$filelocation;
curl_setopt($ch,CURLOPT_URL,$imglink);
//curl the image
unset($curlimage);
$curlimage = curl_exec($ch);
//save the image locally
unset($saveimage);
$saveimage = fopen('tmp/'.$imgname, 'w');
fwrite($saveimage, $curlimage);
fclose($saveimage);
// find the product description
unset($results);
$classname = 'ItemDetails_Description';
$results = $xpath->query("//*[#class='" . $classname . "']");
if ($results->length > 0) {
$description = $results->item(0)->nodeValue;
$description = strip_tags($description);
$description = str_replace("•", "", $description);
}
//find the price
unset($pricearray);
foreach($dom->getElementsByTagName('div') as $node) {
if (strpos($node->nodeValue,'£') !== false) {
$pricearray[] = $node->nodeValue;
}
}
$pricearray=array_reverse($pricearray);
$price = $pricearray[0];
$price = str_replace("£", "", $price);
//find the title
unset($results);
$classname = 'ItemDetails_ItemName';
$results = $xpath->query("//*[#class='" . $classname . "']");
if ($results->length > 0) {
$title = $results->item(0)->nodeValue;
}
//find the publisher
unset($results);
$classname = 'ItemDetails_Publisher';
$results = $xpath->query("//*[#class='" . $classname . "']");
if ($results->length > 0) {
$publisher = $results->item(0)->nodeValue;
}
}
//add all the values to the data to be returned
$return .= '<div style="border:1px solid grey;margin:20px;float:left;">';
$return .= "<a href='tmp/".$imgname."'>";
$return .= "<img src='tmp/".$imgname."' width='100' align='left' /></a>";
$return .= "<h1>" .$title ."</h1>";
$return .= "<h3>" .$publisher ."</h3>";
$return .= "<h2>£" .$price ."</h2>";
$return .= "<h4>" .$description."</h2>";
$return .= '</div><br clear="all" />';
}
//echo out the data
echo $return;
//close connection
curl_close($ch);
}
I am using the following to trigger it:
if(isset($_POST['ItemCodes'])) {
$code=$_POST['ItemCodes'];
$code=str_replace("\n\r", ",", $code);
$code=str_replace("\r", ",", $code);
echo "ItemCodes: ".$code;
echo LookupProduct($code);
}
The script can successfully log in, save a cookie, and get info from a page, but if I try to request multiple pages the script fails to work as intended, instead returning 3 instances of the same product. Did I fail to reset a variable somewhere? I've tried unsetting everything but I still just get the same product three times, as if my function only works once.
I'm trying to scrape the following page: http://mangafox.me/manga/
I wanted the script to click on each of those links and scrape the details of each manga and for the most part my code does exactly that. It works, but for some reason the page just stops loading midway (it doesn't even go through the # list).
There is no error message so I don't know what I'm looking for. I would appreciate some advice on what I'm doing wrong.
Code:
<?php
include('simple_html_dom.php');
set_time_limit(0);
//ini_set('max_execution_time', 300);
//Creates an instance of the simple_html_dom class
$html = new simple_html_dom();
//Loads the page from the URL entered
$html->load_file('http://mangafox.me/manga');
//Finds an element and if there is more than 1 instance the variable becomes an array
$manga_urls = $html->find('.manga_list a');
//Function which retrieves information needed to populate the DB from indiviual manga pages.
function getmanga($value, $url){
$pagehtml = new simple_html_dom();
$pagehtml->load_file($url);
if ($value == 'desc') {
$description = $pagehtml->find('p.summary');
foreach($description as $d){
//return $d->plaintext;
return $desc = $d->plaintext;
}
unset($description);
} else if ($value == 'status') {
$status = $pagehtml->find('div[class=data] span');
foreach ($status as $s) {
$status = explode(",", $s->plaintext);
return $status[0];
}
unset($status);
} else if ($value == 'genre') {
$genre = $pagehtml->find('//*[#id="title"]/table/tbody/tr[2]/td[4]');
foreach ($genre as $g) {
return $g->plaintext;
}
unset($genre);
} else if ($value == 'author') {
$author = $pagehtml->find('//*[#id="title"]/table/tbody/tr[2]/td[2]');
foreach ($author as $a) {
return $a->plaintext;
}
unset($author);
} else if ($value == 'release') {
$release = $pagehtml->find('//*[#id="title"]/table/tbody/tr[2]/td[1]');
foreach ($release as $r) {
return $r->plaintext;
}
unset($release);
} else if ($value == 'image') {
$image = $pagehtml->find('.cover img');
foreach ($image as $i) {
return $i->src;
}
unset($image);
}
$pagehtml->clear();
unset($pagehtml);
}
foreach($manga_urls as $url) {
$href = $url->href;
if (strpos($href, 'http') !== false){
echo 'Title: ' . $url->plaintext . '<br />';
echo 'Link: ' . $href . '<br />';
echo 'Description: ' . getmanga('desc', $href) . '<br />';
echo 'Status: ' . getmanga('status',$href) . '<br />';
echo 'Genre: ' . getmanga('genre', $href) . '<br />';
echo 'Author: ' . getmanga('author', $href) . '<br />';
echo 'Release: ' . getmanga('release', $href) . '<br />';
echo 'Image Link: ' . getmanga('image', $href) . '<br />';
echo '<br /><br />';
}
}
$html->clear();
unset($html);
?>
So, it was not a 'just do this' fix, but I did it ;)
Beside the fact is was importing the sub pages way too much, it also had a huge simple_html_dom to iterate through. It has like 13307 items, and simple_html_dom is not made for speed or efficiency. It allocated much space for things you didn't need in this case. That is why I replaced the main simple_html_dom with a regular expression.
I think it still takes ages to load fully, and you are better of using a other language, but this is a working result :-)
https://gist.github.com/dralletje/ee996ffe4c957cdccd01
I have faced the same issue, when the loop with 20k iterations stopped without any error message. So posting the solution so it might help someone.
The issue seems to be of performance as stated before. So I decided to use curl instead of simple html dom. The function bellow returns content of website:
function getContent($url){
$ch = curl_init();
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$result = curl_exec($ch);
curl_close($ch);
if($result){
return $result;
}else{
return "";
}
}
Now to traverse the DOM, I am still using simple html dom, but the code is changed as:
$content = getContent($url);
if($content){
// Create a DOM object
$doc = new simple_html_dom();
// Load HTML from a string
$doc->load($content);
}else{
continue;
}
And at the end of each loop close and unset variable as:
$doc->clear();
unset($doc);
i want to extract couple of tables from a web page and show them in my page
i was going to use regex to extract them but then i saw the DOMDocument class
and it seems cleaner i've looked in stackoverflow and it seems all the questions are about getting inner text or using a loop to get inner nodes of elements . i want to now how can i extract and print a html element by it's id .
$html = file_get_contents("www.site.com");
$xml = new DOMDocument();
$xml->loadHTML($html);
$xpath = new DOMXPath($xml);
$table =$xpath->query("//*[#id='myid']");
$table->saveHTML(); // this obviously doesn't work
how can i show or echo the $table as an actual html table on my page ?
Firstly, DOMDocument has a getElementById() method so your XPath is unnecessary - although I suspect that is how it works underneath.
Secondly, in order to get fragments of markup rather than a whole document, you use DOMNode::C41N(), so your code would look like this:
<?php
// Load the HTML into a DOMDocument
// Don't forget you could just pass the URL to loadHTML()
$html = file_get_contents("www.site.com");
$dom = new DOMDocument('1.0');
$dom->loadHTML($html);
// Get the target element
$element = $dom->getElementById('myid');
// Get the HTML as a string
$string = $element->C14N();
See a working example.
You can use DOMElement::C14N() to get the canonicalized HTML(XML) representation of a DOMElement, or if you like a bit more control so that you can filter certain elements and attributes you can use something like this:
function toHTML($nodeList, $tagsToStrip=array('script','object','noscript','form','style'),$attributesToSkip=array('on*')) {
$html = '';
foreach($nodeList as $subIndex => $values) {
if(!in_array(strtolower($values->nodeName), $tagsToStrip)) {
if(substr($values->nodeName,0,1) != '#') {
$html .= ' <'.$values->nodeName;
if($values->attributes) {
for($i=0;$values->attributes->item($i);$i++) {
if( !in_array( strtolower($values->attributes->item($i)->nodeName) , $attributesToSkip ) && (in_array('on*',$attributesToSkip) && substr( strtolower($values->attributes->item($i)->nodeName) ,0 , 2) != 'on') ) {
$vvv = $values->attributes->item($i)->nodeValue;
if( in_array( strtolower($values->attributes->item($i)->nodeName) , array('src','href') ) ) {
$vvv = resolve_href( $this->url , $vvv );
}
$html .= ' '.$values->attributes->item($i)->nodeName.'="'.$vvv.'"';
}
}
}
if(in_array(strtolower($values->nodeName), array('br','img'))) {
$html .= ' />';
} else {
$html .= '> ';
if(!$values->firstChild) {
$html .= htmlspecialchars( $values->textContent , ENT_COMPAT , 'UTF-8' , true );
} else {
$html .= toHTML($values->childNodes,$tagsToStrip,$attributesToSkip);
}
$html .= ' </'.$values->nodeName.'> ';
}
} elseif(substr($values->nodeName,1,1) == 't') {
$inner = htmlspecialchars( $values->textContent , ENT_COMPAT , 'UTF-8' , true );
$html .= $inner;
}
}
}
return $html;
}
echo toHTML($table);
I have this strange problem parsing XML document in PHP loaded via cURL. I cannot get nodeValue containing URL address (I'm trying to implement simple RSS reader into my CMS). Strange thing is that it works for every node except that containing url addresses and date ( and ).
Here is the code (I know it is a stupid solution, but I'm kinda newbie in working with DOM and parsing XML documents).
function file_get_contents_curl($url) {
$ch = curl_init(); // initialize curl handle
curl_setopt($ch, CURLOPT_URL, $url); // set url to post to
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return into a variable
curl_setopt($ch, CURLOPT_TIMEOUT, 4); // times out after 4s
$result = curl_exec($ch); // run the whole process
return $result;
}
function vypis($adresa) {
$html = file_get_contents_curl($adresa);
$doc = new DOMDocument();
#$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
$desc = $doc->getElementsByTagName('description');
$ctg = $doc->getElementsByTagName('category');
$pd = $doc->getElementsByTagName('pubDate');
$ab = $doc->getElementsByTagName('link');
$aut = $doc->getElementsByTagName('author');
for ($i = 1; $i < $desc->length; $i++) {
$dsc = $desc->item($i);
$titles = $nodes->item($i);
$categorys = $ctg->item($i);
$pubDates = $pd->item($i);
$links = $ab->item($i);
$autors = $aut->item($i);
$description = $dsc->nodeValue;
$title = $titles->nodeValue;
$category = $categorys->nodeValue;
$pubDate = $pubDates->nodeValue;
$link = $links->nodeValue;
$autor = $autors->nodeValue;
echo 'Title:' . $title . '<br/>';
echo 'Description:' . $description . '<br/>';
echo 'Category:' . $category . '<br/>';
echo 'Datum ' . gmdate("D, d M Y H:i:s",
strtotime($pubDate)) . " GMT" . '<br/>';
echo "Autor: $autor" . '<br/>';
echo 'Link: ' . $link . '<br/><br/>';
}
}
Can you please help me with this?
To read RSS you shouldn't use loadHTML, but loadXML. One reason why your links don't show is because the <link> tag in HTML ignores its contents. See also here: http://www.w3.org/TR/html401/struct/links.html#h-12.3
Also, I find it easier to just iterate over the <item> tags and then iterate over their children nodes. Like so:
$d = new DOMDocument;
// don't show xml warnings
libxml_use_internal_errors(true);
$d->loadXML($xml_contents);
// clear xml warnings buffer
libxml_clear_errors();
$items = array();
// iterate all item tags
foreach ($d->getElementsByTagName('item') as $item) {
$item_attributes = array();
// iterate over children
foreach ($item->childNodes as $child) {
$item_attributes[$child->nodeName] = $child->nodeValue;
}
$items[] = $item_attributes;
}
var_dump($items);
Thanks for taking the time to read my post... I'm trying to extract some information from my website using Simple HTML Dom...
I have it reading from the HTML source ok, now I'm just trying to extract the information that I need. I have a feeling I'm going about this in the wrong way... Here's my script...
<?php
include_once('simple_html_dom.php');
// create doctype
$dom = new DOMDocument("1.0");
// display document in browser as plain text
// for readability purposes
//header("Content-Type: text/plain");
// create root element
$xmlProducts = $dom->createElement("products");
$dom->appendChild($xmlProducts);
$html = file_get_html('http://myshop.com/small_houses.html');
$html .= file_get_html('http://myshop.com/medium_houses.html');
$html .= file_get_html('http://myshop.com/large_houses.html');
//Define my variable for later
$product['image'] = '';
$product['title'] = '';
$product['description'] = '';
foreach($html->find('img') as $src){
if (strpos($src->src,"http://myshop.com") === false) {
$src->src = "http://myshop.com/$src->src";
}
$product['image'] = $src->src;
}
foreach($html->find('p[class*=imAlign_left]') as $description){
$product['description'] = $description->innertext;
}
foreach($html->find('span[class*=fc3]') as $title){
$product['title'] = $title->innertext;
}
echo $product['img'];
echo $product['description'];
echo $product['title'];
?>
I put echo's on the end for sake of testing...but I'm not getting anything... Any pointers would be a great HELP!
Thanks
Charles
file_get_html() returns a HTMLDom Object, and you cannot concatenate Objects, although HTMLDom have __toString methods when there concatenated there more then lilly corrupt in some way, try the following:
<?php
include_once('simple_html_dom.php');
// create doctype
$dom = new DOMDocument("1.0");
// display document in browser as plain text
// for readability purposes
//header("Content-Type: text/plain");
// create root element
$xmlProducts = $dom->createElement("products");
$dom->appendChild($xmlProducts);
$pages = array(
'http://myshop.com/small_houses.html',
'http://myshop.com/medium_houses.html',
'http://myshop.com/large_houses.html'
)
foreach($pages as $page)
{
$product = array();
$source = file_get_html($page);
foreach($source->find('img') as $src)
{
if (strpos($src->src,"http://myshop.com") === false)
{
$product['image'] = "http://myshop.com/$src->src";
}
}
foreach($source->find('p[class*=imAlign_left]') as $description)
{
$product['description'] = $description->innertext;
}
foreach($source->find('span[class*=fc3]') as $title)
{
$product['title'] = $title->innertext;
}
//debug perposes!
echo "Current Page: " . $page . "\n";
print_r($product);
echo "\n\n\n"; //Clear seperator
}
?>