<?
$file = "http://www.google.com";
$doc = new DOMDocument();
echo #$doc->loadHTML(file_get_contents($file));
$element = $doc->getElementsbyTagName('span');
echo trim($element->item(0)->nodeValue);
echo trim($element->item(0)->textContent);
if (!is_null($element)) {
$content = $element->nodeValue;
if (empty($content)) {
$content = $element->textContent;
}
echo $content . "\n";
}
?>
i am trying to test this script and am wondering why can't i parse google? if you look into the source page, hit ctrl+f type in span there is obviously a span tag. why isn't it giving me results??
<?php
$file = 'http://www.google.com';
$doc = new DOMDocument();
# $doc->loadHTML(file_get_contents($file));
$element = $doc->getElementsByTagName('span');
if (0 != $element->length)
{
$content = trim($element->item(0)->nodeValue);
if (empty($content))
{
$content = trim($element->item(0)->textContent);
}
echo $content . "\n";
}
?>
Not 100% sure, but doesnt allow_url_fopen need to be enabled in php.ini for this to work?
code removed
Related
in my table, I have a row that contains a string like this:
<p>hello</p><p>this is patrick</p><p><img src="/assets/img/myface.jpg" width="320" height="320"/></p>
and I want to give the <img> tag an alt attribute. I've got quite close now but somehow my code still shows 2 <img> tags although the string only has 1. can anyone tell me what I'm doing wrong?
this is my code so far:
$str = '<p>hello</p><p>this is patrick</p><p><img src="/assets/img/myface.jpg" width="320" height="320"/></p>';
$new_html = '';
$dom = new DOMDocument();
$dom->loadHTML($str);
$content = $dom->getElementsByTagName('*');
foreach ($content as $i => $node)
{
if ($node->nodeName == 'html' || $node->nodeName == 'body')
{
continue; // dont need to process these tags, right?
}
if ($node->nodeName == 'img')
{
$img_src = $node->getAttribute('src');
$path_arr = explode('/', $img_src);
$filename = $path_arr[count($path_arr)-1]; // myface.jpg
$alt = 'blah';
$node->setAttribute('alt', $alt);
}
echo $dom->saveXML($node);
}
$content = $dom->getElementsByTagName('img');
foreach ($content as $node) {
$img_src = $node->getAttribute('src');
$filename = basename($img_src);
$node->setAttribute('alt', $filename);
}
echo $dom->saveHTML();
Loop only through images with $content = $dom->getElementsByTagName('img');
Move $dom->saveHTML(); after lthe loop.
Get filename with $filename = basename($img_src);
The slightly changed code below does the work. It only gets the img tags and saves the HTML outside the loop. Note that I changed the way that HTML was loaded, to not include the wrapper tags.
<?php
$str = '<p>hello</p><p>this is patrick</p><p><img src="/assets/img/myface.jpg" width="320" height="320"/></p>';
$new_html = '';
$dom = new DOMDocument();
$dom->loadHTML($str, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
$content = $dom->getElementsByTagName('img');
foreach ($content as $i => $node)
{
$img_src = $node->getAttribute('src');
$path_arr = explode('/', $img_src);
$filename = $path_arr[count($path_arr)-1]; // myface.jpg
$alt = 'blah';
$node->setAttribute('alt', $alt);
}
echo $dom->saveHTML();
The problem is that when you use
echo $dom->saveXML($node);
in the loop, it will output for various tags and so the output is not the end result, but a combination of other parts of the document.
Try changing it to
echo $node->nodeName."=>".$dom->saveXML($node).PHP_EOL;
to see what it does.
You could just remove the current echo and add
echo $dom->saveXML();
after the end of the loop.
Alternatively, if you just want to process the <img> tags, you can limit the loop more specifically...
$content = $dom->getElementsByTagName('img');
foreach ($content as $i => $node)
{
$img_src = $node->getAttribute('src');
$path_arr = explode('/', $img_src);
$filename = $path_arr[count($path_arr)-1]; // myface.jpg
$alt = 'blah';
$node->setAttribute('alt', $alt);
}
echo $dom->saveXML();
What is the right syntax to use xpath to get the contents of all divs with a certain class? i seem to be getting the divs but i don't know how to get their innerHTML.
$url = "http://www.vanityfair.com/politics/2012/10/michael-lewis-profile-barack-obama";
$ctx = stream_context_create(array('http'=> array('timeout' => 10)));
libxml_use_internal_errors(TRUE);
$num = 0;
if($html = #file_get_contents($url,false,$ctx)){
$doc = DOMDocument::loadHTML($html);
$xpath = new DOMXPath($doc);
foreach($xpath->query('//div[#class="page-display"]') as $div){
$num++;
echo "$num. ";
//????
echo "<br/>";
}
echo "<br/>FINISHED";
}else{
echo "FAIL";
}
There is no HTML in the class="page-display" divs - so you're not going to get anything at all.
Do you mean the get class="parbase cn_text"?
foreach($xpath->query('//div[#class="parbase cn_text"]') as $div){
$num++;
echo "$num. ";
//????
echo $div->textContent;
echo "<br/>";
}
This would be a HTML file:
<li class="msgln">hello</li><li class="msgln">hi</li><li class="msgln">hey</li>
And php script:
$fp = fopen("file.html", 'a');
....
fclose($fp);
How to remove first <li class="msgln">hello</li>?
Content in <li> is dynamically changed
This will work even if the first li would contain other nested lis:
<?php
$doc = new DOMDocument();
$doc->loadHTML('<li class="msgln">hello</li><li class="msgln">hi</li><li class="msgln">hey</li>
');
$root = $doc->documentElement;
$p = $doc->documentElement->childNodes->item(0)->childNodes;
$li = $doc->getElementsByTagName('li')->item(0);
$li->parentNode->removeChild($li);
$html = '';
foreach ($root->childNodes->item(0)->childNodes as $child) {
$html .= $doc->saveXML($child);
}
echo $html;
?>
using regex may cause unexpected results.
You can use preg_replace to achieve this:
$html = file_get_contents('file.html');
$html = preg_replace('#^<li[^>]*>[^<]+</li>#i', '', $html);
If the content of the file is exactly as described then you could use strip_tags() such like:
$fp = fopen("file.html", 'a');
$content = fread($fp);
$content = strip_tags($content);
fclose($fp);
Alternatively you could use regular expressions but this would be slower.
$fp = fopen("file.html", 'a');
$content = fread($fp);
$text = preg_replace( "/<li.+?>.+?<\/li>/is", "", $content, 1 );
fclose($fp);
try this (without regex)
//string contains the file value
$string = '<li class="msgln">hello</li><li class="msgln">hi</li><li class="msgln">hey</li>';
$tag = '</li>';
$lis = explode($tag, $string);
if(count($lis) > 0) {
unset($lis[0]);
$string = implode($tag, $lis);
}
I found this code to check for links on an URL.
<?php
$url = "http://example.com";
$input = #file_get_contents($url);
$dom = new DOMDocument();
$dom->strictErrorChecking = false;
#$dom->loadHTML($input);
$links = $dom->getElementsByTagName('a');
foreach($links as $link) {
if ($link->hasAttribute('href')) {
$href = $link->getAttribute('href');
if (stripos($href, 'shows') !== false) {
echo "<p>http://example.com" . $href . "</p>\n";
}
}
}
?>
Works good, it shows all the links that contains 'shows'.
For example the script above find 3 links, so i get:
<p>http://example.com/shows/Link1</p>
<p>http://example.com/shows/Link2</p>
<p>http://example.com/shows/Link3</p>
Now the thing i try to do is to check those urls i just fetched also for links that contains 'shows'.
To be honest i'm a php noob, so i don't know where to start :(
Regards,
Bart
Something like:
function checklinks($url){
$input = #file_get_contents($url);
$dom = new DOMDocument();
$dom->strictErrorChecking = false;
#$dom->loadHTML($input);
$links = $dom->getElementsByTagName('a');
foreach($links as $link) {
if ($link->hasAttribute('href')) {
$href = $link->getAttribute('href');
if (stripos($href, 'shows') !== false) {
echo "<p>" . $url . "/" . $href . "</p>\n";
checklinks($url . "/" . $href);
}
}
}
}
$url = "http://example.com";
checklinks($url);
Make it recursive - call the function again in the function itself.
Thanks for taking the time to read my post... I'm trying to extract some information from my website using Simple HTML Dom...
I have it reading from the HTML source ok, now I'm just trying to extract the information that I need. I have a feeling I'm going about this in the wrong way... Here's my script...
<?php
include_once('simple_html_dom.php');
// create doctype
$dom = new DOMDocument("1.0");
// display document in browser as plain text
// for readability purposes
//header("Content-Type: text/plain");
// create root element
$xmlProducts = $dom->createElement("products");
$dom->appendChild($xmlProducts);
$html = file_get_html('http://myshop.com/small_houses.html');
$html .= file_get_html('http://myshop.com/medium_houses.html');
$html .= file_get_html('http://myshop.com/large_houses.html');
//Define my variable for later
$product['image'] = '';
$product['title'] = '';
$product['description'] = '';
foreach($html->find('img') as $src){
if (strpos($src->src,"http://myshop.com") === false) {
$src->src = "http://myshop.com/$src->src";
}
$product['image'] = $src->src;
}
foreach($html->find('p[class*=imAlign_left]') as $description){
$product['description'] = $description->innertext;
}
foreach($html->find('span[class*=fc3]') as $title){
$product['title'] = $title->innertext;
}
echo $product['img'];
echo $product['description'];
echo $product['title'];
?>
I put echo's on the end for sake of testing...but I'm not getting anything... Any pointers would be a great HELP!
Thanks
Charles
file_get_html() returns a HTMLDom Object, and you cannot concatenate Objects, although HTMLDom have __toString methods when there concatenated there more then lilly corrupt in some way, try the following:
<?php
include_once('simple_html_dom.php');
// create doctype
$dom = new DOMDocument("1.0");
// display document in browser as plain text
// for readability purposes
//header("Content-Type: text/plain");
// create root element
$xmlProducts = $dom->createElement("products");
$dom->appendChild($xmlProducts);
$pages = array(
'http://myshop.com/small_houses.html',
'http://myshop.com/medium_houses.html',
'http://myshop.com/large_houses.html'
)
foreach($pages as $page)
{
$product = array();
$source = file_get_html($page);
foreach($source->find('img') as $src)
{
if (strpos($src->src,"http://myshop.com") === false)
{
$product['image'] = "http://myshop.com/$src->src";
}
}
foreach($source->find('p[class*=imAlign_left]') as $description)
{
$product['description'] = $description->innertext;
}
foreach($source->find('span[class*=fc3]') as $title)
{
$product['title'] = $title->innertext;
}
//debug perposes!
echo "Current Page: " . $page . "\n";
print_r($product);
echo "\n\n\n"; //Clear seperator
}
?>