How to convert PHP to XML output - php

I have some PHP code that currently outputs HTML, and I need to modify it so that it outputs XML instead.
Any ideas on how I should go about this? Is there an XML library that does the job directly, or do I have to create each node manually?
My php code is:
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<style>
a {text-decoration:none; color:black;}
</style>
</head>
<body>
<?php
/**
 * Return one captured value from a preg_match_all() result.
 *
 * @param array $matches Result array filled by preg_match_all().
 * @param int   $group   Capture group number (0 = whole match).
 * @param int   $index   Index of the match within that group.
 * @return string The captured text, or '' when the group/index does not exist.
 */
function match_at($matches, $group, $index)
{
    return isset($matches[$group][$index]) ? $matches[$group][$index] : '';
}

// Form fields; fall back to '' so a request without POST data raises no notices.
$a = (isset($_POST["title"]) && is_string($_POST["title"])) ? $_POST["title"] : '';
$b = (isset($_POST["name"]) && is_string($_POST["name"])) ? $_POST["name"] : '';
$c="http://www.imdb.com/search/title?title=".urlencode($a)."&title_type=".urlencode($b);
$d=file_get_contents($c);
if ($d === false) {
    // Download failed: continue with an empty page so every pattern below finds nothing.
    $d = '';
}
preg_match_all('/<div id="main">\n(No results.)/', $d,$nore);
preg_match_all('#<img src="(.*)"#Us', $d, $img);//image
preg_match_all('/<a\s*href="\/title\/tt[0-9]*\/">((?:[a-z]*(?:&*[.]*)?\s*-*[a-z]*[0-9]*[^<])+)/i',$d,$tit); //title
preg_match_all('/<span\sclass="year_type">\s*\(([\d]*)/',$d,$ye); //movie year
preg_match_all('#<span class="credit">\n Dir: (.*)\n(?: With:)?#Us',$d,$dir); //director
preg_match_all('/<span class="rating-rating"><span class="value">([\w]*.[\w]*)/i',$d,$rat); //rating
preg_match_all('/<a\shref="(\/title\/tt[0-9]*\/)"\s*[title]+/i',$d,$lin); //link

// Normalise the first five rows: a "-" rating and a missing director both become "N/A".
for ($i = 0; $i < 5; $i++) {
    if (match_at($rat, 1, $i) == "-") {
        $rat[1][$i] = "N/A";
    }
    if (match_at($dir, 1, $i) == "") {
        $dir[1][$i] = "N/A";
    }
}

// Show at most five results.
$cnt = min(5, count($tit[1]));

// The search terms come straight from the request, so escape them before echoing.
$safeTitle = htmlspecialchars($a, ENT_QUOTES, 'UTF-8');
$safeType = htmlspecialchars($b, ENT_QUOTES, 'UTF-8');

echo"<center><b>Search Result</b></center>";
echo "<br/>";
echo "<center><b>\"$safeTitle\"of type\"$safeType\":</b></center>";
echo"<br/>";
if ($d === '' || match_at($nore, 1, 0) == "No results.") {
    echo "<center><b>No movies found!</b></center>";
} else {
    echo "<center><table border=1><tr><td><center>Image</center></td><td><center>Title</center></td><td><center>Year</center></td><td><center>Director</center></td><td><center>Rating(10)</center></td><td><center>Link to Movie</center></td></tr>";
    for ($j = 0; $j < $cnt; $j++) {
        // The image pattern stops at the closing quote of src, so the whole match
        // is an unterminated "<img src=..." tag; rebuild a complete tag from the
        // captured URL instead. Result images start at the third <img> of the page.
        $src = match_at($img, 1, $j + 2);
        $imageTag = ($src === '')
            ? ''
            : '<img src="' . htmlspecialchars($src, ENT_QUOTES, 'UTF-8', false) . '">';

        echo "<tr>";
        echo "<td>".$imageTag."</td>";
        echo "<td><center>".match_at($tit, 1, $j)."</center></td>";
        echo "<td><center>".match_at($ye, 1, $j)."</center></td>";
        echo "<td><center>".match_at($dir, 1, $j)."</center></td>";
        echo "<td><center>".match_at($rat, 1, $j)."</center></td>";
        echo '<td><center><a style="text-decoration:underline; color:blue;" href="http://www.imdb.com'.match_at($lin, 1, $j).'">Details</a></center></td>';
        echo "</tr>";
    }
    echo "</table></center>";
}
?>
</body>
</html>
Expected XML output:
<results>
<result cover="http://ia.mediaimdb.com/images/M/MV5BMjMyOTM4MDMxNV5BMl5BanBnXkFtZTcwNjIyNzExOA@@._V1._SX54_CR0,0,54,74_.jpg" title="The Amazing Spider-Man(2012)" year="2012" director="Marc Webb" rating="7.5" details="http://www.imdb.com/title/tt0948470"/>
<result cover="http://ia.mediaimdb.com/images/M/MV5BMzk3MTE5MDU5NV5BMl5BanBnXkFtZTYwMjY3NTY3._V1._SX54_CR0,0,54,74_.jpg" title="Spider-Man(2002)" year="2002" director="Sam Raimi" rating="7.3" details="http://www.imdb.com/title/tt0145487"/>
<result cover="http://ia.mediaimdb.com/images/M/MV5BODUwMDc5Mzc5M15BMl5BanBnXkFtZTcwNDgzOTY0MQ@@._V1._SX54_CR0,0,54,74_.jpg" title="Spider-Man 3 (2007)" year="2007" director="Sam Raimi" rating="6.3" details="http://www.imdb.com/title/tt0413300"/>
<result cover="http://i.mediaimdb.com/images/SF1f0a42ee1aa08d477a576fbbf7562eed/realm/feature.gif" title="The Amazing Spider-Man 2 (2014)" year="2014" director="Sam Raimi" rating="6.3" details="http://www.imdb.com/title/tt1872181"/>
<result cover="http://ia.mediaimdb.com/images/M/MV5BMjE1ODcyODYxMl5BMl5BanBnXkFtZTcwNjA1NDE3MQ@@._V1._SX54_CR0,0,54,74_.jpg" title="Spider-Man 2 (2004)" year="2004" director="Sam Raimi" rating="7.5" details="http://www.imdb.com/title/tt0316654"/>
</results>

First thing, you're parsing your html result with regex which is inefficient, unnecessary, and... well, you're answering to the cthulhu call!
Second, parsing IMDB HTML to retrieve results, although valid, might be unnecessary. There are some neat 3rd party APIs that do the job for you, like http://imdbapi.org
If you don't want to use any 3rd party API though, IMHO, you should, instead, parse the HTML using a DOM parser/manipulator, like DOMDocument, for instance, which is safer, better and, at the same time, can solve your HTML to XML problem.
Here's the bit you asked (build XML and HTML from results):
/**
 * Build an HTML table (as a DOMDocument) from the scraped results.
 *
 * Each row gets five cells: title, year, rating, an <img> built from the
 * source URL, and an <img> imported from the serialized markup. The last two
 * show two alternative techniques; keep only one of them in real code.
 *
 * @param array $results List of rows. Each row may carry the keys 'title',
 *                       'year', 'rating', 'img_src' (image URL) and 'img'
 *                       (serialized <img> markup). Missing keys give empty cells.
 * @return DOMDocument Document whose root element is the <table>; print it
 *                     with saveHTML().
 */
function resultsToHTML($results)
{
    $doc = new DOMDocument();
    $table = $doc->createElement('table');
    $doc->appendChild($table);

    foreach ($results as $r) {
        $row = $doc->createElement('tr');
        $table->appendChild($row);

        foreach (array('title', 'year', 'rating') as $key) {
            $cell = $doc->createElement('td');
            // Use a text node: the value argument of createElement() is not
            // escaped, so a bare "&" in a title would trigger a warning.
            $cell->appendChild($doc->createTextNode(isset($r[$key]) ? (string) $r[$key] : ''));
            $row->appendChild($cell);
        }

        // Technique 1: create the <img> element from its source URL.
        $imgTD = $doc->createElement('td');
        if (isset($r['img_src']) && $r['img_src'] !== '') {
            $img = $doc->createElement('img');
            $img->setAttribute('src', $r['img_src']);
            $imgTD->appendChild($img);
        }
        $row->appendChild($imgTD);

        // Technique 2: import the <img> node from the serialized markup.
        $imgTD = $doc->createElement('td');
        if (isset($r['img']) && is_string($r['img']) && $r['img'] !== '') {
            $fauxDoc = new DOMDocument();
            // The markup is HTML (e.g. an unclosed <img>), not well-formed XML,
            // so parse it with loadHTML() and keep libxml quiet while doing so.
            $previous = libxml_use_internal_errors(true);
            $loaded = $fauxDoc->loadHTML($r['img']);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);

            $img = $loaded ? $fauxDoc->getElementsByTagName('img')->item(0) : null;
            if ($img !== null) {
                $imgTD->appendChild($doc->importNode($img, true));
            }
        }
        $row->appendChild($imgTD);
    }

    return $doc;
}
/**
 * Build the XML document for the scraped results.
 *
 * Produces <results> containing one <result> element per row, with the
 * attributes cover, title, year and rating.
 *
 * @param array $results List of rows. Each row may carry the keys 'img_src',
 *                       'title', 'year' and 'rating'; a missing key yields an
 *                       empty attribute.
 * @return DOMDocument The document; print it with saveXML().
 */
function resultsToXML($results)
{
    // Attribute name in the output => key in the result row.
    $attributeMap = array(
        'cover'  => 'img_src',
        'title'  => 'title',
        'year'   => 'year',
        'rating' => 'rating',
    );

    $doc = new DOMDocument();
    $root = $doc->createElement('results');
    $doc->appendChild($root);

    foreach ($results as $r) {
        // Elements are always created by the document, never by another element.
        $element = $doc->createElement('result');
        foreach ($attributeMap as $attribute => $key) {
            // setAttribute() escapes special characters in the value itself.
            $element->setAttribute($attribute, isset($r[$key]) ? (string) $r[$key] : '');
        }
        $root->appendChild($element);
    }

    return $doc;
}
To print them, you just need to:
// Build the XML document from the result rows, then write its serialized form to the output.
$xml = resultsToXML($results);
echo $xml->saveXML();
The same applies to the HTML version: call saveHTML() on the document returned by resultsToHTML().
Here's a refactor of your code with DOMDocument, based on your post:
<?php
// Mock IMDB search parameters
$a = 'The Amazing Spider-Man';
$b = 'title';
$c = "http://www.imdb.com/search/title?title=".urlencode($a)."&title_type=".urlencode($b);
// HTML might be malformed so we want DOMDocument to be quiet
libxml_use_internal_errors(true);
// Initialize DOMDocument parser
$doc = new DOMDocument();
// Fetch the search page from the URL and parse it
$doc->loadHTMLFile($c);
// Initialize array to store results
$results = array();
// Get the table of results and extract its list of rows
$listOfTables = $doc->getElementsByTagName('table');
$rows = getResultRows($listOfTables);
if ($rows === null) {
    // getResultRows() returns nothing when the page has no results table.
    $rows = array();
}
$i = 0;
// Loop through all rows to retrieve information; a field is stored only
// when the row actually provides it.
foreach ($rows as $row) {
    if ($title = getTitle($row)) {
        $results[$i]['title'] = $title;
    }
    if (!is_null($year = getYear($row)) && $year) {
        $results[$i]['year'] = $year;
    }
    if (!is_null($rating = getRating($row)) && $rating) {
        $results[$i]['rating'] = $rating;
    }
    // getImage() serializes the node through the document, so pass it along.
    if ($img = getImage($row, $doc)) {
        $results[$i]['img'] = $img;
    }
    if ($src = getImageSrc($row)) {
        $results[$i]['img_src'] = $src;
    }
    ++$i;
}
// The first result can be a false positive due to the
// results' table header, so we remove it
if (isset($results[0])) {
    array_shift($results);
}
FUNCTIONS
/**
 * Find the first table whose class attribute is exactly "results" and
 * return its rows.
 *
 * @param iterable $listOfTables Table elements to inspect.
 * @return DOMNodeList|null The <tr> elements of that table, or null when no
 *                          table matches.
 */
function getResultRows($listOfTables)
{
    $rows = null;
    foreach ($listOfTables as $candidate) {
        if ($candidate->getAttribute('class') !== 'results') {
            continue;
        }
        $rows = $candidate->getElementsByTagName('tr');
        break;
    }
    return $rows;
}
/**
 * Return the src attribute of the first <img> inside a row.
 *
 * @param DOMElement $row Row element to search.
 * @return string|false The src value, or false when the row has no image.
 */
function getImageSrc($row)
{
    $firstImage = $row->getElementsByTagName('img')->item(0);
    if (is_null($firstImage)) {
        return false;
    }
    return $firstImage->getAttribute('src');
}
/**
 * Return the serialized HTML of the first <img> inside a row.
 *
 * @param DOMElement       $row Row element to search.
 * @param DOMDocument|null $doc Document used to serialize the node. Optional:
 *                              when omitted, the row's own document is used,
 *                              so getImage($row) works as well.
 * @return string|false The <img> markup, or false when the row has no image.
 */
function getImage($row, $doc = null)
{
    $img = $row->getElementsByTagName('img')->item(0);
    if (is_null($img)) {
        return false;
    }
    if (is_null($doc)) {
        $doc = $row->ownerDocument;
    }
    return $doc->saveHTML($img);
}
/**
 * Return the text of the first link inside the row's "title" cell.
 *
 * @param DOMElement $row Row element to search.
 * @return string|false The link text, or false when the row has no title
 *                      cell or that cell contains no link.
 */
function getTitle($row)
{
    $tdInfo = getTDInfo($row->getElementsByTagName('td'));
    if (is_null($tdInfo)) {
        return false;
    }
    // getElementsByTagName() always returns a list (possibly empty), so the
    // check that matters is whether it has a first item.
    $firstLink = $tdInfo->getElementsByTagName('a')->item(0);
    if (is_null($firstLink)) {
        return false;
    }
    return $firstLink->nodeValue;
}
/**
 * Return the year shown in the row's "title" cell.
 *
 * Looks for the first <span> whose class is exactly "year_type" and strips
 * the parentheses from its text.
 *
 * @param DOMElement $row Row element to search.
 * @return string|null The span text without "(" and ")", or null when there
 *                     is no title cell or no such span.
 */
function getYear($row)
{
    $infoCell = getTDInfo($row->getElementsByTagName('td'));
    if (is_null($infoCell)) {
        return null;
    }
    $spanList = $infoCell->getElementsByTagName('span');
    if (is_null($spanList)) {
        return null;
    }
    foreach ($spanList as $candidate) {
        if ($candidate->getAttribute('class') === 'year_type') {
            return str_replace(array('(', ')'), '', $candidate->nodeValue);
        }
    }
    return null;
}
/**
 * Return the rating shown in the row's "title" cell.
 *
 * @param DOMElement $row Row element to search.
 * @return string|null Text of the first <span> whose class is exactly
 *                     "rating-rating", or null when there is no title cell
 *                     or no such span.
 */
function getRating($row)
{
    $infoCell = getTDInfo($row->getElementsByTagName('td'));
    if (is_null($infoCell)) {
        return null;
    }
    $spanList = $infoCell->getElementsByTagName('span');
    if (is_null($spanList)) {
        return null;
    }
    foreach ($spanList as $candidate) {
        if ($candidate->getAttribute('class') === 'rating-rating') {
            return $candidate->nodeValue;
        }
    }
    return null;
}
/**
 * Pick the first cell whose class attribute equals "title".
 *
 * @param iterable $tds Cell elements to inspect.
 * @return DOMElement|null The matching cell, or null when none matches.
 */
function getTDInfo($tds)
{
    $found = null;
    foreach ($tds as $cell) {
        if ($cell->getAttribute('class') == 'title') {
            $found = $cell;
            break;
        }
    }
    return $found;
}

Related

recursive PHP function return Allowed memory size exhausted

I'm working on a small DOM project that collects dynamic and static links, but my function takes a very long time to run and then fails with the error shown below:
"Allowed memory size of ... bytes exhausted"
this is my PHP code:
public $domain_name = 'www.example.com';
/**
 * Load $url, walk its <a> elements and, for links on this object's domain,
 * echo whether the link has a query string, then recurse into the link.
 *
 * NOTE(review): $pages starts empty on every call and the method returns from
 * inside the loop, so the in_array() check never sees links recorded by other
 * calls, and the recursion on a same-host link happens even when that link was
 * already recorded. Pages that link to each other therefore recurse without end.
 *
 * file_get_html() and valid_url() are not defined in this snippet; presumably
 * they come from simple_html_dom and a project helper (confirm).
 *
 * @param string $url Page to load.
 * @return mixed Whatever the innermost recursive call returns; nothing when
 *               no same-host link is found.
 */
public function dynamic_url2($url = "http://www.example.com"){
// Fresh, empty list for this call only; it is not shared with recursive calls.
$pages = array();
$html = file_get_html($url);
foreach($html->find('a') as $page){
if(valid_url($page->href)){
$parse_page = parse_url($page->href);
// Only follow links whose host equals the configured domain name.
if($parse_page['host'] == $this->domain_name){
if(!in_array($page->href, $pages)){
$pages[] = $page->href;
// A "query" component in the parsed URL is what this code reports as dynamic.
if(array_key_exists('query', $parse_page))
echo 'contain dynamic parameters : '. $page->href.'<br>';
else
echo 'not dynamic : '. $page->href.'<br>';
}
// Returns immediately with the recursive result, so the loop never reaches
// a second same-host link of this page.
return $this->dynamic_url2($page->href);
}
}
}
}
is my function correct ? how can i optimize it ?
thanks
Apart from some minor adjustments that I made while testing, you only need to make $pages modifiable (via &$pages in the function declaration) and pass the $pages array with every recursive call.
public $domain_name = 'https://www.example.html';
/**
 * Recursively collect every link on this object's domain that is reachable
 * from $url.
 *
 * @param string $url   Page to crawl. Relative links found on it are not
 *                      resolved; they are skipped unless valid_url() accepts them.
 * @param array  $pages Accumulator shared by reference across the recursion;
 *                      holds every accepted link exactly once.
 * @return array The accumulated list of links.
 */
public function dynamic_url2($url, &$pages = []){
    // $domain_name may hold a bare host or a full URL; compare host names only,
    // because parse_url() on a link yields just the host part.
    $ownHost = parse_url($this->domain_name, PHP_URL_HOST);
    if (!$ownHost) {
        $ownHost = $this->domain_name;
    }

    $markup = file_get_contents($url);
    if ($markup === false || $markup === '') {
        // Unreachable or empty page: nothing to parse (loadHTML rejects '').
        return $pages;
    }

    $dom = new DOMDocument;
    libxml_use_internal_errors(true); // for malformed html warning suppression
    $dom->loadHTML($markup);
    // Drop the buffered parse errors, otherwise they pile up in memory for
    // every page visited by the recursion.
    libxml_clear_errors();

    $xpath = new DOMXPath($dom);
    foreach ($xpath->query("//a") as $a) {
        $href = $a->getAttribute('href');
        if (!valid_url($href)) {
            continue;
        }
        // Links without a host component yield null here and are ignored.
        $host = parse_url($href, PHP_URL_HOST);
        if ($host === $ownHost && !in_array($href, $pages, true)) {
            // Record the link before recursing so cyclic links end the recursion.
            $pages[] = $href;
            $this->dynamic_url2($href, $pages);
        }
    }
    return $pages;
}
var_export($this->dynamic_url2($this->domain_name));

PHP Simplexml iterator

How can I build an array that reflects the structure of the following XML?
Thanks in advance.
XML source :
<document>
<section type="group" width="100">
<section type="list" width"50"/>
<style>classe1 {color:red}</style>
<section type="text" height="25">azerty</section>
</section>
</document>
Please note the three tags ('section', 'style' then 'section') embedded in the first level 'section'
The generated array should reflect this nesting, the attributes and the order of the tags, for example:
Array
{
[0]=>Array
{
[key]=>section
[attributes]=>Array
{
[type]=>group
[width]=>100
}
[0]=>Array
{
[key]=>section
[attributes]=>Array
{
[type]=>list
[width]=>50
}
}
[1]=>Array
{
[key]=>style
[content]=>classe1 {color:red}
}
[2]=>Array
{
[key]=>section
[attributes]=>Array
{
[type]=>text
[width]=>25
}
[content]=>azerty
}
}
}
I tried the following code, without success:
<?php
/**
 * Load an XML file and convert it with sxiToArray().
 *
 * @param string $fName Path or URL of the XML file.
 * @return array The array produced by sxiToArray() for the document root.
 */
function xml2array($fName)
{
    // Third argument true: $fName is a path/URL, not an XML string.
    return sxiToArray(new SimpleXMLIterator($fName, null, true));
}
/**
 * Convert the children of a SimpleXMLIterator into an array keyed by tag name.
 *
 * For every child: when it has child elements, the recursive conversion is
 * appended under its tag name; otherwise the iterator's attributes are stored
 * under 'attributs' and the child's text is appended.
 *
 * @param SimpleXMLIterator $sxi Element whose children are converted.
 * @return array Tag name => list of converted children / text values.
 */
function sxiToArray($sxi)
{
    $collected = array();
    $sxi->rewind();
    while ($sxi->valid()) {
        if (!array_key_exists($sxi->key(), $collected)) {
            $collected[$sxi->key()] = array();
        }
        if ($sxi->hasChildren()) {
            $collected[$sxi->key()][] = sxiToArray($sxi->current());
        } else {
            $collected[$sxi->key()]['attributs'] = $sxi->attributes();
            $collected[$sxi->key()][] = strval($sxi->current());
        }
        $sxi->next();
    }
    return $collected;
}
// Convert temp.xml and dump the resulting array; report any exception message instead.
try {
    $catArray = xml2array("temp.xml");
    $dump = print_r($catArray, true);
    echo '<pre>' . $dump;
} catch (Exception $e) {
    echo 'ERREUR : ' . $e->getMessage();
}
?>
I've updated the code to achieve what I think you're after. There are a few places where I've handled the arrays differently, especially the attributes: I add them one at a time so that I can build the key/value structure.
<?php
error_reporting(E_ALL);
ini_set('display_errors', 1);
/**
 * Load an XML file and return the array for its first top-level child element.
 *
 * @param string $fName Path or URL of the XML file.
 * @return array The first entry of the list built by sxiToArray().
 */
function xml2array($fName)
{
    // Third argument true: $fName is a path/URL, not an XML string.
    $iterator = new SimpleXMLIterator($fName, null, true);
    $iterator->rewind();
    $topLevel = sxiToArray($iterator);
    return $topLevel[0];
}
/**
 * Convert the children of a SimpleXMLIterator into an ordered list of arrays.
 *
 * Every child becomes one entry holding 'key' (the tag name), 'attributes'
 * (name => value, only when the element has attributes), then either its
 * converted children under numeric keys or, for a leaf with non-empty text,
 * 'content'.
 *
 * @param SimpleXMLIterator $sxi Element whose children are converted.
 * @return array List of entries in document order.
 */
function sxiToArray($sxi)
{
    $nodes = array();
    $sxi->rewind();
    while ($sxi->valid()) {
        $entry = array('key' => $sxi->key());
        foreach ($sxi->current()->attributes() as $name => $value) {
            $entry['attributes'][(string)$name] = (string)$value;
        }
        if ($sxi->hasChildren()) {
            // Child entries are merged in under numeric keys 0, 1, 2, ...
            $entry = array_merge($entry, sxiToArray($sxi->current()));
        } else {
            $text = strval($sxi->current());
            if ($text !== '') {
                $entry['content'] = $text;
            }
        }
        $nodes[] = $entry;
        $sxi->next();
    }
    return $nodes;
}
// Convert t1.xml and dump the resulting array; report any exception message instead.
// (The duplicated block that followed the catch has been removed: it ran
// unconditionally and dereferenced $e, which is undefined when nothing was thrown.)
try
{
    $catArray = xml2array("t1.xml");
    echo '<pre>'.print_r($catArray,true);
}
catch(Exception $e)
{
    echo 'ERREUR : '.$e->getMessage();
}
I also had to correct the XML as
<section type="list" width"50"/>
should be
<section type="list" width="50"/>
(missing =)

Get second variable of a foreach array in AJAX

To clean it up, I put this question with a new topic.
I'm using the autocompleter of "http://www.devbridge.com/sourcery/components/jquery-autocomplete/" but in a previous version because of its lightweight and speed. the version I used can be found here: https://code.google.com/p/jquery-autocomplete/source/browse/
Within that I try to receive results with
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
$(function() {
$("#ac1").autocomplete('search.php', {
selectFirst: true
});
$("#flush").click(function() {
var ac = $("#ac1").data('autocompleter');
if (ac && $.isFunction(ac.cacheFlush)) {
ac.cacheFlush();
} else {
alert('Error flushing cache');
}
});
the data.php is simply structured:
$data = array(
"Berlin" => "10178",
"Hamburg" => "20038",
"München" => "80331",
and the search.php file contains the following:
<?php
include 'data.php';
/**
 * Print the results in the plain-text autocomplete format: one
 * "name|value" line per result.
 *
 * @param array $results List of two-element arrays: [0] name, [1] value.
 * @return void The lines are echoed; nothing is returned.
 */
function autocomplete_format($results) {
    foreach ($results as $pair) {
        echo $pair[0], '|', $pair[1], "\n";
    }
}
// Start from an empty list so both output branches always receive an array,
// even when no query was sent or nothing matched.
$results = array();
if (isset($_GET['q']) && is_string($_GET['q'])) {
    $q = strtolower($_GET['q']);
    if ($q !== '') {
        // Keep every city whose name contains the search term. strtolower()
        // only folds ASCII letters, so non-ASCII letters match case-sensitively.
        foreach ($data as $key => $value) {
            if (strpos(strtolower($key), $q) !== false) {
                $results[] = array($key, $value);
            }
        }
    }
}

// Output format: "autocomplete" (plain text) unless ?output=json is requested.
$output = 'autocomplete';
if (isset($_GET['output']) && is_string($_GET['output'])) {
    $output = strtolower($_GET['output']);
}

if ($output === 'json') {
    echo json_encode($results);
} else {
    // autocomplete_format() prints the lines itself and returns nothing,
    // so there is no value to echo.
    autocomplete_format($results);
}
What I am trying to achieve now: after I choose a city from the dropdown menu, the number stored next to that city in data.php should be shown in a new label.
Try this:
echo $data['Berlin'];

RSS Parser to include Categories

I recently inherited a RSS/XML parser, and while it seems to work really good, I'm finding some things are missing.
For instance, pulling in a RSS feed from a blog. It's missing all the categories in the items. It shows as each item having only one category when in reality it should show as having a multitude of categories.
Link to Demo: http://dev.o7t.in/rss/
Link to Actual Feed: http://o7thblog.com/feed/
You can see how the first item in the feed itself has 8 total categories in the first item. (may need to view source)
However, in the Demo you can see that it only shows 1 category
Here is my entire code for the class:
<?php
/**
 * Downloads an RSS/RDF feed and exposes it as nested arrays (or JSON).
 *
 * Each element is stored as a node array with the keys 'properties' (its
 * attributes, when present) and 'value' (its text or its child nodes keyed by
 * tag name). When several sibling elements share a tag name (for example the
 * <category> elements of an item), that key holds a list of node arrays in
 * document order instead of only the last one. Items are keyed "item:N".
 */
class o7thRssFeedPuller{
    public $FeedUrl = ''; // URL of the feed to pull in
    public $ReturnJson = false; // Return the array as a JSON encoded string instead?
    public $MaxItems = 0; // 0 = unlimited (except by feed), only applicable to GetItems
    // Internal holders
    private $document;
    private $channel;
    private $items;

    /**
     * Get the full feed.
     *
     * @param bool $includeAttributes true returns the raw node arrays
     *                                (with 'properties'/'value'), false returns values only.
     * @return array|string Array, or a JSON string when $ReturnJson is set.
     */
    public function GetRSS($includeAttributes = false) {
        $this->loadParser($this->fetchFeed());
        $data = $includeAttributes ? $this->document : $this->valueReturner();
        return ($this->ReturnJson) ? json_encode($data) : $data;
    }

    /**
     * Get the channel data (everything in the channel except the items).
     *
     * @param bool $includeAttributes See GetRSS().
     * @return array|string Array, or a JSON string when $ReturnJson is set.
     */
    public function GetChannel($includeAttributes = false) {
        $this->loadParser($this->fetchFeed());
        $data = $includeAttributes ? $this->channel : $this->valueReturner($this->channel);
        return ($this->ReturnJson) ? json_encode($data) : $data;
    }

    /**
     * Get the items, limited to $MaxItems when that is not 0.
     *
     * @param bool $includeAttributes See GetRSS().
     * @return array|string List of items, or a JSON string when $ReturnJson is set.
     */
    public function GetItems($includeAttributes=false) {
        $this->loadParser($this->fetchFeed());
        $arr = $includeAttributes ? $this->items : $this->valueReturner($this->items);
        if($this->MaxItems != 0) {
            $arr = array_slice($arr, 0, $this->MaxItems);
        }
        return ($this->ReturnJson) ? json_encode($arr) : $arr;
    }

    // -------------------------------------------------------------------------------------------------
    // Internal Methods

    /**
     * Download the feed body.
     *
     * @return string|false The raw feed, or false when it could not be read.
     */
    private function fetchFeed() {
        return file_get_contents($this->FeedUrl, false, $this->randomContext());
    }

    /**
     * Parse the feed text into $document, $channel and $items.
     *
     * @param string|false $rss Raw feed; a falsy value leaves all holders empty.
     */
    private function loadParser($rss=false) {
        // Always reset, so a failed download or parse never leaves stale or
        // uninitialised holders behind.
        $this->document = array();
        $this->channel = array();
        $this->items = array();
        if(!$rss) {
            return;
        }
        $DOMDocument = new DOMDocument;
        $DOMDocument->strictErrorChecking = false;
        if(!$DOMDocument->loadXML($rss)) {
            return;
        }
        $extracted = $this->extractDOM($DOMDocument->childNodes);
        $this->document = is_array($extracted) ? $extracted : array();
    }

    /**
     * Strip the 'properties'/'value' wrappers, keeping only the values.
     *
     * @param array|false $valueBlock Node arrays to reduce; false means the whole document.
     * @return mixed The same structure with every node replaced by its value.
     */
    private function valueReturner($valueBlock=false) {
        // Compare against false explicitly: an empty array (a feed without
        // items, say) is a valid block and must not fall back to the document.
        if($valueBlock === false) {
            $valueBlock = $this->document;
        }
        if(!is_array($valueBlock)) {
            return $valueBlock;
        }
        foreach($valueBlock as $valueName => $values) {
            if(isset($values['value'])) {
                $values = $values['value'];
            }
            if(is_array($values)) {
                // Also covers the list built for repeated sibling elements.
                $valueBlock[$valueName] = $this->valueReturner($values);
            } else {
                $valueBlock[$valueName] = $values;
            }
        }
        return $valueBlock;
    }

    /**
     * Convert a DOM node list into nested node arrays and collect the channel
     * fields and items along the way.
     *
     * @param iterable     $nodeList       Nodes to convert.
     * @param string|false $parentNodeName Tag name of the parent element.
     * @return array|string|null Child nodes keyed by tag name, the text of a
     *                           text-only element, or null when there is nothing.
     */
    private function extractDOM($nodeList,$parentNodeName=false) {
        $itemCounter = 0;
        $occurrences = array(); // how many siblings used each key so far
        $tempNode = null;
        foreach($nodeList as $values) {
            $name = $values->nodeName;
            if(substr($name,0,1) != '#') {
                if($name == 'item') {
                    $nodeName = $name.':'.$itemCounter;
                    $itemCounter++;
                } else {
                    $nodeName = $name;
                }
                $node = array();
                if($values->attributes) {
                    foreach($values->attributes as $attribute) {
                        $node['properties'][$attribute->nodeName] = $attribute->nodeValue;
                    }
                }
                if(!$values->firstChild) {
                    $node['value'] = $values->textContent;
                } else {
                    $node['value'] = $this->extractDOM($values->childNodes, $name);
                }
                // Mixed content: once an element is seen, element children win over text.
                if(!is_array($tempNode)) {
                    $tempNode = array();
                }
                if(!isset($occurrences[$nodeName])) {
                    $tempNode[$nodeName] = $node;
                    $occurrences[$nodeName] = 1;
                } elseif($occurrences[$nodeName] === 1) {
                    // Second sibling with this name: turn the single node into a list.
                    $tempNode[$nodeName] = array($tempNode[$nodeName], $node);
                    $occurrences[$nodeName] = 2;
                } else {
                    $tempNode[$nodeName][] = $node;
                    $occurrences[$nodeName]++;
                }
                if(in_array($parentNodeName, array('channel','rdf:RDF'), true)) {
                    if($name == 'item') {
                        $this->items[] = $node['value'];
                    } elseif(!in_array($name, array('rss','channel'), true)) {
                        $this->channel[$name] = $tempNode[$nodeName];
                    }
                }
            } elseif(substr($name,1) == 'text') {
                $tempValue = trim(preg_replace('/\s\s+/',' ',str_replace("\n",' ', $values->textContent)));
                // Compare with '' so that a literal "0" is kept as a value.
                if($tempValue !== '' && !is_array($tempNode)) {
                    $tempNode = $tempValue;
                }
            } elseif(substr($name,1) == 'cdata-section'){
                if(!is_array($tempNode)) {
                    $tempNode = $values->textContent;
                }
            }
        }
        return $tempNode;
    }

    /**
     * Build a stream context carrying randomised browser-like request headers.
     *
     * @return resource Stream context for file_get_contents().
     */
    private function randomContext() {
        $headerstrings = array();
        $headerstrings['User-Agent'] = 'Mozilla/5.0 (Windows; U; Windows NT 5.'.rand(0,2).'; en-US; rv:1.'.rand(2,9).'.'.rand(0,4).'.'.rand(1,9).') Gecko/2007'.rand(10,12).rand(10,30).' Firefox/2.0.'.rand(0,1).'.'.rand(1,9);
        $headerstrings['Accept-Charset'] = rand(0,1) ? 'en-gb,en;q=0.'.rand(3,8) : 'en-us,en;q=0.'.rand(3,8);
        $headerstrings['Accept-Language'] = 'en-us,en;q=0.'.rand(4,6);
        $setHeaders = 'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'."\r\n".
            'Accept-Charset: '.$headerstrings['Accept-Charset']."\r\n".
            'Accept-Language: '.$headerstrings['Accept-Language']."\r\n".
            'User-Agent: '.$headerstrings['User-Agent']."\r\n";
        $contextOptions = array(
            'http'=>array(
                'method'=>"GET",
                'header'=>$setHeaders
            )
        );
        return stream_context_create($contextOptions);
    }
}
?>
And for the demo page:
<?php
require_once($_SERVER['DOCUMENT_ROOT'] . '/rss/o7th.rss.feed.puller.php');
// Pull the first two items of the feed and dump them into a textarea.
$fp = new o7thRssFeedPuller();
$fp->FeedUrl = 'http://o7thblog.com/feed';
$fp->MaxItems = 2;
echo '<table width="100%" cellpadding="0" cellspacing="0">';
echo ' <tr>';
echo ' <td>';
echo ' <textarea cols="120" rows="30">';
// The dump contains remote feed content: escape it so markup inside the feed
// (a "</textarea>", for instance) cannot break out of the textarea.
echo htmlspecialchars(print_r($fp->GetItems(), true), ENT_QUOTES, 'UTF-8');
echo ' </textarea>';
echo ' </td>';
echo ' </tr>';
echo '</table>';
?>
So, I assume that the issue lies somewhere in either the valueReturner method or the extractDOM method, but I am just not sure where, nor what I can do to get all the categories in the returned array.
Can you help?
I would suggest using SimpleXML to parse the feed.
Here is how you can do it:
$feed_url = 'http://o7thblog.com/feed/';
// LIBXML_NOCDATA merges CDATA sections into plain text nodes.
$feed = simplexml_load_file($feed_url, null, LIBXML_NOCDATA);
if ($feed === false) {
    // Download or parse failure: there is no channel to read.
    exit("Unable to load the feed at {$feed_url}\n");
}
$channel = $feed->channel;
echo "<h1>{$channel->title}</h1>\n";
echo "{$channel->description}\n";
echo "<dl>\n";
foreach ($channel->item as $item) {
    // Iterate the <category> siblings explicitly and cast each one to a string,
    // so every category is collected.
    $categories = array();
    foreach ($item->category as $category) {
        $categories[] = (string) $category;
    }
    echo "<dt>{$item->title}</dt>\n"
        . "<dd style=\"margin-bottom: 30px;\"><div style=\"font-size: small;\">{$item->pubDate}</div>\n"
        . "<div>{$item->description}</div>\n"
        . "Categories: <strong>".implode('</strong>, <strong>', $categories) . "</strong>\n</dd>";
}
echo "</dl>\n";
Above shows you all categories.
You have written a custom parser for what you can do simply with one line of code!
$feed = (array) simplexml_load_file('http://o7thblog.com/feed/', null, LIBXML_NOCDATA);

Using Simple Html Dom to remove some elements

This is the page I'm trying to parse using Simple Html Dom. I've got 90% of the functionality done, but since I'm new to the library I'm not quite sure how to do the last part.
I want to scrape the text of each news item, but since the text is inside the <p>element, using something like ->innertext bring everything inside, including the link.
Here's what I've tried:
<h1>Scraper Noticias</h1>
<?php
include('simple_html_dom.php');
/**
 * Plain data holder for one scraped news item, with a getter and a setter
 * for each field.
 */
class News {
    /** Thumbnail image URL. */
    public $image;
    /** Post date text. */
    public $fechanoticia;
    /** Headline. */
    public $title;
    /** Body text. */
    public $description;
    /** Link to the full article. */
    public $sourceurl;

    // --- getters ---
    public function get_image() {
        return $this->image;
    }
    public function get_fechanoticia() {
        return $this->fechanoticia;
    }
    public function get_title() {
        return $this->title;
    }
    public function get_description() {
        return $this->description;
    }
    public function get_sourceurl() {
        return $this->sourceurl;
    }

    // --- setters ---
    public function set_image($new_image) {
        $this->image = $new_image;
    }
    public function set_fechanoticia($new_fechanoticia) {
        $this->fechanoticia = $new_fechanoticia;
    }
    public function set_title($new_title) {
        $this->title = $new_title;
    }
    public function set_description($new_description) {
        $this->description = $new_description;
    }
    public function set_sourceurl($new_sourceurl) {
        $this->sourceurl = $new_sourceurl;
    }
}
// Load the news page into a simple_html_dom object (file_get_html() comes
// from the included simple_html_dom.php).
$html = file_get_html('http://www.uvm.cl/noticias_mas.shtml');
// NOTE(review): declared but never filled anywhere in this script.
$parsedNews = array();
// Find all news items: every <p> inside the element with id "cont2".
foreach($html->find('#cont2 p') as $element) {
$newItem = new News;
// Parse the news item's thumbnail image (the last <img> found wins).
foreach ($element->find('img') as $image) {
$newItem->set_image($image->src);
//echo $newItem->get_image() . "<br />";
}
// Parse the news item's post date.
foreach ($element->find('span.fechanoticia') as $fecha) {
$newItem->set_fechanoticia($fecha->innertext);
//echo $newItem->get_fechanoticia() . "<br />";
}
// Parse the news item's title (text of the last <a> found).
foreach ($element->find('a') as $title) {
$newItem->set_title($title->innertext);
//echo $newItem->get_title() . "<br />";
}
// Parse the news item's source URL link (href of the last <a>, prefixed with the site root).
foreach ($element->find('a') as $sourceurl) {
$newItem->set_sourceurl("http://www.uvm.cl/" . $sourceurl->href);
}
// Parse the news items' description text.
// NOTE(review): $link is never assigned in this script; this line is the open question.
echo $link; //This is the entire <p> tag. How can I get just the text. Not the link?
}
?>
Here's a solution I found. Although if I can improve the code, it would be appreciated.
<h1>Scraper Noticias</h1>
<?php
include('simple_html_dom.php');
/**
 * Plain data holder for one scraped news item, with a getter and a setter
 * for each field.
 */
class News {
    /** Thumbnail image URL. */
    public $image;
    /** Post date text. */
    public $fechanoticia;
    /** Headline. */
    public $title;
    /** Body text. */
    public $description;
    /** Link to the full article. */
    public $sourceurl;

    // --- getters ---
    public function get_image() {
        return $this->image;
    }
    public function get_fechanoticia() {
        return $this->fechanoticia;
    }
    public function get_title() {
        return $this->title;
    }
    public function get_description() {
        return $this->description;
    }
    public function get_sourceurl() {
        return $this->sourceurl;
    }

    // --- setters ---
    public function set_image($new_image) {
        $this->image = $new_image;
    }
    public function set_fechanoticia($new_fechanoticia) {
        $this->fechanoticia = $new_fechanoticia;
    }
    public function set_title($new_title) {
        $this->title = $new_title;
    }
    public function set_description($new_description) {
        $this->description = $new_description;
    }
    public function set_sourceurl($new_sourceurl) {
        $this->sourceurl = $new_sourceurl;
    }
}
// Create DOM from URL or file
$html = file_get_html('http://www.uvm.cl/noticias_mas.shtml');
if (!$html) {
    // The page could not be loaded or parsed; there is nothing to scrape.
    exit('Unable to load the news page.');
}
$parsedNews = array();
// Find all news items.
foreach ($html->find('#cont2 p') as $element) {
    $newItem = new News;
    // Parse the news item's thumbnail image.
    foreach ($element->find('img') as $image) {
        $newItem->set_image($image->src);
    }
    // Parse the news item's post date.
    foreach ($element->find('span.fechanoticia') as $fecha) {
        $newItem->set_fechanoticia($fecha->innertext);
    }
    // Title and source URL come from the same anchors, so read both in one pass.
    foreach ($element->find('a') as $anchor) {
        $newItem->set_title($anchor->innertext);
        $newItem->set_sourceurl("http://www.uvm.cl/" . $anchor->href);
    }
    // Blank out the link, date and image markup so that only the description
    // text is left inside the paragraph.
    foreach (array('a', 'span', 'img') as $selector) {
        foreach ($element->find($selector) as $child) {
            $child->outertext = '';
        }
    }
    // Keep the description on the item and collect the item, instead of
    // only printing the text.
    $newItem->set_description($element->innertext);
    $parsedNews[] = $newItem;
    echo $element->innertext;
}
?>
Use the innertext instead of outertext
foreach ($element->find('a') as $sourceurl) {
echo $sourceurl->innertext . "<br />";
}

Categories