The following codes scrapes a list of links from a given webpage and then place them into another script that scrapes the text from the given links and places the data into a csv document. The code runs perfectly on localhost (wampserver 5.5 php) but fails horribly when placed on domain.
You can check out the functionality of the script at http://miskai.tk/ANOFM/csv.php .
Also, file get html and curl are both enabled onto the server.
<?php
header('Content-Type: application/excel');
header('Content-Disposition: attachment; filename="Mehedinti.csv"');
include_once 'simple_html_dom.php';
include_once 'csv.php';
$urls = scrape_main_page();
function scraping($url) {
// create HTML DOM
$html = file_get_html($url);
// get article block
if ($html && is_object($html) && isset($html->nodes)) {
foreach ($html->find('/html/body/table') as $article) {
// get title
$item['titlu'] = trim($article->find('/tbody/tr[1]/td/div', 0)->plaintext);
// get body
$item['tr2'] = trim($article->find('/tbody/tr[2]/td[2]', 0)->plaintext);
$item['tr3'] = trim($article->find('/tbody/tr[3]/td[2]', 0)->plaintext);
$item['tr4'] = trim($article->find('/tbody/tr[4]/td[2]', 0)->plaintext);
$item['tr5'] = trim($article->find('/tbody/tr[5]/td[2]', 0)->plaintext);
$item['tr6'] = trim($article->find('/tbody/tr[6]/td[2]', 0)->plaintext);
$item['tr7'] = trim($article->find('/tbody/tr[7]/td[2]', 0)->plaintext);
$item['tr8'] = trim($article->find('/tbody/tr[8]/td[2]', 0)->plaintext);
$item['tr9'] = trim($article->find('/tbody/tr[9]/td[2]', 0)->plaintext);
$item['tr10'] = trim($article->find('/tbody/tr[10]/td[2]', 0)->plaintext);
$item['tr11'] = trim($article->find('/tbody/tr[11]/td[2]', 0)->plaintext);
$item['tr12'] = trim($article->find('/tbody/tr[12]/td/div/]', 0)->plaintext);
$ret[] = $item;
}
// clean up memory
$html->clear();
unset($html);
return $ret;}
}
$output = fopen("php://output", "w");
foreach ($urls as $url) {
$ret = scraping($url);
foreach($ret as $v){
fputcsv($output, $v);}
}
fclose($output);
exit();
second file
<?php
function get_contents($url) {
// We could just use file_get_contents but using curl makes it more future-proof (setting a timeout for example)
$ch = curl_init($url);
curl_setopt_array($ch, array(CURLOPT_RETURNTRANSFER => true,));
$content = curl_exec($ch);
curl_close($ch);
return $content;
}
function scrape_main_page() {
set_time_limit(300);
libxml_use_internal_errors(true); // Prevent DOMDocument from spraying errors onto the page and hide those errors internally ;)
$html = get_contents("http://lmvz.anofm.ro:8080/lmv/index2.jsp?judet=26");
$dom = new DOMDocument();
$dom->loadHTML($html);
die(var_dump($html));
$xpath = new DOMXPath($dom);
$results = $xpath->query("//table[#width=\"645\"]/tr");
$all = array();
//var_dump($results);
for($i = 1; $i < $results->length; $i++) {
$tr = $results->item($i);
$id = $tr->childNodes->item(0)->textContent;
$requesturl = "http://lmvz.anofm.ro:8080/lmv/detalii.jsp?UNIQUEJVID=" . urlencode($id) .
"&judet=26";
$details = scrape_detail_page($requesturl);
$newObj = new stdClass();
$newObj = $id;
$all[] = $newObj;
}
foreach($all as $xtr) {
$urls[] = "http://lmvz.anofm.ro:8080/lmv/detalii.jsp?UNIQUEJVID=" . $xtr .
"&judet=26";
}
return $urls;
}
scrape_main_page();
Yeah, the problem here is your php.ini configuration. Make sure the server supports curl and fopen. If not start your own linux server.
Related
Im a a newbie trying to code a crawler to make some stats from a forum.
Here is my code :
<?php
$ch = curl_init();
$timeout = 0; // set to zero for no timeout
curl_setopt ($ch, CURLOPT_URL, 'http://m.jeuxvideo.com/forums/42-51-61913988-1-0-1-0-je-code-un-bot-pour-le-forom-je-vous-le-montre-en-action.htm');
curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt ($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
$file_contents = curl_exec($ch);
curl_close($ch);
$dom = new DOMDocument;
libxml_use_internal_errors(true);
$dom->loadHTML($file_contents);
$xpath = new DOMXPath($dom);
$posts = $xpath->query("//div[#class='who-post']/a");//$elements = $xpath->query("/html/body/div[#id='yourTagIdHere']");
$dates = $xpath->query("//div[#class='date-post']");//$elements = $xpath->query("/html/body/div[#id='yourTagIdHere']");
$contents = $xpath->query("//div[#class='message text-enrichi-fmobile text-crop-fmobile']/p");//$elements = $xpath->query("/html/body/div[#id='yourTagIdHere']");
$i = 0;
foreach ($posts as $post) {
$nodes = $post->childNodes;
foreach ($nodes as $node) {
$value = trim($node->nodeValue);
$tab[$i]['author'] = $value;
$i++;
}
}
$i = 0;
foreach ($dates as $date) {
$nodes = $date->childNodes;
foreach ($nodes as $node) {
$value = trim($node->nodeValue);
$tab[$i]['date'] = $value;
$i++;
}
}
$i = 0;
foreach ($contents as $content) {
$nodes = $content->childNodes;
foreach ($nodes as $node) {
$value = $node->nodeValue;
echo $value;
$tab[$i]['content'] = trim($value);
$i++;
}
}
?>
<h1>Participants</h2>
<pre>
<?php
print_r($tab);
?>
</pre>
As you can see, the code do not retrieve some content. For example, Im trying to retrieve this content from : http://m.jeuxvideo.com/forums/42-51-61913988-1-0-1-0-je-code-un-bot-pour-le-forom-je-vous-le-montre-en-action.htm
The second post is a picture and my code do not work.
On the second hand, I guess i made some errors, I find my code ugly.
Can you help me please ?
You could simply select the posts first, then grab each subdata separately using:
DOMXPath::evaluate combined with normalize-space to retrieve pure text,
DOMXPath::query combined with DOMDocument::save to retrieve message paragraphs.
Code:
$xpath = new DOMXPath($dom);
$postsElements = $xpath->query('//*[#class="post"]');
$posts = [];
foreach ($postsElements as $postElement) {
$author = $xpath->evaluate('normalize-space(.//*[#class="who-post"])', $postElement);
$date = $xpath->evaluate('normalize-space(.//*[#class="date-post"])', $postElement);
$message = '';
foreach ($xpath->query('.//*[contains(#class, "message")]/p', $postElement) as $messageParagraphElement) {
$message .= $dom->saveHTML($messageParagraphElement);
}
$posts[] = (object)compact('author', 'date', 'message');
}
print_r($posts);
Unrelated note: scraping a website's HTML is not illegal in itself, but you should refrain from displaying their data on your own app/website without their consent. Also, this might break just about anytime if they decide to alter their HTML structure/CSS class names.
I am fetching pagination urls list from another website using file_get_contents but the while loop won't work, it fetches data of the first url and thats it won't work on the second url of the array fetched from my database of urls.
include('simple_html_dom.php');
ini_set('max_execution_time', 0);
$con=mysqli_connect("localhost","root","","mydb");
$crawl_query = "SELECT url from list LIMIT 10";
$crawl_url = mysqli_query($con, $crawl_query);
$rows = array();
while($rows = mysqli_fetch_array($crawl_url))
{
$pages = $rows['url'];
$html = file_get_contents($pages);
$dom = new DOMDocument();
#$dom->loadHTML($html);
$finder = new DomXPath($dom);
$links = $finder->query("//*[contains(#class, 'pages')]");
$array1= array();
foreach ($links as $link){
$array1[] = $link;
$length[] = $array1[0]->getElementsByTagName('a');
$final_length = $length[0]->length -1;
for($i=1; $i<=$final_length; $i++ )
{
if($i==1)
{
echo rtrim($pages, ".html").trim($i, "1");
echo "<br/>";
}
else
{
echo rtrim($pages, ".html")."_".trim($i, "1");
echo "<br/>";
}
}
}
}
All I get is
example.com/content/new
example.com/content/new2
example.com/content/new3
which is the result of first url in my database. That url has 3 pages in it's pagination but I can't get the loop to work on second url from mydb.
I try to pre-sort and slice a big XML file for later processing via xml_parser
function CreateXMLParser($CHARSET, $bareXML = false) {
$CURRXML = xml_parser_create($CHARSET);
xml_parser_set_option( $CURRXML, XML_OPTION_CASE_FOLDING, false);
xml_parser_set_option( $CURRXML, XML_OPTION_TARGET_ENCODING, $CHARSET);
xml_set_element_handler($CURRXML, 'startElement', 'endElement');
xml_set_character_data_handler($CURRXML, 'dataHandler');
xml_set_default_handler($CURRXML, 'defaultHandler');
if ($bareXML) {
xml_parse($CURRXML, '<?xml version="1.0"?>', 0);
}
return $CURRXML;
}
function ChunkXMLBigFile($file, $tag = 'item', $howmany = 1000) {
global $CHUNKON, $CHUNKS, $ITEMLIMIT;
$CHUNKON = $tag;
$ITEMLIMIT = $howmany;
$xml = CreateXMLParser('UTF-8', false);
$fp = fopen($file, "r");
$CHUNKS = 0;
while(!feof($fp)) {
$chunk = fgets($fp, 10240);
xml_parse($xml, $chunk, feof($fp));
}
xml_parser_free($xml);
processChunk();
}
function processChunk() {
global $CHUNKS, $PAYLOAD, $ITEMCOUNT;
if ('' == $PAYLOAD) {
return;
}
$xp = fopen($file = "xmlTemp/slices/slice_".$CHUNKS.".xml", "w");
fwrite($xp, '<?xml version="1.0" ?>'."\n");
fwrite($xp, "<producten>");
fwrite($xp, $PAYLOAD);
fwrite($xp, "</producten>");
fclose($xp);
print "Written ".$file."<br>";
$CHUNKS++;
$PAYLOAD = '';
$ITEMCOUNT = 0;
}
function startElement($xml, $tag, $attrs = array()) {
global $PAYLOAD, $CHUNKS, $ITEMCOUNT, $CHUNKON;
if (!($CHUNKS||$ITEMCOUNT)) {
if ($CHUNKON == strtolower($tag)) {
$PAYLOAD = '';
}
} else {
$PAYLOAD .= "<".$tag;
}
foreach($attrs as $k => $v) {
$PAYLOAD .= " $k=".'"'.addslashes($v).'"';
}
$PAYLOAD .= '>';
}
function endElement($xml, $tag) {
global $CHUNKON, $ITEMCOUNT, $ITEMLIMIT;
dataHandler(null, "<$tag>");
if ($CHUNKON == strtolower($tag)) {
if (++$ITEMCOUNT >= $ITEMLIMIT) {
processChunk();
}
}
}
function dataHandler($xml, $data) {
global $PAYLOAD;
$PAYLOAD .= $data;
}
but how can I access the node-name??
.. I have to sort some items (with n nodes) out, before the slice-file is saved. the the XML is parsed line after line, right? so I have to save the nodes from a whole item temporarely and decide if the item is gonna be written to the file.. is there a way to do this?
Your code is effectively reading the entire source file every time you call the ChunkXMLBigFile function.
After your while loop you have all the elements, which you can then manipulate any way you like.
See the following questions about how to approach this:
How to sort a xml file using DOM
Sort XML nodes with PHP
If you parse the chunks after that in batches of $howmany you are where you want to be.
Tip: there are many examples online where this functionality is presented in an Object Orient Programming (OOP) approach where all the functions are inside a class. This would also eliminate the need of global variables which can cause some (read: a lot) of frustrations and confusion.
I've made this program that updates an xml file based on entries in an array.
I've used FILE_APPEND because the entries are more than one and otherwise file gets overwritten. But the problem is the xml version tag prints out as many times as many entries are there.
So i want to remove this tag.
Here's my program:-
<?php
include 'array.php';
$xmlW = new XMLWriter();
$file = 'entry-'. date('M-D-Y') .'.xml';
/*$setting = new XMLWriterSettings();
$setting->OmitXmlDeclaration = true;*/
foreach($data as $d) {
if(in_array ($d['Mode'], array('ccAV','MB','Paypal','E2P'))) {
$recordType = 'receipt';
$xml_object = simplexml_load_file ('receipt.xml');
} else {
$xml_object = simplexml_load_file ('journal.xml');
$recordType = 'journal';
}
$xml_object->xpath("/ENVELOPE/BODY/IMPORTDATA/REQUESTDATA/TALLYMESSAGE/VOUCHER")[0]->DATE = $d['InvoiceDate'];
$xml_object->xpath("/ENVELOPE/BODY/IMPORTDATA/REQUESTDATA/TALLYMESSAGE/VOUCHER")[0]->NARRATION = 'Rahul';
$xml_object->xpath("/ENVELOPE/BODY/IMPORTDATA/REQUESTDATA/TALLYMESSAGE/VOUCHER")[0]->EFFECTIVEDATE = $d['InvoiceDate'];
$xml_object->xpath("/ENVELOPE/BODY/IMPORTDATA/REQUESTDATA/TALLYMESSAGE/VOUCHER/ALLLEDGERENTRIES.LIST")[0]->LEDGERNAME = $d['Mode'];
$xml_object->xpath("/ENVELOPE/BODY/IMPORTDATA/REQUESTDATA/TALLYMESSAGE/VOUCHER/ALLLEDGERENTRIES.LIST")[0]->AMOUNT = 'Rahul';
$xml_object->xpath("/ENVELOPE/BODY/IMPORTDATA/REQUESTDATA/TALLYMESSAGE/VOUCHER/ALLLEDGERENTRIES.LIST")[1]->AMOUNT = 'Rahul';
$xml_object->xpath("/ENVELOPE/BODY/IMPORTDATA/REQUESTDATA/TALLYMESSAGE/VOUCHER/ALLLEDGERENTRIES.LIST")[2]->AMOUNT = 'Rahul';
$xml_object->xpath("/ENVELOPE/BODY/IMPORTDATA/REQUESTDATA/TALLYMESSAGE/VOUCHER/ALLLEDGERENTRIES.LIST/BANKALLOCATIONS.LIST")[0]->DATE = 'Rahul';
$xml_object->xpath("/ENVELOPE/BODY/IMPORTDATA/REQUESTDATA/TALLYMESSAGE/VOUCHER/ALLLEDGERENTRIES.LIST/BANKALLOCATIONS.LIST")[0]->INSTRUMENTDATE = 'Rahul';
$xml_object->xpath("/ENVELOPE/BODY/IMPORTDATA/REQUESTDATA/TALLYMESSAGE/VOUCHER/ALLLEDGERENTRIES.LIST/BANKALLOCATIONS.LIST")[0]->AMOUNT = 'Rahul';
$xml = $xml_object->asXML();
file_put_contents($file, $xml, FILE_APPEND);
}
?>
Thanks for the help.
This is my code which I expected to generate normal RSS. However, after each <item> there's a </channel></rss><?xml version="1.0"?><rss version="2.0">. What do I need to change so that this is only declared at the end of the script as it should be?
Do let me know if I've missed out any important information.
function jobscrape($title, $link, $root, $description, $job_location) {
header("Content-Type: application/rss+xml; charset=UTF-8");
$xml = new SimpleXMLElement('<rss/>');
$xml->addAttribute("version", "2.0");
$channel = $xml->addChild("channel");
$channel->addChild("title", $title);
$channel->addChild("link", $link);
$channel->addChild("description", "This is a description");
$channel->addChild("language", "en-us");
$html = file_get_contents($link);
$doc = new DOMDocument();
libxml_use_internal_errors(TRUE);
if(!empty($html)) {
$doc->loadHTML($html);
libxml_clear_errors(); // remove errors for yucky html
$xpath = new DOMXPath($doc);
$row = $xpath->query($job_location);
if ($row->length > 0) {
foreach ($row as $job) {
$jobs = array();
$entries = array();
$jobs['title'] = $job->nodeValue;
$jobs['description'] = "This is a description";
$jobs['link'] = $job->getAttribute('href');
array_push($entries,$jobs);
foreach ($entries as $entry) {
$item = $channel->addChild("item");
$item->addChild("title", $entry['title']);
$item->addChild("link", $entry['link']);
$item->addChild("description", $entry['description']);
}
echo $xml->asXML();
}
}
else { echo "row is less than 0";}
}
else {
echo "this is empty";
}
}
Create one XML document, add all your items to that document, and call ->asXML(); on the complete document at the end (instead of on each fragment every time you go around the loop).