I try to pre-sort and slice a big XML file for later processing via xml_parser
/**
 * Build an XML parser wired to the SAX-style callbacks used by this file.
 *
 * The callbacks are registered by name: 'startElement', 'endElement',
 * 'dataHandler' and 'defaultHandler'.
 *
 * @param string $CHARSET Encoding passed to xml_parser_create() and also set
 *                        as the parser's target encoding.
 * @param bool   $bareXML When true, an XML declaration is fed to the parser
 *                        before it is returned.
 * @return mixed The configured parser as returned by xml_parser_create().
 */
function CreateXMLParser($CHARSET, $bareXML = false) {
    $parser = xml_parser_create($CHARSET);

    // Keep element names in their original case and emit data in $CHARSET.
    $options = array(
        XML_OPTION_CASE_FOLDING    => false,
        XML_OPTION_TARGET_ENCODING => $CHARSET,
    );
    foreach ($options as $option => $value) {
        xml_parser_set_option($parser, $option, $value);
    }

    xml_set_element_handler($parser, 'startElement', 'endElement');
    xml_set_character_data_handler($parser, 'dataHandler');
    xml_set_default_handler($parser, 'defaultHandler');

    if (!$bareXML) {
        return $parser;
    }

    // Prime the parser with a declaration; the document is not finished yet.
    xml_parse($parser, '<?xml version="1.0"?>', 0);
    return $parser;
}
/**
 * Stream a large XML file through the SAX parser and cut it into slice files.
 *
 * The file is read line by line (at most 10240 bytes per read) and fed to the
 * parser created by CreateXMLParser(); the element handlers collect markup in
 * the $PAYLOAD global and call processChunk() whenever $howmany items have
 * been closed. A final processChunk() flushes whatever is still buffered.
 *
 * @param string $file    Path of the XML file to split.
 * @param string $tag     Name of the element that delimits one item. The
 *                        handlers compare it against the lower-cased element
 *                        name, so pass it in lower case.
 * @param int    $howmany Number of items per slice.
 * @return bool True when the whole file was parsed, false when it could not
 *              be opened or the parser reported an error.
 */
function ChunkXMLBigFile($file, $tag = 'item', $howmany = 1000) {
    global $CHUNKON, $CHUNKS, $ITEMLIMIT, $PAYLOAD, $ITEMCOUNT;

    $fp = fopen($file, "r");
    if ($fp === false) {
        // Without this guard feof() would be called on a non-stream.
        trigger_error("ChunkXMLBigFile: cannot open ".$file, E_USER_WARNING);
        return false;
    }

    // Reset every piece of shared state so that repeated calls do not see
    // leftovers from a previous run.
    $CHUNKON   = $tag;
    $ITEMLIMIT = $howmany;
    $CHUNKS    = 0;
    $PAYLOAD   = '';
    $ITEMCOUNT = 0;

    $xml = CreateXMLParser('UTF-8', false);
    $ok  = true;
    while (!feof($fp)) {
        $chunk = fgets($fp, 10240);
        if ($chunk === false) {
            // Nothing left to read; still signal the final call below.
            $chunk = '';
        }
        if (!xml_parse($xml, $chunk, feof($fp))) {
            trigger_error(sprintf(
                'ChunkXMLBigFile: XML error: %s at line %d',
                xml_error_string(xml_get_error_code($xml)),
                xml_get_current_line_number($xml)
            ), E_USER_WARNING);
            $ok = false;
            break;
        }
    }
    xml_parser_free($xml);
    fclose($fp);

    // Flush the items collected since the last full slice.
    processChunk();
    return $ok;
}
/**
 * Write the buffered markup to the next slice file and reset the buffer.
 *
 * Reads and updates the globals $CHUNKS (index of the next slice), $PAYLOAD
 * (buffered markup) and $ITEMCOUNT (items buffered so far). Does nothing when
 * the buffer is empty. The slice is written to
 * "xmlTemp/slices/slice_<n>.xml", wrapped in a <producten> root element; the
 * target directory is created when it does not exist yet.
 *
 * When the file cannot be written a warning is raised and the buffer and the
 * counters are left untouched, so no collected data is discarded.
 *
 * @return void
 */
function processChunk() {
    global $CHUNKS, $PAYLOAD, $ITEMCOUNT;
    if ('' == $PAYLOAD) {
        return;
    }

    $file = "xmlTemp/slices/slice_".$CHUNKS.".xml";
    $dir  = dirname($file);
    // The second is_dir() covers the case where the directory appeared
    // between the first check and the mkdir() call.
    if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
        trigger_error("processChunk: cannot create directory ".$dir, E_USER_WARNING);
        return;
    }

    $document = '<?xml version="1.0" ?>'."\n"
        ."<producten>"
        .$PAYLOAD
        ."</producten>";
    if (file_put_contents($file, $document) === false) {
        trigger_error("processChunk: cannot write ".$file, E_USER_WARNING);
        return;
    }

    print "Written ".$file."<br>";
    $CHUNKS++;
    $PAYLOAD = '';
    $ITEMCOUNT = 0;
}
/**
 * Element-start callback: append the opening tag to the $PAYLOAD buffer.
 *
 * Until the first item has been completed ($CHUNKS and $ITEMCOUNT both still
 * falsy), seeing the item element named by $CHUNKON discards everything
 * buffered so far, which drops the markup that precedes the first item.
 * The opening tag itself is always written, including for that first item.
 *
 * @param mixed  $xml   Parser that triggered the callback (unused).
 * @param string $tag   Element name as reported by the parser.
 * @param array  $attrs Attribute name => value pairs of the element.
 * @return void
 */
function startElement($xml, $tag, $attrs = array()) {
    global $PAYLOAD, $CHUNKS, $ITEMCOUNT, $CHUNKON;

    if (!($CHUNKS || $ITEMCOUNT) && $CHUNKON == strtolower($tag)) {
        $PAYLOAD = '';
    }

    $PAYLOAD .= "<".$tag;
    foreach ($attrs as $k => $v) {
        // The parser hands over decoded values; re-escape them so quotes,
        // ampersands and angle brackets stay valid inside the attribute.
        $PAYLOAD .= " $k=".'"'.htmlspecialchars($v, ENT_QUOTES, 'UTF-8').'"';
    }
    $PAYLOAD .= '>';
}
/**
 * Element-end callback: append the closing tag and flush full slices.
 *
 * The closing tag is written straight into the $PAYLOAD buffer. When the
 * element is the item element named by $CHUNKON, the item counter is
 * incremented and processChunk() is called once $ITEMLIMIT items are buffered.
 *
 * @param mixed  $xml Parser that triggered the callback (unused).
 * @param string $tag Element name as reported by the parser.
 * @return void
 */
function endElement($xml, $tag) {
    global $CHUNKON, $ITEMCOUNT, $ITEMLIMIT, $PAYLOAD;

    // A closing tag needs the leading slash; "<tag>" would open a new element.
    $PAYLOAD .= "</".$tag.">";

    if ($CHUNKON == strtolower($tag) && ++$ITEMCOUNT >= $ITEMLIMIT) {
        processChunk();
    }
}
/**
 * Character-data callback: append the received text to the $PAYLOAD buffer.
 *
 * @param mixed  $xml  Parser that triggered the callback (unused).
 * @param string $data Text to append, taken over unchanged.
 * @return void
 */
function dataHandler($xml, $data) {
    global $PAYLOAD;
    $PAYLOAD = $PAYLOAD.$data;
}
but how can I access the node-name??
.. I have to sort out some items (each with n nodes) before the slice file is saved. The XML is parsed line after line, right? So I have to store the nodes of a whole item temporarily and then decide whether the item is going to be written to the file. Is there a way to do this?
Your code is effectively reading the entire source file every time you call the ChunkXMLBigFile function.
After your while loop you have all the elements, which you can then manipulate any way you like.
See the following questions about how to approach this:
How to sort a xml file using DOM
Sort XML nodes with PHP
If you parse the chunks after that in batches of $howmany you are where you want to be.
Tip: there are many examples online where this functionality is presented in an Object-Oriented Programming (OOP) approach, where all the functions are inside a class. This would also eliminate the need for global variables, which can cause some (read: a lot of) frustration and confusion.
Related
I have a XML file that I want to import in my opencart store.
My XML file is:
<Model>100002</Model>
<Itemcode>10000200</Itemcode>
<Description>description of item</Description>
<Currency>EUR</Currency>
<IndustryCol1>5,98</IndustryCol1>
<IndustryCol2>2,88</IndustryCol2>
<IndustryCol3>2,8</IndustryCol3>
<IndustryCol4>2,7</IndustryCol4>
</PriceInfoRow>
</PriceInfo>
I need to replace comma with dot in
<IndustryCol1>5,98</IndustryCol1>
<IndustryCol2>2,88</IndustryCol2>
<IndustryCol3>2,8</IndustryCol3>
<IndustryCol4>2,7</IndustryCol4>
I need this because when I load the XML file with comma the script turns 5,98 to 5,00.
Any help?
Edit:
This is the function when i upload a xml file:
/**
 * Stream-parse an XML file in 4096-byte chunks with this object's handlers.
 *
 * The handlers 'startTag', 'endTag' and 'cData' are registered by name on
 * $this. Parsing stops early once cron fetching is enabled and
 * $this->total_items_ready has reached CRON_FETCH_NUM.
 *
 * @param string $filename    Path of the XML file to import.
 * @param string $product_tag Stored in $this->product_tag before parsing.
 * @param mixed  $xml_options Not used by this method.
 * @return bool False when the file cannot be opened or the parser reports an
 *              error, true otherwise (including the early cron stop).
 */
private function importXML($filename, $product_tag, $xml_options) {
    $this->product_tag = $product_tag;
    $this->xml_data = '';

    $fh = fopen($filename, 'r');
    if ($fh === false) {
        return false;
    }

    $xml_parser = xml_parser_create($this->file_encoding);
    xml_set_object($xml_parser, $this);
    xml_set_element_handler($xml_parser, 'startTag', 'endTag');
    xml_set_character_data_handler($xml_parser, 'cData');
    xml_parser_set_option($xml_parser, XML_OPTION_CASE_FOLDING, false);

    $ok = true;
    // Compare explicitly: a chunk consisting only of "0" is falsy but is
    // still data that has to reach the parser.
    while (($data = fread($fh, 4096)) !== false && $data !== '') {
        if (!xml_parse($xml_parser, $data, feof($fh))) {
            $ok = false;
            break;
        }
        if ($this->cron_fetch && $this->total_items_ready >= CRON_FETCH_NUM) {
            break;
        }
    }

    // Single exit path so that the parser and the file handle are always
    // released, whichever way the loop ended.
    xml_parser_free($xml_parser);
    fclose($fh);
    return $ok;
}
// Parse the XML text held in $s into a SimpleXML tree.
$xml = simplexml_load_string($s);
// Select every element whose name begins with "IndustryCol".
$items = $xml->xpath('//*[starts-with(name(), "IndustryCol")]');
foreach ($items as $k => $v) {
    // $v is the matched element itself; writing to index 0 replaces its
    // text, here with the decimal comma turned into a dot.
    $v[0] = str_replace(',', '.', (string) $v);
}
// Print the modified document.
echo $xml->asXML();
demo
I'm using this code to create an XML file that lists the URLs and descriptions of the files in a directory, but I also need to add the number of each file to this XML. So I want to give the files a number (or ID), like 1 for the first file, 2 for the second ... n, like a counter for every row. How can I do this?
thank you
<?php
$myfeed = new RSSFeed();
// Open the current directory (or specify it)
$dir = opendir("./");
if ($dir === false) {
    // readdir() must not be called on a failed handle.
    die('Error opening the image directory.');
}
while (false !== ($file = readdir($dir))) {
    // Offset 1 skips the first character, so a name that merely starts with
    // ".jpg" / ".gif" is not picked up.
    if (strpos($file, '.jpg', 1) || strpos($file, '.gif', 1)) {
        $myfeed->SetItem("http://host.com/images/$file", "$file", "");
    }
}
closedir($dir);
// Render the feed once and reuse the result for the file and the response.
$feedXml = $myfeed->output();
// Output the XML File
$fp = fopen('rss.xml', 'w');
if ($fp !== false) {
    fwrite($fp, $feedXml);
    fclose($fp);
}
echo $feedXml; //lets see that
/**
 * Collects (url, title) items and renders them as an XSPF playlist document.
 */
class RSSFeed {
    // VARIABLES
    // channel vars
    public $channel_url = '';
    public $channel_title = '';
    // items: list of array('url' => ..., 'title' => ...)
    public $items = array();
    // number of stored items; also the index used by the next SetItem() call
    public $nritems = 0;

    // FUNCTIONS
    /**
     * Constructor: start with no items and empty channel data.
     *
     * A method named after the class is no longer treated as a constructor
     * by current PHP versions, hence __construct().
     */
    function __construct() {
        $this->nritems = 0;
        $this->channel_url = '';
        $this->channel_title = '';
    }

    /**
     * Set the channel data. Only $url and $title are stored; the remaining
     * parameters are accepted but not used.
     */
    function SetChannel($url, $title, $description, $lang, $copyright, $creator, $subject) {
        $this->channel_url = $url;
        $this->channel_title = $title;
    }

    /**
     * Append one item. $description is accepted but not stored.
     *
     * @param string $url         Value rendered as <location>.
     * @param string $title       Value rendered as <annotation>.
     * @param string $description Unused.
     */
    function SetItem($url, $title, $description) {
        $this->items[$this->nritems] = array('url' => $url, 'title' => $title);
        $this->nritems++;
    }

    /**
     * Render all items as an XSPF playlist.
     *
     * @return string The complete XML document.
     */
    function Output() {
        $output = '<?xml version="1.0" encoding="UTF-8"?>'."\n";
        $output .= '<playlist version="0" xmlns = "http://xspf.org/ns/0/">'."\n";
        $output .= '<trackList>'."\n";
        for ($k = 0; $k < $this->nritems; $k++) {
            $output .= '<track>'."\n";
            $output .= '<location>'.$this->escapeText($this->items[$k]['url']).'</location>'."\n";
            $output .= '<image></image>'."\n";
            $output .= '<annotation>'.$this->escapeText($this->items[$k]['title']).'</annotation>'."\n";
            $output .= '</track>'."\n";
        }
        $output .= '</trackList>'."\n";
        $output .= '</playlist>'."\n";
        return $output;
    }

    /**
     * Escape &, < and > so arbitrary file names cannot break the document.
     */
    private function escapeText($text) {
        return htmlspecialchars($text, ENT_NOQUOTES, 'UTF-8');
    }
}
?>
First up, I don't think creating XML with string functions is a good idea; rather use a parser like SimpleXML or DOM.
Assigning an id to each entry, you could take your $k in function Output():
for($k=0; $k<$this->nritems; $k++) {
$output .= '<track>'."\n";
$output .= "<id>$k</id>\n";
// (...)
$output .= "</track>\n";
}
or add id as attribute to the <track>-node:
$output .= "<track id=\"$k\">\n";
I need to find and replace in an Evernote xml file. It contains multiple entries like this:
<en-media alt="Evernote Logo" hash="4914ced8925f9adcc1c58ab87813c81f" type="image/png"></en-media>
<en-media alt="Evernote Logo" hash="4914dsd8925f9adcc1c58ab87813c81f" type="image/png"></en-media>
with
<img src="https://sandbox.evernote.com/shard/s1/res/143c8ad0-da92-4271-8410-651b88e8a2f1" height="something" width="something"/>
<img src="https://sandbox.evernote.com/shard/s1/res/143c8233-da92-4271-8410-651b88e8a2f1" height="something" width="something"/>
I'm doing this because Evernote's SDK admittedly lacks an easy way to show both textual and mime data (images, pdf's, etc.) in a single (or several easy) commands.
Here is my attempt at a "find" but now I need the "replace":
$content=html_entity_decode($content); //content contains and — causing simplexml to complain
$content=str_replace('&',htmlentities('&'),$content); //encode & -- Lewis & Clark
$x = new SimpleXMLElement($content);
$x = xpath('//en-media'); //find
[?????] MYCODE_getLink(); //replace
$content=$x->asXML(); //output
$content=htmlentities($content); //put entities back
Below is my solution, which just appends, since I couldn't figure out the replace. I still need to remove certain XML tags anyway for the final HTML presentation. Anyway, it works, so I'm posting it here in case it helps you:
// Fetch notes from one notebook, attach an <img> or a download link to every
// <en-media> element of each note, and print the resulting markup.
// NOTE(review): $noteStore and $authToken are expected to exist already; they
// are not defined in this snippet.
$f = new NoteFilter();
$f->notebookGuid="e0a42e90-0297-442f-8157-44a596e5b8b5"; //default
//$f->notebookGuid="b733f6ab-e3b7-443a-8f5a-2bbe77ea1c1e"; //MyStuff
$n = $noteStore->findNotes($authToken, $f, 0, 100); // Fetch up to 100 notes
// $total is not read again in the visible code.
$total=$n->totalNotes;
if (!empty($n->notes)) {
foreach ($n->notes as $note) {
// Load the full note; the boolean flags are passed positionally
// (presumably they select which parts are returned - TODO confirm against the SDK).
$fullNote = $noteStore->getNote($authToken, $note->guid, true, false, false, false);
$content = $fullNote->content;
$dom = new DOMDocument;
$dom->loadXml($content);
//list all <en-media>
$medias = $dom->getElementsByTagName('en-media');
foreach ($medias as $media) {
// The hash attribute holds hex text; convert it to raw bytes for the lookup.
$hash = $media->getAttribute('hash');
$hash = hashXmlToBin($hash); //xml to bin for ByHash method below
$resource=$noteStore->getResourceByHash($authToken, $note->guid,$hash,0,0,0,0);
//get url
$url=resourceUrl($authToken,$resource);
//if image, show inline
$inline=array('image/png','image/jpeg','image/jpg','image/gif');
if (in_array($resource->mime,$inline)) {
$img=$dom->createElement('img');
$img->setAttribute('src', $url);
$img->setAttribute('width', $resource->width);
$img->setAttribute('height', $resource->height);
}else { //show link
// $rewrite is not read again in the visible code.
$rewrite=array('application/pdf'=>'PDF');
// Shorten the MIME type for the link text, e.g. "application/pdf" -> "pdf".
$mime=str_replace('application/','',$resource->mime);
$filename=$resource->attributes->fileName;
// $img is reused here even though it now holds an <a> element.
$img=$dom->createElement('a',"Download {$filename} ({$mime})");
$img->setAttribute('href', $url);
$img->setAttribute('class', "download-attachement");
}
// append to DOM
// The new element becomes a child of <en-media>; the <en-media> element
// itself stays in the document.
$media->appendChild($img);
}//foreach medias
$content=$dom->saveXML();
// Collect the rewritten markup of every note.
$out[]=$content;
}//foreach notes
foreach ($out as $val)
print "<hr/>".$val; //each note
}//notes exist
/*
* http://discussion.evernote.com/topic/4521-en-media-hash/
*/
/**
 * Convert a hexadecimal hash string into its raw byte string.
 *
 * The input is consumed two characters at a time and each pair becomes one
 * byte. A pair that empty() treats as empty (a lone trailing "0") is skipped.
 *
 * @param string $hash Hash written as hexadecimal digits.
 * @return string One byte per processed pair.
 */
function hashXmlToBin($hash) {
    $binary = "";
    foreach (str_split($hash, 2) as $pair) {
        if (empty($pair)) {
            continue;
        }
        $binary .= chr(hexdec($pair));
    }
    return $binary;
}
/*
* return a resource url
*/
/**
 * Build the URL of a resource, either the original or a thumbnail.
 *
 * The URL starts with EVERNOTE_SERVER and the shard stored in
 * $_SESSION['shard'], followed by the resource's guid.
 *
 * @param mixed  $authToken     Not used when building the URL.
 * @param object $resource      Object exposing a guid property.
 * @param bool   $resize        True to build the thumbnail URL.
 * @param int    $thumbnailSize Value of the "size" query parameter (thumbnail only).
 * @return string
 */
function resourceUrl($authToken, $resource, $resize = FALSE, $thumbnailSize = 150) {
    $shardBase = EVERNOTE_SERVER."/shard/".$_SESSION['shard'];

    if ($resize) {
        // thumbnail
        return $shardBase."/thm/res/".$resource->guid."?size={$thumbnailSize}";
    }

    // originals
    return $shardBase."/res/".$resource->guid;
}
from a PHP script I'm downloading a RSS feed like:
$fp = fopen('http://news.google.es/news?cf=all&ned=es_ve&hl=es&output=rss','r')
or die('Error reading RSS data.');
The feed is a Spanish news feed. After I downloaded the file, I parsed all the info into one variable that holds only the content of the <description> tag of every <item>. The issue is that when I echo the variable, all the information has an HTML encoding like:
echo($result); // this print: el ministerio pãºblico investigarã¡ la publicaciã³n en la primera pã¡gina
Well, I could create a HUGE case statement that searches for every such character and changes it to the corresponding one, like ã¡ for Á and so on, but is there no way to do this with a single function? Or even better, is there no way to download the content to $fp without the HTML encoding? Thanks!
Actual code:
<?php
// State shared by the parser callbacks below.
// $acumula collects the formatted output; $tag names the element whose text
// is currently being read; $title, $description and $link buffer the text of
// the item in progress.
$acumula = $tag = $title = $description = $link = '';
// True while the parser is inside an ITEM element.
$insideitem = false;
/**
 * Element-start callback for the RSS parser.
 *
 * Inside an item the element name is remembered in $tag; outside an item,
 * an ITEM element switches the "inside item" flag on.
 *
 * @param mixed  $parser Parser that triggered the callback (unused).
 * @param string $name   Element name as reported by the parser.
 * @param array  $attrs  Element attributes (unused).
 * @return void
 */
function startElement($parser, $name, $attrs) {
    global $insideitem, $tag, $title, $description, $link;

    if ($insideitem) {
        $tag = $name;
        return;
    }

    if ($name == 'ITEM') {
        $insideitem = true;
    }
}
/**
 * Element-end callback for the RSS parser.
 *
 * When an ITEM closes, its trimmed title and description are appended to
 * $acumula (joined by "<br>"), the per-item buffers are cleared and the
 * "inside item" flag is switched off. Other elements are ignored.
 *
 * @param mixed  $parser Parser that triggered the callback (unused).
 * @param string $name   Element name as reported by the parser.
 * @return void
 */
function endElement($parser, $name) {
    global $insideitem, $tag, $title, $description, $link, $acumula;

    if ($name != 'ITEM') {
        return;
    }

    $acumula .= trim($title) . "<br>" . trim($description);
    $title = $description = $link = '';
    $insideitem = false;
}
/**
 * Character-data callback for the RSS parser.
 *
 * While inside an item, the text is appended to the buffer that belongs to
 * the current element ($tag): TITLE, DESCRIPTION or LINK. Text of any other
 * element, or text outside an item, is ignored.
 *
 * @param mixed  $parser Parser that triggered the callback (unused).
 * @param string $data   Text reported by the parser.
 * @return void
 */
function characterData($parser, $data) {
    global $insideitem, $tag, $title, $description, $link;

    if (!$insideitem) {
        return;
    }

    if ($tag == 'TITLE') {
        $title .= $data;
    } elseif ($tag == 'DESCRIPTION') {
        $description .= $data;
    } elseif ($tag == 'LINK') {
        $link .= $data;
    }
}
// Create the parser and register the callbacks by name.
$xml_parser = xml_parser_create();
xml_set_element_handler($xml_parser, 'startElement', 'endElement');
xml_set_character_data_handler($xml_parser, "characterData");

// Open the remote feed; stop the script when it cannot be read.
$fp = fopen('http://news.google.es/news?cf=all&ned=es_ve&hl=es&output=rss','r');
if (!$fp) {
    die('Error reading RSS data.');
}

// Feed the document to the parser in 4096-byte pieces; the last argument
// tells the parser whether this was the final piece.
while ($data = fread($fp, 4096)) {
    if (!xml_parse($xml_parser, $data, feof($fp))) {
        die(sprintf('XML error: %s at line %d',
            xml_error_string(xml_get_error_code($xml_parser)),
            xml_get_current_line_number($xml_parser)));
    }
}

fclose($fp);
xml_parser_free($xml_parser);

// $acumula now holds the collected titles and descriptions (the "$result"
// mentioned in the question).
echo($acumula);
?>
EDIT
Since you're already using the XML parser, you're guaranteed the encoding is UTF-8.
If your page is encoded in ISO-8859-1, or even ASCII, you can do this to convert:
$result = mb_convert_encoding($result, "HTML-ENTITIES", "UTF-8");
Use a library that handles this for you, e.g. the DOM extension or SimpleXML. Example:
$d = new DOMDocument();
$d->load('http://news.google.es/news?cf=all&ned=es_ve&hl=es&output=rss');
//now all the data you get will be encoded in UTF-8
Example with SimpleXML:
$url = 'http://news.google.es/news?cf=all&ned=es_ve&hl=es&output=rss';
if ($sxml = simplexml_load_file($url)) {
echo htmlspecialchars($sxml->channel->title); //UTF-8
}
You can use DOMDocument from PHP to strip HTML encoding tags.
And use the encoding conversion functions, also from PHP, to change the encoding of this string.
Hi I'm using a class to parse my XML.
I have to parse request XML and response XML, and I'd like to do it with the same parser. When I parse the response XML, I keep getting junk after document element error.
I narrowed my XML down to <?xml version="1.0" encoding="UTF-8" ?><check></check> and I'm still getting the error, so the only thing I could think of is the second <?xml ..?> header being parsed as 'junk'.
What I basically want to do is reset the XML parser so it can start off as a new document, but I don't want to create a new parser object. Is this possible?
Edit:
I'm using the following code in my XmlParser object.
<?php
/**
 * Component that parses XML into a nested array and turns such an array back
 * into XML.
 *
 * Array layout built by the handlers: every element name maps to a list of
 * occurrences; an occurrence is an array that may carry '#attributes'
 * (name => value pairs), '#data' (text content) and further element names.
 * While an element is open its occurrence also holds a '#parent' reference,
 * which is removed again when the element closes.
 */
class XmlComponent extends Object {
    private $parser, $c, $current_tag;
    public $contents;

    /**
     * Store the controller and parse the XML given in $settings.
     *
     * @param mixed $controller Controller reference kept on the component.
     * @param array $settings   'xml' => XML text and/or 'file' => path of an
     *                          XML file; 'file' wins when both are given.
     */
    function initialize(&$controller, $settings = array())
    {
        $this->controller =& $controller;

        // Defined up front so that parse() never receives an undefined value.
        $contents = '';
        if (isset($settings['xml'])) {
            $contents = $settings['xml'];
        }
        if (isset($settings['file'])) {
            $contents = (string) file_get_contents($settings['file']);
        }
        debug($this->parse($contents));
    }

    /**
     * Parse one complete XML document into $this->contents.
     *
     * A fresh parser is created for every call: a parser cannot be rewound,
     * so feeding it a second document produces a "junk after document
     * element" error.
     *
     * @param string $xml XML text; whitespace between tags is removed first.
     * @return string The parser's error string, "\r\n", and the cleaned XML.
     */
    public function parse($xml)
    {
        $xml = trim(preg_replace("/>\s+</", '><', $xml));
        $this->resetParser();
        $this->contents = array();
        $this->c = &$this->contents;
        // The whole document is passed at once, so this is the final chunk.
        xml_parse($this->parser, $xml, true);
        return xml_error_string(xml_get_error_code($this->parser))."\r\n".$xml;
    }

    /**
     * Replace the current parser with a new one wired to this object's handlers.
     */
    private function resetParser()
    {
        if (is_resource($this->parser)) {
            xml_parser_free($this->parser);
        }
        $this->parser = xml_parser_create();
        xml_set_object($this->parser, $this);
        xml_set_element_handler($this->parser, "tag_open", "tag_close");
        xml_set_character_data_handler($this->parser, "cdata");
        xml_parser_set_option($this->parser, XML_OPTION_CASE_FOLDING, 0);
    }
    //*/
    /**
     * Serialise an array in the layout described on the class back to XML.
     *
     * Keys starting with '#' are metadata and are not emitted as elements.
     * Attribute values and text are escaped, because the parser hands them
     * over in decoded form.
     *
     * @param array $root Array as produced by parse().
     * @return string
     */
    public function xml($root)
    {
        $xml = '';
        foreach ($root as $tag=>$elem) {
            if (substr($tag, 0,1) != '#') {
                foreach ($elem as $val) {
                    $xml .= '<'.$tag;
                    if (is_array($val) && isset($val['#attributes'])) {
                        foreach($val['#attributes'] as $a_key => $a_val) {
                            $xml .= ' '.$a_key.'="'.htmlspecialchars($a_val, ENT_QUOTES, 'UTF-8').'"';
                        }
                    }
                    $xml .= '>';
                    if (is_array($val) && isset($val['#data'])) {
                        $xml .= htmlspecialchars($val['#data'], ENT_NOQUOTES, 'UTF-8');
                    }
                    $xml .= $this->xml($val);
                    $xml .= '</'.$tag.'>';
                }
            }
        }
        return $xml;
    }
    //*/
    /**
     * Element-start handler: add a new occurrence under the current node and
     * descend into it. '#parent' keeps a reference back to the current node.
     */
    private function tag_open($parser, $tag, $attributes)
    {
        if (!empty($attributes)) { $this->c[$tag][] = array('#attributes' => $attributes, '#parent' => &$this->c); }
        else { $this->c[$tag][] = array('#parent' => &$this->c); }
        $this->c =& $this->c[$tag][count($this->c[$tag]) - 1];
    }

    /**
     * Element-end handler: drop the '#parent' back-reference and move the
     * cursor back up to the parent node.
     */
    private function tag_close($parser, $tag)
    {
        $parent = &$this->c['#parent'];
        unset($this->c['#parent']);
        $this->c =& $parent;
    }

    /**
     * Character-data handler: collect the text of the current node.
     *
     * The parser may deliver the text of one element in several pieces, so
     * the pieces are appended rather than overwritten. Only truly empty
     * strings are skipped, which keeps a text of "0".
     */
    private function cdata($parser, $data)
    {
        if ($data === '') {
            return;
        }
        if (isset($this->c['#data'])) {
            $this->c['#data'] .= $data;
        } else {
            $this->c['#data'] = $data;
        }
    }
    //*/
}
?>
What I basically want to do is reset the XML parser so it can start off as a new document, but I don't want to create a new parser object. Is this possible?
No, you have to create a new parser. You can, of course, provide in your class a method that "resets" the parser; i.e., deletes the current one and creates a new one, hence hiding it from the class consumer.