I have HTML code something like this:
<p><i>i_text</i>,p_text</p>
i_text,p_text
I want to change all node values in this DOM element and keep all tags:
i_changed_text,p_changed_text
My attempts:
// Attempt 1: parse the fragment as HTML and rewrite nodeValue on every element.
$html = '<p><i>i_text</i> p_text</p>';
$dom = new DOMDocument();
$dom->loadHTML($html);
// Both flags are assigned after loadHTML() has already parsed the markup.
$dom->preserveWhiteSpace = false;
$dom->validateOnParse = true;
// '*' asks for every element in the document.
$elements = $dom->getElementsByTagName('*');
foreach ($elements as $element) {
// Reads the element's nodeValue, replaces '_' with '_changed_' and assigns the resulting string back to nodeValue.
$element->nodeValue = str_replace('_','_changed_',$element->nodeValue);
}
echo($dom->saveHTML());
output i_changed_text,p_changed_text
This code returns the correct text but does not preserve the child nodes.
// Attempt 2: parse the fragment as XML and rewrite only the deepest first descendant of each element.
$html = '<p><i>i_text</i>,p_text</p>';
$dom = new DOMDocument();
$dom->loadXML($html);
// Both flags are assigned after loadXML() has already parsed the markup.
$dom->preserveWhiteSpace = false;
$dom->validateOnParse = true;
$elements = $dom->getElementsByTagName('*');
// Builds a detached <dfn> element with content 'tag' and an attribute text="element";
// it is never inserted into the tree, only serialised with saveXML() below.
$elem = $dom->createElement('dfn', 'tag');
$attr = $dom->createAttribute('text');
$attr->value = 'element';
$elem->appendChild($attr);
$elements = $dom->getElementsByTagName('*');
foreach ($elements as $element) {
// Follow the first child repeatedly until a node without children is reached;
// $element is reassigned, so only that leaf is modified in this iteration.
while ($element->hasChildnodes()) {
$element = $element->childNodes->item(0);
}
$changed_value = str_replace('_','_changed_',$element->nodeValue);
// Any literal "tag" in the changed text is replaced by the serialised <dfn> markup before being stored as nodeValue.
$element->nodeValue = str_replace("tag", $dom->saveXML($elem), $changed_value);
}
echo ($dom->saveXML());
output
i_changed_text,p_text
This code preserves the child nodes and changes their values, but does not change the text in the parent node.
My solution:
i_text,p_text,a_text,another one_text
// Replace '_' with '_changed_' in every piece of text while leaving all tags in place.
$html = '<p><i>i_text</i>,p_text<b>,a_text</b>,another one_text</p>';
$dom = new DOMDocument();
$dom->loadXML($html);
// preserveWhiteSpace / validateOnParse only influence parsing, so assigning them
// after loadXML() did nothing; they are omitted here.
$xpath = new DOMXPath($dom);
// Edit the text nodes themselves, at any nesting depth. Writing nodeValue on an
// element would replace its whole content with plain text and drop nested tags.
foreach ($xpath->query('//text()') as $textNode) {
    $textNode->nodeValue = str_replace('_', '_changed_', $textNode->nodeValue);
}
echo ($dom->saveXML());
output
i_changed_text,p_changed_text,a_changed_text,another one_changed_text
Related
I’m trying to scrape a table on Borsa Italiana
I use this code
<?php
// Downloads the page, takes the first two cells of every row of the
// "m-table -clear-m" table and prints them as a plain two-column HTML table.
$url = "https://www.borsaitaliana.it/borsa/azioni/global-equity-market/dati-completi.html?isin=IT0001477402";
$html = file_get_contents($url);
// Collect markup warnings internally instead of printing them.
libxml_use_internal_errors(true);
$doc = new \DOMDocument();
// Skip everything when the download failed or returned nothing, so that
// $result is never used without having been created.
if($html !== false && $html !== '' && $doc->loadHTML($html))
{
    $result = new \DOMDocument();
    $result->formatOutput = true;
    $table = $result->appendChild($result->createElement("table"));
    $tbody = $table->appendChild($result->createElement("tbody"));
    $xpath = new \DOMXPath($doc);
    // Attribute tests in XPath are written with '@' (the '#' form is not valid XPath).
    foreach($xpath->query("//table[@class=\"m-table -clear-m\"]/tbody/tr") as $row)
    {
        $newRow = $tbody->appendChild($result->createElement("tr"));
        foreach($xpath->query("./td[position()>0 and position()<3]", $row) as $cell)
        {
            // createElement() does not escape its value argument; a text node
            // keeps characters such as '&' intact.
            $td = $newRow->appendChild($result->createElement("td"));
            $td->appendChild($result->createTextNode(trim($cell->nodeValue)));
        }
    }
    echo $result->saveHTML($result->documentElement);
}
?>
The result is a table with two columns and several rows. I would like to transpose the first column into a header row, so that I can save the result in my database for my personal use.
Can anyone help me?
Thank you
Try it:
<?php
// Downloads the page, reads the first two cells of every row of the
// "m-table -clear-m" table and prints them transposed: the first column
// becomes the header row and the second column becomes a single data row.
$url = "https://www.borsaitaliana.it/borsa/azioni/global-equity-market/dati-completi.html?isin=IT0001477402";
$html = file_get_contents($url);
// Collect markup warnings internally instead of printing them.
libxml_use_internal_errors(true);
$doc = new \DOMDocument();
// Skip everything when the download failed or returned nothing, so that
// $result is never used without having been created.
if ($html !== false && $html !== '' && $doc->loadHTML($html)) {
    $result = new \DOMDocument();
    $result->formatOutput = true;
    $xpath = new \DOMXPath($doc);
    // collects data in $arr -->
    $arr = [];
    // Attribute tests in XPath are written with '@' (the '#' form is not valid XPath).
    foreach ($xpath->query("//table[@class=\"m-table -clear-m\"]/tbody/tr") as $row) {
        $itm = [];
        foreach ($xpath->query("./td[position()>0 and position()<3]", $row) as $cell) {
            $itm[] = trim($cell->nodeValue);
        }
        $arr[] = $itm;
    }
    // <--
    $table = $result->appendChild($result->createElement("table"));
    // outputs head -->
    $thead = $table->appendChild($result->createElement("thead"));
    $newRow = $thead->appendChild($result->createElement("tr"));
    // Emit one header cell per collected row (empty when the row had no cells)
    // so header and data cells always line up.
    foreach ($arr as $row) {
        // createElement() does not escape its value argument; a text node
        // keeps characters such as '&' intact.
        $th = $newRow->appendChild($result->createElement("th"));
        $th->appendChild($result->createTextNode(isset($row[0]) ? $row[0] : ""));
    }
    // <--
    // outputs data -->
    $tbody = $table->appendChild($result->createElement("tbody"));
    $newRow = $tbody->appendChild($result->createElement("tr"));
    foreach ($arr as $row) {
        $td = $newRow->appendChild($result->createElement("td"));
        $td->appendChild($result->createTextNode(isset($row[1]) ? $row[1] : ""));
    }
    // <--
    echo $result->saveHTML($result->documentElement);
}
But I agree with @tim - you should use an API for that.
Using PHP, I want to remove all HTML attributes except:
"src" attribute from "img" tag
and
"href" attribute from "a" tag
My input is an .html file that has been converted from .doc and .docx.
My output should again be an HTML file, with those attributes removed.
Kindly help me please
Edit ::
After trying Alexander's script as below, if I open strip.html in a code editor I don't see any changes:
<?php
// Rewrites strip.html in place, keeping only "src" on <img> elements and
// "href" on <a> elements; every other attribute on those tags is removed.
$path = '/var/www/strip.html';
$html = file_get_contents($path);
$dom = new DOMDocument();
$dom->strictErrorChecking = false;
$dom->formatOutput = true;
$dom->loadHTML($html);
$xpath = new DOMXPath($dom);
// tag name => the single attribute that survives on that tag
$keepByTag = array('img' => 'src', 'a' => 'href');
foreach ($keepByTag as $tag => $keep) {
    $elements = $xpath->query("//" . $tag);
    if ($elements === false) {
        die('Error');
    }
    foreach ($elements as $element) {
        // Walk the attribute list from the last index down to 0 so that
        // removals never shift an index that is still to be visited.
        $i = $element->attributes->length;
        while ($i-- > 0) {
            $name = $element->attributes->item($i)->name;
            if ($name !== $keep) {
                $element->removeAttribute($name);
            }
        }
    }
}
$dom->saveHTMLFile($path);
?>
Use DOMDocument class for parsing HTML ("a" and "img" tags processing):
// Rewrites the file in place, keeping only "src" on <img> elements and "href"
// on <a> elements; every other attribute on those tags is removed.
$path = '/path/to/file.html';
$html = file_get_contents($path);
// Stop when the file cannot be read or is empty instead of parsing nothing.
if (false === $html || '' === $html) die('Error');
// Collect markup warnings internally while parsing, then restore the previous
// libxml error mode.
$previousErrorMode = libxml_use_internal_errors(true);
$dom = new DOMDocument();
//$dom->strictErrorChecking = false;
$dom->formatOutput = true;
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);
$xpath = new DOMXPath($dom);
if (false === ($elements = $xpath->query("//img"))) die('Error');
foreach ($elements as $element) {
    // Iterate from the last attribute to the first so removals do not shift
    // indexes that are still to be visited.
    for ($i = $element->attributes->length; --$i >= 0;) {
        $name = $element->attributes->item($i)->name;
        if ('src' !== $name) {
            $element->removeAttribute($name);
        }
    }
}
if (false === ($elements = $xpath->query("//a"))) die('Error');
foreach ($elements as $element) {
    for ($i = $element->attributes->length; --$i >= 0;) {
        $name = $element->attributes->item($i)->name;
        if ('href' !== $name) {
            $element->removeAttribute($name);
        }
    }
}
// Report a failed write rather than silently leaving the file unchanged.
if (false === $dom->saveHTMLFile($path)) die('Error');
Also, read why you can't parse [X]HTML with regex and take a look at useful xpath links.
Update (all tags with exception "a" and "img" attributes processing):
// Removes every attribute from every element of the file, sparing only
// "src" on <img> and "href" on <a>, then writes the document back in place.
$path = '/path/to/file.html';
$html = file_get_contents($path);
$dom = new DOMDocument();
//$dom->strictErrorChecking = false;
$dom->formatOutput = true;
$dom->loadHTML($html);
$xpath = new DOMXPath($dom);
$elements = $xpath->query("//*");
if ($elements === false) {
    die('Error');
}
// element name => the one attribute allowed to stay on it
$allowed = array('img' => 'src', 'a' => 'href');
foreach ($elements as $element) {
    $tag = $element->nodeName;
    // Visit attributes from the last index down to 0.
    for ($index = $element->attributes->length - 1; $index >= 0; $index--) {
        $name = $element->attributes->item($index)->name;
        $isKept = isset($allowed[$tag]) && $allowed[$tag] === $name;
        if (!$isKept) {
            $element->removeAttribute($name);
        }
    }
}
$dom->saveHTMLFile($path);
I tried to concatenate the inner HTML of a div into a string variable:
games variable:
// Top-level string that is meant to collect the extracted HTML.
$games = '';
DOMinnerHTML function:
/**
 * Serialises the children of a node to a single HTML string.
 *
 * Each child is deep-imported into its own scratch DOMDocument, saved with
 * saveHTML() and trimmed; the pieces are concatenated in document order.
 *
 * @param DOMNode $element Node whose children are serialised.
 * @return string Concatenation of the individually trimmed child markup.
 */
function DOMinnerHTML($element)
{
    $pieces = array();
    for ($node = $element->firstChild; $node !== null; $node = $node->nextSibling) {
        $scratch = new DOMDocument();
        $copy = $scratch->importNode($node, true);
        $scratch->appendChild($copy);
        $pieces[] = trim($scratch->saveHTML());
    }
    return implode('', $pieces);
}
ExtractFromType function:
/**
 * Loads the page for $type and appends the inner HTML of every <div> whose
 * style attribute contains "MyString" to $games.
 *
 * NOTE: $games is not declared or imported in this function, so the
 * assignment below creates a function-local variable and nothing is returned;
 * this is the behaviour the question is asking about.
 *
 * @param string $type Path segment appended to the site address.
 */
function ExtractFromType($type)
{
    $html = file_get_contents('www.site.com/' .$type);
    $dom = new domDocument;
    // '@' suppresses the warnings loadHTML() emits for imperfect markup.
    @$dom->loadHTML($html);
    $dom->preserveWhiteSpace = false;
    $divs = $dom->getElementsByTagName('div');
    foreach ($divs as $div) {
        if (strpos($div->getAttribute('style'),'MyString') !== false) {
            //////
            $games = $games.DOMinnerHTML($div);
            //////
        }
    }
}
code:
// Run the extraction, then print the top-level $games string.
ExtractFromType('MyType');
echo $games; // = Nothing.
This code returns nothing.
$games is defined in the global scope, so it is not available inside ExtractFromType. Define it inside the function, then return the value:
/**
 * Loads the page for $type and returns the concatenated inner HTML of every
 * <div> whose style attribute contains "MyString".
 *
 * @param string $type Path segment appended to the site address.
 * @return string Concatenated inner HTML; an empty string when the page could
 *                not be read or no <div> matched.
 */
function ExtractFromType($type) {
    // NOTE(review): the address has no URL scheme, so file_get_contents() treats
    // it as a local path - presumably a placeholder; confirm before use.
    $html = file_get_contents('www.site.com/' .$type);
    if ($html === false || $html === '') {
        return '';
    }
    $dom = new domDocument;
    // Collect markup warnings internally while parsing, then restore the
    // previous libxml error mode.
    $previousErrorMode = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);
    $dom->preserveWhiteSpace = false;
    $divs = $dom->getElementsByTagName('div');
    $games = '';
    foreach ($divs as $div) {
        if (strpos($div->getAttribute('style'),'MyString') !== false) {
            $games .= DOMinnerHTML($div);
        }
    }
    // Hand the collected markup back to the caller; without this the echo
    // below prints nothing.
    return $games;
}
echo ExtractFromType('MyType');
I am using PHP's DOMDocument class to load an HTML file and then empty its contents. The problem is that when I call removeChild() it gives me a 'Not Found Error'. Here is my code:
// Load a.html and try to empty its <body> by removing each child in turn.
$doc=new DOMDocument();
$doc->loadHTMLFile("a.html");
$body= $doc->getElementsByTagName('body')->item(0);
// NOTE(review): children are removed from $body while its own childNodes list
// is being iterated; this is the loop the 'Not Found Error' is reported for.
foreach($body->childNodes as $child)
{
$body->removeChild($child);
}
$child is of type DOMText... Maybe that is because removeChild() expects a DOMNode and not a DOMText? If so, how can I iterate over childNodes such that $child is of type DOMNode?
Use a for loop instead of a foreach loop.
// Empty the <body> of c.html and print the resulting document.
$doc=new DOMDocument();
$doc->preserveWhiteSpace = true;
// loadHTMLFile() reads the file; loadHTML() would parse the literal string
// "c.html" as markup.
$doc->loadHTMLFile("c.html");
$body = $doc->getElementsByTagName('body')->item(0);
if ($body) {
    $children = $body->childNodes;
    // childNodes is a live list: walking it from the last index down to 0 means
    // each removal leaves the indexes still to be visited unchanged. A forward
    // loop would skip every other child.
    for ($i = $children->length - 1; $i >= 0; $i--) {
        $body->removeChild($children->item($i));
    }
}
$html = $doc->saveHTML();
echo $html;
I have the following (PHP) code that traverses an entire DOM document to get all of the text nodes. It's a bit of an ugly solution, and I'm sure there must be a better way... so, is there?
// Depth-first walk over $document that gathers every text node into $nodes
// without recursion: go down to the first child when possible, otherwise
// sideways to the next sibling, otherwise back up to the parent.
$nodes = [];
$current = $document;
// True right after climbing to a parent, whose subtree has already been visited.
$ascending = false;
while ($current) {
    if ($current->nodeType === XML_TEXT_NODE) {
        $nodes[] = $current;
    }
    if (!$ascending && $current->firstChild) {
        $current = $current->firstChild;
        continue;
    }
    if ($current->nextSibling) {
        $current = $current->nextSibling;
        $ascending = false;
        continue;
    }
    $current = $current->parentNode;
    $ascending = true;
}
Thanks.
The XPath expression you need is //text(). Try using it with DOMXPath::query. For example:
// Select every text node of $doc with a single XPath query.
$xpath = new DOMXPath($doc);
// '//text()' matches text nodes at any depth of the document.
$textnodes = $xpath->query('//text()');