I have a question regarding simplify my codes.
I have
public function getText($text){
if(!empty($text)){
$dom = new DomDocument();
$dom->loadHTML($text);
$xpath=new DOMXpath($dom);
$result = $xpath->query('//a');
if($result->length > 0){
$atags=$dom->getElementsByTagName('a');
foreach($atags as $atag){
$style = $atag ->getAttribute('style');
$atag->setAttribute('style',$style.' text-decoration:none;color:black;');
}
$returnText .= $dom->saveHTML();
return $returnText;
}
$result = $xpath->query('//table');
if($result->length > 0){
$tables = $dom->getElementsByTagName('table');
$inputs = $dom->getElementsByTagName('input');
foreach ($inputs as $input) {
$input->setAttribute('style','text-align:center;');
}
foreach ($tables as $table) {
$table->setAttribute('width',500);
$table->setAttribute('style','border:2px solid #8C8C8C;text-align:center;table-layout:fixed;');
}
$returnText .= $dom->saveHTML();
return $returnText;
}
}
return $text;
}
public function getTextwithIndex($text,$index=''){
if(!empty($text[$index])){
$dom = new DomDocument();
$dom->loadHTML($text[$index]);
$xpath=new DOMXpath($dom);
$result = $xpath->query('//a');
if($result->length > 0){
$atags=$dom->getElementsByTagName('a');
foreach($atags as $atag){
$style = $atag ->getAttribute('style');
$atag->setAttribute('style',$style.' text-decoration:none;color:black;');
}
$returnText .= $dom->saveHTML();
return $returnText;
}
$result = $xpath->query('//tbody');
if($result->length > 0){
$tbodies = $dom->getElementsByTagName('tbody');
$cells = $dom->getElementsByTagName('td');
$inputs = $dom->getElementsByTagName('input');
foreach ($inputs as $input) {
$input->setAttribute('style','text-align:center;');
}
foreach ($cells as $cell) {
$cell->setAttribute('style','border:1px solid black;');
}
foreach ($tbodies as $tbody) {
$table = $dom->createElement('table');
$table->setAttribute('width',500);
$table->setAttribute('style','border:2px solid #8C8C8C;text-align:center;table-layout:fixed;');
$tbody->parentNode->replaceChild($table, $tbody);
$table->appendChild($tbody);
}
$returnText .= $dom->saveHTML();
return $returnText;
}
}
return $text;
}
The difference between the method is $index and some modification of my domdocument. I feel like it's really cumbersome and could use some refactoring. Does anyone have any good suggestions? Thanks!
How about something like this:
public function getTextwithIndex($text,$index='') {
if (empty($index))
return getText($text); //not sure how $text works, so this line might be different.
return getText($text[$index]);
}
Or something like this:
public function getText($text, $index = false){
if ($index)
$text = $text[$index];
if(!empty($text)){
$dom = new DomDocument();
$dom->loadHTML($text);
$xpath=new DOMXpath($dom);
$result = $xpath->query('//a');
if($result->length > 0){
$atags=$dom->getElementsByTagName('a');
foreach($atags as $atag){
$style = $atag ->getAttribute('style');
$atag->setAttribute('style',$style.' text-decoration:none;color:black;');
}
$returnText .= $dom->saveHTML();
return $returnText;
}
$result = $xpath->query('//table');
if($result->length > 0){
if ($index) {
//do 'getTextWithIndex' dom stuff
} else {
$tables = $dom->getElementsByTagName('table');
$inputs = $dom->getElementsByTagName('input');
}
foreach ($inputs as $input) {
$input->setAttribute('style','text-align:center;');
}
foreach ($tables as $table) {
$table->setAttribute('width',500);
$table->setAttribute('style','border:2px solid #8C8C8C;text-align:center;table-layout:fixed;');
}
$returnText .= $dom->saveHTML();
return $returnText;
}
}
return $text;
}
Related
I’m trying to scrape a table on Borsa Italiana
I use this code
<?php
$url = "https://www.borsaitaliana.it/borsa/azioni/global-equity-market/dati-completi.html?isin=IT0001477402";
$html = file_get_contents($url);
libxml_use_internal_errors(true);
$doc = new \DOMDocument();
if($doc->loadHTML($html))
{
$result = new \DOMDocument();
$result->formatOutput = true;
$table = $result->appendChild($result->createElement("table"));
$tbody = $table->appendChild($result->createElement("tbody"));
$xpath = new \DOMXPath($doc);
foreach($xpath->query("//table[#class=\"m-table -clear-m\"]/tbody/tr") as $row)
{
$newRow = $tbody->appendChild($result->createElement("tr"));
foreach($xpath->query("./td[position()>0 and position()<3]", $row) as $cell)
{
$newRow->appendChild($result->createElement("td", trim($cell->nodeValue)));
}
}
}
echo $result->saveHTML($result->documentElement);
?>
Result is a table with two columns and more rows. I would transpose first column in header, in order to save result in my database for my personal use.
Can anyone help me?
Thank you
Try it:
<?php
$url = "https://www.borsaitaliana.it/borsa/azioni/global-equity-market/dati-completi.html?isin=IT0001477402";
$html = file_get_contents($url);
libxml_use_internal_errors(true);
$doc = new \DOMDocument();
if ($doc->loadHTML($html)) {
$result = new \DOMDocument();
$result->formatOutput = true;
$xpath = new \DOMXPath($doc);
// collects data in $arr -->
$arr = [];
foreach ($xpath->query("//table[#class=\"m-table -clear-m\"]/tbody/tr") as $row) {
$itm = [];
foreach ($xpath->query("./td[position()>0 and position()<3]", $row) as $cell) {
$itm[] = trim($cell->nodeValue);
}
$arr[] = $itm;
}
// <--
$table = $result->appendChild($result->createElement("table"));
// outputs head -->
$thead = $table->appendChild($result->createElement("thead"));
$newRow = $thead->appendChild($result->createElement("tr"));
foreach (array_column($arr, 0) as $th) {
$newRow->appendChild($result->createElement("th", $th));
}
// <--
// outputs data -->
$tbody = $table->appendChild($result->createElement("tbody"));
$newRow = $tbody->appendChild($result->createElement("tr"));
foreach ($arr as $row) {
$newRow->appendChild($result->createElement("td", isset($row[1])? $row[1]: ""));
}
// <--
}
echo $result->saveHTML($result->documentElement);
But I agree with #tim - you have to use API for that.
I which to extract all the link include on page with anchor or alt attribute on image include in the links if this one come first.
$html = 'Anchor';
Must return "lien.fr;Anchor"
$html = '<img alt="Alt Anchor">Anchor';
Must return "lien.fr;Alt Anchor"
$html = 'Anchor<img alt="Alt Anchor">';
Must return "lien.fr;Anchor"
I did:
$doc = new DOMDocument();
$doc->loadHTML($html);
$out = "";
$n = 0;
$links = $doc->getElementsByTagName('a');
foreach ($links as $element) {
$href = $img_alt = $anchor = "";
$href = $element->getAttribute('href');
$n++;
if (!strrpos($href, "panier?")) {
if ($element->firstChild->nodeName == "img") {
$imgs = $element->getElementsByTagName('img');
foreach ($imgs as $img) {
if ($anchor = $img->getAttribute('alt')) {
break;
}
}
}
if (($anchor == "") && ($element->nodeValue)) {
$anchor = $element->nodeValue;
}
$out[$n]['link'] = $href;
$out[$n]['anchor'] = $anchor;
}
}
This seems to work but if there some space or indentation it doesn't
as
$html = '<a href="link.fr">
<img src="ceinture-gris" alt="alt anchor"/>
</a>';
the $element->firstChild->nodeName will be text
Something like this:
$doc = new DOMDocument();
$doc->loadHTML($html);
// Output texts that will later be joined with ';'
$out = [];
// Maximum number of items to add to $out
$max_out_items = 2;
// List of img tag attributes that will be parsed by the loop below
// (in the order specified in this array!)
$img_attributes = ['alt', 'src', 'title'];
$links = $doc->getElementsByTagName('a');
foreach ($links as $element) {
if ($href = trim($element->getAttribute('href'))) {
$out []= $href;
if (count($out) >= $max_out_items)
break;
}
foreach ($element->childNodes as $child) {
if ($child->nodeType === XML_TEXT_NODE &&
$text = trim($child->nodeValue))
{
$out []= $text;
if (count($out) >= $max_out_items)
break;
} elseif ($child->nodeName == 'img') {
foreach ($img_attributes as $attr_name) {
if ($attr_value = trim($child->getAttribute($attr_name))) {
$out []= $attr_value;
if (count($out) >= $max_out_items)
goto Result;
}
}
}
}
}
Result:
echo $out = implode(';', $out);
Using PHP i want to remove all HTML attributes except
"src" attribute from "img" tag
and
"href" attribute from "a" tag
My Input file is .html file which is been converted from .doc and .docx
My output file again should be HTML file with removed attribute
Kindly help me please
Edit ::
After Trying alexander script as below if i open the strip.html in code editor i don't see any changes
<?php
$path = '/var/www/strip.html';
$html = file_get_contents($path);
$dom = new DOMDocument();
$dom->strictErrorChecking = false;
$dom->formatOutput = true;
$dom->loadHTML($html);
$xpath = new DOMXPath($dom);
if (false === ($elements = $xpath->query("//img"))) die('Error');
foreach ($elements as $element) {
for ($i = $element->attributes->length; --$i >= 0;) {
$name = $element->attributes->item($i)->name;
if ('src' !== $name) {
$element->removeAttribute($name);
}
}
}
if (false === ($elements = $xpath->query("//a"))) die('Error');
foreach ($elements as $element) {
for ($i = $element->attributes->length; --$i >= 0;) {
$name = $element->attributes->item($i)->name;
if ('href' !== $name) {
$element->removeAttribute($name);
}
}
}
$dom->saveHTMLFile($path);
?>
Use DOMDocument class for parsing HTML ("a" and "img" tags processing):
$path = '/path/to/file.html';
$html = file_get_contents($path);
$dom = new DOMDocument();
//$dom->strictErrorChecking = false;
$dom->formatOutput = true;
$dom->loadHTML($html);
$xpath = new DOMXPath($dom);
if (false === ($elements = $xpath->query("//img"))) die('Error');
foreach ($elements as $element) {
for ($i = $element->attributes->length; --$i >= 0;) {
$name = $element->attributes->item($i)->name;
if ('src' !== $name) {
$element->removeAttribute($name);
}
}
}
if (false === ($elements = $xpath->query("//a"))) die('Error');
foreach ($elements as $element) {
for ($i = $element->attributes->length; --$i >= 0;) {
$name = $element->attributes->item($i)->name;
if ('href' !== $name) {
$element->removeAttribute($name);
}
}
}
$dom->saveHTMLFile($path);
Also, read why you can't parse [X]HTML with regex and take a look at useful xpath links.
Update (all tags with exception "a" and "img" attributes processing):
$path = '/path/to/file.html';
$html = file_get_contents($path);
$dom = new DOMDocument();
//$dom->strictErrorChecking = false;
$dom->formatOutput = true;
$dom->loadHTML($html);
$xpath = new DOMXPath($dom);
if (false === ($elements = $xpath->query("//*"))) die('Error');
foreach ($elements as $element) {
for ($i = $element->attributes->length; --$i >= 0;) {
$name = $element->attributes->item($i)->name;
if (('img' === $element->nodeName && 'src' === $name)
|| ('a' === $element->nodeName && 'href' === $name)
) {
continue;
}
$element->removeAttribute($name);
}
}
$dom->saveHTMLFile($path);
i tried to concatenate innerhtml of div into string variable:
games variable:
$games = '';
DOMinnerHTML function:
function DOMinnerHTML($element)
{
$innerHTML = "";
$children = $element->childNodes;
foreach ($children as $child)
{
$tmp_dom = new DOMDocument();
$tmp_dom->appendChild($tmp_dom->importNode($child, true));
$innerHTML.=trim($tmp_dom->saveHTML());
}
return $innerHTML;
}
ExtractFromType function:
function ExtractFromType($type)
{
$html = file_get_contents('www.site.com/' .$type);
$dom = new domDocument;
#$dom->loadHTML($html);
$dom->preserveWhiteSpace = false;
$divs = $dom->getElementsByTagName('div');
foreach ($divs as $div) {
if (strpos($div->getAttribute('style'),'MyString') !== false) {
//////
$games = $games.DOMinnerHTML($div);
//////
}
}
}
code:
ExtractFromType('MyType');
echo $games; // = Nothing.
this code return nothing.
$games is defined in the global scope, and it's not available inside ExctractFromType. Define it inside the function, then return the value:
function ExtractFromType($type) {
$html = file_get_contents('www.site.com/' .$type);
$dom = new domDocument;
#$dom->loadHTML($html);
$dom->preserveWhiteSpace = false;
$divs = $dom->getElementsByTagName('div');
$games = '';
foreach ($divs as $div) {
if (strpos($div->getAttribute('style'),'MyString') !== false) {
$games = $games.DOMinnerHTML($div);
}
}
}
echo ExtractFromType('MyType');
I'm trying to add the results of a script to an array, but once I look into it there is only one item in it, probably me being silly with placement
function crawl_page($url, $depth)
{
static $seen = array();
$Linklist = array();
if (isset($seen[$url]) || $depth === 0) {
return;
}
$seen[$url] = true;
$dom = new DOMDocument('1.0');
#$dom->loadHTMLFile($url);
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $element) {
$href = $element->getAttribute('href');
if (0 !== strpos($href, 'http')) {
$href = rtrim($url, '/') . '/' . ltrim($href, '/');
}
if(shouldScrape($href)==true)
{
crawl_page($href, $depth - 1);
}
}
echo "URL:",$url;
echo http_response($url);
echo "<br/>";
$Linklist[] = $url;
$XML = new DOMDocument('1.0');
$XML->formatOutput = true;
$root = $XML->createElement('Links');
$root = $XML->appendChild($root);
foreach ($Linklist as $value)
{
$child = $XML->createElement('Linkdetails');
$child = $root->appendChild($child);
$text = $XML->createTextNode($value);
$text = $child->appendChild($text);
}
$XML->save("linkList.xml");
}
$Linklist[] = $url; will add a single item to the $Linklist array. This line needs to be in a loop I think.
static $Linklist = array(); i think, but code is awful