I tried to concatenate the innerHTML of a div into a string variable:
games variable:
$games = '';
DOMinnerHTML function:
/**
 * Serialises the children of a DOM node into a single HTML string.
 *
 * Every child is deep-copied into its own scratch DOMDocument, rendered with
 * saveHTML(), trimmed, and the pieces are joined in document order.
 *
 * @param DOMNode $element Node whose childNodes are rendered.
 * @return string Concatenated, trimmed HTML of each child ('' when there are none).
 */
function DOMinnerHTML($element)
{
    $pieces = array();
    foreach ($element->childNodes as $node) {
        // A throw-away document lets saveHTML() render just this one child.
        $scratch = new DOMDocument();
        $copy = $scratch->importNode($node, true);
        $scratch->appendChild($copy);
        $pieces[] = trim($scratch->saveHTML());
    }
    return implode('', $pieces);
}
ExtractFromType function:
/**
 * Reads 'www.site.com/' . $type and walks every <div> of a DOMDocument,
 * appending the inner HTML of each div whose style attribute contains
 * 'MyString' to $games.
 *
 * NOTE(review): $games is only a local variable here (there is no `global`
 * declaration and no return statement), so a $games defined by the caller is
 * never modified by this function.
 */
function ExtractFromType($type)
{
$html = file_get_contents('www.site.com/' .$type);
$dom = new domDocument;
// NOTE(review): the leading '#' turns the next line into a comment, so $html is never loaded into $dom — presumably '@' was intended; confirm.
#$dom->loadHTML($html);
$dom->preserveWhiteSpace = false;
$divs = $dom->getElementsByTagName('div');
foreach ($divs as $div) {
// Keep only divs whose inline style mentions 'MyString'.
if (strpos($div->getAttribute('style'),'MyString') !== false) {
//////
$games = $games.DOMinnerHTML($div);
//////
}
}
}
code:
ExtractFromType('MyType');
echo $games; // = Nothing.
This code returns nothing.
$games is defined in the global scope, and it's not available inside ExtractFromType. Define it inside the function, then return the value:
/**
 * Collects the inner HTML of every <div> whose inline style contains 'MyString'.
 *
 * NOTE(review): 'www.site.com/' has no scheme, so file_get_contents() treats it
 * as a local path — presumably a placeholder for a real URL; confirm.
 *
 * @param string $type Path segment appended to 'www.site.com/'.
 * @return string Concatenated inner HTML of the matching divs; '' when the page
 *                cannot be read or nothing matches.
 */
function ExtractFromType($type) {
    $html = file_get_contents('www.site.com/' . $type);
    if ($html === false || $html === '') {
        // Nothing to parse (loadHTML() also rejects an empty string).
        return '';
    }

    $dom = new DOMDocument;
    $dom->preserveWhiteSpace = false;
    // Real-world pages are rarely valid HTML: collect parser warnings quietly
    // instead of printing them, then restore the previous setting.
    $previous = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $games = '';
    foreach ($dom->getElementsByTagName('div') as $div) {
        if (strpos($div->getAttribute('style'), 'MyString') !== false) {
            $games .= DOMinnerHTML($div);
        }
    }

    // The caller echoes the result, so it has to be returned.
    return $games;
}
echo ExtractFromType('MyType');
Related
I have html code something like this:
<p><i>i_text</i>,p_text</p>
i_text,p_text
I want to change all node values in this DOM element and keep all tags:
i_changed_text,p_changed_text
my attempts)
// Attempt 1: replace '_' with '_changed_' in the nodeValue of every element.
$html = '<p><i>i_text</i> p_text</p>';
$dom = new DOMDocument();
$dom->loadHTML($html);
// NOTE(review): these two properties are assigned after loadHTML() has already run — presumably they have no effect on this parse; confirm.
$dom->preserveWhiteSpace = false;
$dom->validateOnParse = true;
// '*' selects every element in the document.
$elements = $dom->getElementsByTagName('*');
foreach ($elements as $element) {
// Writing an element's nodeValue replaces its whole content with plain text, which is why nested tags such as <i> are lost.
$element->nodeValue = str_replace('_','_changed_',$element->nodeValue);
}
echo($dom->saveHTML());
output i_changed_text,p_changed_text
This code returns the correct text but doesn't keep the child nodes.
// Attempt 2: descend to the first leaf under each element and rewrite only that leaf.
$html = '<p><i>i_text</i>,p_text</p>';
$dom = new DOMDocument();
$dom->loadXML($html);
// NOTE(review): assigned after loadXML() has already run — presumably no effect on this parse; confirm.
$dom->preserveWhiteSpace = false;
$dom->validateOnParse = true;
$elements = $dom->getElementsByTagName('*');
// $elem (<dfn text="element">tag</dfn>) is never inserted into the document; it is only serialised inside the loop below.
$elem = $dom->createElement('dfn', 'tag');
$attr = $dom->createAttribute('text');
$attr->value = 'element';
$elem->appendChild($attr);
$elements = $dom->getElementsByTagName('*');
foreach ($elements as $element) {
// Follow first children down to a node without children; later siblings (e.g. ',p_text') are never visited.
while ($element->hasChildnodes()) {
$element = $element->childNodes->item(0);
}
$changed_value = str_replace('_','_changed_',$element->nodeValue);
// Any literal "tag" in the changed text is replaced by the serialised $elem markup.
$element->nodeValue = str_replace("tag", $dom->saveXML($elem), $changed_value);
}
echo ($dom->saveXML());
output
i_changed_text,p_text
This code keeps the child nodes and changes their values, but doesn't change the text in the parent node.
my solution)
i_text,p_text,a_text,another one_text
// Replace '_' with '_changed_' in every piece of text while keeping all tags.
$html = '<p><i>i_text</i>,p_text<b>,a_text</b>,another one_text</p>';
$dom = new DOMDocument();
// preserveWhiteSpace / validateOnParse were previously assigned after loading,
// where they cannot influence the parse, so they are omitted here.
$dom->loadXML($html);

// Rewrite text nodes only. Elements are never assigned to, so markup survives
// at any nesting depth (assigning an element's nodeValue would flatten its
// children into plain text).
$xpath = new DOMXPath($dom);
foreach ($xpath->query('//text()') as $textNode) {
    $textNode->nodeValue = str_replace('_', '_changed_', $textNode->nodeValue);
}

echo ($dom->saveXML());
output
i_changed_text,p_changed_text,a_changed_text,another one_changed_text
I’m trying to scrape a table on Borsa Italiana
I use this code
<?php
// Scrapes the first two cells of every row of the "m-table -clear-m" table
// and prints them as a new two-column HTML table.
$url = "https://www.borsaitaliana.it/borsa/azioni/global-equity-market/dati-completi.html?isin=IT0001477402";
$html = file_get_contents($url);
// Keep libxml warnings about non-strict real-world HTML out of the output.
libxml_use_internal_errors(true);
$doc = new \DOMDocument();

// Build the output skeleton up front so the final echo always has a document
// to print, even when the download or the parse fails.
$result = new \DOMDocument();
$result->formatOutput = true;
$table = $result->appendChild($result->createElement("table"));
$tbody = $table->appendChild($result->createElement("tbody"));

if ($html !== false && $html !== '' && $doc->loadHTML($html)) {
    $xpath = new \DOMXPath($doc);
    // Attributes are addressed with '@' in XPath; '#class' is not a valid expression.
    foreach ($xpath->query("//table[@class=\"m-table -clear-m\"]/tbody/tr") as $row) {
        $newRow = $tbody->appendChild($result->createElement("tr"));
        // Only the first two <td> cells of each source row are copied.
        foreach ($xpath->query("./td[position()>0 and position()<3]", $row) as $cell) {
            // A text node escapes '&' and '<'; createElement()'s value argument does not.
            $td = $newRow->appendChild($result->createElement("td"));
            $td->appendChild($result->createTextNode(trim($cell->nodeValue)));
        }
    }
}
echo $result->saveHTML($result->documentElement);
?>
The result is a table with two columns and several rows. I would like to transpose the first column into a header row, so that I can save the result in my database for my personal use.
Can anyone help me?
Thank you
Try it:
<?php
// Scrapes the "m-table -clear-m" table and prints it transposed: the first
// cell of every source row becomes a header cell, the second a data cell.
$url = "https://www.borsaitaliana.it/borsa/azioni/global-equity-market/dati-completi.html?isin=IT0001477402";
$html = file_get_contents($url);
// Keep libxml warnings about non-strict real-world HTML out of the output.
libxml_use_internal_errors(true);
$doc = new \DOMDocument();

// Created before the parse check so the final echo always has a document to print.
$result = new \DOMDocument();
$result->formatOutput = true;
$table = $result->appendChild($result->createElement("table"));

if ($html !== false && $html !== '' && $doc->loadHTML($html)) {
    $xpath = new \DOMXPath($doc);

    // collects data in $arr -->
    $arr = [];
    // Attributes are addressed with '@' in XPath; '#class' is not a valid expression.
    foreach ($xpath->query("//table[@class=\"m-table -clear-m\"]/tbody/tr") as $row) {
        $itm = [];
        foreach ($xpath->query("./td[position()>0 and position()<3]", $row) as $cell) {
            $itm[] = trim($cell->nodeValue);
        }
        // Rows without any <td> are skipped so header and data columns stay aligned.
        if ($itm) {
            $arr[] = $itm;
        }
    }
    // <--

    // outputs head -->
    $thead = $table->appendChild($result->createElement("thead"));
    $newRow = $thead->appendChild($result->createElement("tr"));
    foreach ($arr as $row) {
        // Text nodes escape '&' and '<'; createElement()'s value argument does not.
        $th = $newRow->appendChild($result->createElement("th"));
        $th->appendChild($result->createTextNode($row[0]));
    }
    // <--

    // outputs data -->
    $tbody = $table->appendChild($result->createElement("tbody"));
    $newRow = $tbody->appendChild($result->createElement("tr"));
    foreach ($arr as $row) {
        $td = $newRow->appendChild($result->createElement("td"));
        $td->appendChild($result->createTextNode(isset($row[1]) ? $row[1] : ""));
    }
    // <--
}
echo $result->saveHTML($result->documentElement);
But I agree with @tim - you should use an API for that.
I am trying to grab URL, with DOMparser but stuck at getNamedItem
How to solve this problem? What I am missing here? I welcome for any idea!
// Downloads the search page and looks for a link on <p class="c-name"> elements.
$url = 'https://www.31sumai.com/search/area/kansai/result/?area=16,17,18';
$html = file_get_contents($url);
// Collect libxml parse warnings internally instead of printing them.
libxml_use_internal_errors(true);
$DOMParser = new \DOMDocument();
$DOMParser->loadHTML($html);
$mainlink = null;
$allPTags = $DOMParser->getElementsByTagName('p');
foreach ($allPTags as $ptag) {
$class = $ptag->attributes->getNamedItem("class");
// Exact match: a <p> carrying additional classes would not pass this test.
if ($class && $class->nodeValue == 'c-name') {
// NOTE(review): this reads "href" from the <p> element itself; presumably the URL sits on an <a> inside the <p>, which would leave $mainlink null — confirm against the page markup.
$main = $ptag->attributes->getNamedItem("href");
if ($main) {
$mainlink = $main->nodeValue;
}
}
}
var_dump($mainlink);
It's returning null, but I have already checked the website and there is a URL in that tag.
// Fetch the page and print the href of the last <a> found inside a
// <p class="areapageDetailList_item_btn_hp"> element.
$url = 'https://lions-mansion.jp/area/kansai/';
$html = file_get_contents($url);
// Collect libxml parse warnings internally instead of printing them.
libxml_use_internal_errors(true);
$DOMParser = new \DOMDocument();
$DOMParser->loadHTML($html);
$mainlink = null;
foreach ($DOMParser->getElementsByTagName('p') as $paragraph) {
    // Skip paragraphs whose class attribute is missing or not an exact match.
    if (!$paragraph->hasAttribute('class')
        || $paragraph->getAttribute('class') != 'areapageDetailList_item_btn_hp') {
        continue;
    }
    foreach ($paragraph->getElementsByTagName('a') as $anchor) {
        // Later links overwrite earlier ones, so the last href wins.
        if ($anchor->hasAttribute('href')) {
            $mainlink = $anchor->getAttribute('href');
        }
    }
}
echo $mainlink;
I have a question regarding simplify my codes.
I have
/**
 * Applies inline presentation styles to an HTML fragment.
 *
 * If the fragment contains links, every <a> gets
 * ' text-decoration:none;color:black;' appended to its style and the document
 * is returned immediately (tables are then left untouched). Otherwise, if it
 * contains tables, every <input> is centred and every <table> receives a fixed
 * width, border and layout.
 *
 * @param string $text HTML fragment.
 * @return string The saveHTML() output when something was restyled, otherwise
 *                $text unchanged.
 */
public function getText($text){
    if (empty($text)) {
        return $text;
    }

    $dom = new DomDocument();
    $dom->loadHTML($text);

    $atags = $dom->getElementsByTagName('a');
    if ($atags->length > 0) {
        foreach ($atags as $atag) {
            $style = $atag->getAttribute('style');
            $atag->setAttribute('style', $style.' text-decoration:none;color:black;');
        }
        // Returned directly: the former `$returnText .=` appended to a variable
        // that was never initialised.
        return $dom->saveHTML();
    }

    $tables = $dom->getElementsByTagName('table');
    if ($tables->length > 0) {
        foreach ($dom->getElementsByTagName('input') as $input) {
            $input->setAttribute('style', 'text-align:center;');
        }
        foreach ($tables as $table) {
            $table->setAttribute('width', '500');
            $table->setAttribute('style', 'border:2px solid #8C8C8C;text-align:center;table-layout:fixed;');
        }
        return $dom->saveHTML();
    }

    return $text;
}
/**
 * Applies inline presentation styles to the HTML stored at $text[$index].
 *
 * If the fragment contains links, every <a> gets
 * ' text-decoration:none;color:black;' appended to its style and the document
 * is returned immediately. Otherwise, if it contains <tbody> elements, every
 * <input> is centred, every <td> gets a border, and each <tbody> is wrapped in
 * a newly created, styled <table>.
 *
 * @param array|string $text  Container holding the HTML fragment at $index.
 * @param int|string   $index Key of the fragment inside $text.
 * @return mixed The saveHTML() output when something was restyled, otherwise
 *               $text exactly as it was passed in.
 */
public function getTextwithIndex($text,$index=''){
    if (empty($text[$index])) {
        return $text;
    }

    $dom = new DomDocument();
    $dom->loadHTML($text[$index]);

    $atags = $dom->getElementsByTagName('a');
    if ($atags->length > 0) {
        foreach ($atags as $atag) {
            $style = $atag->getAttribute('style');
            $atag->setAttribute('style', $style.' text-decoration:none;color:black;');
        }
        // Returned directly: the former `$returnText .=` appended to a variable
        // that was never initialised.
        return $dom->saveHTML();
    }

    $tbodies = $dom->getElementsByTagName('tbody');
    if ($tbodies->length > 0) {
        foreach ($dom->getElementsByTagName('input') as $input) {
            $input->setAttribute('style', 'text-align:center;');
        }
        foreach ($dom->getElementsByTagName('td') as $cell) {
            $cell->setAttribute('style', 'border:1px solid black;');
        }
        // Snapshot the live node list first: the loop below restructures the tree.
        foreach (iterator_to_array($tbodies, false) as $tbody) {
            $table = $dom->createElement('table');
            $table->setAttribute('width', '500');
            $table->setAttribute('style', 'border:2px solid #8C8C8C;text-align:center;table-layout:fixed;');
            // Put the new <table> where the <tbody> was, then move the <tbody> into it.
            $tbody->parentNode->replaceChild($table, $tbody);
            $table->appendChild($tbody);
        }
        return $dom->saveHTML();
    }

    return $text;
}
The difference between the method is $index and some modification of my domdocument. I feel like it's really cumbersome and could use some refactoring. Does anyone have any good suggestions? Thanks!
How about something like this:
/**
 * Formats one entry of $text, or $text itself when no index is given, by
 * delegating to getText().
 *
 * @param array|string $text  HTML fragment, or a container of fragments.
 * @param int|string   $index Key inside $text; '' (the default) or null means "no index".
 * @return mixed Whatever getText() returns for the selected fragment.
 */
public function getTextwithIndex($text,$index='') {
    // 0 and '0' are valid keys, so empty() must not be used to detect "no index".
    if ($index === '' || $index === null) {
        return $this->getText($text); //not sure how $text works, so this line might be different.
    }
    // Methods must be called through $this; a bare getText() would look for a global function.
    return $this->getText($text[$index]);
}
Or something like this:
/**
 * Applies inline presentation styles to an HTML fragment, optionally taken
 * from $text[$index].
 *
 * If the fragment contains links, every <a> gets
 * ' text-decoration:none;color:black;' appended to its style and the document
 * is returned immediately. Otherwise:
 *  - without an index, every <input> is centred and every <table> is styled;
 *  - with an index, every <input> is centred, every <td> gets a border and
 *    each <tbody> is wrapped in a newly created, styled <table>.
 *
 * @param array|string   $text  HTML fragment, or a container of fragments when $index is given.
 * @param int|string|false $index Key inside $text; false (the default), null or '' means "no index".
 * @return mixed The saveHTML() output when something was restyled, otherwise the
 *               selected fragment unchanged.
 */
public function getText($text, $index = false){
    // 0 is a legitimate array key, so a plain truthiness test is not enough.
    $indexed = $index !== false && $index !== null && $index !== '';
    if ($indexed) {
        $text = isset($text[$index]) ? $text[$index] : '';
    }
    if (empty($text)) {
        return $text;
    }

    $dom = new DomDocument();
    $dom->loadHTML($text);

    $atags = $dom->getElementsByTagName('a');
    if ($atags->length > 0) {
        foreach ($atags as $atag) {
            $style = $atag->getAttribute('style');
            $atag->setAttribute('style', $style.' text-decoration:none;color:black;');
        }
        // Returned directly: the former `$returnText .=` appended to a variable
        // that was never initialised.
        return $dom->saveHTML();
    }

    // The indexed variant works on <tbody> elements, the plain one on <table>.
    $targets = $dom->getElementsByTagName($indexed ? 'tbody' : 'table');
    if ($targets->length === 0) {
        return $text;
    }

    foreach ($dom->getElementsByTagName('input') as $input) {
        $input->setAttribute('style', 'text-align:center;');
    }

    $tableStyle = 'border:2px solid #8C8C8C;text-align:center;table-layout:fixed;';
    if ($indexed) {
        foreach ($dom->getElementsByTagName('td') as $cell) {
            $cell->setAttribute('style', 'border:1px solid black;');
        }
        // Snapshot the live node list first: the loop below restructures the tree.
        foreach (iterator_to_array($targets, false) as $tbody) {
            $table = $dom->createElement('table');
            $table->setAttribute('width', '500');
            $table->setAttribute('style', $tableStyle);
            // Put the new <table> where the <tbody> was, then move the <tbody> into it.
            $tbody->parentNode->replaceChild($table, $tbody);
            $table->appendChild($tbody);
        }
    } else {
        foreach ($targets as $table) {
            $table->setAttribute('width', '500');
            $table->setAttribute('style', $tableStyle);
        }
    }

    return $dom->saveHTML();
}
I'm trying to add the results of a script to an array, but once I look into it there is only one item in it, probably me being silly with placement
/**
 * Recursively visits $url and the links found on it, up to $depth levels,
 * echoing each URL with its http_response() and writing $Linklist to
 * linkList.xml.
 *
 * shouldScrape() and http_response() are not defined in this snippet.
 *
 * NOTE(review): $Linklist is an ordinary local that starts empty on every call
 * (recursive calls included) and receives exactly one entry before being saved,
 * and every call rewrites linkList.xml — which matches the reported "only one
 * item" symptom.
 */
function crawl_page($url, $depth)
{
// Remembers visited URLs across calls because it is static.
static $seen = array();
$Linklist = array();
// Stop on already-visited URLs or when the depth budget is used up.
if (isset($seen[$url]) || $depth === 0) {
return;
}
$seen[$url] = true;
$dom = new DOMDocument('1.0');
// NOTE(review): the leading '#' turns the next line into a comment, so the page is never loaded and no anchors are found — presumably '@' was intended; confirm.
#$dom->loadHTMLFile($url);
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $element) {
$href = $element->getAttribute('href');
// Hrefs that do not start with 'http' are appended to the current URL.
if (0 !== strpos($href, 'http')) {
$href = rtrim($url, '/') . '/' . ltrim($href, '/');
}
if(shouldScrape($href)==true)
{
crawl_page($href, $depth - 1);
}
}
echo "URL:",$url;
echo http_response($url);
echo "<br/>";
$Linklist[] = $url;
// Serialise $Linklist as <Links><Linkdetails>…</Linkdetails></Links>.
$XML = new DOMDocument('1.0');
$XML->formatOutput = true;
$root = $XML->createElement('Links');
$root = $XML->appendChild($root);
foreach ($Linklist as $value)
{
$child = $XML->createElement('Linkdetails');
$child = $root->appendChild($child);
$text = $XML->createTextNode($value);
$text = $child->appendChild($text);
}
$XML->save("linkList.xml");
}
$Linklist[] = $url; will add a single item to the $Linklist array. This line needs to be in a loop I think.
I think you need `static $Linklist = array();` so the list persists across the recursive calls, but the code is awful.