From a string that contain a tei file, I generate an index to navigate to their blocks, I retrieve all the div tags, I also want to get, if present, the content of a tag (the tag <head>) inside current div.
Example tei file:
<div type="lib" n="1"><head>LIBER I</head>...
<div type="pr">...</div>
<div type="cap" n="1"><head>CAP EX</head><p><milestone unit="par" n="1" />...<milestone unit="par" n="2" />...</div>
<div type="cap" n="2"><head>CAP EX</head><milestone unit="par" n="1" />...<milestone unit="par" n="2" />...</div>
</div>
I tried this but don't work:
//source file:
$fulltext = '<div type="lib" n="1"><head>LIBER I</head>...<div type="pr">...</div><div type="cap" n="1"><head>CAP EX</head><p><milestone unit="par" n="1" />...<milestone unit="par" n="2" />...</div><div type="cap" n="2"><head>CAP EX</head><milestone unit="par" n="1" />...<milestone unit="par" n="2" />...</div></div>';
$dom = new DOMDocument();
#$dom->loadHTML($fulltext);
$domx = new DOMXPath($dom);
$entries = $domx->evaluate("//div");
echo '<ul>';
foreach ($entries as $entry){
$title = '';
type = $entry->getAttribute( 'type' );
$n = $entry->getAttribute( 'n' );
$head = $domx->evaluate("string(./head[1])",$entry);
if( $head != '' ) $title = $head; else $title = $n;
echo '<li><a href="#'.$type.'-'.$n.'">'.$title.'</li>';
}
echo '</ul>';
The line don't work:
$head = $domx->evaluate("string(./head[1])",$entry);
Error returned:
DOMDocument::loadHTML(): htmlParseStartTag: misplaced <head> tag in Entity, line: 3
The purpose of this line is to get the text of the child tag head inside the loop (in this example "LIBER I")
Using the # symbol on the load can hide all sorts of issues. So if you take it out you get errors with your document.
If however you change the line to
$dom->loadXML($fulltext);
The output gives you what your after.
Resolved using XMLReader:
$level = 0;
$indici_bc = array();
$indici_head = array();
$passed_milestone = false;
$xml = new XMLReader();
$xml->open($pathTei);
//$xml->xml($testo);
while ($xml->read()){
if($xml->nodeType == XMLReader::END_ELEMENT && $xml->name == 'div'){
$level--;
$last_blocco = $xml->name;
if($passed_milestone){ $level--; $passed_milestone = false; }
}
if($xml->nodeType == XMLReader::ELEMENT && ($xml->name == 'div' || $xml->name == 'milestone' )){
$blocco = $xml->name;
$type = $xml->getAttribute('type');
$n = $xml->getAttribute('n');
$unit = isset($xml->getAttribute('unit')) ? $xml->getAttribute('unit') : '';
//here I get the child node
$node = new SimpleXMLElement($xml->readOuterXML());
$head = $node->head ? (string)$node->head : '';
$indici_head[] = $head;
if($last_blocco != 'milestone') $level++;
if($blocco == 'div') $bc[$level] = $n; else $bc[($level+1)] = $n;
$bc_str = '';
for($j=1;$j<$level;$j++){
if( $bc_str != '' ) $bc_str.='.';
$bc_str.=$bc[$j];
}
if( $bc_str != '' ) $bc_str.='.';
$bc_str.=$n;
$last_blocco = $xml->name;
if( $blocco == 'milestone' ) $passed_milestone = true;
$indici_bc[]=$bc_str;
}
}
$xml->close();
I have the jquery that i can run and console and finds the element.
$.get("http://www.roblox.com/groups/group.aspx?gid=2755722", function(webpage) {
if ($(webpage).find("#ctl00_cphRoblox_rbxGroupFundsPane_GroupFunds .robux").length) {
alert("Eureka I found it!")
} else {
alert("nope!")
}
})
<div id="ctl00_cphRoblox_rbxGroupFundsPane_GroupFunds" class="StandardBox" style="padding-right:0">
<b>Funds:</b>
<span class="robux" style="margin-left:5px">29</span>
<span class="tickets" style="margin-left:5px">45</span>
</div>
When i try to run it as PHP with functions and using DomDocuments to handle it all, it wont return anything when i decode it. (the following is all part of a class)
protected function xpath($url,$path)
{
libxml_use_internal_errors(true);
$dom = new DomDocument;
$dom->loadHTML($this->file_get_contents_curl($url));
$xpath = new DomXPath($dom);
return $xpath->query($path);
}
public function GetGroupStats($id)
{
$elements = array (
'Robux' => "//span[#id='ctl00_cphRoblox_rbxGroupFundsPane_GroupFunds .robux']",
'Tix' => "//span[#id='ctl00_cphRoblox_rbxGroupFundsPane_GroupFunds .tickets']",
);
$data = array();
foreach($elements as $name => $element)
{
foreach ($this->xpath('http://www.roblox.com/Groups/group.aspx?gid='.$id,$element) as $i => $node)
$data[$name] = $node->nodeValue;
}
return $data;
}
//File that includes the class and runs the function (ignore the login stuff because it isn't required for this situation)
<?php
$randomstuffdude = include 'RApi.php';
$GetAccessToken = $_GET['token'];
if ($GetAccessToken == "secrettoken6996") {
$rbxBot = new Roblox();
$rbxBot -> DoLogin();
$StatsArray = $rbxBot->GetGroupStats(2755722);
foreach ($StatsArray as $other => $array) {
echo $other . ' : ' . $array . ' / ';
}
} else {
echo "no";
}
?>
I am getting JSON data through visiting a link using PHP HTML DOM, but sometimes, I get an empty page so I want to know that how can I really check if page is empty so that I can skip it by using continue in for loop
I am checking it through :
if (empty($jsondata))
But I always get TRUE never gets false even if page is returned empty
Here is my code :
<?php
$prefix = $_POST['prefix'];
$start_product = $_POST['start_product'];
$end_product = $_POST['end_product'];
set_time_limit(0);
for ($i=$start_product; $i <= $end_product; $i++) {
include('simple_html_dom.php');
$prefix ="00";
$i= "11";
$jsondata = file_get_html('http://www.ewallpk.com/index.php?controller=search&q=A'.$prefix.$i.'&limit=10×tamp=1445547668758&ajaxSearch=1&id_lang=1');
if (!empty($jsondata)) {
$data = json_decode($jsondata, true);
$product = file_get_html($data[0]["product_link"]);
$product_name= "";
foreach($product->find('div[id=pb-left-column] h1') as $element) {
$product_name.=$element->innertext . '<br>';
}
$product_name = explode("_", $product_name);
$count = count($product_name);
if ($count < 3) {
$product_name=$product_name[0];
} else {
$product_name = "Error";
}
$product_description= "";
foreach($product->find('div[id=short_description_content]') as $element) {
$product_description.=$element->plaintext . '<br>';
}
$product_price= "";
foreach($product->find('p[class=our_price_display] span') as $element) {
$product_price.=$element->innertext . '<br>';
}
$image_link= "";
foreach($product->find('img[id=bigpic]') as $element) {
$image_link.=$element->src;
}
$content = file_get_contents($image_link);
file_put_contents('item_images/A'.$prefix.$i.'.jpg', $content);
echo "<strong>Product No : </strong> A".$prefix.$i."</br>";
echo "<strong>Product Name : </strong>".$product_name."</br>";
echo "<strong>Product Description : </strong>".$product_description;
echo "<strong>Product Price : </strong>".$product_price."</br></br></br>";
} else {
continue;
}
}
?>
You're probably getting some whitespace in the empty response, so trim it off before testing. You also should be using file_get_contents, since the response is not HTML.
$jsondata = file_get_contents('http://www.ewallpk.com/index.php?controller=search&q=A'.$prefix.$i.'&limit=10×tamp=1445547668758&ajaxSearch=1&id_lang=1');
$jsondata = trim($jsondata);
if (!empty($jsondata)) {
...
}
Trying to walk the dom for div and indent it as I go. It works,except there are duplicates. I could save to an array and check for duplicates, but wondering if there is an easier way. Thanks.
function dom_parse_div_tag($htmlfile)
{
libxml_use_internal_errors(true); // supresses dom warnings
$dom = new DOMDocument();
$dom->preserveWhiteSpace = false;
$dom->loadHTML($htmlfile);
$nodes = $dom->getElementsByTagName("div");
foreach ($nodes as $ii=>$node) {
echo "<br>";
$nodeclass = $node->attributes->getNamedItem('class');
if (isset($nodeclass))
echo "Class:" . $nodeclass->nodeValue ."<br>";
dom_child_node_print($node,0);
}
}
function dom_child_node_print($node,$level)
{
echo "<br>";
if($node->hasChildNodes()) {
$nclass = $node->attributes->getNamedItem('class');
if (isset($nclass))
echobr("Class:" . $nclass->nodeValue);
foreach ($node->childNodes as $ochildnode) {
if($ochildnode->hasChildNodes()) {
dom_child_node_print($ochildnode, $level + 1);
}
else {
if (trim($ochildnode->nodeValue) !== "") {
echo "Level$level," . strg_remove_linefeed($ochildnode->nodeValue) ."<br>";
}
}
}
}
}
The function below is designed to apply rel="nofollow" attributes to all external links and no internal links unless the path matches a predefined root URL defined as $my_folder below.
So given the variables...
$my_folder = 'http://localhost/mytest/go/';
$blog_url = 'http://localhost/mytest';
And the content...
internal
internal cloaked link
external
The end result, after replacement should be...
internal
internal cloaked link
external
Notice that the first link is not altered, since its an internal link.
The link on the second line is also an internal link, but since it matches our $my_folder string, it gets the nofollow too.
The third link is the easiest, since it does not match the blog_url, its obviously an external link.
However, in the script below, ALL of my links are getting nofollow. How can I fix the script to do what I want?
function save_rseo_nofollow($content) {
$my_folder = $rseo['nofollow_folder'];
$blog_url = get_bloginfo('url');
preg_match_all('~<a.*>~isU',$content["post_content"],$matches);
for ( $i = 0; $i <= sizeof($matches[0]); $i++){
if ( !preg_match( '~nofollow~is',$matches[0][$i])
&& (preg_match('~' . $my_folder . '~', $matches[0][$i])
|| !preg_match( '~'.$blog_url.'~',$matches[0][$i]))){
$result = trim($matches[0][$i],">");
$result .= ' rel="nofollow">';
$content["post_content"] = str_replace($matches[0][$i], $result, $content["post_content"]);
}
}
return $content;
}
Here is the DOMDocument solution...
$str = 'internal
internal cloaked link
external
external
external
external
';
$dom = new DOMDocument();
$dom->preserveWhitespace = FALSE;
$dom->loadHTML($str);
$a = $dom->getElementsByTagName('a');
$host = strtok($_SERVER['HTTP_HOST'], ':');
foreach($a as $anchor) {
$href = $anchor->attributes->getNamedItem('href')->nodeValue;
if (preg_match('/^https?:\/\/' . preg_quote($host, '/') . '/', $href)) {
continue;
}
$noFollowRel = 'nofollow';
$oldRelAtt = $anchor->attributes->getNamedItem('rel');
if ($oldRelAtt == NULL) {
$newRel = $noFollowRel;
} else {
$oldRel = $oldRelAtt->nodeValue;
$oldRel = explode(' ', $oldRel);
if (in_array($noFollowRel, $oldRel)) {
continue;
}
$oldRel[] = $noFollowRel;
$newRel = implode($oldRel, ' ');
}
$newRelAtt = $dom->createAttribute('rel');
$noFollowNode = $dom->createTextNode($newRel);
$newRelAtt->appendChild($noFollowNode);
$anchor->appendChild($newRelAtt);
}
var_dump($dom->saveHTML());
Output
string(509) "<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body>
internal
internal cloaked link
external
external
external
external
</body></html>
"
Try to make it more readable first, and only afterwards make your if rules more complex:
function save_rseo_nofollow($content) {
$content["post_content"] =
preg_replace_callback('~<(a\s[^>]+)>~isU', "cb2", $content["post_content"]);
return $content;
}
function cb2($match) {
list($original, $tag) = $match; // regex match groups
$my_folder = "/hostgator"; // re-add quirky config here
$blog_url = "http://localhost/";
if (strpos($tag, "nofollow")) {
return $original;
}
elseif (strpos($tag, $blog_url) && (!$my_folder || !strpos($tag, $my_folder))) {
return $original;
}
else {
return "<$tag rel='nofollow'>";
}
}
Gives following output:
[post_content] =>
internal
<a href="http://localhost/mytest/go/hostgator" rel=nofollow>internal cloaked link</a>
<a href="http://cnn.com" rel=nofollow>external</a>
The problem in your original code might have been $rseo which wasn't declared anywhere.
Try this one (PHP 5.3+):
skip selected address
allow manually set rel parameter
and code:
function nofollow($html, $skip = null) {
return preg_replace_callback(
"#(<a[^>]+?)>#is", function ($mach) use ($skip) {
return (
!($skip && strpos($mach[1], $skip) !== false) &&
strpos($mach[1], 'rel=') === false
) ? $mach[1] . ' rel="nofollow">' : $mach[0];
},
$html
);
}
Examples:
echo nofollow('something');
// will be same because it's already contains rel parameter
echo nofollow('something'); // ad
// add rel="nofollow" parameter to anchor
echo nofollow('something', 'localhost');
// skip this link as internall link
Using regular expressions to do this job properly would be quite complicated. It would be easier to use an actual parser, such as the one from the DOM extension. DOM isn't very beginner-friendly, so what you can do is load the HTML with DOM then run the modifications with SimpleXML. They're backed by the same library, so it's easy to use one with the other.
Here's how it can look like:
$my_folder = 'http://localhost/mytest/go/';
$blog_url = 'http://localhost/mytest';
$html = '<html><body>
internal
internal cloaked link
external
</body></html>';
$dom = new DOMDocument;
$dom->loadHTML($html);
$sxe = simplexml_import_dom($dom);
// grab all <a> nodes with an href attribute
foreach ($sxe->xpath('//a[#href]') as $a)
{
if (substr($a['href'], 0, strlen($blog_url)) === $blog_url
&& substr($a['href'], 0, strlen($my_folder)) !== $my_folder)
{
// skip all links that start with the URL in $blog_url, as long as they
// don't start with the URL from $my_folder;
continue;
}
if (empty($a['rel']))
{
$a['rel'] = 'nofollow';
}
else
{
$a['rel'] .= ' nofollow';
}
}
$new_html = $dom->saveHTML();
echo $new_html;
As you can see, it's really short and simple. Depending on your needs, you may want to use preg_match() in place of the strpos() stuff, for example:
// change the regexp to your own rules, here we match everything under
// "http://localhost/mytest/" as long as it's not followed by "go"
if (preg_match('#^http://localhost/mytest/(?!go)#', $a['href']))
{
continue;
}
Note
I missed the last code block in the OP when I first read the question. The code I posted (and basically any solution based on DOM) is better suited at processing a whole page rather than a HTML block. Otherwise, DOM will attempt to "fix" your HTML and may add a <body> tag, a DOCTYPE, etc...
Thanks #alex for your nice solution. But, I was having a problem with Japanese text. I have fixed it as following way. Also, this code can skip multiple domains with the $whiteList array.
public function addRelNoFollow($html, $whiteList = [])
{
$dom = new \DOMDocument();
$dom->preserveWhiteSpace = false;
$dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
$a = $dom->getElementsByTagName('a');
/** #var \DOMElement $anchor */
foreach ($a as $anchor) {
$href = $anchor->attributes->getNamedItem('href')->nodeValue;
$domain = parse_url($href, PHP_URL_HOST);
// Skip whiteList domains
if (in_array($domain, $whiteList, true)) {
continue;
}
// Check & get existing rel attribute values
$noFollow = 'nofollow';
$rel = $anchor->attributes->getNamedItem('rel');
if ($rel) {
$values = explode(' ', $rel->nodeValue);
if (in_array($noFollow, $values, true)) {
continue;
}
$values[] = $noFollow;
$newValue = implode($values, ' ');
} else {
$newValue = $noFollow;
}
// Create new rel attribute
$rel = $dom->createAttribute('rel');
$node = $dom->createTextNode($newValue);
$rel->appendChild($node);
$anchor->appendChild($rel);
}
// There is a problem with saveHTML() and saveXML(), both of them do not work correctly in Unix.
// They do not save UTF-8 characters correctly when used in Unix, but they work in Windows.
// So we need to do as follows. #see https://stackoverflow.com/a/20675396/1710782
return $dom->saveHTML($dom->documentElement);
}
<?
$str='internal
internal cloaked link
external';
function test($x){
if (preg_match('#localhost/mytest/(?!go/)#i',$x[0])>0) return $x[0];
return 'rel="nofollow" '.$x[0];
}
echo preg_replace_callback('/href=[\'"][^\'"]+/i', 'test', $str);
?>
Here is the another solution which has whitelist option and add tagret Blank attribute.
And also it check if there already a rel attribute before add a new one.
function Add_Nofollow_Attr($Content, $Whitelist = [], $Add_Target_Blank = true)
{
$Whitelist[] = $_SERVER['HTTP_HOST'];
foreach ($Whitelist as $Key => $Link)
{
$Host = preg_replace('#^https?://#', '', $Link);
$Host = "https?://". preg_quote($Host, '/');
$Whitelist[$Key] = $Host;
}
if(preg_match_all("/<a .*?>/", $Content, $matches, PREG_SET_ORDER))
{
foreach ($matches as $Anchor_Tag)
{
$IS_Rel_Exist = $IS_Follow_Exist = $IS_Target_Blank_Exist = $Is_Valid_Tag = false;
if(preg_match_all("/(\w+)\s*=\s*['|\"](.*?)['|\"]/",$Anchor_Tag[0],$All_matches2))
{
foreach ($All_matches2[1] as $Key => $Attr_Name)
{
if($Attr_Name == 'href')
{
$Is_Valid_Tag = true;
$Url = $All_matches2[2][$Key];
// bypass #.. or internal links like "/"
if(preg_match('/^\s*[#|\/].*/', $Url))
{
continue 2;
}
foreach ($Whitelist as $Link)
{
if (preg_match("#$Link#", $Url)) {
continue 3;
}
}
}
else if($Attr_Name == 'rel')
{
$IS_Rel_Exist = true;
$Rel = $All_matches2[2][$Key];
preg_match("/[n|d]ofollow/", $Rel, $match, PREG_OFFSET_CAPTURE);
if( count($match) > 0 )
{
$IS_Follow_Exist = true;
}
else
{
$New_Rel = 'rel="'. $Rel . ' nofollow"';
}
}
else if($Attr_Name == 'target')
{
$IS_Target_Blank_Exist = true;
}
}
}
$New_Anchor_Tag = $Anchor_Tag;
if(!$IS_Rel_Exist)
{
$New_Anchor_Tag = str_replace(">",' rel="nofollow">',$Anchor_Tag);
}
else if(!$IS_Follow_Exist)
{
$New_Anchor_Tag = preg_replace("/rel=[\"|'].*?[\"|']/",$New_Rel,$Anchor_Tag);
}
if($Add_Target_Blank && !$IS_Target_Blank_Exist)
{
$New_Anchor_Tag = str_replace(">",' target="_blank">',$New_Anchor_Tag);
}
$Content = str_replace($Anchor_Tag,$New_Anchor_Tag,$Content);
}
}
return $Content;
}
To use it:
$Page_Content = 'internal
internal
google
example
stackoverflow';
$Whitelist = ["http://yoursite.com","http://localhost"];
echo Add_Nofollow_Attr($Page_Content,$Whitelist,true);
WordPress decision:
function replace__method($match) {
list($original, $tag) = $match; // regex match groups
$my_folder = "/articles"; // re-add quirky config here
$blog_url = 'https://'.$_SERVER['SERVER_NAME'];
if (strpos($tag, "nofollow")) {
return $original;
}
elseif (strpos($tag, $blog_url) && (!$my_folder || !strpos($tag, $my_folder))) {
return $original;
}
else {
return "<$tag rel='nofollow'>";
}
}
add_filter( 'the_content', 'add_nofollow_to_external_links', 1 );
function add_nofollow_to_external_links( $content ) {
$content = preg_replace_callback('~<(a\s[^>]+)>~isU', "replace__method", $content);
return $content;
}
a good script which allows to add nofollow automatically and to keep the other attributes
function nofollow(string $html, string $baseUrl = null) {
return preg_replace_callback(
'#<a([^>]*)>(.+)</a>#isU', function ($mach) use ($baseUrl) {
list ($a, $attr, $text) = $mach;
if (preg_match('#href=["\']([^"\']*)["\']#', $attr, $url)) {
$url = $url[1];
if (is_null($baseUrl) || !str_starts_with($url, $baseUrl)) {
if (preg_match('#rel=["\']([^"\']*)["\']#', $attr, $rel)) {
$relAttr = $rel[0];
$rel = $rel[1];
}
$rel = 'rel="' . ($rel ? (strpos($rel, 'nofollow') ? $rel : $rel . ' nofollow') : 'nofollow') . '"';
$attr = isset($relAttr) ? str_replace($relAttr, $rel, $attr) : $attr . ' ' . $rel;
$a = '<a ' . $attr . '>' . $text . '</a>';
}
}
return $a;
},
$html
);
}