I am using Plesk version 11.0.9. I set up a cron job to update the database. The cron file runs once a day. Each time, I need a huge amount of data (around 20,000) to be updated in the database. But the cron file runs for only 5 minutes and then a timeout occurs. Because of this, the database is not updated correctly. I used the following code inside the cron file.
<?php
// Cron job: for every URL queued in c_url_details_crone, scrape the listing
// page, store the extracted fields in house_moredetails_crone and remove the
// queue row. A MySQL connection is expected to be open already (it is not
// created in this file).
//
// NOTE(review): if the 5-minute limit is enforced by the web server (job
// triggered over HTTP), set_time_limit() cannot lift it; schedule the script
// with the command-line PHP binary instead -- confirm how Plesk runs this job.

// Scraping thousands of pages takes far longer than the default limit.
set_time_limit(0);

// Scraped markup is rarely valid; collect libxml parse errors quietly.
libxml_use_internal_errors(true);

// Load the parser once, not once per row.
require_once 'simplehtmldom_1_51/simple_html_dom.php';

/**
 * Returns the text of every <$tag> element in an HTML fragment, each value
 * followed by $glue.
 *
 * @param mixed  $fragment HTML markup (string, or object convertible to string); may be empty/null.
 * @param string $tag      Element name to collect, e.g. 'td', 'th' or 'thead'.
 * @param string $glue     Separator appended after each value.
 * @return string Concatenated node values, or '' when the fragment is empty.
 */
function crone_collect_text($fragment, $tag, $glue = ',')
{
    $fragment = (string) $fragment;
    if ($fragment === '') {
        // Nothing to parse (table missing on this page).
        return '';
    }

    $dom = new DOMDocument;
    $dom->loadHTML($fragment);
    libxml_clear_errors();

    $text = '';
    foreach ($dom->getElementsByTagName($tag) as $cell) {
        $text .= $cell->nodeValue . $glue;
    }
    return $text;
}

$c = 0; // rows processed in this run

$query = mysql_query("SELECT id,detail_url,region,region_id FROM c_url_details_crone");
while ($ress = mysql_fetch_array($query)) {
    echo '---' . ($city = $ress['region']);
    echo "<br>";
    $url = $ress['detail_url'];
    $did = $ress['id'];

    $html = file_get_html($url);
    if (!$html) {
        // Page could not be fetched or parsed: keep the row queued for the next run.
        continue;
    }

    // Outer HTML of each details table, in page order. Kept as an array so that
    // commas inside the markup cannot shift the table positions.
    $tables = array();
    foreach ($html->find('table[class=table-style]') as $property) {
        $tables[] = (string) $property;
    }
    $tables = array_pad($tables, 4, '');

    // Index -1 asks simple_html_dom for the last match (null when none), so
    // nothing is carried over from the previous row.
    $extras = $html->find('table[class=table-style z-rooms]', -1);
    $description = $html->find('div[class=z-description]', -1);
    $propertyimages = $html->find('div[class=small-12 columns flat-columns z-block]', -1);

    // Image URLs, each prefixed with a comma (",src1,src2").
    $images1 = '';
    if ($propertyimages) {
        foreach ($propertyimages->find('img') as $element) {
            $images1 .= ',' . $element->src;
        }
    }

    // Column name => value for house_moredetails_crone.
    $row = array(
        'city'        => $city,
        'durl'        => $url,
        'a'           => crone_collect_text($tables[0], 'td'),
        'b'           => crone_collect_text($tables[0], 'th'),
        'c'           => crone_collect_text($tables[2], 'th'),
        'd'           => crone_collect_text($tables[2], 'td'),
        'f'           => crone_collect_text($tables[3], 'th'),
        'g'           => crone_collect_text($tables[2], 'thead'),
        'h'           => crone_collect_text($tables[3], 'thead'),
        'i'           => crone_collect_text($tables[1], 'thead'),
        'j'           => crone_collect_text($tables[1], 'th'),
        'k'           => crone_collect_text($tables[1], 'td'),
        'l'           => crone_collect_text($extras, 'td', '**#**'),
        'description' => (string) $description,
        'images'      => $images1,
    );

    // simple_html_dom holds circular references; release them explicitly so
    // memory does not grow across thousands of pages.
    $html->clear();
    unset($html, $extras, $description, $propertyimages);

    $c++;

    // Every value comes from a remote page, so escape all of them.
    $values = array();
    foreach ($row as $value) {
        $values[] = "'" . mysql_real_escape_string($value) . "'";
    }
    $inserted = mysql_query(
        "INSERT INTO house_moredetails_crone(" . implode(',', array_keys($row)) . ")
VALUES(" . implode(',', $values) . ")"
    );

    // Only drop the queue row once its data is safely stored.
    if ($inserted) {
        mysql_query("DELETE FROM c_url_details_crone WHERE id='" . mysql_real_escape_string($did) . "'");
    }
}
?>
Is there a better option than cron for updating the database? Or, if I keep using cron, is there a way to update the database without hitting the timeout? Waiting for your reply!
Related
I have html code something like this:
<p><i>i_text</i>,p_text</p>
i_text,p_text
I want to change all node values in this DOM element and keep all the tags:
i_changed_text,p_changed_text
my attempts)
// Attempt 1: rewrite nodeValue on every element of the parsed HTML document.
$html = '<p><i>i_text</i> p_text</p>';
$dom = new DOMDocument();
$dom->loadHTML($html);
// NOTE(review): both flags are assigned after loadHTML() has already parsed
// the markup -- presumably too late to influence parsing; confirm.
$dom->preserveWhiteSpace = false;
$dom->validateOnParse = true;
// '*' selects every element in the document.
$elements = $dom->getElementsByTagName('*');
foreach ($elements as $element) {
// Writing nodeValue on an element replaces its entire content with a single
// text node, which is why nested tags such as <i> do not survive.
$element->nodeValue = str_replace('_','_changed_',$element->nodeValue);
}
echo($dom->saveHTML());
output i_changed_text,p_changed_text
This code returns the correct text but does not keep the child nodes.
// Attempt 2: for each element, descend to its first leaf node and rewrite only that leaf.
$html = '<p><i>i_text</i>,p_text</p>';
$dom = new DOMDocument();
$dom->loadXML($html);
// NOTE(review): assigned after loadXML() has already run -- presumably no effect on parsing; confirm.
$dom->preserveWhiteSpace = false;
$dom->validateOnParse = true;
$elements = $dom->getElementsByTagName('*');
// Builds <dfn text="element">tag</dfn>; its serialized XML is substituted for the word "tag" below.
$elem = $dom->createElement('dfn', 'tag');
$attr = $dom->createAttribute('text');
$attr->value = 'element';
$elem->appendChild($attr);
$elements = $dom->getElementsByTagName('*');
foreach ($elements as $element) {
// Follows only the first child at each level, so later siblings (for example
// the ",p_text" text after <i>) are never reached.
while ($element->hasChildnodes()) {
$element = $element->childNodes->item(0);
}
$changed_value = str_replace('_','_changed_',$element->nodeValue);
$element->nodeValue = str_replace("tag", $dom->saveXML($elem), $changed_value);
}
echo ($dom->saveXML());
output
i_changed_text,p_text
This code keeps the child nodes and changes their values, but does not change the text in the parent node.
my solution)
i_text,p_text,a_text,another one_text
// Replace '_' with '_changed_' in every piece of text, at any nesting depth,
// while leaving all tags in place.
$html = '<p><i>i_text</i>,p_text<b>,a_text</b>,another one_text</p>';
$dom = new DOMDocument();
$dom->loadXML($html);
$xpath = new DOMXPath($dom);
// Edit text nodes only: writing nodeValue on an element would replace its
// child tags with plain text, which breaks markup nested more than one level deep.
foreach ($xpath->query('//text()') as $textNode) {
    $textNode->nodeValue = str_replace('_', '_changed_', $textNode->nodeValue);
}
echo ($dom->saveXML());
output
i_changed_text,p_changed_text,a_changed_text,another one_changed_text
I’m trying to scrape a table on Borsa Italiana
I use this code
<?php
// Downloads a Borsa Italiana data page and echoes the first two columns of its
// "m-table -clear-m" table as a new, minimal HTML table.
$url = "https://www.borsaitaliana.it/borsa/azioni/global-equity-market/dati-completi.html?isin=IT0001477402";
$html = file_get_contents($url);
if($html === false || $html === '')
{
    // Nothing to parse; stop before DOMDocument is handed an empty document.
    exit("Unable to download " . $url);
}
// Collect libxml parse errors internally instead of printing a warning per markup glitch.
libxml_use_internal_errors(true);
$doc = new \DOMDocument();
if($doc->loadHTML($html))
{
    $result = new \DOMDocument();
    $result->formatOutput = true;
    $table = $result->appendChild($result->createElement("table"));
    $tbody = $table->appendChild($result->createElement("tbody"));
    $xpath = new \DOMXPath($doc);
    // "@class" tests the class attribute of the table.
    foreach($xpath->query("//table[@class=\"m-table -clear-m\"]/tbody/tr") as $row)
    {
        $newRow = $tbody->appendChild($result->createElement("tr"));
        foreach($xpath->query("./td[position()>0 and position()<3]", $row) as $cell)
        {
            // createElement() does not escape its value argument; a text node
            // keeps '&' and '<' in the scraped text from breaking the output.
            $td = $newRow->appendChild($result->createElement("td"));
            $td->appendChild($result->createTextNode(trim($cell->nodeValue)));
        }
    }
    // Output only when the result document was actually built.
    echo $result->saveHTML($result->documentElement);
}
?>
The result is a table with two columns and several rows. I would like to transpose it so that the first column becomes the header row, in order to save the result in my database for my personal use.
Can anyone help me?
Thank you
Try it:
<?php
// Downloads a Borsa Italiana data page and echoes its two-column table
// transposed: first-column values become the header row, second-column values
// become a single data row.
$url = "https://www.borsaitaliana.it/borsa/azioni/global-equity-market/dati-completi.html?isin=IT0001477402";
$html = file_get_contents($url);
if ($html === false || $html === '') {
    // Nothing to parse; stop before DOMDocument is handed an empty document.
    exit("Unable to download " . $url);
}
// Collect libxml parse errors internally instead of printing warnings.
libxml_use_internal_errors(true);
$doc = new \DOMDocument();
if ($doc->loadHTML($html)) {
    $result = new \DOMDocument();
    $result->formatOutput = true;
    $xpath = new \DOMXPath($doc);
    // collects data in $arr -->
    $arr = [];
    // "@class" tests the class attribute of the table.
    foreach ($xpath->query("//table[@class=\"m-table -clear-m\"]/tbody/tr") as $row) {
        $itm = [];
        foreach ($xpath->query("./td[position()>0 and position()<3]", $row) as $cell) {
            $itm[] = trim($cell->nodeValue);
        }
        // Skip rows without any <td>: they would add a data cell with no
        // matching header cell and shift the columns.
        if ($itm !== []) {
            $arr[] = $itm;
        }
    }
    // <--
    $table = $result->appendChild($result->createElement("table"));
    // outputs head -->
    $thead = $table->appendChild($result->createElement("thead"));
    $newRow = $thead->appendChild($result->createElement("tr"));
    foreach ($arr as $row) {
        // Text nodes escape '&' and '<'; createElement()'s value argument does not.
        $th = $newRow->appendChild($result->createElement("th"));
        $th->appendChild($result->createTextNode($row[0]));
    }
    // <--
    // outputs data -->
    $tbody = $table->appendChild($result->createElement("tbody"));
    $newRow = $tbody->appendChild($result->createElement("tr"));
    foreach ($arr as $row) {
        $td = $newRow->appendChild($result->createElement("td"));
        $td->appendChild($result->createTextNode(isset($row[1]) ? $row[1] : ""));
    }
    // <--
    // Output only when the result document was actually built.
    echo $result->saveHTML($result->documentElement);
}
But I agree with @tim - you have to use an API for that.
I am trying to grab a URL with DOMDocument, but I am stuck at getNamedItem.
How can I solve this problem? What am I missing here? Any ideas are welcome!
// Looks for <p class="c-name"> elements and tries to read an href attribute from them.
$url = 'https://www.31sumai.com/search/area/kansai/result/?area=16,17,18';
$html = file_get_contents($url);
// Collect libxml parse errors internally instead of emitting warnings.
libxml_use_internal_errors(true);
$DOMParser = new \DOMDocument();
$DOMParser->loadHTML($html);
$mainlink = null;
$allPTags = $DOMParser->getElementsByTagName('p');
foreach ($allPTags as $ptag) {
$class = $ptag->attributes->getNamedItem("class");
// Exact comparison: the class attribute must be the single value 'c-name'.
if ($class && $class->nodeValue == 'c-name') {
// NOTE(review): href is read from the <p> element itself. If the link sits on
// a nested <a>, getNamedItem() returns null here and $mainlink stays null --
// confirm against the page markup.
$main = $ptag->attributes->getNamedItem("href");
if ($main) {
$mainlink = $main->nodeValue;
}
}
}
var_dump($mainlink);
It is returning null, but I have already checked the website and there is a URL in that tag.
// Fetches the page and prints the href of the last <a> found inside a
// <p class="areapageDetailList_item_btn_hp"> element.
$url = 'https://lions-mansion.jp/area/kansai/';
$html = file_get_contents($url);
// Collect libxml parse errors internally instead of emitting warnings.
libxml_use_internal_errors(true);
$DOMParser = new \DOMDocument();
$DOMParser->loadHTML($html);
$mainlink = null;
foreach ($DOMParser->getElementsByTagName('p') as $paragraph) {
    // Skip paragraphs whose class attribute is not exactly the button class.
    if ($paragraph->getAttribute('class') != 'areapageDetailList_item_btn_hp') {
        continue;
    }
    foreach ($paragraph->getElementsByTagName('a') as $anchor) {
        if (!$anchor->hasAttribute('href')) {
            continue;
        }
        // Later anchors overwrite earlier ones, so the last href wins.
        $mainlink = $anchor->getAttribute('href');
    }
}
echo $mainlink;
Using PHP i want to remove all HTML attributes except
"src" attribute from "img" tag
and
"href" attribute from "a" tag
My input file is an .html file which has been converted from .doc and .docx
My output file again should be HTML file with removed attribute
Kindly help me please
Edit ::
After trying Alexander's script as below, if I open strip.html in a code editor I don't see any changes
<?php
// Removes every attribute from <img> except "src" and from <a> except "href",
// then writes the document back to the same file.
$path = '/var/www/strip.html';
$html = file_get_contents($path);
if (false === $html) die('Error: cannot read ' . $path);
$dom = new DOMDocument();
$dom->strictErrorChecking = false;
$dom->formatOutput = true;
// Collect libxml parse errors internally instead of emitting one warning per markup problem.
libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
$xpath = new DOMXPath($dom);
// Tag name => the single attribute that tag may keep.
$keep = array('img' => 'src', 'a' => 'href');
foreach ($keep as $tag => $allowed) {
    if (false === ($elements = $xpath->query('//' . $tag))) die('Error');
    foreach ($elements as $element) {
        // Walk backwards: removing an attribute shifts the indexes of the ones after it.
        for ($i = $element->attributes->length; --$i >= 0;) {
            $name = $element->attributes->item($i)->name;
            if ($allowed !== $name) {
                $element->removeAttribute($name);
            }
        }
    }
}
// saveHTMLFile() returns false when nothing could be written (for example a
// read-only file); report that instead of leaving the file silently unchanged.
if (false === $dom->saveHTMLFile($path)) die('Error: cannot write ' . $path);
?>
Use DOMDocument class for parsing HTML ("a" and "img" tags processing):
// Removes every attribute from <img> except "src" and from <a> except "href",
// then writes the document back to the same file.
$path = '/path/to/file.html';
$html = file_get_contents($path);
if (false === $html) die('Error: cannot read ' . $path);
$dom = new DOMDocument();
//$dom->strictErrorChecking = false;
$dom->formatOutput = true;
// Collect libxml parse errors internally instead of emitting one warning per markup problem.
libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
$xpath = new DOMXPath($dom);
// Tag name => the single attribute that tag may keep.
$keep = array('img' => 'src', 'a' => 'href');
foreach ($keep as $tag => $allowed) {
    if (false === ($elements = $xpath->query('//' . $tag))) die('Error');
    foreach ($elements as $element) {
        // Walk backwards: removing an attribute shifts the indexes of the ones after it.
        for ($i = $element->attributes->length; --$i >= 0;) {
            $name = $element->attributes->item($i)->name;
            if ($allowed !== $name) {
                $element->removeAttribute($name);
            }
        }
    }
}
// saveHTMLFile() returns false when nothing could be written; report it
// instead of leaving the file silently unchanged.
if (false === $dom->saveHTMLFile($path)) die('Error: cannot write ' . $path);
Also, read why you can't parse [X]HTML with regex and take a look at useful xpath links.
Update (all tags with exception "a" and "img" attributes processing):
// Removes every attribute from every element, keeping only "src" on <img> and
// "href" on <a>, then writes the document back to the same file.
$path = '/path/to/file.html';
$html = file_get_contents($path);
if (false === $html) die('Error: cannot read ' . $path);
$dom = new DOMDocument();
//$dom->strictErrorChecking = false;
$dom->formatOutput = true;
// Collect libxml parse errors internally instead of emitting one warning per markup problem.
libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
$xpath = new DOMXPath($dom);
if (false === ($elements = $xpath->query("//*"))) die('Error');
foreach ($elements as $element) {
    // Walk backwards: removing an attribute shifts the indexes of the ones after it.
    for ($i = $element->attributes->length; --$i >= 0;) {
        $name = $element->attributes->item($i)->name;
        if (('img' === $element->nodeName && 'src' === $name)
            || ('a' === $element->nodeName && 'href' === $name)
        ) {
            continue;
        }
        $element->removeAttribute($name);
    }
}
// saveHTMLFile() returns false when nothing could be written; report it
// instead of leaving the file silently unchanged.
if (false === $dom->saveHTMLFile($path)) die('Error: cannot write ' . $path);
I tried to concatenate the inner HTML of a div into a string variable:
games variable:
// Global string that is echoed after ExtractFromType() has been called.
$games = '';
DOMinnerHTML function:
/**
 * Serializes the children of a DOM node to one HTML string.
 *
 * Each child is deep-copied into its own scratch DOMDocument, rendered with
 * saveHTML(), trimmed, and the pieces are joined in document order.
 *
 * @param DOMNode $element Node whose children are rendered.
 * @return string Concatenated HTML of the children ('' when there are none).
 */
function DOMinnerHTML($element)
{
    $pieces = array();
    for ($node = $element->firstChild; $node !== null; $node = $node->nextSibling)
    {
        $scratch = new DOMDocument();
        $copy = $scratch->importNode($node, true);
        $scratch->appendChild($copy);
        $pieces[] = trim($scratch->saveHTML());
    }
    return implode('', $pieces);
}
ExtractFromType function:
/**
 * Collects the inner HTML of every <div> whose style attribute contains 'MyString'.
 *
 * The result is returned; a global $games is not visible inside the function,
 * so callers must use the return value.
 *
 * @param string $type Path segment appended to 'www.site.com/'.
 * @return string Concatenated inner HTML of the matching divs, or '' when
 *                nothing matches or the page cannot be read.
 */
function ExtractFromType($type)
{
// NOTE(review): the address has no URL scheme, so PHP treats it as a local
// path -- presumably a placeholder for the real site; confirm.
$html = file_get_contents('www.site.com/' .$type);
if ($html === false || $html === '') {
return '';
}
$dom = new domDocument;
$dom->preserveWhiteSpace = false;
// Collect libxml parse errors quietly, then restore the caller's setting.
$previous = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous);
$games = '';
$divs = $dom->getElementsByTagName('div');
foreach ($divs as $div) {
if (strpos($div->getAttribute('style'),'MyString') !== false) {
$games .= DOMinnerHTML($div);
}
}
return $games;
}
code:
// Calls the function, then echoes the global $games. The function only ever
// sees its own local $games, so the global is never written to.
ExtractFromType('MyType');
echo $games; // = Nothing.
This code returns nothing.
$games is defined in the global scope, and it's not available inside ExtractFromType. Define it inside the function, then return the value:
/**
 * Collects the inner HTML of every <div> whose style attribute contains 'MyString'.
 *
 * @param string $type Path segment appended to 'www.site.com/'.
 * @return string Concatenated inner HTML of the matching divs, or '' when
 *                nothing matches or the page cannot be read.
 */
function ExtractFromType($type) {
// NOTE(review): the address has no URL scheme, so PHP treats it as a local
// path -- presumably a placeholder for the real site; confirm.
$html = file_get_contents('www.site.com/' .$type);
if ($html === false || $html === '') {
return '';
}
$dom = new domDocument;
$dom->preserveWhiteSpace = false;
// Collect libxml parse errors quietly, then restore the caller's setting.
$previous = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous);
$divs = $dom->getElementsByTagName('div');
$games = '';
foreach ($divs as $div) {
if (strpos($div->getAttribute('style'),'MyString') !== false) {
$games .= DOMinnerHTML($div);
}
}
// Hand the accumulated markup back to the caller.
return $games;
}
echo ExtractFromType('MyType');