I am using DOMDocument() to load the text of html page.
It takes a long time to load the page. Does that mean it also downloads the images?
Are there any alternatives or solutions for loading an HTML page from a URL faster?
I am using DOMDocument() to basically extract meta description, title text, body text etc.
Solution with working code will be highly appreciated.
<?php
// Crawl set-up: decide which page to crawl, load its HTML and collect every
// <a> element's href ($urlarray) and text ($urlanchor), index-aligned.
set_time_limit(0);
include "connection.php";
error_reporting(E_ERROR | E_PARSE);
// Create a document instance
$doc = new DOMDocument();
if (!isset($_GET['url'])) {
    // No URL in the request: use the one supplied by the including script and
    // remember that per-link HTML output must be suppressed.
    $_GET['url'] = $urlfromdaemon;
    $silentcrawl = "set";
}
// loadHTMLFile() accepts any stream wrapper or local path, so an unchecked
// request parameter could read local files (file://, php://, plain paths).
// Only fetch when the URL is an absolute http(s) URL with a host.
$page_parts = parse_url($_GET['url']);
$page_scheme = isset($page_parts['scheme']) ? strtolower($page_parts['scheme']) : '';
$page_host = isset($page_parts['host']) ? $page_parts['host'] : '';
if (($page_scheme === 'http' || $page_scheme === 'https') && $page_host !== '') {
    // This is the only place where the page is downloaded; the call fetches
    // the HTML document itself.
    $doc->loadHTMLFile($_GET['url']);
}
// When nothing was loaded the document stays empty, no links are collected and
// the rest of the script reports the URL as unusable.
$base_url = 'http://'.$page_host.'/';
//Searches for all elements with the "a" tag name
$tit = $doc->getElementsByTagName("a");
$urlarray = array();
$urlanchor = array(); // reset explicitly so no stale anchors survive a previous run
foreach ($tit as $x) {
    $urlarray[] = $x->getAttribute('href');
    $urlanchor[] = $x->nodeValue;
}
//This makes the URL with spaces work correctly
for($i=0;$i<count($urlarray);$i++){
    $urlarray[$i]= str_ireplace(" ","%20",$urlarray[$i]);
}
// Rewrite every collected href in place so that it carries a scheme.
// The branches below are kept in their original order; only the header probe
// was repaired (it had been garbled into a '#' comment, which made the file
// unparsable).
for($i=0;$i<count($urlarray);$i++){
    $result=stristr(substr($urlarray[$i], 0, 7),"http://");
    if($result==''){
        if(stristr(substr($urlarray[$i], 0, 8),"https://")!=''){
            // Already starts with https:// - leave untouched.
        }
        else if(stristr(substr($urlarray[$i], 0, 2),"//")!=''){
            // Protocol-relative link: give it the http scheme.
            $urlarray[$i]= 'http:'.$urlarray[$i];
        }
        else if(stristr(substr($urlarray[$i], 0, 4),"www.")==''){
            //critical code section
            // Probe the href as if it were a host name. If no headers come
            // back it is treated as a path relative to the crawled page.
            // Warnings from a failed probe are already silenced by the
            // error_reporting() level set at the top of the script.
            $urlcheck='http://'.$urlarray[$i];
            $headers = get_headers($urlcheck, 1);
            if ($headers === FALSE) { //Test for differentiate example.com with example .
                if(substr($_GET['url'],-1)=='/'){
                    // Page URL ends with a slash: append the href directly.
                    $urlarray[$i]= $_GET['url'].$urlarray[$i];
                }
                else{
                    if(parse_url($_GET['url'], PHP_URL_PATH)=='/'){
                        // basename('/') is empty, so the page URL is used unchanged as prefix.
                        $trim=basename(parse_url($_GET['url'], PHP_URL_PATH));
                        $urlarray[$i]= str_ireplace($trim,"",$_GET['url']).$urlarray[$i];
                    }
                    else if(substr(str_ireplace(basename($_GET['url']),"",$_GET['url']),-1)=='/'){
                        if(substr(str_ireplace(basename($_GET['url']),"",$_GET['url']),-7)=='http://'){
                            // Removing the last segment left only "http://": the page URL
                            // has no path, so join with an explicit slash.
                            $trim=basename(parse_url($_GET['url'], PHP_URL_PATH));
                            $urlarray[$i]= str_ireplace($trim,"",$_GET['url']).'/'.$urlarray[$i];
                        }
                        else{
                            // Drop the last path segment of the page URL and append the href.
                            $trim=basename(parse_url($_GET['url'], PHP_URL_PATH));
                            $urlarray[$i]= str_ireplace($trim,"",$_GET['url']).$urlarray[$i];
                        }
                    }
                    else{
                        $trim=basename(parse_url($_GET['url'], PHP_URL_PATH));
                        $urlarray[$i]= str_ireplace($trim,"",$_GET['url']).'/'.$urlarray[$i];
                    }
                }
            }
            else {
                // The probe answered: the href is a host name without a scheme.
                $urlarray[$i]= 'http://'.$urlarray[$i];
            }
            //
        }
        else{
            // Starts with "www.": only the scheme is missing.
            $urlarray[$i]='http://'.$urlarray[$i];
        }
    }
}
// Probe every collected link, show its status (unless crawling silently) and
// record reachable links in the `links` table together with the page that
// referred to them and the anchor text used.
//
// All SQL goes through prepared statements: the URLs and anchor texts come
// from a remote page and must never be spliced into the query text.
$page_url = $_GET['url'];
$select_link = mysqli_prepare($con, "SELECT referer, anchor_pool FROM links WHERE url = ?");
$insert_link = mysqli_prepare($con, "INSERT INTO links (url, referer, anchor_pool) VALUES (?, ?, ?)");
$update_link = mysqli_prepare($con, "UPDATE links SET referer = ?, anchor_pool = ? WHERE url = ?");
$db_ready = $select_link && $insert_link && $update_link;

for ($i = 0; $i < count($urlarray); $i++) {
    $file = $urlarray[$i];
    // Warnings from failed requests are already silenced by the
    // error_reporting() level set at the top of the script.
    $file_headers = get_headers($file);
    // A link is broken when no response arrived at all, or when the first
    // status line reports 404 - whatever the HTTP version in that line.
    $is_broken = $file_headers === false
        || !isset($file_headers[0])
        || preg_match('#^HTTP/\S+\s+404\b#', $file_headers[0]) === 1;

    if (!isset($silentcrawl)) {
        $icon = $is_broken ? 'cross.png' : 'tick.png';
        echo '<img style="width:20px;height:20px;float:left;" src="'.$icon.'" > '
            .htmlspecialchars($file, ENT_QUOTES, 'UTF-8').'<br><br>';
    }
    if ($is_broken || !$db_ready) {
        continue;
    }

    //Insert Active Links into the database
    $anchor_text = isset($urlanchor[$i]) ? $urlanchor[$i] : '';

    // Fetch every stored row for this URL up front so the statement is free
    // again before any UPDATE is executed.
    mysqli_stmt_bind_param($select_link, 's', $file);
    mysqli_stmt_execute($select_link);
    mysqli_stmt_store_result($select_link);
    mysqli_stmt_bind_result($select_link, $stored_referer, $stored_anchors);
    $existing = array();
    while (mysqli_stmt_fetch($select_link)) {
        $existing[] = array(
            'referer' => (string) $stored_referer,
            'anchor_pool' => (string) $stored_anchors,
        );
    }
    mysqli_stmt_free_result($select_link);

    if (count($existing) == 0) {
        mysqli_stmt_bind_param($insert_link, 'sss', $file, $page_url, $anchor_text);
        mysqli_stmt_execute($insert_link);
        continue;
    }

    foreach ($existing as $row) {
        // Referers are stored as one space-separated string.
        $referers = explode(" ", $row['referer']);
        // Skip when this page is already listed, or when the link points to
        // the page itself.
        if (in_array($page_url, $referers, true) || $page_url === $file) {
            continue;
        }
        $newreferer = $row['referer']." ".$page_url;
        // Merge the new anchor words into the stored pool, keeping each word once.
        $anchors = array_values(array_unique(explode(" ", $row['anchor_pool'].' '.$anchor_text)));
        $final_anchor = implode(' ', $anchors);
        mysqli_stmt_bind_param($update_link, 'sss', $newreferer, $final_anchor, $file);
        mysqli_stmt_execute($update_link);
    }
}

if ($select_link) {
    mysqli_stmt_close($select_link);
}
if ($insert_link) {
    mysqli_stmt_close($insert_link);
}
if ($update_link) {
    mysqli_stmt_close($update_link);
}
// Tell the user when not a single usable link was collected.
$errors = array_filter($urlarray);
if (empty($errors)) {
    echo "Either the URL is down or page contains no Links !, Try entering URL along with protocol used.";
}

// Make sure the crawled page itself has a row in `links`; if it already has
// one, mark it as live when it currently answers with status 200.
// The URL comes straight from the request, so it is only ever passed to the
// database as a bound parameter.
$prime = $_GET['url'];
$count = 0;
$prime_lookup = mysqli_prepare($con, "SELECT COUNT(*) FROM links WHERE url = ?");
if ($prime_lookup) {
    mysqli_stmt_bind_param($prime_lookup, 's', $prime);
    mysqli_stmt_execute($prime_lookup);
    mysqli_stmt_bind_result($prime_lookup, $count);
    mysqli_stmt_fetch($prime_lookup);
    mysqli_stmt_close($prime_lookup);

    if ($count == 0) {
        $prime_insert = mysqli_prepare(
            $con,
            "INSERT INTO links (url,referer,anchor_pool,backlinks,status) VALUES (?,'','','0','1')"
        );
        if ($prime_insert) {
            mysqli_stmt_bind_param($prime_insert, 's', $prime);
            mysqli_stmt_execute($prime_insert);
            mysqli_stmt_close($prime_insert);
        }
    }
    else {
        // Warnings from a failed request are already silenced by the
        // error_reporting() level set at the top of the script.
        $file_headers = get_headers($prime);
        // Accept a 200 status line regardless of HTTP version or reason phrase.
        if ($file_headers !== false
            && isset($file_headers[0])
            && preg_match('#^HTTP/\S+\s+200\b#', $file_headers[0]) === 1) {
            $prime_update = mysqli_prepare($con, "UPDATE links SET status='1' WHERE url = ?");
            if ($prime_update) {
                mysqli_stmt_bind_param($prime_update, 's', $prime);
                mysqli_stmt_execute($prime_update);
                mysqli_stmt_close($prime_update);
            }
        }
    }
}
// Recompute `backlinks` for every stored link as the number of distinct hosts
// among its referers (stored as one space-separated string).
$res = mysqli_query($con, "SELECT * from links ");
$backlink_update = mysqli_prepare($con, "UPDATE links SET backlinks = ? WHERE url = ?");
while ($res && $backlink_update && ($row = mysqli_fetch_array($res))) {
    // Collect hosts as array keys so duplicates collapse automatically.
    // Iterating with foreach matters: array_filter() keeps the original keys,
    // so an index loop over count() would skip entries whenever the referer
    // string starts with a space (which happens for rows first inserted with
    // an empty referer).
    $bkarray = array();
    foreach (array_filter(explode(" ", (string) $row['referer'])) as $referer) {
        $host = parse_url($referer, PHP_URL_HOST);
        if ($host) {
            $bkarray[$host] = true;
        }
    }
    $bk = count($bkarray);
    // Stored URLs may contain quotes, so the row is addressed through a bound
    // parameter.
    $row_url = $row['url'];
    mysqli_stmt_bind_param($backlink_update, 'is', $bk, $row_url);
    mysqli_stmt_execute($backlink_update);
}
if ($backlink_update) {
    mysqli_stmt_close($backlink_update);
}
?>
The DOMDocument class is an HTML/XML parser. Period.
The code you haven't shared probably makes use of PHP stream wrappers to transparently download a remote resource via HTTP using the same syntax as loading local files. That's an entirely different task. As far as I know, PHP does not bundle a full-featured web crawler as part of its builtin libraries.
Edit: Here's where the complete download takes place:
$doc->loadHTMLFile($_GET['url']);
Everything after this line is not caused by download issues.
Related
I would like to scrape the Google search results up to page 2, but my website either returns a blank page or times out.
// For each of the $acount rounds: wait a minute, write $data to the work file
// once per result page (start=0 truncates the file, start=10 appends), then
// parse the work file and walk over its links.
for ($j = 0; $j < $acount; $j++) {
    sleep(60);
    for ($sp = 0; $sp <= 10; $sp += 10) {
        $url = 'http://www.google.'.$lang.'/search?q='.$in.'&start='.$sp;
        // Second page is appended to the first; the first page starts a fresh file.
        $mode = ($sp == 10) ? "a+" : "w+";
        $datenbank = "proxy_work.php";
        $datei = fopen($datenbank, $mode);
        fwrite($datei, $data);
        fwrite($datei, "\r\n");
        fclose($datei);
    }
    $html = file_get_html("proxy_work.php");
    foreach ($html->find('a') as $e) {
        // $title = $h3->innertext;
        $link = $e->href;
        if (in_array($endomain, $approveurl)) {
        }
        if (preg_match('/^https?/', $link)) {
            // Direct link: nothing to unwrap.
            continue;
        }
        // Not a direct link: keep it only when a real URL is wrapped inside
        // its q= parameter, otherwise skip it.
        if (preg_match('/q=(.+)&sa=/U', $link, $matches) && preg_match('/^https?/', $matches[1])) {
            $link = $matches[1];
        }
    }
}
Google search result pages (SERPs) are not like a common website with static HTML. Google protects its data against web scraping. Treat it like a business directory and see the following tips for scraping business directories:
IP-proxying.
Imitating human behaviour by using some browser automation tools (Selenium, iMacros and others).
Read more here.
How do I add a statement so that, when I search and the term does not exist at the URL, nothing.html is shown?
// Build the remote search URL. The search terms come from the request, so
// they are URL-encoded before being placed in the query string; otherwise a
// space or '&' in the input would corrupt the request.
$url1 = "http://www.pengadaan.net/tend_src_cont2.php?src_nm=";
$url2 = urlencode(isset($_GET['src_nm']) ? $_GET['src_nm'] : '')."&src_prop=";
$url3 = urlencode(isset($_GET['src_prop']) ? $_GET['src_prop'] : '');
$url = $url1.$url2.$url3;
$html = file_get_html($url);
// Check that a parsed document came back before calling methods on it;
// method_exists() must not be handed a non-object such as false.
if (is_object($html) && method_exists($html,"find")) {
    echo "<ul>";
    foreach($html->find('div[class=pengadaan-item] h1[] a[]') as $element ) {
        echo ("<li>".$element."</li>");
    }
    echo "</ul>";
    // The URL contains request data: escape it for HTML output.
    echo htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
}
else {
    // The page could not be loaded or parsed; nothing is shown.
}
There are two ways to move to another page in PHP. You can call header("Location: http://www.yourwebsite.com/nothing.php"); or you can have PHP echo JavaScript to do a redirect (if output has already been sent, so headers can no longer be set):
if (method_exists($html,"find")) { // If 'find exist'
...
} else { // Otherwise it does not exist
header("Location: http://www.pengadaan.net/nothing.php"); // redirect here
}
Or, if you have already sent your headers, you can get around it using JavaScript:
...
} else {
echo '<script>window.location.replace("http://www.pengadaan.net/nothing.php")</script>';
}
Here is what I want to do..
Let's say I am looking for the link "example.com" in a file at "http://example.com/test.html".
I want a PHP script that looks for an <a> tag pointing to that link in the mentioned page. However, I also need it to work if there is a class or ID attribute in the <a> tag.
See below url
How can I check if a URL exists via PHP?
or try it
$file = 'http://www.domain.com/somefile.jpg';
// The warning of a failed request is suppressed on purpose: failure is
// handled explicitly through the false return value below.
$file_headers = @get_headers($file);
// Missing when no response arrived at all, or when the first status line is
// a 404 (whatever the HTTP version reported in that line).
if ($file_headers === false
    || !isset($file_headers[0])
    || preg_match('#^HTTP/\S+\s+404\b#', $file_headers[0]) === 1) {
    $exists = false;
}
else {
    $exists = true;
}
From here: http://www.php.net/manual/en/function.file-exists.php#75064
...and right below the above post, there's a curl solution:
/**
 * Check whether a URL answers with a successful HTTP status.
 *
 * Creating a cURL handle alone never contacts the server, so the request is
 * actually executed here (headers only, no body download).
 *
 * @param string $url URL to probe.
 * @return bool True when the final response status is 2xx or 3xx; false when
 *              the handle cannot be created, the request fails, or the
 *              server answers with an error status.
 */
function url_exists($url) {
    $fp = curl_init($url);
    if (!$fp) {
        return false;
    }
    curl_setopt($fp, CURLOPT_NOBODY, true);          // HEAD-style request: skip the body
    curl_setopt($fp, CURLOPT_RETURNTRANSFER, true);  // never print the response
    curl_setopt($fp, CURLOPT_FOLLOWLOCATION, true);  // judge the final target of redirects
    curl_setopt($fp, CURLOPT_CONNECTTIMEOUT, 5);
    curl_setopt($fp, CURLOPT_TIMEOUT, 10);
    $status = 0;
    if (curl_exec($fp) !== false) {
        $status = (int) curl_getinfo($fp, CURLINFO_HTTP_CODE);
    }
    curl_close($fp);
    return $status >= 200 && $status < 400;
}
Update code:-
You can use SimpleHtmlDom Class for find id or class in tag
see the below URL
http://simplehtmldom.sourceforge.net/
http://simplehtmldom.sourceforge.net/manual_api.htm
http://sourceforge.net/projects/simplehtmldom/files/
http://davidwalsh.name/php-notifications
Here is what I have found in case anyone else needs it also!
// Download a page and report whether any of its <a> tags has an href equal
// to $searchFor. The pattern tolerates extra attributes (class, id, ...)
// before and after href.
$url = "http://example.com/test.html";
$searchFor = "example.com";
// The warning of a failed download is suppressed on purpose: failure is
// handled explicitly right below.
$input = @file_get_contents($url);
if ($input === false) {
    die("Could not access file: $url");
}
$regexp = "<a\s[^>]*href=(\"??)([^\" >]*?)\\1[^>]*>(.*)<\/a>";
// Start from "not found" so the flag is defined even when the page has no
// links, and so a later non-matching link cannot undo an earlier match.
$isMatch = 0;
if(preg_match_all("/$regexp/siU", $input, $matches, PREG_SET_ORDER)) {
    foreach($matches as $match) {
        // $match[0] = A tag
        // $match[2] = link address
        // $match[3] = link text
        if ($match[2] == $searchFor)
        {
            $isMatch = 1;
            break; // one match is enough
        }
    }
}
if ($isMatch)
{
    echo "<p><font color=red size=5 align=center>The page specified does contain your link. You have been credited the award amount!</font></p>";
} else {
    echo "<p><font color=red size=5 align=center>The specified page does not have your referral link.</font></p>";
}
So, right now I'm trying to generate dynamic URLs as well as their paths without using .htaccess or the use of modules. Basically, I'm trying to rewrite and output URL paths that work statelessly on different localhosts served by Apache.
I would like to have an index.php that gets URIs through a query string like this (obviously not code):
-> index.php (echos dynamically generated URL hyperlinks)
-> index.php?q=/ (echos every URI in JSON)
-> index.php?q=/some_id (outputs table some_id info w/ links to reviews)
-> index.php?q=/some_id/reviews (outputs table column of reviews w/ link to review_ids)
-> index.php?q=/some_id/reviews/review_id (output related column info)
Could someone walk me through how I'd go about doing this? I figure I'm going to have to write the URL using $_SERVER methods and explode while iterating through an array of table IDs..?
Any help is greatly appreciated!
EDIT:
Here's the code I was trying to write :/
<?php
// Product list page: connects to the database and prints every product with
// a link to its reviews (index.php?q=/<some_id>).
$db = $user = $pw = 'logininfo';
$dbconn = null; // stays null when the connection fails, so the page can still render
try {
    // The DSN key for the schema is "dbname".
    $dbconn = new PDO('mysql:host=localhost;dbname='.$db, $user, $pw);
}
catch (Exception $e) {
    echo "Error: ";
    echo $e->getMessage();
}
?>
<!DOCTYPE HTML>
<html>
<head>
<title>Product Reviews</title>
</head>
<body>
<h1>Product List:</h1>
<h2>
<ul>
<?php
// Any routing on $_GET['q'] (including redirects) has to happen before the
// first byte of output above; here the page only renders the list.
if ($dbconn !== null) {
    try {
        $sql = "SELECT somename, some_id FROM products";
        $stmt = $dbconn->query($sql);
        if ($stmt !== false) {
            foreach ($stmt as $row) {
                $name = htmlentities($row['somename']);
                // Encode the id for the query string, then escape the whole
                // href for the (quoted) HTML attribute.
                $href = 'index.php?q=/'.rawurlencode((string) $row['some_id']);
                echo "<li>";
                echo $name."<br />"; //output just name
                echo '<a href="'.htmlentities($href).'">'.$name."'s Review</a>"; //output URL
                echo "</li>"."<br />";
            }
        }
        $stmt = null;
    }
    catch (PDOException $e) {
        echo $e->getMessage();
    }
}
?>
</ul>
</h2>
</body>
</html>
You can write URLs as :
http://example.org/index.php/reviews/id/ [where id can be your review id]
and use $_SERVER['PATH_INFO'] in index.php to get part of URL which is after index.php, then explode the text and get desired data out of it.
<?php
// Route on the part of the URL that follows index.php.
// Segment 0 is always the empty string before the leading slash, so
// "/5/reviews" yields 3 segments.
$path_info = isset($_SERVER['PATH_INFO']) ? $_SERVER['PATH_INFO'] : '';
if ($path_info === '') {
    // Nothing after index.php: a single empty segment, handled by "default".
    $query_string = array('');
}
else {
    // Normalise to exactly one leading slash and no trailing slash, so that
    // "/5/reviews/" is routed the same way as "/5/reviews".
    $query_string = explode('/', '/'.trim($path_info, '/'));
}
switch(count($query_string)) {
    case 2:
        $some_id = (int) $query_string[1];
        if ($some_id === 0) {
            //(echos every URI in JSON)
        }
        else {
            // (outputs table some_id info w/ links to reviews)
        }
        break;
    case 3:
        //(outputs table column of reviews w/ link to review_ids)
        $some_id = (int) $query_string[1];
        $table_name = $query_string[2];
        break;
    case 4:
        //(output related column info)
        $some_id = (int) $query_string[1];
        $table_name = $query_string[2];
        $review_id = (int) $query_string[3];
        break;
    default:
        // Error case
}
?>
Try this for size
// Pick the response from the deepest non-empty segment of ?q=/a/b/c.
// Segment 0 is the empty string before the leading slash.
$message = "(echos dynamically generated URL hyperlinks)";
if (!empty($_GET['q'])) {
    $params = explode("/", $_GET['q']);
    if (!empty($params[3])) {
        $message = "(output {$params[2]} {$params[3]} info)";
    } elseif (!empty($params[2])) {
        $message = "(outputs table column of {$params[2]} w/ link to review_ids)";
    } elseif (!empty($params[1])) {
        $message = "(outputs table {$params[1]} info w/ links to reviews)";
    } else {
        $message = "(echos every URI in JSON) ";
    }
}
echo $message;
I have this code:
// Play button: validate the e-mail field, post it to the login page and, when
// the server knows the address, store the player data and move on.
btn_jouer.onRelease = function ()
{
    verif = txt_email_user.text;
    if (txt_email_user.text == "")
    {
        txt_erreur.textColor = 16724736;
        txt_erreur.text = "Champ(s) manquant(s)";
    }
    // Minimal sanity check: an address needs both an "@" and a ".".
    else if (verif.indexOf("@", 0) == -1 || verif.indexOf(".", 0) == -1)
    {
        txt_erreur.textColor = 16724736;
        txt_erreur.text = "Adresse E-mail invalide";
    }
    else
    {
        php_login = new LoadVars();
        php_login.email = txt_email_user.text;
        // Register the handler before starting the request so the reply can
        // never arrive while no handler is attached.
        php_login.onLoad = function(succes)
        {
            if (succes)
            {
                //txt_erreur.text = php_login.etat;
                //return;
                if (php_login.etat == "exist")
                {
                    // Known player: copy the returned fields and leave the login movie.
                    _root.var_user.id = php_login.id;
                    _root.var_user.nom = php_login.nom;
                    _root.var_user.prenom = php_login.prenom;
                    _root.var_user.score = php_login.score;
                    _root.MovieLogin.unloadMovie();
                    if (_root._root.selectedPhone == "KS360")
                    {
                        _root.gotoAndStop(4);
                    }
                    else
                    {
                        _root.gotoAndStop(3);
                    }
                }
                else if (php_login.etat == "non")
                {
                    trace (php_login.etat);
                    txt_erreur.text = "Email non enregistré! veuillez vous s'inscrir";
                }
            }
        };
        php_login.sendAndLoad(_root.page_Login, php_login, "POST");
    }
};
The "page_Login" is login.php file on the server,
After debugging, the file login.php successfully received Posted data so i got:
$_POST['email'] = "what ever you type in swf form";
The login.php processor file:
// Login endpoint for the Flash client: looks the e-mail up in `gamers` and
// answers with URL-encoded variables (&etat=...&nom=...), the format the
// client reads field by field.
if(isset($_REQUEST['email'])){
    // NOTE(review): addslashes() is not a safe SQL escape. The $DB wrapper is
    // not visible here; switch to its parameter binding / driver escaping if
    // it offers one.
    $email = strtolower(addslashes($_REQUEST['email']));
    $DB->_request("select * from gamers where email='$email'");
    if($DB->_nr() > 0) {
        $row = mysql_fetch_array($DB->Result);
        // Every value is percent-encoded: a raw '&', '=', '%' or '+' inside a
        // name would otherwise corrupt the variable list.
        $reply = "&etat=exist";
        // The client also reads an "id" field; send it when the row has one.
        // NOTE(review): column name "id" is assumed - confirm against the table.
        if (isset($row['id'])) {
            $reply .= "&id=".rawurlencode($row['id']);
        }
        $reply .= "&nom=".rawurlencode($row['nom']);
        $reply .= "&prenom=".rawurlencode($row['prenom']);
        $reply .= "&score=".rawurlencode($row['score']);
        echo $reply;
        //
        exit;
    }
    else {
        echo "&etat=non";
        exit;
    }
}
Here above, the $DB->_nr() always returns "0" even the email address exists!
I have tried to create a simple html page having a form with method POST and have a simple input type text with a name="email"
When i write my email which is valid in the database and hit submit $DB->_nr() returns 1.
This is really driving me crazy. I'm sure that the email address exists, and the login.php page receives the posted data "email = validemail@domain.com" from sendAndLoad(), but mysql_num_rows returns 0.
Has anyone had the same issue?
Any help would be so much appreciated!
Barry,
Use the following code in PHP to compare the email in both cases: given from flash and from HTML form:
// Diagnostic variant of the login endpoint: before running the unchanged
// lookup it appends the raw e-mail, wrapped in dashes, to testFile.txt so
// stray whitespace around the value becomes visible.
if (isset($_REQUEST['email'])) {
    // Create testFile.txt beforehand and make it writable (0777 on Linux).
    $logPath = "testFile.txt";
    $logHandle = fopen($logPath, 'a');
    if (!$logHandle) {
        die("can't open file");
    }
    $logLine = "-".$_REQUEST['email']."-\r\n";
    fwrite($logHandle, $logLine);
    fclose($logHandle);

    $email = strtolower(addslashes($_REQUEST['email']));
    $DB->_request("select * from gamers where email='$email'");
    if (!($DB->_nr() > 0)) {
        echo "&etat=non";
        exit;
    }
    $row = mysql_fetch_array($DB->Result);
    echo "&etat=exist&nom=".$row['nom']."&prenom=".$row['prenom']."&score=".$row['score'];
    exit;
}
if you test for both of the cases, you will be able to compare the two exact forms. I have put "-" in the front and the end of it just to see if there are any whitespaces next to the email value.
Please reply with a compare result. thank you.