I have a database of a few thousand URLs whose pages I am checking for links (I end up looking for specific links), so I am running the function below in a loop. Every once in a while one of the URLs is bad, and then the entire program stalls, stops making progress, and its memory usage starts building up. I thought adding CURLOPT_TIMEOUT would fix this, but it didn't. Any ideas?
// Fetch $url with cURL and return the page source with root-relative
// src="/..." and href="/..." attributes rewritten to absolute URLs.
// Returns an empty string when the transfer fails.
$options = array(
    CURLOPT_RETURNTRANSFER => true,  // return the page instead of printing it
    CURLOPT_HEADER         => false, // keep response headers out of the body
    CURLOPT_FOLLOWLOCATION => true,  // follow redirects
    CURLOPT_ENCODING       => "",    // accept every encoding cURL supports
    CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13'",
    CURLOPT_AUTOREFERER    => true,  // set Referer when following a redirect
    CURLOPT_CONNECTTIMEOUT => 2,     // bound the connect phase explicitly as well
    CURLOPT_TIMEOUT        => 2,     // bound the whole transfer
    CURLOPT_MAXREDIRS      => 10,    // stop after 10 redirects
    // NOTE(review): libcurl documents that setting CURLOPT_POSTFIELDS implies a
    // POST, so the 0 below is overridden by the next option - confirm which
    // request method is actually wanted.
    CURLOPT_POST           => 0,
    CURLOPT_POSTFIELDS     => $curl_data,
    // NOTE(review): certificate verification is disabled; acceptable for a link
    // checker, unsafe for anything that trusts the response.
    CURLOPT_SSL_VERIFYHOST => 0,
    CURLOPT_SSL_VERIFYPEER => false,
    CURLOPT_VERBOSE        => 1,
);

$ch = curl_init($url);
curl_setopt_array($ch, $options);
$content = curl_exec($ch);
curl_close($ch);

// curl_exec() returns false when the transfer fails (bad host, timeout, ...).
// There is nothing to rewrite then, so stop early instead of pushing a
// boolean through the regex step.
if ($content === false) {
    return '';
}

// Scheme + host of the requested URL, e.g. http://www.google.com
$scheme  = parse_url($url, PHP_URL_SCHEME);
$host    = parse_url($url, PHP_URL_HOST);
$raw_url = $scheme . '://' . $host;

// Match attributes that start with exactly one slash. The (?!\/) lookahead
// leaves protocol-relative URLs such as src="//cdn.example.com/x.js" alone;
// prefixing those with the host would corrupt them.
$relative = array(
    '/src="\/(?!\/)/',
    '/href="\/(?!\/)/',
);
$absolute = array(
    'src="' . $raw_url . '/',
    'href="' . $raw_url . '/',
);

// Ex: src="/image/google.png" becomes src="http://www.google.com/image/google.png"
$source = preg_replace($relative, $absolute, $content);
return $source;
curl_exec will return false if it cannot find the URL.
The HTTP status code will be zero.
Check the results of curl_exec and check the HTTP status
code too.
// Run the transfer, then read the HTTP status that cURL recorded for it.
$content = curl_exec($ch);
$httpStatus = curl_getinfo($ch, CURLINFO_HTTP_CODE);

// A false result combined with status 0 is treated as "URL could not be
// fetched"; replace the boolean with a readable marker.
if ($content === false && $httpStatus == 0) {
    $content = "link was not found";
}
....
The way you have it currently, the line of code
$header['content'] = $content;
will get the value of false. This is not what you
want.
I am using curl_exec and my code does not stall if
it cannot find the url. The code keeps running.
You may end up with nothing in your browser though
and a message in the Firebug Console like "500 Internal Server Error".
Maybe that's what you mean by stall.
So basically you don't know and just guess that the curl request is stalling.
For this answer I can only guess as well, then. You might need to set the following cURL option too: CURLOPT_CONNECTTIMEOUT
If the connect already stalls, the other timeout setting might not be taken into account. I'm not entirely sure, but please see Why would CURL time out in 1000ms when I have set up timeout upto 3000ms?.
Related
I need to scrape an ASP website using cURL. My hosting does not allow me to turn off safe_mode or open_basedir. That's why CURLOPT_FOLLOWLOCATION cannot be activated (it throws an error "CURLOPT_FOLLOWLOCATION cannot be activated when an open_basedir is set").
I tried to implement a workaround, but after several unsuccessful days I am getting desperate. How can I change the code below so that it follows redirects manually instead of relying on CURLOPT_FOLLOWLOCATION?
include_once __DIR__.'/simple_html_dom.php';

// Cookie jar shared by every request made during this run.
define('COOKIE_FILE', __DIR__.'/cookie.txt');
// File that receives cURL's verbose session log.
define('CURL_LOG_FILE', __DIR__.'/request.txt');

// Start from a clean slate: clear cookies and the cURL log before we start.
// The existence checks replace error suppression, so a missing file is simply
// skipped while a real failure (e.g. permissions) still surfaces.
if (is_file(COOKIE_FILE)) {
    unlink(COOKIE_FILE);
}
if (is_file(CURL_LOG_FILE)) {
    unlink(CURL_LOG_FILE);
}
/**
 * Minimal browser emulation for ASP.NET pages: fetches pages with cURL, keeps
 * the parsed DOM of the last page, and re-posts that page's form fields to
 * emulate postbacks.
 */
class ASPBrowser {
    /** Form field names that createASPPostParams() leaves out of the POST body. */
    public $exclude = array();
    /** Effective URL of the most recent request ('' before the first one). */
    public $lastUrl = '';
    /** DOM of the most recently fetched page, or false when none is loaded. */
    public $dom = false;

    /**
     * Get simplehtmldom object from url.
     *
     * @param string       $url  address to request
     * @param string|false $post url-encoded POST body, or false for a GET request
     * @return bool|simple_html_dom parsed page, or false when nothing could be parsed
     */
    public function getDom($url, $post = false) {
        $log = fopen(CURL_LOG_FILE, 'a+'); // curl session log file

        $curlOptions = array(
            CURLOPT_ENCODING => 'gzip,deflate',
            CURLOPT_AUTOREFERER => 1,
            CURLOPT_CONNECTTIMEOUT => 120, // timeout on connect
            CURLOPT_TIMEOUT => 120, // timeout on response
            CURLOPT_URL => $url,
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 9,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_HEADER => 0,
            CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
            CURLOPT_COOKIEFILE => COOKIE_FILE,
            CURLOPT_COOKIEJAR => COOKIE_FILE,
        );

        // Only attach the session log when the file could actually be opened.
        if ($log !== false) {
            $curlOptions[CURLOPT_STDERR] = $log;
            $curlOptions[CURLOPT_VERBOSE] = true;
        }

        // Send the previous page as Referer. Earlier code built a header line
        // for this but never handed it to cURL, so it was silently dropped.
        if ($this->lastUrl) {
            $curlOptions[CURLOPT_REFERER] = $this->lastUrl;
        }

        if ($post) { // add post options
            $curlOptions[CURLOPT_POSTFIELDS] = $post;
            $curlOptions[CURLOPT_POST] = true;
        }

        $curl = curl_init();
        curl_setopt_array($curl, $curlOptions);
        $data = curl_exec($curl);
        $this->lastUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL); // get url we've been redirected to
        curl_close($curl);

        // Release the previous page before replacing it.
        if ($this->dom) {
            $this->dom->clear();
            $this->dom = false;
        }

        // curl_exec() yields false on a failed transfer; only hand real
        // markup to the parser.
        $dom = $this->dom = ($data === false) ? false : str_get_html($data);

        if ($log !== false) {
            fwrite($log, "{$post}\n\n");
            fwrite($log, "-----------------------------------------------------------\n\n");
            fclose($log);
        }

        return $dom;
    }

    /**
     * Build a url-encoded POST body from every input/select/textarea in $dom.
     *
     * @param simple_html_dom $dom    page whose form fields are collected
     * @param array           $params field name => value overrides; fields not
     *                                listed keep the value found in the page
     * @return string pairs joined with '&', both sides rawurlencode()d
     */
    public function createASPPostParams($dom, array $params) {
        $postFields = array();
        foreach ($dom->find('input,select,textarea') as $field) {
            $name = $field->name;
            // Skip unnamed controls and names the caller asked to exclude.
            if (trim($name) == '' || in_array($name, $this->exclude)) {
                continue;
            }
            $value = isset($params[$name]) ? $params[$name] : $field->value;
            $postFields[] = rawurlencode($name).'='.rawurlencode($value);
        }
        return implode('&', $postFields);
    }

    /**
     * POST the current page's form fields (with $params overrides) to $url.
     *
     * @param string $url
     * @param array  $params field name => value overrides
     * @return bool|simple_html_dom
     */
    public function doPostRequest($url, array $params) {
        $post = $this->createASPPostParams($this->dom, $params);
        return $this->getDom($url, $post);
    }

    /**
     * Emulate a postback by posting the current form with __EVENTTARGET and
     * __EVENTARGUMENT set to the given values.
     *
     * @param string $url
     * @param string $eventTarget
     * @param string $eventArgument
     * @return bool|simple_html_dom
     */
    public function doPostBack($url, $eventTarget, $eventArgument = '') {
        return $this->doPostRequest($url, array(
            '__EVENTTARGET' => $eventTarget,
            '__EVENTARGUMENT' => $eventArgument
        ));
    }

    /**
     * Fetch $url with a plain GET request.
     *
     * @param string $url
     * @return bool|simple_html_dom
     */
    public function doGetRequest($url) {
        return $this->getDom($url);
    }
}
(Credits: Andrey http://256cats.com/scraping-asp-websites-php-dopostback-ajax-emulation/)
You're probably looking for the CURLINFO_REDIRECT_URL info variable, as that returns the URL that it would otherwise had redirected to if you'd allowed it. Added in PHP 5.3.7.
Note that the exact response code 3xx also affects how the HTTP request method is supposed to change or not change when you follow a redirect. See details in the HTTP spec, RFC 7231 section 6.4.
The libcurl docs for CURLINFO_REDIRECT_URL.
I'm trying to access a page on a website with cURL, but the page requires being logged in. I tried the code below to log in, and it was successful:
<?php
// Log in by POSTing the login form through a SOCKS5 proxy, keeping the
// session cookies in cookie.txt, and report whether the response looks like
// a logged-in page.
$user_agent = "Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20140319 Firefox/24.0 Iceweasel/24.4.0";
$curl_crack = curl_init();
curl_setopt_array($curl_crack, array(
    CURLOPT_URL            => "https://www.vininspect.com/en/account/login",
    CURLOPT_USERAGENT      => $user_agent,
    CURLOPT_PROXY          => "183.78.169.60:37899",
    CURLOPT_PROXYTYPE      => CURLPROXY_SOCKS5,
    CURLOPT_POST           => true,
    CURLOPT_POSTFIELDS     => "LoginForm[email]=naceriwalid%40hotmail.com&LoginForm[password]=passwordhere&toploginform[rememberme]=0&yt1=&toploginform[rememberme]=0",
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_COOKIEFILE     => "cookie.txt", // Put the full path of the cookie file if you want it to write on it
    CURLOPT_COOKIEJAR      => "cookie.txt", // Put the full path of the cookie file if you want it to write on it
    CURLOPT_CONNECTTIMEOUT => 30,
    CURLOPT_TIMEOUT        => 30,
));
$exec = curl_exec($curl_crack);

if ($exec === false) {
    // The transfer itself failed; say why instead of silently printing nothing.
    echo "cURL error: " . curl_error($curl_crack);
}
// The phrases are matched anywhere in the page. With the former ^...$ anchors
// the first phrase only matched at the very start of the response and the
// last one only at the very end, because alternation binds looser than anchors.
elseif (preg_match("/you are logged|logout|successfully logged/i", $exec))
{
    echo "yoooha";
}
?>
Now the only problem I'm facing: let's say that I don't want to be redirected to the logged-in page, but want to go to this page instead: http://example.com/buy. How can I do that in the same code?
If you want to go to /buy after you log in, just use the same curl handle and issue another request for that page. cURL will retain the cookies for the duration of the handle (and on subsequent requests, since you are saving them to a file and reading them back with the cookie jar).
For example:
// Log in, then reuse the same handle (and therefore the same cookies) to send
// a second request to the /buy page.
$user_agent = "Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20140319 Firefox/24.0 Iceweasel/24.4.0";
$curl_crack = curl_init();
curl_setopt($curl_crack, CURLOPT_URL, "https://www.vininspect.com/en/account/login");
curl_setopt($curl_crack, CURLOPT_USERAGENT, $user_agent);
curl_setopt($curl_crack, CURLOPT_PROXY, "183.78.169.60:37899");
curl_setopt($curl_crack, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5);
curl_setopt($curl_crack, CURLOPT_POST, true);
curl_setopt($curl_crack, CURLOPT_POSTFIELDS, "LoginForm[email]=naceriwalid%40hotmail.com&LoginForm[password]=passwordhere&toploginform[rememberme]=0&yt1=&toploginform[rememberme]=0");
curl_setopt($curl_crack, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl_crack, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($curl_crack, CURLOPT_COOKIEFILE, "cookie.txt"); //Put the full path of the cookie file if you want it to write on it
curl_setopt($curl_crack, CURLOPT_COOKIEJAR, "cookie.txt"); //Put the full path of the cookie file if you want it to write on it
curl_setopt($curl_crack, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curl_crack, CURLOPT_TIMEOUT, 30);
$exec = curl_exec($curl_crack);

if ($exec === false) {
    // The login transfer failed; report the reason rather than skipping silently.
    echo "cURL error: " . curl_error($curl_crack);
}
// Match the phrases anywhere in the response; anchoring with ^ and $ would
// tie the first phrase to the start and the last phrase to the end only.
elseif (preg_match("/you are logged|logout|successfully logged/i", $exec))
{
    $post = array('search' => 'keyword', 'abc' => 'xyz');
    curl_setopt($curl_crack, CURLOPT_POST, 1); // the follow-up request is a POST as well
    curl_setopt($curl_crack, CURLOPT_POSTFIELDS, http_build_query($post)); // replace the login fields with the new post data
    curl_setopt($curl_crack, CURLOPT_URL, 'http://example.com/buy'); // set url for next request
    $exec = curl_exec($curl_crack); // make request to buy on the same handle with the current login session
}
Here are some other examples of using PHP & cURL to make multiple requests:
How to login in with Curl and SSL and cookies (links to multiple other examples)
Grabbing data from a website with cURL after logging in?
Pinterest login with PHP and cURL not working
Login to Google with PHP and Curl, Cookie turned off?
PHP Curl - Cookies problem
You just need to change the URL after the login is complete and then run curl_exec again, like this:
<?php
//login code goes here
// $exec is expected to hold the login response and $curl_crack the handle
// that produced it.
// The phrases are matched anywhere in the response; with ^ and $ anchors the
// first phrase would only match at the start and the last only at the end.
if ($exec !== false && preg_match("/you are logged|logout|successfully logged/i", $exec))
{
    echo "Logged in! now lets go to other page while we are logged in, shall we?";
    // The handle keeps every option from the previous request. Assuming the
    // elided login code sent a POST, switch back to GET so the login form is
    // not re-posted to the new page.
    curl_setopt($curl_crack, CURLOPT_HTTPGET, true);
    //The new URL that you want to go to while logged in goes in bottom line :
    curl_setopt($curl_crack, CURLOPT_URL, "https://new_url_to_go.com/something");
    $exec = curl_exec($curl_crack);
    // now $exec contains the content of the new page, fetched with the login session
}
curl_close($curl_crack); // don't forget to close the curl session at the end
?>
First define this function, which returns an associative array containing the response information and the content of a URL (see http://nadeausoftware.com/articles/2007/06/php_tip_how_get_web_page_using_curl):
/**
 * Get a web file (HTML, XHTML, XML, image, etc.) from a URL. Return an
 * array containing the transfer information reported by cURL and the content.
 *
 * @param string $url     address to request
 * @param array  $params  request parameters; sent as the POST body, or
 *                        appended to the URL's query string for GET
 * @param bool   $is_post true to POST (default), false to GET
 * @return array curl_getinfo() data plus 'errno', 'errmsg' and 'content'
 *               ('content' is false when the transfer failed)
 */
function get_web_page( $url, $params, $is_post = true )
{
    $options = array(
        CURLOPT_RETURNTRANSFER => true,     // return web page
        CURLOPT_HEADER         => false,    // don't return headers
        CURLOPT_FOLLOWLOCATION => true,     // follow redirects
        CURLOPT_ENCODING       => "",       // handle all encodings
        CURLOPT_USERAGENT      => "Mozilla/4.0 (compatible;)", // i'm mozilla
        CURLOPT_AUTOREFERER    => true,     // set referer on redirect
        CURLOPT_CONNECTTIMEOUT => 120,      // timeout on connect
        CURLOPT_TIMEOUT        => 120,      // timeout on response
        CURLOPT_MAXREDIRS      => 10,       // stop after 10 redirects
    );

    $query = http_build_query($params);
    if ($is_post) { //use POST
        $options[CURLOPT_POST] = 1;
        $options[CURLOPT_POSTFIELDS] = $query;
    } elseif ($query !== '') { //use GET
        // Join with '&' when the URL already carries a query string, and add
        // nothing at all when there are no parameters, so the URL never ends
        // up with a second '?' or a dangling one.
        $separator = (strpos($url, '?') === false) ? '?' : '&';
        $url .= $separator . $query;
    }

    $ch = curl_init( $url );
    curl_setopt_array( $ch, $options );
    $content = curl_exec( $ch );
    $err     = curl_errno( $ch );
    $errmsg  = curl_error( $ch );
    $header  = curl_getinfo( $ch );
    curl_close( $ch );

    $header['errno']   = $err;
    $header['errmsg']  = $errmsg;
    $header['content'] = $content;
    return $header;
}
try this to load the 'http://www.example.com/buy' after login is successful.
// after curl login setup
$exec = curl_exec($curl_crack);
// Match the phrases anywhere in the response; ^ and $ anchors would tie the
// first phrase to the start of the page and the last phrase to its end.
if ($exec !== false && preg_match("/you are logged|logout|successfully logged/i", $exec))
{
    // close login CURL resource, and free up system resources
    curl_close($curl_crack);

    // 'qty' must be a quoted string key: a bare qty is read as a constant.
    $params = array('product_id' => 'xxxx', 'qty' => 10);
    $url = 'http://www.example.com/buy';

    // NOTE(review): get_web_page() opens its own handle and, as defined above,
    // sets no cookie file, so the login cookies are presumably not sent with
    // this request - confirm, or reuse $curl_crack instead of closing it.
    //use above function to get the url content via POST params
    $result = get_web_page($url, $params, true);
    if ($result['http_code'] == 200) {
        //echo the content
        echo $result['content'];
        die();
    }
}
I have a php script that sends a file via cURL, to a remote location.
This works great on a Mac.
On Windows w/ MAMP (I am stuck with this at present), no request reaches the remote server.
If I take away the CURLOPT_POSTFIELDS parameter, the request is sent, however without my data (obviously). This tells me that cURL is loaded ok, and is able to send a request.
What I can't work out, is what is causing this to do nothing when CURLOPT_POSTFIELDS is included (as per the code below) - no errors (that I am aware of).
Here is the code that I am running:
/**
 * Build the value that represents a file upload inside CURLOPT_POSTFIELDS.
 *
 * @param string $filename    path of the file to upload
 * @param string $contentType MIME type to send; left out of the legacy string
 *                            when empty
 * @param string $postname    file name reported to the server
 * @return CURLFile|string a CURLFile when curl_file_create() exists, otherwise
 *                         the legacy "@path;filename=...;type=..." string
 */
function getCurlValue($filename, $contentType, $postname)
{
    if (function_exists('curl_file_create')) {
        return curl_file_create($filename, $contentType, $postname);
    }

    // Use the old style if using an older version of PHP.
    // The path comes from the $filename argument (there is no $this in a plain
    // function) and the legacy upload marker is a leading "@".
    $value = "@{$filename};filename=" . $postname;
    if ($contentType) {
        $value .= ';type=' . $contentType;
    }
    return $value;
}
// Upload one file as multipart form data and split the response into its
// header and body parts.
$filename = 'c:\path\to\file\test.txt';
$cfile = getCurlValue($filename,'text/plain','test.txt');
$data = array('updateFile' => $cfile);

$ch = curl_init();
$options = array(
    CURLOPT_URL => 'http://url/to/my/service',
    CURLOPT_RETURNTRANSFER => true,
    CURLINFO_HEADER_OUT => true, //Request header
    CURLOPT_HEADER => true, //Return header
    CURLOPT_FAILONERROR => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_SSL_VERIFYPEER => false,
    CURLOPT_POST => true,
    CURLOPT_POSTFIELDS => $data
);
curl_setopt_array($ch, $options);
$result = curl_exec($ch);

// Keep the failure reason: when curl_exec() returns false these are the only
// clue to why nothing reached the server.
$curl_errno = curl_errno($ch);
$curl_error = curl_error($ch);

$header_info = curl_getinfo($ch,CURLINFO_HEADER_OUT);
$header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);

if ($result === false) {
    // Nothing to split; leave both parts empty rather than slicing a boolean.
    $header = '';
    $body = '';
} else {
    $header = substr($result, 0, $header_size);
    $body = substr($result, $header_size);
}
curl_close($ch);
I have checked for cURL errors, of which there are none thrown.
Any thoughts would be most appreciated.
Thanks,
Nathan
This issue appears to have resolved itself. All is now working, with no changes on my part.
Must have been something at the server end....
When i'm trying to invoke the YQL via cURL i'm getting the following error.
HTTP Version Not Supported
Description: The web server "engine1.yql.vip.bf1.yahoo.com" is using an unsupported version of the HTTP protocol.
Following is the code used
// create curl resource
$ch = curl_init();
// URL
// The YQL statement contains spaces, quotes and other characters that are not
// valid in a URL, so it is encoded before being placed in the query string.
// The XPath attribute test is written with "@" (a "#" would also cut the URL
// off as a fragment when left unencoded).
$yql = "select * from html where url=\"http://www.infibeam.com/Books/search?q=9788179917558\" and xpath=\"//span[@class='infiPrice amount price']/text()\"";
$URL = "https://query.yahooapis.com/v1/public/yql?q=" . urlencode($yql) . "&format=json";
// set url
curl_setopt($ch, CURLOPT_URL, $URL);
//return the transfer as a string
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
// $output contains the output string
$output = curl_exec($ch);
// close curl resource to free up system resources
curl_close($ch);
echo $output;
?>
Invoking the same URL from the browser works fine:
https://query.yahooapis.com/v1/public/yql?q=select * from html where
url="http://www.infibeam.com/Books/search?q=9788179917558" and
xpath="//span[@class='infiPrice amount price']/text()"&format=json
Can someone please point me what is wrong in the code?
The problem is probably caused because the url you feed to cURL is not valid. You need to prepare / encode the individual values of the query strings for use in a url.
You can do that using urlencode():
// Encode the YQL statement so it is safe inside the query string. The XPath
// attribute test uses "@class".
$q = urlencode("select * from html where url=\"http://www.infibeam.com/Books/search?q=9788179917558\" and xpath=\"//span[@class='infiPrice amount price']/text()\"");
$URL = "https://query.yahooapis.com/v1/public/yql?q={$q}&format=json";
In this case I have only encoded the value of q as the format does not contain characters that you cannot use in a url, but normally you'd do that for any value you don't know or control.
Okay, I got it. The problem was with HTTPS. I used the following snippet to debug:
// Run the request; on failure stop and print cURL's own error message.
$data = curl_exec($ch);
if ($data === false) {
    die("Eek! Curl error! " . curl_error($ch));
}
Added below code to accept SSL certificates by default.
$options = array(
    CURLOPT_URL => $URL,
    // Request headers are passed as a list through CURLOPT_HTTPHEADER.
    // CURLOPT_HEADER is only an on/off switch for including the response
    // headers in the output, so a string there just turned that switch on.
    CURLOPT_HTTPHEADER => array("Content-Type:text/xml"),
    // NOTE(review): disables certificate verification; prefer pointing
    // CURLOPT_CAINFO at a CA bundle instead.
    CURLOPT_SSL_VERIFYPEER => 0,
    CURLOPT_RETURNTRANSFER => TRUE
);
Complete code is here
<?php
// Query YQL for a price and print the JSON response.

// URL: the YQL statement is encoded so it is safe inside the query string;
// the XPath attribute test uses "@class".
$q = urlencode("select * from html where url=\"http://www.infibeam.com/Books/search?q=9788179917558\" and xpath=\"//span[@class='infiPrice amount price']/text()\"");
$URL = "https://query.yahooapis.com/v1/public/yql?q={$q}&format=json";
echo "URL is ".$URL;

// create curl resource (once; a second curl_init() would orphan the first handle)
$ch = curl_init();

//Define curl options in an array
$options = array(
    CURLOPT_URL => $URL,
    // Request headers go in CURLOPT_HTTPHEADER; CURLOPT_HEADER only toggles
    // whether response headers are included in the output.
    CURLOPT_HTTPHEADER => array("Content-Type:text/xml"),
    // NOTE(review): disables certificate verification; prefer a CA bundle.
    CURLOPT_SSL_VERIFYPEER => 0,
    CURLOPT_RETURNTRANSFER => TRUE
);

//Set options against curl object
curl_setopt_array($ch, $options);

//Execute exactly once and validate the result before using it
$data = curl_exec($ch);
if (false === $data) {
    $error = curl_error($ch);
    curl_close($ch);
    die("Eek! Curl error! " . $error);
}

$status = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);

//Close curl object
curl_close($ch);

if (200 !== $status) {
    die("Oh dear, no 200 OK?!");
}

//Pass results to the SimpleXMLElement function
//$xml = new SimpleXMLElement($data);
echo($data);
?>
I am seeing some unusual behaviour from cURL. For a given page, I sometimes get 200 as the HTTP response code and sometimes 0, so I cannot tell whether the page is valid or not. If you try the code below, please run it at least 5-10 times so that you can see the difference.
/**
 * Probe a fixed list of domains. Each domain is tried with several
 * scheme/"www." prefixes in turn; for the first variant that answers with
 * HTTP 200 the cURL transfer information is printed and the remaining
 * variants are skipped.
 */
function print_info()
{
    $arr = array(
        'bart.no',
        'bolandirekt.nu',
        'ekompassen.com',
        'ekompassen.nu',
    );

    // Prefixes tried, in order, for every domain.
    $pre_array = array("", "www.", "https://", "http://", "https://www.", "http://www.");

    // Identical for every request, so built once.
    $options = array(
        CURLOPT_RETURNTRANSFER => TRUE,  // return the response instead of printing it
        CURLOPT_HEADER         => TRUE,  // include response headers in the output
        CURLOPT_FOLLOWLOCATION => FALSE, // do not follow redirects
        CURLOPT_ENCODING       => "",    // handle all encodings
        CURLOPT_USERAGENT      => "spider", // who am i
        CURLOPT_AUTOREFERER    => FALSE, // no automatic referer
        CURLOPT_SSL_VERIFYHOST => FALSE, // ssl verify host
        CURLOPT_SSL_VERIFYPEER => FALSE, // ssl verify peer
        CURLOPT_NOBODY         => FALSE, // fetch the body as well
        CURLOPT_CONNECTTIMEOUT => 20,    // timeout on connect
        CURLOPT_TIMEOUT        => 20,    // timeout on response
    );

    foreach ($arr as $url)
    {
        echo "<br/>URL: " . $url;

        // Strip any scheme / "www." so each prefix is applied to the bare host.
        $temp = str_replace(array("www.", "http://", "https://"), "", strtolower($url));

        foreach ($pre_array as $pre)
        {
            $ch = curl_init($pre . $temp);
            curl_setopt_array($ch, $options);
            curl_exec($ch);

            $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            // Grab the details before closing: the handle is now released on
            // every path, including the early exit below.
            $info = ($code == 200) ? curl_getinfo($ch) : null;
            curl_close($ch);

            echo "<pre/>";
            if ($info !== null)
            {
                print_r($info);
                break;
            }
        }
    }
}
So my final question is: why am I getting response code 200 for pages that do not exist or do not open in a browser? Also, why do I sometimes get response code 0 and sometimes 200 for the same page, even if I leave a time interval between requests?
The CURL request did not complete, thus there's no response code.
The reason for this may be an invalid host name (can't resolve), malformed URL, timeout, etc.
You should be able to get the CURL error code as in CodeCaster's comment and curl_error / curl_errno docs.
Once the CURL request completed properly, then a response code (from the server) should be available and meaningful.