how to use curl in php? [duplicate] - php

This question already has an answer here:
Closed 11 years ago.
Possible Duplicate:
get the value of an url response with curl
I have a PHP page named stores.php and I want to see the output of this page using curl. What can I do?
My code so far for the stores.php page is:
<?php
/**
 * stores.php - returns, as JSON, every row of `stores` whose version_modified
 * is greater than the cache version sent by the client.
 *
 * Request:  POST whose body is a JSON object containing a "version" key.
 * Response: "200 Stores list" with a JSON array holding the matching rows
 *           followed by two trailer elements: the row with the current
 *           max(version_modified) as "version", and {"count": <row count>}.
 *           Any other request is answered with "400 Bad Request".
 *
 * NOTE(review): $db is not defined in this file; presumably Boot.php provides
 * it (quoteInto()/fetchAll() look like a Zend_Db adapter) - confirm.
 */
include_once '../application/Boot.php';

// Only POST requests are served.
if ($_SERVER['REQUEST_METHOD'] !== 'POST') {
    header('HTTP/1.1 400 Bad Request');
    exit;
}

$body = file_get_contents('php://input');
$json = json_decode($body, true);

// The body must decode to a JSON object carrying the client's cache version.
if (!is_array($json) || !isset($json['version'])) {
    header('HTTP/1.1 400 Bad Request');
    exit;
}

$client_cache_version = $json['version'];

// The version comes straight from the request body, so it is bound through
// the "?" placeholder instead of being concatenated into the SQL text.
$sql = $db->quoteInto('SELECT * FROM stores where version_modified > ?', $client_cache_version);
$results = $db->fetchAll($sql);

// No user input here, so the statement needs no quoting.
$version_results = $db->fetchAll('SELECT max(version_modified) as version FROM stores');

// Count the store rows before the trailer elements are appended.
$count = array(
    'count' => sizeof($results)
);
array_push($results, $version_results['0']);
array_push($results, $count);

//ob_start("ob_gzhandler");
header('HTTP/1.1 200 Stores list');
echo json_encode($results);
exit;
?>

Run `man curl` to learn how to use curl to display the response of a web page.
example:
curl "http://www.stackoverflow.com"

/**
 * Fetch a URL with cURL and return the response together with the transfer
 * details.
 *
 * @param string $url     URL to request.
 * @param string $referer Value sent as the Referer header.
 * @param string $agent   Value sent as the User-Agent header.
 * @param mixed  $header  Passed to CURLOPT_HEADER; truthy includes the response
 *                        headers in the returned output.
 * @param int    $timeout Connection timeout in seconds (CURLOPT_CONNECTTIMEOUT).
 * @param string $proxy   Optional proxy; an empty string means a direct connection.
 *
 * @return array 'EXE' => curl_exec() result (false on failure),
 *               'INF' => curl_getinfo() array,
 *               'ERR' => curl_error() string (empty when no error occurred).
 */
function getPage($url, $referer, $agent, $header, $timeout, $proxy = "")
{
    // One cookie file for both reading and writing. The original used
    // realpath('cookies.txt') for the jar and realpath('/cookies.txt') for the
    // file - two different locations - and realpath() returns false for a
    // file that does not exist yet, so cookies were never persisted.
    $cookieFile = 'cookies.txt';

    $result = array();

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, $header);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    if ($proxy != "") {
        curl_setopt($ch, CURLOPT_PROXY, $proxy);
        curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, 0);
    }
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_REFERER, $referer);
    curl_setopt($ch, CURLOPT_USERAGENT, $agent);
    curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieFile);
    curl_setopt($ch, CURLOPT_COOKIEFILE, $cookieFile);

    // Collect everything before the handle is closed.
    $result['EXE'] = curl_exec($ch);
    $result['INF'] = curl_getinfo($ch);
    $result['ERR'] = curl_error($ch);
    curl_close($ch);

    return $result;
}
// Example call of getPage().
$url = "www.targeturl.com";
$referer = "http://www.google.com";
$agent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';
$header = 1; // include the response headers in $result['EXE']
$timeout = 15;
$result = getPage($url, $referer, $agent, $header, $timeout);
// $result['ERR'] contains the cURL error message, or an empty string if there was none
// $result['EXE'] holds the response (HTML) of the target url you supplied in $url
// $result['INF'] holds the curl_getinfo() transfer information
// You can use it like this:
if (empty($result["ERR"])) { // no error
    echo $result['EXE']; // html of target url
} else { // errors
    // do something on errors
}
// $proxy is optional
// if you want to open target url through a proxy use it like this
$proxy = "120.232.23.23:8080";
$result = getPage($url, $referer, $agent, $header, $timeout, $proxy);

Related

How to check entered url is indexing on google or not using php

I am trying to create a script that should check all indexed pages from Google. But it will not give proper output.
<?php
/**
 * Report whether a HEAD-style request to Google's web-cache URL for the given
 * page is answered with HTTP 200.
 *
 * Redirects are not followed and no body is requested (CURLOPT_NOBODY), so
 * only the status code of the first response is inspected.
 *
 * @param string $url Page URL to look up.
 * @return bool true when the request succeeds with status 200, false otherwise.
 */
function indexed($url) {
    $cacheUrl = 'http://webcache.googleusercontent.com/search?q=cache:' . urlencode($url);

    $ch = curl_init($cacheUrl);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
    curl_setopt($ch, CURLOPT_NOBODY, true);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Chrome 10');

    // Read the status only when the transfer itself succeeded, and always
    // release the handle (the original returned without closing it).
    $ok = curl_exec($ch);
    $code = $ok ? (int) curl_getinfo($ch, CURLINFO_HTTP_CODE) : 0;
    curl_close($ch);

    return $code === 200;
}

// Echoing a bare boolean prints "1" or nothing at all; print an explicit label.
echo indexed('http://www.berlin-info.de/de/tourist-info/urlaub-in-kanada') ? 'indexed' : 'not indexed';
?>

Basic Authorization PHP with Server related

I have a problem related to HTTP headers when using cURL in PHP with OpenCart. The code below is the caller.
// Caller: request the customer total from one site, sending HTTP Basic
// credentials in a hand-built Authorization header, and dump the decoded JSON.
$url = 'http://aaa.com/index.php?route=common/home/getTotalCustomer';
$url2 = 'http://bbb.com/index.php?route=common/home/getTotalCustomer';
$url3 = 'http://ccc.com/index.php?route=common/home/getTotalCustomer';

// Header exactly as posted in the question.
$header = array('Authorization:Basic ' . base64_encode('user:password'));

$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
curl_setopt($ch, CURLOPT_URL, $url);

$results = curl_exec($ch);

// Render the decoded response inside a <pre> block.
$decoded = json_decode($results, true);
echo '<pre>' . print_r($decoded, true) . '</pre>';
The receiver code like below:
/**
 * JSON endpoint that reports the customer total to callers presenting the
 * expected HTTP Basic credentials.
 *
 * Output is {"total_customer": ...} when the credentials match and
 * {"message": "failed"} when they are missing or wrong.
 *
 * NOTE(review): PHP_AUTH_USER / PHP_AUTH_PW are read from $_SERVER as-is; if
 * some servers do not populate them (e.g. depending on the SAPI), the request
 * will be treated as unauthenticated - confirm per deployment.
 */
public function getTotalCustomer(){
    $json = array();
    $this->load->model('account/customer');

    $user = isset($_SERVER['PHP_AUTH_USER']) ? $_SERVER['PHP_AUTH_USER'] : null;
    $pass = isset($_SERVER['PHP_AUTH_PW']) ? $_SERVER['PHP_AUTH_PW'] : null;

    // hash_equals() compares secrets without leaking timing information.
    $authorized = is_string($user) && is_string($pass)
        && hash_equals('user', $user)
        && hash_equals('password', $pass);

    if ($authorized) {
        $json['total_customer'] = $this->model_account_customer->getTotalCustomer();
    } else {
        // Covers both missing and wrong credentials; previously wrong
        // credentials produced an empty response with no message at all.
        $json['message'] = 'failed';
    }

    $this->response->addHeader('Content-Type: application/json');
    $this->response->setOutput(json_encode($json));
}
I had tried in multiple domain with different servers. Some server can return the data but some server cannot return the data. Why?
Your header that you're sending is incorrect.
You're passing
Authorization:Basic <username:password>
it should be
Authorization: Basic <username:password>
note the space.

Cannot get an http markup of google [duplicate]

I'm trying to make curl follow a redirect but I can't quite get it to work right. I have a string that I want to send as a GET param to a server and get the resulting URL.
Example:
String = Kobold Vermin
Url = www.wowhead.com/search?q=Kobold+Worker
If you go to that url it will redirect you to "www.wowhead.com/npc=257". I want curl to return this URL to my PHP code so that i can extract the "npc=257" and use it.
Current code:
/**
 * Build the wowhead search URL for $name, configure a cURL handle for it and
 * return the handle's effective URL.
 *
 * Note: the handle is only configured here; no transfer is executed before
 * the effective URL is read.
 *
 * @param string $name Search term appended to the query string as-is.
 * @return mixed Result of curl_getinfo($handle, CURLINFO_EFFECTIVE_URL).
 */
function npcID($name) {
    $searchUrl = "http://www.wowhead.com/search?q=" . $name;
    $handle = curl_init();

    // Options are applied in the listed order.
    $options = array(
        CURLOPT_USERAGENT => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1",
        CURLOPT_URL => $searchUrl,
        CURLOPT_REFERER => "http://www.wowhead.com",
        CURLOPT_HTTPHEADER => Array("Content-Type:application/x-www-form-urlencoded"),
        CURLOPT_FOLLOWLOCATION => TRUE,
    );
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    return curl_getinfo($handle, CURLINFO_EFFECTIVE_URL);
}
This however returns www.wowhead.com/search?q=Kobold+Worker and not www.wowhead.com/npc=257.
I suspect PHP is returning before the external redirect happens. How can I fix this?
To make cURL follow a redirect, use:
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
Erm... I don't think you're actually executing the curl... Try:
curl_exec($ch);
...after setting the options, and before the curl_getinfo() call.
EDIT: If you just want to find out where a page redirects to, I'd use the advice here, and just use Curl to grab the headers and extract the Location: header from them:
// Request $url without following redirects and read the redirect target from
// the Location header. $location stays null when there is no such header.
$location = null;

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$result = curl_exec($ch);
curl_close($ch);

// Anchor at the start of a line so "Content-Location:" headers and text
// inside the body are not mistaken for the Location header.
if ($result !== false && preg_match('~^Location:[ \t]*(.*)$~mi', $result, $match)) {
    $location = trim($match[1]);
}
Add this line to the curl initialization:
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
and use getinfo before curl_close
$redirectURL = curl_getinfo($ch,CURLINFO_EFFECTIVE_URL );
es:
// Fetch $url, letting cURL follow redirects, then ask the handle which URL it
// finally ended up at. $redirectURL is null when the transfer failed.
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_USERAGENT,'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
// CURLOPT_BINARYTRANSFER was dropped: it no longer has any effect in PHP and
// is deprecated; CURLOPT_RETURNTRANSFER already returns the raw data.
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT ,0);
curl_setopt($ch, CURLOPT_TIMEOUT, 60);
$html = curl_exec($ch);
// The effective URL must be read before curl_close(), and is only meaningful
// when the transfer succeeded.
$redirectURL = ($html !== false) ? curl_getinfo($ch,CURLINFO_EFFECTIVE_URL ) : null;
curl_close($ch);
The answer above didn't work for me on one of my servers (something to do with open_basedir), so I reworked it a little. The code below works on all my servers.
// Request $url with headers included and redirects not followed.
$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_URL, $url);
$a = curl_exec($ch);
curl_close( $ch );

// Start from the requested URL; it remains the answer when no line of the
// response mentions "Location:".
$redir = $url;

// Walk the response line by line and stop at the first line containing
// "Location:", keeping whatever is left once that marker is removed.
foreach (explode("\n",$a) as $headerLine) {
    if (strpos($headerLine,"Location:") === false) {
        continue;
    }
    $redir = trim(str_replace("Location:","",$headerLine));
    break;
}

// do whatever you want with the result
echo $redir;
The chosen answer here is decent but its case sensitive, doesn't protect against relative location: headers (which some sites do) or pages that might actually have the phrase Location: in their content... (which zillow currently does).
A bit sloppy, but a couple quick edits to make this a bit smarter are:
/**
 * Resolve one level of HTTP redirection for $url.
 *
 * Requests $url without following redirects. When the response status is 3xx
 * and carries a Location header, the target is returned (made absolute when
 * it is root-relative or protocol-relative); otherwise $url is returned.
 *
 * @param string $url URL to probe.
 * @return string The redirect target, or $url when there is no redirect.
 */
function getOriginalURL($url) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    $result = curl_exec($ch);
    $httpStatus = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    // transfer failed, or it's not a redirection (3XX): move along
    if ($result === false || $httpStatus < 300 || $httpStatus >= 400) {
        return $url;
    }

    // Only search the header section (everything before the first blank
    // line) so a "Location:" phrase in the body cannot be picked up.
    $headerEnd = strpos($result, "\r\n\r\n");
    $headers = ($headerEnd === false) ? $result : substr($result, 0, $headerEnd);

    // Anchored at line start so "Content-Location:" does not match.
    if (!preg_match('/^location:[ \t]*(.*)$/mi', $headers, $r)) {
        return $url;
    }
    $location = trim($r[1]);

    $urlParts = parse_url($url);

    // protocol-relative target ("//host/path"): reuse the request's scheme
    if (strpos($location, '//') === 0) {
        $scheme = !empty($urlParts['scheme']) ? $urlParts['scheme'] : 'http';
        return $scheme . ':' . $location;
    }

    // root-relative target ("/path"): prefix scheme, host and port of $url.
    // Each part is optional in parse_url()'s result, so test before use.
    if (strpos($location, '/') === 0) {
        $baseURL = '';
        if (!empty($urlParts['scheme'])) {
            $baseURL .= $urlParts['scheme'].'://';
        }
        if (!empty($urlParts['host'])) {
            $baseURL .= $urlParts['host'];
        }
        if (!empty($urlParts['port'])) {
            $baseURL .= ':'.$urlParts['port'];
        }
        return $baseURL.$location;
    }

    return $location;
}
Note that this still only goes 1 redirection deep. To go deeper, you actually need to get the content and follow the redirects.
Sometimes you need to read the HTTP headers but, at the same time, you don't want to return those headers to the caller.
This skeleton takes care of cookies and HTTP redirects using recursion. The main idea here is to avoid return HTTP headers to the client code.
You can build a very strong curl class over it. Add POST functionality, etc.
<?php
/**
 * Small static cURL helper for GET requests that returns only the response
 * body and never hands HTTP headers to the caller. Redirects are followed by
 * libcurl when the PHP configuration allows it; otherwise the Location header
 * is fetched separately and get() recurses.
 *
 * NOTE(review): the cookie file and user agent are only assigned in the
 * constructor, so a `new curl()` appears to be expected before the static
 * get() is used - confirm with callers.
 */
class curl {
    static private $cookie_file = '';
    static private $user_agent = '';
    static private $max_redirects = 10;
    static private $followlocation_allowed = true;

    /**
     * Set the cookie file and user agent and detect whether
     * CURLOPT_FOLLOWLOCATION may be used.
     *
     * @throws Exception when the cookie file is missing or not writable
     */
    function __construct()
    {
        // set a file to store cookies
        self::$cookie_file = 'cookies.txt';
        // set some general User Agent
        self::$user_agent = 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)';
        if ( ! file_exists(self::$cookie_file) || ! is_writable(self::$cookie_file))
        {
            throw new Exception('Cookie file missing or not writable.');
        }
        // check for PHP settings that prevent
        // CURLOPT_FOLLOWLOCATION from working
        if (ini_get('open_basedir') != '' || ini_get('safe_mode') == 'On')
        {
            self::$followlocation_allowed = false;
        }
    }

    /**
     * Main method for GET requests
     *
     * @param string   $url            URI to get
     * @param int|null $redirects_left manual redirects still allowed; callers
     *                                 leave this null to start from the
     *                                 configured maximum
     * @return string request's body
     * @throws Exception on a cURL error, a redirect without a Location header,
     *                   or too many redirects
     */
    static public function get($url, $redirects_left = null)
    {
        // Track the redirect budget per top-level call. The original
        // decremented the shared static counter, so the budget was never
        // restored for later, unrelated requests.
        if ($redirects_left === null)
        {
            $redirects_left = self::$max_redirects;
        }

        $process = curl_init($url);
        self::_set_basic_options($process);
        // this function is in charge of output request's body
        // so DO NOT include HTTP headers
        curl_setopt($process, CURLOPT_HEADER, 0);
        if (self::$followlocation_allowed)
        {
            // if PHP settings allow it use AUTOMATIC REDIRECTION
            curl_setopt($process, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($process, CURLOPT_MAXREDIRS, self::$max_redirects);
        }
        else
        {
            curl_setopt($process, CURLOPT_FOLLOWLOCATION, false);
        }

        $return = curl_exec($process);
        if ($return === false)
        {
            // read the message before releasing the handle
            $error = curl_error($process);
            curl_close($process);
            throw new Exception('Curl error: ' . $error);
        }
        $code = (int) curl_getinfo($process, CURLINFO_HTTP_CODE);
        curl_close($process);

        // test for redirection HTTP codes that carry a Location header
        if (in_array($code, array(301, 302, 303, 307, 308), true))
        {
            if ($redirects_left <= 0)
            {
                throw new Exception('Max redirections reached trying to get: ' . $url);
            }
            // go to extract new Location URI
            $location = self::_parse_redirection_header($url);
            // IMPORTANT return
            return self::get($location, $redirects_left - 1);
        }
        return $return;
    }

    /**
     * Apply the options shared by every request: user agent, cookie file/jar
     * and returning the transfer as a string.
     *
     * @param mixed $process cURL handle to configure
     */
    static function _set_basic_options($process)
    {
        curl_setopt($process, CURLOPT_USERAGENT, self::$user_agent);
        curl_setopt($process, CURLOPT_COOKIEFILE, self::$cookie_file);
        curl_setopt($process, CURLOPT_COOKIEJAR, self::$cookie_file);
        curl_setopt($process, CURLOPT_RETURNTRANSFER, 1);
        // curl_setopt($process, CURLOPT_VERBOSE, 1);
        // curl_setopt($process, CURLOPT_SSL_VERIFYHOST, false);
        // curl_setopt($process, CURLOPT_SSL_VERIFYPEER, false);
    }

    /**
     * Request $url again with headers included and return the value of its
     * Location header.
     *
     * @param string $url URI that answered with a redirect
     * @return string trimmed Location value
     * @throws Exception on a cURL error or when no Location header is present
     */
    static function _parse_redirection_header($url)
    {
        $process = curl_init($url);
        self::_set_basic_options($process);
        // NOW we need to parse HTTP headers
        curl_setopt($process, CURLOPT_HEADER, 1);
        $return = curl_exec($process);
        if ($return === false)
        {
            $error = curl_error($process);
            curl_close($process);
            throw new Exception('Curl error: ' . $error);
        }
        curl_close($process);
        // header names are case-insensitive; anchoring at line start keeps
        // "Content-Location:" and body text from matching
        if ( ! preg_match('#^Location:[ \t]*(.*)$#mi', $return, $location))
        {
            throw new Exception('No Location found');
        }
        return trim($location[1]);
    }
}
You can use:
$redirectURL = curl_getinfo($ch,CURLINFO_REDIRECT_URL);
There are lots of regexes here. Although I really like them, this approach seems more stable to me:
// NOTE(review): assumes $curl was configured with CURLOPT_RETURNTRANSFER so
// that curl_exec() returns the response as a string - confirm.
$resultCurl = curl_exec($curl); //get curl result
//Optional line if you want to store the http status code
$headerHttpCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);

// $newUrl stays null when the response is empty or holds no matching link.
$newUrl = null;
if (is_string($resultCurl) && $resultCurl !== '') {
    //let's use dom and xpath
    $dom = new \DOMDocument();
    // silence markup warnings while parsing, then restore the previous setting
    $previousSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($resultCurl, LIBXML_HTML_NODEFDTD);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $xpath = new \DOMXPath($dom);
    // "@href" selects the href attribute ("#href" is not valid XPath)
    $head = $xpath->query("/html/body/p/a/@href");
    if ($head !== false && $head->length > 0) {
        $newUrl = $head->item(0)->nodeValue;
    }
}
The location part is a link in the HTML sent by apache. So Xpath is perfect to recover it.

Finding URL redirects with HTTP headers and curl?

I'm trying to code a redirect checker, to check if a URL is search engine friendly. It has to check if a URL is redirected or not, and if it's redirected it has to tell if it's SEO friendly (301 status code) or not (302/304).
Here's something similiar I've found: http://www.webconfs.com/redirect-check.php
It also should be able to follow multiple redirects (e.g. from A to B to C) and tell me that A redirects to C.
This is what I got so far, but it doesn't work quite right (example: when typing in www.example.com it doesnt find the redirect to www.example.com/page1)
<?php
/**
 * Redirect checker page: takes a host name from the form below, requests
 * http://<host>/ and reports whether it answers with a 301/302 and, if so,
 * which URL cURL ends up at when it follows the redirects.
 *
 * NOTE(review): the server requests whatever host the visitor submits;
 * consider restricting the allowed targets before exposing this publicly.
 */

// You can edit the messages of the respective code over here
$httpcode = array();
$httpcode["200"] = "Ok";
$httpcode["201"] = "Created";
$httpcode["302"] = "Found";
$httpcode["301"] = "Moved Permanently";
$httpcode["304"] = "Not Modified";
$httpcode["400"] = "Bad Request";

/**
 * Perform one GET request and return its curl_getinfo() array.
 *
 * @param string $curlurl URL to download
 * @param bool   $follow  whether cURL should follow redirects
 * @return array transfer information (includes "http_code" and "url")
 */
function redirect_check_request($curlurl, $follow)
{
    $ch = curl_init();
    // Set URL to download
    curl_setopt($ch, CURLOPT_URL, $curlurl);
    // User agent (not every client sends one)
    $agent = isset($_SERVER["HTTP_USER_AGENT"]) ? $_SERVER["HTTP_USER_AGENT"] : '';
    curl_setopt($ch, CURLOPT_USERAGENT, $agent);
    // Include header in result? (0 = no, 1 = yes)
    curl_setopt($ch, CURLOPT_HEADER, 0);
    // Should cURL return or print out the data? (true = return, false = print)
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Timeout in seconds
    curl_setopt($ch, CURLOPT_TIMEOUT, 15);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $follow);
    // Download the given URL; only the transfer info is needed
    curl_exec($ch);
    $info = curl_getinfo($ch);
    // Close the cURL resource, and free system resources
    curl_close($ch);
    return $info;
}

if (isset($_POST["url"]) && is_string($_POST["url"]))
{
    $url = $_POST["url"];
    $curlurl = "http://".$url."/";
    // The value is user input: escape it before writing it into the page.
    $safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');

    // First request: do not follow, just look at the status code.
    $curlinfo = redirect_check_request($curlurl, false);
    if (($curlinfo["http_code"] == "301") || ($curlinfo["http_code"] == "302"))
    {
        // Second request: follow the redirects to learn the final URL.
        $curlinfo = redirect_check_request($curlurl, true);
        echo $safeUrl." is redirected to ".htmlspecialchars($curlinfo["url"], ENT_QUOTES, 'UTF-8');
    }
    else
    {
        echo $safeUrl." is not getting redirected";
    }
}
?>
<form action="" method="post">
http://<input type="text" name="url" size="30" />/ <b>e.g. www.google.com</b><br/>
<input type="submit" value="Submit" />
</form>
Well if you want to record every redirect you have to implement it yourself and turn off the automatic "location following":
/**
 * Follow a chain of redirects by hand, recording every hop.
 *
 * Each URL is requested without a body and without automatic redirect
 * following. Tracing stops when a response has no redirect target, a request
 * fails, a URL repeats, or the time budget is used up.
 *
 * @param string    $url     first URL to request
 * @param int|float $timeout total time budget in seconds for the whole trace
 * @return array one item per request: the curl_getinfo() array on success, or
 *               array('errorno' => ..., 'error' => ...) on failure; a final
 *               array('timeout' => ...) item is appended when the budget ran
 *               out while there was still a redirect to follow
 */
function curl_trace_redirects($url, $timeout = 15) {
    $result = array();
    $ch = curl_init();
    $trace = true;
    $currentUrl = $url;
    // URLs already requested, used to stop on redirect loops
    $urlHist = array();

    while($trace && $timeout > 0 && !isset($urlHist[$currentUrl])) {
        $urlHist[$currentUrl] = true;
        curl_setopt($ch, CURLOPT_URL, $currentUrl);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_NOBODY, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
        // The remaining budget becomes fractional after the first hop and
        // CURLOPT_TIMEOUT takes whole seconds; round up so a value below one
        // second is not truncated to 0, which would disable the timeout.
        curl_setopt($ch, CURLOPT_TIMEOUT, (int) ceil($timeout));
        $output = curl_exec($ch);

        if($output === false) {
            $traceItem = array(
                'errorno' => curl_errno($ch),
                'error' => curl_error($ch),
            );
            $trace = false;
        } else {
            $curlinfo = curl_getinfo($ch);
            // charge this request against the overall budget
            if(isset($curlinfo['total_time'])) {
                $timeout -= $curlinfo['total_time'];
            }
            // fall back to parsing the header when cURL does not report it
            if(!isset($curlinfo['redirect_url'])) {
                $curlinfo['redirect_url'] = get_redirect_url($output);
            }
            if(!empty($curlinfo['redirect_url'])) {
                $currentUrl = $curlinfo['redirect_url'];
            } else {
                $trace = false;
            }
            $traceItem = $curlinfo;
        }
        $result[] = $traceItem;
    }

    // Only a trace that still had somewhere to go was cut short by the budget.
    if($trace && $timeout <= 0) {
        $result[] = array('timeout' => $timeout);
    }
    curl_close($ch);
    return $result;
}
// apparently 'redirect_url' is not available on all curl-versions
// so we fetch the location header ourselves
/**
 * Extract the value of the Location header from a raw HTTP header block.
 *
 * @param string $header raw response headers
 * @return string trimmed Location value, or "" when there is no such header
 */
function get_redirect_url($header) {
    // [ \t]* instead of \s+: the space after the colon is optional, and \s
    // would also swallow the line break of an empty header and capture the
    // following header line instead.
    if(preg_match('/^Location:[ \t]*(.*)$/mi', $header, $m)) {
        return trim($m[1]);
    }
    return "";
}
And you use it like that:
// Print one line per item returned by curl_trace_redirects(): a timeout
// notice, an error message, or the requested URL (followed by the status code
// when that request was redirected).
$res = curl_trace_redirects("http://www.example.com");
foreach($res as $item) {
    if(isset($item['timeout'])) {
        $line = "Timeout reached!";
    } elseif(isset($item['error'])) {
        $line = "error: " . $item['error'];
    } else {
        $line = $item['url'];
        // redirection
        if(!empty($item['redirect_url'])) {
            $line .= " -> (" . $item['http_code'] . ")";
        }
    }
    echo $line, "\n";
}
It's possible that my code isn't fully thought out, but I guess it's a good start.
Edit
Here's some sample Output:
http://midas/~stefan/test/redirect/fritzli.html -> (302)
http://midas/~stefan/test/redirect/hansli.html -> (301)
http://midas/~stefan/test/redirect/heiri.html

How can I find where I will be redirected using cURL in PHP?

I'm trying to make curl follow a redirect but I can't quite get it to work right. I have a string that I want to send as a GET param to a server and get the resulting URL.
Example:
String = Kobold Vermin
Url = www.wowhead.com/search?q=Kobold+Worker
If you go to that url it will redirect you to "www.wowhead.com/npc=257". I want curl to return this URL to my PHP code so that i can extract the "npc=257" and use it.
Current code:
/**
 * Build the wowhead search URL for $name, configure a cURL handle for it and
 * return the handle's effective URL.
 *
 * Note: the handle is only configured here; no transfer is executed before
 * the effective URL is read.
 *
 * @param string $name Search term appended to the query string as-is.
 * @return mixed Result of curl_getinfo($handle, CURLINFO_EFFECTIVE_URL).
 */
function npcID($name) {
    $searchUrl = "http://www.wowhead.com/search?q=" . $name;
    $handle = curl_init();

    // Options are applied in the listed order.
    $options = array(
        CURLOPT_USERAGENT => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1",
        CURLOPT_URL => $searchUrl,
        CURLOPT_REFERER => "http://www.wowhead.com",
        CURLOPT_HTTPHEADER => Array("Content-Type:application/x-www-form-urlencoded"),
        CURLOPT_FOLLOWLOCATION => TRUE,
    );
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    return curl_getinfo($handle, CURLINFO_EFFECTIVE_URL);
}
This however returns www.wowhead.com/search?q=Kobold+Worker and not www.wowhead.com/npc=257.
I suspect PHP is returning before the external redirect happens. How can I fix this?
To make cURL follow a redirect, use:
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
Erm... I don't think you're actually executing the curl... Try:
curl_exec($ch);
...after setting the options, and before the curl_getinfo() call.
EDIT: If you just want to find out where a page redirects to, I'd use the advice here, and just use Curl to grab the headers and extract the Location: header from them:
// Request $url without following redirects and read the redirect target from
// the Location header. $location stays null when there is no such header.
$location = null;

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$result = curl_exec($ch);
curl_close($ch);

// Anchor at the start of a line so "Content-Location:" headers and text
// inside the body are not mistaken for the Location header.
if ($result !== false && preg_match('~^Location:[ \t]*(.*)$~mi', $result, $match)) {
    $location = trim($match[1]);
}
Add this line to the curl initialization:
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
and use getinfo before curl_close
$redirectURL = curl_getinfo($ch,CURLINFO_EFFECTIVE_URL );
es:
// Fetch $url, letting cURL follow redirects, then ask the handle which URL it
// finally ended up at. $redirectURL is null when the transfer failed.
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_USERAGENT,'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
// CURLOPT_BINARYTRANSFER was dropped: it no longer has any effect in PHP and
// is deprecated; CURLOPT_RETURNTRANSFER already returns the raw data.
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT ,0);
curl_setopt($ch, CURLOPT_TIMEOUT, 60);
$html = curl_exec($ch);
// The effective URL must be read before curl_close(), and is only meaningful
// when the transfer succeeded.
$redirectURL = ($html !== false) ? curl_getinfo($ch,CURLINFO_EFFECTIVE_URL ) : null;
curl_close($ch);
The answer above didn't work for me on one of my servers (something to do with open_basedir), so I reworked it a little. The code below works on all my servers.
// Request $url with headers included and redirects not followed.
$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_URL, $url);
$a = curl_exec($ch);
curl_close( $ch );

// Start from the requested URL; it remains the answer when no line of the
// response mentions "Location:".
$redir = $url;

// Walk the response line by line and stop at the first line containing
// "Location:", keeping whatever is left once that marker is removed.
foreach (explode("\n",$a) as $headerLine) {
    if (strpos($headerLine,"Location:") === false) {
        continue;
    }
    $redir = trim(str_replace("Location:","",$headerLine));
    break;
}

// do whatever you want with the result
echo $redir;
The chosen answer here is decent but its case sensitive, doesn't protect against relative location: headers (which some sites do) or pages that might actually have the phrase Location: in their content... (which zillow currently does).
A bit sloppy, but a couple quick edits to make this a bit smarter are:
/**
 * Resolve one level of HTTP redirection for $url.
 *
 * Requests $url without following redirects. When the response status is 3xx
 * and carries a Location header, the target is returned (made absolute when
 * it is root-relative or protocol-relative); otherwise $url is returned.
 *
 * @param string $url URL to probe.
 * @return string The redirect target, or $url when there is no redirect.
 */
function getOriginalURL($url) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    $result = curl_exec($ch);
    $httpStatus = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    // transfer failed, or it's not a redirection (3XX): move along
    if ($result === false || $httpStatus < 300 || $httpStatus >= 400) {
        return $url;
    }

    // Only search the header section (everything before the first blank
    // line) so a "Location:" phrase in the body cannot be picked up.
    $headerEnd = strpos($result, "\r\n\r\n");
    $headers = ($headerEnd === false) ? $result : substr($result, 0, $headerEnd);

    // Anchored at line start so "Content-Location:" does not match.
    if (!preg_match('/^location:[ \t]*(.*)$/mi', $headers, $r)) {
        return $url;
    }
    $location = trim($r[1]);

    $urlParts = parse_url($url);

    // protocol-relative target ("//host/path"): reuse the request's scheme
    if (strpos($location, '//') === 0) {
        $scheme = !empty($urlParts['scheme']) ? $urlParts['scheme'] : 'http';
        return $scheme . ':' . $location;
    }

    // root-relative target ("/path"): prefix scheme, host and port of $url.
    // Each part is optional in parse_url()'s result, so test before use.
    if (strpos($location, '/') === 0) {
        $baseURL = '';
        if (!empty($urlParts['scheme'])) {
            $baseURL .= $urlParts['scheme'].'://';
        }
        if (!empty($urlParts['host'])) {
            $baseURL .= $urlParts['host'];
        }
        if (!empty($urlParts['port'])) {
            $baseURL .= ':'.$urlParts['port'];
        }
        return $baseURL.$location;
    }

    return $location;
}
Note that this still only goes 1 redirection deep. To go deeper, you actually need to get the content and follow the redirects.
Sometimes you need to read the HTTP headers but, at the same time, you don't want to return those headers to the caller.
This skeleton takes care of cookies and HTTP redirects using recursion. The main idea here is to avoid return HTTP headers to the client code.
You can build a very strong curl class over it. Add POST functionality, etc.
<?php
/**
 * Small static cURL helper for GET requests that returns only the response
 * body and never hands HTTP headers to the caller. Redirects are followed by
 * libcurl when the PHP configuration allows it; otherwise the Location header
 * is fetched separately and get() recurses.
 *
 * NOTE(review): the cookie file and user agent are only assigned in the
 * constructor, so a `new curl()` appears to be expected before the static
 * get() is used - confirm with callers.
 */
class curl {
    static private $cookie_file = '';
    static private $user_agent = '';
    static private $max_redirects = 10;
    static private $followlocation_allowed = true;

    /**
     * Set the cookie file and user agent and detect whether
     * CURLOPT_FOLLOWLOCATION may be used.
     *
     * @throws Exception when the cookie file is missing or not writable
     */
    function __construct()
    {
        // set a file to store cookies
        self::$cookie_file = 'cookies.txt';
        // set some general User Agent
        self::$user_agent = 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)';
        if ( ! file_exists(self::$cookie_file) || ! is_writable(self::$cookie_file))
        {
            throw new Exception('Cookie file missing or not writable.');
        }
        // check for PHP settings that prevent
        // CURLOPT_FOLLOWLOCATION from working
        if (ini_get('open_basedir') != '' || ini_get('safe_mode') == 'On')
        {
            self::$followlocation_allowed = false;
        }
    }

    /**
     * Main method for GET requests
     *
     * @param string   $url            URI to get
     * @param int|null $redirects_left manual redirects still allowed; callers
     *                                 leave this null to start from the
     *                                 configured maximum
     * @return string request's body
     * @throws Exception on a cURL error, a redirect without a Location header,
     *                   or too many redirects
     */
    static public function get($url, $redirects_left = null)
    {
        // Track the redirect budget per top-level call. The original
        // decremented the shared static counter, so the budget was never
        // restored for later, unrelated requests.
        if ($redirects_left === null)
        {
            $redirects_left = self::$max_redirects;
        }

        $process = curl_init($url);
        self::_set_basic_options($process);
        // this function is in charge of output request's body
        // so DO NOT include HTTP headers
        curl_setopt($process, CURLOPT_HEADER, 0);
        if (self::$followlocation_allowed)
        {
            // if PHP settings allow it use AUTOMATIC REDIRECTION
            curl_setopt($process, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($process, CURLOPT_MAXREDIRS, self::$max_redirects);
        }
        else
        {
            curl_setopt($process, CURLOPT_FOLLOWLOCATION, false);
        }

        $return = curl_exec($process);
        if ($return === false)
        {
            // read the message before releasing the handle
            $error = curl_error($process);
            curl_close($process);
            throw new Exception('Curl error: ' . $error);
        }
        $code = (int) curl_getinfo($process, CURLINFO_HTTP_CODE);
        curl_close($process);

        // test for redirection HTTP codes that carry a Location header
        if (in_array($code, array(301, 302, 303, 307, 308), true))
        {
            if ($redirects_left <= 0)
            {
                throw new Exception('Max redirections reached trying to get: ' . $url);
            }
            // go to extract new Location URI
            $location = self::_parse_redirection_header($url);
            // IMPORTANT return
            return self::get($location, $redirects_left - 1);
        }
        return $return;
    }

    /**
     * Apply the options shared by every request: user agent, cookie file/jar
     * and returning the transfer as a string.
     *
     * @param mixed $process cURL handle to configure
     */
    static function _set_basic_options($process)
    {
        curl_setopt($process, CURLOPT_USERAGENT, self::$user_agent);
        curl_setopt($process, CURLOPT_COOKIEFILE, self::$cookie_file);
        curl_setopt($process, CURLOPT_COOKIEJAR, self::$cookie_file);
        curl_setopt($process, CURLOPT_RETURNTRANSFER, 1);
        // curl_setopt($process, CURLOPT_VERBOSE, 1);
        // curl_setopt($process, CURLOPT_SSL_VERIFYHOST, false);
        // curl_setopt($process, CURLOPT_SSL_VERIFYPEER, false);
    }

    /**
     * Request $url again with headers included and return the value of its
     * Location header.
     *
     * @param string $url URI that answered with a redirect
     * @return string trimmed Location value
     * @throws Exception on a cURL error or when no Location header is present
     */
    static function _parse_redirection_header($url)
    {
        $process = curl_init($url);
        self::_set_basic_options($process);
        // NOW we need to parse HTTP headers
        curl_setopt($process, CURLOPT_HEADER, 1);
        $return = curl_exec($process);
        if ($return === false)
        {
            $error = curl_error($process);
            curl_close($process);
            throw new Exception('Curl error: ' . $error);
        }
        curl_close($process);
        // header names are case-insensitive; anchoring at line start keeps
        // "Content-Location:" and body text from matching
        if ( ! preg_match('#^Location:[ \t]*(.*)$#mi', $return, $location))
        {
            throw new Exception('No Location found');
        }
        return trim($location[1]);
    }
}
You can use:
$redirectURL = curl_getinfo($ch,CURLINFO_REDIRECT_URL);
There are lots of regexes here. Although I really like them, this approach seems more stable to me:
// NOTE(review): assumes $curl was configured with CURLOPT_RETURNTRANSFER so
// that curl_exec() returns the response as a string - confirm.
$resultCurl = curl_exec($curl); //get curl result
//Optional line if you want to store the http status code
$headerHttpCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);

// $newUrl stays null when the response is empty or holds no matching link.
$newUrl = null;
if (is_string($resultCurl) && $resultCurl !== '') {
    //let's use dom and xpath
    $dom = new \DOMDocument();
    // silence markup warnings while parsing, then restore the previous setting
    $previousSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($resultCurl, LIBXML_HTML_NODEFDTD);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $xpath = new \DOMXPath($dom);
    // "@href" selects the href attribute ("#href" is not valid XPath)
    $head = $xpath->query("/html/body/p/a/@href");
    if ($head !== false && $head->length > 0) {
        $newUrl = $head->item(0)->nodeValue;
    }
}
The location part is a link in the HTML sent by apache. So Xpath is perfect to recover it.

Categories