I know, there is already a thread about this ... see How to pass the steam age check using curl? ... but I'm a new user and can't comment in an existing thread and the answer marked as solution there doesn't work anymore.
I had my own code that worked fine in the past (around 2017), but doesn't work anymore as well.
Here is my code that worked in the past:
function curl_redirect_exec2($ch, &$redirects, $curlopt_header = false) {
$ckfile = tempnam(sys_get_temp_dir(), "CURLCOOKIE");
curl_setopt($ch, CURLOPT_COOKIEJAR, $ckfile);
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5");
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, 'snr=1_agecheck _agecheck__age-gate&ageDay=1&ageMonth=May&ageYear=1990');
curl_setopt($ch, CURLOPT_FRESH_CONNECT, true);
curl_setopt($ch, CURLOPT_UNRESTRICTED_AUTH, true);
curl_setopt($ch, CURLOPT_COOKIEFILE, $ckfile);
//new start
curl_setopt($ch, CURLOPT_COOKIE, 'mature_content=1; path=/app/'.$gameid.';');
//new end
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$data = curl_exec($ch);
$http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
if($http_code == 301 || $http_code == 302) {
list($header) = explode("\r\n\r\n", $data, 2);
$matches = array();
preg_match('/(Location:|URI:)(.*?)\n/', $header, $matches);
$url = trim(array_pop($matches));
$url_parsed = parse_url($url);
if(isset($url_parsed)) {
curl_setopt($ch, CURLOPT_URL, $url);
$redirects++;
return curl_redirect_exec2($ch, $redirects);
}
}
if($curlopt_header) {
return $data;
} else {
list(,$body) = explode("\r\n\r\n", $data, 2);
return $body;
}
}
And here is the code sample from the thread linked above that also seemed to work in the past (but doesn't anymore):
<?php
$url = "http://store.steampowered.com/app/312660/";
// $file = __DIR__ . DIRECTORY_SEPARATOR . "cookie.txt";
// $postData = array(
// 'ageDay' => '31',
// 'ageMonth' => 'July',
// 'ageYear' => '1993'
// );
$ch = curl_init();
curl_setopt($ch,CURLOPT_URL,$url);
curl_setopt($ch,CURLOPT_POST,true);
curl_setopt($ch,CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13");
curl_setopt($ch,CURLOPT_POSTFIELDS,$postData);
// curl_setopt($ch,CURLOPT_COOKIESESSION, true);
// curl_setopt($ch,CURLOPT_COOKIEJAR,$file);
// curl_setopt($ch,CURLOPT_COOKIEFILE,$file);
$strCookie = 'mature_content=' . 1 . '; path=/';
curl_setopt( $ch, CURLOPT_COOKIE, $strCookie );
curl_setopt($ch,CURLOPT_FOLLOWLOCATION,true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$data = curl_exec($ch);
curl_close($ch);
echo $data;
?>
What I tested so far:
You can use the game "RUST" as an example: https://store.steampowered.com/app/252490/
Page redirects to age check: https://store.steampowered.com/agecheck/app/252490/
I saw that the cookie set uses other names now ("wants_mature_content" instead of "mature_content" in the JavaScript), but even after changing the PHP to use the new name, it doesn't work.
JavaScript code from Steam page:
function HideAgeGate( )
{
var bHideAll = false;
console.log(bHideAll);
var strCookiePath = bHideAll ? '/' : "\/app\/252490";
V_SetCookie( 'wants_mature_content', 1, 365, strCookiePath );
document.location = "https:\/\/store.steampowered.com\/app\/252490\/Rust\/?snr=";
}
Edit: I also found the function "V_SetCookie" ... in https://store.akamai.steamstatic.com/public/shared/javascript/shared_global.js ... that is called by the code above:
function V_SetCookie( strCookieName, strValue, expiryInDays, path )
{
if ( !path )
path = '/';
var strDate = '';
if( typeof expiryInDays != 'undefined' && expiryInDays )
{
var dateExpires = new Date();
dateExpires.setTime( dateExpires.getTime() + 1000 * 60 * 60 * 24 * expiryInDays );
strDate = '; expires=' + dateExpires.toGMTString();
}
document.cookie = strCookieName + '=' + strValue + strDate + ';path=' + path;
}
Can somebody help please? :-) Thanks!
This is working for me
<?php
$url = 'https://store.steampowered.com/bundle/5699/Grand_Theft_Auto_V_Premium_Edition/';
$ch = curl_init();
curl_setopt($ch, CURLOPT_ENCODING, '');
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, ['Cookie: birthtime=470682001;lastagecheckage=1-0-1985;']);
$html = curl_exec($ch);
curl_close($ch);
var_dump($html);
Ok, got it working.
There are actually 3 cookies: "wants_mature_content", "lastagecheckage" and "birthtime"
Open a page with age check in e.g. Chrome, click on "View Page" and then look for the 3 cookies in chrome (and their content). Set all 3 cookies with PHP's curl and it's working ;-)
Related
I'm trying to make curl follow a redirect but I can't quite get it to work right. I have a string that I want to send as a GET param to a server and get the resulting URL.
Example:
String = Kobold Vermin
Url = www.wowhead.com/search?q=Kobold+Worker
If you go to that url it will redirect you to "www.wowhead.com/npc=257". I want curl to return this URL to my PHP code so that i can extract the "npc=257" and use it.
Current code:
function npcID($name) {
$urltopost = "http://www.wowhead.com/search?q=" . $name;
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1");
curl_setopt($ch, CURLOPT_URL, $urltopost);
curl_setopt($ch, CURLOPT_REFERER, "http://www.wowhead.com");
curl_setopt($ch, CURLOPT_HTTPHEADER, Array("Content-Type:application/x-www-form-urlencoded"));
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
return curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
}
This however returns www.wowhead.com/search?q=Kobold+Worker and not www.wowhead.com/npc=257.
I suspect PHP is returning before the external redirect happens. How can I fix this?
To make cURL follow a redirect, use:
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
Erm... I don't think you're actually executing the curl... Try:
curl_exec($ch);
...after setting the options, and before the curl_getinfo() call.
EDIT: If you just want to find out where a page redirects to, I'd use the advice here, and just use Curl to grab the headers and extract the Location: header from them:
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$result = curl_exec($ch);
if (preg_match('~Location: (.*)~i', $result, $match)) {
$location = trim($match[1]);
}
Add this line to curl inizialization
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
and use getinfo before curl_close
$redirectURL = curl_getinfo($ch,CURLINFO_EFFECTIVE_URL );
es:
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_USERAGENT,'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_BINARYTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT ,0);
curl_setopt($ch, CURLOPT_TIMEOUT, 60);
$html = curl_exec($ch);
$redirectURL = curl_getinfo($ch,CURLINFO_EFFECTIVE_URL );
curl_close($ch);
The answer above didn't work for me on one of my servers, something to to with basedir, so I re-hashed it a little. The code below works on all my servers.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
$a = curl_exec($ch);
curl_close( $ch );
// the returned headers
$headers = explode("\n",$a);
// if there is no redirection this will be the final url
$redir = $url;
// loop through the headers and check for a Location: str
$j = count($headers);
for($i = 0; $i < $j; $i++){
// if we find the Location header strip it and fill the redir var
if(strpos($headers[$i],"Location:") !== false){
$redir = trim(str_replace("Location:","",$headers[$i]));
break;
}
}
// do whatever you want with the result
echo $redir;
The chosen answer here is decent but its case sensitive, doesn't protect against relative location: headers (which some sites do) or pages that might actually have the phrase Location: in their content... (which zillow currently does).
A bit sloppy, but a couple quick edits to make this a bit smarter are:
function getOriginalURL($url) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
$result = curl_exec($ch);
$httpStatus = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
// if it's not a redirection (3XX), move along
if ($httpStatus < 300 || $httpStatus >= 400)
return $url;
// look for a location: header to find the target URL
if(preg_match('/location: (.*)/i', $result, $r)) {
$location = trim($r[1]);
// if the location is a relative URL, attempt to make it absolute
if (preg_match('/^\/(.*)/', $location)) {
$urlParts = parse_url($url);
if ($urlParts['scheme'])
$baseURL = $urlParts['scheme'].'://';
if ($urlParts['host'])
$baseURL .= $urlParts['host'];
if ($urlParts['port'])
$baseURL .= ':'.$urlParts['port'];
return $baseURL.$location;
}
return $location;
}
return $url;
}
Note that this still only goes 1 redirection deep. To go deeper, you actually need to get the content and follow the redirects.
Sometimes you need to get HTTP headers but at the same time you don't want return those headers.**
This skeleton takes care of cookies and HTTP redirects using recursion. The main idea here is to avoid return HTTP headers to the client code.
You can build a very strong curl class over it. Add POST functionality, etc.
<?php
class curl {
static private $cookie_file = '';
static private $user_agent = '';
static private $max_redirects = 10;
static private $followlocation_allowed = true;
function __construct()
{
// set a file to store cookies
self::$cookie_file = 'cookies.txt';
// set some general User Agent
self::$user_agent = 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)';
if ( ! file_exists(self::$cookie_file) || ! is_writable(self::$cookie_file))
{
throw new Exception('Cookie file missing or not writable.');
}
// check for PHP settings that unfits
// correct functioning of CURLOPT_FOLLOWLOCATION
if (ini_get('open_basedir') != '' || ini_get('safe_mode') == 'On')
{
self::$followlocation_allowed = false;
}
}
/**
* Main method for GET requests
* #param string $url URI to get
* #return string request's body
*/
static public function get($url)
{
$process = curl_init($url);
self::_set_basic_options($process);
// this function is in charge of output request's body
// so DO NOT include HTTP headers
curl_setopt($process, CURLOPT_HEADER, 0);
if (self::$followlocation_allowed)
{
// if PHP settings allow it use AUTOMATIC REDIRECTION
curl_setopt($process, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($process, CURLOPT_MAXREDIRS, self::$max_redirects);
}
else
{
curl_setopt($process, CURLOPT_FOLLOWLOCATION, false);
}
$return = curl_exec($process);
if ($return === false)
{
throw new Exception('Curl error: ' . curl_error($process));
}
// test for redirection HTTP codes
$code = curl_getinfo($process, CURLINFO_HTTP_CODE);
if ($code == 301 || $code == 302)
{
curl_close($process);
try
{
// go to extract new Location URI
$location = self::_parse_redirection_header($url);
}
catch (Exception $e)
{
throw $e;
}
// IMPORTANT return
return self::get($location);
}
curl_close($process);
return $return;
}
static function _set_basic_options($process)
{
curl_setopt($process, CURLOPT_USERAGENT, self::$user_agent);
curl_setopt($process, CURLOPT_COOKIEFILE, self::$cookie_file);
curl_setopt($process, CURLOPT_COOKIEJAR, self::$cookie_file);
curl_setopt($process, CURLOPT_RETURNTRANSFER, 1);
// curl_setopt($process, CURLOPT_VERBOSE, 1);
// curl_setopt($process, CURLOPT_SSL_VERIFYHOST, false);
// curl_setopt($process, CURLOPT_SSL_VERIFYPEER, false);
}
static function _parse_redirection_header($url)
{
$process = curl_init($url);
self::_set_basic_options($process);
// NOW we need to parse HTTP headers
curl_setopt($process, CURLOPT_HEADER, 1);
$return = curl_exec($process);
if ($return === false)
{
throw new Exception('Curl error: ' . curl_error($process));
}
curl_close($process);
if ( ! preg_match('#Location: (.*)#', $return, $location))
{
throw new Exception('No Location found');
}
if (self::$max_redirects-- <= 0)
{
throw new Exception('Max redirections reached trying to get: ' . $url);
}
return trim($location[1]);
}
}
You can use:
$redirectURL = curl_getinfo($ch,CURLINFO_REDIRECT_URL);
Lot's of regex here, despite the fact i really like them this way might be more stable to me:
$resultCurl=curl_exec($curl); //get curl result
//Optional line if you want to store the http status code
$headerHttpCode=curl_getinfo($curl,CURLINFO_HTTP_CODE);
//let's use dom and xpath
$dom = new \DOMDocument();
libxml_use_internal_errors(true);
$dom->loadHTML($resultCurl, LIBXML_HTML_NODEFDTD);
libxml_use_internal_errors(false);
$xpath = new \DOMXPath($dom);
$head=$xpath->query("/html/body/p/a/#href");
$newUrl=$head[0]->nodeValue;
The location part is a link in the HTML sent by apache. So Xpath is perfect to recover it.
I have a PHP Simple HTML DOM Parser locally in MAMP that pulls information and works well with the Japan version of a website, since I'm located in Japan. However, I would like to pull information from the UK version of the site. What is the simplest way to do this?
I tried the following from the documentation and it didn't work.
$context = array('http' => array('proxy' => '212.82.126.32:80','request_fulluri' => true,),);
$stream = stream_context_create($context);
$html = file_get_html('http://www.supremenewyork.com/shop/new', false, $stream);
I also tried the curl version with modifications as the site has safe mode enabled. That didn't work as well.
function curl_exec_follow(/*resource*/ $ch, /*int*/ &$maxredirect = null) {
$mr = $maxredirect === null ? 5 : intval($maxredirect);
if (ini_get('open_basedir') == '' && ini_get('safe_mode' == 'Off')) {
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $mr > 0);
curl_setopt($ch, CURLOPT_MAXREDIRS, $mr);
} else {
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
if ($mr > 0) {
$newurl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
$rch = curl_copy_handle($ch);
curl_setopt($rch, CURLOPT_HEADER, true);
curl_setopt($rch, CURLOPT_NOBODY, true);
curl_setopt($rch, CURLOPT_FORBID_REUSE, false);
curl_setopt($rch, CURLOPT_RETURNTRANSFER, true);
do {
curl_setopt($rch, CURLOPT_URL, $newurl);
$header = curl_exec($rch);
if (curl_errno($rch)) {
$code = 0;
} else {
$code = curl_getinfo($rch, CURLINFO_HTTP_CODE);
if ($code == 301 || $code == 302) {
preg_match('/Location:(.*?)\n/', $header, $matches);
$newurl = trim(array_pop($matches));
} else {
$code = 0;
}
}
} while ($code && --$mr);
curl_close($rch);
if (!$mr) {
if ($maxredirect === null) {
trigger_error('Too many redirects. When following redirects, libcurl hit the maximum amount.', E_USER_WARNING);
} else {
$maxredirect = 0;
}
return false;
}
curl_setopt($ch, CURLOPT_URL, $newurl);
}
}
return curl_exec($ch);
}
$url = 'http://www.supremenewyork.com/shop/new';
$proxy = '212.82.126.32:80';
$options = array(
CURLOPT_PROXY => $proxy,
CURLOPT_HTTPPROXYTUNNEL => 0,
CURLOPT_REFERER => "http://www.google.com",
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_USERAGENT => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1",
CURLOPT_CONNECTTIMEOUT => 20,
CURLOPT_TIMEOUT => 20,
CURLOPT_MAXREDIRS => 10,
CURLOPT_HEADER => true,
);
$ch = curl_init( $url );
//curl_setopt_array( $ch, $options );
$content = curl_exec_follow( $ch );
$html = new simple_html_dom();
$html->load($content,true,false);
I tried uploading to US and UK servers as well, but that didn't work and it just pulls US data. Some help please?
Curl works whatever safe mode is enable or disable.
Your Curl script is too complex, make it simple and try again.
$content = curl_exec_follow('http://www.supremenewyork.com/shop/new');
$html = new simple_html_dom();
$html->load($content,true,false);
I modified your code, you can try.
// define cookie file path here
define('CRAWLER_COOKIE_FILENAME', 'cookie.txt');
function curl_exec_follow($url) {
$proxy = '212.82.126.32:80';
$agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1';
// Some websites check referrer
$host = parse_url($url, PHP_URL_HOST);
$scheme = parse_url($url, PHP_URL_SCHEME);
$referrer = $scheme . '://' . $host;
$ch = curl_init();
$curl_defaults = array(
CURLOPT_HEADER => 0,
CURLOPT_FOLLOWLOCATION => 1,
CURLOPT_RETURNTRANSFER => 1,
);
curl_setopt_array($ch, $curl_defaults);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_PROXY, $proxy);
curl_setopt($ch, CURLOPT_USERAGENT, $agent);
curl_setopt($ch, CURLOPT_REFERER, $referrer);
if ( !file_exists(CRAWLER_COOKIE_FILENAME) || !is_writable(CRAWLER_COOKIE_FILENAME) ) {
echo 'Cookie file is missing or not writable.';
exit;
}
curl_setopt($ch, CURLOPT_COOKIESESSION, 0);
curl_setopt($ch, CURLOPT_COOKIEFILE, CRAWLER_COOKIE_FILENAME);
curl_setopt($ch, CURLOPT_COOKIEJAR, CRAWLER_COOKIE_FILENAME);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
// allow to crawl https webpages
curl_setopt($ch,CURLOPT_SSL_VERIFYHOST,0);
curl_setopt($ch,CURLOPT_SSL_VERIFYPEER,0);
// the download speed must be at least 1 byte per second
curl_setopt($ch,CURLOPT_LOW_SPEED_LIMIT, 1);
// if the download speed is below 1 byte per second for more than 30 seconds curl will give up
curl_setopt($ch,CURLOPT_LOW_SPEED_TIME, 30);
$content = curl_exec($ch);
if ($ret === FALSE) {
echo curl_error($ch);
}
$code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
if ( $code != '200' ) echo 'http error code: ' . $code;
curl_close($ch);
return $content;
}
I'm trying to make curl follow a redirect but I can't quite get it to work right. I have a string that I want to send as a GET param to a server and get the resulting URL.
Example:
String = Kobold Vermin
Url = www.wowhead.com/search?q=Kobold+Worker
If you go to that url it will redirect you to "www.wowhead.com/npc=257". I want curl to return this URL to my PHP code so that i can extract the "npc=257" and use it.
Current code:
function npcID($name) {
$urltopost = "http://www.wowhead.com/search?q=" . $name;
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1");
curl_setopt($ch, CURLOPT_URL, $urltopost);
curl_setopt($ch, CURLOPT_REFERER, "http://www.wowhead.com");
curl_setopt($ch, CURLOPT_HTTPHEADER, Array("Content-Type:application/x-www-form-urlencoded"));
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
return curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
}
This however returns www.wowhead.com/search?q=Kobold+Worker and not www.wowhead.com/npc=257.
I suspect PHP is returning before the external redirect happens. How can I fix this?
To make cURL follow a redirect, use:
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
Erm... I don't think you're actually executing the curl... Try:
curl_exec($ch);
...after setting the options, and before the curl_getinfo() call.
EDIT: If you just want to find out where a page redirects to, I'd use the advice here, and just use Curl to grab the headers and extract the Location: header from them:
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$result = curl_exec($ch);
if (preg_match('~Location: (.*)~i', $result, $match)) {
$location = trim($match[1]);
}
Add this line to curl inizialization
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
and use getinfo before curl_close
$redirectURL = curl_getinfo($ch,CURLINFO_EFFECTIVE_URL );
es:
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_USERAGENT,'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_BINARYTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT ,0);
curl_setopt($ch, CURLOPT_TIMEOUT, 60);
$html = curl_exec($ch);
$redirectURL = curl_getinfo($ch,CURLINFO_EFFECTIVE_URL );
curl_close($ch);
The answer above didn't work for me on one of my servers, something to to with basedir, so I re-hashed it a little. The code below works on all my servers.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
$a = curl_exec($ch);
curl_close( $ch );
// the returned headers
$headers = explode("\n",$a);
// if there is no redirection this will be the final url
$redir = $url;
// loop through the headers and check for a Location: str
$j = count($headers);
for($i = 0; $i < $j; $i++){
// if we find the Location header strip it and fill the redir var
if(strpos($headers[$i],"Location:") !== false){
$redir = trim(str_replace("Location:","",$headers[$i]));
break;
}
}
// do whatever you want with the result
echo $redir;
The chosen answer here is decent but its case sensitive, doesn't protect against relative location: headers (which some sites do) or pages that might actually have the phrase Location: in their content... (which zillow currently does).
A bit sloppy, but a couple quick edits to make this a bit smarter are:
function getOriginalURL($url) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
$result = curl_exec($ch);
$httpStatus = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
// if it's not a redirection (3XX), move along
if ($httpStatus < 300 || $httpStatus >= 400)
return $url;
// look for a location: header to find the target URL
if(preg_match('/location: (.*)/i', $result, $r)) {
$location = trim($r[1]);
// if the location is a relative URL, attempt to make it absolute
if (preg_match('/^\/(.*)/', $location)) {
$urlParts = parse_url($url);
if ($urlParts['scheme'])
$baseURL = $urlParts['scheme'].'://';
if ($urlParts['host'])
$baseURL .= $urlParts['host'];
if ($urlParts['port'])
$baseURL .= ':'.$urlParts['port'];
return $baseURL.$location;
}
return $location;
}
return $url;
}
Note that this still only goes 1 redirection deep. To go deeper, you actually need to get the content and follow the redirects.
Sometimes you need to get HTTP headers but at the same time you don't want return those headers.**
This skeleton takes care of cookies and HTTP redirects using recursion. The main idea here is to avoid return HTTP headers to the client code.
You can build a very strong curl class over it. Add POST functionality, etc.
<?php
class curl {
static private $cookie_file = '';
static private $user_agent = '';
static private $max_redirects = 10;
static private $followlocation_allowed = true;
function __construct()
{
// set a file to store cookies
self::$cookie_file = 'cookies.txt';
// set some general User Agent
self::$user_agent = 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)';
if ( ! file_exists(self::$cookie_file) || ! is_writable(self::$cookie_file))
{
throw new Exception('Cookie file missing or not writable.');
}
// check for PHP settings that unfits
// correct functioning of CURLOPT_FOLLOWLOCATION
if (ini_get('open_basedir') != '' || ini_get('safe_mode') == 'On')
{
self::$followlocation_allowed = false;
}
}
/**
* Main method for GET requests
* #param string $url URI to get
* #return string request's body
*/
static public function get($url)
{
$process = curl_init($url);
self::_set_basic_options($process);
// this function is in charge of output request's body
// so DO NOT include HTTP headers
curl_setopt($process, CURLOPT_HEADER, 0);
if (self::$followlocation_allowed)
{
// if PHP settings allow it use AUTOMATIC REDIRECTION
curl_setopt($process, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($process, CURLOPT_MAXREDIRS, self::$max_redirects);
}
else
{
curl_setopt($process, CURLOPT_FOLLOWLOCATION, false);
}
$return = curl_exec($process);
if ($return === false)
{
throw new Exception('Curl error: ' . curl_error($process));
}
// test for redirection HTTP codes
$code = curl_getinfo($process, CURLINFO_HTTP_CODE);
if ($code == 301 || $code == 302)
{
curl_close($process);
try
{
// go to extract new Location URI
$location = self::_parse_redirection_header($url);
}
catch (Exception $e)
{
throw $e;
}
// IMPORTANT return
return self::get($location);
}
curl_close($process);
return $return;
}
static function _set_basic_options($process)
{
curl_setopt($process, CURLOPT_USERAGENT, self::$user_agent);
curl_setopt($process, CURLOPT_COOKIEFILE, self::$cookie_file);
curl_setopt($process, CURLOPT_COOKIEJAR, self::$cookie_file);
curl_setopt($process, CURLOPT_RETURNTRANSFER, 1);
// curl_setopt($process, CURLOPT_VERBOSE, 1);
// curl_setopt($process, CURLOPT_SSL_VERIFYHOST, false);
// curl_setopt($process, CURLOPT_SSL_VERIFYPEER, false);
}
static function _parse_redirection_header($url)
{
$process = curl_init($url);
self::_set_basic_options($process);
// NOW we need to parse HTTP headers
curl_setopt($process, CURLOPT_HEADER, 1);
$return = curl_exec($process);
if ($return === false)
{
throw new Exception('Curl error: ' . curl_error($process));
}
curl_close($process);
if ( ! preg_match('#Location: (.*)#', $return, $location))
{
throw new Exception('No Location found');
}
if (self::$max_redirects-- <= 0)
{
throw new Exception('Max redirections reached trying to get: ' . $url);
}
return trim($location[1]);
}
}
You can use:
$redirectURL = curl_getinfo($ch,CURLINFO_REDIRECT_URL);
Lot's of regex here, despite the fact i really like them this way might be more stable to me:
$resultCurl=curl_exec($curl); //get curl result
//Optional line if you want to store the http status code
$headerHttpCode=curl_getinfo($curl,CURLINFO_HTTP_CODE);
//let's use dom and xpath
$dom = new \DOMDocument();
libxml_use_internal_errors(true);
$dom->loadHTML($resultCurl, LIBXML_HTML_NODEFDTD);
libxml_use_internal_errors(false);
$xpath = new \DOMXPath($dom);
$head=$xpath->query("/html/body/p/a/#href");
$newUrl=$head[0]->nodeValue;
The location part is a link in the HTML sent by apache. So Xpath is perfect to recover it.
I am working on saving images from external sites into a folder in my wordpress theme. And I was wondering if its Ok to call curl twice or can it be done with one time.
Example:
$data = get_url('http://www.veoh.com/watch/v19935546Y8hZPgbZ'); // getting the url first curl instance
preg_match('/fullHighResImagePath="(.*?)"/', $data, $thumbnail); // find the image from content
savePhoto($thumbnail, $post->ID); //2nd instance of curl to save the image
function get_url($url) {
$user_agent = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2)";
$ytc = curl_init(); // initialize curl handle
curl_setopt($ytc, CURLOPT_URL, $url); // set url to post to
curl_setopt($ytc, CURLOPT_FAILONERROR, 1); // Fail on errors
curl_setopt($ytc, CURLOPT_FOLLOWLOCATION, 1); // allow redirects
curl_setopt($ytc, CURLOPT_RETURNTRANSFER, 1); // return into a variable
curl_setopt($ytc, CURLOPT_PORT, 80); //Set the port number
curl_setopt($ytc, CURLOPT_TIMEOUT, 15); // times out after 15s
curl_setopt($ytc, CURLOPT_HEADER, 1); // include HTTP headers
curl_setopt($ytc, CURLOPT_USERAGENT, $user_agent);
$source = curl_exec($ytc);
curl_close($ytc);
$data = trim( $source );
return $data;
}
function savePhoto($remoteImage, $isbn) {
$ch = curl_init();
curl_setopt ($ch, CURLOPT_URL, $remoteImage);
curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt ($ch, CURLOPT_CONNECTTIMEOUT, 0);
$fileContents = curl_exec($ch);
curl_close($ch);
if (DIRECTORY_SEPARATOR=='/'){
$absolute_path = dirname(__FILE__).'/';
} else {
$absolute_path = str_replace('\\', '/', dirname(__FILE__)).'/';
}
$newImg = imagecreatefromstring($fileContents);
return imagejpeg($newImg, $absolute_path ."video_images/{$isbn}.jpg",100);
}
Use Worpress function wp_remote_get and let Wordpress handle the calls using curl.
So you can do something like
$data = my_get_remote_content('http://www.veoh.com/watch/v19935546Y8hZPgbZ');
// find the image from content
preg_match('/fullHighResImagePath="(.*?)"/', $data, $thumbnail);
//2nd instance of curl to save the image
savePhoto(my_get_remote_content($thumbnail), $post->ID);
function my_get_remote_content($url) {
$response = wp_remote_get($url,
array(
'headers' => array(
'user-agent' => 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2)'
)
)
);
if( is_wp_error( $response ) ) {
throw new Exception('Error fetching remote content');
} else {
$data = wp_remote_retrieve_body($response);
return $data;
}
}
function savePhoto($fileContents, $isbn) {
if (DIRECTORY_SEPARATOR=='/'){
$absolute_path = dirname(__FILE__).'/';
} else {
$absolute_path = str_replace('\\', '/', dirname(__FILE__)).'/';
}
$newImg = imagecreatefromstring($fileContents);
return imagejpeg($newImg, $absolute_path ."video_images/{$isbn}.jpg",100);
}
I'm trying to make curl follow a redirect but I can't quite get it to work right. I have a string that I want to send as a GET param to a server and get the resulting URL.
Example:
String = Kobold Vermin
Url = www.wowhead.com/search?q=Kobold+Worker
If you go to that url it will redirect you to "www.wowhead.com/npc=257". I want curl to return this URL to my PHP code so that i can extract the "npc=257" and use it.
Current code:
function npcID($name) {
$urltopost = "http://www.wowhead.com/search?q=" . $name;
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1");
curl_setopt($ch, CURLOPT_URL, $urltopost);
curl_setopt($ch, CURLOPT_REFERER, "http://www.wowhead.com");
curl_setopt($ch, CURLOPT_HTTPHEADER, Array("Content-Type:application/x-www-form-urlencoded"));
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
return curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
}
This however returns www.wowhead.com/search?q=Kobold+Worker and not www.wowhead.com/npc=257.
I suspect PHP is returning before the external redirect happens. How can I fix this?
To make cURL follow a redirect, use:
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
Erm... I don't think you're actually executing the curl... Try:
curl_exec($ch);
...after setting the options, and before the curl_getinfo() call.
EDIT: If you just want to find out where a page redirects to, I'd use the advice here, and just use Curl to grab the headers and extract the Location: header from them:
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$result = curl_exec($ch);
if (preg_match('~Location: (.*)~i', $result, $match)) {
$location = trim($match[1]);
}
Add this line to curl inizialization
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
and use getinfo before curl_close
$redirectURL = curl_getinfo($ch,CURLINFO_EFFECTIVE_URL );
es:
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_USERAGENT,'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_BINARYTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT ,0);
curl_setopt($ch, CURLOPT_TIMEOUT, 60);
$html = curl_exec($ch);
$redirectURL = curl_getinfo($ch,CURLINFO_EFFECTIVE_URL );
curl_close($ch);
The answer above didn't work for me on one of my servers, something to to with basedir, so I re-hashed it a little. The code below works on all my servers.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
$a = curl_exec($ch);
curl_close( $ch );
// the returned headers
$headers = explode("\n",$a);
// if there is no redirection this will be the final url
$redir = $url;
// loop through the headers and check for a Location: str
$j = count($headers);
for($i = 0; $i < $j; $i++){
// if we find the Location header strip it and fill the redir var
if(strpos($headers[$i],"Location:") !== false){
$redir = trim(str_replace("Location:","",$headers[$i]));
break;
}
}
// do whatever you want with the result
echo $redir;
The chosen answer here is decent but its case sensitive, doesn't protect against relative location: headers (which some sites do) or pages that might actually have the phrase Location: in their content... (which zillow currently does).
A bit sloppy, but a couple quick edits to make this a bit smarter are:
function getOriginalURL($url) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
$result = curl_exec($ch);
$httpStatus = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
// if it's not a redirection (3XX), move along
if ($httpStatus < 300 || $httpStatus >= 400)
return $url;
// look for a location: header to find the target URL
if(preg_match('/location: (.*)/i', $result, $r)) {
$location = trim($r[1]);
// if the location is a relative URL, attempt to make it absolute
if (preg_match('/^\/(.*)/', $location)) {
$urlParts = parse_url($url);
if ($urlParts['scheme'])
$baseURL = $urlParts['scheme'].'://';
if ($urlParts['host'])
$baseURL .= $urlParts['host'];
if ($urlParts['port'])
$baseURL .= ':'.$urlParts['port'];
return $baseURL.$location;
}
return $location;
}
return $url;
}
Note that this still only goes 1 redirection deep. To go deeper, you actually need to get the content and follow the redirects.
Sometimes you need to get HTTP headers but at the same time you don't want return those headers.**
This skeleton takes care of cookies and HTTP redirects using recursion. The main idea here is to avoid return HTTP headers to the client code.
You can build a very strong curl class over it. Add POST functionality, etc.
<?php
class curl {
static private $cookie_file = '';
static private $user_agent = '';
static private $max_redirects = 10;
static private $followlocation_allowed = true;
function __construct()
{
// set a file to store cookies
self::$cookie_file = 'cookies.txt';
// set some general User Agent
self::$user_agent = 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)';
if ( ! file_exists(self::$cookie_file) || ! is_writable(self::$cookie_file))
{
throw new Exception('Cookie file missing or not writable.');
}
// check for PHP settings that unfits
// correct functioning of CURLOPT_FOLLOWLOCATION
if (ini_get('open_basedir') != '' || ini_get('safe_mode') == 'On')
{
self::$followlocation_allowed = false;
}
}
/**
* Main method for GET requests
* #param string $url URI to get
* #return string request's body
*/
static public function get($url)
{
$process = curl_init($url);
self::_set_basic_options($process);
// this function is in charge of output request's body
// so DO NOT include HTTP headers
curl_setopt($process, CURLOPT_HEADER, 0);
if (self::$followlocation_allowed)
{
// if PHP settings allow it use AUTOMATIC REDIRECTION
curl_setopt($process, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($process, CURLOPT_MAXREDIRS, self::$max_redirects);
}
else
{
curl_setopt($process, CURLOPT_FOLLOWLOCATION, false);
}
$return = curl_exec($process);
if ($return === false)
{
throw new Exception('Curl error: ' . curl_error($process));
}
// test for redirection HTTP codes
$code = curl_getinfo($process, CURLINFO_HTTP_CODE);
if ($code == 301 || $code == 302)
{
curl_close($process);
try
{
// go to extract new Location URI
$location = self::_parse_redirection_header($url);
}
catch (Exception $e)
{
throw $e;
}
// IMPORTANT return
return self::get($location);
}
curl_close($process);
return $return;
}
static function _set_basic_options($process)
{
curl_setopt($process, CURLOPT_USERAGENT, self::$user_agent);
curl_setopt($process, CURLOPT_COOKIEFILE, self::$cookie_file);
curl_setopt($process, CURLOPT_COOKIEJAR, self::$cookie_file);
curl_setopt($process, CURLOPT_RETURNTRANSFER, 1);
// curl_setopt($process, CURLOPT_VERBOSE, 1);
// curl_setopt($process, CURLOPT_SSL_VERIFYHOST, false);
// curl_setopt($process, CURLOPT_SSL_VERIFYPEER, false);
}
static function _parse_redirection_header($url)
{
$process = curl_init($url);
self::_set_basic_options($process);
// NOW we need to parse HTTP headers
curl_setopt($process, CURLOPT_HEADER, 1);
$return = curl_exec($process);
if ($return === false)
{
throw new Exception('Curl error: ' . curl_error($process));
}
curl_close($process);
if ( ! preg_match('#Location: (.*)#', $return, $location))
{
throw new Exception('No Location found');
}
if (self::$max_redirects-- <= 0)
{
throw new Exception('Max redirections reached trying to get: ' . $url);
}
return trim($location[1]);
}
}
You can use:
$redirectURL = curl_getinfo($ch,CURLINFO_REDIRECT_URL);
Lot's of regex here, despite the fact i really like them this way might be more stable to me:
$resultCurl=curl_exec($curl); //get curl result
//Optional line if you want to store the http status code
$headerHttpCode=curl_getinfo($curl,CURLINFO_HTTP_CODE);
//let's use dom and xpath
$dom = new \DOMDocument();
libxml_use_internal_errors(true);
$dom->loadHTML($resultCurl, LIBXML_HTML_NODEFDTD);
libxml_use_internal_errors(false);
$xpath = new \DOMXPath($dom);
$head=$xpath->query("/html/body/p/a/#href");
$newUrl=$head[0]->nodeValue;
The location part is a link in the HTML sent by apache. So Xpath is perfect to recover it.