I have this code to fetch the source code of websites, but some sites use Cloudflare, and for those it is not possible to fetch the source code.
Is there a way to bypass Cloudflare?
// get url of redirect from source url & get url view source
$url = '/';
/**
 * Fetch a URL with cURL and return the response body.
 *
 * Sends a Firefox-like User-Agent, a fixed cookie and the URL itself as Referer.
 * Redirects are followed by cURL when the PHP configuration allows it; otherwise
 * 301/302 responses are followed manually by calling this function again.
 *
 * @param string             $url           URL to request.
 * @param array|string|false $post_paramtrs When truthy the request is sent as POST: arrays are
 *                                          URL-encoded, strings are sent as-is (a string holding a
 *                                          JSON object also gets a JSON Content-Type header).
 * @param array              $curl_opts     Extra cURL options, keyed by constant name
 *                                          (e.g. 'CURLOPT_PROXY') or by the integer constant itself.
 * @param array              $extra         Optional flags. 'return_array' => true makes the function
 *                                          return ['data' => body, 'header' => array, 'info' => array].
 *                                          'redirects_left' (int) limits manual redirect hops (default 10).
 * @return string|array The body, or the array described above. For a final status other than
 *                      200 (and other than an unfollowed 301/302) the body is replaced by an
 *                      "ERRORCODE22 ..." diagnostic string.
 */
function get_remote_data($url, $post_paramtrs=false, $curl_opts=[], $extra=[]){
    $return_array = !empty($extra['return_array']);
    // Hop budget for the manual redirect fallback, so a redirect loop cannot recurse forever.
    $redirects_left = isset($extra['redirects_left']) ? (int)$extra['redirects_left'] : 10;

    $c = curl_init();
    curl_setopt($c, CURLOPT_URL, $url);
    curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
    // If parameters were passed, switch to POST (for a GET request, put the parameters in the URL).
    if($post_paramtrs){
        curl_setopt($c, CURLOPT_POST, TRUE);
        curl_setopt($c, CURLOPT_POSTFIELDS, (is_array($post_paramtrs) ? http_build_query($post_paramtrs) : $post_paramtrs));
    }
    // NOTE(review): certificate verification is switched off here, which makes HTTPS requests
    // open to interception. Kept for backward compatibility; re-enable unless it is truly needed.
    curl_setopt($c, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($c, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($c, CURLOPT_COOKIE, 'CookieName1=Value;');
    $headers = [
        "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:76.0) Gecko/20100101 Firefox/76.0",
        "Pragma: ",
        "Cache-Control: max-age=0",
    ];
    if (!empty($post_paramtrs) && !is_array($post_paramtrs) && is_object(json_decode($post_paramtrs))){
        $headers[] = 'Content-Type: application/json';
        $headers[] = 'Content-Length: '.strlen($post_paramtrs);
    }
    curl_setopt($c, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($c, CURLOPT_MAXREDIRS, 10);
    // If open_basedir (or legacy safe_mode) is set, FOLLOWLOCATION is not used and
    // redirects are handled manually further down.
    $follow_allowed = !(ini_get('open_basedir') || ini_get('safe_mode'));
    if ($follow_allowed){
        curl_setopt($c, CURLOPT_FOLLOWLOCATION, 1);
    }
    curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 9);
    curl_setopt($c, CURLOPT_REFERER, $url);
    curl_setopt($c, CURLOPT_TIMEOUT, 60);
    curl_setopt($c, CURLOPT_AUTOREFERER, true);
    curl_setopt($c, CURLOPT_ENCODING, '');
    curl_setopt($c, CURLOPT_HEADER, $return_array);
    // Apply caller-supplied options last so they can override the defaults above.
    if(!empty($curl_opts)){
        foreach($curl_opts as $key=>$value){
            curl_setopt($c, is_int($key) ? $key : constant($key), $value);
        }
    }

    $data = curl_exec($c);
    $status = curl_getinfo($c);
    curl_close($c);

    // When headers were requested they precede the body in $data; split them off using the
    // header size reported by cURL instead of guessing with a regular expression.
    $header = [];
    $raw_head = '';
    if($return_array && is_string($data)){
        $raw_head = (string)substr($data, 0, $status['header_size']);
        $data = trim((string)substr($data, $status['header_size']));
        // After followed redirects there is one header block per response; keep the last one.
        $blocks = preg_split('/\r\n\r\n/', trim($raw_head));
        $lines = explode("\r\n", (string)end($blocks));
        $header['head_line'] = array_shift($lines);
        foreach($lines as $line){
            $parts = explode(':', $line, 2);
            if(count($parts) === 2){
                $header[trim($parts[0])] = trim($parts[1]);
            }
        }
    }

    if($status['http_code']==301 || $status['http_code']==302) {
        // Only reached with an unfollowed redirect, i.e. when FOLLOWLOCATION could not be used
        // (or MAXREDIRS was exhausted). Find the target URL and fetch it ourselves.
        if (!$follow_allowed){
            $redirURL = '';
            // 1) redirect target reported by cURL
            if(!empty($status['redirect_url'])){
                $redirURL = $status['redirect_url'];
            }
            // 2) Location:/URI: line in the raw headers or the response text
            if(empty($redirURL) && preg_match('/(Location:|URI:)(.*?)(\r|\n)/si', $raw_head."\n".$data, $m) && !empty($m[2])){
                $redirURL = $m[2];
            }
            // 3) href of a 'moved <a href="...">here</a>' link in the body (group 2 is the href)
            if(empty($redirURL) && preg_match('/moved\s\<a(.*?)href\=\"(.*?)\"(.*?)here\<\/a\>/si', $data, $m) && !empty($m[2])){
                $redirURL = $m[2];
            }
            if(!empty($redirURL) && $redirects_left > 0){
                $extra['redirects_left'] = $redirects_left - 1;
                return get_remote_data(trim($redirURL), $post_paramtrs, $curl_opts, $extra);
            }
        }
    }
    // Not a redirect and not "200 OK": return a diagnostic string instead of the body.
    elseif ( $status['http_code'] != 200 ) {
        $data = "ERRORCODE22 with $url<br/><br/>Last status codes:".json_encode($status)."<br/><br/>Last data got:$data";
    }

    return $return_array ? array('data'=>$data, 'header'=>$header, 'info'=>$status) : $data;
}
$view_source = get_remote_data($url, $post_paramtrs=false, $curl_opts=[]);
page appears >>> Please Wait... | Cloudflare
Cloudflare acts as a reverse proxy: your request goes to Cloudflare first and only then to the origin server, so you never access the site directly. Only if the site disables Cloudflare can you reach it directly.
Related
I'm trying to make curl follow a redirect but I can't quite get it to work right. I have a string that I want to send as a GET param to a server and get the resulting URL.
Example:
String = Kobold Vermin
Url = www.wowhead.com/search?q=Kobold+Worker
If you go to that url it will redirect you to "www.wowhead.com/npc=257". I want curl to return this URL to my PHP code so that i can extract the "npc=257" and use it.
Current code:
/**
 * Build the wowhead search URL for $name, configure a cURL handle for it and
 * return the effective URL reported by that handle.
 *
 * Note: no transfer is executed inside this function.
 *
 * @param string $name Search term, appended to the query string as-is.
 * @return string Effective URL as reported by curl_getinfo().
 */
function npcID($name) {
    $searchUrl = "http://www.wowhead.com/search?q=" . $name;

    $handle = curl_init();
    curl_setopt_array($handle, [
        CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1",
        CURLOPT_URL            => $searchUrl,
        CURLOPT_REFERER        => "http://www.wowhead.com",
        CURLOPT_HTTPHEADER     => Array("Content-Type:application/x-www-form-urlencoded"),
        CURLOPT_FOLLOWLOCATION => TRUE,
    ]);

    return curl_getinfo($handle, CURLINFO_EFFECTIVE_URL);
}
This however returns www.wowhead.com/search?q=Kobold+Worker and not www.wowhead.com/npc=257.
I suspect PHP is returning before the external redirect happens. How can I fix this?
To make cURL follow a redirect, use:
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
Erm... I don't think you're actually executing the curl... Try:
curl_exec($ch);
...after setting the options, and before the curl_getinfo() call.
EDIT: If you just want to find out where a page redirects to, I'd use the advice here, and just use Curl to grab the headers and extract the Location: header from them:
// Request $url without following redirects, returning the raw response
// (headers followed by body) as a string.
$ch = curl_init();
curl_setopt_array($ch, [
    CURLOPT_URL            => $url,
    CURLOPT_HEADER         => true,
    CURLOPT_FOLLOWLOCATION => false,
    CURLOPT_RETURNTRANSFER => true,
]);
$result = curl_exec($ch);

// Take the first "Location: ..." occurrence (case-insensitive) as the redirect target.
if (preg_match('~Location: (.*)~i', $result, $match)) {
    $location = trim($match[1]);
}
Add this line to the cURL initialization:
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
and use getinfo before curl_close
$redirectURL = curl_getinfo($ch,CURLINFO_EFFECTIVE_URL );
es:
// Fetch $url, letting cURL follow redirects, then ask the handle which URL
// it ended up on. curl_getinfo() must be called before curl_close().
$ch = curl_init($url);
curl_setopt_array($ch, [
    CURLOPT_HEADER         => false,
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13',
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_BINARYTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_CONNECTTIMEOUT => 0,
    CURLOPT_TIMEOUT        => 60,
]);

$html = curl_exec($ch);
$redirectURL = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
curl_close($ch);
The answer above didn't work for me on one of my servers (something to do with basedir), so I reworked it a little. The code below works on all my servers.
// Request $url without following redirects; the response headers are
// returned together with the body.
$ch = curl_init();
curl_setopt_array($ch, [
    CURLOPT_URL            => $url,
    CURLOPT_HEADER         => true,
    CURLOPT_FOLLOWLOCATION => false,
    CURLOPT_RETURNTRANSFER => TRUE,
]);
$a = curl_exec($ch);
curl_close( $ch );

// Split the raw response into lines
$headers = explode("\n", $a);

// Without a redirect, the requested URL is already the final one
$redir = $url;

// The first line containing "Location:" supplies the redirect target
foreach ($headers as $line) {
    if (strpos($line, "Location:") === false) {
        continue;
    }
    $redir = trim(str_replace("Location:", "", $line));
    break;
}

// do whatever you want with the result
echo $redir;
The chosen answer here is decent, but it's case-sensitive and doesn't protect against relative Location: headers (which some sites send) or against pages that actually contain the phrase "Location:" in their content (which Zillow currently does).
A bit sloppy, but a couple quick edits to make this a bit smarter are:
/**
 * Resolve a single level of HTTP redirection for $url.
 *
 * Requests $url without following redirects. If the response is a 3xx with a
 * Location header, the target is returned (made absolute when it is
 * host-relative or protocol-relative); otherwise $url is returned unchanged.
 *
 * @param string $url URL to probe.
 * @return string Redirect target, or $url when there is no usable redirect
 *                or the request failed.
 */
function getOriginalURL($url) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    $result = curl_exec($ch);
    $httpStatus = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    curl_close($ch);

    // request failed, or it's not a redirection (3XX): move along
    if ($result === false || $httpStatus < 300 || $httpStatus >= 400) {
        return $url;
    }

    // Look only at the header section so that "Location:" text inside the
    // page body can never be mistaken for the header.
    $headerText = (string) substr($result, 0, $headerSize);
    if (!preg_match('/^location:[ \t]*(.*)$/mi', $headerText, $r)) {
        return $url;
    }
    $location = trim($r[1]);
    if ($location === '') {
        return $url;
    }

    $urlParts = parse_url($url);
    if (!is_array($urlParts)) {
        return $location;
    }

    // protocol-relative ("//host/path"): only the scheme is missing
    if (strpos($location, '//') === 0) {
        return (isset($urlParts['scheme']) ? $urlParts['scheme'] . ':' : '') . $location;
    }

    // host-relative ("/path"): prepend scheme, host and port of the original URL
    if ($location[0] === '/') {
        $baseURL = '';
        if (!empty($urlParts['scheme'])) {
            $baseURL = $urlParts['scheme'] . '://';
        }
        if (!empty($urlParts['host'])) {
            $baseURL .= $urlParts['host'];
        }
        if (!empty($urlParts['port'])) {
            $baseURL .= ':' . $urlParts['port'];
        }
        return $baseURL . $location;
    }

    return $location;
}
Note that this still only goes 1 redirection deep. To go deeper, you actually need to get the content and follow the redirects.
Sometimes you need to read the HTTP headers but at the same time you don't want to return those headers to the caller.
This skeleton takes care of cookies and HTTP redirects using recursion. The main idea here is to avoid returning HTTP headers to the client code.
You can build a very strong curl class on top of it: add POST functionality, etc.
<?php
/**
 * Minimal static cURL wrapper for GET requests with cookie persistence.
 *
 * Redirects are followed by cURL itself when the PHP configuration allows
 * CURLOPT_FOLLOWLOCATION; otherwise 301/302 responses are followed manually,
 * up to $max_redirects hops per top-level get() call.
 */
class curl {
    static private $cookie_file = '';
    static private $user_agent = '';
    static private $max_redirects = 10;
    static private $followlocation_allowed = true;
    // Set once the static configuration below has been applied.
    static private $initialized = false;

    /**
     * Kept for callers that instantiate the class; configuration itself is
     * static and is also applied lazily by the static methods.
     *
     * @throws Exception when the cookie file is missing or not writable
     */
    function __construct()
    {
        self::_init();
    }

    /**
     * Apply the static configuration once, so that static calls such as
     * curl::get() work without an instance having been created first.
     *
     * @throws Exception when the cookie file is missing or not writable
     */
    static private function _init()
    {
        if (self::$initialized)
        {
            return;
        }
        // set a file to store cookies
        self::$cookie_file = 'cookies.txt';
        // set some general User Agent
        self::$user_agent = 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)';
        if ( ! file_exists(self::$cookie_file) || ! is_writable(self::$cookie_file))
        {
            throw new Exception('Cookie file missing or not writable.');
        }
        // check for PHP settings that prevent CURLOPT_FOLLOWLOCATION from working
        if (ini_get('open_basedir') != '' || ini_get('safe_mode') == 'On')
        {
            self::$followlocation_allowed = false;
        }
        self::$initialized = true;
    }

    /**
     * Main method for GET requests
     *
     * @param string   $url            URI to get
     * @param int|null $redirects_left remaining manual redirect hops; null
     *                                 starts a fresh budget of $max_redirects
     * @return string request's body
     * @throws Exception on cURL errors, missing Location header or too many redirects
     */
    static public function get($url, $redirects_left = null)
    {
        self::_init();
        if ($redirects_left === null)
        {
            // per-call budget: the shared static limit is never decremented
            $redirects_left = self::$max_redirects;
        }
        $process = curl_init($url);
        self::_set_basic_options($process);
        // this function is in charge of returning the request's body
        // so DO NOT include HTTP headers
        curl_setopt($process, CURLOPT_HEADER, 0);
        if (self::$followlocation_allowed)
        {
            // if PHP settings allow it use AUTOMATIC REDIRECTION
            curl_setopt($process, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($process, CURLOPT_MAXREDIRS, self::$max_redirects);
        }
        else
        {
            curl_setopt($process, CURLOPT_FOLLOWLOCATION, false);
        }
        $return = curl_exec($process);
        if ($return === false)
        {
            // read the error before releasing the handle
            $error = curl_error($process);
            curl_close($process);
            throw new Exception('Curl error: ' . $error);
        }
        // test for redirection HTTP codes
        $code = curl_getinfo($process, CURLINFO_HTTP_CODE);
        curl_close($process);
        if ($code == 301 || $code == 302)
        {
            if ($redirects_left <= 0)
            {
                throw new Exception('Max redirections reached trying to get: ' . $url);
            }
            // extract the new Location URI, then follow it
            // NOTE(review): a relative Location value is passed on unresolved
            $location = self::_parse_redirection_header($url);
            return self::get($location, $redirects_left - 1);
        }
        return $return;
    }

    /**
     * Options shared by every request: User-Agent, cookie storage and
     * returning the response as a string.
     */
    static function _set_basic_options($process)
    {
        curl_setopt($process, CURLOPT_USERAGENT, self::$user_agent);
        curl_setopt($process, CURLOPT_COOKIEFILE, self::$cookie_file);
        curl_setopt($process, CURLOPT_COOKIEJAR, self::$cookie_file);
        curl_setopt($process, CURLOPT_RETURNTRANSFER, 1);
        // curl_setopt($process, CURLOPT_VERBOSE, 1);
        // curl_setopt($process, CURLOPT_SSL_VERIFYHOST, false);
        // curl_setopt($process, CURLOPT_SSL_VERIFYPEER, false);
    }

    /**
     * Request $url again with headers included and return the value of its
     * Location header.
     *
     * @param string $url URI whose redirect target is wanted
     * @return string trimmed Location header value
     * @throws Exception on cURL errors or when no Location header is present
     */
    static function _parse_redirection_header($url)
    {
        self::_init();
        $process = curl_init($url);
        self::_set_basic_options($process);
        // NOW we need to parse HTTP headers
        curl_setopt($process, CURLOPT_HEADER, 1);
        $return = curl_exec($process);
        if ($return === false)
        {
            $error = curl_error($process);
            curl_close($process);
            throw new Exception('Curl error: ' . $error);
        }
        $header_size = curl_getinfo($process, CURLINFO_HEADER_SIZE);
        curl_close($process);
        // Header names are case-insensitive; search the header section only so
        // that body text can never be mistaken for the header.
        $headers = (string) substr($return, 0, $header_size);
        if ( ! preg_match('#^Location:[ \t]*(.*)$#mi', $headers, $location))
        {
            throw new Exception('No Location found');
        }
        return trim($location[1]);
    }
}
You can use:
$redirectURL = curl_getinfo($ch,CURLINFO_REDIRECT_URL);
Lots of regexes here. Although I really like them, this approach seems more stable to me:
// Execute the prepared handle; the response is expected to be the small HTML
// page that the server sends along with a redirect.
$resultCurl = curl_exec($curl); //get curl result
//Optional line if you want to store the http status code
$headerHttpCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);

// Stays null when the response contains no matching link.
$newUrl = null;
if (is_string($resultCurl) && $resultCurl !== '') {
    //let's use dom and xpath
    $dom = new \DOMDocument();
    // Collect HTML parse warnings internally, then restore the previous mode.
    $previousErrorMode = libxml_use_internal_errors(true);
    $dom->loadHTML($resultCurl, LIBXML_HTML_NODEFDTD);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $xpath = new \DOMXPath($dom);
    // "@href" selects the href attribute of the link inside the first-level paragraph.
    $head = $xpath->query("/html/body/p/a/@href");
    if ($head !== false && $head->length > 0) {
        $newUrl = $head->item(0)->nodeValue;
    }
}
The location part is a link in the HTML sent by apache. So Xpath is perfect to recover it.
Okay, so here is my (maybe quite challenging) problem:
I am trying to login on 9gag.com/login with PHP and cURL, but it is not working. There is no error or anything, even the response looks fine.
The login form is protected with a CSRF token, so the script connects for the first time to get the cookies (session, ts1 etc.) and the associated CSRF token.
After that the script posts the obtained cookies, the CSRF token and also the username/password combination to the server.
You might notice that a few more cookies like "_gat" or "__gads" are posted, too, but their values either do not change, are self-explaining (for example "countryCode") or are only important for Google ad tracking and statistical purposes.
These two requests are exactly like a browser (in my case Safari) would submit them (I used Live HTTP Headers), so the API should not see any difference at all.
Here is my php snippet:
// Placeholder credentials for the login request; replace with real values.
$login_email = 'some_email@email.com';
$login_pass = 'some_password';
/**
 * Return the text found between the first occurrence of $start and the next
 * occurrence of $end after it.
 *
 * @param string $string Text to search.
 * @param string $start  Opening delimiter.
 * @param string $end    Closing delimiter.
 * @return string The enclosed text, or '' when $start is empty or either
 *                delimiter cannot be found.
 */
function get_string_between($string, $start, $end) {
    if ($start === '') {
        return '';
    }
    $from = strpos($string, $start);
    // strict check: a match at offset 0 is valid, only false means "not found"
    if ($from === false) {
        return '';
    }
    $from += strlen($start);
    if ($end === '') {
        return '';
    }
    $to = strpos($string, $end, $from);
    if ($to === false) {
        return '';
    }
    return substr($string, $from, $to - $from);
}
// Step 1: GET the login page to obtain the session cookies and the CSRF token.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'https://9gag.com/login');
curl_setopt($ch, CURLOPT_POST, FALSE);
// Include the response headers in the returned string; the Set-Cookie lines are parsed below.
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_HTTPHEADER, array('Accept-Charset: utf-8',
'Accept-Language: en-us,en;q=0.7,bn-bd;q=0.3',
'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'));
// Cookies received on this handle are written to this file when the handle is closed.
curl_setopt($ch, CURLOPT_COOKIEJAR, getcwd () . '/cookies_9gag.txt' );
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/601.5.17 (KHTML, like Gecko) Version/9.1 Safari/601.5.17");
// Abort the script with the cURL error message when the request yields a falsy result.
$gag_login_page = curl_exec($ch) or die(curl_error($ch));
// Locate the csrftoken input, take a 100-character window starting 9 characters
// after the match, and read the value="..." attribute out of that window.
$pos_csrftoken = strpos($gag_login_page, 'name="csrftoken"');
$part_csrf = substr($gag_login_page, $pos_csrftoken + 9, 100);
$gag_csrf = get_string_between($part_csrf, 'value="', '"');
// Collect the name=value part of every Set-Cookie header line into $cookies.
preg_match_all('/^Set-Cookie:\s*([^;]*)/mi', $gag_login_page, $matches);
$cookies = array();
foreach($matches[1] as $item) {
// parse_str() treats the pair like a query string, so the value ends up URL-decoded.
parse_str($item, $cookie);
$cookies = array_merge($cookies, $cookie);
}
// Cookie values that are sent back by hand in the login request.
$phpsessid = $cookies['PHPSESSID'];
$ts1 = $cookies['ts1'];
$ri = $cookies['____ri'];
$session = $cookies['session'];
//echo 'Cookie: _gat=1;____ri='.$ri.'; ts1='.$ts1.'; sign_up_referer=https%3A%2F%2F9gag.com%2Flogin; countryCode=00; cacheableGrace=1; __gads=ID=d94815096752d058:T=1462990222:S=ALNI_Ma8SdWnOioJeYSCcIdj7p1LdHtgpA; PHPSESSID='.$phpsessid.'; session='.urlencode($session).'; gag_tz=2; _ga=GA1.2.1699141998.1462990222; _pk_id.7.f7ab=d2deed23abfae124.1462990220.1.1462990329.1462990220.; _pk_ses.7.f7ab=*';
// Step 2: POST the credentials together with the CSRF token, re-using the same
// handle and sending the previously received cookies back in a hand-built Cookie header.
curl_setopt($ch, CURLOPT_URL, 'https://9gag.com/login');
curl_setopt($ch, CURLOPT_POSTFIELDS,'csrftoken='.urlencode($gag_csrf).'&next='.'&location=1'.'&username='.urlencode($login_email).'&password='.urlencode($login_pass));
curl_setopt($ch, CURLOPT_POST, TRUE);
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_HTTPHEADER, array('Accept-Charset: utf-8',
'Connection: keep-alive',
'Accept-Language: en-us,en;q=0.7,bn-bd;q=0.3',
'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Cookie: _gat=1; ____ri='.$ri.'; ts1='.$ts1.'; sign_up_referer=https%3A%2F%2F9gag.com%2Flogin; countryCode=00; cacheableGrace=1; __gads=ID=d94815096752d058:T=1462990222:S=ALNI_Ma8SdWnOioJeYSCcIdj7p1LdHtgpA; PHPSESSID='.$phpsessid.'; session='.urlencode($session).'; gag_tz=2; _ga=GA1.2.1699141998.1462990222; _pk_id.7.f7ab=d2deed23abfae124.1462990220.1.1462990329.1462990220.; _pk_ses.7.f7ab=*'
));
// Read cookies from, and write them back to, the same cookie file.
curl_setopt($ch, CURLOPT_COOKIEFILE, getcwd () . '/cookies_9gag.txt' );
curl_setopt($ch, CURLOPT_COOKIEJAR, getcwd () . '/cookies_9gag.txt' );
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/601.5.17 (KHTML, like Gecko) Version/9.1 Safari/601.5.17");
// Abort the script with the cURL error message when the request yields a falsy result.
$gag_logged_in_page = curl_exec($ch) or die(curl_error($ch));
// The login counts as successful when the username shows up in the returned page.
// (The variable needs its "$"; without it PHP looks up an undefined constant.)
$pos = strpos($gag_logged_in_page, 'some_username');
if ($pos === false) {
echo 'Not logged in.';
}
curl_close($ch);
Thank you for reading and your help!
Have a nice day!
PS: I know that the code to obtain the cookies and the CSRF token from the response might be a bit unusual and there are better ways to do it, but it is working (checked already).
PS2: I also know that there is already a JSON API for 9gag, called Infinigag, but it has not the functionality that I need, so I cannot use it. I already contacted the author but he is not responding since weeks, so there is no chance for me to ask him how he managed to solve this problem.
You cannot access any user-related content on 9gag without JavaScript enabled (which you don't have with cURL), although you can scrape the content of any other page. I guess they have some sort of JavaScript detection in place for user-based actions such as voting, uploading, etc.
You may consider automating your requests with Selenium.
Note:
I've tested several scripts including yours and, despite the fact that I could get the cookies with session and token, the curl requests either timed-out or returned empty when trying to access user related content.
I'm currently using cURL to try and get the URL from a redirect for a website scraper. I only need the url from the website. I've researched on stackoverflow and other sites for the past couple days and have been unsuccessful. The code I'm currently using is from this website:
// Send a header-only request to $url without following redirects and print
// the first Location header of the response, if there is one.
$url = "http://www.someredirect.com";
$ch = curl_init($url);
curl_setopt_array($ch, [
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1',
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_HEADER         => true,
    CURLOPT_FOLLOWLOCATION => false,
    CURLOPT_NOBODY         => true,
]);
$response = curl_exec($ch);
// Each "Location:" header line is captured; only the first one is reported.
preg_match_all('/^Location:(.*)$/mi', $response, $matches);
curl_close($ch);

if (!empty($matches[1])) {
    echo trim($matches[1][0]);
} else {
    echo 'No redirect found';
}
Any help would be greatly appreciated!
In your particular case, the server is checking for certain user-agent strings.
When a server checks the user-agent string, it will only respond with a 302 redirect status code when the server sees a "valid" (according to the server) user-agent. Any "invalid" user-agents will not receive the 302 redirect status code response or Location: header.
In your particular case, when the server receives a request from an "invalid" user-agent it responds with a 200 OK status code with no text in the response body.
(Note: in the code below, the actual URLs provided have been replaced with examples.)
Let's say that http://www.example.com's server checks the User-Agent string and that http://www.example.com/product/123/ redirects to http://www.example.org/abc.
In PHP your solution would be:
<?php
// Request $url once, without following redirects, and print the target of
// the first Location header found in the response.
$url = 'http://www.example.com/product/123/';
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (X11; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0"); // Necessary. The server checks for a valid User-Agent.
// Execute exactly once; a second curl_exec() would send the request again.
$response = curl_exec($ch);
curl_close($ch);
$matches = array();
// curl_exec() returns false on failure; only parse an actual response.
if (is_string($response)) {
    preg_match_all('/^Location:(.*)$/mi', $response, $matches);
}
echo !empty($matches[1]) ? trim($matches[1][0]) : 'No redirect found';
And, the output of this script would be: http://www.example.org/abc.
Try using this code:
/**
 * Follow 301/302 redirects by hand and return the last URL reached.
 *
 * A copy of $ch issues header-only requests, starting from the handle's
 * effective URL and moving to each Location target in turn. On success the
 * final URL is also set on $ch (CURLOPT_URL), and CURLOPT_FOLLOWLOCATION is
 * turned off on $ch.
 *
 * @param resource|CurlHandle $ch           Handle whose URL is resolved.
 * @param int|null            $maxredirect  Maximum hops (default 5 when null). When a
 *                                          non-null limit is exhausted it is set to 0.
 * @return string|false Final URL, or false when the redirect limit was hit.
 *                      NOTE(review): relative Location values are used as-is.
 */
function curl_last_url(/*resource*/ $ch, /*int*/ &$maxredirect = null) {
    $mr = $maxredirect === null ? 5 : intval($maxredirect);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
    // Defined up front so a non-positive limit still returns the current URL.
    $newurl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    if ($mr > 0) {
        $rch = curl_copy_handle($ch);
        curl_setopt($rch, CURLOPT_HEADER, true);
        curl_setopt($rch, CURLOPT_NOBODY, true);
        curl_setopt($rch, CURLOPT_FORBID_REUSE, false);
        curl_setopt($rch, CURLOPT_RETURNTRANSFER, true);
        do {
            curl_setopt($rch, CURLOPT_URL, $newurl);
            $header = curl_exec($rch);
            // $code stays 0 (stop) unless this response is a usable redirect.
            $code = 0;
            if (!curl_errno($rch)) {
                $status = curl_getinfo($rch, CURLINFO_HTTP_CODE);
                // Header names are case-insensitive; a redirect without a
                // Location value ends the chain at the current URL.
                if (($status == 301 || $status == 302)
                    && preg_match('/^Location:[ \t]*(.*)$/mi', (string) $header, $matches)
                    && trim($matches[1]) !== '') {
                    $newurl = trim($matches[1]);
                    $code = $status;
                }
            }
        } while ($code && --$mr);
        curl_close($rch);
        if (!$mr) {
            if ($maxredirect === null) {
                trigger_error('Too many redirects. When following redirects, libcurl hit the maximum amount.', E_USER_WARNING);
            } else {
                $maxredirect = 0;
            }
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $newurl);
    }
    return $newurl;
}
For what I'm trying to do, I use PHP 5 on the CLI with the cURL extension.
I'm trying to download a file from YouTube's server; it works fine with any browser.
The link looks something like this:
http://youtube.com/get_video_info?video_id=VIDEO_ID
Example: http://youtube.com/get_video_info?video_id=9pQxmD6Bhd
When I access this file through my browser, it prompts me with a download box for the file
'get_video_info'; once downloaded, the file contains some data.
The problem is getting this file with cURL; I keep getting this error message:
status=fail&errorcode=2&reason=Invalid+parameters.
This is the code (I tried changing some options, but I'm not familiar with cURL, so I'm stuck):
// Fetch the get_video_info document for a fixed video id and dump the
// response, or raise a warning carrying the cURL error on failure.
$c = curl_init();
curl_setopt_array($c, [
    CURLOPT_URL            => "http://youtube.com/get_video_info?video_id=9pQxmD6Bhd",
    CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_HEADER         => false,
]);
$output = curl_exec($c);
if ($output !== false) {
    var_dump($output);
} else {
    trigger_error('Erreur curl : '.curl_error($c),E_USER_WARNING);
}
curl_close($c);
I tried some other curl_setopt options, such as CURLOPT_TRANSFERTEXT, with no success.
I definitely need help!
Thanks for any answers, and sorry if I did something that doesn't respect the rules here; it's my first post.
EDIT
Here is the code to download youtube video ( .ogg ) with php in cli.
<?php
/*Youtube URL and ID*/
$youtube_video = "http://www.youtube.com/watch?v=Ftud51NhY2I";
$yt_id = explode("=", $youtube_video);
$id = $yt_id[1];
/*
Functions
*/
/**
 * Decode a doubly URL-encoded stream URL and keep only the part before the
 * first "&qual" marker.
 *
 * @param string $raw Doubly raw-URL-encoded value.
 * @return string Decoded URL up to (not including) "&qual", or the whole
 *                decoded string when the marker is absent.
 */
function get_link($raw){
    $decoded = rawurldecode(rawurldecode($raw));
    $cut = strpos($decoded, "&qual");
    if ($cut === false) {
        return $decoded;
    }
    return substr($decoded, 0, $cut);
}
/*
Here we go
Query video token
*/
// NOTE(review): the body of this first request is never used below; it is
// kept as in the original in case the request itself is needed. Confirm.
$c = curl_init();
curl_setopt($c, CURLOPT_URL, $youtube_video);
curl_setopt($c, CURLOPT_RETURNTRANSFER, true);
curl_setopt($c, CURLOPT_HEADER, false);
$output = curl_exec($c);
if ($output === false) {
    trigger_error('Erreur curl : '.curl_error($c),E_USER_WARNING);
}
curl_close($c);

/*
Get Video infos
*/
$c = curl_init();
curl_setopt($c, CURLOPT_URL, "http://youtube.com/get_video_info?video_id=".$id);
curl_setopt($c, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1");
curl_setopt($c, CURLOPT_RETURNTRANSFER, true);
curl_setopt($c, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($c, CURLOPT_HEADER, false);
$output = curl_exec($c);
if ($output === false) {
    trigger_error('Erreur curl : '.curl_error($c),E_USER_WARNING);
    // keep the parsing below working on a string
    $output = '';
}
curl_close($c);

/*Get RAW link*/
$temp = explode("url_encoded_fmt_stream_map=url%3D", $output);
if (!isset($temp[1])) {
    // Without the stream map there is nothing to download.
    exit("No stream URL found in the get_video_info response.\r\n");
}
$url = explode("=", $temp[1]);
$url = get_link($url[0]);

/*Get Video name*/
$temp = explode("title=", $output);
if (isset($temp[1])) {
    $title = explode("&", $temp[1]);
    $title = rawurldecode(rawurldecode($title[0]));
} else {
    // fall back to the video id when the response carries no title
    $title = $id;
}
// replace characters that are awkward in file names with spaces
$replace = array(':', '+', '\\', '/', '"', '<', '>', '|', '(', ')', '\'');
$title = str_replace($replace, ' ',$title);
//echo $title;

/*
Download Video
*/
$path = $title.'.ogg';
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$data = curl_exec($ch);
if ($data === false) {
    $error = curl_error($ch);
    curl_close($ch);
    exit('Erreur curl : '.$error."\r\n");
}
curl_close($ch);
if (file_put_contents($path, $data) === false) {
    exit("Could not write ".$path."\r\n");
}
echo "Done... \r\n";
?>
You get error message because the video_id parameter isn't valid.
Try changing that ID and it should work correctly.
http://www.youtube.com/watch?v=9pQxmD6Bhd - does not exist
YouTube has changed its system. The get_video_info endpoint now only works for the IP address that made the request: when you call it with cURL, YouTube sees your server's IP and generates direct download URLs bound to that IP, so the videos then have to be downloaded from that same server IP.
I'm trying to make curl follow a redirect but I can't quite get it to work right. I have a string that I want to send as a GET param to a server and get the resulting URL.
Example:
String = Kobold Vermin
Url = www.wowhead.com/search?q=Kobold+Worker
If you go to that url it will redirect you to "www.wowhead.com/npc=257". I want curl to return this URL to my PHP code so that i can extract the "npc=257" and use it.
Current code:
/**
 * Build the wowhead search URL for $name, configure a cURL handle for it and
 * return the effective URL reported by that handle.
 *
 * Note: no transfer is executed inside this function.
 *
 * @param string $name Search term, appended to the query string as-is.
 * @return string Effective URL as reported by curl_getinfo().
 */
function npcID($name) {
    $searchUrl = "http://www.wowhead.com/search?q=" . $name;

    $handle = curl_init();
    curl_setopt_array($handle, [
        CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1",
        CURLOPT_URL            => $searchUrl,
        CURLOPT_REFERER        => "http://www.wowhead.com",
        CURLOPT_HTTPHEADER     => Array("Content-Type:application/x-www-form-urlencoded"),
        CURLOPT_FOLLOWLOCATION => TRUE,
    ]);

    return curl_getinfo($handle, CURLINFO_EFFECTIVE_URL);
}
This however returns www.wowhead.com/search?q=Kobold+Worker and not www.wowhead.com/npc=257.
I suspect PHP is returning before the external redirect happens. How can I fix this?
To make cURL follow a redirect, use:
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
Erm... I don't think you're actually executing the curl... Try:
curl_exec($ch);
...after setting the options, and before the curl_getinfo() call.
EDIT: If you just want to find out where a page redirects to, I'd use the advice here, and just use Curl to grab the headers and extract the Location: header from them:
// Request $url without following redirects, returning the raw response
// (headers followed by body) as a string.
$ch = curl_init();
curl_setopt_array($ch, [
    CURLOPT_URL            => $url,
    CURLOPT_HEADER         => true,
    CURLOPT_FOLLOWLOCATION => false,
    CURLOPT_RETURNTRANSFER => true,
]);
$result = curl_exec($ch);

// Take the first "Location: ..." occurrence (case-insensitive) as the redirect target.
if (preg_match('~Location: (.*)~i', $result, $match)) {
    $location = trim($match[1]);
}
Add this line to the cURL initialization:
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
and use getinfo before curl_close
$redirectURL = curl_getinfo($ch,CURLINFO_EFFECTIVE_URL );
es:
// Fetch $url, letting cURL follow redirects, then ask the handle which URL
// it ended up on. curl_getinfo() must be called before curl_close().
$ch = curl_init($url);
curl_setopt_array($ch, [
    CURLOPT_HEADER         => false,
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13',
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_BINARYTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_CONNECTTIMEOUT => 0,
    CURLOPT_TIMEOUT        => 60,
]);

$html = curl_exec($ch);
$redirectURL = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
curl_close($ch);
The answer above didn't work for me on one of my servers (something to do with basedir), so I reworked it a little. The code below works on all my servers.
// Request $url without following redirects; the response headers are
// returned together with the body.
$ch = curl_init();
curl_setopt_array($ch, [
    CURLOPT_URL            => $url,
    CURLOPT_HEADER         => true,
    CURLOPT_FOLLOWLOCATION => false,
    CURLOPT_RETURNTRANSFER => TRUE,
]);
$a = curl_exec($ch);
curl_close( $ch );

// Split the raw response into lines
$headers = explode("\n", $a);

// Without a redirect, the requested URL is already the final one
$redir = $url;

// The first line containing "Location:" supplies the redirect target
foreach ($headers as $line) {
    if (strpos($line, "Location:") === false) {
        continue;
    }
    $redir = trim(str_replace("Location:", "", $line));
    break;
}

// do whatever you want with the result
echo $redir;
The chosen answer here is decent, but it's case-sensitive and doesn't protect against relative Location: headers (which some sites send) or against pages that actually contain the phrase "Location:" in their content (which Zillow currently does).
A bit sloppy, but a couple quick edits to make this a bit smarter are:
/**
 * Resolve a single level of HTTP redirection for $url.
 *
 * Requests $url without following redirects. If the response is a 3xx with a
 * Location header, the target is returned (made absolute when it is
 * host-relative or protocol-relative); otherwise $url is returned unchanged.
 *
 * @param string $url URL to probe.
 * @return string Redirect target, or $url when there is no usable redirect
 *                or the request failed.
 */
function getOriginalURL($url) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    $result = curl_exec($ch);
    $httpStatus = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    curl_close($ch);

    // request failed, or it's not a redirection (3XX): move along
    if ($result === false || $httpStatus < 300 || $httpStatus >= 400) {
        return $url;
    }

    // Look only at the header section so that "Location:" text inside the
    // page body can never be mistaken for the header.
    $headerText = (string) substr($result, 0, $headerSize);
    if (!preg_match('/^location:[ \t]*(.*)$/mi', $headerText, $r)) {
        return $url;
    }
    $location = trim($r[1]);
    if ($location === '') {
        return $url;
    }

    $urlParts = parse_url($url);
    if (!is_array($urlParts)) {
        return $location;
    }

    // protocol-relative ("//host/path"): only the scheme is missing
    if (strpos($location, '//') === 0) {
        return (isset($urlParts['scheme']) ? $urlParts['scheme'] . ':' : '') . $location;
    }

    // host-relative ("/path"): prepend scheme, host and port of the original URL
    if ($location[0] === '/') {
        $baseURL = '';
        if (!empty($urlParts['scheme'])) {
            $baseURL = $urlParts['scheme'] . '://';
        }
        if (!empty($urlParts['host'])) {
            $baseURL .= $urlParts['host'];
        }
        if (!empty($urlParts['port'])) {
            $baseURL .= ':' . $urlParts['port'];
        }
        return $baseURL . $location;
    }

    return $location;
}
Note that this still only goes 1 redirection deep. To go deeper, you actually need to get the content and follow the redirects.
Sometimes you need to read the HTTP headers but at the same time you don't want to return those headers to the caller.
This skeleton takes care of cookies and HTTP redirects using recursion. The main idea here is to avoid returning HTTP headers to the client code.
You can build a very strong curl class on top of it: add POST functionality, etc.
<?php
/**
 * Minimal static cURL wrapper for GET requests with cookie persistence.
 *
 * Redirects are followed by cURL itself when the PHP configuration allows
 * CURLOPT_FOLLOWLOCATION; otherwise 301/302 responses are followed manually,
 * up to $max_redirects hops per top-level get() call.
 */
class curl {
    static private $cookie_file = '';
    static private $user_agent = '';
    static private $max_redirects = 10;
    static private $followlocation_allowed = true;
    // Set once the static configuration below has been applied.
    static private $initialized = false;

    /**
     * Kept for callers that instantiate the class; configuration itself is
     * static and is also applied lazily by the static methods.
     *
     * @throws Exception when the cookie file is missing or not writable
     */
    function __construct()
    {
        self::_init();
    }

    /**
     * Apply the static configuration once, so that static calls such as
     * curl::get() work without an instance having been created first.
     *
     * @throws Exception when the cookie file is missing or not writable
     */
    static private function _init()
    {
        if (self::$initialized)
        {
            return;
        }
        // set a file to store cookies
        self::$cookie_file = 'cookies.txt';
        // set some general User Agent
        self::$user_agent = 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)';
        if ( ! file_exists(self::$cookie_file) || ! is_writable(self::$cookie_file))
        {
            throw new Exception('Cookie file missing or not writable.');
        }
        // check for PHP settings that prevent CURLOPT_FOLLOWLOCATION from working
        if (ini_get('open_basedir') != '' || ini_get('safe_mode') == 'On')
        {
            self::$followlocation_allowed = false;
        }
        self::$initialized = true;
    }

    /**
     * Main method for GET requests
     *
     * @param string   $url            URI to get
     * @param int|null $redirects_left remaining manual redirect hops; null
     *                                 starts a fresh budget of $max_redirects
     * @return string request's body
     * @throws Exception on cURL errors, missing Location header or too many redirects
     */
    static public function get($url, $redirects_left = null)
    {
        self::_init();
        if ($redirects_left === null)
        {
            // per-call budget: the shared static limit is never decremented
            $redirects_left = self::$max_redirects;
        }
        $process = curl_init($url);
        self::_set_basic_options($process);
        // this function is in charge of returning the request's body
        // so DO NOT include HTTP headers
        curl_setopt($process, CURLOPT_HEADER, 0);
        if (self::$followlocation_allowed)
        {
            // if PHP settings allow it use AUTOMATIC REDIRECTION
            curl_setopt($process, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($process, CURLOPT_MAXREDIRS, self::$max_redirects);
        }
        else
        {
            curl_setopt($process, CURLOPT_FOLLOWLOCATION, false);
        }
        $return = curl_exec($process);
        if ($return === false)
        {
            // read the error before releasing the handle
            $error = curl_error($process);
            curl_close($process);
            throw new Exception('Curl error: ' . $error);
        }
        // test for redirection HTTP codes
        $code = curl_getinfo($process, CURLINFO_HTTP_CODE);
        curl_close($process);
        if ($code == 301 || $code == 302)
        {
            if ($redirects_left <= 0)
            {
                throw new Exception('Max redirections reached trying to get: ' . $url);
            }
            // extract the new Location URI, then follow it
            // NOTE(review): a relative Location value is passed on unresolved
            $location = self::_parse_redirection_header($url);
            return self::get($location, $redirects_left - 1);
        }
        return $return;
    }

    /**
     * Options shared by every request: User-Agent, cookie storage and
     * returning the response as a string.
     */
    static function _set_basic_options($process)
    {
        curl_setopt($process, CURLOPT_USERAGENT, self::$user_agent);
        curl_setopt($process, CURLOPT_COOKIEFILE, self::$cookie_file);
        curl_setopt($process, CURLOPT_COOKIEJAR, self::$cookie_file);
        curl_setopt($process, CURLOPT_RETURNTRANSFER, 1);
        // curl_setopt($process, CURLOPT_VERBOSE, 1);
        // curl_setopt($process, CURLOPT_SSL_VERIFYHOST, false);
        // curl_setopt($process, CURLOPT_SSL_VERIFYPEER, false);
    }

    /**
     * Request $url again with headers included and return the value of its
     * Location header.
     *
     * @param string $url URI whose redirect target is wanted
     * @return string trimmed Location header value
     * @throws Exception on cURL errors or when no Location header is present
     */
    static function _parse_redirection_header($url)
    {
        self::_init();
        $process = curl_init($url);
        self::_set_basic_options($process);
        // NOW we need to parse HTTP headers
        curl_setopt($process, CURLOPT_HEADER, 1);
        $return = curl_exec($process);
        if ($return === false)
        {
            $error = curl_error($process);
            curl_close($process);
            throw new Exception('Curl error: ' . $error);
        }
        $header_size = curl_getinfo($process, CURLINFO_HEADER_SIZE);
        curl_close($process);
        // Header names are case-insensitive; search the header section only so
        // that body text can never be mistaken for the header.
        $headers = (string) substr($return, 0, $header_size);
        if ( ! preg_match('#^Location:[ \t]*(.*)$#mi', $headers, $location))
        {
            throw new Exception('No Location found');
        }
        return trim($location[1]);
    }
}
You can use:
$redirectURL = curl_getinfo($ch,CURLINFO_REDIRECT_URL);
Lots of regexes here. Although I really like them, this approach seems more stable to me:
// Execute the prepared handle; the response is expected to be the small HTML
// page that the server sends along with a redirect.
$resultCurl = curl_exec($curl); //get curl result
//Optional line if you want to store the http status code
$headerHttpCode = curl_getinfo($curl, CURLINFO_HTTP_CODE);

// Stays null when the response contains no matching link.
$newUrl = null;
if (is_string($resultCurl) && $resultCurl !== '') {
    //let's use dom and xpath
    $dom = new \DOMDocument();
    // Collect HTML parse warnings internally, then restore the previous mode.
    $previousErrorMode = libxml_use_internal_errors(true);
    $dom->loadHTML($resultCurl, LIBXML_HTML_NODEFDTD);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $xpath = new \DOMXPath($dom);
    // "@href" selects the href attribute of the link inside the first-level paragraph.
    $head = $xpath->query("/html/body/p/a/@href");
    if ($head !== false && $head->length > 0) {
        $newUrl = $head->item(0)->nodeValue;
    }
}
The location part is a link in the HTML sent by apache. So Xpath is perfect to recover it.