I want to store the records returned by curl_multi_exec in a variable, but setting CURLOPT_RETURNTRANSFER = TRUE didn't work for me. After some research I added curl_multi_getcontent, which works in the sense that it does put values into the variable, but the problem is that it only stores a few of the results.
// Configure one easy handle for the authenticated stream endpoint.
// $stream_url, $user, $pass and the "print_out_data" callback are expected
// to be defined outside this snippet.
$ch = curl_init();
curl_setopt_array($ch, array(
    CURLOPT_URL => $stream_url,
    CURLOPT_ENCODING => "gzip",
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_HTTPAUTH => CURLAUTH_BASIC,
    CURLOPT_TIMEOUT => 10,
    CURLOPT_USERPWD => $user.":".$pass,
    CURLOPT_WRITEFUNCTION => "print_out_data",
    //CURLOPT_RETURNTRANSFER => true,
    CURLOPT_VERBOSE => true // verbose curl diagnostics; remove to silence
));
$running = null;
$mh = curl_multi_init();
curl_multi_add_handle($mh, $ch);
do {
    // Wait up to one second for activity, then let curl process the transfer.
    curl_multi_select($mh, 1);
    curl_multi_exec($mh, $running);
    $content = curl_multi_getcontent($ch);
    $arr = json_decode($content, true);
    // print_r($arr);
    // isset() keeps these reads quiet when the content is not (yet) a decodable
    // JSON object; the variables are then null, as before.
    $foo = isset($arr['id']) ? $arr['id'] : null;
    $bar = isset($arr['body']) ? $arr['body'] : null;
} while($running > 0);
curl_multi_remove_handle($mh, $ch);
// curl_multi_close() takes the multi handle, not the easy handle; the easy
// handle is released separately with curl_close().
curl_multi_close($mh);
curl_close($ch);
Before do{}while() write
$content = array();
Line
$content = curl_multi_getcontent($ch);
Replace with
$content[] = curl_multi_getcontent($ch);
After your loop write
print_r($content);
Related
I need to scrape some data from a website that requires logging in first. I do manage to log in using curl; here is my login code:
// Step 1: request the login page so the CSRF token can be read from its HTML.
$login = 'https://example.com/login';
$ch = curl_init($login);
// Return the page as a string and keep cookies in COOKIE_FILE
// (a constant that must be defined outside this snippet).
curl_setopt_array($ch, [
CURLOPT_RETURNTRANSFER => true,
CURLOPT_COOKIEJAR => COOKIE_FILE,
CURLOPT_COOKIEFILE => COOKIE_FILE
]);
$response = curl_exec($ch);
// Capture the value attribute of the hidden "csrf" input field.
$re = '/<input type="hidden" name="csrf" value="(.*?)" \/>/m';
preg_match_all($re, $response, $matches, PREG_SET_ORDER, 0);
// Login form fields; the token is the first capture group of the first match.
// NOTE(review): $matches[0][1] is undefined when the pattern finds no match
// (e.g. the request failed or the markup differs) — consider checking first.
$arr = array(
'email' => 'email#example.com',
'password' => 'Password123',
'csrf' => $matches[0][1]
);
// Step 2: reuse the same handle to POST the credentials together with the token.
curl_setopt_array($ch, [
CURLOPT_URL => $login,
CURLOPT_USERAGENT => 'Mozilla/5.0',
CURLOPT_POST => true,
CURLOPT_POSTFIELDS => http_build_query($arr),
CURLOPT_COOKIEJAR => COOKIE_FILE,
CURLOPT_COOKIEFILE => COOKIE_FILE,
CURLOPT_FOLLOWLOCATION => true
]);
// The response to the login POST is not inspected here.
curl_exec($ch);
Now, after logging in, I have to scrape 70-100 pages. I manage to do that with a foreach loop, but it takes forever. Here is my code:
// One POST payload per page to scrape; every entry carries the CSRF token
// captured in $matches by the login snippet.
// (The bare "..." entry below is a placeholder for the elided payloads.)
$arr = [
[
'id' => '1',
'csrf' => $matches[0][1] //same csrf as in login
],[
'id' => '2',
'csrf' => $matches[0][1] //same csrf as in login
],[
...
],[
'id' => '100',
'csrf' => $matches[0][1] //same csrf as in login
]
];
// Submit the payloads one after another on the existing handle $ch.
// Each curl_exec() call blocks until its response has arrived, so the
// requests run strictly in sequence.
foreach($arr as $v){
curl_setopt_array($ch,[
CURLOPT_URL => 'https://example.com/submit',
CURLOPT_USERAGENT => 'Mozilla/5.0',
CURLOPT_POST => true,
CURLOPT_POSTFIELDS => http_build_query($v),
CURLOPT_FOLLOWLOCATION => true
]);
// NOTE(review): CURLOPT_RETURNTRANSFER and the cookie options are not set
// here — presumably still in effect from the login snippet because $ch is
// reused; verify.
$return = curl_exec($ch);
$info = curl_getinfo($ch);
//do something with the returned data
}
But if I try to use multi_curl, I can't keep the login alive and I'm greeted by a 405 http_code.
Is there a solution to use curl for login and multi for scraping? Thank you!
EDIT
Here is the code that I'm using for multi_curl (found here, on Stack Overflow):
/**
 * Run several cURL requests in parallel and collect their responses.
 *
 * @param array $data    Map of id => request. A request is either a URL string
 *                       or an array with a 'url' key and an optional 'post'
 *                       key holding the fields to send (encoded with
 *                       http_build_query()). Redirects are only followed for
 *                       POST requests unless $options says otherwise.
 * @param array $options Extra CURLOPT_* => value pairs applied to every handle
 *                       after the defaults above, so they can override them
 *                       (for example cookie or header settings).
 * @return array Map of id => value of curl_multi_getcontent() for that handle.
 */
function multiRequest($data, $options = array()) {
    // array of curl handles
    $curly = array();
    // data to be returned
    $result = array();
    // multi handle
    $mh = curl_multi_init();

    // create one easy handle per request and attach it to the multi handle
    foreach ($data as $id => $d) {
        $curly[$id] = curl_init();
        $url = (is_array($d) && !empty($d['url'])) ? $d['url'] : $d;
        curl_setopt($curly[$id], CURLOPT_URL, $url);
        curl_setopt($curly[$id], CURLOPT_USERAGENT, 'Mozilla/5.0');
        curl_setopt($curly[$id], CURLOPT_RETURNTRANSFER, 1);

        // post?
        if (is_array($d) && !empty($d['post'])) {
            curl_setopt($curly[$id], CURLOPT_POST, true);
            curl_setopt($curly[$id], CURLOPT_POSTFIELDS, http_build_query($d['post']));
            curl_setopt($curly[$id], CURLOPT_FOLLOWLOCATION, true);
        }

        // extra options?
        if (!empty($options)) {
            curl_setopt_array($curly[$id], $options);
        }

        curl_multi_add_handle($mh, $curly[$id]);
    }

    // Drive the transfers. Sleeping in curl_multi_select() between calls keeps
    // the loop from spinning at full CPU while the sockets are idle.
    $running = null;
    do {
        $status = curl_multi_exec($mh, $running);
        if ($status !== CURLM_OK && $status !== CURLM_CALL_MULTI_PERFORM) {
            // The multi handle itself failed; stop instead of looping forever.
            break;
        }
        if ($running > 0 && curl_multi_select($mh, 1.0) === -1) {
            // select() could not wait on anything; back off briefly.
            usleep(1000);
        }
    } while ($running > 0);

    // get content, then detach and release each handle
    foreach ($curly as $id => $c) {
        $result[$id] = curl_multi_getcontent($c);
        curl_multi_remove_handle($mh, $c);
        curl_close($c);
    }

    // all done
    curl_multi_close($mh);
    return $result;
}
I am trying to fetch the header info from multiple webpages. I tried to do so using single cURL requests using the code shown below :
<?php
// Names to probe; each one becomes the last path segment of the request URL.
$arr = [
    "John", "Mary",
    "William", " Peter",
    "James", "Emma",
    "George", "Elizabeth",
    "Charles", "Margaret",
];

// Settings shared by every request; only the URL differs per iteration.
$commonOptions = [
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_HEADER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_ENCODING => "",
    CURLOPT_SSL_VERIFYPEER => FALSE,
    CURLOPT_AUTOREFERER => true,
    CURLOPT_CONNECTTIMEOUT => 120,
    CURLOPT_TIMEOUT => 120,
    CURLOPT_MAXREDIRS => 10,
];

// A single handle is reused for all requests, which run one after another.
$ch = curl_init();
foreach ($arr as $name) {
    $url = "https://example.com/" . $name;
    $options = [CURLOPT_URL => $url] + $commonOptions;
    curl_setopt_array($ch, $options);

    $response = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    // Anything other than a final 200 status is reported as an error.
    echo $name . ($httpCode != 200 ? " Error<br>" : " Success<br>");
}
curl_close($ch);
?>
But this code seems to take a very long execution time. I searched the internet & found curl_multi_exec which could be used to run multiple cURL requests at a time. So now I use this code :
<?php
// Lift PHP's execution time limit (0 means no limit) for this script.
ini_set('max_execution_time', 0);
// Names passed to multiRequest(), which appends each one to the base URL.
// NOTE(review): " Peter" has a leading space — confirm that is intentional.
$arr = array(
"John", "Mary",
"William", " Peter",
"James", "Emma",
"George", "Elizabeth",
"Charles", "Margaret",
);
/**
 * Request https://example.com/<name> for every entry of $data in parallel.
 *
 * @param array $data Map of id => name; each name is appended to the base URL.
 * @return array Map of id => value of curl_multi_getcontent() for that handle.
 *               Because CURLOPT_HEADER is enabled, each value contains the
 *               response headers followed by the body.
 */
function multiRequest($data) {
    // array of curl handles
    $curly = array();
    // data to be returned
    $result = array();
    // multi handle
    $mh = curl_multi_init();

    // create one easy handle per entry and attach it to the multi handle
    foreach ($data as $id => $d) {
        $curly[$id] = curl_init();
        $url = "https://example.com/".$d;
        $options = array(
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_HEADER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_ENCODING => "",
            // NOTE(review): certificate verification is switched off here;
            // enable it unless there is a specific reason not to.
            CURLOPT_SSL_VERIFYPEER => FALSE,
            CURLOPT_AUTOREFERER => true,
            CURLOPT_CONNECTTIMEOUT => 120,
            CURLOPT_TIMEOUT => 120,
            CURLOPT_MAXREDIRS => 10,
        );
        curl_setopt_array($curly[$id], $options);
        curl_multi_add_handle($mh, $curly[$id]);
    }

    // Drive the transfers. Sleeping in curl_multi_select() between calls keeps
    // the loop from spinning at full CPU while the sockets are idle.
    $running = null;
    do {
        $status = curl_multi_exec($mh, $running);
        if ($status !== CURLM_OK && $status !== CURLM_CALL_MULTI_PERFORM) {
            // The multi handle itself failed; stop instead of looping forever.
            break;
        }
        if ($running > 0 && curl_multi_select($mh, 1.0) === -1) {
            // select() could not wait on anything; back off briefly.
            usleep(1000);
        }
    } while ($running > 0);

    // get content and remove handles
    foreach($curly as $id => $c) {
        $result[$id] = curl_multi_getcontent($c);
        //Code to fetch header info
        curl_multi_remove_handle($mh, $c);
    }
    // all done
    curl_multi_close($mh);
    return $result;
}
// Run all requests; the array of responses returned by multiRequest() is not used here.
multiRequest($arr);
?>
How to fetch multiple header_info from curl_multi_init HTTP request?
This code from your first example:
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
if ( $httpCode != 200 ){
echo $arr[$i]." Error<br>";
} else {
echo $arr[$i]." Success<br>";
}
will work even if the curl handle was executed by curl_multi_exec().
In your second example, replace this code:
// get content and remove handles
foreach($curly as $id => $c) {
$result[$id] = curl_multi_getcontent($c);
//Code to fetch header info
curl_multi_remove_handle($mh, $c);
}
with this:
// Collect every response, report its final URL and status, then detach the handle.
foreach ($curly as $id => $c) {
    $result[$id] = curl_multi_getcontent($c);

    // curl_getinfo() works on handles that were driven by curl_multi_exec() too.
    $url = curl_getinfo($c, CURLINFO_EFFECTIVE_URL);
    $httpCode = curl_getinfo($c, CURLINFO_HTTP_CODE);
    echo $url . ($httpCode != 200 ? " Error<br>" : " Success<br>");

    curl_multi_remove_handle($mh, $c);
}
Can you give me some ideas on how to improve this function so that it handles an unexpected reply — when the server returns output that is not XML, e.g. a simple server error message in HTML — and then retries fetching the XML?
/**
 * Download $url and parse the response body as XML.
 *
 * @param string $url     Address to fetch; redirects are followed.
 * @param int    $timeout Connection timeout in seconds (cast to int).
 * @return SimpleXMLElement|null The parsed document, or null when the
 *                               transfer fails or the body is empty.
 */
function fetch_xml($url, $timeout=15)
{
    $handle = curl_init();
    curl_setopt($handle, CURLOPT_HEADER, 0);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, (int)$timeout);
    curl_setopt($handle, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($handle, CURLOPT_URL, $url);

    $body = curl_exec($handle);
    curl_close($handle);

    // curl_exec() yields false on failure; an empty body is treated the same way.
    if (empty($body)) {
        return null;
    }

    // No guard here: the SimpleXMLElement constructor throws if the body is not valid XML.
    return new SimpleXmlElement($body);
}
You can give this a try. I haven't tested it out.
/**
 * Download $url and parse the body as XML, retrying when the body is not XML.
 *
 * The earlier recursive version passed `$attempts++`, which hands the
 * un-incremented value to the next call, so the counter never advanced and a
 * server that kept answering with non-XML caused unbounded recursion. This
 * version counts attempts in a loop instead.
 *
 * @param string $url          Address to fetch; redirects are followed.
 * @param int    $timeout      Connection timeout in seconds (cast to int).
 * @param int    $max_attempts Highest attempt index allowed, i.e. the number
 *                             of retries after the first try.
 * @param int    $attempts     Attempt index to start from (normally 0).
 * @return SimpleXMLElement|null The parsed document, or null when the
 *                               transfer fails, the body is empty, or every
 *                               attempt returned something that is not XML.
 */
function fetch_xml($url, $timeout = 15, $max_attempts = 5, $attempts = 0)
{
    for (; $attempts <= $max_attempts; $attempts++) {
        $ch = curl_init();
        curl_setopt_array($ch, array(
            CURLOPT_HEADER => 0,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_CONNECTTIMEOUT => (int)$timeout,
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_URL => $url)
        );
        $xml_data = curl_exec($ch);
        curl_close($ch);

        // A failed transfer or empty body is not retried, matching the
        // behaviour of the recursive version.
        if (empty($xml_data)) {
            return NULL;
        }

        // Collect libxml parse errors internally so a bad reply does not emit
        // a warning per error; the previous setting is restored afterwards.
        $previous = libxml_use_internal_errors(true);
        try {
            return new SimpleXMLElement($xml_data);
        } catch (Exception $e) {
            // Body was not well-formed XML: drop the errors and try again.
            libxml_clear_errors();
        } finally {
            libxml_use_internal_errors($previous);
        }
    }

    return NULL;
}
I'm trying to understand how getting an access_token works with CURL.
// OAuth access-token endpoint the request is sent to.
$url = "https://api.com/oauth/access_token";

// Fields for the POST body. The credentials are left blank in this snippet;
// 'code' is taken from the current request's query string.
$access_token_parameters = [
    'client_id' => '',
    'client_secret' => '',
    'grant_type' => '',
    'redirect_uri' => '',
    'code' => $_GET['code'],
];

// POST the fields and capture the response body as a string in $result.
$curl = curl_init($url);
curl_setopt_array($curl, [
    CURLOPT_POST => true,
    CURLOPT_POSTFIELDS => $access_token_parameters,
    CURLOPT_RETURNTRANSFER => 1,
]);
$result = curl_exec($curl);
curl_close($curl);
I'm supposed to get a JSON string with this right?
I tried various things to output the string:
// Debug view: decode the raw response (objects by default) and dump it.
$test = json_decode($result);
print_r($test);

// Decode — not encode — the response. json_encode() would turn the JSON string
// into another string, which cannot be iterated. Passing true as the second
// argument makes json_decode() return associative arrays.
$arr = json_decode($result, true);

// access_token is a top-level key of the response, so no loop is needed.
if (is_array($arr) && isset($arr['access_token'])) {
    echo $arr['access_token'];
} else {
    // The request failed or the body was not the expected JSON object.
    echo 'No access_token in response';
}
Am I doing this wrong?
I believe the correct JSON output should be something like this :
{
"access_token": "fb2e77d.47a0479900504cb3ab4a1f626d174d2d",
"user": {
"id": "1574083",
"username": "snoopdogg",
"full_name": "Snoop Dogg",
"profile_picture": "http://distillery.s3.amazonaws.com/profiles/profile_1574083_75sq_1295469061.jpg"
}
}
But this is not working? I'm trying to get the access_token from the server.
Any help would be appreciated! Thank you
Try using the function below to get the content:
// Fetch the response body for $url with get_web_page(), defined below.
$response = get_web_page($url);
$resArr = array();
// Decode the JSON body. Without a second argument json_decode() returns
// objects rather than associative arrays, and null if the body is not valid JSON.
$resArr = json_decode($response);
//echo"<pre>"; print_r($resArr); echo"</pre>";
/**
 * Fetch a URL with cURL and return the response body.
 *
 * The earlier version also collected the cURL error code, error message,
 * transfer info and HTTP status, but returned only the body; that unused
 * bookkeeping has been removed.
 *
 * @param string $url Address to fetch.
 * @return string|false The response body without headers, or false when the
 *                      transfer fails.
 */
function get_web_page($url) {
    $options = array(
        CURLOPT_RETURNTRANSFER => true, // return the page instead of printing it
        CURLOPT_HEADER => false,        // don't include headers in the result
        CURLOPT_FOLLOWLOCATION => true, // follow redirects
        CURLOPT_ENCODING => "",         // accept any supported compression
        CURLOPT_USERAGENT => "test",    // who am i
        CURLOPT_AUTOREFERER => true,    // set referer on redirect
        CURLOPT_CONNECTTIMEOUT => 120,  // timeout on connect
        CURLOPT_TIMEOUT => 120,         // timeout on response
        CURLOPT_MAXREDIRS => 10,        // stop after 10 redirects
    );

    $ch = curl_init($url);
    curl_setopt_array($ch, $options);
    $content = curl_exec($ch);
    curl_close($ch);

    return $content;
}
I have the following JSON code:
{"username":"user1","password":"123456"}
which I need to pass to a URL, let's say: http://api.mywebsite.com
I'm an extreme php newb, so I've been following a curl tutorial, but here is my current PHP code:
<?php
/**
 * Fetch a URL with cURL and return the transfer details together with the response.
 *
 * @param string $url Address to fetch.
 * @return array The curl_getinfo() array for the transfer, extended with
 *               'errno' (cURL error code), 'errmsg' (cURL error message) and
 *               'content' (headers plus body, or false on failure).
 */
function get_web_page( $url )
{
    $handle = curl_init( $url );
    curl_setopt_array( $handle, [
        CURLOPT_RETURNTRANSFER => true,     // hand the response back as a string
        CURLOPT_HEADER         => true,     // include response headers in the content
        CURLOPT_FOLLOWLOCATION => true,     // follow redirects
        CURLOPT_ENCODING       => "",       // accept any supported compression
        CURLOPT_USERAGENT      => "spider", // user agent sent with the request
        CURLOPT_AUTOREFERER    => true,     // set referer on redirect
        CURLOPT_CONNECTTIMEOUT => 120,      // seconds allowed for connecting
        CURLOPT_TIMEOUT        => 120,      // seconds allowed for the whole transfer
        CURLOPT_MAXREDIRS      => 10,       // give up after 10 redirects
    ] );

    $body = curl_exec( $handle );

    // Start from the transfer info and append the error state and the payload.
    $info = curl_getinfo( $handle );
    $info['errno'] = curl_errno( $handle );
    $info['errmsg'] = curl_error( $handle );
    $info['content'] = $body;

    curl_close( $handle );
    return $info;
}
?>
You might want to look into the CURLOPT_POSTFIELDS and CURLOPT_POST options. These allow you to make a POST request and pass the data in CURLOPT_POSTFIELDS.
Something along the lines of this:
// Form-encoded payload for the request body.
$body = 'bar=1&foo=2&baz=3';

// Prepare a POST to $url that returns the response as a string.
// The handle is only configured here; it still has to be run with curl_exec().
$c = curl_init ($url);
curl_setopt_array ($c, array(
    CURLOPT_POST => true,
    CURLOPT_POSTFIELDS => $body,
    CURLOPT_RETURNTRANSFER => true,
));
When you want to use normal GET Params:
// Turn a JSON object into a URL query string and request the page with it.
$jsonString ='{"username":"user1","password":"123456"}';

// Decode to an associative array; json_decode() returns null for invalid JSON.
$params = json_decode($jsonString, true);

// http_build_query() joins the pairs with "&" and URL-encodes keys and values,
// so characters such as "&", "=" or spaces in the data cannot corrupt the URL.
$query = is_array($params) ? http_build_query($params) : '';

// Only add the "?" separator when there is something to append.
$getParams = ($query === '') ? '' : '?' . $query;

echo $getParams;
get_web_page($url . $getParams);