I've heard a lot about PHP's "multi-threading" with cURL (really parallel requests via curl_multi), but I have never tried it and find it a bit tough to understand how it actually works. Could anyone convert this into curl_multi?
$path1 = array("path1", "path2", "path3"); // source URLs (example)
$path2 = array("path1", "path2", "path3"); // target file paths (example)
// current approach: one transfer at a time
$opt = curl_init($path1[0]);
curl_setopt($opt, CURLOPT_RETURNTRANSFER, true);
$content = curl_exec($opt);
curl_close($opt);
file_put_contents($path2[0], $content);
What I actually want to do is download multiple files from the URLs in $path1 into the paths in $path2 using curl_multi.
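For orientation, a minimal sketch of that conversion might look like the following; it assumes $path1 holds source URLs and $path2 holds matching local file paths, as in the example above.
$mh = curl_multi_init();
$handles = array();
foreach ($path1 as $i => $url) {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_multi_add_handle($mh, $ch);
    $handles[$i] = $ch;
}
// Run all transfers, waiting for activity instead of busy-looping.
do {
    curl_multi_exec($mh, $running);
    if ($running > 0) {
        curl_multi_select($mh, 1);
    }
} while ($running > 0);
// Collect each body and write it to the matching target path.
foreach ($handles as $i => $ch) {
    file_put_contents($path2[$i], curl_multi_getcontent($ch));
    curl_multi_remove_handle($mh, $ch);
    curl_close($ch);
}
curl_multi_close($mh);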
This is a nice project to start with:
https://github.com/jmathai/php-multi-curl
I am using curl multi and it is awesome indeed. I use it to send push notifications faster:
https://github.com/Krutarth/FlashSnsPns
The accepted answer above is outdated/wrong, so the correct answer should be upvoted:
http://php.net/manual/en/function.curl-multi-init.php
PHP now supports fetching multiple URLs at the same time.
There is a very good function for this, written up at http://archevery.blogspot.in/2013/07/php-curl-multi-threading.html
This is the function:
function runRequests($url_array, $thread_width = 4) {
$threads = 0;
$master = curl_multi_init();
$curl_opts = array(CURLOPT_RETURNTRANSFER => true,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_MAXREDIRS => 5,
CURLOPT_CONNECTTIMEOUT => 15,
CURLOPT_TIMEOUT => 15);
$results = array();
$count = 0;
foreach($url_array as $url) {
$ch = curl_init();
$curl_opts[CURLOPT_URL] = $url;
curl_setopt_array($ch, $curl_opts);
curl_multi_add_handle($master, $ch); //push URL for single rec send into curl stack
$results[$count] = array("url" => $url, "handle" => $ch);
$threads++;
$count++;
if($threads >= $thread_width) { //start running when stack is full to width
while($threads >= $thread_width) {
usleep(100);
while(($execrun = curl_multi_exec($master, $running)) === -1){}
curl_multi_select($master);
// a request was just completed - find out which one and remove it from stack
while($done = curl_multi_info_read($master)) {
foreach($results as &$res) {
if($res['handle'] == $done['handle']) {
$res['result'] = curl_multi_getcontent($done['handle']);
}
}
curl_multi_remove_handle($master, $done['handle']);
curl_close($done['handle']);
$threads--;
}
}
}
}
do { //finish sending remaining queue items when all have been added to curl
usleep(100);
while(($execrun = curl_multi_exec($master, $running)) === -1){}
curl_multi_select($master);
while($done = curl_multi_info_read($master)) {
foreach($results as &$res) {
if($res['handle'] == $done['handle']) {
$res['result'] = curl_multi_getcontent($done['handle']);
}
}
curl_multi_remove_handle($master, $done['handle']);
curl_close($done['handle']);
$threads--;
}
} while($running > 0);
curl_multi_close($master);
return $results;
}
You can just use it.
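As a hedged usage sketch for the download task above (the array names are illustrative, and runRequests() is the function just shown):
$urls  = array("http://example.com/a", "http://example.com/b"); // source URLs (example)
$paths = array("/tmp/a.html", "/tmp/b.html");                   // matching target files (example)
$results = runRequests($urls, 4);
foreach ($results as $i => $res) {
    if (isset($res['result'])) {
        // $results preserves the order of $urls, so index $i maps to $paths[$i]
        file_put_contents($paths[$i], $res['result']);
    }
}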
I am writing an interface in which I must make 4 HTTP requests to get some information.
I implemented the interface in 2 ways:
using sequential file_get_contents.
using multi curl.
I benchmarked the two versions with JMeter. The results show that multi curl is much better than sequential file_get_contents when there is only 1 JMeter thread making requests, but much worse with 100 threads.
The question is: what could cause the poor performance of multi curl?
My multi curl code is as below:
$curl_handle_arr = array ();
$master = curl_multi_init();
foreach ( $call_url_arr as $key => $url )
{
$curl_handle = curl_init( $url );
$curl_handle_arr [$key] = $curl_handle;
curl_setopt( $curl_handle , CURLOPT_RETURNTRANSFER , true );
curl_setopt( $curl_handle , CURLOPT_POST , true );
curl_setopt( $curl_handle , CURLOPT_POSTFIELDS , http_build_query( $params_arr [$key] ) );
curl_multi_add_handle( $master , $curl_handle );
}
$running = null;
$mrc = null;
do
{
$mrc = curl_multi_exec( $master , $running );
}
while ( $mrc == CURLM_CALL_MULTI_PERFORM );
while ( $running && $mrc == CURLM_OK )
{
if (curl_multi_select( $master ) != - 1)
{
do
{
$mrc = curl_multi_exec( $master , $running );
}
while ( $mrc == CURLM_CALL_MULTI_PERFORM );
}
}
foreach ( $call_url_arr as $key => $url )
{
$curl_handle = $curl_handle_arr [$key];
if (curl_error( $curl_handle ) == '')
{
$result_str_arr [$key] = curl_multi_getcontent( $curl_handle );
}
curl_multi_remove_handle( $master , $curl_handle );
}
curl_multi_close( $master );
1. Simple optimization
You should sleep about 2500 microseconds if curl_multi_select fails.
Actually, it definitely fails sometimes on each execution.
Without sleeping, your CPU resources get occupied by lots of while (true) { } loops.
If you don't need to do anything as soon as some (not all) of the requests have finished,
you should make the maximum timeout seconds larger.
Your code is written for old versions of libcurl. As of libcurl 7.20.0,
the state CURLM_CALL_MULTI_PERFORM no longer appears.
So, the following code
$running = null;
$mrc = null;
do
{
$mrc = curl_multi_exec( $master , $running );
}
while ( $mrc == CURLM_CALL_MULTI_PERFORM );
while ( $running && $mrc == CURLM_OK )
{
if (curl_multi_select( $master ) != - 1)
{
do
{
$mrc = curl_multi_exec( $master , $running );
}
while ( $mrc == CURLM_CALL_MULTI_PERFORM );
}
}
should be
curl_multi_exec($master, $running);
do
{
if (curl_multi_select($master, 99) === -1)
{
usleep(2500);
continue;
}
curl_multi_exec($master, $running);
} while ($running);
Note
The timeout value of curl_multi_select should be tuned only if you want to do something like...
curl_multi_exec($master, $running);
do
{
if (curl_multi_select($master, $TIMEOUT) === -1)
{
usleep(2500);
continue;
}
curl_multi_exec($master, $running);
while ($info = curl_multi_info_read($master))
{
/* Do something with $info */
}
} while ($running);
Otherwise, the value should be extremely large.
(However, PHP_INT_MAX is too large; libcurl treats it as an invalid value.)
2. Easy experiment in one PHP process
I tested using my parallel cURL executor library: mpyw/co
(The preposition "for" in the function names is improper and should be "by"; sorry for my poor English xD)
<?php
require 'vendor/autoload.php';
use mpyw\Co\Co;
function four_sequencial_requests_for_one_hundread_people()
{
for ($i = 0; $i < 100; ++$i) {
$tasks[] = function () use ($i) {
$ch = curl_init();
curl_setopt_array($ch, [
CURLOPT_URL => 'example.com',
CURLOPT_FORBID_REUSE => true,
CURLOPT_RETURNTRANSFER => true,
]);
for ($j = 0; $j < 4; ++$j) {
yield $ch;
}
};
}
$start = microtime(true);
yield $tasks;
$end = microtime(true);
printf("Time of %s: %.2f sec\n", __FUNCTION__, $end - $start);
}
function requests_for_four_hundreds_people()
{
for ($i = 0; $i < 400; ++$i) {
$tasks[] = function () use ($i) {
$ch = curl_init();
curl_setopt_array($ch, [
CURLOPT_URL => 'example.com',
CURLOPT_FORBID_REUSE => true,
CURLOPT_RETURNTRANSFER => true,
]);
yield $ch;
};
}
$start = microtime(true);
yield $tasks;
$end = microtime(true);
printf("Time of %s: %.2f sec\n", __FUNCTION__, $end - $start);
}
Co::wait(four_sequencial_requests_for_one_hundread_people(), [
'concurrency' => 0, // Zero means unlimited
]);
Co::wait(requests_for_four_hundreds_people(), [
'concurrency' => 0, // Zero means unlimited
]);
I tried five times, and I also tried the two functions in reverse order (the 3rd run was kicked xD).
These results show that too many concurrent TCP connections actually decrease throughput.
3. Advanced optimization
3-A. For different destinations
If you want to optimize for both few and many concurrent requests, the following dirty solution may help you.
Share the number of requesters using apcu_add / apcu_fetch / apcu_delete.
Switch methods (sequential or parallel) based on the current value, as sketched below.
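A minimal sketch of that idea, assuming the APCu extension is available; the key name, the threshold, and the fetch_sequential()/fetch_parallel() helpers are all placeholders for the two code paths discussed above.
$key = 'active_requesters';              // arbitrary APCu key
apcu_add($key, 0);                       // create the counter if it does not exist yet
$active = apcu_inc($key);                // register this requester
try {
    if ($active > 50) {                  // arbitrary threshold
        $data = fetch_sequential($urls); // hypothetical: plain file_get_contents loop
    } else {
        $data = fetch_parallel($urls);   // hypothetical: the curl_multi path above
    }
} finally {
    apcu_dec($key);                      // always release this requester's slot
}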
3-B. For the same destinations
CURLMOPT_PIPELINING will help you. This option bundles all HTTP/1.1 connections for the same destination into one TCP connection.
curl_multi_setopt($master, CURLMOPT_PIPELINING, 1);
I have a very simple scraper now that does what I need, but it's very slow: it scrapes 2 pictures in 3 seconds, and I need it to handle at least 1000 pictures in a few seconds.
This is the code I use now:
<?php
require_once('config.php');
//Calling PHasher class file.
include_once('classes/phasher.class.php');
$I = PHasher::Instance();
//Prevent execution timeout.
set_time_limit(0);
//Solving SSL Problem.
$arrContextOptions=array(
"ssl"=>array(
"verify_peer"=>false,
"verify_peer_name"=>false,
),
);
//Check if the database contains hashed pictures or if it's empty, Then start from the latest hashed picture or start from 4.
$check = mysqli_query($con, "SELECT fid FROM images ORDER BY fid DESC LIMIT 1;");
if(mysqli_num_rows($check) > 0){
$max_fid = mysqli_fetch_row($check);
$fid = $max_fid[0]+1;
} else {
$fid = 4;
}
$deletedProfile = "https://z-1-static.xx.fbcdn.net/rsrc.php/v2/yo/r/UlIqmHJn-SK.gif";
//Infinite while loop to fetch profile pictures and save them inside the avatar folder.
$initial = $fid;
while($fid = $initial){
$url = 'https://graph.facebook.com/'.$fid.'/picture?width=378&height=378';
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow the redirects
curl_setopt($ch, CURLOPT_HEADER, false); // no needs to pass the headers to the data stream
curl_setopt($ch, CURLOPT_NOBODY, true); // get the resource without a body
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // accept any server certificate
curl_exec($ch);
// get the last used URL
$lastUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
curl_close($ch);
if($lastUrl == $deletedProfile){
$initial++;
}else{
$imageUrl = file_get_contents($url, false, stream_context_create($arrContextOptions));
$savedImage = dirname(__file__).'/avatar/image.jpg';
file_put_contents($savedImage, $imageUrl);
//Exclude deleted profiles or corrupted pictures.
if(getimagesize($savedImage) > 0 ){
//PHasher class call to hash the images to hexdecimal values or binary values.
$hash = $I->FastHashImage($savedImage);
$hex = $I->HashAsString($hash);
//Store Facebook id and hashed values for the images in hexa values.
mysqli_query($con, "INSERT INTO images(fid, hash) VALUES ('$fid', '$hex')");
$initial++;
} else {
$initial++;
}
}
}
?>
I didn't figure out how to do it, but what I am thinking of now is:
1- Divide the work into 1000 profiles per loop and store them in an array.
$items = array();
for($i=$fid; $i <= $fid+1000; $i++){
$url = 'https://graph.facebook.com/'.$i.'/picture?width=378&height=378';
$items[$i] = array($url);
}
but the results are incorrect; I want to know how to fix the output of the array:
Array ( [28990] => Array ( [0] => https://graph.facebook.com/28990/picture?width=378&height=378 )
[28991] => Array ( [0] => https://graph.facebook.com/28991/picture?width=378&height=378 )
[28992] => Array ( [0] => https://graph.facebook.com/28992/picture?width=378&height=378 )
[28993] => Array ( [0] => https://graph.facebook.com/28993/picture?width=378&height=378 )
[28994] => Array ( [0] => https://graph.facebook.com/28994/picture?width=378&height=378 )
[28995] => Array ( [0] => https://graph.facebook.com/28995/picture?width=378&height=378 )
[28996] => Array ( [0] => https://graph.facebook.com/28996/picture?width=378&height=378 )
[28997] => Array ( [0] => https://graph.facebook.com/28997/picture?width=378&height=378 )
2- Then I want to use the output array inside multi curl, which allows the processing of multiple cURL handles asynchronously.
3- Check each output URL against the deleted-profile URL; if it doesn't match, pass the image to PHasher to be converted to a hash value and store it in the DB. A rough sketch of these steps follows.
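A hedged sketch of those three steps; it fixes the nested-array issue by storing plain URL strings, re-uses $deletedProfile, $I, and $con from the script above, and the curl option values are only an assumption.
// 1- Build a flat batch of URLs ($items[$i] = $url, not array($url)).
$items = array();
for ($i = $fid; $i < $fid + 1000; $i++) {
    $items[$i] = 'https://graph.facebook.com/' . $i . '/picture?width=378&height=378';
}
// 2- Fetch the whole batch with curl_multi instead of one curl_exec per profile.
$mh = curl_multi_init();
$handles = array();
foreach ($items as $id => $url) {
    $ch = curl_init($url);
    curl_setopt_array($ch, array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_SSL_VERIFYPEER => false,
    ));
    curl_multi_add_handle($mh, $ch);
    $handles[$id] = $ch;
}
do {
    curl_multi_exec($mh, $running);
    if ($running > 0) {
        curl_multi_select($mh, 1);
    }
} while ($running > 0);
// 3- Skip deleted profiles, then hash and store the rest the same way the original loop does.
foreach ($handles as $id => $ch) {
    $lastUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    if ($lastUrl != $deletedProfile) {
        $savedImage = dirname(__FILE__) . '/avatar/' . $id . '.jpg';
        file_put_contents($savedImage, curl_multi_getcontent($ch));
        if (getimagesize($savedImage) > 0) {
            $hash = $I->FastHashImage($savedImage);
            $hex  = $I->HashAsString($hash);
            mysqli_query($con, "INSERT INTO images(fid, hash) VALUES ('$id', '$hex')");
        }
    }
    curl_multi_remove_handle($mh, $ch);
    curl_close($ch);
}
curl_multi_close($mh);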
I have just what you need, although I haven't been able to reach that kind of throughput (1000 parallel requests per second).
I forgot where I originally got this, but I am using it to download Reddit content:
class ParallelCurl {
public $max_requests;
public $options;
public $outstanding_requests;
public $multi_handle;
public function __construct($in_max_requests = 10, $in_options = array()) {
$this->max_requests = $in_max_requests;
$this->options = $in_options;
$this->outstanding_requests = array();
$this->multi_handle = curl_multi_init();
}
//Ensure all the requests finish nicely
public function __destruct() {
$this->finishAllRequests();
}
// Sets how many requests can be outstanding at once before we block and wait for one to
// finish before starting the next one
public function setMaxRequests($in_max_requests) {
$this->max_requests = $in_max_requests;
}
// Sets the options to pass to curl, using the format of curl_setopt_array()
public function setOptions($in_options) {
$this->options = $in_options;
}
// Start a fetch from the $url address, calling the $callback function passing the optional
// $user_data value. The callback should accept 3 arguments, the url, curl handle and user
// data, eg on_request_done($url, $ch, $user_data);
public function startRequest($url, $callback, $user_data = array(), $post_fields = null, $headers = null) {
if ($this->max_requests > 0)
$this->waitForOutstandingRequestsToDropBelow($this->max_requests);
$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt_array($ch, $this->options);
curl_setopt($ch, CURLOPT_URL, $url);
if (isset($post_fields)) {
curl_setopt($ch, CURLOPT_POST, TRUE);
curl_setopt($ch, CURLOPT_POSTFIELDS, $post_fields);
}
if (is_array($headers)) {
curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
}
curl_multi_add_handle($this->multi_handle, $ch);
$ch_array_key = (int) $ch;
$this->outstanding_requests[$ch_array_key] = array(
'link_url' => $url,
'callback' => $callback,
'user_data' => $user_data,
);
$this->checkForCompletedRequests();
}
// You *MUST* call this function at the end of your script. It waits for any running requests
// to complete, and calls their callback functions
public function finishAllRequests() {
$this->waitForOutstandingRequestsToDropBelow(1);
}
// Checks to see if any of the outstanding requests have finished
private function checkForCompletedRequests() {
/*
// Call select to see if anything is waiting for us
if (curl_multi_select($this->multi_handle, 0.0) === -1)
return;
// Since something's waiting, give curl a chance to process it
do {
$mrc = curl_multi_exec($this->multi_handle, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
*/
// fix for https://bugs.php.net/bug.php?id=63411
do {
$mrc = curl_multi_exec($this->multi_handle, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
while ($active && $mrc == CURLM_OK) {
if (curl_multi_select($this->multi_handle) != -1) {
do {
$mrc = curl_multi_exec($this->multi_handle, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
} else
return;
}
// Now grab the information about the completed requests
while ($info = curl_multi_info_read($this->multi_handle)) {
$ch = $info['handle'];
$ch_array_key = (int) $ch;
if (!isset($this->outstanding_requests[$ch_array_key])) {
die("Error - handle wasn't found in requests: '$ch' in " .
print_r($this->outstanding_requests, true));
}
$request = $this->outstanding_requests[$ch_array_key];
$url = $request['link_url'];
$content = curl_multi_getcontent($ch);
$callback = $request['callback'];
$user_data = $request['user_data'];
call_user_func($callback, $content, $url, $ch, $user_data);
unset($this->outstanding_requests[$ch_array_key]);
curl_multi_remove_handle($this->multi_handle, $ch);
}
}
// Blocks until there's less than the specified number of requests outstanding
private function waitForOutstandingRequestsToDropBelow($max) {
while (1) {
$this->checkForCompletedRequests();
if (count($this->outstanding_requests) < $max)
break;
usleep(10000);
}
}
}
The way this works is you pass to ParallelCurl::startRequest() a URL and a callback function (could be anonymous), and this queues a download for this URL, then calls the function when the download finishes.
$pcurl = new ParallelCurl(10, array(
CURLOPT_RETURNTRANSFER => 1,
CURLOPT_FOLLOWLOCATION => 1,
CURLOPT_SSL_VERIFYPEER => 1,
));
$pcurl->startRequest($url, function($data) {
// download finished. $data is html or binary, whatever you requested
echo $data;
});
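One hedged addition to that example: per the comment on finishAllRequests() above, the script should end by draining the queue once every URL has been queued. (The callback also receives the URL, the curl handle and the user data after the content, if you need them.)
// Block until every queued download has finished and its callback has fired.
$pcurl->finishAllRequests();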
This code runs fine in the terminal when I run the file as
$php webcrawler.php
However, I am curious what I need to do to make it run on URLs specified on the command line, i.e.
$php webcrawler.php http://samplesite.com
Here is the full code I have so far:
class Ga_track
{
function get_ga_implemented($url)
{
$options = array(
CURLOPT_RETURNTRANSFER => TRUE, // return web page
CURLOPT_HEADER => TRUE, // include the headers in the returned content
CURLOPT_ENCODING => "", // handle all encodings
CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 6.1; WOW64)", // who am i
CURLOPT_SSL_VERIFYHOST => FALSE, //ssl verify host
CURLOPT_SSL_VERIFYPEER => FALSE, //ssl verify peer
CURLOPT_NOBODY => FALSE
);
$ch = curl_init($url);
curl_setopt_array($ch, $options);
//2> Grab content of the url using CURL
$content = curl_exec($ch);
$flag1_trackpage = false; //FLag for the phrase '_trackPageview'
$flag2_ga_js = false; //FLag for the phrase 'ga.js'
// Script Regex
$script_regex = "/<script\b[^>]*>([\s\S]*?)<\/script>/i";
// UA_ID Regex
$ua_regex = "/UA-[0-9]{5,}-[0-9]{1,}/";
preg_match_all($script_regex, $content, $inside_script);
for ($i = 0; $i < count($inside_script[0]); $i++)
{
if (stristr($inside_script[0][$i], "ga.js"))
$flag2_ga_js = TRUE;
if (stristr($inside_script[0][$i], "_trackPageview"))
$flag1_trackpage = TRUE;
}
preg_match_all($ua_regex, $content, $ua_id);
//6> Check whether all 3 word phrases are present or not.
if ($flag2_ga_js && $flag1_trackpage && count($ua_id[0]) > 0)
return ($ua_id);
else
return (NULL);
}
}
$ga_obj = new Ga_track();
$url = "http://www.samplesite.com";
$ua_id = $ga_obj->get_ga_implemented($url);
if ($ua_id == NULL)
{
echo "USING GA: NO\r\n";
}
else
{
echo "USING GA: YES\r\n";
}
So I was able to find a solution for taking the URL from the command line: I have to use the CLI and pass in the args. I am still having some trouble with it reading the content, but that is a separate issue. Here is the updated code:
class track {
function __construct($arg1, $arg2, $arg3) {
if(!$this->isCli()) die("Please use php-cli!");
if (!function_exists('curl_init')) die("Please install cURL!");
$this->hp = $arg1;
$this->rlevel = $arg2;
$this->rmax = $arg3;
}
function isCli() {
return php_sapi_name()==="cli";
}
function getContent() {
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $this->hp);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$content = curl_exec($ch);
curl_close($ch);
return $content;
}
function checkGA($content) {
$flag_ga = false;
$script_regex = "/<script\b[^>]*>([\s\S]*?)<\/script>/i";
preg_match_all($script_regex, $content, $inside_script);
for ($i = 0; $i < count($inside_script[0]); $i++)
{
if (stristr($inside_script[0][$i], "ga.js"))
$flag_ga = TRUE;
}
// return TRUE when ga.js was found in any script tag, NULL otherwise
return $flag_ga ? TRUE : NULL;
}
}
/*
*
* Echo Output
*
*/
$track_obj = new track();
$ga = $track_obj->checkGA($content);
if ($ga == NULL) {
echo "USING GA: NO\r\n";
}
else {
echo "USING GA: YES\r\n";
}
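A hedged sketch of the missing command-line wiring; it assumes the track constructor's three arguments are the URL, a recursion level, and a maximum depth, and the default values here are purely illustrative:
// Usage (assumed): php webcrawler.php http://samplesite.com [level] [max]
if (!isset($argv[1])) {
    die("Usage: php webcrawler.php <url> [level] [max]\n");
}
$url   = $argv[1];
$level = isset($argv[2]) ? (int) $argv[2] : 0; // illustrative default
$max   = isset($argv[3]) ? (int) $argv[3] : 1; // illustrative default
$track_obj = new track($url, $level, $max);
$content   = $track_obj->getContent();
$ga        = $track_obj->checkGA($content);
echo ($ga === NULL) ? "USING GA: NO\r\n" : "USING GA: YES\r\n";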
I am doing a simple app that reads JSON data from 15 different URLs. I have a special need to do this server-side. I am using file_get_contents($url).
Since I am using file_get_contents($url), I wrote a simple script; here it is:
$websites = array(
$url1,
$url2,
$url3,
...
$url15
);
foreach ($websites as $website) {
$data[] = file_get_contents($website);
}
and it has proven to be very slow, because it waits for the first request to finish before starting the next one.
If you mean multi-curl, then something like this might help:
$nodes = array($url1, $url2, $url3);
$node_count = count($nodes);
$curl_arr = array();
$master = curl_multi_init();
for($i = 0; $i < $node_count; $i++)
{
$url =$nodes[$i];
$curl_arr[$i] = curl_init($url);
curl_setopt($curl_arr[$i], CURLOPT_RETURNTRANSFER, true);
curl_multi_add_handle($master, $curl_arr[$i]);
}
do {
curl_multi_exec($master,$running);
} while($running > 0);
for($i = 0; $i < $node_count; $i++)
{
$results[] = curl_multi_getcontent ( $curl_arr[$i] );
}
print_r($results);
I don't particularly like the approach of any of the existing answers:
Timo's code: it may sleep/select() during CURLM_CALL_MULTI_PERFORM, which is wrong, and it may also fail to sleep when ($still_running > 0 && $exec != CURLM_CALL_MULTI_PERFORM), which can make the code spin at 100% CPU usage (of 1 core) for no reason.
Sudhir's code: it does not sleep when $still_running > 0, and it spam-calls the async function curl_multi_exec() until everything has been downloaded, which causes PHP to use 100% CPU (of 1 CPU core) until everything has been downloaded; in other words, it fails to sleep while downloading.
Here's an approach with neither of those issues:
$websites = array(
"http://google.com",
"http://example.org"
// $url2,
// $url3,
// ...
// $url15
);
$mh = curl_multi_init();
foreach ($websites as $website) {
$worker = curl_init($website);
curl_setopt_array($worker, [
CURLOPT_RETURNTRANSFER => 1
]);
curl_multi_add_handle($mh, $worker);
}
for (;;) {
$still_running = null;
do {
$err = curl_multi_exec($mh, $still_running);
} while ($err === CURLM_CALL_MULTI_PERFORM);
if ($err !== CURLM_OK) {
// handle curl multi error?
}
if ($still_running < 1) {
// all downloads completed
break;
}
// some haven't finished downloading, sleep until more data arrives:
curl_multi_select($mh, 1);
}
$results = [];
while (false !== ($info = curl_multi_info_read($mh))) {
if ($info["result"] !== CURLE_OK) {
// handle download error?
}
$results[curl_getinfo($info["handle"], CURLINFO_EFFECTIVE_URL)] = curl_multi_getcontent($info["handle"]);
curl_multi_remove_handle($mh, $info["handle"]);
curl_close($info["handle"]);
}
curl_multi_close($mh);
var_export($results);
Note that an issue shared by all 3 approaches here (my answer, Sudhir's answer, and Timo's answer) is that they open all connections simultaneously: if you have 1,000,000 websites to fetch, these scripts will try to open 1,000,000 connections at the same time. If you only need to download, say, 50 websites at a time, maybe try:
$websites = array(
"http://google.com",
"http://example.org"
// $url2,
// $url3,
// ...
// $url15
);
var_dump(fetch_urls($websites,50));
function fetch_urls(array $urls, int $max_connections, int $timeout_ms = 10000, bool $return_fault_reason = true): array
{
if ($max_connections < 1) {
throw new InvalidArgumentException("max_connections MUST be >=1");
}
foreach ($urls as $key => $foo) {
if (! is_string($foo)) {
throw new \InvalidArgumentException("all urls must be strings!");
}
if (empty($foo)) {
unset($urls[$key]); // ?
}
}
unset($foo);
// DISABLED for benchmarking purposes: $urls = array_unique($urls); // remove duplicates.
$ret = array();
$mh = curl_multi_init();
$workers = array();
$work = function () use (&$ret, &$workers, &$mh, $return_fault_reason) {
// > If an added handle fails very quickly, it may never be counted as a running_handle
while (1) {
do {
$err = curl_multi_exec($mh, $still_running);
} while ($err === CURLM_CALL_MULTI_PERFORM);
if ($still_running < count($workers)) {
// some workers finished, fetch their response and close them
break;
}
$cms = curl_multi_select($mh, 1);
// var_dump('sr: ' . $still_running . " c: " . count($workers)." cms: ".$cms);
}
while (false !== ($info = curl_multi_info_read($mh))) {
// echo "NOT FALSE!";
// var_dump($info);
{
if ($info['msg'] !== CURLMSG_DONE) {
continue;
}
if ($info['result'] !== CURLE_OK) {
if ($return_fault_reason) {
$ret[$workers[(int) $info['handle']]] = print_r(array(
false,
$info['result'],
"curl_exec error " . $info['result'] . ": " . curl_strerror($info['result'])
), true);
}
} elseif (CURLE_OK !== ($err = curl_errno($info['handle']))) {
if ($return_fault_reason) {
$ret[$workers[(int) $info['handle']]] = print_r(array(
false,
$err,
"curl error " . $err . ": " . curl_strerror($err)
), true);
}
} else {
$ret[$workers[(int) $info['handle']]] = curl_multi_getcontent($info['handle']);
}
curl_multi_remove_handle($mh, $info['handle']);
assert(isset($workers[(int) $info['handle']]));
unset($workers[(int) $info['handle']]);
curl_close($info['handle']);
}
}
// echo "NO MORE INFO!";
};
foreach ($urls as $url) {
while (count($workers) >= $max_connections) {
// echo "TOO MANY WORKERS!\n";
$work();
}
$neww = curl_init($url);
if (! $neww) {
trigger_error("curl_init() failed! probably means that max_connections is too high and you ran out of system resources", E_USER_WARNING);
if ($return_fault_reason) {
$ret[$url] = array(
false,
- 1,
"curl_init() failed"
);
}
continue;
}
$workers[(int) $neww] = $url;
curl_setopt_array($neww, array(
CURLOPT_RETURNTRANSFER => 1,
CURLOPT_SSL_VERIFYHOST => 0,
CURLOPT_SSL_VERIFYPEER => 0,
CURLOPT_TIMEOUT_MS => $timeout_ms
));
curl_multi_add_handle($mh, $neww);
// curl_multi_exec($mh, $unused_here); LIKELY TO BE MUCH SLOWER IF DONE IN THIS LOOP: TOO MANY SYSCALLS
}
while (count($workers) > 0) {
// echo "WAITING FOR WORKERS TO BECOME 0!";
// var_dump(count($workers));
$work();
}
curl_multi_close($mh);
return $ret;
}
That will download the entire list, never fetching more than 50 URLs simultaneously.
(But even that approach stores all the results in RAM, so it may still run out of memory; if you want to store the results in a database instead, the curl_multi_getcontent part can be modified to write to a database rather than to a RAM-persistent variable, as sketched below.)
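For instance, a hedged sketch of that database variant; the PDO connection string, table name and column names are placeholders, and $stmt would also need to be added to the $work closure's use() list:
// Placeholder connection; swap in real credentials, database and table.
$pdo  = new PDO('mysql:host=localhost;dbname=scraper', 'user', 'pass');
$stmt = $pdo->prepare('INSERT INTO downloads (url, body) VALUES (?, ?)');
// Inside fetch_urls(), the line
//     $ret[$workers[(int) $info['handle']]] = curl_multi_getcontent($info['handle']);
// could then become:
//     $stmt->execute(array($workers[(int) $info['handle']], curl_multi_getcontent($info['handle'])));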
I would like to provide a more complete example that does not hit the CPU at 100% and does not crash when there's a slight error or something unexpected.
It also shows you how to fetch the headers, the body and the request info, and how to follow redirects manually.
Disclaimer: this code is intended to be extended and implemented into a library or used as a quick starting point, and as such the functions inside it are kept to a minimum.
function mtime(){
return microtime(true);
}
function ptime($prev){
$t = microtime(true) - $prev;
$t = $t * 1000;
return str_pad($t, 20, 0, STR_PAD_RIGHT);
}
// This function exists to add compatibility for CURLM_CALL_MULTI_PERFORM for old curl versions, on modern curl it will only run once and be the equivalent of calling curl_multi_exec
function curl_multi_exec_full($mh, &$still_running) {
// In theory curl_multi_exec should never return CURLM_CALL_MULTI_PERFORM (-1) because it has been deprecated
// In practice it sometimes does
// So imagine that this just runs curl_multi_exec once and returns its value
do {
$state = curl_multi_exec($mh, $still_running);
// curl_multi_select($mh, $timeout) simply blocks for $timeout seconds while curl_multi_exec() returns CURLM_CALL_MULTI_PERFORM
// We add it to prevent CPU 100% usage in case this thing misbehaves (especially for old curl on windows)
} while ($still_running > 0 && $state === CURLM_CALL_MULTI_PERFORM && curl_multi_select($mh, 0.1));
return $state;
}
// This function replaces curl_multi_select and makes the name make more sense, since all we're doing is waiting for curl, it also forces a minimum sleep time between requests to avoid excessive CPU usage.
function curl_multi_wait($mh, $minTime = 0.001, $maxTime = 1){
$umin = $minTime*1000000;
$start_time = microtime(true);
// it sleeps until there is some activity on any of the descriptors (curl files)
// it returns the number of descriptors (curl files that can have activity)
$num_descriptors = curl_multi_select($mh, $maxTime);
// if the system returns -1, it means that the wait time is unknown, and we have to decide the minimum time to wait
// but our `$timespan` check below catches this edge case, so this `if` isn't really necessary
if($num_descriptors === -1){
usleep($umin);
}
$timespan = (microtime(true) - $start_time);
// This thing runs very fast, up to 1000 times for 2 urls, which wastes a lot of CPU
// This will reduce the runs so that each interval is separated by at least minTime
if($timespan < $minTime){ // compare in seconds; $umin is in microseconds
usleep($umin - $timespan * 1000000);
//print "sleep for ".($umin - $timespan * 1000000).PHP_EOL;
}
}
$handles = [
[
CURLOPT_URL=>"http://example.com/",
CURLOPT_HEADER=>false,
CURLOPT_RETURNTRANSFER=>true,
CURLOPT_FOLLOWLOCATION=>false,
],
[
CURLOPT_URL=>"http://www.php.net",
CURLOPT_HEADER=>false,
CURLOPT_RETURNTRANSFER=>true,
CURLOPT_FOLLOWLOCATION=>false,
// this function is called by curl for each header received
// This complies with RFC822 and RFC2616, please do not suggest edits to make use of the mb_ string functions, it is incorrect!
// https://stackoverflow.com/a/41135574
CURLOPT_HEADERFUNCTION=>function($ch, $header)
{
print "header from http://www.php.net: ".$header;
//$header = explode(':', $header, 2);
//if (count($header) < 2){ // ignore invalid headers
// return $len;
//}
//$headers[strtolower(trim($header[0]))][] = trim($header[1]);
return strlen($header);
}
]
];
//create the multiple cURL handle
$mh = curl_multi_init();
$chandles = [];
foreach($handles as $opts) {
// create cURL resources
$ch = curl_init();
// set URL and other appropriate options
curl_setopt_array($ch, $opts);
// add the handle
curl_multi_add_handle($mh, $ch);
$chandles[] = $ch;
}
//execute the multi handle
$prevRunning = null;
$count = 0;
do {
$time = mtime();
// $running contains the number of currently running requests
$status = curl_multi_exec_full($mh, $running);
$count++;
print ptime($time).": curl_multi_exec status=$status running $running".PHP_EOL;
// One less is running, meaning one has finished
if($running < $prevRunning){
print ptime($time).": curl_multi_info_read".PHP_EOL;
// msg: The CURLMSG_DONE constant. Other return values are currently not available.
// result: One of the CURLE_* constants. If everything is OK, the CURLE_OK will be the result.
// handle: Resource of type curl indicates the handle which it concerns.
while ($read = curl_multi_info_read($mh, $msgs_in_queue)) {
$info = curl_getinfo($read['handle']);
if($read['result'] !== CURLE_OK){
// handle the error somehow
print "Error: ".$info['url'].PHP_EOL;
}
if($read['result'] === CURLE_OK){
/*
// This will automatically follow the redirect and still give you control over the previous page
// TODO: max redirect checks and redirect timeouts
if(isset($info['redirect_url']) && trim($info['redirect_url'])!==''){
print "running redirect: ".$info['redirect_url'].PHP_EOL;
$ch3 = curl_init();
curl_setopt($ch3, CURLOPT_URL, $info['redirect_url']);
curl_setopt($ch3, CURLOPT_HEADER, 0);
curl_setopt($ch3, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch3, CURLOPT_FOLLOWLOCATION, 0);
curl_multi_add_handle($mh,$ch3);
}
*/
print_r($info);
$body = curl_multi_getcontent($read['handle']);
print $body;
}
}
}
// Still running? keep waiting...
if ($running > 0) {
curl_multi_wait($mh);
}
$prevRunning = $running;
} while ($running > 0 && $status == CURLM_OK);
//close the handles
foreach($chandles as $ch){
curl_multi_remove_handle($mh, $ch);
}
curl_multi_close($mh);
print $count.PHP_EOL;
So the gist is that I need to POST an XML query to a gateway page to receive an XML response, which I parse later. There can be anywhere from 3-60 queries to this web service, and unfortunately I have to run a simple loop right now and do them one at a time. On the response side, I will only need 1 (or a max of 5) of the lines in the response; line 2 is the first line I need, containing image data. So I'd like the ability to select which lines I am reading in, if at all possible.
I created a simple "read in" routine, as I said, out of a basic for loop. Here's the code that I am currently using and would like to revise.
$part1 = 'XML Beginning'; $part2 = 'XML End';
$posts = array( 0 => 'SC-010052214', 1 => 'SC-000032972', 2 => 'SC-012535460', 3 => 'SC-011257289', 4 => 'SC-010134078' );
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://example.com/index.php');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt ($ch, CURLOPT_POST, 1);
$count = count($posts);
for($i=0;$i<$count;$i++) {
curl_setopt ($ch, CURLOPT_POSTFIELDS, "payload=$part1{$posts[$i]}$part2");
$return[] = curl_exec ($ch);
}
curl_close ($ch);
print_r($return);
Restrictions: I cannot use ?post=$data0&post=$data1&post=$data3 unfortunately, so I need a better solution. Other than that, I'd like to see what kinds of improvements can be made here.
Maybe http://php.net/manual/en/function.curl-multi-init.php helps you.
Since a quick response can only cover so much, here is an example:
<?php
function m_curl($input) {
// compile queries for usable locations
foreach($input['content'] as $pos=>$item) {
$query = '<childDetailQuery><request><query-replacement>';
$query .= "<item_number>{$item}</item_number>";
$query .= (isset($input['story']) && $input['story'] != NULL)
? "<story_type>".$input['story']."</story_type>"
: '<story_type>SHORT</story_type>';
$query .= (isset($input['party']) && $input['party'] != NULL)
? "<party_number>".$input['party']."</party_number>"
: '';
$query .= "</query-replacement><latency-tolerance>NONE</latency-tolerance>";
$query .= '</request></childDetailQuery>';
$queries[] = $query;
unset($query);
}
// make sure the rolling window isn't greater than the # of urls
$limit = 10;
$limit = (sizeof($queries) < $limit) ? sizeof($queries) : $limit;
$master = curl_multi_init();
$curl_arr = array();
// add additional curl options here
$std_options = array(
CURLOPT_URL => $input['location'],   // the gateway URL each payload is POSTed to
CURLOPT_RETURNTRANSFER => 1,
CURLOPT_FOLLOWLOCATION => 1,
CURLOPT_MAXREDIRS => 0,
);
$options = (!empty($input['coptions'])) ? ($std_options + $input['coptions']) : $std_options;
// start the first batch of requests
for ($i = 0; $i < $limit; $i++) {
$ch = curl_init();
$options[CURLOPT_POSTFIELDS] = "payload=".$queries[$i];
curl_setopt_array($ch,$options);
curl_multi_add_handle($master, $ch);
}
do {
while(($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM);
if($execrun != CURLM_OK) {
echo 'Curl Error'; break;
}
// a request was just completed -- find out which one
while($done = curl_multi_info_read($master)) {
$info = curl_getinfo($done['handle']);
if ($info['http_code'] == 200) {
$output = curl_multi_getcontent($done['handle']);
// request successful. process output using the callback function.
parse_returns($output);
// start a new request (it's important to do this before removing the old one)
if ($i < sizeof($queries)) { // only queue another request while payloads remain
$ch = curl_init();
$options[CURLOPT_POSTFIELDS] = "payload=".$queries[$i++]; // increment i
curl_setopt_array($ch,$options);
curl_multi_add_handle($master, $ch);
}
// remove the curl handle that just completed
curl_multi_remove_handle($master, $done['handle']);
} else {
echo 'Failed on:'; var_dump($info);
echo 'With options:'; var_dump($options);
// request failed. add error handling.
}
}
} while ($running);
curl_multi_close($master);
return false;
}
function parse_returns($data) {
print_r($data);
}
// set query numbers
$data = array(
0 => 'SC-010052214',
1 => 'SC-000032972',
2 => 'SC-012535460',
3 => 'SC-011257289',
4 => 'SC-010134078'
);
// set options array
$options = array(
'location' => 'http://ibudev.wvus.org/websvc/actions/wvsMessageRouter.php',
'readline' => 2,
'coptions' => NULL,
'content' => $data,
'story' => 'FULL',
'party' => NULL,
);
m_curl($options);
?>