I'm currently using PHP cURL to browse over 500 web pages a day with cookies.
I have to check each page to ensure that the account is still logged in and the pages are being viewed as a member, not a guest.
The script takes an hour or two to complete as it sleeps in between views.
I just want to know if there's anything I can do to reduce the load this script puts on the local server. I've made sure to clear variables at the end of each loop, but is there anything I'm missing that would help?
Are there any cURL settings that would help?
$i = 0;
$useragents = array(); // populated elsewhere with real user-agent strings

foreach ($urls as $url) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_COOKIEJAR, str_replace('\\', '/', dirname(__FILE__)) . '/cookies.txt');
    curl_setopt($ch, CURLOPT_COOKIEFILE, str_replace('\\', '/', dirname(__FILE__)) . '/cookies.txt');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_USERAGENT, $useragents[array_rand($useragents)]);
    $html = curl_exec($ch);
    curl_close($ch);

    if (!$html)
        die("No HTML - Not logged in");

    if ($i % 10 != 0)
        sleep(rand(5, 20));
    else
        sleep(rand(rand(60, 180), rand(300, 660)));

    $i++;
    $html = '';
}
You could reuse your cURL handle instead of creating a new one for each request.
Clearing $html at the end of each iteration won't reduce memory usage and just adds an extra operation, because it gets overwritten on the next iteration anyway.
$i = 0;
$useragents = array();

$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_COOKIEJAR, str_replace('\\', '/', dirname(__FILE__)) . '/cookies.txt');
curl_setopt($ch, CURLOPT_COOKIEFILE, str_replace('\\', '/', dirname(__FILE__)) . '/cookies.txt');

foreach ($urls as $url) {
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, $useragents[array_rand($useragents)]);
    $html = curl_exec($ch);

    if (!$html)
        die("No HTML - Not logged in");

    if ($i++ % 10 != 0)
        sleep(rand(5, 20));
    else
        sleep(rand(rand(60, 180), rand(300, 660)));
}
curl_close($ch);
I am trying to open an HTML page using cURL, then extract the captcha image URL and save the image as a PNG. I am able to do both, but the image displayed on screen and the image file saved are different. How can I fix this?
// Get the page contents first
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "https://www.gstsearch.in/track-provisional-id.html");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt($ch, CURLOPT_COOKIESESSION, TRUE);
curl_setopt($ch, CURLOPT_COOKIEFILE, "cookiefile.txt");
curl_setopt($ch, CURLOPT_COOKIEJAR, "cookiefile.txt");
$pageContent = curl_exec($ch);
$errNo = curl_errno($ch); // cURL error code
curl_close($ch);

if ($errNo == 0) {
    $imgURL = getCaptcha($pageContent); // Get captcha image URL
    saveCaptcha($imgURL);               // Save the captcha image as PNG
} else {
    $errorMsg = curl_strerror($errNo);
    echo "CURL error ({$errNo}):\n {$errorMsg}";
}
function getCaptcha($html) {
    $dom = new DOMDocument();
    @$dom->loadHTML($html); // @ suppresses warnings from malformed HTML
    $captchaImg = $dom->getElementById('captchacode');
    $imgSrc = $captchaImg->getAttribute('data-src');
    // URL of the current captcha image
    $imgURL = "https://www.gstsearch.in/{$imgSrc}";
    echo "<img src={$imgURL}>";
    return $imgURL;
}

function saveCaptcha($url) {
    $fp = fopen("captcha.png", 'w+');
    $sc = curl_init();
    curl_setopt($sc, CURLOPT_URL, $url);
    curl_setopt($sc, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($sc, CURLOPT_COOKIEFILE, "cookiefile.txt");
    curl_setopt($sc, CURLOPT_COOKIEJAR, "cookiefile.txt");
    curl_setopt($sc, CURLOPT_FILE, $fp);
    curl_setopt($sc, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($sc, CURLOPT_USERAGENT, 'Mozilla/5.0');
    curl_exec($sc);
    curl_close($sc);
    fclose($fp);
}
UPDATE: I updated the code as per the suggestions, but the same thing still happens. What am I missing?
I agree with @jeroen: the remote site thinks there are two different users, one posting the information and another retrieving the CAPTCHA :)
You can store (and re-use) the session id with this:
//this is to pass `session_id` between requests
curl_setopt($ch, CURLOPT_COOKIEFILE, $some_path . 'cookie.txt');
//this is to store cookies for future requests, i.e. if you want to retain your session
curl_setopt($ch, CURLOPT_COOKIEJAR, $some_path . 'cookie.txt');
You should use these options for both requests. That way the site will think you are the same user, not two different ones (as it does right now).
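To illustrate, a minimal sketch of the same idea using one handle and one cookie file for both requests (getCaptcha() is the helper from the question):

// One cookie file shared by both requests, so the remote site sees a
// single session for the page fetch and the captcha download.
$cookie = dirname(__FILE__) . '/cookiefile.txt';

$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie); // read cookies from here
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie);  // write cookies back here

// Request 1: fetch the page (this is where the session cookie gets set)
curl_setopt($ch, CURLOPT_URL, "https://www.gstsearch.in/track-provisional-id.html");
$pageContent = curl_exec($ch);

// Request 2: fetch the captcha image with the SAME handle and cookies
curl_setopt($ch, CURLOPT_URL, getCaptcha($pageContent));
$png = curl_exec($ch);
curl_close($ch);

file_put_contents("captcha.png", $png);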
I am trying to automate the configuration of x IP cameras from their embedded web servers (self-signed certificates). If you connect to a camera through a browser in the normal way (no script), you just have to add a security exception and it works fine.
I want to automate this, and all my PHP scripts are run from a PowerShell CLI.
I have the following PHP script:
<?php
include('C:\wamp64\bin\php\php7.0.10\run\Librairie\LIB_parse.php');
include('C:\wamp64\bin\php\php7.0.10\run\Librairie\LIB_http.php');
include('C:\wamp64\bin\php\php7.0.10\run\Librairie\LIB_resolve_addresses.php');

$TableauIP = fopen('C:\wamp64\bin\php\php7.0.10\run\x\Ipcamera.txt', 'r');
$count = 0;

while (($URLcamera = fgets($TableauIP, 4096)) !== false) {
    $IP_unparsed = $URLcamera;
    $Ipcamera = return_between($IP_unparsed, "//", "/", EXCL);

    echo("Automatic configuration for : ".$Ipcamera."\n");
    echo("...............\n\n");
    echo("Downloading page : ".$IP_unparsed."\n\n");

    $web_page = http_get($IP_unparsed, $ref = "");
    echo "ERROR \n";
    var_dump($web_page['ERROR']);

    $head_section = return_between($string=$web_page['FILE'], $start="<head>", $end="</head>", $type=EXCL);
    $meta_tag_array = parse_array($head_section, $beg_tag="<meta", $close_tag=">");

    for($xx=0; $xx<count($meta_tag_array); $xx++){
        echo $meta_tag_array[$xx]."\n";
    }

    for($xx=0; $xx<count($meta_tag_array); $xx++){
        $meta_attribute = get_attribute($meta_tag_array[$xx], $attribute="http-equiv");
        if(strtolower($meta_attribute)=="refresh"){
            $new_page = return_between($meta_tag_array[$xx], $start="URL", $end=">", $type=EXCL);
            $new_page = trim(str_replace("", "", $new_page));
            $new_page = str_replace("=", "", $new_page);
            $new_page = str_replace("\"", "", $new_page);
            $new_page = resolve_address($new_page, $IP_unparsed);
        }
        break;
    }

    echo "HTML Head redirection detected<br>\n\n";
    echo "Redirect page = ".$new_page."\n";

    $web_page2 = http_get($new_page, $ref = "");
    //$web_page = http_get($IP_unparsed.'/login.cs', $ref = "");
    echo "FILE CONTENT \n";
    var_dump($web_page2['FILE']);
    echo "FILE ERROR \n";
    var_dump($web_page2['ERROR']);

    // for($xx=0; $xx<count($web_page); $xx++){
    //     echo($web_page[$xx]);
    // }
    // echo "ERROR \n";
    // var_dump($new_page['ERROR']);
    //*******************************
    // $web_page = file($new_page);
    // for($xx = 0; $xx < count($web_page); $xx++)
    //     echo $web_page[$xx];
    //********************************
    // $file_handle = fopen($new_page, "r");
    // while (!feof($file_handle))
    // {
    //     echo fgets($file_handle, 4096);
    // }
    // fclose($file_handle);

    $count++;
}
?>
(I left the comments in; I've tried different ways to display the web page.)
As you can see, I am using WampServer x64 on a basic Windows 7 machine.
I'm following a redirection to the https://x.x.x.x/login.cs page.
The important part is the download of $web_page2.
Here is the LIB_http library (just the necessary lines), wrapping cURL options in PHP functions:
function http_get($target, $ref)
{
    return http($target, $ref, $method="GET", $data_array="", EXCL_HEAD);
}

function http($target, $ref, $method, $data_array, $incl_head)
{
    # Initialize PHP/CURL handle
    $ch = curl_init();

    # Process data, if present
    if(is_array($data_array))
    {
        # Convert data array into a query string (ie animal=dog&sport=baseball)
        foreach ($data_array as $key => $value)
        {
            if(strlen(trim($value))>0)
                $temp_string[] = $key . "=" . urlencode($value);
            else
                $temp_string[] = $key;
        }
        $query_string = join('&', $temp_string);
    }

    # HEAD method configuration
    if($method == HEAD)
    {
        curl_setopt($ch, CURLOPT_HEADER, TRUE);  // Include the HTTP header
        curl_setopt($ch, CURLOPT_NOBODY, TRUE);  // Don't return the body
    }
    else
    {
        # GET method configuration
        if($method == GET)
        {
            if(isset($query_string))
                $target = $target . "?" . $query_string;
            curl_setopt($ch, CURLOPT_HTTPGET, TRUE);
            curl_setopt($ch, CURLOPT_POST, FALSE);
        }
        # POST method configuration
        if($method == POST)
        {
            if(isset($query_string))
                curl_setopt($ch, CURLOPT_POSTFIELDS, $query_string);
            curl_setopt($ch, CURLOPT_POST, TRUE);
            curl_setopt($ch, CURLOPT_HTTPGET, FALSE);
        }
        curl_setopt($ch, CURLOPT_HEADER, $incl_head); // Include header as needed
        curl_setopt($ch, CURLOPT_NOBODY, FALSE);      // Return the body
    }

    curl_setopt($ch, CURLOPT_COOKIEJAR, COOKIE_FILE);  // Cookie management
    curl_setopt($ch, CURLOPT_COOKIEFILE, COOKIE_FILE);
    curl_setopt($ch, CURLOPT_TIMEOUT, CURL_TIMEOUT);   // Timeout
    curl_setopt($ch, CURLOPT_USERAGENT, WEBBOT_NAME);  // Webbot name
    curl_setopt($ch, CURLOPT_URL, $target);            // Target site
    curl_setopt($ch, CURLOPT_REFERER, $ref);           // Referer value
    curl_setopt($ch, CURLOPT_VERBOSE, FALSE);          // Minimize logs
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);   // Don't verify the host name
    curl_setopt($ch, CURLOPT_SSLVERSION, CURL_SSLVERSION_TLSv1_2);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);   // Don't verify the certificate
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);    // Follow redirects
    curl_setopt($ch, CURLOPT_MAXREDIRS, 4);            // Limit redirections to four
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);    // Return in string

    # Create return array
    $return_array['FILE'] = curl_exec($ch);
    $return_array['STATUS'] = curl_getinfo($ch);
    $return_array['ERROR'] = curl_error($ch);

    # Close PHP/CURL handle
    curl_close($ch);

    # Return results
    return $return_array;
}
I do not know how to handle the TLS connection with cURL; I've been trying different things for hours. The issue I see is an encrypted alert:
(Wireshark capture of the TCP and TLS exchange)
I've added these lines to the original library:
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
//curl_setopt($ch, CURLOPT_SSLVERSION, 6);
curl_setopt($ch, CURLOPT_SSLVERSION, CURL_SSLVERSION_TLSv1_2);
I can't get the web page.
Apparently, the OpenSSL version is 1.0.2h.
I have tried many different things, with many different error types, but always something around the SSL certificate handling.
I have no more ideas where to look.
If you guys can give me a trail, that would be nice!
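In case it helps, here is the diagnostic sketch (the log path is illustrative) I can add to http() before curl_exec(), so cURL logs each TLS handshake step and shows where it aborts:

// Debugging sketch: make cURL log the TLS handshake steps to a file.
$log = fopen(dirname(__FILE__) . '/curl_debug.log', 'a'); // illustrative path
curl_setopt($ch, CURLOPT_VERBOSE, TRUE); // log each connection/handshake step
curl_setopt($ch, CURLOPT_STDERR, $log);  // send that log to the file instead of stderr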
How can I run a URL from a PHP script in exactly the same way (exactly the same behaviour) as when I open it from the browser's address bar? I mean with the same header data, cookies, and whatever additional data the browser sends. How do I add this data in PHP?
I need this because, once I'm logged in, the responses in these two cases are not the same:
in the browser I'm still logged in, which is correct
from the PHP run I am logged OUT, which is not correct
I've tried file_get_contents and cURL (from here), but neither works properly; the response is still different.
I'm calling http://127.0.0.1/check.html and here is the check function:
public function check(){
    echo 'begin';
    // $total_rows = file_get_contents('https://127.0.0.1:8443/example.html?shopId=121');
    $total_rows = $this->getUrl('https://127.0.0.1:8443/example.html', '121');
    print_r($total_rows);
    echo 'end';
}

function getUrl($url, $shopId = '') {
    $post = 'shopId=' . $shopId;

    $cookie_string = "";
    foreach( $_COOKIE as $key => $value ) {
        $cookie_string .= "$key=$value;";
    }
    $cookie_string .= "JSESSIONIDSSO=66025D1CC9EF39ED7F5DB024B6026C61";
    // echo $cookie_string;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_COOKIE, $cookie_string);
    // curl_setopt($ch, CURLOPT_PORT, 8443);
    curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
    // curl_setopt($ch, CURLOPT_CAINFO, dirname(__FILE__)."/../../files/cacert.pem");
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 1);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $post);
    // curl_setopt($ch, CURLOPT_HTTPHEADER, Array("Secure Content-Type: text/html; charset=UTF-8"));
    // curl_setopt($ch, CURLOPT_HTTPHEADER, array('Host: 127.0.0.1:8443'));
    $ret = curl_exec($ch);
    curl_error($ch); // note: the error string is fetched but never used
    curl_close($ch);
    return $ret;
}
Try this:
http://www.lastcraft.com/browser_documentation.php
Or this:
http://sourceforge.net/projects/snoopy/
Or this:
php curl: how can i emulate a get request exactly like a web browser?
Hope this helps.
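In short, the usual trick is to replay the headers and cookies your browser sends. As a rough sketch (copy the real values from the Network tab of your browser's developer tools; the header and cookie values below are placeholders):

// Mimic a browser by replaying its headers and session cookie.
$ch = curl_init('https://127.0.0.1:8443/example.html'); // your target URL
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // self-signed local cert
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'); // placeholder
curl_setopt($ch, CURLOPT_HTTPHEADER, array(
    'Accept: text/html,application/xhtml+xml', // placeholder values
    'Accept-Language: en-US,en;q=0.9',
));
curl_setopt($ch, CURLOPT_COOKIE, 'JSESSIONIDSSO=paste-your-browser-session-id'); // placeholder
$html = curl_exec($ch);
curl_close($ch);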
You can set up a cron job that uses your PHP script to execute the other script.
I've been using pieces of various Twitter feed scripts to grab tweets, but now I've hit a wall with rate limiting and caching the tweets. Here's my code:
function tweets($twitter_handle, $tweet_limit, $tweet_links, $tweet_tags, $tweet_avatar, $tweet_profile) {
    /* Store Tweets in a JSON object */
    $tweet_feed = json_decode(file_get_contents('http://api.twitter.com/1/statuses/user_timeline.json?screen_name='.
        $twitter_handle.'&include_entities=true&include_rts=true&count='.$hard_max.''));
This works great until I hit the rate limit. Here's what I added to cache tweets:
function tweets($twitter_handle, $tweet_limit, $tweet_links, $tweet_tags, $tweet_avatar, $tweet_profile) {
    $url = 'http://api.twitter.com/1/statuses/user_timeline.json?screen_name='.$twitter_handle.'&include_entities=true&include_rts=true&count='.$hard_max.'';
    $cache = dirname(__FILE__) . '/cache/twitter';

    if(filemtime($cache) < (time() - 60))
    {
        mkdir(dirname(__FILE__) . '/cache', 0777);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_POST, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_REFERER, $_SERVER['REQUEST_URI']);
        $data = curl_exec($ch);
        curl_close($ch);

        $cachefile = fopen($cache, 'wb');
        fwrite($cachefile, $data);
        fclose($cachefile);
    }
    else
    {
        $data = file_get_contents($cache);
    }

    $tweet_feed = json_decode($data);
This, however, only returns the username and timestamp (which is wrong), when it should be returning the Twitter avatar, tweet content, correct timestamp, etc. Additionally, it throws this warning every few refreshes:
Warning: mkdir() [function.mkdir]: File exists in /home/content/36/8614836/html/wp-content/themes/NCmainSite/functions.php on line 110
Any help would be appreciated.
If you need more info, here's the rest of the function: http://snippi.com/s/9f066q0
Here, try this. I've fixed your issues, plus you had a rogue CURLOPT_POST option in your cURL setup.
<?php
function tweets($twitter_handle, $tweet_limit, $tweet_links, $tweet_tags, $tweet_avatar, $tweet_profile) {
    $http_query = array('screen_name'      => $twitter_handle,
                        'include_entities' => 'true',
                        'include_rts'      => 'true',
                        'count'            => (isset($hard_max)) ? $hard_max : '5');
    $url = 'http://api.twitter.com/1/statuses/user_timeline.json?'.http_build_query($http_query);

    $cache_folder = dirname(__FILE__) . '/cache';
    $cache_file   = $cache_folder . '/twitter.json';

    // Check the cache folder exists
    if(!file_exists($cache_folder)){ mkdir($cache_folder, 0777); }

    // Fetch if the cache file is not found or is older than 60 seconds (tho 60 is not enough)
    if(!file_exists($cache_file) || filemtime($cache_file) < (time() - 60)){
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_REFERER, $_SERVER['REQUEST_URI']);
        $data = curl_exec($ch);
        curl_close($ch);

        file_put_contents($cache_file, $data);
    }else{
        $data = file_get_contents($cache_file);
    }
    return json_decode($data);
}

$twitter = tweets('RemotiaSoftware', 'tweet_limit','tweet_links', 'tweet_tags', 'tweet_avatar', 'tweet_profile');
print_r($twitter);
?>
I am using cURL, and I am wondering how I would send POST/submit data from my page to those websites. Each website has "host, time, port". My MySQL database has a list of URLs. I was thinking of curl_multi, but I am not sure.
Please post some examples. It has to be a fast method.
Basically, it fetches each URL and posts to it.
while($resultSet = mysql_fetch_array($SQL)){
    $ch = curl_init($resultSet['url'] . $fullcurl);
    curl_setopt($ch, CURLOPT_TIMEOUT, 2);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
}
The PHP cURL reference says that setting the CURLOPT_POST option to true makes it a POST request, and that CURLOPT_POSTFIELDS sets the fields you will send in foo=bar&spam=eggs format (which you can build from an array with http_build_query):
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, 'foo=bar&spam=eggs');
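For instance, assuming $ch is your cURL handle, you can build the field string from an array (the field names here are just illustrative):

// http_build_query() URL-encodes the values and joins them with '&'.
$fields = array('foo' => 'bar', 'spam' => 'eggs'); // illustrative fields
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($fields)); // "foo=bar&spam=eggs"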
Here is an example of how to do it with curl_multi. You should break it up so you only have a certain number of URLs going out at once (e.g. 30; see the batching sketch after the example). I added the follow-location directive, which you usually want.
$mh = curl_multi_init();
$ch = array();
$i = 0; // index into the handle array

while($resultSet = mysql_fetch_array($SQL)){
    $ch[$i] = curl_init($resultSet['url'] . $fullcurl);
    curl_setopt($ch[$i], CURLOPT_TIMEOUT, 2);
    curl_setopt($ch[$i], CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch[$i], CURLOPT_FOLLOWLOCATION, true);
    curl_multi_add_handle($mh, $ch[$i]);
    $i++;
}

$running = null;
do {
    curl_multi_exec($mh, $running);
} while ($running > 0);

$num = count($ch);
for ($i=0; $i<$num; $i++) {
    curl_multi_remove_handle($mh, $ch[$i]);
}
curl_multi_close($mh);
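As for the batching, a rough sketch (assuming the URLs have first been collected from the MySQL result into $urls; the chunk size of 30 is arbitrary):

// Process the URLs in chunks of 30 so only that many requests run at once.
foreach (array_chunk($urls, 30) as $batch) {
    $mh = curl_multi_init();
    $handles = array();
    foreach ($batch as $url) {
        $ch = curl_init($url . $fullcurl);
        curl_setopt($ch, CURLOPT_TIMEOUT, 2);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_multi_add_handle($mh, $ch);
        $handles[] = $ch;
    }
    // Run this batch to completion.
    $running = null;
    do {
        curl_multi_exec($mh, $running);
    } while ($running > 0);
    // Clean up before starting the next batch.
    foreach ($handles as $ch) {
        curl_multi_remove_handle($mh, $ch);
        curl_close($ch);
    }
    curl_multi_close($mh);
}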
Give this a shot:
while ($resultSet = mysql_fetch_assoc($SQL)) {
    $ch = curl_init($resultSet['url']);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 2);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $fullcurl);
    $response = curl_exec($ch);
    curl_close($ch);
}