Grab/download images from multiple pages using php preg_match_all & cURL - php

So I'm trying to grab some images from another site. The problem is that each image is on a different page, i.e. id/1, id/2, id/3, etc.
So far I have the code below, which can grab an image from a single URL using:
$returned_content = get_data('http://somedomain.com/id/1/');
but I need to turn that line into a loop (an array, I guess) so it grabs the image from page 1, then goes on to grab the next image on page 2, then page 3, and so on automatically.
function get_data($url) {
    $ch = curl_init();
    $timeout = 5;
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');
    $data = curl_exec($ch);
    curl_close($ch);
    return $data;
}
$returned_content = get_data('http://somedomain.com/id/1/');

if (preg_match_all("~http://somedomain.com/images/(.*?)\.jpg~i", $returned_content, $matches)) {
    $src = 0;
    foreach ($matches[1] as $key) {
        if (++$src > 1) break;
        $out = $key;
    }
    $file = 'http://somedomain.com/images/' . $out . '.jpg';
    $dir = 'photos';
    $imgurl = get_data($file);
    file_put_contents($dir . '/' . $out . '.jpg', $imgurl);
    echo 'done';
}
As always all help is appreciated and thanks in advance.

This was pretty confusing, because it sounded like you were only interested in saving one image per page. But then the code makes it look like you're actually trying to save every image on each page. So it's entirely possible I completely misunderstood... But here goes.
Looping over each page isn't that difficult:
$i = 1;
$l = 101;
while ($i < $l) {
    $html = get_data('http://somedomain.com/id/'.$i.'/');
    getImages($html);
    $i += 1;
}
The following then assumes that you're trying to save all the images on that particular page:
function getImages($html) {
    $matches = array();
    $regex = '~http://somedomain.com/images/(.*?)\.jpg~i';
    preg_match_all($regex, $html, $matches);

    foreach ($matches[1] as $img) {
        saveImg($img);
    }
}

function saveImg($name) {
    $url = 'http://somedomain.com/images/'.$name.'.jpg';
    $data = get_data($url);
    file_put_contents('photos/'.$name.'.jpg', $data);
}
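One extra thing worth noting: get_data() returns false when the cURL request fails, so you may want to skip pages that didn't load. A minimal sketch of the same page loop with that check added (the 100-page limit is just the example range from above):

$i = 1;
$l = 101;
while ($i < $l) {
    $html = get_data('http://somedomain.com/id/'.$i.'/');
    // Skip pages that failed to download instead of passing false to getImages()
    if ($html !== false && $html !== '') {
        getImages($html);
    }
    $i += 1;
}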

Related

How to download google search big image using php script?

I want to download the large images from a Google image search. Below is my code, which works fine for downloading the small (thumbnail) images from Google search.
<?php
include_once('simple_html_dom.php');
set_time_limit(0);

$fp = fopen('csv/search.csv', 'r') or die("can't open file");
$csv_data = array();
while ($csv_line = fgetcsv($fp)) {
    for ($i = 0, $j = count($csv_line); $i < $j; $i++) {
        $imgname = $csv_line[$i];
        $search_query = $csv_line[$i];
        $search_query = urlencode(trim($search_query));
        $html = file_get_html('http://images.google.com/images?as_q='. $search_query .'&hl=en&imgtbs=z&btnG=Search+Images&as_epq=&as_oq=&as_eq=&imgtype=&imgsz=m&imgw=&imgh=&imgar=&as_filetype=&imgc=&as_sitesearch=&as_rights=&safe=images&as_st=y');
        $image_container = $html->find('div#rcnt', 0);
        $images = $html->find('img');
        $image_count = 1; // Enter the amount of images to be shown
        $i = 0;
        foreach ($images as $image) {
            $srcimg = $image->src;
            if ($i == $image_count) break;
            $i++;
            $randname = $imgname . ".jpg";
            $randname = "Images/" . $randname;
            file_put_contents("$randname", file_get_contents($srcimg));
        }
    }
}
?>
Any idea?
This worked for me. simple_html_dom.php wouldn't do the trick since the 'big image' is inside a snippet of JSON near each thumbnail in the DOM.
<?php
$search_query = "Some Keyword"; // change this
$search_query = urlencode($search_query);
$googleRealURL = "https://www.google.com/search?hl=en&biw=1360&bih=652&tbs=isz%3Alt%2Cislt%3Asvga%2Citp%3Aphoto&tbm=isch&sa=1&q=".$search_query."&oq=".$search_query."&gs_l=psy-ab.12...0.0.0.10572.0.0.0.0.0.0.0.0..0.0....0...1..64.psy-ab..0.0.0.wFdNGGlUIRk";

// Call Google with cURL + User-Agent
$ch = curl_init($googleRealURL);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Linux i686; rv:20.0) Gecko/20121230 Firefox/20.0');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
$google = curl_exec($ch);

$array_imghtml = explode("\"ou\":\"", $google); // the big url is inside a JSON snippet: "ou":"big url"
foreach ($array_imghtml as $key => $value) {
    if ($key > 0) {
        $array_imghtml_2 = explode("\",\"", $value);
        $array_imgurl[] = $array_imghtml_2[0];
    }
}
var_dump($array_imgurl); // array contains the urls for the big images
die();
?>
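If you then want to save those files instead of just dumping the URLs, a minimal follow-up sketch (assuming a writable Images/ directory and removing the var_dump()/die() above first) could look like this:

// Download each collected big-image URL into the Images/ directory.
foreach ($array_imgurl as $i => $imgurl) {
    $imgdata = @file_get_contents($imgurl);
    if ($imgdata !== false) {
        file_put_contents('Images/image_' . $i . '.jpg', $imgdata);
    }
}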
Rather than crawling the page, I think you can use the Google Custom Search API.
For more details, see:
https://developers.google.com/custom-search/json-api/v1/overview
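For completeness, a minimal sketch of that Custom Search approach with cURL; YOUR_API_KEY and YOUR_SEARCH_ENGINE_ID are placeholders you create in the Google developer console, and the parameter names follow the public API documentation:

<?php
// Hedged sketch: query the Custom Search JSON API for large images.
$query = urlencode('Some Keyword');
$url = 'https://www.googleapis.com/customsearch/v1'
     . '?key=YOUR_API_KEY'
     . '&cx=YOUR_SEARCH_ENGINE_ID'
     . '&searchType=image'
     . '&imgSize=large'
     . '&q=' . $query;

$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$response = curl_exec($ch);
curl_close($ch);

$result = json_decode($response, true);
if (isset($result['items'])) {
    foreach ($result['items'] as $item) {
        echo $item['link'] . PHP_EOL; // full-size image URL
    }
}
?>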

php array of pictures with redirect

There is a file, foo.txt, with the locations of a lot of pictures:
http://foo/bar1.png
http://foo/bar2.png
http://foo/bar3.png
I build an array and display each one, but some pictures are missing.
The server redirects all of those to the same 404 image:
http://foo/404_bar.png
How can I show only the pictures that actually exist and skip the ones that redirect to the 404 image? I need to determine this dynamically.
$lines = file("foo.txt")
foreach ($lines as $line_num => $line) {
echo '<img src="', $line ,'"><br>';}
The browser shows:
bar1.png
404_bar.png
404_bar.png
Since bar2.png and bar3.png are missing on the server, it displays 404_bar.png for them. I need to display only:
bar1.png
I think it is necessary to dig into the headers.
As I understand your question, you only want to show the images that exist, so you should check whether each URL exists:
$lines = file("foo.txt")
foreach ($lines as $line_num => $line) {
if(is_url_exist($line))
echo '<img src="', $line ,'"><br>';
}
function is_url_exist($url){
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_NOBODY, true);
curl_exec($ch);
$code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
if($code == 200){
$status = true;
}else{
$status = false;
}
curl_close($ch);
return $status;
}
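If you'd rather avoid cURL, roughly the same check can be sketched with get_headers(); since a redirect to the 404 image comes back with a 3xx status line, it fails the 200 test as well (the function name here is just an example):

function is_url_exist_headers($url) {
    $headers = @get_headers($url);
    if ($headers === false) {
        return false;
    }
    // $headers[0] looks like "HTTP/1.1 200 OK"; keep only URLs that answer 200
    return (int) substr($headers[0], 9, 3) === 200;
}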

How to get a specified row using cUrl PHP

Hey guys, I use cURL to communicate with an external web server, but the response is HTML. I was able to convert it to JSON (more than 4000 rows), but I have no idea how to get the specific row that contains my result. Any idea?
Here is my cURL code:
require_once('getJson.php');

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://www.reputationauthority.org/domain_lookup.php?ip=website.com&Submit.x=9&Submit.y=5&Submit=Search');
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
curl_setopt($ch, CURLOPT_TIMEOUT, 5);
$data = curl_exec($ch);
$httpcode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);

$data = '<<<EOF'.$data.'EOF';
$json = new GetJson();
header("Content-Type: text/plain");
$res = json_encode($json->html_to_obj($data), JSON_PRETTY_PRINT);
$myArray = json_decode($res, true);
For getJson.php
class GetJson {
    function html_to_obj($html) {
        libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $dom->loadHTML($html);
        return $this->element_to_obj($dom->documentElement);
    }

    function element_to_obj($element) {
        if ($element->nodeType == XML_ELEMENT_NODE) {
            $obj = array("tag" => $element->tagName);
            foreach ($element->attributes as $attribute) {
                $obj[$attribute->name] = $attribute->value;
            }
            foreach ($element->childNodes as $subElement) {
                if ($subElement->nodeType == XML_TEXT_NODE) {
                    $obj["html"] = $subElement->wholeText;
                } else {
                    $obj["children"][] = $this->element_to_obj($subElement);
                }
            }
            return $obj;
        }
    }
}
My idea is that instead of walking through the rows to reach line 2175 (doing something like $data['children'][2]['children'][7]['children'][3]['children'][1]['children'][1]['children'][0]['children'][1]['children'][0]['children'][1]['children'][2]['children'][0]['children'][0]['html'], which doesn't seem like a good idea to me), I want to go directly to it.
If the HTML being returned has a consistent structure every time, and you just want one particular value from one part of it, you may be able to use regular expressions to parse the HTML and find the part you need. This is an alternative to trying to put the whole thing into an array. I have used this technique before to parse an HTML document and find a specific item. Here's a simple example; you will need to adapt it to your needs, since you haven't specified the exact nature of the data you're seeking, and you may need to go down several levels of parsing to find the right bit:
$data = curl_exec($ch);

// Split the output into an array that we can loop through line by line
$array = preg_split('/\n/', $data);

// For each line in the output
foreach ($array as $element) {
    // See if the line contains a hyperlink
    if (preg_match("/<a href/", "$element")) {
        ...[do something here, e.g. store the data retrieved, or do more matching to find something within it]...
    }
}
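Another option, since getJson.php already loads the HTML into DOMDocument, is to query the node you want directly with DOMXPath instead of counting ['children'] indexes. A minimal sketch; the XPath expression is a placeholder you would adapt to the actual structure of the page:

libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($data);
$xpath = new DOMXPath($dom);

// Placeholder query: grab the text of every table cell, then pick out the one you need.
$nodes = $xpath->query('//table//td');
foreach ($nodes as $node) {
    echo trim($node->textContent) . PHP_EOL;
}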

Parse REST Response Using PHP / CURL

I'm trying the REST API here: https://www.semrush.com/api-analytics/ , specifically the Organic Results, but no matter what I've tried, I can't seem to manipulate the data. Can someone tell me how to do this? I've tried SimpleXML, JSON, and even breaking up the response via explode() but I must be missing something because all I can do is push the result to the beginning of an array and not actually break it up.
This is my current code:
$url = "http://api.semrush.com/?type=phrase_organic&key=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX&display_limit=10&export_columns=Dn,Ur&phrase=seo&database=us";
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch,CURLOPT_USERAGENT,"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$result = curl_exec($ch);
curl_close($ch);
var_dump($result);
With the result being:
string 'Domain;Url
site-analyzer.com;https://www.site-analyzer.com/
woorank.com;https://www.woorank.com/
hubspot.com;http://blog.hubspot.com/blog/tabid/6307/bid/33164/6-SEO-Tools-to-Analyze-Your-Site-Like-Google-Does.aspx
seoworkers.com;http://www.seoworkers.com/tools/analyzer.html
seositecheckup.com;http://seositecheckup.com/
site-seo-analysis.com;http://www.site-seo-analysis.com/
webseoanalytics.com;http://www.webseoanalytics.com/free/seo-tools/web-seo-analysis.php
seocentro.com;http://www.seocentro.com/t'... (length=665)
Is there a simple way to break this up so I can manipulate or reformat the response?
You need to split on the newline characters to get to the CSV structure, then parse each line as CSV:
foreach (preg_split("/((\r?\n)|(\r\n?))/", $response) as $key => $line) {
    if ($key != 0) {
        list($domain, $url) = str_getcsv($line, ';');
        print 'Domain: ' . $domain . ', URL: ' . $url . PHP_EOL;
    }
}
Using the sample response from https://www.semrush.com/api-analytics/#phrase_organic,
the above will output
Domain: wikipedia.org, URL: http://en.wikipedia.org/wiki/Search_engine_optimization
Domain: searchengineland.com, URL: http://searchengineland.com/guide/what-is-seo
Domain: moz.com, URL: http://moz.com/beginners-guide-to-seo
The if statement is there to filter out the first line, the csv header.
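If you want to keep the values around rather than print them, the same loop can collect the rows into an array (a small variation on the code above):

$rows = array();
foreach (preg_split("/((\r?\n)|(\r\n?))/", $response) as $key => $line) {
    if ($key != 0 && trim($line) !== '') {
        list($domain, $url) = str_getcsv($line, ';');
        $rows[] = array('domain' => $domain, 'url' => $url);
    }
}
print_r($rows);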
Well, we could explode by space " ", then by ;
$response = explode(" ", trim(str_replace("Domain;Url", "", $response)));

$readableResponse = [];
foreach ($response as $r) {
    $e = explode(";", $r);
    $readableResponse[$e[0]] = $e[1];
}
print_r($readableResponse);
I.e., live on phpsandbox:
[searchengineland.com] => http://searchengineland.com/guide/what-is-seo
[wikipedia.org] => https://en.wikipedia.org/wiki/Search_engine_optimization
....

Youtube channel subscriber count

I'm trying to pull the subscriber count for a particular YouTube channel. I referred to some links on Stack Overflow as well as external sites, and came across links like this. Almost all of them suggested using the YouTube GData API and pulling the count from subscriberCount, but the following code
$data = file_get_contents("http://gdata.youtube.com/feeds/api/users/Tollywood/playlists");
$xml = simplexml_load_string($data);
print_r($xml);
returns no such subscriberCount. Is there any other way of getting the subscriber count, or am I doing something wrong?
The YouTube API v2.0 is deprecated. Here's how to do it with 3.0. OAuth is not needed.
1) Log in to a Google account and go to https://console.developers.google.com/. You may have to start a new project.
2) Navigate to APIs & auth and go to Public API Access -> Create a New Key
3) Choose the option you need (I used 'browser applications') This will give you an API key.
4) Navigate to your channel in YouTube and look at the URL. The channel ID is here: https://www.youtube.com/channel/YOUR_CHANNEL_ID
5) Use the API key and channel ID to get your result with this query: https://www.googleapis.com/youtube/v3/channels?part=statistics&id=YOUR_CHANNEL_ID&key=YOUR_API_KEY
Great success!
Documentation is actually pretty good, but there's a lot of it. Here's a couple of key links:
Channel information documentation: https://developers.google.com/youtube/v3/sample_requests
"Try it" page: https://developers.google.com/youtube/v3/docs/subscriptions/list#try-it
Try this ;)
<?php
$data = file_get_contents('http://gdata.youtube.com/feeds/api/users/Tollywood');
$xml = new SimpleXMLElement($data);
$stats_data = (array)$xml->children('yt', true)->statistics->attributes();
$stats_data = $stats_data['#attributes'];
/********* OR **********/
$data = file_get_contents('http://gdata.youtube.com/feeds/api/users/Tollywood?alt=json');
$data = json_decode($data, true);
$stats_data = $data['entry']['yt$statistics'];
/**********************************************************/
echo 'lastWebAccess = '.$stats_data['lastWebAccess'].'<br />';
echo 'subscriberCount = '.$stats_data['subscriberCount'].'<br />';
echo 'videoWatchCount = '.$stats_data['videoWatchCount'].'<br />';
echo 'viewCount = '.$stats_data['viewCount'].'<br />';
echo 'totalUploadViews = '.$stats_data['totalUploadViews'].'<br />';
?>
I could do it with a regex for my page; not sure whether it works for you or not. Check the following code:
<?php
$channel = 'http://youtube.com/user/YOURUSERNAME/';
$t = file_get_contents($channel);
$pattern = '/yt-uix-tooltip" title="(.*)" tabindex/';
preg_match($pattern, $t, $matches, PREG_OFFSET_CAPTURE);
echo $matches[1][0];
<?php
// this code was written by Abdu ElRhoul
// If you have any questions please contact me at info#oklahomies.com
// My website is http://Oklahomies.com
set_time_limit(0);

function retrieveContent($url) {
    $file = fopen($url, "rb");
    if (!$file)
        return "";
    $salida = "";
    while (feof($file) === false) {
        $line = fgets($file, 1024);
        $salida .= $line;
    }
    fclose($file);
    return $salida;
}

{
    $content = retrieveContent("https://www.youtube.com/user/rhoula/about"); // replace rhoula with the channel name
    $start = strpos($content, '<span class="about-stat"><b>');
    $end = strpos($content, '</b>', $start + 1);
    $output = substr($content, $start, $end - $start);
    echo "Number of Subscribers = $output";
}
?>
<?php
echo get_subscriber("UCOshmVNmGce3iwozz55hpww");

function get_subscriber($channel, $use = "user") {
    $subs = 0;
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, "https://www.youtube.com/".$use."/".$channel."/about?disable_polymer=1");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_POST, 0);
    curl_setopt($ch, CURLOPT_REFERER, 'https://www.youtube.com/');
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0');
    $result = curl_exec($ch);
    $R = curl_getinfo($ch);
    if ($R["http_code"] == 200) {
        $pattern = '/yt-uix-tooltip" title="(.*)" tabindex/';
        preg_match($pattern, $result, $matches, PREG_OFFSET_CAPTURE);
        $subs = intval(str_replace(',', '', $matches[1][0]));
    }
    if ($subs == 0 && $use == "user") return get_subscriber($channel, "channel");
    return $subs;
}
