I have a problem with writing data from an array to a CSV file.
Array
(
    [link1] => HTTP Code
    [link2] => HTTP Code
    [link3] => HTTP Code
    [link4] => HTTP Code
)
I need to write the data to a CSV file so that no link appears twice.
Unfortunately, I don't know how to take the links one by one (I work in a foreach loop), write each one to the CSV, and at the same time check that it has not already been written.
This is my code:
require('simple/simple_html_dom.php');

$xml = simplexml_load_file('https://www.gutscheinpony.de/sitemap.xml');
$fp = fopen('Links2.csv', 'w');
set_time_limit(0);
$links = [];

foreach ($xml->url as $link_url)
{
    $url = $link_url->loc;
    $data = file_get_html($url);
    $data = strip_tags($data, "<a>");
    $d = preg_split("/<\/a>/", $data);
    foreach ($d as $k => $u) {
        if (strpos($u, "<a href=") !== FALSE) {
            $u = preg_replace("/.*<a\s+href=\"/sm", "", $u);
            $u = preg_replace("/\".*/", "", $u);
            if (strpos($u, "http") !== FALSE) {
                $ch = curl_init($u);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                $output = curl_exec($ch);
                $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
                if (strpos($u, "https://www.gutscheinpony.de/") !== FALSE)
                    $u = substr($u, 28);
                if ($u == "/")
                    $u = $url;
            }
            $links[$u] = $http_code;
            $wynik = array(array($u, $url, $http_code));
            foreach ($wynik as $fields) {
                fputcsv($fp, $fields);
            }
        }
    }
}
curl_close($ch);
fclose($fp);
echo 'Send to CSV file successfully completed ... ';
I need to get every link from the .xml file, download the links that are on each of those pages, and determine their HTTP status. That part I have done. I just can't find an appropriate way to write the data to a CSV file.
I'm counting on your help.
The code below is essentially your code with a few modifications. One observation along the way: :// did not seem to behave well as part of a PHP array key.
<?php

require __DIR__ . '/simple/simple_html_dom.php';

$xml = simplexml_load_file('https://www.gutscheinpony.de/sitemap.xml');
$fp = fopen(__DIR__ . '/Links2.csv', 'w');
set_time_limit(0);
$links = [];
$status = false;

foreach ($xml->url as $link_url) {
    $url = $link_url->loc;
    $data = file_get_html($url);
    $data = strip_tags($data, "<a>");
    $d = preg_split("/<\/a>/", $data);
    foreach ($d as $k => $u) {
        $http_code = 404;
        if (strpos($u, "<a href=") !== FALSE) {
            $u = preg_replace("/.*<a\s+href=\"/sm", "", $u);
            $u = preg_replace("/\".*/", "", $u);
            if (strpos($u, "http") !== FALSE) {
                // JUST GET THE CODE ON EACH ITERATION,
                // OPENING THE STREAM & CLOSING IT AGAIN ON EACH ITERATION...
                $http_code = getHttpCodeStatus($u);
                if (strpos($u, "https://www.gutscheinpony.de/") !== FALSE) {
                    $u = substr($u, 28);
                }
                if ($u == "/") {
                    $u = $url;
                }
                // THIS COULD BE A BUG... USING :// AS PART OF AN ARRAY KEY SEEMS NOT TO WORK
                $links[str_replace("://", "_", $u)] = $http_code;
                // RUN THE var_dump() TO VIEW THE PROCESS AS IT PROGRESSES, IF YOU WISH TO
                var_dump($links);
                $status = fputcsv($fp, array($u, $url, $http_code));
            }
        }
    }
}

fclose($fp);

if ($status) {
    echo count($links) . ' entries were successfully processed and written to disk as a CSV file... ';
} else {
    echo 'It seems like some entries were not successfully written to disk - at least the last entry... ';
}

function getHttpCodeStatus($u) {
    $ch = curl_init($u);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $output = curl_exec($ch);
    $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);
    return $http_code;
}
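If you also need the "no duplicates" behaviour the question asks for, the $links map above can double as a seen-list. A minimal sketch (assuming the same $fp, $links, $u, $url, and $http_code variables as in the loop above), replacing the $links assignment and the fputcsv() call:

$key = str_replace("://", "_", $u);
// Only write the row if this link has not been written before.
if (!isset($links[$key])) {
    $links[$key] = $http_code;
    $status = fputcsv($fp, array($u, $url, $http_code));
}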
I am reading the HTML source code of an Instagram post using cURL. It works on localhost, but when I test the code on the live domain, the meta tags with an og property, such as og:type, are missing; they only show up on localhost.
This is the complete code.
<?php
function get_domain($url)
{
    $pieces = parse_url($url);
    $domain = isset($pieces['host']) ? $pieces['host'] : $pieces['path'];
    if (preg_match('/(?P<domain>[a-z0-9][a-z0-9\-]{1,63}\.[a-z\.]{2,6})$/i', $domain, $regs)) {
        return $regs['domain'];
    }
    return false;
}

// run curl here and get the html code of the instagram post page
function file_get_contents_curl($url)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_BINARYTRANSFER, true);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
    $data = curl_exec($ch);
    curl_close($ch);
    return $data;
}

// check the instagram url
function checkinstaurl($urlhere) {
    // remove white space
    $urlhere = trim($urlhere);
    $urlhere = htmlspecialchars($urlhere);
    if (get_domain($urlhere) == "instagram.com") {
        // getting the meta tag data
        $html = file_get_contents_curl($urlhere);
        // parsing begins here:
        $doc = new DOMDocument();
        @$doc->loadHTML($html);
        $nodes = $doc->getElementsByTagName('title');
        // get and display what you need:
        $title = $nodes->item(0)->nodeValue;
        $metas = $doc->getElementsByTagName('meta');
        $mediatype = null;
        $description = null;
        for ($i = 0; $i < $metas->length; $i++) {
            $meta = $metas->item($i);
            if ($meta->getAttribute('property') == 'og:type')
                $mediatype = $meta->getAttribute('content');
            if ($mediatype == 'video') {
                if ($meta->getAttribute('property') == 'og:video')
                    $description = $meta->getAttribute('content');
            } else {
                if ($meta->getAttribute('property') == 'og:image')
                    $description = $meta->getAttribute('content');
                $mediatype = 'photo';
            }
        } // for loop
        $out['mediatype'] = $mediatype;
        $out['descriptionc'] = $description;
        return $out;
    }
}

/* output */
$igurl = 'https://www.instagram.com/p/COf0dN0M8pU/';
$output = checkinstaurl($igurl);
echo "<pre>";
print_r($output);
?>
On localhost, the code above returns the complete HTML including the og meta tags, but on the live domain those meta tags are missing.
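One possible explanation, offered as an assumption rather than a confirmed diagnosis: Instagram may serve different markup depending on the request headers or the requesting server's IP, so the live server may simply be receiving a stripped-down page. A cheap first test is to send a browser-like User-Agent from file_get_contents_curl(), for example:

// Hypothetical tweak: present the request as coming from a desktop browser,
// so the server has no header-based reason to return different markup.
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0');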
I'm working on an app that gets all the URLs from an array of sites and displays them in array form or as JSON.
I can do it using a for loop; the problem is the execution time. When I tried 10 URLs, it gave me an error saying the maximum execution time was exceeded.
Upon searching, I found multi cURL. I also found this: Fast PHP CURL Multiple Requests: Retrieve the content of multiple URLs using CURL. I tried to add it to my code, but it didn't work, because I don't know how to use the function.
Hope you can help me. Thanks.
This is my sample code.
<?php
$urls = array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/');

$mh = curl_multi_init();
foreach ($urls as $i => $url) {
    $urlContent = file_get_contents($url);
    $dom = new DOMDocument();
    @$dom->loadHTML($urlContent);
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");
    for ($i = 0; $i < $hrefs->length; $i++) {
        $href = $hrefs->item($i);
        $url = $href->getAttribute('href');
        $url = filter_var($url, FILTER_SANITIZE_URL);
        // validate url
        if (!filter_var($url, FILTER_VALIDATE_URL) === false) {
            echo ''.$url.'<br />';
        }
    }
    $conn[$i] = curl_init($url);
    $fp[$i] = fopen($g, "w");
    curl_setopt($conn[$i], CURLOPT_FILE, $fp[$i]);
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    curl_multi_add_handle($mh, $conn[$i]);
}

do {
    $n = curl_multi_exec($mh, $active);
} while ($active);

foreach ($urls as $i => $url) {
    curl_multi_remove_handle($mh, $conn[$i]);
    curl_close($conn[$i]);
    fclose($fp[$i]);
}
curl_multi_close($mh);
?>
Here is a function that I put together that will properly utilize the curl_multi_init() function. It is more or less the same function that you will find on PHP.net with some minor tweaks. I have had great success with this.
function multi_thread_curl($urlArray, $optionArray, $nThreads) {

    // Group your urls into groups/threads.
    $curlArray = array_chunk($urlArray, $nThreads, $preserve_keys = true);

    // Iterate through each batch of urls.
    $ch = 'ch_';
    foreach ($curlArray as $threads) {

        // Create your cURL resources.
        foreach ($threads as $thread => $value) {
            ${$ch . $thread} = curl_init();
            curl_setopt_array(${$ch . $thread}, $optionArray); // Set your main curl options.
            curl_setopt(${$ch . $thread}, CURLOPT_URL, $value); // Set url.
        }

        // Create the multiple cURL handle.
        $mh = curl_multi_init();

        // Add the handles.
        foreach ($threads as $thread => $value) {
            curl_multi_add_handle($mh, ${$ch . $thread});
        }

        $active = null;

        // Execute the handles.
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        while ($active && $mrc == CURLM_OK) {
            if (curl_multi_select($mh) != -1) {
                do {
                    $mrc = curl_multi_exec($mh, $active);
                } while ($mrc == CURLM_CALL_MULTI_PERFORM);
            }
        }

        // Get your data and close the handles.
        foreach ($threads as $thread => $value) {
            $results[$thread] = curl_multi_getcontent(${$ch . $thread});
            curl_multi_remove_handle($mh, ${$ch . $thread});
        }

        // Close the multi handle.
        curl_multi_close($mh);
    }

    return $results;
}
// Add whatever options here. The CURLOPT_URL is left out intentionally;
// it will be added in later from the url array.
$optionArray = array(
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0', // Pick your user agent.
    CURLOPT_RETURNTRANSFER => TRUE,
    CURLOPT_TIMEOUT        => 10
);

// Create an array of your urls.
$urlArray = array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/'
);

// Play around with this number and see what works best.
// This is how many urls it will try to do at one time.
$nThreads = 20;

// To use, run the function.
$results = multi_thread_curl($urlArray, $optionArray, $nThreads);
Once this is complete, you will have an array containing all of the HTML from your list of websites. It is at this point that I would loop through them and pull out all of the URLs.
Like so:
foreach ($results as $page) {
    $dom = new DOMDocument();
    @$dom->loadHTML($page);
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");
    for ($i = 0; $i < $hrefs->length; $i++) {
        $href = $hrefs->item($i);
        $url = $href->getAttribute('href');
        $url = filter_var($url, FILTER_SANITIZE_URL);
        // validate url
        if (!filter_var($url, FILTER_VALIDATE_URL) === false) {
            echo ''.$url.'<br />';
        }
    }
}
It is also worth keeping in the back of your head the ability to increase the run time of your script.
If you're using a hosting service, you may be restricted to something in the ballpark of two minutes regardless of what you set your max execution time to. Just food for thought.
This is done by:
ini_set('max_execution_time', 120);
You can always try allowing more time, but you'll never know until you time it.
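A simple way to actually time it is to wrap the crawl in a microtime() pair; a quick sketch using the function above:

$start = microtime(true);
$results = multi_thread_curl($urlArray, $optionArray, $nThreads);
// Elapsed wall-clock seconds, to compare against your max_execution_time.
echo 'Finished in ' . round(microtime(true) - $start, 2) . " seconds\n";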
Hope it helps.
You may be using an endless loop. If not, you can increase the maximum execution time in php.ini or with:
ini_set('max_execution_time', 600); // 600 seconds = 10 minutes
This is what I achieved after working on the code. It works, but I'm not sure if this is the best answer. Kindly check my code.
<?php
$array = array(
    'https://www.google.com/', 'https://www.google.com/', 'https://www.google.com/',
    'https://www.google.com/', 'https://www.google.com/', 'https://www.google.com/',
    'https://www.google.com/', 'https://www.google.com/', 'https://www.google.com/',
    'https://www.google.com/'
);

print_r(getUrls($array));

function getUrls($array) {
    $arrUrl = array();
    $arrList = array();
    $url_count = count($array);
    $curl_array = array();
    $ch = curl_multi_init();
    foreach ($array as $count => $url) {
        $curl_array[$count] = curl_init($url);
        curl_setopt($curl_array[$count], CURLOPT_RETURNTRANSFER, true);
        curl_multi_add_handle($ch, $curl_array[$count]);
    }
    do {
        curl_multi_exec($ch, $exec);
        curl_multi_select($ch, 1);
    } while ($exec);
    foreach ($array as $count => $url) {
        $arrUrl = array();
        $urlContent = curl_multi_getcontent($curl_array[$count]);
        $dom = new DOMDocument();
        @$dom->loadHTML($urlContent);
        $xpath = new DOMXPath($dom);
        $hrefs = $xpath->evaluate("/html/body//a");
        for ($i = 0; $i < $hrefs->length; $i++) {
            $href = $hrefs->item($i);
            $url = $href->getAttribute('href');
            $url = filter_var($url, FILTER_SANITIZE_URL);
            // validate url
            if (filter_var($url, FILTER_VALIDATE_URL) !== false) {
                if (strpos($url, 'mailto') === false) {
                    $arrUrl[] = $url;
                }
            }
        }
        array_push($arrList, array_unique($arrUrl));
    }
    foreach ($array as $count => $url) {
        curl_multi_remove_handle($ch, $curl_array[$count]);
    }
    curl_multi_close($ch);
    foreach ($array as $count => $url) {
        curl_close($curl_array[$count]);
    }
    return $arrList;
}
First of all, I know the OP is asking about multi_curl, but I'm just adding an alternative in case the OP changes his mind. What I do here is split the URLs across many separate requests so the CPU usage is not that big. If the OP still wants to use multi_curl, maybe the PHP masters here can give a better solution.
<?php
$num = preg_replace('/[^0-9]/', '', $_GET['num']);
$num = empty($num) ? 0 : $num;

$urls = array(
    'http://site1.com/',
    'http://site2.com/',
    'http://site3.com/');

if (!empty($urls[$num]))
{
    /* do your single curl stuff here and store its data */

    /* now redirect to the next url. don't use a header location redirect,
       it would end up in a too-many-redirects error in the browser */
    // pass the next index, not the URL, so the $_GET['num'] check above keeps working
    $next_num = !empty($urls[$num + 1]) ? $num + 1 : 'done';
    echo '<html>
<head>
<meta http-equiv="refresh" content="0;url=http://yourcodedomain.com/yourpath/yourcode.php?num=' . $next_num . '" />
</head>
<body>
<p>Fetching: ' . ($num + 1) . ' / ' . count($urls) . '</p>
</body>
</html>';
}
elseif ($_GET['num'] == 'done')
{
    /* if all sites have been fetched, do something here */
}
else
{
    /* throw an exception here */
}
?>
I had the same issue, and I solved it using usleep(). Try this and let me know.
do {
    usleep(10000);
    $n = curl_multi_exec($mh, $active);
} while ($active);
Try this simplified version:
$urls = [
    'https://en.wikipedia.org/',
    'https://secure.php.net/',
];

set_time_limit(0);
libxml_use_internal_errors(true);

$hrefs = [];
foreach ($urls as $url) {
    $html = file_get_contents($url);
    $doc = new DOMDocument;
    $doc->loadHTML($html);
    foreach ($doc->getElementsByTagName('a') as $link) {
        $href = filter_var($link->getAttribute('href'), FILTER_SANITIZE_URL);
        if (filter_var($href, FILTER_VALIDATE_URL)) {
            echo "<a href='{$href}'>{$href}</a><br/>\n";
        }
    }
}
I am using two JSON feed sources and PHP to display a real estate property slideshow with agents on a website. The code was working prior to the feed provider making changes to where they store property and agent images. I have made the necessary adjustments for the images, but the feed data is not working now. I have contacted the feed providers about the issue, but they say the problem is on my end. No changes beyond the image URLs were made, so I am unsure where the issue may be. I am new to JSON, so I might be missing something. I have included the full script below. Here are the two JSON feed URLs: http://century21.ca/FeaturedDataHandler.c?DataType=4&EntityType=2&EntityID=2119 and http://century21.ca/FeaturedDataHandler.c?DataType=3&AgentID=27830&RotationType=1. The first URL grabs all of the agents and the second grabs a single agent's properties. The AgentID value is sourced from the JSON feed URL dynamically.
class Core
{
    private $base_url;
    private $property_image_url;
    private $agent_id;
    private $request_agent_properties_url;
    private $request_all_agents_url;

    private function formatJSON($json)
    {
        $from = array('Props:', 'Success:', 'Address:', ',Price:', 'PicTicks:', ',Image:', 'Link:', 'MissingImage:', 'ShowingCount:', 'ShowcaseHD:', 'ListingStatusCode:', 'Bedrooms:', 'Bathrooms:', 'IsSold:', 'ShowSoldPrice:', 'SqFootage:', 'YearBuilt:', 'Style:', 'PriceTypeDesc:');
        $to = array('"Props":', '"Success":', '"Address":', ',"Price":', '"PicTicks":', ',"Image":', '"Link":', '"MissingImage":', '"ShowingCount":', '"ShowcaseHD":', '"ListingStatusCode":', '"Bedrooms":', '"Bathrooms":', '"IsSold":', '"ShowSoldPrice":', '"SqFootage":', '"YearBuilt":', '"Style":', '"PriceTypeDesc":');
        return str_ireplace($from, $to, $json); // returns the clean JSON
    }

    function __construct($agent = false)
    {
        $this->base_url = 'http://www.century21.ca';
        $this->property_image_url = 'http://images.century21.ca';
        $this->agent_id = ($agent ? $agent : false);
        $this->request_all_agents_url =
            $this->base_url.'/FeaturedDataHandler.c?DataType=4&EntityType=3&EntityID=3454';
        $this->request_agent_properties_url =
            $this->base_url.'/FeaturedDataHandler.c?DataType=3'.'&AgentID='.$this->agent_id.'&RotationType=1';
    }

    /**
     * getSlides()
     */
    function getSlides()
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $this->request_all_agents_url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        $response = curl_exec($ch);
        curl_close($ch);

        if (empty($response))
            return false;
        else
            $agents = $this->decode_json_string($response);

        // Loop agents and look for the requested ID
        foreach ($agents as $agent)
        {
            if (($this->agent_id != false) && (isset($agent['WTLUserID'])) && ($agent['WTLUserID'] != $this->agent_id))
            {
                continue; // skip agents that don't match the requested ID
            }
            $properties = $this->getProperties($agent['WTLUserID']);
            $this->print_property_details($properties, $agent);
        }
    }

    /**
     * getProperties()
     */
    function getProperties($agent_id)
    {
        $url = $this->base_url.'/FeaturedDataHandler.c?DataType=3'.'&AgentID='.$agent_id.'&RotationType=1';
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        $response = curl_exec($ch);
        curl_close($ch);

        $json = json_decode($response);
        if (empty($response))
            die('No response 2'); //return false;
        else
            $json = $this->formatJSON($this->decode_json_string($response));

        var_dump($json);
        die();
        // return $json;
    }

    /**
     * print_property_details()
     */
    function print_property_details($properties, $agent, $html = '')
    {
        $BASE_URL = $this->base_url;
        $PROPERTY_IMAGE_URL = $this->property_image_url;

        foreach ($properties as $property)
        {
            $img = $property['Image'];
            // $img = ($property['Image'] ? $property['Image'] : "some url to a dummy image here")
            if ($property['ListingStatusCode'] != 'SOLD') {
                $address = $property['Address'];
                $shortaddr = substr($address, 0, -12);
                $html .= "<div class='listings'>";
                $html .= "<div class='property-image'>";
                $html .= "<img src='". $PROPERTY_IMAGE_URL ."' width='449' height='337' alt='' />";
                $html .= "</div>";
                $html .= "<div class='property-info'>";
                $html .= "<span class='property-price'>". $property['Price'] ."</span>";
                $html .= "<span class='property-street'>". $shortaddr ."</span>";
                $html .= "</div>";
                $html .= "<div class='agency'>";
                $html .= "<div class='agent'>";
                $html .= "<img src='". $agent['PhotoUrl']. "' class='agent-image' width='320' height='240' />";
                $html .= "<span class='agent-name'><b>Agent:</b>". $agent['DisplayName'] ."</span>";
                $html .= "</div>";
                $html .= "</div>";
                $html .= "</div>";
            }
        }
        echo $html;
    }

    function decode_json_string($json)
    {
        // Strip out junk
        $strip = array("{\"Agents\": [", "{Props: ", ",Success:true}", ",\"Success\":true", "\r", "\n", "[{", "}]");
        $json = str_replace($strip, "", $json);

        // Instantiate array
        $json_array = array();

        foreach (explode("},{", $json) as $row)
        {
            // Remove commas and colons between quotes
            if (preg_match_all('/"([^\\"]+)"/', $row, $match)) {
                foreach ($match as $m)
                {
                    $row = str_replace($m, str_replace(",", "|comma|", $m), $row);
                    $row = str_replace($m, str_replace(":", "|colon|", $m), $row);
                }
            }

            // Instantiate / clear array
            $array = array();
            foreach (explode(',', $row) as $pair)
            {
                $var = explode(":", $pair);
                // Add commas and colons back
                $val = str_replace("|colon|", ":", $var[1]);
                $val = str_replace("|comma|", ",", $val);
                $val = trim($val, '"');
                $val = trim($val);
                $key = trim($var[0]);
                $key = trim($key, '{');
                $key = trim($key, '}');
                $array[$key] = $val;
            }

            // Add to array
            $json_array[] = $array;
        }
        return $json_array;
    }
}
Try this code to fix the JSON:
$url = 'http://century21.ca/FeaturedDataHandler.c?DataType=3&AgentID=27830&RotationType=1';
$invalid_json = file_get_contents($url);
$json = preg_replace("/([{,])([a-zA-Z][^: ]+):/", "$1\"$2\":", $invalid_json);
var_dump($json);
All your keys need to be double-quoted.
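Once the keys are quoted, you can verify the repair programmatically instead of by eye; a short sketch, assuming the feed decodes to a top-level Props array as the raw output suggests:

$data = json_decode($json, true);
if (json_last_error() !== JSON_ERROR_NONE) {
    // Still invalid: report what the parser choked on.
    die('JSON error: ' . json_last_error_msg());
}
// Valid: the listings are now a plain PHP array.
print_r($data['Props']);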
The JSON at the second URL is not valid JSON; that's why you're not getting the results: PHP is unable to decode that feed.
I tried to process it, and got this error:
Error: Parse error on line 1:
{Props: [{Address:"28
-^
Expecting 'STRING', '}'
(Screenshots of the first URL's feed and of the second URL's feed omitted.)
As per the error for the second feed, all the keys should be wrapped in ", since they are strings rather than CONSTANTS.
E.g. Props should be "Props", and all the others too.
EDIT
You need to update your function and add this one (formatJSON($json)) to your class.
// Update this function; only the last line of the function needs to change
function getProperties($agent_id)
{
    $url = $this->base_url.'/FeaturedDataHandler.c?DataType=3'.'&AgentID='.$agent_id.'&RotationType=1';
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    $response = curl_exec($ch);
    curl_close($ch);

    $json = json_decode($response);
    if (empty($response))
        die('No response 2'); //return false;
    else
        return $this->formatJSON($this->decode_json_string($response)); // this line is the only change
}

// Add this function to the class. It will format the JSON.
private function formatJSON($json)
{
    $from = array('Props:', 'Success:', 'Address:', ',Price:', 'PicTicks:', ',Image:', 'Link:', 'MissingImage:', 'ShowingCount:', 'ShowcaseHD:', 'ListingStatusCode:', 'Bedrooms:', 'Bathrooms:', 'IsSold:', 'ShowSoldPrice:', 'SqFootage:', 'YearBuilt:', 'Style:', 'PriceTypeDesc:');
    $to = array('"Props":', '"Success":', '"Address":', ',"Price":', '"PicTicks":', ',"Image":', '"Link":', '"MissingImage":', '"ShowingCount":', '"ShowcaseHD":', '"ListingStatusCode":', '"Bedrooms":', '"Bathrooms":', '"IsSold":', '"ShowSoldPrice":', '"SqFootage":', '"YearBuilt":', '"Style":', '"PriceTypeDesc":');
    return str_ireplace($from, $to, $json); // returns the clean JSON
}
EDIT
I've tested that function, and it's working fine; maybe there is something wrong with your decode_json_string($json) function.
I took the unclean JSON from the second URL, cleaned it with formatJSON(), and pasted the cleaned JSON into a JSON editor to check whether it parses.
I'm trying to write a script to cURL a few pages from a password-protected site.
The idea is to scrape information on submitted stock codes from their products database and print out the results (eventually importing directly into my own database, but currently just printing the results on screen).
My function is as follows:
function LookupProduct($ItemCodes) {

    // set a temp file name for the login cookie
    $tmp_fname = "tmp/".md5(date('D F d')).".cookie";
    $tmp_fname = realpath($tmp_fname);

    // reset/declare the function's output
    $return = '';

    // build post data from form
    $fields = array(
        'UserName' => urlencode("username"),
        'Password' => urlencode("password"),
    );
    $fieldString = '';
    foreach ($fields as $key => $value) {
        $fieldString .= $key.'='.$value.'&';
    }
    rtrim($fieldString, '&');

    // initialise the curl session
    $ch = curl_init();

    // set options for curl login
    $loginurl = "https://suppliers-website/login/";
    curl_setopt($ch, CURLOPT_URL, $loginurl);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_COOKIESESSION, true);
    curl_setopt($ch, CURLOPT_POST, count($fields));
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $fieldString);
    curl_setopt($ch, CURLOPT_COOKIEJAR, $tmp_fname);
    curl_setopt($ch, CURLOPT_COOKIEFILE, $tmp_fname);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    // do the actual login, generate cookie
    $result = curl_exec($ch);

    // build array of codes to look up
    $codes = explode(",", $ItemCodes);

    // look up each code in the array
    foreach ($codes as $code) {

        // set the product page to curl
        $lookupUrl = "https://suppliers-website/product/".$code;
        curl_setopt($ch, CURLOPT_URL, $lookupUrl);

        // load product page html into $lookupcontent
        unset($lookupcontent);
        $lookupcontent = curl_exec($ch);

        // if we have a valid page, then go ahead and pluck the data
        if (strlen($lookupcontent) < 100) {
            echo "<li>Error logging in: <blockquote>".$lookupcontent."</blockquote></li>";
        } else {
            // load product page html into a DOM
            unset($dom);
            unset($xpath);
            $dom = new DOMDocument;
            $dom->loadHTML($lookupcontent);
            $xpath = new DOMXPath($dom);

            // find the image src
            unset($imgnames);
            foreach ($dom->getElementsByTagName('a') as $node) {
                if (strpos($node->getAttribute('href'), 'StockLoRes') !== false) {
                    $imgnames = explode("=", $node->getAttribute('href'));
                    $imgname = $imgnames[1];
                    $filelocation = $node->getAttribute('href');
                }
            }

            // set the image to curl
            $imglink = "https://suppliers-website/login/".$filelocation;
            curl_setopt($ch, CURLOPT_URL, $imglink);

            // curl the image
            unset($curlimage);
            $curlimage = curl_exec($ch);

            // save the image locally
            unset($saveimage);
            $saveimage = fopen('tmp/'.$imgname, 'w');
            fwrite($saveimage, $curlimage);
            fclose($saveimage);

            // find the product description
            unset($results);
            $classname = 'ItemDetails_Description';
            $results = $xpath->query("//*[@class='" . $classname . "']");
            if ($results->length > 0) {
                $description = $results->item(0)->nodeValue;
                $description = strip_tags($description);
                $description = str_replace("•", "", $description);
            }

            // find the price
            unset($pricearray);
            foreach ($dom->getElementsByTagName('div') as $node) {
                if (strpos($node->nodeValue, '£') !== false) {
                    $pricearray[] = $node->nodeValue;
                }
            }
            $pricearray = array_reverse($pricearray);
            $price = $pricearray[0];
            $price = str_replace("£", "", $price);

            // find the title
            unset($results);
            $classname = 'ItemDetails_ItemName';
            $results = $xpath->query("//*[@class='" . $classname . "']");
            if ($results->length > 0) {
                $title = $results->item(0)->nodeValue;
            }

            // find the publisher
            unset($results);
            $classname = 'ItemDetails_Publisher';
            $results = $xpath->query("//*[@class='" . $classname . "']");
            if ($results->length > 0) {
                $publisher = $results->item(0)->nodeValue;
            }
        }

        // add all the values to the data to be returned
        $return .= '<div style="border:1px solid grey;margin:20px;float:left;">';
        $return .= "<a href='tmp/".$imgname."'>";
        $return .= "<img src='tmp/".$imgname."' width='100' align='left' /></a>";
        $return .= "<h1>" .$title ."</h1>";
        $return .= "<h3>" .$publisher ."</h3>";
        $return .= "<h2>£" .$price ."</h2>";
        $return .= "<h4>" .$description."</h4>";
        $return .= '</div><br clear="all" />';
    }

    // echo out the data
    echo $return;

    // close connection
    curl_close($ch);
}
I am using the following to trigger it:
if (isset($_POST['ItemCodes'])) {
    $code = $_POST['ItemCodes'];
    $code = str_replace("\n\r", ",", $code);
    $code = str_replace("\r", ",", $code);
    echo "ItemCodes: ".$code;
    echo LookupProduct($code);
}
The script can successfully log in, save a cookie, and get info from a page, but if I try to request multiple pages, the script fails to work as intended and instead returns 3 instances of the same product. Did I fail to reset a variable somewhere? I've tried unsetting everything, but I still get the same product three times, as if my function only works once.
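One thing worth ruling out (a guess, not a confirmed diagnosis): the cURL handle is reused after the login, so CURLOPT_POST and CURLOPT_POSTFIELDS are still set, and every product request may re-submit the login form instead of fetching the product page. Switching the handle back to GET before the lookup loop would test that:

// Hypothetical check: after logging in, force the reused handle back to GET
// so product pages are not requested with the login POST body attached.
curl_setopt($ch, CURLOPT_HTTPGET, true);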
I'm currently looking for a solution to dynamically filter website content. By "dynamic" I mean I would calculate the percentage of bad words (i.e. shit, f**k, etc.) over the total number of words on the first page. Say the website is allowed if the percentage is no more than 30%. How do I make it search each word on the first page, match them against the bad-words list, then divide by the total number of words, so I can get the percentage? The rationale is not to make a content filter that blocks the website as soon as even a single word on the page matches the bad-words list. I have got this so far, but it is static.
$filename = "filters.txt";
$fp = @fopen($filename, 'r');
if ($fp) {
    $array = explode("\n", fread($fp, filesize($filename)));
    foreach ($array as $key => $val) {
        list($before, $after) = split("~", $val);
        $input = preg_replace($before, $after, $input);
    }
}
(filters.txt contains the list of bad words.)
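For reference, the split("~", $val) call implies each line of filters.txt holds a pattern and a replacement separated by a tilde. A made-up example of what the file might contain (these patterns are illustrative, not from the original):

/shit/i~s***
/damn/i~d***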
Thanks, Erisco!
I tried this, but it doesn't seem to work, though.
function get_content($url)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    ob_start();
    curl_exec($ch);
    curl_close($ch);
    $string = ob_get_contents();
    ob_end_clean();
    return $string;
}

/* $toLoad is from Browse.php */
$sourceOfWebpage = get_content($toLoad);
$textOfWebpage = strip_tags($sourceOfWebpage);

/* array: obtained from your filters.txt file */
// Open the filters file and filter all of the results.
$filename = "filters.txt";
$badWords = @fopen($filename, 'r');
if ($badWords) {
    $array = explode("\n", fread($fp, filesize($filename)));
    foreach ($array as $key => $val) {
        list($before, $after) = split("~", $val);
        $input = preg_replace($before, $after, $input);
    }
}

/* float: some decimal value */
$allowedBadWordsPercent = 0.30;

$numberOfWords = str_word_count($textOfWebpage);
$numberOfBadWords = 0;
str_ireplace($badWords, '', $sourceOfWebpage, $numberOfBadWords);

if ($numberOfBadWords != 0) {
    $badWordsPercent = $numberOfWords / $numberOfBadWords;
} else {
    $badWordsPercent = 0;
}

if ($badWordsPercent > $allowedBadWordsPercent) {
    echo 'This is a naughty webpage';
}
This is the rough idea of what I'd do. You could argue that using str_ireplace() purely for the count is devious, though. I am not sure if there is a more direct function without busting out the regexp.
/* string: obtained by cURL or similar */
$sourceOfWebpage;
$textOfWebpage = strip_tags($sourceOfWebpage);

/* array: obtained from your filter.txt file */
$badWords;

/* float: some decimal value */
$allowedBadWordsPercent = 0.30;

$numberOfWords = str_word_count($textOfWebpage);
$numberOfBadWords = 0;
str_ireplace($badWords, '', $sourceOfWebpage, $numberOfBadWords);

if ($numberOfWords != 0) {
    // ratio of bad words to total words
    $badWordsPercent = $numberOfBadWords / $numberOfWords;
} else {
    $badWordsPercent = 0;
}

if ($badWordsPercent > $allowedBadWordsPercent) {
    echo 'This is a naughty webpage';
}
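If counting via str_ireplace() feels too devious, preg_match_all() with word boundaries is a more direct route; a sketch, assuming $badWords is an array of plain words rather than pattern~replacement pairs:

$numberOfBadWords = 0;
foreach ($badWords as $word) {
    // \b stops "class" matching a shorter bad word inside it; /i ignores case.
    $numberOfBadWords += preg_match_all('/\b' . preg_quote($word, '/') . '\b/i', $textOfWebpage);
}
$badWordsPercent = $numberOfWords > 0 ? $numberOfBadWords / $numberOfWords : 0;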