Despite the xpath being correct (to the best of my knowledge), this code is still outputting strangely.
By this I mean that lots of was_price and now_price values are not being scraped from the page and so are returning as £.
Any idea what's wrong?
Here's the site I'm scraping from.
Code:
/**
 * Scrape a product listing page and echo the products found as a JSON array.
 *
 * Every XPath expression is evaluated against the whole document and the
 * resulting lists are combined purely by position: entry N of the output uses
 * the N-th product link, the N-th image, the N-th "was" price and the N-th
 * "now" price. If an expression matches fewer nodes than there are products,
 * the missing positions are emitted as null instead of raising warnings; the
 * expressions must still match exactly one node per product for the values to
 * line up with the right product.
 *
 * @param string $list_url           URL (or path) of the listing page to fetch.
 * @param string $shop_name          Shop name copied into every result entry.
 * @param string $photo_location     XPath selecting the product image elements (their "src" is read).
 * @param string $photo_url_root     Prefix prepended to every image "src".
 * @param string $product_location   XPath selecting the product link elements (their "href" is read).
 * @param string $product_url_root   Prefix prepended to every product "href".
 * @param string $was_price_location XPath selecting the original-price nodes.
 * @param string $now_price_location XPath selecting the sale-price nodes.
 * @param string $gender             Accepted but not used by this function.
 * @param string $country            Accepted but not used by this function.
 * @param mysqli $con                Accepted but not used by this function.
 *
 * @return void Output is echoed: diagnostic messages followed by the JSON list,
 *              or "this is empty" when the page could not be fetched.
 */
function scrape($list_url, $shop_name, $photo_location, $photo_url_root, $product_location, $product_url_root, $was_price_location, $now_price_location, $gender, $country, mysqli $con)
{
    $html = file_get_contents($list_url);
    // Collect libxml parse errors internally instead of emitting a warning per malformed tag.
    libxml_use_internal_errors(TRUE);

    if (empty($html)) {
        echo "this is empty";
        return;
    }

    $doc = new DOMDocument();
    $doc->loadHTML($html);
    libxml_clear_errors(); // remove errors for yucky html
    $xpath = new DOMXPath($doc);

    // Runs one XPath query and maps every matched node through $extract.
    // Always returns an array, so the caller never sees an undefined variable.
    $collect = function ($location, $label, $extract) use ($xpath) {
        $values = array();
        $nodes = $xpath->query($location);
        // query() yields false for a malformed expression and an empty list for no match.
        if ($nodes === false || $nodes->length === 0) {
            echo $label . " location is wrong<br>";
            return $values;
        }
        foreach ($nodes as $node) {
            $values[] = $extract($node);
        }
        return $values;
    };

    // Keeps only digits, commas and dots of the node text and prefixes the currency sign.
    $to_price = function ($node) {
        return "£" . preg_replace("/[^0-9,.]/", "", $node->nodeValue);
    };

    $product_urls = $collect($product_location, "product", function ($node) use ($product_url_root) {
        return $product_url_root . $node->getAttribute('href');
    });
    $photo_url = $collect($photo_location, "photo", function ($node) use ($photo_url_root) {
        return $photo_url_root . $node->getAttribute('src');
    });
    $was_price = $collect($was_price_location, "was price", $to_price);
    $now_price = $collect($now_price_location, "now price", $to_price);

    /* Create an associative array containing all the above values */
    $result = array();
    foreach ($product_urls as $i => $product_url) {
        $result[] = array(
            'product_url' => $product_url,
            'shop_name' => $shop_name,
            // A list may be shorter than the product list; report the gap as null.
            'photo_url' => isset($photo_url[$i]) ? $photo_url[$i] : null,
            'was_price' => isset($was_price[$i]) ? $was_price[$i] : null,
            'now_price' => isset($now_price[$i]) ? $now_price[$i] : null
        );
    }
    echo json_encode($result);
}
// Configuration for scraping the ASOS "70% off" women's sale listing.
// XPath attribute tests use the "@" axis abbreviation ([@id='items']); "#" is
// not valid XPath syntax and makes DOMXPath::query() fail.
$list_url = "http://www.asos.com/Women/Sale/70-Off-Sale/Cat/pgecategory.aspx?cid=16903&pge=0&pgesize=1002&sort=-1";
$shop_name = "ASOS";
$photo_location = "//ul[@id='items']/li/div[@class='categoryImageDiv']/*[1]/img";
$photo_url_root = "";
$product_location = "//ul[@id='items']/li/div[@class='categoryImageDiv']/*[1]";
$product_url_root = "http://www.asos.com";
$was_price_location = "//ul[@id='items']/li/div[@class='productprice']/span[@class='price' or @class='recRP rrp']"; // leave recRP rrp
$now_price_location = "//ul[@id='items']/li/div[@class='productprice']/span[@class='prevPrice previousprice' or @class='price outlet-current-price']"; // leave outlet-current-price
$gender = "f";
$country = "UK";
// NOTE(review): $con is not defined in this snippet; presumably an open mysqli connection created earlier - confirm.
scrape($list_url, $shop_name, $photo_location, $photo_url_root, $product_location, $product_url_root, $was_price_location, $now_price_location, $gender, $country, $con);
I counted the number of matches on that page: there are 1563 hits for your was_price expression and only 1440 for your now_price expression. This tells me that either your XPath doesn't match in 100% of the cases or that some of the articles only have one price.
So you have to make sure that all of your XPath expressions return the same number of results, so that: products = new_price = old_price = images
Related
Excuse my English, please.
I use Rollingcurl to crawl various pages.
Rollingcurl: https://github.com/LionsAd/rolling-curl
My class:
<?php
/**
 * Extracts data from an IMDb title-search result page.
 */
class Imdb
{
    private $release;

    public function __construct()
    {
        $this->release = "";
    }

    /**
     * Parse one search result page.
     *
     * @param string $response HTML of the page; an empty value yields three empty strings.
     * @param mixed  $info     Second callback argument; not used by this method.
     *
     * @return array Three strings, in this order:
     *               [0] "href" of the first "next page" link ("" when there is none),
     *               [1] the "alt" texts of all img.loadlate elements, concatenated,
     *               [2] all IMDb ids ("tt" + 7 digits) found in the result links, concatenated.
     */
    public static function most_popular($response, $info)
    {
        // Defaults keep the return value well-defined when nothing is found.
        $next_link = "";
        $rls_name = "";
        $tt_info = "";

        $doc = new DOMDocument();
        libxml_use_internal_errors(true); // disable libxml errors

        if (!empty($response)) {
            $doc->loadHTML($response);
            libxml_clear_errors(); // remove errors for yucky html
            $xpath = new DOMXPath($doc);

            // Attribute tests need the "@" axis; "/@href" selects the attribute node itself.
            $hrefs = $xpath->query("//div[contains(@class, 'lister-item-image') and contains(@class, 'float-left')]/a/@href");
            $nexts = $xpath->query("//a[contains(@class, 'lister-page-next') and contains(@class, 'next-page')]");
            $names = $xpath->query('//img[@class="loadlate"]');

            // NEXT URL - only the first matching link is used
            if ($nexts->length > 0) {
                $next_link = $nexts->item(0)->getAttribute('href');
            }

            // RELEASE NAME
            foreach ($names as $name) {
                $rls_name .= $name->getAttribute('alt');
            }

            // IMDB TT0000000 RELEASE - read the id straight from the href value
            foreach ($hrefs as $href) {
                if (preg_match('/tt\\d{7}/is', $href->nodeValue, $match)) {
                    $tt_info .= $match[0];
                }
            }
        }

        return array(
            $next_link,
            $rls_name,
            $tt_info,
        );
    }
}
Output/Return:
// Excerpt: the value built and returned at the end of the method -
// a three-element list holding $next_link, $rls_name and $tt_info, in that order.
$array = array(
$next_link,
$rls_name,
$tt_info,
);
return ($array);
Call:
<?php
error_reporting(E_ALL);
ini_set('display_errors', 1);
/**
 * Return one group of the first match of $regex in $content.
 *
 * @param string     $regex   PCRE pattern, delimiters included.
 * @param string     $content Text to search.
 * @param int|string $pos     Group index to return (0 = whole match); converted with intval().
 *
 * @return string|null The requested group, or null when the pattern does not
 *                     match or the group does not exist.
 */
function get_match($regex, $content, $pos = 1)
{
    $matches = array();
    // preg_match() returns 1 only on a successful match (0 = no match, false = error).
    if (preg_match($regex, $content, $matches) !== 1) {
        return null;
    }

    $index = intval($pos);
    return isset($matches[$index]) ? $matches[$index] : null;
}
require "RollingCurl.php";
require "imdb_class.php";

$imdb = new Imdb;

// Requested action: the query string wins over the POST body; defaults to "".
if (isset($_GET['action'])) {
    $action = $_GET['action'];
} elseif (isset($_POST['action'])) {
    $action = $_POST['action'];
} else {
    $action = "";
}
echo " 2222<br /><br />";

if ($action === "most_popular") {
    $popular = '&num_votes=1000,&production_status=released&groups=top_1000&sort=moviemeter,asc&count=40&start=1';

    // The date comes from the request, so it is URL-encoded before being put
    // into the query string; anything that is not a plain string falls back to 2018.
    $date = (isset($_GET['date']) && is_string($_GET['date'])) ? $_GET['date'] : '2018';
    $link = "https://www.imdb.com/search/title?title_type=feature,tv_movie&release_date=," . rawurlencode($date) . $popular;

    $urls = array($link);
    $rc = new RollingCurl([$imdb, 'most_popular']); // callback invoked for every finished request
    $rc->window_size = 20;
    foreach ($urls as $url) {
        $request = new RollingCurlRequest($url);
        $rc->add($request);
    }
    // NOTE(review): RollingCurl reportedly does not hand the callback's return
    // value back through execute(); confirm before relying on $stream.
    $stream = $rc->execute();
}
If I output everything as "echo" in the class, everything is also displayed. However, I want to call everything individually.
If I now try to output it like this, it doesn't work.
$stream[0]
$stream[1]
$stream[3]
Does anyone have any idea how this might work?
Thank you very much in advance.
RollingCurl doesn't do anything with the return value of the callback, and doesn't return it to the caller. $rc->execute() just returns true when there's a callback function. If you want to save anything, you need to do it in the callback function itself.
You should make most_popular a non-static method, and give the class a property $results that you initialize to [] in the constructor. Then it can do:
$this->results[] = $array;
After you do
$rc->execute();
you can do:
// Print the release names (index 1) and IMDb ids (index 2) of every stored result.
foreach ($imdb->results as $entry) {
    echo "Release name: {$entry[1]}<br>TT Info: {$entry[2]}<br>";
}
It would be better if you put the data you extracted from the document in arrays rather than concatenated strings, e.g.
// Store the extracted values in array properties instead of concatenated strings.
// Properties are accessed as $this->name (no second "$"), and the loops iterate
// over the node lists the method queried earlier ($names, $row, $nexts).
$this->rls_names = [];
foreach ($names as $name) {
    $this->rls_names[] = $name->getAttribute('alt');
}
$this->tt_infos = [];
foreach ($row as $href) {
    // "@" silences the notice get_match() raises when the pattern does not match.
    $this->tt_infos[] = @get_match('/tt\\d{7}/is', $doc->saveHtml($href), 0);
}
// No loop is needed to read the first element of the node list.
$this->next_link = $nexts->length > 0 ? $nexts->item(0)->getAttribute('href') : '';
I am building custom web services for a mobile app.
I want to load the filters programmatically, based on the current product collection.
Currently I get the filters for the entire category, but when a filter is applied the same filter options are returned again.
Below is the code which i have used for category filter
// Build the list of layered-navigation filters (with their options) and the
// available sort options for the category identified by $category_id.
$layer = Mage::getModel("catalog/layer");
$_category = Mage::getModel("catalog/category")->load($category_id);
$layer->setCurrentCategory($_category);
$attributes = $layer->getFilterableAttributes();

/* Alternative kept for reference: load the filterable attributes of an attribute set directly.
$collection = Mage::getResourceModel('catalog/product_attribute_collection');
$collection->setItemObjectClass('catalog/resource_eav_attribute');
$collection->setAttributeSetFilter(array(4));
$collection->addStoreLabel(Mage::app()->getStore()->getId());
$collection->setOrder('position', 'ASC');
$collection->addFieldToFilter('additional_table.is_filterable', array('gt' => 0));
$attributes = $collection->load();
*/
// print_r($attributes->getData());exit();

$attributeCollection = array();
foreach ($attributes as $attribute) {
    $attributeCode = $attribute->getAttributeCode();
    $isPrice = ($attributeCode == 'price');

    // Pick the filter block that matches the attribute kind.
    if ($isPrice) {
        $filterBlockName = 'catalog/layer_filter_price';
    } elseif ($attribute->getBackendType() == 'decimal') {
        $filterBlockName = 'catalog/layer_filter_decimal';
    } else {
        $filterBlockName = 'catalog/layer_filter_attribute';
    }
    $filterBlock = Mage::app()->getLayout()
        ->createBlock($filterBlockName)
        ->setLayer($layer)
        ->setAttributeModel($attribute)
        ->init();

    $attributeOptionCollection = array();
    foreach ($filterBlock->getItems() as $option) {
        $label = $option->getLabel();
        $attributeOptionCollection[] = array(
            // Price labels are passed through strip_tags(); other labels are used as-is.
            'Label' => $isPrice ? strip_tags($label) : $label,
            'Value' => $option->getValue(),
            'Type' => $option->getFrontend(),
        );
    }

    $attributeCollection[] = array(
        'Code' => $attributeCode,
        'Label' => $attribute->getStoreLabel(),
        'Options' => $attributeOptionCollection,
    );
}
// print_r($attributeCollection);exit();
// echo "<pre>";

// Sort options, stored under consecutive indexes starting at 0.
$counter = 0;
foreach ($_category->getavailablesortbyoptions() as $sortCode => $sortLabel) {
    $optinsList[$counter]['Code'] = $sortCode;
    $optinsList[$counter]['Label'] = $sortLabel;
    $counter++;
    // print_r($sortLabel);
}
$information['Filters'] = $attributeCollection;
Please suggest how I should proceed.
I used this for a particular Category - hope this helps
//require necessary files
// Echo, as JSON, the attribute filters (name plus label/value options) that are
// available for the category whose id is posted as "categoryID".
require_once('../app/Mage.php');

// The id comes from the request: tolerate a missing field and force an integer.
$categoryID = isset($_POST['categoryID']) ? (int) $_POST['categoryID'] : 0;

//necessary initialization
Mage::app();
$websiteId = Mage::app()->getWebsite()->getId(); // not used below
$store = Mage::app()->getStore();                // not used below

try {
    $json = array('status' => true);
    $json['data'] = array();

    $layer = Mage::getModel("catalog/layer");
    $category = Mage::getModel("catalog/category")->load($categoryID);
    $layer->setCurrentCategory($category);
    $attributes = $layer->getFilterableAttributes();

    foreach ($attributes as $attribute) {
        $filterBlockName = 'catalog/layer_filter_attribute';
        $result = Mage::app()->getLayout()->createBlock($filterBlockName)->setLayer($layer)->setAttributeModel($attribute)->init();

        // Start from an empty list for every attribute so that a filter without
        // items never reads an undefined variable.
        $count = array();
        foreach ($result->getItems() as $option) {
            $count[] = array('attribute_name' => $option->getLabel(), 'attribute_value' => $option->getValue());
        }

        // Attributes without any option are left out of the response.
        if (!empty($count)) {
            $json['data'][] = array('name' => ucfirst($attribute->getAttributeCode()), 'count' => $count);
        }
    }
} catch (Exception $e) {
    $json = array('status' => false, 'message' => $e->getMessage());
}
echo json_encode($json);
This is my code. It creates an XML file from MySQL data.
My problem:
// Walks over every entry of $str_exp1, splits it at "," and writes the two
// parts to the "lat"/"lng" attributes of $newnode. The same $newnode is
// written on every pass, so each pass replaces the values set by the previous one.
for($i=0; $i<count($str_exp1); $i++) // HERE
{
$str_exp2 = explode(",", $str_exp1[$i]);
$newnode->setAttribute("lat", $str_exp2[0]);
$newnode->setAttribute("lng", $str_exp2[1]);
}
The for loop does not output all of the data; it only shows the last entry, and I can't find where the problem is.
P.S. Sorry for my English.
0
// Export every row of "usersline" as XML:
//   <marker><line id_line color width><point lat lng/>...</line>...</marker>
// Each "lat,lng" pair of the ";"-separated "coordinats" column becomes its own
// <point> inside the <line> it belongs to. Writing all pairs to one shared
// element would keep only the last pair, because an element can hold a given
// attribute only once.
// NOTE(review): the mysql_* functions used here belong to the legacy MySQL
// extension, which was removed in PHP 7; migrating needs a mysqli/PDO connection.
$doc = new DOMDocument("1.0");
$node = $doc->createElement("marker");
$parnode = $doc->appendchild($node);

$result = mysql_query("SELECT * FROM usersline");
// mysql_query() returns false on failure; treat that like "no rows".
if ($result && mysql_num_rows($result) > 0) {
    header("Content-type: text/xml");
    while ($mar = mysql_fetch_array($result)) {
        $line = $parnode->appendChild($doc->createElement("line"));
        $line->setAttribute("id_line", $mar['id_line']);
        $line->setAttribute("color", $mar['colour']);
        $line->setAttribute("width", $mar['width']);

        foreach (explode(";", $mar['coordinats']) as $pair) {
            $latlng = explode(",", $pair);
            // Skip empty or malformed segments (e.g. after a trailing ";").
            if (count($latlng) < 2) {
                continue;
            }
            $point = $line->appendChild($doc->createElement("point"));
            $point->setAttribute("lat", $latlng[0]);
            $point->setAttribute("lng", $latlng[1]);
        }
    }
    $xmlfile = $doc->saveXML();
    echo $xmlfile;
} else {
    echo "<p>Ëèíèé íå îáíàðóæåíî!</p>";
}
Your problem is that you set multiple values to the same node. So you are always overwriting the attribute values with the latest lat/long value.
Instead you need to add a new element per each lat/long pair because XML elements do not have duplicate attributes.
Some example code based on your question, as you can see I introduce some functions to keep things more modular:
// Export all rows of "usersline" as <marker><line ...><point .../>...</line>...</marker>.
$rows = $db->query("SELECT * FROM usersline");
$hasRows = $rows && count($rows);
if (!$hasRows) {
    // Nothing to export: print the notice and stop.
    echo "<p>Ëèíèé íå îáíàðóæåíî!</p>";
    return;
}

$doc = new DOMDocument("1.0");
$doc->loadXML('<marker/>');
$root = $doc->documentElement;

// Row column => attribute name ("colour" is exported as "color").
$attributeMap = ['id_line', 'colour' => 'color', 'width'];

foreach ($rows as $row) {
    $lineElement = $doc->createElement('line');
    element_add_attributes($lineElement, array_map_array($attributeMap, $row));

    // One <point> per coordinate pair, nested inside its line.
    foreach (coordinates_to_array($row['coordinats']) as $latlong) {
        $pointElement = $lineElement->appendChild($doc->createElement('point'));
        element_add_attributes($pointElement, $latlong);
    }

    $root->appendChild($lineElement);
}

header("Content-type: text/xml");
echo $doc->saveXML();
/**
 * Copy the string-keyed entries of $attributes onto $element as XML attributes.
 *
 * Entries with non-string (integer) keys are ignored.
 *
 * @param DOMElement $element    Element that receives the attributes.
 * @param array      $attributes Map of attribute name => attribute value.
 */
function element_add_attributes(DOMElement $element, array $attributes)
{
    foreach (array_keys($attributes) as $key) {
        if (is_string($key)) {
            $element->setAttribute($key, $attributes[$key]);
        }
    }
}
/**
 * Pick entries from $array and optionally rename their keys.
 *
 * Each $map entry is either a plain value "name" (copy $array["name"] under
 * the same key) or a pair "source" => "name" (copy $array["source"] under the
 * key "name").
 *
 * @param array $map   Keys to pick, optionally as source => new name.
 * @param array $array Source data.
 *
 * @return array The picked values, in the order given by $map.
 */
function array_map_array(array $map, array $array)
{
    $picked = array();
    foreach ($map as $key => $target) {
        if (is_string($key)) {
            $picked[$target] = $array[$key];
        } else {
            $picked[$target] = $array[$target];
        }
    }
    return $picked;
}
/**
 * Split a "lat,lng;lat,lng;..." string into a list of coordinate pairs.
 *
 * Empty segments (empty input, leading/trailing or doubled ";") are skipped so
 * that no pair with two empty values is produced. Surrounding whitespace is
 * trimmed from both values. A segment without a "," keeps its text as "lat"
 * and gets an empty "lng".
 *
 * @param string $coordinates Pairs separated by ";", values separated by ",".
 *
 * @return array List of arrays with the keys "lat" and "lng", in that order.
 */
function coordinates_to_array($coordinates)
{
    $result = array();
    foreach (explode(';', (string) $coordinates) as $coordinatePair) {
        $coordinatePair = trim($coordinatePair);
        if ($coordinatePair === '') {
            continue;
        }
        // "+ array('', '')" pads a segment that has no comma with an empty longitude.
        $parts = explode(',', $coordinatePair, 2) + array('', '');
        // Building the pair explicitly fixes the key order, which a list()
        // assignment into array keys does not guarantee across PHP versions.
        $result[] = array('lat' => trim($parts[0]), 'lng' => trim($parts[1]));
    }
    return $result;
}
I hope this example is helpful and shows you some ways to take a problem apart so that your code becomes easier to read and more stable.
To make use of $db->query(...) first define a class that has the query method:
/**
 * Minimal query wrapper around the legacy mysql_* functions.
 * NOTE(review): the mysql_* extension was removed in PHP 7; switching to
 * mysqli or PDO needs a connection object that this snippet does not have.
 */
class DB {
    /**
     * Run a query and return all of its rows.
     *
     * @param string $sql SQL statement to execute.
     *
     * @return array List of rows as returned by mysql_fetch_array(); empty when
     *               the query fails or produces no result set.
     */
    public function query($sql) {
        $result = array();
        $dbhandle = mysql_query($sql);
        // mysql_query() yields false on error and true for statements without
        // a result set; only a result resource can be fetched from.
        if (!is_resource($dbhandle)) {
            return $result;
        }
        while ($mar = mysql_fetch_array($dbhandle)) {
            $result[] = $mar;
        }
        return $result;
    }
}
Then instantiate it:
$db = new DB();
You can then use the code above for that part.
For the problem with the PHP 5.4 array notation for example in this line:
$attributes = array_map_array(['id_line', 'colour' => 'color', 'width'], $mar);
First of all extract the array out of it:
$mapping = ['id_line', 'colour' => 'color', 'width'];
$attributes = array_map_array($mapping, $mar);
Then define the array with the array( and ) notation instead of [ and ]:
$mapping = array('id_line', 'colour' => 'color', 'width');
$attributes = array_map_array($mapping, $mar);
Do so as well in other places, e.g.
['', '']
becomes
array('', '')
and similar.
Replace your code with this:
// Splits the "coordinats" column at ";" and writes the first segment to the
// "lat" attribute and the second segment to the "lng" attribute of $newnode.
// NOTE(review): if each segment is itself a "lat,lng" pair, "lat" receives a
// whole pair here - confirm the column format before using this.
$str_exp1 = explode(";", $mar['coordinats']);
$newnode->setAttribute("lat", $str_exp1[0]);
$newnode->setAttribute("lng", $str_exp1[1]);
How to find the values for namespace content:encoded and dc:creator with the following code
Unfortunately I cannot use simplepie or magpierss or even simplexml.
I know I've to use $doc->getElementsByTagName, but cannot figure out where?
<?php
/**
 * Load an RSS/Atom document and extract selected child values of every item.
 *
 * For each element named in $tags, every group in $array is filled with the
 * values of the descendant elements listed for that group. The value taken
 * from a descendant is, in this order: its non-empty text content, else its
 * "term" attribute, else its "href" attribute when rel="alternate".
 *
 * @param array  $tags  Item element names to look for, e.g. array('item', 'entry').
 * @param array  $array Map of output key => list of element names to read.
 * @param string $url   Location of the feed, passed to DOMDocument::load().
 *
 * @return array One entry per item found; each entry maps every key of $array
 *               to a list of strings. Empty when the document cannot be loaded.
 */
function rss_to_array($tags, $array, $url) {
    $rss_array = array();

    $doc = new DOMDocument();
    // Collect parser errors internally rather than emitting warnings, and
    // restore the caller's libxml setting afterwards.
    $previous = libxml_use_internal_errors(true);
    $loaded = $doc->load($url);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
    if (!$loaded) {
        return $rss_array;
    }

    foreach ($tags as $tag) {
        foreach ($doc->getElementsByTagName($tag) as $node) {
            $items = array();
            foreach ($array as $key => $values) {
                $items[$key] = array();
                foreach ($values as $value) {
                    foreach ($node->getElementsByTagName($value) as $element) {
                        $text = $element->nodeValue;
                        if ($text != "") {
                            $items[$key][] = $text;
                        } else if ($term = $element->getAttribute('term')) {
                            $items[$key][] = $term;
                        } else if ($element->getAttribute('rel') == 'alternate') {
                            $items[$key][] = $element->getAttribute('href');
                        }
                    }
                }
            }
            array_push($rss_array, $items);
        }
    }
    return $rss_array;
}
// Element names that mark one feed item (RSS "item", Atom "entry").
$rss_item_tags = array('item', 'entry');

// Output key => element names whose values are collected under that key.
$rss_tags = array(
    'title' => array('title'),
    'description' => array('description', 'content', 'summary'),
    'link' => array('link', 'feedburner'),
    'category' => array('category')
);

// Parse the feed at $url, dump the result inside a <pre> block and stop.
$rssfeed = rss_to_array($rss_item_tags, $rss_tags, $url);
echo '<pre>' . print_r($rssfeed, true) . '</pre>';
exit;
?>
for RSS feeds, try using simplexml_load_file. It creates an object out of the XML and, as all RSS feeds are the same, then you can do something like:
// Replace the placeholder with the address of the feed to read.
$feed = simplexml_load_file('your_rss_url_here');
// simplexml_load_file() returns false when the feed cannot be loaded or parsed.
if ($feed !== false) {
    // Read at most 10 items, but never more than the feed actually contains.
    $limit = min(10, count($feed->channel->item));
    for ($i = 0; $i < $limit; $i++) {
        $link = $feed->channel->item[$i]->link;
        // do each for pubdate, author, description, title, etc.
    }
}
http://php.net/manual/en/book.simplexml.php
I am trying to pull the price and seller from the amazon offer listing pages found at:
http://www.amazon.com/gp/offer-listing/B002UYSHMM
I can get the price by using:
$ret['Retail'] = $html->find('span[class="price"]', 0)->innertext;
This pulls the first price in the offer listing
I tried to pull the matching seller of the first price by using the below to get the alt value from the img which contains the seller name:
$ret['SoldBy'] = $html->find('ul.sellerInformation img', 0)->getAttribute('alt');
It worked for the first one but as I went down it started missing sellers and even missing prices in some cases.
Can anyone tell why it would miss sellers and even jump around on prices? All I did to get additional sellers is:
// Offers 2 to 7: match index $offer of the price spans and of the seller
// images is stored under the keys "Retail<n>" / "SoldBy<n>", with n = $offer + 1.
for ($offer = 1; $offer <= 6; $offer++) {
    $suffix = $offer + 1;
    $ret['Retail' . $suffix] = $html->find('span[class="price"]', $offer)->innertext;
    $ret['SoldBy' . $suffix] = $html->find('ul.sellerInformation img', $offer)->getAttribute('alt');
}
Thank you for any suggestions!
<?php
// Fetch an Amazon offer-listing page and print one array per table row of
// every <tbody> whose class contains "result". A row array maps each class
// name from $classes_to_collect found on a direct child of a cell to that
// child's text; for a <ul> child the value is an array of seller fields.
$url = 'http://www.amazon.com/gp/offer-listing/B0036RNK7O/ref=dp_olp_new?ie=UTF8&qid=1319582305&sr=8-2';
$classes_to_collect = array('price', 'shipping_block', 'condition', 'sellerInformation');
// Meaning of the 1st..4th <li> of a seller <ul>, by position.
$seller_elements = array('name', 'rating', 'stock_info', 'item_info');
$results = array();

$dom = new DOMDocument();
$content = file_get_contents($url);
if ($content !== false && $content !== '') {
    // Real-world HTML is rarely valid; keep libxml's complaints out of the output.
    libxml_use_internal_errors(true);
    $dom->loadHTML($content);
    libxml_clear_errors();
}

foreach ($dom->getElementsByTagName('tbody') as $tb) {
    if (stripos($tb->getAttribute('class'), 'result') === false) {
        continue;
    }
    foreach ($tb->getElementsByTagName('tr') as $tr) {
        $new_result = array();
        foreach ($tr->getElementsByTagName('td') as $td) {
            foreach ($td->childNodes as $cne) {
                // Text and comment nodes carry no class attribute.
                if (!($cne instanceof DOMElement)) {
                    continue;
                }
                $class = $cne->getAttribute('class');
                if (!$class) {
                    continue;
                }
                foreach ($classes_to_collect as $ctc) {
                    if (stripos($class, $ctc) === false) {
                        continue;
                    }
                    if ($cne->localName != 'ul') {
                        $new_result[$ctc] = $cne->textContent;
                        continue;
                    }

                    // Start with a fresh seller record for every list so that
                    // fields of a previous row cannot leak into this one.
                    $new_seller = array();
                    foreach ($cne->getElementsByTagName('li') as $lii => $lie) {
                        // Ignore list items beyond the known field positions.
                        if (!isset($seller_elements[$lii])) {
                            continue;
                        }
                        $field = $seller_elements[$lii];
                        $value = $lie->textContent;
                        if ($field == 'item_info') {
                            // Drop the inline script text that follows the description.
                            $cutoff = strpos($value, 'amznJQ.onReady');
                            if ($cutoff !== false) $value = substr($value, 0, $cutoff);
                        } else if ($field == 'name') {
                            // Keep what follows the label, wherever the label starts.
                            $cutoff = strpos($value, 'Seller:');
                            if ($cutoff !== false) $value = substr($value, $cutoff + strlen('Seller:'));
                        } else if ($field == 'rating') {
                            $cutoff = strpos($value, 'Seller Rating:');
                            if ($cutoff !== false) $value = substr($value, $cutoff + strlen('Seller Rating:'));
                        }
                        $new_seller[$field] = trim($value);
                    }
                    $new_result[$ctc] = $new_seller;
                }
            }
        }
        $results[] = $new_result;
    }
}
print_r($results);
Will print a huge multi-dimensional array
I used a foreach and put the results into an array. Worked much better since the number of sellers varies by item.
// Collect the price and seller of every offer row into $ret.
// Rows without a price span are skipped. The seller name is the "alt" text of
// the row's first image when it has one, otherwise the bold link text in the
// seller information list (empty string when neither exists).
// NOTE(review): $html is presumably a PHP Simple HTML DOM Parser document - confirm.
foreach ($html->find('div.resultsset table tbody.result tr') as $article) {
    $price = $article->find('span.price', 0);
    if (!$price) {
        continue;
    }
    // get retail
    $item['Retail'] = $price->plaintext;

    // get soldby - the row may have no image at all, so check before reading "alt"
    $logo = $article->find('img', 0);
    $alt = $logo ? $logo->getAttribute('alt') : '';
    // Loose comparison on purpose: a missing attribute may come back as false.
    if ($alt != '') {
        $item['SoldBy'] = $alt;
    } else {
        $sellerName = $article->find('ul.sellerInformation li a b', 0);
        $item['SoldBy'] = $sellerName ? $sellerName->plaintext : '';
    }
    $ret[] = $item;
}