file_get_contents is doing strange on server - php

this is a script that gives you a direct mp3 download link from a youtube video id, variable $gg is the video id
So when i run this code on my xampp locally it runs fine and returns me a direct download link but when i try to run this code on my hosts server it returns a link but not a direct link but a download page what am i doing wrong?
<?php
$gg = '6Y1Emb7Jyks';
$site = 'http://www.youtubeinmp3.com/widget/button/?video=https://www.youtube.com/watch?v='.$gg;
$html = file_get_contents($site);
$dom = new DOMDocument();
#$dom->loadHTML($html);
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a");
for ($i = 0; $i < $hrefs->length; $i++) {
$href = $hrefs->item($i);
$url = $href->getAttribute('href');
}
$lol = 'http://www.youtubeinmp3.com/'.$url;
echo $lol;

figured it out myself i had to follow the redirect
final code
<?php
$gg = '6Y1Emb7Jyks';
$site = 'http://www.youtubeinmp3.com/widget/button/?video=https://www.youtube.com/watch?v='.$gg;
$html = file_get_contents($site);
$dom = new DOMDocument();
#$dom->loadHTML($html);
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a");
for ($i = 0; $i < $hrefs->length; $i++) {
$href = $hrefs->item($i);
$url = $href->getAttribute('href');
}
$lol = 'http://www.youtubeinmp3.com/'.$url;
$url= $lol;
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$a = curl_exec($ch);
$url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
echo $url;

Related

Unable to grab content traversing multiple pages

I've written a script in php to scrape the titles and its links from a webpage. The webpage displays it's content traversing multiple pages. My below script can parse the titles and links from it's landing page.
How can I rectify my existing script to get data from multiple pages, as in upto 10 pages?
This is my attempt so far:
<?php
include "simple_html_dom.php";
$link = "https://stackoverflow.com/questions/tagged/web-scraping?page=2";
function get_content($url)
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_BINARYTRANSFER, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$htmlContent = curl_exec($ch);
curl_close($ch);
$dom = new simple_html_dom();
$dom->load($htmlContent);
foreach($dom->find('.question-summary') as $file){
$itemTitle = $file->find('.question-hyperlink', 0)->innertext;
$itemLink = $file->find('.question-hyperlink', 0)->href;
echo "{$itemTitle},{$itemLink}<br>";
}
}
get_content($link);
?>
The site increments it's pages like ?page=2,?page=3 e.t.c.
This is how I got success (coping with Nima's suggestion).
<?php
include "simple_html_dom.php";
$link = "https://stackoverflow.com/questions/tagged/web-scraping?page=";
function get_content($url)
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_BINARYTRANSFER, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$htmlContent = curl_exec($ch);
curl_close($ch);
$dom = new simple_html_dom();
$dom->load($htmlContent);
foreach($dom->find('.question-summary') as $file){
$itemTitle = $file->find('.question-hyperlink', 0)->innertext;
$itemLink = $file->find('.question-hyperlink', 0)->href;
echo "{$itemTitle},{$itemLink}<br>";
}
}
for($i = 1; $i<10; $i++){
get_content($link.$i);
}
?>
Here is how i would do it with XPath:
$url = 'https://stackoverflow.com/questions/tagged/web-scraping';
$dom = new DOMDocument();
$source = loadUrlSource($url);
$dom->loadHTML($source);
$xpath = new DOMXPath($dom);
$domPage = new DOMDocument();
$domPage->loadHTML($source);
$xpath_page = new DOMXPath($domPage);
// Find page links with the title "go to page" within the div container that contains "pager" class.
$pageItems = $xpath_page->query("//div[contains(#class, 'pager')]//a[contains(#title, 'go to page')]");
// Get last page number.
// Since you will look once at the beginning for the page number, subtract by 2 because the link "next" has title "go to page" as well.
$pageCount = (int)$pageItems[$pageItems->length-2]->textContent;
// Loop every page
for($page=1; $page < $pageCount; $page++) {
$source = loadUrlSource($url . "?page={$page}");
// Do whatever with the source. You can also call simple_html_dom on the content.
// $dom = new simple_html_dom();
// $dom->load($source);
}

Get meta data from tumblr

I would like to put open graph meta tags (og:image, og:description) to a database on my site from a tumblr blog. I tried to get meta tags from tubmlr with this code
function file_get_contents_curl($url) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
$html = file_get_contents_curl("http://www.example.com");
//parsing begins here:
$doc = new DOMDocument();
#$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
//get and display what you need:
$title = $nodes->item(0)->nodeValue;
$metas = $doc->getElementsByTagName('meta');
for ($i = 0; $i < $metas->length; $i++)
{
$meta = $metas->item($i);
if($meta->getAttribute('name') == 'description')
$description = $meta->getAttribute('content');
if($meta->getAttribute('name') == 'keywords')
$keywords = $meta->getAttribute('content');
}
echo "Title: $title". '<br/><br/>';
echo "Description: $description". '<br/><br/>';
echo "Keywords: $keywords";
I know this code won't get the open graph code, but the problem is it doesn't get any meta. The code works fine with for exaple www.google.com but not with tumblr.
How could I get the tumblr meta?
Thank you!

Get Title and Meta description and image of External site in yii framework

how can i get the meta description , title and image , from external site url, i have achieved this using php but i dont know how i use this it in yii controller, my code is
function file_get_contents_curl($url)
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
$html = file_get_contents_curl("http://www.example.com");
//parsing begins here:
$doc = new DOMDocument();
#$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
//get and display what you need:
$title = $nodes->item(0)->nodeValue;
$metas = $doc->getElementsByTagName('meta');
for ($i = 0; $i < $metas->length; $i++)
{
$meta = $metas->item($i);
if($meta->getAttribute('name') == 'description')
$description = $meta->getAttribute('content');
if($meta->getAttribute('name') == 'keywords')
$keywords = $meta->getAttribute('content');
if($meta->getAttribute('language') == 'language');
$language = $meta->getAttribute('language');
}
echo "Title: $title". '<br/><br/>';
echo "Description: $description". '<br/><br/>';
echo "Keywords: $keywords";
im new to yii , any help
I used your code (with some minor edits) to create the following file. Save it in protected/components/HttpDetails.php (note: error handling not implemented - in case of http failure or other)
class HttpDetails {
private static function file_get_contents_curl($url) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
public static function getDetails($url) {
$html = self::file_get_contents_curl($url);
//parsing begins here:
$doc = new DOMDocument();
#$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
//get and display what you need:
$title = $nodes->item(0)->nodeValue;
$metas = $doc->getElementsByTagName('meta');
for ($i = 0; $i < $metas->length; $i++) {
$meta = $metas->item($i);
if ($meta->getAttribute('name') == 'description')
$description = $meta->getAttribute('content');
if ($meta->getAttribute('name') == 'keywords')
$keywords = $meta->getAttribute('content');
}
return array(
'title'=>isset($title)?$title:'Not set',
'description'=> isset($description)?$description:'Not set',
'keywords'=> isset($keywords)?$keywords:'Not set',
);
}
}
Edit your import array in protected\config\main.php to include 'application.components.HttpDetails'
...
'import' => array(
...
'application.components.HttpDetails',
),
To read the details from a page do the following in any controller (or elsewhere in your application)
$url = "www.cnn.com";
$details = HttpDetails::getDetails($url);
$title = $details['title'];
$description = $details['description'];
$keywords = $details['keywords'];
The above exact code has been tested and works fine. If you are getting errors, you should check your php environment for DOM / libxml extensions where your Yii is hosted.

How to scrape content betweeen all link tags like SCRAPE THIS on a page?

I'm trying to scrape a site's link texts, i.e SCRAPE THIS. I want to do this for all links on the page. So far I have this:
<?php
$target_url = "SITE I WANT TO SCRAPE";
// make the cURL request to $target_url
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL,$target_url);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
$html= curl_exec($ch);
if (!$html) {
echo "<br />cURL error number:" .curl_errno($ch);
echo "<br />cURL error:" . curl_error($ch);
exit;
}
// parse the html into a DOMDocument
$dom = new DOMDocument();
#$dom->loadHTML($html);
// grab all the on the page
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a/text()");
for ($i = 0; $i < $hrefs->length; $i++) {
$href = $hrefs->item($i);
echo "<br />Link stored: $href";
}
?>
I'm pretty new to this stuff and can't figure out what I'm doing wrong?
Thanks!
In your for loop, $href is not a string. It's actually a DOMText node. In order to use it as a string, you need to access its nodeValue property.
for ($i = 0; $i < $hrefs->length; $i++) {
$href = $hrefs->item($i);
echo "<br />Link stored: $href->nodeValue";
}
Try:
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a/text()");
for ($i = 0; $i < $hrefs->length; $i++) {
$href = $hrefs->item($i)->textContent;
echo "<br />Link stored: $href";
}

Getting url data by curl method giving unexpected results in symbols

I am facing some times Problem in getting url data by curl method specially website data is is in other language like arabic etc
My curl function is
function file_get_contents_curl($url)
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
$data = curl_exec($ch);
$info = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
//checking mime types
if(strstr($info,'text/html')) {
curl_close($ch);
return $data;
} else {
return false;
}
}
And how i am getting data
$html = file_get_contents_curl($checkurl);
$grid ='';
if($html)
{
$doc = new DOMDocument();
#$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
#$title = $nodes->item(0)->nodeValue;
#$metas = $doc->getElementsByTagName('meta');
for ($i = 0; $i < $metas->length; $i++)
{
$meta = $metas->item($i);
if($meta->getAttribute('name') == 'description')
$description = $meta->getAttribute('content');
}
I am getting all data correctly from some arabic websites like
http://www.emaratalyoum.com/multimedia/videos/2012-04-08-1.474873
and when i give this youtube url
http://www.youtube.com/watch?v=Eyxljw31TtU&feature=g-logo&context=G2c4f841FOAAAAAAAFAA
it shows symbols..
what setting i have to do to show exactly the same title description.
Introduction
Getting Arabic can be very tricky but they are some basic steps you need to ensure
Your document must output UTF-8
Your DOMDocument must read in UTF-8 fromat
Problem
When getting Youtube information its already given the information in "UTF-8" format and the retrieval process adds addition UTF-8 encoding .... not sure why this occurs but a simple utf8_decode would fix the issue
Example
header('Content-Type: text/html; charset=UTF-8');
echo displayMeta("http://www.emaratalyoum.com/multimedia/videos/2012-04-08-1.474873");
echo displayMeta("http://www.youtube.com/watch?v=Eyxljw31TtU&feature=g-logo&context=G2c4f841FOAAAAAAAFAA");
Output
emaratalyoum.com
التقطت عدسات الكاميرا حارس مرمى ريال مدريد إيكر كاسياس في موقف محرج قبل لحظات من بداية مباراة النادي الملكي مع أبويل القبرصي في ذهاب دور الثمانية لدوري أبطال
youtube.com
أوروبا.ففي النفق المؤدي إلى الملعب، قام كاسياس بوضع إصبعه في أنفه، وبعدها قام بمسح يده في وجه أحدبنات سعوديات: أريد "شايب يدللني ولا شاب يعللني"
Function Used
displayMeta
function displayMeta($checkurl) {
$html = file_get_contents_curl($checkurl);
$grid = '';
if ($html) {
$doc = new DOMDocument("1.0","UTF-8");
#$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
$title = $nodes->item(0)->nodeValue;
$metas = $doc->getElementsByTagName('meta');
for($i = 0; $i < $metas->length; $i ++) {
$meta = $metas->item($i);
if ($meta->getAttribute('name') == 'description') {
$description = $meta->getAttribute('content');
if (stripos(parse_url($checkurl, PHP_URL_HOST), "youtube") !== false)
return utf8_decode($description);
else {
return $description;
}
}
}
}
}
*file_get_contents_curl*
function file_get_contents_curl($url) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
$data = curl_exec($ch);
$info = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
// checking mime types
if (strstr($info, 'text/html')) {
curl_close($ch);
return $data;
} else {
return false;
}
}
I believe this will work... utf8_decode() your attribute..
function file_get_contents_curl($url)
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
$data = curl_exec($ch);
$info = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
//checking mime types
if(strstr($info,'text/html')) {
curl_close($ch);
return $data;
} else {
return false;
}
}
$html = file_get_contents_curl($checkurl);
$grid ='';
if($html)
{
$doc = new DOMDocument();
#$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
#$title = $nodes->item(0)->nodeValue;
#$metas = $doc->getElementsByTagName('meta');
for ($i = 0; $i < $metas->length; $i++)
{
$meta = $metas->item($i);
if($meta->getAttribute('name') == 'description')
$description = utf8_decode($meta->getAttribute('content'));
}
What happens here is that you're discarding the found Content-Type header that cURL returned in your file_get_contents_curl() function; DOMDocument needs that information to understand the character set that was used on the page.
A somewhat ugly hack, but most generic, is to prefix the returned page with a <meta> tag containing the returned character set from the response headers:
if (strstr($info, 'text/html')) {
curl_close($ch);
return '<meta http-equiv="Content-Type" content="' . $info . '" />' . $data;
}
DOMDocument will accept the misplaced meta tag and do the respective conversions automatically.

Categories