I've been searching for a solution to this problem for a long time and haven't found one.
I managed to extract the mp4 URL, but the problem is that this link redirects to another URL, which can be seen in the Location response header. I don't know how to get that final URL.
(Screenshot of the response headers, showing the Location header.)
<?php
function tidy_html($input_string) {
    $config = array('output-html' => true, 'indent' => true, 'wrap' => 800);
    // Detect if Tidy is configured on this server
    if (function_exists('tidy_get_release')) {
        $tidy = new tidy;
        $tidy->parseString($input_string, $config, 'raw');
        $tidy->cleanRepair();
        $cleaned_html = tidy_get_output($tidy);
    } else {
        // Tidy not configured for this server
        $cleaned_html = $input_string;
    }
    return $cleaned_html;
}

function getFromPage($webAddress, $path) {
    $source = file_get_contents($webAddress); // download the page
    $clean_source = tidy_html($source);
    $doc = new DOMDocument;
    // suppress errors
    libxml_use_internal_errors(true);
    // load the html source from a string
    $doc->loadHTML($clean_source);
    $xpath = new DOMXPath($doc);
    $data = "";
    $nodelist = $xpath->query($path);
    $node_counts = $nodelist->length; // count how many nodes were returned
    if ($node_counts) { // true if the count is more than 0
        foreach ($nodelist as $element) {
            $data = $data . $element->nodeValue . "\n";
        }
    }
    return $data;
}

$vidID = 4145616; // videoid : https://video.sibnet.ru/shell.php?videoid=4145616
$link1 = getFromPage("https://video.sibnet.ru/shell.php?videoid=" . $vidID, "/html/body/script[21]/text()"); // use XPath
$json  = urldecode($link1);
$link2 = strstr($json, "player.src");            // keep everything from "player.src" onwards
$url   = substr($link2, 0, strpos($link2, ",")); // cut at the first comma
$url   = str_replace('"', "", $url);
$url   = substr($url, 18);                       // drop the leading "player.src..." characters, leaving the video path
//header('Location: https://video.sibnet.ru'.$url);
echo ('https://video.sibnet.ru' . $url);
?>
<?php
$url = 'https://video.sibnet.ru' . $url;

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$a = curl_exec($ch);
$url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL); // This is what you need, it will return you the last effective URL
$realUrl = $url; // here you go
?>
SOURCE: https://stackoverflow.com/a/17473000/14885297
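If you would rather not download the video body at all, a variation on the snippet above (a sketch, not part of the quoted answer, and assuming the server also answers a HEAD-style request with a plain 3xx redirect) can read the Location target directly via CURLINFO_REDIRECT_URL:

// Sketch: read the redirect target without following it or downloading the body.
// Assumes $url is the https://video.sibnet.ru/... link built above.
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_NOBODY, true);          // HEAD-style request, skip the body
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false); // stop at the first response
curl_exec($ch);
$redirectUrl = curl_getinfo($ch, CURLINFO_REDIRECT_URL); // value of the Location header
curl_close($ch);
echo $redirectUrl;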
Related
I tried to extract the download URL from the webpage.
The code I tried is below:
function getbinaryurl($url)
{
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FRESH_CONNECT, true);
    $value1 = curl_exec($curl);
    curl_close($curl);

    $start = preg_quote('<script type="text/x-component">', '/');
    $end = preg_quote('</script>', '/');
    $rx = preg_match("/$start(.*?)$end/", $value1, $matches);
    var_dump($matches);
}

$url = "https://www.sourcetreeapp.com/download-archives";
getbinaryurl($url);
This way I am getting the tags as well, not just the content inside the script tag. How do I get the info inside?
The expected result is:
https://product-downloads.atlassian.com/software/sourcetree/ga/Sourcetree_4.0.1_234.zip,
https://product-downloads.atlassian.com/software/sourcetree/windows/ga/SourceTreeSetup-3.3.6.exe,
https://product-downloads.atlassian.com/software/sourcetree/windows/ga/SourcetreeEnterpriseSetup_3.3.6.msi
I am very new to writing regular expressions. Can anyone help me, please?
Instead of using regex, using DOMDocument and XPath gives you more control over the elements you select.
Although XPath can be difficult (same as regex), it can look more intuitive to some. The code uses //script[@type="text/x-component"][contains(text(), "macURL")], which broken down is:
//script = any script node
[@type="text/x-component"] = which has an attribute called type with the specific value
[contains(text(), "macURL")] = whose text contains the string macURL
The query() method returns a list of matches, so loop over them. The content is JSON, so decode it and output the values...
function getbinaryurl($url)
{
    $curl = curl_init($url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_FRESH_CONNECT, true);
    $value1 = curl_exec($curl);
    curl_close($curl);

    $doc = new DOMDocument();
    libxml_use_internal_errors(true);
    $doc->loadHTML($value1);
    libxml_use_internal_errors(false);

    $xp = new DOMXPath($doc);
    $srcs = $xp->query('//script[@type="text/x-component"][contains(text(), "macURL")]');
    foreach ($srcs as $src) {
        $content = json_decode($src->textContent, true);
        echo $content['params']['macURL'] . PHP_EOL;
        echo $content['params']['windowsURL'] . PHP_EOL;
        echo $content['params']['enterpriseURL'] . PHP_EOL;
    }
}

$url = "https://www.sourcetreeapp.com/download-archives";
getbinaryurl($url);
which outputs
https://product-downloads.atlassian.com/software/sourcetree/ga/Sourcetree_4.0.1_234.zip
https://product-downloads.atlassian.com/software/sourcetree/windows/ga/SourceTreeSetup-3.3.8.exe
https://product-downloads.atlassian.com/software/sourcetree/windows/ga/SourcetreeEnterpriseSetup_3.3.8.msi
With the code below I can return the current price of AAPL/Apple. How do I modify it to return, for example, the previous close?
$ticker = "aapl";
$url = "http://reuters.com/finance/stocks/overview?symbol=";
$newURL = $url.$ticker;
$result = file_get_contents($newURL);
$nyArr1 = explode('font-size: 23px;">', $result);
if ($nyArr1[1]) {
$nyArr2 = explode("</span>", $nyArr1[1]);
if ($nyArr2[1]) {
$nyPrice = $nyArr2[0];
}
}
Site link: https://www.reuters.com/finance/stocks/overview/AAPL.O
I recommend using DOMDocument to parse an HTML document with PHP, like this:
$ticker = "aapl";
$baseUrl = "http://reuters.com/finance/stocks/overview?symbol=";
$url = $baseUrl.$ticker;
$dom = new DOMDocument();
$dom->preserveWhiteSpace = false;
$dom->loadHTMLFile($url);
$finder = new DomXPath($dom);
echo "First value :". $finder->query('//*[#id="headerQuoteContainer"]/div[1]/div/span[2]')->item(0)->nodeValue."<br/>";
echo "Second value :". $finder->query('//*[#id="headerQuoteContainer"]/div[3]/div[1]/span[2]')->item(0)->nodeValue;
I have used DomXPath but it is not mandatory.
Try the following to get the required content. If you need another value, all you have to do is modify the visible text in [contains(.,'Prev Close')] to suit your need.
<?php
function get_content($url) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $htmlContent = curl_exec($ch); // run the request once and keep the HTML
    curl_close($ch);

    $dom = new DOMDocument();
    @$dom->loadHTML($htmlContent); // suppress warnings from sloppy markup

    $xp = new DOMXPath($dom);
    $prevClose = $xp->query("//span[contains(.,'Prev Close')]/following-sibling::span")->item(0)->nodeValue;
    $Open = $xp->query("//span[contains(.,'Open')]/following-sibling::span")->item(0)->nodeValue;
    echo "PrevClose: $prevClose" . '<br/>';
    echo "Open: $Open";
}

$link = "https://www.reuters.com/finance/stocks/overview/AAPL.O";
get_content($link);
?>
I am attempting to access an XML feed (with username and password) using XMLReader.
Formerly, I had integrated the credentials into the URL (e.g. http://username:password@mysite.com); however, this is no longer working.
I get 'XPath query failed for bio' at the final check in my code.
Would it be possible to specify the username/password in XMLReader?
Thanks for any leads.
My code (edited to include my cURL code):
<?php
$secondary_user_id = "jsmith";
$url_bio = "http://username:password@mysite.com";

$a_username = 'username';
$a_password = 'password';

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url_bio);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_USERPWD, $a_username . ':' . $a_password);
$result = curl_exec($ch);
$a_error = curl_error($ch);
echo '<br>' . $a_error . '<br>';
curl_close($ch);

$reader = new XMLReader();
//$reader->open($url_bio);
$reader->XML($result);

while ($reader->read()) {
    if ($reader->nodeType == XMLReader::ELEMENT && $reader->name == 'Users') {
        // the code works to this point
        echo 'success<br>';
        $node = $reader->expand();
        $doc = new DOMDocument('1.0', 'UTF-8');
        $n = $doc->importNode($node, true);
        $doc->appendChild($n);
        //$xml_bio_report = simplexml_import_dom($doc->importNode($reader->expand(), true));
        $xml_bio_report = simplexml_import_dom($n);
        $xml_bio_report->registerXPathNamespace('xlink', 'http://www.w3.org/1999/xlink');
        $xml_bio_report->registerXPathNamespace('dmu', 'http://www.digitalmeasures.com/schema/user-metadata');
        //echo $xml_bio_report->Users->User;
        $xml_bio_report_abbrev = $xml_bio_report->xpath('//User[@SecondaryID="' . $secondary_user_id . '"]');
        if ($xml_bio_report_abbrev) {
            echo '<h1>' . $xml_bio_report_abbrev[0]['username'] . '</h1>';
            echo '<h1>' . $xml_bio_report_abbrev[0]['SecondaryID'] . '</h1>';
        } else {
            echo 'XPath query failed for bio';
        }
    }
}
?>
XMLReader::open() uses the PHP stream wrappers. There is a function called libxml_set_streams_context() that allows you to set the stream context for the next open/load call.
$opts = array(
    'http' => array(
        'user_agent' => 'PHP libxml agent',
    )
);

$context = stream_context_create($opts);
libxml_set_streams_context($context);

$reader = new XMLReader();
$reader->open($url_bio);
//...
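Since the question is about passing a username/password, a minimal sketch (assuming the feed uses HTTP Basic authentication; $a_username and $a_password are the variables from the question, and the feed URL below is a placeholder) would add an Authorization header to that same stream context:

// Sketch: pass HTTP Basic credentials through the stream context used by XMLReader::open().
// Assumes the feed expects Basic auth; the URL below is a placeholder.
$opts = array(
    'http' => array(
        'user_agent' => 'PHP libxml agent',
        'header'     => 'Authorization: Basic ' . base64_encode($a_username . ':' . $a_password),
    )
);
libxml_set_streams_context(stream_context_create($opts));

$reader = new XMLReader();
$reader->open('http://mysite.com/feed.xml'); // credentials no longer embedded in the URL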
This is my first time using cURL and selecting elements with XPath. Below is my current code.
<?php
//$curl = curl_init('https://silvergoldbull.com/');
$curl = curl_init('https://e-katalog.lkpp.go.id/backend/katalog/list_produk/77/?isSubmitted=1&orderBy=hargaAsc&list=5&manufakturId=all&penyediaId=all&page=1');
curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
$page = curl_exec($curl);
if (curl_errno($curl)) { // check for execution errors
    echo 'Scraper error: ' . curl_error($curl);
    exit;
}
echo $page;
curl_close($curl);

$page_doc = new DOMDocument;
libxml_use_internal_errors(true);
$page_doc->loadHTML($page);
libxml_clear_errors(); // remove errors for yucky html
$page_doc_xpath = new DOMXPath($page_doc);

//$result = $page_doc_xpath->evaluate('/html/body/div[2]/div[5]/div/div/div[3]/div[3]/div/table/tbody/tr[1]/td/div/div[3]/div/div[1]/div/ol/li/a');
$result = $page_doc_xpath->evaluate('string(/html/body/div[2]/div[5]/div/div/div[3]/div[3]/div/table/tbody/tr[1]/td/div/div[3]/div/div[1]/div/ol/li/a)');
echo "----";
echo $result;

/* $silverprice = $page_doc_xpath->evaluate('string(/html/body/nav/div[3]/div/div/ul/li[1]/a/span/div/div/strong)');
echo $silverprice; */

/* $buyers = tree.xpath('//div[@title="buyer-name"]/text()') */

/* $regex = '/<div id="case_textlist">(.*?)<\/div>/s';
if ( preg_match($regex, $page, $list) )
    echo $list[0];
else
    print "Not found"; */
?>
Using this code, I am able to retrieve "Computer Supplies" (the green bracket at the end of the page). However, how do I retrieve the rest of the red brackets?
Update:
I modified $result to the following code and it still doesn't work. It only returns "Networking" instead of all the values in the brackets.
$result = $page_doc_xpath->evaluate('string(//div[@class="categoryPath"]//a)');
In my case I use Goutte to scrape the data:
use Goutte\Client;

$client = new Client();
$crawler = $client->request('GET', $url);
$titles = $crawler->filter('.listing--name')->extract('_text');
By filtering on a class or ID you can get the text of the matching nodes...
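If you stay with the plain DOMXPath approach from the question instead of Goutte: string() over a node set only returns the value of the first matching node, so query the nodes and loop over them. A minimal sketch, assuming $page_doc_xpath is the DOMXPath object built above and the links really sit under <div class="categoryPath"> as in the updated query:

// Sketch: collect the text of every category link instead of only the first one.
$categories = array();
foreach ($page_doc_xpath->query('//div[@class="categoryPath"]//a') as $a) {
    $categories[] = trim($a->nodeValue);
}
echo implode(' > ', $categories);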
I am trying to read a Twitter timeline with cURL, and for some reason I am unable to use preg_match. Here's my current code; do you see any issues?
$feed = "http://twitter.com/statuses/user_timeline/antonpug.xml?count=3";

function parse_feed($feed) {
    //$matches = Array();
    preg_match_all("/<status>(.*?)<\/status>/", $content[0], $matches);
    return $matches[0];
    //$stepOne = explode("<content type=\"html\">", $feed);
    //$stepTwo = explode("</content>", $stepOne[1]);
    //$tweet = $stepTwo[0];
    //$tweet = htmlspecialchars_decode($tweet,ENT_QUOTES);
    //return $tweet;
}

//Initialize the Curl session
$ch = curl_init();
//Set curl to return the data instead of printing it to the browser.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
//Set the URL
curl_setopt($ch, CURLOPT_URL, $feed);
//Execute the fetch
$twitterFeed = curl_exec($ch);
//Close the connection
curl_close($ch);

//$twitterFeed = file_get_contents($feed);
echo(parse_feed($twitterFeed));
I guess a better idea would be to use SimpleXML to work with the XML as an object.
Your function would then be something like:
function parse_feed($feed) {
    $xml = simplexml_load_string($feed);
    if (isset($xml->status)) {
        return $xml->xpath('status');
    } else {
        return false;
    }
}
It will return an array of SimpleXMLElement objects.
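A minimal usage sketch (assuming each <status> element carries a <text> child, as the old Twitter XML API returned):

// Sketch: feed the cURL result into the SimpleXML-based parser and print each tweet.
// Assumes each <status> element has a <text> child, as in the old Twitter XML API.
$statuses = parse_feed($twitterFeed);
if ($statuses !== false) {
    foreach ($statuses as $status) {
        echo $status->text . PHP_EOL;
    }
}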