How to insert a form query into $html = file_get_contents_curl() - PHP

Hi, I'm trying to use form submission data in file_get_contents_curl().
I want to make a form where you submit a URL and look up the meta tags on that website.
Here's my code:
<form method="post" action="index.php">
<input type="text" name="test">
<input type="submit">
</form>
<?php
function file_get_contents_curl($url)
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
$html = file_get_contents_curl($_POST["test"];); // the submitted url should go here
$doc = new DOMDocument();
@$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
$title = $nodes->item(0)->nodeValue;
$metas = $doc->getElementsByTagName('meta');
for ($i = 0; $i < $metas->length; $i++)
{
$meta = $metas->item($i);
if($meta->getAttribute('name') == 'title')
$description = $meta->getAttribute('content');
if($meta->getAttribute('name') == 'description')
$description = $meta->getAttribute('content');
if($meta->getAttribute('name') == 'keywords')
$keywords = $meta->getAttribute('content');
}
echo '<br/><br/>';
echo "Title: $title". '<br/><br/>';
echo "Description: $description". '<br/><br/>';
echo "Keywords: $keywords". '<br/><br/>';
?>
When I manually enter an address, everything works great, e.g.
$html = file_get_contents_curl('skynews.com');
but this doesn't work at all:
$html = file_get_contents_curl($_POST["test"];);
How do I pass this variable to the function?

OK, I finally figured it out!
I had to assign it to a variable first:
<?php $address = $_POST["test"]; ?>
and then
$html = file_get_contents_curl($address);
works great.
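As a side note, the original call file_get_contents_curl($_POST["test"];) also fails because the stray semicolon inside the argument list is a parse error. Below is a minimal hedged sketch of the working fix with some basic validation added; prepending http:// is an assumption here, since FILTER_VALIDATE_URL rejects bare hosts like skynews.com:
<?php
if (isset($_POST["test"])) {
    $address = trim($_POST["test"]);
    if (stripos($address, 'http') !== 0) {
        $address = 'http://' . $address; // assume http:// if no scheme was given
    }
    if (filter_var($address, FILTER_VALIDATE_URL)) {
        $html = file_get_contents_curl($address);
    } else {
        echo 'Please enter a valid URL.';
    }
}
?>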

Related

Extract url meta tags using php

I want to implement a Facebook-like URL extraction system and I am using PHP cURL to extract the data, but I am getting URL data for only a few websites, not all of them.
Here is the code:
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $_POST["url"]);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($ch);
curl_close($ch);
// Load HTML to DOM Object
$dom = new DOMDocument();
@$dom->loadHTML($data);
// Parse DOM to get Title
$nodes = $dom->getElementsByTagName('title');
$title = $nodes->item(0)->nodeValue;
// Parse DOM to get Meta Description
$metas = $dom->getElementsByTagName('meta');
$body = "";
for ($i = 0; $i < $metas->length; $i ++) {
$meta = $metas->item($i);
if ($meta->getAttribute('name') == 'description') {
$body = $meta->getAttribute('content');
}
}
// Parse DOM to get Images
$image_src = array();
$images = $dom->getElementsByTagName('img');
for ($i = 0; $i < $images->length; $i ++) {
$image = $images->item($i);
$src = $image->getAttribute('src');
if(filter_var($src, FILTER_VALIDATE_URL)) {
$image_src[] = $src;
}
}
$output = array(
'title' => $title,
'image_src' => $image_src,
'body' => $body
);
echo json_encode($output);
For a few URLs I get details like the title, description, and images, but for most websites the code does not extract any details. Do I need to use a client-side language like jQuery?
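One common reason only a few sites respond is that many servers refuse requests that carry no browser-like User-Agent header, or hang without timeouts. A hedged sketch of the same cURL setup with those options added (the User-Agent string is just an example):
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $_POST["url"]);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MetaFetcher/1.0)'); // example UA string
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10); // give up on unreachable hosts
curl_setopt($ch, CURLOPT_TIMEOUT, 20);        // cap the total request time
$data = curl_exec($ch);
if ($data === false) {
    // Surface the cURL error instead of silently parsing an empty string.
    echo json_encode(array('error' => curl_error($ch)));
    exit;
}
curl_close($ch);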

Getting links from page using PHP DOM

I am trying to build something like a "Serial Downloader" to import all episodes of all seasons from this website into my openload account, using this code:
<?php
error_reporting(0);
$serial = file_get_contents($_GET['serial']);
$doc = new DOMDocument();
$doc -> loadHTML($serial);
$xpath = new DOMXPath($doc);
$seasons = $xpath->query("//*[@class='vypisserial']")->item(0);
$serial_divs = $seasons->getElementsByTagName('div');
$x = 0;
foreach($serial_divs as $season){
$x++;
echo "Season ".$x."<br />";
$season_inner = $season->getElementsByTagName('div')->item(0);
if($season_inner->getAttribute('id')!==""){
echo "--- START OF SEASON ID '".$season_inner->getAttribute('id')."' ---<br />";
$season_div = $doc -> getElementByID($season_inner->getAttribute('id'));
$episode_links = $season_div->getElementsByTagName('a');
foreach ($episode_links as $episode_link_a) {
$episode_link = $episode_link_a -> getAttribute("href");
$c = file_get_contents("https://freeserial.sk".$episode_link);
$doc = new DOMDocument();
$doc -> loadHTML($c);
$frames = $doc -> getElementsByTagName('iframe');
$link = "https://freeserial.sk".($frames[0] -> getAttribute("src"));
$video = file_get_contents($link);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $link);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_exec($ch);
$url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
echo "episode_link - ".$url."<br />";
$c = file_get_contents("https://api.openload.co/1/remotedl/add?login=3&key=C&url=".$url);
}
echo "--- END OF SEASON ID '".$season_inner->getAttribute('id')."' ---<br />";
} else {
echo "Nothing";
}
}
When I go to file.php?serial=https://www.freeserial.sk/serial/skam, only one season is downloaded instead of four. I have no idea what's wrong. I would appreciate any help. Thanks.
The main problem was how you were trying to read the document hierarchy. I've changed it to use the <div class="itemSeriaVypis"> element as the base for each season and then read the data relative to it.
$serial = file_get_contents($_GET['serial']);
$doc = new DOMDocument();
file_put_contents("season.html", $serial);
$doc -> loadHTML($serial);
$xpath = new DOMXPath($doc);
$serial_divs = $xpath->query("//*[@class='itemSeriaVypis']");
$x = 0;
foreach($serial_divs as $season){
$x++;
echo "Season ".$x."<br />";
echo "--- START OF SEASON ID '".$season->getAttribute('id')."' ---<br />";
$episode_links = $season->getElementsByTagName('a');
foreach ($episode_links as $episode_link_a) {
$episode_link = $episode_link_a -> getAttribute("href");
$c = file_get_contents("https://freeserial.sk".$episode_link);
$doc = new DOMDocument();
$doc -> loadHTML($c);
$frames = $doc -> getElementsByTagName('iframe');
$link = "https://freeserial.sk".($frames[0] -> getAttribute("src"));
$video = file_get_contents($link);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $link);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_exec($ch);
$url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
echo "episode_link - ".$url."<br />";
$c = file_get_contents("https://api.openload.co/1/remotedl/add?login=c5b4f1671c8e8323&key=CQkTSjzz&url=".$url);
}
echo "--- END OF SEASON ID '".$season->getAttribute('id')."' ---<br />";
}
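As a design note, if the per-season grouping is not needed, the episode links could also be collected with a single XPath query instead of nested getElementsByTagName() calls. A hedged sketch using the same class name as above:
$xpath = new DOMXPath($doc);
$episode_hrefs = $xpath->query("//div[@class='itemSeriaVypis']//a/@href");
foreach ($episode_hrefs as $href) {
    echo $href->nodeValue . "<br />"; // relative episode URL, to be prefixed with https://freeserial.sk
}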

Asynchronous php loop

I'm building a site where I have a JSON array of URLs; I use these URLs to parse the metadata to display on the site. I want to show a thumbnail image, a title, and a description. The issue I've found is that it takes almost 10 seconds to get the data ready to be displayed on the site.
My question is this: how should I make it asynchronous so that the loading time is drastically reduced? Does anyone have suggestions?
<div class="container" id="content">
<?php
$jsonData = file_get_contents("data.json");
$json = json_decode($jsonData, true);
function file_get_contents_curl($url){
$ch = curl_init();
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
foreach ($json['posts'] as $post){
$html = file_get_contents_curl($post['url']);
//parsing begins here:
$doc = new DOMDocument();
@$doc->loadHTML($html);
$nodes = $doc->getElementsByTagName('title');
//get and display what you need:
$title = $nodes->item(0)->nodeValue;
$metas = $doc->getElementsByTagName('meta');
for ($i = 0; $i < $metas->length; $i++)
{
$meta = $metas->item($i);
if($meta->getAttribute('property')=='og:image'){
$image = $meta->getAttribute('content');}
if($meta->getAttribute('name') == 'description')
$description = $meta->getAttribute('content');
if($meta->getAttribute('name') == 'keywords')
$keywords = $meta->getAttribute('content');
}
$output = "<a href='$post[url]'><br>";
$output .= "<div class='thumbnail'><br>";
$output .= "<img src='$image'><br>";
$output .= "<p class='article-headline'>$title</p><br>";
$output .= "<p class='article-excerpt'>$description</p><br>";
$output .= "</div></a>";
echo $output;
}
?>
</div>
My data.json file looks like this, but with about 16 URLs. The idea is that it should be able to handle an array with a lot more URLs:
{
"posts": [
{
"url":"https://medium.com/s/story/the-absurdity-of-student-loan-debt-fb61fdca7d8c"
},
{
"url":"https://medium.com/s/in-defense-of-the-worst-human-emotions/jealousy-is-a-brat-eed054493965"
},
{
"url":"https://medium.com/swlh/chatbots-were-the-next-big-thing-what-happened-5fc49dd6fa61"
},
{
"url":"https://medium.com/s/trustissues/my-so-called-millennial-entitlement-9be84343c713"
}
]
}
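One way to cut the loading time is to fetch all of the URLs in parallel with PHP's curl_multi functions instead of one after another. A minimal hedged sketch, assuming the same data.json structure as above; the DOMDocument parsing would then run per response exactly as in the original loop:
<?php
$json = json_decode(file_get_contents("data.json"), true);

$mh = curl_multi_init();
$handles = array();
foreach ($json['posts'] as $i => $post) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $post['url']);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);
    curl_multi_add_handle($mh, $ch);
    $handles[$i] = $ch;
}

// Drive all transfers at once instead of waiting for each one in turn.
do {
    $status = curl_multi_exec($mh, $running);
    if ($running) {
        curl_multi_select($mh); // wait until any handle has activity
    }
} while ($running && $status == CURLM_OK);

foreach ($handles as $i => $ch) {
    $html = curl_multi_getcontent($ch); // response body for this URL
    // ... parse $html with DOMDocument as in the original loop ...
    curl_multi_remove_handle($mh, $ch);
    curl_close($ch);
}
curl_multi_close($mh);
?>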

PHP Call to a member function find()

Code on pastebin
Link on test page
Code:
require_once('simple_html_dom.php');
$file = 'http://testwork.ru/Tempp/domains.php'; // page with table
$SymbolsCountMin = 0;
$SymbolsCountMax = 10;
$SymbolsBackList = array('-','_','.','0','1','2','3','4','5','6','7','8','9');
$ArrTr = array();
$ArrTd = array();
$ch = curl_init($file);
curl_setopt($ch, CURLOPT_URL, $file );
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true );
curl_setopt($ch, CURLOPT_VERBOSE, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, array(
'Content-Type: application/x-www-form-urlencoded'
));
$html = curl_exec($ch);
//$responseInfo = curl_getinfo($ch);
curl_close ($ch);
//var_dump($html);
//$html = file_get_html('http://testwork.ru/Tempp/domains.php');
// Find all tr
$row = 0;
foreach($html->find('tr') as $tr){
if($row!=0){
$row++;
$column = 0;
foreach($tr->find('td') as $td){
$column++;
$text = $td->plaintext;
$ArrTd[$column] = $text;
}
}
if(iconv_strlen($ArrTd[0]) > $SymbolsCountMin && iconv_strlen($ArrTd[0]) < $SymbolsCountMax && !in_array($ArrTd[0], $SymbolsBackList)){
$ArrTr[$row] = $ArrTd;
}
}
$c = '';
foreach($ArrTr as $arr_tr =>$ftr){
$c .='<tr>';
foreach($ftr as $arr_td =>$ftd){
$c .='<td>';
$c .= $ftd;
$c .='</td>';
}
$c .='</tr>';
}
$row_header = '
<table style="text-align:center;">
';
$row_header .= $c;
$row_header .= '
</table>';
echo $row_header;
I get the error Fatal error: Call to a member function find() on a non-object in /var/www/seo-main/data/www/testwork.ru/Tempp/parse_domains.php on line 34
Please tell me why I get this error and how to fix it.
find() is a member function of simple_html_dom (http://simplehtmldom.sourceforge.net/).
You have commented out the line
$html = file_get_html('http://testwork.ru/Tempp/domains.php');
cURL returns the raw HTML content, not a parsed object.
If you intend to use cURL, then use:
$html_data = curl_exec($ch);
curl_close ($ch);
$html = str_get_html($html_data); // feed the HTML returned by cURL into the parser
Then do the rest of the parsing part.
You need to edit the code like this:
// Find all tr
$row = 0;
if($html) { // only parse if $html was successfully created
foreach($html->find('tr') as $tr){
if($row!=0){
$row++;
$column = 0;
foreach($tr->find('td') as $td){
$column++;
$text = $td->plaintext;
$ArrTd[$column] = $text;
}
}
if(iconv_strlen($ArrTd[0]) > $SymbolsCountMin && iconv_strlen($ArrTd[0]) < $SymbolsCountMax && !in_array($ArrTd[0], $SymbolsBackList)){
$ArrTr[$row] = $ArrTd;
}
}
}
or use a try and catch statement
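Putting the suggestion together, here is a minimal hedged sketch of the whole flow, fetching with cURL and then handing the HTML string to simple_html_dom:
require_once('simple_html_dom.php');

$ch = curl_init('http://testwork.ru/Tempp/domains.php');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$html_data = curl_exec($ch);
curl_close($ch);

$html = str_get_html($html_data); // parse the raw HTML into a simple_html_dom object
if ($html) {
    foreach ($html->find('tr') as $tr) {
        // ... process the table rows as in the original code ...
    }
}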

Crawl a website, get the links, crawl the links with PHP and XPATH

I want to crawl an entire website. I have read several threads, but I cannot manage to get data at the second level.
That is, I can return the links from a starting page, but then I cannot find a way to parse those links and get the content of each one...
The code I use is:
<?php
// SELECT STARTING PAGE
$url = 'http://mydomain.com/';
$html= file_get_contents($url);
// GET ALL THE LINKS OF EACH PAGE
// create a dom object
$dom = new DOMDocument();
#$dom->loadHTML($html);
// run xpath for the dom
$xPath = new DOMXPath($dom);
// get links from starting page
$elements = $xPath->query("//a/@href");
foreach ($elements as $e) {
echo $e->nodeValue. "<br />";
}
// Parse each page using the extracted links?
?>
Could somebody help me out with the last part, with an example?
It would be much appreciated!
Well, thanks for your answers!
I tried some things but I haven't managed to get any results yet - I am new to programming.
Below you can find two of my attempts - the first trying to parse the links, and the second trying to replace file_get_contents with cURL:
1)
<?php
// GET STARTING PAGE
$url = 'http://www.capoeira.com.gr/';
$html= file_get_contents($url);
//GET ALL THE LINKS FROM STARTING PAGE
// create a dom object
$dom = new DOMDocument();
@$dom->loadHTML($html);
// run xpath for the dom
$xPath = new DOMXPath($dom);
// get specific elements from the sites
$elements = $xPath->query("//a/@href");
//PARSE EACH LINK
foreach($elements as $e) {
$URLS= file_get_contents($e);
$dom = new DOMDocument();
@$dom->loadHTML($html);
$xPath = new DOMXPath($dom);
$output = $xPath->query("//div[@class='content-entry clearfix']");
echo $output ->nodeValue;
}
?>
For the above code I get
Warning: file_get_contents() expects parameter 1 to be string, object given in ../example.php on line 26
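The warning happens because $e is a DOMAttr node object, not a string; the href text is in $e->nodeValue. A hedged correction of that inner loop (relative hrefs would still need to be made absolute before fetching, and the second DOMDocument must load the fetched page rather than the starting page):
foreach ($elements as $e) {
    $link = $e->nodeValue;                  // the href string itself
    $page = file_get_contents($link);       // fetch the linked page
    $pageDom = new DOMDocument();
    @$pageDom->loadHTML($page);             // suppress warnings on malformed markup
    $pageXPath = new DOMXPath($pageDom);
    $divs = $pageXPath->query("//div[@class='content-entry clearfix']");
    if ($divs->length > 0) {
        echo $divs->item(0)->nodeValue;     // query() returns a node list, not a single node
    }
}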
2)
<?php
$curl = curl_init();
curl_setopt($curl, CURLOPT_POST, 1);
curl_setopt($curl, CURLOPT_URL, "http://capoeira.com.gr");
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
$content= curl_exec($curl);
curl_close($curl);
$dom = new DOMDocument();
@$dom->loadHTML($content);
$xPath = new DOMXPath($dom);
$elements = $xPath->query("//a/@href");
foreach ($elements as $e) {
echo $e->nodeValue. "<br />";
}
?>
I get no results. When I echo $content, I get:
You don't have permission to access / on this server.
Additionally, a 413 Request Entity Too Large error was encountered while trying to use an ErrorDocument to handle the request...
Any ideas, please? :)
You can try the following. See this thread for more details
<?php
//set_time_limit (0);
function crawl_page($url, $depth = 5){
static $seen = array(); // persist visited URLs across recursive calls
if(($depth == 0) or (in_array($url, $seen))){
return;
}
$seen[] = $url; // mark this URL as crawled
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_TIMEOUT, 30);
curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
$result = curl_exec ($ch);
curl_close ($ch);
if( $result ){
$stripped_file = strip_tags($result, "<a>");
preg_match_all("/<a[\s]+[^>]*?href[\s]?=[\s\"\']+"."(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $stripped_file, $matches, PREG_SET_ORDER );
foreach($matches as $match){
$href = $match[1];
if (0 !== strpos($href, 'http')) {
$path = '/' . ltrim($href, '/');
if (extension_loaded('http')) {
$href = http_build_url($href , array('path' => $path));
} else {
$parts = parse_url($href);
$href = $parts['scheme'] . '://';
if (isset($parts['user']) && isset($parts['pass'])) {
$href .= $parts['user'] . ':' . $parts['pass'] . '@';
}
$href .= $parts['host'];
if (isset($parts['port'])) {
$href .= ':' . $parts['port'];
}
$href .= $path;
}
}
crawl_page($href, $depth - 1);
}
}
echo "Crawled {$href}";
}
crawl_page("http://www.sitename.com/",3);
?>
$doc = new DOMDocument;
$doc->load('file.htm');
$items = $doc->getElementsByTagName('a');
foreach($items as $value) {
echo $value->nodeValue . "\n";
$attrs = $value->attributes;
echo $attrs->getNamedItem('href')->nodeValue . "\n";
};
Find links from a website recursively with depth:
<?php
$depth = 1;
print_r(getList($depth));
function getList($depth)
{
$lists = getDepth($depth);
return $lists;
}
function getUrl($request_url)
{
$countValid = 0;
$brokenCount =0;
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $request_url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // We want to get the response
$result = curl_exec($ch);
$regex = '|<a.*?href="(.*?)"|';
preg_match_all($regex, $result, $parts);
$links = $parts[1];
$lists = array();
foreach ($links as $link)
{
$url = htmlentities($link);
$result =getFlag($url);
if($result == true)
{
$UrlLists["clean"][$countValid] =$url;
$countValid++;
}
else
{
$UrlLists["broken"][$brokenCount]= "broken->".$url;
$brokenCount++;
}
}
curl_close($ch);
return $UrlLists;
}
function ZeroDepth($list)
{
$request_url = $list;
$listss["0"]["0"] = getUrl($request_url);
$lists["0"]["0"]["clean"] = array_unique($listss["0"]["0"]["clean"]);
$lists["0"]["0"]["broken"] = array_unique($listss["0"]["0"]["broken"]);
return $lists;
}
function getDepth($depth)
{
// $list =OW_URL_HOME;
$list = "https://example.com";//enter the url of website
$lists =ZeroDepth($list);
for($i=1;$i<=$depth;$i++)
{
$l= $i;
$l= $l-1;
$depthArray=1;
foreach($lists[$l][$l]["clean"] as $depthUrl)
{
$request_url = $depthUrl;
$lists[$i][$depthArray]["requst_url"]=$request_url;
$lists[$i][$depthArray] = getUrl($request_url);
}
}
return $lists;
}
function getFlag($url)
{
$url_response = array();
$curl = curl_init();
$curl_options = array();
$curl_options[CURLOPT_RETURNTRANSFER] = true;
$curl_options[CURLOPT_URL] = $url;
$curl_options[CURLOPT_NOBODY] = true;
$curl_options[CURLOPT_TIMEOUT] = 60;
curl_setopt_array($curl, $curl_options);
curl_exec($curl);
$status = curl_getinfo($curl, CURLINFO_HTTP_CODE);
if ($status == 200)
{
return true;
}
else
{
return false;
}
curl_close($curl);
}
?>
Please check the code below, hope it helps you.
<?php
$html = new DOMDocument();
@$html->loadHtmlFile('http://www.yourdomain.com');
$xpath = new DOMXPath( $html );
$nodelist = $xpath->query( "//div[@class='A-CLASS-Name']/h3/a/@href" );
foreach ($nodelist as $n){
echo $n->nodeValue."\n<br>";
}
?>
Thanks,
Roger
<?php
$path='http://www.hscripts.com/';
$html = file_get_contents($path);
$dom = new DOMDocument();
@$dom->loadHTML($html);
// grab all the <a> links on the page
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a");
for ($i = 0; $i < $hrefs->length; $i++ ) {
$href = $hrefs->item($i);
$url = $href->getAttribute('href');
echo $url.'<br />';
}
?>
You can use the above code to get all possible links from a page.
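Note that the hrefs extracted this way are often relative, so they must be made absolute before they can be fetched. A small hedged helper (not from the original answers) covering only absolute and root-relative links:
function make_absolute($base, $href)
{
    if (preg_match('#^https?://#i', $href)) {
        return $href;                          // already absolute
    }
    $parts = parse_url($base);
    $root  = $parts['scheme'] . '://' . $parts['host'];
    return $root . '/' . ltrim($href, '/');    // treat everything else as root-relative
}

echo make_absolute('http://www.hscripts.com/', 'tutorials/index.php');
// prints: http://www.hscripts.com/tutorials/index.php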
