I am new to programming. I need to extract Wikipedia content and put it into HTML.
// cURL request; the JSON response is decoded later with json_decode()
function curl($url){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB5');
    $result = curl_exec($ch);
    curl_close($ch);
    return $result;
}
$search = $_GET["search"];
if (empty($search)) {
    // term param not passed in url
    exit;
} else {
    // create url to use in curl call
    $term = str_replace(" ", "_", $search);
    $url = "https://en.wikipedia.org/w/api.php?action=opensearch&search=".$search."&limit=1&namespace=0&format=jsonfm";
    $json = curl($url);
    $data = json_decode($json, true);
    $data = $data['parse']['wikitext']['*'];
}
So I basically want to reprint a wiki page but with my own styles, and I do not know how to do it. Any ideas? Thanks.
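A minimal sketch of one approach, reusing the curl() helper above. It assumes what you actually want is the rendered HTML of the article (action=parse with prop=text) rather than the opensearch result, and it uses format=json, since jsonfm is only a pretty-printed debug view that json_decode() cannot parse:

// Sketch: fetch the rendered HTML of an article and wrap it in your own markup.
// Assumes the curl() helper defined above; with format=json, action=parse&prop=text
// returns the article body as HTML under parse.text["*"].
$term = str_replace(" ", "_", $search);
$url  = "https://en.wikipedia.org/w/api.php?action=parse&page=" . urlencode($term)
      . "&prop=text&format=json";
$json = curl($url);
$data = json_decode($json, true);
$html = isset($data['parse']['text']['*']) ? $data['parse']['text']['*'] : '';
echo '<div class="my-wiki-style">' . $html . '</div>';

You can then target .my-wiki-style (a made-up class name here) in your own stylesheet to restyle the content.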
I have this code to try to get the pagination links using PHP, but the result is not quite right. Could anyone help me?
What I get back is just a recurring instance of the first link.
<?php
include_once('simple_html_dom.php');

function dlPage($href) {
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($curl, CURLOPT_HEADER, false);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_URL, $href);
    curl_setopt($curl, CURLOPT_REFERER, $href);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.125 Safari/533.4");
    $str = curl_exec($curl);
    curl_close($curl);

    // Create a DOM object
    $dom = new simple_html_dom();
    // Load HTML from a string
    $dom->load($str);

    $Next_Link = array();
    foreach ($dom->find('a[title=Next]') as $element) {
        $Next_Link[] = $element->href;
    }
    print_r($Next_Link);

    $next_page_url = $Next_Link[0];
    if ($next_page_url != '') {
        echo '<br>' . $next_page_url;
        $dom->clear();
        unset($dom);
        // load the next page from the pagination to collect the next link
        dlPage($next_page_url);
    }
}

$url = 'https://www.jumia.com.gh/phones/';
$data = dlPage($url);
//print_r($data)
?>
What I want to get is:
mySiteUrl/?facet_is_mpg_child=0&viewType=gridView&page=2
mySiteUrl/?facet_is_mpg_child=0&viewType=gridView&page=3
...
down to the last link in the pagination. Please help.
Here it is. Note that I run the link through htmlspecialchars_decode(), because the href attribute contains &amp; entities (as in XML) that cURL should not be given literally. As I understood it, the return value of dlPage() should be the last link in the pagination.
<?php
include_once('simple_html_dom.php');

function dlPage($href, $already_loaded = array()) {
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($curl, CURLOPT_HEADER, false);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_URL, $href);
    curl_setopt($curl, CURLOPT_REFERER, $href);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.125 Safari/533.4");
    $htmlPage = curl_exec($curl);
    curl_close($curl);

    echo "Loading From URL:" . $href . "<br/>\n";
    $already_loaded[$href] = true;

    // Create a DOM object and load the HTML we already fetched with cURL
    $dom = new simple_html_dom();
    $dom->load($htmlPage);

    $next_page_url = null;
    $items = $dom->find('ul[class="osh-pagination"] li[class="item"] a[title="Next"]');
    foreach ($items as $item) {
        // decode &amp; back to & so the link can be requested again
        $link = htmlspecialchars_decode($item->href);
        if (!isset($already_loaded[$link])) {
            $next_page_url = $link;
            break;
        }
    }

    if ($next_page_url !== null) {
        $dom->clear();
        unset($dom);
        // load the next page from the pagination to collect the next link
        return dlPage($next_page_url, $already_loaded);
    }

    return $href;
}

$url = 'https://www.jumia.com.gh/phones/';
$data = dlPage($url);
echo "DATA:" . $data . "\n";
And the output is:
Loading From URL:https://www.jumia.com.gh/phones/<br/>
Loading From URL:https://www.jumia.com.gh/phones/?facet_is_mpg_child=0&viewType=gridView&page=2<br/>
Loading From URL:https://www.jumia.com.gh/phones/?facet_is_mpg_child=0&viewType=gridView&page=3<br/>
Loading From URL:https://www.jumia.com.gh/phones/?facet_is_mpg_child=0&viewType=gridView&page=4<br/>
Loading From URL:https://www.jumia.com.gh/phones/?facet_is_mpg_child=0&viewType=gridView&page=5<br/>
DATA:https://www.jumia.com.gh/phones/?facet_is_mpg_child=0&viewType=gridView&page=5
Hi, I'm trying to make this API call to Wikipedia using the URL below, but the variable is null when I dump it. The function works for my other JSON API calls, just not for this one, and when I test the URL manually in the browser it gives me a result. Here is my attempt:
$url = 'http://en.wikipedia.org/w/api.php?action=query&format=json&titles=Image:Romerolagus diazi (dispale) 001.jpg&prop=imageinfo&iiprop=url';
$result = apicall($url);
var_dump($result);

function apicall($url){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($ch, CURLOPT_USERAGENT, 'MyBot/1.0 (http://www.mysite.com/)');
    $result = curl_exec($ch);
    if (!$result) {
        exit('cURL Error: '.curl_error($ch));
    }
    $var = json_decode($result);
    return $var;
}
It's a urlencode() problem; modify it like this:
<?php
$url = 'http://en.wikipedia.org/w/api.php';
$titles = urlencode('Image:Romerolagus diazi (dispale) 001.jpg');
$queryStr = 'action=query&format=json&titles='.$titles.'&prop=imageinfo&iiprop=url';
$url = $url . '?' . $queryStr;
$result = apicall($url);
var_dump($result);

function apicall($url){
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22');
    $result = curl_exec($ch);
    if (!$result) {
        exit('cURL Error: '.curl_error($ch));
    }
    var_dump($result);
    $var = json_decode($result);
    return $var;
}
You should consider using http_build_query() to build the URL:
$url = 'http://en.wikipedia.org/w/api.php?' . http_build_query(array(
    'action' => 'query',
    'format' => 'json',
    'titles' => 'Image:Romerolagus diazi (dispale) 001.jpg',
    'prop'   => 'imageinfo',
    'iiprop' => 'url',
));
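To actually pull the image URL out of the decoded response, a sketch along these lines should work. It assumes the standard prop=imageinfo response shape, where the result lives under query->pages-><pageid>->imageinfo[0]->url and the page ID key is not known in advance:

// Sketch (assumes the usual imageinfo JSON shape and the apicall() helper above)
$result = apicall($url);
if ($result !== null && isset($result->query->pages)) {
    foreach ($result->query->pages as $page) {        // page ID keys vary, so iterate
        if (isset($page->imageinfo[0]->url)) {
            echo $page->imageinfo[0]->url . "\n";      // direct URL to the image file
        }
    }
}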
$url = "https://graph.facebook.com/fql?q=SELECT%20uid2%20FROM%20friend%20WHERE%20uid1=me()&access_token=sadfasdf";
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB5');
$string = curl_exec($ch); // grab URL and pass it to the browser
echo $string;
curl_close($ch);
Right now, echo $string outputs all of the grabbed data as plain text.
{ "data": [
{ "uid2": "91298391" },
{ "uid2": "00291509" },
{ "uid2": "101927261" }
]}
How can I instead grab specific pieces of the data, treating it as JSON? In this case, I want to be able to grab every id. I have a feeling I will use preg_match, but I'm stumped.
PHP has a built-in function for turning a JSON string into an array: json_decode().
The second parameter makes it give you back an assoc array instead of an object. Then just grab the values:
$data = json_decode($string, true);
$ids  = array_column($data['data'], 'uid2'); // the ids live in the "data" array; array_column() needs PHP 5.5+
I'm using a Wikipedia API link to get some basic information about well-known people.
Example (about Dave Longaberger): the query returns an XML response containing the page's intro extract.
Now my question: I'd like to parse that XML to get the basic information between <extract></extract> and display it.
Here is my first idea, but it failed (I/O warning: failed to load external entity):
<?php
$url = 'http://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles=Dave Longaberger&format=xml&exintro=1';
$xml = simplexml_load_file($url);
// get extract
$text = $xml->pages[0]->extract;
// show extract
echo $text;
?>
Another idea that also failed (failed to open stream: HTTP request failed!):
<?php
function get_url_contents($url){
    $crl = curl_init();
    $timeout = 5;
    curl_setopt($crl, CURLOPT_URL, $url);
    curl_setopt($crl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($crl, CURLOPT_CONNECTTIMEOUT, $timeout);
    $ret = curl_exec($crl);
    curl_close($crl);
    return $ret;
}

$url = "http://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles=Dave Longaberger&format=xml&exintro=1";
$text = file_get_contents($url);
echo $text;
?>
So, any idea how to do it? Thanks.
Update: after adding urlencode() or rawurlencode(), it is still not working.
$name = "Dave Longaberger";
$name = urlencode($name);
$url = 'http://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles='.$name.'&format=xml&exintro=1';
$text = file_get_contents($url);
This is also not working:
$url = 'http://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles=Dave Longaberger&format=xml&exintro=1';
$url = urlencode($url);
$text = file_get_contents($url);
Nor is this:
$url = 'http://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles='.rawurlencode('Dave Longaberger').'&format=xml&exintro=1';
$text = file_get_contents($url);
Well, I really don't know; it looks like it's somehow impossible.
Set the User-Agent header in your cURL request; Wikipedia replies with 403 Forbidden otherwise.
<?php
$url = "http://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles=Dave+Longaberger&format=xml&exintro=1";
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, 0);
// return the response as a string so that $xml actually holds the XML
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1");
$xml = curl_exec($ch);
curl_close($ch);
echo $xml;
?>
Alternatively:
ini_set("user_agent","Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1");
$url = "http://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles=Dave+Longaberger&format=xml&exintro=1";
$xml = simplexml_load_file($url);
$extracts = $xml->xpath("/api/query/pages/page/extract");
var_dump($extracts);
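If you only want the intro text itself rather than the var_dump() output, a small sketch using the $extracts array from above (assuming the XPath query matched at least one node):

// $extracts is an array of SimpleXMLElement objects; cast the first one to string
if (!empty($extracts)) {
    echo (string) $extracts[0];   // the intro extract (contains HTML markup)
}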
Look at the note on this PHP manual page: http://php.net/manual/en/function.file-get-contents.php. If you're opening a URI with special characters, such as spaces, you need to encode the URI with urlencode().
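Putting both fixes together (an encoded title and a User-Agent header), a sketch using file_get_contents() with a stream context might look like this; the bot name and contact address below are made up and should be replaced with your own:

// Sketch: encoded title + User-Agent sent via a stream context (no ini_set()/cURL needed)
$title = rawurlencode('Dave Longaberger');
$url = 'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles=' . $title . '&format=xml&exintro=1';
$context = stream_context_create(array(
    'http' => array('user_agent' => 'MyExtractBot/1.0 (contact@example.com)'), // hypothetical UA string
));
$response = file_get_contents($url, false, $context);
$xml = simplexml_load_string($response);
$extracts = $xml->xpath('/api/query/pages/page/extract');
if (!empty($extracts)) {
    echo (string) $extracts[0];
}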
A similar question has been posted (Curl error "Could not resolve host: saved_report.xml; No data record of requested type"), but I could not find the solution there.
<?php
$url = "http://en.wikipedia.org/wiki/Pakistan";
$ch = curl_init(urlencode($url));
echo $ch;

// used to spoof that coming from a real browser so we don't get blocked by some sites
$useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1";
curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 4);
curl_setopt($ch, CURLOPT_TIMEOUT, 8);
curl_setopt($ch, CURLOPT_LOW_SPEED_TIME, 10);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

$content = curl_exec($ch);
$info = curl_getinfo($ch);

if ($content === false || $info['http_code'] != 200) {
    $content = "No cURL data returned for $url [" . $info['http_code'] . "]";
    if (curl_error($ch))
        $content .= "\n" . curl_error($ch);
}
else {
    // 'OK' status; format $output data if necessary here:
    echo "...";
}
echo $content;
curl_close($ch);
?>
When I paste the same address into a browser I am able to access the web page, but when I run this script I get the error message. Can anyone please help me? Thanks.
Remove the urlencode call.
Remove the urlencode($url); it should be:
$ch = curl_init($url);
Well, if you remove urlencode() when instantiating your $ch variable, it works just fine. urlencode() is definitely wrong here.
Good:
$ch = curl_init($url);
Bad:
$ch = curl_init(urlencode($url));
$ch = curl_init($url);
instead of
$ch = curl_init(urlencode($url));