This is a custom encryption library. I do not know much about PHP's standard library of functions and was wondering if the following code can be improved in any way. The implementation should yield the same results, the API should remain as it is, but ways to make is more PHP-ish would be greatly appreciated.
Code
<?php
/***************************************
Create random major and minor SPICE key.
***************************************/
function crypt_major()
{
$all = range("\x00", "\xFF");
shuffle($all);
$major_key = implode("", $all);
return $major_key;
}
function crypt_minor()
{
$sample = array();
do
{
array_push($sample, 0, 1, 2, 3);
} while (count($sample) != 256);
shuffle($sample);
$list = array();
for ($index = 0; $index < 64; $index++)
{
$b12 = $sample[$index * 4] << 6;
$b34 = $sample[$index * 4 + 1] << 4;
$b56 = $sample[$index * 4 + 2] << 2;
$b78 = $sample[$index * 4 + 3];
array_push($list, $b12 + $b34 + $b56 + $b78);
}
$minor_key = implode("", array_map("chr", $list));
return $minor_key;
}
/***************************************
Create the SPICE key via the given name.
***************************************/
function named_major($name)
{
srand(crc32($name));
return crypt_major();
}
function named_minor($name)
{
srand(crc32($name));
return crypt_minor();
}
/***************************************
Check validity for major and minor keys.
***************************************/
function _check_major($key)
{
if (is_string($key) && strlen($key) == 256)
{
foreach (range("\x00", "\xFF") as $char)
{
if (substr_count($key, $char) == 0)
{
return FALSE;
}
}
return TRUE;
}
return FALSE;
}
function _check_minor($key)
{
if (is_string($key) && strlen($key) == 64)
{
$indexs = array();
foreach (array_map("ord", str_split($key)) as $byte)
{
foreach (range(6, 0, 2) as $shift)
{
array_push($indexs, ($byte >> $shift) & 3);
}
}
$dict = array_count_values($indexs);
foreach (range(0, 3) as $index)
{
if ($dict[$index] != 64)
{
return FALSE;
}
}
return TRUE;
}
return FALSE;
}
/***************************************
Create encode maps for encode functions.
***************************************/
function _encode_map_1($major)
{
return array_map("ord", str_split($major));
}
function _encode_map_2($minor)
{
$map_2 = array(array(), array(), array(), array());
$list = array();
foreach (array_map("ord", str_split($minor)) as $byte)
{
foreach (range(6, 0, 2) as $shift)
{
array_push($list, ($byte >> $shift) & 3);
}
}
for ($byte = 0; $byte < 256; $byte++)
{
array_push($map_2[$list[$byte]], chr($byte));
}
return $map_2;
}
/***************************************
Create decode maps for decode functions.
***************************************/
function _decode_map_1($minor)
{
$map_1 = array();
foreach (array_map("ord", str_split($minor)) as $byte)
{
foreach (range(6, 0, 2) as $shift)
{
array_push($map_1, ($byte >> $shift) & 3);
}
}
return $map_1;
}function _decode_map_2($major)
{
$map_2 = array();
$temp = array_map("ord", str_split($major));
for ($byte = 0; $byte < 256; $byte++)
{
$map_2[$temp[$byte]] = chr($byte);
}
return $map_2;
}
/***************************************
Encrypt or decrypt the string with maps.
***************************************/
function _encode($string, $map_1, $map_2)
{
$cache = "";
foreach (str_split($string) as $char)
{
$byte = $map_1[ord($char)];
foreach (range(6, 0, 2) as $shift)
{
$cache .= $map_2[($byte >> $shift) & 3][mt_rand(0, 63)];
}
}
return $cache;
}
function _decode($string, $map_1, $map_2)
{
$cache = "";
$temp = str_split($string);
for ($iter = 0; $iter < strlen($string) / 4; $iter++)
{
$b12 = $map_1[ord($temp[$iter * 4])] << 6;
$b34 = $map_1[ord($temp[$iter * 4 + 1])] << 4;
$b56 = $map_1[ord($temp[$iter * 4 + 2])] << 2;
$b78 = $map_1[ord($temp[$iter * 4 + 3])];
$cache .= $map_2[$b12 + $b34 + $b56 + $b78];
}
return $cache;
}
/***************************************
This is the public interface for coding.
***************************************/
function encode_string($string, $major, $minor)
{
if (is_string($string))
{
if (_check_major($major) && _check_minor($minor))
{
$map_1 = _encode_map_1($major);
$map_2 = _encode_map_2($minor);
return _encode($string, $map_1, $map_2);
}
}
return FALSE;
}
function decode_string($string, $major, $minor)
{
if (is_string($string) && strlen($string) % 4 == 0)
{
if (_check_major($major) && _check_minor($minor))
{
$map_1 = _decode_map_1($minor);
$map_2 = _decode_map_2($major);
return _decode($string, $map_1, $map_2);
}
}
return FALSE;
}
?>
This is a sample showing how the code is being used. Hex editors may be of help with the input / output.
Example
<?php
# get and process all of the form data
# $input = htmlspecialchars($_POST["input"]);
# $majorname = htmlspecialchars($_POST["majorname"]);
# $minorname = htmlspecialchars($_POST["minorname"]);
# $majorkey = htmlspecialchars($_POST["majorkey"]);
# $minorkey = htmlspecialchars($_POST["minorkey"]);
# $output = htmlspecialchars($_POST["output"]);
# process the submissions by operation
# CREATE
# $operation = $_POST["operation"];
if ($operation == "Create")
{
if (strlen($_POST["majorname"]) == 0)
{
$majorkey = bin2hex(crypt_major());
}
if (strlen($_POST["minorname"]) == 0)
{
$minorkey = bin2hex(crypt_minor());
}
if (strlen($_POST["majorname"]) != 0)
{
$majorkey = bin2hex(named_major($_POST["majorname"]));
}
if (strlen($_POST["minorname"]) != 0)
{
$minorkey = bin2hex(named_minor($_POST["minorname"]));
}
}
# ENCRYPT or DECRYPT
function is_hex($char)
{
if ($char == "0"):
return TRUE;
elseif ($char == "1"):
return TRUE;
elseif ($char == "2"):
return TRUE;
elseif ($char == "3"):
return TRUE;
elseif ($char == "4"):
return TRUE;
elseif ($char == "5"):
return TRUE;
elseif ($char == "6"):
return TRUE;
elseif ($char == "7"):
return TRUE;
elseif ($char == "8"):
return TRUE;
elseif ($char == "9"):
return TRUE;
elseif ($char == "a"):
return TRUE;
elseif ($char == "b"):
return TRUE;
elseif ($char == "c"):
return TRUE;
elseif ($char == "d"):
return TRUE;
elseif ($char == "e"):
return TRUE;
elseif ($char == "f"):
return TRUE;
else:
return FALSE;
endif;
}
function hex2bin($str)
{
if (strlen($str) % 2 == 0):
$string = strtolower($str);
else:
$string = strtolower("0" . $str);
endif;
$cache = "";
$temp = str_split($str);
for ($index = 0; $index < count($temp) / 2; $index++)
{
$h1 = $temp[$index * 2];
if (is_hex($h1))
{
$h2 = $temp[$index * 2 + 1];
if (is_hex($h2))
{
$cache .= chr(hexdec($h1 . $h2));
}
else
{
return FALSE;
}
}
else
{
return FALSE;
}
}
return $cache;
}
if ($operation == "Encrypt" || $operation == "Decrypt")
{
# CHECK FOR ANY ERROR
$errors = array();
if (strlen($_POST["input"]) == 0)
{
$output = "";
}
$binmajor = hex2bin($_POST["majorkey"]);
if (strlen($_POST["majorkey"]) == 0)
{
array_push($errors, "There must be a major key.");
}
elseif ($binmajor == FALSE)
{
array_push($errors, "The major key must be in hex.");
}
elseif (_check_major($binmajor) == FALSE)
{
array_push($errors, "The major key is corrupt.");
}
$binminor = hex2bin($_POST["minorkey"]);
if (strlen($_POST["minorkey"]) == 0)
{
array_push($errors, "There must be a minor key.");
}
elseif ($binminor == FALSE)
{
array_push($errors, "The minor key must be in hex.");
}
elseif (_check_minor($binminor) == FALSE)
{
array_push($errors, "The minor key is corrupt.");
}
if ($_POST["operation"] == "Decrypt")
{
$bininput = hex2bin(str_replace("\r", "", str_replace("\n", "", $_POST["input"])));
if ($bininput == FALSE)
{
if (strlen($_POST["input"]) != 0)
{
array_push($errors, "The input data must be in hex.");
}
}
elseif (strlen($bininput) % 4 != 0)
{
array_push($errors, "The input data is corrupt.");
}
}
if (count($errors) != 0)
{
# ERRORS ARE FOUND
$output = "ERROR:";
foreach ($errors as $error)
{
$output .= "\n" . $error;
}
}
elseif (strlen($_POST["input"]) != 0)
{
# CONTINUE WORKING
if ($_POST["operation"] == "Encrypt")
{
# ENCRYPT
$output = substr(chunk_split(bin2hex(encode_string($_POST["input"], $binmajor, $binminor)), 58), 0, -2);
}
else
{
# DECRYPT
$output = htmlspecialchars(decode_string($bininput, $binmajor, $binminor));
}
}
}
# echo the form with the values filled
echo "<P><TEXTAREA class=maintextarea name=input rows=25 cols=25>" . $input . "</TEXTAREA></P>\n";
echo "<P>Major Name:</P>\n";
echo "<P><INPUT id=textbox1 name=majorname value=\"" . $majorname . "\"></P>\n";
echo "<P>Minor Name:</P>\n";
echo "<P><INPUT id=textbox1 name=minorname value=\"" . $minorname . "\"></P>\n";
echo "<DIV style=\"TEXT-ALIGN: center\"><INPUT class=submit type=submit value=Create name=operation>\n";
echo "</DIV>\n";
echo "<P>Major Key:</P>\n";
echo "<P><INPUT id=textbox1 name=majorkey value=\"" . $majorkey . "\"></P>\n";
echo "<P>Minor Key:</P>\n";
echo "<P><INPUT id=textbox1 name=minorkey value=\"" . $minorkey . "\"></P>\n";
echo "<DIV style=\"TEXT-ALIGN: center\"><INPUT class=submit type=submit value=Encrypt name=operation> \n";
echo "<INPUT class=submit type=submit value=Decrypt name=operation> </DIV>\n";
echo "<P>Result:</P>\n";
echo "<P><TEXTAREA class=maintextarea name=output rows=25 readOnly cols=25>" . $output . "</TEXTAREA></P></DIV></FORM>\n";
?>
What should be editted for better memory efficiency or faster execution?
You could replace your isHex function with:
function isHex($char) {
return strpos("0123456789ABCDEF",strtoupper($char)) > -1;
}
And your hex2bin might be better as:
function hex2bin($h)
{
if (!is_string($h)) return null;
$r='';
for ($a=0; $a<strlen($h); $a+=2) { $r.=chr(hexdec($h{$a}.$h{($a+1)})); }
return $r;
}
You seem to have a lot of if..elseif...elseif...elseif which would be a lot cleaner in a switch or seperated into different methods.
However, I'm talking more from a maintainability and readability perspective - although the easier it is to read and understand, the easier it is to optimize. If it would help you if I waded through all the code and wrote it in a cleaner way, then I'll do so...
Related
I'm trying to generate auto description from tags.
The code was working but after updating my site to Laravel 6 in stop working. I need to get it back working.
if( !empty( $request->description ) )
{
$description = Helper::checkTextDb($request->description);
}
else
{
$a_key = explode(",", strtolower($request->tags));
if(count($a_key) == 0)
$description = 'This is a great thing';
else
{
$description_get_keys = '';
foreach ($a_key as &$value)
{
if($value == end($a_key) && count($a_key) != 1)
$description_get_keys = $description_get_keys.' and '.$value.'.';
else if(count($a_key) == 1)
$description_get_keys = $value.'.';
else if (count($a_key) > 1 && $a_key[0] == $value)
$description_get_keys = $value;
else
$description_get_keys = $description_get_keys.', '.$value;
}
$description = 'This is a great thing about '.$description_get_keys;
}
}
I see a couple things that could possibly be an issue, not knowing what came before this code.
I will assume that the $request variable is an instance of Illuminate\Http\Request and that it is available in the function, right?
Try this updated code:
if($request->has('description'))
{
$description = Helper::checkTextDb($request->description);
}
else if ($request->has('tags'))
{
if (strpos($request->tags, ',') === false)
{
$description = 'This is a great thing';
}
else {
$a_key = explode(",", strtolower($request->tags));
$a_count = count($a_key);
$description_get_keys = '';
for ($i = 0; $i < $a_count; $i++)
{
if ($a_count == 1) {
$description_get_keys = "{$a_key[$i]}.";
}
else {
// first
if ($i === 0) {
$description_get_keys = $a_key[0];
}
// last
else if ($i === $a_count - 1) {
$description_get_keys .= " and {$a_key[$i]}.";
}
// middle
else {
$description_get_keys .= ", {$a_key[$i]}";
}
}
}
$description = "This is a great thing about {$description_get_keys}";
}
}
I wrote that quick so hopefully there are no errors.
So In case if the user types in 1,000.00, this is a valid number. what if the user types in 1,0,000,00.0000,0
You see theres multiple errors in here where you a value cannot be 1,0... or 000,00 because it has to be at least 3 numbers after the comma. To further extend it, after the dot in a number you cannot have a comma.
$keyword = $_POST['keyword'];
if (preg_match('/0/',$keyword) ||preg_match('/1/',$keyword) ||preg_match('/2/',$keyword)||preg_match('/3/',$keyword) ||preg_match('/4/',$keyword) || preg_match('/5/',$keyword) || preg_match('/6/',$keyword) || preg_match('/7/',$keyword) ||preg_match('/8/',$keyword) ||preg_match('/9/',$keyword) ||preg_match('/-/',$keyword) ||preg_match('/+/',$keyword) || preg_match('/,/',$keyword) ) {
$keywordtest = $keyword;
if ($keyword[0] == '+' || $keyword[0] == '-')
{
$keywordtest = substr_replace($keyword, '1', 0, 1);
echo $keywordtest;
}
if (preg_match('/,/',$keyword))
{
$x = 0;
while(true)
{
$findme = ',';
$pos = strpos($keyword, $findme);
if ($pos !== false)
{
$posArray[x] = $pos;
$x = x + 1;
if (x == 10)
break;
}
}
$numberCheck = posArray[x-1];
if (is_numeric($numberCheck))
{
}
else
{
echo "false '{$keyword}' is not numeric", PHP_EOL;
return 0;
}
$numberCheck = posArray[x+1];
if (is_numeric($numberCheck))
{
}
else
{
echo "false '{$keyword}' is not numeric", PHP_EOL;
return 0;
}
$numberCheck = posArray[x+2];
if (is_numeric($numberCheck))
{
}
else
{
echo "false '{$keyword}' is not numeric", PHP_EOL;
return 0;
}
$numberCheck = posArray[x+3];
if (is_numeric($numberCheck))
{
}
else
{
echo "false '{$keyword}' is not numeric", PHP_EOL;
return 0;
}
}
test_numeric($keywordtest,$keyword);
return 0;
}
else
{
echo "false '{$keyword}' is not numeric", PHP_EOL;
}
function test_numeric($keywordtest,$keyword)
{
if (is_numeric($keywordtest)){
echo "true '{$keyword}' is numeric", PHP_EOL;
}
return 0;
}
?>
You will want to use filter_var in combination with the FILTER_VALIDATE_FLOAT filter type and the FILTER_FLAG_ALLOW_THOUSAND flag.
Then it is really straightforward:
// $val == 100000
$val = filter_var('100,000.00', FILTER_VALIDATE_FLOAT, array('flags' => FILTER_FLAG_ALLOW_THOUSAND));
// $val == -100000
$val = filter_var('-100,000.00', FILTER_VALIDATE_FLOAT, array('flags' => FILTER_FLAG_ALLOW_THOUSAND));
// $val === false (invalid)
$val = filter_var('100,000.00,00', FILTER_VALIDATE_FLOAT, array('flags' => FILTER_FLAG_ALLOW_THOUSAND));
My company receives coupons codes from another company and I need to process them to use them in my company's website. The problem is that they send me a PDF file with the codes, and they say their system can only export them in PDF. Strange.
I've tried several times to let them know that I need those coupons in plain text separated by something (a.k.a CSV) and they doesn't take any notice of that.
My website is written in PHP, and uses MySQL. I would like to offer a way to upload that coupons file and add those to the database. It would be easy considering it is a CSV file but it's not.
Is there any way I can programatically -not manually- process PDF files as text files? or, any workaround for this situation?
I have found pdf2text to have worked well for quite some time.
Will not work with image (TIFF) PDF. will work with text searchable PDF.
include('/home/user/php/class.pdf2text.php');
$p2t = new PDF2Text();
$p2t ->setFilename($pdf);
$p2t ->decodePDF();
$data = $p2t ->output();
$pos = strpos($data,$search);
if (pos){...}
Source:
<?php
class PDF2Text {
// Some settings
var $multibyte = 4; // Use setUnicode(TRUE|FALSE)
var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None)
var $showprogress = true; // TRUE if you have problems with time-out
// Variables
var $filename = '';
var $decodedtext = '';
function setFilename($filename) {
// Reset
$this->decodedtext = '';
$this->filename = $filename;
}
function output($echo = false) {
if($echo) echo $this->decodedtext;
else return $this->decodedtext;
}
function setUnicode($input) {
// 4 for unicode. But 2 should work in most cases just fine
if($input == true) $this->multibyte = 4;
else $this->multibyte = 2;
}
function decodePDF() {
// Read the data from pdf file
$infile = #file_get_contents($this->filename, FILE_BINARY);
if (empty($infile))
return "";
// Get all text data.
$transformations = array();
$texts = array();
// Get the list of all objects.
preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile . "endobj\r", $objects);
$objects = #$objects[1];
// Select objects with streams.
for ($i = 0; $i < count($objects); $i++) {
$currentObject = $objects[$i];
// Prevent time-out
#set_time_limit ();
if($this->showprogress) {
// echo ". ";
flush(); ob_flush();
}
// Check if an object includes data stream.
if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject . "endstream\r", $stream )) {
$stream = ltrim($stream[1]);
// Check object parameters and look for text data.
$options = $this->getObjectOptions($currentObject);
if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])) )
// if ( $options["Image"] && $options["Subtype"] )
// if (!(empty($options["Length1"]) && empty($options["Subtype"])) )
continue;
// Hack, length doesnt always seem to be correct
unset($options["Length"]);
// So, we have text data. Decode it.
$data = $this->getDecodedStream($stream, $options);
if (strlen($data)) {
if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data . "ET\r", $textContainers)) {
$textContainers = #$textContainers[1];
$this->getDirtyTexts($texts, $textContainers);
} else
$this->getCharTransformations($transformations, $data);
}
}
}
// Analyze text blocks taking into account character transformations and return results.
$this->decodedtext = $this->getTextUsingTransformations($texts, $transformations);
}
function decodeAsciiHex($input) {
$output = "";
$isOdd = true;
$isComment = false;
for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) {
$c = $input[$i];
if($isComment) {
if ($c == '\r' || $c == '\n')
$isComment = false;
continue;
}
switch($c) {
case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break;
case '%':
$isComment = true;
break;
default:
$code = hexdec($c);
if($code === 0 && $c != '0')
return "";
if($isOdd)
$codeHigh = $code;
else
$output .= chr($codeHigh * 16 + $code);
$isOdd = !$isOdd;
break;
}
}
if($input[$i] != '>')
return "";
if($isOdd)
$output .= chr($codeHigh * 16);
return $output;
}
function decodeAscii85($input) {
$output = "";
$isComment = false;
$ords = array();
for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) {
$c = $input[$i];
if($isComment) {
if ($c == '\r' || $c == '\n')
$isComment = false;
continue;
}
if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ')
continue;
if ($c == '%') {
$isComment = true;
continue;
}
if ($c == 'z' && $state === 0) {
$output .= str_repeat(chr(0), 4);
continue;
}
if ($c < '!' || $c > 'u')
return "";
$code = ord($input[$i]) & 0xff;
$ords[$state++] = $code - ord('!');
if ($state == 5) {
$state = 0;
for ($sum = 0, $j = 0; $j < 5; $j++)
$sum = $sum * 85 + $ords[$j];
for ($j = 3; $j >= 0; $j--)
$output .= chr($sum >> ($j * 8));
}
}
if ($state === 1)
return "";
elseif ($state > 1) {
for ($i = 0, $sum = 0; $i < $state; $i++)
$sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i);
for ($i = 0; $i < $state - 1; $i++) {
try {
if(false == ($o = chr($sum >> ((3 - $i) * 8)))) {
throw new Exception('Error');
}
$output .= $o;
} catch (Exception $e) { /*Dont do anything*/ }
}
}
return $output;
}
function decodeFlate($data) {
return #gzuncompress($data);
}
function getObjectOptions($object) {
$options = array();
if (preg_match("#<<(.*)>>#ismU", $object, $options)) {
$options = explode("/", $options[1]);
#array_shift($options);
$o = array();
for ($j = 0; $j < #count($options); $j++) {
$options[$j] = preg_replace("#\s+#", " ", trim($options[$j]));
if (strpos($options[$j], " ") !== false) {
$parts = explode(" ", $options[$j]);
$o[$parts[0]] = $parts[1];
} else
$o[$options[$j]] = true;
}
$options = $o;
unset($o);
}
return $options;
}
function getDecodedStream($stream, $options) {
$data = "";
if (empty($options["Filter"]))
$data = $stream;
else {
$length = !empty($options["Length"]) ? $options["Length"] : strlen($stream);
$_stream = substr($stream, 0, $length);
foreach ($options as $key => $value) {
if ($key == "ASCIIHexDecode")
$_stream = $this->decodeAsciiHex($_stream);
elseif ($key == "ASCII85Decode")
$_stream = $this->decodeAscii85($_stream);
elseif ($key == "FlateDecode")
$_stream = $this->decodeFlate($_stream);
elseif ($key == "Crypt") { // TO DO
}
}
$data = $_stream;
}
return $data;
}
function getDirtyTexts(&$texts, $textContainers) {
for ($j = 0; $j < count($textContainers); $j++) {
if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts))
$texts = array_merge($texts, array(#implode('', $parts[1])));
elseif (preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
$texts = array_merge($texts, array(#implode('', $parts[1])));
elseif (preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
$texts = array_merge($texts, array(#implode('', $parts[1])));
}
}
function getCharTransformations(&$transformations, $stream) {
preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);
for ($j = 0; $j < count($chars); $j++) {
$count = $chars[$j][1];
$current = explode("\n", trim($chars[$j][2]));
for ($k = 0; $k < $count && $k < count($current); $k++) {
if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map))
$transformations[str_pad($map[1], 4, "0")] = $map[2];
}
}
for ($j = 0; $j < count($ranges); $j++) {
$count = $ranges[$j][1];
$current = explode("\n", trim($ranges[$j][2]));
for ($k = 0; $k < $count && $k < count($current); $k++) {
if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) {
$from = hexdec($map[1]);
$to = hexdec($map[2]);
$_from = hexdec($map[3]);
for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
$transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
} elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) {
$from = hexdec($map[1]);
$to = hexdec($map[2]);
$parts = preg_split("#\s+#", trim($map[3]));
for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++)
$transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n]));
}
}
}
}
function getTextUsingTransformations($texts, $transformations) {
$document = "";
for ($i = 0; $i < count($texts); $i++) {
$isHex = false;
$isPlain = false;
$hex = "";
$plain = "";
for ($j = 0; $j < strlen($texts[$i]); $j++) {
$c = $texts[$i][$j];
switch($c) {
case "<":
$hex = "";
$isHex = true;
$isPlain = false;
break;
case ">":
$hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO)
for ($k = 0; $k < count($hexs); $k++) {
$chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero
if (isset($transformations[$chex]))
$chex = $transformations[$chex];
$document .= html_entity_decode("&#x".$chex.";");
}
$isHex = false;
break;
case "(":
$plain = "";
$isPlain = true;
$isHex = false;
break;
case ")":
$document .= $plain;
$isPlain = false;
break;
case "\\":
$c2 = $texts[$i][$j + 1];
if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2;
elseif ($c2 == "n") $plain .= '\n';
elseif ($c2 == "r") $plain .= '\r';
elseif ($c2 == "t") $plain .= '\t';
elseif ($c2 == "b") $plain .= '\b';
elseif ($c2 == "f") $plain .= '\f';
elseif ($c2 >= '0' && $c2 <= '9') {
$oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3));
$j += strlen($oct) - 1;
$plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes);
}
$j++;
break;
default:
if ($isHex)
$hex .= $c;
elseif ($isPlain)
$plain .= $c;
break;
}
}
$document .= "\n";
}
return $document;
}
}
?>
I know there are a lot of PDF extraction methods/techniques, but I'm after a reliable text extractor for PDFs in PHP. All I want is to extract words, but not numbers and no special characters.
Any ideas of solid techniques to achieve this?
The Zend Framework provides Zend_Pdf, a php class that will load and parse pdf documents.
Here is a script that shows how to extract the text from a loaded Zend_Pdf object.
If you use laravel, you can try my library...
<?php
namespace App\Helpers;
/*
SYNTAX:
include('class.pdf2text.php');
$a = new PDF2Text();
$a->setFilename('test.pdf');
$a->decodePDF();
echo $a->output();
*/
class PDF2Text {
// Some settings
var $multibyte = 4; // Use setUnicode(TRUE|FALSE)
var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None)
var $showprogress = false; // TRUE if you have problems with time-out
// Variables
var $filename = '';
var $decodedtext = '';
function setFilename($filename) {
$this->decodedtext = '';
$this->filename = $filename;
}
function output($echo = false) {
if($echo) echo $this->decodedtext;
else return $this->decodedtext;
}
function setUnicode($input) {
// 4 for unicode. But 2 should work in most cases just fine
if($input == true) $this->multibyte = 4;
else $this->multibyte = 2;
}
function decodePDF() {
// Read the data from pdf file
$infile = #file_get_contents($this->filename, FILE_BINARY);
if (empty($infile))
return "";
// Get all text data.
$transformations = array();
$texts = array();
// Get the list of all objects.
preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile . "endobj\r", $objects);
$objects = #$objects[1];
// Select objects with streams.
for ($i = 0; $i < count($objects); $i++) {
$currentObject = $objects[$i];
// Prevent time-out
#set_time_limit ();
if($this->showprogress) {
flush(); ob_flush();
}
// Check if an object includes data stream.
if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject . "endstream\r", $stream )) {
$stream = ltrim($stream[1]);
// Check object parameters and look for text data.
$options = $this->getObjectOptions($currentObject);
if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])) )
continue;
unset($options["Length"]);
// So, we have text data. Decode it.
$data = $this->getDecodedStream($stream, $options);
if (strlen($data)) {
if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data . "ET\r", $textContainers)) {
$textContainers = #$textContainers[1];
$this->getDirtyTexts($texts, $textContainers);
} else
$this->getCharTransformations($transformations, $data);
}
}
}
// Analyze text blocks taking into account character transformations and return results.
$this->decodedtext = $this->getTextUsingTransformations($texts, $transformations);
}
function decodeAsciiHex($input) {
$output = "";
$isOdd = true;
$isComment = false;
for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) {
$c = $input[$i];
if($isComment) {
if ($c == '\r' || $c == '\n')
$isComment = false;
continue;
}
switch($c) {
case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break;
case '%':
$isComment = true;
break;
default:
$code = hexdec($c);
if($code === 0 && $c != '0')
return "";
if($isOdd)
$codeHigh = $code;
else
$output .= chr($codeHigh * 16 + $code);
$isOdd = !$isOdd;
break;
}
}
if($input[$i] != '>')
return "";
if($isOdd)
$output .= chr($codeHigh * 16);
return $output;
}
function decodeAscii85($input) {
$output = "";
$isComment = false;
$ords = array();
for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) {
$c = $input[$i];
if($isComment) {
if ($c == '\r' || $c == '\n')
$isComment = false;
continue;
}
if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ')
continue;
if ($c == '%') {
$isComment = true;
continue;
}
if ($c == 'z' && $state === 0) {
$output .= str_repeat(chr(0), 4);
continue;
}
if ($c < '!' || $c > 'u')
return "";
$code = ord($input[$i]) & 0xff;
$ords[$state++] = $code - ord('!');
if ($state == 5) {
$state = 0;
for ($sum = 0, $j = 0; $j < 5; $j++)
$sum = $sum * 85 + $ords[$j];
for ($j = 3; $j >= 0; $j--)
$output .= chr($sum >> ($j * 8));
}
}
if ($state === 1)
return "";
elseif ($state > 1) {
for ($i = 0, $sum = 0; $i < $state; $i++)
$sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i);
for ($i = 0; $i < $state - 1; $i++) {
try {
if(false == ($o = chr($sum >> ((3 - $i) * 8)))) {
throw new Exception('Error');
}
$output .= $o;
} catch (Exception $e) { /*Dont do anything*/ }
}
}
return $output;
}
function decodeFlate($data) {
return #gzuncompress($data);
}
function getObjectOptions($object) {
$options = array();
if (preg_match("#<<(.*)>>#ismU", $object, $options)) {
$options = explode("/", $options[1]);
#array_shift($options);
$o = array();
for ($j = 0; $j < #count($options); $j++) {
$options[$j] = preg_replace("#\s+#", " ", trim($options[$j]));
if (strpos($options[$j], " ") !== false) {
$parts = explode(" ", $options[$j]);
$o[$parts[0]] = $parts[1];
} else
$o[$options[$j]] = true;
}
$options = $o;
unset($o);
}
return $options;
}
function getDecodedStream($stream, $options) {
$data = "";
if (empty($options["Filter"]))
$data = $stream;
else {
$length = !empty($options["Length"]) ? $options["Length"] : strlen($stream);
$_stream = substr($stream, 0, $length);
foreach ($options as $key => $value) {
if ($key == "ASCIIHexDecode")
$_stream = $this->decodeAsciiHex($_stream);
elseif ($key == "ASCII85Decode")
$_stream = $this->decodeAscii85($_stream);
elseif ($key == "FlateDecode")
$_stream = $this->decodeFlate($_stream);
elseif ($key == "Crypt") { // TO DO
}
}
$data = $_stream;
}
return $data;
}
function getDirtyTexts(&$texts, $textContainers) {
for ($j = 0; $j < count($textContainers); $j++) {
if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts))
$texts = array_merge($texts, array(#implode('', $parts[1])));
elseif (preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
$texts = array_merge($texts, array(#implode('', $parts[1])));
elseif (preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts))
$texts = array_merge($texts, array(#implode('', $parts[1])));
}
}
function getCharTransformations(&$transformations, $stream) {
preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);
for ($j = 0; $j < count($chars); $j++) {
$count = $chars[$j][1];
$current = explode("\n", trim($chars[$j][2]));
for ($k = 0; $k < $count && $k < count($current); $k++) {
if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map))
$transformations[str_pad($map[1], 4, "0")] = $map[2];
}
}
for ($j = 0; $j < count($ranges); $j++) {
$count = $ranges[$j][1];
$current = explode("\n", trim($ranges[$j][2]));
for ($k = 0; $k < $count && $k < count($current); $k++) {
if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) {
$from = hexdec($map[1]);
$to = hexdec($map[2]);
$_from = hexdec($map[3]);
for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
$transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
} elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) {
$from = hexdec($map[1]);
$to = hexdec($map[2]);
$parts = preg_split("#\s+#", trim($map[3]));
for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++)
$transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n]));
}
}
}
}
function getTextUsingTransformations($texts, $transformations) {
$document = "";
for ($i = 0; $i < count($texts); $i++) {
$isHex = false;
$isPlain = false;
$hex = "";
$plain = "";
for ($j = 0; $j < strlen($texts[$i]); $j++) {
$c = $texts[$i][$j];
switch($c) {
case "<":
$hex = "";
$isHex = true;
$isPlain = false;
break;
case ">":
$hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO)
for ($k = 0; $k < count($hexs); $k++) {
$chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero
if (isset($transformations[$chex]))
$chex = $transformations[$chex];
$document .= html_entity_decode("&#x".$chex.";");
}
$isHex = false;
break;
case "(":
$plain = "";
$isPlain = true;
$isHex = false;
break;
case ")":
$document .= $plain;
$isPlain = false;
break;
case "\\":
$c2 = $texts[$i][$j + 1];
if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2;
elseif ($c2 == "n") $plain .= '\n';
elseif ($c2 == "r") $plain .= '\r';
elseif ($c2 == "t") $plain .= '\t';
elseif ($c2 == "b") $plain .= '\b';
elseif ($c2 == "f") $plain .= '\f';
elseif ($c2 >= '0' && $c2 <= '9') {
$oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3));
$j += strlen($oct) - 1;
$plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes);
}
$j++;
break;
default:
if ($isHex)
$hex .= $c;
elseif ($isPlain)
$plain .= $c;
break;
}
}
$document .= "\n";
//$document .= "<br>";
}
return $document;
}
}
how remove multi line comment ? ( /* comment php */) from file .php
how remove single comment ? ( // coment ) from file.php
how remove enter end line ?
sample :
single comment line
$G["url"] = "http://".$_SERVER["HTTP_HOST"]
multi comment line
/*
* 310
* - "::"
* - $col
*/
Try using the code posted below, it even ignores comment tokens on strings like " /* comment2 */ "
<?php
$fileIn = "src.php";
$fileOut = "srcCompressed.php";
$text = file_get_contents($fileIn);
$text = compress_php_src($text);
file_put_contents($fileOut, $text);
function compress_php_src($src) {
// Whitespaces left and right from this signs can be ignored
static $IW = array(
T_CONCAT_EQUAL, // .=
T_DOUBLE_ARROW, // =>
T_BOOLEAN_AND, // &&
T_BOOLEAN_OR, // ||
T_IS_EQUAL, // ==
T_IS_NOT_EQUAL, // != or <>
T_IS_SMALLER_OR_EQUAL, // <=
T_IS_GREATER_OR_EQUAL, // >=
T_INC, // ++
T_DEC, // --
T_PLUS_EQUAL, // +=
T_MINUS_EQUAL, // -=
T_MUL_EQUAL, // *=
T_DIV_EQUAL, // /=
T_IS_IDENTICAL, // ===
T_IS_NOT_IDENTICAL, // !==
T_DOUBLE_COLON, // ::
T_PAAMAYIM_NEKUDOTAYIM, // ::
T_OBJECT_OPERATOR, // ->
T_DOLLAR_OPEN_CURLY_BRACES, // ${
T_AND_EQUAL, // &=
T_MOD_EQUAL, // %=
T_XOR_EQUAL, // ^=
T_OR_EQUAL, // |=
T_SL, // <<
T_SR, // >>
T_SL_EQUAL, // <<=
T_SR_EQUAL, // >>=
);
if(is_file($src)) {
if(!$src = file_get_contents($src)) {
return false;
}
}
$tokens = token_get_all($src);
$new = "";
$c = sizeof($tokens);
$iw = false; // ignore whitespace
$ih = false; // in HEREDOC
$ls = ""; // last sign
$ot = null; // open tag
for($i = 0; $i < $c; $i++) {
$token = $tokens[$i];
if(is_array($token)) {
list($tn, $ts) = $token; // tokens: number, string, line
$tname = token_name($tn);
if($tn == T_INLINE_HTML) {
$new .= $ts;
$iw = false;
} else {
if($tn == T_OPEN_TAG) {
if(strpos($ts, " ") || strpos($ts, "\n") || strpos($ts, "\t") || strpos($ts, "\r")) {
$ts = rtrim($ts);
}
$ts .= " ";
$new .= $ts;
$ot = T_OPEN_TAG;
$iw = true;
} elseif($tn == T_OPEN_TAG_WITH_ECHO) {
$new .= $ts;
$ot = T_OPEN_TAG_WITH_ECHO;
$iw = true;
} elseif($tn == T_CLOSE_TAG) {
if($ot == T_OPEN_TAG_WITH_ECHO) {
$new = rtrim($new, "; ");
} else {
$ts = " ".$ts;
}
$new .= $ts;
$ot = null;
$iw = false;
} elseif(in_array($tn, $IW)) {
$new .= $ts;
$iw = true;
} elseif($tn == T_CONSTANT_ENCAPSED_STRING
|| $tn == T_ENCAPSED_AND_WHITESPACE)
{
if($ts[0] == '"') {
$ts = addcslashes($ts, "\n\t\r");
}
$new .= $ts;
$iw = true;
} elseif($tn == T_WHITESPACE) {
$nt = #$tokens[$i+1];
if(!$iw && (!is_string($nt) || $nt == '$') && !in_array($nt[0], $IW)) {
$new .= " ";
}
$iw = false;
} elseif($tn == T_START_HEREDOC) {
$new .= "<<<S\n";
$iw = false;
$ih = true; // in HEREDOC
} elseif($tn == T_END_HEREDOC) {
$new .= "S;";
$iw = true;
$ih = false; // in HEREDOC
for($j = $i+1; $j < $c; $j++) {
if(is_string($tokens[$j]) && $tokens[$j] == ";") {
$i = $j;
break;
} else if($tokens[$j][0] == T_CLOSE_TAG) {
break;
}
}
} elseif($tn == T_COMMENT || $tn == T_DOC_COMMENT) {
$iw = true;
} else {
if(!$ih) {
$ts = strtolower($ts);
}
$new .= $ts;
$iw = false;
}
}
$ls = "";
} else {
if(($token != ";" && $token != ":") || $ls != $token) {
$new .= $token;
$ls = $token;
}
$iw = true;
}
}
return $new;
}
?>
Credits to gelamu function compress_php_src().
There's no reason to remove comments from your code. The overhead from processing them is so incredibly minimal that it doesn't justify the effort.
I think you are probably looking for php_strip_whitespace().