I am trying to create a JSON file from a large database query dump. It works when I set a LIMIT of 100000 rows, but when I ask for all rows it just goes to a 502 error (the page request was cancelled because it took too long to complete). Is there a way to streamline the creation of the JSON file in bits with PHP, or is there a library out there that will let me build the JSON file in parts?
Basically, I am running a .php file here to try to get all orders in JSON format from WooCommerce, since the plugin I purchased, "CSV Import Suite", does not work when importing orders; it just stays in the queue.
So, I decided to try and export all orders myself, but I keep hitting a 502 error page and the .json file never gets created either, so I'm thinking I need a way to stream this somehow. Any help on this would be appreciated...
ini_set('memory_limit', '-1');
ini_set('max_execution_time', '-1');
set_time_limit(0);
error_reporting(E_ALL);
ob_implicit_flush(TRUE);
ob_end_flush();
global $wpdb, $root_dir;
if (!defined('ABSPATH'))
$root_dir = dirname(__FILE__) . '/';
else
$root_dir = ABSPATH;
$download = isset($_GET['download']);
// Allows us to use WP functions in a .php file without 404 headers!
require_once($root_dir . 'wp-config.php');
$wp->init();
$wp->parse_request();
$wp->query_posts();
$wp->register_globals();
if (empty($download))
$wp->send_headers();
// exclude
$exclude_post_statuses = array('trash', 'wc-refunded', 'wc-cancelled');
$start_date = !empty($_GET['start_date']) ? DateTime::createFromFormat('Y-m-d', $_GET['start_date']) : '';
$end_date = !empty($_GET['end_date']) ? DateTime::createFromFormat('Y-m-d', $_GET['end_date']) : '';
$order_db = array(
'columns' => array(
'p' => array('ID', 'post_author', 'post_date', 'post_date_gmt', 'post_content', 'post_title', 'post_excerpt', 'post_status', 'comment_status', 'ping_status', 'post_password', 'post_name', 'to_ping', 'pinged', 'post_modified', 'post_modified_gmt', 'post_content_filtered', 'post_parent', 'guid', 'menu_order', 'post_type', 'post_mime_type', 'comment_count'),
'pm' => array('meta_id', 'post_id', 'meta_key', 'meta_value'),
'oi' => array('order_item_id', 'order_item_name', 'order_item_type', 'order_id'),
'oim' => array('meta_id', 'order_item_id', 'meta_key', 'meta_value')
)
);
$select_data = '';
$total_columns = count($order_db['columns']);
$i = 1;
foreach($order_db['columns'] as $column_key => $columns)
{
$select_data .= implode(', ', array_map(
function ($v, $k) { return $k . '.' . $v . ' AS ' . $k . '_' . $v; },
$columns,
array_fill(0, count($columns), $column_key)
));
if ($i < $total_columns)
$select_data .= ', ';
$i++;
}
// HUGE DATABASE DUMP HERE, needs to be converted to JSON, after getting all columns of all tables...
$orders_query = $wpdb->get_results('
SELECT ' . $select_data . '
FROM ' . $wpdb->posts . ' AS p
INNER JOIN ' . $wpdb->postmeta . ' AS pm ON (pm.post_id = p.ID)
LEFT JOIN ' . $wpdb->prefix . 'woocommerce_order_items AS oi ON (oi.order_id = p.ID)
LEFT JOIN ' . $wpdb->prefix . 'woocommerce_order_itemmeta AS oim ON (oim.order_item_id = oi.order_item_id)
WHERE p.post_type = "shop_order"' . (!empty($exclude_post_statuses) ? ' AND p.post_status NOT IN ("' . implode('","', $exclude_post_statuses) . '")' : '') . (!empty($start_date) ? ' AND post_date >= "' . $start_date->format('Y-m-d H:i:s') . '"' : '') . (!empty($end_date) ? ' AND post_date <= "' . $end_date->format('Y-m-d H:i:s') . '"' : '') . '
ORDER BY p.ID ASC', ARRAY_A);
$json = array();
if (!empty($orders_query))
{
foreach($orders_query as $order_query)
{
if (!isset($json[$order_query['p_post_type']], $json[$order_query['p_post_type']][$order_query['p_post_name']]))
$json[$order_query['p_post_type']][$order_query['p_post_name']] = array(
'posts' => array(),
'postmeta' => array(),
'woocommerce_order_items' => array(),
'woocommerce_order_itemmeta' => array()
);
if (!empty($order_query['p_ID']))
$json[$order_query['p_post_type']][$order_query['p_post_name']]['posts'][$order_query['p_ID']] = array_filter($order_query, function($k) {
$is_p = strpos($k, 'p_');
return $is_p !== FALSE && empty($is_p);
}, ARRAY_FILTER_USE_KEY);
if (!empty($order_query['pm_meta_id']))
$json[$order_query['p_post_type']][$order_query['p_post_name']]['postmeta'][$order_query['pm_meta_id']] = array_filter($order_query, function($k) {
$is_pm = strpos($k, 'pm_');
return $is_pm !== FALSE && empty($is_pm);
}, ARRAY_FILTER_USE_KEY);
if (!empty($order_query['oi_order_item_id']))
$json[$order_query['p_post_type']][$order_query['p_post_name']]['woocommerce_order_items'][$order_query['oi_order_item_id']] = array_filter($order_query, function($k) {
$is_io = strpos($k, 'oi_');
return $is_io !== FALSE && empty($is_io);
}, ARRAY_FILTER_USE_KEY);
if (!empty($order_query['oim_meta_id']))
$json[$order_query['p_post_type']][$order_query['p_post_name']]['woocommerce_order_itemmeta'][$order_query['oim_meta_id']] = array_filter($order_query, function($k) {
$is_oim = strpos($k, 'oim_');
return $is_oim !== FALSE && empty($is_oim);
}, ARRAY_FILTER_USE_KEY);
}
}
// Downloading or viewing?
if (!empty($download))
{
// Writes the JSON to a file and sends it to the browser as a download...
if (!empty($json))
{
$filename = uniqid('orders_') . '.json';
$fp = fopen($root_dir . $filename, 'w');
fwrite($fp, json_encode($json));
fclose($fp);
$size = filesize($root_dir . $filename);
header('Content-Description: File Transfer');
header('Content-Type: application/octet-stream');
header("Content-Disposition: attachment; filename=\"" . $filename . "\"");
header('Content-Transfer-Encoding: binary');
header('Connection: Keep-Alive');
header('Expires: 0');
header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
header('Pragma: public');
header('Content-Length: ' . $size);
readfile($root_dir . $filename);
}
}
else
{
// Outputs json in a textarea for you to copy and paste into a .json file for import...
if (!empty($json))
echo '<textarea cols="200" rows="50">', json_encode($json), '</textarea>';
}
The JSON file created could well be over 500 MB, possibly even up to a gig of data. So I believe PHP is running out of memory here, and the data needs to be processed bit by bit somehow, either in the background or entirely, without hitting the PHP memory limit. I believe the memory limit is set to 1024 MB, which is pretty high, but not high enough, and to be honest, for what I'm doing, I don't think we can ever have enough memory to perform the operation as-is. Something needs to change in how I process the JSON and/or download it. And I do not want to create multiple JSON files, please, only one JSON file.
I think there might be a couple of issues. Firstly, I would suggest you do some profiling.
// HUGE DATABASE DUMP HERE, needs to be converted to JSON, after getting all columns of all tables...
echo 'Start Time: '. date("Y-m-d H:i:s");
echo ' Memory Usage: ' . (memory_get_usage()/1048576) . " MB\n";
$orders_query = $wpdb->get_results('
SELECT ' . $select_data . '
FROM ' . $wpdb->posts . ' AS p
INNER JOIN ' . $wpdb->postmeta . ' AS pm ON (pm.post_id = p.ID)
LEFT JOIN ' . $wpdb->prefix . 'woocommerce_order_items AS oi ON (oi.order_id = p.ID)
LEFT JOIN ' . $wpdb->prefix . 'woocommerce_order_itemmeta AS oim ON (oim.order_item_id = oi.order_item_id)
WHERE p.post_type = "shop_order"' . (!empty($exclude_post_statuses) ? ' AND p.post_status NOT IN ("' . implode('","', $exclude_post_statuses) . '")' : '') . (!empty($start_date) ? ' AND post_date >= "' . $start_date->format('Y-m-d H:i:s') . '"' : '') . (!empty($end_date) ? ' AND post_date <= "' . $end_date->format('Y-m-d H:i:s') . '"' : '') . '
ORDER BY p.ID ASC', ARRAY_A);
echo 'End Time: '. date("Y-m-d H:i:s");
echo ' Memory Usage: ' . (memory_get_usage()/1048576) . " MB\n";
die('Finished');
$json = array();
The above will help you know how much memory is in use up to this point. If it fails before it prints 'Finished', we know it is not a JSON issue. If the script works fine, then we can first create a CSV file rather than JSON. Since you are running a select query, at this point it does not have to be the nested JSON file you require; a flat structure can be achieved by just creating a CSV file.
$csvFile = uniqid('orders') . '.csv';
$fp = fopen($csvFile, 'w');
if (!empty($orders_query))
{
$firstRow = true;
foreach($orders_query as $order_query)
{
if(true === $firstRow) {
$keys = array_keys($order_query);
fputcsv($fp, $keys); // write the header row once
$firstRow = false;
}
fputcsv($fp, $order_query);
}
}
fclose($fp);
If the above works, you at least have a CSV file to work with.
At this point I am not sure how deeply nested your data structure is, for example how many distinct values of 'p_post_type' and 'p_post_name' you have. You might have to parse the CSV file and create a separate JSON file for each ['p_post_type']['p_post_name']['posts'], ['p_post_type']['p_post_name']['postmeta'], ['p_post_type']['p_post_name']['woocommerce_order_items'] and ['p_post_type']['p_post_name']['woocommerce_order_itemmeta'].
If the number of files is small, you can write a script to merge them automatically or do it manually. If you have too many nested items, a lot of JSON files might be created, and merging them might not be feasible.
If there would be a lot of JSON files, I would like to know the purpose of having such a huge single JSON file. If export is an issue, import will be an issue too, especially ingesting such a huge JSON file into memory. If the purpose of creating the JSON file is to import it in some form at some stage in the future, I think you might have to look at just keeping a CSV file instead, which you can use to filter out whatever is required at that point in time.
I hope this helps.
I hope this helps.
FURTHER UPDATE
It looks to me like $wpdb->get_results uses mysqli_query/mysql_query (depending on your configuration) to fetch the results; see the WordPress query docs. Fetching data this way is not memory efficient, and I believe you might be failing at this point ($wpdb->get_results) itself. I would suggest you run the query without using $wpdb. There is the concept of an unbuffered query for large data retrieval, which has a very low impact on memory; further information can be found here: mysql unbuffering.
Even if you get past this point, you will still run into memory issues, due to the way you are storing everything in the $json variable, which eats up a lot of your memory. $json is an array, and it is worth knowing how PHP arrays work. PHP arrays are dynamic, and they do not allocate extra memory every time a new element is added, since that would be extremely slow. Instead, when the current capacity is exhausted, the array roughly doubles its capacity, and in the process the engine can temporarily need close to twice that amount of memory. This has been less of an issue since PHP 7, because of major changes to the PHP core, but if you have 2 GB of data that has to be stored in $json, the script can easily allocate anywhere between 3 and 4 GB of memory, depending on when it hits the limit. Further details can be found here: php array and How does PHP memory actually work.
If you consider the overhead of $orders_query, which is also an array, combined with the overhead of $json, the total is quite substantial because of the way PHP arrays work.
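As a rough illustration (not part of the original script, just a minimal sketch), you can watch PHP grow an array in chunks by printing memory_get_usage() while appending elements:
// Minimal demo of array growth: memory jumps in steps, not per element.
$rows = array();
$last = memory_get_usage();
for ($i = 0; $i < 2000000; $i++) {
    $rows[] = str_repeat('x', 32); // stand-in for one result row
    $now = memory_get_usage();
    if ($now - $last > 10 * 1048576) { // report roughly every 10 MB of growth
        echo number_format($i) . ' elements: ' . round($now / 1048576) . " MB\n";
        $last = $now;
    }
}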
You could also try creating another database, B: while you read from database A, you simultaneously write the data into database B. In the end you have database B with all the data in it, with the power of MySQL behind it. You could also push the same data into MongoDB, which would be lightning fast and might help with the JSON nesting you are after; MongoDB is meant to work efficiently with large datasets.
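If you went the MongoDB route, a minimal sketch might look like the following. It assumes the mongodb extension plus the mongodb/mongodb Composer library, and the database and collection names (orders_export, order_rows) are made up for the example; each row is written as soon as it is read, so nothing accumulates in PHP:
require 'vendor/autoload.php';

$client = new MongoDB\Client('mongodb://127.0.0.1:27017');
$collection = $client->orders_export->order_rows; // hypothetical database and collection

// $rows stands in for rows fetched from MySQL one batch (or one row) at a time
$rows = array(array('p_ID' => 1, 'p_post_name' => 'order-1'));
foreach ($rows as $row) {
    $collection->insertOne($row); // written immediately, not held in a PHP array
}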
JSON STREAMING SOLUTION
Firstly, I would like to say that streaming is a sequential/linear process. As such, it has no memory of what was added before this point in time or what will be added after it. It works in small chunks, and that is why it is so memory efficient. So when you write or read, the responsibility lies with the script to maintain a specific order, which is a way of saying you are writing/reading your own JSON, because streaming only understands text, has no clue what JSON is, and won't bother checking whether you are writing/reading a correct one.
I have found a library on GitHub, https://github.com/skolodyazhnyy/json-stream, which should help you achieve what you want. I have experimented with the code and I can see it will work for you with some tweaks to your code.
I am going to write some pseudo-code for you.
// Order is important in this query, as streaming requires a stable order.
$query1 = select distinct p_post_type from ...YOUR QUERY... order by p_post_type;
$result1 = based on $query1;
$filename = 'data.json';
$fh = fopen($filename, "w");
$writer = new Writer($fh);
$writer->enter(Writer::TYPE_OBJECT);
foreach ($result1 as $fields1) {
    $posttype = $fields1['p_post_type'];
    $writer->enter($posttype, Writer::TYPE_OBJECT);
    $query2 = select distinct p_post_name from ...YOUR QUERY... YOUR WHERE ... and p_post_type = $posttype order by p_post_type, p_post_name;
    $result2 = based on $query2;
    foreach ($result2 as $fields2) {
        $postname = $fields2['p_post_name'];
        $writer->enter($postname, Writer::TYPE_OBJECT);
        $query3 = select ..YOUR COLUMNS.. from ...YOUR QUERY... YOUR WHERE ... and p_post_type = $posttype and p_post_name = $postname and p_ID is not null order by p_ID;
        $result3 = based on $query3;
        $writer->enter('posts', Writer::TYPE_ARRAY);
        foreach ($result3 as $field3) {
            // write an array item
            $writer->write(null, $field3);
        }
        $writer->leave();
        $query4 = select ..YOUR COLUMNS.. from ...YOUR QUERY... YOUR WHERE ... and p_post_type = $posttype and p_post_name = $postname and pm_meta_id is not null order by pm_meta_id;
        $result4 = based on $query4;
        $writer->enter('postmeta', Writer::TYPE_ARRAY);
        foreach ($result4 as $field4) {
            // write an array item
            $writer->write(null, $field4);
        }
        $writer->leave();
        $query5 = select ..YOUR COLUMNS.. from ...YOUR QUERY... YOUR WHERE ... and p_post_type = $posttype and p_post_name = $postname and oi_order_item_id is not null order by oi_order_item_id;
        $result5 = based on $query5;
        $writer->enter('woocommerce_order_items', Writer::TYPE_ARRAY);
        foreach ($result5 as $field5) {
            // write an array item
            $writer->write(null, $field5);
        }
        $writer->leave();
        $query6 = select ..YOUR COLUMNS.. from ...YOUR QUERY... YOUR WHERE ... and p_post_type = $posttype and p_post_name = $postname and oim_meta_id is not null order by oim_meta_id;
        $result6 = based on $query6;
        $writer->enter('woocommerce_order_itemmeta', Writer::TYPE_ARRAY);
        foreach ($result6 as $field6) {
            // write an array item
            $writer->write(null, $field6);
        }
        $writer->leave();
        $writer->leave(); // close the $postname object
    }
    $writer->leave(); // close the $posttype object
}
$writer->leave(); // close the root object
fclose($fh);
You might have to start by limiting your queries to 10 or so until you get it right, since the code above might not work exactly as it is. You should be able to read the file back in a similar fashion, as the same library has a Reader class to help. I have tested both the reader and the writer and they seem to work fine.
Creating the file
The problem with your code is that you are trying to fit the whole dataset into memory, which will eventually fail as soon as your database gets large enough. To overcome this you have to fetch the data in batches.
We are going to generate the query multiple times, so I extracted your query into a function. I skipped passing the required parameters (or making them global, if you will) for brevity, so you will have to get that working yourself.
function generate_query($select, $limit = null, $offset = null) {
$query = 'SELECT ' . $select . '
FROM ' . $wpdb->posts . ' AS p
INNER JOIN ' . $wpdb->postmeta . ' AS pm ON (pm.post_id = p.ID)
LEFT JOIN ' . $wpdb->prefix . 'woocommerce_order_items AS oi ON (oi.order_id = p.ID)
LEFT JOIN ' . $wpdb->prefix . 'woocommerce_order_itemmeta AS oim ON (oim.order_item_id = oi.order_item_id)
WHERE p.post_type = "shop_order"' . (!empty($exclude_post_statuses) ? ' AND p.post_status NOT IN ("' . implode('","', $exclude_post_statuses) . '")' : '') . (!empty($start_date) ? ' AND post_date >= "' . $start_date->format('Y-m-d H:i:s') . '"' : '') . (!empty($end_date) ? ' AND post_date <= "' . $end_date->format('Y-m-d H:i:s') . '"' : '') . '
ORDER BY p.ID ASC';
if ($limit !== null && $offset !== null) {
$query .= ' LIMIT ' . $limit . ' OFFSET ' . $offset;
}
return $query;
}
Now we will get results from the DB in batches. We define the batch count, i.e. the number of records per iteration that we will load into memory. You can later play with this value to find one that is fast enough and won't make PHP crash. Keep in mind we want to reduce the number of database queries as much as possible:
define('BATCH_COUNT', 500);
Before we create the loop we need to know how many iterations (database calls) we will make, so we need the total order count. Having this and the batch count, we can calculate this value easily:
$orders_count = $wpdb->get_var(generate_query('COUNT(*)'));
$iteration_count = ceil($orders_count / BATCH_COUNT);
As a result we would like to have one huge JSON string inside the result file. Since each iteration gives us a separate JSON string containing an array of objects, we will simply strip the [ and ] from each side of that string, glue the batches together with commas, and append them to the file.
Final code:
define('FILE', 'dump.json');
file_put_contents(FILE, '[');
for ($i = 0; $i < $iteration_count; $i++) {
$offset = $i * BATCH_COUNT;
$result = $wpdb->get_results(
generate_query($select_data, BATCH_COUNT, $offset),
ARRAY_A
);
// do additional work here, add missing arrays etc.
// ...
// I assume here the $result is a valid array ready for
// creating JSON from it
// we append the result file with the partial JSON,
// separating batches with a comma so the final file stays valid JSON
file_put_contents(FILE, ($i > 0 ? ',' : '') . trim(json_encode($result), '[]'), FILE_APPEND);
}
file_put_contents(FILE, ']', FILE_APPEND);
Congratulations, you have just created your first huge JSON dump ;) You should run this script from the command line so it can take as long as it needs to; there's no need to modify the memory limit from now on, because we should never come close to hitting it.
Sending the file
Streaming large files with PHP is easy and has already been answered on SO many times. However, I personally don't recommend doing anything time-consuming in PHP, because it sucks as a long-running process, either on the command line or as a file server.
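For completeness, the usual pure-PHP approach is to read and echo the file in small chunks instead of loading it whole. A minimal sketch, assuming the dump.json file created above already exists:
$path = 'dump.json';
header('Content-Type: application/json');
header('Content-Disposition: attachment; filename="dump.json"');
header('Content-Length: ' . filesize($path));

$fh = fopen($path, 'rb');
while (!feof($fh)) {
    echo fread($fh, 8192); // 8 KB at a time keeps memory usage flat
    flush();
}
fclose($fh);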
I assume you are using Apache. You should consider using X-Sendfile (mod_xsendfile) and letting Apache do the hard work for you. This method is far more efficient when dealing with huge files, and it is very easy to use: all you need to do is pass the path to the file in a header:
header('X-Sendfile: ' . $path_to_the_file);
Should you use Nginx, the equivalent is the X-Accel-Redirect header.
This method does not use a lot of memory and does not block the PHP process, and the file does not even need to be accessible under the webroot. I use X-Sendfile all the time to serve 4K videos to authenticated users.
First, you should ask yourself a question: do I need to write the database dump myself?
If not, then you can simply use a tool that will do the work for you. Mysqldump-php should be able to do the job.
Then you can simply:
include_once(dirname(__FILE__) . '/mysqldump-php-2.0.0/src/Ifsnop/Mysqldump/Mysqldump.php');
$dump = new Ifsnop\Mysqldump\Mysqldump('mysql:host=localhost;dbname=testdb', 'username', 'password');
$dump->start('storage/work/dump.sql');
This should create a .sql file. However, you wanted a JSON file. That shouldn't be a problem, though; this tool will do the rest of the job: http://www.csvjson.com/sql2json
You can also find the source code of sql2json on github: https://github.com/martindrapeau/csvjson-app
I believe you may be looking for Generators
http://php.net/manual/en/language.generators.overview.php
https://scotch.io/tutorials/understanding-php-generators
Instead of creating that huge $json array, you iterate over each $order_query and perform your operations on each iteration, which removes the need to hold everything in memory.
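A minimal sketch of that idea, reusing a batched query builder like the generate_query() helper shown in another answer (assumed here to accept a limit and offset): the generator fetches one batch at a time and yields rows one by one, so only the current batch is ever in memory.
function orders($wpdb, $select_data, $batch = 500) {
    $offset = 0;
    do {
        $rows = $wpdb->get_results(generate_query($select_data, $batch, $offset), ARRAY_A);
        foreach ($rows as $row) {
            yield $row; // hand one row back to the caller
        }
        $offset += $batch;
    } while (!empty($rows));
}

foreach (orders($wpdb, $select_data) as $order_query) {
    // process/write $order_query here instead of pushing it onto a giant $json array
}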
Your problem is that your query returns a large result set, which is heavy since you have 3 joins.
You could define a limit and use an offset to get the data in chunks and then output your JSON in parts. The main problem is to somehow keep the JSON data somewhere and then access it to output it in parts.
For the latter, a cache or a NoSQL database could be used. My solution uses a cache, and in particular memcache:
class Cache {
private $cache;
public function __construct($cache)
{
$this->cache = $cache;
}
public function addPostName($postName)
{
$this->addKeyToJsonObject('postNames', $postName);
}
public function addKeyToJsonObject($rootName, $key)
{
$childNames = $this->cache->get($rootName);
if($childNames === false) {
$this->cache->set($rootName, [$key]);
}
else {
$childNamesList = $childNames;
// not found
if(array_search($key, $childNamesList) === false) {
$childNamesList[] = $key;
$this->cache->set($rootName, $childNamesList);
}
}
}
public function getPostNames()
{
return $this->cache->get('postNames');
}
public function set($key, $value) {
$this->cache->add($key, $value);
}
public function addPostIdsByNameAndType($postName, $type, $pid)
{
$this->addKeyToJsonObject($postName . '-' . $type, $pid);
}
public function getPostIdsByNameAndType($postName, $type)
{
return $this->cache->get($postName . '-' . $type);
}
public function addPostValueByNameTypeAndId($postName, $type, $pid, $value)
{
$this->cache->set($postName . '-' . $type . '-' . $pid, $value);
}
public function getPostValueByNameTypeAndId($postName, $type, $pid)
{
return $this->cache->get($postName . '-' . $type . '-' . $pid);
}
}
and then:
$memcache = new Memcache();
$memcache->connect('127.0.0.1', 11211) or die ("Could not connect");
$memcache->flush();
$cache = new Cache($memcache);
header('Content-disposition: attachment; filename=file.json');
header('Content-type: application/json');
echo '{"shop_order":{';
function getResultSet($wpdb, $offset = 0) {
// these come from the earlier part of the script
global $select_data, $exclude_post_statuses, $start_date, $end_date;
return $wpdb->get_results('
SELECT ' . $select_data . '
FROM ' . $wpdb->posts . ' AS p
INNER JOIN ' . $wpdb->postmeta . ' AS pm ON (pm.post_id = p.ID)
LEFT JOIN ' . $wpdb->prefix . 'woocommerce_order_items AS oi ON (oi.order_id = p.ID)
LEFT JOIN ' . $wpdb->prefix . 'woocommerce_order_itemmeta AS oim ON (oim.order_item_id = oi.order_item_id)
WHERE p.post_type = "shop_order"' . (!empty($exclude_post_statuses) ? ' AND p.post_status NOT IN ("' . implode('","', $exclude_post_statuses) . '")' : '') . (!empty($start_date) ? ' AND post_date >= "' . $start_date->format('Y-m-d H:i:s') . '"' : '') . (!empty($end_date) ? ' AND post_date <= "' . $end_date->format('Y-m-d H:i:s') . '"' : '') . '
ORDER BY p.ID ASC LIMIT 1000 OFFSET ' . $offset, ARRAY_A);
}
$offset = 0;
$orders_query = getResultSet($wpdb, $offset);
while(!empty($orders_query)) {
cacheRowData($cache, $orders_query);
$offset = $offset + 1000;
$orders_query = getResultSet($wpdb, $offset);
}
outputRowData($cache);
function cacheRowData($cache, $orders_query)
{
foreach($orders_query as $order_query) {
if(empty($order_query)) { continue; }
$cache->addPostName($order_query['p_post_name']);
// posts
if (!empty($order_query['p_ID'])) {
$cache->addPostIdsByNameAndType($order_query['p_post_name'],'posts', $order_query['p_ID']);
$value = array_filter($order_query, function($k) {
$is_p = strpos($k, 'p_');
return $is_p !== FALSE && empty($is_p);
}, ARRAY_FILTER_USE_KEY);
$cache->addPostValueByNameTypeAndId($order_query['p_post_name'],'posts', $order_query['p_ID'], $value);
}
if (!empty($order_query['pm_meta_id'])) {
$cache->addPostIdsByNameAndType($order_query['p_post_name'],'postmeta', $order_query['pm_meta_id']);
$value = array_filter($order_query, function($k) {
$is_pm = strpos($k, 'pm_');
return $is_pm !== FALSE && empty($is_pm);
}, ARRAY_FILTER_USE_KEY);
$cache->addPostValueByNameTypeAndId($order_query['p_post_name'],'postmeta', $order_query['pm_meta_id'], $value);
}
// here do the same for "woocommerce_order_items" and "woocommerce_order_itemmeta"
}
}
function outputRowData($cache)
{
$cachedPostNames = $cache->getPostNames();
$firstRow = true;
foreach($cachedPostNames as $postName) {
if(empty($postName)) { continue; }
if($firstRow === false) {
echo ',';
}
$firstRow = false;
echo '"' . $postName . '":{';
$postIds = $cache->getPostIdsByNameAndType($postName, 'posts');
if(!$postIds) {
$postIds = [];
}
// generate posts
$postValues = [];
foreach ($postIds as $postId) {
$postValues[$postId] = $cache->getPostValueByNameTypeAndId($postName, 'posts', $postId);
}
$postMetaIds = $cache->getPostIdsByNameAndType($postName, 'postmeta');
if(!$postMetaIds) {
$postMetaIds = [];
}
$postMetaValues = [];
foreach ($postMetaIds as $postMetaId) {
$postMetaValues[$postMetaId] = $cache->getPostValueByNameTypeAndId($postName, 'postmeta', $postMetaId);
}
// here do the same for "woocommerce_order_items" and "woocommerce_order_itemmeta"
echo '"posts":' . json_encode($postValues) . ',';
echo '"postmeta":' . json_encode($postMetaValues);
echo '}';
ob_flush();
flush(); // flush the output to start the download
}
}
echo '}}';
There are a lot of things you need to get right to make this work. I will list all the points I have in mind.
Termination by WebServer
If you use Apache or Nginx/PHP-FPM, both have a timeout by default for the URL being hit. So even though you have used
ini_set('memory_limit', '-1');
ini_set('max_execution_time', '-1');
set_time_limit(0);
to let the script run for a long time, Apache, Nginx and PHP-FPM each still have their own timeouts that won't allow your script to finish. So you need to fix these to get it working. You never mentioned which server you use, but Nginx + PHP-FPM will result in a 502 for sure with the default config.
Memory Usage
Even though you have used
ini_set('memory_limit', '-1');
if your memory needs rise high enough, the system may start paging and your code could become very slow.
PHP CLI or PHP Web?
Not sure what the frequency of execution is here, but if it is low you could consider running your data-dumping script through the PHP CLI instead of over HTTP. That means you would run the PHP script directly from a terminal to dump the JSON into a file, and later use a URL to download the file directly.
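A minimal sketch of that setup (export-orders.php is a hypothetical file name): guard the script so it only runs from the terminal, write the JSON to disk there, and let the webserver serve the finished file, via a URL or via X-Sendfile as described below.
// Run as: php export-orders.php
if (php_sapi_name() !== 'cli') {
    http_response_code(403);
    exit('Run this exporter from the command line.');
}
// ... build /data/generated.json here, then download it via a URL or X-Sendfile ...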
Using X-Sendfile or X-Accel-Redirect
If you are using Apache you can send the header:
header('X-Sendfile: /data/generated.json');
In the case of Nginx you can send:
header('X-Accel-Redirect: /data/generated.json');
You would only do this if you have decided to run the script over the web rather than via the CLI. When the generation of the JSON has finished, you don't want your script to read the file and serve it; you just want the webserver to take care of that.
Unbuffered query instead of WPDB Query
https://core.trac.wordpress.org/browser/tags/4.9/src/wp-includes/wp-db.php#L2480
By default a WPDB query fetches all the data into memory. But you can query the DB yourself using an unbuffered query; this will not flood the memory:
Example #1 Unbuffered query example: mysqli
<?php
$mysqli = new mysqli("localhost", "my_user", "my_password", "world");
$uresult = $mysqli->query("SELECT Name FROM City", MYSQLI_USE_RESULT);
if ($uresult) {
while ($row = $uresult->fetch_assoc()) {
echo $row['Name'] . PHP_EOL;
}
}
$uresult->close();
?>
Example #2 Unbuffered query example: pdo_mysql
<?php
$pdo = new PDO("mysql:host=localhost;dbname=world", 'my_user', 'my_pass');
$pdo->setAttribute(PDO::MYSQL_ATTR_USE_BUFFERED_QUERY, false);
$uresult = $pdo->query("SELECT Name FROM City");
if ($uresult) {
while ($row = $uresult->fetch(PDO::FETCH_ASSOC)) {
echo $row['Name'] . PHP_EOL;
}
}
?>
Example #3 Unbuffered query example: mysql
<?php
$conn = mysql_connect("localhost", "my_user", "my_pass");
$db = mysql_select_db("world");
$uresult = mysql_unbuffered_query("SELECT Name FROM City");
if ($uresult) {
while ($row = mysql_fetch_assoc($uresult)) {
echo $row['Name'] . PHP_EOL;
}
}
?>
https://secure.php.net/manual/en/mysqlinfo.concepts.buffering.php
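Putting the unbuffered query together with incremental writing, here is a rough sketch (the credentials, database name and output path are placeholders, and $sql stands for the big SELECT you already build): each row is encoded and appended to the file immediately, so only one row is in memory at a time. Note that this writes a flat array of rows; the nested grouping would still need something like the streaming-writer approach from another answer.
$mysqli = new mysqli('localhost', 'db_user', 'db_pass', 'wordpress'); // placeholders
$sql    = 'SELECT ...'; // the full orders query built earlier
$result = $mysqli->query($sql, MYSQLI_USE_RESULT); // unbuffered

$fh = fopen('/data/generated.json', 'w');
fwrite($fh, '[');
$first = true;
while ($row = $result->fetch_assoc()) {
    fwrite($fh, ($first ? '' : ',') . json_encode($row)); // one row at a time
    $first = false;
}
fwrite($fh, ']');
fclose($fh);
$result->close();
$mysqli->close();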
PS: There might be a few more points that I am missing right now; I will update this soon.
My PHP script is running out of memory with a server error "Out of memory: Kill process..." about 25% of the way through the process. Although it searches through about 10,000 lines, the number of lines that match the criteria, and therefore need to be stored and written to the file at the end of the process, is less than 200, so I am not sure why it is running out of memory.
Am I receiving this error because I am not clearing variables after each loop, or do I need to increase the memory on the server?
The process in brief is:
- LOOPA - loop through list of 400 zip codes
- using one api call for each zip - get list of all places within each zip (typically about 40-50)
-- SUBLOOP1 - for each place found, use an api call to get all events for that place
---- SUBLOOP1A loop through events to count the number for each place
$zips = file($configFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$dnis = file($dniFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$s3->registerStreamWrapper();
$file = fopen("s3://{$bucket}/{$key}", 'w') or die("Unable to open file!");
fwrite($file, $type . " id" . "\t" . $type . " name" . "\t" . "zip" . "\t" . "event count" . "\n" );
foreach($zips as $n => $zip){
//first line is the label to describe zips, so skip it
if ($n < 1) continue;
$params = $url;
$params .= "&q=" . $zip;
$more_node_pages = true;
while ($more_node_pages){
$res = fetchEvents($params);
//Now find the number of events for each place
foreach($res->data as $node){
//first check if on Do Not Include list
$countevents = true;
foreach($dnis as $dni) {
if ($dni == $node->id) {
echo "Not going to get events for ". $node->name . " id# " . $dni . "\n\n";
$countevents = false;
break;
}
}
//if it found a match, skip this and go to the next
if (!$countevents) continue;
$params = $url . $node->id . "/events/?fields=start_time.order(reverse_chronological)&limit=" . $limit . "&access_token=". $access_token;
//Count the number of valid upcoming events for that node
$event_count = 0;
$more_pages = true;
$more_events = true;
while ($more_pages) {
$evResponse = fetchEvents($params);
if (!empty($evResponse->error)) {
checkError($evResponse->error->message, $evResponse->error->code, $file);
}
//if it finds any events for that place, go through each event for that place one by one to count until you reach today
foreach($evResponse->data as $event){
if(strtotime($event->start_time) > strtotime('now')){
$event_count++;
}
//else we have reached today's events for this node, so get out of this loop, and don't retrieve any more events for this node
else {
$more_events = false;
break;
}
}
if (!empty($evResponse->paging->next) and $more_events) $params = $evResponse->paging->next;
else $more_pages = false;
} //end while loop looking for more pages with more events for that node (page)
if ($event_count > "0") {
fwrite($file, $node->id . "\t" . $node->name . "\t" . $zip . "\t" . $event_count . "\n");
echo $event_count . "\n";
}
} // loop back to the next place until done
//test to see if there is an additional page
if (!empty($res->paging->next)) $params = $res->paging->next; else $more_node_pages = false;
} //close while loop for $more_node_pages containing additional nodes for that zip
} // loop back to the next zip until done
fclose($file);
I would highly recommend adding output to the beginning of each nested loop. I think you most likely have an infinite loop, which is causing the script to run out of memory.
If that isn't the case, then you can try increasing the memory limit for your PHP script by adding this line of PHP to the top of your script:
ini_set("memory_limit", "5G");
If it takes more than 5 GB of RAM for your script to process the 400 zip codes, I would recommend breaking your script up so that you can run zip codes 0-10, then 11-20, then 21-30, etc., as sketched below.
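One simple way to do that split (a sketch, assuming the script is run from the command line; count_events.php is just a placeholder name) is to pass a start index and a count as arguments and slice the zip list before the main loop:
// Run as: php count_events.php 0 10   (then 10 10, 20 10, ...)
$startIndex = isset($argv[1]) ? (int) $argv[1] : 0;
$howMany    = isset($argv[2]) ? (int) $argv[2] : 10;
$zips = array_slice($zips, $startIndex, $howMany); // process only this slice per run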
Hope this helps, cheers.
You need to find out where the memory is being lost; then you can either take care of it or work around it. memory_get_usage() is your friend: print it at the top (or bottom) of each loop with some identifier so you can see when and where memory is being used up.
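Something as simple as the following inside each loop will show where the growth happens (a sketch against the loops in your code):
foreach ($zips as $n => $zip) {
    echo 'zip ' . $zip . ': ' . round(memory_get_usage() / 1048576, 1) . " MB\n";
    // ... rest of the existing loop body ...
}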
Background: Working with MediaWiki 1.19.1, Graphviz 2.28.0, Extension:GraphViz 0.9 on WAMP stack (Server 2008, Apache 2.4.2, MySQL 5.5.27, PHP 5.4.5). Everything is working great and as expected for the basic functionality of rendering a clickable image from a Graphviz diagram using the GraphViz extension in MediaWiki.
Problem: The links in the image map are not added to the MediaWiki pagelinks table. I get why they aren't added but it becomes an issue if there is no way to follow the links back with the 'What links here' functionality.
Desired solution: During the processing of the diagram in the GraphViz extension, I would like to use the generated .map file to then create a list of wikilinks to add on the page to get picked up by MediaWiki and added to the pagelinks table.
Details:
This GraphViz extension code:
<graphviz border='frame' format='png'>
digraph example1 {
// define nodes
nodeHello [
label="I say Hello",
URL="Hello"
]
nodeWorld [
label="You say World!",
URL="World"
]
// link nodes
nodeHello -> nodeWorld
}
</graphviz>
Generates this image:
And this image map code in a corresponding .map file on the server:
<map id="example1" name="example1">
<area shape="poly" id="node1" href="Hello" title="I say Hello" alt="" coords="164,29,161,22,151,15,137,10,118,7,97,5,77,7,58,10,43,15,34,22,31,29,34,37,43,43,58,49,77,52,97,53,118,52,137,49,151,43,161,37"/>
<area shape="poly" id="node2" href="World" title="You say World!" alt="" coords="190,125,186,118,172,111,152,106,126,103,97,101,69,103,43,106,22,111,9,118,5,125,9,133,22,139,43,145,69,148,97,149,126,148,152,145,172,139,186,133"/>
</map>
From that image map file, I would like to be able to extract the href and title to build wikilinks like so:
[[Hello|I say Hello]]
[[World|You say World!]]
I'm guessing that since that .map file is essentially XML, I could just use XPath to query the file, but that is just a guess. PHP is not my strongest area and I don't know the best way to approach the XML/XPath option, or whether that is even the best way to pull that info from the file.
Once I got that collection/array of wikilinks from the .map file, I'm sure I can hack up the GraphViz.php extension file to add it to the contents of the page to get it added to the pagelinks table.
Progress: I had a bit of a Rubber Duck Problem Solving moment right as I submitted the question. I realized that since I had well-formed data in the image map, XML parsing was probably the way to go. It was fairly trivial to pull the data I needed, especially since I found that the map file contents were still stored in a local string variable.
$xml = new SimpleXMLElement( $map );
$links = '';
foreach($xml->area as $item) {
$links .= "[[" . $item->attributes()->href . "|" . $item->attributes()->title . "]]\n";
}
Final Solution: See my accepted answer below.
Thanks for taking a look. I appreciate any assistance or direction you can offer.
I finally worked through all of the issues and now have a fairly decent solution that renders the graph nicely, provides a list of links, and registers the links with the wiki. My solution doesn't fully support all of the capabilities of the current GraphViz extension as written, since there is functionality we do not need and I do not want to support. Here are the assumptions/limitations of this solution:
Does not support MscGen: We only have a need for Graphviz.
Does not support imageAttributes: We wanted to control the format and presentation ourselves, and it seemed like there were inconsistencies in the imageAttributes implementation that would cause further support issues.
Does not support wikilinks: While it would be nice to provide consistent link usage across the wiki and the GraphViz extension, the reality is that Graphviz is a completely different markup environment. While the current extension 'supports' wikilinks, the implementation is a little weak and leaves room for confusion. For example, wikilinks allow an optional link description, but Graphviz already uses the node label for the description, so you end up ignoring the wikilink description and telling users "Yes, we support wikilinks, but don't use the description part." Since we aren't really using wikilinks correctly, we just implement regular links and try to avoid the confusion entirely.
Here is what the output looks like:
Here are the changes that were made
Comment out this line:
// We don't want to support wikilinks so don't replace them
//$timelinesrc = rewriteWikiUrls( $timelinesrc ); // if we use wiki-links we transform them to real urls
Replace this block of code:
// clean up map-name
$map = preg_replace( '#<ma(.*)>#', ' ', $map );
$map = str_replace( '</map>', '', $map );
if ( $renderer == 'mscgen' ) {
$mapbefore = $map;
$map = preg_replace( '/(\w+)\s([_:%#/\w]+)\s(\d+,\d+)\s(\d+,\d+)/',
'<area shape="$1" href="$2" title="$2" alt="$2" coords="$3,$4" />',
$map );
}
/* Produce html
*/
if ( $wgGraphVizSettings->imageFormatting )
{
$txt = imageAtrributes( $args, $storagename, $map, $outputType, $wgUploadPath ); // if we want borders/position/...
} else {
$txt = '<map name="' . $storagename . '">' . $map . '</map>' .
'<img src="' . $wgUploadPath . '/graphviz/' . $storagename . '.' . $outputType . '"' .
' usemap="#' . $storagename . '" />';
}
With this code:
$intHtml = '';
$extHtml = '';
$badHtml = '';
// Wrap the map/area info with top level nodes and load into xml object
$xmlObj = new SimpleXMLElement( $map );
// What does map look like before we start working with it?
wfDebugLog( 'graphviz', 'map before: ' . $map . "\n" );
// loop through each of the <area> nodes
foreach($xmlObj->area as $areaNode) {
wfDebugLog( 'graphviz', "areaNode: " . $areaNode->asXML() . "\n" );
// Get the data from the XML attributes
$hrefValue = (string)$areaNode->attributes()->href;
$textValue = (string)$areaNode->attributes()->title;
wfDebugLog( 'graphviz', '$hrefValue before: ' . $hrefValue . "\n" );
wfDebugLog( 'graphviz', '$textValue before: ' . $textValue . "\n" );
// For the text fields, multiple spaces (" ") in the Graphviz source (label)
// turns into a regular space followed by encoded representations of
// non-breaking spaces (" ") in the .map file which then turns
// into the following in the local variables: (" Â Â ").
// The following two options appear to convert/decode the characters
// appropriately. Leaving the lines commented out for now, as we have
// not seen a graph in the wild with multiple spaces in the label -
// just happened to stumble on the scenario.
// See http://www.php.net/manual/en/simplexmlelement.asxml.php
// and http://stackoverflow.com/questions/2050723/how-can-i-preg-replace-special-character-like-pret-a-porter
//$textValue = iconv("UTF-8", "ASCII//TRANSLIT", $textValue);
//$textValue = html_entity_decode($textValue, ENT_NOQUOTES, 'UTF-8');
// Now we need to deal with the whitespace characters like tabs and newlines
// and also deal with them correctly to replace multiple occurences.
// Unfortunately, the \n and \t values in the variable aren't actually
// tab or newline characters but literal characters '\' + 't' or '\' + 'n'.
// So the normally recommended regex '/\s+/u' to replace the whitespace
// characters does not work.
// See http://stackoverflow.com/questions/6579636/preg-replace-n-in-string
$hrefValue = preg_replace("/( |\\\\n|\\\\t)+/", ' ', $hrefValue);
$textValue = preg_replace("/( |\\\\n|\\\\t)+/", ' ', $textValue);
// check to see if the url matches any of the
// allowed protocols for external links
if ( preg_match( '/^(?:' . wfUrlProtocols() . ')/', $hrefValue ) ) {
// external link
$parser->mOutput->addExternalLink( $hrefValue );
$extHtml .= Linker::makeExternalLink( $hrefValue, $textValue ) . ', ';
}
else {
$first = substr( $hrefValue, 0, 1 );
if ( $first == '\\' || $first == '[' || $first == '/' ) {
// potential UNC path, wikilink, absolute or relative path
$hrefValue = '#InvalidLink';
$badHtml .= Linker::makeExternalLink( $hrefValue, $textValue ) . ', ';
$textValue = 'Invalid link. Check Graphviz source.';
}
else {
$title = Title::newFromText( $hrefValue );
if ( is_null( $title ) ) {
// invalid link
$hrefValue = '#InvalidLink';
$badHtml .= Linker::makeExternalLink( $hrefValue, $textValue ) . ', ';
$textValue = 'Invalid link. Check Graphviz source.';
}
else {
// internal link
$parser->mOutput->addLink( $title );
$intHtml .= Linker::link( $title, $textValue ) . ', ';
$hrefValue = $title->getFullURL();
}
}
}
$areaNode->attributes()->href = $hrefValue;
$areaNode->attributes()->title = $textValue;
}
$map = $xmlObj->asXML();
// The contents of $map, which is now XML, gets embedded
// in the HTML sent to the browser so we need to strip
// the XML version tag and we also strip the <map> because
// it will get replaced with a new one with the correct name.
$map = str_replace( '<?xml version="1.0"?>', '', $map );
$map = preg_replace( '#<ma(.*)>#', ' ', $map );
$map = str_replace( '</map>', '', $map );
// Let's see what it looks like now that we are done with it.
wfDebugLog( 'graphviz', 'map after: ' . $map . "\n" );
$txt = '' .
'<table style="background-color:#f9f9f9;border:1px solid #ddd;">' .
'<tr>' .
'<td style="border:1px solid #ddd;text-align:center;">' .
'<map name="' . $storagename . '">' . $map . '</map>' .
'<img src="' . $wgUploadPath . '/graphviz/' . $storagename . '.' . $outputType . '"' . ' usemap="#' . $storagename . '" />' .
'</td>' .
'</tr>' .
'<tr>' .
'<td style="font:10px verdana;">' .
'This Graphviz diagram links to the following pages:' .
'<br /><strong>Internal</strong>: ' . ( $intHtml != '' ? rtrim( $intHtml, ' ,' ) : '<em>none</em>' ) .
'<br /><strong>External</strong>: ' . ( $extHtml != '' ? rtrim( $extHtml, ' ,' ) : '<em>none</em>' ) .
( $badHtml != '' ? '<br /><strong>Invalid</strong>: ' . rtrim($badHtml, ' ,') .
'<br /><em>Tip: Do not use wikilinks ([]), UNC paths (\\) or relative links (/) when creating links in Graphviz diagrams.</em>' : '' ) .
'</td>' .
'</tr>' .
'</table>';
Possible enhancements:
It would be nice if the list of links below the graph were sorted and de-duplicated; a rough sketch of that is below.
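A sketch of that enhancement against the code above: collect the link HTML in arrays instead of concatenating strings, then de-dupe and sort each list before joining it into the table:
// inside the <area> loop, instead of appending to $intHtml / $extHtml:
$intLinks[] = Linker::link( $title, $textValue );
$extLinks[] = Linker::makeExternalLink( $hrefValue, $textValue );

// later, when building $txt:
$intLinks = array_unique( $intLinks );
sort( $intLinks );
$intHtml = implode( ', ', $intLinks );
// repeat for $extLinks / $extHtml and the invalid-link list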