Mirror of https://github.com/sbrl/research-rainfallradar (synced 2024-11-22 01:12:59 +00:00)
write glue for .jsonl.gz → .tfrecord.gz converter
Commit 222a6146ec (parent f3652edf82)
6 changed files with 79 additions and 25 deletions
@@ -1 +1,2 @@
 tensorflow>=2.4
+silence_tensorflow
json2tfrecord.py (filename inferred from the spawn() call later in this commit):

@@ -6,6 +6,9 @@ import gzip
 import json
 import argparse
 
+from silence_tensorflow import silence_tensorflow
+if not os.environ.get("NO_SILENCE"):
+	silence_tensorflow()
 import tensorflow as tf
 
 # TO PARSE:
@@ -59,7 +62,7 @@ def convert(filepath_in, filepath_out):
 	## 3: Print shape definitions (required when parsing)
 	###
 	if i == 0:
-		print("SHAPES\t"+json.dumps({ "rainfallradar": rainfall.shape.as_list(), "waterdepth": water.shape.as_list() }))
+		print("SHAPES\t"+json.dumps({ "rainfallradar": rainfall.shape.as_list(), "waterdepth": water.shape.as_list() }), flush=True)
 	
 	###
 	## 4: Serialise tensors
@@ -76,7 +79,8 @@ def convert(filepath_in, filepath_out):
 		}))
 		writer.write(record.SerializeToString())
 		
-		print(i)
+		print(f"{i}", flush=True)
 	
 	
+
 def main():
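Together, the json2tfrecord.py changes define the line protocol the Python child speaks over stdout: a single SHAPES metadata line for the first record, then one line number per serialised record. The flush=True additions matter because Python block-buffers stdout when it is attached to a pipe rather than a TTY, so without them the Node.js parent reading the pipe would stall until the buffer filled. Illustrative output (the tensor shapes here are invented for the example):

SHAPES	{"rainfallradar": [100, 100], "waterdepth": [100, 100]}
0
1
2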
py_jsonl2tfrecord.mjs (filename inferred from the import in jsonl_to_tf.mjs below):

@@ -1,45 +1,54 @@
 "use strict";
 
 import fs from 'fs';
+import path from 'path';
 import child_process from 'child_process';
+import { Readable } from 'stream';
 
-import nexline from nexline;
+import nexline from 'nexline';
 
-import log from './NamespacedLog.mjs'; const l = log("gzipchildprocess");
-import { end_safe } from './StreamHelpers.mjs';
-import { fstat } from 'fs';
+import log from '../io/NamespacedLog.mjs'; const l = log("gzipchildprocess");
+// import { end_safe } from '../io/StreamHelpers.mjs';
 
+function snore(ms) {
+	return new Promise((resolve, _reject) => setTimeout(resolve, ms));
+}
 
 const __dirname = import.meta.url.slice(7, import.meta.url.lastIndexOf("/"));
 
 async function* py_jsonl2tfrecord(filepath_source, filepath_target, filepath_meta=null) {
-	// get stdin() { return this.child_process.stdin; }
-	// get stdout() { return this.child_process.stdout; }
-	// get stderr() { return this.child_process.stderr; }
+	// get stdin() { return this.converter.stdin; }
+	// get stdout() { return this.converter.stdout; }
+	// get stderr() { return this.converter.stderr; }
 	
+	const env = {}; Object.assign(env, process.env);
+	if(filepath_meta !== null) env["NO_SILENCE"] = "NO_SILENCE";
 	
-	child_process = child_process.spawn(
+	const converter = child_process.spawn(
 		"python3", [
 			path.join(__dirname, "json2tfrecord.py"),
 			"--input", filepath_source,
 			"--output", filepath_target
 		], { // TODO: detect binary - python3 vs python
 			// Pipe stdin + stdout; send error to the parent process
-			stdio: [ "ignore", "pipe", "inherit" ]
+			stdio: [ "ignore", "pipe", "inherit" ],
+			env
 		}
 	);
+	// converter.stdout.on("data", (chunk) => console.log(`DEBUG chunk`, chunk));
 	
-	const reader = nexline({ input: child_process.stdout });
+	const reader = nexline({ input: new Readable().wrap(converter.stdout) });
 	
 	for await(const line of reader) {
-		if(line.startsWith("SHAPE") && filepath_meta !== null ) {
-			await fs.promises.writeFile(
-				filepath_meta,
-				line.split(/\t+/)[1]
-			);
+		if(line.startsWith("SHAPES\t")) {
+			if(filepath_meta !== null) {
+				await fs.promises.writeFile(
+					filepath_meta,
+					line.split(/\t+/)[1]
+				);
+			}
 			continue;
 		}
 		
 		yield parseInt(line, 10);
 	}
 }
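For context, the reworked generator yields one line number per record converted, and is consumed downstream in jsonl_to_tf.mjs (next hunks). A minimal standalone usage sketch, with illustrative file paths:

import py_jsonl2tfrecord from './py_jsonl2tfrecord.mjs';

// Convert one file; metadata.json receives the JSON payload of the SHAPES line
for await (const line_number of py_jsonl2tfrecord(
	"data/0.jsonl.gz",       // illustrative input path
	"data/0.tfrecord.gz",    // illustrative output path
	"data/metadata.json"     // pass null to skip writing shape metadata
)) {
	console.log(`serialised record ${line_number}`);
}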
jsonl_to_tf.mjs (filename inferred from the import in the new subcommand below):

@@ -2,6 +2,7 @@
 
 import fs from 'fs';
 import path from 'path';
+import os from 'os';
 import p_map from 'p-map';
 import pretty_ms from 'pretty-ms';
 
@@ -9,19 +10,28 @@ import debounce from '../async/debounce.mjs';
 import py_jsonl2tfrecord from '../python/py_jsonl2tfrecord.mjs';
 import log from '../../lib/io/NamespacedLog.mjs'; const l = log("jsonl2tf");
 
 
+/**
+ * Converts a directory of .jsonl.gz files to .tfrecord.gz files.
+ * @param {string} dirpath_source The source directory to read from.
+ * @param {string} dirpath_target The target directory to write to.
+ * @return {void}
+ */
 export default async function(dirpath_source, dirpath_target) {
-	const files = fs.promises.readdir(dirpath_source);
+	const files = await fs.promises.readdir(dirpath_source);
 	
 	let time_start = new Date(), lines_processed = 0, files_complete = 0;
 	
-	const update_progress = debounce(() => {
+	const update_progress_force = () => {
 		process.stdout.write(`${files_complete}/${lines_processed} files/lines complete | ${((new Date() - time_start) / lines_processed).toFixed(3)} lines/sec | ${((files_processed / files.length)*100).toFixed(2)}% complete\r`);
-	});
+	};
+	const update_progress = debounce(update_progress_force);
 	
-	p_map(files, async (filename, i) => {
+	await p_map(files, async (filename, i) => {
 		const filepath_source = path.join(dirpath_source, filename);
-		const filepath_dest = path.join(dirpath_target, filename);
+		const filepath_dest = path.join(dirpath_target, filename.replace(/\.jsonl\.gz$/, ".tfrecord.gz"));
 		const filepath_meta = i === 0 ? path.join(dirpath_target, `metadata.json`) : null;
+		l.log(`start ${i} | ${filename} | META ${filepath_meta}`);
 		let time_start = new Date(), lines_done = 0;
 		for await (let line_number of py_jsonl2tfrecord(filepath_source, filepath_dest, filepath_meta)) {
 			lines_processed++;
@@ -30,5 +40,7 @@ export default async function(dirpath_source, dirpath_target) {
 		}
 		files_complete++;
 		l.log(`converted ${filename}: ${lines_done} lines @ ${pretty_ms((new Date() - time_start) / lines_done)}`);
-	});
+	}, { concurrency: os.cpus().length });
+	update_progress_force();
+	l.log(`complete: ${lines_processed}/${files_complete} lines/files processed in ${pretty_ms(new Date() - time_start)}`);
 }
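debounce.mjs itself is not part of this diff, so the exact semantics of debounce() are an assumption here. A minimal trailing-edge debounce compatible with the `debounce(update_progress_force)` construction above might look like this (hypothetical sketch, not the project's implementation):

"use strict";

// Hypothetical: collapse a burst of calls into a single call `delay` ms after the last one
export default function debounce(fn, delay = 500) {
	let timeout = null;
	return (...args) => {
		if(timeout !== null) clearTimeout(timeout);
		timeout = setTimeout(() => {
			timeout = null;
			fn(...args);
		}, delay);
	};
}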
New file:

@@ -0,0 +1,21 @@
+"use strict";
+
+import fs from 'fs';
+
+import settings from "../../settings.mjs";
+
+import jsonl_to_tf from '../../lib/record/jsonl_to_tf.mjs';
+
+export default async function() {
+	if(typeof settings.source !== "string")
+		throw new Error(`Error: No source directory specified (see the --source CLI argument)`);
+	if(typeof settings.target !== "string")
+		throw new Error(`Error: No target directory specified (see the --target CLI argument)`);
+	
+	if(!fs.existsSync(settings.source))
+		throw new Error(`Error: The source directory at '${settings.source}' doesn't exist or you haven't got permission to access it.`);
+	if(!fs.existsSync(settings.target))
+		await fs.promises.mkdir(settings.target);
+	
+	await jsonl_to_tf(settings.source, settings.target);
+}
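One caveat in the new subcommand: fs.promises.mkdir(settings.target) with no options throws if the target's parent directories are missing. If nested target paths should be supported (an assumption — this commit doesn't address it), Node's recursive flag would cover it:

// Hypothetical hardening: also create any missing parent directories
await fs.promises.mkdir(settings.target, { recursive: true });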
rainfallwrangler/src/subcommands/jsonl2tfrecord/meta.mjs (new file, 7 lines):
@@ -0,0 +1,7 @@
+"use strict";
+
+export default function(cli) {
+	cli.subcommand("jsonl2tfrecord", "Convert a directory of .jsonl.gz files to .tfrecord.gz files.")
+		.argument("source", "Path to the source directory.", null, "string")
+		.argument("target", "Path to the target directory.", null, "string");
+}
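Assuming the project's CLI wrapper picks up this meta.mjs registration (the entrypoint isn't shown in this commit), invocation would look something like:

# Hypothetical invocation — the exact entrypoint depends on rainfallwrangler's CLI setup
node src/index.mjs jsonl2tfrecord --source path/to/jsonl_dir --target path/to/tfrecord_dir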