mirror of https://github.com/sbrl/research-rainfallradar
synced 2024-11-22 09:13:01 +00:00
Fix bugs in remainder of rainfallwrangler:uniq :D
This commit is contained in:
parent 31bd7899b6
commit 82e826fd69
5 changed files with 15 additions and 9 deletions
@@ -1,7 +1,6 @@
 #!/usr/bin/env node
 "use strict";
 
-
 import cli from './cli.mjs';
 
 
@@ -47,20 +47,22 @@ class RecordUniqManager {
             .filter(filename => filename.endsWith(".jsonl.gz"))
             .map(filename => path.join(dirpath_source, filename));
         
-        l.log(`STEP [1 / 5]: Hashing files`);
+        l.log(`STEP [1 / 5]: Hashing ${files.length} files`);
+        const time_start_hash = new Date();
         await p_map(files, this.#do_single_hash.bind(this), { concurrency: this.worker_count + 10 });
-        l.log(`STEP [1 / 5]: ${this.hashes.size} hashes gathered in total.`);
+        const time_per_hash = (new Date() - time_start_hash) / this.hashes.size;
+        l.log(`STEP [1 / 5]: ${this.hashes.size} hashes gathered in total, averaging ${(1000/time_per_hash).toFixed(2)} hashes/sec.`);
         
-        l.log(`STEP [ 2 / 5 ]: Identify duplicates`);
+        // l.log(`STEP [ 2 / 5 ]: Identify duplicates`);
         const dupes = this.find_duplicates();
         this.hashes.clear(); // Save memory
         l.log(`STEP [ 2 / 5 ]: ${dupes.length} duplicate groups identified`);
         
-        l.log(`STEP [ 3 / 5 ]: Assemble deletion lists`);
+        // l.log(`STEP [ 3 / 5 ]: Assemble deletion lists`);
         const deletion_lists = this.assemble_deletion_lists(dupes);
         l.log(`STEP [ 3 / 5 ]: ${[...deletion_lists.values()].reduce((acc, next) => next.length + acc, 0)} duplicates to be deleted.`);
         
-        l.log(`STEP [ 4 / 5 ]: Delete duplicates`);
+        // l.log(`STEP [ 4 / 5 ]: Delete duplicates`);
         await p_map(
             deletion_lists.entries(),
             async (args) => await this.#do_single_delete(...args),
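For context on the new timing lines: subtracting two Date objects in JavaScript yields milliseconds, so time_per_hash is milliseconds per hash and 1000/time_per_hash is hashes per second. A standalone sketch of the same arithmetic, not part of the diff (the item count here is a hypothetical stand-in for this.hashes.size):

    // Sketch of the throughput arithmetic added above; numbers are hypothetical.
    const time_start_hash = new Date();
    // ... imagine the hashing work happening here ...
    const hashes_gathered = 5000;                  // stands in for this.hashes.size
    const elapsed_ms = new Date() - time_start_hash; // Date subtraction → milliseconds
    const time_per_hash = elapsed_ms / hashes_gathered; // ms per hash
    console.log(`averaging ${(1000 / time_per_hash).toFixed(2)} hashes/sec`);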
@@ -139,6 +141,10 @@ class RecordUniqManager {
            this.items_deleted += result.value;
        }
    }
+    
+    close() {
+        this.pool.terminate();
+    }
 }
 
 export default RecordUniqManager;
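The new close() method matters because live worker threads keep a Node.js process alive after the main work finishes. Assuming this.pool is a pool from the npm workerpool package (the diff doesn't show its construction, and the worker script path and task name below are hypothetical), the lifecycle looks roughly like this:

    // Hedged sketch, assuming the npm `workerpool` package.
    import workerpool from 'workerpool';
    
    const pool = workerpool.pool('./hash_worker.mjs'); // hypothetical worker script
    const hashes = await pool.exec('hash_records', ['input.jsonl.gz']); // hypothetical task
    console.log(hashes);
    pool.terminate(); // without this, idle workers keep the process from exiting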
@@ -8,7 +8,6 @@ import log from '../../io/NamespacedLog.mjs'; const l = log("recorduniq:worker:h
 
 export default async function(filepath) {
     const result = [];
-    
     let i = -1;
     for await(const line of records_read(filepath)) {
         i++;
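records_read() is consumed with for await, so it is presumably an async iterable over the lines of a gzipped JSONL file. A minimal sketch of such a generator built only from Node.js built-ins; the real implementation in this repo may differ:

    // Sketch of a records_read()-style async generator, assuming .jsonl.gz input.
    import fs from 'fs';
    import zlib from 'zlib';
    import readline from 'readline';
    
    async function* records_read(filepath) {
        const input = fs.createReadStream(filepath).pipe(zlib.createGunzip());
        const lines = readline.createInterface({ input, crlfDelay: Infinity });
        for await (const line of lines) {
            if(line.trim().length > 0) yield line; // skip blank lines
        }
    }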
@@ -37,9 +37,10 @@ async function records_recompress(dirpath_source, dirpath_target, items_per_file
         i++;
         i_this_file++;
         
-        if(new Date() - time_display > 2000) {
+        if(new Date() - time_display > 500) {
             const elapsed = new Date() - time_start;
-            process.stdout.write(`${pretty_ms(elapsed, { keepDecimalsOnWholeSeconds: true })} elapsed | ${i_file}/${i_this_file}/${i} files/this file/total | ${(1000 / (elapsed / i)).toFixed(2)} lines/sec | ${items_per_file - i_this_file} left for this file\r`)
+            process.stdout.write(`${pretty_ms(elapsed, { keepDecimalsOnWholeSeconds: true })} elapsed | ${i_file}/${i_this_file}/${i} files/thisfile/total | ${(1000 / (elapsed / i)).toFixed(2)} lines/sec | ${items_per_file - i_this_file} left for this file \r`);
+            time_display = new Date();
         }
     }
     await writer.close();
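This hunk is the actual rate-limit fix: time_display was never reset inside the if block, so once the first interval elapsed the status line was rewritten on every single record. Resetting it restores the intended throttle (now every 500 ms rather than 2000 ms). The pattern in isolation, with a stand-in loop body:

    // The throttled-progress pattern the fix restores.
    let time_display = new Date();
    for(let i = 0; i < 1_000_000; i++) {
        // ... process one record ...
        if(new Date() - time_display > 500) {
            process.stdout.write(`${i} records processed\r`);
            time_display = new Date(); // the missing reset this commit adds
        }
    }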
@@ -15,4 +15,5 @@ export default async function() {
     
     const uniq_manager = new RecordUniqManager(settings.count_file);
     await uniq_manager.deduplicate(settings.source, settings.target);
+    uniq_manager.close(); // Terminate the workerpool now that we're done
 }