diff --git a/rainfallwrangler/slurm-recordify.job b/rainfallwrangler/slurm-recordify.job
new file mode 100755
index 0000000..c7c97e8
--- /dev/null
+++ b/rainfallwrangler/slurm-recordify.job
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+#SBATCH -J RW2jsonl
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH -o %j.%N.%a.out
+#SBATCH -e %j.%N.%a.err
+#SBATCH -p compute
+#SBATCH --time=3-00:00:00
+#SBATCH --mem=8192
+# 8GB RAM
+
+module load utilities/multi
+module load readline/7.0
+module load gcc/10.2.0
+
+# module load cuda/11.5.0
+
+module load python/anaconda/4.6/miniconda/3.7
+
+RAINFALL="${RAINFALL:-$HOME/data/nimrod_ceda.jsonl.gz}";
+WATER="${WATER:-$HOME/data/WaterDepths-new.stream.asc.gz}";
+OUTPUT="${OUTPUT}";
+COUNT_FILE="${COUNT_FILE:-4096}";
+
+if [[ -z "${WATER}" ]]; then
+	echo "Error: No input water depth file specified in the WATER environment variable.";
+	exit 1;
+fi
+if [[ -z "${RAINFALL}" ]]; then
+	echo "Error: No input rainfall file specified in the RAINFALL environment variable.";
+	exit 1;
+fi
+
+if [[ -z "${OUTPUT}" ]]; then
+	echo "Error: No output directory specified in the OUTPUT environment variable.";
+	exit 1;
+fi
+
+if [[ ! -d "${OUTPUT}" ]]; then
+	echo "Error: The output directory either doesn't exist, isn't a directory, or we don't have permission to access it.";
+	exit 3;
+fi
+
+export PATH=$HOME/software/bin:$PATH;
+
+
+OUTPUT_UNIQ="${OUTPUT%/}_uniq"; # Strip trailing slash, if present
+OUTPUT_TFRECORD="${OUTPUT%/}_tfrecord"; # Strip trailing slash, if present
+
+echo ">>> Settings";
+
+echo "RAINFALL $RAINFALL";
+echo "WATER $WATER";
+echo "OUTPUT $OUTPUT";
+echo "ARGS $ARGS";
+
+echo ">>> Installing requirements";
+cd ../aimodel || { echo "Error: Failed to cd to ai model directory"; exit 1; };
+conda run -n py38 pip install -r requirements.txt;
+cd ../rainfallwrangler || { echo "Error: Failed to cd back to rainfallwrangler directory"; exit 1; };
+npm install;
+echo ">>> Converting dataset to .jsonl.gz";
+/usr/bin/env time -v src/index.mjs recordify --verbose --rainfall "${RAINFALL}" --water "${WATER}" --output "${OUTPUT}" --count-file "${COUNT_FILE}" ${ARGS};
+echo ">>> Deduplicating dataset";
+# This also automatically recompresses for us - hence the source/target rather than in-place
+srun --comment 'RainUniq' --exclusive -p compute /usr/bin/env time -v src/index.mjs uniq --source "${OUTPUT}" --target "${OUTPUT_UNIQ}" --count-file "${COUNT_FILE}";
+echo ">>> Removing intermediate output";
+rm -r "${OUTPUT}";
+echo ">>> Queuing .jsonl.gz → tfrecord";
+INPUT="${OUTPUT_UNIQ}" OUTPUT="${OUTPUT_TFRECORD}" sbatch ./slurm-jsonl2tfrecord.job;
+echo ">>> exited with code $?";
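--
A minimal submission sketch, under the assumptions the patch itself makes: RAINFALL and WATER fall back to defaults, only OUTPUT must be supplied and must already exist, and the script cds relative to rainfallwrangler/, so it is submitted from inside that directory. sbatch exports the caller's environment by default, so the inline variable assignment reaches the job. The output path below is a hypothetical placeholder.

    cd rainfallwrangler
    mkdir -p "$HOME/data/records"   # hypothetical output location; must exist before submission
    OUTPUT="$HOME/data/records" sbatch ./slurm-recordify.job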