From bfb837d4f7eb7290cf5ccc4fc64f71ae1e19f7ed Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Tue, 9 Jul 2024 22:31:25 +0100 Subject: [PATCH] Initial commit --- .gitignore | 161 ++++++++++++++++++++ README.md | 7 + package-lock.json | 318 +++++++++++++++++++++++++++++++++++++++ package.json | 18 +++ src/index.css | 39 +++++ src/index.mjs | 55 +++++++ src/index.sh | 64 ++++++++ src/lib/ai-extract.mjs | 56 +++++++ src/lib/make-html.mjs | 26 ++++ src/lib/parse-result.mjs | 22 +++ src/template.html | 48 ++++++ 11 files changed, 814 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 package-lock.json create mode 100644 package.json create mode 100644 src/index.css create mode 100755 src/index.mjs create mode 100755 src/index.sh create mode 100644 src/lib/ai-extract.mjs create mode 100644 src/lib/make-html.mjs create mode 100644 src/lib/parse-result.mjs create mode 100644 src/template.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..98883e7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,161 @@ +*.html +!src/*.html +# Created by https://www.toptal.com/developers/gitignore/api/git,node +# Edit at https://www.toptal.com/developers/gitignore?templates=git,node + +### Git ### +# Created by git for backups. To disable backups in Git: +# $ git config --global mergetool.keepBackup false +*.orig + +# Created by git when using merge tools for conflicts +*.BACKUP.* +*.BASE.* +*.LOCAL.* +*.REMOTE.* +*_BACKUP_*.txt +*_BASE_*.txt +*_LOCAL_*.txt +*_REMOTE_*.txt + +### Node ### +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +lerna-debug.log* +.pnpm-debug.log* + +# Diagnostic reports (https://nodejs.org/api/report.html) +report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Directory for instrumented libs generated by jscoverage/JSCover +lib-cov + +# Coverage directory used by tools like istanbul +coverage +*.lcov + +# nyc test coverage +.nyc_output + +# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) +.grunt + +# Bower dependency directory (https://bower.io/) +bower_components + +# node-waf configuration +.lock-wscript + +# Compiled binary addons (https://nodejs.org/api/addons.html) +build/Release + +# Dependency directories +node_modules/ +jspm_packages/ + +# Snowpack dependency directory (https://snowpack.dev/) +web_modules/ + +# TypeScript cache +*.tsbuildinfo + +# Optional npm cache directory +.npm + +# Optional eslint cache +.eslintcache + +# Optional stylelint cache +.stylelintcache + +# Microbundle cache +.rpt2_cache/ +.rts2_cache_cjs/ +.rts2_cache_es/ +.rts2_cache_umd/ + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variable files +.env +.env.development.local +.env.test.local +.env.production.local +.env.local + +# parcel-bundler cache (https://parceljs.org/) +.cache +.parcel-cache + +# Next.js build output +.next +out + +# Nuxt.js build / generate output +.nuxt +dist + +# Gatsby files +.cache/ +# Comment in the public line in if your project uses Gatsby and not Next.js +# https://nextjs.org/blog/next-9-1#public-directory-support +# public + +# vuepress build output +.vuepress/dist + +# vuepress v2.x temp and cache directory +.temp + +# Docusaurus cache and generated files +.docusaurus + +# Serverless directories +.serverless/ + +# FuseBox cache +.fusebox/ + +# DynamoDB Local files +.dynamodb/ + +# TernJS port file +.tern-port + +# Stores VSCode versions used for testing VSCode extensions +.vscode-test + +# yarn v2 +.yarn/cache +.yarn/unplugged +.yarn/build-state.yml +.yarn/install-state.gz +.pnp.* + +### Node Patch ### +# Serverless Webpack directories +.webpack/ + +# Optional stylelint cache + +# SvelteKit build / generate output +.svelte-kit + +# End of https://www.toptal.com/developers/gitignore/api/git,node diff --git a/README.md b/README.md new file mode 100644 index 0000000..8c79d80 --- /dev/null +++ b/README.md @@ -0,0 +1,7 @@ +# receipt-parser + + + + +## System Requirements +- `libtesseract-dev` \ No newline at end of file diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..55147d1 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,318 @@ +{ + "name": "receipt-parser", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "receipt-parser", + "version": "1.0.0", + "license": "GPL-3.0", + "dependencies": { + "@anthropic-ai/sdk": "^0.24.3", + "nightink": "^1.0.2", + "p-throttle": "^6.1.0", + "pdf-to-text": "^0.0.7", + "tqdm": "^2.0.3" + } + }, + "node_modules/@anthropic-ai/sdk": { + "version": "0.24.3", + "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.24.3.tgz", + "integrity": "sha512-916wJXO6T6k8R6BAAcLhLPv/pnLGy7YSEBZXZ1XTFbLcTZE8oTy3oDW9WJf9KKZwMvVcePIfoTSvzXHRcGxkQQ==", + "license": "MIT", + "dependencies": { + "@types/node": "^18.11.18", + "@types/node-fetch": "^2.6.4", + "abort-controller": "^3.0.0", + "agentkeepalive": "^4.2.1", + "form-data-encoder": "1.7.2", + "formdata-node": "^4.3.2", + "node-fetch": "^2.6.7", + "web-streams-polyfill": "^3.2.1" + } + }, + "node_modules/@types/node": { + "version": "18.19.39", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.39.tgz", + "integrity": "sha512-nPwTRDKUctxw3di5b4TfT3I0sWDiWoPQCZjXhvdkINntwr8lcoVCKsTgnXeRubKIlfnV+eN/HYk6Jb40tbcEAQ==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } + }, + "node_modules/@types/node-fetch": { + "version": "2.6.11", + "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.11.tgz", + "integrity": "sha512-24xFj9R5+rfQJLRyM56qh+wnVSYhyXC2tkoBndtY0U+vubqNsYXGjufB2nn8Q6gt0LrARwL6UBtMCSVCwl4B1g==", + "license": "MIT", + "dependencies": { + "@types/node": "*", + "form-data": "^4.0.0" + } + }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "license": "MIT", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, + "node_modules/agentkeepalive": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz", + "integrity": "sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==", + "license": "MIT", + "dependencies": { + "humanize-ms": "^1.2.1" + }, + "engines": { + "node": ">= 8.0.0" + } + }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "license": "MIT" + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "license": "MIT", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/form-data": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", + "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/form-data-encoder": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz", + "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==", + "license": "MIT" + }, + "node_modules/formdata-node": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz", + "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==", + "license": "MIT", + "dependencies": { + "node-domexception": "1.0.0", + "web-streams-polyfill": "4.0.0-beta.3" + }, + "engines": { + "node": ">= 12.20" + } + }, + "node_modules/formdata-node/node_modules/web-streams-polyfill": { + "version": "4.0.0-beta.3", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz", + "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, + "node_modules/html-entities": { + "version": "2.5.2", + "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.5.2.tgz", + "integrity": "sha512-K//PSRMQk4FZ78Kyau+mZurHn3FH0Vwr+H36eE0rPbeYkRRi9YxceYPhuN60UwWorxyKHhqoAJl2OFKa4BVtaA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/mdevils" + }, + { + "type": "patreon", + "url": "https://patreon.com/mdevils" + } + ], + "license": "MIT" + }, + "node_modules/humanize-ms": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz", + "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==", + "license": "MIT", + "dependencies": { + "ms": "^2.0.0" + } + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "license": "MIT", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "license": "MIT" + }, + "node_modules/nightink": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/nightink/-/nightink-1.0.2.tgz", + "integrity": "sha512-4m/WBimAdO0TkUOF+UFBR9U09UTF+Y/G+PAMWarQmq7wUxclSPkA/hyi9wq4RUxJleV7PvN782YTQ4BgS7Vd/w==", + "license": "MPL-2.0", + "dependencies": { + "html-entities": "^2.3.2" + } + }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "engines": { + "node": ">=10.5.0" + } + }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/p-throttle": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/p-throttle/-/p-throttle-6.1.0.tgz", + "integrity": "sha512-eQMdGTxk2+047La67wefUtt0tEHh7D+C8Jl7QXoFCuIiNYeQ9zWs2AZiJdIAs72rSXZ06t11me2bgalRNdy3SQ==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/pdf-to-text": { + "version": "0.0.7", + "resolved": "https://registry.npmjs.org/pdf-to-text/-/pdf-to-text-0.0.7.tgz", + "integrity": "sha512-NHWB7u/9q+SZ28UtEgJYljamp61j06oldHdvGik1729pzRFLCO4igbZwm0MOUWoIQUz4nla3n+cf3Jh7uiOZwQ==", + "license": "ISC" + }, + "node_modules/tqdm": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/tqdm/-/tqdm-2.0.3.tgz", + "integrity": "sha512-Ju50G550gspkjd1AiJ/jFBHe2dii9s+KPntEsq0o73BqywqzNWPUM8/FD3zM1rOH7OGLoH7pGSGI90Ct+Yd/5Q==", + "license": "ISC" + }, + "node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "license": "MIT" + }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, + "node_modules/web-streams-polyfill": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz", + "integrity": "sha512-d2JWLCivmZYTSIoge9MsgFCZrt571BikcWGYkjC1khllbTeDlGqZ2D8vD8E/lJa8WGWbb7Plm8/XJYV7IJHZZw==", + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "license": "BSD-2-Clause" + }, + "node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..d2fe103 --- /dev/null +++ b/package.json @@ -0,0 +1,18 @@ +{ + "name": "receipt-parser", + "version": "1.0.0", + "description": "Extracts and processes receipts.", + "main": "src/index.mjs", + "scripts": { + "test": "echo \"no tests implemented\"" + }, + "author": "Starbeamrainbowlabs", + "license": "GPL-3.0", + "dependencies": { + "@anthropic-ai/sdk": "^0.24.3", + "nightink": "^1.0.2", + "p-throttle": "^6.1.0", + "pdf-to-text": "^0.0.7", + "tqdm": "^2.0.3" + } +} diff --git a/src/index.css b/src/index.css new file mode 100644 index 0000000..da8be58 --- /dev/null +++ b/src/index.css @@ -0,0 +1,39 @@ +html, body { font-size: 100%; } + +body { + font-family: sans-serif; +} + +h1 { + text-align: center; +} + +table { + width: 100%; + border-collapse: collapse; + margin-bottom: 1rem; +} + +th, td { + padding: 0.75rem; + text-align: left; + border-bottom: 1px solid #ddd; +} + +th { + background-color: #f2f2f2; + font-weight: bold; +} + +tr:nth-child(even) { + background-color: #f8f8f8; +} + +tr:hover { + background-color: #e8e8e8; +} + + +.align-right { + text-align: right; +} \ No newline at end of file diff --git a/src/index.mjs b/src/index.mjs new file mode 100755 index 0000000..862dfa4 --- /dev/null +++ b/src/index.mjs @@ -0,0 +1,55 @@ +#!/usr/bin/env node +"use strict"; + +import path from 'path'; +import fs from 'fs'; + +import tqdm from 'tqdm'; +import pThrottle from 'p-throttle'; + +import ai_extract from './lib/ai-extract.mjs'; +import make_html from './lib/make-html.mjs'; + +// HACK: Make sure __dirname is defined when using es6 modules. I forget where I found this - a PR with a source URL would be great! +const __dirname = import.meta.url.slice(7, import.meta.url.lastIndexOf("/")); + +// const filepaths = process.argv.slice(2).map(f => path.resolve(process.cwd(), f)); +const filepaths = fs.readFileSync(process.argv.slice(2)[0], "utf-8") + .split(`\n`) + .map(el => el.trim()) + .filter(el => el.length > 0); + +console.log(`Hello from Node.js`); + +console.log(`>>> FILEPATHS`); +console.log(filepaths.join(`\n`)); + +const txts = await Promise.all(filepaths.map(filepath => fs.promises.readFile(filepath, "utf-8"))); + + +const throttler = pThrottle({ + limit: 40, + interval: 60 * 1000 // every minute +}); +const throttled = throttler(ai_extract); + +const objs = []; +let i = 0; +for(const txt of tqdm(txts, { total: txts.length })) { + console.log(`>>> AI > ${i}`); + + const result = await throttled(txt, true); + result.i = i; + + console.log(result); + + objs.push(result); + + i++; +} + +const html = make_html(objs); +const filepath_out = path.join(process.cwd(), `report.html`); +fs.writeFileSync(filepath_out, html); + +console.log(`Written output to ${filepath_out}`); diff --git a/src/index.sh b/src/index.sh new file mode 100755 index 0000000..f05317d --- /dev/null +++ b/src/index.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +__dirname="$(dirname "$0")"; + +temp_dir="$(mktemp --tmpdir -d "receipt-parse-XXXXXXX")"; +if [[ -z "${temp_dir}" ]]; then + echo "Error: Failed to get temporary directory" >&2; + exit 1; +fi + +on_exit() { + rm -rf "${temp_dir}"; +} +trap on_exit EXIT; + +mkdir "${temp_dir}/tmp"; +mkdir "${temp_dir}/txt"; + +echo ">>> Text extraction"; + +filepath_filelist="${temp_dir}/files.txt"; + +i=0; +for filename in "$@"; do + echo "Processing ${filename}"; + filepath="$(realpath -s "${filename}")"; + ext="$(echo "${filename#*.}" | tr '[:upper:]' '[:lower:]')"; + + ### + ## Convert to text + ### + if [[ "${ext}" == "pdf" ]]; then + filepath_out="${temp_dir}/tmp/out.txt"; + pdftotext "${filepath}" "${filepath_out}"; + elif [[ "${ext}" == "txt" ]]; then + echo "skipping ${filename} because it's a text file"; + continue + else + tesseract "${filepath}" "${temp_dir}/tmp/out"; + filepath_out="$(find "${temp_dir}/tmp" -type f | head -n1)"; + fi + + ### + ## Move out of tmp dir + ### + filepath_final="${temp_dir}/txt/${i}.txt"; + mv "${filepath_out}" "${filepath_final}"; + i="$((i+1))"; + + ### + ## Clean up tmp dir ready for the next file + ### + set +e; + rm -rf "${temp_dir}/tmp/*"; + set -e; + + ### + ## Log the final filepath + ### + echo "${filepath_final}" >>"${filepath_filelist}"; +done + +echo ">>> Running Node.js"; +"${__dirname}/index.mjs" "${filepath_filelist}"; \ No newline at end of file diff --git a/src/lib/ai-extract.mjs b/src/lib/ai-extract.mjs new file mode 100644 index 0000000..0750942 --- /dev/null +++ b/src/lib/ai-extract.mjs @@ -0,0 +1,56 @@ +import Anthropic from "@anthropic-ai/sdk"; + +import parse_result from "./parse-result.mjs"; + +const prompt = `The input is a receipt or invoice. The user is a theatre production company. Output only a JSON object and nothing else with the following properties: + +date (string): the date the item was purchased, formatted as an iso date. If not present, set this value to null. + +item_name (string): The name of the item purchased. Where there are multiple items, summarise them with a single name. Shorten excessively long product names. + +paid (number): The total amount that was paid + +vat (number): The total amount of VAT paid. If this value is NOT present, set the JSON property vat_percent to the VAT percentage. If this is not present either, set the value to null. + +category (string): Estimate a single category for the item purchased from the following options: Props, Consumables, Technical, Travel, Logistics, Costume, Wellbeing, Health/Medical, Training`; + +const anthropic = new Anthropic({ + apiKey: process.env['ANTHROPIC_API_KEY'], // the default, apparently +}); + +export default async function ai_extract(text, pretend=false) { + if(pretend) { + console.log(`WARNING: IN PRETEND MODE.`); + return parse_result({ + date: "2024-06-25", + item_name: "3M Picture/Strip, Gale Decoration Kit, T&G Matt Paint", + paid: 26.95, + vat_percent: 20, + category: "Props" + }); + } + const msg = await anthropic.messages.create({ + model: "claude-3-haiku-20240307", + max_tokens: 1000, + temperature: 0, + system: prompt, + messages: [ + { + "role": "user", + "content": [ + { + "type": "text", + text + } + ] + } + ] + }); + + const response = msg.content[0].text; + + const obj = JSON.parse(response); + console.log(`AI OUTPUT:`, response, `OBJ`, obj); + + return parse_result(obj); +} \ No newline at end of file diff --git a/src/lib/make-html.mjs b/src/lib/make-html.mjs new file mode 100644 index 0000000..75c5452 --- /dev/null +++ b/src/lib/make-html.mjs @@ -0,0 +1,26 @@ +"use strict"; + +import path from 'path'; +import fs from 'fs'; + +import { NightInk } from 'nightink'; + +// HACK: Make sure __dirname is defined when using es6 modules. I forget where I found this - a PR with a source URL would be great! +const __dirname = import.meta.url.slice(7, import.meta.url.lastIndexOf("/")); + +const template = fs.readFileSync(path.join(__dirname, `../template.html`), `utf-8`); +const css = fs.readFileSync(path.join(__dirname, `../index.css`), `utf-8`); + +export default function(objs) { + const total = objs.reduce((acc, obj) => acc + obj.paid, 0); + + const values = { + objs, + total, + css + }; + + console.debug(`VALUES`, values); + + return NightInk(template, values); +} \ No newline at end of file diff --git a/src/lib/parse-result.mjs b/src/lib/parse-result.mjs new file mode 100644 index 0000000..84d7b56 --- /dev/null +++ b/src/lib/parse-result.mjs @@ -0,0 +1,22 @@ +"use strict"; + +export default function parse_result(obj) { + if(typeof obj.vat === "undefined") { + if(typeof obj.vat_percent === "undefined") { + obj.vat = null; + } + else { + if(obj.vat_percent > 1) + obj.vat_percent /= 100; + obj.vat = Math.round(obj.paid * obj.vat_percent * 100) / 100; + } + } + + if(obj.vat !== null) + obj.net = obj.paid - obj.vat; + + if(typeof obj.net === "undefined") + obj.net = null; + + return obj; // daisy chain! :D +} \ No newline at end of file diff --git a/src/template.html b/src/template.html new file mode 100644 index 0000000..2088bf1 --- /dev/null +++ b/src/template.html @@ -0,0 +1,48 @@ + + + + + Expense Report + + +

Project name Expense Report

+ + + + + + + + + + + + + + + + {#each objs} + + + + + + + + + + {#endeach} + + +
#DateCategoryDescriptionNetVATTotal
{{i}}{{date}}{{category}}{{item_name}}{{net}}{{vat}}{{paid}}
+ +
+ All total: {{total}} +
+ + + + + \ No newline at end of file