From 0328fbe90c556d5f0ef23d2855b3ebbe8d64d392 Mon Sep 17 00:00:00 2001 From: Starbeamrainbowlabs Date: Tue, 9 Jul 2024 23:12:06 +0100 Subject: [PATCH] finish script --- CONTRIBUTING.md | 12 +++++++++ README.md | 56 +++++++++++++++++++++++++++++++++++++-- src/index.mjs | 16 +++++++++-- src/lib/ai-extract.mjs | 18 +++++++++---- src/lib/make-html.mjs | 3 ++- src/lib/trim-specific.mjs | 12 +++++++++ 6 files changed, 107 insertions(+), 10 deletions(-) create mode 100644 CONTRIBUTING.md create mode 100644 src/lib/trim-specific.mjs diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..270eb34 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,12 @@ +# Contributors guide +Contributions are very welcome and very much encouraged - both issues and pull requests! Please mention in your pull request that you release your work under the MPL-2.0 (see below). + +Due to life stuff at times it may take up to 2 weeks at times for me to respond to issues and pull requests. If it has been a few days and you have not received a reply, consider joining [the Gitter/Matrix chat from a larger project of mine](https://app.gitter.im/#/room/#Pepperminty-Wiki_Lobby:gitter.im) and poking me on there. + +I recommend checking on your pull request or issue every few days for a reply. + +If you open a pull request, I will review your changes and reply with a review detailing some changes I would like you to make. It may take a few back and forth comments, but then once I am happy with your changes I'll accept them and merge them into the codebase. This way we can polish your contribution to make it fit in with the existing codebase better ✨ + +It is also unlikely, but possible I may reject your changes. If this is the case, I will leave a comment explaining why. + +If you are no longer interested in continuing to work with a pull request or will be away for a while, please leave a comment. Nobody will be offended! 
If you do not leave a comment or do not respond for 2 weeks, I may take over your pull request, work on it, and merge it myself. diff --git a/README.md b/README.md index be7ffb7..f804a71 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,58 @@ # receipt-parser -> Receipt/invoice parser. Takes a list of PDF/images → pdftotext/tesseract → Anthropic Claude API for extraction → contenteditable HTML. +> Receipt/invoice parser. Takes a list of PDF/images → pdftotext/tesseract → Anthropic Claude API for extraction → contenteditable HTML. We use the Claude Haiku model to reduce cost while maintaining high accuracy. + +The receipt-parser is a tool that takes PDF files or images of receipts and invoices as input, and extracts the relevant information using optical character recognition (OCR) and the Anthropic Claude API. The extracted data is then presented in an easy-to-read, editable HTML format. ## System Requirements -- `libtesseract-dev` +- Node.js +- Bash +- Tesseract (`sudo apt install tesseract-ocr`) +- Anthropic API key: available from the Anthropic Console (https://console.anthropic.com/) + +## Usage +First, clone the repository: `git clone https://github.com/your-username/receipt-parser.git` + +Then, install the required dependencies: `npm install` + +Obtain an Anthropic API key from the Anthropic Console (https://console.anthropic.com/) and make it available in the `ANTHROPIC_API_KEY` environment variable. + +Run the script like so: + +```bash +./index.sh /path/to/receipt1.pdf /path/to/receipt2.pdf ... +``` + +...it takes a list of files. + +The script will process all PDF and image (anything Tesseract supports) files to: + +- Extract text using OCR and Tesseract +- Convert that into a machine-readable JSON object with the Anthropic Claude API +- Generate an HTML file with the extracted data for each input file. + +### Nautilus Script +To register the script as a Nautilus script (for easy right-click access in the file manager), follow these steps: + +From the root of this repository, run this command: + +```bash +ln -s $(pwd)/src/index.sh ~/.local/share/nautilus/scripts/parse-receipts +``` + +Then, restart Nautilus by running `nautilus -q` in the terminal. 
+ +After restarting Nautilus, you should be able to right-click on any PDF or image file and select "Scripts" > "parse-receipts" to run the receipt parser on the selected files. + + +## Contributing +Contributions are very welcome - both issues and pull requests! Please mention in your pull request that you release your work under the MPL-2.0 (see below). + +See [CONTRIBUTING.md](./CONTRIBUTING.md) for a guide on what to expect when submitting a pull request or issue to this project. + +If you're feeling that way inclined, the sponsor button at the top of the page (if you're on GitHub) will take you to my [Liberapay profile](https://liberapay.com/sbrl) if you'd like to donate to say an extra thank you :-) + + +## License +This project is released under the GNU Public License 3.0. The full license text is included in the `LICENSE` file in this repository. Tldr legal have a [great summary](https://www.tldrlegal.com/license/gnu-general-public-license-v3-gpl-3) of the license if you're interested. + diff --git a/src/index.mjs b/src/index.mjs index 862dfa4..f037333 100755 --- a/src/index.mjs +++ b/src/index.mjs @@ -35,10 +35,22 @@ const throttled = throttler(ai_extract); const objs = []; let i = 0; -for(const txt of tqdm(txts, { total: txts.length })) { +for(let txt of tqdm(txts, { total: txts.length })) { console.log(`>>> AI > ${i}`); - const result = await throttled(txt, true); + txt = txt.trim(); + if(txt.length === 0) { + objs.push({ + date: "unknown", + item_name: "unknown", + paid: null, + vat: null, + category: "Unknown" + }); + continue; + } + + const result = await throttled(txt, false); // bool is whether we're pretend or not - i.e. 
not making anthropic/claude api calls result.i = i; console.log(result); diff --git a/src/lib/ai-extract.mjs b/src/lib/ai-extract.mjs index 0750942..4b87186 100644 --- a/src/lib/ai-extract.mjs +++ b/src/lib/ai-extract.mjs @@ -1,8 +1,9 @@ import Anthropic from "@anthropic-ai/sdk"; import parse_result from "./parse-result.mjs"; +import trim_specific from "./trim-specific.mjs"; -const prompt = `The input is a receipt or invoice. The user is a theatre production company. Output only a JSON object and nothing else with the following properties: +const prompt = `The input is a receipt, invoice or parking/bus ticket. The user is a theatre production company. Output only a single JSON object and NOT an array and nothing else with the following properties: date (string): the date the item was purchased, formatted as an iso date. If not present, set this value to null. @@ -12,7 +13,9 @@ paid (number): The total amount that was paid vat (number): The total amount of VAT paid. If this value is NOT present, set the JSON property vat_percent to the VAT percentage. If this is not present either, set the value to null. 
-category (string): Estimate a single category for the item purchased from the following options: Props, Consumables, Technical, Travel, Logistics, Costume, Wellbeing, Health/Medical, Training`; +category (string): Estimate a single category for the item purchased from the following options: Props, Consumables, Technical, Travel, Logistics, Costume, Wellbeing, Health/Medical, Training + +No matter what your output MUST be valid JSON.`; const anthropic = new Anthropic({ apiKey: process.env['ANTHROPIC_API_KEY'], // the default, apparently @@ -46,11 +49,16 @@ export default async function ai_extract(text, pretend=false) { } ] }); + console.log(`\n\n\n\n\n\n`); + console.log(`--------------------------------------`); + let response = msg.content[0].text; + console.log(`AI OUTPUT:`, response); - const response = msg.content[0].text; - + console.log(`--------------------------------------`); + response = trim_specific(response, `,`); const obj = JSON.parse(response); - console.log(`AI OUTPUT:`, response, `OBJ`, obj); + console.log(`AI OUTPUT: OBJ`, obj); + console.log(`--------------------------------------`); return parse_result(obj); } \ No newline at end of file diff --git a/src/lib/make-html.mjs b/src/lib/make-html.mjs index 75c5452..265a0a6 100644 --- a/src/lib/make-html.mjs +++ b/src/lib/make-html.mjs @@ -12,7 +12,8 @@ const template = fs.readFileSync(path.join(__dirname, `../template.html`), `utf- const css = fs.readFileSync(path.join(__dirname, `../index.css`), `utf-8`); export default function(objs) { - const total = objs.reduce((acc, obj) => acc + obj.paid, 0); + let total = objs.reduce((acc, obj) => acc + obj.paid, 0); + total = Math.round(total * 100) / 100; const values = { objs, diff --git a/src/lib/trim-specific.mjs b/src/lib/trim-specific.mjs new file mode 100644 index 0000000..7de5245 --- /dev/null +++ b/src/lib/trim-specific.mjs @@ -0,0 +1,12 @@ +"use strict"; + +function escapeRegexp(s) { + return s.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); +} + 
/**
 * Strips every leading and trailing occurrence of the given characters
 * from a string.
 *
 * `find` is treated as a set of individual characters (UTF-16 code units),
 * not as a substring: `trim_specific("xyabcyx", "xy")` returns `"abc"`.
 * Characters in the interior of `value` are never touched.
 *
 * Implemented as a two-pointer scan rather than a `.`-based regular
 * expression: `.` does not match line terminators, so a regex such as
 * `^[,]*(.*?)[,]*$` silently fails to match (and therefore trims nothing)
 * whenever `value` contains a newline.
 *
 * @param {string} value - The string to trim.
 * @param {string} find - The characters to strip from both ends of `value`.
 *   An empty string strips nothing.
 * @returns {string} `value` without the leading/trailing `find` characters.
 */
function trim_specific(value, find) {
	let start = 0;
	let end = value.length;

	// Advance past leading characters that belong to the trim set.
	while (start < end && find.includes(value[start])) {
		start++;
	}
	// Retreat past trailing characters, never crossing `start`, so a value
	// made up entirely of trim characters collapses to "".
	while (end > start && find.includes(value[end - 1])) {
		end--;
	}

	return value.slice(start, end);
}

export default trim_specific;