Skip to content

Commit

Permalink
feat: improvements to pdfreader
Browse files Browse the repository at this point in the history
Refactored pdfreader to be more extensible and added a mercurycards importer which uses the pdfreader to read credit card transactions.
  • Loading branch information
gary-roach committed Nov 22, 2024
1 parent 738b0ea commit ad320e0
Show file tree
Hide file tree
Showing 10 changed files with 454 additions and 94 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def custom_init(self):
self.pdf_table_extraction_settings = {"join_tolerance": 4, "snap_tolerance": 4}
self.pdf_table_extraction_crop = (0, 0, 0, 0)
self.pdf_table_title_height = 0
self.pdf_page_break_top = 0
# Set this true as you play with the extraction settings and crop to view images of what the pdf parser detects
self.debug = True

Expand Down
163 changes: 163 additions & 0 deletions beancount_reds_importers/importers/mercurycards/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
"""Mercury Cards pdf importer for beancount."""

import re
from datetime import datetime

import petl as etl

from beancount_reds_importers.libreader import pdfreader
from beancount_reds_importers.libtransactionbuilder import banking


class Importer(banking.Importer, pdfreader.Importer):
    """Import credit card transactions from Mercury Cards PDF statements.

    The transaction table is located on each page by searching for marker
    text, the column headers are supplied here because they are cropped out
    of the extracted table, and the MM/DD dates on the statement are
    completed with a year derived from the statement's closing date.
    """

    IMPORTER_NAME = "Mercury Cards"

    def custom_init(self):
        """Set the Mercury-specific reader defaults.

        The body runs only while ``self.custom_init_run`` is falsy, and sets
        that flag when it finishes.
        """
        if self.custom_init_run:
            return

        self.max_rounding_error = 0.04
        self.filename_pattern_def = "Mercury Statement *.pdf"
        self.pdf_table_extraction_settings = {
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }
        self.pdf_table_extraction_crop = (0, 0, 0, 0)
        self.pdf_table_title_height = 0
        self.pdf_page_break_top = 0
        self.date_format = "%m/%d/%Y"
        self.transaction_table_section = "table_1"
        self.meta_text = ""
        self.skip_transaction_types = {}
        # Maps the statement's column names to the field names used after
        # the rename in prepare_tables().
        self.header_map = {
            "Post Date": "settleDate",
            "Trans Date": "date",
            "Description": "memo",
            "Reference": "reference",
            "Amount": "amount",
        }

        # payee and narration are swapped
        # We need to swap them back. See banking.py
        self.get_payee = lambda ot: ot.memo
        self.get_narration = lambda ot: None  # setting to none to use smart importer

        # Debug output is a tuning aid for the extraction settings and crop;
        # keep it off by default and enable it only while adjusting those.
        self.debug = False
        self.custom_init_run = True

    def file_date(self, file):
        """Return the statement's closing date, reading the file first if needed."""
        if not self.file_read_done:
            self.read_file(file)

        return self.get_closing_date()

    def get_closing_date(self):
        """Return the closing date found in ``self.meta_text`` as a datetime.

        The first occurrence of "Closing Date" followed by an mm/dd/yyyy date
        is used.

        Raises:
            ValueError: if ``self.meta_text`` is empty, or if it contains no
                "Closing Date mm/dd/yyyy" entry.
        """
        if self.meta_text == "":
            raise ValueError("No meta_text has been found")

        # "Closing Date" followed by a date in mm/dd/yyyy format
        match = re.search(r"Closing Date\s+(\d{2}/\d{2}/\d{4})", self.meta_text)
        if match is None:
            # Previously this surfaced as an opaque IndexError.
            raise ValueError("No closing date has been found in meta_text")

        return datetime.strptime(match.group(1), "%m/%d/%Y")

    def get_adjusted_crop(self, page_idx, page):
        """Dynamically find the crop position based on text found on the page.

        The crop starts just below the first "TRANSACTIONS" match and ends
        just above the first "YEAR-TO-DATE" match, or at the bottom of the
        page when that end marker is absent. Returns ``(0, 0, 1, 1)`` when
        the page has no "TRANSACTIONS" match. ``page_idx`` is not used here.
        """
        start_matches = page.search("TRANSACTIONS")
        if not start_matches:
            return (0, 0, 1, 1)

        table_start = start_matches[0]
        table_start_x = table_start["x0"] - 30
        table_start_y = table_start["bottom"] + 50

        end_matches = page.search("YEAR-TO-DATE")
        if end_matches:
            table_end_y = end_matches[0]["top"] - 10
        else:
            table_end_y = page.bbox[3]  # if no end text is found use the whole page

        return (table_start_x, table_start_y, page.bbox[2], table_end_y)

    def fix_years(self, table):
        """Append the correct year to the MM/DD values of the date columns.

        The year comes from the statement closing date. A transaction whose
        month is later than the closing month cannot belong to the closing
        year (a statement holds no future transactions), so it is assigned
        the previous year; this covers statements spanning a year boundary.
        Values whose month cannot be parsed get the closing year unchanged.
        """

        def add_year(d):
            # Resolved per value, when the table is actually iterated, so the
            # closing date is only required once rows are read.
            closing_date = self.get_closing_date()
            year = closing_date.year
            try:
                month = int(str(d).split("/")[0])
            except ValueError:
                month = None
            if month is not None and month > closing_date.month:
                year -= 1
            return f"{d}/{year}"

        for date_header in ("Post Date", "Trans Date"):
            if date_header in table.header():
                table = table.convert(date_header, add_year)

        return table

    def prepare_tables(self):
        """Make final adjustments to tables before processing by the transaction builder."""
        # The header row is cropped out of the extracted tables, so the
        # column names are supplied here.
        headers = [
            "Post Date",
            "Trans Date",
            "Description",
            "City",
            "State",
            "Reference",
            "Amount",
        ]

        for section, table in self.alltables.items():
            table = etl.wrap(etl.pushheader(table, headers))

            # add year to mm/dd formatted date
            table = self.fix_years(table)

            table = table.rename(self.header_map)
            table = self.convert_columns(table)

            # the amounts should be negative since they're charges
            table = etl.convert(table, "amount", lambda a: a * -1)

            table = self.fix_column_names(table)
            # override prepare_processed_table to make additional adjustments
            table = self.prepare_processed_table(table)

            self.alltables[section] = table

        # NOTE(review): the combined table returned here is discarded, so this
        # call currently has no visible effect on self.alltables - confirm
        # where the result is meant to be stored.
        self.combine_tables()
        return

    def combine_tables(self):
        """Return every table in ``self.alltables`` concatenated into one.

        Returns ``None`` when there are no tables.
        """
        combined_table = None

        for table in self.alltables.values():
            petl_table = etl.wrap(table)

            if combined_table is None:
                # First table initializes the combined table
                combined_table = petl_table
            else:
                # Concatenate additional tables
                combined_table = etl.cat(combined_table, petl_table)

        return combined_table
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@

2024-10-05 * "Whole Foods"
Liabilities:Credit-Cards:Mercury -15.01 USD

2024-10-06 * "Car Wash"
Liabilities:Credit-Cards:Mercury -35.30 USD

2024-10-07 * "Taco Bell"
Liabilities:Credit-Cards:Mercury -22.76 USD

2024-10-07 * "Mcdonald's"
Liabilities:Credit-Cards:Mercury -10.11 USD

2024-10-08 * "Papa John's"
Liabilities:Credit-Cards:Mercury -30.16 USD

2024-10-10 * "Paypal"
Liabilities:Credit-Cards:Mercury -33.97 USD

2024-10-11 * "Amazon"
Liabilities:Credit-Cards:Mercury -101.55 USD

2024-10-11 * "Wm Supercenter"
Liabilities:Credit-Cards:Mercury -53.44 USD

2024-10-12 * "Amazon"
Liabilities:Credit-Cards:Mercury -204.32 USD

2024-10-12 * "Amazon"
Liabilities:Credit-Cards:Mercury -4.90 USD

2024-10-12 * "Target"
Liabilities:Credit-Cards:Mercury -10.90 USD

2024-10-13 * "Bp"
Liabilities:Credit-Cards:Mercury -106.95 USD

2024-10-14 * "Circle K"
Liabilities:Credit-Cards:Mercury -50.69 USD

2024-10-15 * "Amazon"
Liabilities:Credit-Cards:Mercury -10.11 USD

2024-10-15 * "Doordash"
Liabilities:Credit-Cards:Mercury -73.82 USD

2024-10-16 * "Mcdonald's"
Liabilities:Credit-Cards:Mercury -10.68 USD

2024-10-16 * "7-Eleven"
Liabilities:Credit-Cards:Mercury -10.11 USD

2024-10-17 * "Advance Auto Parts"
Liabilities:Credit-Cards:Mercury -13.86 USD

2024-10-18 * "Mcdonald's"
Liabilities:Credit-Cards:Mercury -15.11 USD

2024-10-18 * "Papa John's"
Liabilities:Credit-Cards:Mercury -31.29 USD

2024-10-18 * "Mcdonald's"
Liabilities:Credit-Cards:Mercury -12.62 USD

2024-10-19 * "Amazon Prime"
Liabilities:Credit-Cards:Mercury -23.52 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -30.46 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -44.21 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -23.57 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -22.24 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -69.23 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -60.00 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -57.07 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -224.05 USD

2024-10-20 * "Whole Foods"
Liabilities:Credit-Cards:Mercury -6.39 USD

2024-10-21 * "Doordash"
Liabilities:Credit-Cards:Mercury -79.56 USD

2024-10-21 * "Amazon"
Liabilities:Credit-Cards:Mercury -20.24 USD

2024-10-22 * "Papa John's"
Liabilities:Credit-Cards:Mercury -37.85 USD

2024-10-22 * "Mcdonald's"
Liabilities:Credit-Cards:Mercury -10.11 USD

2024-10-22 * "Racetrac"
Liabilities:Credit-Cards:Mercury -32.90 USD

2024-10-23 * "Mcdonald's"
Liabilities:Credit-Cards:Mercury -10.11 USD

2024-10-24 * "Doordash"
Liabilities:Credit-Cards:Mercury -28.41 USD

2024-10-24 * "Doordash"
Liabilities:Credit-Cards:Mercury -40.83 USD

2024-10-25 * "Doordash"
Liabilities:Credit-Cards:Mercury -68.35 USD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Liabilities:Credit-Cards:Mercury
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2024-11-05T00:00:00
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
mercury_statement_20241105.pdf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from os import path

from beancount.ingest import regression_pytest as regtest

from beancount_reds_importers.importers import mercurycards


@regtest.with_importer(
    mercurycards.Importer(
        {
            "main_account": "Liabilities:Credit-Cards:Mercury",
            "emit_filing_account_metadata": False,
            "filename_pattern": "mercury_statement_20241105.pdf",
            "skip_transaction_types": {},
            "currency": "USD",
        }
    )
)
@regtest.with_testdir(path.dirname(__file__))
class TestMercuryCards(regtest.ImporterTestBase):
    """Regression test for the Mercury Cards importer over this directory's fixtures."""
3 changes: 2 additions & 1 deletion beancount_reds_importers/libreader/csv_multitable_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def file_date(self, file):
pass

def convert_columns(self, rdr):
    """Convert columns for a single table by delegating to the parent class.

    Can be called from prepare_tables for each table.
    """
    return super().convert_columns(rdr)

def is_section_title(self, row):
# Match against rows that contain section titles. Eg: 'section1', 'section2', ...
Expand Down
Loading

0 comments on commit ad320e0

Please sign in to comment.