Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: improvements to pdfreader #110

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def custom_init(self):
self.pdf_table_extraction_settings = {"join_tolerance": 4, "snap_tolerance": 4}
self.pdf_table_extraction_crop = (0, 0, 0, 0)
self.pdf_table_title_height = 0
self.pdf_page_break_top = 0
# Set this true as you play with the extraction settings and crop to view images of what the pdf parser detects
self.debug = True

Expand Down
146 changes: 146 additions & 0 deletions beancount_reds_importers/importers/mercurycards/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""Mercury Cards pdf importer for beancount."""

from datetime import datetime
import petl as etl
import re
from beancount_reds_importers.libreader import pdfreader
from beancount_reds_importers.libtransactionbuilder import banking


class Importer(banking.Importer, pdfreader.Importer):
    """Import transactions from Mercury Cards PDF statements.

    The transaction table is located on each page by searching for marker
    text (see ``get_adjusted_crop``). Because the header row is cropped out,
    column names are pushed back on in ``prepare_tables``. Statement dates
    are printed as MM/DD, so the year is derived from the statement's
    closing date (see ``fix_years``).
    """

    IMPORTER_NAME = "Mercury Cards"

    def custom_init(self):
        """Set the reader configuration for Mercury statements (runs once)."""
        if not self.custom_init_run:
            self.max_rounding_error = 0.04
            self.filename_pattern_def = "Mercury Statement *.pdf"
            self.pdf_table_extraction_settings = {
                "vertical_strategy": "text",
                "horizontal_strategy": "text",
            }
            self.pdf_table_extraction_crop = (0, 0, 0, 0)
            self.pdf_table_title_height = 0
            self.pdf_page_break_top = 0
            self.date_format = "%m/%d/%Y"
            self.transaction_table_section = "table_1"
            self.meta_text = ""
            self.skip_transaction_types = {}
            self.header_map = {
                "Post Date": "settleDate",
                "Trans Date": "date",
                "Description": "memo",
                "Reference": "reference",
                "Amount": "amount",
            }

            # payee and narration are swapped
            # We need to swap them back. See banking.py
            self.get_payee = lambda ot: ot.memo
            # setting to none to use smart importer
            self.get_narration = lambda ot: None

            # NOTE(review): debug is left enabled here; confirm this is
            # intended outside of tuning the extraction settings.
            self.debug = True
            self.custom_init_run = True

    def file_date(self, file):
        """Return the statement closing date, reading the file first if needed."""
        if not self.file_read_done:
            self.read_file(file)

        return self.get_closing_date()

    def get_closing_date(self):
        """Parse the statement closing date out of ``self.meta_text``.

        Returns:
            A ``datetime`` built from the first "Closing Date mm/dd/yyyy"
            occurrence in the statement text.

        Raises:
            ValueError: if ``self.meta_text`` is empty, or contains no
                "Closing Date" followed by a mm/dd/yyyy date.
        """
        if self.meta_text == "":
            raise ValueError("No meta_text has been found")

        # "Closing Date" followed by a date in mm/dd/yyyy format; only the
        # first occurrence is used.
        match = re.search(r"Closing Date\s+(\d{2}/\d{2}/\d{4})", self.meta_text)
        if match is None:
            raise ValueError("No closing date has been found in meta_text")

        return datetime.strptime(match.group(1), "%m/%d/%Y")

    def get_adjusted_crop(self, page_idx, page):
        """Dynamically find the crop position based on text found on the page.

        Args:
            page_idx: index of the page (unused here).
            page: page object providing ``search(text)`` and ``bbox``.

        Returns:
            A 4-tuple crop box. When "TRANSACTIONS" is not found on the page
            the placeholder ``(0, 0, 1, 1)`` is returned.
        """
        start_hits = page.search("TRANSACTIONS")
        if not start_hits:
            return (0, 0, 1, 1)

        # Offsets step past the section title and the table's own header row.
        table_start = start_hits[0]
        table_start_x = table_start["x0"] - 30
        table_start_y = table_start["bottom"] + 50

        end_hits = page.search("YEAR-TO-DATE")
        if end_hits:
            table_end_y = end_hits[0]["top"] - 10
        else:
            # if no end text is found use the whole page
            table_end_y = page.bbox[3]

        return (table_start_x, table_start_y, page.bbox[2], table_end_y)

    def fix_years(self, table):
        """Append the correct year to the MM/DD values in the date columns.

        The year comes from the statement closing date. A transaction whose
        month is later than the closing month must belong to the previous
        year (e.g. a 12/28 charge on a statement closing in January), so the
        year is rolled back by one in that case.

        Args:
            table: petl table wrapper that may contain 'Post Date' and/or
                'Trans Date' columns.

        Returns:
            The table with those columns converted to MM/DD/YYYY strings.
        """
        # Parse the closing date once rather than once per row.
        closing_date = self.get_closing_date()

        def add_year(d):
            year = closing_date.year
            try:
                month = int(str(d).split("/", 1)[0])
            except ValueError:
                # Not an MM/DD value; keep the closing year as before.
                month = None
            if month is not None and month > closing_date.month:
                year -= 1
            return f"{d}/{year}"

        for column in ("Post Date", "Trans Date"):
            if column in table.header():
                table = table.convert(column, add_year)

        return table

    def prepare_tables(self):
        """Make final adjustments to tables before processing by the transaction builder."""
        # The table's header row had to be cropped out, so set the headers here.
        headers = [
            "Post Date",
            "Trans Date",
            "Description",
            "City",
            "State",
            "Reference",
            "Amount",
        ]
        for section, table in self.alltables.items():
            table = etl.wrap(etl.pushheader(table, headers))

            # add year to mm/dd formatted date
            table = self.fix_years(table)

            table = table.rename(self.header_map)
            table = self.convert_columns(table)

            # the amounts should be negative since they're charges
            table = etl.convert(table, "amount", lambda a: a * -1)

            table = self.fix_column_names(table)
            # override prepare_processed_table to make additional adjustments
            table = self.prepare_processed_table(table)

            self.alltables[section] = table

        # NOTE(review): the combined table returned here is discarded, so
        # this call currently has no effect on self.alltables -- confirm
        # whether the result was meant to be stored.
        self.combine_tables()

    def combine_tables(self):
        """Concatenate every table in ``self.alltables`` into one petl table.

        Returns:
            The concatenated table, or ``None`` when there are no tables.
        """
        combined_table = None

        for table in self.alltables.values():
            petl_table = etl.wrap(table)
            if combined_table is None:
                # First table initializes the combined table
                combined_table = petl_table
            else:
                combined_table = etl.cat(combined_table, petl_table)

        return combined_table
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@

2024-10-05 * "Whole Foods"
Liabilities:Credit-Cards:Mercury -15.01 USD

2024-10-06 * "Car Wash"
Liabilities:Credit-Cards:Mercury -35.30 USD

2024-10-07 * "Taco Bell"
Liabilities:Credit-Cards:Mercury -22.76 USD

2024-10-07 * "Mcdonald's"
Liabilities:Credit-Cards:Mercury -10.11 USD

2024-10-08 * "Papa John's"
Liabilities:Credit-Cards:Mercury -30.16 USD

2024-10-10 * "Paypal"
Liabilities:Credit-Cards:Mercury -33.97 USD

2024-10-11 * "Amazon"
Liabilities:Credit-Cards:Mercury -101.55 USD

2024-10-11 * "Wm Supercenter"
Liabilities:Credit-Cards:Mercury -53.44 USD

2024-10-12 * "Amazon"
Liabilities:Credit-Cards:Mercury -204.32 USD

2024-10-12 * "Amazon"
Liabilities:Credit-Cards:Mercury -4.90 USD

2024-10-12 * "Target"
Liabilities:Credit-Cards:Mercury -10.90 USD

2024-10-13 * "Bp"
Liabilities:Credit-Cards:Mercury -106.95 USD

2024-10-14 * "Circle K"
Liabilities:Credit-Cards:Mercury -50.69 USD

2024-10-15 * "Amazon"
Liabilities:Credit-Cards:Mercury -10.11 USD

2024-10-15 * "Doordash"
Liabilities:Credit-Cards:Mercury -73.82 USD

2024-10-16 * "Mcdonald's"
Liabilities:Credit-Cards:Mercury -10.68 USD

2024-10-16 * "7-Eleven"
Liabilities:Credit-Cards:Mercury -10.11 USD

2024-10-17 * "Advance Auto Parts"
Liabilities:Credit-Cards:Mercury -13.86 USD

2024-10-18 * "Mcdonald's"
Liabilities:Credit-Cards:Mercury -15.11 USD

2024-10-18 * "Papa John's"
Liabilities:Credit-Cards:Mercury -31.29 USD

2024-10-18 * "Mcdonald's"
Liabilities:Credit-Cards:Mercury -12.62 USD

2024-10-19 * "Amazon Prime"
Liabilities:Credit-Cards:Mercury -23.52 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -30.46 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -44.21 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -23.57 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -22.24 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -69.23 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -60.00 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -57.07 USD

2024-10-19 * "Amazon"
Liabilities:Credit-Cards:Mercury -224.05 USD

2024-10-20 * "Whole Foods"
Liabilities:Credit-Cards:Mercury -6.39 USD

2024-10-21 * "Doordash"
Liabilities:Credit-Cards:Mercury -79.56 USD

2024-10-21 * "Amazon"
Liabilities:Credit-Cards:Mercury -20.24 USD

2024-10-22 * "Papa John's"
Liabilities:Credit-Cards:Mercury -37.85 USD

2024-10-22 * "Mcdonald's"
Liabilities:Credit-Cards:Mercury -10.11 USD

2024-10-22 * "Racetrac"
Liabilities:Credit-Cards:Mercury -32.90 USD

2024-10-23 * "Mcdonald's"
Liabilities:Credit-Cards:Mercury -10.11 USD

2024-10-24 * "Doordash"
Liabilities:Credit-Cards:Mercury -28.41 USD

2024-10-24 * "Doordash"
Liabilities:Credit-Cards:Mercury -40.83 USD

2024-10-25 * "Doordash"
Liabilities:Credit-Cards:Mercury -68.35 USD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Liabilities:Credit-Cards:Mercury
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2024-11-05T00:00:00
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
mercury_statement_20241105.pdf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from os import path

from beancount.ingest import regression_pytest as regtest

from beancount_reds_importers.importers import mercurycards


@regtest.with_importer(
    mercurycards.Importer(
        {
            "main_account": "Liabilities:Credit-Cards:Mercury",
            "emit_filing_account_metadata": False,
            "filename_pattern": "mercury_statement_20241105.pdf",
            "skip_transaction_types": {},
            "currency": "USD",
        }
    )
)
@regtest.with_testdir(path.dirname(__file__))
class TestMercuryCards(regtest.ImporterTestBase):
    """Regression tests for the Mercury Cards importer.

    The test cases come from the ``regtest`` decorators applied above, using
    the directory that contains this file as the test directory.
    """
3 changes: 2 additions & 1 deletion beancount_reds_importers/libreader/csv_multitable_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def file_date(self, file):
pass

def convert_columns(self, rdr):
    """Convert columns for a single table.

    Can be called from prepare_tables for each table; the conversion itself
    is delegated to the parent class implementation.
    """
    return super().convert_columns(rdr)

def is_section_title(self, row):
# Match against rows that contain section titles. Eg: 'section1', 'section2', ...
Expand Down
Loading
Loading