Skip to content

Commit

Permalink
add digitization performance tracker
Browse files Browse the repository at this point in the history
add jdbc.jar
  • Loading branch information
bturkus committed May 1, 2024
1 parent 771e87c commit fa9ca84
Show file tree
Hide file tree
Showing 2 changed files with 265 additions and 0 deletions.
265 changes: 265 additions & 0 deletions ami_scripts/digitization_performance_tracker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
#!/usr/bin/env python3

import argparse
import jaydebeapi
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from hurry.filesize import size
import numpy as np
import datetime
from matplotlib.backends.backend_pdf import PdfPages


# Print DataFrames in full: lift the caps on rows, columns, total width
# and per-cell width so console output is never truncated.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

def get_args(argv=None):
    """Parse the command-line options for the report generator.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, so existing
            ``get_args()`` callers behave exactly as before.

    Returns:
        argparse.Namespace with two attributes:
            fiscal: True when ``-f/--fiscal`` was given, else False.
            engineer: list of the names given to ``-e/--engineer``,
                or None when the flag is absent.
    """
    parser = argparse.ArgumentParser(
        description='Generate Production Stats & Cool Visualizations from AMIDB'
    )
    parser.add_argument(
        '-f', '--fiscal',
        action='store_true',
        help='organize stats and visualizations by fiscal year instead of calendar year',
    )
    parser.add_argument(
        '-e', '--engineer',
        nargs='+',
        help='Filter output by specific engineers (last names).',
    )
    return parser.parse_args(argv)


def fetch_data_from_jdbc():
    """Fetch the metadata table from AMIDB over JDBC into a DataFrame.

    Connection settings are read from the environment variables
    ``FM_SERVER``, ``AMI_DATABASE``, ``AMI_DATABASE_USERNAME`` and
    ``AMI_DATABASE_PASSWORD``. The FileMaker JDBC driver jar is expected
    under ``~/Desktop/ami-preservation/ami_scripts/jdbc/fmjdbc.jar``.

    Returns:
        pandas.DataFrame with one column per selected field. An empty
        DataFrame (no columns) is returned when a required environment
        variable is unset, when connecting or querying fails, or when the
        query yields no rows. Failures are reported on stdout rather than
        raised.
    """
    env_vars = ('FM_SERVER', 'AMI_DATABASE',
                'AMI_DATABASE_USERNAME', 'AMI_DATABASE_PASSWORD')
    settings = {name: os.getenv(name) for name in env_vars}

    # Fail early with a clear message instead of building a
    # "jdbc:filemaker://None/None" URL from unset variables.
    unset = [name for name, value in settings.items() if value is None]
    if unset:
        print("Failed to connect or execute query: "
              f"environment variable(s) not set: {', '.join(unset)}")
        return pd.DataFrame()

    server_ip = settings['FM_SERVER']
    database_name = settings['AMI_DATABASE']
    username = settings['AMI_DATABASE_USERNAME']
    password = settings['AMI_DATABASE_PASSWORD']

    # Dynamically set the JDBC path
    jdbc_path = os.path.expanduser('~/Desktop/ami-preservation/ami_scripts/jdbc/fmjdbc.jar')

    fields = (
        'bibliographic.primaryID',
        'technical.dateCreated',
        'technical.fileFormat',
        'technical.fileSize.measure',
        'technical.durationMilli.measure',
        'asset.fileRole',
        'digitizer.operator.lastName',
        'bibliographic.vernacularDivisionCode',
        'source.object.format',
        'source.object.type',
    )
    # Field names contain dots, so each one is double-quoted in the SQL.
    query = 'SELECT ' + ', '.join(f'"{name}"' for name in fields) + ' FROM tbl_metadata'

    conn = None
    curs = None
    df = pd.DataFrame()  # Default empty DataFrame in case of issues

    try:
        conn = jaydebeapi.connect(
            'com.filemaker.jdbc.Driver',
            f'jdbc:filemaker://{server_ip}/{database_name}',
            [username, password],
            jdbc_path
        )
        print("Connection to AMIDB successful!")
        print("Now Fetching Data (Expect 2-3 minutes)")

        curs = conn.cursor()
        curs.execute(query)

        columns = [desc[0] for desc in curs.description]
        rows = curs.fetchall()
        # Build the frame straight from the row sequences; an empty result
        # keeps the column-less default frame.
        if rows:
            df = pd.DataFrame(rows, columns=columns)

        print("Data fetched successfully!")
        print(f"Total records fetched: {len(df)}")

    except Exception as e:
        # Best effort: report the problem and fall through to return
        # whatever frame we have (normally the empty default).
        print(f"Failed to connect or execute query: {e}")

    finally:
        # Release the cursor before the connection; a failure to close
        # must not discard data that was already fetched.
        for resource in (curs, conn):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception as e:
                print(f"Warning: failed to close {type(resource).__name__}: {e}")

    return df


def get_fiscal_year(date):
    """Return the fiscal-year label for a date, e.g. ``"FY24"``.

    Dates in July through December are assigned to the following calendar
    year's fiscal year; January through June stay in their own calendar
    year. The label is "FY" followed by the year with its first two
    characters dropped (2025 -> "FY25").

    Args:
        date: Any object exposing ``.year`` and ``.month`` (for example
            ``datetime.date``, ``datetime.datetime`` or
            ``pandas.Timestamp``), or a missing value (``None`` /
            ``pandas.NaT``).

    Returns:
        The label string, or ``None`` when ``date`` is missing. Without
        this guard a ``NaT`` would yield the meaningless label "FYn",
        because its ``.year`` is NaN.
    """
    if pd.isna(date):
        return None

    fiscal_year = date.year + 1 if date.month >= 7 else date.year
    return f"FY{str(fiscal_year)[2:]}"

def process_data(df, args, fiscal=False):
    """Parse creation dates and add year/month columns used for grouping.

    Args:
        df: DataFrame holding at least 'technical.dateCreated' (and
            'digitizer.operator.lastName' when an engineer filter is
            used). The date column is converted in place.
        args: Parsed CLI namespace; ``args.engineer`` is an optional list
            of operator last names to keep.
        fiscal: Accepted for interface compatibility; not used here,
            because both the calendar and fiscal columns are always added.

    Returns:
        The DataFrame (a filtered copy when ``args.engineer`` is set) with
        'calendar_year', 'fiscal_year' and 'month' ("YYYY-MM") columns
        added. When the date column is missing - e.g. the empty frame
        produced by a failed fetch - ``df`` is returned unchanged so the
        caller can report the problem instead of hitting a KeyError.
    """
    date_col = 'technical.dateCreated'
    operator_col = 'digitizer.operator.lastName'

    if date_col not in df.columns:
        print(f"\nThe '{date_col}' field is not present in the DataFrame. Skipping date processing.\n")
        return df

    def convert_date(date_str):
        # Values arrive either as YYYY-MM-DD or as M/D/Y; anything that
        # fits neither pattern becomes NaT.
        if "-" in str(date_str):
            return pd.to_datetime(date_str, format='%Y-%m-%d', errors='coerce')
        return pd.to_datetime(date_str, format='%m/%d/%Y', errors='coerce')

    # The outer to_datetime guarantees a datetime dtype even for a
    # zero-row column, so the .dt accessors below are always available.
    df[date_col] = pd.to_datetime(df[date_col].apply(convert_date), errors='coerce')

    # Filter by engineer if specified; copy so the new columns are added
    # to an independent frame rather than a view of the input.
    if args.engineer:
        df = df[df[operator_col].isin(args.engineer)].copy()

    created = df[date_col]
    df['calendar_year'] = created.dt.year
    df['fiscal_year'] = created.apply(get_fiscal_year)
    df['month'] = created.dt.strftime('%Y-%m')  # Year-Month format

    return df


def display_monthly_output_by_operator(df, args, fiscal=False):
    """Print and plot this year's monthly item counts per operator.

    Rows are restricted to those whose 'asset.fileRole' is 'pm' and whose
    year column matches the current year, then unique
    'bibliographic.primaryID' values are counted per operator and month.

    Args:
        df: Processed DataFrame; must contain the operator, file role,
            primary ID, 'month' and selected year columns.
        args: Parsed CLI namespace; ``args.engineer`` optionally limits
            the operators included.
        fiscal: Use 'fiscal_year' (labels such as "FY24") instead of
            'calendar_year' to select the current year.

    Returns:
        ``(output_by_operator, current_year)`` where the first item is a
        DataFrame with operator, 'month' and count columns (possibly
        empty), or ``(None, None)`` when a required column is missing.
    """
    operator_col = 'digitizer.operator.lastName'
    id_col = 'bibliographic.primaryID'
    year_column = 'fiscal_year' if fiscal else 'calendar_year'

    # Check every column used below, not just the operator field, so a
    # partially populated frame is reported instead of raising KeyError.
    required = (operator_col, 'asset.fileRole', id_col, 'month', year_column)
    missing = [name for name in required if name not in df.columns]
    if missing:
        print(f"\nRequired field(s) not present in the DataFrame: {', '.join(missing)}. Skipping the function.\n")
        return None, None

    now = datetime.datetime.now()
    current_year = get_fiscal_year(now) if fiscal else now.year

    # Keep only 'pm' rows from the current year (and selected engineers).
    keep = (df['asset.fileRole'] == 'pm') & (df[year_column] == current_year)
    if args.engineer:
        keep &= df[operator_col].isin(args.engineer)
    df_pm_current_year = df[keep]

    # Group by digitizer and month
    output_by_operator = (
        df_pm_current_year
        .groupby([operator_col, 'month'])
        .agg({id_col: 'nunique'})
        .reset_index()
    )

    # Report the empty case before printing or plotting anything.
    if output_by_operator.empty:
        print(f"No data available for the current {'fiscal' if fiscal else 'calendar'} year.")
        return output_by_operator, current_year

    # Append one 'Total' row per operator for the console summary only;
    # the returned frame keeps just the monthly rows.
    output_sum = output_by_operator.groupby(operator_col)[id_col].sum().reset_index()
    output_sum['month'] = 'Total'
    print(pd.concat([output_by_operator, output_sum], ignore_index=True))

    # Plotting
    sns.set_style("whitegrid")
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=output_by_operator, x='month', y=id_col,
                 hue=operator_col, marker='o', linewidth=2)
    plt.title(f'Monthly Digitization Output by Operator (PM role only) - {"Fiscal" if fiscal else "Calendar"} Year: {current_year}')
    plt.xlabel('Month')
    plt.ylabel('Items Digitized')
    plt.xticks(rotation=45)
    plt.legend(title='Digitizer')
    plt.tight_layout()
    plt.show()

    return output_by_operator, current_year


def plot_object_format_counts(df, args, fiscal=False, top_n=10):
    """Plot the most common source object formats for the current year.

    Rows are restricted to those whose 'asset.fileRole' is 'pm' and whose
    year column matches the current year; unique
    'bibliographic.primaryID' values are then counted per
    'source.object.format' and the ``top_n`` largest are charted.

    Args:
        df: Processed DataFrame; must contain the operator, file role,
            primary ID, source format and selected year columns.
        args: Parsed CLI namespace; ``args.engineer`` optionally limits
            the operators included.
        fiscal: Use 'fiscal_year' instead of 'calendar_year' to select
            the current year.
        top_n: Maximum number of formats to include.

    Returns:
        DataFrame with 'Format' and 'Count' columns (possibly empty), or
        ``None`` when a required column is missing.
    """
    operator_col = 'digitizer.operator.lastName'
    id_col = 'bibliographic.primaryID'
    year_column = 'fiscal_year' if fiscal else 'calendar_year'

    # Check every column used below, not just the operator field.
    required = (operator_col, 'asset.fileRole', id_col, 'source.object.format', year_column)
    missing = [name for name in required if name not in df.columns]
    if missing:
        print(f"\nRequired field(s) not present in the DataFrame: {', '.join(missing)}. Skipping the function.\n")
        return None

    now = datetime.datetime.now()
    current_year = get_fiscal_year(now) if fiscal else now.year

    keep = (df['asset.fileRole'] == 'pm') & (df[year_column] == current_year)
    if args.engineer:
        keep &= df[operator_col].isin(args.engineer)

    format_counts = (
        df[keep]
        .groupby('source.object.format')[id_col]
        .nunique()
        .nlargest(top_n)
        .reset_index()
    )
    format_counts.columns = ['Format', 'Count']

    # Nothing to chart: report it rather than drawing an empty figure.
    if format_counts.empty:
        print(f"No source object format data available for the current {'fiscal' if fiscal else 'calendar'} year.")
        return format_counts

    # Plotting with annotations
    fig, ax = plt.subplots(figsize=(15, 6))
    sns.barplot(x='Format', y='Count', data=format_counts, palette='viridis', ax=ax)
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_xlabel('Source Object Format')
    ax.set_ylabel('Count')
    # Title reports how many formats are actually shown, which can be
    # fewer than top_n.
    ax.set_title(f'Top {len(format_counts)} Counts of Source Object Formats in {current_year}',
                 fontsize=16, fontweight='bold')
    fig.subplots_adjust(bottom=0.3)  # leave room for the rotated labels

    # Write each bar's count just above it.
    for bar in ax.patches:
        ax.annotate(f'{int(bar.get_height())}',
                    (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                    ha='center', va='bottom', color='black',
                    xytext=(0, 5), textcoords='offset points')

    plt.show()

    return format_counts


def save_plot_to_pdf(data, bar_data, args, current_year):
    """Write the report charts to a PDF on the user's Desktop.

    The PDF contains, in order: a line plot of monthly output per
    operator, a table of monthly totals with a yearly total row, and -
    when ``bar_data`` has rows - a bar chart of source object format
    counts. The file is named ``Digitization_Report.pdf``, or
    ``Digitization_Report_<names>.pdf`` when ``args.engineer`` is set.

    Args:
        data: DataFrame with 'month', 'bibliographic.primaryID' and
            'digitizer.operator.lastName' columns.
        bar_data: DataFrame with 'Format' and 'Count' columns, or None.
        args: Parsed CLI namespace; ``args.engineer`` feeds the filename.
        current_year: Year label shown in the bar chart title.

    Returns:
        None. Nothing is written when ``data`` is None or empty.
    """
    # Without monthly data there is nothing meaningful to report.
    if data is None or data.empty:
        print("No monthly output data available; PDF report was not generated.")
        return

    engineer_name = "_".join(args.engineer) if args.engineer else ""
    pdf_filename = f"Digitization_Report_{engineer_name}.pdf" if engineer_name else "Digitization_Report.pdf"
    pdf_path = os.path.join(os.path.expanduser("~"), 'Desktop', pdf_filename)

    with PdfPages(pdf_path) as pdf:
        # First page: line plot
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.lineplot(data=data, x='month', y='bibliographic.primaryID',
                     hue='digitizer.operator.lastName', marker='o', linewidth=2, ax=ax)
        ax.set_title('Monthly Digitization Output by Operator (PM role only)')
        ax.set_xlabel('Month')
        ax.set_ylabel('Items Digitized')
        ax.tick_params(axis='x', labelrotation=45)
        ax.legend(title='Digitizer')
        fig.tight_layout()
        pdf.savefig(fig)
        plt.close(fig)

        # Second page: table of totals per month plus a yearly total row
        summary_df = data.groupby('month').agg({'bibliographic.primaryID': 'sum'}).reset_index()
        summary_df.columns = ['Month', 'Total Items Digitized']
        yearly_total = pd.DataFrame([{
            'Month': 'Year Total',
            'Total Items Digitized': summary_df['Total Items Digitized'].sum(),
        }])
        summary_df = pd.concat([summary_df, yearly_total], ignore_index=True)

        fig, ax = plt.subplots(figsize=(10, 5))
        ax.axis('off')  # the page shows only the table, no axes
        table = ax.table(cellText=summary_df.values, colLabels=summary_df.columns,
                         loc='center', cellLoc='center')
        table.auto_set_font_size(True)
        table.scale(1.2, 1.2)
        pdf.savefig(fig)
        plt.close(fig)

        # Third page: bar plot with annotations. Skipped when no format
        # counts were produced (bar_data may legitimately be None).
        if bar_data is not None and not bar_data.empty:
            fig, ax = plt.subplots(figsize=(12, 6))
            sns.barplot(x='Format', y='Count', data=bar_data, palette='viridis', ax=ax)
            ax.tick_params(axis='x', labelrotation=45)
            ax.set_xlabel('Format')
            ax.set_ylabel('Count')
            ax.set_title(f'Top {len(bar_data)} Counts of Source Object Formats in {current_year}', fontsize=16)

            # Write each bar's count just above it.
            for bar in ax.patches:
                ax.annotate(f'{int(bar.get_height())}',
                            (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                            ha='center', va='bottom', color='black',
                            xytext=(0, 5), textcoords='offset points')

            fig.tight_layout()
            pdf.savefig(fig)
            plt.close(fig)

    print(f"PDF report has been saved to {pdf_path}.")




def main():
    """Run the full report: fetch, process, display charts, save the PDF.

    Stops early with a message when no records could be fetched or when
    either chart step reports missing data, so later steps never receive
    an empty frame or ``None``.
    """
    args = get_args()

    df = fetch_data_from_jdbc()
    # A failed connection or query yields an empty frame; there is
    # nothing to process in that case.
    if df.empty:
        print("Error: No data fetched from AMIDB. Exiting the program.")
        return

    df_processed = process_data(df, args, fiscal=args.fiscal)

    line_data, current_year = display_monthly_output_by_operator(df_processed, args, fiscal=args.fiscal)
    if line_data is None:  # Check if line_data is None before proceeding
        print("Error: Missing data. Exiting the program.")
        return

    bar_data = plot_object_format_counts(df_processed, args, fiscal=args.fiscal)
    if bar_data is None:  # The bar chart step can also report missing fields
        print("Error: Missing data. Exiting the program.")
        return

    save_plot_to_pdf(line_data, bar_data, args, current_year)


if __name__ == "__main__":
    main()

Binary file added ami_scripts/jdbc/fmjdbc.jar
Binary file not shown.

0 comments on commit fa9ca84

Please sign in to comment.