#!/usr/bin/env python3
"""Generate digitization production statistics and charts from AMIDB.

The script pulls metadata rows from the ``tbl_metadata`` table over JDBC,
derives calendar-year / fiscal-year / month columns from
``technical.dateCreated``, shows two charts for the current year (monthly
output per operator and the most common source formats) and writes the same
charts, plus a monthly summary table, to a PDF on the user's Desktop.
"""

import argparse
import datetime
import os

import jaydebeapi
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from hurry.filesize import size
from matplotlib.backends.backend_pdf import PdfPages


# Setup display options for better readability in the output
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Query used to pull the metadata fields this report relies on.
_METADATA_QUERY = (
    'SELECT "bibliographic.primaryID", "technical.dateCreated", '
    '"technical.fileFormat", "technical.fileSize.measure", '
    '"technical.durationMilli.measure", "asset.fileRole", '
    '"digitizer.operator.lastName", "bibliographic.vernacularDivisionCode", '
    '"source.object.format", "source.object.type" '
    'FROM tbl_metadata'
)


def get_args():
    """Parse the command-line options.

    Returns:
        argparse.Namespace with ``fiscal`` (bool) and ``engineer`` (list of
        last names, or None when the option is not given).
    """
    parser = argparse.ArgumentParser(description='Generate Production Stats & Cool Visualizations from AMIDB')
    parser.add_argument('-f', '--fiscal', action='store_true',
                        help='organize stats and visualizations by fiscal year instead of calendar year')
    parser.add_argument('-e', '--engineer', nargs='+',
                        help='Filter output by specific engineers (last names).')
    return parser.parse_args()


def fetch_data_from_jdbc():
    """Fetch the metadata table from AMIDB through the FileMaker JDBC driver.

    Connection settings are read from the ``FM_SERVER``, ``AMI_DATABASE``,
    ``AMI_DATABASE_USERNAME`` and ``AMI_DATABASE_PASSWORD`` environment
    variables.

    Returns:
        pandas.DataFrame with one row per fetched record. An empty DataFrame
        is returned when a setting is missing or when connecting/querying
        fails (the error is printed, not raised).
    """
    # Load environment variables
    server_ip = os.getenv('FM_SERVER')
    database_name = os.getenv('AMI_DATABASE')
    username = os.getenv('AMI_DATABASE_USERNAME')
    password = os.getenv('AMI_DATABASE_PASSWORD')

    # Fail early with a clear message instead of trying to connect to a
    # URL built from the string "None".
    required = {
        'FM_SERVER': server_ip,
        'AMI_DATABASE': database_name,
        'AMI_DATABASE_USERNAME': username,
        'AMI_DATABASE_PASSWORD': password,
    }
    missing = [name for name, value in required.items() if not value]
    if missing:
        print(f"Missing required environment variable(s): {', '.join(missing)}")
        return pd.DataFrame()

    # Dynamically set the JDBC path
    jdbc_path = os.path.expanduser('~/Desktop/ami-preservation/ami_scripts/jdbc/fmjdbc.jar')

    conn = None
    df = pd.DataFrame()  # Default empty DataFrame in case of issues

    try:
        conn = jaydebeapi.connect(
            'com.filemaker.jdbc.Driver',
            f'jdbc:filemaker://{server_ip}/{database_name}',
            [username, password],
            jdbc_path
        )
        print("Connection to AMIDB successful!")
        print("Now Fetching Data (Expect 2-3 minutes)")

        curs = conn.cursor()
        try:
            curs.execute(_METADATA_QUERY)
            columns = [desc[0] for desc in curs.description]
            rows = curs.fetchall()
        finally:
            # Release the cursor even when the query itself fails.
            curs.close()

        # Passing the column names keeps them present on a zero-row result.
        df = pd.DataFrame(rows, columns=columns)
        print("Data fetched successfully!")
        print(f"Total records fetched: {len(df)}")

    except Exception as e:
        # Best-effort: report the problem and hand back the empty frame so
        # the caller can decide how to stop.
        print(f"Failed to connect or execute query: {e}")

    finally:
        if conn:
            conn.close()

    return df


def get_fiscal_year(date):
    """Return the fiscal-year label (e.g. ``"FY25"``) for *date*.

    Months July through December are counted towards the following year's
    fiscal year.

    Args:
        date: object exposing ``year`` and ``month`` (datetime / Timestamp).

    Returns:
        The label string, or None when *date* is missing (None / NaT), so
        that undated rows never match a real fiscal year.
    """
    if pd.isna(date):
        return None

    fiscal_year = date.year + 1 if date.month >= 7 else date.year
    return f"FY{str(fiscal_year)[2:]}"


def process_data(df, args, fiscal=False):
    """Parse creation dates and add year / month helper columns.

    Adds ``calendar_year``, ``fiscal_year`` and ``month`` (``YYYY-MM``)
    columns derived from ``technical.dateCreated``. When ``args.engineer``
    is set, only rows for those operators are kept.

    Note: the ``technical.dateCreated`` column of the passed frame is
    converted in place.

    Args:
        df: DataFrame as returned by ``fetch_data_from_jdbc``.
        args: parsed command-line arguments (``engineer`` is read).
        fiscal: accepted for call-site symmetry; not used here.

    Returns:
        The processed DataFrame. If the date column is absent the frame is
        returned unchanged.
    """
    if 'technical.dateCreated' not in df.columns:
        print("\nThe 'technical.dateCreated' field is not present in the DataFrame. Skipping date processing.\n")
        return df

    def convert_date(date_str):
        if "-" in str(date_str):
            # YYYY-MM-DD format
            return pd.to_datetime(date_str, format='%Y-%m-%d', errors='coerce')
        else:
            # M/D/Y format
            return pd.to_datetime(date_str, format='%m/%d/%Y', errors='coerce')

    # Apply the function to the date column. The outer to_datetime guarantees
    # a datetime dtype (needed for .dt below) even for an empty column.
    df['technical.dateCreated'] = pd.to_datetime(
        df['technical.dateCreated'].apply(convert_date), errors='coerce'
    )

    # Filter by engineer if specified
    if args.engineer:
        df = df[df['digitizer.operator.lastName'].isin(args.engineer)].copy()

    # Assigning calendar year, fiscal year, and month using .loc to avoid SettingWithCopyWarning
    df.loc[:, 'calendar_year'] = df['technical.dateCreated'].dt.year
    df.loc[:, 'fiscal_year'] = df['technical.dateCreated'].apply(get_fiscal_year)
    df.loc[:, 'month'] = df['technical.dateCreated'].dt.strftime('%Y-%m')  # Year-Month format

    return df


def _pm_records_for_current_year(df, args, fiscal):
    """Return ``(rows, current_year)`` for 'pm' files created this year.

    ``current_year`` is a fiscal-year label string when *fiscal* is true,
    otherwise the calendar year as an int. Rows are further restricted to
    ``args.engineer`` when that option is set.
    """
    # Choose the correct year column based on fiscal or calendar year
    year_column = 'fiscal_year' if fiscal else 'calendar_year'
    now = datetime.datetime.now()
    current_year = get_fiscal_year(now) if fiscal else now.year

    # Only records where asset.fileRole is 'pm', limited to the current year
    df_pm = df[df['asset.fileRole'] == 'pm']
    selected = df_pm[df_pm[year_column] == current_year]

    if args.engineer:
        selected = selected[selected['digitizer.operator.lastName'].isin(args.engineer)]

    return selected, current_year


def _annotate_bars(ax):
    """Write each bar's integer height just above the bar."""
    for p in ax.patches:
        ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='bottom', color='black', xytext=(0, 5), textcoords='offset points')


def display_monthly_output_by_operator(df, args, fiscal=False):
    """Print and plot the current year's monthly output per operator.

    Output is the number of unique ``bibliographic.primaryID`` values among
    'pm' files, grouped by operator and month.

    Args:
        df: processed DataFrame (see ``process_data``).
        args: parsed command-line arguments (``engineer`` is read).
        fiscal: use the fiscal year instead of the calendar year.

    Returns:
        ``(output_by_operator, current_year)``, or ``(None, None)`` when the
        operator column is missing.
    """
    if 'digitizer.operator.lastName' not in df.columns:
        print("\nThe 'digitizer.operator.lastName' field is not present in the DataFrame. Skipping the function.\n")
        return None, None  # Return None for both outputs if the required column is missing

    df_pm_current_year, current_year = _pm_records_for_current_year(df, args, fiscal)

    # Group by digitizer and month
    output_by_operator = df_pm_current_year.groupby(['digitizer.operator.lastName', 'month']).agg({
        'bibliographic.primaryID': 'nunique'
    }).reset_index()

    # Sum the output for each operator
    output_sum = output_by_operator.groupby('digitizer.operator.lastName')['bibliographic.primaryID'].sum().reset_index()
    output_sum['month'] = 'Total'  # Assign 'Total' as the month for the sum row
    output_by_operator_summed = pd.concat([output_by_operator, output_sum], ignore_index=True)
    print(output_by_operator_summed)

    # Check if there is any data available for the current year
    if output_by_operator.empty:
        print(f"No data available for the current {'fiscal' if fiscal else 'calendar'} year.")
    else:
        # Plotting
        sns.set_style("whitegrid")
        plt.figure(figsize=(12, 6))
        sns.lineplot(data=output_by_operator, x='month', y='bibliographic.primaryID', hue='digitizer.operator.lastName', marker='o', linewidth=2)
        plt.title(f'Monthly Digitization Output by Operator (PM role only) - {"Fiscal" if fiscal else "Calendar"} Year: {current_year}')
        plt.xlabel('Month')
        plt.ylabel('Items Digitized')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.legend(title='Digitizer')
        plt.show()

    return output_by_operator, current_year


def plot_object_format_counts(df, args, fiscal=False, top_n=10):
    """Plot the most common source object formats for the current year.

    Counts unique ``bibliographic.primaryID`` values per
    ``source.object.format`` among 'pm' files and keeps the *top_n* largest.

    Args:
        df: processed DataFrame (see ``process_data``).
        args: parsed command-line arguments (``engineer`` is read).
        fiscal: use the fiscal year instead of the calendar year.
        top_n: number of formats to keep.

    Returns:
        DataFrame with ``Format`` and ``Count`` columns, or None when the
        operator column is missing.
    """
    if 'digitizer.operator.lastName' not in df.columns:
        print("\nThe 'digitizer.operator.lastName' field is not present in the DataFrame. Skipping the function.\n")
        return

    df_pm_current_year, current_year = _pm_records_for_current_year(df, args, fiscal)

    format_counts = df_pm_current_year.groupby('source.object.format')['bibliographic.primaryID'].nunique().nlargest(top_n).reset_index()
    format_counts.columns = ['Format', 'Count']

    # Plotting with annotations
    fig, ax = plt.subplots(figsize=(15, 6))
    sns.barplot(x='Format', y='Count', data=format_counts, palette='viridis', ax=ax)
    plt.xticks(rotation=45)
    plt.xlabel('Source Object Format')
    plt.ylabel('Count')
    plt.title(f'Top {top_n} Counts of Source Object Formats in {current_year}', fontsize=16, fontweight='bold')
    plt.subplots_adjust(bottom=0.3)

    _annotate_bars(ax)

    plt.show()

    return format_counts


def save_plot_to_pdf(data, bar_data, args, current_year):
    """Write the report PDF to the user's Desktop.

    The PDF has three pages: the monthly line plot, a table of monthly totals
    with a yearly total row, and the source-format bar plot.

    Args:
        data: per-operator monthly output (first value returned by
            ``display_monthly_output_by_operator``).
        bar_data: ``Format`` / ``Count`` frame from
            ``plot_object_format_counts``.
        args: parsed command-line arguments; ``engineer`` names, when given,
            are appended to the file name.
        current_year: year label shown in the bar plot title.
    """
    engineer_name = "_".join(args.engineer) if args.engineer else ""
    pdf_filename = f"Digitization_Report_{engineer_name}.pdf" if engineer_name else "Digitization_Report.pdf"
    pdf_path = os.path.join(os.path.expanduser("~"), 'Desktop', pdf_filename)

    with PdfPages(pdf_path) as pdf:
        # First chart: Line plot
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.lineplot(data=data, x='month', y='bibliographic.primaryID', hue='digitizer.operator.lastName', marker='o', linewidth=2, ax=ax)
        plt.title('Monthly Digitization Output by Operator (PM role only)')
        plt.xlabel('Month')
        plt.ylabel('Items Digitized')
        plt.xticks(rotation=45)
        plt.legend(title='Digitizer')
        plt.tight_layout()
        pdf.savefig(fig)
        plt.close(fig)

        # Second chart: Data table with sums by month and year
        summary_df = data.groupby('month').agg({'bibliographic.primaryID': 'sum'}).reset_index()
        summary_df.columns = ['Month', 'Total Items Digitized']

        # Calculate and append yearly total using concat
        yearly_total = pd.DataFrame([{'Month': 'Year Total', 'Total Items Digitized': summary_df['Total Items Digitized'].sum()}])
        summary_df = pd.concat([summary_df, yearly_total], ignore_index=True)

        fig, ax = plt.subplots(figsize=(10, 5))
        ax.axis('off')
        table = ax.table(cellText=summary_df.values, colLabels=summary_df.columns, loc='center', cellLoc='center')
        table.auto_set_font_size(True)
        table.scale(1.2, 1.2)
        pdf.savefig(fig)
        plt.close(fig)

        # Third chart: Bar plot with annotations
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(x='Format', y='Count', data=bar_data, palette='viridis', ax=ax)
        plt.xticks(rotation=45)
        plt.xlabel('Format')
        plt.ylabel('Count')
        plt.title(f'Top {len(bar_data)} Counts of Source Object Formats in {current_year}', fontsize=16)
        plt.subplots_adjust(bottom=0.3)

        _annotate_bars(ax)

        plt.tight_layout()
        pdf.savefig(fig)
        plt.close(fig)

    print(f"PDF report has been saved to {pdf_path}.")


def main():
    """Fetch the data, show the charts and write the PDF report."""
    args = get_args()
    df = fetch_data_from_jdbc()
    if df.empty:
        # A failed connection/query yields an empty frame; stop before the
        # processing steps that need the metadata columns.
        print("Error: No data was fetched from AMIDB. Exiting the program.")
        return

    df_processed = process_data(df, args, fiscal=args.fiscal)
    line_data, current_year = display_monthly_output_by_operator(df_processed, args, fiscal=args.fiscal)
    if line_data is None:  # Check if line_data is None before proceeding
        print("Error: Missing data. Exiting the program.")
        return
    if line_data.empty:
        # Nothing was digitized in the selected year/filter, so there is
        # nothing meaningful to chart or save.
        print("No data to report. Skipping PDF generation.")
        return

    bar_data = plot_object_format_counts(df_processed, args, fiscal=args.fiscal)
    save_plot_to_pdf(line_data, bar_data, args, current_year)


if __name__ == "__main__":
    main()