add digitization performance tracker

add jdbc.jar
NYPL · May 1, 2024 · fa9ca84 · fa9ca84
1 parent 771e87c
commit fa9ca84
Show file tree

Hide file tree

Showing 2 changed files with 265 additions and 0 deletions.
diff --git a/ami_scripts/digitization_performance_tracker.py b/ami_scripts/digitization_performance_tracker.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python3
+
+import argparse
+import jaydebeapi
+import os
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from hurry.filesize import size
+import numpy as np
+import datetime
+from matplotlib.backends.backend_pdf import PdfPages
+
+
+# Setup display options for better readability in the output
+pd.set_option('display.max_rows', None)
+pd.set_option('display.max_columns', None)
+pd.set_option('display.width', None)
+pd.set_option('display.max_colwidth', None)
+
+def get_args():
+    parser = argparse.ArgumentParser(description='Generate Production Stats & Cool Visualizations from AMIDB')
+    parser.add_argument('-f', '--fiscal', action='store_true',
+                        help='organize stats and visualizations by fiscal year instead of calendar year')
+    parser.add_argument('-e', '--engineer', nargs='+',
+                        help='Filter output by specific engineers (last names).')
+    return parser.parse_args()
+
+
+def fetch_data_from_jdbc():
+    # Load environment variables
+    server_ip = os.getenv('FM_SERVER')
+    database_name = os.getenv('AMI_DATABASE')
+    username = os.getenv('AMI_DATABASE_USERNAME')
+    password = os.getenv('AMI_DATABASE_PASSWORD')
+
+    # Dynamically set the JDBC path
+    jdbc_path = os.path.expanduser('~/Desktop/ami-preservation/ami_scripts/jdbc/fmjdbc.jar')
+
+    conn = None
+    df = pd.DataFrame()  # Default empty DataFrame in case of issues
+
+    try:
+        conn = jaydebeapi.connect(
+            'com.filemaker.jdbc.Driver',
+            f'jdbc:filemaker://{server_ip}/{database_name}',
+            [username, password],
+            jdbc_path
+        )
+        print("Connection to AMIDB successful!")
+        print("Now Fetching Data (Expect 2-3 minutes)")
+
+        query = 'SELECT "bibliographic.primaryID", "technical.dateCreated", "technical.fileFormat", "technical.fileSize.measure", "technical.durationMilli.measure", "asset.fileRole", "digitizer.operator.lastName", "bibliographic.vernacularDivisionCode", "source.object.format", "source.object.type" FROM tbl_metadata'
+        curs = conn.cursor()
+        curs.execute(query)
+
+        columns = [desc[0] for desc in curs.description]
+        data = [dict(zip(columns, row)) for row in curs.fetchall()]
+
+        df = pd.DataFrame(data)
+        print("Data fetched successfully!")
+        print(f"Total records fetched: {len(df)}")  
+
+    except Exception as e:
+        print(f"Failed to connect or execute query: {e}")
+
+    finally:
+        if conn:
+            conn.close()
+
+    return df
+
+
+def get_fiscal_year(date):
+    year = date.year
+    month = date.month
+
+    if month >= 7:
+        fiscal_year = year + 1
+    else:
+        fiscal_year = year
+
+    return f"FY{str(fiscal_year)[2:]}"
+
+def process_data(df, args, fiscal=False):
+    def convert_date(date_str):
+        if "-" in str(date_str):
+            # YYYY-MM-DD format
+            return pd.to_datetime(date_str, format='%Y-%m-%d', errors='coerce')
+        else:
+            # M/D/Y format
+            return pd.to_datetime(date_str, format='%m/%d/%Y', errors='coerce')
+
+    # Apply the function to the date column
+    df['technical.dateCreated'] = df['technical.dateCreated'].apply(convert_date)
+
+    # Filter by engineer if specified
+    if args.engineer:
+        df = df[df['digitizer.operator.lastName'].isin(args.engineer)].copy()
+
+    # Assigning calendar year, fiscal year, and month using .loc to avoid SettingWithCopyWarning
+    df.loc[:, 'calendar_year'] = df['technical.dateCreated'].dt.year
+    df.loc[:, 'fiscal_year'] = df['technical.dateCreated'].apply(get_fiscal_year)
+    df.loc[:, 'month'] = df['technical.dateCreated'].dt.strftime('%Y-%m')  # Year-Month format
+
+    return df
+
+
+def display_monthly_output_by_operator(df, args, fiscal=False):
+    if 'digitizer.operator.lastName' not in df.columns:
+        print("\nThe 'digitizer.operator.lastName' field is not present in the DataFrame. Skipping the function.\n")
+        return None, None  # Return None for both outputs if the required column is missing
+
+    # Filter the DataFrame for records where asset.fileRole is 'pm'
+    df_pm = df[df['asset.fileRole'] == 'pm']
+
+    # Choose the correct year column based on fiscal or calendar year
+    year_column = 'fiscal_year' if fiscal else 'calendar_year'
+    current_year = get_fiscal_year(datetime.datetime.now()) if fiscal else datetime.datetime.now().year
+
+    # Filter data for the current year
+    df_pm_current_year = df_pm[df_pm[year_column] == current_year]
+
+    # If engineer filter is specified, filter data for the selected engineers
+    if args.engineer:
+        df_pm_current_year = df_pm_current_year[df_pm_current_year['digitizer.operator.lastName'].isin(args.engineer)]
+
+    # Group by digitizer and month
+    output_by_operator = df_pm_current_year.groupby(['digitizer.operator.lastName', 'month']).agg({
+        'bibliographic.primaryID': 'nunique'
+    }).reset_index()
+
+    # Sum the output for each operator
+    output_sum = output_by_operator.groupby('digitizer.operator.lastName')['bibliographic.primaryID'].sum().reset_index()
+    output_sum['month'] = 'Total'  # Assign 'Total' as the month for the sum row
+    output_by_operator_summed = pd.concat([output_by_operator, output_sum], ignore_index=True)
+    print(output_by_operator_summed)
+
+    # Check if there is any data available for the current year
+    if output_by_operator.empty:
+        print(f"No data available for the current {'fiscal' if fiscal else 'calendar'} year.")
+    else:
+        # Plotting
+        sns.set_style("whitegrid")
+        plt.figure(figsize=(12, 6))
+        sns.lineplot(data=output_by_operator, x='month', y='bibliographic.primaryID', hue='digitizer.operator.lastName', marker='o', linewidth=2)
+        plt.title(f'Monthly Digitization Output by Operator (PM role only) - {"Fiscal" if fiscal else "Calendar"} Year: {current_year}')
+        plt.xlabel('Month')
+        plt.ylabel('Items Digitized')
+        plt.xticks(rotation=45)
+        plt.tight_layout()
+        plt.legend(title='Digitizer')
+        plt.show()
+
+    return output_by_operator, current_year
+
+
+def plot_object_format_counts(df, args, fiscal=False, top_n=10):
+    if 'digitizer.operator.lastName' not in df.columns:
+        print("\nThe 'digitizer.operator.lastName' field is not present in the DataFrame. Skipping the function.\n")
+        return
+
+    df_pm = df[df['asset.fileRole'] == 'pm']
+
+    # Determine the current fiscal or calendar year within the function
+    year_column = 'fiscal_year' if fiscal else 'calendar_year'
+    current_year = get_fiscal_year(datetime.datetime.now()) if fiscal else datetime.datetime.now().year
+
+    df_pm_current_year = df_pm[df_pm[year_column] == current_year]
+    if args.engineer:
+        df_pm_current_year = df_pm_current_year[df_pm_current_year['digitizer.operator.lastName'].isin(args.engineer)]
+
+    format_counts = df_pm_current_year.groupby('source.object.format')['bibliographic.primaryID'].nunique().nlargest(top_n).reset_index()
+    format_counts.columns = ['Format', 'Count']
+
+    # Plotting with annotations
+    fig, ax = plt.subplots(figsize=(15, 6))
+    sns.barplot(x='Format', y='Count', data=format_counts, palette='viridis', ax=ax)
+    plt.xticks(rotation=45)
+    plt.xlabel('Source Object Format')
+    plt.ylabel('Count')
+    plt.title(f'Top {top_n} Counts of Source Object Formats in {current_year}', fontsize=16, fontweight='bold')
+    plt.subplots_adjust(bottom=0.3)
+
+    # Adding annotations
+    for p in ax.patches:
+        ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
+                    ha='center', va='bottom', color='black', xytext=(0, 5), textcoords='offset points')
+
+    plt.show()
+
+    return format_counts
+
+
+def save_plot_to_pdf(data, bar_data, args, current_year):
+    engineer_name = "_".join(args.engineer) if args.engineer else ""
+    pdf_filename = f"Digitization_Report_{engineer_name}.pdf" if engineer_name else "Digitization_Report.pdf"
+    pdf_path = os.path.join(os.path.expanduser("~"), 'Desktop', pdf_filename)
+
+    with PdfPages(pdf_path) as pdf:
+        # First chart: Line plot
+        fig, ax = plt.subplots(figsize=(10, 5))
+        sns.lineplot(data=data, x='month', y='bibliographic.primaryID', hue='digitizer.operator.lastName', marker='o', linewidth=2, ax=ax)
+        plt.title('Monthly Digitization Output by Operator (PM role only)')
+        plt.xlabel('Month')
+        plt.ylabel('Items Digitized')
+        plt.xticks(rotation=45)
+        plt.legend(title='Digitizer')
+        plt.tight_layout()
+        pdf.savefig(fig)
+        plt.close(fig)
+
+        # Second chart: Data table with sums by month and year
+        summary_df = data.groupby('month').agg({'bibliographic.primaryID': 'sum'}).reset_index()
+        summary_df.columns = ['Month', 'Total Items Digitized']
+
+        # Calculate and append yearly total using concat
+        yearly_total = pd.DataFrame([{'Month': 'Year Total', 'Total Items Digitized': summary_df['Total Items Digitized'].sum()}])
+        summary_df = pd.concat([summary_df, yearly_total], ignore_index=True)
+
+        fig, ax = plt.subplots(figsize=(10, 5))
+        ax.axis('off')
+        table = ax.table(cellText=summary_df.values, colLabels=summary_df.columns, loc='center', cellLoc='center')
+        table.auto_set_font_size(True)
+        table.scale(1.2, 1.2)
+        pdf.savefig(fig)
+        plt.close(fig)
+
+        # Third chart: Bar plot with annotations
+        fig, ax = plt.subplots(figsize=(12, 6))
+        sns.barplot(x='Format', y='Count', data=bar_data, palette='viridis', ax=ax)
+        plt.xticks(rotation=45)
+        plt.xlabel('Format')
+        plt.ylabel('Count')
+        plt.title(f'Top {len(bar_data)} Counts of Source Object Formats in {current_year}', fontsize=16)
+        plt.subplots_adjust(bottom=0.3)
+
+        # Adding annotations
+        for p in ax.patches:
+            ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
+                        ha='center', va='bottom', color='black', xytext=(0, 5), textcoords='offset points')
+
+        plt.tight_layout()
+        pdf.savefig(fig)
+        plt.close(fig)
+
+    print(f"PDF report has been saved to {pdf_path}.")
+
+
+
+
+def main():
+    args = get_args()
+    df = fetch_data_from_jdbc()
+    df_processed = process_data(df, args, fiscal=args.fiscal)
+    line_data, current_year = display_monthly_output_by_operator(df_processed, args, fiscal=args.fiscal)
+    if line_data is None:  # Check if line_data is None before proceeding
+        print("Error: Missing data. Exiting the program.")
+        return
+    bar_data = plot_object_format_counts(df_processed, args, fiscal=args.fiscal)
+    save_plot_to_pdf(line_data, bar_data, args, current_year)
+
+if __name__ == "__main__":
+    main()
+
diff --git a/ami_scripts/jdbc/fmjdbc.jar b/ami_scripts/jdbc/fmjdbc.jar