SFR-2047: Initial Counter 5 Functionality (#317)

NYPL · Jul 31, 2024 · d496f65 · d496f65
1 parent ee49519
commit d496f65
Show file tree

Hide file tree

Showing 19 changed files with 338 additions and 92 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,7 +2,7 @@
 
 ## unreleased version -- v0.13.1
 ## Added
-- New script to parse download requests from S3 log files for UMP books 
+- New analytics folder for University Press project code. Contains methodology for generating Counter 5 reports
 - New script to update current UofM manifests with fulfill endpoints to replace pdf/epub urls
 - Updated README with appendix and additions to available processes
 - New process to add fulfill urls to Limited Access manifests and update fulfill_limited_access flags to True

diff --git a/README.md b/README.md
@@ -22,6 +22,18 @@ Using these retrieved records, and matched records from the DCDW as a corpus, th
 
 This application is built as a monorepo, which can be built as a single Docker container. This container can be run to execute different processes, which either execute discrete tasks or start a persistent service (such as a Flask API instance). The monorepo structure allows for a high degree of code reuse and makes extending existing services/adding new services easier as they can be based on existing patterns. Many of the modules include abstract base classes that define the mandatory methods for each service.
 
+## Analytics
+
+Analytics projects are stored separately from other DRB processes and scripts, in the [analytics](analytics) folder. Each analytics project is listed below:
+* [University Press Backlist Project](analytics/upress_reporting): Generates [Counter 5 reports](https://airtable.com/appBoLf4lMofecGPU/tblIjRKk0fnoGOqMo?blocks=hide) for a given reporting period. Be sure to set up the environment variables `DOWNLOAD_BUCKET` and `DOWNLOAD_LOG_PATH`. To generate Counter 5 reports, run the following:
+    ```
+    python3 analytics/upress_reporting/runner.py <REPORTING PERIOD>
+    ```
+  Here is an example command:
+    ```
+    python3 analytics/upress_reporting/runner.py "2024-03-01 to 2024-03-30"
+    ```
+
 ### Local Development
 
 Locally these services can be run in two modes:

diff --git a/analytics/upress_reporting/__init__.py b/analytics/upress_reporting/__init__.py
diff --git a/analytics/upress_reporting/counter_5_controller.py b/analytics/upress_reporting/counter_5_controller.py
@@ -0,0 +1,20 @@
+from nypl_py_utils.functions.log_helper import create_log
+from analytics.upress_reporting.models.reports.downloads import DownloadsReport
+
+
+class Counter5Controller:
+    """Coordinates Counter 5 report generation across publishers.
+
+    The publisher list starts empty and nothing in this class fills it, so
+    callers have to populate ``publishers`` themselves before pulling reports.
+    """
+
+    def __init__(self, reporting_period):
+        self.logger = create_log("counter_5_controller")
+        self.publishers = []
+        self.reporting_period = reporting_period
+
+    def _setup_reports(self, publisher, date_range):
+        """Return the list of report objects to build for one publisher."""
+        downloads_report = DownloadsReport(publisher, date_range)
+        return [downloads_report]
+
+    def pull_reports(self):
+        """Log the start of a pull and return True if a publisher is set.
+
+        NOTE(review): no report is built here yet -- the loop returns on the
+        first publisher and ``_setup_reports`` is never called. With an empty
+        publisher list the method returns None.
+        """
+        self.logger.info("Pulling Counter 5 reports...")
+        for _publisher in self.publishers:
+            return True
+        return None
diff --git a/analytics/upress_reporting/helpers/download_data_aggregator.py b/analytics/upress_reporting/helpers/download_data_aggregator.py
@@ -0,0 +1,140 @@
+import os
+import boto3
+import re
+
+from analytics.upress_reporting.models.data.download_event import DownloadEvent
+from logger import createLog
+from model import Edition, Item, Link
+from model.postgres.item import ITEM_LINKS
+from managers import DBManager
+
+# Regexes needed to parse S3 logs
+# NOTE(review): the dots in "REST.GET.OBJECT" and in the referrer host are
+# unescaped, so each one matches any character rather than only a literal "."
+# Marks a log line as an object GET request
+REQUEST_REGEX = r"REST.GET.OBJECT "
+# File ID includes the file name for the pdf object; group 1 also captures the
+# trailing whitespace character, which the caller strips
+FILE_ID_REGEX = r"REST.GET.OBJECT (.+pdf\s)"
+# Matches the bracketed section of the log line, brackets included
+# NOTE(review): ".+" is greedy, so if a line holds more than one "]" the match
+# runs to the last one -- confirm against real log lines
+TIMESTAMP_REGEX = r"\[.+\]"
+# Only log lines containing this URL are treated as downloads
+# NOTE(review): hard-coded to the QA host -- confirm this is intended
+REFERRER_REGEX = r"https://drb-qa.nypl.org/"
+
+
+class DownloadDataAggregator:
+    """
+    Parses S3 download logs and generates list of DownloadEvents, each
+    corresponding to a single download request.
+
+    Construction creates an S3 client and opens a database session;
+    pull_download_events closes the database connection when it finishes.
+    """
+
+    def __init__(self, publisher, date_range):
+        '''
+        publisher: value compared against Item.source when matching downloads
+        date_range: iterable of dates (each must support strftime); one S3
+            log folder is read per date
+        '''
+        self.publisher = publisher
+        self.date_range = date_range
+
+        self.s3_client = boto3.client("s3")
+        self.bucket_name = os.environ.get("DOWNLOAD_BUCKET", None)
+        self.log_path = os.environ.get("DOWNLOAD_LOG_PATH", None)
+
+        self._setup_db_manager()
+        self.logger = createLog("download_request_parser")
+
+    def pull_download_events(self):
+        '''
+        Returns list of DownloadEvents in a given reporting period.
+
+        The database connection is closed before this method exits, whether
+        it returns normally or an error is raised while reading the logs.
+        '''
+        download_events = []
+
+        try:
+            for date in self.date_range:
+                folder_name = date.strftime("%Y/%m/%d")
+                batch = self._load_batch(folder_name)
+                download_events.extend(self._parse_logs(batch))
+        finally:
+            # Release the connection even when a day's logs are missing
+            self.db_manager.closeConnection()
+
+        return download_events
+
+    def _setup_db_manager(self):
+        '''
+        Creates the DBManager from the POSTGRES_* environment variables and
+        opens the session used for the edition lookups.
+        '''
+        self.db_manager = DBManager(
+            user=os.environ.get("POSTGRES_USER", None),
+            pswd=os.environ.get("POSTGRES_PSWD", None),
+            host=os.environ.get("POSTGRES_HOST", None),
+            port=os.environ.get("POSTGRES_PORT", None),
+            db=os.environ.get("POSTGRES_NAME", None),
+        )
+        self.db_manager.generateEngine()
+        self.db_manager.createSession()
+
+    def _load_batch(self, log_folder):
+        '''
+        Returns a page iterator over the objects listed under
+        DOWNLOAD_LOG_PATH + log_folder + "/" in the download bucket.
+
+        Raises DownloadParsingError if DOWNLOAD_BUCKET or DOWNLOAD_LOG_PATH
+        is not set.
+        '''
+        if self.bucket_name is None or self.log_path is None:
+            # Fail with a clear message instead of a TypeError on the
+            # string concatenation below
+            raise DownloadParsingError(
+                "DOWNLOAD_BUCKET and DOWNLOAD_LOG_PATH must be set.")
+
+        prefix = self.log_path + log_folder + "/"
+        paginator = self.s3_client.get_paginator("list_objects_v2")
+        return paginator.paginate(Bucket=self.bucket_name, Prefix=prefix)
+
+    def _parse_logs(self, batch):
+        '''
+        The edition title, identifier, and timestamp are parsed out of the
+        S3 server access log files for UMP download requests.
+
+        Returns a list of DownloadEvents, one per log line that qualifies as
+        a download. Raises DownloadParsingError when a page of the listing
+        has no "Contents" entry.
+        '''
+        downloads_in_batch = []
+
+        for page in batch:
+            if "Contents" not in page:
+                path = self._redact_s3_path(page["Prefix"])
+                raise DownloadParsingError(
+                    f"Log files in path {path} do not exist.")
+
+            for content in page["Contents"]:
+                log_object = self.s3_client.get_object(
+                    Bucket=self.bucket_name, Key=str(content["Key"])
+                )
+                for raw_line in log_object["Body"].iter_lines():
+                    parse_tuple = self._match_log_info_with_frbr_data(
+                        raw_line.decode("utf8"))
+                    if parse_tuple:
+                        downloads_in_batch.append(DownloadEvent(
+                            parse_tuple[0], parse_tuple[1], parse_tuple[2]))
+        return downloads_in_batch
+
+    def _redact_s3_path(self, path):
+        '''
+        Used to remove sensitive data from S3 prefix before passing to error message.
+        Example input: "logs/123456789/us-east-1/ump-pdf-repository/2024/1/1"
+        Example output: "logs/NYPL_AWS_ID/us-east-1/ump-pdf-repository/2024/1/1"
+        A path with no "/" has no second segment and is returned unchanged.
+        '''
+        split_path = path.split("/")
+        if len(split_path) > 1:
+            split_path[1] = "NYPL_AWS_ID"
+        return "/".join(split_path)
+
+    def _match_log_info_with_frbr_data(self, log_object):
+        '''
+        Matches one S3 log line to the edition whose PDF was downloaded.
+
+        A line counts as a download when it is an object GET, contains the
+        expected referrer, is not a "403 AccessDenied" response and names a
+        pdf file. Returns [title, timestamp, edition id] for such a line
+        (title "" and id None when no edition matches) and None otherwise.
+        '''
+        match_request = re.search(REQUEST_REGEX, log_object)
+        match_referrer = re.search(REFERRER_REGEX, log_object)
+
+        if (
+            not match_request
+            or not match_referrer
+            or "403 AccessDenied" in log_object
+        ):
+            return None
+
+        match_time = re.search(TIMESTAMP_REGEX, log_object)
+        match_file_id = re.search(FILE_ID_REGEX, log_object)
+        # A GET for a non-pdf object (or a malformed line) yields no match;
+        # skip the line instead of failing on None.group()
+        if match_time is None or match_file_id is None:
+            return None
+
+        file_id = match_file_id.group(1).strip()
+        title_parse = ""
+        id_parse = None
+
+        for item in self.db_manager.session.query(Item).filter(
+            Item.source == self.publisher
+        ):
+            for _link in (
+                self.db_manager.session.query(Link)
+                .join(ITEM_LINKS)
+                .filter(ITEM_LINKS.c.item_id == item.id)
+                .filter(Link.media_type == "application/pdf")
+                .filter(Link.url.contains(file_id))
+                .all()
+            ):
+                for edit in self.db_manager.session.query(Edition).filter(
+                    Edition.id == item.edition_id
+                ):
+                    # The last matching edition wins
+                    title_parse = edit.title
+                    id_parse = edit.id
+
+        return [title_parse, match_time.group(0), id_parse]
+
+
+class DownloadParsingError(Exception):
+    """Error raised while loading or parsing S3 download logs."""
+
+    def __init__(self, message=None):
+        # Forward the message to Exception so str(error) and tracebacks show
+        # it even when it is passed as a keyword argument
+        if message is None:
+            super().__init__()
+        else:
+            super().__init__(message)
+        self.message = message
diff --git a/analytics/upress_reporting/models/data/download_event.py b/analytics/upress_reporting/models/data/download_event.py
@@ -0,0 +1,7 @@
+from dataclasses import dataclass
+
+@dataclass
+class DownloadEvent:
+    """A single download request recovered from the S3 access logs."""
+
+    # Title of the edition matched to the download
+    title: str
+    # Timestamp text as captured from the log line
+    timestamp: str
+    # ID of the edition matched to the download
+    edition_id: int
diff --git a/analytics/upress_reporting/models/reports/counter_5_report.py b/analytics/upress_reporting/models/reports/counter_5_report.py
@@ -0,0 +1,45 @@
+import pandas
+import re
+import uuid
+
+from abc import ABC, abstractmethod
+from datetime import datetime
+
+
+class Counter5Report(ABC):
+    """Base class for Counter 5 reports.
+
+    Holds the metadata shared by every report (publisher, creation date,
+    creator, reporting period). Subclasses must implement build_header and
+    build_report.
+    """
+
+    def __init__(self, publisher, reporting_period=None):
+        """
+        publisher: publisher the report is generated for
+        reporting_period: string such as "2024-01-01 to 2024-12-31"; when
+            omitted, January of the current year is used
+        """
+        self.publisher = publisher
+        # Read the clock once so the creation date and the default period
+        # cannot disagree across a year boundary
+        today = datetime.today()
+        self.created = today.strftime("%Y-%m-%d")
+        self.created_by = "NYPL"
+        if reporting_period is not None:
+            self.reporting_period = reporting_period
+        else:
+            # Set reporting period to first month of current year
+            # TODO: determine default reporting period
+            self.reporting_period = (
+                f"{today.year}-01-01 to {today.year}-01-31"
+            )
+
+    @abstractmethod
+    def build_header(self) -> dict:
+        """Return the report's header fields as a dict."""
+        return
+
+    @abstractmethod
+    def build_report(self):
+        """Generate the report."""
+        return
+
+    def generate_report_id(self):
+        """Return a new random UUID (uuid4) to identify a report."""
+        return uuid.uuid4()
+
+    def parse_reporting_period(self, reporting_period):
+        """
+        Input: String with date range in Y-m-d format (ex. "2024-01-01 to 2024-12-31")
+        Output: Pandas date_range object
+        Raises: ValueError if the string is not of the form "<date> to <date>"
+        """
+        # Year 2000-2099, then month and day, optionally separated by
+        # "-", "." or "/"
+        date_pattern = r"20[0-9]{2}[-./]?[0-9]{2}[-./]?[0-9]{2}"
+
+        # fullmatch rejects trailing text, which would otherwise end up in
+        # the end date handed to pandas
+        if not re.fullmatch(
+            date_pattern + " to " + date_pattern, reporting_period
+        ):
+            raise ValueError(
+                f"Invalid reporting period {reporting_period!r}; expected "
+                'a value such as "2024-01-01 to 2024-12-31"'
+            )
+
+        start, end = reporting_period.split(" to ")
+        return pandas.date_range(start=start, end=end)
diff --git a/analytics/upress_reporting/models/reports/country_level.py b/analytics/upress_reporting/models/reports/country_level.py
@@ -0,0 +1,6 @@
+from analytics.upress_reporting.models.reports.counter_5_report import Counter5Report
+
+
+class CountryLevelReport(Counter5Report):
+    """Counter 5 report stub; no report logic yet.
+
+    NOTE(review): build_header and build_report are not implemented here, and
+    no reporting period is accepted, so the base class default period applies.
+    """
+
+    def __init__(self, publisher):
+        super().__init__(publisher=publisher)
diff --git a/analytics/upress_reporting/models/reports/downloads.py b/analytics/upress_reporting/models/reports/downloads.py
@@ -0,0 +1,49 @@
+import csv
+
+from analytics.upress_reporting.helpers.download_data_aggregator import (
+    DownloadDataAggregator,
+)
+from analytics.upress_reporting.models.reports.counter_5_report import Counter5Report
+
+# Presumably the extra column headers for the downloads report.
+# NOTE(review): not referenced by the code in this module yet.
+ADDITIONAL_HEADERS = [
+    "Book Title",
+    "Book ID",
+    "Authors",
+    "ISBN",
+    "eISBN",
+    "Copyright Year",
+    "Disciplines",
+    "Usage Type",
+    "Reporting Period Total",
+]
+
+
+class DownloadsReport(Counter5Report):
+    """Counter 5 report listing the download events for a publisher."""
+
+    def __init__(self, *args):
+        """Accepts the same positional arguments as Counter5Report."""
+        super().__init__(*args)
+        pandas_date_range = self.parse_reporting_period(self.reporting_period)
+        self.download_request_parser = DownloadDataAggregator(
+            self.publisher, pandas_date_range)
+
+    def build_header(self):
+        """Returns the header fields written at the top of the report."""
+        return {
+            "Report_Name": "NYPL DRB Book Usage by Title / Downloads",
+            "Report_ID": self.generate_report_id(),
+            "Report_Description": "Downloads of your books from NYPL's Digital Research Books by title.",
+            "Publisher_Name": self.publisher,
+            "Reporting_Period": self.reporting_period,
+            "Created": self.created,
+            "Created_By": self.created_by,
+        }
+
+    def build_report(self):
+        """
+        Pulls the download events for the reporting period and writes them,
+        preceded by the header fields and a blank row, to
+        counter_5_downloads_report.csv in the current working directory.
+        """
+        download_events = self.download_request_parser.pull_download_events()
+        header = self.build_header()
+        # newline='' lets the csv module control line endings (otherwise
+        # rows gain an extra "\r" on Windows); utf-8 keeps non-ASCII titles
+        # writable regardless of the platform's default encoding
+        with open('counter_5_downloads_report.csv', 'w',
+                  newline='', encoding='utf-8') as csv_file:
+            writer = csv.writer(csv_file)
+            for key, value in header.items():
+                writer.writerow([key, value])
+            writer.writerow([])
+            # following code is temp until we can integrate DRB data
+            writer.writerow(["Title", "Timestamp", "Edition_ID"])
+            writer.writerows(
+                [event.title, event.timestamp, event.edition_id]
+                for event in download_events
+            )
diff --git a/analytics/upress_reporting/models/reports/total_usage.py b/analytics/upress_reporting/models/reports/total_usage.py
@@ -0,0 +1,6 @@
+from analytics.upress_reporting.models.reports.counter_5_report import Counter5Report
+
+
+class TotalUsageReport(Counter5Report):
+    """Counter 5 report stub; no report logic yet.
+
+    NOTE(review): build_header and build_report are not implemented here, and
+    no reporting period is accepted, so the base class default period applies.
+    """
+
+    def __init__(self, publisher):
+        super().__init__(publisher=publisher)
diff --git a/analytics/upress_reporting/models/reports/views.py b/analytics/upress_reporting/models/reports/views.py
@@ -0,0 +1,6 @@
+from analytics.upress_reporting.models.reports.counter_5_report import Counter5Report
+
+
+class ViewsReport(Counter5Report):
+    """Counter 5 report stub; no report logic yet.
+
+    NOTE(review): build_header and build_report are not implemented here, and
+    no reporting period is accepted, so the base class default period applies.
+    """
+
+    def __init__(self, publisher):
+        super().__init__(publisher=publisher)
diff --git a/analytics/upress_reporting/runner.py b/analytics/upress_reporting/runner.py
@@ -0,0 +1,24 @@
+import sys
+
+from analytics.upress_reporting.models.reports.downloads import DownloadsReport
+from datetime import datetime
+from logger import createLog
+from main import loadEnvFile
+
+
+def main():
+    """Builds the downloads report for publisher "UofM".
+
+    The reporting period is taken from the first command-line argument; with
+    no argument, DownloadsReport is created without one and a notice is
+    printed.
+    """
+    # NOTE(review): the logger is created but not used below, and the text
+    # passed to createLog reads like a log message rather than a logger
+    # name -- confirm the intent
+    logger = createLog("Generating Counter 5 reports...")
+    loadEnvFile('local-compose', fileString='config/local-compose.yaml')
+
+    cli_args = sys.argv[1:]
+    if cli_args:
+        downloads_report = DownloadsReport("UofM", cli_args[0])
+    else:
+        print(
+            f"No reporting period passed in. Generating report for Jan {datetime.now().year}!")
+        downloads_report = DownloadsReport("UofM")
+
+    downloads_report.build_report()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/config/development.yaml b/config/development.yaml
@@ -92,3 +92,7 @@ API_PROXY_CORS_ALLOWED: '*'
 
 # Current NYPL Webreader version
 READER_VERSION: v1
+
+# ANALYTICS CONFIGURATION
+DOWNLOAD_BUCKET: "ump-pdf-repository-logs"
+DOWNLOAD_LOG_PATH: "logs/946183545209/us-east-1/ump-pdf-repository/"
diff --git a/config/example.yaml b/config/example.yaml
@@ -73,4 +73,8 @@ WEBPUB_PDF_PROFILE: http://librarysimplified.org/terms/profiles/pdf
 API_PROXY_CORS_ALLOWED: (?:Source1:Source2)
 
 # Current NYPL Webreader version
-READER_VERSION: xxx
+READER_VERSION: xxx
+
+# ANALYTICS CONFIGURATION
+DOWNLOAD_BUCKET: "ump-pdf-repository-logs"
+DOWNLOAD_LOG_PATH: "logs/946183545209/us-east-1/ump-pdf-repository/"
diff --git a/config/production.yaml b/config/production.yaml
@@ -99,3 +99,7 @@ API_PROXY_CORS_ALLOWED: http[s]?://.*nypl.org
 
 # Current NYPL Webreader version
 READER_VERSION: v2
+
+# ANALYTICS CONFIGURATION
+DOWNLOAD_BUCKET: "ump-pdf-repository-logs"
+DOWNLOAD_LOG_PATH: "logs/946183545209/us-east-1/ump-pdf-repository/"
diff --git a/config/qa.yaml b/config/qa.yaml
@@ -99,3 +99,7 @@ API_PROXY_CORS_ALLOWED: (?:http[s]?:\/\/.*nypl.org|https:\/\/.*(?:nypl|sfr).*ver
 
 # Current NYPL Webreader version
 READER_VERSION: v2
+
+# ANALYTICS CONFIGURATION
+DOWNLOAD_BUCKET: "ump-pdf-repository-logs"
+DOWNLOAD_LOG_PATH: "logs/946183545209/us-east-1/ump-pdf-repository/"
diff --git a/config/sample-compose.yaml b/config/sample-compose.yaml
@@ -78,4 +78,8 @@ API_PROXY_CORS_ALLOWED: '*'
 
 READER_VERSION: v2
 
-DEFAULT_COVER_URL: https://drb-files-qa.s3.amazonaws.com/covers/default/defaultCover.png
+DEFAULT_COVER_URL: https://drb-files-qa.s3.amazonaws.com/covers/default/defaultCover.png
+
+# ANALYTICS CONFIGURATION
+DOWNLOAD_BUCKET: "ump-pdf-repository-logs"
+DOWNLOAD_LOG_PATH: "logs/946183545209/us-east-1/ump-pdf-repository/"
diff --git a/scripts/__init__.py b/scripts/__init__.py
@@ -14,5 +14,4 @@
 from .countCABooks import main as countCA
 from .nyplLoginFlags import main as nyplFlags
 from .deleteUMPManifestLinks import main as deleteUMPManifests
-from .parseDownloadRequests import main as parseDownloads
 from .addFulfillManifest import main as fulfillManifest