SFR-2047: Initial Counter 5 Functionality (#317)
fatimarahman authored Jul 31, 2024
1 parent ee49519 commit d496f65
Showing 19 changed files with 338 additions and 92 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -2,7 +2,7 @@

## unreleased version -- v0.13.1
## Added
- New script to parse download requests from S3 log files for UMP books
- New analytics folder for University Press project code. Contains methodology for generating Counter 5 reports
- New script to update current UofM manifests with fulfill endpoints to replace pdf/epub urls
- Updated README with appendix and additions to available processes
- New process to add fulfill urls to Limited Access manifests and update fulfill_limited_access flags to True
12 changes: 12 additions & 0 deletions README.md
@@ -22,6 +22,18 @@ Using these retrieved records, and matched records from the DCDW as a corpus, th

This application is built as a monorepo, which can be built as a single Docker container. This container can be run to execute different processes, which either execute discrete tasks or start a persistent service (such as a Flask API instance). The monorepo structure allows for a high degree of code reuse and makes extending existing services/adding new services easier as they can be based on existing patterns. Many of the modules include abstract base classes that define the mandatory methods for each service.
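The `Counter5Report` base class added in this commit follows that abstract-base-class pattern. In miniature, the convention looks like this (an illustrative sketch, not an actual DRB module):

```
from abc import ABC, abstractmethod


class ExampleService(ABC):
    """Hypothetical base class illustrating the repo-wide pattern."""

    @abstractmethod
    def build_report(self):
        """Every concrete service must implement the mandatory methods."""


class ConcreteService(ExampleService):
    def build_report(self):
        return "report"
```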

## Analytics

Analytics projects are stored separately from other DRB processes and scripts, in the [analytics](analytics) folder. Each analytics project is listed below:
* [University Press Backlist Project](analytics/upress_reporting): generates [Counter 5 reports](https://airtable.com/appBoLf4lMofecGPU/tblIjRKk0fnoGOqMo?blocks=hide) for a given timeframe. Be sure to set the `DOWNLOAD_BUCKET` and `DOWNLOAD_LOG_PATH` environment variables (example values below). To generate Counter 5 reports, run the following:
```
python3 analytics/upress_reporting/runner.py <REPORTING PERIOD>
```
Here is an example command:
```
python3 analytics/upress_reporting/runner.py "2024-03-01 to 2024-03-30"
```
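Both environment variables are also defined in the environment config files added in this commit (for example, `config/development.yaml`):
```
# ANALYTICS CONFIGURATION
DOWNLOAD_BUCKET: "ump-pdf-repository-logs"
DOWNLOAD_LOG_PATH: "logs/946183545209/us-east-1/ump-pdf-repository/"
```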
### Local Development
Locally, these services can be run in two modes:
Empty file.
20 changes: 20 additions & 0 deletions analytics/upress_reporting/counter_5_controller.py
@@ -0,0 +1,20 @@
from nypl_py_utils.functions.log_helper import create_log
from analytics.upress_reporting.models.reports.downloads import DownloadsReport


class Counter5Controller:
"""Class for orchestrating various Counter 5 reports"""
def __init__(self, reporting_period):
self.logger = create_log("counter_5_controller")
self.publishers = []
self.reporting_period = reporting_period

def _setup_reports(self, publisher, date_range):
return [
DownloadsReport(publisher, date_range)
]

    def pull_reports(self):
        self.logger.info("Pulling Counter 5 reports...")
        for publisher in self.publishers:
            for report in self._setup_reports(publisher, self.reporting_period):
                report.build_report()
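A minimal usage sketch of the controller (the `publishers` list is initialized empty above, so the caller supplies it; `"UofM"` mirrors the publisher code used in `runner.py`):
```
from analytics.upress_reporting.counter_5_controller import Counter5Controller

controller = Counter5Controller("2024-03-01 to 2024-03-30")
controller.publishers = ["UofM"]  # hypothetical: no publishers are registered by default
controller.pull_reports()
```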
140 changes: 140 additions & 0 deletions analytics/upress_reporting/helpers/download_data_aggregator.py
@@ -0,0 +1,140 @@
import os
import boto3
import re

from analytics.upress_reporting.models.data.download_event import DownloadEvent
from logger import createLog
from model import Edition, Item, Link
from model.postgres.item import ITEM_LINKS
from managers import DBManager

# Regexes needed to parse S3 logs
REQUEST_REGEX = r"REST.GET.OBJECT "
# File ID includes the file name for the pdf object
FILE_ID_REGEX = r"REST.GET.OBJECT (.+pdf\s)"
TIMESTAMP_REGEX = r"\[.+\]"
REFERRER_REGEX = r"https://drb-qa.nypl.org/"


class DownloadDataAggregator:
"""
    Parses S3 download logs and generates a list of DownloadEvents, each
    corresponding to a single download request.
"""

def __init__(self, publisher, date_range):
self.publisher = publisher
self.date_range = date_range

self.s3_client = boto3.client("s3")
self.bucket_name = os.environ.get("DOWNLOAD_BUCKET", None)
self.log_path = os.environ.get("DOWNLOAD_LOG_PATH", None)

self._setup_db_manager()
self.logger = createLog("download_request_parser")

def pull_download_events(self):
'''
        Returns a list of DownloadEvents in the given reporting period.
'''
download_events = []

for date in self.date_range:
folder_name = date.strftime("%Y/%m/%d")
batch = self._load_batch(folder_name)
downloads_per_day = self._parse_logs(batch)
download_events.extend(downloads_per_day)

self.db_manager.closeConnection()
return download_events

def _setup_db_manager(self):
self.db_manager = DBManager(
user=os.environ.get("POSTGRES_USER", None),
pswd=os.environ.get("POSTGRES_PSWD", None),
host=os.environ.get("POSTGRES_HOST", None),
port=os.environ.get("POSTGRES_PORT", None),
db=os.environ.get("POSTGRES_NAME", None),
)
self.db_manager.generateEngine()
self.db_manager.createSession()

def _load_batch(self, log_folder):
prefix = self.log_path + log_folder + "/"
paginator = self.s3_client.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(
Bucket=self.bucket_name, Prefix=prefix)
return page_iterator

    def _parse_logs(self, batch):
        '''
        The edition title, identifier, and timestamp are parsed out of the
        S3 server access log files for UMP download requests.
        '''
        downloads_in_batch = []

        for log_file in batch:
            if "Contents" not in log_file:
                path = self._redact_s3_path(log_file["Prefix"])
                raise DownloadParsingError(
                    f"Log files in path {path} do not exist.")
            for content in log_file["Contents"]:
                log_object = self.s3_client.get_object(
                    Bucket=self.bucket_name, Key=str(content["Key"])
                )
                for line in log_object["Body"].iter_lines():
                    parse_tuple = self._match_log_info_with_frbr_data(
                        line.decode("utf8"))
                    if parse_tuple:
                        downloads_in_batch.append(DownloadEvent(
                            parse_tuple[0], parse_tuple[1], parse_tuple[2]))
        return downloads_in_batch

def _redact_s3_path(self, path):
'''
        Removes sensitive data from an S3 prefix before it is passed to an error message.
        Example input:  "logs/123456789/us-east-1/ump-pdf-repository/2024/1/1"
        Example output: "logs/NYPL_AWS_ID/us-east-1/ump-pdf-repository/2024/1/1"
'''
split_path = path.split("/")
split_path[1] = "NYPL_AWS_ID"
return "/".join(split_path)

    def _match_log_info_with_frbr_data(self, log_object):
        match_request = re.search(REQUEST_REGEX, log_object)
        match_referrer = re.search(REFERRER_REGEX, log_object)

        if match_request and match_referrer and "403 AccessDenied" not in log_object:
            match_time = re.search(TIMESTAMP_REGEX, log_object)
            match_file_id = re.search(FILE_ID_REGEX, log_object)
            if match_file_id is None:
                # Request was not for a PDF object; nothing to report
                return None
            link_group = match_file_id.group(1)
            title_parse = ""
            id_parse = None

            for item in self.db_manager.session.query(Item).filter(
                Item.source == self.publisher
            ):
                for link in (
                    self.db_manager.session.query(Link)
                    .join(ITEM_LINKS)
                    .filter(ITEM_LINKS.c.item_id == item.id)
                    .filter(Link.media_type == "application/pdf")
                    .filter(Link.url.contains(link_group.strip()))
                    .all()
                ):
                    for edition in self.db_manager.session.query(Edition).filter(
                        Edition.id == item.edition_id
                    ):
                        title_parse = edition.title
                        id_parse = edition.id

            return [title_parse, match_time.group(0), id_parse]


class DownloadParsingError(Exception):
def __init__(self, message=None):
self.message = message
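For reference, a minimal sketch of how the regexes above pull fields out of a single S3 server access log line. The log line below is hypothetical and abbreviated, following the general S3 server access log layout:
```
import re

REQUEST_REGEX = r"REST.GET.OBJECT "
FILE_ID_REGEX = r"REST.GET.OBJECT (.+pdf\s)"
TIMESTAMP_REGEX = r"\[.+\]"
REFERRER_REGEX = r"https://drb-qa.nypl.org/"

# Hypothetical, abbreviated S3 server access log line
sample = (
    '79a59df900b949e5 ump-pdf-repository '
    '[01/Mar/2024:12:00:00 +0000] 10.0.0.1 - ABC123 '
    'REST.GET.OBJECT titles/9781234567890.pdf '
    '"GET /titles/9781234567890.pdf?x=y HTTP/1.1" 200 - 1024 - 8 7 '
    '"https://drb-qa.nypl.org/" "Mozilla/5.0" -'
)

assert re.search(REQUEST_REGEX, sample) and re.search(REFERRER_REGEX, sample)
print(re.search(FILE_ID_REGEX, sample).group(1).strip())
# titles/9781234567890.pdf
print(re.search(TIMESTAMP_REGEX, sample).group(0))
# [01/Mar/2024:12:00:00 +0000]
```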
7 changes: 7 additions & 0 deletions analytics/upress_reporting/models/data/download_event.py
@@ -0,0 +1,7 @@
from dataclasses import dataclass

@dataclass
class DownloadEvent:
title: str
timestamp: str
edition_id: int
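A quick construction example with hypothetical values, assuming the dataclass above is importable:
```
from analytics.upress_reporting.models.data.download_event import DownloadEvent

event = DownloadEvent("Example Title", "[01/Mar/2024:12:00:00 +0000]", 123)
print(event)
# DownloadEvent(title='Example Title', timestamp='[01/Mar/2024:12:00:00 +0000]', edition_id=123)
```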
45 changes: 45 additions & 0 deletions analytics/upress_reporting/models/reports/counter_5_report.py
@@ -0,0 +1,45 @@
import pandas
import re
import uuid

from abc import ABC, abstractmethod
from datetime import datetime


class Counter5Report(ABC):
def __init__(self, publisher, reporting_period=None):
self.publisher = publisher
self.created = datetime.today().strftime("%Y-%m-%d")
self.created_by = "NYPL"
if reporting_period is not None:
self.reporting_period = reporting_period
else:
# Set reporting period to first month of current year
# TODO: determine default reporting period
self.reporting_period = (
f"{datetime.now().year}-01-01 to {datetime.now().year}-01-31"
)

@abstractmethod
def build_header(self) -> dict:
return

@abstractmethod
def build_report(self):
return

def generate_report_id(self):
return uuid.uuid4()

    def parse_reporting_period(self, reporting_period):
        """
        Input: String with date range in Y-m-d format (ex. "2024-01-01 to 2024-12-31")
        Output: Pandas date_range object
        """
        date_pattern = "20[0-9][0-9](.|-|)(\\d\\d)(.|-|)(\\d\\d)"

        if re.search(
            ("^" + date_pattern + "\\sto\\s" + date_pattern), reporting_period
        ):
            start, end = reporting_period.split(" to ")
            return pandas.date_range(start=start, end=end)
        # Fail loudly on malformed input rather than implicitly returning None
        raise ValueError(
            f'Reporting period "{reporting_period}" does not match the '
            'expected "YYYY-MM-DD to YYYY-MM-DD" format')
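For reference, the parsed range is what `DownloadDataAggregator` iterates over, formatting each date as a `YYYY/MM/DD` S3 folder name:
```
import pandas

dates = pandas.date_range(start="2024-03-01", end="2024-03-03")
print([d.strftime("%Y/%m/%d") for d in dates])
# ['2024/03/01', '2024/03/02', '2024/03/03']
```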
6 changes: 6 additions & 0 deletions analytics/upress_reporting/models/reports/country_level.py
@@ -0,0 +1,6 @@
from analytics.upress_reporting.models.reports.counter_5_report import Counter5Report


class CountryLevelReport(Counter5Report):
def __init__(self, publisher):
super().__init__(publisher)
49 changes: 49 additions & 0 deletions analytics/upress_reporting/models/reports/downloads.py
@@ -0,0 +1,49 @@
import csv

from analytics.upress_reporting.helpers.download_data_aggregator import (
DownloadDataAggregator,
)
from analytics.upress_reporting.models.reports.counter_5_report import Counter5Report

ADDITIONAL_HEADERS = ["Book Title",
"Book ID",
"Authors",
"ISBN",
"eISBN",
"Copyright Year",
"Disciplines",
"Usage Type",
"Reporting Period Total"]


class DownloadsReport(Counter5Report):
def __init__(self, *args):
super().__init__(*args)
pandas_date_range = self.parse_reporting_period(self.reporting_period)
self.download_request_parser = DownloadDataAggregator(
self.publisher, pandas_date_range)

def build_header(self):
return {
"Report_Name": "NYPL DRB Book Usage by Title / Downloads",
"Report_ID": self.generate_report_id(),
"Report_Description": "Downloads of your books from NYPL's Digital Research Books by title.",
"Publisher_Name": self.publisher,
"Reporting_Period": self.reporting_period,
"Created": self.created,
"Created_By": self.created_by,
}

def build_report(self):
download_events = self.download_request_parser.pull_download_events()
header = self.build_header()
        # newline='' keeps the csv module from inserting blank rows on some platforms
        with open('counter_5_downloads_report.csv', 'w', newline='') as csv_file:
writer = csv.writer(csv_file)
for key, value in header.items():
writer.writerow([key, value])
writer.writerow([])
# following code is temp until we can integrate DRB data
writer.writerow(["Title", "Timestamp", "Edition_ID"])
for download_event in download_events:
writer.writerow(
[download_event.title, download_event.timestamp, download_event.edition_id])
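The resulting CSV interleaves the header key/value pairs, a blank row, and the event rows; an illustrative output with hypothetical values:
```
Report_Name,NYPL DRB Book Usage by Title / Downloads
Report_ID,8c6f4a2e-0b1d-4b3a-9a57-1f2e3d4c5b6a
Report_Description,Downloads of your books from NYPL's Digital Research Books by title.
Publisher_Name,UofM
Reporting_Period,2024-03-01 to 2024-03-30
Created,2024-07-31
Created_By,NYPL

Title,Timestamp,Edition_ID
Example Title,[01/Mar/2024:12:00:00 +0000],123
```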
6 changes: 6 additions & 0 deletions analytics/upress_reporting/models/reports/total_usage.py
@@ -0,0 +1,6 @@
from analytics.upress_reporting.models.reports.counter_5_report import Counter5Report


class TotalUsageReport(Counter5Report):
def __init__(self, publisher):
super().__init__(publisher)
6 changes: 6 additions & 0 deletions analytics/upress_reporting/models/reports/views.py
@@ -0,0 +1,6 @@
from analytics.upress_reporting.models.reports.counter_5_report import Counter5Report


class ViewsReport(Counter5Report):
def __init__(self, publisher):
super().__init__(publisher)
24 changes: 24 additions & 0 deletions analytics/upress_reporting/runner.py
@@ -0,0 +1,24 @@
import sys

from analytics.upress_reporting.models.reports.downloads import DownloadsReport
from datetime import datetime
from logger import createLog
from main import loadEnvFile


def main():
    logger = createLog("Generating Counter 5 reports...")
    loadEnvFile('local-compose', fileString='config/local-compose.yaml')

    if len(sys.argv) <= 1:
        print(
            f"No reporting period passed in. Generating report for Jan {datetime.now().year}!")
        downloads_report = DownloadsReport("UofM")
    else:
        downloads_report = DownloadsReport("UofM", sys.argv[1])
    downloads_report.build_report()


if __name__ == '__main__':
main()
4 changes: 4 additions & 0 deletions config/development.yaml
@@ -92,3 +92,7 @@ API_PROXY_CORS_ALLOWED: '*'

# Current NYPL Webreader version
READER_VERSION: v1

# ANALYTICS CONFIGURATION
DOWNLOAD_BUCKET: "ump-pdf-repository-logs"
DOWNLOAD_LOG_PATH: "logs/946183545209/us-east-1/ump-pdf-repository/"
6 changes: 5 additions & 1 deletion config/example.yaml
@@ -73,4 +73,8 @@ WEBPUB_PDF_PROFILE: http://librarysimplified.org/terms/profiles/pdf
API_PROXY_CORS_ALLOWED: (?:Source1:Source2)

# Current NYPL Webreader version
READER_VERSION: xxx

# ANALYTICS CONFIGURATION
DOWNLOAD_BUCKET: "ump-pdf-repository-logs"
DOWNLOAD_LOG_PATH: "logs/946183545209/us-east-1/ump-pdf-repository/"
4 changes: 4 additions & 0 deletions config/production.yaml
@@ -99,3 +99,7 @@ API_PROXY_CORS_ALLOWED: http[s]?://.*nypl.org

# Current NYPL Webreader version
READER_VERSION: v2

# ANALYTICS CONFIGURATION
DOWNLOAD_BUCKET: "ump-pdf-repository-logs"
DOWNLOAD_LOG_PATH: "logs/946183545209/us-east-1/ump-pdf-repository/"
4 changes: 4 additions & 0 deletions config/qa.yaml
@@ -99,3 +99,7 @@ API_PROXY_CORS_ALLOWED: (?:http[s]?:\/\/.*nypl.org|https:\/\/.*(?:nypl|sfr).*ver

# Current NYPL Webreader version
READER_VERSION: v2

# ANALYTICS CONFIGURATION
DOWNLOAD_BUCKET: "ump-pdf-repository-logs"
DOWNLOAD_LOG_PATH: "logs/946183545209/us-east-1/ump-pdf-repository/"
6 changes: 5 additions & 1 deletion config/sample-compose.yaml
@@ -78,4 +78,8 @@ API_PROXY_CORS_ALLOWED: '*'

READER_VERSION: v2

DEFAULT_COVER_URL: https://drb-files-qa.s3.amazonaws.com/covers/default/defaultCover.png

# ANALYTICS CONFIGURATION
DOWNLOAD_BUCKET: "ump-pdf-repository-logs"
DOWNLOAD_LOG_PATH: "logs/946183545209/us-east-1/ump-pdf-repository/"
1 change: 0 additions & 1 deletion scripts/__init__.py
@@ -14,5 +14,4 @@
from .countCABooks import main as countCA
from .nyplLoginFlags import main as nyplFlags
from .deleteUMPManifestLinks import main as deleteUMPManifests
from .parseDownloadRequests import main as parseDownloads
from .addFulfillManifest import main as fulfillManifest