-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
SFR-2047: Initial Counter 5 Functionality (#317)
- Loading branch information
1 parent
ee49519
commit d496f65
Showing
19 changed files
with
338 additions
and
92 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from nypl_py_utils.functions.log_helper import create_log | ||
from analytics.upress_reporting.models.reports.downloads import DownloadsReport | ||
|
||
|
||
class Counter5Controller:
    """Coordinates the generation of Counter 5 reports for a reporting period."""

    def __init__(self, reporting_period):
        """Store the reporting period and start with no publishers registered."""
        self.reporting_period = reporting_period
        self.publishers = []
        self.logger = create_log("counter_5_controller")

    def _setup_reports(self, publisher, date_range):
        """Return the list of report objects to build for one publisher."""
        downloads_report = DownloadsReport(publisher, date_range)
        return [downloads_report]

    def pull_reports(self):
        """
        Log that report pulling has started.

        Returns True as soon as at least one publisher is present and None
        when `self.publishers` is empty; no report is built here yet.
        """
        self.logger.info("Pulling Counter 5 reports...")
        for _publisher in self.publishers:
            return True
        return None
140 changes: 140 additions & 0 deletions
140
analytics/upress_reporting/helpers/download_data_aggregator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
import os | ||
import boto3 | ||
import re | ||
|
||
from analytics.upress_reporting.models.data.download_event import DownloadEvent | ||
from logger import createLog | ||
from model import Edition, Item, Link | ||
from model.postgres.item import ITEM_LINKS | ||
from managers import DBManager | ||
|
||
# Regexes needed to parse S3 logs.
# Literal dots are escaped so that "." cannot match an arbitrary character.
REQUEST_REGEX = r"REST\.GET\.OBJECT "
# File ID includes the file name for the pdf object. The key is captured as a
# single whitespace-free token (plus the one trailing whitespace character,
# which callers strip) so the match cannot run on into the quoted request URI.
# NOTE(review): assumes object keys appear without raw spaces in the access
# log (S3 URL-encodes them) - confirm against real log lines.
FILE_ID_REGEX = r"REST\.GET\.OBJECT (\S+pdf\s)"
# First bracketed field only; a greedy ".+" would extend to the last "]" on
# the line (e.g. one inside a user-agent string).
TIMESTAMP_REGEX = r"\[[^\]]+\]"
# NOTE(review): only the QA host is matched here - confirm whether production
# referrers should be counted as well.
REFERRER_REGEX = r"https://drb-qa\.nypl\.org/"
|
||
|
||
class DownloadDataAggregator:
    """
    Parses S3 download logs and generates list of DownloadEvents, each corresponding
    to a single download request.
    """

    def __init__(self, publisher, date_range):
        """
        Create the S3 client and database session used for log parsing.

        publisher: value compared against `Item.source` when matching downloads.
        date_range: iterable of date-like objects (each must support
            `strftime`); one log folder is read per entry.

        The bucket name and log path prefix are read from the
        DOWNLOAD_BUCKET and DOWNLOAD_LOG_PATH environment variables.
        """
        self.publisher = publisher
        self.date_range = date_range

        self.s3_client = boto3.client("s3")
        self.bucket_name = os.environ.get("DOWNLOAD_BUCKET", None)
        self.log_path = os.environ.get("DOWNLOAD_LOG_PATH", None)

        self._setup_db_manager()
        self.logger = createLog("download_request_parser")

    def pull_download_events(self):
        """
        Returns list of DownloadEvents in a given reporting period.

        The database connection is closed when this method finishes, including
        when parsing raises, so the aggregator is not left holding it open.
        """
        download_events = []

        try:
            for date in self.date_range:
                folder_name = date.strftime("%Y/%m/%d")
                batch = self._load_batch(folder_name)
                download_events.extend(self._parse_logs(batch))
        finally:
            self.db_manager.closeConnection()

        return download_events

    def _setup_db_manager(self):
        """Build the DBManager from POSTGRES_* environment variables and open a session."""
        self.db_manager = DBManager(
            user=os.environ.get("POSTGRES_USER", None),
            pswd=os.environ.get("POSTGRES_PSWD", None),
            host=os.environ.get("POSTGRES_HOST", None),
            port=os.environ.get("POSTGRES_PORT", None),
            db=os.environ.get("POSTGRES_NAME", None),
        )
        self.db_manager.generateEngine()
        self.db_manager.createSession()

    def _load_batch(self, log_folder):
        """Return a `list_objects_v2` page iterator for one day's log folder."""
        prefix = self.log_path + log_folder + "/"
        paginator = self.s3_client.get_paginator("list_objects_v2")
        return paginator.paginate(Bucket=self.bucket_name, Prefix=prefix)

    def _parse_logs(self, batch):
        """
        The edition title, identifier, and timestamp are parsed out of the
        S3 server access log files for UMP download requests.

        batch: iterable of `list_objects_v2` response pages.
        Returns a list of DownloadEvent objects, one per matching log line.
        Raises DownloadParsingError when a page lists no objects.
        """
        downloads_in_batch = []

        for page in batch:
            if "Contents" not in page:
                path = self._redact_s3_path(page.get("Prefix", ""))
                raise DownloadParsingError(
                    f"Log files in path {path} do not exist.")

            for content in page["Contents"]:
                log_object = self.s3_client.get_object(
                    Bucket=self.bucket_name, Key=str(content["Key"])
                )
                # Each line of the log object is one access-log record.
                for raw_line in log_object["Body"].iter_lines():
                    parsed = self._match_log_info_with_frbr_data(
                        raw_line.decode("utf8"))
                    if parsed:
                        title, timestamp, edition_id = parsed
                        downloads_in_batch.append(
                            DownloadEvent(title, timestamp, edition_id))

        return downloads_in_batch

    def _redact_s3_path(self, path):
        """
        Used to remove sensitive data from S3 prefix before passing to error message.
        Example input = "logs/123456789/us-east-1/ump-pdf-repository/2024/1/1"
        Example output: "logs/NYPL_AWS_ID/us-east-1/ump-pdf-repository/2024/1/1"

        A path without a second "/"-separated segment is returned unchanged.
        """
        split_path = path.split("/")
        if len(split_path) > 1:
            split_path[1] = "NYPL_AWS_ID"
        return "/".join(split_path)

    def _match_log_info_with_frbr_data(self, log_object):
        """
        Match one access-log line against the database.

        log_object: a single decoded log line.
        Returns `[title, timestamp, edition_id]` for a GET request that carries
        the expected referrer and was not denied, or None when the line does
        not qualify or lacks a timestamp / PDF key. When no edition is found
        for the key, title is "" and edition_id is None.
        """
        is_get_request = re.search(REQUEST_REGEX, log_object)
        has_referrer = re.search(REFERRER_REGEX, log_object)
        if not (is_get_request and has_referrer):
            return None
        if "403 AccessDenied" in log_object:
            return None

        match_time = re.search(TIMESTAMP_REGEX, log_object)
        match_file_id = re.search(FILE_ID_REGEX, log_object)
        # A GET for a non-PDF object (or a malformed line) has no file id;
        # skip it rather than fail on `.group()` of None.
        if match_time is None or match_file_id is None:
            return None

        link_fragment = match_file_id.group(1).strip()
        title_parse = ""
        id_parse = None

        # NOTE(review): this issues one Link query per publisher item for
        # every log line; a single joined query would likely be much cheaper.
        session = self.db_manager.session
        for item in session.query(Item).filter(Item.source == self.publisher):
            matching_links = (
                session.query(Link)
                .join(ITEM_LINKS)
                .filter(ITEM_LINKS.c.item_id == item.id)
                .filter(Link.media_type == "application/pdf")
                .filter(Link.url.contains(link_fragment))
                .all()
            )
            for _link in matching_links:
                for edit in session.query(Edition).filter(
                    Edition.id == item.edition_id
                ):
                    title_parse = edit.title
                    id_parse = edit.id

        return [title_parse, match_time.group(0), id_parse]
|
||
|
||
class DownloadParsingError(Exception):
    """Raised when the expected S3 download log files cannot be found or parsed."""

    def __init__(self, message=None):
        # Forward the message to Exception so str(error) shows it even when
        # it is supplied by keyword.
        if message is None:
            super().__init__()
        else:
            super().__init__(message)
        self.message = message
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from dataclasses import dataclass | ||
|
||
@dataclass
class DownloadEvent:
    """A single download request taken from the logs."""

    # Title of the edition that was downloaded.
    title: str
    # Timestamp text as it was extracted from the log line.
    timestamp: str
    # Identifier of the matching edition.
    edition_id: int
45 changes: 45 additions & 0 deletions
45
analytics/upress_reporting/models/reports/counter_5_report.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import pandas | ||
import re | ||
import uuid | ||
|
||
from abc import ABC, abstractmethod | ||
from datetime import datetime | ||
|
||
|
||
class Counter5Report(ABC):
    """Abstract base for Counter 5 reports; holds the metadata shared by all of them."""

    def __init__(self, publisher, reporting_period=None):
        """
        publisher: publisher the report is generated for.
        reporting_period: string such as "2024-01-01 to 2024-12-31"; when
            omitted, January of the current year is used.
        """
        self.publisher = publisher
        self.created = datetime.today().strftime("%Y-%m-%d")
        self.created_by = "NYPL"
        if reporting_period is not None:
            self.reporting_period = reporting_period
        else:
            # Set reporting period to first month of current year
            # TODO: determine default reporting period
            year = datetime.now().year
            self.reporting_period = f"{year}-01-01 to {year}-01-31"

    @abstractmethod
    def build_header(self) -> dict:
        """Return the header fields of the report as a dict."""
        return

    @abstractmethod
    def build_report(self):
        """Produce the report."""
        return

    def generate_report_id(self):
        """Return a new random UUID to identify a report."""
        return uuid.uuid4()

    def parse_reporting_period(self, reporting_period):
        """
        Input: String with date range in Y-m-d format (ex. "2024-01-01 to 2024-12-31")
        Output: Pandas date_range object

        Raises ValueError when the string is not two dates joined by " to ",
        instead of silently returning None and failing later in the caller.
        """
        # Four-digit year starting with 20, then month and day, each
        # optionally preceded by a "-", "." or "/" separator.
        date_pattern = r"20\d{2}[-./]?\d{2}[-./]?\d{2}"

        # A literal " to " keeps the check consistent with the split below.
        if not re.fullmatch(
            f"{date_pattern} to {date_pattern}", reporting_period
        ):
            raise ValueError(
                f"Invalid reporting period {reporting_period!r}; "
                'expected format "YYYY-MM-DD to YYYY-MM-DD"'
            )

        start, end = reporting_period.split(" to ")
        return pandas.date_range(start=start, end=end)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from analytics.upress_reporting.models.reports.counter_5_report import Counter5Report | ||
|
||
|
||
class CountryLevelReport(Counter5Report):
    """Placeholder for the country-level Counter 5 report; no report logic is defined yet."""

    def __init__(self, publisher, reporting_period=None):
        # Pass the optional reporting period through so this report is not
        # limited to the base class's default period.
        super().__init__(publisher, reporting_period)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import csv | ||
|
||
from analytics.upress_reporting.helpers.download_data_aggregator import ( | ||
DownloadDataAggregator, | ||
) | ||
from analytics.upress_reporting.models.reports.counter_5_report import Counter5Report | ||
|
||
# Column names for the per-title section of the downloads report.
ADDITIONAL_HEADERS = [
    "Book Title",
    "Book ID",
    "Authors",
    "ISBN",
    "eISBN",
    "Copyright Year",
    "Disciplines",
    "Usage Type",
    "Reporting Period Total",
]
|
||
|
||
class DownloadsReport(Counter5Report):
    """Counter 5 report listing download events for one publisher."""

    def __init__(self, *args):
        """Accept the same positional arguments as Counter5Report and prepare the log aggregator."""
        super().__init__(*args)
        pandas_date_range = self.parse_reporting_period(self.reporting_period)
        self.download_request_parser = DownloadDataAggregator(
            self.publisher, pandas_date_range)

    def build_header(self):
        """Return the report header fields, including a freshly generated report ID."""
        return {
            "Report_Name": "NYPL DRB Book Usage by Title / Downloads",
            "Report_ID": self.generate_report_id(),
            "Report_Description": "Downloads of your books from NYPL's Digital Research Books by title.",
            "Publisher_Name": self.publisher,
            "Reporting_Period": self.reporting_period,
            "Created": self.created,
            "Created_By": self.created_by,
        }

    def build_report(self):
        """
        Write the report to 'counter_5_downloads_report.csv' in the working directory.

        The file holds one key/value row per header field, a blank row, a
        column-name row, and then one row per download event.
        """
        download_events = self.download_request_parser.pull_download_events()
        header = self.build_header()
        # newline="" lets the csv module control line endings (otherwise extra
        # blank rows appear on Windows); utf-8 keeps non-ASCII titles writable
        # regardless of the platform's default encoding.
        with open('counter_5_downloads_report.csv', 'w',
                  newline="", encoding="utf-8") as csv_file:
            writer = csv.writer(csv_file)
            writer.writerows([key, value] for key, value in header.items())
            writer.writerow([])
            # following code is temp until we can integrate DRB data
            writer.writerow(["Title", "Timestamp", "Edition_ID"])
            writer.writerows(
                [event.title, event.timestamp, event.edition_id]
                for event in download_events
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from analytics.upress_reporting.models.reports.counter_5_report import Counter5Report | ||
|
||
|
||
class TotalUsageReport(Counter5Report):
    """Placeholder for the total-usage Counter 5 report; no report logic is defined yet."""

    def __init__(self, publisher, reporting_period=None):
        # Pass the optional reporting period through so this report is not
        # limited to the base class's default period.
        super().__init__(publisher, reporting_period)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from analytics.upress_reporting.models.reports.counter_5_report import Counter5Report | ||
|
||
|
||
class ViewsReport(Counter5Report):
    """Placeholder for the views Counter 5 report; no report logic is defined yet."""

    def __init__(self, publisher, reporting_period=None):
        # Pass the optional reporting period through so this report is not
        # limited to the base class's default period.
        super().__init__(publisher, reporting_period)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import sys | ||
|
||
from analytics.upress_reporting.models.reports.downloads import DownloadsReport | ||
from datetime import datetime | ||
from logger import createLog | ||
from main import loadEnvFile | ||
|
||
|
||
def main():
    """
    Generate the downloads report for the "UofM" publisher.

    The first command-line argument, when present, is used as the reporting
    period (e.g. "2024-01-01 to 2024-12-31"); otherwise the report's default
    period is used.
    """
    # The string given to createLog is the logger's name; the progress
    # message is logged separately instead of being used as that name.
    # NOTE(review): assumes createLog returns a standard logger with .info().
    logger = createLog("counter_5_report_generator")
    logger.info("Generating Counter 5 reports...")
    loadEnvFile('local-compose', fileString='config/local-compose.yaml')

    if len(sys.argv) <= 1:
        print(
            f"No reporting period passed in. Generating report for Jan {datetime.now().year}!")
        downloads_report = DownloadsReport("UofM")
    else:
        downloads_report = DownloadsReport("UofM", sys.argv[1])

    downloads_report.build_report()


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.