From 48b618abb40fe64378eb8033a99f8ea746f9cf94 Mon Sep 17 00:00:00 2001
From: bturkus
Date: Fri, 22 Nov 2024 16:45:13 -0500
Subject: [PATCH] add some aws/eavie scripts

---
 ami_scripts/compare_aws_eavie.py | 64 ++++++++++++++++++++++++++++++++
 ami_scripts/export_s3_to_csv.py  | 54 +++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100755 ami_scripts/compare_aws_eavie.py
 create mode 100755 ami_scripts/export_s3_to_csv.py

diff --git a/ami_scripts/compare_aws_eavie.py b/ami_scripts/compare_aws_eavie.py
new file mode 100755
index 0000000..38b3b35
--- /dev/null
+++ b/ami_scripts/compare_aws_eavie.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+import csv
+import argparse
+
+def extract_unique_ids_from_bucket(bucket_csv):
+    """Extracts unique 6-digit IDs from the bucket CSV."""
+    unique_ids = set()
+    with open(bucket_csv, 'r') as file:
+        reader = csv.reader(file, delimiter='\t')  # Tab-delimited
+        for row_num, row in enumerate(reader, start=1):
+            try:
+                # Extract the key (1st column) and split to get the 6-digit ID
+                key = row[0]
+                id_part = key.split('_')[1]  # Assuming format like axv_211010_v01_sc.json
+                if id_part.isdigit() and len(id_part) == 6:
+                    unique_ids.add(id_part)
+            except IndexError:
+                print(f"Skipping malformed row {row_num}: {row}")
+            except Exception as e:
+                print(f"Error processing row {row_num}: {row}. Error: {e}")
+    return unique_ids
+
+def find_ids_with_issues(bucket_ids, streaming_csv):
+    """Finds IDs present in the bucket but marked as FALSE in the streaming CSV."""
+    issues = []
+    with open(streaming_csv, 'r') as file:
+        reader = csv.DictReader(file)
+        for row in reader:
+            idf = row['item_idf']
+            media_available = row['media_available']
+            if idf in bucket_ids and media_available.upper() == 'FALSE':
+                issues.append(idf)
+    return issues
+
+def main():
+    # Set up argument parser
+    parser = argparse.ArgumentParser(description="Compare AWS bucket and streaming platform lists.")
+    parser.add_argument('-b', '--bucket', required=True, help="Path to the AWS bucket CSV file")
+    parser.add_argument('-s', '--streaming', required=True, help="Path to the streaming platform CSV file")
+    parser.add_argument('-o', '--output', help="Output file to save the results", default='issues.txt')
+    args = parser.parse_args()
+
+    # Extract IDs from bucket CSV
+    print("Extracting unique IDs from the bucket CSV...")
+    bucket_ids = extract_unique_ids_from_bucket(args.bucket)
+    print(f"Found {len(bucket_ids)} unique IDs in the bucket.")
+
+    # Compare with streaming platform CSV
+    print("Comparing IDs with the streaming platform list...")
+    issues = find_ids_with_issues(bucket_ids, args.streaming)
+    print(f"Found {len(issues)} IDs with issues.")
+
+    # Save results
+    if issues:
+        with open(args.output, 'w') as file:
+            for issue in issues:
+                file.write(f"{issue}\n")
+        print(f"Issues saved to {args.output}")
+    else:
+        print("No issues found.")
+
+if __name__ == '__main__':
+    main()
diff --git a/ami_scripts/export_s3_to_csv.py b/ami_scripts/export_s3_to_csv.py
new file mode 100755
index 0000000..c0389aa
--- /dev/null
+++ b/ami_scripts/export_s3_to_csv.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+
+import boto3
+import csv
+import argparse
+
+def list_s3_objects(bucket_name, output_file):
+    # Initialize S3 client
+    s3 = boto3.client('s3')
+
+    try:
+        # Write to CSV
+        with open(output_file, mode='w', newline='') as file:
+            writer = csv.writer(file)
+            writer.writerow(['Key', 'LastModified', 'Size'])  # CSV header
+
+            # Pagination logic
+            continuation_token = None
+            total_files = 0
+
+            while True:
+                if continuation_token:
+                    response = s3.list_objects_v2(Bucket=bucket_name, ContinuationToken=continuation_token)
+                else:
+                    response = s3.list_objects_v2(Bucket=bucket_name)
+
+                # Write object details to CSV
+                for obj in response.get('Contents', []):
+                    writer.writerow([obj['Key'], obj['LastModified'], obj['Size']])
+                    total_files += 1
+
+                # Check if there are more objects to fetch
+                if response.get('IsTruncated'):  # True if there are more objects to fetch
+                    continuation_token = response.get('NextContinuationToken')
+                else:
+                    break
+
+        print(f"Export complete! {total_files} files written to {output_file}")
+
+    except Exception as e:
+        print(f"Error: {e}")
+
+def main():
+    # Set up argument parser
+    parser = argparse.ArgumentParser(description="Export contents of an S3 bucket to a CSV file.")
+    parser.add_argument('-b', '--bucket', required=True, help="Name of the S3 bucket")
+    parser.add_argument('-o', '--out', required=True, help="Output CSV file location and name")
+    args = parser.parse_args()
+
+    # Call the function to list S3 objects
+    list_s3_objects(args.bucket, args.out)
+
+if __name__ == '__main__':
+    main()
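
Usage: example invocations inferred from the argparse definitions above; the
file and bucket names are hypothetical.

    python3 ami_scripts/export_s3_to_csv.py -b my-ami-bucket -o bucket_list.csv
    python3 ami_scripts/compare_aws_eavie.py -b bucket_list.tsv -s streaming_export.csv -o issues.txt

Note that compare_aws_eavie.py reads its bucket list as tab-delimited with the
object key in the first column, while export_s3_to_csv.py writes a standard
comma-delimited CSV, so an export may need conversion before being fed to the
comparison script.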
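The manual ContinuationToken loop in export_s3_to_csv.py can also be expressed
with boto3's built-in paginator, which yields one page per list_objects_v2
response and handles the token bookkeeping internally. A minimal sketch of that
alternative, assuming the same CSV columns; the bucket and output names are
hypothetical:

    # Alternative sketch: boto3's get_paginator('list_objects_v2') replaces
    # the hand-rolled IsTruncated/NextContinuationToken loop.
    import boto3
    import csv

    def list_s3_objects_paginated(bucket_name, output_file):
        s3 = boto3.client('s3')
        paginator = s3.get_paginator('list_objects_v2')
        with open(output_file, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Key', 'LastModified', 'Size'])  # same CSV header as above
            total_files = 0
            for page in paginator.paginate(Bucket=bucket_name):
                # An empty bucket returns pages without a 'Contents' key
                for obj in page.get('Contents', []):
                    writer.writerow([obj['Key'], obj['LastModified'], obj['Size']])
                    total_files += 1
        print(f"Export complete! {total_files} files written to {output_file}")

    if __name__ == '__main__':
        list_s3_objects_paginated('my-ami-bucket', 'bucket_list.csv')  # hypothetical names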