-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
118 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import csv | ||
import argparse | ||
|
||
def extract_unique_ids_from_bucket(bucket_csv):
    """Collect the unique 6-digit IDs embedded in bucket object keys.

    Reads a tab-delimited listing whose first column holds an object key
    such as ``axv_211010_v01_sc.json`` and extracts the token between the
    first and second underscore. Only tokens that are exactly 6 digits are
    kept. Malformed rows are reported and skipped rather than aborting.

    Args:
        bucket_csv: Path to the tab-delimited bucket CSV file.

    Returns:
        set[str]: The unique 6-digit ID strings found in the file.
    """
    unique_ids = set()
    # newline='' is required by the csv module so it performs its own
    # newline handling (see the csv module documentation).
    with open(bucket_csv, 'r', newline='') as file:
        reader = csv.reader(file, delimiter='\t')  # Tab-delimited
        for row_num, row in enumerate(reader, start=1):
            try:
                # Extract the key (1st column) and split to get the 6-digit ID,
                # e.g. axv_211010_v01_sc.json -> 211010
                key = row[0]
                id_part = key.split('_')[1]
                if id_part.isdigit() and len(id_part) == 6:
                    unique_ids.add(id_part)
            except IndexError:
                # Row has no first column, or the key has no '_' segments.
                print(f"Skipping malformed row {row_num}: {row}")
            except Exception as e:
                # Best-effort: report and keep scanning the rest of the file.
                print(f"Error processing row {row_num}: {row}. Error: {e}")
    return unique_ids
|
||
def find_ids_with_issues(bucket_ids, streaming_csv):
    """Return bucket IDs whose streaming entry marks media as unavailable.

    Args:
        bucket_ids: Container of ID strings extracted from the bucket
            listing (membership tests should be cheap, e.g. a set).
        streaming_csv: Path to a comma-delimited CSV with at least the
            columns ``item_idf`` and ``media_available``.

    Returns:
        list[str]: IDs present in *bucket_ids* whose ``media_available``
        value is FALSE (case-insensitive, surrounding whitespace ignored),
        in file order.

    Raises:
        KeyError: If a row lacks the ``item_idf`` or ``media_available``
            column.
    """
    issues = []
    # newline='' per the csv module docs so the reader handles line
    # endings (including those inside quoted fields) itself.
    with open(streaming_csv, 'r', newline='') as file:
        reader = csv.DictReader(file)
        for row in reader:
            idf = row['item_idf']
            # strip() guards against stray whitespace from spreadsheet exports.
            media_available = row['media_available'].strip()
            if idf in bucket_ids and media_available.upper() == 'FALSE':
                issues.append(idf)
    return issues
|
||
def main():
    """CLI entry point: compare a bucket listing to the streaming list."""
    # Command-line interface.
    parser = argparse.ArgumentParser(description="Compare AWS bucket and streaming platform lists.")
    parser.add_argument('-b', '--bucket', required=True, help="Path to the AWS bucket CSV file")
    parser.add_argument('-s', '--streaming', required=True, help="Path to the streaming platform CSV file")
    parser.add_argument('-o', '--output', help="Output file to save the results", default='issues.txt')
    args = parser.parse_args()

    # Step 1: gather the unique IDs present in the bucket.
    print("Extracting unique IDs from the bucket CSV...")
    bucket_ids = extract_unique_ids_from_bucket(args.bucket)
    print(f"Found {len(bucket_ids)} unique IDs in the bucket.")

    # Step 2: cross-reference them against the streaming platform export.
    print("Comparing IDs with the streaming platform list...")
    issues = find_ids_with_issues(bucket_ids, args.streaming)
    print(f"Found {len(issues)} IDs with issues.")

    # Step 3: persist the problem list, one ID per line.
    if not issues:
        print("No issues found.")
        return
    with open(args.output, 'w') as out:
        out.writelines(f"{issue}\n" for issue in issues)
    print(f"Issues saved to {args.output}")


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import boto3 | ||
import csv | ||
import argparse | ||
|
||
def list_s3_objects(bucket_name, output_file):
    """Export every object in an S3 bucket to a CSV file.

    Writes one row per object with columns ``Key``, ``LastModified`` and
    ``Size``. On any failure the error is printed and the function
    returns normally (best-effort CLI behavior).

    Args:
        bucket_name: Name of the S3 bucket to list.
        output_file: Path of the CSV file to create/overwrite.
    """
    # Initialize S3 client (credentials come from the standard boto3 chain).
    s3 = boto3.client('s3')

    try:
        # newline='' per the csv module docs for csv.writer targets.
        with open(output_file, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Key', 'LastModified', 'Size'])  # CSV header

            total_files = 0
            # boto3's built-in paginator handles the IsTruncated /
            # ContinuationToken bookkeeping that was previously done by hand.
            paginator = s3.get_paginator('list_objects_v2')
            for page in paginator.paginate(Bucket=bucket_name):
                # 'Contents' is absent for an empty bucket/page.
                for obj in page.get('Contents', []):
                    writer.writerow([obj['Key'], obj['LastModified'], obj['Size']])
                    total_files += 1

            print(f"Export complete! {total_files} files written to {output_file}")

    except Exception as e:
        # Best-effort tool: report the failure instead of raising.
        print(f"Error: {e}")
|
||
def main():
    """CLI entry point: parse arguments and run the bucket export."""
    arg_parser = argparse.ArgumentParser(description="Export contents of an S3 bucket to a CSV file.")
    arg_parser.add_argument('-b', '--bucket', required=True, help="Name of the S3 bucket")
    arg_parser.add_argument('-o', '--out', required=True, help="Output CSV file location and name")
    parsed = arg_parser.parse_args()

    # Delegate the actual listing/export work.
    list_s3_objects(parsed.bucket, parsed.out)


if __name__ == '__main__':
    main()