Commit: Update path_grabber.py
bturkus committed Nov 13, 2024
1 parent 6890559 commit 60fbb29
Showing 1 changed file with 66 additions and 30 deletions.
ami_scripts/old_scripts/path_grabber.py: 96 changes (66 additions, 30 deletions); file mode 100644 → 100755 (now executable)
@@ -2,52 +2,88 @@
 
 import argparse
 import os
-import glob
-import logging
 import csv
 import re
+import logging
+from pathlib import Path
+
+LOGGER = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
 
 def _make_parser():
     parser = argparse.ArgumentParser(description="Pull MediaInfo from a bunch of video or audio files")
     parser.add_argument("-d", "--directory",
-                        help = "path to folder full of media files",
-                        required = False)
-    parser.add_argument("-f", "--file",
-                        help = "path to folder full of media files",
-                        required = False)
+                        help="Path to folder full of media files",
+                        required=True)
     parser.add_argument("-o", "--output",
-                        help = "path to save csv",
-                        required = True)
+                        help="Path to save CSV",
+                        required=True)
+    parser.add_argument("-c", "--checkpoint",
+                        help="Path to a checkpoint file for resuming progress",
+                        required=False)
+    return parser
+
+def load_checkpoint(checkpoint_file):
+    if checkpoint_file and os.path.exists(checkpoint_file):
+        with open(checkpoint_file, 'r') as f:
+            processed_files = {line.strip() for line in f}
+        LOGGER.info(f"Loaded {len(processed_files)} entries from checkpoint.")
+        return processed_files
+    return set()
 
-    return parser
+def save_checkpoint(checkpoint_file, processed_files):
+    if checkpoint_file:
+        with open(checkpoint_file, 'w') as f:
+            f.writelines(f"{file}\n" for file in processed_files)
+        LOGGER.info(f"Checkpoint saved with {len(processed_files)} entries.")
+
+def gather_files(directory, extensions, processed_files):
+    LOGGER.info("Gathering files...")
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if any(file.lower().endswith(ext) for ext in extensions):
+                file_path = os.path.join(root, file)
+                if file_path not in processed_files:
+                    yield file_path
 
 def main():
     parser = _make_parser()
     args = parser.parse_args()
 
-    files_to_examine = []
-
-    # validate that dir exists and add all files to queue
-    if args.directory:
-        if os.path.isdir(args.directory):
-            glob_abspath = os.path.abspath(os.path.join(args.directory, '**/*'))
-            for filename in glob.glob(glob_abspath, recursive = True):
-                if filename.endswith(('.mkv', '.mov', '.json', '.wav', '.WAV', '.mp4', '.dv', '.iso', '.flac')):
-                    files_to_examine.append(filename)
-
-    all_data = []
-    for file in files_to_examine:
-        file_data = [file]
-        all_data.append(file_data)
-
-    with open(args.output, 'w') as f:
-        md_csv = csv.writer(f)
-        md_csv.writerow(['filePath'])
-        md_csv.writerows(all_data)
+    directory = args.directory
+    output_file = args.output
+    checkpoint_file = args.checkpoint
+    extensions = ['.mkv', '.mov', '.json', '.wav', '.mp4', '.dv', '.iso', '.flac']
 
+    if not os.path.isdir(directory):
+        LOGGER.error("The specified directory does not exist.")
+        return
+
+    # Load checkpoint if available
+    processed_files = load_checkpoint(checkpoint_file)
+
+    # Open output file in append mode
+    with open(output_file, 'a', newline='') as f:
+        writer = csv.writer(f)
+        if os.path.getsize(output_file) == 0:  # Write header if file is empty
+            writer.writerow(['filePath'])
+
+        try:
+            # Iterate over files and write to CSV
+            for file_path in gather_files(directory, extensions, processed_files):
+                writer.writerow([file_path])
+                processed_files.add(file_path)
+
+                # Save checkpoint periodically
+                if len(processed_files) % 1000 == 0:
+                    save_checkpoint(checkpoint_file, processed_files)
+                    LOGGER.info(f"Processed {len(processed_files)} files so far.")
+        except KeyboardInterrupt:
+            LOGGER.warning("Process interrupted. Saving progress...")
+            save_checkpoint(checkpoint_file, processed_files)
+
+    # Final checkpoint save
+    save_checkpoint(checkpoint_file, processed_files)
+    LOGGER.info(f"Processing complete. Total files processed: {len(processed_files)}")
 
 if __name__ == "__main__":
     main()
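
With the flags defined in _make_parser() above, a resumable run might look like ./path_grabber.py -d /path/to/media -o files.csv -c checkpoint.txt (paths illustrative; -d and -o are now required). The checkpoint is a plain text file with one processed path per line. A minimal sketch of the round-trip that load_checkpoint() and save_checkpoint() perform, using a hypothetical temp-file location:

    import os
    import tempfile

    # Hypothetical stand-ins for media paths already written to the CSV.
    processed_files = {"/media/batch1/tape_001.mkv", "/media/batch1/tape_001.json"}

    checkpoint_file = os.path.join(tempfile.gettempdir(), "path_grabber_checkpoint.txt")

    # As in save_checkpoint(): one path per line.
    with open(checkpoint_file, 'w') as f:
        f.writelines(f"{file}\n" for file in processed_files)

    # As in load_checkpoint(): read the lines back into a set.
    with open(checkpoint_file, 'r') as f:
        resumed = {line.strip() for line in f}

    assert resumed == processed_files  # a resumed run would skip both paths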

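Two smaller behaviors worth noting: gather_files() matches extensions against file.lower(), which is why the old list's separate '.WAV' entry could be dropped, and the CSV is opened in append mode with the header written only when the file is empty, so a resumed run extends the same file. A combined sketch under those assumptions (file names are illustrative):

    import csv
    import os

    extensions = ['.mkv', '.wav']  # subset of the script's list
    names = ["TAPE_001.WAV", "clip.mkv", "notes.txt"]

    # Case-insensitive match, as in gather_files(): '.WAV' needs no extra entry.
    matched = [n for n in names if any(n.lower().endswith(ext) for ext in extensions)]
    print(matched)  # ['TAPE_001.WAV', 'clip.mkv']

    output_file = "files_demo.csv"  # hypothetical output path
    with open(output_file, 'a', newline='') as f:
        writer = csv.writer(f)
        if os.path.getsize(output_file) == 0:  # header only on the first run
            writer.writerow(['filePath'])
        writer.writerows([n] for n in matched)

Running the sketch twice appends the matched rows again without duplicating the header, which mirrors how the script resumes into an existing CSV.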