From 60fbb29c46f66ee22adfb1aae5c2c48b7c18ebac Mon Sep 17 00:00:00 2001 From: bturkus Date: Wed, 13 Nov 2024 15:19:49 -0500 Subject: [PATCH] Update path_grabber.py --- ami_scripts/old_scripts/path_grabber.py | 96 +++++++++++++++++-------- 1 file changed, 66 insertions(+), 30 deletions(-) mode change 100644 => 100755 ami_scripts/old_scripts/path_grabber.py diff --git a/ami_scripts/old_scripts/path_grabber.py b/ami_scripts/old_scripts/path_grabber.py old mode 100644 new mode 100755 index 101ff036..b65a0ce0 --- a/ami_scripts/old_scripts/path_grabber.py +++ b/ami_scripts/old_scripts/path_grabber.py @@ -2,52 +2,88 @@ import argparse import os -import glob -import logging import csv -import re +import logging +from pathlib import Path LOGGER = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) def _make_parser(): parser = argparse.ArgumentParser(description="Pull MediaInfo from a bunch of video or audio files") parser.add_argument("-d", "--directory", - help = "path to folder full of media files", - required = False) - parser.add_argument("-f", "--file", - help = "path to folder full of media files", - required = False) + help="Path to folder full of media files", + required=True) parser.add_argument("-o", "--output", - help = "path to save csv", - required = True) + help="Path to save CSV", + required=True) + parser.add_argument("-c", "--checkpoint", + help="Path to a checkpoint file for resuming progress", + required=False) + return parser +def load_checkpoint(checkpoint_file): + if checkpoint_file and os.path.exists(checkpoint_file): + with open(checkpoint_file, 'r') as f: + processed_files = {line.strip() for line in f} + LOGGER.info(f"Loaded {len(processed_files)} entries from checkpoint.") + return processed_files + return set() - return parser +def save_checkpoint(checkpoint_file, processed_files): + if checkpoint_file: + with open(checkpoint_file, 'w') as f: + f.writelines(f"{file}\n" for file in processed_files) + LOGGER.info(f"Checkpoint saved with {len(processed_files)} entries.") +def gather_files(directory, extensions, processed_files): + LOGGER.info("Gathering files...") + for root, _, files in os.walk(directory): + for file in files: + if any(file.lower().endswith(ext) for ext in extensions): + file_path = os.path.join(root, file) + if file_path not in processed_files: + yield file_path def main(): parser = _make_parser() args = parser.parse_args() - files_to_examine = [] - - #validate that dir exists and add all files to queue - if args.directory: - if os.path.isdir(args.directory): - glob_abspath = os.path.abspath(os.path.join(args.directory, '**/*')) - for filename in glob.glob(glob_abspath, recursive = True): - if filename.endswith(('.mkv', '.mov', '.json', '.wav', '.WAV', '.mp4', '.dv', '.iso', '.flac')): - files_to_examine.append(filename) - - all_data = [] - for file in files_to_examine: - file_data = [file] - all_data.append(file_data) - - with open(args.output, 'w') as f: - md_csv = csv.writer(f) - md_csv.writerow(['filePath']) - md_csv.writerows(all_data) + directory = args.directory + output_file = args.output + checkpoint_file = args.checkpoint + extensions = ['.mkv', '.mov', '.json', '.wav', '.mp4', '.dv', '.iso', '.flac'] + + if not os.path.isdir(directory): + LOGGER.error("The specified directory does not exist.") + return + + # Load checkpoint if available + processed_files = load_checkpoint(checkpoint_file) + + # Open output file in append mode + with open(output_file, 'a', newline='') as f: + writer = csv.writer(f) + if os.path.getsize(output_file) == 0: # Write header if file is empty + writer.writerow(['filePath']) + + try: + # Iterate over files and write to CSV + for file_path in gather_files(directory, extensions, processed_files): + writer.writerow([file_path]) + processed_files.add(file_path) + + # Save checkpoint periodically + if len(processed_files) % 1000 == 0: + save_checkpoint(checkpoint_file, processed_files) + LOGGER.info(f"Processed {len(processed_files)} files so far.") + except KeyboardInterrupt: + LOGGER.warning("Process interrupted. Saving progress...") + save_checkpoint(checkpoint_file, processed_files) + + # Final checkpoint save + save_checkpoint(checkpoint_file, processed_files) + LOGGER.info(f"Processing complete. Total files processed: {len(processed_files)}") if __name__ == "__main__": main()