diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..9bd4e96
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,34 @@
+# Use Ubuntu as base image
+FROM ubuntu:latest
+
+# Set the working directory
+WORKDIR /app
+
+# Install Python 3 and pip
+# Print out Python and pip versions
+RUN echo "[ ] Updating package lists..." && \
+    apt-get update && \
+    echo "[ ] Installing Python 3 and pip..." && \
+    apt-get install -y python3 python3-pip && \
+    echo "[ ] Cleaning up package cache..." && \
+    apt-get clean && \
+    echo "[ ] Creating a symbolic link for Python 3..." && \
+    ln -s /usr/bin/python3 /usr/bin/python && \
+    echo "[ ] Verifying Python and pip versions..." && \
+    python --version && \
+    pip --version
+
+# Copy requirements.txt to the root directory of the image
+COPY requirements.txt /requirements.txt
+
+# Install Python dependencies from requirements.txt
+RUN python3 -m pip install -r /requirements.txt
+
+# Create a symbolic link from /app/plex_dupefinder.py to /plex_dupefinder
+RUN ln -s /app/plex_dupefinder.py /plex_dupefinder
+
+# Define a volume for the Python application
+VOLUME /app
+
+# Define default command
+ENTRYPOINT ["/plex_dupefinder"]
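A quick way to sanity-check the image after it is built (illustrative only; it assumes the `plex_dupefinder` tag used in `Dockerfile_README.md` below and uses only standard `docker` flags):

```
docker build -t plex_dupefinder app
docker run --rm --entrypoint python plex_dupefinder --version
```

The second command overrides the entrypoint to confirm that the `python` symlink created in the image works; it does not need a mounted `/app` volume.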
diff --git a/Dockerfile_README.md b/Dockerfile_README.md
new file mode 100644
index 0000000..8df98ae
--- /dev/null
+++ b/Dockerfile_README.md
@@ -0,0 +1,78 @@
+# Dockerizing the Plex Dupefinder Python Application
+
+This guide outlines the steps for building and running the Plex Dupefinder Python application as a Docker container.
+
+## Folder Structure
+
+Ensure that your project has the following folder structure:
+
+```
+plex_dupefinder/
+│
+└── app/
+    └── Dockerfile  # (Included from git checkout)
+    └── ... (all contents from the git checkout)
+```
+
+- `plex_dupefinder/`: Root directory for the Plex Dupefinder project.
+- `app/`: Contains all files and directories retrieved from the git checkout, including the Dockerfile.
+
+## Setting Up the Folder Structure
+
+Follow these steps to set up the folder structure and retrieve the Python application files:
+
+1. Open a terminal and navigate to the desired directory for your project.
+2. Run the following command to clone the repository and create the `app/` folder:
+   ```
+   git clone https://github.com/Hossy/plex_dupefinder.git app
+   ```
+
+This command clones the repository and creates the `app/` folder containing all files and directories from the git checkout, including the Dockerfile.
+
+## Building the Docker Image
+
+Follow these steps to build the Docker image for the Python application:
+
+1. Open a terminal and navigate to the root directory of your project.
+2. Run the following command to build the Docker image:
+   ```
+   docker build -t plex_dupefinder app
+   ```
+
+This command builds a Docker image named `plex_dupefinder` using the Dockerfile located in the `app/` directory.
+
+### Preserving the Locally-Built Image
+
+To preserve the locally-built image when running `docker system prune`, follow these additional steps:
+
+1. When building the Docker image, add a label to it using the `--label` flag:
+   ```
+   docker build -t plex_dupefinder --label "preserve=true" app
+   ```
+
+   This command adds the label `preserve=true` to the `plex_dupefinder` image.
+
+2. When running `docker system prune`, use the `--filter` flag to exclude images with the `preserve=true` label:
+   ```
+   docker system prune -af --filter "label!=preserve=true"
+   ```
+
+   This command prunes all unused data (containers, networks, volumes, and images) except objects carrying the `preserve=true` label, so the locally-built image is not deleted.
+
+## Running the Docker Container
+
+Once the Docker image is built, you can run it as a Docker container using the following steps:
+
+1. Run the following command to start a Docker container from the image:
+   ```
+   docker run --rm --name plex_dupefinder -v plex_dupefinder/app:/app plex_dupefinder
+   ```
+
+   - Replace `plex_dupefinder/app` with the absolute path to the `app/` directory on your system.
+
+2. If running with `SKIP_OTHER_DUPES` set to `false`, add the `-i` option to the `docker run` command:
+   ```
+   docker run -i --rm --name plex_dupefinder -v plex_dupefinder/app:/app plex_dupefinder
+   ```
+
+   The `-i` option keeps STDIN open (interactive mode), allowing input to be sent to the container.
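Because the Dockerfile uses an exec-form `ENTRYPOINT`, any arguments placed after the image name are passed straight to the script. A sketch of a non-interactive dry run (the host path is a placeholder; the `--dry-run` flag is added by this change set):

```
docker run --rm --name plex_dupefinder -v /path/to/plex_dupefinder/app:/app plex_dupefinder --dry-run
```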
diff --git a/NewFeatures_README.md b/NewFeatures_README.md
new file mode 100644
index 0000000..3ac0d3c
--- /dev/null
+++ b/NewFeatures_README.md
@@ -0,0 +1,33 @@
+# Additional Features in Plex Dupefinder
+
+This document outlines the additional features added to the Plex Dupefinder Python application.
+
+## Dry Run Option
+
+The dry run option allows users to simulate the deletion process without actually deleting any files. This option is configured using the `DRY_RUN` parameter in the `config.json` file. When enabled, Plex Dupefinder will log potential delete operations without carrying them out.
+
+Users can also pass the `--dry-run` parameter on the command line to temporarily activate this feature.
+
+## Preventing Plex Optimized Versions
+
+Plex Optimized Versions are automatically excluded from consideration as duplicates. This improves the accuracy of duplicate detection by excluding files that Plex has optimized for streaming.
+
+However, to address rare instances where Plex incorrectly identifies files as non-optimized versions, users can configure the application to exclude files located under the "Plex Versions" folder from duplicate consideration. This option is controlled by the `SKIP_PLEX_VERSIONS_FOLDER` parameter in the `config.json` file. Media files under the "Plex Versions" folder will be ignored during duplicate identification, providing an additional layer of accuracy to the duplicate detection process.
+
+## Handling Unavailable Media Files
+
+The application can now remove entries from Plex for media files marked as 'Unavailable'. This functionality is controlled by the `FIND_UNAVAILABLE` parameter in the `config.json` file. Plex Dupefinder will attempt to delete the entry associated with unavailable media files, except when Plex reports a file size for an unavailable file. This precaution prevents accidental deletion of valid files.
+
+## Deleting Extra .TS Files
+
+A new option, `FIND_EXTRA_TS`, allows users to delete all `.TS` files when a non-`.TS` file is present in the duplicate list. This is particularly useful for cleaning up recordings once a higher-quality, non-recorded version is available, keeping the media library free of leftover lower-quality copies.
+
+## Skipping Other Duplicate Checks (Batch Mode)
+
+The `SKIP_OTHER_DUPES` option in the `config.json` file enables users to skip the other duplicate checks when not using the `AUTO_DELETE` option. This allows features like `FIND_UNAVAILABLE` and `FIND_EXTRA_TS` to be run in batch mode or as a scheduled task unattended, enhancing the flexibility and automation capabilities of Plex Dupefinder.
+
+Alternatively, users can pass the `--skip-other-dupes` parameter on the command line to temporarily activate this feature.
+
+## Docker Image Support
+
+A Dockerfile is provided with Plex Dupefinder, allowing users to build a local image for running Plex Dupefinder instead of installing additional software locally. This improves portability and simplifies deployment, enabling users to run Plex Dupefinder in various environments without additional dependencies. For detailed instructions on building and using the Docker image, see `Dockerfile_README.md`.
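For reference, an illustrative `config.json` fragment that enables the new behaviour for an unattended run (only the new keys are shown; the values here are examples, and the corresponding defaults come from `config.py` below):

```
{
    "DRY_RUN": true,
    "SKIP_OTHER_DUPES": true,
    "SKIP_PLEX_VERSIONS_FOLDER": true,
    "FIND_UNAVAILABLE": true,
    "FIND_EXTRA_TS": true
}
```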
diff --git a/config.py b/config.py
index 797715e..8d2a6b3 100755
--- a/config.py
+++ b/config.py
@@ -10,6 +10,7 @@
 config_path = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), 'config.json')
 
 base_config = {
+    'DRY_RUN': False,
     'PLEX_SERVER': 'https://plex.your-server.com',
     'PLEX_TOKEN': '',
     'PLEX_LIBRARIES': {},
@@ -23,6 +24,10 @@
     'SKIP_LIST': [],
     'SCORE_FILESIZE': True,
     'AUTO_DELETE': False,
+    'SKIP_OTHER_DUPES': False,
+    'SKIP_PLEX_VERSIONS_FOLDER': True,
+    'FIND_UNAVAILABLE': False,
+    'FIND_EXTRA_TS': False,
     'FIND_DUPLICATE_FILEPATHS_ONLY': False
 }
 cfg = None
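The two command-line switches introduced in `plex_dupefinder.py` below only override the corresponding `config.json` values for a single run; the file on disk is not modified. For example (hypothetical invocation):

```
python3 plex_dupefinder.py --dry-run --skip-other-dupes
```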
diff --git a/plex_dupefinder.py b/plex_dupefinder.py
index dcdb01b..e7ffa52 100755
--- a/plex_dupefinder.py
+++ b/plex_dupefinder.py
@@ -5,6 +5,7 @@
 import os
 import sys
 import time
+import argparse
 from fnmatch import fnmatch
 
 from tabulate import tabulate
@@ -131,6 +132,8 @@ def get_media_info(item):
         'video_duration': 0,
         'file': [],
         'multipart': False,
+        'file_exists': True,  # Used with FIND_UNAVAILABLE
+        'file_exts': {},  # Used with FIND_EXTRA_TS; a dict whose keys are file extensions and whose values are the number of media parts with that extension
         'file_size': 0
     }
     # get id
@@ -192,6 +195,15 @@ def get_media_info(item):
     for part in item.parts:
         info['file'].append(part.file)
         info['file_size'] += part.size if part.size else 0
+        if cfg['FIND_UNAVAILABLE'] and not part.exists:
+            info['file_exists'] = False
+        if cfg['FIND_EXTRA_TS']:
+            name, ext = os.path.splitext(part.file)
+            ext = ext.lower()
+            if ext in info['file_exts']:
+                info['file_exts'][ext] += 1
+            else:
+                info['file_exts'][ext] = 1
 
     return info
@@ -199,10 +211,13 @@
 def delete_item(show_key, media_id):
     delete_url = urljoin(cfg['PLEX_SERVER'], '%s/media/%d' % (show_key, media_id))
     log.debug("Sending DELETE request to %r" % delete_url)
-    if requests.delete(delete_url, headers={'X-Plex-Token': cfg['PLEX_TOKEN']}).status_code == 200:
-        print("\t\tDeleted media item: %r" % media_id)
+    if cfg['DRY_RUN']:
+        print("\t\tDRY RUN -- Would've deleted media item: %r" % media_id)
     else:
-        print("\t\tError deleting media item: %r" % media_id)
+        if requests.delete(delete_url, headers={'X-Plex-Token': cfg['PLEX_TOKEN']}).status_code == 200:
+            print("\t\tDeleted media item: %r" % media_id)
+        else:
+            print("\t\tError deleting media item: %r" % media_id)
 
 
 ############################################################
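To illustrate the bookkeeping that `get_media_info()` now performs, the sketch below reproduces the extension counting on plain file names; it is standalone example code, not part of the patch:

```
import os

def count_extensions(files):
    """Count how many files share each (lower-cased) extension."""
    counts = {}
    for path in files:
        _, ext = os.path.splitext(path)
        ext = ext.lower()
        counts[ext] = counts.get(ext, 0) + 1
    return counts

# A duplicate group holding one DVR recording and one non-recorded copy:
print(count_extensions(["/tv/Show/ep1.TS", "/tv/Show/ep1.mkv"]))
# -> {'.ts': 1, '.mkv': 1}  -- more than one extension and '.ts' present,
#    so FIND_EXTRA_TS would target the all-.ts item for removal.
```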
@@ -339,6 +354,18 @@ def build_tabulated(parts, items):
     #########################################################################
     """)
     print("Initialized")
+    # process arguments
+    # Create argument parser
+    parser = argparse.ArgumentParser(description='A python script that finds duplicate versions of media (TV episodes and movies) in your Plex Library and tells Plex to remove the lowest rated files/versions (based on user-specified scoring) to leave behind a single file/version.')
+    # Add arguments
+    parser.add_argument('--dry-run', action='store_true', help='Temporarily sets DRY_RUN')
+    parser.add_argument('--skip-other-dupes', action='store_true', help='Temporarily sets SKIP_OTHER_DUPES')
+    # Parse the arguments
+    args = parser.parse_args()
+    if args.dry_run:
+        cfg['DRY_RUN'] = True
+    if args.skip_other_dupes:
+        cfg['SKIP_OTHER_DUPES'] = True
     process_later = {}
     # process sections
     print("Finding dupes...")
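One thing to note in the hunk below: the `SKIP_PLEX_VERSIONS_FOLDER` check matches the literal substring `\Plex Versions\`, i.e. Windows-style paths as reported by the Plex server. For libraries hosted on Linux, a separator-agnostic variant (an illustration only, not part of this patch) could look like:

```
def in_plex_versions_folder(path):
    # Compare path components so the check works with both '/' and '\\' separators.
    normalized = path.replace("\\", "/")
    return "Plex Versions" in normalized.split("/")

print(in_plex_versions_folder(r"D:\Media\Movies\Plex Versions\Optimized for TV\movie.mp4"))  # True
print(in_plex_versions_folder("/data/Movies/Plex Versions/Optimized for TV/movie.mp4"))      # True
```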
@@ -348,111 +375,205 @@
         # loop returned duplicates
         for item in dupes:
             if item.type == 'episode':
-                title = "%s - %02dx%02d - %s" % (
-                    item.grandparentTitle, int(item.parentIndex), int(item.index), item.title)
+                # Sometimes, item.index is None
+                if item.index is None:
+                    title = "%s - %s - %s" % (
+                        item.grandparentTitle, item.seasonEpisode, item.title)
+                else:
+                    title = "%s - %02dx%02d - %s" % (
+                        item.grandparentTitle, int(item.parentIndex), int(item.index), item.title)
             elif item.type == 'movie':
                 title = item.title
             else:
                 title = 'Unknown'
             log.info("Processing: %r", title)
+            # If we're looking for unavailable media, double-check the existence and log the latest status to debug
+            if cfg['FIND_UNAVAILABLE']:
+                # If all files are already marked as available, log it and move on
+                if all(part.exists for media in item.media for part in media.parts):
+                    log.debug("All media is available for %s", item.title)
+                # If any files are marked unavailable, tell Plex to recheck to verify
+                else:
+                    log.debug("Reloading %s", item.title)
+                    item.reload(timeout=90)  # Force a recheck if the media files exist
+                    media = parts = {}
+                    for media in item.media:
+                        for part in media.parts:
+                            log.debug("%r,%r -- %s exists = %s; size = %s", media.id, part.id, part.file, part.exists, part.size)
 
             # loop returned parts for media item (copy 1, copy 2...)
             parts = {}
             for part in item.media:
                 part_info = get_media_info(part)
+                # Skip media if it was automatically generated by Plex for optimization
+                if part.isOptimizedVersion:
+                    log.info("ID: %r (%r) -- Skipping optimized version", part.id, part_info['file'])
+                    print("ID: %r (%r) -- Skipping optimized version" % (part.id, part_info['file']))
+                    continue
+                # Check if the path contains "\\Plex Versions\\" since isOptimizedVersion is sometimes not being set correctly
+                elif cfg['SKIP_PLEX_VERSIONS_FOLDER'] and any("\\Plex Versions\\" in file_path for file_path in part_info['file']):
+                    log.info("ID: %r (%r) -- Skipping Plex Versions; isOptimizedVersion = %r", part.id, part_info['file'], part.isOptimizedVersion)
+                    print("ID: %r (%r) -- Skipping Plex Versions; isOptimizedVersion = %r" % (part.id, part_info['file'], part.isOptimizedVersion))
+                    continue
+                # Log all other instances in case troubleshooting is needed
+                else:
+                    log.debug("ID: %r (%r) -- Including; isOptimizedVersion = %r", part.id, part_info['file'], part.isOptimizedVersion)
+                    # print("ID: %r (%r) -- Including; isOptimizedVersion = %r" % (part.id, part_info['file'], part.isOptimizedVersion))
                 if not cfg['FIND_DUPLICATE_FILEPATHS_ONLY']:
                     part_info['score'] = get_score(part_info)
                 part_info['show_key'] = item.key
                 log.info("ID: %r - Score: %s - Meta:\n%r", part.id, part_info.get('score', 'N/A'), part_info)
                 parts[part.id] = part_info
-            process_later[title] = parts
+            # If, after skipping media, we still have more than 1 file in the list, keep it on the list
+            if len(parts) > 1:
+                process_later[title] = parts
+            # Skip this media if we don't have at least 2 files left in the list
+            else:
+                log.info("No duplicates after ignoring optimized versions for : %r", item.title)
+                print("No duplicates after ignoring optimized versions for : %r" % item.title)
 
     # process processed items
     time.sleep(5)
     for item, parts in process_later.items():
-        if not cfg['AUTO_DELETE']:
-            partz = {}
-            # manual delete
-            print("\nWhich media item do you wish to keep for %r ?\n" % item)
-
-            sort_key = None
-            sort_order = None
-
-            if cfg['FIND_DUPLICATE_FILEPATHS_ONLY']:
-                sort_key = "id"
-                sort_order_reverse = False
-            else:
-                sort_key = "score"
-                sort_order_reverse = True
-
-            media_items = {}
-            best_item = None
-            for pos, (media_id, part_info) in enumerate(collections.OrderedDict(
-                    sorted(parts.items(), key=lambda x: x[1][sort_key], reverse=sort_order_reverse)).items(), start=1):
-                if pos == 1:
-                    best_item = part_info
-                media_items[pos] = media_id
-                partz[media_id] = part_info
-
-            headers, data = build_tabulated(partz, media_items)
-            print(tabulate(data, headers=headers))
-
-            keep_item = input("\nChoose item to keep (0 or s = skip | 1 or b = best): ")
-            if (keep_item.lower() != 's') and (keep_item.lower() == 'b' or 0 < int(keep_item) <= len(media_items)):
-                write_decision(title=item)
-                for media_id, part_info in parts.items():
-                    if keep_item.lower() == 'b' and best_item is not None and best_item == part_info:
-                        print("\tKeeping : %r" % media_id)
-                        write_decision(keeping=part_info)
-                    elif keep_item.lower() != 'b' and len(media_items) and media_id == media_items[int(keep_item)]:
-                        print("\tKeeping : %r" % media_id)
-                        write_decision(keeping=part_info)
+        # Remove all unavailable media that are not in SKIP_LIST
+        if cfg['FIND_UNAVAILABLE']:
+            title_decided = False
+            for media_id, part_info in parts.items():
+                if not part_info['file_exists']:
+                    if not title_decided:
+                        title_decided = True
+                        write_decision(title=item)
+                    # Even if Plex is reporting the file as nonexistent, skip it if it has a file size. Plex sometimes reports files as missing even though they are accessible by Plex.
+                    if part_info['file_size'] > 0:
+                        log.info("\tSkipping removal per file size (%r) : %r - %r", part_info['file_size'], media_id, part_info['file'])
+                        print("\tSkipping removal per file size (%r) : %r - %r" % (part_info['file_size'], media_id, part_info['file']))
+                        continue
+                    if should_skip(part_info['file']):
+                        log.info("\tSkipping removal per SKIP_LIST : %r - %r", media_id, part_info['file'])
+                        print("\tSkipping removal per SKIP_LIST : %r - %r" % (media_id, part_info['file']))
+                        continue
+                    else:
-                        print("\tRemoving : %r" % media_id)
-                        delete_item(part_info['show_key'], media_id)
-                        write_decision(removed=part_info)
-                        time.sleep(2)
-            elif keep_item.lower() == 's' or int(keep_item) == 0:
-                print("Skipping deletion(s) for %r" % item)
-            else:
-                print("Unexpected response, skipping deletion(s) for %r" % item)
-        else:
-            # auto delete
-            print("\nDetermining best media item to keep for %r ..." % item)
-            keep_score = 0
-            keep_id = None
-
-            if cfg['FIND_DUPLICATE_FILEPATHS_ONLY']:
-                # select lowest id to keep
-                for media_id, part_info in parts.items():
-                    if keep_score == 0 and keep_id is None:
-                        keep_score = int(part_info['id'])
-                        keep_id = media_id
-                    elif int(part_info['id']) < keep_score:
-                        keep_score = part_info['id']
-                        keep_id = media_id
-            else:
-                # select highest score to keep
-                for media_id, part_info in parts.items():
-                    if int(part_info['score']) > keep_score:
-                        keep_score = part_info['score']
-                        keep_id = media_id
-
-            if keep_id:
-                # delete other items
-                write_decision(title=item)
-                for media_id, part_info in parts.items():
-                    if media_id == keep_id:
-                        print("\tKeeping : %r - %r" % (media_id, part_info['file']))
-                        write_decision(keeping=part_info)
+                        log.info("Removing unavailable media : %r - %r (size: %r)", media_id, part_info['file'], part_info['file_size'])
+                        print("Removing unavailable media : %r - %r (size: %r)" % (media_id, part_info['file'], part_info['file_size']))
+                        delete_item(part_info['show_key'], media_id)
+                        write_decision(removed=part_info)
+                        time.sleep(2)
+
+        # Delete .ts files when other media exists
+        if cfg['FIND_EXTRA_TS']:
+            title_decided = False
+            file_exts = {}
+            for media_id, part_info in parts.items():
+                for k, v in part_info['file_exts'].items():
+                    if k in file_exts:
+                        file_exts[k] += v
+                    else:
+                        file_exts[k] = v
+            if len(file_exts) > 1 and ".ts" in file_exts:
+                for media_id, part_info in parts.items():
+                    if ".ts" in part_info['file_exts']:
+                        if len(part_info['file_exts']) == 1:
+                            if not title_decided:
+                                title_decided = True
+                                write_decision(title=item)
+                            if should_skip(part_info['file']):
+                                log.info("\tSkipping removal per SKIP_LIST : %r - %r", media_id, part_info['file'])
+                                print("\tSkipping removal per SKIP_LIST : %r - %r" % (media_id, part_info['file']))
+                                continue
+                            else:
+                                log.info("Removing extra TS media : %r - %r", media_id, part_info['file'])
+                                print("Removing extra TS media : %r - %r" % (media_id, part_info['file']))
+                                delete_item(part_info['show_key'], media_id)
+                                write_decision(removed=part_info)
+                                time.sleep(2)
+                        else:
+                            # Not sure when this would happen, but let's note it.
+                            print("\tSkipping removal of %r as there is more than one file type that make up %r for %s." % (part_info['file'], media_id, item))
+
+        if not cfg['SKIP_OTHER_DUPES']:
+            if not cfg['AUTO_DELETE']:
+                partz = {}
+                # manual delete
+                print("\nWhich media item do you wish to keep for %r ?\n" % item)
+
+                sort_key = None
+                sort_order = None
+
+                if cfg['FIND_DUPLICATE_FILEPATHS_ONLY']:
+                    sort_key = "id"
+                    sort_order_reverse = False
+                else:
+                    sort_key = "score"
+                    sort_order_reverse = True
+
+                media_items = {}
+                best_item = None
+                for pos, (media_id, part_info) in enumerate(collections.OrderedDict(
+                        sorted(parts.items(), key=lambda x: x[1][sort_key], reverse=sort_order_reverse)).items(), start=1):
+                    if pos == 1:
+                        best_item = part_info
+                    media_items[pos] = media_id
+                    partz[media_id] = part_info
+
+                headers, data = build_tabulated(partz, media_items)
+                print(tabulate(data, headers=headers))
+
+                keep_item = input("\nChoose item to keep (0 or s = skip | 1 or b = best): ")
+                if (keep_item.lower() != 's') and (keep_item.lower() == 'b' or 0 < int(keep_item) <= len(media_items)):
+                    write_decision(title=item)
+                    for media_id, part_info in parts.items():
+                        if keep_item.lower() == 'b' and best_item is not None and best_item == part_info:
+                            print("\tKeeping : %r" % media_id)
+                            write_decision(keeping=part_info)
+                        elif keep_item.lower() != 'b' and len(media_items) and media_id == media_items[int(keep_item)]:
+                            print("\tKeeping : %r" % media_id)
+                            write_decision(keeping=part_info)
+                        else:
+                            print("\tRemoving : %r" % media_id)
+                            delete_item(part_info['show_key'], media_id)
+                            write_decision(removed=part_info)
+                            time.sleep(2)
+                elif keep_item.lower() == 's' or int(keep_item) == 0:
+                    print("Skipping deletion(s) for %r" % item)
+                else:
+                    print("Unexpected response, skipping deletion(s) for %r" % item)
+            else:
+                # auto delete
+                print("\nDetermining best media item to keep for %r ..." % item)
+                keep_score = 0
+                keep_id = None
+
+                if cfg['FIND_DUPLICATE_FILEPATHS_ONLY']:
+                    # select lowest id to keep
+                    for media_id, part_info in parts.items():
+                        if keep_score == 0 and keep_id is None:
+                            keep_score = int(part_info['id'])
+                            keep_id = media_id
+                        elif int(part_info['id']) < keep_score:
+                            keep_score = part_info['id']
+                            keep_id = media_id
+                else:
+                    # select highest score to keep
+                    for media_id, part_info in parts.items():
+                        if int(part_info['score']) > keep_score:
+                            keep_score = part_info['score']
+                            keep_id = media_id
+
+                if keep_id:
+                    # delete other items
+                    write_decision(title=item)
+                    for media_id, part_info in parts.items():
+                        if media_id == keep_id:
+                            print("\tKeeping : %r - %r" % (media_id, part_info['file']))
+                            write_decision(keeping=part_info)
+                        else:
+                            print("\tRemoving : %r - %r" % (media_id, part_info['file']))
+                            if should_skip(part_info['file']):
+                                print("\tSkipping removal of this item as there is a match in SKIP_LIST")
+                                continue
+                            delete_item(part_info['show_key'], media_id)
+                            write_decision(removed=part_info)
+                            time.sleep(2)
+                else:
+                    print("Unable to determine best media item to keep for %r", item)
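Finally, the batch-oriented options (`SKIP_OTHER_DUPES`, `FIND_UNAVAILABLE`, `FIND_EXTRA_TS`) pair naturally with a scheduled run of the Docker image. A hypothetical cron entry (schedule, paths, and log location are placeholders):

```
# Run every Sunday at 03:00, non-interactively, and keep a log of what was removed
0 3 * * 0 docker run --rm -v /path/to/plex_dupefinder/app:/app plex_dupefinder --skip-other-dupes >> /var/log/plex_dupefinder.log 2>&1
```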