From 03b169f041cf54e1e6f90541d22c47479fa10cd4 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Thu, 5 May 2022 11:37:57 +0200 Subject: [PATCH 1/9] * Support download assignment cache to datadir and using from datadir * Switch to using pip for all installs (thanks for tip from Wolfgang Maier) * Change logic for finding constellations files to use latest version --- .github/workflows/pangolin.yml | 11 ++++- pangolin/command.py | 14 +++--- pangolin/utils/config.py | 2 + pangolin/utils/data_checks.py | 12 ++--- pangolin/utils/initialising.py | 81 +++++++++++++++++++++------------- pangolin/utils/update.py | 50 +++++---------------- 6 files changed, 89 insertions(+), 81 deletions(-) diff --git a/.github/workflows/pangolin.yml b/.github/workflows/pangolin.yml index ff3a4b1..c153092 100644 --- a/.github/workflows/pangolin.yml +++ b/.github/workflows/pangolin.yml @@ -55,4 +55,13 @@ jobs: run: pangolin --update-data 2>&1 | tee pangolin_update_data.log - name: Run pangolin verbose mode run: pangolin --verbose pangolin/test/test_seqs.fasta 2>&1 | tee pangolin_verbose.log - + - name: Add assignment cache + run: pangolin --add-assignment-cache + - name: Test use-assignment-cache + run: pangolin --use-assignment-cache pangolin/test/test_seqs.fasta 2>&1 | grep 'Using pangolin-assignment cache' + - name: remove assignment cache + run: pip uninstall -y pangolin-assignment + - name: Add assignment cache to datadir + run: mkdir ac && pangolin --add-assignment-cache --datadir ac + - name: Test use-assignment-cache with datadir + run: pangolin --use-assignment-cache --datadir ac pangolin/test/test_seqs.fasta 2>&1 | grep 'Using pangolin-assignment cache' diff --git a/pangolin/command.py b/pangolin/command.py index 2bd6abb..01eeb35 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -4,17 +4,17 @@ from pangolin.utils import data_checks try: import pangolin_data -except: +except ImportError: data_checks.install_error("pangolin_data", "https://github.com/cov-lineages/pangolin-data.git") try: import scorpio -except: +except ImportError: data_checks.install_error("scorpio", "https://github.com/cov-lineages/scorpio.git") try: import constellations -except: +except ImportError: data_checks.install_error("constellations", "https://github.com/cov-lineages/constellations.git") import os @@ -110,20 +110,22 @@ def main(sysargs = sys.argv[1:]): setup_data(args.datadir,config[KEY_ANALYSIS_MODE], config) if args.add_assignment_cache: - update.install_pangolin_assignment() + update.install_pangolin_assignment(config[KEY_PANGOLIN_ASSIGNMENT_VERSION], args.datadir) if args.update: version_dictionary = {'pangolin': __version__, 'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION], 'constellations': config[KEY_CONSTELLATIONS_VERSION], 'scorpio': config[KEY_SCORPIO_VERSION]} - update.add_pangolin_assignment_if_installed(version_dictionary) + if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None: + version_dictionary['pangolin-assignment'] = config[KEY_PANGOLIN_ASSIGNMENT_VERSION] update.update(version_dictionary) if args.update_data: version_dictionary = {'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION], 'constellations': config[KEY_CONSTELLATIONS_VERSION]} - update.add_pangolin_assignment_if_installed(version_dictionary) + if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None: + version_dictionary['pangolin-assignment'] = config[KEY_PANGOLIN_ASSIGNMENT_VERSION] update.update(version_dictionary, args.datadir) # install_pangolin_assignment doesn't exit so that --update/--update-data can be given at the diff --git a/pangolin/utils/config.py b/pangolin/utils/config.py index 1f0d0be..9f10f8e 100644 --- a/pangolin/utils/config.py +++ b/pangolin/utils/config.py @@ -40,6 +40,8 @@ KEY_PANGOLIN_VERSION="pangolin_version" KEY_CONSTELLATIONS_VERSION="constellation_version" KEY_SCORPIO_VERSION="scorpio_version" +KEY_PANGOLIN_ASSIGNMENT_VERSION="pangolin_assignment_version" +KEY_PANGOLIN_ASSIGNMENT_PATH="pangolin_assignment_path" KEY_VERBOSE="verbose" KEY_LOG_API = "log_api" diff --git a/pangolin/utils/data_checks.py b/pangolin/utils/data_checks.py index 05278b5..0b2ce28 100644 --- a/pangolin/utils/data_checks.py +++ b/pangolin/utils/data_checks.py @@ -79,9 +79,8 @@ def install_error(package, url): def get_assignment_cache(cache_file, config): cache = "" - try: - import pangolin_assignment - pangolin_assignment_dir = pangolin_assignment.__path__[0] + if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None: + pangolin_assignment_dir = config[KEY_PANGOLIN_ASSIGNMENT_PATH] for r, d, f in os.walk(pangolin_assignment_dir): for fn in f: if fn == cache_file and cache == "": @@ -89,15 +88,15 @@ def get_assignment_cache(cache_file, config): if not os.path.exists(cache): sys.stderr.write(cyan(f'Error: cannot find assignment cache file {cache_file} in pangolin_assignment\n')) sys.exit(-1) - except: + else: sys.stderr.write(cyan('\nError: "pangolin --add-assignment-cache" is required before ' '"pangolin --use-assignment-cache", in order to install optional ' 'pangolin-assignment repository (that will make future data updates slower).\n')) sys.exit(-1) # Check versions of pangolin-data and pangolin-assignment to make sure they are consistent. - if pangolin_assignment.__version__.lstrip('v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'): - print(cyan(f'Error: pangolin_assignment cache version {pangolin_assignment.__version__} ' + if config[KEY_PANGOLIN_ASSIGNMENT_VERSION].lstrip('v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'): + print(cyan(f'Error: pangolin_assignment cache version {config[KEY_PANGOLIN_ASSIGNMENT_VERSION]} ' f'does not match pangolin_data version {config[KEY_PANGOLIN_DATA_VERSION]}. ' 'Run "pangolin --update-data" to fetch latest versions of both.')) sys.exit(-1) @@ -107,6 +106,7 @@ def get_assignment_cache(cache_file, config): line = f.readline() except: with open(cache, 'r') as f: + # this is legacy code from when the assignment cache was installed using pip and git-lfs line = f.readline() if "git-lfs.github.com" in line: sys.stderr.write(cyan( diff --git a/pangolin/utils/initialising.py b/pangolin/utils/initialising.py index f272449..26c677f 100644 --- a/pangolin/utils/initialising.py +++ b/pangolin/utils/initialising.py @@ -13,6 +13,15 @@ from pangolin import __version__ import pangolin_data +pangolin_assignment_version = None +pangolin_assignment_path = None +try: + import pangolin_assignment + pangolin_assignment_version = pangolin_assignment.__version__ + pangolin_assignment_path = pangolin_assignment.__path__[0] +except ImportError: + # if we can't import the module, leave the variables as None + pass import scorpio import constellations @@ -54,7 +63,9 @@ def setup_config_dict(cwd): KEY_PANGOLIN_DATA_VERSION: pangolin_data.__version__, KEY_SCORPIO_VERSION: scorpio.__version__, KEY_CONSTELLATIONS_VERSION: constellations.__version__, - + KEY_PANGOLIN_ASSIGNMENT_VERSION: pangolin_assignment_version, + KEY_PANGOLIN_ASSIGNMENT_PATH: pangolin_assignment_path, + KEY_VERBOSE: False, KEY_LOG_API: "", KEY_THREADS: 1 @@ -118,67 +129,77 @@ def version_from_init(init_file): break return version -def setup_data(datadir_arg,analysis_mode, config): +def setup_data(datadir_arg, analysis_mode, config): + global pangolin_assignment_version + global pangolin_assignment_path datadir = check_datadir(datadir_arg) pangolin_data_dir = pangolin_data.__path__[0] + + # collect constellations files from the contents of the constellations module constellations_dir = constellations.__path__[0] + constellations_version = constellations.__version__ constellation_files = [] - - data_locations = [os.walk(constellations_dir)] - - if datadir: - data_locations.append(os.walk(datadir)) - - # the logic of this is to search the "built-in" constellations - # path first and then if as custom datadir is passed, follow up with those, so that - # any files found in the datadir supercede the "built-in" modules. The assumption - # here is that the datadir contains newer (user updated) data - for r, _, f in itertools.chain.from_iterable(data_locations): - if r.endswith('/constellations') or r.endswith('/constellations/definitions'): - constellation_files = [] # only collect the constellations from the last directory found + for r, _, f in os.walk(constellations_dir): for fn in f: - if r.endswith('/constellations') and fn == '__init__.py': - constellations_version = version_from_init(os.path.join(r, fn)) - elif (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'): + if (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'): constellation_files.append(os.path.join(r, fn)) pangolin_data_version = pangolin_data.__version__ + + # pangolin_assignment_version and pangolin_assignment_path are set at module import time use_datadir = False - datadir_too_old = False + constellation_files_from_datadir = [] + constellations_version_from_datadir = None if datadir: version = "Unknown" for r,d,f in os.walk(datadir): for fn in f: + if r.endswith('/constellations') and fn == '__init__.py': + constellations_version_from_datadir = version_from_init(os.path.join(r, fn)) + elif (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'): + constellation_files_from_datadir.append(os.path.join(r, fn)) + # pangolin-data/__init__.py not constellations/__init__.py: - if r.endswith('data') and fn == "__init__.py": + if r.endswith('/pangolin_data') and fn == "__init__.py": # print("Found " + os.path.join(r, fn)) version = version_from_init(os.path.join(r, fn)) if not version: continue - if LooseVersion(version) >= LooseVersion(pangolin_data.__version__): + if LooseVersion(version) > LooseVersion(pangolin_data.__version__): # only use this if the version is >= than what we already have pangolin_data_version = version use_datadir = True else: - datadir_too_old = True - sys.stderr.write(cyan(f"Warning: Ignoring specified datadir {datadir} - it contains pangoLEARN model files older ({version}) than those installed ({pangolin_data.__version__})\n")) + sys.stderr.write(cyan(f"Warning: Ignoring pangolin data in specified datadir {datadir} - it contains pangolin_data older ({version}) than those installed ({pangolin_data.__version__})\n")) + elif r.endswith('/pangolin_assignment') and fn == '__init__.py': + version = version_from_init(os.path.join(r, fn)) + if not version: + continue - if use_datadir == False: - # we haven't got a viable datadir from searching args.datadir - if datadir and not datadir_too_old: - sys.stderr.write(cyan( - f"Warning: Ignoring specified datadir {datadir} - could not find __init__.py file to check versions \n")) + if pangolin_assignment_version is None or LooseVersion(version) > LooseVersion(pangolin_assignment_version): + # only use this if the version is >= than what we already have + pangolin_assignment_version = version + pangolin_assignment_path = r + else: + sys.stderr.write(cyan(f"Warning: Ignoring pangolin assignment in specified datadir {datadir} - it contains pangolin_assignment older ({version}) than those installed ({pangolin_assignment_version})\n")) + + if constellations_version_from_datadir is not None and LooseVersion(constellations_version_from_datadir) > LooseVersion(constellations_version): + constellation_files = constellation_files_from_datadir + constellations_version = constellations_version_from_datadir + if use_datadir == False: pangolin_data_dir = pangolin_data.__path__[0] datadir = os.path.join(pangolin_data_dir,"data") config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data_version config[KEY_CONSTELLATIONS_VERSION] = constellations_version - config[KEY_DATADIR] = datadir + config[KEY_DATADIR] = datadir # this is the pangolin_data datadir, the naming is from when there was only a single datadir to worry about config[KEY_CONSTELLATION_FILES] = constellation_files + config[KEY_PANGOLIN_ASSIGNMENT_VERSION] = pangolin_assignment_version + config[KEY_PANGOLIN_ASSIGNMENT_PATH] = pangolin_assignment_path def parse_qc_thresholds(maxambig, minlen, reference_fasta, config): @@ -244,7 +265,7 @@ def print_versions_exit(config): # Report pangolin_assignment version if it is installed, otherwise ignore try: import pangolin_assignment - print(f"pangolin-assignment: {pangolin_assignment.__version__}") + print(f"pangolin-assignment: {config[KEY_PANGOLIN_ASSIGNMENT_VERSION]}") except: pass # Print versions of other important tools used by pangolin diff --git a/pangolin/utils/update.py b/pangolin/utils/update.py index 721a9eb..c794548 100644 --- a/pangolin/utils/update.py +++ b/pangolin/utils/update.py @@ -64,41 +64,31 @@ def git_lfs_install(): sys.stderr.write(cyan(f"Error: {e}:\n{stderr}\n")) sys.exit(-1) -def pip_install_dep(dependency, release): +def pip_install_dep(dependency, release, datadir=None): """ Use pip install to install a cov-lineages repository with the specificed release """ + env_vars = None + if datadir is not None: + env_vars = {'PIP_TARGET': datadir, 'PIP_UPGRADE': '1'} url = f"git+https://github.com/cov-lineages/{dependency}.git@{release}" subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade', url], check=True, stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) + stderr=subprocess.DEVNULL, + env=env_vars) -def install_pangolin_assignment(): +def install_pangolin_assignment(pangolin_assignment_version, datadir=None): """ If the pangolin-assignment repo has not been installed already then install the latest release. """ - try: - import pangolin_assignment - print(f"pangolin-assignment already installed with version {pangolin_assignment.__version__}; use --update or --update-data if you wish to update it.", file=sys.stderr) - - except: + if pangolin_assignment_version is not None: + print(f"pangolin-assignment already installed with version {pangolin_assignment_version}; use --update or --update-data if you wish to update it.", file=sys.stderr) + else: git_lfs_install() latest_release, tarball = get_latest_release('pangolin-assignment') - pip_install_dep('pangolin-assignment', latest_release) - print(f"pangolin-assignment installed with latest release ({latest_release})") - - -def add_pangolin_assignment_if_installed(version_dictionary): - """ - If pangolin_assignment has been installed then add it to version_dictionary, else ignore. - """ - try: - import pangolin_assignment - version_dictionary["pangolin-assignment"] = pangolin_assignment.__version__ - except: - pass + pip_install_dep('pangolin-assignment', latest_release, datadir) def update(version_dictionary, data_dir=None): @@ -154,23 +144,7 @@ def update(version_dictionary, data_dir=None): version = LooseVersion(version) if version < latest_release_tidied: - if data_dir is not None: - # this path only gets followed when the user has --update_data and they - # have also specified a --datadir - with TemporaryDirectory() as tempdir: - dependency_package = package_names.get(dependency, dependency) - tarball_path = os.path.join(tempdir, 'tarball.tgz') - open(tarball_path, 'wb').write(request.urlopen(latest_release_tarball).read()) - tf = tarfile.open(tarball_path) - extracted_dir = tf.next().name - tf.extractall(path=tempdir) - tf.close() - destination_directory = os.path.join(data_dir, dependency_package) - if os.path.isdir(destination_directory): - shutil.rmtree(destination_directory) - shutil.move(os.path.join(tempdir, extracted_dir, dependency_package), destination_directory) - else: - pip_install_dep(dependency, latest_release) + pip_install_dep(dependency, latest_release, data_dir) print(f"{dependency} updated to {latest_release}", file=sys.stderr) elif version > latest_release_tidied: print(f"{dependency} ({version}) is newer than latest stable " From 4b5172060eb24490d7cdec2f8daf10b2cca5fed1 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Thu, 5 May 2022 11:44:26 +0200 Subject: [PATCH 2/9] Merge changes to initialising.py --- pangolin/utils/initialising.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pangolin/utils/initialising.py b/pangolin/utils/initialising.py index 26c677f..f22a1d6 100644 --- a/pangolin/utils/initialising.py +++ b/pangolin/utils/initialising.py @@ -184,12 +184,16 @@ def setup_data(datadir_arg, analysis_mode, config): pangolin_assignment_version = version pangolin_assignment_path = r else: +<<<<<<< HEAD sys.stderr.write(cyan(f"Warning: Ignoring pangolin assignment in specified datadir {datadir} - it contains pangolin_assignment older ({version}) than those installed ({pangolin_assignment_version})\n")) if constellations_version_from_datadir is not None and LooseVersion(constellations_version_from_datadir) > LooseVersion(constellations_version): constellation_files = constellation_files_from_datadir constellations_version = constellations_version_from_datadir +======= +\ sys.stderr.write(cyan(f"Warning: Ignoring pangolin assignment in specified datadir {datadir} - it contains pangolin_assignment older ({version}) than those installed ({pangolin_assignment.__version__})\n")) +>>>>>>> eba85c9 (Remove useless warning about datadir) if use_datadir == False: pangolin_data_dir = pangolin_data.__path__[0] datadir = os.path.join(pangolin_data_dir,"data") @@ -228,11 +232,10 @@ def parse_qc_thresholds(maxambig, minlen, reference_fasta, config): print(green(f"Maximum ambiguity allowed is {config[KEY_MAXAMBIG]}.\n****")) - def print_ram_warning(analysis_mode): if analysis_mode == "pangolearn": print(cyan("Warning: pangoLEARN mode may use a significant amount of RAM, be aware that it will not suit every system.")) - + def print_alias_file_exit(alias_file): with open(alias_file, 'r') as handle: for line in handle: From 77ae9f3537de1ca3ad724e53e8962755d196fdc2 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Thu, 5 May 2022 11:45:55 +0200 Subject: [PATCH 3/9] Fix merge conflict --- pangolin/utils/initialising.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pangolin/utils/initialising.py b/pangolin/utils/initialising.py index f22a1d6..9d4753f 100644 --- a/pangolin/utils/initialising.py +++ b/pangolin/utils/initialising.py @@ -184,16 +184,12 @@ def setup_data(datadir_arg, analysis_mode, config): pangolin_assignment_version = version pangolin_assignment_path = r else: -<<<<<<< HEAD sys.stderr.write(cyan(f"Warning: Ignoring pangolin assignment in specified datadir {datadir} - it contains pangolin_assignment older ({version}) than those installed ({pangolin_assignment_version})\n")) if constellations_version_from_datadir is not None and LooseVersion(constellations_version_from_datadir) > LooseVersion(constellations_version): constellation_files = constellation_files_from_datadir constellations_version = constellations_version_from_datadir -======= -\ sys.stderr.write(cyan(f"Warning: Ignoring pangolin assignment in specified datadir {datadir} - it contains pangolin_assignment older ({version}) than those installed ({pangolin_assignment.__version__})\n")) ->>>>>>> eba85c9 (Remove useless warning about datadir) if use_datadir == False: pangolin_data_dir = pangolin_data.__path__[0] datadir = os.path.join(pangolin_data_dir,"data") From 9af5304bf396f8edd0149dc5e05dd5f33d86e311 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Fri, 6 May 2022 11:52:30 +0200 Subject: [PATCH 4/9] Fix issues noted in PR --- pangolin/utils/data_checks.py | 1 - pangolin/utils/initialising.py | 46 ++++++++++++++++------------------ pangolin/utils/update.py | 8 +++--- 3 files changed, 26 insertions(+), 29 deletions(-) diff --git a/pangolin/utils/data_checks.py b/pangolin/utils/data_checks.py index 0b2ce28..b263e64 100644 --- a/pangolin/utils/data_checks.py +++ b/pangolin/utils/data_checks.py @@ -106,7 +106,6 @@ def get_assignment_cache(cache_file, config): line = f.readline() except: with open(cache, 'r') as f: - # this is legacy code from when the assignment cache was installed using pip and git-lfs line = f.readline() if "git-lfs.github.com" in line: sys.stderr.write(cyan( diff --git a/pangolin/utils/initialising.py b/pangolin/utils/initialising.py index 9d4753f..e7ce106 100644 --- a/pangolin/utils/initialising.py +++ b/pangolin/utils/initialising.py @@ -13,15 +13,14 @@ from pangolin import __version__ import pangolin_data -pangolin_assignment_version = None -pangolin_assignment_path = None +class PangolinAssignmentWrapper(): + __version__ = None + __path__ = [None] try: import pangolin_assignment - pangolin_assignment_version = pangolin_assignment.__version__ - pangolin_assignment_path = pangolin_assignment.__path__[0] except ImportError: - # if we can't import the module, leave the variables as None - pass + # if we can't import the module, leave the variables we replace it with a mock with suitable attributes + pangolin_assignment = PangolinAssignmentWrapper() import scorpio import constellations @@ -63,9 +62,9 @@ def setup_config_dict(cwd): KEY_PANGOLIN_DATA_VERSION: pangolin_data.__version__, KEY_SCORPIO_VERSION: scorpio.__version__, KEY_CONSTELLATIONS_VERSION: constellations.__version__, - KEY_PANGOLIN_ASSIGNMENT_VERSION: pangolin_assignment_version, - KEY_PANGOLIN_ASSIGNMENT_PATH: pangolin_assignment_path, - + KEY_PANGOLIN_ASSIGNMENT_VERSION: pangolin_assignment.__version__, + KEY_PANGOLIN_ASSIGNMENT_PATH: pangolin_assignment.__path__[0], + KEY_VERBOSE: False, KEY_LOG_API: "", KEY_THREADS: 1 @@ -129,10 +128,7 @@ def version_from_init(init_file): break return version -def setup_data(datadir_arg, analysis_mode, config): - global pangolin_assignment_version - global pangolin_assignment_path - +def setup_data(datadir_arg, analysis_mode, config, use_old_data): datadir = check_datadir(datadir_arg) pangolin_data_dir = pangolin_data.__path__[0] @@ -148,7 +144,8 @@ def setup_data(datadir_arg, analysis_mode, config): pangolin_data_version = pangolin_data.__version__ - # pangolin_assignment_version and pangolin_assignment_path are set at module import time + pangolin_assignment_version = pangolin_assignment.__version__ + pangolin_assignment_path = pangolin_assignment.__path__[0] use_datadir = False constellation_files_from_datadir = [] constellations_version_from_datadir = None @@ -168,8 +165,7 @@ def setup_data(datadir_arg, analysis_mode, config): if not version: continue - if LooseVersion(version) > LooseVersion(pangolin_data.__version__): - # only use this if the version is >= than what we already have + if use_old_data or LooseVersion(version) >= LooseVersion(pangolin_data.__version__): pangolin_data_version = version use_datadir = True else: @@ -179,16 +175,19 @@ def setup_data(datadir_arg, analysis_mode, config): if not version: continue - if pangolin_assignment_version is None or LooseVersion(version) > LooseVersion(pangolin_assignment_version): + if use_old_data or (pangolin_assignment_version is not None and LooseVersion(version) >= LooseVersion(pangolin_assignment.__version__)): # only use this if the version is >= than what we already have pangolin_assignment_version = version pangolin_assignment_path = r else: - sys.stderr.write(cyan(f"Warning: Ignoring pangolin assignment in specified datadir {datadir} - it contains pangolin_assignment older ({version}) than those installed ({pangolin_assignment_version})\n")) + sys.stderr.write(cyan(f"Warning: Ignoring pangolin assignment in specified datadir {datadir} - it contains pangolin_assignment older ({version}) than those installed ({pangolin_assignment.__version__})\n")) - if constellations_version_from_datadir is not None and LooseVersion(constellations_version_from_datadir) > LooseVersion(constellations_version): - constellation_files = constellation_files_from_datadir - constellations_version = constellations_version_from_datadir + if constellations_version_from_datadir is not None: + if use_old_data or LooseVersion(constellations_version_from_datadir) > LooseVersion(constellations_version): + constellation_files = constellation_files_from_datadir + constellations_version = constellations_version_from_datadir + else: + sys.stderr.write(cyan(f"Warning: Ignoring constellations in specified datadir {datadir} - it contains constellations older ({constellations_version_from_datadir}) than those installed ({constellations_version})\n")) if use_datadir == False: pangolin_data_dir = pangolin_data.__path__[0] @@ -262,11 +261,8 @@ def print_versions_exit(config): f"constellations: {config[KEY_CONSTELLATIONS_VERSION]}\n" f"scorpio: {config[KEY_SCORPIO_VERSION]}") # Report pangolin_assignment version if it is installed, otherwise ignore - try: - import pangolin_assignment + if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None: print(f"pangolin-assignment: {config[KEY_PANGOLIN_ASSIGNMENT_VERSION]}") - except: - pass # Print versions of other important tools used by pangolin print_conda_version(['usher', 'ucsc-fatovcf', 'gofasta', 'minimap2']) sys.exit(0) diff --git a/pangolin/utils/update.py b/pangolin/utils/update.py index c794548..b124150 100644 --- a/pangolin/utils/update.py +++ b/pangolin/utils/update.py @@ -69,10 +69,12 @@ def pip_install_dep(dependency, release, datadir=None): Use pip install to install a cov-lineages repository with the specificed release """ env_vars = None - if datadir is not None: - env_vars = {'PIP_TARGET': datadir, 'PIP_UPGRADE': '1'} url = f"git+https://github.com/cov-lineages/{dependency}.git@{release}" - subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade', url], + pip_command = [sys.executable, '-m', 'pip', 'install', '--upgrade'] + if datadir is not None: + pip_command.append('--target', datadir) + pip_command.append(url) + subprocess.run(pip_command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, From 937af6df614c92b237105b6c52a95ced29c34f8a Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Fri, 6 May 2022 17:09:42 +0200 Subject: [PATCH 5/9] Add use_old_datadir option --- pangolin/command.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pangolin/command.py b/pangolin/command.py index 01eeb35..bcc8446 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -81,6 +81,7 @@ def main(sysargs = sys.argv[1:]): d_group.add_argument('--add-assignment-cache', action='store_true', dest="add_assignment_cache", default=False, help="Install the pangolin-assignment repository for use with --use-assignment-cache. This makes updates slower and makes pangolin slower for small numbers of input sequences but much faster for large numbers of input sequences.") d_group.add_argument('--use-assignment-cache', action='store_true', dest="use_assignment_cache", default=False, help="Use assignment cache from optional pangolin-assignment repository. NOTE: the repository must be installed by --add-assignment-cache before using --use-assignment-cache.") d_group.add_argument('-d', '--datadir', action='store',dest="datadir",help="Data directory minimally containing the pangoLEARN model, header files and UShER tree. Default: Installed pangolin-data package.") + d_group.add_argument('--use_old_datadir', action='store_true', default=False, help="Use the data from data directory even if older than data installed via Python packages. Default: False") d_group.add_argument('--usher-tree', action='store', dest='usher_protobuf', help="UShER Mutation Annotated Tree protobuf file to use instead of default from pangolin-data repository or --datadir.") d_group.add_argument('--assignment-cache', action='store', dest='assignment_cache', help="Cached precomputed assignment file to use instead of default from pangolin-assignment repository. Does not require installation of pangolin-assignment.") @@ -107,7 +108,7 @@ def main(sysargs = sys.argv[1:]): if args.usher: sys.stderr.write(cyan(f"--usher is a pangolin v3 option and is deprecated in pangolin v4. UShER is now the default analysis mode. Use --analysis-mode to explicitly set mode.\n")) - setup_data(args.datadir,config[KEY_ANALYSIS_MODE], config) + setup_data(args.datadir,config[KEY_ANALYSIS_MODE], config, args.use_old_datadir) if args.add_assignment_cache: update.install_pangolin_assignment(config[KEY_PANGOLIN_ASSIGNMENT_VERSION], args.datadir) From 3fa0c5a6ac6dcfce19278a98b696281119e66422 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Fri, 6 May 2022 17:10:11 +0200 Subject: [PATCH 6/9] Add get_constellation_files() --- pangolin/utils/data_checks.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pangolin/utils/data_checks.py b/pangolin/utils/data_checks.py index b263e64..b62ca2e 100644 --- a/pangolin/utils/data_checks.py +++ b/pangolin/utils/data_checks.py @@ -114,5 +114,13 @@ def get_assignment_cache(cache_file, config): sys.exit(-1) return cache +def get_constellation_files(path): + constellation_files = [] + for r, _, f in os.walk(path): + for fn in f: + if (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'): + constellation_files.append(os.path.join(r, fn)) + return constellation_files + # config={} # check_install() From c30e201fc11d421f8b595ebfec58a4015b71574d Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Fri, 6 May 2022 17:11:06 +0200 Subject: [PATCH 7/9] Rewrite setup_data() to be less repetitive --- pangolin/utils/initialising.py | 93 ++++++++++------------------------ 1 file changed, 26 insertions(+), 67 deletions(-) diff --git a/pangolin/utils/initialising.py b/pangolin/utils/initialising.py index e7ce106..e504da6 100644 --- a/pangolin/utils/initialising.py +++ b/pangolin/utils/initialising.py @@ -131,74 +131,33 @@ def version_from_init(init_file): def setup_data(datadir_arg, analysis_mode, config, use_old_data): datadir = check_datadir(datadir_arg) - pangolin_data_dir = pangolin_data.__path__[0] - - # collect constellations files from the contents of the constellations module - constellations_dir = constellations.__path__[0] - constellations_version = constellations.__version__ - constellation_files = [] - for r, _, f in os.walk(constellations_dir): - for fn in f: - if (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'): - constellation_files.append(os.path.join(r, fn)) - - pangolin_data_version = pangolin_data.__version__ - - pangolin_assignment_version = pangolin_assignment.__version__ - pangolin_assignment_path = pangolin_assignment.__path__[0] - use_datadir = False - constellation_files_from_datadir = [] - constellations_version_from_datadir = None + config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data.__version__ + config[KEY_DATADIR] = pangolin_data.__path__[0] + config[KEY_CONSTELLATIONS_VERSION] = constellations.__version__ + config[KEY_CONSTELLATION_FILES] = get_constellation_files(constellations.__path__[0]) + config[KEY_PANGOLIN_ASSIGNMENT_VERSION] = pangolin_assignment.__version__ + config[KEY_PANGOLIN_ASSIGNMENT_PATH] = pangolin_assignment.__path__[0] + if datadir: - version = "Unknown" - for r,d,f in os.walk(datadir): - for fn in f: - if r.endswith('/constellations') and fn == '__init__.py': - constellations_version_from_datadir = version_from_init(os.path.join(r, fn)) - elif (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'): - constellation_files_from_datadir.append(os.path.join(r, fn)) - - # pangolin-data/__init__.py not constellations/__init__.py: - if r.endswith('/pangolin_data') and fn == "__init__.py": - # print("Found " + os.path.join(r, fn)) - version = version_from_init(os.path.join(r, fn)) - if not version: - continue - - if use_old_data or LooseVersion(version) >= LooseVersion(pangolin_data.__version__): - pangolin_data_version = version - use_datadir = True - else: - sys.stderr.write(cyan(f"Warning: Ignoring pangolin data in specified datadir {datadir} - it contains pangolin_data older ({version}) than those installed ({pangolin_data.__version__})\n")) - elif r.endswith('/pangolin_assignment') and fn == '__init__.py': - version = version_from_init(os.path.join(r, fn)) - if not version: - continue - - if use_old_data or (pangolin_assignment_version is not None and LooseVersion(version) >= LooseVersion(pangolin_assignment.__version__)): - # only use this if the version is >= than what we already have - pangolin_assignment_version = version - pangolin_assignment_path = r - else: - sys.stderr.write(cyan(f"Warning: Ignoring pangolin assignment in specified datadir {datadir} - it contains pangolin_assignment older ({version}) than those installed ({pangolin_assignment.__version__})\n")) - - if constellations_version_from_datadir is not None: - if use_old_data or LooseVersion(constellations_version_from_datadir) > LooseVersion(constellations_version): - constellation_files = constellation_files_from_datadir - constellations_version = constellations_version_from_datadir - else: - sys.stderr.write(cyan(f"Warning: Ignoring constellations in specified datadir {datadir} - it contains constellations older ({constellations_version_from_datadir}) than those installed ({constellations_version})\n")) - - if use_datadir == False: - pangolin_data_dir = pangolin_data.__path__[0] - datadir = os.path.join(pangolin_data_dir,"data") - - config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data_version - config[KEY_CONSTELLATIONS_VERSION] = constellations_version - config[KEY_DATADIR] = datadir # this is the pangolin_data datadir, the naming is from when there was only a single datadir to worry about - config[KEY_CONSTELLATION_FILES] = constellation_files - config[KEY_PANGOLIN_ASSIGNMENT_VERSION] = pangolin_assignment_version - config[KEY_PANGOLIN_ASSIGNMENT_PATH] = pangolin_assignment_path + for module_name in ('constellations', 'pangolin_data', 'pangolin_assignment'): + for r, _, f in os.walk(datadir): + for fn in f: + if r.endswith('/' + module_name) and fn == '__init__.py': + version = version_from_init(os.path.join(r, fn)) + # module_name has been imported so exists in global namespace + current_version = getattr(globals()[module_name], '__version__', '0') + if use_old_data or current_version is None or LooseVersion(version) >= LooseVersion(current_version): + if module_name == "pangolin_data": + config[KEY_PANGOLIN_DATA_VERSION] = version + config[KEY_DATADIR] = os.path.join(datadir, r) + elif module_name == "pangolin_assignment": + config[KEY_PANGOLIN_ASSIGNMENT_VERSION] = version + config[KEY_PANGOLIN_ASSIGNMENT_PATH] = os.path.join(datadir, r) + elif module_name == "constellations": + config[KEY_CONSTELLATIONS_VERSION] = version + config[KEY_CONSTELLATION_FILES] = get_constellation_files(r) + else: + sys.stderr.write(cyan(f"Warning: Ignoring {module_name} in specified datadir {datadir} - it contains {module_name} with older ({version}) than those installed ({current_version})\n")) def parse_qc_thresholds(maxambig, minlen, reference_fasta, config): From 5229572c11731cef9f18e2ac388913aa83fc2246 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Fri, 6 May 2022 17:23:06 +0200 Subject: [PATCH 8/9] Correct pip call --- pangolin/utils/update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangolin/utils/update.py b/pangolin/utils/update.py index b124150..76eeb1f 100644 --- a/pangolin/utils/update.py +++ b/pangolin/utils/update.py @@ -72,7 +72,7 @@ def pip_install_dep(dependency, release, datadir=None): url = f"git+https://github.com/cov-lineages/{dependency}.git@{release}" pip_command = [sys.executable, '-m', 'pip', 'install', '--upgrade'] if datadir is not None: - pip_command.append('--target', datadir) + pip_command.extend(['--target', datadir]) pip_command.append(url) subprocess.run(pip_command, check=True, From 90da65d0d788d337752701e787c3fd9932b7ada4 Mon Sep 17 00:00:00 2001 From: Peter van Heusden Date: Sat, 7 May 2022 08:28:06 +0200 Subject: [PATCH 9/9] Add back pangolin-assignment installation message, other changes to address PR comments --- pangolin/command.py | 2 +- pangolin/utils/update.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index bcc8446..4cbc810 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -81,7 +81,7 @@ def main(sysargs = sys.argv[1:]): d_group.add_argument('--add-assignment-cache', action='store_true', dest="add_assignment_cache", default=False, help="Install the pangolin-assignment repository for use with --use-assignment-cache. This makes updates slower and makes pangolin slower for small numbers of input sequences but much faster for large numbers of input sequences.") d_group.add_argument('--use-assignment-cache', action='store_true', dest="use_assignment_cache", default=False, help="Use assignment cache from optional pangolin-assignment repository. NOTE: the repository must be installed by --add-assignment-cache before using --use-assignment-cache.") d_group.add_argument('-d', '--datadir', action='store',dest="datadir",help="Data directory minimally containing the pangoLEARN model, header files and UShER tree. Default: Installed pangolin-data package.") - d_group.add_argument('--use_old_datadir', action='store_true', default=False, help="Use the data from data directory even if older than data installed via Python packages. Default: False") + d_group.add_argument('--use-old-datadir', action='store_true', default=False, help="Use the data from data directory even if older than data installed via Python packages. Default: False") d_group.add_argument('--usher-tree', action='store', dest='usher_protobuf', help="UShER Mutation Annotated Tree protobuf file to use instead of default from pangolin-data repository or --datadir.") d_group.add_argument('--assignment-cache', action='store', dest='assignment_cache', help="Cached precomputed assignment file to use instead of default from pangolin-assignment repository. Does not require installation of pangolin-assignment.") diff --git a/pangolin/utils/update.py b/pangolin/utils/update.py index 76eeb1f..87ca292 100644 --- a/pangolin/utils/update.py +++ b/pangolin/utils/update.py @@ -68,7 +68,6 @@ def pip_install_dep(dependency, release, datadir=None): """ Use pip install to install a cov-lineages repository with the specificed release """ - env_vars = None url = f"git+https://github.com/cov-lineages/{dependency}.git@{release}" pip_command = [sys.executable, '-m', 'pip', 'install', '--upgrade'] if datadir is not None: @@ -77,8 +76,7 @@ def pip_install_dep(dependency, release, datadir=None): subprocess.run(pip_command, check=True, stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - env=env_vars) + stderr=subprocess.DEVNULL) def install_pangolin_assignment(pangolin_assignment_version, datadir=None): @@ -91,7 +89,7 @@ def install_pangolin_assignment(pangolin_assignment_version, datadir=None): git_lfs_install() latest_release, tarball = get_latest_release('pangolin-assignment') pip_install_dep('pangolin-assignment', latest_release, datadir) - + print(f"pangolin-assignment installed with latest release ({latest_release})") def update(version_dictionary, data_dir=None): """