Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow assignment cache to be saved to and loaded from datadir #444

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion .github/workflows/pangolin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,13 @@ jobs:
run: pangolin --update-data 2>&1 | tee pangolin_update_data.log
- name: Run pangolin verbose mode
run: pangolin --verbose pangolin/test/test_seqs.fasta 2>&1 | tee pangolin_verbose.log

- name: Add assignment cache
run: pangolin --add-assignment-cache
- name: Test use-assignment-cache
run: pangolin --use-assignment-cache pangolin/test/test_seqs.fasta 2>&1 | grep 'Using pangolin-assignment cache'
- name: remove assignment cache
run: pip uninstall -y pangolin-assignment
- name: Add assignment cache to datadir
run: mkdir ac && pangolin --add-assignment-cache --datadir ac
- name: Test use-assignment-cache with datadir
run: pangolin --use-assignment-cache --datadir ac pangolin/test/test_seqs.fasta 2>&1 | grep 'Using pangolin-assignment cache'
14 changes: 8 additions & 6 deletions pangolin/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@
from pangolin.utils import data_checks
try:
import pangolin_data
except:
except ImportError:
data_checks.install_error("pangolin_data", "https://github.com/cov-lineages/pangolin-data.git")

try:
import scorpio
except:
except ImportError:
data_checks.install_error("scorpio", "https://github.com/cov-lineages/scorpio.git")

try:
import constellations
except:
except ImportError:
data_checks.install_error("constellations", "https://github.com/cov-lineages/constellations.git")

import os
Expand Down Expand Up @@ -110,20 +110,22 @@ def main(sysargs = sys.argv[1:]):
setup_data(args.datadir,config[KEY_ANALYSIS_MODE], config)

if args.add_assignment_cache:
update.install_pangolin_assignment()
update.install_pangolin_assignment(config[KEY_PANGOLIN_ASSIGNMENT_VERSION], args.datadir)

if args.update:
version_dictionary = {'pangolin': __version__,
'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
'constellations': config[KEY_CONSTELLATIONS_VERSION],
'scorpio': config[KEY_SCORPIO_VERSION]}
update.add_pangolin_assignment_if_installed(version_dictionary)
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
version_dictionary['pangolin-assignment'] = config[KEY_PANGOLIN_ASSIGNMENT_VERSION]
update.update(version_dictionary)

if args.update_data:
version_dictionary = {'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
'constellations': config[KEY_CONSTELLATIONS_VERSION]}
update.add_pangolin_assignment_if_installed(version_dictionary)
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
version_dictionary['pangolin-assignment'] = config[KEY_PANGOLIN_ASSIGNMENT_VERSION]
update.update(version_dictionary, args.datadir)

# install_pangolin_assignment doesn't exit so that --update/--update-data can be given at the
Expand Down
2 changes: 2 additions & 0 deletions pangolin/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
KEY_PANGOLIN_VERSION="pangolin_version"
KEY_CONSTELLATIONS_VERSION="constellation_version"
KEY_SCORPIO_VERSION="scorpio_version"
KEY_PANGOLIN_ASSIGNMENT_VERSION="pangolin_assignment_version"
KEY_PANGOLIN_ASSIGNMENT_PATH="pangolin_assignment_path"

KEY_VERBOSE="verbose"
KEY_LOG_API = "log_api"
Expand Down
12 changes: 6 additions & 6 deletions pangolin/utils/data_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,25 +79,24 @@ def install_error(package, url):

def get_assignment_cache(cache_file, config):
cache = ""
try:
import pangolin_assignment
pangolin_assignment_dir = pangolin_assignment.__path__[0]
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
pangolin_assignment_dir = config[KEY_PANGOLIN_ASSIGNMENT_PATH]
for r, d, f in os.walk(pangolin_assignment_dir):
for fn in f:
if fn == cache_file and cache == "":
cache = os.path.join(r, fn)
if not os.path.exists(cache):
sys.stderr.write(cyan(f'Error: cannot find assignment cache file {cache_file} in pangolin_assignment\n'))
sys.exit(-1)
except:
else:
sys.stderr.write(cyan('\nError: "pangolin --add-assignment-cache" is required before '
'"pangolin --use-assignment-cache", in order to install optional '
'pangolin-assignment repository (that will make future data updates slower).\n'))
sys.exit(-1)

# Check versions of pangolin-data and pangolin-assignment to make sure they are consistent.
if pangolin_assignment.__version__.lstrip('v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'):
print(cyan(f'Error: pangolin_assignment cache version {pangolin_assignment.__version__} '
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION].lstrip('v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'):
print(cyan(f'Error: pangolin_assignment cache version {config[KEY_PANGOLIN_ASSIGNMENT_VERSION]} '
f'does not match pangolin_data version {config[KEY_PANGOLIN_DATA_VERSION]}. '
'Run "pangolin --update-data" to fetch latest versions of both.'))
sys.exit(-1)
Expand All @@ -107,6 +106,7 @@ def get_assignment_cache(cache_file, config):
line = f.readline()
except:
with open(cache, 'r') as f:
# this is legacy code from when the assignment cache was installed using pip and git-lfs
pvanheus marked this conversation as resolved.
Show resolved Hide resolved
line = f.readline()
if "git-lfs.github.com" in line:
sys.stderr.write(cyan(
Expand Down
81 changes: 51 additions & 30 deletions pangolin/utils/initialising.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,15 @@
from pangolin import __version__

import pangolin_data
pangolin_assignment_version = None
pangolin_assignment_path = None
try:
import pangolin_assignment
pangolin_assignment_version = pangolin_assignment.__version__
pangolin_assignment_path = pangolin_assignment.__path__[0]
except ImportError:
# if we can't import the module, leave the variables as None
pass
pvanheus marked this conversation as resolved.
Show resolved Hide resolved
import scorpio
import constellations

Expand Down Expand Up @@ -54,7 +63,9 @@ def setup_config_dict(cwd):
KEY_PANGOLIN_DATA_VERSION: pangolin_data.__version__,
KEY_SCORPIO_VERSION: scorpio.__version__,
KEY_CONSTELLATIONS_VERSION: constellations.__version__,

KEY_PANGOLIN_ASSIGNMENT_VERSION: pangolin_assignment_version,
KEY_PANGOLIN_ASSIGNMENT_PATH: pangolin_assignment_path,

KEY_VERBOSE: False,
KEY_LOG_API: "",
KEY_THREADS: 1
Expand Down Expand Up @@ -118,67 +129,77 @@ def version_from_init(init_file):
break
return version

def setup_data(datadir_arg,analysis_mode, config):
def setup_data(datadir_arg, analysis_mode, config):
global pangolin_assignment_version
global pangolin_assignment_path

datadir = check_datadir(datadir_arg)

pangolin_data_dir = pangolin_data.__path__[0]

# collect constellations files from the contents of the constellations module
constellations_dir = constellations.__path__[0]
constellations_version = constellations.__version__
constellation_files = []

data_locations = [os.walk(constellations_dir)]

if datadir:
data_locations.append(os.walk(datadir))

# the logic of this is to search the "built-in" constellations
# path first and then, if a custom datadir is passed, follow up with those, so that
# any files found in the datadir supersede the "built-in" modules. The assumption
# here is that the datadir contains newer (user updated) data
for r, _, f in itertools.chain.from_iterable(data_locations):
if r.endswith('/constellations') or r.endswith('/constellations/definitions'):
constellation_files = [] # only collect the constellations from the last directory found
for r, _, f in os.walk(constellations_dir):
for fn in f:
if r.endswith('/constellations') and fn == '__init__.py':
constellations_version = version_from_init(os.path.join(r, fn))
elif (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'):
if (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'):
constellation_files.append(os.path.join(r, fn))

pangolin_data_version = pangolin_data.__version__

# pangolin_assignment_version and pangolin_assignment_path are set at module import time
use_datadir = False
datadir_too_old = False
constellation_files_from_datadir = []
constellations_version_from_datadir = None
if datadir:
version = "Unknown"
for r,d,f in os.walk(datadir):
for fn in f:
if r.endswith('/constellations') and fn == '__init__.py':
constellations_version_from_datadir = version_from_init(os.path.join(r, fn))
elif (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'):
constellation_files_from_datadir.append(os.path.join(r, fn))

# pangolin-data/__init__.py not constellations/__init__.py:
if r.endswith('data') and fn == "__init__.py":
if r.endswith('/pangolin_data') and fn == "__init__.py":
# print("Found " + os.path.join(r, fn))
version = version_from_init(os.path.join(r, fn))
if not version:
continue

if LooseVersion(version) >= LooseVersion(pangolin_data.__version__):
if LooseVersion(version) > LooseVersion(pangolin_data.__version__):
pvanheus marked this conversation as resolved.
Show resolved Hide resolved
# only use this if the version is strictly newer than what we already have
pangolin_data_version = version
use_datadir = True
else:
datadir_too_old = True
sys.stderr.write(cyan(f"Warning: Ignoring specified datadir {datadir} - it contains pangoLEARN model files older ({version}) than those installed ({pangolin_data.__version__})\n"))
sys.stderr.write(cyan(f"Warning: Ignoring pangolin data in specified datadir {datadir} - it contains pangolin_data older ({version}) than those installed ({pangolin_data.__version__})\n"))
elif r.endswith('/pangolin_assignment') and fn == '__init__.py':
version = version_from_init(os.path.join(r, fn))
if not version:
continue

if use_datadir == False:
# we haven't got a viable datadir from searching args.datadir
if datadir and not datadir_too_old:
sys.stderr.write(cyan(
f"Warning: Ignoring specified datadir {datadir} - could not find __init__.py file to check versions \n"))
if pangolin_assignment_version is None or LooseVersion(version) > LooseVersion(pangolin_assignment_version):
# only use this if the version is strictly newer than what we already have
pvanheus marked this conversation as resolved.
Show resolved Hide resolved
pangolin_assignment_version = version
pangolin_assignment_path = r
else:
sys.stderr.write(cyan(f"Warning: Ignoring pangolin assignment in specified datadir {datadir} - it contains pangolin_assignment older ({version}) than those installed ({pangolin_assignment_version})\n"))

if constellations_version_from_datadir is not None and LooseVersion(constellations_version_from_datadir) > LooseVersion(constellations_version):
AngieHinrichs marked this conversation as resolved.
Show resolved Hide resolved
constellation_files = constellation_files_from_datadir
constellations_version = constellations_version_from_datadir

if use_datadir == False:
pangolin_data_dir = pangolin_data.__path__[0]
datadir = os.path.join(pangolin_data_dir,"data")

config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data_version
config[KEY_CONSTELLATIONS_VERSION] = constellations_version
config[KEY_DATADIR] = datadir
config[KEY_DATADIR] = datadir # this is the pangolin_data datadir, the naming is from when there was only a single datadir to worry about
config[KEY_CONSTELLATION_FILES] = constellation_files
config[KEY_PANGOLIN_ASSIGNMENT_VERSION] = pangolin_assignment_version
config[KEY_PANGOLIN_ASSIGNMENT_PATH] = pangolin_assignment_path

def parse_qc_thresholds(maxambig, minlen, reference_fasta, config):

Expand Down Expand Up @@ -244,7 +265,7 @@ def print_versions_exit(config):
# Report pangolin_assignment version if it is installed, otherwise ignore
try:
import pangolin_assignment
print(f"pangolin-assignment: {pangolin_assignment.__version__}")
print(f"pangolin-assignment: {config[KEY_PANGOLIN_ASSIGNMENT_VERSION]}")
except:
pass
pvanheus marked this conversation as resolved.
Show resolved Hide resolved
# Print versions of other important tools used by pangolin
Expand Down
50 changes: 12 additions & 38 deletions pangolin/utils/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,41 +64,31 @@ def git_lfs_install():
sys.stderr.write(cyan(f"Error: {e}:\n{stderr}\n"))
sys.exit(-1)

def pip_install_dep(dependency, release):
def pip_install_dep(dependency, release, datadir=None):
"""
Use pip install to install a cov-lineages repository with the specified release
"""
env_vars = None
pvanheus marked this conversation as resolved.
Show resolved Hide resolved
if datadir is not None:
env_vars = {'PIP_TARGET': datadir, 'PIP_UPGRADE': '1'}
pvanheus marked this conversation as resolved.
Show resolved Hide resolved
url = f"git+https://github.com/cov-lineages/{dependency}.git@{release}"
subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade', url],
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL)
stderr=subprocess.DEVNULL,
env=env_vars)


def install_pangolin_assignment():
def install_pangolin_assignment(pangolin_assignment_version, datadir=None):
"""
If the pangolin-assignment repo has not been installed already then install the latest release.
"""
try:
import pangolin_assignment
print(f"pangolin-assignment already installed with version {pangolin_assignment.__version__}; use --update or --update-data if you wish to update it.", file=sys.stderr)

except:
if pangolin_assignment_version is not None:
print(f"pangolin-assignment already installed with version {pangolin_assignment_version}; use --update or --update-data if you wish to update it.", file=sys.stderr)
else:
git_lfs_install()
latest_release, tarball = get_latest_release('pangolin-assignment')
pip_install_dep('pangolin-assignment', latest_release)
print(f"pangolin-assignment installed with latest release ({latest_release})")
pvanheus marked this conversation as resolved.
Show resolved Hide resolved


def add_pangolin_assignment_if_installed(version_dictionary):
"""
If pangolin_assignment has been installed then add it to version_dictionary, else ignore.
"""
try:
import pangolin_assignment
version_dictionary["pangolin-assignment"] = pangolin_assignment.__version__
except:
pass
pip_install_dep('pangolin-assignment', latest_release, datadir)


def update(version_dictionary, data_dir=None):
Expand Down Expand Up @@ -154,23 +144,7 @@ def update(version_dictionary, data_dir=None):
version = LooseVersion(version)

if version < latest_release_tidied:
if data_dir is not None:
# this path only gets followed when the user has --update_data and they
# have also specified a --datadir
with TemporaryDirectory() as tempdir:
dependency_package = package_names.get(dependency, dependency)
tarball_path = os.path.join(tempdir, 'tarball.tgz')
open(tarball_path, 'wb').write(request.urlopen(latest_release_tarball).read())
tf = tarfile.open(tarball_path)
extracted_dir = tf.next().name
tf.extractall(path=tempdir)
tf.close()
destination_directory = os.path.join(data_dir, dependency_package)
if os.path.isdir(destination_directory):
shutil.rmtree(destination_directory)
shutil.move(os.path.join(tempdir, extracted_dir, dependency_package), destination_directory)
else:
pip_install_dep(dependency, latest_release)
pip_install_dep(dependency, latest_release, data_dir)
print(f"{dependency} updated to {latest_release}", file=sys.stderr)
elif version > latest_release_tidied:
print(f"{dependency} ({version}) is newer than latest stable "
Expand Down