Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make pango-assignments work with datadir #443

Closed
11 changes: 10 additions & 1 deletion .github/workflows/pangolin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,13 @@ jobs:
run: pangolin --update-data 2>&1 | tee pangolin_update_data.log
- name: Run pangolin verbose mode
run: pangolin --verbose pangolin/test/test_seqs.fasta 2>&1 | tee pangolin_verbose.log

- name: Add assignment cache
run: pangolin --add-assignment-cache
- name: Test use-assignment-cache
run: pangolin --use-assignment-cache pangolin/test/test_seqs.fasta 2>&1 | grep 'Using pangolin-assignment cache'
- name: remove assignment cache
run: pip uninstall -y pangolin-assignment
- name: Add assignment cache to datadir
run: mkdir ac && pangolin --add-assignment-cache --datadir ac
- name: Test use-assignment-cache with datadir
run: pangolin --use-assignment-cache --datadir ac pangolin/test/test_seqs.fasta 2>&1 | grep 'Using pangolin-assignment cache'
14 changes: 8 additions & 6 deletions pangolin/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@
from pangolin.utils import data_checks
try:
import pangolin_data
except:
except ImportError:
data_checks.install_error("pangolin_data", "https://github.com/cov-lineages/pangolin-data.git")

try:
import scorpio
except:
except ImportError:
data_checks.install_error("scorpio", "https://github.com/cov-lineages/scorpio.git")

try:
import constellations
except:
except ImportError:
data_checks.install_error("constellations", "https://github.com/cov-lineages/constellations.git")

import os
Expand Down Expand Up @@ -110,20 +110,22 @@ def main(sysargs = sys.argv[1:]):
setup_data(args.datadir,config[KEY_ANALYSIS_MODE], config)

if args.add_assignment_cache:
update.install_pangolin_assignment()
update.install_pangolin_assignment(config[KEY_PANGOLIN_ASSIGNMENT_VERSION], args.datadir)

if args.update:
version_dictionary = {'pangolin': __version__,
'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
'constellations': config[KEY_CONSTELLATIONS_VERSION],
'scorpio': config[KEY_SCORPIO_VERSION]}
update.add_pangolin_assignment_if_installed(version_dictionary)
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
version_dictionary['pangolin-assignment'] = config[KEY_PANGOLIN_ASSIGNMENT_VERSION]
update.update(version_dictionary)

if args.update_data:
version_dictionary = {'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
'constellations': config[KEY_CONSTELLATIONS_VERSION]}
update.add_pangolin_assignment_if_installed(version_dictionary)
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
version_dictionary['pangolin-assignment'] = config[KEY_PANGOLIN_ASSIGNMENT_VERSION]
update.update(version_dictionary, args.datadir)

# install_pangolin_assignment doesn't exit so that --update/--update-data can be given at the
Expand Down
2 changes: 2 additions & 0 deletions pangolin/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
KEY_PANGOLIN_VERSION="pangolin_version"
KEY_CONSTELLATIONS_VERSION="constellation_version"
KEY_SCORPIO_VERSION="scorpio_version"
KEY_PANGOLIN_ASSIGNMENT_VERSION="pangolin_assignment_version"
KEY_PANGOLIN_ASSIGNMENT_PATH="pangolin_assignment_path"

KEY_VERBOSE="verbose"
KEY_LOG_API = "log_api"
Expand Down
15 changes: 4 additions & 11 deletions pangolin/utils/data_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,34 +79,27 @@ def install_error(package, url):

def get_assignment_cache(cache_file, config):
cache = ""
try:
import pangolin_assignment
pangolin_assignment_dir = pangolin_assignment.__path__[0]
if config[KEY_PANGOLIN_ASSIGNMENT_VERSION] is not None:
pangolin_assignment_dir = config[KEY_PANGOLIN_ASSIGNMENT_PATH]
for r, d, f in os.walk(pangolin_assignment_dir):
for fn in f:
if fn == cache_file and cache == "":
cache = os.path.join(r, fn)
if not os.path.exists(cache):
sys.stderr.write(cyan(f'Error: cannot find assignment cache file {cache_file} in pangolin_assignment\n'))
sys.exit(-1)
except:
else:
sys.stderr.write(cyan('\nError: "pangolin --add-assignment-cache" is required before '
'"pangolin --use-assignment-cache", in order to install optional '
'pangolin-assignment repository (that will make future data updates slower).\n'))
sys.exit(-1)

# Check versions of pangolin-data and pangolin-assignment to make sure they are consistent.
if pangolin_assignment.__version__.lstrip('v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'):
print(cyan(f'Error: pangolin_assignment cache version {pangolin_assignment.__version__} '
f'does not match pangolin_data version {config[KEY_PANGOLIN_DATA_VERSION]}. '
'Run "pangolin --update-data" to fetch latest versions of both.'))
sys.exit(-1)

try:
with gzip.open(cache, 'rt') as f:
line = f.readline()
except:
with open(cache, 'r') as f:
# this is legacy code from when the assignment cache was installed using pip and git-lfs
line = f.readline()
if "git-lfs.github.com" in line:
sys.stderr.write(cyan(
Expand Down
48 changes: 34 additions & 14 deletions pangolin/utils/initialising.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,15 @@
from pangolin import __version__

import pangolin_data
pangolin_assignment_version = None
pangolin_assignment_path = None
try:
import pangolin_assignment
pangolin_assignment_version = pangolin_assignment.__version__
pangolin_assignment_path = pangolin_assignment.__path__[0]
except ImportError:
# if we can't import the module, leave the variables as None
pass
import scorpio
import constellations

Expand Down Expand Up @@ -54,7 +63,9 @@ def setup_config_dict(cwd):
KEY_PANGOLIN_DATA_VERSION: pangolin_data.__version__,
KEY_SCORPIO_VERSION: scorpio.__version__,
KEY_CONSTELLATIONS_VERSION: constellations.__version__,

KEY_PANGOLIN_ASSIGNMENT_VERSION: pangolin_assignment_version,
KEY_PANGOLIN_ASSIGNMENT_PATH: pangolin_assignment_path,

KEY_VERBOSE: False,
KEY_LOG_API: "",
KEY_THREADS: 1
Expand Down Expand Up @@ -118,7 +129,9 @@ def version_from_init(init_file):
break
return version

def setup_data(datadir_arg,analysis_mode, config):
def setup_data(datadir_arg, analysis_mode, config):
global pangolin_assignment_version
global pangolin_assignment_path

datadir = check_datadir(datadir_arg)

Expand All @@ -145,14 +158,16 @@ def setup_data(datadir_arg,analysis_mode, config):
constellation_files.append(os.path.join(r, fn))

pangolin_data_version = pangolin_data.__version__

# pangolin_assignment_version and pangolin_assignment_path are set at module import time
use_datadir = False
datadir_too_old = False
if datadir:
version = "Unknown"
for r,d,f in os.walk(datadir):
for fn in f:
# pangolin-data/__init__.py not constellations/__init__.py:
if r.endswith('data') and fn == "__init__.py":
if r.endswith('/pangolin_data') and fn == "__init__.py":
# print("Found " + os.path.join(r, fn))
version = version_from_init(os.path.join(r, fn))
if not version:
Expand All @@ -163,22 +178,28 @@ def setup_data(datadir_arg,analysis_mode, config):
pangolin_data_version = version
use_datadir = True
else:
datadir_too_old = True
sys.stderr.write(cyan(f"Warning: Ignoring specified datadir {datadir} - it contains pangoLEARN model files older ({version}) than those installed ({pangolin_data.__version__})\n"))
sys.stderr.write(cyan(f"Warning: Ignoring pangolin data in specified datadir {datadir} - it contains pangolin_data older ({version}) than those installed ({pangolin_data.__version__})\n"))
elif r.endswith('/pangolin_assignment') and fn == '__init__.py':
version = version_from_init(os.path.join(r, fn))
if not version:
continue

if pangolin_assignment_version is None or LooseVersion(version) >= LooseVersion(pangolin_assignment_version):
# only use the datadir copy if its version is at least as new as the one we already have
pangolin_assignment_version = version
pangolin_assignment_path = r
else:
sys.stderr.write(cyan(f"Warning: Ignoring pangolin assignment in specified datadir {datadir} - it contains pangolin_assignment older ({version}) than those installed ({pangolin_assignment.__version__})\n"))
if use_datadir == False:
# we haven't got a viable datadir from searching args.datadir
if datadir and not datadir_too_old:
sys.stderr.write(cyan(
f"Warning: Ignoring specified datadir {datadir} - could not find __init__.py file to check versions \n"))

pangolin_data_dir = pangolin_data.__path__[0]
datadir = os.path.join(pangolin_data_dir,"data")

config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data_version
config[KEY_CONSTELLATIONS_VERSION] = constellations_version
config[KEY_DATADIR] = datadir
config[KEY_DATADIR] = datadir # this is the pangolin_data datadir, the naming is from when there was only a single datadir to worry about
config[KEY_CONSTELLATION_FILES] = constellation_files
config[KEY_PANGOLIN_ASSIGNMENT_VERSION] = pangolin_assignment_version
config[KEY_PANGOLIN_ASSIGNMENT_PATH] = pangolin_assignment_path

def parse_qc_thresholds(maxambig, minlen, reference_fasta, config):

Expand Down Expand Up @@ -207,11 +228,10 @@ def parse_qc_thresholds(maxambig, minlen, reference_fasta, config):

print(green(f"Maximum ambiguity allowed is {config[KEY_MAXAMBIG]}.\n****"))


def print_ram_warning(analysis_mode):
if analysis_mode == "pangolearn":
print(cyan("Warning: pangoLEARN mode may use a significant amount of RAM, be aware that it will not suit every system."))

def print_alias_file_exit(alias_file):
with open(alias_file, 'r') as handle:
for line in handle:
Expand Down Expand Up @@ -244,7 +264,7 @@ def print_versions_exit(config):
# Report pangolin_assignment version if it is installed, otherwise ignore
try:
import pangolin_assignment
print(f"pangolin-assignment: {pangolin_assignment.__version__}")
print(f"pangolin-assignment: {config[KEY_PANGOLIN_ASSIGNMENT_VERSION]}")
except:
pass
# Print versions of other important tools used by pangolin
Expand Down
95 changes: 59 additions & 36 deletions pangolin/utils/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import sys
import json
import re
import shutil
import tarfile
import subprocess
Expand All @@ -14,8 +15,10 @@

version_dict_keys = ['pangolin', 'scorpio', 'pangolin-data', 'constellations', 'pangolin-assignment']

dependency_web_dir = { 'pangolin-assignment': 'https://hgdownload.gi.ucsc.edu/goldenPath/wuhCor1/pangolin-assignment' }

def get_latest_release(dependency):

def get_latest_cov_lineages(dependency):
"""
Using the github releases API check for the latest release of dependency and its tarball
"""
Expand All @@ -29,8 +32,8 @@ def get_latest_release(dependency):
# so if this is thrown and there is definitely connectivity then
# double check the version labels
except Exception as e:
sys.stderr.write(cyan("Unable to connect to reach github API "
"--update/--data_update requires internet "
sys.stderr.write(cyan("Unable to connect to reach github API. "
"--update/--update-data requires internet "
"connectivity so may not work on certain "
"systems or if your IP has exceeded the "
f"5,000 request per hour limit\n{e}\n"))
Expand All @@ -49,64 +52,81 @@ def get_latest_release(dependency):
return latest_release, latest_release_tarball


def git_lfs_install():
def get_latest_web_dir(dependency, web_dir):
"""
'git-lfs install' must be run after installing git-lfs and before cloning a repo
that uses Git LFS.
Find the tarball url with the latest release from a web directory with versioned tarballs
instead of github. An HTTP GET of the web directory must return some text that contains
names of files in that directory, some of which are {dependency}-{version}.tar.gz.
"""
try:
subprocess.run(['git-lfs', 'install'],
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL)
except subprocess.CalledProcessError as e:
stderr = e.stderr.decode('utf-8')
sys.stderr.write(cyan(f"Error: {e}:\n{stderr}\n"))
listing = request.urlopen(web_dir).read().decode('utf-8')
except:
sys.stderr.write(cyan(f"Unable to read {web_dir}"))
sys.exit(-1)
tarRe = re.compile(f"{dependency}-(.*?).tar.gz")
matches = list(set(tarRe.findall(listing)))
if not matches:
sys.stderr.write(cyan(f"Can't find {dependency}-<version>.tar.gz files in listing of {web_dir}"))
sys.exit(-1)
versions = [LooseVersion(v) for v in matches]
versions.sort()
latest_release = str(versions[-1])
latest_release_tarball = f"{web_dir}/{dependency}-{latest_release}.tar.gz"
return latest_release, latest_release_tarball


def pip_install_dep(dependency, release):
def get_latest_release(dependency):
"""
Use pip install to install a cov-lineages repository with the specificed release
If dependency comes from a web directory then find latest release and tarball there, otherwise
query github API for cov-lineages repo
"""
if dependency in dependency_web_dir:
return get_latest_web_dir(dependency, dependency_web_dir[dependency])
else:
return get_latest_cov_lineages(dependency)


def pip_install_url(url):
"""
Use pip install to install a package from a url.
"""
url = f"git+https://github.com/cov-lineages/{dependency}.git@{release}"
subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade', url],
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL)


def install_pangolin_assignment():
def pip_install_cov_lineages(dependency, release):
"""
If the pangolin-assignment repo has not been installed already then install the latest release.
Use pip install to install a cov-lineages repository with the specified release
"""
try:
import pangolin_assignment
print(f"pangolin-assignment already installed with version {pangolin_assignment.__version__}; use --update or --update-data if you wish to update it.", file=sys.stderr)

except:
git_lfs_install()
latest_release, tarball = get_latest_release('pangolin-assignment')
pip_install_dep('pangolin-assignment', latest_release)
print(f"pangolin-assignment installed with latest release ({latest_release})")
url = f"git+https://github.com/cov-lineages/{dependency}.git@{release}"
pip_install_url(url)


def add_pangolin_assignment_if_installed(version_dictionary):
def install_pangolin_assignment(pangolin_assignment_version, datadir):
"""
If pangolin_assignment has been installed then add it to version_dictionary, else ignore.
If the pangolin-assignment repo has not been installed already then install the latest release.
"""
try:
import pangolin_assignment
version_dictionary["pangolin-assignment"] = pangolin_assignment.__version__
except:
pass
if pangolin_assignment_version is not None:
print(f"pangolin-assignment already installed with version {pangolin_assignment_version}; use --update or --update-data if you wish to update it.", file=sys.stderr)
else:
latest_release, tarball = get_latest_release('pangolin-assignment')
if datadir is not None and os.path.exists(datadir):
# install pangolin-assignment to datadir instead of using pip install
version_dictionary = {'pangolin-assignment': '0'}
update(version_dictionary, datadir)
else:
pip_install_url(tarball)
print(f"pangolin-assignment installed with latest release ({latest_release})")


def update(version_dictionary, data_dir=None):
"""
Using the github releases API check for the latest current release
of the set of dependencies provided e.g., pangolin, scorpio, pangolin-data and
constellations for complete --update and just pangolin-data and constellations
for --update_data. If pangolin-assignment has been added to the installation
for --update-data. If pangolin-assignment has been added to version_dictionary
then it will be included in both --update and --update-data.

Dictionary keys must be one of pangolin, scorpio, pangolin-data, constellations
Expand Down Expand Up @@ -170,7 +190,10 @@ def update(version_dictionary, data_dir=None):
shutil.rmtree(destination_directory)
shutil.move(os.path.join(tempdir, extracted_dir, dependency_package), destination_directory)
else:
pip_install_dep(dependency, latest_release)
if dependency in dependency_web_dir:
pip_install_url(latest_release_tarball)
else:
pip_install_cov_lineages(dependency, latest_release)
print(f"{dependency} updated to {latest_release}", file=sys.stderr)
elif version > latest_release_tidied:
print(f"{dependency} ({version}) is newer than latest stable "
Expand Down