Skip to content

Commit

Permalink
Merge pull request #4 from phac-nml/fix/empty-output
Browse files Browse the repository at this point in the history
Bug Fix for When Mash Output is Empty
  • Loading branch information
ericenns authored Nov 23, 2020
2 parents cd37a82 + e282174 commit 4f42693
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 33 deletions.
2 changes: 1 addition & 1 deletion refseq_masher/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

__version__ = '0.1.1'
__version__ = '0.1.2'
program_name = 'refseq_masher'
program_summary = 'Mash MinHash search your sequences against a NCBI RefSeq genomes database'
program_desc = program_summary + '''
Expand Down
27 changes: 18 additions & 9 deletions refseq_masher/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,21 @@ def contains(mash_bin, output, output_type, top_n_results, min_identity, max_pva
max_pvalue=max_pvalue,
min_identity=min_identity,
parallelism=parallelism)
if top_n_results > 0:
df = df.head(top_n_results)
dfs.append(df)
logging.info('Ran Mash Screen on all input. Merging NCBI taxonomic information into results output.')
dfout = merge_ncbi_taxonomy_info(pd.concat(dfs))
logging.info('Merged taxonomic information into results output')
logging.info('Reordering output columns')
dfout = order_output_columns(dfout, MASH_SCREEN_ORDERED_COLUMNS)
write_dataframe(dfout, output_path=output, output_type=output_type)

if df is not None:
if top_n_results > 0:
df = df.head(top_n_results)
dfs.append(df)

logging.info('Ran Mash Screen on all input.')

if len(dfs) > 0:
logging.info('Merging NCBI taxonomic information into results output.')
dfout = merge_ncbi_taxonomy_info(pd.concat(dfs))
logging.info('Merged taxonomic information into results output')
logging.info('Reordering output columns')
dfout = order_output_columns(dfout, MASH_SCREEN_ORDERED_COLUMNS)
write_dataframe(dfout, output_path=output, output_type=output_type)

else:
logging.info('There were no matches found.')
26 changes: 16 additions & 10 deletions refseq_masher/mash/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,20 @@ def mash_screen_output_to_dataframe(mash_out: str) -> pd.DataFrame:
mash_out: Mash screen stdout
Returns:
(pd.DataFrame): Mash screen output table ordered by `identity` and `median_multiplicity` columns in descending order
(pd.DataFrame): Mash screen output table ordered by `identity` and `median_multiplicity` columns in descending
order, or None if the Mash output is empty
"""
df = pd.read_table(StringIO(mash_out))
ncols = df.shape[1]
df.columns = MASH_SCREEN_COLUMNS[:ncols]
df.sort_values(by=['identity', 'median_multiplicity'], ascending=[False, False], inplace=True)
match_ids = df.match_id
refseq_matches = [parse_refseq_info(match_id=match_id) for match_id in match_ids]
dfmatch = pd.DataFrame(refseq_matches)
dfmerge = pd.merge(dfmatch, df, on='match_id')
return dfmerge

dfmerge = None

if len(mash_out) > 0:
df = pd.read_table(StringIO(mash_out))
ncols = df.shape[1]
df.columns = MASH_SCREEN_COLUMNS[:ncols]
df.sort_values(by=['identity', 'median_multiplicity'], ascending=[False, False], inplace=True)
match_ids = df.match_id
refseq_matches = [parse_refseq_info(match_id=match_id) for match_id in match_ids]
dfmatch = pd.DataFrame(refseq_matches)
dfmerge = pd.merge(dfmatch, df, on='match_id')

return dfmerge
9 changes: 7 additions & 2 deletions refseq_masher/mash/screen.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def vs_refseq(inputs: Union[str, List[str]],
parallelism: Mash screen number of parallel threads to spawn
Returns:
(pd.DataFrame): Parsed Mash screen results dataframe
(pd.DataFrame): Parsed Mash screen results dataframe or None if the output of Mash was empty
"""
cmd_list = [mash_bin, 'screen',
'-v', str(max_pvalue),
Expand All @@ -40,9 +40,14 @@ def vs_refseq(inputs: Union[str, List[str]],
cmd_list.append(inputs)
else:
raise TypeError('Unexpected type "{}" for "inputs": {}'.format(type(inputs), inputs))

logging.info('Running Mash Screen with NCBI RefSeq sketch database '
'against sample "%s" with inputs: %s', sample_name, inputs)
exit_code, stdout, stderr = run_command(cmd_list, stderr=None)

df = mash_screen_output_to_dataframe(stdout)
df['sample'] = sample_name

if df is not None:
df['sample'] = sample_name

return df
4 changes: 4 additions & 0 deletions tests/data/small.fastq
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
@READ1
AGCTACGTAGCTATATATTATAGCGTAGCTCGTGCGCTAGCTAGCTATATGCTGATGCTCTAGCTATATTATACTAGCTATAGCTAAAACGCTGCTCGTAA
+
CCCCCCCCCCCCCCCCCCCCC;CCCCCCCCCCCCCC;CCCCCCCCCCCCCCCCC-CCCCCCCCCCCCCCCCCCCCCCCCCC;CCCCCCCCCCCCC-CCCCC
35 changes: 24 additions & 11 deletions tests/test_contains.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-

import unittest
from io import StringIO

from click.testing import CliRunner
Expand All @@ -13,16 +13,29 @@
def runner():
return CliRunner()

def test_fasta(runner):
    """Check that `contains` reports Salmonella Enteritidis as the top match for the test FASTA.

    Args:
        runner: Click CLI test runner used to invoke the `contains` command
    """
    fasta = 'tests/data/Se-Enteritidis.fasta'
    result = runner.invoke(cli.contains, ['--top-n-results', '1',
                                          '-p', '4',
                                          fasta])

    assert result.exit_code == 0, 'Exit code should be 0'

    expected_top_tax_name = 'Salmonella enterica subsp. enterica serovar Enteritidis'
    df = pd.read_table(StringIO(result.output))
    # `.all()` is vacuously True on an empty Series, so require at least one result row first.
    assert not df.empty, 'Expected at least one Mash RefSeq result in the output'
    # regex=False: the expected name contains "." which would otherwise match any character.
    assert df.top_taxonomy_name.str.contains(expected_top_tax_name, regex=False).all(), \
        'Top Mash RefSeq result should have "{}" in the top_taxonomy_name field'.format(expected_top_tax_name)


class TestContains(unittest.TestCase):

def test_fasta(runner):
fasta = 'tests/data/Se-Enteritidis.fasta'
result = runner.invoke(cli.contains, ['--top-n-results', '1',
'-p', '4',
fasta])
def test_small_fastq(self):
fastq = 'tests/data/small.fastq'
runner = CliRunner()

assert result.exit_code == 0, 'Exit code should be 0'
with self.assertLogs(level='INFO') as cm:
result = runner.invoke(cli.contains, ['-p', '4', fastq])
assert "There were no matches found." in " ".join(cm.output)

expected_top_tax_name = 'Salmonella enterica subsp. enterica serovar Enteritidis'
df = pd.read_table(StringIO(result.output))
assert df.top_taxonomy_name.str.contains(expected_top_tax_name).all(), \
'Top Mash RefSeq result should have "{}" in the top_taxonomy_name field'.format(expected_top_tax_name)
assert result.exit_code == 0, 'Exit code should be 0'
assert len(result.stdout) == 0, 'No output results on empty data'

0 comments on commit 4f42693

Please sign in to comment.