From 1a660ede4c3f8cc71df41f9badf8249f7befd26f Mon Sep 17 00:00:00 2001 From: Guillaume Poirier-Morency Date: Wed, 3 Jul 2024 10:49:12 -0700 Subject: [PATCH] Improve logic for detecting bulk RNA-Seq datasets --- rnaseq_pipeline/miniml_utils.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/rnaseq_pipeline/miniml_utils.py b/rnaseq_pipeline/miniml_utils.py index 2cd9680..fe0ccbb 100644 --- a/rnaseq_pipeline/miniml_utils.py +++ b/rnaseq_pipeline/miniml_utils.py @@ -26,13 +26,20 @@ def collect_geo_samples(f): for x in root.findall('miniml:Sample', ns): gsm_id = x.find("miniml:Accession[@database='GEO']", ns) - library_strategy = x.find('miniml:Library-Strategy', ns) platform_id = x.find('miniml:Platform-Ref', ns) sra_relation = x.find("miniml:Relation[@type='SRA']", ns) - if gsm_id is None or platform_id is None or library_strategy is None or sra_relation is None: + if gsm_id is None or platform_id is None or sra_relation is None: continue - if library_strategy.text in ['RNA-Seq', 'ssRNA-seq']: - gsm_identifiers.add(gsm_id.text) + # this has to match the logic in Gemma for bulk RNA-Seq, see GeoConverterImpl.java + sample_type = x.find('miniml:Type', ns) + if sample_type is None: + continue + if sample_type.text == 'SRA': + library_source = x.find('miniml:Library-Source', ns) + if library_source is not None and library_source.text == 'transcriptomic': + library_strategy = x.find('miniml:Library-Strategy', ns) + if library_strategy is not None and library_strategy.text in ['RNA-Seq', 'ssRNA-seq', 'OTHER']: + gsm_identifiers.add(gsm_id.text) return gsm_identifiers