From a1b65b88dabd297950497a70ede9ec1c50c2a856 Mon Sep 17 00:00:00 2001 From: martin-raden Date: Fri, 1 Dec 2023 21:48:49 +0100 Subject: [PATCH] BUGFIX accFile loading for sequence subsets --- ChangeLog | 14 +++++++-- src/IntaRNA/RnaSequence.h | 30 ++++++++++++++++++-- src/bin/CommandLineParsing.cpp | 4 +-- src/bin/CommandLineParsing.h | 52 +++------------------------------- 4 files changed, 46 insertions(+), 54 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8842063c..92bf1e1a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -13,12 +13,22 @@ # IntaRNA - new arguments t|qPfScale to tune ViennaRNA pf_scale parameter used to scale partition functions for ED value computation to avoid overflows -- BUGFIX : sequence subset selection not supported when accessibilities are read - from file +- BUGFIX : when using sequence subset selection in combination with precomputed + accessibilities from file, wrong accessibilities were loaded for the selected + sequences ################################################################################ ################################################################################ +231201 Martin Raden + * IntaRNA/RNASequence : + + seqNumber : index of this sequence among its input sequence set. + relevant for accessibility loading from file + * bin/CommandLineParsing : + * getFullFilename() : uses RNASequence::seqNumber instead of index within sequence + vector. this enables the correct loading of respective accessibility files in + case only a subset of the sequences is used for prediction (--tSet or --qSet) + 230201 Martin Raden * IntaRNA/AccessibilityVrna : * constructor() : diff --git a/src/IntaRNA/RnaSequence.h b/src/IntaRNA/RnaSequence.h index ee3d5c4f..2352d430 100644 --- a/src/IntaRNA/RnaSequence.h +++ b/src/IntaRNA/RnaSequence.h @@ -63,10 +63,13 @@ class RnaSequence { * @param id the name of the sequence * @param seqString the nucleotide string encoding the sequence * @param idxPos0 input/output index of the first sequence position + * @param seqNumber index of the sequence among the respective sequence set in this call. + * relevant for accessibility parsing */ RnaSequence(const std::string& id , const std::string & seqString - , const long idxPos0 = 1 ); + , const long idxPos0 = 1 + , const size_t seqNumber = 1 ); /** * Destruction and garbage collection @@ -83,6 +86,13 @@ class RnaSequence { const std::string& getId() const; + /** + * Access to the sequence's index among the respective input sequence set of the call arguments + * @return the sequence index among the input sequence set (1-based) + */ + const size_t + getSeqNumber() const; + /** * Access to the length of the sequence. * @return the length of the sequence. @@ -284,6 +294,10 @@ class RnaSequence { //! Input/output index of the first sequence position long idxPos0; + //! index of the sequence within the overall input (1-based). + //! relevant for reading accessibility files + size_t seqNumber; + }; @@ -296,13 +310,15 @@ inline RnaSequence::RnaSequence( const std::string & id , const std::string & seqString - , const long idxPos0 ) + , const long idxPos0 + , const size_t seqNumber ) : id(id) , seqString(getUpperCase(seqString)) , seqCode(getCodeForString(this->seqString)) , ambiguous(this->seqString.find('N')!=std::string::npos) , idxPos0(idxPos0) + , seqNumber(seqNumber) { #if INTARNA_IN_DEBUG_MODE if (id.size() == 0) { @@ -334,6 +350,16 @@ getId() const ///////////////////////////////////////////////////////////////////////////// +inline +const size_t +RnaSequence:: +getSeqNumber() const +{ + return seqNumber; +} + +///////////////////////////////////////////////////////////////////////////// + inline size_t RnaSequence:: diff --git a/src/bin/CommandLineParsing.cpp b/src/bin/CommandLineParsing.cpp index c2a5e440..7238f973 100644 --- a/src/bin/CommandLineParsing.cpp +++ b/src/bin/CommandLineParsing.cpp @@ -2196,7 +2196,7 @@ parseSequencesFasta( const std::string & paramName, // check if sequence is to be stored if (seqSubset.empty() || seqSubset.covers(seqNumber)) { // store sequence - sequences.push_back( RnaSequence( idPrefix+name, sequence, idxPos0 ) ); + sequences.push_back( RnaSequence( idPrefix+name, sequence, idxPos0 , seqNumber) ); } } // clear name data @@ -2247,7 +2247,7 @@ parseSequencesFasta( const std::string & paramName, // check if sequence is to be stored if (seqSubset.empty() || seqSubset.covers(seqNumber)) { // store sequence - sequences.push_back( RnaSequence( idPrefix+name, sequence, idxPos0 ) ); + sequences.push_back( RnaSequence( idPrefix+name, sequence, idxPos0 , seqNumber) ); } } diff --git a/src/bin/CommandLineParsing.h b/src/bin/CommandLineParsing.h index 843b5b95..594c101b 100644 --- a/src/bin/CommandLineParsing.h +++ b/src/bin/CommandLineParsing.h @@ -1212,11 +1212,6 @@ void CommandLineParsing::validate_qAccFile(const std::string & value, const Inde } } else { // should be file - // subsetting not supported, since indexing is screwed - if (!subSet.empty()) { - LOG(ERROR) <<"reading query accessibilities for a subset of sequences from '"< 1) { - fileID += "s"; -// prefix += "t"; - // search for index of the target sequence - for (size_t t = 0; t < getTargetSequences().size(); t++) { - if (getTargetSequences().at(t) == *target) { - // indexing starts with 1 - fileID += toString(t+1); - break; - } - } + fileID += "s" + toString(target->getSeqNumber()); } } else // generate query only if (query != NULL && target == NULL) { if (getQuerySequences().size() > 1) { - fileID += "s"; -// prefix += "q"; - // search for index of the query sequence - for (size_t q = 0; q < getQuerySequences().size(); q++) { - if (getQuerySequences().at(q) == *query) { - // indexing starts with 1 - fileID += toString(q+1); - break; - } - } + fileID += "s"+toString(query->getSeqNumber()); } } else // generate combined part { if (getQuerySequences().size() > 1 || getTargetSequences().size() > 1) { - fileID += "t"; - // search for index of the target sequence - for (size_t t = 0; t < getTargetSequences().size(); t++) { - if (getTargetSequences().at(t) == *target) { - // indexing starts with 1 - fileID += toString(t+1); - break; - } - } - fileID += "q"; - // search for index of the query sequence - for (size_t q = 0; q < getQuerySequences().size(); q++) { - if (getQuerySequences().at(q) == *query) { - // indexing starts with 1 - fileID += toString(q+1); - break; - } - } + fileID += "t"+toString(target->getSeqNumber()); + fileID += "q"+toString(query->getSeqNumber()); } }