Skip to content

Commit

Permalink
BUGFIX accFile loading for sequence subsets
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-raden committed Dec 1, 2023
1 parent b51e3f0 commit a1b65b8
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 54 deletions.
14 changes: 12 additions & 2 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,22 @@
# IntaRNA
- new arguments t|qPfScale to tune ViennaRNA pf_scale parameter used
to scale partition functions for ED value computation to avoid overflows
- BUGFIX : sequence subset selection not supported when accessibilities are read
from file
- BUGFIX : when using sequence subset selection in combination with precomputed
accessibilities from file, wrong accessibilities were loaded for the selected
sequences

################################################################################
################################################################################

231201 Martin Raden
* IntaRNA/RNASequence :
+ seqNumber : index of this sequence among its input sequence set.
relevant for accessibility loading from file
* bin/CommandLineParsing :
* getFullFilename() : uses RNASequence::seqNumber instead of index within sequence
vector. this enables the correct loading of respective accessibility files in
case only a subset of the sequences is used for prediction (--tSet or --qSet)

230201 Martin Raden
* IntaRNA/AccessibilityVrna :
* constructor() :
Expand Down
30 changes: 28 additions & 2 deletions src/IntaRNA/RnaSequence.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,13 @@ class RnaSequence {
* @param id the name of the sequence
* @param seqString the nucleotide string encoding the sequence
* @param idxPos0 input/output index of the first sequence position
* @param seqNumber index of the sequence among the respective sequence set in this call.
* relevant for accessibility parsing
*/
RnaSequence(const std::string& id
, const std::string & seqString
, const long idxPos0 = 1 );
, const long idxPos0 = 1
, const size_t seqNumber = 1 );

/**
* Destruction and garbage collection
Expand All @@ -83,6 +86,13 @@ class RnaSequence {
const std::string&
getId() const;

/**
* Access to the sequence's index among the respective input sequence set of the call arguments
* @return the sequence index among the input sequence set (1-based)
*/
const size_t
getSeqNumber() const;

/**
* Access to the length of the sequence.
* @return the length of the sequence.
Expand Down Expand Up @@ -284,6 +294,10 @@ class RnaSequence {
//! Input/output index of the first sequence position
long idxPos0;

//! index of the sequence within the overall input (1-based).
//! relevant for reading accessibility files
size_t seqNumber;

};


Expand All @@ -296,13 +310,15 @@ inline
RnaSequence::RnaSequence(
const std::string & id
, const std::string & seqString
, const long idxPos0 )
, const long idxPos0
, const size_t seqNumber )
:
id(id)
, seqString(getUpperCase(seqString))
, seqCode(getCodeForString(this->seqString))
, ambiguous(this->seqString.find('N')!=std::string::npos)
, idxPos0(idxPos0)
, seqNumber(seqNumber)
{
#if INTARNA_IN_DEBUG_MODE
if (id.size() == 0) {
Expand Down Expand Up @@ -334,6 +350,16 @@ getId() const

/////////////////////////////////////////////////////////////////////////////

inline
const size_t
RnaSequence::
getSeqNumber() const
{
return seqNumber;
}

/////////////////////////////////////////////////////////////////////////////

inline
size_t
RnaSequence::
Expand Down
4 changes: 2 additions & 2 deletions src/bin/CommandLineParsing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2196,7 +2196,7 @@ parseSequencesFasta( const std::string & paramName,
// check if sequence is to be stored
if (seqSubset.empty() || seqSubset.covers(seqNumber)) {
// store sequence
sequences.push_back( RnaSequence( idPrefix+name, sequence, idxPos0 ) );
sequences.push_back( RnaSequence( idPrefix+name, sequence, idxPos0 , seqNumber) );
}
}
// clear name data
Expand Down Expand Up @@ -2247,7 +2247,7 @@ parseSequencesFasta( const std::string & paramName,
// check if sequence is to be stored
if (seqSubset.empty() || seqSubset.covers(seqNumber)) {
// store sequence
sequences.push_back( RnaSequence( idPrefix+name, sequence, idxPos0 ) );
sequences.push_back( RnaSequence( idPrefix+name, sequence, idxPos0 , seqNumber) );
}
}

Expand Down
52 changes: 4 additions & 48 deletions src/bin/CommandLineParsing.h
Original file line number Diff line number Diff line change
Expand Up @@ -1212,11 +1212,6 @@ void CommandLineParsing::validate_qAccFile(const std::string & value, const Inde
}
} else {
// should be file
// subsetting not supported, since indexing is screwed
if (!subSet.empty()) {
LOG(ERROR) <<"reading query accessibilities for a subset of sequences from '"<<value<<"' is not supported";
updateParsingCode(ReturnCode::STOP_PARSING_ERROR);
}
// check needed files
for (auto rna = getQuerySequences().begin(); rna != getQuerySequences().end(); rna++) {
if ( ! validateFile( getFullFilename(value, NULL, &(*rna) ) ) ) {
Expand Down Expand Up @@ -1299,11 +1294,6 @@ void CommandLineParsing::validate_tAccFile(const std::string & value, const Inde
}
} else {
// should be file
// subsetting not supported, since indexing is screwed
if (!subSet.empty()) {
LOG(ERROR) <<"reading target accessibilities for a subset of sequences from '"<<value<<"' is not supported";
updateParsingCode(ReturnCode::STOP_PARSING_ERROR);
}
// check needed files
for (auto rna = getTargetSequences().begin(); rna != getTargetSequences().end(); rna++) {
if ( ! validateFile( getFullFilename(value, &(*rna), NULL ) ) ) {
Expand Down Expand Up @@ -1626,54 +1616,20 @@ getFullFilename( const std::string & fileNamePath, const RnaSequence * target, c
// generate target only
if (target != NULL && query == NULL) {
if (getTargetSequences().size() > 1) {
fileID += "s";
// prefix += "t";
// search for index of the target sequence
for (size_t t = 0; t < getTargetSequences().size(); t++) {
if (getTargetSequences().at(t) == *target) {
// indexing starts with 1
fileID += toString(t+1);
break;
}
}
fileID += "s" + toString(target->getSeqNumber());
}
} else
// generate query only
if (query != NULL && target == NULL) {
if (getQuerySequences().size() > 1) {
fileID += "s";
// prefix += "q";
// search for index of the query sequence
for (size_t q = 0; q < getQuerySequences().size(); q++) {
if (getQuerySequences().at(q) == *query) {
// indexing starts with 1
fileID += toString(q+1);
break;
}
}
fileID += "s"+toString(query->getSeqNumber());
}
} else
// generate combined part
{
if (getQuerySequences().size() > 1 || getTargetSequences().size() > 1) {
fileID += "t";
// search for index of the target sequence
for (size_t t = 0; t < getTargetSequences().size(); t++) {
if (getTargetSequences().at(t) == *target) {
// indexing starts with 1
fileID += toString(t+1);
break;
}
}
fileID += "q";
// search for index of the query sequence
for (size_t q = 0; q < getQuerySequences().size(); q++) {
if (getQuerySequences().at(q) == *query) {
// indexing starts with 1
fileID += toString(q+1);
break;
}
}
fileID += "t"+toString(target->getSeqNumber());
fileID += "q"+toString(query->getSeqNumber());
}
}

Expand Down

0 comments on commit a1b65b8

Please sign in to comment.