Skip to content

Commit

Permalink
v1.11.0 SMap remove nan
Browse files Browse the repository at this point in the history
  • Loading branch information
SoftwareLiteracy committed Apr 4, 2022
1 parent 012b291 commit 8424b3b
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 14 deletions.
90 changes: 78 additions & 12 deletions src/API.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,15 @@
// Functions implemented in Eval.cc:
// EmbedDimension(), PredictInterval(), PredictNonlinear()
//
// Note: Functions that implement either filepath or DataFrame
// NOTE: Functions that implement either filepath or DataFrame
// input are overloads. The first pattern takes a filepath
// argument, creates the DataFrame object, then calls the
// second with the DataFrame.
// second with a reference to the DataFrame.
//
// In the case of SMap, there are 4 overloads, two with
// the default SVD solver, two with a user supplied solver.
// In all cases, the final overload (4) creates the SMap
// object and executes the SMap algorithm.
//----------------------------------------------------------------

#include "API.h"
Expand Down Expand Up @@ -172,7 +177,7 @@ DataFrame< double > MakeBlock( DataFrame< double > & dataFrame,
}

//----------------------------------------------------------------------
// Simplex with path/file input
// Simplex with pathIn/dataFile input : calls Simplex( & DataFrame )
//----------------------------------------------------------------------
SimplexValues Simplex( std::string pathIn,
std::string dataFile,
Expand All @@ -197,7 +202,7 @@ SimplexValues Simplex( std::string pathIn,
// DataFrame constructor loads data
DataFrame< double > DF( pathIn, dataFile );

// Pass data frame to Simplex
// Call Simplex( & DataFrame )
SimplexValues S = Simplex( std::ref( DF ),
pathOut,
predictFile,
Expand Down Expand Up @@ -269,7 +274,7 @@ SimplexValues Simplex( DataFrame< double > & DF,
}

//----------------------------------------------------------------------------
// 1) SMap with path/file input
// 1) SMap with pathIn/dataFile input. Calls overload 2)
// Default SVD (LAPACK) assigned in SMap() overload 2)
//----------------------------------------------------------------------------
SMapValues SMap( std::string pathIn,
Expand Down Expand Up @@ -309,8 +314,8 @@ SMapValues SMap( std::string pathIn,
}

//----------------------------------------------------------------------------
// 2) SMap with DataFrame
// Default SVD (LAPACK) assigned in Smap.cc overload 2)
// 2) SMap with DataFrame. Calls overload 4)
// Default SVD (LAPACK) was assigned in Smap.cc overload 2)
//----------------------------------------------------------------------------
SMapValues SMap( DataFrame< double > & DF,
std::string pathOut,
Expand Down Expand Up @@ -347,7 +352,7 @@ SMapValues SMap( DataFrame< double > & DF,
}

//----------------------------------------------------------------------------
// 3) Data path/file with external solver object
// 3) Data pathIn/dataFile with external solver object. Calls 4)
//----------------------------------------------------------------------------
SMapValues SMap( std::string pathIn,
std::string dataFile,
Expand Down Expand Up @@ -424,8 +429,69 @@ SMapValues SMap( DataFrame< double > & DF,
const_predict, verbose, validLib,
generateSteps, parameterList, smapFile );

// Handle nan
// If nan are found in library or prediction rows of columns or target,
// remove them from DataFrame DF
// nan rows are saved in DF nanRows, validRows
// JP: Seems silly to do this check as a default...
std::vector< std::string > nanColsCheck = parameters.columnNames;
// Add target to nanColsCheck for DF.NanRows()
// Don't add empty or degenerate target
if ( not parameters.targetName.empty() and
find( nanColsCheck.begin(), nanColsCheck.end(), parameters.targetName )
== nanColsCheck.end() ) {
nanColsCheck.push_back( parameters.targetName );
}
bool nanFound = DF.NanRows( nanColsCheck ); // Look for nan

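// Working DataFrame: DF, or the result of DataFrameRemoveNanRows().
// NOTE: DF_ is declared by value, so assigning std::ref( ... ) copies the DataFrame.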
DataFrame< double > DF_ = std::ref( DF );

if ( nanFound ) {
// Remove DF nanRows from parameters.library and prediction
// If any nan are found, set nanRemovedLibPred true
bool nanRemovedLibPred = false;

// First, sort nanRows to delete from lib/pred in reverse order
std::sort( DF.NanRows().begin(), DF.NanRows().end() );

// Now erase in reverse order
std::vector< size_t >::iterator vi;
std::vector< size_t >::reverse_iterator ri;

for ( ri = DF.NanRows().rbegin(); ri != DF.NanRows().rend(); ++ri ) {
vi = std::find( parameters.library.begin(),
parameters.library.end(), *ri );
if ( vi != parameters.library.end() ) {
parameters.library.erase( vi );
if ( not nanRemovedLibPred ){ nanRemovedLibPred = true; }
}
vi = std::find( parameters.prediction.begin(),
parameters.prediction.end(), *ri );
if ( vi != parameters.prediction.end() ) {
parameters.prediction.erase( vi );
if ( not nanRemovedLibPred ){ nanRemovedLibPred = true; }
}
}

if ( nanRemovedLibPred ) {
// JP: If DF is large, it is dumb to create a new DF
DataFrame< double > DFNanRemove = DF.DataFrameRemoveNanRows();
DF_ = std::ref( DFNanRemove );

std::stringstream msg;
msg << "WARNING: SMap() nan rows detected in columns or target. "
<< DF.NanRows().size() << " deleted. "
<< "Original number of rows " << DF.NRows() << ".\n";
if ( not parameters.embedded ) {
msg << "Time delay embedding presumption violated.\n";
}
std::cout << msg.str();
}
}

// Instantiate EDM::SMapClass object
SMapClass SMapModel = SMapClass( DF, std::ref( parameters ) );
SMapClass SMapModel = SMapClass( DF_, std::ref( parameters ) );

if ( generateSteps ) {
SMapModel.Generate( solver );
Expand All @@ -439,11 +505,11 @@ SMapValues SMap( DataFrame< double > & DF,
values.coefficients = SMapModel.coefficients;
values.parameterMap = SMapModel.parameters.Map;

return values;
return values;
}

//----------------------------------------------------------------------
// CCM with path/file input
// CCM with pathIn/dataFile input. Calls CCM( & DF )
//----------------------------------------------------------------------
CCMValues CCM( std::string pathIn,
std::string dataFile,
Expand Down Expand Up @@ -553,7 +619,7 @@ CCMValues CCM( DataFrame< double > & DF,
}

//----------------------------------------------------------------------
// Multiview with path/file input
// Multiview with path/file input. Calls Multiview( & DF )
//----------------------------------------------------------------------
MultiviewValues Multiview( std::string pathIn,
std::string dataFile,
Expand Down
65 changes: 64 additions & 1 deletion src/DataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <iomanip>
#include <fstream>
#include <iterator>
#include <unordered_set>

// Common.cc
extern std::vector<std::string> SplitString( std::string inString,
Expand All @@ -22,7 +23,7 @@ typedef std::vector<std::pair<std::string, std::vector<double>>> NamedData;
//----------------------------------------------------------------
// DataFrame class
// Data container is a single, contiguous valarray: elements.
// NOTE: elements are Row Major format, ala C, C++, numpy
// NOTE: elements are Row Major format, ala C, C++, numpy.
// DataFrame element access is through the () operator: (row,col).
// The time column is not processed as data, but as strings.
//----------------------------------------------------------------
Expand All @@ -40,6 +41,9 @@ class DataFrame {
std::string timeName;
NamedData namedData;

std::vector< size_t > nanRows; // SMap DataFrameRemoveNanRows
std::vector< size_t > validRows; // SMap DataFrameRemoveNanRows

size_t maxRowPrint;
bool noTime;

Expand Down Expand Up @@ -134,6 +138,11 @@ class DataFrame {
return columnNameToIndex;
}

// Row indices recorded by NanRows( columns ) as holding nan.
// The const overload returns a copy, the non-const overload returns
// a mutable reference to the member.
std::vector< size_t > NanRows() const {
    return nanRows;
}
std::vector< size_t > & NanRows() {
    return nanRows;
}

// Row indices recorded by NanRows( columns ) as free of nan.
// The const overload returns a copy, the non-const overload returns
// a mutable reference to the member.
std::vector< size_t > ValidRows() const {
    return validRows;
}
std::vector< size_t > & ValidRows() {
    return validRows;
}

size_t MaxRowPrint() const { return maxRowPrint; }
size_t &MaxRowPrint() { return maxRowPrint; }

Expand Down Expand Up @@ -312,6 +321,60 @@ class DataFrame {
return M;
}

//-----------------------------------------------------------------
// Scan columns for nan
// If no nan: return false, do not set DataFrame nanRows & validRows
// If nan: return true, set DataFrame nanRows & validRows
//-----------------------------------------------------------------
bool NanRows( std::vector< std::string > columns ) {
    // Scan the named columns for nan and record the affected rows.
    //   columns : names of the columns to examine, each resolved
    //             through VectorColumnName().
    //   returns : true if at least one nan was found, false otherwise.
    //
    // nanRows and validRows are always cleared first. If nan are found,
    // nanRows receives the unique row indices that hold a nan in any of
    // the columns and validRows receives every other row index; both are
    // in ascending row order. If no nan are found both remain empty.

    nanRows.clear(); // JP instead: throw warn/error if already set?
    validRows.clear();

    // One flag per row: nonzero if any scanned column has nan in that row.
    // The flags keep rows unique and in row order, so nanRows does not
    // depend on the iteration order of a hash container.
    std::vector< char > rowHasNan( n_rows, 0 );
    bool nanFound = false;

    // Scan each column (by reference: no per-iteration string copy)
    for ( const auto & col : columns ) {
        std::valarray< T > colData = VectorColumnName( col );
        // Scan each row
        for ( size_t row = 0; row < n_rows; row++ ) {
            if ( std::isnan( colData[ row ] ) ) {
                rowHasNan[ row ] = 1;
                nanFound = true;
            }
        }
    }

    if ( nanFound ) {
        // validRows is the complement of nanRows
        for ( size_t row = 0; row < n_rows; row++ ) {
            if ( rowHasNan[ row ] ) {
                nanRows.push_back( row );
            }
            else {
                validRows.push_back( row );
            }
        }
    }

    return nanFound;
}

//-----------------------------------------------------------------
// Return (sub)DataFrame with rows removed having nan in columns
//-----------------------------------------------------------------
DataFrame< T > DataFrameRemoveNanRows() {
    // Build the sub-DataFrame from the row indices stored in validRows.
    // NOTE: validRows is filled by NanRows( columns ); call that first.
    return DataFrameFromRowIndex( validRows );
}

//-----------------------------------------------------------------
// Return Elements in Column Major order (Fortran)
//-----------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion src/Parameter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ Parameters::Parameters(

// Set validated flag and instantiate Version
validated ( false ),
version ( 1, 10, 3, "2022-03-27" )
version ( 1, 11, 0, "2022-04-01" )
{
// Constructor code
if ( method != Method::None ) {
Expand Down

0 comments on commit 8424b3b

Please sign in to comment.