From cac957175177305493fa1b2396b67e99c820246b Mon Sep 17 00:00:00 2001 From: Sergei Date: Wed, 8 Nov 2023 09:54:06 -0500 Subject: [PATCH 01/14] Better integration of ZLIB including support for compressed output --- CMakeLists.txt | 2 +- res/TemplateBatchFiles/libv3/IOFunctions.bf | 24 +- src/core/baseobj.cpp | 4 +- src/core/batchlan.cpp | 15 +- src/core/batchlanhelpers.cpp | 2 +- src/core/batchlanruntime.cpp | 67 ++-- src/core/dataset.cpp | 42 ++- src/core/dataset_filter.cpp | 4 +- src/core/fstring.cpp | 5 +- src/core/global_things.cpp | 170 ++-------- src/core/hbl_env.cpp | 7 +- src/core/hyFile.cpp | 336 ++++++++++++++++++++ src/core/include/baseobj.h | 4 +- src/core/include/dataset.h | 6 +- src/core/include/dataset_filter.h | 4 +- src/core/include/global_things.h | 29 +- src/core/include/hbl_env.h | 1 + src/core/include/hy_strings.h | 1 + src/core/include/hy_types.h | 43 +++ src/core/include/list.h | 2 +- src/core/include/matrix.h | 8 +- src/core/include/polynoml.h | 2 +- src/core/include/string_file_wrapper.h | 4 +- src/core/include/topology.h | 2 +- src/core/include/variable.h | 2 +- src/core/likefunc.cpp | 49 ++- src/core/list.cpp | 8 +- src/core/matrix.cpp | 75 +++-- src/core/polynoml.cpp | 23 +- src/core/string_file_wrapper.cpp | 18 +- src/core/strings.cpp | 20 ++ src/core/topology.cpp | 4 +- src/core/variable.cpp | 4 +- src/mains/unix.cpp | 7 +- src/new/bayesgraph.cpp | 22 +- 35 files changed, 669 insertions(+), 347 deletions(-) create mode 100644 src/core/hyFile.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e90decfa..c5dc1fcf7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,7 +168,7 @@ set_source_files_properties(${SRC_CORE} ${SRC_NEW} {SRC_UTILS} PROPERTIES COMPIL #------------------------------------------------------------------------------- set(DEFAULT_WARNING_FLAGS " -w -Weffc++ -Wextra -Wall ") -set(DEFAULT_DEBUG_WARNING_FLAGS "-Wall -Wno-int-to-pointer-cast -Wno-conversion-null -Wno-sign-compare -Wno-maybe-uninitialized") 
+set(DEFAULT_DEBUG_WARNING_FLAGS "-Wall -Wno-int-to-pointer-cast -Wno-conversion-null -Wno-sign-compare") diff --git a/res/TemplateBatchFiles/libv3/IOFunctions.bf b/res/TemplateBatchFiles/libv3/IOFunctions.bf index 4bed72707..da323c4e3 100644 --- a/res/TemplateBatchFiles/libv3/IOFunctions.bf +++ b/res/TemplateBatchFiles/libv3/IOFunctions.bf @@ -136,6 +136,25 @@ lfunction io._reportMessageHelper(analysis, text) { } } +/** + * @name io.CompressedObjectSpool + * @param json + * @param file + */ + +lfunction io.CompressedObjectSpool (object, file) { + GetString (have_zip, ZLIB_ENABLED, 0); + if (have_zip && file != "/dev/null") { + if (utility.GetEnvVariable ("GZIP_OUTPUT")) { + utility.ToggleEnvVariable("GZIP_COMPRESSION_LEVEL", 5); + fprintf(file, CLEAR_FILE, object); + utility.ToggleEnvVariable("GZIP_COMPRESSION_LEVEL", None); + return; + } + } + fprintf(file, CLEAR_FILE, object); + } + /** * @name io.SpoolJSON * @param json @@ -144,7 +163,7 @@ lfunction io._reportMessageHelper(analysis, text) { lfunction io.SpoolJSON(json, file) { utility.ToggleEnvVariable("USE_JSON_FOR_MATRIX", 1); if (Type(file) == "String" && Abs (file) > 0) { - fprintf(file, CLEAR_FILE, json); + io.CompressedObjectSpool (json, file); } else { fprintf(stdout, "\n", json, "\n"); } @@ -550,7 +569,8 @@ lfunction io.SpoolLF(lf_id, trunk_path, tag) { lfunction io.SpoolLFToPath(lf_id, path) { if (path != "/dev/null") { Export(__lf_spool, ^ lf_id); - fprintf(path, CLEAR_FILE, __lf_spool); + io.CompressedObjectSpool (__lf_spool, path); + DeleteObject (__lf_spool); } } diff --git a/src/core/baseobj.cpp b/src/core/baseobj.cpp index a72aebe9a..2ec2b0751 100644 --- a/src/core/baseobj.cpp +++ b/src/core/baseobj.cpp @@ -64,9 +64,9 @@ BaseRef BaseObj::toStr(unsigned long) { return new _String(kNullToken); } BaseRef BaseObj::toErrStr(void) { return toStr(); } //______________________________________________________________________________ -void BaseObj::toFileStr(FILE *dest, unsigned long padding) { +void 
BaseObj::toFileStr(hyFile *dest, unsigned long padding) { _String *s = (_String *)toStr(padding); - fwrite(s->get_str(), 1, s->length(), dest); + dest->fwrite(s->get_str(), 1, s->length()); DeleteObject (s); } diff --git a/src/core/batchlan.cpp b/src/core/batchlan.cpp index 215ef249d..c72d288c2 100644 --- a/src/core/batchlan.cpp +++ b/src/core/batchlan.cpp @@ -2675,7 +2675,7 @@ void _ElementaryCommand::ExecuteCase5 (_ExecutionList& chain) { } SetStatusLine ("Loading Data"); - df = hyFile::openFile (fName.get_str(),"rb"); + df = hyFile::openFile (fName.get_str(),kFileReadBinary); if (df==nil) { // try reading this file as a string formula fName = GetStringFromFormula ((_String*)parameters(1),chain.nameSpacePrefix); @@ -2685,7 +2685,7 @@ void _ElementaryCommand::ExecuteCase5 (_ExecutionList& chain) { return; } - df = hyFile::openFile (fName.get_str(),"rb"); + df = hyFile::openFile (fName.get_str(),kFileReadBinary); if (df==nil) { HandleApplicationError ((_String ("Could not find source dataset file ") & ((_String*)parameters(1))->Enquote('"') & " (resolved to '" & fName & "')\nPath stack:\n\t" & GetPathStack ("\n\t"))); @@ -3208,12 +3208,12 @@ void _ElementaryCommand::ExecuteCase52 (_ExecutionList& chain) { _String spool_file; - FILE* main_file = nil; + hyFile* main_file = nil; if (parameters.countitems () > 6) { spool_file = ProcessLiteralArgument (GetIthParameter(6),chain.nameSpacePrefix); ProcessFileName(spool_file); - main_file = doFileOpen (spool_file,"w"); + main_file = doFileOpen (spool_file,kFileWrite); if (!main_file) { throw (_String("Failed to open ") & spool_file.Enquote() & " for writing"); } @@ -3565,7 +3565,7 @@ bool _ElementaryCommand::Execute (_ExecutionList& chain) { if (terminate_execution) { return false; } - FILE* theDump = doFileOpen (fName,"rb"); + hyFile* theDump = doFileOpen (fName,kFileReadBinary); if (!theDump) { HandleApplicationError (((_String)("File ")&fName&_String(" couldn't be open for reading."))); return false; @@ -3585,7 +3585,8 
@@ bool _ElementaryCommand::Execute (_ExecutionList& chain) { } else { importResult = false; } - fclose (theDump); + theDump->close(); + delete (theDump); return importResult; } break; @@ -5114,7 +5115,7 @@ void ReadBatchFile (_String& fName, _ExecutionList& target) { FetchVar(LocateVarByName (optimizationPrecision))->SetValue(&precvalue); #endif*/ - hyFile *f = hyFile::openFile (fName.get_str (), "rb"); + hyFile *f = hyFile::openFile (fName.get_str (), kFileReadBinary); SetStatusLine ("Parsing File"); if (!f) { HandleApplicationError (_String("Could not read batch file '") & fName & "'.\nPath stack:\n\t" & GetPathStack("\n\t")); diff --git a/src/core/batchlanhelpers.cpp b/src/core/batchlanhelpers.cpp index 35819a0fc..a555bba77 100644 --- a/src/core/batchlanhelpers.cpp +++ b/src/core/batchlanhelpers.cpp @@ -83,7 +83,7 @@ void ReadModelList(void) { if (templateModelList.empty() == false) return; _String modelListFile (GetStandardDirectory (HY_HBL_DIRECTORY_TEMPLATE_MODELS) & "models.lst"), - theData (doFileOpen (modelListFile.get_str(),"rb")); + theData (doFileOpen (modelListFile.get_str(),kFileReadBinary)); if (theData.nonempty()) { _ElementaryCommand::ExtractConditions(theData,0,templateModelList); diff --git a/src/core/batchlanruntime.cpp b/src/core/batchlanruntime.cpp index 9078a9ca5..e02ef5f57 100644 --- a/src/core/batchlanruntime.cpp +++ b/src/core/batchlanruntime.cpp @@ -2728,7 +2728,7 @@ bool _ElementaryCommand::HandleFprintf (_ExecutionList& current_program) { success = true; - FILE* destination_file = nil; + hyFile * destination_file = nil; try { @@ -2775,10 +2775,17 @@ bool _ElementaryCommand::HandleFprintf (_ExecutionList& current_program) { do_close = open_handle < 0; if (!do_close) { - destination_file = (FILE*)open_file_handles.GetXtra (open_handle); + destination_file = (hyFile*)open_file_handles.GetXtra (open_handle); } else { - if ((destination_file = doFileOpen (destination.get_str(), "a")) == nil) - throw (_String ("Could not create/open output 
file at path ") & destination.Enquote() & "."); + #ifdef __ZLIB__ + if ((destination_file = doFileOpen (destination.get_str(), kFileAppend, false, hy_env::EnvVariableGetNumber(hy_env::gzip_compression_level) > 0)) == nil) { + throw (_String ("Could not create/open output file at path ") & destination.Enquote() & "."); + } + #else + if ((destination_file = doFileOpen (destination.get_str(), kFileAppend)) == nil) { + throw (_String ("Could not create/open output file at path ") & destination.Enquote() & "."); + } + #endif } } } @@ -2791,8 +2798,13 @@ bool _ElementaryCommand::HandleFprintf (_ExecutionList& current_program) { // handle special cases first if (*current_argument == kFprintfClearFile) { if (!print_to_stdout && destination_file) { - fclose (destination_file); - destination_file = doFileOpen (destination.get_str(), "w"); + destination_file->close(); delete destination_file; +#ifdef __ZLIB__ + destination_file = doFileOpen (destination.get_str(), kFileWrite,false, hy_env::EnvVariableGetNumber(hy_env::gzip_compression_level) > 0); +#else + destination_file = doFileOpen (destination.get_str(), kFileWrite); +#endif + if (!do_close) { _String* destination_copy = new _String (destination); @@ -2860,7 +2872,8 @@ bool _ElementaryCommand::HandleFprintf (_ExecutionList& current_program) { } if (destination_file && destination_file != hy_message_log_file && do_close) { - fclose (destination_file); + destination_file->close(); + delete destination_file; } return success; @@ -2901,7 +2914,7 @@ bool _ElementaryCommand::HandleExecuteCommandsCases(_ExecutionList& current } _String original_path (file_path); - FILE * source_file = nil; + hyFile * source_file = nil; bool reload = hy_env::EnvVariableTrue(hy_env::always_reload_libraries); @@ -2920,7 +2933,7 @@ bool _ElementaryCommand::HandleExecuteCommandsCases(_ExecutionList& current ReportWarning (_String("Already loaded ") & original_path.Enquote() & " from " & try_path); return true; } - if ((source_file = doFileOpen 
(try_path.get_str (), "rb"))) { + if ((source_file = doFileOpen (try_path.get_str (), kFileReadBinary))) { file_path = try_path; break; } @@ -2939,7 +2952,7 @@ bool _ElementaryCommand::HandleExecuteCommandsCases(_ExecutionList& current return true; } - if ((source_file = doFileOpen (file_path.get_str (), "rb")) == nil) { + if ((source_file = doFileOpen (file_path.get_str (), kFileReadBinary)) == nil) { throw (_String("Could not read command file from '") & original_path & "' (expanded to '" & file_path & "')"); } @@ -2952,10 +2965,13 @@ bool _ElementaryCommand::HandleExecuteCommandsCases(_ExecutionList& current source_code = new _StringBuffer (source_file); - if (fclose (source_file) ) { // failed to fclose + if (source_file->close() ) { // failed to fclose DeleteObject (source_code); throw (_String("Internal error: failed in a call to fclose ") & file_path.Enquote()); } + + delete (source_file); + pop_path = true; PushFilePath (file_path); } else { // commands are not loaded from a file @@ -3324,7 +3340,8 @@ bool _ElementaryCommand::HandleGetString (_ExecutionList& current_program) static const _String kVersionString ("HYPHY_VERSION"), kTimeStamp ("TIME_STAMP"), kListLoadedLibraries ("LIST_OF_LOADED_LIBRARIES"), - kGetNumberOfMatrixExp ("MATRIX_EXPONENTIALS_COMPUTED"); + kGetNumberOfMatrixExp ("MATRIX_EXPONENTIALS_COMPUTED"), + kCanCompressFiles ("ZLIB_ENABLED"); _Variable * receptacle = nil; @@ -3358,7 +3375,13 @@ bool _ElementaryCommand::HandleGetString (_ExecutionList& current_program) return_value = new _Matrix (loadedLibraryPaths.Keys()); } else if (*GetIthParameter(1UL) == kGetNumberOfMatrixExp) { return_value = new _Constant (matrix_exp_count); - } + } else if (*GetIthParameter(1UL) == kCanCompressFiles) { +#ifdef __ZLIB__ + return_value = new _Constant (1.); +#else + return_value = new _Constant (0.); +#endif + } if (!return_value) { @@ -3743,11 +3766,11 @@ bool _ElementaryCommand::HandleFscanf (_ExecutionList& current_program, boo return false; } - FILE * 
input_stream = doFileOpen (file_path.get_str(), "rb"); + hyFile * input_stream = doFileOpen (file_path.get_str(), kFileReadBinary); hy_env::EnvVariableSet(hy_env::file_created, new HY_CONSTANT_FALSE, false); if (!input_stream) { if (has_create) { - input_stream = doFileOpen (file_path.get_str(), "wb"); + input_stream = doFileOpen (file_path.get_str(), kFileWriteBinary); if (input_stream){ while (argument_index < upper_bound) { _Variable * store_here = _ValidateStorageVariable (current_program, argument_index + 1UL); @@ -3772,20 +3795,22 @@ bool _ElementaryCommand::HandleFscanf (_ExecutionList& current_program, boo last_call_stream_position = 0L; } - fseek (input_stream,0,SEEK_END); - current_stream_position = ftell (input_stream); + input_stream->seek (0,SEEK_END); + current_stream_position = input_stream->tell(); current_stream_position -= last_call_stream_position; if (current_stream_position<=0) { hy_env::EnvVariableSet(hy_env::end_of_file, new HY_CONSTANT_TRUE, false); - fclose(input_stream); + input_stream->close(); + delete (input_stream); return true; } - rewind (input_stream); - fseek (input_stream, last_call_stream_position, SEEK_SET); + input_stream->rewind(); + input_stream->seek (last_call_stream_position, SEEK_SET); _String * file_data = new _String (input_stream, current_stream_position); - fclose (input_stream); + input_stream->close(); + delete (input_stream); dynamic_reference_manager < file_data; input_data = file_data; current_stream_position = 0L; diff --git a/src/core/dataset.cpp b/src/core/dataset.cpp index a0755eace..cdab5f53b 100644 --- a/src/core/dataset.cpp +++ b/src/core/dataset.cpp @@ -82,7 +82,7 @@ _DataSet::_DataSet(long l) //_______________________________________________________________________ -_DataSet::_DataSet(FILE *f) { +_DataSet::_DataSet(hyFile *f) { dsh = nil; useHorizontalRep = false; theTT = &hy_default_translation_table; @@ -196,16 +196,17 @@ void _DataSet::AddSite(char c) { if (theMap.list_data[0] == 0) { if 
(theMap.list_data[1] == 0) { if (theNames.lLength) { - fprintf(streamThrough, ">%s\n", ((_String *)theNames(0))->get_str()); + streamThrough->puts (">"); streamThrough->puts (((_String *)theNames(0))->get_str()); + streamThrough->putc ('\n'); } else { - fprintf(streamThrough, ">Sequence 1\n"); + streamThrough->puts (">Sequence 1\n"); } AppendNewInstance(new _String(kEmptyString)); } theMap.list_data[1]++; theMap.list_data[2]++; - fputc(c, streamThrough); + streamThrough->putc(c); } else { HandleApplicationError("Can't add more sites to a file based data set, " "when more that one sequence has been written!", @@ -235,10 +236,16 @@ void _DataSet::Write2Site(long index, char c, char skip_char) { theMap.list_data[0]++; if (theNames.lLength > theMap.list_data[0]) { - fprintf(streamThrough, "\n>%s\n", - ((_String *)theNames(theMap.list_data[0]))->get_str()); + streamThrough->puts ("\n>"); + streamThrough->puts (((_String *)theNames(theMap.list_data[0]))->get_str()); + streamThrough->putc ('\n'); } else { - fprintf(streamThrough, "\n>Sequence %ld\n", theMap.list_data[0] + 1); + streamThrough->puts ("\n>Sequence "); + char buffer[64]; + snprintf (buffer, 64, "%ld", theMap.list_data[0] + 1); + streamThrough->puts (buffer); + streamThrough->putc ('\n'); + //fprintf(streamThrough, "\n>Sequence %ld\n", theMap.list_data[0] + 1); } theMap.list_data[1] = 0; @@ -254,7 +261,7 @@ void _DataSet::Write2Site(long index, char c, char skip_char) { } theMap.list_data[1]++; - fputc(c, streamThrough); + streamThrough->putc(c); } else { if (useHorizontalRep) { long currentWritten = ((_String *)list_data[0])->length(); @@ -386,8 +393,8 @@ void _DataSet::SetTranslationTable(_TranslationTable *newTT) { //_______________________________________________________________________ void _DataSet::Finalize(void) { if (streamThrough) { - fclose(streamThrough); - streamThrough = nil; + streamThrough->close(); + delete (streamThrough); streamThrough = nil; theMap.Clear(); } else { if (useHorizontalRep) { @@ -671,12 +678,17 @@ BaseRef _DataSet::toStr(unsigned
long) { //___________________________________________________ -void _DataSet::toFileStr(FILE *dest, unsigned long padding) { - fprintf(dest, "%ld species: ", NoOfSpecies()); +void _DataSet::toFileStr(hyFile *dest, unsigned long padding) { + char buffer[512]; + snprintf (buffer, 512, "%ld species: ", NoOfSpecies()); + dest->puts (buffer); + theNames.toFileStr(dest, padding); - - fprintf(dest, ";\nTotal Sites: %ld", GetNoTypes()); - fprintf(dest, ";\nDistinct Sites: %ld", theFrequencies.lLength); + snprintf (buffer, 512, ";\nTotal Sites: %ld", GetNoTypes()); + dest->puts (buffer); + + snprintf (buffer, 512, ";\nDistinct Sites: %ld", theFrequencies.lLength); + dest->puts (buffer); /* fprintf (dest,"\n"); for (long j=0; jclose (); + delete (test); } } } diff --git a/src/core/global_things.cpp b/src/core/global_things.cpp index 785daa11a..9c758f7b5 100644 --- a/src/core/global_things.cpp +++ b/src/core/global_things.cpp @@ -95,7 +95,7 @@ namespace hy_global { ignore_kw_defaults = false, force_verbosity_from_cli = false; - FILE *hy_error_log_file = NULL, + hyFile *hy_error_log_file = NULL, *hy_message_log_file = NULL; hyTreeDefinitionPhase @@ -122,7 +122,7 @@ namespace hy_global { kErrorStringMatrixExportError ("Export matrix called with a non-polynomial matrix argument"), kErrorStringNullOperand ("Attempting to operate on an undefined value; this is probably the result of an earlier 'soft' error condition"), kErrorNumerical ("To treat numerical errors as warnings, please specify \"ENV=TOLERATE_NUMERICAL_ERRORS=1;\" as the command line argument. 
This often resolves the issue, which is indicative of numerical instability."), - kHyPhyVersion = _String ("2.5.56"), + kHyPhyVersion = _String ("2.5.57"), kNoneToken = "None", kNullToken = "null", @@ -222,14 +222,11 @@ namespace hy_global { //____________________________________________________________________________________ - FILE * doFileOpen (const char * fileName, const char * mode, bool error) { - FILE *daFile = nil; + hyFile * doFileOpen (const char * fileName, hyFileOpenMode mode, bool error, bool compress) { + hyFile *daFile = nil; if (fileName) { - daFile = fopen (fileName,mode); - if (!daFile && error) { - HandleApplicationError (_String("Could not open file '") & *fileName & "' with mode '" & *mode & "'."); - } + daFile = hyFile::openFile (fileName,mode, error, compress); } return daFile; } @@ -404,7 +401,7 @@ namespace hy_global { #ifndef __HEADLESS__ // do not create log files for _HEADLESS_ _String * prefix [2] = {&hy_error_log_name, &hy_messages_log_name}; - FILE ** handle [2] = {&hy_error_log_file, &hy_message_log_file}; + hyFile ** handle [2] = {&hy_error_log_file, &hy_message_log_file}; for (long file_index = 0; file_index < 2; file_index++) { long p = 1L; @@ -419,7 +416,7 @@ namespace hy_global { _String file_name = *prefix[file_index] & ".mpinode" & (long)hy_mpi_node_rank; #endif - *handle[file_index] = doFileOpen (file_name.get_str(),"w+"); + *handle[file_index] = doFileOpen (file_name.get_str(),kFileWrite); while (*handle[file_index] == nil && p<10) { #ifndef __HYPHYMPI__ file_name = *prefix[file_index] & '.' 
& p; @@ -509,21 +506,24 @@ namespace hy_global { _String * prefix [2] = {&hy_error_log_name, &hy_messages_log_name}; char const * messages [] = {"\nCheck %s for execution error details.\n", "\nCheck %s for diagnostic messages.\n"}; - FILE * handle [2] = {hy_error_log_file, hy_message_log_file}; + hyFile * handle [2] = {hy_error_log_file, hy_message_log_file}; for (long file_index = 0; file_index < 2; file_index++) { if (handle[file_index]) { - fflush (handle[file_index]); - fseek(handle[file_index],0,SEEK_END); - unsigned long fileSize = ftell(handle[file_index]); + //fflush (handle[file_index]); + handle[file_index]->flush (); + handle[file_index]->seek(0, SEEK_END); + unsigned long fileSize = handle[file_index]->tell(); if (fileSize) { fprintf (stderr, messages[file_index], prefix[file_index]->get_str()); if (file_index == 0) { no_errors = false; } - fclose (handle[file_index]); + handle[file_index]->close(); + delete handle[file_index]; } else { - fclose (handle[file_index]); + handle[file_index]->close(); + delete handle[file_index]; remove (prefix[file_index]->get_str()); } } @@ -689,9 +689,9 @@ namespace hy_global { } char str[] = "\n"; - fwrite (str, 1, 1, hy_message_log_file); - fwrite (message.get_str(), 1, message.length(), hy_message_log_file); - fflush (hy_message_log_file); + hy_message_log_file->fwrite (str, 1, 1); + hy_message_log_file->fwrite (message.get_str(), 1, message.length()); + hy_message_log_file->flush(); #endif } @@ -756,13 +756,14 @@ namespace hy_global { char str[] = "\nError:"; if (hy_error_log_file) { - fwrite (str, 1, 7, hy_error_log_file); - fwrite (message.get_str(), 1, message.length(), hy_error_log_file); - fflush (hy_error_log_file); + hy_error_log_file->fwrite (str, 1, 7); + hy_error_log_file->fwrite (message.get_str(), 1, message.length()); + hy_error_log_file->flush (); } if (hy_error_log_file) { - fprintf (hy_error_log_file, "\n%s", (const char*)message); + hy_error_log_file->putc ('\n'); + hy_error_log_file->puts ((const 
char*)message); } _String errMsg; @@ -1007,128 +1008,5 @@ namespace hy_global { return true; } - //____________________________________________________________________________________ - hyFile* hyFile::openFile (const char * file_path, const char * mode , bool error, long buffer) { - hyFile* f = new hyFile; -#ifdef __ZLIB__ - if (file_path) { - f->_fileReference = gzopen (file_path, mode); - if (!f->_fileReference && error) { - HandleApplicationError (_String("Could not open file '") & *file_path & "' with mode '" & *mode & "'."); - } - } -#else - f->_fileReference = doFileOpen(file_path, mode, error); -#endif - if (!f->_fileReference ) { - delete f; - f = nil; - } - return f; - - } - - //____________________________________________________________________________________ - void hyFile::close (void) { - if (valid()) { - #ifdef __ZLIB__ - gzclose (_fileReference); - #else - fclose (_fileReference); - #endif - _fileReference = NULL; - } - } - - //____________________________________________________________________________________ - void hyFile::lock (void) { - if (valid()) { - #ifdef __ZLIB__ - //gzclose (_fileReference); - #else - flockfile (_fileReference); - #endif - } - } - - //____________________________________________________________________________________ - void hyFile::unlock (void) { - if (valid()) { - #ifdef __ZLIB__ - //gzclose (_fileReference); - #else - funlockfile (_fileReference); - #endif - } - } - - //____________________________________________________________________________________ - void hyFile::rewind (void) { - if (valid()) { - #ifdef __ZLIB__ - gzrewind (_fileReference); - #else - ::rewind (_fileReference); - #endif - } - } - - //____________________________________________________________________________________ - void hyFile::seek (long pos, int whence) { - if (valid()) { - #ifdef __ZLIB__ - gzseek (_fileReference, pos, whence); - #else - fseek (_fileReference, pos, whence); - #endif - } - } - 
//____________________________________________________________________________________ - - size_t hyFile::tell (void) { - if (valid()) { - #ifdef __ZLIB__ - return gztell (_fileReference); - #else - return ftell (_fileReference); - #endif - } - return 0; - } - //____________________________________________________________________________________ - bool hyFile::feof (void) { - if (valid()) { - #ifdef __ZLIB__ - return gzeof (_fileReference); - #else - return feof_unlocked (_fileReference); - #endif - } - return true; - } - //____________________________________________________________________________________ - int hyFile::getc (void) { - if (valid()) { - #ifdef __ZLIB__ - return gzgetc (_fileReference); - #else - return getc_unlocked (_fileReference); - #endif - } - return 0; - } - - //____________________________________________________________________________________ - unsigned long hyFile::read (void* buffer, unsigned long size, unsigned long items) { - if (valid()) { - #ifdef __ZLIB__ - return gzfread (buffer, size, items, _fileReference); - #else - return ::fread (buffer, size, items, _fileReference); - #endif - } - return 0; - } - } // namespace close diff --git a/src/core/hbl_env.cpp b/src/core/hbl_env.cpp index 1db5d11b1..49d05541b 100644 --- a/src/core/hbl_env.cpp +++ b/src/core/hbl_env.cpp @@ -134,6 +134,7 @@ namespace hy_env { .PushPairCopyKey (data_file_gap_width, new _Constant (10.0)) .PushPairCopyKey (accept_branch_lengths, new _Constant (HY_CONSTANT_TRUE)) .PushPairCopyKey (number_threads, new _Constant (0.)) + .PushPairCopyKey (gzip_compression_level, new _Constant (0.)) ; ; } @@ -221,7 +222,11 @@ _String const if set to `harvest_frequencies_gap_options` to TRUE, then N-fold ambigs will add 1/N to each character count in HarvestFrequencies, otherwise N-folds are ignored in counting */ - + gzip_compression_level ("GZIP_COMPRESSION_LEVEL"), + /* + if __ZLIB__ is present, controls whether or not output files created by fprintf will be compressed by 
ZLIB; + value of 0 means no compresson + */ include_model_spec ("INCLUDE_MODEL_SPECS"), /* controls whether or not export / serialization operations (like toStr) diff --git a/src/core/hyFile.cpp b/src/core/hyFile.cpp new file mode 100644 index 000000000..889f41293 --- /dev/null +++ b/src/core/hyFile.cpp @@ -0,0 +1,336 @@ +/* + HyPhy - Hypothesis Testing Using Phylogenies. + + Copyright (C) 1997-now + Core Developers: + Sergei L Kosakovsky Pond (sergeilkp@icloud.com) + Art FY Poon (apoon42@uwo.ca) + Steven Weaver (sweaver@temple.edu) + + Module Developers: + Lance Hepler (nlhepler@gmail.com) + Martin Smith (martin.audacis@gmail.com) + + Significant contributions from: + Spencer V Muse (muse@stat.ncsu.edu) + Simon DW Frost (sdf22@cam.ac.uk) + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "hy_types.h" +#include "hy_strings.h" +#include "global_things.h" + +//____________________________________________________________________________________ + hyFile* hyFile::openFile (const char * file_path, hyFileOpenMode mode , bool error, bool compress, long buffer) { + + _String str_mode; + auto handle_error = [&](void * ptr)->void { + if (!ptr && error) { + hy_global::HandleApplicationError (_String("Could not open file '") & *file_path & "' with mode '" & str_mode & "'."); + } + }; + + hyFile* f = new hyFile; +#ifdef __ZLIB__ + if (file_path) { + + _String str_mode; + switch (mode) { + case kFileRead: + str_mode = "r"; + break; + case kFileReadBinary: + str_mode = "rb"; + break; + case kFileWrite: + str_mode = compress ? "w" : "wT"; + break; + case kFileWriteBinary: + str_mode = compress ? "wb" : "wbT"; + break; + case kFileAppend: + str_mode = compress ? "a" : "aT"; + break; + case kFileReadWrite: + str_mode = "w+"; + break; + } + + if (mode != kFileReadWrite) { + f->_fileReference = gzopen (file_path, (char const*)str_mode); + if (f->_fileReference) { + if (gzdirect(f->_fileReference)) { + gzclose (f->_fileReference); + f->_fileReference = NULL; + f->_fileReferenceDirect = ::fopen (file_path, (const char*)str_mode); + handle_error (f->_fileReferenceDirect); + } + } else { + handle_error (f->_fileReference); + } + } else { + f->_fileReferenceDirect = ::fopen (file_path, (const char*)str_mode); + handle_error (f->_fileReferenceDirect); + } + if (!f->valid()) { + delete f; + f = nil; + } +} +#else + if (file_path) { + _String str_mode; + switch (mode) { + case kFileRead: + str_mode = "r"; + break; + case kFileReadBinary: + str_mode = "rb"; + break; + case kFileWrite: + str_mode = "w"; + break; + case kFileWriteBinary: + str_mode = "wb"; + break; + case kFileAppend: + str_mode = "a"; + break; + case kFileReadWrite: + str_mode = "w+"; + break; + } + + f->_fileReference = ::fopen (file_path, (const char*)str_mode); + handle_error (f->_fileReference); + 
} + if (!f->_fileReference ) { + delete f; + f = nil; + } +#endif + + return f; + + } + + //____________________________________________________________________________________ + int hyFile::close (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return fclose (_fileReferenceDirect); + } else { + return gzclose (_fileReference); + } + #else + return fclose (_fileReference); + #endif + _fileReference = NULL; + } + return 0; + } + + //____________________________________________________________________________________ + void hyFile::lock (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + flockfile (_fileReferenceDirect); + } + //gzclose (_fileReference); + #else + flockfile (_fileReference); + #endif + } + } + + //____________________________________________________________________________________ + void hyFile::flush (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + fflush (_fileReferenceDirect); + } else { + gzflush (_fileReference, Z_SYNC_FLUSH); + } + #else + fflush(_fileReference); + #endif + } + } + + //____________________________________________________________________________________ + void hyFile::unlock (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + funlockfile (_fileReferenceDirect); + } + #else + funlockfile (_fileReference); + #endif + } + } + + //____________________________________________________________________________________ + void hyFile::rewind (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + ::rewind (_fileReferenceDirect); + } else { + gzrewind (_fileReference); + } + #else + ::rewind (_fileReference); + #endif + } + } + + //____________________________________________________________________________________ + void hyFile::seek (long pos, int whence) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + fseek (_fileReferenceDirect, pos, whence); + } else { + gzseek (_fileReference, pos, whence); + 
} + #else + fseek (_fileReference, pos, whence); + #endif + } + } + + //____________________________________________________________________________________ + + size_t hyFile::tell (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return ftell (_fileReferenceDirect); + } else { + return gztell (_fileReference); + } + #else + return ftell (_fileReference); + #endif + } + return 0; + } + //____________________________________________________________________________________ + bool hyFile::feof (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return feof_unlocked (_fileReferenceDirect); + } else { + return gzeof (_fileReference); + } + #else + return feof_unlocked (_fileReference); + #endif + } + return true; + } + //____________________________________________________________________________________ + int hyFile::puts( const char* buffer) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return ::fputs (buffer, _fileReferenceDirect); + } else { + return gzputs (_fileReference, buffer); + } + + #else + return ::fputs (buffer, _fileReference); + #endif + } + return -1; + } + + //____________________________________________________________________________________ + int hyFile::putc( int chr) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return ::fputc (chr, _fileReferenceDirect); + } else { + return gzputc (_fileReference, chr); + } + #else + return ::fputc (chr, _fileReference); + #endif + } + return -1; + } + + //____________________________________________________________________________________ + size_t hyFile::fwrite( const void* buffer, size_t size, size_t count) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return ::fwrite (buffer, size, count, _fileReferenceDirect); + } else { + return gzfwrite (buffer, size, count, _fileReference); + } + #else + return ::fwrite (buffer, size, count, _fileReference); + #endif + } + return -1; + } + 
//____________________________________________________________________________________ + int hyFile::getc (void) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return getc_unlocked (_fileReferenceDirect); + } else { + return gzgetc (_fileReference); + } + + #else + return getc_unlocked (_fileReference); + #endif + } + return 0; + } + + //____________________________________________________________________________________ + unsigned long hyFile::read (void* buffer, unsigned long size, unsigned long items) { + if (valid()) { + #ifdef __ZLIB__ + if (_fileReferenceDirect) { + return ::fread (buffer, size, items, _fileReferenceDirect); + } else { + return gzfread (buffer, size, items, _fileReference); + } + + #else + return ::fread (buffer, size, items, _fileReference); + #endif + } + return 0; + } diff --git a/src/core/include/baseobj.h b/src/core/include/baseobj.h index ee99f26a9..343330bf5 100644 --- a/src/core/include/baseobj.h +++ b/src/core/include/baseobj.h @@ -85,8 +85,8 @@ class BaseObj { (default = same as toStr) */ - virtual void toFileStr (FILE *, unsigned long padding = 0UL) ; - /** file representation for the object + virtual void toFileStr (hyFile *, unsigned long padding = 0UL) ; + /** file representation for the object @param padding is used to allow 'pretty' rendering of nested objects, like dictss */ diff --git a/src/core/include/dataset.h b/src/core/include/dataset.h index 705ceee6b..94be6cd3c 100644 --- a/src/core/include/dataset.h +++ b/src/core/include/dataset.h @@ -72,7 +72,7 @@ class _DataSet : public _List // a complete data set public: _DataSet(void); _DataSet(long); - _DataSet(FILE *); + _DataSet(hyFile *); // with estimated number of sites per file virtual ~_DataSet(void); @@ -109,7 +109,7 @@ class _DataSet : public _List // a complete data set virtual BaseRef toStr(unsigned long = 0UL); // convert to string - virtual void toFileStr(FILE *dest, unsigned long = 0UL); + virtual void toFileStr(hyFile *dest, unsigned long = 
0UL); void Compact(long); // release string overhead @@ -194,7 +194,7 @@ class _DataSet : public _List // a complete data set _TranslationTable *theTT; // translation Table, if any _List theNames; // Names of species - FILE *streamThrough; + hyFile *streamThrough; _DSHelper *dsh; bool useHorizontalRep; diff --git a/src/core/include/dataset_filter.h b/src/core/include/dataset_filter.h index bc7c8ae3e..1357899e4 100644 --- a/src/core/include/dataset_filter.h +++ b/src/core/include/dataset_filter.h @@ -68,7 +68,7 @@ class _DataSetFilter : public BaseObj { virtual ~_DataSetFilter(void); virtual BaseRef toStr(unsigned long = 0UL); // convert to string - virtual void toFileStr(FILE *, unsigned long = 0UL); // convert to string + virtual void toFileStr(hyFile *, unsigned long = 0UL); // convert to string virtual BaseRef makeDynamic(void) const; virtual void Duplicate(BaseRefConst); @@ -397,7 +397,7 @@ class _DataSetFilter : public BaseObj { long dimension; private: - void internalToStr(FILE *, _StringBuffer *); + void internalToStr(hyFile *, _StringBuffer *); inline void retrieve_individual_site_from_raw_coordinates (_String & store, unsigned long site, unsigned long sequence) const { diff --git a/src/core/include/global_things.h b/src/core/include/global_things.h index 0366697b2..2792b7bb3 100644 --- a/src/core/include/global_things.h +++ b/src/core/include/global_things.h @@ -57,9 +57,7 @@ #include #endif -#ifdef __ZLIB__ - #include -#endif + class _Variable; // forward decl class _ExecutionList; // forward decl @@ -129,26 +127,7 @@ namespace hy_global { /* pass-through structure for reading / writing from a file that may or may not be compressed */ - class hyFile { - public: - hyFile (void) {_fileReference = NULL;} - static hyFile* openFile (const char * file_path, const char * mode , bool error = false, long buffer = 1024*128); - inline bool valid (void) const {return _fileReference != NULL;} - void lock (void); - void unlock (void); - void rewind (void); - void seek 
(long, int); - void close (); - bool feof (void); - unsigned long read (void* buffer, unsigned long size, unsigned long items); - size_t tell (); - int getc (); -#ifdef __ZLIB__ - gzFile _fileReference; -#else - FILE* _fileReference; -#endif - }; + /** Open the file located at file_path using mode 'mode' @@ -159,7 +138,7 @@ namespace hy_global { @return the FILE handle or nil (if file does not exist or could not be open with the requested mode) */ - FILE* doFileOpen (const char * file_path, const char * mode , bool error = false); + hyFile* doFileOpen (const char * file_path, hyFileOpenMode mode , bool error = false, bool compress = false); /** The omnibus clean-up function that attempts to deallocate all application memory @@ -300,7 +279,7 @@ namespace hy_global { extern _List _hy_standard_library_extensions, _hy_standard_library_paths; - extern FILE* hy_error_log_file, + extern hyFile* hy_error_log_file, * hy_message_log_file; diff --git a/src/core/include/hbl_env.h b/src/core/include/hbl_env.h index ab82c14b8..7e5965237 100644 --- a/src/core/include/hbl_env.h +++ b/src/core/include/hbl_env.h @@ -206,6 +206,7 @@ namespace hy_env { tolerate_numerical_errors, tolerate_constraint_violation, number_threads, + gzip_compression_level, tree_parser_namespace; ; diff --git a/src/core/include/hy_strings.h b/src/core/include/hy_strings.h index 853ae8f99..d43e9a042 100644 --- a/src/core/include/hy_strings.h +++ b/src/core/include/hy_strings.h @@ -268,6 +268,7 @@ class _String : public BaseObj { specifically); also added a check that the # of chars read was the same as the one requested. */ + _String(hyFile *file, long read_this_many = -1L); _String(FILE *file, long read_this_many = -1L); /** diff --git a/src/core/include/hy_types.h b/src/core/include/hy_types.h index b62a2238d..21bf65bf8 100644 --- a/src/core/include/hy_types.h +++ b/src/core/include/hy_types.h @@ -40,6 +40,12 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#ifndef __HY_TYPES__ #define __HY_TYPES__ +#ifdef __ZLIB__ + #include +#endif + +#include + /** Put application-wide typedefs and enums here */ @@ -75,6 +81,43 @@ enum hyComparisonType { kCompareUndefined = 0xff }; +enum hyFileOpenMode { + kFileRead, + kFileReadBinary, + kFileReadWrite, + kFileWrite, + kFileWriteBinary, + kFileAppend +}; +class hyFile { + public: + hyFile (void) {_fileReference = NULL; + #ifdef __ZLIB__ + _fileReferenceDirect = NULL; + #endif + } + static hyFile* openFile (const char * file_path, hyFileOpenMode mode , bool error = false, bool compress = false, long buffer = 1024*128); + inline bool valid (void) const {return _fileReference != NULL || _fileReferenceDirect != NULL;} + void lock (void); + void unlock (void); + void rewind (void); + void seek (long, int); + void flush (void); + int close (); + bool feof (void); + unsigned long read (void* buffer, unsigned long size, unsigned long items); + size_t fwrite( const void* buffer, size_t size, size_t count); + int puts(const char *str); + int putc(int chr); + size_t tell (); + int getc (); + #ifdef __ZLIB__ + gzFile _fileReference; + FILE * _fileReferenceDirect; + #else + FILE* _fileReference; + #endif +}; #endif diff --git a/src/core/include/list.h b/src/core/include/list.h index 9ce7ffb9e..26b3869d0 100644 --- a/src/core/include/list.h +++ b/src/core/include/list.h @@ -402,7 +402,7 @@ class _List:public _SimpleList { /** */ - virtual void toFileStr(FILE*, unsigned long = 0UL); + virtual void toFileStr(hyFile*, unsigned long = 0UL); /** Generate a string that is not present in the list. 
diff --git a/src/core/include/matrix.h b/src/core/include/matrix.h index 4a59d2b15..c4a6e9b4e 100644 --- a/src/core/include/matrix.h +++ b/src/core/include/matrix.h @@ -407,7 +407,7 @@ class _Matrix: public _MathObject { virtual BaseRef toStr (unsigned long = 0UL); // convert this matrix to a string - virtual void toFileStr (FILE*dest, unsigned long = 0UL); + virtual void toFileStr (hyFile *dest, unsigned long = 0UL); bool AmISparse (void); @@ -428,8 +428,8 @@ class _Matrix: public _MathObject { bool CompareMatrices (_Matrix const* rhs, hyFloat tolerance = kMachineEpsilon) const; - void ExportMatrixExp (_Matrix*, FILE*); - bool ImportMatrixExp (FILE*); + void ExportMatrixExp (_Matrix*, hyFile*); + bool ImportMatrixExp (hyFile*); hyFloat FisherExact (hyFloat, hyFloat, hyFloat); @@ -627,7 +627,7 @@ class _Matrix: public _MathObject { private: - void internal_to_str (_StringBuffer*, FILE*, unsigned long padding); + void internal_to_str (_StringBuffer*, hyFile*, unsigned long padding); void SetupSparseMatrixAllocations (void); bool is_square_numeric (bool dense = true) const; diff --git a/src/core/include/polynoml.h b/src/core/include/polynoml.h index 415245d84..8341bbccf 100644 --- a/src/core/include/polynoml.h +++ b/src/core/include/polynoml.h @@ -158,7 +158,7 @@ class _Polynomial : public _MathObject { virtual BaseObj* toStr (unsigned long = 0UL); void CheckTerm(void); - virtual void toFileStr (FILE*, unsigned long = 0UL); + virtual void toFileStr (hyFile*, unsigned long = 0UL); long GetNoVariables(void) const { return variableIndex.countitems(); diff --git a/src/core/include/string_file_wrapper.h b/src/core/include/string_file_wrapper.h index 172616f5f..e714ea1f6 100644 --- a/src/core/include/string_file_wrapper.h +++ b/src/core/include/string_file_wrapper.h @@ -57,7 +57,7 @@ class StringFileWrapper { */ public: - StringFileWrapper (_StringBuffer * string, FILE * file); + StringFileWrapper (_StringBuffer * string, hyFile * file); /** Create a wrapper around around 
a string / file pair If both arguments are null, the wrapper will simply "eat" the bufferring operations (/dev/null equivalent). If both arguments @@ -108,7 +108,7 @@ class StringFileWrapper { private: _StringBuffer * string_buffer; - FILE* file_buffer; + hyFile* file_buffer; }; diff --git a/src/core/include/topology.h b/src/core/include/topology.h index 95ebd53b7..27f367dc2 100644 --- a/src/core/include/topology.h +++ b/src/core/include/topology.h @@ -185,7 +185,7 @@ class _TreeTopology: public _CalcNode { char rooted; - virtual void toFileStr (FILE*, unsigned long); + virtual void toFileStr (hyFile*, unsigned long); virtual BaseRef toStr (unsigned long = 0UL); void RerootTreeInternalTraverser (node* iterator, long, bool,_StringBuffer&, _TreeTopologyParseSettings const& settings, hyTopologyBranchLengthMode branch_length_mode, long variable_ref = -1L, bool = false) const; diff --git a/src/core/include/variable.h b/src/core/include/variable.h index c6349247c..2ca5f24f4 100644 --- a/src/core/include/variable.h +++ b/src/core/include/variable.h @@ -69,7 +69,7 @@ class _Variable : public _Constant { virtual void Duplicate (BaseRefConst); virtual BaseRef makeDynamic(void) const; virtual BaseRef toStr (unsigned long = 0UL); - virtual void toFileStr (FILE*, unsigned long = 0UL); + virtual void toFileStr (hyFile*, unsigned long = 0UL); virtual void MarkDone (void); diff --git a/src/core/likefunc.cpp b/src/core/likefunc.cpp index 951332680..a53cc7d83 100644 --- a/src/core/likefunc.cpp +++ b/src/core/likefunc.cpp @@ -455,7 +455,7 @@ void UpdateOptimizationStatus (hyFloat max, long pdone, char init, bool o static _String userStatusString; static clock_t userTimeStart; - FILE *outFile = fileName?doFileOpen (fileName->get_str(),"w"):nil; + hyFile *outFile = fileName?doFileOpen (fileName->get_str(),kFileWrite):nil; _FString* t; static TimeDifference timer; @@ -463,9 +463,6 @@ void UpdateOptimizationStatus (hyFloat max, long pdone, char init, bool o if (init==0) { lCount = 
likeFuncEvalCallCount; timer.Start(); -#ifndef _MINGW32_MEGA_ - setvbuf (stdout,nil, _IONBF,1); -#endif lastDone = 0; userTimeStart = clock(); checkParameter (optimizationStringQuantum, update_quantum, 0.0); @@ -516,50 +513,41 @@ void UpdateOptimizationStatus (hyFloat max, long pdone, char init, bool o } if (outFile) { - fprintf (outFile,"%s", reportString.get_str()); + outFile->puts (reportString.get_str()); } else -#ifndef _MINGW32_MEGA_ printf ("\015%s", reportString.get_str()); -#else - SetStatusLine (reportString); -#endif } else { char buffer [1024]; if (optimization) { - if (outFile) - fprintf (outFile,"Likelihood function optimization status\nCurrent Maximum: %-14.8g (%ld %% done)\nLikelihood Function evaluations/second: %-8.4g", (double)max, pdone, - (likeFuncEvalCallCount-lCount)/elapsed_time); - else { - long written = snprintf (buffer,1024,"Current Max: %-14.8g (%ld %% done) LF Evals/Sec: %-8.4g", (double)max, pdone, (likeFuncEvalCallCount-lCount)/elapsed_time); + + long written = snprintf (buffer,1024,"Current Max: %-14.8g (%ld %% done) LF Evals/Sec: %-8.4g", (double)max, pdone, (likeFuncEvalCallCount-lCount)/elapsed_time); - if (elapsed_time) { - snprintf (buffer+written,1024-written, "CPU Load: %-8.4g", (clock()-userTimeStart)/((hyFloat)CLOCKS_PER_SEC*elapsed_time)); - } + if (elapsed_time) { + snprintf (buffer+written,1024-written, "CPU Load: %-8.4g", (clock()-userTimeStart)/((hyFloat)CLOCKS_PER_SEC*elapsed_time)); } } else { snprintf (buffer, 1024, "Sites done: %g (%ld %% done)", (double)max, pdone); } -#ifndef _MINGW32_MEGA_ - printf ("\015%s", buffer); -#else - SetStatusLine (_String(buffer)); -#endif + if (outFile) { + outFile->puts (buffer); + } else { + printf ("\015%s", buffer); + } } } else { if (outFile) { - fprintf (outFile,"DONE"); + outFile->puts ("DONE"); } else { -#ifndef _MINGW32_MEGA_ printf ("\033\015 "); setvbuf (stdout,nil,_IOLBF,1024); -#endif } } if (outFile) { - fclose (outFile); + outFile->close(); + delete (outFile); } } @@ 
-5168,7 +5156,7 @@ _Matrix* _LikelihoodFunction::Optimize (_AssociativeList const * options) void _LikelihoodFunction::_TerminateAndDump(const _String &error, bool sig_term) { - FILE * out = doFileOpen ("/tmp/hyphy.dump", "w"); + hyFile * out = doFileOpen ("/tmp/hyphy.dump", kFileWrite); _String err ("Internal error "); @@ -5178,8 +5166,9 @@ void _LikelihoodFunction::_TerminateAndDump(const _String &error, bool sig_te _StringBuffer sLF (8192L); SerializeLF (sLF,_hyphyLFSerializeModeVanilla); sLF.TrimSpace(); - fwrite ((void*)sLF.get_str(), 1, sLF.length(), out); - fclose (out); + out->fwrite ((void*)sLF.get_str(), 1, sLF.length()); + out->close(); + delete out; } HandleApplicationError (err & '\n' & error, true); @@ -10581,7 +10570,7 @@ void _LikelihoodFunction::StateCounter (long functionCallback) const { if (storeIntermediates && !this_tree->IsDegenerate()) { if (storeIntermediates->nonempty()) { - FILE * file_for_ancestral_sequences = doFileOpen (storeIntermediates->get_str(),"w"); + hyFile * file_for_ancestral_sequences = doFileOpen (storeIntermediates->get_str(),kFileWrite); if (!file_for_ancestral_sequences) { HandleApplicationError (_String ("Failed to open ") & storeIntermediates->Enquote() & " for writing."); target.Finalize(); diff --git a/src/core/list.cpp b/src/core/list.cpp index 35645d609..0dbe4cdb3 100644 --- a/src/core/list.cpp +++ b/src/core/list.cpp @@ -577,17 +577,17 @@ void _List::Replace (long index, BaseRef newObj, bool dup) { // Char* conversion //TODO: toFileStr should be ToFileStr to follow convention. 
-void _List::toFileStr(FILE* dest, unsigned long) { - fprintf (dest,"{"); +void _List::toFileStr(hyFile* dest, unsigned long) { + dest->puts ("{"); if (lLength) { GetItem(0)->toFileStr(dest); for (unsigned long i = 1UL; iputs (", "); GetItem(i)->toFileStr(dest); } } - fprintf (dest,"}"); + dest->puts ("}"); } // Char* conversion diff --git a/src/core/matrix.cpp b/src/core/matrix.cpp index 5cc12c623..3560b0837 100644 --- a/src/core/matrix.cpp +++ b/src/core/matrix.cpp @@ -8228,7 +8228,7 @@ _Matrix _Matrix::operator - (_Matrix& m) } //_________________________________________________________ -void _Matrix::internal_to_str (_StringBuffer* string, FILE * file, unsigned long padding) { +void _Matrix::internal_to_str (_StringBuffer* string, hyFile * file, unsigned long padding) { StringFileWrapper res (string, file); _String padder (" ", padding); @@ -8331,7 +8331,7 @@ void _Matrix::internal_to_str (_StringBuffer* string, FILE * file, unsigned l } } //_________________________________________________________ -void _Matrix::toFileStr (FILE*dest, unsigned long padding){ +void _Matrix::toFileStr (hyFile *dest, unsigned long padding){ internal_to_str(nil, dest, padding); } //_____________________________________________________________________________________________ @@ -8381,14 +8381,14 @@ void _Matrix::InitMxVar (_SimpleList& mxVariables, hyFloat glValue) { }); } //_____________________________________________________________________________________________ -bool _Matrix::ImportMatrixExp (FILE* theSource) { +bool _Matrix::ImportMatrixExp (hyFile* theSource) { // TODO: SLKP 20171027, need to review and possibly deprecate long mDim=0,i,k=0,j,m; char buffer[255],fc=0; buffer[0]=0; while(1) { - buffer[mDim]=fgetc(theSource); - if (feof(theSource)) { + buffer[mDim]=theSource->getc(); + if (theSource->feof()) { return false; } if (buffer[mDim]==',') { @@ -8404,11 +8404,10 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { i = 0; _SimpleList varList,c1,c2; while (fc!=';') { - 
fc = fgetc (theSource); - if ((fc==',')||(fc==';')) { + fc = theSource->getc(); + if ( fc==',' || fc==';') { buffer [i] = 0; _String varName (buffer); - _Variable * ppv = CheckReceptacle (&varName, kEmptyString, true); varList << ppv->get_index(); i = 0; @@ -8416,13 +8415,13 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { buffer[i]=fc; i++; } - if (feof(theSource)) { + if (theSource->feof()) { return false; } } do { - fc = fgetc (theSource); - if (feof(theSource)) { + fc = theSource->getc(); + if (theSource->feof()) { return false; } } while (fc!=';'); @@ -8432,10 +8431,10 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { while (kgetc(); buffer[i] = fc; i++; - if (feof(theSource)) { + if (theSource->feof()) { return false; } } @@ -8446,13 +8445,13 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { while (fc!='}') { i = 0; do { - buffer[i] = fc = fgetc (theSource); + buffer[i] = fc = theSource->getc(); i++; - if (feof(theSource)) { + if (theSource->feof()) { DeleteObject (thisCell); return false; } - } while ((fc!=',')&&(fc!='}')); + } while (fc!=',' && fc!='}'); buffer[i]=0; theCoeffs[j]=atof (buffer); j++; @@ -8461,7 +8460,7 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { return false; } } - fc = fgetc(theSource); + fc = theSource->getc(); if (fc != '{') { DeleteObject (thisCell); return false; @@ -8472,18 +8471,18 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { while (fc!='}') { i = 0; do { - buffer[i] = fc = fgetc (theSource); + buffer[i] = fc = theSource->getc(); i++; - if (feof(theSource)) { + if (theSource->feof()) { DeleteObject (thisCell); DeleteObject (pd); return false; } - } while ((fc!=',')&&(fc!='}')); + } while (fc!=',' && fc!='}'); buffer[i]=0; c1<getc(); if (fc != '{') { DeleteObject (thisCell); DeleteObject (pd); @@ -8493,14 +8492,14 @@ bool _Matrix::ImportMatrixExp (FILE* theSource) { while (fc!='}') { i = 0; do { - buffer[i] = fc = fgetc (theSource); + buffer[i] = fc = theSource->getc(); i++; - if (feof(theSource)) { + if 
(theSource->feof()) { DeleteObject (thisCell); DeleteObject (pd); return false; } - } while ((fc!=',')&&(fc!='}')); + } while (fc!=',' && fc!='}' ); buffer[i]=0; c2<puts (buffer); _SimpleList mxVariables; { _AVLList mxA (&mxVariables); @@ -8546,11 +8549,11 @@ void _Matrix::ExportMatrixExp (_Matrix* theBase, FILE* theDump) long k, i=0; hyFloat* varPool = (hyFloat*)MatrixMemAllocate (mxVariables.countitems()*sizeof(hyFloat)); for (k=0; kGetName()->get_str()); + theDump->puts (LocateVar(mxVariables(k))->GetName()->get_str()); if (kputc (','); } else { - fprintf (theDump,"%c",';'); + theDump->putc (';'); } varPool[k]=topPolyCap; } @@ -8565,7 +8568,8 @@ void _Matrix::ExportMatrixExp (_Matrix* theBase, FILE* theDump) DeleteObject(dummy); checkParameter (ANAL_MATRIX_TOLERANCE,analMatrixTolerance,1e-6); - fprintf (theDump,"%g,%g;",analMatrixTolerance,topPolyCap); + snprintf (buffer, 256, "%g,%g;",analMatrixTolerance,topPolyCap); + theDump->puts (buffer); // now loop thru the cells and check the precision term by term for (k=0; kputs (buffer); for (i=0; i<=tup; i++) { if (i) { - fprintf(theDump,",%18.16g",coeffHolder[i]); + snprintf (buffer,256, ",%18.16g",coeffHolder[i]); } else { - fprintf(theDump,"%18.16g",coeffHolder[i]); + snprintf (buffer, 256,"%18.16g",coeffHolder[i]); } + theDump->puts (buffer); } - fprintf(theDump,"}%ld",tup); + snprintf (buffer,256,"}%ld",tup); + theDump->puts (buffer); c1.toFileStr(theDump); c2.toFileStr(theDump); MatrixMemFree (coeffHolder); diff --git a/src/core/polynoml.cpp b/src/core/polynoml.cpp index f8f7641de..3e3954397 100644 --- a/src/core/polynoml.cpp +++ b/src/core/polynoml.cpp @@ -2730,36 +2730,37 @@ BaseObj* _Polynomial::toStr (unsigned long padding) { //__________________________________________________________________________________ -void _Polynomial::toFileStr (FILE*f, unsigned long padding) +void _Polynomial::toFileStr (hyFile *f, unsigned long padding) { if (theTerms->NumberOfTerms()&&theTerms->thePowers) { - fprintf(f,"p("); + 
f->puts ("p("); _List _varNames; long i; for (i=0; iGetName(); - fprintf(f,"%s",((_String*)_varNames(i))->get_str()); + f->puts (((_String*)_varNames(i))->get_str()); if (iputs (","); } } - fprintf(f,")="); + f->puts (")="); for (i=0; iNumberOfTerms(); i++) { char number [100]; snprintf (number, sizeof(number),PRINTF_FORMAT_STRING,theTerms->GetCoeff(i)); if ((i>0)&&(number[0]!='-')) { - fprintf(f,"+"); + f->puts ("+"); } - fprintf(f,"%s",number); + f->puts (number); if ((i>0)||!theTerms->IsFirstANumber()) { - fprintf(f,"*"); + f->puts ("*"); long *cT = theTerms->GetTerm(i); for (long k=0; k0) { - fprintf(f,"%s",((_String*)_varNames(k))->get_str()); + f->puts (((_String*)_varNames(k))->get_str()); if (*cT>1) { - fprintf(f,"^");; - fprintf(f,"%ld",*cT); + f->puts ("^"); + snprintf (number, sizeof(number),"%ld",*cT); + f->puts (number); } } } diff --git a/src/core/string_file_wrapper.cpp b/src/core/string_file_wrapper.cpp index 066adbfd3..04211e718 100644 --- a/src/core/string_file_wrapper.cpp +++ b/src/core/string_file_wrapper.cpp @@ -39,7 +39,7 @@ #include "string_file_wrapper.h" -StringFileWrapper::StringFileWrapper (_StringBuffer *string, FILE *file) { +StringFileWrapper::StringFileWrapper (_StringBuffer *string, hyFile *file) { string_buffer = string; file_buffer = file; } @@ -48,7 +48,7 @@ StringFileWrapper& StringFileWrapper::operator << (const char* buffer) { if (string_buffer) { *string_buffer << buffer; } else if (file_buffer) { - fprintf (file_buffer, buffer); + file_buffer->puts (buffer); } return *this; } @@ -57,7 +57,7 @@ StringFileWrapper& StringFileWrapper::operator << (const char letter) { if (string_buffer) { *string_buffer << letter; } else if (file_buffer) { - fputc (letter, file_buffer); + file_buffer->putc (letter); } return *this; } @@ -66,7 +66,7 @@ StringFileWrapper& StringFileWrapper::operator << (const _String& buffer) { if (string_buffer) { *string_buffer << buffer; } else if (file_buffer) { - fprintf (file_buffer, "%s", (const 
char*)buffer); + file_buffer->puts ((const char*)buffer); } return *this; } @@ -91,14 +91,14 @@ StringFileWrapper& StringFileWrapper::operator << (const StringFileWrapperConsta } else if (file_buffer) { switch (special) { case kStringFileWrapperNewLine: - fputc ('\n', file_buffer); + file_buffer->putc ('\n'); break; case kStringFileWrapperLinefeed: - fputc ('\r', file_buffer); - break; + file_buffer->putc ('\r'); + break; case kStringFileWrapperTab: - fputc ('\t', file_buffer); - break; + file_buffer->putc ('\t'); + break; } } return *this; diff --git a/src/core/strings.cpp b/src/core/strings.cpp index 363684989..3f17bb170 100644 --- a/src/core/strings.cpp +++ b/src/core/strings.cpp @@ -282,6 +282,26 @@ _String::_String(FILE * file, long read_this_many) { } } +//============================================================= +_String::_String(hyFile * file, long read_this_many) { + _String::Initialize (); + if (file) { + if (read_this_many < 0) { + file->seek (0, SEEK_END); + s_length = (unsigned long) file->tell(); + file->rewind(); + } else { + s_length = read_this_many; + } + s_data = (char *)MemAllocate(s_length + 1UL); + unsigned long read_items = file->read(s_data, 1, s_length); + if (read_items < s_length) { + s_data = (char*)MemReallocate(s_data,read_items+1); + s_length = read_items; + } + s_data[s_length] = '\0'; + } +} //============================================================= _String::~_String(void) { if (CanFreeMe()) { diff --git a/src/core/topology.cpp b/src/core/topology.cpp index fb151cbf0..ec6bca60b 100644 --- a/src/core/topology.cpp +++ b/src/core/topology.cpp @@ -957,9 +957,9 @@ BaseRef _TreeTopology::toStr (unsigned long) { } //__________________________________________________________________________________ -void _TreeTopology::toFileStr(FILE* f, unsigned long padding) { +void _TreeTopology::toFileStr(hyFile * f, unsigned long padding) { _String * s = (_String*)toStr(padding); - fprintf (f, "%s", s->get_str()); + f->puts (s->get_str()); 
DeleteObject(s); } diff --git a/src/core/variable.cpp b/src/core/variable.cpp index 0f5fab881..e1e2285a0 100644 --- a/src/core/variable.cpp +++ b/src/core/variable.cpp @@ -167,14 +167,14 @@ BaseRef _Variable::toStr(unsigned long padding) } //__________________________________________________________________________________ -void _Variable::toFileStr(FILE* f, unsigned long padding) +void _Variable::toFileStr(hyFile* f, unsigned long padding) { if (varValue&&varValue->IsPrintable()) { varValue->toFileStr(f, padding); } else { HBLObjectRef vv = Compute(); if (!vv) { - fprintf(f,"NAN"); + f->puts ("NAN"); } else { vv->toFileStr(f, padding); } diff --git a/src/mains/unix.cpp b/src/mains/unix.cpp index 8eed24af6..19c867ffe 100644 --- a/src/mains/unix.cpp +++ b/src/mains/unix.cpp @@ -283,10 +283,10 @@ void ReadInTemplateFiles(void) { _String dir_sep (get_platform_directory_char()), fileIndex = *((_String*)pathNames(0)) & hy_standard_library_directory & dir_sep & "files.lst"; - FILE* modelList = fopen (fileIndex.get_str(),"r"); + hyFile * modelList = doFileOpen (fileIndex.get_str(),kFileRead, false); if (!modelList) { fileIndex = baseArgDir& hy_standard_library_directory & dir_sep & "files.lst"; - modelList = fopen (fileIndex.get_str(),"r"); + modelList = doFileOpen (fileIndex.get_str(),kFileRead, false); if (!modelList) { return; } @@ -295,7 +295,8 @@ void ReadInTemplateFiles(void) { } _String theData (modelList); - fclose (modelList); + modelList->close(); + delete (modelList); if (theData.length()) { _List extracted_files; diff --git a/src/new/bayesgraph.cpp b/src/new/bayesgraph.cpp index 4e35e961d..48061d997 100644 --- a/src/new/bayesgraph.cpp +++ b/src/new/bayesgraph.cpp @@ -77,7 +77,8 @@ _String #ifdef __UNIX__ void ConsoleBGMStatus (_String const statusLine, hyFloat percentDone, _String const * fileName) { - FILE *outFile = fileName?doFileOpen (fileName->get_str(),"w"):nil; + + hyFile *outFile = fileName?doFileOpen (fileName->get_str(),kFileWrite):nil; _String 
reportLine (statusLine); if (percentDone >= 0.0) { @@ -85,20 +86,21 @@ void ConsoleBGMStatus (_String const statusLine, hyFloat percentDone, _St } if (outFile) { - fprintf (outFile,"%s", reportLine.get_str()); + outFile->puts(reportLine.get_str()); } else if (verbosity_level == 1) { printf ("\033\015 %s", reportLine.get_str()); + if (percentDone < -1.5) { + printf ("\033\015 "); + setvbuf (stdout,nil,_IOLBF,1024); + } else if (percentDone < -0.5) { + setvbuf (stdout,nil, _IONBF,1); + } } - if (percentDone < -1.5) { - printf ("\033\015 "); - setvbuf (stdout,nil,_IOLBF,1024); - } else if (percentDone < -0.5) { - setvbuf (stdout,nil, _IONBF,1); - } - + if (outFile) { - fclose (outFile); + outFile->close(); + delete (outFile); } } From 707d1530cd6e296ec539d799b935361d4558dbf1 Mon Sep 17 00:00:00 2001 From: Sergei Date: Wed, 8 Nov 2023 11:10:59 -0500 Subject: [PATCH 02/14] better support for the PAML file format --- src/core/dataset.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/core/dataset.cpp b/src/core/dataset.cpp index a0755eace..b76e03116 100644 --- a/src/core/dataset.cpp +++ b/src/core/dataset.cpp @@ -1972,15 +1972,21 @@ void TrimPhylipLine (_String& CurrentLine, _DataSet& ds) { int fNS = CurrentLine.FirstNonSpaceIndex(), space2 = CurrentLine.FirstSpaceIndex (fNS + 1); - // hack for PAML support - if (space2 > fNS && isspace(CurrentLine.char_at (space2+1))) { - _String sequence_name (CurrentLine,fNS, space2); - CurrentLine.Trim(space2+2,-1); // chop out the name + if (space2 < 0 && CurrentLine.length() > 10) { + _String sequence_name (CurrentLine,fNS, CurrentLine.length()); ds.AddName(sequence_name); + CurrentLine.Trim(CurrentLine.length(),-1); } else { - _String sequence_name (CurrentLine,fNS, fNS+9); - CurrentLine.Trim(fNS+10,-1); // chop out the name - ds.AddName(sequence_name); + // hack for PAML support + if (space2 > fNS && isspace(CurrentLine.char_at (space2+1))) { + _String sequence_name (CurrentLine,fNS, 
space2); + CurrentLine.Trim(space2+2,-1); // chop out the name + ds.AddName(sequence_name); + } else { + _String sequence_name (CurrentLine,fNS, fNS+9); + CurrentLine.Trim(fNS+10,-1); // chop out the name + ds.AddName(sequence_name); + } } } From 4e530edab4574a9efb3bd463d818ef8a341deb38 Mon Sep 17 00:00:00 2001 From: Sergei Date: Wed, 8 Nov 2023 15:38:32 -0500 Subject: [PATCH 03/14] lib v3 tweaks --- res/TemplateBatchFiles/libv3/tasks/trees.bf | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/res/TemplateBatchFiles/libv3/tasks/trees.bf b/res/TemplateBatchFiles/libv3/tasks/trees.bf index 9368aa92b..e5acfd544 100644 --- a/res/TemplateBatchFiles/libv3/tasks/trees.bf +++ b/res/TemplateBatchFiles/libv3/tasks/trees.bf @@ -4,6 +4,7 @@ LoadFunctionLibrary("../convenience/regexp.bf"); LoadFunctionLibrary("../UtilityFunctions.bf"); LoadFunctionLibrary("TreeTools"); LoadFunctionLibrary("distances.bf"); +LoadFunctionLibrary("../convenience/regexp.bf"); /** @module trees */ @@ -371,7 +372,7 @@ lfunction trees.branch_names(tree, respect_case) { */ lfunction trees.extract_paml_annotation (tree_string) { - + pieces = regexp.SplitWithMatches (tree_string, "\$([0-9])"); N = utility.Array1D (pieces[^("terms.regexp.separators")]); @@ -380,30 +381,33 @@ lfunction trees.extract_paml_annotation (tree_string) { if (N > 0) { for (i,s; in; pieces[^("terms.regexp.strings")]) { sanitized_string += s; - if (+i > 0) { - sanitized_string += "{" + regexp.Replace ((pieces[^("terms.regexp.separators")])[-1+i], "\$", "PAML_CLADE_") + "}"; + if (+i < N) { + sanitized_string += "{" + regexp.Replace ((pieces[^("terms.regexp.separators")])[+i], "\$", "PAML_CLADE_") + "}"; } } } + pieces = regexp.SplitWithMatches (sanitized_string, "\#([0-9])"); N = utility.Array1D (pieces[^("terms.regexp.separators")]); if (N > 0) { for (i,s; in; pieces[^("terms.regexp.strings")]) { sanitized_string += s; - if (+i > 0) { - sanitized_string += "{" + regexp.Replace 
((pieces[^("terms.regexp.separators")])[-1+i], "\#", "PAML_GROUP_") + "}"; + if (+i < N) { + sanitized_string += "{" + regexp.Replace ((pieces[^("terms.regexp.separators")])[+i], "\#", "PAML_GROUP_") + "}"; } } } - + if (Abs (sanitized_string) == 0) { sanitized_string = tree_string; } + utility.ToggleEnvVariable ("IGNORE_INTERNAL_NODE_LABELS", TRUE); result = trees.ExtractTreeInfo (sanitized_string); - + utility.ToggleEnvVariable ("IGNORE_INTERNAL_NODE_LABELS", None); + if (has_paml_clades) { additional_annotation = {}; tree_str = result[^"terms.trees.newick"]; @@ -416,6 +420,8 @@ lfunction trees.extract_paml_annotation (tree_string) { } } } + + result[^"terms.trees.model_map"] * additional_annotation; result[^"terms.trees.model_list"] = Columns(result[^"terms.trees.model_map"]); result[^"terms.trees.newick_annotated"] = tree.Annotate (&T, result[^"terms.trees.model_map"], "{}", FALSE); @@ -584,6 +590,7 @@ lfunction trees.ExtractTreeInfo(tree_string) { Topology T = tree_string; res = trees.ExtractTreeInfoFromTopology (&T); res [^"terms.data.file"] = file_name; + DeleteObject (T); return res; } From c17ed47914463f67af38f7bd12e459810c4934c0 Mon Sep 17 00:00:00 2001 From: Sergei Date: Wed, 8 Nov 2023 16:04:13 -0500 Subject: [PATCH 04/14] NOZLIB compile fixes --- src/core/hyFile.cpp | 2 +- src/core/include/hy_types.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/core/hyFile.cpp b/src/core/hyFile.cpp index 889f41293..f76f67753 100644 --- a/src/core/hyFile.cpp +++ b/src/core/hyFile.cpp @@ -109,7 +109,7 @@ case kFileWrite: str_mode = "w"; break; - case kFileWrite: + case kFileWriteBinary: str_mode = "wb"; break; case kFileAppend: diff --git a/src/core/include/hy_types.h b/src/core/include/hy_types.h index 21bf65bf8..ee760e25f 100644 --- a/src/core/include/hy_types.h +++ b/src/core/include/hy_types.h @@ -98,7 +98,6 @@ class hyFile { #endif } static hyFile* openFile (const char * file_path, hyFileOpenMode mode , bool error = false, bool compress 
= false, long buffer = 1024*128); - inline bool valid (void) const {return _fileReference != NULL || _fileReferenceDirect != NULL;} void lock (void); void unlock (void); void rewind (void); @@ -113,9 +112,11 @@ class hyFile { size_t tell (); int getc (); #ifdef __ZLIB__ + inline bool valid (void) const {return _fileReference != NULL || _fileReferenceDirect != NULL;} gzFile _fileReference; FILE * _fileReferenceDirect; #else + inline bool valid (void) const {return _fileReference != NULL;} FILE* _fileReference; #endif }; From 0031efa287a1847ce63aac1b6a635e86fcfc8661 Mon Sep 17 00:00:00 2001 From: Sergei Date: Wed, 8 Nov 2023 16:07:58 -0500 Subject: [PATCH 05/14] NOZLIB compile fixes --- src/core/dataset.cpp | 10 +++++----- src/core/global_things.cpp | 2 +- src/core/hyFile.cpp | 2 +- src/core/include/hy_types.h | 13 +++++++------ src/core/matrix.cpp | 4 ++-- src/core/string_file_wrapper.cpp | 8 ++++---- 6 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/core/dataset.cpp b/src/core/dataset.cpp index eb975b552..76222bf92 100644 --- a/src/core/dataset.cpp +++ b/src/core/dataset.cpp @@ -197,7 +197,7 @@ void _DataSet::AddSite(char c) { if (theMap.list_data[1] == 0) { if (theNames.lLength) { streamThrough->puts ("(_String *)theNames(0))->get_str()"); - streamThrough->putc ('\n'); + streamThrough->fputc ('\n'); } else { streamThrough->puts (">Sequence 1"); } @@ -206,7 +206,7 @@ void _DataSet::AddSite(char c) { theMap.list_data[1]++; theMap.list_data[2]++; - streamThrough->putc(c); + streamThrough->fputc(c); } else { HandleApplicationError("Can't add more sites to a file based data set, " "when more that one sequence has been written!", @@ -238,13 +238,13 @@ void _DataSet::Write2Site(long index, char c, char skip_char) { if (theNames.lLength > theMap.list_data[0]) { streamThrough->puts ("\n>"); streamThrough->puts (((_String *)theNames(theMap.list_data[0]))->get_str()); - streamThrough->putc ('\n'); + streamThrough->fputc ('\n'); } else { streamThrough->puts 
("\n>"); char buffer[64]; snprintf (buffer, 64, "%ld", theMap.list_data[0] + 1); streamThrough->puts (buffer); - streamThrough->putc ('\n'); + streamThrough->fputc ('\n'); //fprintf(streamThrough, "\n>Sequence %ld\n", theMap.list_data[0] + 1); } @@ -261,7 +261,7 @@ void _DataSet::Write2Site(long index, char c, char skip_char) { } theMap.list_data[1]++; - streamThrough->putc(c); + streamThrough->fputc(c); } else { if (useHorizontalRep) { long currentWritten = ((_String *)list_data[0])->length(); diff --git a/src/core/global_things.cpp b/src/core/global_things.cpp index 9c758f7b5..ab68d6652 100644 --- a/src/core/global_things.cpp +++ b/src/core/global_things.cpp @@ -762,7 +762,7 @@ namespace hy_global { } if (hy_error_log_file) { - hy_error_log_file->putc ('\n'); + hy_error_log_file->fputc ('\n'); hy_error_log_file->puts ((const char*)message); } diff --git a/src/core/hyFile.cpp b/src/core/hyFile.cpp index f76f67753..7f4f39a22 100644 --- a/src/core/hyFile.cpp +++ b/src/core/hyFile.cpp @@ -271,7 +271,7 @@ } //____________________________________________________________________________________ - int hyFile::putc( int chr) { + int hyFile::fputc( int chr) { if (valid()) { #ifdef __ZLIB__ if (_fileReferenceDirect) { diff --git a/src/core/include/hy_types.h b/src/core/include/hy_types.h index ee760e25f..add98ec13 100644 --- a/src/core/include/hy_types.h +++ b/src/core/include/hy_types.h @@ -92,11 +92,12 @@ enum hyFileOpenMode { class hyFile { public: - hyFile (void) {_fileReference = NULL; - #ifdef __ZLIB__ - _fileReferenceDirect = NULL; - #endif - } + hyFile (void) { + _fileReference = NULL; + #ifdef __ZLIB__ + _fileReferenceDirect = NULL; + #endif + } static hyFile* openFile (const char * file_path, hyFileOpenMode mode , bool error = false, bool compress = false, long buffer = 1024*128); void lock (void); void unlock (void); @@ -108,7 +109,7 @@ class hyFile { unsigned long read (void* buffer, unsigned long size, unsigned long items); size_t fwrite( const void* buffer, 
size_t size, size_t count); int puts(const char *str); - int putc(int chr); + int fputc(int chr); size_t tell (); int getc (); #ifdef __ZLIB__ diff --git a/src/core/matrix.cpp b/src/core/matrix.cpp index 3560b0837..f78744818 100644 --- a/src/core/matrix.cpp +++ b/src/core/matrix.cpp @@ -8551,9 +8551,9 @@ void _Matrix::ExportMatrixExp (_Matrix* theBase, hyFile* theDump) for (k=0; kputs (LocateVar(mxVariables(k))->GetName()->get_str()); if (kputc (','); + theDump->fputc (','); } else { - theDump->putc (';'); + theDump->fputc (';'); } varPool[k]=topPolyCap; } diff --git a/src/core/string_file_wrapper.cpp b/src/core/string_file_wrapper.cpp index 04211e718..09ab7ab8c 100644 --- a/src/core/string_file_wrapper.cpp +++ b/src/core/string_file_wrapper.cpp @@ -57,7 +57,7 @@ StringFileWrapper& StringFileWrapper::operator << (const char letter) { if (string_buffer) { *string_buffer << letter; } else if (file_buffer) { - file_buffer->putc (letter); + file_buffer->fputc (letter); } return *this; } @@ -91,13 +91,13 @@ StringFileWrapper& StringFileWrapper::operator << (const StringFileWrapperConsta } else if (file_buffer) { switch (special) { case kStringFileWrapperNewLine: - file_buffer->putc ('\n'); + file_buffer->fputc ('\n'); break; case kStringFileWrapperLinefeed: - file_buffer->putc ('\r'); + file_buffer->fputc ('\r'); break; case kStringFileWrapperTab: - file_buffer->putc ('\t'); + file_buffer->fputc ('\t'); break; } } From 13e2ca530f1d519e24cad019e9ae357705b6ddd1 Mon Sep 17 00:00:00 2001 From: Sergei Date: Thu, 9 Nov 2023 15:56:09 -0500 Subject: [PATCH 06/14] PAML annotator fixes --- res/TemplateBatchFiles/libv3/tasks/trees.bf | 27 ++++++++++++--------- src/core/global_things.cpp | 2 +- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/res/TemplateBatchFiles/libv3/tasks/trees.bf b/res/TemplateBatchFiles/libv3/tasks/trees.bf index e5acfd544..9a2b09fdf 100644 --- a/res/TemplateBatchFiles/libv3/tasks/trees.bf +++ b/res/TemplateBatchFiles/libv3/tasks/trees.bf @@ 
-373,33 +373,37 @@ lfunction trees.branch_names(tree, respect_case) { lfunction trees.extract_paml_annotation (tree_string) { - pieces = regexp.SplitWithMatches (tree_string, "\$([0-9])"); + pieces = regexp.SplitWithMatches (tree_string, "\$[0-9]+"); N = utility.Array1D (pieces[^("terms.regexp.separators")]); has_paml_clades = N > 0; sanitized_string = ""; if (N > 0) { - for (i,s; in; pieces[^("terms.regexp.strings")]) { - sanitized_string += s; - if (+i < N) { - sanitized_string += "{" + regexp.Replace ((pieces[^("terms.regexp.separators")])[+i], "\$", "PAML_CLADE_") + "}"; + for (i = 0; i <= N; i+=1) { + sanitized_string += (pieces[^("terms.regexp.strings")])[i]; + if (i < N) { + sanitized_string += "{" + regexp.Replace ((pieces[^("terms.regexp.separators")])[i], "\$", "PAML_CLADE_") + "}"; } } + } else { + sanitized_string = tree_string; } - pieces = regexp.SplitWithMatches (sanitized_string, "\#([0-9])"); + pieces = regexp.SplitWithMatches (sanitized_string, "\#[0-9]+"); N = utility.Array1D (pieces[^("terms.regexp.separators")]); if (N > 0) { - for (i,s; in; pieces[^("terms.regexp.strings")]) { - sanitized_string += s; - if (+i < N) { - sanitized_string += "{" + regexp.Replace ((pieces[^("terms.regexp.separators")])[+i], "\#", "PAML_GROUP_") + "}"; + sanitized_string = ""; + for (i = 0; i <= N; i+=1) { + sanitized_string += (pieces[^("terms.regexp.strings")])[i]; + if (i < N) { + sanitized_string += "{" + regexp.Replace ((pieces[^("terms.regexp.separators")])[i], "\#", "PAML_GROUP_") + "}"; } } } - + + if (Abs (sanitized_string) == 0) { sanitized_string = tree_string; } @@ -408,6 +412,7 @@ lfunction trees.extract_paml_annotation (tree_string) { result = trees.ExtractTreeInfo (sanitized_string); utility.ToggleEnvVariable ("IGNORE_INTERNAL_NODE_LABELS", None); + if (has_paml_clades) { additional_annotation = {}; tree_str = result[^"terms.trees.newick"]; diff --git a/src/core/global_things.cpp b/src/core/global_things.cpp index ab68d6652..5509332a7 100644 --- 
a/src/core/global_things.cpp +++ b/src/core/global_things.cpp @@ -122,7 +122,7 @@ namespace hy_global { kErrorStringMatrixExportError ("Export matrix called with a non-polynomial matrix argument"), kErrorStringNullOperand ("Attempting to operate on an undefined value; this is probably the result of an earlier 'soft' error condition"), kErrorNumerical ("To treat numerical errors as warnings, please specify \"ENV=TOLERATE_NUMERICAL_ERRORS=1;\" as the command line argument. This often resolves the issue, which is indicative of numerical instability."), - kHyPhyVersion = _String ("2.5.57"), + kHyPhyVersion = _String ("2.5.58"), kNoneToken = "None", kNullToken = "null", From 248158e32d875b2bdef158ec2111bdabe9416375 Mon Sep 17 00:00:00 2001 From: Sergei Date: Tue, 14 Nov 2023 15:50:08 -0500 Subject: [PATCH 07/14] Bug fixes --- res/TemplateBatchFiles/libv3/IOFunctions.bf | 1 + .../libv3/UtilityFunctions.bf | 45 +++++++++-------- src/core/batchlanruntime.cpp | 48 +++++++++++++------ src/core/include/hy_types.h | 2 + 4 files changed, 61 insertions(+), 35 deletions(-) diff --git a/res/TemplateBatchFiles/libv3/IOFunctions.bf b/res/TemplateBatchFiles/libv3/IOFunctions.bf index da323c4e3..7c1e95c00 100644 --- a/res/TemplateBatchFiles/libv3/IOFunctions.bf +++ b/res/TemplateBatchFiles/libv3/IOFunctions.bf @@ -179,6 +179,7 @@ lfunction io.SpoolJSON(json, file) { lfunction io.ParseJSON(file_path) { fscanf(file_path, REWIND, "Raw", test); parsed_test = Eval(test); + DeleteObject (test); return parsed_test; } diff --git a/res/TemplateBatchFiles/libv3/UtilityFunctions.bf b/res/TemplateBatchFiles/libv3/UtilityFunctions.bf index 0697c47bc..09db689e2 100644 --- a/res/TemplateBatchFiles/libv3/UtilityFunctions.bf +++ b/res/TemplateBatchFiles/libv3/UtilityFunctions.bf @@ -793,15 +793,15 @@ lfunction utility.Has (d, key, type) { return FALSE; } - if (Type (key) == "Matrix") { - depth = utility.Array1D (key); - current_check = &d; + if (Type (key) == "Matrix") { + depth = utility.Array1D 
(key); + current_check = d; current_key = key[0]; for (i = 1; i < depth; i += 1) { - if (Eval ("`current_check`/'`current_key`'")) { - current_check = "(" + current_check + ")['`current_key`']"; - if (Eval ("Type(`current_check`)") != "AssociativeList") { + if (current_check / current_key) { + current_check = current_check[current_key]; + if (Type (current_check) != "AssociativeList") { return FALSE; } current_key = key[i]; @@ -809,13 +809,17 @@ lfunction utility.Has (d, key, type) { return FALSE; } } - - if ( Eval ("`current_check`/'`current_key`'") ) { + + + if ( current_check / current_key ) { + return_value = current_check[current_key]; if (type == None) { return TRUE; } - return Eval ("Type((`current_check`)['`current_key`'])") == type; - } + if (Type (return_value) == type) { + return TRUE; + } + } } } return FALSE; @@ -847,24 +851,25 @@ lfunction utility.GetByKey (d, key, type) { } if (Type (key) == "Matrix") { - depth = utility.Array1D (key); - current_check = &d; + depth = utility.Array1D (key); + current_check = d; current_key = key[0]; for (i = 1; i < depth; i += 1) { - if (Eval ("`current_check`/'`current_key`'")) { - current_check = "(" + current_check + ")['`current_key`']"; - if (Eval ("Type(`current_check`)") != "AssociativeList") { - return FALSE; + if (current_check / current_key) { + current_check = current_check[current_key]; + if (Type (current_check) != "AssociativeList") { + return None; } current_key = key[i]; } else { - return FALSE; + return None; } } - - if ( Eval ("`current_check`/'`current_key`'") ) { - return_value = Eval ("(`current_check`)['`current_key`']"); + + + if ( current_check / current_key ) { + return_value = current_check[current_key]; if (type == None) { return return_value; } diff --git a/src/core/batchlanruntime.cpp b/src/core/batchlanruntime.cpp index e02ef5f57..864c7869c 100644 --- a/src/core/batchlanruntime.cpp +++ b/src/core/batchlanruntime.cpp @@ -1988,7 +1988,7 @@ bool 
_ElementaryCommand::HandleAdvanceIterator(_ExecutionList& current_prog if (source_object_class == TREE) node_name = new _FString (map_node_to_calcnode (topTraverser)->ContextFreeName()); else - node_name = new _FString (((_TreeTopology*)parameters.GetItem (1))->GetNodeName(topTraverser)); + node_name = new _FString (((_TreeTopology*)parameters.GetItem (reciever_count))->GetNodeName(topTraverser)); if (reciever_count > 1) { ((_Variable*)parameters.GetItem (reciever_count+2))->SetValue (node_name, false, false,NULL); @@ -1999,7 +1999,7 @@ bool _ElementaryCommand::HandleAdvanceIterator(_ExecutionList& current_prog if (source_object_class == TREE) parent_name = new _FString (map_node_to_calcnode (parent_node)->ContextFreeName()); else - parent_name = new _FString (((_TreeTopology*)parameters.GetItem (1))->GetNodeName(parent_node)); + parent_name = new _FString (((_TreeTopology*)parameters.GetItem (reciever_count))->GetNodeName(parent_node)); ((_Variable*)parameters.GetItem (reciever_count+1))->SetValue (parent_name, false, false,NULL); } else { ((_Variable*)parameters.GetItem (reciever_count+1))->SetValue (new _MathObject, false, false,NULL); @@ -3795,24 +3795,42 @@ bool _ElementaryCommand::HandleFscanf (_ExecutionList& current_program, boo last_call_stream_position = 0L; } - input_stream->seek (0,SEEK_END); - current_stream_position = input_stream->tell(); - current_stream_position -= last_call_stream_position; - - if (current_stream_position<=0) { - hy_env::EnvVariableSet(hy_env::end_of_file, new HY_CONSTANT_TRUE, false); - input_stream->close(); - delete (input_stream); - return true; + _String * file_data = nil; + + if (input_stream->is_compressed()) { + /** + 20231113 : when the stream is compressed, SEEK_END is not available. 
+ */ + _StringBuffer * file_data_buffer = new _StringBuffer (input_stream); + if (file_data_buffer->length() == 0L) { + hy_env::EnvVariableSet(hy_env::end_of_file, new HY_CONSTANT_TRUE, false); + input_stream->close(); + delete (input_stream); + DeleteObject (file_data_buffer); + return true; + } + file_data = file_data_buffer; + } else { + input_stream->seek (0,SEEK_END); + current_stream_position = input_stream->tell(); + current_stream_position -= last_call_stream_position; + + if (current_stream_position<=0) { + hy_env::EnvVariableSet(hy_env::end_of_file, new HY_CONSTANT_TRUE, false); + input_stream->close(); + delete (input_stream); + return true; + } + + input_stream->rewind(); + input_stream->seek (last_call_stream_position, SEEK_SET); + file_data = new _String (input_stream, current_stream_position); } - input_stream->rewind(); - input_stream->seek (last_call_stream_position, SEEK_SET); - _String * file_data = new _String (input_stream, current_stream_position); + input_data = file_data; input_stream->close(); delete (input_stream); dynamic_reference_manager < file_data; - input_data = file_data; current_stream_position = 0L; } } diff --git a/src/core/include/hy_types.h b/src/core/include/hy_types.h index add98ec13..fca090501 100644 --- a/src/core/include/hy_types.h +++ b/src/core/include/hy_types.h @@ -116,8 +116,10 @@ class hyFile { inline bool valid (void) const {return _fileReference != NULL || _fileReferenceDirect != NULL;} gzFile _fileReference; FILE * _fileReferenceDirect; + bool is_compressed (void) const { return _fileReference != NULL;} #else inline bool valid (void) const {return _fileReference != NULL;} + bool is_compressed (void) const { return false;} FILE* _fileReference; #endif }; From 61f161b8074fae2dff8802db37d4c49cd041c12d Mon Sep 17 00:00:00 2001 From: Sergei Date: Tue, 14 Nov 2023 15:52:53 -0500 Subject: [PATCH 08/14] Adding an error filter --- .../SelectionAnalyses/error-filter.bf | 270 ++++++++++++++++++ 
res/TemplateBatchFiles/files.lst | 1 + 2 files changed, 271 insertions(+) create mode 100644 res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf diff --git a/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf b/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf new file mode 100644 index 000000000..e64145053 --- /dev/null +++ b/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf @@ -0,0 +1,270 @@ +RequireVersion("2.5.58"); + +/*------------------------------------------------------------------------------ + Load library files +*/ + +LoadFunctionLibrary("libv3/UtilityFunctions.bf"); +LoadFunctionLibrary("libv3/IOFunctions.bf"); +LoadFunctionLibrary("libv3/stats.bf"); +LoadFunctionLibrary("libv3/all-terms.bf"); + +LoadFunctionLibrary("libv3/tasks/ancestral.bf"); +LoadFunctionLibrary("libv3/tasks/alignments.bf"); +LoadFunctionLibrary("libv3/tasks/estimators.bf"); +LoadFunctionLibrary("libv3/tasks/trees.bf"); + +LoadFunctionLibrary("libv3/convenience/math.bf"); + + +/*------------------------------------------------------------------------------ + Display analysis information +*/ + +efilter.analysis_description = { + terms.io.info: " +The error filter analysis reads a BUSTED-E JSON result file, +identifies sites which may be due to alignment or other error, +and masks them according to a user-selected strategy, saving the resulting MSA to a new file. +A machine readable report on the errors filtered (if any) is also created. 
+ ", + terms.io.version: "0.1", + terms.io.reference: "TBD", + terms.io.authors: "Sergei L Kosakovsky Pond", + terms.io.contact: "spond@temple.edu", + terms.io.requirements: "A BUSTED-E .json results file (possibly .gz compressed)", + terms.settings: {} +}; +io.DisplayAnalysisBanner(efilter.analysis_description); + +KeywordArgument ("json", "Load the BUSTED-E .json result file", None); +efilter.path = io.PromptUserForFilePathRead ("Load the BUSTED-E .json result file"); +io.ReportProgressMessageMD ("efilter","loading","Loading .JSON data"); + +KeywordArgument ("threshold", "Empirical Bayes Factor error threshold for masking sites", 100.); +efilter.threshold = io.PromptUser ("Empirical Bayes Factor threshold for masking sites", 100., 1, 1e100, FALSE); + +KeywordArgument ("ratio", "Empirical Bayes Factor for error vs selection ", 20.); +efilter.ratio_threshold = io.PromptUser ("Empirical Bayes Factor for error vs selection", 20., 1, 1e100, FALSE); + + +KeywordArgument ("site-threshold", "Mask the entire site if more than X% sequences are problematic", .4); +efilter.site_threshold = io.PromptUser ("Mask the entire site if more than X% sequences are problematic", 0.4, 0., 1., FALSE); + + +efilter.json_out = { + terms.json.analysis : efilter.analysis_description, + terms.settings : { + terms.empirical_bayes_factor : efilter.threshold, + "BF ratio" : efilter.ratio_threshold, + "site threshold" : efilter.site_threshold + } +}; + +efilter.json = io.ParseJSON(efilter.path); + +io.CheckAssertion ("utility.GetByKey (efilter.json, {{terms.json.analysis, 'settings', 'error-sink'}},'Number')", "There is no error sink data in the JSON file"); + + +efilter.error_sink_proportion = utility.GetByKey (efilter.json, {{terms.json.fits,"Unconstrained model",terms.json.rate_distribution,"Test","0",terms.json.proportion}},'Number'); + +io.CheckAssertion ("None!=efilter.error_sink_proportion", "There is no estimate of the error-sink omega proportion"); + +efilter.rates = utility.GetByKey 
(efilter.json, {{terms.json.fits,"Unconstrained model",terms.json.rate_distribution,"Test"}},'AssociativeList'); +efilter.rates_count = utility.Array1D (efilter.rates); + +efilter.fast_omega_proportion = utility.GetByKey (efilter.json, {{terms.json.fits,"Unconstrained model",terms.json.rate_distribution,"Test","" + (efilter.rates_count-1),terms.json.proportion}},'Number'); + +efilter.input = utility.GetByKey (efilter.json, {{terms.json.input}}, "AssociativeList"); +io.CheckAssertion ("None!=efilter.input", "There is no information about the alignment"); + +io.ReportProgressMessageMD ("efilter","loading","- Original data file path: \`" + efilter.input[terms.json.file] + "\`"); +io.ReportProgressMessageMD ("efilter","loading","- Partitions: " + efilter.input[terms.json.partition_count]); +io.ReportProgressMessageMD ("efilter","loading","- Sequences: " + efilter.input[terms.json.sequences]); +io.ReportProgressMessageMD ("efilter","loading","- Sites: " + efilter.input[terms.json.sites]); + +efilter.json_out[terms.json.input] = efilter.input; + +io.ReportProgressMessageMD ("efilter","loading","- Estimated proportion of the MSA in the error-sink class = " + Format (efilter.error_sink_proportion *100, 8, 3) + "%"); + +if (efilter.error_sink_proportion == 0) { + efilter.error_sink_prior_odds = 1e100; +} else { + efilter.error_sink_prior_odds = efilter.error_sink_proportion / (1-efilter.error_sink_proportion); +} +efilter.error_sink_prior_odds_ratio = Min (1e25,efilter.error_sink_proportion / efilter.fast_omega_proportion); + +efilter.site_offset = 0; + +utility.ToggleEnvVariable ("LOOP_TREE_ITERATOR_PREORDER", TRUE); + +for (p = 0; p < efilter.input[terms.json.partition_count]; p+=1) { + efilter.sequences = {}; + efilter.masked_sites = {}; + efilter.branch_data = utility.GetByKey (efilter.json, {{terms.json.branch_attributes,p}}, "AssociativeList"); + efilter.tested_branches = utility.GetByKey (efilter.json, {{terms.json.tested,p}}, "AssociativeList"); + efilter.tree = 
utility.GetByKey (efilter.input, {{terms.json.trees,p}}, "String"); + efilter.subs = utility.GetByKey (efilter.json, {{terms.substitutions,p}}, "AssociativeList"); + Topology efilter.T = efilter.tree; + efilter.sites = utility.Array1D (efilter.subs ); + + if (p == 0) { + efilter.leaves = {}; + for (s;in;TipName (efilter.T,-1)) { + efilter.sequences[s] = ""; + efilter.sequences[s] * efilter.sites; + efilter.masked_sites[s] = {}; + efilter.leaves[s] = 1; + } + + } + efilter.descendants = {}; + efilter.leaf_descendants = {}; + + for (s;in;BranchName (efilter.T,-1)) { + if (efilter.sequences / s) { + continue; + } + efilter.descendants[s] = {}; + efilter.leaf_descendants[s] = {}; + for (s2, v; in; efilter.T [s]) { + if (v == 1) { + (efilter.leaf_descendants[s])[s2] = 1; + } + (efilter.descendants[s])[s2] = 1; + } + if (Abs (efilter.leaf_descendants[s]) * 2 > Abs (efilter.leaves)) { + t = efilter.leaf_descendants[s]; + efilter.leaf_descendants[s] = efilter.leaves; + efilter.leaf_descendants[s] - t; + } + } + + + for (site = 0; site < efilter.sites; site += 1) { + efilter.subs4site = efilter.subs [site]; + efilter.states = {}; + efilter.masked_already = {}; + efilter.write_out = {}; + for (parent, node; in; efilter.T) { + if (parent) { + if (efilter.subs4site/node) { + efilter.states [node] = efilter.subs4site[node]; + } else { + efilter.states [node] = efilter.states [parent] + } + efilter.write_state = efilter.states [node] ; + if (efilter.masked_already [node]) { + continue; + } + if (efilter.branch_data / node) { + efilter.site_prob = ((efilter.branch_data[node])["Posterior prob omega class by site"])[0][site]; + if (None == efilter.site_prob) { + continue; + } + efilter.site_BF2 = ((efilter.branch_data[node])["Posterior prob omega class by site"])[efilter.rates_count-1][site]; + if (efilter.site_prob < 1) { + efilter.site_BF = efilter.site_prob/(1-efilter.site_prob)/efilter.error_sink_prior_odds; + } else { + efilter.site_BF = 1e25; + } + + if (efilter.site_BF2 < 1) 
{ + efilter.site_BF2 = efilter.site_prob / efilter.site_BF2 / efilter.error_sink_prior_odds_ratio; + } else { + efilter.site_BF2 = 1e25; + } + + if (efilter.site_BF >= efilter.threshold && efilter.site_BF2 >= efilter.ratio_threshold) { + if (efilter.masked_sites/node) { // terminal node + if (efilter.masked_already [ntm] != 1) { + efilter.masked_sites [node] + (site + efilter.site_offset); + } + efilter.write_out[node] = "---"; + efilter.masked_already [node] = 1; + } else { + if (Abs (efilter.leaf_descendants[node]) / Abs (efilter.sequences) >= efilter.site_threshold) { + for (ntm, ignore; in; efilter.sequences) { + efilter.write_out[ntm] = "---"; + if (efilter.masked_already [ntm] != 1) { + efilter.masked_sites [ntm] + (site + efilter.site_offset); + } + } + //console.log ("Masking everything " + site + " " + node + " " + Abs (efilter.leaf_descendants) / Abs (efilter.sequences)); + break; + } + for (ntm, ignore; in; efilter.leaf_descendants[node]) { + efilter.write_out[ntm] = "---"; + if (efilter.masked_already [ntm] != 1) { + efilter.masked_sites [ntm] + (site + efilter.site_offset); + } + } + efilter.masked_already * efilter.leaf_descendants[node]; + } + } + } + } else { + efilter.states [node] = efilter.subs4site["root"]; + } + + + if (efilter.sequences / node && efilter.masked_already[node] == 0) { + efilter.write_out[node] = efilter.write_state; + } + } + + + for (s,st; in; efilter.write_out) { + efilter.sequences[s] * st; + } + + } + + efilter.site_offset += efilter.sites; +} + +utility.ToggleEnvVariable ("LOOP_TREE_ITERATOR_PREORDER", None); + +KeywordArgument ("output", "Write a filtered MSA to"); +efilter.path = io.PromptUserForFilePath ("Write a filtered MSA to"); + +io.ReportProgressMessageMD ("efilter","results","Filtering results"); + +efilter.settings = {"header" : TRUE, "column-widths": { + "0": 50, + "1": 20, + "2": 20 + }}; + +fprintf (stdout, "\n", io.FormatTableRow ({{"Sequence", "Filtered Sites", "Filtered Proportion, %"}}, efilter.settings)); 
+efilter.settings ["header"] = FALSE; +efilter.total_filtered = 0; + +for (s;in;TipName (efilter.T,-1)) { + efilter.row_matrix = {1,3}; + + efilter.row_matrix [0] = s; + efilter.row_matrix [1] = utility.Array1D (efilter.masked_sites[s]); + efilter.row_matrix [2] = Format (efilter.row_matrix [1] / efilter.site_offset*10, 10, 3); + + efilter.total_filtered += efilter.row_matrix [1]; + + fprintf (stdout, io.FormatTableRow (efilter.row_matrix, efilter.settings)); + efilter.sequences[s] * 0; + fprintf (efilter.path, ">", s, "\n", efilter.sequences[s], "\n"); +} + + +fprintf (stdout, "\n"); + +KeywordArgument ("output-json", "Write the resulting JSON to this file"); +efilter.json_path = io.PromptUserForFilePath ("Save the resulting JSON file to"); + +efilter.json_out ["filter"] = efilter.masked_sites ; + +io.SpoolJSON (efilter.json_out,efilter.json_path); + + + + + diff --git a/res/TemplateBatchFiles/files.lst b/res/TemplateBatchFiles/files.lst index e71a7e28c..81a9c7b40 100644 --- a/res/TemplateBatchFiles/files.lst +++ b/res/TemplateBatchFiles/files.lst @@ -8,6 +8,7 @@ "RELAX","[RELAX] Test for relaxation of selection pressure along a specified set of test branches using RELAX (a random effects test of selection relaxation).","SelectionAnalyses/RELAX.bf"; "FADE","[FADE] Test a protein alignment for directional selection towards specific amino acids along a specified set of test branches using FADE (a FUBAR Approach to Directional Evolution).","SelectionAnalyses/FADE.bf"; "PRIME","[PRIME] ","SelectionAnalyses/PRIME.bf"; +"error-filter","[Error-Filter] Use a BUSTED-E model fit to clean-up a sequence alignment","SelectionAnalyses/error-filter.bf"; "","A collection of tools for evolutionary hypothesis testing coding-sequence data.","!Evolutionary Hypothesis Testing"; "FMM", "Fit a model that permits double (and triple) instantaneous nucleotide substitutions","SelectionAnalyses/FitMultiModel.bf"; From 8bf64cb737df54a5aada8b4edb8d97e689b53b50 Mon Sep 17 00:00:00 2001 From: 
Sergei Date: Tue, 14 Nov 2023 15:56:32 -0500 Subject: [PATCH 09/14] Adding an error filter --- res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf b/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf index e64145053..4336c3f32 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf @@ -255,6 +255,9 @@ for (s;in;TipName (efilter.T,-1)) { } +io.ReportProgressMessageMD ("efilter","results","\nMasked a total of **`efilter.total_filtered`** or `Format (efilter.total_filtered/efilter.input[terms.json.sequences]/efilter.input[terms.json.sites]*100,8,3)`% sites"); + + fprintf (stdout, "\n"); KeywordArgument ("output-json", "Write the resulting JSON to this file"); From 815be03e84c8a5b0a7a2191c66ec331aec4134d2 Mon Sep 17 00:00:00 2001 From: Sergei Date: Wed, 15 Nov 2023 19:13:31 -0500 Subject: [PATCH 10/14] mpiOptimizerLoop fix for bug introduced during biowasm tweaks --- .../SelectionAnalyses/BUSTED.bf | 125 ++++++++++-------- .../SelectionAnalyses/error-filter.bf | 3 + src/core/batchlan.cpp | 2 - src/core/dataset.cpp | 36 ++--- src/core/global_things.cpp | 19 +++ src/core/likefunc.cpp | 22 +-- 6 files changed, 110 insertions(+), 97 deletions(-) diff --git a/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf b/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf index aaa06918b..8b571766c 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf @@ -473,7 +473,7 @@ busted.initial_grid = {}; busted.initial_grid_presets = {"0" : 0.1}; busted.initial_ranges = {}; -busted.init_grid_setup (busted.distribution); +busted.init_grid_setup (busted.distribution, busted.error_sink); /** setup parameter optimization groups */ @@ -487,7 +487,7 @@ if (busted.has_background) { busted.model_object_map = { "busted.background" : 
busted.background.bsrel_model, "busted.test" : busted.test.bsrel_model }; busted.background_distribution = models.codon.BS_REL.ExtractMixtureDistribution(busted.background.bsrel_model); - busted.init_grid_setup (busted.background_distribution); + busted.init_grid_setup (busted.background_distribution, busted.error_sink); //PARAMETER_GROUPING + busted.background_distribution["rates"]; //PARAMETER_GROUPING + busted.background_distribution["weights"]; @@ -529,7 +529,7 @@ if (busted.do_srv) { //PARAMETER_GROUPING + busted.srv_distribution["weights"]; PARAMETER_GROUPING + utility.Concat (busted.srv_distribution["rates"],busted.srv_distribution["weights"]); - busted.init_grid_setup (busted.srv_distribution); + busted.init_grid_setup (busted.srv_distribution, FALSE); } @@ -559,17 +559,16 @@ for (busted.partition_index = 0; busted.partition_index < busted.partition_count busted.initial.test_mean = ((selection.io.extract_global_MLE_re (busted.final_partitioned_mg_results, "^" + terms.parameters.omega_ratio + ".+test.+"))["0"])[terms.fit.MLE]; busted.initial_grid = estimators.LHC (busted.initial_ranges,busted.initial_grid.N); - //estimators.CreateInitialGrid (busted.initial_grid, busted.initial_grid.N, busted.initial_grid_presets); busted.initial_grid = utility.Map (busted.initial_grid, "_v_", - 'busted._renormalize (_v_, "busted.distribution", busted.initial.test_mean)' + 'busted._renormalize (_v_, "busted.distribution", busted.initial.test_mean, busted.error_sink)' ); if (busted.has_background) { //GDD rate category busted.initial.background_mean = ((selection.io.extract_global_MLE_re (busted.final_partitioned_mg_results, "^" + terms.parameters.omega_ratio + ".+background.+"))["0"])[terms.fit.MLE]; busted.initial_grid = utility.Map (busted.initial_grid, "_v_", - 'busted._renormalize (_v_, "busted.background_distribution", busted.initial.background_mean)' + 'busted._renormalize (_v_, "busted.background_distribution", busted.initial.background_mean, busted.error_sink)' ); } 
@@ -606,6 +605,7 @@ if (Type (debug.checkpoint) != "String") { // constrain nucleotide rate parameters busted.tmp_fixed = models.FixParameterSetRegExp (terms.nucleotideRatePrefix,busted.test.bsrel_model); + busted.grid_search.results = estimators.FitLF (busted.filter_names, busted.trees, busted.model_map, busted.final_partitioned_mg_results, busted.model_object_map, { "retain-lf-object": TRUE, @@ -1227,12 +1227,18 @@ lfunction busted.DistributionGuess (mean) { // renormalize the grid to have the same mean as the initial omega value -lfunction busted._renormalize (v, distro, mean) { +lfunction busted._renormalize (v, distro, mean, skip_first) { parameters.SetValues (v); m = parameters.GetStickBreakingDistribution (^distro); d = Rows (m); - m = +(m[-1][0] $ m[-1][1]); // current mean - for (i = 0; i < d; i+=1) { + if (skip_first) { + m0 = m[0][0]*m[0][1]; + } else { + m0 = 0; + } + m = +(m[-1][0] $ m[-1][1]) -m0; // current mean + + for (i = (skip_first != 0); i < d; i+=1) { (v[((^distro)["rates"])[i]])[^"terms.fit.MLE"] = (v[((^distro)["rates"])[i]])[^"terms.fit.MLE"] / m * mean; } return v; @@ -1292,65 +1298,68 @@ lfunction busted.mixture_site_BF (ll, p) { //------------------------------------------------------------------------------ -function busted.init_grid_setup (omega_distro) { - utility.ForEachPair (omega_distro[terms.parameters.rates], "_index_", "_name_", - ' - if (_index_[0] < busted.rate_classes - 1) { // not the last rate - busted.initial_grid [_name_] = { - { - 0.01, 0.1, 0.25, 0.75 - } - }["_MATRIX_ELEMENT_VALUE_^(busted.rate_classes-_index_[0]-1)"]; - busted.initial_grid_presets [_name_] = 0; - - if (busted.error_sink && _index_[0] == 0) { - busted.initial_ranges [_name_] = { - terms.lower_bound : 100, - terms.upper_bound : 10000 - }; - } else { - busted.initial_ranges [_name_] = { - terms.lower_bound : 0, - terms.upper_bound : 1 - }; +function busted.init_grid_setup (omega_distro, error_sink) { + for (_index_,_name_; in; 
omega_distro[terms.parameters.rates]) { + if (_index_ < busted.rate_classes - 1) { // not the last rate + busted.initial_grid [_name_] = { + { + 0.01, 0.1, 0.25, 0.75 } - } else { - busted.initial_grid [_name_] = { - { - 1, 1.5, 2, 4, 10 - } + }["_MATRIX_ELEMENT_VALUE_^(busted.rate_classes-_index_-1)"]; + + busted.initial_grid_presets [_name_] = 0; + + if (error_sink && _index_ == 0) { + busted.initial_grid [_name_] = {{100,500,1000,5000}}; + busted.initial_ranges [_name_] = { + terms.lower_bound : 100, + terms.upper_bound : 10000 }; + } else { busted.initial_ranges [_name_] = { - terms.lower_bound : 1, - terms.upper_bound : 10 + terms.lower_bound : 0, + terms.upper_bound : 1 }; - busted.initial_grid_presets [_name_] = 2; } - ' - ); - - - utility.ForEachPair (omega_distro[terms.parameters.weights], "_index_", "_name_", - ' + } else { busted.initial_grid [_name_] = { { - 0.2, 0.5, 0.7, 0.8, 0.9 + 1, 1.5, 2, 4, 10 } }; - busted.initial_grid_presets [_name_] = 3; - if (busted.error_sink && _index_[0] == 0) { - busted.initial_ranges [_name_] = { - terms.lower_bound : 0, - terms.upper_bound : 0.01, - }; - } else { - busted.initial_ranges [_name_] = { - terms.lower_bound : 0, - terms.upper_bound : 1 - }; + busted.initial_ranges [_name_] = { + terms.lower_bound : 1, + terms.upper_bound : 10 + }; + busted.initial_grid_presets [_name_] = 2; + } + } + + + for (_index_, _name_; in; omega_distro[terms.parameters.weights]) { + busted.initial_grid [_name_] = { + { + 0.2, 0.5, 0.7, 0.8, 0.9 } - ' - ); + }; + busted.initial_grid_presets [_name_] = 3; + if (error_sink && _index_ == 0) { + busted.initial_grid [_name_] = { + { + 0, 0.001, 0.005, 0.1 + } + }; + busted.initial_ranges [_name_] = { + terms.lower_bound : 0, + terms.upper_bound : 0.01, + }; + } else { + busted.initial_ranges [_name_] = { + terms.lower_bound : 0, + terms.upper_bound : 1 + }; + } + } } //------------------------------------------------------------------------------ diff --git 
a/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf b/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf index 4336c3f32..b72e35469 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf @@ -254,6 +254,9 @@ for (s;in;TipName (efilter.T,-1)) { fprintf (efilter.path, ">", s, "\n", efilter.sequences[s], "\n"); } +if (efilter.input[terms.json.partition_count] == 1) { + fprintf (efilter.path, "\n", Format (efilter.T,1,0)); +} io.ReportProgressMessageMD ("efilter","results","\nMasked a total of **`efilter.total_filtered`** or `Format (efilter.total_filtered/efilter.input[terms.json.sequences]/efilter.input[terms.json.sites]*100,8,3)`% sites"); diff --git a/src/core/batchlan.cpp b/src/core/batchlan.cpp index c72d288c2..3c471bf60 100644 --- a/src/core/batchlan.cpp +++ b/src/core/batchlan.cpp @@ -3243,7 +3243,6 @@ void _ElementaryCommand::ExecuteCase52 (_ExecutionList& chain) { _Variable * category_names_id = CheckReceptacle(&rate_variable_names, __PRETTY_FUNCTION__); _Matrix* category_names = new _Matrix (1,1,false,true); - SetStatusLine ("Simulating Data"); { // lf must be deleted before the referenced datafilters _LikelihoodFunction lf (filter_specification, nil); @@ -3253,7 +3252,6 @@ void _ElementaryCommand::ExecuteCase52 (_ExecutionList& chain) { */ lf.Simulate (*sim_dataset, exclusions, category_values, category_names, root_states, do_internals?(main_file?&spool_file:&kEmptyString):nil); - SetStatusLine ("Idle"); } diff --git a/src/core/dataset.cpp b/src/core/dataset.cpp index 76222bf92..ebdd84684 100644 --- a/src/core/dataset.cpp +++ b/src/core/dataset.cpp @@ -141,9 +141,8 @@ BaseRef _DataSet::makeDynamic(void) const { //_______________________________________________________________________ void _DataSet::ResetIHelper(void) { - if (dsh && dsh->characterPositions.lLength == 256) - for (long k = 0; k < 256; k++) { - dsh->characterPositions.list_data[k] = -1; + if (dsh && 
dsh->characterPositions.lLength == 256) { + InitializeArray (dsh->characterPositions.list_data, 256, -1L); } } @@ -557,44 +556,27 @@ void _DataSet::Finalize(void) { void _DataSet::Compact(long index) { if (useHorizontalRep) { HandleApplicationError( - "Internal Error: _DataSet::Compact called with compact represntation", + "Internal Error: _DataSet::Compact called on a dataset already using a Compact representation", true); return; } - - _Site *tC = (_Site *)(*(_List *)this)(index); - if (tC->GetRefNo() != -1) + _Site *tC = (_Site *)GetItem (index); + if (tC->GetRefNo() != -1) { // take care of double referencing - { _Site *tCC = tC; - long lastRef, count = 0; + long lastRef, count = 0L; do { lastRef = tCC->GetRefNo(); count++; - tCC = (_Site *)(*(_List *)this)(tCC->GetRefNo()); + tCC = (_Site *)GetItem (tCC->GetRefNo()); } while (tCC->GetRefNo() != -1); - if (count > 1) { + + if (count > 1L) { theFrequencies[lastRef]++; } tC->SetRefNo(lastRef); } - /*if (tC->GetRefNo()==-1) - { - long f = dsh->incompletePatterns->Find(tC); - if (f >= 0) - { - f = dsh->incompletePatterns->GetXtra (f); - if (fClear(); - tC->SetRefNo(f); - } - else - tC->Finalize(); - } - }*/ } //_______________________________________________________________________ diff --git a/src/core/global_things.cpp b/src/core/global_things.cpp index 5509332a7..22a9d3046 100644 --- a/src/core/global_things.cpp +++ b/src/core/global_things.cpp @@ -347,9 +347,11 @@ namespace hy_global { hy_x_variable = nil; hy_n_variable = nil; pathNames.Clear(); +#ifdef _USE_EMSCRIPTEN_ BuiltInFunctions.Clear(); hyReservedWords.Clear(); UnOps.Clear(); +#endif } hy_scanf_last_file_path = kEmptyString; EnvVariableSet(random_seed, new _Constant (hy_random_seed), false); @@ -483,11 +485,13 @@ namespace hy_global { _HY_HBLCommandHelper.Clear(); _HY_ValidHBLExpressions.Clear(); listOfCompiledFormulae.Clear(); +#ifdef _USE_EMSCRIPTEN_ _hy_standard_library_paths.Clear(); _hy_standard_library_extensions.Clear(); 
availableTemplateFilesAbbreviations.Clear(); availableTemplateFiles.Clear(); availablePostProcessors.Clear(); +#endif #ifdef __HYPHYMPI__ @@ -829,6 +833,21 @@ namespace hy_global { theMessage << "MinGW ";// " & __MINGW32_VERSION; #endif #endif +#ifdef _SLKP_USE_ARM_NEON + theMessage << " ARM Neon SIMD"; +#endif +#ifdef _SLKP_USE_AVX_INTRINSICS + theMessage << " x86 AVX SIMD"; +#endif +#ifdef _SLKP_USE_FMA3_INTRINSICS + theMessage << " with FMA3"; +#endif +#ifdef _SLKP_USE_SSE_INTRINSICS + theMessage << " x86 SSE4 SIMD"; +#endif +#ifdef __ZLIB__ + theMessage << " zlib (v" << ZLIB_VERSION << ")"; +#endif return theMessage; } diff --git a/src/core/likefunc.cpp b/src/core/likefunc.cpp index a53cc7d83..6fa1571a3 100644 --- a/src/core/likefunc.cpp +++ b/src/core/likefunc.cpp @@ -10369,8 +10369,8 @@ void _LikelihoodFunction::StateCounter (long functionCallback) const { for (unsigned i = 0; i < this_site_count; i++) { for (unsigned long discrete_category_index = 0UL; - discrete_category_index < discrete_category_variables.lLength; - discrete_category_index++) { + discrete_category_index < discrete_category_variables.lLength; + discrete_category_index++) { _CategoryVariable* discrete_cat = GetIthCategoryVar(discrete_category_variables(discrete_category_index)); @@ -10396,8 +10396,6 @@ void _LikelihoodFunction::StateCounter (long functionCallback) const { while (good_sites < this_site_count) { - - if (category_simulation_mode != kLFSimulateCategoriesNone ) { for (unsigned long hmm_category_index =0UL; @@ -10517,9 +10515,11 @@ void _LikelihoodFunction::StateCounter (long functionCallback) const { } - target.ResetIHelper(); - for (unsigned long character_index = 0UL; character_index < sites_per_unit; character_index ++) { - target.Compact(site_offset_raw + leaf_count - sites_per_unit + character_index); + if (!target.InternalStorageMode()) { + target.ResetIHelper(); + for (unsigned long character_index = 0UL; character_index < sites_per_unit; character_index ++) { + 
target.Compact(site_offset_raw + leaf_count - sites_per_unit + character_index); + } } if (storeIntermediates ) { @@ -10529,9 +10529,11 @@ void _LikelihoodFunction::StateCounter (long functionCallback) const { for (unsigned long character_index = 0UL; character_index < sites_per_unit; character_index ++) { target.Write2Site (site_offset_raw + leaf_count - sites_per_unit + character_index, simulated_unit (character_index)); } - target.ResetIHelper(); - for (unsigned long character_index = 0UL; character_index < sites_per_unit; character_index ++) { - target.Compact(site_offset_raw + leaf_count - sites_per_unit + character_index); + if (!target.InternalStorageMode()) { + target.ResetIHelper(); + for (unsigned long character_index = 0UL; character_index < sites_per_unit; character_index ++) { + target.Compact(site_offset_raw + leaf_count - sites_per_unit + character_index); + } } } } else { From d265b1d6e43f2b7098a1fe0b1b186af51886b20d Mon Sep 17 00:00:00 2001 From: Sergei Date: Sat, 18 Nov 2023 09:41:48 -0500 Subject: [PATCH 11/14] aBSREL 2.5 --- .../SelectionAnalyses/aBSREL.bf | 154 ++++++++++++++++-- res/TemplateBatchFiles/libv3/all-terms.bf | 2 + 2 files changed, 138 insertions(+), 18 deletions(-) diff --git a/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf b/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf index 2678cb5b2..7ff8409e3 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf @@ -1,4 +1,4 @@ -RequireVersion ("2.5.8"); +RequireVersion ("2.5.57"); LoadFunctionLibrary("libv3/all-terms.bf"); // must be loaded before CF3x4 @@ -10,6 +10,7 @@ LoadFunctionLibrary("libv3/tasks/estimators.bf"); LoadFunctionLibrary("libv3/tasks/alignments.bf"); LoadFunctionLibrary("libv3/tasks/mpi.bf"); LoadFunctionLibrary("libv3/tasks/trees.bf"); +LoadFunctionLibrary("libv3/tasks/ancestral.bf"); LoadFunctionLibrary("libv3/models/rate_variation.bf"); LoadFunctionLibrary("modules/io_functions.ibf"); @@ -69,11 
+70,17 @@ absrel.analysis_description = {terms.io.info : "aBSREL (Adaptive branch-site ran uses an adaptive random effects branch-site model framework to test whether each branch has evolved under positive selection, using a procedure which infers an optimal number of rate categories per branch.", - terms.io.version : "2.3", - terms.io.reference : "Less Is More: An Adaptive Branch-Site Random Effects Model for Efficient Detection of Episodic Diversifying Selection (2015). Mol Biol Evol 32 (5): 1342-1353. v2.2 adds support for multiple-hit models. v2.3 adds support for SRV", + terms.io.version : "2.5", + terms.io.reference : " + Less Is More: An Adaptive Branch-Site Random Effects Model for Efficient Detection of Episodic Diversifying Selection (2015). + Mol Biol Evol 32 (5): 1342-1353. v2.2 adds support for multiple-hit models. + v2.3 adds support for SRV. + v2.5 adds support for ancestral state reconstruction, identification of sites contributing to selection signal, and some diagnostics. 
+ ", terms.io.authors : "Sergei L Kosakovsky Pond, Ben Murrell, Steven Weaver and Temple iGEM / UCSD viral evolution group", terms.io.contact : "spond@temple.edu", - terms.io.requirements : "in-frame codon alignment and a phylogenetic tree" + terms.io.requirements : "in-frame codon alignment and a phylogenetic tree", + terms.settings : {} }; @@ -119,6 +126,8 @@ absrel.multi_hit = io.SelectAnOption ({ }, "Include support for multiple nucleotide substitutions"); +selection.io.json_store_setting (absrel.json, "multiple-hit", absrel.multi_hit); + KeywordArgument ("srv", "Include synonymous rate variation", "No"); absrel.srv = io.SelectAnOption ( @@ -140,6 +149,7 @@ if (absrel.do_srv) { absrel.synonymous_rate_classes = io.PromptUser ("The number omega rate classes to include in the model", absrel.synonymous_rate_classes, 1, 10, TRUE); } +selection.io.json_store_setting (absrel.json, "srv", absrel.do_srv); KeywordArgument ("output", "Write the resulting JSON to this file (default is to save to the same path as the alignment file + 'ABSREL.json')", absrel.codon_data_info [terms.json.json]); @@ -554,6 +564,18 @@ io.ReportProgressMessageMD("absrel", "Full adaptive model", "* " + selection.io. 
selection.io.stopTimer (absrel.json [terms.json.timers], "Full adaptive model fitting"); + +absrel.json [terms.substitutions] = {}; + +absrel.ancestral_info = ancestral.build (absrel.likelihood_function_id,0,FALSE); +(absrel.json [terms.substitutions] )[0] = ancestral.ComputeCompressedSubstitutions (absrel.ancestral_info); +DeleteObject (absrel.ancestral_info); + +absrel.json [terms.json.site_log_likelihood] = { + terms.json.unconstrained : absrel.ComputeSiteLikelihoods (absrel.likelihood_function_id), + terms.json.tested : {} +}; + absrel._report_srv (absrel.full_model.fit, "Full adaptive model", absrel.likelihood_function_id); @@ -612,38 +634,41 @@ if (absrel.multi_hit == "Double") { "0": 35, "1": 10, "2": 20, - "3": 12, - "4": 20, + "3": 18, + "4": 12, "5": 20, + "6": 20, }, terms.number_precision : 2}; - fprintf (stdout, "\n", io.FormatTableRow ({{"Branch", "Rates", "Max. dN/dS", "2x rate", "Test LRT", "Uncorrected p-value"}}, absrel.testing_table.settings)); + fprintf (stdout, "\n", io.FormatTableRow ({{"Branch", "Rates", "Max. dN/dS", "Sites @ EBF>=100", "2x rate", "Test LRT", "Uncorrected p-value"}}, absrel.testing_table.settings)); } else { if (absrel.multi_hit == "Double+Triple") { absrel.testing_table.settings = {terms.table_options.header : TRUE, terms.table_options.column_widths: { "0": 35, "1": 10, "2": 20, - "3": 12, + "3" :18, "4": 12, - "5": 20, + "5": 12, "6": 20, + "7": 20, }, terms.number_precision : 2}; - fprintf (stdout, "\n", io.FormatTableRow ({{"Branch", "Rates", "Max. dN/dS", "2x rate","3x rate", "Test LRT", "Uncorrected p-value"}}, absrel.testing_table.settings)); + fprintf (stdout, "\n", io.FormatTableRow ({{"Branch", "Rates", "Max. 
dN/dS", "Sites @ EBF>=100", "2x rate","3x rate", "Test LRT", "Uncorrected p-value"}}, absrel.testing_table.settings)); } else { absrel.testing_table.settings = {terms.table_options.header : TRUE, terms.table_options.column_widths: { "0": 35, "1": 10, "2": 20, - "3": 20, + "3": 18, "4": 20, + "5": 20, }, terms.number_precision : 2}; - fprintf (stdout, "\n", io.FormatTableRow ({{"Branch", "Rates", "Max. dN/dS", "Test LRT", "Uncorrected p-value"}}, absrel.testing_table.settings)); + fprintf (stdout, "\n", io.FormatTableRow ({{"Branch", "Rates", "Max. dN/dS", "Sites @ EBF>=100", "Test LRT", "Uncorrected p-value"}}, absrel.testing_table.settings)); } } @@ -656,6 +681,8 @@ absrel.branch.lrt = {}; absrel.branch.delta = {}; absrel.branch.psi = {}; +absrel.branch_site_level_ER = {}; + for (absrel.branch_id = 0; absrel.branch_id < absrel.branch_count; absrel.branch_id += 1) { absrel.current_branch = absrel.names_sorted_by_length[absrel.branch_id]; @@ -670,17 +697,18 @@ for (absrel.branch_id = 0; absrel.branch_id < absrel.branch_count; absrel.branch } else { absrel.report.row [2] = ">1000 (" + Format (absrel.dn_ds.distro[absrel.branch.complexity[absrel.current_branch]-1][1]*100,5,2) + "%)"; } + absrel.report.row [3] = "N/A"; absrel.offset = 0; absrel.final_estimates = absrel.GetBranchEstimates(absrel.model_defintions [absrel.branch.complexity[absrel.current_branch]], absrel.tree_id, absrel.current_branch); if (absrel.multi_hit == "Double" || absrel.multi_hit == "Double+Triple") { - absrel.report.row [3] = (absrel.final_estimates[utility.getGlobalValue ("terms.parameters.multiple_hit_rate")])[utility.getGlobalValue ("terms.fit.MLE")]; + absrel.report.row [4] = (absrel.final_estimates[utility.getGlobalValue ("terms.parameters.multiple_hit_rate")])[utility.getGlobalValue ("terms.fit.MLE")]; absrel.branch.delta [absrel.current_branch] = absrel.report.row [3]; absrel.offset = 1; } if (absrel.multi_hit == "Double+Triple") { - absrel.report.row [4] = 
(absrel.final_estimates[utility.getGlobalValue ("terms.parameters.triple_hit_rate")])[utility.getGlobalValue ("terms.fit.MLE")]; + absrel.report.row [5] = (absrel.final_estimates[utility.getGlobalValue ("terms.parameters.triple_hit_rate")])[utility.getGlobalValue ("terms.fit.MLE")]; absrel.branch.psi [absrel.current_branch] = absrel.report.row [4]; absrel.offset += 1; } @@ -688,24 +716,70 @@ for (absrel.branch_id = 0; absrel.branch_id < absrel.branch_count; absrel.branch if ((absrel.selected_branches[0])[absrel.current_branch] == terms.tree_attributes.test) { if (absrel.dn_ds.distro [absrel.branch.complexity[absrel.current_branch]-1][0] > 1) { + + absrel.branch_mixtures = absrel.branch.GetMixtureParameters (absrel.model_defintions [absrel.branch.complexity[absrel.current_branch]], absrel.tree_id, absrel.current_branch); + + // stash the weights + if (None != absrel.branch_mixtures) { + absrel.mixture_stash = {}; + + LFCompute (^absrel.likelihood_function_id,LF_START_COMPUTE); + + absrel.cat_count = utility.Array1D (absrel.branch_mixtures) + 1; + absrel.weights = {1, absrel.cat_count - 1 }; + for (absrel.i, v; in; absrel.branch_mixtures) { + absrel.mixture_stash [v] = ^v; + absrel.weights[absrel.i] = ^v; + ^v = 0; + } + + absrel.weights = parameters.GetStickBreakingDistributionWeigths (absrel.weights); + + absrel.siteLL = absrel.ComputeSiteLikelihoods (absrel.likelihood_function_id); + absrel.all_siteLL = {absrel.cat_count, Columns (absrel.siteLL)}; + absrel.all_siteLL [absrel.cat_count-1][0] = absrel.siteLL; + + for (absrel.i = 1; absrel.i < absrel.cat_count; absrel.i += 1) { + if (absrel.i > 1) { + ^( absrel.branch_mixtures[absrel.i-1]) = 0; + } + ^( absrel.branch_mixtures[absrel.i-1]) = 1; + absrel.siteLL = absrel.ComputeSiteLikelihoods (absrel.likelihood_function_id); + absrel.all_siteLL [absrel.i-1][0] = absrel.siteLL; + } + for (p, v; in; absrel.mixture_stash) { + ^p = v; + } + + LFCompute (^absrel.likelihood_function_id,LF_DONE_COMPUTE); + + 
absrel.branch_site_level_ER [absrel.current_branch] = absrel.mixture_site_logl (absrel.all_siteLL,absrel.weights ); + + absrel.branch_site_level_BF = + ((absrel.compute_mixture_site_BF (absrel.branch_site_level_ER [absrel.current_branch], absrel.weights ))[absrel.cat_count-1][-1])["_MATRIX_ELEMENT_VALUE_>=100"]; + + absrel.report.row [3] = Format (absrel.branch_site_level_BF, 5, 0); + + } + absrel.branch.ConstrainForTesting (absrel.model_defintions [absrel.branch.complexity[absrel.current_branch]], absrel.tree_id, absrel.current_branch); Optimize (absrel.null.mles, ^absrel.likelihood_function_id , {"OPTIMIZATION_METHOD" : "coordinate-wise"} ); absrel.branch.test = absrel.ComputeLRT ( absrel.full_model.fit[terms.fit.log_likelihood], absrel.null.mles[1][0]); + ((absrel.json [terms.json.site_log_likelihood])[terms.json.tested])[ absrel.current_branch ] = absrel.ComputeSiteLikelihoods (absrel.likelihood_function_id); estimators.RestoreLFStateFromSnapshot (absrel.likelihood_function_id, absrel.full_model.mle_set); } else { absrel.branch.test = {terms.LRT : 0, terms.p_value : 1}; } absrel.branch.p_values [ absrel.current_branch ] = absrel.branch.test [terms.p_value]; absrel.branch.lrt [absrel.current_branch] = absrel.branch.test [terms.LRT]; - absrel.report.row [3+absrel.offset] = absrel.branch.test [terms.LRT]; - absrel.report.row [4+absrel.offset] = Format (absrel.branch.test [terms.p_value], 8, 5); + absrel.report.row [4+absrel.offset] = absrel.branch.test [terms.LRT]; + absrel.report.row [5+absrel.offset] = Format (absrel.branch.test [terms.p_value], 8, 5); } else { absrel.branch.lrt [absrel.current_branch] = None; absrel.branch.p_values [absrel.current_branch] = None; - absrel.report.row [3+absrel.offset] = "Not selected"; - absrel.report.row [4+absrel.offset] = "for testing"; + absrel.report.row [4+absrel.offset] = "Not selected"; + absrel.report.row [5+absrel.offset] = "for testing"; } fprintf (stdout, io.FormatTableRow (absrel.report.row, 
absrel.testing_table.settings)); @@ -857,8 +931,25 @@ lfunction absrel.branch.ConstrainForTesting (model, tree_id, branch_id) { bv, ''); } +} + +//------------------------------------------------------------------------------------------------------------------------ + +lfunction absrel.branch.GetMixtureParameters (model, tree_id, branch_id) { + component_count = model[utility.getGlobalValue ("terms.model.components")]; + local_parameters = (model[utility.getGlobalValue ("terms.parameters")])[utility.getGlobalValue ("terms.local")]; + if (component_count > 1) { + mp = {component_count - 1, 1}; + for (i = 0; i < component_count-1; i += 1) { + mp[i] = "`tree_id`.`branch_id`.`local_parameters[terms.AddCategory (utility.getGlobalValue('terms.mixture.mixture_aux_weight'), i+1)]`"; + } + return mp; + } else { + return None; + } } + //------------------------------------------------------------------------------------------------------------------------ lfunction absrel.PopulateInitialGrid (model, tree_id, branch_id, current_estimates) { @@ -1222,3 +1313,30 @@ function absrel._report_srv (absrel_model_fit, tag, lf_id) { (absrel.json [absrel.json.srv_posteriors]) = absrel.cmx_weighted $ absrel.column_weights; } } + +//------------------------------------------------------------------------------ +lfunction absrel.ComputeSiteLikelihoods (id) { + ConstructCategoryMatrix (sl, ^id, SITE_LOG_LIKELIHOODS); + return sl; +} + +//------------------------------------------------------------------------------ + +lfunction absrel.mixture_site_logl (ll, p) { + res = ll["0"]; + S = Columns (ll); + norm = {1,S}["Max(ll[-1][_MATRIX_ELEMENT_COLUMN_],0)"]; + scaled_ll = ll["Exp(_MATRIX_ELEMENT_VALUE_-norm[_MATRIX_ELEMENT_COLUMN_])*p[_MATRIX_ELEMENT_ROW_]"]; + + norm = p["1"]*scaled_ll; + ll = scaled_ll["_MATRIX_ELEMENT_VALUE_/norm[_MATRIX_ELEMENT_COLUMN_]"]; + return ll; +} + +//------------------------------------------------------------------------------ + +lfunction 
absrel.compute_mixture_site_BF (ll, p) { + prior_odds = p["_MATRIX_ELEMENT_VALUE_/(1-_MATRIX_ELEMENT_VALUE_)"]; + ll = ll["(_MATRIX_ELEMENT_VALUE_/(1-_MATRIX_ELEMENT_VALUE_))/prior_odds[_MATRIX_ELEMENT_ROW_]"]; + return ll; +} \ No newline at end of file diff --git a/res/TemplateBatchFiles/libv3/all-terms.bf b/res/TemplateBatchFiles/libv3/all-terms.bf index 91add912c..5d8459762 100644 --- a/res/TemplateBatchFiles/libv3/all-terms.bf +++ b/res/TemplateBatchFiles/libv3/all-terms.bf @@ -333,6 +333,8 @@ namespace terms{ proportion = "proportion"; positive = "positive test results"; imputed_states = "Imputed States"; + site_log_likelihood = "Site Log Likelihood"; + unconstrained = "unconstrained"; } From dd1d01af311a054f0cd204d85b9e095089e3cd6f Mon Sep 17 00:00:00 2001 From: Sergei Date: Mon, 27 Nov 2023 11:42:04 -0500 Subject: [PATCH 12/14] aBSREL 2.5 fixes --- .../SelectionAnalyses/aBSREL.bf | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf b/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf index 7ff8409e3..53270a7db 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf @@ -69,13 +69,11 @@ absrel.display_orders = {terms.original_name: -1, absrel.analysis_description = {terms.io.info : "aBSREL (Adaptive branch-site random effects likelihood) uses an adaptive random effects branch-site model framework to test whether each branch has evolved under positive selection, - using a procedure which infers an optimal number of rate categories per branch.", + using a procedure which infers an optimal number of rate categories per branch. v2.3 adds support for SRV. v2.5 adds support for ancestral state reconstruction, identification of sites contributing to selection signal, and some diagnostics. 
", terms.io.version : "2.5", terms.io.reference : " Less Is More: An Adaptive Branch-Site Random Effects Model for Efficient Detection of Episodic Diversifying Selection (2015). Mol Biol Evol 32 (5): 1342-1353. v2.2 adds support for multiple-hit models. - v2.3 adds support for SRV. - v2.5 adds support for ancestral state reconstruction, identification of sites contributing to selection signal, and some diagnostics. ", terms.io.authors : "Sergei L Kosakovsky Pond, Ben Murrell, Steven Weaver and Temple iGEM / UCSD viral evolution group", terms.io.contact : "spond@temple.edu", @@ -576,6 +574,7 @@ absrel.json [terms.json.site_log_likelihood] = { terms.json.tested : {} }; +absrel.distribution_for_json = {}; absrel._report_srv (absrel.full_model.fit, "Full adaptive model", absrel.likelihood_function_id); @@ -616,7 +615,7 @@ selection.io.json_store_lf (absrel.json, absrel.full_model.fit[terms.fit.log_likelihood], absrel.full_model.fit[terms.parameters] + 9 , absrel.codon_data_info[terms.data.sample_size], - {}, + absrel.distribution_for_json, absrel.display_orders[absrel.full_adaptive_model]); KeywordArgument ("save-fit", "Save full adaptive aBSREL model fit to this file (default is not to save)", "/dev/null"); @@ -785,6 +784,14 @@ for (absrel.branch_id = 0; absrel.branch_id < absrel.branch_count; absrel.branch fprintf (stdout, io.FormatTableRow (absrel.report.row, absrel.testing_table.settings)); } +selection.io.json_store_branch_attribute(absrel.json, + terms.posterior, + terms.json.branch_annotations, + -1, + 0, + absrel.branch_site_level_ER); + + if (Abs (absrel.branch.delta)) { selection.io.json_store_branch_attribute(absrel.json, terms.parameters.multiple_hit_rate, terms.json.branch_label, absrel.display_orders[terms.parameters.multiple_hit_rate], 0, From 0622be20992da72bc06897938dec759d64504e57 Mon Sep 17 00:00:00 2001 From: Sergei Date: Mon, 4 Dec 2023 17:08:40 -0500 Subject: [PATCH 13/14] Adding MSS-joint-fitter --- res/TemplateBatchFiles/MSS-joint-fitter.bf | 
332 +++++++++++++++++++++ 1 file changed, 332 insertions(+) create mode 100644 res/TemplateBatchFiles/MSS-joint-fitter.bf diff --git a/res/TemplateBatchFiles/MSS-joint-fitter.bf b/res/TemplateBatchFiles/MSS-joint-fitter.bf new file mode 100644 index 000000000..2c627a644 --- /dev/null +++ b/res/TemplateBatchFiles/MSS-joint-fitter.bf @@ -0,0 +1,332 @@ + +/* 1. SETUP +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +/* 1a. Initial Setup +------------------------------------------------------------------------------*/ +RequireVersion ("2.5.48"); + +LoadFunctionLibrary ("libv3/all-terms.bf"); +LoadFunctionLibrary ("libv3/convenience/regexp.bf"); +LoadFunctionLibrary ("libv3/convenience/math.bf"); +LoadFunctionLibrary ("libv3/IOFunctions.bf"); +LoadFunctionLibrary ("libv3/UtilityFunctions.bf"); +LoadFunctionLibrary ("libv3/tasks/trees.bf"); +LoadFunctionLibrary ("libv3/tasks/alignments.bf"); +LoadFunctionLibrary ("libv3/tasks/estimators.bf"); +LoadFunctionLibrary ("SelectionAnalyses/modules/io_functions.ibf"); +LoadFunctionLibrary ("libv3/tasks/mpi.bf"); +LoadFunctionLibrary ("libv3/convenience/random.bf"); +LoadFunctionLibrary("libv3/models/codon/MSS.bf"); + + + +mss_selector.analysisDescription = {terms.io.info : "mss_joint_fitter : Performs a joint MSS model fit to several genes jointly.", + terms.io.version : "0.0.1", + terms.io.reference : "TBD", + terms.io.authors : "Sergei L Kosakovsky Pond", + terms.io.contact : "spond@temple.edu", + terms.io.requirements : "A collection of coding alignments with trees in the corresponding alignment files " + }; + + +mss.json = { terms.json.analysis: mss_selector.analysisDescription, + terms.json.input: {}, + }; + + +utility.SetEnvVariable ("OPTIMIZE_SUMMATION_ORDER_PARTITION", 100); +// don't spend too much time optimizing column ordering. + +/* 1b. 
User Input and data load +------------------------------------------------------------------------------*/ +io.DisplayAnalysisBanner (mss_selector.analysisDescription); + +KeywordArgument ("filelist","List of files to include in this analysis"); +mss_selector.file_list = io.get_a_list_of_files(io.PromptUserForFilePathRead ("List of files to include in this analysis")); + +mss_selector.file_count = utility.Array1D (mss_selector.file_list); +io.CheckAssertion("mss_selector.file_count >= 1", "A non-empty file list is required"); + +io.ReportProgressMessageMD("mss", "data" , "* Loaded a list with **" + mss_selector.file_count + "** files"); + +KeywordArgument ("code", "Which genetic code should be used", "Universal"); +mss.genetic_code = alignments.LoadGeneticCode (None); + + +mss.file_records = {}; +mss.file_info = {}; +mss.tree_info = {}; +mss.file_prefix = {}; +mss.fits = {}; +mss.filters = {}; +mss.parameters = 0; +mss.baselineLL = 0; +mss.is_bic = TRUE; + +KeywordArgument ("output", "Write the resulting JSON to this file",None); +mss.json = io.ReadFromOrCreate ("Save the resulting JSON file to", mss.json); + +mss.json - terms.data.value; + +mss_selector.file_list = utility.DictToArray (mss_selector.file_list); + +mss_selector.queue = mpi.CreateQueue ( + { + "Headers" : {{"libv3/UtilityFunctions.bf","libv3/tasks/alignments.bf", + "libv3/tasks/trees.bf","SelectionAnalyses/modules/io_functions.ibf", + "libv3/tasks/estimators.bf","libv3/models/codon/MSS.bf"}}, + "Variables" : {{}} + } + ); + +function mss.handle_initial_fit (filepath, namespace, genetic_code, run_fit) { + + utility.ToggleEnvVariable ("GLOBAL_FPRINTF_REDIRECT", "/dev/null"); + ExecuteCommands (' + `namespace`.json = {}; + namespace `namespace` { + scaler_prefix = "MSS.scaler"; + KeywordArgument ("code", "Which genetic code should be used", "`genetic_code`"); + KeywordArgument ("alignment", "Load this alignment", "`filepath`"); + utility.ToggleEnvVariable ("ALWAYS_RELOAD_FUNCTION_LIBRARIES", TRUE); + 
LoadFunctionLibrary ("SelectionAnalyses/modules/shared-load-file.bf"); + utility.ToggleEnvVariable ("ALWAYS_RELOAD_FUNCTION_LIBRARIES", None); + + load_file ({utility.getGlobalValue("terms.prefix"): "`namespace`", + utility.getGlobalValue("terms.settings") : {utility.getGlobalValue("terms.settings.branch_selector") : "selection.io.SelectAllBranches"}}); + + if (^"`&run_fit`") { + doGTR ("`namespace`"); + doPartitionedMG ("`namespace`", FALSE); + } + + }; + '); + utility.ToggleEnvVariable ("GLOBAL_FPRINTF_REDIRECT", None); + return { + "path" : filepath, + "pt" : Eval (namespace + ".trees"), + "init" : Eval (namespace + ".partitioned_mg_results") + }; +} + +lfunction mss.store_initial_fit (node, result, arguments) { + + mss_selector.print_row = { + 5, + 1 + }; + + (^"mss.fits") [result["path"]] = result["init"]; + (^"mss.tree_info") [result["path"]] = result ["pt"]; + + ^"mss.parameters" += (result["init"])[^"terms.parameters"]; + ^"mss.baselineLL" += (result["init"])[^"terms.fit.log_likelihood"]; + //^"mss.sample_size" += (result["init"])[^"terms.data.sample_size"]; + mss_selector.print_row [0] = result["path"]; + info = (^"mss.file_records")[result["path"]]; + mss_selector.print_row [1] = info [^"terms.data.sequences"]; + mss_selector.print_row [2] = info [^"terms.data.sites"]; + + mss.T = 0; + for (mss.b,mss.bi; in; ((result["init"])[^"terms.branch_length"])["0"]) { + mss.T += mss.bi [^"terms.fit.MLE"]; + } + mss_selector.print_row [3] = Format (mss.T, 8, 3); + mss_selector.print_row [4] = Format ((result["init"])[^"terms.fit.log_likelihood"], 12, 4); + fprintf(stdout, io.FormatTableRow(mss_selector.print_row, ^"mss_selector.table_output_options")); + return TRUE; +} + + +mss_selector.table_output_options = { + utility.getGlobalValue("terms.table_options.header"): 1, + utility.getGlobalValue("terms.table_options.minimum_column_width"): 16, + utility.getGlobalValue("terms.table_options.align"): "left", + utility.getGlobalValue("terms.table_options.column_widths"): 
{ + "0": 120, + "1" :10, + "2" :10, + "3" :10, + "4" :15 + } + }; + +mss_selector.header = { + {"Filepath"} + {"Sequences"} + {"Sites"} + {"TreeLength"} + {"Log(L)"} + }; + + +ExecuteCommands ( "mss.codon_classes = model.codon.MSS.prompt_and_define (terms.global, mss.genetic_code[terms.code])", + {"--mss-type" : "SynREV"} + ); + +io.ReportProgressMessageMD("mss", "fit0" , "Individual file statistics and simple model fits\n"); + + +fprintf(stdout, io.FormatTableRow(mss_selector.header, mss_selector.table_output_options)); +mss_selector.table_output_options[utility.getGlobalValue("terms.table_options.header")] = FALSE; + +mss.path_ordering = {}; +mss.filter_names = {}; +mss.trees = {}; +mss.order2path = {}; + + + +for (mss.counter, mss_selector.path; in; mss_selector.file_list) { + //io.ReportProgressBar("", "Loading datafile " + (+mss.counter+1) + "/" + mss_selector.file_count); + + mss.namespace = "mss_" + mss.counter; + mss.handle_initial_fit (mss_selector.path, mss.namespace, mss.genetic_code[terms.id], FALSE); + mss.file_records [mss_selector.path] = ^"`mss.namespace`.codon_data_info"; + mss.sample_size += (^"`mss.namespace`.codon_data_info")[terms.data.sample_size]; + + + mpi.QueueJob (mss_selector.queue, "mss.handle_initial_fit", {"0" : mss_selector.path, + "1" : mss.namespace, + "2" : mss.genetic_code[terms.id], + "3" : TRUE}, + "mss.store_initial_fit"); + + + + mss.file_info [mss_selector.path] = ^"`mss.namespace`.json"; + mss.filters[mss_selector.path] = ^"`mss.namespace`.filter_names"; + mss.file_prefix [mss_selector.path] = mss.namespace; + mss.order2path [Abs (mss.path_ordering)] = mss_selector.path; + mss.path_ordering [mss_selector.path] = Abs (mss.path_ordering); + mss.filter_names + (^"`mss.namespace`.filter_names")[0]; +} + + + +//selection.io.getIC(fit[terms.fit.log_likelihood], fit[terms.parameters] + xp, ss) + +//io.ClearProgressBar(); +mpi.QueueComplete (mss_selector.queue); + +mss.baseline_AIC = mss.getIC (mss.baselineLL, mss.parameters, 
mss.sample_size, mss.is_bic); + +io.ReportProgressMessageMD("mss", "fit0done" , "Baseline model fit"); +io.ReportProgressMessageMD("mss", "fit0done", "**LogL = " + mss.baselineLL + "**" + ", **AIC-c = " + mss.baseline_AIC + "** \n"); + +mss.json ["baseline"] = {terms.json.log_likelihood : mss.baselineLL, terms.json.AICc : mss.baseline_AIC}; + +function mss.MSS_generator (type) { + model = Call ("models.codon.MSS.ModelDescription",type, mss.genetic_code[terms.code], mss.codon_classes); + return model; +} + +mss.tree_objects = {}; +mss.fit_objects = {}; +mss.lf_components = { + 2*mss_selector.file_count, + 1 + }; + +mss.lf_id = "MSS.COMPOSITE.LF"; +mss.model_map_overall = {}; + + + +for (mss.counter, mss_selector.path; in; mss_selector.file_list) { + mss.prefix = mss.file_prefix [mss_selector.path]; + + + mss.model_name = "`mss.prefix`.model"; + mss.tree_name = "`mss.prefix`.tree"; + mss.lf_components[2*mss.counter] = mss.filter_names[mss.counter]; + mss.lf_components[2*mss.counter+1] = mss.tree_name; + utility.ExecuteInGlobalNamespace ("`mss.model_name ` = 0"); + ^(mss.model_name) = model.generic.DefineModel("mss.MSS_generator", mss.prefix + ".model_object", { + "0": "terms.global" + }, mss.filter_names[mss.counter], None); + + mss.model_map_overall [mss.prefix + ".model_object"] = ^(mss.model_name); + + mss.model_map = { mss.prefix + ".model_object" : ^(mss.model_name)}; + + model.ApplyModelToTree(mss.lf_components[2 * mss.counter + 1], (mss.tree_info[mss_selector.path])[0], { + "default": ^(mss.model_name) + }, None); + + + initial_values = mss.fits[mss_selector.path]; + + for (mss.p; in; estimators.GetGlobalMLE_RegExp( initial_values, terms.parameters.omega_ratio)) { + (initial_values[terms.global])[terms.parameters.omega_ratio] = mss.p; + } + + mss.model_map["estimators.SetCategory"][""]; + estimators.ApplyExistingEstimates.set_globals = {}; + mss.model_map["estimators.SetGlobals"][""]; + mss.constraint_count = estimators.ApplyExistingEstimatesToTree 
(mss.lf_components[2 * mss.counter + 1], + mss.model_map, + (initial_values [terms.branch_length])[0], + terms.model.branch_length_constrain, + TRUE); + + models.FixParameterSetRegExp (terms.parameters.omega_ratio, ^(mss.model_name)); + models.FixParameterSetRegExp (terms.nucleotideRatePrefix, ^(mss.model_name)); + + + if (mss.counter == 0) { + mss.reference_set = ^(mss.model_name); + mss.mss_rate_list = model.GetParameters_RegExp( ^(mss.model_name),"^" + terms.parameters.synonymous_rate + ""); + mss.model_dimension = utility.Array1D (mss.mss_rate_list); + mss.scaler_prefix = 'mss.scaler_parameter_'; + mss.scaler_mapping = {}; + mss.scaler_index = { mss.model_dimension , 1}; + for (mss.i, mss.v; in; mss.mss_rate_list) { + mss.scaler_mapping [mss.i] = Abs (mss.scaler_mapping); + mss.scaler_index [ mss.scaler_mapping [mss.i]] = mss.v; + } + for (mss.i2 = 0; mss.i2 < mss.model_dimension; mss.i2 += 1) { + parameters.DeclareGlobal (mss.scaler_prefix + mss.i2, None); + } + + mss.json ["mapping"] = mss.scaler_index; + } else { + //utility.getGlobalValue ("terms.parameters.synonymous_rate"); + models.BindGlobalParameters ({"0" : mss.reference_set , "1" : ^(mss.model_name)}, terms.parameters.synonymous_rate + terms.model.MSS.syn_rate_between); + models.BindGlobalParameters ({"0" : mss.reference_set , "1" : ^(mss.model_name)}, terms.parameters.synonymous_rate + terms.model.MSS.syn_rate_within); + } +} + + +utility.ExecuteInGlobalNamespace ("LikelihoodFunction `mss.lf_id` = (`&mss.lf_components`)"); + +VERBOSITY_LEVEL = 1; +USE_LAST_RESULTS = 1; +Optimize (res, ^mss.lf_id); + +mss.joint_AIC = mss.getIC (res[1][0], mss.parameters + res[1][1], mss.sample_size, mss.is_bic); + +mss.json ["joint"] = {terms.json.log_likelihood : res[1][0], terms.json.AICc : mss.joint_AIC}; + + +io.ReportProgressMessageMD("mss", "fitAlldone" , "Joint model fit"); +io.ReportProgressMessageMD("mss", "fitAlldone", "**LogL = " + res[1][0] + "**" + ", **AIC-c = " + mss.joint_AIC + "** \n"); + 
+mss.json["joint-model"] = estimators.ExtractMLEsOptions (mss.lf_id, mss.model_map_overall, {terms.globals_only : TRUE}); +//estimators.RemoveBranchLengthConstraints (mss.json["joint-model"]); + +io.SpoolJSON (mss.json, mss.json[terms.data.file]); + +KeywordArgument ("save-fit", "Write the resulting model fit file to this (large!) file", "/dev/null"); +io.SpoolLFToPath(mss.lf_id, io.PromptUserForFilePath ("Save model fit to this file ['/dev/null' to skip]")); + +//------------------------------------------------------------------------------------------------------------------------ + +lfunction mss.getIC(logl, params, samples, is_bic) { + if (is_bic) { + return -2 * logl + Log (samples) * params; + } + return -2 * logl + 2 * samples / (samples - params - 1) * params; +} From 1bb5353533f789b313a61ab5dc5ed81afcc085e7 Mon Sep 17 00:00:00 2001 From: Sergei Date: Tue, 12 Dec 2023 10:31:23 -0500 Subject: [PATCH 14/14] BF updates --- .../SelectionAnalyses/RELAX.bf | 20 ++++++++++++++++--- .../SelectionAnalyses/aBSREL.bf | 8 ++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf b/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf index 3a68b4a5d..fcb572573 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf @@ -43,19 +43,21 @@ relax.analysis_description = { Version 3.1 adds LHC + Nedler-Mead initial fit phase and keyword support. Version 3.1.1 adds some bug fixes for better convergence. Version 4.0 adds support for synonymous rate variation. - Version 4.1 adds further support for multiple hit models", + Version 4.1 adds further support for multiple hit models. + Version 4.1.1 adds reporting for convergence diagnostics", terms.io.version : "3.1.1", terms.io.reference : "RELAX: Detecting Relaxed Selection in a Phylogenetic Framework (2015). 
Mol Biol Evol 32 (3): 820-832", terms.io.authors : "Sergei L Kosakovsky Pond, Ben Murrell, Steven Weaver and Temple iGEM / UCSD viral evolution g", terms.io.contact : "spond@temple.edu", - terms.io.requirements : "in-frame codon alignment and a phylogenetic tree, with at least two groups of branches defined using the {} notation (one group can be defined as all unlabeled branches)" + terms.io.requirements : "in-frame codon alignment and a phylogenetic tree, with at least two groups of branches defined using the {} notation (one group can be defined as all unlabeled branches)", + terms.settings: {} }; relax.json = { terms.json.analysis: relax.analysis_description, terms.json.input: {}, terms.json.fits : {}, terms.json.timers : {}, - terms.json.test_results : {} + terms.json.test_results : {}, }; relax.relaxation_parameter = "relax.K"; @@ -1001,6 +1003,9 @@ function relax.FitMainTestPair (prompt) { if (Abs (relax.best_samples[1] - relax.best_samples[0]) < 5.) { // could be diagnostic of convergence problems + selection.io.json_store_setting (relax.json, "convergence-flat-surface", relax.best_samples[1] - relax.best_samples[0]); + + io.ReportProgressMessageMD("RELAX", "alt-2", "* Potential convergence issues due to flat likelihood surfaces; checking to see whether K > 1 or K < 1 is robustly inferred"); if (relax.fitted.K > 1) { parameters.SetRange (model.generic.GetGlobalParameter (relax.model_object_map ["relax.test"] , terms.relax.k), terms.range01); @@ -1022,6 +1027,8 @@ function relax.FitMainTestPair (prompt) { if (relax.alternative_model.fit.take2 [terms.fit.log_likelihood] > relax.alternative_model.fit[terms.fit.log_likelihood]) { + relax.stash_fitted_K = relax.fitted.K; + io.ReportProgressMessageMD("RELAX", "alt-2", "\n### Potential for highly unreliable K inference due to multiple local maxima in the likelihood function, treat results with caution "); io.ReportProgressMessageMD("RELAX", "alt-2", "> Relaxation parameter reset to opposite mode of evolution from 
that obtained in the initial optimization."); @@ -1032,6 +1039,11 @@ function relax.FitMainTestPair (prompt) { relax.inferred_distribution = parameters.GetStickBreakingDistribution (models.codon.BS_REL.ExtractMixtureDistribution (relax.model_object_map ["relax.test"])) % 0; selection.io.report_dnds (relax.inferred_distribution); + selection.io.json_store_setting (relax.json, "convergence-unstable", { + {relax.fitted.K, relax.alternative_model.fit.take2 [terms.fit.log_likelihood]} + {relax.stash_fitted_K, relax.alternative_model.fit [terms.fit.log_likelihood]} + + }); io.ReportProgressMessageMD("RELAX", "alt-2", "* The following rate distribution was inferred for **reference** branches"); relax.inferred_distribution_ref = parameters.GetStickBreakingDistribution (models.codon.BS_REL.ExtractMixtureDistribution (relax.model_object_map ["relax.reference"])) % 0; @@ -1182,6 +1194,8 @@ do { if (relax.LRT [terms.LRT] < 0) { io.ReportProgressMessageMD("RELAX", "refit", "* Detected convergence issues (negative LRT). 
Refitting the alterative/null model pair from a new starting point"); + selection.io.json_store_setting (relax.json, "convergence-negative-lrt", relax.loop_passes); + relax.general_descriptive.fit = relax.null_model.fit; // reset initial conditions for (relax.k = 1; relax.k < relax.numbers_of_tested_groups; relax.k += 1) { // remove constraints on K relax.model_nmsp = relax.model_namespaces[relax.k ]; diff --git a/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf b/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf index 53270a7db..12f6beef9 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf @@ -69,11 +69,11 @@ absrel.display_orders = {terms.original_name: -1, absrel.analysis_description = {terms.io.info : "aBSREL (Adaptive branch-site random effects likelihood) uses an adaptive random effects branch-site model framework to test whether each branch has evolved under positive selection, - using a procedure which infers an optimal number of rate categories per branch. v2.3 adds support for SRV. v2.5 adds support for ancestral state reconstruction, identification of sites contributing to selection signal, and some diagnostics. ", + using a procedure which infers an optimal number of rate categories per branch. v2.2 adds support for multiple-hit models. v2.3 adds support for SRV. v2.5 adds support for ancestral state reconstruction, identification of sites contributing to selection signal, and some diagnostics. ", terms.io.version : "2.5", terms.io.reference : " Less Is More: An Adaptive Branch-Site Random Effects Model for Efficient Detection of Episodic Diversifying Selection (2015). - Mol Biol Evol 32 (5): 1342-1353. v2.2 adds support for multiple-hit models. + Mol Biol Evol 32 (5): 1342-1353. 
", terms.io.authors : "Sergei L Kosakovsky Pond, Ben Murrell, Steven Weaver and Temple iGEM / UCSD viral evolution group", terms.io.contact : "spond@temple.edu", @@ -703,12 +703,12 @@ for (absrel.branch_id = 0; absrel.branch_id < absrel.branch_count; absrel.branch if (absrel.multi_hit == "Double" || absrel.multi_hit == "Double+Triple") { absrel.report.row [4] = (absrel.final_estimates[utility.getGlobalValue ("terms.parameters.multiple_hit_rate")])[utility.getGlobalValue ("terms.fit.MLE")]; - absrel.branch.delta [absrel.current_branch] = absrel.report.row [3]; + absrel.branch.delta [absrel.current_branch] = absrel.report.row [4]; absrel.offset = 1; } if (absrel.multi_hit == "Double+Triple") { absrel.report.row [5] = (absrel.final_estimates[utility.getGlobalValue ("terms.parameters.triple_hit_rate")])[utility.getGlobalValue ("terms.fit.MLE")]; - absrel.branch.psi [absrel.current_branch] = absrel.report.row [4]; + absrel.branch.psi [absrel.current_branch] = absrel.report.row [5]; absrel.offset += 1; }