diff --git a/ext/meryl/.gitmodules b/ext/meryl/.gitmodules new file mode 100644 index 0000000..680f532 --- /dev/null +++ b/ext/meryl/.gitmodules @@ -0,0 +1,3 @@ +[submodule "src/utility"] + path = src/utility + url = https://github.com/marbl/meryl-utility diff --git a/ext/meryl/scripts/buildRelease.sh b/ext/meryl/scripts/buildRelease.sh deleted file mode 100644 index 54ac6c0..0000000 --- a/ext/meryl/scripts/buildRelease.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/bin/sh -# -# Before building a release: -# -# Make a place to work, grab the bits you want to release: -# git clone git@github.com:marbl/meryl meryl-release -# cd meryl-release -# -# Commit to master: -# Increase version in documentation/source/conf.py (not present in meryl) -# Increase version in scripts/version_update.pl -# -# Build. This pulls in submodule code. This build isn't used for release -# and can be deleted or aborted (once submodules are populated). -# cd src && gmake -# -# Tag the next release development -# git tag -a v1.3-development -m "Development for v1.3." -# git push --follow-tags -# -# Make a branch: -# git checkout -b v1.2-maintenance -# -# Commit to branch: -# Change 'snapshot' to 'release' in scripts/version_update.pl -# git push --set-upstream origin v1.2-maintenance -# -# Run this script: -# scripts/buildRelease.sh 1.2 -# - -version=$1 - -if [ x$version = x ] ; then - echo usage: $0 numeric-version - exit -fi - -# -# Cleanup any old build, make space for the new one, and initialize scripts. -# - -if [ -e .git ] ; then - echo Moving .git directory out of the way. - mv .git dot-git-directory -fi - -echo Preparing build trees. - -rm -rf build -rm -rf build-darwin build-darwin.out -rm -rf build-linux build-linux.out -rm -rf build-src - -rm -f build-linux.sh - -rm -f meryl-${version}.Darwin-amd64.tar meryl-${version}.Darwin-amd64.tar.xz -rm -f meryl-${version}.Linux-amd64.tar meryl-${version}.Linux-amd64.tar.xz -rm -f meryl-${version}.tar meryl-${version}.tar.xz - -mkdir -p build-src/scripts -mkdir -p build-darwin/scripts -mkdir -p build-linux/scripts - -rsync -a src/ build-src/src -rsync -a src/ build-darwin/src -rsync -a src/ build-linux/src - -cp -p README* build-src/ -cp -p README* build-darwin/ -cp -p README* build-linux/ - -cp -p scripts/version_update.pl build-src/scripts/ -cp -p scripts/version_update.pl build-darwin/scripts/ -cp -p scripts/version_update.pl build-linux/scripts/ - -echo >> build-linux.sh "#!/bin/bash" -echo >> build-linux.sh "" -echo >> build-linux.sh "rm -rf /dock/build" -echo >> build-linux.sh "cd /dock/src" -echo >> build-linux.sh "gmake -j 12 > ../build-linux.out 2>&1" -echo >> build-linux.sh "cd .." -echo >> build-linux.sh "" -echo >> build-linux.sh "mv build/* build-linux/" -echo >> build-linux.sh "" -echo >> build-linux.sh "rm -rf build-darwin/obj" -echo >> build-linux.sh "rm -rf build-linux/obj" -echo >> build-linux.sh "" -echo >> build-linux.sh "mv build-darwin meryl-$version" -echo >> build-linux.sh "tar -cf meryl-$version.Darwin-amd64.tar meryl-$version/README* meryl-$version/bin meryl-$version/lib meryl-$version/share" -echo >> build-linux.sh "mv meryl-$version build-darwin" -echo >> build-linux.sh "" -echo >> build-linux.sh "mv build-linux meryl-$version" -echo >> build-linux.sh "tar -cf meryl-$version.Linux-amd64.tar meryl-$version/README* meryl-$version/bin meryl-$version/lib meryl-$version/share" -echo >> build-linux.sh "mv meryl-$version build-linux" -echo >> build-linux.sh "" -echo >> build-linux.sh "mv build-src meryl-$version" -echo >> build-linux.sh "tar -cf meryl-$version.tar meryl-$version/README* meryl-$version/src meryl-$version/scripts" -echo >> build-linux.sh "mv meryl-$version build-src" -echo >> build-linux.sh "" -echo >> build-linux.sh "" - -chmod 755 build-linux.sh - -# -# -# - -echo Build for MacOS. - -cd src -gmake -j 12 > ../build-darwin.out 2>&1 -cd .. - -mv build/* build-darwin/ - -echo Make static binaries for MacOS. - -cd build-darwin -python ../scripts/statifyOSX.py bin lib true true >> ../build-darwin.out 2>&1 -python ../scripts/statifyOSX.py lib lib true true >> ../build-darwin.out 2>&1 -cd .. - -# -# -# - -echo Build for Linux and make tarballs. - -echo \ -docker run -v `pwd`:/dock -t -i --rm phusion/holy-build-box-64:latest /hbb_exe/activate-exec bash /dock/build-linux.sh -docker run -v `pwd`:/dock -t -i --rm phusion/holy-build-box-64:latest /hbb_exe/activate-exec bash /dock/build-linux.sh - -# strip --only-keep-debug - -echo Compress. - -xz -9v meryl-$version.Darwin-amd64.tar -xz -9v meryl-$version.Linux-amd64.tar -xz -9v meryl-$version.tar - -if [ -e dot-git-directory ] ; then - echo Restoring .git directory. - mv dot-git-directory .git -fi - -exit diff --git a/ext/meryl/scripts/version_update.pl b/ext/meryl/scripts/version_update.pl index 9ec23af..a319137 100755 --- a/ext/meryl/scripts/version_update.pl +++ b/ext/meryl/scripts/version_update.pl @@ -39,7 +39,7 @@ my $label = "snapshot"; # If not 'release' print this in the version output. my $major = "1"; # Bump before release. -my $minor = "2"; # Bump before release. +my $minor = "0"; # Bump before release. my $branch = "master"; my $version = "v$major.$minor"; @@ -87,12 +87,7 @@ $version = "v$major.$minor"; } else { - $major = "0"; - $minor = "0"; - $commits = "0"; - $hash1 = $_; - - $version = "v$major.$minor"; + die "Failed to parse describe string '$_'.\n"; } } close(F); @@ -168,12 +163,14 @@ # Report what we found. This is really for the gmake output. if (defined($commits)) { - print "\$(info Building $label $version +$commits changes (r$revCount $hash1) ($dirty))\n"; + print STDERR "Building $label $version +$commits changes (r$revCount $hash1) ($dirty)\n"; foreach my $s (@submodules) { - print "\$(info \$(space) $s)\n"; + print STDERR " $s\n"; } + print STDERR "\n"; } else { - print "\$(info Building $label $version)\n"; + print STDERR "Building $label $version\n"; + print STDERR "\n"; } # Dump a new file, but don't overwrite the original. diff --git a/ext/meryl/src/9d8b34d1bec567d21facb6b745ad8933bbd9be5d.zip b/ext/meryl/src/9d8b34d1bec567d21facb6b745ad8933bbd9be5d.zip new file mode 100644 index 0000000..1f6449b Binary files /dev/null and b/ext/meryl/src/9d8b34d1bec567d21facb6b745ad8933bbd9be5d.zip differ diff --git a/ext/meryl/src/Makefile b/ext/meryl/src/Makefile index ea8d562..870edab 100644 --- a/ext/meryl/src/Makefile +++ b/ext/meryl/src/Makefile @@ -27,14 +27,7 @@ # instances of "$" within them need to be escaped with a second "$" to # accomodate the double expansion that occurs when eval is invoked. -# Before doing ANYTHING, initialize submodules...if the version of -# git is compatible. -gitv := $(shell git --version | cut -d\ -f 3 | cut -c 1) -ifeq (1, $(gitv)) - gitv := $(shell git --version | cut -d\ -f 3) - $(error git '$(shell which git)' version '$(gitv)' too old; at least version 2.12 is required) -endif - +# Before doing ANYTHING, initialize submodules. ifeq ($(wildcard utility/src/Makefile), ) $(info $(shell git submodule update --init utility)) $(info $(space)) @@ -350,9 +343,8 @@ DIR_STACK := INCDIRS := TGT_STACK := -# Discover our OS and architecture. These were previously used to set -# BUILD_DIR and TARGET_DIR to allow multi-platform builds. DESTDIR will do -# that for us too. +# Discover our OS and architecture. These are used to set the BUILD_DIR and TARGET_DIR to +# something more useful than 'build' and '.'. OSTYPE := $(shell echo `uname`) OSVERSION := $(shell echo `uname -r`) @@ -377,18 +369,6 @@ ifeq (${OSTYPE}, SunOS) endif endif -# Set paths for building and installing. If DESTDIR doesn't exist, use the -# directory just above us. - -ifeq "$(strip ${DESTDIR})" "" - BUILD_DIR := $(realpath ..)/build/obj - TARGET_DIR := $(realpath ..)/build -else - BUILD_DIR := $(DESTDIR)/canu/build/obj - TARGET_DIR := $(DESTDIR)/canu/build -endif - -# # Set compiler and flags based on discovered hardware # # By default, debug symbols are included in all builds (even optimized). @@ -405,6 +385,7 @@ endif # BUILDJEMALLOC will enable jemalloc library support. # + ifeq ($(origin CXXFLAGS), undefined) ifeq ($(BUILDOPTIMIZED), 1) else @@ -456,27 +437,44 @@ endif # So, we require gcc7 (from MacPorts) or gcc8 (from hommebrew). # # If from MacPorts: -# port install gcc9 -# port select gcc mp-gcc9 +# port install gcc7 +# port select gcc mp-gcc7 # # If CC is set to 'cc', the GNU make default, we'll automagically search for other -# versions and use those if found. +# versions and use those if found, preferring gcc7 over gcc8. # +# There' definitely a clever way to do this with 'foreach', but my Make is lacking. +# +ifeq (${OSTYPE}, Darwin) + ifeq ($(CC), cc) + CC7 := $(shell echo `which gcc-mp-7`) + CXX7 := $(shell echo `which g++-mp-7`) -define TEST_COMPILER - ifeq ($${CC}, cc) - CCTEST := $$(shell echo `which gcc-${1}`) - CXXTEST := $$(shell echo `which g++-${1}`) + ifdef CXX7 + CC := $(CC7) + CXX := $(CXX7) + endif + endif + + ifeq ($(CC), cc) + CC8 := $(shell echo `which gcc-7`) + CXX8 := $(shell echo `which g++-7`) - ifdef CXXTEST - CC := $${CCTEST} - CXX := $${CXXTEST} + ifdef CXX8 + CC := $(CC8) + CXX := $(CXX8) endif endif -endef -ifeq (${OSTYPE}, Darwin) - $(foreach suffix,mp-9 9 mp-8 8 mp-7 7,$(eval $(call TEST_COMPILER,${suffix}))) + ifeq ($(CC), cc) + CC8 := $(shell echo `which gcc-8`) + CXX8 := $(shell echo `which g++-8`) + + ifdef CXX8 + CC := $(CC8) + CXX := $(CXX8) + endif + endif ifneq ($(shell echo `$(CXX) --version 2>&1 | grep -c clang`), 0) CPATH := $(shell echo `which $(CXX)`) @@ -516,12 +514,9 @@ ifeq (${CANU_BUILD_ENV}, ports) else - # Ignore the gmake default 'c++' and force g++9. - ifeq ($(origin CXX), default) - CC = gcc9 - CXX = g++9 - CCLIB = -rpath /usr/local/lib/gcc9 - endif + CC ?= gcc6 + CXX ?= g++6 + CCLIB ?= -rpath /usr/local/lib/gcc6 # GCC CXXFLAGS += -I/usr/local/include -pthread -fopenmp -fPIC @@ -688,11 +683,11 @@ $(foreach TGT,${ALL_TGTS},\ # Makefile processed. Regenerate the version number file, make some # directories, and report that we're starting the build. -$(eval $(shell ../scripts/version_update.pl meryl utility/src/utility/version.H)) +$(shell ../scripts/version_update.pl meryl utility/src/utility/version.H) $(shell mkdir -p ${TARGET_DIR}/bin) -$(info For '${OSTYPE}' '${OSVERSION}' as '${MACHINETYPE}' into '${TARGET_DIR}/{bin,obj}'.) +$(info For '${OSTYPE}' '${OSVERSION}' as '${MACHINETYPE}' into '${DESTDIR}${PREFIX}/$(OSTYPE)-$(MACHINETYPE)/{bin,obj}'.) $(info Using '$(shell which ${CXX})' version '${GXX_VV}'.) ifneq ($(origin CXXFLAGSUSER), undefined) $(info Using user-supplied CXXFLAGS '${CXXFLAGSUSER}'.) diff --git a/ext/meryl/src/main.mk b/ext/meryl/src/main.mk index 2a2c826..23810c4 100644 --- a/ext/meryl/src/main.mk +++ b/ext/meryl/src/main.mk @@ -1,5 +1,29 @@ -MODULE := meryl + +# If 'make' isn't run from the root directory, we need to set these to +# point to the upper level build directory. + +ifeq "$(strip ${DESTDIR})" "" + DESTDIR := +endif + +ifeq "$(strip ${PREFIX})" "" + ifeq "$(strip ${DESTDIR})" "" + PREFIX := $(realpath ..) + else + PREFIX := /meryl + endif +endif + +ifeq "$(strip ${BUILD_DIR})" "" + BUILD_DIR := $(DESTDIR)$(PREFIX)/$(OSTYPE)-$(MACHINETYPE)/obj +endif + +ifeq "$(strip ${TARGET_DIR})" "" + TARGET_DIR := $(DESTDIR)$(PREFIX)/$(OSTYPE)-$(MACHINETYPE) +endif + TARGET := libmeryl.a + SOURCES := utility/src/utility/edlib.C \ \ utility/src/utility/files.C \ @@ -28,11 +52,11 @@ SOURCES := utility/src/utility/edlib.C \ utility/src/utility/kmers.C \ \ utility/src/utility/bits.C \ - utility/src/utility/bits-wordArray.C \ \ utility/src/utility/hexDump.C \ utility/src/utility/md5.C \ utility/src/utility/mt19937ar.C \ + utility/src/utility/objectStore.C \ utility/src/utility/speedCounter.C \ utility/src/utility/sweatShop.C \ \ @@ -59,12 +83,11 @@ SRC_INCDIRS := . \ utility SUBMAKEFILES := meryl/meryl.mk \ - meryl-analyze/meryl-analyze.mk \ meryl-simple/meryl-simple.mk \ meryl-import/meryl-import.mk \ - meryl-lookup/meryl-lookup.mk + meryl-lookup/meryl-lookup.mk \ + meryl-check/meryl-check.mk ifeq ($(BUILDTESTS), 1) -SUBMAKEFILES += tests/merylCountArrayTest.mk \ - tests/merylExactLookupTest.mk +SUBMAKEFILES += tests/merylCountArrayTest.mk endif diff --git a/ext/meryl/src/meryl-analyze/meryl-analyze.C b/ext/meryl/src/meryl-analyze/meryl-analyze.C index 3904139..9131543 100644 --- a/ext/meryl/src/meryl-analyze/meryl-analyze.C +++ b/ext/meryl/src/meryl-analyze/meryl-analyze.C @@ -52,8 +52,8 @@ public: void insert(V value) { if ((_minValue <= value) && (value <= _maxValue)) { - _smallestV = std::min(_smallestV, value); - _largestV = std::max(_largestV, value); + _smallestV = min(_smallestV, value); + _largestV = max(_largestV, value); _histo[value - _minValue]++; } @@ -107,8 +107,8 @@ public: void insert(V value) { if ((_minValue <= value) && (value <= _maxValue)) { - _smallestV = std::min(_smallestV, value); - _largestV = std::max(_largestV, value); + _smallestV = min(_smallestV, value); + _largestV = max(_largestV, value); _histo[value]++; } @@ -124,13 +124,13 @@ public: }; private: - V _minValue; // Minimum value we'll accept into the histogram - V _maxValue; + V _minValue; // Minimum value we'll accept into the histogram + V _maxValue; - V _smallestV; // Minimum value we have seen in the input data - V _largestV; + V _smallestV; // Minimum value we have seen in the input data + V _largestV; - std::map _histo; // Histogram data. + map _histo; // Histogram data. }; @@ -325,8 +325,8 @@ main(int argc, char **argv) { argc = AS_configure(argc, argv); - std::vector err; - int arg = 1; + vector err; + int arg = 1; while (arg < argc) { if (strcmp(argv[arg], "-mers") == 0) { inputDBname = argv[++arg]; diff --git a/ext/meryl/src/meryl-check/meryl-check.C b/ext/meryl/src/meryl-check/meryl-check.C new file mode 100644 index 0000000..b9a34a2 --- /dev/null +++ b/ext/meryl/src/meryl-check/meryl-check.C @@ -0,0 +1,174 @@ + +/****************************************************************************** + * + * This file is part of meryl, a genomic k-kmer counter with nice features. + * + * This software is based on: + * 'Canu' v2.0 (https://github.com/marbl/canu) + * which is based on: + * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) + * the 'kmer package' r1994 (http://kmer.sourceforge.net) + * + * Except as indicated otherwise, this is a 'United States Government Work', + * and is released in the public domain. + * + * File 'README.licenses' in the root directory of this distribution + * contains full conditions and disclaimers. + */ + +#include "runtime.H" + +#include "kmers.H" +#include "sequence.H" +#include "bits.H" + + + + +int +main(int argc, char **argv) { + char *inputSeqName = NULL; + char *inputDBname = NULL; + uint64 minV = 0; + uint64 maxV = UINT64_MAX; + uint32 threads = 1; + + argc = AS_configure(argc, argv); + + vector err; + int arg = 1; + while (arg < argc) { + if (strcmp(argv[arg], "-sequence") == 0) { // INPUT READS and RANGE TO PROCESS + inputSeqName = argv[++arg]; + + } else if (strcmp(argv[arg], "-mers") == 0) { + inputDBname = argv[++arg]; + + } else if (strcmp(argv[arg], "-min") == 0) { + minV = strtouint64(argv[++arg]); + + } else if (strcmp(argv[arg], "-max") == 0) { + maxV = strtouint64(argv[++arg]); + + } else if (strcmp(argv[arg], "-threads") == 0) { + threads = strtouint32(argv[++arg]); + + } else { + char *s = new char [1024]; + snprintf(s, 1024, "Unknown option '%s'.\n", argv[arg]); + err.push_back(s); + } + + arg++; + } + + if (inputSeqName == NULL) + err.push_back("No input sequences (-sequence) supplied.\n"); + if (inputDBname == NULL) + err.push_back("No query meryl database (-mers) supplied.\n"); + + if (err.size() > 0) { + fprintf(stderr, "usage: %s ...\n", argv[0]); + fprintf(stderr, "\n"); + + for (uint32 ii=0; ii check; + + // Open a database, load the kmers and values into 'check'. + + fprintf(stderr, "Open meryl database '%s'.\n", inputDBname); + merylFileReader *merylDB = new merylFileReader(inputDBname); + + fprintf(stderr, "Convert to lookup table.\n"); + //merylExactLookup *kmerLookup = new merylExactLookup(merylDB, minV, maxV); + + fprintf(stderr, "Create mapping to value.\n"); + uint64 nKmers = 0; + + while (merylDB->nextMer() == true) { + kmer kmer = merylDB->theFMer(); + uint32 value = merylDB->theValue(); + + check[kmer] = value; + + nKmers++; + + if ((nKmers % 100000) == 0) { + fprintf(stderr, "Loaded %li kmers.\n", nKmers); + } + } + + delete merylDB; + //delete kmerLookup; + + fprintf(stderr,"Loaded %lu kmers into check map of size %lu\n", nKmers, check.size()); + + // + + fprintf(stderr, "Stream kmers from '%s'.\n", inputSeqName); + + dnaSeqFile *seqFile = new dnaSeqFile(inputSeqName); + + { + uint32 nameMax = 0; + char *name = NULL; + uint64 seqLen = 0; + uint64 seqMax = 0; + char *seq = NULL; + uint8 *qlt = NULL; + + char fString[64]; + char rString[64]; + + while (seqFile->loadSequence(name, nameMax, seq, qlt, seqMax, seqLen)) { + kmerIterator kiter(seq, seqLen); + + while (kiter.nextMer()) { + kmer fMer = kiter.fmer(); + kmer rMer = kiter.rmer(); + uint64 value = 0; + + if (fMer < rMer) + value = check[fMer]--; + else + value = check[rMer]--; + + if (value == 0) + fprintf(stdout, "%s\t%s\t%s ZERO\n", + name, + kiter.fmer().toString(fString), + kiter.rmer().toString(rString)); + + } + } + + delete [] name; + delete [] seq; + delete [] qlt; + } + + delete seqFile; + + // Check that all values are zero. + + for (map::iterator it=check.begin(); it != check.end(); it++) { + kmer k = it->first; + uint32 v = it->second; + + if (v != 0) { + char kmerString[64]; + + fprintf(stderr, "%s\t%u\n", k.toString(kmerString), v); + } + } + + exit(0); +} diff --git a/ext/meryl/src/meryl-check/meryl-check.mk b/ext/meryl/src/meryl-check/meryl-check.mk new file mode 100644 index 0000000..73388ec --- /dev/null +++ b/ext/meryl/src/meryl-check/meryl-check.mk @@ -0,0 +1,20 @@ + +# If 'make' isn't run from the root directory, we need to set these to +# point to the upper level build directory. +ifeq "$(strip ${BUILD_DIR})" "" + BUILD_DIR := ../$(OSTYPE)-$(MACHINETYPE)/obj +endif +ifeq "$(strip ${TARGET_DIR})" "" + TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE) +endif + +TARGET := meryl-check +SOURCES := meryl-check.C \ + +SRC_INCDIRS := . ../utility/src/utility + +TGT_LDFLAGS := -L${TARGET_DIR}/lib +TGT_LDLIBS := -lmeryl +TGT_PREREQS := libmeryl.a + +SUBMAKEFILES := diff --git a/ext/meryl/src/meryl-import/meryl-import.C b/ext/meryl/src/meryl-import/meryl-import.C index 0b5d908..698eb9a 100644 --- a/ext/meryl/src/meryl-import/meryl-import.C +++ b/ext/meryl/src/meryl-import/meryl-import.C @@ -37,13 +37,13 @@ main(int argc, char **argv) { bool useC = true; bool useF = false; - uint32 threads = getMaxThreadsAllowed(); - //uint64 memory = 8; + uint32 threads = 1; + uint64 memory = 8; argc = AS_configure(argc, argv); - std::vector err; - int arg = 1; + vector err; + int arg = 1; while (arg < argc) { if (strcmp(argv[arg], "-kmers") == 0) { inputName = argv[++arg]; @@ -72,7 +72,7 @@ main(int argc, char **argv) { threads = strtouint32(argv[++arg]); } else if (strcmp(argv[arg], "-memory") == 0) { // Not implemented. If implemented, merylCountArray::initializeValues() - //memory = strtouint64(argv[++arg]); // needs to return a memory size, etc, etc. + memory = strtouint64(argv[++arg]); // needs to return a memory size, etc, etc. } else { char *s = new char [1024]; @@ -149,7 +149,7 @@ main(int argc, char **argv) { uint32 nPrefix = 1 << wPrefix; uint32 wData = 2 * kmerTiny::merSize() - wPrefix; - uint64 wDataMask = buildLowBitMask(wData); + uint64 wDataMask = uint64MASK(wData); // Open the input kmer file, allocate space for reading kmer lines. @@ -212,8 +212,8 @@ main(int argc, char **argv) { // And use it. - kmdata pp = (useF == true) ? ((kmdata)kmerF >> wData) : ((kmdata)kmerR >> wData); - kmdata mm = (useF == true) ? ((kmdata)kmerF & wDataMask) : ((kmdata)kmerR & wDataMask); + uint64 pp = (useF == true) ? ((uint64)kmerF >> wData) : ((uint64)kmerR >> wData); + uint64 mm = (useF == true) ? ((uint64)kmerF & wDataMask) : ((uint64)kmerR & wDataMask); assert(pp < nPrefix); diff --git a/ext/meryl/src/meryl-import/meryl-import.mk b/ext/meryl/src/meryl-import/meryl-import.mk index 1eaf170..99eb8cb 100644 --- a/ext/meryl/src/meryl-import/meryl-import.mk +++ b/ext/meryl/src/meryl-import/meryl-import.mk @@ -1,27 +1,21 @@ + +# If 'make' isn't run from the root directory, we need to set these to +# point to the upper level build directory. +ifeq "$(strip ${BUILD_DIR})" "" + BUILD_DIR := ../$(OSTYPE)-$(MACHINETYPE)/obj +endif +ifeq "$(strip ${TARGET_DIR})" "" + TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE) +endif + TARGET := meryl-import SOURCES := meryl-import.C \ ../meryl/merylCountArray.C -SRC_INCDIRS := . ../meryl - -# If we're part of Canu, build with canu support and use Canu's copy of -# meryl-utility. Otherwise, don't. -ifneq ($(wildcard stores/sqStore.H), ) - SRC_CXXFLAGS := -DCANU - SRC_INCDIRS += ../../../utility/src/utility ../../../stores - -# If we're part of something else, include the something else's -# utility directory. -else ifneq ($(wildcard meryl/src/meryl/meryl.C), ) - SRC_INCDIRS += ../../../utility/src/utility - -# Otherwise, we're building directly in the meryl repo. -else - SRC_INCDIRS += ../utility/src/utility - -endif +SRC_INCDIRS := . ../utility/src/utility ../meryl +TGT_LDFLAGS := -L${TARGET_DIR}/lib +TGT_LDLIBS := -lmeryl +TGT_PREREQS := libmeryl.a -TGT_LDFLAGS := -L${TARGET_DIR}/lib -TGT_LDLIBS := -l${MODULE} -TGT_PREREQS := lib${MODULE}.a +SUBMAKEFILES := diff --git a/ext/meryl/src/meryl-lookup/dump.C b/ext/meryl/src/meryl-lookup/dump.C deleted file mode 100644 index 30090c1..0000000 --- a/ext/meryl/src/meryl-lookup/dump.C +++ /dev/null @@ -1,148 +0,0 @@ - -/****************************************************************************** - * - * This file is part of meryl, a genomic k-kmer counter with nice features. - * - * This software is based on: - * 'Canu' v2.0 (https://github.com/marbl/canu) - * which is based on: - * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) - * the 'kmer package' r1994 (http://kmer.sourceforge.net) - * - * Except as indicated otherwise, this is a 'United States Government Work', - * and is released in the public domain. - * - * File 'README.licenses' in the root directory of this distribution - * contains full conditions and disclaimers. - */ - -#include "meryl-lookup.H" -#include "sweatShop.H" - - -class dumpInput { -public: - dumpInput() { - }; - ~dumpInput() { - delete [] fwd; - delete [] rev; - }; - - dnaSeq seq; - uint64 seqIdx; - - kmvalu *fwd = nullptr; - kmvalu *rev = nullptr; - - uint64 maxP; -}; - - - -static // (This really came from merfin) -void * -loadSequence(void *G) { - lookupGlobal *g = (lookupGlobal *)G; - dumpInput *s = new dumpInput; - - if (g->seqFile1->loadSequence(s->seq) == false) { - delete s; - return(nullptr); - } - - s->seqIdx = g->seqFile1->seqIdx(); - - return(s); -} - - - -static -void -processSequence(void *G, void *T, void *S) { - lookupGlobal *g = (lookupGlobal *)G; - dumpInput *s = (dumpInput *)S; - merylExactLookup *L = g->lookupDBs[0]; - - // Allocate and clear outputs. - - s->fwd = new kmvalu [s->seq.length()]; - s->rev = new kmvalu [s->seq.length()]; - - for (uint32 ii=0; iiseq.length(); ii++) - s->fwd[ii] = s->rev[ii] = 0; - - // Zip down all the kmers, saving the value of each. - - kmerIterator kiter(s->seq.bases(), s->seq.length()); - - while (kiter.nextMer()) { - uint64 p = kiter.bgnPosition(); - - s->fwd[p] = L->value(kiter.fmer()); - s->rev[p] = L->value(kiter.rmer()); - - s->maxP = p+1; - } - - // Release the memory use for storing the sequence. - - s->seq.releaseBases(); -} - - - -static -void -outputSequence(void *G, void *S) { - lookupGlobal *g = (lookupGlobal *)G; - dumpInput *s = (dumpInput *)S; - - // Allocate space for the output string. - - resizeArray(g->outstring, 0, g->outstringMax, strlen(s->seq.ident()) + 16 + 16 + 16, _raAct::doNothing); - - // Copy the sequence ident into the output strig. - - char *outptr = g->outstring; - - for (char const *x = s->seq.ident(); *x; ) - *outptr++ = *x++; - - *outptr++ = '\t'; - - // 'outptr' is now where we start adding new info for each kmer, - // and we output the string from 'outroot'. - - for (uint64 p=0; pmaxP; p++) { - char *t; - - if (s->fwd[p] + s->rev[p] == 0) - continue; - - t = toDec(s->seqIdx, outptr); *t++ = '\t'; - t = toDec(p, t); *t++ = '\t'; - t = toDec(s->fwd[p], t); *t++ = '\t'; - t = toDec(s->rev[p], t); *t++ = '\n'; *t = 0; - - fputs(g->outstring, g->outFile1->file()); - } - - delete s; -} - - - -void -dumpExistence(lookupGlobal *g) { - sweatShop *ss = new sweatShop(loadSequence, processSequence, outputSequence); - - ss->setLoaderQueueSize(4096); - ss->setNumberOfWorkers(omp_get_max_threads()); - ss->setWriterQueueSize(4096); - - ss->run(g, g->showProgress); - - delete ss; -} diff --git a/ext/meryl/src/meryl-lookup/existence.C b/ext/meryl/src/meryl-lookup/existence.C deleted file mode 100644 index bd90ef4..0000000 --- a/ext/meryl/src/meryl-lookup/existence.C +++ /dev/null @@ -1,144 +0,0 @@ - -/****************************************************************************** - * - * This file is part of meryl, a genomic k-kmer counter with nice features. - * - * This software is based on: - * 'Canu' v2.0 (https://github.com/marbl/canu) - * which is based on: - * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) - * the 'kmer package' r1994 (http://kmer.sourceforge.net) - * - * Except as indicated otherwise, this is a 'United States Government Work', - * and is released in the public domain. - * - * File 'README.licenses' in the root directory of this distribution - * contains full conditions and disclaimers. - */ - -#include "meryl-lookup.H" -#include "sweatShop.H" - - - -class existInput { -public: - existInput() { - }; - ~existInput() { - delete [] nFound; - }; - - dnaSeq seq; - - uint64 nTotal = 0; - uint64 *nFound = nullptr; -}; - - - -static // (This really came from merfin) -void * -loadSequence(void *G) { - lookupGlobal *g = (lookupGlobal *)G; - existInput *s = new existInput(); - - if (g->seqFile1->loadSequence(s->seq) == false) { - delete s; - return(nullptr); - } - - return(s); -} - - - -static -void -processSequence(void *G, void *T, void *S) { - lookupGlobal *g = (lookupGlobal *)G; - existInput *s = (existInput *)S; - int32 nIn = g->lookupDBs.size(); - - // Allocate and clear outputs. - - s->nTotal = 0; - s->nFound = new uint64 [nIn]; - - for (uint32 dd=0; ddnFound[dd] = 0; - - // Zip through the kmers, counting how many kmers we have and how many we - // found in each input. - - kmerIterator kiter(s->seq.bases(), s->seq.length()); - - while (kiter.nextMer()) { - s->nTotal++; - - for (uint32 dd=0; ddlookupDBs[dd]->value(kiter.fmer()) > 0) || - (g->lookupDBs[dd]->value(kiter.rmer()) > 0)) - s->nFound[dd]++; - } - } - - // Release the memory use for storing the sequence. - - s->seq.releaseBases(); -} - - - -static -void -outputSequence(void *G, void *S) { - lookupGlobal *g = (lookupGlobal *)G; - existInput *s = (existInput *)S; - int32 nIn = g->lookupDBs.size(); - - // Allocate space for the output string. - - resizeArray(g->outstring, 0, g->outstringMax, 16 + 16 * 2 * nIn, _raAct::doNothing); - - // Create the string. - - char *t = g->outstring; - - *t++ = '\t'; - t = toDec(s->nTotal, t); - - for (uint32 dd=0; ddlookupDBs[dd]->nKmers(), t); - - *t++ = '\t'; - t = toDec(s->nFound[dd], t); - } - - *t++ = '\n'; - *t = 0; - - // And output it. - - fputs(s->seq.ident(), g->outFile1->file()); - fputs(g->outstring, g->outFile1->file()); - - delete s; -} - - - - -void -reportExistence(lookupGlobal *g) { - sweatShop *ss = new sweatShop(loadSequence, processSequence, outputSequence); - - ss->setLoaderQueueSize(4096); - ss->setNumberOfWorkers(omp_get_max_threads()); - ss->setWriterQueueSize(4096); - - ss->run(g, g->showProgress); - - delete ss; -} diff --git a/ext/meryl/src/meryl-lookup/include-exclude.C b/ext/meryl/src/meryl-lookup/include-exclude.C deleted file mode 100644 index d2f9c56..0000000 --- a/ext/meryl/src/meryl-lookup/include-exclude.C +++ /dev/null @@ -1,144 +0,0 @@ - -/****************************************************************************** - * - * This file is part of meryl, a genomic k-kmer counter with nice features. - * - * This software is based on: - * 'Canu' v2.0 (https://github.com/marbl/canu) - * which is based on: - * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) - * the 'kmer package' r1994 (http://kmer.sourceforge.net) - * - * Except as indicated otherwise, this is a 'United States Government Work', - * and is released in the public domain. - * - * File 'README.licenses' in the root directory of this distribution - * contains full conditions and disclaimers. - */ - -#include "meryl-lookup.H" -#include "sweatShop.H" - - - -class filterInput { -public: - filterInput() { - }; - ~filterInput() { - }; - - dnaSeq seq1; - dnaSeq seq2; - - uint64 nTotal; - uint64 nFound; -}; - - - - -static -void * -loadSequence(void *G) { - lookupGlobal *g = (lookupGlobal *)G; - filterInput *s = new filterInput; - - bool load1 = (g->seqFile1 != nullptr) && (g->seqFile1->loadSequence(s->seq1) == true); - bool load2 = (g->seqFile2 != nullptr) && (g->seqFile2->loadSequence(s->seq2) == true); - - if ((load1 == false) && - (load2 == false)) { - delete s; - return(nullptr); - } - - return(s); -} - - - -static -uint64 -processSequence(merylExactLookup *L, dnaSeq &seq) { - kmerIterator kiter(seq.bases(), seq.length()); - uint64 found = 0; - - while (kiter.nextMer()) - if ((L->value(kiter.fmer()) > 0) || - (L->value(kiter.rmer()) > 0)) - found++; - - return(found); -} - - -static -void -processSequence(void *G, void *T, void *S) { - lookupGlobal *g = (lookupGlobal *)G; - filterInput *s = (filterInput *)S; - - // Count the number of kmers found in the database from either - // seq1 or seq2. - - s->nFound = processSequence(g->lookupDBs[0], s->seq1); - s->nFound += processSequence(g->lookupDBs[0], s->seq2); -} - - - -static -void -outputSequence(compressedFileWriter *O, - dnaSeq &seq, - uint64 nFound) { - - if (O == nullptr) - return; - - if (seq.quals()[0] == 0) fprintf(O->file(), ">%s nKmers=%lu\n%s\n", seq.ident(), nFound, seq.bases()); - else fprintf(O->file(), "@%s nKmers=%lu\n%s\n+\n%s\n", seq.ident(), nFound, seq.bases(), seq.quals()); -} - - -static -void -outputSequence(void *G, void *S) { - lookupGlobal *g = (lookupGlobal *)G; - filterInput *s = (filterInput *)S; - - g->nReadsTotal++; - - // Write output if: - // 'include' and mers found. - // 'exclude' and no mers found. - - if (((s->nFound > 0) && (g->reportType == lookupOp::opInclude)) || - ((s->nFound == 0) && (g->reportType == lookupOp::opExclude))) { - g->nReadsFound++; - - outputSequence(g->outFile1, s->seq1, s->nFound); - outputSequence(g->outFile2, s->seq2, s->nFound); - } - - delete s; -} - - - -void -filter(lookupGlobal *g) { - sweatShop *ss = new sweatShop(loadSequence, processSequence, outputSequence); - - ss->setLoaderQueueSize(omp_get_max_threads()); - ss->setNumberOfWorkers(omp_get_max_threads()); - ss->setWriterQueueSize(omp_get_max_threads()); - - ss->run(g, g->showProgress); - - delete ss; - - fprintf(stderr, "\nIncluding %lu reads (or read pairs) out of %lu.\n", g->nReadsTotal, g->nReadsFound); -} - diff --git a/ext/meryl/src/meryl-lookup/meryl-lookup.C b/ext/meryl/src/meryl-lookup/meryl-lookup.C index 51d4e45..5a60b5a 100644 --- a/ext/meryl/src/meryl-lookup/meryl-lookup.C +++ b/ext/meryl/src/meryl-lookup/meryl-lookup.C @@ -16,241 +16,273 @@ * contains full conditions and disclaimers. */ -#include "meryl-lookup.H" +#include "runtime.H" -void dumpExistence(lookupGlobal *G); -void reportExistence(lookupGlobal *G); -void filter(lookupGlobal *G); +#include "kmers.H" +#include "system.H" +#include "sequence.H" +#include "bits.H" -void -lookupGlobal::initialize(void) { - omp_set_num_threads(nThreads); -} +#define OP_NONE 0 +#define OP_DUMP 1 +#define OP_EXISTENCE 2 +#define OP_INCLUDE 3 +#define OP_EXCLUDE 4 void -lookupGlobal::loadLookupTables(void) { - std::vector merylDBs; // Input meryl database. - std::vector minMem; // Estimated min memory for lookup table. - std::vector optMem; // Estimated max memory for lookup table. - - // Open input meryl databases, initialize lookup. - - for (uint32 ii=0; ii &klookup, + vector &klabel) { - // Estimate memory needed for each lookup table. + // Build a list of labels for each database. If no labels are provided, + // this is just an empty string. - double minMemTotal = 0.0; - double optMemTotal = 0.0; + char **labels = new char * [klookup.size()]; - for (uint32 ii=0; iiestimateMemoryUsage(merylDBs[ii], maxMemory, minm, optm, minV, maxV); + // If we don't have the ll'th input label, make an empty string. - minMemTotal += minm; - optMemTotal += optm; - } - - // Use either the smallest or 'fastest' table, or fail, depending on how - // much memory the use lets us use. + if (klabel.size() <= ll) { + labels[ll] = new char [1]; + labels[ll][0] = 0; + continue; + } - bool useOpt = (optMemTotal <= maxMemory); - bool useMin = (minMemTotal <= maxMemory) && (useOpt == false); + // Otherwise, we have a label, so allocate space for a tab, a copy of + // the label, and a NUL byte, then create the string we'll output. - fprintf(stderr, "--\n"); - fprintf(stderr, "-- Minimal memory needed: %.3f GB%s\n", minMemTotal, (useMin) ? " enabled" : ""); - fprintf(stderr, "-- Optimal memory needed: %.3f GB%s\n", optMemTotal, (useOpt) ? " enabled" : ""); - fprintf(stderr, "-- Memory limit %.3f GB\n", maxMemory); - fprintf(stderr, "--\n"); + labels[ll] = new char [strlen(klabel[ll]) + 2]; - if ((useMin == false) && - (useOpt == false)) { - fprintf(stderr, "\n"); - fprintf(stderr, "Not enough memory to load databases. Increase -memory.\n"); - fprintf(stderr, "\n"); - exit(1); + labels[ll][0] = '\t'; + strcpy(labels[ll] + 1, klabel[ll]); } - if (doEstimate == true) { - fprintf(stderr, "-- Stopping after memory estimated reported; -estimate option enabled.\n"); - exit(0); + // Scan each sequence against each database. + + char fString[65]; + char rString[65]; + dnaSeq seq; + + for (uint32 seqId=0; sfile->loadSequence(seq); seqId++) { + kmerIterator kiter(seq.bases(), seq.length()); + + while (kiter.nextBase()) { + if (kiter.isValid() == false) { + fprintf(ofile->file(), "%s\t%u\t%lu\t%c\n", + seq.name(), + seqId, + kiter.position(), + kiter.isACGTbgn() ? 'n' : 'N'); + } + + else { + for (uint32 dd=0; ddexists(kiter.fmer(), fValue); + bool rExists = klookup[dd]->exists(kiter.rmer(), rValue); + + fprintf(ofile->file(), "%s\t%u\t%lu\t%c\t%s\t%lu\t%s\t%lu\t%s\n", + seq.name(), + seqId, + kiter.position(), + (fExists || rExists) ? 'T' : 'F', + kiter.fmer().toString(fString), fValue, + kiter.rmer().toString(rString), rValue, + labels[dd]); + } + } + } } +} - // Now load the data and forget about the input databases. - for (uint32 ii=0; iiload(merylDBs[ii], maxMemory, useMin, useOpt, minV, maxV) == false) - exit(1); +void +reportExistence(dnaSeqFile *sfile, + compressedFileWriter *ofile, + vector &klookup, + vector &klabel) { + dnaSeq seq; - delete merylDBs[ii]; + while (sfile->loadSequence(seq)) { + kmerIterator kiter(seq.bases(), seq.length()); + + uint64 nKmer = 0; + uint64 nKmerFound = 0; + + while (kiter.nextMer()) { + nKmer++; + + if ((klookup[0]->value(kiter.fmer()) > 0) || + (klookup[0]->value(kiter.rmer()) > 0)) + nKmerFound++; + } + + fprintf(ofile->file(), "%s\t%lu\t%lu\t%lu\n", seq.name(), nKmer, klookup[0]->nKmers(), nKmerFound); } } -// Open input sequences. void -lookupGlobal::openInputs(void) { +filter(dnaSeqFile *sfile1, + dnaSeqFile *sfile2, + compressedFileWriter *ofile1, + compressedFileWriter *ofile2, + vector &klookup, + bool outputIfFound) { - if (seqName1) { - fprintf(stderr, "-- Opening input sequences '%s'.\n", seqName1); - seqFile1 = new dnaSeqFile(seqName1); - } + // Do nothing if there are no sequences. - if (seqName2) { - fprintf(stderr, "-- Opening input sequences '%s'.\n", seqName2); - seqFile2 = new dnaSeqFile(seqName2); - } -} + if ((sfile1 == NULL) && (sfile2 == NULL)) + return; + // While we load sequences from all files supplied... + dnaSeq seq1; + dnaSeq seq2; -// Open output writers. -void -lookupGlobal::openOutputs(void) { + uint64 nReads = 0; + uint64 nReadsFound = 0; - if (outName1) { - fprintf(stderr, "-- Opening output file '%s'.\n", outName1); - outFile1 = new compressedFileWriter(outName1); - } + while (((sfile1 == NULL) || (sfile1->loadSequence(seq1))) && + ((sfile2 == NULL) || (sfile2->loadSequence(seq2)))) { + uint32 nKmerFound = 0; - if (outName2) { - fprintf(stderr, "-- Opening output file '%s'.\n", outName1); - outFile2 = new compressedFileWriter(outName2); - } -} + nReads++; + + if (seq1.length() > 0) { + kmerIterator kiter(seq1.bases(), seq1.length()); + + while (kiter.nextMer()) + if ((klookup[0]->value(kiter.fmer()) > 0) || + (klookup[0]->value(kiter.rmer()) > 0)) + nKmerFound++; + } + + if (seq2.length() > 0) { + kmerIterator kiter(seq2.bases(), seq2.length()); + while (kiter.nextMer()) + if ((klookup[0]->value(kiter.fmer()) > 0) || + (klookup[0]->value(kiter.rmer()) > 0)) + nKmerFound++; + } + + // Report the sequence if: + // any kmers are found and ifFound + // no kmers are found and not ifFound + + if ((nKmerFound > 0) == outputIfFound) { + nReadsFound++; + if (sfile1 != NULL) { + if (seq1.quals()[0] == 0) fprintf(ofile1->file(), ">%s nKmers=%u\n%s\n", seq1.name(), nKmerFound, seq1.bases()); + else fprintf(ofile1->file(), "@%s nKmers=%u\n%s\n+\n%s\n", seq1.name(), nKmerFound, seq1.bases(), seq1.quals()); + } + + if (sfile2 != NULL) { + if (seq2.quals()[0] == 0) fprintf(ofile2->file(), ">%s nKmers=%u\n%s\n", seq2.name(), nKmerFound, seq2.bases()); + else fprintf(ofile2->file(), "@%s nKmers=%u\n%s\n+\n%s\n", seq2.name(), nKmerFound, seq2.bases(), seq2.quals()); + } + } + } + fprintf(stderr, "\nIncluding %lu reads (or read pairs) out of %lu.\n", nReadsFound, nReads); +} int main(int argc, char **argv) { - lookupGlobal *G = new lookupGlobal; + char *seqName1 = NULL; + char *seqName2 = NULL; + + char *outName1 = NULL; + char *outName2 = NULL; + + vector inputDBname; + vector inputDBlabel; + + uint64 minV = 0; + uint64 maxV = UINT64_MAX; + uint32 threads = omp_get_max_threads(); + uint32 memory = 0; + uint32 reportType = OP_NONE; argc = AS_configure(argc, argv); - std::vector err; - for (int32 arg=1; arg < argc; arg++) { + vector err; + int arg = 1; + while (arg < argc) { if (strcmp(argv[arg], "-sequence") == 0) { - G->seqName1 = argv[++arg]; + seqName1 = argv[++arg]; if ((arg + 1 < argc) && (argv[arg + 1][0] != '-')) - G->seqName2 = argv[++arg]; + seqName2 = argv[++arg]; } else if (strcmp(argv[arg], "-mers") == 0) { while ((arg + 1 < argc) && (argv[arg + 1][0] != '-')) - G->lookupDBname.push_back(argv[++arg]); + inputDBname.push_back(argv[++arg]); } else if (strcmp(argv[arg], "-labels") == 0) { while ((arg + 1 < argc) && (argv[arg + 1][0] != '-')) - G->lookupDBlabel.push_back(argv[++arg]); + inputDBlabel.push_back(argv[++arg]); } else if (strcmp(argv[arg], "-output") == 0) { - G->outName1 = argv[++arg]; + outName1 = argv[++arg]; if ((arg + 1 < argc) && (argv[arg + 1][0] != '-')) - G->outName2 = argv[++arg]; + outName2 = argv[++arg]; } else if (strcmp(argv[arg], "-min") == 0) { - G->minV = (kmvalu)strtouint32(argv[++arg]); + minV = strtouint64(argv[++arg]); } else if (strcmp(argv[arg], "-max") == 0) { - G->maxV = (kmvalu)strtouint32(argv[++arg]); + maxV = strtouint64(argv[++arg]); } else if (strcmp(argv[arg], "-threads") == 0) { - G->nThreads = strtouint32(argv[++arg]); + threads = strtouint32(argv[++arg]); } else if (strcmp(argv[arg], "-memory") == 0) { - G->maxMemory = strtodouble(argv[++arg]); + memory = strtouint32(argv[++arg]); } else if (strcmp(argv[arg], "-dump") == 0) { - G->reportType = lookupOp::opDump; + reportType = OP_DUMP; } else if (strcmp(argv[arg], "-existence") == 0) { - G->reportType = lookupOp::opExistence; + reportType = OP_EXISTENCE; } else if (strcmp(argv[arg], "-include") == 0) { - G->reportType = lookupOp::opInclude; + reportType = OP_INCLUDE; } else if (strcmp(argv[arg], "-exclude") == 0) { - G->reportType = lookupOp::opExclude; - - } else if (strcmp(argv[arg], "-estimate") == 0) { - G->doEstimate = true; - - } else if (strcmp(argv[arg], "-V") == 0) { - G->showProgress = true; + reportType = OP_EXCLUDE; } else { char *s = new char [1024]; snprintf(s, 1024, "Unknown option '%s'.\n", argv[arg]); err.push_back(s); } - } - - // Check for invalid usage. - - if (G->reportType == lookupOp::opNone) { - err.push_back("No report-type (-existence, -dump, -include, -exclude) supplied.\n"); - } - - if (G->reportType == lookupOp::opDump) { - if (G->seqName1 == nullptr) err.push_back("No input sequences (-sequence) supplied.\n"); - if (G->seqName2 != nullptr) err.push_back("Only one input sequence (-sequence) supported for -dump.\n"); - - if (G->outName1 == nullptr) err.push_back("No output file (-output) supplied.\n"); - if (G->outName2 != nullptr) err.push_back("Only one output file (-output) supported for -dump.\n"); - if (G->lookupDBname.size() == 0) err.push_back("No meryl database (-mers) supplied.\n"); - if (G->lookupDBname.size() > 1) err.push_back("Only one meryl database (-mers) supported for -dump.\n"); - } - - if (G->reportType == lookupOp::opExistence) { - if (G->seqName1 == nullptr) err.push_back("No input sequences (-sequence) supplied.\n"); - if (G->seqName2 != nullptr) err.push_back("Only one input sequence (-sequence) supported for -existence.\n"); - - if (G->outName1 == nullptr) err.push_back("No output file (-output) supplied.\n"); - if (G->outName2 != nullptr) err.push_back("Only one output file (-output) supported for -existence.\n"); - - if (G->lookupDBname.size() == 0) err.push_back("No meryl database (-mers) supplied.\n"); - } - - if ((G->reportType == lookupOp::opInclude) || - (G->reportType == lookupOp::opExclude)) { - if (G->seqName1 == nullptr) err.push_back("No input sequences (-sequence) supplied.\n"); - if (G->outName1 == nullptr) err.push_back("No output file (-output) supplied.\n"); - - if ((G->seqName2 != nullptr) && - (G->outName2 == nullptr)) err.push_back("No second output file (-output) supplied for second input (-input) file.\n"); - - if ((G->seqName2 == nullptr) && - (G->outName2 != nullptr)) err.push_back("No second input file (-input) supplied for second output (-output) file.\n"); - - if (G->lookupDBname.size() == 0) err.push_back("No meryl database (-mers) supplied.\n"); - if (G->lookupDBname.size() > 1) err.push_back("Only one meryl database (-mers) supported for -include or -exclude.\n"); + arg++; } + if ((seqName1 == NULL) && (seqName2 == NULL)) + err.push_back("No input sequences (-sequence) supplied.\n"); + if (inputDBname.size() == 0) + err.push_back("No query meryl database (-mers) supplied.\n"); + if (reportType == OP_NONE) + err.push_back("No report-type (-existence, etc) supplied.\n"); if (err.size() > 0) { fprintf(stderr, "usage: %s \\\n", argv[0]); - fprintf(stderr, " [-estimate] \\\n"); fprintf(stderr, " -sequence [] \\\n"); fprintf(stderr, " -output []\n"); fprintf(stderr, " -mers [] [...] \\\n"); @@ -260,7 +292,7 @@ main(int argc, char **argv) { fprintf(stderr, "\n"); fprintf(stderr, " Multiple databases are supported.\n"); fprintf(stderr, "\n"); - fprintf(stderr, " Up to two input sequences are supported (only for -include / -exclude).\n"); + fprintf(stderr, " Up to two inptu sequences are supported (only for -include / -exclude).\n"); fprintf(stderr, "\n"); fprintf(stderr, " Input files can be FASTA or FASTQ; uncompressed, gz, bz2 or xz compressed\n"); fprintf(stderr, "\n"); @@ -278,25 +310,18 @@ main(int argc, char **argv) { fprintf(stderr, " exits with an error.\n"); fprintf(stderr, " -memory m Don't use more than m GB memory\n"); fprintf(stderr, "\n"); - fprintf(stderr, " If -estimate is supplied, processing will stop after a (quick) estimate\n"); - fprintf(stderr, " of memory needed to load the databases is written to stdout.\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "\n"); fprintf(stderr, " Exactly one report type must be specified.\n"); fprintf(stderr, "\n"); fprintf(stderr, " -existence"); fprintf(stderr, " Report a tab-delimited line for each sequence showing the number of kmers\n"); fprintf(stderr, " in the sequence, in the database, and in both.\n"); fprintf(stderr, "\n"); - fprintf(stderr, " Multiple input -mers may be supplied. If no output is supplied, output is written\n"); - fprintf(stderr, " to stdout.\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " output: seqName mersInSeq mersInDB1 mersInSeq&DB1 [ mersInDB2 mersInSeq&DB2 ... ]\n"); - fprintf(stderr, " seqName - name of the sequence\n"); - fprintf(stderr, " mersInSeq - number of mers in the sequence\n"); - fprintf(stderr, " mersInDB - number of mers in the meryl database\n"); - fprintf(stderr, " mersInSeq&DB - number of mers in the sequence that are\n"); - fprintf(stderr, " also in the database\n"); + fprintf(stderr, " output: seqName mersInSeq mersInDB mersInBoth\n"); + fprintf(stderr, " seqName - name of the sequence\n"); + fprintf(stderr, " mersInSeq - number of mers in the sequence\n"); + fprintf(stderr, " mersInDB - number of mers in the meryl database\n"); + fprintf(stderr, " mersInBoth - number of mers in the sequence that are\n"); + fprintf(stderr, " also in the database\n"); fprintf(stderr, "\n"); fprintf(stderr, "\n"); fprintf(stderr, " -dump\n"); @@ -304,9 +329,6 @@ main(int argc, char **argv) { fprintf(stderr, " order, annotated with the value of the kmer in the input database. If the kmer\n"); fprintf(stderr, " does not exist in the database its value will be reported as zero.\n"); fprintf(stderr, "\n"); - fprintf(stderr, " Only one input may be supplied. If no output is supplied, output is written\n"); - fprintf(stderr, " to stdout.\n"); - fprintf(stderr, "\n"); fprintf(stderr, " output: seqName seqId seqPos exists fwd-mer fwd-val rev-mer rev-val\n"); fprintf(stderr, " seqName - name of the sequence this kmer is from\n"); fprintf(stderr, " seqId - numeric version of the seqName (0-based)\n"); @@ -317,14 +339,14 @@ main(int argc, char **argv) { fprintf(stderr, " rev-mer - reverse mer sequence\n"); fprintf(stderr, " rev-val - value of the reverse mer in the database\n"); fprintf(stderr, "\n"); - fprintf(stderr, "\n"); fprintf(stderr, " -include / -exclude\n"); fprintf(stderr, " Extract sequences containing (-include) or not containing (-exclude) kmers in\n"); fprintf(stderr, " any input database. Output sequences are written in the same format as the input\n"); fprintf(stderr, " sequences, with the number of kmers found added to the name.\n"); fprintf(stderr, "\n"); - fprintf(stderr, " If two input files are supplied, the corresponding sequences are treated as a pair,\n"); - fprintf(stderr, " and two output files MUST be supplied.\n"); + fprintf(stderr, " If two input files are supplied, the corresponding sequences are treated as a pair.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "\n"); fprintf(stderr, "\n"); fprintf(stderr, " output: sequence given format (fasta or fastq) with the number of overlapping kmers appended\n"); fprintf(stderr, " if pairs of sequences are given, R1 will be stdout and R2 be named as \n"); @@ -332,6 +354,13 @@ main(int argc, char **argv) { fprintf(stderr, " seqName - name of the sequence this kmer is from\n"); fprintf(stderr, " mersInBoth - number of mers in both sequence and in the database\n"); fprintf(stderr, "\n"); + fprintf(stderr, " -exclude Extract sequences *NOT containing* kmers in .\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " output: sequence given format (fasta or fastq) without reads containing kmers\n"); + fprintf(stderr, " if pairs of sequences are given, R1 will be stdout and R2 be named as \n"); + fprintf(stderr, " will be automatically compressed if ends with .gz, .bz2, or xs\n"); + fprintf(stderr, " seqName - name of the sequence this kmer is from\n"); + fprintf(stderr, "\n"); for (uint32 ii=0; iiinitialize(); - G->loadLookupTables(); - G->openInputs(); - G->openOutputs(); - - switch (G->reportType) { - case lookupOp::opNone: break; - case lookupOp::opDump: dumpExistence(G); break; - case lookupOp::opExistence: reportExistence(G); break; - case lookupOp::opInclude: filter(G); break; - case lookupOp::opExclude: filter(G); break; - default: break; + omp_set_num_threads(threads); + + // Open the kmers, build a lookup table. + + vector kmerLookups; + + for (uint32 ii=0; iiconfigure() == false) + exit(1); + + kmerLookup->load(); + + delete merylDB; // Not needed anymore. } - delete G; - fprintf(stderr, "Bye!\n"); + // Open input sequences. - return(0); -} + dnaSeqFile *seqFile1 = NULL; + dnaSeqFile *seqFile2 = NULL; + + if (seqName1 != NULL) { + fprintf(stderr, "-- Opening sequences in '%s'.\n", seqName1); + + seqFile1 = new dnaSeqFile(seqName1); + } + + if (seqName2 != NULL) { + fprintf(stderr, "-- Opening sequences in '%s'.\n", seqName2); + + seqFile2 = new dnaSeqFile(seqName2); + } + + // Open output writers. + + compressedFileWriter *outFile1 = (outName1 == NULL) ? NULL : new compressedFileWriter(outName1); + compressedFileWriter *outFile2 = (outName2 == NULL) ? NULL : new compressedFileWriter(outName2); + + // Do something. + + if (reportType == OP_DUMP) + dumpExistence(seqFile1, outFile1, kmerLookups, inputDBlabel); + + if (reportType == OP_EXISTENCE) + reportExistence(seqFile1, outFile1, kmerLookups, inputDBlabel); + if (reportType == OP_INCLUDE) + filter(seqFile1, seqFile2, outFile1, outFile2, kmerLookups, true); + if (reportType == OP_EXCLUDE) + filter(seqFile1, seqFile2, outFile1, outFile2, kmerLookups, false); + // Done! + delete seqFile1; + delete seqFile2; + + delete outFile1; + delete outFile2; + + for (uint32 ii=0; ii lookupDBname; - std::vector lookupDBlabel; - std::vector lookupDBs; // Kmer lookup table. - - kmvalu minV = 0; - kmvalu maxV = kmvalumax; - - lookupOp reportType = lookupOp::opNone; - - bool doEstimate = false; - bool showProgress = false; - - // Outputs for existence. - - char *outstring = nullptr; - uint32 outstringMax = 0; - - // Outputs for include/exclude. - - uint64 nReadsTotal = 0; - uint64 nReadsFound = 0; - -}; - - - diff --git a/ext/meryl/src/meryl-lookup/meryl-lookup.mk b/ext/meryl/src/meryl-lookup/meryl-lookup.mk index d9b7967..5c7cfcf 100644 --- a/ext/meryl/src/meryl-lookup/meryl-lookup.mk +++ b/ext/meryl/src/meryl-lookup/meryl-lookup.mk @@ -1,29 +1,35 @@ + +# If 'make' isn't run from the root directory, we need to set these to +# point to the upper level build directory. +ifeq "$(strip ${BUILD_DIR})" "" + BUILD_DIR := ../$(OSTYPE)-$(MACHINETYPE)/obj +endif +ifeq "$(strip ${TARGET_DIR})" "" + TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE) +endif + TARGET := meryl-lookup -SOURCES := meryl-lookup.C \ - dump.C \ - existence.C \ - include-exclude.C +SOURCES := meryl-lookup.C -SRC_INCDIRS := . +SRC_INCDIRS := . ../utility/src/utility ../meryl + +# If we're part of Canu, build with canu support. +# Otherwise, don't. -# If we're part of Canu, build with canu support and use Canu's copy of -# meryl-utility. Otherwise, don't. ifneq ($(wildcard stores/sqStore.H), ) - SRC_CXXFLAGS := -DCANU - SRC_INCDIRS += ../../../utility/src/utility ../../../stores -# If we're part of something else, include the something else's -# utility directory. -else ifneq ($(wildcard meryl/src/meryl/meryl.C), ) - SRC_INCDIRS += ../../../utility/src/utility +SRC_CXXFLAGS := -DCANU + +TGT_LDFLAGS := -L${TARGET_DIR}/lib +TGT_LDLIBS := -lcanu +TGT_PREREQS := libcanu.a -# Otherwise, we're building directly in the meryl repo. else - SRC_INCDIRS += ../utility/src/utility -endif +TGT_LDFLAGS := -L${TARGET_DIR}/lib +TGT_LDLIBS := -lmeryl +TGT_PREREQS := libmeryl.a +endif -TGT_LDFLAGS := -L${TARGET_DIR}/lib -TGT_LDLIBS := -l${MODULE} -TGT_PREREQS := lib${MODULE}.a +SUBMAKEFILES := diff --git a/ext/meryl/src/meryl-simple/meryl-simple.C b/ext/meryl/src/meryl-simple/meryl-simple.C index a666aef..43ead0f 100644 --- a/ext/meryl/src/meryl-simple/meryl-simple.C +++ b/ext/meryl/src/meryl-simple/meryl-simple.C @@ -36,7 +36,7 @@ main(int argc, char **argv) { argc = AS_configure(argc, argv); - std::vector err; + vector err; for (int32 arg=1; arg < argc; arg++) { if (strcmp(argv[arg], "-k") == 0) { kSize = strtouint32(argv[++arg]); @@ -152,7 +152,7 @@ main(int argc, char **argv) { fprintf(stderr, "-- Sorting %lu kmers.\n", kmersLen); - std::sort(kmers, kmers + kmersLen); + sort(kmers, kmers + kmersLen); // Scan, count and output stuff. diff --git a/ext/meryl/src/meryl-simple/meryl-simple.mk b/ext/meryl/src/meryl-simple/meryl-simple.mk index a22a287..9e7e0f1 100644 --- a/ext/meryl/src/meryl-simple/meryl-simple.mk +++ b/ext/meryl/src/meryl-simple/meryl-simple.mk @@ -1,26 +1,37 @@ + +# If 'make' isn't run from the root directory, we need to set these to +# point to the upper level build directory. +ifeq "$(strip ${BUILD_DIR})" "" + BUILD_DIR := ../$(OSTYPE)-$(MACHINETYPE)/obj +endif +ifeq "$(strip ${TARGET_DIR})" "" + TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE) +endif + TARGET := meryl-simple SOURCES := meryl-simple.C -SRC_INCDIRS := . +SRC_INCDIRS := . ../utility/src/utility # If we're part of Canu, build with canu support and use Canu's copy of # meryl-utility. Otherwise, don't. + ifneq ($(wildcard stores/sqStore.H), ) - SRC_CXXFLAGS := -DCANU - SRC_INCDIRS := ../../../utility/src/utility ../../../stores -# If we're part of something else, include the something else's -# utility directory. -else ifneq ($(wildcard meryl/src/meryl/meryl.C), ) - SRC_INCDIRS := ../../../utility/src/utility +SRC_CXXFLAGS := -DCANU + +SRC_INCDIRS := . ../../../utility/src/utility ../../../stores + +TGT_LDFLAGS := -L${TARGET_DIR}/lib +TGT_LDLIBS := -lcanu +TGT_PREREQS := libcanu.a -# Otherwise, we're building directly in the meryl repo. else - SRC_INCDIRS := ../utility/src/utility -endif +TGT_LDFLAGS := -L${TARGET_DIR}/lib +TGT_LDLIBS := -lmeryl +TGT_PREREQS := libmeryl.a +endif -TGT_LDFLAGS := -L${TARGET_DIR}/lib -TGT_LDLIBS := -l${MODULE} -TGT_PREREQS := lib${MODULE}.a +SUBMAKEFILES := diff --git a/ext/meryl/src/meryl/meryl.C b/ext/meryl/src/meryl/meryl.C index 963fd5f..fd24e79 100644 --- a/ext/meryl/src/meryl/meryl.C +++ b/ext/meryl/src/meryl/meryl.C @@ -30,23 +30,23 @@ main(int argc, char **argv) { argc = AS_configure(argc, argv); - std::vector err; + vector err; for (int32 arg=1; arg < argc; arg++) { // // Scan for debug options and requests for help. // - if (strcmp(argv[arg], "dumpIndex") == 0) { // Report the index for the dataset. - arg++; // It's just the parameters used for encoding. - delete new merylFileReader(argv[arg++], true); // Expects a meryl db directory as a parameter. - exit(0); + if (strcmp(argv[arg], "dumpIndex") == 0) { // Report the index for the dataset. + arg++; // It's just the parameters used for encoding. + delete new merylFileReader(argv[arg++], true); + continue; } - if (strcmp(argv[arg], "dumpFile") == 0) { // Dump the index for a single data file. - arg++; // Expects a meryl file prefix as a parameter. - dumpMerylDataFile(argv[arg++]); // (e.g., db.meryl/0x000000) - exit(0); + if (strcmp(argv[arg], "dumpFile") == 0) { // Dump the index for a single data file. + arg++; + dumpMerylDataFile(argv[arg++]); + continue; } if ((strcmp(argv[arg], "-h") == 0) || @@ -103,8 +103,6 @@ main(int argc, char **argv) { fprintf(stderr, "\n"); fprintf(stderr, " COMMANDS:\n"); fprintf(stderr, "\n"); - fprintf(stderr, " statistics display total, unique, distnict, present number of the kmers on the screen. accepts exactly one input.\n"); - fprintf(stderr, " histogram display kmer frequency on the screen as 'frequencycount'. accepts exactly one input.\n"); fprintf(stderr, " print display kmers on the screen as 'kmercount'. accepts exactly one input.\n"); fprintf(stderr, "\n"); fprintf(stderr, " count Count the occurrences of canonical kmers in the input. must have 'output' specified.\n"); @@ -124,7 +122,6 @@ main(int argc, char **argv) { fprintf(stderr, " decrease X subtract X from the count of each kmer.\n"); fprintf(stderr, " multiply X multiply the count of each kmer by X.\n"); fprintf(stderr, " divide X divide the count of each kmer by X.\n"); - fprintf(stderr, " divide-round X divide the count of each kmer by X and round results. count < X will become 1.\n"); fprintf(stderr, " modulo X set the count of each kmer to the remainder of the count divided by X.\n"); fprintf(stderr, "\n"); fprintf(stderr, " union return kmers that occur in any input, set the count to the number of inputs with this kmer.\n"); @@ -137,8 +134,6 @@ main(int argc, char **argv) { fprintf(stderr, " intersect-max return kmers that occur in all inputs, set the count to the maximum count.\n"); fprintf(stderr, " intersect-sum return kmers that occur in all inputs, set the count to the sum of the counts.\n"); fprintf(stderr, "\n"); - fprintf(stderr, " subtract return kmers that occur in the first input, subtracting counts from the other inputs\n"); - fprintf(stderr, "\n"); fprintf(stderr, " difference return kmers that occur in the first input, but none of the other inputs\n"); fprintf(stderr, " symmetric-difference return kmers that occur in exactly one input\n"); fprintf(stderr, "\n"); @@ -241,7 +236,7 @@ main(int argc, char **argv) { continue; fprintf(stderr, "\n"); - fprintf(stderr, "PROCESSING TREE #%u using %u thread%s.\n", rr+1, getMaxThreadsAllowed(), getMaxThreadsAllowed() == 1 ? "" : "s"); + fprintf(stderr, "PROCESSING TREE #%u using %u thread%s.\n", rr+1, omp_get_max_threads(), omp_get_max_threads() == 1 ? "" : "s"); B->printTree(root, 2); #pragma omp parallel for schedule(dynamic, 1) diff --git a/ext/meryl/src/meryl/meryl.H b/ext/meryl/src/meryl/meryl.H index 7c0e2f9..cf0c7d5 100644 --- a/ext/meryl/src/meryl/meryl.H +++ b/ext/meryl/src/meryl/meryl.H @@ -28,6 +28,7 @@ #include #include #include +using namespace std; class merylCommandBuilder { @@ -45,7 +46,7 @@ public: bool isPrinter(void); bool isMerylInput(void); - bool isCanuInput(std::vector &err); + bool isCanuInput(vector &err); bool isSequenceInput(void); void finalize(void); @@ -97,11 +98,12 @@ private: // // _opList is a list of operations. - std::stack _opStack; - std::vector _opList; - merylOperation **_thList[64] = { nullptr }; // Mirrors opList + stack _opStack; + vector _opList; + merylOperation **_thList[64] = { nullptr }; // Mirrors opList + + vector _opRoot; - std::vector _opRoot; }; diff --git a/ext/meryl/src/meryl/meryl.mk b/ext/meryl/src/meryl/meryl.mk index f7541df..8aa77b9 100644 --- a/ext/meryl/src/meryl/meryl.mk +++ b/ext/meryl/src/meryl/meryl.mk @@ -1,3 +1,13 @@ + +# If 'make' isn't run from the root directory, we need to set these to +# point to the upper level build directory. +ifeq "$(strip ${BUILD_DIR})" "" + BUILD_DIR := ../$(OSTYPE)-$(MACHINETYPE)/obj +endif +ifeq "$(strip ${TARGET_DIR})" "" + TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE) +endif + TARGET := meryl SOURCES := meryl.C \ merylCommandBuilder.C \ @@ -10,26 +20,27 @@ SOURCES := meryl.C \ merylOp-nextMer.C \ merylOp.C -SRC_INCDIRS := . +SRC_INCDIRS := . ../utility/src/utility # If we're part of Canu, build with canu support and use Canu's copy of # meryl-utility. Otherwise, don't. + ifneq ($(wildcard stores/sqStore.H), ) - SRC_CXXFLAGS := -DCANU - SRC_INCDIRS := ../../../utility/src/utility ../../../stores -# If we're part of something else, include the something else's -# utility directory. -else ifneq ($(wildcard meryl/src/meryl/meryl.C), ) - SRC_INCDIRS := ../../../utility/src/utility +SRC_CXXFLAGS := -DCANU + +SRC_INCDIRS := . ../../../utility/src/utility ../../../stores + +TGT_LDFLAGS := -L${TARGET_DIR}/lib +TGT_LDLIBS := -lcanu +TGT_PREREQS := libcanu.a -# Otherwise, we're building directly in the meryl repo. else - SRC_INCDIRS := ../utility/src/utility -endif +TGT_LDFLAGS := -L${TARGET_DIR}/lib +TGT_LDLIBS := -lmeryl +TGT_PREREQS := libmeryl.a +endif -TGT_LDFLAGS := -L${TARGET_DIR}/lib -TGT_LDLIBS := -l${MODULE} -TGT_PREREQS := lib${MODULE}.a +SUBMAKEFILES := diff --git a/ext/meryl/src/meryl/merylCommandBuilder.C b/ext/meryl/src/meryl/merylCommandBuilder.C index 2755612..0523c54 100644 --- a/ext/meryl/src/meryl/merylCommandBuilder.C +++ b/ext/meryl/src/meryl/merylCommandBuilder.C @@ -57,8 +57,8 @@ isNumber(char *s, char dot='.') { // Everything is initialized in the declaration. Nothing really to do here. merylCommandBuilder::merylCommandBuilder() { - _allowedThreads = getMaxThreadsAllowed(); // Absolute maximum limits on - _allowedMemory = getMaxMemoryAllowed(); // memory= and threads= values + _allowedThreads = omp_get_max_threads(); // Absolute maximum limits on + _allowedMemory = getPhysicalMemorySize(); // memory= and threads= values. } @@ -155,7 +155,7 @@ merylCommandBuilder::initialize(char *opt) { // Save a few copies of the command line word. - strncpy(_inoutName, _optString, FILENAME_MAX + 1); + strncpy(_inoutName, _optString, FILENAME_MAX); snprintf(_indexName, FILENAME_MAX, "%s/merylIndex", _optString); snprintf(_sqInfName, FILENAME_MAX, "%s/info", _optString); @@ -214,22 +214,18 @@ merylCommandBuilder::processOptions(void) { return(true); } - // If the string is entirely a number, treat it as either a threshold or a - // constant, depending on the operation. This is used for things like - // "greater-than 45" and "divide 2". - // - // If there is no operation, or it doesn't want a number, we fall trhough - // and return 'false' when key/val is checked below. - - bool isNum = isNumber(_optString, 0); - - if ((_opStack.top()->needsThreshold() == true) && (isNum == true)) { - _opStack.top()->setThreshold(strtouint64(_optString)); + if (strncmp(_optString, "-E", 3) == 0) { +#warning "-E not implemented." + //findMaxInputSizeForMemorySize(strtouint32(argv[arg+1]), + // (uint64)(1000000000 * strtodouble(argv[arg+2]))); return(true); } - if ((_opStack.top()->needsConstant() == true) && (isNum == true)) { - _opStack.top()->setConstant(strtouint64(_optString)); + // If the string is entirely a number, treat it as a threshold. This is + // used for things like "greater-than 45". + + if (isNumber(_optString, 0)) { + _opStack.top()->setThreshold(strtouint64(_optString)); return(true); } @@ -354,7 +350,6 @@ merylCommandBuilder::processOperation(void) { else if (0 == strcmp(_optString, "decrease")) non = opDecrease; else if (0 == strcmp(_optString, "multiply")) non = opMultiply; else if (0 == strcmp(_optString, "divide")) non = opDivide; - else if (0 == strcmp(_optString, "divide-round")) non = opDivideRound; else if (0 == strcmp(_optString, "modulo")) non = opModulo; else if (0 == strcmp(_optString, "union")) non = opUnion; @@ -367,8 +362,6 @@ merylCommandBuilder::processOperation(void) { else if (0 == strcmp(_optString, "intersect-max")) non = opIntersectMax; else if (0 == strcmp(_optString, "intersect-sum")) non = opIntersectSum; - else if (0 == strcmp(_optString, "subtract")) non = opSubtract; - else if (0 == strcmp(_optString, "difference")) non = opDifference; else if (0 == strcmp(_optString, "symmetric-difference")) non = opSymmetricDifference; @@ -507,7 +500,7 @@ merylCommandBuilder::isMerylInput(void) { } bool -merylCommandBuilder::isCanuInput(std::vector &err) { +merylCommandBuilder::isCanuInput(vector &err) { if ((fileExists(_sqInfName) == false) || (fileExists(_sqRdsName) == false)) @@ -623,8 +616,6 @@ void merylCommandBuilder::spawnThreads(void) { uint32 indent = 0; - omp_set_num_threads(_allowedThreads); - for (uint32 tt=0; tt<64; tt++) { // Construct a list of operations for each thread. diff --git a/ext/meryl/src/meryl/merylCountArray.C b/ext/meryl/src/meryl/merylCountArray.C index 02beedf..e70a886 100644 --- a/ext/meryl/src/meryl/merylCountArray.C +++ b/ext/meryl/src/meryl/merylCountArray.C @@ -99,7 +99,7 @@ merylCountArray::merylCountArray(void) { uint64 -merylCountArray::initialize(uint64 prefix, uint32 width) { +merylCountArray::initialize(uint64 prefix, uint32 width, uint32 segsize) { _sWidth = width; _prefix = prefix; @@ -111,12 +111,9 @@ merylCountArray::initialize(uint64 prefix, uint32 width) { _bitsPerPage = getPageSize() * 8; _nReAlloc = 0; - // Set the segment size, in bits, to be a multiple of the page size. - // Reserve some space for OS allocator stuff (needs to be divisible by - // 64). - _segSize = pagesPerSegment() * _bitsPerPage - 8 * 64; - _segAlloc = 0; - _segments = NULL; + _segSize = 8 * (segsize * 1024 - 32); // Set the segment size to 'segsize' KB, + _segAlloc = 0; // in bits, reserving 32 bytes for + _segments = NULL; // allocator stuff that we don't control. _nBits = 0; _nBitsTrigger = 0; @@ -234,7 +231,7 @@ merylCountArray::removeSegments(void) { _nReAlloc = 0; _segAlloc = 0; // Don't forget to - _segments = NULL; // forget about it. + _segments = NULL; // foret about it. _nBits = 0; // Indicate that we've stored no data. _nBitsTrigger = 0; @@ -255,18 +252,21 @@ void merylCountArray::addSegment(uint32 seg) { if (_segAlloc == 0) { - resizeArray(_segments, _segAlloc, _segAlloc, 64, _raAct::copyData | _raAct::clearNew); + resizeArray(_segments, _segAlloc, _segAlloc, 32, resizeArray_copyData | resizeArray_clearNew); _nReAlloc++; } if (seg >= _segAlloc) { - resizeArray(_segments, _segAlloc, _segAlloc, 2 * _segAlloc, _raAct::copyData | _raAct::clearNew); + resizeArray(_segments, _segAlloc, _segAlloc, 2 * _segAlloc, resizeArray_copyData | resizeArray_clearNew); _nReAlloc++; } assert(_segments[seg] == NULL); + //if (seg > 0) + // fprintf(stderr, "Add segment %u\n", seg); + _segments[seg] = new uint64 [_segSize / 64]; - //memset(_segments[seg], 0, sizeof(uint64) * _segSize / 64); + memset(_segments[seg], 0, sizeof(uint64) * _segSize / 64); } @@ -496,7 +496,7 @@ merylCountArray::add(kmdata suffix) { uint64 seg = nBits / _segSize; // Which segment are we in? uint64 segPos = nBits % _segSize; // Bit position in that segment. - _nBits += _sWidth; + _nBits += _sWidth; // word position counts from high to low; 0 for the high bit and 63 for // the bit that represents integer 1. @@ -596,7 +596,7 @@ merylCountArray::add(kmdata suffix) { #endif _segments[seg][word+0] |= sta; - _segments[seg][word+1] = end; + _segments[seg][word+1] |= end; } if (thrWord) { @@ -618,8 +618,8 @@ merylCountArray::add(kmdata suffix) { #endif _segments[seg][word+0] |= sta; - _segments[seg][word+1] = mid; - _segments[seg][word+2] = end; + _segments[seg][word+1] |= mid; + _segments[seg][word+2] |= end; } } @@ -704,7 +704,7 @@ merylCountArray::add(kmdata suffix) { assert(word+1 == _segSize/64-1); _segments[seg+0][word+0] |= sta; - _segments[seg+0][word+1] = mid; + _segments[seg+0][word+1] |= mid; } // Move kmer bits to one or two words in the next segment. @@ -712,15 +712,15 @@ merylCountArray::add(kmdata suffix) { if (oneNext) { uint64 sta = (suffix << (64 - nextBits)); - _segments[seg+1][0] = sta; + _segments[seg+1][0] |= sta; } if (twoNext) { uint64 mid = (suffix >> (nextBits - 64)); uint64 end = (suffix << (128 - nextBits)); - _segments[seg+1][0] = mid; - _segments[seg+1][1] = end; + _segments[seg+1][0] |= mid; + _segments[seg+1][1] |= end; } } @@ -763,8 +763,8 @@ merylCountArray::get(uint64 kk) { // If the bits are entirely in one word, be done. - if (wordEnd <= 64) { - bits = (_segments[seg][word] >> (64 - wordEnd)) & buildLowBitMask(_sWidth); + if ((wordBgn >= 0) && (wordEnd <= 64)) { + bits = (_segments[seg][word] >> (64 - wordEnd)) & uint64MASK(_sWidth); return(bits); } diff --git a/ext/meryl/src/meryl/merylCountArray.H b/ext/meryl/src/meryl/merylCountArray.H index 52285cc..1a15ff6 100644 --- a/ext/meryl/src/meryl/merylCountArray.H +++ b/ext/meryl/src/meryl/merylCountArray.H @@ -52,7 +52,7 @@ public: merylCountArray(); ~merylCountArray(); - uint64 initialize(uint64 prefix, uint32 width); + uint64 initialize(uint64 prefix, uint32 width, uint32 segsize=64); uint64 initializeValues(kmvalu maxValue=0); @@ -98,50 +98,30 @@ public: public: - uint64 numBits(void) { return(_nBits); }; - uint64 numKmers(void) { return(_nBits / _sWidth); }; + uint64 numBits(void) { return(_nBits); }; + // Returns the number of bytes in pages touched by data in this object. + // It's a pretty tight bound. The extra 1 was added to make it + // be an overestimate of what 'top' is reporting. Without it, + // it underestimates by a significant amount. + // This is underestimating the actual resident memory usage. The constant + // +5 tries to adjust, but still underestimates on large data sets. -public: - // Using 1 here is probably not the most time efficient value, but the - // memory usage estimate seems to be the most accurate with it. - // - static - uint32 pagesPerSegment(void) { return(1); }; - - // We're doing accounting ourself instead of asking the OS for the current - // process size because some OSs (FreeBSD, probably MacOS) don't decrease - // the size and we need to when these tables are too full opposed to - // simply allocated. - // - // If the memset in merylCountArray() is enabled, this calculation does - // not represent the amount of resident memory. - // - // The number of pages used for data is complicated. We're allocating in - // blocks of pagesPerSegment() but reserving a few words for OS overhead. - // - // fSegmsUsed: the number of full segments allocated, _segSize bits in each. - // pPagesUsed: the leftover bits, plus one partial page used - // uint64 usedSize(void) { - uint64 memUsed = 0; - - uint64 fSegmsUsed = _nBits / _segSize; - uint64 pPagesUsed = (_nBits - fSegmsUsed * _segSize) / _bitsPerPage + 1; + uint64 fullSegs = (_nBits / _segSize); // Number of fully filled segments + uint64 partSeg = (_nBits % _segSize) + 64; // Number of bites (rounded to next word) in the last (partially filled) segment - uint64 pagesUsed = fSegmsUsed * pagesPerSegment() + pPagesUsed; + uint64 pagesUsed = 0; - memUsed += sizeof(merylCountArray); // For our metadata - memUsed += pagesUsed * _bitsPerPage / 8; // For the packed kmer data - memUsed += _segAlloc * sizeof(uint64 **); // For pointers to segments + pagesUsed += fullSegs * (_segSize / _bitsPerPage) + fullSegs * (((_segSize % _bitsPerPage) == 0) ? 0 : 1); + pagesUsed += (partSeg / _bitsPerPage) + (((partSeg % _bitsPerPage) == 0) ? 0 : 1); + pagesUsed += 5; + pagesUsed += _nReAlloc; - return(memUsed); + return(pagesUsed * _bitsPerPage / 8 + _segAlloc * sizeof(uint64 **) + sizeof(merylCountArray)); }; - // Returns the change in size since the last call, but sets a threshold so - // we don't spend a bunch of time calling usedSize(). - // uint64 usedSizeDelta(void) { if (_nBits < _nBitsTrigger) @@ -180,7 +160,6 @@ private: uint64 _bitsPerPage; uint64 _nReAlloc; -private: uint32 _segSize; // Number of bits in each segment. uint32 _segAlloc; // Number of segments we're allowed to allocate (size of the array below). uint64 **_segments; // An array of blocks of data. diff --git a/ext/meryl/src/meryl/merylInput.H b/ext/meryl/src/meryl/merylInput.H index c8fecc1..29076c5 100644 --- a/ext/meryl/src/meryl/merylInput.H +++ b/ext/meryl/src/meryl/merylInput.H @@ -72,9 +72,6 @@ public: bool isMultiSet(void) { return(_isMultiSet); }; - bool isCompressedFile(void) { return((_sequence != NULL) && - (_sequence->isCompressed())); }; - merylOperation *_operation; merylFileReader *_stream; dnaSeqFile *_sequence; diff --git a/ext/meryl/src/meryl/merylOp-count.C b/ext/meryl/src/meryl/merylOp-count.C index af63d88..59e1df1 100644 --- a/ext/meryl/src/meryl/merylOp-count.C +++ b/ext/meryl/src/meryl/merylOp-count.C @@ -20,6 +20,10 @@ #include "strings.H" #include "system.H" +// The number of KB to use for a merylCountArray segment. +#define SEGMENT_SIZE 64 +#define SEGMENT_SIZE_BITS (SEGMENT_SIZE * 1024 * 8) + // // mcaSize = sizeof(merylCountArray) == 80 @@ -46,7 +50,7 @@ // // (nKmers / nPrefix+1) / mersPerSeg = (memory - mcaSize * nPrefix) / (ptrSize * nPrefix + segSize * nPrefix) // nKmers / nPrefix+1 = mersPerSeg * (memory - mcaSize * nPrefix) / (ptrSize * nPrefix + segSize * nPrefix) -#if 0 + uint64 findMaxInputSizeForMemorySize(uint32 merSize, uint64 memSize) { uint64 mcaSize = sizeof(merylCountArray); @@ -108,7 +112,7 @@ findMaxInputSizeForMemorySize(uint32 merSize, uint64 memSize) { exit(0); } -#endif + @@ -166,18 +170,12 @@ findExpectedSimpleSize(uint64 nKmerEstimate, -// Returns bestPrefix_ and memoryUsed_ corresponding to the minimal memory -// estimate for the supplied nKmerEstimate. If no estimate is below -// memoryAllowed, 0 and UINT64_MAX, respectively, are returned. -// void findBestPrefixSize(uint64 nKmerEstimate, uint64 memoryAllowed, uint32 &bestPrefix_, uint64 &memoryUsed_) { - uint32 merSize = kmerTiny::merSize(); - uint32 segSizeBits = merylCountArray::pagesPerSegment() * getPageSize() * 8; - uint32 segSizeBytes = merylCountArray::pagesPerSegment() * getPageSize(); + uint32 merSize = kmerTiny::merSize(); bestPrefix_ = 0; memoryUsed_ = UINT64_MAX; @@ -195,18 +193,18 @@ findBestPrefixSize(uint64 nKmerEstimate, // we end up with a prefix or a suffix of size zero. for (uint32 wp=1; wp < 2 * merSize - 1; wp++) { - uint64 nPrefix = (uint64)1 << wp; // Number of prefix == number of blocks of data - uint64 kmersPerPrefix = nKmerEstimate / nPrefix + 1; // Expected number of kmers we need to store per prefix - uint64 kmersPerSeg = segSizeBits / (2 * merSize - wp); // Kmers per segment - uint64 segsPerPrefix = kmersPerPrefix / kmersPerSeg + 1; // + uint64 nPrefix = (uint64)1 << wp; // Number of prefix == number of blocks of data + uint64 kmersPerPrefix = nKmerEstimate / nPrefix + 1; // Expected number of kmers we need to store per prefix + uint64 kmersPerSeg = SEGMENT_SIZE_BITS / (2 * merSize - wp); // Kmers per segment + uint64 segsPerPrefix = kmersPerPrefix / kmersPerSeg + 1; // - if (wp + countNumberOfBits64(segsPerPrefix) + countNumberOfBits64(segSizeBytes) >= 64) + if (wp + countNumberOfBits64(segsPerPrefix) + countNumberOfBits64(SEGMENT_SIZE) + 10 >= 64) break; // Otherwise, dataMemory overflows. uint64 structMemory = ((sizeof(merylCountArray) * nPrefix) + // Basic structs (sizeof(uint64 *) * nPrefix * segsPerPrefix)); // Pointers to segments - uint64 dataMemoryMin = nPrefix * segSizeBytes; // Minimum memory needed for this size. - uint64 dataMemory = nPrefix * segsPerPrefix * segSizeBytes; // Expected memory for full batch. + uint64 dataMemoryMin = nPrefix * SEGMENT_SIZE * 1024; // Minimum memory needed for this size. + uint64 dataMemory = nPrefix * segsPerPrefix * SEGMENT_SIZE * 1024; // Expected memory for full batch. uint64 totalMemory = structMemory + dataMemory; // Pick a larger prefix if it is dramatically smaller than what we have. @@ -236,9 +234,7 @@ findBestValues(uint64 nKmerEstimate, uint64 &nPrefix_, uint32 &wData_, kmdata &wDataMask_) { - uint32 merSize = kmerTiny::merSize(); - uint32 segSizeBits = merylCountArray::pagesPerSegment() * getPageSize() * 8; - uint32 segSizeBytes = merylCountArray::pagesPerSegment() * getPageSize(); + uint32 merSize = kmerTiny::merSize(); fprintf(stderr, "\n"); fprintf(stderr, "\n"); @@ -252,16 +248,16 @@ findBestValues(uint64 nKmerEstimate, for (uint32 wp=1; wp < 2 * merSize - 1; wp++) { uint64 nPrefix = (uint64)1 << wp; // Number of prefix == number of blocks of data uint64 kmersPerPrefix = nKmerEstimate / nPrefix + 1; // Expected number of kmers we need to store per prefix - uint64 kmersPerSeg = segSizeBits / (2 * merSize - wp); // Kmers per segment + uint64 kmersPerSeg = SEGMENT_SIZE_BITS / (2 * merSize - wp); // Kmers per segment uint64 segsPerPrefix = kmersPerPrefix / kmersPerSeg + 1; // - if (wp + countNumberOfBits64(segsPerPrefix) + countNumberOfBits64(segSizeBytes) >= 64) + if (wp + countNumberOfBits64(segsPerPrefix) + countNumberOfBits64(SEGMENT_SIZE) + 10 >= 64) break; // Otherwise, dataMemory overflows. uint64 structMemory = ((sizeof(merylCountArray) * nPrefix) + // Basic structs (sizeof(uint64 *) * nPrefix * segsPerPrefix)); // Pointers to segments - uint64 dataMemoryMin = nPrefix * segSizeBytes; // Minimum memory needed for this size. - uint64 dataMemory = nPrefix * segsPerPrefix * segSizeBytes; // Expected memory for full batch. + uint64 dataMemoryMin = nPrefix * SEGMENT_SIZE * 1024; // Minimum memory needed for this size. + uint64 dataMemory = nPrefix * segsPerPrefix * SEGMENT_SIZE * 1024; // Expected memory for full batch. uint64 totalMemory = structMemory + dataMemory; fprintf(stderr, "%6" F_U32P " %4" F_U64P " %cP %4" F_U64P " %cB %4" F_U64P " %cM %4" F_U64P " %cS %4" F_U64P " %cB %4" F_U64P " %cB %4" F_U64P " %cB", @@ -296,6 +292,70 @@ findBestValues(uint64 nKmerEstimate, +void +reportNumberOfOutputs(uint64 nKmerEstimate, + uint64 memoryUsed, // expected memory needed for counting in one block + uint64 memoryAllowed, // memory the user said we can use + bool useSimple) { + uint32 nOutputsI = memoryUsed / memoryAllowed + 1; + double nOutputsD = (double)memoryUsed / memoryAllowed - (nOutputsI - 1); + + + fprintf(stderr, "\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "FINAL CONFIGURATION\n"); + fprintf(stderr, "-------------------\n"); + + if (useSimple == true) { + assert(nOutputsI == 1); + } + + else { + char batchString[64] = { 0 }; + + if (nOutputsD < 0.2) { + nOutputsI += 0; + snprintf(batchString, 42, "split into up to %u (possibly %u)", nOutputsI-1, nOutputsI); + } + + else if (nOutputsD < 0.8) { + nOutputsI += 0; + snprintf(batchString, 42, "split into up to %u", nOutputsI); + } + + else { + nOutputsI += 1; + snprintf(batchString, 42, "split into up to %u (possibly %u)", nOutputsI, nOutputsI+1); + } + + + if (nOutputsI > 1) { + fprintf(stderr, "\n"); + fprintf(stderr, "WARNING:\n"); + fprintf(stderr, "WARNING: Cannot fit into " F_U64 " %cB memory limit.\n", scaledNumber(memoryAllowed), scaledUnit(memoryAllowed)); + fprintf(stderr, "WARNING: Will %s batches, and merge them at the end.\n", batchString); + fprintf(stderr, "WARNING:\n"); + } + + if (nOutputsI > 32) { + fprintf(stderr, "WARNING: Large number of batches. Increase memory for better performance.\n"); + fprintf(stderr, "WARNING:\n"); + } + } + + // This is parsed by Canu. Do not change. + + fprintf(stderr, "\n"); + fprintf(stderr, "Configured %s mode for %.3f GB memory per batch, and up to %u batch%s.\n", + (useSimple == true) ? "simple" : "complex", + ((memoryUsed < memoryAllowed) ? memoryUsed : memoryAllowed) / 1024.0 / 1024.0 / 1024.0, + nOutputsI, + (nOutputsI == 1) ? "" : "es"); + fprintf(stderr, "\n"); +} + + + void merylOperation::configureCounting(uint64 memoryAllowed, // Input: Maximum allowed memory in bytes bool &useSimple_, // Output: algorithm to use @@ -346,12 +406,9 @@ merylOperation::configureCounting(uint64 memoryAllowed, // Input: Maxim uint64 memoryUsedComplex = UINT64_MAX; uint32 bestPrefix = 0; - uint32 nBatches = 0; - for (nBatches=1; memoryUsedComplex > memoryAllowed; nBatches++) - findBestPrefixSize(_expNumKmers / nBatches, memoryAllowed, bestPrefix, memoryUsedComplex); - - findBestValues(_expNumKmers / nBatches, bestPrefix, memoryUsedComplex, wPrefix_, nPrefix_, wData_, wDataMask_); + findBestPrefixSize(_expNumKmers, memoryAllowed, bestPrefix, memoryUsedComplex); + findBestValues(_expNumKmers, bestPrefix, memoryUsedComplex, wPrefix_, nPrefix_, wData_, wDataMask_); // // Decide simple or complex. useSimple_ is an output. @@ -380,21 +437,7 @@ merylOperation::configureCounting(uint64 memoryAllowed, // Input: Maxim // Output the configuration. // - fprintf(stderr, "\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "FINAL CONFIGURATION\n"); - fprintf(stderr, "-------------------\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Estimated to require %lu %cB memory out of %lu %cB allowed.\n", - scaledNumber(memoryUsed), scaledUnit(memoryUsed), - scaledNumber(memoryAllowed), scaledUnit(memoryAllowed)); - fprintf(stderr, "Estimated to require %u batch%s.\n", nBatches, (nBatches == 1) ? "" : "es"); - fprintf(stderr, "\n"); - fprintf(stderr, "Configured %s mode for %.3f GB memory per batch, and up to %u batch%s.\n", // This is parsed - (useSimple_ == true) ? "simple" : "complex", // by Canu. - ((memoryUsed < memoryAllowed) ? memoryUsed : memoryAllowed) / 1024.0 / 1024.0 / 1024.0, // DO NOT CHANGE! - nBatches, (nBatches == 1) ? "" : "es"); - fprintf(stderr, "\n"); + reportNumberOfOutputs(_expNumKmers, memoryUsed, memoryAllowed, useSimple_); } @@ -403,9 +446,9 @@ merylOperation::configureCounting(uint64 memoryAllowed, // Input: Maxim // rigorous went into the multipliers, just looked at a few sets of lambda reads. uint64 merylOperation::guesstimateNumberOfkmersInInput_dnaSeqFile(dnaSeqFile *sequence) { - uint64 numMers = 0; - char const *name = sequence->filename(); - uint32 len = strlen(name); + uint64 numMers = 0; + char *name = sequence->filename(); + uint32 len = strlen(name); if ((name[0] == '-') && (len == 1)) return(0); @@ -511,7 +554,7 @@ merylOperation::count(uint32 wPrefix, memUsed = memBase; for (uint32 pp=0; pp _maxMemory) { fprintf(stderr, "Memory full. Writing results to '%s', using " F_S32 " threads.\n", - _outputO->filename(), getMaxThreadsAllowed()); + _outputO->filename(), omp_get_max_threads()); fprintf(stderr, "\n"); #pragma omp parallel for schedule(dynamic, 1) @@ -616,7 +659,7 @@ merylOperation::count(uint32 wPrefix, fprintf(stderr, "\n"); fprintf(stderr, "Writing results to '%s', using " F_S32 " threads.\n", - _outputO->filename(), getMaxThreadsAllowed()); + _outputO->filename(), omp_get_max_threads()); //for (uint64 pp=0; ppfileNumber(pp)); diff --git a/ext/meryl/src/meryl/merylOp-countSimple.C b/ext/meryl/src/meryl/merylOp-countSimple.C index 7d88ee9..65aeed6 100644 --- a/ext/meryl/src/meryl/merylOp-countSimple.C +++ b/ext/meryl/src/meryl/merylOp-countSimple.C @@ -106,14 +106,14 @@ merylOperation::countSimple(void) { for (uint32 hib=0; hib < 64; hib++) { if (highBits[hib].isAllocated() == false) { - fprintf(stderr, "Increasing to %u-bit storage (for kmer 0x%s).\n", - lowBitsSize + hib + 1, toHex(kidx)); + fprintf(stderr, "Increasing to %u-bit storage (for kmer 0x%016lx).\n", + lowBitsSize + hib + 1, kidx); highBits[hib].allocate(nEntries); } - if (highBits[hib].flipBit(kidx) == 0) { // If not set, set it, - highBitMax = std::max(highBitMax, hib + 1); // remember the possible maximum bit set, - break; // and stop. + if (highBits[hib].flipBit(kidx) == 0) { // If not set, set it, + highBitMax = max(highBitMax, hib + 1); // remember the possible maximum bit set, + break; // and stop. } } } @@ -183,7 +183,7 @@ merylOperation::countSimple(void) { fprintf(stderr, "\n"); fprintf(stderr, "Writing results to '%s', using " F_S32 " threads.\n", - _outputO->filename(), getMaxThreadsAllowed()); + _outputO->filename(), omp_get_max_threads()); fprintf(stderr, " [ file ][ prefix ][ suffix ][ count-suffix ]\n"); fprintf(stderr, " widths [ 6 ][ %7u ][ %7u ][ %12u ]\n", wPrefix - 6, wSuffix, 2 * _countSuffixLength); fprintf(stderr, " number [ 64 ][ %7lu ][ %7lu ][ %12s ]\n", nPrefix / 64, nSuffix, _countSuffixString); @@ -207,7 +207,6 @@ merylOperation::countSimple(void) { uint64 kEnd = (bp << wSuffix) | sMask; uint64 nKmers = 0; -//CJ: avoid this print statment #if 0 fprintf(stderr, "thread %2d working on block 0x%08lx<0x%08lx<0x%08lx %8lu kmers between 0x%016lx and 0x%016lx\n", omp_get_thread_num(), diff --git a/ext/meryl/src/meryl/merylOp-countThreads.C b/ext/meryl/src/meryl/merylOp-countThreads.C index 2742fed..1521be2 100644 --- a/ext/meryl/src/meryl/merylOp-countThreads.C +++ b/ext/meryl/src/meryl/merylOp-countThreads.C @@ -23,18 +23,21 @@ #include +// The number of KB to use for a merylCountArray segment. +#define SEGMENT_SIZE 64 +#define SEGMENT_SIZE_BITS (SEGMENT_SIZE * 1024 * 8) + + class mcGlobalData { public: - mcGlobalData(std::vector &inputs, - merylOp op, - uint64 nPrefix, - uint32 wData, - kmdata wDataMask, - uint64 maxMemory, - uint32 maxThreads, - uint64 bufferSize, - merylFileWriter *output) : _inputs(inputs) { + mcGlobalData(vector &inputs, + merylOp op, + uint64 nPrefix, + uint32 wData, + kmdata wDataMask, + uint64 maxMemory, + merylFileWriter *output) : _inputs(inputs) { _operation = op; _nPrefix = nPrefix; _wData = wData; @@ -52,22 +55,20 @@ public: _memUsed = _memBase; _memReported = 0; - _maxThreads = maxThreads; - _loadThreads = 1; - - _bufferSize = bufferSize; - _kmersAdded = 0; - _kmersAddedMax = 0; _inputPos = 0; + //_inputs = inputs; for (uint32 ii=0; ii<65; ii++) _lastBuffer[ii] = 0; + _computeWait = 0; + _numComputing = 0; + for (uint32 pp=0; pp<_nPrefix; pp++) { // Initialize each bucket. _lock[pp].clear(); - _memUsed += _data[pp].initialize(pp, wData); + _memUsed += _data[pp].initialize(pp, wData, SEGMENT_SIZE); } }; @@ -77,59 +78,65 @@ public: delete [] _writer; }; - merylOp _operation; // Parameters. - uint64 _nPrefix; - uint32 _wData; - kmdata _wDataMask; + merylOp _operation; // Parameters. + uint64 _nPrefix; + uint32 _wData; + kmdata _wDataMask; - bool _dumping; + bool _dumping; - std::atomic_flag *_lock; - merylCountArray *_data; // Data for counting. - merylFileWriter *_output; - merylBlockWriter *_writer; // Data for writing. + std::atomic_flag *_lock; + merylCountArray *_data; // Data for counting. + merylFileWriter *_output; + merylBlockWriter *_writer; // Data for writing. - uint64 _maxMemory; // Maximum memory we can use. - uint64 _memBase; // Overhead memory. - uint64 _memUsed; // Sum of actual memory used. - uint64 _memReported; // Memory usage at last report. + uint64 _maxMemory; // Maximum memory we can use. + uint64 _memBase; // Overhead memory. + uint64 _memUsed; // Sum of actual memory used. + uint64 _memReported; // Memory usage at last report. - uint32 _maxThreads; // The max number of CPUs we can use. - uint32 _loadThreads; // The number of CPUs used for reading input. + uint64 _kmersAdded; // Boring statistics for the user. - uint64 _bufferSize; // Maximum size of a computation input buffer. + uint32 _inputPos; // Input files. + vector &_inputs; - uint64 _kmersAdded; // Number of kmers added; boring statistics for the user. - uint64 _kmersAddedMax; // Max kmers in any single merylCountArray; not boring. - - uint32 _inputPos; // Input files. - std::vector &_inputs; + char _lastBuffer[65]; // Wrap-around from the last buffer. - char _lastBuffer[65]; // Wrap-around from the last buffer. + uint32 _computeWait; + uint32 _numComputing; }; +#define BUF_MAX (1 * 1024 * 1024) + class mcComputation { public: - mcComputation(uint64 bufmax) { - _bufferMax = bufmax; - _buffer = new char [_bufferMax]; + mcComputation() { + _bufferMax = BUF_MAX; + _bufferLen = 0; + + _memUsed = 0; + _kmersAdded = 0; }; ~mcComputation() { - delete [] _buffer; }; - uint64 _bufferMax = 0; // Input data - uint64 _bufferLen = 0; - char *_buffer = nullptr; + // Data for input sequences. + uint64 _bufferMax; + uint64 _bufferLen; + char _buffer[BUF_MAX]; - kmerIterator _kiter; // Sequence to kmer conversion + // Data for converting sequence to kmers. + kmerIterator _kiter; - uint64 _memUsed = 0; // Output statistics on kmers added to - uint64 _kmersAdded = 0; // the merylCountArray but this block. - uint64 _kmersAddedMax = 0; + // Data for debugging. + //char _fstr[65]; // For debugging + //char _rstr[65]; + + uint64 _memUsed; + uint64 _kmersAdded; }; @@ -138,13 +145,12 @@ public: void * loadBases(void *G) { mcGlobalData *g = (mcGlobalData *)G; - mcComputation *s = new mcComputation(g->_bufferSize); + mcComputation *s = new mcComputation(); uint32 kl = kmerTiny::merSize() - 1; // Copy the end of the last block into our buffer. assert(s->_bufferLen == 0); - assert(s->_bufferMax > kl); if (g->_lastBuffer[0] != 0) { memcpy(s->_buffer, g->_lastBuffer, sizeof(char) * kl); @@ -159,14 +165,6 @@ loadBases(void *G) { if (g->_inputPos >= g->_inputs.size()) return(NULL); - // Update the number of threads used for loading. If the input is - // compressed, reserve 2 threads, otherwise reserve 1. - - if (g->_inputs[g->_inputPos]->isCompressedFile()) - g->_loadThreads = 2; - else - g->_loadThreads = 1; - // Try to load bases. Keep loading until the buffer is filled // or we exhaust the file. @@ -248,13 +246,13 @@ insertKmers(void *G, void *T, void *S) { if (useF == true) { pp = (kmdata)s->_kiter.fmer() >> g->_wData; mm = (kmdata)s->_kiter.fmer() & g->_wDataMask; - //fprintf(stderr, "useF F=%s R=%s ms=%u pp %llu mm %llu\n", s->_kiter.fmer().toString(fstr), s->_kiter.rmer().toString(rstr), s->_kiter.fmer().merSize(), pp, mm); + //fprintf(stderr, "useF F=%s R=%s ms=%u pp %lu mm %lu\n", s->_kiter.fmer().toString(fstr), s->_kiter.rmer().toString(rstr), s->_kiter.fmer().merSize(), pp, mm); } else { pp = (kmdata)s->_kiter.rmer() >> g->_wData; mm = (kmdata)s->_kiter.rmer() & g->_wDataMask; - //fprintf(stderr, "useR F=%s R=%s ms=%u pp %llu mm %llu\n", s->_kiter.fmer().toString(fstr), s->_kiter.rmer().toString(rstr), s->_kiter.rmer().merSize(), pp, mm); + //fprintf(stderr, "useR F=%s R=%s ms=%u pp %lu mm %lu\n", s->_kiter.fmer().toString(fstr), s->_kiter.rmer().toString(rstr), s->_kiter.rmer().merSize(), pp, mm); } assert(pp < g->_nPrefix); @@ -271,9 +269,8 @@ insertKmers(void *G, void *T, void *S) { while (g->_lock[pp].test_and_set(std::memory_order_relaxed) == true) ; - s->_memUsed += g->_data[pp].add(mm); - s->_kmersAdded += 1; - s->_kmersAddedMax = std::max(s->_kmersAddedMax, g->_data[pp].numKmers()); + s->_memUsed += g->_data[pp].add(mm); + s->_kmersAdded += 1; g->_lock[pp].clear(std::memory_order_relaxed); } @@ -290,41 +287,32 @@ writeBatch(void *G, void *S) { // Udpate memory used and kmers added. There's only one writer thread, // so this is thread safe! - g->_memUsed += s->_memUsed; - g->_kmersAdded += s->_kmersAdded; - g->_kmersAddedMax = std::max(s->_kmersAddedMax, g->_kmersAddedMax); - - // Free the input buffer. All the data is loaded into merylCountArrays, - // and all we needed to get from this is the stats above. - - delete s; - - // Estimate, poorly, how much memory we'll need to sort the arrays. It's - // a poor estimate because we'll never have all threads sorting the - // maximum number of kmers at the same time, but it's a safe poor - // estimate. + g->_memUsed += s->_memUsed; + g->_kmersAdded += s->_kmersAdded; - uint64 sortMem = g->_maxThreads * g->_kmersAddedMax * sizeof(kmdata); + // Do some accounting. - // Write a log every 128 MB of memory growth. + if (g->_memUsed - g->_memReported > (uint64)128 * 1024 * 1024) { + g->_memReported = g->_memUsed; - if (g->_memUsed + sortMem - g->_memReported > (uint64)128 * 1024 * 1024) { - g->_memReported = g->_memUsed + sortMem; - - fprintf(stderr, "Used %3.3f GB / %3.3f GB to store %12lu kmers; need %3.3f GB to sort %12lu kmers\n", + fprintf(stderr, "Used %3.3f GB out of %3.3f GB to store %12lu kmers.\n", g->_memUsed / 1024.0 / 1024.0 / 1024.0, g->_maxMemory / 1024.0 / 1024.0 / 1024.0, - g->_kmersAdded, - sortMem / 1024.0 / 1024.0 / 1024.0, g->_kmersAddedMax); + g->_kmersAdded); } + // Free the input buffer. + + delete s; + // If we haven't hit the memory limit yet, just return. + // Otherwise, dump data. - if (g->_memUsed + sortMem < g->_maxMemory) + if (g->_memUsed < g->_maxMemory) return; // Tell all the threads to pause, then grab all the locks to ensure nobody - // is still adding kmers to a merylCountArray. + // is still writing. g->_dumping = true; @@ -332,26 +320,17 @@ writeBatch(void *G, void *S) { while (g->_lock[pp].test_and_set(std::memory_order_relaxed) == true) ; - // Write data! For reasons I don't understand, we need to reset the max - // number of threads to use. Something is resetting it to the number of - // CPUs on the machine. - // - // Since we still have a sequence loader around, we need to leave threads - // for it. - - uint32 wThreads = (g->_maxThreads > g->_loadThreads) ? (g->_maxThreads - g->_loadThreads) : 1; - uint32 lThreads = g->_loadThreads; + // Write data! - fprintf(stderr, "Memory full. Writing results to '%s', using %u thread%s (%u thread%s still doing input).\n", - g->_output->filename(), - wThreads, (wThreads == 1) ? "" : "s", - lThreads, (lThreads == 1) ? "" : "s"); + fprintf(stderr, "Memory full. Writing results to '%s', using " F_S32 " threads.\n", + g->_output->filename(), omp_get_max_threads()); fprintf(stderr, "\n"); - omp_set_num_threads(wThreads); - #pragma omp parallel for schedule(dynamic, 1) for (uint32 ff=0; ff_output->numberOfFiles(); ff++) { + //fprintf(stderr, "thread %2u writes file %2u with prefixes 0x%016lx to 0x%016lx\n", + // omp_get_thread_num(), ff, g->_output->firstPrefixInFile(ff), g->_output->lastPrefixInFile(ff)); + for (uint64 pp=g->_output->firstPrefixInFile(ff); pp <= g->_output->lastPrefixInFile(ff); pp++) { g->_data[pp].countKmers(); // Convert the list of kmers into a list of (kmer, count). g->_data[pp].dumpCountedKmers(g->_writer); // Write that list to disk. @@ -368,8 +347,7 @@ writeBatch(void *G, void *S) { for (uint32 pp=0; pp_nPrefix; pp++) g->_memUsed += g->_data[pp].usedSize(); - g->_kmersAdded = 0; - g->_kmersAddedMax = 0; + g->_kmersAdded = 0; // Signal that threads can proceeed. @@ -404,53 +382,37 @@ merylOperation::countThreads(uint32 wPrefix, _outputO->initialize(wPrefix); // Initialize the counter. - // - // Tell it to use _maxMemory, but carve out space for the input buffers. - // At 2MB each, 16 per thread, and 16 threads, that's 512 MB. Not huge, - // but a big chunk of our (expected 16 or so GB total). The extra buffers - // are generally filled when a batch is dumped to disk. - - uint64 inputBufferSize = 2 * 1024 * 1024; - - mcGlobalData *g = new mcGlobalData(_inputs, - _operation, - nPrefix, - wData, - wDataMask, - _maxMemory - inputBufferSize * 4 * _maxThreads, - _maxThreads, - inputBufferSize, - _outputO); - - // Set up a sweatShop and run it. We'll reserve one thread for input, one - // for gzip and use the remaining for counting -- unless there are no - // remaining, then we'll just use one. - sweatShop *ss = new sweatShop(loadBases, insertKmers, writeBatch); + mcGlobalData *g = new mcGlobalData(_inputs, _operation, nPrefix, wData, wDataMask, _maxMemory, _outputO); - uint32 nw = (_maxThreads > 2) ? (_maxThreads - 2) : 1; + // Set up a sweatShop and run it. - ss->setLoaderBatchSize(1); // Load this many things before appending to input list - ss->setLoaderQueueSize(nw * 16); // Allow this many things on the input list before stalling the input - ss->setWriterQueueSize(nw); // Allow this many things on the output list before stalling the compute - ss->setNumberOfWorkers(nw); // Use this many worker CPUs; leave one for input and one for gzip. + sweatShop *ss = new sweatShop(loadBases, insertKmers, writeBatch); + uint32 nt = omp_get_max_threads(); + + ss->setLoaderBatchSize(1 * nt); + ss->setLoaderQueueSize(2 * nt); + ss->setWriterQueueSize(1 * nt); + ss->setNumberOfWorkers(1 * nt); ss->run(g, false); delete ss; - // All data loaded. Write the output. Reset threads before starting (see - // above) to the maximum possible since there is no loader threads around - // anymore. + // All data loaded. Write the output. fprintf(stderr, "\n"); - fprintf(stderr, "Input complete. Writing results to '%s', using %u thread%s.\n", - _outputO->filename(), _maxThreads, (_maxThreads == 1) ? "" : "s"); + fprintf(stderr, "Writing results to '%s', using " F_S32 " threads.\n", + _outputO->filename(), omp_get_max_threads()); - omp_set_num_threads(_maxThreads); + //for (uint64 pp=0; ppfileNumber(pp)); #pragma omp parallel for schedule(dynamic, 1) for (uint32 ff=0; ff<_outputO->numberOfFiles(); ff++) { + //fprintf(stderr, "thread %2u writes file %2u with prefixes 0x%016lx to 0x%016lx\n", + // omp_get_thread_num(), ff, _outputO->firstPrefixInFile(ff), _outputO->lastPrefixInFile(ff)); + for (uint64 pp=_outputO->firstPrefixInFile(ff); pp <= _outputO->lastPrefixInFile(ff); pp++) { g->_data[pp].countKmers(); // Convert the list of kmers into a list of (kmer, count). g->_data[pp].dumpCountedKmers(g->_writer); // Write that list to disk. diff --git a/ext/meryl/src/meryl/merylOp-histogram.C b/ext/meryl/src/meryl/merylOp-histogram.C index 0f9efbe..0834c54 100644 --- a/ext/meryl/src/meryl/merylOp-histogram.C +++ b/ext/meryl/src/meryl/merylOp-histogram.C @@ -62,7 +62,7 @@ merylOperation::reportStatistics(void) { // Now just dump it. - uint64 nUniverse = buildLowBitMask(kmer::merSize() * 2) + 1; + uint64 nUniverse = uint64MASK(kmer::merSize() * 2) + 1; uint64 sDistinct = 0; uint64 sTotal = 0; diff --git a/ext/meryl/src/meryl/merylOp-nextMer.C b/ext/meryl/src/meryl/merylOp-nextMer.C index 5970bbc..f09c710 100644 --- a/ext/meryl/src/meryl/merylOp-nextMer.C +++ b/ext/meryl/src/meryl/merylOp-nextMer.C @@ -17,7 +17,7 @@ */ #include "meryl.H" -#include + void @@ -48,19 +48,6 @@ merylOperation::findSumCount(void) { } -void -merylOperation::subtractCount(void) { - _value = _actCount[0]; - for (uint32 ii=1; ii<_actLen; ii++) { - if ( _value > _actCount[ii] ) - _value -= _actCount[ii]; - else { - _value = 0; - return; - } - } -} - void merylOperation::initializeThreshold(void) { @@ -222,7 +209,7 @@ merylOperation::doCounting(void) { // - add the counted output as an input if (_outputO) - strncpy(name, _outputO->filename(), FILENAME_MAX + 1); // know which input to open later. + strncpy(name, _outputO->filename(), FILENAME_MAX); // know which input to open later. delete _outputO; _outputO = NULL; @@ -539,16 +526,6 @@ merylOperation::nextMer(void) { else _value = _actCount[0] / _mathConstant; break; - case opDivideRound: - if (_mathConstant == 0) - _value = 0; // DIVIDE BY ZERO! - else { - if (_actCount[0] < _mathConstant) - _value = 1; - else - _value = round (_actCount[0] / (double) _mathConstant); - } - break; case opModulo: if (_mathConstant == 0) @@ -593,15 +570,6 @@ merylOperation::nextMer(void) { findSumCount(); break; - case opSubtract: - if (_actIndex[0] == 0) { - if (_actLen == 1) - _value = _actCount[0]; - else if (_actLen > 1) - subtractCount(); - } - break; - case opDifference: if ((_actLen == 1) && (_actIndex[0] == 0)) _value = _actCount[0]; diff --git a/ext/meryl/src/meryl/merylOp.C b/ext/meryl/src/meryl/merylOp.C index 31497b3..00b9840 100644 --- a/ext/meryl/src/meryl/merylOp.C +++ b/ext/meryl/src/meryl/merylOp.C @@ -315,7 +315,6 @@ toString(merylOp op) { case opDecrease: return("opDecrease"); break; case opMultiply: return("opMultiply"); break; case opDivide: return("opDivide"); break; - case opDivideRound: return("opDivideRound"); break; case opModulo: return("opModulo"); break; case opUnion: return("opUnion"); break; @@ -328,8 +327,6 @@ toString(merylOp op) { case opIntersectMax: return("opIntersectMax"); break; case opIntersectSum: return("opIntersectSum"); break; - case opSubtract: return("opSubtract"); break; - case opDifference: return("opDifference"); break; case opSymmetricDifference: return("opSymmetricDifference"); break; diff --git a/ext/meryl/src/meryl/merylOp.H b/ext/meryl/src/meryl/merylOp.H index 390a2d9..4959358 100644 --- a/ext/meryl/src/meryl/merylOp.H +++ b/ext/meryl/src/meryl/merylOp.H @@ -46,7 +46,6 @@ enum merylOp { opDecrease, opMultiply, opDivide, - opDivideRound, opModulo, opUnion, @@ -62,8 +61,6 @@ enum merylOp { opDifference, opSymmetricDifference, - opSubtract, // if count(a) >= count(b), keep count(a)-count(b). else, discard. - opHistogram, opStatistics, @@ -157,21 +154,17 @@ public: return(isCounting() == false); }; - bool needsThreshold(void) { + bool needsParameter(void) { return((_operation == opLessThan) || (_operation == opGreaterThan) || (_operation == opAtLeast) || (_operation == opAtMost) || (_operation == opEqualTo) || - (_operation == opNotEqualTo)); - }; - - bool needsConstant(void) { - return((_operation == opIncrease) || + (_operation == opNotEqualTo) || + (_operation == opIncrease) || (_operation == opDecrease) || (_operation == opMultiply) || (_operation == opDivide) || - (_operation == opDivideRound) || (_operation == opModulo)); }; @@ -234,9 +227,8 @@ private: void findMinCount(void); void findMaxCount(void); void findSumCount(void); - void subtractCount(void); - std::vector _inputs; + vector _inputs; bool _isMultiSet = false; merylOp _operation = opNothing; diff --git a/ext/meryl/src/tests/merylCountArrayTest.C b/ext/meryl/src/tests/merylCountArrayTest.C index 4aa6186..f1549ea 100644 --- a/ext/meryl/src/tests/merylCountArrayTest.C +++ b/ext/meryl/src/tests/merylCountArrayTest.C @@ -24,7 +24,7 @@ mtRandom *mt = NULL; void -display(char const *l, kmdata s) { +display(char *l, kmdata s) { uint64 a = (s >> 64); uint64 b = s; diff --git a/ext/meryl/src/tests/merylCountArrayTest.mk b/ext/meryl/src/tests/merylCountArrayTest.mk index 4605ce3..d0fee14 100644 --- a/ext/meryl/src/tests/merylCountArrayTest.mk +++ b/ext/meryl/src/tests/merylCountArrayTest.mk @@ -1,8 +1,21 @@ + +# If 'make' isn't run from the root directory, we need to set these to +# point to the upper level build directory. +ifeq "$(strip ${BUILD_DIR})" "" + BUILD_DIR := ../$(OSTYPE)-$(MACHINETYPE)/obj +endif +ifeq "$(strip ${TARGET_DIR})" "" + TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE) +endif + TARGET := merylCountArrayTest -SOURCES := merylCountArrayTest.C ../meryl/merylCountArray.C +SOURCES := merylCountArrayTest.C \ + ../meryl/merylCountArray.C SRC_INCDIRS := . ../utility/src/utility ../meryl TGT_LDFLAGS := -L${TARGET_DIR}/lib -TGT_LDLIBS := -l${MODULE} -TGT_PREREQS := lib${MODULE}.a +TGT_LDLIBS := -lmeryl +TGT_PREREQS := libmeryl.a + +SUBMAKEFILES := diff --git a/ext/meryl/src/tests/merylExactLookupTest.C b/ext/meryl/src/tests/merylExactLookupTest.C deleted file mode 100644 index 80ed3ad..0000000 --- a/ext/meryl/src/tests/merylExactLookupTest.C +++ /dev/null @@ -1,285 +0,0 @@ - -/****************************************************************************** - * - * This file is part of meryl, a genomic k-kmer counter with nice features. - * - * This software is based on: - * 'Canu' v2.0 (https://github.com/marbl/canu) - * which is based on: - * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) - * the 'kmer package' r1994 (http://kmer.sourceforge.net) - * - * Except as indicated otherwise, this is a 'United States Government Work', - * and is released in the public domain. - * - * File 'README.licenses' in the root directory of this distribution - * contains full conditions and disclaimers. - */ - -#include "runtime.H" - -#include "kmers.H" -#include "sequence.H" -#include "bits.H" - - -void -loadLookup(char const *inputDBname, - uint64 minV, - uint64 maxV, - merylExactLookup &lookup) { - - fprintf(stderr, "==\n"); - fprintf(stderr, "== Create merylExactLookup from '%s'.\n", inputDBname); - fprintf(stderr, "==\n"); - - merylFileReader *merylDB = new merylFileReader(inputDBname); - - lookup.load(merylDB, 16.0, true, false, minV, maxV); - - fprintf(stderr, "\n"); - - delete merylDB; -} - - -void -loadMap(char const *inputDBname, - uint64 minV, - uint64 maxV, - std::map &lookup) { - - fprintf(stderr, "==\n"); - fprintf(stderr, "== Create merylExactLookup from '%s'.\n", inputDBname); - fprintf(stderr, "==\n"); - - merylFileReader *merylDB = new merylFileReader(inputDBname); - - uint64 nKmers = 0; - uint64 nSkips = 0; - - while (merylDB->nextMer() == true) { - kmer kmer = merylDB->theFMer(); - uint32 value = merylDB->theValue(); - - if ((minV <= value) && - (value <= maxV)) { - lookup[kmer] = value; - nKmers++; - } else { - nSkips++; - } - - if (((nKmers + nSkips) % 100000) == 0) - fprintf(stderr, "== Loaded %lu kmers; ignored %lu.\r", nKmers, nSkips); - } - - fprintf(stderr, "== Loaded %lu kmers; ignored %lu; map size %lu.\n", nKmers, nSkips, lookup.size()); - fprintf(stderr, "\n"); - - delete merylDB; -} - - - -int -main(int argc, char **argv) { - char *inputSeqName = nullptr; - char *inputDBname = nullptr; - uint64 minV = 0; - uint64 maxV = uint64max; - uint32 threads = 1; - - argc = AS_configure(argc, argv); - - std::vector err; - int arg = 1; - while (arg < argc) { - if (strcmp(argv[arg], "-sequence") == 0) { // INPUT READS and RANGE TO PROCESS - inputSeqName = argv[++arg]; - - } else if (strcmp(argv[arg], "-mers") == 0) { - inputDBname = argv[++arg]; - - //} else if (strcmp(argv[arg], "-min") == 0) { - // minV = strtouint64(argv[++arg]); - - //} else if (strcmp(argv[arg], "-max") == 0) { - // maxV = strtouint64(argv[++arg]); - - //} else if (strcmp(argv[arg], "-threads") == 0) { - // threads = strtouint32(argv[++arg]); - - } else { - char *s = new char [1024]; - snprintf(s, 1024, "Unknown option '%s'.\n", argv[arg]); - err.push_back(s); - } - - arg++; - } - - if (inputSeqName == nullptr) err.push_back("No input sequences (-sequence) supplied.\n"); - if (inputDBname == nullptr) err.push_back("No query meryl database (-mers) supplied.\n"); - - if (err.size() > 0) { - fprintf(stderr, "usage: %s ...\n", argv[0]); - fprintf(stderr, " -sequence X.fasta\n"); - fprintf(stderr, " -mers X.meryl\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Loads kmers in X.meryl into a merylExactLookup table and a standard\n"); - fprintf(stderr, "C++ associative map. Verifies that every kmer present in X.fasta is\n"); - fprintf(stderr, "present in both the merylExactLookup and the associative map, and that\n"); - fprintf(stderr, "the value returned by both is the same.\n"); - fprintf(stderr, "\n"); - - for (uint32 ii=0; ii kmerValue; - std::map kmerCheck; - - - loadLookup(inputDBname, minV, maxV, kmerLookup); - loadMap (inputDBname, minV, maxV, kmerValue); - - fprintf(stderr, "==\n"); - fprintf(stderr, "== Copy kmerValue to kmerCheck.\n"); - fprintf(stderr, "==\n"); - - kmerCheck = kmerValue; - - // - - fprintf(stderr, "\n"); - fprintf(stderr, "==\n"); - fprintf(stderr, "== Stream kmers from '%s'.\n", inputSeqName); - fprintf(stderr, "==\n"); - - dnaSeqFile *seqFile = new dnaSeqFile(inputSeqName); - - { - dnaSeq seq; - char fString[64]; - char rString[64]; - uint64 nTest = 0; - - while (seqFile->loadSequence(seq)) { - kmerIterator kiter(seq.bases(), seq.length()); - - while (kiter.nextMer()) { - kmer fMer = kiter.fmer(); - kmer rMer = kiter.rmer(); - kmer cMer = (fMer < rMer) ? fMer : rMer; - kmvalu value; - - if (kmerLookup.exists(cMer) == false) { - fprintf(stdout, "%s\t%s\t%s MISSING from kmerLookup::exists()\n", - seq.ident(), - kiter.fmer().toString(fString), - kiter.rmer().toString(rString)); - exit(1); -#ifdef SHOW_SUCCESS - } else { - fprintf(stdout, "%s\t%s\t%s FOUND in kmerLookup::exists()\n", - seq.ident(), - kiter.fmer().toString(fString), - kiter.rmer().toString(rString)); -#endif - } - - if (kmerLookup.exists(cMer, value) == false) { - fprintf(stdout, "%s\t%s\t%s MISSING from kmerLookup::exists(mer, value) - (not found)\n", - seq.ident(), - kiter.fmer().toString(fString), - kiter.rmer().toString(rString)); - exit(1); - } - if (value != kmerValue[cMer]) { - fprintf(stdout, "%s\t%s\t%s MISSING from kmerLookup::exists(mer, value) - kmerLookup=%u kmerValue=%u\n", - seq.ident(), - kiter.fmer().toString(fString), - kiter.rmer().toString(rString), - kmerLookup.value(cMer), - kmerCheck[cMer]); - exit(1); -#ifdef SHOW_SUCCESS - } else { - fprintf(stdout, "%s\t%s\t%s FOUND in kmerLookup::exists(mer, value)\n", - seq.ident(), - kiter.fmer().toString(fString), - kiter.rmer().toString(rString)); -#endif - } - - if (kmerLookup.value(cMer) != kmerValue[cMer]) { - fprintf(stdout, "%s\t%s\t%s MISSING from kmerLookup::value() -- kmerLookup=%u kmerValue=%u\n", - seq.ident(), - kiter.fmer().toString(fString), - kiter.rmer().toString(rString), - kmerLookup.value(cMer), - kmerCheck[cMer]); - exit(1); -#ifdef SHOW_SUCCESS - } else { - fprintf(stdout, "%s\t%s\t%s FOUND in kmerLookup::value()\n", - seq.ident(), - kiter.fmer().toString(fString), - kiter.rmer().toString(rString)); -#endif - } - - // Subtract one from the kmer check counters. If this is zero, the - // kmerIterator returned too many kmers. - - if (kmerCheck[cMer] == 0) { - fprintf(stdout, "%s\t%s\t%s ZERO\n", - seq.ident(), - kiter.fmer().toString(fString), - kiter.rmer().toString(rString)); - exit(1); - } - - --kmerCheck[cMer]; - - // Log. - - if ((++nTest % 100000) == 0) - fprintf(stderr, "== Tested %lu kmers.\r", nTest); - } - } - } - - delete seqFile; - - // Check that all values are zero. - - fprintf(stderr, "\n"); - fprintf(stderr, "==\n"); - fprintf(stderr, "== Checking all kmers were seen.\n"); - fprintf(stderr, "==\n"); - - for (auto it=kmerCheck.begin(); it != kmerCheck.end(); it++) { - kmer k = it->first; - uint32 v = it->second; - - if (v != 0) { - char kmerString[64]; - - fprintf(stderr, "%s\t%u\n", k.toString(kmerString), v); - } - } - - fprintf(stderr, "\n"); - fprintf(stderr, "Success!\n"); - fprintf(stderr, "\n"); - - exit(0); -} diff --git a/ext/meryl/src/tests/merylExactLookupTest.mk b/ext/meryl/src/tests/merylExactLookupTest.mk deleted file mode 100644 index 6273728..0000000 --- a/ext/meryl/src/tests/merylExactLookupTest.mk +++ /dev/null @@ -1,8 +0,0 @@ -TARGET := merylExactLookupTest -SOURCES := merylExactLookupTest.C \ - -SRC_INCDIRS := . ../utility/src/utility - -TGT_LDFLAGS := -L${TARGET_DIR}/lib -TGT_LDLIBS := -l${MODULE} -TGT_PREREQS := lib${MODULE}.a diff --git a/ext/meryl/src/utility/README.licenses b/ext/meryl/src/utility/README.licenses index 05c4804..1b646a6 100644 --- a/ext/meryl/src/utility/README.licenses +++ b/ext/meryl/src/utility/README.licenses @@ -137,108 +137,3 @@ For libbacktrace: LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --- -For SSW Library (align-ssw.C and align-ssw.H): - - The MIT License - - Copyright (c) 2012-2015 Boston College. - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - - The 2-clause BSD License - - Copyright 2006 Michael Farrar. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --- -For parasail: - - Pairwise Sequence Alignment Library (parasail) - - Copyright (c) 2015, Battelle Memorial Institute - - 1. Battelle Memorial Institute (hereinafter Battelle) hereby grants - permission to any person or entity lawfully obtaining a copy of this - software and associated documentation files (hereinafter “the - Software”) to redistribute and use the Software in source and binary - forms, with or without modification. Such person or entity may use, - copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and may permit others to do so, subject to - the following conditions: - - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimers. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - - - Other than as used herein, neither the name Battelle Memorial - Institute or Battelle may be used in any form whatsoever without - the express written consent of Battelle. - - - Redistributions of the software in any form, and publications - based on work performed using the software should include the - following citation as a reference: - - Daily, Jeff. (2016). Parasail: SIMD C library for global, - semi-global, and local pairwise sequence alignments. *BMC - Bioinformatics*, 17(1), 1-11. doi:10.1186/s12859-016-0930-z - - 2. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL BATTELLE - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - SUCH DAMAGE. - diff --git a/ext/meryl/src/utility/scripts/version_update.pl b/ext/meryl/src/utility/scripts/version_update.pl index 8dd8f3f..57c782b 100755 --- a/ext/meryl/src/utility/scripts/version_update.pl +++ b/ext/meryl/src/utility/scripts/version_update.pl @@ -44,8 +44,6 @@ my $branch = "master"; my $version = "v$major.$minor"; -my @submodules; - my $commits = undef; my $hash1 = undef; # This from 'git describe' my $hash2 = undef; # This from 'git rev-list' @@ -87,12 +85,7 @@ $version = "v$major.$minor"; } else { - $major = "0"; - $minor = "0"; - $commits = "0"; - $hash1 = $_; - - $version = "v$major.$minor"; + die "Failed to parse describe string '$_'.\n"; } } close(F); @@ -139,16 +132,6 @@ $label = "branch"; $version = $branch; } - - - # Get information on any submodules here. - open(F, "git submodule status |"); - while () { - if (m/^(.*)\s+(.*)\s+\((.*)\)$/) { - push @submodules, "$2 $3 $1"; - } - } - close(F); } @@ -168,12 +151,11 @@ # Report what we found. This is really for the gmake output. if (defined($commits)) { - print "\$(info Building $label $version +$commits changes (r$revCount $hash1) ($dirty))\n"; - foreach my $s (@submodules) { - print "\$(info \$(space) $s)\n"; - } + print STDERR "Building $label $version +$commits changes (r$revCount $hash1) ($dirty)\n"; + print STDERR "\n"; } else { - print "\$(info Building $label $version)\n"; + print STDERR "Building $label $version\n"; + print STDERR "\n"; } # Dump a new file, but don't overwrite the original. diff --git a/ext/meryl/src/utility/src/Makefile b/ext/meryl/src/utility/src/Makefile index 6939127..641014e 100644 --- a/ext/meryl/src/utility/src/Makefile +++ b/ext/meryl/src/utility/src/Makefile @@ -27,6 +27,11 @@ # instances of "$" within them need to be escaped with a second "$" to # accomodate the double expansion that occurs when eval is invoked. +# Before doing ANYTHING, initialize submodules. +#ifeq ($(wildcard utility/src/Makefile), ) +# $(info $(shell git submodule update --init utility)) +# $(info $(space)) +#endif # ADD_CLEAN_RULE - Parameterized "function" that adds a new rule and phony # target for cleaning the specified target (removing its build-generated @@ -338,9 +343,8 @@ DIR_STACK := INCDIRS := TGT_STACK := -# Discover our OS and architecture. These were previously used to set -# BUILD_DIR and TARGET_DIR to allow multi-platform builds. DESTDIR will do -# that for us too. +# Discover our OS and architecture. These are used to set the BUILD_DIR and TARGET_DIR to +# something more useful than 'build' and '.'. OSTYPE := $(shell echo `uname`) OSVERSION := $(shell echo `uname -r`) @@ -365,18 +369,31 @@ ifeq (${OSTYPE}, SunOS) endif endif -# Set paths for building and installing. If DESTDIR doesn't exist, use the -# directory just above us. +# Some filesystems cannot use < or > in file names, but for reasons unknown +# (or, at least, reasons we're not going to admit to), files in the overlap +# store are named ####<###>. Enabling POSIX_FILE_NAMES Will change the +# names to ####.###. +# +# Be aware this will break object store compatibility. +# +ifeq ($(POSIX_FILE_NAMES), 1) + CXXFLAGS += -DPOSIX_FILE_NAMES -ifeq "$(strip ${DESTDIR})" "" - BUILD_DIR := $(realpath ..)/build/obj - TARGET_DIR := $(realpath ..)/build else - BUILD_DIR := $(DESTDIR)/$(MODULE)/build/obj - TARGET_DIR := $(DESTDIR)/$(MODULE)/build + # Try to create non- file names. It's tempting to use 'wildcard' instead + # of the 'ls', but it doesn't work. + $(shell touch "non--name" > /dev/null 2>&1) + + ifeq (non--name, $(shell ls "non--name" 2> /dev/null)) + #$(info Extended POSIX filenames allowed.) + else + #$(info POSIX filenames required.) + CXXFLAGS += -DPOSIX_FILE_NAMES + endif + + $(shell rm -f "non--name") endif -# # Set compiler and flags based on discovered hardware # # By default, debug symbols are included in all builds (even optimized). @@ -393,6 +410,7 @@ endif # BUILDJEMALLOC will enable jemalloc library support. # + ifeq ($(origin CXXFLAGS), undefined) ifeq ($(BUILDOPTIMIZED), 1) else @@ -421,15 +439,6 @@ ifeq ($(origin CXXFLAGS), undefined) CXXFLAGS += -Wno-deprecated-declarations CXXFLAGS += -Wno-format-truncation CXXFLAGS += -std=c++11 - - CFLAGS += -Wall -Wextra -Wformat - CFLAGS += -Wno-char-subscripts - CFLAGS += -Wno-sign-compare - CFLAGS += -Wno-unused-function - CFLAGS += -Wno-unused-parameter - CFLAGS += -Wno-unused-variable - CFLAGS += -Wno-deprecated-declarations - CFLAGS += -Wno-format-truncation else CXXFLAGSUSER := ${CXXFLAGS} endif @@ -472,26 +481,6 @@ ifeq (${OSTYPE}, Darwin) endif endif - ifeq ($(CC), cc) - CC8 := $(shell echo `which gcc-mp-8`) - CXX8 := $(shell echo `which g++-mp-8`) - - ifdef CXX8 - CC := $(CC8) - CXX := $(CXX8) - endif - endif - - ifeq ($(CC), cc) - CC9 := $(shell echo `which gcc-mp-9`) - CXX9 := $(shell echo `which g++-mp-9`) - - ifdef CXX9 - CC := $(CC9) - CXX := $(CXX9) - endif - endif - ifeq ($(CC), cc) CC8 := $(shell echo `which gcc-7`) CXX8 := $(shell echo `which g++-7`) @@ -512,16 +501,6 @@ ifeq (${OSTYPE}, Darwin) endif endif - ifeq ($(CC), cc) - CC9 := $(shell echo `which gcc-9`) - CXX9 := $(shell echo `which g++-9`) - - ifdef CXX9 - CC := $(CC9) - CXX := $(CXX9) - endif - endif - ifneq ($(shell echo `$(CXX) --version 2>&1 | grep -c clang`), 0) CPATH := $(shell echo `which $(CXX)`) CLANG := $(shell echo `$(CXX) --version 2>&1 | grep clang`) @@ -560,12 +539,9 @@ ifeq (${CANU_BUILD_ENV}, ports) else - # Ignore the gmake default 'c++' and force g++9. - ifeq ($(origin CXX), default) - CC = gcc9 - CXX = g++9 - CCLIB = -rpath /usr/local/lib/gcc9 - endif + CC ?= gcc6 + CXX ?= g++6 + CCLIB ?= -rpath /usr/local/lib/gcc6 # GCC CXXFLAGS += -I/usr/local/include -pthread -fopenmp -fPIC @@ -730,14 +706,16 @@ $(foreach TGT,${ALL_TGTS},\ $(foreach TGT,${ALL_TGTS},\ $(eval -include ${${TGT}_DEPS})) + @if [ ! -e ${TARGET_DIR}/bin ] ; then mkdir -p ${TARGET_DIR}/bin ; fi + # Makefile processed. Regenerate the version number file, make some # directories, and report that we're starting the build. -$(eval $(shell ../scripts/version_update.pl $(MODULE) utility/version.H)) +$(shell ../scripts/version_update.pl meryl-utility utility/version.H) $(shell mkdir -p ${TARGET_DIR}/bin) -$(info For '${OSTYPE}' '${OSVERSION}' as '${MACHINETYPE}' into '${TARGET_DIR}/{bin,obj}'.) +$(info For '${OSTYPE}' '${OSVERSION}' as '${MACHINETYPE}' into '${DESTDIR}${PREFIX}/$(OSTYPE)-$(MACHINETYPE)/{bin,obj}'.) $(info Using '$(shell which ${CXX})' version '${GXX_VV}'.) ifneq ($(origin CXXFLAGSUSER), undefined) $(info Using user-supplied CXXFLAGS '${CXXFLAGSUSER}'.) diff --git a/ext/meryl/src/utility/src/main.mk b/ext/meryl/src/utility/src/main.mk index ad3c585..c607e16 100644 --- a/ext/meryl/src/utility/src/main.mk +++ b/ext/meryl/src/utility/src/main.mk @@ -1,13 +1,31 @@ -MODULE := meryl-utility -TARGET := libmeryl-utility.a + +# If 'make' isn't run from the root directory, we need to set these to +# point to the upper level build directory. + +ifeq "$(strip ${DESTDIR})" "" + DESTDIR := +endif + +ifeq "$(strip ${PREFIX})" "" + ifeq "$(strip ${DESTDIR})" "" + PREFIX := $(realpath ..) + else + PREFIX := /canu + endif +endif + +ifeq "$(strip ${BUILD_DIR})" "" + BUILD_DIR := $(DESTDIR)$(PREFIX)/$(OSTYPE)-$(MACHINETYPE)/obj +endif + +ifeq "$(strip ${TARGET_DIR})" "" + TARGET_DIR := $(DESTDIR)$(PREFIX)/$(OSTYPE)-$(MACHINETYPE) +endif + +TARGET := libcanu.a + SOURCES := utility/runtime.C \ \ - utility/align-ksw2-extz.C \ - utility/align-ksw2-extz2-sse.C \ - utility/align-ksw2-driver.C \ - utility/align-ssw.C \ - utility/align-ssw-driver.C \ - utility/align-parasail-driver.C \ utility/edlib.C \ \ utility/files.C \ @@ -36,22 +54,13 @@ SOURCES := utility/runtime.C \ utility/kmers.C \ \ utility/bits.C \ - utility/bits-wordArray.C \ \ utility/hexDump.C \ utility/md5.C \ utility/mt19937ar.C \ + utility/objectStore.C \ utility/speedCounter.C \ - utility/sweatShop.C \ - \ - parasail/cpuid.c \ - parasail/memory.c \ - parasail/sg.c \ - parasail/sg_trace.c \ - parasail/sg_qx_dispatch.c \ - parasail/sg_qb_de_dispatch.c \ - parasail/sg_qe_db_dispatch.c \ - parasail/cigar.c + utility/sweatShop.C ifeq (${BUILDSTACKTRACE}, 1) @@ -73,24 +82,14 @@ endif SRC_INCDIRS := . \ - utility \ - parasail + utility SUBMAKEFILES := ifeq ($(BUILDTESTS), 1) -SUBMAKEFILES += tests/alignTest-ssw.mk \ - tests/alignTest-ksw2.mk \ - tests/bitsTest.mk \ +SUBMAKEFILES += tests/bitsTest.mk \ tests/filesTest.mk \ tests/intervalListTest.mk \ - tests/intervalsTest.mk \ tests/loggingTest.mk \ - tests/magicNumber.mk \ - tests/parasailTest.mk \ - tests/readLines.mk \ - tests/sequenceTest.mk \ - tests/stddevTest.mk \ - tests/systemTest.mk \ - tests/typesTest.mk + tests/stddevTest.mk endif diff --git a/ext/meryl/src/utility/src/tests/alignTest-ksw2.C b/ext/meryl/src/utility/src/tests/alignTest-ksw2.C index 0eab0b3..1e6504a 100644 --- a/ext/meryl/src/utility/src/tests/alignTest-ksw2.C +++ b/ext/meryl/src/utility/src/tests/alignTest-ksw2.C @@ -49,8 +49,8 @@ int main(int argc, char **argv) { dnaSeqFile *fileA, *fileB; dnaSeq dseqA, dseqB; - char const *seqA = nullptr; - char const *seqB = nullptr; + char *seqA = nullptr; + char *seqB = nullptr; //fprintf(stderr, "A -> %2u -> %c\n", encode2bitBase('A'), decode2bitBase(0)); assert(encode2bitBase('A') == 0); diff --git a/ext/meryl/src/utility/src/tests/bitsTest.C b/ext/meryl/src/utility/src/tests/bitsTest.C index 4403d6d..48b8b31 100644 --- a/ext/meryl/src/utility/src/tests/bitsTest.C +++ b/ext/meryl/src/utility/src/tests/bitsTest.C @@ -18,7 +18,6 @@ */ #include "bits.H" -#include "strings.H" #include "mt19937ar.H" char b1[65]; @@ -26,25 +25,6 @@ char b2[65]; char b3[65]; -void -testMasks(void) { - uint128 m128; - uint64 m64; - uint32 m32; - uint16 m16; - uint8 m8; - - for (uint32 ii=0; ii<=128; ii++) { - fprintf(stderr, "%3d: %s %s %s %s %s %s %s %s %s %s\n", ii, - toHex(buildLowBitMask(ii)), toHex(buildHighBitMask(ii)), - toHex(buildLowBitMask (ii)), toHex(buildHighBitMask (ii)), - toHex(buildLowBitMask (ii)), toHex(buildHighBitMask (ii)), - toHex(buildLowBitMask (ii)), toHex(buildHighBitMask (ii)), - toHex(buildLowBitMask (ii)), toHex(buildHighBitMask (ii))); - } -} - - void testLogBaseTwo(void) { uint64 val = 0; @@ -127,7 +107,7 @@ testBitArray(uint64 maxLength) { void testWordArray(uint64 wordSize) { - wordArray *wa = new wordArray(wordSize, 8 * 64, false); + wordArray *wa = new wordArray(wordSize, 8 * 64); for (uint32 ii=0; ii<1000; ii++) wa->set(ii, 0xffffffff); @@ -138,9 +118,7 @@ testWordArray(uint64 wordSize) { wa->show(); for (uint32 ii=0; ii<1000; ii++) - assert(wa->get(ii) == (ii & buildLowBitMask(wordSize))); - - fprintf(stderr, "Passed!\n"); + assert(wa->get(ii) == (ii & uint64MASK(wordSize))); delete wa; } @@ -294,7 +272,7 @@ testPrefixFree(uint32 type) { length += width[ii]; histo[width[ii]]++; - random[ii] = mt.mtRandom64() & buildLowBitMask(width[ii]); + random[ii] = mt.mtRandom64() & uint64MASK(width[ii]); if (random[ii] == 0) ii--; @@ -451,10 +429,6 @@ main(int argc, char **argv) { err++; } - else if (strcmp(argv[arg], "-masks") == 0) { - testMasks(); - } - else if (strcmp(argv[arg], "-logbasetwo") == 0) { testLogBaseTwo(); } @@ -464,24 +438,19 @@ main(int argc, char **argv) { } else if (strcmp(argv[arg], "-bitarray") == 0) { - if (++arg >= argc) - fprintf(stderr, "ERROR: -bitarray needs word-size argument.\n"), exit(1); + uint64 maxLength = strtouint64(argv[++arg]); - testBitArray(strtouint64(argv[arg])); + testBitArray(maxLength); } else if (strcmp(argv[arg], "-wordarray") == 0) { - if (++arg >= argc) - fprintf(stderr, "ERROR: -wordarray needs word-size argument.\n"), exit(1); + uint64 wordSize = strtouint64(argv[++arg]); - testWordArray(strtouint64(argv[arg])); + testWordArray(wordSize); } else if (strcmp(argv[arg], "-unary") == 0) { - if (++arg >= argc) - fprintf(stderr, "ERROR: -unary needs max-size argument.\n"), exit(1); - - uint32 maxSize = strtouint32(argv[arg]); + uint32 maxSize = strtouint32(argv[++arg]); #pragma omp parallel for for (uint32 xx=1; xx<=maxSize; xx++) { diff --git a/ext/meryl/src/utility/src/tests/bitsTest.mk b/ext/meryl/src/utility/src/tests/bitsTest.mk index 6d7a430..a29822a 100644 --- a/ext/meryl/src/utility/src/tests/bitsTest.mk +++ b/ext/meryl/src/utility/src/tests/bitsTest.mk @@ -1,8 +1,20 @@ + +# If 'make' isn't run from the root directory, we need to set these to +# point to the upper level build directory. +ifeq "$(strip ${BUILD_DIR})" "" + BUILD_DIR := ../$(OSTYPE)-$(MACHINETYPE)/obj +endif +ifeq "$(strip ${TARGET_DIR})" "" + TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE) +endif + TARGET := bitsTest SOURCES := bitsTest.C SRC_INCDIRS := .. ../utility TGT_LDFLAGS := -L${TARGET_DIR}/lib -TGT_LDLIBS := -l${MODULE} -TGT_PREREQS := lib${MODULE}.a +TGT_LDLIBS := -lcanu +TGT_PREREQS := libcanu.a + +SUBMAKEFILES := diff --git a/ext/meryl/src/utility/src/tests/filesTest.mk b/ext/meryl/src/utility/src/tests/filesTest.mk index db88fa6..469a199 100644 --- a/ext/meryl/src/utility/src/tests/filesTest.mk +++ b/ext/meryl/src/utility/src/tests/filesTest.mk @@ -1,8 +1,20 @@ + +# If 'make' isn't run from the root directory, we need to set these to +# point to the upper level build directory. +ifeq "$(strip ${BUILD_DIR})" "" + BUILD_DIR := ../$(OSTYPE)-$(MACHINETYPE)/obj +endif +ifeq "$(strip ${TARGET_DIR})" "" + TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE) +endif + TARGET := filesTest SOURCES := filesTest.C SRC_INCDIRS := .. ../utility TGT_LDFLAGS := -L${TARGET_DIR}/lib -TGT_LDLIBS := -l${MODULE} -TGT_PREREQS := lib${MODULE}.a +TGT_LDLIBS := -lcanu +TGT_PREREQS := libcanu.a + +SUBMAKEFILES := diff --git a/ext/meryl/src/utility/src/tests/intervalListTest.mk b/ext/meryl/src/utility/src/tests/intervalListTest.mk index 2a785cc..4ecfd65 100644 --- a/ext/meryl/src/utility/src/tests/intervalListTest.mk +++ b/ext/meryl/src/utility/src/tests/intervalListTest.mk @@ -1,8 +1,20 @@ + +# If 'make' isn't run from the root directory, we need to set these to +# point to the upper level build directory. +ifeq "$(strip ${BUILD_DIR})" "" + BUILD_DIR := ../$(OSTYPE)-$(MACHINETYPE)/obj +endif +ifeq "$(strip ${TARGET_DIR})" "" + TARGET_DIR := ../$(OSTYPE)-$(MACHINETYPE) +endif + TARGET := intervalListTest SOURCES := intervalListTest.C SRC_INCDIRS := .. ../utility TGT_LDFLAGS := -L${TARGET_DIR}/lib -TGT_LDLIBS := -l${MODULE} -TGT_PREREQS := lib${MODULE}.a +TGT_LDLIBS := -lcanu +TGT_PREREQS := libcanu.a + +SUBMAKEFILES := diff --git a/ext/meryl/src/utility/src/tests/intervalsTest.C b/ext/meryl/src/utility/src/tests/intervalsTest.C deleted file mode 100644 index 66501bb..0000000 --- a/ext/meryl/src/utility/src/tests/intervalsTest.C +++ /dev/null @@ -1,229 +0,0 @@ - -/****************************************************************************** - * - * This file is part of meryl-utility, a collection of miscellaneous code - * used by Meryl, Canu and others. - * - * This software is based on: - * 'Canu' v2.0 (https://github.com/marbl/canu) - * which is based on: - * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) - * the 'kmer package' r1994 (http://kmer.sourceforge.net) - * - * Except as indicated otherwise, this is a 'United States Government Work', - * and is released in the public domain. - * - * File 'README.licenses' in the root directory of this distribution - * contains full conditions and disclaimers. - */ - -#include "runtime.H" -#include "intervals.H" - -#include "mt19937ar.H" - -void -boringTest(void) { - bool errors = false; - intervals t1; - - t1.add_span(11, -4); - t1.add_position(0, 10); - t1.add_span(8, 12); - - errors |= ((t1.size() != 3) || - (t1.bgn(0) != 7) || (t1.end(0) != 11) || - (t1.bgn(1) != 0) || (t1.end(1) != 10) || - (t1.bgn(2) != 8) || (t1.end(2) != 20)); - - if (errors) { - fprintf(stderr, "BEFORE:\n"); - for (uint32 ii=0; ii t1; - intervals t2; - - t1.add_position(-30, -10); - t1.add_position( -5, 5); - t1.add_position( 10, 30); - - t2.setToInversion(-20, 20, t1); - - errors |= ((t2.size() != 4) || - (t2.bgn(0) != -10) || (t2.end(0) != 20) || - (t2.bgn(1) != -20) || (t2.end(1) != -5) || - (t2.bgn(2) != 5) || (t2.end(2) != 20) || - (t2.bgn(3) != -20) || (t2.end(3) != 10)); - - if (errors) { - fprintf(stderr, "BEFORE:\n"); - for (uint32 ii=0; ii il; - - // Add intervals to the list. - // Sum depths explicitly. - for (uint32 ii=0; ii de(il); - - // Over all the depth regions, subtract the computed depth from - // the explicit depth. - for (uint32 xx=0; xx %u\n", cc, depth[cc], depth[cc] - dpt); - depth[cc] -= dpt; - } - } - - // Every explicit depth should now be zero, even the ones - // not covered. - for (uint32 cc=0; ccname\n"); - fprintf(O, "ACGT\n"); - fprintf(O, "> name \n"); - fprintf(O, "ACGT\n"); - fprintf(O, "> name flags\n"); - fprintf(O, "ACGT\n"); - fprintf(O, "> name f l a g s \n"); - fprintf(O, "ACGT\n"); - AS_UTL_closeFile(O); - - dnaSeqFile F("sequenceTest.data.fasta"); - dnaSeq S; - - F.loadSequence(S); - assert(strcmp(S.ident(), "name") == 0); - assert(strcmp(S.flags(), "") == 0); - assert(strcmp(S.bases(), "ACGT") == 0); - - F.loadSequence(S); - assert(strcmp(S.ident(), "name") == 0); - assert(strcmp(S.flags(), "") == 0); - assert(strcmp(S.bases(), "ACGT") == 0); - - F.loadSequence(S); - assert(strcmp(S.ident(), "name") == 0); - assert(strcmp(S.flags(), "flags") == 0); - assert(strcmp(S.bases(), "ACGT") == 0); - - F.loadSequence(S); - assert(strcmp(S.ident(), "name") == 0); - assert(strcmp(S.flags(), "f l a g s") == 0); - assert(strcmp(S.bases(), "ACGT") == 0); - - AS_UTL_unlink("sequenceTest.data.fasta"); - - fprintf(stderr, "Success!\n"); - - return(0); -} - diff --git a/ext/meryl/src/utility/src/tests/sequenceTest.mk b/ext/meryl/src/utility/src/tests/sequenceTest.mk deleted file mode 100644 index ebd7f16..0000000 --- a/ext/meryl/src/utility/src/tests/sequenceTest.mk +++ /dev/null @@ -1,8 +0,0 @@ -TARGET := sequenceTest -SOURCES := sequenceTest.C - -SRC_INCDIRS := .. ../utility - -TGT_LDFLAGS := -L${TARGET_DIR}/lib -TGT_LDLIBS := -l${MODULE} -TGT_PREREQS := lib${MODULE}.a diff --git a/ext/meryl/src/utility/src/tests/stddevTest.C b/ext/meryl/src/utility/src/tests/stddevTest.C index cda859e..184cfd6 100644 --- a/ext/meryl/src/utility/src/tests/stddevTest.C +++ b/ext/meryl/src/utility/src/tests/stddevTest.C @@ -108,6 +108,7 @@ testBig(uint32 nSamples) { fprintf(stderr, "\n"); fprintf(stderr, "testBig for nSamples %u\n", nSamples); + fprintf(stderr, "\n"); for (uint32 ii=0; ii sd; - - sd.insert(0.000000); - sd.insert(d); - sd.insert(0.000000); - - sd.remove(0.000000); - sd.remove(0.000000); // Fails here; the two if's in remove() resolve. - - sum += sd.mean(); // Add d. - - assert(d - 0.00001 <= sd.mean()); - assert(sd.mean() <= d + 0.00001); - assert(sd.variance() == 0.0); - - sd.remove(d); - - sum += sd.mean(); // Add zero. - - assert(sd.mean() == 0.0); - assert(sd.variance() == 0.0); - } - - fprintf(stderr, " %18.16f\n", sum); -} - - - -// Same idea, but this one fails before we hit the -// reset for one item. Grrrr! -void -testStability2(uint32 n) { - double sum = 0.0; - stdDev sd; - - if (n == 1) { - fprintf(stderr, "\n"); - fprintf(stderr, "testStability2 (values should be positive zero)\n"); - } - - for (uint32 ii=0; ii= 0.0); - - sd.insert(0.000220); - sd.remove(0.000220); - fprintf(stderr, "%2u %26.24f\n", n, sd.variance()); - assert(sd.variance() >= 0.0); - - for (uint32 ii=0; iilen; - resizeArrayPair(_cigarCode, _cigarValu, 0, _cigarMax, cigar->len + 1); + resizeArrayPair(_cigarCode, _cigarValu, _cigarLen, _cigarMax, (uint32)(cigar->len + 1), resizeArray_doNothing); // If the alignment begins with a gap, remove it and adjust the positions. @@ -342,13 +342,13 @@ parasailLib::analyzeAlignment(void) { // Compute the same erate as overlapper does. - _erate = (double)(_aMis + _aGap) / std::min((_endA - _bgnA), (_endB - _bgnB)); + _erate = (double)(_aMis + _aGap) / min((_endA - _bgnA), (_endB - _bgnB)); // Allocate stuff for building a map between the A and B sequences and the // cigar string. - resizeArrayPair(_cigarMapBgn, _cigarMapEnd, 0, _cigarMapMax, _cigarLen); - resizeArray (_aMap, 0, _aMapMax, _aLen); + resizeArrayPair(_cigarMapBgn, _cigarMapEnd, 0, _cigarMapMax, _cigarLen, resizeArray_doNothing); + resizeArray (_aMap, 0, _aMapMax, _aLen, resizeArray_doNothing); uint32 apos = _bgnA; uint32 bpos = _bgnB; diff --git a/ext/meryl/src/utility/src/utility/align-parasail-driver.H b/ext/meryl/src/utility/src/utility/align-parasail-driver.H index a8e66a4..816c7d2 100644 --- a/ext/meryl/src/utility/src/utility/align-parasail-driver.H +++ b/ext/meryl/src/utility/src/utility/align-parasail-driver.H @@ -49,13 +49,13 @@ public: bool alignDovetail(char const *seqA, uint32 lenA, char const *seqB, uint32 lenB, bool verbose=false) { return(align(seqA, lenA, 0, lenA, - seqB, lenB, 0, lenB, verbose, parasail_sg_qb_de_trace /*_striped_32*/)); + seqB, lenB, 0, lenB, verbose, parasail_sg_qb_de_trace_striped_32)); }; bool alignDovetail(char const *seqA, uint32 lenA, int32 bgnA, int32 endA, char const *seqB, uint32 lenB, int32 bgnB, int32 endB, bool verbose=false) { return(align(seqA, lenA, bgnA, endA, - seqB, lenB, bgnB, endB, verbose, parasail_sg_qb_de_trace /*_striped_32*/)); + seqB, lenB, bgnB, endB, verbose, parasail_sg_qb_de_trace_striped_32)); }; // Align with free gaps on either end of s2. diff --git a/ext/meryl/src/utility/src/utility/align-ssw-driver.C b/ext/meryl/src/utility/src/utility/align-ssw-driver.C index 9468c2d..2a554bc 100644 --- a/ext/meryl/src/utility/src/utility/align-ssw-driver.C +++ b/ext/meryl/src/utility/src/utility/align-ssw-driver.C @@ -135,8 +135,8 @@ sswLib::align(char const *seqA_, uint32 seqlenA_, int32 bgnA_, int32 endA_, // Allocate space for at least lenA (lenB) things. - resizeArray(_intA, 0, _maxA, _lenA); - resizeArray(_intB, 0, _maxB, _lenB); + resizeArray(_intA, 0, _maxA, _lenA, resizeArray_doNothing); + resizeArray(_intB, 0, _maxB, _lenB, resizeArray_doNothing); // Convert the input sequences into integers. @@ -193,7 +193,7 @@ sswLib::align(char const *seqA_, uint32 seqlenA_, int32 bgnA_, int32 endA_, // Make space for the alignment, and copy it over. - resizeArrayPair(_cigarCode, _cigarValu, 0, _cigarMax, result->cigarLen + 1); + resizeArrayPair(_cigarCode, _cigarValu, _cigarLen, _cigarMax, (uint32)(result->cigarLen + 1), resizeArray_doNothing); for (int32 cc=0; cccigarLen; ++cc) { _cigarCode[cc] = "MIDNSHP=X"[result->cigar[cc] & 0xf]; @@ -359,13 +359,13 @@ sswLib::analyzeAlignment(void) { // Compute the same erate as overlapper does. - _erate = (double)(_aMis + _aGap) / std::min((_endA - _bgnA), (_endB - _bgnB)); + _erate = (double)(_aMis + _aGap) / min((_endA - _bgnA), (_endB - _bgnB)); // Allocate stuff for building a map between the A and B sequences and the // cigar string. - resizeArrayPair(_cigarMapBgn, _cigarMapEnd, 0, _cigarMapMax, _cigarLen); - resizeArray (_aMap, 0, _aMapMax, _aLen); + resizeArrayPair(_cigarMapBgn, _cigarMapEnd, 0, _cigarMapMax, _cigarLen, resizeArray_doNothing); + resizeArray (_aMap, 0, _aMapMax, _aLen, resizeArray_doNothing); uint32 apos = _bgnA; uint32 bpos = _bgnB; diff --git a/ext/meryl/src/utility/src/utility/arrays.H b/ext/meryl/src/utility/src/utility/arrays.H index a34af0b..4e30b55 100644 --- a/ext/meryl/src/utility/src/utility/arrays.H +++ b/ext/meryl/src/utility/src/utility/arrays.H @@ -21,85 +21,29 @@ #define ARRAYS_H #include "types.H" -#include - - -enum class _raAct { - doNothing = 0x00, - copyData = 0x01, - clearNew = 0x02, - copyDataClearNew = 0x03, -}; - - -inline // Combine two _raAct into one. -_raAct -operator|(_raAct a, _raAct b) { - - if (a == _raAct::doNothing) return(b); - if (b == _raAct::doNothing) return(a); - - if ((a == _raAct::copyData) && (b == _raAct::copyData)) return(_raAct::copyData); - if ((a == _raAct::copyData) && (b == _raAct::clearNew)) return(_raAct::copyDataClearNew); - if ((a == _raAct::clearNew) && (b == _raAct::copyData)) return(_raAct::copyDataClearNew); - if ((a == _raAct::clearNew) && (b == _raAct::clearNew)) return(_raAct::clearNew); - if (a == _raAct::copyDataClearNew) return(_raAct::copyDataClearNew); - if (b == _raAct::copyDataClearNew) return(_raAct::copyDataClearNew); - - assert(0); - return(_raAct::doNothing); -} - - -inline // Return true if _raAct a has property b set. -bool -operator&(_raAct a, _raAct b) { +#include - if ((a == _raAct::copyData) && (b == _raAct::copyData)) return(true); - if ((a == _raAct::copyDataClearNew) && (b == _raAct::copyData)) return(true); +using namespace std; - if ((a == _raAct::clearNew) && (b == _raAct::clearNew)) return(true); - if ((a == _raAct::copyDataClearNew) && (b == _raAct::clearNew)) return(true); - if ((a == _raAct::copyDataClearNew) && (b == _raAct::copyDataClearNew)) return(true); +const uint32 resizeArray_doNothing = 0x00; +const uint32 resizeArray_copyData = 0x01; +const uint32 resizeArray_clearNew = 0x02; - return(false); -} -// Allocate an array of size 'allocSize', and set 'arrayMax' to that value. -// By default. clear the array. template void -allocateArray(TT*& array, LL &arrayMax, uint64 allocSize, _raAct op=_raAct::clearNew) { +allocateArray(TT*& array, LL arrayMax, uint32 op=resizeArray_clearNew) { if (array != NULL) delete [] array; - arrayMax = allocSize; - array = new TT [allocSize]; - - assert(arrayMax == allocSize); // Make sure we don't truncate the value! + array = new TT [arrayMax]; - if (op == _raAct::clearNew) - memset(array, 0, sizeof(TT) * allocSize); -} - - -// Allocate an array of size 'allocSize'. -// By default, clear the array. -template -void -allocateArray(TT*& array, uint64 allocSize, _raAct op=_raAct::clearNew) { - - if (array != NULL) - delete [] array; - - array = new TT [allocSize]; - - if (op == _raAct::clearNew) - memset(array, 0, sizeof(TT) * allocSize); + if (op == resizeArray_clearNew) + memset(array, 0, sizeof(TT) * arrayMax); } @@ -142,78 +86,55 @@ duplicateArray(TT*& to, LL &toLen, LL &toMax, TT const *fr, LL frLen, LL frMax=0 } -// Set the array size to 'newMax'. -// No guards, the array will ALWAYS be reallocated. -// +// Set the array size to 'newMax'. No guards, the array will ALWAYS be reallocated. + template void -setArraySize(TT*& array, uint64 arrayLen, LL &arrayMax, uint64 newMax, _raAct op=_raAct::copyData) { +setArraySize(TT*& array, uint64 arrayLen, LL &arrayMax, uint64 newMax, uint32 op=resizeArray_copyData) { - arrayMax = newMax; - arrayLen = std::min(newMax, arrayLen); + arrayMax = newMax; + arrayLen = min(newMax, arrayLen); TT *copy = new TT [arrayMax]; - if ((array != nullptr) && - (arrayLen > 0) && - ((op == _raAct::copyData) || - (op == _raAct::copyDataClearNew))) - for (uint32 ii=0; ii 0)) + memcpy(copy, array, sizeof(TT) * arrayLen); delete [] array; array = copy; - if ((op == _raAct::clearNew) || - (op == _raAct::copyDataClearNew)) - for (uint32 ii=arrayLen; ii arrayLen)) + memset(array + arrayLen, 0, sizeof(TT) * (arrayMax - arrayLen)); } + // Ensure that there is enough space to hold one more element in the array. // Increase the array by 'moreSpace' if needed. -// -// With the array used as a stack, a call of -// increaseArray(arr, arrLen, arrMax, 32) -// will allocate 32 more elements if arrLen == arrMax, and do nothing -// otherwise. After the call, array element arr[arrLen] is guaranteed to -// exist. If arrLen > arrMax, see below. -// -// With the array used for random access, a call of -// increaseArray(arr, idx, arrMax, 32) -// will do nothing if idx < arrMax, and resize the array to have idx+32 -// elements otherwise. -// -// In both cases, if 'moreSpace' is 0, it is reset to 1. -// -// If the array is reallocated, the contents of the entire array are copied -// to the new space. New elements are NOT cleared to zero; override op as -// desired. -// + template void -increaseArray(TT*& array, uint64 idx, LL &arrayMax, uint64 moreSpace, _raAct op=_raAct::copyData) { - uint64 newMax = idx + ((moreSpace == 0) ? 1 : moreSpace); +increaseArray(TT*& array, uint64 arrayLen, LL &arrayMax, uint64 moreSpace) { + uint64 newMax = arrayMax + ((moreSpace == 0) ? 1 : moreSpace); - if (idx < arrayMax) + if (arrayLen < arrayMax) return; - setArraySize(array, arrayMax, arrayMax, newMax, op); + setArraySize(array, arrayLen, arrayMax, newMax, resizeArray_copyData); } template void -increaseArrayPair(T1*& array1, T2*& array2, uint64 idx, LL &arrayMax, uint64 moreSpace, _raAct op=_raAct::copyData) { - uint64 newMax = idx + ((moreSpace == 0) ? 1 : moreSpace); +increaseArrayPair(T1*& array1, T2*& array2, uint64 arrayLen, LL &arrayMax, uint64 moreSpace) { + uint64 newMax = arrayMax + ((moreSpace == 0) ? 1 : moreSpace); - if (idx < arrayMax) + if (arrayLen < arrayMax) return; - setArraySize(array1, arrayMax, arrayMax, newMax, op); - setArraySize(array2, arrayMax, arrayMax, newMax, op); + setArraySize(array1, arrayLen, arrayMax, newMax, resizeArray_copyData); + setArraySize(array2, arrayLen, arrayMax, newMax, resizeArray_copyData); } @@ -222,7 +143,7 @@ increaseArrayPair(T1*& array1, T2*& array2, uint64 idx, LL &arrayMax, uint64 mor template void -resizeArray(TT*& array, uint64 arrayLen, LL &arrayMax, uint64 newMax, _raAct op=_raAct::copyData) { +resizeArray(TT*& array, uint64 arrayLen, LL &arrayMax, uint64 newMax, uint32 op=resizeArray_copyData) { if (newMax <= arrayMax) return; @@ -233,7 +154,7 @@ resizeArray(TT*& array, uint64 arrayLen, LL &arrayMax, uint64 newMax, _raAct op= template void -resizeArrayPair(T1*& array1, T2*& array2, uint64 arrayLen, LL &arrayMax, uint64 newMax, _raAct op=_raAct::copyData) { +resizeArrayPair(T1*& array1, T2*& array2, uint64 arrayLen, LL &arrayMax, LL newMax, uint32 op=resizeArray_copyData) { if (newMax <= arrayMax) return; diff --git a/ext/meryl/src/utility/src/utility/bits-wordArray.C b/ext/meryl/src/utility/src/utility/bits-wordArray.C deleted file mode 100644 index f4f79cc..0000000 --- a/ext/meryl/src/utility/src/utility/bits-wordArray.C +++ /dev/null @@ -1,153 +0,0 @@ - -/****************************************************************************** - * - * This file is part of meryl-utility, a collection of miscellaneous code - * used by Meryl, Canu and others. - * - * This software is based on: - * 'Canu' v2.0 (https://github.com/marbl/canu) - * which is based on: - * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) - * the 'kmer package' r1994 (http://kmer.sourceforge.net) - * - * Except as indicated otherwise, this is a 'United States Government Work', - * and is released in the public domain. - * - * File 'README.licenses' in the root directory of this distribution - * contains full conditions and disclaimers. - */ - -#include "bits.H" - - -// -// At the default segmentSize of 64 KB = 524288 bits, we'll allocate 4096 -// 128-bit words per segment. With _wordsPerLock = 64, we'll then have -// 4096 / 64 = 64+1 locks per segment. -// -// Note that 'values' refers to the user-supplied data of some small size, -// while 'words' are the 128-bit machine words used to store the data. -// - -wordArray::wordArray(uint32 valueWidth, uint64 segmentSizeInBits, bool useLocks) { - - _valueWidth = valueWidth; // In bits. - _valueMask = buildLowBitMask(_valueWidth); - _segmentSize = segmentSizeInBits; // In bits. - - _valuesPerSegment = _segmentSize / _valueWidth; - - _wordsPerSegment = _segmentSize / 128; - _wordsPerLock = (useLocks == false) ? (0) : (64); - _locksPerSegment = (useLocks == false) ? (0) : (_segmentSize / 128 / _wordsPerLock + 1); - - _numValues = 0; - _numValuesLock.clear(); - - _segmentsLen = 0; - _segmentsMax = 16; - _segments = new uint128 * [_segmentsMax]; - _segLocks = new std::atomic_flag * [_segmentsMax]; - - for (uint32 ss=0; ss<_segmentsMax; ss++) { - _segments[ss] = nullptr; - _segLocks[ss] = nullptr; - } -} - - - -wordArray::~wordArray() { - for (uint32 i=0; i<_segmentsLen; i++) { - delete [] _segments[i]; - delete [] _segLocks[i]; - } - - delete [] _segments; - delete [] _segLocks; -} - - - -void -wordArray::clear(void) { - _numValues = 0; - _segmentsLen = 0; -} - - - -void -wordArray::allocate(uint64 nElements) { - uint64 segmentsNeeded = nElements / _valuesPerSegment + 1; - -#pragma omp critical (wordArrayAllocate) - { - - if (segmentsNeeded >= _segmentsMax) - resizeArrayPair(_segments, - _segLocks, - _segmentsLen, _segmentsMax, segmentsNeeded, - _raAct::copyData | _raAct::clearNew); - - for (uint32 seg=_segmentsLen; seg 0) { - _segLocks[seg] = new std::atomic_flag [ _locksPerSegment ]; - - for (uint32 ll=0; ll<_locksPerSegment; ll++) - _segLocks[seg][ll].clear(); - } - } - - _segmentsLen = segmentsNeeded; - - } // end critical -} - - - -void -wordArray::show(void) { - uint64 lastBit = _numValues * _valueWidth; - - fprintf(stderr, "wordArray:\n"); - fprintf(stderr, " numValues %10lu values\n", _numValues); - fprintf(stderr, " valueWidth %10lu bits\n", _valueWidth); - fprintf(stderr, " segmentSize %10lu bits\n", _segmentSize); - fprintf(stderr, " valuesPerSegment %10lu values\n", _valuesPerSegment); - fprintf(stderr, "\n"); - - // For each segment, dump full words, until we hit the end of data. - - for (uint64 ss=0; ss<_segmentsLen; ss++) { - fprintf(stderr, "Segment %lu:\n", ss); - - uint64 bitPos = ss * _valuesPerSegment * _valueWidth; - - for (uint64 ww=0; (ww < _wordsPerSegment) && (bitPos < lastBit); ww += 4) { - fprintf(stderr, "%5lu: %s %s %s %s\n", - ww, - (bitPos + 128 * 0 < lastBit) ? toHex(_segments[ss][ww+0]) : "", - (bitPos + 128 * 1 < lastBit) ? toHex(_segments[ss][ww+1]) : "", - (bitPos + 128 * 2 < lastBit) ? toHex(_segments[ss][ww+2]) : "", - (bitPos + 128 * 3 < lastBit) ? toHex(_segments[ss][ww+3]) : ""); - - bitPos += 128 * 4; - } - } - - fprintf(stderr, "\n"); - fprintf(stderr, "\n"); -} diff --git a/ext/meryl/src/utility/src/utility/bits-wordArray.H b/ext/meryl/src/utility/src/utility/bits-wordArray.H deleted file mode 100644 index 18f3a66..0000000 --- a/ext/meryl/src/utility/src/utility/bits-wordArray.H +++ /dev/null @@ -1,194 +0,0 @@ - -/****************************************************************************** - * - * This file is part of meryl-utility, a collection of miscellaneous code - * used by Meryl, Canu and others. - * - * This software is based on: - * 'Canu' v2.0 (https://github.com/marbl/canu) - * which is based on: - * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) - * the 'kmer package' r1994 (http://kmer.sourceforge.net) - * - * Except as indicated otherwise, this is a 'United States Government Work', - * and is released in the public domain. - * - * File 'README.licenses' in the root directory of this distribution - * contains full conditions and disclaimers. - */ - -// To be included only by bits.H -#ifndef BITS_IMPLEMENTATIONS -#error Include bits.H instead of bits-wordArray.H -#endif - - -inline -uint128 -wordArray::get(uint64 eIdx) { - uint64 seg = eIdx / _valuesPerSegment; // Which segment are we in? - uint64 pos = _valueWidth * (eIdx % _valuesPerSegment); // Bit position of the start of the value. - - uint64 wrd = pos / 128; // The word we start in. - uint64 bit = pos % 128; // Starting at this bit. - - uint128 val = 0; - - if (eIdx >= _numValues) - fprintf(stderr, "wordArray::get()-- eIdx %lu >= _numValues %lu\n", eIdx, _numValues); - assert(eIdx < _numValues); - - // If the value is all in one word, just shift that word to the right to - // put the proper bits in the proper position. - // - // Otherwise, the value spans two words. - // - Shift the first word left to place the right-most bits at the left end of the return value. - // - Shift the second word right so the left-most bits are at the right end of the return value. - // - // ssssssssssss <- second shift amount - // [--first-word--][--second-word--] - // [--value--] - // fffff <- first shift amount - - if (bit + _valueWidth <= 128) { - val = _segments[seg][wrd] >> (128 - _valueWidth - bit); - } - else { - uint32 fShift = _valueWidth - (128 - bit); - uint32 sShift = 128 - fShift; - - val = _segments[seg][wrd+0] << fShift; - val |= _segments[seg][wrd+1] >> sShift; - } - - // Finally, mask off the stuff we don't care about. - - val &= _valueMask; - - return(val); -} - - - -inline -void -wordArray::setLock(uint64 seg, uint64 lockW1, uint64 lockW2) { - - if (lockW1 == lockW2) { - while (_segLocks[seg][lockW1].test_and_set(std::memory_order_relaxed) == true) - ; - } - - else { - while (_segLocks[seg][lockW1].test_and_set(std::memory_order_relaxed) == true) - ; - while (_segLocks[seg][lockW2].test_and_set(std::memory_order_relaxed) == true) - ; - } -} - - - -inline -void -wordArray::relLock(uint64 seg, uint64 lockW1, uint64 lockW2) { - - if (lockW1 == lockW2) { - _segLocks[seg][lockW1].clear(); - } - else { - _segLocks[seg][lockW2].clear(); - _segLocks[seg][lockW1].clear(); - } -} - - - -inline -void -wordArray::setNval(uint32 eIdx) { - - while (_numValuesLock.test_and_set(std::memory_order_relaxed) == true) - ; - - if (eIdx >= _numValues) - _numValues = eIdx + 1; - - _numValuesLock.clear(); -} - - -inline -void -wordArray::set(uint64 eIdx, uint128 value) { - uint64 seg = eIdx / _valuesPerSegment; // Which segment are we in? - uint64 pos = _valueWidth * (eIdx % _valuesPerSegment); // Which word in the segment? - - uint64 wrd = pos / 128; // The word we start in. - uint64 bit = pos % 128; // Starting at this bit. - - uint64 lockW1 = 0; // Address of locks, computed inline with the - uint64 lockW2 = 0; // setLock() function call below. - - // Allocate more segment pointers and any missing segments. - - if (seg >= _segmentsLen) - allocate(eIdx); - - // Mask the value, just in case. - - value &= _valueMask; - - // Grab the locks for the two words we're going to be accessing. - - if (_wordsPerLock > 0) - setLock(seg, - lockW1 = (wrd + 0) / _wordsPerLock, - lockW2 = (wrd + 1) / _wordsPerLock); - - // Remember the largest element set. Used for: - // - failing if get() accesses something out of bounds....but doesn't - // catch if we access something unset in the middle. - // - debug usage in show() - - if (_wordsPerLock > 0) - setNval(eIdx); - else if (eIdx >= _numValues) - _numValues = eIdx+1; - - // Set the value in one word.... - // - // [--------------------] - // [value] - // lSave rSave - // - // Or split the value across two words. - // - // --lSave-- --rSave-- - // [--word--][--first-word--][--second-word--][--word--] - // [----value---=] - // lSize rSize - - if (bit + _valueWidth <= 128) { - uint32 lSave = bit; - uint32 rSave = 128 - _valueWidth - bit; - - _segments[seg][wrd] = (saveLeftBits(_segments[seg][wrd], lSave) | - (value << rSave) | - saveRightBits(_segments[seg][wrd], rSave)); - } - - else { - uint32 lSave = bit, rSave = 128 - _valueWidth - bit; - uint32 lSize = 128 - bit, rSize = _valueWidth - (128 - bit); - - _segments[seg][wrd+0] = saveLeftBits(_segments[seg][wrd+0], lSave) | (value >> rSize); - _segments[seg][wrd+1] = (value << rSave) | saveRightBits(_segments[seg][wrd+1], rSave); - } - - // Release the locks. - - if (_wordsPerLock > 0) - relLock(seg, lockW1, lockW2); -} - diff --git a/ext/meryl/src/utility/src/utility/bits.C b/ext/meryl/src/utility/src/utility/bits.C index dbf59f3..1d3ca6d 100644 --- a/ext/meryl/src/utility/src/utility/bits.C +++ b/ext/meryl/src/utility/src/utility/bits.C @@ -21,7 +21,6 @@ #include "files.H" - stuffedBits::stuffedBits(uint64 nBits) { _dataBlockLenMaxB = nBits; @@ -257,7 +256,7 @@ stuffedBits::loadFromBuffer(readBuffer *B) { _dataBlockBgn = new uint64 [inLen]; _dataBlockLen = new uint64 [inLen]; - resizeArray(_dataBlocks, _dataBlocksLen, _dataBlocksMax, inLen, _raAct::copyData | _raAct::clearNew); + resizeArray(_dataBlocks, _dataBlocksLen, _dataBlocksMax, inLen, resizeArray_copyData | resizeArray_clearNew); } // Update the parameters. @@ -359,7 +358,7 @@ stuffedBits::loadFromFile(FILE *F) { _dataBlockBgn = new uint64 [inLen]; _dataBlockLen = new uint64 [inLen]; - resizeArray(_dataBlocks, _dataBlocksLen, _dataBlocksMax, inLen, _raAct::copyData | _raAct::clearNew); + resizeArray(_dataBlocks, _dataBlocksLen, _dataBlocksMax, inLen, resizeArray_copyData | resizeArray_clearNew); } // Update the parameters. diff --git a/ext/meryl/src/utility/src/utility/bits.H b/ext/meryl/src/utility/src/utility/bits.H index b2f070b..4c2c372 100644 --- a/ext/meryl/src/utility/src/utility/bits.H +++ b/ext/meryl/src/utility/src/utility/bits.H @@ -25,7 +25,15 @@ #include "files.H" #include -#include + +// Define this to enable testing that the width of the data element is greater than zero. The +// uint64MASK() macro (bri.h) does not generate a mask for 0. Compiler warnings are issued, +// because you shouldn't use this in production code. +// +// As it's expensive, emit a warning if it's enabled. +// +//#define CHECK_WIDTH +//#warning CHECK_WIDTH is EXPENSIVE // Writing in the middle of data is toublesome. @@ -59,64 +67,22 @@ displayWord(uint64 word, char *buffer=NULL) { }; - -// Generate a bit mask on the low (0x000fff) or high bits (0xfff000). -// -// Algorithm: -// - set the return value to all 1's -// - shift left or right to keep the desired numBits in the word -// - reset to all 0's if the numBits is zero -// (if zero, 'r & -0' == 'r & 0000..000) -// (if not zero, 'r & -1' == 'r & 1111..111) -// - reset to all 1's if the numBits is larger than the number of bits in the word -// -template -uintType -buildLowBitMask(uint32 numBits) { - uintType r; - - r = ~((uintType)0); - r >>= 8 * sizeof(uintType) - numBits; - r &= -(uintType)(numBits != 0); - r |= -(uintType)(numBits > 8 * sizeof(uintType)); - - return(r); -} - -template -uintType -buildHighBitMask(uint32 numBits) { - uintType r; - - r = ~((uintType)0); - r <<= 8 * sizeof(uintType) - numBits; - r &= -(uintType)(numBits != 0); - r |= -(uintType)(numBits > 8 * sizeof(uintType)); - - return(r); -} - - - // Return bits in a word: // Keeping the rightmost 64-n bits (mask out the leftmost n bits) // Keeping the leftmost 64-n bits (mask out the rigthmost n bits) // -inline uint64 clearLeftBits (uint64 v, uint32 l) { if (l >= 64) return(0); return(v & (uint64max >> l)); }; -inline uint64 saveLeftBits (uint64 v, uint32 l) { if (l == 0) return(0); return(v & (uint64max << (64 - l))); }; -inline uint64 clearRightBits (uint64 v, uint32 r) { if (r >= 64) return(0); return(v & (uint64max << r)); }; -inline uint64 saveRightBits (uint64 v, uint32 r) { if (r == 0) return(0); return(v & (uint64max >> (64 - r))); }; - -inline uint64 clearMiddleBits(uint64 v, uint32 l, uint32 r) { return( saveRightBits(v, r) | saveLeftBits(v, l)); }; -inline uint64 saveMiddleBits(uint64 v, uint32 l, uint32 r) { return(clearRightBits(v, r) & clearLeftBits(v, l)); }; +inline uint64 clearLeftBits (uint64 v, uint32 l) { if (l >= 64) return(0); return(v & (0xffffffffffffffffllu >> l)); }; +inline uint64 saveLeftBits (uint64 v, uint32 l) { if (l == 0) return(0); return(v & (0xffffffffffffffffllu << (64 - l))); }; +inline uint64 clearRightBits (uint64 v, uint32 r) { if (r >= 64) return(0); return(v & (0xffffffffffffffffllu << r)); }; +inline uint64 saveRightBits (uint64 v, uint32 r) { if (r == 0) return(0); return(v & (0xffffffffffffffffllu >> (64 - r))); }; -inline uint128 clearLeftBits (uint128 v, uint32 l) { if (l >= 128) return(0); return(v & (uint128max >> l)); }; -inline uint128 saveLeftBits (uint128 v, uint32 l) { if (l == 0) return(0); return(v & (uint128max << (128 - l))); }; -inline uint128 clearRightBits (uint128 v, uint32 r) { if (r >= 128) return(0); return(v & (uint128max << r)); }; -inline uint128 saveRightBits (uint128 v, uint32 r) { if (r == 0) return(0); return(v & (uint128max >> (128 - r))); }; +inline uint64 clearMiddleBits(uint64 v, uint32 l, uint32 r) { + return(saveRightBits(v, r) | saveLeftBits(v, l)); +}; -inline uint128 clearMiddleBits(uint128 v, uint32 l, uint32 r) { return( saveRightBits(v, r) | saveLeftBits(v, l)); }; -inline uint128 saveMiddleBits(uint128 v, uint32 l, uint32 r) { return(clearRightBits(v, r) & clearLeftBits(v, l)); }; +inline uint64 saveMiddleBits(uint64 v, uint32 l, uint32 r) { + return(clearRightBits(v, r) & clearLeftBits(v, l)); +}; @@ -128,23 +94,23 @@ inline uint128 saveMiddleBits(uint128 v, uint32 l, uint32 r) { return(clearRi inline uint64 reverseBits64(uint64 x) { - x = ((x >> 1) & 0x5555555555555555llu) | ((x << 1) & 0xaaaaaaaaaaaaaaaallu); - x = ((x >> 2) & 0x3333333333333333llu) | ((x << 2) & 0xccccccccccccccccllu); - x = ((x >> 4) & 0x0f0f0f0f0f0f0f0fllu) | ((x << 4) & 0xf0f0f0f0f0f0f0f0llu); - x = ((x >> 8) & 0x00ff00ff00ff00ffllu) | ((x << 8) & 0xff00ff00ff00ff00llu); - x = ((x >> 16) & 0x0000ffff0000ffffllu) | ((x << 16) & 0xffff0000ffff0000llu); - x = ((x >> 32) & 0x00000000ffffffffllu) | ((x << 32) & 0xffffffff00000000llu); + x = ((x >> 1) & uint64NUMBER(0x5555555555555555)) | ((x << 1) & uint64NUMBER(0xaaaaaaaaaaaaaaaa)); + x = ((x >> 2) & uint64NUMBER(0x3333333333333333)) | ((x << 2) & uint64NUMBER(0xcccccccccccccccc)); + x = ((x >> 4) & uint64NUMBER(0x0f0f0f0f0f0f0f0f)) | ((x << 4) & uint64NUMBER(0xf0f0f0f0f0f0f0f0)); + x = ((x >> 8) & uint64NUMBER(0x00ff00ff00ff00ff)) | ((x << 8) & uint64NUMBER(0xff00ff00ff00ff00)); + x = ((x >> 16) & uint64NUMBER(0x0000ffff0000ffff)) | ((x << 16) & uint64NUMBER(0xffff0000ffff0000)); + x = ((x >> 32) & uint64NUMBER(0x00000000ffffffff)) | ((x << 32) & uint64NUMBER(0xffffffff00000000)); return(x); } inline uint32 reverseBits32(uint32 x) { - x = ((x >> 1) & 0x55555555lu) | ((x << 1) & 0xaaaaaaaalu); - x = ((x >> 2) & 0x33333333lu) | ((x << 2) & 0xcccccccclu); - x = ((x >> 4) & 0x0f0f0f0flu) | ((x << 4) & 0xf0f0f0f0lu); - x = ((x >> 8) & 0x00ff00fflu) | ((x << 8) & 0xff00ff00lu); - x = ((x >> 16) & 0x0000fffflu) | ((x << 16) & 0xffff0000lu); + x = ((x >> 1) & uint32NUMBER(0x55555555)) | ((x << 1) & uint32NUMBER(0xaaaaaaaa)); + x = ((x >> 2) & uint32NUMBER(0x33333333)) | ((x << 2) & uint32NUMBER(0xcccccccc)); + x = ((x >> 4) & uint32NUMBER(0x0f0f0f0f)) | ((x << 4) & uint32NUMBER(0xf0f0f0f0)); + x = ((x >> 8) & uint32NUMBER(0x00ff00ff)) | ((x << 8) & uint32NUMBER(0xff00ff00)); + x = ((x >> 16) & uint32NUMBER(0x0000ffff)) | ((x << 16) & uint32NUMBER(0xffff0000)); return(x); } @@ -152,17 +118,17 @@ reverseBits32(uint32 x) { inline uint64 uint64Swap(uint64 x) { - x = ((x >> 8) & 0x00ff00ff00ff00ffllu) | ((x << 8) & 0xff00ff00ff00ff00llu); - x = ((x >> 16) & 0x0000ffff0000ffffllu) | ((x << 16) & 0xffff0000ffff0000llu); - x = ((x >> 32) & 0x00000000ffffffffllu) | ((x << 32) & 0xffffffff00000000llu); + x = ((x >> 8) & uint64NUMBER(0x00ff00ff00ff00ff)) | ((x << 8) & uint64NUMBER(0xff00ff00ff00ff00)); + x = ((x >> 16) & uint64NUMBER(0x0000ffff0000ffff)) | ((x << 16) & uint64NUMBER(0xffff0000ffff0000)); + x = ((x >> 32) & uint64NUMBER(0x00000000ffffffff)) | ((x << 32) & uint64NUMBER(0xffffffff00000000)); return(x); } inline uint32 uint32Swap(uint32 x) { - x = ((x >> 8) & 0x00ff00fflu) | ((x << 8) & 0xff00ff00lu); - x = ((x >> 16) & 0x0000fffflu) | ((x << 16) & 0xffff0000lu); + x = ((x >> 8) & uint32NUMBER(0x00ff00ff)) | ((x << 8) & uint32NUMBER(0xff00ff00)); + x = ((x >> 16) & uint32NUMBER(0x0000ffff)) | ((x << 16) & uint32NUMBER(0xffff0000)); return(x); } @@ -177,23 +143,23 @@ uint16Swap(uint16 x) { inline uint32 countNumberOfSetBits32(uint32 x) { - x = ((x >> 1) & 0x55555555lu) + (x & 0x55555555lu); - x = ((x >> 2) & 0x33333333lu) + (x & 0x33333333lu); - x = ((x >> 4) & 0x0f0f0f0flu) + (x & 0x0f0f0f0flu); - x = ((x >> 8) & 0x00ff00fflu) + (x & 0x00ff00fflu); - x = ((x >> 16) & 0x0000fffflu) + (x & 0x0000fffflu); + x = ((x >> 1) & uint32NUMBER(0x55555555)) + (x & uint32NUMBER(0x55555555)); + x = ((x >> 2) & uint32NUMBER(0x33333333)) + (x & uint32NUMBER(0x33333333)); + x = ((x >> 4) & uint32NUMBER(0x0f0f0f0f)) + (x & uint32NUMBER(0x0f0f0f0f)); + x = ((x >> 8) & uint32NUMBER(0x00ff00ff)) + (x & uint32NUMBER(0x00ff00ff)); + x = ((x >> 16) & uint32NUMBER(0x0000ffff)) + (x & uint32NUMBER(0x0000ffff)); return(x); } inline uint64 countNumberOfSetBits64(uint64 x) { - x = ((x >> 1) & 0x5555555555555555llu) + (x & 0x5555555555555555llu); - x = ((x >> 2) & 0x3333333333333333llu) + (x & 0x3333333333333333llu); - x = ((x >> 4) & 0x0f0f0f0f0f0f0f0fllu) + (x & 0x0f0f0f0f0f0f0f0fllu); - x = ((x >> 8) & 0x00ff00ff00ff00ffllu) + (x & 0x00ff00ff00ff00ffllu); - x = ((x >> 16) & 0x0000ffff0000ffffllu) + (x & 0x0000ffff0000ffffllu); - x = ((x >> 32) & 0x00000000ffffffffllu) + (x & 0x00000000ffffffffllu); + x = ((x >> 1) & uint64NUMBER(0x5555555555555555)) + (x & uint64NUMBER(0x5555555555555555)); + x = ((x >> 2) & uint64NUMBER(0x3333333333333333)) + (x & uint64NUMBER(0x3333333333333333)); + x = ((x >> 4) & uint64NUMBER(0x0f0f0f0f0f0f0f0f)) + (x & uint64NUMBER(0x0f0f0f0f0f0f0f0f)); + x = ((x >> 8) & uint64NUMBER(0x00ff00ff00ff00ff)) + (x & uint64NUMBER(0x00ff00ff00ff00ff)); + x = ((x >> 16) & uint64NUMBER(0x0000ffff0000ffff)) + (x & uint64NUMBER(0x0000ffff0000ffff)); + x = ((x >> 32) & uint64NUMBER(0x00000000ffffffff)) + (x & uint64NUMBER(0x00000000ffffffff)); return(x); } @@ -422,62 +388,208 @@ private: //////////////////////////////////////// // -// wordArray - An array that efficiently stores non-machine-word size -// integer words by packing the bits into machine-size words. The array is -// variable length but not sparse - accessing element 1,000,000 will -// allocate elements 0 through 999,999. +// wordArray +// +// An array that efficiently stores non-machine-word size integer words by +// packing the bits into machine-size words. // -// The size, in bits, of each element is set at construction time. All -// elements must be the same size. +// The array is variable length, but not sparse. Accessing element +// 1,000,000 will allocate elements 0 through 999,999. // -// The elements are stored in a set of fixed-size blocks. The block size -// can also be set at construction time. Note that this is specified IN -// BITS. The default size is 64 KB per block. Decrease this if you know -// you only need a few KB to store all values, or if you are storing several -// GB of data. There is no real performance loss/gain; it just adjusts the -// number of blocks allocated. There might be a slight degradation in -// performance of the memory management system if millions of blocks are -// allocated. +// No array operator can be provided since we cannot return a reference to +// values across machine words, let alone a reference to a value inside a +// machine word. +// +// The constructor needs to know the size of the words being stored, +// and how many bits to store per allocation. // class wordArray { public: - wordArray(uint32 valueWidth, uint64 segmentsSizeInBits, bool useLocks); - ~wordArray(); + wordArray(uint32 wordWidth, uint32 segmentSize = 65536 * 8) { + _valueWidth = wordWidth; + _segmentSize = segmentSize; + _valuesPerSegment = (uint64)_segmentSize / (uint64)_valueWidth; - void clear(void); // Reset the array to zero, doesn't deallocate space. + _nextElement = 0; - void allocate(uint64 nElements); // Pre-allocate space for nElements. + _segmentsLen = 0; + _segmentsMax = 16; + _segments = new uint64 * [_segmentsMax]; - uint128 get(uint64 eIdx); // Get the value of element eIdx. - void set(uint64 eIdx, uint128 v); // Set the value of element eIdx to v. + for (uint32 ss=0; ss<_segmentsMax; ss++) + _segments[ss] = NULL; + } -public: - void show(void); // Dump the wordArray to the screen; debugging. + ~wordArray() { + for (uint32 i=0; i<_segmentsLen; i++) + delete [] _segments[i]; -private: - void setLock(uint64 seg, uint64 lockW1, uint64 lockW2); - void relLock(uint64 seg, uint64 lockW1, uint64 lockW2); - void setNval(uint32 eIdx); + delete [] _segments; + }; -private: - uint64 _valueWidth = 0; // Width of the values stored. - uint64 _valueMask = 0; // Mask the low _valueWidth bits - uint64 _segmentSize = 0; // Size, in bits, of each block of data. + void clear(void) { + _nextElement = 0; + _segmentsLen = 0; + }; + + void allocate(uint64 nElements) { + uint64 nSegs = nElements / _valuesPerSegment + 1; + + //fprintf(stderr, "wordArray::allocate()-- allocating space for " F_U64 " elements, in " F_U64 " segments.\n", + // nElements, nSegs); + + assert(_segmentsLen == 0); + + resizeArray(_segments, _segmentsLen, _segmentsMax, nSegs, resizeArray_copyData | resizeArray_clearNew); + + for (uint32 seg=0; seg> (64 - _valueWidth - bit); + } + + // Otherwise, the value spans two words. First, shift the first word so + // the end of it is at the start of the value. Then shift the second + // word to the start of it is at the end of the value. + // + // ssssssssssssssssssssss <- second shift + // [--word--][--first-word--][--second-word--][--word--] + // [--value--] + // fffff <- first shift + // + else { + uint32 fShift = _valueWidth - (64 - bit); + uint32 sShift = 64 - fShift; + + val = _segments[seg][wrd+0] << fShift; + val |= _segments[seg][wrd+1] >> sShift; + } + + // Finally, mask off the stuff we don't care about. + + val &= uint64MASK(_valueWidth); + + return(val); + }; + + void set(uint64 element, uint64 value) { + uint64 seg = element / _valuesPerSegment; // Which segment are we in? + uint64 pos = _valueWidth * (element % _valuesPerSegment); // Which word in the segment? + + uint64 wrd = pos / 64; // The word we start in. + uint64 bit = pos % 64; // Starting at this bit. + + if (element >= _nextElement) + _nextElement = element+1; + + if (seg >= _segmentsMax) + resizeArray(_segments, _segmentsLen, _segmentsMax, seg + 16, resizeArray_copyData | resizeArray_clearNew); + + while (_segmentsLen <= seg) { + _segments[_segmentsLen] = new uint64 [_segmentSize / 64]; + + memset(_segments[_segmentsLen], 0xff, sizeof(uint64) * _segmentSize / 64); + + _segmentsLen++; + } + + // Mask the value, just in case. + + value &= uint64MASK(_valueWidth); + + // Set the value in the segment. + + // [--------------------] + // [value] + // lSave rSave + // + if (bit + _valueWidth <= 64) { + uint32 lSave = bit; + uint32 rSave = 64 - _valueWidth - bit; + + _segments[seg][wrd] = (saveLeftBits(_segments[seg][wrd], lSave) | + (value << rSave) | + saveRightBits(_segments[seg][wrd], rSave)); + } + + // --lSave-- --rSave-- + // [--word--][--first-word--][--second-word--][--word--] + // [----value---=] + // lSize rSize + // + else { + uint32 lSave = bit, rSave = 128 - _valueWidth - bit; + uint32 lSize = 64 - bit, rSize = _valueWidth - (64 - bit); + + _segments[seg][wrd+0] = saveLeftBits(_segments[seg][wrd+0], lSave) | (value >> rSize); + _segments[seg][wrd+1] = (value << rSave) | saveRightBits(_segments[seg][wrd+1], rSave); + } + }; + + void show(void) { + fprintf(stderr, "wordArray: valueWidth %2" F_U32P "\n", _valueWidth); + fprintf(stderr, "wordArray: segmentSize %8" F_U64P " valuesPerSegment %8" F_U64P "\n", _segmentSize, _valuesPerSegment); + fprintf(stderr, "\n"); + + uint32 bit = 64; + uint32 word = 0; + char bits[65]; + + for (uint32 ss=0; ss<_segmentsLen; ss++) { + fprintf(stderr, "Segment %u:\n", ss); + + for(uint32 wrd=0, bit=0; bit<_valuesPerSegment * _valueWidth; bit++) { + if ((bit % 64) == 0) { + displayWord(_segments[ss][wrd++], bits); + } + + if ((bit % _valueWidth) == 0) + fprintf(stderr, "word %2u: ", wrd); + + fprintf(stderr, "%c", bits[bit % 64]); - uint64 _segmentsLen = 0; // Number of blocks in use. - uint64 _segmentsMax = 0; // Number of block pointers allocated. - uint128 **_segments = nullptr; // List of blocks allocated. + if ((bit % _valueWidth) == _valueWidth - 1) + fprintf(stderr, "\n"); + } + } + + fprintf(stderr, "\n"); + fprintf(stderr, "\n"); + } + +private: + uint32 _valueWidth; + uint64 _segmentSize; + uint64 _valuesPerSegment; - std::atomic_flag **_segLocks = nullptr; // Locks on pieces of the segments. + uint64 _nextElement; // the first invalid element + + uint64 _segmentsLen; + uint64 _segmentsMax; + uint64 **_segments; }; @@ -753,13 +865,5 @@ private: }; -// Implementations. - -#define BITS_IMPLEMENTATIONS - -#include "bits-wordArray.H" - -#undef BITS_IMPLEMENTATIONS - #endif // LIBBITS_H diff --git a/ext/meryl/src/utility/src/utility/edlib.C b/ext/meryl/src/utility/src/utility/edlib.C index 8120c73..9658e74 100644 --- a/ext/meryl/src/utility/src/utility/edlib.C +++ b/ext/meryl/src/utility/src/utility/edlib.C @@ -437,7 +437,7 @@ void edlibAlignmentToStrings(const unsigned char* alignment, int alignmentLength } void -edlibAlignmentToStrings(EdlibAlignResult const &result, +edlibAlignmentToStrings(EdlibAlignResult result, const char *qry, const int qryLength, const char *tgt, const int tgtLength, char *qryAln, diff --git a/ext/meryl/src/utility/src/utility/edlib.H b/ext/meryl/src/utility/src/utility/edlib.H index 2b10740..07050a0 100644 --- a/ext/meryl/src/utility/src/utility/edlib.H +++ b/ext/meryl/src/utility/src/utility/edlib.H @@ -270,7 +270,7 @@ void edlibAlignmentToStrings(const unsigned char* alignment, int alignmentLength char *tgt_aln_str, char *qry_aln_str); -void edlibAlignmentToStrings(EdlibAlignResult const &result, +void edlibAlignmentToStrings(EdlibAlignResult result, const char *qry, const int qryLength, const char *tgt, const int tgtLength, char *qryAln, diff --git a/ext/meryl/src/utility/src/utility/files-buffered.C b/ext/meryl/src/utility/src/utility/files-buffered.C index 9d96fe8..5c5a568 100644 --- a/ext/meryl/src/utility/src/utility/files-buffered.C +++ b/ext/meryl/src/utility/src/utility/files-buffered.C @@ -71,9 +71,10 @@ readBuffer::initialize(const char *filename, uint64 bufferMax) { _ignoreCR = true; _bufferBgn = 0; + _bufferLen = 0; _bufferPos = 0; - _bufferLen = 0; + _bufferMax = (bufferMax == 0) ? 32 * 1024 : bufferMax; _buffer = new char [_bufferMax + 1]; @@ -110,9 +111,10 @@ readBuffer::readBuffer(FILE *file, uint64 bufferMax) { _ignoreCR = true; _bufferBgn = 0; + _bufferLen = 0; _bufferPos = 0; - _bufferLen = 0; + _bufferMax = (bufferMax == 0) ? 32 * 1024 : bufferMax; _buffer = new char [_bufferMax + 1]; @@ -142,17 +144,17 @@ readBuffer::~readBuffer() { void -readBuffer::fillBuffer(void) { +readBuffer::fillBuffer(uint64 extra) { // If there is still stuff in the buffer, no need to fill. - if (_bufferPos < _bufferLen) + if (_bufferPos + extra < _bufferLen) return; _bufferBgn += _bufferLen; + _bufferLen = 0; _bufferPos = 0; - _bufferLen = 0; assert(_filePos == _bufferBgn); @@ -392,7 +394,7 @@ readBuffer::readIFFchunk(char*name, uint8 *&data, uint32 &dataLen, uint32 &dataM // Allocate space for the data. - resizeArray(data, 0, dataMax, dataLen); + resizeArray(data, 0, dataMax, dataLen, resizeArray_doNothing); // Copy the data to 'data'. diff --git a/ext/meryl/src/utility/src/utility/files-buffered.H b/ext/meryl/src/utility/src/utility/files-buffered.H index 345590c..034352a 100644 --- a/ext/meryl/src/utility/src/utility/files-buffered.H +++ b/ext/meryl/src/utility/src/utility/files-buffered.H @@ -87,7 +87,7 @@ public: const char *filename(void) { return(_filename); }; private: - void fillBuffer(void); + void fillBuffer(uint64 extra=0); void init(int fileptr, const char *filename, uint64 bufferMax); char _filename[FILENAME_MAX+1]; @@ -102,9 +102,10 @@ private: bool _ignoreCR; // Ignore blasted DOS CR letters in read() and readuntil(). uint64 _bufferBgn; // File position where this buffer is from. + uint64 _bufferLen; // Length of the valid data in the buffer. uint64 _bufferPos; // Position in the buffer we're at. - uint64 _bufferLen; // Length of the valid data in the buffer. + uint64 _bufferMax; // Size of _buffer allocation. char *_buffer; // Data! }; diff --git a/ext/meryl/src/utility/src/utility/files-compressed.C b/ext/meryl/src/utility/src/utility/files-compressed.C index 5b4f9a5..6588a17 100644 --- a/ext/meryl/src/utility/src/utility/files-compressed.C +++ b/ext/meryl/src/utility/src/utility/files-compressed.C @@ -44,86 +44,25 @@ compressedFileType(char const *filename) { -static -bool -pigzAvailable(void) { - FILE *F = popen("pigz -h > /dev/null 2>&1", "r"); - - if (F == nullptr) - return(false); - - int32 e = pclose(F); - - return(e == 0); // If no error, then 'pigz' was able to run. -} - - - compressedFileReader::compressedFileReader(const char *filename) { + char cmd[FILENAME_MAX]; + int32 len = 0; _file = NULL; _filename = duplicateString(filename); - - _type = compressedFileType(_filename); - _pipe = false; _stdi = false; - reopen(); -} - - - -compressedFileReader::~compressedFileReader() { - - if (_file == NULL) - return; - - if (_stdi) - return; - - if (_pipe) - pclose(_file); - else - AS_UTL_closeFile(_file); - - delete [] _filename; -} - - - -void -compressedFileReader::reopen(void) { - char cmd[FILENAME_MAX]; - - int32 nThreads = omp_get_max_threads(); - bool pigz = false; - - // If input from stdin, do nothing. reopen() on this makes no sense, - // and doing nothing is _possibly_ more correct than failing. - if (_stdi) - return; - - // Close any existing file. - if ((_file) && (_pipe == true)) pclose(_file); - if ((_file) && (_pipe == false)) AS_UTL_closeFile(_file); - - // Blow up if the file doesn't exist. - if ((_type != cftSTDIN) && (fileExists(_filename) == false)) - fprintf(stderr, "ERROR: Failed to open input file '%s': %s\n", _filename, strerror(ENOENT)), exit(1); + cftType ft = compressedFileType(_filename); - if (_type == cftGZ) - pigz = pigzAvailable(); + if ((ft != cftSTDIN) && (fileExists(_filename) == false)) + fprintf(stderr, "ERROR: Failed to open input file '%s': %s\n", _filename, strerror(errno)), exit(1); - // Open the file! errno = 0; - switch (_type) { + switch (ft) { case cftGZ: - if (pigz) - snprintf(cmd, FILENAME_MAX, "pigz -dc -p %d '%s'", nThreads, _filename); - else - snprintf(cmd, FILENAME_MAX, "gzip -dc '%s'", _filename); + snprintf(cmd, FILENAME_MAX, "gzip -dc '%s'", _filename); _file = popen(cmd, "r"); _pipe = true; break; @@ -138,6 +77,11 @@ compressedFileReader::reopen(void) { snprintf(cmd, FILENAME_MAX, "xz -dc '%s'", _filename); _file = popen(cmd, "r"); _pipe = true; + + if (_file == NULL) // popen() returns NULL on error. It does not reliably set errno. + fprintf(stderr, "ERROR: Failed to open input file '%s': popen() returned NULL\n", _filename), exit(1); + + errno = 0; break; case cftSTDIN: @@ -151,18 +95,26 @@ compressedFileReader::reopen(void) { break; } - // Catch errors. - // - popen() does not set errno, so all we can do is fail. - // - otherwise, we can say something intelligent. + if (errno) + fprintf(stderr, "ERROR: Failed to open input file '%s': %s\n", _filename, strerror(errno)), exit(1); +} - if (_file == nullptr) { - if (_pipe) - fprintf(stderr, "ERROR: Failed to open file with command '%s'\n", cmd); - else - fprintf(stderr, "ERROR: Failed to open input file '%s': %s\n", _filename, strerror(errno)); - exit(1); - } + +compressedFileReader::~compressedFileReader() { + + if (_file == NULL) + return; + + if (_stdi) + return; + + if (_pipe) + pclose(_file); + else + AS_UTL_closeFile(_file); + + delete [] _filename; } @@ -170,8 +122,8 @@ compressedFileReader::reopen(void) { compressedFileWriter::compressedFileWriter(const char *filename, int32 level) { char cmd[FILENAME_MAX]; - int32 nThreads = omp_get_max_threads(); - bool pigz = false; + int32 nThreads = omp_get_max_threads(); + bool pigzAvailable = false; _file = NULL; _filename = duplicateString(filename); @@ -182,8 +134,22 @@ compressedFileWriter::compressedFileWriter(const char *filename, int32 level) { // Decide if we have pigz or gzip available. - if (ft == cftGZ) - pigz = pigzAvailable(); + if (ft == cftGZ) { + snprintf(cmd, FILENAME_MAX, "pigz -h > /dev/null 2>&1"); + + FILE *F = popen(cmd, "r"); + int32 e = pclose(F); + + if (e == 0) + pigzAvailable = true; + } + +#if 0 + if (pigzAvailable) + fprintf(stderr, "Using pigz for compression.\n"); + else + fprintf(stderr, "Using gzip for compression.\n"); +#endif // Open the output processor for input. @@ -191,7 +157,7 @@ compressedFileWriter::compressedFileWriter(const char *filename, int32 level) { switch (ft) { case cftGZ: - if (pigz) + if (pigzAvailable) snprintf(cmd, FILENAME_MAX, "pigz -%dc -p %d > '%s'", level, nThreads, _filename); else snprintf(cmd, FILENAME_MAX, "gzip -%dc > '%s'", level, _filename); diff --git a/ext/meryl/src/utility/src/utility/files-compressed.H b/ext/meryl/src/utility/src/utility/files-compressed.H index e716d8c..f05cb64 100644 --- a/ext/meryl/src/utility/src/utility/files-compressed.H +++ b/ext/meryl/src/utility/src/utility/files-compressed.H @@ -39,8 +39,6 @@ public: compressedFileReader(char const *filename); ~compressedFileReader(); - void reopen(void); - FILE *operator*(void) { return(_file); }; FILE *file(void) { return(_file); }; @@ -51,13 +49,10 @@ public: (_stdi == false)); }; private: - FILE *_file; - char *_filename; - - cftType _type; - - bool _pipe; - bool _stdi; + FILE *_file; + char *_filename; + bool _pipe; + bool _stdi; }; diff --git a/ext/meryl/src/utility/src/utility/files.C b/ext/meryl/src/utility/src/utility/files.C index b94e0d0..36a663d 100644 --- a/ext/meryl/src/utility/src/utility/files.C +++ b/ext/meryl/src/utility/src/utility/files.C @@ -69,7 +69,7 @@ writeToFile(void const *objects, // writing 16 GB of data at once; it seems to truncate to 32-bit somewhere. while (nWritten < nObjects) { - uint64 toWrite = std::min(blockSize, nObjects - nWritten); + uint64 toWrite = min(blockSize, nObjects - nWritten); errno = 0; uint64 written = fwrite(((char *)objects) + nWritten * objectSize, objectSize, toWrite, file); @@ -102,7 +102,7 @@ loadFromFile(void *objects, // we still read in 32 MB chunks. while (nLoaded < nObjects) { - uint64 toLoad = std::min(blockSize, nObjects - nLoaded); + uint64 toLoad = min(blockSize, nObjects - nLoaded); errno = 0; uint64 loaded = fread(((char *)objects) + nLoaded * objectSize, objectSize, toLoad, file); @@ -156,7 +156,7 @@ readLine(char *&L, uint32 &Llen, uint32 &Lmax, FILE *F) { return(false); if ((L == NULL) || (Lmax == 0)) - allocateArray(L, Lmax, 4, resizeArray_clearNew); + allocateArray(L, Lmax = 4, resizeArray_clearNew); L[Lmax-2] = 0; L[Lmax-1] = 0; @@ -188,7 +188,7 @@ readLine(char *&L, uint32 &Llen, uint32 &Lmax, FILE *F) { // Trim trailing whitespace. - while ((Llen > 0) && (isWhiteSpace(L[Llen-1]))) + while ((Llen > 0) && (isspace(L[Llen-1]))) L[--Llen] = 0; return(true); @@ -205,7 +205,7 @@ AS_UTL_readLine(char *&L, uint32 &Llen, uint32 &Lmax, FILE *F) { return(false); if ((L == NULL) || (Lmax == 0)) - allocateArray(L, Lmax, 1024); + allocateArray(L, Lmax = 1024, resizeArray_clearNew); Llen = 0; @@ -217,7 +217,7 @@ AS_UTL_readLine(char *&L, uint32 &Llen, uint32 &Lmax, FILE *F) { while ((feof(F) == false) && (ch != '\n')) { if (Llen + 1 >= Lmax) - resizeArray(L, Llen, Lmax, Lmax + growth, _raAct::copyData | _raAct::clearNew); // Grow the array. + resizeArray(L, Llen, Lmax, Lmax + growth, resizeArray_copyData | resizeArray_clearNew); // Grow the array. L[Llen++] = ch; @@ -230,7 +230,7 @@ AS_UTL_readLine(char *&L, uint32 &Llen, uint32 &Lmax, FILE *F) { // Trim trailing whitespace. - while ((Llen > 0) && (isWhiteSpace(L[Llen-1]))) + while ((Llen > 0) && (isspace(L[Llen-1]))) L[--Llen] = 0; return(true); @@ -344,27 +344,6 @@ AS_UTL_rename(char const *oldname, char const *newname) { -void -AS_UTL_rename(char const *oldprefix, char oldseparator, char const *oldsuffix, - char const *newprefix, char newseparator, char const *newsuffix) { - char oldpath[FILENAME_MAX+1] = {0}; - char newpath[FILENAME_MAX+1] = {0}; - - snprintf(oldpath, FILENAME_MAX, "%s%c%s", oldprefix, oldseparator, oldsuffix); - snprintf(newpath, FILENAME_MAX, "%s%c%s", newprefix, newseparator, newsuffix); - - if (pathExists(oldpath) == false) - return; - - errno = 0; - rename(oldpath, newpath); - if (errno) - fprintf(stderr, "AS_UTL_renane()-- Failed to rename file '%s' to '%s': %s\n", - oldpath, newpath, strerror(errno)), exit(1); -} - - - // Remove ALL write bits from a given path. bool AS_UTL_makeReadOnly(char const *prefix, char separator, char const *suffix) { @@ -542,41 +521,6 @@ AS_UTL_sizeOfFile(FILE *file) { -uint64 -AS_UTL_timeOfFile(char const *path) { - struct stat s; - - errno = 0; - if (stat(path, &s) == -1) - fprintf(stderr, "Failed to stat() file '%s': %s\n", path, strerror(errno)), exit(1); - -#ifdef __APPLE__ - return(s.st_mtimespec.tv_sec); -#else - return(s.st_mtim.tv_sec); -#endif -} - - - -uint64 -AS_UTL_timeOfFile(FILE *file) { - struct stat s; - off_t size = 0; - - errno = 0; - if (fstat(fileno(file), &s) == -1) - fprintf(stderr, "Failed to stat() FILE*: %s\n", strerror(errno)), exit(1); - -#ifdef __APPLE__ - return(s.st_mtimespec.tv_sec); -#else - return(s.st_mtim.tv_sec); -#endif -} - - - off_t AS_UTL_ftell(FILE *stream) { @@ -715,7 +659,7 @@ findSharedFile(char const *relpath, char const *filename) { void -AS_UTL_loadFileList(char const *fileName, std::vector &fileList) { +AS_UTL_loadFileList(char const *fileName, vector &fileList) { FILE *F = AS_UTL_openInputFile(fileName); @@ -869,10 +813,9 @@ AS_UTL_writeFastA(FILE *f, char const *s, int sl, int bl, char const *h, ...) { va_list ap; - int olen = sl + ((bl == 0) ? (1) : (sl / bl)) + 2; - char *o = new char [olen]; - int si = 0; - int oi = 0; + char *o = new char [sl + sl / ((bl == 0) ? sl : bl) + 2]; + int si = 0; + int oi = 0; while (si < sl) { o[oi++] = s[si++]; @@ -880,10 +823,8 @@ AS_UTL_writeFastA(FILE *f, if (bl != 0 && (si % bl) == 0) o[oi++] = '\n'; } - - if ((oi == 0) || (o[oi-1] != '\n')) + if (o[oi-1] != '\n') o[oi++] = '\n'; - o[oi] = 0; va_start(ap, h); @@ -953,46 +894,3 @@ AS_UTL_writeFastQ(FILE *f, -// A rather complicated output function. -// if seq is FASTQ and not wanting FASTA output -> FASTQ -// if seq is FASTA and wanting FASTQ output -> FASTQ with fixed QV -// else -> FASTA -// -// The else cases are -// seq is FASTQ and want FASTA output -// seq is FASTA and want FASTA output -// seq is FASTA and not want FASTQ output -// -void -outputSequence(FILE *OUT, - char const *outputName, - char const *outputBases, - uint8 const *outputQuals, uint32 outputBasesLen, - bool isFASTA, - bool isFASTQ, - bool outputFASTA, - bool outputFASTQ, - uint8 QV) { - - if ((isFASTQ == true) && (outputFASTA == false)) - AS_UTL_writeFastQ(OUT, - outputBases, outputBasesLen, - outputQuals, outputBasesLen, "@%s\n", outputName); - - else if ((isFASTA == true) && (outputFASTQ == true)) { - uint8 *qvs = new uint8 [outputBasesLen]; - - for (uint32 ii=0; ii%s\n", outputName); -} diff --git a/ext/meryl/src/utility/src/utility/files.H b/ext/meryl/src/utility/src/utility/files.H index a4f26e2..01c70f8 100644 --- a/ext/meryl/src/utility/src/utility/files.H +++ b/ext/meryl/src/utility/src/utility/files.H @@ -21,8 +21,11 @@ #define FILES_H #include "types.H" + #include +using namespace std; + // Provides a safe and reliable mechanism for reading / writing // binary data. @@ -48,8 +51,6 @@ void AS_UTL_symlink(char const *pathToFile, char const *pathToLink); void AS_UTL_unlink(char const *prefix, char separator='.', char const *suffix=NULL); void AS_UTL_rename(char const *oldname, char const *newname); -void AS_UTL_rename(char const *oldprefix, char oldseparator, char const *oldsuffix, - char const *newprefix, char newseparator, char const *newsuffix); bool AS_UTL_makeReadOnly(char const *prefix, char separator='.', char const *suffix=NULL); bool AS_UTL_makeWritable(char const *prefix, char separator='.', char const *suffix=NULL); @@ -62,9 +63,6 @@ bool directoryExists(char const *prefix, char separator='.', char const *suff off_t AS_UTL_sizeOfFile(char const *path); off_t AS_UTL_sizeOfFile(FILE *file); -uint64 AS_UTL_timeOfFile(char const *path); -uint64 AS_UTL_timeOfFile(FILE *file); - off_t AS_UTL_ftell(FILE *stream); void AS_UTL_fseek(FILE *stream, off_t offset, int whence); @@ -72,7 +70,7 @@ void AS_UTL_fseek(FILE *stream, off_t offset, int whence); char const *findSharedFile(char const *relpath, char const *filename); // Read a file-of-files into a vector -void AS_UTL_loadFileList(char const *fileName, std::vector &FILE); +void AS_UTL_loadFileList(char const *fileName, vector &FILE); FILE *AS_UTL_openInputFile (char const *prefix, char separator='.', char const *suffix=NULL, bool doOpen=true); FILE *AS_UTL_openOutputFile(char const *prefix, char separator='.', char const *suffix=NULL, bool doOpen=true); @@ -224,19 +222,6 @@ AS_UTL_writeFastQ(FILE *f, uint8 const *q, int ql, // As Sanger QV, from integer values char const *h, ...); -// Writes FASTA or FASTQ, depending on what data is present and what format -// is explicitly desired. -void -outputSequence(FILE *OUT, - char const *outputName, - char const *outputBases, - uint8 const *outputQuals, uint32 outputBasesLen, - bool isFASTA, - bool isFASTQ, - bool outputFASTA, - bool outputFASTQ, - uint8 QV); - #include "files-compressed.H" #include "files-buffered.H" diff --git a/ext/meryl/src/utility/src/utility/intervalList.H b/ext/meryl/src/utility/src/utility/intervalList.H index cef8700..45d6efe 100644 --- a/ext/meryl/src/utility/src/utility/intervalList.H +++ b/ext/meryl/src/utility/src/utility/intervalList.H @@ -158,8 +158,8 @@ intervalList::merge(iNum minOverlap) { if ((_list[thisI]._end >= _list[nextI]._end) || (_list[thisI]._end >= _list[nextI]._bgn + minOverlap)) { - _list[thisI]._end = std::max(_list[nextI]._end, _list[thisI]._end); - _list[thisI]._cnt += _list[nextI]._cnt; + _list[thisI]._end = max(_list[nextI]._end, _list[thisI]._end); + _list[thisI]._cnt += _list[nextI]._cnt; nextI++; } @@ -237,8 +237,8 @@ intervalList::invert(iNum invlo, iNum invhi) { inv[invLen++] = { invlo, _list[0]._bgn, 1 }; for (uint32 i=1; i<_listLen; i++) { - iNum bgn = std::max(invlo, _list[i-1]._end); - iNum end = std::min(_list[i]._bgn, invhi); + iNum bgn = max(invlo, _list[i-1]._end); + iNum end = min(_list[i]._bgn, invhi); if (bgn < end) inv[invLen++] = { bgn, end, 1 }; diff --git a/ext/meryl/src/utility/src/utility/intervals-implementation.H b/ext/meryl/src/utility/src/utility/intervals-implementation.H deleted file mode 100644 index c711704..0000000 --- a/ext/meryl/src/utility/src/utility/intervals-implementation.H +++ /dev/null @@ -1,423 +0,0 @@ - -/****************************************************************************** - * - * This file is part of meryl-utility, a collection of miscellaneous code - * used by Meryl, Canu and others. - * - * This software is based on: - * 'Canu' v2.0 (https://github.com/marbl/canu) - * which is based on: - * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) - * the 'kmer package' r1994 (http://kmer.sourceforge.net) - * - * Except as indicated otherwise, this is a 'United States Government Work', - * and is released in the public domain. - * - * File 'README.licenses' in the root directory of this distribution - * contains full conditions and disclaimers. - */ - -#include "arrays.H" -#include - - -#ifndef INTERVALS_IMPLEMENTATION -#error Include intervals.H instead of intervals-implementation.H -#else - - -template -void -intervals::add_position(iNum bgn, iNum end) { - - if (bgn > end) - fprintf(stderr, "intervals::add_position()-- ERROR: bgn=%u > end=%u\n", bgn, end); - assert(bgn <= end); - - if (_listMax == 0) - allocateArray(_list, _listMax, 32); - - increaseArray(_list, _listLen, _listMax, _listMax / 4); - - _list[_listLen]._bgn = bgn; - _list[_listLen]._end = end; - _list[_listLen]._num = 1; - - _listLen++; - - _isSorted = false; - _isSquashed = false; -} - - - -template -void -intervals::add(intervals const &that) { - - resizeArray(_list, _listLen, _listMax, _listLen + that._listLen); - - for (uint32 ii=0; ii -void -intervals::remove(uint32 idx) { - - assert(idx < _listLen); - - for (uint32 ii=idx; ii+1<_listLen; ii++) - _list[ii] = _list[ii+1]; - - _listLen--; -} - - - -template -void -intervals::sort(void) { - - if ((_isSorted == true) || - (_listLen < 2)) - return; - - auto increasing = [](_ir const &a, - _ir const &b) { - return(((a._bgn < b._bgn)) || - ((a._bgn == b._bgn) && (a._end < b._end))); - }; - - std::sort(_list, _list + _listLen, increasing); - - _isSorted = true; -} - - - -template -void -intervals::squash(iNum minOverlap) { - uint32 intoI = 0; // Interval we're merging into. - uint32 fromI = 1; // Interval we're merging from. - - if (_isSquashed == true) - return; - - sort(); - - while (fromI < _listLen) { - assert(_list[intoI]._bgn < _list[intoI]._end); // Basic checks. Both intervals - assert(_list[fromI]._bgn < _list[fromI]._end); // cannot be empty, and intoI - assert(_list[intoI]._bgn <= _list[fromI]._bgn); // must be before fromI. - - // If the fromI intersects with intoI -- either contained in intoI, or - // has a thick overlap to intoI -- merge it in. We're guaranteed that - // this._bgn is before next._bgn, so all we need to do is extend - // this._end to cover the next interval. - - if ((_list[intoI]._end >= _list[fromI]._end) || - (_list[intoI]._end >= _list[fromI]._bgn + minOverlap)) { - _list[intoI]._end = std::max(_list[fromI]._end, _list[intoI]._end); - _list[intoI]._num += _list[fromI]._num; - } - - // Otherwise, move to the next intoI, copy the current fromI to it, and - // then move to the next fromI. We should, to be pedantic, check that - // intoI != fromI before the copy, but no harm if we don't. - - else { - _list[++intoI] = _list[fromI]; - } - - fromI++; - } - - _listLen = intoI + 1; // Update the length of the list, - _isSquashed = true; // and note that it's now merged. -} - - - -template -void -intervals::filter(iNum minLength, iNum maxLength) { - uint32 intoI = 0; - uint32 fromI = 0; - - // Over every interval, if it is long enough, copy it - // into the 'new' list. - - while (fromI < _listLen) { - iNum length = _list[fromI]._end - _list[fromI]._bgn; - - if ((minLength <= length) && - (length <= maxLength)) - _list[intoI++] = _list[fromI]; - - fromI++; - } - - _listLen = intoI; -} - - - - -#if 0 -template -void -setToUnion(intervals const &A, - intervals const &B) { -} - - -template -void -setToIntersection(intervals const &A, - intervals const &B) { -} - - -template -void -setToContained(intervals const &A, - intervals const &B) { -} - - -template -void -setToUnion(iNum bgn, iNum end, - intervals const &A) { -} - - -template -void -setToIntersection(iNum bgn, iNum end, - intervals const &A) { -} - - -template -void -setToContained(iNum bgn, iNum end, - intervals const &A) { -} -#endif - - -// Helper function to invert a squashed intervals list. -template -void -intervals::setToInversion1(iNum bgn, iNum end, - intervals const &A) { - - delete [] _list; - - _listLen = 0; // Create a new list to store the - _listMax = A._listLen + 1; // inversion. We need at most one - _list = new _ir [_listMax]; // more interval than the original. - - // If no existing list, just add a single interval covering the universe. - // - // If the inversion range falls entirely inside a gap in the original list - // (which would also result in the inverted list having one interval - // covering the whole range) we'll catch it in the loop below. - - if (A._listLen == 0) { - _list[_listLen++] = { bgn, end, 1 }; - } - - // For an existing list: - // 1) Add an interval for the first gap, if it's inside the inersion - // range. - // 2) Add intervals covering the middle gaps. Threshold each endpoint - // by the inversion range, and only add a new interval if it is of - // positive length. - // 3) Add an interval for the last gap, if it's inside the inversion - // range. - - else { - if (bgn < A._list[0]._bgn) - _list[_listLen++] = { bgn, A._list[0]._bgn, 1 }; - - for (uint32 ii=1; ii -void -intervals::setToInversion2(iNum bgn, iNum end, - intervals const &A) { - - delete [] _list; - - _listLen = 0; // Create a new list to store the - _listMax = A._listLen * 2; // inversion. We need at most twice - _list = new _ir [_listMax]; // the original size. - - // If no existing list, just add a single interval covering the universe. - - if (A._listLen == 0) { - _list[_listLen++] = { bgn, end, 1 }; - } - - // For an existing list: - // Add two intervals for each existing interval, one on each end of the - // interval. The new intervals are thresholded aginst the inversion - // range, and only added if they are of positive length. - // - // Note the symmetrically-opposite comparisons; these prevent us from - // adding length=0 intervals. - - else { - iNum nb, ne; - - for (uint32 ii=0; ii -void -intervals::setToInversion(iNum bgn, iNum end, - intervals const &A) { - if (A._isSquashed) - setToInversion1(bgn, end, A); - else - setToInversion2(bgn, end, A); -} - - - -template -void -intervalsDepth::computeDepth(intervals const &IL) { - uint32 idplen = IL.size() * 2; - _idp *idp = new _idp [idplen]; - - for (uint32 ii=0; ii 0) - computeDepth(idplen, idp); - - delete [] idp; -} - - - -template -void -intervalsDepth::computeDepth(uint32 idplen, _idp *idp) { - - // Sort regions so that earlier positions are first, and so that depth - // increases (+1) are before decreases (-1). - - auto increasing = [](_idp const &a, - _idp const &b) { - return(((a._pos < b._pos)) || - ((a._pos == b._pos) && (a._dlt > b._dlt))); - }; - - std::sort(idp, idp + idplen, increasing); - - // The first thing must be an 'open' event. If not, someone supplied a - // negative length to the original intervalList. Or, possibly, two - // zero-length intervals. - - if (idp[0]._dlt == -1) - for (uint32 ii=0; ii -class intervals { -private: - struct _ir { - iNum _bgn; - iNum _end; - uint32 _num; - }; - -public: - intervals() { }; - ~intervals() { delete [] _list; }; - - void clear(void) { - _isSorted = true; - _isSquashed = true; - _listLen = 0; - }; - - // Accessors. - - uint32 size(void) const { return(_listLen); }; - - iNum bgn (uint32 idx) const { return(_list[idx]._bgn); }; - iNum end (uint32 idx) const { return(_list[idx]._end); }; - iNum span(uint32 idx) const { return(_list[idx]._end - _list[idx]._bgn); }; - - uint32 count(uint32 idx) const { return(_list[idx]._num); }; - - // Modifiers. - - iNum &bgn (uint32 idx) { return(_list[idx]._bgn); }; - iNum &end (uint32 idx) { return(_list[idx]._end); }; - - uint32 &count(uint32 idx) { return(_list[idx]._num); }; - - void clear(uint32 idx) { - _list[idx]._bgn = iNum(); - _list[idx]._end = iNum(); - _list[idx]._num = 0; - } - - // Creation. - // - // Add a single interval to the list of intervals specified by either - // - the position of the end points - // - the position of the start and the length of the span - // - // Add(intervals) will copy all the intervals from B into this object, - // no further processing (sorting, squashing or filtering) is performed. - // - // Remove the interval at position 'idx' in our list. Doing so will - // greatly screw up interation over the intervals, and it is suggested - // to instead change the span of the interval to zero and then filter - // them out after iteration is complete. - - void add_position(iNum bgn, iNum end); - void add_span (iNum bgn, iNum len) { - if (len < 0) - add_position(bgn+len, bgn); - else - add_position(bgn, bgn+len); - }; - - void add(intervals const &B); - - void remove(uint32 idx); - - // Sort intervals by increasing coordinate, breaking ties with the end - // coordinate. - // - // Combine intervals that overlap by at least 'minOverlap' into one item. - // - // Discard intervals that are smaller than minLength or larger than - // maxLength. - - void sort(void); - void squash(iNum minOverlap=0); - void filter(iNum minLength, iNum maxLength); - - // setToUnion - populate this intervals object with all the intervals in A - // and B. If both A and B are squashed, this intervals object will also - // be squashed. - // - // setToIntersection - each interval in A (B) is intersected with all - // intervals in B (A), and the resulting interval is added to this object. - // - // setToContained - each interval in A that is contained fully in some - // interval in B is added to this intervals object. -#if 0 - void setToUnion (intervals const &A, intervals const &B); - void setToIntersection(intervals const &A, intervals const &B); - void setToContained (intervals const &A, intervals const &B); -#endif - // setToUnion - copy the intervals in A that oveerlap with the interval - // bgn-end. - // - // setToIntersection - copy the intervals in A that intersect with the - // interval bgn-end, and trim them to that range. - // - // setToContained - copy the intervals in A that are contained within the - // interval bgn-end. - // - // setToInversion - // - if A is squashed, intervals that fill the 'holes' in A, bounded by - // bgn and end) are added to this object. - // - if A is not squashed, each interval in A will contribute 0, 1 or 2 - // new intervals to this object, representing the holes, bounded by bgn and end, - // created by only that single interval in A. - // - // bgn[ ]end - // -------- --------- ---- A - // -------- --------- union - // -- --------- intersection - // --------- contained - // -- ---- inversion -#if 0 - void setToUnion (iNum bgn, iNum end, intervals const &A); - void setToIntersection(iNum bgn, iNum end, intervals const &A); - void setToContained (iNum bgn, iNum end, intervals const &A); -#endif - void setToInversion (iNum bgn, iNum end, intervals const &A); - - // Helper functions. -private: - void setToInversion1(iNum bgn, iNum end, intervals const &A); - void setToInversion2(iNum bgn, iNum end, intervals const &A); - -private: - bool _isSorted = true; - bool _isSquashed = true; - - uint32 _listMax = 0; - uint32 _listLen = 0; - _ir *_list = nullptr; -}; - - - -template -class intervalsDepth { -private: - struct _idp { // An intervalDepthPosition stores the position - iNum _pos; // of a change in depth, and the delta of that - int32 _dlt; // change (which is either +1 or -1). - }; - - struct _idr { // An intervalDepthRegion has the coordinates - iNum _bgn; // of the region and the depth. - iNum _end; - uint32 _dpt; - }; - -public: - intervalsDepth() { - }; - intervalsDepth(intervals const &IL) { - computeDepth(IL); - }; - ~intervalsDepth() { - delete [] _list; - }; - - uint32 size(void) { return(_listLen); }; - - iNum bgn (uint32 idx) { return(_list[idx]._bgn); }; - iNum end (uint32 idx) { return(_list[idx]._end); }; - iNum span(uint32 idx) { return(_list[idx]._end - _list[idx]._bgn); }; - - uint32 depth(uint32 idx) { return(_list[idx]._dpt); }; - - void computeDepth(intervals const &IL); - -private: - void computeDepth(uint32 idplen, _idp *idp); - - uint32 _listLen = 0; - _idr *_list = nullptr; -}; - - - -#define INTERVALS_IMPLEMENTATION -#include "intervals-implementation.H" -#undef INTERVALS_IMPLEMENTATION - - -#endif // INTERVALS_H diff --git a/ext/meryl/src/utility/src/utility/kmers-exact.C b/ext/meryl/src/utility/src/utility/kmers-exact.C index bb342da..0fbb82a 100644 --- a/ext/meryl/src/utility/src/utility/kmers-exact.C +++ b/ext/meryl/src/utility/src/utility/kmers-exact.C @@ -22,22 +22,43 @@ #include #include +using namespace std; -// Set some basic boring stuff. + +// If set, allocate another (large) array to verify that there are no holes in the +// data array. Holes would lead to false positives. // -void -merylExactLookup::initialize(merylFileReader *input_, kmvalu minValue_, kmvalu maxValue_) { +#undef VERIFY_SUFFIX_END + + + + + +double +bitsToGB(uint64 bits) { + return(bits / 8 / 1024.0 / 1024.0 / 1024.0); +} + +double +bitsToMB(uint64 bits) { + return(bits / 8 / 1024.0 / 1024.0); +} - // Save a pointer to the input data. - _input = input_; + + +// Set some basic boring stuff. +// +void +merylExactLookup::initialize(uint64 minValue_, + uint64 maxValue_) { // Silently make minValue and maxValue be valid values. if (minValue_ == 0) minValue_ = 1; - if (maxValue_ == kmvalumax) { + if (maxValue_ == UINT64_MAX) { uint32 nV = _input->stats()->histogramLength(); maxValue_ = _input->stats()->histogramValue(nV - 1); @@ -65,6 +86,7 @@ merylExactLookup::initialize(merylFileReader *input_, kmvalu minValue_, kmvalu m _valueBits = countNumberOfBits64(_maxValue + 1 - _minValue); _suffixMask = 0; + _dataMask = 0; _nPrefix = 0; // Number of entries in pointer table. _nSuffix = 0; // Number of entries in suffix dable. @@ -72,7 +94,7 @@ merylExactLookup::initialize(merylFileReader *input_, kmvalu minValue_, kmvalu m // Scan the histogram to count the number of kmers in range. for (uint32 ii=0; ii<_input->stats()->histogramLength(); ii++) { - kmvalu v = _input->stats()->histogramValue(ii); + uint64 v = _input->stats()->histogramValue(ii); if ((_minValue <= v) && (v <= _maxValue)) @@ -82,11 +104,10 @@ merylExactLookup::initialize(merylFileReader *input_, kmvalu minValue_, kmvalu m _prePtrBits = countNumberOfBits64(_nSuffix); // Width of an entry in the prefix table. _prePtrBits = 64; - _suffixBgn = nullptr; - _suffixLen = nullptr; - _suffixEnd = nullptr; - _sufData = nullptr; - _valData = nullptr; + _suffixBgn = NULL; + _suffixEnd = NULL; + _sufData = NULL; + _valData = NULL; } @@ -96,51 +117,31 @@ merylExactLookup::initialize(merylFileReader *input_, kmvalu minValue_, kmvalu m // use for indexing (prefixSize), and how many bits of data we need // to store explicitly (suffixBits and valueBits). // -void -merylExactLookup::configure(double memInGB, - double &memInGBmin, - double &memInGBmax, - bool useMinimalMemory, - bool useOptimalMemory, - bool reportMemory, - bool reportSizes) { - - // Convert the memory in GB to memory in BITS. If no memory - // size is supplied, as the OS how big we can get. - - if (memInGB == 0.0) - _maxMemory = getMaxMemoryAllowed() * 8; - else - _maxMemory = (uint64)(memInGB * 1024.0 * 1024.0 * 1024.0 * 8); - - // Find the prefixBits that results in the smallest allocated memory size. - // Due to threading over the files, we cannot use a prefix smaller than 6 - // bits. +bool +merylExactLookup::configure(void) { + + // First, find the prefixBits that results in the smallest allocated memory size. + // Due to threading over the files, we cannot use a prefix smaller than 6 bits. // // While it's nice to find the smallest memory size possible, that's also // about the slowest possible. Instead, empirically determined on a small - // test, allow a very sparse table of 16 to 32 prefixes per kmer (if - // possible). + // test, allow a very sparse table of 16 to 32 prefixes per kmer (if possible). - uint64 minSpace = uint64max; - uint64 optSpace = uint64max; - uint64 usdSpace = uint64max; + uint64 minSpace = UINT64_MAX; + uint64 optSpace = UINT64_MAX; // _nSuffix here is just the number of distinct kmers in the input. We'll // search for prefix sizes up to that size plus a bit more to show that // what we pick really is the best size. - // - // We save the smallest size, and the 'optimal' size, defined as something - // at least as big as the smallest, but not more than 8 times larger. uint32 pbMin = 0; uint32 pbOpt = 0; - uint32 pbMax = countNumberOfBits64(_nSuffix) + 1; + uint32 pbMax = countNumberOfBits64(_nSuffix) + 4; if (pbMax > kmer::merSize() * 2) pbMax = kmer::merSize() * 2; - for (uint32 pb=0; pb(_suffixBits); + _suffixMask = uint64MASK(_suffixBits); + _dataMask = uint64MASK(_valueBits); - _nPrefix = (uint64)1 << pbMin; - } - - if (useOptimalMemory == true) { - usdSpace = optSpace; - - _prefixBits = pbOpt; - _suffixBits = _Kbits - pbOpt; - - _suffixMask = buildLowBitMask(_suffixBits); - - _nPrefix = (uint64)1 << pbOpt; + _nPrefix = nprefix; + } } // And do it all again to keep the users entertained. - if (reportMemory) { + if (_verbose) { fprintf(stderr, "\n"); fprintf(stderr, " p prefixes bits gigabytes (allowed: %lu GB)\n", _maxMemory >> 33); fprintf(stderr, "-- -------------- ---------------- ---------\n"); @@ -198,36 +181,38 @@ merylExactLookup::configure(double memInGB, uint64 nprefix = (uint64)1 << pb; uint64 space = nprefix * _prePtrBits + _nSuffix * (_Kbits - pb) + _nSuffix * _valueBits; - if ((pb == pbMin) && - (pb == pbOpt)) - fprintf(stderr, "%2u %14lu %16lu %9.3f (smallest)\n", pb, nprefix, space, bitsToGB(space)); - else if (pb == pbMin) + if (pb == pbMin) fprintf(stderr, "%2u %14lu %16lu %9.3f (smallest)\n", pb, nprefix, space, bitsToGB(space)); + else if (pb == pbOpt) - fprintf(stderr, "%2u %14lu %16lu %9.3f (faster)\n", pb, nprefix, space, bitsToGB(space)); + fprintf(stderr, "%2u %14lu %16lu %9.3f (used)\n", pb, nprefix, space, bitsToGB(space)); + else fprintf(stderr, "%2u %14lu %16lu %9.3f\n", pb, nprefix, space, bitsToGB(space)); } fprintf(stderr, "-- -------------- ---------------- ---------\n"); - fprintf(stderr, " %14lu total kmers\n", _nSuffix); fprintf(stderr, "\n"); - } - if (reportSizes) { - fprintf(stderr, "\n"); - fprintf(stderr, "For %lu distinct %u-mers (with %u bits used for indexing and %u bits for tags):\n", _nSuffix, _Kbits / 2, _prefixBits, _suffixBits); - fprintf(stderr, " %7.3f GB memory for kmer indices - %12lu elements %2u bits wide)\n", bitsToGB(_nPrefix * _prePtrBits), _nPrefix, _prePtrBits); - fprintf(stderr, " %7.3f GB memory for kmer tags - %12lu elements %2u bits wide)\n", bitsToGB(_nSuffix * _suffixBits), _nSuffix, _suffixBits); - fprintf(stderr, " %7.3f GB memory for kmer values - %12lu elements %2u bits wide)\n", bitsToGB(_nSuffix * _valueBits), _nSuffix, _valueBits); - fprintf(stderr, " %7.3f GB memory\n", bitsToGB(usdSpace)); - fprintf(stderr, "\n"); + if (_prefixBits == 0) { + fprintf(stderr, "Not enough memory to load %lu distinct %u-kmers.\n", _nSuffix, _Kbits / 2); + fprintf(stderr, "Need at least %.3f GB memory.\n", bitsToGB(minSpace)); + } + + else { + fprintf(stderr, "For %lu distinct %u-mers (with %u bits used for indexing and %u bits for tags):\n", _nSuffix, _Kbits / 2, _prefixBits, _suffixBits); + fprintf(stderr, " %7.3f GB memory\n", bitsToGB(optSpace)); + fprintf(stderr, " %7.3f GB memory for index (%lu elements %u bits wide)\n", bitsToGB(_nPrefix * _prePtrBits), _nPrefix, _prePtrBits); + fprintf(stderr, " %7.3f GB memory for tags (%lu elements %u bits wide)\n", bitsToGB(_nSuffix * _suffixBits), _nSuffix, _suffixBits); + fprintf(stderr, " %7.3f GB memory for data (%lu elements %u bits wide)\n", bitsToGB(_nSuffix * _valueBits), _nSuffix, _valueBits); + fprintf(stderr, "\n"); + } } - // Copy the min and optimal memory sizes to the output variables. + if (_prefixBits == 0) + return(false); - memInGBmin = bitsToGB(minSpace); - memInGBmax = bitsToGB(optSpace); + return(true); } @@ -240,28 +225,15 @@ merylExactLookup::configure(double memInGB, void merylExactLookup::count(void) { - _suffixBgn = new uint64 [_nPrefix]; - _suffixLen = new uint64 [_nPrefix]; - _suffixEnd = new uint64 [_nPrefix]; + _suffixBgn = new uint64 [_nPrefix + 1]; - for (uint64 ii=0; ii<_nPrefix; ii++) - _suffixBgn[ii] = _suffixLen[ii] = _suffixEnd[ii] = uint64zero; + memset(_suffixBgn, 0, sizeof(uint64) * (_nPrefix + 1)); // Scan all kmer files, counting the number of kmers per prefix. // This is thread safe when _prefixBits is more than 6 (the number of files). uint32 nf = _input->numFiles(); - assert(nf == 64); - - uint64 minp[nf]; - uint64 maxp[nf]; - - for (uint32 ii=0; iiblockFile(ff); @@ -279,9 +251,9 @@ merylExactLookup::count(void) { block->decodeBlock(); for (uint32 ss=0; ssnKmers(); ss++) { - kmdata kbits = 0; - kmdata prefix = 0; - kmvalu value = block->values()[ss]; + uint64 sdata = 0; + uint64 prefix = 0; + uint64 value = block->values()[ss]; if (value < _minValue) { tooLow++; @@ -295,18 +267,15 @@ merylExactLookup::count(void) { loaded++; - kbits = block->prefix(); // Combine the file prefix and - kbits <<= _input->suffixSize(); // suffix data to reconstruct - kbits |= block->suffixes()[ss]; // the kmer bits. - - prefix = kbits >> _suffixBits; // Then extract the prefix + sdata = block->prefix(); // Reconstruct the kmer into sdata. This is just + sdata <<= _input->suffixSize(); // kmerTiny::setPrefixSuffix(). From the kmer, + sdata |= block->suffixes()[ss]; // generate the prefix we want to save it as. - minp[ff] = std::min(minp[ff], (uint64)prefix); - maxp[ff] = std::max(maxp[ff], (uint64)prefix); + prefix = sdata >> _suffixBits; assert(prefix < _nPrefix); - _suffixLen[prefix]++; // Count the number of kmers per prefix. + _suffixBgn[prefix]++; // Count the number of kmers per prefix. } } @@ -322,39 +291,28 @@ merylExactLookup::count(void) { AS_UTL_closeFile(blockFile); } - // If the min/max intersect, we've got a problem somewhere. Each 'prefix' - // will map to exactly one file, and they're supposed to map - // consecutively. Good luck figuring out what broke if this triggers. - - for (uint32 ii=1; ii> 6; + uint64 bgn = 0; + uint64 nxt = 0; - for (uint64 bgn=0, ii=0; ii<_nPrefix; ii++) { + for (uint64 ii=0; ii<_nPrefix; ii++) { + nxt = _suffixBgn[ii]; _suffixBgn[ii] = bgn; - _suffixEnd[ii] = bgn; + bgn += nxt; + } - bgn += _suffixLen[ii]; + assert(bgn == _nKmersLoaded); + _suffixBgn[_nPrefix] = bgn; - if ((ii & mask) == mask) - bgn += 256; - } +#ifdef VERIFY_SUFFIX_END + _suffixEnd = new uint64 [_nPrefix]; + + for (uint64 ii=0; ii<_nPrefix; ii++) + _suffixEnd[ii] = _suffixBgn[ii]; +#endif // Log. @@ -372,45 +330,33 @@ merylExactLookup::count(void) { // prevent the need for any locking or coordination when filling out the // array. // -double +void merylExactLookup::allocate(void) { - uint64 arraySize; - uint64 arrayBlockMin; - double memInGBused = 0.0; - - uint64 ns = _suffixEnd[_nPrefix-1]; // The largest word we access in wordArray. + uint64 arraySize, arrayBlockMin; if (_suffixBits > 0) { - arraySize = ns * _suffixBits; - arrayBlockMin = std::max(arraySize / 1024llu, 268435456llu); // In bits, so 32MB per block. - memInGBused += bitsToGB(arraySize); + arraySize = _nSuffix * _suffixBits; + arrayBlockMin = max(arraySize / 1024llu, 268435456llu); // In bits, so 32MB per block. if (_verbose) fprintf(stderr, "Allocating space for %lu suffixes of %u bits each -> %lu bits (%.3f GB) in blocks of %.3f MB\n", - ns, _suffixBits, arraySize, bitsToGB(arraySize), bitsToMB(arrayBlockMin)); - - assert(_suffixBits <= 128); + _nSuffix, _suffixBits, arraySize, bitsToGB(arraySize), bitsToMB(arrayBlockMin)); - _sufData = new wordArray(_suffixBits, arrayBlockMin, false); - _sufData->allocate(ns); + _sufData = new wordArray(_suffixBits, arrayBlockMin); + _sufData->allocate(_nSuffix); } if (_valueBits > 0) { - arraySize = ns * _valueBits; - arrayBlockMin = std::max(arraySize / 1024llu, 268435456llu); // In bits, so 32MB per block. - memInGBused += bitsToGB(arraySize); + arraySize = _nSuffix * _valueBits; + arrayBlockMin = max(arraySize / 1024llu, 268435456llu); // In bits, so 32MB per block. if (_verbose) fprintf(stderr, " %lu values of %u bits each -> %lu bits (%.3f GB) in blocks of %.3f MB\n", - ns, _valueBits, arraySize, bitsToGB(arraySize), bitsToMB(arrayBlockMin)); + _nSuffix, _valueBits, arraySize, bitsToGB(arraySize), bitsToMB(arrayBlockMin)); - assert(_valueBits <= 64); - - _valData = new wordArray(_valueBits, arrayBlockMin, false); - _valData->allocate(ns); + _valData = new wordArray(_valueBits, arrayBlockMin); + _valData->allocate(_nSuffix); } - - return(memInGBused); } @@ -421,9 +367,11 @@ merylExactLookup::allocate(void) { // In this case, we overallocate, but cannot cleanup at the end. void merylExactLookup::load(void) { - uint32 nf = _input->numFiles(); - uint64 sufMask = buildLowBitMask(_suffixBits); - uint64 valMask = buildLowBitMask(_valueBits); + + count(); + allocate(); + + uint32 nf = _input->numFiles(); #pragma omp parallel for schedule(dynamic, 1) for (uint32 ff=0; ffdecodeBlock(); for (uint32 ss=0; ssnKmers(); ss++) { - kmdata kbits = 0; - kmdata prefix = 0; - kmdata suffix = 0; - kmvalu value = block->values()[ss]; + uint64 prefix = 0; + uint64 suffix = 0; + uint64 value = block->values()[ss]; if ((value < _minValue) || // Sanity checking and counting done (_maxValue < value)) // in count() above. continue; - kbits = block->prefix(); // Combine the file prefix and - kbits <<= _input->suffixSize(); // suffix data to reconstruct - kbits |= block->suffixes()[ss]; // the kmer bits. + // Compute and store the prefix. - suffix = kbits & sufMask; // Then extract the prefix - prefix = kbits >> _suffixBits; // and suffix to use in the table + prefix = block->prefix(); // Reconstruct the kmer into sdata. This is just + prefix <<= _input->suffixSize(); // kmerTiny::setPrefixSuffix(). From the kmer, + prefix |= block->suffixes()[ss]; // generate the prefix we want to save it as. - _sufData->set(_suffixEnd[prefix], suffix); + suffix = prefix & uint64MASK(_suffixBits); + prefix >>= _suffixBits; + + _sufData->set(_suffixBgn[prefix], suffix); // Compute and store the value, if requested. @@ -460,16 +409,20 @@ merylExactLookup::load(void) { value -= _valueOffset; if (value > _maxValue + 1 - _minValue) - fprintf(stderr, "minValue " F_U32 " maxValue " F_U32 " value " F_U32 " bits " F_U32 "\n", + fprintf(stderr, "minValue " F_U64 " maxValue " F_U64 " value " F_U64 " bits " F_U32 "\n", _minValue, _maxValue, value, _valueBits); - assert(value <= valMask); + assert(value <= uint64MASK(_valueBits)); - _valData->set(_suffixEnd[prefix], value); + _valData->set(_suffixBgn[prefix], value); } // Move to the next item. + _suffixBgn[prefix]++; + +#ifdef VERIFY_SUFFIX_END _suffixEnd[prefix]++; +#endif } } @@ -478,92 +431,48 @@ merylExactLookup::load(void) { AS_UTL_closeFile(blockFile); } - // Check that we loaded the expected number of kmers into each space + // suffixBgn[i] is now the start of [i+1]; shift the array by one to + // restore the proper meaning of suffixBgn. - for (uint64 ii=0; ii<_nPrefix; ii++) - assert(_suffixBgn[ii] + _suffixLen[ii] == _suffixEnd[ii]); - - // Now just log. + for (uint64 ii=_nPrefix; ii>0; ii--) + _suffixBgn[ii] = _suffixBgn[ii-1]; - if (_verbose) - fprintf(stderr, "Loaded " F_U64 " kmers. Skipped " F_U64 " (too low) and " F_U64 " (too high) kmers.\n", - _nKmersLoaded, _nKmersTooLow, _nKmersTooHigh); -} + _suffixBgn[0] = 0; + // Optionally verify that bgn[i] == end[i-1]. +#ifdef VERIFY_SUFFIX_END + for (uint64 ii=1; ii<_nPrefix; ii++) + assert(_suffixBgn[ii] == _suffixEnd[ii-1]); -void -merylExactLookup::estimateMemoryUsage(merylFileReader *input_, - double maxMemInGB_, - double &minMemInGB_, - double &optMemInGB_, - kmvalu minValue_, - kmvalu maxValue_) { - initialize(input_, minValue_, maxValue_); - configure(maxMemInGB_, minMemInGB_, optMemInGB_, false, false, true, false); -} - + delete [] _suffixEnd; + _suffixEnd = NULL; +#endif + // Now just log. -double -merylExactLookup::load(merylFileReader *input_, - double maxMemInGB_, - bool useMinimalMemory, - bool useOptimalMemory, - kmvalu minValue_, - kmvalu maxValue_) { - double minMem = 0.0; - double maxMem = 0.0; - double memInGBused = 0.0; - - initialize(input_, minValue_, maxValue_); // Initialize ourself. - - configure(maxMemInGB_, // Find parameters. - minMem, - maxMem, - useMinimalMemory, - useOptimalMemory, - false, - true); - - if (_prefixBits == 0) // Fail if needed. - return(0.0); - - count(); // Count kmers/prefix. - memInGBused = allocate(); // Allocate space. - load(); // Load data. - - return(memInGBused); + if (_verbose) + fprintf(stderr, "Loaded " F_U64 " kmers. Skipped " F_U64 " (too low) and " F_U64 " (too high) kmers.\n", + _nKmersLoaded, _nKmersTooLow, _nKmersTooHigh); } - - - bool merylExactLookup::exists_test(kmer k) { - char kmerString[65]; - kmdata kmer = (kmdata)k; - kmdata prefix = kmer >> _suffixBits; - kmdata suffix = kmer & _suffixMask; - fprintf(stderr, "\n"); - fprintf(stderr, "kmer %s %s\n", toHex(kmer, 2 * k.merSize()), k.toString(kmerString)); - fprintf(stderr, "suffixBits %s %3u bits\n", toHex(_suffixMask, _suffixBits), _suffixBits); - fprintf(stderr, "prefix %s %3u bits\n", toHex(prefix, 2 * k.merSize() - _suffixBits), 2 * k.merSize() - _suffixBits); - fprintf(stderr, "suffix %s\n", toHex(suffix, _suffixBits)); + uint64 kmer = (uint64)k; + uint64 prefix = kmer >> _suffixBits; + uint64 suffix = kmer & _suffixMask; uint64 bgn = _suffixBgn[prefix]; uint64 mid; - uint64 end = _suffixEnd[prefix]; + uint64 end = _suffixBgn[prefix + 1]; - kmdata tag; + uint64 tag; // Binary search for the matching tag. - fprintf(stderr, "BINARY SEARCH the bucket %lu-%lu for suffix %s.\n", bgn, end, toHex(suffix)); - while (bgn + 8 < end) { mid = bgn + (end - bgn) / 2; @@ -589,26 +498,23 @@ merylExactLookup::exists_test(kmer k) { } fprintf(stderr, "\n"); - fprintf(stderr, "FAILED kmer 0x%s\n", toHex(kmer)); - fprintf(stderr, "FAILED prefix 0x%s\n", toHex(prefix)); - fprintf(stderr, "FAILED suffix 0x%s\n", toHex(suffix)); + fprintf(stderr, "FAILED kmer 0x%016lx\n", kmer); + fprintf(stderr, "FAILED prefix 0x%016lx\n", prefix); + fprintf(stderr, "FAILED suffix 0x%016lx\n", suffix); fprintf(stderr, "\n"); - fprintf(stderr, "original %9lu %9lu\n", _suffixBgn[prefix], _suffixEnd[prefix]); + fprintf(stderr, "original %9lu %9lu\n", _suffixBgn[prefix], _suffixBgn[prefix + 1]); fprintf(stderr, "final %9lu %9lu\n", bgn, end); fprintf(stderr, "\n"); bgn = _suffixBgn[prefix]; - end = _suffixEnd[prefix]; - - fprintf(stderr, "BINARY SEARCH the bucket %lu-%lu for suffix %s.\n", bgn, end, toHex(suffix)); + end = _suffixBgn[prefix + 1]; while (bgn + 8 < end) { mid = bgn + (end - bgn) / 2; tag = _sufData->get(mid); - fprintf(stderr, "TEST bgn %8lu %8lu %8lu end -- dat %s =?= %s suffix\n", - bgn, mid, end, toHex(tag), toHex(suffix)); + fprintf(stderr, "TEST bgn %8lu %8lu %8lu end -- dat %lu =?= %lu suffix\n", bgn, mid, end, tag, suffix); if (tag == suffix) return(true); @@ -620,35 +526,10 @@ merylExactLookup::exists_test(kmer k) { bgn = mid + 1; } - // Exhaustively search the bucket. - - fprintf(stderr, "LINEAR SEARCH the bucket %lu-%lu for suffix %s.\n", bgn, end, toHex(suffix)); - - for (mid=bgn; mid < end; mid++) { - tag = _sufData->get(mid); - - fprintf(stderr, "ITER bgn %8lu %8lu %8lu end -- dat %s\n", - bgn, mid, end, toHex(tag)); - - if (tag == suffix) - return(true); - } - - // Exhaustively search all buckets. - // - // THIS IS WRONG - it needs to skip the empty buckets in the middle, so needs to - // iterate over each suffixBgn/suffixEnd pair individually. - - bgn = _suffixBgn[0]; - end = _suffixEnd[_nPrefix - 1]; - - fprintf(stderr, "LINEAR SEARCH the entire table %lu-%lu for suffix %s.\n", bgn, end, toHex(suffix)); - for (mid=bgn; mid < end; mid++) { tag = _sufData->get(mid); - fprintf(stderr, "ITER bgn %8lu %8lu %8lu end -- dat %s\n", - bgn, mid, end, toHex(tag)); + fprintf(stderr, "ITER bgn %8lu %8lu %8lu end -- dat %lu =?= %lu suffix\n", bgn, mid, end, tag, suffix); if (tag == suffix) return(true); diff --git a/ext/meryl/src/utility/src/utility/kmers-files.C b/ext/meryl/src/utility/src/utility/kmers-files.C index 8814a59..627fef5 100644 --- a/ext/meryl/src/utility/src/utility/kmers-files.C +++ b/ext/meryl/src/utility/src/utility/kmers-files.C @@ -119,7 +119,10 @@ merylFileBlockReader::decodeBlock(void) { if (_data == NULL) return; - resizeArrayPair(_suffixes, _values, 0, _nKmersMax, _nKmers, _raAct::doNothing); + //fprintf(stderr, "decodeBlock() nKmersMax %lu nKmers %lu\n", _nKmersMax, _nKmers); + + resizeArrayPair(_suffixes, _values, 0, _nKmersMax, _nKmers, resizeArray_doNothing); + decodeBlock(_suffixes, _values); } diff --git a/ext/meryl/src/utility/src/utility/kmers-histogram.C b/ext/meryl/src/utility/src/utility/kmers-histogram.C index 232807e..8e4b267 100644 --- a/ext/meryl/src/utility/src/utility/kmers-histogram.C +++ b/ext/meryl/src/utility/src/utility/kmers-histogram.C @@ -95,7 +95,7 @@ merylHistogram::dump(stuffedBits *bits) { } } - for (auto it=_histBig.begin(); it != _histBig.end(); it++) { + for (map::iterator it=_histBig.begin(); it != _histBig.end(); it++) { bits->setBinary(64, it->first); // Value bits->setBinary(64, it->second); // Number of occurrences } diff --git a/ext/meryl/src/utility/src/utility/kmers-histogram.H b/ext/meryl/src/utility/src/utility/kmers-histogram.H index 921528a..4816f7c 100644 --- a/ext/meryl/src/utility/src/utility/kmers-histogram.H +++ b/ext/meryl/src/utility/src/utility/kmers-histogram.H @@ -24,8 +24,11 @@ #error "include kmers.H, not this." #endif + #include +using namespace std; + // Stores a histogram of kmer count values. @@ -71,17 +74,17 @@ public: uint64 histogramOccurrences(uint32 i) { return(_histOs[i]); }; private: - uint64 _numUnique; - uint64 _numDistinct; - uint64 _numTotal; + uint64 _numUnique; + uint64 _numDistinct; + uint64 _numTotal; - uint32 _histMax; // Max value that can be stored in _hist. - uint64 *_hist; - std::map _histBig; // Values bigger than _histMax; + uint32 _histMax; // Max value that can be stored in _hist. + uint64 *_hist; + map _histBig; // Values bigger than _histMax; - uint64 _histLen; // If loaded from disk, this is the unpacked histogram. - uint64 *_histVs; // The value this histogram entry is counting. - uint64 *_histOs; // The number of occurrences of that value. + uint64 _histLen; // If loaded from disk, this is the unpacked histogram. + uint64 *_histVs; // The value this histogram entry is counting. + uint64 *_histOs; // The number of occurrences of that value. }; diff --git a/ext/meryl/src/utility/src/utility/kmers-iterator.H b/ext/meryl/src/utility/src/utility/kmers-iterator.H index 27ba8a1..bb02f8d 100644 --- a/ext/meryl/src/utility/src/utility/kmers-iterator.H +++ b/ext/meryl/src/utility/src/utility/kmers-iterator.H @@ -36,7 +36,7 @@ public: addSequence(NULL, 0); }; kmerIterator(FILE *input); - kmerIterator(char const *buffer, uint64 bufferLen) { + kmerIterator(char *buffer, uint64 bufferLen) { assert(kmer::merSize() > 0); reset(); addSequence(buffer, bufferLen); @@ -48,7 +48,7 @@ public: _kmerValid = _fmer.merSize() - 1; }; - void addSequence(char const *buffer, uint64 bufferLen) { + void addSequence(char *buffer, uint64 bufferLen) { _buffer = buffer; _bufferLen = bufferLen; _bufferPos = 0; @@ -163,16 +163,16 @@ public: uint64 endPosition(void) { return(_bufferPos); }; private: - uint32 _kmerSize; - uint32 _kmerLoad; - uint32 _kmerValid; + uint32 _kmerSize; + uint32 _kmerLoad; + uint32 _kmerValid; - char const *_buffer; - uint64 _bufferLen; - uint64 _bufferPos; + char *_buffer; + uint64 _bufferLen; + uint64 _bufferPos; - kmerTiny _fmer; - kmerTiny _rmer; + kmerTiny _fmer; + kmerTiny _rmer; }; diff --git a/ext/meryl/src/utility/src/utility/kmers-lookup.H b/ext/meryl/src/utility/src/utility/kmers-lookup.H index 3feb1f0..4ec9d76 100644 --- a/ext/meryl/src/utility/src/utility/kmers-lookup.H +++ b/ext/meryl/src/utility/src/utility/kmers-lookup.H @@ -24,295 +24,247 @@ #error "include kmers.H, not this." #endif + class merylExactLookup { public: - merylExactLookup() { + merylExactLookup(merylFileReader *input_, + uint32 maxMemory_ = 0, + uint64 minValue_ = 0, + uint64 maxValue_ = UINT64_MAX) { + + _input = input_; + _maxMemory = maxMemory_; // maxMemory_ is In GB; _maxMemory should be in BITS! + _verbose = true; + + if (_maxMemory == 0) + _maxMemory = getPhysicalMemorySize() * 8; + else + _maxMemory <<= 33; + + initialize(minValue_, maxValue_); // Do NOT use minValue_ or maxValue_ from now on! }; + ~merylExactLookup() { delete [] _suffixBgn; - delete [] _suffixLen; delete [] _suffixEnd; delete _sufData; delete _valData; }; -public: - // Optional. Quickly analyze the input kmers and compute the minimum and - // 'optimal' memory needed for the lookup tables. - // - // maxMemInGB is used as an upper limit on minMem and optMem. + // To use this object: + // lookup = new merylExactLookup(input, 0, 0, UINT32_MAX); + // if (lookup->configure() == true) + // lookup->load() // - void estimateMemoryUsage(merylFileReader *input_, - double maxMemInGB_, - double &minMemInGB_, - double &optMemInGB_, - kmvalu minValue_ = 0, - kmvalu maxValue_ = kmvalumax); - -public: - // Load a new meryl database into the lookup table. - // - // maxMemInGB is used as an upper limit on the size of the lookup table. - // The actual size used is determined from useMinimalMemory or - // useOptimalMemory, as returned from estinmateMemoryUsage(). - // - // The difference between 'minimal' and 'optimal' is one of speed; lookups - // with 'minimal' memory will be slower than with 'optimal' memory; - // however, it isn't known how significant this is. - // - // The return value is the actual memory used, in GB, or 0.0 if loading - // failed. (I think) - // - double load(merylFileReader *input_, - double maxMemInGB_, - bool useMinimalMemory, - bool useOptimalMemory, - kmvalu minValue_ = 0, - kmvalu maxValue_ = kmvalumax); +private: + void initialize(uint64 minValue_, uint64 maxValue_); public: - // For describing what we've loaded. - // - uint64 nKmers(void) { return(_nKmersLoaded); }; - - // The accessors. - // - // Return true/false if the kmer exists/does not. - // Return true/false if the kmer exists/does not, and populate 'value' with the value. - // Return the value of the kmer, or zero if it doesn't exist. - // - bool exists(kmer k); - bool exists(kmer k, kmvalu &value); - kmvalu value(kmer k); - - // For testing the implementation. - // - bool exists_test(kmer k); - + bool configure(void); private: - // Used internally for construction. As tempting is it seems to call - // initialize() or configure() directly, you can't. - // - void initialize(merylFileReader *input_, kmvalu minValue_, kmvalu maxValue_); - void configure(double memInGB, - double &memInGBmin, - double &memInGBmax, - bool useMinimalMemory, - bool useOptimalMemory, - bool reportMemory, - bool reportSizes); void count(void); - double allocate(void); + void allocate(void); +public: void load(void); - kmvalu value_value(kmvalu value); - private: - merylFileReader *_input = nullptr; + uint64 value_value(uint64 value) { + if (_valueBits == 0) // Return 'true' if no value + return(1); // is stored. - uint64 _maxMemory = 0; - bool _verbose = true; + value &= uint64MASK(_valueBits); - kmvalu _minValue = 0; // Minimum value stored in the table -| both of these filter the - kmvalu _maxValue = 0; // Maximum value stored in the table -| input kmers. - kmvalu _valueOffset = 0; // Offset of values stored in the table. + //if (value == 0) // Return zero if the value + // return(0); // is actually zero. - uint64 _nKmersLoaded = 0; - uint64 _nKmersTooLow = 0; - uint64 _nKmersTooHigh = 0; + return(value + _valueOffset); // Otherwise, return the value. + }; - uint32 _Kbits; +public: + uint64 nKmers(void) { return(_nKmersLoaded); }; - uint32 _prefixBits = 0; // How many high-end bits of the kmer is an index into _suffixBgn. - uint32 _suffixBits = 0; // How many bits of the kmer are in the suffix table. - uint32 _valueBits = 0; // How many bits of the suffix entry are data. - kmdata _suffixMask = 0; + // Return true/false if the kmer exists/does not. + bool exists(kmer k) { + kmdata kmer = (kmdata)k; + uint64 prefix = kmer >> _suffixBits; + kmdata suffix = kmer & _suffixMask; - uint64 _nPrefix = 0; // How many entries in _suffixBgn == 2 ^ _prefixBits. - uint64 _nSuffix = 0; // How many entries in _suffixData == nDistinct in the input database. + uint64 bgn = _suffixBgn[prefix]; + uint64 mid; + uint64 end = _suffixBgn[prefix + 1]; - uint32 _prePtrBits = 0; // How many bits wide is _suffixBgn (used only if _suffixBgn is a wordArray). + kmdata tag; - uint64 *_suffixBgn = nullptr; // The start of a block of data in suffix Data. - uint64 *_suffixLen = nullptr; // The number of kmers to load in each block. - uint64 *_suffixEnd = nullptr; // The end of a block. (NOTE: bgn + len != end) - wordArray *_sufData = nullptr; // Finally, kmer suffix data! - wordArray *_valData = nullptr; // Finally, value data! -}; + // Binary search for the matching tag. + while (bgn + 8 < end) { + mid = bgn + (end - bgn) / 2; + tag = _sufData->get(mid); + if (tag == suffix) + return(true); + if (suffix < tag) + end = mid; + else + bgn = mid + 1; + } -inline -kmvalu -merylExactLookup::value_value(kmvalu value) { - if (_valueBits == 0) // Return 'true' if no value - return(1); // is stored. + // Switch to linear search when we're down to just a few candidates. - value &= buildLowBitMask(_valueBits); + for (mid=bgn; mid < end; mid++) { + tag = _sufData->get(mid); - //if (value == 0) // Return zero if the value - // return(0); // is actually zero. + if (tag == suffix) + return(true); + } - return(value + _valueOffset); // Otherwise, return the value. -}; + return(false); + } + // Return true/false if the kmer exists/does not. + // And populate 'value' with the value of the kmer. + bool exists(kmer k, uint64 &value) { + kmdata kmer = (kmdata)k; + uint64 prefix = kmer >> _suffixBits; + kmdata suffix = kmer & _suffixMask; -// Return true/false if the kmer exists/does not. -inline -bool -merylExactLookup::exists(kmer k) { - kmdata kmer = (kmdata)k; - uint64 prefix = kmer >> _suffixBits; - kmdata suffix = kmer & _suffixMask; + uint64 bgn = _suffixBgn[prefix]; + uint64 mid; + uint64 end = _suffixBgn[prefix + 1]; - uint64 bgn = _suffixBgn[prefix]; - uint64 mid; - uint64 end = _suffixEnd[prefix]; + kmdata tag; - kmdata tag; + // Binary search for the matching tag. - // Binary search for the matching tag. + while (bgn + 8 < end) { + mid = bgn + (end - bgn) / 2; - while (bgn + 8 < end) { - mid = bgn + (end - bgn) / 2; + tag = _sufData->get(mid); - tag = _sufData->get(mid); + if (tag == suffix) { + if (_valueBits == 0) + value = 1; + else + value = _valData->get(mid); + return(true); + } - if (tag == suffix) - return(true); + if (suffix < tag) + end = mid; - if (suffix < tag) - end = mid; + else + bgn = mid + 1; + } - else - bgn = mid + 1; - } + // Switch to linear search when we're down to just a few candidates. - // Switch to linear search when we're down to just a few candidates. + for (mid=bgn; mid < end; mid++) { + tag = _sufData->get(mid); - for (mid=bgn; mid < end; mid++) { - tag = _sufData->get(mid); + if (tag == suffix) { + if (_valueBits == 0) + value = 1; + else + value = _valData->get(mid); + return(true); + } + } - if (tag == suffix) - return(true); + value = 0; + return(false); } - return(false); -} + // Returns the value of the kmer, '0' if it doesn't exist. + uint64 value(kmer k) { + kmdata kmer = (kmdata)k; + uint64 prefix = kmer >> _suffixBits; + kmdata suffix = kmer & _suffixMask; + uint64 bgn = _suffixBgn[prefix]; + uint64 mid; + uint64 end = _suffixBgn[prefix + 1]; -// Return true/false if the kmer exists/does not. -// And populate 'value' with the value of the kmer. -inline -bool -merylExactLookup::exists(kmer k, kmvalu &value) { - kmdata kmer = (kmdata)k; - kmdata prefix = kmer >> _suffixBits; - kmdata suffix = kmer & _suffixMask; + kmdata tag; - uint64 bgn = _suffixBgn[prefix]; - uint64 mid; - uint64 end = _suffixEnd[prefix]; + // Binary search for the matching tag. - kmdata tag; + while (bgn + 8 < end) { + mid = bgn + (end - bgn) / 2; - // Binary search for the matching tag. + tag = _sufData->get(mid); - while (bgn + 8 < end) { - mid = bgn + (end - bgn) / 2; + if (tag == suffix) { + if (_valueBits == 0) + return(1); + else + return(_valData->get(mid)); + } - tag = _sufData->get(mid); + if (suffix < tag) + end = mid; - if (tag == suffix) { - if (_valueBits == 0) - value = 1; else - value = _valData->get(mid); - return(true); + bgn = mid + 1; } - if (suffix < tag) - end = mid; - - else - bgn = mid + 1; - } + // Switch to linear search when we're down to just a few candidates. - // Switch to linear search when we're down to just a few candidates. + for (mid=bgn; mid < end; mid++) { + tag = _sufData->get(mid); - for (mid=bgn; mid < end; mid++) { - tag = _sufData->get(mid); - - if (tag == suffix) { - if (_valueBits == 0) - value = 1; - else - value = _valData->get(mid); - return(true); + if (tag == suffix) { + if (_valueBits == 0) + return(1); + else + return(_valData->get(mid)); + } } - } - - value = 0; - return(false); -} + return(0); + }; -// Returns the value of the kmer, '0' if it doesn't exist. -inline -kmvalu -merylExactLookup::value(kmer k) { - kmdata kmer = (kmdata)k; - kmdata prefix = kmer >> _suffixBits; - kmdata suffix = kmer & _suffixMask; - uint64 bgn = _suffixBgn[prefix]; - uint64 mid; - uint64 end = _suffixEnd[prefix]; + bool exists_test(kmer k); - kmdata tag; - // Binary search for the matching tag. +private: + merylFileReader *_input; - while (bgn + 8 < end) { - mid = bgn + (end - bgn) / 2; + uint64 _maxMemory; + bool _verbose; - tag = _sufData->get(mid); + uint64 _minValue; // Minimum value stored in the table -| both of these filter the + uint64 _maxValue; // Maximum value stored in the table -| input kmers. + uint64 _valueOffset; // Offset of values stored in the table. - if (tag == suffix) { - if (_valueBits == 0) - return(1); - else - return(_valData->get(mid)); - } + uint64 _nKmersLoaded; + uint64 _nKmersTooLow; + uint64 _nKmersTooHigh; - if (suffix < tag) - end = mid; + uint32 _Kbits; - else - bgn = mid + 1; - } + uint32 _prefixBits; // How many high-end bits of the kmer is an index into _suffixBgn. + uint32 _suffixBits; // How many bits of the kmer are in the suffix table. + uint32 _valueBits; // How many bits of the suffix entry are data. - // Switch to linear search when we're down to just a few candidates. + kmdata _suffixMask; + uint64 _dataMask; - for (mid=bgn; mid < end; mid++) { - tag = _sufData->get(mid); + uint64 _nPrefix; // How many entries in _suffixBgn == 2 ^ _prefixBits. + uint64 _nSuffix; // How many entries in _suffixData == nDistinct in the input database. - if (tag == suffix) { - if (_valueBits == 0) - return(1); - else - return(_valData->get(mid)); - } - } + uint32 _prePtrBits; // How many bits wide is _suffixBgn (used only if _suffixBgn is a wordArray). - return(0); + uint64 *_suffixBgn; // The start of a block of data in suffix Data. The end is the next start. + uint64 *_suffixEnd; // The end. Temporary. + wordArray *_sufData; // Finally, kmer suffix data! + wordArray *_valData; // Finally, value data! }; - #endif // MERYL_UTIL_KMER_LOOKUP_H diff --git a/ext/meryl/src/utility/src/utility/kmers-reader.C b/ext/meryl/src/utility/src/utility/kmers-reader.C index d20b017..3aa4239 100644 --- a/ext/meryl/src/utility/src/utility/kmers-reader.C +++ b/ext/meryl/src/utility/src/utility/kmers-reader.C @@ -495,7 +495,7 @@ merylFileReader::nextMer(void) { // Make sure we have space for the decoded data - resizeArrayPair(_suffixes, _values, 0, _nKmersMax, _nKmers, _raAct::doNothing); + resizeArrayPair(_suffixes, _values, 0, _nKmersMax, _nKmers, resizeArray_doNothing); // Decode the block into _OUR_ space. // diff --git a/ext/meryl/src/utility/src/utility/kmers-tiny.H b/ext/meryl/src/utility/src/utility/kmers-tiny.H index 9e2b5c2..c8d9373 100644 --- a/ext/meryl/src/utility/src/utility/kmers-tiny.H +++ b/ext/meryl/src/utility/src/utility/kmers-tiny.H @@ -28,12 +28,9 @@ typedef uint128 kmdata; // 128 bits of kmer data typedef uint32 kmpref; // 32 bits of kmer prefix == 6 bits file prefix, 6 (default) suffix prefix -typedef uint32 kmvalu; // 32 bits of kmer value +typedef uint32 kmvalu; // 64 bits of kmer count typedef uint64 kmcolo; // 64 bits of kmer color -constexpr kmvalu kmvalumax = uint32max; -constexpr kmcolo kmcolomax = uint64max; - class kmerTiny { public: @@ -70,15 +67,6 @@ public: // to make space for the new base. Unlike the 'standard' two-bit encoding, // these encode bases as A=00, C=01, G=11, T=10. // - // +---------+-- upper/lower case bit - // | | - // A 1000001 a 1100001 == 00 - // C 1000011 c 1100011 == 01 - // G 1000111 g 1100111 == 11 - // T 1010100 t 1110100 == 10 - // || - // ++-- bits used for 2-bit encoding - // void addR(kmdata base) { _mer = (((_mer << 2) & _fullMask) | (((base >> 1) & 0x03llu) ) ); }; void addL(kmdata base) { _mer = (((_mer >> 2) & _leftMask) | (((base >> 1) & 0x03llu) ^ 0x02llu) << _leftShift); }; @@ -90,16 +78,16 @@ public: // Complement the bases - mer ^= build_uint128(0xaaaaaaaaaaaaaaaallu, 0xaaaaaaaaaaaaaaaallu); + mer ^= uint128NUMBER(0xaaaaaaaaaaaaaaaallu, 0xaaaaaaaaaaaaaaaallu); // Reverse the mer - mer = ((mer >> 2) & build_uint128(0x3333333333333333llu, 0x3333333333333333llu)) | ((mer << 2) & build_uint128(0xccccccccccccccccllu, 0xccccccccccccccccllu)); - mer = ((mer >> 4) & build_uint128(0x0f0f0f0f0f0f0f0fllu, 0x0f0f0f0f0f0f0f0fllu)) | ((mer << 4) & build_uint128(0xf0f0f0f0f0f0f0f0llu, 0xf0f0f0f0f0f0f0f0llu)); - mer = ((mer >> 8) & build_uint128(0x00ff00ff00ff00ffllu, 0x00ff00ff00ff00ffllu)) | ((mer << 8) & build_uint128(0xff00ff00ff00ff00llu, 0xff00ff00ff00ff00llu)); - mer = ((mer >> 16) & build_uint128(0x0000ffff0000ffffllu, 0x0000ffff0000ffffllu)) | ((mer << 16) & build_uint128(0xffff0000ffff0000llu, 0xffff0000ffff0000llu)); - mer = ((mer >> 32) & build_uint128(0x00000000ffffffffllu, 0x00000000ffffffffllu)) | ((mer << 32) & build_uint128(0xffffffff00000000llu, 0xffffffff00000000llu)); - mer = ((mer >> 64) & build_uint128(0x0000000000000000llu, 0xffffffffffffffffllu)) | ((mer << 64) & build_uint128(0xffffffffffffffffllu, 0x0000000000000000llu)); + mer = ((mer >> 2) & uint128NUMBER(0x3333333333333333llu, 0x3333333333333333llu)) | ((mer << 2) & uint128NUMBER(0xccccccccccccccccllu, 0xccccccccccccccccllu)); + mer = ((mer >> 4) & uint128NUMBER(0x0f0f0f0f0f0f0f0fllu, 0x0f0f0f0f0f0f0f0fllu)) | ((mer << 4) & uint128NUMBER(0xf0f0f0f0f0f0f0f0llu, 0xf0f0f0f0f0f0f0f0llu)); + mer = ((mer >> 8) & uint128NUMBER(0x00ff00ff00ff00ffllu, 0x00ff00ff00ff00ffllu)) | ((mer << 8) & uint128NUMBER(0xff00ff00ff00ff00llu, 0xff00ff00ff00ff00llu)); + mer = ((mer >> 16) & uint128NUMBER(0x0000ffff0000ffffllu, 0x0000ffff0000ffffllu)) | ((mer << 16) & uint128NUMBER(0xffff0000ffff0000llu, 0xffff0000ffff0000llu)); + mer = ((mer >> 32) & uint128NUMBER(0x00000000ffffffffllu, 0x00000000ffffffffllu)) | ((mer << 32) & uint128NUMBER(0xffffffff00000000llu, 0xffffffff00000000llu)); + mer = ((mer >> 64) & uint128NUMBER(0x0000000000000000llu, 0xffffffffffffffffllu)) | ((mer << 64) & uint128NUMBER(0xffffffffffffffffllu, 0x0000000000000000llu)); // Shift and mask out the bases not in the mer @@ -150,7 +138,7 @@ public: kmdata mask = _mer; mask >>= 1; - mask &= build_uint128(0x5555555555555555llu, 0x5555555555555555llu); + mask &= uint128NUMBER(0x5555555555555555llu, 0x5555555555555555llu); fmer ^= mask; // Convert from ACTG ordering to ACGT ordering. rmer ^= mask; @@ -167,12 +155,10 @@ public: return(_mer); }; - operator uint64 () const = delete; // Explicitly fail of someone tries to convert us to an integer - operator int64 () const = delete; // instead of to a kmdata. Without these, a cast to, say, uint64 - operator uint32 () const = delete; // would be first convert to kmdata (uint128) then down to uint64. - operator int32 () const = delete; // With these, you'll either get a compile-time error (because - operator uint16 () const = delete; // these are private) or link time error (because they're not - operator int16 () const = delete; // defined. + operator uint64 () const { + assert(0); + return(_mer); + }; void setPrefixSuffix(kmpref prefix, kmdata suffix, uint32 width) { _mer = prefix; diff --git a/ext/meryl/src/utility/src/utility/kmers-writer-block.C b/ext/meryl/src/utility/src/utility/kmers-writer-block.C index 4ca5535..b8a2708 100644 --- a/ext/meryl/src/utility/src/utility/kmers-writer-block.C +++ b/ext/meryl/src/utility/src/utility/kmers-writer-block.C @@ -24,7 +24,7 @@ merylBlockWriter::merylBlockWriter(merylFileWriter *writer) { _writer = writer; - strncpy(_outName, _writer->_outName, FILENAME_MAX+1); + strncpy(_outName, _writer->_outName, FILENAME_MAX); // Encoding data @@ -284,7 +284,7 @@ merylBlockWriter::mergeBatches(uint32 oi) { // Setup the merge. - resizeArrayPair(suffixes, values, 0, nKmersMax, totnKmers); + resizeArrayPair(suffixes, values, 0, nKmersMax, totnKmers, resizeArray_doNothing); // Merge! We don't know the number of different kmers in the input, and are forced // to loop infinitely. diff --git a/ext/meryl/src/utility/src/utility/kmers-writer-stream.C b/ext/meryl/src/utility/src/utility/kmers-writer-stream.C index 7c9fb1f..c9055df 100644 --- a/ext/meryl/src/utility/src/utility/kmers-writer-stream.C +++ b/ext/meryl/src/utility/src/utility/kmers-writer-stream.C @@ -24,7 +24,7 @@ merylStreamWriter::merylStreamWriter(merylFileWriter *writer, uint32 fileNumber) _writer = writer; - strncpy(_outName, _writer->_outName, FILENAME_MAX+1); + strncpy(_outName, _writer->_outName, FILENAME_MAX); // Encoding data diff --git a/ext/meryl/src/utility/src/utility/kmers-writer.C b/ext/meryl/src/utility/src/utility/kmers-writer.C index 89a660c..c0ca7af 100644 --- a/ext/meryl/src/utility/src/utility/kmers-writer.C +++ b/ext/meryl/src/utility/src/utility/kmers-writer.C @@ -23,14 +23,11 @@ void merylFileWriter::initialize(uint32 prefixSize, bool isMultiSet) { - // Fail if we're already initialized and asked to change the prefix size. - // But just ignore the re-init request if the prefix size is the same. - if ((_initialized == true) && (prefixSize != _prefixSize)) fprintf(stderr, "merylFileWriter::initialize()-- asked to initialize with different prefixSize (new %u existing %u).\n", prefixSize, _prefixSize), exit(1); - if (_initialized == true) + if (_initialized == true) // Nothing to do if we're already done. return; // If the global mersize isn't set, we're hosed. @@ -57,12 +54,12 @@ merylFileWriter::initialize(uint32 prefixSize, bool isMultiSet) { _prefixSize = 12; //max((uint32)8, 2 * kmer::merSize() / 3); _suffixSize = 2 * kmer::merSize() - _prefixSize; - _suffixMask = buildLowBitMask(_suffixSize); + _suffixMask = uint64MASK(_suffixSize); // Decide how many files to write. We can make up to 2^32 files, but will // run out of file handles _well_ before that. For now, limit to 2^6 = 64 files. - _numFilesBits = 6; + _numFilesBits = 6; //(_prefixSize < 7) ? _prefixSize : 6; _numBlocksBits = _prefixSize - _numFilesBits; _numFiles = (uint64)1 << _numFilesBits; @@ -276,7 +273,7 @@ merylFileWriter::writeBlockToFile(FILE *datFile, // Save the index entry. - uint64 block = blockPrefix & buildLowBitMask(_numBlocksBits); + uint64 block = blockPrefix & uint64MASK(_numBlocksBits); datFileIndex[block].set(blockPrefix, datFile, nKmers); diff --git a/ext/meryl/src/utility/src/utility/kmers-writer.H b/ext/meryl/src/utility/src/utility/kmers-writer.H index 35e1038..61df83a 100644 --- a/ext/meryl/src/utility/src/utility/kmers-writer.H +++ b/ext/meryl/src/utility/src/utility/kmers-writer.H @@ -73,7 +73,7 @@ private: uint32 _prefixSize; uint32 _suffixSize; - kmdata _suffixMask; + uint64 _suffixMask; uint32 _numFilesBits; uint32 _numBlocksBits; diff --git a/ext/meryl/src/utility/src/utility/logging.C b/ext/meryl/src/utility/src/utility/logging.C index c5cacc9..60638fa 100644 --- a/ext/meryl/src/utility/src/utility/logging.C +++ b/ext/meryl/src/utility/src/utility/logging.C @@ -99,9 +99,11 @@ public: _part = 0; - _length = 512 * 1024 * 1024; // Forces a rotate() on the first write. + _length = 0; _lengthMax = 512 * 1024 * 1024; + _bufferSize = bufferSize; // Forces a rotate() on the first write. + _output = NULL; }; @@ -146,10 +148,6 @@ public: _part++; - if (_prefix[0] == 0) - fprintf(stderr, "_prefix not set for thread %d\n", _threadID); - assert(_prefix[0] != 0); - if (_threadID < UINT32_MAX) { snprintf(_filePrefix, FILENAME_MAX, "%s.%03u.%s", _prefix, _order, _name); snprintf(_fileName, FILENAME_MAX, "%s.%03u.%s.thr%03d", _prefix, _order, _name, _threadID); @@ -216,6 +214,8 @@ private: uint32 _part; + uint32 _bufferSize; + writeBuffer *_output; uint64 _length; uint64 _lengthMax; @@ -230,18 +230,10 @@ private: logFile::logFile(char const *prefix, uint64 maxSize) { - _threadMax = 1024; - _threadNum = omp_get_max_threads(); - - _maxSize = maxSize; - - _mainI = new logFileInstance(prefix, UINT32_MAX, maxSize); - _threadI = new logFileInstance * [_threadMax]; + _mainI = new logFileInstance(prefix, UINT32_MAX, maxSize); + _threadI = new logFileInstance * [omp_get_max_threads()]; - for (uint32 ii=0; ii<_threadMax; ii++) - _threadI[ii] = nullptr; - - for (uint32 ii=0; ii<_threadNum; ii++) + for (uint32 ii=0; iisetPrefix(prefix); - for (uint32 ii=0; ii<_threadMax; ii++) { - if (_threadI[ii]) - _threadI[ii]->setPrefix(prefix); - } + for (uint32 ii=0; iisetPrefix(prefix); } @@ -296,9 +288,8 @@ logFile::setName(char const *name) { _mainI->setName(name); - for (uint32 ii=0; ii<_threadMax; ii++) - if (_threadI[ii]) - _threadI[ii]->setName(name); + for (uint32 ii=0; iisetName(name); } @@ -307,9 +298,8 @@ logFile::setMaxSize(uint64 size) { _mainI->setMaxSize(size); - for (uint32 ii=0; ii<_threadMax; ii++) - if (_threadI[ii]) - _threadI[ii]->setMaxSize(size); + for (uint32 ii=0; iisetMaxSize(size); } @@ -373,7 +363,7 @@ logFile::enable(char const *optionString, char const *levelName) { optionString++; } - while ((*optionString != 0)) { + while ((*optionString != 0) && (*optionString == '-')) { verbosity++; optionString++; } @@ -477,35 +467,12 @@ logFile::writeStatus(char const *fmt, va_list ap) { void logFile::writeLog(char const *fmt, va_list ap) { - int32 nt = omp_get_num_threads(); // Number of threads currently active - int32 tn = omp_get_thread_num(); // ID of this thread + int32 nt = omp_get_num_threads(); + int32 tn = omp_get_thread_num(); - // If tn is more than we have space for we need to allocate a new - // _threadI array. But this is hard. So just blow up. + logFileInstance *lf = (nt == 1) ? (_mainI) : (_threadI[tn]); - if (tn >= _threadMax) { - fprintf(stderr, "TOO MANY THREADS!\n"); - assert(0); - } - - // If we're only running a single thread, or we have already allocated an - // output for this thread, we can immediately write the log. - - if (nt == 1) { - _mainI->writeLog(fmt, ap); - } - - else if (_threadI[tn]) { - _threadI[tn]->writeLog(fmt, ap); - } - - // Otherwise, we need to allocate a new thread output and set it up before - // we can write. - - else { - _threadI[tn] = new logFileInstance(getPrefix(), tn, _maxSize); - _threadI[tn]->writeLog(fmt, ap); - } + lf->writeLog(fmt, ap); } @@ -619,8 +586,7 @@ void logFile::flush(void) { _mainI->flush(); - for (uint32 ii=0; ii<_threadMax; ii++) - if (_threadI[ii]) - _threadI[ii]->flush(); + for (uint32 ii=0; iiflush(); } diff --git a/ext/meryl/src/utility/src/utility/logging.H b/ext/meryl/src/utility/src/utility/logging.H index 16faeb6..d4a9eb3 100644 --- a/ext/meryl/src/utility/src/utility/logging.H +++ b/ext/meryl/src/utility/src/utility/logging.H @@ -152,11 +152,6 @@ public: void flush(void); private: - uint32 _threadMax; // How many threads we can allocate. - uint32 _threadNum; // How many threads we have configured. - - uint64 _maxSize; - logFileInstance *_mainI; logFileInstance **_threadI; diff --git a/ext/meryl/src/utility/src/utility/mt19937ar.C b/ext/meryl/src/utility/src/utility/mt19937ar.C index 70b1728..991f053 100644 --- a/ext/meryl/src/utility/src/utility/mt19937ar.C +++ b/ext/meryl/src/utility/src/utility/mt19937ar.C @@ -68,7 +68,7 @@ // initialize with a single seed void -mtRandom::mtSetSeed(uint32 s) { +mtRandom::construct(uint32 s) { mt[0] = s; @@ -79,7 +79,7 @@ mtRandom::mtSetSeed(uint32 s) { for (mti=1; mti> 30)) + mti); - mag01[0] = 0; + mag01[0] = uint32ZERO; mag01[1] = MT_MATRIX_A; } @@ -92,7 +92,7 @@ mtRandom::mtSetSeed(uint32 s) { /* slight change for C++, 2004/2/26 */ mtRandom::mtRandom(uint32 *init_key, uint32 key_length) { - mtSetSeed(19650218UL); + construct(19650218UL); int i = 1; int j = 0; @@ -136,14 +136,14 @@ mtRandom::mtRandom32(void) { for (kk=0; kk < MT_N - MT_M; kk++) { y = (mt[kk] & MT_UPPER_MASK) | (mt[kk+1] & MT_LOWER_MASK); - mt[kk] = mt[kk + MT_M] ^ (y >> 1) ^ mag01[y & 0x00000001UL]; + mt[kk] = mt[kk + MT_M] ^ (y >> 1) ^ mag01[y & uint32ONE]; } for (; kk < MT_N-1; kk++) { y = (mt[kk] & MT_UPPER_MASK) | (mt[kk + 1] & MT_LOWER_MASK); - mt[kk] = mt[kk + (MT_M - MT_N)] ^ (y >> 1) ^ mag01[y & 0x00000001UL]; + mt[kk] = mt[kk + (MT_M - MT_N)] ^ (y >> 1) ^ mag01[y & uint32ONE]; } y = (mt[MT_N-1] & MT_UPPER_MASK) | (mt[0] & MT_LOWER_MASK); - mt[MT_N-1] = mt[MT_M-1] ^ (y >> 1) ^ mag01[y & 0x00000001UL]; + mt[MT_N-1] = mt[MT_M-1] ^ (y >> 1) ^ mag01[y & uint32ONE]; mti = 0; } @@ -152,7 +152,7 @@ mtRandom::mtRandom32(void) { /* Tempering */ y ^= (y >> 11); - y ^= (y << 7) & 0x9d2c5680UL; + y ^= (y << 7) & 0x9d2c5680UL; y ^= (y << 15) & 0xefc60000UL; y ^= (y >> 18); diff --git a/ext/meryl/src/utility/src/utility/mt19937ar.H b/ext/meryl/src/utility/src/utility/mt19937ar.H index 4e46e1c..ac6ce97 100644 --- a/ext/meryl/src/utility/src/utility/mt19937ar.H +++ b/ext/meryl/src/utility/src/utility/mt19937ar.H @@ -40,15 +40,19 @@ static const uint32 MT_LOWER_MASK = 0x7fffffffUL; // least significant r bits class mtRandom { +private: + void construct(uint32 s); + public: - mtRandom() { mtSetSeed(getpid() * time(NULL)); }; - mtRandom(uint32 s) { mtSetSeed(s); }; + mtRandom() { construct(getpid() * time(NULL)); }; + mtRandom(uint32 s) { construct(s); }; mtRandom(uint32 *init_key, uint32 key_length); - void mtSetSeed(uint32 s); + ~mtRandom() { + }; - uint32 mtRandom32(void); - uint64 mtRandom64(void) { return((((uint64)mtRandom32()) << 32) | (uint64)mtRandom32()); } + uint32 mtRandom32(void); + uint64 mtRandom64(void) { return((((uint64)mtRandom32()) << 32) | (uint64)mtRandom32()); } // Real valued randomness // mtRandomRealOpen() -- on [0,1) real interval @@ -67,9 +71,9 @@ public: // returns a random number with gaussian distribution, mean of zero and std.dev. of 1 // - double mtRandomGaussian(double mean=0.0, double stddev=1.0); + double mtRandomGaussian(double mean=0.0, double stddev=1.0); - double mtRandomExponential(double lambda, double tau=1.0); + double mtRandomExponential(double lambda, double tau=1.0); private: uint32 mt[MT_N]; // State vector array diff --git a/ext/meryl/src/utility/src/utility/objectStore.C b/ext/meryl/src/utility/src/utility/objectStore.C new file mode 100644 index 0000000..13861fb --- /dev/null +++ b/ext/meryl/src/utility/src/utility/objectStore.C @@ -0,0 +1,306 @@ + +/****************************************************************************** + * + * This file is part of meryl-utility, a collection of miscellaneous code + * used by Meryl, Canu and others. + * + * This software is based on: + * 'Canu' v2.0 (https://github.com/marbl/canu) + * which is based on: + * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net) + * the 'kmer package' r1994 (http://kmer.sourceforge.net) + * + * Except as indicated otherwise, this is a 'United States Government Work', + * and is released in the public domain. + * + * File 'README.licenses' in the root directory of this distribution + * contains full conditions and disclaimers. + */ + +#include "types.H" +#include "arrays.H" +#include "strings.H" + +#include "objectStore.H" + +#include +#include + + + +extern char **environ; // Where, or where, is this really defined?! + + + +static +char * +findSeqStorePath(char *requested) { + splitToWords F(requested, splitPaths); + + if (F.numWords() < 2) + return(NULL); + + char *filename = F.last(0); + char *storename = F.last(1); + + // If not a blobs file name, return no file. + + if (strlen(filename) != 10) + return(NULL); + + if ((filename[0] != 'b') || + (filename[1] != 'l') || + (filename[2] != 'o') || + (filename[3] != 'b') || + (filename[4] != 's') || + (filename[5] != '.') || + (isdigit(filename[6]) == 0) || + (isdigit(filename[7]) == 0) || + (isdigit(filename[8]) == 0) || + (isdigit(filename[9]) == 0)) + return(NULL); + + // Now just paste the two components together in the proper + // way and return it. + + char *filepath = new char [FILENAME_MAX + 1]; + + snprintf(filepath, FILENAME_MAX, "%s/%s", storename, filename); + + return(filepath); +} + + + +static +char * +findOvlStorePath(char *requested) { + splitToWords F(requested, splitPaths); + + if (F.numWords() < 2) + return(NULL); + + char *basename = NULL; + char *storename = F.last(1); + char *filename = F.last(0); + + if (strlen(filename) != 9) + return(NULL); + + // If not an overlap store data file name, return no file. + + if ((isdigit(filename[0]) == 0) || + (isdigit(filename[1]) == 0) || + (isdigit(filename[2]) == 0) || + (isdigit(filename[3]) == 0) || + (filename[4] != '<') || + (isdigit(filename[5]) == 0) || + (isdigit(filename[6]) == 0) || + (isdigit(filename[7]) == 0) || + (filename[8] != '>')) + return(NULL); + + // Get ready for some ugly string parsing. We expect strings similar to: + // + // requested file F -- '../asm.ovlStore/0001<000>' + // current path P -- '/path/to/assembly/correction/2-correction' + // + // If the first component of F is '..', we drop it and the last component of P. + // When there are no more '..'s at the start, we should be left with the + // store name in F and the assembly stage in P. + + char *cwd = getcwd(new char [FILENAME_MAX+1], FILENAME_MAX); + + splitToWords P(cwd, splitPaths); + + delete [] cwd; + + uint32 nStrip = 0; + + //fprintf(stderr, "FROM cwd '%s'\n", cwd); + //fprintf(stderr, " requested '%s'\n", requested); + + // Remove identity components. + + while ((F.numWords() > 0) && + (strcmp(F.first(), ".") == 0)) + F.shift(); + + // Remove up components. + + while ((P.numWords() > 0) && + (F.numWords() > 0) && + (strcmp(F.first(), "..") == 0)) { + //fprintf(stderr, "STRIP '%s' from requested and '%s' from cwd\n", F.first(), P.last()); + + F.shift(); + P.pop(); + + nStrip++; + } + + //fprintf(stderr, "P.last '%s'\n", P.last()); + //fprintf(stderr, "F.first '%s'\n", F.first()); + + // We can run in one of three different places: + // 1) assembly_root/correction/1-stuff - ../asm.ovlStore/0001<001> + // 2) assembly_root/correction - ./asm.ovlStore/0001<001> + // 3) assembly_root - ./correction/asm.ovlStore/0001<001> + // + // In the first case, we strip off the '..' and '1-stuff', set basename + // to the last component in P, the storename to the first component + // in F and the file to the last component in F (which is always true). + // + // In the second case, nothing was stripped, and the result is the same. + // + // In the third case, again, nothing was stripped, but the basename is + // now in F, not P. + // + // All that boils down to + + if (nStrip > 0) { // First case. + basename = P.last(); + storename = F.first(); + assert(F.numWords() == 2); + } + + else if (F.numWords() == 2) { // Second case. + basename = P.last(); // (same result as third case) + storename = F.first(); + assert(F.numWords() == 2); + } + + else { // Third case. + basename = F.first(0); + storename = F.first(1); + assert(F.numWords() == 3); + } + + // We could check that the namespace -- the name of this assembly -- is before + // the basename (lots of work) and that the basename is one of 'correction', + // 'trimming', etc. But why? + + char *filepath = new char [FILENAME_MAX + 1]; + + //fprintf(stderr, "MAKE PATH STAGE '%s'\n", basename); + //fprintf(stderr, " STORENAME '%s'\n", storename); + //fprintf(stderr, " FILENAME '%s'\n", filename); + + snprintf(filepath, FILENAME_MAX, "%s/%s/%s", basename, storename, filename); + + return(filepath); +} + + + +bool +fetchFromObjectStore(char *requested) { + + // Decide if we even need to bother. If the file exists locally, or if + // one of the environment variables is missing, no, we don't need to bother. + + if (fileExists(requested)) + return(false); + + char *da = getenv("CANU_OBJECT_STORE_CLIENT_DA"); + char *ns = getenv("CANU_OBJECT_STORE_NAMESPACE"); + char *pr = getenv("CANU_OBJECT_STORE_PROJECT"); + + if ((da == NULL) || + (ns == NULL) || + (pr == NULL)) + return(false); + + // Try to figure out the object store path for this object based on the name + // of the requested file. Paths to stores are relative, but we need them + // rooted in the assembly root directory: + // + // ../../asm.seqStore -> ./asm.seqStore + // ../asm.ovlStore -> ./correction/asm.ovlStore + // + // For the seqStore, we can just grab the last two components. + // For the ovlStore, we need to parse out the subdirectory the store is in. + + char *path = NULL; + + if (path == NULL) + path = findSeqStorePath(requested); + + if (path == NULL) + path = findOvlStorePath(requested); + + if (path == NULL) + fprintf(stderr, "fetchFromObjectStore()-- requested file '%s', but don't know where that is.\n", requested), exit(1); + + // With the path to the object figured out, finish making the path by appending + // the PROJEXT and NAMESPACE. + + char *object = new char [FILENAME_MAX+1]; + + snprintf(object, FILENAME_MAX, "%s:%s/%s", pr, ns, path); + + // Then report what's going on. + + fprintf(stderr, "fetchFromObjectStore()-- fetching file '%s'\n", requested); + fprintf(stderr, "fetchFromObjectStore()-- from object '%s'\n", object); + + // Build up a command we can execute after forking. + + char *args[8]; + + args[0] = basename(da); + args[1] = duplicateString("download"); // Thanks, execve, for wanting mutable + args[2] = duplicateString("--overwrite"); // strings and making us jump through + args[3] = duplicateString("--no-progress"); // a hoop to get them without compiler + args[4] = duplicateString("--output"); // warnings. + args[5] = requested; + args[6] = object; + args[7] = NULL; + + // Fork and run the child command if we're the child. Normally, evecve() + // doesn't return (because it obliterated the process it could return to). + // If it does return, an error occurred, so we just go BOOM too. As per + // the manpage, _exit() MUST be used instead of exit(), so that + // stdin/out/err are left intact. + // + // vfork() is dangerous. If we're the child, all we're allowed to do + // after the call is execve() or _exit(). Absolutely nothing else. + + pid_t pid = vfork(); + + if (pid == 0) { + execve(da, args, environ); + fprintf(stderr, "fetchFromObjectStore()-- execve() failed with error '%s'.\n", strerror(errno)); + _exit(127); + } + + if (pid == -1) + fprintf(stderr, "fetchFromObjectStore()-- vfork() failed with error '%s'.\n", strerror(errno)), exit(1); + + // Otherwise, we're still the parent; wait for the child process to + // terminate. + + int status = 0; + pid_t wid = waitpid(pid, &status, 0); + + if (wid == -1) + fprintf(stderr, "fetchFromObjectStore()-- waitpid() failed with error '%s'.\n", strerror(errno)), exit(1); + + if ((WIFEXITED(status)) && + (WEXITSTATUS(status) == 127)) + fprintf(stderr, "fetchFromObjectStore()-- execve() failed to run the command.\n"), exit(1); + + // If no file, it's fatal. + if (fileExists(requested) == false) + fprintf(stderr, "fetchFromObjectStore()-- failed fetch file '%s'.\n", requested), exit(1); + + delete [] args[1]; + delete [] args[2]; + delete [] args[3]; + delete [] args[4]; + + delete [] path; + delete [] object; + + return(true); +} diff --git a/ext/meryl/src/utility/src/tests/readLines.C b/ext/meryl/src/utility/src/utility/objectStore.H similarity index 55% rename from ext/meryl/src/utility/src/tests/readLines.C rename to ext/meryl/src/utility/src/utility/objectStore.H index c536e86..148a306 100644 --- a/ext/meryl/src/utility/src/tests/readLines.C +++ b/ext/meryl/src/utility/src/utility/objectStore.H @@ -17,30 +17,19 @@ * contains full conditions and disclaimers. */ +#include "types.H" #include "files.H" -int32 -main(int32 argc, char **argv) { - uint32 lineMax = 0; - uint32 lineLen = 0; - char *line = nullptr; - uint32 nLines = 0; - - if (argc == 1) { - fprintf(stderr, "usage: %s inputFile[.gz]\n", argv[0]); - return(1); - } - - compressedFileReader *in = new compressedFileReader(argv[1]); - - while (AS_UTL_readLine(line, lineLen, lineMax, in->file())) { - nLines++; - } - - delete in; - delete [] line; - - fprintf(stderr, "Found %u lines! Yay!\n", nLines); - - return(0); -} +// Basic routines to fetch and stash files from an object store. +// Most of this is done in the executive, but low level fetching +// of sqStore and ovStore data is done here. +// +// NOTE that this function is limited in its ability to fetch files. +// It will ONLY work with seqStore and ovlStore data files: +// seqStore/blobs.* +// ovlStore/0000<000> +// +// Returns false if the file was not fetched (either no object store +// in use, or the file existed already), true if it was fetched. +// +bool fetchFromObjectStore(char *filename); diff --git a/ext/meryl/src/utility/src/utility/sampledDistribution.H b/ext/meryl/src/utility/src/utility/sampledDistribution.H index 26cbde6..7c92bc4 100644 --- a/ext/meryl/src/utility/src/utility/sampledDistribution.H +++ b/ext/meryl/src/utility/src/utility/sampledDistribution.H @@ -80,12 +80,12 @@ public: } while (_dataMax <= val) - resizeArray(_data, _dataLen, _dataMax, 2 * _dataMax, _raAct::copyData | _raAct::clearNew); + resizeArray(_data, _dataLen, _dataMax, 2 * _dataMax, resizeArray_copyData | resizeArray_clearNew); _data[val] += cnt; _dataSum += cnt; - _dataLen = std::max(_dataLen, val + 1); + _dataLen = max(_dataLen, val + 1); } AS_UTL_closeFile(D); diff --git a/ext/meryl/src/utility/src/utility/sequence.C b/ext/meryl/src/utility/src/utility/sequence.C index f5c23b3..900ea3a 100644 --- a/ext/meryl/src/utility/src/utility/sequence.C +++ b/ext/meryl/src/utility/src/utility/sequence.C @@ -461,123 +461,50 @@ encode8bitSequence(uint8 *&chunk, char *seq, uint32 seqLen) { -//////////////////////////////////////// -// dnaSeq functions -// - -dnaSeq::dnaSeq() { -}; - -dnaSeq::~dnaSeq() { - delete [] _name; - delete [] _seq; - delete [] _qlt; +// Saves the file offset of the first byte in the record: +// for FASTA, the '>' +// for FASTQ, the '@'. + +class dnaSeqIndexEntry { +public: + dnaSeqIndexEntry() { + _fileOffset = UINT64_MAX; + _sequenceLength = 0; + }; + ~dnaSeqIndexEntry() { + }; + + uint64 _fileOffset; + uint64 _sequenceLength; }; -void -dnaSeq::releaseAll(void) { - delete [] _name; _name = _ident = _flags = nullptr; - delete [] _seq; _seq = nullptr; - delete [] _qlt; _qlt = nullptr; - - _nameMax = 0; - _seqMax = 0; - _seqLen = 0; -} - - -void -dnaSeq::releaseBases(void) { - delete [] _seq; _seq = nullptr; - delete [] _qlt; _qlt = nullptr; - - _seqMax = 0; - _seqLen = 0; -} - - -bool -dnaSeq::copy(char *bout, - uint32 bgn, uint32 end, bool terminate) { - - if ((end < bgn) || (_seqLen < end)) - return(false); - - for (uint32 ii=bgn; iifile()); + _index = NULL; + _indexLen = 0; + _indexMax = 0; + if (indexed == false) + return; -//////////////////////////////////////// -// dnaSeqFile functions -// + if (_file->isCompressed() == true) + fprintf(stderr, "ERROR: cannot index compressed input '%s'.\n", filename), exit(1); -dnaSeqFile::dnaSeqFile(char const *filename, bool indexed) { - _filename = duplicateString(filename); + if (_file->isNormal() == false) + fprintf(stderr, "ERROR: cannot index pipe input.\n"), exit(1); - reopen(indexed); + generateIndex(); } dnaSeqFile::~dnaSeqFile() { - delete [] _filename; delete _file; delete _buffer; delete [] _index; @@ -585,33 +512,6 @@ dnaSeqFile::~dnaSeqFile() { -// Open, or reopen, an input file. -// -void -dnaSeqFile::reopen(bool indexed) { - - // If a _file exists already, reopen it, otherwise, make a new one. - if (_file) - _file->reopen(); - else - _file = new compressedFileReader(_filename); - - // Since the file object is always new, we need to make a new read buffer. - // gzip inputs seem to be (on FreeBSD) returning only 64k blocks - // regardless of the size of our buffer; but uncompressed inputs will - // benefit slightly from a bit larger buffer. - delete _buffer; - - _buffer = new readBuffer(_file->file(), 128 * 1024); - - // If we have an index already or one is requested, (re)generate it. - - if ((_index != nullptr) || (indexed == true)) - generateIndex(); -} - - - bool dnaSeqFile::findSequence(uint64 i) { @@ -620,8 +520,6 @@ dnaSeqFile::findSequence(uint64 i) { _buffer->seek(_index[i]._fileOffset); - _seqIdx = i; - return(true); } @@ -639,142 +537,81 @@ dnaSeqFile::sequenceLength(uint64 i) { -//////////////////////////////////////// -// dnaSeqFile indexing -// - -const uint64 dnaSeqVersion01 = 0x3130716553616e64; // dnaSeq01 -const uint64 dnaSeqVersion02 = 0x3230716553616e64; // dnaSeq02 - not used yet - - -char const * -makeIndexName(char const *prefix) { - char const *suffix = ".dnaSeqIndex"; - uint32 plen = strlen(prefix); - uint32 slen = strlen(suffix); - char *iname = new char [plen + slen + 1]; - - memcpy(iname, prefix, plen + 1); // +1 for the NUL byte. - memcpy(iname + plen, suffix, slen + 1); - - return(iname); +bool +dnaSeqFile::findSequence(const char *name) { + fprintf(stderr, "dnaSeqFile::findSequence(const char *) not supported.\n"); + exit(1); + return(false); } -// Load an index. Returns true if one was loaded. + bool dnaSeqFile::loadIndex(void) { - char const *indexName = makeIndexName(_filename); - FILE *indexFile = nullptr; - - if (fileExists(indexName) == true) { - FILE *indexFile = AS_UTL_openInputFile(indexName); - uint64 magic; - uint64 size; - uint64 date; - - loadFromFile(magic, "dnaSeqFile::magic", indexFile); - loadFromFile(size, "dnaSeqFile::size", indexFile); - loadFromFile(date, "dnaSeqFile::date", indexFile); - loadFromFile(_indexLen, "dnaSeqFile::indexLen", indexFile); - - if (magic != dnaSeqVersion01) { - fprintf(stderr, "ERROR: file '%s' isn't a dnaSeqIndex; manually remove this file.\n", indexName); - exit(1); - } + char indexName[FILENAME_MAX+1]; - if ((size == AS_UTL_sizeOfFile(_filename)) && - (date == AS_UTL_timeOfFile(_filename))) { - _index = new dnaSeqIndexEntry [_indexLen]; + snprintf(indexName, FILENAME_MAX, "%s.index", _file->filename()); - loadFromFile(_index, "dnaSeqFile::index", _indexLen, indexFile); + if (fileExists(indexName) == false) + return(false); - } else { - fprintf(stderr, "WARNING: file '%s' disagrees with index; recreating index.\n", _filename); + FILE *indexFile = AS_UTL_openInputFile(indexName); - _index = nullptr; - _indexLen = 0; - _indexMax = 0; - } + loadFromFile(_indexLen, "dnaSeqFile::indexLen", indexFile); - AS_UTL_closeFile(indexFile, indexName); - } + _index = new dnaSeqIndexEntry [_indexLen]; + + loadFromFile(_index, "dnaSeqFile::index", _indexLen, indexFile); - delete [] indexName; + AS_UTL_closeFile(indexFile, indexName); - return(_index != nullptr); // Return true if we have an index. + return(true); } void dnaSeqFile::saveIndex(void) { - char const *indexName = makeIndexName(_filename); - FILE *indexFile = AS_UTL_openOutputFile(indexName); + char indexName[FILENAME_MAX+1]; + + snprintf(indexName, FILENAME_MAX, "%s.index", _file->filename()); - uint64 magic = dnaSeqVersion01; - uint64 size = AS_UTL_sizeOfFile(_filename); - uint64 date = AS_UTL_timeOfFile(_filename); + FILE *indexFile = AS_UTL_openOutputFile(indexName); - writeToFile(magic, "dnaSeqFile::magic", indexFile); - writeToFile(size, "dnaSeqFile::size", indexFile); - writeToFile(date, "dnaSeqFile::date", indexFile); - writeToFile(_indexLen, "dnaSeqFile::indexLen", indexFile); + writeToFile(_indexLen, "dnaSeqFile::indexLen", indexFile); writeToFile(_index, "dnaSeqFile::index", _indexLen, indexFile); AS_UTL_closeFile(indexFile, indexName); - - delete [] indexName; } void dnaSeqFile::generateIndex(void) { - dnaSeq seq; - - // Fail if an index is requested for a compressed file. - - if (_file->isCompressed() == true) - fprintf(stderr, "ERROR: cannot index compressed input '%s'.\n", _filename), exit(1); - - if (_file->isNormal() == false) - fprintf(stderr, "ERROR: cannot index pipe input.\n"), exit(1); - - // If we can load an index, do it and return. + uint32 nameMax = 0; + char *name = NULL; + uint64 seqMax = 0; + char *seq = NULL; + uint8 *qlt = NULL; + uint64 seqLen = 0; if (loadIndex() == true) return; - // Rewind the buffer to make sure we're at the start of the file. - - _buffer->seek(0); - - // Allocate space for the index, set the first entry to the current - // position of the file. - _indexLen = 0; _indexMax = 1048576; _index = new dnaSeqIndexEntry [_indexMax]; - _index[0]._fileOffset = _buffer->tell(); - _index[0]._sequenceLength = 0; + _index[_indexLen]._fileOffset = _buffer->tell(); + _index[_indexLen]._sequenceLength = 0; // While we read sequences: - // update the length of the sequence (we've already saved the position) + // update the length of the sequence (we've already save the position) // make space for more sequences // save the position of the next sequence - - while (loadSequence(seq) == true) { - if (seq.wasError()) { - fprintf(stderr, "WARNING: error reading sequence at/before '%s'\n", seq.ident()); - } - - if (seq.wasReSync()) { - fprintf(stderr, "WARNING: lost sync reading before sequence '%s'\n", seq.ident()); - } - - _index[_indexLen]._sequenceLength = seq.length(); + // + while (loadSequence(name, nameMax, seq, qlt, seqMax, seqLen) == true) { + _index[_indexLen]._sequenceLength = seqLen; increaseArray(_index, _indexLen, _indexMax, 1048576); @@ -784,207 +621,128 @@ dnaSeqFile::generateIndex(void) { _index[_indexLen]._sequenceLength = 0; } - // Save whatever index we made. - - saveIndex(); -} - - - -void -dnaSeqFile::removeIndex(void) { - - delete [] _index; + //for (uint32 ii=0; ii<_indexLen; ii++) + // fprintf(stderr, "%u offset %lu length %lu\n", ii, _index[ii]._fileOffset, _index[ii]._sequenceLength); - _indexLen = 0; - _indexMax = 0; - _index = nullptr; + if (_indexLen > 0) + saveIndex(); } -bool -dnaSeqFile::loadFASTA(char *&name, uint32 &nameMax, - char *&seq, - uint8 *&qlt, uint64 &seqMax, uint64 &seqLen, uint64 &qltLen) { +uint64 +dnaSeqFile::loadFASTA(char *&name, uint32 &nameMax, + char *&seq, + uint8 *&qlt, uint64 &seqMax) { uint64 nameLen = 0; + uint64 seqLen = 0; char ch = _buffer->read(); - // Skip any whitespace. - - while (isWhiteSpace(ch)) - ch = _buffer->read(); - - // Fail rather ungracefully if we aren't at a sequence start. + assert(ch == '>'); - if (ch != '>') - return(false); - - // Read the header line into the name string. We cannot skip whitespace - // here, but we do allow DOS to insert a \r before any \n. + // Read the header line into the name string. for (ch=_buffer->read(); (ch != '\n') && (ch != 0); ch=_buffer->read()) { - if (ch == '\r') - continue; if (nameLen+1 >= nameMax) resizeArray(name, nameLen, nameMax, 3 * nameMax / 2); name[nameLen++] = ch; } - // Trim back the header line to remove white space at the end. The - // terminating NUL is tacked on at the end. + // Read sequence, skipping whitespace, until we hit a new sequence (or eof). - while ((nameLen > 0) && (isWhiteSpace(name[nameLen-1]))) - nameLen--; - - name[nameLen] = 0; - - // Read sequence, skipping whitespace, until we hit a new sequence or eof. - - seqLen = 0; - qltLen = 0; + for (ch=_buffer->readuntil('>'); (ch != '>') && (ch != 0); ch=_buffer->readuntil('>')) { + if ((ch == '\n') || (ch == '\r') || (ch == '\t') || (ch == ' ')) + continue; - for (ch = _buffer->peek(); ((ch != '>') && - (ch != '@') && - (ch != 0)); ch = _buffer->peek()) { assert(_buffer->eof() == false); - ch = _buffer->read(); - - if (isWhiteSpace(ch)) - continue; - if (seqLen+1 >= seqMax) resizeArrayPair(seq, qlt, seqLen, seqMax, 3 * seqMax / 2); - seq[seqLen++] = ch; - qlt[qltLen++] = 0; + seq[seqLen] = ch; + qlt[seqLen] = 0; + + seqLen++; } + name[nameLen] = 0; seq[seqLen] = 0; - qlt[qltLen] = 0; + qlt[seqLen] = 0; assert(nameLen < nameMax); assert(seqLen < seqMax); - assert(qltLen < seqMax); - - _seqIdx++; - return(true); + return(seqLen); } -bool -dnaSeqFile::loadFASTQ(char *&name, uint32 &nameMax, - char *&seq, - uint8 *&qlt, uint64 &seqMax, uint64 &seqLen, uint64 &qltLen) { +uint64 +dnaSeqFile::loadFASTQ(char *&name, uint32 &nameMax, + char *&seq, + uint8 *&qlt, uint64 &seqMax) { uint32 nameLen = 0; + uint64 seqLen = 0; + uint64 qltLen = 0; char ch = _buffer->read(); - // Skip any whitespace. + assert(ch == '@'); - while (isWhiteSpace(ch)) - ch = _buffer->read(); - - // Fail rather ungracefully if we aren't at a sequence start. - - if (ch != '@') - return(false); - - // Read the header line into the name string. We cannot skip whitespace - // here, but we do allow DOS to insert a \r before any \n. + // Read the header line into the name string. for (ch=_buffer->read(); (ch != '\n') && (ch != 0); ch=_buffer->read()) { - if (ch == '\r') - continue; if (nameLen+1 >= nameMax) resizeArray(name, nameLen, nameMax, 3 * nameMax / 2); name[nameLen++] = ch; } - // Trim back the header line to remove white space at the end. - - while ((nameLen > 0) && (isWhiteSpace(name[nameLen-1]))) - nameLen--; - - name[nameLen] = 0; - - // Skip any whitespace, again. Once we hit non-whitespace we'll suck in - // the whole line. - - while (isWhiteSpace(ch)) - ch = _buffer->read(); - - // Read sequence. Pesky DOS files end with \r\n, and it suffices - // to stop on the \n and ignore all the rest. + // Read sequence. - seqLen = 0; - qltLen = 0; - - for (; (ch != '\n') && (ch != 0); ch=_buffer->read()) { - if (isWhiteSpace(ch)) + for (ch=_buffer->read(); (ch != '\n') && (ch != 0); ch=_buffer->read()) { + if ((ch == '\n') || (ch == '\r') || (ch == '\t') || (ch == ' ')) continue; if (seqLen+1 >= seqMax) resizeArrayPair(seq, qlt, seqLen, seqMax, 3 * seqMax / 2); seq[seqLen++] = ch; } - // Skip any more whitespace, fail if we're not at a quality start, then - // suck in the quality line. And then skip more whitespace. - - while (isWhiteSpace(ch)) - ch = _buffer->read(); - - if (ch != '+') - return(false); + // Skip header line for (ch=_buffer->read(); (ch != '\n') && (ch != 0); ch=_buffer->read()) { ; } - while (isWhiteSpace(ch)) - ch = _buffer->read(); - - // Read qualities and convert to integers. + // Read qualities. - for (; (ch != '\n') && (ch != 0); ch=_buffer->read()) { - if (isWhiteSpace(ch)) + for (ch=_buffer->read(); (ch != '\n') && (ch != 0); ch=_buffer->read()) { + if ((ch == '\n') || (ch == '\r') || (ch == '\t') || (ch == ' ')) continue; if (qltLen+1 >= seqMax) resizeArrayPair(seq, qlt, qltLen, seqMax, 3 * seqMax / 2); - qlt[qltLen++] = ch - '!'; + qlt[qltLen++] = ch; } - // Skip whitespace after the sequence. This one is a little weird. It - // tests if the _next_ letter is whitespace, and if so, gets it from the - // buffer. After this loop, the _next_ letter in the buffer should be - // either a '>' or a '@'. - - while (isWhiteSpace(_buffer->peek())) - _buffer->read(); + //fprintf(stderr, "READ FASTQ name %u seq %lu qlt %lu\n", nameLen, seqLen, qltLen); + name[nameLen] = 0; seq[seqLen] = 0; qlt[qltLen] = 0; assert(nameLen < nameMax); assert(seqLen < seqMax); assert(qltLen < seqMax); + assert(seqLen == qltLen); - _seqIdx++; - - return(true); + return(seqLen); } bool -dnaSeqFile::loadSequence(char *&name, uint32 &nameMax, - char *&seq, - uint8 *&qlt, uint64 &seqMax, uint64 &seqLen, uint32 &error) { - uint64 qltLen = 0; - - // Allocate space for the arrays, if they're currently unallocated. +dnaSeqFile::loadSequence(char *&name, uint32 &nameMax, + char *&seq, + uint8 *&qlt, uint64 &seqMax, + uint64 &seqLen) { if (nameMax == 0) resizeArray(name, 0, nameMax, (uint32)1024); @@ -992,100 +750,27 @@ dnaSeqFile::loadSequence(char *&name, uint32 &nameMax, if (seqMax == 0) resizeArrayPair(seq, qlt, 0, seqMax, (uint64)65536); - // Clear our return values. - - bool loadSuccess = false; - - _isFASTA = false; - _isFASTQ = false; - - name[0] = 0; - seq[0] = 0; - qlt[0] = 0; - seqLen = 0; - - error = 0; - - // Skip any whitespace at the start of the file, or before the next FASTQ - // sequence (the FASTA reader will automagically skip whitespace at the - // end of the sequence). - - while (isWhiteSpace(_buffer->peek())) - _buffer->read(); - - // If we're not at a sequence start, scan ahead to find the next one. - // Not bulletproof; FASTQ qv's can match this. - - if ((_buffer->peek() != '>') && - (_buffer->peek() != '@') && - (_buffer->peek() != 0)) { - //fprintf(stderr, "dnaSeqFile::loadSequence()-- sequence sync lost at position %lu, attempting to find the next sequence.\n", _buffer->tell()); - error |= 0x02; - } - - bool lastWhite = isWhiteSpace(_buffer->peek()); - - while ((_buffer->peek() != '>') && - (_buffer->peek() != '@') && - (_buffer->peek() != 0)) { + while (_buffer->peek() == '\n') _buffer->read(); - } - - // Peek at the file to decide what type of sequence we need to read. - if (_buffer->peek() == '>') { - _isFASTA = true; - loadSuccess = loadFASTA(name, nameMax, seq, qlt, seqMax, seqLen, qltLen); - } - - else if (_buffer->peek() == '@') { - _isFASTQ = true; - loadSuccess = loadFASTQ(name, nameMax, seq, qlt, seqMax, seqLen, qltLen); - } + if (_buffer->peek() == '>') + seqLen = loadFASTA(name, nameMax, + seq, + qlt, seqMax); - else { - _isFASTA = false; - _isFASTQ = false; + else if (_buffer->peek() == '@') + seqLen = loadFASTQ(name, nameMax, + seq, + qlt, seqMax); + else return(false); - } - - // If we failed to load a sequence, report an error message and zero out - // the sequence. Leave the name as-is so we can at least return a length - // zero sequence. If we failed to load a name, it'll still be set to NUL. - - if (loadSuccess == false) { - //if (name[0] == 0) - // fprintf(stderr, "dnaSeqFile::loadSequence()-- failed to read sequence correctly at position %lu.\n", _buffer->tell()); - //else - // fprintf(stderr, "dnaSeqFile::loadSequence()-- failed to read sequence '%s' correctly at position %lu.\n", name, _buffer->tell()); - - error |= 0x01; - - seq[0] = 0; - qlt[0] = 0; - seqLen = 0; - } return(true); } -bool -dnaSeqFile::loadSequence(dnaSeq &seq) { - bool result = loadSequence(seq._name, seq._nameMax, - seq._seq, - seq._qlt, seq._seqMax, seq._seqLen, seq._error); - - if (result) - seq.findNameAndFlags(); - - return(result); -} - - - bool dnaSeqFile::loadBases(char *seq, uint64 maxLength, diff --git a/ext/meryl/src/utility/src/utility/sequence.H b/ext/meryl/src/utility/src/utility/sequence.H index a950ac3..fdab9c8 100644 --- a/ext/meryl/src/utility/src/utility/sequence.H +++ b/ext/meryl/src/utility/src/utility/sequence.H @@ -50,216 +50,128 @@ void decode3bitSequence(uint8 *chunk, uint32 chunkLen, char *seq, uint32 seqLe void decode8bitSequence(uint8 *chunk, uint32 chunkLen, char *seq, uint32 seqLen); -// Encode/decode an ACGT base to 0132. Relies on the ASCII encoding: -// -// A a 01c0 000 1 == 0 -> 0 -// C c 01c0 001 1 == 1 -> 1 -// T t 01c1 010 0 == 2 -> 2 -// G g 01c0 011 1 == 3 -> 3 -// N n 01c0 111 0 == 7 -> 4 -// ^^^ -// Decoding will always return uppercase letters (c=0). -// -// The inline arrays, in gcc anyway, compile to a single 64-bit constant -// and is equivalent to the C code: -// -// 0x0706050403020100llu >> (((base >> 1) & 0x07) << 3) & 0x0f -// -// with the additional optimization of removing the redundant shifts. - -inline -uint8 -encode2bitBase(char base) { - return((uint8 [8]){0, 1, 2, 3, 4, 4, 4, 4}[base >> 1 & 0x07]); -} - -inline -char -decode2bitBase(uint8 base) { - return("ACTGNNNN"[base & 0x07]); -} - - - -// A sequence loaded from disk. It should be treated as a read-only object. -// -// ident() returns the first word of the sequence header line, while flags() -// returns the rest of the line, or an empty line if there is no more line. -// -// It isn't possible to modify ident() and flags(). They're pointers -// into the same memory, and that isn't exposed. -// -// bases() and quals() could support modifications, as long as the length of -// the string doesn't change. Only canu needed to do that and it was worked -// around. -// -// If quality values are not available (e.g., FASTA) then all values are set -// to zero. -// -// The copy functions will copy bases (and qualities) from bgn to end, but -// not including the base at end -- that is, normal C-style semantics. The -// output will be NUL-terminated, unless explicitly told not to. Returns -// false if bgn or end are out of range or inconsistent. -// -class dnaSeq { -public: - dnaSeq(); - ~dnaSeq(); - char const *ident(void) { return(_ident); }; - char const *flags(void) { return(_flags); }; - char const *bases(void) { return(_seq); }; - uint8 const *quals(void) { return(_qlt); }; - uint64 length(void) { return(_seqLen); }; +class dnaSeqIndexEntry; // Internal use only, sorry. - void releaseAll(void); // Release all memory. - void releaseBases(void); // Release seq memory; keep the name. +class dnaSeq { +public: + dnaSeq() { + _nameMax = 0; + _name = NULL; + _seqMax = 0; + _seq = NULL; + _qlt = NULL; + _seqLen = 0; + }; - bool copy(char *bout, - uint32 bgn, uint32 end, bool terminate = true); + ~dnaSeq() { + delete [] _name; + delete [] _seq; + delete [] _qlt; + }; - bool copy(char *bout, - uint8 *qout, - uint32 bgn, uint32 end, bool terminate = true); - bool wasError(void) { return((_error & 0x01) == 0x01); }; - bool wasReSync(void) { return((_error & 0x02) == 0x02); }; + char *name(void) { return(_name); }; + char *bases(void) { return(_seq); }; + uint8 *quals(void) { return(_qlt); }; -private: - void findNameAndFlags(void); + uint64 length(void) { return(_seqLen); }; private: - char *_name = nullptr; - uint32 _nameMax = 0; - - char *_ident = nullptr; - char *_flags = nullptr; - - char *_seq = nullptr; - uint8 *_qlt = nullptr; - uint64 _seqMax = 0; // Space allocated. - uint64 _seqLen = 0; // Actual length. - - uint32 _error = 0; + uint32 _nameMax; + char *_name; + uint64 _seqMax; + char *_seq; + uint8 *_qlt; + uint64 _seqLen; friend class dnaSeqFile; }; -// An interface to FASTA and FASTQ files. -// -// Upon object creation, you can request that an index of the file be -// generated. Without an index, numberOfSequences(), findSequence() and -// sequenceLength() do not work well or at all. -// -// generateIndex() will force an index to be generated. -// removeIndex will remove any index. -// -// reopen() will reset the file to the start and. If the 'indexed' flag is -// true, or an index already exists, an index is (re)created. Note that -// setting 'indexed=false' will NOT remove an existing index. -// -// findSequence() will return true if the specified sequence is found in the -// file and leave the file positioned such that the next loadSequence() will -// load that sequence. -// - If an index exists, the index will be searched and the sequence will -// be returned regardless of where it is in the file. -// - If no index exists, the file will be searched forward until the -// sequence is found or the file ends. It is not possible to move -// 'backward' in the file in this case. -// -// sequenceLength() will return the length of sequence index i. If no index -// exists, or i is not a valid sequence index, UINT64_MAX is returned. -// -// isFASTA() and isFASTQ() return true if the last sequence loaded came from -// a FASTA or FASTQ source, respectively. If no sequence has been loaded -// yet, both functions will return false. -// -// loadSequence() will read the next sequence from the file. Returns false -// if the end of file is encountered, true otherwise. In particular, a -// sequence of length zero will return true. -// -// loadBases() will return a chunk of sequence from the file, up to -// 'maxLength' bases or the end of the current sequence. -// - Returns false only if EOF is encountered. -// - seqLength will have the length of the sequence returned. This can be zero. -// - endOfSequence will be true if the end of the sequence was encountered. -// - The returned sequence is NOT NUL terminated. -// class dnaSeqFile { public: - dnaSeqFile(char const *filename, bool indexed=false); + dnaSeqFile(const char *filename, bool indexed=false); ~dnaSeqFile(); - void reopen(bool indexed=false); - void generateIndex(void); - void removeIndex(void); - -public: - char const *filename(void) { return(_filename); }; - uint64 numberOfSequences(void) { return(_indexLen); }; - - bool findSequence(uint64 i); - uint64 sequenceLength(uint64 i); - -public: - // True if the last sequence loaded was from a FASTA or FASTQ file. - bool isFASTA(void) { return(_isFASTA); }; - bool isFASTQ(void) { return(_isFASTQ); }; - - // Return the sequence index of the last loaded sequence. - uint32 seqIdx(void) { return(_seqIdx-1); }; + compressedFileReader *_file; + readBuffer *_buffer; - // True if the input file is compressed (gzip, xz, etc). - bool isCompressed(void) { return(_file->isCompressed()); }; - -public: - bool loadSequence(char *&name, uint32 &nameMax, - char *&seq, - uint8 *&qlt, uint64 &seqMax, uint64 &seqLen, uint32 &errorCode); - bool loadSequence(dnaSeq &seq); - -public: - bool loadBases(char *seq, - uint64 maxLength, - uint64 &seqLength, - bool &endOfSequence); + dnaSeqIndexEntry *_index; + uint64 _indexLen; + uint64 _indexMax; private: bool loadIndex(void); void saveIndex(void); - bool - loadFASTA(char *&name, uint32 &nameMax, - char *&seq, - uint8 *&qlt, uint64 &seqMax, uint64 &seqLen, uint64 &qltLen); +public: + void generateIndex(void); + + // If indexed, searches the index for the proper sequence. + // + // If not indexed, searches forward in the file for the sequence. If not found, + // the file will be at the end. + // + // In both cases, the file is left positioned at the start of the sequence header. + // + // Returns true if found, false if not. + // + bool findSequence(uint64 i); + bool findSequence(const char *name); + + // Returns the number of sequences in the file. + uint64 numberOfSequences(void) { + return(_indexLen); + }; + + // Returns the length of sequence i. If no such sequence, returns UINT64_MAX. + uint64 sequenceLength(uint64 i); - bool - loadFASTQ(char *&name, uint32 &nameMax, - char *&seq, - uint8 *&qlt, uint64 &seqMax, uint64 &seqLen, uint64 &qltLen); + char *filename(void) { + return(_file->filename()); + } private: - char *_filename = nullptr; + uint64 + loadFASTA(char *&name, uint32 &nameMax, + char *&seq, + uint8 *&qlt, uint64 &seqMax); - bool _isFASTA = false; - bool _isFASTQ = false; - uint64 _seqIdx = 0; + uint64 + loadFASTQ(char *&name, uint32 &nameMax, + char *&seq, + uint8 *&qlt, uint64 &seqMax); - compressedFileReader *_file = nullptr; - readBuffer *_buffer = nullptr; - struct dnaSeqIndexEntry { // Offset of the first byte in the record: - uint64 _fileOffset; // '>' for FASTA, '@' for fastq. - uint64 _sequenceLength; // +public: + // Return the next sequence in the file. + // Returns false if EOF, true otherwise, even if the sequence is length zero. + // + bool loadSequence(char *&name, uint32 &nameMax, + char *&seq, + uint8 *&qlt, uint64 &seqMax, + uint64 &seqLen); + + bool loadSequence(dnaSeq &seq) { + return(loadSequence(seq._name, seq._nameMax, + seq._seq, + seq._qlt, seq._seqMax, + seq._seqLen)); }; - dnaSeqIndexEntry *_index = nullptr; - uint64 _indexLen = 0; - uint64 _indexMax = 0; + // Returns a chunk of sequence from the file, up to 'maxLength' bases or + // the end of the current sequence. This is NOT NUL terminated! + // + // Returns false if EOF is hit and no bases were loaded. + // + bool loadBases(char *seq, + uint64 maxLength, + uint64 &seqLength, + bool &endOfSequence); }; diff --git a/ext/meryl/src/utility/src/utility/speedCounter.H b/ext/meryl/src/utility/src/utility/speedCounter.H index 300684b..510aa3a 100644 --- a/ext/meryl/src/utility/src/utility/speedCounter.H +++ b/ext/meryl/src/utility/src/utility/speedCounter.H @@ -38,7 +38,7 @@ public: void enableLiner(void) { _line = true; }; bool tick(void) { - if (_enabled && ((++_count & _freq) == 0)) { + if (_enabled && ((++_count & _freq) == uint64ZERO)) { double v = _count / _unit; if (_spin) fputs(_spinr[_draws % 4], stderr); if (_line) fputs(_liner[_draws % 19], stderr); @@ -55,7 +55,7 @@ public: return(false); _count += increment; - if ((_count & _freq) == 0) { + if ((_count & _freq) == uint64ZERO) { double v = _count / _unit; if (_spin) fputs(_spinr[_draws % 4], stderr); if (_line) fputs(_liner[_draws % 19], stderr); diff --git a/ext/meryl/src/utility/src/utility/stddev.H b/ext/meryl/src/utility/src/utility/stddev.H index daa4fd4..59d36b5 100644 --- a/ext/meryl/src/utility/src/utility/stddev.H +++ b/ext/meryl/src/utility/src/utility/stddev.H @@ -26,6 +26,9 @@ #include #include +using namespace std; + + // Online mean and std.dev calculation. // B. P. Welford, Technometrics, Vol 4, No 3, Aug 1962 pp 419-420. @@ -60,19 +63,11 @@ public: _nn = n0; }; - void remove(TT val) { + void remove(double val) { uint32 n0 = _nn - 1; double m0 = (n0 == 0) ? (0) : ((_nn * _mn - val) / n0); double s0 = _sn - (val - m0) * (val - _mn); - if (n0 == 0) m0 = 0.0; // Reset mean and variance to zero when we can. - if (n0 <= 1) s0 = 0.0; // See tests/stddevTest.C testStability() for details. - - if (s0 < 0.0) // Assume negative values are due to stability problems, - s0 = 0.0; // and not mismatched insert() and delete() values. - if (-1e-10 <= m0 && m0 <= 1e-10) - m0 = 0.0; - if (_nn == 0) fprintf(stderr, "ERROR: stdDev has no data; can't remove() old value.\n"), exit(1); @@ -135,7 +130,7 @@ computeStdDev(TT *dist, uint64 distLen, double &mean, double &stddev, bool isSor // Sort the values. Lets us approximate the stddev for filtering out outliers. if (isSorted == false) - std::sort(dist, dist + distLen); + sort(dist, dist + distLen); // Approximate the stddev to filter out outliers. This is done by assuming we're normally // distributed, finding the values that would represent 1 standard deviation (about 68.27% of the @@ -182,7 +177,7 @@ computeStdDev(TT *dist, uint64 distLen, double &mean, double &stddev, bool isSor template void -computeStdDev(std::vector dist, double &mean, double &stddev, bool isSorted=false) { +computeStdDev(vector dist, double &mean, double &stddev, bool isSorted=false) { computeStdDev(dist.data(), dist.size(), mean, stddev, isSorted); } @@ -200,7 +195,7 @@ computeMode(TT *dist, uint64 distLen, TT &mode, bool isSorted=false) { return; if (isSorted == false) - std::sort(dist, dist + distLen); + sort(dist, dist + distLen); uint32 modeCnt = 0; TT modeVal = 0; @@ -232,7 +227,7 @@ computeMode(TT *dist, uint64 distLen, TT &mode, bool isSorted=false) { template void -computeMode(std::vector dist, TT &mode, bool isSorted=false) { +computeMode(vector dist, TT &mode, bool isSorted=false) { computeMode(dist.data(), dist.size(), mode, isSorted); } @@ -247,7 +242,7 @@ computeMedian(TT *dist, uint64 distLen, TT &median, bool isSorted=false) { return; if (isSorted == false) - std::sort(dist, dist + distLen); + sort(dist, dist + distLen); if (distLen % 2 == 0) median = (dist[distLen / 2 - 1] + dist[distLen / 2]) / 2; @@ -257,7 +252,7 @@ computeMedian(TT *dist, uint64 distLen, TT &median, bool isSorted=false) { template void -computeMedian(std::vector dist, TT &median, bool isSorted=false) { +computeMedian(vector dist, TT &median, bool isSorted=false) { computeMedian(dist.data(), dist.size(), median, isSorted); } @@ -274,11 +269,11 @@ computeMedianAbsoluteDeviation(TT *dist, uint64 distLen, TT &median, TT &mad, bo return; if (isSorted == false) - std::sort(dist, dist + distLen); + sort(dist, dist + distLen); computeMedian(dist, distLen, median, true); - std::vector m; + vector m; for (uint64 ii=0; ii void -computeMedianAbsoluteDeviation(std::vector dist, TT &median, TT &mad, bool isSorted=false) { +computeMedianAbsoluteDeviation(vector dist, TT &median, TT &mad, bool isSorted=false) { computeMedianAbsoluteDeviation(dist.data(), dist.size(), median, mad, isSorted); } @@ -334,7 +329,7 @@ public: void add(uint64 data, uint32 count=1) { while (_histogramAlloc <= data) - resizeArray(_histogram, _histogramMax+1, _histogramAlloc, _histogramAlloc * 2, _raAct::copyData | _raAct::clearNew); + resizeArray(_histogram, _histogramMax+1, _histogramAlloc, _histogramAlloc * 2, resizeArray_copyData | resizeArray_clearNew); if (_histogramMax < data) _histogramMax = data; @@ -352,6 +347,18 @@ public: uint64 median(void) { finalizeData(); return(_median); }; uint64 mad(void) { finalizeData(); return(_mad); }; +#if 0 + vector &histogram(void) { // Returns pointer to private histogram data + finalizeData(); + return(_histogram); + }; + + vector &Nstatistics(void) { // Returns pointer to private N data + finalizeData(); + return(_Nstatistics); + }; +#endif + void clearStatistics(void) { _numObjs = 0; diff --git a/ext/meryl/src/utility/src/utility/strings.C b/ext/meryl/src/utility/src/utility/strings.C index 1d5b8c3..4147ee9 100644 --- a/ext/meryl/src/utility/src/utility/strings.C +++ b/ext/meryl/src/utility/src/utility/strings.C @@ -20,165 +20,208 @@ #include "strings.H" #include "arrays.H" -//////////////////////////////////////////////////////////// -// -// Strip whitespace from the end of a line. -// -void -chomp(char *S) { - char *t = S; +uint64 +scaledNumber(uint64 n, uint32 div) { - while (*t != 0) - t++; + if (n > 9999) n /= div; + if (n > 9999) n /= div; + if (n > 9999) n /= div; + if (n > 9999) n /= div; + if (n > 9999) n /= div; + if (n > 9999) n /= div; + if (n > 9999) n /= div; + if (n > 9999) n /= div; + + return(n); +} - t--; - while ((t >= S) && (isWhiteSpace(*t) == true)) - *t-- = 0; +char +scaledUnit(uint64 n, uint32 div) { + char u = ' '; + + if (n > 9999) { n /= div; u = 'k'; } + if (n > 9999) { n /= div; u = 'M'; } + if (n > 9999) { n /= div; u = 'G'; } + if (n > 9999) { n /= div; u = 'T'; } + if (n > 9999) { n /= div; u = 'P'; } + if (n > 9999) { n /= div; u = 'E'; } + if (n > 9999) { n /= div; u = 'Z'; } + if (n > 9999) { n /= div; u = 'Y'; } + + return(u); } +const char * +scaledName(uint64 n, uint32 div) { + const char *s = ""; + + if (n > 9999) { n /= div; s = " thousand"; } + if (n > 9999) { n /= div; s = " million"; } + if (n > 9999) { n /= div; s = " billion"; } + if (n > 9999) { n /= div; s = " trillion"; } + if (n > 9999) { n /= div; s = " quadrillion"; } + if (n > 9999) { n /= div; s = " quintillion"; } + if (n > 9999) { n /= div; s = " sextillion"; } + if (n > 9999) { n /= div; s = " septillion"; } + + return(s); +} + -//////////////////////////////////////////////////////////// -// -// Convert a line into a key=value pair. -// bool -KeyAndValue::find(const char *line) { - char *ptr = nullptr; +decodeBoolean(char *value) { + bool ret = false; + + switch (value[0]) { + case '0': + case 'f': + case 'F': + case 'n': + case 'N': + ret = false; + break; + case '1': + case 't': + case 'T': + case 'y': + case 'Y': + ret = true; + break; + default: + fprintf(stderr, "decodeBoolean()-- unrecognized value '%s'\n", value); + break; + } - // Reset our state, but return fail if there is no line. + return(ret); +} - _key = nullptr; - _val = nullptr; - if (isEmptyString(line) == true) - return(false); - // Copy the string so we can do bad things to it. +// Returns true if a key and value are found. line is modified. +// Returns true, with value == NULL, if no delimiter is found. +// Returns false if the line is blank, or is a comment. +// +bool +KeyAndValue::find(char *line) { - duplicateArray(_line, _lineLen, _lineMax, line, (uint32)strlen(line) + 1); + key_ = NULL; + val_ = NULL; - // Zip ahead until the first non-space letter. - // - // If the letter is a comment or the delimiter, we're done; there is no key. + if (line == NULL) + return(false); - ptr = _line; + key_ = line; - while (isWhiteSpace(*ptr) == true) // Spaces before the key. - ptr++; + while (isspace(*key_) == true) // Spaces before the key + key_++; - if ((*ptr == 0) || - (isComment(*ptr) == true) || - (isDelimiter(*ptr) == true)) + if ((iscomment(*key_) == true) || // If we're at a comment right now, there is no key + (*key_ == 0)) { // and we return failure. + key_ = NULL; + val_ = NULL; return(false); + } - _key = ptr; + val_ = key_; // We're at the key now - // Keep zipping ahead until the end of the line. - // Detect the first comment mark that is preceeded by a space. - // Change it to NUL to terminate the string and return. - // - // Detect the key=value delimiter. - // Change it to a space so we can iterate over it. - // lastspace must be set before this is changed. + while ((*val_ != 0) && + (isdelimiter(*val_) == false)) // The key cannot contain a delimiter. + val_++; + + if (*val_ == 0) { // If at the end of the string, there isn't a + val_ = NULL; // value, but we'll return true and a key anyway. + return(true); + } - char *equals = nullptr; - char *eol = nullptr; - bool lastspace = false; + *val_++ = 0; - while (1) { - eol = ptr; + while (isdelimiter(*val_) == true) { // Spaces or delimiter after the key + *val_ = 0; + val_++; + } - if ((lastspace == true) && (isComment(*ptr) == true)) { - *ptr = 0; - break; - } + if (*val_ == 0) // And there is no value, must be a filename. + return(true); - lastspace = isWhiteSpace(*ptr); + char *eol = val_; // We're at the value now - if ((isDelimiter(*ptr) == true) && (equals == nullptr)) { - *ptr = ' '; - equals = ptr; - } + // If quoted, all we need to do is find the other quote and stop. + if ((*val_ == '"') || + (*val_ == '\'')) { + val_++; + eol++; - if (*ptr == 0) - break; + while (*eol != '"') // The value itself. + eol++; // The value CAN contain delimiters and comment markers. - ptr++; + *eol = 0; } - // If no delimiter, we're done. There cannot be a key/value pair. + // Otherwise, not quoted. Find the first comment marker (or eol) then backup to the first non-space. + else { + while (iscomment(*eol) == false) // The value MUST NOT contain delimiters or comment markers. + eol++; // But it can contains spaces and other nasties. - if (equals == nullptr) - return(false); + eol--; // Back up off the comment or eol. - // Cleanup 1: Find the last letter in the key make the key stop there. + while (isspace(*eol) == true) // And keep backing up as long as we're a space. + eol--; - while (isWhiteSpace(*equals) == true) - equals--; + eol++; // Move past the last non-space, non-comment - equals++; // Move from the last letter of the key. - *equals = 0; // Terminate the key string. - equals++; // Move to the next letter, either space or the value. + *eol = 0; // And terminate the value + } - // Cleanup 2: Find the first letter of the value. - // If we're at eol now, return true with an empty value string. + return(true); +} - while (isWhiteSpace(*equals) == true) - equals++; - _val = equals; - if (equals == eol) - return(true); +splitToWords::splitToWords(const char *string, splitType type) { + _wordsLen = 0; + _wordsMax = 0; + _words = NULL; - // Cleanup 3: Find the last letter of the value and make the value stop - // there. + _charsLen = 0; + _charsMax = 0; + _chars = NULL; - assert(*eol == 0); + if (string) + split(string, type); +} - eol--; - while (isWhiteSpace(*eol) == true) { - *eol = 0; - eol--; - } - return(true); +splitToWords::~splitToWords() { + delete [] _chars; + delete [] _words; } -//////////////////////////////////////////////////////////// -// -// Split the input 'line' into an array of words or path -// components. - void -splitToWords::split(const char *line, splitType type, char sep) { +splitToWords::split(const char *line, splitType type) { - // Initialize to no words and no characters. - // Then return if the input line is empty. + _wordsLen = 0; // Initialize to no words + _charsLen = 0; // and no characters. - _wordsLen = 0; - _charsLen = 0; - - if (isEmptyString(line) == true) + if (line == NULL) // Bail if there isn't a line to process. return; // Count the number of words and chars in the input line, then make // sure there is space for us to store them. while (line[_charsLen] != 0) - if (isSeparator(line[_charsLen++], type, sep)) + if (isSeparator(line[_charsLen++], type)) _wordsLen++; - resizeArray(_words, 0, _wordsMax, _wordsLen + 1); - resizeArray(_chars, 0, _charsMax, _charsLen + 1); + resizeArray(_words, 0, _wordsMax, _wordsLen + 1, resizeArray_doNothing); + resizeArray(_chars, 0, _charsMax, _charsLen + 1, resizeArray_doNothing); // Clear all the words pointers, and copy the input line to our storage. // This greatly simplifies the loop, as we don't need to worry about @@ -193,37 +236,17 @@ splitToWords::split(const char *line, splitType type, char sep) { _wordsLen = 0; for (uint32 st=1, ii=0; ii < _charsLen; ii++) { - if (isSeparator(line[ii], type, sep)) { // If the character is a word - _chars[ii] = 0; // separator, convert to NUL, - st = true; // and flag the next character - } // as the start of a new word. - - else if (st) { // Otherwise, if this is the - _words[_wordsLen++] = _chars + ii; // start of a word, make - st = false; // a new word. + if (isSeparator(line[ii], type)) { // If the character is a word + _chars[ii] = 0; // separator, convert to NUL, + st = true; // and flag the next character + } // as the start of a new word. + + else if (st) { // Otherwise, if this is the + _words[_wordsLen++] = _chars + ii; // start of a word, make + st = false; // a new word. } } } -void -splitToWords::clear(void) { - _wordsLen = 0; - _charsLen = 0; -} - - -void -splitToWords::erase(void) { - - delete [] _words; - delete [] _chars; - - _wordsLen = 0; - _wordsMax = 0; - _words = nullptr; - _charsLen = 0; - _charsMax = 0; - _chars = nullptr; -} diff --git a/ext/meryl/src/utility/src/utility/strings.H b/ext/meryl/src/utility/src/utility/strings.H index 1a98000..2b9a078 100644 --- a/ext/meryl/src/utility/src/utility/strings.H +++ b/ext/meryl/src/utility/src/utility/strings.H @@ -25,75 +25,144 @@ #include #include +using namespace std; -// Some string cleanup functions. -// -void chomp(char *S); // Remove whitespace from the end of a line. +// perl's chomp is pretty nice +// Not a great place to put this, but it's getting used all over. +#ifndef chomp +#define chomp(S) { char *t=(S); while (*t) t++; t--; while (t >= S && isspace(*t)) *t--=0; } +#endif +#ifndef munch +#define munch(S) { while (*(S) && isspace(*(S))) (S)++; } +#endif -// Basic string functions. +#ifndef crunch +#define crunch(S) { while (*(S) && !isspace(*(S))) (S)++; } +#endif + + + +// For pretty-printing numbers. Converts integers to, e.g., 123 k, 123 M, G, T, P. +uint64 scaledNumber(uint64 n, uint32 div=1024); +char scaledUnit (uint64 n, uint32 div=1024); +const char *scaledName (uint64 n, uint32 div=1024); + + + + + +template +char * +decodeRange(char *range, T &lo, T &hi) { + char *ap = range; + + strtonumber(lo, ap, &ap); // Grab the first number. + + hi = lo; // Set the second to that. + + if ((*ap == '-') || // If this is a range, + (*ap == '/')) { // or a one-of-many selection, + ap++; // grab the second number + strtonumber(hi, ap, &ap); + } + + if (*ap == ',') // If the next letter continues + return(ap + 1); // move past that and return. + + if (*ap == 0) // If the next letter is the end + return(NULL); // of the string, return NULL. + + // Otherwise, we can't decode this range. + + fprintf(stderr, "ERROR: invalid range '%s'\n", range); + exit(1); + + return(NULL); +} + + + +template +void +decodeRange(char *range, vector &bgn, vector &end) { + char *ap = range; + T av = 0; + T bv = 0; + + while ((ap != NULL) && (*ap != 0)) { + ap = decodeRange(ap, av, bv); + + bgn.push_back(av); + end.push_back(bv); + } +} + + + +template +void +decodeRange(char *range, set &ranges) { + char *ap = range; + T av = 0; + T bv = 0; + + while ((ap != NULL) && (*ap != 0)) { + ap = decodeRange(ap, av, bv); + + for (T xx=av; xx<=bv; xx++) + ranges.insert(xx); + } +} + + + +// Decodes a string with 0/1, false/true, no/yes into an integer flag. +bool decodeBoolean(char *value); -inline -bool -isEmptyString(char const *s) { return((s == nullptr) || (s[0] == 0)); } -// Convert a line into a key-value pair. -// -// The line should be of the form: -// - 'key' find() returns true with value == nullptr -// - 'key = value' find() returns true -// - 'key : value' find() returns true -// -// In all cases the line is modified by removing whitespace and inserting -// NUL characters at the end of the key and value. -// -// Comments are accepted with either '!' or '#' at the start -// of the line, or preceeded by a white-space letter. class KeyAndValue { public: - KeyAndValue(const char *line = nullptr) { find(line); }; - ~KeyAndValue() { delete [] _line; }; + KeyAndValue(char *line = NULL) { find(line); }; + ~KeyAndValue() { }; -public: - bool find(const char *line); + bool find(char *line); -public: - char *key(void) { return(_key); }; - char *value(void) { return(_val); }; + char *key(void) { return(key_); }; + char *value(void) { return(val_); }; -private: - uint32 _lineMax = 0; - uint32 _lineLen = 0; - char *_line = nullptr; + bool value_bool(void) { return(decodeBoolean(val_)); }; + + int32 value_int32(void) { return(strtol (val_, NULL, 10)); }; + int64 value_int64(void) { return(strtoll(val_, NULL, 10)); }; - char *_key = nullptr; - char *_val = nullptr; + uint32 value_uint32(void) { return(strtoul (val_, NULL, 10)); }; + uint64 value_uint64(void) { return(strtoull(val_, NULL, 10)); }; + + float value_float(void) { return(strtof(val_, NULL)); }; + double value_double(void) { return(strtod(val_, NULL)); }; + +public: + bool iscomment(char c) { return((c == '!') || (c == '#') || (c == 0)); }; + bool isdelimiter(char c) { return((c == ':') || (c == '=') || isspace(c)); }; + + char *key_; + char *val_; }; -// Split the input 'line' into an array of words or path -// components. + enum splitType { splitWords = 0, - splitPaths = 1, - splitLetter = 2 + splitPaths = 1 }; class splitToWords { public: - splitToWords(const char *string=nullptr, splitType type=splitWords, char sep=0) { - split(string, type, sep); - }; - - ~splitToWords() { - erase(); - }; - - void split(const char *line, splitType type=splitWords, char sep=0); - void clear(void); // Remove the words, but leave memory intact. - void erase(void); // Remove words and free memory. + splitToWords(const char *string=NULL, splitType type=splitWords); + ~splitToWords(); private: bool isPath(char c) { @@ -107,30 +176,32 @@ private: (c == '\r')); }; - bool isSeparator(char c, splitType type, char sep) { - return(((type == splitWords) && (isSpace(c))) || - ((type == splitPaths) && (isPath (c))) || - ((type == splitLetter) && (sep == c))); + bool isSeparator(char c, splitType type) { + return(((type == splitWords) && (isSpace(c))) || + ((type == splitPaths) && (isPath(c)))); }; public: + void split(const char *line, splitType type=splitWords); + uint32 numWords(void) { return(_wordsLen); }; char *operator[](uint32 i) { return(first(i)); }; - char *first(uint32 i=0) { return((_wordsLen <= i) ? nullptr : _words[i]); }; - char *last(uint32 i=0) { return((_wordsLen == 0) ? nullptr : _words[_wordsLen - i - 1]); }; - char *pop(void) { return((_wordsLen == 0) ? nullptr : _words[--_wordsLen]); }; + char *first(uint32 i=0) { return((_wordsLen <= i) ? NULL : _words[i]); }; char *shift(void) { if (_wordsLen == 0) // If no words, nothing to return. - return(nullptr); + return(NULL); for (uint32 ii=1; ii<_wordsLen; ii++) // Shift all words down one place, moving - std::swap(_words[ii-1], _words[ii]); // the word to shift off to the end. + swap(_words[ii-1], _words[ii]); // the word to shift off to the end. return(_words[--_wordsLen]); // Return the word we shifted out. }; + char *last(uint32 i=0) { return((_wordsLen == 0) ? NULL : _words[_wordsLen - i - 1]); }; + char *pop(void) { return((_wordsLen == 0) ? NULL : _words[--_wordsLen]); }; + int32 toint32(uint32 i) { return(strtoint32 (_words[i])); }; uint32 touint32(uint32 i) { return(strtouint32(_words[i])); }; int64 toint64(uint32 i) { return(strtoint64 (_words[i])); }; @@ -138,13 +209,13 @@ public: double todouble(uint32 i) { return(strtodouble(_words[i])); }; private: - uint32 _wordsLen = 0; // An array of pointers into _chars - uint32 _wordsMax = 0; // for the words in the string. - char **_words = nullptr; + uint32 _wordsLen; + uint32 _wordsMax; + char **_words; - uint32 _charsLen = 0; // A modified copy of the - uint32 _charsMax = 0; // input string. - char *_chars = nullptr; + uint32 _charsLen; + uint32 _charsMax; + char *_chars; }; diff --git a/ext/meryl/src/utility/src/utility/sweatShop.C b/ext/meryl/src/utility/src/utility/sweatShop.C index eb1055e..1169493 100644 --- a/ext/meryl/src/utility/src/utility/sweatShop.C +++ b/ext/meryl/src/utility/src/utility/sweatShop.C @@ -48,17 +48,15 @@ public: class sweatShopState { public: sweatShopState(void *userData) { - _user = userData; - _computed = false; - _outputted = false; - _next = 0L; + _user = userData; + _computed = false; + _next = 0L; }; ~sweatShopState() { }; void *_user; bool _computed; - bool _outputted; sweatShopState *_next; }; @@ -107,7 +105,6 @@ sweatShop::sweatShop(void*(*loaderfcn)(void *G), _loaderP = 0L; _showStatus = false; - _writeInOrder = true; _loaderQueueSize = 1024; _loaderQueueMax = 10240; @@ -149,7 +146,7 @@ sweatShop::setThreadData(uint32 t, void *x) { // Build a list of states to add in one swoop // void -sweatShop::loaderAddToLocal(sweatShopState *&tail, sweatShopState *&head, sweatShopState *thisState) { +sweatShop::loaderSave(sweatShopState *&tail, sweatShopState *&head, sweatShopState *thisState) { thisState->_next = 0L; @@ -159,13 +156,14 @@ sweatShop::loaderAddToLocal(sweatShopState *&tail, sweatShopState *&head, sweatS } else { tail = head = thisState; } + _numberLoaded++; } // Add a bunch of new states to the queue. // void -sweatShop::loaderAppendToGlobal(sweatShopState *&tail, sweatShopState *&head, uint32 num) { +sweatShop::loaderAppend(sweatShopState *&tail, sweatShopState *&head) { int err; if ((tail == 0L) || (head == 0L)) @@ -184,8 +182,6 @@ sweatShop::loaderAppendToGlobal(sweatShopState *&tail, sweatShopState *&head, ui } _loaderP = head; - _numberLoaded += num; - err = pthread_mutex_unlock(&_stateMutex); if (err != 0) fprintf(stderr, "sweatShop::loaderAppend()-- Failed to unlock mutex (%d). Fail.\n", err), exit(1); @@ -198,52 +194,56 @@ sweatShop::loaderAppendToGlobal(sweatShopState *&tail, sweatShopState *&head, ui void* sweatShop::loader(void) { - struct timespec naptime; - sweatShopState *tail = nullptr; // A local list, to reduce the number of times we - sweatShopState *head = nullptr; // lock the global list. - uint32 numLoaded = 0; + struct timespec naptime; naptime.tv_sec = 0; naptime.tv_nsec = 166666666ULL; // 1/6 second - while (1) { - void *object = NULL; + // We can batch several loads together before we push them onto the + // queue, this should reduce the number of times the loader needs to + // lock the queue. + // + // But it also increases the latency, so it's disabled by default. + // + sweatShopState *tail = 0L; // The first thing loaded + sweatShopState *head = 0L; // The last thing loaded + uint32 numLoaded = 0; + + bool moreToLoad = true; + + while (moreToLoad) { - while (_numberLoaded > _numberComputed + _loaderQueueSize) // Sleep if the queue is too big. + // Zzzzzzz.... + while (_numberLoaded > _numberComputed + _loaderQueueSize) nanosleep(&naptime, 0L); - // If a userLoader function exists, use it to load the data object, then - // make a new state for that object. + void *object = NULL; if (_userLoader) object = (*_userLoader)(_globalUserData); sweatShopState *thisState = new sweatShopState(object); - // If there is no user pointer, we've run out of inputs. - // Push on the empty state to the local list, force an append - // to the global list, and exit this loader function. - - if (thisState->_user == nullptr) { - loaderAddToLocal(tail, head, thisState); - loaderAppendToGlobal(tail, head, numLoaded + 1); - - return(nullptr); - } - - // Otherwise, we've loaded a user object. Push it onto the local list, - // then merge into the global list if the local list is long enough. - - loaderAddToLocal(tail, head, thisState); - numLoaded++; - - if (numLoaded >= _loaderBatchSize) { - loaderAppendToGlobal(tail, head, numLoaded); - numLoaded = 0; + // If we actually loaded a new state, add it + // + if (thisState->_user) { + loaderSave(tail, head, thisState); + numLoaded++; + if (numLoaded >= _loaderBatchSize) + loaderAppend(tail, head); + } else { + // Didn't read, must be all done! Push on the end-of-input marker state. + // + loaderSave(tail, head, new sweatShopState(0L)); + loaderAppend(tail, head); + + moreToLoad = false; + delete thisState; } } - return(nullptr); // Never returns. + //fprintf(stderr, "sweatShop::reader exits.\n"); + return(0L); } @@ -286,7 +286,7 @@ sweatShop::worker(sweatShopWorker *workerData) { err = pthread_mutex_unlock(&_stateMutex); if (err != 0) - fprintf(stderr, "sweatShop::worker()-- Failed to lock mutex (%d). Fail.\n", err), exit(1); + fprintf(stderr, "sweatShop::worler()-- Failed to lock mutex (%d). Fail.\n", err), exit(1); if (workerData->workerQueueLen == 0) { @@ -324,68 +324,45 @@ sweatShop::worker(sweatShopWorker *workerData) { } -void -sweatShop::writerWrite(sweatShopState *w) { - - if (_userWriter) - (*_userWriter)(_globalUserData, w->_user); - _numberOutput++; - - w->_outputted = true; -} - - void* sweatShop::writer(void) { sweatShopState *deleteState = 0L; - struct timespec naptime1 = { .tv_sec = 0, .tv_nsec = 5000000ULL }; - struct timespec naptime2 = { .tv_sec = 0, .tv_nsec = 5000000ULL }; - - - while ((_writerP != nullptr) && - (_writerP->_user != nullptr)) { - - // If a complete result, write it. - if ((_writerP->_computed == true) && - (_writerP->_outputted == false)) { - writerWrite(_writerP); - continue; - } - // If we can write output out-of-order, search ahead - // for any results and output them. - // if (_outOfOrder == true) - if (_writeInOrder == false) { - for (sweatShopState *ss = _writerP; ss != nullptr; ss = ss->_next) - if ((ss->_computed == true) && - (ss->_outputted == false)) { - writerWrite(ss); - } - } + // Wait for output to appear, then write. + // + while (_writerP && _writerP->_user) { - // If no next, wait for input to appear. We can't purge this node - // from the list until there is a next, else we lose the list! - if (_writerP->_next == nullptr) { - nanosleep(&naptime1, 0L); - continue; - } + if (_writerP->_computed == false) { + // Wait for a slow computation. + struct timespec naptime; + naptime.tv_sec = 0; + naptime.tv_nsec = 5000000ULL; - // If already output, remove the node. - if (_writerP->_outputted == true) { - sweatShopState *ds = _writerP; - _writerP = _writerP->_next; + //fprintf(stderr, "Writer waits for slow thread at " F_U64 ".\n", _numberOutput); + nanosleep(&naptime, 0L); + } else if (_writerP->_next == 0L) { + // Wait for the input. + struct timespec naptime; + naptime.tv_sec = 0; + naptime.tv_nsec = 5000000ULL; - delete ds; - continue; + //fprintf(stderr, "Writer waits for all threads at " F_U64 ".\n", _numberOutput); + nanosleep(&naptime, 0L); + } else { + if (_userWriter) + (*_userWriter)(_globalUserData, _writerP->_user); + _numberOutput++; + + deleteState = _writerP; + _writerP = _writerP->_next; + delete deleteState; } - - // Otherwise, we need to wait for a state to appear on the queue. - nanosleep(&naptime2, 0L); } // Tell status to stop. _writerP = 0L; + //fprintf(stderr, "sweatShop::writer exits.\n"); return(0L); } diff --git a/ext/meryl/src/utility/src/utility/sweatShop.H b/ext/meryl/src/utility/src/utility/sweatShop.H index 223dedf..ae62583 100644 --- a/ext/meryl/src/utility/src/utility/sweatShop.H +++ b/ext/meryl/src/utility/src/utility/sweatShop.H @@ -49,8 +49,6 @@ public: void setWriterQueueSize(uint32 queueSize) { _writerQueueSize = queueSize; _writerQueueMax = queueSize; }; - void setInOrderOutput(bool o) { _writeInOrder = o; }; - void run(void *user=0L, bool beVerbose=false); private: @@ -67,11 +65,9 @@ private: void *status(void); // Utilities for the loader thread - void loaderAddToLocal(sweatShopState *&tail, sweatShopState *&head, sweatShopState *thisState); - void loaderAppendToGlobal(sweatShopState *&tail, sweatShopState *&head, uint32 num); - - // Utilities for the writer thread - void writerWrite(sweatShopState *w); + //void loaderAdd(sweatShopState *thisState); + void loaderSave(sweatShopState *&tail, sweatShopState *&head, sweatShopState *thisState); + void loaderAppend(sweatShopState *&tail, sweatShopState *&head); pthread_mutex_t _stateMutex; @@ -86,7 +82,6 @@ private: sweatShopState *_loaderP; // Where input is put, the head bool _showStatus; - bool _writeInOrder; uint32 _loaderQueueSize, _loaderQueueMin, _loaderQueueMax; uint32 _loaderBatchSize; diff --git a/ext/meryl/src/utility/src/utility/system-stackTrace.C b/ext/meryl/src/utility/src/utility/system-stackTrace.C index 456ec7d..d391538 100644 --- a/ext/meryl/src/utility/src/utility/system-stackTrace.C +++ b/ext/meryl/src/utility/src/utility/system-stackTrace.C @@ -161,6 +161,10 @@ AS_UTL_catchCrash(int sig_num, siginfo_t *UNUSED(info), void *UNUSED(ctx)) { #include "backward.hpp" +//namespace backward { +//backward::SignalHandling sh; +//} // namespace backward + void AS_UTL_catchCrash(int sig_num, siginfo_t *UNUSED(info), void *UNUSED(ctx)) { @@ -212,6 +216,17 @@ AS_UTL_catchCrash(int sig_num, siginfo_t *UNUSED(info), void *UNUSED(ctx)) { } +#if 0 + backward::Printer p; + + p.snippet = true; + p.object = true; + p.color = false; + p.address = true; + + p.print(st); +#endif + // Pass the signal through, only so a core file can get generated. struct sigaction sa; diff --git a/ext/meryl/src/utility/src/utility/system.C b/ext/meryl/src/utility/src/utility/system.C index 38886ff..faab801 100644 --- a/ext/meryl/src/utility/src/utility/system.C +++ b/ext/meryl/src/utility/src/utility/system.C @@ -33,6 +33,10 @@ #include "jemalloc/jemalloc.h" #endif +#if !defined(__CYGWIN__) && !defined(_WIN32) +#include +#endif + double @@ -99,7 +103,7 @@ getProcessTime(void) { double tm = 0; if (gettimeofday(&tp, NULL) == 0) - tm = tp.tv_sec + tp.tv_usec / 1000000.0; + tm = tp.tv_sec + tp.tv_usec / 100000.0; if (st == 0.0) st = tm; @@ -129,7 +133,7 @@ getProcessSize(void) { uint64 getProcessSizeLimit(void) { struct rlimit rl; - uint64 sz = uint64max; + uint64 sz = ~uint64ZERO; if (getrlimit(rl) == true) sz = rl.rlim_cur; @@ -162,145 +166,56 @@ getBytesAllocated(void) { -uint64 -getPhysicalMemorySize(void) { - uint64 physPages = sysconf(_SC_PHYS_PAGES); - uint64 pageSize = sysconf(_SC_PAGESIZE); - uint64 physMemory = physPages * pageSize; - - return(physMemory); -} - +#ifdef HW_PHYSMEM +// MacOS, FreeBSD -// Return the size of a page of memory. Every OS we care about (MacOS, -// FreeBSD, Linux) claims to have getpagesize(). -// uint64 -getPageSize(void) { - return(getpagesize()); -} - - +getPhysicalMemorySize(void) { + uint64 physMemory = 0; -// Query the machine or the environment to find any memory size limit. If -// there is no environment limit, the physical memory size is returned. -// -// Slurm variables (from sbatch man page). -// SLURM_MEM_PER_CPU -// Set if --mem-per-cpu is supplied to sbatch. -// "SLURM_MEM_PER_CPU=2048" for a request of --mem-per-cpu=2g -// -// SLURM_MEM_PER_NODE -// Set if --mem is supplied to sbatch. -// "SLURM_MEM_PER_NODE=5120" for a request of --mem=5g -// -// SLURM_MEM_PER_GPU -// Requested memory per allocated GPU. -// Only set if the --mem-per-gpu option is specified. -// Not checked for below. -// -// There doesn't appear to be a comparable environment variable for SGE. -// -// PBS/OpenPBS/PBS Pro variables. -// PBS_RESC_MEM -// TORQUE_RESC_MEM (probably obsolete) -// Potentially memory in bytes. -// -// -uint64 -getMaxMemoryAllowed(void) { - char *env; - uint64 maxmem = getPhysicalMemorySize(); + int mib[2] = { CTL_HW, HW_PHYSMEM }; + size_t len = sizeof(uint64); - env = getenv("SLURM_MEM_PER_CPU"); - if (env) - maxmem = getMaxThreadsAllowed() * strtouint64(env) * 1024 * 1024; + errno = 0; - env = getenv("SLURM_MEM_PER_NODE"); - if (env) - maxmem = strtouint64(env) * 1024 * 1024; + if (sysctl(mib, 2, &physMemory, &len, NULL, 0) != 0) + fprintf(stderr, "getPhysicalMemorySize()-- sysctl() failed to return CTL_HW, HW_PHYSMEM: %s\n", strerror(errno)), exit(1); - env = getenv("PBS_RESC_MEM"); - if (env) - maxmem = strtouint64(env); + if (len != sizeof(uint64)) { +#ifdef HW_MEMSIZE + mib[1] = HW_MEMSIZE; + len = sizeof(uint64); + if (sysctl(mib, 2, &physMemory, &len, NULL, 0) != 0 || len != sizeof(uint64)) +#endif + fprintf(stderr, "getPhysicalMemorySize()-- sysctl() failed to return CTL_HW, HW_PHYSMEM: %s\n", strerror(errno)), exit(1); + } - return(maxmem); + return(physMemory); } +#else +// Linux, FreeBSD -// There is a bit of a race condition in here. On our grid, at least, a -// multi-cpu interactive job sets both SLURM_JOB_CPUS_PER_NODE and -// OMP_NUM_THREADS - but sets the former to the correct value and the -// latter to one. -// -// Because of this, we let the grid variables overwrite the OpenMP -// variable, and further reset OpenMP to use whatever the grid has -// told us to use. -// -// OpenMP variables. -// OMP_NUM_THREADS -// - we don't query this, and instead use omp_get_max_threads(), -// because if OMP_NUM_THREADS isn't set, the function will -// return the number of CPUs on the host. -// -// Slurm variables (from sbatch man page). -// SLURM_CPUS_ON_NODE -// - Number of CPUS on the allocated node. -// -// SLURM_JOB_CPUS_PER_NODE -// - --cpus-per-node -// - Count of processors available to the job on this node. Note the -// select/linear plugin allocates entire nodes to jobs, so the value -// indicates the total count of CPUs on the node. The select/cons_res -// plugin allocates individual processors to jobs, so this number -// indicates the number of processors on this node allocated to the -// job. -// -// SLURM_JOB_NUM_NODES -// - total number of nodes in the job's resource allocation -// -// PBS/OpenPBS/PBS Pro variables (from Torque 9.0.3). -// PBS_NUM_NODES - Number of nodes allocated to the job -// PBS_NUM_PPN - Number of procs per node allocated to the job -// PBS_NCPUS - (older version of PBS_NUM_PPN?) -// PBS_NP - Number of execution slots (cores) for the job -// TORQUE_RESC_PROC - (can't find any doc on this) -// -// SGE variables. -// NSLOTS -// -uint32 -getMaxThreadsAllowed(void) { - char *env; - uint32 nAllowed = omp_get_max_threads(); - - env = getenv("SLURM_JOB_CPUS_PER_NODE"); - if (env) - nAllowed = strtouint32(env); - - env = getenv("PBS_NCPUS"); - if (env) - nAllowed = strtouint32(env); - - env = getenv("PBS_NUM_PPN"); - if (env) - nAllowed = strtouint32(env); +uint64 +getPhysicalMemorySize(void) { + uint64 physPages = sysconf(_SC_PHYS_PAGES); + uint64 pageSize = sysconf(_SC_PAGESIZE); + uint64 physMemory = physPages * pageSize; - env = getenv("NSLOTS"); - if (env) - nAllowed = strtouint32(env); + return(physMemory); +} - omp_set_num_threads(nAllowed); +#endif - return(nAllowed); -} -uint32 -getNumThreadsActive(void) { - return(omp_get_num_threads()); +// Return the size of a page of memory. Every OS we care about (MacOS, FreeBSD, Linux) +// claims to have getpagesize(). +// +uint64 +getPageSize(void) { + return(getpagesize()); } - diff --git a/ext/meryl/src/utility/src/utility/system.H b/ext/meryl/src/utility/src/utility/system.H index 0df676c..088541e 100644 --- a/ext/meryl/src/utility/src/utility/system.H +++ b/ext/meryl/src/utility/src/utility/system.H @@ -40,11 +40,6 @@ uint64 getPhysicalMemorySize(void); uint64 getPageSize(void); -uint64 getMaxMemoryAllowed(void); - -uint32 getMaxThreadsAllowed(void); -uint32 getNumThreadsActive(void); - void AS_UTL_catchCrash(int sig_num, siginfo_t *info, void *ctx); diff --git a/ext/meryl/src/utility/src/utility/types.C b/ext/meryl/src/utility/src/utility/types.C index ff8c750..586e2b2 100644 --- a/ext/meryl/src/utility/src/utility/types.C +++ b/ext/meryl/src/utility/src/utility/types.C @@ -18,516 +18,82 @@ */ #include "types.H" -#include "strings.H" - -//////////////////////////////////////////////////////////// -// -// Sadly, there is no equivalent of strtoull() for 128-bit integers, so I -// provide my own. Only base 10 is supported. Overflow isn't handled. -// -// The obvious implementation of strtollll() -- that being to sum all the -// digits and then negate the sum for negative values -- doesn't handle -// int128min technically correct. It ends up overflowing the (positive) -// int128 by one. This results in int128min. Fortunately, int128min == -// -int128min, and the negation done by 'neg' doesn't do anything. As -// implemented below, though, we instead add or subtract each new digit, -// giving us overflow-free results (but a little slower). -// - -uint128 -strtoullll(char const *nptr, char **endptr) { - uint128 res = 0; - char const *ptr = nptr; - - if (isEmptyString(ptr)) - return(res); - - while ((*ptr != 0) && (isWhiteSpace(*ptr) == true)) - ptr++; - - while ((*ptr != 0) && (isDecDigit(*ptr) == true)) { - res *= 10; - res += asciiDecToInteger(*ptr); - - ptr++; - } - - if (endptr) - *endptr = (char *)ptr; - return(res); -} - -int128 -strtollll(char const *nptr, char **endptr) { - int128 res = 0; - bool neg = false; - char const *ptr = nptr; - - if (isEmptyString(ptr)) - return(res); - - while ((*ptr != 0) && (isWhiteSpace(*ptr) == true)) - ptr++; - - switch (*ptr) { - case '-': ptr++; neg = true; break; - case '+': ptr++; break; - default: break; - } - - while ((*ptr != 0) && (isDecDigit(*ptr) == true)) { - res *= 10; - - if (neg == false) - res += asciiDecToInteger(*ptr); - else - res -= asciiDecToInteger(*ptr); - - ptr++; - } - - if (endptr) - *endptr = (char *)ptr; - return(res); -} - - - -//////////////////////////////////////////////////////////// -// -// Test if a string is a number in the desired encoding. +// In hex, a 128-bit integer needs 32 digits. +// In dec, a 128-bit integer needs 39 digits. (it is 340,282,366,920,938,463,463,374,607,431,768,211,456) // +// We'll just allocate 64 digits and be done with it. Until we want to +// print that 128-bit integer as binary. (that we overallocate space makes +// conversion to decimal a little bit easier) -bool -isBinNumber(char const *s) { - if (isEmptyString(s) == true) - return(false); - - for (uint32 ii=0; s[ii] != 0; ii++) - if (isBinDigit(s[ii]) == false) - return(false); - - return(true); -} - - -bool -isOctNumber(char const *s) { - if (isEmptyString(s) == true) - return(false); - - for (uint32 ii=0; s[ii] != 0; ii++) - if (isOctDigit(s[ii]) == false) - return(false); - - return(true); -} - - -bool -isDecNumber(char const *s, char dot) { - if (isEmptyString(s) == true) - return(false); - - for (uint32 ii=0; s[ii] != 0; ii++) - if ((isDecDigit(s[ii]) == false) && (s[ii] != dot)) - return(false); - - return(true); -} - - -bool -isHexNumber(char const *s) { - if (isEmptyString(s) == true) - return(false); - - for (uint32 ii=0; s[ii] != 0; ii++) - if (isHexDigit(s[ii]) == false) - return(false); - - return(true); -} - - - -//////////////////////////////////////////////////////////// -// -// Convert a string of numbers to a pair of numbers, a vector of ranges, or -// a set of values. -// - -template -char const * -decodeRange(char const *range, numberType &bgn, numberType &end) { - char const *ap = range; - - ap = strtonumber(ap, bgn); // Grab the first number. - - end = bgn; // Set the second to that. - - if ((*ap == '-') || // If this is a range, - (*ap == '/')) // or a one-of-many selection, - ap = strtonumber(ap+1, end); // grab the second number - - if (*ap == ',') // If the next letter continues - return(ap + 1); // move past that and return. - - if (*ap == 0) // If the next letter is the end - return(nullptr); // of the string, return nullptr. - - // Otherwise, we can't decode this range. - - fprintf(stderr, "ERROR: invalid range '%s'\n", range); - exit(1); - - return(nullptr); -} - - -template -void -decodeRange(char const *range, std::vector &bgn, std::vector &end) { - char const *ap = range; - numberType av = 0; - numberType bv = 0; - - while (isEmptyString(ap) == false) { - ap = decodeRange(ap, av, bv); - - bgn.push_back(av); - end.push_back(bv); - } -} - - -template -void -decodeRange(char const *range, std::set &values) { - char const *ap = range; - numberType av = 0; - numberType bv = 0; - - while (isEmptyString(ap) == false) { - ap = decodeRange(ap, av, bv); - - for (numberType xx=av; xx<=bv; xx++) - values.insert(xx); - } -} - - -template char const *decodeRange(char const *range, uint128 &bgn, uint128 &end); -template char const *decodeRange (char const *range, int128 &bgn, int128 &end); -template char const *decodeRange (char const *range, uint64 &bgn, uint64 &end); -template char const *decodeRange (char const *range, int64 &bgn, int64 &end); -template char const *decodeRange (char const *range, uint32 &bgn, uint32 &end); -template char const *decodeRange (char const *range, int32 &bgn, int32 &end); -template char const *decodeRange (char const *range, uint16 &bgn, uint16 &end); -template char const *decodeRange (char const *range, int16 &bgn, int16 &end); -template char const *decodeRange (char const *range, uint8 &bgn, uint8 &end); -template char const *decodeRange (char const *range, int8 &bgn, int8 &end); -template char const *decodeRange (char const *range, double &bgn, double &end); - -template void decodeRange(char const *range, std::vector &bgn, std::vector &end); -template void decodeRange (char const *range, std::vector &bgn, std::vector &end); -template void decodeRange (char const *range, std::vector &bgn, std::vector &end); -template void decodeRange (char const *range, std::vector &bgn, std::vector &end); -template void decodeRange (char const *range, std::vector &bgn, std::vector &end); -template void decodeRange (char const *range, std::vector &bgn, std::vector &end); -template void decodeRange (char const *range, std::vector &bgn, std::vector &end); -template void decodeRange (char const *range, std::vector &bgn, std::vector &end); -template void decodeRange (char const *range, std::vector &bgn, std::vector &end); -template void decodeRange (char const *range, std::vector &bgn, std::vector &end); -template void decodeRange (char const *range, std::vector &bgn, std::vector &end); - -template void decodeRange(char const *range, std::set &values); -template void decodeRange (char const *range, std::set &values); -template void decodeRange (char const *range, std::set &values); -template void decodeRange (char const *range, std::set &values); -template void decodeRange (char const *range, std::set &values); -template void decodeRange (char const *range, std::set &values); -template void decodeRange (char const *range, std::set &values); -template void decodeRange (char const *range, std::set &values); -template void decodeRange (char const *range, std::set &values); -template void decodeRange (char const *range, std::set &values); -template void decodeRange (char const *range, std::set &values); - - - -//////////////////////////////////////////////////////////// -// -// Convert an unsigned integer to one with 3 significant digit number, and -// also return the correct SI base. -// -// This does NOT round correctly. We'd need to track the remainder -// and increment 'n' if the remainder is more than half 'div'. -// - -uint64 -scaledNumber(uint64 n, uint32 div) { - - if (n > 9999) n /= div; - if (n > 9999) n /= div; - if (n > 9999) n /= div; - if (n > 9999) n /= div; - if (n > 9999) n /= div; - if (n > 9999) n /= div; - if (n > 9999) n /= div; - if (n > 9999) n /= div; - - return(n); -} - -char -scaledUnit(uint64 n, uint32 div) { - char u = ' '; - - if (n > 9999) { n /= div; u = 'k'; } // kilo - if (n > 9999) { n /= div; u = 'M'; } // mega - if (n > 9999) { n /= div; u = 'G'; } // giga - if (n > 9999) { n /= div; u = 'T'; } // tera - if (n > 9999) { n /= div; u = 'P'; } // peta - if (n > 9999) { n /= div; u = 'E'; } // exa - if (n > 9999) { n /= div; u = 'Z'; } // zetta - if (n > 9999) { n /= div; u = 'Y'; } // yotta - - return(u); -} - -const char * -scaledName(uint64 n, uint32 div) { - const char *s = ""; - - if (n > 9999) { n /= div; s = " thousand"; } - if (n > 9999) { n /= div; s = " million"; } - if (n > 9999) { n /= div; s = " billion"; } - if (n > 9999) { n /= div; s = " trillion"; } - if (n > 9999) { n /= div; s = " quadrillion"; } - if (n > 9999) { n /= div; s = " quintillion"; } - if (n > 9999) { n /= div; s = " sextillion"; } - if (n > 9999) { n /= div; s = " septillion"; } - - return(s); -} - - - -//////////////////////////////////////////////////////////// -// -// Convert an unsigned integer to a character string in the desired base. -// -// All follow the same pattern except for the constants, and except for -// toDec() which also differs in the 'shift' operation. -// -// The helper function getNextString() is the only part that needs -// to worry about thread safety. Everything else operates on that -// returned buffer space. -// -// Instead of allocating 32 strings of max length, we could allocate 4096 -// bytes and dole pieces out of the appropriate max length as needed. -// - -char alpha[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; -char *strAlloc = nullptr; -char *str[32] = { nullptr }; -uint32 pos = 0; - -static -char * // Helper function to return the next available buffer. -getNextString(void) { // This is the only part that needs to care about threads. - char *ret = nullptr; // Everything else operates on the buffer returned. - -#pragma omp critical (toHEXlock) - { - if (strAlloc == nullptr) { - strAlloc = new char [32 * 129]; - - for (uint32 ii=0; ii<32; ii++) - str[ii] = strAlloc + ii * 129; - } - - ret = str[pos++]; - - if (pos >= 32) - pos = 0; - } - - return(ret); -} +char dec[10] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' }; +char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; +char str[16][64] = { 0 }; +uint32 pos = 0; template -char * -toBin(uintType v, char *ret, uint32 w) { - uint32 L = w; - uint32 W = sizeof(uintType) * 8; - uint32 l = std::min(L, W); - uint32 p = l; +char const * +toHex(uintType v) { + char *ret = str[pos++]; + uint32 w = sizeof(uintType) * 2; + uint32 p = w; uint32 s = 0; + if (pos >= 16) + pos = 0; + while (p > 0) { p -= 1; - ret[p] = alpha[ (v >> s) & 0x1 ]; - s += 1; + ret[p] = hex[ (v >> s) & 0xf ]; + s += 4; } ret[w] = 0; - return(ret + w); -} - -template -char const * -toBin(uintType v, uint32 w) { - char *ret = getNextString(); - toBin(v, ret, w); return(ret); } -template char *toBin(uint128 v, char *out, uint32 width); -template char *toBin (uint64 v, char *out, uint32 width); -template char *toBin (uint32 v, char *out, uint32 width); -template char *toBin (uint16 v, char *out, uint32 width); -template char *toBin (uint8 v, char *out, uint32 width); - -template char const *toBin(uint128 v, uint32 width); -template char const *toBin (uint64 v, uint32 width); -template char const *toBin (uint32 v, uint32 width); -template char const *toBin (uint16 v, uint32 width); -template char const *toBin (uint8 v, uint32 width); - - +template char const *toHex(uint128 v); +template char const *toHex (uint64 v); +template char const *toHex (uint32 v); +template char const *toHex (uint16 v); -template -char * -toOct(uintType v, char *ret, uint32 w) { - uint32 L = (w + 2) / 3; - uint32 W = sizeof(uintType) * 8 / 3 + 1; - uint32 l = std::min(L, W); - uint32 p = l; - uint32 s = 0; - - while (p > 0) { - p -= 1; - ret[p] = alpha[ (v >> s) & 0x7 ]; - s += 3; - } - ret[l] = 0; - - return(ret + l); -} template char const * -toOct(uintType v, uint32 w) { - char *ret = getNextString(); - toOct(v, ret, w); - return(ret); -} - -template char *toOct(uint128 v, char *out, uint32 width); -template char *toOct (uint64 v, char *out, uint32 width); -template char *toOct (uint32 v, char *out, uint32 width); -template char *toOct (uint16 v, char *out, uint32 width); -template char *toOct (uint8 v, char *out, uint32 width); +toDec(uintType v) { + char *ret = str[pos++]; + uint32 p = 64; + uint32 x = 0; -template char const *toOct(uint128 v, uint32 width); -template char const *toOct (uint64 v, uint32 width); -template char const *toOct (uint32 v, uint32 width); -template char const *toOct (uint16 v, uint32 width); -template char const *toOct (uint8 v, uint32 width); - - - -template -char * -toDec(uintType v, char *ret, uint32 w) { - uint32 p = 64; - uint32 x = 0; + if (pos >= 16) + pos = 0; if (v == 0) { - ret[x++] = '0'; + ret[0] = '0'; + ret[1] = 0; } else { - while (v > 0) { // Write the number, low-order - p -= 1; // digits first, to the end - ret[p] = alpha[ v % 10 ]; // of the string. + while (v > 0) { // Write the number, low-order + p -= 1; // digits first, to the end + ret[p] = dec[ v % 10 ]; // of the string. v /= 10; } - for (x=0; p<64; x++, p++) // Shift the string so it - ret[x] = ret[p]; // starts at the origin. - } - - ret[x] = 0; - - return(ret + x); -} + for (x=0; p<64; x++, p++) // Shift the string so it + ret[x] = ret[p]; // starts at the origin. -template -char const * -toDec(uintType v, uint32 w) { - char *ret = getNextString(); - toDec(v, ret, w); - return(ret); -} - -template char *toDec(uint128 v, char *out, uint32 width); -template char *toDec (uint64 v, char *out, uint32 width); -template char *toDec (uint32 v, char *out, uint32 width); -template char *toDec (uint16 v, char *out, uint32 width); -template char *toDec (uint8 v, char *out, uint32 width); - -template char const *toDec(uint128 v, uint32 width); -template char const *toDec (uint64 v, uint32 width); -template char const *toDec (uint32 v, uint32 width); -template char const *toDec (uint16 v, uint32 width); -template char const *toDec (uint8 v, uint32 width); - - - -template -char * -toHex(uintType v, char *ret, uint32 w) { - uint32 L = sizeof(uintType) * 8 / 4; // The maximum possible width - uint32 W = (w + 3) / 4; // The user suggested width - uint32 l = std::min(L, W); - uint32 p = l; - uint32 s = 0; - - while (p > 0) { - p -= 1; - ret[p] = alpha[ (v >> s) & 0xf ]; - s += 4; + ret[x] = 0; } - ret[l] = 0; - - return(ret + l); -} - -template -char const * -toHex(uintType v, uint32 w) { - char *ret = getNextString(); - toHex(v, ret, w); return(ret); } -template char *toHex(uint128 v, char *out, uint32 width); -template char *toHex (uint64 v, char *out, uint32 width); -template char *toHex (uint32 v, char *out, uint32 width); -template char *toHex (uint16 v, char *out, uint32 width); -template char *toHex (uint8 v, char *out, uint32 width); - -template char const *toHex(uint128 v, uint32 width); -template char const *toHex (uint64 v, uint32 width); -template char const *toHex (uint32 v, uint32 width); -template char const *toHex (uint16 v, uint32 width); -template char const *toHex (uint8 v, uint32 width); - +template char const *toDec(uint128 v); +template char const *toDec (uint64 v); +template char const *toDec (uint32 v); +template char const *toDec (uint16 v); diff --git a/ext/meryl/src/utility/src/utility/types.H b/ext/meryl/src/utility/src/utility/types.H index 82ce17c..78421c9 100644 --- a/ext/meryl/src/utility/src/utility/types.H +++ b/ext/meryl/src/utility/src/utility/types.H @@ -66,314 +66,93 @@ #include -#include -#include -#include - #if defined(_FILE_OFFSET_BITS) && (_FILE_OFFSET_BITS == 32) #error I do not support 32-bit off_t. #endif -// Make the basic int types a bit more friendly. +typedef int8_t int8; +typedef int16_t int16; +typedef int32_t int32; +typedef int64_t int64; + +typedef uint8_t uint8; +typedef uint16_t uint16; +typedef uint32_t uint32; +typedef uint64_t uint64; +typedef __int128 int128; typedef unsigned __int128 uint128; -typedef __int128 int128; - -typedef uint64_t uint64; -typedef int64_t int64; - -typedef uint32_t uint32; -typedef int32_t int32; - -typedef uint16_t uint16; -typedef int16_t int16; - -typedef uint8_t uint8; -typedef int8_t int8; - -// There's no way to assign a constant value to the 128-bit integers -// directly, but with a helper function we can assign it using two 64-bit -// integers. This only really makes sense for the unsigned flavor, e.g., -// when used for bit packed quantities. - -constexpr inline uint128 build_uint128(uint64 a, uint64 b) { return(((uint128)a << 64) | ((uint128)b)); }; -constexpr inline int128 build_int128 ( int64 a, int64 b) { return( ((int128)a << 64) | ((int128)b)); }; - -// Some handy constants. -// -// numeric_limits<> on the 128-bit types is undefined, so we're forced to do -// it the hard way. - -constexpr uint128 uint128zero = 0; -constexpr uint128 uint128one = 1; -constexpr uint128 uint128min = 0; -constexpr uint128 uint128max = (uint128)(0xffffffffffffffffllu) << 64 | (uint128)(0xffffffffffffffffllu); - -constexpr int128 int128zero = 0; -constexpr int128 int128one = 1; -constexpr int128 int128min = (uint128)(0x8000000000000000llu) << 64 | (uint128)(0x0000000000000000llu); -constexpr int128 int128max = (uint128)(0x7fffffffffffffffllu) << 64 | (uint128)(0xffffffffffffffffllu); - -constexpr uint64 uint64zero = 0; -constexpr uint64 uint64one = 1; -constexpr uint64 uint64min = std::numeric_limits::min(); -constexpr uint64 uint64max = std::numeric_limits::max(); - -constexpr int64 int64zero = 0; -constexpr int64 int64one = 1; -constexpr int64 int64min = std::numeric_limits::min(); -constexpr int64 int64max = std::numeric_limits::max(); - -constexpr uint32 uint32zero = 0; -constexpr uint32 uint32one = 1; -constexpr uint32 uint32min = std::numeric_limits::min(); -constexpr uint32 uint32max = std::numeric_limits::max(); - -constexpr int32 int32zero = 0; -constexpr int32 int32one = 1; -constexpr int32 int32min = std::numeric_limits::min(); -constexpr int32 int32max = std::numeric_limits::max(); - -constexpr uint16 uint16zero = 0; -constexpr uint16 uint16one = 1; -constexpr uint16 uint16min = std::numeric_limits::min(); -constexpr uint16 uint16max = std::numeric_limits::max(); - -constexpr int16 int16zero = 0; -constexpr int16 int16one = 1; -constexpr int16 int16min = std::numeric_limits::min(); -constexpr int16 int16max = std::numeric_limits::max(); - -constexpr uint8 uint8zero = 0; -constexpr uint8 uint8one = 1; -constexpr uint8 uint8min = std::numeric_limits::min(); -constexpr uint8 uint8max = std::numeric_limits::max(); - -constexpr int8 int8zero = 0; -constexpr int8 int8one = 1; -constexpr int8 int8min = std::numeric_limits::min(); -constexpr int8 int8max = std::numeric_limits::max(); - -// Conversion from floating point to integer. lrint() rounds the -// floating-point argument to an integer value, using the current rounding -// mode. This mode can be set with std::fesetround(). - -inline int64 doubletoint64(double d) { return(std::llrint(d)); } -inline int32 doubletoint32(double d) { return(std:: lrint(d)); } - -// Decoding stings into numbers (and a boolean). -// - The first set simply convert the string to a number and return that -// number. -// - The second set converts the string to a number and returns a pointer -// to the letter in the string just after the number. -// -// There probably should be a strtobool() of the second form, but I'm not -// really sure what to do with the 'invalid' case that is currently treated -// as 'false'. - -uint128 strtoullll(char const *nptr, char **endptr); // The original strtoul() et al take char** - int128 strtollll (char const *nptr, char **endptr); // as the second arg. - -inline uint128 strtouint128(char const *str) { return((uint128)strtoullll(str, nullptr)); } -inline int128 strtoint128 (char const *str) { return( (int128)strtollll (str, nullptr)); } -inline uint64 strtouint64 (char const *str) { return((uint64) strtoull (str, nullptr, 10)); } -inline int64 strtoint64 (char const *str) { return( (int64) strtoll (str, nullptr, 10)); } -inline uint32 strtouint32 (char const *str) { return((uint32) strtoul (str, nullptr, 10)); } -inline int32 strtoint32 (char const *str) { return( (int32) strtol (str, nullptr, 10)); } -inline uint16 strtouint16 (char const *str) { return((uint16) strtoul (str, nullptr, 10)); } // WARNING: these convert to -inline int16 strtoint16 (char const *str) { return( (int16) strtol (str, nullptr, 10)); } // a 32-bit integer then cast -inline uint8 strtouint8 (char const *str) { return((uint8) strtoul (str, nullptr, 10)); } // to 16- or 8-bit integers. -inline int8 strtoint8 (char const *str) { return( (int8) strtol (str, nullptr, 10)); } -inline float strtofloat (char const *str) { return( (float) strtof (str, nullptr)); } -inline double strtodouble (char const *str) { return((double) strtod (str, nullptr)); } - -inline char const *strtonumber(char const *str, uint128 &num) { char *rem; num = (uint128)strtoullll(str, &rem); return(rem); } -inline char const *strtonumber(char const *str, int128 &num) { char *rem; num = (int128)strtollll (str, &rem); return(rem); } -inline char const *strtonumber(char const *str, uint64 &num) { char *rem; num = (uint64) strtoull (str, &rem, 10); return(rem); } -inline char const *strtonumber(char const *str, int64 &num) { char *rem; num = (int64) strtoll (str, &rem, 10); return(rem); } -inline char const *strtonumber(char const *str, uint32 &num) { char *rem; num = (uint32) strtoul (str, &rem, 10); return(rem); } -inline char const *strtonumber(char const *str, int32 &num) { char *rem; num = (int32) strtol (str, &rem, 10); return(rem); } -inline char const *strtonumber(char const *str, uint16 &num) { char *rem; num = (uint16) strtoul (str, &rem, 10); return(rem); } -inline char const *strtonumber(char const *str, int16 &num) { char *rem; num = (int16) strtol (str, &rem, 10); return(rem); } -inline char const *strtonumber(char const *str, uint8 &num) { char *rem; num = (uint8) strtoul (str, &rem, 10); return(rem); } -inline char const *strtonumber(char const *str, int8 &num) { char *rem; num = (int8) strtol (str, &rem, 10); return(rem); } -inline char const *strtonumber(char const *str, float &num) { char *rem; num = (double) strtof (str, &rem); return(rem); } -inline char const *strtonumber(char const *str, double &num) { char *rem; num = (double) strtod (str, &rem); return(rem); } - -inline bool strtobool(char const *str) { - if ((str == nullptr) || - (str[0] == 0)) - return(false); - - if (((str[0] == 'y') && (str[1] == 0)) || - ((str[0] == 'Y') && (str[1] == 0)) || - ((str[0] == 't') && (str[1] == 0)) || - ((str[0] == 'T') && (str[1] == 0)) || - ((str[0] == '1') && (str[1] == 0)) || - ((str[0] == '+') && (str[1] == 0))) - return(true); - - if ((strcasecmp(str, "yes") == 0) || - (strcasecmp(str, "true") == 0)) - return(true); - - return(false); -} - -// Test if a character or string is of the desired encoding. - -inline bool isNUL(char c) { return(c == 0); } - -inline bool isVisible(char c) { return(('!' <= c) && (c <= '~')); } - -inline bool isLetter(char c) { return((('a' <= c) && (c <= 'z')) || - (('A' <= c) && (c <= 'Z'))); } - -inline bool isWhiteSpace(char c) { return((c == ' ') || (c == '\n') || - (c == '\t') || (c == '\r')); }; - -inline bool isComment(char c) { return((c == '!') || (c == '#') || (c == 0)); }; -inline bool isDelimiter(char c) { return((c == ':') || (c == '=') || isWhiteSpace(c)); }; - -inline bool isBinDigit(char c) { return((('0' <= c) && (c <= '1'))); } -inline bool isOctDigit(char c) { return((('0' <= c) && (c <= '7'))); } -inline bool isDecDigit(char c) { return((('0' <= c) && (c <= '9'))); } -inline bool isHexDigit(char c) { return((('0' <= c) && (c <= '9')) || - (('a' <= c) && (c <= 'f')) || - (('A' <= c) && (c <= 'F'))); } - -bool isBinNumber (char const *s); -bool isOctNumber (char const *s); -bool isDecNumber (char const *s, char dot='.'); -bool inline isDecInteger(char const *s) { return(isDecNumber(s, 0)); }; -bool inline isDecFloat (char const *s) { return(isDecNumber(s, '.')); }; -bool isHexNumber (char const *s); - -// Disallow the usual character tests becuse of their goofy return values. - -#undef isalnum -#undef isalpha -#undef iscntrl -#undef isdigit -#undef isgraph -#undef islower -#undef isprint -#undef ispunct -#undef isspace -#undef isupper -#undef isxdigit -#undef isnumber - -int inline isalnum (char c) = delete; -int inline isalpha (char c) = delete; -int inline iscntrl (char c) = delete; -int inline isdigit (char c) = delete; -int inline isgraph (char c) = delete; -int inline islower (char c) = delete; -int inline isprint (char c) = delete; -int inline ispunct (char c) = delete; -int inline isspace (char c) = delete; -int inline isupper (char c) = delete; -int inline isxdigit(char c) = delete; -int inline isnumber(char c) = delete; - -// Convert an ascii binary, octal, decimal or hexadecimal letter to an -// integer. No type checking is performed; you've already called -// isHexNumber() et al, right? -// -// The pieces of asciiHexToInteger() are as follows: -// (d & 0xf) // Decodes '0'-'9' as 0-9, 'a' - 'f' as 1-6 -// (d >> 6) // Decodes digits as 0, letters as 1. -// ((d >> 6) << 3) // Decodes digits as 0, letters as 8. - -inline uint8 asciiBinToInteger(char d) { return(d - '0'); } // Pretty trivial. -inline uint8 asciiOctToInteger(char d) { return(d - '0'); } -inline uint8 asciiDecToInteger(char d) { return(d - '0'); } -inline uint8 asciiHexToInteger(char d) { return(((uint8)d & 0xf) + ((uint8)d >> 6) + (((uint8)d >> 6) << 3)); } - -// Convert an integer to a printable letter. If it's not a printable -// letter, returns '.'. - -inline -char -integerToLetter(uint32 i) { - return(((' ' <= i) && (i <= '~')) ? i : '.'); -} - -// Convert a string representing a set of numbers to -// - the first and last values (for form '#' or '#-#') -// - a vector of the low and high values -// - a set of the values -// -// The string should be comprised of multiple comma separated ranges: -// - # a single number -// - #-# a range of numbers -// - #/# a one-out-of-N specification -// -// The first form returns a pointer to the letter after the decoded values. -// -// If a single number is encountered in the first or second forms, both -// 'bgn' and 'end' are set to that value. -// -// If 'numberType' is a 128-bit integer, only 64-bit integers can be -// converted. - -template char const *decodeRange(char const *range, numberType &bgn, numberType &end); -template void decodeRange(char const *range, std::vector &bgn, std::vector &end); -template void decodeRange(char const *range, std::set &values); - -// Convert an unsigned integer representing bits or bytes to -// a floating point number representing GB or MB. - -inline double bitsToGB(uint64 bits) { return(bits / 8 / 1024.0 / 1024.0 / 1024.0); } -inline double bitsToMB(uint64 bits) { return(bits / 8 / 1024.0 / 1024.0); } - -// Convert an unsigned integer to one with 3 significant digit number, and -// also return the correct SI base. - -uint64 scaledNumber(uint64 n, uint32 div=1024); // Return n between 0 and div, -char scaledUnit (uint64 n, uint32 div=1024); // and the SI unit of that -const char *scaledName (uint64 n, uint32 div=1024); // scaling. - -// Convert an unsigned integer to a character string in the desired base. -// -// char *toXXX(v, str) -// Expects a pre-allocated character buffer 'str' with enough space for -// the output string and a NUL terminating byte. It returns a pointer -// to the NUL byte. A 128-bit integer in: -// binary needs 129 bytes -// octal needs 44 bytes -// decimal needs 40 bytes (it's 340,282,366,920,938,463,463,374,607,431,768,211,455) -// hexadecimal needs 33 bytes -// -// char const *toXX(v) -// Returns a pointer to one of 32 private string buffers. This is -// thread safe, as long as you don't use it more than 32 times at once. -// -// Both forms take an optional 'width' (in bits) to display. The actual -// width used is the minimum of this width and the number of bits in the -// type. toDec() accepts the width, but doesn't use it. - -template char *toBin(uintType value, char *out, uint32 width=128); -template char *toOct(uintType value, char *out, uint32 width=128); -template char *toDec(uintType value, char *out, uint32 width=128); -template char *toHex(uintType value, char *out, uint32 width=128); - -template char const *toBin(uintType value, uint32 width=128); -template char const *toOct(uintType value, uint32 width=128); -template char const *toDec(uintType value, uint32 width=128); -template char const *toHex(uintType value, uint32 width=128); - -// Format specifications for printf() - -#define F_PTR "0x%016p" // Pointers -#define F_C "%c" // Characters + +#define uint128NUMBER(A,B) ((((uint128)A) << 64) | ((uint128)B)) +#define uint64NUMBER(X) X ## LLU +#define uint32NUMBER(X) X ## LU + +#define uint64ZERO uint64NUMBER(0x0000000000000000) +#define uint64ONE uint64NUMBER(0x0000000000000001) +#define uint64MAX uint64NUMBER(0xffffffffffffffff) +#define uint64MASK(X) (((~uint64ZERO) >> (64 - (X))) & (-(uint64)((X) != 0))) + +#define uint32ZERO uint32NUMBER(0x00000000) +#define uint32ONE uint32NUMBER(0x00000001) +#define uint32MAX uint32NUMBER(0xffffffff) +#define uint32MASK(X) (((~uint32ZERO) >> (32 - (X))) & (-(uint32)((X) != 0))) + +#define uint16ZERO (0x0000) +#define uint16ONE (0x0001) +#define uint16MAX (0xffff) +#define uint16MASK(X) (((~uint16ZERO) >> (16 - (X))) & (-(uint16)((X) != 0))) + +#define uint8ZERO (0x00) +#define uint8ONE (0x01) +#define uint8MAX (0xff) +#define uint8MASK(X) (((~uint8ZERO) >> (8 - (X))) & (-(uint8)((X) != 0))) + + +inline int32 strtoint32 (char *str) { return( (int32)strtol (str, NULL, 10)); } +inline uint32 strtouint32(char *str) { return((uint32)strtoul (str, NULL, 10)); } +inline int64 strtoint64 (char *str) { return( (int64)strtoll (str, NULL, 10)); } +inline uint64 strtouint64(char *str) { return((uint64)strtoull(str, NULL, 10)); } +inline double strtodouble(char *str) { return((double)strtod (str, NULL)); } + +inline void strtonumber( int32 &num, char *str, char **rem) { num = (int32)strtol (str, rem, 10); } +inline void strtonumber(uint32 &num, char *str, char **rem) { num = (uint32)strtoul (str, rem, 10); } +inline void strtonumber( int64 &num, char *str, char **rem) { num = (int64)strtoll (str, rem, 10); } +inline void strtonumber(uint64 &num, char *str, char **rem) { num = (uint64)strtoull(str, rem, 10); } +inline void strtonumber(double &num, char *str, char **rem) { num = (double)strtod (str, rem); } + +inline int32 doubletoint32(double d) { return((int32) ((1.0 + 16.0 * DBL_EPSILON) * d)); }; +inline int64 doubletoint64(double d) { return((int64) ((1.0 + 16.0 * DBL_EPSILON) * d)); }; + +template char const *toHex(uintType v); +template char const *toDec(uintType v); + +// These macros are use to eliminate inter-platform differnces between +// calculated results +//#define DBL_TO_INT(X) ((int)((1.0+16.0*DBL_EPSILON)*(X))) +//#define ROUNDPOS(X) (DBL_TO_INT((X)+0.5) ) +//#define ROUND(X) (((X)>0.0) ? ROUNDPOS(X) : -ROUNDPOS(-(X)) ) +//#define ZERO_PLUS ( 16.0*DBL_EPSILON) +//#define ZERO_MINUS (-16.0*DBL_EPSILON) +//#define ONE_PLUS (1.0+ZERO_PLUS) +//#define ONE_MINUS (1.0+ZERO_MINUS) +//#define INT_EQ_DBL(I,D) (fabs((double)(I)-(D)) < 16.0*DBL_EPSILON ) +//#define DBL_EQ_DBL(A,B) (fabs((A)-(B))<16.0*DBL_EPSILON) + +// Pointers +#define F_PTR "0x%016p" + +// Characters +#define F_C "%c" #define F_CP "c" #define F_CI "%*c" -#define F_STR "%s" // Strings + +// Strings +#define F_STR "%s" #define F_STRP "s" #define F_STRI "%*s" -#define F_S16 "%" PRId16 // Integers + +// Integers +#define F_S16 "%" PRId16 #define F_S16P PRId16 #define F_S16I "%*" PRId16 #define F_U16 "%" PRIu16 @@ -394,15 +173,20 @@ template char const *toHex(uintType value, uint32 width=128); #define F_X64 "%016" PRIx64 #define F_X64P PRIx64 #define F_X64I "%*" PRIx64 -#define F_F32 "%f" // Floating points + +// Floating points +#define F_F32 "%f" #define F_F32P "f" #define F_F32I "%*f" #define F_F64 "%lf" #define F_F64P "lf" #define F_F64I "%*lf" -#define F_SIZE_T "%zu" // Standard typedefs + +// Standard typedefs +#define F_SIZE_T "%zu" #define F_SIZE_TP "zu" #define F_SIZE_TI "%*zu" + #define F_OFF_T F_S64 #define F_OFF_TP F_S64P #define F_OFF_TI F_S64I