diff --git a/.github/workflows/continuous-integration-pip.yml b/.github/workflows/continuous-integration-pip.yml index f4624dc8..76ca98f2 100644 --- a/.github/workflows/continuous-integration-pip.yml +++ b/.github/workflows/continuous-integration-pip.yml @@ -12,34 +12,16 @@ name: BE2 Unit Tests on MacOS and Ubuntu (c++17) on: [push, pull_request] jobs: - # https://github.com/marketplace/actions/skip-duplicate-actions - pre_job: - runs-on: ${{ matrix.os }} + build: strategy: matrix: - os: ['macos-latest', 'ubuntu-latest'] - outputs: - should_skip: ${{ steps.skip_check.outputs.should_skip }} - steps: - - id: skip_check - uses: fkirc/skip-duplicate-actions@v5.3.0 - with: - # All of these options are optional, so you can remove them if you are happy with the defaults - concurrent_skipping: 'never' - skip_after_successful_duplicate: 'true' - paths_ignore: '["**/README.md", "**/docs/**"]' - do_not_skip: '["pull_request", "workflow_dispatch", "schedule"]' - - main_job: - needs: pre_job - if: ${{ needs.pre_job.outputs.should_skip != 'true' }} + #os: ['macos-13', 'macos-13-arm64', 'ubuntu-22.04'] + os: ['ubuntu-22.04'] runs-on: ${{ matrix.os }} - strategy: - matrix: - os: ['macos-latest', 'ubuntu-latest'] env: DEBUG_5G: FALSE DEBUG_FAST: TRUE + MAKE_OPTS: -j2 steps: # https://github.com/actions/checkout @@ -53,50 +35,68 @@ jobs: run: | echo "" | bash etc/CONFIGURE_MACOS.bash - - name: "Ubuntu: run CONFIGURE_UBUNTU20LTS.bash" + - name: "Ubuntu: run CONFIGURE_UBUNTU22LTS.bash" if: startsWith(matrix.os, 'ubuntu') run: | - echo "" | bash etc/CONFIGURE_UBUNTU20LTS.bash + echo "" | bash etc/CONFIGURE_UBUNTU22LTS.bash + - - name: C++ checks with address-sanitizer (Mac and Linux) + - name: Make configure script run: | - echo GCC VERSION: + autoconf --version + automake --version + aclocal --version gcc --version - echo G++ VERSION: g++ --version bash bootstrap.sh - echo === Try Address Sanitizer Without Optimization === - ./configure -q --disable-opt --enable-address-sanitizer - 
make clean - make all - (cd src; make check || (cat test-suite.log; exit 1)) + + - name: Dump configure script + run: | + cat configure + + - name: C++ checks not optimizaiton with address-sanitizer (Mac and Linux) + run: | + ./configure --disable-opt --enable-address-sanitizer + make ${{MAKE_OPTS}} all + cd src + make ${{MAKE_OPTS}} bulk_extractor + make check || (cat test-suite.log; exit 1) + cd .. make distclean + + - name: C++ checks optimization with address-sanitizer (Mac and Linux) + run: | echo === Try Address Sanitizer Optimization === - ./configure -q --enable-address-sanitizer - make clean - make all - (cd src; make check || (cat test-suite.log; exit 1)) + ./configure --enable-address-sanitizer + make ${{ MAKE_OPTS }} all + cd src + make ${{ MAKE_OPTS }} bulk_extractor + make check || (cat test-suite.log; exit 1) + cd .. make distclean - name: C++ checks with thread-sanitizer on ubuntu are disabled if: startsWith(matrix.os, 'ubuntu-DISABLED') run: | bash bootstrap.sh - ./configure -q --enable-silent-rules --enable-thread-sanitizer + ./configure --enable-silent-rules --enable-thread-sanitizer make clean - make all - (cd src; make check || (cat test-suite.log; exit 1)) + make ${{ MAKE_OPTS }} all + cd src + make ${{ MAKE_OPTS }} bulk_extractor + make check || (cat test-suite.log; exit 1) + cd .. 
make distclean - name: C++ checks with codecov if: startsWith(matrix.os, 'ubuntu') run: | bash bootstrap.sh - ./configure -q --disable-opt --enable-silent-rules CFLAGS='-g -O0 -fprofile-arcs -ftest-coverage' CXXFLAGS='-g -O0 -fprofile-arcs -ftest-coverage' LIBS='-lgcov' + ./configure --disable-opt CFLAGS='-g -O0 -fprofile-arcs -ftest-coverage' CXXFLAGS='-g -O0 -fprofile-arcs -ftest-coverage' LIBS='-lgcov -lre2' make clean - make all + make ${{ MAKE_OPTS }} all cd src - make check || (echo ==error== ; cat test-suite.log; exit 1) + make ${{ MAKE_OPTS }} check || (echo ==error== ; cat test-suite.log; exit 1) - name: upload codecov report if: startsWith(matrix.os, 'ubuntu') @@ -107,24 +107,17 @@ jobs: gcov-9 -o . *.cpp be20_api/*.cpp bash <(curl -s https://codecov.io/bash) - # - # set up and run codecov - # - ## sudo apt install -y gpg - ## curl https://keybase.io/codecovsecurity/pgp_keys.asc | \ - ## gpg --no-default-keyring --keyring trustedkeys.gpg --import - ## curl -Os https://uploader.codecov.io/latest/linux/codecov - ## curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM - ## curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig - ## gpgv codecov.SHA256SUM.sig codecov.SHA256SUM - ## shasum -a 256 -c codecov.SHA256SUM - ## chmod +x codecov && ./codecov -t ${CODECOV_TOKEN} - - uses: ammaraskar/gcc-problem-matcher@0.2.0 name: GCC Problem Matcher - name: distcheck run: | - ./configure -q - make clean - make distcheck + ./configure + make ${{ MAKE_OPTS }} clean + #make distcheck + make ${{ MAKE_OPTS }} dist + ls -l + tar xfvz *.tar.gz + cd $(basename *gz .tar.gz) + ./configure + make ${{ MAKE_OPTS }} check diff --git a/Makefile.am b/Makefile.am index aa8448f3..244d0481 100644 --- a/Makefile.am +++ b/Makefile.am @@ -31,12 +31,13 @@ EXTRA_DIST = \ m4/ac_prog_javah.m4 \ m4/ac_try_compile_java.m4 \ m4/ac_try_run_javac.m4 \ - m4/ax_pthread.m4 \ + m4/ax_cxx_compile_stdcxx.m4 \ + m4/ax_cxx_compile_stdcxx_17.m4 \ bootstrap.sh \ README.md \ 
$(SRC_WIN_DIST) -ACLOCAL_AMFLAGS = -I m4 +ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 distclean2: @echo Deleting: diff --git a/configure.ac b/configure.ac index 77100c7f..6dc3356f 100644 --- a/configure.ac +++ b/configure.ac @@ -8,23 +8,22 @@ # and http://www.bioinf.uni-freiburg.de/~mmann/HowTo/automake.html AC_PREREQ([2.69]) -AC_INIT([BULK_EXTRACTOR],[2.0.6],[bugs@digitalcorpora.org]) +AC_INIT([BULK_EXTRACTOR],[2.1.0],[bugs@digitalcorpora.org]) AC_CONFIG_MACRO_DIR(m4) -AC_CONFIG_AUX_DIR([build-aux]) + +AC_MSG_NOTICE([at start CPPFLAGS are $CPPFLAGS]) ################################################################ ## Includes -m4_include([m4/slg_mingw_support.m4]) -m4_include([m4/slg_searchdirs.m4]) -m4_include([src/be20_api/be20_configure.m4]) -m4_include([src/be20_api/dfxml_cpp/src/dfxml_configure.m4]) - AC_CONFIG_FILES([Makefile doc/Makefile doc/latex_manuals/Makefile src/Makefile src/tests/Makefile man/Makefile \ python/Makefile specfiles/Makefile specfiles/bulk_extractor.spec.m4 tests/Makefile ]) AC_CONFIG_HEADERS([config.h]) +AC_CONFIG_AUX_DIR([build-aux]) + AM_INIT_AUTOMAKE AM_MAINTAINER_MODE +AC_PROG_RANLIB AC_PREFIX_PROGRAM(bulk_extractor) dnl build for same location # Programs we will be using @@ -33,20 +32,33 @@ AC_PROG_CXX AM_PROG_CC_C_O dnl allow per-produce flags AC_PROG_INSTALL AC_PROG_LEX([noyywrap]) -# LT_INIT([disable-shared]) # Must use C++17 mode. 
-# Validate that we have - m4_include([m4/ax_cxx_compile_stdcxx.m4]) AC_LANG_PUSH(C++) AX_CXX_COMPILE_STDCXX([17], [noext], [mandatory]) -AC_CHECK_HEADER([filesystem], [], AC_MSG_ERROR([ header not installed; bulk_extractor requires a C++ compiler with a full C++17 implementation])) AC_LANG_POP() # Turn on all warnings m4_include([src/be20_api/m4/slg_gcc_all_warnings.m4]) +################################################################ +# Specify our headers +# +# Check for headers used by bulk Extractor +# do not put pthread here +# +# Check dfxml and be20_api headers + +m4_include([src/be20_api/dfxml_cpp/src/dfxml_configure.m4]) +m4_include([src/be20_api/m4/slg_address_sanitizer.m4]) +m4_include([src/be20_api/m4/slg_noopt.m4]) +m4_include([src/be20_api/be20_configure.m4]) + +m4_include([m4/slg_mingw_support.m4]) +m4_include([m4/slg_searchdirs.m4]) + + # These are needed for bulk_extractor. They are not needed for tcpflow AC_DEFINE([BULK_EXTRACTOR],1,[We are compiling bulk_extractor]) @@ -66,8 +78,6 @@ else AC_MSG_ERROR([$LEX does not support -R. Please get a modern version of gnu flex]) fi - - ################################################################ ## rar support AC_ARG_ENABLE([rar], @@ -294,10 +304,6 @@ else CXXFLAGS="$CXXFLAGS -fPIC" fi -m4_include([src/be20_api/m4/slg_address_sanitizer.m4]) -m4_include([src/be20_api/m4/slg_noopt.m4]) - - ################################################################ ## Check on two annoying warnings @@ -307,12 +313,23 @@ m4_include([src/be20_api/m4/slg_noopt.m4]) # --------------------------------------------------------------- # Try to compile PROGRAM. 
AC_DEFUN([MY_COMPILE_CLEAN_IFELSE], -[AC_REQUIRE([AC_PROG_EGREP]) -AC_COMPILE_IFELSE([$1],[retval=0 -if $EGREP -i -c -E 'fatal|error|unrecognized|not found|not exist' conftest.err >/dev/null; then retval=1; fi -],[retval=1]) + [AC_REQUIRE([AC_PROG_EGREP]) + AC_COMPILE_IFELSE([$1],[retval=0 + if $EGREP -i -c -E 'fatal|error|unrecognized|not found|not exist' conftest.err >/dev/null; then retval=1; fi + ],[retval=1]) AS_IF([test $retval = 0],[$2],[$3])]) +################################################################ +# Take out duplicate flags +CFLAGS=$(echo $CFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ') +CPPFLAGS=$(echo $CPPFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ') +CXXFLAGS=$(echo $CXXFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ') + +################################################################ +AC_MSG_NOTICE([CFLAGS are now $CFLAGS]) +AC_MSG_NOTICE([CPPFLAGS are now $CPPFLAGS]) +AC_MSG_NOTICE([CXXFLAGS are now $CXXFLAGS]) + AC_MSG_NOTICE([*************************************]) AC_MSG_NOTICE([*************************************]) diff --git a/etc/CONFIGURE_MACOS.bash b/etc/CONFIGURE_MACOS.bash index 3a95835f..6b65c707 100755 --- a/etc/CONFIGURE_MACOS.bash +++ b/etc/CONFIGURE_MACOS.bash @@ -22,7 +22,7 @@ read # Note: openssl no longer required # Apple's provided flex is 2.6.4, which is the same that is provided by brew -PKGS+="wget libtool autoconf automake libtool libxml2 libewf json-c" +PKGS+="wget libtool autoconf automake libtool libxml2 libewf json-c re2 abseil pkg-config" $WHICH install $PKGS || (echo installation install failed; exit 1) exit 0 diff --git a/etc/CONFIGURE_UBUNTU20LTS.bash b/etc/CONFIGURE_UBUNTU20LTS.bash index e71a9621..304e2088 100755 --- a/etc/CONFIGURE_UBUNTU20LTS.bash +++ b/etc/CONFIGURE_UBUNTU20LTS.bash @@ -4,7 +4,7 @@ RELEASE=20 CONFIGURE="./configure -q --enable-silent-rules" AUTOCONF_DIST=https://ftp.gnu.org/gnu/autoconf/autoconf-2.71.tar.gz AUTOMAKE_DIST=https://ftp.gnu.org/gnu/automake/automake-1.16.3.tar.gz 
-MKPGS="build-essential flex libexpat1-dev libxml2-utils libssl-dev libtool make pkg-config zlib1g-dev" +MKPGS="build-essential flex libabsl-dev libexpat1-dev libre2-dev libssl-dev libtool libxml2-utils make pkg-config zlib1g-dev" # libxml2-utils needed for xmllint WGET="wget -nv --no-check-certificate" CONFIGURE="./configure -q --enable-silent-rules" diff --git a/etc/CONFIGURE_UBUNTU20_win64.bash b/etc/CONFIGURE_UBUNTU20_win64.bash index df01a801..53937719 100755 --- a/etc/CONFIGURE_UBUNTU20_win64.bash +++ b/etc/CONFIGURE_UBUNTU20_win64.bash @@ -3,3 +3,4 @@ sudo apt-get install -y mingw-w64 sudo apt-get install -y wine sudo apt install libz-mingw-w64-dev sudo apt install libgcrypt-mingw-w64-dev +sudo apt install libxml2-utils diff --git a/etc/CONFIGURE_UBUNTU22LTS.bash b/etc/CONFIGURE_UBUNTU22LTS.bash new file mode 100755 index 00000000..e37019b9 --- /dev/null +++ b/etc/CONFIGURE_UBUNTU22LTS.bash @@ -0,0 +1,68 @@ +#!/bin/bash +SCRIPT_DIR="$(readlink -f $(dirname "${BASH_SOURCE[0]}"))" +RELEASE=22 +CONFIGURE="./configure -q --enable-silent-rules" +MKPGS="autoconf automake g++ flex libabsl-dev libexpat1-dev libre2-dev libssl-dev libtool libssl-dev libxml2-utils make pkg-config zlib1g-dev" +WGET="wget -nv --no-check-certificate" +CONFIGURE="./configure -q --enable-silent-rules" +MAKE="make -j2" +trap "exit 1" TERM +export TOP_PID=$$ +cat </dev/null && sudo make install) +ls -l /etc/ld.so.conf.d/ +sudo ldconfig +ewfinfo -h >/dev/null || (echo could not install libewf; exit 1) diff --git a/etc/CONFIGURE_UBUNTU22_win64.bash b/etc/CONFIGURE_UBUNTU22_win64.bash index 1ed1ec9a..e39e89e1 100755 --- a/etc/CONFIGURE_UBUNTU22_win64.bash +++ b/etc/CONFIGURE_UBUNTU22_win64.bash @@ -5,22 +5,23 @@ OS_NAME=ubuntu OS_VERSION=22 MAKE_CONCURRENCY=-j2 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +MPKGS="autoconf automake flex g++ gcc git libtool libabsl-dev libre2-dev libxml2-utils libz-mingw-w64-dev libgcrypt-mingw-w64-dev libsqlite3-dev make mingw-w64 wine " cd 
$SCRIPT_DIR -. ./paths.bash +. /paths.bash -if [ ! -r /etc/os-release ]; then +if [[ ! -r /etc/os-release ]]; then echo This requires /etc/os-release exit 1 fi . /etc/os-release -if [ $ID != $OS_NAME ]; then +if [[ $ID != $OS_NAME ]]; then echo This requires $OS_NAME Linux. You have $ID. exit 1 fi -if [ $VERSION_ID -ne $OS_VERSION ]; then +if [[ $VERSION_ID != $OS_VERSION ]]; then echo This requires $OS_NAME version $OS_VERSION. You have $ID $VERSION_ID. exit 1 fi @@ -34,7 +35,6 @@ press any key to continue... EOF read -MPKGS="autoconf automake make flex gcc g++ git libtool mingw-w64 wine libz-mingw-w64-dev libgcrypt-mingw-w64-dev" sudo apt update -y sudo apt install -y $MPKGS diff --git a/etc/paths.bash b/etc/paths.bash index c278b035..862b5012 100755 --- a/etc/paths.bash +++ b/etc/paths.bash @@ -1,7 +1,11 @@ +echo paths.bash + LIBEWF_URL=https://github.com/libyal/libewf-legacy/releases/download/20140814/libewf-20140814.tar.gz LIBEWF_FNAME=$(basename $LIBEWF_URL) LIBEWF_DIR=$( echo $LIBEWF_FNAME | sed s/-experimental// | sed s/.tar.gz//) +echo LIBEWF_URL=$LIBEWF_URL + function make_libewf { echo echo "Now installing libewf into $LIBEWF_DIR" @@ -13,7 +17,7 @@ function make_libewf { && sudo make install) || (echo could not build libewf. Stop; exit 1) echo Cleaning up $LIBEWF_FNAME and $LIBEWF_DIR /bin/rm -rf $LIBEWF_FNAME $LIBEWF_DIR || (echo could not clean up. 
Stop; exit 1) - + # Make sure that /usr/local/lib is in ldconfig sudo /bin/rm -f /tmp/local.conf echo /usr/local/lib > /tmp/local.conf diff --git a/src/Makefile.am b/src/Makefile.am index 87518c75..6f64244c 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,23 +1,34 @@ +BE20_API_DIR= be20_api +include be20_api/Makefile.defs +include rar/Makefile.defs + +DFXML_SRC_DIR=be20_api/dfxml_cpp/src/ +include $(DFXML_SRC_DIR)Makefile.defs + +EXTRA_DIST = .gitignore $(BE20_API_EXTRA_DIST) + +ETAGS = etags-emacs +ACLOCAL_AMFLAGS = -I m4 + +# So that relative imports from be20_api work +AM_CPPFLAGS = @RE2_CFLAGS@ -I$(top_srcdir)/src/be20_api -I$(top_srcdir)/src/be20_api/utfcpp/source + +bulk_extractor_LDADD = @RE2_LIBS@ +test_be_LDADD = @RE2_LIBS@ + +AUTOMAKE_OPTIONS = subdir-objects + bin_PROGRAMS = bulk_extractor +check_PROGRAMS = test_be +TESTS = $(check_PROGRAMS) CLEANFILES = scan_accts.cpp scan_base16.cpp scan_email.cpp scan_gps.cpp \ be20_api/config.h be20_api/dfxml/src/config.h config.h *.d *~ -TESTS = test_be -check_PROGRAMS = test_be # bring in defs from two included projects # note: don't be clever and use the $variable on the Include line. 
it doesn't work -BE20_API_DIR= be20_api -include be20_api/Makefile.defs -include rar/Makefile.defs - -# So that relative imports from be20_api work -AM_CPPFLAGS = -I$(top_srcdir)/src/be20_api -I$(top_srcdir)/src/be20_api/utfcpp/source -AUTOMAKE_OPTIONS = subdir-objects -EXTRA_DIST = .gitignore $(BE20_API_EXTRA_DIST) - TSK3INCS = \ tsk3_fatdirs.h\ tsk3/auto/tsk_auto.h\ @@ -32,8 +43,6 @@ TSK3INCS = \ tsk3/tsk_incs.h\ tsk3/vs/tsk_vs.h - - # These scanners are based on GNUflex flex_scanners = \ sbuf_flex_scanner.h \ @@ -129,10 +138,9 @@ bulk_extractor_parts = \ sbuf_decompress.cpp \ sbuf_decompress.h - bulk_extractor_SOURCES = $(bulk_extractor_parts) $(scanners_builtin) main.cpp -test_be_SOURCES = $(bulk_extractor_parts) $(scanners_builtin) be20_api/catch.hpp test_be.h test_be1.cpp test_be2.cpp test_be3.cpp - +test_be_SOURCES = $(bulk_extractor_parts) $(scanners_builtin) \ + be20_api/catch.hpp test_be.h test_be1.cpp test_be2.cpp test_be3.cpp runs.txt: test_be tests/run_each.sh bash tests/run_each.sh > runs.txt 2>&1 diff --git a/src/be20_api b/src/be20_api index 278ec93b..0bd6a2a8 160000 --- a/src/be20_api +++ b/src/be20_api @@ -1 +1 @@ -Subproject commit 278ec93b0b522a1017fa69e60c3598bcf1d14fc6 +Subproject commit 0bd6a2a8a72dffcc867cef478a3f84cf306f131f diff --git a/src/bulk_extractor.cpp b/src/bulk_extractor.cpp index 4dd1646b..d78b5d86 100644 --- a/src/bulk_extractor.cpp +++ b/src/bulk_extractor.cpp @@ -56,13 +56,16 @@ int _CRT_fmode = _O_BINARY; [[noreturn]] void debug_help() { puts( "#define DEBUG_PEDANTIC 0x0001 // check values more rigorously" ); - puts( "#define DEBUG_PRINT_STEPS 0x0002 // prints as each scanner is started" ); + puts( "#define DEBUG_PRINT_STEPS 0x0002 // prints as each scanner is started" ); puts( "#define DEBUG_SCANNER 0x0004 // dump all feature writes to stderr" ); - puts( "#define DEBUG_NO_SCANNERS 0x0008 // do not run the scanners " ); + puts( "#define DEBUG_NO_SCANNERS 0x0008 // do not run the scanners " ); puts( "#define DEBUG_DUMP_DATA 
0x0010 // dump data as it is seen " ); puts( "#define DEBUG_INFO 0x0040 // print extra info" ); puts( "#define DEBUG_EXIT_EARLY 1000 // just print the size of the volume and exis " ); puts( "#define DEBUG_ALLOCATE_512MiB 1002 // Allocate 512MiB, but don't set any flags " ); + puts( "// any debug can also be enabled by setting the corresponding environment variables"); + puts( "// e.g. DEBUG_PRINT_STEPS=1 ./bulk_extractor ..."); + exit( 1); } @@ -292,15 +295,21 @@ int bulk_extractor_main( std::ostream &cout, std::ostream &cerr, int argc,char * cfg.opt_pagesize = scaled_stoi64( result["pagesize"].as()); cfg.opt_marginsize = scaled_stoi64( result["marginsize"].as()); + /*** SET THREADING OPTIONS ***/ + if ( result.count("threads")>0 && result.count("no_threads") >0) { + throw std::runtime_error("--threads and --no_threads conflict"); + } + try { cfg.num_threads = result["threads"].as(); } catch ( cxxopts::option_has_no_value_exception &e ) { cfg.num_threads = 0; } - if ( result.count( "no_threads" )) { + if ( result.count("no_threads")) { cfg.num_threads = 0; } + /***/ sc.max_depth = result["max_depth"].as(); cfg.max_bad_alloc_errors = result["max_bad_alloc_errors"].as(); diff --git a/src/phase1.cpp b/src/phase1.cpp index a4309fbb..cd143a1c 100644 --- a/src/phase1.cpp +++ b/src/phase1.cpp @@ -282,12 +282,18 @@ void Phase1::dfxml_write_source() void Phase1::phase1_run() { assert(ss.get_current_phase() == scanner_params::PHASE_SCAN); - // save all of the pages we have seen in the DFXML file + + // save all of the pages we had previously seen (through restarting) in the DFXML file for (const auto &it : config.seen_page_ids) { - ss.record_work_start_pos0str( it ); + ss.record_work_start_stop_pos0str( it ); } + + // now start the new run xreport.push("runtime","xmlns:debug=\"http://www.github.com/simsong/bulk_extractor/issues\""); + + // process all of the sbufs read_process_sbufs(); + if (!config.opt_quiet) cout << "All data read; waiting for threads to finish..." 
<< std::endl; ss.join(); xreport.pop("runtime"); diff --git a/src/scan_find.cpp b/src/scan_find.cpp index 1a238883..19a949ff 100644 --- a/src/scan_find.cpp +++ b/src/scan_find.cpp @@ -10,6 +10,9 @@ #include "be20_api/utils.h" // needs config.h #include "be20_api/dfxml_cpp/src/dfxml_writer.h" +// We need the defaults for page scan and margin. We really should get the current ones... +#include "phase1.h" + // anonymous namespace hides symbols from other cpp files (like "static" applied to functions) // TODO: make this not a global variable namespace { @@ -43,6 +46,41 @@ namespace { } } +void scan_find_sbuf(scanner_params &sp, sbuf_t &sbuf) +{ + feature_recorder &f = sp.named_feature_recorder("find"); + + auto *tbuf = sbuf_t::sbuf_malloc(sp.sbuf->pos0, sp.sbuf->bufsize+1, sp.sbuf->bufsize+1); + memcpy(tbuf->malloc_buf(), sp.sbuf->get_buf(), sp.sbuf->bufsize); + const char *tbase = static_cast(tbuf->malloc_buf()); + tbuf->wbuf(sp.sbuf->bufsize, 0); // null terminate + + /* Now see if we can find a string */ + for (size_t pos = 0; pos < sp.sbuf->pagesize && pos < sp.sbuf->bufsize;) { + std::string found; + size_t offset=0; + size_t len = 0; + + if ( find_list.search_all( tbase+pos, &found, &offset, &len)) { + if (len == 0) { + pos += 1; + continue; + } + f.write_buf( *sp.sbuf, pos+offset, len); + pos += offset+len; + } else { + /* nothing was found; skip past the first \0 and repeat. */ + const char *eos = static_cast(memchr( tbase+pos, '\000', sp.sbuf->bufsize-pos)); + if (eos){ + pos = (eos - tbase) + 1; // skip 1 past the \0 + } else { + break; + } + } + } + delete tbuf; +} + extern "C" void scan_find(scanner_params &sp) { @@ -79,33 +117,16 @@ void scan_find(scanner_params &sp) /* The current regex library treats \0 as the end of a string. * So we make a copy of the current buffer to search that's one bigger, and the copy has a \0 at the end. * This is super-wasteful. Does Lightgrep have this problem? 
+ * + * We also want to not scan more than a full 'page' if we were scanning an image. Because a memory-mapped + * file will have an sbuf the size of the whole file, we split it up and scan scan_find_sbuf() */ - feature_recorder &f = sp.named_feature_recorder("find"); - - auto *tbuf = sbuf_t::sbuf_malloc(sp.sbuf->pos0, sp.sbuf->bufsize+1, sp.sbuf->bufsize+1); - memcpy(tbuf->malloc_buf(), sp.sbuf->get_buf(), sp.sbuf->bufsize); - const char *base = static_cast(tbuf->malloc_buf()); - tbuf->wbuf(sp.sbuf->bufsize, 0); // null terminate - - /* Now see if we can find a string */ - for (size_t pos = 0; pos < sp.sbuf->pagesize && pos < sp.sbuf->bufsize;) { - std::string found; - size_t offset=0; - size_t len = 0; - if ( find_list.search_all( base+pos, &found, &offset, &len)) { - if(len == 0) { - len+=1; - continue; - } - f.write_buf( *sp.sbuf, pos+offset, len); - pos += offset+len; - } else { - /* nothing was found; skip past the first \0 and repeat. */ - const char *eos = static_cast(memchr( base+pos, '\000', sp.sbuf->bufsize-pos)); - if (eos) pos=(eos-base)+1; // skip 1 past the \0 - else pos=sp.sbuf->bufsize; // skip to the end of the buffer - } + + Phase1::Config local_cfg; + + for(size_t pos = 0; pos < sp.sbuf->pagesize && pos < sp.sbuf->bufsize; pos+=local_cfg.opt_pagesize){ + sbuf_t sbuf(*sp.sbuf, pos, pos+local_cfg.opt_pagesize + local_cfg.opt_marginsize); + scan_find_sbuf(sp, sbuf); } - delete tbuf; } } diff --git a/src/test_be.h b/src/test_be.h index 8980b8e9..2d3827f0 100644 --- a/src/test_be.h +++ b/src/test_be.h @@ -12,7 +12,11 @@ extern bool debug; // return file the test directory mapped to an sbuf sbuf_t *map_file(std::filesystem::path p); + +// look for specific output in a file, and throw an exception if it cannot be found +void grep(const std::string str, std::filesystem::path fname ); void grep(const Feature &exp, std::filesystem::path fname ); + std::filesystem::path test_scanners(const std::vector & scanners, sbuf_t *sbuf); std::filesystem::path 
test_scanner(scanner_t scanner, sbuf_t *sbuf); bool requireFeature(const std::vector &lines, const std::string feature); diff --git a/src/test_be1.cpp b/src/test_be1.cpp index d9072081..e84c7e8b 100644 --- a/src/test_be1.cpp +++ b/src/test_be1.cpp @@ -3,6 +3,15 @@ #define CATCH_CONFIG_CONSOLE_WIDTH 120 #define DO_NOT_USE_WMAIN +/**************************************************************** + * test_be1.cpp: + * - Support functions for C++ self test + * - Test cases for individual scanners (in alphabetical order) + ****************************************************************/ + +/* Copyright (C) 2021-2023 Simson L. Garfinkel */ + + #include "config.h" #include @@ -67,6 +76,38 @@ const char *notify() } } +/**************************************************************** + * sbuf tests + ****************************************************************/ +sbuf_t *make_sbuf() +{ + auto sbuf = new sbuf_t("Hello World!"); + return sbuf; +} + +/* Test that sbuf data are not copied when moved to a child.*/ +std::atomic counter{0}; +const uint8_t *sbuf_buf_loc = nullptr; +void test_process_sbuf(sbuf_t *sbuf) +{ + if (sbuf_buf_loc != nullptr) { + REQUIRE( sbuf_buf_loc == sbuf->get_buf() ); + } + delete sbuf; +} + +TEST_CASE("sbuf_no_copy", "[threads]") { + for(int i=0;i<100;i++){ + auto sbuf = make_sbuf(); + sbuf_buf_loc = sbuf->get_buf(); + test_process_sbuf(sbuf); + } +} + +/****************************************************************/ + + + /* We assume that the tests are being run out of bulk_extractor/src/. * This returns the directory of the test subdirectory. 
*/ diff --git a/src/test_be2.cpp b/src/test_be2.cpp index 1772b1bf..f7f9d069 100644 --- a/src/test_be2.cpp +++ b/src/test_be2.cpp @@ -2,6 +2,11 @@ #define CATCH_CONFIG_CONSOLE_WIDTH 120 #define DO_NOT_USE_WMAIN +/**************************************************************** + * test_be2.cpp: + * - Test cases that require the use of a scanner_set (with phases) + ****************************************************************/ + #include "config.h" #include @@ -124,6 +129,34 @@ bool feature_match(const Feature &feature, const std::string &line) } +/* Look for a line in a file and print an error if not found */ +void grep(const std::string str, std::filesystem::path fname ) +{ + for (int pass=0 ; pass<2 ; pass++){ + std::string line; + std::ifstream inFile; + inFile.open(fname); + if (!inFile.is_open()) { + std::cerr << "could not open: " << fname << std::endl; + throw std::runtime_error("Could not open: "+fname.string()); + } + while (std::getline(inFile, line)) { + switch (pass) { + case 0: + if (line.find(str) != std::string::npos){ + return; // found! + } + break; + case 1: + std::cerr << fname << ":" << line << std::endl; // print the entire file the second time through + break; + } + } + } + std::cerr << "**** did not find: " << str << std::endl; + REQUIRE(false); +} + /* Look for a line in a file and print an error if not found */ void grep(const Feature &feature, std::filesystem::path fname ) { @@ -264,6 +297,15 @@ bool validate_files(const std::filesystem::path &fn0, const std::filesystem::pat } +/** + * These test cases run the scanners in a scanner_set with a specified disk image, and then check for all of the results. 
+ */ + + +/**************************************************************** + * scan_aes + ****************************************************************/ + TEST_CASE("test_aes", "[phase1]") { /* Test rotation with various sign extension snaffu */ uint8_t in[4]; @@ -330,6 +372,10 @@ TEST_CASE("schedule_aes", "[phase1]") { validate_aes128_key(key3); } +/**************************************************************** + * scan_base64 and scan_json + ****************************************************************/ + TEST_CASE("test_base16json", "[phase1]") { std::vector ex2 { Check("json.txt", @@ -348,6 +394,11 @@ TEST_CASE("test_base16json", "[phase1]") { validate("test_base16json.txt", ex2); } +/**************************************************************** + * scan_accts.flex + ****************************************************************/ + + TEST_CASE("test_ccn", "[phase1]") { auto *sbufp = map_file( "ccns.txt" ); auto outdir = test_scanner( scan_accts, sbufp); // deletes sbufp @@ -356,6 +407,10 @@ TEST_CASE("test_ccn", "[phase1]") { REQUIRE( requireFeature(ccns_txt,"378282246310005")); } +/**************************************************************** + * scan_elf + ****************************************************************/ + TEST_CASE("test_elf", "[phase1]") { std::vector ex { Check("elf.txt", Feature( "0", "9e218cee3b190e8f59ef323b27f4d339481516e9", "*")) @@ -364,6 +419,10 @@ TEST_CASE("test_elf", "[phase1]") { } +/**************************************************************** + * scan_gzip + ****************************************************************/ + TEST_CASE("test_gzip", "[phase1]") { std::vector ex3 { Check("email.txt", Feature( "0-GZIP-0", "hello@world.com", "hello@world.com\\012")) @@ -371,6 +430,10 @@ TEST_CASE("test_gzip", "[phase1]") { validate("test_hello.gz", ex3); } +/**************************************************************** + * scan_json + ****************************************************************/ + 
TEST_CASE("test_json", "[phase1]") { std::vector ex1 { Check("json.txt", Feature( "0", JSON1, "ef2b5d7ee21e14eeebb5623784f73724218ee5dd")), @@ -378,6 +441,24 @@ TEST_CASE("test_json", "[phase1]") { validate("test_json.txt", ex1); } + +/**************************************************************** + * san_jpeg & scan_rar + ****************************************************************/ + +TEST_CASE("test_jpeg_rar", "[phase1]") { + std::vector ex2 { + Check("jpeg_carved.txt", + Feature( "13259-RAR-0", "jpeg_carved/000/13259-RAR-0.jpg")) + + }; + validate("jpegs.rar", ex2); +} + +/**************************************************************** + * scan_kml + ****************************************************************/ + TEST_CASE("KML_Samples.kml","[phase1]"){ std::vector ex4 { Check("kml_carved.txt", @@ -389,14 +470,9 @@ TEST_CASE("KML_Samples.kml","[phase1]"){ validate("KML_Samples.kml", ex4); } -TEST_CASE("test_jpeg_rar", "[phase1]") { - std::vector ex2 { - Check("jpeg_carved.txt", - Feature( "13259-RAR-0", "jpeg_carved/000/13259-RAR-0.jpg")) - - }; - validate("jpegs.rar", ex2); -} +/**************************************************************** + * scan_net + ****************************************************************/ TEST_CASE("test_net1", "[phase1]") { std::vector ex2 { @@ -499,6 +575,10 @@ TEST_CASE("test_net-domexusers", "[phase1]") { } +/**************************************************************** + * scan_winpe + ****************************************************************/ + TEST_CASE("test_winpe", "[phase1]") { std::vector ex2 { Check("winpe.txt", Feature( "0", @@ -507,128 +587,3 @@ TEST_CASE("test_winpe", "[phase1]") { }; validate("hello_win64_exe", ex2); } - -/**************************************************************** - * Test process_dir - */ -TEST_CASE("process_dir", "[process_dir]") { - - /* This should throw NoSuchFile because there is is an E01 file */ - REQUIRE_THROWS_AS(image_process::open( test_dir(), true, 65536, 
65536), image_process::FoundDiskImage); - - /* Get the right return code */ - std::filesystem::path inpath = test_dir(); - std::string inpath_string = inpath.string(); - std::filesystem::path outdir = NamedTemporaryDirectory(); - std::string outdir_string = outdir.string(); - std::stringstream ss; - const char *argv[] = {"bulk_extractor", notify(), "-Ro", outdir_string.c_str(), inpath_string.c_str(), nullptr}; - int ret = run_be(ss, ss, argv); - REQUIRE( ret==6 ); - - /* This should return the jpegs */ - image_process *p = nullptr; - try { - p = image_process::open( test_dir() / "jpegs", true, 65536, 65536); - } - catch (image_process::FoundDiskImage &e) { - std::cerr << "FoundDiskImage: " << e.what() << std::endl; - exit(1); - } - catch (image_process::IsADirectory &e) { - std::cerr << "IsAdirectory: " << e.what() << std::endl; - exit(1); - } - catch (image_process::NoSuchFile &e) { - std::cerr << "NoSuchFile: " << e.what() << std::endl; - std::cerr << "Current Directory: " << std::filesystem::current_path() << std::endl; - exit(1); - } - - //int count = 0; - for( image_process::iterator it = p->begin(); it != p->end(); ++it ){ - //count++; - pos0_t pos0 = it.get_pos0(); - REQUIRE( pos0.str().find(".jpg") != std::string::npos ); - } - delete p; -} - - -/**************************************************************** - * Test restarter - ** test sbufs (which is this here? 
- */ - -sbuf_t *make_sbuf() -{ - auto sbuf = new sbuf_t("Hello World!"); - return sbuf; -} - -/* Test that sbuf data are not copied when moved to a child.*/ -std::atomic counter{0}; -const uint8_t *sbuf_buf_loc = nullptr; -void test_process_sbuf(sbuf_t *sbuf) -{ - if (sbuf_buf_loc != nullptr) { - REQUIRE( sbuf_buf_loc == sbuf->get_buf() ); - } - delete sbuf; -} - -TEST_CASE("sbuf_no_copy", "[threads]") { - for(int i=0;i<100;i++){ - auto sbuf = make_sbuf(); - sbuf_buf_loc = sbuf->get_buf(); - test_process_sbuf(sbuf); - } -} - -/****************************************************************/ -TEST_CASE("image_process", "[phase1]") { - image_process *p = nullptr; - REQUIRE_THROWS_AS( p = image_process::open( "no-such-file", false, 65536, 65536), image_process::NoSuchFile); - REQUIRE_THROWS_AS( p = image_process::open( "no-such-file", false, 65536, 65536), image_process::NoSuchFile); - p = image_process::open( test_dir() / "test_json.txt", false, 65536, 65536); - REQUIRE( p != nullptr ); - int times = 0; - - for(auto it = p->begin(); it!=p->end(); ++it){ - REQUIRE( times==0 ); - sbuf_t *sbufp = it.sbuf_alloc(); - - REQUIRE( sbufp->bufsize == 79 ); - REQUIRE( sbufp->pagesize == 79 ); - delete sbufp; - times += 1; - } - REQUIRE(times==1); - delete p; -} - -/**************************************************************** - ** Test the path printer - **/ -TEST_CASE("path-printer1", "[path_printer]") { - scanner_config sc; - sc.input_fname = test_dir() / "test_hello.512b.gz"; - sc.enable_all_scanners(); - sc.allow_recurse = true; - - scanner_set ss(sc, feature_recorder_set::flags_disabled(), nullptr); - ss.add_scanners(scanners_builtin); - ss.apply_scanner_commands(); - - image_process *reader = image_process::open( sc.input_fname, false, 65536, 65536 ); - std::stringstream str; - class path_printer pp(ss, reader, str); - pp.process_path("512-GZIP-0/h"); // create a hex dump - - REQUIRE(str.str() == "0000: 6865 6c6c 6f40 776f 726c 642e 636f 6d0a hello@world.com.\n"); - 
str.str(""); - - pp.process_path("512-GZIP-2/r"); // create a hex dump with a different path and the /r - REQUIRE( str.str() == "14\r\nllo@world.com\n" ); - delete reader; -} diff --git a/src/test_be3.cpp b/src/test_be3.cpp index bc03ecf5..d8721ba6 100644 --- a/src/test_be3.cpp +++ b/src/test_be3.cpp @@ -1,4 +1,5 @@ /**************************************************************** + * test_be3.cpp: * end-to-end tests */ @@ -80,11 +81,62 @@ int run_be(std::ostream &cout, std::ostream &cerr, const char **argv) return ret; } +/* + * Run BE and capture the output + */ + int run_be(std::ostream &ss, const char **argv) { return run_be(ss, ss, argv); } +/**************************************************************** + * Test process_dir + */ +TEST_CASE("process_dir", "[process_dir]") { + + /* This should throw FoundDiskImage because there is an E01 file */ + REQUIRE_THROWS_AS(image_process::open( test_dir(), true, 65536, 65536), image_process::FoundDiskImage); + + /* Get the right return code */ + std::filesystem::path inpath = test_dir(); + std::string inpath_string = inpath.string(); + std::filesystem::path outdir = NamedTemporaryDirectory(); + std::string outdir_string = outdir.string(); + std::stringstream ss; + const char *argv[] = {"bulk_extractor", notify(), "-Ro", outdir_string.c_str(), inpath_string.c_str(), nullptr}; + int ret = run_be(ss, ss, argv); + REQUIRE( ret==6 ); + + /* This should return the jpegs */ + image_process *p = nullptr; + try { + p = image_process::open( test_dir() / "jpegs", true, 65536, 65536); + } + catch (image_process::FoundDiskImage &e) { + std::cerr << "FoundDiskImage: " << e.what() << std::endl; + exit(1); + } + catch (image_process::IsADirectory &e) { + std::cerr << "IsAdirectory: " << e.what() << std::endl; + exit(1); + } + catch (image_process::NoSuchFile &e) { + std::cerr << "NoSuchFile: " << e.what() << std::endl; + std::cerr << "Current Directory: " << std::filesystem::current_path() << std::endl; + exit(1); + } + + //int count
= 0; + for( image_process::iterator it = p->begin(); it != p->end(); ++it ){ + //count++; + pos0_t pos0 = it.get_pos0(); + REQUIRE( pos0.str().find(".jpg") != std::string::npos ); + } + delete p; +} + + TEST_CASE("e2e-no-args", "[end-to-end]") { const char *argv[] = {"bulk_extractor", nullptr}; std::stringstream ss; @@ -92,6 +144,7 @@ TEST_CASE("e2e-no-args", "[end-to-end]") { REQUIRE( ret==3 ); // produces 3 } +/* Test -h */ TEST_CASE("e2e-h", "[end-to-end]") { /* Try the -h option */ const char *argv[] = {"bulk_extractor", "-h", nullptr}; @@ -100,6 +153,7 @@ TEST_CASE("e2e-h", "[end-to-end]") { REQUIRE( ret==1 ); // -h now produces 1 } +/* Test -H */ TEST_CASE("e2e-H", "[end-to-end]") { /* Try the -H option */ const char *argv[] = {"bulk_extractor", "-H", nullptr}; @@ -108,9 +162,14 @@ TEST_CASE("e2e-H", "[end-to-end]") { REQUIRE( ret==2 ); // -H produces 2 } +/* Run on the first 100k of the emails dataset + * bulk_extractor -0q -o [outdir] nps-2010-emails.100k.raw + * Runs twice, so that we can also test the restarting logic + */ TEST_CASE("e2e-0", "[end-to-end]") { std::filesystem::path inpath = test_dir() / "nps-2010-emails.100k.raw"; std::filesystem::path outdir = NamedTemporaryDirectory(); + /* Try to run twice. There seems to be a problem with the second time through. 
*/ std::string inpath_string = inpath.string(); std::string outdir_string = outdir.string(); @@ -121,10 +180,23 @@ TEST_CASE("e2e-0", "[end-to-end]") { std::cerr << "STDOUT:" << std::endl << cout.str() << std::endl << std::endl << "STDERR:" << std::endl << cerr.str() << std::endl; REQUIRE( ret==0 ); } + + /* make sure that there are both debug:work_start and debug:work_stop tags in the output */ + auto xml_file = outdir_string + "/report.xml"; + grep( "debug:work_start", xml_file); + grep( "debug:work_stop", xml_file); + + /* Check that the DFXML report is well-formed XML (xmllint --noout does no schema validation) */ + std::string validate = std::string("xmllint --noout ") + xml_file; + int code = system( validate.c_str()); + REQUIRE( code==0 ); + + // This is the second time through - clear cout and cerr first // https://stackoverflow.com/questions/20731/how-do-you-clear-a-stringstream-variable std::stringstream().swap(cout); std::stringstream().swap(cerr); + // Re-run to make sure that works ret = run_be(cout, cerr, argv); if (ret!=0) { std::cerr << "STDOUT:" << std::endl << cout.str() << std::endl << std::endl @@ -132,12 +204,14 @@ TEST_CASE("e2e-0", "[end-to-end]") { REQUIRE( ret==0 ); } - /* Validate the output dfxml file */ - std::string validate = std::string("xmllint --noout ") + outdir_string + "/report.xml"; - int code = system( validate.c_str()); - REQUIRE( code==0 ); + /* make sure that both tags ended up in the second XML file (the one created from restarting) */ + grep( "debug:work_start", xml_file); + grep( "debug:work_stop", xml_file); } +/* + * -x all -e wordlist + */ TEST_CASE("select_scanners", "[end-to-end]") { std::filesystem::path inpath = test_dir() / "pdf_words2.pdf"; std::filesystem::path outdir = NamedTemporaryDirectory(); @@ -155,6 +229,8 @@ TEST_CASE("select_scanners", "[end-to-end]") { REQUIRE( endpos != startpos + 1); } +/* -f simsong + */ TEST_CASE("scan_find", "[end-to-end]") { std::filesystem::path inpath = test_dir() / "pdf_words2.pdf"; std::filesystem::path outdir =
NamedTemporaryDirectory(); @@ -173,6 +249,10 @@ TEST_CASE("scan_find", "[end-to-end]") { grep( Feature(pos0_t("70-PDF-366"), "simsong", ""), outdir / "find.txt" ); } +/* + * Test the 5gb flat file if it is present and if the DEBUG_5G environment variable is set. + */ + TEST_CASE("5gb-flatfile", "[end-to-end]") { /* Make a 5GB file and try to read it. Make sure we get back the known content. */ if (!getenv_debug("DEBUG_5G")){ @@ -366,3 +446,57 @@ TEST_CASE("restarter", "[restarter]") { REQUIRE( cfg.seen_page_ids.find("369098752") != cfg.seen_page_ids.end() ); REQUIRE( cfg.seen_page_ids.find("369098752+") == cfg.seen_page_ids.end() ); } + + +/**************************************************************** + * Test restarter + ** test sbufs (which is this here? + */ + +/****************************************************************/ +TEST_CASE("image_process", "[phase1]") { + image_process *p = nullptr; + REQUIRE_THROWS_AS( p = image_process::open( "no-such-file", false, 65536, 65536), image_process::NoSuchFile); + REQUIRE_THROWS_AS( p = image_process::open( "no-such-file", false, 65536, 65536), image_process::NoSuchFile); + p = image_process::open( test_dir() / "test_json.txt", false, 65536, 65536); + REQUIRE( p != nullptr ); + int times = 0; + + for(auto it = p->begin(); it!=p->end(); ++it){ + REQUIRE( times==0 ); + sbuf_t *sbufp = it.sbuf_alloc(); + + REQUIRE( sbufp->bufsize == 79 ); + REQUIRE( sbufp->pagesize == 79 ); + delete sbufp; + times += 1; + } + REQUIRE(times==1); + delete p; +} + +/**************************************************************** + ** Test the path printer + **/ +TEST_CASE("path-printer1", "[path_printer]") { + scanner_config sc; + sc.input_fname = test_dir() / "test_hello.512b.gz"; + sc.enable_all_scanners(); + sc.allow_recurse = true; + + scanner_set ss(sc, feature_recorder_set::flags_disabled(), nullptr); + ss.add_scanners(scanners_builtin); + ss.apply_scanner_commands(); + + image_process *reader = image_process::open( 
sc.input_fname, false, 65536, 65536 ); + std::stringstream str; + class path_printer pp(ss, reader, str); + pp.process_path("512-GZIP-0/h"); // create a hex dump + + REQUIRE(str.str() == "0000: 6865 6c6c 6f40 776f 726c 642e 636f 6d0a hello@world.com.\n"); + str.str(""); + + pp.process_path("512-GZIP-2/r"); // create a hex dump with a different path and the /r + REQUIRE( str.str() == "14\r\nllo@world.com\n" ); + delete reader; +} diff --git a/tests/patterns.txt b/tests/patterns.txt index d4a615f9..79e130cc 100644 --- a/tests/patterns.txt +++ b/tests/patterns.txt @@ -1,2 +1,3 @@ [a-z]*@f3c.com [a-z]*@redellipse.net +@gzipfile.com