From de5bbc393a59bbed2c7c6dc600669fe7365f484d Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Mon, 22 Nov 2021 10:36:07 -0800
Subject: [PATCH 01/10] getting a pip package together

moving stuff

tf2 various updates

version

linting

working on pip

working on pip..

version

still werking

few more paths

what about conda?
---
 MANIFEST.in                                   |   5 +
 build.sh                                      |   2 +
 diploSHIC.py                                  | 371 --------
 diploshic/__init__.py                         |   3 +
 diploshic/diploSHIC                           | 759 ++++++++++++++++
 {examples => diploshic/examples}/README.md    |   0
 {examples => diploshic/examples}/Snakefile    |   0
 {examples => diploshic/examples}/collate.sh   |   0
 .../examples}/efficient_simulation.sh         |   0
 {examples => diploshic/examples}/train.sh     |   0
 fvTools.py => diploshic/fvTools.py            | 859 +++++++++++++-----
 .../generateSimLaunchScript.py                |  71 +-
 .../makeFeatureVecsForChrArmFromVcfDiploid.py | 292 ++++++
 .../makeFeatureVecsForChrArmFromVcf_ogSHIC.py | 306 +++++++
 .../makeFeatureVecsForSingleMsDiploid.py      | 324 +++++++
 .../makeFeatureVecsForSingleMs_ogSHIC.py      | 180 ++--
 .../makeTrainingSets.py                       |  83 +-
 misc.py => diploshic/misc.py                  | 129 ++-
 msTools.py => diploshic/msTools.py            |  99 +-
 diploshic/setup.py                            |  14 +
 shicstats.pyf => diploshic/shicstats.pyf      |   0
 {testing => diploshic/testing}/hard.fvec      |   0
 .../testing}/linkedHard.fvec                  |   0
 .../testing}/linkedSoft.fvec                  |   0
 {testing => diploshic/testing}/neut.fvec      |   0
 {testing => diploshic/testing}/soft.fvec      |   0
 {training => diploshic/training}/hard.fvec    |   0
 .../training}/linkedHard.fvec                 |   0
 .../training}/linkedSoft.fvec                 |   0
 {training => diploshic/training}/neut.fvec    |   0
 {training => diploshic/training}/soft.fvec    |   0
 utils.c => diploshic/utils.c                  |   0
 makeFeatureVecsForChrArmFromVcfDiploid.py     | 163 ----
 makeFeatureVecsForChrArmFromVcf_ogSHIC.py     | 171 ----
 makeFeatureVecsForSingleMsDiploid.py          | 193 ----
 meta.yaml                                     |  27 +
 setup.py                                      |  49 +-
 37 files changed, 2773 insertions(+), 1327 deletions(-)
 create mode 100644 MANIFEST.in
 create mode 100644 build.sh
 delete mode 100644 diploSHIC.py
 create mode 100644 diploshic/__init__.py
 create mode 100644 diploshic/diploSHIC
 rename {examples => diploshic/examples}/README.md (100%)
 rename {examples => diploshic/examples}/Snakefile (100%)
 rename {examples => diploshic/examples}/collate.sh (100%)
 rename {examples => diploshic/examples}/efficient_simulation.sh (100%)
 rename {examples => diploshic/examples}/train.sh (100%)
 rename fvTools.py => diploshic/fvTools.py (60%)
 rename generateSimLaunchScript.py => diploshic/generateSimLaunchScript.py (53%)
 create mode 100644 diploshic/makeFeatureVecsForChrArmFromVcfDiploid.py
 create mode 100644 diploshic/makeFeatureVecsForChrArmFromVcf_ogSHIC.py
 create mode 100644 diploshic/makeFeatureVecsForSingleMsDiploid.py
 rename makeFeatureVecsForSingleMs_ogSHIC.py => diploshic/makeFeatureVecsForSingleMs_ogSHIC.py (61%)
 rename makeTrainingSets.py => diploshic/makeTrainingSets.py (51%)
 rename misc.py => diploshic/misc.py (83%)
 rename msTools.py => diploshic/msTools.py (69%)
 create mode 100644 diploshic/setup.py
 rename shicstats.pyf => diploshic/shicstats.pyf (100%)
 rename {testing => diploshic/testing}/hard.fvec (100%)
 rename {testing => diploshic/testing}/linkedHard.fvec (100%)
 rename {testing => diploshic/testing}/linkedSoft.fvec (100%)
 rename {testing => diploshic/testing}/neut.fvec (100%)
 rename {testing => diploshic/testing}/soft.fvec (100%)
 rename {training => diploshic/training}/hard.fvec (100%)
 rename {training => diploshic/training}/linkedHard.fvec (100%)
 rename {training => diploshic/training}/linkedSoft.fvec (100%)
 rename {training => diploshic/training}/neut.fvec (100%)
 rename {training => diploshic/training}/soft.fvec (100%)
 rename utils.c => diploshic/utils.c (100%)
 delete mode 100644 makeFeatureVecsForChrArmFromVcfDiploid.py
 delete mode 100644 makeFeatureVecsForChrArmFromVcf_ogSHIC.py
 delete mode 100644 makeFeatureVecsForSingleMsDiploid.py
 create mode 100644 meta.yaml

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..c295d7a
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,5 @@
+include diploshic/shicstats.pyf
+include diploshic/testEmpirical.fvec
+include diploshic/testing/*
+include diploshic/training/*
+include diploshic/utils.c
diff --git a/build.sh b/build.sh
new file mode 100644
index 0000000..1fcfd2f
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,2 @@
+python setup.py build
+python setup.py install --prefix=$PREFIX
\ No newline at end of file
diff --git a/diploSHIC.py b/diploSHIC.py
deleted file mode 100644
index 0e1e8bf..0000000
--- a/diploSHIC.py
+++ /dev/null
@@ -1,371 +0,0 @@
-import argparse,time,sys,subprocess
-
-pyExec = sys.executable
-if "/" in sys.argv[0]:
-    diploShicDir = "/".join(sys.argv[0].split("/")[:-1]) + "/"
-else:
-    diploShicDir = ""
-
-################# use argparse to get CL args and make sure we are kosher
-parser = argparse.ArgumentParser(description='calculate feature vectors, train, or predict with diploSHIC')
-parser._positionals.title = 'possible modes (enter \'python diploSHIC.py modeName -h\' for modeName\'s help message'
-subparsers = parser.add_subparsers(help='sub-command help')
-parser_c = subparsers.add_parser('fvecSim', help='Generate feature vectors from simulated data')
-parser_e = subparsers.add_parser('makeTrainingSets', help='Combine feature vectors from muliple fvecSim runs into 5 balanced training sets')
-parser_a = subparsers.add_parser('train', help='train and test a shic CNN')
-parser_d = subparsers.add_parser('fvecVcf', help='Generate feature vectors from data in a VCF file')
-parser_b = subparsers.add_parser('predict', help='perform prediction using an already-trained SHIC CNN')
-#parser_a.add_argument('nDims', metavar='nDims', type=int, 
-#                   help='dimensionality of the feature vector')
-parser_a.add_argument('trainDir', help='path to training set files')
-parser_a.add_argument('testDir', help='path to test set files, can be same as trainDir')
-parser_a.add_argument('outputModel', help='file name for output model, will create two files one with structure one with weights')
-parser_a.add_argument('--epochs', type=int, help='max epochs for training CNN (default = 100)', default=100)
-parser_a.add_argument('--numSubWins', type=int, help='number of subwindows that our chromosome is divided into (default = 11)', default=11)
-parser_a.add_argument('--confusionFile', help='optional file to which confusion matrix plot will be written (default = None)', default=None)
-parser_a.set_defaults(mode='train')
-parser_a._positionals.title = 'required arguments'
-
-#parser_b.add_argument('nDims', metavar='nDims', type=int, 
-#                   help='dimensionality of the feature vector')
-parser_b.add_argument('modelStructure', help='path to CNN structure .json file')
-parser_b.add_argument('modelWeights', help='path to CNN weights .h5 file')
-parser_b.add_argument('predictFile', help='input file to predict')
-parser_b.add_argument('predictFileOutput', help='output file name')
-parser_b.add_argument('--numSubWins', type=int, help='number of subwindows that our chromosome is divided into (default = 11)', default=11)
-parser_b.add_argument('--simData', help='Are we using simulated input data wihout coordinates?', action="store_true")
-parser_b.set_defaults(mode='predict')
-parser_b._positionals.title = 'required arguments'
-
-parser_c.add_argument('shicMode', help='specifies whether to use original haploid SHIC (use \'haploid\') or diploSHIC (\'diploid\')',
-                          default='diploid')
-parser_c.add_argument('msOutFile', help='path to simulation output file (must be same format used by Hudson\'s ms)')
-parser_c.add_argument('fvecFileName', help='path to file where feature vectors will be written')
-parser_c.add_argument('--totalPhysLen', type=int, help='Length of simulated chromosome for converting infinite sites ms output to finite sites (default=1100000)',
-                          default=1100000)
-parser_c.add_argument('--numSubWins', type=int, help='The number of subwindows that our chromosome will be divided into (default=11)', default=11)
-parser_c.add_argument('--maskFileName', help=('Path to a fasta-formatted file that contains masking information (marked by \'N\'). '
-                          'If specified, simulations will be masked in a manner mirroring windows drawn from this file.'), default="None")
-parser_c.add_argument('--vcfForMaskFileName', help=('Path to a VCF file that contains genotype information. This will be used to mask genotypes '
-                          'in a manner that mirrors how the true data are masked.'), default=None)
-parser_c.add_argument('--popForMask', help='The label of the population for which we should draw genotype information from the VCF for masking purposes.',
-                          default=None)
-parser_c.add_argument('--sampleToPopFileName', help=('Path to tab delimited file with population assignments (used for genotype masking); format: '
-                          'SampleID\tpopID'), default="None")
-parser_c.add_argument('--unmaskedGenoFracCutoff', type=float, help='Fraction of unmasked genotypes required to retain a site (default=0.75)', default=0.75)
-parser_c.add_argument('--chrArmsForMasking', help=('A comma-separated list (no spaces) of chromosome arms from which we want to draw masking '
-                          'information (or \'all\' if we want to use all arms. Ignored if maskFileName is not specified.'), default="all")
-parser_c.add_argument('--unmaskedFracCutoff', type=float, help='Minimum fraction of unmasked sites, if masking simulated data (default=0.25)', default=0.25)
-parser_c.add_argument('--outStatsDir', help='Path to a directory where values of each statistic in each subwindow are recorded for each rep',
-                          default="None")
-parser_c.add_argument('--ancFileName', help=('Path to a fasta-formatted file that contains inferred ancestral states (\'N\' if unknown).'
-                          ' This is used for masking, as sites that cannot be polarized are masked, and we mimic this in the simulted data.'
-                          ' Ignored in diploid mode which currently does not use ancestral state information'),
-                          default="None")
-parser_c.add_argument('--pMisPol', type=float, help='The fraction of sites that will be intentionally polarized to better approximate real data (default=0.0)',
-                          default=0.0)
-parser_c.set_defaults(mode='fvecSim')
-parser_c._positionals.title = 'required arguments'
-
-parser_d.add_argument('shicMode', help='specifies whether to use original haploid SHIC (use \'haploid\') or diploSHIC (\'diploid\')')
-parser_d.add_argument('chrArmVcfFile', help='VCF format file containing data for our chromosome arm (other arms will be ignored)')
-parser_d.add_argument('chrArm', help='Exact name of the chromosome arm for which feature vectors will be calculated')
-parser_d.add_argument('chrLen', type=int, help='Length of the chromosome arm')
-parser_d.add_argument('fvecFileName', help='path to file where feature vectors will be written')
-parser_d.add_argument('--targetPop', help='Population ID of samples we wish to include', default="None")
-parser_d.add_argument('--sampleToPopFileName', help=('Path to tab delimited file with population assignments; format: '
-                          'SampleID\tpopID'), default="None")
-parser_d.add_argument('--winSize', type=int, help='Length of the large window (default=1100000)', default=1100000)
-parser_d.add_argument('--numSubWins', type=int, help='Number of sub-windows within each large window (default=11)', default=11)
-parser_d.add_argument('--maskFileName', help=('Path to a fasta-formatted file that contains masking information (marked by \'N\'); '
-                          'must have an entry with title matching chrArm'), default="None")
-parser_d.add_argument('--unmaskedFracCutoff', type=float, help='Fraction of unmasked sites required to retain a subwindow (default=0.25)', default=0.25)
-parser_d.add_argument('--unmaskedGenoFracCutoff', type=float, help='Fraction of unmasked genotypes required to retain a site (default=0.75)', default=0.75)
-parser_d.add_argument('--ancFileName', help=('Path to a fasta-formatted file that contains inferred ancestral states (\'N\' if unknown); '
-                          'must have an entry with title matching chrArm. Ignored for diploid mode which currently does not use ancestral '
-                          'state information.'), default="None")
-parser_d.add_argument('--statFileName', help='Path to a file where statistics will be written for each subwindow that is not filtered out',
-                         default="None")
-parser_d.add_argument('--segmentStart', help='Left boundary of region in which feature vectors are calculated (whole arm if omitted)',
-                         default="None")
-parser_d.add_argument('--segmentEnd', help='Right boundary of region in which feature vectors are calculated (whole arm if omitted)',
-                         default="None")
-parser_d.set_defaults(mode='fvecVcf')
-parser_d._positionals.title = 'required arguments'
-
-parser_e.add_argument('neutTrainingFileName', help='Path to our neutral feature vectors')
-parser_e.add_argument('softTrainingFilePrefix', help=('Prefix (including higher-level path) of files containing soft training examples'
-                          '; files must end with \'_$i.$ext\' where $i is the subwindow index of the sweep and $ext is any extension.'))
-parser_e.add_argument('hardTrainingFilePrefix', help=('Prefix (including higher-level path) of files containing hard training examples'
-                          '; files must end with \'_$i.$ext\' where $i is the subwindow index of the sweep and $ext is any extension.'))
-parser_e.add_argument('sweepTrainingWindows', type=int, help=('comma-separated list of windows to classify as sweeps (usually just \'5\''
-                          ' but without the quotes)'))
-parser_e.add_argument('linkedTrainingWindows', help=('list of windows to treat as linked to sweeps (usually \'0,1,2,3,4,6,7,8,9,10\' but'
-                          ' without the quotes)'))
-parser_e.add_argument('outDir', help='path to directory where the training sets will be written')
-parser_e.set_defaults(mode='makeTrainingSets')
-parser_e._positionals.title = 'required arguments'
-if len(sys.argv)==1:
-    parser.print_help()
-    sys.exit(1)
-args = parser.parse_args()
-argsDict = vars(args)
-
-if argsDict['mode'] in ['train', 'predict']:
-###########################################################
-# Import a bunch of libraries if everything checks out
-    import matplotlib
-    matplotlib.use('Agg')
-    import numpy as np
-    from keras.models import Sequential, Model
-    from keras import optimizers
-    from keras.layers import Dense, Dropout, Activation, Flatten, Input
-    from keras.layers import Conv2D, MaxPooling2D,concatenate
-    from keras.utils import np_utils
-    from sklearn.model_selection import train_test_split
-    from keras.utils.layer_utils import convert_all_kernels_in_model
-    from keras.preprocessing.image import ImageDataGenerator
-    from keras.callbacks import EarlyStopping,ModelCheckpoint
-    import keras.backend as K
-    import fnmatch
-
-    #nDims = argsDict['nDims']
-    numSubWins = argsDict['numSubWins']
-
-if argsDict['mode'] == 'train':
-    trainingDir=argsDict['trainDir']
-    testingDir=argsDict['testDir']
-    epochOption = argsDict['epochs']
-    outputModel = argsDict['outputModel']
-    confusionFile = argsDict['confusionFile']
-    #nCores = 12
-    print("loading data now...")
-    #training data
-    
-    hard = np.loadtxt(trainingDir+"hard.fvec",skiprows=1)
-    nDims = int(hard.shape[1] / numSubWins)
-    h1 = np.reshape(hard,(hard.shape[0],nDims,numSubWins))
-    neut = np.loadtxt(trainingDir+"neut.fvec",skiprows=1)
-    n1 = np.reshape(neut,(neut.shape[0],nDims,numSubWins))
-    soft = np.loadtxt(trainingDir+"soft.fvec",skiprows=1)
-    s1 = np.reshape(soft,(soft.shape[0],nDims,numSubWins))
-    lsoft = np.loadtxt(trainingDir+"linkedSoft.fvec",skiprows=1)
-    ls1 = np.reshape(lsoft,(lsoft.shape[0],nDims,numSubWins))
-    lhard = np.loadtxt(trainingDir+"linkedHard.fvec",skiprows=1)
-    lh1 = np.reshape(lhard,(lhard.shape[0],nDims,numSubWins))
-
-    both=np.concatenate((h1,n1,s1,ls1,lh1))
-    y=np.concatenate((np.repeat(0,len(h1)),np.repeat(1,len(n1)), np.repeat(2,len(s1)), np.repeat(3,len(ls1)), np.repeat(4,len(lh1))))
-
-    #reshape both to explicitly set depth image. need for theanno not sure with tensorflow
-    both = both.reshape(both.shape[0],nDims,numSubWins,1)
-    if (trainingDir==testingDir):
-        X_train, X_test, y_train, y_test = train_test_split(both, y, test_size=0.2)
-    else:
-        X_train = both
-        y_train = y
-        #testing data
-        hard = np.loadtxt(testingDir+"hard.fvec",skiprows=1)
-        h1 = np.reshape(hard,(hard.shape[0],nDims,numSubWins))
-        neut = np.loadtxt(testingDir+"neut.fvec",skiprows=1)
-        n1 = np.reshape(neut,(neut.shape[0],nDims,numSubWins))
-        soft = np.loadtxt(testingDir+"soft.fvec",skiprows=1)
-        s1 = np.reshape(soft,(soft.shape[0],nDims,numSubWins))
-        lsoft = np.loadtxt(testingDir+"linkedSoft.fvec",skiprows=1)
-        ls1 = np.reshape(lsoft,(lsoft.shape[0],nDims,numSubWins))
-        lhard = np.loadtxt(testingDir+"linkedHard.fvec",skiprows=1)
-        lh1 = np.reshape(lhard,(lhard.shape[0],nDims,numSubWins))
-
-        both2=np.concatenate((h1,n1,s1,ls1,lh1))
-        X_test = both2.reshape(both2.shape[0],nDims,numSubWins,1)
-        y_test=np.concatenate((np.repeat(0,len(h1)),np.repeat(1,len(n1)), np.repeat(2,len(s1)), np.repeat(3,len(ls1)), np.repeat(4,len(lh1))))
-
-
-    Y_train = np_utils.to_categorical(y_train, 5)
-    Y_test = np_utils.to_categorical(y_test, 5)
-    X_valid, X_test, Y_valid, Y_test = train_test_split(X_test, Y_test, test_size=0.5)
-
-    datagen = ImageDataGenerator(
-        featurewise_center=True,
-        featurewise_std_normalization=True,
-        horizontal_flip=True)
-
-    validation_gen = ImageDataGenerator(
-        featurewise_center=True,
-        featurewise_std_normalization=True,
-        horizontal_flip=False)
-    test_gen = ImageDataGenerator(
-        featurewise_center=True,
-        featurewise_std_normalization=True,
-        horizontal_flip=False)
-
-
-    #print(X_train.shape)
-    print("training set has %d examples" % X_train.shape[0])
-    print("validation set has %d examples" % X_valid.shape[0])
-    print("test set has %d examples" % X_test.shape[0])
-    
-    model_in = Input(X_train.shape[1:])
-    h = Conv2D(128, 3, activation='relu',padding="same", name='conv1_1')(model_in)
-    h = Conv2D(64, 3, activation='relu',padding="same", name='conv1_2')(h)
-    h = MaxPooling2D(pool_size=3, name='pool1',padding="same")(h)
-    h = Dropout(0.15, name='drop1')(h)
-    h = Flatten(name='flaten1')(h)
-
-    dh = Conv2D(128, 2, activation='relu',dilation_rate=[1,3],padding="same", name='dconv1_1')(model_in)
-    dh = Conv2D(64, 2, activation='relu',dilation_rate=[1,3],padding="same", name='dconv1_2')(dh)
-    dh = MaxPooling2D(pool_size=2, name='dpool1')(dh)
-    dh = Dropout(0.15, name='ddrop1')(dh)
-    dh = Flatten(name='dflaten1')(dh)
-
-    dh1 = Conv2D(128, 2, activation='relu',dilation_rate=[1,4],padding="same", name='dconv4_1')(model_in)
-    dh1 = Conv2D(64, 2, activation='relu',dilation_rate=[1,4],padding="same", name='dconv4_2')(dh1)
-    dh1 = MaxPooling2D(pool_size=2, name='d1pool1')(dh1)
-    dh1 = Dropout(0.15, name='d1drop1')(dh1)
-    dh1 = Flatten(name='d1flaten1')(dh1)
-
-    h =  concatenate([h,dh,dh1])
-    h = Dense(512,name="512dense",activation='relu')(h)
-    h = Dropout(0.2, name='drop7')(h)
-    h = Dense(128,name="last_dense",activation='relu')(h)
-    h = Dropout(0.1, name='drop8')(h)
-    output = Dense(5,name="out_dense",activation='softmax')(h)
-    model = Model(inputs=[model_in], outputs=[output])
-
-
-
-
-
-    model.compile(loss='categorical_crossentropy',
-                  optimizer='adam',
-                  metrics=['accuracy'])
-
-    # define early stopping callback
-    earlystop = EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=5, \
-                              verbose=1, mode='auto')
-    
-    model_json = model.to_json()
-    with open(outputModel+".json", "w") as json_file:
-        json_file.write(model_json)
-    modWeightsFilepath=outputModel+".weights.hdf5"
-    checkpoint = ModelCheckpoint(modWeightsFilepath, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True, mode='auto')
-
-    callbacks_list = [earlystop,checkpoint]
-    #callbacks_list = [earlystop] #turning off checkpointing-- just want accuracy assessment
-
-    datagen.fit(X_train)
-    validation_gen.fit(X_valid)
-    test_gen.fit(X_test)
-    start = time.time()
-    model.fit_generator(datagen.flow(X_train, Y_train, batch_size=32), \
-                        steps_per_epoch=len(X_train) / 32, epochs=epochOption,verbose=1, \
-                        callbacks=callbacks_list, \
-                        validation_data=validation_gen.flow(X_valid,Y_valid, batch_size=32), \
-                        validation_steps=len(X_test)/32)
-    #model.fit(X_train, Y_train, batch_size=32, epochs=100,validation_data=(X_test,Y_test),callbacks=callbacks_list, verbose=1)
-    score = model.evaluate_generator(test_gen.flow(X_test,Y_test, batch_size=32),len(Y_test)/32)
-    sys.stderr.write("total time spent fitting and evaluating: %f secs\n" %(time.time()-start))
-
-    print("evaluation on test set:")
-    print("diploSHIC loss: %f" % score[0])
-    print("diploSHIC accuracy: %f" % score[1])
-    if confusionFile:
-        from misc import plot_confusion_matrix
-        import matplotlib.pyplot as plt
-        plot_confusion_matrix(model, test_gen.standardize(X_test), Y_test, labels=[0,4,2,3,1], display_labels=['Hard', 'Hard-linked', 'Soft', 'Soft-linked', 'Neutral'], cmap=plt.cm.Blues, normalize='true')
-        plt.savefig(confusionFile, bbox_inches='tight')
-
-elif argsDict['mode'] == 'predict':
-    import pandas as pd
-    from keras.models import model_from_json
-    
-    #import data from predictFile
-    x_df=pd.read_table(argsDict['predictFile'])
-    if argsDict['simData']:
-        testX = x_df[list(x_df)[:]].to_numpy()
-    else:
-        testX = x_df[list(x_df)[4:]].to_numpy()
-    nDims = int(testX.shape[1]/numSubWins)
-    np.reshape(testX,(testX.shape[0],nDims,numSubWins))
-    #add channels
-    testX = testX.reshape(testX.shape[0],nDims,numSubWins,1)
-    #set up generator for normalization 
-    validation_gen = ImageDataGenerator(
-        featurewise_center=True,
-        featurewise_std_normalization=True,
-        horizontal_flip=False)
-    validation_gen.fit(testX)
-    
-    #import model
-    json_file = open(argsDict['modelStructure'], 'r')
-    loaded_model_json = json_file.read()
-    json_file.close()
-    model = model_from_json(loaded_model_json)
-    # load weights into new model
-    model.load_weights(argsDict['modelWeights'])
-    print("Loaded model from disk")
-    
-    #get predictions
-    preds = model.predict(validation_gen.standardize(testX))
-    predictions = np.argmax(preds,axis=1)
-    
-    #np.repeat(0,len(h1)),np.repeat(1,len(n1)), np.repeat(2,len(s1)), np.repeat(3,len(ls1)), np.repeat(4,len(lh1)
-    classDict = {0:'hard',1:'neutral',2:'soft',3:'linkedSoft',4:'linkedHard'}
-    
-    #output the predictions
-    outputFile = open(argsDict['predictFileOutput'],'w')
-    if argsDict['simData']:
-        outputFile.write('predClass\tprob(neutral)\tprob(likedSoft)\tprob(linkedHard)\tprob(soft)\tprob(hard)\n')
-    else:
-        outputFile.write('chrom\tclassifiedWinStart\tclassifiedWinEnd\tbigWinRange\tpredClass\tprob(neutral)\tprob(likedSoft)\tprob(linkedHard)\tprob(soft)\tprob(hard)\n')
-
-    for index, row in x_df.iterrows():
-        if argsDict['simData']:
-            outputFile.write('{}\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}\n'.format(classDict[predictions[index]],preds[index][1],preds[index][3],preds[index][4], \
-                preds[index][2],preds[index][0]))
-        else:
-            outputFile.write('{}\t{}\t{}\t{}\t{}\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}\n'.format( row['chrom'],row['classifiedWinStart'],row['classifiedWinEnd'],row['bigWinRange'], \
-                classDict[predictions[index]],preds[index][1],preds[index][3],preds[index][4],preds[index][2],preds[index][0]))
-    outputFile.close()
-    print("{} predictions complete".format(index+1))
-elif argsDict['mode'] == 'fvecSim':
-    if argsDict['shicMode'].lower() == 'diploid':
-        cmdArgs = [argsDict['msOutFile'], argsDict['totalPhysLen'], argsDict['numSubWins'], argsDict['maskFileName'],
-                   argsDict['vcfForMaskFileName'], argsDict['popForMask'], argsDict['sampleToPopFileName'], argsDict['unmaskedGenoFracCutoff'],
-                   argsDict['chrArmsForMasking'], argsDict['unmaskedFracCutoff'], argsDict['outStatsDir'], argsDict['fvecFileName']]
-        cmd = pyExec + " " + diploShicDir + "makeFeatureVecsForSingleMsDiploid.py " + " ".join([str(x) for x in cmdArgs])
-    elif argsDict['shicMode'].lower() == 'haploid':
-        cmdArgs = [argsDict['msOutFile'], argsDict['totalPhysLen'], argsDict['numSubWins'], argsDict['maskFileName'], argsDict['ancFileName'],
-               argsDict['chrArmsForMasking'], argsDict['unmaskedFracCutoff'], argsDict['pMisPol'], argsDict['outStatsDir'],
-               argsDict['fvecFileName']]
-        cmd = pyExec + " " + diploShicDir + "makeFeatureVecsForSingleMs_ogSHIC.py " + " ".join([str(x) for x in cmdArgs])
-    else:
-        sys.exit("'shicMode' must be set to either 'diploid' or 'haploid'")
-    print(cmd)
-    subprocess.call(cmd.split())
-elif argsDict['mode'] == 'fvecVcf':
-    if argsDict['shicMode'].lower() == 'diploid':
-        cmdArgs = [argsDict['chrArmVcfFile'], argsDict['chrArm'], argsDict['chrLen'], argsDict['targetPop'], argsDict['winSize'],
-                   argsDict['numSubWins'], argsDict['maskFileName'], argsDict['unmaskedFracCutoff'], argsDict['unmaskedGenoFracCutoff'],
-                   argsDict['sampleToPopFileName'], argsDict['statFileName'], argsDict['fvecFileName']]
-        cmd = pyExec + " " + diploShicDir + "makeFeatureVecsForChrArmFromVcfDiploid.py " + " ".join([str(x) for x in cmdArgs])
-    elif argsDict['shicMode'].lower() == 'haploid':
-        cmdArgs = [argsDict['chrArmVcfFile'], argsDict['chrArm'], argsDict['chrLen'], argsDict['targetPop'], argsDict['winSize'],
-               argsDict['numSubWins'], argsDict['maskFileName'], argsDict['unmaskedFracCutoff'], argsDict['sampleToPopFileName'],
-               argsDict['ancFileName'], argsDict['statFileName'], argsDict['fvecFileName']]
-        cmd = pyExec + " " + diploShicDir + "makeFeatureVecsForChrArmFromVcf_ogSHIC.py " + " ".join([str(x) for x in cmdArgs])
-    else:
-        sys.exit("'shicMode' must be set to either 'diploid' or 'haploid'")
-    additionalArgs = []
-    if argsDict['segmentStart'] != "None":
-        additionalArgs += [argsDict['segmentStart'], argsDict['segmentEnd']]
-        cmd += " " + " ".join(additionalArgs)
-    #cmd += " > " + argsDict['fvecFileName']
-    print(cmd)
-    subprocess.call(cmd.split())
-elif argsDict['mode'] == 'makeTrainingSets':
-    cmdArgs = [argsDict['neutTrainingFileName'], argsDict['softTrainingFilePrefix'], argsDict['hardTrainingFilePrefix'],
-               argsDict['sweepTrainingWindows'], argsDict['linkedTrainingWindows'], argsDict['outDir']]
-    cmd = pyExec + " " + diploShicDir + "makeTrainingSets.py " + " ".join([str(x) for x in cmdArgs])
-    print(cmd)
-    subprocess.call(cmd.split())
diff --git a/diploshic/__init__.py b/diploshic/__init__.py
new file mode 100644
index 0000000..4b4f13d
--- /dev/null
+++ b/diploshic/__init__.py
@@ -0,0 +1,3 @@
+from diploshic.fvTools import *
+from diploshic.msTools import *
+from diploshic.shicstats import *
diff --git a/diploshic/diploSHIC b/diploshic/diploSHIC
new file mode 100644
index 0000000..f0134bc
--- /dev/null
+++ b/diploshic/diploSHIC
@@ -0,0 +1,759 @@
+#!/usr/bin/env python
+
+import argparse, time, sys, subprocess
+
+pyExec = sys.executable
+print(pyExec)
+if "/" in sys.argv[0]:
+    diploShicDir = "/".join(sys.argv[0].split("/")[:-1]) + "/"
+else:
+    diploShicDir = ""
+
+################# use argparse to get CL args and make sure we are kosher
+parser = argparse.ArgumentParser(
+    description="calculate feature vectors, train, or predict with diploSHIC"
+)
+parser._positionals.title = "possible modes (enter 'python diploSHIC.py modeName -h' for modeName's help message"
+subparsers = parser.add_subparsers(help="sub-command help")
+parser_c = subparsers.add_parser(
+    "fvecSim", help="Generate feature vectors from simulated data"
+)
+parser_e = subparsers.add_parser(
+    "makeTrainingSets",
+    help="Combine feature vectors from muliple fvecSim runs into 5 balanced training sets",
+)
+parser_a = subparsers.add_parser("train", help="train and test a shic CNN")
+parser_d = subparsers.add_parser(
+    "fvecVcf", help="Generate feature vectors from data in a VCF file"
+)
+parser_b = subparsers.add_parser(
+    "predict", help="perform prediction using an already-trained SHIC CNN"
+)
+# parser_a.add_argument('nDims', metavar='nDims', type=int,
+#                   help='dimensionality of the feature vector')
+parser_a.add_argument("trainDir", help="path to training set files")
+parser_a.add_argument(
+    "testDir", help="path to test set files, can be same as trainDir"
+)
+parser_a.add_argument(
+    "outputModel",
+    help="file name for output model, will create two files one with structure one with weights",
+)
+parser_a.add_argument(
+    "--epochs",
+    type=int,
+    help="max epochs for training CNN (default = 100)",
+    default=100,
+)
+parser_a.add_argument(
+    "--numSubWins",
+    type=int,
+    help="number of subwindows that our chromosome is divided into (default = 11)",
+    default=11,
+)
+parser_a.add_argument(
+    "--confusionFile",
+    help="optional file to which confusion matrix plot will be written (default = None)",
+    default=None,
+)
+parser_a.set_defaults(mode="train")
+parser_a._positionals.title = "required arguments"
+
+# parser_b.add_argument('nDims', metavar='nDims', type=int,
+#                   help='dimensionality of the feature vector')
+parser_b.add_argument(
+    "modelStructure", help="path to CNN structure .json file"
+)
+parser_b.add_argument("modelWeights", help="path to CNN weights .h5 file")
+parser_b.add_argument("predictFile", help="input file to predict")
+parser_b.add_argument("predictFileOutput", help="output file name")
+parser_b.add_argument(
+    "--numSubWins",
+    type=int,
+    help="number of subwindows that our chromosome is divided into (default = 11)",
+    default=11,
+)
+parser_b.add_argument(
+    "--simData",
+    help="Are we using simulated input data wihout coordinates?",
+    action="store_true",
+)
+parser_b.set_defaults(mode="predict")
+parser_b._positionals.title = "required arguments"
+
+parser_c.add_argument(
+    "shicMode",
+    help="specifies whether to use original haploid SHIC (use 'haploid') or diploSHIC ('diploid')",
+    default="diploid",
+)
+parser_c.add_argument(
+    "msOutFile",
+    help="path to simulation output file (must be same format used by Hudson's ms)",
+)
+parser_c.add_argument(
+    "fvecFileName", help="path to file where feature vectors will be written"
+)
+parser_c.add_argument(
+    "--totalPhysLen",
+    type=int,
+    help="Length of simulated chromosome for converting infinite sites ms output to finite sites (default=1100000)",
+    default=1100000,
+)
+parser_c.add_argument(
+    "--numSubWins",
+    type=int,
+    help="The number of subwindows that our chromosome will be divided into (default=11)",
+    default=11,
+)
+parser_c.add_argument(
+    "--maskFileName",
+    help=(
+        "Path to a fasta-formatted file that contains masking information (marked by 'N'). "
+        "If specified, simulations will be masked in a manner mirroring windows drawn from this file."
+    ),
+    default="None",
+)
+parser_c.add_argument(
+    "--vcfForMaskFileName",
+    help=(
+        "Path to a VCF file that contains genotype information. This will be used to mask genotypes "
+        "in a manner that mirrors how the true data are masked."
+    ),
+    default=None,
+)
+parser_c.add_argument(
+    "--popForMask",
+    help="The label of the population for which we should draw genotype information from the VCF for masking purposes.",
+    default=None,
+)
+parser_c.add_argument(
+    "--sampleToPopFileName",
+    help=(
+        "Path to tab delimited file with population assignments (used for genotype masking); format: "
+        "SampleID\tpopID"
+    ),
+    default="None",
+)
+parser_c.add_argument(
+    "--unmaskedGenoFracCutoff",
+    type=float,
+    help="Fraction of unmasked genotypes required to retain a site (default=0.75)",
+    default=0.75,
+)
+parser_c.add_argument(
+    "--chrArmsForMasking",
+    help=(
+        "A comma-separated list (no spaces) of chromosome arms from which we want to draw masking "
+        "information (or 'all' if we want to use all arms. Ignored if maskFileName is not specified."
+    ),
+    default="all",
+)
+parser_c.add_argument(
+    "--unmaskedFracCutoff",
+    type=float,
+    help="Minimum fraction of unmasked sites, if masking simulated data (default=0.25)",
+    default=0.25,
+)
+parser_c.add_argument(
+    "--outStatsDir",
+    help="Path to a directory where values of each statistic in each subwindow are recorded for each rep",
+    default="None",
+)
+parser_c.add_argument(
+    "--ancFileName",
+    help=(
+        "Path to a fasta-formatted file that contains inferred ancestral states ('N' if unknown)."
+        " This is used for masking, as sites that cannot be polarized are masked, and we mimic this in the simulted data."
+        " Ignored in diploid mode which currently does not use ancestral state information"
+    ),
+    default="None",
+)
+parser_c.add_argument(
+    "--pMisPol",
+    type=float,
+    help="The fraction of sites that will be intentionally polarized to better approximate real data (default=0.0)",
+    default=0.0,
+)
+parser_c.set_defaults(mode="fvecSim")
+parser_c._positionals.title = "required arguments"
+
+parser_d.add_argument(
+    "shicMode",
+    help="specifies whether to use original haploid SHIC (use 'haploid') or diploSHIC ('diploid')",
+)
+parser_d.add_argument(
+    "chrArmVcfFile",
+    help="VCF format file containing data for our chromosome arm (other arms will be ignored)",
+)
+parser_d.add_argument(
+    "chrArm",
+    help="Exact name of the chromosome arm for which feature vectors will be calculated",
+)
+parser_d.add_argument("chrLen", type=int, help="Length of the chromosome arm")
+parser_d.add_argument(
+    "fvecFileName", help="path to file where feature vectors will be written"
+)
+parser_d.add_argument(
+    "--targetPop",
+    help="Population ID of samples we wish to include",
+    default="None",
+)
+parser_d.add_argument(
+    "--sampleToPopFileName",
+    help=(
+        "Path to tab delimited file with population assignments; format: "
+        "SampleID\tpopID"
+    ),
+    default="None",
+)
+parser_d.add_argument(
+    "--winSize",
+    type=int,
+    help="Length of the large window (default=1100000)",
+    default=1100000,
+)
+parser_d.add_argument(
+    "--numSubWins",
+    type=int,
+    help="Number of sub-windows within each large window (default=11)",
+    default=11,
+)
+parser_d.add_argument(
+    "--maskFileName",
+    help=(
+        "Path to a fasta-formatted file that contains masking information (marked by 'N'); "
+        "must have an entry with title matching chrArm"
+    ),
+    default="None",
+)
+parser_d.add_argument(
+    "--unmaskedFracCutoff",
+    type=float,
+    help="Fraction of unmasked sites required to retain a subwindow (default=0.25)",
+    default=0.25,
+)
+parser_d.add_argument(
+    "--unmaskedGenoFracCutoff",
+    type=float,
+    help="Fraction of unmasked genotypes required to retain a site (default=0.75)",
+    default=0.75,
+)
+parser_d.add_argument(
+    "--ancFileName",
+    help=(
+        "Path to a fasta-formatted file that contains inferred ancestral states ('N' if unknown); "
+        "must have an entry with title matching chrArm. Ignored for diploid mode which currently does not use ancestral "
+        "state information."
+    ),
+    default="None",
+)
+parser_d.add_argument(
+    "--statFileName",
+    help="Path to a file where statistics will be written for each subwindow that is not filtered out",
+    default="None",
+)
+parser_d.add_argument(
+    "--segmentStart",
+    help="Left boundary of region in which feature vectors are calculated (whole arm if omitted)",
+    default="None",
+)
+parser_d.add_argument(
+    "--segmentEnd",
+    help="Right boundary of region in which feature vectors are calculated (whole arm if omitted)",
+    default="None",
+)
+parser_d.set_defaults(mode="fvecVcf")
+parser_d._positionals.title = "required arguments"
+
+parser_e.add_argument(
+    "neutTrainingFileName", help="Path to our neutral feature vectors"
+)
+parser_e.add_argument(
+    "softTrainingFilePrefix",
+    help=(
+        "Prefix (including higher-level path) of files containing soft training examples"
+        "; files must end with '_$i.$ext' where $i is the subwindow index of the sweep and $ext is any extension."
+    ),
+)
+parser_e.add_argument(
+    "hardTrainingFilePrefix",
+    help=(
+        "Prefix (including higher-level path) of files containing hard training examples"
+        "; files must end with '_$i.$ext' where $i is the subwindow index of the sweep and $ext is any extension."
+    ),
+)
+parser_e.add_argument(
+    "sweepTrainingWindows",
+    type=int,
+    help=(
+        "comma-separated list of windows to classify as sweeps (usually just '5'"
+        " but without the quotes)"
+    ),
+)
+parser_e.add_argument(
+    "linkedTrainingWindows",
+    help=(
+        "list of windows to treat as linked to sweeps (usually '0,1,2,3,4,6,7,8,9,10' but"
+        " without the quotes)"
+    ),
+)
+parser_e.add_argument(
+    "outDir", help="path to directory where the training sets will be written"
+)
+parser_e.set_defaults(mode="makeTrainingSets")
+parser_e._positionals.title = "required arguments"
+if len(sys.argv) == 1:
+    parser.print_help()
+    sys.exit(1)
+args = parser.parse_args()
+argsDict = vars(args)
+
+if argsDict["mode"] in ["train", "predict"]:
+    ###########################################################
+    # Import a bunch of libraries if everything checks out
+    import matplotlib
+
+    matplotlib.use("Agg")
+    import numpy as np
+    import tensorflow as tf
+    from keras.models import Sequential, Model
+    from keras import optimizers
+    from keras.layers import Dense, Dropout, Activation, Flatten, Input
+    from keras.layers import Conv2D, MaxPooling2D, concatenate
+    from sklearn.model_selection import train_test_split
+    from keras.preprocessing.image import ImageDataGenerator
+    from keras.callbacks import EarlyStopping, ModelCheckpoint
+    import keras.backend as K
+    import fnmatch
+
+    # nDims = argsDict['nDims']
+    numSubWins = argsDict["numSubWins"]
+
+if argsDict["mode"] == "train":
+    trainingDir = argsDict["trainDir"]
+    testingDir = argsDict["testDir"]
+    epochOption = argsDict["epochs"]
+    outputModel = argsDict["outputModel"]
+    confusionFile = argsDict["confusionFile"]
+    # nCores = 12
+    print("loading data now...")
+    # training data
+    hard = np.loadtxt(trainingDir + "hard.fvec", skiprows=1)
+    nDims = int(hard.shape[1] / numSubWins)
+    h1 = np.reshape(hard, (hard.shape[0], nDims, numSubWins))
+    neut = np.loadtxt(trainingDir + "neut.fvec", skiprows=1)
+    n1 = np.reshape(neut, (neut.shape[0], nDims, numSubWins))
+    soft = np.loadtxt(trainingDir + "soft.fvec", skiprows=1)
+    s1 = np.reshape(soft, (soft.shape[0], nDims, numSubWins))
+    lsoft = np.loadtxt(trainingDir + "linkedSoft.fvec", skiprows=1)
+    ls1 = np.reshape(lsoft, (lsoft.shape[0], nDims, numSubWins))
+    lhard = np.loadtxt(trainingDir + "linkedHard.fvec", skiprows=1)
+    lh1 = np.reshape(lhard, (lhard.shape[0], nDims, numSubWins))
+
+    both = np.concatenate((h1, n1, s1, ls1, lh1))
+    y = np.concatenate(
+        (
+            np.repeat(0, len(h1)),
+            np.repeat(1, len(n1)),
+            np.repeat(2, len(s1)),
+            np.repeat(3, len(ls1)),
+            np.repeat(4, len(lh1)),
+        )
+    )
+
+    # reshape both to explicitly set depth image. need for theanno not sure with tensorflow
+    both = both.reshape(both.shape[0], nDims, numSubWins, 1)
+    if trainingDir == testingDir:
+        X_train, X_test, y_train, y_test = train_test_split(
+            both, y, test_size=0.2
+        )
+    else:
+        X_train = both
+        y_train = y
+        # testing data
+        hard = np.loadtxt(testingDir + "hard.fvec", skiprows=1)
+        h1 = np.reshape(hard, (hard.shape[0], nDims, numSubWins))
+        neut = np.loadtxt(testingDir + "neut.fvec", skiprows=1)
+        n1 = np.reshape(neut, (neut.shape[0], nDims, numSubWins))
+        soft = np.loadtxt(testingDir + "soft.fvec", skiprows=1)
+        s1 = np.reshape(soft, (soft.shape[0], nDims, numSubWins))
+        lsoft = np.loadtxt(testingDir + "linkedSoft.fvec", skiprows=1)
+        ls1 = np.reshape(lsoft, (lsoft.shape[0], nDims, numSubWins))
+        lhard = np.loadtxt(testingDir + "linkedHard.fvec", skiprows=1)
+        lh1 = np.reshape(lhard, (lhard.shape[0], nDims, numSubWins))
+
+        both2 = np.concatenate((h1, n1, s1, ls1, lh1))
+        X_test = both2.reshape(both2.shape[0], nDims, numSubWins, 1)
+        y_test = np.concatenate(
+            (
+                np.repeat(0, len(h1)),
+                np.repeat(1, len(n1)),
+                np.repeat(2, len(s1)),
+                np.repeat(3, len(ls1)),
+                np.repeat(4, len(lh1)),
+            )
+        )
+
+    Y_train = tf.keras.utils.to_categorical(y_train, 5)
+    Y_test = tf.keras.utils.to_categorical(y_test, 5)
+    X_valid, X_test, Y_valid, Y_test = train_test_split(
+        X_test, Y_test, test_size=0.5
+    )
+
+    datagen = ImageDataGenerator(
+        featurewise_center=True,
+        featurewise_std_normalization=True,
+        horizontal_flip=True,
+    )
+
+    validation_gen = ImageDataGenerator(
+        featurewise_center=True,
+        featurewise_std_normalization=True,
+        horizontal_flip=False,
+    )
+    test_gen = ImageDataGenerator(
+        featurewise_center=True,
+        featurewise_std_normalization=True,
+        horizontal_flip=False,
+    )
+
+    # print(X_train.shape)
+    print("training set has %d examples" % X_train.shape[0])
+    print("validation set has %d examples" % X_valid.shape[0])
+    print("test set has %d examples" % X_test.shape[0])
+
+    model_in = Input(X_train.shape[1:])
+    h = Conv2D(128, 3, activation="relu", padding="same", name="conv1_1")(
+        model_in
+    )
+    h = Conv2D(64, 3, activation="relu", padding="same", name="conv1_2")(h)
+    h = MaxPooling2D(pool_size=3, name="pool1", padding="same")(h)
+    h = Dropout(0.15, name="drop1")(h)
+    h = Flatten(name="flaten1")(h)
+
+    dh = Conv2D(
+        128,
+        2,
+        activation="relu",
+        dilation_rate=[1, 3],
+        padding="same",
+        name="dconv1_1",
+    )(model_in)
+    dh = Conv2D(
+        64,
+        2,
+        activation="relu",
+        dilation_rate=[1, 3],
+        padding="same",
+        name="dconv1_2",
+    )(dh)
+    dh = MaxPooling2D(pool_size=2, name="dpool1")(dh)
+    dh = Dropout(0.15, name="ddrop1")(dh)
+    dh = Flatten(name="dflaten1")(dh)
+
+    dh1 = Conv2D(
+        128,
+        2,
+        activation="relu",
+        dilation_rate=[1, 4],
+        padding="same",
+        name="dconv4_1",
+    )(model_in)
+    dh1 = Conv2D(
+        64,
+        2,
+        activation="relu",
+        dilation_rate=[1, 4],
+        padding="same",
+        name="dconv4_2",
+    )(dh1)
+    dh1 = MaxPooling2D(pool_size=2, name="d1pool1")(dh1)
+    dh1 = Dropout(0.15, name="d1drop1")(dh1)
+    dh1 = Flatten(name="d1flaten1")(dh1)
+
+    h = concatenate([h, dh, dh1])
+    h = Dense(512, name="512dense", activation="relu")(h)
+    h = Dropout(0.2, name="drop7")(h)
+    h = Dense(128, name="last_dense", activation="relu")(h)
+    h = Dropout(0.1, name="drop8")(h)
+    output = Dense(5, name="out_dense", activation="softmax")(h)
+    model = Model(inputs=[model_in], outputs=[output])
+
+    model.compile(
+        loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
+    )
+
+    # define early stopping callback
+    earlystop = EarlyStopping(
+        monitor="val_accuracy",
+        min_delta=0.001,
+        patience=5,
+        verbose=1,
+        mode="auto",
+    )
+
+    model_json = model.to_json()
+    with open(outputModel + ".json", "w") as json_file:
+        json_file.write(model_json)
+    modWeightsFilepath = outputModel + ".weights.hdf5"
+    checkpoint = ModelCheckpoint(
+        modWeightsFilepath,
+        monitor="val_accuracy",
+        verbose=1,
+        save_best_only=True,
+        save_weights_only=True,
+        mode="auto",
+    )
+
+    callbacks_list = [earlystop, checkpoint]
+    # callbacks_list = [earlystop] #turning off checkpointing-- just want accuracy assessment
+
+    datagen.fit(X_train)
+    validation_gen.fit(X_valid)
+    test_gen.fit(X_test)
+    start = time.time()
+    model.fit(
+        datagen.flow(X_train, Y_train, batch_size=32),
+        steps_per_epoch=len(X_train) / 32,
+        epochs=epochOption,
+        verbose=1,
+        callbacks=callbacks_list,
+        validation_data=validation_gen.flow(X_valid, Y_valid, batch_size=32),
+        validation_steps=len(X_test) / 32,
+    )
+    # model.fit(X_train, Y_train, batch_size=32, epochs=100,validation_data=(X_test,Y_test),callbacks=callbacks_list, verbose=1)
+    score = model.evaluate(
+        test_gen.flow(X_test, Y_test, batch_size=32), steps=len(Y_test) / 32
+    )
+    sys.stderr.write(
+        "total time spent fitting and evaluating: %f secs\n"
+        % (time.time() - start)
+    )
+
+    print("evaluation on test set:")
+    print("diploSHIC loss: %f" % score[0])
+    print("diploSHIC accuracy: %f" % score[1])
+    if confusionFile:
+        from misc import plot_confusion_matrix
+        import matplotlib.pyplot as plt
+
+        plot_confusion_matrix(
+            model,
+            test_gen.standardize(X_test),
+            Y_test,
+            labels=[0, 4, 2, 3, 1],
+            display_labels=[
+                "Hard",
+                "Hard-linked",
+                "Soft",
+                "Soft-linked",
+                "Neutral",
+            ],
+            cmap=plt.cm.Blues,
+            normalize="true",
+        )
+        plt.savefig(confusionFile, bbox_inches="tight")
+
+elif argsDict["mode"] == "predict":
+    import pandas as pd
+    from keras.models import model_from_json
+
+    # import data from predictFile
+    x_df = pd.read_table(argsDict["predictFile"])
+    if argsDict["simData"]:
+        testX = x_df[list(x_df)[:]].to_numpy()
+    else:
+        testX = x_df[list(x_df)[4:]].to_numpy()
+    nDims = int(testX.shape[1] / numSubWins)
+    np.reshape(testX, (testX.shape[0], nDims, numSubWins))
+    # add channels
+    testX = testX.reshape(testX.shape[0], nDims, numSubWins, 1)
+    # set up generator for normalization
+    validation_gen = ImageDataGenerator(
+        featurewise_center=True,
+        featurewise_std_normalization=True,
+        horizontal_flip=False,
+    )
+    validation_gen.fit(testX)
+
+    # import model
+    json_file = open(argsDict["modelStructure"], "r")
+    loaded_model_json = json_file.read()
+    json_file.close()
+    model = model_from_json(loaded_model_json)
+    # load weights into new model
+    model.load_weights(argsDict["modelWeights"])
+    print("Loaded model from disk")
+
+    # get predictions
+    preds = model.predict(validation_gen.standardize(testX))
+    predictions = np.argmax(preds, axis=1)
+
+    # np.repeat(0,len(h1)),np.repeat(1,len(n1)), np.repeat(2,len(s1)), np.repeat(3,len(ls1)), np.repeat(4,len(lh1)
+    classDict = {
+        0: "hard",
+        1: "neutral",
+        2: "soft",
+        3: "linkedSoft",
+        4: "linkedHard",
+    }
+
+    # output the predictions
+    outputFile = open(argsDict["predictFileOutput"], "w")
+    if argsDict["simData"]:
+        outputFile.write(
+            "predClass\tprob(neutral)\tprob(likedSoft)\tprob(linkedHard)\tprob(soft)\tprob(hard)\n"
+        )
+    else:
+        outputFile.write(
+            "chrom\tclassifiedWinStart\tclassifiedWinEnd\tbigWinRange\tpredClass\tprob(neutral)\tprob(likedSoft)\tprob(linkedHard)\tprob(soft)\tprob(hard)\n"
+        )
+
+    for index, row in x_df.iterrows():
+        if argsDict["simData"]:
+            outputFile.write(
+                "{}\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}\n".format(
+                    classDict[predictions[index]],
+                    preds[index][1],
+                    preds[index][3],
+                    preds[index][4],
+                    preds[index][2],
+                    preds[index][0],
+                )
+            )
+        else:
+            outputFile.write(
+                "{}\t{}\t{}\t{}\t{}\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}\n".format(
+                    row["chrom"],
+                    row["classifiedWinStart"],
+                    row["classifiedWinEnd"],
+                    row["bigWinRange"],
+                    classDict[predictions[index]],
+                    preds[index][1],
+                    preds[index][3],
+                    preds[index][4],
+                    preds[index][2],
+                    preds[index][0],
+                )
+            )
+    outputFile.close()
+    print("{} predictions complete".format(index + 1))
+elif argsDict["mode"] == "fvecSim":
+    if argsDict["shicMode"].lower() == "diploid":
+        cmdArgs = [
+            argsDict["msOutFile"],
+            argsDict["totalPhysLen"],
+            argsDict["numSubWins"],
+            argsDict["maskFileName"],
+            argsDict["vcfForMaskFileName"],
+            argsDict["popForMask"],
+            argsDict["sampleToPopFileName"],
+            argsDict["unmaskedGenoFracCutoff"],
+            argsDict["chrArmsForMasking"],
+            argsDict["unmaskedFracCutoff"],
+            argsDict["outStatsDir"],
+            argsDict["fvecFileName"],
+        ]
+        cmd = (
+            pyExec
+            + " "
+            + diploShicDir
+            + "makeFeatureVecsForSingleMsDiploid.py "
+            + " ".join([str(x) for x in cmdArgs])
+        )
+    elif argsDict["shicMode"].lower() == "haploid":
+        cmdArgs = [
+            argsDict["msOutFile"],
+            argsDict["totalPhysLen"],
+            argsDict["numSubWins"],
+            argsDict["maskFileName"],
+            argsDict["ancFileName"],
+            argsDict["chrArmsForMasking"],
+            argsDict["unmaskedFracCutoff"],
+            argsDict["pMisPol"],
+            argsDict["outStatsDir"],
+            argsDict["fvecFileName"],
+        ]
+        cmd = (
+            pyExec
+            + " "
+            + diploShicDir
+            + "makeFeatureVecsForSingleMs_ogSHIC.py "
+            + " ".join([str(x) for x in cmdArgs])
+        )
+    else:
+        sys.exit("'shicMode' must be set to either 'diploid' or 'haploid'")
+    print(cmd)
+    subprocess.call(cmd.split())
+elif argsDict["mode"] == "fvecVcf":
+    if argsDict["shicMode"].lower() == "diploid":
+        cmdArgs = [
+            argsDict["chrArmVcfFile"],
+            argsDict["chrArm"],
+            argsDict["chrLen"],
+            argsDict["targetPop"],
+            argsDict["winSize"],
+            argsDict["numSubWins"],
+            argsDict["maskFileName"],
+            argsDict["unmaskedFracCutoff"],
+            argsDict["unmaskedGenoFracCutoff"],
+            argsDict["sampleToPopFileName"],
+            argsDict["statFileName"],
+            argsDict["fvecFileName"],
+        ]
+        cmd = (
+            pyExec
+            + " "
+            + diploShicDir
+            + "makeFeatureVecsForChrArmFromVcfDiploid.py "
+            + " ".join([str(x) for x in cmdArgs])
+        )
+    elif argsDict["shicMode"].lower() == "haploid":
+        cmdArgs = [
+            argsDict["chrArmVcfFile"],
+            argsDict["chrArm"],
+            argsDict["chrLen"],
+            argsDict["targetPop"],
+            argsDict["winSize"],
+            argsDict["numSubWins"],
+            argsDict["maskFileName"],
+            argsDict["unmaskedFracCutoff"],
+            argsDict["sampleToPopFileName"],
+            argsDict["ancFileName"],
+            argsDict["statFileName"],
+            argsDict["fvecFileName"],
+        ]
+        cmd = (
+            pyExec
+            + " "
+            + diploShicDir
+            + "makeFeatureVecsForChrArmFromVcf_ogSHIC.py "
+            + " ".join([str(x) for x in cmdArgs])
+        )
+    else:
+        sys.exit("'shicMode' must be set to either 'diploid' or 'haploid'")
+    additionalArgs = []
+    if argsDict["segmentStart"] != "None":
+        additionalArgs += [argsDict["segmentStart"], argsDict["segmentEnd"]]
+        cmd += " " + " ".join(additionalArgs)
+    # cmd += " > " + argsDict['fvecFileName']
+    print(cmd)
+    subprocess.call(cmd.split())
+elif argsDict["mode"] == "makeTrainingSets":
+    cmdArgs = [
+        argsDict["neutTrainingFileName"],
+        argsDict["softTrainingFilePrefix"],
+        argsDict["hardTrainingFilePrefix"],
+        argsDict["sweepTrainingWindows"],
+        argsDict["linkedTrainingWindows"],
+        argsDict["outDir"],
+    ]
+    cmd = (
+        pyExec
+        + " "
+        + diploShicDir
+        + "makeTrainingSets.py "
+        + " ".join([str(x) for x in cmdArgs])
+    )
+    print(cmd)
+    subprocess.call(cmd.split())
diff --git a/examples/README.md b/diploshic/examples/README.md
similarity index 100%
rename from examples/README.md
rename to diploshic/examples/README.md
diff --git a/examples/Snakefile b/diploshic/examples/Snakefile
similarity index 100%
rename from examples/Snakefile
rename to diploshic/examples/Snakefile
diff --git a/examples/collate.sh b/diploshic/examples/collate.sh
similarity index 100%
rename from examples/collate.sh
rename to diploshic/examples/collate.sh
diff --git a/examples/efficient_simulation.sh b/diploshic/examples/efficient_simulation.sh
similarity index 100%
rename from examples/efficient_simulation.sh
rename to diploshic/examples/efficient_simulation.sh
diff --git a/examples/train.sh b/diploshic/examples/train.sh
similarity index 100%
rename from examples/train.sh
rename to diploshic/examples/train.sh
diff --git a/fvTools.py b/diploshic/fvTools.py
similarity index 60%
rename from fvTools.py
rename to diploshic/fvTools.py
index 81f17c2..a2faa6f 100644
--- a/fvTools.py
+++ b/diploshic/fvTools.py
@@ -4,7 +4,7 @@
 from allel.model.ndarray import SortedIndex
 from allel.util import asarray_ndim
 from scipy.spatial.distance import squareform
-import shicstats
+import diploshic.shicstats as dps
 import numpy as np
 import random
 import gzip
@@ -12,7 +12,7 @@
 
 
 def misPolarizeAlleleCounts(ac, pMisPol):
-    pMisPolInv = 1-pMisPol
+    pMisPolInv = 1 - pMisPol
     mapping = []
     for i in range(len(ac)):
         if random.random() >= pMisPolInv:
@@ -33,7 +33,7 @@ def calledGenoFracAtSite(genosAtSite):
             missingCount += 1
         else:
             calledCount += 1
-    return calledCount/float(missingCount + calledCount)
+    return calledCount / float(missingCount + calledCount)
 
 
 def isHaploidVcfGenoArray(genos):
@@ -44,25 +44,28 @@ def diploidizeGenotypeArray(genos):
     numSnps, numSamples, numAlleles = genos.shape
     if numSamples % 2 != 0:
         sys.stderr.write(
-            "Diploidizing an odd-numbered sample. The last genome will be truncated.\n")
+            "Diploidizing an odd-numbered sample. The last genome will be truncated.\n"
+        )
         numSamples -= 1
     newGenos = []
     for i in range(numSnps):
         currSnp = []
         for j in range(0, numSamples, 2):
-            currSnp.append([genos[i, j, 0], genos[i, j+1, 0]])
+            currSnp.append([genos[i, j, 0], genos[i, j + 1, 0]])
         newGenos.append(currSnp)
     newGenos = np.array(newGenos)
     return allel.GenotypeArray(newGenos)
 
+
 # contains some bits modified from scikit-allel by Alistair Miles
 
 
-def readStatsDafsComputeStandardizationBins(statAndDafFileName,
-                                            nBins=50, pMisPol=0.0):
+def readStatsDafsComputeStandardizationBins(
+    statAndDafFileName, nBins=50, pMisPol=0.0
+):
     stats = {}
     dafs = []
-    pMisPolInv = 1-pMisPol
+    pMisPolInv = 1 - pMisPol
     misPolarizedSnps, totalSnps = 0, 0
     with open(statAndDafFileName) as statAndDafFile:
         first = True
@@ -77,7 +80,7 @@ def readStatsDafsComputeStandardizationBins(statAndDafFileName,
             else:
                 totalSnps += 1
                 if random.random() >= pMisPolInv:
-                    dafs.append(1-float(line[0]))
+                    dafs.append(1 - float(line[0]))
                     misPolarizedSnps += 1
                 else:
                     dafs.append(float(line[0]))
@@ -91,26 +94,32 @@ def readStatsDafsComputeStandardizationBins(statAndDafFileName,
         score_nonan = stats[statName][nonan]
         daf_nonan = np.array(dafs)[nonan]
         bins = allel.stats.selection.make_similar_sized_bins(daf_nonan, nBins)
-        mean_score, _, _ = scipy.stats.binned_statistic(daf_nonan, score_nonan,
-                                                        statistic=np.mean,
-                                                        bins=bins)
-        std_score, _, _ = scipy.stats.binned_statistic(daf_nonan, score_nonan,
-                                                       statistic=np.std,
-                                                       bins=bins)
+        mean_score, _, _ = scipy.stats.binned_statistic(
+            daf_nonan, score_nonan, statistic=np.mean, bins=bins
+        )
+        std_score, _, _ = scipy.stats.binned_statistic(
+            daf_nonan, score_nonan, statistic=np.std, bins=bins
+        )
         statInfo[statName] = (mean_score, std_score, bins)
-        sys.stderr.write("mispolarized %d of %d (%f%%) "\
-                      "SNPs when standardizing scores in %s\n" % (
-            misPolarizedSnps, totalSnps,
-            100*misPolarizedSnps/float(totalSnps),
-            statAndDafFileName))
+        sys.stderr.write(
+            "mispolarized %d of %d (%f%%) "
+            "SNPs when standardizing scores in %s\n"
+            % (
+                misPolarizedSnps,
+                totalSnps,
+                100 * misPolarizedSnps / float(totalSnps),
+                statAndDafFileName,
+            )
+        )
     return statInfo
 
+
 # includes a snippet copied from scikit-allel
 
 
-def standardize_by_allele_count_from_precomp_bins(score,
-                                                  dafs,
-                                                  standardizationInfo):
+def standardize_by_allele_count_from_precomp_bins(
+    score, dafs, standardizationInfo
+):
     score_standardized = np.empty_like(score)
     mean_score, std_score, bins = standardizationInfo
     dafs = np.array(dafs)
@@ -119,10 +128,10 @@ def standardize_by_allele_count_from_precomp_bins(score,
         x2 = bins[i + 1]
         if i == 0:
             # first bin
-            loc = (dafs < x2)
+            loc = dafs < x2
         elif i == len(bins) - 2:
             # last bin
-            loc = (dafs >= x1)
+            loc = dafs >= x1
         else:
             # middle bins
             loc = (dafs >= x1) & (dafs < x2)
@@ -137,7 +146,7 @@ def readFaArm(armFileName, armName=False):
         fopen = gzip.open
     else:
         fopen = open
-    with fopen(armFileName, 'rt') as armFile:
+    with fopen(armFileName, "rt") as armFile:
         reading = False
         seq = ""
         for line in armFile:
@@ -165,9 +174,9 @@ def polarizeSnps(unmasked, positions, refAlleles, altAlleles, ancArm):
 
     mapping = []
     for i in range(len(ancArm)):
-        if ancArm[i] in 'ACGT':
-            if i+1 in isSnp:
-                ref, alt = refAlleles[isSnp[i+1]], altAlleles[isSnp[i+1]]
+        if ancArm[i] in "ACGT":
+            if i + 1 in isSnp:
+                ref, alt = refAlleles[isSnp[i + 1]], altAlleles[isSnp[i + 1]]
                 if ancArm[i] == ref:
                     mapping.append([0, 1])  # no swap
                 elif ancArm[i] == alt:
@@ -177,12 +186,13 @@ def polarizeSnps(unmasked, positions, refAlleles, altAlleles, ancArm):
                     unmasked[i] = False
         elif ancArm[i] == "N":
             unmasked[i] = False
-            if i+1 in isSnp:
+            if i + 1 in isSnp:
                 mapping.append([0, 1])  # no swap -- failed to polarize
         else:
             sys.exit(
-                "Found a character in ancestral chromosome "\
-                 "that is not 'A', 'C', 'G', 'T' or 'N' (all upper case)!\n")
+                "Found a character in ancestral chromosome "
+                "that is not 'A', 'C', 'G', 'T' or 'N' (all upper case)!\n"
+            )
     assert len(mapping) == len(positions)
     return mapping, unmasked
 
@@ -192,11 +202,12 @@ def getAccessibilityInWins(isAccessibleArm, winLen, subWinLen, cutoff):
     badWinCount = 0
     lastWinEnd = len(isAccessibleArm) - len(isAccessibleArm) % winLen
     for i in range(0, lastWinEnd, winLen):
-        currWin = isAccessibleArm[i:i+winLen]
+        currWin = isAccessibleArm[i : i + winLen]
         goodWin = True
         for subWinStart in range(0, winLen, subWinLen):
-            unmaskedFrac = currWin[subWinStart:subWinStart +
-                                   subWinLen].count(True)/float(subWinLen)
+            unmaskedFrac = currWin[
+                subWinStart : subWinStart + subWinLen
+            ].count(True) / float(subWinLen)
             if unmaskedFrac < cutoff:
                 goodWin = False
         if goodWin:
@@ -206,8 +217,9 @@ def getAccessibilityInWins(isAccessibleArm, winLen, subWinLen, cutoff):
     return wins
 
 
-def windowVals(vals, subWinBounds,
-               positionArray, keepNans=False, absVal=False):
+def windowVals(
+    vals, subWinBounds, positionArray, keepNans=False, absVal=False
+):
     assert len(vals) == len(positionArray)
 
     subWinIndex = 0
@@ -238,7 +250,7 @@ def readFa(faFileName, upper=False):
         fopen = gzip.open
     else:
         fopen = open
-    with fopen(faFileName, 'rt') as faFile:
+    with fopen(faFileName, "rt") as faFile:
         reading = False
         for line in faFile:
             if line.startswith(">"):
@@ -262,14 +274,20 @@ def readFa(faFileName, upper=False):
     return seqData
 
 
-def readMaskAndAncDataForTraining(maskFileName, ancFileName,
-                                  totalPhysLen, subWinLen,
-                                  chrArmsForMasking,
-                                  shuffle=True, cutoff=0.25):
+def readMaskAndAncDataForTraining(
+    maskFileName,
+    ancFileName,
+    totalPhysLen,
+    subWinLen,
+    chrArmsForMasking,
+    shuffle=True,
+    cutoff=0.25,
+):
     isAccessible = []
     maskData, ancData = readFa(maskFileName, upper=True), readFa(
-        ancFileName, upper=True)
-    if 'all' in chrArmsForMasking:
+        ancFileName, upper=True
+    )
+    if "all" in chrArmsForMasking:
         chrArmsForMasking = sorted(maskData)
     for currChr in chrArmsForMasking:
         assert len(maskData[currChr]) == len(ancData[currChr])
@@ -280,7 +298,8 @@ def readMaskAndAncDataForTraining(maskFileName, ancFileName,
             else:
                 isAccessibleArm.append(True)
         windowedAccessibility = getAccessibilityInWins(
-            isAccessibleArm, totalPhysLen, subWinLen, cutoff)
+            isAccessibleArm, totalPhysLen, subWinLen, cutoff
+        )
         if windowedAccessibility:
             isAccessible += windowedAccessibility
     if shuffle:
@@ -293,50 +312,69 @@ def readMaskAndAncDataForTraining(maskFileName, ancFileName,
     return isAccessible
 
 
-def getGenoMaskInfoInWins(isAccessibleArm, genos,
-                          positions, positions2SnpIndices,
-                          winLen, subWinLen, cutoff, genoCutoff):
+def getGenoMaskInfoInWins(
+    isAccessibleArm,
+    genos,
+    positions,
+    positions2SnpIndices,
+    winLen,
+    subWinLen,
+    cutoff,
+    genoCutoff,
+):
     windowedAcc, windowedGenoMask = [], []
     badWinCount = 0
     lastWinEnd = len(isAccessibleArm) - len(isAccessibleArm) % winLen
     posIdx = 0
     snpIndicesInWins = []
-    sys.stderr.write("about to get geno masks from arm; "\
-            "len: %d, genos shape: %s, num snps: %d\n" % (
-        len(isAccessibleArm), genos.shape, len(positions)))
+    sys.stderr.write(
+        "about to get geno masks from arm; "
+        "len: %d, genos shape: %s, num snps: %d\n"
+        % (len(isAccessibleArm), genos.shape, len(positions))
+    )
     calledFracs = []
     for winOffset in range(0, lastWinEnd, winLen):
-        firstPos = winOffset+1
-        lastPos = winOffset+winLen
+        firstPos = winOffset + 1
+        lastPos = winOffset + winLen
         snpIndicesInWin = []
-        assert len(positions) == 0 or posIdx >= len(
-            positions) or positions[posIdx] >= firstPos
+        assert (
+            len(positions) == 0
+            or posIdx >= len(positions)
+            or positions[posIdx] >= firstPos
+        )
         while posIdx < len(positions) and positions[posIdx] <= lastPos:
-            if isAccessibleArm[positions[posIdx]-1]:
+            if isAccessibleArm[positions[posIdx] - 1]:
                 calledFrac = calledGenoFracAtSite(genos[posIdx])
                 calledFracs.append(calledFrac)
                 if calledFrac >= genoCutoff:
                     snpIndicesInWin.append(posIdx)
                 else:
-                    isAccessibleArm[positions[posIdx]-1] = False
+                    isAccessibleArm[positions[posIdx] - 1] = False
             posIdx += 1
         snpIndicesInWins.append(snpIndicesInWin)
     if len(calledFracs) > 0:
-        sys.stderr.write("min calledFrac: %g; max calledFrac: %g; "\
-                "mean: %g; median: %g\n" % (
-            min(calledFracs), max(calledFracs),
-            np.median(calledFracs), np.mean(calledFracs)))
+        sys.stderr.write(
+            "min calledFrac: %g; max calledFrac: %g; "
+            "mean: %g; median: %g\n"
+            % (
+                min(calledFracs),
+                max(calledFracs),
+                np.median(calledFracs),
+                np.mean(calledFracs),
+            )
+        )
     else:
         sys.stderr.write("no SNPs in chromosome!\n")
     winIndex = 0
     for winOffset in range(0, lastWinEnd, winLen):
-        currWin = isAccessibleArm[winOffset:winOffset+winLen]
+        currWin = isAccessibleArm[winOffset : winOffset + winLen]
         if len(snpIndicesInWins[winIndex]) > 0:
             currGenos = genos.subset(sel0=snpIndicesInWins[winIndex])
             goodWin = True
             for subWinStart in range(0, winLen, subWinLen):
-                unmaskedFrac = currWin[subWinStart:subWinStart +
-                                       subWinLen].count(True)/float(subWinLen)
+                unmaskedFrac = currWin[
+                    subWinStart : subWinStart + subWinLen
+                ].count(True) / float(subWinLen)
                 if unmaskedFrac < cutoff:
                     goodWin = False
             if goodWin:
@@ -346,9 +384,20 @@ def getGenoMaskInfoInWins(isAccessibleArm, genos,
                 badWinCount += 1
         winIndex += 1
     if windowedAcc:
-        sys.stderr.write("returning %d geno arrays, "\
-        "with an avg of %f snps\n" % (len(windowedGenoMask), sum(
-            [len(windowedGenoMask[i]) for i in range(len(windowedGenoMask))])/float(len(windowedGenoMask))))  # NOQA
+        sys.stderr.write(
+            "returning %d geno arrays, "
+            "with an avg of %f snps\n"
+            % (
+                len(windowedGenoMask),
+                sum(
+                    [
+                        len(windowedGenoMask[i])
+                        for i in range(len(windowedGenoMask))
+                    ]
+                )
+                / float(len(windowedGenoMask)),
+            )
+        )  # NOQA
     else:
         sys.stderr.write("returning 0 geno arrays\n")
     return windowedAcc, windowedGenoMask
@@ -363,52 +412,75 @@ def readSampleToPopFile(sampleToPopFileName):
     return table
 
 
-def extractGenosAndPositionsForArm(vcfFile, chroms,
-                                   currChr, sampleIndicesToKeep):
+def extractGenosAndPositionsForArm(
+    vcfFile, chroms, currChr, sampleIndicesToKeep
+):
     # sys.stderr.write("extracting vcf info for arm %s\n" %(currChr))
 
     rawgenos = np.take(
-        vcfFile["calldata/GT"], [i for i in range(len(chroms)) if chroms[i] == currChr], axis=0)  # NOQA
+        vcfFile["calldata/GT"],
+        [i for i in range(len(chroms)) if chroms[i] == currChr],
+        axis=0,
+    )  # NOQA
     if len(rawgenos) > 0:
         genos = allel.GenotypeArray(rawgenos).subset(sel1=sampleIndicesToKeep)
         if isHaploidVcfGenoArray(genos):
             sys.stderr.write(
-                "Detected haploid input for %s. "\
-                 "Converting into diploid individuals "\
-                 "(combining haplotypes in order).\n" % (currChr))
+                "Detected haploid input for %s. "
+                "Converting into diploid individuals "
+                "(combining haplotypes in order).\n" % (currChr)
+            )
             genos = diploidizeGenotypeArray(genos)
             sys.stderr.write("Done diploidizing %s\n" % (currChr))
         positions = np.extract(chroms == currChr, vcfFile["variants/POS"])
         if len(positions) > 0:
             genos = allel.GenotypeArray(
-                genos.subset(sel0=range(len(positions))))
+                genos.subset(sel0=range(len(positions)))
+            )
 
             positions2SnpIndices = {}
             for i in range(len(positions)):
                 positions2SnpIndices[positions[i]] = i
 
-            assert len(positions) == len(
-                positions2SnpIndices) and len(positions) == len(genos)
-            return genos, positions, positions2SnpIndices, genos.count_alleles().is_biallelic()  # NOQA
+            assert len(positions) == len(positions2SnpIndices) and len(
+                positions
+            ) == len(genos)
+            return (
+                genos,
+                positions,
+                positions2SnpIndices,
+                genos.count_alleles().is_biallelic(),
+            )  # NOQA
     return np.array([]), [], {}, np.array([])
 
 
-def readMaskDataForTraining(maskFileName, totalPhysLen,
-                            subWinLen, chrArmsForMasking,
-                            shuffle=True, cutoff=0.25,
-                            genoCutoff=0.75, vcfForMaskFileName=None,
-                            sampleToPopFileName=None, pop=None):
+def readMaskDataForTraining(
+    maskFileName,
+    totalPhysLen,
+    subWinLen,
+    chrArmsForMasking,
+    shuffle=True,
+    cutoff=0.25,
+    genoCutoff=0.75,
+    vcfForMaskFileName=None,
+    sampleToPopFileName=None,
+    pop=None,
+):
     if vcfForMaskFileName:
-        sys.stderr.write("reading geno mask info from %s\n" %
-                         (vcfForMaskFileName))
+        sys.stderr.write(
+            "reading geno mask info from %s\n" % (vcfForMaskFileName)
+        )
         vcfFile = allel.read_vcf(vcfForMaskFileName)
         sys.stderr.write("done with read\n")
         chroms = vcfFile["variants/CHROM"]
         samples = vcfFile["samples"]
         if sampleToPopFileName:
             sampleToPop = readSampleToPopFile(sampleToPopFileName)
-            sampleIndicesToKeep = [i for i in range(
-                len(samples)) if sampleToPop.get(samples[i], "popNotFound!") == pop]  # NOQA
+            sampleIndicesToKeep = [
+                i
+                for i in range(len(samples))
+                if sampleToPop.get(samples[i], "popNotFound!") == pop
+            ]  # NOQA
         else:
             sampleIndicesToKeep = [i for i in range(len(samples))]
     if maskFileName.endswith(".gz"):
@@ -421,48 +493,76 @@ def readMaskDataForTraining(maskFileName, totalPhysLen,
     readingMasks = False
     isAccessible, isAccessibleArm = [], []
     genoMaskInfo = []
-    with fopen(maskFileName, 'rt') as maskFile:
+    with fopen(maskFileName, "rt") as maskFile:
         for line in maskFile:
             if line.startswith(">"):
                 if readingMasks and len(isAccessibleArm) >= totalPhysLen:
                     if vcfForMaskFileName:
                         sys.stderr.write(
-                            "processing sites "\
-                             "and genos for %s\n" % (currChr))
-                        windowedAccessibility, windowedGenoMask = getGenoMaskInfoInWins(
-                            isAccessibleArm, genos, positions,
-                            positions2SnpIndices, totalPhysLen, subWinLen, cutoff, genoCutoff)
+                            "processing sites "
+                            "and genos for %s\n" % (currChr)
+                        )
+                        (
+                            windowedAccessibility,
+                            windowedGenoMask,
+                        ) = getGenoMaskInfoInWins(
+                            isAccessibleArm,
+                            genos,
+                            positions,
+                            positions2SnpIndices,
+                            totalPhysLen,
+                            subWinLen,
+                            cutoff,
+                            genoCutoff,
+                        )
                         if windowedAccessibility:
                             isAccessible += windowedAccessibility
                             genoMaskInfo += windowedGenoMask
                     else:
                         windowedAccessibility = getAccessibilityInWins(
-                            isAccessibleArm, totalPhysLen, subWinLen, cutoff)
+                            isAccessibleArm, totalPhysLen, subWinLen, cutoff
+                        )
                         if windowedAccessibility:
                             isAccessible += windowedAccessibility
 
                 currChr = line[1:].strip()
                 currPos = 0
                 # sys.stderr.write("chrom: " + currChr + "\n")
-                if 'all' in chrArmsForMasking or currChr in chrArmsForMasking:
+                if "all" in chrArmsForMasking or currChr in chrArmsForMasking:
                     readingMasks = True
                 else:
                     readingMasks = False
                 isAccessibleArm = []
                 if vcfForMaskFileName and readingMasks:
-                    sys.stderr.write("checking geno mask "\
-                            "info from %s for %s\n" % (
-                        vcfForMaskFileName, currChr))
-                    genos, positions, positions2SnpIndices, isBiallelic = extractGenosAndPositionsForArm(
-                        vcfFile, chroms, currChr, sampleIndicesToKeep)
+                    sys.stderr.write(
+                        "checking geno mask "
+                        "info from %s for %s\n" % (vcfForMaskFileName, currChr)
+                    )
+                    (
+                        genos,
+                        positions,
+                        positions2SnpIndices,
+                        isBiallelic,
+                    ) = extractGenosAndPositionsForArm(
+                        vcfFile, chroms, currChr, sampleIndicesToKeep
+                    )
             else:
                 if readingMasks:
                     for char in line.strip().upper():
-                        if char == 'N':
+                        if char == "N":
                             isAccessibleArm.append(False)
-                        elif vcfForMaskFileName and currPos in positions2SnpIndices:
+                        elif (
+                            vcfForMaskFileName
+                            and currPos in positions2SnpIndices
+                        ):
                             genosChecked += 1
-                            if isBiallelic[positions2SnpIndices[currPos]] and calledGenoFracAtSite(genos[positions2SnpIndices[currPos]]) >= genoCutoff:  # NOQA
+                            if (
+                                isBiallelic[positions2SnpIndices[currPos]]
+                                and calledGenoFracAtSite(
+                                    genos[positions2SnpIndices[currPos]]
+                                )
+                                >= genoCutoff
+                            ):  # NOQA
                                 isAccessibleArm.append(True)
                             else:
                                 isAccessibleArm.append(False)
@@ -473,15 +573,22 @@ def readMaskDataForTraining(maskFileName, totalPhysLen,
         if vcfForMaskFileName:
             sys.stderr.write("processing sites and genos for %s\n" % (currChr))
             windowedAccessibility, windowedGenoMask = getGenoMaskInfoInWins(
-                isAccessibleArm, genos, positions,
-                positions2SnpIndices, totalPhysLen,
-                subWinLen, cutoff, genoCutoff)
+                isAccessibleArm,
+                genos,
+                positions,
+                positions2SnpIndices,
+                totalPhysLen,
+                subWinLen,
+                cutoff,
+                genoCutoff,
+            )
             if windowedAccessibility:
                 isAccessible += windowedAccessibility
                 genoMaskInfo += windowedGenoMask
         else:
             windowedAccessibility = getAccessibilityInWins(
-                isAccessibleArm, totalPhysLen, subWinLen, cutoff)
+                isAccessibleArm, totalPhysLen, subWinLen, cutoff
+            )
             if windowedAccessibility:
                 isAccessible += windowedAccessibility
     if shuffle:
@@ -494,9 +601,11 @@ def readMaskDataForTraining(maskFileName, totalPhysLen,
             random.shuffle(isAccessible)
 
     if len(isAccessible) == 0:
-        sys.exit("Error: Couldn't find a single window in our "\
-                "real data for masking that survived filters. May have to "\
-                "disable masking.\n")
+        sys.exit(
+            "Error: Couldn't find a single window in our "
+            "real data for masking that survived filters. May have to "
+            "disable masking.\n"
+        )
     for i in range(len(isAccessible)):
         assert len(isAccessible[i]) == totalPhysLen
     sys.stderr.write("checked genotypes at %d sites\n" % (genosChecked))
@@ -534,7 +643,7 @@ def readMaskDataForScan(maskFileName, chrArm):
         fopen = gzip.open
     else:
         fopen = open
-    with fopen(maskFileName, 'rt') as maskFile:
+    with fopen(maskFileName, "rt") as maskFile:
         for line in maskFile:
             if line.startswith(">"):
                 currChr = line[1:].strip()
@@ -545,7 +654,7 @@ def readMaskDataForScan(maskFileName, chrArm):
             else:
                 if readingMasks:
                     for char in line.strip().upper():
-                        if char == 'N':
+                        if char == "N":
                             isAccessible.append(False)
                         else:
                             isAccessible.append(True)
@@ -555,14 +664,14 @@ def readMaskDataForScan(maskFileName, chrArm):
 def normalizeFeatureVec(statVec):
     minVal = min(statVec)
     if minVal < 0:
-        statVec = [x-minVal for x in statVec]
+        statVec = [x - minVal for x in statVec]
     normStatVec = []
     statSum = float(sum(statVec))
     if statSum == 0 or any(np.isinf(statVec)) or any(np.isnan(statVec)):
-        normStatVec = [1.0/len(statVec)]*len(statVec)
+        normStatVec = [1.0 / len(statVec)] * len(statVec)
     else:
         for k in range(len(statVec)):
-            normStatVec.append(statVec[k]/statSum)
+            normStatVec.append(statVec[k] / statSum)
     return normStatVec
 
 
@@ -587,35 +696,77 @@ def maxFDA(pos, ac, start=None, stop=None, is_accessible=None):
     dafs = []
     for i in range(len(ac)):
         p1 = ac[i, 1]
-        n = p1+ac[i, 0]
-        dafs.append(p1/float(n))
+        n = p1 + ac[i, 0]
+        dafs.append(p1 / float(n))
     return max(dafs)
 
 
-def calcAndAppendStatVal(alleleCounts, snpLocs, statName,
-                         subWinStart, subWinEnd, statVals,
-                         instanceIndex, subWinIndex, hapsInSubWin,
-                         unmasked, precomputedStats):
+def calcAndAppendStatVal(
+    alleleCounts,
+    snpLocs,
+    statName,
+    subWinStart,
+    subWinEnd,
+    statVals,
+    instanceIndex,
+    subWinIndex,
+    hapsInSubWin,
+    unmasked,
+    precomputedStats,
+):
     if statName == "tajD":
-        statVals[statName][instanceIndex].append(allel.stats.diversity.tajima_d(  # NOQA
-            alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd))
+        statVals[statName][instanceIndex].append(
+            allel.stats.diversity.tajima_d(  # NOQA
+                alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd
+            )
+        )
     elif statName == "pi":
-        statVals[statName][instanceIndex].append(allel.stats.diversity.sequence_diversity(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
+        statVals[statName][instanceIndex].append(
+            allel.stats.diversity.sequence_diversity(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )
     elif statName == "thetaW":
-        statVals[statName][instanceIndex].append(allel.stats.diversity.watterson_theta(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
+        statVals[statName][instanceIndex].append(
+            allel.stats.diversity.watterson_theta(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )
     elif statName == "thetaH":
-        statVals[statName][instanceIndex].append(thetah(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
+        statVals[statName][instanceIndex].append(
+            thetah(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )
     elif statName == "fayWuH":
         statVals[statName][instanceIndex].append(
-            statVals["thetaH"][instanceIndex][subWinIndex]-statVals["pi"][instanceIndex][subWinIndex])
+            statVals["thetaH"][instanceIndex][subWinIndex]
+            - statVals["pi"][instanceIndex][subWinIndex]
+        )
     elif statName == "HapCount":
         statVals[statName][instanceIndex].append(len(hapsInSubWin.distinct()))
     elif statName == "maxFDA":
-        statVals[statName][instanceIndex].append(maxFDA(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
+        statVals[statName][instanceIndex].append(
+            maxFDA(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )
     elif statName == "H1":
         h1, h12, h123, h21 = allel.stats.selection.garud_h(hapsInSubWin)
         statVals["H1"][instanceIndex].append(h1)
@@ -626,43 +777,58 @@ def calcAndAppendStatVal(alleleCounts, snpLocs, statName,
         if "H2/H1" in statVals:
             statVals["H2/H1"][instanceIndex].append(h21)
     elif statName == "ZnS":
-        r2Matrix = shicstats.computeR2Matrix(hapsInSubWin)
-        statVals["ZnS"][instanceIndex].append(shicstats.ZnS(r2Matrix)[0])
-        statVals["Omega"][instanceIndex].append(shicstats.omega(r2Matrix)[0])
+        r2Matrix = dps.computeR2Matrix(hapsInSubWin)
+        statVals["ZnS"][instanceIndex].append(dps.ZnS(r2Matrix)[0])
+        statVals["Omega"][instanceIndex].append(dps.omega(r2Matrix)[0])
     elif statName == "RH":
         rMatrixFlat = allel.stats.ld.rogers_huff_r(
-            hapsInSubWin.to_genotypes(ploidy=2).to_n_alt())
+            hapsInSubWin.to_genotypes(ploidy=2).to_n_alt()
+        )
         rhAvg = rMatrixFlat.mean()
         statVals["RH"][instanceIndex].append(rhAvg)
         r2Matrix = squareform(rMatrixFlat ** 2)
-        statVals["Omega"][instanceIndex].append(shicstats.omega(r2Matrix)[0])
+        statVals["Omega"][instanceIndex].append(dps.omega(r2Matrix)[0])
     elif statName == "iHSMean":
-        vals = [x for x in precomputedStats["iHS"][subWinIndex]
-                if not (math.isnan(x) or math.isinf(x))]
+        vals = [
+            x
+            for x in precomputedStats["iHS"][subWinIndex]
+            if not (math.isnan(x) or math.isinf(x))
+        ]
         if len(vals) == 0:
             statVals["iHSMean"][instanceIndex].append(0.0)
         else:
             statVals["iHSMean"][instanceIndex].append(
-                sum(vals)/float(len(vals)))
+                sum(vals) / float(len(vals))
+            )
     elif statName == "nSLMean":
-        vals = [x for x in precomputedStats["nSL"][subWinIndex]
-                if not (math.isnan(x) or math.isinf(x))]
+        vals = [
+            x
+            for x in precomputedStats["nSL"][subWinIndex]
+            if not (math.isnan(x) or math.isinf(x))
+        ]
         if len(vals) == 0:
             statVals["nSLMean"][instanceIndex].append(0.0)
         else:
             statVals["nSLMean"][instanceIndex].append(
-                sum(vals)/float(len(vals)))
+                sum(vals) / float(len(vals))
+            )
     elif statName == "iHSMax":
-        vals = [x for x in precomputedStats["iHS"][subWinIndex]
-                if not (math.isnan(x) or math.isinf(x))]
+        vals = [
+            x
+            for x in precomputedStats["iHS"][subWinIndex]
+            if not (math.isnan(x) or math.isinf(x))
+        ]
         if len(vals) == 0:
             maxVal = 0.0
         else:
             maxVal = max(vals)
         statVals["iHSMax"][instanceIndex].append(maxVal)
     elif statName == "nSLMax":
-        vals = [x for x in precomputedStats["nSL"][subWinIndex]
-                if not (math.isnan(x) or math.isinf(x))]
+        vals = [
+            x
+            for x in precomputedStats["nSL"][subWinIndex]
+            if not (math.isnan(x) or math.isinf(x))
+        ]
         if len(vals) == 0:
             maxVal = 0.0
         else:
@@ -670,41 +836,89 @@ def calcAndAppendStatVal(alleleCounts, snpLocs, statName,
         statVals["nSLMax"][instanceIndex].append(maxVal)
     elif statName == "iHSOutFrac":
         statVals["iHSOutFrac"][instanceIndex].append(
-            getOutlierFrac(precomputedStats["iHS"][subWinIndex]))
+            getOutlierFrac(precomputedStats["iHS"][subWinIndex])
+        )
     elif statName == "nSLOutFrac":
         statVals["nSLOutFrac"][instanceIndex].append(
-            getOutlierFrac(precomputedStats["nSL"][subWinIndex]))
+            getOutlierFrac(precomputedStats["nSL"][subWinIndex])
+        )
     elif statName == "distVar":
-        dists = shicstats.pairwiseDiffs(
-            hapsInSubWin)/float(unmasked[subWinStart-1:subWinEnd].count(True))
+        dists = dps.pairwiseDiffs(hapsInSubWin) / float(
+            unmasked[subWinStart - 1 : subWinEnd].count(True)
+        )
         statVals["distVar"][instanceIndex].append(np.var(dists, ddof=1))
         statVals["distSkew"][instanceIndex].append(scipy.stats.skew(dists))
         statVals["distKurt"][instanceIndex].append(scipy.stats.kurtosis(dists))
-    elif statName in ["H12", "H123", "H2/H1", "Omega", "distVar", "distSkew", "distKurt"]:
-        assert len(statVals[statName][instanceIndex]) == subWinIndex+1
-
-
-def calcAndAppendStatValDiplo(alleleCounts, snpLocs, statName, subWinStart, subWinEnd, statVals, instanceIndex, subWinIndex, genosInSubWin, unmasked):
+    elif statName in [
+        "H12",
+        "H123",
+        "H2/H1",
+        "Omega",
+        "distVar",
+        "distSkew",
+        "distKurt",
+    ]:
+        assert len(statVals[statName][instanceIndex]) == subWinIndex + 1
+
+
+def calcAndAppendStatValDiplo(
+    alleleCounts,
+    snpLocs,
+    statName,
+    subWinStart,
+    subWinEnd,
+    statVals,
+    instanceIndex,
+    subWinIndex,
+    genosInSubWin,
+    unmasked,
+):
     genosNAlt = genosInSubWin.to_n_alt()
     if statName == "tajD":
-        statVals[statName][instanceIndex].append(allel.stats.diversity.tajima_d(
-            alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd))
+        statVals[statName][instanceIndex].append(
+            allel.stats.diversity.tajima_d(
+                alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd
+            )
+        )
     elif statName == "pi":
-        statVals[statName][instanceIndex].append(allel.stats.diversity.sequence_diversity(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
+        statVals[statName][instanceIndex].append(
+            allel.stats.diversity.sequence_diversity(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )
     elif statName == "thetaW":
-        statVals[statName][instanceIndex].append(allel.stats.diversity.watterson_theta(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
+        statVals[statName][instanceIndex].append(
+            allel.stats.diversity.watterson_theta(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )
     elif statName == "thetaH":
-        statVals[statName][instanceIndex].append(thetah(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
+        statVals[statName][instanceIndex].append(
+            thetah(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )
     elif statName == "fayWuH":
         statVals[statName][instanceIndex].append(
-            statVals["thetaH"][instanceIndex][subWinIndex]-statVals["pi"][instanceIndex][subWinIndex])
+            statVals["thetaH"][instanceIndex][subWinIndex]
+            - statVals["pi"][instanceIndex][subWinIndex]
+        )
     elif statName == "HapCount":
         statVals[statName][instanceIndex].append(len(hapsInSubWin.distinct()))
     elif statName == "nDiplos":
-        diplotypeCounts = shicstats.getHaplotypeFreqSpec(genosNAlt)
+        diplotypeCounts = dps.getHaplotypeFreqSpec(genosNAlt)
         nDiplos = diplotypeCounts[genosNAlt.shape[1]]
         statVals["nDiplos"][instanceIndex].append(nDiplos)
         diplotypeCounts = diplotypeCounts[:-1]
@@ -716,7 +930,7 @@ def calcAndAppendStatValDiplo(alleleCounts, snpLocs, statName, subWinStart, subW
         if "diplo_H12" in statVals:
             statVals["diplo_H12"][instanceIndex].append(dh12)
         if "diplo_H2/H1" in statVals:
-            statVals["diplo_H2/H1"][instanceIndex].append(dh2/dh1)
+            statVals["diplo_H2/H1"][instanceIndex].append(dh2 / dh1)
     elif statName == "diplo_ZnS":
         if genosNAlt.shape[0] == 1:
             statVals["diplo_ZnS"][instanceIndex].append(0.0)
@@ -726,43 +940,90 @@ def calcAndAppendStatValDiplo(alleleCounts, snpLocs, statName, subWinStart, subW
             r2Matrix2 = squareform(r2Matrix ** 2)
             statVals["diplo_ZnS"][instanceIndex].append(np.nanmean(r2Matrix2))
             statVals["diplo_Omega"][instanceIndex].append(
-                shicstats.omega(r2Matrix2)[0])
+                dps.omega(r2Matrix2)[0]
+            )
     elif statName == "distVar":
-        dists = shicstats.pairwiseDiffsDiplo(
-            genosNAlt)/float(unmasked[subWinStart-1:subWinEnd].count(True))
+        dists = dps.pairwiseDiffsDiplo(genosNAlt) / float(
+            unmasked[subWinStart - 1 : subWinEnd].count(True)
+        )
         statVals["distVar"][instanceIndex].append(np.var(dists, ddof=1))
         statVals["distSkew"][instanceIndex].append(scipy.stats.skew(dists))
         statVals["distKurt"][instanceIndex].append(scipy.stats.kurtosis(dists))
-    elif statName in ["diplo_H12", "diplo_H123", "diplo_H2/H1", "distVar", "distSkew", "distKurt", "diplo_Omega"]:
-        if not len(statVals[statName][instanceIndex]) == subWinIndex+1:
-            print(statName, instanceIndex, subWinIndex+1)
-            print(statVals["diplo_H1"][instanceIndex],
-                  statVals["diplo_H12"][instanceIndex])
+    elif statName in [
+        "diplo_H12",
+        "diplo_H123",
+        "diplo_H2/H1",
+        "distVar",
+        "distSkew",
+        "distKurt",
+        "diplo_Omega",
+    ]:
+        if not len(statVals[statName][instanceIndex]) == subWinIndex + 1:
+            print(statName, instanceIndex, subWinIndex + 1)
+            print(
+                statVals["diplo_H1"][instanceIndex],
+                statVals["diplo_H12"][instanceIndex],
+            )
             sys.exit()
 
 
-def calcAndAppendStatValForScanDiplo(alleleCounts, snpLocs, statName, subWinStart, subWinEnd, statVals, subWinIndex, genosInSubWin, unmasked):
+def calcAndAppendStatValForScanDiplo(
+    alleleCounts,
+    snpLocs,
+    statName,
+    subWinStart,
+    subWinEnd,
+    statVals,
+    subWinIndex,
+    genosInSubWin,
+    unmasked,
+):
     genosNAlt = genosInSubWin.to_n_alt()
     if statName == "tajD":
-        statVals[statName].append(allel.stats.diversity.tajima_d(
-            alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd))
+        statVals[statName].append(
+            allel.stats.diversity.tajima_d(
+                alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd
+            )
+        )
     elif statName == "pi":
-        statVals[statName].append(allel.stats.diversity.sequence_diversity(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
+        statVals[statName].append(
+            allel.stats.diversity.sequence_diversity(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )
     elif statName == "thetaW":
-        statVals[statName].append(allel.stats.diversity.watterson_theta(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
+        statVals[statName].append(
+            allel.stats.diversity.watterson_theta(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )
     elif statName == "thetaH":
-        statVals[statName].append(thetah(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
+        statVals[statName].append(
+            thetah(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )
     elif statName == "fayWuH":
         statVals[statName].append(
-            statVals["thetaH"][subWinIndex]-statVals["pi"][subWinIndex])
+            statVals["thetaH"][subWinIndex] - statVals["pi"][subWinIndex]
+        )
     elif statName == "HapCount":
         # AK: undefined variables
         statVals[statName].append(len(hapsInSubWin.distinct()))
     elif statName == "nDiplos":
-        diplotypeCounts = shicstats.getHaplotypeFreqSpec(genosNAlt)
+        diplotypeCounts = dps.getHaplotypeFreqSpec(genosNAlt)
         nDiplos = diplotypeCounts[genosNAlt.shape[1]]
         statVals["nDiplos"].append(nDiplos)
         diplotypeCounts = diplotypeCounts[:-1]
@@ -774,7 +1035,7 @@ def calcAndAppendStatValForScanDiplo(alleleCounts, snpLocs, statName, subWinStar
         if "diplo_H12" in statVals:
             statVals["diplo_H12"].append(dh12)
         if "diplo_H2/H1" in statVals:
-            statVals["diplo_H2/H1"].append(dh2/dh1)
+            statVals["diplo_H2/H1"].append(dh2 / dh1)
     elif statName == "diplo_ZnS":
         if genosNAlt.shape[0] == 1:
             statVals["diplo_ZnS"].append(0.0)
@@ -783,16 +1044,25 @@ def calcAndAppendStatValForScanDiplo(alleleCounts, snpLocs, statName, subWinStar
             r2Matrix = allel.stats.ld.rogers_huff_r(genosNAlt)
             statVals["diplo_ZnS"].append(np.nanmean(r2Matrix))
             r2Matrix2 = squareform(r2Matrix ** 2)
-            statVals["diplo_Omega"].append(shicstats.omega(r2Matrix2)[0])
+            statVals["diplo_Omega"].append(dps.omega(r2Matrix2)[0])
     elif statName == "distVar":
-        dists = shicstats.pairwiseDiffsDiplo(
-            genosNAlt)/float(unmasked[subWinStart-1:subWinEnd].count(True))
+        dists = dps.pairwiseDiffsDiplo(genosNAlt) / float(
+            unmasked[subWinStart - 1 : subWinEnd].count(True)
+        )
         statVals["distVar"].append(np.var(dists, ddof=1))
         statVals["distSkew"].append(scipy.stats.skew(dists))
         statVals["distKurt"].append(scipy.stats.kurtosis(dists))
-    elif statName in ["diplo_H12", "diplo_H123", "diplo_H2/H1", "distVar", "distSkew", "distKurt", "diplo_Omega"]:
-        if not len(statVals[statName]) == subWinIndex+1:
-            print(statName, subWinIndex+1)
+    elif statName in [
+        "diplo_H12",
+        "diplo_H123",
+        "diplo_H2/H1",
+        "distVar",
+        "distSkew",
+        "distKurt",
+        "diplo_Omega",
+    ]:
+        if not len(statVals[statName]) == subWinIndex + 1:
+            print(statName, subWinIndex + 1)
             print(statVals["diplo_H1"], statVals["diplo_H12"])
             sys.exit()
 
@@ -811,10 +1081,12 @@ def getOutlierFrac(vals, cutoff=2.0):
         if denom == 0:
             return 0.0
         else:
-            return num/float(denom)
+            return num / float(denom)
 
 
-def appendStatValsForMonomorphic(statName, statVals, instanceIndex, subWinIndex):
+def appendStatValsForMonomorphic(
+    statName, statVals, instanceIndex, subWinIndex
+):
     if statName == "tajD":
         statVals[statName][instanceIndex].append(0.0)
     elif statName == "pi":
@@ -864,33 +1136,85 @@ def appendStatValsForMonomorphic(statName, statVals, instanceIndex, subWinIndex)
         statVals["iHSMax"][instanceIndex].append(0.0)
     elif statName == "nSLMax":
         statVals["nSLMax"][instanceIndex].append(0.0)
-    elif statName in ["H12", "H123", "H2/H1", "diplo_H12", "diplo_H123", "diplo_H2/H1", "Omega", "diplo_Omega"]:
-        #print(statName, statVals[statName][instanceIndex], subWinIndex+1)
-        assert len(statVals[statName][instanceIndex]) == subWinIndex+1
+    elif statName in [
+        "H12",
+        "H123",
+        "H2/H1",
+        "diplo_H12",
+        "diplo_H123",
+        "diplo_H2/H1",
+        "Omega",
+        "diplo_Omega",
+    ]:
+        # print(statName, statVals[statName][instanceIndex], subWinIndex+1)
+        assert len(statVals[statName][instanceIndex]) == subWinIndex + 1
     else:
         statVals[statName][instanceIndex].append(0.0)
 
 
-def calcAndAppendStatValForScan(alleleCounts, snpLocs, statName, subWinStart, subWinEnd, statVals, subWinIndex, hapsInSubWin, unmasked, precomputedStats):
+def calcAndAppendStatValForScan(
+    alleleCounts,
+    snpLocs,
+    statName,
+    subWinStart,
+    subWinEnd,
+    statVals,
+    subWinIndex,
+    hapsInSubWin,
+    unmasked,
+    precomputedStats,
+):
     if statName == "tajD":
-        statVals[statName].append(allel.stats.diversity.tajima_d(
-            alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd))
+        statVals[statName].append(
+            allel.stats.diversity.tajima_d(
+                alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd
+            )
+        )
     elif statName == "pi":
-        statVals[statName].append(allel.stats.diversity.sequence_diversity(  # NOQA
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))  # NOQA
+        statVals[statName].append(
+            allel.stats.diversity.sequence_diversity(  # NOQA
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )  # NOQA
     elif statName == "thetaW":
-        statVals[statName].append(allel.stats.diversity.watterson_theta(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))  # NOQA
+        statVals[statName].append(
+            allel.stats.diversity.watterson_theta(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )  # NOQA
     elif statName == "thetaH":
-        statVals[statName].append(thetah(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))  # NOQA
+        statVals[statName].append(
+            thetah(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )  # NOQA
     elif statName == "fayWuH":
         statVals[statName].append(
-            statVals["thetaH"][subWinIndex]-statVals["pi"][subWinIndex])
+            statVals["thetaH"][subWinIndex] - statVals["pi"][subWinIndex]
+        )
     elif statName == "maxFDA":
         # AK: undefined variables
-        statVals[statName].append(maxFDA(
-            snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked))
+        statVals[statName].append(
+            maxFDA(
+                snpLocs,
+                alleleCounts,
+                start=subWinStart,
+                stop=subWinEnd,
+                is_accessible=unmasked,
+            )
+        )
     elif statName == "HapCount":
         statVals[statName].append(len(hapsInSubWin.distinct()))
     elif statName == "H1":
@@ -903,61 +1227,84 @@ def calcAndAppendStatValForScan(alleleCounts, snpLocs, statName, subWinStart, su
         if "H2/H1" in statVals:
             statVals["H2/H1"].append(h21)
     elif statName == "ZnS":
-        r2Matrix = shicstats.computeR2Matrix(hapsInSubWin)
-        statVals["ZnS"].append(shicstats.ZnS(r2Matrix)[0])
-        statVals["Omega"].append(shicstats.omega(r2Matrix)[0])
+        r2Matrix = dps.computeR2Matrix(hapsInSubWin)
+        statVals["ZnS"].append(dps.ZnS(r2Matrix)[0])
+        statVals["Omega"].append(dps.omega(r2Matrix)[0])
     elif statName == "RH":
         rMatrixFlat = allel.stats.ld.rogers_huff_r(
-            hapsInSubWin.to_genotypes(ploidy=2).to_n_alt())
+            hapsInSubWin.to_genotypes(ploidy=2).to_n_alt()
+        )
         rhAvg = rMatrixFlat.mean()
         statVals["RH"].append(rhAvg)
         r2Matrix = squareform(rMatrixFlat ** 2)
-        statVals["Omega"].append(shicstats.omega(r2Matrix)[0])
+        statVals["Omega"].append(dps.omega(r2Matrix)[0])
     elif statName == "iHSMean":
-        vals = [x for x in precomputedStats["iHS"][subWinIndex]
-                if not (math.isnan(x) or math.isinf(x))]
+        vals = [
+            x
+            for x in precomputedStats["iHS"][subWinIndex]
+            if not (math.isnan(x) or math.isinf(x))
+        ]
         if len(vals) == 0:
             statVals["iHSMean"].append(0.0)
         else:
-            statVals["iHSMean"].append(sum(vals)/float(len(vals)))
+            statVals["iHSMean"].append(sum(vals) / float(len(vals)))
     elif statName == "nSLMean":
-        vals = [x for x in precomputedStats["nSL"][subWinIndex]
-                if not (math.isnan(x) or math.isnan(x))]
+        vals = [
+            x
+            for x in precomputedStats["nSL"][subWinIndex]
+            if not (math.isnan(x) or math.isnan(x))
+        ]
         if len(vals) == 0:
             statVals["nSLMean"].append(0.0)
         else:
-            statVals["nSLMean"].append(sum(vals)/float(len(vals)))
+            statVals["nSLMean"].append(sum(vals) / float(len(vals)))
     elif statName == "iHSMax":
-        vals = [x for x in precomputedStats["iHS"][subWinIndex]
-                if not (math.isnan(x) or math.isinf(x))]
+        vals = [
+            x
+            for x in precomputedStats["iHS"][subWinIndex]
+            if not (math.isnan(x) or math.isinf(x))
+        ]
         if len(vals) == 0:
             maxVal = 0.0
         else:
             maxVal = max(vals)
         statVals["iHSMax"].append(maxVal)
     elif statName == "nSLMax":
-        vals = [x for x in precomputedStats["nSL"][subWinIndex]
-                if not (math.isnan(x) or math.isnan(x))]
+        vals = [
+            x
+            for x in precomputedStats["nSL"][subWinIndex]
+            if not (math.isnan(x) or math.isnan(x))
+        ]
         if len(vals) == 0:
             maxVal = 0.0
         else:
             maxVal = max(vals)
         statVals["nSLMax"].append(maxVal)
     elif statName == "iHSOutFrac":
-        statVals["iHSOutFrac"].append(getOutlierFrac(
-            precomputedStats["iHS"][subWinIndex]))
+        statVals["iHSOutFrac"].append(
+            getOutlierFrac(precomputedStats["iHS"][subWinIndex])
+        )
     elif statName == "nSLOutFrac":
-        statVals["nSLOutFrac"].append(getOutlierFrac(
-            precomputedStats["nSL"][subWinIndex]))
+        statVals["nSLOutFrac"].append(
+            getOutlierFrac(precomputedStats["nSL"][subWinIndex])
+        )
     elif statName == "distVar":
-        dists = shicstats.pairwiseDiffs(
-            hapsInSubWin)/float(unmasked[subWinStart-1:subWinEnd].count(True))
+        dists = dps.pairwiseDiffs(hapsInSubWin) / float(
+            unmasked[subWinStart - 1 : subWinEnd].count(True)
+        )
         statVals["distVar"].append(np.var(dists, ddof=1))
         statVals["distSkew"].append(scipy.stats.skew(dists))
         statVals["distKurt"].append(scipy.stats.kurtosis(dists))
-    elif statName in ["H12", "H123", "H2/H1",
-                      "Omega", "distVar", "distSkew", "distKurt"]:
-        assert len(statVals[statName]) == subWinIndex+1
+    elif statName in [
+        "H12",
+        "H123",
+        "H2/H1",
+        "Omega",
+        "distVar",
+        "distSkew",
+        "distKurt",
+    ]:
+        assert len(statVals[statName]) == subWinIndex + 1
 
 
 def appendStatValsForMonomorphicForScan(statName, statVals, subWinIndex):
@@ -1010,19 +1357,27 @@ def appendStatValsForMonomorphicForScan(statName, statVals, subWinIndex):
         statVals["iHSMax"].append(0.0)
     elif statName == "nSLMax":
         statVals["nSLMax"].append(0.0)
-    elif statName in ["H12", "H123", "H2/H1", "diplo_H12",
-                      "diplo_H123", "diplo_H2/H1", "Omega", "diplo_Omega"]:
+    elif statName in [
+        "H12",
+        "H123",
+        "H2/H1",
+        "diplo_H12",
+        "diplo_H123",
+        "diplo_H2/H1",
+        "Omega",
+        "diplo_Omega",
+    ]:
         # print(statName, statVals[statName][instanceIndex], subWinIndex+1)
-        assert len(statVals[statName]) == subWinIndex+1
+        assert len(statVals[statName]) == subWinIndex + 1
     else:
         statVals[statName].append(0.0)
 
 
-'''
+"""
 WARNING: this code assumes that the second column of ac gives the derived alleles;
 please ensure that this is the case (and that you are using polarized data) if
 are going to use values of this statistic for the classifier!!
-''' # NOQA
+"""  # NOQA
 
 
 def thetah(pos, ac, start=None, stop=None, is_accessible=None):
@@ -1046,16 +1401,16 @@ def thetah(pos, ac, start=None, stop=None, is_accessible=None):
     h = 0
     for i in range(len(ac)):
         p1 = ac[i, 1]
-        n = p1+ac[i, 0]
+        n = p1 + ac[i, 0]
         if n > 1:
-            h += (p1*p1)/(n*(n-1.0))
+            h += (p1 * p1) / (n * (n - 1.0))
     h *= 2
 
     # calculate value per base
     if is_accessible is None:
         n_bases = stop - start + 1
     else:
-        n_bases = np.count_nonzero(is_accessible[start-1:stop])
+        n_bases = np.count_nonzero(is_accessible[start - 1 : stop])
 
     h = h / n_bases
     return h
@@ -1065,8 +1420,8 @@ def garudH1(hapCounts):
     h1 = 0.0
 
     for hapFreq in range(len(hapCounts), 0, -1):
-        pi = hapFreq/float(len(hapCounts))
-        h1 += hapCounts[hapFreq-1]*pi*pi
+        pi = hapFreq / float(len(hapCounts))
+        h1 += hapCounts[hapFreq - 1] * pi * pi
 
     return h1
 
@@ -1076,13 +1431,13 @@ def garudH2(hapCounts):
     first = True
 
     for hapFreq in range(len(hapCounts), 0, -1):
-        pi = hapFreq/float(len(hapCounts))
-        if hapCounts[hapFreq-1] > 0:
+        pi = hapFreq / float(len(hapCounts))
+        if hapCounts[hapFreq - 1] > 0:
             if first:
                 first = False
-                h2 += (hapCounts[hapFreq-1]-1)*pi*pi
+                h2 += (hapCounts[hapFreq - 1] - 1) * pi * pi
             else:
-                h2 += hapCounts[hapFreq-1]*pi*pi
+                h2 += hapCounts[hapFreq - 1] * pi * pi
 
     return h2
 
@@ -1092,14 +1447,14 @@ def garudH12(hapCounts):
     totalAdded = 0
 
     for hapFreq in range(len(hapCounts), 0, -1):
-        pi = hapFreq/float(len(hapCounts))
-        for i in range(hapCounts[hapFreq-1]):
+        pi = hapFreq / float(len(hapCounts))
+        for i in range(hapCounts[hapFreq - 1]):
             if totalAdded < 2:
                 part1 += pi
             else:
-                part2 += pi*pi
+                part2 += pi * pi
             totalAdded += 1
 
-    part1 = part1*part1
+    part1 = part1 * part1
 
-    return part1+part2
+    return part1 + part2
diff --git a/generateSimLaunchScript.py b/diploshic/generateSimLaunchScript.py
similarity index 53%
rename from generateSimLaunchScript.py
rename to diploshic/generateSimLaunchScript.py
index 2499973..f6ee035 100644
--- a/generateSimLaunchScript.py
+++ b/diploshic/generateSimLaunchScript.py
@@ -1,4 +1,5 @@
 import sys, os
+
 """
 python generateSimLaunchScript.py
 This script emits a bash launch script for running all of the training/testing simulations required by discoal
@@ -22,15 +23,19 @@
 # also, after performing simulation and calculating summary statistics, stats should be spot checked to make sure
 # that when a sweep occurs in the central subwindow, the effect has mostly dissipated in the outer windows
 # alpha/rho may have to be adjusted in order to achieve this
-trainingSampleNumber = 2000 #the number of simulation replicates we want to generate for each file in our training set
-testSampleNumber = 1000 #the number of simulations to create for each file in the test set
-sampleSize = 100 #the number of individuals in our population sample
-numSites = 55000 #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
-u = 3.5e-9 #per-site mutation rate (used to calculate mean theta)
-gensPerYear = 11.0 #number of generations per year
-maxSoftSweepInitFreq = 0.1 #maximum initial selected frequency for soft sweeps
-tauHigh = 0.05 #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
-rhoOverTheta = 5.0 #crossover rate over mut rate (used to calculate mean rho)
+trainingSampleNumber = 2000  # the number of simulation replicates we want to generate for each file in our training set
+testSampleNumber = (
+    1000  # the number of simulations to create for each file in the test set
+)
+sampleSize = 100  # the number of individuals in our population sample
+numSites = 55000  # total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
+u = 3.5e-9  # per-site mutation rate (used to calculate mean theta)
+gensPerYear = 11.0  # number of generations per year
+maxSoftSweepInitFreq = (
+    0.1  # maximum initial selected frequency for soft sweeps
+)
+tauHigh = 0.05  # maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
+rhoOverTheta = 5.0  # crossover rate over mut rate (used to calculate mean rho)
 
 sizeChanges = []
 with open(popSizeFileName) as popSizeFile:
@@ -43,42 +48,56 @@
             ne0 = ne
             prevSizeRatio = 1.0
         else:
-            t, sizeRatio = year*gensPerYear/(4*ne0), ne/ne0
+            t, sizeRatio = year * gensPerYear / (4 * ne0), ne / ne0
             if abs(sizeRatio - prevSizeRatio) > 1e-9:
                 sizeChanges.append((t, sizeRatio, ne))
                 prevSizeRatio = sizeRatio
 
 N0 = ne0
-thetaMean=4*N0*u*numSites
+thetaMean = 4 * N0 * u * numSites
 rhoMean = thetaMean * rhoOverTheta
-thetaLow = (2*thetaMean)/11.0
-thetaHigh = 10*thetaLow
+thetaLow = (2 * thetaMean) / 11.0
+thetaHigh = 10 * thetaLow
 rhoMax = 3 * rhoMean
 
-alphaHigh = 2*N0*0.005 # max selection coefficient s is 0.005
-alphaLow = 2*N0*0.0001 # mininum selection coefficient s at 0.0001
+alphaHigh = 2 * N0 * 0.005  # max selection coefficient s is 0.005
+alphaLow = 2 * N0 * 0.0001  # mininum selection coefficient s at 0.0001
 
-selStr = " -ws 0 -Pa %f %f -Pu 0 %f" %(alphaLow, alphaHigh, tauHigh)
-partialSelStr = " -ws 0 -Pa %f %f" %(alphaLow, alphaHigh)
-softStr = " -Pf 0 %f" %(maxSoftSweepInitFreq)
+selStr = " -ws 0 -Pa %f %f -Pu 0 %f" % (alphaLow, alphaHigh, tauHigh)
+partialSelStr = " -ws 0 -Pa %f %f" % (alphaLow, alphaHigh)
+softStr = " -Pf 0 %f" % (maxSoftSweepInitFreq)
 
 demogStr = ""
 for t, sizeRatio, ne in sizeChanges:
-    demogStr += " -en %f 0 %f" %(t, sizeRatio)
+    demogStr += " -en %f 0 %f" % (t, sizeRatio)
 
 sweepLocStr = " -x $x"
 
 print("#!/bin/bash")
-for sampleNumber, outDir, simTitle in [(trainingSampleNumber, trainingOutDir, "training data"), (testSampleNumber, testOutDir, "test data")]:
+for sampleNumber, outDir, simTitle in [
+    (trainingSampleNumber, trainingOutDir, "training data"),
+    (testSampleNumber, testOutDir, "test data"),
+]:
     partialStr = "-Pc 0.2 0.99"
-    print("\n#generating %s\n" %(simTitle))
-    neutDiscoalCmd = "discoal %d %d %d -Pt %f %f -Pre %f %f%s" %(sampleSize, sampleNumber, numSites, thetaLow, thetaHigh, rhoMean, rhoMax, demogStr)
-    print("%s > %s/Neut.msOut" %(neutDiscoalCmd, outDir))
+    print("\n#generating %s\n" % (simTitle))
+    neutDiscoalCmd = "discoal %d %d %d -Pt %f %f -Pre %f %f%s" % (
+        sampleSize,
+        sampleNumber,
+        numSites,
+        thetaLow,
+        thetaHigh,
+        rhoMean,
+        rhoMax,
+        demogStr,
+    )
+    print("%s > %s/Neut.msOut" % (neutDiscoalCmd, outDir))
     print("i=0")
-    print("for x in 0.045454545454545456 0.13636363636363635 0.22727272727272727 0.3181818181818182 0.4090909090909091 0.5 0.5909090909090909 0.6818181818181818 0.7727272727272727 0.8636363636363636 0.9545454545454546;\ndo")
+    print(
+        "for x in 0.045454545454545456 0.13636363636363635 0.22727272727272727 0.3181818181818182 0.4090909090909091 0.5 0.5909090909090909 0.6818181818181818 0.7727272727272727 0.8636363636363636 0.9545454545454546;\ndo"
+    )
     hardDiscoalCmd = neutDiscoalCmd + selStr + sweepLocStr
-    print("    %s > %s/Hard_$i.msOut" %(hardDiscoalCmd, outDir))
+    print("    %s > %s/Hard_$i.msOut" % (hardDiscoalCmd, outDir))
     softDiscoalCmd = neutDiscoalCmd + selStr + softStr + sweepLocStr
-    print("    %s > %s/Soft_$i.msOut" %(softDiscoalCmd, outDir))
+    print("    %s > %s/Soft_$i.msOut" % (softDiscoalCmd, outDir))
     print("    i=$((i + 1))\ndone")
     print("")
diff --git a/diploshic/makeFeatureVecsForChrArmFromVcfDiploid.py b/diploshic/makeFeatureVecsForChrArmFromVcfDiploid.py
new file mode 100644
index 0000000..71d522a
--- /dev/null
+++ b/diploshic/makeFeatureVecsForChrArmFromVcfDiploid.py
@@ -0,0 +1,292 @@
+import os
+import allel
+import h5py
+import numpy as np
+import sys
+import time
+from diploshic.fvTools import *
+
+if not len(sys.argv) in [13, 15]:
+    sys.exit(
+        "usage:\npython makeFeatureVecsForChrArmFromVcfDiploid.py vcfFileName chrArm chrLen targetPop winSize numSubWins maskFileName unmaskedFracCutoff unmaskedGenoFracCutoff sampleToPopFileName statFileName outFileName [segmentStart segmentEnd]\n"
+    )
+if len(sys.argv) == 15:
+    (
+        vcfFileName,
+        chrArm,
+        chrLen,
+        targetPop,
+        winSize,
+        numSubWins,
+        maskFileName,
+        unmaskedFracCutoff,
+        unmaskedGenoFracCutoff,
+        sampleToPopFileName,
+        statFileName,
+        outfn,
+        segmentStart,
+        segmentEnd,
+    ) = sys.argv[1:]
+    segmentStart, segmentEnd = int(segmentStart), int(segmentEnd)
+else:
+    (
+        vcfFileName,
+        chrArm,
+        chrLen,
+        targetPop,
+        winSize,
+        numSubWins,
+        maskFileName,
+        unmaskedFracCutoff,
+        unmaskedGenoFracCutoff,
+        sampleToPopFileName,
+        statFileName,
+        outfn,
+    ) = sys.argv[1:]
+    segmentStart = None
+
+unmaskedFracCutoff = float(unmaskedFracCutoff)
+if unmaskedFracCutoff < 0.0 or unmaskedFracCutoff > 1.0:
+    sys.exit(
+        "unmaskedFracCutoff=%s but must be within [0, 1]. AAAAARRRRGHHHHHH!!!\n"
+        % (unmaskedFracCutoff)
+    )
+unmaskedGenoFracCutoff = float(unmaskedGenoFracCutoff)
+if unmaskedGenoFracCutoff < 0.0 or unmaskedGenoFracCutoff > 1.0:
+    sys.exit(
+        "unmaskedGenoFracCutoff=%s but must be within [0, 1]. AAAAARRRRGHHHHHH!!!\n"
+        % (unmaskedGenoFracCutoff)
+    )
+chrLen, winSize, numSubWins = int(chrLen), int(winSize), int(numSubWins)
+assert winSize % numSubWins == 0 and numSubWins > 1
+subWinSize = int(winSize / numSubWins)
+
+
+def getSubWinBounds(chrLen, subWinSize):
+    lastSubWinEnd = chrLen - chrLen % subWinSize
+    lastSubWinStart = lastSubWinEnd - subWinSize + 1
+    subWinBounds = []
+    for subWinStart in range(1, lastSubWinStart + 1, subWinSize):
+        subWinEnd = subWinStart + subWinSize - 1
+        subWinBounds.append((subWinStart, subWinEnd))
+    return subWinBounds
+
+
+def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
+    subWinStart = 1
+    subWinEnd = subWinStart + subWinSize - 1
+    snpIndicesInSubWins = [[]]
+    for i in range(len(snpLocs)):
+        while snpLocs[i] <= lastSubWinEnd and not (
+            snpLocs[i] >= subWinStart and snpLocs[i] <= subWinEnd
+        ):
+            subWinStart += subWinSize
+            subWinEnd += subWinSize
+            snpIndicesInSubWins.append([])
+        if snpLocs[i] <= lastSubWinEnd:
+            snpIndicesInSubWins[-1].append(i)
+    while subWinEnd < lastSubWinEnd:
+        snpIndicesInSubWins.append([])
+        subWinStart += subWinSize
+        subWinEnd += subWinSize
+    return snpIndicesInSubWins
+
+
+def readSampleToPopFile(sampleToPopFileName):
+    table = {}
+    with open(sampleToPopFileName) as sampleToPopFile:
+        for line in sampleToPopFile:
+            sample, pop = line.strip().split()
+            table[sample] = pop
+    return table
+
+
+vcfFile = allel.read_vcf(vcfFileName)
+chroms = vcfFile["variants/CHROM"]
+positions = np.extract(chroms == chrArm, vcfFile["variants/POS"])
+
+if maskFileName.lower() in ["none", "false"]:
+    sys.stderr.write(
+        "Warning: a mask.fa file for the chr arm with all masked sites N'ed out is strongly recommended"
+        + " (pass in the reference to remove Ns at the very least)!\n"
+    )
+    unmasked = [True] * chrLen
+else:
+    unmasked = readMaskDataForScan(maskFileName, chrArm)
+    assert len(unmasked) == chrLen
+
+if statFileName.lower() in ["none", "false"]:
+    statFileName = None
+
+samples = vcfFile["samples"]
+if not sampleToPopFileName.lower() in ["none", "false"]:
+    sampleToPop = readSampleToPopFile(sampleToPopFileName)
+    sampleIndicesToKeep = [
+        i
+        for i in range(len(samples))
+        if sampleToPop.get(samples[i], "popNotFound!") == targetPop
+    ]
+else:
+    sampleIndicesToKeep = [i for i in range(len(samples))]
+rawgenos = np.take(
+    vcfFile["calldata/GT"],
+    [i for i in range(len(chroms)) if chroms[i] == chrArm],
+    axis=0,
+)
+genos = allel.GenotypeArray(rawgenos).subset(sel1=sampleIndicesToKeep)
+
+if segmentStart != None:
+    snpIndicesToKeep = [
+        i
+        for i in range(len(positions))
+        if segmentStart <= positions[i] <= segmentEnd
+    ]
+    positions = [positions[i] for i in snpIndicesToKeep]
+    genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep))
+
+if isHaploidVcfGenoArray(genos):
+    sys.stderr.write(
+        "Detected haploid input. Converting into diploid individuals (combining haplotypes in order).\n"
+    )
+    genos = diploidizeGenotypeArray(genos)
+
+alleleCounts = genos.count_alleles()
+
+# remove all but mono/biallelic unmasked sites
+isBiallelic = alleleCounts.is_biallelic()
+for i in range(len(isBiallelic)):
+    if not (
+        isBiallelic[i]
+        and calledGenoFracAtSite(genos[i]) >= unmaskedGenoFracCutoff
+    ):
+        unmasked[positions[i] - 1] = False
+snpIndicesToKeep = [
+    i for i in range(len(positions)) if unmasked[positions[i] - 1]
+]
+genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep))
+positions = [positions[i] for i in snpIndicesToKeep]
+alleleCounts = allel.AlleleCountsArray(
+    [[alleleCounts[i][0], max(alleleCounts[i][1:])] for i in snpIndicesToKeep]
+)
+
+statNames = [
+    "pi",
+    "thetaW",
+    "tajD",
+    "distVar",
+    "distSkew",
+    "distKurt",
+    "nDiplos",
+    "diplo_H1",
+    "diplo_H12",
+    "diplo_H2/H1",
+    "diplo_ZnS",
+    "diplo_Omega",
+]
+
+subWinBounds = getSubWinBounds(chrLen, subWinSize)
+
+header = "chrom classifiedWinStart classifiedWinEnd bigWinRange".split()
+statHeader = "chrom start end".split()
+for statName in statNames:
+    statHeader.append(statName)
+    for i in range(numSubWins):
+        header.append("%s_win%d" % (statName, i))
+statHeader = "\t".join(statHeader)
+header = "\t".join(header)
+outFile = open(outfn, "w")
+outFile.write(header + "\n")
+statVals = {}
+for statName in statNames:
+    statVals[statName] = []
+
+startTime = time.perf_counter()
+goodSubWins = []
+lastSubWinEnd = chrLen - chrLen % subWinSize
+snpIndicesInSubWins = getSnpIndicesInSubWins(
+    subWinSize, lastSubWinEnd, positions
+)
+subWinIndex = 0
+lastSubWinStart = lastSubWinEnd - subWinSize + 1
+if statFileName:
+    statFile = open(statFileName, "w")
+    statFile.write(statHeader + "\n")
+for subWinStart in range(1, lastSubWinStart + 1, subWinSize):
+    subWinEnd = subWinStart + subWinSize - 1
+    unmaskedFrac = unmasked[subWinStart - 1 : subWinEnd].count(True) / float(
+        subWinEnd - subWinStart + 1
+    )
+    if (
+        segmentStart == None
+        or subWinStart >= segmentStart
+        and subWinEnd <= segmentEnd
+    ):
+        sys.stderr.write(
+            "%d-%d num unmasked snps: %d; unmasked frac: %f\n"
+            % (
+                subWinStart,
+                subWinEnd,
+                len(snpIndicesInSubWins[subWinIndex]),
+                unmaskedFrac,
+            )
+        )
+    if (
+        len(snpIndicesInSubWins[subWinIndex]) > 0
+        and unmaskedFrac >= unmaskedFracCutoff
+    ):
+        genosInSubWin = allel.GenotypeArray(
+            genos.subset(sel0=snpIndicesInSubWins[subWinIndex])
+        )
+        statValStr = []
+        for statName in statNames:
+            calcAndAppendStatValForScanDiplo(
+                alleleCounts,
+                positions,
+                statName,
+                subWinStart,
+                subWinEnd,
+                statVals,
+                subWinIndex,
+                genosInSubWin,
+                unmasked,
+            )
+        goodSubWins.append(True)
+        if statFileName:
+            statFile.write(
+                "\t".join(
+                    [chrArm, str(subWinStart), str(subWinEnd)]
+                    + [str(statVals[statName][-1]) for statName in statNames]
+                )
+                + "\n"
+            )
+    else:
+        for statName in statNames:
+            appendStatValsForMonomorphicForScan(
+                statName, statVals, subWinIndex
+            )
+        goodSubWins.append(False)
+    if goodSubWins[-numSubWins:].count(True) == numSubWins:
+        outVec = []
+        for statName in statNames:
+            outVec += normalizeFeatureVec(statVals[statName][-numSubWins:])
+        midSubWinEnd = subWinEnd - subWinSize * (numSubWins / 2)
+        midSubWinStart = midSubWinEnd - subWinSize + 1
+        outFile.write(
+            "%s\t%d\t%d\t%d-%d\t"
+            % (
+                chrArm,
+                midSubWinStart,
+                midSubWinEnd,
+                subWinEnd - winSize + 1,
+                subWinEnd,
+            )
+            + "\t".join([str(x) for x in outVec])
+        )
+        outFile.write("\n")
+    subWinIndex += 1
+if statFileName:
+    statFile.close()
+outFile.close()
+sys.stderr.write(
+    "completed in %g seconds\n" % (time.perf_counter() - startTime)
+)
diff --git a/diploshic/makeFeatureVecsForChrArmFromVcf_ogSHIC.py b/diploshic/makeFeatureVecsForChrArmFromVcf_ogSHIC.py
new file mode 100644
index 0000000..7bd7639
--- /dev/null
+++ b/diploshic/makeFeatureVecsForChrArmFromVcf_ogSHIC.py
@@ -0,0 +1,306 @@
+import os
+import allel
+import h5py
+import numpy as np
+import sys
+import time
+from diploshic.fvTools import *
+
+if not len(sys.argv) in [13, 15]:
+    sys.exit(
+        "usage:\npython makeFeatureVecsForChrArmFromVcf_ogSHIC.py chrArmFileName chrArm chrLen targetPop winSize numSubWins maskFileName sampleToPopFileName ancestralArmFaFileName statFileName outFileName [segmentStart segmentEnd]\n"
+    )
+if len(sys.argv) == 15:
+    (
+        chrArmFileName,
+        chrArm,
+        chrLen,
+        targetPop,
+        winSize,
+        numSubWins,
+        maskFileName,
+        unmaskedFracCutoff,
+        sampleToPopFileName,
+        ancestralArmFaFileName,
+        statFileName,
+        outfn,
+        segmentStart,
+        segmentEnd,
+    ) = sys.argv[1:]
+    segmentStart, segmentEnd = int(segmentStart), int(segmentEnd)
+else:
+    (
+        chrArmFileName,
+        chrArm,
+        chrLen,
+        targetPop,
+        winSize,
+        numSubWins,
+        maskFileName,
+        unmaskedFracCutoff,
+        sampleToPopFileName,
+        ancestralArmFaFileName,
+        statFileName,
+        outfn,
+    ) = sys.argv[1:]
+    segmentStart = None
+
+unmaskedFracCutoff = float(unmaskedFracCutoff)
+chrLen, winSize, numSubWins = int(chrLen), int(winSize), int(numSubWins)
+assert winSize % numSubWins == 0 and numSubWins > 1
+subWinSize = int(winSize / numSubWins)
+
+
+def getSubWinBounds(chrLen, subWinSize):
+    lastSubWinEnd = chrLen - chrLen % subWinSize
+    lastSubWinStart = lastSubWinEnd - subWinSize + 1
+    subWinBounds = []
+    for subWinStart in range(1, lastSubWinStart + 1, subWinSize):
+        subWinEnd = subWinStart + subWinSize - 1
+        subWinBounds.append((subWinStart, subWinEnd))
+    return subWinBounds
+
+
+def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
+    subWinStart = 1
+    subWinEnd = subWinStart + subWinSize - 1
+    snpIndicesInSubWins = [[]]
+    for i in range(len(snpLocs)):
+        while snpLocs[i] <= lastSubWinEnd and not (
+            snpLocs[i] >= subWinStart and snpLocs[i] <= subWinEnd
+        ):
+            subWinStart += subWinSize
+            subWinEnd += subWinSize
+            snpIndicesInSubWins.append([])
+        if snpLocs[i] <= lastSubWinEnd:
+            snpIndicesInSubWins[-1].append(i)
+    while subWinEnd < lastSubWinEnd:
+        snpIndicesInSubWins.append([])
+        subWinStart += subWinSize
+        subWinEnd += subWinSize
+    return snpIndicesInSubWins
+
+
+chrArmFile = allel.read_vcf(chrArmFileName)
+chroms = chrArmFile["variants/CHROM"]
+positions = np.extract(chroms == chrArm, chrArmFile["variants/POS"])
+
+if maskFileName.lower() in ["none", "false"]:
+    sys.stderr.write(
+        "Warning: a mask.fa file for the chr arm with all masked sites N'ed out is strongly recommended"
+        + " (pass in the reference to remove Ns at the very least)!\n"
+    )
+    unmasked = [True] * chrLen
+else:
+    unmasked = readMaskDataForScan(maskFileName, chrArm)
+    assert len(unmasked) == chrLen
+
+if statFileName.lower() in ["none", "false"]:
+    statFileName = None
+
+samples = chrArmFile["samples"]
+if not sampleToPopFileName.lower() in ["none", "false"]:
+    sampleToPop = readSampleToPopFile(sampleToPopFileName)
+    sampleIndicesToKeep = [
+        i
+        for i in range(len(samples))
+        if sampleToPop.get(samples[i], "popNotFound!") == targetPop
+    ]
+else:
+    sampleIndicesToKeep = [i for i in range(len(samples))]
+
+rawgenos = np.take(
+    chrArmFile["calldata/GT"],
+    [i for i in range(len(chroms)) if chroms[i] == chrArm],
+    axis=0,
+)
+genos = allel.GenotypeArray(rawgenos)
+refAlleles = np.extract(chroms == chrArm, chrArmFile["variants/REF"])
+altAlleles = np.extract(chroms == chrArm, chrArmFile["variants/ALT"])
+if segmentStart != None:
+    snpIndicesToKeep = [
+        i
+        for i in range(len(positions))
+        if segmentStart <= positions[i] <= segmentEnd
+    ]
+    positions = [positions[i] for i in snpIndicesToKeep]
+    refAlleles = [refAlleles[i] for i in snpIndicesToKeep]
+    altAlleles = [altAlleles[i] for i in snpIndicesToKeep]
+    genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep))
+genos = allel.GenotypeArray(genos.subset(sel1=sampleIndicesToKeep))
+alleleCounts = genos.count_alleles()
+
+# remove all but mono/biallelic unmasked sites
+isBiallelic = alleleCounts.is_biallelic()
+for i in range(len(isBiallelic)):
+    if not isBiallelic[i]:
+        unmasked[positions[i] - 1] = False
+
+# polarize
+if not ancestralArmFaFileName.lower() in ["none", "false"]:
+    sys.stderr.write("polarizing snps\n")
+    ancArm = readFaArm(ancestralArmFaFileName, chrArm).upper()
+    startTime = time.perf_counter()
+    # NOTE: mapping specifies which alleles to swap counts for based on polarization; leaves unpolarized snps alone
+    # NOTE: those snps need to be filtered later on (as done below)!
+    # this will also remove sites that could not be polarized
+    mapping, unmasked = polarizeSnps(
+        unmasked, positions, refAlleles, altAlleles, ancArm
+    )
+    sys.stderr.write("took %s seconds\n" % (time.perf_counter() - startTime))
+    statNames = [
+        "pi",
+        "thetaW",
+        "tajD",
+        "thetaH",
+        "fayWuH",
+        "maxFDA",
+        "HapCount",
+        "H1",
+        "H12",
+        "H2/H1",
+        "ZnS",
+        "Omega",
+        "distVar",
+        "distSkew",
+        "distKurt",
+    ]
+else:
+    statNames = [
+        "pi",
+        "thetaW",
+        "tajD",
+        "HapCount",
+        "H1",
+        "H12",
+        "H2/H1",
+        "ZnS",
+        "Omega",
+        "distVar",
+        "distSkew",
+        "distKurt",
+    ]
+
+snpIndicesToKeep = [
+    i for i in range(len(positions)) if unmasked[positions[i] - 1]
+]
+genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep))
+positions = [positions[i] for i in snpIndicesToKeep]
+alleleCounts = allel.AlleleCountsArray(
+    [[alleleCounts[i][0], max(alleleCounts[i][1:])] for i in snpIndicesToKeep]
+)
+if not ancestralArmFaFileName.lower() in ["none", "false"]:
+    mapping = [mapping[i] for i in snpIndicesToKeep]
+    alleleCounts = alleleCounts.map_alleles(mapping)
+haps = genos.to_haplotypes()
+
+subWinBounds = getSubWinBounds(chrLen, subWinSize)
+precomputedStats = {}  # not using this
+
+header = "chrom classifiedWinStart classifiedWinEnd bigWinRange".split()
+statHeader = "chrom start end".split()
+for statName in statNames:
+    statHeader.append(statName)
+    for i in range(numSubWins):
+        header.append("%s_win%d" % (statName, i))
+statHeader = "\t".join(statHeader)
+header = "\t".join(header)
+outFile = open(outfn, "w")
+outFile.write(header + "\n")
+statVals = {}
+for statName in statNames:
+    statVals[statName] = []
+
+startTime = time.perf_counter()
+goodSubWins = []
+lastSubWinEnd = chrLen - chrLen % subWinSize
+snpIndicesInSubWins = getSnpIndicesInSubWins(
+    subWinSize, lastSubWinEnd, positions
+)
+subWinIndex = 0
+lastSubWinStart = lastSubWinEnd - subWinSize + 1
+if statFileName:
+    statFile = open(statFileName, "w")
+    statFile.write(statHeader + "\n")
+for subWinStart in range(1, lastSubWinStart + 1, subWinSize):
+    subWinEnd = subWinStart + subWinSize - 1
+    unmaskedFrac = unmasked[subWinStart - 1 : subWinEnd].count(True) / float(
+        subWinEnd - subWinStart + 1
+    )
+    if (
+        segmentStart == None
+        or subWinStart >= segmentStart
+        and subWinEnd <= segmentEnd
+    ):
+        sys.stderr.write(
+            "%d-%d num unmasked snps: %d; unmasked frac: %f\n"
+            % (
+                subWinStart,
+                subWinEnd,
+                len(snpIndicesInSubWins[subWinIndex]),
+                unmaskedFrac,
+            )
+        )
+    if (
+        len(snpIndicesInSubWins[subWinIndex]) > 0
+        and unmaskedFrac >= unmaskedFracCutoff
+    ):
+        hapsInSubWin = allel.HaplotypeArray(
+            haps.subset(sel0=snpIndicesInSubWins[subWinIndex])
+        )
+        statValStr = []
+        for statName in statNames:
+            calcAndAppendStatValForScan(
+                alleleCounts,
+                positions,
+                statName,
+                subWinStart,
+                subWinEnd,
+                statVals,
+                subWinIndex,
+                hapsInSubWin,
+                unmasked,
+                precomputedStats,
+            )
+            statValStr.append("%s: %s" % (statName, statVals[statName][-1]))
+        sys.stderr.write("\t".join(statValStr) + "\n")
+        goodSubWins.append(True)
+        if statFileName:
+            statFile.write(
+                "\t".join(
+                    [chrArm, str(subWinStart), str(subWinEnd)]
+                    + [str(statVals[statName][-1]) for statName in statNames]
+                )
+                + "\n"
+            )
+    else:
+        for statName in statNames:
+            appendStatValsForMonomorphicForScan(
+                statName, statVals, subWinIndex
+            )
+        goodSubWins.append(False)
+    if goodSubWins[-numSubWins:].count(True) == numSubWins:
+        outVec = []
+        for statName in statNames:
+            outVec += normalizeFeatureVec(statVals[statName][-numSubWins:])
+        midSubWinEnd = subWinEnd - subWinSize * (numSubWins / 2)
+        midSubWinStart = midSubWinEnd - subWinSize + 1
+        outFile.write(
+            "%s\t%d\t%d\t%d-%d\t"
+            % (
+                chrArm,
+                midSubWinStart,
+                midSubWinEnd,
+                subWinEnd - winSize + 1,
+                subWinEnd,
+            )
+            + "\t".join([str(x) for x in outVec])
+        )
+        outFile.write("\n")
+    subWinIndex += 1
+if statFileName:
+    statFile.close()
+outFile.close()
+sys.stderr.write(
+    "completed in %g seconds\n" % (time.perf_counter() - startTime)
+)
diff --git a/diploshic/makeFeatureVecsForSingleMsDiploid.py b/diploshic/makeFeatureVecsForSingleMsDiploid.py
new file mode 100644
index 0000000..54b322b
--- /dev/null
+++ b/diploshic/makeFeatureVecsForSingleMsDiploid.py
@@ -0,0 +1,324 @@
+import sys, os
+import allel
+import random
+import numpy as np
+from diploshic.msTools import *
+from diploshic.fvTools import *
+import time
+
+(
+    trainingDataFileName,
+    totalPhysLen,
+    numSubWins,
+    maskFileName,
+    vcfForMaskFileName,
+    popForMask,
+    sampleToPopFileName,
+    unmaskedGenoFracCutoff,
+    chrArmsForMasking,
+    unmaskedFracCutoff,
+    outStatsDir,
+    fvecFileName,
+) = sys.argv[1:]
+totalPhysLen = int(totalPhysLen)
+numSubWins = int(numSubWins)
+subWinLen = totalPhysLen // numSubWins
+assert totalPhysLen % numSubWins == 0 and numSubWins > 1
+
+sys.stderr.write("file name='%s'" % (trainingDataFileName))
+
+(
+    trainingDataFileObj,
+    sampleSize,
+    numInstances,
+) = openMsOutFileForSequentialReading(trainingDataFileName)
+
+if maskFileName.lower() in ["none", "false"]:
+    sys.stderr.write(
+        "maskFileName='%s': not masking any sites!\n" % (maskFileName)
+    )
+    maskFileName = False
+    unmaskedFracCutoff = 1.0
+else:
+    chrArmsForMasking = chrArmsForMasking.split(",")
+    unmaskedFracCutoff = float(unmaskedFracCutoff)
+    if unmaskedFracCutoff > 1.0 or unmaskedFracCutoff < 0.0:
+        sys.exit(
+            "unmaskedFracCutoff must lie within [0, 1]. AAARRRRGGGGHHHHH!!!!\n"
+        )
+
+if vcfForMaskFileName.lower() in ["none", "false"]:
+    sys.stderr.write(
+        "vcfForMaskFileName='%s': not masking any genotypes!\n"
+        % (vcfForMaskFileName)
+    )
+    vcfForMaskFileName = False
+else:
+    if not maskFileName:
+        sys.exit(
+            "Cannot mask genotypes without also supplying a file for masking entire sites (can use reference genome with Ns if desired). AAARRRGHHHHH!!!!!!\n"
+        )
+    if sampleToPopFileName.lower() in [
+        "none",
+        "false",
+    ] or popForMask.lower() in [
+        "none",
+        "false",
+    ]:
+        sampleToPopFileName = None
+        sys.stderr.write(
+            "No sampleToPopFileName specified. Using all individuals for masking genotypes.\n"
+        )
+    unmaskedGenoFracCutoff = float(unmaskedGenoFracCutoff)
+    if unmaskedGenoFracCutoff > 1.0 or unmaskedGenoFracCutoff < 0.0:
+        sys.exit(
+            "unmaskedGenoFracCutoff must lie within [0, 1]. AAARRRRGGGGHHHHH!!!!\n"
+        )
+
+
+def getSubWinBounds(subWinLen, totalPhysLen):  # get inclusive subwin bounds
+    subWinStart = 1
+    subWinEnd = subWinStart + subWinLen - 1
+    subWinBounds = [(subWinStart, subWinEnd)]
+    numSubWins = totalPhysLen // subWinLen
+    for i in range(1, numSubWins - 1):
+        subWinStart += subWinLen
+        subWinEnd += subWinLen
+        subWinBounds.append((subWinStart, subWinEnd))
+    subWinStart += subWinLen
+    # if our subwindows are 1 bp too short due to rounding error, the last window picks up all of the slack
+    subWinEnd = totalPhysLen
+    subWinBounds.append((subWinStart, subWinEnd))
+    return subWinBounds
+
+
+if not maskFileName:
+    unmasked = [True] * totalPhysLen
+else:
+    drawWithReplacement = False
+    sys.stderr.write("reading masking data...")
+    maskData = readMaskDataForTraining(
+        maskFileName,
+        totalPhysLen,
+        subWinLen,
+        chrArmsForMasking,
+        shuffle=True,
+        cutoff=unmaskedFracCutoff,
+        genoCutoff=unmaskedGenoFracCutoff,
+        vcfForMaskFileName=vcfForMaskFileName,
+        pop=popForMask,
+        sampleToPopFileName=sampleToPopFileName,
+    )
+    if vcfForMaskFileName:
+        maskData, genoMaskData = maskData
+    else:
+        genoMaskData = [None] * len(maskData)
+    sys.stderr.write("done!\n")
+
+    if len(maskData) < numInstances:
+        sys.stderr.write(
+            "Warning: didn't get enough windows from masked data (needed %d; got %d); will draw with replacement!!\n"
+            % (numInstances, len(maskData))
+        )
+        drawWithReplacement = True
+    else:
+        sys.stderr.write(
+            "Got enough windows from masked data (needed %d; got %d); will draw without replacement.\n"
+            % (numInstances, len(maskData))
+        )
+
+
+def getSnpIndicesInSubWins(subWinBounds, snpLocs):
+    snpIndicesInSubWins = []
+    for subWinIndex in range(len(subWinBounds)):
+        snpIndicesInSubWins.append([])
+
+    subWinIndex = 0
+    for i in range(len(snpLocs)):
+        while not (
+            snpLocs[i] >= subWinBounds[subWinIndex][0]
+            and snpLocs[i] <= subWinBounds[subWinIndex][1]
+        ):
+            subWinIndex += 1
+        snpIndicesInSubWins[subWinIndex].append(i)
+    return snpIndicesInSubWins
+
+
+subWinBounds = getSubWinBounds(subWinLen, totalPhysLen)
+# statNames = ["pi", "thetaW", "tajD", "nDiplos","diplo_H2","diplo_H12","diplo_H2/H1","diplo_ZnS","diplo_Omega"]
+statNames = [
+    "pi",
+    "thetaW",
+    "tajD",
+    "distVar",
+    "distSkew",
+    "distKurt",
+    "nDiplos",
+    "diplo_H1",
+    "diplo_H12",
+    "diplo_H2/H1",
+    "diplo_ZnS",
+    "diplo_Omega",
+]
+header = []
+for statName in statNames:
+    for i in range(numSubWins):
+        header.append("%s_win%d" % (statName, i))
+header = "\t".join(header)
+
+
+statVals = {}
+for statName in statNames:
+    statVals[statName] = []
+start = time.perf_counter()
+numInstancesDone = 0
+sys.stderr.write("ready to process sim reps. here we go!\n")
+for instanceIndex in range(numInstances):
+    sys.stderr.write("starting rep %d of %d\n" % (instanceIndex, numInstances))
+    hapArrayIn, positionArray = readNextMsRepToHaplotypeArrayIn(
+        trainingDataFileObj, sampleSize, totalPhysLen
+    )
+
+    haps = allel.HaplotypeArray(hapArrayIn, dtype="i1")
+    if maskFileName:
+        if drawWithReplacement:
+            randIndex = random.randint(0, len(maskData) - 1)
+            unmasked, genoMasks = maskData[randIndex], genoMaskData[randIndex]
+        else:
+            unmasked, genoMasks = (
+                maskData[instanceIndex],
+                genoMaskData[instanceIndex],
+            )
+        assert len(unmasked) == totalPhysLen
+    if haps.shape[1] % 2 == 1:
+        haps = haps[:, :-1]
+    genos = haps.to_genotypes(ploidy=2)
+    unmaskedSnpIndices = [
+        i for i in range(len(positionArray)) if unmasked[positionArray[i] - 1]
+    ]
+    if len(unmaskedSnpIndices) == 0:
+        sys.stderr.write("no snps for rep %d\n" % (instanceIndex))
+        for statName in statNames:
+            statVals[statName].append([])
+        for subWinIndex in range(numSubWins):
+            for statName in statNames:
+                appendStatValsForMonomorphic(
+                    statName, statVals, instanceIndex, subWinIndex
+                )
+    else:
+        sys.stderr.write("processing snps for rep %d\n" % (instanceIndex))
+        if maskFileName:
+            preMaskCount = np.sum(genos.count_alleles())
+            if genoMasks:
+                sys.stderr.write(
+                    "%d snps in the masking window for rep %d\n"
+                    % (len(genoMasks), instanceIndex)
+                )
+                genos = maskGenos(
+                    genos.subset(sel0=unmaskedSnpIndices), genoMasks
+                )
+            else:
+                genos = genos.subset(sel0=unmaskedSnpIndices)
+            alleleCountsUnmaskedOnly = genos.count_alleles()
+            maskedCount = preMaskCount - np.sum(alleleCountsUnmaskedOnly)
+            sys.stderr.write(
+                "%d of %d genotypes (%.2f%%) masked for rep %d\n"
+                % (
+                    maskedCount,
+                    preMaskCount,
+                    100 * maskedCount / preMaskCount,
+                    instanceIndex,
+                )
+            )
+        else:
+            alleleCountsUnmaskedOnly = genos.count_alleles()
+        positionArrayUnmaskedOnly = [
+            positionArray[i] for i in unmaskedSnpIndices
+        ]
+        snpIndicesInSubWins = getSnpIndicesInSubWins(
+            subWinBounds, positionArrayUnmaskedOnly
+        )
+        for statName in statNames:
+            statVals[statName].append([])
+        for subWinIndex in range(numSubWins):
+            subWinStart, subWinEnd = subWinBounds[subWinIndex]
+            unmaskedFrac = unmasked[subWinStart - 1 : subWinEnd].count(
+                True
+            ) / float(subWinLen)
+            assert unmaskedFrac >= unmaskedFracCutoff
+            snpIndicesInSubWinUnmasked = snpIndicesInSubWins[subWinIndex]
+            sys.stderr.write(
+                "examining subwindow %d which has %d unmasked SNPs\n"
+                % (subWinIndex, len(snpIndicesInSubWinUnmasked))
+            )
+            if len(snpIndicesInSubWinUnmasked) > 0:
+                genosInSubWin = genos.subset(sel0=snpIndicesInSubWinUnmasked)
+                for statName in statNames:
+                    calcAndAppendStatValDiplo(
+                        alleleCountsUnmaskedOnly,
+                        positionArrayUnmaskedOnly,
+                        statName,
+                        subWinStart,
+                        subWinEnd,
+                        statVals,
+                        instanceIndex,
+                        subWinIndex,
+                        genosInSubWin,
+                        unmasked,
+                    )
+            else:
+                for statName in statNames:
+                    appendStatValsForMonomorphic(
+                        statName, statVals, instanceIndex, subWinIndex
+                    )
+    numInstancesDone += 1
+    sys.stderr.write(
+        "finished %d reps after %f seconds\n"
+        % (numInstancesDone, time.perf_counter() - start)
+    )
+
+if numInstancesDone != numInstances:
+    sys.exit(
+        "Expected %d reps but only processed %d. Perhaps we are using malformed simulation output!\n"
+        % (numInstancesDone, numInstances)
+    )
+
+statFiles = []
+if outStatsDir.lower() != "none":
+    for subWinIndex in range(numSubWins):
+        statFileName = "%s/%s.%d.stats" % (
+            outStatsDir,
+            trainingDataFileName.split("/")[-1].rstrip(".gz"),
+            subWinIndex,
+        )
+        statFiles.append(open(statFileName, "w"))
+        statFiles[-1].write("\t".join(statNames) + "\n")
+with open(fvecFileName, "w") as fvecFile:
+    fvecFile.write(header + "\n")
+    for i in range(numInstancesDone):
+        statLines = []
+        for subWinIndex in range(numSubWins):
+            statLines.append([])
+        outVec = []
+        for statName in statNames:
+            outVec += normalizeFeatureVec(statVals[statName][i])
+            for subWinIndex in range(numSubWins):
+                statLines[subWinIndex].append(
+                    statVals[statName][i][subWinIndex]
+                )
+        if statFiles:
+            for subWinIndex in range(numSubWins):
+                statFiles[subWinIndex].write(
+                    "\t".join([str(x) for x in statLines[subWinIndex]]) + "\n"
+                )
+        fvecFile.write("\t".join([str(x) for x in outVec]) + "\n")
+
+if statFiles:
+    for subWinIndex in range(numSubWins):
+        statFiles[subWinIndex].close()
+
+sys.stderr.write(
+    "total time spent calculating summary statistics and generating feature vectors: %f secs\n"
+    % (time.perf_counter() - start)
+)
+closeMsOutFile(trainingDataFileObj)
diff --git a/makeFeatureVecsForSingleMs_ogSHIC.py b/diploshic/makeFeatureVecsForSingleMs_ogSHIC.py
similarity index 61%
rename from makeFeatureVecsForSingleMs_ogSHIC.py
rename to diploshic/makeFeatureVecsForSingleMs_ogSHIC.py
index 0f72f3c..d5168ee 100644
--- a/makeFeatureVecsForSingleMs_ogSHIC.py
+++ b/diploshic/makeFeatureVecsForSingleMs_ogSHIC.py
@@ -2,49 +2,61 @@
 import allel
 import random
 import numpy as np
-import msTools
-import fvTools
+import diploshic.msTools as msTools
+import diploshic.fvTools as fvTools
 import time
 
-'''usage example
+"""usage example
 python makeFeatureVecsForSingleMsFileDiploid.py /san/data/dan/simulations/discoal_multipopStuff/spatialSVMSims/trainingSets/equilibNeut.msout.gz 110000 11 /san/data/ag1kg/accessibility/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.accessible.fa /san/data/ag1kg/outgroups/anc.meru_mela.fa 2L,2R,3L,3R 0.25 0.01 trainingSetsStats/ trainingSetsFeatureVecs/equilibNeut.msout.gz.fvec
-''' # NOQA
+"""  # NOQA
 
-trainingDataFileName, totalPhysLen, numSubWins, maskFileName, \
-    ancFileName, chrArmsForMasking, unmaskedFracCutoff, pMisPol, \
-    outStatsDir, fvecFileName = sys.argv[1:]
+(
+    trainingDataFileName,
+    totalPhysLen,
+    numSubWins,
+    maskFileName,
+    ancFileName,
+    chrArmsForMasking,
+    unmaskedFracCutoff,
+    pMisPol,
+    outStatsDir,
+    fvecFileName,
+) = sys.argv[1:]
 totalPhysLen = int(totalPhysLen)
 numSubWins = int(numSubWins)
 pMisPol = float(pMisPol)
 # below was the old call to get the iHS normalizations
 # standardizationInfo = readStatsDafsComputeStandardizationBins(statAndDafFileName, nBins=50, pMisPol=pMisPol) # NOQA
-subWinLen = totalPhysLen//numSubWins
+subWinLen = totalPhysLen // numSubWins
 assert totalPhysLen % numSubWins == 0 and numSubWins > 1
 chrArmsForMasking = chrArmsForMasking.split(",")
 
 sys.stderr.write("file name='%s'" % (trainingDataFileName))
 
-trainingDataFileObj, sampleSize, numInstances =\
-        msTools.openMsOutFileForSequentialReading(trainingDataFileName)
+(
+    trainingDataFileObj,
+    sampleSize,
+    numInstances,
+) = msTools.openMsOutFileForSequentialReading(trainingDataFileName)
 
 if maskFileName.lower() in ["none", "false"]:
     sys.stderr.write(
-        "maskFileName='%s': not doing any masking!\n" % (maskFileName))
+        "maskFileName='%s': not doing any masking!\n" % (maskFileName)
+    )
     maskFileName = False
     unmaskedFracCutoff = 1.0
 else:
     unmaskedFracCutoff = float(unmaskedFracCutoff)
     if unmaskedFracCutoff > 1.0:
-        sys.exit(
-            "unmaskedFracCutoff must lie within [0, 1].\n")
+        sys.exit("unmaskedFracCutoff must lie within [0, 1].\n")
 
 
 def getSubWinBounds(subWinLen, totalPhysLen):  # get inclusive subwin bounds
     subWinStart = 1
     subWinEnd = subWinStart + subWinLen - 1
     subWinBounds = [(subWinStart, subWinEnd)]
-    numSubWins = totalPhysLen//subWinLen
-    for i in range(1, numSubWins-1):
+    numSubWins = totalPhysLen // subWinLen
+    for i in range(1, numSubWins - 1):
         subWinStart += subWinLen
         subWinEnd += subWinLen
         subWinBounds.append((subWinStart, subWinEnd))
@@ -62,19 +74,34 @@ def getSubWinBounds(subWinLen, totalPhysLen):  # get inclusive subwin bounds
     drawWithReplacement = False
     if ancFileName.lower() in ["none", "false"]:
         maskData = fvTools.readMaskDataForTraining(
-            maskFileName, totalPhysLen, subWinLen,
-            chrArmsForMasking, shuffle=True, cutoff=unmaskedFracCutoff)
+            maskFileName,
+            totalPhysLen,
+            subWinLen,
+            chrArmsForMasking,
+            shuffle=True,
+            cutoff=unmaskedFracCutoff,
+        )
     else:
         maskData = fvTools.readMaskAndAncDataForTraining(
-            maskFileName, ancFileName, totalPhysLen, subWinLen,
-            chrArmsForMasking, shuffle=True, cutoff=unmaskedFracCutoff)
+            maskFileName,
+            ancFileName,
+            totalPhysLen,
+            subWinLen,
+            chrArmsForMasking,
+            shuffle=True,
+            cutoff=unmaskedFracCutoff,
+        )
     if len(maskData) < numInstances:
-        sys.stderr.write("Warning: didn't get enough windows from masked data (needed %d; got %d); will draw with replacement!!\n" % ( # NOQA
-            numInstances, len(maskData)))
+        sys.stderr.write(
+            "Warning: didn't get enough windows from masked data (needed %d; got %d); will draw with replacement!!\n"
+            % (numInstances, len(maskData))  # NOQA
+        )
         drawWithReplacement = True
     else:
-        sys.stderr.write("Got enough windows from masked data (needed %d; got %d); will draw without replacement.\n" % ( # NOQA
-            numInstances, len(maskData)))
+        sys.stderr.write(
+            "Got enough windows from masked data (needed %d; got %d); will draw without replacement.\n"
+            % (numInstances, len(maskData))  # NOQA
+        )
 
 
 def getSnpIndicesInSubWins(subWinBounds, snpLocs):
@@ -84,17 +111,33 @@ def getSnpIndicesInSubWins(subWinBounds, snpLocs):
 
     subWinIndex = 0
     for i in range(len(snpLocs)):
-        while not (snpLocs[i] >= subWinBounds[subWinIndex][0] and
-                   snpLocs[i] <= subWinBounds[subWinIndex][1]):
+        while not (
+            snpLocs[i] >= subWinBounds[subWinIndex][0]
+            and snpLocs[i] <= subWinBounds[subWinIndex][1]
+        ):
             subWinIndex += 1
         snpIndicesInSubWins[subWinIndex].append(i)
     return snpIndicesInSubWins
 
 
 subWinBounds = getSubWinBounds(subWinLen, totalPhysLen)
-statNames = ["pi", "thetaW", "tajD", "thetaH", "fayWuH", "maxFDA", "HapCount",
-             "H1", "H12", "H2/H1", "ZnS", "Omega", "distVar",
-             "distSkew", "distKurt"]
+statNames = [
+    "pi",
+    "thetaW",
+    "tajD",
+    "thetaH",
+    "fayWuH",
+    "maxFDA",
+    "HapCount",
+    "H1",
+    "H12",
+    "H2/H1",
+    "ZnS",
+    "Omega",
+    "distVar",
+    "distSkew",
+    "distKurt",
+]
 header = []
 for statName in statNames:
     for i in range(numSubWins):
@@ -109,10 +152,11 @@ def getSnpIndicesInSubWins(subWinBounds, snpLocs):
 numInstancesDone = 0
 for instanceIndex in range(numInstances):
     hapArrayIn, positionArray = msTools.readNextMsRepToHaplotypeArrayIn(
-        trainingDataFileObj, sampleSize, totalPhysLen)
+        trainingDataFileObj, sampleSize, totalPhysLen
+    )
 
     snpIndicesInSubWins = getSnpIndicesInSubWins(subWinBounds, positionArray)
-    haps = allel.HaplotypeArray(hapArrayIn, dtype='i1')
+    haps = allel.HaplotypeArray(hapArrayIn, dtype="i1")
     if maskFileName:
         if drawWithReplacement:
             unmasked = random.choice(maskData)
@@ -120,26 +164,31 @@ def getSnpIndicesInSubWins(subWinBounds, snpLocs):
             unmasked = maskData[instanceIndex]
         assert len(unmasked) == totalPhysLen
     genos = haps.to_genotypes(ploidy=2)
-    unmaskedSnpIndices = [i for i in range(
-        len(positionArray)) if unmasked[positionArray[i]-1]]
+    unmaskedSnpIndices = [
+        i for i in range(len(positionArray)) if unmasked[positionArray[i] - 1]
+    ]
     if len(unmaskedSnpIndices) == 0:
         for statName in statNames:
             statVals[statName].append([])
         for subWinIndex in range(numSubWins):
             for statName in statNames:
                 fvTools.appendStatValsForMonomorphic(
-                    statName, statVals, instanceIndex, subWinIndex)
+                    statName, statVals, instanceIndex, subWinIndex
+                )
     else:
-        positionArrayUnmaskedOnly = [positionArray[i]
-                                     for i in unmaskedSnpIndices]
+        positionArrayUnmaskedOnly = [
+            positionArray[i] for i in unmaskedSnpIndices
+        ]
         ac = genos.count_alleles()
         alleleCountsUnmaskedOnly = allel.AlleleCountsArray(
-            np.array([ac[i] for i in unmaskedSnpIndices]))
+            np.array([ac[i] for i in unmaskedSnpIndices])
+        )
         sampleSizes = [sum(x) for x in alleleCountsUnmaskedOnly]
         assert len(set(sampleSizes)) == 1 and sampleSizes[0] == sampleSize
         if pMisPol > 0:
             alleleCountsUnmaskedOnly = fvTools.misPolarizeAlleleCounts(
-                alleleCountsUnmaskedOnly, pMisPol)
+                alleleCountsUnmaskedOnly, pMisPol
+            )
         # dafs = alleleCountsUnmaskedOnly[:,1]/float(sampleSizes[0])
         unmaskedHaps = haps.subset(sel0=unmaskedSnpIndices)
         unmaskedGenos = genos.subset(sel0=unmaskedSnpIndices)
@@ -148,38 +197,53 @@ def getSnpIndicesInSubWins(subWinBounds, snpLocs):
             statVals[statName].append([])
         for subWinIndex in range(numSubWins):
             subWinStart, subWinEnd = subWinBounds[subWinIndex]
-            unmaskedFrac = unmasked[subWinStart -
-                                    1:subWinEnd].count(True)/float(subWinLen)
+            unmaskedFrac = unmasked[subWinStart - 1 : subWinEnd].count(
+                True
+            ) / float(subWinLen)
             assert unmaskedFrac >= unmaskedFracCutoff
             snpIndicesInSubWinUnmasked = [
-                x for x in snpIndicesInSubWins[subWinIndex] if unmasked[positionArray[x]-1]] # NOQA
+                x
+                for x in snpIndicesInSubWins[subWinIndex]
+                if unmasked[positionArray[x] - 1]
+            ]  # NOQA
             if len(snpIndicesInSubWinUnmasked) > 0:
                 hapsInSubWin = haps.subset(sel0=snpIndicesInSubWinUnmasked)
                 genosInSubWin = genos.subset(sel0=snpIndicesInSubWinUnmasked)
                 for statName in statNames:
-                    fvTools.calcAndAppendStatVal(alleleCountsUnmaskedOnly,
-                                                 positionArrayUnmaskedOnly,
-                                                 statName, subWinStart,
-                                                 subWinEnd, statVals,
-                                                 instanceIndex, subWinIndex,
-                                                 hapsInSubWin, unmasked,
-                                                 precomputedStats)
+                    fvTools.calcAndAppendStatVal(
+                        alleleCountsUnmaskedOnly,
+                        positionArrayUnmaskedOnly,
+                        statName,
+                        subWinStart,
+                        subWinEnd,
+                        statVals,
+                        instanceIndex,
+                        subWinIndex,
+                        hapsInSubWin,
+                        unmasked,
+                        precomputedStats,
+                    )
             else:
                 for statName in statNames:
                     fvTools.appendStatValsForMonomorphic(
-                        statName, statVals, instanceIndex, subWinIndex)
+                        statName, statVals, instanceIndex, subWinIndex
+                    )
     numInstancesDone += 1
 
 if numInstancesDone != numInstances:
-    sys.exit("Expected %d reps but only processed %d. Perhaps we are using malformed simulation output!\n" % ( # NOQA
-        numInstancesDone, numInstances))
+    sys.exit(
+        "Expected %d reps but only processed %d. Perhaps we are using malformed simulation output!\n"
+        % (numInstancesDone, numInstances)  # NOQA
+    )
 
 statFiles = []
 if outStatsDir.lower() != "none":
     for subWinIndex in range(numSubWins):
         statFileName = "%s/%s.%d.stats" % (
-            outStatsDir, trainingDataFileName.split("/")[-1].rstrip(".gz"),
-            subWinIndex)
+            outStatsDir,
+            trainingDataFileName.split("/")[-1].rstrip(".gz"),
+            subWinIndex,
+        )
         statFiles.append(open(statFileName, "w"))
         statFiles[-1].write("\t".join(statNames) + "\n")
 with open(fvecFileName, "w") as fvecFile:
@@ -194,17 +258,21 @@ def getSnpIndicesInSubWins(subWinBounds, snpLocs):
             outVec += fvTools.normalizeFeatureVec(statVals[statName][i])
             for subWinIndex in range(numSubWins):
                 statLines[subWinIndex].append(
-                    statVals[statName][i][subWinIndex])
+                    statVals[statName][i][subWinIndex]
+                )
         if statFiles:
             for subWinIndex in range(numSubWins):
                 statFiles[subWinIndex].write(
-                    "\t".join([str(x) for x in statLines[subWinIndex]]) + "\n")
+                    "\t".join([str(x) for x in statLines[subWinIndex]]) + "\n"
+                )
         fvecFile.write("\t".join([str(x) for x in outVec]) + "\n")
 
 if statFiles:
     for subWinIndex in range(numSubWins):
         statFiles[subWinIndex].close()
 
-sys.stderr.write("total time spent calculating summary statistics and generating feature vectors: %f secs\n" % ( # NOQA
-    time.perf_counter()-start))
+sys.stderr.write(
+    "total time spent calculating summary statistics and generating feature vectors: %f secs\n"
+    % (time.perf_counter() - start)  # NOQA
+)
 msTools.closeMsOutFile(trainingDataFileObj)
diff --git a/makeTrainingSets.py b/diploshic/makeTrainingSets.py
similarity index 51%
rename from makeTrainingSets.py
rename to diploshic/makeTrainingSets.py
index 5881084..35e622e 100644
--- a/makeTrainingSets.py
+++ b/diploshic/makeTrainingSets.py
@@ -1,7 +1,14 @@
 import sys, os, random
 
-neutTrainingFileName, softTrainingFilePrefix, hardTrainingFilePrefix, sweepTrainingWindows, linkedTrainingWindows, outDir = sys.argv[1:]
-#sweepTrainingWindows and linkedTrainingWindows are comma-separated lists
+(
+    neutTrainingFileName,
+    softTrainingFilePrefix,
+    hardTrainingFilePrefix,
+    sweepTrainingWindows,
+    linkedTrainingWindows,
+    outDir,
+) = sys.argv[1:]
+# sweepTrainingWindows and linkedTrainingWindows are comma-separated lists
 
 sweepFilePaths, linkedFilePaths = {}, {}
 for trainingFilePrefix in [softTrainingFilePrefix, hardTrainingFilePrefix]:
@@ -16,14 +23,21 @@
         if fileName.startswith(trainingFilePrefixDirless):
             winNum = int(fileName.split("_")[1].split(".")[0])
             if winNum in linkedWins:
-                linkedFilePaths[trainingFilePrefix].append(trainingSetDir + "/" + fileName)
+                linkedFilePaths[trainingFilePrefix].append(
+                    trainingSetDir + "/" + fileName
+                )
             elif winNum in sweepWins:
-                sweepFilePaths[trainingFilePrefix].append(trainingSetDir + "/" + fileName)
+                sweepFilePaths[trainingFilePrefix].append(
+                    trainingSetDir + "/" + fileName
+                )
+
 
 def getExamplesFromFVFile(simFileName):
     try:
-        simFile = open(simFileName,'rt')
-        lines = [line.strip() for line in simFile.readlines() if not "nan" in line]
+        simFile = open(simFileName, "rt")
+        lines = [
+            line.strip() for line in simFile.readlines() if not "nan" in line
+        ]
         header = lines[0]
         examples = lines[1:]
         simFile.close()
@@ -31,6 +45,7 @@ def getExamplesFromFVFile(simFileName):
     except Exception:
         return "", []
 
+
 def getExamplesFromFVFileLs(simFileLs):
     examples = []
     keptHeader = ""
@@ -41,6 +56,7 @@ def getExamplesFromFVFileLs(simFileLs):
         examples += currExamples
     return keptHeader, examples
 
+
 def getMinButNonZeroExamples(lsLs):
     counts = []
     for ls in lsLs:
@@ -50,24 +66,57 @@ def getMinButNonZeroExamples(lsLs):
         raise Exception
     return min(counts)
 
+
 header, neutExamples = getExamplesFromFVFile(neutTrainingFileName)
-linkedSoftHeader, linkedSoftExamples = getExamplesFromFVFileLs(linkedFilePaths[softTrainingFilePrefix])
-softHeader, softExamples = getExamplesFromFVFileLs(sweepFilePaths[softTrainingFilePrefix])
-linkedHardHeader, linkedHardExamples = getExamplesFromFVFileLs(linkedFilePaths[hardTrainingFilePrefix])
-hardHeader, hardExamples = getExamplesFromFVFileLs(sweepFilePaths[hardTrainingFilePrefix])
-trainingSetLs = [linkedSoftExamples, softExamples, linkedHardExamples, hardExamples,neutExamples]
+linkedSoftHeader, linkedSoftExamples = getExamplesFromFVFileLs(
+    linkedFilePaths[softTrainingFilePrefix]
+)
+softHeader, softExamples = getExamplesFromFVFileLs(
+    sweepFilePaths[softTrainingFilePrefix]
+)
+linkedHardHeader, linkedHardExamples = getExamplesFromFVFileLs(
+    linkedFilePaths[hardTrainingFilePrefix]
+)
+hardHeader, hardExamples = getExamplesFromFVFileLs(
+    sweepFilePaths[hardTrainingFilePrefix]
+)
+trainingSetLs = [
+    linkedSoftExamples,
+    softExamples,
+    linkedHardExamples,
+    hardExamples,
+    neutExamples,
+]
 numExamplesToKeep = getMinButNonZeroExamples(trainingSetLs)
 for i in range(len(trainingSetLs)):
     random.shuffle(trainingSetLs[i])
     trainingSetLs[i] = trainingSetLs[i][:numExamplesToKeep]
-linkedSoftExamples, softExamples, linkedHardExamples, hardExamples, neutExamples = trainingSetLs
+(
+    linkedSoftExamples,
+    softExamples,
+    linkedHardExamples,
+    hardExamples,
+    neutExamples,
+) = trainingSetLs
 
-outFileNames = ["neut.fvec", "linkedSoft.fvec", "soft.fvec", "linkedHard.fvec", "hard.fvec"]
-outExamples = [neutExamples, linkedSoftExamples, softExamples, linkedHardExamples, hardExamples]
+outFileNames = [
+    "neut.fvec",
+    "linkedSoft.fvec",
+    "soft.fvec",
+    "linkedHard.fvec",
+    "hard.fvec",
+]
+outExamples = [
+    neutExamples,
+    linkedSoftExamples,
+    softExamples,
+    linkedHardExamples,
+    hardExamples,
+]
 for i in range(len(outFileNames)):
     if outExamples[i]:
-        outFile = open(outDir +"/"+ outFileNames[i], "w")
-        outFile.write(hardHeader+"\n")
+        outFile = open(outDir + "/" + outFileNames[i], "w")
+        outFile.write(hardHeader + "\n")
         for example in outExamples[i]:
-            outFile.write("%s\n" %(example))
+            outFile.write("%s\n" % (example))
         outFile.close()
diff --git a/misc.py b/diploshic/misc.py
similarity index 83%
rename from misc.py
rename to diploshic/misc.py
index 70d378a..79dc084 100644
--- a/misc.py
+++ b/diploshic/misc.py
@@ -1,8 +1,10 @@
 import numpy as np
 from scipy.sparse import coo_matrix
+
 """This is all a bunch of stuff copied from sk-learn 0.24.2 but shoving it in
 here for compatibility purposes. Some slight modifications were made."""
 
+
 class ConfusionMatrixDisplay:
     """Confusion Matrix visualization.
     It is recommend to use :func:`~sklearn.metrics.plot_confusion_matrix` to
@@ -53,13 +55,21 @@ class ConfusionMatrixDisplay:
     ...                               display_labels=clf.classes_)
     >>> disp.plot() # doctest: +SKIP
     """
+
     def __init__(self, confusion_matrix, *, display_labels=None):
         self.confusion_matrix = confusion_matrix
         self.display_labels = display_labels
 
-    def plot(self, *, include_values=True, cmap='viridis',
-             xticks_rotation='horizontal', values_format=None,
-             ax=None, colorbar=True):
+    def plot(
+        self,
+        *,
+        include_values=True,
+        cmap="viridis",
+        xticks_rotation="horizontal",
+        values_format=None,
+        ax=None,
+        colorbar=True
+    ):
         """Plot visualization.
         Parameters
         ----------
@@ -91,7 +101,7 @@ def plot(self, *, include_values=True, cmap='viridis',
 
         cm = self.confusion_matrix
         n_classes = cm.shape[0]
-        self.im_ = ax.imshow(cm, interpolation='nearest', cmap=cmap)
+        self.im_ = ax.imshow(cm, interpolation="nearest", cmap=cmap)
         self.text_ = None
         cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(256)
 
@@ -106,18 +116,17 @@ def plot(self, *, include_values=True, cmap='viridis',
                     color = cmap_max if cm[i, j] < thresh else cmap_min
 
                     if values_format is None:
-                        text_cm = format(cm[i, j], '.2g')
-                        if cm.dtype.kind != 'f':
-                            text_d = format(cm[i, j], 'd')
+                        text_cm = format(cm[i, j], ".2g")
+                        if cm.dtype.kind != "f":
+                            text_d = format(cm[i, j], "d")
                             if len(text_d) < len(text_cm):
                                 text_cm = text_d
                     else:
                         text_cm = format(cm[i, j], values_format)
 
                     self.text_[i, j] = ax.text(
-                        j, i, text_cm,
-                        ha="center", va="center",
-                        color=color)
+                        j, i, text_cm, ha="center", va="center", color=color
+                    )
 
         if self.display_labels is None:
             display_labels = np.arange(n_classes)
@@ -125,12 +134,14 @@ def plot(self, *, include_values=True, cmap='viridis',
             display_labels = self.display_labels
         if colorbar:
             fig.colorbar(self.im_, ax=ax)
-        ax.set(xticks=np.arange(n_classes),
-               yticks=np.arange(n_classes),
-               xticklabels=display_labels,
-               yticklabels=display_labels,
-               ylabel="True label",
-               xlabel="Predicted label")
+        ax.set(
+            xticks=np.arange(n_classes),
+            yticks=np.arange(n_classes),
+            xticklabels=display_labels,
+            yticklabels=display_labels,
+            ylabel="True label",
+            xlabel="Predicted label",
+        )
 
         ax.set_ylim((n_classes - 0.5, -0.5))
         plt.setp(ax.get_xticklabels(), rotation=xticks_rotation)
@@ -140,8 +151,9 @@ def plot(self, *, include_values=True, cmap='viridis',
         return self
 
 
-def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None,
-                     normalize=None):
+def confusion_matrix(
+    y_true, y_pred, *, labels=None, sample_weight=None, normalize=None
+):
     """Compute confusion matrix to evaluate the accuracy of a classification.
     By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
     is equal to the number of observations known to be in group :math:`i` and
@@ -223,15 +235,20 @@ def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None,
     else:
         sample_weight = np.asarray(sample_weight)
 
-    if normalize not in ['true', 'pred', 'all', None]:
-        raise ValueError("normalize must be one of {'true', 'pred', "
-                         "'all', None}")
+    if normalize not in ["true", "pred", "all", None]:
+        raise ValueError(
+            "normalize must be one of {'true', 'pred', " "'all', None}"
+        )
 
     n_labels = labels.size
     label_to_ind = {y: x for x, y in enumerate(labels)}
     # convert yt, yp into index
-    y_pred = np.array([label_to_ind.get(np.argmax(x), n_labels + 1) for x in y_pred])
-    y_true = np.array([label_to_ind.get(np.argmax(x), n_labels + 1) for x in y_true])
+    y_pred = np.array(
+        [label_to_ind.get(np.argmax(x), n_labels + 1) for x in y_pred]
+    )
+    y_true = np.array(
+        [label_to_ind.get(np.argmax(x), n_labels + 1) for x in y_true]
+    )
 
     # intersect y_pred, y_true with labels, eliminate items not in labels
     ind = np.logical_and(y_pred < n_labels, y_true < n_labels)
@@ -241,32 +258,45 @@ def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None,
     sample_weight = sample_weight[ind]
 
     # Choose the accumulator dtype to always have high precision
-    if sample_weight.dtype.kind in {'i', 'u', 'b'}:
+    if sample_weight.dtype.kind in {"i", "u", "b"}:
         dtype = np.int64
     else:
         dtype = np.float64
 
-    cm = coo_matrix((sample_weight, (y_true, y_pred)),
-                    shape=(n_labels, n_labels), dtype=dtype,
-                    ).toarray()
+    cm = coo_matrix(
+        (sample_weight, (y_true, y_pred)),
+        shape=(n_labels, n_labels),
+        dtype=dtype,
+    ).toarray()
 
-    with np.errstate(all='ignore'):
-        if normalize == 'true':
+    with np.errstate(all="ignore"):
+        if normalize == "true":
             cm = cm / cm.sum(axis=1, keepdims=True)
-        elif normalize == 'pred':
+        elif normalize == "pred":
             cm = cm / cm.sum(axis=0, keepdims=True)
-        elif normalize == 'all':
+        elif normalize == "all":
             cm = cm / cm.sum()
         cm = np.nan_to_num(cm)
 
     return cm
 
-def plot_confusion_matrix(estimator, X, y_true, *, labels=None,
-                          sample_weight=None, normalize=None,
-                          display_labels=None, include_values=True,
-                          xticks_rotation='horizontal',
-                          values_format=None,
-                          cmap='viridis', ax=None, colorbar=True):
+
+def plot_confusion_matrix(
+    estimator,
+    X,
+    y_true,
+    *,
+    labels=None,
+    sample_weight=None,
+    normalize=None,
+    display_labels=None,
+    include_values=True,
+    xticks_rotation="horizontal",
+    values_format=None,
+    cmap="viridis",
+    ax=None,
+    colorbar=True
+):
     """Plot Confusion Matrix.
     Read more in the :ref:`User Guide <confusion_matrix>`.
     Parameters
@@ -332,10 +362,21 @@ def plot_confusion_matrix(estimator, X, y_true, *, labels=None,
     """
 
     y_pred = estimator.predict(X)
-    cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight,
-                          labels=labels, normalize=normalize)
-    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
-                                  display_labels=display_labels)
-    return disp.plot(include_values=include_values,
-                     cmap=cmap, ax=ax, xticks_rotation=xticks_rotation,
-                     values_format=values_format, colorbar=colorbar)
+    cm = confusion_matrix(
+        y_true,
+        y_pred,
+        sample_weight=sample_weight,
+        labels=labels,
+        normalize=normalize,
+    )
+    disp = ConfusionMatrixDisplay(
+        confusion_matrix=cm, display_labels=display_labels
+    )
+    return disp.plot(
+        include_values=include_values,
+        cmap=cmap,
+        ax=ax,
+        xticks_rotation=xticks_rotation,
+        values_format=values_format,
+        colorbar=colorbar,
+    )
diff --git a/msTools.py b/diploshic/msTools.py
similarity index 69%
rename from msTools.py
rename to diploshic/msTools.py
index b3aeba2..b3bd7d4 100644
--- a/msTools.py
+++ b/diploshic/msTools.py
@@ -17,7 +17,7 @@ def fillInSnpSlotsWithOverflowers(newPositions, totalPhysLen, overflowers):
         posH[pos] = 1
     for i in range(len(overflowers)):
         del newPositions[-1]
-    for pos in reversed(range(1, totalPhysLen+1)):
+    for pos in reversed(range(1, totalPhysLen + 1)):
         if pos not in posH:
             bisect.insort_left(newPositions, pos)
             overflowers.pop()
@@ -31,14 +31,16 @@ def msPositionsToIntegerPositions(positions, totalPhysLen):
     prevIntPos = -1
     newPositions = []
     for position in positions:
-        assert position >= 0 and position < 1., "Mutations positions must all be in [0, 1)"
+        assert (
+            position >= 0 and position < 1.0
+        ), "Mutations positions must all be in [0, 1)"
         assert position >= prevPos
         origPos = position
         if position == prevPos:
             position += 0.000001
         prevPos = origPos
 
-        intPos = int(totalPhysLen*position)
+        intPos = int(totalPhysLen * position)
         if intPos == 0:
             intPos = 1
         if intPos <= prevIntPos:
@@ -49,13 +51,17 @@ def msPositionsToIntegerPositions(positions, totalPhysLen):
     if overflowers:
         fillInSnpSlotsWithOverflowers(newPositions, totalPhysLen, overflowers)
     assert len(newPositions) == len(positions)
-    assert all(newPositions[i] <= newPositions[i+1]
-               for i in range(len(newPositions)-1))
+    assert all(
+        newPositions[i] <= newPositions[i + 1]
+        for i in range(len(newPositions) - 1)
+    )
     assert newPositions[-1] <= totalPhysLen
     return newPositions
 
 
-def msRepToHaplotypeArrayIn(samples, positions, totalPhysLen, transposeHaps, discretizePositions=True):
+def msRepToHaplotypeArrayIn(
+    samples, positions, totalPhysLen, transposeHaps, discretizePositions=True
+):
     for i in range(len(samples)):
         assert len(samples[i]) == len(positions)
     if discretizePositions:
@@ -72,16 +78,18 @@ def msRepToHaplotypeArrayIn(samples, positions, totalPhysLen, transposeHaps, dis
     return hapArrayIn, positions
 
 
-def msOutToHaplotypeArrayIn(msOutputFileName, totalPhysLen, discretizePositions=True):
+def msOutToHaplotypeArrayIn(
+    msOutputFileName, totalPhysLen, discretizePositions=True
+):
     if msOutputFileName == "stdin":
         isFile = False
         msStream = sys.stdin
     else:
         isFile = True
         if msOutputFileName.endswith(".gz"):
-            msStream = gzip.open(msOutputFileName, 'rt')
+            msStream = gzip.open(msOutputFileName, "rt")
         else:
-            msStream = open(msOutputFileName, 'rt')
+            msStream = open(msOutputFileName, "rt")
 
     header = msStream.readline()
     program, numSamples, numSims = header.strip().split()[:3]
@@ -96,7 +104,9 @@ def msOutToHaplotypeArrayIn(msOutputFileName, totalPhysLen, discretizePositions=
     while line:
         if line.strip() != "//":
             sys.exit(
-                "Malformed ms-style output file: read '%s' instead of '//'. \n" % (line.strip()))  # NOQA
+                "Malformed ms-style output file: read '%s' instead of '//'. \n"
+                % (line.strip())
+            )  # NOQA
         segsitesBlah, segsites = msStream.readline().strip().split()
         segsites = int(segsites)
         if segsitesBlah != "segsites:":
@@ -116,14 +126,26 @@ def msOutToHaplotypeArrayIn(msOutputFileName, totalPhysLen, discretizePositions=
             for i in range(numSamples):
                 sampleLine = msStream.readline().strip()
                 if len(sampleLine) != segsites:
-                    sys.exit("Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples \n" %  # NOQA
-                             (segsites, len(sampleLine), sampleLine, i, numSamples)) # NOQA
+                    sys.exit(
+                        "Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples \n"
+                        % (
+                            segsites,
+                            len(sampleLine),
+                            sampleLine,
+                            i,
+                            numSamples,
+                        )  # NOQA
+                    )  # NOQA
                 samples.append(sampleLine)
             if len(samples) != numSamples:
                 raise Exception
             hapArrayIn, positions = msRepToHaplotypeArrayIn(
-                samples, positions, totalPhysLen, True,
-                discretizePositions=discretizePositions)
+                samples,
+                positions,
+                totalPhysLen,
+                True,
+                discretizePositions=discretizePositions,
+            )
         hapArraysIn.append(hapArrayIn)
         positionArrays.append(positions)
         line = msStream.readline()
@@ -132,8 +154,10 @@ def msOutToHaplotypeArrayIn(msOutputFileName, totalPhysLen, discretizePositions=
             line = msStream.readline()
         # sys.stderr.write("finished rep %d\n" %(len(hapArraysIn)))
     if len(hapArraysIn) != numSims:
-        sys.exit("Malformed ms-style output file: %s of %s sims processed. \n" % # NOQA
-                 (len(hapArraysIn), numSims))
+        sys.exit(
+            "Malformed ms-style output file: %s of %s sims processed. \n"
+            % (len(hapArraysIn), numSims)  # NOQA
+        )
 
     if isFile:
         msStream.close()
@@ -147,7 +171,7 @@ def openMsOutFileForSequentialReading(msOutputFileName):
     else:
         isFile = True
         if msOutputFileName.endswith(".gz"):
-            msStream = gzip.open(msOutputFileName, 'rt')
+            msStream = gzip.open(msOutputFileName, "rt")
         else:
             msStream = open(msOutputFileName)
 
@@ -164,7 +188,13 @@ def closeMsOutFile(fileInfoTuple):
         msStream.close()
 
 
-def readNextMsRepToHaplotypeArrayIn(fileInfoTuple, numSamples, totalPhysLen, transposeHaps=True, discretizePositions=True):
+def readNextMsRepToHaplotypeArrayIn(
+    fileInfoTuple,
+    numSamples,
+    totalPhysLen,
+    transposeHaps=True,
+    discretizePositions=True,
+):
     msStream, isFile = fileInfoTuple
 
     # advance to next simulation
@@ -191,16 +221,37 @@ def readNextMsRepToHaplotypeArrayIn(fileInfoTuple, numSamples, totalPhysLen, tra
         for i in range(numSamples):
             sampleLine = msStream.readline().strip()
             if len(sampleLine) != segsites:
-                sys.exit("Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples \n" %  # NOQA
-                         (segsites, len(sampleLine), sampleLine, i, numSamples)) # NOQA
+                sys.exit(
+                    "Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples \n"
+                    % (
+                        segsites,
+                        len(sampleLine),
+                        sampleLine,
+                        i,
+                        numSamples,
+                    )  # NOQA
+                )  # NOQA
             samples.append(sampleLine)
         if len(samples) != numSamples:
             raise Exception
         hapArrayIn, positions = msRepToHaplotypeArrayIn(
-            samples, positions, totalPhysLen, transposeHaps,
-            discretizePositions=discretizePositions)
+            samples,
+            positions,
+            totalPhysLen,
+            transposeHaps,
+            discretizePositions=discretizePositions,
+        )
 
     return hapArrayIn, positions
 
-def readNextMsRepToGameteStrs(fileInfoTuple, numSamples, totalPhysLen, discretizePositions=True):
-    return readNextMsRepToHaplotypeArrayIn(fileInfoTuple, numSamples, totalPhysLen, transposeHaps=False, discretizePositions=discretizePositions)
+
+def readNextMsRepToGameteStrs(
+    fileInfoTuple, numSamples, totalPhysLen, discretizePositions=True
+):
+    return readNextMsRepToHaplotypeArrayIn(
+        fileInfoTuple,
+        numSamples,
+        totalPhysLen,
+        transposeHaps=False,
+        discretizePositions=discretizePositions,
+    )
diff --git a/diploshic/setup.py b/diploshic/setup.py
new file mode 100644
index 0000000..013d9ed
--- /dev/null
+++ b/diploshic/setup.py
@@ -0,0 +1,14 @@
+# File setup.py
+def configuration(parent_package="", top_path=None):
+    from numpy.distutils.misc_util import Configuration
+
+    config = Configuration("", parent_package, top_path)
+
+    config.add_extension("shicstats", sources=["shicstats.pyf", "utils.c"])
+    return config
+
+
+if __name__ == "__main__":
+    from numpy.distutils.core import setup
+
+    setup(**configuration(top_path="").todict())
diff --git a/shicstats.pyf b/diploshic/shicstats.pyf
similarity index 100%
rename from shicstats.pyf
rename to diploshic/shicstats.pyf
diff --git a/testing/hard.fvec b/diploshic/testing/hard.fvec
similarity index 100%
rename from testing/hard.fvec
rename to diploshic/testing/hard.fvec
diff --git a/testing/linkedHard.fvec b/diploshic/testing/linkedHard.fvec
similarity index 100%
rename from testing/linkedHard.fvec
rename to diploshic/testing/linkedHard.fvec
diff --git a/testing/linkedSoft.fvec b/diploshic/testing/linkedSoft.fvec
similarity index 100%
rename from testing/linkedSoft.fvec
rename to diploshic/testing/linkedSoft.fvec
diff --git a/testing/neut.fvec b/diploshic/testing/neut.fvec
similarity index 100%
rename from testing/neut.fvec
rename to diploshic/testing/neut.fvec
diff --git a/testing/soft.fvec b/diploshic/testing/soft.fvec
similarity index 100%
rename from testing/soft.fvec
rename to diploshic/testing/soft.fvec
diff --git a/training/hard.fvec b/diploshic/training/hard.fvec
similarity index 100%
rename from training/hard.fvec
rename to diploshic/training/hard.fvec
diff --git a/training/linkedHard.fvec b/diploshic/training/linkedHard.fvec
similarity index 100%
rename from training/linkedHard.fvec
rename to diploshic/training/linkedHard.fvec
diff --git a/training/linkedSoft.fvec b/diploshic/training/linkedSoft.fvec
similarity index 100%
rename from training/linkedSoft.fvec
rename to diploshic/training/linkedSoft.fvec
diff --git a/training/neut.fvec b/diploshic/training/neut.fvec
similarity index 100%
rename from training/neut.fvec
rename to diploshic/training/neut.fvec
diff --git a/training/soft.fvec b/diploshic/training/soft.fvec
similarity index 100%
rename from training/soft.fvec
rename to diploshic/training/soft.fvec
diff --git a/utils.c b/diploshic/utils.c
similarity index 100%
rename from utils.c
rename to diploshic/utils.c
diff --git a/makeFeatureVecsForChrArmFromVcfDiploid.py b/makeFeatureVecsForChrArmFromVcfDiploid.py
deleted file mode 100644
index 665df61..0000000
--- a/makeFeatureVecsForChrArmFromVcfDiploid.py
+++ /dev/null
@@ -1,163 +0,0 @@
-import os
-import allel
-import h5py
-import numpy as np
-import sys
-import time
-from fvTools import *
-
-if not len(sys.argv) in [13,15]:
-    sys.exit("usage:\npython makeFeatureVecsForChrArmFromVcfDiploid.py vcfFileName chrArm chrLen targetPop winSize numSubWins maskFileName unmaskedFracCutoff unmaskedGenoFracCutoff sampleToPopFileName statFileName outFileName [segmentStart segmentEnd]\n")
-if len(sys.argv) == 15:
-    vcfFileName, chrArm, chrLen, targetPop, winSize, numSubWins, maskFileName, unmaskedFracCutoff, unmaskedGenoFracCutoff, sampleToPopFileName, statFileName, outfn, segmentStart, segmentEnd = sys.argv[1:]
-    segmentStart, segmentEnd = int(segmentStart), int(segmentEnd)
-else:
-    vcfFileName, chrArm, chrLen, targetPop, winSize, numSubWins, maskFileName, unmaskedFracCutoff, unmaskedGenoFracCutoff, sampleToPopFileName, statFileName, outfn = sys.argv[1:]
-    segmentStart = None
-
-unmaskedFracCutoff = float(unmaskedFracCutoff)
-if unmaskedFracCutoff < 0.0 or unmaskedFracCutoff > 1.0:
-    sys.exit("unmaskedFracCutoff=%s but must be within [0, 1]. AAAAARRRRGHHHHHH!!!\n" %(unmaskedFracCutoff))
-unmaskedGenoFracCutoff = float(unmaskedGenoFracCutoff)
-if unmaskedGenoFracCutoff < 0.0 or unmaskedGenoFracCutoff > 1.0:
-    sys.exit("unmaskedGenoFracCutoff=%s but must be within [0, 1]. AAAAARRRRGHHHHHH!!!\n" %(unmaskedGenoFracCutoff))
-chrLen, winSize, numSubWins = int(chrLen), int(winSize), int(numSubWins)
-assert winSize % numSubWins == 0 and numSubWins > 1
-subWinSize = int(winSize/numSubWins)
-
-def getSubWinBounds(chrLen, subWinSize):
-    lastSubWinEnd = chrLen - chrLen % subWinSize
-    lastSubWinStart = lastSubWinEnd - subWinSize + 1
-    subWinBounds = []
-    for subWinStart in range(1, lastSubWinStart+1, subWinSize):
-        subWinEnd = subWinStart + subWinSize - 1
-        subWinBounds.append((subWinStart, subWinEnd))
-    return subWinBounds
-
-def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
-    subWinStart = 1
-    subWinEnd = subWinStart + subWinSize - 1
-    snpIndicesInSubWins = [[]]
-    for i in range(len(snpLocs)):
-        while snpLocs[i] <= lastSubWinEnd and not (snpLocs[i] >= subWinStart and snpLocs[i] <= subWinEnd):
-            subWinStart += subWinSize
-            subWinEnd += subWinSize
-            snpIndicesInSubWins.append([])
-        if snpLocs[i] <= lastSubWinEnd:
-            snpIndicesInSubWins[-1].append(i)
-    while subWinEnd < lastSubWinEnd:
-        snpIndicesInSubWins.append([])
-        subWinStart += subWinSize
-        subWinEnd += subWinSize
-    return snpIndicesInSubWins
-
-def readSampleToPopFile(sampleToPopFileName):
-    table = {}
-    with open(sampleToPopFileName) as sampleToPopFile:
-        for line in sampleToPopFile:
-            sample, pop = line.strip().split()
-            table[sample] = pop
-    return table
-
-vcfFile = allel.read_vcf(vcfFileName)
-chroms = vcfFile["variants/CHROM"]
-positions = np.extract(chroms == chrArm, vcfFile["variants/POS"])
-
-if maskFileName.lower() in ["none", "false"]:
-    sys.stderr.write("Warning: a mask.fa file for the chr arm with all masked sites N'ed out is strongly recommended" +
-        " (pass in the reference to remove Ns at the very least)!\n")
-    unmasked = [True] * chrLen
-else:
-    unmasked = readMaskDataForScan(maskFileName, chrArm)
-    assert len(unmasked) == chrLen
-
-if statFileName.lower() in ["none", "false"]:
-    statFileName = None
-
-samples = vcfFile["samples"]
-if not sampleToPopFileName.lower() in ["none", "false"]:
-    sampleToPop = readSampleToPopFile(sampleToPopFileName)
-    sampleIndicesToKeep = [i for i in range(len(samples)) if sampleToPop.get(samples[i], "popNotFound!") == targetPop]
-else:
-    sampleIndicesToKeep = [i for i in range(len(samples))]
-rawgenos = np.take(vcfFile["calldata/GT"], [i for i in range(len(chroms)) if chroms[i] == chrArm], axis=0)
-genos = allel.GenotypeArray(rawgenos).subset(sel1=sampleIndicesToKeep)
-
-if segmentStart != None:
-    snpIndicesToKeep = [i for i in range(len(positions)) if segmentStart <= positions[i] <= segmentEnd]
-    positions = [positions[i] for i in snpIndicesToKeep]
-    genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep))
-
-if isHaploidVcfGenoArray(genos):
-    sys.stderr.write("Detected haploid input. Converting into diploid individuals (combining haplotypes in order).\n")
-    genos = diploidizeGenotypeArray(genos)
-
-alleleCounts = genos.count_alleles()
-
-#remove all but mono/biallelic unmasked sites
-isBiallelic = alleleCounts.is_biallelic()
-for i in range(len(isBiallelic)):
-    if not (isBiallelic[i] and calledGenoFracAtSite(genos[i]) >= unmaskedGenoFracCutoff):
-        unmasked[positions[i]-1] = False
-snpIndicesToKeep = [i for i in range(len(positions)) if unmasked[positions[i]-1]]
-genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep))
-positions = [positions[i] for i in snpIndicesToKeep]
-alleleCounts = allel.AlleleCountsArray([[alleleCounts[i][0], max(alleleCounts[i][1:])] for i in snpIndicesToKeep])
-
-statNames = ["pi", "thetaW", "tajD", "distVar","distSkew","distKurt","nDiplos","diplo_H1","diplo_H12","diplo_H2/H1","diplo_ZnS","diplo_Omega"]
-
-subWinBounds = getSubWinBounds(chrLen, subWinSize)
-
-header = "chrom classifiedWinStart classifiedWinEnd bigWinRange".split()
-statHeader = "chrom start end".split()
-for statName in statNames:
-    statHeader.append(statName)
-    for i in range(numSubWins):
-        header.append("%s_win%d" %(statName, i))
-statHeader = "\t".join(statHeader)
-header = "\t".join(header)
-outFile=open(outfn,'w')
-outFile.write(header+"\n")
-statVals = {}
-for statName in statNames:
-    statVals[statName] = []
-
-startTime = time.perf_counter()
-goodSubWins = []
-lastSubWinEnd = chrLen - chrLen % subWinSize
-snpIndicesInSubWins = getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, positions)
-subWinIndex = 0
-lastSubWinStart = lastSubWinEnd - subWinSize + 1
-if statFileName:
-    statFile = open(statFileName, "w")
-    statFile.write(statHeader + "\n")
-for subWinStart in range(1, lastSubWinStart+1, subWinSize):
-    subWinEnd = subWinStart + subWinSize - 1
-    unmaskedFrac = unmasked[subWinStart-1:subWinEnd].count(True)/float(subWinEnd-subWinStart+1)
-    if segmentStart == None or subWinStart >= segmentStart and subWinEnd <= segmentEnd:
-        sys.stderr.write("%d-%d num unmasked snps: %d; unmasked frac: %f\n" %(subWinStart, subWinEnd, len(snpIndicesInSubWins[subWinIndex]), unmaskedFrac))
-    if len(snpIndicesInSubWins[subWinIndex]) > 0 and unmaskedFrac >= unmaskedFracCutoff:
-        genosInSubWin = allel.GenotypeArray(genos.subset(sel0=snpIndicesInSubWins[subWinIndex]))
-        statValStr = []
-        for statName in statNames:
-            calcAndAppendStatValForScanDiplo(alleleCounts, positions, statName, subWinStart, subWinEnd, statVals, subWinIndex, genosInSubWin, unmasked)
-        goodSubWins.append(True)
-        if statFileName:
-            statFile.write("\t".join([chrArm, str(subWinStart), str(subWinEnd)] + [str(statVals[statName][-1]) for statName in statNames]) + "\n")
-    else:
-        for statName in statNames:
-            appendStatValsForMonomorphicForScan(statName, statVals, subWinIndex)
-        goodSubWins.append(False)
-    if goodSubWins[-numSubWins:].count(True) == numSubWins:
-        outVec = []
-        for statName in statNames:
-            outVec += normalizeFeatureVec(statVals[statName][-numSubWins:])
-        midSubWinEnd = subWinEnd - subWinSize*(numSubWins/2)
-        midSubWinStart = midSubWinEnd-subWinSize+1
-        outFile.write("%s\t%d\t%d\t%d-%d\t" %(chrArm, midSubWinStart, midSubWinEnd, subWinEnd-winSize+1, subWinEnd) + "\t".join([str(x) for x in outVec]))
-        outFile.write('\n')
-    subWinIndex += 1
-if statFileName:
-    statFile.close()
-outFile.close()
-sys.stderr.write("completed in %g seconds\n" %(time.perf_counter()-startTime))
diff --git a/makeFeatureVecsForChrArmFromVcf_ogSHIC.py b/makeFeatureVecsForChrArmFromVcf_ogSHIC.py
deleted file mode 100644
index 2abceef..0000000
--- a/makeFeatureVecsForChrArmFromVcf_ogSHIC.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import os
-import allel
-import h5py
-import numpy as np
-import sys
-import time
-from fvTools import *
-
-if not len(sys.argv) in [13,15]:
-    sys.exit("usage:\npython makeFeatureVecsForChrArmFromVcf_ogSHIC.py chrArmFileName chrArm chrLen targetPop winSize numSubWins maskFileName sampleToPopFileName ancestralArmFaFileName statFileName outFileName [segmentStart segmentEnd]\n")
-if len(sys.argv) == 15:
-    chrArmFileName, chrArm, chrLen, targetPop, winSize, numSubWins, maskFileName, unmaskedFracCutoff, sampleToPopFileName, ancestralArmFaFileName, statFileName, outfn, segmentStart, segmentEnd = sys.argv[1:]
-    segmentStart, segmentEnd = int(segmentStart), int(segmentEnd)
-else:
-    chrArmFileName, chrArm, chrLen, targetPop, winSize, numSubWins, maskFileName, unmaskedFracCutoff, sampleToPopFileName, ancestralArmFaFileName, statFileName, outfn = sys.argv[1:]
-    segmentStart = None
-
-unmaskedFracCutoff = float(unmaskedFracCutoff)
-chrLen, winSize, numSubWins = int(chrLen), int(winSize), int(numSubWins)
-assert winSize % numSubWins == 0 and numSubWins > 1
-subWinSize = int(winSize/numSubWins)
-
-def getSubWinBounds(chrLen, subWinSize):
-    lastSubWinEnd = chrLen - chrLen % subWinSize
-    lastSubWinStart = lastSubWinEnd - subWinSize + 1
-    subWinBounds = []
-    for subWinStart in range(1, lastSubWinStart+1, subWinSize):
-        subWinEnd = subWinStart + subWinSize - 1
-        subWinBounds.append((subWinStart, subWinEnd))
-    return subWinBounds
-
-def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
-    subWinStart = 1
-    subWinEnd = subWinStart + subWinSize - 1
-    snpIndicesInSubWins = [[]]
-    for i in range(len(snpLocs)):
-        while snpLocs[i] <= lastSubWinEnd and not (snpLocs[i] >= subWinStart and snpLocs[i] <= subWinEnd):
-            subWinStart += subWinSize
-            subWinEnd += subWinSize
-            snpIndicesInSubWins.append([])
-        if snpLocs[i] <= lastSubWinEnd:
-            snpIndicesInSubWins[-1].append(i)
-    while subWinEnd < lastSubWinEnd:
-        snpIndicesInSubWins.append([])
-        subWinStart += subWinSize
-        subWinEnd += subWinSize
-    return snpIndicesInSubWins
-
-chrArmFile = allel.read_vcf(chrArmFileName)
-chroms = chrArmFile["variants/CHROM"]
-positions = np.extract(chroms == chrArm, chrArmFile["variants/POS"])
-
-if maskFileName.lower() in ["none", "false"]:
-    sys.stderr.write("Warning: a mask.fa file for the chr arm with all masked sites N'ed out is strongly recommended" +
-        " (pass in the reference to remove Ns at the very least)!\n")
-    unmasked = [True] * chrLen
-else:
-    unmasked = readMaskDataForScan(maskFileName, chrArm)
-    assert len(unmasked) == chrLen
-
-if statFileName.lower() in ["none", "false"]:
-    statFileName = None
-
-samples = chrArmFile["samples"]
-if not sampleToPopFileName.lower() in ["none", "false"]:
-    sampleToPop = readSampleToPopFile(sampleToPopFileName)
-    sampleIndicesToKeep = [i for i in range(len(samples)) if sampleToPop.get(samples[i], "popNotFound!") == targetPop]
-else:
-    sampleIndicesToKeep = [i for i in range(len(samples))]
-
-rawgenos = np.take(chrArmFile["calldata/GT"], [i for i in range(len(chroms)) if chroms[i] == chrArm], axis=0)
-genos = allel.GenotypeArray(rawgenos)
-refAlleles = np.extract(chroms == chrArm, chrArmFile['variants/REF'])
-altAlleles = np.extract(chroms == chrArm, chrArmFile['variants/ALT'])
-if segmentStart != None:
-    snpIndicesToKeep = [i for i in range(len(positions)) if segmentStart <= positions[i] <= segmentEnd]
-    positions = [positions[i] for i in snpIndicesToKeep]
-    refAlleles = [refAlleles[i] for i in snpIndicesToKeep]
-    altAlleles = [altAlleles[i] for i in snpIndicesToKeep]
-    genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep))
-genos = allel.GenotypeArray(genos.subset(sel1=sampleIndicesToKeep))
-alleleCounts = genos.count_alleles()
-
-#remove all but mono/biallelic unmasked sites
-isBiallelic = alleleCounts.is_biallelic()
-for i in range(len(isBiallelic)):
-    if not isBiallelic[i]:
-        unmasked[positions[i]-1] = False
-
-#polarize
-if not ancestralArmFaFileName.lower() in ["none", "false"]:
-    sys.stderr.write("polarizing snps\n")
-    ancArm = readFaArm(ancestralArmFaFileName, chrArm).upper()
-    startTime = time.perf_counter()
-    #NOTE: mapping specifies which alleles to swap counts for based on polarization; leaves unpolarized snps alone
-    #NOTE: those snps need to be filtered later on (as done below)!
-    # this will also remove sites that could not be polarized
-    mapping, unmasked = polarizeSnps(unmasked, positions, refAlleles, altAlleles, ancArm)
-    sys.stderr.write("took %s seconds\n" %(time.perf_counter()-startTime))
-    statNames = ["pi", "thetaW", "tajD", "thetaH", "fayWuH", "maxFDA", "HapCount", "H1", "H12", "H2/H1", "ZnS", "Omega", "distVar", "distSkew", "distKurt"]
-else:
-    statNames = ["pi", "thetaW", "tajD", "HapCount", "H1", "H12", "H2/H1", "ZnS", "Omega", "distVar", "distSkew", "distKurt"]
-
-snpIndicesToKeep = [i for i in range(len(positions)) if unmasked[positions[i]-1]]
-genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep))
-positions = [positions[i] for i in snpIndicesToKeep]
-alleleCounts = allel.AlleleCountsArray([[alleleCounts[i][0], max(alleleCounts[i][1:])] for i in snpIndicesToKeep])
-if not ancestralArmFaFileName.lower() in ["none", "false"]:
-    mapping = [mapping[i] for i in snpIndicesToKeep]
-    alleleCounts = alleleCounts.map_alleles(mapping)
-haps = genos.to_haplotypes()
-
-subWinBounds = getSubWinBounds(chrLen, subWinSize)
-precomputedStats = {} #not using this
-
-header = "chrom classifiedWinStart classifiedWinEnd bigWinRange".split()
-statHeader = "chrom start end".split()
-for statName in statNames:
-    statHeader.append(statName)
-    for i in range(numSubWins):
-        header.append("%s_win%d" %(statName, i))
-statHeader = "\t".join(statHeader)
-header = "\t".join(header)
-outFile=open(outfn,'w')
-outFile.write(header+"\n")
-statVals = {}
-for statName in statNames:
-    statVals[statName] = []
-
-startTime = time.perf_counter()
-goodSubWins = []
-lastSubWinEnd = chrLen - chrLen % subWinSize
-snpIndicesInSubWins = getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, positions)
-subWinIndex = 0
-lastSubWinStart = lastSubWinEnd - subWinSize + 1
-if statFileName:
-    statFile = open(statFileName, "w")
-    statFile.write(statHeader + "\n")
-for subWinStart in range(1, lastSubWinStart+1, subWinSize):
-    subWinEnd = subWinStart + subWinSize - 1
-    unmaskedFrac = unmasked[subWinStart-1:subWinEnd].count(True)/float(subWinEnd-subWinStart+1)
-    if segmentStart == None or subWinStart >= segmentStart and subWinEnd <= segmentEnd:
-        sys.stderr.write("%d-%d num unmasked snps: %d; unmasked frac: %f\n" %(subWinStart, subWinEnd, len(snpIndicesInSubWins[subWinIndex]), unmaskedFrac))
-    if len(snpIndicesInSubWins[subWinIndex]) > 0 and unmaskedFrac >= unmaskedFracCutoff:
-        hapsInSubWin = allel.HaplotypeArray(haps.subset(sel0=snpIndicesInSubWins[subWinIndex]))
-        statValStr = []
-        for statName in statNames:
-            calcAndAppendStatValForScan(alleleCounts, positions, statName, subWinStart, \
-                 subWinEnd, statVals, subWinIndex, hapsInSubWin, unmasked, precomputedStats)
-            statValStr.append("%s: %s" %(statName, statVals[statName][-1]))
-        sys.stderr.write("\t".join(statValStr) + "\n")
-        goodSubWins.append(True)
-        if statFileName:
-            statFile.write("\t".join([chrArm, str(subWinStart), str(subWinEnd)] + [str(statVals[statName][-1]) for statName in statNames]) + "\n")
-    else:
-        for statName in statNames:
-            appendStatValsForMonomorphicForScan(statName, statVals, subWinIndex)
-        goodSubWins.append(False)
-    if goodSubWins[-numSubWins:].count(True) == numSubWins:
-        outVec = []
-        for statName in statNames:
-            outVec += normalizeFeatureVec(statVals[statName][-numSubWins:])
-        midSubWinEnd = subWinEnd - subWinSize*(numSubWins/2)
-        midSubWinStart = midSubWinEnd-subWinSize+1
-        outFile.write("%s\t%d\t%d\t%d-%d\t" %(chrArm, midSubWinStart, midSubWinEnd, subWinEnd-winSize+1, subWinEnd) + "\t".join([str(x) for x in outVec]))
-        outFile.write('\n')
-    subWinIndex += 1
-if statFileName:
-    statFile.close()
-outFile.close()
-sys.stderr.write("completed in %g seconds\n" %(time.perf_counter()-startTime))
diff --git a/makeFeatureVecsForSingleMsDiploid.py b/makeFeatureVecsForSingleMsDiploid.py
deleted file mode 100644
index e1000f0..0000000
--- a/makeFeatureVecsForSingleMsDiploid.py
+++ /dev/null
@@ -1,193 +0,0 @@
-import sys, os
-import allel
-import random
-import numpy as np
-from msTools import *
-from fvTools import *
-import time
-
-trainingDataFileName, totalPhysLen, numSubWins, maskFileName, vcfForMaskFileName, popForMask, sampleToPopFileName, unmaskedGenoFracCutoff, chrArmsForMasking, unmaskedFracCutoff, outStatsDir, fvecFileName = sys.argv[1:]
-totalPhysLen = int(totalPhysLen)
-numSubWins = int(numSubWins)
-subWinLen = totalPhysLen//numSubWins
-assert totalPhysLen % numSubWins == 0 and numSubWins > 1
-
-sys.stderr.write("file name='%s'" %(trainingDataFileName))
-
-trainingDataFileObj, sampleSize, numInstances = openMsOutFileForSequentialReading(trainingDataFileName)
-
-if maskFileName.lower() in ["none", "false"]:
-    sys.stderr.write("maskFileName='%s': not masking any sites!\n" %(maskFileName))
-    maskFileName = False
-    unmaskedFracCutoff = 1.0
-else:
-    chrArmsForMasking = chrArmsForMasking.split(",")
-    unmaskedFracCutoff = float(unmaskedFracCutoff)
-    if unmaskedFracCutoff > 1.0 or unmaskedFracCutoff < 0.0:
-        sys.exit("unmaskedFracCutoff must lie within [0, 1]. AAARRRRGGGGHHHHH!!!!\n")
-
-if vcfForMaskFileName.lower() in ["none", "false"]:
-    sys.stderr.write("vcfForMaskFileName='%s': not masking any genotypes!\n" %(vcfForMaskFileName))
-    vcfForMaskFileName = False
-else:
-    if not maskFileName:
-        sys.exit("Cannot mask genotypes without also supplying a file for masking entire sites (can use reference genome with Ns if desired). AAARRRGHHHHH!!!!!!\n")
-    if sampleToPopFileName.lower() in ["none", "false"] or popForMask.lower() in ["none", "false"]:
-        sampleToPopFileName = None
-        sys.stderr.write("No sampleToPopFileName specified. Using all individuals for masking genotypes.\n")
-    unmaskedGenoFracCutoff = float(unmaskedGenoFracCutoff)
-    if unmaskedGenoFracCutoff > 1.0 or unmaskedGenoFracCutoff < 0.0:
-        sys.exit("unmaskedGenoFracCutoff must lie within [0, 1]. AAARRRRGGGGHHHHH!!!!\n")
-
-def getSubWinBounds(subWinLen, totalPhysLen): # get inclusive subwin bounds
-    subWinStart = 1
-    subWinEnd = subWinStart + subWinLen - 1
-    subWinBounds = [(subWinStart, subWinEnd)]
-    numSubWins = totalPhysLen//subWinLen
-    for i in range(1, numSubWins-1):
-        subWinStart += subWinLen
-        subWinEnd += subWinLen
-        subWinBounds.append((subWinStart, subWinEnd))
-    subWinStart += subWinLen
-    # if our subwindows are 1 bp too short due to rounding error, the last window picks up all of the slack
-    subWinEnd = totalPhysLen
-    subWinBounds.append((subWinStart, subWinEnd))
-    return subWinBounds
-
-if not maskFileName:
-    unmasked = [True] * totalPhysLen
-else:
-    drawWithReplacement = False
-    sys.stderr.write("reading masking data...")
-    maskData = readMaskDataForTraining(maskFileName, totalPhysLen, subWinLen, chrArmsForMasking, shuffle=True, cutoff=unmaskedFracCutoff,
-                                                     genoCutoff=unmaskedGenoFracCutoff, vcfForMaskFileName=vcfForMaskFileName, pop=popForMask,
-                                                     sampleToPopFileName=sampleToPopFileName)
-    if vcfForMaskFileName:
-        maskData, genoMaskData = maskData
-    else:
-        genoMaskData = [None]*len(maskData)
-    sys.stderr.write("done!\n")
-
-    if len(maskData) < numInstances:
-        sys.stderr.write("Warning: didn't get enough windows from masked data (needed %d; got %d); will draw with replacement!!\n" %(numInstances, len(maskData)))
-        drawWithReplacement = True
-    else:
-        sys.stderr.write("Got enough windows from masked data (needed %d; got %d); will draw without replacement.\n" %(numInstances, len(maskData)))
-
-def getSnpIndicesInSubWins(subWinBounds, snpLocs):
-    snpIndicesInSubWins = []
-    for subWinIndex in range(len(subWinBounds)):
-        snpIndicesInSubWins.append([])
-
-    subWinIndex = 0
-    for i in range(len(snpLocs)):
-        while not (snpLocs[i] >= subWinBounds[subWinIndex][0] and snpLocs[i] <= subWinBounds[subWinIndex][1]):
-            subWinIndex += 1
-        snpIndicesInSubWins[subWinIndex].append(i)
-    return snpIndicesInSubWins
-
-subWinBounds = getSubWinBounds(subWinLen, totalPhysLen)
-#statNames = ["pi", "thetaW", "tajD", "nDiplos","diplo_H2","diplo_H12","diplo_H2/H1","diplo_ZnS","diplo_Omega"]
-statNames = ["pi", "thetaW", "tajD", "distVar","distSkew","distKurt","nDiplos","diplo_H1","diplo_H12","diplo_H2/H1","diplo_ZnS","diplo_Omega"]
-header = []
-for statName in statNames:
-    for i in range(numSubWins):
-        header.append("%s_win%d" %(statName, i))
-header = "\t".join(header)
-
-
-statVals = {}
-for statName in statNames:
-    statVals[statName] = []
-start = time.perf_counter()
-numInstancesDone = 0
-sys.stderr.write("ready to process sim reps. here we go!\n")
-for instanceIndex in range(numInstances):
-    sys.stderr.write("starting rep %d of %d\n" %(instanceIndex, numInstances))
-    hapArrayIn, positionArray = readNextMsRepToHaplotypeArrayIn(trainingDataFileObj, sampleSize, totalPhysLen)
-
-    haps = allel.HaplotypeArray(hapArrayIn, dtype='i1')
-    if maskFileName:
-        if drawWithReplacement:
-            randIndex = random.randint(0, len(maskData)-1)
-            unmasked, genoMasks = maskData[randIndex], genoMaskData[randIndex]
-        else:
-            unmasked, genoMasks = maskData[instanceIndex], genoMaskData[instanceIndex]
-        assert len(unmasked) == totalPhysLen
-    if haps.shape[1] % 2 == 1:
-        haps = haps[:,:-1]
-    genos = haps.to_genotypes(ploidy=2)
-    unmaskedSnpIndices = [i for i in range(len(positionArray)) if unmasked[positionArray[i]-1]]
-    if len(unmaskedSnpIndices) == 0:
-        sys.stderr.write("no snps for rep %d\n" %(instanceIndex))
-        for statName in statNames:
-            statVals[statName].append([])
-        for subWinIndex in range(numSubWins):
-            for statName in statNames:
-                appendStatValsForMonomorphic(statName, statVals, instanceIndex, subWinIndex)
-    else:
-        sys.stderr.write("processing snps for rep %d\n" %(instanceIndex))
-        if maskFileName:
-            preMaskCount = np.sum(genos.count_alleles())
-            if genoMasks:
-                sys.stderr.write("%d snps in the masking window for rep %d\n" %(len(genoMasks), instanceIndex))
-                genos = maskGenos(genos.subset(sel0=unmaskedSnpIndices), genoMasks)
-            else:
-                genos = genos.subset(sel0=unmaskedSnpIndices)
-            alleleCountsUnmaskedOnly = genos.count_alleles()
-            maskedCount = preMaskCount - np.sum(alleleCountsUnmaskedOnly)
-            sys.stderr.write("%d of %d genotypes (%.2f%%) masked for rep %d\n" %(maskedCount, preMaskCount, 100*maskedCount/preMaskCount, instanceIndex))
-        else:
-            alleleCountsUnmaskedOnly = genos.count_alleles()
-        positionArrayUnmaskedOnly = [positionArray[i] for i in unmaskedSnpIndices]
-        snpIndicesInSubWins = getSnpIndicesInSubWins(subWinBounds, positionArrayUnmaskedOnly)
-        for statName in statNames:
-            statVals[statName].append([])
-        for subWinIndex in range(numSubWins):
-            subWinStart, subWinEnd = subWinBounds[subWinIndex]
-            unmaskedFrac = unmasked[subWinStart-1:subWinEnd].count(True)/float(subWinLen)
-            assert unmaskedFrac >= unmaskedFracCutoff
-            snpIndicesInSubWinUnmasked = snpIndicesInSubWins[subWinIndex]
-            sys.stderr.write("examining subwindow %d which has %d unmasked SNPs\n" %(subWinIndex, len(snpIndicesInSubWinUnmasked)))
-            if len(snpIndicesInSubWinUnmasked) > 0:
-                genosInSubWin = genos.subset(sel0=snpIndicesInSubWinUnmasked)
-                for statName in statNames:
-                    calcAndAppendStatValDiplo(alleleCountsUnmaskedOnly, positionArrayUnmaskedOnly, statName, subWinStart, \
-                                 subWinEnd, statVals, instanceIndex, subWinIndex, genosInSubWin, unmasked)
-            else:
-                for statName in statNames:
-                    appendStatValsForMonomorphic(statName, statVals, instanceIndex, subWinIndex)
-    numInstancesDone += 1
-    sys.stderr.write("finished %d reps after %f seconds\n" %(numInstancesDone, time.perf_counter()-start))
-
-if numInstancesDone != numInstances:
-    sys.exit("Expected %d reps but only processed %d. Perhaps we are using malformed simulation output!\n" %(numInstancesDone, numInstances))
-
-statFiles = []
-if outStatsDir.lower() != "none":
-    for subWinIndex in range(numSubWins):
-        statFileName = "%s/%s.%d.stats" %(outStatsDir, trainingDataFileName.split("/")[-1].rstrip(".gz"), subWinIndex)
-        statFiles.append(open(statFileName, "w"))
-        statFiles[-1].write("\t".join(statNames) + "\n")
-with open(fvecFileName, "w") as fvecFile:
-    fvecFile.write(header + "\n")
-    for i in range(numInstancesDone):
-        statLines = []
-        for subWinIndex in range(numSubWins):
-            statLines.append([])
-        outVec = []
-        for statName in statNames:
-            outVec += normalizeFeatureVec(statVals[statName][i])
-            for subWinIndex in range(numSubWins):
-                statLines[subWinIndex].append(statVals[statName][i][subWinIndex])
-        if statFiles:
-            for subWinIndex in range(numSubWins):
-                statFiles[subWinIndex].write("\t".join([str(x) for x in statLines[subWinIndex]]) + "\n")
-        fvecFile.write("\t".join([str(x) for x in outVec]) + "\n")
-
-if statFiles:
-    for subWinIndex in range(numSubWins):
-        statFiles[subWinIndex].close()
-
-sys.stderr.write("total time spent calculating summary statistics and generating feature vectors: %f secs\n" %(time.perf_counter()-start))
-closeMsOutFile(trainingDataFileObj)
diff --git a/meta.yaml b/meta.yaml
new file mode 100644
index 0000000..db68a76
--- /dev/null
+++ b/meta.yaml
@@ -0,0 +1,27 @@
+package:
+ name: diploshic
+ version: "0.33333"
+ 
+source:
+ git_url: https://github.com/kr-colab/diploshic
+ git_rev: 
+ 
+build:
+ number: 0
+ 
+Requirements:
+ Build:
+   - numpy
+ run:
+   - numpy
+   - scipy
+   - matplotlib
+   - pandas
+   - scikit-allel
+   - scikit-learn
+   - tensorflow
+   - keras
+ 
+about:
+ home: https://github.com/kr-colab/diploshic
+ license: MIT
diff --git a/setup.py b/setup.py
index baeebd0..0284f3e 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,40 @@
-# File setup.py
-def configuration(parent_package='',top_path=None):
-    from numpy.distutils.misc_util import Configuration
-    config = Configuration('',parent_package,top_path)
+from numpy.distutils.core import Extension, setup
 
-    config.add_extension('shicstats',
-                         sources = ['shicstats.pyf','utils.c'])
-    return config
-if __name__ == "__main__":
-    from numpy.distutils.core import setup
-    setup(**configuration(top_path='').todict())
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+
+shic_stats = Extension("diploshic.shicstats",
+                       sources=["diploshic/shicstats.pyf",
+                                "diploshic/utils.c"],
+                      )
+setup(name='diploSHIC',
+      version='0.33333',
+      description='diploS/HIC',
+      long_description=long_description,
+      long_description_content_type="text/markdown",
+      url='https://github.com/kr-colab/diploSHIC',
+      author='Andrew Kern',
+      author_email='adkern@uoregon.edu',
+      license='MIT',      
+      packages=['diploshic'],
+      install_requires=['numpy',
+                        'scipy',
+                        'matplotlib',
+                        'pandas',
+                        'scikit-allel',
+                        'scikit-learn',
+                        'tensorflow',
+                        'keras'],
+      scripts=['diploshic/diploSHIC',
+              'diploshic/makeFeatureVecsForChrArmFromVcfDiploid.py',
+              'diploshic/makeFeatureVecsForChrArmFromVcf_ogSHIC.py',
+              'diploshic/makeFeatureVecsForSingleMsDiploid.py',
+              'diploshic/makeFeatureVecsForSingleMs_ogSHIC.py',
+              'diploshic/makeTrainingSets.py'],
+      zip_safe=False,
+      extras_require={
+          'dev': [],
+      },
+      setup_requires=[],
+      ext_modules=[shic_stats]
+)

From 7b166437dd8dfa95b1ad83aa7dba3ea6fc799c26 Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Tue, 23 Nov 2021 13:07:07 -0800
Subject: [PATCH 02/10] working on the readme adjustments

---
 README.md | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index b5abb43..7267c7d 100644
--- a/README.md
+++ b/README.md
@@ -22,33 +22,26 @@ such as `conda` or `pip`. The complete list of dependencies looks like this:
 
 ## Install on linux
 I'm going to focus on the steps involved to install on a linux machine using Anaconda as our python source / main
-package manager. First download and install Anaconda
+package manager. Assuming you have conda installed, create a new conda env
 
 ```
-$ wget https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh
-$ bash Anaconda3-5.0.1-Linux-x86_64.sh
-```
-That will give us the basics (numpy, scipy, scikit-learn, etc). Next lets install scikit-allel using `conda`
-```
-$ conda install -c conda-forge scikit-allel
-```
-That's easy. Installing tensorflow and keras can be slightly more touchy. You will need to determine if 
-you want to use a CPU-only implementation (probably) or a GPU implementation of tensorflow. See
-https://www.tensorflow.org/install/install_linux for install instructions. I'm going to install the
-CPU version for simplicity. tensorflow and keras are the libraries which handle the deep learning
-portion of things so it is important to make sure the versions of these libraries play well together 
-```
-$ pip install tensorflow 
-$ pip install keras
+$ conda create -n diploshic python=3.9 --yes
 ```
+
 Note that because I'm using the Anaconda version of python, pip will only install this in the anaconda directory
-which is a good thing. Okay that should be the basics of dependencies. Now we are ready to install `diploS/HIC` itself
+which is a good thing. Now we are ready to install `diploS/HIC` itself
+
 ```
 $ git clone https://github.com/kern-lab/diploSHIC.git
 $ cd diploSHIC 
-$ python setup.py install
+$ pip install .
 ```
-Assuming all the dependencies were installed this should be all set
+
+This should automatically install all the dependencies including tensorflow.
+You will need to determine if 
+you want to use a CPU-only implementation (probably) or a GPU implementation of tensorflow. See
+https://www.tensorflow.org/install/install_linux for install instructions. 
+
 
 ## Usage
 The main program that you will interface with is `diploSHIC.py`. This script has four run modes that allow the user to 

From 611f58ec2cdfefd48321c91c92ddf2b8fea7be00 Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Tue, 23 Nov 2021 13:11:24 -0800
Subject: [PATCH 03/10] updated readme

---
 README.md | 48 +++++++++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 7267c7d..44a2c55 100644
--- a/README.md
+++ b/README.md
@@ -44,19 +44,21 @@ https://www.tensorflow.org/install/install_linux for install instructions.
 
 
 ## Usage
-The main program that you will interface with is `diploSHIC.py`. This script has four run modes that allow the user to 
+The main program that you will interface with is `diploSHIC`. This script is installed by default
+in the conda environment `bin` directory.
+ This script has four run modes that allow the user to 
 perform each of the main steps in the supervised machine learning process. We will briefly lay out the modes of use
 and then will provide a complete example of how to use the program for fun and profit.
 
-`diploSHIC.py` uses the `argparse` module in python to try to give the user a complete, command line based help menu. 
+`diploSHIC` uses the `argparse` module in python to try to give the user a complete, command line based help menu. 
 We can see the top level of this help by typing
 ```
-$ python diploSHIC.py -h
-usage: diploSHIC.py [-h] {train,predict,fvecSim,fvecVcf} ...
+$ diploSHIC -h
+usage: diploSHIC [-h] {train,predict,fvecSim,fvecVcf} ...
 
 calculate feature vectors, train, or predict with diploSHIC
 
-possible modes (enter 'python diploSHIC.py modeName -h' for modeName's help message:
+possible modes (enter 'diploSHIC modeName -h' for modeName's help message:
   {fvecSim,makeTrainingSets,train,fvecVcf,predict}
                         sub-command help
     fvecSim             Generate feature vectors from simulated data
@@ -77,15 +79,15 @@ https://github.com/kern-lab/discoal).
 
 ### feature vector generation modes
 The first task in our pipeline is generating feature vectors from simulation data (or empirical data) to
-use with the CNN that we will train and then use for prediction. The `diploSHIC.py` script eases this 
+use with the CNN that we will train and then use for prediction. The `diploSHIC` script eases this 
 process with two run modes
 
 #### fvecSim mode
-The fvecSim run mode is used for turning ms-style output into feature vectors compatible with `diploSHIC.py`. The
+The fvecSim run mode is used for turning ms-style output into feature vectors compatible with `diploSHIC`. The
 help message from this mode looks like this
 ```
-$ python diploSHIC.py fvecSim -h
-usage: diploSHIC.py fvecSim [-h] [--totalPhysLen TOTALPHYSLEN]
+$ diploSHIC fvecSim -h
+usage: diploSHIC fvecSim [-h] [--totalPhysLen TOTALPHYSLEN]
                             [--numSubWins NUMSUBWINS]
                             [--maskFileName MASKFILENAME]
                             [--chrArmsForMasking CHRARMSFORMASKING]
@@ -145,8 +147,8 @@ for a fleshed out example of how to use these features.
 The fvecVcf mode is used for calculating feature vectors from data that is stored as a VCF file. 
 The help message from this mode is as follows
 ```
-$ python diploSHIC.py fvecVcf -h
-usage: diploSHIC.py fvecVcf [-h] [--targetPop TARGETPOP]
+$ diploSHIC fvecVcf -h
+usage: diploSHIC fvecVcf [-h] [--targetPop TARGETPOP]
                             [--sampleToPopFileName SAMPLETOPOPFILENAME]
                             [--winSize WINSIZE] [--numSubWins NUMSUBWINS]
                             [--maskFileName MASKFILENAME]
@@ -212,8 +214,8 @@ Once we have feature vector files ready to go we can train and test our CNN and
 Before entering train mode we need to consolidate our training set into 5 files, one for each class. This is done using the
 makeTrainingSets mode whose help message is as follows:
 ```
-$ python diploSHIC.py makeTrainingSets -h
-usage: diploSHIC.py makeTrainingSets [-h]
+$ diploSHIC makeTrainingSets -h
+usage: diploSHIC makeTrainingSets [-h]
                                      neutTrainingFileName
                                      softTrainingFilePrefix
                                      hardTrainingFilePrefix
@@ -244,10 +246,10 @@ optional arguments:
   -h, --help            show this help message and exit
 ```
 #### train mode
-Here is the help message for the train mode of `diploSHIC.py`
+Here is the help message for the train mode of `diploSHIC`
 ```
-$ python diploSHIC.py train -h
-usage: diploSHIC.py train [-h] [--epochs EPOCHS] [--numSubWins NUMSUBWINS]
+$ diploSHIC train -h
+usage: diploSHIC train [-h] [--epochs EPOCHS] [--numSubWins NUMSUBWINS]
                           trainDir testDir outputModel
 
 required arguments:
@@ -266,7 +268,7 @@ optional arguments:
 As you will see in a moment train mode is used for training the deep learning classifier. Its required
 arguments are trainDir (the directory where the training feature vectors
 are kept), testDir (the directory where the testing feature vectors are kept), and outputModel the file name for the trained
-network. One note -- `diploSHIC.py` expects five files named `hard.fvec`, `soft.fvec`, `neut.fvec`, `linkedSoft.fvec`, and 
+network. One note -- `diploSHIC` expects five files named `hard.fvec`, `soft.fvec`, `neut.fvec`, `linkedSoft.fvec`, and 
 `linkedHard.fvec` in the training and testing directories. The training and testing directory can be the same directory in 
 which case 20% of the training examples are held out for use in testing and validation.
 
@@ -274,11 +276,11 @@ train mode has two options, the number of subwindows used for the feature vector
 network.
 
 ### predict mode
-Once a classifier has been trained, one uses the predict mode of `diploSHIC.py` to classify empirical data. Here is the help
+Once a classifier has been trained, one uses the predict mode of `diploSHIC` to classify empirical data. Here is the help
 statement
 ```
-$ python diploSHIC.py predict -h
-usage: diploSHIC.py predict [-h] [--numSubWins NUMSUBWINS]
+$ diploSHIC predict -h
+usage: diploSHIC predict [-h] [--numSubWins NUMSUBWINS]
                             modelStructure modelWeights predictFile
                             predictFileOutput
 
@@ -304,11 +306,11 @@ genomic data). Let's quickly give that code a spin. The directories `testing/` a
 formatted diploid feature vectors that are ready to be fed into diploSHIC. First we will train the diploSHIC CNN, but we will
 restrict the number of training epochs to 10 to keep things relatively brief (this runs in less than 5 minutes on our server). 
 ```
-$ python diploSHIC.py train training/ testing/ fooModel --epochs 10
+$ diploSHIC train training/ testing/ fooModel --epochs 10
 ```
 as it runs a bunch of information monitoring the training of the network will apear. We are tracking the loss and accuracy in the
 validation set. When optimization is complete our trained network will be contained in two files, `fooModel.json` and 
-`fooModel.weights.hdf5`. The last bit of output from `diploSHIC.py` gives us information about the loss and accuracy on
+`fooModel.weights.hdf5`. The last bit of output from `diploSHIC` gives us information about the loss and accuracy on
 the held out test data. From the above run my looks like this:
 ```
 evaluation on test set:
@@ -319,7 +321,7 @@ Not bad. In practice I would set the `--epochs` value much higher than 10- the d
 Now that we have a trained model we can make predictions on some empirical data. In the repo there is a file called `testEmpirical.fvec`
 that we will use as input
 ```
-$ python diploSHIC.py predict fooModel.json fooModel.weights.hdf5 testEmpirical.fvec testEmpirical.preds
+$ diploSHIC predict fooModel.json fooModel.weights.hdf5 testEmpirical.fvec testEmpirical.preds
 ```
 the output predictions will be saved in `testEmpirical.preds` and should be straightforward to interpret.
 

From f311f40e35133b1be9ddb0d4ed366545701130a9 Mon Sep 17 00:00:00 2001
From: Andrew Kern <adkern@uoregon.edu>
Date: Tue, 23 Nov 2021 13:38:58 -0800
Subject: [PATCH 04/10] Create build_wheels.yaml

---
 .github/workflows/build_wheels.yaml | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 .github/workflows/build_wheels.yaml

diff --git a/.github/workflows/build_wheels.yaml b/.github/workflows/build_wheels.yaml
new file mode 100644
index 0000000..1791681
--- /dev/null
+++ b/.github/workflows/build_wheels.yaml
@@ -0,0 +1,24 @@
+name: Build
+
+on: [push, pull_request]
+
+jobs:
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-20.04, windows-2019, macos-10.15]
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v2.2.2
+        # to supply options, put them in 'env', like:
+        # env:
+        #   CIBW_SOME_OPTION: value
+
+      - uses: actions/upload-artifact@v2
+        with:
+          path: ./wheelhouse/*.whl

From 1804254bbbb449c3748b7293276fd702fdb3dc56 Mon Sep 17 00:00:00 2001
From: Andrew Kern <adkern@uoregon.edu>
Date: Tue, 23 Nov 2021 13:43:10 -0800
Subject: [PATCH 05/10] Create pyproject.toml

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..bf8e1fb
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = ["setuptools", "numpy"]

From 2be05da17a85dba1c2ae9e8438fd3b216df9c846 Mon Sep 17 00:00:00 2001
From: Andrew Kern <adkern@uoregon.edu>
Date: Tue, 23 Nov 2021 13:48:05 -0800
Subject: [PATCH 06/10] Update build_wheels.yaml

---
 .github/workflows/build_wheels.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_wheels.yaml b/.github/workflows/build_wheels.yaml
index 1791681..a989ec3 100644
--- a/.github/workflows/build_wheels.yaml
+++ b/.github/workflows/build_wheels.yaml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-20.04, windows-2019, macos-10.15]
+        os: [ubuntu-20.04, macos-10.15]
 
     steps:
       - uses: actions/checkout@v2

From 334c5d244dc428a45050b38172e48681b6d62976 Mon Sep 17 00:00:00 2001
From: Andrew Kern <adkern@uoregon.edu>
Date: Tue, 23 Nov 2021 14:05:32 -0800
Subject: [PATCH 07/10] Update and rename build_wheels.yaml to deploy.yaml

---
 .github/workflows/build_wheels.yaml | 24 -----------
 .github/workflows/deploy.yaml       | 62 +++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 24 deletions(-)
 delete mode 100644 .github/workflows/build_wheels.yaml
 create mode 100644 .github/workflows/deploy.yaml

diff --git a/.github/workflows/build_wheels.yaml b/.github/workflows/build_wheels.yaml
deleted file mode 100644
index a989ec3..0000000
--- a/.github/workflows/build_wheels.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-name: Build
-
-on: [push, pull_request]
-
-jobs:
-  build_wheels:
-    name: Build wheels on ${{ matrix.os }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-20.04, macos-10.15]
-
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: Build wheels
-        uses: pypa/cibuildwheel@v2.2.2
-        # to supply options, put them in 'env', like:
-        # env:
-        #   CIBW_SOME_OPTION: value
-
-      - uses: actions/upload-artifact@v2
-        with:
-          path: ./wheelhouse/*.whl
diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml
new file mode 100644
index 0000000..f51c186
--- /dev/null
+++ b/.github/workflows/deploy.yaml
@@ -0,0 +1,62 @@
+name: Build and upload to PyPI
+
+# Build on every branch push, tag push, and pull request change:
+on: [push, pull_request]
+# Alternatively, to publish when a (published) GitHub Release is created, use the following:
+# on:
+#   push:
+#   pull_request:
+#   release:
+#     types:
+#       - published
+
+jobs:
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-20.04]
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v2.2.2
+
+      - uses: actions/upload-artifact@v2
+        with:
+          path: ./wheelhouse/*.whl
+
+  build_sdist:
+    name: Build source distribution
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Build sdist
+        run: pipx run build --sdist
+
+      - uses: actions/upload-artifact@v2
+        with:
+          path: dist/*.tar.gz
+
+  upload_pypi:
+    needs: [build_wheels, build_sdist]
+    runs-on: ubuntu-latest
+    # upload to PyPI on every tag starting with 'v'
+    if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
+    # alternatively, to publish when a GitHub Release is created, use the following rule:
+    # if: github.event_name == 'release' && github.event.action == 'published'
+    steps:
+      - uses: actions/download-artifact@v2
+        with:
+          name: artifact
+          path: dist
+
+      - uses: pypa/gh-action-pypi-publish@v1.4.2
+        with:
+          user: __token__
+          repository_url: https://test.pypi.org/legacy/
+          password: ${{ secrets.pypi_password }}
+          # To test: repository_url: https://test.pypi.org/legacy/

From 0167d683a404d485551ea4c0c92964bc75bbb7ce Mon Sep 17 00:00:00 2001
From: Andrew Kern <adkern@uoregon.edu>
Date: Tue, 23 Nov 2021 16:04:49 -0800
Subject: [PATCH 08/10] Update deploy.yaml

---
 .github/workflows/deploy.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml
index f51c186..acb375a 100644
--- a/.github/workflows/deploy.yaml
+++ b/.github/workflows/deploy.yaml
@@ -58,5 +58,5 @@ jobs:
         with:
           user: __token__
           repository_url: https://test.pypi.org/legacy/
-          password: ${{ secrets.pypi_password }}
+          password: ${{ secrets.PYPI_TOKEN }}
           # To test: repository_url: https://test.pypi.org/legacy/

From 5b589cc057175a26b458527ceaf2fed963fd6274 Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Tue, 23 Nov 2021 21:56:29 -0800
Subject: [PATCH 09/10] more packing woes

---
 README.md | 7 +++++--
 setup.py  | 7 ++++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 44a2c55..c4c1f3a 100644
--- a/README.md
+++ b/README.md
@@ -46,19 +46,20 @@ https://www.tensorflow.org/install/install_linux for install instructions.
 ## Usage
 The main program that you will interface with is `diploSHIC`. This script is installed by default
 in the conda environment `bin` directory.
- This script has four run modes that allow the user to 
+This script has four run modes that allow the user to 
 perform each of the main steps in the supervised machine learning process. We will briefly lay out the modes of use
 and then will provide a complete example of how to use the program for fun and profit.
 
 `diploSHIC` uses the `argparse` module in python to try to give the user a complete, command line based help menu. 
 We can see the top level of this help by typing
+
 ```
 $ diploSHIC -h
 usage: diploSHIC [-h] {train,predict,fvecSim,fvecVcf} ...
 
 calculate feature vectors, train, or predict with diploSHIC
 
-possible modes (enter 'diploSHIC modeName -h' for modeName's help message:
+possible modes (enter \'diploSHIC modeName -h\' for modeName\'s help message:
   {fvecSim,makeTrainingSets,train,fvecVcf,predict}
                         sub-command help
     fvecSim             Generate feature vectors from simulated data
@@ -71,6 +72,8 @@ possible modes (enter 'diploSHIC modeName -h' for modeName's help message:
 optional arguments:
   -h, --help            show this help message and exit
 ```
+
+
 ### before running diploSHIC: simulating training/testing data
 All flavors of S/HIC require simulated data for training (and ideally, testing). Users can select whatever simulator 
 they prefer and parameterize them however they wish. We have included an example script in this respository 
diff --git a/setup.py b/setup.py
index 0284f3e..c33ad40 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,4 @@
+from setuptools import setup
 from numpy.distutils.core import Extension, setup
 
 with open("README.md", "r") as fh:
@@ -8,14 +9,14 @@
                                 "diploshic/utils.c"],
                       )
 setup(name='diploSHIC',
-      version='0.33333',
-      description='diploS/HIC',
+      version='0.333333',
+      description='diploSHIC',
       long_description=long_description,
       long_description_content_type="text/markdown",
       url='https://github.com/kr-colab/diploSHIC',
       author='Andrew Kern',
       author_email='adkern@uoregon.edu',
-      license='MIT',      
+      license='MIT',
       packages=['diploshic'],
       install_requires=['numpy',
                         'scipy',

From 17c0bfe1b77629e6772da7493be393d3b2725260 Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Wed, 24 Nov 2021 10:50:11 -0800
Subject: [PATCH 10/10] minor changes; things are working

---
 diploshic/examples/Snakefile | 10 +++++-----
 pyproject.toml               |  1 +
 requirements/development.txt | 25 +++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 5 deletions(-)
 create mode 100644 requirements/development.txt

diff --git a/diploshic/examples/Snakefile b/diploshic/examples/Snakefile
index df8c260..10abb9f 100644
--- a/diploshic/examples/Snakefile
+++ b/diploshic/examples/Snakefile
@@ -21,8 +21,8 @@ replicates = 2
 seed_array = np.random.random_integers(1,2**31,replicates)
 
 # change to the correct locations of discoal and diploSHIC
-discoal_exec = "discoal/discoal"
-diploSHIC_exec = "diploSHIC/diploSHIC.py"
+discoal_exec = "./discoal"
+diploSHIC_exec = "diploSHIC"
 
 #some parameters that should probably be moved to a .json file
 trainSampleNumber = 10 #the number of simulation replicates we want to generate for each file in our training set
@@ -100,7 +100,7 @@ rule calc_sim_fvs:
     output:
         "{tDir}/discoal.{mod}.{x}.{i}.out.fvec"
     run:
-        cmd = f"python {diploSHIC_exec} fvecSim haploid {input} {output} --totalPhysLen={numSites} --maskFileName none --chrArmsForMasking none"
+        cmd = f"{diploSHIC_exec} fvecSim haploid {input} {output} --totalPhysLen={numSites} --maskFileName none --chrArmsForMasking none"
         shell(cmd)
 
 rule concat_fvecs:
@@ -127,7 +127,7 @@ rule make_training_sets:
     output:
         expand("trainingSets/{cl}.fvec", cl = ["hard", "linkedHard", "soft", "linkedSoft", "neut"])
     run:
-        cmd = f"python {diploSHIC_exec} makeTrainingSets train/neut_0.fvec train/soft train/hard 5 0,1,2,3,4,6,7,8,9,10 trainingSets/"
+        cmd = f"{diploSHIC_exec} makeTrainingSets train/neut_0.fvec train/soft train/hard 5 0,1,2,3,4,6,7,8,9,10 trainingSets/"
         shell(cmd)
 
 rule train_classifier:
@@ -137,7 +137,7 @@ rule train_classifier:
         "trained_model.json",
         "trained_model.weights.hdf5"
     run:
-        cmd = f"python {diploSHIC_exec} train trainingSets/ trainingSets/ trained_model --epochs=10"
+        cmd = f"{diploSHIC_exec} train trainingSets/ trainingSets/ trained_model --epochs=10"
         shell(cmd)
 
 rule clean:
diff --git a/pyproject.toml b/pyproject.toml
index bf8e1fb..282beb0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,4 @@
 [build-system]
 build-backend = "setuptools.build_meta"
 requires = ["setuptools", "numpy"]
+
diff --git a/requirements/development.txt b/requirements/development.txt
new file mode 100644
index 0000000..8bc609e
--- /dev/null
+++ b/requirements/development.txt
@@ -0,0 +1,25 @@
+black
+blacken-docs
+codecov
+coverage
+daiquiri
+flake8
+pytest
+pytest-cov
+pytest-xdist
+setuptools_scm
+sphinx
+sphinx-argparse
+sphinx-issues
+sphinx_rtd_theme
+sphinxcontrib-programoutput
+attrs
+appdirs
+humanize
+pre-commit
+pyslim>=0.600
+pandas
+numpy
+scikit-allel
+zarr>=2.4
+biopython