From de5bbc393a59bbed2c7c6dc600669fe7365f484d Mon Sep 17 00:00:00 2001 From: andrewkern Date: Mon, 22 Nov 2021 10:36:07 -0800 Subject: [PATCH 01/10] getting a pip package together moving stuff tf2 various updates version linting working on pip working on pip.. version still werking few more paths what about conda? --- MANIFEST.in | 5 + build.sh | 2 + diploSHIC.py | 371 -------- diploshic/__init__.py | 3 + diploshic/diploSHIC | 759 ++++++++++++++++ {examples => diploshic/examples}/README.md | 0 {examples => diploshic/examples}/Snakefile | 0 {examples => diploshic/examples}/collate.sh | 0 .../examples}/efficient_simulation.sh | 0 {examples => diploshic/examples}/train.sh | 0 fvTools.py => diploshic/fvTools.py | 859 +++++++++++++----- .../generateSimLaunchScript.py | 71 +- .../makeFeatureVecsForChrArmFromVcfDiploid.py | 292 ++++++ .../makeFeatureVecsForChrArmFromVcf_ogSHIC.py | 306 +++++++ .../makeFeatureVecsForSingleMsDiploid.py | 324 +++++++ .../makeFeatureVecsForSingleMs_ogSHIC.py | 180 ++-- .../makeTrainingSets.py | 83 +- misc.py => diploshic/misc.py | 129 ++- msTools.py => diploshic/msTools.py | 99 +- diploshic/setup.py | 14 + shicstats.pyf => diploshic/shicstats.pyf | 0 {testing => diploshic/testing}/hard.fvec | 0 .../testing}/linkedHard.fvec | 0 .../testing}/linkedSoft.fvec | 0 {testing => diploshic/testing}/neut.fvec | 0 {testing => diploshic/testing}/soft.fvec | 0 {training => diploshic/training}/hard.fvec | 0 .../training}/linkedHard.fvec | 0 .../training}/linkedSoft.fvec | 0 {training => diploshic/training}/neut.fvec | 0 {training => diploshic/training}/soft.fvec | 0 utils.c => diploshic/utils.c | 0 makeFeatureVecsForChrArmFromVcfDiploid.py | 163 ---- makeFeatureVecsForChrArmFromVcf_ogSHIC.py | 171 ---- makeFeatureVecsForSingleMsDiploid.py | 193 ---- meta.yaml | 27 + setup.py | 49 +- 37 files changed, 2773 insertions(+), 1327 deletions(-) create mode 100644 MANIFEST.in create mode 100644 build.sh delete mode 100644 diploSHIC.py create mode 100644 diploshic/__init__.py create mode 100644 diploshic/diploSHIC rename {examples => diploshic/examples}/README.md (100%) rename {examples => diploshic/examples}/Snakefile (100%) rename {examples => diploshic/examples}/collate.sh (100%) rename {examples => diploshic/examples}/efficient_simulation.sh (100%) rename {examples => diploshic/examples}/train.sh (100%) rename fvTools.py => diploshic/fvTools.py (60%) rename generateSimLaunchScript.py => diploshic/generateSimLaunchScript.py (53%) create mode 100644 diploshic/makeFeatureVecsForChrArmFromVcfDiploid.py create mode 100644 diploshic/makeFeatureVecsForChrArmFromVcf_ogSHIC.py create mode 100644 diploshic/makeFeatureVecsForSingleMsDiploid.py rename makeFeatureVecsForSingleMs_ogSHIC.py => diploshic/makeFeatureVecsForSingleMs_ogSHIC.py (61%) rename makeTrainingSets.py => diploshic/makeTrainingSets.py (51%) rename misc.py => diploshic/misc.py (83%) rename msTools.py => diploshic/msTools.py (69%) create mode 100644 diploshic/setup.py rename shicstats.pyf => diploshic/shicstats.pyf (100%) rename {testing => diploshic/testing}/hard.fvec (100%) rename {testing => diploshic/testing}/linkedHard.fvec (100%) rename {testing => diploshic/testing}/linkedSoft.fvec (100%) rename {testing => diploshic/testing}/neut.fvec (100%) rename {testing => diploshic/testing}/soft.fvec (100%) rename {training => diploshic/training}/hard.fvec (100%) rename {training => diploshic/training}/linkedHard.fvec (100%) rename {training => diploshic/training}/linkedSoft.fvec (100%) rename {training => diploshic/training}/neut.fvec (100%) rename {training => diploshic/training}/soft.fvec (100%) rename utils.c => diploshic/utils.c (100%) delete mode 100644 makeFeatureVecsForChrArmFromVcfDiploid.py delete mode 100644 makeFeatureVecsForChrArmFromVcf_ogSHIC.py delete mode 100644 makeFeatureVecsForSingleMsDiploid.py create mode 100644 meta.yaml diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..c295d7a --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +include diploshic/shicstats.pyf +include diploshic/testEmpirical.fvec +include diploshic/testing/* +include diploshic/training/* +include diploshic/utils.c diff --git a/build.sh b/build.sh new file mode 100644 index 0000000..1fcfd2f --- /dev/null +++ b/build.sh @@ -0,0 +1,2 @@ +python setup.py build +python setup.py install --prefix=$PREFIX \ No newline at end of file diff --git a/diploSHIC.py b/diploSHIC.py deleted file mode 100644 index 0e1e8bf..0000000 --- a/diploSHIC.py +++ /dev/null @@ -1,371 +0,0 @@ -import argparse,time,sys,subprocess - -pyExec = sys.executable -if "/" in sys.argv[0]: - diploShicDir = "/".join(sys.argv[0].split("/")[:-1]) + "/" -else: - diploShicDir = "" - -################# use argparse to get CL args and make sure we are kosher -parser = argparse.ArgumentParser(description='calculate feature vectors, train, or predict with diploSHIC') -parser._positionals.title = 'possible modes (enter \'python diploSHIC.py modeName -h\' for modeName\'s help message' -subparsers = parser.add_subparsers(help='sub-command help') -parser_c = subparsers.add_parser('fvecSim', help='Generate feature vectors from simulated data') -parser_e = subparsers.add_parser('makeTrainingSets', help='Combine feature vectors from muliple fvecSim runs into 5 balanced training sets') -parser_a = subparsers.add_parser('train', help='train and test a shic CNN') -parser_d = subparsers.add_parser('fvecVcf', help='Generate feature vectors from data in a VCF file') -parser_b = subparsers.add_parser('predict', help='perform prediction using an already-trained SHIC CNN') -#parser_a.add_argument('nDims', metavar='nDims', type=int, -# help='dimensionality of the feature vector') -parser_a.add_argument('trainDir', help='path to training set files') -parser_a.add_argument('testDir', help='path to test set files, can be same as trainDir') -parser_a.add_argument('outputModel', help='file name for output model, will create two files one with structure one with weights') -parser_a.add_argument('--epochs', type=int, help='max epochs for training CNN (default = 100)', default=100) -parser_a.add_argument('--numSubWins', type=int, help='number of subwindows that our chromosome is divided into (default = 11)', default=11) -parser_a.add_argument('--confusionFile', help='optional file to which confusion matrix plot will be written (default = None)', default=None) -parser_a.set_defaults(mode='train') -parser_a._positionals.title = 'required arguments' - -#parser_b.add_argument('nDims', metavar='nDims', type=int, -# help='dimensionality of the feature vector') -parser_b.add_argument('modelStructure', help='path to CNN structure .json file') -parser_b.add_argument('modelWeights', help='path to CNN weights .h5 file') -parser_b.add_argument('predictFile', help='input file to predict') -parser_b.add_argument('predictFileOutput', help='output file name') -parser_b.add_argument('--numSubWins', type=int, help='number of subwindows that our chromosome is divided into (default = 11)', default=11) -parser_b.add_argument('--simData', help='Are we using simulated input data wihout coordinates?', action="store_true") -parser_b.set_defaults(mode='predict') -parser_b._positionals.title = 'required arguments' - -parser_c.add_argument('shicMode', help='specifies whether to use original haploid SHIC (use \'haploid\') or diploSHIC (\'diploid\')', - default='diploid') -parser_c.add_argument('msOutFile', help='path to simulation output file (must be same format used by Hudson\'s ms)') -parser_c.add_argument('fvecFileName', help='path to file where feature vectors will be written') -parser_c.add_argument('--totalPhysLen', type=int, help='Length of simulated chromosome for converting infinite sites ms output to finite sites (default=1100000)', - default=1100000) -parser_c.add_argument('--numSubWins', type=int, help='The number of subwindows that our chromosome will be divided into (default=11)', default=11) -parser_c.add_argument('--maskFileName', help=('Path to a fasta-formatted file that contains masking information (marked by \'N\'). ' - 'If specified, simulations will be masked in a manner mirroring windows drawn from this file.'), default="None") -parser_c.add_argument('--vcfForMaskFileName', help=('Path to a VCF file that contains genotype information. This will be used to mask genotypes ' - 'in a manner that mirrors how the true data are masked.'), default=None) -parser_c.add_argument('--popForMask', help='The label of the population for which we should draw genotype information from the VCF for masking purposes.', - default=None) -parser_c.add_argument('--sampleToPopFileName', help=('Path to tab delimited file with population assignments (used for genotype masking); format: ' - 'SampleID\tpopID'), default="None") -parser_c.add_argument('--unmaskedGenoFracCutoff', type=float, help='Fraction of unmasked genotypes required to retain a site (default=0.75)', default=0.75) -parser_c.add_argument('--chrArmsForMasking', help=('A comma-separated list (no spaces) of chromosome arms from which we want to draw masking ' - 'information (or \'all\' if we want to use all arms. Ignored if maskFileName is not specified.'), default="all") -parser_c.add_argument('--unmaskedFracCutoff', type=float, help='Minimum fraction of unmasked sites, if masking simulated data (default=0.25)', default=0.25) -parser_c.add_argument('--outStatsDir', help='Path to a directory where values of each statistic in each subwindow are recorded for each rep', - default="None") -parser_c.add_argument('--ancFileName', help=('Path to a fasta-formatted file that contains inferred ancestral states (\'N\' if unknown).' - ' This is used for masking, as sites that cannot be polarized are masked, and we mimic this in the simulted data.' - ' Ignored in diploid mode which currently does not use ancestral state information'), - default="None") -parser_c.add_argument('--pMisPol', type=float, help='The fraction of sites that will be intentionally polarized to better approximate real data (default=0.0)', - default=0.0) -parser_c.set_defaults(mode='fvecSim') -parser_c._positionals.title = 'required arguments' - -parser_d.add_argument('shicMode', help='specifies whether to use original haploid SHIC (use \'haploid\') or diploSHIC (\'diploid\')') -parser_d.add_argument('chrArmVcfFile', help='VCF format file containing data for our chromosome arm (other arms will be ignored)') -parser_d.add_argument('chrArm', help='Exact name of the chromosome arm for which feature vectors will be calculated') -parser_d.add_argument('chrLen', type=int, help='Length of the chromosome arm') -parser_d.add_argument('fvecFileName', help='path to file where feature vectors will be written') -parser_d.add_argument('--targetPop', help='Population ID of samples we wish to include', default="None") -parser_d.add_argument('--sampleToPopFileName', help=('Path to tab delimited file with population assignments; format: ' - 'SampleID\tpopID'), default="None") -parser_d.add_argument('--winSize', type=int, help='Length of the large window (default=1100000)', default=1100000) -parser_d.add_argument('--numSubWins', type=int, help='Number of sub-windows within each large window (default=11)', default=11) -parser_d.add_argument('--maskFileName', help=('Path to a fasta-formatted file that contains masking information (marked by \'N\'); ' - 'must have an entry with title matching chrArm'), default="None") -parser_d.add_argument('--unmaskedFracCutoff', type=float, help='Fraction of unmasked sites required to retain a subwindow (default=0.25)', default=0.25) -parser_d.add_argument('--unmaskedGenoFracCutoff', type=float, help='Fraction of unmasked genotypes required to retain a site (default=0.75)', default=0.75) -parser_d.add_argument('--ancFileName', help=('Path to a fasta-formatted file that contains inferred ancestral states (\'N\' if unknown); ' - 'must have an entry with title matching chrArm. Ignored for diploid mode which currently does not use ancestral ' - 'state information.'), default="None") -parser_d.add_argument('--statFileName', help='Path to a file where statistics will be written for each subwindow that is not filtered out', - default="None") -parser_d.add_argument('--segmentStart', help='Left boundary of region in which feature vectors are calculated (whole arm if omitted)', - default="None") -parser_d.add_argument('--segmentEnd', help='Right boundary of region in which feature vectors are calculated (whole arm if omitted)', - default="None") -parser_d.set_defaults(mode='fvecVcf') -parser_d._positionals.title = 'required arguments' - -parser_e.add_argument('neutTrainingFileName', help='Path to our neutral feature vectors') -parser_e.add_argument('softTrainingFilePrefix', help=('Prefix (including higher-level path) of files containing soft training examples' - '; files must end with \'_$i.$ext\' where $i is the subwindow index of the sweep and $ext is any extension.')) -parser_e.add_argument('hardTrainingFilePrefix', help=('Prefix (including higher-level path) of files containing hard training examples' - '; files must end with \'_$i.$ext\' where $i is the subwindow index of the sweep and $ext is any extension.')) -parser_e.add_argument('sweepTrainingWindows', type=int, help=('comma-separated list of windows to classify as sweeps (usually just \'5\'' - ' but without the quotes)')) -parser_e.add_argument('linkedTrainingWindows', help=('list of windows to treat as linked to sweeps (usually \'0,1,2,3,4,6,7,8,9,10\' but' - ' without the quotes)')) -parser_e.add_argument('outDir', help='path to directory where the training sets will be written') -parser_e.set_defaults(mode='makeTrainingSets') -parser_e._positionals.title = 'required arguments' -if len(sys.argv)==1: - parser.print_help() - sys.exit(1) -args = parser.parse_args() -argsDict = vars(args) - -if argsDict['mode'] in ['train', 'predict']: -########################################################### -# Import a bunch of libraries if everything checks out - import matplotlib - matplotlib.use('Agg') - import numpy as np - from keras.models import Sequential, Model - from keras import optimizers - from keras.layers import Dense, Dropout, Activation, Flatten, Input - from keras.layers import Conv2D, MaxPooling2D,concatenate - from keras.utils import np_utils - from sklearn.model_selection import train_test_split - from keras.utils.layer_utils import convert_all_kernels_in_model - from keras.preprocessing.image import ImageDataGenerator - from keras.callbacks import EarlyStopping,ModelCheckpoint - import keras.backend as K - import fnmatch - - #nDims = argsDict['nDims'] - numSubWins = argsDict['numSubWins'] - -if argsDict['mode'] == 'train': - trainingDir=argsDict['trainDir'] - testingDir=argsDict['testDir'] - epochOption = argsDict['epochs'] - outputModel = argsDict['outputModel'] - confusionFile = argsDict['confusionFile'] - #nCores = 12 - print("loading data now...") - #training data - - hard = np.loadtxt(trainingDir+"hard.fvec",skiprows=1) - nDims = int(hard.shape[1] / numSubWins) - h1 = np.reshape(hard,(hard.shape[0],nDims,numSubWins)) - neut = np.loadtxt(trainingDir+"neut.fvec",skiprows=1) - n1 = np.reshape(neut,(neut.shape[0],nDims,numSubWins)) - soft = np.loadtxt(trainingDir+"soft.fvec",skiprows=1) - s1 = np.reshape(soft,(soft.shape[0],nDims,numSubWins)) - lsoft = np.loadtxt(trainingDir+"linkedSoft.fvec",skiprows=1) - ls1 = np.reshape(lsoft,(lsoft.shape[0],nDims,numSubWins)) - lhard = np.loadtxt(trainingDir+"linkedHard.fvec",skiprows=1) - lh1 = np.reshape(lhard,(lhard.shape[0],nDims,numSubWins)) - - both=np.concatenate((h1,n1,s1,ls1,lh1)) - y=np.concatenate((np.repeat(0,len(h1)),np.repeat(1,len(n1)), np.repeat(2,len(s1)), np.repeat(3,len(ls1)), np.repeat(4,len(lh1)))) - - #reshape both to explicitly set depth image. need for theanno not sure with tensorflow - both = both.reshape(both.shape[0],nDims,numSubWins,1) - if (trainingDir==testingDir): - X_train, X_test, y_train, y_test = train_test_split(both, y, test_size=0.2) - else: - X_train = both - y_train = y - #testing data - hard = np.loadtxt(testingDir+"hard.fvec",skiprows=1) - h1 = np.reshape(hard,(hard.shape[0],nDims,numSubWins)) - neut = np.loadtxt(testingDir+"neut.fvec",skiprows=1) - n1 = np.reshape(neut,(neut.shape[0],nDims,numSubWins)) - soft = np.loadtxt(testingDir+"soft.fvec",skiprows=1) - s1 = np.reshape(soft,(soft.shape[0],nDims,numSubWins)) - lsoft = np.loadtxt(testingDir+"linkedSoft.fvec",skiprows=1) - ls1 = np.reshape(lsoft,(lsoft.shape[0],nDims,numSubWins)) - lhard = np.loadtxt(testingDir+"linkedHard.fvec",skiprows=1) - lh1 = np.reshape(lhard,(lhard.shape[0],nDims,numSubWins)) - - both2=np.concatenate((h1,n1,s1,ls1,lh1)) - X_test = both2.reshape(both2.shape[0],nDims,numSubWins,1) - y_test=np.concatenate((np.repeat(0,len(h1)),np.repeat(1,len(n1)), np.repeat(2,len(s1)), np.repeat(3,len(ls1)), np.repeat(4,len(lh1)))) - - - Y_train = np_utils.to_categorical(y_train, 5) - Y_test = np_utils.to_categorical(y_test, 5) - X_valid, X_test, Y_valid, Y_test = train_test_split(X_test, Y_test, test_size=0.5) - - datagen = ImageDataGenerator( - featurewise_center=True, - featurewise_std_normalization=True, - horizontal_flip=True) - - validation_gen = ImageDataGenerator( - featurewise_center=True, - featurewise_std_normalization=True, - horizontal_flip=False) - test_gen = ImageDataGenerator( - featurewise_center=True, - featurewise_std_normalization=True, - horizontal_flip=False) - - - #print(X_train.shape) - print("training set has %d examples" % X_train.shape[0]) - print("validation set has %d examples" % X_valid.shape[0]) - print("test set has %d examples" % X_test.shape[0]) - - model_in = Input(X_train.shape[1:]) - h = Conv2D(128, 3, activation='relu',padding="same", name='conv1_1')(model_in) - h = Conv2D(64, 3, activation='relu',padding="same", name='conv1_2')(h) - h = MaxPooling2D(pool_size=3, name='pool1',padding="same")(h) - h = Dropout(0.15, name='drop1')(h) - h = Flatten(name='flaten1')(h) - - dh = Conv2D(128, 2, activation='relu',dilation_rate=[1,3],padding="same", name='dconv1_1')(model_in) - dh = Conv2D(64, 2, activation='relu',dilation_rate=[1,3],padding="same", name='dconv1_2')(dh) - dh = MaxPooling2D(pool_size=2, name='dpool1')(dh) - dh = Dropout(0.15, name='ddrop1')(dh) - dh = Flatten(name='dflaten1')(dh) - - dh1 = Conv2D(128, 2, activation='relu',dilation_rate=[1,4],padding="same", name='dconv4_1')(model_in) - dh1 = Conv2D(64, 2, activation='relu',dilation_rate=[1,4],padding="same", name='dconv4_2')(dh1) - dh1 = MaxPooling2D(pool_size=2, name='d1pool1')(dh1) - dh1 = Dropout(0.15, name='d1drop1')(dh1) - dh1 = Flatten(name='d1flaten1')(dh1) - - h = concatenate([h,dh,dh1]) - h = Dense(512,name="512dense",activation='relu')(h) - h = Dropout(0.2, name='drop7')(h) - h = Dense(128,name="last_dense",activation='relu')(h) - h = Dropout(0.1, name='drop8')(h) - output = Dense(5,name="out_dense",activation='softmax')(h) - model = Model(inputs=[model_in], outputs=[output]) - - - - - - model.compile(loss='categorical_crossentropy', - optimizer='adam', - metrics=['accuracy']) - - # define early stopping callback - earlystop = EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=5, \ - verbose=1, mode='auto') - - model_json = model.to_json() - with open(outputModel+".json", "w") as json_file: - json_file.write(model_json) - modWeightsFilepath=outputModel+".weights.hdf5" - checkpoint = ModelCheckpoint(modWeightsFilepath, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True, mode='auto') - - callbacks_list = [earlystop,checkpoint] - #callbacks_list = [earlystop] #turning off checkpointing-- just want accuracy assessment - - datagen.fit(X_train) - validation_gen.fit(X_valid) - test_gen.fit(X_test) - start = time.time() - model.fit_generator(datagen.flow(X_train, Y_train, batch_size=32), \ - steps_per_epoch=len(X_train) / 32, epochs=epochOption,verbose=1, \ - callbacks=callbacks_list, \ - validation_data=validation_gen.flow(X_valid,Y_valid, batch_size=32), \ - validation_steps=len(X_test)/32) - #model.fit(X_train, Y_train, batch_size=32, epochs=100,validation_data=(X_test,Y_test),callbacks=callbacks_list, verbose=1) - score = model.evaluate_generator(test_gen.flow(X_test,Y_test, batch_size=32),len(Y_test)/32) - sys.stderr.write("total time spent fitting and evaluating: %f secs\n" %(time.time()-start)) - - print("evaluation on test set:") - print("diploSHIC loss: %f" % score[0]) - print("diploSHIC accuracy: %f" % score[1]) - if confusionFile: - from misc import plot_confusion_matrix - import matplotlib.pyplot as plt - plot_confusion_matrix(model, test_gen.standardize(X_test), Y_test, labels=[0,4,2,3,1], display_labels=['Hard', 'Hard-linked', 'Soft', 'Soft-linked', 'Neutral'], cmap=plt.cm.Blues, normalize='true') - plt.savefig(confusionFile, bbox_inches='tight') - -elif argsDict['mode'] == 'predict': - import pandas as pd - from keras.models import model_from_json - - #import data from predictFile - x_df=pd.read_table(argsDict['predictFile']) - if argsDict['simData']: - testX = x_df[list(x_df)[:]].to_numpy() - else: - testX = x_df[list(x_df)[4:]].to_numpy() - nDims = int(testX.shape[1]/numSubWins) - np.reshape(testX,(testX.shape[0],nDims,numSubWins)) - #add channels - testX = testX.reshape(testX.shape[0],nDims,numSubWins,1) - #set up generator for normalization - validation_gen = ImageDataGenerator( - featurewise_center=True, - featurewise_std_normalization=True, - horizontal_flip=False) - validation_gen.fit(testX) - - #import model - json_file = open(argsDict['modelStructure'], 'r') - loaded_model_json = json_file.read() - json_file.close() - model = model_from_json(loaded_model_json) - # load weights into new model - model.load_weights(argsDict['modelWeights']) - print("Loaded model from disk") - - #get predictions - preds = model.predict(validation_gen.standardize(testX)) - predictions = np.argmax(preds,axis=1) - - #np.repeat(0,len(h1)),np.repeat(1,len(n1)), np.repeat(2,len(s1)), np.repeat(3,len(ls1)), np.repeat(4,len(lh1) - classDict = {0:'hard',1:'neutral',2:'soft',3:'linkedSoft',4:'linkedHard'} - - #output the predictions - outputFile = open(argsDict['predictFileOutput'],'w') - if argsDict['simData']: - outputFile.write('predClass\tprob(neutral)\tprob(likedSoft)\tprob(linkedHard)\tprob(soft)\tprob(hard)\n') - else: - outputFile.write('chrom\tclassifiedWinStart\tclassifiedWinEnd\tbigWinRange\tpredClass\tprob(neutral)\tprob(likedSoft)\tprob(linkedHard)\tprob(soft)\tprob(hard)\n') - - for index, row in x_df.iterrows(): - if argsDict['simData']: - outputFile.write('{}\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}\n'.format(classDict[predictions[index]],preds[index][1],preds[index][3],preds[index][4], \ - preds[index][2],preds[index][0])) - else: - outputFile.write('{}\t{}\t{}\t{}\t{}\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}\n'.format( row['chrom'],row['classifiedWinStart'],row['classifiedWinEnd'],row['bigWinRange'], \ - classDict[predictions[index]],preds[index][1],preds[index][3],preds[index][4],preds[index][2],preds[index][0])) - outputFile.close() - print("{} predictions complete".format(index+1)) -elif argsDict['mode'] == 'fvecSim': - if argsDict['shicMode'].lower() == 'diploid': - cmdArgs = [argsDict['msOutFile'], argsDict['totalPhysLen'], argsDict['numSubWins'], argsDict['maskFileName'], - argsDict['vcfForMaskFileName'], argsDict['popForMask'], argsDict['sampleToPopFileName'], argsDict['unmaskedGenoFracCutoff'], - argsDict['chrArmsForMasking'], argsDict['unmaskedFracCutoff'], argsDict['outStatsDir'], argsDict['fvecFileName']] - cmd = pyExec + " " + diploShicDir + "makeFeatureVecsForSingleMsDiploid.py " + " ".join([str(x) for x in cmdArgs]) - elif argsDict['shicMode'].lower() == 'haploid': - cmdArgs = [argsDict['msOutFile'], argsDict['totalPhysLen'], argsDict['numSubWins'], argsDict['maskFileName'], argsDict['ancFileName'], - argsDict['chrArmsForMasking'], argsDict['unmaskedFracCutoff'], argsDict['pMisPol'], argsDict['outStatsDir'], - argsDict['fvecFileName']] - cmd = pyExec + " " + diploShicDir + "makeFeatureVecsForSingleMs_ogSHIC.py " + " ".join([str(x) for x in cmdArgs]) - else: - sys.exit("'shicMode' must be set to either 'diploid' or 'haploid'") - print(cmd) - subprocess.call(cmd.split()) -elif argsDict['mode'] == 'fvecVcf': - if argsDict['shicMode'].lower() == 'diploid': - cmdArgs = [argsDict['chrArmVcfFile'], argsDict['chrArm'], argsDict['chrLen'], argsDict['targetPop'], argsDict['winSize'], - argsDict['numSubWins'], argsDict['maskFileName'], argsDict['unmaskedFracCutoff'], argsDict['unmaskedGenoFracCutoff'], - argsDict['sampleToPopFileName'], argsDict['statFileName'], argsDict['fvecFileName']] - cmd = pyExec + " " + diploShicDir + "makeFeatureVecsForChrArmFromVcfDiploid.py " + " ".join([str(x) for x in cmdArgs]) - elif argsDict['shicMode'].lower() == 'haploid': - cmdArgs = [argsDict['chrArmVcfFile'], argsDict['chrArm'], argsDict['chrLen'], argsDict['targetPop'], argsDict['winSize'], - argsDict['numSubWins'], argsDict['maskFileName'], argsDict['unmaskedFracCutoff'], argsDict['sampleToPopFileName'], - argsDict['ancFileName'], argsDict['statFileName'], argsDict['fvecFileName']] - cmd = pyExec + " " + diploShicDir + "makeFeatureVecsForChrArmFromVcf_ogSHIC.py " + " ".join([str(x) for x in cmdArgs]) - else: - sys.exit("'shicMode' must be set to either 'diploid' or 'haploid'") - additionalArgs = [] - if argsDict['segmentStart'] != "None": - additionalArgs += [argsDict['segmentStart'], argsDict['segmentEnd']] - cmd += " " + " ".join(additionalArgs) - #cmd += " > " + argsDict['fvecFileName'] - print(cmd) - subprocess.call(cmd.split()) -elif argsDict['mode'] == 'makeTrainingSets': - cmdArgs = [argsDict['neutTrainingFileName'], argsDict['softTrainingFilePrefix'], argsDict['hardTrainingFilePrefix'], - argsDict['sweepTrainingWindows'], argsDict['linkedTrainingWindows'], argsDict['outDir']] - cmd = pyExec + " " + diploShicDir + "makeTrainingSets.py " + " ".join([str(x) for x in cmdArgs]) - print(cmd) - subprocess.call(cmd.split()) diff --git a/diploshic/__init__.py b/diploshic/__init__.py new file mode 100644 index 0000000..4b4f13d --- /dev/null +++ b/diploshic/__init__.py @@ -0,0 +1,3 @@ +from diploshic.fvTools import * +from diploshic.msTools import * +from diploshic.shicstats import * diff --git a/diploshic/diploSHIC b/diploshic/diploSHIC new file mode 100644 index 0000000..f0134bc --- /dev/null +++ b/diploshic/diploSHIC @@ -0,0 +1,759 @@ +#!/usr/bin/env python + +import argparse, time, sys, subprocess + +pyExec = sys.executable +print(pyExec) +if "/" in sys.argv[0]: + diploShicDir = "/".join(sys.argv[0].split("/")[:-1]) + "/" +else: + diploShicDir = "" + +################# use argparse to get CL args and make sure we are kosher +parser = argparse.ArgumentParser( + description="calculate feature vectors, train, or predict with diploSHIC" +) +parser._positionals.title = "possible modes (enter 'python diploSHIC.py modeName -h' for modeName's help message" +subparsers = parser.add_subparsers(help="sub-command help") +parser_c = subparsers.add_parser( + "fvecSim", help="Generate feature vectors from simulated data" +) +parser_e = subparsers.add_parser( + "makeTrainingSets", + help="Combine feature vectors from muliple fvecSim runs into 5 balanced training sets", +) +parser_a = subparsers.add_parser("train", help="train and test a shic CNN") +parser_d = subparsers.add_parser( + "fvecVcf", help="Generate feature vectors from data in a VCF file" +) +parser_b = subparsers.add_parser( + "predict", help="perform prediction using an already-trained SHIC CNN" +) +# parser_a.add_argument('nDims', metavar='nDims', type=int, +# help='dimensionality of the feature vector') +parser_a.add_argument("trainDir", help="path to training set files") +parser_a.add_argument( + "testDir", help="path to test set files, can be same as trainDir" +) +parser_a.add_argument( + "outputModel", + help="file name for output model, will create two files one with structure one with weights", +) +parser_a.add_argument( + "--epochs", + type=int, + help="max epochs for training CNN (default = 100)", + default=100, +) +parser_a.add_argument( + "--numSubWins", + type=int, + help="number of subwindows that our chromosome is divided into (default = 11)", + default=11, +) +parser_a.add_argument( + "--confusionFile", + help="optional file to which confusion matrix plot will be written (default = None)", + default=None, +) +parser_a.set_defaults(mode="train") +parser_a._positionals.title = "required arguments" + +# parser_b.add_argument('nDims', metavar='nDims', type=int, +# help='dimensionality of the feature vector') +parser_b.add_argument( + "modelStructure", help="path to CNN structure .json file" +) +parser_b.add_argument("modelWeights", help="path to CNN weights .h5 file") +parser_b.add_argument("predictFile", help="input file to predict") +parser_b.add_argument("predictFileOutput", help="output file name") +parser_b.add_argument( + "--numSubWins", + type=int, + help="number of subwindows that our chromosome is divided into (default = 11)", + default=11, +) +parser_b.add_argument( + "--simData", + help="Are we using simulated input data wihout coordinates?", + action="store_true", +) +parser_b.set_defaults(mode="predict") +parser_b._positionals.title = "required arguments" + +parser_c.add_argument( + "shicMode", + help="specifies whether to use original haploid SHIC (use 'haploid') or diploSHIC ('diploid')", + default="diploid", +) +parser_c.add_argument( + "msOutFile", + help="path to simulation output file (must be same format used by Hudson's ms)", +) +parser_c.add_argument( + "fvecFileName", help="path to file where feature vectors will be written" +) +parser_c.add_argument( + "--totalPhysLen", + type=int, + help="Length of simulated chromosome for converting infinite sites ms output to finite sites (default=1100000)", + default=1100000, +) +parser_c.add_argument( + "--numSubWins", + type=int, + help="The number of subwindows that our chromosome will be divided into (default=11)", + default=11, +) +parser_c.add_argument( + "--maskFileName", + help=( + "Path to a fasta-formatted file that contains masking information (marked by 'N'). " + "If specified, simulations will be masked in a manner mirroring windows drawn from this file." + ), + default="None", +) +parser_c.add_argument( + "--vcfForMaskFileName", + help=( + "Path to a VCF file that contains genotype information. This will be used to mask genotypes " + "in a manner that mirrors how the true data are masked." + ), + default=None, +) +parser_c.add_argument( + "--popForMask", + help="The label of the population for which we should draw genotype information from the VCF for masking purposes.", + default=None, +) +parser_c.add_argument( + "--sampleToPopFileName", + help=( + "Path to tab delimited file with population assignments (used for genotype masking); format: " + "SampleID\tpopID" + ), + default="None", +) +parser_c.add_argument( + "--unmaskedGenoFracCutoff", + type=float, + help="Fraction of unmasked genotypes required to retain a site (default=0.75)", + default=0.75, +) +parser_c.add_argument( + "--chrArmsForMasking", + help=( + "A comma-separated list (no spaces) of chromosome arms from which we want to draw masking " + "information (or 'all' if we want to use all arms. Ignored if maskFileName is not specified." + ), + default="all", +) +parser_c.add_argument( + "--unmaskedFracCutoff", + type=float, + help="Minimum fraction of unmasked sites, if masking simulated data (default=0.25)", + default=0.25, +) +parser_c.add_argument( + "--outStatsDir", + help="Path to a directory where values of each statistic in each subwindow are recorded for each rep", + default="None", +) +parser_c.add_argument( + "--ancFileName", + help=( + "Path to a fasta-formatted file that contains inferred ancestral states ('N' if unknown)." + " This is used for masking, as sites that cannot be polarized are masked, and we mimic this in the simulted data." + " Ignored in diploid mode which currently does not use ancestral state information" + ), + default="None", +) +parser_c.add_argument( + "--pMisPol", + type=float, + help="The fraction of sites that will be intentionally polarized to better approximate real data (default=0.0)", + default=0.0, +) +parser_c.set_defaults(mode="fvecSim") +parser_c._positionals.title = "required arguments" + +parser_d.add_argument( + "shicMode", + help="specifies whether to use original haploid SHIC (use 'haploid') or diploSHIC ('diploid')", +) +parser_d.add_argument( + "chrArmVcfFile", + help="VCF format file containing data for our chromosome arm (other arms will be ignored)", +) +parser_d.add_argument( + "chrArm", + help="Exact name of the chromosome arm for which feature vectors will be calculated", +) +parser_d.add_argument("chrLen", type=int, help="Length of the chromosome arm") +parser_d.add_argument( + "fvecFileName", help="path to file where feature vectors will be written" +) +parser_d.add_argument( + "--targetPop", + help="Population ID of samples we wish to include", + default="None", +) +parser_d.add_argument( + "--sampleToPopFileName", + help=( + "Path to tab delimited file with population assignments; format: " + "SampleID\tpopID" + ), + default="None", +) +parser_d.add_argument( + "--winSize", + type=int, + help="Length of the large window (default=1100000)", + default=1100000, +) +parser_d.add_argument( + "--numSubWins", + type=int, + help="Number of sub-windows within each large window (default=11)", + default=11, +) +parser_d.add_argument( + "--maskFileName", + help=( + "Path to a fasta-formatted file that contains masking information (marked by 'N'); " + "must have an entry with title matching chrArm" + ), + default="None", +) +parser_d.add_argument( + "--unmaskedFracCutoff", + type=float, + help="Fraction of unmasked sites required to retain a subwindow (default=0.25)", + default=0.25, +) +parser_d.add_argument( + "--unmaskedGenoFracCutoff", + type=float, + help="Fraction of unmasked genotypes required to retain a site (default=0.75)", + default=0.75, +) +parser_d.add_argument( + "--ancFileName", + help=( + "Path to a fasta-formatted file that contains inferred ancestral states ('N' if unknown); " + "must have an entry with title matching chrArm. Ignored for diploid mode which currently does not use ancestral " + "state information." + ), + default="None", +) +parser_d.add_argument( + "--statFileName", + help="Path to a file where statistics will be written for each subwindow that is not filtered out", + default="None", +) +parser_d.add_argument( + "--segmentStart", + help="Left boundary of region in which feature vectors are calculated (whole arm if omitted)", + default="None", +) +parser_d.add_argument( + "--segmentEnd", + help="Right boundary of region in which feature vectors are calculated (whole arm if omitted)", + default="None", +) +parser_d.set_defaults(mode="fvecVcf") +parser_d._positionals.title = "required arguments" + +parser_e.add_argument( + "neutTrainingFileName", help="Path to our neutral feature vectors" +) +parser_e.add_argument( + "softTrainingFilePrefix", + help=( + "Prefix (including higher-level path) of files containing soft training examples" + "; files must end with '_$i.$ext' where $i is the subwindow index of the sweep and $ext is any extension." + ), +) +parser_e.add_argument( + "hardTrainingFilePrefix", + help=( + "Prefix (including higher-level path) of files containing hard training examples" + "; files must end with '_$i.$ext' where $i is the subwindow index of the sweep and $ext is any extension." + ), +) +parser_e.add_argument( + "sweepTrainingWindows", + type=int, + help=( + "comma-separated list of windows to classify as sweeps (usually just '5'" + " but without the quotes)" + ), +) +parser_e.add_argument( + "linkedTrainingWindows", + help=( + "list of windows to treat as linked to sweeps (usually '0,1,2,3,4,6,7,8,9,10' but" + " without the quotes)" + ), +) +parser_e.add_argument( + "outDir", help="path to directory where the training sets will be written" +) +parser_e.set_defaults(mode="makeTrainingSets") +parser_e._positionals.title = "required arguments" +if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) +args = parser.parse_args() +argsDict = vars(args) + +if argsDict["mode"] in ["train", "predict"]: + ########################################################### + # Import a bunch of libraries if everything checks out + import matplotlib + + matplotlib.use("Agg") + import numpy as np + import tensorflow as tf + from keras.models import Sequential, Model + from keras import optimizers + from keras.layers import Dense, Dropout, Activation, Flatten, Input + from keras.layers import Conv2D, MaxPooling2D, concatenate + from sklearn.model_selection import train_test_split + from keras.preprocessing.image import ImageDataGenerator + from keras.callbacks import EarlyStopping, ModelCheckpoint + import keras.backend as K + import fnmatch + + # nDims = argsDict['nDims'] + numSubWins = argsDict["numSubWins"] + +if argsDict["mode"] == "train": + trainingDir = argsDict["trainDir"] + testingDir = argsDict["testDir"] + epochOption = argsDict["epochs"] + outputModel = argsDict["outputModel"] + confusionFile = argsDict["confusionFile"] + # nCores = 12 + print("loading data now...") + # training data + hard = np.loadtxt(trainingDir + "hard.fvec", skiprows=1) + nDims = int(hard.shape[1] / numSubWins) + h1 = np.reshape(hard, (hard.shape[0], nDims, numSubWins)) + neut = np.loadtxt(trainingDir + "neut.fvec", skiprows=1) + n1 = np.reshape(neut, (neut.shape[0], nDims, numSubWins)) + soft = np.loadtxt(trainingDir + "soft.fvec", skiprows=1) + s1 = np.reshape(soft, (soft.shape[0], nDims, numSubWins)) + lsoft = np.loadtxt(trainingDir + "linkedSoft.fvec", skiprows=1) + ls1 = np.reshape(lsoft, (lsoft.shape[0], nDims, numSubWins)) + lhard = np.loadtxt(trainingDir + "linkedHard.fvec", skiprows=1) + lh1 = np.reshape(lhard, (lhard.shape[0], nDims, numSubWins)) + + both = np.concatenate((h1, n1, s1, ls1, lh1)) + y = np.concatenate( + ( + np.repeat(0, len(h1)), + np.repeat(1, len(n1)), + np.repeat(2, len(s1)), + np.repeat(3, len(ls1)), + np.repeat(4, len(lh1)), + ) + ) + + # reshape both to explicitly set depth image. need for theanno not sure with tensorflow + both = both.reshape(both.shape[0], nDims, numSubWins, 1) + if trainingDir == testingDir: + X_train, X_test, y_train, y_test = train_test_split( + both, y, test_size=0.2 + ) + else: + X_train = both + y_train = y + # testing data + hard = np.loadtxt(testingDir + "hard.fvec", skiprows=1) + h1 = np.reshape(hard, (hard.shape[0], nDims, numSubWins)) + neut = np.loadtxt(testingDir + "neut.fvec", skiprows=1) + n1 = np.reshape(neut, (neut.shape[0], nDims, numSubWins)) + soft = np.loadtxt(testingDir + "soft.fvec", skiprows=1) + s1 = np.reshape(soft, (soft.shape[0], nDims, numSubWins)) + lsoft = np.loadtxt(testingDir + "linkedSoft.fvec", skiprows=1) + ls1 = np.reshape(lsoft, (lsoft.shape[0], nDims, numSubWins)) + lhard = np.loadtxt(testingDir + "linkedHard.fvec", skiprows=1) + lh1 = np.reshape(lhard, (lhard.shape[0], nDims, numSubWins)) + + both2 = np.concatenate((h1, n1, s1, ls1, lh1)) + X_test = both2.reshape(both2.shape[0], nDims, numSubWins, 1) + y_test = np.concatenate( + ( + np.repeat(0, len(h1)), + np.repeat(1, len(n1)), + np.repeat(2, len(s1)), + np.repeat(3, len(ls1)), + np.repeat(4, len(lh1)), + ) + ) + + Y_train = tf.keras.utils.to_categorical(y_train, 5) + Y_test = tf.keras.utils.to_categorical(y_test, 5) + X_valid, X_test, Y_valid, Y_test = train_test_split( + X_test, Y_test, test_size=0.5 + ) + + datagen = ImageDataGenerator( + featurewise_center=True, + featurewise_std_normalization=True, + horizontal_flip=True, + ) + + validation_gen = ImageDataGenerator( + featurewise_center=True, + featurewise_std_normalization=True, + horizontal_flip=False, + ) + test_gen = ImageDataGenerator( + featurewise_center=True, + featurewise_std_normalization=True, + horizontal_flip=False, + ) + + # print(X_train.shape) + print("training set has %d examples" % X_train.shape[0]) + print("validation set has %d examples" % X_valid.shape[0]) + print("test set has %d examples" % X_test.shape[0]) + + model_in = Input(X_train.shape[1:]) + h = Conv2D(128, 3, activation="relu", padding="same", name="conv1_1")( + model_in + ) + h = Conv2D(64, 3, activation="relu", padding="same", name="conv1_2")(h) + h = MaxPooling2D(pool_size=3, name="pool1", padding="same")(h) + h = Dropout(0.15, name="drop1")(h) + h = Flatten(name="flaten1")(h) + + dh = Conv2D( + 128, + 2, + activation="relu", + dilation_rate=[1, 3], + padding="same", + name="dconv1_1", + )(model_in) + dh = Conv2D( + 64, + 2, + activation="relu", + dilation_rate=[1, 3], + padding="same", + name="dconv1_2", + )(dh) + dh = MaxPooling2D(pool_size=2, name="dpool1")(dh) + dh = Dropout(0.15, name="ddrop1")(dh) + dh = Flatten(name="dflaten1")(dh) + + dh1 = Conv2D( + 128, + 2, + activation="relu", + dilation_rate=[1, 4], + padding="same", + name="dconv4_1", + )(model_in) + dh1 = Conv2D( + 64, + 2, + activation="relu", + dilation_rate=[1, 4], + padding="same", + name="dconv4_2", + )(dh1) + dh1 = MaxPooling2D(pool_size=2, name="d1pool1")(dh1) + dh1 = Dropout(0.15, name="d1drop1")(dh1) + dh1 = Flatten(name="d1flaten1")(dh1) + + h = concatenate([h, dh, dh1]) + h = Dense(512, name="512dense", activation="relu")(h) + h = Dropout(0.2, name="drop7")(h) + h = Dense(128, name="last_dense", activation="relu")(h) + h = Dropout(0.1, name="drop8")(h) + output = Dense(5, name="out_dense", activation="softmax")(h) + model = Model(inputs=[model_in], outputs=[output]) + + model.compile( + loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"] + ) + + # define early stopping callback + earlystop = EarlyStopping( + monitor="val_accuracy", + min_delta=0.001, + patience=5, + verbose=1, + mode="auto", + ) + + model_json = model.to_json() + with open(outputModel + ".json", "w") as json_file: + json_file.write(model_json) + modWeightsFilepath = outputModel + ".weights.hdf5" + checkpoint = ModelCheckpoint( + modWeightsFilepath, + monitor="val_accuracy", + verbose=1, + save_best_only=True, + save_weights_only=True, + mode="auto", + ) + + callbacks_list = [earlystop, checkpoint] + # callbacks_list = [earlystop] #turning off checkpointing-- just want accuracy assessment + + datagen.fit(X_train) + validation_gen.fit(X_valid) + test_gen.fit(X_test) + start = time.time() + model.fit( + datagen.flow(X_train, Y_train, batch_size=32), + steps_per_epoch=len(X_train) / 32, + epochs=epochOption, + verbose=1, + callbacks=callbacks_list, + validation_data=validation_gen.flow(X_valid, Y_valid, batch_size=32), + validation_steps=len(X_test) / 32, + ) + # model.fit(X_train, Y_train, batch_size=32, epochs=100,validation_data=(X_test,Y_test),callbacks=callbacks_list, verbose=1) + score = model.evaluate( + test_gen.flow(X_test, Y_test, batch_size=32), steps=len(Y_test) / 32 + ) + sys.stderr.write( + "total time spent fitting and evaluating: %f secs\n" + % (time.time() - start) + ) + + print("evaluation on test set:") + print("diploSHIC loss: %f" % score[0]) + print("diploSHIC accuracy: %f" % score[1]) + if confusionFile: + from misc import plot_confusion_matrix + import matplotlib.pyplot as plt + + plot_confusion_matrix( + model, + test_gen.standardize(X_test), + Y_test, + labels=[0, 4, 2, 3, 1], + display_labels=[ + "Hard", + "Hard-linked", + "Soft", + "Soft-linked", + "Neutral", + ], + cmap=plt.cm.Blues, + normalize="true", + ) + plt.savefig(confusionFile, bbox_inches="tight") + +elif argsDict["mode"] == "predict": + import pandas as pd + from keras.models import model_from_json + + # import data from predictFile + x_df = pd.read_table(argsDict["predictFile"]) + if argsDict["simData"]: + testX = x_df[list(x_df)[:]].to_numpy() + else: + testX = x_df[list(x_df)[4:]].to_numpy() + nDims = int(testX.shape[1] / numSubWins) + np.reshape(testX, (testX.shape[0], nDims, numSubWins)) + # add channels + testX = testX.reshape(testX.shape[0], nDims, numSubWins, 1) + # set up generator for normalization + validation_gen = ImageDataGenerator( + featurewise_center=True, + featurewise_std_normalization=True, + horizontal_flip=False, + ) + validation_gen.fit(testX) + + # import model + json_file = open(argsDict["modelStructure"], "r") + loaded_model_json = json_file.read() + json_file.close() + model = model_from_json(loaded_model_json) + # load weights into new model + model.load_weights(argsDict["modelWeights"]) + print("Loaded model from disk") + + # get predictions + preds = model.predict(validation_gen.standardize(testX)) + predictions = np.argmax(preds, axis=1) + + # np.repeat(0,len(h1)),np.repeat(1,len(n1)), np.repeat(2,len(s1)), np.repeat(3,len(ls1)), np.repeat(4,len(lh1) + classDict = { + 0: "hard", + 1: "neutral", + 2: "soft", + 3: "linkedSoft", + 4: "linkedHard", + } + + # output the predictions + outputFile = open(argsDict["predictFileOutput"], "w") + if argsDict["simData"]: + outputFile.write( + "predClass\tprob(neutral)\tprob(likedSoft)\tprob(linkedHard)\tprob(soft)\tprob(hard)\n" + ) + else: + outputFile.write( + "chrom\tclassifiedWinStart\tclassifiedWinEnd\tbigWinRange\tpredClass\tprob(neutral)\tprob(likedSoft)\tprob(linkedHard)\tprob(soft)\tprob(hard)\n" + ) + + for index, row in x_df.iterrows(): + if argsDict["simData"]: + outputFile.write( + "{}\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}\n".format( + classDict[predictions[index]], + preds[index][1], + preds[index][3], + preds[index][4], + preds[index][2], + preds[index][0], + ) + ) + else: + outputFile.write( + "{}\t{}\t{}\t{}\t{}\t{:f}\t{:f}\t{:f}\t{:f}\t{:f}\n".format( + row["chrom"], + row["classifiedWinStart"], + row["classifiedWinEnd"], + row["bigWinRange"], + classDict[predictions[index]], + preds[index][1], + preds[index][3], + preds[index][4], + preds[index][2], + preds[index][0], + ) + ) + outputFile.close() + print("{} predictions complete".format(index + 1)) +elif argsDict["mode"] == "fvecSim": + if argsDict["shicMode"].lower() == "diploid": + cmdArgs = [ + argsDict["msOutFile"], + argsDict["totalPhysLen"], + argsDict["numSubWins"], + argsDict["maskFileName"], + argsDict["vcfForMaskFileName"], + argsDict["popForMask"], + argsDict["sampleToPopFileName"], + argsDict["unmaskedGenoFracCutoff"], + argsDict["chrArmsForMasking"], + argsDict["unmaskedFracCutoff"], + argsDict["outStatsDir"], + argsDict["fvecFileName"], + ] + cmd = ( + pyExec + + " " + + diploShicDir + + "makeFeatureVecsForSingleMsDiploid.py " + + " ".join([str(x) for x in cmdArgs]) + ) + elif argsDict["shicMode"].lower() == "haploid": + cmdArgs = [ + argsDict["msOutFile"], + argsDict["totalPhysLen"], + argsDict["numSubWins"], + argsDict["maskFileName"], + argsDict["ancFileName"], + argsDict["chrArmsForMasking"], + argsDict["unmaskedFracCutoff"], + argsDict["pMisPol"], + argsDict["outStatsDir"], + argsDict["fvecFileName"], + ] + cmd = ( + pyExec + + " " + + diploShicDir + + "makeFeatureVecsForSingleMs_ogSHIC.py " + + " ".join([str(x) for x in cmdArgs]) + ) + else: + sys.exit("'shicMode' must be set to either 'diploid' or 'haploid'") + print(cmd) + subprocess.call(cmd.split()) +elif argsDict["mode"] == "fvecVcf": + if argsDict["shicMode"].lower() == "diploid": + cmdArgs = [ + argsDict["chrArmVcfFile"], + argsDict["chrArm"], + argsDict["chrLen"], + argsDict["targetPop"], + argsDict["winSize"], + argsDict["numSubWins"], + argsDict["maskFileName"], + argsDict["unmaskedFracCutoff"], + argsDict["unmaskedGenoFracCutoff"], + argsDict["sampleToPopFileName"], + argsDict["statFileName"], + argsDict["fvecFileName"], + ] + cmd = ( + pyExec + + " " + + diploShicDir + + "makeFeatureVecsForChrArmFromVcfDiploid.py " + + " ".join([str(x) for x in cmdArgs]) + ) + elif argsDict["shicMode"].lower() == "haploid": + cmdArgs = [ + argsDict["chrArmVcfFile"], + argsDict["chrArm"], + argsDict["chrLen"], + argsDict["targetPop"], + argsDict["winSize"], + argsDict["numSubWins"], + argsDict["maskFileName"], + argsDict["unmaskedFracCutoff"], + argsDict["sampleToPopFileName"], + argsDict["ancFileName"], + argsDict["statFileName"], + argsDict["fvecFileName"], + ] + cmd = ( + pyExec + + " " + + diploShicDir + + "makeFeatureVecsForChrArmFromVcf_ogSHIC.py " + + " ".join([str(x) for x in cmdArgs]) + ) + else: + sys.exit("'shicMode' must be set to either 'diploid' or 'haploid'") + additionalArgs = [] + if argsDict["segmentStart"] != "None": + additionalArgs += [argsDict["segmentStart"], argsDict["segmentEnd"]] + cmd += " " + " ".join(additionalArgs) + # cmd += " > " + argsDict['fvecFileName'] + print(cmd) + subprocess.call(cmd.split()) +elif argsDict["mode"] == "makeTrainingSets": + cmdArgs = [ + argsDict["neutTrainingFileName"], + argsDict["softTrainingFilePrefix"], + argsDict["hardTrainingFilePrefix"], + argsDict["sweepTrainingWindows"], + argsDict["linkedTrainingWindows"], + argsDict["outDir"], + ] + cmd = ( + pyExec + + " " + + diploShicDir + + "makeTrainingSets.py " + + " ".join([str(x) for x in cmdArgs]) + ) + print(cmd) + subprocess.call(cmd.split()) diff --git a/examples/README.md b/diploshic/examples/README.md similarity index 100% rename from examples/README.md rename to diploshic/examples/README.md diff --git a/examples/Snakefile b/diploshic/examples/Snakefile similarity index 100% rename from examples/Snakefile rename to diploshic/examples/Snakefile diff --git a/examples/collate.sh b/diploshic/examples/collate.sh similarity index 100% rename from examples/collate.sh rename to diploshic/examples/collate.sh diff --git a/examples/efficient_simulation.sh b/diploshic/examples/efficient_simulation.sh similarity index 100% rename from examples/efficient_simulation.sh rename to diploshic/examples/efficient_simulation.sh diff --git a/examples/train.sh b/diploshic/examples/train.sh similarity index 100% rename from examples/train.sh rename to diploshic/examples/train.sh diff --git a/fvTools.py b/diploshic/fvTools.py similarity index 60% rename from fvTools.py rename to diploshic/fvTools.py index 81f17c2..a2faa6f 100644 --- a/fvTools.py +++ b/diploshic/fvTools.py @@ -4,7 +4,7 @@ from allel.model.ndarray import SortedIndex from allel.util import asarray_ndim from scipy.spatial.distance import squareform -import shicstats +import diploshic.shicstats as dps import numpy as np import random import gzip @@ -12,7 +12,7 @@ def misPolarizeAlleleCounts(ac, pMisPol): - pMisPolInv = 1-pMisPol + pMisPolInv = 1 - pMisPol mapping = [] for i in range(len(ac)): if random.random() >= pMisPolInv: @@ -33,7 +33,7 @@ def calledGenoFracAtSite(genosAtSite): missingCount += 1 else: calledCount += 1 - return calledCount/float(missingCount + calledCount) + return calledCount / float(missingCount + calledCount) def isHaploidVcfGenoArray(genos): @@ -44,25 +44,28 @@ def diploidizeGenotypeArray(genos): numSnps, numSamples, numAlleles = genos.shape if numSamples % 2 != 0: sys.stderr.write( - "Diploidizing an odd-numbered sample. The last genome will be truncated.\n") + "Diploidizing an odd-numbered sample. The last genome will be truncated.\n" + ) numSamples -= 1 newGenos = [] for i in range(numSnps): currSnp = [] for j in range(0, numSamples, 2): - currSnp.append([genos[i, j, 0], genos[i, j+1, 0]]) + currSnp.append([genos[i, j, 0], genos[i, j + 1, 0]]) newGenos.append(currSnp) newGenos = np.array(newGenos) return allel.GenotypeArray(newGenos) + # contains some bits modified from scikit-allel by Alistair Miles -def readStatsDafsComputeStandardizationBins(statAndDafFileName, - nBins=50, pMisPol=0.0): +def readStatsDafsComputeStandardizationBins( + statAndDafFileName, nBins=50, pMisPol=0.0 +): stats = {} dafs = [] - pMisPolInv = 1-pMisPol + pMisPolInv = 1 - pMisPol misPolarizedSnps, totalSnps = 0, 0 with open(statAndDafFileName) as statAndDafFile: first = True @@ -77,7 +80,7 @@ def readStatsDafsComputeStandardizationBins(statAndDafFileName, else: totalSnps += 1 if random.random() >= pMisPolInv: - dafs.append(1-float(line[0])) + dafs.append(1 - float(line[0])) misPolarizedSnps += 1 else: dafs.append(float(line[0])) @@ -91,26 +94,32 @@ def readStatsDafsComputeStandardizationBins(statAndDafFileName, score_nonan = stats[statName][nonan] daf_nonan = np.array(dafs)[nonan] bins = allel.stats.selection.make_similar_sized_bins(daf_nonan, nBins) - mean_score, _, _ = scipy.stats.binned_statistic(daf_nonan, score_nonan, - statistic=np.mean, - bins=bins) - std_score, _, _ = scipy.stats.binned_statistic(daf_nonan, score_nonan, - statistic=np.std, - bins=bins) + mean_score, _, _ = scipy.stats.binned_statistic( + daf_nonan, score_nonan, statistic=np.mean, bins=bins + ) + std_score, _, _ = scipy.stats.binned_statistic( + daf_nonan, score_nonan, statistic=np.std, bins=bins + ) statInfo[statName] = (mean_score, std_score, bins) - sys.stderr.write("mispolarized %d of %d (%f%%) "\ - "SNPs when standardizing scores in %s\n" % ( - misPolarizedSnps, totalSnps, - 100*misPolarizedSnps/float(totalSnps), - statAndDafFileName)) + sys.stderr.write( + "mispolarized %d of %d (%f%%) " + "SNPs when standardizing scores in %s\n" + % ( + misPolarizedSnps, + totalSnps, + 100 * misPolarizedSnps / float(totalSnps), + statAndDafFileName, + ) + ) return statInfo + # includes a snippet copied from scikit-allel -def standardize_by_allele_count_from_precomp_bins(score, - dafs, - standardizationInfo): +def standardize_by_allele_count_from_precomp_bins( + score, dafs, standardizationInfo +): score_standardized = np.empty_like(score) mean_score, std_score, bins = standardizationInfo dafs = np.array(dafs) @@ -119,10 +128,10 @@ def standardize_by_allele_count_from_precomp_bins(score, x2 = bins[i + 1] if i == 0: # first bin - loc = (dafs < x2) + loc = dafs < x2 elif i == len(bins) - 2: # last bin - loc = (dafs >= x1) + loc = dafs >= x1 else: # middle bins loc = (dafs >= x1) & (dafs < x2) @@ -137,7 +146,7 @@ def readFaArm(armFileName, armName=False): fopen = gzip.open else: fopen = open - with fopen(armFileName, 'rt') as armFile: + with fopen(armFileName, "rt") as armFile: reading = False seq = "" for line in armFile: @@ -165,9 +174,9 @@ def polarizeSnps(unmasked, positions, refAlleles, altAlleles, ancArm): mapping = [] for i in range(len(ancArm)): - if ancArm[i] in 'ACGT': - if i+1 in isSnp: - ref, alt = refAlleles[isSnp[i+1]], altAlleles[isSnp[i+1]] + if ancArm[i] in "ACGT": + if i + 1 in isSnp: + ref, alt = refAlleles[isSnp[i + 1]], altAlleles[isSnp[i + 1]] if ancArm[i] == ref: mapping.append([0, 1]) # no swap elif ancArm[i] == alt: @@ -177,12 +186,13 @@ def polarizeSnps(unmasked, positions, refAlleles, altAlleles, ancArm): unmasked[i] = False elif ancArm[i] == "N": unmasked[i] = False - if i+1 in isSnp: + if i + 1 in isSnp: mapping.append([0, 1]) # no swap -- failed to polarize else: sys.exit( - "Found a character in ancestral chromosome "\ - "that is not 'A', 'C', 'G', 'T' or 'N' (all upper case)!\n") + "Found a character in ancestral chromosome " + "that is not 'A', 'C', 'G', 'T' or 'N' (all upper case)!\n" + ) assert len(mapping) == len(positions) return mapping, unmasked @@ -192,11 +202,12 @@ def getAccessibilityInWins(isAccessibleArm, winLen, subWinLen, cutoff): badWinCount = 0 lastWinEnd = len(isAccessibleArm) - len(isAccessibleArm) % winLen for i in range(0, lastWinEnd, winLen): - currWin = isAccessibleArm[i:i+winLen] + currWin = isAccessibleArm[i : i + winLen] goodWin = True for subWinStart in range(0, winLen, subWinLen): - unmaskedFrac = currWin[subWinStart:subWinStart + - subWinLen].count(True)/float(subWinLen) + unmaskedFrac = currWin[ + subWinStart : subWinStart + subWinLen + ].count(True) / float(subWinLen) if unmaskedFrac < cutoff: goodWin = False if goodWin: @@ -206,8 +217,9 @@ def getAccessibilityInWins(isAccessibleArm, winLen, subWinLen, cutoff): return wins -def windowVals(vals, subWinBounds, - positionArray, keepNans=False, absVal=False): +def windowVals( + vals, subWinBounds, positionArray, keepNans=False, absVal=False +): assert len(vals) == len(positionArray) subWinIndex = 0 @@ -238,7 +250,7 @@ def readFa(faFileName, upper=False): fopen = gzip.open else: fopen = open - with fopen(faFileName, 'rt') as faFile: + with fopen(faFileName, "rt") as faFile: reading = False for line in faFile: if line.startswith(">"): @@ -262,14 +274,20 @@ def readFa(faFileName, upper=False): return seqData -def readMaskAndAncDataForTraining(maskFileName, ancFileName, - totalPhysLen, subWinLen, - chrArmsForMasking, - shuffle=True, cutoff=0.25): +def readMaskAndAncDataForTraining( + maskFileName, + ancFileName, + totalPhysLen, + subWinLen, + chrArmsForMasking, + shuffle=True, + cutoff=0.25, +): isAccessible = [] maskData, ancData = readFa(maskFileName, upper=True), readFa( - ancFileName, upper=True) - if 'all' in chrArmsForMasking: + ancFileName, upper=True + ) + if "all" in chrArmsForMasking: chrArmsForMasking = sorted(maskData) for currChr in chrArmsForMasking: assert len(maskData[currChr]) == len(ancData[currChr]) @@ -280,7 +298,8 @@ def readMaskAndAncDataForTraining(maskFileName, ancFileName, else: isAccessibleArm.append(True) windowedAccessibility = getAccessibilityInWins( - isAccessibleArm, totalPhysLen, subWinLen, cutoff) + isAccessibleArm, totalPhysLen, subWinLen, cutoff + ) if windowedAccessibility: isAccessible += windowedAccessibility if shuffle: @@ -293,50 +312,69 @@ def readMaskAndAncDataForTraining(maskFileName, ancFileName, return isAccessible -def getGenoMaskInfoInWins(isAccessibleArm, genos, - positions, positions2SnpIndices, - winLen, subWinLen, cutoff, genoCutoff): +def getGenoMaskInfoInWins( + isAccessibleArm, + genos, + positions, + positions2SnpIndices, + winLen, + subWinLen, + cutoff, + genoCutoff, +): windowedAcc, windowedGenoMask = [], [] badWinCount = 0 lastWinEnd = len(isAccessibleArm) - len(isAccessibleArm) % winLen posIdx = 0 snpIndicesInWins = [] - sys.stderr.write("about to get geno masks from arm; "\ - "len: %d, genos shape: %s, num snps: %d\n" % ( - len(isAccessibleArm), genos.shape, len(positions))) + sys.stderr.write( + "about to get geno masks from arm; " + "len: %d, genos shape: %s, num snps: %d\n" + % (len(isAccessibleArm), genos.shape, len(positions)) + ) calledFracs = [] for winOffset in range(0, lastWinEnd, winLen): - firstPos = winOffset+1 - lastPos = winOffset+winLen + firstPos = winOffset + 1 + lastPos = winOffset + winLen snpIndicesInWin = [] - assert len(positions) == 0 or posIdx >= len( - positions) or positions[posIdx] >= firstPos + assert ( + len(positions) == 0 + or posIdx >= len(positions) + or positions[posIdx] >= firstPos + ) while posIdx < len(positions) and positions[posIdx] <= lastPos: - if isAccessibleArm[positions[posIdx]-1]: + if isAccessibleArm[positions[posIdx] - 1]: calledFrac = calledGenoFracAtSite(genos[posIdx]) calledFracs.append(calledFrac) if calledFrac >= genoCutoff: snpIndicesInWin.append(posIdx) else: - isAccessibleArm[positions[posIdx]-1] = False + isAccessibleArm[positions[posIdx] - 1] = False posIdx += 1 snpIndicesInWins.append(snpIndicesInWin) if len(calledFracs) > 0: - sys.stderr.write("min calledFrac: %g; max calledFrac: %g; "\ - "mean: %g; median: %g\n" % ( - min(calledFracs), max(calledFracs), - np.median(calledFracs), np.mean(calledFracs))) + sys.stderr.write( + "min calledFrac: %g; max calledFrac: %g; " + "mean: %g; median: %g\n" + % ( + min(calledFracs), + max(calledFracs), + np.median(calledFracs), + np.mean(calledFracs), + ) + ) else: sys.stderr.write("no SNPs in chromosome!\n") winIndex = 0 for winOffset in range(0, lastWinEnd, winLen): - currWin = isAccessibleArm[winOffset:winOffset+winLen] + currWin = isAccessibleArm[winOffset : winOffset + winLen] if len(snpIndicesInWins[winIndex]) > 0: currGenos = genos.subset(sel0=snpIndicesInWins[winIndex]) goodWin = True for subWinStart in range(0, winLen, subWinLen): - unmaskedFrac = currWin[subWinStart:subWinStart + - subWinLen].count(True)/float(subWinLen) + unmaskedFrac = currWin[ + subWinStart : subWinStart + subWinLen + ].count(True) / float(subWinLen) if unmaskedFrac < cutoff: goodWin = False if goodWin: @@ -346,9 +384,20 @@ def getGenoMaskInfoInWins(isAccessibleArm, genos, badWinCount += 1 winIndex += 1 if windowedAcc: - sys.stderr.write("returning %d geno arrays, "\ - "with an avg of %f snps\n" % (len(windowedGenoMask), sum( - [len(windowedGenoMask[i]) for i in range(len(windowedGenoMask))])/float(len(windowedGenoMask)))) # NOQA + sys.stderr.write( + "returning %d geno arrays, " + "with an avg of %f snps\n" + % ( + len(windowedGenoMask), + sum( + [ + len(windowedGenoMask[i]) + for i in range(len(windowedGenoMask)) + ] + ) + / float(len(windowedGenoMask)), + ) + ) # NOQA else: sys.stderr.write("returning 0 geno arrays\n") return windowedAcc, windowedGenoMask @@ -363,52 +412,75 @@ def readSampleToPopFile(sampleToPopFileName): return table -def extractGenosAndPositionsForArm(vcfFile, chroms, - currChr, sampleIndicesToKeep): +def extractGenosAndPositionsForArm( + vcfFile, chroms, currChr, sampleIndicesToKeep +): # sys.stderr.write("extracting vcf info for arm %s\n" %(currChr)) rawgenos = np.take( - vcfFile["calldata/GT"], [i for i in range(len(chroms)) if chroms[i] == currChr], axis=0) # NOQA + vcfFile["calldata/GT"], + [i for i in range(len(chroms)) if chroms[i] == currChr], + axis=0, + ) # NOQA if len(rawgenos) > 0: genos = allel.GenotypeArray(rawgenos).subset(sel1=sampleIndicesToKeep) if isHaploidVcfGenoArray(genos): sys.stderr.write( - "Detected haploid input for %s. "\ - "Converting into diploid individuals "\ - "(combining haplotypes in order).\n" % (currChr)) + "Detected haploid input for %s. " + "Converting into diploid individuals " + "(combining haplotypes in order).\n" % (currChr) + ) genos = diploidizeGenotypeArray(genos) sys.stderr.write("Done diploidizing %s\n" % (currChr)) positions = np.extract(chroms == currChr, vcfFile["variants/POS"]) if len(positions) > 0: genos = allel.GenotypeArray( - genos.subset(sel0=range(len(positions)))) + genos.subset(sel0=range(len(positions))) + ) positions2SnpIndices = {} for i in range(len(positions)): positions2SnpIndices[positions[i]] = i - assert len(positions) == len( - positions2SnpIndices) and len(positions) == len(genos) - return genos, positions, positions2SnpIndices, genos.count_alleles().is_biallelic() # NOQA + assert len(positions) == len(positions2SnpIndices) and len( + positions + ) == len(genos) + return ( + genos, + positions, + positions2SnpIndices, + genos.count_alleles().is_biallelic(), + ) # NOQA return np.array([]), [], {}, np.array([]) -def readMaskDataForTraining(maskFileName, totalPhysLen, - subWinLen, chrArmsForMasking, - shuffle=True, cutoff=0.25, - genoCutoff=0.75, vcfForMaskFileName=None, - sampleToPopFileName=None, pop=None): +def readMaskDataForTraining( + maskFileName, + totalPhysLen, + subWinLen, + chrArmsForMasking, + shuffle=True, + cutoff=0.25, + genoCutoff=0.75, + vcfForMaskFileName=None, + sampleToPopFileName=None, + pop=None, +): if vcfForMaskFileName: - sys.stderr.write("reading geno mask info from %s\n" % - (vcfForMaskFileName)) + sys.stderr.write( + "reading geno mask info from %s\n" % (vcfForMaskFileName) + ) vcfFile = allel.read_vcf(vcfForMaskFileName) sys.stderr.write("done with read\n") chroms = vcfFile["variants/CHROM"] samples = vcfFile["samples"] if sampleToPopFileName: sampleToPop = readSampleToPopFile(sampleToPopFileName) - sampleIndicesToKeep = [i for i in range( - len(samples)) if sampleToPop.get(samples[i], "popNotFound!") == pop] # NOQA + sampleIndicesToKeep = [ + i + for i in range(len(samples)) + if sampleToPop.get(samples[i], "popNotFound!") == pop + ] # NOQA else: sampleIndicesToKeep = [i for i in range(len(samples))] if maskFileName.endswith(".gz"): @@ -421,48 +493,76 @@ def readMaskDataForTraining(maskFileName, totalPhysLen, readingMasks = False isAccessible, isAccessibleArm = [], [] genoMaskInfo = [] - with fopen(maskFileName, 'rt') as maskFile: + with fopen(maskFileName, "rt") as maskFile: for line in maskFile: if line.startswith(">"): if readingMasks and len(isAccessibleArm) >= totalPhysLen: if vcfForMaskFileName: sys.stderr.write( - "processing sites "\ - "and genos for %s\n" % (currChr)) - windowedAccessibility, windowedGenoMask = getGenoMaskInfoInWins( - isAccessibleArm, genos, positions, - positions2SnpIndices, totalPhysLen, subWinLen, cutoff, genoCutoff) + "processing sites " + "and genos for %s\n" % (currChr) + ) + ( + windowedAccessibility, + windowedGenoMask, + ) = getGenoMaskInfoInWins( + isAccessibleArm, + genos, + positions, + positions2SnpIndices, + totalPhysLen, + subWinLen, + cutoff, + genoCutoff, + ) if windowedAccessibility: isAccessible += windowedAccessibility genoMaskInfo += windowedGenoMask else: windowedAccessibility = getAccessibilityInWins( - isAccessibleArm, totalPhysLen, subWinLen, cutoff) + isAccessibleArm, totalPhysLen, subWinLen, cutoff + ) if windowedAccessibility: isAccessible += windowedAccessibility currChr = line[1:].strip() currPos = 0 # sys.stderr.write("chrom: " + currChr + "\n") - if 'all' in chrArmsForMasking or currChr in chrArmsForMasking: + if "all" in chrArmsForMasking or currChr in chrArmsForMasking: readingMasks = True else: readingMasks = False isAccessibleArm = [] if vcfForMaskFileName and readingMasks: - sys.stderr.write("checking geno mask "\ - "info from %s for %s\n" % ( - vcfForMaskFileName, currChr)) - genos, positions, positions2SnpIndices, isBiallelic = extractGenosAndPositionsForArm( - vcfFile, chroms, currChr, sampleIndicesToKeep) + sys.stderr.write( + "checking geno mask " + "info from %s for %s\n" % (vcfForMaskFileName, currChr) + ) + ( + genos, + positions, + positions2SnpIndices, + isBiallelic, + ) = extractGenosAndPositionsForArm( + vcfFile, chroms, currChr, sampleIndicesToKeep + ) else: if readingMasks: for char in line.strip().upper(): - if char == 'N': + if char == "N": isAccessibleArm.append(False) - elif vcfForMaskFileName and currPos in positions2SnpIndices: + elif ( + vcfForMaskFileName + and currPos in positions2SnpIndices + ): genosChecked += 1 - if isBiallelic[positions2SnpIndices[currPos]] and calledGenoFracAtSite(genos[positions2SnpIndices[currPos]]) >= genoCutoff: # NOQA + if ( + isBiallelic[positions2SnpIndices[currPos]] + and calledGenoFracAtSite( + genos[positions2SnpIndices[currPos]] + ) + >= genoCutoff + ): # NOQA isAccessibleArm.append(True) else: isAccessibleArm.append(False) @@ -473,15 +573,22 @@ def readMaskDataForTraining(maskFileName, totalPhysLen, if vcfForMaskFileName: sys.stderr.write("processing sites and genos for %s\n" % (currChr)) windowedAccessibility, windowedGenoMask = getGenoMaskInfoInWins( - isAccessibleArm, genos, positions, - positions2SnpIndices, totalPhysLen, - subWinLen, cutoff, genoCutoff) + isAccessibleArm, + genos, + positions, + positions2SnpIndices, + totalPhysLen, + subWinLen, + cutoff, + genoCutoff, + ) if windowedAccessibility: isAccessible += windowedAccessibility genoMaskInfo += windowedGenoMask else: windowedAccessibility = getAccessibilityInWins( - isAccessibleArm, totalPhysLen, subWinLen, cutoff) + isAccessibleArm, totalPhysLen, subWinLen, cutoff + ) if windowedAccessibility: isAccessible += windowedAccessibility if shuffle: @@ -494,9 +601,11 @@ def readMaskDataForTraining(maskFileName, totalPhysLen, random.shuffle(isAccessible) if len(isAccessible) == 0: - sys.exit("Error: Couldn't find a single window in our "\ - "real data for masking that survived filters. May have to "\ - "disable masking.\n") + sys.exit( + "Error: Couldn't find a single window in our " + "real data for masking that survived filters. May have to " + "disable masking.\n" + ) for i in range(len(isAccessible)): assert len(isAccessible[i]) == totalPhysLen sys.stderr.write("checked genotypes at %d sites\n" % (genosChecked)) @@ -534,7 +643,7 @@ def readMaskDataForScan(maskFileName, chrArm): fopen = gzip.open else: fopen = open - with fopen(maskFileName, 'rt') as maskFile: + with fopen(maskFileName, "rt") as maskFile: for line in maskFile: if line.startswith(">"): currChr = line[1:].strip() @@ -545,7 +654,7 @@ def readMaskDataForScan(maskFileName, chrArm): else: if readingMasks: for char in line.strip().upper(): - if char == 'N': + if char == "N": isAccessible.append(False) else: isAccessible.append(True) @@ -555,14 +664,14 @@ def readMaskDataForScan(maskFileName, chrArm): def normalizeFeatureVec(statVec): minVal = min(statVec) if minVal < 0: - statVec = [x-minVal for x in statVec] + statVec = [x - minVal for x in statVec] normStatVec = [] statSum = float(sum(statVec)) if statSum == 0 or any(np.isinf(statVec)) or any(np.isnan(statVec)): - normStatVec = [1.0/len(statVec)]*len(statVec) + normStatVec = [1.0 / len(statVec)] * len(statVec) else: for k in range(len(statVec)): - normStatVec.append(statVec[k]/statSum) + normStatVec.append(statVec[k] / statSum) return normStatVec @@ -587,35 +696,77 @@ def maxFDA(pos, ac, start=None, stop=None, is_accessible=None): dafs = [] for i in range(len(ac)): p1 = ac[i, 1] - n = p1+ac[i, 0] - dafs.append(p1/float(n)) + n = p1 + ac[i, 0] + dafs.append(p1 / float(n)) return max(dafs) -def calcAndAppendStatVal(alleleCounts, snpLocs, statName, - subWinStart, subWinEnd, statVals, - instanceIndex, subWinIndex, hapsInSubWin, - unmasked, precomputedStats): +def calcAndAppendStatVal( + alleleCounts, + snpLocs, + statName, + subWinStart, + subWinEnd, + statVals, + instanceIndex, + subWinIndex, + hapsInSubWin, + unmasked, + precomputedStats, +): if statName == "tajD": - statVals[statName][instanceIndex].append(allel.stats.diversity.tajima_d( # NOQA - alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd)) + statVals[statName][instanceIndex].append( + allel.stats.diversity.tajima_d( # NOQA + alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd + ) + ) elif statName == "pi": - statVals[statName][instanceIndex].append(allel.stats.diversity.sequence_diversity( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) + statVals[statName][instanceIndex].append( + allel.stats.diversity.sequence_diversity( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) elif statName == "thetaW": - statVals[statName][instanceIndex].append(allel.stats.diversity.watterson_theta( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) + statVals[statName][instanceIndex].append( + allel.stats.diversity.watterson_theta( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) elif statName == "thetaH": - statVals[statName][instanceIndex].append(thetah( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) + statVals[statName][instanceIndex].append( + thetah( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) elif statName == "fayWuH": statVals[statName][instanceIndex].append( - statVals["thetaH"][instanceIndex][subWinIndex]-statVals["pi"][instanceIndex][subWinIndex]) + statVals["thetaH"][instanceIndex][subWinIndex] + - statVals["pi"][instanceIndex][subWinIndex] + ) elif statName == "HapCount": statVals[statName][instanceIndex].append(len(hapsInSubWin.distinct())) elif statName == "maxFDA": - statVals[statName][instanceIndex].append(maxFDA( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) + statVals[statName][instanceIndex].append( + maxFDA( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) elif statName == "H1": h1, h12, h123, h21 = allel.stats.selection.garud_h(hapsInSubWin) statVals["H1"][instanceIndex].append(h1) @@ -626,43 +777,58 @@ def calcAndAppendStatVal(alleleCounts, snpLocs, statName, if "H2/H1" in statVals: statVals["H2/H1"][instanceIndex].append(h21) elif statName == "ZnS": - r2Matrix = shicstats.computeR2Matrix(hapsInSubWin) - statVals["ZnS"][instanceIndex].append(shicstats.ZnS(r2Matrix)[0]) - statVals["Omega"][instanceIndex].append(shicstats.omega(r2Matrix)[0]) + r2Matrix = dps.computeR2Matrix(hapsInSubWin) + statVals["ZnS"][instanceIndex].append(dps.ZnS(r2Matrix)[0]) + statVals["Omega"][instanceIndex].append(dps.omega(r2Matrix)[0]) elif statName == "RH": rMatrixFlat = allel.stats.ld.rogers_huff_r( - hapsInSubWin.to_genotypes(ploidy=2).to_n_alt()) + hapsInSubWin.to_genotypes(ploidy=2).to_n_alt() + ) rhAvg = rMatrixFlat.mean() statVals["RH"][instanceIndex].append(rhAvg) r2Matrix = squareform(rMatrixFlat ** 2) - statVals["Omega"][instanceIndex].append(shicstats.omega(r2Matrix)[0]) + statVals["Omega"][instanceIndex].append(dps.omega(r2Matrix)[0]) elif statName == "iHSMean": - vals = [x for x in precomputedStats["iHS"][subWinIndex] - if not (math.isnan(x) or math.isinf(x))] + vals = [ + x + for x in precomputedStats["iHS"][subWinIndex] + if not (math.isnan(x) or math.isinf(x)) + ] if len(vals) == 0: statVals["iHSMean"][instanceIndex].append(0.0) else: statVals["iHSMean"][instanceIndex].append( - sum(vals)/float(len(vals))) + sum(vals) / float(len(vals)) + ) elif statName == "nSLMean": - vals = [x for x in precomputedStats["nSL"][subWinIndex] - if not (math.isnan(x) or math.isinf(x))] + vals = [ + x + for x in precomputedStats["nSL"][subWinIndex] + if not (math.isnan(x) or math.isinf(x)) + ] if len(vals) == 0: statVals["nSLMean"][instanceIndex].append(0.0) else: statVals["nSLMean"][instanceIndex].append( - sum(vals)/float(len(vals))) + sum(vals) / float(len(vals)) + ) elif statName == "iHSMax": - vals = [x for x in precomputedStats["iHS"][subWinIndex] - if not (math.isnan(x) or math.isinf(x))] + vals = [ + x + for x in precomputedStats["iHS"][subWinIndex] + if not (math.isnan(x) or math.isinf(x)) + ] if len(vals) == 0: maxVal = 0.0 else: maxVal = max(vals) statVals["iHSMax"][instanceIndex].append(maxVal) elif statName == "nSLMax": - vals = [x for x in precomputedStats["nSL"][subWinIndex] - if not (math.isnan(x) or math.isinf(x))] + vals = [ + x + for x in precomputedStats["nSL"][subWinIndex] + if not (math.isnan(x) or math.isinf(x)) + ] if len(vals) == 0: maxVal = 0.0 else: @@ -670,41 +836,89 @@ def calcAndAppendStatVal(alleleCounts, snpLocs, statName, statVals["nSLMax"][instanceIndex].append(maxVal) elif statName == "iHSOutFrac": statVals["iHSOutFrac"][instanceIndex].append( - getOutlierFrac(precomputedStats["iHS"][subWinIndex])) + getOutlierFrac(precomputedStats["iHS"][subWinIndex]) + ) elif statName == "nSLOutFrac": statVals["nSLOutFrac"][instanceIndex].append( - getOutlierFrac(precomputedStats["nSL"][subWinIndex])) + getOutlierFrac(precomputedStats["nSL"][subWinIndex]) + ) elif statName == "distVar": - dists = shicstats.pairwiseDiffs( - hapsInSubWin)/float(unmasked[subWinStart-1:subWinEnd].count(True)) + dists = dps.pairwiseDiffs(hapsInSubWin) / float( + unmasked[subWinStart - 1 : subWinEnd].count(True) + ) statVals["distVar"][instanceIndex].append(np.var(dists, ddof=1)) statVals["distSkew"][instanceIndex].append(scipy.stats.skew(dists)) statVals["distKurt"][instanceIndex].append(scipy.stats.kurtosis(dists)) - elif statName in ["H12", "H123", "H2/H1", "Omega", "distVar", "distSkew", "distKurt"]: - assert len(statVals[statName][instanceIndex]) == subWinIndex+1 - - -def calcAndAppendStatValDiplo(alleleCounts, snpLocs, statName, subWinStart, subWinEnd, statVals, instanceIndex, subWinIndex, genosInSubWin, unmasked): + elif statName in [ + "H12", + "H123", + "H2/H1", + "Omega", + "distVar", + "distSkew", + "distKurt", + ]: + assert len(statVals[statName][instanceIndex]) == subWinIndex + 1 + + +def calcAndAppendStatValDiplo( + alleleCounts, + snpLocs, + statName, + subWinStart, + subWinEnd, + statVals, + instanceIndex, + subWinIndex, + genosInSubWin, + unmasked, +): genosNAlt = genosInSubWin.to_n_alt() if statName == "tajD": - statVals[statName][instanceIndex].append(allel.stats.diversity.tajima_d( - alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd)) + statVals[statName][instanceIndex].append( + allel.stats.diversity.tajima_d( + alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd + ) + ) elif statName == "pi": - statVals[statName][instanceIndex].append(allel.stats.diversity.sequence_diversity( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) + statVals[statName][instanceIndex].append( + allel.stats.diversity.sequence_diversity( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) elif statName == "thetaW": - statVals[statName][instanceIndex].append(allel.stats.diversity.watterson_theta( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) + statVals[statName][instanceIndex].append( + allel.stats.diversity.watterson_theta( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) elif statName == "thetaH": - statVals[statName][instanceIndex].append(thetah( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) + statVals[statName][instanceIndex].append( + thetah( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) elif statName == "fayWuH": statVals[statName][instanceIndex].append( - statVals["thetaH"][instanceIndex][subWinIndex]-statVals["pi"][instanceIndex][subWinIndex]) + statVals["thetaH"][instanceIndex][subWinIndex] + - statVals["pi"][instanceIndex][subWinIndex] + ) elif statName == "HapCount": statVals[statName][instanceIndex].append(len(hapsInSubWin.distinct())) elif statName == "nDiplos": - diplotypeCounts = shicstats.getHaplotypeFreqSpec(genosNAlt) + diplotypeCounts = dps.getHaplotypeFreqSpec(genosNAlt) nDiplos = diplotypeCounts[genosNAlt.shape[1]] statVals["nDiplos"][instanceIndex].append(nDiplos) diplotypeCounts = diplotypeCounts[:-1] @@ -716,7 +930,7 @@ def calcAndAppendStatValDiplo(alleleCounts, snpLocs, statName, subWinStart, subW if "diplo_H12" in statVals: statVals["diplo_H12"][instanceIndex].append(dh12) if "diplo_H2/H1" in statVals: - statVals["diplo_H2/H1"][instanceIndex].append(dh2/dh1) + statVals["diplo_H2/H1"][instanceIndex].append(dh2 / dh1) elif statName == "diplo_ZnS": if genosNAlt.shape[0] == 1: statVals["diplo_ZnS"][instanceIndex].append(0.0) @@ -726,43 +940,90 @@ def calcAndAppendStatValDiplo(alleleCounts, snpLocs, statName, subWinStart, subW r2Matrix2 = squareform(r2Matrix ** 2) statVals["diplo_ZnS"][instanceIndex].append(np.nanmean(r2Matrix2)) statVals["diplo_Omega"][instanceIndex].append( - shicstats.omega(r2Matrix2)[0]) + dps.omega(r2Matrix2)[0] + ) elif statName == "distVar": - dists = shicstats.pairwiseDiffsDiplo( - genosNAlt)/float(unmasked[subWinStart-1:subWinEnd].count(True)) + dists = dps.pairwiseDiffsDiplo(genosNAlt) / float( + unmasked[subWinStart - 1 : subWinEnd].count(True) + ) statVals["distVar"][instanceIndex].append(np.var(dists, ddof=1)) statVals["distSkew"][instanceIndex].append(scipy.stats.skew(dists)) statVals["distKurt"][instanceIndex].append(scipy.stats.kurtosis(dists)) - elif statName in ["diplo_H12", "diplo_H123", "diplo_H2/H1", "distVar", "distSkew", "distKurt", "diplo_Omega"]: - if not len(statVals[statName][instanceIndex]) == subWinIndex+1: - print(statName, instanceIndex, subWinIndex+1) - print(statVals["diplo_H1"][instanceIndex], - statVals["diplo_H12"][instanceIndex]) + elif statName in [ + "diplo_H12", + "diplo_H123", + "diplo_H2/H1", + "distVar", + "distSkew", + "distKurt", + "diplo_Omega", + ]: + if not len(statVals[statName][instanceIndex]) == subWinIndex + 1: + print(statName, instanceIndex, subWinIndex + 1) + print( + statVals["diplo_H1"][instanceIndex], + statVals["diplo_H12"][instanceIndex], + ) sys.exit() -def calcAndAppendStatValForScanDiplo(alleleCounts, snpLocs, statName, subWinStart, subWinEnd, statVals, subWinIndex, genosInSubWin, unmasked): +def calcAndAppendStatValForScanDiplo( + alleleCounts, + snpLocs, + statName, + subWinStart, + subWinEnd, + statVals, + subWinIndex, + genosInSubWin, + unmasked, +): genosNAlt = genosInSubWin.to_n_alt() if statName == "tajD": - statVals[statName].append(allel.stats.diversity.tajima_d( - alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd)) + statVals[statName].append( + allel.stats.diversity.tajima_d( + alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd + ) + ) elif statName == "pi": - statVals[statName].append(allel.stats.diversity.sequence_diversity( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) + statVals[statName].append( + allel.stats.diversity.sequence_diversity( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) elif statName == "thetaW": - statVals[statName].append(allel.stats.diversity.watterson_theta( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) + statVals[statName].append( + allel.stats.diversity.watterson_theta( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) elif statName == "thetaH": - statVals[statName].append(thetah( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) + statVals[statName].append( + thetah( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) elif statName == "fayWuH": statVals[statName].append( - statVals["thetaH"][subWinIndex]-statVals["pi"][subWinIndex]) + statVals["thetaH"][subWinIndex] - statVals["pi"][subWinIndex] + ) elif statName == "HapCount": # AK: undefined variables statVals[statName].append(len(hapsInSubWin.distinct())) elif statName == "nDiplos": - diplotypeCounts = shicstats.getHaplotypeFreqSpec(genosNAlt) + diplotypeCounts = dps.getHaplotypeFreqSpec(genosNAlt) nDiplos = diplotypeCounts[genosNAlt.shape[1]] statVals["nDiplos"].append(nDiplos) diplotypeCounts = diplotypeCounts[:-1] @@ -774,7 +1035,7 @@ def calcAndAppendStatValForScanDiplo(alleleCounts, snpLocs, statName, subWinStar if "diplo_H12" in statVals: statVals["diplo_H12"].append(dh12) if "diplo_H2/H1" in statVals: - statVals["diplo_H2/H1"].append(dh2/dh1) + statVals["diplo_H2/H1"].append(dh2 / dh1) elif statName == "diplo_ZnS": if genosNAlt.shape[0] == 1: statVals["diplo_ZnS"].append(0.0) @@ -783,16 +1044,25 @@ def calcAndAppendStatValForScanDiplo(alleleCounts, snpLocs, statName, subWinStar r2Matrix = allel.stats.ld.rogers_huff_r(genosNAlt) statVals["diplo_ZnS"].append(np.nanmean(r2Matrix)) r2Matrix2 = squareform(r2Matrix ** 2) - statVals["diplo_Omega"].append(shicstats.omega(r2Matrix2)[0]) + statVals["diplo_Omega"].append(dps.omega(r2Matrix2)[0]) elif statName == "distVar": - dists = shicstats.pairwiseDiffsDiplo( - genosNAlt)/float(unmasked[subWinStart-1:subWinEnd].count(True)) + dists = dps.pairwiseDiffsDiplo(genosNAlt) / float( + unmasked[subWinStart - 1 : subWinEnd].count(True) + ) statVals["distVar"].append(np.var(dists, ddof=1)) statVals["distSkew"].append(scipy.stats.skew(dists)) statVals["distKurt"].append(scipy.stats.kurtosis(dists)) - elif statName in ["diplo_H12", "diplo_H123", "diplo_H2/H1", "distVar", "distSkew", "distKurt", "diplo_Omega"]: - if not len(statVals[statName]) == subWinIndex+1: - print(statName, subWinIndex+1) + elif statName in [ + "diplo_H12", + "diplo_H123", + "diplo_H2/H1", + "distVar", + "distSkew", + "distKurt", + "diplo_Omega", + ]: + if not len(statVals[statName]) == subWinIndex + 1: + print(statName, subWinIndex + 1) print(statVals["diplo_H1"], statVals["diplo_H12"]) sys.exit() @@ -811,10 +1081,12 @@ def getOutlierFrac(vals, cutoff=2.0): if denom == 0: return 0.0 else: - return num/float(denom) + return num / float(denom) -def appendStatValsForMonomorphic(statName, statVals, instanceIndex, subWinIndex): +def appendStatValsForMonomorphic( + statName, statVals, instanceIndex, subWinIndex +): if statName == "tajD": statVals[statName][instanceIndex].append(0.0) elif statName == "pi": @@ -864,33 +1136,85 @@ def appendStatValsForMonomorphic(statName, statVals, instanceIndex, subWinIndex) statVals["iHSMax"][instanceIndex].append(0.0) elif statName == "nSLMax": statVals["nSLMax"][instanceIndex].append(0.0) - elif statName in ["H12", "H123", "H2/H1", "diplo_H12", "diplo_H123", "diplo_H2/H1", "Omega", "diplo_Omega"]: - #print(statName, statVals[statName][instanceIndex], subWinIndex+1) - assert len(statVals[statName][instanceIndex]) == subWinIndex+1 + elif statName in [ + "H12", + "H123", + "H2/H1", + "diplo_H12", + "diplo_H123", + "diplo_H2/H1", + "Omega", + "diplo_Omega", + ]: + # print(statName, statVals[statName][instanceIndex], subWinIndex+1) + assert len(statVals[statName][instanceIndex]) == subWinIndex + 1 else: statVals[statName][instanceIndex].append(0.0) -def calcAndAppendStatValForScan(alleleCounts, snpLocs, statName, subWinStart, subWinEnd, statVals, subWinIndex, hapsInSubWin, unmasked, precomputedStats): +def calcAndAppendStatValForScan( + alleleCounts, + snpLocs, + statName, + subWinStart, + subWinEnd, + statVals, + subWinIndex, + hapsInSubWin, + unmasked, + precomputedStats, +): if statName == "tajD": - statVals[statName].append(allel.stats.diversity.tajima_d( - alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd)) + statVals[statName].append( + allel.stats.diversity.tajima_d( + alleleCounts, pos=snpLocs, start=subWinStart, stop=subWinEnd + ) + ) elif statName == "pi": - statVals[statName].append(allel.stats.diversity.sequence_diversity( # NOQA - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) # NOQA + statVals[statName].append( + allel.stats.diversity.sequence_diversity( # NOQA + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) # NOQA elif statName == "thetaW": - statVals[statName].append(allel.stats.diversity.watterson_theta( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) # NOQA + statVals[statName].append( + allel.stats.diversity.watterson_theta( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) # NOQA elif statName == "thetaH": - statVals[statName].append(thetah( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) # NOQA + statVals[statName].append( + thetah( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) # NOQA elif statName == "fayWuH": statVals[statName].append( - statVals["thetaH"][subWinIndex]-statVals["pi"][subWinIndex]) + statVals["thetaH"][subWinIndex] - statVals["pi"][subWinIndex] + ) elif statName == "maxFDA": # AK: undefined variables - statVals[statName].append(maxFDA( - snpLocs, alleleCounts, start=subWinStart, stop=subWinEnd, is_accessible=unmasked)) + statVals[statName].append( + maxFDA( + snpLocs, + alleleCounts, + start=subWinStart, + stop=subWinEnd, + is_accessible=unmasked, + ) + ) elif statName == "HapCount": statVals[statName].append(len(hapsInSubWin.distinct())) elif statName == "H1": @@ -903,61 +1227,84 @@ def calcAndAppendStatValForScan(alleleCounts, snpLocs, statName, subWinStart, su if "H2/H1" in statVals: statVals["H2/H1"].append(h21) elif statName == "ZnS": - r2Matrix = shicstats.computeR2Matrix(hapsInSubWin) - statVals["ZnS"].append(shicstats.ZnS(r2Matrix)[0]) - statVals["Omega"].append(shicstats.omega(r2Matrix)[0]) + r2Matrix = dps.computeR2Matrix(hapsInSubWin) + statVals["ZnS"].append(dps.ZnS(r2Matrix)[0]) + statVals["Omega"].append(dps.omega(r2Matrix)[0]) elif statName == "RH": rMatrixFlat = allel.stats.ld.rogers_huff_r( - hapsInSubWin.to_genotypes(ploidy=2).to_n_alt()) + hapsInSubWin.to_genotypes(ploidy=2).to_n_alt() + ) rhAvg = rMatrixFlat.mean() statVals["RH"].append(rhAvg) r2Matrix = squareform(rMatrixFlat ** 2) - statVals["Omega"].append(shicstats.omega(r2Matrix)[0]) + statVals["Omega"].append(dps.omega(r2Matrix)[0]) elif statName == "iHSMean": - vals = [x for x in precomputedStats["iHS"][subWinIndex] - if not (math.isnan(x) or math.isinf(x))] + vals = [ + x + for x in precomputedStats["iHS"][subWinIndex] + if not (math.isnan(x) or math.isinf(x)) + ] if len(vals) == 0: statVals["iHSMean"].append(0.0) else: - statVals["iHSMean"].append(sum(vals)/float(len(vals))) + statVals["iHSMean"].append(sum(vals) / float(len(vals))) elif statName == "nSLMean": - vals = [x for x in precomputedStats["nSL"][subWinIndex] - if not (math.isnan(x) or math.isnan(x))] + vals = [ + x + for x in precomputedStats["nSL"][subWinIndex] + if not (math.isnan(x) or math.isnan(x)) + ] if len(vals) == 0: statVals["nSLMean"].append(0.0) else: - statVals["nSLMean"].append(sum(vals)/float(len(vals))) + statVals["nSLMean"].append(sum(vals) / float(len(vals))) elif statName == "iHSMax": - vals = [x for x in precomputedStats["iHS"][subWinIndex] - if not (math.isnan(x) or math.isinf(x))] + vals = [ + x + for x in precomputedStats["iHS"][subWinIndex] + if not (math.isnan(x) or math.isinf(x)) + ] if len(vals) == 0: maxVal = 0.0 else: maxVal = max(vals) statVals["iHSMax"].append(maxVal) elif statName == "nSLMax": - vals = [x for x in precomputedStats["nSL"][subWinIndex] - if not (math.isnan(x) or math.isnan(x))] + vals = [ + x + for x in precomputedStats["nSL"][subWinIndex] + if not (math.isnan(x) or math.isnan(x)) + ] if len(vals) == 0: maxVal = 0.0 else: maxVal = max(vals) statVals["nSLMax"].append(maxVal) elif statName == "iHSOutFrac": - statVals["iHSOutFrac"].append(getOutlierFrac( - precomputedStats["iHS"][subWinIndex])) + statVals["iHSOutFrac"].append( + getOutlierFrac(precomputedStats["iHS"][subWinIndex]) + ) elif statName == "nSLOutFrac": - statVals["nSLOutFrac"].append(getOutlierFrac( - precomputedStats["nSL"][subWinIndex])) + statVals["nSLOutFrac"].append( + getOutlierFrac(precomputedStats["nSL"][subWinIndex]) + ) elif statName == "distVar": - dists = shicstats.pairwiseDiffs( - hapsInSubWin)/float(unmasked[subWinStart-1:subWinEnd].count(True)) + dists = dps.pairwiseDiffs(hapsInSubWin) / float( + unmasked[subWinStart - 1 : subWinEnd].count(True) + ) statVals["distVar"].append(np.var(dists, ddof=1)) statVals["distSkew"].append(scipy.stats.skew(dists)) statVals["distKurt"].append(scipy.stats.kurtosis(dists)) - elif statName in ["H12", "H123", "H2/H1", - "Omega", "distVar", "distSkew", "distKurt"]: - assert len(statVals[statName]) == subWinIndex+1 + elif statName in [ + "H12", + "H123", + "H2/H1", + "Omega", + "distVar", + "distSkew", + "distKurt", + ]: + assert len(statVals[statName]) == subWinIndex + 1 def appendStatValsForMonomorphicForScan(statName, statVals, subWinIndex): @@ -1010,19 +1357,27 @@ def appendStatValsForMonomorphicForScan(statName, statVals, subWinIndex): statVals["iHSMax"].append(0.0) elif statName == "nSLMax": statVals["nSLMax"].append(0.0) - elif statName in ["H12", "H123", "H2/H1", "diplo_H12", - "diplo_H123", "diplo_H2/H1", "Omega", "diplo_Omega"]: + elif statName in [ + "H12", + "H123", + "H2/H1", + "diplo_H12", + "diplo_H123", + "diplo_H2/H1", + "Omega", + "diplo_Omega", + ]: # print(statName, statVals[statName][instanceIndex], subWinIndex+1) - assert len(statVals[statName]) == subWinIndex+1 + assert len(statVals[statName]) == subWinIndex + 1 else: statVals[statName].append(0.0) -''' +""" WARNING: this code assumes that the second column of ac gives the derived alleles; please ensure that this is the case (and that you are using polarized data) if are going to use values of this statistic for the classifier!! -''' # NOQA +""" # NOQA def thetah(pos, ac, start=None, stop=None, is_accessible=None): @@ -1046,16 +1401,16 @@ def thetah(pos, ac, start=None, stop=None, is_accessible=None): h = 0 for i in range(len(ac)): p1 = ac[i, 1] - n = p1+ac[i, 0] + n = p1 + ac[i, 0] if n > 1: - h += (p1*p1)/(n*(n-1.0)) + h += (p1 * p1) / (n * (n - 1.0)) h *= 2 # calculate value per base if is_accessible is None: n_bases = stop - start + 1 else: - n_bases = np.count_nonzero(is_accessible[start-1:stop]) + n_bases = np.count_nonzero(is_accessible[start - 1 : stop]) h = h / n_bases return h @@ -1065,8 +1420,8 @@ def garudH1(hapCounts): h1 = 0.0 for hapFreq in range(len(hapCounts), 0, -1): - pi = hapFreq/float(len(hapCounts)) - h1 += hapCounts[hapFreq-1]*pi*pi + pi = hapFreq / float(len(hapCounts)) + h1 += hapCounts[hapFreq - 1] * pi * pi return h1 @@ -1076,13 +1431,13 @@ def garudH2(hapCounts): first = True for hapFreq in range(len(hapCounts), 0, -1): - pi = hapFreq/float(len(hapCounts)) - if hapCounts[hapFreq-1] > 0: + pi = hapFreq / float(len(hapCounts)) + if hapCounts[hapFreq - 1] > 0: if first: first = False - h2 += (hapCounts[hapFreq-1]-1)*pi*pi + h2 += (hapCounts[hapFreq - 1] - 1) * pi * pi else: - h2 += hapCounts[hapFreq-1]*pi*pi + h2 += hapCounts[hapFreq - 1] * pi * pi return h2 @@ -1092,14 +1447,14 @@ def garudH12(hapCounts): totalAdded = 0 for hapFreq in range(len(hapCounts), 0, -1): - pi = hapFreq/float(len(hapCounts)) - for i in range(hapCounts[hapFreq-1]): + pi = hapFreq / float(len(hapCounts)) + for i in range(hapCounts[hapFreq - 1]): if totalAdded < 2: part1 += pi else: - part2 += pi*pi + part2 += pi * pi totalAdded += 1 - part1 = part1*part1 + part1 = part1 * part1 - return part1+part2 + return part1 + part2 diff --git a/generateSimLaunchScript.py b/diploshic/generateSimLaunchScript.py similarity index 53% rename from generateSimLaunchScript.py rename to diploshic/generateSimLaunchScript.py index 2499973..f6ee035 100644 --- a/generateSimLaunchScript.py +++ b/diploshic/generateSimLaunchScript.py @@ -1,4 +1,5 @@ import sys, os + """ python generateSimLaunchScript.py This script emits a bash launch script for running all of the training/testing simulations required by discoal @@ -22,15 +23,19 @@ # also, after performing simulation and calculating summary statistics, stats should be spot checked to make sure # that when a sweep occurs in the central subwindow, the effect has mostly dissipated in the outer windows # alpha/rho may have to be adjusted in order to achieve this -trainingSampleNumber = 2000 #the number of simulation replicates we want to generate for each file in our training set -testSampleNumber = 1000 #the number of simulations to create for each file in the test set -sampleSize = 100 #the number of individuals in our population sample -numSites = 55000 #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11) -u = 3.5e-9 #per-site mutation rate (used to calculate mean theta) -gensPerYear = 11.0 #number of generations per year -maxSoftSweepInitFreq = 0.1 #maximum initial selected frequency for soft sweeps -tauHigh = 0.05 #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past -rhoOverTheta = 5.0 #crossover rate over mut rate (used to calculate mean rho) +trainingSampleNumber = 2000 # the number of simulation replicates we want to generate for each file in our training set +testSampleNumber = ( + 1000 # the number of simulations to create for each file in the test set +) +sampleSize = 100 # the number of individuals in our population sample +numSites = 55000 # total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11) +u = 3.5e-9 # per-site mutation rate (used to calculate mean theta) +gensPerYear = 11.0 # number of generations per year +maxSoftSweepInitFreq = ( + 0.1 # maximum initial selected frequency for soft sweeps +) +tauHigh = 0.05 # maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past +rhoOverTheta = 5.0 # crossover rate over mut rate (used to calculate mean rho) sizeChanges = [] with open(popSizeFileName) as popSizeFile: @@ -43,42 +48,56 @@ ne0 = ne prevSizeRatio = 1.0 else: - t, sizeRatio = year*gensPerYear/(4*ne0), ne/ne0 + t, sizeRatio = year * gensPerYear / (4 * ne0), ne / ne0 if abs(sizeRatio - prevSizeRatio) > 1e-9: sizeChanges.append((t, sizeRatio, ne)) prevSizeRatio = sizeRatio N0 = ne0 -thetaMean=4*N0*u*numSites +thetaMean = 4 * N0 * u * numSites rhoMean = thetaMean * rhoOverTheta -thetaLow = (2*thetaMean)/11.0 -thetaHigh = 10*thetaLow +thetaLow = (2 * thetaMean) / 11.0 +thetaHigh = 10 * thetaLow rhoMax = 3 * rhoMean -alphaHigh = 2*N0*0.005 # max selection coefficient s is 0.005 -alphaLow = 2*N0*0.0001 # mininum selection coefficient s at 0.0001 +alphaHigh = 2 * N0 * 0.005 # max selection coefficient s is 0.005 +alphaLow = 2 * N0 * 0.0001 # mininum selection coefficient s at 0.0001 -selStr = " -ws 0 -Pa %f %f -Pu 0 %f" %(alphaLow, alphaHigh, tauHigh) -partialSelStr = " -ws 0 -Pa %f %f" %(alphaLow, alphaHigh) -softStr = " -Pf 0 %f" %(maxSoftSweepInitFreq) +selStr = " -ws 0 -Pa %f %f -Pu 0 %f" % (alphaLow, alphaHigh, tauHigh) +partialSelStr = " -ws 0 -Pa %f %f" % (alphaLow, alphaHigh) +softStr = " -Pf 0 %f" % (maxSoftSweepInitFreq) demogStr = "" for t, sizeRatio, ne in sizeChanges: - demogStr += " -en %f 0 %f" %(t, sizeRatio) + demogStr += " -en %f 0 %f" % (t, sizeRatio) sweepLocStr = " -x $x" print("#!/bin/bash") -for sampleNumber, outDir, simTitle in [(trainingSampleNumber, trainingOutDir, "training data"), (testSampleNumber, testOutDir, "test data")]: +for sampleNumber, outDir, simTitle in [ + (trainingSampleNumber, trainingOutDir, "training data"), + (testSampleNumber, testOutDir, "test data"), +]: partialStr = "-Pc 0.2 0.99" - print("\n#generating %s\n" %(simTitle)) - neutDiscoalCmd = "discoal %d %d %d -Pt %f %f -Pre %f %f%s" %(sampleSize, sampleNumber, numSites, thetaLow, thetaHigh, rhoMean, rhoMax, demogStr) - print("%s > %s/Neut.msOut" %(neutDiscoalCmd, outDir)) + print("\n#generating %s\n" % (simTitle)) + neutDiscoalCmd = "discoal %d %d %d -Pt %f %f -Pre %f %f%s" % ( + sampleSize, + sampleNumber, + numSites, + thetaLow, + thetaHigh, + rhoMean, + rhoMax, + demogStr, + ) + print("%s > %s/Neut.msOut" % (neutDiscoalCmd, outDir)) print("i=0") - print("for x in 0.045454545454545456 0.13636363636363635 0.22727272727272727 0.3181818181818182 0.4090909090909091 0.5 0.5909090909090909 0.6818181818181818 0.7727272727272727 0.8636363636363636 0.9545454545454546;\ndo") + print( + "for x in 0.045454545454545456 0.13636363636363635 0.22727272727272727 0.3181818181818182 0.4090909090909091 0.5 0.5909090909090909 0.6818181818181818 0.7727272727272727 0.8636363636363636 0.9545454545454546;\ndo" + ) hardDiscoalCmd = neutDiscoalCmd + selStr + sweepLocStr - print(" %s > %s/Hard_$i.msOut" %(hardDiscoalCmd, outDir)) + print(" %s > %s/Hard_$i.msOut" % (hardDiscoalCmd, outDir)) softDiscoalCmd = neutDiscoalCmd + selStr + softStr + sweepLocStr - print(" %s > %s/Soft_$i.msOut" %(softDiscoalCmd, outDir)) + print(" %s > %s/Soft_$i.msOut" % (softDiscoalCmd, outDir)) print(" i=$((i + 1))\ndone") print("") diff --git a/diploshic/makeFeatureVecsForChrArmFromVcfDiploid.py b/diploshic/makeFeatureVecsForChrArmFromVcfDiploid.py new file mode 100644 index 0000000..71d522a --- /dev/null +++ b/diploshic/makeFeatureVecsForChrArmFromVcfDiploid.py @@ -0,0 +1,292 @@ +import os +import allel +import h5py +import numpy as np +import sys +import time +from diploshic.fvTools import * + +if not len(sys.argv) in [13, 15]: + sys.exit( + "usage:\npython makeFeatureVecsForChrArmFromVcfDiploid.py vcfFileName chrArm chrLen targetPop winSize numSubWins maskFileName unmaskedFracCutoff unmaskedGenoFracCutoff sampleToPopFileName statFileName outFileName [segmentStart segmentEnd]\n" + ) +if len(sys.argv) == 15: + ( + vcfFileName, + chrArm, + chrLen, + targetPop, + winSize, + numSubWins, + maskFileName, + unmaskedFracCutoff, + unmaskedGenoFracCutoff, + sampleToPopFileName, + statFileName, + outfn, + segmentStart, + segmentEnd, + ) = sys.argv[1:] + segmentStart, segmentEnd = int(segmentStart), int(segmentEnd) +else: + ( + vcfFileName, + chrArm, + chrLen, + targetPop, + winSize, + numSubWins, + maskFileName, + unmaskedFracCutoff, + unmaskedGenoFracCutoff, + sampleToPopFileName, + statFileName, + outfn, + ) = sys.argv[1:] + segmentStart = None + +unmaskedFracCutoff = float(unmaskedFracCutoff) +if unmaskedFracCutoff < 0.0 or unmaskedFracCutoff > 1.0: + sys.exit( + "unmaskedFracCutoff=%s but must be within [0, 1]. AAAAARRRRGHHHHHH!!!\n" + % (unmaskedFracCutoff) + ) +unmaskedGenoFracCutoff = float(unmaskedGenoFracCutoff) +if unmaskedGenoFracCutoff < 0.0 or unmaskedGenoFracCutoff > 1.0: + sys.exit( + "unmaskedGenoFracCutoff=%s but must be within [0, 1]. AAAAARRRRGHHHHHH!!!\n" + % (unmaskedGenoFracCutoff) + ) +chrLen, winSize, numSubWins = int(chrLen), int(winSize), int(numSubWins) +assert winSize % numSubWins == 0 and numSubWins > 1 +subWinSize = int(winSize / numSubWins) + + +def getSubWinBounds(chrLen, subWinSize): + lastSubWinEnd = chrLen - chrLen % subWinSize + lastSubWinStart = lastSubWinEnd - subWinSize + 1 + subWinBounds = [] + for subWinStart in range(1, lastSubWinStart + 1, subWinSize): + subWinEnd = subWinStart + subWinSize - 1 + subWinBounds.append((subWinStart, subWinEnd)) + return subWinBounds + + +def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs): + subWinStart = 1 + subWinEnd = subWinStart + subWinSize - 1 + snpIndicesInSubWins = [[]] + for i in range(len(snpLocs)): + while snpLocs[i] <= lastSubWinEnd and not ( + snpLocs[i] >= subWinStart and snpLocs[i] <= subWinEnd + ): + subWinStart += subWinSize + subWinEnd += subWinSize + snpIndicesInSubWins.append([]) + if snpLocs[i] <= lastSubWinEnd: + snpIndicesInSubWins[-1].append(i) + while subWinEnd < lastSubWinEnd: + snpIndicesInSubWins.append([]) + subWinStart += subWinSize + subWinEnd += subWinSize + return snpIndicesInSubWins + + +def readSampleToPopFile(sampleToPopFileName): + table = {} + with open(sampleToPopFileName) as sampleToPopFile: + for line in sampleToPopFile: + sample, pop = line.strip().split() + table[sample] = pop + return table + + +vcfFile = allel.read_vcf(vcfFileName) +chroms = vcfFile["variants/CHROM"] +positions = np.extract(chroms == chrArm, vcfFile["variants/POS"]) + +if maskFileName.lower() in ["none", "false"]: + sys.stderr.write( + "Warning: a mask.fa file for the chr arm with all masked sites N'ed out is strongly recommended" + + " (pass in the reference to remove Ns at the very least)!\n" + ) + unmasked = [True] * chrLen +else: + unmasked = readMaskDataForScan(maskFileName, chrArm) + assert len(unmasked) == chrLen + +if statFileName.lower() in ["none", "false"]: + statFileName = None + +samples = vcfFile["samples"] +if not sampleToPopFileName.lower() in ["none", "false"]: + sampleToPop = readSampleToPopFile(sampleToPopFileName) + sampleIndicesToKeep = [ + i + for i in range(len(samples)) + if sampleToPop.get(samples[i], "popNotFound!") == targetPop + ] +else: + sampleIndicesToKeep = [i for i in range(len(samples))] +rawgenos = np.take( + vcfFile["calldata/GT"], + [i for i in range(len(chroms)) if chroms[i] == chrArm], + axis=0, +) +genos = allel.GenotypeArray(rawgenos).subset(sel1=sampleIndicesToKeep) + +if segmentStart != None: + snpIndicesToKeep = [ + i + for i in range(len(positions)) + if segmentStart <= positions[i] <= segmentEnd + ] + positions = [positions[i] for i in snpIndicesToKeep] + genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep)) + +if isHaploidVcfGenoArray(genos): + sys.stderr.write( + "Detected haploid input. Converting into diploid individuals (combining haplotypes in order).\n" + ) + genos = diploidizeGenotypeArray(genos) + +alleleCounts = genos.count_alleles() + +# remove all but mono/biallelic unmasked sites +isBiallelic = alleleCounts.is_biallelic() +for i in range(len(isBiallelic)): + if not ( + isBiallelic[i] + and calledGenoFracAtSite(genos[i]) >= unmaskedGenoFracCutoff + ): + unmasked[positions[i] - 1] = False +snpIndicesToKeep = [ + i for i in range(len(positions)) if unmasked[positions[i] - 1] +] +genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep)) +positions = [positions[i] for i in snpIndicesToKeep] +alleleCounts = allel.AlleleCountsArray( + [[alleleCounts[i][0], max(alleleCounts[i][1:])] for i in snpIndicesToKeep] +) + +statNames = [ + "pi", + "thetaW", + "tajD", + "distVar", + "distSkew", + "distKurt", + "nDiplos", + "diplo_H1", + "diplo_H12", + "diplo_H2/H1", + "diplo_ZnS", + "diplo_Omega", +] + +subWinBounds = getSubWinBounds(chrLen, subWinSize) + +header = "chrom classifiedWinStart classifiedWinEnd bigWinRange".split() +statHeader = "chrom start end".split() +for statName in statNames: + statHeader.append(statName) + for i in range(numSubWins): + header.append("%s_win%d" % (statName, i)) +statHeader = "\t".join(statHeader) +header = "\t".join(header) +outFile = open(outfn, "w") +outFile.write(header + "\n") +statVals = {} +for statName in statNames: + statVals[statName] = [] + +startTime = time.perf_counter() +goodSubWins = [] +lastSubWinEnd = chrLen - chrLen % subWinSize +snpIndicesInSubWins = getSnpIndicesInSubWins( + subWinSize, lastSubWinEnd, positions +) +subWinIndex = 0 +lastSubWinStart = lastSubWinEnd - subWinSize + 1 +if statFileName: + statFile = open(statFileName, "w") + statFile.write(statHeader + "\n") +for subWinStart in range(1, lastSubWinStart + 1, subWinSize): + subWinEnd = subWinStart + subWinSize - 1 + unmaskedFrac = unmasked[subWinStart - 1 : subWinEnd].count(True) / float( + subWinEnd - subWinStart + 1 + ) + if ( + segmentStart == None + or subWinStart >= segmentStart + and subWinEnd <= segmentEnd + ): + sys.stderr.write( + "%d-%d num unmasked snps: %d; unmasked frac: %f\n" + % ( + subWinStart, + subWinEnd, + len(snpIndicesInSubWins[subWinIndex]), + unmaskedFrac, + ) + ) + if ( + len(snpIndicesInSubWins[subWinIndex]) > 0 + and unmaskedFrac >= unmaskedFracCutoff + ): + genosInSubWin = allel.GenotypeArray( + genos.subset(sel0=snpIndicesInSubWins[subWinIndex]) + ) + statValStr = [] + for statName in statNames: + calcAndAppendStatValForScanDiplo( + alleleCounts, + positions, + statName, + subWinStart, + subWinEnd, + statVals, + subWinIndex, + genosInSubWin, + unmasked, + ) + goodSubWins.append(True) + if statFileName: + statFile.write( + "\t".join( + [chrArm, str(subWinStart), str(subWinEnd)] + + [str(statVals[statName][-1]) for statName in statNames] + ) + + "\n" + ) + else: + for statName in statNames: + appendStatValsForMonomorphicForScan( + statName, statVals, subWinIndex + ) + goodSubWins.append(False) + if goodSubWins[-numSubWins:].count(True) == numSubWins: + outVec = [] + for statName in statNames: + outVec += normalizeFeatureVec(statVals[statName][-numSubWins:]) + midSubWinEnd = subWinEnd - subWinSize * (numSubWins / 2) + midSubWinStart = midSubWinEnd - subWinSize + 1 + outFile.write( + "%s\t%d\t%d\t%d-%d\t" + % ( + chrArm, + midSubWinStart, + midSubWinEnd, + subWinEnd - winSize + 1, + subWinEnd, + ) + + "\t".join([str(x) for x in outVec]) + ) + outFile.write("\n") + subWinIndex += 1 +if statFileName: + statFile.close() +outFile.close() +sys.stderr.write( + "completed in %g seconds\n" % (time.perf_counter() - startTime) +) diff --git a/diploshic/makeFeatureVecsForChrArmFromVcf_ogSHIC.py b/diploshic/makeFeatureVecsForChrArmFromVcf_ogSHIC.py new file mode 100644 index 0000000..7bd7639 --- /dev/null +++ b/diploshic/makeFeatureVecsForChrArmFromVcf_ogSHIC.py @@ -0,0 +1,306 @@ +import os +import allel +import h5py +import numpy as np +import sys +import time +from diploshic.fvTools import * + +if not len(sys.argv) in [13, 15]: + sys.exit( + "usage:\npython makeFeatureVecsForChrArmFromVcf_ogSHIC.py chrArmFileName chrArm chrLen targetPop winSize numSubWins maskFileName sampleToPopFileName ancestralArmFaFileName statFileName outFileName [segmentStart segmentEnd]\n" + ) +if len(sys.argv) == 15: + ( + chrArmFileName, + chrArm, + chrLen, + targetPop, + winSize, + numSubWins, + maskFileName, + unmaskedFracCutoff, + sampleToPopFileName, + ancestralArmFaFileName, + statFileName, + outfn, + segmentStart, + segmentEnd, + ) = sys.argv[1:] + segmentStart, segmentEnd = int(segmentStart), int(segmentEnd) +else: + ( + chrArmFileName, + chrArm, + chrLen, + targetPop, + winSize, + numSubWins, + maskFileName, + unmaskedFracCutoff, + sampleToPopFileName, + ancestralArmFaFileName, + statFileName, + outfn, + ) = sys.argv[1:] + segmentStart = None + +unmaskedFracCutoff = float(unmaskedFracCutoff) +chrLen, winSize, numSubWins = int(chrLen), int(winSize), int(numSubWins) +assert winSize % numSubWins == 0 and numSubWins > 1 +subWinSize = int(winSize / numSubWins) + + +def getSubWinBounds(chrLen, subWinSize): + lastSubWinEnd = chrLen - chrLen % subWinSize + lastSubWinStart = lastSubWinEnd - subWinSize + 1 + subWinBounds = [] + for subWinStart in range(1, lastSubWinStart + 1, subWinSize): + subWinEnd = subWinStart + subWinSize - 1 + subWinBounds.append((subWinStart, subWinEnd)) + return subWinBounds + + +def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs): + subWinStart = 1 + subWinEnd = subWinStart + subWinSize - 1 + snpIndicesInSubWins = [[]] + for i in range(len(snpLocs)): + while snpLocs[i] <= lastSubWinEnd and not ( + snpLocs[i] >= subWinStart and snpLocs[i] <= subWinEnd + ): + subWinStart += subWinSize + subWinEnd += subWinSize + snpIndicesInSubWins.append([]) + if snpLocs[i] <= lastSubWinEnd: + snpIndicesInSubWins[-1].append(i) + while subWinEnd < lastSubWinEnd: + snpIndicesInSubWins.append([]) + subWinStart += subWinSize + subWinEnd += subWinSize + return snpIndicesInSubWins + + +chrArmFile = allel.read_vcf(chrArmFileName) +chroms = chrArmFile["variants/CHROM"] +positions = np.extract(chroms == chrArm, chrArmFile["variants/POS"]) + +if maskFileName.lower() in ["none", "false"]: + sys.stderr.write( + "Warning: a mask.fa file for the chr arm with all masked sites N'ed out is strongly recommended" + + " (pass in the reference to remove Ns at the very least)!\n" + ) + unmasked = [True] * chrLen +else: + unmasked = readMaskDataForScan(maskFileName, chrArm) + assert len(unmasked) == chrLen + +if statFileName.lower() in ["none", "false"]: + statFileName = None + +samples = chrArmFile["samples"] +if not sampleToPopFileName.lower() in ["none", "false"]: + sampleToPop = readSampleToPopFile(sampleToPopFileName) + sampleIndicesToKeep = [ + i + for i in range(len(samples)) + if sampleToPop.get(samples[i], "popNotFound!") == targetPop + ] +else: + sampleIndicesToKeep = [i for i in range(len(samples))] + +rawgenos = np.take( + chrArmFile["calldata/GT"], + [i for i in range(len(chroms)) if chroms[i] == chrArm], + axis=0, +) +genos = allel.GenotypeArray(rawgenos) +refAlleles = np.extract(chroms == chrArm, chrArmFile["variants/REF"]) +altAlleles = np.extract(chroms == chrArm, chrArmFile["variants/ALT"]) +if segmentStart != None: + snpIndicesToKeep = [ + i + for i in range(len(positions)) + if segmentStart <= positions[i] <= segmentEnd + ] + positions = [positions[i] for i in snpIndicesToKeep] + refAlleles = [refAlleles[i] for i in snpIndicesToKeep] + altAlleles = [altAlleles[i] for i in snpIndicesToKeep] + genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep)) +genos = allel.GenotypeArray(genos.subset(sel1=sampleIndicesToKeep)) +alleleCounts = genos.count_alleles() + +# remove all but mono/biallelic unmasked sites +isBiallelic = alleleCounts.is_biallelic() +for i in range(len(isBiallelic)): + if not isBiallelic[i]: + unmasked[positions[i] - 1] = False + +# polarize +if not ancestralArmFaFileName.lower() in ["none", "false"]: + sys.stderr.write("polarizing snps\n") + ancArm = readFaArm(ancestralArmFaFileName, chrArm).upper() + startTime = time.perf_counter() + # NOTE: mapping specifies which alleles to swap counts for based on polarization; leaves unpolarized snps alone + # NOTE: those snps need to be filtered later on (as done below)! + # this will also remove sites that could not be polarized + mapping, unmasked = polarizeSnps( + unmasked, positions, refAlleles, altAlleles, ancArm + ) + sys.stderr.write("took %s seconds\n" % (time.perf_counter() - startTime)) + statNames = [ + "pi", + "thetaW", + "tajD", + "thetaH", + "fayWuH", + "maxFDA", + "HapCount", + "H1", + "H12", + "H2/H1", + "ZnS", + "Omega", + "distVar", + "distSkew", + "distKurt", + ] +else: + statNames = [ + "pi", + "thetaW", + "tajD", + "HapCount", + "H1", + "H12", + "H2/H1", + "ZnS", + "Omega", + "distVar", + "distSkew", + "distKurt", + ] + +snpIndicesToKeep = [ + i for i in range(len(positions)) if unmasked[positions[i] - 1] +] +genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep)) +positions = [positions[i] for i in snpIndicesToKeep] +alleleCounts = allel.AlleleCountsArray( + [[alleleCounts[i][0], max(alleleCounts[i][1:])] for i in snpIndicesToKeep] +) +if not ancestralArmFaFileName.lower() in ["none", "false"]: + mapping = [mapping[i] for i in snpIndicesToKeep] + alleleCounts = alleleCounts.map_alleles(mapping) +haps = genos.to_haplotypes() + +subWinBounds = getSubWinBounds(chrLen, subWinSize) +precomputedStats = {} # not using this + +header = "chrom classifiedWinStart classifiedWinEnd bigWinRange".split() +statHeader = "chrom start end".split() +for statName in statNames: + statHeader.append(statName) + for i in range(numSubWins): + header.append("%s_win%d" % (statName, i)) +statHeader = "\t".join(statHeader) +header = "\t".join(header) +outFile = open(outfn, "w") +outFile.write(header + "\n") +statVals = {} +for statName in statNames: + statVals[statName] = [] + +startTime = time.perf_counter() +goodSubWins = [] +lastSubWinEnd = chrLen - chrLen % subWinSize +snpIndicesInSubWins = getSnpIndicesInSubWins( + subWinSize, lastSubWinEnd, positions +) +subWinIndex = 0 +lastSubWinStart = lastSubWinEnd - subWinSize + 1 +if statFileName: + statFile = open(statFileName, "w") + statFile.write(statHeader + "\n") +for subWinStart in range(1, lastSubWinStart + 1, subWinSize): + subWinEnd = subWinStart + subWinSize - 1 + unmaskedFrac = unmasked[subWinStart - 1 : subWinEnd].count(True) / float( + subWinEnd - subWinStart + 1 + ) + if ( + segmentStart == None + or subWinStart >= segmentStart + and subWinEnd <= segmentEnd + ): + sys.stderr.write( + "%d-%d num unmasked snps: %d; unmasked frac: %f\n" + % ( + subWinStart, + subWinEnd, + len(snpIndicesInSubWins[subWinIndex]), + unmaskedFrac, + ) + ) + if ( + len(snpIndicesInSubWins[subWinIndex]) > 0 + and unmaskedFrac >= unmaskedFracCutoff + ): + hapsInSubWin = allel.HaplotypeArray( + haps.subset(sel0=snpIndicesInSubWins[subWinIndex]) + ) + statValStr = [] + for statName in statNames: + calcAndAppendStatValForScan( + alleleCounts, + positions, + statName, + subWinStart, + subWinEnd, + statVals, + subWinIndex, + hapsInSubWin, + unmasked, + precomputedStats, + ) + statValStr.append("%s: %s" % (statName, statVals[statName][-1])) + sys.stderr.write("\t".join(statValStr) + "\n") + goodSubWins.append(True) + if statFileName: + statFile.write( + "\t".join( + [chrArm, str(subWinStart), str(subWinEnd)] + + [str(statVals[statName][-1]) for statName in statNames] + ) + + "\n" + ) + else: + for statName in statNames: + appendStatValsForMonomorphicForScan( + statName, statVals, subWinIndex + ) + goodSubWins.append(False) + if goodSubWins[-numSubWins:].count(True) == numSubWins: + outVec = [] + for statName in statNames: + outVec += normalizeFeatureVec(statVals[statName][-numSubWins:]) + midSubWinEnd = subWinEnd - subWinSize * (numSubWins / 2) + midSubWinStart = midSubWinEnd - subWinSize + 1 + outFile.write( + "%s\t%d\t%d\t%d-%d\t" + % ( + chrArm, + midSubWinStart, + midSubWinEnd, + subWinEnd - winSize + 1, + subWinEnd, + ) + + "\t".join([str(x) for x in outVec]) + ) + outFile.write("\n") + subWinIndex += 1 +if statFileName: + statFile.close() +outFile.close() +sys.stderr.write( + "completed in %g seconds\n" % (time.perf_counter() - startTime) +) diff --git a/diploshic/makeFeatureVecsForSingleMsDiploid.py b/diploshic/makeFeatureVecsForSingleMsDiploid.py new file mode 100644 index 0000000..54b322b --- /dev/null +++ b/diploshic/makeFeatureVecsForSingleMsDiploid.py @@ -0,0 +1,324 @@ +import sys, os +import allel +import random +import numpy as np +from diploshic.msTools import * +from diploshic.fvTools import * +import time + +( + trainingDataFileName, + totalPhysLen, + numSubWins, + maskFileName, + vcfForMaskFileName, + popForMask, + sampleToPopFileName, + unmaskedGenoFracCutoff, + chrArmsForMasking, + unmaskedFracCutoff, + outStatsDir, + fvecFileName, +) = sys.argv[1:] +totalPhysLen = int(totalPhysLen) +numSubWins = int(numSubWins) +subWinLen = totalPhysLen // numSubWins +assert totalPhysLen % numSubWins == 0 and numSubWins > 1 + +sys.stderr.write("file name='%s'" % (trainingDataFileName)) + +( + trainingDataFileObj, + sampleSize, + numInstances, +) = openMsOutFileForSequentialReading(trainingDataFileName) + +if maskFileName.lower() in ["none", "false"]: + sys.stderr.write( + "maskFileName='%s': not masking any sites!\n" % (maskFileName) + ) + maskFileName = False + unmaskedFracCutoff = 1.0 +else: + chrArmsForMasking = chrArmsForMasking.split(",") + unmaskedFracCutoff = float(unmaskedFracCutoff) + if unmaskedFracCutoff > 1.0 or unmaskedFracCutoff < 0.0: + sys.exit( + "unmaskedFracCutoff must lie within [0, 1]. AAARRRRGGGGHHHHH!!!!\n" + ) + +if vcfForMaskFileName.lower() in ["none", "false"]: + sys.stderr.write( + "vcfForMaskFileName='%s': not masking any genotypes!\n" + % (vcfForMaskFileName) + ) + vcfForMaskFileName = False +else: + if not maskFileName: + sys.exit( + "Cannot mask genotypes without also supplying a file for masking entire sites (can use reference genome with Ns if desired). AAARRRGHHHHH!!!!!!\n" + ) + if sampleToPopFileName.lower() in [ + "none", + "false", + ] or popForMask.lower() in [ + "none", + "false", + ]: + sampleToPopFileName = None + sys.stderr.write( + "No sampleToPopFileName specified. Using all individuals for masking genotypes.\n" + ) + unmaskedGenoFracCutoff = float(unmaskedGenoFracCutoff) + if unmaskedGenoFracCutoff > 1.0 or unmaskedGenoFracCutoff < 0.0: + sys.exit( + "unmaskedGenoFracCutoff must lie within [0, 1]. AAARRRRGGGGHHHHH!!!!\n" + ) + + +def getSubWinBounds(subWinLen, totalPhysLen): # get inclusive subwin bounds + subWinStart = 1 + subWinEnd = subWinStart + subWinLen - 1 + subWinBounds = [(subWinStart, subWinEnd)] + numSubWins = totalPhysLen // subWinLen + for i in range(1, numSubWins - 1): + subWinStart += subWinLen + subWinEnd += subWinLen + subWinBounds.append((subWinStart, subWinEnd)) + subWinStart += subWinLen + # if our subwindows are 1 bp too short due to rounding error, the last window picks up all of the slack + subWinEnd = totalPhysLen + subWinBounds.append((subWinStart, subWinEnd)) + return subWinBounds + + +if not maskFileName: + unmasked = [True] * totalPhysLen +else: + drawWithReplacement = False + sys.stderr.write("reading masking data...") + maskData = readMaskDataForTraining( + maskFileName, + totalPhysLen, + subWinLen, + chrArmsForMasking, + shuffle=True, + cutoff=unmaskedFracCutoff, + genoCutoff=unmaskedGenoFracCutoff, + vcfForMaskFileName=vcfForMaskFileName, + pop=popForMask, + sampleToPopFileName=sampleToPopFileName, + ) + if vcfForMaskFileName: + maskData, genoMaskData = maskData + else: + genoMaskData = [None] * len(maskData) + sys.stderr.write("done!\n") + + if len(maskData) < numInstances: + sys.stderr.write( + "Warning: didn't get enough windows from masked data (needed %d; got %d); will draw with replacement!!\n" + % (numInstances, len(maskData)) + ) + drawWithReplacement = True + else: + sys.stderr.write( + "Got enough windows from masked data (needed %d; got %d); will draw without replacement.\n" + % (numInstances, len(maskData)) + ) + + +def getSnpIndicesInSubWins(subWinBounds, snpLocs): + snpIndicesInSubWins = [] + for subWinIndex in range(len(subWinBounds)): + snpIndicesInSubWins.append([]) + + subWinIndex = 0 + for i in range(len(snpLocs)): + while not ( + snpLocs[i] >= subWinBounds[subWinIndex][0] + and snpLocs[i] <= subWinBounds[subWinIndex][1] + ): + subWinIndex += 1 + snpIndicesInSubWins[subWinIndex].append(i) + return snpIndicesInSubWins + + +subWinBounds = getSubWinBounds(subWinLen, totalPhysLen) +# statNames = ["pi", "thetaW", "tajD", "nDiplos","diplo_H2","diplo_H12","diplo_H2/H1","diplo_ZnS","diplo_Omega"] +statNames = [ + "pi", + "thetaW", + "tajD", + "distVar", + "distSkew", + "distKurt", + "nDiplos", + "diplo_H1", + "diplo_H12", + "diplo_H2/H1", + "diplo_ZnS", + "diplo_Omega", +] +header = [] +for statName in statNames: + for i in range(numSubWins): + header.append("%s_win%d" % (statName, i)) +header = "\t".join(header) + + +statVals = {} +for statName in statNames: + statVals[statName] = [] +start = time.perf_counter() +numInstancesDone = 0 +sys.stderr.write("ready to process sim reps. here we go!\n") +for instanceIndex in range(numInstances): + sys.stderr.write("starting rep %d of %d\n" % (instanceIndex, numInstances)) + hapArrayIn, positionArray = readNextMsRepToHaplotypeArrayIn( + trainingDataFileObj, sampleSize, totalPhysLen + ) + + haps = allel.HaplotypeArray(hapArrayIn, dtype="i1") + if maskFileName: + if drawWithReplacement: + randIndex = random.randint(0, len(maskData) - 1) + unmasked, genoMasks = maskData[randIndex], genoMaskData[randIndex] + else: + unmasked, genoMasks = ( + maskData[instanceIndex], + genoMaskData[instanceIndex], + ) + assert len(unmasked) == totalPhysLen + if haps.shape[1] % 2 == 1: + haps = haps[:, :-1] + genos = haps.to_genotypes(ploidy=2) + unmaskedSnpIndices = [ + i for i in range(len(positionArray)) if unmasked[positionArray[i] - 1] + ] + if len(unmaskedSnpIndices) == 0: + sys.stderr.write("no snps for rep %d\n" % (instanceIndex)) + for statName in statNames: + statVals[statName].append([]) + for subWinIndex in range(numSubWins): + for statName in statNames: + appendStatValsForMonomorphic( + statName, statVals, instanceIndex, subWinIndex + ) + else: + sys.stderr.write("processing snps for rep %d\n" % (instanceIndex)) + if maskFileName: + preMaskCount = np.sum(genos.count_alleles()) + if genoMasks: + sys.stderr.write( + "%d snps in the masking window for rep %d\n" + % (len(genoMasks), instanceIndex) + ) + genos = maskGenos( + genos.subset(sel0=unmaskedSnpIndices), genoMasks + ) + else: + genos = genos.subset(sel0=unmaskedSnpIndices) + alleleCountsUnmaskedOnly = genos.count_alleles() + maskedCount = preMaskCount - np.sum(alleleCountsUnmaskedOnly) + sys.stderr.write( + "%d of %d genotypes (%.2f%%) masked for rep %d\n" + % ( + maskedCount, + preMaskCount, + 100 * maskedCount / preMaskCount, + instanceIndex, + ) + ) + else: + alleleCountsUnmaskedOnly = genos.count_alleles() + positionArrayUnmaskedOnly = [ + positionArray[i] for i in unmaskedSnpIndices + ] + snpIndicesInSubWins = getSnpIndicesInSubWins( + subWinBounds, positionArrayUnmaskedOnly + ) + for statName in statNames: + statVals[statName].append([]) + for subWinIndex in range(numSubWins): + subWinStart, subWinEnd = subWinBounds[subWinIndex] + unmaskedFrac = unmasked[subWinStart - 1 : subWinEnd].count( + True + ) / float(subWinLen) + assert unmaskedFrac >= unmaskedFracCutoff + snpIndicesInSubWinUnmasked = snpIndicesInSubWins[subWinIndex] + sys.stderr.write( + "examining subwindow %d which has %d unmasked SNPs\n" + % (subWinIndex, len(snpIndicesInSubWinUnmasked)) + ) + if len(snpIndicesInSubWinUnmasked) > 0: + genosInSubWin = genos.subset(sel0=snpIndicesInSubWinUnmasked) + for statName in statNames: + calcAndAppendStatValDiplo( + alleleCountsUnmaskedOnly, + positionArrayUnmaskedOnly, + statName, + subWinStart, + subWinEnd, + statVals, + instanceIndex, + subWinIndex, + genosInSubWin, + unmasked, + ) + else: + for statName in statNames: + appendStatValsForMonomorphic( + statName, statVals, instanceIndex, subWinIndex + ) + numInstancesDone += 1 + sys.stderr.write( + "finished %d reps after %f seconds\n" + % (numInstancesDone, time.perf_counter() - start) + ) + +if numInstancesDone != numInstances: + sys.exit( + "Expected %d reps but only processed %d. Perhaps we are using malformed simulation output!\n" + % (numInstancesDone, numInstances) + ) + +statFiles = [] +if outStatsDir.lower() != "none": + for subWinIndex in range(numSubWins): + statFileName = "%s/%s.%d.stats" % ( + outStatsDir, + trainingDataFileName.split("/")[-1].rstrip(".gz"), + subWinIndex, + ) + statFiles.append(open(statFileName, "w")) + statFiles[-1].write("\t".join(statNames) + "\n") +with open(fvecFileName, "w") as fvecFile: + fvecFile.write(header + "\n") + for i in range(numInstancesDone): + statLines = [] + for subWinIndex in range(numSubWins): + statLines.append([]) + outVec = [] + for statName in statNames: + outVec += normalizeFeatureVec(statVals[statName][i]) + for subWinIndex in range(numSubWins): + statLines[subWinIndex].append( + statVals[statName][i][subWinIndex] + ) + if statFiles: + for subWinIndex in range(numSubWins): + statFiles[subWinIndex].write( + "\t".join([str(x) for x in statLines[subWinIndex]]) + "\n" + ) + fvecFile.write("\t".join([str(x) for x in outVec]) + "\n") + +if statFiles: + for subWinIndex in range(numSubWins): + statFiles[subWinIndex].close() + +sys.stderr.write( + "total time spent calculating summary statistics and generating feature vectors: %f secs\n" + % (time.perf_counter() - start) +) +closeMsOutFile(trainingDataFileObj) diff --git a/makeFeatureVecsForSingleMs_ogSHIC.py b/diploshic/makeFeatureVecsForSingleMs_ogSHIC.py similarity index 61% rename from makeFeatureVecsForSingleMs_ogSHIC.py rename to diploshic/makeFeatureVecsForSingleMs_ogSHIC.py index 0f72f3c..d5168ee 100644 --- a/makeFeatureVecsForSingleMs_ogSHIC.py +++ b/diploshic/makeFeatureVecsForSingleMs_ogSHIC.py @@ -2,49 +2,61 @@ import allel import random import numpy as np -import msTools -import fvTools +import diploshic.msTools as msTools +import diploshic.fvTools as fvTools import time -'''usage example +"""usage example python makeFeatureVecsForSingleMsFileDiploid.py /san/data/dan/simulations/discoal_multipopStuff/spatialSVMSims/trainingSets/equilibNeut.msout.gz 110000 11 /san/data/ag1kg/accessibility/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.accessible.fa /san/data/ag1kg/outgroups/anc.meru_mela.fa 2L,2R,3L,3R 0.25 0.01 trainingSetsStats/ trainingSetsFeatureVecs/equilibNeut.msout.gz.fvec -''' # NOQA +""" # NOQA -trainingDataFileName, totalPhysLen, numSubWins, maskFileName, \ - ancFileName, chrArmsForMasking, unmaskedFracCutoff, pMisPol, \ - outStatsDir, fvecFileName = sys.argv[1:] +( + trainingDataFileName, + totalPhysLen, + numSubWins, + maskFileName, + ancFileName, + chrArmsForMasking, + unmaskedFracCutoff, + pMisPol, + outStatsDir, + fvecFileName, +) = sys.argv[1:] totalPhysLen = int(totalPhysLen) numSubWins = int(numSubWins) pMisPol = float(pMisPol) # below was the old call to get the iHS normalizations # standardizationInfo = readStatsDafsComputeStandardizationBins(statAndDafFileName, nBins=50, pMisPol=pMisPol) # NOQA -subWinLen = totalPhysLen//numSubWins +subWinLen = totalPhysLen // numSubWins assert totalPhysLen % numSubWins == 0 and numSubWins > 1 chrArmsForMasking = chrArmsForMasking.split(",") sys.stderr.write("file name='%s'" % (trainingDataFileName)) -trainingDataFileObj, sampleSize, numInstances =\ - msTools.openMsOutFileForSequentialReading(trainingDataFileName) +( + trainingDataFileObj, + sampleSize, + numInstances, +) = msTools.openMsOutFileForSequentialReading(trainingDataFileName) if maskFileName.lower() in ["none", "false"]: sys.stderr.write( - "maskFileName='%s': not doing any masking!\n" % (maskFileName)) + "maskFileName='%s': not doing any masking!\n" % (maskFileName) + ) maskFileName = False unmaskedFracCutoff = 1.0 else: unmaskedFracCutoff = float(unmaskedFracCutoff) if unmaskedFracCutoff > 1.0: - sys.exit( - "unmaskedFracCutoff must lie within [0, 1].\n") + sys.exit("unmaskedFracCutoff must lie within [0, 1].\n") def getSubWinBounds(subWinLen, totalPhysLen): # get inclusive subwin bounds subWinStart = 1 subWinEnd = subWinStart + subWinLen - 1 subWinBounds = [(subWinStart, subWinEnd)] - numSubWins = totalPhysLen//subWinLen - for i in range(1, numSubWins-1): + numSubWins = totalPhysLen // subWinLen + for i in range(1, numSubWins - 1): subWinStart += subWinLen subWinEnd += subWinLen subWinBounds.append((subWinStart, subWinEnd)) @@ -62,19 +74,34 @@ def getSubWinBounds(subWinLen, totalPhysLen): # get inclusive subwin bounds drawWithReplacement = False if ancFileName.lower() in ["none", "false"]: maskData = fvTools.readMaskDataForTraining( - maskFileName, totalPhysLen, subWinLen, - chrArmsForMasking, shuffle=True, cutoff=unmaskedFracCutoff) + maskFileName, + totalPhysLen, + subWinLen, + chrArmsForMasking, + shuffle=True, + cutoff=unmaskedFracCutoff, + ) else: maskData = fvTools.readMaskAndAncDataForTraining( - maskFileName, ancFileName, totalPhysLen, subWinLen, - chrArmsForMasking, shuffle=True, cutoff=unmaskedFracCutoff) + maskFileName, + ancFileName, + totalPhysLen, + subWinLen, + chrArmsForMasking, + shuffle=True, + cutoff=unmaskedFracCutoff, + ) if len(maskData) < numInstances: - sys.stderr.write("Warning: didn't get enough windows from masked data (needed %d; got %d); will draw with replacement!!\n" % ( # NOQA - numInstances, len(maskData))) + sys.stderr.write( + "Warning: didn't get enough windows from masked data (needed %d; got %d); will draw with replacement!!\n" + % (numInstances, len(maskData)) # NOQA + ) drawWithReplacement = True else: - sys.stderr.write("Got enough windows from masked data (needed %d; got %d); will draw without replacement.\n" % ( # NOQA - numInstances, len(maskData))) + sys.stderr.write( + "Got enough windows from masked data (needed %d; got %d); will draw without replacement.\n" + % (numInstances, len(maskData)) # NOQA + ) def getSnpIndicesInSubWins(subWinBounds, snpLocs): @@ -84,17 +111,33 @@ def getSnpIndicesInSubWins(subWinBounds, snpLocs): subWinIndex = 0 for i in range(len(snpLocs)): - while not (snpLocs[i] >= subWinBounds[subWinIndex][0] and - snpLocs[i] <= subWinBounds[subWinIndex][1]): + while not ( + snpLocs[i] >= subWinBounds[subWinIndex][0] + and snpLocs[i] <= subWinBounds[subWinIndex][1] + ): subWinIndex += 1 snpIndicesInSubWins[subWinIndex].append(i) return snpIndicesInSubWins subWinBounds = getSubWinBounds(subWinLen, totalPhysLen) -statNames = ["pi", "thetaW", "tajD", "thetaH", "fayWuH", "maxFDA", "HapCount", - "H1", "H12", "H2/H1", "ZnS", "Omega", "distVar", - "distSkew", "distKurt"] +statNames = [ + "pi", + "thetaW", + "tajD", + "thetaH", + "fayWuH", + "maxFDA", + "HapCount", + "H1", + "H12", + "H2/H1", + "ZnS", + "Omega", + "distVar", + "distSkew", + "distKurt", +] header = [] for statName in statNames: for i in range(numSubWins): @@ -109,10 +152,11 @@ def getSnpIndicesInSubWins(subWinBounds, snpLocs): numInstancesDone = 0 for instanceIndex in range(numInstances): hapArrayIn, positionArray = msTools.readNextMsRepToHaplotypeArrayIn( - trainingDataFileObj, sampleSize, totalPhysLen) + trainingDataFileObj, sampleSize, totalPhysLen + ) snpIndicesInSubWins = getSnpIndicesInSubWins(subWinBounds, positionArray) - haps = allel.HaplotypeArray(hapArrayIn, dtype='i1') + haps = allel.HaplotypeArray(hapArrayIn, dtype="i1") if maskFileName: if drawWithReplacement: unmasked = random.choice(maskData) @@ -120,26 +164,31 @@ def getSnpIndicesInSubWins(subWinBounds, snpLocs): unmasked = maskData[instanceIndex] assert len(unmasked) == totalPhysLen genos = haps.to_genotypes(ploidy=2) - unmaskedSnpIndices = [i for i in range( - len(positionArray)) if unmasked[positionArray[i]-1]] + unmaskedSnpIndices = [ + i for i in range(len(positionArray)) if unmasked[positionArray[i] - 1] + ] if len(unmaskedSnpIndices) == 0: for statName in statNames: statVals[statName].append([]) for subWinIndex in range(numSubWins): for statName in statNames: fvTools.appendStatValsForMonomorphic( - statName, statVals, instanceIndex, subWinIndex) + statName, statVals, instanceIndex, subWinIndex + ) else: - positionArrayUnmaskedOnly = [positionArray[i] - for i in unmaskedSnpIndices] + positionArrayUnmaskedOnly = [ + positionArray[i] for i in unmaskedSnpIndices + ] ac = genos.count_alleles() alleleCountsUnmaskedOnly = allel.AlleleCountsArray( - np.array([ac[i] for i in unmaskedSnpIndices])) + np.array([ac[i] for i in unmaskedSnpIndices]) + ) sampleSizes = [sum(x) for x in alleleCountsUnmaskedOnly] assert len(set(sampleSizes)) == 1 and sampleSizes[0] == sampleSize if pMisPol > 0: alleleCountsUnmaskedOnly = fvTools.misPolarizeAlleleCounts( - alleleCountsUnmaskedOnly, pMisPol) + alleleCountsUnmaskedOnly, pMisPol + ) # dafs = alleleCountsUnmaskedOnly[:,1]/float(sampleSizes[0]) unmaskedHaps = haps.subset(sel0=unmaskedSnpIndices) unmaskedGenos = genos.subset(sel0=unmaskedSnpIndices) @@ -148,38 +197,53 @@ def getSnpIndicesInSubWins(subWinBounds, snpLocs): statVals[statName].append([]) for subWinIndex in range(numSubWins): subWinStart, subWinEnd = subWinBounds[subWinIndex] - unmaskedFrac = unmasked[subWinStart - - 1:subWinEnd].count(True)/float(subWinLen) + unmaskedFrac = unmasked[subWinStart - 1 : subWinEnd].count( + True + ) / float(subWinLen) assert unmaskedFrac >= unmaskedFracCutoff snpIndicesInSubWinUnmasked = [ - x for x in snpIndicesInSubWins[subWinIndex] if unmasked[positionArray[x]-1]] # NOQA + x + for x in snpIndicesInSubWins[subWinIndex] + if unmasked[positionArray[x] - 1] + ] # NOQA if len(snpIndicesInSubWinUnmasked) > 0: hapsInSubWin = haps.subset(sel0=snpIndicesInSubWinUnmasked) genosInSubWin = genos.subset(sel0=snpIndicesInSubWinUnmasked) for statName in statNames: - fvTools.calcAndAppendStatVal(alleleCountsUnmaskedOnly, - positionArrayUnmaskedOnly, - statName, subWinStart, - subWinEnd, statVals, - instanceIndex, subWinIndex, - hapsInSubWin, unmasked, - precomputedStats) + fvTools.calcAndAppendStatVal( + alleleCountsUnmaskedOnly, + positionArrayUnmaskedOnly, + statName, + subWinStart, + subWinEnd, + statVals, + instanceIndex, + subWinIndex, + hapsInSubWin, + unmasked, + precomputedStats, + ) else: for statName in statNames: fvTools.appendStatValsForMonomorphic( - statName, statVals, instanceIndex, subWinIndex) + statName, statVals, instanceIndex, subWinIndex + ) numInstancesDone += 1 if numInstancesDone != numInstances: - sys.exit("Expected %d reps but only processed %d. Perhaps we are using malformed simulation output!\n" % ( # NOQA - numInstancesDone, numInstances)) + sys.exit( + "Expected %d reps but only processed %d. Perhaps we are using malformed simulation output!\n" + % (numInstancesDone, numInstances) # NOQA + ) statFiles = [] if outStatsDir.lower() != "none": for subWinIndex in range(numSubWins): statFileName = "%s/%s.%d.stats" % ( - outStatsDir, trainingDataFileName.split("/")[-1].rstrip(".gz"), - subWinIndex) + outStatsDir, + trainingDataFileName.split("/")[-1].rstrip(".gz"), + subWinIndex, + ) statFiles.append(open(statFileName, "w")) statFiles[-1].write("\t".join(statNames) + "\n") with open(fvecFileName, "w") as fvecFile: @@ -194,17 +258,21 @@ def getSnpIndicesInSubWins(subWinBounds, snpLocs): outVec += fvTools.normalizeFeatureVec(statVals[statName][i]) for subWinIndex in range(numSubWins): statLines[subWinIndex].append( - statVals[statName][i][subWinIndex]) + statVals[statName][i][subWinIndex] + ) if statFiles: for subWinIndex in range(numSubWins): statFiles[subWinIndex].write( - "\t".join([str(x) for x in statLines[subWinIndex]]) + "\n") + "\t".join([str(x) for x in statLines[subWinIndex]]) + "\n" + ) fvecFile.write("\t".join([str(x) for x in outVec]) + "\n") if statFiles: for subWinIndex in range(numSubWins): statFiles[subWinIndex].close() -sys.stderr.write("total time spent calculating summary statistics and generating feature vectors: %f secs\n" % ( # NOQA - time.perf_counter()-start)) +sys.stderr.write( + "total time spent calculating summary statistics and generating feature vectors: %f secs\n" + % (time.perf_counter() - start) # NOQA +) msTools.closeMsOutFile(trainingDataFileObj) diff --git a/makeTrainingSets.py b/diploshic/makeTrainingSets.py similarity index 51% rename from makeTrainingSets.py rename to diploshic/makeTrainingSets.py index 5881084..35e622e 100644 --- a/makeTrainingSets.py +++ b/diploshic/makeTrainingSets.py @@ -1,7 +1,14 @@ import sys, os, random -neutTrainingFileName, softTrainingFilePrefix, hardTrainingFilePrefix, sweepTrainingWindows, linkedTrainingWindows, outDir = sys.argv[1:] -#sweepTrainingWindows and linkedTrainingWindows are comma-separated lists +( + neutTrainingFileName, + softTrainingFilePrefix, + hardTrainingFilePrefix, + sweepTrainingWindows, + linkedTrainingWindows, + outDir, +) = sys.argv[1:] +# sweepTrainingWindows and linkedTrainingWindows are comma-separated lists sweepFilePaths, linkedFilePaths = {}, {} for trainingFilePrefix in [softTrainingFilePrefix, hardTrainingFilePrefix]: @@ -16,14 +23,21 @@ if fileName.startswith(trainingFilePrefixDirless): winNum = int(fileName.split("_")[1].split(".")[0]) if winNum in linkedWins: - linkedFilePaths[trainingFilePrefix].append(trainingSetDir + "/" + fileName) + linkedFilePaths[trainingFilePrefix].append( + trainingSetDir + "/" + fileName + ) elif winNum in sweepWins: - sweepFilePaths[trainingFilePrefix].append(trainingSetDir + "/" + fileName) + sweepFilePaths[trainingFilePrefix].append( + trainingSetDir + "/" + fileName + ) + def getExamplesFromFVFile(simFileName): try: - simFile = open(simFileName,'rt') - lines = [line.strip() for line in simFile.readlines() if not "nan" in line] + simFile = open(simFileName, "rt") + lines = [ + line.strip() for line in simFile.readlines() if not "nan" in line + ] header = lines[0] examples = lines[1:] simFile.close() @@ -31,6 +45,7 @@ def getExamplesFromFVFile(simFileName): except Exception: return "", [] + def getExamplesFromFVFileLs(simFileLs): examples = [] keptHeader = "" @@ -41,6 +56,7 @@ def getExamplesFromFVFileLs(simFileLs): examples += currExamples return keptHeader, examples + def getMinButNonZeroExamples(lsLs): counts = [] for ls in lsLs: @@ -50,24 +66,57 @@ def getMinButNonZeroExamples(lsLs): raise Exception return min(counts) + header, neutExamples = getExamplesFromFVFile(neutTrainingFileName) -linkedSoftHeader, linkedSoftExamples = getExamplesFromFVFileLs(linkedFilePaths[softTrainingFilePrefix]) -softHeader, softExamples = getExamplesFromFVFileLs(sweepFilePaths[softTrainingFilePrefix]) -linkedHardHeader, linkedHardExamples = getExamplesFromFVFileLs(linkedFilePaths[hardTrainingFilePrefix]) -hardHeader, hardExamples = getExamplesFromFVFileLs(sweepFilePaths[hardTrainingFilePrefix]) -trainingSetLs = [linkedSoftExamples, softExamples, linkedHardExamples, hardExamples,neutExamples] +linkedSoftHeader, linkedSoftExamples = getExamplesFromFVFileLs( + linkedFilePaths[softTrainingFilePrefix] +) +softHeader, softExamples = getExamplesFromFVFileLs( + sweepFilePaths[softTrainingFilePrefix] +) +linkedHardHeader, linkedHardExamples = getExamplesFromFVFileLs( + linkedFilePaths[hardTrainingFilePrefix] +) +hardHeader, hardExamples = getExamplesFromFVFileLs( + sweepFilePaths[hardTrainingFilePrefix] +) +trainingSetLs = [ + linkedSoftExamples, + softExamples, + linkedHardExamples, + hardExamples, + neutExamples, +] numExamplesToKeep = getMinButNonZeroExamples(trainingSetLs) for i in range(len(trainingSetLs)): random.shuffle(trainingSetLs[i]) trainingSetLs[i] = trainingSetLs[i][:numExamplesToKeep] -linkedSoftExamples, softExamples, linkedHardExamples, hardExamples, neutExamples = trainingSetLs +( + linkedSoftExamples, + softExamples, + linkedHardExamples, + hardExamples, + neutExamples, +) = trainingSetLs -outFileNames = ["neut.fvec", "linkedSoft.fvec", "soft.fvec", "linkedHard.fvec", "hard.fvec"] -outExamples = [neutExamples, linkedSoftExamples, softExamples, linkedHardExamples, hardExamples] +outFileNames = [ + "neut.fvec", + "linkedSoft.fvec", + "soft.fvec", + "linkedHard.fvec", + "hard.fvec", +] +outExamples = [ + neutExamples, + linkedSoftExamples, + softExamples, + linkedHardExamples, + hardExamples, +] for i in range(len(outFileNames)): if outExamples[i]: - outFile = open(outDir +"/"+ outFileNames[i], "w") - outFile.write(hardHeader+"\n") + outFile = open(outDir + "/" + outFileNames[i], "w") + outFile.write(hardHeader + "\n") for example in outExamples[i]: - outFile.write("%s\n" %(example)) + outFile.write("%s\n" % (example)) outFile.close() diff --git a/misc.py b/diploshic/misc.py similarity index 83% rename from misc.py rename to diploshic/misc.py index 70d378a..79dc084 100644 --- a/misc.py +++ b/diploshic/misc.py @@ -1,8 +1,10 @@ import numpy as np from scipy.sparse import coo_matrix + """This is all a bunch of stuff copied from sk-learn 0.24.2 but shoving it in here for compatibility purposes. Some slight modifications were made.""" + class ConfusionMatrixDisplay: """Confusion Matrix visualization. It is recommend to use :func:`~sklearn.metrics.plot_confusion_matrix` to @@ -53,13 +55,21 @@ class ConfusionMatrixDisplay: ... display_labels=clf.classes_) >>> disp.plot() # doctest: +SKIP """ + def __init__(self, confusion_matrix, *, display_labels=None): self.confusion_matrix = confusion_matrix self.display_labels = display_labels - def plot(self, *, include_values=True, cmap='viridis', - xticks_rotation='horizontal', values_format=None, - ax=None, colorbar=True): + def plot( + self, + *, + include_values=True, + cmap="viridis", + xticks_rotation="horizontal", + values_format=None, + ax=None, + colorbar=True + ): """Plot visualization. Parameters ---------- @@ -91,7 +101,7 @@ def plot(self, *, include_values=True, cmap='viridis', cm = self.confusion_matrix n_classes = cm.shape[0] - self.im_ = ax.imshow(cm, interpolation='nearest', cmap=cmap) + self.im_ = ax.imshow(cm, interpolation="nearest", cmap=cmap) self.text_ = None cmap_min, cmap_max = self.im_.cmap(0), self.im_.cmap(256) @@ -106,18 +116,17 @@ def plot(self, *, include_values=True, cmap='viridis', color = cmap_max if cm[i, j] < thresh else cmap_min if values_format is None: - text_cm = format(cm[i, j], '.2g') - if cm.dtype.kind != 'f': - text_d = format(cm[i, j], 'd') + text_cm = format(cm[i, j], ".2g") + if cm.dtype.kind != "f": + text_d = format(cm[i, j], "d") if len(text_d) < len(text_cm): text_cm = text_d else: text_cm = format(cm[i, j], values_format) self.text_[i, j] = ax.text( - j, i, text_cm, - ha="center", va="center", - color=color) + j, i, text_cm, ha="center", va="center", color=color + ) if self.display_labels is None: display_labels = np.arange(n_classes) @@ -125,12 +134,14 @@ def plot(self, *, include_values=True, cmap='viridis', display_labels = self.display_labels if colorbar: fig.colorbar(self.im_, ax=ax) - ax.set(xticks=np.arange(n_classes), - yticks=np.arange(n_classes), - xticklabels=display_labels, - yticklabels=display_labels, - ylabel="True label", - xlabel="Predicted label") + ax.set( + xticks=np.arange(n_classes), + yticks=np.arange(n_classes), + xticklabels=display_labels, + yticklabels=display_labels, + ylabel="True label", + xlabel="Predicted label", + ) ax.set_ylim((n_classes - 0.5, -0.5)) plt.setp(ax.get_xticklabels(), rotation=xticks_rotation) @@ -140,8 +151,9 @@ def plot(self, *, include_values=True, cmap='viridis', return self -def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, - normalize=None): +def confusion_matrix( + y_true, y_pred, *, labels=None, sample_weight=None, normalize=None +): """Compute confusion matrix to evaluate the accuracy of a classification. By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}` is equal to the number of observations known to be in group :math:`i` and @@ -223,15 +235,20 @@ def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, else: sample_weight = np.asarray(sample_weight) - if normalize not in ['true', 'pred', 'all', None]: - raise ValueError("normalize must be one of {'true', 'pred', " - "'all', None}") + if normalize not in ["true", "pred", "all", None]: + raise ValueError( + "normalize must be one of {'true', 'pred', " "'all', None}" + ) n_labels = labels.size label_to_ind = {y: x for x, y in enumerate(labels)} # convert yt, yp into index - y_pred = np.array([label_to_ind.get(np.argmax(x), n_labels + 1) for x in y_pred]) - y_true = np.array([label_to_ind.get(np.argmax(x), n_labels + 1) for x in y_true]) + y_pred = np.array( + [label_to_ind.get(np.argmax(x), n_labels + 1) for x in y_pred] + ) + y_true = np.array( + [label_to_ind.get(np.argmax(x), n_labels + 1) for x in y_true] + ) # intersect y_pred, y_true with labels, eliminate items not in labels ind = np.logical_and(y_pred < n_labels, y_true < n_labels) @@ -241,32 +258,45 @@ def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, sample_weight = sample_weight[ind] # Choose the accumulator dtype to always have high precision - if sample_weight.dtype.kind in {'i', 'u', 'b'}: + if sample_weight.dtype.kind in {"i", "u", "b"}: dtype = np.int64 else: dtype = np.float64 - cm = coo_matrix((sample_weight, (y_true, y_pred)), - shape=(n_labels, n_labels), dtype=dtype, - ).toarray() + cm = coo_matrix( + (sample_weight, (y_true, y_pred)), + shape=(n_labels, n_labels), + dtype=dtype, + ).toarray() - with np.errstate(all='ignore'): - if normalize == 'true': + with np.errstate(all="ignore"): + if normalize == "true": cm = cm / cm.sum(axis=1, keepdims=True) - elif normalize == 'pred': + elif normalize == "pred": cm = cm / cm.sum(axis=0, keepdims=True) - elif normalize == 'all': + elif normalize == "all": cm = cm / cm.sum() cm = np.nan_to_num(cm) return cm -def plot_confusion_matrix(estimator, X, y_true, *, labels=None, - sample_weight=None, normalize=None, - display_labels=None, include_values=True, - xticks_rotation='horizontal', - values_format=None, - cmap='viridis', ax=None, colorbar=True): + +def plot_confusion_matrix( + estimator, + X, + y_true, + *, + labels=None, + sample_weight=None, + normalize=None, + display_labels=None, + include_values=True, + xticks_rotation="horizontal", + values_format=None, + cmap="viridis", + ax=None, + colorbar=True +): """Plot Confusion Matrix. Read more in the :ref:`User Guide `. Parameters @@ -332,10 +362,21 @@ def plot_confusion_matrix(estimator, X, y_true, *, labels=None, """ y_pred = estimator.predict(X) - cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight, - labels=labels, normalize=normalize) - disp = ConfusionMatrixDisplay(confusion_matrix=cm, - display_labels=display_labels) - return disp.plot(include_values=include_values, - cmap=cmap, ax=ax, xticks_rotation=xticks_rotation, - values_format=values_format, colorbar=colorbar) + cm = confusion_matrix( + y_true, + y_pred, + sample_weight=sample_weight, + labels=labels, + normalize=normalize, + ) + disp = ConfusionMatrixDisplay( + confusion_matrix=cm, display_labels=display_labels + ) + return disp.plot( + include_values=include_values, + cmap=cmap, + ax=ax, + xticks_rotation=xticks_rotation, + values_format=values_format, + colorbar=colorbar, + ) diff --git a/msTools.py b/diploshic/msTools.py similarity index 69% rename from msTools.py rename to diploshic/msTools.py index b3aeba2..b3bd7d4 100644 --- a/msTools.py +++ b/diploshic/msTools.py @@ -17,7 +17,7 @@ def fillInSnpSlotsWithOverflowers(newPositions, totalPhysLen, overflowers): posH[pos] = 1 for i in range(len(overflowers)): del newPositions[-1] - for pos in reversed(range(1, totalPhysLen+1)): + for pos in reversed(range(1, totalPhysLen + 1)): if pos not in posH: bisect.insort_left(newPositions, pos) overflowers.pop() @@ -31,14 +31,16 @@ def msPositionsToIntegerPositions(positions, totalPhysLen): prevIntPos = -1 newPositions = [] for position in positions: - assert position >= 0 and position < 1., "Mutations positions must all be in [0, 1)" + assert ( + position >= 0 and position < 1.0 + ), "Mutations positions must all be in [0, 1)" assert position >= prevPos origPos = position if position == prevPos: position += 0.000001 prevPos = origPos - intPos = int(totalPhysLen*position) + intPos = int(totalPhysLen * position) if intPos == 0: intPos = 1 if intPos <= prevIntPos: @@ -49,13 +51,17 @@ def msPositionsToIntegerPositions(positions, totalPhysLen): if overflowers: fillInSnpSlotsWithOverflowers(newPositions, totalPhysLen, overflowers) assert len(newPositions) == len(positions) - assert all(newPositions[i] <= newPositions[i+1] - for i in range(len(newPositions)-1)) + assert all( + newPositions[i] <= newPositions[i + 1] + for i in range(len(newPositions) - 1) + ) assert newPositions[-1] <= totalPhysLen return newPositions -def msRepToHaplotypeArrayIn(samples, positions, totalPhysLen, transposeHaps, discretizePositions=True): +def msRepToHaplotypeArrayIn( + samples, positions, totalPhysLen, transposeHaps, discretizePositions=True +): for i in range(len(samples)): assert len(samples[i]) == len(positions) if discretizePositions: @@ -72,16 +78,18 @@ def msRepToHaplotypeArrayIn(samples, positions, totalPhysLen, transposeHaps, dis return hapArrayIn, positions -def msOutToHaplotypeArrayIn(msOutputFileName, totalPhysLen, discretizePositions=True): +def msOutToHaplotypeArrayIn( + msOutputFileName, totalPhysLen, discretizePositions=True +): if msOutputFileName == "stdin": isFile = False msStream = sys.stdin else: isFile = True if msOutputFileName.endswith(".gz"): - msStream = gzip.open(msOutputFileName, 'rt') + msStream = gzip.open(msOutputFileName, "rt") else: - msStream = open(msOutputFileName, 'rt') + msStream = open(msOutputFileName, "rt") header = msStream.readline() program, numSamples, numSims = header.strip().split()[:3] @@ -96,7 +104,9 @@ def msOutToHaplotypeArrayIn(msOutputFileName, totalPhysLen, discretizePositions= while line: if line.strip() != "//": sys.exit( - "Malformed ms-style output file: read '%s' instead of '//'. \n" % (line.strip())) # NOQA + "Malformed ms-style output file: read '%s' instead of '//'. \n" + % (line.strip()) + ) # NOQA segsitesBlah, segsites = msStream.readline().strip().split() segsites = int(segsites) if segsitesBlah != "segsites:": @@ -116,14 +126,26 @@ def msOutToHaplotypeArrayIn(msOutputFileName, totalPhysLen, discretizePositions= for i in range(numSamples): sampleLine = msStream.readline().strip() if len(sampleLine) != segsites: - sys.exit("Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples \n" % # NOQA - (segsites, len(sampleLine), sampleLine, i, numSamples)) # NOQA + sys.exit( + "Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples \n" + % ( + segsites, + len(sampleLine), + sampleLine, + i, + numSamples, + ) # NOQA + ) # NOQA samples.append(sampleLine) if len(samples) != numSamples: raise Exception hapArrayIn, positions = msRepToHaplotypeArrayIn( - samples, positions, totalPhysLen, True, - discretizePositions=discretizePositions) + samples, + positions, + totalPhysLen, + True, + discretizePositions=discretizePositions, + ) hapArraysIn.append(hapArrayIn) positionArrays.append(positions) line = msStream.readline() @@ -132,8 +154,10 @@ def msOutToHaplotypeArrayIn(msOutputFileName, totalPhysLen, discretizePositions= line = msStream.readline() # sys.stderr.write("finished rep %d\n" %(len(hapArraysIn))) if len(hapArraysIn) != numSims: - sys.exit("Malformed ms-style output file: %s of %s sims processed. \n" % # NOQA - (len(hapArraysIn), numSims)) + sys.exit( + "Malformed ms-style output file: %s of %s sims processed. \n" + % (len(hapArraysIn), numSims) # NOQA + ) if isFile: msStream.close() @@ -147,7 +171,7 @@ def openMsOutFileForSequentialReading(msOutputFileName): else: isFile = True if msOutputFileName.endswith(".gz"): - msStream = gzip.open(msOutputFileName, 'rt') + msStream = gzip.open(msOutputFileName, "rt") else: msStream = open(msOutputFileName) @@ -164,7 +188,13 @@ def closeMsOutFile(fileInfoTuple): msStream.close() -def readNextMsRepToHaplotypeArrayIn(fileInfoTuple, numSamples, totalPhysLen, transposeHaps=True, discretizePositions=True): +def readNextMsRepToHaplotypeArrayIn( + fileInfoTuple, + numSamples, + totalPhysLen, + transposeHaps=True, + discretizePositions=True, +): msStream, isFile = fileInfoTuple # advance to next simulation @@ -191,16 +221,37 @@ def readNextMsRepToHaplotypeArrayIn(fileInfoTuple, numSamples, totalPhysLen, tra for i in range(numSamples): sampleLine = msStream.readline().strip() if len(sampleLine) != segsites: - sys.exit("Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples \n" % # NOQA - (segsites, len(sampleLine), sampleLine, i, numSamples)) # NOQA + sys.exit( + "Malformed ms-style output file %s segsites but %s columns in line: %s; line %s of %s samples \n" + % ( + segsites, + len(sampleLine), + sampleLine, + i, + numSamples, + ) # NOQA + ) # NOQA samples.append(sampleLine) if len(samples) != numSamples: raise Exception hapArrayIn, positions = msRepToHaplotypeArrayIn( - samples, positions, totalPhysLen, transposeHaps, - discretizePositions=discretizePositions) + samples, + positions, + totalPhysLen, + transposeHaps, + discretizePositions=discretizePositions, + ) return hapArrayIn, positions -def readNextMsRepToGameteStrs(fileInfoTuple, numSamples, totalPhysLen, discretizePositions=True): - return readNextMsRepToHaplotypeArrayIn(fileInfoTuple, numSamples, totalPhysLen, transposeHaps=False, discretizePositions=discretizePositions) + +def readNextMsRepToGameteStrs( + fileInfoTuple, numSamples, totalPhysLen, discretizePositions=True +): + return readNextMsRepToHaplotypeArrayIn( + fileInfoTuple, + numSamples, + totalPhysLen, + transposeHaps=False, + discretizePositions=discretizePositions, + ) diff --git a/diploshic/setup.py b/diploshic/setup.py new file mode 100644 index 0000000..013d9ed --- /dev/null +++ b/diploshic/setup.py @@ -0,0 +1,14 @@ +# File setup.py +def configuration(parent_package="", top_path=None): + from numpy.distutils.misc_util import Configuration + + config = Configuration("", parent_package, top_path) + + config.add_extension("shicstats", sources=["shicstats.pyf", "utils.c"]) + return config + + +if __name__ == "__main__": + from numpy.distutils.core import setup + + setup(**configuration(top_path="").todict()) diff --git a/shicstats.pyf b/diploshic/shicstats.pyf similarity index 100% rename from shicstats.pyf rename to diploshic/shicstats.pyf diff --git a/testing/hard.fvec b/diploshic/testing/hard.fvec similarity index 100% rename from testing/hard.fvec rename to diploshic/testing/hard.fvec diff --git a/testing/linkedHard.fvec b/diploshic/testing/linkedHard.fvec similarity index 100% rename from testing/linkedHard.fvec rename to diploshic/testing/linkedHard.fvec diff --git a/testing/linkedSoft.fvec b/diploshic/testing/linkedSoft.fvec similarity index 100% rename from testing/linkedSoft.fvec rename to diploshic/testing/linkedSoft.fvec diff --git a/testing/neut.fvec b/diploshic/testing/neut.fvec similarity index 100% rename from testing/neut.fvec rename to diploshic/testing/neut.fvec diff --git a/testing/soft.fvec b/diploshic/testing/soft.fvec similarity index 100% rename from testing/soft.fvec rename to diploshic/testing/soft.fvec diff --git a/training/hard.fvec b/diploshic/training/hard.fvec similarity index 100% rename from training/hard.fvec rename to diploshic/training/hard.fvec diff --git a/training/linkedHard.fvec b/diploshic/training/linkedHard.fvec similarity index 100% rename from training/linkedHard.fvec rename to diploshic/training/linkedHard.fvec diff --git a/training/linkedSoft.fvec b/diploshic/training/linkedSoft.fvec similarity index 100% rename from training/linkedSoft.fvec rename to diploshic/training/linkedSoft.fvec diff --git a/training/neut.fvec b/diploshic/training/neut.fvec similarity index 100% rename from training/neut.fvec rename to diploshic/training/neut.fvec diff --git a/training/soft.fvec b/diploshic/training/soft.fvec similarity index 100% rename from training/soft.fvec rename to diploshic/training/soft.fvec diff --git a/utils.c b/diploshic/utils.c similarity index 100% rename from utils.c rename to diploshic/utils.c diff --git a/makeFeatureVecsForChrArmFromVcfDiploid.py b/makeFeatureVecsForChrArmFromVcfDiploid.py deleted file mode 100644 index 665df61..0000000 --- a/makeFeatureVecsForChrArmFromVcfDiploid.py +++ /dev/null @@ -1,163 +0,0 @@ -import os -import allel -import h5py -import numpy as np -import sys -import time -from fvTools import * - -if not len(sys.argv) in [13,15]: - sys.exit("usage:\npython makeFeatureVecsForChrArmFromVcfDiploid.py vcfFileName chrArm chrLen targetPop winSize numSubWins maskFileName unmaskedFracCutoff unmaskedGenoFracCutoff sampleToPopFileName statFileName outFileName [segmentStart segmentEnd]\n") -if len(sys.argv) == 15: - vcfFileName, chrArm, chrLen, targetPop, winSize, numSubWins, maskFileName, unmaskedFracCutoff, unmaskedGenoFracCutoff, sampleToPopFileName, statFileName, outfn, segmentStart, segmentEnd = sys.argv[1:] - segmentStart, segmentEnd = int(segmentStart), int(segmentEnd) -else: - vcfFileName, chrArm, chrLen, targetPop, winSize, numSubWins, maskFileName, unmaskedFracCutoff, unmaskedGenoFracCutoff, sampleToPopFileName, statFileName, outfn = sys.argv[1:] - segmentStart = None - -unmaskedFracCutoff = float(unmaskedFracCutoff) -if unmaskedFracCutoff < 0.0 or unmaskedFracCutoff > 1.0: - sys.exit("unmaskedFracCutoff=%s but must be within [0, 1]. AAAAARRRRGHHHHHH!!!\n" %(unmaskedFracCutoff)) -unmaskedGenoFracCutoff = float(unmaskedGenoFracCutoff) -if unmaskedGenoFracCutoff < 0.0 or unmaskedGenoFracCutoff > 1.0: - sys.exit("unmaskedGenoFracCutoff=%s but must be within [0, 1]. AAAAARRRRGHHHHHH!!!\n" %(unmaskedGenoFracCutoff)) -chrLen, winSize, numSubWins = int(chrLen), int(winSize), int(numSubWins) -assert winSize % numSubWins == 0 and numSubWins > 1 -subWinSize = int(winSize/numSubWins) - -def getSubWinBounds(chrLen, subWinSize): - lastSubWinEnd = chrLen - chrLen % subWinSize - lastSubWinStart = lastSubWinEnd - subWinSize + 1 - subWinBounds = [] - for subWinStart in range(1, lastSubWinStart+1, subWinSize): - subWinEnd = subWinStart + subWinSize - 1 - subWinBounds.append((subWinStart, subWinEnd)) - return subWinBounds - -def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs): - subWinStart = 1 - subWinEnd = subWinStart + subWinSize - 1 - snpIndicesInSubWins = [[]] - for i in range(len(snpLocs)): - while snpLocs[i] <= lastSubWinEnd and not (snpLocs[i] >= subWinStart and snpLocs[i] <= subWinEnd): - subWinStart += subWinSize - subWinEnd += subWinSize - snpIndicesInSubWins.append([]) - if snpLocs[i] <= lastSubWinEnd: - snpIndicesInSubWins[-1].append(i) - while subWinEnd < lastSubWinEnd: - snpIndicesInSubWins.append([]) - subWinStart += subWinSize - subWinEnd += subWinSize - return snpIndicesInSubWins - -def readSampleToPopFile(sampleToPopFileName): - table = {} - with open(sampleToPopFileName) as sampleToPopFile: - for line in sampleToPopFile: - sample, pop = line.strip().split() - table[sample] = pop - return table - -vcfFile = allel.read_vcf(vcfFileName) -chroms = vcfFile["variants/CHROM"] -positions = np.extract(chroms == chrArm, vcfFile["variants/POS"]) - -if maskFileName.lower() in ["none", "false"]: - sys.stderr.write("Warning: a mask.fa file for the chr arm with all masked sites N'ed out is strongly recommended" + - " (pass in the reference to remove Ns at the very least)!\n") - unmasked = [True] * chrLen -else: - unmasked = readMaskDataForScan(maskFileName, chrArm) - assert len(unmasked) == chrLen - -if statFileName.lower() in ["none", "false"]: - statFileName = None - -samples = vcfFile["samples"] -if not sampleToPopFileName.lower() in ["none", "false"]: - sampleToPop = readSampleToPopFile(sampleToPopFileName) - sampleIndicesToKeep = [i for i in range(len(samples)) if sampleToPop.get(samples[i], "popNotFound!") == targetPop] -else: - sampleIndicesToKeep = [i for i in range(len(samples))] -rawgenos = np.take(vcfFile["calldata/GT"], [i for i in range(len(chroms)) if chroms[i] == chrArm], axis=0) -genos = allel.GenotypeArray(rawgenos).subset(sel1=sampleIndicesToKeep) - -if segmentStart != None: - snpIndicesToKeep = [i for i in range(len(positions)) if segmentStart <= positions[i] <= segmentEnd] - positions = [positions[i] for i in snpIndicesToKeep] - genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep)) - -if isHaploidVcfGenoArray(genos): - sys.stderr.write("Detected haploid input. Converting into diploid individuals (combining haplotypes in order).\n") - genos = diploidizeGenotypeArray(genos) - -alleleCounts = genos.count_alleles() - -#remove all but mono/biallelic unmasked sites -isBiallelic = alleleCounts.is_biallelic() -for i in range(len(isBiallelic)): - if not (isBiallelic[i] and calledGenoFracAtSite(genos[i]) >= unmaskedGenoFracCutoff): - unmasked[positions[i]-1] = False -snpIndicesToKeep = [i for i in range(len(positions)) if unmasked[positions[i]-1]] -genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep)) -positions = [positions[i] for i in snpIndicesToKeep] -alleleCounts = allel.AlleleCountsArray([[alleleCounts[i][0], max(alleleCounts[i][1:])] for i in snpIndicesToKeep]) - -statNames = ["pi", "thetaW", "tajD", "distVar","distSkew","distKurt","nDiplos","diplo_H1","diplo_H12","diplo_H2/H1","diplo_ZnS","diplo_Omega"] - -subWinBounds = getSubWinBounds(chrLen, subWinSize) - -header = "chrom classifiedWinStart classifiedWinEnd bigWinRange".split() -statHeader = "chrom start end".split() -for statName in statNames: - statHeader.append(statName) - for i in range(numSubWins): - header.append("%s_win%d" %(statName, i)) -statHeader = "\t".join(statHeader) -header = "\t".join(header) -outFile=open(outfn,'w') -outFile.write(header+"\n") -statVals = {} -for statName in statNames: - statVals[statName] = [] - -startTime = time.perf_counter() -goodSubWins = [] -lastSubWinEnd = chrLen - chrLen % subWinSize -snpIndicesInSubWins = getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, positions) -subWinIndex = 0 -lastSubWinStart = lastSubWinEnd - subWinSize + 1 -if statFileName: - statFile = open(statFileName, "w") - statFile.write(statHeader + "\n") -for subWinStart in range(1, lastSubWinStart+1, subWinSize): - subWinEnd = subWinStart + subWinSize - 1 - unmaskedFrac = unmasked[subWinStart-1:subWinEnd].count(True)/float(subWinEnd-subWinStart+1) - if segmentStart == None or subWinStart >= segmentStart and subWinEnd <= segmentEnd: - sys.stderr.write("%d-%d num unmasked snps: %d; unmasked frac: %f\n" %(subWinStart, subWinEnd, len(snpIndicesInSubWins[subWinIndex]), unmaskedFrac)) - if len(snpIndicesInSubWins[subWinIndex]) > 0 and unmaskedFrac >= unmaskedFracCutoff: - genosInSubWin = allel.GenotypeArray(genos.subset(sel0=snpIndicesInSubWins[subWinIndex])) - statValStr = [] - for statName in statNames: - calcAndAppendStatValForScanDiplo(alleleCounts, positions, statName, subWinStart, subWinEnd, statVals, subWinIndex, genosInSubWin, unmasked) - goodSubWins.append(True) - if statFileName: - statFile.write("\t".join([chrArm, str(subWinStart), str(subWinEnd)] + [str(statVals[statName][-1]) for statName in statNames]) + "\n") - else: - for statName in statNames: - appendStatValsForMonomorphicForScan(statName, statVals, subWinIndex) - goodSubWins.append(False) - if goodSubWins[-numSubWins:].count(True) == numSubWins: - outVec = [] - for statName in statNames: - outVec += normalizeFeatureVec(statVals[statName][-numSubWins:]) - midSubWinEnd = subWinEnd - subWinSize*(numSubWins/2) - midSubWinStart = midSubWinEnd-subWinSize+1 - outFile.write("%s\t%d\t%d\t%d-%d\t" %(chrArm, midSubWinStart, midSubWinEnd, subWinEnd-winSize+1, subWinEnd) + "\t".join([str(x) for x in outVec])) - outFile.write('\n') - subWinIndex += 1 -if statFileName: - statFile.close() -outFile.close() -sys.stderr.write("completed in %g seconds\n" %(time.perf_counter()-startTime)) diff --git a/makeFeatureVecsForChrArmFromVcf_ogSHIC.py b/makeFeatureVecsForChrArmFromVcf_ogSHIC.py deleted file mode 100644 index 2abceef..0000000 --- a/makeFeatureVecsForChrArmFromVcf_ogSHIC.py +++ /dev/null @@ -1,171 +0,0 @@ -import os -import allel -import h5py -import numpy as np -import sys -import time -from fvTools import * - -if not len(sys.argv) in [13,15]: - sys.exit("usage:\npython makeFeatureVecsForChrArmFromVcf_ogSHIC.py chrArmFileName chrArm chrLen targetPop winSize numSubWins maskFileName sampleToPopFileName ancestralArmFaFileName statFileName outFileName [segmentStart segmentEnd]\n") -if len(sys.argv) == 15: - chrArmFileName, chrArm, chrLen, targetPop, winSize, numSubWins, maskFileName, unmaskedFracCutoff, sampleToPopFileName, ancestralArmFaFileName, statFileName, outfn, segmentStart, segmentEnd = sys.argv[1:] - segmentStart, segmentEnd = int(segmentStart), int(segmentEnd) -else: - chrArmFileName, chrArm, chrLen, targetPop, winSize, numSubWins, maskFileName, unmaskedFracCutoff, sampleToPopFileName, ancestralArmFaFileName, statFileName, outfn = sys.argv[1:] - segmentStart = None - -unmaskedFracCutoff = float(unmaskedFracCutoff) -chrLen, winSize, numSubWins = int(chrLen), int(winSize), int(numSubWins) -assert winSize % numSubWins == 0 and numSubWins > 1 -subWinSize = int(winSize/numSubWins) - -def getSubWinBounds(chrLen, subWinSize): - lastSubWinEnd = chrLen - chrLen % subWinSize - lastSubWinStart = lastSubWinEnd - subWinSize + 1 - subWinBounds = [] - for subWinStart in range(1, lastSubWinStart+1, subWinSize): - subWinEnd = subWinStart + subWinSize - 1 - subWinBounds.append((subWinStart, subWinEnd)) - return subWinBounds - -def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs): - subWinStart = 1 - subWinEnd = subWinStart + subWinSize - 1 - snpIndicesInSubWins = [[]] - for i in range(len(snpLocs)): - while snpLocs[i] <= lastSubWinEnd and not (snpLocs[i] >= subWinStart and snpLocs[i] <= subWinEnd): - subWinStart += subWinSize - subWinEnd += subWinSize - snpIndicesInSubWins.append([]) - if snpLocs[i] <= lastSubWinEnd: - snpIndicesInSubWins[-1].append(i) - while subWinEnd < lastSubWinEnd: - snpIndicesInSubWins.append([]) - subWinStart += subWinSize - subWinEnd += subWinSize - return snpIndicesInSubWins - -chrArmFile = allel.read_vcf(chrArmFileName) -chroms = chrArmFile["variants/CHROM"] -positions = np.extract(chroms == chrArm, chrArmFile["variants/POS"]) - -if maskFileName.lower() in ["none", "false"]: - sys.stderr.write("Warning: a mask.fa file for the chr arm with all masked sites N'ed out is strongly recommended" + - " (pass in the reference to remove Ns at the very least)!\n") - unmasked = [True] * chrLen -else: - unmasked = readMaskDataForScan(maskFileName, chrArm) - assert len(unmasked) == chrLen - -if statFileName.lower() in ["none", "false"]: - statFileName = None - -samples = chrArmFile["samples"] -if not sampleToPopFileName.lower() in ["none", "false"]: - sampleToPop = readSampleToPopFile(sampleToPopFileName) - sampleIndicesToKeep = [i for i in range(len(samples)) if sampleToPop.get(samples[i], "popNotFound!") == targetPop] -else: - sampleIndicesToKeep = [i for i in range(len(samples))] - -rawgenos = np.take(chrArmFile["calldata/GT"], [i for i in range(len(chroms)) if chroms[i] == chrArm], axis=0) -genos = allel.GenotypeArray(rawgenos) -refAlleles = np.extract(chroms == chrArm, chrArmFile['variants/REF']) -altAlleles = np.extract(chroms == chrArm, chrArmFile['variants/ALT']) -if segmentStart != None: - snpIndicesToKeep = [i for i in range(len(positions)) if segmentStart <= positions[i] <= segmentEnd] - positions = [positions[i] for i in snpIndicesToKeep] - refAlleles = [refAlleles[i] for i in snpIndicesToKeep] - altAlleles = [altAlleles[i] for i in snpIndicesToKeep] - genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep)) -genos = allel.GenotypeArray(genos.subset(sel1=sampleIndicesToKeep)) -alleleCounts = genos.count_alleles() - -#remove all but mono/biallelic unmasked sites -isBiallelic = alleleCounts.is_biallelic() -for i in range(len(isBiallelic)): - if not isBiallelic[i]: - unmasked[positions[i]-1] = False - -#polarize -if not ancestralArmFaFileName.lower() in ["none", "false"]: - sys.stderr.write("polarizing snps\n") - ancArm = readFaArm(ancestralArmFaFileName, chrArm).upper() - startTime = time.perf_counter() - #NOTE: mapping specifies which alleles to swap counts for based on polarization; leaves unpolarized snps alone - #NOTE: those snps need to be filtered later on (as done below)! - # this will also remove sites that could not be polarized - mapping, unmasked = polarizeSnps(unmasked, positions, refAlleles, altAlleles, ancArm) - sys.stderr.write("took %s seconds\n" %(time.perf_counter()-startTime)) - statNames = ["pi", "thetaW", "tajD", "thetaH", "fayWuH", "maxFDA", "HapCount", "H1", "H12", "H2/H1", "ZnS", "Omega", "distVar", "distSkew", "distKurt"] -else: - statNames = ["pi", "thetaW", "tajD", "HapCount", "H1", "H12", "H2/H1", "ZnS", "Omega", "distVar", "distSkew", "distKurt"] - -snpIndicesToKeep = [i for i in range(len(positions)) if unmasked[positions[i]-1]] -genos = allel.GenotypeArray(genos.subset(sel0=snpIndicesToKeep)) -positions = [positions[i] for i in snpIndicesToKeep] -alleleCounts = allel.AlleleCountsArray([[alleleCounts[i][0], max(alleleCounts[i][1:])] for i in snpIndicesToKeep]) -if not ancestralArmFaFileName.lower() in ["none", "false"]: - mapping = [mapping[i] for i in snpIndicesToKeep] - alleleCounts = alleleCounts.map_alleles(mapping) -haps = genos.to_haplotypes() - -subWinBounds = getSubWinBounds(chrLen, subWinSize) -precomputedStats = {} #not using this - -header = "chrom classifiedWinStart classifiedWinEnd bigWinRange".split() -statHeader = "chrom start end".split() -for statName in statNames: - statHeader.append(statName) - for i in range(numSubWins): - header.append("%s_win%d" %(statName, i)) -statHeader = "\t".join(statHeader) -header = "\t".join(header) -outFile=open(outfn,'w') -outFile.write(header+"\n") -statVals = {} -for statName in statNames: - statVals[statName] = [] - -startTime = time.perf_counter() -goodSubWins = [] -lastSubWinEnd = chrLen - chrLen % subWinSize -snpIndicesInSubWins = getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, positions) -subWinIndex = 0 -lastSubWinStart = lastSubWinEnd - subWinSize + 1 -if statFileName: - statFile = open(statFileName, "w") - statFile.write(statHeader + "\n") -for subWinStart in range(1, lastSubWinStart+1, subWinSize): - subWinEnd = subWinStart + subWinSize - 1 - unmaskedFrac = unmasked[subWinStart-1:subWinEnd].count(True)/float(subWinEnd-subWinStart+1) - if segmentStart == None or subWinStart >= segmentStart and subWinEnd <= segmentEnd: - sys.stderr.write("%d-%d num unmasked snps: %d; unmasked frac: %f\n" %(subWinStart, subWinEnd, len(snpIndicesInSubWins[subWinIndex]), unmaskedFrac)) - if len(snpIndicesInSubWins[subWinIndex]) > 0 and unmaskedFrac >= unmaskedFracCutoff: - hapsInSubWin = allel.HaplotypeArray(haps.subset(sel0=snpIndicesInSubWins[subWinIndex])) - statValStr = [] - for statName in statNames: - calcAndAppendStatValForScan(alleleCounts, positions, statName, subWinStart, \ - subWinEnd, statVals, subWinIndex, hapsInSubWin, unmasked, precomputedStats) - statValStr.append("%s: %s" %(statName, statVals[statName][-1])) - sys.stderr.write("\t".join(statValStr) + "\n") - goodSubWins.append(True) - if statFileName: - statFile.write("\t".join([chrArm, str(subWinStart), str(subWinEnd)] + [str(statVals[statName][-1]) for statName in statNames]) + "\n") - else: - for statName in statNames: - appendStatValsForMonomorphicForScan(statName, statVals, subWinIndex) - goodSubWins.append(False) - if goodSubWins[-numSubWins:].count(True) == numSubWins: - outVec = [] - for statName in statNames: - outVec += normalizeFeatureVec(statVals[statName][-numSubWins:]) - midSubWinEnd = subWinEnd - subWinSize*(numSubWins/2) - midSubWinStart = midSubWinEnd-subWinSize+1 - outFile.write("%s\t%d\t%d\t%d-%d\t" %(chrArm, midSubWinStart, midSubWinEnd, subWinEnd-winSize+1, subWinEnd) + "\t".join([str(x) for x in outVec])) - outFile.write('\n') - subWinIndex += 1 -if statFileName: - statFile.close() -outFile.close() -sys.stderr.write("completed in %g seconds\n" %(time.perf_counter()-startTime)) diff --git a/makeFeatureVecsForSingleMsDiploid.py b/makeFeatureVecsForSingleMsDiploid.py deleted file mode 100644 index e1000f0..0000000 --- a/makeFeatureVecsForSingleMsDiploid.py +++ /dev/null @@ -1,193 +0,0 @@ -import sys, os -import allel -import random -import numpy as np -from msTools import * -from fvTools import * -import time - -trainingDataFileName, totalPhysLen, numSubWins, maskFileName, vcfForMaskFileName, popForMask, sampleToPopFileName, unmaskedGenoFracCutoff, chrArmsForMasking, unmaskedFracCutoff, outStatsDir, fvecFileName = sys.argv[1:] -totalPhysLen = int(totalPhysLen) -numSubWins = int(numSubWins) -subWinLen = totalPhysLen//numSubWins -assert totalPhysLen % numSubWins == 0 and numSubWins > 1 - -sys.stderr.write("file name='%s'" %(trainingDataFileName)) - -trainingDataFileObj, sampleSize, numInstances = openMsOutFileForSequentialReading(trainingDataFileName) - -if maskFileName.lower() in ["none", "false"]: - sys.stderr.write("maskFileName='%s': not masking any sites!\n" %(maskFileName)) - maskFileName = False - unmaskedFracCutoff = 1.0 -else: - chrArmsForMasking = chrArmsForMasking.split(",") - unmaskedFracCutoff = float(unmaskedFracCutoff) - if unmaskedFracCutoff > 1.0 or unmaskedFracCutoff < 0.0: - sys.exit("unmaskedFracCutoff must lie within [0, 1]. AAARRRRGGGGHHHHH!!!!\n") - -if vcfForMaskFileName.lower() in ["none", "false"]: - sys.stderr.write("vcfForMaskFileName='%s': not masking any genotypes!\n" %(vcfForMaskFileName)) - vcfForMaskFileName = False -else: - if not maskFileName: - sys.exit("Cannot mask genotypes without also supplying a file for masking entire sites (can use reference genome with Ns if desired). AAARRRGHHHHH!!!!!!\n") - if sampleToPopFileName.lower() in ["none", "false"] or popForMask.lower() in ["none", "false"]: - sampleToPopFileName = None - sys.stderr.write("No sampleToPopFileName specified. Using all individuals for masking genotypes.\n") - unmaskedGenoFracCutoff = float(unmaskedGenoFracCutoff) - if unmaskedGenoFracCutoff > 1.0 or unmaskedGenoFracCutoff < 0.0: - sys.exit("unmaskedGenoFracCutoff must lie within [0, 1]. AAARRRRGGGGHHHHH!!!!\n") - -def getSubWinBounds(subWinLen, totalPhysLen): # get inclusive subwin bounds - subWinStart = 1 - subWinEnd = subWinStart + subWinLen - 1 - subWinBounds = [(subWinStart, subWinEnd)] - numSubWins = totalPhysLen//subWinLen - for i in range(1, numSubWins-1): - subWinStart += subWinLen - subWinEnd += subWinLen - subWinBounds.append((subWinStart, subWinEnd)) - subWinStart += subWinLen - # if our subwindows are 1 bp too short due to rounding error, the last window picks up all of the slack - subWinEnd = totalPhysLen - subWinBounds.append((subWinStart, subWinEnd)) - return subWinBounds - -if not maskFileName: - unmasked = [True] * totalPhysLen -else: - drawWithReplacement = False - sys.stderr.write("reading masking data...") - maskData = readMaskDataForTraining(maskFileName, totalPhysLen, subWinLen, chrArmsForMasking, shuffle=True, cutoff=unmaskedFracCutoff, - genoCutoff=unmaskedGenoFracCutoff, vcfForMaskFileName=vcfForMaskFileName, pop=popForMask, - sampleToPopFileName=sampleToPopFileName) - if vcfForMaskFileName: - maskData, genoMaskData = maskData - else: - genoMaskData = [None]*len(maskData) - sys.stderr.write("done!\n") - - if len(maskData) < numInstances: - sys.stderr.write("Warning: didn't get enough windows from masked data (needed %d; got %d); will draw with replacement!!\n" %(numInstances, len(maskData))) - drawWithReplacement = True - else: - sys.stderr.write("Got enough windows from masked data (needed %d; got %d); will draw without replacement.\n" %(numInstances, len(maskData))) - -def getSnpIndicesInSubWins(subWinBounds, snpLocs): - snpIndicesInSubWins = [] - for subWinIndex in range(len(subWinBounds)): - snpIndicesInSubWins.append([]) - - subWinIndex = 0 - for i in range(len(snpLocs)): - while not (snpLocs[i] >= subWinBounds[subWinIndex][0] and snpLocs[i] <= subWinBounds[subWinIndex][1]): - subWinIndex += 1 - snpIndicesInSubWins[subWinIndex].append(i) - return snpIndicesInSubWins - -subWinBounds = getSubWinBounds(subWinLen, totalPhysLen) -#statNames = ["pi", "thetaW", "tajD", "nDiplos","diplo_H2","diplo_H12","diplo_H2/H1","diplo_ZnS","diplo_Omega"] -statNames = ["pi", "thetaW", "tajD", "distVar","distSkew","distKurt","nDiplos","diplo_H1","diplo_H12","diplo_H2/H1","diplo_ZnS","diplo_Omega"] -header = [] -for statName in statNames: - for i in range(numSubWins): - header.append("%s_win%d" %(statName, i)) -header = "\t".join(header) - - -statVals = {} -for statName in statNames: - statVals[statName] = [] -start = time.perf_counter() -numInstancesDone = 0 -sys.stderr.write("ready to process sim reps. here we go!\n") -for instanceIndex in range(numInstances): - sys.stderr.write("starting rep %d of %d\n" %(instanceIndex, numInstances)) - hapArrayIn, positionArray = readNextMsRepToHaplotypeArrayIn(trainingDataFileObj, sampleSize, totalPhysLen) - - haps = allel.HaplotypeArray(hapArrayIn, dtype='i1') - if maskFileName: - if drawWithReplacement: - randIndex = random.randint(0, len(maskData)-1) - unmasked, genoMasks = maskData[randIndex], genoMaskData[randIndex] - else: - unmasked, genoMasks = maskData[instanceIndex], genoMaskData[instanceIndex] - assert len(unmasked) == totalPhysLen - if haps.shape[1] % 2 == 1: - haps = haps[:,:-1] - genos = haps.to_genotypes(ploidy=2) - unmaskedSnpIndices = [i for i in range(len(positionArray)) if unmasked[positionArray[i]-1]] - if len(unmaskedSnpIndices) == 0: - sys.stderr.write("no snps for rep %d\n" %(instanceIndex)) - for statName in statNames: - statVals[statName].append([]) - for subWinIndex in range(numSubWins): - for statName in statNames: - appendStatValsForMonomorphic(statName, statVals, instanceIndex, subWinIndex) - else: - sys.stderr.write("processing snps for rep %d\n" %(instanceIndex)) - if maskFileName: - preMaskCount = np.sum(genos.count_alleles()) - if genoMasks: - sys.stderr.write("%d snps in the masking window for rep %d\n" %(len(genoMasks), instanceIndex)) - genos = maskGenos(genos.subset(sel0=unmaskedSnpIndices), genoMasks) - else: - genos = genos.subset(sel0=unmaskedSnpIndices) - alleleCountsUnmaskedOnly = genos.count_alleles() - maskedCount = preMaskCount - np.sum(alleleCountsUnmaskedOnly) - sys.stderr.write("%d of %d genotypes (%.2f%%) masked for rep %d\n" %(maskedCount, preMaskCount, 100*maskedCount/preMaskCount, instanceIndex)) - else: - alleleCountsUnmaskedOnly = genos.count_alleles() - positionArrayUnmaskedOnly = [positionArray[i] for i in unmaskedSnpIndices] - snpIndicesInSubWins = getSnpIndicesInSubWins(subWinBounds, positionArrayUnmaskedOnly) - for statName in statNames: - statVals[statName].append([]) - for subWinIndex in range(numSubWins): - subWinStart, subWinEnd = subWinBounds[subWinIndex] - unmaskedFrac = unmasked[subWinStart-1:subWinEnd].count(True)/float(subWinLen) - assert unmaskedFrac >= unmaskedFracCutoff - snpIndicesInSubWinUnmasked = snpIndicesInSubWins[subWinIndex] - sys.stderr.write("examining subwindow %d which has %d unmasked SNPs\n" %(subWinIndex, len(snpIndicesInSubWinUnmasked))) - if len(snpIndicesInSubWinUnmasked) > 0: - genosInSubWin = genos.subset(sel0=snpIndicesInSubWinUnmasked) - for statName in statNames: - calcAndAppendStatValDiplo(alleleCountsUnmaskedOnly, positionArrayUnmaskedOnly, statName, subWinStart, \ - subWinEnd, statVals, instanceIndex, subWinIndex, genosInSubWin, unmasked) - else: - for statName in statNames: - appendStatValsForMonomorphic(statName, statVals, instanceIndex, subWinIndex) - numInstancesDone += 1 - sys.stderr.write("finished %d reps after %f seconds\n" %(numInstancesDone, time.perf_counter()-start)) - -if numInstancesDone != numInstances: - sys.exit("Expected %d reps but only processed %d. Perhaps we are using malformed simulation output!\n" %(numInstancesDone, numInstances)) - -statFiles = [] -if outStatsDir.lower() != "none": - for subWinIndex in range(numSubWins): - statFileName = "%s/%s.%d.stats" %(outStatsDir, trainingDataFileName.split("/")[-1].rstrip(".gz"), subWinIndex) - statFiles.append(open(statFileName, "w")) - statFiles[-1].write("\t".join(statNames) + "\n") -with open(fvecFileName, "w") as fvecFile: - fvecFile.write(header + "\n") - for i in range(numInstancesDone): - statLines = [] - for subWinIndex in range(numSubWins): - statLines.append([]) - outVec = [] - for statName in statNames: - outVec += normalizeFeatureVec(statVals[statName][i]) - for subWinIndex in range(numSubWins): - statLines[subWinIndex].append(statVals[statName][i][subWinIndex]) - if statFiles: - for subWinIndex in range(numSubWins): - statFiles[subWinIndex].write("\t".join([str(x) for x in statLines[subWinIndex]]) + "\n") - fvecFile.write("\t".join([str(x) for x in outVec]) + "\n") - -if statFiles: - for subWinIndex in range(numSubWins): - statFiles[subWinIndex].close() - -sys.stderr.write("total time spent calculating summary statistics and generating feature vectors: %f secs\n" %(time.perf_counter()-start)) -closeMsOutFile(trainingDataFileObj) diff --git a/meta.yaml b/meta.yaml new file mode 100644 index 0000000..db68a76 --- /dev/null +++ b/meta.yaml @@ -0,0 +1,27 @@ +package: + name: diploshic + version: "0.33333" + +source: + git_url: https://github.com/kr-colab/diploshic + git_rev: + +build: + number: 0 + +Requirements: + Build: + - numpy + run: + - numpy + - scipy + - matplotlib + - pandas + - scikit-allel + - scikit-learn + - tensorflow + - keras + +about: + home: https://github.com/kr-colab/diploshic + license: MIT diff --git a/setup.py b/setup.py index baeebd0..0284f3e 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,40 @@ -# File setup.py -def configuration(parent_package='',top_path=None): - from numpy.distutils.misc_util import Configuration - config = Configuration('',parent_package,top_path) +from numpy.distutils.core import Extension, setup - config.add_extension('shicstats', - sources = ['shicstats.pyf','utils.c']) - return config -if __name__ == "__main__": - from numpy.distutils.core import setup - setup(**configuration(top_path='').todict()) +with open("README.md", "r") as fh: + long_description = fh.read() + +shic_stats = Extension("diploshic.shicstats", + sources=["diploshic/shicstats.pyf", + "diploshic/utils.c"], + ) +setup(name='diploSHIC', + version='0.33333', + description='diploS/HIC', + long_description=long_description, + long_description_content_type="text/markdown", + url='https://github.com/kr-colab/diploSHIC', + author='Andrew Kern', + author_email='adkern@uoregon.edu', + license='MIT', + packages=['diploshic'], + install_requires=['numpy', + 'scipy', + 'matplotlib', + 'pandas', + 'scikit-allel', + 'scikit-learn', + 'tensorflow', + 'keras'], + scripts=['diploshic/diploSHIC', + 'diploshic/makeFeatureVecsForChrArmFromVcfDiploid.py', + 'diploshic/makeFeatureVecsForChrArmFromVcf_ogSHIC.py', + 'diploshic/makeFeatureVecsForSingleMsDiploid.py', + 'diploshic/makeFeatureVecsForSingleMs_ogSHIC.py', + 'diploshic/makeTrainingSets.py'], + zip_safe=False, + extras_require={ + 'dev': [], + }, + setup_requires=[], + ext_modules=[shic_stats] +) From 7b166437dd8dfa95b1ad83aa7dba3ea6fc799c26 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Tue, 23 Nov 2021 13:07:07 -0800 Subject: [PATCH 02/10] working on the readme adjustments --- README.md | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index b5abb43..7267c7d 100644 --- a/README.md +++ b/README.md @@ -22,33 +22,26 @@ such as `conda` or `pip`. The complete list of dependencies looks like this: ## Install on linux I'm going to focus on the steps involved to install on a linux machine using Anaconda as our python source / main -package manager. First download and install Anaconda +package manager. Assuming you have conda installed, create a new conda env ``` -$ wget https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh -$ bash Anaconda3-5.0.1-Linux-x86_64.sh -``` -That will give us the basics (numpy, scipy, scikit-learn, etc). Next lets install scikit-allel using `conda` -``` -$ conda install -c conda-forge scikit-allel -``` -That's easy. Installing tensorflow and keras can be slightly more touchy. You will need to determine if -you want to use a CPU-only implementation (probably) or a GPU implementation of tensorflow. See -https://www.tensorflow.org/install/install_linux for install instructions. I'm going to install the -CPU version for simplicity. tensorflow and keras are the libraries which handle the deep learning -portion of things so it is important to make sure the versions of these libraries play well together -``` -$ pip install tensorflow -$ pip install keras +$ conda create -n diploshic python=3.9 --yes ``` + Note that because I'm using the Anaconda version of python, pip will only install this in the anaconda directory -which is a good thing. Okay that should be the basics of dependencies. Now we are ready to install `diploS/HIC` itself +which is a good thing. Now we are ready to install `diploS/HIC` itself + ``` $ git clone https://github.com/kern-lab/diploSHIC.git $ cd diploSHIC -$ python setup.py install +$ pip install . ``` -Assuming all the dependencies were installed this should be all set + +This should automatically install all the dependencies including tensorflow. +You will need to determine if +you want to use a CPU-only implementation (probably) or a GPU implementation of tensorflow. See +https://www.tensorflow.org/install/install_linux for install instructions. + ## Usage The main program that you will interface with is `diploSHIC.py`. This script has four run modes that allow the user to From 611f58ec2cdfefd48321c91c92ddf2b8fea7be00 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Tue, 23 Nov 2021 13:11:24 -0800 Subject: [PATCH 03/10] updated readme --- README.md | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 7267c7d..44a2c55 100644 --- a/README.md +++ b/README.md @@ -44,19 +44,21 @@ https://www.tensorflow.org/install/install_linux for install instructions. ## Usage -The main program that you will interface with is `diploSHIC.py`. This script has four run modes that allow the user to +The main program that you will interface with is `diploSHIC`. This script is installed by default +in the conda environment `bin` directory. + This script has four run modes that allow the user to perform each of the main steps in the supervised machine learning process. We will briefly lay out the modes of use and then will provide a complete example of how to use the program for fun and profit. -`diploSHIC.py` uses the `argparse` module in python to try to give the user a complete, command line based help menu. +`diploSHIC` uses the `argparse` module in python to try to give the user a complete, command line based help menu. We can see the top level of this help by typing ``` -$ python diploSHIC.py -h -usage: diploSHIC.py [-h] {train,predict,fvecSim,fvecVcf} ... +$ diploSHIC -h +usage: diploSHIC [-h] {train,predict,fvecSim,fvecVcf} ... calculate feature vectors, train, or predict with diploSHIC -possible modes (enter 'python diploSHIC.py modeName -h' for modeName's help message: +possible modes (enter 'diploSHIC modeName -h' for modeName's help message: {fvecSim,makeTrainingSets,train,fvecVcf,predict} sub-command help fvecSim Generate feature vectors from simulated data @@ -77,15 +79,15 @@ https://github.com/kern-lab/discoal). ### feature vector generation modes The first task in our pipeline is generating feature vectors from simulation data (or empirical data) to -use with the CNN that we will train and then use for prediction. The `diploSHIC.py` script eases this +use with the CNN that we will train and then use for prediction. The `diploSHIC` script eases this process with two run modes #### fvecSim mode -The fvecSim run mode is used for turning ms-style output into feature vectors compatible with `diploSHIC.py`. The +The fvecSim run mode is used for turning ms-style output into feature vectors compatible with `diploSHIC`. The help message from this mode looks like this ``` -$ python diploSHIC.py fvecSim -h -usage: diploSHIC.py fvecSim [-h] [--totalPhysLen TOTALPHYSLEN] +$ diploSHIC fvecSim -h +usage: diploSHIC fvecSim [-h] [--totalPhysLen TOTALPHYSLEN] [--numSubWins NUMSUBWINS] [--maskFileName MASKFILENAME] [--chrArmsForMasking CHRARMSFORMASKING] @@ -145,8 +147,8 @@ for a fleshed out example of how to use these features. The fvecVcf mode is used for calculating feature vectors from data that is stored as a VCF file. The help message from this mode is as follows ``` -$ python diploSHIC.py fvecVcf -h -usage: diploSHIC.py fvecVcf [-h] [--targetPop TARGETPOP] +$ diploSHIC fvecVcf -h +usage: diploSHIC fvecVcf [-h] [--targetPop TARGETPOP] [--sampleToPopFileName SAMPLETOPOPFILENAME] [--winSize WINSIZE] [--numSubWins NUMSUBWINS] [--maskFileName MASKFILENAME] @@ -212,8 +214,8 @@ Once we have feature vector files ready to go we can train and test our CNN and Before entering train mode we need to consolidate our training set into 5 files, one for each class. This is done using the makeTrainingSets mode whose help message is as follows: ``` -$ python diploSHIC.py makeTrainingSets -h -usage: diploSHIC.py makeTrainingSets [-h] +$ diploSHIC makeTrainingSets -h +usage: diploSHIC makeTrainingSets [-h] neutTrainingFileName softTrainingFilePrefix hardTrainingFilePrefix @@ -244,10 +246,10 @@ optional arguments: -h, --help show this help message and exit ``` #### train mode -Here is the help message for the train mode of `diploSHIC.py` +Here is the help message for the train mode of `diploSHIC` ``` -$ python diploSHIC.py train -h -usage: diploSHIC.py train [-h] [--epochs EPOCHS] [--numSubWins NUMSUBWINS] +$ diploSHIC train -h +usage: diploSHIC train [-h] [--epochs EPOCHS] [--numSubWins NUMSUBWINS] trainDir testDir outputModel required arguments: @@ -266,7 +268,7 @@ optional arguments: As you will see in a moment train mode is used for training the deep learning classifier. Its required arguments are trainDir (the directory where the training feature vectors are kept), testDir (the directory where the testing feature vectors are kept), and outputModel the file name for the trained -network. One note -- `diploSHIC.py` expects five files named `hard.fvec`, `soft.fvec`, `neut.fvec`, `linkedSoft.fvec`, and +network. One note -- `diploSHIC` expects five files named `hard.fvec`, `soft.fvec`, `neut.fvec`, `linkedSoft.fvec`, and `linkedHard.fvec` in the training and testing directories. The training and testing directory can be the same directory in which case 20% of the training examples are held out for use in testing and validation. @@ -274,11 +276,11 @@ train mode has two options, the number of subwindows used for the feature vector network. ### predict mode -Once a classifier has been trained, one uses the predict mode of `diploSHIC.py` to classify empirical data. Here is the help +Once a classifier has been trained, one uses the predict mode of `diploSHIC` to classify empirical data. Here is the help statement ``` -$ python diploSHIC.py predict -h -usage: diploSHIC.py predict [-h] [--numSubWins NUMSUBWINS] +$ diploSHIC predict -h +usage: diploSHIC predict [-h] [--numSubWins NUMSUBWINS] modelStructure modelWeights predictFile predictFileOutput @@ -304,11 +306,11 @@ genomic data). Let's quickly give that code a spin. The directories `testing/` a formatted diploid feature vectors that are ready to be fed into diploSHIC. First we will train the diploSHIC CNN, but we will restrict the number of training epochs to 10 to keep things relatively brief (this runs in less than 5 minutes on our server). ``` -$ python diploSHIC.py train training/ testing/ fooModel --epochs 10 +$ diploSHIC train training/ testing/ fooModel --epochs 10 ``` as it runs a bunch of information monitoring the training of the network will apear. We are tracking the loss and accuracy in the validation set. When optimization is complete our trained network will be contained in two files, `fooModel.json` and -`fooModel.weights.hdf5`. The last bit of output from `diploSHIC.py` gives us information about the loss and accuracy on +`fooModel.weights.hdf5`. The last bit of output from `diploSHIC` gives us information about the loss and accuracy on the held out test data. From the above run my looks like this: ``` evaluation on test set: @@ -319,7 +321,7 @@ Not bad. In practice I would set the `--epochs` value much higher than 10- the d Now that we have a trained model we can make predictions on some empirical data. In the repo there is a file called `testEmpirical.fvec` that we will use as input ``` -$ python diploSHIC.py predict fooModel.json fooModel.weights.hdf5 testEmpirical.fvec testEmpirical.preds +$ diploSHIC predict fooModel.json fooModel.weights.hdf5 testEmpirical.fvec testEmpirical.preds ``` the output predictions will be saved in `testEmpirical.preds` and should be straightforward to interpret. From f311f40e35133b1be9ddb0d4ed366545701130a9 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Tue, 23 Nov 2021 13:38:58 -0800 Subject: [PATCH 04/10] Create build_wheels.yaml --- .github/workflows/build_wheels.yaml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/build_wheels.yaml diff --git a/.github/workflows/build_wheels.yaml b/.github/workflows/build_wheels.yaml new file mode 100644 index 0000000..1791681 --- /dev/null +++ b/.github/workflows/build_wheels.yaml @@ -0,0 +1,24 @@ +name: Build + +on: [push, pull_request] + +jobs: + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-20.04, windows-2019, macos-10.15] + + steps: + - uses: actions/checkout@v2 + + - name: Build wheels + uses: pypa/cibuildwheel@v2.2.2 + # to supply options, put them in 'env', like: + # env: + # CIBW_SOME_OPTION: value + + - uses: actions/upload-artifact@v2 + with: + path: ./wheelhouse/*.whl From 1804254bbbb449c3748b7293276fd702fdb3dc56 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Tue, 23 Nov 2021 13:43:10 -0800 Subject: [PATCH 05/10] Create pyproject.toml --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..bf8e1fb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = ["setuptools", "numpy"] From 2be05da17a85dba1c2ae9e8438fd3b216df9c846 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Tue, 23 Nov 2021 13:48:05 -0800 Subject: [PATCH 06/10] Update build_wheels.yaml --- .github/workflows/build_wheels.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_wheels.yaml b/.github/workflows/build_wheels.yaml index 1791681..a989ec3 100644 --- a/.github/workflows/build_wheels.yaml +++ b/.github/workflows/build_wheels.yaml @@ -8,7 +8,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04, windows-2019, macos-10.15] + os: [ubuntu-20.04, macos-10.15] steps: - uses: actions/checkout@v2 From 334c5d244dc428a45050b38172e48681b6d62976 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Tue, 23 Nov 2021 14:05:32 -0800 Subject: [PATCH 07/10] Update and rename build_wheels.yaml to deploy.yaml --- .github/workflows/build_wheels.yaml | 24 ----------- .github/workflows/deploy.yaml | 62 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 24 deletions(-) delete mode 100644 .github/workflows/build_wheels.yaml create mode 100644 .github/workflows/deploy.yaml diff --git a/.github/workflows/build_wheels.yaml b/.github/workflows/build_wheels.yaml deleted file mode 100644 index a989ec3..0000000 --- a/.github/workflows/build_wheels.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name: Build - -on: [push, pull_request] - -jobs: - build_wheels: - name: Build wheels on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-20.04, macos-10.15] - - steps: - - uses: actions/checkout@v2 - - - name: Build wheels - uses: pypa/cibuildwheel@v2.2.2 - # to supply options, put them in 'env', like: - # env: - # CIBW_SOME_OPTION: value - - - uses: actions/upload-artifact@v2 - with: - path: ./wheelhouse/*.whl diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml new file mode 100644 index 0000000..f51c186 --- /dev/null +++ b/.github/workflows/deploy.yaml @@ -0,0 +1,62 @@ +name: Build and upload to PyPI + +# Build on every branch push, tag push, and pull request change: +on: [push, pull_request] +# Alternatively, to publish when a (published) GitHub Release is created, use the following: +# on: +# push: +# pull_request: +# release: +# types: +# - published + +jobs: + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-20.04] + + steps: + - uses: actions/checkout@v2 + + - name: Build wheels + uses: pypa/cibuildwheel@v2.2.2 + + - uses: actions/upload-artifact@v2 + with: + path: ./wheelhouse/*.whl + + build_sdist: + name: Build source distribution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Build sdist + run: pipx run build --sdist + + - uses: actions/upload-artifact@v2 + with: + path: dist/*.tar.gz + + upload_pypi: + needs: [build_wheels, build_sdist] + runs-on: ubuntu-latest + # upload to PyPI on every tag starting with 'v' + if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') + # alternatively, to publish when a GitHub Release is created, use the following rule: + # if: github.event_name == 'release' && github.event.action == 'published' + steps: + - uses: actions/download-artifact@v2 + with: + name: artifact + path: dist + + - uses: pypa/gh-action-pypi-publish@v1.4.2 + with: + user: __token__ + repository_url: https://test.pypi.org/legacy/ + password: ${{ secrets.pypi_password }} + # To test: repository_url: https://test.pypi.org/legacy/ From 0167d683a404d485551ea4c0c92964bc75bbb7ce Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Tue, 23 Nov 2021 16:04:49 -0800 Subject: [PATCH 08/10] Update deploy.yaml --- .github/workflows/deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index f51c186..acb375a 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -58,5 +58,5 @@ jobs: with: user: __token__ repository_url: https://test.pypi.org/legacy/ - password: ${{ secrets.pypi_password }} + password: ${{ secrets.PYPI_TOKEN }} # To test: repository_url: https://test.pypi.org/legacy/ From 5b589cc057175a26b458527ceaf2fed963fd6274 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Tue, 23 Nov 2021 21:56:29 -0800 Subject: [PATCH 09/10] more packing woes --- README.md | 7 +++++-- setup.py | 7 ++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 44a2c55..c4c1f3a 100644 --- a/README.md +++ b/README.md @@ -46,19 +46,20 @@ https://www.tensorflow.org/install/install_linux for install instructions. ## Usage The main program that you will interface with is `diploSHIC`. This script is installed by default in the conda environment `bin` directory. - This script has four run modes that allow the user to +This script has four run modes that allow the user to perform each of the main steps in the supervised machine learning process. We will briefly lay out the modes of use and then will provide a complete example of how to use the program for fun and profit. `diploSHIC` uses the `argparse` module in python to try to give the user a complete, command line based help menu. We can see the top level of this help by typing + ``` $ diploSHIC -h usage: diploSHIC [-h] {train,predict,fvecSim,fvecVcf} ... calculate feature vectors, train, or predict with diploSHIC -possible modes (enter 'diploSHIC modeName -h' for modeName's help message: +possible modes (enter \'diploSHIC modeName -h\' for modeName\'s help message: {fvecSim,makeTrainingSets,train,fvecVcf,predict} sub-command help fvecSim Generate feature vectors from simulated data @@ -71,6 +72,8 @@ possible modes (enter 'diploSHIC modeName -h' for modeName's help message: optional arguments: -h, --help show this help message and exit ``` + + ### before running diploSHIC: simulating training/testing data All flavors of S/HIC require simulated data for training (and ideally, testing). Users can select whatever simulator they prefer and parameterize them however they wish. We have included an example script in this respository diff --git a/setup.py b/setup.py index 0284f3e..c33ad40 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +from setuptools import setup from numpy.distutils.core import Extension, setup with open("README.md", "r") as fh: @@ -8,14 +9,14 @@ "diploshic/utils.c"], ) setup(name='diploSHIC', - version='0.33333', - description='diploS/HIC', + version='0.333333', + description='diploSHIC', long_description=long_description, long_description_content_type="text/markdown", url='https://github.com/kr-colab/diploSHIC', author='Andrew Kern', author_email='adkern@uoregon.edu', - license='MIT', + license='MIT', packages=['diploshic'], install_requires=['numpy', 'scipy', From 17c0bfe1b77629e6772da7493be393d3b2725260 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Wed, 24 Nov 2021 10:50:11 -0800 Subject: [PATCH 10/10] minor changes; things are working --- diploshic/examples/Snakefile | 10 +++++----- pyproject.toml | 1 + requirements/development.txt | 25 +++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 requirements/development.txt diff --git a/diploshic/examples/Snakefile b/diploshic/examples/Snakefile index df8c260..10abb9f 100644 --- a/diploshic/examples/Snakefile +++ b/diploshic/examples/Snakefile @@ -21,8 +21,8 @@ replicates = 2 seed_array = np.random.random_integers(1,2**31,replicates) # change to the correct locations of discoal and diploSHIC -discoal_exec = "discoal/discoal" -diploSHIC_exec = "diploSHIC/diploSHIC.py" +discoal_exec = "./discoal" +diploSHIC_exec = "diploSHIC" #some parameters that should probably be moved to a .json file trainSampleNumber = 10 #the number of simulation replicates we want to generate for each file in our training set @@ -100,7 +100,7 @@ rule calc_sim_fvs: output: "{tDir}/discoal.{mod}.{x}.{i}.out.fvec" run: - cmd = f"python {diploSHIC_exec} fvecSim haploid {input} {output} --totalPhysLen={numSites} --maskFileName none --chrArmsForMasking none" + cmd = f"{diploSHIC_exec} fvecSim haploid {input} {output} --totalPhysLen={numSites} --maskFileName none --chrArmsForMasking none" shell(cmd) rule concat_fvecs: @@ -127,7 +127,7 @@ rule make_training_sets: output: expand("trainingSets/{cl}.fvec", cl = ["hard", "linkedHard", "soft", "linkedSoft", "neut"]) run: - cmd = f"python {diploSHIC_exec} makeTrainingSets train/neut_0.fvec train/soft train/hard 5 0,1,2,3,4,6,7,8,9,10 trainingSets/" + cmd = f"{diploSHIC_exec} makeTrainingSets train/neut_0.fvec train/soft train/hard 5 0,1,2,3,4,6,7,8,9,10 trainingSets/" shell(cmd) rule train_classifier: @@ -137,7 +137,7 @@ rule train_classifier: "trained_model.json", "trained_model.weights.hdf5" run: - cmd = f"python {diploSHIC_exec} train trainingSets/ trainingSets/ trained_model --epochs=10" + cmd = f"{diploSHIC_exec} train trainingSets/ trainingSets/ trained_model --epochs=10" shell(cmd) rule clean: diff --git a/pyproject.toml b/pyproject.toml index bf8e1fb..282beb0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,4 @@ [build-system] build-backend = "setuptools.build_meta" requires = ["setuptools", "numpy"] + diff --git a/requirements/development.txt b/requirements/development.txt new file mode 100644 index 0000000..8bc609e --- /dev/null +++ b/requirements/development.txt @@ -0,0 +1,25 @@ +black +blacken-docs +codecov +coverage +daiquiri +flake8 +pytest +pytest-cov +pytest-xdist +setuptools_scm +sphinx +sphinx-argparse +sphinx-issues +sphinx_rtd_theme +sphinxcontrib-programoutput +attrs +appdirs +humanize +pre-commit +pyslim>=0.600 +pandas +numpy +scikit-allel +zarr>=2.4 +biopython