diff --git a/.gitignore b/.gitignore
index 92db3fb..625e75f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
 **__pycache__
-*.pyc
\ No newline at end of file
+*.pyc
+build
+*.egg
+*egg-info
\ No newline at end of file
diff --git a/build/lib/profile_dists/__init__.py b/build/lib/profile_dists/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/build/lib/profile_dists/constants.py b/build/lib/profile_dists/constants.py
deleted file mode 100644
index bcd6af3..0000000
--- a/build/lib/profile_dists/constants.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from profile_dists.version import __version__
-MIN_FILE_SIZE = 32
-
-
-EXTENSIONS = {'text': ['txt','tsv','mat','text'],
-              'hd5': ['hd','h5','hdf5'],
-              'parquet': ['parq','parquet','pq']}
-
-
-FILE_FORMATS = ['tsv','parquet','json']
-
-VALID_INT_TYPES = ['int64','int32','int16','int8']
-
-
-OUTPUT_FILES = [
-    'run.json',
-    'allele_map.json',
-    'results.{format}',
-]
-
-
-RUN_DATA = {
-    'profile_dists': f'version: {__version__}',
-    'analysis_start_time':'',
-    'analysis_end_time':'',
-    'parameters':{},
-    'query_profile_info':{
-        'num_samples':0,
-        'num_samples_pass':0,
-        'failed_samples':[],
-        'parsed_file_path':'',
-    },
-    'ref_profile_info':{
-        'num_samples': 0,
-        'num_samples_pass': 0,
-        'failed_samples': [],
-        'parsed_file_path':'',
-    },
-    'loci_removed': [],
-    'result_file':''
-}
\ No newline at end of file
diff --git a/build/lib/profile_dists/profile_dists.py b/build/lib/profile_dists/profile_dists.py
deleted file mode 100644
index 0488667..0000000
--- a/build/lib/profile_dists/profile_dists.py
+++ /dev/null
@@ -1,238 +0,0 @@
-import sys
-from argparse import (ArgumentParser, ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter)
-import json
-import os
-from datetime import datetime
-from profile_dists.version import __version__
-from profile_dists.utils import process_profile, is_file_ok, compare_headers, filter_columns, \
-    count_missing_data, write_profiles, convert_profiles, calc_distances_scaled, calc_distances_hamming, \
-    write_dist_results, calc_batch_size, get_missing_loci_counts, flag_samples, filter_samples
-from profile_dists.constants import RUN_DATA
-
-def parse_args():
-    """ Argument Parsing method.
-
-    A function to parse the command line arguments passed at initialization of Clade-o-matic,
-    format these arguments, and return help prompts to the user shell when specified.
-
-    Returns
-    -------
-    ArgumentParser object
-        The arguments and their user specifications, the usage help prompts and the correct formatting
-        for the incoming argument (str, int, etc.)
-    """
-    class CustomFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter):
-        """
-        Class to instantiate the formatter classes required for the argument parser.
-        Required for the correct formatting of the default parser values
-
-        Parameters
-        ----------
-        ArgumentDefaultsHelpFormatter object
-            Instatiates the default values for the ArgumentParser for display on the command line.
-        RawDescriptionHelpFormatter object
-            Ensures the correct display of the default values for the ArgumentParser
-        """
-        pass
-
-    parser = ArgumentParser(
-        description="Profile Dists: Calculate genetic distances based on allele profiles v. {}".format(__version__),
-        formatter_class=CustomFormatter)
-    parser.add_argument('--query','-q', type=str, required=True, help='Query allelic profiles')
-    parser.add_argument('--ref','-r', type=str, required=True, help='Reference allelic profiles')
-    parser.add_argument('--outdir', '-o', type=str, required=True, help='Result output files')
-    parser.add_argument('--outfmt', '-u', type=str, required=False, help='Out format [matrix, pairwise]',default='matrix')
-    parser.add_argument('--file_type', '-e', type=str, required=False, help='Out format [text, parquet]',default='text')
-    parser.add_argument('--distm', '-d', type=str, required=False, help='Distance method raw hamming or scaled difference [hamming, scaled]',default='scaled')
-    parser.add_argument('--missing_thresh', '-t', type=float, required=False,
-                        help='Maximum percentage of missing data allowed per locus (0 - 1)',default=1.0)
-    parser.add_argument('--sample_qual_thresh', '-c', type=float, required=False,
-                        help='Maximum percentage of missing data allowed per sample (0 - 1)',default=1.0)
-    parser.add_argument('--match_threshold', '-a', type=str, required=False,
-                        help='Either a integer or float depending on what distance method is used (only used with pairwise format')
-    parser.add_argument('--mapping_file', '-m', type=float, required=False,
-                        help='json formatted allele mapping')
-    parser.add_argument('--force','-f', required=False, help='Overwrite existing directory',
-                        action='store_true')
-    parser.add_argument('-s', '--skip', required=False, help='Skip QA/QC steps',
-                        action='store_true')
-    parser.add_argument('-V', '--version', action='version', version="%(prog)s " + __version__)
-
-    return parser.parse_args()
-
-
-def main():
-    cmd_args = parse_args()
-    query_profile = cmd_args.query
-    ref_profile = cmd_args.ref
-    outdir = cmd_args.outdir
-    outfmt = cmd_args.outfmt
-    file_type = cmd_args.file_type
-    dist_method = cmd_args.distm
-    missing_threshold = cmd_args.missing_thresh
-    allele_mapping_file = cmd_args.mapping_file
-    force = cmd_args.force
-    match_threshold = cmd_args.match_threshold
-    sample_qual_thresh = cmd_args.sample_qual_thresh
-    skip = cmd_args.skip
-
-    run_data = RUN_DATA
-    run_data['analysis_start_time'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
-    run_data['parameters'] = vars(cmd_args)
-
-    input_files = [query_profile,ref_profile,allele_mapping_file]
-    for f in input_files:
-        if f is None:
-            continue
-        if not is_file_ok(f):
-            print(f'file {f} either does not exist or is too small to be valid')
-            sys.exit()
-
-    allele_map = {}
-    if allele_mapping_file is not None:
-        with open(allele_mapping_file) as mapping_fh:
-            allele_map = json.loads(mapping_fh.read())
-
-    if not force and os.path.isdir(outdir):
-        print(f'folder {outdir} already exists, please choose new directory or use --force')
-        sys.exit()
-
-    if outfmt != 'matrix' and outfmt != 'pairwise':
-        print(f'Supplied format does not match [matrix,pairwise]: {outfmt} ')
-        sys.exit()
-
-    if not file_type in ['text', 'parquet']:
-        print(f'Supplied filetype does not match [text, parquet]: {outfmt} ')
-        sys.exit()
-
-    if not dist_method in ['hamming','scaled']:
-        print(f'Supplied filetype does not match [hamming, scaled]: {dist_method} ')
-        sys.exit()
-
-    if missing_threshold < 0 or missing_threshold > 1:
-        print(f'Supplied threshold is not between 0 - 1: {missing_threshold} ')
-        sys.exit()
-
-    # initialize analysis directory
-    if not os.path.isdir(outdir):
-        os.makedirs(outdir, 0o755)
-
-    (allele_map, qdf) = process_profile(query_profile,column_mapping=allele_map)
-    (allele_map, rdf) = process_profile(ref_profile, column_mapping=allele_map)
-
-
-    with open(os.path.join(outdir,"allele_map.json"),'w' ) as fh:
-        fh.write(json.dumps(allele_map, indent=4))
-
-    qcols = set(qdf.columns.values.tolist())
-    rcols = set(rdf.columns.values.tolist())
-    common_cols = sorted(list(qcols & rcols))
-
-    if len(common_cols) == 0:
-        print(f'Error there are no columns in common between: {query_profile}\t{ref_profile}')
-        sys.exit()
-
-    #remove cols not present in both
-    qcols_to_remove = qcols - set(common_cols)
-    run_data['loci_removed'] = list(qcols_to_remove)
-
-    if len(qcols_to_remove) > 0:
-        qdf = filter_columns(qdf, qcols_to_remove)
-
-    rcols_to_remove = rcols - set(common_cols)
-    if len(rcols_to_remove) > 0:
-        rdf = filter_columns(rdf, qcols_to_remove)
-
-    cols_to_remove = []
-    if not skip:
-        qmissing = count_missing_data(qdf)
-        rmissing = count_missing_data(rdf)
-
-        total_samples = len(qdf) + len(rdf)
-        missing_threshold = int(missing_threshold * total_samples)
-
-        #Identify cols to remove
-
-        for col in qmissing:
-            count = qmissing[col]
-            if not col in rmissing:
-                rmissing[col] = 0
-            rmissing[col] += count
-            if rmissing[col] > missing_threshold:
-                cols_to_remove.append(col)
-
-        run_data['loci_removed'] = sorted(list(set(run_data['loci_removed']) | set(cols_to_remove)))
-
-    if len(cols_to_remove) > 0:
-        qdf = filter_columns(qdf, cols_to_remove)
-        rdf = filter_columns(rdf, cols_to_remove)
-
-    #convert profiles for fast dist calculations
-    qlabels,qprofiles = convert_profiles(qdf)
-    rlabels,rprofiles = convert_profiles(rdf)
-
-
-    run_data['query_profile_info']['num_samples'] = len(qlabels)
-    run_data['query_profile_info']['num_samples_pass'] = run_data['query_profile_info']['num_samples']
-    run_data['ref_profile_info']['num_samples'] = len(qlabels)
-    run_data['ref_profile_info']['num_samples_pass'] = run_data['ref_profile_info']['num_samples']
-
-    # write updated profiles
-    write_profiles(qdf, os.path.join(outdir, f'query_profile.{file_type}'), file_type)
-    run_data['query_profile_info']['parsed_file_path'] = os.path.join(outdir, f'query_profile.{file_type}')
-    write_profiles(rdf, os.path.join(outdir, f'ref_profile.{file_type}'), file_type)
-    run_data['ref_profile_info']['parsed_file_path'] = os.path.join(outdir, f'ref_profile.{file_type}')
-
-    if not skip:
-        # Remove poor quality samples from the comparisons
-        query_missing_data_counts = get_missing_loci_counts(qprofiles, qlabels)
-        ref_missing_data_counts = get_missing_loci_counts(rprofiles, rlabels)
-        query_samples_to_remove = flag_samples(query_missing_data_counts, sample_qual_thresh)
-        run_data['query_profile_info']['failed_samples'] = query_samples_to_remove
-        run_data['query_profile_info']['num_samples_pass'] = run_data['query_profile_info']['num_samples'] - len(query_samples_to_remove)
-        ref_samples_to_remove = flag_samples(ref_missing_data_counts, sample_qual_thresh)
-        run_data['ref_profile_info']['failed_samples'] = ref_samples_to_remove
-        run_data['ref_profile_info']['num_samples_pass'] = run_data['ref_profile_info']['num_samples'] - len(ref_samples_to_remove)
-
-
-        qlabels,qprofiles = filter_samples(qlabels, qprofiles, set(query_samples_to_remove) | set(ref_samples_to_remove))
-        rlabels, rprofiles = filter_samples(rlabels, rprofiles,
-                                            set(query_samples_to_remove) | set(ref_samples_to_remove))
-
-
-    #Automatically determine batch size that fits in available memory
-    num_records = len(qlabels) + len(rlabels)
-    num_columns = len(qprofiles[0])
-    byte_value_size = 8 #8 bytes for float64 which is the worst case
-    batch_size = calc_batch_size(num_records,num_columns,byte_value_size)
-
-    #compute distances
-    dist_matrix_file = os.path.join(outdir,f'dists.parquet')
-    if os.path.isfile(dist_matrix_file):
-        os.remove(dist_matrix_file)
-    if dist_method == 'scaled':
-        calc_distances_scaled(qprofiles,qlabels,rprofiles,rlabels,dist_matrix_file,batch_size)
-    else:
-        calc_distances_hamming(qprofiles, qlabels, rprofiles, rlabels, dist_matrix_file,batch_size)
-
-
-    #format output for output format
-    results_file = os.path.join(outdir,"results.{}".format(file_type))
-    run_data['result_file'] = results_file
-    write_dist_results(dist_matrix_file,
-                       results_file, outfmt,
-                       file_type, batch_size=batch_size, threshold=match_threshold)
-
-
-    run_data['analysis_end_time'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
-
-
-    with open(os.path.join(outdir,"run.json"),'w' ) as fh:
-        fh.write(json.dumps(run_data, indent=4))
-
-    os.remove(dist_matrix_file)
-
-
-# call main function
-if __name__ == '__main__':
-    main()
diff --git a/build/lib/profile_dists/test_data/__init__.py b/build/lib/profile_dists/test_data/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/build/lib/profile_dists/utils.py b/build/lib/profile_dists/utils.py
deleted file mode 100644
index 058c502..0000000
--- a/build/lib/profile_dists/utils.py
+++ /dev/null
@@ -1,432 +0,0 @@
-import os.path
-import shutil
-import sys
-import time
-import psutil
-import pandas as pd
-import numpy as np
-import fastparquet as fp
-import tables
-from numba import jit
-from numba.typed import List
-import pyarrow.parquet as pq
-import re
-from profile_dists.constants import MIN_FILE_SIZE, FILE_FORMATS, VALID_INT_TYPES
-
-
-def guess_format(unique_values):
-    length_equal = is_all_same_len(unique_values)
-    has_integers = contains_integers(unique_values)
-    has_alpha = contains_alpha(unique_values)
-
-    format = ''
-    #columns contains hash codes
-    if length_equal and has_integers and has_alpha:
-        format = 'hash'
-
-    # columns contains a mix of integers and other info
-    elif has_integers and has_alpha:
-        format = 'mix'
-
-    #columns contain only integers
-    elif has_integers:
-        format = 'int'
-
-    return format
-
-
-def is_all_same_len(unique_values):
-    l = set()
-    for idx,value in enumerate(unique_values):
-        if value != '0':
-            l.add(len(str(value)))
-    if len(l) == 1:
-        status = True
-    else:
-        status = False
-    return status
-
-
-def contains_integers(unique_values):
-    status = False
-    for idx, value in enumerate(unique_values):
-        if isinstance(value, int) or re.search('[0-9]+',value):
-            status = True
-            break
-    return status
-
-def contains_alpha(unique_values):
-    status = False
-    for idx, value in enumerate(unique_values):
-        if isinstance(value, int) or isinstance(value, float):
-            continue
-        if re.search('[a-zA-Z]+',value):
-            status = True
-            break
-    return status
-
-def convert_allele_codes(unique_values,method):
-    converted_values = {}
-    counter = 1
-    for idx,value in enumerate(unique_values):
-        if method == 'int':
-            converted_values[unique_values[idx]] = int(value)
-        elif method == 'hash':
-            if value == '0':
-                converted_values[unique_values[idx]] = 0
-            else:
-                converted_values[unique_values[idx]] = counter
-                counter+=1
-        else:
-            if re.search('[a-zA-Z]+',value) or re.search('\.|~|-',value):
-                value = '0'
-            converted_values[unique_values[idx]] = int(value)
-    return converted_values
-
-
-def update_column_map(c1,c2):
-    for k in c2:
-        if not k in c1:
-            c1[k] = c2[k]
-
-def is_all_columns_int(column_dtypes):
-    count_non_int = 0
-    for col in column_dtypes:
-        if col in VALID_INT_TYPES:
-            continue
-        count_non_int+=1
-    if count_non_int > 0:
-        return False
-    return True
-
-def count_missing_data(df):
-    counts = {}
-    columns = df.columns.values.tolist()
-    for c in columns:
-        counts[c] = 0
-        v = df[c].value_counts()
-        if 0 in v:
-            counts[c] = v[0]
-    return counts
-
-@jit(nopython=True)
-def identify_cols_to_remove(column_counts,threshold):
-    cols_to_remove = []
-    for c in column_counts:
-        if column_counts[c] > threshold:
-            cols_to_remove.append(c)
-    return cols_to_remove
-
-
-def filter_columns(df,columns_to_remove):
-    return df.drop(columns_to_remove, axis=1)
-
-
-def process_profile(profile_path,format="text",column_mapping={}):
-
-    if format=='text':
-        df = pd.read_csv(profile_path,header=0,sep="\t",index_col=0,low_memory=False)
-    elif format=='parquet':
-        df = pd.read_parquet(
-            profile_path,
-            engine='auto',
-            columns=None,
-            storage_options=None,
-        )
-
-    columns = df.columns.values.tolist()
-    column_dtypes = df.dtypes.tolist()
-    is_correct_format = is_all_columns_int(column_dtypes)
-
-    #If all columns are already integers then skip the extra processing steps
-    if is_correct_format:
-        return (column_mapping, df)
-
-    df = df.replace('?', '0', regex=False)
-    df = df.replace(' ', '0', regex=False)
-    df = df.replace('-', '0', regex=False)
-    df = df.replace('', '0', regex=False)
-
-    for column in columns:
-        unique_col_values = sorted(df[column].unique().tolist())
-        method = guess_format(List(unique_col_values))
-        if not column in column_mapping:
-            column_mapping[column] = convert_allele_codes(unique_col_values, method)
-        else:
-            update_column_map(column_mapping[column], convert_allele_codes(unique_col_values, method))
-
-        df[column] = df[column].map(column_mapping[column])
-    return (column_mapping, df)
-
-
-def convert_profiles(df):
-    labels = df.index.tolist()
-    profiles = []
-    for index,row in df.iterrows():
-        profiles.append(np.array(row.values.tolist()))
-    return labels, profiles
-
-def write_profiles(df,out_file,format):
-    if format == 'parquet':
-        df.to_parquet(out_file,compression='gzip')
-    else:
-        df.to_csv(out_file,sep="\t",header=True)
-
-@jit(nopython=True)
-def count_missing(p):
-    count = 0
-    for idx,value in enumerate(p):
-        if value ==0:
-            count+=1
-
-    return count
-
-
-@jit(nopython=True)
-def get_distance_raw(p1, p2):
-    count = 0
-    for v1,v2 in zip(p1,p2):
-        if v1 == 0 or v2 == 0:
-            continue
-        if v1 != v2:
-            count+=1
-    return count
-
-@jit(nopython=True)
-def get_distance_scaled(p1, p2):
-    count_compared_sites = 0
-    count_match = 0
-    for v1,v2 in zip(p1,p2):
-        if v1 == 0 or v2 == 0:
-            continue
-        count_compared_sites+=1
-        if v1 == v2:
-            count_match+=1
-    if count_compared_sites:
-        return 100.0 * (float(count_compared_sites) - float(count_match)) / float(count_compared_sites)
-    else:
-        return 100.0
-
-
-def calc_batch_size(num_records,num_columns,byte_value_size):
-    mem = psutil.virtual_memory()
-    avail = mem.available
-    p = (byte_value_size * num_columns) + 56
-    estimated_mem_needed = p * num_records
-    if estimated_mem_needed < avail:
-        return num_records
-    return int(avail / p)
-
-@jit(nopython=True)
-def validate_file(f):
-    if not os.path.isfile(f):
-        return False
-
-    if os.path.getsize(f) < MIN_FILE_SIZE:
-        return False
-
-    return True
-
-def compare_headers(file1,file2):
-    h1 = []
-    h2 = []
-    with open(file1,'r') as f1:
-        h1 = next(f1).rstrip().split("\t")
-    with open(file2, 'r') as f2:
-        h2 = next(f2).rstrip().split("\t")
-    if len(h1) > 0 and len(h2) > 0 and len(h1) == len(h2):
-        ovl = set(h1) & set(h2)
-        if len(ovl) == len(h1):
-            return True
-    return False
-
-@jit(nopython=True)
-def guess_profile_format(f):
-    ext = FILE_FORMATS
-    ftype = ''
-
-    for format in ext:
-        for e in ext[format]:
-            if f.endswith(e):
-                ftype = format
-                break
-        if ftype != '':
-            break
-
-    return ftype
-
-
-def get_file_length(f):
-    return int(os.popen(f'wc -l {f}').read().split()[0])
-
-
-def calc_distances_scaled(query_profiles,query_labels,ref_profiles,ref_labels,parquet_file,batch_size=1):
-
-    count = 0
-    columns = ["dists"] + [str(x) for x in ref_labels]
-    num_query_profiles = len(query_profiles)
-    num_ref_profiles = len(ref_profiles)
-    dists = []
-
-    #Clear an existing file as this can cause unexpected behaviour
-    if os.path.isfile(parquet_file):
-        os.remove(parquet_file)
-
-    for i in range(0, num_query_profiles):
-        d = [ query_labels[i] ]
-        for k in range(0, num_ref_profiles):
-            d.append(get_distance_scaled(query_profiles[i], ref_profiles[k]))
-        dists.append(d)
-        count += 1
-
-        if count == batch_size:
-            df = pd.DataFrame(dists, columns=columns)
-            if not os.path.isfile(parquet_file):
-                fp.write(parquet_file, df, compression='GZIP')
-            else:
-                fp.write(parquet_file, df, append=True, compression='GZIP')
-            dists = []
-            count = 0
-
-    df = pd.DataFrame(dists, columns=columns)
-    if not os.path.isfile(parquet_file):
-        fp.write(parquet_file, df, compression='GZIP')
-    else:
-        fp.write(parquet_file, df, append=True, compression='GZIP')
-
-def calc_distances_hamming(query_profiles,query_labels,ref_profiles,ref_labels,parquet_file,batch_size=1):
-    count = 0
-    columns = ["dists"] + ref_labels
-    num_query_profiles = len(query_profiles)
-    num_ref_profiles = len(ref_profiles)
-    dists = []
-
-    #Clear an existing file as this can cause unexpected behaviour
-    if os.path.isfile(parquet_file):
-        os.remove(parquet_file)
-
-    for i in range(0, num_query_profiles):
-        d = [ query_labels[i] ]
-        for k in range(0, num_ref_profiles):
-            d.append(get_distance_raw(query_profiles[i], ref_profiles[k]))
-        dists.append(d)
-        count += 1
-
-        if count == batch_size:
-            df = pd.DataFrame(dists, columns=columns)
-            if not os.path.isfile(parquet_file):
-                fp.write(parquet_file, df, compression='GZIP')
-            else:
-                fp.write(parquet_file, df, append=True, compression='GZIP')
-            dists = []
-            count = 0
-
-    df = pd.DataFrame(dists, columns=columns)
-    if not os.path.isfile(parquet_file):
-        fp.write(parquet_file, df, compression='GZIP')
-    else:
-        fp.write(parquet_file, df, append=True, compression='GZIP')
-
-
-def is_file_ok(f):
-    status = True
-    if not os.path.isfile(f):
-        status = False
-    elif get_file_length(f) < 2:
-        status = False
-    elif os.path.getsize(f) < MIN_FILE_SIZE:
-        status = False
-
-    return status
-
-@jit(nopython=True)
-def filter_dists(labels,distances,threshold):
-    results = {}
-    for id, value in zip(labels,distances):
-        if value <= threshold:
-            results[id] = value
-    return results
-
-
-def format_pairwise_dist(df,threshold=-1):
-    dists = {}
-    columns = df.columns.values.tolist()[1:]
-    for index,row in df.iterrows():
-        dists[row[0]] = {row[0]:0}
-        if threshold != -1:
-            dists[row[0]] = filter_dists(List(columns), List(row[1:]), threshold)
-        else:
-            dists[row[0]] = dict(zip(columns, row[1:]))
-        dists[row[0]] = {k: v for k, v in sorted(dists[row[0]].items(), key=lambda item: item[1])}
-
-    results = {
-        'query_id':[],
-        'ref_id':[],
-        'dist':[]
-    }
-
-    for qid in dists:
-        results['query_id'] += [qid] * len(dists[qid])
-        results['ref_id'] += list(dists[qid].keys())
-        results['dist'] += list(dists[qid].values())
-
-
-    return pd.DataFrame(results)
-
-
-def write_dist_results(mat,outfile,outtype,outfmt,batch_size=1,threshold=-1):
-
-    #If the desired output is a matrix in parquet format simply rename the mat file
-    if outtype == 'matrix' and outfmt == 'parquet':
-        os.rename(mat,outfile)
-        return
-    init_file = True
-    parquet_file = pq.ParquetFile(mat)
-    for batch in parquet_file.iter_batches(batch_size):
-        batch_df = batch.to_pandas()
-
-        if outtype == 'pairwise':
-            batch_df = format_pairwise_dist(batch_df, threshold=threshold)
-        if init_file:
-            init_file = False
-            if outfmt == 'text' and outtype == 'matrix':
-                batch_df.to_csv(outfile,index = False, header = True, sep="\t")
-            elif outfmt == 'text' and outtype == 'pairwise':
-                batch_df.to_csv(outfile, index=False, header=True, sep="\t")
-            else:
-                if not os.path.isfile(outfile):
-                    fp.write(outfile, batch_df, compression='GZIP')
-        else:
-            if outfmt == 'text' and outtype == 'matrix':
-                batch_df.to_csv(outfile, mode ='a', index = False, header = False, sep="\t")
-            elif outfmt == 'text' and outtype == 'pairwise':
-                batch_df.to_csv(outfile, mode ='a', index = False, header = False, sep="\t")
-            else:
-                fp.write(parquet_file, batch_df, append=True, compression='GZIP')
-
-def get_missing_loci_counts(profiles,labels):
-    n = len(labels)
-    counts = {}
-    for i in range(0, n):
-        counts[labels[i]] = count_missing(profiles[i])/n
-    return counts
-
-def flag_samples(missing_counts,threshold):
-    r = []
-    for sample_id in missing_counts:
-        if missing_counts[sample_id] > threshold:
-            r.append(sample_id)
-    return sorted(r)
-
-def filter_samples(labels,profiles,labels_to_remove):
-    l = []
-    p = []
-    for idx,label in enumerate(labels):
-        if label in labels_to_remove:
-            continue
-        l.append(label)
-        p.append(profiles[idx])
-    return l, p
-
-
-#write_dist_results('/Users/jrobertson/Desktop/ListeraClusterComp/BioNumerics.alleles.profiles5.parquet','/Users/jrobertson/Desktop/ListeraClusterComp/BioNumerics.alleles.profiles5.csv','pairwise','text',batch_size=500,threshold=10)
\ No newline at end of file
diff --git a/build/lib/profile_dists/version.py b/build/lib/profile_dists/version.py
deleted file mode 100644
index 75977e6..0000000
--- a/build/lib/profile_dists/version.py
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = '1.0.0'
\ No newline at end of file
diff --git a/profile_dists.egg-info/PKG-INFO b/profile_dists.egg-info/PKG-INFO
deleted file mode 100644
index 6dab757..0000000
--- a/profile_dists.egg-info/PKG-INFO
+++ /dev/null
@@ -1,22 +0,0 @@
-Metadata-Version: 2.1
-Name: profile-dists
-Version: 1.0.0
-Summary: Profile Dists: Rapid calcualtion of allele profile distances and distance base querying
-Home-page: https://github.com/phac-nml/profile_dists
-Author: James Robertson
-Author-email: james.robertson@phac-aspc.gc.ca
-License: GPLv3
-Keywords: cgMLST,wgMLST,outbreak,surveillance,clustering,distance matrix
-Classifier: Development Status :: 4 - Beta
-Classifier: Environment :: Console
-Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
-Classifier: Intended Audience :: Science/Research
-Classifier: Topic :: Scientific/Engineering
-Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
-Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: Implementation :: CPython
-Classifier: Operating System :: POSIX :: Linux
-Requires-Python: >=3.8.2,<4
-License-File: LICENSE
diff --git a/profile_dists.egg-info/SOURCES.txt b/profile_dists.egg-info/SOURCES.txt
deleted file mode 100644
index 409c13e..0000000
--- a/profile_dists.egg-info/SOURCES.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-LICENSE
-MANIFEST.in
-README.md
-setup.py
-profile_dists/__init__.py
-profile_dists/constants.py
-profile_dists/main.py
-profile_dists/utils.py
-profile_dists/version.py
-profile_dists.egg-info/PKG-INFO
-profile_dists.egg-info/SOURCES.txt
-profile_dists.egg-info/dependency_links.txt
-profile_dists.egg-info/entry_points.txt
-profile_dists.egg-info/requires.txt
-profile_dists.egg-info/top_level.txt
-profile_dists/test_data/__init__.py
-tests/test_utils.py
\ No newline at end of file
diff --git a/profile_dists.egg-info/dependency_links.txt b/profile_dists.egg-info/dependency_links.txt
deleted file mode 100644
index 8b13789..0000000
--- a/profile_dists.egg-info/dependency_links.txt
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/profile_dists.egg-info/entry_points.txt b/profile_dists.egg-info/entry_points.txt
deleted file mode 100644
index 47d9b09..0000000
--- a/profile_dists.egg-info/entry_points.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-[console_scripts]
-profile_dists = profile_dists.main:main
diff --git a/profile_dists.egg-info/requires.txt b/profile_dists.egg-info/requires.txt
deleted file mode 100644
index 3598fa7..0000000
--- a/profile_dists.egg-info/requires.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-pyarrow==12.0.0
-fastparquet==2023.4.0
-numba==0.57.1
-numpy==1.24.4
-tables==3.8.0
-six>=1.16.0
-pandas==2.0.2
-psutil
diff --git a/profile_dists.egg-info/top_level.txt b/profile_dists.egg-info/top_level.txt
deleted file mode 100644
index ddfd696..0000000
--- a/profile_dists.egg-info/top_level.txt
+++ /dev/null
@@ -1 +0,0 @@
-profile_dists