diff --git a/.gitignore b/.gitignore index 6c43e6c3..aedcbf58 100644 --- a/.gitignore +++ b/.gitignore @@ -84,3 +84,6 @@ docs/_build/ # sublime text *.idea* *sublime* + +# vscode +.vscode/ diff --git a/CHANGELOG b/CHANGELOG index dd72373d..dc1bb446 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,5 @@ +30.01.23 v2.0.1; update a bug introduced in "Fix globbing bug in splitp #221" that broke some flows +11.10.22 v2.0.0; port to Python 3 and update to newer versions of pandas, numpy, and more 02.01.20 Fix KeyError in allele_merge 30.07.19 v1.0.1; update changelog and increment version for edits on this date (see #164) 30.07.19 Fix to display of liability scale gencov, h2 in rg results log (see #162) diff --git a/README.md b/README.md index 57956127..f3253046 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -# LDSC (LD SCore) `v1.0.1` +# LDSC (LD SCore) `v2.0.1` `ldsc` is a command line tool for estimating heritability and genetic correlation from GWAS summary statistics. `ldsc` also computes LD Scores. diff --git a/environment.yml b/environment.yml index 60e24574..1108c580 100644 --- a/environment.yml +++ b/environment.yml @@ -2,12 +2,12 @@ name: ldsc channels: - bioconda dependencies: -- python=2.7 -- bitarray=0.8 -- nose=1.3 -- pybedtools=0.7 +- python=3.10 - pip - pip: - - scipy==0.18 - - pandas==0.20 - - numpy==1.16 + - bitarray==2.6.0 + - nose==1.3.7 + - pybedtools==0.9.0 + - scipy==1.9.2 + - pandas==1.5.0 + - numpy==1.23.3 diff --git a/ldsc.py b/ldsc.py index aa81340a..b734531e 100755 --- a/ldsc.py +++ b/ldsc.py @@ -8,7 +8,7 @@ 3. genetic covariance / correlation ''' -from __future__ import division + import ldscore.ldscore as ld import ldscore.parse as ps import ldscore.sumstats as sumstats @@ -18,6 +18,7 @@ from subprocess import call from itertools import product import time, sys, traceback, argparse +from functools import reduce try: @@ -26,7 +27,7 @@ except AttributeError: raise ImportError('LDSC requires pandas version >= 0.17.0') -__version__ = '1.0.1' +__version__ = '2.0.0' MASTHEAD = "*********************************************************************\n" MASTHEAD += "* LD Score Regression (LDSC)\n" MASTHEAD += "* Version {V}\n".format(V=__version__) @@ -37,7 +38,7 @@ pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) pd.set_option('display.width', 1000) -pd.set_option('precision', 4) +pd.set_option('display.precision', 4) pd.set_option('max_colwidth',1000) np.set_printoptions(linewidth=1000) np.set_printoptions(precision=4) @@ -73,15 +74,15 @@ class Logger(object): ''' def __init__(self, fh): - self.log_fh = open(fh, 'wb') + self.log_fh = open(fh, 'w') def log(self, msg): ''' Print to log file and stdout with a single command. ''' - print >>self.log_fh, msg - print msg + print(msg, file=self.log_fh) + print(msg) def __filter__(fname, noun, verb, merge_obj): @@ -90,12 +91,12 @@ def __filter__(fname, noun, verb, merge_obj): f = lambda x,n: x.format(noun=noun, verb=verb, fname=fname, num=n) x = ps.FilterFile(fname) c = 'Read list of {num} {noun} to {verb} from {fname}' - print f(c, len(x.IDList)) + print(f(c, len(x.IDList))) merged_list = merge_obj.loj(x.IDList) len_merged_list = len(merged_list) if len_merged_list > 0: c = 'After merging, {num} {noun} remain' - print f(c, len_merged_list) + print(f(c, len_merged_list)) else: error_msg = 'No {noun} retained for analysis' raise ValueError(f(error_msg, 0)) @@ -106,7 +107,7 @@ def annot_sort_key(s): '''For use with --cts-bin. Fixes weird pandas crosstab column order.''' if type(s) == tuple: s = [x.split('_')[0] for x in s] - s = map(lambda x: float(x) if x != 'min' else -float('inf'), s) + s = [float(x) if x != 'min' else -float('inf') for x in s] else: # type(s) = str: s = s.split('_')[0] if s == 'min': @@ -184,7 +185,7 @@ def ldscore(args, log): raise ValueError(msg) else: - cts_colnames = ['ANNOT'+str(i) for i in xrange(len(cts_fnames))] + cts_colnames = ['ANNOT'+str(i) for i in range(len(cts_fnames))] log.log('Reading numbers with which to bin SNPs from {F}'.format(F=args.cts_bin)) @@ -215,7 +216,7 @@ def ldscore(args, log): name_breaks[0] = 'min' name_breaks[-1] = 'max' name_breaks = [str(x) for x in name_breaks] - labs = [name_breaks[i]+'_'+name_breaks[i+1] for i in xrange(n_breaks-1)] + labs = [name_breaks[i]+'_'+name_breaks[i+1] for i in range(n_breaks-1)] cut_vec = pd.Series(pd.cut(vec, bins=cut_breaks, labels=labs)) cts_levs.append(cut_vec) full_labs.append(labs) @@ -283,7 +284,7 @@ def ldscore(args, log): if args.ld_wind_snps: max_dist = args.ld_wind_snps - coords = np.array(xrange(geno_array.m)) + coords = np.array(range(geno_array.m)) elif args.ld_wind_kb: max_dist = args.ld_wind_kb*1000 coords = np.array(array_snps.df['BP'])[geno_array.kept_snps] @@ -339,7 +340,7 @@ def ldscore(args, log): F=args.print_snps, N=len(print_snps))) print_snps.columns=['SNP'] - df = df.ix[df.SNP.isin(print_snps.SNP),:] + df = df.iloc[df.SNP.isin(print_snps.SNP),:] if len(df) == 0: raise ValueError('After merging with --print-snps, no SNPs remain.') else: @@ -361,12 +362,12 @@ def ldscore(args, log): # print .M fout_M = open(args.out + '.'+ file_suffix +'.M','wb') - print >>fout_M, '\t'.join(map(str,M)) + print('\t'.join(map(str,M)), file=fout_M) fout_M.close() # print .M_5_50 fout_M_5_50 = open(args.out + '.'+ file_suffix +'.M_5_50','wb') - print >>fout_M_5_50, '\t'.join(map(str,M_5_50)) + print('\t'.join(map(str,M_5_50)), file=fout_M_5_50) fout_M_5_50.close() # print annot matrix @@ -383,19 +384,19 @@ def ldscore(args, log): # print LD Score summary pd.set_option('display.max_rows', 200) log.log('\nSummary of LD Scores in {F}'.format(F=out_fname+l2_suffix)) - t = df.ix[:,4:].describe() - log.log( t.ix[1:,:] ) + t = df.iloc[:,4:].describe() + log.log( t.iloc[1:,:] ) np.seterr(divide='ignore', invalid='ignore') # print NaN instead of weird errors # print correlation matrix including all LD Scores and sample MAF log.log('') log.log('MAF/LD Score Correlation Matrix') - log.log( df.ix[:,4:].corr() ) + log.log( df.iloc[:,4:].corr() ) # print condition number if n_annot > 1: # condition number of a column vector w/ nonzero var is trivially one log.log('\nLD Score Matrix Condition Number') - cond_num = np.linalg.cond(df.ix[:,5:]) + cond_num = np.linalg.cond(df.iloc[:,5:]) log.log( reg.remove_brackets(str(np.matrix(cond_num))) ) if cond_num > 10000: log.log('WARNING: ill-conditioned LD Score Matrix!') @@ -588,7 +589,7 @@ def ldscore(args, log): try: defaults = vars(parser.parse_args('')) opts = vars(args) - non_defaults = [x for x in opts.keys() if opts[x] != defaults[x]] + non_defaults = [x for x in list(opts.keys()) if opts[x] != defaults[x]] header = MASTHEAD header += "Call: \n" header += './ldsc.py \\\n' @@ -647,9 +648,9 @@ def ldscore(args, log): # bad flags else: - print header - print 'Error: no analysis selected.' - print 'ldsc.py -h describes options.' + print(header) + print('Error: no analysis selected.') + print('ldsc.py -h describes options.') except Exception: ex_type, ex, tb = sys.exc_info() log.log( traceback.format_exc(ex) ) diff --git a/ldscore/irwls.py b/ldscore/irwls.py index d635a2c7..e648b179 100644 --- a/ldscore/irwls.py +++ b/ldscore/irwls.py @@ -4,9 +4,9 @@ Iterativey re-weighted least squares. ''' -from __future__ import division + import numpy as np -import jackknife as jk +from . import jackknife as jk class IRWLS(object): @@ -109,10 +109,10 @@ def irwls(cls, x, y, update_func, n_blocks, w, slow=False, separators=None): 'w has shape {S}. w must have shape ({N}, 1).'.format(S=w.shape, N=n)) w = np.sqrt(w) - for i in xrange(2): # update this later + for i in range(2): # update this later new_w = np.sqrt(update_func(cls.wls(x, y, w))) if new_w.shape != w.shape: - print 'IRWLS update:', new_w.shape, w.shape + print('IRWLS update:', new_w.shape, w.shape) raise ValueError('New weights must have same shape.') else: w = new_w @@ -158,7 +158,7 @@ def wls(cls, x, y, w): x = cls._weight(x, w) y = cls._weight(y, w) - coef = np.linalg.lstsq(x, y) + coef = np.linalg.lstsq(x, y, rcond=None) return coef @classmethod diff --git a/ldscore/jackknife.py b/ldscore/jackknife.py index d9d0cba5..06af9620 100644 --- a/ldscore/jackknife.py +++ b/ldscore/jackknife.py @@ -12,7 +12,7 @@ ''' -from __future__ import division + import numpy as np from scipy.optimize import nnls np.seterr(divide='raise', invalid='raise') @@ -218,7 +218,7 @@ def __init__(self, x, y, n_blocks=None, nn=False, separators=None): func = lambda x, y: np.atleast_2d(nnls(x, np.array(y).T[0])[0]) else: func = lambda x, y: np.atleast_2d( - np.linalg.lstsq(x, np.array(y).T[0])[0]) + np.linalg.lstsq(x, np.array(y).T[0], rcond=None)[0]) self.est = func(x, y) self.delete_values = self.delete_values(x, y, func, self.separators) @@ -256,7 +256,7 @@ def delete_values(cls, x, y, func, s): ''' _check_shape(x, y) d = [func(np.vstack([x[0:s[i], ...], x[s[i + 1]:, ...]]), np.vstack([y[0:s[i], ...], y[s[i + 1]:, ...]])) - for i in xrange(len(s) - 1)] + for i in range(len(s) - 1)] return np.concatenate(d, axis=0) @@ -346,7 +346,7 @@ def block_values(cls, x, y, s): n_blocks = len(s) - 1 xtx_block_values = np.zeros((n_blocks, p, p)) xty_block_values = np.zeros((n_blocks, p)) - for i in xrange(n_blocks): + for i in range(n_blocks): xty_block_values[i, ...] = np.dot( x[s[i]:s[i + 1], ...].T, y[s[i]:s[i + 1], ...]).reshape((1, p)) xtx_block_values[i, ...] = np.dot( @@ -417,7 +417,7 @@ def block_values_to_delete_values(cls, xty_block_values, xtx_block_values): delete_values = np.zeros((n_blocks, p)) xty_tot = np.sum(xty_block_values, axis=0) xtx_tot = np.sum(xtx_block_values, axis=0) - for j in xrange(n_blocks): + for j in range(n_blocks): delete_xty = xty_tot - xty_block_values[j] delete_xtx = xtx_tot - xtx_block_values[j] delete_values[j, ...] = np.linalg.solve( @@ -507,7 +507,7 @@ def delete_values_to_pseudovalues(cls, est, denom, numer): ''' n_blocks, p = denom.shape pseudovalues = np.zeros((n_blocks, p)) - for j in xrange(0, n_blocks): + for j in range(0, n_blocks): pseudovalues[j, ...] = n_blocks * est - \ (n_blocks - 1) * numer[j, ...] / denom[j, ...] diff --git a/ldscore/ldscore.py b/ldscore/ldscore.py index f06ca6c9..fa55677c 100644 --- a/ldscore/ldscore.py +++ b/ldscore/ldscore.py @@ -1,4 +1,4 @@ -from __future__ import division + import numpy as np import bitarray as ba @@ -24,7 +24,7 @@ def getBlockLefts(coords, max_dist): M = len(coords) j = 0 block_left = np.zeros(M) - for i in xrange(M): + for i in range(M): while j < M and abs(coords[j] - coords[i]) > max_dist: j += 1 @@ -51,7 +51,7 @@ def block_left_to_right(block_left): M = len(block_left) j = 0 block_right = np.zeros(M) - for i in xrange(M): + for i in range(M): while j < M and block_left[j] <= i: j += 1 @@ -85,7 +85,7 @@ def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin= self.n) if self.n > 0: - print 'After filtering, {n} individuals remain'.format(n=self.n) + print('After filtering, {n} individuals remain'.format(n=self.n)) else: raise ValueError('After filtering, no individuals remain') @@ -99,7 +99,7 @@ def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin= self.geno, self.m, self.n, self.mafMin, keep_snps) if self.m > 0: - print 'After filtering, {m} SNPs remain'.format(m=self.m) + print('After filtering, {m} SNPs remain'.format(m=self.m)) else: raise ValueError('After filtering, no SNPs remain') @@ -190,7 +190,7 @@ def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None): rfuncAB = np.zeros((b, c)) rfuncBB = np.zeros((c, c)) # chunk inside of block - for l_B in xrange(0, b, c): # l_B := index of leftmost SNP in matrix B + for l_B in range(0, b, c): # l_B := index of leftmost SNP in matrix B B = A[:, l_B:l_B+c] np.dot(A.T, B / n, out=rfuncAB) rfuncAB = func(rfuncAB) @@ -199,7 +199,7 @@ def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None): b0 = b md = int(c*np.floor(m/c)) end = md + 1 if md != m else md - for l_B in xrange(b0, end, c): + for l_B in range(b0, end, c): # check if the annot matrix is all zeros for this block + chunk # this happens w/ sparse categories (i.e., pathways) # update the block @@ -294,7 +294,7 @@ def __filter_indivs__(self, geno, keep_indivs, m, n): nru_new = n_new + e nru = self.nru z = ba.bitarray(m*2*nru_new, endian="little") - z.setall(0) + z.setall(0) for e, i in enumerate(keep_indivs): z[2*e::2*nru_new] = geno[2*i::2*nru] z[2*e+1::2*nru_new] = geno[2*i+1::2*nru] @@ -335,7 +335,7 @@ def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps): m_poly = 0 y = ba.bitarray() if keep_snps is None: - keep_snps = xrange(m) + keep_snps = range(m) kept_snps = [] freq = [] for e, j in enumerate(keep_snps): @@ -397,7 +397,7 @@ def nextSNPs(self, b, minorRef=None): X = np.array(slice.decode(self._bedcode), dtype="float64").reshape((b, nru)).T X = X[0:n, :] Y = np.zeros(X.shape) - for j in xrange(0, b): + for j in range(0, b): newsnp = X[:, j] ii = newsnp != 9 avg = np.mean(newsnp[ii]) diff --git a/ldscore/parse.py b/ldscore/parse.py index 18fe7c98..aa6f2f25 100644 --- a/ldscore/parse.py +++ b/ldscore/parse.py @@ -5,7 +5,7 @@ ''' -from __future__ import division + import numpy as np import pandas as pd import os @@ -32,7 +32,7 @@ def sub_chr(s, chrom): def get_present_chrs(fh, num): '''Checks which chromosomes exist, assuming that the file base will be appended by a dot in any suffix.''' chrs = [] - for chrom in xrange(1,num): + for chrom in range(1,num): if glob.glob(sub_chr(fh, chrom) + '.*'): chrs.append(chrom) return chrs @@ -187,8 +187,8 @@ def annot(fh_list, num=None, frqfile=None): annot_suffix = ['.annot' for fh in fh_list] annot_compression = [] if num is not None: # 22 files, one for each chromosome - chrs = get_present_chrs(fh, num+1) for i, fh in enumerate(fh_list): + chrs = get_present_chrs(fh, num+1) first_fh = sub_chr(fh, chrs[0]) + annot_suffix[i] annot_s, annot_comp_single = which_compression(first_fh) annot_suffix[i] += annot_s diff --git a/ldscore/regressions.py b/ldscore/regressions.py index ec8fc49a..ae4930dc 100644 --- a/ldscore/regressions.py +++ b/ldscore/regressions.py @@ -7,12 +7,12 @@ Last column = intercept. ''' -from __future__ import division + import numpy as np import pandas as pd from scipy.stats import norm, chi2 -import jackknife as jk -from irwls import IRWLS +from . import jackknife as jk +from .irwls import IRWLS from scipy.stats import t as tdist from collections import namedtuple np.seterr(divide='raise', invalid='raise') @@ -182,8 +182,7 @@ def __init__(self, y, x, w, N, M, n_blocks, intercept=None, slow=False, step1_ii n1 = np.sum(step1_ii) self.twostep_filtered = n_snp - n1 x1 = x[np.squeeze(step1_ii), :] - yp1, w1, N1, initial_w1 = map( - lambda a: a[step1_ii].reshape((n1, 1)), (yp, w, N, initial_w)) + yp1, w1, N1, initial_w1 = [a[step1_ii].reshape((n1, 1)) for a in (yp, w, N, initial_w)] update_func1 = lambda a: self._update_func( a, x1, w1, N1, M_tot, Nbar, ii=step1_ii) step1_jknife = IRWLS( @@ -455,7 +454,7 @@ def summary(self, ref_ld_colnames=None, P=None, K=None, overlap=False): if self.n_annot > 1: if ref_ld_colnames is None: ref_ld_colnames = ['CAT_' + str(i) - for i in xrange(self.n_annot)] + for i in range(self.n_annot)] out.append('Categories: ' + ' '.join(ref_ld_colnames)) diff --git a/ldscore/sumstats.py b/ldscore/sumstats.py index 1c57491f..08de6573 100644 --- a/ldscore/sumstats.py +++ b/ldscore/sumstats.py @@ -5,13 +5,13 @@ into memory and checking that the input makes sense. There is no math here. LD Score regression is implemented in the regressions module. ''' -from __future__ import division + import numpy as np import pandas as pd from scipy import stats import itertools as it -import parse as ps -import regressions as reg +from . import parse as ps +from . import regressions as reg import sys import traceback import copy @@ -23,16 +23,16 @@ # complementary bases COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} # bases -BASES = COMPLEMENT.keys() +BASES = list(COMPLEMENT.keys()) # true iff strand ambiguous STRAND_AMBIGUOUS = {''.join(x): x[0] == COMPLEMENT[x[1]] for x in it.product(BASES, BASES) if x[0] != x[1]} # SNPS we want to keep (pairs of alleles) -VALID_SNPS = {x for x in map(lambda y: ''.join(y), it.product(BASES, BASES)) +VALID_SNPS = {x for x in [''.join(y) for y in it.product(BASES, BASES)] if x[0] != x[1] and not STRAND_AMBIGUOUS[x]} # T iff SNP 1 has the same alleles as SNP 2 (allowing for strand or ref allele flip). -MATCH_ALLELES = {x for x in map(lambda y: ''.join(y), it.product(VALID_SNPS, VALID_SNPS)) +MATCH_ALLELES = {x for x in [''.join(y) for y in it.product(VALID_SNPS, VALID_SNPS)] # strand and ref match if ((x[0] == x[2]) and (x[1] == x[3])) or # ref match, strand flip @@ -69,7 +69,7 @@ def smart_merge(x, y): '''Check if SNP columns are equal. If so, save time by using concat instead of merge.''' if len(x) == len(y) and (x.index == y.index).all() and (x.SNP == y.SNP).all(): x = x.reset_index(drop=True) - y = y.reset_index(drop=True).drop('SNP', 1) + y = y.reset_index(drop=True).drop('SNP', axis=1) out = pd.concat([x, y], axis=1) else: out = pd.merge(x, y, how='inner', on='SNP') @@ -189,14 +189,14 @@ def _check_ld_condnum(args, log, ref_ld): def _check_variance(log, M_annot, ref_ld): '''Remove zero-variance LD Scores.''' - ii = ref_ld.ix[:, 1:].var() == 0 # NB there is a SNP column here + ii = ref_ld.iloc[:, 1:].var() == 0 # NB there is a SNP column here if ii.all(): raise ValueError('All LD Scores have zero variance.') else: log.log('Removing partitioned LD Scores with zero variance.') ii_snp = np.array([True] + list(~ii)) ii_m = np.array(~ii) - ref_ld = ref_ld.ix[:, ii_snp] + ref_ld = ref_ld.iloc[:, ii_snp] M_annot = M_annot[:, ii_m] return M_annot, ref_ld, ii @@ -272,7 +272,7 @@ def cell_type_specific(args, log): chisq_max = args.chisq_max ii = np.ravel(sumstats.Z**2 < chisq_max) - sumstats = sumstats.ix[ii, :] + sumstats = sumstats.iloc[ii, :] log.log('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format( C=chisq_max, N=np.sum(ii), M=n_snp-np.sum(ii))) n_snp = np.sum(ii) # lambdas are late-binding, so this works @@ -287,7 +287,7 @@ def cell_type_specific(args, log): ref_ld_cts_allsnps = _read_chr_split_files(ct_ld_chr, None, log, 'cts reference panel LD Score', ps.ldscore_fromlist) log.log('Performing regression.') - ref_ld_cts = np.array(pd.merge(keep_snps, ref_ld_cts_allsnps, on='SNP', how='left').ix[:,1:]) + ref_ld_cts = np.array(pd.merge(keep_snps, ref_ld_cts_allsnps, on='SNP', how='left').iloc[:,1:]) if np.any(np.isnan(ref_ld_cts)): raise ValueError ('Missing some LD scores from cts files. Are you sure all SNPs in ref-ld-chr are also in ref-ld-chr-cts') @@ -316,8 +316,8 @@ def estimate_h2(args, log): '''Estimate h2 and partitioned h2.''' args = copy.deepcopy(args) if args.samp_prev is not None and args.pop_prev is not None: - args.samp_prev, args.pop_prev = map( - float, [args.samp_prev, args.pop_prev]) + args.samp_prev, args.pop_prev = list(map( + float, [args.samp_prev, args.pop_prev])) if args.intercept_h2 is not None: args.intercept_h2 = float(args.intercept_h2) if args.no_intercept: @@ -344,7 +344,7 @@ def estimate_h2(args, log): chisq = s(sumstats.Z**2) if chisq_max is not None: ii = np.ravel(chisq < chisq_max) - sumstats = sumstats.ix[ii, :] + sumstats = sumstats.iloc[ii, :] log.log('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format( C=chisq_max, N=np.sum(ii), M=n_snp-np.sum(ii))) n_snp = np.sum(ii) # lambdas are late-binding, so this works @@ -382,15 +382,15 @@ def estimate_rg(args, log): rg_paths, rg_files = _parse_rg(args.rg) n_pheno = len(rg_paths) f = lambda x: _split_or_none(x, n_pheno) - args.intercept_h2, args.intercept_gencov, args.samp_prev, args.pop_prev = map(f, - (args.intercept_h2, args.intercept_gencov, args.samp_prev, args.pop_prev)) - map(lambda x: _check_arg_len(x, n_pheno), ((args.intercept_h2, '--intercept-h2'), + args.intercept_h2, args.intercept_gencov, args.samp_prev, args.pop_prev = list(map(f, + (args.intercept_h2, args.intercept_gencov, args.samp_prev, args.pop_prev))) + list(map(lambda x: _check_arg_len(x, n_pheno), ((args.intercept_h2, '--intercept-h2'), (args.intercept_gencov, '--intercept-gencov'), (args.samp_prev, '--samp-prev'), - (args.pop_prev, '--pop-prev'))) + (args.pop_prev, '--pop-prev')))) if args.no_intercept: - args.intercept_h2 = [1 for _ in xrange(n_pheno)] - args.intercept_gencov = [0 for _ in xrange(n_pheno)] + args.intercept_h2 = [1 for _ in range(n_pheno)] + args.intercept_gencov = [0 for _ in range(n_pheno)] p1 = rg_paths[0] out_prefix = args.out + rg_files[0] M_annot, w_ld_cname, ref_ld_cnames, sumstats, _ = _read_ld_sumstats(args, log, p1, @@ -449,28 +449,28 @@ def _get_rg_table(rg_paths, RG, args): '''Print a table of genetic correlations.''' t = lambda attr: lambda obj: getattr(obj, attr, 'NA') x = pd.DataFrame() - x['p1'] = [rg_paths[0] for i in xrange(1, len(rg_paths))] + x['p1'] = [rg_paths[0] for i in range(1, len(rg_paths))] x['p2'] = rg_paths[1:len(rg_paths)] - x['rg'] = map(t('rg_ratio'), RG) - x['se'] = map(t('rg_se'), RG) - x['z'] = map(t('z'), RG) - x['p'] = map(t('p'), RG) + x['rg'] = list(map(t('rg_ratio'), RG)) + x['se'] = list(map(t('rg_se'), RG)) + x['z'] = list(map(t('z'), RG)) + x['p'] = list(map(t('p'), RG)) if args.samp_prev is not None and \ args.pop_prev is not None and \ all((i is not None for i in args.samp_prev)) and \ all((i is not None for it in args.pop_prev)): - c = map(lambda x, y: reg.h2_obs_to_liab(1, x, y), args.samp_prev[1:], args.pop_prev[1:]) - x['h2_liab'] = map(lambda x, y: x * y, c, map(t('tot'), map(t('hsq2'), RG))) - x['h2_liab_se'] = map(lambda x, y: x * y, c, map(t('tot_se'), map(t('hsq2'), RG))) + c = list(map(lambda x, y: reg.h2_obs_to_liab(1, x, y), args.samp_prev[1:], args.pop_prev[1:])) + x['h2_liab'] = list(map(lambda x, y: x * y, c, list(map(t('tot'), list(map(t('hsq2'), RG)))))) + x['h2_liab_se'] = list(map(lambda x, y: x * y, c, list(map(t('tot_se'), list(map(t('hsq2'), RG)))))) else: - x['h2_obs'] = map(t('tot'), map(t('hsq2'), RG)) - x['h2_obs_se'] = map(t('tot_se'), map(t('hsq2'), RG)) + x['h2_obs'] = list(map(t('tot'), list(map(t('hsq2'), RG)))) + x['h2_obs_se'] = list(map(t('tot_se'), list(map(t('hsq2'), RG)))) - x['h2_int'] = map(t('intercept'), map(t('hsq2'), RG)) - x['h2_int_se'] = map(t('intercept_se'), map(t('hsq2'), RG)) - x['gcov_int'] = map(t('intercept'), map(t('gencov'), RG)) - x['gcov_int_se'] = map(t('intercept_se'), map(t('gencov'), RG)) + x['h2_int'] = list(map(t('intercept'), list(map(t('hsq2'), RG)))) + x['h2_int_se'] = list(map(t('intercept_se'), list(map(t('hsq2'), RG)))) + x['gcov_int'] = list(map(t('intercept'), list(map(t('gencov'), RG)))) + x['gcov_int_se'] = list(map(t('intercept_se'), list(map(t('gencov'), RG)))) return x.to_string(header=True, index=False) + '\n' @@ -529,7 +529,7 @@ def _rg(sumstats, args, log, M_annot, ref_ld_cnames, w_ld_cname, i): n_snp = np.sum(ii) # lambdas are late binding, so this works sumstats = sumstats[ii] n_blocks = min(args.n_blocks, n_snp) - ref_ld = sumstats.as_matrix(columns=ref_ld_cnames) + ref_ld = sumstats[ref_ld_cnames].to_numpy() intercepts = [args.intercept_h2[0], args.intercept_h2[ i + 1], args.intercept_gencov[i + 1]] rghat = reg.RG(s(sumstats.Z1), s(sumstats.Z2), @@ -568,9 +568,9 @@ def _print_rg_cov(rghat, fh, log): def _split_or_none(x, n): if x is not None: - y = map(float, x.replace('N', '-').split(',')) + y = list(map(float, x.replace('N', '-').split(','))) else: - y = [None for _ in xrange(n)] + y = [None for _ in range(n)] return y diff --git a/make_annot.py b/make_annot.py index cf337b41..3d4bf551 100755 --- a/make_annot.py +++ b/make_annot.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -from __future__ import print_function + import pandas as pd import numpy as np import argparse diff --git a/munge_sumstats.py b/munge_sumstats.py index b84e98c9..376547c7 100755 --- a/munge_sumstats.py +++ b/munge_sumstats.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -from __future__ import division + import pandas as pd import numpy as np import os @@ -144,7 +144,7 @@ def get_cname_map(flag, default, ignore): clean_ignore = [clean_header(x) for x in ignore] cname_map = {x: flag[x] for x in flag if x not in clean_ignore} cname_map.update( - {x: default[x] for x in default if x not in clean_ignore + flag.keys()}) + {x: default[x] for x in default if x not in clean_ignore + list(flag.keys())}) return cname_map @@ -239,16 +239,15 @@ def parse_dat(dat_gen, convert_colname, merge_alleles, log, args): sys.stdout.write('.') tot_snps += len(dat) old = len(dat) - dat = dat.dropna(axis=0, how="any", subset=filter( - lambda x: x != 'INFO', dat.columns)).reset_index(drop=True) + dat = dat.dropna(axis=0, how="any", subset=[x for x in dat.columns if x != 'INFO']).reset_index(drop=True) drops['NA'] += old - len(dat) - dat.columns = map(lambda x: convert_colname[x], dat.columns) + dat.columns = [convert_colname[x] for x in dat.columns] wrong_types = [c for c in dat.columns if c in numeric_cols and not np.issubdtype(dat[c].dtype, np.number)] if len(wrong_types) > 0: raise ValueError('Columns {} are expected to be numeric'.format(wrong_types)) - ii = np.array([True for i in xrange(len(dat))]) + ii = np.array([True for i in range(len(dat))]) if args.merge_alleles: old = ii.sum() ii = dat.SNP.isin(merge_alleles.SNP) @@ -257,7 +256,7 @@ def parse_dat(dat_gen, convert_colname, merge_alleles, log, args): continue dat = dat[ii].reset_index(drop=True) - ii = np.array([True for i in xrange(len(dat))]) + ii = np.array([True for i in range(len(dat))]) if 'INFO' in dat.columns: old = ii.sum() @@ -471,7 +470,7 @@ def allele_merge(dat, alleles, log): help="Use this flag to parse Stephan Ripke's daner* file format.") parser.add_argument('--daner-n', default=False, action='store_true', help="Use this flag to parse more recent daner* formatted files, which " - "include sample size column 'Nca' and 'Nco'.") + "include sample size column 'Nca' and 'Nco'.") parser.add_argument('--no-alleles', default=False, action="store_true", help="Don't require alleles. Useful if only unsigned summary statistics are available " "and the goal is h2 / partitioned h2 estimation rather than rg estimation.") @@ -533,13 +532,13 @@ def munge_sumstats(args, p=True): raise ValueError( '--no-alleles and --merge-alleles are not compatible.') if args.daner and args.daner_n: - raise ValueError('--daner and --daner-n are not compatible. Use --daner for sample ' + - 'size from FRQ_A/FRQ_U headers, use --daner-n for values from Nca/Nco columns') + raise ValueError('--daner and --daner-n are not compatible. Use --daner for sample ' + + 'size from FRQ_A/FRQ_U headers, use --daner-n for values from Nca/Nco columns') if p: defaults = vars(parser.parse_args('')) opts = vars(args) - non_defaults = [x for x in opts.keys() if opts[x] != defaults[x]] + non_defaults = [x for x in list(opts.keys()) if opts[x] != defaults[x]] header = MASTHEAD header += "Call: \n" header += './munge_sumstats.py \\\n' @@ -565,8 +564,8 @@ def munge_sumstats(args, p=True): cname_map = get_cname_map( flag_cnames, mod_default_cnames, ignore_cnames) if args.daner: - frq_u = filter(lambda x: x.startswith('FRQ_U_'), file_cnames)[0] - frq_a = filter(lambda x: x.startswith('FRQ_A_'), file_cnames)[0] + frq_u = [x for x in file_cnames if x.startswith('FRQ_U_')][0] + frq_a = [x for x in file_cnames if x.startswith('FRQ_A_')][0] N_cas = float(frq_a[6:]) N_con = float(frq_u[6:]) log.log( @@ -580,21 +579,21 @@ def munge_sumstats(args, p=True): cname_map[frq_u] = 'FRQ' - if args.daner_n: - frq_u = filter(lambda x: x.startswith('FRQ_U_'), file_cnames)[0] - cname_map[frq_u] = 'FRQ' - try: - dan_cas = clean_header(file_cnames[file_cnames.index('Nca')]) - except ValueError: - raise ValueError('Could not find Nca column expected for daner-n format') - - try: - dan_con = clean_header(file_cnames[file_cnames.index('Nco')]) - except ValueError: - raise ValueError('Could not find Nco column expected for daner-n format') + if args.daner_n: + frq_u = [x for x in file_cnames if x.startswith('FRQ_U_')][0] + cname_map[frq_u] = 'FRQ' + try: + dan_cas = clean_header(file_cnames[file_cnames.index('Nca')]) + except ValueError: + raise ValueError('Could not find Nca column expected for daner-n format') + + try: + dan_con = clean_header(file_cnames[file_cnames.index('Nco')]) + except ValueError: + raise ValueError('Could not find Nco column expected for daner-n format') cname_map[dan_cas] = 'N_CAS' - cname_map[dan_con] = 'N_CON' + cname_map[dan_con] = 'N_CON' cname_translation = {x: cname_map[clean_header(x)] for x in file_cnames if clean_header(x) in cname_map} # note keys not cleaned @@ -623,31 +622,31 @@ def munge_sumstats(args, p=True): req_cols = ['SNP', 'P'] for c in req_cols: - if c not in cname_translation.values(): + if c not in list(cname_translation.values()): raise ValueError('Could not find {C} column.'.format(C=c)) # check aren't any duplicated column names in mapping - for field in cname_translation: - numk = file_cnames.count(field) - if numk > 1: - raise ValueError('Found {num} columns named {C}'.format(C=field,num=str(numk))) + for field in cname_translation: + numk = file_cnames.count(field) + if numk > 1: + raise ValueError('Found {num} columns named {C}'.format(C=field,num=str(numk))) # check multiple different column names don't map to same data field - for head in cname_translation.values(): - numc = cname_translation.values().count(head) - if numc > 1: + for head in list(cname_translation.values()): + numc = list(cname_translation.values()).count(head) + if numc > 1: raise ValueError('Found {num} different {C} columns'.format(C=head,num=str(numc))) - if (not args.N) and (not (args.N_cas and args.N_con)) and ('N' not in cname_translation.values()) and\ - (any(x not in cname_translation.values() for x in ['N_CAS', 'N_CON'])): + if (not args.N) and (not (args.N_cas and args.N_con)) and ('N' not in list(cname_translation.values())) and\ + (any(x not in list(cname_translation.values()) for x in ['N_CAS', 'N_CON'])): raise ValueError('Could not determine N.') - if ('N' in cname_translation.values() or all(x in cname_translation.values() for x in ['N_CAS', 'N_CON']))\ - and 'NSTUDY' in cname_translation.values(): + if ('N' in list(cname_translation.values()) or all(x in list(cname_translation.values()) for x in ['N_CAS', 'N_CON']))\ + and 'NSTUDY' in list(cname_translation.values()): nstudy = [ x for x in cname_translation if cname_translation[x] == 'NSTUDY'] for x in nstudy: del cname_translation[x] - if not args.no_alleles and not all(x in cname_translation.values() for x in ['A1', 'A2']): + if not args.no_alleles and not all(x in list(cname_translation.values()) for x in ['A1', 'A2']): raise ValueError('Could not find A1/A2 columns.') log.log('Interpreting column names as follows:') @@ -677,9 +676,9 @@ def munge_sumstats(args, p=True): # figure out which columns are going to involve sign information, so we can ensure # they're read as floats - signed_sumstat_cols = [k for k,v in cname_translation.items() if v=='SIGNED_SUMSTAT'] + signed_sumstat_cols = [k for k,v in list(cname_translation.items()) if v=='SIGNED_SUMSTAT'] dat_gen = pd.read_csv(args.sumstats, delim_whitespace=True, header=0, - compression=compression, usecols=cname_translation.keys(), + compression=compression, usecols=list(cname_translation.keys()), na_values=['.', 'NA'], iterator=True, chunksize=args.chunksize, dtype={c:np.float64 for c in signed_sumstat_cols}) @@ -733,8 +732,7 @@ def munge_sumstats(args, p=True): except Exception: log.log('\nERROR converting summary statistics:\n') - ex_type, ex, tb = sys.exc_info() - log.log(traceback.format_exc(ex)) + log.log(traceback.format_exc()) raise finally: log.log('\nConversion finished at {T}'.format(T=time.ctime())) diff --git a/requirements.txt b/requirements.txt index cc076b03..fe0e1bdc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,10 @@ -bitarray>=0.8,<0.9 -nose>=1.3,<1.4 -pybedtools>=0.7,<0.8 -scipy>=0.18,<0.19 -numpy>=1.16,<1.17 -pandas>=0.20,<0.21 +bitarray==2.6.0 +nose==1.3.7 +numpy==1.23.3 +pandas==1.5.0 +pybedtools==0.9.0 +pysam==0.19.1 +python-dateutil==2.8.2 +pytz==2022.4 +scipy==1.9.2 +six==1.16.0 diff --git a/setup.py b/setup.py index e78ccfb6..443d0e89 100644 --- a/setup.py +++ b/setup.py @@ -1,20 +1,27 @@ from setuptools import setup +# read the contents of your README file +from pathlib import Path +this_directory = Path(__file__).parent +long_description = (this_directory / "README.md").read_text() + setup(name='ldsc', - version='1.0', + version='2.0.1', description='LD Score Regression (LDSC)', + long_description=long_description, + long_description_content_type='text/markdown', url='http://github.com/bulik/ldsc', author='Brendan Bulik-Sullivan and Hilary Finucane', author_email='', license='GPLv3', packages=['ldscore'], - scripts=['ldsc.py', 'munge_sumstats.py'], + scripts=['ldsc.py', 'munge_sumstats.py', 'make_annot.py'], install_requires = [ - 'bitarray>=0.8,<0.9', - 'nose>=1.3,<1.4', - 'pybedtools>=0.7,<0.8', - 'scipy>=0.18,<0.19', - 'numpy>=1.16,<1.17', - 'pandas>=0.20,<0.21' + 'bitarray>=2.6.0', + 'nose>=1.3.7', # this is only a test dep + 'pybedtools>=0.9.0', + 'scipy>=1.9.2', + 'numpy>=1.23.3', + 'pandas>=1.5.0' ] ) diff --git a/test/simulate.py b/test/simulate.py index b7a6cfed..706eb484 100644 --- a/test/simulate.py +++ b/test/simulate.py @@ -2,7 +2,7 @@ Generates .sumstats and .l2.ldscore/.l2.M files used for simulation testing. ''' -from __future__ import division + import numpy as np import pandas as pd @@ -17,17 +17,17 @@ def print_ld(x, fh, M): l2 = '.l2.ldscore' m = '.l2.M_5_50' x.to_csv(fh + l2, sep='\t', index=False, float_format='%.3f') - print >>open(fh + m, 'wb'), '\t'.join(map(str, M)) + print('\t'.join(map(str, M)), file=open(fh + m, 'wb')) # chr1 y = x.iloc[0:int(len(x) / 2), ] y.to_csv(fh + '1' + l2, sep='\t', index=False, float_format='%.3f') - print >>open(fh + '1' + m, 'wb'), '\t'.join((str(x / 2) for x in M)) + print('\t'.join((str(x / 2) for x in M)), file=open(fh + '1' + m, 'wb')) # chr2 y = x.iloc[int(len(x) / 2):len(x), ] y.to_csv(fh + '2' + l2, sep='\t', index=False, float_format='%.3f') - print >>open(fh + '2' + m, 'wb'), '\t'.join((str(x / 2) for x in M)) + print('\t'.join((str(x / 2) for x in M)), file=open(fh + '2' + m, 'wb')) two_ldsc = np.abs(100 * np.random.normal(size=2 * N_SNP)).reshape((N_SNP, 2)) single_ldsc = np.sum(two_ldsc, axis=1).reshape((N_SNP, 1)) @@ -35,7 +35,7 @@ def print_ld(x, fh, M): M = np.sum(single_ldsc) ld = pd.DataFrame({ 'CHR': np.ones(N_SNP), - 'SNP': ['rs' + str(i) for i in xrange(1000)], + 'SNP': ['rs' + str(i) for i in range(1000)], 'BP': np.arange(N_SNP)}) # 2 LD Scores 2 files @@ -64,12 +64,12 @@ def print_ld(x, fh, M): index=False, sep='\t', float_format='%.3f') # split across chromosomes df = pd.DataFrame({ - 'SNP': ['rs' + str(i) for i in xrange(1000)], - 'A1': ['A' for _ in xrange(1000)], - 'A2': ['G' for _ in xrange(1000)], + 'SNP': ['rs' + str(i) for i in range(1000)], + 'A1': ['A' for _ in range(1000)], + 'A2': ['G' for _ in range(1000)], 'N': np.ones(1000) * N_INDIV }) -for i in xrange(N_SIMS): +for i in range(N_SIMS): z = np.random.normal(size=N_SNP).reshape((N_SNP,)) c = np.sqrt( 1 + N_INDIV * (h21 * two_ldsc[:, 0] / float(M_two[0]) + h22 * two_ldsc[:, 1] / float(M_two[1]))) diff --git a/test/test_irwls.py b/test/test_irwls.py index 7bfe7c4c..dcba96b9 100644 --- a/test/test_irwls.py +++ b/test/test_irwls.py @@ -1,4 +1,4 @@ -from __future__ import division + from ldscore.irwls import IRWLS import unittest import numpy as np @@ -15,7 +15,7 @@ def setUp(self): self.w = np.abs(np.random.normal(size=4).reshape((4, 1))) self.w = self.w / np.sum(self.w) self.update_func = lambda x: np.ones((4, 1)) - print 'w=\n', self.w + print('w=\n', self.w) def test_weight_2d(self): x = np.ones((4, 2)) @@ -45,7 +45,7 @@ def setUp(self): self.w = np.abs(np.random.normal(size=4).reshape((4, 1))) self.w = self.w / np.sum(self.w) self.update_func = lambda x: np.ones((4, 1)) - print 'w=\n', self.w + print('w=\n', self.w) def test_weight_1d(self): assert_array_almost_equal(IRWLS._weight(self.x, self.w), self.w) diff --git a/test/test_jackknife.py b/test/test_jackknife.py index 4ec075e8..3f87f7b0 100644 --- a/test/test_jackknife.py +++ b/test/test_jackknife.py @@ -1,4 +1,4 @@ -from __future__ import division + import ldscore.jackknife as jk import unittest import numpy as np @@ -12,19 +12,20 @@ class Test_Jackknife(unittest.TestCase): def test_separators(self): N = 20 x = np.arange(N) - for i in xrange(2, int(np.floor(N / 2))): + for i in range(2, int(np.floor(N / 2))): s = jk.Jackknife.get_separators(N, i) - lengths = [len(x[s[j]:s[j + 1]]) for j in xrange(len(s) - 2)] + lengths = [len(x[s[j]:s[j + 1]]) for j in range(len(s) - 2)] self.assertTrue(max(lengths) - min(lengths) <= 1) def test_jknife_1d(self): pseudovalues = np.atleast_2d(np.arange(10)).T (est, var, se, cov) = jk.Jackknife.jknife(pseudovalues) - nose.tools.assert_almost_equal(var, 0.91666667) - nose.tools.assert_almost_equal(est, 4.5) - nose.tools.assert_almost_equal(cov, var) - nose.tools.assert_almost_equal(se ** 2, var) + print(jk.Jackknife.jknife(pseudovalues)) + self.assertTrue(np.isclose(var.flat, 0.91666667)) + self.assertTrue(np.isclose(est, 4.5)) + self.assertTrue(np.isclose(cov, var)) + self.assertTrue(np.isclose(se ** 2, var)) self.assertTrue(not np.any(np.isnan(cov))) assert_array_equal(cov.shape, (1, 1)) assert_array_equal(var.shape, (1, 1)) @@ -194,8 +195,8 @@ def test_block_to_delete_2d(self): def test_eq_slow(self): x = np.atleast_2d(np.random.normal(size=(100, 2))) y = np.atleast_2d(np.random.normal(size=(100, 1))) - print x.shape - for n_blocks in xrange(2, 49): + print(x.shape) + for n_blocks in range(2, 49): b1 = jk.LstsqJackknifeFast(x, y, n_blocks=n_blocks).est b2 = jk.LstsqJackknifeSlow(x, y, n_blocks=n_blocks).est assert_array_almost_equal(b1, b2) @@ -205,7 +206,7 @@ def test_bad_data(self): assert_raises(ValueError, jk.LstsqJackknifeFast, x, x, n_blocks=3) assert_raises(ValueError, jk.LstsqJackknifeFast, x.T, x.T, n_blocks=8) assert_raises( - ValueError, jk.LstsqJackknifeFast, x.T, x.T, separators=range(10)) + ValueError, jk.LstsqJackknifeFast, x.T, x.T, separators=list(range(10))) class Test_RatioJackknife(unittest.TestCase): diff --git a/test/test_ldscore.py b/test/test_ldscore.py index fc9d3e8c..4bccda9b 100644 --- a/test/test_ldscore.py +++ b/test/test_ldscore.py @@ -37,16 +37,16 @@ def setUp(self): def test_bed(self): bed = ld.PlinkBEDFile('test/plink_test/plink.bed', self.N, self.bim) # remove three monomorphic SNPs - print bed.geno - print bed.m + print(bed.geno) + print(bed.m) assert bed.m == 4 # no individuals removed - print bed.n + print(bed.n) assert self.N == bed.n # 5 indivs * 4 polymorphic SNPs - print len(bed.geno) + print(len(bed.geno)) assert len(bed.geno) == 64 - print bed.freq + print(bed.freq) correct = np.array( [0.59999999999999998, 0.59999999999999998, 0.625, 0.625]) assert np.all(bed.freq == correct) @@ -77,7 +77,7 @@ def test_filter_indivs_and_snps(self): keep_snps=keep_snps, keep_indivs=keep_indivs) assert bed.m == 1 assert bed.n == 2 - print bed.geno + print(bed.geno) assert bed.geno[0:4] == ba.bitarray('0001') @nose.tools.raises(ValueError) diff --git a/test/test_munge_sumstats.py b/test/test_munge_sumstats.py index 59f878f5..babdf002 100644 --- a/test/test_munge_sumstats.py +++ b/test/test_munge_sumstats.py @@ -1,11 +1,10 @@ -from __future__ import division + import munge_sumstats as munge import unittest import numpy as np import pandas as pd import nose -from pandas.util.testing import assert_series_equal -from pandas.util.testing import assert_frame_equal +from pandas.testing import assert_series_equal, assert_frame_equal from numpy.testing import assert_array_equal, assert_array_almost_equal, assert_allclose @@ -72,8 +71,8 @@ def setUp(self): def test_n_col(self): self.dat['N'] = self.N dat = munge.process_n(self.dat, self.args, log) - print dat - print self.dat_filtered + print(dat) + print(self.dat_filtered) assert_frame_equal(dat, self.dat_filtered) def test_nstudy(self): @@ -204,7 +203,7 @@ def test_merge_alleles(self): merge_alleles['MA'] = ['AG', 'AG', 'AG'] dat = munge.parse_dat( self.dat_gen, self.convert_colname, merge_alleles, log, self.args) - print self.dat.loc[0:2, ['SNP', 'A1', 'A2', 'P']] + print(self.dat.loc[0:2, ['SNP', 'A1', 'A2', 'P']]) assert_frame_equal(dat, self.dat.loc[0:2, ['SNP', 'A1', 'A2', 'P']]) def test_standard(self): diff --git a/test/test_parse.py b/test/test_parse.py index 85e926e9..b609dcc4 100644 --- a/test/test_parse.py +++ b/test/test_parse.py @@ -1,4 +1,4 @@ -from __future__ import division + from ldscore import parse as ps import unittest import numpy as np @@ -59,22 +59,22 @@ class Test_ldscore(unittest.TestCase): def test_ldscore(self): x = ps.ldscore(os.path.join(DIR, 'parse_test/test')) assert_equal(list(x['SNP']), ['rs' + str(i) for i in range(1, 23)]) - assert_equal(list(x['AL2']), range(1, 23)) - assert_equal(list(x['BL2']), range(2, 46, 2)) + assert_equal(list(x['AL2']), list(range(1, 23))) + assert_equal(list(x['BL2']), list(range(2, 46, 2))) def test_ldscore_loop(self): x = ps.ldscore(os.path.join(DIR, 'parse_test/test'), 2) assert_equal(list(x['SNP']), ['rs' + str(i) for i in range(1, 3)]) - assert_equal(list(x['AL2']), range(1, 3)) - assert_equal(list(x['BL2']), range(2, 6, 2)) + assert_equal(list(x['AL2']), list(range(1, 3))) + assert_equal(list(x['BL2']), list(range(2, 6, 2))) def test_ldscore_fromlist(self): fh = os.path.join(DIR, 'parse_test/test') x = ps.ldscore_fromlist([fh, fh]) assert_array_equal(x.shape, (22, 5)) y = ps.ldscore(os.path.join(DIR, 'parse_test/test')) - assert_array_equal(x.ix[:, 0:3], y) - assert_array_equal(x.ix[:, [0, 3, 4]], y) + assert_array_equal(x.iloc[:, 0:3], y) + assert_array_equal(x.iloc[:, [0, 3, 4]], y) assert_raises( ValueError, ps.ldscore_fromlist, [fh, os.path.join(DIR, 'parse_test/test2')]) diff --git a/test/test_regressions.py b/test/test_regressions.py index b6d2d14c..28d2cf63 100644 --- a/test/test_regressions.py +++ b/test/test_regressions.py @@ -1,4 +1,4 @@ -from __future__ import division + import ldscore.regressions as reg import unittest import numpy as np @@ -14,7 +14,7 @@ def test_update_separators(): ii3 = [False, True, False, True, True, False, False] ii4 = [False, True, False, True, True, False, True] ii5 = [True, True, True, True, True, True, True] - iis = map(np.array, [ii1, ii2, ii3, ii4, ii5]) + iis = list(map(np.array, [ii1, ii2, ii3, ii4, ii5])) ids = np.arange(len(ii1)) for ii in iis: s = np.arange(np.sum(ii) + 1) @@ -144,8 +144,8 @@ def setUp(self): self.hsq_noint = reg.Hsq( chisq, ld, w_ld, N, self.M, n_blocks=3, intercept=1) self.hsq_int = reg.Hsq(chisq, ld, w_ld, N, self.M, n_blocks=3) - print self.hsq_noint.summary() - print self.hsq_int.summary() + print(self.hsq_noint.summary()) + print(self.hsq_int.summary()) def test_coef(self): a = [self.hsq1 / self.M[0, 0], self.hsq2 / self.M[0, 1]] @@ -194,7 +194,7 @@ def setUp(self): def test_summary(self): # not much to test; we can at least make sure no errors at runtime self.hsq.summary(['asdf', 'qwer']) - # change to random 7/30/2019 to avoid inconsistent singular matrix errors + # change to random 7/30/2019 to avoid inconsistent singular matrix errors self.ld += np.random.normal(scale=0.1, size=(17,2)) self.chisq += np.arange(17).reshape((17, 1)) hsq = reg.Hsq( @@ -289,9 +289,9 @@ def test_eq_hsq(self): self.M, 0, 0, 0, 0, n_blocks=3, intercept_gencov=1) hsq = reg.Hsq(np.square(self.z1), self.ld, self.w_ld, self.N1, self.M, n_blocks=3, intercept=1) - print gencov.summary(['asdf', 'asdf']) - print - print hsq.summary(['asdf', 'asdf']) + print(gencov.summary(['asdf', 'asdf'])) + print() + print(hsq.summary(['asdf', 'asdf'])) assert_array_almost_equal(gencov.tot, hsq.tot) assert_array_almost_equal(gencov.tot_se, hsq.tot_se) assert_array_almost_equal(gencov.tot_cov, hsq.tot_cov) @@ -313,8 +313,8 @@ def setUp(self): def test_summary(self): # just make sure it doesn't encounter any errors at runtime - print self.rg.summary() - print self.rg.summary(silly=True) + print(self.rg.summary()) + print(self.rg.summary(silly=True)) def test_rg(self): # won't be exactly 1 because the h2 values passed to Gencov aren't 0 @@ -333,8 +333,8 @@ def test_negative_h2(self): M, 1.0, 1.0, 0, n_blocks=20) assert rg._negative_hsq # check no runtime errors when _negative_hsq is True - print rg.summary() - print rg.summary(silly=True) + print(rg.summary()) + print(rg.summary(silly=True)) assert rg.rg_ratio == 'NA' assert rg.rg_se == 'NA' assert rg.rg == 'NA' diff --git a/test/test_sumstats.py b/test/test_sumstats.py index cda514d0..b1f75cf4 100644 --- a/test/test_sumstats.py +++ b/test/test_sumstats.py @@ -1,10 +1,10 @@ -from __future__ import division + import ldscore.sumstats as s import ldscore.parse as ps import unittest import numpy as np import pandas as pd -from pandas.util.testing import assert_series_equal, assert_frame_equal +from pandas.testing import assert_series_equal, assert_frame_equal from nose.tools import * from numpy.testing import assert_array_equal, assert_array_almost_equal, assert_allclose from nose.plugins.attrib import attr @@ -26,7 +26,7 @@ def __init__(self): def log(self, x): # pass - print x + print(x) log = Mock() args = Mock() @@ -65,7 +65,7 @@ def test_align_alleles(): def test_filter_bad_alleles(): alleles = pd.Series(['ATAT', 'ATAG', 'DIID', 'ACAC']) bad_alleles = s._filter_alleles(alleles) - print bad_alleles + print(bad_alleles) assert_series_equal(bad_alleles, pd.Series([False, False, False, True])) @@ -201,77 +201,77 @@ def setUpClass(cls): args.ref_ld = DIR + '/simulate_test/ldscore/twold_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.rg = ','.join( - (DIR + '/simulate_test/sumstats/' + str(i) for i in xrange(N_REP))) + (DIR + '/simulate_test/sumstats/' + str(i) for i in range(N_REP))) args.out = DIR + '/simulate_test/1' x = s.estimate_rg(args, log) - args.intercept_gencov = ','.join(('0' for _ in xrange(N_REP))) - args.intercept_h2 = ','.join(('1' for _ in xrange(N_REP))) + args.intercept_gencov = ','.join(('0' for _ in range(N_REP))) + args.intercept_h2 = ','.join(('1' for _ in range(N_REP))) y = s.estimate_rg(args, log) cls.rg = x cls.rg_noint = y def test_rg_ratio(self): - assert_allclose(np.nanmean(map(t('rg_ratio'), self.rg)), 0, atol=0.02) + assert_allclose(np.nanmean(list(map(t('rg_ratio'), self.rg))), 0, atol=0.02) def test_rg_ratio_noint(self): assert_allclose( - np.nanmean(map(t('rg_ratio'), self.rg_noint)), 0, atol=0.02) + np.nanmean(list(map(t('rg_ratio'), self.rg_noint))), 0, atol=0.02) def test_rg_se(self): - assert_allclose(np.nanmean(map(t('rg_se'), self.rg)), np.nanstd( - map(t('rg_ratio'), self.rg)), atol=0.02) + assert_allclose(np.nanmean(list(map(t('rg_se'), self.rg))), np.nanstd( + list(map(t('rg_ratio'), self.rg))), atol=0.02) def test_rg_se_noint(self): - assert_allclose(np.nanmean(map(t('rg_se'), self.rg_noint)), np.nanstd( - map(t('rg_ratio'), self.rg_noint)), atol=0.02) + assert_allclose(np.nanmean(list(map(t('rg_se'), self.rg_noint))), np.nanstd( + list(map(t('rg_ratio'), self.rg_noint))), atol=0.02) def test_gencov_tot(self): assert_allclose( - np.nanmean(map(t('tot'), map(t('gencov'), self.rg))), 0, atol=0.02) + np.nanmean(list(map(t('tot'), list(map(t('gencov'), self.rg))))), 0, atol=0.02) def test_gencov_tot_noint(self): assert_allclose( - np.nanmean(map(t('tot'), map(t('gencov'), self.rg_noint))), 0, atol=0.02) + np.nanmean(list(map(t('tot'), list(map(t('gencov'), self.rg_noint))))), 0, atol=0.02) def test_gencov_tot_se(self): - assert_allclose(np.nanstd(map(t('tot'), map(t('gencov'), self.rg))), np.nanmean( - map(t('tot_se'), map(t('gencov'), self.rg))), atol=0.02) + assert_allclose(np.nanstd(list(map(t('tot'), list(map(t('gencov'), self.rg))))), np.nanmean( + list(map(t('tot_se'), list(map(t('gencov'), self.rg))))), atol=0.02) def test_gencov_tot_se_noint(self): - assert_allclose(np.nanstd(map(t('tot'), map(t('gencov'), self.rg_noint))), np.nanmean( - map(t('tot_se'), map(t('gencov'), self.rg_noint))), atol=0.02) + assert_allclose(np.nanstd(list(map(t('tot'), list(map(t('gencov'), self.rg_noint))))), np.nanmean( + list(map(t('tot_se'), list(map(t('gencov'), self.rg_noint))))), atol=0.02) def test_gencov_cat(self): assert_allclose( - np.nanmean(map(t('cat'), map(t('gencov'), self.rg))), [0, 0], atol=0.02) + np.nanmean(list(map(t('cat'), list(map(t('gencov'), self.rg))))), [0, 0], atol=0.02) def test_gencov_cat_noint(self): assert_allclose( - np.nanmean(map(t('cat'), map(t('gencov'), self.rg_noint))), [0, 0], atol=0.02) + np.nanmean(list(map(t('cat'), list(map(t('gencov'), self.rg_noint))))), [0, 0], atol=0.02) def test_gencov_cat_se(self): - assert_allclose(np.nanstd(map(t('cat'), map(t('gencov'), self.rg))), np.nanmean( - map(t('cat_se'), map(t('gencov'), self.rg))), atol=0.02) + assert_allclose(np.nanstd(list(map(t('cat'), list(map(t('gencov'), self.rg))))), np.nanmean( + list(map(t('cat_se'), list(map(t('gencov'), self.rg))))), atol=0.02) def test_gencov_cat_se_noint(self): - assert_allclose(np.nanstd(map(t('cat'), map(t('gencov'), self.rg_noint))), np.nanmean( - map(t('cat_se'), map(t('gencov'), self.rg_noint))), atol=0.02) + assert_allclose(np.nanstd(list(map(t('cat'), list(map(t('gencov'), self.rg_noint))))), np.nanmean( + list(map(t('cat_se'), list(map(t('gencov'), self.rg_noint))))), atol=0.02) def test_gencov_int(self): assert_allclose( - np.nanmean(map(t('intercept'), map(t('gencov'), self.rg))), 0, atol=0.1) + np.nanmean(list(map(t('intercept'), list(map(t('gencov'), self.rg))))), 0, atol=0.1) def test_gencov_int_se(self): - assert_allclose(np.nanmean(map(t('intercept_se'), map(t('gencov'), self.rg))), np.nanstd( - map(t('intercept'), map(t('gencov'), self.rg))), atol=0.1) + assert_allclose(np.nanmean(list(map(t('intercept_se'), list(map(t('gencov'), self.rg))))), np.nanstd( + list(map(t('intercept'), list(map(t('gencov'), self.rg))))), atol=0.1) def test_hsq_int(self): assert_allclose( - np.nanmean(map(t('intercept'), map(t('hsq2'), self.rg))), 1, atol=0.1) + np.nanmean(list(map(t('intercept'), list(map(t('hsq2'), self.rg))))), 1, atol=0.1) def test_hsq_int_se(self): - assert_allclose(np.nanmean(map(t('intercept_se'), map(t('hsq2'), self.rg))), np.nanstd( - map(t('intercept'), map(t('hsq2'), self.rg))), atol=0.1) + assert_allclose(np.nanmean(list(map(t('intercept_se'), list(map(t('hsq2'), self.rg))))), np.nanstd( + list(map(t('intercept'), list(map(t('hsq2'), self.rg))))), atol=0.1) @attr('h2') @@ -286,7 +286,7 @@ def setUpClass(cls): args.chisq_max = 99999 h2 = [] h2_noint = [] - for i in xrange(N_REP): + for i in range(N_REP): args.intercept_h2 = None args.h2 = DIR + '/simulate_test/sumstats/' + str(i) args.out = DIR + '/simulate_test/1' @@ -298,65 +298,65 @@ def setUpClass(cls): cls.h2_noint = h2_noint def test_tot(self): - assert_allclose(np.nanmean(map(t('tot'), self.h2)), 0.9, atol=0.05) + assert_allclose(np.nanmean(list(map(t('tot'), self.h2))), 0.9, atol=0.05) def test_tot_noint(self): assert_allclose( - np.nanmean(map(t('tot'), self.h2_noint)), 0.9, atol=0.05) + np.nanmean(list(map(t('tot'), self.h2_noint))), 0.9, atol=0.05) def test_tot_se(self): - assert_allclose(np.nanmean(map(t('tot_se'), self.h2)), np.nanstd( - map(t('tot'), self.h2)), atol=0.05) + assert_allclose(np.nanmean(list(map(t('tot_se'), self.h2))), np.nanstd( + list(map(t('tot'), self.h2))), atol=0.05) def test_tot_se_noint(self): - assert_allclose(np.nanmean(map(t('tot_se'), self.h2_noint)), np.nanstd( - map(t('tot'), self.h2_noint)), atol=0.05) + assert_allclose(np.nanmean(list(map(t('tot_se'), self.h2_noint))), np.nanstd( + list(map(t('tot'), self.h2_noint))), atol=0.05) def test_cat(self): - x = np.nanmean(map(t('cat'), self.h2_noint), axis=0) + x = np.nanmean(list(map(t('cat'), self.h2_noint)), axis=0) y = np.array((0.3, 0.6)).reshape(x.shape) assert_allclose(x, y, atol=0.05) def test_cat_noint(self): - x = np.nanmean(map(t('cat'), self.h2_noint), axis=0) + x = np.nanmean(list(map(t('cat'), self.h2_noint)), axis=0) y = np.array((0.3, 0.6)).reshape(x.shape) assert_allclose(x, y, atol=0.05) def test_cat_se(self): - x = np.nanmean(map(t('cat_se'), self.h2), axis=0) - y = np.nanstd(map(t('cat'), self.h2), axis=0).reshape(x.shape) + x = np.nanmean(list(map(t('cat_se'), self.h2)), axis=0) + y = np.nanstd(list(map(t('cat'), self.h2)), axis=0).reshape(x.shape) assert_allclose(x, y, atol=0.05) def test_cat_se_noint(self): - x = np.nanmean(map(t('cat_se'), self.h2_noint), axis=0) - y = np.nanstd(map(t('cat'), self.h2_noint), axis=0).reshape(x.shape) + x = np.nanmean(list(map(t('cat_se'), self.h2_noint)), axis=0) + y = np.nanstd(list(map(t('cat'), self.h2_noint)), axis=0).reshape(x.shape) assert_allclose(x, y, atol=0.05) def test_coef(self): # should be h^2/M = [[0.3, 0.9]] / M coef = np.array(((0.3, 0.9))) / self.h2[0].M for h in [self.h2, self.h2_noint]: - assert np.all(np.abs(np.nanmean(map(t('coef'), h), axis=0) - coef) < 1e6) + assert np.all(np.abs(np.nanmean(list(map(t('coef'), h)), axis=0) - coef) < 1e6) def test_coef_se(self): for h in [self.h2, self.h2_noint]: - assert_array_almost_equal(np.nanmean(map(t('coef_se'), h), axis=0), - np.nanstd(map(t('coef'), h), axis=0)) + assert_array_almost_equal(np.nanmean(list(map(t('coef_se'), h)), axis=0), + np.nanstd(list(map(t('coef'), h)), axis=0)) def test_prop(self): for h in [self.h2, self.h2_noint]: - assert np.all(np.nanmean(map(t('prop'), h), axis=0) - [1/3, 2/3] < 0.02) + assert np.all(np.nanmean(list(map(t('prop'), h)), axis=0) - [1/3, 2/3] < 0.02) def test_prop_se(self): for h in [self.h2, self.h2_noint]: - assert np.all(np.nanmean(map(t('prop_se'), h), axis=0) - np.nanstd(map(t('prop'), h), axis=0) < 0.02) + assert np.all(np.nanmean(list(map(t('prop_se'), h)), axis=0) - np.nanstd(list(map(t('prop'), h)), axis=0) < 0.02) def test_int(self): - assert_allclose(np.nanmean(map(t('intercept'), self.h2)), 1, atol=0.1) + assert_allclose(np.nanmean(list(map(t('intercept'), self.h2))), 1, atol=0.1) def test_int_se(self): - assert_allclose(np.nanstd(map(t('intercept'), self.h2)), np.nanmean( - map(t('intercept_se'), self.h2)), atol=0.1) + assert_allclose(np.nanstd(list(map(t('intercept'), self.h2))), np.nanmean( + list(map(t('intercept_se'), self.h2))), atol=0.1) class Test_Estimate(unittest.TestCase): @@ -409,11 +409,11 @@ def test_rg_M(self): args.ref_ld = DIR + '/simulate_test/ldscore/oneld_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.rg = ','.join( - [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)]) + [DIR + '/simulate_test/sumstats/1' for _ in range(2)]) args.out = DIR + '/simulate_test/1' x = s.estimate_rg(args, log)[0] args.M = open( - DIR + '/simulate_test/ldscore/oneld_onefile.l2.M_5_50', 'rb').read().rstrip('\n') + DIR + '/simulate_test/ldscore/oneld_onefile.l2.M_5_50', 'r').read().rstrip('\n') y = s.estimate_rg(args, log)[0] assert_array_almost_equal(x.rg_ratio, y.rg_ratio) assert_array_almost_equal(x.rg_se, y.rg_se) @@ -427,7 +427,7 @@ def test_rg_ref_ld(self): args.ref_ld_chr = DIR + '/simulate_test/ldscore/twold_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.rg = ','.join( - [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)]) + [DIR + '/simulate_test/sumstats/1' for _ in range(2)]) args.out = DIR + '/simulate_test/1' args.print_cov = True # right now just check no runtime errors args.print_delete_vals = True @@ -447,7 +447,7 @@ def test_no_check_alleles(self): args.ref_ld = DIR + '/simulate_test/ldscore/oneld_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.rg = ','.join( - [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)]) + [DIR + '/simulate_test/sumstats/1' for _ in range(2)]) args.out = DIR + '/simulate_test/1' x = s.estimate_rg(args, log)[0] args.no_check_alleles = True @@ -477,7 +477,7 @@ def test_twostep_rg(self): args.ref_ld_chr = DIR + '/simulate_test/ldscore/oneld_onefile' args.w_ld = DIR + '/simulate_test/ldscore/w' args.rg = ','.join( - [DIR + '/simulate_test/sumstats/1' for _ in xrange(2)]) + [DIR + '/simulate_test/sumstats/1' for _ in range(2)]) args.out = DIR + '/simulate_test/rg' args.two_step = 999 x = s.estimate_rg(args, log)[0]