Port ldsc to Python 3 #360

Open · wants to merge 12 commits into master
3 changes: 3 additions & 0 deletions .gitignore
@@ -84,3 +84,6 @@ docs/_build/
# sublime text
*.idea*
*sublime*

# vscode
.vscode/
2 changes: 2 additions & 0 deletions CHANGELOG
@@ -1,3 +1,5 @@
30.01.23 v2.0.1; fix a bug introduced in "Fix globbing bug in splitp #221" that broke some flows
11.10.22 v2.0.0; port to Python 3 and update to newer versions of pandas, numpy, and more
02.01.20 Fix KeyError in allele_merge
30.07.19 v1.0.1; update changelog and increment version for edits on this date (see #164)
30.07.19 Fix to display of liability scale gencov, h2 in rg results log (see #162)
2 changes: 1 addition & 1 deletion README.md
@@ -1,5 +1,5 @@

# LDSC (LD SCore) `v1.0.1`
# LDSC (LD SCore) `v2.0.1`

`ldsc` is a command line tool for estimating heritability and genetic correlation from GWAS summary statistics. `ldsc` also computes LD Scores.

14 changes: 7 additions & 7 deletions environment.yml
@@ -2,12 +2,12 @@ name: ldsc
channels:
- bioconda
dependencies:
- python=2.7
- bitarray=0.8
- nose=1.3
- pybedtools=0.7
- python=3.10
- pip
- pip:
- scipy==0.18
- pandas==0.20
- numpy==1.16
- bitarray==2.6.0
- nose==1.3.7
- pybedtools==0.9.0
- scipy==1.9.2
- pandas==1.5.0
- numpy==1.23.3
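
A quick post-install sanity check, not part of the PR, can confirm the environment resolved to these pins (a minimal sketch, assuming the standard distribution names and that the env was created with `conda env create -f environment.yml`):

```python
# Hypothetical post-install check of the pinned versions (not in the PR).
import importlib.metadata as md

PINS = {"scipy": "1.9.2", "pandas": "1.5.0", "numpy": "1.23.3", "bitarray": "2.6.0"}
for pkg, pinned in PINS.items():
    installed = md.version(pkg)
    flag = "ok" if installed == pinned else "MISMATCH"
    print(f"{pkg}: pinned {pinned}, installed {installed} [{flag}]")
```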
47 changes: 24 additions & 23 deletions ldsc.py
@@ -8,7 +8,7 @@
3. genetic covariance / correlation

'''
from __future__ import division

import ldscore.ldscore as ld
import ldscore.parse as ps
import ldscore.sumstats as sumstats
@@ -18,6 +18,7 @@
from subprocess import call
from itertools import product
import time, sys, traceback, argparse
from functools import reduce


try:
@@ -26,7 +27,7 @@
except AttributeError:
raise ImportError('LDSC requires pandas version >= 0.17.0')

__version__ = '1.0.1'
__version__ = '2.0.1'
MASTHEAD = "*********************************************************************\n"
MASTHEAD += "* LD Score Regression (LDSC)\n"
MASTHEAD += "* Version {V}\n".format(V=__version__)
@@ -37,7 +38,7 @@
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('precision', 4)
pd.set_option('display.precision', 4)
pd.set_option('max_colwidth',1000)
np.set_printoptions(linewidth=1000)
np.set_printoptions(precision=4)
@@ -73,15 +74,15 @@ class Logger(object):

'''
def __init__(self, fh):
self.log_fh = open(fh, 'wb')
self.log_fh = open(fh, 'w')

def log(self, msg):
'''
Print to log file and stdout with a single command.

'''
print >>self.log_fh, msg
print msg
print(msg, file=self.log_fh)
print(msg)
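
This is the core print migration: `print` is a function in Python 3, so `print >>fh, msg` becomes `print(msg, file=fh)`, and the log file is opened in text mode ('w') because `print` writes `str`. A standalone sketch of the ported idiom, with a hypothetical log filename:

```python
# Minimal sketch of the ported Logger idiom (hypothetical filename).
class Logger:
    def __init__(self, fh):
        self.log_fh = open(fh, 'w')  # text mode; 'wb' would reject str in Python 3

    def log(self, msg):
        print(msg, file=self.log_fh)  # replaces Python 2's `print >>self.log_fh, msg`
        print(msg)                    # echo to stdout

log = Logger('demo.log')
log.log('ported to Python 3')
```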


def __filter__(fname, noun, verb, merge_obj):
@@ -90,12 +91,12 @@ def __filter__(fname, noun, verb, merge_obj):
f = lambda x,n: x.format(noun=noun, verb=verb, fname=fname, num=n)
x = ps.FilterFile(fname)
c = 'Read list of {num} {noun} to {verb} from {fname}'
print f(c, len(x.IDList))
print(f(c, len(x.IDList)))
merged_list = merge_obj.loj(x.IDList)
len_merged_list = len(merged_list)
if len_merged_list > 0:
c = 'After merging, {num} {noun} remain'
print f(c, len_merged_list)
print(f(c, len_merged_list))
else:
error_msg = 'No {noun} retained for analysis'
raise ValueError(f(error_msg, 0))
@@ -106,7 +107,7 @@ def annot_sort_key(s):
'''For use with --cts-bin. Fixes weird pandas crosstab column order.'''
if type(s) == tuple:
s = [x.split('_')[0] for x in s]
s = map(lambda x: float(x) if x != 'min' else -float('inf'), s)
s = [float(x) if x != 'min' else -float('inf') for x in s]
else: # type(s) = str:
s = s.split('_')[0]
if s == 'min':
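
The list comprehension replacing `map` here is not cosmetic: in Python 3, `map()` returns a lazy, single-use iterator, so code that indexes the result or iterates it twice breaks. A small sketch of the difference:

```python
# map() in Python 3 is lazy; a list comprehension gives the eager list
# that the Python 2 code relied on.
labels = ('min_0.05', '0.05_0.1')
firsts = [x.split('_')[0] for x in labels]                          # ['min', '0.05']
vals = [float(x) if x != 'min' else -float('inf') for x in firsts]  # [-inf, 0.05]
print(vals)

m = map(str.upper, labels)
print(list(m), list(m))  # second pass is empty: the iterator is exhausted
```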
@@ -184,7 +185,7 @@ def ldscore(args, log):
raise ValueError(msg)

else:
cts_colnames = ['ANNOT'+str(i) for i in xrange(len(cts_fnames))]
cts_colnames = ['ANNOT'+str(i) for i in range(len(cts_fnames))]

log.log('Reading numbers with which to bin SNPs from {F}'.format(F=args.cts_bin))

@@ -215,7 +216,7 @@ def ldscore(args, log):
name_breaks[0] = 'min'
name_breaks[-1] = 'max'
name_breaks = [str(x) for x in name_breaks]
labs = [name_breaks[i]+'_'+name_breaks[i+1] for i in xrange(n_breaks-1)]
labs = [name_breaks[i]+'_'+name_breaks[i+1] for i in range(n_breaks-1)]
cut_vec = pd.Series(pd.cut(vec, bins=cut_breaks, labels=labs))
cts_levs.append(cut_vec)
full_labs.append(labs)
@@ -283,7 +284,7 @@ def ldscore(args, log):

if args.ld_wind_snps:
max_dist = args.ld_wind_snps
coords = np.array(xrange(geno_array.m))
coords = np.array(range(geno_array.m))
elif args.ld_wind_kb:
max_dist = args.ld_wind_kb*1000
coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
@@ -339,7 +340,7 @@ def ldscore(args, log):
F=args.print_snps, N=len(print_snps)))

print_snps.columns=['SNP']
df = df.ix[df.SNP.isin(print_snps.SNP),:]
df = df.loc[df.SNP.isin(print_snps.SNP),:]
if len(df) == 0:
raise ValueError('After merging with --print-snps, no SNPs remain.')
else:
@@ -361,12 +362,12 @@ def ldscore(args, log):

# print .M
fout_M = open(args.out + '.'+ file_suffix +'.M','w')
print >>fout_M, '\t'.join(map(str,M))
print('\t'.join(map(str,M)), file=fout_M)
fout_M.close()

# print .M_5_50
fout_M_5_50 = open(args.out + '.'+ file_suffix +'.M_5_50','w')
print >>fout_M_5_50, '\t'.join(map(str,M_5_50))
print('\t'.join(map(str,M_5_50)), file=fout_M_5_50)
fout_M_5_50.close()
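
The `.M` handles are opened in text mode ('w'): under Python 3, `print(..., file=fh)` writes `str`, and a binary ('wb') handle would raise `TypeError`. A one-line sketch with a hypothetical path and illustrative counts:

```python
# print() writes str, so the handle must be text-mode (hypothetical filename).
with open('demo.M', 'w') as fout_M:
    print('\t'.join(map(str, [9997, 1339])), file=fout_M)
```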

# print annot matrix
@@ -383,19 +384,19 @@ def ldscore(args, log):
# print LD Score summary
pd.set_option('display.max_rows', 200)
log.log('\nSummary of LD Scores in {F}'.format(F=out_fname+l2_suffix))
t = df.ix[:,4:].describe()
log.log( t.ix[1:,:] )
t = df.iloc[:,4:].describe()
log.log( t.iloc[1:,:] )

np.seterr(divide='ignore', invalid='ignore') # print NaN instead of weird errors
# print correlation matrix including all LD Scores and sample MAF
log.log('')
log.log('MAF/LD Score Correlation Matrix')
log.log( df.ix[:,4:].corr() )
log.log( df.iloc[:,4:].corr() )

# print condition number
if n_annot > 1: # condition number of a column vector w/ nonzero var is trivially one
log.log('\nLD Score Matrix Condition Number')
cond_num = np.linalg.cond(df.ix[:,5:])
cond_num = np.linalg.cond(df.iloc[:,5:])
log.log( reg.remove_brackets(str(np.matrix(cond_num))) )
if cond_num > 10000:
log.log('WARNING: ill-conditioned LD Score Matrix!')
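
The switch away from `.ix` follows the pandas migration path: `.ix` was removed in pandas 1.0, with purely positional selections moving to `.iloc` and label or boolean-mask selections moving to `.loc` (as in the `--print-snps` merge above). A short sketch with illustrative data:

```python
import pandas as pd

df = pd.DataFrame({'SNP': ['rs1', 'rs2', 'rs3'], 'L2': [1.2, 3.4, 0.7]})
print(df.iloc[:, 1:])                       # positional slice, was df.ix[:, 1:]
print(df.loc[df.SNP.isin(['rs1', 'rs3'])])  # boolean mask, was df.ix[mask, :]
```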
@@ -588,7 +589,7 @@ def ldscore(args, log):
try:
defaults = vars(parser.parse_args(''))
opts = vars(args)
non_defaults = [x for x in opts.keys() if opts[x] != defaults[x]]
non_defaults = [x for x in list(opts.keys()) if opts[x] != defaults[x]]
header = MASTHEAD
header += "Call: \n"
header += './ldsc.py \\\n'
@@ -647,9 +648,9 @@ def ldscore(args, log):

# bad flags
else:
print header
print 'Error: no analysis selected.'
print 'ldsc.py -h describes options.'
print(header)
print('Error: no analysis selected.')
print('ldsc.py -h describes options.')
except Exception:
ex_type, ex, tb = sys.exc_info()
log.log( traceback.format_exc() )
10 changes: 5 additions & 5 deletions ldscore/irwls.py
@@ -4,9 +4,9 @@
Iteratively re-weighted least squares.

'''
from __future__ import division

import numpy as np
import jackknife as jk
from . import jackknife as jk


class IRWLS(object):
@@ -109,10 +109,10 @@ def irwls(cls, x, y, update_func, n_blocks, w, slow=False, separators=None):
'w has shape {S}. w must have shape ({N}, 1).'.format(S=w.shape, N=n))

w = np.sqrt(w)
for i in xrange(2): # update this later
for i in range(2): # update this later
new_w = np.sqrt(update_func(cls.wls(x, y, w)))
if new_w.shape != w.shape:
print 'IRWLS update:', new_w.shape, w.shape
print('IRWLS update:', new_w.shape, w.shape)
raise ValueError('New weights must have same shape.')
else:
w = new_w
@@ -158,7 +158,7 @@ def wls(cls, x, y, w):

x = cls._weight(x, w)
y = cls._weight(y, w)
coef = np.linalg.lstsq(x, y)
coef = np.linalg.lstsq(x, y, rcond=None)
return coef

@classmethod
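
Two porting patterns show up in irwls.py: Python 3 drops implicit relative imports, so `import jackknife` becomes `from . import jackknife` inside the `ldscore` package, and `np.linalg.lstsq` warns unless `rcond` is passed explicitly (`rcond=None` opts into the newer machine-precision singular-value cutoff). A standalone sketch of the `lstsq` call:

```python
import numpy as np

# Fit y = a + b*x by least squares; rcond=None silences the FutureWarning
# and uses the new singular-value cutoff.
x = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0]])
y = np.array([[1.0], [2.0], [3.0]])
coef, residuals, rank, sv = np.linalg.lstsq(x, y, rcond=None)
print(coef.ravel())  # approximately [0. 1.]
```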
12 changes: 6 additions & 6 deletions ldscore/jackknife.py
@@ -12,7 +12,7 @@

'''

from __future__ import division

import numpy as np
from scipy.optimize import nnls
np.seterr(divide='raise', invalid='raise')
@@ -218,7 +218,7 @@ def __init__(self, x, y, n_blocks=None, nn=False, separators=None):
func = lambda x, y: np.atleast_2d(nnls(x, np.array(y).T[0])[0])
else:
func = lambda x, y: np.atleast_2d(
np.linalg.lstsq(x, np.array(y).T[0])[0])
np.linalg.lstsq(x, np.array(y).T[0], rcond=None)[0])

self.est = func(x, y)
self.delete_values = self.delete_values(x, y, func, self.separators)
@@ -256,7 +256,7 @@ def delete_values(cls, x, y, func, s):
'''
_check_shape(x, y)
d = [func(np.vstack([x[0:s[i], ...], x[s[i + 1]:, ...]]), np.vstack([y[0:s[i], ...], y[s[i + 1]:, ...]]))
for i in xrange(len(s) - 1)]
for i in range(len(s) - 1)]

return np.concatenate(d, axis=0)

@@ -346,7 +346,7 @@ def block_values(cls, x, y, s):
n_blocks = len(s) - 1
xtx_block_values = np.zeros((n_blocks, p, p))
xty_block_values = np.zeros((n_blocks, p))
for i in xrange(n_blocks):
for i in range(n_blocks):
xty_block_values[i, ...] = np.dot(
x[s[i]:s[i + 1], ...].T, y[s[i]:s[i + 1], ...]).reshape((1, p))
xtx_block_values[i, ...] = np.dot(
@@ -417,7 +417,7 @@ def block_values_to_delete_values(cls, xty_block_values, xtx_block_values):
delete_values = np.zeros((n_blocks, p))
xty_tot = np.sum(xty_block_values, axis=0)
xtx_tot = np.sum(xtx_block_values, axis=0)
for j in xrange(n_blocks):
for j in range(n_blocks):
delete_xty = xty_tot - xty_block_values[j]
delete_xtx = xtx_tot - xtx_block_values[j]
delete_values[j, ...] = np.linalg.solve(
@@ -507,7 +507,7 @@ def delete_values_to_pseudovalues(cls, est, denom, numer):
'''
n_blocks, p = denom.shape
pseudovalues = np.zeros((n_blocks, p))
for j in xrange(0, n_blocks):
for j in range(0, n_blocks):
pseudovalues[j, ...] = n_blocks * est - \
(n_blocks - 1) * numer[j, ...] / denom[j, ...]
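
The loop in `delete_values_to_pseudovalues` is the standard block-jackknife formula: with n blocks, the j-th pseudovalue is n*est - (n-1)*est(-j), where est(-j) is the estimate with block j deleted. A standalone sketch with hypothetical per-block estimates:

```python
import numpy as np

est = 1.0                                      # estimate on the full data
delete_ests = np.array([0.9, 1.1, 1.0, 0.95])  # hypothetical delete-one-block estimates
n = len(delete_ests)
pseudovalues = n * est - (n - 1) * delete_ests
print(pseudovalues.mean())                     # jackknife estimate
print(pseudovalues.std(ddof=1) / np.sqrt(n))   # jackknife standard error
```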

20 changes: 10 additions & 10 deletions ldscore/ldscore.py
@@ -1,4 +1,4 @@
from __future__ import division

import numpy as np
import bitarray as ba

@@ -24,7 +24,7 @@ def getBlockLefts(coords, max_dist):
M = len(coords)
j = 0
block_left = np.zeros(M)
for i in xrange(M):
for i in range(M):
while j < M and abs(coords[j] - coords[i]) > max_dist:
j += 1
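
`getBlockLefts` is a two-pointer scan over sorted coordinates: `block_left[i]` is the index of the leftmost SNP within `max_dist` of SNP `i`. A self-contained sketch of the ported loop (the trailing `block_left[i] = j` assignment is folded out of the diff above):

```python
import numpy as np

def get_block_lefts(coords, max_dist):
    # block_left[i] = index of the leftmost coordinate within max_dist of coords[i]
    M = len(coords)
    j = 0
    block_left = np.zeros(M, dtype=int)
    for i in range(M):  # range() replaces Python 2's xrange()
        while j < M and abs(coords[j] - coords[i]) > max_dist:
            j += 1
        block_left[i] = j
    return block_left

print(get_block_lefts([1, 2, 5, 9], max_dist=3))  # [0 0 1 3]
```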

@@ -51,7 +51,7 @@ def block_left_to_right(block_left):
M = len(block_left)
j = 0
block_right = np.zeros(M)
for i in xrange(M):
for i in range(M):
while j < M and block_left[j] <= i:
j += 1

@@ -85,7 +85,7 @@ def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=
self.n)

if self.n > 0:
print 'After filtering, {n} individuals remain'.format(n=self.n)
print('After filtering, {n} individuals remain'.format(n=self.n))
else:
raise ValueError('After filtering, no individuals remain')

@@ -99,7 +99,7 @@ def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=
self.geno, self.m, self.n, self.mafMin, keep_snps)

if self.m > 0:
print 'After filtering, {m} SNPs remain'.format(m=self.m)
print('After filtering, {m} SNPs remain'.format(m=self.m))
else:
raise ValueError('After filtering, no SNPs remain')

@@ -190,7 +190,7 @@ def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
rfuncAB = np.zeros((b, c))
rfuncBB = np.zeros((c, c))
# chunk inside of block
for l_B in xrange(0, b, c): # l_B := index of leftmost SNP in matrix B
for l_B in range(0, b, c): # l_B := index of leftmost SNP in matrix B
B = A[:, l_B:l_B+c]
np.dot(A.T, B / n, out=rfuncAB)
rfuncAB = func(rfuncAB)
Expand All @@ -199,7 +199,7 @@ def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
b0 = b
md = int(c*np.floor(m/c))
end = md + 1 if md != m else md
for l_B in xrange(b0, end, c):
for l_B in range(b0, end, c):
# check if the annot matrix is all zeros for this block + chunk
# this happens w/ sparse categories (i.e., pathways)
# update the block
@@ -294,7 +294,7 @@ def __filter_indivs__(self, geno, keep_indivs, m, n):
nru_new = n_new + e
nru = self.nru
z = ba.bitarray(m*2*nru_new, endian="little")
z.setall(0)
z.setall(0)
for e, i in enumerate(keep_indivs):
z[2*e::2*nru_new] = geno[2*i::2*nru]
z[2*e+1::2*nru_new] = geno[2*i+1::2*nru]
@@ -335,7 +335,7 @@ def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
m_poly = 0
y = ba.bitarray()
if keep_snps is None:
keep_snps = xrange(m)
keep_snps = range(m)
kept_snps = []
freq = []
for e, j in enumerate(keep_snps):
@@ -397,7 +397,7 @@ def nextSNPs(self, b, minorRef=None):
X = np.array(slice.decode(self._bedcode), dtype="float64").reshape((b, nru)).T
X = X[0:n, :]
Y = np.zeros(X.shape)
for j in xrange(0, b):
for j in range(0, b):
newsnp = X[:, j]
ii = newsnp != 9
avg = np.mean(newsnp[ii])
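
The `nextSNPs` loop standardizes one SNP column at a time: missing calls (decoded as 9) are mean-imputed, then the column is centred and scaled. A standalone sketch with a tiny hypothetical genotype matrix:

```python
import numpy as np

X = np.array([[0., 2., 9.],
              [1., 9., 1.],
              [2., 0., 1.]])  # rows = individuals, columns = SNPs, 9 = missing
Y = np.zeros(X.shape)
for j in range(X.shape[1]):   # range() replaces xrange() here too
    snp = X[:, j]
    ii = snp != 9                     # observed genotypes
    avg = np.mean(snp[ii])
    snp = np.where(ii, snp, avg)      # mean-impute missing calls
    sd = np.std(snp)
    Y[:, j] = (snp - avg) / sd if sd > 0 else 0.0
print(Y)
```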