Skip to content

Commit

Permalink
Merge pull request #645 from macs3-project/feat/macs3/narrowpeak
Browse files Browse the repository at this point in the history
Feat/macs3/narrowpeak
  • Loading branch information
taoliu authored May 16, 2024
2 parents d459079 + e03da87 commit 0936aa9
Show file tree
Hide file tree
Showing 34 changed files with 210,916 additions and 206,326 deletions.
4 changes: 2 additions & 2 deletions MACS3/Commands/bdgbroadcall_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2020-12-03 14:26:29 Tao Liu>
# Time-stamp: <2024-05-15 10:41:43 Tao Liu>

"""Description: Fine-tuning script to call broad peaks from a single
bedGraph track for scores.
Expand Down Expand Up @@ -41,7 +41,7 @@
def run( options ):
info("Read and build bedGraph...")
bio = BedGraphIO.bedGraphIO(options.ifile)
btrack = bio.build_bdgtrack(baseline_value=0)
btrack = bio.read_bedGraph(baseline_value=0)

info("Call peaks from bedGraph...")

Expand Down
9 changes: 5 additions & 4 deletions MACS3/Commands/bdgcmp_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2020-11-30 13:49:34 Tao Liu>
# Time-stamp: <2024-05-15 10:13:23 Tao Liu>

"""Description: compare bdg files
Expand Down Expand Up @@ -39,11 +39,11 @@ def run( options ):

info("Read and build treatment bedGraph...")
tbio = BedGraphIO.bedGraphIO(options.tfile)
tbtrack = tbio.build_bdgtrack()
tbtrack = tbio.read_bedGraph()

info("Read and build control bedGraph...")
cbio = BedGraphIO.bedGraphIO(options.cfile)
cbtrack = cbio.build_bdgtrack()
cbtrack = cbio.read_bedGraph()

info("Build ScoreTrackII...")
sbtrack = tbtrack.make_ScoreTrackII_for_macs( cbtrack, depth1 = pseudo_depth, depth2 = pseudo_depth )
Expand Down Expand Up @@ -86,6 +86,7 @@ def run( options ):
raise Exception("Can't reach here!")

info("Write bedGraph of scores...")
ofhd = open(ofile,"w")
ofhd = open( ofile, "w" )
# write_bedGraph function for ScoreTrack
sbtrack.write_bedGraph(ofhd,name="%s_Scores" % (method.upper()),description="Scores calculated by %s" % (method.upper()), column = 3)
info("Finished '%s'! Please check '%s'!" % (method, ofile))
10 changes: 5 additions & 5 deletions MACS3/Commands/bdgdiff_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2020-11-26 17:04:26 Tao Liu>
# Time-stamp: <2024-05-15 10:42:27 Tao Liu>

"""Description: Naive call differential peaks from 4 bedGraph tracks for scores.
Expand Down Expand Up @@ -47,19 +47,19 @@ def run( options ):

info("Read and build treatment 1 bedGraph...")
t1bio = BedGraphIO.bedGraphIO(options.t1bdg)
t1btrack = t1bio.build_bdgtrack()
t1btrack = t1bio.read_bedGraph()

info("Read and build control 1 bedGraph...")
c1bio = BedGraphIO.bedGraphIO(options.c1bdg)
c1btrack = c1bio.build_bdgtrack()
c1btrack = c1bio.read_bedGraph()

info("Read and build treatment 2 bedGraph...")
t2bio = BedGraphIO.bedGraphIO(options.t2bdg)
t2btrack = t2bio.build_bdgtrack()
t2btrack = t2bio.read_bedGraph()

info("Read and build control 2 bedGraph...")
c2bio = BedGraphIO.bedGraphIO(options.c2bdg)
c2btrack = c2bio.build_bdgtrack()
c2btrack = c2bio.read_bedGraph()

depth1 = options.depth1
depth2 = options.depth2
Expand Down
11 changes: 5 additions & 6 deletions MACS3/Commands/bdgopt_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2020-11-24 16:46:33 Tao Liu>
# Time-stamp: <2024-05-15 11:15:48 Tao Liu>

"""Description: Modify bedGraph file
Expand Down Expand Up @@ -40,7 +40,7 @@ def run( options ):

info("Read and build bedGraph...")
bio = BedGraphIO.bedGraphIO(options.ifile)
btrack = bio.build_bdgtrack(baseline_value=0)
btrack = bio.read_bedGraph(baseline_value=0)

info("Modify bedGraph...")
if options.method.lower() == "p2q":
Expand All @@ -57,11 +57,10 @@ def run( options ):
elif options.method.lower() == "min":
btrack.apply_func( lambda x: x if x< extraparam else extraparam )

ofile = os.path.join( options.outdir, options.ofile )
ofile = BedGraphIO.bedGraphIO( os.path.join( options.outdir, options.ofile ), data = btrack )
info("Write bedGraph of modified scores...")
ofhd = open(ofile,"w")
btrack.write_bedGraph(ofhd,name="%s_modified_scores" % (options.method.upper()),description="Scores calculated by %s" % (options.method.upper()))
info("Finished '%s'! Please check '%s'!" % (options.method, ofile))
ofile.write_bedGraph(name="%s_modified_scores" % (options.method.upper()),description="Scores calculated by %s" % (options.method.upper()))
info("Finished '%s'! Please check '%s'!" % (options.method, ofile.bedGraph_filename))



6 changes: 3 additions & 3 deletions MACS3/Commands/bdgpeakcall_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2023-11-02 11:52:48 Tao Liu>
# Time-stamp: <2024-05-15 10:43:03 Tao Liu>

"""Description: Naive call peaks from a single bedGraph track for
scores.
Expand Down Expand Up @@ -39,11 +39,11 @@
def run( options ):
info("Read and build bedGraph...")
bio = BedGraphIO.bedGraphIO(options.ifile)
btrack = bio.build_bdgtrack(baseline_value=0)
btrack = bio.read_bedGraph(baseline_value=0)

if options.cutoff_analysis:
info("Analyze cutoff vs number of peaks/total length of peaks/average length of peak")
cutoff_analysis_result = btrack.cutoff_analysis( int(options.maxgap), int(options.minlen), min_score = btrack.minvalue, max_score = btrack.maxvalue, steps = int(options.cutoff_analysis_steps) )
cutoff_analysis_result = btrack.cutoff_analysis( int(options.maxgap), int(options.minlen), min_score = btrack.minvalue, max_score = int(options.cutoff_analysis_max), steps = int(options.cutoff_analysis_steps) )
info("Write report...")
if options.ofile:
fhd = open( os.path.join( options.outdir, options.ofile ), 'w' )
Expand Down
14 changes: 7 additions & 7 deletions MACS3/Commands/cmbreps_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2020-11-24 16:47:11 Tao Liu>
# Time-stamp: <2024-05-15 11:16:04 Tao Liu>

"""Description: combine replicates
Expand Down Expand Up @@ -38,16 +38,16 @@ def run( options ):
i = 1
for ifile in options.ifile:
info("Read file #%d" % i)
reps.append( BedGraphIO.bedGraphIO( ifile ).build_bdgtrack( ) )
reps.append( BedGraphIO.bedGraphIO( ifile ).read_bedGraph() )
i += 1

# first two reps

info("combining tracks 1-%i with method '%s'" % (i - 1, options.method))
cmbtrack = reps[ 0 ].overlie( [reps[ j ] for j in range(1, i - 1)], func=options.method )
ofile = os.path.join( options.outdir, options.ofile )

# now output
ofile = BedGraphIO.bedGraphIO( os.path.join( options.outdir, options.ofile ), data = cmbtrack )
info("Write bedGraph of combined scores...")
ofhd = open(ofile,"w")
cmbtrack.write_bedGraph(ofhd,name="%s_combined_scores" % (options.method.upper()),description="Scores calculated by %s" % (options.method.upper()))
info("Finished '%s'! Please check '%s'!" % (options.method, ofile))
ofile.write_bedGraph(name="%s_combined_scores" % (options.method.upper()),description="Scores calculated by %s" % (options.method.upper()))
info("Finished '%s'! Please check '%s'!" % (options.method, ofile.bedGraph_filename))

103 changes: 49 additions & 54 deletions MACS3/Commands/hmmratac_cmd.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Time-stamp: <2024-04-26 15:46:03 Tao Liu>
# Time-stamp: <2024-05-15 19:40:45 Tao Liu>


"""Description: Main HMMR command
Expand Down Expand Up @@ -27,13 +28,13 @@
from MACS3.Utilities.Constants import *
from MACS3.Utilities.OptValidator import opt_validate_hmmratac
from MACS3.IO.PeakIO import PeakIO
from MACS3.IO.PeakIO import BroadPeakIO
from MACS3.IO.Parser import BAMPEParser, BEDPEParser #BAMaccessor
from MACS3.Signal.HMMR_EM import HMMR_EM
from MACS3.Signal.HMMR_Signal_Processing import generate_weight_mapping, generate_digested_signals, extract_signals_from_regions
from MACS3.Signal.HMMR_HMM import hmm_training, hmm_predict, hmm_model_init, hmm_model_save
from MACS3.Signal.Region import Regions
from MACS3.Signal.BedGraph import bedGraphTrackI
from MACS3.IO.BedGraphIO import bedGraphIO

#from MACS3.IO.BED import BEDreader # this hasn't been implemented yet.

Expand Down Expand Up @@ -61,15 +62,21 @@ def run( args ):
mono_bdgfile = os.path.join( options.outdir, options.name+"_digested_mono.bdg" )
di_bdgfile = os.path.join( options.outdir, options.name+"_digested_di.bdg" )
tri_bdgfile = os.path.join( options.outdir, options.name+"_digested_tri.bdg" )

training_region_bedfile = os.path.join( options.outdir, options.name+"_training_regions.bed" )
training_datafile = os.path.join( options.outdir, options.name+"_training_data.txt" )
training_datalengthfile = os.path.join( options.outdir, options.name+"_training_lengths.txt" )

hmm_modelfile = os.path.join( options.outdir, options.name+"_model.json" )

open_state_bdgfile = os.path.join( options.outdir, options.name+"_open.bdg" )
nuc_state_bdgfile = os.path.join( options.outdir, options.name+"_nuc.bdg" )
bg_state_bdgfile = os.path.join( options.outdir, options.name+"_bg.bdg" )

states_file = os.path.join( options.outdir, options.name+"_states.bed" )
accessible_file = os.path.join( options.outdir, options.name+"_accessible_regions.gappedPeak" )

accessible_file = os.path.join( options.outdir, options.name+"_accessible_regions.narrowPeak" )

cutoffanalysis_file = os.path.join( options.outdir, options.name+"_cutoff_analysis.tsv" )

#############################################
Expand Down Expand Up @@ -158,18 +165,17 @@ def run( args ):

# save three types of signals if needed
if options.save_digested:
fhd = open(short_bdgfile,"w")
digested_atac_signals[ 0 ].write_bedGraph(fhd, "short","short")
fhd.close()
fhd = open(mono_bdgfile,"w")
digested_atac_signals[ 1 ].write_bedGraph(fhd, "mono","mono")
fhd.close()
fhd = open(di_bdgfile,"w")
digested_atac_signals[ 2 ].write_bedGraph(fhd, "di","di")
fhd.close()
fhd = open(tri_bdgfile,"w")
digested_atac_signals[ 3 ].write_bedGraph(fhd, "tri","tri")
fhd.close()
bdgshort = BedGraphIO( short_bdgfile, data = digested_atac_signals[ 0 ] )
bdgshort.write_bedGraph("short","short")

bdgmono = BedGraphIO( mono_bdgfile, data = digested_atac_signals[ 1 ] )
bdgmono.write_bedGraph("mono", "mono")

bdgdi = BedGraphIO( di_bdgfile, data = digested_atac_signals[ 2 ] )
bdgdi.write_bedGraph("di", "di")

bdgtri = BedGraphIO( tri_bdgfile, data = digested_atac_signals[ 3 ] )
bdgtri.write_bedGraph("tri", "tri")

minlen = int(petrack.average_template_length)
# if options.pileup_short is on, we pile up only the short fragments to identify training
Expand Down Expand Up @@ -389,10 +395,9 @@ def run( args ):
# Note: we implement in a way that we will decode the candidate regions 10000 regions at a time so 1. we can make it running in parallel in the future; 2. we can reduce the memory usage.
options.info( f"# Use HMM to predict states")
n = 0
#predicted_proba_file_name = "predicted_proba.csv"
#predicted_proba_file = open(predicted_proba_file_name, "wb")

# we create a temporary file to save the proba predicted from hmm
predicted_proba_file = tempfile.TemporaryFile(mode="w+b")
# predicted_proba_file = open("predicted_proba.csv", "w+b")

while candidate_regions.total != 0:
n += 1
Expand All @@ -414,11 +419,8 @@ def run( args ):
cr_data_lengths = []
cr_bins = []
prob_data = []

#options.debug( "# clean up complete")
gc.collect()

#predicted_proba_file.close()
predicted_proba_file.seek(0) # reset
options.info( f"# predicted_proba files written...")

Expand Down Expand Up @@ -449,25 +451,30 @@ def run( args ):
# # Generate states path:
states_path = generate_states_path( predicted_proba_file, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region )
options.info( f"# finished generating states path")
predicted_proba_file.close() #kill the tempfile
predicted_proba_file.close() #kill the temp file
# Save states path if needed
# PS: we need to implement extra feature to include those regions NOT in candidate_bins and assign them as 'background state'.
if options.save_states:
options.info( f"# Write states assignments in a BED file: {options.name}_states.bed" )
with open( states_file, "w" ) as f:
save_states_bed( states_path, f )

options.info( f"# Write accessible regions in a gappedPeak file: {options.name}_accessible_regions.gappedPeak")
options.info( f"# Write accessible regions in a narrowPeak file: {options.name}_accessible_regions.narrowPeak")
with open( accessible_file, "w" ) as ofhd:
save_accessible_regions( states_path, ofhd, options.openregion_minlen )
save_accessible_regions( states_path, ofhd, options.openregion_minlen, fc_bdg )

options.info( f"# Finished")

def save_proba_to_bedGraph( predicted_proba_file, binsize, open_state_bdg_file, nuc_state_bdg_file, bg_state_bdg_file, i_open, i_nuc, i_bg ):
open_state_bdg = bedGraphTrackI( baseline_value = 0 )
nuc_state_bdg = bedGraphTrackI( baseline_value = 0 )
bg_state_bdg = bedGraphTrackI( baseline_value = 0 )

open_state_bdg_file = bedGraphIO( open_state_bdg_file )
nuc_state_bdg_file = bedGraphIO( nuc_state_bdg_file )
bg_state_bdg_file = bedGraphIO( bg_state_bdg_file )

open_state_bdg = open_state_bdg_file.data
nuc_state_bdg = nuc_state_bdg_file.data
bg_state_bdg = bg_state_bdg_file.data

prev_chrom_name = None
prev_bin_end = None

Expand Down Expand Up @@ -501,9 +508,9 @@ def save_proba_to_bedGraph( predicted_proba_file, binsize, open_state_bdg_file,
bg_state_bdg.add_loc( chrname, start_pos, end_pos, pp_bg )
prev_bin_end = end_pos

open_state_bdg.write_bedGraph( open_state_bdg_file, "Open States", "Likelihoods of being Open States" )
nuc_state_bdg.write_bedGraph( nuc_state_bdg_file, "Nucleosomal States", "Likelihoods of being Nucleosomal States" )
bg_state_bdg.write_bedGraph( bg_state_bdg_file, "Background States", "Likelihoods of being Background States" )
open_state_bdg_file.write_bedGraph( "Open States", "Likelihoods of being Open States", trackline = False )
nuc_state_bdg_file.write_bedGraph( "Nucleosomal States", "Likelihoods of being Nucleosomal States", trackline = False )
bg_state_bdg_file.write_bedGraph( "Background States", "Likelihoods of being Background States", trackline = False )
return

def save_states_bed( states_path, states_bedfile ):
Expand All @@ -515,6 +522,7 @@ def save_states_bed( states_path, states_bedfile ):
return

def generate_states_path(predicted_proba_file, binsize, i_open, i_nuc, i_bg):
# predicted_proba_file is a temporary file
ret_states_path = []
labels_list = ["open", "nuc", "bg"]

Expand Down Expand Up @@ -558,7 +566,7 @@ def generate_states_path(predicted_proba_file, binsize, i_open, i_nuc, i_bg):
prev_bin_end = end_pos
return ret_states_path

def save_accessible_regions(states_path, accessible_region_file, openregion_minlen):
def save_accessible_regions(states_path, accessible_region_file, openregion_minlen, bdgscore):
# Function to add regions to the list
def add_regions(i, regions):
for j in range(i, i+3):
Expand All @@ -572,31 +580,18 @@ def add_regions(i, regions):
for i in range(len(states_path)-2):
if (states_path[i][3] == 'nuc' and states_path[i+1][3] == 'open' and states_path[i+2][3] == 'nuc' and
states_path[i][2] == states_path[i+1][1] and states_path[i+1][2] == states_path[i+2][1] and
states_path[i+1][2] - states_path[i+1][1] > openregion_minlen):
states_path[i+2][2] - states_path[i][1] > openregion_minlen): # require nuc-open-nuc entire region start/endpos > openregion_minlen
accessible_regions = add_regions(i, accessible_regions)
# Group states by region
list_of_groups = []
one_group = [accessible_regions[0]]
for j in range(1, len(accessible_regions)):
if accessible_regions[j][1] == accessible_regions[j-1][2]:
one_group.append(accessible_regions[j])
else:
list_of_groups.append(one_group)
one_group = [accessible_regions[j]]
accessible_regions = list_of_groups

# remove 'nuc' regions:
accessible_regions = [tup for tup in accessible_regions if tup[3] != 'nuc']

# Generate broadpeak object
broadpeak = BroadPeakIO()
openpeak = PeakIO()
for region in accessible_regions[:-1]:
block_num = sum('open' in tup for tup in region)
block_sizes = ','.join(str(region[j][2] - region[j][1]) for j in range(1, len(region) - 1, 2))
block_starts = ','.join(str(region[j][1] - region[0][1]) for j in range(1, len(region) - 1, 2))
broadpeak.add(region[1][0], region[0][1], region[-1][2],
#bytes(region[1][0], encoding="raw_unicode_escape"), region[0][1], region[-1][2],
thickStart=bytes(str(region[1][1]), encoding="raw_unicode_escape"),
thickEnd=bytes(str(region[-2][2]), encoding="raw_unicode_escape"),
blockNum=block_num,
blockSizes=bytes(block_sizes, encoding="raw_unicode_escape"),
blockStarts=bytes(block_starts, encoding="raw_unicode_escape"))
broadpeak.write_to_gappedPeak(accessible_region_file)
openpeak.add(chromosome=region[0], start=region[1], end=region[2])

# refine peak summit and score using bedGraphTrackI with scores
openpeak = bdgscore.refine_peaks( openpeak )
openpeak.write_to_narrowPeak(accessible_region_file)
return
Loading

0 comments on commit 0936aa9

Please sign in to comment.