diff --git a/Dockerfile b/Dockerfile index 9096bb75..d37bb668 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ RUN apt-get install -y libpng-dev uuid-dev libmysqlclient-dev RUN apt-get install -y python3 build-essential git python3-pip wget pigz RUN apt-get install -y zlib1g-dev libbz2-dev gsl-bin libgsl0-dev RUN apt-get install -y libncurses5-dev liblzma-dev libssl-dev libcurl4-openssl-dev -RUN pip3 install matplotlib multiprocess +RUN pip3 install 'matplotlib<3.0' multiprocess RUN mkdir /usr/local/build; cd /usr/local/build RUN git clone --recursive https://github.com/heathsc/gemBS.git RUN (cd gemBS; python3 setup.py install) diff --git a/README.md b/README.md index 45a30cf8..885d2fa5 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,12 @@ Documentation can be found at Changelog: ---------- + 3.3.0 Make new release for IHEC + 3.3.0 Switch conversion default in IHEC_standard configuration to 0.01,0.05 rather than auto, which can give odd results if conversion controls not present or not working correctly + 3.3.0 Fix bug where conversion parameters could be ignored + 3.2.13 Fix formatting bug in mextr with multiple samples (not triggered in normal gemBS use) + 3.2.12 Ensure that conversion statistics are correctly calculated for non-stranded or reverse conversion protocols + 3.2.11 Introduce reverse_conversion option for mapping where read 1 is G2A converted and read 2 is C2T converted 3.2.10 Correct regex patch for single end reads 3.2.9 Update Singularity and Dockerfile recipes to allow kemp utils to be built correctly 3.2.9 Make setup.py and gemBS/commands.py read the version information from gemBS/version.py, so ensuring consistency diff --git a/Singularity b/Singularity index 724ee9ff..0ab67a46 100644 --- a/Singularity +++ b/Singularity @@ -11,10 +11,10 @@ From: ubuntu:xenial (mkdir /ext && cd /ext && mkdir disk1 disk2 disk3 disk4 disk5 disk6 disk7 disk8 disk9) apt-get update apt-get install -y libpng-dev uuid-dev libmysqlclient-dev - apt-get install -y python3 build-essential git python3-pip wget pigz + apt-get install -y python3 build-essential git python3-pip wget pigz apt-get install -y zlib1g-dev libbz2-dev gsl-bin libgsl0-dev apt-get install -y libncurses5-dev liblzma-dev libssl-dev libcurl4-openssl-dev - pip3 install matplotlib multiprocess + pip3 install 'matplotlib<3.0' multiprocess mkdir /usr/local/build; cd /usr/local/build git clone --recursive https://github.com/heathsc/gemBS.git (cd gemBS; python3 setup.py install) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index 101de8f9..5605362b 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -660,7 +660,7 @@ def makeChromSizes(index_name=None,output=None): raise ValueError("Info file {} (normally generated by gem-indexer) does not exist".format(info_file)) def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, - read_non_stranded=False,outfile=None, + read_non_stranded=False,reverse_conv=False,outfile=None, paired=False,tmpDir="/tmp",threads=1,under_conversion=None, over_conversion=None): """ Start the GEM Bisulfite mapping on the given input. @@ -670,6 +670,7 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, inputFiles -- List of input files ftype -- input file type read_non_stranded -- Read non stranded + reverse_conv - Reverse the normal conversion outputDir -- Directory to store the Bisulfite mapping results paired -- Paired End flag tmpDir -- Temporary directory to perform sorting operations @@ -699,9 +700,15 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, #Paired End if paired: mapping.append("-p") - #Non Stranded + #Conversion if read_non_stranded: - mapping.extend(["--bisulfite-read","non-stranded"]) + mapping.extend(["--bisulfite-conversion","non-stranded"]) + else: + if reverse_conv: + mapping.extend(["--bisulfite-conversion","inferred-G2A-C2T"]) + else: + mapping.extend(["--bisulfite-conversion","inferred-C2T-G2A"]) + #Number of threads mapping.extend(["-t",threads]) #Mapping stats @@ -831,11 +838,10 @@ def prepare(self, sample, input_bam, chrom_list, output_bcf, report_file, contig if self.haploid: parameters_bscall.append('-1') if self.conversion != None: - if self.conversion.lower() == "auto": - if sample in self.sample_conversion: - parameters_bscall.extend(['--conversion', self.sample_conversion[sample]]) - else: - parameters_bscall.extend(['--conversion', self.conversion]) + if self.conversion.lower() == "auto" and sample in self.sample_conversion: + parameters_bscall.extend(['--conversion', self.sample_conversion[sample]]) + else: + parameters_bscall.extend(['--conversion', self.conversion]) if self.ref_bias != None: parameters_bscall.extend(['--reference-bias', self.ref_bias]) #Thresholds diff --git a/gemBS/etc/gemBS_configs/IHEC_standard.conf b/gemBS/etc/gemBS_configs/IHEC_standard.conf index 9dcd28a8..baa976ab 100644 --- a/gemBS/etc/gemBS_configs/IHEC_standard.conf +++ b/gemBS/etc/gemBS_configs/IHEC_standard.conf @@ -17,7 +17,7 @@ right_trim = 0 keep_improper_pairs = False keep_duplicates = False haploid = False -conversion = auto +conversion = 0.01,0.05 remove_individual_bcfs = True # Contigs smaller than contig_pool_limit will be called together diff --git a/gemBS/parser.py b/gemBS/parser.py index 1926c221..19cec773 100755 --- a/gemBS/parser.py +++ b/gemBS/parser.py @@ -133,7 +133,7 @@ def read(self, infile): state = 0 known_var = { - 'mapping': ('tmp_dir', 'threads', 'non_stranded', 'remove_individual_bams', 'underconversion_sequence', 'overconversion_sequence', 'bam_dir', 'sequence_dir'), + 'mapping': ('tmp_dir', 'threads', 'non_stranded', 'reverse_conversion', 'remove_individual_bams', 'underconversion_sequence', 'overconversion_sequence', 'bam_dir', 'sequence_dir'), 'index': ('index', 'index_dir', 'reference', 'extra_references', 'reference_basename', 'nonbs_index', 'contig_sizes', 'threads', 'dbsnp_files', 'dbsnp_index', 'sampling_rate'), 'calling': ('bcf_dir', 'mapq_threshold', 'qual_threshold', 'left_trim', 'right_trim', 'threads', 'jobs', 'species', 'keep_duplicates', 'keep_improper_pairs', 'remove_individual_bcfs', 'haploid', 'reference_bias', 'conversion', 'contig_list', 'contig_pool_limit'), diff --git a/gemBS/production.py b/gemBS/production.py index 9d7fb415..448842fb 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -254,6 +254,7 @@ def register(self,parser): parser.add_argument('-T', '--type', dest="ftype", help='Type of data file (PAIRED, SINGLE, INTERLEAVED, STREAM, BAM)') parser.add_argument('-p', '--paired-end', dest="paired_end", action="store_true", help="Input data is Paired End") parser.add_argument('-r', '--remove', dest="remove", action="store_true", help='Remove individual BAM files after merging.', required=False) + parser.add_argument('-R', '--reverse-conversion', dest="reverse_conv", action="store_true", help='Perform G2A conversion on read 1 and C2T on read 2 rather than the reverse.', required=False) parser.add_argument('-s', '--read-non-stranded', dest="read_non_stranded", action="store_true", help='Automatically selects the proper C->T and G->A read conversions based on the level of Cs and Gs on the read.') parser.add_argument('-u', '--underconversion-sequence', dest="underconversion_sequence", metavar="SEQUENCE", help='Name of unmethylated sequencing control.', default=None,required=False) @@ -316,7 +317,10 @@ def run(self, args): self.tmp_dir = self.jsonData.check(section='mapping',key='tmp_dir',arg=args.tmp_dir,dir_type=True) self.threads = self.jsonData.check(section='mapping',key='threads',arg=args.threads,default='1') + self.reverse_conv = self.jsonData.check(section='mapping',key='reverse_conversion',arg=args.reverse_conv, boolean=True) self.read_non_stranded = self.jsonData.check(section='mapping',key='non_stranded',arg=args.read_non_stranded, boolean=True) + if self.read_non_stranded: + self.reverse_conv = False self.remove = self.jsonData.check(section='mapping',key='remove_individual_bams',arg=args.remove, boolean=True) self.underconversion_sequence = self.jsonData.check(section='mapping',key='underconversion_sequence',arg=args.underconversion_sequence) self.overconversion_sequence = self.jsonData.check(section='mapping',key='overconversion_sequence',arg=args.overconversion_sequence) @@ -530,6 +534,7 @@ def do_mapping(self, fli): if args.threads: com.extend(['-t',args.threads]) if args.tmp_dir: com.extend(['-d',args.tmp_dir]) if args.read_non_stranded: com.append('-s') + if args.reverse_conv: com.append('-R') if args.underconversion_sequence: com.extend(['-u',args.underconversion_sequence]) if args.overconversion_sequence: com.extend(['-v',args.overconversion_sequence]) if not bis: com.append('--non-bs') @@ -554,7 +559,7 @@ def do_mapping(self, fli): tmp = os.path.dirname(outfile) ret = mapping(name=fli,index=self.index,fliInfo=fliInfo,inputFiles=inputFiles,ftype=ftype, - read_non_stranded=self.read_non_stranded, + read_non_stranded=self.read_non_stranded, reverse_conv=self.reverse_conv, outfile=outfile,paired=self.paired,tmpDir=tmp,threads=self.threads, under_conversion=self.underconversion_sequence,over_conversion=self.overconversion_sequence) @@ -672,16 +677,17 @@ def extra_log(self): printer = logging.gemBS.gt printer("------------ Mapping Parameters ------------") - printer("Sample barcode : %s", self.name) - printer("Data set : %s", self.curr_fli) - printer("No. threads : %s", self.threads) - printer("Index : %s", self.index) - printer("Paired : %s", self.paired) - printer("Read non stranded: %s", self.read_non_stranded) - printer("Type : %s", self.curr_ftype) + printer("Sample barcode : %s", self.name) + printer("Data set : %s", self.curr_fli) + printer("No. threads : %s", self.threads) + printer("Index : %s", self.index) + printer("Paired : %s", self.paired) + printer("Read non stranded : %s", self.read_non_stranded) + printer("Reverse conversion: %s", self.reverse_conv) + printer("Type : %s", self.curr_ftype) if self.inputFiles: - printer("Input Files : %s", ','.join(self.inputFiles)) - printer("Output dir : %s", self.curr_output_dir) + printer("Input Files : %s", ','.join(self.inputFiles)) + printer("Output dir : %s", self.curr_output_dir) printer("") @@ -963,17 +969,17 @@ def run(self,args): stats = SampleStats(name=sample,list_lane_stats=list_stats_lanes) uc = stats.getUnderConversionRate() oc = stats.getOverConversionRate() - if uc == "NA": + if uc == "NA" or uc < 0.0: uc = 0.99 elif uc < 0.95: uc = 0.95 elif uc > 0.999: uc = 0.999 - if oc == "NA": + if oc == "NA" or oc < 0.0: oc = 0.05 elif oc > 0.15: oc = 0.15 - elif oc < 0.0: + elif oc < 0.001: oc = 0.01 self.sample_conversion[sample] = "{:.4f},{:.4f}".format(1-uc,oc) @@ -1100,6 +1106,7 @@ def run(self,args): if args.qual_threshold != None: com2.extend(['-Q',str(args.mapq_threshold)]) if args.right_trim != None: com2.extend(['--right-trim',str(args.right_trim)]) if args.left_trim != None: com2.extend(['--left-trim',str(args.left_trim)]) + if args.conversion != None: com2.extend(['--conversion',str(args.conversion)]) if args.keep_duplicates != None: com2.append('-u') if args.ignore_duplicates != None: com2.append('-U') if args.keep_unmatched != None: com2.append('-k') diff --git a/gemBS/version.py b/gemBS/version.py index 105c55c1..a7a65841 100644 --- a/gemBS/version.py +++ b/gemBS/version.py @@ -1,4 +1,4 @@ __VERSION_MAJOR = "3" -__VERSION_MINOR = "2" -__VERSION_SUBMINOR = "10" +__VERSION_MINOR = "3" +__VERSION_SUBMINOR = "0" __VERSION__ = "%s.%s.%s" % (__VERSION_MAJOR, __VERSION_MINOR,__VERSION_SUBMINOR) diff --git a/tools/gem3-mapper b/tools/gem3-mapper index 6eccd5bb..97f66ada 160000 --- a/tools/gem3-mapper +++ b/tools/gem3-mapper @@ -1 +1 @@ -Subproject commit 6eccd5bb9a0fb9a5d4196078b2cf5dc49e080819 +Subproject commit 97f66ada5a28d00e42aeb99d09f1fcde14965268 diff --git a/tools/gemBS_plugins/calc_gt_prob.c b/tools/gemBS_plugins/calc_gt_prob.c index bd9ff5d1..e29a18aa 100644 --- a/tools/gemBS_plugins/calc_gt_prob.c +++ b/tools/gemBS_plugins/calc_gt_prob.c @@ -253,6 +253,7 @@ void calc_cpg_meth(args_t *args, int ns, cpg_prob *cpg, gt_meth *g1, gt_meth *g2 double wval[3] = {1.0, 1.0, 0.5}; double pval[3] = {1.0, 0.5, 1.0}; for(int ix = 0; ix < ns; ix++) { + if(g1[ix].skip || g2[ix].skip) continue; int gt1 = g1[ix].max_gt; int gt2 = g2[ix].max_gt; cpg[ix].max_gt[0] = gt1; diff --git a/tools/gemBS_plugins/compress.c b/tools/gemBS_plugins/compress.c index c89c557d..1c3e24fa 100644 --- a/tools/gemBS_plugins/compress.c +++ b/tools/gemBS_plugins/compress.c @@ -57,7 +57,7 @@ static void init_compress(void) { path = DEFAULT_PATH; for (i = 0; i < COMPRESS_NONE; i++) { compress_data.compress_suffix[i] = strdup(suff[i]); - compress_data.comp_path[i][0] = compress_data.comp_path[i][0] = NULL; + compress_data.comp_path[i][0] = compress_data.comp_path[i][1] = NULL; } int ix = 0; while(pnames[ix][0] != NULL) { diff --git a/tools/gemBS_plugins/mextr.c b/tools/gemBS_plugins/mextr.c index 2e5f4b56..8cc10f39 100644 --- a/tools/gemBS_plugins/mextr.c +++ b/tools/gemBS_plugins/mextr.c @@ -1,7 +1,6 @@ #include #include #include -#include #include #include #include @@ -32,7 +31,8 @@ static FILE *open_ofile(char *name, int compress, bool append) { char *p = strrchr(tname, '.'); if(p == NULL || strcmp(p + 1, suffix)) { // No, so we will have to add it - asprintf(&tname, "%s.%s", name, suffix); + tname = malloc(strlen(name) + strlen(suffix) + 2); + sprintf(tname, "%s.%s", name, suffix); } int i = child_open(WRITE, tname, cdata->comp_path[comp_ix][0]); fp = fdopen(i, "w"); @@ -108,8 +108,9 @@ static void print_file_header(FILE *fp, int ns, char **names) { fputs("Contig\tPos0\tPos1\tRef", fp); for(int i = 0; i < ns; i++) { char *name = names[i]; - fprintf(fp, "\t%s:Call\t%s:Flags\t%s:Meth\t%s:non_conv\t%s:conv\t%s:support_call\t%s:total\n", name, name, name, name, name, name, name); + fprintf(fp, "\t%s:Call\t%s:Flags\t%s:Meth\t%s:non_conv\t%s:conv\t%s:support_call\t%s:total", name, name, name, name, name, name, name); } + fputc('\n', fp); } } @@ -461,11 +462,10 @@ static fmt_field_t tags[] = { bcf1_t *process(bcf1_t *rec) { static int idx; - static int32_t curr_rid = -1, prev_pos, mq[2]; + static int32_t curr_rid = -1, prev_pos; static bool valid[2] = {false, false}; static bcf1_t prev_rec; - (void)setlocale(LC_NUMERIC, "C"); int ns = bcf_hdr_nsamples(args.hdr); stats_t *st = args.stats; if(st != NULL) st->n_sites++; @@ -561,8 +561,6 @@ bcf1_t *process(bcf1_t *rec) } } } - mq[idx] = -1; - if(mq_p != NULL) mq[idx] = (int32_t)(0.5 + sqrt(ms_mq / (double)tot_n)); valid[idx] = true; // Here is the logic for deciding what we print if(rec->rid != curr_rid) curr_rid = rec->rid; diff --git a/tools/gemBS_plugins/output.c b/tools/gemBS_plugins/output.c index 4e0204ab..360a03cc 100644 --- a/tools/gemBS_plugins/output.c +++ b/tools/gemBS_plugins/output.c @@ -178,7 +178,6 @@ void output_cpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt double z = 0.0; gt_meth *g = sample_gt[idx ^ pos] + ix; if(!g->skip) { - double m = get_meth(g, pos); if(!pos) { if(g->counts[5] >= args->min_nc && (g->counts[5] + g->counts[7] >= args->min_inform)) { if(args->sel_mode == SELECT_HOM) z = exp(g->gt_prob[4]); @@ -260,8 +259,9 @@ static char *rgb_tab[11] = { "0,255,0", "55,255,0", "105,255,0", "155,255,0", "2 void output_bedmethyl(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt[], int idx) { static char *cx; - static int32_t cx_n,old_rid = 0xffffffff; + static int32_t cx_n,old_rid = 0xffffffff, old_pos = -1; + if(rec->rid == old_rid && rec->pos <= old_pos) return; int ns = bcf_hdr_nsamples(args->hdr); if(ns > 1) return; gt_meth *g = sample_gt[idx]; @@ -310,7 +310,6 @@ void output_bedmethyl(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sam FILE *fp = args->wigfile; if(fp != NULL) { if(rec->rid != old_rid) { - old_rid = rec->rid; fprintf(fp, "variableStep chrom=%s\n", args->hdr->id[BCF_DT_CTG][rec->rid].key); } fprintf(fp, "%u\t%.4g\n", rec->pos + 1, 100.0 * m); @@ -323,5 +322,7 @@ void output_bedmethyl(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sam args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 1, args->bedmethyl_desc, cov > 1000 ? 1000 : cov, strand, rec->pos, rec->pos + 1, rgb_tab[(int)(m * 10.0 + 0.5)], cov, (int)(100.0 * m), rtmp, rtmp + 4, gq); } + old_rid = rec->rid; + old_pos = rec->pos; } }