From 4da68069dd40aac9c708bdef7442ccacecd23101 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 1 Mar 2023 11:12:11 +0000 Subject: [PATCH] Support for telomere (POS=0) coordinate This follows changes in https://github.com/samtools/htslib/pull/1573 See also https://github.com/samtools/htslib/issues/1571 --- csq.c | 6 +++--- test/telomere.0.out | 1 + test/telomere.1.out | 1 + test/telomere.vcf | 5 +++++ test/test.pl | 2 ++ vcfconcat.c | 37 ++++++++++++++++++---------------- vcfconvert.c | 48 ++++++++++++++++++++++----------------------- vcfmerge.c | 8 ++++---- vcfnorm.c | 6 +++--- 9 files changed, 63 insertions(+), 51 deletions(-) create mode 100644 test/telomere.0.out create mode 100644 test/telomere.1.out create mode 100644 test/telomere.vcf diff --git a/csq.c b/csq.c index 49812d4de..a7d5d5883 100644 --- a/csq.c +++ b/csq.c @@ -3374,7 +3374,7 @@ void vbuf_flush(args_t *args, uint32_t pos) i = rbuf_shift(&args->vcf_rbuf); assert( i>=0 ); vbuf = args->vcf_buf[i]; - int pos = vbuf->n ? vbuf->vrec[0]->line->pos : -1; + int pos = vbuf->n ? vbuf->vrec[0]->line->pos : CSI_COOR_EMPTY; for (i=0; in; i++) { vrec_t *vrec = vbuf->vrec[i]; @@ -3413,7 +3413,7 @@ void vbuf_flush(args_t *args, uint32_t pos) bcf_empty(vrec->line); vrec->line->pos = save_pos; } - if ( pos!=-1 ) + if ( pos!=CSI_COOR_EMPTY ) { khint_t k = kh_get(pos2vbuf, args->pos2vbuf, pos); if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k); @@ -4169,7 +4169,7 @@ static void process(args_t *args, bcf1_t **rec_ptr) } bcf1_t *rec = *rec_ptr; - static int32_t prev_rid = -1, prev_pos = -1; + static int32_t prev_rid = -1, prev_pos = CSI_COOR_EMPTY; if ( prev_rid!=rec->rid ) { prev_rid = rec->rid; diff --git a/test/telomere.0.out b/test/telomere.0.out new file mode 100644 index 000000000..8b88f8412 --- /dev/null +++ b/test/telomere.0.out @@ -0,0 +1 @@ +22 0 id0 C G . . . diff --git a/test/telomere.1.out b/test/telomere.1.out new file mode 100644 index 000000000..ca28f4d72 --- /dev/null +++ b/test/telomere.1.out @@ -0,0 +1 @@ +22 1 id1 C G . . . diff --git a/test/telomere.vcf b/test/telomere.vcf new file mode 100644 index 000000000..243cab79c --- /dev/null +++ b/test/telomere.vcf @@ -0,0 +1,5 @@ +##fileformat=VCFv4.2 +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +22 0 id0 C G . . . +22 1 id1 C G . . . diff --git a/test/test.pl b/test/test.pl index 0d8490240..713704e7a 100755 --- a/test/test.pl +++ b/test/test.pl @@ -38,6 +38,8 @@ run_test(\&test_tabix,$opts,in=>'merge.a',reg=>'1:3000151-3000151',out=>'tabix.1.3000151.out'); run_test(\&test_index,$opts,in=>'large_chrom_csi_limit',reg=>'chr20:1-2147483647',out=>'large_chrom_csi_limit.20.1.2147483647.out'); # 2147483647 (1<<31-1) is the current chrom limit for csi. bcf conversion and indexing fail above this run_test(\&test_index,$opts,in=>'large_chrom_csi_limit',reg=>'chr20',out=>'large_chrom.20.1.2147483647.out'); # this fails until bug resolved +run_test(\&test_index,$opts,in=>'telomere',reg=>'22:0',out=>'telomere.0.out'); +run_test(\&test_index,$opts,in=>'telomere',reg=>'22:1',out=>'telomere.1.out'); run_test(\&test_vcf_idxstats,$opts,in=>'idx',args=>'-s',out=>'idx.out'); run_test(\&test_vcf_idxstats,$opts,in=>'idx',args=>'-n',out=>'idx_count.out'); run_test(\&test_vcf_idxstats,$opts,in=>'empty',args=>'-s',out=>'empty.idx.out'); diff --git a/vcfconcat.c b/vcfconcat.c index 74fd036b8..48ffff337 100644 --- a/vcfconcat.c +++ b/vcfconcat.c @@ -1,6 +1,6 @@ /* vcfconcat.c -- Concatenate or combine VCF/BCF files. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -39,6 +39,8 @@ THE SOFTWARE. */ #include #include "bcftools.h" +#define EMPTY_FILE -3 + typedef struct _args_t { bcf_srs_t *files; @@ -95,11 +97,11 @@ static void init_data(args_t *args) if ( args->phased_concat ) { int ret = bcf_read(fp, hdr, line); - if ( ret!=0 ) args->start_pos[i] = -2; // empty file + if ( ret!=0 ) args->start_pos[i] = EMPTY_FILE; else { int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line)); - args->start_pos[i] = chrid==prev_chrid ? line->pos : -1; + args->start_pos[i] = chrid==prev_chrid ? line->pos : CSI_COOR_EMPTY; prev_chrid = chrid; } } @@ -171,11 +173,11 @@ static void init_data(args_t *args) int nok = 0; while (1) { - while ( noknfnames && args->start_pos[nok]!=-2 ) nok++; + while ( noknfnames && args->start_pos[nok]!=EMPTY_FILE ) nok++; if ( nok==args->nfnames ) break; i = nok; - while ( infnames && args->start_pos[i]==-2 ) i++; + while ( infnames && args->start_pos[i]==EMPTY_FILE ) i++; if ( i==args->nfnames ) break; int tmp = args->start_pos[nok]; args->start_pos[nok] = args->start_pos[i]; args->start_pos[i] = tmp; @@ -185,7 +187,7 @@ static void init_data(args_t *args) args->nfnames = nok; for (i=1; infnames; i++) - if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]start_pos[i-1] ) + if ( args->start_pos[i-1]!=CSI_COOR_EMPTY && args->start_pos[i]!=CSI_COOR_EMPTY && args->start_pos[i]start_pos[i-1] ) error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]); args->prev_chr = -1; @@ -264,7 +266,7 @@ static void phased_flush(args_t *args) bcf1_t *brec = args->buf[i+1]; int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa); - if ( nGTs < 0 ) + if ( nGTs < 0 ) { if ( !gt_absent_warned ) { @@ -359,7 +361,7 @@ static void phased_flush(args_t *args) bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl); PQ_printed = 1; for (j=0; jphase_qual[j] < args->min_PQ ) + if ( args->phase_qual[j] < args->min_PQ ) { args->phase_set[j] = rec->pos+1; args->phase_set_changed = 1; @@ -404,7 +406,7 @@ static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec, int is_overlap if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", arec ? bcf_seqname(ahdr,arec) : bcf_seqname(bhdr,brec)); args->seen_seq[chr_id] = 1; args->prev_chr = chr_id; - args->prev_pos_check = -1; + args->prev_pos_check = CSI_COOR_EMPTY; } if ( !is_overlap ) @@ -463,12 +465,12 @@ static void concat(args_t *args) new_file = 1; args->ifname++; - if ( args->start_pos[args->ifname-1]==-1 ) break; // new chromosome, start with only one file open - if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome + if ( args->start_pos[args->ifname-1]==CSI_COOR_EMPTY ) break; // new chromosome, start with only one file open + if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==CSI_COOR_EMPTY ) break; // next file starts on a different chromosome } // is there a line from the previous run? Seek the newly opened reader to that position - int seek_pos = -1; + int seek_pos = CSI_COOR_EMPTY; int seek_chr = -1; if ( bcf_sr_has_line(args->files,0) ) { @@ -521,11 +523,12 @@ static void concat(args_t *args) // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to. if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[ir].header,line)) ) continue; - seek_pos = seek_chr = -1; + seek_pos = CSI_COOR_EMPTY; + seek_chr = -1; // Check if the position overlaps with the next, yet unopened, reader int must_seek = 0; - while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] ) + while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=CSI_COOR_EMPTY && line->pos >= args->start_pos[args->ifname] ) { must_seek = 1; if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum)); @@ -618,7 +621,7 @@ static void concat(args_t *args) if ( chr_id<0 ) error("\nThe sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]); if ( prev_chr_id!=chr_id ) { - prev_pos = -1; + prev_pos = CSI_COOR_EMPTY; if ( args->seen_seq[chr_id] ) error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s); } @@ -643,7 +646,7 @@ static void concat(args_t *args) if ( prev_chr_id!=line->rid ) { - prev_pos = -1; + prev_pos = CSI_COOR_EMPTY; if ( args->seen_seq[line->rid] ) error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); } @@ -980,7 +983,7 @@ int main_vcfconcat(int argc, char *argv[]) case 'R': args->regions_list = optarg; args->regions_is_file = 1; break; case 'd': args->remove_dups = optarg; break; case 'D': args->remove_dups = "exact"; break; - case 'q': + case 'q': args->min_PQ = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg); break; diff --git a/vcfconvert.c b/vcfconvert.c index ce5ed9981..dac134d05 100644 --- a/vcfconvert.c +++ b/vcfconvert.c @@ -1,6 +1,6 @@ /* vcfconvert.c -- convert between VCF/BCF and related formats. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -59,7 +59,7 @@ struct _args_t bcf_hdr_t *header; void (*convert_func)(struct _args_t *); struct { - int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing; + int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing; } n; kstring_t str; int32_t *gts; @@ -160,7 +160,7 @@ static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) // REF,ALT args->str.l = 0; se = ++ss; - while ( se < tsv->se && *se!='_' ) se++; + while ( se < tsv->se && *se!='_' ) se++; if ( *se!='_' ) return -1; kputsn(ss,se-ss,&args->str); ss = ++se; @@ -202,14 +202,14 @@ static int tsv_setter_chrom_pos_ref_alt_or_id(tsv_t *tsv, bcf1_t *rec, void *usr { args_t *args = (args_t*)usr; if ( _set_chrom_pos_ref_alt(tsv,rec,usr)==0 ) return 0; - rec->pos = -1; // mark the record as unset + rec->pos = CSI_COOR_EMPTY; // mark the record as unset if ( !args->output_vcf_ids) return 0; return tsv_setter_id(tsv,rec,usr); } static int tsv_setter_chrom_pos_ref_alt_id_or_die(tsv_t *tsv, bcf1_t *rec, void *usr) { args_t *args = (args_t*)usr; - if ( rec->pos!=-1 ) + if ( rec->pos!=CSI_COOR_EMPTY ) { if ( !args->output_vcf_ids ) return 0; return tsv_setter_id(tsv,rec,usr); @@ -269,12 +269,12 @@ static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr) if ( aa >= ab ) { if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0); - else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); + else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); } - else if ( ab >= bb ) + else if ( ab >= bb ) { args->gts[2*i+0] = bcf_gt_unphased(0); - args->gts[2*i+1] = bcf_gt_unphased(1); + args->gts[2*i+1] = bcf_gt_unphased(1); } else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1); } @@ -293,7 +293,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); } // up is short for "unphased" - int nup = 0; + int nup = 0; for (i=0; iss + 4*i + nup; @@ -324,11 +324,11 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) break; default : fprintf(stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss); - return -1; + return -1; } if( ss[all*2+up+1]=='*' ) up = up + 1; } - + if(up && up != 2) { fprintf(stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss); @@ -356,13 +356,13 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr) static void gensample_to_vcf(args_t *args) { /* - * Inpute: IMPUTE2 output (indentation changed here for clarity): + * Inpute: IMPUTE2 output (indentation changed here for clarity): * * 20:62116619_C_T 20:62116619 62116619 C T 0.969 0.031 0 ... * --- 20:62116698_C_A 62116698 C A 1 0 0 ... * * Second column is expected in the form of CHROM:POS_REF_ALT. We use second - * column because the first can be empty ("--") when filling sites from reference + * column because the first can be empty ("--") when filling sites from reference * panel. When the option --vcf-ids is given, the first column is used to set the * VCF ID. * @@ -784,7 +784,7 @@ char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname) } for (i=0; isex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); @@ -877,7 +877,7 @@ static void vcf_to_gensample(args_t *args) return; } - int prev_rid = -1, prev_pos = -1; + int prev_rid = -1, prev_pos = CSI_COOR_EMPTY; int no_alt = 0, non_biallelic = 0, filtered = 0, ndup = 0, nok = 0; BGZF *gout = bgzf_open(gen_fname, gen_compressed ? "wg" : "wu"); while ( bcf_sr_next_line(args->files) ) @@ -915,7 +915,7 @@ static void vcf_to_gensample(args_t *args) nok++; } } - fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", + fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n", nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup); if ( str.m ) free(str.s); @@ -976,7 +976,7 @@ static void vcf_to_haplegendsample(args_t *args) { char *sample2sex = NULL; if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname); - + int i; BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu"); str.l = 0; @@ -1078,7 +1078,7 @@ static void vcf_to_hapsample(args_t *args) kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str); else kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); - + if ( args->hap2dip ) kputs("%_GT_TO_HAP2\n", &str); else @@ -1229,7 +1229,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[] if ( alleles[a0]<0 ) alleles[a0] = (*nals)++; if ( alleles[a1]<0 ) alleles[a1] = (*nals)++; - gts[0] = bcf_gt_unphased(alleles[a0]); + gts[0] = bcf_gt_unphased(alleles[a0]); gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end; if ( ref==a0 && ref==a1 ) args->n.hom_rr++; // hom ref: RR @@ -1265,7 +1265,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) } ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2); if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1); - if ( ret==-2 ) + if ( ret==-2 ) { // something else than a SNP free(ref); @@ -1275,7 +1275,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr) args->str.l = 0; kputc(ref[0], &args->str); - for (i=0; i<5; i++) + for (i=0; i<5; i++) { if ( alleles[i]>0 ) { @@ -1419,7 +1419,7 @@ static void gvcf_to_vcf(args_t *args) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; - if ( !pass ) + if ( !pass ) { if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); continue; @@ -1667,7 +1667,7 @@ int main_vcfconvert(int argc, char *argv[]) else args->infname = argv[optind]; } if ( !args->infname ) usage(); - + if ( args->convert_func ) args->convert_func(args); else vcf_to_vcf(args); diff --git a/vcfmerge.c b/vcfmerge.c index 621f4102c..def553188 100644 --- a/vcfmerge.c +++ b/vcfmerge.c @@ -1,6 +1,6 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Author: Petr Danecek @@ -828,7 +828,7 @@ void maux_reset(maux_t *ma, int *rid_tab) } const char *chr = NULL; ma->nals = 0; - ma->pos = -1; + ma->pos = CSI_COOR_EMPTY; for (i=0; in; i++) { if ( !bcf_sr_has_line(ma->files,i) ) continue; @@ -897,7 +897,7 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) bcf_float_set_missing(out->qual); // CHROM, POS, ID, QUAL - out->pos = -1; + out->pos = CSI_COOR_EMPTY; for (i=0; inreaders; i++) { bcf1_t *line = maux_get_line(args, i); @@ -916,7 +916,7 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) } // position - if ( out->pos==-1 ) + if ( out->pos==CSI_COOR_EMPTY ) { const char *chr = hdr->id[BCF_DT_CTG][line->rid].key; out->rid = bcf_hdr_name2id(out_hdr, chr); diff --git a/vcfnorm.c b/vcfnorm.c index 9538f8d01..213edd501 100644 --- a/vcfnorm.c +++ b/vcfnorm.c @@ -1,6 +1,6 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -1819,7 +1819,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n) { bcf1_t *line; int i, k; - int prev_rid = -1, prev_pos = -1, prev_type = 0; + int prev_rid = -1, prev_pos = CSI_COOR_EMPTY, prev_type = 0; for (i=0; irbuf); @@ -2020,7 +2020,7 @@ static void normalize_vcf(args_t *args) if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); bcf1_t *line; - int prev_rid = -1, prev_pos = -1, prev_type = 0; + int prev_rid = -1, prev_pos = CSI_COOR_EMPTY, prev_type = 0; while ( (line = next_atomized_line(args)) ) { args->ntotal++;