Skip to content

Commit

Permalink
Speed up of VCF parsing / writing with FORMAT fields.
Browse files Browse the repository at this point in the history
If we're reading and writing VCF without intervening change to BCF or
use of bcf_unpack with BCF_UN_FMT (so no querying and/or modification
of the per-sample FORMAT columns), then we keep the data in string
form instead of binary bcf form.

The bcf binary form is held in kstring b->indiv.  We also hold the
string format there too, but use b->unpacked & VCF_UN_FMT as a
bit-flag to indicate whether this is encoding as VCF or BCF.

The benefit is reading is considerably faster for many-sample files
(~4 fold for a 2.5k sample 1000 genomes test) and similarly (~3 fold)
for a read/write loop to VCF as we don't have to re-encode the data
from uBCF to VCF either.

If we're converting to BCF, or doing a query, we still need the call
to vcf_parse_format as before, which reformats b->indiv from VCF to
BCF internally.  This is done on the fly.

There is some nastiness to how this works sadly as bcf_unpack just
takes the bcf record and has no access to the header, but this is
needed for parsing.  Therefore we stash a copy of it in the kstring
along with the text itself, but this assumes that no one is doing
header oddities such as reading the file, freeing the header, and
then attempting to unpack the contents.  It feels like a safe
assumption, although it's not something that has ever been explicitly
spelt out before.

Note this causes a few bcftools test failures due to fixups that
happen when doing a full conversion to BCF.  A similar case happens
here in test/noroundtrip.vcf, which has an underspecified FORMAT of
"GT:GT 0/1".  The test checks it comes out as "GT:GT 2,4:.".  This
won't happen with pure VCF as we're not parsing that data, but the
test fix is to force conversion to BCF and back again to test that
code path.  A similar fix would need to be made to the failing
bcftools tests.
  • Loading branch information
jkbonfield committed May 28, 2020
1 parent 9de45b7 commit a482b91
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 41 deletions.
4 changes: 3 additions & 1 deletion htslib/vcf.h
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,8 @@ typedef struct {
float qual; // QUAL
uint32_t n_info:16, n_allele:16;
uint32_t n_fmt:8, n_sample:24;
kstring_t shared, indiv;
kstring_t shared;
kstring_t indiv; // Per sample data. unpacked & VCF_UN_FMT => VCF / BCF
bcf_dec_t d; // lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
int max_unpack; // Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
int unpacked; // remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
Expand Down Expand Up @@ -389,6 +390,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
#define BCF_UN_FMT 8 // unpack format and each sample
#define BCF_UN_IND BCF_UN_FMT // a synonymo of BCF_UN_FMT
#define BCF_UN_ALL (BCF_UN_SHR|BCF_UN_FMT) // everything
#define VCF_UN_FMT 16 // indiv.s is VCF string
HTSLIB_EXPORT
int bcf_unpack(bcf1_t *b, int which);

Expand Down
2 changes: 1 addition & 1 deletion test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -822,7 +822,7 @@ sub test_vcf_various
test_cmd($opts, %args, out => "formatcols.vcf",
cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatcols.vcf");
test_cmd($opts, %args, out => "noroundtrip-out.vcf",
cmd => "$$opts{bin}/htsfile -c $$opts{path}/noroundtrip.vcf");
cmd => "$$opts{path}/test_view -b $$opts{path}/noroundtrip.vcf | $$opts{bin}/htsfile -c -");
test_cmd($opts, %args, out => "formatmissing-out.vcf",
cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatmissing.vcf");
}
Expand Down
171 changes: 132 additions & 39 deletions vcf.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N

#define BCF_IS_64BIT (1<<30)

static int vcf_parse_format_partial(bcf1_t *v);

static char *find_chrom_header_line(char *s)
{
Expand Down Expand Up @@ -1439,6 +1440,11 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
{
if (rec->unpacked & VCF_UN_FMT) {
if (vcf_parse_format_partial(rec) < 0)
return -1;
}

if ( !hdr->keep_samples ) return 0;
if ( !bcf_hdr_nsamples(hdr) )
{
Expand Down Expand Up @@ -1734,16 +1740,23 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
if ( h->dirty ) {
if (bcf_hdr_sync(h) < 0) return -1;
}

if ( hfp->format.format == vcf || hfp->format.format == text_format )
return vcf_write(hfp,h,v);

if (v->unpacked & VCF_UN_FMT) {
// slow, but round trip test
if (vcf_parse_format_partial(v) < 0)
return -1;
}

if ( bcf_hdr_nsamples(h)!=v->n_sample )
{
hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
return -1;
}

if ( hfp->format.format == vcf || hfp->format.format == text_format )
return vcf_write(hfp,h,v);

if ( v->errcode )
{
// vcf_parse1() encountered a new contig or tag, undeclared in the
Expand Down Expand Up @@ -2569,6 +2582,45 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
return 0;
}

// The indiv.s kstring is the textual VCF representation for FORMAT
// column and all the subsequent SAMPLE columns.
//
// We also need the header, and this cannot be passed in as bcf_unpack calls
// this and it doesn't have the header available.
// So we also cache a pointer to the header in the first bytes of the
// kstring too.
//
// This packing ensures the data is still in kstring format and amenable
// to freeing / reuse. I.e.:
//
// s.s p q s.l s.m
// | | | | |
// <HDR_PTR><FORMAT><SAMPLE\tSAMPLE>.........
//
// Returns 0 on success,
// <0 on failure.
static int vcf_parse_format_partial(bcf1_t *v) {
if (!(v->unpacked & VCF_UN_FMT))
return 0;
kstring_t s = v->indiv;
const bcf_hdr_t *h = *(const bcf_hdr_t **)s.s;
char *p = s.s + sizeof(const bcf_hdr_t *);
char *q = p + strlen(p);

v->indiv.s = NULL;
v->indiv.l = v->indiv.m = 0;

int ret;
if ((ret = vcf_parse_format(&s, h, v, p, q) == 0)) {
v->unpacked &= ~VCF_UN_FMT;
free(s.s);
return ret;
} else {
v->indiv = s;
return ret;
}
}

static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
// Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
// been already printed, but will enable tools like vcfcheck to proceed.
Expand Down Expand Up @@ -2890,7 +2942,29 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
}
if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
} else if (i == 8) {// FORMAT
return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
// Consider complete copy of s, obtained via ks_release,
// and cache of p/q pointers. This is then a generalised
// parse delay that works for any max_unpack value.

kstring_t *iv = &v->indiv;
ks_clear(iv);
if (kputsn((char *)&h, sizeof(&h), iv) < 0 ||
kputsn(p, s->s + s->l - p, iv) < 0)
goto err;

v->unpacked |= VCF_UN_FMT;

// We can't accurately judge n_sample and some things may check
// this without doing an explicit bcf_unpack call, but we
// set it to bcf_hdr_nsamples(h) as a starting point.
// (vcf_parse_format validates this and it's an error if it
// mismatches, so this is initial value is incorrect if the
// data itself is incorrect, and we'll spot that on an explict
// bcf_unpack call.)
v->n_sample = bcf_hdr_nsamples(h);

return 0;
//return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
}
}

Expand Down Expand Up @@ -3002,6 +3076,10 @@ int bcf_unpack(bcf1_t *b, int which)
ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
b->unpacked |= BCF_UN_INFO;
}
if ((which & BCF_UN_FMT) && (b->unpacked & VCF_UN_FMT)) {
if (vcf_parse_format_partial(b) < 0)
return -1;
}
if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
ptr = (uint8_t*)b->indiv.s;
hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
Expand All @@ -3016,7 +3094,8 @@ int bcf_unpack(bcf1_t *b, int which)
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
{
int i;
bcf_unpack((bcf1_t*)v, BCF_UN_ALL);
bcf_unpack((bcf1_t*)v, (v->unpacked & VCF_UN_FMT)
? BCF_UN_SHR : BCF_UN_ALL);
kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM
kputc('\t', s); kputll(v->pos + 1, s); // POS
kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
Expand Down Expand Up @@ -3075,45 +3154,54 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
if ( first ) kputc('.', s);
} else kputc('.', s);
// FORMAT and individual information
if (v->n_sample)
{
int i,j;
if ( v->n_fmt)
if (v->unpacked & VCF_UN_FMT) {
size_t l = strlen(v->indiv.s + sizeof(bcf_hdr_t *));
kputc('\t', s);
kputsn(v->indiv.s + sizeof(bcf_hdr_t *), l, s);
kputc('\t', s);
kputsn(v->indiv.s + sizeof(bcf_hdr_t *) + l+1,
v->indiv.l - (sizeof(bcf_hdr_t *) + l+1), s);
} else {
if (v->n_sample)
{
int gt_i = -1;
bcf_fmt_t *fmt = v->d.fmt;
int first = 1;
for (i = 0; i < (int)v->n_fmt; ++i) {
if ( !fmt[i].p ) continue;
kputc(!first ? ':' : '\t', s); first = 0;
if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
{
hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h,(bcf1_t*)v), v->pos+1);
abort();
}
kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
}
if ( first ) kputs("\t.", s);
for (j = 0; j < v->n_sample; ++j) {
kputc('\t', s);
first = 1;
int i,j;
if ( v->n_fmt)
{
int gt_i = -1;
bcf_fmt_t *fmt = v->d.fmt;
int first = 1;
for (i = 0; i < (int)v->n_fmt; ++i) {
bcf_fmt_t *f = &fmt[i];
if ( !f->p ) continue;
if (!first) kputc(':', s);
first = 0;
if (gt_i == i)
bcf_format_gt(f,j,s);
else
bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
if ( !fmt[i].p ) continue;
kputc(!first ? ':' : '\t', s); first = 0;
if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
{
hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h,(bcf1_t*)v), v->pos+1);
abort();
}
kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
}
if ( first ) kputs("\t.", s);
for (j = 0; j < v->n_sample; ++j) {
kputc('\t', s);
first = 1;
for (i = 0; i < (int)v->n_fmt; ++i) {
bcf_fmt_t *f = &fmt[i];
if ( !f->p ) continue;
if (!first) kputc(':', s);
first = 0;
if (gt_i == i)
bcf_format_gt(f,j,s);
else
bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
}
if ( first ) kputc('.', s);
}
if ( first ) kputc('.', s);
}
else
for (j=0; j<=v->n_sample; j++)
kputs("\t.", s);
}
else
for (j=0; j<=v->n_sample; j++)
kputs("\t.", s);
}
kputc('\n', s);
return 0;
Expand Down Expand Up @@ -3824,6 +3912,11 @@ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)

int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
{
if (v->unpacked & VCF_UN_FMT) {
if (vcf_parse_format_partial(v) < 0)
return -1;
}

kstring_t ind;
ind.s = 0; ind.l = ind.m = 0;
if (n) {
Expand Down

0 comments on commit a482b91

Please sign in to comment.