Skip to content

Commit

Permalink
merging with upstream
Browse files Browse the repository at this point in the history
  • Loading branch information
Wala committed Sep 13, 2023
2 parents 609311b + 139f68f commit 2715a41
Show file tree
Hide file tree
Showing 38 changed files with 2,392 additions and 119 deletions.
46 changes: 46 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
name: CI

on:
push:
branches:
- master
pull_request:

jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
compiler: [gcc, clang]

steps:
- name: Checkout bwa
uses: actions/checkout@v3

- name: Compile with ${{ matrix.compiler }}
run: make CC=${{ matrix.compiler }}

build-aarch64:
runs-on: ubuntu-latest
strategy:
matrix:
compiler: [gcc, clang]

steps:
- name: Checkout bwa
uses: actions/checkout@v3

- name: Compile with ${{ matrix.compiler }}
uses: uraimo/run-on-arch-action@v2
with:
arch: aarch64
distro: ubuntu20.04
githubToken: ${{ github.token }}
dockerRunArgs: |
--volume "${PWD}:/bwa"
install: |
apt-get update -q -y
apt-get install -q -y make ${{ matrix.compiler }} zlib1g-dev
run: |
cd /bwa
make CC=${{ matrix.compiler }}
5 changes: 0 additions & 5 deletions .travis.yml

This file was deleted.

10 changes: 5 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@ endif
.SUFFIXES:.c .o .cc

.c.o:
$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $(CPPFLAGS) $< -o $@

all:$(PROG)

bwa:libbwa.a $(AOBJS) main.o
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
$(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)

bwamem-lite:libbwa.a example.o
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
$(CC) $(CFLAGS) $(LDFLAGS) example.o -o $@ -L. -lbwa $(LIBS)

libbwa.a:$(LOBJS)
$(AR) -csru $@ $(LOBJS)
Expand All @@ -42,7 +42,7 @@ clean:
rm -f gmon.out *.o a.out $(PROG) *~ *.a

depend:
( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c )
( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) $(CPPFLAGS) -- *.c )

# DO NOT DELETE THIS LINE -- make depend depends on it.

Expand Down Expand Up @@ -80,7 +80,7 @@ fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
is.o: malloc_wrap.h
kopen.o: malloc_wrap.h
kstring.o: kstring.h malloc_wrap.h
ksw.o: ksw.h malloc_wrap.h
ksw.o: ksw.h neon_sse.h scalar_sse.h malloc_wrap.h
main.o: kstring.h malloc_wrap.h utils.h
malloc_wrap.o: malloc_wrap.h
maxk.o: bwa.h bntseq.h bwt.h bwamem.h kseq.h malloc_wrap.h
Expand Down
30 changes: 29 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,35 @@
Release 0.7.17 (23 October 2017)
--------------------------------

This release adds option -q to preserve the mapping quality of split alignments
with a lower alignment score than the primary alignment. Option -5
automatically applies -q as well.

(0.7.17: 23 October 2017, r1188)



Release 0.7.16 (30 July 2017)
-----------------------------

This release added a couple of minor features and incorporated multiple pull
requests, including:

* Added option -5, which is useful to some Hi-C pipelines.

* Fixed an error with samtools sorting (#129). Updated download link for
GRCh38 (#123). Fixed README MarkDown formatting (#70). Addressed multiple
issues via a collected pull request #139 by @jmarshall. Avoid malformatted
SAM header when -R is used with TAB (#84). Output mate CIGAR (#138).

(0.7.16: 30 July 2017, r1180)



Release 0.7.15 (31 May 2016)
----------------------------

Fixed a long existing bug which potentially leads underestimated insert size
Fixed a long existing bug which potentially leads to underestimated insert size
upper bound. This bug should have little effect in practice.

(0.7.15: 31 May 2016, r1140)
Expand Down
42 changes: 27 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
[![Build Status](https://travis-ci.org/lh3/bwa.svg?branch=dev)](https://travis-ci.org/lh3/bwa)
[![Build Status](https://drone.io/github.com/lh3/bwa/status.png)](https://drone.io/github.com/lh3/bwa/latest)
##Getting started
[![Build Status](https://github.com/lh3/bwa/actions/workflows/ci.yaml/badge.svg)](https://github.com/lh3/bwa/actions)
[![SourceForge Downloads](https://img.shields.io/sourceforge/dt/bio-bwa.svg?label=SF%20downloads)](https://sourceforge.net/projects/bio-bwa/files/?source=navbar)
[![GitHub Downloads](https://img.shields.io/github/downloads/lh3/bwa/total.svg?style=flat&label=GitHub%20downloads)](https://github.com/lh3/bwa/releases)
[![BioConda Install](https://img.shields.io/conda/dn/bioconda/bwa.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/bwa)

**Note: [minimap2][minimap2] has replaced BWA-MEM for __PacBio and Nanopore__ read
alignment.** It retains all major BWA-MEM features, but is ~50 times as fast,
more versatile, more accurate and produces better base-level alignment.
A beta version of [BWA-MEM2][bwa-mem2] has been released for short-read mapping.
BWA-MEM2 is about twice as fast as BWA-MEM and outputs near identical alignments.

[minimap2]: https://github.com/lh3/minimap2
[bwa-mem2]: https://github.com/bwa-mem2/bwa-mem2

## Getting started

git clone https://github.com/lh3/bwa.git
cd bwa; make
./bwa index ref.fa
./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz
./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz

##Introduction
## Introduction

BWA is a software package for mapping DNA sequences against a large reference
genome, such as the human genome. It consists of three algorithms:
Expand All @@ -24,7 +36,7 @@ reference genome (the **index** command). Alignment algorithms are invoked with
different sub-commands: **aln/samse/sampe** for BWA-backtrack,
**bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm.

##Availability
## Availability

BWA is released under [Apache 2.0][1]. The latest source code is [freely
available at github][2]. Released packages can [be downloaded][3] at
Expand All @@ -37,7 +49,7 @@ In addition to BWA, this self-consistent package also comes with bwa-associated
and 3rd-party tools for proper BAM-to-FASTQ conversion, mapping to ALT contigs,
adapter trimming, duplicate marking, HLA typing and associated data files.

##Seeking helps
## Seeking help

The detailed usage is described in the man page available together with the
source code. You can use `man ./bwa.1` to view the man page in a terminal. The
Expand All @@ -46,7 +58,7 @@ have questions about BWA, you may [sign up for the mailing list][6] and then send
the questions to [[email protected]][7]. You may also ask questions
in forums such as [BioStar][8] and [SEQanswers][9].

##Citing BWA
## Citing BWA

* Li H. and Durbin R. (2009) Fast and accurate short read alignment with
Burrows-Wheeler transform. *Bioinformatics*, **25**, 1754-1760. [PMID:
Expand All @@ -63,7 +75,7 @@ in forums such as [BioStar][8] and [SEQanswers][9].
Please note that the last reference is a preprint hosted at [arXiv.org][13]. I
do not plan to submit it to a peer-reviewed journal in the near future.

##Frequently asked questions (FAQs)
## Frequently asked questions (FAQs)

1. [What types of data does BWA work with?](#type)
2. [Why does a read appear multiple times in the output SAM?](#multihit)
Expand All @@ -73,7 +85,7 @@ do not have plan to submit it to a peer-reviewed journal in the near future.
6. [Does BWA work with ALT contigs in the GRCh38 release?](#altctg)
7. [Can I just run BWA-MEM against GRCh38+ALT without post-processing?](#postalt)

####<a name="type"></a>1. What types of data does BWA work with?
#### <a name="type"></a>1. What types of data does BWA work with?

BWA works with a variety of types of DNA sequence data, though the optimal
algorithm and setting may vary. The following list gives the recommended
Expand Down Expand Up @@ -108,42 +120,42 @@ errors given longer query sequences as the chance of missing all seeds is small.
As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
reads with a sequencing error rate over 20%.

####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?
#### <a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?

BWA-SW and BWA-MEM perform local alignments. If there is a translocation, a gene
fusion or a long deletion, a read bridging the break point may have two hits,
occupying two lines in the SAM output. With the default setting of BWA-MEM, one
and only one line is primary and is soft clipped; other lines are tagged with
0x800 SAM flag (supplementary alignment) and are hard clipped.

####<a name="4gb"></a>3. Does BWA work on reference sequences longer than 4GB in total?
#### <a name="4gb"></a>3. Does BWA work on reference sequences longer than 4GB in total?

Yes. Since 0.6.x, all BWA algorithms work with a genome with total length over
4GB. However, an individual chromosome should not be longer than 2GB.

####<a name="pe0"></a>4. Why can one read in a pair has high mapping quality but the other has zero?
#### <a name="pe0"></a>4. Why can one read in a pair have a high mapping quality but the other has zero?

This is correct. Mapping quality is assigned to each individual read, not to a read
pair. It is possible that one read can be mapped unambiguously, but its mate
falls in a tandem repeat and thus its accurate position cannot be determined.

####<a name="endref"></a>5. How can a BWA-backtrack alignment stands out of the end of a chromosome?
#### <a name="endref"></a>5. How can a BWA-backtrack alignment stand out of the end of a chromosome?

Internally BWA concatenates all reference sequences into one long sequence. A
read may be mapped to the junction of two adjacent reference sequences. In this
case, BWA-backtrack will flag the read as unmapped (0x4), but you will see
position, CIGAR and all the tags. A similar issue may occur with BWA-SW alignment
as well. BWA-MEM does not have this problem.

####<a name="altctg"></a>6. Does BWA work with ALT contigs in the GRCh38 release?
#### <a name="altctg"></a>6. Does BWA work with ALT contigs in the GRCh38 release?

Yes, since 0.7.11, BWA-MEM officially supports mapping to GRCh38+ALT.
BWA-backtrack and BWA-SW don't properly support ALT mapping as of now. Please
see [README-alt.md][18] for details. Briefly, it is recommended to use
[bwakit][17], the binary release of BWA, for generating the reference genome
and for mapping.

####<a name="postalt"></a>7. Can I just run BWA-MEM against GRCh38+ALT without post-processing?
#### <a name="postalt"></a>7. Can I just run BWA-MEM against GRCh38+ALT without post-processing?

If you are not interested in hits to ALT contigs, it is okay to run BWA-MEM
without post-processing. The alignments produced this way are very close to
Expand Down
21 changes: 13 additions & 8 deletions bntseq.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
/* The MIT License
Copyright (c) 2008 Genome Research Ltd (GRL).
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
Expand All @@ -22,9 +24,6 @@
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/* Contact: Heng Li <[email protected]> */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -197,7 +196,13 @@ bntseq_t *bns_restore(const char *prefix)
}
while (c != '\n' && c != EOF) c = fgetc(fp);
i = 0;
} else str[i++] = c; // FIXME: potential segfault here
} else {
if (i >= 1022) {
fprintf(stderr, "[E::%s] sequence name longer than 1023 characters. Abort!\n", __func__);
exit(1);
}
str[i++] = c;
}
}
kh_destroy(str, h);
fclose(fp);
Expand Down Expand Up @@ -299,9 +304,9 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
// read sequences
while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
if (!for_only) { // add the reverse complemented sequence
m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
pac = realloc(pac, m_pac/4);
memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
int64_t ll_pac = (bns->l_pac * 2 + 3) / 4 * 4;
if (ll_pac > m_pac) pac = realloc(pac, ll_pac/4);
memset(pac + (bns->l_pac+3)/4, 0, (ll_pac - (bns->l_pac+3)/4*4) / 4);
for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
}
Expand Down
6 changes: 3 additions & 3 deletions bntseq.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
/* The MIT License
Copyright (c) 2008 Genome Research Ltd (GRL).
Copyright (c) 2018- Dana-Farber Cancer Institute
2009-2018 Broad Institute, Inc.
2008-2009 Genome Research Ltd. (GRL)
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
Expand All @@ -23,8 +25,6 @@
SOFTWARE.
*/

/* Contact: Heng Li <[email protected]> */

#ifndef BWT_BNTSEQ_H
#define BWT_BNTSEQ_H

Expand Down
Loading

0 comments on commit 2715a41

Please sign in to comment.