diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 00000000..8eb2f062
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,46 @@
+name: CI
+
+on:
+  push:
+    branches:
+    - master
+  pull_request:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        compiler: [gcc, clang]
+
+    steps:
+    - name: Checkout bwa
+      uses: actions/checkout@v3
+
+    - name: Compile with ${{ matrix.compiler }}
+      run:  make CC=${{ matrix.compiler }}
+
+  build-aarch64:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        compiler: [gcc, clang]
+
+    steps:
+    - name: Checkout bwa
+      uses: actions/checkout@v3
+
+    - name: Compile with ${{ matrix.compiler }}
+      uses: uraimo/run-on-arch-action@v2
+      with:
+        arch: aarch64
+        distro: ubuntu20.04
+        githubToken: ${{ github.token }}
+        dockerRunArgs: |
+          --volume "${PWD}:/bwa"
+        install: |
+          apt-get update -q -y
+          apt-get install -q -y make ${{ matrix.compiler }} zlib1g-dev
+        run: |
+          cd /bwa
+          make CC=${{ matrix.compiler }}
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index e386b272..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,5 +0,0 @@
-language: c
-compiler:
-  - gcc
-  - clang
-script: make
diff --git a/Makefile b/Makefile
index 923ca27a..1bf2d6e3 100644
--- a/Makefile
+++ b/Makefile
@@ -25,15 +25,15 @@ endif
 .SUFFIXES:.c .o .cc
 
 .c.o:
-		$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+		$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $(CPPFLAGS) $< -o $@
 
 all:$(PROG)
 
 bwa:libbwa.a $(AOBJS) main.o
-		$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
+		$(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
 
 bwamem-lite:libbwa.a example.o
-		$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
+		$(CC) $(CFLAGS) $(LDFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
 
 libbwa.a:$(LOBJS)
 		$(AR) -csru $@ $(LOBJS)
@@ -42,7 +42,7 @@ clean:
 		rm -f gmon.out *.o a.out $(PROG) *~ *.a
 
 depend:
-	( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c )
+	( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) $(CPPFLAGS) -- *.c )
 
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
@@ -80,7 +80,7 @@ fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
 is.o: malloc_wrap.h
 kopen.o: malloc_wrap.h
 kstring.o: kstring.h malloc_wrap.h
-ksw.o: ksw.h malloc_wrap.h
+ksw.o: ksw.h neon_sse.h scalar_sse.h malloc_wrap.h
 main.o: kstring.h malloc_wrap.h utils.h
 malloc_wrap.o: malloc_wrap.h
 maxk.o: bwa.h bntseq.h bwt.h bwamem.h kseq.h malloc_wrap.h
diff --git a/NEWS.md b/NEWS.md
index 9a63bef9..151e024b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,7 +1,35 @@
+Release 0.7.17 (23 October 2017)
+--------------------------------
+
+This release adds option -q to preserve the mapping quality of split alignment
+with a lower alignment score than the primary alignment. Option -5
+automatically applies -q as well.
+
+(0.7.17: 23 October 2017, r1188)
+
+
+
+Release 0.7.16 (30 July 2017)
+-----------------------------
+
+This release added a couple of minor features and incorporated multiple pull
+requests, including:
+
+ * Added option -5, which is useful to some Hi-C pipelines.
+
+ * Fixed an error with samtools sorting (#129). Updated download link for
+   GRCh38 (#123). Fixed README MarkDown formatting (#70). Addressed multiple
+   issues via a collected pull request #139 by @jmarshall. Avoid malformatted
+   SAM header when -R is used with TAB (#84). Output mate CIGAR (#138).
+
+(0.7.16: 30 July 2017, r1180)
+
+
+
 Release 0.7.15 (31 May 2016)
 ----------------------------
 
-Fixed a long existing bug which potentially leads underestimated insert size
+Fixed a long existing bug which potentially leads to underestimated insert size
 upper bound. This bug should have little effect in practice.
 
 (0.7.15: 31 May 2016, r1140)
diff --git a/README.md b/README.md
index e6f677d1..7fc930bd 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,18 @@
-[![Build Status](https://travis-ci.org/lh3/bwa.svg?branch=dev)](https://travis-ci.org/lh3/bwa)
-[![Build Status](https://drone.io/github.com/lh3/bwa/status.png)](https://drone.io/github.com/lh3/bwa/latest)
-##Getting started
+[![Build Status](https://github.com/lh3/bwa/actions/workflows/ci.yaml/badge.svg)](https://github.com/lh3/bwa/actions)
+[![SourceForge Downloads](https://img.shields.io/sourceforge/dt/bio-bwa.svg?label=SF%20downloads)](https://sourceforge.net/projects/bio-bwa/files/?source=navbar)
+[![GitHub Downloads](https://img.shields.io/github/downloads/lh3/bwa/total.svg?style=flat&label=GitHub%20downloads)](https://github.com/lh3/bwa/releases)
+[![BioConda Install](https://img.shields.io/conda/dn/bioconda/bwa.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/bwa)
+
+**Note: [minimap2][minimap2] has replaced BWA-MEM for __PacBio and Nanopore__ read
+alignment.** It retains all major BWA-MEM features, but is ~50 times as fast,
+more versatile, more accurate and produces better base-level alignment.
+A beta version of [BWA-MEM2][bwa-mem2] has been released for short-read mapping.
+BWA-MEM2 is about twice as fast as BWA-MEM and outputs near identical alignments.
+
+[minimap2]: https://github.com/lh3/minimap2
+[bwa-mem2]: https://github.com/bwa-mem2/bwa-mem2
+
+## Getting started
 
 	git clone https://github.com/lh3/bwa.git
 	cd bwa; make
@@ -8,7 +20,7 @@
 	./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz
 	./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz
 
-##Introduction
+## Introduction
 
 BWA is a software package for mapping DNA sequences against a large reference
 genome, such as the human genome. It consists of three algorithms:
@@ -24,7 +36,7 @@ reference genome (the **index** command). Alignment algorithms are invoked with
 different sub-commands: **aln/samse/sampe** for BWA-backtrack,
 **bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm.
 
-##Availability
+## Availability
 
 BWA is released under [Apache 2.0][1]. The latest source code is [freely
 available at github][2]. Released packages can [be downloaded][3] at
@@ -37,7 +49,7 @@ In addition to BWA, this self-consistent package also comes with bwa-associated
 and 3rd-party tools for proper BAM-to-FASTQ conversion, mapping to ALT contigs,
 adapter triming, duplicate marking, HLA typing and associated data files.
 
-##Seeking helps
+## Seeking help
 
 The detailed usage is described in the man page available together with the
 source code. You can use `man ./bwa.1` to view the man page in a terminal. The
@@ -46,7 +58,7 @@ have questions about BWA, you may [sign up the mailing list][6] and then send
 the questions to [bio-bwa-help@sourceforge.net][7]. You may also ask questions
 in forums such as [BioStar][8] and [SEQanswers][9].
 
-##Citing BWA
+## Citing BWA
 
 * Li H. and Durbin R. (2009) Fast and accurate short read alignment with
  Burrows-Wheeler transform. *Bioinformatics*, **25**, 1754-1760. [PMID:
@@ -63,7 +75,7 @@ in forums such as [BioStar][8] and [SEQanswers][9].
 Please note that the last reference is a preprint hosted at [arXiv.org][13]. I
 do not have plan to submit it to a peer-reviewed journal in the near future.
 
-##Frequently asked questions (FAQs)
+## Frequently asked questions (FAQs)
 
 1. [What types of data does BWA work with?](#type)
 2. [Why does a read appear multiple times in the output SAM?](#multihit)
@@ -73,7 +85,7 @@ do not have plan to submit it to a peer-reviewed journal in the near future.
 6. [Does BWA work with ALT contigs in the GRCh38 release?](#altctg)
 7. [Can I just run BWA-MEM against GRCh38+ALT without post-processing?](#postalt)
 
-####<a name="type"></a>1. What types of data does BWA work with?
+#### <a name="type"></a>1. What types of data does BWA work with?
 
 BWA works with a variety types of DNA sequence data, though the optimal
 algorithm and setting may vary. The following list gives the recommended
@@ -108,7 +120,7 @@ errors given longer query sequences as the chance of missing all seeds is small.
 As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
 reads with a sequencing error rate over 20%.
 
-####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?
+#### <a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?
 
 BWA-SW and BWA-MEM perform local alignments. If there is a translocation, a gene
 fusion or a long deletion, a read bridging the break point may have two hits,
@@ -116,18 +128,18 @@ occupying two lines in the SAM output. With the default setting of BWA-MEM, one
 and only one line is primary and is soft clipped; other lines are tagged with
 0x800 SAM flag (supplementary alignment) and are hard clipped.
 
-####<a name="4gb"></a>3. Does BWA work on reference sequences longer than 4GB in total?
+#### <a name="4gb"></a>3. Does BWA work on reference sequences longer than 4GB in total?
 
 Yes. Since 0.6.x, all BWA algorithms work with a genome with total length over
 4GB. However, individual chromosome should not be longer than 2GB.
 
-####<a name="pe0"></a>4. Why can one read in a pair has high mapping quality but the other has zero?
+#### <a name="pe0"></a>4. Why can one read in a pair have a high mapping quality but the other has zero?
 
 This is correct. Mapping quality is assigned for individual read, not for a read
 pair. It is possible that one read can be mapped unambiguously, but its mate
 falls in a tandem repeat and thus its accurate position cannot be determined.
 
-####<a name="endref"></a>5. How can a BWA-backtrack alignment stands out of the end of a chromosome?
+#### <a name="endref"></a>5. How can a BWA-backtrack alignment stand out of the end of a chromosome?
 
 Internally BWA concatenates all reference sequences into one long sequence. A
 read may be mapped to the junction of two adjacent reference sequences. In this
@@ -135,7 +147,7 @@ case, BWA-backtrack will flag the read as unmapped (0x4), but you will see
 position, CIGAR and all the tags. A similar issue may occur to BWA-SW alignment
 as well. BWA-MEM does not have this problem.
 
-####<a name="altctg"></a>6. Does BWA work with ALT contigs in the GRCh38 release?
+#### <a name="altctg"></a>6. Does BWA work with ALT contigs in the GRCh38 release?
 
 Yes, since 0.7.11, BWA-MEM officially supports mapping to GRCh38+ALT.
 BWA-backtrack and BWA-SW don't properly support ALT mapping as of now. Please
@@ -143,7 +155,7 @@ see [README-alt.md][18] for details. Briefly, it is recommended to use
 [bwakit][17], the binary release of BWA, for generating the reference genome
 and for mapping.
 
-####<a name="postalt"></a>7. Can I just run BWA-MEM against GRCh38+ALT without post-processing?
+#### <a name="postalt"></a>7. Can I just run BWA-MEM against GRCh38+ALT without post-processing?
 
 If you are not interested in hits to ALT contigs, it is okay to run BWA-MEM
 without post-processing. The alignments produced this way are very close to
diff --git a/bntseq.c b/bntseq.c
index 465e3832..63806896 100644
--- a/bntseq.c
+++ b/bntseq.c
@@ -1,6 +1,8 @@
 /* The MIT License
 
-   Copyright (c) 2008 Genome Research Ltd (GRL).
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -22,9 +24,6 @@
    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.
 */
-
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -197,7 +196,13 @@ bntseq_t *bns_restore(const char *prefix)
 				}
 				while (c != '\n' && c != EOF) c = fgetc(fp);
 				i = 0;
-			} else str[i++] = c; // FIXME: potential segfault here
+			} else {
+				if (i >= 1022) {
+					fprintf(stderr, "[E::%s] sequence name longer than 1023 characters. Abort!\n", __func__);
+					exit(1);
+				}
+				str[i++] = c;
+			}
 		}
 		kh_destroy(str, h);
 		fclose(fp);
@@ -299,9 +304,9 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
 	// read sequences
 	while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
 	if (!for_only) { // add the reverse complemented sequence
-		m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
-		pac = realloc(pac, m_pac/4);
-		memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
+		int64_t ll_pac = (bns->l_pac * 2 + 3) / 4 * 4;
+		if (ll_pac > m_pac) pac = realloc(pac, ll_pac/4);
+		memset(pac + (bns->l_pac+3)/4, 0, (ll_pac - (bns->l_pac+3)/4*4) / 4);
 		for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
 			_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
 	}
diff --git a/bntseq.h b/bntseq.h
index 03671d68..92e3afc8 100644
--- a/bntseq.h
+++ b/bntseq.h
@@ -1,6 +1,8 @@
 /* The MIT License
 
-   Copyright (c) 2008 Genome Research Ltd (GRL).
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -23,8 +25,6 @@
    SOFTWARE.
 */
 
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
 #ifndef BWT_BNTSEQ_H
 #define BWT_BNTSEQ_H
 
diff --git a/bwa.1 b/bwa.1
index b9a65f16..eefb235a 100644
--- a/bwa.1
+++ b/bwa.1
@@ -1,4 +1,4 @@
-.TH bwa 1 "31 May 2016" "bwa-0.7.15-r1140" "Bioinformatics tools"
+.TH bwa 1 "23 October 2017" "bwa-0.7.17-r1188" "Bioinformatics tools"
 .SH NAME
 .PP
 bwa - Burrows-Wheeler Alignment Tool
@@ -107,6 +107,8 @@ appropriate algorithm will be chosen automatically.
 .IR clipPen ]
 .RB [ -U
 .IR unpairPen ]
+.RB [ -x
+.IR readType ]
 .RB [ -R
 .IR RGline ]
 .RB [ -H
@@ -152,8 +154,8 @@ batch of reads.
 
 The BWA-MEM algorithm performs local alignment. It may produce multiple primary
 alignments for different part of a query sequence. This is a crucial feature
-for long sequences. However, some tools such as Picard's markDuplicates does
-not work with split alignments. One may consider to use option
+for long sequences. However, some tools may not work with split alignments. 
+One may consider to use option
 .B -M
 to flag shorter split hits as secondary.
 
@@ -256,7 +258,23 @@ Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
 and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these
 two scores to determine whether we should force pairing. A larger value leads to
 more aggressive read pair. [17]
-
+.TP
+.BI -x \ STR
+Read type. Changes multiple parameters unless overriden [null]
+.RS
+.TP 10
+.BR pacbio :
+.B -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0
+(PacBio reads to ref)
+.TP
+.BR ont2d :
+.B -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0
+(Oxford Nanopore 2D-reads to ref)
+.TP
+.BR intractg :
+.B -B9 -O16 -L5
+(intra-species contigs to ref)
+.RE
 .TP
 .B INPUT/OUTPUT OPTIONS:
 .TP
@@ -277,6 +295,34 @@ If ARG starts with @, it is interpreted as a string and gets inserted into the
 output SAM header; otherwise, ARG is interpreted as a file with all lines
 starting with @ in the file inserted into the SAM header. [null]
 .TP
+.BI -o \ FILE
+Write the output SAM file to
+.IR FILE .
+For compatibility with other BWA commands, this option may also be given as
+.B -f
+.IR FILE .
+[standard ouptut]
+.TP
+.B -q
+ Don't reduce the mapping quality of split alignment of lower alignment score.
+.TP
+.B -5
+For split alignment, mark the segment with the smallest coordinate as the
+primary. It automatically applies option
+.B -q
+as well. This option may help some Hi-C pipelines. By default, BWA-MEM marks
+highest scoring segment as primary.
+.TP
+.B -K \ INT
+Process
+.I INT
+input bases in each batch regardless of the number of threads in use
+.RI [10000000* nThreads ].
+By default, the batch size is proportional to the number of threads in use.
+Because the inferred insert size distribution slightly depends on the batch
+size, using different number of threads may produce different output.
+Specifying this option helps reproducibility.
+.TP
 .BI -T \ INT
 Don't output alignment with score lower than
 .IR INT .
@@ -302,7 +348,7 @@ Output all found alignments for single-end or unpaired paired-end reads. These
 alignments will be flagged as secondary alignments.
 .TP
 .B -C
-Append append FASTA/Q comment to SAM output. This option can be used to
+Append FASTA/Q comment to SAM output. This option can be used to
 transfer read meta information (e.g. barcode) to the SAM output. Note that the
 FASTA/Q comment (the string after a space in the header line) must conform the SAM
 spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output.
@@ -316,7 +362,7 @@ supplementary alignments.
 Mark shorter split hits as secondary (for Picard compatibility).
 .TP
 .BI -v \ INT
-Control the verbose level of the output. This option has not been fully
+Control the verbosity level of the output. This option has not been fully
 supported throughout BWA. Ideally, a value 0 for disabling all the output to
 stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for
 all normal messages; 4 or higher for debugging. When this option takes value
diff --git a/bwa.c b/bwa.c
index c78ebb67..104c95c5 100644
--- a/bwa.c
+++ b/bwa.c
@@ -1,3 +1,29 @@
+/* The MIT License
+
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
 #include <string.h>
 #include <stdio.h>
 #include <zlib.h>
@@ -14,6 +40,7 @@
 #endif
 
 int bwa_verbose = 3;
+int bwa_dbg = 0;
 char bwa_rg_id[256];
 char *bwa_pg;
 
@@ -30,13 +57,23 @@ static inline void trim_readno(kstring_t *s)
 		s->l -= 2, s->s[s->l] = 0;
 }
 
+static inline char *dupkstring(const kstring_t *str, int dupempty)
+{
+	char *s = (str->l > 0 || dupempty)? malloc(str->l + 1) : NULL;
+	if (!s) return NULL;
+
+	memcpy(s, str->s, str->l);
+	s[str->l] = '\0';
+	return s;
+}
+
 static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
 { // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
-	s->name = strdup(ks->name.s);
-	s->comment = ks->comment.l? strdup(ks->comment.s) : 0;
-	s->seq = strdup(ks->seq.s);
-	s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
-	s->l_seq = strlen(s->seq);
+	s->name = dupkstring(&ks->name, 1);
+	s->comment = dupkstring(&ks->comment, 0);
+	s->seq = dupkstring(&ks->seq, 1);
+	s->qual = dupkstring(&ks->qual, 0);
+	s->l_seq = ks->seq.l;
 }
 
 bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
@@ -144,9 +181,9 @@ uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins,
 		max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.);
 		max_gap = max_ins > max_del? max_ins : max_del;
 		max_gap = max_gap > 1? max_gap : 1;
-		w = (max_gap + abs(rlen - l_query) + 1) >> 1;
+		w = (max_gap + abs((int)rlen - l_query) + 1) >> 1;
 		w = w < w_? w : w_;
-		min_w = abs(rlen - l_query) + 3;
+		min_w = abs((int)rlen - l_query) + 3;
 		w = w > min_w? w : min_w;
 		// NW alignment
 		if (bwa_verbose >= 4) {
@@ -369,10 +406,17 @@ int bwa_idx2mem(bwaidx_t *idx)
 
 void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line)
 {
-	int i, n_SQ = 0;
+	int i, n_HD = 0, n_SQ = 0;
 	extern char *bwa_pg;
+	
 	if (hdr_line) {
+		// check for HD line
 		const char *p = hdr_line;
+		if ((p = strstr(p, "@HD")) != 0) {
+			++n_HD;
+		}	
+		// check for SQ lines
+		p = hdr_line;
 		while ((p = strstr(p, "@SQ\t")) != 0) {
 			if (p == hdr_line || *(p-1) == '\n') ++n_SQ;
 			p += 4;
@@ -386,6 +430,9 @@ void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line)
 		}
 	} else if (n_SQ != bns->n_seqs && bwa_verbose >= 2)
 		fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs);
+	if (n_HD == 0) {
+		err_printf("@HD\tVN:1.5\tSO:unsorted\tGO:query\n");
+	}
 	if (hdr_line) err_printf("%s\n", hdr_line);
 	if (bwa_pg) err_printf("%s\n", bwa_pg);
 }
@@ -414,10 +461,14 @@ char *bwa_set_rg(const char *s)
 		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
 		goto err_set_rg;
 	}
+	if (strstr(s, "\t") != NULL) {
+		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n", __func__);
+		goto err_set_rg;
+	}
 	rg_line = strdup(s);
 	bwa_escape(rg_line);
 	if ((p = strstr(rg_line, "\tID:")) == 0) {
-		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__);
+		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__);
 		goto err_set_rg;
 	}
 	p += 4;
diff --git a/bwa.h b/bwa.h
index aa21725e..95c324b7 100644
--- a/bwa.h
+++ b/bwa.h
@@ -1,3 +1,29 @@
+/* The MIT License
+
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
 #ifndef BWA_H_
 #define BWA_H_
 
@@ -17,6 +43,8 @@
 #define BWTALGO_BWTSW 2
 #define BWTALGO_IS    3
 
+#define BWA_DBG_QNAME 0x1
+
 typedef struct {
 	bwt_t    *bwt; // FM-index
 	bntseq_t *bns; // information on the reference sequences
@@ -32,7 +60,7 @@ typedef struct {
 	char *name, *comment, *seq, *qual, *sam;
 } bseq1_t;
 
-extern int bwa_verbose;
+extern int bwa_verbose, bwa_dbg;
 extern char bwa_rg_id[256];
 
 #ifdef __cplusplus
diff --git a/bwakit/bwa-postalt.js b/bwakit/bwa-postalt.js
index bfc41900..e00d3dc7 100644
--- a/bwakit/bwa-postalt.js
+++ b/bwakit/bwa-postalt.js
@@ -283,7 +283,7 @@ function bwa_postalt(args)
 	// process SAM
 	var buf2 = [], hla = {};
 	file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
-	while (file.readline(buf) >= 0) {
+	while (file.readline(buf) > 0) {
 		var m, line = buf.toString();
 
 		if (line.charAt(0) == '@') { // print and then skip the header line
diff --git a/bwakit/run-bwamem b/bwakit/run-bwamem
index 462bafe6..4c0e8d06 100755
--- a/bwakit/run-bwamem
+++ b/bwakit/run-bwamem
@@ -5,7 +5,7 @@ use warnings;
 use Getopt::Std;
 
 my %opts = (t=>1);
-getopts("PSadskHo:R:x:t:", \%opts);
+getopts("MPSadskHo:R:x:t:", \%opts);
 
 die('
 Usage:   run-bwamem [options] <idxbase> <file1> [file2]
@@ -24,6 +24,7 @@ Options: -o STR    prefix for output files                       [inferred from
          -S        for BAM input, don\'t shuffle
          -s        sort the output alignment (via samtools; requring more RAM)
          -k        keep temporary files generated by typeHLA
+         -M        mark shorter split hits as secondary
 
 Examples:
 
@@ -143,7 +144,7 @@ if ($is_bam) {
 	$cmd = "cat $ARGV[1] \\\n";
 }
 
-my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");
+my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "") . (defined($opts{M})? "-M " : "");
 $bwa_opts .= join(" ", @RG_lines) . " -C " if @RG_lines > 0;
 
 $cmd .= "  | $root/trimadap 2> $prefix.log.trim \\\n" if defined($opts{a});
@@ -163,7 +164,7 @@ if (-f "$ARGV[0].alt" && !defined($opts{P})) {
 }
 
 my $t_sort = $opts{t} < 4? $opts{t} : 4;
-$cmd .= defined($opts{s})? "  | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : "  | $root/samtools view -1 - > $prefix.aln.bam;\n";
+$cmd .= defined($opts{s})? "  | $root/samtools sort -@ $t_sort -m1G - -o $prefix.aln.bam;\n" : "  | $root/samtools view -1 - > $prefix.aln.bam;\n";
 
 if ($has_hla && defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
 	$cmd .= "$root/run-HLA ". (defined($opts{x}) && $opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
diff --git a/bwakit/run-gen-ref b/bwakit/run-gen-ref
index 3ed63b2b..58fab68a 100755
--- a/bwakit/run-gen-ref
+++ b/bwakit/run-gen-ref
@@ -2,7 +2,7 @@
 
 root=`dirname $0`
 
-url38="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz"
+url38="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz"
 url37d5="ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
 
 if [ $# -eq 0 ]; then
diff --git a/bwamem.c b/bwamem.c
index 535606b6..03e2a059 100644
--- a/bwamem.c
+++ b/bwamem.c
@@ -1,3 +1,29 @@
+/* The MIT License
+
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
@@ -809,6 +835,19 @@ static inline int get_rlen(int n_cigar, const uint32_t *cigar)
 	return l;
 }
 
+static inline void add_cigar(const mem_opt_t *opt, mem_aln_t *p, kstring_t *str, int which)
+{
+	int i;
+	if (p->n_cigar) { // aligned
+		for (i = 0; i < p->n_cigar; ++i) {
+			int c = p->cigar[i]&0xf;
+			if (!(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt && (c == 3 || c == 4))
+				c = which? 4 : 3; // use hard clipping for supplementary alignments
+			kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str);
+		}
+	} else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true)
+}
+
 void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_)
 {
 	int i, l_name;
@@ -835,14 +874,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
 		kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME
 		kputl(p->pos + 1, str); kputc('\t', str); // POS
 		kputw(p->mapq, str); kputc('\t', str); // MAPQ
-		if (p->n_cigar) { // aligned
-			for (i = 0; i < p->n_cigar; ++i) {
-				int c = p->cigar[i]&0xf;
-				if (!(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt && (c == 3 || c == 4))
-					c = which? 4 : 3; // use hard clipping for supplementary alignments
-				kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str);
-			}
-		} else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true)
+		add_cigar(opt, p, str, which);
 	} else kputsn("*\t0\t0\t*", 7, str); // without coordinte
 	kputc('\t', str);
 
@@ -899,6 +931,8 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
 		kputsn("\tNM:i:", 6, str); kputw(p->NM, str);
 		kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str);
 	}
+	if (m && m->n_cigar) { kputsn("\tMC:Z:", 6, str); add_cigar(opt, m, str, which); }
+	if (m) { kputsn("\tMQ:i:", 6, str); kputw(m->mapq, str);}
 	if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
 	if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
 	if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
@@ -925,7 +959,10 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
 		if (p->alt_sc > 0)
 			ksprintf(str, "\tpa:f:%.3f", (double)p->score / p->alt_sc);
 	}
-	if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
+	if (p->XA) {
+		kputsn((opt->flag&MEM_F_XB)? "\tXB:Z:" : "\tXA:Z:", 6, str);
+		kputs(p->XA, str);
+	}
 	if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
 	if ((opt->flag&MEM_F_REF_HDR) && p->rid >= 0 && bns->anns[p->rid].anno != 0 && bns->anns[p->rid].anno[0] != 0) {
 		int tmp;
@@ -1019,7 +1056,8 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
 		if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
 		if (l && p->secondary < 0) // if supplementary
 			q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800;
-		if (l && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
+		if (!(opt->flag & MEM_F_KEEP_SUPP_MAPQ) && l && !p->is_alt && q->mapq > aa.a[0].mapq)
+			q->mapq = aa.a[0].mapq; // lower mapq for supplementary mappings, unless -5 or -q is applied
 		++l;
 	}
 	if (aa.n == 0) { // no alignments good enough; then write an unaligned record
diff --git a/bwamem.h b/bwamem.h
index 4e2b1d8c..0a0e3bb4 100644
--- a/bwamem.h
+++ b/bwamem.h
@@ -1,3 +1,29 @@
+/* The MIT License
+
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
 #ifndef BWAMEM_H_
 #define BWAMEM_H_
 
@@ -20,6 +46,8 @@ typedef struct __smem_i smem_i;
 #define MEM_F_SOFTCLIP  0x200
 #define MEM_F_SMARTPE   0x400
 #define MEM_F_PRIMARY5  0x800
+#define MEM_F_KEEP_SUPP_MAPQ 0x1000
+#define MEM_F_XB        0x2000
 
 typedef struct {
 	int a, b;               // match score and mismatch penalty
diff --git a/bwamem_extra.c b/bwamem_extra.c
index 54f70931..a87e18b5 100644
--- a/bwamem_extra.c
+++ b/bwamem_extra.c
@@ -1,3 +1,29 @@
+/* The MIT License
+
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
 #include <limits.h>
 #include "bwa.h"
 #include "bwamem.h"
@@ -128,6 +154,12 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
 			kputc("MIDSHN"[t.cigar[k]&0xf], &str);
 		}
 		kputc(',', &str); kputw(t.NM, &str);
+		if (opt->flag & MEM_F_XB) {
+			kputc(',', &str);
+			kputw(t.score, &str);
+			kputc(',', &str);
+			kputw(t.mapq, &str);
+		}
 		kputc(';', &str);
 		free(t.cigar);
 		kputsn(str.s, str.l, &aln[r]);
diff --git a/bwamem_pair.c b/bwamem_pair.c
index 9beb84b6..ef795218 100644
--- a/bwamem_pair.c
+++ b/bwamem_pair.c
@@ -1,3 +1,29 @@
+/* The MIT License
+
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
diff --git a/bwape.c b/bwape.c
index f2d3d8e0..fa4f7f7c 100644
--- a/bwape.c
+++ b/bwape.c
@@ -624,7 +624,8 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
 void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
 {
 	extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
-	int i, j, n_seqs, tot_seqs = 0;
+	int i, j, n_seqs;
+	long long tot_seqs = 0;
 	bwa_seq_t *seqs[2];
 	bwa_seqio_t *ks[2];
 	clock_t t;
@@ -711,7 +712,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
 
 		for (j = 0; j < 2; ++j)
 			bwa_free_read_seq(n_seqs, seqs[j]);
-		fprintf(stderr, "[bwa_sai2sam_pe_core] %d sequences have been processed.\n", tot_seqs);
+		fprintf(stderr, "[bwa_sai2sam_pe_core] %lld sequences have been processed.\n", tot_seqs);
 		last_ii = ii;
 	}
 
diff --git a/bwase.c b/bwase.c
index 77c50db9..18e86718 100644
--- a/bwase.c
+++ b/bwase.c
@@ -173,13 +173,15 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l
 	ubyte_t *rseq;
 	int64_t k, rb, re, rlen;
 	int8_t mat[25];
+	int w;
 
 	bwa_fill_scmat(1, 3, mat);
 	rb = *_rb; re = rb + len + ref_shift;
 	assert(re <= l_pac);
 	rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
 	assert(re - rb == rlen);
-	ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW > abs(rlen - len) * 1.5? SW_BW : abs(rlen - len) * 1.5, n_cigar, &cigar32);
+	w = abs((int)rlen - len) * 1.5;
+	ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW > w? SW_BW : w, n_cigar, &cigar32);
 	assert(*n_cigar > 0);
 	if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 3; // change endding ins to soft clipping
 	if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 3; // change beginning ins to soft clipping
@@ -505,7 +507,8 @@ void bwase_initialize()
 void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line)
 {
 	extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
-	int i, n_seqs, tot_seqs = 0, m_aln;
+	int i, n_seqs, m_aln;
+	long long tot_seqs = 0;
 	bwt_aln1_t *aln = 0;
 	bwa_seq_t *seqs;
 	bwa_seqio_t *ks;
@@ -563,7 +566,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
 		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
 
 		bwa_free_read_seq(n_seqs, seqs);
-		fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
+		fprintf(stderr, "[bwa_aln_core] %lld sequences have been processed.\n", tot_seqs);
 	}
 
 	// destroy
diff --git a/bwt.h b/bwt.h
index c71d6b5e..daba87fd 100644
--- a/bwt.h
+++ b/bwt.h
@@ -1,6 +1,8 @@
 /* The MIT License
 
-   Copyright (c) 2008 Genome Research Ltd (GRL).
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -23,7 +25,7 @@
    SOFTWARE.
 */
 
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
+/* Contact: Heng Li <hli@jimmy.harvard.edu> */
 
 #ifndef BWA_BWT_H
 #define BWA_BWT_H
diff --git a/bwt_gen.c b/bwt_gen.c
new file mode 100644
index 00000000..3adcc220
--- /dev/null
+++ b/bwt_gen.c
@@ -0,0 +1,1629 @@
+/*
+
+   BWTConstruct.c		BWT-Index Construction
+
+   This module constructs BWT and auxiliary data structures.
+
+   Copyright (C) 2004, Wong Chi Kwong.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; either version 2
+   of the License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <errno.h>
+#include "QSufSort.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+#  include "malloc_wrap.h"
+#endif
+
+typedef uint64_t bgint_t;
+typedef int64_t sbgint_t;
+
+#define ALPHABET_SIZE				4
+#define BIT_PER_CHAR				2
+#define CHAR_PER_WORD				16
+#define CHAR_PER_BYTE				4
+
+#define BITS_IN_WORD 32
+#define BITS_IN_BYTE 8
+#define BYTES_IN_WORD 4
+
+#define ALL_ONE_MASK 0xFFFFFFFF
+#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD	65536
+
+#define BITS_PER_OCC_VALUE			16
+#define OCC_VALUE_PER_WORD			2
+#define OCC_INTERVAL				256
+#define OCC_INTERVAL_MAJOR			65536
+
+#define TRUE    1
+#define FALSE   0
+
+#define BWTINC_INSERT_SORT_NUM_ITEM 7
+
+#define MIN_AVAILABLE_WORD 0x10000
+
+#define average(value1, value2)					( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 )
+#define min(value1, value2)						( ((value1) < (value2)) ? (value1) : (value2) )
+#define max(value1, value2)						( ((value1) > (value2)) ? (value1) : (value2) )
+#define med3(a, b, c)							( a<b ? (b<c ? b : a<c ? c : a) : (b>c ? b : a>c ? c : a))
+#define swap(a, b, t);							t = a; a = b; b = t;
+#define truncateLeft(value, offset)				( (value) << (offset) >> (offset) )
+#define truncateRight(value, offset)			( (value) >> (offset) << (offset) )
+#define DNA_OCC_SUM_EXCEPTION(sum)			((sum & 0xfefefeff) == 0)
+
+typedef struct BWT {
+	bgint_t textLength;					// length of the text
+	bgint_t inverseSa0;					// SA-1[0]
+	bgint_t *cumulativeFreq;			// cumulative frequency
+	unsigned int *bwtCode;				// BWT code
+	unsigned int *occValue;				// Occurrence values stored explicitly
+	bgint_t *occValueMajor;				// Occurrence values stored explicitly
+	unsigned int *decodeTable;			// For decoding BWT by table lookup
+	bgint_t bwtSizeInWord;				// Temporary variable to hold the memory allocated
+	bgint_t occSizeInWord;				// Temporary variable to hold the memory allocated
+	bgint_t occMajorSizeInWord;			// Temporary variable to hold the memory allocated
+} BWT;
+
+typedef struct BWTInc {
+	BWT *bwt;
+	unsigned int numberOfIterationDone;
+	bgint_t *cumulativeCountInCurrentBuild;
+	bgint_t availableWord;
+	bgint_t buildSize;
+	bgint_t initialMaxBuildSize;
+	bgint_t incMaxBuildSize;
+	unsigned int firstCharInLastIteration;
+	unsigned int *workingMemory;
+	unsigned int *packedText;
+	unsigned char *textBuffer;
+	unsigned int *packedShift;
+} BWTInc;
+
+static bgint_t TextLengthFromBytePacked(bgint_t bytePackedLength, unsigned int bitPerChar,
+											 unsigned int lastByteLength)
+{
+	return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength;
+}
+
+static void initializeVAL(unsigned int *startAddr, const bgint_t length, const unsigned int initValue)
+{
+	bgint_t i;
+	for (i=0; i<length; i++) startAddr[i] = initValue;
+}
+
+static void initializeVAL_bg(bgint_t *startAddr, const bgint_t length, const bgint_t initValue)
+{
+	bgint_t i;
+	for (i=0; i<length; i++) startAddr[i] = initValue;
+}
+
+static void GenerateDNAOccCountTable(unsigned int *dnaDecodeTable)
+{
+	unsigned int i, j, c, t;
+
+	for (i=0; i<DNA_OCC_CNT_TABLE_SIZE_IN_WORD; i++) {
+		dnaDecodeTable[i] = 0;
+		c = i;
+		for (j=0; j<8; j++) {
+			t = c & 0x00000003;
+			dnaDecodeTable[i] += 1 << (t * 8);
+			c >>= 2;
+		}
+	}
+
+}
+// for BWTIncCreate()
+static bgint_t BWTOccValueMajorSizeInWord(const bgint_t numChar)
+{
+	bgint_t numOfOccValue;
+	unsigned numOfOccIntervalPerMajor;
+	numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding
+	numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
+	return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE;
+}
+// for BWTIncCreate()
+static bgint_t BWTOccValueMinorSizeInWord(const bgint_t numChar)
+{
+	bgint_t numOfOccValue;
+	numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;		// Value at both end for bi-directional encoding
+	return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE;
+}
+// for BWTIncCreate()
+static bgint_t BWTResidentSizeInWord(const bgint_t numChar) {
+
+	bgint_t numCharRoundUpToOccInterval;
+
+	// The $ in BWT at the position of inverseSa0 is not encoded
+	numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL;
+
+	return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
+
+}
+
+static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc)
+{
+	bgint_t maxBuildSize;
+
+	if (bwtInc->bwt->textLength == 0) {
+		// initial build
+		// Minus 2 because n+1 entries of seq and rank needed for n char
+		maxBuildSize = (bwtInc->availableWord - (2 + OCC_INTERVAL / CHAR_PER_WORD) * (sizeof(bgint_t) / 4))
+							/ (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD / (sizeof(bgint_t) / 4);
+		if (bwtInc->initialMaxBuildSize > 0) {
+			bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize);
+		} else {
+			bwtInc->buildSize = maxBuildSize;
+		}
+	} else {
+		// Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char
+		// Minus numberOfIterationDone because bwt slightly shift to left in each iteration
+		maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord
+							 - (3 + bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) * (sizeof(bgint_t) / 4)) 
+							 / 3 / (sizeof(bgint_t) / 4);
+		if (maxBuildSize < CHAR_PER_WORD) {
+			fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
+			exit(1);
+		}
+		if (bwtInc->incMaxBuildSize > 0) {
+            bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize);
+		} else {
+			bwtInc->buildSize = maxBuildSize;
+		}
+		if (bwtInc->buildSize < CHAR_PER_WORD)
+			bwtInc->buildSize = CHAR_PER_WORD;
+	}
+
+	if (bwtInc->buildSize < CHAR_PER_WORD) {
+		fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n");
+		exit(1);
+	}
+
+	bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
+
+	bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4);
+	bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4));
+}
+
+// for ceilLog2()
+unsigned int leadingZero(const unsigned int input)
+{
+	unsigned int l;
+	const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+											 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+											 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+											 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+											 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+											 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+											 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+											 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+	if (input & 0xFFFF0000) {
+		if (input & 0xFF000000) {
+			l = leadingZero8bit[input >> 24];
+		} else {
+			l = 8 + leadingZero8bit[input >> 16];
+		}
+	} else {
+		if (input & 0x0000FF00) {
+			l = 16 + leadingZero8bit[input >> 8];
+		} else {
+			l = 24 + leadingZero8bit[input];
+		}
+	}
+	return l;
+
+}
+// for BitPerBytePackedChar()
+static unsigned int ceilLog2(const unsigned int input)
+{
+	if (input <= 1) return 0;
+	return BITS_IN_WORD - leadingZero(input - 1);
+
+}
+// for ConvertBytePackedToWordPacked()
+static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize)
+{
+	unsigned int bitPerChar;
+	bitPerChar = ceilLog2(alphabetSize);
+	// Return the largest number of bit that does not affect packing efficiency
+	if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar)
+		bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar);
+	return bitPerChar;
+}
+// for ConvertBytePackedToWordPacked()
+static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize)
+{
+	return ceilLog2(alphabetSize);
+}
+
+static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize,
+										  const bgint_t textLength)
+{
+	bgint_t i;
+	unsigned int j, k, c;
+	unsigned int bitPerBytePackedChar;
+	unsigned int bitPerWordPackedChar;
+	unsigned int charPerWord;
+	unsigned int charPerByte;
+	unsigned int bytePerIteration;
+	bgint_t byteProcessed = 0;
+	bgint_t wordProcessed = 0;
+	unsigned int mask, shift;
+	
+	unsigned int buffer[BITS_IN_WORD];
+
+	bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize);
+	bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize);
+	charPerByte = BITS_IN_BYTE / bitPerBytePackedChar;
+	charPerWord = BITS_IN_WORD / bitPerWordPackedChar;
+
+	bytePerIteration = charPerWord / charPerByte;
+	mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar);
+	shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar;
+
+	while ((wordProcessed + 1) * charPerWord < textLength) {
+
+		k = 0;
+		for (i=0; i<bytePerIteration; i++) {
+			c = (unsigned int)input[byteProcessed] << shift;
+			for (j=0; j<charPerByte; j++) {
+				buffer[k] = c & mask;
+				c <<= bitPerBytePackedChar;
+				k++;
+			}
+			byteProcessed++;
+		}
+
+		c = 0;
+		for (i=0; i<charPerWord; i++) {
+			c |= buffer[i] >> bitPerWordPackedChar * i;
+		}
+		output[wordProcessed] = c;
+		wordProcessed++;
+
+	}
+
+	k = 0;
+	for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) {
+		c = (unsigned int)input[byteProcessed] << shift;
+		for (j=0; j<charPerByte; j++) {
+			buffer[k] = c & mask;
+			c <<= bitPerBytePackedChar;
+			k++;
+		}
+		byteProcessed++;
+	}
+
+	c = 0;
+	for (i=0; i<textLength - wordProcessed * charPerWord; i++) {
+		c |= buffer[i] >> bitPerWordPackedChar * i;
+	}
+	output[wordProcessed] = c;
+}
+
+BWT *BWTCreate(const bgint_t textLength, unsigned int *decodeTable)
+{
+	BWT *bwt;
+
+	bwt = (BWT*)calloc(1, sizeof(BWT));
+
+	bwt->textLength = 0;
+
+	bwt->cumulativeFreq = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t));
+	initializeVAL_bg(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0);
+
+	bwt->bwtSizeInWord = 0;
+
+	// Generate decode tables
+	if (decodeTable == NULL) {
+		bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int));
+		GenerateDNAOccCountTable(bwt->decodeTable);
+	} else {
+		// FIXME Prevent BWTFree() from freeing decodeTable in this case
+		bwt->decodeTable = decodeTable;
+	}
+
+	bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength);
+	bwt->occValueMajor = (bgint_t*)calloc(bwt->occMajorSizeInWord, sizeof(bgint_t));
+
+	bwt->occSizeInWord = 0;
+	bwt->occValue = NULL;
+
+	return bwt;
+}
+
+BWTInc *BWTIncCreate(const bgint_t textLength, unsigned int initialMaxBuildSize, unsigned int incMaxBuildSize)
+{
+	BWTInc *bwtInc;
+	unsigned int i, n_iter;
+
+	if (textLength < incMaxBuildSize) incMaxBuildSize = textLength;
+	if (textLength < initialMaxBuildSize) initialMaxBuildSize = textLength;
+
+	bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc));
+	bwtInc->numberOfIterationDone = 0;
+	bwtInc->bwt = BWTCreate(textLength, NULL);
+	bwtInc->initialMaxBuildSize = initialMaxBuildSize;
+	bwtInc->incMaxBuildSize = incMaxBuildSize;
+	bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t));
+	initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
+
+	// Build frequently accessed data
+	bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int));
+	for (i=0; i<CHAR_PER_WORD; i++)
+		bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR;
+
+	n_iter = (textLength - initialMaxBuildSize) / incMaxBuildSize + 1;
+	bwtInc->availableWord = BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength) // minimal memory requirement
+		+ OCC_INTERVAL / BIT_PER_CHAR * n_iter * 2 * (sizeof(bgint_t) / 4) // buffer at the end of occ array 
+		+ incMaxBuildSize/5 * 3 * (sizeof(bgint_t) / 4); // space for the 3 temporary arrays in each iteration
+	if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; // lh3: otherwise segfaul when availableWord is too small
+	fprintf(stderr, "[%s] textLength=%ld, availableWord=%ld\n", __func__, (long)textLength, (long)bwtInc->availableWord);
+	bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD);
+
+	return bwtInc;
+}
+// for BWTIncConstruct()
+static void BWTIncPutPackedTextToRank(const unsigned int *packedText, bgint_t* __restrict rank,
+									  bgint_t* __restrict cumulativeCount, const bgint_t numChar)
+{
+	bgint_t i;
+	unsigned int j;
+	unsigned int c, t;
+	unsigned int packedMask;
+	bgint_t rankIndex;
+	bgint_t lastWord;
+	unsigned int numCharInLastWord;
+
+	lastWord = (numChar - 1) / CHAR_PER_WORD;
+	numCharInLastWord = numChar - lastWord * CHAR_PER_WORD;
+
+	packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR);
+	rankIndex = numChar - 1;
+
+	t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR);
+	for (i=0; i<numCharInLastWord; i++) {
+		c = t & packedMask;
+		cumulativeCount[c+1]++;
+		rank[rankIndex] = c;
+		rankIndex--;
+		t >>= BIT_PER_CHAR;
+	}
+
+	for (i=lastWord; i--;) {	// loop from lastWord - 1 to 0
+		t = packedText[i];
+		for (j=0; j<CHAR_PER_WORD; j++) {
+			c = t & packedMask;
+			cumulativeCount[c+1]++;
+			rank[rankIndex] = c;
+			rankIndex--;
+			t >>= BIT_PER_CHAR;
+		}
+	}
+
+	// Convert occurrence to cumulativeCount
+	cumulativeCount[2] += cumulativeCount[1];
+	cumulativeCount[3] += cumulativeCount[2];
+	cumulativeCount[4] += cumulativeCount[3];
+}
+
+
+static void ForwardDNAAllOccCountNoLimit(const unsigned int*  dna, const bgint_t index,
+										 bgint_t* __restrict occCount, const unsigned int*  dnaDecodeTable)
+{
+	static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
+											   0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
+											   0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
+											   0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
+
+	bgint_t iteration, i;
+	unsigned int wordToCount, charToCount;
+	unsigned int j, c, sum;
+
+	occCount[0] = 0;
+	occCount[1] = 0;
+	occCount[2] = 0;
+	occCount[3] = 0;
+
+	iteration = index / 256;
+	wordToCount = (index - iteration * 256) / 16;
+	charToCount = index - iteration * 256 - wordToCount * 16;
+
+	for (i=0; i<iteration; i++) {
+
+		sum = 0;
+		for (j=0; j<16; j++) {
+			sum += dnaDecodeTable[*dna >> 16];
+			sum += dnaDecodeTable[*dna & 0x0000FFFF];
+			dna++;
+		}
+		if (!DNA_OCC_SUM_EXCEPTION(sum)) {
+			occCount[0] += sum & 0x000000FF;	sum >>= 8;
+			occCount[1] += sum & 0x000000FF;	sum >>= 8;
+			occCount[2] += sum & 0x000000FF;	sum >>= 8;
+			occCount[3] += sum;
+		} else {
+			// only some or all of the 3 bits are on
+			// in reality, only one of the four cases are possible
+			if (sum == 0x00000100) {
+				occCount[0] += 256;
+			} else if (sum == 0x00010000) {
+				occCount[1] += 256;
+			} else if (sum == 0x01000000) {
+				occCount[2] += 256;
+			} else if (sum == 0x00000000) {
+				occCount[3] += 256;
+			} else {
+				fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n");
+				exit(1);
+			}
+		}
+
+	}
+
+	sum = 0;
+	for (j=0; j<wordToCount; j++) {
+		sum += dnaDecodeTable[*dna >> 16];
+		sum += dnaDecodeTable[*dna & 0x0000FFFF];
+		dna++;
+	}
+
+	if (charToCount > 0) {
+		c = *dna & truncateRightMask[charToCount];	// increase count of 'a' by 16 - c;
+		sum += dnaDecodeTable[c >> 16];
+		sum += dnaDecodeTable[c & 0xFFFF];
+		sum += charToCount - 16;	// decrease count of 'a' by 16 - positionToProcess
+	}
+
+	occCount[0] += sum & 0x000000FF;	sum >>= 8;
+	occCount[1] += sum & 0x000000FF;	sum >>= 8;
+	occCount[2] += sum & 0x000000FF;	sum >>= 8;
+	occCount[3] += sum;
+}
+
+static void BWTIncBuildPackedBwt(const bgint_t *relativeRank, unsigned int* __restrict bwt, const bgint_t numChar,
+								 const bgint_t *cumulativeCount, const unsigned int *packedShift) {
+
+	bgint_t i, r;
+	unsigned int c;
+	bgint_t previousRank, currentRank;
+	bgint_t wordIndex, charIndex;
+	bgint_t inverseSa0;
+
+	inverseSa0 = previousRank = relativeRank[0];
+
+	for (i=1; i<=numChar; i++) {
+		currentRank = relativeRank[i];
+		// previousRank > cumulativeCount[c] because $ is one of the char
+		c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2]) 
+											    + (previousRank > cumulativeCount[3]);
+		// set bwt for currentRank
+		if (c > 0) {
+			// c <> 'a'
+			r = currentRank;
+			if (r > inverseSa0) {
+				// - 1 because $ at inverseSa0 is not encoded			
+				r--;
+			}
+			wordIndex = r / CHAR_PER_WORD;
+			charIndex = r - wordIndex * CHAR_PER_WORD;
+			bwt[wordIndex] |= c << packedShift[charIndex];
+		}
+		previousRank = currentRank;
+	}
+}
+
+static inline bgint_t BWTOccValueExplicit(const BWT *bwt, const bgint_t occIndexExplicit,
+											   const unsigned int character)
+{
+	bgint_t occIndexMajor;
+
+	occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR;
+
+	if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) {
+		return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
+			   (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16);
+
+	} else {
+		return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] +
+			   (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF);
+	}
+}
+
+
+static unsigned int ForwardDNAOccCount(const unsigned int*  dna, const unsigned int index, const unsigned int character,
+									   const unsigned int*  dnaDecodeTable)
+{
+	static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000,
+											   0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000,
+											   0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00,
+											   0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC };
+
+	unsigned int wordToCount, charToCount;
+	unsigned int i, c;
+	unsigned int sum = 0;
+
+	wordToCount = index / 16;
+	charToCount = index - wordToCount * 16;
+
+	for (i=0; i<wordToCount; i++) {
+		sum += dnaDecodeTable[dna[i] >> 16];
+		sum += dnaDecodeTable[dna[i] & 0x0000FFFF];
+	}
+
+	if (charToCount > 0) {
+		c = dna[i] & truncateRightMask[charToCount];	// increase count of 'a' by 16 - c;
+		sum += dnaDecodeTable[c >> 16];
+		sum += dnaDecodeTable[c & 0xFFFF];
+		sum += charToCount - 16;	// decrease count of 'a' by 16 - positionToProcess
+	}
+
+	return (sum >> (character * 8)) & 0x000000FF;
+
+}
+
+static unsigned int BackwardDNAOccCount(const unsigned int*  dna, const unsigned int index, const unsigned int character,
+										const unsigned int*  dnaDecodeTable)
+{
+	static const unsigned int truncateLeftMask[16] =  { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F,
+											   0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF,
+											   0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF,
+											   0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF };
+
+	unsigned int wordToCount, charToCount;
+	unsigned int i, c;
+	unsigned int sum = 0;
+
+	wordToCount = index / 16;
+	charToCount = index - wordToCount * 16;
+
+	dna -= wordToCount + 1;
+
+	if (charToCount > 0) {
+		c = *dna & truncateLeftMask[charToCount];	// increase count of 'a' by 16 - c;
+		sum += dnaDecodeTable[c >> 16];
+		sum += dnaDecodeTable[c & 0xFFFF];
+		sum += charToCount - 16;	// decrease count of 'a' by 16 - positionToProcess
+	}
+	
+	for (i=0; i<wordToCount; i++) {
+		dna++;
+		sum += dnaDecodeTable[*dna >> 16];
+		sum += dnaDecodeTable[*dna & 0x0000FFFF];
+	}
+
+	return (sum >> (character * 8)) & 0x000000FF;
+
+}
+
+bgint_t BWTOccValue(const BWT *bwt, bgint_t index, const unsigned int character)
+{
+	bgint_t occValue;
+	bgint_t occExplicitIndex, occIndex;
+
+	// $ is supposed to be positioned at inverseSa0 but it is not encoded
+	// therefore index is subtracted by 1 for adjustment
+	if (index > bwt->inverseSa0)
+		index--;
+
+	occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL;	// Bidirectional encoding
+	occIndex = occExplicitIndex * OCC_INTERVAL;
+	occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character);
+
+	if (occIndex == index)
+		return occValue;
+
+	if (occIndex < index) {
+		return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable);
+	} else {
+		return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable);
+	}
+}
+
+static bgint_t BWTIncGetAbsoluteRank(BWT *bwt, bgint_t* __restrict absoluteRank, bgint_t* __restrict seq,
+										  const unsigned int *packedText, const bgint_t numChar,
+										  const bgint_t* cumulativeCount, const unsigned int firstCharInLastIteration)
+{
+	bgint_t saIndex;
+	bgint_t lastWord;
+	unsigned int packedMask;
+	bgint_t i;
+	unsigned int c, t, j;
+	bgint_t rankIndex;
+	unsigned int shift;
+	bgint_t seqIndexFromStart[ALPHABET_SIZE];
+	bgint_t seqIndexFromEnd[ALPHABET_SIZE];
+
+	for (i=0; i<ALPHABET_SIZE; i++) {
+		seqIndexFromStart[i] = cumulativeCount[i];
+		seqIndexFromEnd[i] = cumulativeCount[i+1] - 1;
+	}
+
+	shift = BITS_IN_WORD - BIT_PER_CHAR;
+	packedMask = ALL_ONE_MASK >> shift;
+	saIndex = bwt->inverseSa0;
+	rankIndex = numChar - 1;
+
+	lastWord = numChar / CHAR_PER_WORD;
+	for (i=lastWord; i--;) {	// loop from lastWord - 1 to 0
+		t = packedText[i];
+		for (j=0; j<CHAR_PER_WORD; j++) {
+			c = t & packedMask;
+			saIndex = bwt->cumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1;
+			// A counting sort using the first character of suffix is done here
+			// If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0
+			if (saIndex > bwt->inverseSa0) {
+				seq[seqIndexFromEnd[c]] = rankIndex;
+				absoluteRank[seqIndexFromEnd[c]] = saIndex;
+				seqIndexFromEnd[c]--;
+			} else {
+				seq[seqIndexFromStart[c]] = rankIndex;
+				absoluteRank[seqIndexFromStart[c]] = saIndex;
+				seqIndexFromStart[c]++;
+			}
+			rankIndex--;
+			t >>= BIT_PER_CHAR;
+		}
+	}
+
+	absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0;	// representing the substring of all preceding characters
+	seq[seqIndexFromStart[firstCharInLastIteration]] = numChar;
+
+	return seqIndexFromStart[firstCharInLastIteration];
+}
+
+static void BWTIncSortKey(bgint_t* __restrict key, bgint_t* __restrict seq, const bgint_t numItem)
+{
+	#define EQUAL_KEY_THRESHOLD	4	// Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD
+
+	int64_t lowIndex, highIndex, midIndex;
+	int64_t lowPartitionIndex, highPartitionIndex;
+	int64_t lowStack[32], highStack[32];
+	int stackDepth;
+	int64_t i, j;
+	bgint_t tempSeq, tempKey;
+	int64_t numberOfEqualKey;
+
+	if (numItem < 2) return;
+
+	stackDepth = 0;
+
+    lowIndex = 0;
+    highIndex = numItem - 1;
+
+	for (;;) {
+
+		for (;;) {
+
+			// Sort small array of data
+			if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) {	 // Insertion sort on smallest arrays
+				for (i=lowIndex+1; i<=highIndex; i++) {
+					tempSeq = seq[i];
+					tempKey = key[i];
+					for (j = i; j > lowIndex && key[j-1] > tempKey; j--) {
+						seq[j] = seq[j-1];
+						key[j] = key[j-1];
+					}
+					if (j != i) {
+						seq[j] = tempSeq;
+						key[j] = tempKey;
+					}
+				}
+				break;
+			}
+
+			// Choose pivot as median of the lowest, middle, and highest data; sort the three data
+
+			midIndex = average(lowIndex, highIndex);
+			if (key[lowIndex] > key[midIndex]) {
+				tempSeq = seq[lowIndex];
+				tempKey = key[lowIndex];
+				seq[lowIndex] = seq[midIndex];
+				key[lowIndex] = key[midIndex];
+				seq[midIndex] = tempSeq;
+				key[midIndex] = tempKey;
+			}
+			if (key[lowIndex] > key[highIndex]) {
+				tempSeq = seq[lowIndex];
+				tempKey = key[lowIndex];
+				seq[lowIndex] = seq[highIndex];
+				key[lowIndex] = key[highIndex];
+				seq[highIndex] = tempSeq;
+				key[highIndex] = tempKey;
+			}
+			if (key[midIndex] > key[highIndex]) {
+				tempSeq = seq[midIndex];
+				tempKey = key[midIndex];
+				seq[midIndex] = seq[highIndex];
+				key[midIndex] = key[highIndex];
+				seq[highIndex] = tempSeq;
+				key[highIndex] = tempKey;
+			}
+
+			// Partition data
+
+			numberOfEqualKey = 0;
+
+			lowPartitionIndex = lowIndex + 1;
+			highPartitionIndex = highIndex - 1;
+
+			for (;;) {
+				while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) {
+					numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]);
+					lowPartitionIndex++;
+				}
+				while (lowPartitionIndex < highPartitionIndex) {
+					if (key[midIndex] >= key[highPartitionIndex]) {
+						numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]);
+						break;
+					}
+					highPartitionIndex--;
+				}
+				if (lowPartitionIndex >= highPartitionIndex) {
+					break;
+				}
+				tempSeq = seq[lowPartitionIndex];
+				tempKey = key[lowPartitionIndex];
+				seq[lowPartitionIndex] = seq[highPartitionIndex];
+				key[lowPartitionIndex] = key[highPartitionIndex];
+				seq[highPartitionIndex] = tempSeq;
+				key[highPartitionIndex] = tempKey;
+				if (highPartitionIndex == midIndex) {
+					// partition key has been moved
+					midIndex = lowPartitionIndex;
+				}
+				lowPartitionIndex++;
+				highPartitionIndex--;
+			}
+
+			// Adjust the partition index
+			highPartitionIndex = lowPartitionIndex;
+			lowPartitionIndex--;
+
+			// move the partition key to end of low partition
+			tempSeq = seq[midIndex];
+			tempKey = key[midIndex];
+			seq[midIndex] = seq[lowPartitionIndex];
+			key[midIndex] = key[lowPartitionIndex];
+			seq[lowPartitionIndex] = tempSeq;
+			key[lowPartitionIndex] = tempKey;
+
+			if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) {
+
+				// Many keys = partition key; separate the equal key data from the lower partition
+		
+				midIndex = lowIndex;
+
+				for (;;) {
+					while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) {
+						midIndex++;
+					}
+					while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) {
+						lowPartitionIndex--;
+					}
+					if (midIndex >= lowPartitionIndex) {
+						break;
+					}
+					tempSeq = seq[midIndex];
+					tempKey = key[midIndex];
+					seq[midIndex] = seq[lowPartitionIndex - 1];
+					key[midIndex] = key[lowPartitionIndex - 1];
+					seq[lowPartitionIndex - 1] = tempSeq;
+					key[lowPartitionIndex - 1] = tempKey;
+					midIndex++;
+					lowPartitionIndex--;
+				}
+
+			}
+
+			if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) {
+				// put the larger partition to stack
+				lowStack[stackDepth] = lowIndex;
+				highStack[stackDepth] = lowPartitionIndex - 1;
+				stackDepth++;
+				// sort the smaller partition first
+				lowIndex = highPartitionIndex;
+			} else {
+				// put the larger partition to stack
+				lowStack[stackDepth] = highPartitionIndex;
+				highStack[stackDepth] = highIndex;
+				stackDepth++;
+				// sort the smaller partition first
+				if (lowPartitionIndex > lowIndex) {
+					highIndex = lowPartitionIndex - 1;
+				} else {
+					// all keys in the partition equals to the partition key
+					break;
+				}
+			}
+			continue;
+		}
+
+		// Pop a range from stack
+		if (stackDepth > 0) {
+			stackDepth--;
+			lowIndex = lowStack[stackDepth];
+			highIndex = highStack[stackDepth];
+			continue;
+		} else return;
+	}
+}
+
+
+static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __restrict seq,
+									bgint_t* __restrict relativeRank, const bgint_t numItem,
+									bgint_t oldInverseSa0, const bgint_t *cumulativeCount)
+{
+	bgint_t i, c;
+	bgint_t s, r;
+	bgint_t lastRank, lastIndex;
+	bgint_t oldInverseSa0RelativeRank = 0;
+	bgint_t freq;
+
+	lastIndex = numItem;
+	lastRank = sortedRank[numItem];
+	if (lastRank > oldInverseSa0) {
+		sortedRank[numItem]--;	// to prepare for merging; $ is not encoded in bwt
+	}
+	s = seq[numItem];
+	relativeRank[s] = numItem;
+	if (lastRank == oldInverseSa0) {
+		oldInverseSa0RelativeRank = numItem;
+		oldInverseSa0++;	// so that this segment of code is not run again
+		lastRank++;			// so that oldInverseSa0 become a sorted group with 1 item
+	}
+
+	c = ALPHABET_SIZE - 1;
+	freq = cumulativeCount[c];
+
+	for (i=numItem; i--;) {	// from numItem - 1 to 0
+		r = sortedRank[i];
+		if (r > oldInverseSa0)
+			sortedRank[i]--;	// to prepare for merging; $ is not encoded in bwt
+		s = seq[i];
+		if (i < freq) {
+			if (lastIndex >= freq)
+				lastRank++;	// to trigger the group across alphabet boundary to be split
+			c--;
+			freq = cumulativeCount[c];
+		}
+		if (r == lastRank) {
+			relativeRank[s] = lastIndex;
+		} else {
+			if (i == lastIndex - 1) {
+				if (lastIndex < numItem && (sbgint_t)seq[lastIndex + 1] < 0) {
+					seq[lastIndex] = seq[lastIndex + 1] - 1;
+				} else {
+					seq[lastIndex] = (bgint_t)-1;
+				}
+			}
+			lastIndex = i;
+			lastRank = r;
+			relativeRank[s] = i;
+			if (r == oldInverseSa0) {
+				oldInverseSa0RelativeRank = i;
+				oldInverseSa0++;	// so that this segment of code is not run again
+				lastRank++;			// so that oldInverseSa0 become a sorted group with 1 item
+			}
+		}
+	}
+
+}
+
+static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank, const bgint_t numChar,
+						   const bgint_t *cumulativeCount)
+{
+	unsigned int c;
+	bgint_t i;
+	bgint_t previousRank, currentRank;
+
+	previousRank = relativeRank[0];
+
+	for (i=1; i<=numChar; i++) {
+		currentRank = relativeRank[i];
+		c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2])
+											  	 + (previousRank >= cumulativeCount[3]);
+		insertBwt[currentRank] = c;
+		previousRank = currentRank;
+	}
+}
+
+static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt,
+						   unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt)
+{
+	unsigned int bitsInWordMinusBitPerChar;
+	bgint_t leftShift, rightShift;
+	bgint_t o;
+	bgint_t oIndex, iIndex, mIndex;
+	bgint_t mWord, mChar, oWord, oChar;
+	bgint_t numInsert;
+
+	bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR;
+
+	oIndex = 0;
+	iIndex = 0;
+	mIndex = 0;
+
+	mWord = 0;
+	mChar = 0;
+
+	mergedBwt[0] = 0;	// this can be cleared as merged Bwt slightly shift to the left in each iteration
+
+	while (oIndex < numOldBwt) {
+
+		// copy from insertBwt
+		while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) {
+			if (sortedRank[iIndex] != 0) {	// special value to indicate that this is for new inverseSa0
+				mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
+				mIndex++;
+				mChar++;
+				if (mChar == CHAR_PER_WORD) {
+					mChar = 0;
+					mWord++;
+					mergedBwt[mWord] = 0;	// no need to worry about crossing mergedBwt boundary
+				}
+			}
+			iIndex++;
+		}
+
+		// Copy from oldBwt to mergedBwt
+		if (iIndex <= numInsertBwt) {
+			o = sortedRank[iIndex];
+		} else {
+			o = numOldBwt;
+		}
+		numInsert = o - oIndex;
+
+		oWord = oIndex / CHAR_PER_WORD;
+		oChar = oIndex - oWord * CHAR_PER_WORD;
+		if (oChar > mChar) {
+			leftShift = (oChar - mChar) * BIT_PER_CHAR;
+			rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR;
+			mergedBwt[mWord] = mergedBwt[mWord]
+								| (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR))
+								| (oldBwt[oWord+1] >> rightShift);
+			oIndex += min(numInsert, CHAR_PER_WORD - mChar);
+			while (o > oIndex) {
+				oWord++;
+				mWord++;
+				mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift);
+				oIndex += CHAR_PER_WORD;
+			}
+		} else if (oChar < mChar) {
+			rightShift = (mChar - oChar) * BIT_PER_CHAR;
+			leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR;
+			mergedBwt[mWord] = mergedBwt[mWord] 
+								| (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR));
+			oIndex += min(numInsert, CHAR_PER_WORD - mChar);
+			while (o > oIndex) {
+				oWord++;
+				mWord++;
+				mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift);
+				oIndex += CHAR_PER_WORD;
+			}
+		} else { // oChar == mChar
+			mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR);
+			oIndex += min(numInsert, CHAR_PER_WORD - mChar);
+			while (o > oIndex) {
+				oWord++;
+				mWord++;
+				mergedBwt[mWord] = oldBwt[oWord];
+				oIndex += CHAR_PER_WORD;
+			}
+		}
+		oIndex = o;
+		mIndex += numInsert;
+
+		// Clear the trailing garbage in mergedBwt
+		mWord = mIndex / CHAR_PER_WORD;
+		mChar = mIndex - mWord * CHAR_PER_WORD;
+		if (mChar == 0) {
+			mergedBwt[mWord] = 0;
+		} else {
+			mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR));
+		}
+
+	}
+
+	// copy from insertBwt
+	while (iIndex <= numInsertBwt) {
+		if (sortedRank[iIndex] != 0) {
+			mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR);
+			mIndex++;
+			mChar++;
+			if (mChar == CHAR_PER_WORD) {
+				mChar = 0;
+				mWord++;
+				mergedBwt[mWord] = 0;	// no need to worry about crossing mergedBwt boundary
+			}
+		}
+		iIndex++;
+	}
+}
+
+void BWTClearTrailingBwtCode(BWT *bwt)
+{
+	bgint_t bwtResidentSizeInWord;
+	bgint_t wordIndex, offset;
+	bgint_t i;
+
+	bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength);
+
+	wordIndex = bwt->textLength / CHAR_PER_WORD;
+	offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR;
+	if (offset > 0) {
+		bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset);
+	} else {
+		if (wordIndex < bwtResidentSizeInWord) {
+			bwt->bwtCode[wordIndex] = 0;
+		}
+	}
+
+	for (i=wordIndex+1; i<bwtResidentSizeInWord; i++) {
+		bwt->bwtCode[i] = 0;
+	}
+}
+
+
+void BWTGenerateOccValueFromBwt(const unsigned int*  bwt, unsigned int* __restrict occValue,
+								bgint_t* __restrict occValueMajor,
+								const bgint_t textLength, const unsigned int*  decodeTable)
+{
+	bgint_t numberOfOccValueMajor, numberOfOccValue;
+	unsigned int wordBetweenOccValue;
+	bgint_t numberOfOccIntervalPerMajor;
+	unsigned int c;
+	bgint_t i, j;
+	bgint_t occMajorIndex;
+	bgint_t occIndex, bwtIndex;
+	bgint_t sum; // perhaps unsigned is big enough
+	bgint_t tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE];
+
+	wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD;
+
+	// Calculate occValue
+	numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;				// Value at both end for bi-directional encoding
+	numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL;
+	numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor;
+
+	tempOccValue0[0] = 0;
+	tempOccValue0[1] = 0;
+	tempOccValue0[2] = 0;
+	tempOccValue0[3] = 0;
+	occValueMajor[0] = 0;
+	occValueMajor[1] = 0;
+	occValueMajor[2] = 0;
+	occValueMajor[3] = 0;
+
+	occIndex = 0;
+	bwtIndex = 0;
+	for (occMajorIndex=1; occMajorIndex<numberOfOccValueMajor; occMajorIndex++) {
+
+		for (i=0; i<numberOfOccIntervalPerMajor/2; i++) {
+
+			sum = 0;
+			tempOccValue1[0] = tempOccValue0[0];
+			tempOccValue1[1] = tempOccValue0[1];
+			tempOccValue1[2] = tempOccValue0[2];
+			tempOccValue1[3] = tempOccValue0[3];
+
+			for (j=0; j<wordBetweenOccValue; j++) {
+				c = bwt[bwtIndex];
+				sum += decodeTable[c >> 16];
+				sum += decodeTable[c & 0x0000FFFF];
+				bwtIndex++;
+			}
+			if (!DNA_OCC_SUM_EXCEPTION(sum)) {
+				tempOccValue1[0] += (sum & 0x000000FF);	sum >>= 8;
+				tempOccValue1[1] += (sum & 0x000000FF);	sum >>= 8;
+				tempOccValue1[2] += (sum & 0x000000FF);	sum >>= 8;
+				tempOccValue1[3] += sum;
+			} else {
+				if (sum == 0x00000100) {
+					tempOccValue1[0] += 256;
+				} else if (sum == 0x00010000) {
+					tempOccValue1[1] += 256;
+				} else if (sum == 0x01000000) {
+					tempOccValue1[2] += 256;
+				} else {
+					tempOccValue1[3] += 256;
+				}
+			}
+			occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
+			occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
+			occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
+			occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
+			tempOccValue0[0] = tempOccValue1[0];
+			tempOccValue0[1] = tempOccValue1[1];
+			tempOccValue0[2] = tempOccValue1[2];
+			tempOccValue0[3] = tempOccValue1[3];
+			sum = 0;
+
+			occIndex++;
+
+			for (j=0; j<wordBetweenOccValue; j++) {
+				c = bwt[bwtIndex];
+				sum += decodeTable[c >> 16];
+				sum += decodeTable[c & 0x0000FFFF];
+				bwtIndex++;
+			}
+			if (!DNA_OCC_SUM_EXCEPTION(sum)) {
+				tempOccValue0[0] += (sum & 0x000000FF);	sum >>= 8;
+				tempOccValue0[1] += (sum & 0x000000FF);	sum >>= 8;
+				tempOccValue0[2] += (sum & 0x000000FF);	sum >>= 8;
+				tempOccValue0[3] += sum;
+			} else {
+				if (sum == 0x00000100) {
+					tempOccValue0[0] += 256;
+				} else if (sum == 0x00010000) {
+					tempOccValue0[1] += 256;
+				} else if (sum == 0x01000000) {
+					tempOccValue0[2] += 256;
+				} else {
+					tempOccValue0[3] += 256;
+				}
+			}
+		}
+
+		occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0];
+		occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1];
+		occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2];
+		occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3];
+		tempOccValue0[0] = 0;
+		tempOccValue0[1] = 0;
+		tempOccValue0[2] = 0;
+		tempOccValue0[3] = 0;
+
+	}
+
+	while (occIndex < (numberOfOccValue-1)/2) {
+		sum = 0;
+		tempOccValue1[0] = tempOccValue0[0];
+		tempOccValue1[1] = tempOccValue0[1];
+		tempOccValue1[2] = tempOccValue0[2];
+		tempOccValue1[3] = tempOccValue0[3];
+		for (j=0; j<wordBetweenOccValue; j++) {
+			c = bwt[bwtIndex];
+			sum += decodeTable[c >> 16];
+			sum += decodeTable[c & 0x0000FFFF];
+			bwtIndex++;
+		}
+		if (!DNA_OCC_SUM_EXCEPTION(sum)) {
+			tempOccValue1[0] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue1[1] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue1[2] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue1[3] += sum;
+		} else {
+			if (sum == 0x00000100) {
+				tempOccValue1[0] += 256;
+			} else if (sum == 0x00010000) {
+				tempOccValue1[1] += 256;
+			} else if (sum == 0x01000000) {
+				tempOccValue1[2] += 256;
+			} else {
+				tempOccValue1[3] += 256;
+			}
+		}
+		occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
+		occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
+		occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
+		occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
+		tempOccValue0[0] = tempOccValue1[0];
+		tempOccValue0[1] = tempOccValue1[1];
+		tempOccValue0[2] = tempOccValue1[2];
+		tempOccValue0[3] = tempOccValue1[3];
+		sum = 0;
+		occIndex++;
+
+		for (j=0; j<wordBetweenOccValue; j++) {
+			c = bwt[bwtIndex];
+			sum += decodeTable[c >> 16];
+			sum += decodeTable[c & 0x0000FFFF];
+			bwtIndex++;
+		}
+		if (!DNA_OCC_SUM_EXCEPTION(sum)) {
+			tempOccValue0[0] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue0[1] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue0[2] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue0[3] += sum;
+		} else {
+			if (sum == 0x00000100) {
+				tempOccValue0[0] += 256;
+			} else if (sum == 0x00010000) {
+				tempOccValue0[1] += 256;
+			} else if (sum == 0x01000000) {
+				tempOccValue0[2] += 256;
+			} else {
+				tempOccValue0[3] += 256;
+			}
+		}
+	}
+
+	sum = 0;
+	tempOccValue1[0] = tempOccValue0[0];
+	tempOccValue1[1] = tempOccValue0[1];
+	tempOccValue1[2] = tempOccValue0[2];
+	tempOccValue1[3] = tempOccValue0[3];
+
+	if (occIndex * 2 < numberOfOccValue - 1) {
+		for (j=0; j<wordBetweenOccValue; j++) {
+			c = bwt[bwtIndex];
+			sum += decodeTable[c >> 16];
+			sum += decodeTable[c & 0x0000FFFF];
+			bwtIndex++;
+		}
+		if (!DNA_OCC_SUM_EXCEPTION(sum)) {
+			tempOccValue1[0] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue1[1] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue1[2] += (sum & 0x000000FF);	sum >>= 8;
+			tempOccValue1[3] += sum;
+		} else {
+			if (sum == 0x00000100) {
+				tempOccValue1[0] += 256;
+			} else if (sum == 0x00010000) {
+				tempOccValue1[1] += 256;
+			} else if (sum == 0x01000000) {
+				tempOccValue1[2] += 256;
+			} else {
+				tempOccValue1[3] += 256;
+			}
+		}
+	}
+
+	occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0];
+	occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1];
+	occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2];
+	occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3];
+
+}
+
+static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar)
+{
+	unsigned int i;
+	bgint_t mergedBwtSizeInWord, mergedOccSizeInWord;
+	unsigned int firstCharInThisIteration;
+
+	bgint_t *relativeRank, *seq, *sortedRank;
+	unsigned int *insertBwt, *mergedBwt;
+	bgint_t newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0;
+
+	mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar);
+	mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar);
+
+	initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0);
+
+	if (bwtInc->bwt->textLength == 0) {		// Initial build
+
+		// Set address
+		seq = (bgint_t*)bwtInc->workingMemory;
+		relativeRank = seq + bwtInc->buildSize + 1;
+		// mergedBwt and packedTex may share memory
+		mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord;	// build in place
+
+		assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)bwtInc->packedText);
+		assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)mergedBwt);
+
+		// ->packedText is not used any more and may be overwritten by mergedBwt
+		BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar);
+
+		firstCharInThisIteration = relativeRank[0];
+		relativeRank[numChar] = 0;
+
+		// Sort suffix
+		QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)ALPHABET_SIZE - 1, 0, FALSE);
+		newInverseSa0 = relativeRank[0];
+
+		// Clear BWT area
+		initializeVAL(insertBwt, mergedBwtSizeInWord, 0);
+
+		// Build BWT
+		BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift);
+
+		// so that the cumulativeCount is not deducted
+		bwtInc->firstCharInLastIteration = ALPHABET_SIZE;
+
+	} else {		// Incremental build
+		// Set address
+		sortedRank = (bgint_t*)bwtInc->workingMemory;
+		seq = sortedRank + bwtInc->buildSize + 1;
+		insertBwt = (unsigned*)seq; // insertBwt and seq share memory
+		// relativeRank and ->packedText may share memory
+		relativeRank = seq + bwtInc->buildSize + 1;
+
+		assert((void*)relativeRank <= (void*)bwtInc->packedText);
+
+		// Store the first character of this iteration
+		firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR);
+
+		// Count occurrence of input text
+		ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable);
+		// Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration
+		bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++;
+		bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1];
+		bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2];
+		bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3];
+
+		// Get rank of new suffix among processed suffix
+		// The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group
+		// ->packedText is not used any more and will be overwritten by relativeRank
+		oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, 
+														  numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration);
+
+		// Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group)
+		for (i=0; i<ALPHABET_SIZE; i++) {
+			if (bwtInc->cumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank ||
+				bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) {
+				BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]);
+			} else {
+				if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) {
+					BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]);
+				}
+				if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) {
+					BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1);
+				}
+			}
+		}
+
+		// build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt
+		// the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups)
+		BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild);
+		assert(relativeRank[numChar] == oldInverseSa0RelativeRank);
+
+		// Sort suffix
+		QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)numChar, 1, TRUE);
+
+		newInverseSa0RelativeRank = relativeRank[0];
+		newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank;
+
+		sortedRank[newInverseSa0RelativeRank] = 0;	// a special value so that this is skipped in the merged bwt
+
+		// Build BWT; seq is overwritten by insertBwt
+		BWTIncBuildBwt(insertBwt, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild);
+
+		// Merge BWT; relativeRank may be overwritten by mergedBwt
+		mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord 
+				    - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR * (sizeof(bgint_t) / 4); // minus numberOfIteration * occInterval to create a buffer for merging
+		assert(mergedBwt >= insertBwt + numChar);
+		BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar);
+	}
+
+	// Build auxiliary structure and update info and pointers in BWT
+	bwtInc->bwt->textLength += numChar;
+	bwtInc->bwt->bwtCode = mergedBwt;
+	bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord;
+	bwtInc->bwt->occSizeInWord = mergedOccSizeInWord;
+	assert(mergedBwt >= bwtInc->workingMemory + mergedOccSizeInWord);
+
+	bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord;
+
+	BWTClearTrailingBwtCode(bwtInc->bwt);
+	BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor,
+							   bwtInc->bwt->textLength, bwtInc->bwt->decodeTable);
+
+	bwtInc->bwt->inverseSa0 = newInverseSa0;
+	
+	bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0);
+	bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1);
+	bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2);
+	bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3);
+
+	bwtInc->firstCharInLastIteration = firstCharInThisIteration;
+
+	// Set build size and text address for the next build
+	BWTIncSetBuildSizeAndTextAddr(bwtInc);
+	bwtInc->numberOfIterationDone++;
+
+}
+
+BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxBuildSize, bgint_t incMaxBuildSize)
+{
+
+	FILE *packedFile;
+	bgint_t packedFileLen;
+	bgint_t totalTextLength;
+	bgint_t textToLoad, textSizeInByte;
+	bgint_t processedTextLength;
+	unsigned char lastByteLength;
+
+	BWTInc *bwtInc;
+
+	packedFile = (FILE*)fopen(inputFileName, "rb");
+
+	if (packedFile == NULL) {
+		fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open %s : %s\n",
+				inputFileName, strerror(errno));
+		exit(1);
+	}
+
+	if (fseek(packedFile, -1, SEEK_END) != 0) {
+		fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n",
+				inputFileName, strerror(errno));
+		exit(1);
+	}
+	packedFileLen = ftell(packedFile);
+	if (packedFileLen == -1) {
+		fprintf(stderr, "BWTIncConstructFromPacked() : Can't ftell on %s : %s\n",
+				inputFileName, strerror(errno));
+		exit(1);
+	}
+	if (fread(&lastByteLength, sizeof(unsigned char), 1, packedFile) != 1) {
+		fprintf(stderr,
+				"BWTIncConstructFromPacked() : Can't read from %s : %s\n",
+				inputFileName,
+				ferror(packedFile)? strerror(errno) : "Unexpected end of file");
+		exit(1);
+	}
+	totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength);
+
+	bwtInc = BWTIncCreate(totalTextLength, initialMaxBuildSize, incMaxBuildSize);
+
+	BWTIncSetBuildSizeAndTextAddr(bwtInc);
+
+	if (bwtInc->buildSize > totalTextLength) {
+		textToLoad = totalTextLength;
+	} else {
+		textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD);
+	}
+	textSizeInByte = textToLoad / CHAR_PER_BYTE;	// excluded the odd byte
+
+	if (fseek(packedFile, -((long)textSizeInByte + 2), SEEK_CUR) != 0) {
+		fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n",
+				inputFileName, strerror(errno));
+		exit(1);
+	}
+	if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile) != textSizeInByte + 1) {
+		fprintf(stderr,
+				"BWTIncConstructFromPacked() : Can't read from %s : %s\n",
+				inputFileName,
+				ferror(packedFile)? strerror(errno) : "Unexpected end of file");
+		exit(1);
+	}
+	if (fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR) != 0) {
+		fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n",
+				inputFileName, strerror(errno));
+		exit(1);
+	}
+
+	ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
+	BWTIncConstruct(bwtInc, textToLoad);
+
+	processedTextLength = textToLoad;
+
+	while (processedTextLength < totalTextLength) {
+		textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD;
+		if (textToLoad > totalTextLength - processedTextLength) {
+			textToLoad = totalTextLength - processedTextLength;
+		}
+		textSizeInByte = textToLoad / CHAR_PER_BYTE;
+		if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) {
+			fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n",
+					inputFileName, strerror(errno));
+			exit(1);
+		}
+		if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile) != textSizeInByte) {
+			fprintf(stderr,
+				"BWTIncConstructFromPacked() : Can't read from %s : %s\n",
+				inputFileName,
+				ferror(packedFile)? strerror(errno) : "Unexpected end of file");
+			exit(1);
+		}
+		if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) {
+			fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n",
+					inputFileName, strerror(errno));
+			exit(1);
+		}
+		ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad);
+		BWTIncConstruct(bwtInc, textToLoad);
+		processedTextLength += textToLoad;
+		if (bwtInc->numberOfIterationDone % 10 == 0) {
+			fprintf(stderr, "[BWTIncConstructFromPacked] %lu iterations done. %lu characters processed.\n",
+					(long)bwtInc->numberOfIterationDone, (long)processedTextLength);
+		}
+	}
+
+	fclose(packedFile);
+	return bwtInc;
+}
+
+void BWTIncFree(BWTInc *bwtInc)
+{
+	if (bwtInc == 0) return;
+	free(bwtInc->bwt->cumulativeFreq);
+	free(bwtInc->bwt->occValueMajor);
+	free(bwtInc->bwt->decodeTable);
+	free(bwtInc->bwt);
+	free(bwtInc->workingMemory);
+	free(bwtInc->cumulativeCountInCurrentBuild);
+	free(bwtInc->packedShift);
+	free(bwtInc);
+}
+
+static bgint_t BWTFileSizeInWord(const bgint_t numChar)
+{
+	// The $ in BWT at the position of inverseSa0 is not encoded
+	return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD;
+}
+
+void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName)
+{
+	FILE *bwtFile;
+/*	FILE *occValueFile; */
+	bgint_t bwtLength;
+
+	bwtFile = (FILE*)fopen(bwtFileName, "wb");
+	if (bwtFile == NULL) {
+		fprintf(stderr,
+				"BWTSaveBwtCodeAndOcc(): Cannot open %s for writing: %s\n",
+				bwtFileName, strerror(errno));
+		exit(1);
+	}
+
+	bwtLength = BWTFileSizeInWord(bwt->textLength);
+
+	if (fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile) != 1
+		|| fwrite(bwt->cumulativeFreq + 1,
+				  sizeof(bgint_t), ALPHABET_SIZE, bwtFile) != ALPHABET_SIZE
+		|| fwrite(bwt->bwtCode,
+				  sizeof(unsigned int), bwtLength, bwtFile) != bwtLength) {
+		fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error writing to %s : %s\n",
+				bwtFileName, strerror(errno));
+		exit(1);
+	}
+	if (fclose(bwtFile) != 0) {
+		fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error on closing %s : %s\n",
+				bwtFileName, strerror(errno));
+		exit(1);
+	}
+}
+
+void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size)
+{
+	BWTInc *bwtInc;
+	bwtInc = BWTIncConstructFromPacked(fn_pac, block_size, block_size);
+	fprintf(stderr, "[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
+	BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
+	BWTIncFree(bwtInc);
+}
+
+void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
+{
+	bwt_bwtgen2(fn_pac, fn_bwt, 10000000);
+}
+
+int bwt_bwtgen_main(int argc, char *argv[])
+{
+	if (argc < 3) {
+		fprintf(stderr, "Usage: bwtgen <in.pac> <out.bwt>\n");
+		return 1;
+	}
+	bwt_bwtgen(argv[1], argv[2]);
+	return 0;
+}
+
+#ifdef MAIN_BWT_GEN
+
+int main(int argc, char *argv[])
+{
+	return bwt_bwtgen_main(argc, argv);
+}
+
+#endif
diff --git a/bwt_lite.c b/bwt_lite.c
index f7946f54..eb16da64 100644
--- a/bwt_lite.c
+++ b/bwt_lite.c
@@ -48,7 +48,7 @@ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq)
 	}
 	{ // generate cnt_table
 		for (i = 0; i != 256; ++i) {
-			u_int32_t j, x = 0;
+			uint32_t j, x = 0;
 			for (j = 0; j != 4; ++j)
 				x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
 			b->cnt_table[i] = x;
diff --git a/bwtaln.c b/bwtaln.c
index 20b01cd3..a348fdfa 100644
--- a/bwtaln.c
+++ b/bwtaln.c
@@ -158,7 +158,8 @@ bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa)
 
 void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
 {
-	int i, n_seqs, tot_seqs = 0;
+	int i, n_seqs;
+	long long tot_seqs = 0;
 	bwa_seq_t *seqs;
 	bwa_seqio_t *ks;
 	clock_t t;
@@ -218,7 +219,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
 		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
 
 		bwa_free_read_seq(n_seqs, seqs);
-		fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
+		fprintf(stderr, "[bwa_aln_core] %lld sequences have been processed.\n", tot_seqs);
 	}
 
 	// destroy
diff --git a/bwtgap.c b/bwtgap.c
index 08bc1f48..7dde443d 100644
--- a/bwtgap.c
+++ b/bwtgap.c
@@ -58,7 +58,7 @@ static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, i
 		q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries);
 	}
 	p = q->stack + q->n_entries;
-	p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l;
+	p->info = (uint32_t)score<<21 | i; p->k = k; p->l = l;
 	p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape;
 	p->n_ins = n_ins; p->n_del = n_del;
 	p->state = state; 
diff --git a/bwtgap.h b/bwtgap.h
index 7dd61659..9085b626 100644
--- a/bwtgap.h
+++ b/bwtgap.h
@@ -5,9 +5,9 @@
 #include "bwtaln.h"
 
 typedef struct { // recursion stack
-	u_int32_t info; // score<<21 | i
-	u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6;
-	u_int32_t n_ins:16, n_del:16;
+	uint32_t info; // score<<21 | i
+	uint32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6;
+	uint32_t n_ins:16, n_del:16;
 	int last_diff_pos;
 	bwtint_t k, l; // (k,l) is the SA region of [i,n-1]
 } gap_entry_t;
diff --git a/bwtindex.c b/bwtindex.c
index 7521de8c..ae35e76b 100644
--- a/bwtindex.c
+++ b/bwtindex.c
@@ -1,6 +1,8 @@
 /* The MIT License
 
-   Copyright (c) 2008 Genome Research Ltd (GRL).
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -22,9 +24,6 @@
    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.
 */
-
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -118,7 +117,7 @@ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
 		}
 		rope_destroy(r);
 	}
-	bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
+	bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
 	for (i = 0; i < bwt->seq_len; ++i)
 		bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
 	free(buf);
@@ -174,7 +173,7 @@ void bwt_bwtupdate_core(bwt_t *bwt)
 int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
 {
 	bwt_t *bwt;
-	if (argc < 2) {
+	if (argc != 2) {
 		fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
 		return 1;
 	}
diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c
index b945128b..f50b25bd 100644
--- a/bwtsw2_pair.c
+++ b/bwtsw2_pair.c
@@ -70,6 +70,12 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
 	for (i = x = 0, r.avg = 0; i < k; ++i)
 		if (isize[i] >= r.low && isize[i] <= r.high)
 			r.avg += isize[i], ++x;
+	if (x == 0) {
+		ksprintf(msg, "[%s] fail to infer the insert size distribution: no pairs within boundaries.\n", __func__);
+		free(isize);
+		r.failed = 1;
+		return r;
+	}
 	r.avg /= x;
 	for (i = 0, r.std = 0; i < k; ++i)
 		if (isize[i] >= r.low && isize[i] <= r.high)
@@ -236,7 +242,7 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b
 					double diff;
 					G[0] = hits[i]->hits[0].G + a[1].G;
 					G[1] = hits[i+1]->hits[0].G + a[0].G;
-					diff = fabs(G[0] - G[1]) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.);
+					diff = fabs((double)(G[0] - G[1])) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.);
 					if (diff > 0.05) a[G[0] > G[1]? 0 : 1].G = 0;
 				}
 				if (a[0].G == 0 || a[1].G == 0) { // one proper pair only
diff --git a/code_of_conduct.md b/code_of_conduct.md
new file mode 100644
index 00000000..b7175c1f
--- /dev/null
+++ b/code_of_conduct.md
@@ -0,0 +1,30 @@
+## Contributor Code of Conduct
+
+As contributors and maintainers of this project, we pledge to respect all
+people who contribute through reporting issues, posting feature requests,
+updating documentation, submitting pull requests or patches, and other
+activities.
+
+We are committed to making participation in this project a harassment-free
+experience for everyone, regardless of level of experience, gender, gender
+identity and expression, sexual orientation, disability, personal appearance,
+body size, race, age, or religion.
+
+Examples of unacceptable behavior by participants include the use of sexual
+language or imagery, derogatory comments or personal attacks, trolling, public
+or private harassment, insults, or other unprofessional conduct.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct. Project maintainers or
+contributors who do not follow the Code of Conduct may be removed from the
+project team.
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by opening an issue or contacting the maintainer via email.
+
+This Code of Conduct is adapted from the [Contributor Covenant][cc], [version
+1.0.0][v1].
+
+[cc]: http://contributor-covenant.org/
+[v1]: http://contributor-covenant.org/version/1/0/0/
diff --git a/fastmap.c b/fastmap.c
index 7b7145d9..be7ba0ef 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -1,3 +1,29 @@
+/* The MIT License
+
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
 #include <zlib.h>
 #include <stdio.h>
 #include <unistd.h>
@@ -130,7 +156,7 @@ int main_mem(int argc, char *argv[])
 
 	aux.opt = opt = mem_opt_init();
 	memset(&opt0, 0, sizeof(mem_opt_t));
-	while ((c = getopt(argc, argv, "51paMCSPVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) {
+	while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:")) >= 0) {
 		if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
 		else if (c == '1') no_mt_io = 1;
 		else if (c == 'x') mode = optarg;
@@ -147,7 +173,9 @@ int main_mem(int argc, char *argv[])
 		else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
 		else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP;
 		else if (c == 'V') opt->flag |= MEM_F_REF_HDR;
-		else if (c == '5') opt->flag |= MEM_F_PRIMARY5;
+		else if (c == '5') opt->flag |= MEM_F_PRIMARY5 | MEM_F_KEEP_SUPP_MAPQ; // always apply MEM_F_KEEP_SUPP_MAPQ with -5
+		else if (c == 'q') opt->flag |= MEM_F_KEEP_SUPP_MAPQ;
+		else if (c == 'u') opt->flag |= MEM_F_XB;
 		else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1;
 		else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
 		else if (c == 'v') bwa_verbose = atoi(optarg);
@@ -158,17 +186,20 @@ int main_mem(int argc, char *argv[])
 		else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1;
 		else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
 		else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
+		else if (c == 'o' || c == 'f') xreopen(optarg, "wb", stdout);
 		else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
 		else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
 		else if (c == 'C') aux.copy_comment = 1;
 		else if (c == 'K') fixed_chunk_size = atoi(optarg);
 		else if (c == 'X') opt->mask_level = atof(optarg);
+		else if (c == 'F') bwa_dbg = atoi(optarg);
 		else if (c == 'h') {
 			opt0.max_XA_hits = opt0.max_XA_hits_alt = 1;
 			opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10);
 			if (*p != 0 && ispunct(*p) && isdigit(p[1]))
 				opt->max_XA_hits_alt = strtol(p+1, &p, 10);
 		}
+		else if (c == 'z') opt->XA_drop_ratio = atof(optarg);
 		else if (c == 'Q') {
 			opt0.mapQ_coef_len = 1;
 			opt->mapQ_coef_len = atoi(optarg);
@@ -257,7 +288,7 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "       -E INT[,INT]  gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
 		fprintf(stderr, "       -L INT[,INT]  penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
 		fprintf(stderr, "       -U INT        penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired);
-		fprintf(stderr, "       -x STR        read type. Setting -x changes multiple parameters unless overriden [null]\n");
+		fprintf(stderr, "       -x STR        read type. Setting -x changes multiple parameters unless overridden [null]\n");
 		fprintf(stderr, "                     pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0  (PacBio reads to ref)\n");
 		fprintf(stderr, "                     ont2d: -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0  (Oxford Nanopore 2D-reads to ref)\n");
 		fprintf(stderr, "                     intractg: -B9 -O16 -L5  (intra-species contigs to ref)\n");
@@ -265,12 +296,20 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "       -p            smart pairing (ignoring in2.fq)\n");
 		fprintf(stderr, "       -R STR        read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
 		fprintf(stderr, "       -H STR/FILE   insert STR to header if it starts with @; or insert lines in FILE [null]\n");
+		fprintf(stderr, "       -o FILE       sam file to output results to [stdout]\n");
 		fprintf(stderr, "       -j            treat ALT contigs as part of the primary assembly (i.e. ignore <idxbase>.alt file)\n");
-		fprintf(stderr, "       -5            always take the leftmost alignment on a read as primary\n");
+		fprintf(stderr, "       -5            for split alignment, take the alignment with the smallest query (not genomic) coordinate as primary\n");
+		fprintf(stderr, "       -q            don't modify mapQ of supplementary alignments\n");
+		fprintf(stderr, "       -K INT        process INT input bases in each batch regardless of nThreads (for reproducibility) []\n");
 		fprintf(stderr, "\n");
-		fprintf(stderr, "       -v INT        verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
+		fprintf(stderr, "       -v INT        verbosity level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
 		fprintf(stderr, "       -T INT        minimum score to output [%d]\n", opt->T);
-		fprintf(stderr, "       -h INT[,INT]  if there are <INT hits with score >80%% of the max score, output all in XA [%d,%d]\n", opt->max_XA_hits, opt->max_XA_hits_alt);
+		fprintf(stderr, "       -h INT[,INT]  if there are <INT hits with score >%.2f%% of the max score, output all in XA [%d,%d]\n", 
+				opt->XA_drop_ratio * 100.0,
+				opt->max_XA_hits, opt->max_XA_hits_alt);
+		fprintf(stderr, "                     A second value may be given for alternate sequences.\n");
+		fprintf(stderr, "       -z FLOAT      The fraction of the max score to use with -h [%f].\n", opt->XA_drop_ratio);
+		fprintf(stderr, "                     specify the mean, standard deviation (10%% of the mean if absent), max\n");
 		fprintf(stderr, "       -a            output all alignments for SE or unpaired PE\n");
 		fprintf(stderr, "       -C            append FASTA/FASTQ comment to SAM output\n");
 		fprintf(stderr, "       -V            output the reference FASTA header in the XR tag\n");
@@ -280,6 +319,7 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "                     specify the mean, standard deviation (10%% of the mean if absent), max\n");
 		fprintf(stderr, "                     (4 sigma from the mean if absent) and min of the insert size distribution.\n");
 		fprintf(stderr, "                     FR orientation only. [inferred]\n");
+		fprintf(stderr, "       -u            output XB instead of XA; XB is XA with the alignment score and mapping quality added.\n");
 		fprintf(stderr, "\n");
 		fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n");
 		fprintf(stderr, "\n");
diff --git a/ksw.c b/ksw.c
index 9793e5eb..1e584e9d 100644
--- a/ksw.c
+++ b/ksw.c
@@ -26,7 +26,13 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <assert.h>
+#if defined __SSE2__
 #include <emmintrin.h>
+#elif defined __ARM_NEON
+#include "neon_sse.h"
+#else
+#include "scalar_sse.h"
+#endif
 #include "ksw.h"
 
 #ifdef USE_MALLOC_WRAPPERS
@@ -108,6 +114,11 @@ kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t
 	return q;
 }
 
+#if defined __ARM_NEON
+// This macro implicitly uses each function's `zero` local variable
+#define _mm_slli_si128(a, n) (vextq_u8(zero, (a), 16 - (n)))
+#endif
+
 kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
 {
 	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
@@ -115,6 +126,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
 	__m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
 	kswr_t r;
 
+#if defined __SSE2__
 #define __max_16(ret, xx) do { \
 		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
 		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
@@ -123,6 +135,18 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
     	(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
 	} while (0)
 
+// Given entries with arbitrary values, return whether they are all 0x00
+#define allzero_16(xx) (_mm_movemask_epi8(_mm_cmpeq_epi8((xx), zero)) == 0xffff)
+
+#elif defined __ARM_NEON
+#define __max_16(ret, xx) (ret) = vmaxvq_u8((xx))
+#define allzero_16(xx) (vmaxvq_u8((xx)) == 0)
+
+#else
+#define __max_16(ret, xx) (ret) = m128i_max_u8((xx))
+#define allzero_16(xx) (m128i_allzero((xx)))
+#endif
+
 	// initialization
 	r = g_defr;
 	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
@@ -143,7 +167,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
 	}
 	// the core loop
 	for (i = 0; i < tlen; ++i) {
-		int j, k, cmp, imax;
+		int j, k, imax;
 		__m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
 		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
 		h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
@@ -182,8 +206,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del
 				_mm_store_si128(H1 + j, h);
 				h = _mm_subs_epu8(h, oe_ins);
 				f = _mm_subs_epu8(f, e_ins);
-				cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
-				if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
+				if (UNLIKELY(allzero_16(_mm_subs_epu8(f, h)))) goto end_loop16;
 			}
 		}
 end_loop16:
@@ -236,6 +259,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_de
 	__m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
 	kswr_t r;
 
+#if defined __SSE2__
 #define __max_8(ret, xx) do { \
 		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
 		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
@@ -243,6 +267,18 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_de
     	(ret) = _mm_extract_epi16((xx), 0); \
 	} while (0)
 
+// Given entries all either 0x0000 or 0xffff, return whether they are all 0x0000
+#define allzero_0f_8(xx) (!_mm_movemask_epi8((xx)))
+
+#elif defined __ARM_NEON
+#define __max_8(ret, xx) (ret) = vmaxvq_s16(vreinterpretq_s16_u8((xx)))
+#define allzero_0f_8(xx) (vmaxvq_u16(vreinterpretq_u16_u8((xx))) == 0)
+
+#else
+#define __max_8(ret, xx) (ret) = m128i_max_s16((xx))
+#define allzero_0f_8(xx) (m128i_allzero((xx)))
+#endif
+
 	// initialization
 	r = g_defr;
 	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
@@ -267,7 +303,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_de
 		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
 		h = _mm_slli_si128(h, 2);
 		for (j = 0; LIKELY(j < slen); ++j) {
-			h = _mm_adds_epi16(h, *S++);
+			h = _mm_adds_epi16(h, _mm_load_si128(S++));
 			e = _mm_load_si128(E + j);
 			h = _mm_max_epi16(h, e);
 			h = _mm_max_epi16(h, f);
@@ -290,7 +326,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_de
 				_mm_store_si128(H1 + j, h);
 				h = _mm_subs_epu16(h, oe_ins);
 				f = _mm_subs_epu16(f, e_ins);
-				if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
+				if(UNLIKELY(allzero_0f_8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
 			}
 		}
 end_loop8:
diff --git a/kthread.c b/kthread.c
index 1b13494f..780de19c 100644
--- a/kthread.c
+++ b/kthread.c
@@ -1,4 +1,5 @@
 #include <pthread.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <limits.h>
 
diff --git a/main.c b/main.c
index 68747981..d7d79d9e 100644
--- a/main.c
+++ b/main.c
@@ -1,10 +1,36 @@
+/* The MIT License
+
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
 #include <stdio.h>
 #include <string.h>
 #include "kstring.h"
 #include "utils.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.7.15-r1142-dirty"
+#define PACKAGE_VERSION "0.7.17-r1198-dirty"
 #endif
 
 int bwa_fa2pac(int argc, char *argv[]);
@@ -31,7 +57,7 @@ static int usage()
 	fprintf(stderr, "\n");
 	fprintf(stderr, "Program: bwa (alignment via Burrows-Wheeler transformation)\n");
 	fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
-	fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n");
+	fprintf(stderr, "Contact: Heng Li <hli@ds.dfci.harvard.edu>\n\n");
 	fprintf(stderr, "Usage:   bwa <command> [options]\n\n");
 	fprintf(stderr, "Command: index         index sequences in the FASTA format\n");
 	fprintf(stderr, "         mem           BWA-MEM algorithm\n");
@@ -40,7 +66,7 @@ static int usage()
 	fprintf(stderr, "         aln           gapped/ungapped alignment\n");
 	fprintf(stderr, "         samse         generate alignment (single ended)\n");
 	fprintf(stderr, "         sampe         generate alignment (paired ended)\n");
-	fprintf(stderr, "         bwasw         BWA-SW for long queries\n");
+	fprintf(stderr, "         bwasw         BWA-SW for long queries (DEPRECATED)\n");
 	fprintf(stderr, "\n");
 	fprintf(stderr, "         shm           manage indices in shared memory\n");
 	fprintf(stderr, "         fa2pac        convert FASTA to PAC format\n");
diff --git a/malloc_wrap.c b/malloc_wrap.c
index 100b8cb6..6165e043 100644
--- a/malloc_wrap.c
+++ b/malloc_wrap.c
@@ -13,7 +13,7 @@ void *wrap_calloc(size_t nmemb, size_t size,
 	void *p = calloc(nmemb, size);
 	if (NULL == p) {
 		fprintf(stderr,
-				"[%s] Failed to allocate %zd bytes at %s line %u: %s\n",
+				"[%s] Failed to allocate %zu bytes at %s line %u: %s\n",
 				func, nmemb * size, file, line, strerror(errno));
 		exit(EXIT_FAILURE);
 	}
@@ -25,7 +25,7 @@ void *wrap_malloc(size_t size,
 	void *p = malloc(size);
 	if (NULL == p) {
 		fprintf(stderr,
-				"[%s] Failed to allocate %zd bytes at %s line %u: %s\n",
+				"[%s] Failed to allocate %zu bytes at %s line %u: %s\n",
 				func, size, file, line, strerror(errno));
 		exit(EXIT_FAILURE);
 	}
@@ -37,7 +37,7 @@ void *wrap_realloc(void *ptr, size_t size,
 	void *p = realloc(ptr, size);
 	if (NULL == p) {
 		fprintf(stderr,
-				"[%s] Failed to allocate %zd bytes at %s line %u: %s\n",
+				"[%s] Failed to allocate %zu bytes at %s line %u: %s\n",
 				func, size, file, line, strerror(errno));
 		exit(EXIT_FAILURE);
 	}
@@ -49,7 +49,7 @@ char *wrap_strdup(const char *s,
 	char *p = strdup(s);
 	if (NULL == p) {
 		fprintf(stderr,
-				"[%s] Failed to allocate %zd bytes at %s line %u: %s\n",
+				"[%s] Failed to allocate %zu bytes at %s line %u: %s\n",
 				func, strlen(s), file, line, strerror(errno));
 		exit(EXIT_FAILURE);
 	}
diff --git a/neon_sse.h b/neon_sse.h
new file mode 100644
index 00000000..bd55f8b4
--- /dev/null
+++ b/neon_sse.h
@@ -0,0 +1,33 @@
+#ifndef NEON_SSE_H
+#define NEON_SSE_H
+
+#include <arm_neon.h>
+
+typedef uint8x16_t __m128i;
+
+static inline __m128i _mm_load_si128(const __m128i *ptr) { return vld1q_u8((const uint8_t *) ptr); }
+static inline __m128i _mm_set1_epi32(int n) { return vreinterpretq_u8_s32(vdupq_n_s32(n)); }
+static inline void _mm_store_si128(__m128i *ptr, __m128i a) { vst1q_u8((uint8_t *) ptr, a); }
+
+static inline __m128i _mm_adds_epu8(__m128i a, __m128i b) { return vqaddq_u8(a, b); }
+static inline __m128i _mm_max_epu8(__m128i a, __m128i b) { return vmaxq_u8(a, b); }
+static inline __m128i _mm_set1_epi8(int8_t n) { return vreinterpretq_u8_s8(vdupq_n_s8(n)); }
+static inline __m128i _mm_subs_epu8(__m128i a, __m128i b) { return vqsubq_u8(a, b); }
+
+#define M128I(a)  vreinterpretq_u8_s16((a))
+#define UM128I(a) vreinterpretq_u8_u16((a))
+#define S16(a)    vreinterpretq_s16_u8((a))
+#define U16(a)    vreinterpretq_u16_u8((a))
+
+static inline __m128i _mm_adds_epi16(__m128i a, __m128i b) { return M128I(vqaddq_s16(S16(a), S16(b))); }
+static inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { return UM128I(vcgtq_s16(S16(a), S16(b))); }
+static inline __m128i _mm_max_epi16(__m128i a, __m128i b) { return M128I(vmaxq_s16(S16(a), S16(b))); }
+static inline __m128i _mm_set1_epi16(int16_t n) { return vreinterpretq_u8_s16(vdupq_n_s16(n)); }
+static inline __m128i _mm_subs_epu16(__m128i a, __m128i b) { return UM128I(vqsubq_u16(U16(a), U16(b))); }
+
+#undef M128I
+#undef UM128I
+#undef S16
+#undef U16
+
+#endif
diff --git a/rle.h b/rle.h
index 0d594846..4f8946dc 100644
--- a/rle.h
+++ b/rle.h
@@ -30,7 +30,7 @@ extern "C" {
  *** 43+3 codec ***
  ******************/
 
-const uint8_t rle_auxtab[8];
+extern const uint8_t rle_auxtab[8];
 
 #define RLE_MIN_SPACE 18
 #define rle_nptr(block) ((uint16_t*)(block))
diff --git a/scalar_sse.h b/scalar_sse.h
new file mode 100644
index 00000000..9c2c326d
--- /dev/null
+++ b/scalar_sse.h
@@ -0,0 +1,119 @@
+#ifndef SCALAR_SSE_H
+#define SCALAR_SSE_H
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+typedef union m128i {
+	uint8_t u8[16];
+	int16_t i16[8];
+} __m128i;
+
+static inline __m128i _mm_set1_epi32(int32_t n) {
+	assert(n >= 0 && n <= 255);
+	__m128i r; memset(&r, n, sizeof r); return r;
+}
+
+static inline __m128i _mm_load_si128(const __m128i *ptr) { __m128i r; memcpy(&r, ptr, sizeof r); return r; }
+static inline void _mm_store_si128(__m128i *ptr, __m128i a) { memcpy(ptr, &a, sizeof a); }
+
+static inline int m128i_allzero(__m128i a) {
+	static const char zero[] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+	return memcmp(&a, zero, sizeof a) == 0;
+}
+
+static inline __m128i _mm_slli_si128(__m128i a, int n) {
+	int i;
+	memmove(&a.u8[n], &a.u8[0], 16 - n);
+	for (i = 0; i < n; i++) a.u8[i] = 0;
+	return a;
+}
+
+static inline __m128i _mm_adds_epu8(__m128i a, __m128i b) {
+	int i;
+	for (i = 0; i < 16; i++) {
+		uint16_t aa = a.u8[i];
+		aa += b.u8[i];
+		a.u8[i] = (aa < 256)? aa : 255;
+	}
+	return a;
+}
+
+static inline __m128i _mm_max_epu8(__m128i a, __m128i b) {
+	int i;
+	for (i = 0; i < 16; i++)
+		if (a.u8[i] < b.u8[i]) a.u8[i] = b.u8[i];
+	return a;
+}
+
+static inline uint8_t m128i_max_u8(__m128i a) {
+	uint8_t max = 0;
+	int i;
+	for (i = 0; i < 16; i++)
+		if (max < a.u8[i]) max = a.u8[i];
+	return max;
+}
+
+static inline __m128i _mm_set1_epi8(int8_t n) { __m128i r; memset(&r, n, sizeof r); return r; }
+
+static inline __m128i _mm_subs_epu8(__m128i a, __m128i b) {
+	int i;
+	for (i = 0; i < 16; i++) {
+		int16_t aa = a.u8[i];
+		aa -= b.u8[i];
+		a.u8[i] = (aa >= 0)? aa : 0;
+	}
+	return a;
+}
+
+static inline __m128i _mm_adds_epi16(__m128i a, __m128i b) {
+	int i;
+	for (i = 0; i < 8; i++) {
+		int32_t aa = a.i16[i];
+		aa += b.i16[i];
+		a.i16[i] = (aa < 32768)? aa : 32767;
+	}
+	return a;
+}
+
+static inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) {
+	int i;
+	for (i = 0; i < 8; i++)
+		a.i16[i] = (a.i16[i] > b.i16[i])? 0xffff : 0x0000;
+	return a;
+}
+
+static inline __m128i _mm_max_epi16(__m128i a, __m128i b) {
+	int i;
+	for (i = 0; i < 8; i++)
+		if (a.i16[i] < b.i16[i]) a.i16[i] = b.i16[i];
+	return a;
+}
+
+static inline __m128i _mm_set1_epi16(int16_t n) {
+	__m128i r;
+	r.i16[0] = r.i16[1] = r.i16[2] = r.i16[3] =
+	r.i16[4] = r.i16[5] = r.i16[6] = r.i16[7] = n;
+	return r;
+}
+
+static inline int16_t m128i_max_s16(__m128i a) {
+	int16_t max = -32768;
+	int i;
+	for (i = 0; i < 8; i++)
+		if (max < a.i16[i]) max = a.i16[i];
+	return max;
+}
+
+static inline __m128i _mm_subs_epu16(__m128i a, __m128i b) {
+	int i;
+	for (i = 0; i < 8; i++) {
+		int32_t aa = a.i16[i];
+		aa -= b.i16[i];
+		a.i16[i] = (aa >= 0)? aa : 0;
+	}
+	return a;
+}
+
+#endif
diff --git a/utils.c b/utils.c
index 29832617..9ceb1be1 100644
--- a/utils.c
+++ b/utils.c
@@ -1,6 +1,8 @@
 /* The MIT License
 
-   Copyright (c) 2008 Genome Research Ltd (GRL).
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -22,8 +24,6 @@
    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.
 */
-
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
 #define FSYNC_ON_FLUSH
 
 #include <stdio.h>
@@ -279,17 +279,28 @@ int err_gzclose(gzFile file)
  * Timer *
  *********/
 
-double cputime()
+double cputime(void)
 {
 	struct rusage r;
 	getrusage(RUSAGE_SELF, &r);
 	return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec);
 }
 
-double realtime()
+double realtime(void)
 {
 	struct timeval tp;
 	struct timezone tzp;
 	gettimeofday(&tp, &tzp);
 	return tp.tv_sec + tp.tv_usec * 1e-6;
 }
+
+long peakrss(void)
+{
+	struct rusage r;
+	getrusage(RUSAGE_SELF, &r);
+#ifdef __linux__
+	return r.ru_maxrss * 1024;
+#else
+	return r.ru_maxrss;
+#endif
+}
diff --git a/utils.h b/utils.h
index 11966b8d..d4d45500 100644
--- a/utils.h
+++ b/utils.h
@@ -1,6 +1,8 @@
 /* The MIT License
 
-   Copyright (c) 2008 Genome Research Ltd (GRL).
+   Copyright (c) 2018-     Dana-Farber Cancer Institute
+                 2009-2018 Broad Institute, Inc.
+                 2008-2009 Genome Research Ltd. (GRL)
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -22,9 +24,6 @@
    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.
 */
-
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
 #ifndef LH3_UTILS_H
 #define LH3_UTILS_H
 
@@ -85,8 +84,9 @@ extern "C" {
 	int err_fclose(FILE *stream);
 	int err_gzclose(gzFile file);
 
-	double cputime();
-	double realtime();
+	double cputime(void);
+	double realtime(void);
+	long peakrss(void);
 
 	void ks_introsort_64 (size_t n, uint64_t *a);
 	void ks_introsort_128(size_t n, pair64_t *a);