gc_extrap.cpp

/*    gc_extrap: extrapolate genomic complexity 
 *
 *    Copyright (C) 2013 University of Southern California and
 *			 Andrew D. Smith and Timothy Daley
 *
 *    Authors: Andrew D. Smith and Timothy Daley
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.	If not, see <http://www.gnu.org/licenses/>.
 */

#include <fstream>
#include <numeric>
#include <vector>
#include <iomanip>
#include <queue>
#include <string>
#include <sys/types.h>
#include <unistd.h>

#include <gsl/gsl_cdf.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_statistics_double.h>

#include <OptionParser.hpp>
#include <smithlab_utils.hpp>
#include <GenomicRegion.hpp>
#include <MappedRead.hpp>
#include <RNG.hpp>
#include <smithlab_os.hpp>

#include "continued_fraction.hpp"

using std::string;
using std::vector;
using std::endl;
using std::cerr;
using std::max;
using std::priority_queue;

using std::setw;
using std::fixed;
using std::setprecision;
using std::tr1::unordered_map;


/**************** FOR CLARITY BELOW WHEN COMPARING READS **************/

// True when a's chromosome compares greater than b's chromosome.
static inline bool
chrom_greater(const GenomicRegion &a, const GenomicRegion &b) {
  return b.get_chrom() < a.get_chrom();
}
static inline bool
same_start(const GenomicRegion &a, const GenomicRegion &b) {
  return a.get_start() == b.get_start();
}
// True when a begins at a larger coordinate than b (the chromosome is
// not inspected here).
static inline bool
start_greater(const GenomicRegion &a, const GenomicRegion &b) {
  return b.get_start() < a.get_start();
}
// True when a ends at a larger coordinate than b (the chromosome is
// not inspected here).
static inline bool
end_greater(const GenomicRegion &a, const GenomicRegion &b) {
  return b.get_end() < a.get_end();
}
/******************************************************************************/


struct GenomicRegionOrderChecker {
  bool operator()(const GenomicRegion &prev, const GenomicRegion &gr) const {
    return start_check(prev, gr);
  }
  static bool 
  is_ready(const priority_queue<GenomicRegion, vector<GenomicRegion>, GenomicRegionOrderChecker> &pq,
	   const GenomicRegion &gr, const size_t max_width) {
    return !pq.top().same_chrom(gr) || pq.top().get_end() + max_width < gr.get_start();
  }
  static bool 
  start_check(const GenomicRegion &prev, const GenomicRegion &gr) {
    return (chrom_greater(prev, gr)
	    || (prev.same_chrom(gr) && start_greater(prev, gr))
	    || (prev.same_chrom(gr) && same_start(prev, gr) && end_greater(prev, gr)));
  }
};


// Probabilistically split a genomic region into multiple genomic
// regions, each of width bin_size.
//
// The region is first shifted onto a bin boundary, keeping its width:
// with frac = (start % bin_size)/bin_size, a draw from
// runif.runif(0.0, 1.0) greater than frac moves the start down to the
// boundary, otherwise up.  The shifted region is then walked in steps
// of bin_size; each step is emitted as a full-width bin when a fresh
// draw is <= the fraction of the bin the region covers.
//
// outputGRs is cleared first; emitted bins carry the name, score and
// strand of the input region.
static void
SplitGenomicRegion(const GenomicRegion &inputGR,
                   Runif &runif,
                   const size_t bin_size,
                   vector<GenomicRegion> &outputGRs){
  outputGRs.clear();
  GenomicRegion gr(inputGR);

  const double offset_frac =
    static_cast<double>(gr.get_start() % bin_size)/bin_size;
  const size_t width = gr.get_width();
  const double start_in_bins = static_cast<double>(gr.get_start())/bin_size;

  // one draw decides the direction of the shift
  const bool shift_down = runif.runif(0.0, 1.0) > offset_frac;
  if (shift_down)
    gr.set_start(std::floor(start_in_bins)*bin_size);
  else
    gr.set_start(std::ceil(start_in_bins)*bin_size);
  gr.set_end(gr.get_start() + width);

  const size_t shifted_start = gr.get_start();
  for (size_t offset = 0; offset < gr.get_width(); offset += bin_size) {
    const size_t curr_start = shifted_start + offset;
    const size_t curr_end = std::min(gr.get_end(), curr_start + bin_size);
    const double covered_frac =
      static_cast<double>(curr_end - curr_start)/bin_size;

    // one draw per bin: keep it with probability given by coverage
    if (runif.runif(0.0, 1.0) > covered_frac)
      continue;

    outputGRs.push_back(GenomicRegion(gr.get_chrom(), curr_start,
                                      curr_start + bin_size,
                                      gr.get_name(), gr.get_score(),
                                      gr.get_strand()));
  }
}


// split a mapped read into multiple genomic regions
// based on the number of bases in each
static void
SplitMappedRead(const bool VERBOSE,
		const MappedRead &inputMR,
		Runif &runif,
		const size_t bin_size,
		vector<GenomicRegion> &outputGRs){
  outputGRs.clear();

  size_t covered_bases = 0;
  size_t read_iterator = inputMR.r.get_start();
  size_t seq_iterator = 0;
  size_t total_covered_bases = 0;

  // not sure why this didn't work:
  //std::string::iterator seq_iterator = inputMR.seq.begin();
  //  while(seq_iterator != inputMR.seq.end()){
  //  if(*seq_iterator != 'N')
  //  covered_bases++;
  while(seq_iterator < inputMR.seq.size()){
    if(inputMR.seq[seq_iterator] != 'N')
      covered_bases++;

    // if we reach the end of a bin, probabilistically create a binned read
    // with probability proportional to the number of covered bases
    if(read_iterator % bin_size == bin_size - 1){
      double frac = static_cast<double>(covered_bases)/bin_size;
      if(runif.runif(0.0, 1.0) <= frac){
	const size_t curr_start = read_iterator - (read_iterator % bin_size);
	const size_t curr_end = curr_start + bin_size;
	GenomicRegion binned_gr(inputMR.r.get_chrom(), curr_start, curr_end,
				inputMR.r.get_name(), inputMR.r.get_score(),
				inputMR.r.get_strand());
	outputGRs.push_back(binned_gr);
      }
      total_covered_bases += covered_bases;
      covered_bases = 0;
    }
    seq_iterator++;
    read_iterator++;
  }

  double frac = static_cast<double>(covered_bases)/bin_size;
  if(runif.runif(0.0, 1.0) <= frac){
    const size_t curr_start = read_iterator - (read_iterator % bin_size);
    const size_t curr_end = curr_start + bin_size;
    GenomicRegion binned_gr(inputMR.r.get_chrom(), curr_start, curr_end,
			    inputMR.r.get_name(), inputMR.r.get_score(),
			    inputMR.r.get_strand());
    outputGRs.push_back(binned_gr);
  }


}

static inline bool
GenomicRegionIsNull(const GenomicRegion &gr){
  GenomicRegion null_gr;
  if(gr == null_gr)
    return true;

  return false;
}

// Lengthen a mapped read by n_bases: the end coordinate moves forward
// by n_bases, and the sequence and score strings are padded with
// n_bases copies of '_' and 'B' respectively.
static void
ExtendMappedRead(const size_t n_bases, 
                 MappedRead &mr){
  mr.r.set_end(mr.r.get_end() + n_bases);
  mr.seq.insert(mr.seq.end(), n_bases, '_');
  mr.scr.insert(mr.scr.end(), n_bases, 'B');
}


static size_t
load_values_MR(const bool VERBOSE,
	       const string input_file_name, 
	       const size_t bin_size,
	       const size_t max_width,
	       const size_t n_bases_extend, 
	       vector<double> &vals_hist) {

  srand(time(0) + getpid());
  Runif runif(rand());

  std::ifstream in(input_file_name.c_str());
  if (!in)
    throw SMITHLABException("problem opening file: " + input_file_name);

  MappedRead mr;
  if (!(in >> mr))
    throw SMITHLABException("problem reading from: " + input_file_name);

 // initialize prioirty queue to reorder the split reads
  std::priority_queue<GenomicRegion, vector<GenomicRegion>, GenomicRegionOrderChecker> PQ;

  size_t n_reads = 0;
  size_t n_bins = 0;
  GenomicRegion curr_gr, prev_gr;
  size_t current_count = 1;
  do {

    if(mr.r.get_width() > max_width){
      cerr << "Encountered read of width " << mr.r.get_width() << endl;
      throw SMITHLABException("max_width set too small.");
    }

    ExtendMappedRead(n_bases_extend, mr);

    vector<GenomicRegion> splitGRs;
    SplitMappedRead(VERBOSE, mr, runif, bin_size, splitGRs);  
    
    n_reads++;
    n_bins += splitGRs.size();


   // add split Genomic Regions to the priority queue
      for(size_t i = 0; i < splitGRs.size(); i++){
	PQ.push(splitGRs[i]);
      }

   // remove Genomic Regions from the priority queue
    if(splitGRs.size() > 0){
      while(!PQ.empty() && 
	    GenomicRegionOrderChecker::is_ready(PQ, splitGRs.back(), max_width)){
	curr_gr = PQ.top();
	PQ.pop();

     // only compare if the previous is not null (in the 1st iteration)
	if(!GenomicRegionIsNull(prev_gr)){
	  if(curr_gr.same_chrom(prev_gr) && curr_gr.get_start() < prev_gr.get_start()){
	    cerr << "current:\t" << curr_gr << endl;
	    cerr << "previous:\t" << prev_gr << endl;
	    throw SMITHLABException("split reads unsorted");
	  }
	  // next genomic region is not the same as last, update histogram
	  if(!curr_gr.same_chrom(prev_gr) || curr_gr.get_start() != prev_gr.get_start()){
	    // count is too big, resize histogram
	    if(vals_hist.size() < current_count + 1)
	      vals_hist.resize(current_count + 1, 0.0);

	    // increment histogram at current count
	    ++vals_hist[current_count];
	    current_count = 1;
	  }
	  // next genomic region is same as last, increment count
	  else 
	    ++current_count;
	}
	prev_gr.swap(curr_gr);
      }
    }


  } while (in >> mr);

 // done adding reads, now spit the rest out
  while(!PQ.empty()){
    curr_gr = PQ.top();
    PQ.pop();
     // only compare if the previous is not null (in the 1st iteration)
    if(curr_gr.same_chrom(prev_gr) && curr_gr.get_start() < prev_gr.get_start()){
      cerr << "current:\t" << curr_gr << endl;
      cerr << "previous:\t" << prev_gr << endl;
      throw SMITHLABException("split reads unsorted");
    }
	  // next genomic region is not the same as last, update histogram
    if(!curr_gr.same_chrom(prev_gr) || curr_gr.get_start() != prev_gr.get_start()){
	    // count is too big, resize histogram
      if(vals_hist.size() < current_count + 1)
	vals_hist.resize(current_count + 1, 0.0);

	    // increment histogram at current count
      ++vals_hist[current_count];
      current_count = 1;
    }
	  // next genomic region is same as last, increment count
    else 
      ++current_count;

    prev_gr.swap(curr_gr);
  }

 return n_reads;
}


// extend the read by increasing the end pos by n_bases
static void
ExtendGenomicRegion(const size_t n_bases,
		    GenomicRegion &gr){
  GenomicRegion outputGR = gr;
  outputGR.set_end(gr.get_end() + n_bases);
  gr.swap(outputGR);
}


static size_t
load_values_GR(const string input_file_name, 
	       const size_t bin_size,
	       const size_t max_width,
	       const size_t n_bases_extend, 
	       vector<double> &vals_hist) {

  srand(time(0) + getpid());
  Runif runif(rand());

 std::ifstream in(input_file_name.c_str());
 if (!in)
   throw "problem opening file: " + input_file_name;

 GenomicRegion inputGR;
 if (!(in >> inputGR))
   throw "problem reading from: " + input_file_name;

 // initialize prioirty queue to reorder the split reads
 std::priority_queue<GenomicRegion, vector<GenomicRegion>, GenomicRegionOrderChecker> PQ;

 // prev and current Genomic Regions to compare
 GenomicRegion curr_gr, prev_gr;
 size_t n_reads = 0;
 size_t current_count = 1;
 do {
   ExtendGenomicRegion(n_bases_extend, inputGR);

   vector<GenomicRegion> splitGRs;
   SplitGenomicRegion(inputGR, runif, bin_size, splitGRs);
   // add split Genomic Regions to the priority queue
   for(size_t i = 0; i < splitGRs.size(); i++){
     PQ.push(splitGRs[i]);
   }

   if(splitGRs.size() > 0){
   // remove Genomic Regions from the priority queue
     while(!PQ.empty() && 
	   GenomicRegionOrderChecker::is_ready(PQ, splitGRs.back(), max_width)){
       curr_gr = PQ.top();
       PQ.pop();
 // only compare if the previous is not null (in the 1st iteration)
       if(!GenomicRegionIsNull(prev_gr)){
	 if(curr_gr.same_chrom(prev_gr) && curr_gr.get_start() < prev_gr.get_start()){
	   cerr << "current:\t" << curr_gr << endl;
	   cerr << "previous:\t" << prev_gr << endl;
	   throw SMITHLABException("split reads unsorted");
	 }
 
	  // next genomic region is not the same as last, update histogram
	 if(!curr_gr.same_chrom(prev_gr) || curr_gr.get_start() != prev_gr.get_start()){
	    // count is too big, resize histogram
	   if(vals_hist.size() < current_count + 1)
	     vals_hist.resize(current_count + 1, 0.0);

	    // increment histogram at current count
	   ++vals_hist[current_count];
	   current_count = 1;
	 }
	  // next genomic region is same as last, increment count
	 else 
	   ++current_count;
       }
       prev_gr.swap(curr_gr);
     }
   }

   n_reads++;


 } while (in >> inputGR);

 // done adding reads, now spit the rest out
 while(!PQ.empty()){
   curr_gr = PQ.top();
   PQ.pop();
     // only compare if the previous is not null (in the 1st iteration)
   if(curr_gr.same_chrom(prev_gr) && curr_gr.get_start() < prev_gr.get_start()){
     cerr << "current:\t" << curr_gr << endl;
     cerr << "previous:\t" << prev_gr << endl;
     throw SMITHLABException("split reads unsorted");
   }
	  // next genomic region is not the same as last, update histogram
   if(!curr_gr.same_chrom(prev_gr) || curr_gr.get_start() != prev_gr.get_start()){
	    // count is too big, resize histogram
     if(vals_hist.size() < current_count + 1)
       vals_hist.resize(current_count + 1, 0.0);

	    // increment histogram at current count
     ++vals_hist[current_count];
     current_count = 1;
   }
	  // next genomic region is same as last, increment count
   else 
     ++current_count;
   
   prev_gr.swap(curr_gr);
 }

 return n_reads;
}

// Draw a bootstrap replicate of a counts histogram.
//
// With vals_hist[j] = n_j = number of items observed j times:
//   vals_hist_distinct_counts[k] = k-th index j with vals_hist[j] > 0
//   distinct_counts_hist[k]      = vals_hist[vals_hist_distinct_counts[k]]
// i.e. the two inputs are the positive entries of the histogram and
// the indices they sit at.
//
// A multinomial sample of size sum(distinct_counts_hist), with weights
// distinct_counts_hist, is drawn via gsl_ran_multinomial and written
// back into out_hist at the original indices; all other entries of
// out_hist are 0.  out_hist has size vals_hist_distinct_counts.back()+1,
// so the indices are expected in increasing order.  Empty input gives
// an empty out_hist.
void
resample_hist(const gsl_rng *rng, const vector<size_t> &vals_hist_distinct_counts,
              const vector<double> &distinct_counts_hist,
              vector<double> &out_hist) {

  // front()/back() below are undefined on empty vectors
  if (vals_hist_distinct_counts.empty() || distinct_counts_hist.empty()) {
    out_hist.clear();
    return;
  }
  assert(vals_hist_distinct_counts.size() >= distinct_counts_hist.size());

  vector<unsigned int> sample_distinct_counts_hist(distinct_counts_hist.size(), 0);

  // total number of distinct items = number of multinomial trials
  const unsigned int distinct = 
    static_cast<unsigned int>(accumulate(distinct_counts_hist.begin(),
                                         distinct_counts_hist.end(), 0.0));

  gsl_ran_multinomial(rng, distinct_counts_hist.size(), distinct,
                      &distinct_counts_hist.front(), 
                      &sample_distinct_counts_hist.front());

  out_hist.clear();
  out_hist.resize(vals_hist_distinct_counts.back() + 1, 0.0);
  for (size_t i = 0; i < sample_distinct_counts_hist.size(); i++)
    out_hist[vals_hist_distinct_counts[i]] = 
      static_cast<double>(sample_distinct_counts_hist[i]);
}


// Count the distinct values in a random subsample, drawn without
// replacement, of full_umis.
//
// gsl_ran_choose picks sample_size elements of full_umis; distinct
// values are then counted by comparing neighbours, which is only a
// correct count when equal values are adjacent in the sample (callers
// in this file build full_umis with equal values grouped together).
// sample_size is expected to be at most full_umis.size().
//
// Returns the number of distinct values as a double; 0 for an empty
// sample or an empty source.
static double
sample_count_distinct(const gsl_rng *rng,
                      const vector<size_t> &full_umis,
                      const size_t sample_size) {
  // nothing sampled means nothing distinct; also avoids front() on an
  // empty vector
  if (sample_size == 0 || full_umis.empty())
    return 0.0;

  vector<size_t> sample_umis(sample_size);
  // gsl_ran_choose takes its source as non-const void*
  gsl_ran_choose(rng, &sample_umis.front(), sample_size,
                 const_cast<size_t *>(&full_umis.front()), full_umis.size(), 
                 sizeof(size_t));

  double count = 1.0;
  for (size_t i = 1; i < sample_umis.size(); i++)
    if (sample_umis[i] != sample_umis[i-1])
      count++;

  return count;
}


// Sanity check for an estimated yield curve.  The curve is accepted
// only if it is non-empty, its values sum to a finite number, and from
// the second value on every value is non-negative, never smaller than
// its predecessor, and never grows by more than the previous step did
// (i.e. the increments do not increase).
static bool
check_yield_estimates(const vector<double> &estimates) {

  const size_t n_estimates = estimates.size();
  if (n_estimates == 0)
    return false;

  // a non-finite value anywhere (or an overflowing sum) shows up here
  double total = 0.0;
  for (size_t i = 0; i < n_estimates; ++i)
    total += estimates[i];
  if (!finite(total))
    return false;

  for (size_t i = 1; i < n_estimates; ++i) {
    const double curr = estimates[i];
    const double prev = estimates[i - 1];

    // must not decrease
    if (curr < prev)
      return false;

    // increments must not grow
    if (i >= 2 && (curr - prev > prev - estimates[i - 2]))
      return false;

    // must not be negative
    if (curr < 0.0)
      return false;
  }

  return true;
}

void
estimates_bootstrap(const bool VERBOSE, const vector<double> &orig_hist, 
		    const size_t bootstraps, const size_t orig_max_terms, 
		    const int diagonal, const double bin_step_size, 
		    const double max_extrapolation, const double dupl_level, 
		    const double tolerance, const size_t max_iter,
		    //		    vector<double> &Ylevel_estimates,
		    vector< vector<double> > &bootstrap_estimates) {
  // clear returning vectors
  bootstrap_estimates.clear();
  
  //setup rng
  srand(time(0) + getpid());
  gsl_rng_env_setup();
  gsl_rng *rng = gsl_rng_alloc(gsl_rng_default);
  gsl_rng_set(rng, rand()); 

  double vals_sum = 0.0;
  for(size_t i = 0; i < orig_hist.size(); i++)
    vals_sum += orig_hist[i]*i;

  const double initial_distinct = accumulate(orig_hist.begin(), orig_hist.end(), 0.0);


  vector<size_t> orig_hist_distinct_counts;
  vector<double> distinct_orig_hist;
  for(size_t i = 0; i < orig_hist.size(); i++){
    if(orig_hist[i] > 0){
      orig_hist_distinct_counts.push_back(i);
      distinct_orig_hist.push_back(orig_hist[i]);
    }
  }
  
  for (size_t iter = 0; 
       (iter < max_iter && bootstrap_estimates.size() < bootstraps); 
       ++iter) {
    
    vector<double> yield_vector;
    vector<double> hist;
    resample_hist(rng, orig_hist_distinct_counts, distinct_orig_hist, hist);
    
    double sample_vals_sum = 0.0;
    for(size_t i = 0; i < hist.size(); i++)
      sample_vals_sum += i*hist[i];

    //  const double sample_max_val = max_extrapolation/sample_vals_sum;
    //   const double sample_val_step = step_size/sample_vals_sum;

    //resize boot_hist to remove excess zeros
    while (hist.back() == 0)
      hist.pop_back();

    //construct umi vector to sample from
    vector<size_t> umis;
    size_t umi = 1;
    for(size_t i = 1; i < hist.size(); i++){
      for(size_t j = 0; j < hist[i]; j++){
	for(size_t k = 0; k < i; k++)
	  umis.push_back(umi);
	umi++;
      }
    }
    assert(umis.size() == static_cast<size_t>(sample_vals_sum));

    // compute complexity curve by random sampling w/out replacement
    const size_t upper_limit = static_cast<size_t>(sample_vals_sum);
    const size_t step = static_cast<size_t>(bin_step_size);
    size_t sample = step;
    while(sample < upper_limit){
      yield_vector.push_back(sample_count_distinct(rng, umis, sample));
      sample += step;
    }

    // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
    size_t counts_before_first_zero = 1;
    while (counts_before_first_zero < hist.size() &&
	   hist[counts_before_first_zero] > 0)
      ++counts_before_first_zero;
    
    size_t max_terms = std::min(orig_max_terms, counts_before_first_zero - 1);
    // refit curve for lower bound (degree of approx is 1 less than
    // max_terms)
    max_terms = max_terms - (max_terms % 2 == 1);
    
    //refit curve for lower bound
    const ContinuedFractionApproximation 
      lower_cfa(diagonal, max_terms, bin_step_size, max_extrapolation);
    
    const ContinuedFraction 
      lower_cf(lower_cfa.optimal_cont_frac_distinct(hist));
    
    //extrapolate the curve start
    if (lower_cf.is_valid()){
      double sample_size = static_cast<double>(sample);
      while(sample_size < max_extrapolation){
	double t = (sample_size - sample_vals_sum)/sample_vals_sum;
	assert(t >= 0.0);
	yield_vector.push_back(initial_distinct + t*lower_cf(t));
	sample_size += bin_step_size;
      }
    
    // SANITY CHECK    
      if (check_yield_estimates(yield_vector)) {
	bootstrap_estimates.push_back(yield_vector);
	if (VERBOSE) cerr << '.';
	//	Ylevel_estimates.push_back(lower_cf.Ylevel(hist, dupl_level, sample_vals_sum, sample_max_val, 
	//					   tolerance, max_iter));
      }
      else if (VERBOSE){
	cerr << "_";
      }
    }
    else if (VERBOSE){
      cerr << "_";
    }
    
  }
  if (VERBOSE)
    cerr << endl;
  if (bootstrap_estimates.size() < bootstraps)
    throw SMITHLABException("too many iterations, poor sample");
}

static bool
single_estimates(const bool VERBOSE, vector<double> &hist,
		 size_t max_terms, const int diagonal, 
		 const double step_size, const double max_extrapolation, 
		 vector<double> &yield_estimate) {

  //setup rng
  srand(time(0) + getpid());
  gsl_rng_env_setup();
  gsl_rng *rng = gsl_rng_alloc(gsl_rng_default);
  gsl_rng_set(rng, rand()); 
  

  yield_estimate.clear();
  double vals_sum = 0.0;
  for(size_t i = 0; i < hist.size(); i++)
    vals_sum += i*hist[i];
  const double initial_distinct = accumulate(hist.begin(), hist.end(), 0.0);

  // const double max_val = max_extrapolation/vals_sum;
  // const double val_step = step_size/vals_sum;

    //construct umi vector to sample from
  vector<size_t> umis;
  size_t umi = 1;
  for(size_t i = 1; i < hist.size(); i++){
    for(size_t j = 0; j < hist[i]; j++){
      for(size_t k = 0; k < i; k++)
	umis.push_back(umi);
      umi++;
    }
  }
  assert(umis.size() == static_cast<size_t>(vals_sum));

    // compute complexity curve by random sampling w/out replacement
  size_t upper_limit = static_cast<size_t>(vals_sum);
  size_t step = static_cast<size_t>(step_size);
  size_t sample = step;
  while(sample < upper_limit){
    yield_estimate.push_back(sample_count_distinct(rng, umis, sample));
    sample += step;
  }
  
  // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
  size_t counts_before_first_zero = 1;
  while (counts_before_first_zero < hist.size() && 
	 hist[counts_before_first_zero] > 0)
    ++counts_before_first_zero;


  // Ensure we are not using a zero term
  max_terms = std::min(max_terms, counts_before_first_zero - 1);
  
  // refit curve for lower bound (degree of approx is 1 less than
  // max_terms)
  max_terms = max_terms - (max_terms % 2 == 0);
  
  //refit curve for lower bound
  const ContinuedFractionApproximation 
    lower_cfa(diagonal, max_terms, step_size, max_extrapolation);
    
  const ContinuedFraction 
    lower_cf(lower_cfa.optimal_cont_frac_distinct(hist));
       
    //extrapolate the curve start
  if (lower_cf.is_valid()){
    double sample_size = static_cast<double>(sample);
    while(sample_size < max_extrapolation){
      double t = (sample_size - vals_sum)/vals_sum;
      assert(t >= 0.0);
      yield_estimate.push_back(initial_distinct + t*lower_cf(t));
      sample_size += step_size;
    }
  }
  else{
    // FAIL!
    // lower_cf unacceptable, need to bootstrap to obtain estimates
    return false;
  }

  if (VERBOSE) {
    cerr << "CF_OFFSET_COEFF_ESTIMATES" << endl;
    copy(lower_cf.offset_coeffs.begin(), lower_cf.offset_coeffs.end(),
	 std::ostream_iterator<double>(cerr, "\n"));
    
    cerr << "CF_COEFF_ESTIMATES" << endl;
    copy(lower_cf.cf_coeffs.begin(), lower_cf.cf_coeffs.end(),
	 std::ostream_iterator<double>(cerr, "\n"));
  }

  // SUCCESS!!  
  return true;
}


// Multiplier m for a log-scale confidence interval: callers in this
// file report [estimate/m, estimate*m].  Computed as
//   exp(z * sqrt(log(1 + variance/estimate^2)))
// where z = gsl_cdf_ugaussian_Qinv(alpha/2).
static inline double
alpha_log_confint_multiplier(const double estimate,
                             const double variance, const double alpha) {
  const double z_half_alpha = gsl_cdf_ugaussian_Qinv(alpha/2.0);
  const double relative_variance = variance/pow(estimate, 2);
  const double log_scale_sd = sqrt(log(1.0 + relative_variance));
  return exp(z_half_alpha*log_scale_sd);
}

// Summarise a set of bootstrap curves point by point.
//
// bootstrap_estimates[k][i] is the i-th point of the k-th curve.  For
// every point i the median across curves goes to yield_estimates[i],
// and the interval [median/m, median*m] to lower_ci_lognormal[i] and
// upper_ci_lognormal[i], where m = alpha_log_confint_multiplier(median,
// variance across curves, 1 - ci_level).
//
// All three output vectors are cleared first.  Only points present in
// every curve are summarised, so the outputs have the length of the
// shortest curve.
static void
vector_median_ci(const vector<vector<double> > &bootstrap_estimates,
                 const double ci_level, vector<double> &yield_estimates,
                 vector<double> &lower_ci_lognormal, 
                 vector<double> &upper_ci_lognormal) {

  // start from a clean slate so the three outputs stay aligned
  yield_estimates.clear();
  lower_ci_lognormal.clear();
  upper_ci_lognormal.clear();

  const double alpha = 1.0 - ci_level;
  assert(!bootstrap_estimates.empty());
  if (bootstrap_estimates.empty())
    return; // nothing to summarise when asserts are compiled out

  const size_t n_est = bootstrap_estimates.size();

  // curves may differ in length; never index past the shortest one
  size_t n_points = bootstrap_estimates[0].size();
  for (size_t k = 1; k < n_est; ++k)
    n_points = std::min(n_points, bootstrap_estimates[k].size());

  vector<double> estimates_row(n_est, 0.0);
  for (size_t i = 0; i < n_points; i++) {

    // estimates is in wrong order, work locally on const val
    for (size_t k = 0; k < n_est; ++k)
      estimates_row[k] = bootstrap_estimates[k][i];

    // the median routine expects sorted data
    sort(estimates_row.begin(), estimates_row.end());
    const double median_estimate = 
      gsl_stats_median_from_sorted_data(&estimates_row[0], 1, n_est);

    // spread across bootstraps determines the interval width
    const double variance = gsl_stats_variance(&estimates_row[0], 1, n_est);
    const double confint_mltr = 
      alpha_log_confint_multiplier(median_estimate, variance, alpha);

    yield_estimates.push_back(median_estimate);
    lower_ci_lognormal.push_back(median_estimate/confint_mltr);
    upper_ci_lognormal.push_back(median_estimate*confint_mltr);
  }
}


static void
median_and_ci(const vector<double> &estimates,
	      const double ci_level,
	      double &median_estimate,
	      double &lower_ci_estimate,
	      double &upper_ci_estimate){
  assert(!estimates.empty());
  const double alpha = 1.0 - ci_level;
  const size_t n_est = estimates.size();
  vector<double> sorted_estimates(estimates);
  sort(sorted_estimates.begin(), sorted_estimates.end());
  median_estimate = 
    gsl_stats_median_from_sorted_data(&sorted_estimates[0], 1, n_est);
  const double variance = gsl_stats_variance(&sorted_estimates[0], 1, n_est);
  const double confint_mltr = 
    alpha_log_confint_multiplier(median_estimate, variance, alpha);

  lower_ci_estimate = median_estimate/confint_mltr;
  upper_ci_estimate = median_estimate*confint_mltr;

}

static void
write_predicted_curve(const string outfile, 
		      const double c_level,
		      const double base_step_size,
		      const size_t bin_size,
		      const vector<double> &yield_estimates,
		      const vector<double> &yield_lower_ci_lognormal,
		      const vector<double> &yield_upper_ci_lognormal) {
  std::ofstream of;
  if (!outfile.empty()) of.open(outfile.c_str());
  std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
  
  out << "TOTAL_BASES\tEXPECTED_COVERED_BASES\t"
      << "LOWER_" << 100*c_level << "%CI\t"
      << "UPPER_" << 100*c_level << "%CI" << endl;
  
  out.setf(std::ios_base::fixed, std::ios_base::floatfield);
  out.precision(1);
  
  out << 0 << '\t' << 0 << '\t' << 0 << '\t' << 0 << endl;
  for (size_t i = 0; i < yield_estimates.size(); ++i)
    out << (i + 1)*base_step_size << '\t' 
	<< yield_estimates[i]*bin_size << '\t'
	<< yield_lower_ci_lognormal[i]*bin_size << '\t' 
	<< yield_upper_ci_lognormal[i]*bin_size << endl;
}


int
main(const int argc, const char **argv) {

  try {
    
    const size_t MIN_REQUIRED_COUNTS = 8;

    /* FILES */
    string outfile;
    
    size_t orig_max_terms = 1000;
    double max_extrapolation = 1.0e13;
    double base_step_size = 1e9;
    size_t bootstraps = 100;
    int diagonal = -1;
    double c_level = 0.95;
    size_t max_width = 1000;
    size_t bin_size = 20;
    size_t max_iter = 0;
    double tolerance = 1.0e-20;
    double reads_per_base = 2.0;
    double fixed_fold = 20;
    size_t n_bases_extend = 0;
    
    /* FLAGS */
    bool VERBOSE = false;
    bool NO_SEQUENCE = false;
    bool SINGLE_ESTIMATE = false;
    
    
    /**************** GET COMMAND LINE ARGUMENTS ***********************/
    OptionParser opt_parse(strip_path(argv[0]), 
			   "", "<sorted-mapped-read-file>");
    opt_parse.add_opt("output", 'o', "yield output file (default: stdout)",
		      false , outfile);
    opt_parse.add_opt("max_width", 'w', "max fragment length, "
		      "set equal to read length for single end reads",
		      false, max_width);
    opt_parse.add_opt("bin_size", 'b', "bin size "
		      "(default: " + toa(bin_size) + ")",
		      false, bin_size);
    opt_parse.add_opt("read_extend", 'r', "number of bases to extend reads by "
		      "(default: " + toa(n_bases_extend) + ", no extension)",
		      false, n_bases_extend);
    opt_parse.add_opt("extrap",'e',"maximum extrapolation in base pairs "
		      "(default: " + toa(max_extrapolation) + ")",
		      false, max_extrapolation);
    opt_parse.add_opt("step",'s',"step size in bases between extrapolations "
		      "(default: " + toa(base_step_size) + ")", 
		      false, base_step_size);
    opt_parse.add_opt("bootstraps",'n',"number of bootstraps "
		      "(default: " + toa(bootstraps) + "), ",
		      false, bootstraps);
    opt_parse.add_opt("cval", 'c', "level for confidence intervals "
		      "(default: " + toa(c_level) + ")", false, c_level);
    opt_parse.add_opt("reads_per_base", 'd', "average reads per base "
		      "to predict sequencing level",
		      false, reads_per_base);
    opt_parse.add_opt("terms",'x',"maximum number of terms", 
		      false, orig_max_terms);
    opt_parse.add_opt("fixed_fold",'f',"fixed fold extrapolation to predict",
		      false, fixed_fold);
    //    opt_parse.add_opt("tol",'t', "numerical tolerance", false, tolerance);
    //    opt_parse.add_opt("max_iter",'i', "maximum number of iteration",
    //		      false, max_iter);
    opt_parse.add_opt("verbose", 'v', "print more information", 
		      false, VERBOSE);
    opt_parse.add_opt("bed", 'B', "input is in bed format without sequence information",
    		      false, NO_SEQUENCE);
    opt_parse.add_opt("quick",'Q', 
		      "quick mode: run gc_extrap without bootstrapping for confidence intervals",
		      false, SINGLE_ESTIMATE);
    
    vector<string> leftover_args;
    opt_parse.parse(argc, argv, leftover_args);
    if (argc == 1 || opt_parse.help_requested()) {
      cerr << opt_parse.help_message() << endl;
      return EXIT_SUCCESS;
    }
    if (opt_parse.about_requested()) {
      cerr << opt_parse.about_message() << endl;
      return EXIT_SUCCESS;
    }
    if (opt_parse.option_missing()) {
      cerr << opt_parse.option_missing_message() << endl;
      return EXIT_SUCCESS;
    }
    if (leftover_args.empty()) {
      cerr << opt_parse.help_message() << endl;
      return EXIT_SUCCESS;
    }
    const string input_file_name = leftover_args.front();
    /******************************************************************/
    

    max_width = max_width + n_bases_extend;

    vector<double> coverage_hist;
    size_t n_reads = 0;

    double avg_bins_per_read = 0.0;
    const double dupl_level = 1.0/reads_per_base;

    if(VERBOSE)
      cerr << "LOADING READS" << endl;

    if(NO_SEQUENCE)
      n_reads = load_values_GR(input_file_name, bin_size, max_width, n_bases_extend, coverage_hist);
    else 
      n_reads = load_values_MR(VERBOSE, input_file_name, bin_size, max_width, n_bases_extend, coverage_hist);

    double total_bins = 0.0;
    for(size_t i = 0; i < coverage_hist.size(); i++)
      total_bins += coverage_hist[i]*i;
    const double distinct_bins = accumulate(coverage_hist.begin(), coverage_hist.end(), 0.0);

    avg_bins_per_read = total_bins/n_reads;
    double bin_step_size = base_step_size/bin_size;
    // for large initial experiments need to adjust step size
    // otherwise small relative steps do not account for variance
    // in extrapolation
    if(bin_step_size < (total_bins/20)){
       bin_step_size = std::max(bin_step_size, bin_step_size*round(total_bins/(20*bin_step_size)));
       if(VERBOSE)
	 cerr << "ADJUSTED_STEP_SIZE = " << bin_step_size << endl;
       base_step_size = bin_step_size*bin_size;
    }
    // recorrect the read step size
    //read_step_size = bin_step_size/avg_bins_per_read;
       
    const size_t max_observed_count = coverage_hist.size() - 1;
        
    if (VERBOSE)
      cerr << "TOTAL READS         = " << n_reads << endl
	   << "BASE STEP SIZE      = " << base_step_size << endl
	   << "BIN STEP SIZE       = " << bin_step_size << endl
	   << "TOTAL BINS          = " << total_bins << endl
	   << "BINS PER READ       = " << avg_bins_per_read << endl
	   << "DISTINCT BINS       = " << distinct_bins << endl
	   << "TOTAL BASES         = " << total_bins*bin_size << endl
	   << "TOTAL COVERED BASES = " << distinct_bins*bin_size << endl
	   << "MAX COUNT           = " << max_observed_count << endl
	   << "COUNTS OF 1         = " << coverage_hist[1] << endl;
    
    if (VERBOSE) {
      // OUTPUT THE ORIGINAL HISTOGRAM
      cerr << "OBSERVED BIN COUNTS (" << coverage_hist.size() << ")" << endl;
      for (size_t i = 0; i < coverage_hist.size(); i++)
	if (coverage_hist[i] > 0)
	  cerr << i << '\t' << coverage_hist[i] << endl;
      cerr << endl;
    }

    // catch if all reads are distinct
    if (max_observed_count < MIN_REQUIRED_COUNTS)
      throw SMITHLABException("sample not sufficiently deep or duplicates removed");
    
    /////////////////////////////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////////

    if(VERBOSE)
      cerr << "[ESTIMATING YIELD CURVE]" << endl;
   if(SINGLE_ESTIMATE){
     vector<double> yield_estimates;
     bool SINGLE_ESTIMATE_SUCCESS = 
       single_estimates(VERBOSE, coverage_hist, orig_max_terms, diagonal,
			bin_step_size, max_extrapolation/bin_size, 
			yield_estimates);


      // IF FAILURE, EXIT
      if(!SINGLE_ESTIMATE_SUCCESS)
	throw SMITHLABException("SINGLE ESTIMATE FAILED, NEED TO RUN FULL MODE FOR ESTIMATES");

      std::ofstream of;
      if (!outfile.empty()) of.open(outfile.c_str());
      std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
  
      out << "TOTAL_BASES\tEXPECTED_DISTINCT" << endl;

      out.setf(std::ios_base::fixed, std::ios_base::floatfield);
      out.precision(1);
  
      out << 0 << '\t' << 0 << endl;
      for (size_t i = 0; i < yield_estimates.size(); ++i)
	out << (i + 1)*base_step_size << '\t' 
	    << yield_estimates[i]*bin_size << endl;
   }
   else{
     if (VERBOSE) 
       cerr << "[BOOTSTRAP ESTIMATES]" << endl;
  
     if(max_iter == 0)
       max_iter = 4*bootstraps;
        
     vector<vector <double> > bootstrap_estimates;
     vector<double> Ylevel_estimates;
     estimates_bootstrap(VERBOSE, coverage_hist,  bootstraps, orig_max_terms,
			 diagonal, bin_step_size, max_extrapolation/bin_size, dupl_level,
			 tolerance, max_iter, // Ylevel_estimates, 
			 bootstrap_estimates);
      
     if (VERBOSE)
       cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl;

    // yield ci    
     vector<double> yield_estimates, yield_upper_ci_lognormal, yield_lower_ci_lognormal;

	// use bootstrap estimates to obtain median estimates
     vector_median_ci(bootstrap_estimates, c_level, yield_estimates, 
		      yield_lower_ci_lognormal, yield_upper_ci_lognormal);

     /*      
    // Y50 median and ci
     double Ylevel_median = 0.0;
     double Ylevel_lower_ci = 0.0;
     double Ylevel_upper_ci = 0.0;
     median_and_ci(Ylevel_estimates, c_level, Ylevel_median,
		   Ylevel_lower_ci, Ylevel_upper_ci);

     */
    /////////////////////////////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////////
    /////////////////////////////////////////////////////////////////////
     if (VERBOSE) 
       cerr << "[WRITING OUTPUT]" << endl;
    
     write_predicted_curve(outfile, c_level, base_step_size, bin_size,
			   yield_estimates, yield_lower_ci_lognormal, 
			   yield_upper_ci_lognormal);

     /*
     if(VERBOSE){
       cerr << "Y" << 100*dupl_level << "MEASURE OF LIBRARY QUALITY: "
	    << "EXPECTED # READS TO REACH "
	    << 100*dupl_level << "% DUPLICATES" << endl;
       cerr << "Y" << 100*dupl_level << " = " << Ylevel_median << endl;
       cerr << 100*c_level << "%CI: (" << Ylevel_lower_ci << ", " 
	    << Ylevel_upper_ci << ")" << endl;
     } 
     */
   }
      
  }
  catch (SMITHLABException &e) {
    cerr << "ERROR:\t" << e.what() << endl;
    return EXIT_FAILURE;
  }
  catch (std::bad_alloc &ba) {
    cerr << "ERROR: could not allocate memory" << endl;
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}