Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement cram chunking and minimap2-based Hi-C alignments #113

Merged
merged 19 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bin/awk_filter_reads.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
awk 'BEGIN{OFS="\t"}{if($1 ~ /^\@/) {print($0)} else {$2=and($2,compl(2048)); print(substr($0,2))}}'
109 changes: 109 additions & 0 deletions bin/filter_five_end.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/perl
use strict;
use warnings;

my $prev_id = "";
my @five;
my @three;
my @unmap;
my @mid;
my @all;
my $counter = 0;

while (<STDIN>){
chomp;
if (/^@/){
print $_."\n";
next;
}
my ($id, $flag, $chr_from, $loc_from, $mapq, $cigar, $d1, $d2, $d3, $read, $read_qual, @rest) = split /\t/;
my $bin = reverse(dec2bin($flag));
my @binary = split(//,$bin);
if ($prev_id ne $id && $prev_id ne ""){
if ($counter == 1){
if (@five == 1){
print $five[0]."\n";
}
else{
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
my $bin_1 = reverse(dec2bin($flag_1));
my @binary_1 = split(//,$bin_1);
$binary_1[2] = 1;
my $bin_1_new = reverse(join("",@binary_1));
my $flag_1_new = bin2dec($bin_1_new);
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
}
}
elsif ($counter == 2 && @five == 1){
print $five[0]."\n";
}
else{
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
my $bin_1 = reverse(dec2bin($flag_1));
my @binary_1 = split(//,$bin_1);
$binary_1[2] = 1;
my $bin_1_new = reverse(join("",@binary_1));
my $flag_1_new = bin2dec($bin_1_new);
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
}

$counter = 0;
undef @unmap;
undef @five;
undef @three;
undef @mid;
undef @all;
}

$counter++;
$prev_id = $id;
push @all,$_;
if ($binary[2]==1){
push @unmap,$_;
}
elsif ($binary[4]==0 && $cigar =~ m/^[0-9]*M/ || $binary[4]==1 && $cigar =~ m/.*M$/){
push @five, $_;
}
elsif ($binary[4]==1 && $cigar =~ m/^[0-9]*M/ || $binary[4]==0 && $cigar =~ m/.*M$/){
push @three, $_;
}
elsif ($cigar =~ m/^[0-9]*[HS].*M.*[HS]$/){
push @mid, $_;
}
}

if ($counter == 1){
if (@five == 1){
print $five[0]."\n";
}
else{
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
my $bin_1 = reverse(dec2bin($flag_1));
my @binary_1 = split(//,$bin_1);
$binary_1[2] = 1;
my $bin_1_new = reverse(join("",@binary_1));
my $flag_1_new = bin2dec($bin_1_new);
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
}
}
elsif ($counter == 2 && @five == 1){
print $five[0]."\n";
}
else{
my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0];
my $bin_1 = reverse(dec2bin($flag_1));
my @binary_1 = split(//,$bin_1);
$binary_1[2] = 1;
my $bin_1_new = reverse(join("",@binary_1));
my $flag_1_new = bin2dec($bin_1_new);
print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n");
}

sub dec2bin {
my $str = unpack("B32", pack("N", shift));
return $str;
}

sub bin2dec {
return unpack("N", pack("B32", substr("0" x 32 . shift, -32)));
}
99 changes: 99 additions & 0 deletions bin/generate_cram_csv.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/bin/bash

# generate_cram_csv.sh
# -------------------
# Generate a csv file describing the CRAM folder
# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°>
# Author = yy5
# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°>

# NOTE: chunk_size is the number of containers per chunk (not records/reads)

# Function to process chunking of a CRAM file

chunk_cram() {
local cram=$1
local chunkn=$2
local outcsv=$3
local crai=$4
local chunk_size=$5

local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g")
local ncontainers=$(zcat "${realcrai}" | wc -l)
local base=$(basename "${realcram}" .cram)

if [ $chunk_size -gt $ncontainers ]; then
chunk_size=$((ncontainers - 1))
fi
local from=0
local to=$((chunk_size - 1))

while [ $to -lt $ncontainers ]; do
#echo "chunk $chunkn: $from - $to"
echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
from=$((to + 1))
to=$((to + chunk_size))
((chunkn++))
done

# Catch any remainder
if [ $from -lt $ncontainers ]; then
to=$((ncontainers - 1))
#echo "chunk $chunkn: $from - $to"
echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
((chunkn++))
fi
}

# Function to process a CRAM file
process_cram_file() {
local cram=$1
local chunkn=$2
local outcsv=$3
local crai=$4
local chunk_size=$5

local read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}')
local num_read_groups=$(echo "$read_groups" | wc -w)
if [ "$num_read_groups" -gt 1 ]; then
# Multiple read groups: process each separately
for rg in $read_groups; do
local output_cram="$(basename "${cram%.cram}")_output_${rg}.cram"
samtools view -h -r "$rg" -o "$output_cram" "$cram"
#chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai" "$chunk_size")
chunk_cram "$output_cram" "$chunkn" "$outcsv" "$crai" "$chunk_size"
done
else
# Single read group or no read groups
#chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size")
chunk_cram "$cram" "$chunkn" "$outcsv" "$crai" "$chunk_size"
fi
}

# /\_/\ /\_/\
# ( o.o ) main ( o.o )
# > ^ < > ^ <

# Check if cram_path is provided
if [ $# -lt 3 ]; then
echo "Usage: $0 <cram_path> <output_csv> <crai_file> <chunk_size>"
exit 1
fi

cram=$1
outcsv=$2
crai=$3
if [ -z "$4" ]; then
chunk_size=10000
else
chunk_size=$4
fi
chunkn=0

# Operates on a single CRAM file
realcram=$(readlink -f $cram)
realcrai=$(readlink -f $crai)
if [ -f "$outcsv" ]; then
rm "$outcsv"
fi
process_cram_file $realcram $chunkn $outcsv $realcrai $chunk_size
10 changes: 10 additions & 0 deletions bin/grep_pg.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash

# grep_pg.sh
# -------------------
# A shell script to exclude pg lines and label read 1 and read 2 from cram containers
#
# -------------------
# Author = yy5

grep -v "^\@PG" | awk '{if($1 ~ /^\@/) {print($0)} else {if(and($2,64)>0) {print(1$0)} else {print(2$0)}}}'
21 changes: 21 additions & 0 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ process {
time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
}

withName: SAMTOOLS_INDEX {
cpus = { log_increase_cpus(2, 6*task.attempt, 1, 2) }
memory = { check_max( 4.GB + 850.MB * log_increase_cpus(2, 6*task.attempt, 1, 2) * task.attempt + 0.6.GB * Math.ceil( meta.read_count / 100000000 ), 'memory' ) }
time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
}

withName: BLAST_BLASTN {
time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
Expand Down Expand Up @@ -88,6 +94,21 @@ process {
time = { check_max( 1.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
}

withName: CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT {
cpus = { check_max( 16 * 1 , 'cpus' ) }
memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
}

withName: CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT {
cpus = { check_max( 16 * 1 , 'cpus' ) }
memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
}

withName: MINIMAP2_INDEX {
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 1.GB * Math.ceil( 30 * fasta.size() / 1e+9 ) * task.attempt, 'memory' ) }
}

withName: CRUMBLE {
// No correlation between memory usage and the number of reads or the genome size.
// Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB.
Expand Down
61 changes: 53 additions & 8 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,6 @@ process {
ext.args = '-F 0x200 -nt'
}

withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' {
ext.args = { "-5SPCp -R ${meta.read_group}" }
}

withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' {
ext.args = { "-R ${meta.read_group}" }
}

withName: SAMTOOLS_MERGE {
ext.args = { "-c -p" }
ext.prefix = { "${meta.id}.merge" }
Expand All @@ -46,6 +38,54 @@ process {
ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index"
}

withName: CONVERT_CRAM {
ext.args = "--output-fmt cram"
}

withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' {
ext.args = { "-5SPCp -R ${meta.read_group}" }
}

withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' {
ext.args = { "-p -R ${meta.read_group}" }
}

withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
ext.args = ""
ext.args1 = { "-F 0x200 -nt" }
ext.args2 = { "-p -R '${rglines}'" }
ext.args3 = "-mpu"
ext.args4 = { "--write-index -l1" }
}

withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
ext.args = ""
ext.args1 = { "-F 0x200 -nt" }
ext.args2 = { "-ax sr" }
ext.args3 = "-mpu"
ext.args4 = { "--write-index -l1" }
}

withName: ".*:ALIGN_HIC:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" {
ext.args = ""
ext.args1 = { "-F 0x200 -nt" }
ext.args2 = { "-5SPCp -R '${rglines}'" }
ext.args3 = "-mpu"
ext.args4 = { "--write-index -l1" }
}

withName: ".*:ALIGN_HIC:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" {
ext.args = ""
ext.args1 = { "" }
ext.args2 = { "-ax sr" }
ext.args3 = "-mpu"
ext.args4 = { "--write-index -l1" }
}

withName: "MINIMAP2_INDEX" {
ext.args = { "${fasta.size() > 2.5e9 ? (" -I " + Math.ceil(fasta.size()/1e9)+"G") : ""} "}
}

// minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp.
// In fact, we can also use -I to *decrease* the memory requirements for smaller genomes
// NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp
Expand Down Expand Up @@ -81,6 +121,11 @@ process {
ext.prefix = { "${bam.baseName}" }
}

withName: SAMTOOLS_ADDREPLACERG {
ext.prefix = { "${input.baseName}_addRG" }
ext.args = { "-r ${meta.read_group} --no-PG" }
}

withName: SAMTOOLS_STATS {
ext.prefix = { "${input.baseName}" }
}
Expand Down
3 changes: 3 additions & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ params {
// Output directory
outdir = "${projectDir}/results"

// Aligner
short_aligner = "minimap2"

// Fasta references
fasta = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
header = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.header.sam"
Expand Down
10 changes: 10 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@
"git_sha": "a33ef9475558c6b8da08c5f522ddaca1ec810306",
"installed_by": ["modules"]
},
"minimap2/index": {
"branch": "master",
"git_sha": "72e277acfd9e61a9f1368eafb4a9e83f5bcaa9f5",
"installed_by": ["modules"]
},
"samtools/faidx": {
"branch": "master",
"git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",
Expand All @@ -60,6 +65,11 @@
"git_sha": "46eca555142d6e597729fcb682adcc791796f514",
"installed_by": ["modules"]
},
"samtools/index": {
"branch": "master",
"git_sha": "46eca555142d6e597729fcb682adcc791796f514",
"installed_by": ["modules"]
},
"samtools/merge": {
"branch": "master",
"git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519",
Expand Down
Loading
Loading