-
Notifications
You must be signed in to change notification settings - Fork 36
/
khmer_re_pair.pl
executable file
·80 lines (59 loc) · 1.99 KB
/
khmer_re_pair.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env perl
=head1 NAME
khmer_re_pair.pl
=head1 SYNOPSIS
khmer_re_pair.pl -i original_interleaved_fasta -k khmer_filtered_fasta
=head1 DESCRIPTION
- Takes -i, an original paired (interleaved) fasta read file (one line per sequence)
- Takes the khmer_filtered_fasta file
- For each read in the khmer_filtered_fasta file, it prints out both the /1 and the /2 (without duplicates)
- The goal is to not lose pairing info which khmer does lose
- Assumes the khmer_filtered_fasta file was created from the original file, and is in the same order
=head1 AUTHORS
[email protected] 2012.07.08
=cut
use strict;
use warnings;
use Getopt::Long qw(:config pass_through no_ignore_case);
my $original_file;
my $khmer_file;
GetOptions (
"interleaved=s"=> \$original_file,
"khmer=s" => \$khmer_file,
);
if (not $original_file and not $khmer_file) {
print STDERR "Usage: khmer_re_pair.pl -i interleaved_original_fasta -k khmer_filtered_fasta\n";
exit;
}
my $original_fh = &read_fh ($original_file);
my $khmer_fh = &read_fh ($khmer_file);
my %khmer_hash;
my $pair = "";
#-------------------------------------------
# load khmer filtered file headers in memory
while (<$khmer_fh>) {
die "-khmer file does not seem to have /1 or /2 as the first read's suffix" unless /^(>\S+?)\/\d\b/;
$khmer_hash{$1} = 1;
<$khmer_fh>; #ignore the sequence
}
#-------------------------------------------
# go through original file and print out
# all pairs where header is in khmer_hash
while (<$original_fh>) {
die "-original file does not seem to have /1 or /2 as the first read's suffix" unless /^(>\S+?)\/\d\b/;
$pair = $_;
for (1..3) {$pair .= <$original_fh>}
print $pair if $khmer_hash{$1};
}
########################################
sub read_fh {
my $filename = shift @_;
my $filehandle;
if ($filename =~ /gz$/) {
open $filehandle, "gunzip -dc $filename |" or die $!;
}
else {
open $filehandle, "<$filename" or die $!;
}
return $filehandle;
}