This repository documents the everyday Unix commands that I use.
# Create the bin, lib, man and src subdirectories under ./directory in one call (parents are created as needed).
mkdir -p -- directory/{bin,lib,man,src}
After running this, the directory structure looks like this:
$ find . | grep directory
./directory
./directory/bin
./directory/lib
./directory/man
./directory/src
This assumes the chromosome name is field 1 of the GFF file.
- Step 1: Pull fields 1, 4 and 5 (Name, Left and Right respectively)
- Step 2: Get rid of comment lines
- Step 3: Sort first by chromosome, then numerically by the left position
# Print fields 1, 4 and 5 of the GFF file, drop lines starting with "#", and sort by field 1, then numerically by the second printed field.
cut -f 1,4,5 Ncbi_Gff_File.gff \
  | grep -v '^#' \
  | sort -k 1,1 -k 2,2n
# Convert FASTQ to FASTA, assuming 4-line records: line 1 becomes the header with its first character replaced by ">", line 2 is copied as the sequence, lines 3 and 4 are dropped.
awk 'NR % 4 == 1 { print ">" substr($0, 2) } NR % 4 == 2' file1.fastq > file1.fasta
# Print the ratio of the byte sizes of file1 and file2 as a floating-point number (stat -c is the GNU coreutils form).
# The whole expression is quoted and built with $(...) so it reaches bc as a single string.
bc -l <<< "$(stat -c '%s' file1)/$(stat -c '%s' file2)"
# Recompress a bzip2 file as gzip through a pipe, without writing the uncompressed data to disk.
bzip2 -dc file.bz2 | gzip -c > file.gz
# From results.tbl: skip lines starting with "#", keep columns 1, 4 and 5 (tab-separated), sort by column 1 and then
# general-numerically from the third printed column, and print only the first row of each run of equal column-1 values.
grep -v '^#' results.tbl \
  | awk -v OFS='\t' '{ print $1, $4, $5 }' \
  | sort -k 1,1 -k 3g \
  | awk '$1 != prev { print } { prev = $1 }'
# Reduce every line whose first field contains ">" (the FASTA headers) to that first whitespace-delimited field; all other lines pass through unchanged.
awk '$1 ~ />/ { print $1; next } { print }' file1.fasta > file2.fasta
# Print the mean length of line 2 of every 4-line record in file.fastq (the read sequences).
# An input with no reads reports an error and exits 1 instead of aborting awk with a division by zero.
awk 'NR % 4 == 2 { total += length; reads++ }
     END {
       if (reads == 0) { print "no reads found" > "/dev/stderr"; exit 1 }
       print total / reads
     }' file.fastq
# Convert a FASTA file (sequences may span several lines) to a two-column CSV of name and sequence.
# The name is the header up to the first space, without the leading ">".
# Empty input now yields only the CSV header row instead of an extra "","" record.
echo '"name","sequence"' && cut -d ' ' -f 1 file.fna | awk '
  # Print the record collected so far as one quoted CSV row.
  function emit() { if (have) print "\"" name "\",\"" seq "\"" }
  $1 ~ /^>/ { emit(); name = substr($1, 2); seq = ""; have = 1; next }
            { seq = seq $1 }
  END       { emit() }'
# Gzip every regular file found under the ./*.fna start points that was modified less than one day ago.
# "-exec ... +" hands many files to each gzip call instead of starting one process per file; "--" ends gzip option parsing.
find ./*.fna -type f -mtime -1 -exec gzip -- {} +
# Gzip, via GNU parallel, every regular file found under the ./*.fna start points that was modified less than one day ago; names are passed NUL-separated.
find ./*.fna -type f -mtime -1 -print0 | parallel --quote --null gzip
Set up the database
# Build a nucleotide BLAST database named "sequence" from sequence.fa.
makeblastdb -dbtype nucl -in sequence.fa -out sequence
Query the database for all self-hits that aren't the same region
# BLAST sequence.fa against its own database (tabular output) and keep rows where columns 1 and 2 match
# but column 7 differs from column 9 or column 8 differs from column 10.
blastn -db sequence -query sequence.fa -outfmt 6 \
  | awk '$1 == $2 && ($7 != $9 || $8 != $10)'
If the sequence.fa file is large, you can gain serious speedups by parallelizing
# Self-hit BLAST search with sequence.fa split at ">" into blocks of about 50k that GNU parallel pipes to separate blastn processes on stdin.
# The awk filter keeps rows where columns 1 and 2 match but column 7 differs from column 9 or column 8 differs from column 10.
parallel --pipe --block 50k --recstart '>' blastn -outfmt 6 -db sequence -query - < sequence.fa \
  | awk '$1 == $2 && ($7 != $9 || $8 != $10)'
# Extract the profiles whose names are listed in list_of_pfams from Pfam-A.hmm into shortened_pfams.hmm.
# With -f, hmmfetch takes the HMM file first and the key file second.
hmmfetch -f Pfam-A.hmm list_of_pfams > shortened_pfams.hmm
# Long-list regular files larger than 5G below the current directory, smallest first, with human-readable sizes.
# "-exec ... +" passes the matches to ls together so that -S/-r can sort them; with "\;" each file got its own ls and the sort flags had no effect.
find . -type f -size +5G -exec ls -lShr {} +
# Gzip all *.fq files in the current directory with GNU parallel, one gzip job per file.
# The ./ prefix keeps a file name that starts with "-" from being read as a gzip option.
parallel --gnu gzip ::: ./*.fq
# Compress file.fq to stdout and append the compressed data to compress_file.fq.gz.
gzip --stdout file.fq >> compress_file.fq.gz
# Long-list the ninth lsof column (presumably the file name — confirm) for open files whose lsof line mentions the
# current directory and contains "1w" (presumably file descriptor 1 opened for writing — confirm).
# The directory is quoted and matched literally (-F) so spaces or regex characters in the path cannot break the filter.
lsof | grep -F -- "$PWD" | grep '1w' | awk '{ print $9 }' | xargs -r ls -l
This prints "Waiting..." once, checks every second whether the process with ID $PID is still running, and starts script.sh in the background as soon as that process has exited.
# Print "Waiting..." once, poll every second until ps no longer finds process $PID, then start script.sh in the background under nohup.
# "$PID" is quoted so it is always passed to ps as exactly one argument.
echo 'Waiting...'; while ps -p "$PID" > /dev/null; do sleep 1; done; nohup script.sh &
# Delete every regular file below the current directory except those named NC_000913.fna.
# find runs rm itself, so names containing spaces or newlines are handled and rm is not invoked when nothing matches.
find . -type f ! -name 'NC_000913.fna' -exec rm -- {} +
# Run Prodigal (-p meta) on output.fna in chunks of about 50k split at ">", one process per chunk via GNU parallel.
# The -a output is sent to stdout and appended to out.faa; the -d output is sent to stderr and appended to out.ffn.
# NOTE(review): every chunk is given the same -o output_test.gff, so the chunks probably overwrite each other there — confirm.
parallel --pipe --block 50k --recstart '>' \
  ~/Desktop/Job/Sandia/software/Prodigal/prodigal -q -p meta -f gff -o output_test.gff -a /dev/stdout -d /dev/stderr \
  < output.fna >> out.faa 2>> out.ffn