Skip to content

Commit

Permalink
DEV: additional bindings, cli for PyPI install
Browse files Browse the repository at this point in the history
  • Loading branch information
anuradhawick committed Aug 27, 2024
1 parent 3d33eca commit 4fea51d
Show file tree
Hide file tree
Showing 13 changed files with 277 additions and 154 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 15 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
[![Conda](https://img.shields.io/conda/v/bioconda/kmertools)](https://anaconda.org/bioconda/kmertools)
[![Conda](https://img.shields.io/conda/dn/bioconda/kmertools)](https://anaconda.org/bioconda/kmertools)
[![codecov](https://codecov.io/gh/anuradhawick/kmertools/graph/badge.svg?token=IDGRE54SSQ)](https://codecov.io/gh/anuradhawick/kmertools)
![PyPI - Version](https://img.shields.io/pypi/v/pykmertools)

<div align="center">
<pre>
Expand All @@ -25,14 +26,19 @@ $$ | \$$\ $$ | $$ | $$ |\$$$$$$$\ $$ | $$ | \$$$$$$ |\$$$$$$ |$$ |$$

`kmertools` is a k-mer based feature extraction tool designed to support metagenomics and other bioinformatics analytics. This tool leverages k-mer analysis to vectorize DNA sequences, facilitating the use of these vectors in various AI/ML applications.

**NEW:** `kmertools` is now available on bioconda at [https://anaconda.org/bioconda/kmertools](https://anaconda.org/bioconda/kmertools).
**NEW**

* `kmertools` is now available on BioConda at [https://anaconda.org/bioconda/kmertools](https://anaconda.org/bioconda/kmertools).
* `kmertools` is now available on PyPI at [https://pypi.org/project/pykmertools/](https://pypi.org/project/pykmertools/).
* `kmertools` now provides Python bindings. Read more in our [Wiki](https://github.com/anuradhawick/kmertools/wiki).

## Features

- **Oligonucleotide Frequency Vectors:** Generate frequency vectors for oligonucleotides.
- **Minimiser Binning:** Efficiently bin sequences using minimisers to reduce data complexity.
- **Chaos Game Representation (CGR):** Compute CGR vectors for DNA sequences based on k-mers or whole sequence transformation.
- **Coverage Histograms:** Create coverage histograms to analyze the depth of sequencing reads.
- **Python Bindings:** Use `kmertools` functionality from Python via `import pykmertools as kt`.

## Installation

Expand All @@ -47,8 +53,15 @@ conda create -n kmertools -c bioconda kmertools
# activate environment
conda activate kmertools
```
### Option 2: from PyPI

You can install `kmertools` from PyPI, where it is published as [`pykmertools`](https://pypi.org/project/pykmertools/).

```bash
pip install pykmertools
```

### Option 2: from sources
### Option 3: from sources

You can install `kmertools` directly from the source by cloning the repository and using Rust's package manager `cargo`.

Expand Down
1 change: 1 addition & 0 deletions kmer/src/kmer_minimisers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::cmp::min;
use std::collections::VecDeque;
use std::iter::Iterator;

// TODO: the original purpose of this module is unclear (it was probably a test script); review whether it is still needed.
// https://github.com/lh3/minimap2/blob/0cc3cdca27f050fb80a19c90d25ecc6ab0b0907b/sketch.c#L9C1-L26C3
const SEQ_NT4_TABLE: [u8; 256] = [
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
Expand Down
1 change: 0 additions & 1 deletion kmer/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
pub mod kmer;
pub mod kmer_minimisers;
pub mod minimiser;
pub type Kmer = u64;

Expand Down
132 changes: 132 additions & 0 deletions kmertools/src/args.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
use clap::{Args, Parser, Subcommand, ValueEnum};
use composition::{cgr::CgrComputer, oligo::OligoComputer, oligocgr::OligoCgrComputer};
use coverage::CovComputer;
use ktio::fops::create_directory;
use misc::minimisers;

const ABOUT: &str = "kmertools: DNA vectorisation
Expand Down Expand Up @@ -226,3 +230,131 @@ pub struct CounterCommand {
#[arg(short, long, default_value_t = 0)]
pub threads: usize,
}

#[cfg(not(tarpaulin_include))]
pub fn cli(cli: Cli) {
match cli.command {
Commands::Comp { command } => match command {
CompositionCommands::Oligo(command) => {
let mut com =
OligoComputer::new(command.input, command.output, command.k_size as usize);
if command.threads > 0 {
com.set_threads(command.threads);
}
com.set_norm(!command.counts);
com.set_header(command.header);

match command.preset {
VecFmtPreset::Csv => com.set_delim(",".to_owned()),
VecFmtPreset::Spc => com.set_delim(" ".to_owned()),
VecFmtPreset::Tsv => com.set_delim("\t".to_owned()),
}
if let Err(e) = com.vectorise() {
eprintln!("Error: {}", e);
}
}
CompositionCommands::Cgr(command) => {
if let Some(ksize) = command.k_size {
let vecsize = command
.vec_size
.unwrap_or((ksize as f64).powf(4.0).powf(0.5) as u64)
as usize;
let mut cgr = OligoCgrComputer::new(
command.input,
command.output,
ksize as usize,
vecsize,
);
if command.threads > 0 {
cgr.set_threads(command.threads);
}
cgr.set_norm(!command.counts);
if let Err(e) = cgr.vectorise() {
eprintln!("Error: {}", e);
}
} else {
if command.counts {
eprintln!("Error: cannot use counts in whole sequence CGR!");
return;
}
let vecsize = command.vec_size.unwrap_or(1) as usize;
let mut cgr = CgrComputer::new(command.input, command.output, vecsize);
if command.threads > 0 {
cgr.set_threads(command.threads);
}
if let Err(e) = cgr.vectorise() {
eprintln!("Error: {}", e);
}
}
}
},
Commands::Cov(command) => {
create_directory(&command.output).unwrap();
let mut cov = CovComputer::new(
command.input,
command.output,
command.k_size as usize,
command.bin_size as usize,
command.bin_count as usize,
);
if command.threads > 0 {
cov.set_threads(command.threads);
}
if let Some(path) = command.alt_input {
cov.set_kmer_path(path);
}
if command.counts {
cov.set_norm(false);
}
cov.set_max_memory(command.memory as f64);
match command.preset {
VecFmtPreset::Csv => cov.set_delim(",".to_owned()),
VecFmtPreset::Spc => cov.set_delim(" ".to_owned()),
VecFmtPreset::Tsv => cov.set_delim("\t".to_owned()),
}
cov.build_table().unwrap();
cov.compute_coverages();
}
Commands::Min(command) => {
if command.w_size <= command.m_size && command.w_size > 0 {
eprintln!("Window size must be longer than minimiser size!");
return;
}
if command.m_size >= 31 {
eprintln!("Minimisers longer than 30 bases not allowed!");
return;
}

match command.preset {
MinFmtPreset::M2s => minimisers::bin_sequences(
command.w_size as usize,
command.m_size as usize,
&command.input,
&command.output,
command.threads,
),
MinFmtPreset::S2m => minimisers::seq_to_min(
command.w_size as usize,
command.m_size as usize,
&command.input,
&command.output,
command.threads,
),
}
}
Commands::Ctr(command) => {
create_directory(&command.output).unwrap();
let mut ctr =
counter::CountComputer::new(command.input, command.output, command.k_size as usize);
if command.threads > 0 {
ctr.set_threads(command.threads);
}
if command.acgt {
ctr.set_acgt_output(true);
}
ctr.set_max_memory(command.memory as f64);
ctr.count();
ctr.merge(true);
}
}
}
1 change: 1 addition & 0 deletions kmertools/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod args;
136 changes: 3 additions & 133 deletions kmertools/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,139 +1,9 @@
use args::CompositionCommands;
use args::Cli;
use clap::Parser;
use composition::oligo::OligoComputer;
use coverage::CovComputer;
use misc::minimisers;
mod args;

#[cfg(not(tarpaulin_include))]
fn main() {
use composition::{cgr::CgrComputer, oligocgr::OligoCgrComputer};
use ktio::fops::create_directory;

let cli = args::Cli::parse();

match cli.command {
args::Commands::Comp { command } => match command {
CompositionCommands::Oligo(command) => {
let mut com =
OligoComputer::new(command.input, command.output, command.k_size as usize);
if command.threads > 0 {
com.set_threads(command.threads);
}
com.set_norm(!command.counts);
com.set_header(command.header);

match command.preset {
args::VecFmtPreset::Csv => com.set_delim(",".to_owned()),
args::VecFmtPreset::Spc => com.set_delim(" ".to_owned()),
args::VecFmtPreset::Tsv => com.set_delim("\t".to_owned()),
}
if let Err(e) = com.vectorise() {
eprintln!("Error: {}", e);
}
}
CompositionCommands::Cgr(command) => {
if let Some(ksize) = command.k_size {
let vecsize = command
.vec_size
.unwrap_or((ksize as f64).powf(4.0).powf(0.5) as u64)
as usize;
let mut cgr = OligoCgrComputer::new(
command.input,
command.output,
ksize as usize,
vecsize,
);
if command.threads > 0 {
cgr.set_threads(command.threads);
}
cgr.set_norm(!command.counts);
if let Err(e) = cgr.vectorise() {
eprintln!("Error: {}", e);
}
} else {
if command.counts {
eprintln!("Error: cannot use counts in whole sequence CGR!");
return;
}
let vecsize = command.vec_size.unwrap_or(1) as usize;
let mut cgr = CgrComputer::new(command.input, command.output, vecsize);
if command.threads > 0 {
cgr.set_threads(command.threads);
}
if let Err(e) = cgr.vectorise() {
eprintln!("Error: {}", e);
}
}
}
},
args::Commands::Cov(command) => {
create_directory(&command.output).unwrap();
let mut cov = CovComputer::new(
command.input,
command.output,
command.k_size as usize,
command.bin_size as usize,
command.bin_count as usize,
);
if command.threads > 0 {
cov.set_threads(command.threads);
}
if let Some(path) = command.alt_input {
cov.set_kmer_path(path);
}
if command.counts {
cov.set_norm(false);
}
cov.set_max_memory(command.memory as f64);
match command.preset {
args::VecFmtPreset::Csv => cov.set_delim(",".to_owned()),
args::VecFmtPreset::Spc => cov.set_delim(" ".to_owned()),
args::VecFmtPreset::Tsv => cov.set_delim("\t".to_owned()),
}
cov.build_table().unwrap();
cov.compute_coverages();
}
args::Commands::Min(command) => {
if command.w_size <= command.m_size && command.w_size > 0 {
eprintln!("Window size must be longer than minimiser size!");
return;
}
if command.m_size >= 31 {
eprintln!("Minimisers longer than 30 bases not allowed!");
return;
}

match command.preset {
args::MinFmtPreset::M2s => minimisers::bin_sequences(
command.w_size as usize,
command.m_size as usize,
&command.input,
&command.output,
command.threads,
),
args::MinFmtPreset::S2m => minimisers::seq_to_min(
command.w_size as usize,
command.m_size as usize,
&command.input,
&command.output,
command.threads,
),
}
}
args::Commands::Ctr(command) => {
create_directory(&command.output).unwrap();
let mut ctr =
counter::CountComputer::new(command.input, command.output, command.k_size as usize);
if command.threads > 0 {
ctr.set_threads(command.threads);
}
if command.acgt {
ctr.set_acgt_output(true);
}
ctr.set_max_memory(command.memory as f64);
ctr.count();
ctr.merge(true);
}
}
let cli = Cli::parse();
args::cli(cli);
}
2 changes: 2 additions & 0 deletions pykmertools/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,7 @@ test = false
[dependencies]
pyo3 = { version = "0.22.0", "features" = ["abi3-py39"] }
rayon = "1.10.0"
clap = { version = "4.5.4" }
composition = { path = "../composition" }
kmer = { path = "../kmer" }
kmertools = { path = "../kmertools" }
3 changes: 3 additions & 0 deletions pykmertools/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@ Documentation = "https://github.com/anuradhawick/kmertools/wiki"

[tool.maturin]
features = ["pyo3/extension-module"]

[project.scripts]
kmertools = "pykmertools:run_cli"
13 changes: 0 additions & 13 deletions pykmertools/src/cov.rs

This file was deleted.

Loading

0 comments on commit 4fea51d

Please sign in to comment.