Skip to content

Commit

Permalink
Merge branch 'c-python' of github.com:esrice/trio_binning into c-python
Browse files Browse the repository at this point in the history
  • Loading branch information
esrice committed Sep 29, 2022
2 parents 74b33ac + 35a3805 commit 6b3cc73
Show file tree
Hide file tree
Showing 9 changed files with 75 additions and 37 deletions.
8 changes: 4 additions & 4 deletions c/kmers.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,13 @@ uint64_t kmer_to_int(char* kmer, unsigned char k) {
// no need to waste time doing 'kmer_int |= (0 << (i*2))'
break;
case 'C':
kmer_int |= (1 << (i*2));
kmer_int |= (UINT64_C(1) << (i*2));
break;
case 'G':
kmer_int |= (2 << (i*2));
kmer_int |= (UINT64_C(2) << (i*2));
break;
case 'T':
kmer_int |= (3 << (i*2));
kmer_int |= (UINT64_C(3) << (i*2));
}
}

Expand Down Expand Up @@ -290,7 +290,7 @@ void count_kmers_in_read(
kmer[j] = read[i+j];
if (kmer_in_hash_set(kmer, kmer_revcomp, haplotype_A))
(*count_A)++;
if (kmer_in_hash_set(kmer, kmer_revcomp, haplotype_B))
else if (kmer_in_hash_set(kmer, kmer_revcomp, haplotype_B))
(*count_B)++;
}

Expand Down
21 changes: 21 additions & 0 deletions src/trio_binning/kmers.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,27 @@ class _HashSet(Structure):
POINTER(c_int),
]

kmer_to_int_c = lib.kmer_to_int
kmer_to_int_c.argtypes = [c_char_p, c_ubyte]
kmer_to_int_c.restype = c_uint64


def kmer_to_int(kmer: str) -> int:
"""Convert a kmer to integer format"""
return kmer_to_int_c(bytes(kmer, "utf-8"), c_ubyte(len(kmer)))


reverse_complement_c = lib.reverse_complement
reverse_complement_c.argtypes = [c_char_p, c_char_p, c_ubyte]


def reverse_complement(kmer: str) -> str:
"""Reverse complement a k-mer"""
out_kmer = bytes("x" * len(kmer), "utf-8")
reverse_complement_c(bytes(kmer, "utf-8"), out_kmer, len(kmer))
return out_kmer.decode("utf-8")


# this is ugly as sin, but necessary because mypy is ok with `pointer`
# as a subscriptable type while python runtime is not
if TYPE_CHECKING:
Expand Down
9 changes: 6 additions & 3 deletions tests/data/hapA.fastq

Large diffs are not rendered by default.

10 changes: 4 additions & 6 deletions tests/data/hapA.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
CTACCGCCCTCCC
GAAGATCCTCATC
GAGGAGATTTAGA
CTACGGCGCTCAC
GTGAGCCCCCCGA
CCTGGGCACACAC
ACCTCTAAGAAGCTTTGAAAA
CTTATCATGTCTTTGTTTTCA
CCAGCTGTGTGATTGGTCCGC
AAAAATAAGACAGCATGTGAC
10 changes: 3 additions & 7 deletions tests/data/hapB.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
ATGCGTAGCAGTA
CATCGCCATCAGC
CCCCTCCCTGCCC
GCTCGACTCACAC
CACTGCGCATCTC
ATAGAGATATATA
CTCCCGCCCTCCC
AACACCAAAAAAAAAAACCTC
AACCCTGTTGTTACCCTTGTG
AAACTGTTTTAAAACTAAAGT
Binary file added tests/data/test.ccs.fastq.gz
Binary file not shown.
12 changes: 0 additions & 12 deletions tests/data/unclassified.fastq

This file was deleted.

2 changes: 1 addition & 1 deletion tests/test_classify_by_kmers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_classify_by_kmers(capsys, tmpdir):
"sys.argv",
[
"classify-by-kmers",
join(dirname(__file__), "data", "test.fastq"),
join(dirname(__file__), "data", "test.ccs.fastq.gz"),
join(dirname(__file__), "data", "hapA.txt"),
join(dirname(__file__), "data", "hapB.txt"),
"--haplotype-a-out-prefix",
Expand Down
40 changes: 36 additions & 4 deletions tests/test_kmers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import os.path

import kmers
import pytest

from trio_binning import kmers


def test_build_kmer_hash():
kmers_path = os.path.join(os.path.dirname(__file__), "data", "hapA.txt")
kmer_set = kmers.create_kmer_hash_set(kmers_path)
assert kmers.get_number_kmers_in_set(kmer_set) == 6
assert kmers.get_number_kmers_in_set(kmer_set) == 4


def test_count_kmers_in_read():
Expand All @@ -17,5 +19,35 @@ def test_count_kmers_in_read():
hap_b_set = kmers.create_kmer_hash_set(hap_b_path)

assert kmers.count_kmers_in_read(
"GAGGAGATTTAGAGTGTGAGTCGAGCATAGAGATATATA", hap_a_set, hap_b_set
) == (1, 2)
"CTTATCATGTCTTTGTTTTCAAAGCTTCTTAGAGGTTTTTTTTTTTGGTGTTAATTGGCATAAATTATGGCT",
hap_a_set,
hap_b_set,
) == (2, 1)


@pytest.mark.parametrize(
"kmer_str,kmer_int",
[
("ATGCTAGCTAGAGAGAGAGGA", 696357446508),
("TTTTTTTTTTTTTTTTTTTTTTTTTTTT", 72057594037927935),
("TTTTTTTTTTTTTTTAGGCCCACTTTTT", 72006304612220927),
("AAAAAAAAAAAAAAAAAAAAAAAAAA", 0),
("GGGAGGGAGGGAGGGAGGGAGGGAGGG", 11868309606246954),
],
)
def test_kmer_to_int(kmer_str, kmer_int):
assert kmers.kmer_to_int(kmer_str) == kmer_int


@pytest.mark.parametrize(
"kmer,revcomp_kmer",
[
("ATGCTAGCTAGAGAGAGAGGA", "TCCTCTCTCTCTAGCTAGCAT"),
("TTTTTTTTTTTTTTTTTTTTTTTTTTTT", "AAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
("TTTTTTTTTTTTTTTAGGCCCACTTTTT", "AAAAAGTGGGCCTAAAAAAAAAAAAAAA"),
("AAAAAAAAAAAAAAAAAAAAAAAAAA", "TTTTTTTTTTTTTTTTTTTTTTTTTT"),
("GGGAGGGAGGGAGGGAGGGAGGGAGGG", "CCCTCCCTCCCTCCCTCCCTCCCTCCC"),
],
)
def test_reverse_complement(kmer, revcomp_kmer):
assert kmers.reverse_complement(kmer) == revcomp_kmer

0 comments on commit 6b3cc73

Please sign in to comment.