diff --git a/alphabet/alphabet.go b/alphabet/alphabet.go index 7355a83f..c1359aa5 100644 --- a/alphabet/alphabet.go +++ b/alphabet/alphabet.go @@ -3,12 +3,14 @@ Package alphabet provides structs for defining biological sequence alphabets. */ package alphabet +// TODO: add Alphabet for codons + import "fmt" // Alphabet is a struct that holds a list of symbols and a map of symbols to their index in the list. type Alphabet struct { symbols []string - encoding map[interface{}]int + encoding map[interface{}]uint8 } // Error is an error type that is returned when a symbol is not in the alphabet. @@ -23,23 +25,47 @@ func (e *Error) Error() string { // NewAlphabet creates a new alphabet from a list of symbols. func NewAlphabet(symbols []string) *Alphabet { - encoding := make(map[interface{}]int) + encoding := make(map[interface{}]uint8) for index, symbol := range symbols { - encoding[symbol] = index - encoding[index] = index + encoding[symbol] = uint8(index) + encoding[index] = uint8(index) } return &Alphabet{symbols, encoding} } // Encode returns the index of a symbol in the alphabet. -func (alphabet *Alphabet) Encode(symbol interface{}) (int, error) { +func (alphabet *Alphabet) Encode(symbol interface{}) (uint8, error) { c, ok := alphabet.encoding[symbol] if !ok { - return 0, &Error{fmt.Sprintf("Symbol %v not in alphabet", symbol)} + return 0, fmt.Errorf("Symbol %v not in alphabet", symbol) } return c, nil } +// TODO: compress more when len(symbols) << 2^8 +// TODO: DecodeAll +func (alphabet *Alphabet) EncodeAll(seq string) ([]uint8, error) { + encoded := make([]uint8, len(seq)) + for i, r := range seq { + encoding, err := alphabet.Encode(string(r)) + if err != nil { + return nil, fmt.Errorf("Symbol %c in position %d not in alphabet", r, i) + } + encoded[i] = uint8(encoding) + } + return encoded, nil +} + +func (alphabet *Alphabet) Check(seq string) int { + for i, r := range seq { + encoding, err := alphabet.Encode(string(r)) + if err != nil { + return i + } + } + return -1 +} + // Decode returns the symbol at a given index in the alphabet. func (alphabet *Alphabet) Decode(code interface{}) (string, error) { c, ok := code.(int) diff --git a/alphabet/alphabet_test.go b/alphabet/alphabet_test.go index 81dd70ad..dc8ea04f 100644 --- a/alphabet/alphabet_test.go +++ b/alphabet/alphabet_test.go @@ -16,7 +16,7 @@ func TestAlphabet(t *testing.T) { if err != nil { t.Errorf("Unexpected error encoding symbol %s: %v", symbol, err) } - if code != i { + if int(code) != i { t.Errorf("Incorrect encoding of symbol %s: expected %d, got %d", symbol, i, code) } } @@ -48,7 +48,7 @@ func TestAlphabet(t *testing.T) { if err != nil { t.Errorf("Unexpected error encoding symbol %s: %v", symbol, err) } - if code != i { + if int(code) != i { t.Errorf("Incorrect encoding of symbol %s: expected %d, got %d", symbol, i, code) } } @@ -57,7 +57,7 @@ func TestAlphabet(t *testing.T) { if err != nil { t.Errorf("Unexpected error encoding symbol %s: %v", symbol, err) } - if code != i+len(symbols) { + if int(code) != i+len(symbols) { t.Errorf("Incorrect encoding of symbol %s: expected %d, got %d", symbol, i+len(symbols), code) } } diff --git a/alphabet/kmer_counter.go b/alphabet/kmer_counter.go new file mode 100644 index 00000000..73fefc32 --- /dev/null +++ b/alphabet/kmer_counter.go @@ -0,0 +1,175 @@ +// TODO: create benchmark for space and time costs vs https://github.com/shenwei356/bio/tree/master/sketches +// TODO: create efficient storage for children + counts in top node via multidimensional array +// TODO: integrate with IO parsers +// TODO: enable reading and writing to binary format +// TODO: iterate over observed kmers (what is the appropriate iteration method?) +// TODO: comparison between two KmerCounter's +// TODO: initialize upper levels while respecting cache locality +// TODO: add counts from other KmerCounter, to enable reducing parallel counts +// TODO: lookup neighbors +// TODO: path compression for internal nodes +// TODO: have last succ point to next node from same level for faster iteration + +package alphabet + +import ( + "fmt" +) + +type KmerCounter struct { + alphabet *Alphabet + num_symbols uint8 + max_k uint8 + total uint64 + children []Node +} + +type Node struct { + succ *Node + child *Node + encoding uint8 + count uint32 +} + +func NewKmerCounter(a *Alphabet, max_k uint8) *KmerCounter { + kc := new(KmerCounter) + kc.alphabet = a + kc.num_symbols = uint8(len(a.symbols)) + kc.max_k = max_k + + kc.children = make([]Node, kc.num_symbols) + for i := uint8(1); i < kc.num_symbols; i++ { + kc.children[i].encoding = i + kc.children[i-1].succ = &kc.children[i] + } + return kc +} + +func lookupChild(n *Node, encoding uint8) *Node { + for c := n.child; c != nil && encoding <= c.encoding; c = c.succ { + if encoding == c.encoding { + return c + } + } + return nil +} + +func insertChild(p *Node, encoding uint8, n *Node) { + if p.child == nil { + p.child = n + } else if n.encoding < p.child.encoding { + n.succ = p.child + p.child = n + } else { + for c := p.child; c.encoding < encoding; c = c.succ { + if encoding < c.succ.encoding { + n.succ = c.succ + c.succ = n + } + } + } + n.count++ // not sure why this is needed + n.encoding = encoding +} + +func Observe(kc *KmerCounter, seq string) error { + cbuf, err := kc.alphabet.EncodeAll(seq[:kc.max_k]) + if err != nil { + return err + } + + CreateChildren := func(index int, remaining int) *Node { + if remaining == 0 { + return nil + } // this condition should never happen + nodes := make([]Node, remaining) + + for j, n := range nodes { + n.count = 1 + if j != 0 { + nodes[j-1].child = &n + n.encoding = cbuf[(index+j)%int(kc.max_k)] + } + } + + return &nodes[0] + } + + UpdateBuffer := func(index int) (max_k int, err error) { + max_k = int(kc.max_k) + lookahead := index + max_k - 1 + var encoding uint8 + if lookahead < len(seq) { + next := string(seq[lookahead]) + encoding, err = kc.alphabet.Encode(next) + if err != nil { + err = fmt.Errorf("in position %d: %w", index, err) + return + } + cbuf[lookahead%int(kc.max_k)] = encoding + } else { + max_k = len(seq) - index + } + return + } + + var encoding uint8 + for i := range seq { + max_k, err := UpdateBuffer(i) + if err != nil { + return err + } + + p := &kc.children[cbuf[i%int(kc.max_k)]] + kc.total++ + for k := 1; k <= max_k; k++ { + p.count++ + // fmt.Printf("%d %d %v\n", i, k, p) + + if k != max_k { + encoding = cbuf[(i+k)%int(kc.max_k)] + c := lookupChild(p, encoding) + if c == nil { + insertChild(p, encoding, CreateChildren(i+k, max_k-k)) + break // inserted nodes already have count = 1 added + } + p = c + } + } + } + return nil +} + +func LookupCount(kc *KmerCounter, kmer string) (count uint32, err error) { + if len(kmer) > int(kc.max_k) { + err = fmt.Errorf("kmer_counter: attempted to lookup count of %d-mer which exceeds max supported length %d", len(kmer), kc.max_k) + return + } + if len(kmer) == 0 { + err = fmt.Errorf("kmer_counter: attempted to lookup count of 0mer") + return + } + encoded, err := kc.alphabet.EncodeAll(kmer) + if err != nil { + return + } + + var k int + var p *Node + for k, p = 1, &kc.children[encoded[0]]; k < len(kmer) && p != nil; k++ { + p = lookupChild(p, encoded[k]) + } + if p == nil { + return + } + count = p.count + return +} + +func LookupFrequency(kc *KmerCounter, kmer string) (float64, error) { + count, err := LookupCount(kc, kmer) + if err != nil { + return 0, err + } + return float64(count) / float64(kc.total-uint64(len(kmer)-1)), nil +} diff --git a/alphabet/kmer_counter_test.go b/alphabet/kmer_counter_test.go new file mode 100644 index 00000000..a3788196 --- /dev/null +++ b/alphabet/kmer_counter_test.go @@ -0,0 +1,67 @@ +package alphabet + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/TimothyStiles/poly/transform/variants" +) + +func Assertf(t *testing.T, cond bool, format string, args ...any) { + if !cond { + t.Errorf(format, args...) + } +} + +func TestKmerCounterEmpty(t *testing.T) { + kc := NewKmerCounter(DNA, 3) + + count, err := LookupCount(kc, "A") + assert.EqualValues(t, 0, count) + assert.NoError(t, err) + count, err = LookupCount(kc, "X") + assert.Error(t, err) +} + +func TestKmerCounterRepeated(t *testing.T) { + kc := NewKmerCounter(DNA, 3) + seq := strings.Repeat("A", 12) + assert.Equal(t, 12, len(seq)) + err := Observe(kc, seq) + assert.NoError(t, err) + + assert.Equal(t, 12, int(kc.total)) + + onemers, _ := variants.AllVariantsIUPAC("N") + for _, kmer := range onemers { + count, err := LookupCount(kc, kmer) + assert.NoError(t, err) + if kmer == "A" { + assert.EqualValues(t, 12, count, "Wrong count") + } else { + assert.EqualValues(t, 0, count, "Reports nonzero count for 1mer not present in sequence %v", kmer) + } + } + twomers, _ := variants.AllVariantsIUPAC("NN") + for _, kmer := range twomers { + count, err := LookupCount(kc, kmer) + assert.NoError(t, err) + if kmer == "AA" { + assert.EqualValues(t, 11, count, "Wrong count") + } else { + assert.EqualValues(t, 0, count, "Reports nonzero count for 2mer not present in sequence %v", kmer) + } + } + threemers, _ := variants.AllVariantsIUPAC("NNN") + for _, kmer := range threemers { + count, err := LookupCount(kc, kmer) + assert.NoError(t, err) + if kmer == "AAA" { + assert.EqualValues(t, 10, count, "Wrong count") + } else { + assert.EqualValues(t, 0, count, "Reports nonzero count for 3mer not present in sequence %v", kmer) + } + } +} diff --git a/io/genbank/genbank.go b/io/genbank/genbank.go index 34062b36..0f353750 100644 --- a/io/genbank/genbank.go +++ b/io/genbank/genbank.go @@ -811,8 +811,8 @@ func getSourceOrganism(metadataData []string) (string, string, []string) { func parseLocation(locationString string) (Location, error) { var location Location location.GbkLocationString = locationString - if !(strings.ContainsAny(locationString, "(")) { // Case checks for simple expression of x..x - if !(strings.ContainsAny(locationString, ".")) { //Case checks for simple expression x + if !strings.ContainsAny(locationString, "(") { // Case checks for simple expression of x..x + if !strings.ContainsAny(locationString, ".") { //Case checks for simple expression x position, err := strconv.Atoi(locationString) if err != nil { return Location{}, err @@ -841,26 +841,34 @@ func parseLocation(locationString string) (Location, error) { if strings.ContainsAny(expression, "(") { firstInnerParentheses := strings.Index(expression, "(") ParenthesesCount := 1 - comma := 0 - for i := 1; ParenthesesCount > 0; i++ { // "(" is at 0, so we start at 1 - comma = i - switch expression[firstInnerParentheses+i] { - case []byte("(")[0]: + prevSubLocationStart := 0 + for i := firstInnerParentheses+1; i < len(expression); i++ { // "(" is at 0, so we start at 1 + switch expression[i] { + case '(': ParenthesesCount++ - case []byte(")")[0]: + case ')': ParenthesesCount-- + case ',': + if ParenthesesCount == 0 { + parsedSubLocation, err := parseLocation(expression[prevSubLocationStart:i]) + if err != nil { + return Location{}, err + } + parsedSubLocation.GbkLocationString = locationString + location.SubLocations = append(location.SubLocations, parsedSubLocation) + prevSubLocationStart = i+1 + } } } - parseLeftLocation, err := parseLocation(expression[:firstInnerParentheses+comma+1]) - if err != nil { - return Location{}, err + if ParenthesesCount != 0 { + return Location{}, fmt.Errorf("Unbalanced parentheses") } - parseRightLocation, err := parseLocation(expression[2+firstInnerParentheses+comma:]) + parsedSubLocation, err := parseLocation(expression[prevSubLocationStart:]) if err != nil { return Location{}, err } - - location.SubLocations = append(location.SubLocations, parseLeftLocation, parseRightLocation) + parsedSubLocation.GbkLocationString = locationString + location.SubLocations = append(location.SubLocations, parsedSubLocation) } else { // This is the default join(x..x,x..x) for _, numberRange := range strings.Split(expression, ",") { joinLocation, err := parseLocation(numberRange) @@ -899,6 +907,7 @@ func parseLocation(locationString string) (Location, error) { return location, nil } + // buildMetaString is a helper function to build the meta section of genbank files. func buildMetaString(name string, data string) string { keyWhitespaceTrailLength := 12 - len(name) // I wish I was kidding. diff --git a/io/genbank/genbank_test.go b/io/genbank/genbank_test.go index 17090386..c4e0fb77 100644 --- a/io/genbank/genbank_test.go +++ b/io/genbank/genbank_test.go @@ -2,6 +2,7 @@ package genbank import ( "errors" + "fmt" "io" "os" "path/filepath" @@ -115,12 +116,11 @@ func TestGbkLocationStringBuilder(t *testing.T) { func TestGbLocationStringBuilder(t *testing.T) { tmpDataDir, err := os.MkdirTemp("", "data-*") - if err != nil { - t.Error(err) - } + assert.NoError(t, err) defer os.RemoveAll(tmpDataDir) - scrubbedGb, _ := Read("../../data/t4_intron.gb") + scrubbedGb, err := Read("../../data/t4_intron.gb") + assert.NoError(t, err) // removing gbkLocationString from features to allow testing for gbkLocationBuilder for featureIndex := range scrubbedGb.Features { @@ -128,10 +128,13 @@ func TestGbLocationStringBuilder(t *testing.T) { } tmpGbFilePath := filepath.Join(tmpDataDir, "t4_intron_test.gb") - _ = Write(scrubbedGb, tmpGbFilePath) + err = Write(scrubbedGb, tmpGbFilePath) + assert.NoError(t, err) - testInputGb, _ := Read("../../data/t4_intron.gb") - testOutputGb, _ := Read(tmpGbFilePath) + testInputGb, err := Read("../../data/t4_intron.gb") + assert.NoError(t, err) + testOutputGb, err := Read(tmpGbFilePath) + assert.NoError(t, err) if diff := cmp.Diff(testInputGb, testOutputGb, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { t.Errorf("Issue with either Join or complement location building. Parsing the output of Build() does not produce the same output as parsing the original file read with Read(). Got this diff:\n%s", diff) @@ -160,6 +163,15 @@ func TestPartialLocationParseRegression(t *testing.T) { } } +func TestSubLocationStringParseRegression(t *testing.T) { + location := "join(complement(5306942..5307394),complement(5304401..5305029),complement(5303328..5303393),complement(5301928..5302004))" + parsedLocation, err := parseLocation(location) + if err != nil { + t.Errorf("Failed to parse location string. Got err: %s", err) + } + fmt.Println(parsedLocation) +} + func TestSnapgeneGenbankRegression(t *testing.T) { snapgene, err := Read("../../data/puc19_snapgene.gb") diff --git a/random/random.go b/random/random.go index 7b933a9b..fa929974 100644 --- a/random/random.go +++ b/random/random.go @@ -6,8 +6,13 @@ package random import ( "errors" "math/rand" + + "github.com/TimothyStiles/poly/transform/variants" ) +var iupacToBases = variants.IUPAC2Bases() +var iupacToAAs = variants.IUPAC2AAs() + // ProteinSequence returns a random protein sequence string of a given length and seed. // All returned sequences start M (Methionine) and end with * (stop codon). func ProteinSequence(length int, seed int64) (string, error) { @@ -39,11 +44,44 @@ func ProteinSequence(length int, seed int64) (string, error) { return string(randomSequence), nil } +func RandomRune(runes []rune) rune { + randomIndex := rand.Intn(len(runes)) + return runes[randomIndex] +} + +func ProteinSequenceFromPattern(pattern string, seed int64) string { + randomSequence := make([]rune, len(pattern)) + rand.Seed(seed) + + for res, peptide := range pattern { + if options, found := iupacToAAs[peptide]; found { + randomSequence[res] = RandomRune(options) + } else { + randomSequence[res] = peptide + } + } + return string(randomSequence) +} + // DNASequence returns a random DNA sequence string of a given length and seed. func DNASequence(length int, seed int64) (string, error) { return randomNucelotideSequence(length, seed, []rune("ACTG")), nil } +func DNASequenceFromPattern(pattern string, seed int64) string { + randomSequence := make([]rune, len(pattern)) + rand.Seed(seed) + + for i, base := range pattern { + if options, found := iupacToBases[base]; found { + randomSequence[i] = RandomRune(options) + } else { + randomSequence[i] = base + } + } + return string(randomSequence) +} + // RNASequence returns a random DNA sequence string of a given length and seed. func RNASequence(length int, seed int64) (string, error) { return randomNucelotideSequence(length, seed, []rune("ACUG")), nil diff --git a/synthesis/fix/patterns.go b/synthesis/fix/patterns.go new file mode 100644 index 00000000..b13494d2 --- /dev/null +++ b/synthesis/fix/patterns.go @@ -0,0 +1,72 @@ +package fix + +import ( + "fmt" + "log" + "regexp" + "strings" + + "github.com/TimothyStiles/poly/transform/variants" +) + +var translator = buildPatternTranslator(variants.IUPAC2Bases()) + +func buildPatternTranslator(ambiguityCodes map[rune][]rune) map[rune]string { + trans := make(map[rune]string) + for r, options := range ambiguityCodes { + if len(options) > 1 { + trans[r] = "[" + string(options) + "]" + } + } + trans['('] = "(?:" + + return trans +} + +// TODO support (n/m) notation for point of cleavage for non-palindromic enzymes +func patternToRegexp(buf *strings.Builder, pattern string, translator map[rune]string) { + buf.WriteRune('(') + for _, r := range pattern { + if match, found := translator[r]; found { + buf.WriteString(match) + } else { + buf.WriteRune(r) + } + } + buf.WriteRune(')') +} + +func PatternsToRegexp(patterns []string, doubleStranded bool) (*regexp.Regexp, error) { + var buf strings.Builder + if len(patterns) == 0 { + return nil, nil + } + + // expand to reduce reallocations + estSize := -1 + for _, pat := range patterns { + estSize += len(pat) + 3 + } + buf.Grow(estSize) + + for i, pat := range patterns { + if i != 0 { + buf.WriteRune('|') + } + patternToRegexp(&buf, pat, translator) + if doubleStranded { + reverseComplementPat := transform.ReverseComplement(pat) + if reverseComplementPat != pat { + buf.WriteRune('|') + patternToRegexp(&buf, reverseComplementPat, translator) + } + } + } + // TODO: only print during testing + log.Println(buf.String()) + return regexp.Compile(buf.String()) +} + +func HomopolymericRuns(length int) string { + return fmt.Sprintf("N{%d}", length) +} diff --git a/synthesis/fix/patterns_test.go b/synthesis/fix/patterns_test.go new file mode 100644 index 00000000..3d0e3e82 --- /dev/null +++ b/synthesis/fix/patterns_test.go @@ -0,0 +1,66 @@ +package fix + +import ( + "testing" + + "github.com/TimothyStiles/poly/random" + "github.com/TimothyStiles/poly/transform/variants" +) + +func Assertf(t *testing.T, cond bool, format string, args ...any) { + if !cond { + t.Errorf(format, args...) + } +} + +func TestOverlapping(t *testing.T) { + re, err := patternsToRegexp([]string{"AA"}) + Assertf(t, err == nil, "Encountered error building regexp") + matches := re.FindAllStringSubmatchIndex("AAAA", -1) + Assertf(t, len(matches) != 3, "Expected 3 matches") +} + +func TestAmbiguous(t *testing.T) { + re, err := patternsToRegexp([]string{"N"}) + Assertf(t, err == nil, "Encountered error building regexp") + matches := re.FindAllStringSubmatchIndex("AGCT", -1) + Assertf(t, len(matches) == 4, "Expected 4 matches, got %d", len(matches)) +} + +func TestMultiple(t *testing.T) { + re, err := patternsToRegexp([]string{"A", "C"}) + Assertf(t, err == nil, "Encountered error building regexp") + matches := re.FindAllStringSubmatchIndex("AGCT", -1) + Assertf(t, len(matches) == 2, "Expected 2 matches, got %d", len(matches)) +} + +func keys[K comparable, V any](m map[K]V) []K { + array := make([]K, len(m)) + var i = 0 + for k := range m { + array[i] = k + i += 1 + } + return array +} + +func TestAcceptsRandomGen(t *testing.T) { + allBases := string(keys[rune, []rune](variants.IUPAC2Bases())) + re, err := patternsToRegexp([]string{allBases}) + Assertf(t, err == nil, "Encountered error building regexp") + sample := random.DNASequenceFromPattern(allBases, 0) + Assertf(t, len(re.FindAllStringSubmatchIndex(sample, -1)) == 1, "Expected sample DNA sequence %s to match pattern %s", sample, allBases) +} + +func TestCheckHomopolymericRuns(t *testing.T) { + testSeq := "GTAAAACGACGGCCAGT" // M13 fwd + if run := checkHomopolymericRuns(testSeq); run == false { + t.Errorf("checkHomopolymericRuns has changed on test. Got false instead of true") + } + + testSeq = "ACGATGGCAGTAGCATGC" //"GTAAAACGACGGCCAGT" // M13 fwd + + if run := checkHomopolymericRuns(testSeq); run == true { + t.Errorf("checkHomopolymericRuns has changed on test. Got true instead of false") + } +} diff --git a/synthesis/fix/synthesis.go b/synthesis/fix/synthesis.go index 4aa50767..0799b7eb 100644 --- a/synthesis/fix/synthesis.go +++ b/synthesis/fix/synthesis.go @@ -54,9 +54,11 @@ type Change struct { Reason string `db:"reason"` } +type ProblematicSequenceFunc func(string, chan DnaSuggestion, *sync.WaitGroup) + // RemoveSequence is a generator for a problematicSequenceFuncs for specific // sequences. -func RemoveSequence(sequencesToRemove []string, reason string) func(string, chan DnaSuggestion, *sync.WaitGroup) { +func RemoveSequence(sequencesToRemove []string, reason string) ProblematicSequenceFunc { return func(sequence string, c chan DnaSuggestion, waitgroup *sync.WaitGroup) { var sequencesToRemoveForReverse []string for _, seq := range sequencesToRemove { @@ -82,8 +84,27 @@ func RemoveSequence(sequencesToRemove []string, reason string) func(string, chan } } +// TODO: support custom reasons per pattern +// TODO: benchmark against RemoveSequence +func RemovePatterns(patternsToRemove []string, reason string) (ProblematicSequenceFunc, err) { + re, err := PatternsToRegex(patternsToRemove) + + return func(sequence string, c chan DnaSuggestion, waitgroup *sync.WaitGroup) { + for _, site := range sequencesToRemoveForReverse { + locations := re.FindAllStringSubmatchIndex(sequence, -1) + for _, location := range locations { + codonLength := 3 + position := location[0] / codonLength + c <- DnaSuggestion{position, (location[1] / codonLength) - 1, "NA", 1, reason} + } + } + waitgroup.Done() + } +} + // RemoveRepeat is a generator to make a problematicSequenceFunc for repeats. -func RemoveRepeat(repeatLen int) func(string, chan DnaSuggestion, *sync.WaitGroup) { +// TODO: replace with more efficient kmer counter +func RemoveRepeat(repeatLen int) ProblematicSequenceFunc { return func(sequence string, c chan DnaSuggestion, waitgroup *sync.WaitGroup) { // Get a kmer list kmers := make(map[string]bool) @@ -114,7 +135,7 @@ func RemoveRepeat(repeatLen int) func(string, chan DnaSuggestion, *sync.WaitGrou // base pairs in comparison to adenine and thymine base pairs. Usually, you // want the range to be somewhere around 50%, with a decent upperBound being // 80% GC and a decent lowerBound being 20%. -func GcContentFixer(upperBound, lowerBound float64) func(string, chan DnaSuggestion, *sync.WaitGroup) { +func GcContentFixer(upperBound, lowerBound float64) ProblematicSequenceFunc { return func(sequence string, c chan DnaSuggestion, waitgroup *sync.WaitGroup) { gcContent := checks.GcContent(sequence) var numberOfChanges int @@ -149,7 +170,7 @@ func getSuggestions(suggestions chan DnaSuggestion, suggestionOutputs chan []Dna // findProblems is a helper function in FixCDS that concurrently runs each // sequence check and returns a list of all the suggested changes. -func findProblems(sequence string, problematicSequenceFuncs []func(string, chan DnaSuggestion, *sync.WaitGroup)) []DnaSuggestion { +func findProblems(sequence string, problematicSequenceFuncs []ProblematicSequenceFunc) []DnaSuggestion { // Run functions to get suggestions suggestions := make(chan DnaSuggestion) suggestionOutputs := make(chan []DnaSuggestion) @@ -216,7 +237,7 @@ changes that were done to the sequence. // Cds fixes a CDS given the CDS sequence, a codon table, a list of // functions to solve for, and a number of iterations to attempt fixing. // Unless you are an advanced user, you should use CdsSimple. -func Cds(sequence string, codontable codon.Table, problematicSequenceFuncs []func(string, chan DnaSuggestion, *sync.WaitGroup)) (string, []Change, error) { +func Cds(sequence string, codontable codon.Table, problematicSequenceFuncs []ProblematicSequenceFunc) (string, []Change, error) { codonLength := 3 if len(sequence)%codonLength != 0 { return "", []Change{}, errors.New("this sequence isn't a complete CDS, please try to use a CDS without interrupted codons") diff --git a/transform/variants/variants.go b/transform/variants/variants.go index 8ea276a4..254cddec 100644 --- a/transform/variants/variants.go +++ b/transform/variants/variants.go @@ -13,32 +13,49 @@ import ( "strings" ) +var iupacToBases = map[rune][]rune{ // rune map of all iupac nucleotide variants + 'G': {'G'}, + 'A': {'A'}, + 'T': {'T'}, + 'C': {'C'}, + 'R': {'G', 'A'}, + 'Y': {'T', 'C'}, + 'M': {'A', 'C'}, + 'K': {'G', 'T'}, + 'S': {'G', 'C'}, + 'W': {'A', 'T'}, + 'H': {'A', 'C', 'T'}, + 'B': {'G', 'T', 'C'}, + 'V': {'G', 'C', 'A'}, + 'D': {'G', 'A', 'T'}, + 'N': {'G', 'A', 'T', 'C'}, +} + +var iupacToAAs = map[rune][]rune{ + 'X': []rune("ACDEFGHIJLMNPQRSTVWY"), // Unknown + 'B': {'D', 'N'}, + 'Z': {'E', 'Q'}, + 'J': {'I', 'L'}, + '+': {'K', 'R', 'H'}, // Positively charged + '-': {'D', 'E'}, // Negatively charged +} + +func IUPAC2Bases() map[rune][]rune { + return iupacToBases +} + +func IUPAC2AAs() map[rune][]rune { + return iupacToAAs +} + // AllVariantsIUPAC takes a string as input // and returns all iupac variants as output func AllVariantsIUPAC(seq string) ([]string, error) { seqVariantList := [][]rune{} seqVariants := []string{} - iupac := map[rune][]rune{ // rune map of all iupac nucleotide variants - 'G': {'G'}, - 'A': {'A'}, - 'T': {'T'}, - 'C': {'C'}, - 'R': {'G', 'A'}, - 'Y': {'T', 'C'}, - 'M': {'A', 'C'}, - 'K': {'G', 'T'}, - 'S': {'G', 'C'}, - 'W': {'A', 'T'}, - 'H': {'A', 'C', 'T'}, - 'B': {'G', 'T', 'C'}, - 'V': {'G', 'C', 'A'}, - 'D': {'G', 'A', 'T'}, - 'N': {'G', 'A', 'T', 'C'}, - } - for _, s := range strings.ToUpper(seq) { - variantsIUPAC, ok := iupac[s] + variantsIUPAC, ok := iupacToBases[s] if ok { seqVariantList = append(seqVariantList, variantsIUPAC) } else {