// GuessAlphabet guesses alphabet by given func GuessAlphabet(seqs []byte) *Alphabet { if len(seqs) == 0 { return Unlimit } var alphabetMap map[byte]bool if AlphabetGuessSeqLenghtThreshold == 0 || len(seqs) <= AlphabetGuessSeqLenghtThreshold { alphabetMap = slice2map(byteutil.Alphabet(seqs)) } else { // reduce guessing time alphabetMap = slice2map(byteutil.Alphabet(seqs[0:AlphabetGuessSeqLenghtThreshold])) } if isSubset(alphabetMap, abDNA) { return DNA } if isSubset(alphabetMap, abRNA) { return RNA } if isSubset(alphabetMap, abDNAredundant) { return DNAredundant } if isSubset(alphabetMap, abRNAredundant) { return RNAredundant } if isSubset(alphabetMap, abProtein) { return Protein } return Unlimit }
func init() { DNA, _ = NewAlphabet( "DNA", false, []byte("acgtACGT"), []byte("tgcaTGCA"), []byte(" -"), []byte("nN.")) DNAredundant, _ = NewAlphabet( "DNAredundant", false, []byte("acgtryswkmbdhvACGTRYSWKMBDHV"), []byte("tgcayrswmkvhdbTGCAYRSWMKVHDB"), []byte(" -"), []byte("nN.")) RNA, _ = NewAlphabet( "RNA", false, []byte("acguACGU"), []byte("ugcaUGCA"), []byte(" -"), []byte("nN")) RNAredundant, _ = NewAlphabet( "RNAredundant", false, []byte("acguryswkmbdhvACGURYSWKMBDHV"), []byte("ugcayrswmkvhdbUGCAYRSWMKVHDB"), []byte(" -"), []byte("nN")) Protein, _ = NewAlphabet( "Protein", false, []byte("abcdefghijklmnpqrstvwyzABCDEFGHIJKLMNPQRSTVWYZ"), []byte("abcdefghijklmnpqrstvwyzABCDEFGHIJKLMNPQRSTVWYZ"), []byte(" -"), []byte("xX*_.")) Unlimit, _ = NewAlphabet( "Unlimit", true, nil, nil, nil, nil) abProtein = slice2map(byteutil.Alphabet(Protein.AllLetters())) abDNAredundant = slice2map(byteutil.Alphabet(DNAredundant.AllLetters())) abDNA = slice2map(byteutil.Alphabet(DNA.AllLetters())) abRNAredundant = slice2map(byteutil.Alphabet(RNAredundant.AllLetters())) abRNA = slice2map(byteutil.Alphabet(RNA.AllLetters())) }
// Locate locates the pattern func (fmi *FMIndex) Locate(query []byte, mismatches int) ([]int, error) { locations := []int{} letters := byteutil.Alphabet(query) for _, letter := range letters { if _, ok := fmi.CountOfLetters[letter]; !ok { return locations, nil } } if fmi.SuffixArray == nil { return nil, errors.New("SuffixArray is nil, you should call TransformForLocate instead of Transform") } n := len(fmi.BWT) var matches stack.Stack type Match struct { query []byte start, end int mismatches int } matches.Put(Match{query, 0, n - 1, mismatches}) for !matches.Empty() { match := matches.Pop().(Match) query = match.query[0 : len(query)-1] last := match.query[len(query)-1] var letters []byte if mismatches == 0 { letters = []byte{last} } else { letters = fmi.Alphabet } for _, c := range letters { start := fmi.C[c] + fmi.Occ[c][match.start-2] + 1 end := fmi.C[c] + fmi.Occ[c][match.end-1] if start <= end { if len(query) == 0 { for _, i := range fmi.SuffixArray[start : end+1] { locations = append(locations, i) } } else { mm := match.mismatches if c != last { if match.mismatches-1 > 0 { mm = match.mismatches - 1 } else { mm = 0 } } matches.Put(Match{query, start, end, mm}) } } } } sort.Ints(locations) return locations, nil }
// ComputeC computes the C table.
//
// For every byte c that starts at least one row of m, C[c] is the index of
// the first such row, i.e. the number of rows that precede it. When m holds
// the lexicographically sorted rotations of a text, this equals the number
// of characters in the text that are lexically smaller than c.
//
// alphabet is used only as a capacity hint for the returned map and may be
// nil. Empty rows are skipped, and an empty m yields an empty table.
func ComputeC(m [][]byte, alphabet []byte) map[byte]int {
	table := make(map[byte]int, len(alphabet))
	for i, row := range m {
		if len(row) == 0 {
			continue
		}
		// Only the first row starting with a given byte defines its entry.
		if _, seen := table[row[0]]; !seen {
			table[row[0]] = i
		}
	}
	return table
}
// ComputeOccurrence returns occurrence information for a BWT string.
//
// For every letter c in the result, occ[c][k] is the number of occurrences
// of c in bwt[0..k] (inclusive, 0-based), so each row has len(bwt) entries.
//
// When letters is nil, the result has one row per distinct byte of bwt.
// Otherwise it has one row per byte of letters, plus a row for bwt[0] even
// if letters omits it (kept for compatibility with earlier behaviour);
// other bytes of bwt that are missing from letters get no row.
//
// An empty bwt yields empty rows instead of panicking.
func ComputeOccurrence(bwt []byte, letters []byte) map[byte][]int {
	n := len(bwt)
	occ := make(map[byte][]int, len(letters))

	if letters == nil {
		// Discover the alphabet directly from the BWT.
		for _, b := range bwt {
			if _, ok := occ[b]; !ok {
				occ[b] = make([]int, n)
			}
		}
	} else {
		for _, letter := range letters {
			if _, ok := occ[letter]; !ok {
				occ[letter] = make([]int, n)
			}
		}
		if n > 0 {
			if _, ok := occ[bwt[0]]; !ok {
				occ[bwt[0]] = make([]int, n)
			}
		}
	}

	// Fill each preallocated row with the running count of its letter.
	for letter, row := range occ {
		count := 0
		for i, b := range bwt {
			if b == letter {
				count++
			}
			row[i] = count
		}
	}
	return occ
}