Пример #1
0
// GuessAlphabet guesses which predefined alphabet the given sequence bytes
// belong to.
//
// An empty input yields Unlimit. Otherwise the letter set of seqs is derived
// with byteutil.Alphabet; when AlphabetGuessSeqLenghtThreshold is non-zero and
// seqs is longer than it, only the first AlphabetGuessSeqLenghtThreshold bytes
// are inspected. The letter set is then tested with isSubset against the known
// alphabets in the order DNA, RNA, DNAredundant, RNAredundant, Protein, and
// the first one that matches is returned. Unlimit is returned when none does.
func GuessAlphabet(seqs []byte) *Alphabet {
	if len(seqs) == 0 {
		return Unlimit
	}

	// Cap the inspected prefix to reduce guessing time on long inputs.
	sample := seqs
	if AlphabetGuessSeqLenghtThreshold != 0 && len(seqs) > AlphabetGuessSeqLenghtThreshold {
		sample = seqs[0:AlphabetGuessSeqLenghtThreshold]
	}
	observed := slice2map(byteutil.Alphabet(sample))

	// Order matters: the candidates are tried from first to last and the
	// first match wins.
	switch {
	case isSubset(observed, abDNA):
		return DNA
	case isSubset(observed, abRNA):
		return RNA
	case isSubset(observed, abDNAredundant):
		return DNAredundant
	case isSubset(observed, abRNAredundant):
		return RNAredundant
	case isSubset(observed, abProtein):
		return Protein
	default:
		return Unlimit
	}
}
Пример #2
0
// init builds the package's predefined alphabets (DNA, DNAredundant, RNA,
// RNAredundant, Protein and Unlimit) and then the letter-set maps (abDNA,
// abDNAredundant, abRNA, abRNAredundant, abProtein) derived from each
// alphabet's AllLetters().
//
// NOTE(review): the NewAlphabet arguments appear to be, in order: the name,
// an "unlimited" flag, the letters, their position-wise counterparts
// (complements for the nucleotide alphabets, identity for Protein), the gap
// bytes and the ambiguous bytes — confirm against NewAlphabet's signature.
//
// NOTE(review): every error returned by NewAlphabet is discarded here;
// presumably the hard-coded definitions are always valid — confirm.
func init() {
	// Plain DNA: a/c/g/t in both cases, paired with t/g/c/a.
	DNA, _ = NewAlphabet(
		"DNA",
		false,
		[]byte("acgtACGT"),
		[]byte("tgcaTGCA"),
		[]byte(" -"),
		[]byte("nN."))

	// DNA extended with the redundancy codes r, y, s, w, k, m, b, d, h, v.
	DNAredundant, _ = NewAlphabet(
		"DNAredundant",
		false,
		[]byte("acgtryswkmbdhvACGTRYSWKMBDHV"),
		[]byte("tgcayrswmkvhdbTGCAYRSWMKVHDB"),
		[]byte(" -"),
		[]byte("nN."))

	// Plain RNA: u replaces t. Unlike the DNA alphabets, '.' is not in the
	// last argument here.
	RNA, _ = NewAlphabet(
		"RNA",
		false,
		[]byte("acguACGU"),
		[]byte("ugcaUGCA"),
		[]byte(" -"),
		[]byte("nN"))

	// RNA extended with the same redundancy codes as DNAredundant.
	RNAredundant, _ = NewAlphabet(
		"RNAredundant",
		false,
		[]byte("acguryswkmbdhvACGURYSWKMBDHV"),
		[]byte("ugcayrswmkvhdbUGCAYRSWMKVHDB"),
		[]byte(" -"),
		[]byte("nN"))

	// Protein: the letter list omits o, u and x; x/X is passed in the last
	// argument instead. Letters and their counterparts are identical.
	Protein, _ = NewAlphabet(
		"Protein",
		false,
		[]byte("abcdefghijklmnpqrstvwyzABCDEFGHIJKLMNPQRSTVWYZ"),
		[]byte("abcdefghijklmnpqrstvwyzABCDEFGHIJKLMNPQRSTVWYZ"),
		[]byte(" -"),
		[]byte("xX*_."))

	// Unlimit: flag set to true and no letter sets at all.
	Unlimit, _ = NewAlphabet(
		"Unlimit",
		true,
		nil,
		nil,
		nil,
		nil)

	// Letter-set maps built from each alphabet's AllLetters().
	abProtein = slice2map(byteutil.Alphabet(Protein.AllLetters()))
	abDNAredundant = slice2map(byteutil.Alphabet(DNAredundant.AllLetters()))
	abDNA = slice2map(byteutil.Alphabet(DNA.AllLetters()))
	abRNAredundant = slice2map(byteutil.Alphabet(RNAredundant.AllLetters()))
	abRNA = slice2map(byteutil.Alphabet(RNA.AllLetters()))
}
Пример #3
0
// Locate returns the sorted suffix-array positions at which query occurs in
// the indexed text, allowing up to mismatches substituted letters.
//
// It returns an empty, non-nil slice when query contains a letter that is not
// in fmi.CountOfLetters or when query is empty. It returns an error when
// fmi.SuffixArray is nil, because positions cannot be reported without it.
//
// The search is a backward search over the BWT: the pattern is consumed from
// its last byte to its first, and each partial match is kept as an inclusive,
// 0-based row range [start, end] on an explicit stack. When a branch still has
// mismatch budget, every letter of fmi.Alphabet is tried at the current
// position and the budget is decremented for letters that differ from the
// pattern byte; otherwise only the pattern byte itself is tried.
//
// The range arithmetic assumes that fmi.C[c] is the number of text letters
// lexically smaller than c and that fmi.Occ[c][k] is the number of
// occurrences of c in fmi.BWT[0..k] (inclusive, 0-based).
func (fmi *FMIndex) Locate(query []byte, mismatches int) ([]int, error) {
	locations := []int{}
	letters := byteutil.Alphabet(query)
	for _, letter := range letters {
		// A letter absent from the text can never match.
		if _, ok := fmi.CountOfLetters[letter]; !ok {
			return locations, nil
		}
	}

	if fmi.SuffixArray == nil {
		return nil, errors.New("SuffixArray is nil, you should call TransformForLocate instead of Transform")
	}
	if len(query) == 0 {
		// Nothing to search for; also keeps the slicing below in range.
		return locations, nil
	}

	n := len(fmi.BWT)
	var matches stack.Stack
	type Match struct {
		query      []byte // part of the pattern still to be matched
		start, end int    // inclusive 0-based row range matched so far
		mismatches int    // remaining mismatch budget on this branch
	}
	matches.Put(Match{query, 0, n - 1, mismatches})
	for !matches.Empty() {
		match := matches.Pop().(Match)
		// Split off the last byte of *this* partial match's pattern.
		rest := match.query[:len(match.query)-1]
		last := match.query[len(match.query)-1]

		// Only branch over the whole alphabet while this branch still has
		// mismatch budget left.
		var letters []byte
		if match.mismatches <= 0 {
			letters = []byte{last}
		} else {
			letters = fmi.Alphabet
		}

		for _, c := range letters {
			// Occurrences of c before row match.start; zero for the first row.
			start := fmi.C[c]
			if match.start > 0 {
				start += fmi.Occ[c][match.start-1]
			}
			end := fmi.C[c] + fmi.Occ[c][match.end] - 1
			if start > end {
				continue // no suffix is preceded by c in this range
			}

			if len(rest) == 0 {
				// Whole pattern consumed: every row in the range is a hit.
				locations = append(locations, fmi.SuffixArray[start:end+1]...)
				continue
			}

			mm := match.mismatches
			if c != last {
				mm-- // mm > 0 here, otherwise only last would have been tried
			}
			matches.Put(Match{rest, start, end, mm})
		}
	}
	sort.Ints(locations)
	return locations, nil
}
Пример #4
0
// ComputeC computes C.
// C[c] is a table that, for each character c in the alphabet,
// contains the number of occurrences of lexically smaller characters
// in the text.
//
// m is scanned row by row and, for each distinct first byte, the index of the
// first row starting with it is recorded. That index equals the count
// described above only when the rows of m are in lexicographic order (e.g.
// the sorted rotations of the text). Every row must be non-empty.
//
// alphabet is used only as a size hint for the returned map. When it is nil,
// the hint is derived from the first row of m. An empty m yields an empty
// map.
func ComputeC(m [][]byte, alphabet []byte) map[byte]int {
	if alphabet == nil && len(m) > 0 {
		alphabet = byteutil.Alphabet(m[0])
	}
	C := make(map[byte]int, len(alphabet))
	for i, r := range m {
		// Keep only the first row index seen for each leading byte.
		if _, seen := C[r[0]]; !seen {
			C[r[0]] = i
		}
	}
	return C
}
Пример #5
0
// ComputeOccurrence returns occurrence information.
// Occ(c, k) is the number of occurrences of character c in the prefix L[1..k]
//
// In the returned map, occ[c][i] (0-based) is the number of occurrences of c
// in bwt[0..i] inclusive, so every slice has len(bwt) entries.
//
// letters selects the characters to track; when it is nil it is derived from
// bwt with byteutil.Alphabet. The first byte of bwt is always tracked, even
// if it is not listed in letters. Other bytes of bwt that are not in letters
// are ignored. An empty bwt yields an empty slice for each requested letter.
func ComputeOccurrence(bwt []byte, letters []byte) map[byte][]int {
	if letters == nil {
		letters = byteutil.Alphabet(bwt)
	}
	n := len(bwt)
	occ := make(map[byte][]int, len(letters))
	for _, letter := range letters {
		occ[letter] = make([]int, n)
	}
	if n == 0 {
		return occ
	}
	if _, ok := occ[bwt[0]]; !ok {
		occ[bwt[0]] = make([]int, n)
	}

	// Fill the pre-sized rows in place: each entry carries the previous
	// running count forward and adds one for the row matching the current
	// byte.
	for i, b := range bwt {
		for k, row := range occ {
			if i > 0 {
				row[i] = row[i-1]
			}
			if k == b {
				row[i]++
			}
		}
	}
	return occ
}