// alignUngapped takes a coarse and an original sub-sequence and returns a // length corresponding to the number of amino acids scanned by greedily // consuming successive K-mer matches in N-mer windows. // // The algorithm works by attempting to find *exact* K-mer matches between the // sequences in N-mer windows. If N residues are scanned and no K-mer match // is found, the the current value of length is returned (which may be 0). // If a K-mer match is found, the current value of length is set to the total // number of amino acid residues scanned, and a search for the next K-mer match // for the next N-mer window is started. func alignUngapped(rseq []byte, oseq []byte, windowSize, kmerSize, idThreshold int) int { length, scanned, successive := 0, 0, 0 tryNextWindow := true for tryNextWindow { tryNextWindow = false for i := 0; i < windowSize; i++ { // If we've scanned all residues in one of the sub-sequences, then // there is nothing left to do for ungapped extension. Therefore, // quit and return the number of residues scanned up until the // *last* match. if scanned >= len(rseq) || scanned >= len(oseq) { break } if rseq[scanned] == oseq[scanned] { successive++ } else { successive = 0 } scanned++ if successive == kmerSize { // Get the residues between matches: i.e., after the last // match to the start of this match. But only if there is at // least one residue in that range. if (scanned-kmerSize)-length > 0 { id := mica.SeqIdentity( rseq[length:scanned-kmerSize], oseq[length:scanned-kmerSize]) // If the identity is less than the threshold, then this // K-mer match is no good. But keep trying until the window // is closed. (We "keep trying" by decrementing successive // matches by 1.) if id < idThreshold { successive-- continue } } // If we're here, then we've found a valid match. Update the // length to indicate the number of residues scanned and make // sure we try the next Ungapped window. length = scanned successive = 0 tryNextWindow = true break } } } return length }
// compress will convert an original sequence into a compressed sequence. // The process involves finding commonality in the original sequence with // other sequences in the coarse database, and linking those common // sub-sequences to sub-sequences in the coarse database. // // N.B. `mem` is used in alignment and seed lookups to prevent allocation. // Think of them as goroutine-specific memory arenas. func compress(db *mica.DB, orgSeqId int, orgSeq *mica.OriginalSeq, mem *memory) mica.CompressedSeq { // cseqExt and oseqExt will contain `extSeedSize` residues after the end // of any particular seed in coarse and original sequences, respectively. // If the residues are not equivalent, that particular seed is skipped. var cseqExt, oseqExt []byte // Start the creation of a compressed sequence. cseq := mica.NewCompressedSeq(orgSeqId, orgSeq.Name) // Convenient aliases. coarsedb := db.CoarseDB mapSeedSize := db.MapSeedSize extSeedSize := db.ExtSeedSize olen := orgSeq.Len() // Keep track of two pointers. 'current' refers to the residue index in the // original sequence that extension is currently originating from. // 'lastMatch' refers to the residue index of the *end* of the last match // with a coarse sequence in the compressed database. lastMatch, current := 0, 0 // Iterate through the original sequence a 'kmer' at a time. for current = 0; current < olen-mapSeedSize-extSeedSize; current++ { kmer := orgSeq.Residues[current : current+mapSeedSize] seeds := coarsedb.Seeds.Lookup(kmer, &mem.seeds) // Before trying to extend this with seeds, check to see if there is // a low complexity region within `db.MinMatchLen` residues from // `current`. If there is, skip ahead to the end of it. if db.LowComplexity > 0 { skip := skipLowComplexity( orgSeq.Residues[current:], db.MinMatchLen, db.LowComplexity) if skip > 0 { current += skip continue } } // Each seed location corresponding to the current K-mer must be // used to attempt to extend a match. for _, seedLoc := range seeds { corSeqId := int(seedLoc[0]) corResInd := int(seedLoc[1]) corSeq := coarsedb.CoarseSeqGet(uint(corSeqId)) // If the seed extension extends beyond the end of the coarse // sequence pointed to by seedLoc, then move along. extCorStart := corResInd + mapSeedSize extOrgStart := current + mapSeedSize if extCorStart+extSeedSize >= corSeq.Len() { continue } // If the seed extensions in each sequence are not equivalent, // skip this seedLoc. cseqExt = corSeq.Residues[extCorStart : extCorStart+extSeedSize] oseqExt = orgSeq.Residues[extOrgStart : extOrgStart+extSeedSize] if !bytes.Equal(cseqExt, oseqExt) { continue } // The "match" between coarse and original sequence will // occur somewhere between the the residue index of the seed and // the end of the sequence for the coarse sequence, and the // position of the "current" pointer and the end of the sequence // for the original sequence. corMatch, orgMatch := extendMatch( corSeq.Residues[corResInd:], orgSeq.Residues[current:], db.GappedWindowSize, db.UngappedWindowSize, db.MatchKmerSize, db.ExtSeqIdThreshold, mem) // If the part of the original sequence does not exceed the // minimum match length, then we don't accept the match and move // on to the next one. if len(orgMatch) < db.MinMatchLen { continue } alignment := nwAlign(corMatch, orgMatch, mem) id := mica.SeqIdentity(alignment[0], alignment[1]) if id < db.MatchSeqIdThreshold { continue } // If we end up extending a match because we're close to // some boundary (either a sequence or a match boundary), then // we need to perform another alignment. changed := false // If we're close to the end of the original sequence, extend // the match to the end. if len(orgMatch)+db.MatchExtend >= orgSeq.Len()-int(current) { orgMatch = orgSeq.Residues[current:] changed = true } // And if we're close to the end of the last match, extend this // match backwards. if current-lastMatch <= db.MatchExtend { end := current + len(orgMatch) orgMatch = orgSeq.Residues[lastMatch:end] current = lastMatch changed = true } // If we've extended our match, we need another alignment. if changed { alignment = nwAlign(corMatch, orgMatch, mem) } // Otherwise, we accept the first valid match and move on to the // next kmer after the match ends. corStart := corResInd corEnd := corStart + len(corMatch) orgStart := current orgEnd := orgStart + len(orgMatch) // If there are residues between the end of the last match // and the start of this match, then that means no good match // could be found for those residues. Thus, they are added to // the coarse database. (A pathological LinkToCoarse is // created with an empty diff script that points to the added // region in the coarse database in its entirety.) if orgStart-lastMatch > 0 { orgSub := orgSeq.NewSubSequence( uint(lastMatch), uint(current)) addWithoutMatch(&cseq, coarsedb, orgSeqId, orgSub) } // For the given match, add a LinkToCoarse to the portion of // the coarse sequence matched. This serves as a component // of a compressed original sequence. Also, add a // LinkToCompressed to the coarse sequence matched. This // serves as a bridge to expand coarse sequences into their // original sequences. cseq.Add(mica.NewLinkToCoarse( uint(corSeqId), uint(corStart), uint(corEnd), alignment)) corSeq.AddLink(mica.NewLinkToCompressed( uint32(orgSeqId), uint16(corStart), uint16(corEnd))) // Skip the current pointer ahead to the end of this match. // Update the lastMatch pointer to point at the end of this // match. lastMatch = orgEnd current = orgEnd - 1 // Don't process any more seedLocs for this K-mer once we've // found a match. break } } // If there are any leftover residues, then no good match for them // could be found. Therefore, add them to the coarse database and // create the appropriate links. if orgSeq.Len()-lastMatch > 0 { orgSub := orgSeq.NewSubSequence(uint(lastMatch), uint(orgSeq.Len())) addWithoutMatch(&cseq, coarsedb, orgSeqId, orgSub) } return cseq }
func TestNeedlemanWunsch(t *testing.T) { type test struct { seq1, seq2 string out1, out2 string } tests := []test{ { "ABCD", "ABCD", "ABCD", "ABCD", }, { "PPPGHIKLMNPQR", "GAAAHIKLMN", "PPPGHIKLMNPQR", "---GAAAHIKLMN", }, { "GHIKLMNPQRSTVW", "GAAAHIKLMNPQRSTVW", "---GHIKLMNPQRSTVW", "GAAAHIKLMNPQRSTVW", }, { "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", }, { "NNNNNNNN", "NNNNNNNN", "NNNNNNNN", "NNNNNNNN", }, { "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", }, // { // "ABCDEFGWXYZ", // "ABCDEFMNPQRSTZABEGWXYZ", // "ABCDEF-----------GWXYZ", // "ABCDEFMNPQRSTZABEGWXYZ", // }, } sep := strings.Repeat("-", 45) mem := newMemory() for _, test := range tests { alignment := nwAlign([]byte(test.seq1), []byte(test.seq2), mem) sout1, sout2 := string(alignment[0]), string(alignment[1]) if sout1 != test.out1 || sout2 != test.out2 { t.Fatalf( `Alignment for: (sequence identitiy: %d) %s %s %s %s resulted in %s %s %s %s but should have been %s %s %s %s`, mica.SeqIdentity(alignment[0], alignment[1]), sep, test.seq1, test.seq2, sep, sep, sout1, sout2, sep, sep, test.out1, test.out2, sep) } } }
// extendMatch uses a combination of ungapped and gapped extension to find // quality candidates for compression. func extendMatch(corRes, orgRes []byte, gappedWindowSize, ungappedWindowSize, kmerSize, idThreshold int, mem *memory) (corMatchRes, orgMatchRes []byte) { // Starting at seedLoc.resInd and current (from 'compress'), corMatchLen // and orgMatchLen correspond to the length of the match in each of // the coarse and the original sequence, respectively. // At the end of the loop, the slices [seedLoc.resInd:corMatchLen] // and [current:orgMatchLen] will correspond to the match. (Again, this // is in the context of the inner loop in 'compress'. For this particular // function, corMatchLen and orgMatch start at 0, so that the matches // eventually returned correspond to the [:corMatchLen] and [:orgMatchLen] // slices.) corMatchLen, orgMatchLen := 0, 0 for { // If the match has consumed either of the coarse or original // sequence, then we must quit with what we have. if corMatchLen == len(corRes) || orgMatchLen == len(orgRes) { break } // Ungapped extension returns an integer corresponding to the // number of residues that the match was extended by. matchLen := alignUngapped( corRes[corMatchLen:], orgRes[orgMatchLen:], ungappedWindowSize, kmerSize, idThreshold) // Since ungapped extension increases the coarse and // original sequence match portions equivalently, add the // match length to both. corMatchLen += matchLen orgMatchLen += matchLen // Gapped extension returns an alignment corresponding to the // window starting after the previous ungapped extension // ended plus the gapped window size. (It is bounded by the // length of each sequence.) alignment := nwAlign( corRes[corMatchLen:min(len(corRes), corMatchLen+gappedWindowSize)], orgRes[orgMatchLen:min(len(orgRes), orgMatchLen+gappedWindowSize)], mem) // If the alignment has a sequence identity below the // threshold, then gapped extension has failed. We therefore // quit and are forced to be satisfied with whatever // corMatchLen and orgMatchLen are set to. id := mica.SeqIdentity(alignment[0], alignment[1]) if id < idThreshold { break } // We live to die another day. // We need to add to the corMatch{Pos,Len} and orgMatch{Pos,Len} // just like we did for ungapped extension. However, an // alignment can correspond to two different sized subsequences // of the coarse and original sequence. Therefore, only // increase each by the corresponding sizes from the // alignment. corMatchLen += alignLen(alignment[0]) orgMatchLen += alignLen(alignment[1]) } return corRes[:corMatchLen], orgRes[:orgMatchLen] }