Esempio n. 1
0
// alignUngapped takes a coarse and an original sub-sequence and returns a
// length corresponding to the number of amino acids scanned by greedily
// consuming successive K-mer matches in N-mer windows.
//
// The algorithm works by attempting to find *exact* K-mer matches between the
// sequences in N-mer windows. If N residues are scanned and no K-mer match
// is found, the the current value of length is returned (which may be 0).
// If a K-mer match is found, the current value of length is set to the total
// number of amino acid residues scanned, and a search for the next K-mer match
// for the next N-mer window is started.
func alignUngapped(rseq []byte, oseq []byte,
	windowSize, kmerSize, idThreshold int) int {

	length, scanned, successive := 0, 0, 0
	tryNextWindow := true
	for tryNextWindow {
		tryNextWindow = false
		for i := 0; i < windowSize; i++ {
			// If we've scanned all residues in one of the sub-sequences, then
			// there is nothing left to do for ungapped extension. Therefore,
			// quit and return the number of residues scanned up until the
			// *last* match.
			if scanned >= len(rseq) || scanned >= len(oseq) {
				break
			}

			if rseq[scanned] == oseq[scanned] {
				successive++
			} else {
				successive = 0
			}

			scanned++
			if successive == kmerSize {
				// Get the residues between matches: i.e., after the last
				// match to the start of this match. But only if there is at
				// least one residue in that range.
				if (scanned-kmerSize)-length > 0 {
					id := mica.SeqIdentity(
						rseq[length:scanned-kmerSize],
						oseq[length:scanned-kmerSize])

					// If the identity is less than the threshold, then this
					// K-mer match is no good. But keep trying until the window
					// is closed. (We "keep trying" by decrementing successive
					// matches by 1.)
					if id < idThreshold {
						successive--
						continue
					}
				}

				// If we're here, then we've found a valid match. Update the
				// length to indicate the number of residues scanned and make
				// sure we try the next Ungapped window.
				length = scanned
				successive = 0
				tryNextWindow = true
				break
			}
		}
	}
	return length
}
Esempio n. 2
0
// compress will convert an original sequence into a compressed sequence.
// The process involves finding commonality in the original sequence with
// other sequences in the coarse database, and linking those common
// sub-sequences to sub-sequences in the coarse database.
//
// N.B. `mem` is used in alignment and seed lookups to prevent allocation.
// Think of them as goroutine-specific memory arenas.
func compress(db *mica.DB, orgSeqId int,
	orgSeq *mica.OriginalSeq, mem *memory) mica.CompressedSeq {

	// cseqExt and oseqExt will contain `extSeedSize` residues after the end
	// of any particular seed in coarse and original sequences, respectively.
	// If the residues are not equivalent, that particular seed is skipped.
	var cseqExt, oseqExt []byte

	// Start the creation of a compressed sequence.
	cseq := mica.NewCompressedSeq(orgSeqId, orgSeq.Name)

	// Convenient aliases.
	coarsedb := db.CoarseDB
	mapSeedSize := db.MapSeedSize
	extSeedSize := db.ExtSeedSize
	olen := orgSeq.Len()

	// Keep track of two pointers. 'current' refers to the residue index in the
	// original sequence that extension is currently originating from.
	// 'lastMatch' refers to the residue index of the *end* of the last match
	// with a coarse sequence in the compressed database.
	lastMatch, current := 0, 0

	// Iterate through the original sequence a 'kmer' at a time.
	for current = 0; current < olen-mapSeedSize-extSeedSize; current++ {
		kmer := orgSeq.Residues[current : current+mapSeedSize]
		seeds := coarsedb.Seeds.Lookup(kmer, &mem.seeds)

		// Before trying to extend this with seeds, check to see if there is
		// a low complexity region within `db.MinMatchLen` residues from
		// `current`. If there is, skip ahead to the end of it.
		if db.LowComplexity > 0 {
			skip := skipLowComplexity(
				orgSeq.Residues[current:], db.MinMatchLen, db.LowComplexity)
			if skip > 0 {
				current += skip
				continue
			}
		}

		// Each seed location corresponding to the current K-mer must be
		// used to attempt to extend a match.
		for _, seedLoc := range seeds {
			corSeqId := int(seedLoc[0])
			corResInd := int(seedLoc[1])
			corSeq := coarsedb.CoarseSeqGet(uint(corSeqId))

			// If the seed extension extends beyond the end of the coarse
			// sequence pointed to by seedLoc, then move along.
			extCorStart := corResInd + mapSeedSize
			extOrgStart := current + mapSeedSize
			if extCorStart+extSeedSize >= corSeq.Len() {
				continue
			}

			// If the seed extensions in each sequence are not equivalent,
			// skip this seedLoc.
			cseqExt = corSeq.Residues[extCorStart : extCorStart+extSeedSize]
			oseqExt = orgSeq.Residues[extOrgStart : extOrgStart+extSeedSize]
			if !bytes.Equal(cseqExt, oseqExt) {
				continue
			}

			// The "match" between coarse and original sequence will
			// occur somewhere between the the residue index of the seed and
			// the end of the sequence for the coarse sequence, and the
			// position of the "current" pointer and the end of the sequence
			// for the original sequence.
			corMatch, orgMatch := extendMatch(
				corSeq.Residues[corResInd:], orgSeq.Residues[current:],
				db.GappedWindowSize, db.UngappedWindowSize,
				db.MatchKmerSize, db.ExtSeqIdThreshold,
				mem)

			// If the part of the original sequence does not exceed the
			// minimum match length, then we don't accept the match and move
			// on to the next one.
			if len(orgMatch) < db.MinMatchLen {
				continue
			}

			alignment := nwAlign(corMatch, orgMatch, mem)
			id := mica.SeqIdentity(alignment[0], alignment[1])
			if id < db.MatchSeqIdThreshold {
				continue
			}

			// If we end up extending a match because we're close to
			// some boundary (either a sequence or a match boundary), then
			// we need to perform another alignment.
			changed := false

			// If we're close to the end of the original sequence, extend
			// the match to the end.
			if len(orgMatch)+db.MatchExtend >= orgSeq.Len()-int(current) {
				orgMatch = orgSeq.Residues[current:]
				changed = true
			}

			// And if we're close to the end of the last match, extend this
			// match backwards.
			if current-lastMatch <= db.MatchExtend {
				end := current + len(orgMatch)
				orgMatch = orgSeq.Residues[lastMatch:end]
				current = lastMatch
				changed = true
			}

			// If we've extended our match, we need another alignment.
			if changed {
				alignment = nwAlign(corMatch, orgMatch, mem)
			}

			// Otherwise, we accept the first valid match and move on to the
			// next kmer after the match ends.
			corStart := corResInd
			corEnd := corStart + len(corMatch)
			orgStart := current
			orgEnd := orgStart + len(orgMatch)

			// If there are residues between the end of the last match
			// and the start of this match, then that means no good match
			// could be found for those residues. Thus, they are added to
			// the coarse database. (A pathological LinkToCoarse is
			// created with an empty diff script that points to the added
			// region in the coarse database in its entirety.)
			if orgStart-lastMatch > 0 {
				orgSub := orgSeq.NewSubSequence(
					uint(lastMatch), uint(current))
				addWithoutMatch(&cseq, coarsedb, orgSeqId, orgSub)
			}

			// For the given match, add a LinkToCoarse to the portion of
			// the coarse sequence matched. This serves as a component
			// of a compressed original sequence. Also, add a
			// LinkToCompressed to the coarse sequence matched. This
			// serves as a bridge to expand coarse sequences into their
			// original sequences.
			cseq.Add(mica.NewLinkToCoarse(
				uint(corSeqId), uint(corStart), uint(corEnd), alignment))
			corSeq.AddLink(mica.NewLinkToCompressed(
				uint32(orgSeqId), uint16(corStart), uint16(corEnd)))

			// Skip the current pointer ahead to the end of this match.
			// Update the lastMatch pointer to point at the end of this
			// match.
			lastMatch = orgEnd
			current = orgEnd - 1

			// Don't process any more seedLocs for this K-mer once we've
			// found a match.
			break
		}
	}

	// If there are any leftover residues, then no good match for them
	// could be found. Therefore, add them to the coarse database and
	// create the appropriate links.
	if orgSeq.Len()-lastMatch > 0 {
		orgSub := orgSeq.NewSubSequence(uint(lastMatch), uint(orgSeq.Len()))
		addWithoutMatch(&cseq, coarsedb, orgSeqId, orgSub)
	}

	return cseq
}
Esempio n. 3
0
func TestNeedlemanWunsch(t *testing.T) {
	type test struct {
		seq1, seq2 string
		out1, out2 string
	}

	tests := []test{
		{
			"ABCD",
			"ABCD",
			"ABCD",
			"ABCD",
		},
		{
			"PPPGHIKLMNPQR",
			"GAAAHIKLMN",
			"PPPGHIKLMNPQR",
			"---GAAAHIKLMN",
		},
		{
			"GHIKLMNPQRSTVW",
			"GAAAHIKLMNPQRSTVW",
			"---GHIKLMNPQRSTVW",
			"GAAAHIKLMNPQRSTVW",
		},
		{
			"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
			"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
			"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
			"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
		},
		{
			"NNNNNNNN",
			"NNNNNNNN",
			"NNNNNNNN",
			"NNNNNNNN",
		},
		{
			"NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN",
			"NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN",
			"NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN",
			"NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN",
		},
		// {
		// "ABCDEFGWXYZ",
		// "ABCDEFMNPQRSTZABEGWXYZ",
		// "ABCDEF-----------GWXYZ",
		// "ABCDEFMNPQRSTZABEGWXYZ",
		// },
	}
	sep := strings.Repeat("-", 45)
	mem := newMemory()
	for _, test := range tests {
		alignment := nwAlign([]byte(test.seq1), []byte(test.seq2), mem)
		sout1, sout2 := string(alignment[0]), string(alignment[1])

		if sout1 != test.out1 || sout2 != test.out2 {
			t.Fatalf(
				`Alignment for: (sequence identitiy: %d)
%s
%s
%s
%s
resulted in
%s
%s
%s
%s
but should have been
%s
%s
%s
%s`,
				mica.SeqIdentity(alignment[0], alignment[1]),
				sep, test.seq1, test.seq2, sep,
				sep, sout1, sout2, sep,
				sep, test.out1, test.out2, sep)
		}
	}
}
Esempio n. 4
0
// extendMatch uses a combination of ungapped and gapped extension to find
// quality candidates for compression.
func extendMatch(corRes, orgRes []byte,
	gappedWindowSize, ungappedWindowSize, kmerSize, idThreshold int,
	mem *memory) (corMatchRes, orgMatchRes []byte) {

	// Starting at seedLoc.resInd and current (from 'compress'), corMatchLen
	// and orgMatchLen correspond to the length of the match in each of
	// the coarse and the original sequence, respectively.
	// At the end of the loop, the slices [seedLoc.resInd:corMatchLen]
	// and [current:orgMatchLen] will correspond to the match. (Again, this
	// is in the context of the inner loop in 'compress'. For this particular
	// function, corMatchLen and orgMatch start at 0, so that the matches
	// eventually returned correspond to the [:corMatchLen] and [:orgMatchLen]
	// slices.)
	corMatchLen, orgMatchLen := 0, 0
	for {
		// If the match has consumed either of the coarse or original
		// sequence, then we must quit with what we have.
		if corMatchLen == len(corRes) || orgMatchLen == len(orgRes) {
			break
		}

		// Ungapped extension returns an integer corresponding to the
		// number of residues that the match was extended by.
		matchLen := alignUngapped(
			corRes[corMatchLen:], orgRes[orgMatchLen:],
			ungappedWindowSize, kmerSize, idThreshold)

		// Since ungapped extension increases the coarse and
		// original sequence match portions equivalently, add the
		// match length to both.
		corMatchLen += matchLen
		orgMatchLen += matchLen

		// Gapped extension returns an alignment corresponding to the
		// window starting after the previous ungapped extension
		// ended plus the gapped window size. (It is bounded by the
		// length of each sequence.)
		alignment := nwAlign(
			corRes[corMatchLen:min(len(corRes), corMatchLen+gappedWindowSize)],
			orgRes[orgMatchLen:min(len(orgRes), orgMatchLen+gappedWindowSize)],
			mem)

		// If the alignment has a sequence identity below the
		// threshold, then gapped extension has failed. We therefore
		// quit and are forced to be satisfied with whatever
		// corMatchLen and orgMatchLen are set to.
		id := mica.SeqIdentity(alignment[0], alignment[1])
		if id < idThreshold {
			break
		}

		// We live to die another day.
		// We need to add to the corMatch{Pos,Len} and orgMatch{Pos,Len}
		// just like we did for ungapped extension. However, an
		// alignment can correspond to two different sized subsequences
		// of the coarse and original sequence. Therefore, only
		// increase each by the corresponding sizes from the
		// alignment.
		corMatchLen += alignLen(alignment[0])
		orgMatchLen += alignLen(alignment[1])
	}

	return corRes[:corMatchLen], orgRes[:orgMatchLen]
}