示例#1
0
// addWithoutMatch stores a stretch of an original sequence for which no
// match was found in the coarse database as a brand new coarse sequence,
// and wires up the links in both directions:
//
//   - a LinkToCompressed on the new coarse sequence pointing back at the
//     original sequence `orgSeqId`, and
//   - a diff-free LinkToCoarse on `cseq` pointing at the whole of the new
//     coarse sequence.
//
// Both links span the range [0, len(orgSub.Residues)).
func addWithoutMatch(cseq *mica.CompressedSeq,
	coarsedb *mica.CoarseDB, orgSeqId int, orgSub *mica.OriginalSeq) {

	// Hand the coarse database its own copy of the residues so that it does
	// not keep the original sequence's backing array pinned in memory.
	n := len(orgSub.Residues)
	residues := make([]byte, n)
	copy(residues, orgSub.Residues)

	coarseID, coarseSeq := coarsedb.Add(residues)

	// NOTE(review): the end coordinate is narrowed to uint16, so a region
	// longer than 65535 residues would wrap around -- confirm that callers
	// never pass one.
	back := mica.NewLinkToCompressed(uint32(orgSeqId), 0, uint16(n))
	coarseSeq.AddLink(back)

	forward := mica.NewLinkToCoarseNoDiff(uint(coarseID), 0, uint(n))
	cseq.Add(forward)
}
示例#2
0
// compress converts an original sequence into a compressed sequence.
// The process involves finding commonality in the original sequence with
// other sequences in the coarse database, and linking those common
// sub-sequences to sub-sequences in the coarse database. Any stretch of the
// original sequence for which no acceptable match is found is added to the
// coarse database verbatim (see addWithoutMatch).
//
// Parameters:
//
//   - db supplies the coarse database and all tuning knobs read here
//     (MapSeedSize, ExtSeedSize, LowComplexity, MinMatchLen, MatchExtend,
//     the window sizes and the identity thresholds).
//   - orgSeqId is the identifier recorded in every link created for this
//     sequence.
//   - orgSeq is the sequence being compressed.
//   - mem is scratch memory passed to seed lookup, match extension and
//     alignment to prevent allocation. Think of it as a goroutine-specific
//     memory arena.
//
// The returned CompressedSeq holds the links, in order, that cover the
// original sequence from its first residue to its last.
func compress(db *mica.DB, orgSeqId int,
	orgSeq *mica.OriginalSeq, mem *memory) mica.CompressedSeq {

	// Start the creation of a compressed sequence.
	cseq := mica.NewCompressedSeq(orgSeqId, orgSeq.Name)

	// Convenient aliases.
	coarsedb := db.CoarseDB
	mapSeedSize := db.MapSeedSize
	extSeedSize := db.ExtSeedSize
	olen := orgSeq.Len()

	// Keep track of two pointers. 'current' refers to the residue index in the
	// original sequence that extension is currently originating from.
	// 'lastMatch' refers to the residue index of the *end* of the last match
	// with a coarse sequence in the compressed database.
	lastMatch, current := 0, 0

	// Iterate through the original sequence a 'kmer' at a time. The bound
	// leaves room for both the seed and its extension after `current`.
	for current = 0; current < olen-mapSeedSize-extSeedSize; current++ {
		// Before trying to extend this with seeds, check to see if there is
		// a low complexity region within `db.MinMatchLen` residues from
		// `current`. If there is, skip ahead to the end of it. (The loop's
		// own increment then advances one further residue.)
		//
		// This check runs before the seed lookup so that no lookup is
		// performed for positions that are going to be skipped anyway.
		if db.LowComplexity > 0 {
			skip := skipLowComplexity(
				orgSeq.Residues[current:], db.MinMatchLen, db.LowComplexity)
			if skip > 0 {
				current += skip
				continue
			}
		}

		kmer := orgSeq.Residues[current : current+mapSeedSize]
		seeds := coarsedb.Seeds.Lookup(kmer, &mem.seeds)

		// Each seed location corresponding to the current K-mer must be
		// used to attempt to extend a match.
		for _, seedLoc := range seeds {
			corSeqId := int(seedLoc[0])
			corResInd := int(seedLoc[1])
			corSeq := coarsedb.CoarseSeqGet(uint(corSeqId))

			// If the seed extension extends beyond the end of the coarse
			// sequence pointed to by seedLoc, then move along.
			extCorStart := corResInd + mapSeedSize
			extOrgStart := current + mapSeedSize
			if extCorStart+extSeedSize >= corSeq.Len() {
				continue
			}

			// cseqExt and oseqExt contain the `extSeedSize` residues after
			// the end of the seed in the coarse and original sequences,
			// respectively. If the seed extensions are not equivalent,
			// skip this seedLoc.
			cseqExt := corSeq.Residues[extCorStart : extCorStart+extSeedSize]
			oseqExt := orgSeq.Residues[extOrgStart : extOrgStart+extSeedSize]
			if !bytes.Equal(cseqExt, oseqExt) {
				continue
			}

			// The "match" between coarse and original sequence will
			// occur somewhere between the residue index of the seed and
			// the end of the sequence for the coarse sequence, and the
			// position of the "current" pointer and the end of the sequence
			// for the original sequence.
			corMatch, orgMatch := extendMatch(
				corSeq.Residues[corResInd:], orgSeq.Residues[current:],
				db.GappedWindowSize, db.UngappedWindowSize,
				db.MatchKmerSize, db.ExtSeqIdThreshold,
				mem)

			// If the part of the original sequence does not exceed the
			// minimum match length, then we don't accept the match and move
			// on to the next one.
			if len(orgMatch) < db.MinMatchLen {
				continue
			}

			// Reject the match if the aligned regions are not similar
			// enough.
			alignment := nwAlign(corMatch, orgMatch, mem)
			id := mica.SeqIdentity(alignment[0], alignment[1])
			if id < db.MatchSeqIdThreshold {
				continue
			}

			// If we end up extending a match because we're close to
			// some boundary (either a sequence or a match boundary), then
			// we need to perform another alignment.
			changed := false

			// If we're close to the end of the original sequence, extend
			// the match to the end.
			if len(orgMatch)+db.MatchExtend >= olen-current {
				orgMatch = orgSeq.Residues[current:]
				changed = true
			}

			// And if we're close to the end of the last match, extend this
			// match backwards. Moving `current` back to `lastMatch` also
			// means there is no unmatched gap left to store below.
			if current-lastMatch <= db.MatchExtend {
				end := current + len(orgMatch)
				orgMatch = orgSeq.Residues[lastMatch:end]
				current = lastMatch
				changed = true
			}

			// If we've extended our match, we need another alignment.
			if changed {
				alignment = nwAlign(corMatch, orgMatch, mem)
			}

			// Otherwise, we accept the first valid match and move on to the
			// next kmer after the match ends.
			corStart := corResInd
			corEnd := corStart + len(corMatch)
			orgStart := current
			orgEnd := orgStart + len(orgMatch)

			// If there are residues between the end of the last match
			// and the start of this match, then that means no good match
			// could be found for those residues. Thus, they are added to
			// the coarse database. (A pathological LinkToCoarse is
			// created with an empty diff script that points to the added
			// region in the coarse database in its entirety.)
			if orgStart-lastMatch > 0 {
				orgSub := orgSeq.NewSubSequence(
					uint(lastMatch), uint(current))
				addWithoutMatch(&cseq, coarsedb, orgSeqId, orgSub)
			}

			// For the given match, add a LinkToCoarse to the portion of
			// the coarse sequence matched. This serves as a component
			// of a compressed original sequence. Also, add a
			// LinkToCompressed to the coarse sequence matched. This
			// serves as a bridge to expand coarse sequences into their
			// original sequences.
			//
			// NOTE(review): the coarse coordinates are narrowed to uint16
			// for the LinkToCompressed, so coordinates above 65535 would
			// wrap around -- confirm coarse sequences are bounded.
			cseq.Add(mica.NewLinkToCoarse(
				uint(corSeqId), uint(corStart), uint(corEnd), alignment))
			corSeq.AddLink(mica.NewLinkToCompressed(
				uint32(orgSeqId), uint16(corStart), uint16(corEnd)))

			// Skip the current pointer ahead to the end of this match.
			// Update the lastMatch pointer to point at the end of this
			// match. `current` is set one short of the end because the
			// outer loop increments it before the next iteration.
			lastMatch = orgEnd
			current = orgEnd - 1

			// Don't process any more seedLocs for this K-mer once we've
			// found a match.
			break
		}
	}

	// If there are any leftover residues, then no good match for them
	// could be found. Therefore, add them to the coarse database and
	// create the appropriate links.
	if olen-lastMatch > 0 {
		orgSub := orgSeq.NewSubSequence(uint(lastMatch), uint(olen))
		addWithoutMatch(&cseq, coarsedb, orgSeqId, orgSub)
	}

	return cseq
}