Beispiel #1
0
// AlignmentProb returns the Viterbi score of the sequence `s` against the
// HMM of the fragment stored at index `fragi` in the library.
//
// It panics when the length of `s` differs from the number of nodes in
// that fragment's HMM.
func (lib *sequenceHMM) AlignmentProb(fragi int, s seq.Sequence) seq.Prob {
	fragment := lib.Fragments[fragi]
	if got, want := s.Len(), len(fragment.Nodes); got != want {
		panic(fmt.Sprintf("Sequence length %d != fragment size %d",
			got, want))
	}
	return fragment.ViterbiScore(s)
}
Beispiel #2
0
// expandCoarseSequence asks the coarse database to expand the coarse sequence
// identified by seqId over the range [0, coarseSequence.Len()), reading from
// the compressed database attached to db.
//
// On failure the error from Expand is returned with a nil slice.
func expandCoarseSequence(db *mica.DB, seqId int, coarseSequence *seq.Sequence) ([]mica.OriginalSeq, error) {
	expanded, err := db.CoarseDB.Expand(
		db.ComDB, seqId, 0, coarseSequence.Len())
	if err == nil {
		return expanded, nil
	}
	return nil, err
}
Beispiel #3
0
// NewFragment constructs a new fragment from a full query sequence and the
// hit from the HHR file.
//
// Since NewFragment requires access to the raw PDB alpha-carbon atoms (and
// the sequence) of the template hit, you'll also need to pass a path to the
// PDB database. (Which is a directory containing a flat list of all
// PDB files used to construct the corresponding hhblits database.) This
// database is usually located inside the 'pdb' directory contained in the
// corresponding hhsuite database. i.e., $HHLIB/data/pdb-select25/pdb
//
// The PDB file is looked up as "<name>.pdb" first and, failing that, as
// "<name>.ent.gz". The chain identifier is taken from the fifth character of
// the template name.
//
// An error is returned if the template name is too short to carry a chain
// identifier, if neither PDB file can be read, if the chain cannot be found,
// or if the hit's template range does not fit inside the chain's sequence.
//
// A fragment is returned with nil CaAtoms (i.e., "corrupt") when the query and
// template hit regions differ in length or when no alpha-carbon atom slice is
// available for the template range.
func NewFragment(
	pdbDb PDBDatabase, qs seq.Sequence, hit hhr.Hit) (Fragment, error) {

	pdbName := getTemplatePdbName(hit.Name)

	// The chain identifier is read from index 4 below, so reject names that
	// are too short instead of panicking on the index expression.
	if len(pdbName) < 5 {
		return Fragment{}, fmt.Errorf("template name '%s' is too short to "+
			"contain a chain identifier", pdbName)
	}

	pdbEntry, err := pdb.ReadPDB(path.Join(
		pdbDb.PDB(), fmt.Sprintf("%s.pdb", pdbName)))
	if err != nil {
		// Fall back to the compressed naming scheme.
		pdbEntry, err = pdb.ReadPDB(path.Join(
			pdbDb.PDB(), fmt.Sprintf("%s.ent.gz", pdbName)))
		if err != nil {
			return Fragment{}, err
		}
	}

	// Load in the sequence from the PDB file using the SEQRES residues.
	ts, te := hit.TemplateStart, hit.TemplateEnd
	chain := pdbEntry.Chain(pdbName[4])
	if chain == nil {
		return Fragment{}, fmt.Errorf("Could not find chain '%c' in PDB "+
			"entry '%s'.", pdbName[4], pdbEntry.Path)
	}

	// The template range is 1-based and inclusive. Validate it before it is
	// used as a slice bound and an allocation size, both of which would
	// otherwise panic on a malformed hit.
	if ts < 1 || te < ts-1 || te > len(chain.Sequence) {
		return Fragment{}, fmt.Errorf("template range [%d, %d] is out of "+
			"bounds for chain '%c' of PDB entry '%s' (%d residues)",
			ts, te, pdbName[4], pdbEntry.Path, len(chain.Sequence))
	}
	tseq := seq.Sequence{
		Name:     pdbName,
		Residues: make([]seq.Residue, te-ts+1),
	}

	// We copy here to avoid pinning pdb.Entry objects.
	copy(tseq.Residues, chain.Sequence[ts-1:te])

	frag := Fragment{
		Query:    qs.Slice(hit.QueryStart-1, hit.QueryEnd),
		Template: tseq,
		Hit:      hit,
		CaAtoms:  nil,
	}

	// We designate "corrupt" if the query/template hit regions are of
	// different length. i.e., we don't allow gaps (yet).
	// BUG(burntsushi): Fragments with gaps are marked as corrupt.
	if hit.QueryEnd-hit.QueryStart != hit.TemplateEnd-hit.TemplateStart {
		return frag, nil
	}

	// We also designate "corrupt" if there are any gaps in our alpha-carbon
	// atom list.
	atoms := chain.SequenceCaAtomSlice(ts-1, te)
	if atoms == nil {
		return frag, nil
	}

	// Once again, we copy to avoid pinning memory.
	frag.CaAtoms = make([]structure.Coords, len(atoms))
	copy(frag.CaAtoms, atoms)

	return frag, nil
}
Beispiel #4
0
// Write formats a single sequence as a FASTA entry and appends it to the
// writer's buffer.
//
// When the Asterisk option is set, the entry is terminated with a '*'. The
// asterisk is placed on its own line when Columns is positive and the
// sequence length is an exact multiple of Columns; otherwise it directly
// follows the formatted sequence.
//
// You may need to call Flush in order for the changes to be written.
//
// XXX: Currently, the sequence is not checked. Should it be?
func (w *Writer) Write(s seq.Sequence) error {
	var layout string
	switch {
	case !w.Asterisk:
		layout = "%s\n"
	case w.Columns > 0 && s.Len()%w.Columns == 0:
		layout = "%s\n*\n"
	default:
		layout = "%s*\n"
	}

	entry := fmt.Sprintf(layout, SequenceFasta(s, w.Columns))
	if _, err := w.buf.WriteString(entry); err != nil {
		return err
	}
	return nil
}
Beispiel #5
0
// SequenceBow computes a bag-of-words for the query sequence `s` using the
// sequence fragment library `lib`.
//
// Every window of the library's fragment size in `s` is matched against the
// library with BestSequenceFragment; windows for which a negative fragment
// number comes back are not counted. A sequence shorter than the fragment
// size yields an empty BOW.
//
// If the lib given is a weighted library, then the BOW returned will also
// be weighted.
//
// Note that this function should only be used when providing your own
// implementation of the SequenceBower interface. Otherwise, BOWs should
// be computed using the SequenceBow method of the interface.
func SequenceBow(lib fragbag.SequenceLibrary, s seq.Sequence) Bow {
	bow := NewBow(lib.Size())
	window := lib.FragmentSize()

	for start, last := 0, s.Len()-window; start <= last; start++ {
		fragNum := lib.BestSequenceFragment(s.Slice(start, start+window))
		if fragNum >= 0 {
			bow.Freqs[fragNum]++
		}
	}

	if weighted, ok := lib.(fragbag.WeightedLibrary); ok {
		return bow.Weighted(weighted)
	}
	return bow
}
Beispiel #6
0
// expandCoarseSequence asks the coarse database to expand the coarse sequence
// identified by seqId over the range [0, coarseSequence.Len()), reading from
// the compressed database attached to db.
//
// On failure the error from Expand is returned with a nil slice.
//
// NOTE: an earlier draft sketched converting each expanded sequence into a
// cablastp.ReducedSeq at this point; that conversion was never implemented,
// so the original sequences are returned unchanged.
func expandCoarseSequence(db *cablastp.DB, seqId int, coarseSequence *seq.Sequence) ([]cablastp.OriginalSeq, error) {
	expanded, err := db.CoarseDB.Expand(
		db.ComDB, seqId, 0, coarseSequence.Len())
	if err == nil {
		return expanded, nil
	}
	return nil, err
}
Beispiel #7
0
// aminoFromStructure builds a sequence from the residues of the first model
// of the given chain.
//
// The sequence is named after the entry's Cath identifier if it is non-empty,
// otherwise after its Scop identifier if that is non-empty, and otherwise
// after the entry's id code followed by the chain identifier.
//
// A residue is appended only when its sequence number differs from that of
// the previously appended residue. The comparison starts from 0, so leading
// residues numbered 0 are skipped.
func aminoFromStructure(chain *pdb.Chain) seq.Sequence {
	var name string
	switch {
	case len(chain.Entry.Cath) > 0:
		name = chain.Entry.Cath
	case len(chain.Entry.Scop) > 0:
		name = chain.Entry.Scop
	default:
		name = fmt.Sprintf("%s%c", chain.Entry.IdCode, chain.Ident)
	}

	residues := make([]seq.Residue, 0, 50)
	prevNum := 0
	for _, res := range chain.Models[0].Residues {
		if res.SequenceNum == prevNum {
			continue
		}
		residues = append(residues, res.Name)
		prevNum = res.SequenceNum
	}

	return seq.Sequence{
		Name:     name,
		Residues: residues,
	}
}
Beispiel #8
0
// AlignmentProb scores the sequence `s` against the profile of the fragment
// stored at index `fragi` in the library. The score is the sum, over every
// column, of the profile's emission lookup for the residue in that column.
//
// It panics when the length of `s` differs from the length of the profile.
func (lib *sequenceProfile) AlignmentProb(fragi int, s seq.Sequence) seq.Prob {
	profile := lib.Fragments[fragi]
	if got, want := s.Len(), profile.Len(); got != want {
		panic(fmt.Sprintf("Sequence length %d != fragment size %d",
			got, want))
	}

	var total seq.Prob
	for col, n := 0, s.Len(); col < n; col++ {
		total += profile.Emissions[col].Lookup(s.Residues[col])
	}
	return total
}
Beispiel #9
0
// BestSequenceFragment returns the number of the fragment whose HMM gives
// the highest Viterbi score for the sequence `s`.
// The length of `s` must be equivalent to the fragment size; otherwise this
// method panics.
//
// If no fragment scores above seq.MinProb, then `-1` is returned. This
// behavior will almost certainly change in the future.
func (lib *sequenceHMM) BestSequenceFragment(s seq.Sequence) int {
	size := lib.FragmentSize()
	if s.Len() != size {
		panic(fmt.Sprintf("Sequence length %d != fragment size %d",
			s.Len(), size))
	}

	// One table is allocated up front and handed to every scoring call.
	table := seq.AllocTable(size, s.Len())

	bestNum, bestScore := -1, seq.MinProb
	for _, frag := range lib.Fragments {
		score := frag.ViterbiScoreMem(s, table)
		if !bestScore.Less(score) {
			continue
		}
		bestNum, bestScore = frag.FragNumber, score
	}
	return bestNum
}
Beispiel #10
0
// main converts the PDB entry named by the first positional argument into
// FASTA output.
//
// Unless flagSeparateChains is set, the sequences of all usable chains are
// concatenated into a single FASTA entry; otherwise one entry is produced per
// usable chain. Output goes to stdout when only one positional argument is
// given, to the file named by the second positional argument otherwise, or,
// when flagSplit is non-empty, to one "<name>.fasta" file per entry inside
// the flagSplit directory. flagSplit cannot be combined with a single output
// file.
func main() {
	pdbEntry := util.PDBRead(flag.Arg(0))

	fasEntries := make([]seq.Sequence, 0, 5)
	if !flagSeparateChains {
		var fasEntry seq.Sequence
		if len(pdbEntry.Chains) == 1 {
			fasEntry.Name = chainHeader(pdbEntry.OneChain())
		} else {
			fasEntry.Name = strings.ToLower(pdbEntry.IdCode)
		}

		// Named 'residues' rather than 'seq' so the seq package is not
		// shadowed for the rest of this scope.
		residues := make([]seq.Residue, 0, 100)
		for _, chain := range pdbEntry.Chains {
			if isChainUsable(chain) {
				residues = append(residues, chain.Sequence...)
			}
		}
		fasEntry.Residues = residues

		if len(fasEntry.Residues) == 0 {
			util.Fatalf("Could not find any amino acids.")
		}
		fasEntries = append(fasEntries, fasEntry)
	} else {
		for _, chain := range pdbEntry.Chains {
			if !isChainUsable(chain) {
				continue
			}

			fasEntry := seq.Sequence{
				Name:     chainHeader(chain),
				Residues: chain.Sequence,
			}
			fasEntries = append(fasEntries, fasEntry)
		}
	}
	if len(fasEntries) == 0 {
		util.Fatalf("Could not find any chains with amino acids.")
	}

	// outName is what gets reported if writing fails; formatting the writer
	// value itself with %s does not produce a usable name.
	var fasOut io.Writer
	outName := "stdout"
	if flag.NArg() == 1 {
		fasOut = os.Stdout
	} else {
		if len(flagSplit) > 0 {
			util.Fatalf("The '--split' option is incompatible with a single " +
				"output file.")
		}
		outName = util.Arg(1)
		fasOut = util.CreateFile(outName)
	}

	if len(flagSplit) == 0 {
		util.Assert(fasta.NewWriter(fasOut).WriteAll(fasEntries),
			"Could not write FASTA file '%s'", outName)
	} else {
		for _, entry := range fasEntries {
			fp := path.Join(flagSplit, fmt.Sprintf("%s.fasta", entry.Name))
			out := util.CreateFile(fp)

			w := fasta.NewWriter(out)
			util.Assert(w.Write(entry), "Could not write to '%s'", fp)
			util.Assert(w.Flush(), "Could not write to '%s'", fp)
		}
	}
}
Beispiel #11
0
// newFastaSeq builds a new *sequence value from seq's Sequence type by
// passing the given id, the sequence's name and its residue bytes to newSeq.
//
// NOTE(review): any upper-casing of residues would have to happen inside
// newSeq; nothing in this function does it. Confirm against newSeq.
func newFastaSeq(id int, s seq.Sequence) *sequence {
	name, residues := s.Name, s.Bytes()
	return newSeq(id, name, residues)
}
Beispiel #12
0
// NewFastaCoarseSeq builds a new *CoarseSeq from seq's Sequence type by
// passing the given id, the sequence's name and its residue bytes to
// NewCoarseSeq.
func NewFastaCoarseSeq(id int, s seq.Sequence) *CoarseSeq {
	name, residues := s.Name, s.Bytes()
	return NewCoarseSeq(id, name, residues)
}
Beispiel #13
0
// computeMap builds a fragment map for the query sequence by sliding a window
// across it and keeping, for each window start, the best set of fragments
// found over all window sizes in [WindowMin, WindowMax].
//
// Window starts advance by WindowIncrement and are distributed over
// GOMAXPROCS worker goroutines, each of which calls FindFragments for every
// candidate window size. The resulting segments are sorted before the map is
// returned.
//
// If any FindFragments call fails, the first error received is returned and
// all outstanding goroutines are signalled to stop. A window start for which
// no window size is applicable contributes no segment.
//
// NOTE(review): a non-positive WindowIncrement prevents the job producer from
// ever advancing past the end of the sequence; callers are expected to supply
// a positive value.
func (m MapConfig) computeMap(
	pdbDb PDBDatabase, qseq seq.Sequence, qhhm *hmm.HHM) (*FragmentMap, error) {

	type maybeFrag struct {
		frags Fragments
		err   error
	}

	var wg sync.WaitGroup
	jobs := make(chan int, 10)
	fragsChan := make(chan maybeFrag, 10)

	// done is closed when this function returns so that, on an early error
	// return, neither the workers nor the producer stay blocked on a channel
	// send forever.
	done := make(chan struct{})
	defer close(done)

	workers := runtime.GOMAXPROCS(0)
	if workers < 1 {
		workers = 1
	}

	// Register every worker before any of them starts. Calling Add from
	// inside the goroutine races with Wait, which could then return (and
	// close fragsChan) while workers are still about to send.
	wg.Add(workers)
	for i := 0; i < workers; i++ {
		go func() {
			defer wg.Done()

			lo, hi := m.WindowMin, m.WindowMax
		CHANNEL:
			for start := range jobs {
				// Don't begin new work once the consumer has gone away.
				select {
				case <-done:
					return
				default:
				}

				var best *Fragments
				for end := lo; end <= hi && (start+end) <= qseq.Len(); end++ {
					frags, err := FindFragments(
						pdbDb, m.Blits, qhhm, qseq, start, start+end)
					if err != nil {
						select {
						case fragsChan <- maybeFrag{err: err}:
						case <-done:
							return
						}
						continue CHANNEL
					}
					if best == nil || frags.better(*best) {
						best = frags
					}
				}

				// No window size applied to this start (e.g. WindowMin
				// exceeds WindowMax), so there is nothing to report.
				if best == nil {
					continue
				}
				select {
				case fragsChan <- maybeFrag{frags: *best}:
				case <-done:
					return
				}
			}
		}()
	}
	go func() {
		// Whether or not production was cut short, let the workers drain
		// and then signal the consumer that no more results will arrive.
		defer func() {
			close(jobs)
			wg.Wait()
			close(fragsChan)
		}()
		for s := 0; s <= qseq.Len()-m.WindowMin; s += m.WindowIncrement {
			select {
			case jobs <- s:
			case <-done:
				return
			}
		}
	}()

	fmap := &FragmentMap{
		Name:     qseq.Name,
		Segments: make([]Fragments, 0, 50),
	}
	for result := range fragsChan {
		if result.err != nil {
			return nil, result.err
		}
		fmap.Segments = append(fmap.Segments, result.frags)
	}
	sort.Sort(fmap)
	return fmap, nil
}
Beispiel #14
0
// ReadSequence is exported for use in other packages that read FASTA-like
// files. It reads the next entry from the underlying buffer: a header line
// starting with '>' followed by the residue lines up to (but excluding) the
// next header or the end of input.
//
// The 'translate' function is used when sequences are checked for valid
// characters, which happens only when TrustSequences is false. A residue for
// which translate returns the zero byte is dropped; a residue for which it
// reports failure aborts the read with an error.
//
// Blank lines and lines starting with "C;", "structure" or "sequence" are
// skipped. When the input is exhausted, the entry accumulated so far is
// returned together with io.EOF.
//
// If you're just reading FASTA files, this method SHOULD NOT be used.
func (r *Reader) ReadSequence(translate Translator) (seq.Sequence, error) {
	var (
		entry     seq.Sequence
		hasHeader bool
	)

	// The previous call may already have consumed this entry's header while
	// looking for the end of its own entry.
	if r.nextHeader != nil {
		entry.Name = trimHeader(r.nextHeader)
		r.nextHeader = nil
		hasHeader = true
	}

	for {
		raw, err := r.buf.ReadBytes('\n')
		switch {
		case err == io.EOF:
			// A final line without a trailing newline still needs to be
			// processed; only a truly empty read ends the entry.
			if len(raw) == 0 {
				return entry, io.EOF
			}
		case err != nil:
			return seq.Sequence{}, fmt.Errorf("Error on line %d: %s",
				r.line, err)
		}
		text := bytes.TrimSpace(raw)

		// Blank lines and PIR junk are counted but otherwise ignored.
		ignorable := len(text) == 0 ||
			bytes.HasPrefix(text, []byte("C;")) ||
			bytes.HasPrefix(text, []byte("structure")) ||
			bytes.HasPrefix(text, []byte("sequence"))
		if ignorable {
			r.line++
			continue
		}

		isHeader := text[0] == '>'
		switch {
		case !hasHeader && !isHeader:
			// The first meaningful line of an entry must be its header.
			return seq.Sequence{},
				fmt.Errorf("Expected '>', got '%c' on line %d.",
					text[0], r.line)
		case !hasHeader:
			// Trim the '>' and load this line into the header.
			entry.Name = trimHeader(text)
			hasHeader = true

			r.line++
			continue
		case isHeader:
			// The next entry has begun. Stash its header for the next
			// call and hand back the current entry.
			r.nextHeader = text

			r.line++
			return entry, nil
		}

		// Everything else is residue data. Trusted input is appended as is;
		// otherwise each character goes through translate.
		if entry.Residues == nil {
			entry.Residues = make([]seq.Residue, 0, 50)
		}
		for _, b := range text {
			if r.TrustSequences {
				entry.Residues = append(entry.Residues, seq.Residue(b))
				continue
			}

			residue, ok := translate(b)
			if !ok {
				return seq.Sequence{},
					fmt.Errorf("Invalid character '%c' on line %d.",
						b, r.line)
			}

			// If the zero byte is returned from translate, then we
			// don't keep this residue around.
			if residue > 0 {
				entry.Residues = append(entry.Residues, residue)
			}
		}
		r.line++
	}
}