// AlignmentProb computes the probability of the sequence `s` aligning // with the HMM in `frag`. The sequence must have length equivalent // to the fragment size. func (lib *sequenceHMM) AlignmentProb(fragi int, s seq.Sequence) seq.Prob { frag := lib.Fragments[fragi] if s.Len() != len(frag.Nodes) { panic(fmt.Sprintf("Sequence length %d != fragment size %d", s.Len(), len(frag.Nodes))) } return frag.ViterbiScore(s) }
func expandCoarseSequence(db *mica.DB, seqId int, coarseSequence *seq.Sequence) ([]mica.OriginalSeq, error) { originalSeqs, err := db.CoarseDB.Expand(db.ComDB, seqId, 0, coarseSequence.Len()) if err != nil { return nil, err } return originalSeqs, nil }
// NewFragment constructs a new fragment from a full query sequence and the // hit from the HHR file. // // Since NewFragment requires access to the raw PDB alpha-carbon atoms (and // the sequence) of the template hit, you'll also need to pass a path to the // PDB database. (Which is a directory containing a flat list of all // PDB files used to construct the corresponding hhblits database.) This // database is usually located inside the 'pdb' directory contained in the // corresponding hhsuite database. i.e., $HHLIB/data/pdb-select25/pdb func NewFragment( pdbDb PDBDatabase, qs seq.Sequence, hit hhr.Hit) (Fragment, error) { pdbName := getTemplatePdbName(hit.Name) pdbEntry, err := pdb.ReadPDB(path.Join( pdbDb.PDB(), fmt.Sprintf("%s.pdb", pdbName))) if err != nil { pdbEntry, err = pdb.ReadPDB(path.Join( pdbDb.PDB(), fmt.Sprintf("%s.ent.gz", pdbName))) if err != nil { return Fragment{}, err } } // Load in the sequence from the PDB file using the SEQRES residues. ts, te := hit.TemplateStart, hit.TemplateEnd chain := pdbEntry.Chain(pdbName[4]) if chain == nil { return Fragment{}, fmt.Errorf("Could not find chain '%c' in PDB "+ "entry '%s'.", pdbName[4], pdbEntry.Path) } tseq := seq.Sequence{ Name: pdbName, Residues: make([]seq.Residue, te-ts+1), } // We copy here to avoid pinning pdb.Entry objects. copy(tseq.Residues, chain.Sequence[ts-1:te]) frag := Fragment{ Query: qs.Slice(hit.QueryStart-1, hit.QueryEnd), Template: tseq, Hit: hit, CaAtoms: nil, } // We designate "corrupt" if the query/template hit regions are of // different length. i.e., we don't allow gaps (yet). // BUG(burntsushi): Fragments with gaps are marked as corrupt. if hit.QueryEnd-hit.QueryStart != hit.TemplateEnd-hit.TemplateStart { return frag, nil } // We also designate "corrupt" if there are any gaps in our alpha-carbon // atom list. atoms := chain.SequenceCaAtomSlice(ts-1, te) if atoms == nil { return frag, nil } // One again, we copy to avoid pinning memory. 
frag.CaAtoms = make([]structure.Coords, len(atoms)) copy(frag.CaAtoms, atoms) return frag, nil }
// Write writes a single FASTA entry to the underlying io.Writer. // // You may need to call Flush in order for the changes to be written. // // XXX: Currently, the sequence is not checked. Should it be? func (w *Writer) Write(s seq.Sequence) error { var out string if w.Asterisk { if w.Columns > 0 && s.Len()%w.Columns == 0 { out = fmt.Sprintf("%s\n*\n", SequenceFasta(s, w.Columns)) } else { out = fmt.Sprintf("%s*\n", SequenceFasta(s, w.Columns)) } } else { out = fmt.Sprintf("%s\n", SequenceFasta(s, w.Columns)) } _, err := w.buf.WriteString(out) return err }
// SequenceBow is a helper function to compute a bag-of-words given a // sequence fragment library and a query sequence. // // If the lib given is a weighted library, then the BOW returned will also // be weighted. // // Note that this function should only be used when providing your own // implementation of the SequenceBower interface. Otherwise, BOWs should // be computed using the SequenceBow method of the interface. func SequenceBow(lib fragbag.SequenceLibrary, s seq.Sequence) Bow { var best, uplimit int b := NewBow(lib.Size()) libSize := lib.FragmentSize() uplimit = s.Len() - libSize for i := 0; i <= uplimit; i++ { best = lib.BestSequenceFragment(s.Slice(i, i+libSize)) if best < 0 { continue } b.Freqs[best] += 1 } if wlib, ok := lib.(fragbag.WeightedLibrary); ok { b = b.Weighted(wlib) } return b }
func expandCoarseSequence(db *cablastp.DB, seqId int, coarseSequence *seq.Sequence) ([]cablastp.OriginalSeq, error) { originalSeqs, err := db.CoarseDB.Expand(db.ComDB, seqId, 0, coarseSequence.Len()) if err != nil { return nil, err } // var redSeqs [originalSeqs]cablastp.ReducedSeq // for _, oSeq := range originalSeqs { // redSeq := &cablastp.ReducedSeq{ // &cablastp.Sequence{ // Name: readSeq.Seq.Name, // Residues: readSeq.Seq.Residues, // Offset: readSeq.Seq.Offset, // Id: readSeq.Seq.Id, // }, // } // } return originalSeqs, nil }
func aminoFromStructure(chain *pdb.Chain) seq.Sequence { var name string if len(chain.Entry.Cath) > 0 { name = chain.Entry.Cath } else if len(chain.Entry.Scop) > 0 { name = chain.Entry.Scop } else { name = fmt.Sprintf("%s%c", chain.Entry.IdCode, chain.Ident) } s := seq.Sequence{ Name: name, Residues: make([]seq.Residue, 0, 50), } lasti := 0 for _, r := range chain.Models[0].Residues { if lasti != r.SequenceNum { s.Residues = append(s.Residues, r.Name) lasti = r.SequenceNum } } return s }
// AlignmentProb computes the probability of the sequence `s` aligning // with the profile in `frag`. The sequence must have length equivalent // to the fragment size. func (lib *sequenceProfile) AlignmentProb(fragi int, s seq.Sequence) seq.Prob { frag := lib.Fragments[fragi] if s.Len() != frag.Len() { panic(fmt.Sprintf("Sequence length %d != fragment size %d", s.Len(), frag.Len())) } prob := seq.Prob(0.0) for c := 0; c < s.Len(); c++ { prob += frag.Emissions[c].Lookup(s.Residues[c]) } return prob }
// Best returns the number of the fragment that best corresponds // to the string of amino acids provided. // The length of `sequence` must be equivalent to the fragment size. // // If no "good" fragments can be found, then `-1` is returned. This // behavior will almost certainly change in the future. func (lib *sequenceHMM) BestSequenceFragment(s seq.Sequence) int { if s.Len() != lib.FragmentSize() { panic(fmt.Sprintf("Sequence length %d != fragment size %d", s.Len(), lib.FragmentSize())) } var testAlign seq.Prob dynamicTable := seq.AllocTable(lib.FragmentSize(), s.Len()) bestAlign, bestFragNum := seq.MinProb, -1 for _, frag := range lib.Fragments { testAlign = frag.ViterbiScoreMem(s, dynamicTable) if bestAlign.Less(testAlign) { bestAlign, bestFragNum = testAlign, frag.FragNumber } } return bestFragNum }
func main() { pdbEntry := util.PDBRead(flag.Arg(0)) fasEntries := make([]seq.Sequence, 0, 5) if !flagSeparateChains { var fasEntry seq.Sequence if len(pdbEntry.Chains) == 1 { fasEntry.Name = chainHeader(pdbEntry.OneChain()) } else { fasEntry.Name = fmt.Sprintf("%s", strings.ToLower(pdbEntry.IdCode)) } seq := make([]seq.Residue, 0, 100) for _, chain := range pdbEntry.Chains { if isChainUsable(chain) { seq = append(seq, chain.Sequence...) } } fasEntry.Residues = seq if len(fasEntry.Residues) == 0 { util.Fatalf("Could not find any amino acids.") } fasEntries = append(fasEntries, fasEntry) } else { for _, chain := range pdbEntry.Chains { if !isChainUsable(chain) { continue } fasEntry := seq.Sequence{ Name: chainHeader(chain), Residues: chain.Sequence, } fasEntries = append(fasEntries, fasEntry) } } if len(fasEntries) == 0 { util.Fatalf("Could not find any chains with amino acids.") } var fasOut io.Writer if flag.NArg() == 1 { fasOut = os.Stdout } else { if len(flagSplit) > 0 { util.Fatalf("The '--split' option is incompatible with a single " + "output file.") } fasOut = util.CreateFile(util.Arg(1)) } if len(flagSplit) == 0 { util.Assert(fasta.NewWriter(fasOut).WriteAll(fasEntries), "Could not write FASTA file '%s'", fasOut) } else { for _, entry := range fasEntries { fp := path.Join(flagSplit, fmt.Sprintf("%s.fasta", entry.Name)) out := util.CreateFile(fp) w := fasta.NewWriter(out) util.Assert(w.Write(entry), "Could not write to '%s'", fp) util.Assert(w.Flush(), "Could not write to '%s'", fp) } } }
// newFastaSeq creates a new *sequence value from seq's Sequence type, and // ensures that all residues in the sequence are upper cased. func newFastaSeq(id int, s seq.Sequence) *sequence { return newSeq(id, s.Name, s.Bytes()) }
// NewFastaCoarseSeq creates a new *CoarseSeq from seq's Sequence type by
// passing the given id, the sequence's name and its residues (as bytes) to
// NewCoarseSeq.
func NewFastaCoarseSeq(id int, s seq.Sequence) *CoarseSeq {
	name, residues := s.Name, s.Bytes()
	return NewCoarseSeq(id, name, residues)
}
func (m MapConfig) computeMap( pdbDb PDBDatabase, qseq seq.Sequence, qhhm *hmm.HHM) (*FragmentMap, error) { type maybeFrag struct { frags Fragments err error } wg := new(sync.WaitGroup) jobs := make(chan int, 10) fragsChan := make(chan maybeFrag, 10) workers := runtime.GOMAXPROCS(0) if workers < 1 { workers = 1 } for i := 0; i < workers; i++ { go func() { wg.Add(1) defer wg.Done() min, max := m.WindowMin, m.WindowMax CHANNEL: for start := range jobs { var best *Fragments for end := min; end <= max && (start+end) <= qseq.Len(); end++ { frags, err := FindFragments( pdbDb, m.Blits, qhhm, qseq, start, start+end) if err != nil { fragsChan <- maybeFrag{ err: err, } continue CHANNEL } if best == nil || frags.better(*best) { best = frags } } fragsChan <- maybeFrag{ frags: *best, } } }() } go func() { for s := 0; s <= qseq.Len()-m.WindowMin; s += m.WindowIncrement { jobs <- s } close(jobs) wg.Wait() close(fragsChan) }() fmap := &FragmentMap{ Name: qseq.Name, Segments: make([]Fragments, 0, 50), } for maybeFrag := range fragsChan { if maybeFrag.err != nil { return nil, maybeFrag.err } fmap.Segments = append(fmap.Segments, maybeFrag.frags) } sort.Sort(fmap) return fmap, nil }
// ReadSequence is exported for use in other packages that read FASTA-like // files. // // The 'translate' function is used when sequences are checked for valid // characters. // // If you're just reading FASTA files, this method SHOULD NOT be used. func (r *Reader) ReadSequence(translate Translator) (seq.Sequence, error) { s := seq.Sequence{} seenHeader := false // Before entering the main loop, we have to check to see if we've // already read this entry's header. if r.nextHeader != nil { s.Name = trimHeader(r.nextHeader) r.nextHeader = nil seenHeader = true } for { line, err := r.buf.ReadBytes('\n') if err == io.EOF { if len(line) == 0 { return s, io.EOF } } else if err != nil { return seq.Sequence{}, fmt.Errorf("Error on line %d: %s", r.line, err) } line = bytes.TrimSpace(line) // If it's empty, increment the counter and skip ahead. if len(line) == 0 { r.line++ continue } // If the line starts with PIR junk, ignore the line. if bytes.HasPrefix(line, []byte("C;")) || bytes.HasPrefix(line, []byte("structure")) || bytes.HasPrefix(line, []byte("sequence")) { r.line++ continue } // If we haven't seen the header yet, this better be it. if !seenHeader { if line[0] != '>' { return seq.Sequence{}, fmt.Errorf("Expected '>', got '%c' on line %d.", line[0], r.line) } // Trim the '>' and load this line into the header. s.Name = trimHeader(line) seenHeader = true r.line++ continue } else if line[0] == '>' { // This means we've begun reading the next entry. // So slap this line into 'nextHeader' and return the current entry. r.nextHeader = line r.line++ return s, nil } // Finally, time to start reading the sequence. // If we trust the sequences, then we can just append this line // willy nilly. Otherwise we've got to check each character. 
if s.Residues == nil { s.Residues = make([]seq.Residue, 0, 50) } if r.TrustSequences { for _, b := range line { s.Residues = append(s.Residues, seq.Residue(b)) } } else { for _, b := range line { bNew, ok := translate(b) if !ok { return seq.Sequence{}, fmt.Errorf("Invalid character '%c' on line %d.", b, r.line) } // If the zero byte is returned from translate, then we // don't keep this residue around. if bNew > 0 { s.Residues = append(s.Residues, bNew) } } } r.line++ } panic("unreachable") }