func readEmissions(alphabet []seq.Residue, flds []string) (*seq.EProbs, error) { var p seq.Prob var err error ep := seq.NewEProbs(alphabet) for i := 0; i < len(alphabet); i++ { if p, err = readProb(flds[i]); err != nil { return nil, err } ep.Set(alphabet[i], p) } return &ep, nil }
func readHMM(buf *bytes.Buffer) (hmm *seq.HMM, err error) { var nullFields []string hmm = new(seq.HMM) for { line, err := buf.ReadBytes('\n') if err == io.EOF && len(line) == 0 { break } if err != nil && err != io.EOF { panic(err) } line = trim(line) if len(line) == 0 { continue } switch { case hasPrefix(line, "NULL"): // We can't read the NULL emissions yet, because we don't have // an alphabet. (Which we'll get on the next line.) // We'll slurp this into a seq.EProbs value in a little bit, after // we get an alphabet. nullFields = strings.Fields(str(line[4:])) case hasPrefix(line, "HMM"): // We slurp up three lines here. The first is the alphabet // (the current line). The second is the ordering of transition // probabilities. And the third are transition probabilities for // the begin state. We ignore the second two. if _, err := demandLine(buf); err != nil { return nil, fmt.Errorf("%s (expected transition ordering)", err) } if _, err := demandLine(buf); err != nil { return nil, fmt.Errorf("%s (expected start transitions)", err) } // Get the ordering of the alphabet. hmm.Alphabet = make([]seq.Residue, 0, 20) residues := bytes.Split(trim(line[3:]), []byte{'\t'}) for _, residue := range residues { hmm.Alphabet = append(hmm.Alphabet, seq.Residue(residue[0])) } // Remember those null probabilities? Well, we have an alphabet now. ep, err := readEmissions(hmm.Alphabet, nullFields) if err != nil { return nil, fmt.Errorf("Could not read NULL emissions '%s': %s", strings.Join(nullFields, " "), err) } hmm.Null = *ep default: // finally, reading a node in the HMM // Each node in the HMM is made up of two lines. // The first line starts with the amino acid in the reference // sequence, followed by the node number, followed by N match // emission probabilities (where N = len(alphabet)), and finally // followed by the node number again. (What the *f**k* is up with // that? Seriously.) // // The second line is made up of 7 transition probabilities, // followed by 3 diversity (the 'neff' stuff) scores. // // Also, each field is separated by spaces OR tabs. Lovely, eh? line2, err := demandLine(buf) if err != nil { return nil, fmt.Errorf("%s (expected transition probs)", err) } fields1 := strings.Fields(string(line)) fields2 := strings.Fields(string(line2)) node := seq.HMMNode{ Residue: seq.Residue(fields1[0][0]), } node.NodeNum, err = strconv.Atoi(fields1[1]) if err != nil { return nil, fmt.Errorf("Could not parse node number '%s': %s", fields1[1], err) } ep, err := readEmissions(hmm.Alphabet, fields1[2:]) if err != nil { return nil, fmt.Errorf("Could not read emissions '%s': %s", strings.Join(fields1[2:], " "), err) } node.MatEmit = *ep node.InsEmit = seq.NewEProbs(hmm.Alphabet) for _, residue := range hmm.Alphabet { node.InsEmit.Set(residue, hmm.Null.Lookup(residue)) } node.Transitions, err = readTransitions(fields2) if err != nil { return nil, fmt.Errorf("Could not read transitions '%s': %s", strings.Join(fields2, " "), err) } node.NeffM, node.NeffI, node.NeffD, err = readDiversity(fields2[7:]) if err != nil { return nil, fmt.Errorf("Could not read diversity '%s': %s", strings.Join(fields2[7:], " "), err) } hmm.Nodes = append(hmm.Nodes, node) } } return hmm, nil }