Пример #1
0
// readEmissions parses one emission probability per residue of alphabet from
// the leading entries of flds and stores each one under its residue in a
// freshly created seq.EProbs.
//
// flds[i] is paired with alphabet[i]; entries of flds beyond len(alphabet)
// are ignored. If flds holds fewer entries than alphabet, an error is
// returned rather than panicking on an out-of-range index. Any error from
// readProb is returned unchanged.
func readEmissions(alphabet []seq.Residue, flds []string) (*seq.EProbs, error) {
	// Validate up front: the loop below indexes flds by alphabet position.
	if len(flds) < len(alphabet) {
		return nil, fmt.Errorf(
			"expected %d emission probabilities, but got %d",
			len(alphabet), len(flds))
	}

	ep := seq.NewEProbs(alphabet)
	for i, residue := range alphabet {
		p, err := readProb(flds[i])
		if err != nil {
			return nil, err
		}
		ep.Set(residue, p)
	}
	return &ep, nil
}
Пример #2
0
// readHMM parses the textual HMM description held in buf, one line at a
// time, and returns the resulting model.
//
// Blank lines are skipped. Every other line falls into one of three cases:
//
//   - A line starting with "NULL": its fields are remembered and turned into
//     the null emissions once the alphabet is known.
//   - A line starting with "HMM": it lists the alphabet. The two lines that
//     follow it are consumed and ignored.
//   - Anything else: the first of the two lines that describe one node.
//
// Parsing stops when buf is exhausted. Malformed input (an empty alphabet
// entry, too few NULL or node fields, an unparsable number, a missing
// follow-up line) is reported through the returned error; the returned HMM
// is nil in that case.
func readHMM(buf *bytes.Buffer) (hmm *seq.HMM, err error) {
	var nullFields []string
	hmm = new(seq.HMM)
	for {
		line, err := buf.ReadBytes('\n')
		if err == io.EOF && len(line) == 0 {
			break
		}
		if err != nil && err != io.EOF {
			// Report read failures to the caller instead of panicking.
			return nil, fmt.Errorf("Could not read line from HMM: %s", err)
		}
		line = trim(line)

		if len(line) == 0 {
			continue
		}
		switch {
		case hasPrefix(line, "NULL"):
			// We can't read the NULL emissions yet, because we don't have
			// an alphabet. (Which we'll get on the next line.)
			// We'll slurp this into a seq.EProbs value in a little bit, after
			// we get an alphabet.
			nullFields = strings.Fields(str(line[4:]))
		case hasPrefix(line, "HMM"):
			// We slurp up three lines here. The first is the alphabet
			// (the current line). The second is the ordering of transition
			// probabilities. And the third are transition probabilities for
			// the begin state. We ignore the second two.
			if _, err := demandLine(buf); err != nil {
				return nil, fmt.Errorf("%s (expected transition ordering)", err)
			}
			if _, err := demandLine(buf); err != nil {
				return nil, fmt.Errorf("%s (expected start transitions)", err)
			}

			// Get the ordering of the alphabet. Residues are tab separated
			// and only the first byte of each entry is used.
			rawAlphabet := trim(line[3:])
			residues := bytes.Split(rawAlphabet, []byte{'\t'})
			hmm.Alphabet = make([]seq.Residue, 0, len(residues))
			for _, residue := range residues {
				// An empty entry (empty alphabet line or doubled tab) has no
				// first byte to take; fail cleanly instead of panicking.
				if len(residue) == 0 {
					return nil, fmt.Errorf(
						"Could not read alphabet '%s': empty residue",
						rawAlphabet)
				}
				hmm.Alphabet = append(hmm.Alphabet, seq.Residue(residue[0]))
			}

			// Remember those null probabilities? Well, we have an alphabet now.
			// Guard against a missing or short NULL line before reading one
			// field per residue.
			if len(nullFields) < len(hmm.Alphabet) {
				return nil, fmt.Errorf("Could not read NULL emissions '%s': "+
					"expected %d probabilities, but got %d",
					strings.Join(nullFields, " "),
					len(hmm.Alphabet), len(nullFields))
			}
			ep, err := readEmissions(hmm.Alphabet, nullFields)
			if err != nil {
				return nil, fmt.Errorf("Could not read NULL emissions '%s': %s",
					strings.Join(nullFields, " "), err)
			}
			hmm.Null = *ep
		default: // finally, reading a node in the HMM
			// Each node in the HMM is made up of two lines.
			// The first line starts with the amino acid in the reference
			// sequence, followed by the node number, followed by N match
			// emission probabilities (where N = len(alphabet)), and finally
			// followed by the node number again (the trailing copy is not
			// used here).
			//
			// The second line is made up of 7 transition probabilities,
			// followed by 3 diversity (the 'neff' stuff) scores.
			//
			// Also, each field is separated by spaces OR tabs.
			line2, err := demandLine(buf)
			if err != nil {
				return nil, fmt.Errorf("%s (expected transition probs)", err)
			}
			fields1 := strings.Fields(string(line))
			fields2 := strings.Fields(string(line2))

			// The residue and node number are read by position below, so
			// both fields must be present.
			if len(fields1) < 2 {
				return nil, fmt.Errorf("Could not read node '%s': "+
					"expected a residue and a node number", string(line))
			}
			node := seq.HMMNode{
				Residue: seq.Residue(fields1[0][0]),
			}

			node.NodeNum, err = strconv.Atoi(fields1[1])
			if err != nil {
				return nil, fmt.Errorf("Could not parse node number '%s': %s",
					fields1[1], err)
			}

			// One match emission is required per residue of the alphabet.
			if len(fields1[2:]) < len(hmm.Alphabet) {
				return nil, fmt.Errorf("Could not read emissions '%s': "+
					"expected %d probabilities, but got %d",
					strings.Join(fields1[2:], " "),
					len(hmm.Alphabet), len(fields1[2:]))
			}
			ep, err := readEmissions(hmm.Alphabet, fields1[2:])
			if err != nil {
				return nil, fmt.Errorf("Could not read emissions '%s': %s",
					strings.Join(fields1[2:], " "), err)
			}
			node.MatEmit = *ep

			// Insert emissions are copied from the null model, residue by
			// residue.
			node.InsEmit = seq.NewEProbs(hmm.Alphabet)
			for _, residue := range hmm.Alphabet {
				node.InsEmit.Set(residue, hmm.Null.Lookup(residue))
			}

			// The diversity scores start at index 7, so at least the 7
			// transition fields must exist before fields2 is sliced.
			if len(fields2) < 7 {
				return nil, fmt.Errorf("Could not read transitions '%s': "+
					"expected at least 7 fields, but got %d",
					strings.Join(fields2, " "), len(fields2))
			}
			node.Transitions, err = readTransitions(fields2)
			if err != nil {
				return nil, fmt.Errorf("Could not read transitions '%s': %s",
					strings.Join(fields2, " "), err)
			}

			node.NeffM, node.NeffI, node.NeffD, err = readDiversity(fields2[7:])
			if err != nil {
				return nil, fmt.Errorf("Could not read diversity '%s': %s",
					strings.Join(fields2[7:], " "), err)
			}

			hmm.Nodes = append(hmm.Nodes, node)
		}
	}
	return hmm, nil
}