Пример #1
0
// read consumes every sequence available from r and collects them into an
// MSA.
//
// When fasta is true each sequence is added with AddFasta; otherwise it is
// added with Add. Reading stops cleanly at io.EOF. Any other error produced by
// readSequence is returned unchanged together with an empty MSA.
//
// Every entry after the first must have the same length as the first entry;
// the first mismatch aborts the read with an error naming the offending
// sequence and both lengths.
func read(r *fasta.Reader, fasta bool) (seq.MSA, error) {
	msa := seq.NewMSA()
	for {
		s, err := readSequence(r)
		if err == io.EOF {
			break
		}
		if err != nil {
			return seq.MSA{}, err
		}

		if fasta {
			msa.AddFasta(s)
		} else {
			msa.Add(s)
		}
		if len(msa.Entries) > 1 {
			// We can't use 's' directly, because a sequence added to an MSA
			// may be modified if it isn't already in A2M format.
			lastEntry := msa.Entries[len(msa.Entries)-1]
			if lastEntry.Len() != msa.Entries[0].Len() {
				// Report the entry's length, not the entry itself: the %d
				// verb needs an integer.
				return seq.MSA{},
					fmt.Errorf("Sequence '%s' has length %d, but other "+
						"sequences have length %d.",
						s.Name, lastEntry.Len(), msa.Entries[0].Len())
			}
		}
	}
	return msa, nil
}
Пример #2
0
// readStockholm parses a Stockholm-formatted alignment from r into an MSA.
//
// The first line, once stripped of surrounding spaces and '#' characters and
// lower-cased, must equal "stockholm 1.0"; otherwise an error is returned.
// After the header, each line is handled as follows:
//
//   - blank lines and lines starting with '#' are skipped;
//   - a line starting with "//" ends the alignment;
//   - any other line is split on whitespace: the last field is converted to
//     residues with asResidues (trusted is forwarded to it) and the remaining
//     fields are combined with concat to form the sequence name.
//
// Every entry after the first must have the same length as the first entry.
// Errors from asResidues and from the underlying scanner are returned
// unchanged, always together with an empty MSA.
func readStockholm(r io.Reader, trusted bool) (seq.MSA, error) {
	msa := seq.NewMSA()

	scanner := bufio.NewScanner(r)
	if scanner.Scan() {
		first := bytes.ToLower(bytes.Trim(scanner.Bytes(), " #"))
		if !bytes.Equal([]byte("stockholm 1.0"), first) {
			return seq.MSA{},
				fmt.Errorf("First line does not contain 'STOCKHOLM 1.0'.")
		}
	}
	for scanner.Scan() {
		line := bytes.TrimSpace(scanner.Bytes())
		// Guard against blank lines before looking at the first byte;
		// indexing an empty slice would panic.
		if len(line) == 0 || line[0] == '#' {
			continue
		}
		// HasPrefix is safe on a one-byte line such as "/".
		if bytes.HasPrefix(line, []byte("//")) { // alignment done, says the spec
			break
		}

		// line is non-empty after trimming, so pieces has at least one field.
		pieces := bytes.Fields(line)
		residues, err := asResidues(pieces[len(pieces)-1], trusted)
		if err != nil {
			return seq.MSA{}, err
		}

		s := seq.Sequence{
			Name:     string(concat(pieces[0 : len(pieces)-1])),
			Residues: residues,
		}
		msa.Add(s)
		if len(msa.Entries) > 1 {
			// We can't use 's' directly, because a sequence added to an MSA
			// may be modified if it isn't already in A2M format.
			lastEntry := msa.Entries[len(msa.Entries)-1]
			if lastEntry.Len() != msa.Entries[0].Len() {
				// Report the entry's length, not the entry itself: the %d
				// verb needs an integer.
				return seq.MSA{},
					fmt.Errorf("Sequence '%s' has length %d, but other "+
						"sequences have length %d.",
						s.Name, lastEntry.Len(), msa.Entries[0].Len())
			}
		}
	}
	if err := scanner.Err(); err != nil {
		return seq.MSA{}, err
	}
	return msa, nil
}
Пример #3
0
// readSeqs reads every FASTA sequence in buf and splits them into the
// secondary-structure records and the alignment of an HHM file.
//
// The sequence section of an HHM file has two parts. The optional leading
// part holds secondary structure information, recognised by a sequence name
// that starts with one of "ss_dssp", "sa_dssp", "ss_pred", "ss_conf" or
// "Consensus". The first sequence whose name matches none of these marks the
// start of the MSA; it and every sequence after it, whatever its name, is
// added to the MSA.
//
// If the FASTA reader fails, its error is returned with zero-valued results.
func readSeqs(buf *bytes.Buffer) (HHMSecondary, seq.MSA, error) {
	reader := fasta.NewReader(buf)
	reader.TrustSequences = true
	seqs, err := reader.ReadAll()
	if err != nil {
		return HHMSecondary{}, seq.MSA{}, err
	}

	secondary := HHMSecondary{}
	alignment := seq.NewMSA()

	// inMSA flips to true at the first sequence without a special header and
	// never flips back.
	inMSA := false
	for _, entry := range seqs {
		// Take a per-iteration copy so each stored pointer refers to its own
		// sequence value.
		entry := entry

		if inMSA {
			alignment.Add(entry)
			continue
		}

		name := entry.Name
		switch {
		case strings.HasPrefix(name, "ss_dssp"):
			secondary.SSdssp = &entry
		case strings.HasPrefix(name, "sa_dssp"):
			secondary.SAdssp = &entry
		case strings.HasPrefix(name, "ss_pred"):
			secondary.SSpred = &entry
		case strings.HasPrefix(name, "ss_conf"):
			secondary.SSconf = &entry
		case strings.HasPrefix(name, "Consensus"):
			secondary.Consensus = &entry
		default:
			// No special header: this sequence is the first MSA entry.
			inMSA = true
			alignment.Add(entry)
		}
	}
	return secondary, alignment, nil
}
Пример #4
0
// mkSeqHMM builds a sequence HMM fragment library and saves it to disk.
//
// Positional arguments read from c.flags:
//
//	0   structure fragment library, loaded with util.StructureLibrary
//	1   output path for the new library
//	2+  PDB entries to read
//
// One MSA is kept per fragment of the structure library. The PDB entries are
// opened by flagCpu worker goroutines, and structureToSequence is called for
// every chain with the per-fragment channels. Once all inputs are consumed,
// each MSA is written as FASTA into a temporary directory, run through
// hhsuite.HHMakePseudo, and the resulting HMMs are combined with
// fragbag.NewSequenceHMM and written with fragbag.Save.
//
// Failures are reported through util.Assert rather than returned.
// wgPDBChains and wgSeqFragments are wait groups declared outside this
// function.
func mkSeqHMM(c *command) {
	// presumably enforces a minimum of three positional arguments — confirm
	// against assertLeastNArg.
	c.assertLeastNArg(3)

	structLib := util.StructureLibrary(c.flags.Arg(0))
	outPath := c.flags.Arg(1)
	entries := c.flags.Args()[2:]

	util.AssertOverwritable(outPath, flagOverwrite)
	// NOTE(review): saveto is never closed in this function — confirm whether
	// fragbag.Save or process exit is relied upon to release it.
	saveto := util.CreateFile(outPath)

	// Stores intermediate files produced by hhmake.
	tempDir, err := ioutil.TempDir("", "mk-seqlib-hmm")
	util.Assert(err, "Could not create temporary directory.")
	defer os.RemoveAll(tempDir)

	// Initialize a MSA for each structural fragment, along with the channel
	// that feeds it.
	var msas []seq.MSA
	var msaChans []chan seq.Sequence
	for i := 0; i < structLib.Size(); i++ {
		msa := seq.NewMSA()
		msa.SetLen(structLib.FragmentSize())
		msas = append(msas, msa)
		msaChans = append(msaChans, make(chan seq.Sequence))
	}

	// Now spin up a goroutine for each fragment that is responsible for
	// adding a sequence slice to itself.
	// NOTE(review): addToMSA is called without `go` here; presumably it starts
	// the goroutine itself and registers it with wgSeqFragments — confirm.
	for i := 0; i < structLib.Size(); i++ {
		addToMSA(msaChans[i], &msas[i])
	}

	// Create a channel that sends the PDB entries given. The sender closes
	// the channel so the workers' range loops below terminate.
	entryChan := make(chan string)
	go func() {
		for _, fp := range entries {
			entryChan <- fp
		}
		close(entryChan)
	}()

	// Start flagCpu workers that drain entryChan.
	progress := util.NewProgress(len(entries))
	for i := 0; i < flagCpu; i++ {
		wgPDBChains.Add(1)
		go func() {
			for entryPath := range entryChan {
				_, chains, err := util.PDBOpen(entryPath)
				// Progress is reported for every entry, including failures.
				progress.JobDone(err)
				if err != nil {
					// Entries that fail to open are skipped, not fatal.
					continue
				}

				for _, chain := range chains {
					structureToSequence(structLib, chain, nil, msaChans)
				}
			}
			wgPDBChains.Done()
		}()
	}
	wgPDBChains.Wait()
	progress.Close()

	// We've finished reading all the PDB inputs. Now close the channels
	// and let the sequence fragments finish.
	for i := 0; i < structLib.Size(); i++ {
		close(msaChans[i])
	}
	wgSeqFragments.Wait()

	util.Verbosef("Building profile HMMs from MSAs...")

	// Finally, add the sequence fragments to a new sequence fragment
	// library and save.
	hmms := make([]*seq.HMM, structLib.Size())
	// hhmake writes the i-th MSA to "<tempDir>/<i>.fasta", runs
	// hhsuite.HHMakePseudo on that file, and stores the resulting HMM at
	// hmms[i]. Each call touches only its own index of hmms.
	hhmake := func(i int) struct{} {
		// NOTE(review): path.Join is used for a filesystem path here;
		// path/filepath is the usual choice for OS paths — confirm intent.
		fname := path.Join(tempDir, fmt.Sprintf("%d.fasta", i))
		// NOTE(review): f is never closed in this closure, so one handle per
		// fragment stays open — confirm whether that is intended.
		f := util.CreateFile(fname)
		util.Assert(msa.WriteFasta(f, msas[i]))

		hhm, err := hhsuite.HHMakePseudo.Run(fname)
		util.Assert(err)
		hmms[i] = hhm.HMM
		// The empty struct is a placeholder result; presumably fun.ParMap
		// needs the mapped function to return a value — confirm.
		return struct{}{} // my unifier sucks, i guess
	}
	fun.ParMap(hhmake, fun.Range(0, structLib.Size()))

	lib, err := fragbag.NewSequenceHMM(structLib.Name(), hmms)
	util.Assert(err)
	util.Assert(fragbag.Save(saveto, lib))
}
Пример #5
0
// makeMSA returns a new MSA populated from seqs via a single AddSlice call.
func makeMSA(seqs []seq.Sequence) seq.MSA {
	alignment := seq.NewMSA()
	alignment.AddSlice(seqs)
	return alignment
}