Example #1
func main() {
	if len(util.FlagCpuProf) > 0 {
		f := util.CreateFile(util.FlagCpuProf)
		util.Assert(pprof.StartCPUProfile(f))
		defer f.Close()
		defer pprof.StopCPUProfile()
	}
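	// If a gob output path was given, read the alignment distances and
	// encode them to that file, then exit.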
	if len(flagGobIt) > 0 {
		astralDir := util.Arg(0)
		dists := readAlignmentDists(astralDir)
		enc := gob.NewEncoder(util.CreateFile(flagGobIt))
		util.Assert(enc.Encode(dists), "Could not GOB encode distances")
		return
	}

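	// Load the distance table either from a directory of alignments or from
	// a previously gob-encoded file.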
	var dists *intern.Table
	if util.IsDir(util.Arg(0)) {
		dists = readAlignmentDists(util.Arg(0))
	} else {
		dec := gob.NewDecoder(util.OpenFile(util.Arg(0)))
		util.Assert(dec.Decode(&dists), "Could not GOB decode distances")
	}

	treeFile := util.Arg(1)
	outPath := util.Arg(2)

	treeReader := newick.NewReader(util.OpenFile(treeFile))
	tree, err := treeReader.ReadTree()
	util.Assert(err, "Could not read newick tree")

	csvw := csv.NewWriter(util.CreateFile(outPath))
	clusters := treeClusters(flagThreshold, dists, tree)
	util.Assert(csvw.WriteAll(clusters))
}
Example #2
func main() {
	var f io.Reader
	var err error

	f = util.OpenFile(flag.Arg(0))
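	// Transparently decompress gzipped input.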
	if strings.HasSuffix(flag.Arg(0), ".gz") {
		f, err = gzip.NewReader(f)
		util.Assert(err)
	}
	cifEntry, err := pdbx.Read(f)
	util.Assert(err, "Could not read PDBx/mmCIF file")

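	// Collect one FASTA entry per usable chain that has a sequence.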
	fasEntries := make([]seq.Sequence, 0, 5)
	for _, ent := range cifEntry.Entities {
		for _, chain := range ent.Chains {
			if !isChainUsable(chain) || len(ent.Seq) == 0 {
				continue
			}

			fasEntry := seq.Sequence{
				Name:     chainHeader(chain),
				Residues: ent.Seq,
			}
			fasEntries = append(fasEntries, fasEntry)
		}
	}
	if len(fasEntries) == 0 {
		util.Fatalf("Could not find any chains with amino acids.")
	}

	var fasOut io.Writer
	if flag.NArg() == 1 {
		fasOut = os.Stdout
	} else {
		if len(flagSplit) > 0 {
			util.Fatalf("The '--split' option is incompatible with a single " +
				"output file.")
		}
		fasOut = util.CreateFile(util.Arg(1))
	}

	if len(flagSplit) == 0 {
		util.Assert(fasta.NewWriter(fasOut).WriteAll(fasEntries),
			"Could not write FASTA file '%s'", fasOut)
	} else {
		for _, entry := range fasEntries {
			fp := path.Join(flagSplit, fmt.Sprintf("%s.fasta", entry.Name))
			out := util.CreateFile(fp)

			w := fasta.NewWriter(out)
			util.Assert(w.Write(entry), "Could not write to '%s'", fp)
			util.Assert(w.Flush(), "Could not write to '%s'", fp)
		}
	}
}
Example #3
func main() {
	hhmFile := util.Arg(0)
	start := util.ParseInt(util.Arg(1))
	end := util.ParseInt(util.Arg(2))

	fhhm := util.OpenFile(hhmFile)

	qhhm, err := hmm.ReadHHM(fhhm)
	util.Assert(err)

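	// Write only the requested residue range of the HMM to stdout.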
	util.Assert(hmm.WriteHHM(os.Stdout, qhhm.Slice(start, end)))
}
Example #4
func mkPaired(c *command) {
	c.assertNArg(2)

	in := util.Library(c.flags.Arg(0))
	outPath := c.flags.Arg(1)
	util.AssertOverwritable(outPath, flagOverwrite)

	if _, ok := in.(fragbag.WeightedLibrary); ok {
		util.Fatalf("%s is a weighted library (not allowed)", in.Name())
	}

	name := fmt.Sprintf("paired-%s", in.Name())
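	// For structure libraries, pair fragments by concatenating the atom
	// coordinates of every ordered pair of distinct fragments.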
	if fragbag.IsStructure(in) {
		var pairs [][]structure.Coords
		lib := in.(fragbag.StructureLibrary)
		nfrags := lib.Size()
		for i := 0; i < nfrags; i++ {
			for j := 0; j < nfrags; j++ {
				if i == j {
					continue
				}
				f1, f2 := lib.Atoms(i), lib.Atoms(j)
				pairs = append(pairs, append(f1, f2...))
			}
		}
		pairLib, err := fragbag.NewStructureAtoms(name, pairs)
		util.Assert(err)
		fragbag.Save(util.CreateFile(outPath), pairLib)
	} else if strings.Contains(in.Tag(), "hmm") {
		var pairs []*seq.HMM
		lib := in.(fragbag.SequenceLibrary)
		nfrags := lib.Size()
		for i := 0; i < nfrags; i++ {
			for j := 0; j < nfrags; j++ {
				if i == j {
					continue
				}
				f1, f2 := lib.Fragment(i).(*seq.HMM), lib.Fragment(j).(*seq.HMM)
				pairs = append(pairs, seq.HMMCat(f1, f2))
			}
		}
		pairLib, err := fragbag.NewSequenceHMM(name, pairs)
		util.Assert(err)
		fragbag.Save(util.CreateFile(outPath), pairLib)
	} else if strings.Contains(in.Tag(), "profile") {
		util.Fatalf("Sequence profiles not implemented.")
	} else {
		util.Fatalf("Unrecognized fragment library: %s", in.Tag())
	}
}
Example #5
func main() {
	inFasta := util.Arg(0)
	outHHM := util.Arg(1)

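	// Configure hhblits and hhmake (verbosity follows flagQuiet); BuildHHM
	// uses them to construct an HHM from the input FASTA.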
	hhblits := hhsuite.HHBlitsDefault
	hhmake := hhsuite.HHMakePseudo
	hhblits.Verbose = !flagQuiet
	hhmake.Verbose = !flagQuiet

	HHM, err := hhsuite.BuildHHM(
		hhblits, hhmake, util.FlagSeqDB, inFasta)
	util.Assert(err, "Error building HHM")

	util.Assert(hmm.WriteHHM(util.CreateFile(outHHM), HHM),
		"Error writing HHM '%s'", outHHM)
}
Example #6
func main() {
	outDir := util.Arg(0)
	fasInps := util.Args()[1:]

	util.Assert(os.MkdirAll(outDir, 0777))

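	// Spin up one worker per available CPU; each worker computes a fragment
	// map for every FASTA path it receives and writes it to outDir.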
	fastaChan := make(chan string)
	wg := new(sync.WaitGroup)
	for i := 0; i < max(1, runtime.GOMAXPROCS(0)); i++ {
		wg.Add(1)
		go func() {
			for fasta := range fastaChan {
				util.Verbosef("Computing map for '%s'...", fasta)
				fmap := util.GetFmap(fasta)
				outF := path.Join(outDir, fmt.Sprintf("%s.fmap", fmap.Name))
				util.FmapWrite(util.CreateFile(outF), fmap)
			}
			wg.Done()
		}()
	}

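	// Feed the FASTA paths to the workers, then wait for them to finish.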
	for _, fasta := range fasInps {
		fastaChan <- fasta
	}

	close(fastaChan)
	wg.Wait()
}
Example #7
func search(c *command) {
	c.assertLeastNArg(2)

	// Some search options don't translate directly to command line parameters
	// specified by the flag package.
	if flagSearchDesc {
		flagSearchOpts.Order = bowdb.OrderDesc
	}
	switch flagSearchSort {
	case "cosine":
		flagSearchOpts.SortBy = bowdb.SortByCosine
	case "euclid":
		flagSearchOpts.SortBy = bowdb.SortByEuclid
	default:
		util.Fatalf("Unknown sort field '%s'.", flagSearchSort)
	}

	db := util.OpenBowDB(c.flags.Arg(0))
	bowPaths := c.flags.Args()[1:]

	_, err := db.ReadAll()
	util.Assert(err, "Could not read BOW database entries")

	// always hide the progress bar here.
	bows := util.ProcessBowers(bowPaths, db.Lib, false, flagCpu, true)
	out, outDone := outputter()

	// launch goroutines to search queries in parallel
	wgSearch := new(sync.WaitGroup)
	for i := 0; i < flagCpu; i++ {
		wgSearch.Add(1)
		go func() {
			defer wgSearch.Done()

			for b := range bows {
				sr := db.Search(flagSearchOpts, b)
				out <- searchResult{b, sr}
			}
		}()
	}

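	// Wait for all searchers to finish, then close the output channel and
	// wait for the outputter to drain.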
	wgSearch.Wait()
	close(out)
	<-outDone
	util.Assert(db.Close())
}
Example #8
func mkBowDb(c *command) {
	c.assertLeastNArg(3)

	dbPath := c.flags.Arg(0)
	flib := util.Library(c.flags.Arg(1))
	bowPaths := c.flags.Args()[2:]

	util.AssertOverwritable(dbPath, flagOverwrite)

	db, err := bowdb.Create(flib, dbPath)
	util.Assert(err)

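	// Compute a BOW for every input against the fragment library and add it
	// to the database.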
	bows := util.ProcessBowers(bowPaths, flib, false, flagCpu, util.FlagQuiet)
	for b := range bows {
		db.Add(b)
	}
	util.Assert(db.Close())
}
Example #9
func main() {
	a3mPath := util.Arg(0)
	fa3m := util.OpenFile(a3mPath)

	freader := fasta.NewReader(fa3m)
	freader.TrustSequences = true
	seqs, err := freader.ReadAll()
	util.Assert(err, "Could not read fasta format '%s'", a3mPath)
	util.Assert(fa3m.Close())

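	// Rewrite the file in place, dropping any sequences without residues.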
	w := util.CreateFile(a3mPath)
	fwriter := fasta.NewWriter(w)
	fwriter.Columns = 0
	for _, s := range seqs {
		if len(s.Residues) > 0 {
			util.Assert(fwriter.Write(s))
		}
	}
	util.Assert(fwriter.Flush())
	util.Assert(w.Close())
}
Example #10
func main() {
	rfasta := util.OpenFasta(util.Arg(0))
	dir := util.Arg(1)
	util.Assert(os.MkdirAll(dir, 0777))

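	// Write each sequence to its own FASTA file, named after the first
	// whitespace-delimited field of its header.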
	fr := fasta.NewReader(rfasta)
	for {
		s, err := fr.Read()
		if err != nil {
			if err == io.EOF {
				break
			}
			util.Assert(err)
		}

		s.Name = strings.Fields(s.Name)[0]
		fw := util.CreateFile(path.Join(dir, s.Name+".fasta"))
		w := fasta.NewWriter(fw)
		util.Assert(w.Write(s))
		util.Assert(w.Flush())
		util.Assert(fw.Close())
	}
}
Example #11
func main() {
	pdbs := util.OpenFile(flag.Arg(0))
	defer pdbs.Close()

	entries, err := slct.NewReader(pdbs).ReadAll()
	util.Assert(err)

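	// Print either the resolved PDB file path or the bare chain identifier
	// for each entry.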
	for _, entry := range entries {
		if flagPaths {
			fmt.Println(util.PDBPath(entry.ChainID))
		} else {
			fmt.Println(entry.ChainID)
		}
	}
}
Example #12
func main() {
	pdbf1, chain1, s1, e1 := util.Arg(0), util.Arg(1), util.Arg(2), util.Arg(3)
	pdbf2, chain2, s2, e2 := util.Arg(4), util.Arg(5), util.Arg(6), util.Arg(7)

	entry1 := util.PDBRead(pdbf1)
	entry2 := util.PDBRead(pdbf2)

	s1n, e1n := util.ParseInt(s1), util.ParseInt(e1)
	s2n, e2n := util.ParseInt(s2), util.ParseInt(e2)

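	// Compute the RMSD between the two residue ranges; chain1[0] and
	// chain2[0] pass the chain identifiers as single bytes.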
	r, err := pdb.RMSD(
		entry1, chain1[0], s1n, e1n, entry2, chain2[0], s2n, e2n)
	util.Assert(err)
	fmt.Println(r)
}
Example #13
func mkWeighted(c *command) {
	c.assertLeastNArg(4)

	train := util.Library(c.flags.Arg(0))
	in := util.Library(c.flags.Arg(1))
	outPath := c.flags.Arg(2)
	bowPaths := c.flags.Args()[3:]

	util.AssertOverwritable(outPath, flagOverwrite)

	// The inverse-document-frequencies of each fragment in the "in" fragment
	// library.
	numFrags := in.Size()
	idfs := make([]float32, numFrags)
	for i := range idfs {
		idfs[i] = 1 // pseudocount
	}

	// Compute the BOWs for each bower against the training fragment lib.
	bows := util.ProcessBowers(bowPaths, train, false, flagCpu, util.FlagQuiet)

	// Now tally the number of bowers that each fragment occurred in.
	totalBows := float32(1) // for pseudocount correction
	for bow := range bows {
		totalBows += 1
		for fragi := 0; fragi < numFrags; fragi++ {
			if bow.Bow.Freqs[fragi] > 0 {
				idfs[fragi]++
			}
		}
	}

	// Compute the IDF using the frequencies against all the BOWs.
	for i := range idfs {
		idfs[i] = float32(math.Log(float64(totalBows / idfs[i])))
	}

	// Finally, wrap the given library as a weighted library and save it.
	wlib, err := fragbag.NewWeightedTfIdf(in, idfs)
	util.Assert(err)
	fragbag.Save(util.CreateFile(outPath), wlib)
}
Example #14
func mkSeqProfile(c *command) {
	c.assertLeastNArg(3)

	structLib := util.StructureLibrary(c.flags.Arg(0))
	outPath := c.flags.Arg(1)
	entries := c.flags.Args()[2:]

	util.AssertOverwritable(outPath, flagOverwrite)
	saveto := util.CreateFile(outPath)

	// Initialize a frequency and null profile for each structural fragment.
	var freqProfiles []*seq.FrequencyProfile
	var fpChans []chan seq.Sequence
	for i := 0; i < structLib.Size(); i++ {
		fp := seq.NewFrequencyProfile(structLib.FragmentSize())
		freqProfiles = append(freqProfiles, fp)
		fpChans = append(fpChans, make(chan seq.Sequence))
	}

	// Now spin up a goroutine for each fragment that is responsible for
	// adding sequence slices to that fragment's profile.
	nullChan, nullProfile := addToNull()
	for i := 0; i < structLib.Size(); i++ {
		addToProfile(fpChans[i], freqProfiles[i])
	}

	// Create a channel that sends the PDB entries given.
	entryChan := make(chan string)
	go func() {
		for _, fp := range entries {
			entryChan <- fp
		}
		close(entryChan)
	}()

	progress := util.NewProgress(len(entries))
	for i := 0; i < flagCpu; i++ {
		wgPDBChains.Add(1)
		go func() {
			for entryPath := range entryChan {
				_, chains, err := util.PDBOpen(entryPath)
				progress.JobDone(err)
				if err != nil {
					continue
				}

				for _, chain := range chains {
					structureToSequence(structLib, chain, nullChan, fpChans)
				}
			}
			wgPDBChains.Done()
		}()
	}
	wgPDBChains.Wait()
	progress.Close()

	// We've finished reading all the PDB inputs. Now close the channels
	// and let the sequence fragments finish.
	close(nullChan)
	for i := 0; i < structLib.Size(); i++ {
		close(fpChans[i])
	}
	wgSeqFragments.Wait()

	// Finally, add the sequence fragments to a new sequence fragment
	// library and save.
	profs := make([]*seq.Profile, structLib.Size())
	for i := 0; i < structLib.Size(); i++ {
		profs[i] = freqProfiles[i].Profile(nullProfile)
	}
	lib, err := fragbag.NewSequenceProfile(structLib.Name(), profs)
	util.Assert(err)
	util.Assert(fragbag.Save(saveto, lib))
}
Example #15
func main() {
	rfasta := util.OpenFasta(util.Arg(0))
	count, err := fasta.QuickSequenceCount(rfasta)
	util.Assert(err)
	fmt.Println(count)
}
Example #16
func readFloat(s string) float64 {
	num, err := strconv.ParseFloat(s, 64)
	util.Assert(err, "Expected float, but got '%s'.", s)
	return num
}
Example #17
func main() {
	pdbEntry := util.PDBRead(flag.Arg(0))

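	// Unless flagSeparateChains is set, all usable chains are concatenated
	// into a single FASTA entry; otherwise each usable chain becomes its own
	// entry.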
	fasEntries := make([]seq.Sequence, 0, 5)
	if !flagSeparateChains {
		var fasEntry seq.Sequence
		if len(pdbEntry.Chains) == 1 {
			fasEntry.Name = chainHeader(pdbEntry.OneChain())
		} else {
			fasEntry.Name = strings.ToLower(pdbEntry.IdCode)
		}

		residues := make([]seq.Residue, 0, 100)
		for _, chain := range pdbEntry.Chains {
			if isChainUsable(chain) {
				residues = append(residues, chain.Sequence...)
			}
		}
		fasEntry.Residues = residues

		if len(fasEntry.Residues) == 0 {
			util.Fatalf("Could not find any amino acids.")
		}
		fasEntries = append(fasEntries, fasEntry)
	} else {
		for _, chain := range pdbEntry.Chains {
			if !isChainUsable(chain) {
				continue
			}

			fasEntry := seq.Sequence{
				Name:     chainHeader(chain),
				Residues: chain.Sequence,
			}
			fasEntries = append(fasEntries, fasEntry)
		}
	}
	if len(fasEntries) == 0 {
		util.Fatalf("Could not find any chains with amino acids.")
	}

	var fasOut io.Writer
	if flag.NArg() == 1 {
		fasOut = os.Stdout
	} else {
		if len(flagSplit) > 0 {
			util.Fatalf("The '--split' option is incompatible with a single " +
				"output file.")
		}
		fasOut = util.CreateFile(util.Arg(1))
	}

	if len(flagSplit) == 0 {
		util.Assert(fasta.NewWriter(fasOut).WriteAll(fasEntries),
			"Could not write FASTA file '%s'", fasOut)
	} else {
		for _, entry := range fasEntries {
			fp := path.Join(flagSplit, fmt.Sprintf("%s.fasta", entry.Name))
			out := util.CreateFile(fp)

			w := fasta.NewWriter(out)
			util.Assert(w.Write(entry), "Could not write to '%s'", fp)
			util.Assert(w.Flush(), "Could not write to '%s'", fp)
		}
	}
}
Example #18
func readAlignmentDists(dir string) *intern.Table {
	dists := intern.NewTable(11000)
	threads := util.FlagCpu
	addDists := make(chan []pair)
	alignFile := make(chan string)
	done := make(chan struct{})

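	// A single collector goroutine owns the table so that dists.Set is never
	// called concurrently.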
	go func() {
		for fileDists := range addDists {
			for _, pair := range fileDists {
				a1, a2 := dists.Atom(pair.key[0]), dists.Atom(pair.key[1])
				dists.Set(a1, a2, pair.dist)
			}
		}
		done <- struct{}{}
	}()

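	// Workers parse the tab-separated alignment files in parallel and send
	// the distances they find to the collector.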
	wg := new(sync.WaitGroup)
	for i := 0; i < threads; i++ {
		wg.Add(1)
		go func() {
			for fpath := range alignFile {
				log.Printf("Reading %s (%s)", fpath, time.Now())

				f := util.OpenFile(fpath)

				csvr := csv.NewReader(f)
				csvr.Comma = '\t'
				csvr.TrimLeadingSpace = true
				csvr.FieldsPerRecord = -1 // data is poorly formatted

				records, err := csvr.ReadAll()
				util.Assert(err, "[%s]", fpath)
				util.Assert(f.Close())

				fileDists := make([]pair, 0, 100000)
				for _, record := range records {
					if len(record) != 9 {
						continue
					}
					p := recordToDist(record)
					fileDists = append(fileDists, p)
				}
				addDists <- fileDists
			}
			wg.Done()
		}()
	}

	for _, fpath := range util.RecursiveFiles(dir) {
		if strings.HasPrefix(path.Base(fpath), ".") {
			continue
		}
		alignFile <- fpath
	}
	close(alignFile)
	wg.Wait()
	close(addDists)
	<-done
	return dists
}
Example #19
func mkSeqHMM(c *command) {
	c.assertLeastNArg(3)

	structLib := util.StructureLibrary(c.flags.Arg(0))
	outPath := c.flags.Arg(1)
	entries := c.flags.Args()[2:]

	util.AssertOverwritable(outPath, flagOverwrite)
	saveto := util.CreateFile(outPath)

	// Stores intermediate files produced by hhmake.
	tempDir, err := ioutil.TempDir("", "mk-seqlib-hmm")
	util.Assert(err, "Could not create temporary directory.")
	defer os.RemoveAll(tempDir)

	// Initialize a MSA for each structural fragment.
	var msas []seq.MSA
	var msaChans []chan seq.Sequence
	for i := 0; i < structLib.Size(); i++ {
		msa := seq.NewMSA()
		msa.SetLen(structLib.FragmentSize())
		msas = append(msas, msa)
		msaChans = append(msaChans, make(chan seq.Sequence))
	}

	// Now spin up a goroutine for each fragment that is responsible for
	// adding sequence slices to that fragment's MSA.
	for i := 0; i < structLib.Size(); i++ {
		addToMSA(msaChans[i], &msas[i])
	}

	// Create a channel that sends the PDB entries given.
	entryChan := make(chan string)
	go func() {
		for _, fp := range entries {
			entryChan <- fp
		}
		close(entryChan)
	}()

	progress := util.NewProgress(len(entries))
	for i := 0; i < flagCpu; i++ {
		wgPDBChains.Add(1)
		go func() {
			for entryPath := range entryChan {
				_, chains, err := util.PDBOpen(entryPath)
				progress.JobDone(err)
				if err != nil {
					continue
				}

				for _, chain := range chains {
					structureToSequence(structLib, chain, nil, msaChans)
				}
			}
			wgPDBChains.Done()
		}()
	}
	wgPDBChains.Wait()
	progress.Close()

	// We've finished reading all the PDB inputs. Now close the channels
	// and let the sequence fragments finish.
	for i := 0; i < structLib.Size(); i++ {
		close(msaChans[i])
	}
	wgSeqFragments.Wait()

	util.Verbosef("Building profile HMMs from MSAs...")

	// Finally, add the sequence fragments to a new sequence fragment
	// library and save.
	hmms := make([]*seq.HMM, structLib.Size())
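	// Each fragment's MSA is written to a temporary FASTA file and run
	// through hhmake; fun.ParMap runs these jobs in parallel.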
	hhmake := func(i int) struct{} {
		fname := path.Join(tempDir, fmt.Sprintf("%d.fasta", i))
		f := util.CreateFile(fname)
		util.Assert(msa.WriteFasta(f, msas[i]))

		hhm, err := hhsuite.HHMakePseudo.Run(fname)
		util.Assert(err)
		hmms[i] = hhm.HMM
		return struct{}{} // fun.ParMap requires a return value; an empty struct will do
	}
	fun.ParMap(hhmake, fun.Range(0, structLib.Size()))

	lib, err := fragbag.NewSequenceHMM(structLib.Name(), hmms)
	util.Assert(err)
	util.Assert(fragbag.Save(saveto, lib))
}