Пример #1
0
// viewLib prints a short, human-readable summary of a fragment library to
// standard output.
//
// The library is identified by the command's first positional argument and
// is resolved through util.Library. One "Label: value" line is written for
// each of: the name, the tag (the components from libraryTag joined with
// "/"), the number of fragments, the fragment size, and whether the library
// is a structure and/or a sequence library.
func viewLib(c *command) {
	// presumably enforces that exactly one positional argument was
	// given — confirm against command.assertNArg.
	c.assertNArg(1)
	library := util.Library(c.flags.Arg(0))

	// Every property is emitted the same way, so funnel them through one
	// tiny helper. Values are still computed one at a time, immediately
	// before they are printed.
	show := func(format string, value interface{}) {
		fmt.Printf(format, value)
	}

	show("Name: %s\n", library.Name())
	show("Tag: %s\n", strings.Join(libraryTag(library), "/"))
	show("Size: %d\n", library.Size())
	show("Fragment Size: %d\n", library.FragmentSize())
	show("IsStructure: %v\n", fragbag.IsStructure(library))
	show("IsSequence: %v\n", fragbag.IsSequence(library))
}
Пример #2
0
// mkPaired builds a "paired" fragment library from an existing library and
// writes it to a new file.
//
// The command takes two positional arguments: the input library and the
// output path. The output path is checked with util.AssertOverwritable
// against the overwrite flag before any work is done.
//
// Every ordered pair (i, j) of distinct fragments in the input library is
// concatenated (fragment i followed by fragment j) into one fragment of the
// output library, which is named "paired-<input name>". An input with N
// fragments therefore produces N*(N-1) paired fragments.
//
// Supported inputs are structure libraries and sequence libraries whose tag
// contains "hmm". Weighted libraries are rejected, sequence profile
// libraries are reported as not implemented, and any other tag is reported
// as unrecognized (all through util.Fatalf, which presumably terminates the
// program — confirm).
func mkPaired(c *command) {
	c.assertNArg(2)

	in := util.Library(c.flags.Arg(0))
	outPath := c.flags.Arg(1)
	util.AssertOverwritable(outPath, flagOverwrite)

	if _, ok := in.(fragbag.WeightedLibrary); ok {
		util.Fatalf("%s is a weighted library (not allowed)", in.Name())
	}

	name := fmt.Sprintf("paired-%s", in.Name())
	switch {
	case fragbag.IsStructure(in):
		var pairs [][]structure.Coords
		lib := in.(fragbag.StructureLibrary)
		nfrags := lib.Size()
		for i := 0; i < nfrags; i++ {
			for j := 0; j < nfrags; j++ {
				if i == j {
					continue
				}
				f1, f2 := lib.Atoms(i), lib.Atoms(j)

				// Build the pair in freshly allocated storage. Appending
				// directly onto f1 could write into the library's own
				// backing array whenever f1 has spare capacity, which
				// would make every pair that starts with fragment i share
				// (and overwrite) the same memory.
				pair := make([]structure.Coords, 0, len(f1)+len(f2))
				pair = append(pair, f1...)
				pair = append(pair, f2...)
				pairs = append(pairs, pair)
			}
		}
		pairLib, err := fragbag.NewStructureAtoms(name, pairs)
		util.Assert(err)
		// NOTE(review): any result returned by fragbag.Save is not
		// inspected here — confirm whether it reports write errors.
		fragbag.Save(util.CreateFile(outPath), pairLib)
	case strings.Contains(in.Tag(), "hmm"):
		var pairs []*seq.HMM
		lib := in.(fragbag.SequenceLibrary)
		nfrags := lib.Size()
		for i := 0; i < nfrags; i++ {
			for j := 0; j < nfrags; j++ {
				if i == j {
					continue
				}
				f1, f2 := lib.Fragment(i).(*seq.HMM), lib.Fragment(j).(*seq.HMM)
				pairs = append(pairs, seq.HMMCat(f1, f2))
			}
		}
		pairLib, err := fragbag.NewSequenceHMM(name, pairs)
		util.Assert(err)
		// NOTE(review): any result returned by fragbag.Save is not
		// inspected here — confirm whether it reports write errors.
		fragbag.Save(util.CreateFile(outPath), pairLib)
	case strings.Contains(in.Tag(), "profile"):
		util.Fatalf("Sequence profiles not implemented.")
	default:
		util.Fatalf("Unrecognized fragment library: %s", in.Tag())
	}
}
Пример #3
0
// ProcessBowers is a convenient wrapper around BowerOpen that processes each
// bower value in parallel and sends the resulting BOW value on the channel
// returned. The returned channel is closed once every file has been
// processed and every BOW has been sent.
//
// Concurrency is controlled by `n` (values less than 1 are treated as 1):
// `n` goroutines translate files into bowers and another `n` goroutines
// compute BOWs from those bowers, all supervised by one coordinating
// goroutine.
//
// It is appropriate for `fpaths` to be the command line arguments given to
// the program. Each directory is recursively expanded to its files, and
// special syntax is parsed as well (see AllFilesFromArgs).
//
// If `hideProgress` is true, then a progress bar will not be emitted to
// stderr.
//
// If `models` is true, then every model in a PDB bower file will have its
// BOW computed. Otherwise, the first model from each chain will be used.
func ProcessBowers(
	fpaths []string,
	lib fragbag.Library,
	models bool,
	n int,
	hideProgress bool,
) <-chan bow.Bowed {
	if n <= 0 {
		n = 1
	}
	results := make(chan bow.Bowed, n*2)
	fpaths = AllFilesFromArgs(fpaths)

	go func() {
		// NOTE(review): progress stays nil when hideProgress is true, yet
		// JobDone and Close are still called on it below. This relies on
		// the *Progress methods tolerating a nil receiver — confirm.
		var progress *Progress
		totalJobs := 0
		if !hideProgress {
			totalJobs = numJobs(fpaths)
			progress = NewProgress(totalJobs)
		}

		// We use two levels of concurrency here. The first is at the level
		// of translating files into bowers. The second is at the level of
		// computing BOWs from bowers.
		// The first level is necessary because there can be a large number
		// of bower files given, where each file only produces a few BOWs.
		// The second level is necessary because there is a lot of variation
		// between the number of bowers that a single file can produce. For
		// example, while most PDB files only produce a few, some FASTA files
		// can produce millions.

		files := make(chan string, n*2)
		bs := make(chan interface{}, n*2) // channel of bowers
		wgBowers := new(sync.WaitGroup)
		wgFiles := new(sync.WaitGroup)

		// goroutines for computing BOWs from bowers
		for i := 0; i < n; i++ {
			wgBowers.Add(1)
			go func() {
				defer wgBowers.Done()
				for b := range bs {
					var bw bow.Bowed
					// The kind of library decides which bower interface
					// the value is asserted to and which BOW is computed.
					if fragbag.IsStructure(lib) {
						lib := lib.(fragbag.StructureLibrary)
						bw = b.(bow.StructureBower).StructureBow(lib)
					} else if fragbag.IsSequence(lib) {
						lib := lib.(fragbag.SequenceLibrary)
						bw = b.(bow.SequenceBower).SequenceBow(lib)
					} else {
						// presumably Fatalf terminates the program, so the
						// zero-valued bw is never sent — confirm.
						Fatalf("Unknown fragment library %T", lib)
					}
					results <- bw
				}
			}()
		}

		// goroutines for translating files into bowers
		for i := 0; i < n; i++ {
			wgFiles.Add(1)
			go func() {
				defer wgFiles.Done()

				for fpath := range files {
					// err is declared once per file and never reset, so
					// once set it is reported for every later job of the
					// same file.
					var err error
					for b := range BowerOpen(fpath, lib, models) {
						if b.Err != nil {
							err = b.Err
						} else {
							bs <- b.Bower
						}
						if IsFasta(fpath) { // each sequence counts
							progress.JobDone(err)
						}
					}
					if IsPDB(fpath) { // PDB file only counts as one job
						progress.JobDone(err)
					}
				}
			}()
		}
		// Feed every path to the file workers.
		for _, fpath := range fpaths {
			files <- fpath
		}

		// Shut the pipeline down stage by stage: bs may only be closed
		// after every file worker has stopped sending on it, and results
		// only after every BOW worker has stopped sending on it.
		close(files)
		wgFiles.Wait()
		close(bs)
		wgBowers.Wait()
		progress.Close()
		close(results)
	}()
	return results
}
Пример #4
0
// BowerOpen reads the contents of `fpath` and attempts to interpret it as a
// value (or values) that implement the `bow.Bower` interface. The list of
// `bow.Bower` values returned is guaranteed to be homogeneous: they will
// either be all `bow.SequenceBower` values or `bow.StructureBower` values.
//
// The actual return value of the function is a receive-only channel of BowerErr
// values. Each BowerErr value either has the `Bower` member set or has the
// `Err` field set to an error that prevented the file from being read. Every
// such error names the offending file. Errors in this case are reserved for
// files that appear to be capable of producing a BOW, but were unable to be
// read. The channel is closed once all values have been sent; after an error
// value no further values are sent.
//
// If the fpath given cannot be detected as a bower file, then a closed empty
// channel will be returned. A warning is also emitted to stderr. This is also
// what happens for a FASTA file when `lib` is a structure fragment library.
//
// `lib` is a fragment library that is used to help interpret what kind of
// value must be in `fpath`. For example, if `lib` is a sequence fragment
// library, then `BowerOpen` is guaranteed to return a `Bower` value that
// implements the `bow.SequenceBower` interface. `lib` must not be nil.
//
// `models` only matters for PDB files read with a structure fragment library:
// when true, one bower is produced per model of each protein chain instead
// of one bower per protein chain.
//
// As of now, `BowerOpen` can read these types of files:
//
//	File extension                 Format    Interpretation
//	*.{ent.gz,pdb,ent}             PDB       whatever `lib` is
//	*.{fasta,fas,fasta.gz,fas.gz}  FASTA     sequence
//	everything else                error     invalid
//
// Note that special syntax for PDB file names is supported. Namely, chain
// identifiers can be appended to the end of the file name, and only that chain
// will be included in the `bow.Bower` value. Otherwise, all chains in the PDB
// entry will be returned as individual `bow.Bower` values.
//
// The format is simple and easily demonstrated by examples:
//
//	1ctf.ent.gz       Chains A and B
//	1ctf.ent.gz:A     Only chain A
//	1ctf.ent.gz:B     Only chain B
//	1ctf.ent.gz:A,B   Chains A and B
//
// A secondary format is also accepted. The following are equivalent to their
// corresponding examples above:
//
//	1ctf
//	1ctfA
//	1ctfB
//	1ctf:A,B
//
// Finally, `fpath` may be the name of a PDB identifier and its file path will
// be inferred from the value of the `PDB_PATH` environment variable.
// Alternatively, `fpath` may be the name of a SCOP domain, and its
// corresponding PDB file will be inferred from the value of the
// `SCOP_PDB_PATH` environment variable.
func BowerOpen(fpath string, lib fragbag.Library, models bool) <-chan BowerErr {
	if lib == nil {
		Fatalf("Files can only be converted to Fragbag frequency vectors " +
			"if a fragment library is specified.")
	}

	bowers := make(chan BowerErr, 100)
	switch {
	case IsPDB(fpath):
		go func() {
			defer close(bowers)

			entry, chains, err := PDBOpen(fpath)
			if err != nil {
				err = fmt.Errorf("Error reading '%s': %s", fpath, err)
				bowers <- BowerErr{Err: err}
				return
			}

			if fragbag.IsStructure(lib) {
				for i := range chains {
					if !chains[i].IsProtein() {
						continue
					}

					if !models {
						b := bow.BowerFromChain(chains[i])
						bowers <- BowerErr{Bower: b}
					} else {
						for _, m := range chains[i].Models {
							b := bow.BowerFromModel(m)
							bowers <- BowerErr{Bower: b}
						}
					}
				}
			} else {
				for i := range chains {
					if !chains[i].IsProtein() {
						continue
					}

					// Prefer the chain's own sequence; when it is empty,
					// fall back to one derived from the structure.
					s := chains[i].AsSequence()
					if s.Len() == 0 {
						s = aminoFromStructure(chains[i])
						if s.Len() == 0 {
							Warnf("Chain '%s:%c' has no amino sequence.",
								entry.IdCode, chains[i].Ident)
							continue
						}
					}
					bowers <- BowerErr{Bower: bow.BowerFromSequence(s)}
				}
			}
		}()
		return bowers
	case IsFasta(fpath) && !fragbag.IsStructure(lib):
		go func() {
			defer close(bowers)

			r, fp, err := fastaOpen(fpath)
			if err != nil {
				// Name the file, as the PDB branch does, so that a caller
				// processing many files can tell which one failed.
				err = fmt.Errorf("Error reading '%s': %s", fpath, err)
				bowers <- BowerErr{Err: err}
				return
			}
			defer fp.Close()

			fr := fasta.NewReader(r)
			for {
				s, readErr := fr.Read()
				if readErr == io.EOF {
					break
				}
				if readErr != nil {
					readErr = fmt.Errorf(
						"Error reading '%s': %s", fpath, readErr)
					bowers <- BowerErr{Err: readErr}
					return
				}
				bowers <- BowerErr{Bower: bow.BowerFromSequence(s)}
			}
		}()
		return bowers
	}
	Warnf("I don't know how to produce a Fragbag frequency vector "+
		"from the file '%s'.", fpath)
	close(bowers)
	return bowers
}