Beispiel #1
0
// TranslateQuerySeqs reads every FASTA record from query, passes its residues
// through Translate, reduces each translated sequence with Reduce, and returns
// a reader over the resulting FASTA text. Every output record carries the name
// of the query record it was derived from.
//
// The action parameter is currently unused.
//
// A non-nil error is returned if a record cannot be read or if the output
// cannot be written; the returned reader is nil in that case.
func TranslateQuerySeqs(
	query *bytes.Reader, action SearchOperator) (*bytes.Reader, error) {

	buf := new(bytes.Buffer)
	f := fasta.NewWriter(buf)
	reader := fasta.NewReader(query)
	for {
		sequence, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}
		name := sequence.Name
		// generate 6 ORFs
		for _, s := range Translate(sequence.Bytes()) {
			result := seq.NewSequenceString(name, string(Reduce(s)))
			if err := f.Write(result); err != nil {
				return nil, err
			}
		}
	}

	// The writer exposes Flush, so records may still be sitting in its
	// internal buffer; push them into buf before handing its bytes out.
	if err := f.Flush(); err != nil {
		return nil, err
	}
	return bytes.NewReader(buf.Bytes()), nil
}
Beispiel #2
0
// ReadOriginalSeqs reads a FASTA formatted file and returns a channel that
// each new sequence is sent to.
//
// If fileName ends in ".gz" the contents are decompressed with gzip while
// reading. Every residue equal to one of the bytes in ignore is replaced by
// 'X' before the sequence is sent.
//
// An error is returned directly only when the file cannot be opened (or the
// gzip header cannot be read). Errors hit while parsing are delivered on the
// channel as a ReadOriginalSeq with Err set, after which the channel is
// closed. The channel is also closed once the end of the input is reached.
// The underlying file is closed when reading stops.
func ReadOriginalSeqs(
	fileName string,
	ignore []byte,
) (chan ReadOriginalSeq, error) {
	file, err := os.Open(fileName)
	if err != nil {
		return nil, err
	}

	var f io.Reader = file
	if strings.HasSuffix(fileName, ".gz") {
		gz, err := gzip.NewReader(file)
		if err != nil {
			// Nothing will ever read from the file; release it now.
			file.Close()
			return nil, err
		}
		f = gz
	}

	reader := fasta.NewReader(f)
	seqChan := make(chan ReadOriginalSeq, 200)
	go func() {
		// Deferred calls run last-in first-out: the channel is closed first,
		// then the file handle is released.
		defer file.Close()
		defer close(seqChan)

		for i := 0; ; i++ {
			sequence, err := reader.Read()
			if err == io.EOF {
				return
			}
			if err != nil {
				seqChan <- ReadOriginalSeq{
					Seq: nil,
					Err: err,
				}
				return
			}
			// Mask ignored residues in place.
			for j, residue := range sequence.Residues {
				for _, toignore := range ignore {
					if toignore == byte(residue) {
						sequence.Residues[j] = 'X'
						break
					}
				}
			}
			seqChan <- ReadOriginalSeq{
				Seq: NewFastaOriginalSeq(i, sequence),
				Err: nil,
			}
		}
	}()
	return seqChan, nil
}
Beispiel #3
0
// readSeqs parses the sequence portion of an HHM file held in buf.
//
// That portion has two parts. The first, optional part carries secondary
// structure information; its entries are recognised by header prefixes
// "ss_dssp", "sa_dssp", "ss_pred", "ss_conf" and "Consensus". The first entry
// without one of those prefixes starts the MSA, and it and every entry after
// it are added to the returned MSA (even if a later entry has a special
// prefix).
//
// On a read error, zero-valued results are returned alongside the error.
func readSeqs(buf *bytes.Buffer) (HHMSecondary, seq.MSA, error) {
	secondary := HHMSecondary{}
	alignment := seq.NewMSA()

	fr := fasta.NewReader(buf)
	fr.TrustSequences = true
	entries, err := fr.ReadAll()
	if err != nil {
		return HHMSecondary{}, seq.MSA{}, err
	}

	// inMSA flips to true at the first entry that is not a secondary
	// structure entry and never flips back.
	inMSA := false
	for _, entry := range entries {
		entry := entry // fresh copy per iteration; its address is stored below
		if inMSA {
			alignment.Add(entry)
			continue
		}
		switch {
		case strings.HasPrefix(entry.Name, "ss_dssp"):
			secondary.SSdssp = &entry
		case strings.HasPrefix(entry.Name, "sa_dssp"):
			secondary.SAdssp = &entry
		case strings.HasPrefix(entry.Name, "ss_pred"):
			secondary.SSpred = &entry
		case strings.HasPrefix(entry.Name, "ss_conf"):
			secondary.SSconf = &entry
		case strings.HasPrefix(entry.Name, "Consensus"):
			secondary.Consensus = &entry
		default:
			inMSA = true
			alignment.Add(entry)
		}
	}
	return secondary, alignment, nil
}
Beispiel #4
0
// readFasta loads every record from coarsedb.FileFasta, appending one coarse
// sequence per record to coarsedb.Seqs. Records are numbered from zero in the
// order they are read. Afterwards coarsedb.seqsRead is set to the total length
// of coarsedb.Seqs. Progress and elapsed time are reported through Vprintf.
//
// The first read error other than io.EOF is returned as is.
func (coarsedb *CoarseDB) readFasta() error {
	Vprintf("\t\tReading %s...\n", FileCoarseFasta)
	start := time.Now()

	r := fasta.NewReader(coarsedb.FileFasta)
	for id := 0; ; id++ {
		entry, err := r.Read()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		coarsedb.Seqs = append(coarsedb.Seqs, NewFastaCoarseSeq(id, entry))
	}
	coarsedb.seqsRead = len(coarsedb.Seqs)

	Vprintf("\t\tDone reading %s (%s).\n", FileCoarseFasta, time.Since(start))
	return nil
}
Beispiel #5
0
// readDisordered loads the sequences in "pdb_disordered.fasta" into a map
// keyed by the upper-cased sequence name.
//
// This file isn't in the repo, so if it can't be opened, we just return nil.
// The caller must not fail with nil returned.
//
// Any read error other than io.EOF is handed to assert.
func readDisordered() map[string]seq.Sequence {
	f, err := os.Open("pdb_disordered.fasta")
	if err != nil {
		return nil
	}
	// Release the file handle once everything has been read.
	defer f.Close()

	r := fasta.NewReader(f)
	r.TrustSequences = true
	m := make(map[string]seq.Sequence, 10000)
	for {
		s, err := r.Read()
		if err == io.EOF {
			break
		}
		assert(err)

		m[strings.ToUpper(s.Name)] = s
	}
	return m
}
Beispiel #6
0
// main rewrites the file named by util.Arg(0) in place: all entries are read
// into memory, the input is closed, and the same path is then recreated
// containing only the entries that have at least one residue.
//
// Every failure is passed to util.Assert.
func main() {
	a3mPath := util.Arg(0)

	// Read the whole input first; the same path is overwritten below.
	in := util.OpenFile(a3mPath)
	r := fasta.NewReader(in)
	r.TrustSequences = true
	entries, err := r.ReadAll()
	util.Assert(err, "Could not read fasta format '%s'", a3mPath)
	util.Assert(in.Close())

	out := util.CreateFile(a3mPath)
	fw := fasta.NewWriter(out)
	fw.Columns = 0 // presumably disables line wrapping; confirm against fasta.Writer
	for _, entry := range entries {
		if len(entry.Residues) == 0 {
			continue // drop empty entries
		}
		util.Assert(fw.Write(entry))
	}
	util.Assert(fw.Flush())
	util.Assert(out.Close())
}
Beispiel #7
0
// getOneFastaSequence reads the FASTA file at queryFasta and returns its only
// sequence.
//
// An error is returned if the file cannot be opened or parsed, or if it does
// not contain exactly one sequence. The returned sequence is the zero value
// whenever err is non-nil.
func getOneFastaSequence(queryFasta string) (s seq.Sequence, err error) {
	fquery, err := os.Open(queryFasta)
	if err != nil {
		return s, err
	}
	defer fquery.Close()

	seqs, err := fasta.NewReader(fquery).ReadAll()
	switch {
	case err != nil:
		return s, err
	case len(seqs) == 0:
		return s, fmt.Errorf("No sequences found in '%s'.", queryFasta)
	case len(seqs) > 1:
		return s, fmt.Errorf("%d sequences found in '%s'. Expected only 1.",
			len(seqs), queryFasta)
	}
	return seqs[0], nil
}
Beispiel #8
0
// main splits the FASTA input named by util.Arg(0) into one file per
// sequence inside the directory named by util.Arg(1), creating that directory
// if needed.
//
// Each sequence is renamed to the first whitespace-separated field of its
// header and written to "<dir>/<name>.fasta". Failures go to util.Assert.
func main() {
	input := util.OpenFasta(util.Arg(0))
	outDir := util.Arg(1)
	util.Assert(os.MkdirAll(outDir, 0777))

	reader := fasta.NewReader(input)
	for {
		entry, err := reader.Read()
		if err == io.EOF {
			break
		}
		util.Assert(err)

		// Keep only the identifier part of the header.
		entry.Name = strings.Fields(entry.Name)[0]

		out := util.CreateFile(path.Join(outDir, entry.Name+".fasta"))
		writer := fasta.NewWriter(out)
		util.Assert(writer.Write(entry))
		util.Assert(writer.Flush())
		util.Assert(out.Close())
	}
}
Beispiel #9
0
// ReduceQuerySeqs reads every FASTA record from query, passes its residues
// through Reduce, and returns a reader over the resulting FASTA text. Each
// output record keeps the name of the record it was derived from.
//
// A non-nil error is returned if a record cannot be read or if the output
// cannot be written or flushed; the returned reader is nil in that case.
func ReduceQuerySeqs(
	query *bytes.Reader) (*bytes.Reader, error) {
	buf := new(bytes.Buffer)
	f := fasta.NewWriter(buf)
	reader := fasta.NewReader(query)
	for {
		sequence, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}
		rs := Reduce(sequence.Bytes())

		result := seq.NewSequenceString(sequence.Name, string(rs))
		if err := f.Write(result); err != nil {
			return nil, err
		}
	}
	// Push any records still held by the writer into buf before reading it.
	if err := f.Flush(); err != nil {
		return nil, err
	}

	return bytes.NewReader(buf.Bytes()), nil
}
Beispiel #10
0
// processCompressedQueries runs the query-compression search mode.
//
// The queries in inputQueryFilename are first compressed into a temporary
// query database under "./tmp_query_database". Then, for each sequence read
// from that database's coarse FASTA file, in order:
//
//  1. the sequence is translated and reduced, and the result is searched
//     against db's coarse database (blastCoarse), with output going to
//     searchBuf;
//  2. the coarse hits are expanded (expandBlastHits); if there are none the
//     sequence is skipped;
//  3. otherwise the expanded hits are written to searchBuf as FASTA, the
//     coarse query is expanded back into its original sequences, a fine
//     BLAST database is built from searchBuf, the expanded queries are
//     searched against it, and the fine database directory is removed.
//
// Finally the temporary query database directory is removed.
//
// Every failure is reported through handleFatalError or fatalf (presumably
// terminating the process; confirm against their definitions). The returned
// error is always nil.
func processCompressedQueries(db *cablastp.DB, queryDBConf *cablastp.DBConf, inputQueryFilename string, searchBuf *bytes.Buffer) error {
	cablastp.Vprintln("Compressing queries into a database...")
	dbDirLoc := "./tmp_query_database" // TODO this should be a parameter
	qDBDirLoc, err := compressQueries(inputQueryFilename, queryDBConf, dbDirLoc)
	handleFatalError("Error compressing queries", err)
	cablastp.Vprintln("Opening DB for reading")
	qDB, err := cablastp.NewReadDB(qDBDirLoc)
	handleFatalError("Error opening query database", err)
	cablastp.Vprintln("Opening compressed queries for search...")
	compQueryFilename := qDB.CoarseFastaLocation()
	compQueries, err := getInputFasta(compQueryFilename)
	handleFatalError("Error opening compressed query file", err)

	// queryBuf holds the translated/reduced form of the current coarse query
	// only; it is reset at the end of every iteration.
	queryBuf := new(bytes.Buffer)
	f := fasta.NewWriter(queryBuf)
	reader := fasta.NewReader(compQueries)

	// origSeqID is the zero-based position of the sequence in the coarse
	// query FASTA; it is passed to expandCoarseSequence below.
	for origSeqID := 0; true; origSeqID++ {

		sequence, err := reader.Read()
		if err == io.EOF {
			break
		}

		if err != nil {
			fatalf("Could not read input fasta query: %s\n", err)
		}

		origSeq := sequence.Bytes()
		n := sequence.Name
		// generate 6 ORFs
		transSeqs := cablastp.Translate(origSeq)
		for _, s := range transSeqs {
			// reduce each one
			result := seq.NewSequenceString(n, string(cablastp.Reduce(s)))

			// NOTE(review): the error returned by Write is ignored here.
			f.Write(result)

		}

		// Flush so that queryBuf contains everything written above before
		// its bytes are wrapped in a reader.
		f.Flush()
		transCoarseQueries := bytes.NewReader(queryBuf.Bytes())

		cablastp.Vprintln("\nBlasting query on coarse database...")
		err = blastCoarse(db, transCoarseQueries, searchBuf)
		handleFatalError("Error blasting coarse database", err)

		cablastp.Vprintln("Decompressing coarse blast hits...")
		expandedSequences, err := expandBlastHits(db, searchBuf)
		handleFatalError("Error decompressing coarse blast hits", err)
		if len(expandedSequences) == 0 {
			cablastp.Vprintln("No results from coarse search")
		} else {
			cablastp.Vprintln("Making FASTA from coarse blast hits...")
			// Replace the coarse search output with the expanded hits.
			searchBuf.Reset()
			err = writeFasta(expandedSequences, searchBuf)
			handleFatalError("Could not create FASTA input from coarse hits", err)

			cablastp.Vprintln("Expanding coarse query...")
			expQuery, err := expandCoarseSequence(qDB, origSeqID, &sequence)
			handleFatalError("Could not expand coarse queries", err)

			// Serialise the expanded (original) queries as FASTA for the
			// fine search.
			fineQueryBuf := new(bytes.Buffer)
			fineWriter := fasta.NewWriter(fineQueryBuf)
			for _, fineQuery := range expQuery {
				fineQueryBytes := fineQuery.FastaSeq().Bytes() // <- Is This the same as fineQuery.Residues()?
				fineName := fineQuery.Name
				writeSeq := seq.NewSequenceString(fineName, string(fineQueryBytes))
				fineWriter.Write(writeSeq)
			}
			fineWriter.Flush()
			transFineQueries := bytes.NewReader(fineQueryBuf.Bytes())

			cablastp.Vprintln("Building fine BLAST target database...")
			targetTmpDir, err := makeFineBlastDB(db, searchBuf)
			handleFatalError("Could not create fine database to search on", err)

			cablastp.Vprintln("Blasting original query on fine database...")
			err = blastFine(db, targetTmpDir, transFineQueries)
			handleFatalError("Error blasting fine database", err)
			// The fine database is only needed for this one query.
			err = os.RemoveAll(targetTmpDir)
			handleFatalError("Could not remove fine database", err)
		}
		// Start the next query with an empty translated-query buffer.
		queryBuf.Reset()
	}
	cablastp.Vprintln("Cleaning up...")
	err = os.RemoveAll(dbDirLoc)
	handleFatalError("Could not remove query database", err)
	return nil
}
Beispiel #11
0
// main opens the database named by flag.Arg(0) for reading and searches it
// with the queries in the FASTA file named by flag.Arg(1).
//
// With flagCompressQuery set, the work is delegated to
// processCompressedQueries. Otherwise queries are passed through
// translateQueries and then processQueries, either in chunks of
// flagQueryChunkSize (flagIterativeQuery) or in a single batch. cleanup(db) is
// called at the end.
func main() {

	searchBuf := new(bytes.Buffer) // might need more than 1 buffer

	// NOTE(review): execution continues past this point unless flag.Usage
	// itself exits the process; confirm.
	if flag.NArg() != 2 {
		flag.Usage()
	}

	// If the quiet flag isn't set, enable verbose output.
	if !flagQuiet {
		cablastp.Verbose = true
	}

	queryDBConf := argDBConf.DeepCopy() // deep copy of the default DBConf
	// updated by the args
	inputFastaQueryName := flag.Arg(1)
	db, err := cablastp.NewReadDB(flag.Arg(0))
	if err != nil {
		fatalf("Could not open '%s' database: %s\n", flag.Arg(0), err)
	}
	// For query-compression mode, we first run compression on the query file
	// then coarse-coarse search, decompress both, fine-fine search.
	// otherwise, just coarse search, decompress results, fine search.
	// iterate over the query sequences in the input fasta
	// initially, only implement standard search.

	if flagCompressQuery {

		// NOTE(review): the error result of processCompressedQueries is
		// discarded here.
		processCompressedQueries(db, queryDBConf, inputFastaQueryName, searchBuf)

	} else {

		queryBuf := new(bytes.Buffer) // might need more than 1 buffer
		inputFastaQuery, err := getInputFasta(inputFastaQueryName)
		handleFatalError("Could not read input fasta query", err)

		// f writes translated queries into queryBuf; reader pulls the
		// original queries from the input FASTA.
		f := fasta.NewWriter(queryBuf)
		reader := fasta.NewReader(inputFastaQuery)

		// NOTE(review): this loop contains no break or return, so from what
		// is visible here it only ends if translateQueries or processQueries
		// terminates the process. If they do not, the one-batch branch and
		// cleanup below are never reached; confirm against translateQueries.
		for i := 0; true; i++ {

			if flagIterativeQuery {

				// Translate one chunk of queries, then search with it.
				for j := 0; j < flagQueryChunkSize; j++ {
					translateQueries(reader, f)
				}

				// NOTE(review): f is not flushed before queryBuf is read in
				// this branch, unlike the one-batch path below; confirm
				// whether translateQueries flushes.
				transQueries := bytes.NewReader(queryBuf.Bytes())
				processQueries(db, transQueries, searchBuf)
				queryBuf.Reset()

			} else {
				translateQueries(reader, f)
			}

		}

		if !flagIterativeQuery {
			cablastp.Vprintln("\nProcessing Queries in one batch...")
			f.Flush()
			transQueries := bytes.NewReader(queryBuf.Bytes())
			processQueries(db, transQueries, searchBuf)
		}
	}

	cleanup(db)
}
Beispiel #12
0
// BowerOpen reads the contents of `fpath` and attempts to interpret it as one
// or more values implementing the `bow.Bower` interface. The values produced
// are homogeneous: they are either all `bow.SequenceBower` values or all
// `bow.StructureBower` values.
//
// The result is a receive-only channel of BowerErr values. Each BowerErr has
// either its `Bower` field set, or its `Err` field set to an error that
// prevented the file from being read. Errors are reserved for files that
// appear capable of producing a BOW but could not be read. The channel is
// closed once all values have been sent.
//
// If `fpath` is not recognised as a bower file, a warning is written and a
// closed, empty channel is returned.
//
// `lib` is the fragment library used to decide what kind of value to build.
// For example, if `lib` is a sequence fragment library, every `Bower`
// returned implements `bow.SequenceBower`. `lib` must not be nil.
//
// When `models` is true and `lib` is a structure library, one value is
// produced per model of each protein chain instead of one per chain.
//
// `BowerOpen` currently reads these kinds of files:
//
//	File extension                 Format    Interpretation
//	*.{ent.gz,pdb,ent}             PDB       whatever `lib` is
//	*.{fasta,fas,fasta.gz,fas.gz}  FASTA     sequence
//	everything else                error     invalid
//
// PDB file names support a special syntax: chain identifiers may be appended
// to the file name, in which case only those chains are included. Without
// them, every chain in the PDB entry is returned as its own `bow.Bower`.
//
// Examples:
//
//	1ctf.ent.gz       Chains A and B
//	1ctf.ent.gz:A     Only chain A
//	1ctf.ent.gz:B     Only chain B
//	1ctf.ent.gz:A,B   Chains A and B
//
// A secondary format is also accepted. The following are equivalent to the
// corresponding examples above:
//
//	1ctf
//	1ctfA
//	1ctfB
//	1ctf:A,B
//
// Finally, `fpath` may be a PDB identifier, whose file path is inferred from
// the `PDB_PATH` environment variable, or the name of a SCOP domain, whose
// PDB file is inferred from the `SCOP_PDB_PATH` environment variable.
func BowerOpen(fpath string, lib fragbag.Library, models bool) <-chan BowerErr {
	if lib == nil {
		Fatalf("Files can only be converted to Fragbag frequency vectors " +
			"if a fragment library is specified.")
	}

	out := make(chan BowerErr, 100)

	if IsPDB(fpath) {
		go func() {
			defer close(out)

			entry, chains, err := PDBOpen(fpath)
			if err != nil {
				out <- BowerErr{
					Err: fmt.Errorf("Error reading '%s': %s", fpath, err),
				}
				return
			}

			structural := fragbag.IsStructure(lib)
			for i := range chains {
				// Non-protein chains never produce a value.
				if !chains[i].IsProtein() {
					continue
				}

				switch {
				case structural && models:
					for _, m := range chains[i].Models {
						out <- BowerErr{Bower: bow.BowerFromModel(m)}
					}
				case structural:
					out <- BowerErr{Bower: bow.BowerFromChain(chains[i])}
				default:
					s := chains[i].AsSequence()
					if s.Len() == 0 {
						// Fall back to a sequence derived from the chain
						// via aminoFromStructure.
						s = aminoFromStructure(chains[i])
						if s.Len() == 0 {
							Warnf("Chain '%s:%c' has no amino sequence.",
								entry.IdCode, chains[i].Ident)
							continue
						}
					}
					out <- BowerErr{Bower: bow.BowerFromSequence(s)}
				}
			}
		}()
		return out
	}

	if IsFasta(fpath) && !fragbag.IsStructure(lib) {
		go func() {
			defer close(out)

			r, fp, err := fastaOpen(fpath)
			if err != nil {
				out <- BowerErr{Err: fmt.Errorf("Error reading file: %s", err)}
				return
			}
			defer fp.Close()

			fr := fasta.NewReader(r)
			for {
				s, err := fr.Read()
				if err == io.EOF {
					return
				}
				if err != nil {
					out <- BowerErr{
						Err: fmt.Errorf("Error reading file: %s", err),
					}
					return
				}
				out <- BowerErr{Bower: bow.BowerFromSequence(s)}
			}
		}()
		return out
	}

	Warnf("I don't know how to produce a Fragbag frequency vector "+
		"from the file '%s'.", fpath)
	close(out)
	return out
}
Beispiel #13
0
// ReadTrustedFasta reads a single MSA from trusted input formatted as FASTA.
// Sequences are read until io.EOF.
//
// "Trust" here means the caller vouches that the sequences contain no illegal
// characters. Trusting the input should be faster.
//
// To read the A2M or A3M aligned formats, use ReadTrusted.
func ReadTrustedFasta(reader io.Reader) (seq.MSA, error) {
	fastaReader := fasta.NewReader(reader)
	fastaReader.TrustSequences = true
	return read(fastaReader, true)
}
Beispiel #14
0
// Read reads a single MSA from input formatted as A2M or A3M. Sequences are
// read until io.EOF. The input is not trusted (TrustSequences is switched
// off on the underlying FASTA reader).
//
// To read the FASTA aligned format, use ReadFasta.
func Read(reader io.Reader) (seq.MSA, error) {
	fastaReader := fasta.NewReader(reader)
	fastaReader.TrustSequences = false
	return read(fastaReader, false)
}