Пример #1
0
func init() {
	var err error
	m, err = NewMulti("example multi",
		[]seq.Sequence{
			linear.NewSeq("example DNA 1", []alphabet.Letter("ACGCTGACTTGGTGCACGT"), alphabet.DNA),
			linear.NewSeq("example DNA 2", []alphabet.Letter("ACGGTGACCTGGCGCGCAT"), alphabet.DNA),
			linear.NewSeq("example DNA 3", []alphabet.Letter("ACGATGACGTGGCGCTCAT"), alphabet.DNA),
		},
		seq.DefaultConsensus)

	if err != nil {
		panic(err)
	}
}
Пример #2
0
// TestReadFasta reads every input in fas with a protein template sequence
// and compares the collected headers and letters against expectN and
// expectS. A header is the sequence name followed, when present, by a space
// and the description.
func (s *S) TestReadFasta(c *check.C) {
	for _, fa := range fas {
		// Fresh accumulators for each input.
		var (
			names   []string
			letters [][]alphabet.Letter
		)

		r := NewReader(bytes.NewBufferString(fa), linear.NewSeq("", nil, alphabet.Protein))
		for {
			rec, err := r.Read()
			if err == io.EOF {
				break
			}
			if err != nil {
				c.Fatalf("Failed to read %q: %s", fa, err)
			}

			t := rec.(*linear.Seq)
			header := t.Name()
			if desc := t.Description(); len(desc) > 0 {
				header += " " + desc
			}
			names = append(names, header)
			letters = append(letters, t.Slice().(alphabet.Letters))
		}

		c.Check(names, check.DeepEquals, expectN)
		for i, got := range letters {
			c.Check(len(got), check.Equals, len(expectS[i]))
			c.Check(got, check.DeepEquals, expectS[i])
		}
	}
}
Пример #3
0
// TestReadFromFunc wraps a fasta reader's Read method in a scanner built by
// seqio.NewScannerFromFunc and checks that scanning testaln0 yields the
// headers in expectNfa and the letters in expectSfa without error.
func (s *S) TestReadFromFunc(c *check.C) {
	r := fasta.NewReader(
		bytes.NewBufferString(testaln0),
		linear.NewSeq("", nil, alphabet.Protein),
	)
	sc := seqio.NewScannerFromFunc(r.Read)

	var (
		names   []string
		letters [][]alphabet.Letter
	)
	for sc.Next() {
		t := sc.Seq().(*linear.Seq)
		// The header is the name plus, when present, the description.
		header := t.Name()
		if desc := t.Description(); len(desc) > 0 {
			header += " " + desc
		}
		names = append(names, header)
		letters = append(letters, t.Slice().(alphabet.Letters))
	}

	c.Check(sc.Error(), check.Equals, nil)
	c.Check(names, check.DeepEquals, expectNfa)
	for i, got := range letters {
		c.Check(len(got), check.Equals, len(expectSfa[i]))
		c.Check(got, check.DeepEquals, expectSfa[i])
	}
}
Пример #4
0
// TestReadFasta reads the fasta text in fa into a multiple alignment and
// checks that the result has 11 rows.
//
// Set-up and read errors are asserted rather than checked so that the test
// stops before a.Rows() is called on the result of a failed read.
func (s *S) TestReadFasta(c *check.C) {
	r := fasta.NewReader(strings.NewReader(fa), linear.NewSeq("", nil, alphabet.Protein))
	m, err := multi.NewMulti("", nil, seq.DefaultConsensus)
	c.Assert(err, check.Equals, nil)
	a, err := NewReader(r, m).Read()
	c.Assert(err, check.Equals, nil)
	c.Check(a.Rows(), check.Equals, 11)
}
Пример #5
0
// main filters fasta sequences read from standard input, dropping every
// sequence whose ID appears as a line in the file named by the exclude flag
// and printing the remainder to standard output with the %60a verb.
//
// The program exits with status 1 after printing usage when no exclude file
// is given, and terminates via log.Fatalf on any read failure.
func main() {
	flag.Parse()
	if *exclude == "" {
		flag.Usage()
		os.Exit(1)
	}

	nameSet := make(map[string]struct{})
	f, err := os.Open(*exclude)
	if err != nil {
		log.Fatalf("failed to open exclude file %q: %v", *exclude, err)
	}
	ls := bufio.NewScanner(f)
	for ls.Scan() {
		nameSet[ls.Text()] = struct{}{}
	}
	err = ls.Err()
	// The exclude list is fully in memory now; release the descriptor
	// before the potentially long pass over standard input.
	f.Close()
	if err != nil {
		log.Fatalf("failed to read exclude file: %v", err)
	}

	sc := seqio.NewScanner(fasta.NewReader(os.Stdin, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)
		if _, ok := nameSet[s.ID]; ok {
			continue
		}
		fmt.Printf("%60a\n", s)
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}
}
Пример #6
0
// ExampleSet_AppendEach fills the set variable with four DNA sequences,
// appends a pair of repeated letters to each of them with AppendEach and
// prints the resulting sequences.
func ExampleSet_AppendEach() {
	ss := [][]alphabet.Letter{
		[]alphabet.Letter("ACGCTGACTTGGTGCACGT"),
		[]alphabet.Letter("ACGACTGGGACGT"),
		[]alphabet.Letter("ACGCTGACTGGCCGT"),
		[]alphabet.Letter("GCCTTTGCACGT"),
	}
	set = make(Set, 4)
	for i, letters := range ss {
		name := fmt.Sprintf("example DNA %d", i)
		set[i] = linear.NewSeq(name, letters, alphabet.DNA)
	}

	// One two-letter tail per sequence, in sequence order.
	var as [][]alphabet.QLetter
	for _, l := range []alphabet.Letter("ACGT") {
		as = append(as, alphabet.QLetter{L: l}.Repeat(2))
	}

	set.AppendEach(as)

	for i := range set {
		fmt.Printf("%-s\n", set[i])
	}
	// Output:
	// ACGCTGACTTGGTGCACGTAA
	// ACGACTGGGACGTCC
	// ACGCTGACTGGCCGTGG
	// GCCTTTGCACGTTT
}
Пример #7
0
// main reads GFF features from the file named by the in flag, groups them
// by the first field of their "Read" attribute, and then, for every fasta
// file given as a positional argument, prints the [start, end) slice of each
// sequence whose name has recorded features. The slice bounds are the second
// and third fields of the "Read" attribute, and the printed sequence ID gets
// a "//start_end" suffix.
func main() {
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}

	gffFile, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	events := make(map[string][]*gff.Feature)
	fsc := featio.NewScanner(gff.NewReader(gffFile))
	for fsc.Next() {
		ev := fsc.Feat().(*gff.Feature)
		fields := strings.Fields(ev.FeatAttributes.Get("Read"))
		if len(fields) != 3 {
			log.Fatalf("bad record: %+v", ev)
		}
		name := fields[0]
		events[name] = append(events[name], ev)
	}
	if err := fsc.Error(); err != nil {
		log.Fatalf("error during gff read: %v", err)
	}
	gffFile.Close()

	for _, ref := range flag.Args() {
		refFile, err := os.Open(ref)
		if err != nil {
			log.Fatalf("failed to open reference %q: %v", ref, err)
		}
		ssc := seqio.NewScanner(fasta.NewReader(refFile, linear.NewSeq("", nil, alphabet.DNA)))
		for ssc.Next() {
			contig := ssc.Seq().(*linear.Seq)
			for _, ev := range events[contig.Name()] {
				fields := strings.Fields(ev.FeatAttributes.Get("Read"))
				if len(fields) != 3 {
					log.Fatalf("bad record: %+v", ev)
				}
				start, err := strconv.Atoi(fields[1])
				if err != nil {
					log.Fatalf("failed to get start coordinate: %v", err)
				}
				end, err := strconv.Atoi(fields[2])
				if err != nil {
					log.Fatalf("failed to get end coordinate: %v", err)
				}

				// Work on a shallow copy so the scanned sequence keeps
				// its ID and full letter slice for the next feature.
				sub := *contig
				sub.ID = fmt.Sprintf("%s//%d_%d", sub.ID, start, end)
				sub.Seq = sub.Seq[start:end]
				fmt.Printf("%60a\n", &sub)
			}
		}
		if err := ssc.Error(); err != nil {
			log.Fatalf("error during fasta read: %v", err)
		}
		refFile.Close()
	}
}
Пример #8
0
// SetUpSuite sets MaxKmerLen to 14 and gives the suite a DNA sequence of
// testLen letters drawn pseudo-randomly from upper and lower case A, C, G
// and T.
func (s *S) SetUpSuite(c *check.C) {
	const bases = "ACGTacgt"

	MaxKmerLen = 14

	letters := make(alphabet.Letters, testLen)
	for i := range letters {
		letters[i] = alphabet.Letter(bases[rand.Int()%len(bases)])
	}

	s.Seq = linear.NewSeq("", nil, alphabet.DNA)
	s.Seq.Seq = letters
}
Пример #9
0
func ExampleNewMulti() {
	m, err := NewMulti("example multi",
		[]seq.Sequence{
			linear.NewSeq("example DNA 1", []alphabet.Letter("ACGCTGACTTGGTGCACGT"), alphabet.DNA),
			linear.NewSeq("example DNA 2", []alphabet.Letter("ACGGTGACCTGGCGCGCAT"), alphabet.DNA),
			linear.NewSeq("example DNA 3", []alphabet.Letter("ACGATGACGTGGCGCTCAT"), alphabet.DNA),
		},
		seq.DefaultConsensus)

	if err != nil {
		return
	}

	fmt.Printf("%- s\n\n%-s\n", m, m.Consensus(false))
	// Output:
	// ACGCTGACTTGGTGCACGT
	// ACGGTGACCTGGCGCGCAT
	// ACGATGACGTGGCGCTCAT
	//
	// acgntgacntggcgcncat
}
Пример #10
0
// getFasta returns the first sequence read from the fasta file named fn,
// using a protein template sequence.
//
// A non-nil error is returned, with a nil sequence, when the file cannot be
// opened or when the reader fails to produce a sequence.
func getFasta(fn string) (seq.Sequence, error) {
	fastaFile, err := os.Open(fn)
	if err != nil {
		return nil, fmt.Errorf("opening fasta file %q: %v", fn, err)
	}
	defer fastaFile.Close()

	template := linear.NewSeq("", nil, alphabet.Protein)
	s, err := fasta.NewReader(fastaFile, template).Read()
	if err != nil {
		return nil, fmt.Errorf("reading fasta file %q: %v", fn, err)
	}
	return s, nil
}
Пример #11
0
// writeFlankSeqs writes fasta files containing the sequence of unmapped flanks
// identified in the primary hits provided. cutoff specifies the minimum sequence
// length to consider. left and right specify the filenames for the left and right
// flank fasta sequence files.
//
// reads names the fasta file holding the read sequences; reads with no entry
// in hits are skipped. Both output files are closed on every return path.
// The first error encountered is returned: a read or write failure takes
// precedence over a failure to close the left file, which takes precedence
// over a failure to close the right file.
func writeFlankSeqs(reads string, hits hitSet, cutoff int, left, right string) error {
	f, err := os.Open(reads)
	if err != nil {
		return err
	}
	defer f.Close()

	lf, err := os.Create(left)
	if err != nil {
		return err
	}
	rf, err := os.Create(right)
	if err != nil {
		lf.Close()
		return err
	}

	// Scan inside a closure so that every exit from the loop falls through
	// to the close calls below.
	err = func() error {
		sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA)))
		for sc.Next() {
			s := sc.Seq().(*linear.Seq)
			h, ok := hits[s.Name()]
			if !ok {
				continue
			}

			// Keep the full letters; s.Seq is re-sliced for each flank.
			all := s.Seq
			if h.qStart >= cutoff {
				s.Seq = all[:h.qStart]
				if _, err := fmt.Fprintf(lf, "%60a\n", s); err != nil {
					return err
				}
			}
			if h.qLen-h.qEnd >= cutoff {
				s.Seq = all[h.qEnd:]
				if _, err := fmt.Fprintf(rf, "%60a\n", s); err != nil {
					return err
				}
			}
		}
		return sc.Error()
	}()

	lerr := lf.Close()
	rerr := rf.Close()
	switch {
	case err != nil:
		return err
	case lerr != nil:
		return lerr
	}
	return rerr
}
Пример #12
0
// readFasta returns the name and the string form of the first sequence in
// the fasta file named fn, read with a protein template sequence.
//
// On failure to open the file or to read a sequence it returns empty
// strings and the error.
func readFasta(fn string) (name string, seq string, err error) {
	fFasta, err := os.Open(fn)
	if err != nil {
		return "", "", err
	}
	// Register the close only once the file is known to be open.
	defer fFasta.Close()

	t := linear.NewSeq("", nil, alphabet.Protein)
	reader := fasta.NewReader(fFasta, t)
	s, err := reader.Read()
	if err != nil {
		return "", "", err
	}
	sl := s.(*linear.Seq)
	return sl.Name(), sl.String(), nil
}
Пример #13
0
// readContigs returns a map from sequence name to sequence length for every
// sequence in the fasta file named file.
//
// It returns a nil map and the error if the file cannot be opened or if the
// scanner reports an error after the last sequence.
func readContigs(file string) (map[string]int, error) {
	f, err := os.Open(file)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	lengths := make(map[string]int)
	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		s := sc.Seq()
		lengths[s.Name()] = s.Len()
	}
	// The scanner, not the earlier os.Open call, carries any read error.
	if err := sc.Error(); err != nil {
		return nil, err
	}
	return lengths, nil
}
Пример #14
0
// Clone returns a new linear sequence holding a copy of the letters of row
// r.Row of the underlying alignment, with the row's name and alphabet.
//
// It panics if r.Row is negative, is not less than r.Align.Rows(), or is
// not less than len(r.Align.SubAnnotations). The index is validated before
// any column is indexed so that a bad row is reported with these messages
// rather than as a bare runtime index error.
func (r Row) Clone() seq.Sequence {
	switch {
	case r.Row < 0:
		panic("under")
	case r.Row >= r.Align.Rows():
		panic("bang over Rows()")
	case r.Row >= len(r.Align.SubAnnotations):
		panic(fmt.Sprintf("bang over len(SubAnns): %d %d", r.Row, len(r.Align.SubAnnotations)))
	}

	b := make([]alphabet.Letter, r.Len())
	for i, c := range r.Align.Seq {
		b[i] = c[r.Row]
	}
	return linear.NewSeq(r.Name(), b, r.Alphabet())
}
Пример #15
0
// readContigs returns the sequences in the fasta file named file, keyed by
// sequence ID and read with the gapped DNA alphabet.
//
// It returns a nil map and the error if the file cannot be opened or if the
// scanner reports an error after the last sequence.
func readContigs(file string) (map[string]*linear.Seq, error) {
	f, err := os.Open(file)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	seqs := make(map[string]*linear.Seq)
	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)
		seqs[s.ID] = s
	}
	// The scanner, not the earlier os.Open call, carries any read error.
	if err := sc.Error(); err != nil {
		return nil, err
	}
	return seqs, nil
}
Пример #16
0
// main reads the fasta file named by the in flag and splits each sequence ID
// at its last "/" into a name and a suffix. Names seen with exactly one
// suffix are written, one per line, to <base>.unique.text; names seen with
// several suffixes are written with their suffix list to
// <base>.non-unique.text, where <base> is the base name of the input path.
//
// The program terminates via log.Fatalf if an ID contains no "/", or on any
// open, read, create or close failure.
func main() {
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}

	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}

	names := make(map[string][]string)

	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
	for sc.Next() {
		seq := sc.Seq().(*linear.Seq)
		idx := strings.LastIndex(seq.ID, "/")
		if idx < 0 {
			// Slicing with -1 would panic; report the offending ID instead.
			log.Fatalf("no %q separator in sequence id %q", "/", seq.ID)
		}
		name := seq.ID[:idx]
		names[name] = append(names[name], seq.ID[idx+1:])
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}
	f.Close()

	base := filepath.Base(*in)
	unique, err := os.Create(base + ".unique.text")
	if err != nil {
		log.Fatalf("failed to create %q: %v", base+".unique.text", err)
	}
	nonUnique, err := os.Create(base + ".non-unique.text")
	if err != nil {
		log.Fatalf("failed to create %q: %v", base+".non-unique.text", err)
	}
	for name, coords := range names {
		switch len(coords) {
		case 0:
		case 1:
			fmt.Fprintln(unique, name)
		default:
			fmt.Fprintf(nonUnique, "%s\t%v\n", name, coords)
		}
	}

	// Close explicitly so that a failed flush to disk is reported.
	if err := unique.Close(); err != nil {
		log.Fatalf("failed to close %q: %v", base+".unique.text", err)
	}
	if err := nonUnique.Close(); err != nil {
		log.Fatalf("failed to close %q: %v", base+".non-unique.text", err)
	}
}
Пример #17
0
// main splits the fasta file named by the in flag into numbered bundle
// files called <base>-<i>.fa in the current directory, where <base> is the
// base name of the input path. Sequences shorter than the cut flag are
// skipped. A new bundle is started when adding the next sequence to a
// non-empty bundle would take its total length over the bundle flag.
//
// The program terminates via log.Fatalf on any open, create, write, read or
// close failure.
func main() {
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}

	inFile, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open input:%v", err)
	}
	defer inFile.Close()
	*in = filepath.Base(*in)

	sc := seqio.NewScanner(fasta.NewReader(inFile, linear.NewSeq("", nil, alphabet.DNA)))

	var i, size int
	out, err := os.Create(fmt.Sprintf("%s-%d.fa", *in, i))
	if err != nil {
		log.Fatalf("failed to open file bundle %d: %v", i, err)
	}
	for sc.Next() {
		s := sc.Seq()
		if s.Len() < *cut {
			continue
		}
		if size != 0 && size+s.Len() > *bundle {
			err = out.Close()
			if err != nil {
				log.Fatalf("failed to close file bundle %d: %v", i, err)
			}
			i++
			size = 0
			out, err = os.Create(fmt.Sprintf("%s-%d.fa", *in, i))
			if err != nil {
				log.Fatalf("failed to open file bundle %d: %v", i, err)
			}
		}
		size += s.Len()
		if _, err := fmt.Fprintf(out, "%60a\n", s); err != nil {
			log.Fatalf("failed to write to file bundle %d: %v", i, err)
		}
	}
	if err := sc.Error(); err != nil {
		log.Fatal(err)
	}
	err = out.Close()
	if err != nil {
		log.Fatalf("failed to close file bundle %d: %v", i, err)
	}
}
Пример #18
0
// metaSeq reads the body of a "##"-prefixed sequence block from the
// underlying reader, up to the line "##end-<moltype>", and returns it as a
// linear sequence named id.
//
// Blank lines are skipped and all white space inside body lines is removed.
// io.EOF is returned unchanged if the input ends before the terminator; any
// other read error, and any non-blank line lacking the "##" prefix, is
// returned as a *csv.ParseError carrying the current line number.
// ErrBadMoltype is returned when moltype does not parse to DNA, RNA or
// protein. The alphabet of the result is chosen from the parsed moltype.
func (r *Reader) metaSeq(moltype, id []byte) (seq.Sequence, error) {
	terminator := "end-" + unsafeString(moltype)

	var body []byte
	for {
		line, err := r.r.ReadBytes('\n')
		if err == io.EOF {
			return nil, err
		}
		if err != nil {
			return nil, &csv.ParseError{Line: r.line, Err: err}
		}
		r.line++

		line = bytes.TrimSpace(line)
		if len(line) == 0 {
			continue
		}
		if len(line) < 2 || !bytes.HasPrefix(line, []byte("##")) {
			return nil, &csv.ParseError{Line: r.line, Err: ErrBadSequence}
		}

		line = bytes.TrimSpace(line[2:])
		if unsafeString(line) == terminator {
			break
		}
		body = append(body, bytes.Join(bytes.Fields(line), nil)...)
	}

	var alpha alphabet.Alphabet
	switch feat.ParseMoltype(unsafeString(moltype)) {
	case feat.DNA:
		alpha = alphabet.DNA
	case feat.RNA:
		alpha = alphabet.RNA
	case feat.Protein:
		alpha = alphabet.Protein
	default:
		return nil, ErrBadMoltype
	}

	// Every read error returns inside the loop, so the error is nil here.
	return linear.NewSeq(string(id), alphabet.BytesToLetters(body), alpha), nil
}
Пример #19
0
// TestWriteFasta writes each name in expectN with the matching letters in
// expectS through a width-60 writer and checks that the summed byte counts
// equal the buffer length and that the buffer text equals fas[0].
func (s *S) TestWriteFasta(c *check.C) {
	var buf bytes.Buffer
	w := NewWriter(&buf, 60)

	// A single sequence value is reused for every record.
	rec := linear.NewSeq("", nil, alphabet.Protein)

	written := 0
	for i := range expectN {
		rec.ID, rec.Seq = expectN[i], expectS[i]
		count, err := w.Write(rec)
		if err != nil {
			c.Fatalf("Failed to write to buffer: %s", err)
		}
		written += count
	}

	c.Check(written, check.Equals, buf.Len())
	c.Check(buf.String(), check.Equals, fas[0])
}
Пример #20
0
// mangle reads fasta sequences from standard input and prints each one with
// its ID replaced by the 40-digit hexadecimal SHA-1 of its new description.
// The new description is the original ID, followed by a space and the
// original description when there was one.
//
// The program terminates via log.Fatalf if two sequences hash to the same
// ID or if the scanner reports an error after the last sequence.
func mangle() {
	seen := make(map[string]bool)
	hash := sha1.New()
	sc := seqio.NewScanner(fasta.NewReader(os.Stdin, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)
		if s.Desc == "" {
			s.Desc = s.ID
		} else {
			s.Desc = fmt.Sprintf("%s %s", s.ID, s.Desc)
		}
		hash.Write([]byte(s.Desc))
		s.ID = fmt.Sprintf("%040x", hash.Sum(nil))
		if seen[s.ID] {
			log.Fatalf("duplicate sha1: %s", s.ID)
		}
		seen[s.ID] = true
		// Start the next sequence from an empty hash state.
		hash.Reset()
		fmt.Printf("%60a\n", s)
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}
}
Пример #21
0
// main computes a complexity score for every sequence in the fasta file
// named by the in flag, using the measure chosen by the typ flag (0: WF,
// 1: Entropic, 2: Z). With the dist flag set it prints name, score and
// length for each sequence; otherwise it prints only the sequences whose
// score is at least the thresh flag.
func main() {
	flag.Parse()
	if *in == "" || *typ < 0 || 2 < *typ {
		flag.Usage()
		os.Exit(1)
	}

	// *typ is known to be 0, 1 or 2 here.
	var cfn func(s seq.Sequence, start, end int) (float64, error)
	switch *typ {
	case 0:
		cfn = complexity.WF
	case 1:
		cfn = complexity.Entropic
	case 2:
		cfn = complexity.Z
	}

	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	defer f.Close()

	template := linear.NewSeq("", nil, alphabet.DNAgapped)
	sc := seqio.NewScanner(fasta.NewReader(f, template))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)

		// err is always nil for a linear.Seq Start() and End().
		score, _ := cfn(s, s.Start(), s.End())

		switch {
		case *dist:
			fmt.Printf("%s\t%v\t%d\n", s.Name(), score, s.Len())
		case score >= *thresh:
			fmt.Printf("%60a\n", s)
		}
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}
}
Пример #22
0
// unmangle reads fasta sequences from standard input to build a table from
// each sequence ID to the first white-space-separated field of its
// description, then rewrites the file named mapfile to standard output:
// each line is split on white space, the field at index *queryNameField is
// replaced by its table entry, and the fields are printed tab-separated.
//
// The program terminates via log.Fatalf when a sequence has an empty
// description, when a line has too few fields, when a query name has no
// table entry, or on any open or read failure.
func unmangle(mapfile string) {
	table := make(map[string]string)
	sc := seqio.NewScanner(fasta.NewReader(os.Stdin, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)
		// strings.Fields never yields empty strings, so a missing id shows
		// up as an empty slice rather than as an empty first element.
		descFields := strings.Fields(s.Desc)
		if len(descFields) == 0 {
			log.Fatalf("no id for sequence %s", s.ID)
		}
		table[s.ID] = descFields[0]
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}

	f, err := os.Open(mapfile)
	if err != nil {
		log.Fatalf("failed to open map file %q: %v", mapfile, err)
	}
	ls := bufio.NewScanner(f)
	for ls.Scan() {
		line := ls.Text()
		fields := strings.Fields(line)
		if len(fields) <= *queryNameField {
			log.Fatalf("unexpected number of fields in line %q", line)
		}
		id := table[fields[*queryNameField]]
		if id == "" {
			log.Fatalf("no id for map query %s", fields[*queryNameField])
		}
		fields[*queryNameField] = id
		fmt.Println(strings.Join(fields, "\t"))
	}
	err = ls.Err()
	f.Close()
	if err != nil {
		log.Fatalf("failed to read map file %q: %v", mapfile, err)
	}
}
Пример #23
0
// stringToSeq returns an unnamed DNA sequence whose letters are the bytes
// of s.
func stringToSeq(s string) *linear.Seq {
	letters := alphabet.BytesToLetters([]byte(s))
	return linear.NewSeq("", letters, alphabet.DNA)
}
Пример #24
0
// main reads GFF features from standard input and, for each feature with a
// non-empty "Read" attribute, records the name, start and end held in the
// attribute's first three fields. It then scans each SAM file named on the
// command line and prints, for every record whose name was recorded, the
// recorded sub-range of the read as a DNA sequence whose name carries a
// "//start_end" suffix.
//
// For records flagged as reverse the coordinates are mirrored about the
// read length, "(-)" is appended to the name and a description is added.
// The program terminates via log.Fatalf on malformed "Read" attributes and
// on any open or read failure.
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "invalid invocation: must have at least one reads file")
		os.Exit(1)
	}

	extract := make(map[string][2]int)
	sc := featio.NewScanner(gff.NewReader(os.Stdin))
	for sc.Next() {
		f := sc.Feat().(*gff.Feature)
		read := f.FeatAttributes.Get("Read")
		if read == "" {
			continue
		}
		fields := strings.Fields(read)
		if len(fields) < 3 {
			// Indexing fields[1] or fields[2] below would panic.
			log.Fatalf("bad record: %+v", f)
		}
		name := fields[0]
		start, err := strconv.Atoi(fields[1])
		if err != nil {
			log.Fatalf("failed to parse %q: %v", read, err)
		}
		end, err := strconv.Atoi(fields[2])
		if err != nil {
			log.Fatalf("failed to parse %q: %v", read, err)
		}
		extract[name] = [2]int{start, end}
	}
	err := sc.Error()
	if err != nil {
		log.Fatalf("error during GFF read: %v", err)
	}

	for _, reads := range os.Args[1:] {
		sf, err := os.Open(reads)
		if err != nil {
			log.Fatalf("failed to open %q: %v", reads, err)
		}
		sr, err := sam.NewReader(sf)
		if err != nil {
			log.Fatalf("failed to open SAM input %q: %v", reads, err)
		}
		for {
			r, err := sr.Read()
			if err != nil {
				if err != io.EOF {
					log.Fatalf("unexpected error reading SAM: %v", err)
				}
				break
			}

			v, ok := extract[r.Name]
			if !ok {
				continue
			}
			// Currently reefer only expects a single hit per read,
			// so any multiples are due to duplicate read file input.
			// Update this behaviour if we change reefer to look at
			// remapping soft-clipped segments.
			delete(extract, r.Name)

			reverse := r.Flags&sam.Reverse != 0
			rng := fmt.Sprintf("//%d_%d", v[0], v[1])
			if reverse {
				rng += "(-)"
				n := r.Seq.Length
				v[0], v[1] = n-v[1], n-v[0]
			}
			v[0] = feat.OneToZero(v[0])
			s := linear.NewSeq(
				r.Name+rng,
				alphabet.BytesToLetters(r.Seq.Expand())[v[0]:v[1]],
				alphabet.DNA,
			)
			if reverse {
				s.Desc = "(sequence revcomp relative to read)"
			}
			fmt.Printf("%60a\n", s)
		}
		sf.Close()
	}
}
Пример #25
0
##DNA <seqname>
##acggctcggattggcgctggatgatagatcagacgac
##...
##end-DNA
##RNA <seqname>
##acggcucggauuggcgcuggaugauagaucagacgac
##...
##end-RNA
##Protein <seqname>
##MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF
##...
##end-Protein
##sequence-region <seqname> 1 5
`,
			feat: []feat.Feature{
				linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggctcggattggcgctggatgatagatcagacgac...")), alphabet.DNA),
				linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggcucggauuggcgcuggaugauagaucagacgac...")), alphabet.RNA),
				linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF...")), alphabet.Protein),
				&Region{Sequence: Sequence{SeqName: "<seqname>", Type: feat.DNA}, RegionStart: 0, RegionEnd: 5},
			},
			write: []interface{}{
				2,
				"source-version <source> <version-text>",
				mustTime(time.Parse(Astronomical, "1997-11-08")),
				Sequence{SeqName: "<seqname>", Type: feat.DNA},
				linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggctcggattggcgctggatgatagatcagacgac...")), alphabet.DNA),
				linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("acggcucggauuggcgcuggaugauagaucagacgac...")), alphabet.RNA),
				linear.NewSeq("<seqname>", alphabet.BytesToLetters([]byte("MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF...")), alphabet.Protein),
				&Region{Sequence: Sequence{SeqName: "<seqname>"}, RegionStart: 0, RegionEnd: 5},
			},
		},
Пример #26
0
// main builds a consensus sequence and multiple alignment for the fasta
// file given as the first positional argument.
//
// All sequences are read; with the subsample flag they are shuffled and, if
// maxFam is positive and smaller than the number read, only the first maxFam
// shuffled sequences are passed on. Without subsample, a positive maxFam
// smaller than the number of sequences is a fatal error. The included
// sequences are listed in <input>_included_sequences.fq under aDir, passed
// to consensus, and the results are written to three files: the ratio of
// consensus length to mean sequence length (cDir), the consensus (cDir) and
// the multiple alignment (aDir). With the fasta flag the consensus and
// alignment are written with the %60a verb, otherwise with %q.
//
// The program prints usage and exits with status 1 when no input is given,
// and terminates via log.Fatalf on I/O failures or when no sequences are
// read. A failure of consensus is logged and main returns.
func main() {
	flag.IntVar(&maxFam, "maxFam", 0, "maxFam indicates maximum family size considered (0 == no limit).")
	flag.BoolVar(&subSample, "subsample", false, "choose maxFam members of a family if the family has more than maxFam members.")
	flag.BoolVar(&consFasta, "fasta", false, "output consensus as fasta with quality case filtering.")
	flag.StringVar(&cDir, "cDir", "", "target directory for consensus output. If not empty Dir is deleted first.")
	flag.StringVar(&aDir, "aDir", "", "target directory for alignment output. If not empty dir is deleted first.")
	flag.StringVar(&sDir, "sDir", "", "target directory for sequence information output. If not empty dir is deleted first.")
	flag.Parse()

	if flag.NArg() < 1 {
		flag.Usage()
		os.Exit(1)
	}
	input := flag.Arg(0)

	fmt.Printf("Initialising Files: %s\n", input)

	checks(cDir, aDir, sDir)

	// Opening files
	f, err := os.Open(input)
	if err != nil {
		log.Fatalf("error: could not open %s to read %v", input, err)
	}
	defer f.Close()

	// Reading in sequences
	var v []seq.Sequence
	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		v = append(v, sc.Seq())
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("failed to read sequences: %v", err)
	}
	// The averages below divide by the number of sequences.
	if len(v) == 0 {
		log.Fatalf("no sequences read from %s", input)
	}
	// Checking that there aren't too many sequences
	if maxFam != 0 && !subSample && len(v) > maxFam {
		log.Fatalf("too many sequences: %d", len(v))
	}

	if subSample {
		// Shuffle first.
		w := make([]seq.Sequence, 0, len(v))
		for _, j := range rand.Perm(len(v)) {
			w = append(w, v[j])
		}
		v = w
	}

	// Mean length over all sequences read. This is needed for the
	// all-sequence ratio whether or not sub-sampling is enabled.
	totalLength := 0
	for _, s := range v {
		totalLength += s.Len()
	}
	aveLength := totalLength / len(v)
	fmt.Printf("Average length: %v\n", aveLength)

	// FIXME(brittany): Make this more scientifically robust.
	included := v
	if subSample && maxFam > 0 && len(v) > maxFam {
		included = v[:maxFam]
	}

	// creating a file for the sequence information
	// NOTE(review): the sDir flag is described as the target for sequence
	// information, but this file has always been created under aDir;
	// confirm which directory is intended.
	seqFile := fmt.Sprintf("%s_included_sequences.fq", input)
	seqOut, err := os.Create(filepath.Join(aDir, seqFile))
	if err != nil {
		log.Fatalf("failed to create %s: %v", seqFile, err)
	}
	seqBufOut := bufio.NewWriter(seqOut)

	// printing subsampled sequences to a file
	var buf bytes.Buffer
	lenSubSeq := make([]int, len(included)) // length of each sequence
	totalSubLength := 0
	for i, s := range included {
		fmt.Fprintf(&buf, "%60a\n", s)
		fmt.Fprintf(seqBufOut, "including: %s %s\n", s.Name(), s.Description())

		// Calculating lengths and averages for the subsampled sequences
		lenSubSeq[i] = s.Len()
		totalSubLength += lenSubSeq[i]
	}
	if err := seqBufOut.Flush(); err != nil {
		log.Fatalf("failed to write %s: %v", seqFile, err)
	}
	if err := seqOut.Close(); err != nil {
		log.Fatalf("failed to close %s: %v", seqFile, err)
	}

	fmt.Printf("total length of subbed: %v\n", totalSubLength)
	aveSubLength := totalSubLength / len(included)
	fmt.Printf("Average length of subbed: %v\n", aveSubLength)
	fmt.Printf("The sub seq variable for %v sequences: %v\n", len(included), lenSubSeq)

	var (
		c *linear.QSeq
		m *multi.Multi
	)
	// Creating the consensus

	fmt.Println("Creating consensus")
	c, m, err = consensus(&buf)
	if err != nil {
		log.Printf("failed to generate consensus for %s: %v", input, err)
		return
	}
	c.ID = fmt.Sprintf("%s_consensus", input)
	c.Desc = fmt.Sprintf("%d members total %v members sampled", len(v), len(included))
	c.Threshold = 42
	c.QFilter = seq.CaseFilter
	conLength := c.Len()

	// Find a way to make the consensus the same case

	// Calculating cutoff of consensus length to mean sequence length
	fmt.Printf("Length of consensus:%v\n", conLength)
	cutoffTotal := float64(conLength) / float64(aveLength)
	cutoffSubs := float64(conLength) / float64(aveSubLength)
	fmt.Printf("ratio for all seqs: %f, \nratio for just sub-sampled: %f\n", cutoffTotal, cutoffSubs)

	// Creating a file for the consensus length information
	confile := fmt.Sprintf("%s_consensus-length-%v.txt", input, maxFam)
	conOut, err := os.Create(filepath.Join(cDir, confile))
	if err != nil {
		log.Fatalf("failed to create %s: %v", confile, err)
	}
	fmt.Fprintf(conOut, "number sampled: %v \nratio for all seqs: %f \nratio for just sub-sampled: %f\n\n %f\t%f", len(included), cutoffTotal, cutoffSubs, cutoffTotal, cutoffSubs)
	if err := conOut.Close(); err != nil {
		log.Fatalf("failed to close %s: %v", confile, err)
	}

	// Creating a file for the consensus
	file := fmt.Sprintf("%s_consensus.fq%v", input, maxFam)
	out, err := os.Create(filepath.Join(cDir, file))
	if err != nil {
		log.Fatalf("failed to create %s: %v", file, err)
	}
	if consFasta {
		fmt.Fprintf(out, "%60a\n", c)
	} else {
		fmt.Fprintf(out, "%q\n", c)
	}
	if err := out.Close(); err != nil {
		log.Fatalf("failed to close %s: %v", file, err)
	}

	// creating a file for the multiple alignment
	alignFile := fmt.Sprintf("%s_multiple_alignment%v.fq", input, maxFam)
	alignOut, err := os.Create(filepath.Join(aDir, alignFile))
	if err != nil {
		log.Fatalf("failed to create %s: %v", alignFile, err)
	}
	if consFasta {
		fmt.Fprintf(alignOut, "%60a\n", m)
	} else {
		fmt.Fprintf(alignOut, "%q\n", m)
	}
	if err := alignOut.Close(); err != nil {
		log.Fatalf("failed to close %s: %v", alignFile, err)
	}

	fmt.Printf("Complete\n\n")
}
Пример #27
0
// adjust performs a refinement of the ends of the event d based on a
// pair of Smith-Waterman alignments.
//
//                    l      s   e      r
//  ref:         -----|------+~~~+------|----------
//
//  query_left:  ----|-----------+~~~~~~|~~~~~~+---------------
//                   l           s      m      e
//  query_right: ----------------+~~~~~~|~~~~~~+-----------|---
//                               s      m      e           r
//
//  where ~~ is the region found by CIGAR score walking above in the
//  deletions function.
//
//  align ref(l..r) with query_left(l..m) -> ref(s)-query_left(s)
//  align ref(l..r) with query_right(m..r) -> ref(e)-query_right(e)
//
// This can give either of two outcomes:
//  1. ref(s) < ref(e)
//  2. ref(e) <= ref(s)
//
// The first case is a standard colinear alignment:
//
//                              s   e
//  ref:             -----------+---+-----------------
//                             /     \
//                            /       \
//                           /         \
//                          /           \
//  query: ----------------+-------------+---------------
//                         s             e
//
//
// The second case is a non-colinear alignment:
//
//                              e   s
//  ref:             -----------+---+-----------------
//                               \ /
//                                /
//                               / \
//                              /   \
//                             /     \
//                            /       \
//                           /         \
//                          /           \
//  query: ----------------+-------------+---------------
//                         s             e
//
//
// which has a potential target site duplication interpretation:
//
//                              e   s
//  ref:             -----------+---+-----------------
//                             / \ / \
//                            /   /   \
//                           /   / \   \
//                          /   /   \   \
//                         /   /     \   \
//                        /   /       \   \
//                       /   /         \   \
//                      /   /           \   \
//  query: ------------+---+-------------+---+-----------
//                         s             e
//
// adjust handles the second case by making ref(s=e) for the
// reference and adding annotation for the length of the duplication
// (d) in ref:
//
//                             s|e s+d
//  ref:             -----------+---+-----------------
//                             / \ / \
//                            /   /   \
//                           /   / \   \
//                          /   /   \   \
//                         /   /     \   \
//                        /   /       \   \
//                       /   /         \   \
//                      /   /           \   \
//  query: ------------+---+-------------+---+-----------
//                    s-d  s             e  e+d
//
// If r is nil, d is returned unchanged with ok false and a nil error.
// Otherwise ok is true only when refinement succeeded. An error is
// returned, together with the unmodified d and ok false, when the query
// span of d is shorter than its reference span, when r.ref has no entry
// for the record's reference name, when either alignment fails, or when
// the reference flank or query gap thresholds are not met.
func (r *refiner) adjust(d deletion) (refined deletion, ok bool, err error) {
	if r == nil {
		return d, false, nil
	}
	if d.qend-d.qstart < d.rend-d.rstart {
		// Do not do any work for deletions.
		return d, false, fmt.Errorf("not an insertion: len(q)=%d len(r)=%d", d.qend-d.qstart, d.rend-d.rstart)
	}

	name := d.record.Ref.Name()
	ref, ok := r.ref[name]
	if !ok {
		return d, false, fmt.Errorf("no reference sequence for %q", name)
	}

	// Take a shallow copy of the reference and restrict it to a window of
	// r.refWindow/2 on either side of the event, clamped to the sequence.
	// rOff converts window coordinates back to reference coordinates.
	rs := *ref
	rOff := max(0, d.rstart-r.refWindow/2)
	rs.Seq = ref.Seq[rOff:min(d.rend+r.refWindow/2, len(ref.Seq))]

	q := alphabet.BytesToLetters(d.record.Seq.Expand())

	// Align the left junction of the query to
	// the reference around the indel site.
	qsl := linear.NewSeq(d.record.Name, nil, alphabet.DNAgapped)
	qOffLeft := max(0, d.qstart-r.queryWindow)
	qsl.Seq = q[qOffLeft : (d.qstart+d.qend)/2]
	alnl, err := r.sw.Align(&rs, qsl)
	if err != nil {
		return d, false, err
	}

	// Align the right junction of the query to
	// the reference around the indel site.
	qsr := linear.NewSeq(d.record.Name, nil, alphabet.DNAgapped)
	qOffRight := (d.qstart + d.qend) / 2
	qsr.Seq = q[qOffRight:min(d.qend+r.queryWindow, len(q))]
	alnr, err := r.sw.Align(&rs, qsr)
	if err != nil {
		return d, false, err
	}

	// Get left and right ends of insertion in read
	// and the aligned segment of the reference.
	// Index 0 of each feature pair is used below for reference
	// coordinates and index 1 for query coordinates.
	// NOTE(review): both index expressions panic if the corresponding
	// alignment is empty; confirm Align cannot return an empty result
	// together with a nil error.
	left := alnl[len(alnl)-1].Features()
	right := alnr[0].Features()

	// Bail out if the alignment extends too far.
	// We might have continued alignment.
	if flank := right[0].Start(); flank < r.minRefFlank {
		return d, false, fmt.Errorf("skipping: right ref flank less than %d from left: len(flank)=%v",
			r.minRefFlank, flank)
	}
	if flank := left[0].End(); len(rs.Seq)-flank < r.minRefFlank {
		return d, false, fmt.Errorf("skipping: left ref flank less than %d from right: len(flank)=%v",
			r.minRefFlank, len(rs.Seq)-flank)
	}

	// Positions of the query midpoint within the left and right query
	// segments respectively.
	// NOTE(review): centrel assumes the left segment starts exactly
	// r.queryWindow before d.qstart, but qOffLeft is clamped at 0 above;
	// confirm this is acceptable for events near the start of the read.
	centrel := r.queryWindow + (d.qend-d.qstart)/2
	centrer := 0

	// Bail out if the insertion is too short.
	// We might have continued alignment.
	if gap := centrel - left[1].End(); gap < r.minQueryGap {
		return d, false, fmt.Errorf("skipping left: left query gap less than %d from centre: len(gap)=%v",
			r.minQueryGap, gap)
	}
	if gap := right[1].Start() - centrer; gap < r.minQueryGap {
		return d, false, fmt.Errorf("skipping right: right query gap less than %d from centre: len(gap)=%v",
			r.minQueryGap, gap)
	}

	// Translate the aligned ends back into full reference coordinates.
	// When the ends cross or meet, record the overlap as the duplication
	// length and collapse the reference interval onto the right end.
	d.rstart = rOff + left[0].End()
	d.rend = rOff + right[0].Start()
	if d.rend <= d.rstart {
		d.dup = d.rstart - d.rend
		d.rstart = d.rend
	}

	// Translate the aligned ends back into full query coordinates.
	d.qstart = qOffLeft + left[1].End()
	d.qend = qOffRight + alnr[0].Features()[1].Start()

	return d, true, nil
}
Пример #28
0
// main annotates insertion features with a "TSD" attribute derived from an
// alignment of the sequence on either side of each insertion.
//
// GFF features are read from the file named by the in flag and grouped by
// the first field of their "Read" attribute. Each fasta file given as a
// positional argument is then scanned; for every sequence with recorded
// features, a left and a right window around the [start, end) interval
// taken from the "Read" attribute are aligned against each other, and the
// feature is written to standard output with the added attribute. When the
// fastaOut flag is set, the [start, end) slice of the sequence is also
// written to that file for each feature.
//
// The program terminates via log.Fatalf on malformed records and on open,
// read or alignment failures.
func main() {
	flag.Var(&alnmat, "align", "specify the match, mismatch and gap parameters")
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}

	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	// Group the features by the name in the first "Read" field.
	events := make(map[string][]*gff.Feature)
	fsc := featio.NewScanner(gff.NewReader(f))
	for fsc.Next() {
		// This f shadows the input file for the body of the loop.
		f := fsc.Feat().(*gff.Feature)
		fields := strings.Fields(f.FeatAttributes.Get("Read"))
		if len(fields) != 3 {
			log.Fatalf("bad record: %+v", f)
		}
		events[fields[0]] = append(events[fields[0]], f)
	}
	if err := fsc.Error(); err != nil {
		log.Fatalf("error during gff read: %v", err)
	}
	f.Close()

	// NOTE(review): the results of WriteComment here and of w.Write below
	// are not inspected; confirm whether write failures should be fatal.
	w := gff.NewWriter(os.Stdout, 60, true)
	w.WriteComment("Right coordinates (field 5) and strand (field 7) are hypothetical.")

	// out stays nil unless a fasta output file was requested.
	var out *os.File
	if *fastaOut != "" {
		out, err = os.Create(*fastaOut)
		if err != nil {
			log.Fatalf("failed to create fasta insertion output file %q: %v", *fastaOut, err)
		}
		defer out.Close()
	}

	// hw is the half-width of the window placed around each end when no
	// duplication length is recorded on the feature.
	hw := *window / 2
	sw := makeTable(alphabet.DNAgapped, alnmat)
	for _, ref := range flag.Args() {
		f, err = os.Open(ref)
		if err != nil {
			log.Fatalf("failed to open reference %q: %v", ref, err)
		}
		ssc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
		// NOTE(review): the label is on the sequence loop, so each
		// "continue loop" below moves on to the next sequence and skips any
		// remaining features of the current one; confirm this is intended
		// rather than continuing with the next feature.
	loop:
		for ssc.Next() {
			seq := ssc.Seq().(*linear.Seq)
			// This f shadows the reference file for the body of the loop.
			for _, f := range events[seq.Name()] {
				fields := strings.Fields(f.FeatAttributes.Get("Read"))
				if len(fields) != 3 {
					log.Fatalf("bad record: %+v", f)
				}
				start, err := strconv.Atoi(fields[1])
				if err != nil {
					log.Fatalf("failed to get start coordinate: %v", err)
				}
				end, err := strconv.Atoi(fields[2])
				if err != nil {
					log.Fatalf("failed to get end coordinate: %v", err)
				}

				if out != nil {
					// Write the [start, end) slice from a shallow copy so
					// that seq itself is left untouched.
					insert := *seq
					if insert.Desc != "" {
						insert.Desc += " "
					}
					insert.Desc += fmt.Sprintf("[%d,%d)", start, end)
					insert.Seq = insert.Seq[start:end]
					fmt.Fprintf(out, "%60a\n", &insert)
				}

				var lOff, lEnd, rOff, rEnd int
				// If we have refined ends, use them.
				if dup := f.FeatAttributes.Get("Dup"); dup != "" {
					d, err := strconv.Atoi(dup)
					if err != nil {
						log.Fatalf("failed to get duplication length: %v", err)
					}
					lOff = max(0, start-d)
					lEnd = start
					rOff = end
					rEnd = min(len(seq.Seq), end+d)
				} else {
					lOff = max(0, start-hw)
					lEnd = min(len(seq.Seq), start+hw)
					rOff = max(0, end-hw)
					rEnd = min(len(seq.Seq), end+hw)

					// Ensure windows don't overlap.
					if lEnd > rOff {
						lEnd = (lEnd + rOff) / 2
						rOff = lEnd
					}
				}

				if lEnd-lOff < *thresh || rEnd-rOff < *thresh {
					// Don't do fruitless work.
					continue loop
				}

				// Shallow copies of seq restricted to the two windows.
				left := *seq
				left.ID = "prefix"
				left.Seq = left.Seq[lOff:lEnd]
				right := *seq
				right.ID = "postfix"
				right.Seq = right.Seq[rOff:rEnd]

				aln, err := sw.Align(&right, &left)
				if err != nil {
					log.Fatal(err)
				}

				// Skip the feature when either formatted alignment row has
				// fewer than *thresh non-gap letters.
				fa := align.Format(&right, &left, aln, '-')
				for _, seg := range fa {
					var n int
					for _, l := range seg.(alphabet.Letters) {
						if l != '-' {
							n++
						}
					}
					if n < *thresh {
						continue loop
					}
				}

				// Sum the scores of all alignment segments.
				var sc int
				for _, seg := range aln {
					type scorer interface {
						Score() int
					}
					sc += seg.(scorer).Score()
				}
				f.FeatAttributes = append(f.FeatAttributes, gff.Attribute{
					Tag: "TSD", Value: fmt.Sprintf(`%v %d %d %v "%v" %d`,
						fa[0], aln[len(aln)-1].Features()[0].End()+lOff,
						aln[0].Features()[1].Start()+rOff, fa[1],
						aln, sc),
				})
				w.Write(f)
			}
		}
		if err := ssc.Error(); err != nil {
			log.Fatalf("error during fasta read: %v", err)
		}
		f.Close()
	}
}