Beispiel #1
0
// main filters a sequence stream by name. It loads one name per line from the
// file given by the exclude flag, then copies every sequence read from
// standard input whose ID is not in that set to standard output using the
// "%60a" format verb.
func main() {
	flag.Parse()
	if *exclude == "" {
		flag.Usage()
		os.Exit(1)
	}

	nameSet := make(map[string]struct{})
	f, err := os.Open(*exclude)
	if err != nil {
		log.Fatalf("failed to open exclude file %q: %v", *exclude, err)
	}
	ls := bufio.NewScanner(f)
	for ls.Scan() {
		nameSet[ls.Text()] = struct{}{}
	}
	err = ls.Err()
	// The exclude list is fully consumed at this point, so release the handle
	// before the pass over standard input rather than holding it until exit.
	f.Close()
	if err != nil {
		log.Fatalf("failed to read exclude file: %v", err)
	}

	sc := seqio.NewScanner(fasta.NewReader(os.Stdin, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)
		if _, ok := nameSet[s.ID]; ok {
			continue
		}
		fmt.Printf("%60a\n", s)
	}
	if err := sc.Error(); err != nil {
		// The stream scanned here is the fasta input, not a gff file.
		log.Fatalf("error during fasta read: %v", err)
	}
}
Beispiel #2
0
// TestReadFastq scans the fq0 fixture with a fastq reader and verifies that
// the scan finishes without error and that the collected headers and quality
// letters match expectNfq and expectQL.
func (s *S) TestReadFastq(c *check.C) {
	var (
		gotHeaders []string
		gotQuals   [][]alphabet.QLetter
	)

	src := bytes.NewBufferString(fq0)
	template := linear.NewQSeq("", nil, alphabet.DNA, alphabet.Sanger)
	sc := seqio.NewScanner(fastq.NewReader(src, template))
	for sc.Next() {
		rec := sc.Seq().(*linear.QSeq)

		// A header is the name, followed by the description when there is one.
		line := rec.Name()
		if d := rec.Description(); d != "" {
			line = line + " " + d
		}

		gotHeaders = append(gotHeaders, line)
		gotQuals = append(gotQuals, rec.Slice().(alphabet.QLetters))
	}

	c.Check(sc.Error(), check.Equals, nil)
	c.Check(gotHeaders, check.DeepEquals, expectNfq)
	c.Check(gotQuals, check.DeepEquals, expectQL)
}
Beispiel #3
0
// TestReadFasta scans the testaln0 fixture with a fasta reader using the
// protein alphabet and verifies that the scan finishes without error, that the
// collected headers match expectNfa and that each collected sequence matches
// the corresponding entry of expectSfa.
func (s *S) TestReadFasta(c *check.C) {
	var (
		obtainNfa []string
		obtainSfa [][]alphabet.Letter
	)

	sc := seqio.NewScanner(
		fasta.NewReader(
			bytes.NewBufferString(testaln0),
			linear.NewSeq("", nil, alphabet.Protein),
		),
	)
	for sc.Next() {
		t := sc.Seq().(*linear.Seq)
		header := t.Name()
		if desc := t.Description(); len(desc) > 0 {
			header += " " + desc
		}
		obtainNfa = append(obtainNfa, header)
		obtainSfa = append(obtainSfa, t.Slice().(alphabet.Letters))
	}
	c.Check(sc.Error(), check.Equals, nil)
	c.Check(obtainNfa, check.DeepEquals, expectNfa)

	// Stop on a record-count mismatch: indexing expectSfa with more obtained
	// records than expected would panic, and fewer obtained records would
	// otherwise leave the tail of expectSfa unverified.
	c.Assert(len(obtainSfa), check.Equals, len(expectSfa))
	for i := range obtainSfa {
		c.Check(len(obtainSfa[i]), check.Equals, len(expectSfa[i]))
		c.Check(obtainSfa[i], check.DeepEquals, expectSfa[i])
	}
}
Beispiel #4
0
// main prints the sub-sequence covered by each event in a GFF file.
//
// Events are read from the file named by the in flag and grouped by the first
// field of their "Read" attribute, which must hold exactly three
// whitespace-separated fields: a sequence name, a start and an end. Each
// remaining command line argument is a fasta file; for every sequence in it
// that has events, the [start,end) slice of the sequence is written to
// standard output with "//start_end" appended to the ID.
func main() {
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}

	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	events := make(map[string][]*gff.Feature)
	fsc := featio.NewScanner(gff.NewReader(f))
	for fsc.Next() {
		feat := fsc.Feat().(*gff.Feature)
		fields := strings.Fields(feat.FeatAttributes.Get("Read"))
		if len(fields) != 3 {
			log.Fatalf("bad record: %+v", feat)
		}
		events[fields[0]] = append(events[fields[0]], feat)
	}
	if err := fsc.Error(); err != nil {
		log.Fatalf("error during gff read: %v", err)
	}
	f.Close()

	for _, ref := range flag.Args() {
		f, err = os.Open(ref)
		if err != nil {
			log.Fatalf("failed to open reference %q: %v", ref, err)
		}
		ssc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA)))
		for ssc.Next() {
			seq := ssc.Seq().(*linear.Seq)
			for _, feat := range events[seq.Name()] {
				fields := strings.Fields(feat.FeatAttributes.Get("Read"))
				if len(fields) != 3 {
					log.Fatalf("bad record: %+v", feat)
				}
				start, err := strconv.Atoi(fields[1])
				if err != nil {
					log.Fatalf("failed to get start coordinate: %v", err)
				}
				end, err := strconv.Atoi(fields[2])
				if err != nil {
					log.Fatalf("failed to get end coordinate: %v", err)
				}
				// Validate against len, not just the slice bounds Go enforces:
				// a slice expression may legally extend up to cap, which would
				// silently emit letters that are not part of the sequence.
				if start < 0 || end < start || end > len(seq.Seq) {
					log.Fatalf("coordinates [%d,%d) out of range for %q of length %d",
						start, end, seq.Name(), len(seq.Seq))
				}
				// Work on a shallow copy so the scanned sequence keeps its own
				// ID and full letter slice for the next event.
				tmp := *seq
				tmp.ID += fmt.Sprintf("//%d_%d", start, end)
				tmp.Seq = tmp.Seq[start:end]
				fmt.Printf("%60a\n", &tmp)
			}
		}
		if err := ssc.Error(); err != nil {
			log.Fatalf("error during fasta read: %v", err)
		}
		f.Close()
	}
}
Beispiel #5
0
// writeFlankSeqs writes fasta files containing the sequence of unmapped flanks
// identified in the primary hits provided. cutoff specifies the minimum sequence
// length to consider. left and right specify the filenames for the left and right
// flank fasta sequence files.
//
// Sequences in reads that have no entry in hits are skipped. The first error
// encountered while opening, reading, writing or closing a file is returned;
// all files opened by the call are closed before it returns.
func writeFlankSeqs(reads string, hits hitSet, cutoff int, left, right string) error {
	f, err := os.Open(reads)
	if err != nil {
		return err
	}
	defer f.Close()

	lf, err := os.Create(left)
	if err != nil {
		return err
	}
	// Safety net for the early error returns below. On the success path the
	// file is closed explicitly so its error can be reported, and this second
	// Close then fails harmlessly.
	defer lf.Close()

	rf, err := os.Create(right)
	if err != nil {
		return err
	}
	defer rf.Close()

	r := fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA))
	sc := seqio.NewScanner(r)
	for sc.Next() {
		seq := sc.Seq().(*linear.Seq)
		h, ok := hits[seq.Name()]
		if !ok {
			continue
		}

		// Keep the complete letter slice; seq.Seq is re-pointed at each flank
		// in turn so the same record can be formatted twice.
		all := seq.Seq
		if h.qStart >= cutoff {
			seq.Seq = all[:h.qStart]
			_, err := fmt.Fprintf(lf, "%60a\n", seq)
			if err != nil {
				return err
			}
		}
		if h.qLen-h.qEnd >= cutoff {
			seq.Seq = all[h.qEnd:]
			_, err := fmt.Fprintf(rf, "%60a\n", seq)
			if err != nil {
				return err
			}
		}
	}
	err = sc.Error()
	if err != nil {
		return err
	}
	err = lf.Close()
	if err != nil {
		return err
	}
	return rf.Close()
}
Beispiel #6
0
// readContigs reads the fasta file at the given path and returns a map from
// each sequence name to its length. It returns a nil map and the error if the
// file cannot be opened or the read fails part-way.
func readContigs(file string) (map[string]int, error) {
	f, err := os.Open(file)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	lengths := make(map[string]int)
	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		s := sc.Seq()
		lengths[s.Name()] = s.Len()
	}
	// The scanner holds the read error; the err from os.Open is always nil by
	// this point, so checking it would let a truncated read pass as success.
	if err := sc.Error(); err != nil {
		return nil, err
	}
	return lengths, nil
}
Beispiel #7
0
// readContigs reads the fasta file at the given path using the gapped DNA
// alphabet and returns the sequences keyed by ID. It returns a nil map and the
// error if the file cannot be opened or the read fails part-way.
func readContigs(file string) (map[string]*linear.Seq, error) {
	f, err := os.Open(file)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	seqs := make(map[string]*linear.Seq)
	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)
		seqs[s.ID] = s
	}
	// The scanner holds the read error; the err from os.Open is always nil by
	// this point, so checking it would let a truncated read pass as success.
	if err := sc.Error(); err != nil {
		return nil, err
	}
	return seqs, nil
}
Beispiel #8
0
// main splits sequence names into unique and non-unique sets.
//
// Every sequence ID in the fasta file named by the in flag is split at its
// last "/" into a name and a trailing part. Names seen once are written, one
// per line, to <base>.unique.text; names seen more than once are written with
// the list of their trailing parts to <base>.non-unique.text, where <base> is
// the base name of the input path. Both files are created in the current
// directory.
func main() {
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}

	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}

	names := make(map[string][]string)

	sc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
	for sc.Next() {
		seq := sc.Seq().(*linear.Seq)
		idx := strings.LastIndex(seq.ID, "/")
		if idx < 0 {
			// Without this guard seq.ID[:idx] panics with a slice bounds
			// error that says nothing about the offending record.
			log.Fatalf("no %q separator in sequence id %q", "/", seq.ID)
		}
		names[seq.ID[:idx]] = append(names[seq.ID[:idx]], seq.ID[idx+1:])
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}
	// The input is fully consumed; close it once, here, before writing output.
	f.Close()

	base := filepath.Base(*in)
	unique, err := os.Create(base + ".unique.text")
	if err != nil {
		log.Fatalf("failed to create %q: %v", base+".unique.text", err)
	}
	defer unique.Close()
	nonUnique, err := os.Create(base + ".non-unique.text")
	if err != nil {
		log.Fatalf("failed to create %q: %v", base+".non-unique.text", err)
	}
	defer nonUnique.Close()
	for name, coords := range names {
		switch len(coords) {
		case 0:
		case 1:
			if _, err := fmt.Fprintln(unique, name); err != nil {
				log.Fatalf("failed to write %q: %v", base+".unique.text", err)
			}
		default:
			if _, err := fmt.Fprintf(nonUnique, "%s\t%v\n", name, coords); err != nil {
				log.Fatalf("failed to write %q: %v", base+".non-unique.text", err)
			}
		}
	}
}
Beispiel #9
0
// main splits a fasta file into numbered bundle files.
//
// Sequences are read from the file named by the in flag; those shorter than
// the cut flag are dropped. The rest are appended to <base>-<i>.fa in the
// current directory, starting a new file whenever adding the next sequence
// would push a non-empty bundle's total length over the bundle flag. A single
// sequence longer than the bundle size still gets a bundle of its own.
func main() {
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}

	inFile, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open input:%v", err)
	}
	defer inFile.Close()
	*in = filepath.Base(*in)

	sc := seqio.NewScanner(fasta.NewReader(inFile, linear.NewSeq("", nil, alphabet.DNA)))

	var i, size int
	out, err := os.Create(fmt.Sprintf("%s-%d.fa", *in, i))
	if err != nil {
		// Without this check a failed create leaves out nil and every write
		// to the first bundle is silently lost.
		log.Fatalf("failed to open file bundle %d: %v", i, err)
	}
	for sc.Next() {
		if sc.Seq().Len() < *cut {
			continue
		}
		// size != 0 keeps an oversize sequence from producing an empty bundle.
		if size != 0 && size+sc.Seq().Len() > *bundle {
			err = out.Close()
			if err != nil {
				log.Fatalf("failed to close file bundle %d: %v", i, err)
			}
			i++
			size = 0
			out, err = os.Create(fmt.Sprintf("%s-%d.fa", *in, i))
			if err != nil {
				log.Fatalf("failed to open file bundle %d: %v", i, err)
			}
		}
		size += sc.Seq().Len()
		if _, err := fmt.Fprintf(out, "%60a\n", sc.Seq()); err != nil {
			log.Fatalf("failed to write file bundle %d: %v", i, err)
		}
	}
	if sc.Error() != nil {
		log.Fatal(sc.Error())
	}
	err = out.Close()
	if err != nil {
		log.Fatalf("failed to close file bundle %d: %v", i, err)
	}
}
Beispiel #10
0
// mangle rewrites the sequences read from standard input so that each ID is
// the hex-encoded SHA-1 of its original "ID description" header text, and
// writes them to standard output. The original ID is moved to the front of the
// description so the rewrite can be reversed. It terminates the program if two
// sequences produce the same digest or if the input cannot be read.
func mangle() {
	seen := make(map[string]bool)
	hash := sha1.New()
	sc := seqio.NewScanner(fasta.NewReader(os.Stdin, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)
		if s.Desc == "" {
			s.Desc = s.ID
		} else {
			s.Desc = fmt.Sprintf("%s %s", s.ID, s.Desc)
		}
		hash.Write([]byte(s.Desc))
		s.ID = fmt.Sprintf("%040x", hash.Sum(nil))
		if seen[s.ID] {
			log.Fatalf("duplicate sha1: %s", s.ID)
		}
		seen[s.ID] = true
		// Reset so the next digest covers only the next header.
		hash.Reset()
		fmt.Printf("%60a\n", s)
	}
	// A read failure ends the loop exactly like end of input does; report it
	// so truncated output is not mistaken for a complete run.
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}
}
Beispiel #11
0
// main scores every sequence in the fasta file named by the in flag with the
// complexity measure chosen by the typ flag (0: WF, 1: Entropic, 2: Z). With
// the dist flag set it prints name, score and length for every sequence;
// otherwise it prints only the sequences whose score is at least thresh.
func main() {
	flag.Parse()
	if *in == "" || *typ < 0 || *typ > 2 {
		flag.Usage()
		os.Exit(1)
	}

	// typ has been range-checked above, so exactly one case is taken.
	var measure func(s seq.Sequence, start, end int) (float64, error)
	switch *typ {
	case 0:
		measure = complexity.WF
	case 1:
		measure = complexity.Entropic
	case 2:
		measure = complexity.Z
	}

	src, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	defer src.Close()

	template := linear.NewSeq("", nil, alphabet.DNAgapped)
	sc := seqio.NewScanner(fasta.NewReader(src, template))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)

		// err is always nil for a linear.Seq Start() and End().
		score, _ := measure(s, s.Start(), s.End())

		switch {
		case *dist:
			fmt.Printf("%s\t%v\t%d\n", s.Name(), score, s.Len())
		case score >= *thresh:
			fmt.Printf("%60a\n", s)
		}
	}
	if scanErr := sc.Error(); scanErr != nil {
		log.Fatalf("error during fasta read: %v", scanErr)
	}
}
// consensus pipes in through the command built from a quiet muscle.Muscle
// configuration, reads the command's standard output as DNA fasta into a
// multi.Multi and returns that alignment's consensus together with the
// alignment itself. The returned error is the command build or run error, or
// otherwise whatever the sequence scanner reported.
func consensus(in io.Reader) (*linear.QSeq, *multi.Multi, error) {
	cmd, err := muscle.Muscle{Quiet: true}.BuildCommand()
	if err != nil {
		return nil, nil, err
	}

	var aligned bytes.Buffer
	cmd.Stdin = in
	cmd.Stdout = &aligned
	if err = cmd.Run(); err != nil {
		return nil, nil, err
	}

	template := &linear.Seq{Annotation: seq.Annotation{Alpha: alphabet.DNA}}
	aln := &multi.Multi{ColumnConsense: seq.DefaultQConsensus}

	sc := seqio.NewScanner(fasta.NewReader(&aligned, template))
	for sc.Next() {
		aln.Add(sc.Seq())
	}

	cons := aln.Consensus(true)
	return cons, aln, sc.Error()
}
Beispiel #13
0
// unmangle restores original sequence names in a whitespace-delimited map
// file. It builds a table from the sequences on standard input, mapping each
// sequence ID to the first field of its description. For every line of
// mapfile it then replaces the field at index queryNameField with the table
// entry for that field and prints the fields tab-separated to standard output.
// Any missing entry, short line or read failure terminates the program.
func unmangle(mapfile string) {
	table := make(map[string]string)
	sc := seqio.NewScanner(fasta.NewReader(os.Stdin, linear.NewSeq("", nil, alphabet.DNA)))
	for sc.Next() {
		s := sc.Seq().(*linear.Seq)
		// strings.Fields never yields empty strings, so an absent id shows up
		// as an empty slice; indexing it unguarded would panic.
		desc := strings.Fields(s.Desc)
		if len(desc) == 0 {
			log.Fatalf("no id for sequence %s", s.ID)
		}
		table[s.ID] = desc[0]
	}
	if err := sc.Error(); err != nil {
		log.Fatalf("error during fasta read: %v", err)
	}

	f, err := os.Open(mapfile)
	if err != nil {
		log.Fatalf("failed to open map file %q: %v", mapfile, err)
	}
	s := bufio.NewScanner(f)
	for s.Scan() {
		line := s.Text()
		fields := strings.Fields(line)
		if len(fields) <= *queryNameField {
			log.Fatalf("unexpected number of fields in line %q", line)
		}
		id := table[fields[*queryNameField]]
		if id == "" {
			log.Fatalf("no id for map query %s", fields[*queryNameField])
		}
		fields[*queryNameField] = id
		fmt.Println(strings.Join(fields, "\t"))
	}
	err = s.Err()
	// Closed explicitly rather than deferred: log.Fatalf exits without
	// running deferred calls.
	f.Close()
	if err != nil {
		log.Fatalf("failed to read map file %q: %v", mapfile, err)
	}
}
// main builds a consensus for a family of sequences.
//
// The first positional argument names a fasta file. All sequences are read
// from it; when the subsample flag is set they are shuffled and at most maxFam
// of them are used (maxFam == 0 means no limit), otherwise exceeding a
// non-zero maxFam is fatal. The selected sequences are passed to consensus,
// and the following files are written:
//
//   - <input>_included_sequences.fq in aDir: one "including:" line per
//     selected sequence,
//   - <input>_consensus-length-<maxFam>.txt in cDir: the number of sequences
//     used and the ratios of consensus length to mean sequence length,
//   - <input>_consensus.fq<maxFam> in cDir: the consensus,
//   - <input>_multiple_alignment<maxFam>.fq in aDir: the alignment.
//
// The consensus and alignment are written with "%60a" when the fasta flag is
// set and with "%q" otherwise.
func main() {
	flag.IntVar(&maxFam, "maxFam", 0, "maxFam indicates maximum family size considered (0 == no limit).")
	flag.BoolVar(&subSample, "subsample", false, "choose maxFam members of a family if the family has more than maxFam members.")
	flag.BoolVar(&consFasta, "fasta", false, "output consensus as fasta with quality case filtering.")
	flag.StringVar(&cDir, "cDir", "", "target directory for consensus output. If not empty Dir is deleted first.")
	flag.StringVar(&aDir, "aDir", "", "target directory for alignment output. If not empty dir is deleted first.")
	flag.StringVar(&sDir, "sDir", "", "target directory for sequence information output. If not empty dir is deleted first.")
	flag.Parse()

	// Indexing flag.Args() with no positional argument would panic.
	if flag.NArg() == 0 {
		flag.Usage()
		os.Exit(1)
	}
	input := flag.Arg(0)

	fmt.Printf("Initialising Files: %s\n", input)

	checks(cDir, aDir, sDir)

	// Opening files. Nothing below can work without the input, so a failure
	// is fatal rather than merely logged.
	f, fErr := os.Open(input)
	if fErr != nil {
		log.Fatalf("error: could not open %s to read %v", input, fErr)
	}
	defer f.Close()

	// Reading in sequences
	var v []seq.Sequence
	r := fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNA))
	sc := seqio.NewScanner(r)
	for sc.Next() {
		v = append(v, sc.Seq())
	}
	if sc.Error() != nil {
		log.Fatalf("failed to read sequences: %v", sc.Error())
	}
	// The averages below divide by the number of sequences.
	if len(v) == 0 {
		log.Fatalf("no sequences read from %s", input)
	}
	// Checking that there aren't too many sequences
	if maxFam != 0 && !subSample && len(v) > maxFam {
		log.Fatalf("too many sequences: %d", len(v))
	}

	if subSample {
		// Shuffle first so that taking a prefix is a random sample.
		w := make([]seq.Sequence, 0, len(v))
		for _, j := range rand.Perm(len(v)) {
			w = append(w, v[j])
		}
		v = w
	}

	// Mean length over all sequences read. It is needed for the "all seqs"
	// ratio whether or not subsampling is on; it is only printed when
	// subsampling, as before.
	totalLength := 0
	for _, s := range v {
		totalLength += s.Len()
	}
	aveLength := totalLength / len(v)
	if subSample {
		fmt.Printf("Average length: %v\n", aveLength)
	}

	// FIXME(brittany): Make this more scientifically robust.
	var buf bytes.Buffer

	// creating a file for the sequence information
	// NOTE(review): the sDir flag is described as the target for sequence
	// information, but this file is created in aDir — confirm which is meant.
	seqFile := fmt.Sprintf("%s_included_sequences.fq", input)
	seqOut, sErr := os.Create(filepath.Join(aDir, seqFile))
	if sErr != nil {
		log.Fatalf("failed to create %s: %v", seqFile, sErr)
	}
	// The buffer must wrap the output file, not the input file.
	seqBufOut := bufio.NewWriter(seqOut)

	// Printing the selected sequences to the consensus input and recording
	// them. lenSubSeq holds the length of each selected sequence and grows
	// with the selection, so it is correct for any maxFam including 0.
	lenSubSeq := make([]int, 0, len(v))
	for _, s := range v {
		if subSample && maxFam > 0 && len(lenSubSeq) >= maxFam {
			break
		}
		fmt.Fprintf(&buf, "%60a\n", s)
		fmt.Fprintf(seqBufOut, "including: %s %s\n", s.Name(), s.Description())
		lenSubSeq = append(lenSubSeq, s.Len())
	}
	// Flush and close explicitly: later log.Fatalf calls exit without running
	// deferred functions. Flush also reports any earlier buffered write error.
	if err := seqBufOut.Flush(); err != nil {
		log.Fatalf("failed to write %s: %v", seqFile, err)
	}
	if err := seqOut.Close(); err != nil {
		log.Fatalf("failed to close %s: %v", seqFile, err)
	}

	// sampled is at least 1 here because v is non-empty and the limit, when
	// applied, is positive.
	sampled := len(lenSubSeq)
	totalSubLength := 0
	for _, l := range lenSubSeq {
		totalSubLength += l
	}
	fmt.Printf("total length of subbed: %v\n", totalSubLength)
	aveSubLength := totalSubLength / sampled
	fmt.Printf("Average length of subbed: %v\n", aveSubLength)
	fmt.Printf("The sub seq variable for %v sequences: %v\n", sampled, lenSubSeq)

	// Creating the consensus
	fmt.Println("Creating consensus")
	c, m, err := consensus(&buf)
	if err != nil {
		log.Printf("failed to generate consensus for %s: %v", input, err)
		return
	}
	c.ID = fmt.Sprintf("%s_consensus", input)
	c.Desc = fmt.Sprintf("%d members total %v members sampled", len(v), sampled)
	c.Threshold = 42
	c.QFilter = seq.CaseFilter
	conLength := c.Len()

	// Find a way to make the consensus the same case

	// Ratio of consensus length to mean sequence length: cutoffTotal uses the
	// mean over all sequences, cutoffSubs the mean over the selected ones.
	fmt.Printf("Length of consensus:%v\n", conLength)
	cutoffTotal := float64(conLength) / float64(aveLength)
	cutoffSubs := float64(conLength) / float64(aveSubLength)
	fmt.Printf("ratio for all seqs: %f, \nratio for just sub-sampled: %f\n", cutoffTotal, cutoffSubs)

	// Creating a file for the consensus length information
	confile := fmt.Sprintf("%s_consensus-length-%v.txt", input, maxFam)
	conOut, err := os.Create(filepath.Join(cDir, confile))
	if err != nil {
		log.Fatalf("failed to create %s: %v", confile, err)
	}
	_, err = fmt.Fprintf(conOut, "number sampled: %v \nratio for all seqs: %f \nratio for just sub-sampled: %f\n\n %f\t%f", sampled, cutoffTotal, cutoffSubs, cutoffTotal, cutoffSubs)
	if err != nil {
		log.Fatalf("failed to write %s: %v", confile, err)
	}
	if err := conOut.Close(); err != nil {
		log.Fatalf("failed to close %s: %v", confile, err)
	}

	// Creating a file for the consensus
	file := fmt.Sprintf("%s_consensus.fq%v", input, maxFam)
	out, err := os.Create(filepath.Join(cDir, file))
	if err != nil {
		log.Fatalf("failed to create %s: %v", file, err)
	}
	if consFasta {
		_, err = fmt.Fprintf(out, "%60a\n", c)
	} else {
		_, err = fmt.Fprintf(out, "%q\n", c)
	}
	if err != nil {
		log.Fatalf("failed to write %s: %v", file, err)
	}
	if err := out.Close(); err != nil {
		log.Fatalf("failed to close %s: %v", file, err)
	}

	// creating a file for the multiple alignment
	alignFile := fmt.Sprintf("%s_multiple_alignment%v.fq", input, maxFam)
	alignOut, err := os.Create(filepath.Join(aDir, alignFile))
	if err != nil {
		log.Fatalf("failed to create %s: %v", alignFile, err)
	}
	if consFasta {
		_, err = fmt.Fprintf(alignOut, "%60a\n", m)
	} else {
		_, err = fmt.Fprintf(alignOut, "%q\n", m)
	}
	if err != nil {
		log.Fatalf("failed to write %s: %v", alignFile, err)
	}
	if err := alignOut.Close(); err != nil {
		log.Fatalf("failed to close %s: %v", alignFile, err)
	}

	fmt.Printf("Complete\n\n")
}
Beispiel #15
0
// main annotates GFF events with a "TSD" attribute derived from aligning the
// sequence on either side of each event.
//
// Events are read from the GFF file named by the in flag and grouped by the
// first field of their "Read" attribute, which must hold exactly three
// whitespace-separated fields: a sequence name, a start and an end. Each
// remaining command line argument is a fasta file. For every event on a
// sequence found there, a left and a right window are cut from the sequence,
// aligned with the table built by makeTable, and, if both aligned segments
// hold at least thresh non-gap letters, the event is written to standard
// output as GFF with the added attribute. When the fastaOut flag is set the
// [start,end) slice of each event's sequence is also written to that file.
func main() {
	flag.Var(&alnmat, "align", "specify the match, mismatch and gap parameters")
	flag.Parse()
	if *in == "" {
		flag.Usage()
		os.Exit(1)
	}

	f, err := os.Open(*in)
	if err != nil {
		log.Fatalf("failed to open %q: %v", *in, err)
	}
	// Group the events by the sequence name held in their "Read" attribute.
	events := make(map[string][]*gff.Feature)
	fsc := featio.NewScanner(gff.NewReader(f))
	for fsc.Next() {
		f := fsc.Feat().(*gff.Feature)
		fields := strings.Fields(f.FeatAttributes.Get("Read"))
		if len(fields) != 3 {
			log.Fatalf("bad record: %+v", f)
		}
		events[fields[0]] = append(events[fields[0]], f)
	}
	if err := fsc.Error(); err != nil {
		log.Fatalf("error during gff read: %v", err)
	}
	f.Close()

	w := gff.NewWriter(os.Stdout, 60, true)
	w.WriteComment("Right coordinates (field 5) and strand (field 7) are hypothetical.")

	// out stays nil unless a fasta output file was requested; the event loop
	// tests it before writing.
	var out *os.File
	if *fastaOut != "" {
		out, err = os.Create(*fastaOut)
		if err != nil {
			log.Fatalf("failed to create fasta insertion output file %q: %v", *fastaOut, err)
		}
		defer out.Close()
	}

	// hw is the half-width of the window placed around each event end when no
	// "Dup" attribute is present.
	hw := *window / 2
	sw := makeTable(alphabet.DNAgapped, alnmat)
	for _, ref := range flag.Args() {
		f, err = os.Open(ref)
		if err != nil {
			log.Fatalf("failed to open reference %q: %v", ref, err)
		}
		ssc := seqio.NewScanner(fasta.NewReader(f, linear.NewSeq("", nil, alphabet.DNAgapped)))
		// NOTE(review): loop labels the per-sequence loop, so each
		// "continue loop" below abandons every remaining event of the current
		// sequence, not only the event being examined — confirm this is
		// intended.
	loop:
		for ssc.Next() {
			seq := ssc.Seq().(*linear.Seq)
			for _, f := range events[seq.Name()] {
				fields := strings.Fields(f.FeatAttributes.Get("Read"))
				if len(fields) != 3 {
					log.Fatalf("bad record: %+v", f)
				}
				start, err := strconv.Atoi(fields[1])
				if err != nil {
					log.Fatalf("failed to get start coordinate: %v", err)
				}
				end, err := strconv.Atoi(fields[2])
				if err != nil {
					log.Fatalf("failed to get end coordinate: %v", err)
				}

				if out != nil {
					// Shallow copy so the scanned sequence keeps its own
					// description and full letter slice.
					insert := *seq
					if insert.Desc != "" {
						insert.Desc += " "
					}
					insert.Desc += fmt.Sprintf("[%d,%d)", start, end)
					insert.Seq = insert.Seq[start:end]
					fmt.Fprintf(out, "%60a\n", &insert)
				}

				// [lOff,lEnd) and [rOff,rEnd) are the left and right windows,
				// clamped to the bounds of seq.Seq.
				var lOff, lEnd, rOff, rEnd int
				// If we have refined ends, use them.
				if dup := f.FeatAttributes.Get("Dup"); dup != "" {
					d, err := strconv.Atoi(dup)
					if err != nil {
						log.Fatalf("failed to get duplication length: %v", err)
					}
					// Windows of d letters immediately outside the event.
					lOff = max(0, start-d)
					lEnd = start
					rOff = end
					rEnd = min(len(seq.Seq), end+d)
				} else {
					// Windows of width 2*hw centred on each end of the event.
					lOff = max(0, start-hw)
					lEnd = min(len(seq.Seq), start+hw)
					rOff = max(0, end-hw)
					rEnd = min(len(seq.Seq), end+hw)

					// Ensure windows don't overlap.
					if lEnd > rOff {
						lEnd = (lEnd + rOff) / 2
						rOff = lEnd
					}
				}

				if lEnd-lOff < *thresh || rEnd-rOff < *thresh {
					// Don't do fruitless work.
					continue loop
				}

				left := *seq
				left.ID = "prefix"
				left.Seq = left.Seq[lOff:lEnd]
				right := *seq
				right.ID = "postfix"
				right.Seq = right.Seq[rOff:rEnd]

				aln, err := sw.Align(&right, &left)
				if err != nil {
					log.Fatal(err)
				}

				// Count the non-gap letters of each formatted segment and
				// give up if either falls below thresh.
				fa := align.Format(&right, &left, aln, '-')
				for _, seg := range fa {
					var n int
					for _, l := range seg.(alphabet.Letters) {
						if l != '-' {
							n++
						}
					}
					if n < *thresh {
						continue loop
					}
				}

				// Total alignment score: the sum of the segment scores.
				var sc int
				for _, seg := range aln {
					type scorer interface {
						Score() int
					}
					sc += seg.(scorer).Score()
				}
				// The attribute value holds both formatted segments, two
				// alignment feature coordinates shifted by lOff and rOff
				// respectively, the alignment itself and its score.
				f.FeatAttributes = append(f.FeatAttributes, gff.Attribute{
					Tag: "TSD", Value: fmt.Sprintf(`%v %d %d %v "%v" %d`,
						fa[0], aln[len(aln)-1].Features()[0].End()+lOff,
						aln[0].Features()[1].Start()+rOff, fa[1],
						aln, sc),
				})
				w.Write(f)
			}
		}
		if err := ssc.Error(); err != nil {
			log.Fatalf("error during fasta read: %v", err)
		}
		f.Close()
	}
}