Beispiel #1
0
// WriteMetaData writes a single meta data line to a GFF file and returns the
// number of bytes written and any error. The line written is selected by the
// dynamic type of d:
//
//   - string and []byte values are written verbatim after a "##" prefix,
//   - an int is written as a gff-version line, and is rejected with
//     ErrCannotHeader if the writer's header flag is already set,
//   - feat.Moltype and Sequence values are written as "##Type" lines,
//   - a *Feature is written as a sequence-region line,
//   - any other feat.Feature is handed to Write,
//   - a time.Time is written as a date line formatted with w.TimeFormat.
//
// All other types return ErrNotHandled. The writer's header flag is set when
// WriteMetaData returns, whatever the outcome.
func (w *Writer) WriteMetaData(d interface{}) (n int, err error) {
	// Once any meta data call has been made, a version line may no longer follow.
	defer func() { w.header = true }()

	switch v := d.(type) {
	case string, []byte:
		// Both forms print identically under %s.
		return fmt.Fprintf(w.w, "##%s\n", v)
	case int:
		if !w.header {
			return fmt.Fprintf(w.w, "##gff-version %d\n", v)
		}
		return 0, ErrCannotHeader
	case feat.Moltype:
		return fmt.Fprintf(w.w, "##Type %s\n", v)
	case Sequence:
		return fmt.Fprintf(w.w, "##Type %s %s\n", v.Type, v.SeqName)
	case *Feature:
		// Must precede the feat.Feature case so that a *Feature is written
		// as a region line rather than being passed to Write.
		return fmt.Fprintf(w.w, "##sequence-region %s %d %d\n", v.SeqName, feat.ZeroToOne(v.FeatStart), v.FeatEnd)
	case feat.Feature:
		return w.Write(v)
	case time.Time:
		return fmt.Fprintf(w.w, "##date %s\n", v.Format(w.TimeFormat))
	default:
		return 0, ErrNotHandled
	}
}
Beispiel #2
0
// Write emits pair as a single GFF feature and returns the number of bytes
// written and any error from the underlying writer. The B member of the pair
// provides the feature's sequence name and coordinates, the A member is
// recorded in a "Target" attribute, and the pair's Error field is recorded in
// a "maxe" attribute. The writer's template feature w.t is reused for every call.
func (w *Writer) Write(pair *Pair) (n int, err error) {
	ft := w.t

	// Location, score and orientation of the feature.
	ft.SeqName = pair.B.Location().Name()
	ft.FeatStart, ft.FeatEnd = pair.B.Start(), pair.B.End()
	score := float64(pair.Score)
	ft.FeatScore = floatPtr(score)
	ft.FeatStrand = pair.Strand
	ft.FeatFrame = gff.NoFrame

	// Attributes are rebuilt in place, reusing the existing backing array.
	target := gff.Attribute{
		Tag:   "Target",
		Value: fmt.Sprintf("%s %d %d", pair.A.Location().Name(), feat.ZeroToOne(pair.A.Start()), pair.A.End()),
	}
	maxErr := gff.Attribute{
		Tag:   "maxe",
		Value: fmt.Sprintf("%.2g", pair.Error),
	}
	ft.FeatAttributes = append(ft.FeatAttributes[:0], target, maxErr)

	return w.w.Write(ft)
}
Beispiel #3
0
// Write writes a single feature and returns the number of bytes written and any error.
// A feature whose start is not less than its end is rejected with ErrBadFeature before
// anything is written. Otherwise the output depends on the dynamic type of f:
// *Features are written as a canonical GFF line, seq.Sequences are written as inline
// sequence in GFF format (only sequences of feat.Moltype DNA, RNA and Protein are
// supported, others return ErrNotHandled), Sequence values return ErrNotHandled, and
// all other feat.Features, including *Regions, are written as sequence region metadata
// lines.
//
// The returned count includes bytes reported by a formatted write that failed part
// way through, so that n reflects what reached the underlying writer.
func (w *Writer) Write(f feat.Feature) (n int, err error) {
	if f.Start() >= f.End() {
		return 0, ErrBadFeature
	}
	w.header = true
	switch f := f.(type) {
	case *Feature:
		// Terminate the line only if every field was written successfully.
		defer func() {
			if err != nil {
				return
			}
			_, err = w.w.Write([]byte{'\n'})
			if err != nil {
				return
			}
			n++
		}()
		n, err = fmt.Fprintf(w.w, "%s\t%s\t%s\t%d\t%d\t",
			f.SeqName,
			f.Source,
			f.Feature,
			feat.ZeroToOne(f.FeatStart),
			f.FeatEnd,
		)
		if err != nil {
			return n, err
		}
		var m int
		if f.FeatScore != nil && !math.IsNaN(*f.FeatScore) {
			// A negative precision requests the default float formatting.
			if w.Precision < 0 {
				m, err = fmt.Fprintf(w.w, "%v", *f.FeatScore)
			} else {
				m, err = fmt.Fprintf(w.w, "%.*f", w.Precision, *f.FeatScore)
			}
			n += m
			if err != nil {
				return n, err
			}
		} else {
			// A missing or NaN score is written as the "." placeholder.
			_, err = w.w.Write([]byte{'.'})
			if err != nil {
				return n, err
			}
			n++
		}
		m, err = fmt.Fprintf(w.w, "\t%s\t%s",
			f.FeatStrand,
			f.FeatFrame,
		)
		n += m
		if err != nil {
			return n, err
		}
		if f.FeatAttributes != nil {
			m, err = fmt.Fprintf(w.w, "\t%v", f.FeatAttributes)
			n += m
			if err != nil {
				return n, err
			}
		} else if f.Comments != "" {
			// Keep the attributes column present, but empty, so that the
			// comments land in the following column.
			_, err = w.w.Write([]byte{'\t'})
			if err != nil {
				return n, err
			}
			n++
		}
		if f.Comments != "" {
			m, err = fmt.Fprintf(w.w, "\t%s", f.Comments)
			n += m
		}
		return n, err
	case seq.Sequence:
		sw := fasta.NewWriter(w.w, w.Width)
		moltype := f.Alphabet().Moltype()
		if moltype < feat.DNA || moltype > feat.Protein {
			return 0, ErrNotHandled
		}
		// Every line of the sequence is prefixed so that it is read as
		// GFF meta data; the ID line also names the molecule type.
		sw.IDPrefix = [...][]byte{
			feat.DNA:     []byte("##DNA "),
			feat.RNA:     []byte("##RNA "),
			feat.Protein: []byte("##Protein "),
		}[moltype]
		sw.SeqPrefix = []byte("##")
		n, err = sw.Write(f)
		if err != nil {
			return n, err
		}
		var m int
		m, err = w.w.Write([...][]byte{
			feat.DNA:     []byte("##end-DNA\n"),
			feat.RNA:     []byte("##end-RNA\n"),
			feat.Protein: []byte("##end-Protein\n"),
		}[moltype])
		return n + m, err
	case Sequence:
		return 0, ErrNotHandled
	case *Region:
		return fmt.Fprintf(w.w, "##sequence-region %s %d %d\n", f.SeqName, feat.ZeroToOne(f.RegionStart), f.RegionEnd)
	default:
		return fmt.Fprintf(w.w, "##sequence-region %s %d %d\n", f.Name(), feat.ZeroToOne(f.Start()), f.End())
	}
}
Beispiel #4
0
// deletions analyses *sam.Records from mapping reads to the given reference
// using the suffix array file if provided. If run is false, blasr is not
// run and the existing blasr output is used to provide the *sam.Records.
// procs specifies the number of blasr threads to use.
//
// ext selects the alignment format and must be "sam" or "bam"; any other
// value causes a panic. Per-position CIGAR costs for each record are averaged
// over window positions, and each stretch where the smoothed cost is negative
// is treated as a candidate discordance. Candidates spanning at least min
// positions on either the reference or the query are passed to br for end
// adjustment and written to w as GFF features.
//
// The first error from running blasr, opening or reading the alignments, or
// writing to w is returned.
func deletions(reads, ref, suff, ext string, procs int, run bool, window, min int, br *refiner, w *gff.Writer) error {
	base := filepath.Base(reads)
	b := blasr.BLASR{
		Cmd: *blasrPath,

		Reads: reads, Genome: ref, SuffixArray: suff,
		BestN: 1,

		SAM:           true,
		Clipping:      "soft",
		SAMQV:         true,
		CIGARSeqMatch: true,

		Aligned:   base + ".blasr." + ext,
		Unaligned: base + ".blasr.unmapped.fasta",

		Procs: procs,
	}
	if run {
		cmd, err := b.BuildCommand()
		if err != nil {
			return err
		}
		cmd.Stdout = errStream
		cmd.Stderr = errStream
		err = cmd.Run()
		if err != nil {
			return err
		}
	}

	f, err := os.Open(b.Aligned)
	if err != nil {
		return err
	}
	defer f.Close()

	cost := [...]float64{
		sam.CigarInsertion: -2,
		sam.CigarDeletion:  -2,
		sam.CigarEqual:     1,
		sam.CigarMismatch:  -1,

		// Included for explicitness
		sam.CigarSoftClipped: 0,

		// Included to ensure no bounds panic.
		// All CIGAR operations not listed above
		// are given a zero cost.
		sam.CigarBack: 0,
	}

	// Record the analysis parameters in the output. A failure to write
	// them is reported to the caller rather than silently dropped.
	_, err = w.WriteComment(fmt.Sprintf("smoothing window=%d", window))
	if err != nil {
		return err
	}
	_, err = w.WriteComment(fmt.Sprintf("minimum feature length=%d", min))
	if err != nil {
		return err
	}

	// gf is reused for every feature written; only the per-feature fields
	// are updated inside the loop below.
	gf := &gff.Feature{
		Source:         "reefer",
		Feature:        "discordance",
		FeatFrame:      gff.NoFrame,
		FeatAttributes: gff.Attributes{{Tag: "Read"}, {Tag: "Dup"}},
	}
	var sr interface {
		Read() (*sam.Record, error)
	}
	switch ext {
	case "sam":
		sr, err = sam.NewReader(f)
		if err != nil {
			return err
		}
	case "bam":
		var bamr *bam.Reader
		bamr, err = bam.NewReader(f, 0)
		if err != nil {
			return err
		}
		defer bamr.Close()
		sr = bamr
	default:
		panic("reefer: invalid extension")
	}
	for {
		r, err := sr.Read()
		if err != nil {
			if err != io.EOF {
				return err
			}
			break
		}

		// Expand the CIGAR into one cost entry per operation position,
		// tracking the reference and query coordinates as they advance.
		var (
			scores []costPos
			ref    = r.Start()
			query  int
		)
		for _, co := range r.Cigar {
			for i := 0; i < co.Len(); i++ {
				scores = append(scores, costPos{
					ref:   ref,
					query: query,
					cost:  cost[co.Type()],
				})
				consume := co.Type().Consumes()
				ref += consume.Reference
				query += consume.Query
			}
		}
		// Alignments no longer than the window cannot be smoothed.
		if len(scores) <= window {
			continue
		}
		smoothed := make([]costPos, len(scores)-window)
		for i := range scores[:len(scores)-window] {
			smoothed[i] = mean(scores[i : i+window])
		}

		// Scan for sign changes in the smoothed cost. Because the range is
		// over smoothed[1:], smoothed[i] is the element preceding v.
		var d deletion
		for i, v := range smoothed[1:] {
			switch {
			case d.record == nil && v.cost < 0 && smoothed[i].cost >= 0:
				// Cost has dropped below zero: open a candidate.
				d = deletion{record: r, rstart: v.ref + 1, qstart: v.query + 1}
			case d.record != nil && v.cost >= 0 && smoothed[i].cost < 0:
				// Cost has recovered: close the candidate.
				d.rend = v.ref
				d.qend = v.query
				if d.rend-d.rstart >= min || d.qend-d.qstart >= min {
					gf.SeqName = d.record.Ref.Name()
					gf.FeatStrand = strandFor(d.record)
					if gf.FeatStrand == seq.Minus {
						// Flip the query interval for minus strand records.
						qlen := d.record.Seq.Length
						d.qstart, d.qend = qlen-d.qend, qlen-d.qstart
					}

					// Adjust ends based on paired SW alignments.
					// A failure here is only logged, and only when verbose
					// is set; the deletion returned by adjust is used
					// regardless.
					var refined bool
					d, refined, err = br.adjust(d)
					if err != nil && *verbose {
						log.Printf("failed alignment %s: %v", d.record.Name, err)
					}

					gf.FeatStart = d.rstart
					gf.FeatEnd = d.rend
					if gf.FeatStart == gf.FeatEnd {
						// This is disgusting garbage resulting from
						// GFF not allowing zero length features.
						gf.FeatEnd++
					}

					// The "Dup" attribute is only emitted for refined
					// features; the backing array always holds both.
					if refined {
						gf.FeatAttributes = gf.FeatAttributes[:2]
						gf.FeatAttributes[1].Value = strconv.Itoa(d.dup)
					} else {
						gf.FeatAttributes = gf.FeatAttributes[:1]
					}
					gf.FeatAttributes[0].Value = fmt.Sprintf("%s %d %d", d.record.Name, feat.ZeroToOne(d.qstart), d.qend)
					_, err = w.Write(gf)
					if err != nil {
						return err
					}
				}
				d.record = nil
			}
		}
	}
	return nil
}