Esempio n. 1
0
// hitSetFrom maps reads against ref with blasr, using the suffix array file
// suff if one is given, and collects the resulting alignments into a hitSet
// keyed by query name. When run is false blasr is not invoked and the hitSet
// is rebuilt from the output file left by an earlier run. procs is the
// number of threads blasr is asked to use.
func hitSetFrom(reads, ref, suff string, procs int, run bool) (hitSet, error) {
	// Output files are named after the reads file with its directory
	// stripped, so they are relative paths with no directory component.
	name := filepath.Base(reads)
	aligner := blasr.BLASR{
		Cmd: *blasrPath,

		Reads:       reads,
		Genome:      ref,
		SuffixArray: suff,

		BestN:  1,
		Format: 4,

		Aligned:   name + ".blasr",
		Unaligned: name + ".blasr.unmapped",

		Procs: procs,
	}

	if run {
		cmd, err := aligner.BuildCommand()
		if err != nil {
			return nil, err
		}
		// Both of blasr's output streams are sent to errStream.
		cmd.Stdout, cmd.Stderr = errStream, errStream
		if err := cmd.Run(); err != nil {
			return nil, err
		}
	}

	in, err := os.Open(aligner.Aligned)
	if err != nil {
		return nil, err
	}
	defer in.Close()

	// One hit is parsed per line; a later line with the same query name
	// replaces the earlier entry.
	hits := make(hitSet)
	lines := bufio.NewScanner(in)
	for lines.Scan() {
		hit, err := newBlasrHit(lines.Text())
		if err != nil {
			return nil, err
		}
		hits[hit.qName] = hit
	}

	return hits, lines.Err()
}
Esempio n. 2
0
// deletions maps reads to the given reference with blasr, using the suffix
// array file suff if provided, and scans each resulting *sam.Record for runs
// of discordant alignment, which are written to w as GFF "discordance"
// features.
//
// ext selects the alignment format and must be "sam" or "bam"; any other
// value panics once the alignment file has been opened. If run is false,
// blasr is not run and the existing blasr output is used to provide the
// *sam.Records. procs specifies the number of blasr threads to use. window
// is the number of consecutive CIGAR positions over which alignment costs
// are averaged, and min is the minimum reference or query length a
// discordant region must have to be reported. Each region that passes the
// length filter is handed to br.adjust before being written.
//
// The first error encountered while running blasr, reading alignments or
// writing to w is returned.
func deletions(reads, ref, suff, ext string, procs int, run bool, window, min int, br *refiner, w *gff.Writer) error {
	// Output files are named after the reads file with its directory
	// stripped.
	base := filepath.Base(reads)
	b := blasr.BLASR{
		Cmd: *blasrPath,

		Reads: reads, Genome: ref, SuffixArray: suff,
		BestN: 1,

		SAM:           true,
		Clipping:      "soft",
		SAMQV:         true,
		CIGARSeqMatch: true,

		Aligned:   base + ".blasr." + ext,
		Unaligned: base + ".blasr.unmapped.fasta",

		Procs: procs,
	}
	if run {
		cmd, err := b.BuildCommand()
		if err != nil {
			return err
		}
		cmd.Stdout = errStream
		cmd.Stderr = errStream
		err = cmd.Run()
		if err != nil {
			return err
		}
	}

	f, err := os.Open(b.Aligned)
	if err != nil {
		return err
	}
	defer f.Close()

	// cost is indexed by CIGAR operation type and gives the per-position
	// score used to detect discordant regions.
	cost := [...]float64{
		sam.CigarInsertion: -2,
		sam.CigarDeletion:  -2,
		sam.CigarEqual:     1,
		sam.CigarMismatch:  -1,

		// Included for explicitness
		sam.CigarSoftClipped: 0,

		// Included to ensure no bounds panic.
		// All CIGAR operations not listed above
		// are given a zero cost.
		sam.CigarBack: 0,
	}

	// Record the analysis parameters in the GFF output. A failure to write
	// them is reported to the caller rather than silently dropped.
	_, err = w.WriteComment(fmt.Sprintf("smoothing window=%d", window))
	if err != nil {
		return err
	}
	_, err = w.WriteComment(fmt.Sprintf("minimum feature length=%d", min))
	if err != nil {
		return err
	}

	// gf is reused for every feature written; only the per-feature fields
	// are updated before each write.
	gf := &gff.Feature{
		Source:         "reefer",
		Feature:        "discordance",
		FeatFrame:      gff.NoFrame,
		FeatAttributes: gff.Attributes{{Tag: "Read"}, {Tag: "Dup"}},
	}

	// sr abstracts over the SAM and BAM readers.
	var sr interface {
		Read() (*sam.Record, error)
	}
	switch ext {
	case "sam":
		sr, err = sam.NewReader(f)
		if err != nil {
			return err
		}
	case "bam":
		var bamReader *bam.Reader
		bamReader, err = bam.NewReader(f, 0)
		if err != nil {
			return err
		}
		defer bamReader.Close()
		sr = bamReader
	default:
		panic("reefer: invalid extension")
	}

	for {
		r, err := sr.Read()
		if err != nil {
			if err != io.EOF {
				return err
			}
			break
		}

		// Expand the CIGAR into one costPos per operation position,
		// tracking the reference and query coordinates as we go.
		var (
			scores []costPos
			ref    = r.Start()
			query  int
		)
		for _, co := range r.Cigar {
			for i := 0; i < co.Len(); i++ {
				scores = append(scores, costPos{
					ref:   ref,
					query: query,
					cost:  cost[co.Type()],
				})
				consume := co.Type().Consumes()
				ref += consume.Reference
				query += consume.Query
			}
		}
		// Alignments no longer than the window cannot be smoothed.
		if len(scores) <= window {
			continue
		}
		smoothed := make([]costPos, len(scores)-window)
		for i := range scores[:len(scores)-window] {
			smoothed[i] = mean(scores[i : i+window])
		}

		// Walk adjacent pairs of smoothed positions; smoothed[i] is the
		// position immediately before v. A region opens where the cost
		// goes negative and closes where it returns to non-negative.
		var d deletion
		for i, v := range smoothed[1:] {
			switch {
			case d.record == nil && v.cost < 0 && smoothed[i].cost >= 0:
				d = deletion{record: r, rstart: v.ref + 1, qstart: v.query + 1}
			case d.record != nil && v.cost >= 0 && smoothed[i].cost < 0:
				d.rend = v.ref
				d.qend = v.query
				if d.rend-d.rstart >= min || d.qend-d.qstart >= min {
					gf.SeqName = d.record.Ref.Name()
					gf.FeatStrand = strandFor(d.record)
					if gf.FeatStrand == seq.Minus {
						// Flip the query interval onto the
						// opposite strand of the read.
						qlen := d.record.Seq.Length
						d.qstart, d.qend = qlen-d.qend, qlen-d.qstart
					}

					// Adjust ends based on paired SW alignments.
					// A failed adjustment is only logged (when
					// verbose); the feature is still written using
					// the deletion returned by adjust.
					var refined bool
					d, refined, err = br.adjust(d)
					if err != nil && *verbose {
						log.Printf("failed alignment %s: %v", d.record.Name, err)
					}

					gf.FeatStart = d.rstart
					gf.FeatEnd = d.rend
					if gf.FeatStart == gf.FeatEnd {
						// This is disgusting garbage resulting from
						// GFF not allowing zero length features.
						gf.FeatEnd++
					}

					// The "Dup" attribute is only emitted for
					// refined features.
					if refined {
						gf.FeatAttributes = gf.FeatAttributes[:2]
						gf.FeatAttributes[1].Value = strconv.Itoa(d.dup)
					} else {
						gf.FeatAttributes = gf.FeatAttributes[:1]
					}
					gf.FeatAttributes[0].Value = fmt.Sprintf("%s %d %d", d.record.Name, feat.ZeroToOne(d.qstart), d.qend)
					_, err = w.Write(gf)
					if err != nil {
						return err
					}
				}
				// Close the region whether or not it was reported.
				d.record = nil
			}
		}
	}
	return nil
}