예제 #1
0
// load_seq scans h to exhaustion and stores a lower-cased sequence in the
// package-level g_seq.
//
// Empty lines are ignored. Each non-empty line replaces the previous value,
// so only the last non-empty line read ends up in g_seq. If no non-empty
// line is read, the existing g_seq value is simply lower-cased in place.
func load_seq(h autoio.AutoioHandle) {
	for h.ReadScan() {
		if line := h.ReadText(); line != "" {
			g_seq = line
		}
	}

	g_seq = strings.ToLower(g_seq)
}
예제 #2
0
// load_tagset reads comma-separated pairs, one per line, from h and records
// them in two package-level maps: g_tagset maps the first field to the
// second, and g_tagseq maps the second field back to the first.
//
// Empty lines are skipped (they still count towards the line number). The
// first non-empty line that does not split into exactly two fields aborts
// the load with an error naming its 1-based line number; entries stored
// from earlier lines are left in the maps.
func load_tagset(h autoio.AutoioHandle) error {
	line_no := 0

	for h.ReadScan() {
		line_no++
		l := h.ReadText()
		if len(l) == 0 {
			continue
		}

		fields := strings.Split(l, ",")
		if len(fields) != 2 {
			// line_no is an int: %d prints the number, whereas %s would
			// render as "%!s(int=N)".
			return fmt.Errorf("bad read on line %d", line_no)
		}

		g_tagset[fields[0]] = fields[1]
		g_tagseq[fields[1]] = fields[0]
	}

	return nil
}
예제 #3
0
// convert reads GFF records from gff_ain and emits the corresponding
// sequence to aout through a buffered writer that is flushed on return.
//
// Every non-empty line that does not start with '#' is split on tabs and
// must have at least 9 columns. Column 3 is the record type, columns 4 and 5
// are the 1-based inclusive start and end, and column 9 is the info string.
// For each record:
//
//   - the reference allele found after ";ref_allele " in the info string is
//     stored in the package-level g_GFF_REF (see gff_ref_allele);
//   - for any type other than "REF", the alternate sequence is taken from
//     the first ';'-separated info entry, which must look like
//     "<key> <alt>[/<alt>...]". allele_num selects which '/'-separated
//     alternate to use; an allele_num outside the available range falls back
//     to the first alternate. An alternate of "-" means an empty sequence;
//   - if start_pos is negative it is initialised to the record's 0-based
//     start; if start_pos is behind the record's start, the gap is emitted
//     with emit_nocall;
//   - the record itself is emitted with emit_ref ("REF") or emit_alt
//     (anything else), and start_pos advances by the record length.
//
// The first error encountered (too few columns, unparsable coordinates,
// malformed alleles info, or an error from one of the emit helpers) is
// returned immediately. Coordinates are validated before any other
// per-record work, so g_GFF_REF is not touched for a record whose
// coordinates fail to parse.
func convert(gff_ain *autoio.AutoioHandle, ref_ain *simplestream.SimpleStream, aout *os.File, start_pos int64, allele_num int) error {
	var e error

	bufout := bufio.NewWriter(aout)
	defer bufout.Flush()

	for gff_ain.ReadScan() {
		l := gff_ain.ReadText()

		if len(l) == 0 || l[0] == '#' {
			continue
		}

		gff_parts := strings.Split(l, "\t")
		if len(gff_parts) < 9 {
			return fmt.Errorf("not enough gff parts")
		}
		typ := gff_parts[2]
		info := gff_parts[8]

		// Validate the coordinates before anything is derived from them.
		spos1ref, e0 := strconv.ParseInt(gff_parts[3], 10, 64)
		if e0 != nil {
			return e0
		}
		epos1ref, e1 := strconv.ParseInt(gff_parts[4], 10, 64)
		if e1 != nil {
			return e1
		}

		// Reference check: remember the reference allele the GFF claims.
		g_GFF_REF = gff_ref_allele(info)

		del_n := epos1ref - spos1ref + 1
		spos0ref := spos1ref - 1

		alt_seq := ""
		if typ != "REF" {
			alleles_info := strings.Split(info, ";")[0]
			alts := strings.Split(alleles_info, " ")

			// The alternates live in the second space-separated token, so
			// make sure it exists before indexing it.
			if len(alleles_info) < 2 || len(alts) < 2 {
				return fmt.Errorf("Invalid alleles info (%s)", info)
			}

			// strings.Split always yields at least one element, so index 0
			// is a safe fallback for an out-of-range allele_num.
			alt_seqs := strings.Split(alts[1], "/")
			alt_seq = alt_seqs[0]
			if allele_num >= 0 && allele_num < len(alt_seqs) {
				alt_seq = alt_seqs[allele_num]
			}
			if alt_seq == "-" {
				alt_seq = ""
			}
		}

		if start_pos < 0 {
			if gVerboseFlag {
				fmt.Printf("\n{\"comment\":\"initializing start_pos=%d\"}\n", spos0ref)
			}
			start_pos = spos0ref
		}

		// Fill any gap between the previous record and this one.
		if start_pos < spos0ref {
			e = emit_nocall(start_pos, spos0ref-start_pos, ref_ain, bufout)
			if e != nil {
				return e
			}
			start_pos = spos0ref
		}

		if typ == "REF" {
			e = emit_ref(start_pos, del_n, ref_ain, bufout)
		} else {
			e = emit_alt(start_pos, del_n, alt_seq, ref_ain, bufout)
		}
		if e != nil {
			return e
		}
		start_pos += del_n
	}

	return nil
}

// gff_ref_allele extracts the reference allele that follows ";ref_allele "
// in a GFF info string. It collects consecutive a/c/g/t characters (either
// case) and stops at the first other character; a '-' is collected too but
// ends the scan immediately. The result is a non-nil, possibly empty slice.
func gff_ref_allele(info string) []byte {
	ref_seq := make([]byte, 0, 8)

	idx := strings.Index(info, ";ref_allele ")
	if idx < 0 {
		return ref_seq
	}

	for _, c := range []byte(info[idx+len(";ref_allele "):]) {
		switch c {
		case 'a', 'A', 'c', 'C', 'g', 'G', 't', 'T':
			ref_seq = append(ref_seq, c)
		case '-':
			return append(ref_seq, c)
		default:
			return ref_seq
		}
	}

	return ref_seq
}
예제 #4
0
// LoadSampleFastj reads tile records from scan and returns them grouped by
// allele: element 0 of the result holds the tiles whose variant is 0 and
// element 1 those whose variant is 1.
//
// A record is a header line starting with '>' followed by any number of
// sequence lines, which are concatenated. Empty lines are skipped. Each
// header must provide the fields tileID, md5sum, seedTileLength, startTag,
// endTag, startTile and endTile. tileID is split on '.' and must have at
// least four components; components 0, 2 and 3 are parsed as hexadecimal
// and used as the tile path, step and variant respectively.
//
// When a record is complete (at the next header or at end of input) the MD5
// of its sequence must equal the header's md5sum, and its variant must be 0
// or 1. Any violation, a missing header field, an unparsable tileID, or a
// sequence line appearing before the first header returns a nil slice and
// an error.
func LoadSampleFastj(scan *autoio.AutoioHandle) ([][]TileInfo, error) {
	line_no := 0

	cur_seq := make([]byte, 0, 1024)
	tilepath := -1
	tilestep := -1
	tilevar := -1

	allele_path := make([][]TileInfo, 2)
	for i := range allele_path {
		allele_path[i] = make([]TileInfo, 0, 1024)
	}

	first_tile := true
	var tileid string
	var s_tag string
	var e_tag string
	var md5sum_str string
	var span_len int

	// store_tile validates the record accumulated so far and appends it to
	// the allele selected by tilevar.
	// NOTE(review): cur_seq's backing array is reused for the next record;
	// this assumes emit_fastj_tile does not retain the slice — confirm.
	store_tile := func() error {
		m5 := Md5sum2str(md5.Sum(cur_seq))
		if m5 != md5sum_str {
			return fmt.Errorf("md5sums do not match %s != %s (line %d)", m5, md5sum_str, line_no)
		}
		ti := emit_fastj_tile(tilepath, tilestep, span_len, s_tag, cur_seq, e_tag)

		if tilevar != 0 && tilevar != 1 {
			return fmt.Errorf("invalid tile variant allele %d", tilevar)
		}
		allele_path[tilevar] = append(allele_path[tilevar], ti)
		return nil
	}

	for scan.ReadScan() {
		l := scan.ReadText()
		line_no++
		if len(l) == 0 {
			continue
		}
		if l[0] == '\n' {
			continue
		}

		if l[0] != '>' {
			if first_tile {
				return nil, fmt.Errorf("found body before header (line %d)", line_no)
			}
			cur_seq = append(cur_seq, l...)
			continue
		}

		// A new header closes the previous record, if there was one.
		if !first_tile {
			if e := store_tile(); e != nil {
				return nil, e
			}
		}
		first_tile = false

		hdr := l[1:]
		var pos int

		tileid, pos = simple_text_field(hdr, "tileID")
		if pos < 0 {
			return nil, fmt.Errorf("no tileID found at line %d", line_no)
		}

		md5sum_str, pos = simple_text_field(hdr, "md5sum")
		if pos < 0 {
			return nil, fmt.Errorf("no md5sum found at line %d", line_no)
		}

		span_len, pos = simple_int_field(hdr, "seedTileLength")
		if pos < 0 {
			return nil, fmt.Errorf("no seedTileLength found at line %d", line_no)
		}

		s_tag, pos = simple_text_field(hdr, "startTag")
		if pos < 0 {
			return nil, fmt.Errorf("no startTag found at line %d", line_no)
		}

		e_tag, pos = simple_text_field(hdr, "endTag")
		if pos < 0 {
			return nil, fmt.Errorf("no endTag found at line %d", line_no)
		}

		// startTile and endTile are required to be present, but their
		// values are not used here.
		_, pos = simple_bool_field(hdr, "startTile")
		if pos < 0 {
			return nil, fmt.Errorf("no startTile found at line %d", line_no)
		}

		_, pos = simple_bool_field(hdr, "endTile")
		if pos < 0 {
			return nil, fmt.Errorf("no endTile found at line %d", line_no)
		}

		// Components 0, 2 and 3 are read below; reject a short tileID
		// instead of indexing past the end of the slice.
		tile_parts := strings.Split(tileid, ".")
		if len(tile_parts) < 4 {
			return nil, fmt.Errorf("invalid tileID %q at line %d", tileid, line_no)
		}

		for _, f := range []struct {
			idx int
			dst *int
		}{
			{0, &tilepath},
			{2, &tilestep},
			{3, &tilevar},
		} {
			t, e := strconv.ParseInt(tile_parts[f.idx], 16, 64)
			if e != nil {
				return nil, e
			}
			*f.dst = int(t)
		}

		// Header parsed; start collecting this record's sequence.
		cur_seq = cur_seq[0:0]
	}

	// Store the final record, which has no following header to close it.
	if !first_tile {
		if e := store_tile(); e != nil {
			return nil, e
		}
	}

	return allele_path, nil
}
예제 #5
0
// convert reads gVCF records from gvcf_ain and writes sequence to aout
// through a buffered writer that is flushed on return.
//
// Every non-empty line that does not start with '#' is split on tabs and
// must have at least 10 columns (the sample column, index 9, is required).
// Internally all coordinates are 0-based with an inclusive end. For each
// record:
//
//   - the start comes from column 2 (converted from 1-based); the length is
//     the length of the REF column, unless the INFO column carries an
//     "END=" entry, in which case the record spans start..END inclusive;
//   - any gap between the end of the previous record and this start is
//     passed to emit_nocall_ref;
//   - a record whose FILTER column is not "PASS" is passed to
//     emit_nocall_ref in full;
//   - otherwise the GT entry of the sample column selects what is written:
//     0 writes the reference sequence obtained from peel_ref, k > 0 writes
//     the k-th comma-separated ALT. Only the first allele of the genotype
//     is emitted for now. The number of bytes written is added to the
//     package-level gCounter.
//
// The first error (too few columns, unparsable position or END, missing or
// invalid GT, or an error from peel_ref, get_field_index or
// convert_textvec) is returned immediately.
//
// start_pos is accepted but not used by this function.
func convert(gvcf_ain *autoio.AutoioHandle, ref_ain *simplestream.SimpleStream, aout *os.File, start_pos int64) error {
	bufout := bufio.NewWriter(aout)
	defer bufout.Flush()

	cur_spos := int64(0)

	// All co-ordinates are 0-ref.
	// End is inclusive
	//
	for gvcf_ain.ReadScan() {
		l := gvcf_ain.ReadText()

		if len(l) == 0 || l[0] == '#' {
			continue
		}

		// Index 9 (the sample column) is read below, so ten columns are
		// required.
		gvcf_parts := strings.Split(l, "\t")
		if len(gvcf_parts) < 10 {
			return fmt.Errorf("not enough gvcf parts")
		}

		spos, e0 := strconv.ParseInt(gvcf_parts[1], 10, 64)
		if e0 != nil {
			return e0
		}
		spos--

		ref_anch := gvcf_parts[3]
		alt_str := gvcf_parts[4]
		filt := gvcf_parts[6]
		info_str := gvcf_parts[7]
		fmt_str := gvcf_parts[8]
		samp_str := gvcf_parts[9]

		// Check for END
		//
		epos := int64(-1)
		for _, kv := range strings.Split(info_str, ";") {
			if !strings.HasPrefix(kv, "END=") {
				continue
			}

			// The "END=" prefix guarantees at least two parts.
			end1, e := strconv.ParseInt(strings.Split(kv, "=")[1], 10, 64)
			if e != nil {
				return e
			}
			epos = end1 - 1
			break
		}

		ref_len := int64(len(ref_anch))
		if epos >= 0 {
			ref_len = epos - spos + 1
		}

		// Catch up to current position
		//
		if (cur_spos >= 0) && ((spos - cur_spos) > 0) {
			emit_nocall_ref(cur_spos, spos-cur_spos, ref_ain, bufout)
		}

		// Update previous end position
		//
		cur_spos = spos + ref_len

		// Anything that did not pass the filter is emitted as no-call.
		//
		if filt != "PASS" {
			emit_nocall_ref(spos, ref_len, ref_ain, bufout)
			continue
		}

		refseq, _, e := peel_ref(spos, ref_len, ref_ain)
		if e != nil {
			return e
		}

		gt_idx, er := get_field_index(fmt_str, "GT", ":")
		if er != nil {
			return er
		}

		samp_parts := strings.Split(samp_str, ":")
		if gt_idx < 0 || len(samp_parts) <= gt_idx {
			return fmt.Errorf("%s <-- NO GT FIELD", l)
		}

		gt_field := samp_parts[gt_idx]

		// Genotypes are '/'-separated (unphased), '|'-separated (phased)
		// or a single bare allele.
		var gt_parts []string
		switch {
		case strings.Contains(gt_field, "/"):
			gt_parts = strings.Split(gt_field, "/")
		case strings.Contains(gt_field, "|"):
			gt_parts = strings.Split(gt_field, "|")
		default:
			gt_parts = []string{gt_field}
		}

		gt_allele_idx, e := convert_textvec(gt_parts)
		if e != nil {
			return e
		}

		alt_fields := strings.Split(alt_str, ",")

		// Only first allele for now
		if len(gt_allele_idx) > 0 {
			a := gt_allele_idx[0]
			switch {
			case a == 0:
				bufout.WriteString(refseq)
				gCounter += len(refseq)
			case a > 0 && (a-1) < len(alt_fields):
				alt := alt_fields[a-1]
				bufout.WriteString(alt)
				gCounter += len(alt)
			default:
				return fmt.Errorf("%s <-- invalid GT field", l)
			}
		}
	}

	return nil
}
예제 #6
0
파일: pasta.go 프로젝트: abeconnelly/pasta
// diff_to_interleave reads tab-separated diff lines from ain and prints an
// interleaved two-allele stream to standard output.
//
// Each non-empty line supplies, in order: chromosome, type, start, end and a
// payload field. A ">C{chrom}" control message is printed whenever the
// chromosome changes, and a ">P{start}" message whenever the parsed start
// differs from the running position; a start that does not parse as an
// unsigned integer is ignored. Control messages are put on their own line.
//
// For type "ref" every payload character is printed once per allele. For
// types "alt", "nca" and "noc" the payload is "<alt>[/<alt>...];<refseq>"
// (a single alt is used for both alleles) and each column is printed per
// allele via pasta.SubMap, pasta.InsMap or pasta.DelMap, or "." when both
// the alt and the reference are exhausted. Other types print nothing. A
// newline is inserted after every 50 printed symbols and once more at the
// end of input.
//
// NOTE(review): lines with fewer than five fields, or alt/nca/noc payloads
// without a ';', are not validated and panic with an index out of range.
func diff_to_interleave(ain *autoio.AutoioHandle) {
	const (
		alleleCount = 2
		lineWidth   = 50
	)

	printed := 0
	curChrom := ""
	curPos := -1
	firstLine := true

	// tick counts one emitted symbol and wraps the output line when due.
	tick := func() {
		printed++
		if lineWidth > 0 && printed%lineWidth == 0 {
			fmt.Print("\n")
		}
	}

	for ain.ReadScan() {
		line := ain.ReadText()
		if line == "" {
			continue
		}

		cols := strings.Split(line, "\t")
		chromName, kind, startText, payload := cols[0], cols[1], cols[2], cols[4]

		sawControl := false

		if chromName != curChrom {
			if !firstLine {
				fmt.Print("\n")
			}
			fmt.Printf(">C{%s}", chromName)
			curChrom = chromName
			sawControl = true
		}

		if start, err := strconv.ParseUint(startText, 10, 64); err == nil && curPos != int(start) {
			if !firstLine && !sawControl {
				fmt.Print("\n")
			}
			fmt.Printf(">P{%d}", start)
			curPos = int(start)
			sawControl = true
		}

		if sawControl {
			fmt.Print("\n")
		}
		firstLine = false

		switch kind {
		case "ref":
			for i := 0; i < len(payload); i++ {
				for a := 0; a < alleleCount; a++ {
					fmt.Printf("%c", payload[i])
					tick()
				}
			}
			curPos += len(payload)

		case "alt", "nca", "noc":
			pieces := strings.Split(payload, ";")
			alts := strings.Split(pieces[0], "/")
			if len(alts) == 1 {
				alts = append(alts, alts[0])
			}
			ref := pieces[1]

			// Column count: the longest of the first two alts and the ref.
			width := len(alts[0])
			for _, n := range []int{len(alts[1]), len(ref)} {
				if n > width {
					width = n
				}
			}

			for col := 0; col < width; col++ {
				for _, alt := range alts {
					inAlt, inRef := col < len(alt), col < len(ref)
					switch {
					case inAlt && inRef:
						fmt.Printf("%c", pasta.SubMap[ref[col]][alt[col]])
					case inAlt:
						fmt.Printf("%c", pasta.InsMap[alt[col]])
					case inRef:
						fmt.Printf("%c", pasta.DelMap[ref[col]])
					default:
						fmt.Printf(".")
					}
					tick()
				}
			}

			if ref != "-" {
				curPos += len(ref)
			}
		}
	}

	fmt.Print("\n")
}