func load_seq(h autoio.AutoioHandle) { //fold := 50 for h.ReadScan() { l := h.ReadText() if len(l) == 0 { continue } g_seq = l } g_seq = strings.ToLower(g_seq) }
func load_tagset(h autoio.AutoioHandle) error { line_no := 0 for h.ReadScan() { line_no++ l := h.ReadText() if len(l) == 0 { continue } fields := strings.Split(l, ",") if len(fields) != 2 { return fmt.Errorf("bad read on line %s", line_no) } g_tagset[fields[0]] = fields[1] g_tagseq[fields[1]] = fields[0] } return nil }
func convert(gff_ain *autoio.AutoioHandle, ref_ain *simplestream.SimpleStream, aout *os.File, start_pos int64, allele_num int) error { var e error bufout := bufio.NewWriter(aout) defer bufout.Flush() for gff_ain.ReadScan() { l := gff_ain.ReadText() if len(l) == 0 || l[0] == '#' { continue } gff_parts := strings.Split(l, "\t") if len(gff_parts) < 9 { return fmt.Errorf("not enough gff parts") } chrom := gff_parts[0] _ = chrom typ := gff_parts[2] spos1ref, e0 := strconv.ParseInt(gff_parts[3], 10, 64) epos1ref, e1 := strconv.ParseInt(gff_parts[4], 10, 64) info := gff_parts[8] // REFERENCE CHECK // gff_ref_seq := make([]byte, 0, 8) idx := strings.Index(info, ";ref_allele ") if idx >= 0 { idx += len(";ref_allele ") for ; idx < len(info); idx++ { if info[idx] == 'a' || info[idx] == 'A' || info[idx] == 'c' || info[idx] == 'C' || info[idx] == 'g' || info[idx] == 'G' || info[idx] == 't' || info[idx] == 'T' || info[idx] == '-' { gff_ref_seq = append(gff_ref_seq, info[idx]) } else { break } if gff_ref_seq[len(gff_ref_seq)-1] == '-' { break } } } g_GFF_REF = gff_ref_seq // // REFERENCE CHECK alt_seq := "" del_n := epos1ref - spos1ref + 1 spos0ref := spos1ref - 1 if typ != "REF" { info_parts := strings.Split(info, ";") alleles_info := info_parts[0] if len(alleles_info) < 2 { return fmt.Errorf(fmt.Sprintf("Invalid alleles info (%s)", info)) } alts := strings.Split(alleles_info, " ") alt_seqs := strings.Split(alts[1], "/") if len(alt_seqs) == 0 { alt_seq = alt_seqs[0] } else { if allele_num < len(alt_seqs) { alt_seq = alt_seqs[allele_num] } else { alt_seq = alt_seqs[0] } if alt_seq == "-" { alt_seq = "" } } } if e0 != nil { return e0 } if e1 != nil { return e1 } if start_pos < 0 { if gVerboseFlag { fmt.Printf("\n{\"comment\":\"initializing start_pos=%d\"}\n", spos0ref) } start_pos = spos0ref } if start_pos < spos0ref { e = emit_nocall(start_pos, spos0ref-start_pos, ref_ain, bufout) if e != nil { return e } start_pos = spos0ref } if typ == "REF" { e = emit_ref(start_pos, del_n, ref_ain, bufout) if e != nil { return e } start_pos += del_n } else { //DEBUG //if int(del_n) != len(alt_seq) { fmt.Printf("\n>>>>> [%d] del_n %d, alt_seq %s (%d)\n", start_pos, del_n, alt_seq, len(alt_seq)) } e = emit_alt(start_pos, del_n, alt_seq, ref_ain, bufout) if e != nil { return e } start_pos += del_n } } return nil }
func LoadSampleFastj(scan *autoio.AutoioHandle) ([][]TileInfo, error) { line_no := 0 cur_seq := make([]byte, 0, 1024) tilepath := -1 tilestep := -1 tilevar := -1 nocall := 0 _ = nocall allele_path := make([][]TileInfo, 2) for i := 0; i < len(allele_path); i++ { allele_path[i] = make([]TileInfo, 0, 1024) } var first_tile bool = true var tileid string var s_tag string var e_tag string var md5sum_str string var span_len int var start_tile_flag bool var end_tile_flag bool for scan.ReadScan() { l := scan.ReadText() line_no++ if len(l) == 0 { continue } if l[0] == '\n' { continue } if l[0] == '>' { // store tile sequence // if !first_tile { m5 := Md5sum2str(md5.Sum(cur_seq)) if m5 != md5sum_str { return nil, fmt.Errorf("md5sums do not match %s != %s (line %d)", m5, md5sum_str, line_no) } ti := emit_fastj_tile(tilepath, tilestep, span_len, s_tag, cur_seq, e_tag) if tilevar == 0 { allele_path[0] = append(allele_path[0], ti) } else if tilevar == 1 { allele_path[1] = append(allele_path[1], ti) } else { return nil, fmt.Errorf("invalid tile variant allele %d", tilevar) } } first_tile = false var pos int = 0 tileid, pos = simple_text_field(l[1:], "tileID") if pos < 0 { return nil, fmt.Errorf("no tileID found at line %d", line_no) } md5sum_str, pos = simple_text_field(l[1:], "md5sum") if pos < 0 { return nil, fmt.Errorf("no md5sum found at line %d", line_no) } span_len, pos = simple_int_field(l[1:], "seedTileLength") if pos < 0 { return nil, fmt.Errorf("no md5sum found at line %d", line_no) } s_tag, pos = simple_text_field(l[1:], "startTag") if pos < 0 { return nil, fmt.Errorf("no startTag found at line %d", line_no) } _ = s_tag e_tag, pos = simple_text_field(l[1:], "endTag") if pos < 0 { return nil, fmt.Errorf("no endTag found at line %d", line_no) } _ = e_tag start_tile_flag, pos = simple_bool_field(l[1:], "startTile") if pos < 0 { return nil, fmt.Errorf("no startTile found at line %d", line_no) } _ = start_tile_flag end_tile_flag, pos = simple_bool_field(l[1:], "endTile") if pos < 0 { return nil, fmt.Errorf("no endTile found at line %d", line_no) } _ = end_tile_flag tile_parts := strings.Split(tileid, ".") if t, e := strconv.ParseInt(tile_parts[0], 16, 64); e == nil { tilepath = int(t) } else { return nil, e } if t, e := strconv.ParseInt(tile_parts[2], 16, 64); e == nil { tilestep = int(t) } else { return nil, e } if t, e := strconv.ParseInt(tile_parts[3], 16, 64); e == nil { tilevar = int(t) } else { return nil, e } // Header parsed, go on // cur_seq = cur_seq[0:0] continue } if first_tile { return nil, fmt.Errorf("found body before header (line %d)", line_no) } cur_seq = append(cur_seq, l[:]...) } // store tile sequence // if !first_tile { m5 := Md5sum2str(md5.Sum(cur_seq)) if m5 != md5sum_str { return nil, fmt.Errorf("md5sums do not match %s != %s (line %d)", m5, md5sum_str, line_no) } ti := emit_fastj_tile(tilepath, tilestep, span_len, s_tag, cur_seq, e_tag) if tilevar == 0 { allele_path[0] = append(allele_path[0], ti) } else if tilevar == 1 { allele_path[1] = append(allele_path[1], ti) } else { return nil, fmt.Errorf("invalid tile variant allele %d", tilevar) } } return allele_path, nil }
func convert(gvcf_ain *autoio.AutoioHandle, ref_ain *simplestream.SimpleStream, aout *os.File, start_pos int64) error { var e error //start_pos := int64(0) allele_num := 0 _ = allele_num bufout := bufio.NewWriter(aout) defer bufout.Flush() cur_spos := int64(0) // All co-ordinates are 0-ref. // End is inclusive // for gvcf_ain.ReadScan() { l := gvcf_ain.ReadText() if len(l) == 0 || l[0] == '#' { continue } gvcf_parts := strings.Split(l, "\t") if len(gvcf_parts) < 9 { return fmt.Errorf("not enough gvcf parts") } chrom := gvcf_parts[0] _ = chrom spos, e0 := strconv.ParseInt(gvcf_parts[1], 10, 64) if e0 != nil { return e0 } spos-- id_str := gvcf_parts[2] _ = id_str ref_anch := gvcf_parts[3] _ = ref_anch alt_str := gvcf_parts[4] _ = alt_str qual := gvcf_parts[5] _ = qual filt := gvcf_parts[6] _ = filt info_str := gvcf_parts[7] _ = info_str fmt_str := gvcf_parts[8] _ = fmt_str samp_str := gvcf_parts[9] _ = samp_str // Check for END // epos := int64(-1) info_parts := strings.Split(info_str, ";") for i := 0; i < len(info_parts); i++ { if strings.HasPrefix(info_parts[i], "END=") { end_parts := strings.Split(info_parts[i], "=") // End is inclusive // epos, e = strconv.ParseInt(end_parts[1], 10, 64) epos-- if e != nil { return e } break } } ref_len := int64(len(ref_anch)) if epos >= 0 { ref_len = epos - spos + 1 } typ := "NOCALL" if filt == "PASS" { typ = "REF" } // Catch up to current position // if (cur_spos >= 0) && ((spos - cur_spos) > 0) { //fmt.Printf("\nnocall catchup %d+%d\n", cur_spos, spos-cur_spos) emit_nocall_ref(cur_spos, spos-cur_spos, ref_ain, bufout) } // Update previous end position // cur_spos = spos + ref_len // Process current line // if typ == "NOCALL" { //fmt.Printf("\nnocall ref %d+%d\n", spos, ref_len) emit_nocall_ref(spos, ref_len, ref_ain, bufout) continue } refseq, _, e := peel_ref(spos, ref_len, ref_ain) if e != nil { return e } gt_idx, er := get_field_index(fmt_str, "GT", ":") if er != nil { return er } samp_parts := strings.Split(samp_str, ":") if len(samp_parts) <= gt_idx { return fmt.Errorf(fmt.Sprintf("%s <-- NO GT FIELD", l)) } gt_field := samp_parts[gt_idx] gt_parts := []string{} if strings.Index(gt_field, "/") != -1 { gt_parts = strings.Split(gt_field, "/") //fmt.Printf(" %s %s (un)\n", gt_parts[0], gt_parts[1]) } else if strings.Index(gt_field, "|") != -1 { gt_parts = strings.Split(gt_field, "|") //fmt.Printf(" %s %s (ph)\n", gt_parts[0], gt_parts[1]) } else { gt_parts = append(gt_parts, gt_field) //fmt.Printf(" %s\n", gt_field) } gt_allele_idx, e := convert_textvec(gt_parts) //fmt.Printf(">> ref %s\n", refseq) alt_fields := strings.Split(alt_str, ",") for i := 0; i < len(gt_allele_idx); i++ { if gt_allele_idx[i] == 0 { //fmt.Printf("> alt%d %s\n", gt_allele_idx[i], refseq) //aout.WriteString(refseq) bufout.WriteString(refseq) gCounter += len(refseq) } else if (gt_allele_idx[i] - 1) < len(alt_fields) { //fmt.Printf("> alt%d %s\n", gt_allele_idx[i], //aout.WriteString(alt_fields[gt_allele_idx[i]-1]) bufout.WriteString(alt_fields[gt_allele_idx[i]-1]) gCounter += len(alt_fields[gt_allele_idx[i]-1]) } else { return fmt.Errorf(fmt.Sprintf("%s <-- invalid GT field", l)) } //DEBUG // Only first allele for now break //DEBUG } //fmt.Printf("chrom %s, spos %d, epos %d\n", chrom, spos, epos) } //fmt.Printf("\n\ngCounter %d\n", gCounter) return nil }
func diff_to_interleave(ain *autoio.AutoioHandle) { n_allele := 2 lfmod := 50 bp_count := 0 chrom := "" pos := -1 first_pass := true for ain.ReadScan() { l := ain.ReadText() if len(l) == 0 || l == "" { continue } diff_parts := strings.Split(l, "\t") chrom_s := diff_parts[0] type_s := diff_parts[1] st_s := diff_parts[2] _ = st_s en_s := diff_parts[3] _ = en_s field := diff_parts[4] control_message := false if chrom != chrom_s { if !first_pass && !control_message { fmt.Printf("\n") } fmt.Printf(">C{%s}", chrom_s) chrom = chrom_s control_message = true } _st, e := strconv.ParseUint(st_s, 10, 64) if e == nil { if pos != int(_st) { if !first_pass && !control_message { fmt.Printf("\n") } fmt.Printf(">P{%d}", _st) pos = int(_st) control_message = true } } if control_message { fmt.Printf("\n") } first_pass = false if type_s == "ref" { for i := 0; i < len(field); i++ { for a := 0; a < n_allele; a++ { fmt.Printf("%c", field[i]) bp_count++ if (lfmod > 0) && ((bp_count % lfmod) == 0) { fmt.Printf("\n") } } } pos += len(field) } else if type_s == "alt" || type_s == "nca" || type_s == "noc" { field_parts := strings.Split(field, ";") alt_parts := strings.Split(field_parts[0], "/") if len(alt_parts) == 1 { alt_parts = append(alt_parts, alt_parts[0]) } refseq := field_parts[1] mM := len(alt_parts[0]) if len(alt_parts[1]) > mM { mM = len(alt_parts[1]) } if len(refseq) > mM { mM = len(refseq) } for i := 0; i < mM; i++ { for a := 0; a < len(alt_parts); a++ { if i < len(alt_parts[a]) { if i < len(refseq) { fmt.Printf("%c", pasta.SubMap[refseq[i]][alt_parts[a][i]]) } else { fmt.Printf("%c", pasta.InsMap[alt_parts[a][i]]) } } else if i < len(refseq) { fmt.Printf("%c", pasta.DelMap[refseq[i]]) } else { fmt.Printf(".") } bp_count++ if (lfmod > 0) && ((bp_count % lfmod) == 0) { fmt.Printf("\n") } } } if refseq != "-" { pos += len(refseq) } } } fmt.Printf("\n") }