// Parse Sequence CSV file. // Assumes no header. // Assumes Sequence element order is: // // 0 1 2 3 4 // ID,fastaID,sequenceRecordName,md5checksum,length // func import_sequence(fn string) error { h, e := autoio.OpenReadScannerSimple(fn) if e != nil { return e } defer h.Close() line_no := -1 for h.ReadScan() { line_no++ l := h.ReadText() if len(l) == 0 { continue } line_parts := strings.Split(l, ",") id, e := strconv.ParseInt(line_parts[0], 10, 64) if e != nil { return fmt.Errorf("ERROR: parsing ID in Sequence file (line %d): %s", line_no, line_parts[0]) } fastaid := line_parts[1] _ = fastaid seqname := line_parts[2] _ = seqname m5 := line_parts[3] _ = m5 seqlen, e := strconv.ParseInt(line_parts[4], 10, 64) _ = seqlen if e != nil { return fmt.Errorf("ERROR: parsing seqlen in Sequence file (line %d): %s", line_no, line_parts[4]) } g_md5_seqid_map[m5] = id } return nil }
// Open a stream and read the FastJ file. // Populate g_tile_lib. This will group // tiles by path.step in g_tile_lib. In // each grouping there will be a tile per md5sum // with the appropriate TileInfo field. // func import_fastj(name, fn string) error { var prev_md5sum string var prev_tile_path string var prev_tileid string _ = prev_tileid var prev_seedlen int var prev_path_i int64 var prev_step_i int64 curseq := make([]string, 0, 10) h, e := autoio.OpenReadScannerSimple(fn) if e != nil { return e } defer h.Close() for h.ReadScan() { l := h.ReadText() if len(l) == 0 { continue } if l[0] == '>' { sj, e := sloppyjson.Loads(l[1:]) if e != nil { return e } md5sum := sj.O["md5sum"].S tileid := sj.O["tileID"].S seedlen := int(sj.O["seedTileLength"].P) tile_parts := strings.SplitN(tileid, ".", 4) tile_path := fmt.Sprintf("%s.%s", tile_parts[0], tile_parts[2]) path_i, e := strconv.ParseInt(tile_parts[0], 16, 64) if e != nil { return e } step_i, e := strconv.ParseInt(tile_parts[2], 16, 64) if e != nil { return e } if _, ok := g_path_md5sum_freq[tile_path]; !ok { g_path_md5sum_freq[tile_path] = make(map[string]int) } g_path_md5sum_freq[tile_path][md5sum]++ pfx := "0:" if tile_parts[3] == "001" { pfx = "1:" } g_path_md5sum[tile_path] = append(g_path_md5sum[tile_path], pfx+md5sum) if len(curseq) > 0 { tile_seq := strings.Join(curseq, "") pfx_tag := tile_seq[0:24] sfx_tag := tile_seq[len(tile_seq)-24:] md5_tile_seq := md5sum_str(tile_seq) if md5_tile_seq != prev_md5sum { log.Fatal(fmt.Sprintf("previous md5sum %s (%s) != current md5sum %s (%s)\n", prev_md5sum, tileid, md5_tile_seq, prev_tileid)) } if _, ok := g_md5sum_seq[md5_tile_seq]; !ok { g_md5sum_seq[md5_tile_seq] = tile_seq } pfx_tag_id := create_tag_id(prev_tile_path, pfx_tag) if _, ok := g_id_tag[pfx_tag_id]; !ok { g_id_tag[pfx_tag_id] = pfx_tag } //sfx_tag_id := create_tag_id(prev_tile_path, sfx_tag) prev_sfx_tile_path := fmt.Sprintf("%03x.%04x", prev_path_i, prev_step_i+int64(prev_seedlen)) sfx_tag_id := create_tag_id(prev_sfx_tile_path, sfx_tag) if _, ok := g_id_tag[sfx_tag_id]; !ok { g_id_tag[sfx_tag_id] = sfx_tag } if _, ok := g_tile_lib[prev_tile_path]; !ok { g_tile_lib[prev_tile_path] = make(map[string]TileInfo) } if _, ok := g_tile_lib[prev_tile_path][prev_md5sum]; !ok { g_tile_lib[prev_tile_path][prev_md5sum] = TileInfo{prev_md5sum, prev_tile_path, prev_seedlen, 1, -1} } else { z := g_tile_lib[prev_tile_path][prev_md5sum] z.Freq++ g_tile_lib[prev_tile_path][prev_md5sum] = z } } else { } curseq = curseq[0:0] prev_md5sum = md5sum prev_tile_path = tile_path prev_tileid = tileid prev_seedlen = seedlen prev_path_i = path_i prev_step_i = step_i continue } curseq = append(curseq, l) } if len(curseq) > 0 { tile_seq := strings.Join(curseq, "") pfx_tag := tile_seq[0:24] sfx_tag := tile_seq[len(tile_seq)-24:] md5_tile_seq := md5sum_str(tile_seq) if md5_tile_seq != prev_md5sum { log.Fatal(fmt.Sprintf("previous md5sum %s != current md5sum %s (%s)\n", prev_md5sum, md5_tile_seq, prev_tileid)) } if _, ok := g_md5sum_seq[md5_tile_seq]; !ok { g_md5sum_seq[md5_tile_seq] = tile_seq } pfx_tag_id := create_tag_id(prev_tile_path, pfx_tag) if _, ok := g_id_tag[pfx_tag_id]; !ok { g_id_tag[pfx_tag_id] = pfx_tag } //sfx_tag_id := create_tag_id(prev_tile_path, sfx_tag) prev_sfx_tile_path := fmt.Sprintf("%03x.%04x", prev_path_i, prev_step_i+int64(prev_seedlen)) sfx_tag_id := create_tag_id(prev_sfx_tile_path, sfx_tag) if _, ok := g_id_tag[sfx_tag_id]; !ok { g_id_tag[sfx_tag_id] = sfx_tag } if _, ok := g_tile_lib[prev_tile_path]; !ok { g_tile_lib[prev_tile_path] = make(map[string]TileInfo) } if _, ok := g_tile_lib[prev_tile_path][prev_md5sum]; !ok { g_tile_lib[prev_tile_path][prev_md5sum] = TileInfo{prev_md5sum, prev_tile_path, prev_seedlen, 1, -1} } else { z := g_tile_lib[prev_tile_path][prev_md5sum] z.Freq++ g_tile_lib[prev_tile_path][prev_md5sum] = z } } _ = name return nil }
func _main(c *cli.Context) { if c.String("input") == "" { fmt.Fprintf(os.Stderr, "Input required, exiting\n") cli.ShowAppHelp(c) os.Exit(1) } gff_ain, err := autoio.OpenReadScannerSimple(c.String("input")) _ = gff_ain if err != nil { fmt.Fprintf(os.Stderr, "%v", err) os.Exit(1) } defer gff_ain.Close() ref_ain := simplestream.SimpleStream{} ref_fp := os.Stdin if c.String("ref-input") != "-" { var e error ref_fp, e = os.Open(c.String("ref-input")) if e != nil { fmt.Fprintf(os.Stderr, "%v", err) os.Exit(1) } defer ref_fp.Close() } ref_ain.Init(ref_fp) var ref_start int64 ref_start = 0 ss := c.Int("ref-start") if ss > 0 { ref_start = int64(ss) } var seq_start int64 seq_start = 0 _ = seq_start ss = c.Int("seq-start") if ss > 0 { seq_start = int64(ss) } aout := os.Stdout if c.String("output") != "-" { aout, err = os.Open(c.String("output")) if err != nil { fmt.Fprintf(os.Stderr, "%v", err) os.Exit(1) } defer aout.Close() } if c.Bool("pprof") { gProfileFlag = true gProfileFile = c.String("pprof-file") } if c.Bool("mprof") { gMemProfileFlag = true gMemProfileFile = c.String("mprof-file") } gVerboseFlag = c.Bool("Verbose") if c.Int("max-procs") > 0 { runtime.GOMAXPROCS(c.Int("max-procs")) } allele := c.Int("allele") if gProfileFlag { prof_f, err := os.Create(gProfileFile) if err != nil { fmt.Fprintf(os.Stderr, "Could not open profile file %s: %v\n", gProfileFile, err) os.Exit(2) } pprof.StartCPUProfile(prof_f) defer pprof.StopCPUProfile() } e := convert(&gff_ain, &ref_ain, aout, ref_start, allele) if e != nil && e != io.EOF { panic(e) } aout.Sync() }
// Open a stream and read the FastJ file. // Populate g_tile_lib. This will group // tiles by path.step in g_tile_lib. In // each grouping there will be a tile per md5sum // with the appropriate TileInfo field. // func import_fastj(name, fn string) error { var prev_md5sum string var prev_tile_path string _ = prev_tile_path var prev_tileid string _ = prev_tileid var prev_seedlen int _ = prev_seedlen var prev_path_i int64 _ = prev_path_i var prev_step_i int64 _ = prev_step_i var prev_tile_allele int64 _ = prev_tile_allele var prev_allele_name_id string curseq := make([]string, 0, 10) h, e := autoio.OpenReadScannerSimple(fn) if e != nil { return e } for h.ReadScan() { l := h.ReadText() if len(l) == 0 { continue } if l[0] == '>' { sj, e := sloppyjson.Loads(l[1:]) if e != nil { return e } md5sum := sj.O["md5sum"].S tileid := sj.O["tileID"].S seedlen := int(sj.O["seedTileLength"].P) tile_parts := strings.SplitN(tileid, ".", 4) tile_path := fmt.Sprintf("%s.%s", tile_parts[0], tile_parts[2]) path_i, e := strconv.ParseInt(tile_parts[0], 16, 64) if e != nil { return e } step_i, e := strconv.ParseInt(tile_parts[2], 16, 64) if e != nil { return e } tile_allele, e := strconv.ParseInt(tile_parts[3], 16, 64) if e != nil { return e } allele_name_id := fmt.Sprintf("%s:%d", name, tile_allele) // Initialize everything if we haven't seen it before // if _, ok := g_allele[allele_name_id]; !ok { callset_id := g_callset[name].Id ploidy := 1 variant_set_id := g_START_VARIANTSET_ID g_allele[allele_name_id] = Allele{g_ALLELE_ID, variant_set_id, allele_name_id, 0} g_allele_path_item[allele_name_id] = make([]AllelePathItem, 0, 1024) g_allele_call[allele_name_id] = AlleleCall{g_ALLELE_ID, callset_id, ploidy} g_ALLELE_ID++ } if len(curseq) > 0 { tile_seq := strings.Join(curseq, "") pfx_tag := tile_seq[0:24] sfx_tag := tile_seq[len(tile_seq)-24:] body_seq := tile_seq[24 : len(tile_seq)-24] md5_tile_seq := md5sum_str(tile_seq) if md5_tile_seq != prev_md5sum { log.Fatal(fmt.Sprintf("previous md5sum %s (%s) != current md5sum %s (%s)\n", prev_md5sum, tileid, md5_tile_seq, prev_tileid)) } var ok bool var seqid int64 allele_id := g_allele[prev_allele_name_id].Id allele_path := g_allele_path_item[prev_allele_name_id] cur_idx := len(allele_path) pfx_md5 := md5sum_str(pfx_tag) if seqid, ok = g_md5_seqid_map[pfx_md5]; !ok { log.Fatal(fmt.Sprintf("ERROR: could not find tag '%s' (%s) in Sequence map", pfx_tag, pfx_md5)) } if cur_idx == 0 { allele_path = append(allele_path, AllelePathItem{allele_id, cur_idx, int(seqid), 0, 24, "'TRUE'"}) cur_idx++ } body_md5 := md5sum_str(body_seq) if seqid, ok = g_md5_seqid_map[body_md5]; !ok { log.Fatal(fmt.Sprintf("ERROR: could not find body (%s) in Sequence map", body_md5)) } allele_path = append(allele_path, AllelePathItem{allele_id, cur_idx, int(seqid), 0, len(body_seq), "'TRUE'"}) cur_idx++ sfx_md5 := md5sum_str(sfx_tag) if seqid, ok = g_md5_seqid_map[sfx_md5]; !ok { log.Fatal(fmt.Sprintf("ERROR: could not find tag '%s' (%s) in Sequence map", sfx_tag, sfx_md5)) } allele_path = append(allele_path, AllelePathItem{allele_id, cur_idx, int(seqid), 0, 24, "'TRUE'"}) cur_idx++ g_allele_path_item[prev_allele_name_id] = allele_path } else { } curseq = curseq[0:0] prev_md5sum = md5sum prev_tile_path = tile_path prev_tileid = tileid prev_seedlen = seedlen prev_path_i = path_i prev_step_i = step_i prev_tile_allele = tile_allele prev_allele_name_id = allele_name_id continue } curseq = append(curseq, l) } if len(curseq) > 0 { tile_seq := strings.Join(curseq, "") pfx_tag := tile_seq[0:24] sfx_tag := tile_seq[len(tile_seq)-24:] body_seq := tile_seq[24 : len(tile_seq)-24] md5_tile_seq := md5sum_str(tile_seq) if md5_tile_seq != prev_md5sum { log.Fatal(fmt.Sprintf("previous md5sum %s != current md5sum %s (%s)\n", prev_md5sum, md5_tile_seq, prev_tileid)) } var ok bool var seqid int64 pfx_md5 := md5sum_str(pfx_tag) if seqid, ok = g_md5_seqid_map[pfx_md5]; !ok { log.Fatal(fmt.Sprintf("ERROR: could not find tag '%s' (%s) in Sequence map", pfx_tag, pfx_md5)) } // Only add the prefix tag if it's the first one in the AllelePathItem // allele_id := g_allele[prev_allele_name_id].Id allele_path := g_allele_path_item[prev_allele_name_id] cur_idx := len(allele_path) if cur_idx == 0 { allele_path = append(allele_path, AllelePathItem{allele_id, cur_idx, int(seqid), 0, 24, "'TRUE'"}) cur_idx++ } body_md5 := md5sum_str(body_seq) if seqid, ok = g_md5_seqid_map[body_md5]; !ok { log.Fatal(fmt.Sprintf("ERROR: could not find body (%s) in Sequence map", body_md5)) } allele_path = append(allele_path, AllelePathItem{allele_id, cur_idx, int(seqid), 0, len(body_seq), "'TRUE'"}) cur_idx++ sfx_md5 := md5sum_str(sfx_tag) if seqid, ok = g_md5_seqid_map[sfx_md5]; !ok { log.Fatal(fmt.Sprintf("ERROR: could not find tag '%s' (%s) in Sequence map", sfx_tag, sfx_md5)) } allele_path = append(allele_path, AllelePathItem{allele_id, cur_idx, int(seqid), 0, 24, "'TRUE'"}) cur_idx++ g_allele_path_item[prev_allele_name_id] = allele_path } _ = name return nil }
func _main(c *cli.Context) { g_verboseFlag = c.Bool("Verbose") beg_str := c.String("start") end_str := c.String("end") if len(beg_str) > 0 { parse_filter(beg_str, &g_beg_path, &g_beg_ver, &g_beg_step, &g_beg_variant) } if len(end_str) > 0 { parse_filter(end_str, &g_end_path, &g_end_ver, &g_end_step, &g_end_variant) } if len(c.String("input-fastj")) == 0 { fmt.Fprintf(os.Stderr, "Provide input FastJ file\n") cli.ShowAppHelp(c) os.Exit(1) } scanner, err := autoio.OpenReadScannerSimple(c.String("input-fastj")) if err != nil { fmt.Fprintf(os.Stderr, "%v", err) os.Exit(1) } defer scanner.Close() h_line := "" fold_width := 50 first_pass := true line_no := 0 seq := make([]byte, 300) var prev_tileid string var prev_seed_tile_len int for scanner.ReadScan() { line_no++ l := scanner.ReadText() if len(l) == 0 { continue } if l[0] == '>' { sj, e := sloppyjson.Loads(l[1:]) if e != nil { log.Fatal(e) } tileid := sj.O["tileID"].S seed_tile_len := int(sj.O["seedTileLength"].P) if !first_pass { if pass_filter(prev_tileid, prev_seed_tile_len) { fmt.Printf("%s\n", h_line) p := 0 for ; p < (len(seq) - fold_width); p += fold_width { fmt.Printf("%s\n", seq[p:p+fold_width]) } if p < len(seq) { fmt.Printf("%s\n", seq[p:]) } fmt.Printf("\n") } } first_pass = false h_line = l seq = seq[0:0] prev_tileid = tileid prev_seed_tile_len = seed_tile_len continue } seq = append(seq, []byte(l)...) } if !first_pass { if pass_filter(prev_tileid, prev_seed_tile_len) { fmt.Printf("%s\n", h_line) p := 0 for ; p < (len(seq) - fold_width); p += fold_width { fmt.Printf("%s\n", seq[p:p+fold_width]) } if p < len(seq) { fmt.Printf("%s\n", seq[p:]) } fmt.Printf("\n") } } }
func _main(c *cli.Context) { g_build_prefix = c.String("build-prefix") g_seq_start = c.Int("seq-start") if c.String("input") == "" { fmt.Fprintf(os.Stderr, "Input required, exiting\n") cli.ShowAppHelp(c) os.Exit(1) } seq_fp, err := autoio.OpenReadScannerSimple(c.String("input")) _ = seq_fp if err != nil { fmt.Fprintf(os.Stderr, "%v", err) os.Exit(1) } defer seq_fp.Close() tileset_fp, err := autoio.OpenReadScannerSimple(c.String("tileset")) _ = tileset_fp if err != nil { fmt.Fprintf(os.Stderr, "%v", err) os.Exit(1) } defer tileset_fp.Close() load_tagset(tileset_fp) load_seq(seq_fp) find_tag_positions() gen_tiling() if c.Bool("pprof") { gProfileFlag = true gProfileFile = c.String("pprof-file") } if c.Bool("mprof") { gMemProfileFlag = true gMemProfileFile = c.String("mprof-file") } gVerboseFlag = c.Bool("Verbose") if c.Int("max-procs") > 0 { runtime.GOMAXPROCS(c.Int("max-procs")) } if gProfileFlag { prof_f, err := os.Create(gProfileFile) if err != nil { fmt.Fprintf(os.Stderr, "Could not open profile file %s: %v\n", gProfileFile, err) os.Exit(2) } pprof.StartCPUProfile(prof_f) defer pprof.StopCPUProfile() } }