Exemplo n.º 1
0
// import_sequence parses the Sequence CSV file fn and records, for every
// non-empty row, the mapping from the row's md5 checksum to its numeric ID
// in g_md5_seqid_map.
//
// Assumes no header.
// Assumes Sequence element order is:
//
//  0    1           2                3        4
// ID,fastaID,sequenceRecordName,md5checksum,length
//
// ID and length must parse as base-10 integers.  fastaID and
// sequenceRecordName are not used, and length is only validated.
//
// An error is returned if the file cannot be opened, if a row has fewer
// than five fields, or if ID or length fail to parse.  Line numbers in
// error messages are 0-based and count empty lines.
func import_sequence(fn string) error {
	// Number of comma separated fields every row must provide.
	const n_field = 5

	h, e := autoio.OpenReadScannerSimple(fn)
	if e != nil {
		return e
	}
	defer h.Close()

	line_no := -1

	for h.ReadScan() {
		line_no++
		l := h.ReadText()
		if len(l) == 0 {
			continue
		}

		line_parts := strings.Split(l, ",")

		// Reject short rows up front instead of panicking on the
		// indexing below.
		if len(line_parts) < n_field {
			return fmt.Errorf("ERROR: expected %d fields in Sequence file (line %d), got %d", n_field, line_no, len(line_parts))
		}

		id, e := strconv.ParseInt(line_parts[0], 10, 64)
		if e != nil {
			return fmt.Errorf("ERROR: parsing ID in Sequence file (line %d): %s", line_no, line_parts[0])
		}

		// The length is not stored, but a malformed value still
		// invalidates the row.
		if _, e := strconv.ParseInt(line_parts[4], 10, 64); e != nil {
			return fmt.Errorf("ERROR: parsing seqlen in Sequence file (line %d): %s", line_no, line_parts[4])
		}

		m5 := line_parts[3]
		g_md5_seqid_map[m5] = id
	}

	return nil

}
// import_fastj opens the FastJ file fn as a stream and populates the tile
// library globals from it.  Tiles are grouped by "path.step" in g_tile_lib;
// in each grouping there is one TileInfo per md5sum.
//
// A line starting with '>' is a tile header and is parsed as JSON; its
// "md5sum", "tileID" and "seedTileLength" fields are read.  The tileID is
// split on '.' into four components; the first (path) and third (step) are
// parsed as hexadecimal and together form the tile path "path.step".  Every
// other non-empty line is accumulated as sequence for the most recent
// header.
//
// For every header:
//   - g_path_md5sum_freq[tile_path][md5sum] is incremented,
//   - "0:"+md5sum is appended to g_path_md5sum[tile_path] ("1:" instead of
//     "0:" when the fourth tileID component is "001").
//
// For every completed tile (header plus its sequence lines):
//   - the md5 of the sequence is checked against the header's md5sum; a
//     mismatch terminates the process through log.Fatalf,
//   - the sequence is stored in g_md5sum_seq (first occurrence wins),
//   - the leading and trailing 24 bases are stored in g_id_tag under ids
//     produced by create_tag_id,
//   - g_tile_lib[tile_path][md5sum] is created, or its Freq is incremented
//     if it already exists.
//
// The name parameter is not used.
//
// An error is returned if the file cannot be opened, a header is not valid
// JSON, a tileID has fewer than four components or non-hexadecimal
// path/step, or a tile sequence is shorter than one tag.
func import_fastj(name, fn string) error {
	// Length, in bases, of the tag at each end of a tile sequence.
	const tag_len = 24

	var prev_md5sum string
	var prev_tile_path string
	var prev_tileid string
	var prev_seedlen int

	var prev_path_i int64
	var prev_step_i int64

	curseq := make([]string, 0, 10)

	h, e := autoio.OpenReadScannerSimple(fn)
	if e != nil {
		return e
	}
	defer h.Close()

	// flush_tile records the tile described by the prev_* variables and
	// the accumulated curseq.  It is a no-op when no sequence lines have
	// been collected.  It is called once per header (for the tile that
	// header terminates) and once more after the last line of the file.
	flush_tile := func() error {
		if len(curseq) == 0 {
			return nil
		}

		tile_seq := strings.Join(curseq, "")
		if len(tile_seq) < tag_len {
			return fmt.Errorf("tile %s: sequence length %d is shorter than the %d base tag", prev_tileid, len(tile_seq), tag_len)
		}

		pfx_tag := tile_seq[:tag_len]
		sfx_tag := tile_seq[len(tile_seq)-tag_len:]

		md5_tile_seq := md5sum_str(tile_seq)
		if md5_tile_seq != prev_md5sum {
			log.Fatalf("previous md5sum %s != current md5sum %s (%s)\n", prev_md5sum, md5_tile_seq, prev_tileid)
		}

		if _, ok := g_md5sum_seq[md5_tile_seq]; !ok {
			g_md5sum_seq[md5_tile_seq] = tile_seq
		}

		pfx_tag_id := create_tag_id(prev_tile_path, pfx_tag)
		if _, ok := g_id_tag[pfx_tag_id]; !ok {
			g_id_tag[pfx_tag_id] = pfx_tag
		}

		// The suffix tag is filed under the step this tile ends on
		// (step + seed tile length), not under the tile's own step.
		prev_sfx_tile_path := fmt.Sprintf("%03x.%04x", prev_path_i, prev_step_i+int64(prev_seedlen))
		sfx_tag_id := create_tag_id(prev_sfx_tile_path, sfx_tag)
		if _, ok := g_id_tag[sfx_tag_id]; !ok {
			g_id_tag[sfx_tag_id] = sfx_tag
		}

		if _, ok := g_tile_lib[prev_tile_path]; !ok {
			g_tile_lib[prev_tile_path] = make(map[string]TileInfo)
		}

		if z, ok := g_tile_lib[prev_tile_path][prev_md5sum]; ok {
			z.Freq++
			g_tile_lib[prev_tile_path][prev_md5sum] = z
		} else {
			g_tile_lib[prev_tile_path][prev_md5sum] = TileInfo{prev_md5sum, prev_tile_path, prev_seedlen, 1, -1}
		}

		return nil
	}

	for h.ReadScan() {
		l := h.ReadText()
		if len(l) == 0 {
			continue
		}

		if l[0] != '>' {
			curseq = append(curseq, l)
			continue
		}

		sj, e := sloppyjson.Loads(l[1:])
		if e != nil {
			return e
		}

		md5sum := sj.O["md5sum"].S
		tileid := sj.O["tileID"].S
		seedlen := int(sj.O["seedTileLength"].P)

		tile_parts := strings.SplitN(tileid, ".", 4)
		if len(tile_parts) < 4 {
			return fmt.Errorf("malformed tileID %q: expected 4 '.' separated fields, got %d", tileid, len(tile_parts))
		}
		tile_path := fmt.Sprintf("%s.%s", tile_parts[0], tile_parts[2])

		path_i, e := strconv.ParseInt(tile_parts[0], 16, 64)
		if e != nil {
			return e
		}
		step_i, e := strconv.ParseInt(tile_parts[2], 16, 64)
		if e != nil {
			return e
		}

		if _, ok := g_path_md5sum_freq[tile_path]; !ok {
			g_path_md5sum_freq[tile_path] = make(map[string]int)
		}
		g_path_md5sum_freq[tile_path][md5sum]++

		pfx := "0:"
		if tile_parts[3] == "001" {
			pfx = "1:"
		}
		g_path_md5sum[tile_path] = append(g_path_md5sum[tile_path], pfx+md5sum)

		// A new header terminates the previous tile.
		if e = flush_tile(); e != nil {
			return e
		}

		curseq = curseq[0:0]

		prev_md5sum = md5sum
		prev_tile_path = tile_path
		prev_tileid = tileid
		prev_seedlen = seedlen
		prev_path_i = path_i
		prev_step_i = step_i
	}

	// The last tile in the file has no following header to flush it.
	return flush_tile()

}
Exemplo n.º 3
0
// _main is the command line action.  It reads its configuration from the
// cli context c, opens the input streams and the output file, and hands
// them to convert.
//
// Flags read:
//   - "input"      (required) path opened with autoio as the first
//     argument to convert,
//   - "ref-input"  reference stream; "-" means stdin,
//   - "ref-start"  passed to convert when positive, 0 otherwise,
//   - "output"     output file; "-" means stdout,
//   - "allele"     passed to convert,
//   - "pprof" / "pprof-file"  enable CPU profiling into the given file,
//   - "mprof" / "mprof-file"  recorded in gMemProfileFlag/gMemProfileFile,
//   - "Verbose"    recorded in gVerboseFlag,
//   - "max-procs"  passed to runtime.GOMAXPROCS when positive.
//
// NOTE(review): the "seq-start" flag used to be read here into a local
// that was never used; it is not passed to convert.  Confirm whether
// convert should receive it.
//
// The process exits with status 1 when a file cannot be opened and with
// status 2 when profiling cannot be started.  Any error from convert other
// than io.EOF causes a panic.
func _main(c *cli.Context) {

	if c.String("input") == "" {
		fmt.Fprintf(os.Stderr, "Input required, exiting\n")
		cli.ShowAppHelp(c)
		os.Exit(1)
	}

	gff_ain, err := autoio.OpenReadScannerSimple(c.String("input"))
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v", err)
		os.Exit(1)
	}
	defer gff_ain.Close()

	ref_ain := simplestream.SimpleStream{}
	ref_fp := os.Stdin
	if c.String("ref-input") != "-" {
		ref_fp, err = os.Open(c.String("ref-input"))
		if err != nil {
			// Report the error from this open, not a stale one.
			fmt.Fprintf(os.Stderr, "%v", err)
			os.Exit(1)
		}
		defer ref_fp.Close()
	}
	ref_ain.Init(ref_fp)

	var ref_start int64
	if ss := c.Int("ref-start"); ss > 0 {
		ref_start = int64(ss)
	}

	aout := os.Stdout
	if c.String("output") != "-" {
		// os.Create, not os.Open: the output file must be writable and
		// may not exist yet.
		aout, err = os.Create(c.String("output"))
		if err != nil {
			fmt.Fprintf(os.Stderr, "%v", err)
			os.Exit(1)
		}
		defer aout.Close()
	}

	if c.Bool("pprof") {
		gProfileFlag = true
		gProfileFile = c.String("pprof-file")
	}

	if c.Bool("mprof") {
		gMemProfileFlag = true
		gMemProfileFile = c.String("mprof-file")
	}

	gVerboseFlag = c.Bool("Verbose")

	if c.Int("max-procs") > 0 {
		runtime.GOMAXPROCS(c.Int("max-procs"))
	}

	allele := c.Int("allele")

	if gProfileFlag {
		prof_f, err := os.Create(gProfileFile)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Could not open profile file %s: %v\n", gProfileFile, err)
			os.Exit(2)
		}
		// Deferred calls run last-in first-out, so the profile is
		// stopped (and flushed) before the file is closed.
		defer prof_f.Close()

		if err := pprof.StartCPUProfile(prof_f); err != nil {
			fmt.Fprintf(os.Stderr, "Could not start CPU profile: %v\n", err)
			os.Exit(2)
		}
		defer pprof.StopCPUProfile()
	}

	e := convert(&gff_ain, &ref_ain, aout, ref_start, allele)
	if e != nil && e != io.EOF {
		panic(e)
	}

	aout.Sync()

}
Exemplo n.º 4
0
// import_fastj opens the FastJ file fn as a stream and appends, for every
// tile in it, AllelePathItem entries to g_allele_path_item.
//
// A line starting with '>' is a tile header and is parsed as JSON; its
// "md5sum" and "tileID" fields are read.  The tileID is split on '.' into
// four components; the first, third and fourth must be hexadecimal.  The
// fourth component is the tile allele, and together with name it forms the
// allele key "name:allele".  The first time an allele key is seen, entries
// for it are created in g_allele, g_allele_path_item and g_allele_call and
// g_ALLELE_ID is advanced.
//
// Every other non-empty line is accumulated as sequence for the most
// recent header.  When a tile is complete its sequence is split into a 24
// base prefix tag, a body and a 24 base suffix tag.  The md5 of each piece
// is looked up in g_md5_seqid_map and an AllelePathItem is appended for
// the body and the suffix tag; the prefix tag is appended only when the
// allele's path is still empty.
//
// The process is terminated through log.Fatalf when the md5 of a tile's
// sequence does not match the md5sum in its header, or when a tag or body
// md5 is missing from g_md5_seqid_map.
//
// An error is returned if the file cannot be opened, a header is not valid
// JSON, a tileID has fewer than four components or a non-hexadecimal
// component, or a tile sequence is too short to hold both tags.
func import_fastj(name, fn string) error {
	// Length, in bases, of the tag at each end of a tile sequence.
	const tag_len = 24

	var prev_md5sum string
	var prev_tileid string
	var prev_allele_name_id string

	curseq := make([]string, 0, 10)

	h, e := autoio.OpenReadScannerSimple(fn)
	if e != nil {
		return e
	}
	defer h.Close()

	// flush_tile appends the path items for the tile described by the
	// prev_* variables and the accumulated curseq.  It is a no-op when no
	// sequence lines have been collected.  It is called once per header
	// (for the tile that header terminates) and once more after the last
	// line of the file.
	flush_tile := func() error {
		if len(curseq) == 0 {
			return nil
		}

		tile_seq := strings.Join(curseq, "")
		if len(tile_seq) < 2*tag_len {
			return fmt.Errorf("tile %s: sequence length %d is shorter than two %d base tags", prev_tileid, len(tile_seq), tag_len)
		}

		pfx_tag := tile_seq[:tag_len]
		sfx_tag := tile_seq[len(tile_seq)-tag_len:]
		body_seq := tile_seq[tag_len : len(tile_seq)-tag_len]

		md5_tile_seq := md5sum_str(tile_seq)
		if md5_tile_seq != prev_md5sum {
			log.Fatalf("previous md5sum %s != current md5sum %s (%s)\n", prev_md5sum, md5_tile_seq, prev_tileid)
		}

		pfx_md5 := md5sum_str(pfx_tag)
		seqid, ok := g_md5_seqid_map[pfx_md5]
		if !ok {
			log.Fatalf("ERROR: could not find tag '%s' (%s) in Sequence map", pfx_tag, pfx_md5)
		}

		allele_id := g_allele[prev_allele_name_id].Id
		allele_path := g_allele_path_item[prev_allele_name_id]
		cur_idx := len(allele_path)

		// Only add the prefix tag if it's the first one in the
		// AllelePathItem list.
		if cur_idx == 0 {
			allele_path = append(allele_path, AllelePathItem{allele_id, cur_idx, int(seqid), 0, tag_len, "'TRUE'"})
			cur_idx++
		}

		body_md5 := md5sum_str(body_seq)
		if seqid, ok = g_md5_seqid_map[body_md5]; !ok {
			log.Fatalf("ERROR: could not find body (%s) in Sequence map", body_md5)
		}

		allele_path = append(allele_path, AllelePathItem{allele_id, cur_idx, int(seqid), 0, len(body_seq), "'TRUE'"})
		cur_idx++

		sfx_md5 := md5sum_str(sfx_tag)
		if seqid, ok = g_md5_seqid_map[sfx_md5]; !ok {
			log.Fatalf("ERROR: could not find tag '%s' (%s) in Sequence map", sfx_tag, sfx_md5)
		}

		allele_path = append(allele_path, AllelePathItem{allele_id, cur_idx, int(seqid), 0, tag_len, "'TRUE'"})

		g_allele_path_item[prev_allele_name_id] = allele_path

		return nil
	}

	for h.ReadScan() {
		l := h.ReadText()
		if len(l) == 0 {
			continue
		}

		if l[0] != '>' {
			curseq = append(curseq, l)
			continue
		}

		sj, e := sloppyjson.Loads(l[1:])
		if e != nil {
			return e
		}

		md5sum := sj.O["md5sum"].S
		tileid := sj.O["tileID"].S

		tile_parts := strings.SplitN(tileid, ".", 4)
		if len(tile_parts) < 4 {
			return fmt.Errorf("malformed tileID %q: expected 4 '.' separated fields, got %d", tileid, len(tile_parts))
		}

		// Path and step are not needed here, but a header whose path
		// or step is not hexadecimal is still rejected.
		if _, e := strconv.ParseInt(tile_parts[0], 16, 64); e != nil {
			return e
		}
		if _, e := strconv.ParseInt(tile_parts[2], 16, 64); e != nil {
			return e
		}

		tile_allele, e := strconv.ParseInt(tile_parts[3], 16, 64)
		if e != nil {
			return e
		}

		allele_name_id := fmt.Sprintf("%s:%d", name, tile_allele)

		// Initialize everything if we haven't seen it before
		//
		if _, ok := g_allele[allele_name_id]; !ok {

			callset_id := g_callset[name].Id
			ploidy := 1
			variant_set_id := g_START_VARIANTSET_ID

			g_allele[allele_name_id] = Allele{g_ALLELE_ID, variant_set_id, allele_name_id, 0}
			g_allele_path_item[allele_name_id] = make([]AllelePathItem, 0, 1024)
			g_allele_call[allele_name_id] = AlleleCall{g_ALLELE_ID, callset_id, ploidy}
			g_ALLELE_ID++
		}

		// A new header terminates the previous tile.
		if e = flush_tile(); e != nil {
			return e
		}

		curseq = curseq[0:0]

		prev_md5sum = md5sum
		prev_tileid = tileid
		prev_allele_name_id = allele_name_id
	}

	// The last tile in the file has no following header to flush it.
	return flush_tile()

}
Exemplo n.º 5
0
// _main is the command line action.  It streams the FastJ file named by
// the "input-fastj" flag and prints to stdout every tile accepted by
// pass_filter.
//
// Flags read:
//   - "Verbose"      recorded in g_verboseFlag,
//   - "start"        when non-empty, parsed by parse_filter into the
//     g_beg_* globals,
//   - "end"          when non-empty, parsed by parse_filter into the
//     g_end_* globals,
//   - "input-fastj"  (required) path of the FastJ file.
//
// A line starting with '>' is a tile header and is parsed as JSON for its
// "tileID" and "seedTileLength" fields; every other non-empty line is
// sequence belonging to the most recent header.  An accepted tile is
// printed as its original header line, its sequence folded at 50
// characters per line, and a blank line.
//
// The process exits with status 1 when the input flag is missing or the
// file cannot be opened, and through log.Fatal when a header is not valid
// JSON.
func _main(c *cli.Context) {
	// Number of sequence characters printed per output line.
	const fold_width = 50

	g_verboseFlag = c.Bool("Verbose")

	beg_str := c.String("start")
	end_str := c.String("end")

	if len(beg_str) > 0 {
		parse_filter(beg_str, &g_beg_path, &g_beg_ver, &g_beg_step, &g_beg_variant)
	}

	if len(end_str) > 0 {
		parse_filter(end_str, &g_end_path, &g_end_ver, &g_end_step, &g_end_variant)
	}

	if len(c.String("input-fastj")) == 0 {
		fmt.Fprintf(os.Stderr, "Provide input FastJ file\n")
		cli.ShowAppHelp(c)
		os.Exit(1)
	}

	scanner, err := autoio.OpenReadScannerSimple(c.String("input-fastj"))
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v", err)
		os.Exit(1)
	}
	defer scanner.Close()

	h_line := ""

	// first_pass stays true until the first header has been seen, so
	// that nothing is emitted before there is a tile to emit.
	first_pass := true

	// Empty buffer with room for a typical tile.
	seq := make([]byte, 0, 300)

	var prev_tileid string
	var prev_seed_tile_len int

	// emit_tile prints the tile described by h_line, seq and the prev_*
	// variables if a header has been seen and the tile passes the filter.
	emit_tile := func() {
		if first_pass || !pass_filter(prev_tileid, prev_seed_tile_len) {
			return
		}

		fmt.Printf("%s\n", h_line)

		p := 0
		for ; p < (len(seq) - fold_width); p += fold_width {
			fmt.Printf("%s\n", seq[p:p+fold_width])
		}
		// Remaining tail: at most fold_width characters.
		if p < len(seq) {
			fmt.Printf("%s\n", seq[p:])
		}
		fmt.Printf("\n")
	}

	for scanner.ReadScan() {
		l := scanner.ReadText()
		if len(l) == 0 {
			continue
		}

		if l[0] != '>' {
			seq = append(seq, l...)
			continue
		}

		sj, e := sloppyjson.Loads(l[1:])
		if e != nil {
			log.Fatal(e)
		}

		tileid := sj.O["tileID"].S
		seed_tile_len := int(sj.O["seedTileLength"].P)

		// A new header terminates the previous tile.
		emit_tile()

		first_pass = false

		h_line = l
		seq = seq[0:0]
		prev_tileid = tileid
		prev_seed_tile_len = seed_tile_len
	}

	// The last tile in the file has no following header to emit it.
	emit_tile()

}
Exemplo n.º 6
0
// _main is the command line action.  It reads its configuration from the
// cli context c, opens the sequence and tileset inputs, and runs the
// tiling pipeline: load_tagset, load_seq, find_tag_positions, gen_tiling.
//
// Flags read:
//   - "build-prefix"  recorded in g_build_prefix,
//   - "seq-start"     recorded in g_seq_start,
//   - "input"         (required) sequence file, passed to load_seq,
//   - "tileset"       tag set file, passed to load_tagset,
//   - "pprof" / "pprof-file"  enable CPU profiling into the given file,
//   - "mprof" / "mprof-file"  recorded in gMemProfileFlag/gMemProfileFile,
//   - "Verbose"       recorded in gVerboseFlag,
//   - "max-procs"     passed to runtime.GOMAXPROCS when positive.
//
// All flags are applied and profiling is started before the pipeline
// runs, so that verbosity, the processor limit and the CPU profile cover
// the actual work.
//
// The process exits with status 1 when an input is missing or cannot be
// opened and with status 2 when profiling cannot be started.
func _main(c *cli.Context) {

	g_build_prefix = c.String("build-prefix")
	g_seq_start = c.Int("seq-start")

	if c.String("input") == "" {
		fmt.Fprintf(os.Stderr, "Input required, exiting\n")
		cli.ShowAppHelp(c)
		os.Exit(1)
	}

	if c.Bool("pprof") {
		gProfileFlag = true
		gProfileFile = c.String("pprof-file")
	}

	if c.Bool("mprof") {
		gMemProfileFlag = true
		gMemProfileFile = c.String("mprof-file")
	}

	gVerboseFlag = c.Bool("Verbose")

	if c.Int("max-procs") > 0 {
		runtime.GOMAXPROCS(c.Int("max-procs"))
	}

	seq_fp, err := autoio.OpenReadScannerSimple(c.String("input"))
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v", err)
		os.Exit(1)
	}
	defer seq_fp.Close()

	tileset_fp, err := autoio.OpenReadScannerSimple(c.String("tileset"))
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v", err)
		os.Exit(1)
	}
	defer tileset_fp.Close()

	if gProfileFlag {
		prof_f, err := os.Create(gProfileFile)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Could not open profile file %s: %v\n", gProfileFile, err)
			os.Exit(2)
		}
		// Deferred calls run last-in first-out, so the profile is
		// stopped (and flushed) before the file is closed.
		defer prof_f.Close()

		if err := pprof.StartCPUProfile(prof_f); err != nil {
			fmt.Fprintf(os.Stderr, "Could not start CPU profile: %v\n", err)
			os.Exit(2)
		}
		defer pprof.StopCPUProfile()
	}

	load_tagset(tileset_fp)
	load_seq(seq_fp)

	find_tag_positions()
	gen_tiling()

}