func remoteQueryTaxid2TaxonByFile(host string, port int, dataFile string, chunkSize int, threads int) { if chunkSize <= 0 { chunkSize = 1000 } fn := func(line string) (interface{}, bool, error) { line = strings.TrimRight(line, "\n") if line == "" { return "", false, nil } return line, true, nil } reader, err := breader.NewBufferedReader(dataFile, threads, chunkSize, fn) checkError(err) chResults := make(chan taxon.MessageTaxid2TaxonMap, threads) // receive result and print chDone := make(chan int) go func() { for msg := range chResults { if msg.Status != "OK" { log.Error(msg.Message) } for taxid, taxon := range msg.Taxons { fmt.Printf("Query TaxIDs: %s\n", taxid) bs, err := json.MarshalIndent(taxon, "", " ") checkError(err) fmt.Printf("Taxon: %s\n\n", string(bs)) } } chDone <- 1 }() // querying var wg sync.WaitGroup tokens := make(chan int, threads) for chunk := range reader.Ch { tokens <- 1 wg.Add(1) queries := make([]string, len(chunk.Data)) for i, data := range chunk.Data { queries[i] = data.(string) } go func(queries []string) { defer func() { wg.Done() <-tokens }() msg := taxon.RemoteQueryTaxid2Taxon(host, port, queries) checkError(err) chResults <- msg }(queries) } wg.Wait() close(chResults) <-chDone }
// ImportGiTaxid reads gi_taxid_nucl or gi_taxid_prot file and writes the data to database func ImportGiTaxid(dbFile string, bucket string, dataFile string, chunkSize int, force bool) { db, err := bolt.Open(dbFile, 0600, nil) checkError(err) defer db.Close() if force { err = deleteBucket(db, bucket) checkError(err) log.Info("Old database deleted: %s", bucket) } if chunkSize <= 0 { chunkSize = 1000000 } fn := func(line string) (interface{}, bool, error) { line = strings.TrimRight(line, "\n") if line == "" || line[0] == '#' { return nil, false, nil } items := strings.Split(line, "\t") if len(items) != 2 { return nil, false, nil } if items[0] == "" || items[1] == "" { return nil, false, nil } return items, true, nil } reader, err := breader.NewBufferedReader(dataFile, runtime.NumCPU(), chunkSize, fn) checkError(err) n := 0 for chunk := range reader.Ch { if chunk.Err != nil { checkError(chunk.Err) return } records := make([][]string, len(chunk.Data)) for i, data := range chunk.Data { switch reflect.TypeOf(data).Kind() { case reflect.Slice: s := reflect.ValueOf(data) items := make([]string, s.Len()) for i := 0; i < s.Len(); i++ { items[i] = s.Index(i).String() } records[i] = items } } write2db(records, db, bucket) n += len(records) log.Info("%d records imported to %s", n, dbFile) } }
func remoteQueryGi2TaxidByFile(host string, port int, dataType string, dataFile string, chunkSize int, threads int) { if chunkSize <= 0 { chunkSize = 1000 } fn := func(line string) (interface{}, bool, error) { line = strings.TrimSpace(strings.TrimRight(line, "\n")) if line == "" { return "", false, nil } return line, true, nil } reader, err := breader.NewBufferedReader(dataFile, threads, chunkSize, fn) checkError(err) chResults := make(chan taxon.MessageGI2TaxidMap, threads) // receive result and print chDone := make(chan int) go func() { for msg := range chResults { if msg.Status != "OK" { log.Error(msg.Message) } for gi, taxid := range msg.Taxids { fmt.Printf("%s\t%s\n", gi, taxid) } } chDone <- 1 }() // querying var wg sync.WaitGroup tokens := make(chan int, threads) for chunk := range reader.Ch { tokens <- 1 wg.Add(1) gis := make([]string, len(chunk.Data)) for i, data := range chunk.Data { gis[i] = data.(string) } go func(gis []string) { defer func() { wg.Done() <-tokens }() msg := taxon.RemoteQueryGi2Taxid(host, port, dataType, gis) checkError(err) chResults <- msg }(gis) } wg.Wait() close(chResults) <-chDone }
// ImportNodes reads data from nodes.dmp and write to bolt database func ImportNodes(dbFile string, bucket string, dataFile string, batchSize int, force bool) { db, err := bolt.Open(dbFile, 0600, nil) checkError(err) defer db.Close() if force { err = deleteBucket(db, bucket) checkError(err) log.Info("Old database deleted: %s", bucket) } if batchSize <= 0 { batchSize = 10000 } re := regexp.MustCompile(`\t\|$`) fn := func(line string) (interface{}, bool, error) { line = strings.TrimRight(line, "\n") if line == "" { return nil, false, nil } items := strings.Split(re.ReplaceAllString(line, ""), "\t|\t") if len(items) != 13 { return nil, false, nil } return nodes.NodeFromArgs(items), true, nil } reader, err := breader.NewBufferedReader(dataFile, runtime.NumCPU(), batchSize, fn) checkError(err) n := 0 for chunk := range reader.Ch { if chunk.Err != nil { checkError(chunk.Err) return } records := make([][]string, len(chunk.Data)) for i, data := range chunk.Data { node := data.(nodes.Node) nodeJSONStr, err := node.ToJSON() if err != nil { checkError(chunk.Err) return } records[i] = []string{node.TaxID, nodeJSONStr} } write2db(records, db, bucket) n += len(records) log.Info("%d records imported to %s", n, dbFile) } }
// ReadFeatures returns bed data of a file, availabe type values are 3,4,5,6 func ReadFeatures(file string, n int) ([]Feature, map[string]map[string]string, error) { if _, err := os.Stat(file); os.IsNotExist(err) { return nil, nil, err } fn := func(line string) (interface{}, bool, error) { if line[0] == '#' { return nil, false, nil } if string(line[0:7]) == "browser" { items := strings.Split(strings.TrimRight(line, "\n"), " ") if len(items) < 3 { return nil, false, ErrBadBrowserLine } details := make(map[string]string) details[items[1]] = items[2] return meta{"browser", details}, true, nil } if string(line[0:5]) == "track" { details := make(map[string]string) found := TrackItemRegexp.FindAllStringSubmatch(line, -1) for _, sub := range found { details[sub[0]] = sub[1] } return meta{"track", details}, true, nil } return nil, false, nil } reader, err := breader.NewBufferedReader(file, runtime.NumCPU(), 100, fn) if err != nil { return nil, nil, err } features := []Feature{} for chunk := range reader.Ch { if chunk.Err != nil { return nil, nil, chunk.Err } for _, data := range chunk.Data { fmt.Println(reflect.TypeOf(data).Kind()) } } return features, nil, nil }
func queryGi2TaxidByFile(dbFilePath string, queryType string, dataFile string, chunkSize int, threads int) { if chunkSize <= 0 { chunkSize = 10000 } fn := func(line string) (interface{}, bool, error) { line = strings.TrimSpace(strings.TrimRight(line, "\n")) if line == "" { return "", false, nil } return line, true, nil } reader, err := breader.NewBufferedReader(dataFile, runtime.NumCPU(), chunkSize, fn) checkError(err) pool := taxon.NewDBPool(dbFilePath, threads) chResults := make(chan [][]string, threads) // receive result and print chDone := make(chan int) go func() { for s := range chResults { gis, taxids := s[0], s[1] for i, gi := range gis { fmt.Printf("%s\t%s\n", gi, taxids[i]) } } chDone <- 1 }() // querying var wg sync.WaitGroup tokens := make(chan int, threads) for chunk := range reader.Ch { if chunk.Err != nil { checkError(chunk.Err) break } tokens <- 1 wg.Add(1) gis := make([]string, len(chunk.Data)) for i, data := range chunk.Data { gis[i] = data.(string) } go func(gis []string) { db := pool.GetDB() defer func() { pool.ReleaseDB(db) wg.Done() <-tokens }() taxids, err := taxon.QueryGi2Taxid(db, queryType, gis) checkError(err) chResults <- [][]string{gis, taxids} }(gis) } wg.Wait() close(chResults) <-chDone }
// ImportNames reads data from names.dmp and write to bolt database func ImportNames(dbFile string, bucket string, dataFile string, chunkSize int, force bool) { db, err := bolt.Open(dbFile, 0600, nil) checkError(err) defer db.Close() if force { err = deleteBucket(db, bucket) checkError(err) log.Info("Old database deleted: %s", bucket) } if chunkSize <= 0 { chunkSize = 10000 } re := regexp.MustCompile(`\t\|$`) fn := func(line string) (interface{}, bool, error) { line = strings.TrimRight(line, "\n") if line == "" { return nil, false, nil } items := strings.Split(re.ReplaceAllString(line, ""), "\t|\t") if len(items) != 4 { return nil, false, nil } return nodes.NameFromArgs(items), true, nil } reader, err := breader.NewBufferedReader(dataFile, runtime.NumCPU(), chunkSize, fn) checkError(err) names := make(map[string]nodes.Name) n := 0 for chunk := range reader.Ch { if chunk.Err != nil { checkError(chunk.Err) return } for _, data := range chunk.Data { name := data.(nodes.Name) if _, ok := names[name.TaxID]; ok { names[name.TaxID] = nodes.MergeNames(names[name.TaxID], name) } else { names[name.TaxID] = name } } n += len(chunk.Data) log.Info("%d records readed", n) } chResults := make(chan []string, runtime.NumCPU()) // write to db chDone := make(chan int) go func() { records := make([][]string, chunkSize) i := 0 n := 0 for s := range chResults { records[i] = s i++ n++ if i%chunkSize == 0 { write2db(records, db, bucket) log.Info("%d records imported to %s", n, dbFile) records = make([][]string, chunkSize) i = 0 } } log.Info("%d records imported to %s", n, dbFile) chDone <- 1 }() // name to json tokens := make(chan int, runtime.NumCPU()) var wg sync.WaitGroup for _, name := range names { tokens <- 1 wg.Add(1) go func(name nodes.Name) { defer func() { wg.Done() <-tokens }() nameJSONStr, err := name.ToJSON() checkError(err) chResults <- []string{name.TaxID, nameJSONStr} }(name) } wg.Wait() close(chResults) <-chDone }
// ReadFilteredFeatures returns gtf features of specific chrs in a file func ReadFilteredFeatures(file string, chrs []string, feats []string, attrs []string) ([]Feature, error) { if _, err := os.Stat(file); os.IsNotExist(err) { return nil, err } chrsMap := make(map[string]struct{}, len(chrs)) for _, chr := range chrs { chrsMap[strings.ToLower(chr)] = struct{}{} } featsMap := make(map[string]struct{}, len(feats)) for _, f := range feats { featsMap[strings.ToLower(f)] = struct{}{} } attrsMap := make(map[string]struct{}, len(attrs)) for _, f := range attrs { attrsMap[strings.ToLower(f)] = struct{}{} } fn := func(line string) (interface{}, bool, error) { if len(line) == 0 || line[0] == '#' { return nil, false, nil } line = strings.TrimRight(line, "\r\n") items := strings.Split(line, "\t") if len(items) != 9 { return nil, false, nil } if len(chrs) > 0 { // selected chrs if _, ok := chrsMap[strings.ToLower(items[0])]; !ok { return nil, false, nil } } if len(feats) > 0 { // selected features if _, ok := featsMap[strings.ToLower(items[2])]; !ok { return nil, false, nil } } var err error start, err := strconv.Atoi(items[3]) if err != nil { return nil, false, fmt.Errorf("bad start: %s", items[3]) } end, err := strconv.Atoi(items[4]) if err != nil { return nil, false, fmt.Errorf("bad end: %s", items[4]) } var score *float64 if items[5] != "." { s, err := strconv.ParseFloat(items[5], 64) if err != nil { return nil, false, fmt.Errorf("bad score: %s", items[5]) } score = &s } var strand *string if items[6] != "." { s := items[6] if !(s == "+" || s == "-") { return nil, false, fmt.Errorf("illigal strand: %s", s) } strand = &s } var frame *int if items[7] != "." { f, err := strconv.Atoi(items[7]) if err != nil { return nil, false, fmt.Errorf("bad frame: %s", items[7]) } if !(f == 0 || f == 1 || f == 2) { return nil, false, fmt.Errorf("illigal frame: %d", f) } frame = &f } feature := Feature{items[0], items[1], items[2], start, end, score, strand, frame, nil} tagValues := strings.Split(items[8], "; ") if len(tagValues) > 0 { var ok bool feature.Attributes = []Attribute{} for _, tagValue := range tagValues[0 : len(tagValues)-1] { items2 := strings.SplitN(tagValue, " ", 2) tag := items2[0] if _, ok = attrsMap[tag]; !ok { continue } value := items2[1] // if value[len(value)-1] == ';' { // value = value[0 : len(value)-1] // } if len(value) > 2 { value = value[1 : len(value)-1] } else { value = "" } feature.Attributes = append(feature.Attributes, Attribute{tag, value}) } } return feature, true, nil } reader, err := breader.NewBufferedReader(file, Threads, 100, fn) if err != nil { return nil, err } features := []Feature{} for chunk := range reader.Ch { if chunk.Err != nil { return nil, chunk.Err } for _, data := range chunk.Data { features = append(features, data.(Feature)) } } return features, nil }
func remoteName2TaxIDByFile(host string, port int, useRegexp bool, nameClass string, dataFile string, chunkSize int, threads int) { if chunkSize <= 0 { chunkSize = 1000 } fn := func(line string) (interface{}, bool, error) { line = strings.TrimRight(line, "\n") if line == "" { return "", false, nil } return line, true, nil } reader, err := breader.NewBufferedReader(dataFile, threads, chunkSize, fn) checkError(err) chResults := make(chan taxon.MssageName2TaxIDMap, threads) // receive result and print chDone := make(chan int) go func() { for msg := range chResults { if msg.Status != "OK" { log.Error(msg.Message) } for name, items := range msg.TaxIDs { idnames := make([]string, len(items)) for i, item := range items { idnames[i] = fmt.Sprintf("%d(%s)", item.TaxID, item.ScientificName) } fmt.Printf("%s\t%s\n", name, strings.Join(idnames, ",")) } } chDone <- 1 }() // querying var wg sync.WaitGroup tokens := make(chan int, threads) for chunk := range reader.Ch { tokens <- 1 wg.Add(1) queries := make([]string, len(chunk.Data)) for i, data := range chunk.Data { queries[i] = data.(string) } go func(queries []string) { defer func() { wg.Done() <-tokens }() msg := taxon.RemoteQueryName2TaxID(host, port, useRegexp, nameClass, queries) checkError(err) chResults <- msg }(queries) } wg.Wait() close(chResults) <-chDone }