func indexSite(index bleve.Index, maxBatchSize int) error {
	startT := time.Now()
	count := 0
	batch := index.NewBatch()
	batchSize := 10
	log.Printf("Walking %s", path.Join(htdocs, "repositories"))
	err := filepath.Walk(path.Join(htdocs, "repositories"), func(p string, f os.FileInfo, err error) error {
		if strings.Contains(p, "/accessions/") && strings.HasSuffix(p, ".json") {
			src, err := ioutil.ReadFile(p)
			if err != nil {
				log.Printf("Can't read %s, %s", p, err)
				return nil
			}
			view := new(cait.NormalizedAccessionView)
			err = json.Unmarshal(src, view)
			if err != nil {
				log.Printf("Can't parse %s, %s", p, err)
				return nil
			}
			// Use the path relative to htdocs, minus the trailing .json extension, as the document ID.
			err = batch.Index(strings.TrimSuffix(strings.TrimPrefix(p, htdocs), ".json"), view)
			if err != nil {
				log.Printf("Indexing error %s, %s", p, err)
				return nil
			}
			if batch.Size() >= batchSize {
				log.Printf("Indexing %d items", batch.Size())
				err := index.Batch(batch)
				if err != nil {
					log.Fatal(err)
				}
				count += batch.Size()
				batch = index.NewBatch()
				log.Printf("Indexed: %d items, batch size %d, running %s\n", count, batchSize, time.Since(startT))
				// Grow the batch size geometrically until it reaches maxBatchSize.
				if batchSize < maxBatchSize {
					batchSize = batchSize * 2
				}
				if batchSize > maxBatchSize {
					batchSize = maxBatchSize
				}
			}
		}
		return nil
	})
	// Flush any documents still queued in the final partial batch.
	if batch.Size() > 0 {
		log.Printf("Indexing %d items", batch.Size())
		err := index.Batch(batch)
		if err != nil {
			log.Fatal(err)
		}
		count += batch.Size()
		log.Printf("Indexed: %d items, batch size %d, running %s\n", count, batchSize, time.Since(startT))
	}
	log.Printf("Total indexed: %d items, total run time %s\n", count, time.Since(startT))
	return err
}
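For context, a hedged sketch of how indexSite might be wired up; the index path "site.bleve" and the maxBatchSize of 1024 are assumptions for illustration, not taken from the original code.

// Sketch only: open (or create) a bleve index and run the site indexer.
func runSiteIndexer() error {
	// Assumed index location; the real path is not shown in the snippet above.
	idx, err := bleve.Open("site.bleve")
	if err != nil {
		idx, err = bleve.New("site.bleve", bleve.NewIndexMapping())
		if err != nil {
			return err
		}
	}
	defer idx.Close()
	// Let the batch size grow from 10 up to 1024 documents per flush.
	return indexSite(idx, 1024)
}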
func indexBeer(i bleve.Index) error {
	// open the directory
	dirEntries, err := ioutil.ReadDir(*jsonDir)
	if err != nil {
		return err
	}

	// walk the directory entries for indexing
	log.Printf("Indexing...")
	count := 0
	startTime := time.Now()
	batch := i.NewBatch()
	batchCount := 0
	for _, dirEntry := range dirEntries {
		filename := dirEntry.Name()
		// read the bytes
		jsonBytes, err := ioutil.ReadFile(*jsonDir + "/" + filename)
		if err != nil {
			return err
		}
		// shred them into a document, keyed by the filename without its extension
		ext := filepath.Ext(filename)
		docId := filename[:(len(filename) - len(ext))]
		err = batch.Index(docId, jsonBytes)
		if err != nil {
			return err
		}
		batchCount++

		if batchCount >= *batchSize {
			err = i.Batch(batch)
			if err != nil {
				return err
			}
			batch = i.NewBatch()
			batchCount = 0
		}
		count++
		if count%1000 == 0 {
			indexDuration := time.Since(startTime)
			indexDurationSeconds := float64(indexDuration) / float64(time.Second)
			timePerDoc := float64(indexDuration) / float64(count)
			log.Printf("Indexed %d documents, in %.2fs (average %.2fms/doc)", count, indexDurationSeconds, timePerDoc/float64(time.Millisecond))
		}
	}
	// flush the last batch
	if batchCount > 0 {
		err = i.Batch(batch)
		if err != nil {
			return err
		}
	}
	indexDuration := time.Since(startTime)
	indexDurationSeconds := float64(indexDuration) / float64(time.Second)
	timePerDoc := float64(indexDuration) / float64(count)
	log.Printf("Indexed %d documents, in %.2fs (average %.2fms/doc)", count, indexDurationSeconds, timePerDoc/float64(time.Millisecond))
	return nil
}
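indexBeer depends on package-level flags and an already-open index; the following is a sketch of plausible surrounding setup, with the flag names, defaults, and index path assumed rather than taken from the original.

// Assumed flag definitions (sketch): directory of JSON documents and batch size.
var (
	jsonDir   = flag.String("jsonDir", "data/", "directory of json files to index")
	batchSize = flag.Int("batchSize", 100, "number of documents per batch")
)

// Sketch only: reuse an existing index if present, otherwise create a new one,
// before handing it to indexBeer.
func openBeerIndex(indexPath string) (bleve.Index, error) {
	idx, err := bleve.Open(indexPath)
	if err == nil {
		return idx, nil
	}
	return bleve.New(indexPath, bleve.NewIndexMapping())
}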
func (be *BleveEngine) BatchIndex(documents []*Document) (int64, error) {
	start := time.Now().UnixNano() / int64(time.Millisecond)

	// Create the index, falling back to opening it if it already exists.
	var index bleve.Index
	mapping := bleve.NewIndexMapping()
	index, err := bleve.New(INDEX, mapping)
	if err != nil {
		index, err = bleve.Open(INDEX)
		if err != nil {
			return 0, err
		}
	}
	defer index.Close()

	// Queue every document into a single batch, then apply it in one operation.
	batch := index.NewBatch()
	for _, document := range documents {
		if err := batch.Index(document.Id, document.Data); err != nil {
			return 0, err
		}
	}
	if err := index.Batch(batch); err != nil {
		return 0, err
	}

	// Report elapsed wall-clock time in milliseconds.
	return time.Now().UnixNano()/int64(time.Millisecond) - start, nil
}
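BatchIndex refers to a Document type, a BleveEngine receiver, and an INDEX constant that are not shown; the definitions below are a guess at their shape, included only so the example is self-contained, followed by a small usage sketch.

// Assumed supporting definitions (sketch only).
const INDEX = "engine.bleve" // assumed on-disk index path

type Document struct {
	Id   string      // used as the bleve document ID
	Data interface{} // arbitrary JSON-like payload to index
}

type BleveEngine struct{}

// Example use: index two documents and log the elapsed milliseconds.
func exampleBatchIndex() {
	engine := &BleveEngine{}
	elapsedMs, err := engine.BatchIndex([]*Document{
		{Id: "1", Data: map[string]interface{}{"title": "first"}},
		{Id: "2", Data: map[string]interface{}{"title": "second"}},
	})
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("batch indexed in %dms", elapsedMs)
}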
func readingWorker(index bleve.Index, work chan *Work) {
	wikiReader, err := blevebench.NewWikiReader(*source)
	if err != nil {
		log.Fatal(err)
	}
	defer wikiReader.Close()

	i := 0
	if *batchSize > 1 {
		// Batched mode: accumulate documents and hand off full batches on the work channel.
		batch := index.NewBatch()
		bytesInBatch := uint64(0)
		a, err := wikiReader.Next()
		for a != nil && err == nil && i < *count {
			err = batch.Index(strconv.Itoa(i), a)
			i++
			if err != nil {
				break
			}
			bytesInBatch += uint64(len(a.Title))
			bytesInBatch += uint64(len(a.Text))
			if batch.Size() >= *batchSize {
				work <- &Work{
					batch:          batch,
					plainTextBytes: bytesInBatch,
				}
				batch = index.NewBatch()
				bytesInBatch = 0
			}
			a, err = wikiReader.Next()
		}
		if err != nil {
			log.Fatalf("reading worker fatal: %v", err)
		}
		// send the final partial batch
		if batch.Size() > 0 {
			work <- &Work{
				batch: batch,
				plainTextBytes: bytesInBatch,
			}
		}
	} else {
		// Unbatched mode: hand off one document per Work item.
		a, err := wikiReader.Next()
		for a != nil && err == nil && i <= *count {
			i++
			work <- &Work{
				doc:            a,
				id:             strconv.Itoa(i),
				plainTextBytes: uint64(len(a.Title) + len(a.Text)),
			}
			a, err = wikiReader.Next()
		}
		if err != nil {
			log.Fatalf("reading worker fatal: %v", err)
		}
	}
	close(work)

	// dump mem stats if requested
	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.WriteHeapProfile(f)
		f.Close()
	}
}
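readingWorker only produces Work items; the struct definition and the consuming side are not shown. Below is a plausible shape and a minimal consumer, assumed for illustration (the field names come from the composite literals above, the rest is guesswork).

// Assumed shape of Work (sketch): either a ready-made batch or a single document.
type Work struct {
	batch          *bleve.Batch
	doc            interface{} // parsed wiki article (Title/Text); exact type comes from blevebench
	id             string
	plainTextBytes uint64
}

// Sketch of a consumer: apply each batch (or single document) to the index,
// then signal completion once the work channel is drained.
func indexingWorker(index bleve.Index, work chan *Work, done chan struct{}) {
	for w := range work {
		var err error
		if w.batch != nil {
			err = index.Batch(w.batch)
		} else {
			err = index.Index(w.id, w.doc)
		}
		if err != nil {
			log.Fatalf("indexing worker fatal: %v", err)
		}
	}
	close(done)
}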