// findOffersFromText returns the dated offers matching the full-text query;
// an empty query matches nothing. The ids slice is forwarded to
// makeSearchQuery to build the underlying query.
func findOffersFromText(index bleve.Index, query string, ids []string) ([]datedOffer, error) {
	if query == "" {
		return nil, nil
	}
	datedOffers := []datedOffer{}
	q, err := makeSearchQuery(query, ids)
	if err != nil {
		return nil, err
	}
	rq := bleve.NewSearchRequest(q)
	rq.Size = 20000
	rq.Fields = []string{"date"}
	res, err := index.Search(rq)
	if err != nil {
		return nil, err
	}
	for _, doc := range res.Hits {
		date, ok := doc.Fields["date"].(string)
		if !ok {
			return nil, fmt.Errorf("could not retrieve date for %s", doc.ID)
		}
		datedOffers = append(datedOffers, datedOffer{
			Date: date,
			Id:   doc.ID,
		})
	}
	return datedOffers, nil
}

// batchIndexingWorker consumes Work items from workChan until it is closed,
// applying either a prepared batch or a single document, and logs a running
// total every *printCount documents. (A single-case select over the channel
// is equivalent to ranging over it, so the loop is written as a range.)
func batchIndexingWorker(index bleve.Index, workChan chan *Work, start time.Time) {
	for work := range workChan {
		workSize := 1
		if work.batch != nil {
			if err := index.Batch(work.batch); err != nil {
				log.Fatalf("indexer worker fatal: %v", err)
			}
			workSize = work.batch.Size()
		} else {
			if err := index.Index(work.id, work.doc); err != nil {
				log.Fatalf("indexer worker fatal: %v", err)
			}
		}
		elapsedTime := time.Since(start) / time.Millisecond
		updatedTotal := atomic.AddUint64(&totalIndexed, uint64(workSize))
		atomic.AddUint64(&totalPlainTextIndexed, work.plainTextBytes)
		if updatedTotal%uint64(*printCount) == 0 {
			log.Printf("%d,%d", updatedTotal, elapsedTime)
		}
	}
}

// listIndexIds iterates over every internal document ID in the index and
// returns the corresponding external IDs.
func listIndexIds(index bleve.Index) ([]string, error) {
	idx, _, err := index.Advanced()
	if err != nil {
		return nil, err
	}
	reader, err := idx.Reader()
	if err != nil {
		return nil, err
	}
	defer reader.Close()
	idReader, err := reader.DocIDReaderAll()
	if err != nil {
		return nil, err
	}
	defer idReader.Close()
	ids := []string{}
	for {
		id, err := idReader.Next()
		if err != nil {
			return nil, err
		}
		if id == nil {
			break
		}
		extId, err := reader.ExternalID(id)
		if err != nil {
			return nil, err
		}
		ids = append(ids, extId)
	}
	return ids, nil
}

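// What follows is a minimal usage sketch for listIndexIds, not part of the
// original code: the index path is a placeholder and errors are simply fatal.
func printAllDocIDs() {
	index, err := bleve.Open("example.bleve") // placeholder path
	if err != nil {
		log.Fatal(err)
	}
	defer index.Close()

	ids, err := listIndexIds(index)
	if err != nil {
		log.Fatal(err)
	}
	for _, id := range ids {
		fmt.Println(id)
	}
}
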
// indexWorker reads wikipedia articles from *source and indexes them one at
// a time, writing a heap profile at the end if requested.
func indexWorker(index bleve.Index) {
	wikiReader, err := blevebench.NewWikiReader(*source)
	if err != nil {
		log.Fatal(err)
	}
	defer wikiReader.Close()

	i := 0
	a, err := wikiReader.Next()
	for a != nil && err == nil && i <= *count {
		i++
		if ierr := index.Index(strconv.Itoa(i), a); ierr != nil {
			log.Fatalf("indexing worker fatal: %v", ierr)
		}
		a, err = wikiReader.Next()
	}
	if err != nil {
		log.Fatalf("reading worker fatal: %v", err)
	}

	// dump mem stats if requested
	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.WriteHeapProfile(f)
		f.Close()
	}
}

// processDelete removes the document for the given path from the index.
func processDelete(index bleve.Index, repo *git.Repository, path string) {
	log.Printf("delete: %s", path)
	rp := relativePath(path)
	err := index.Delete(rp)
	if err != nil {
		log.Print(err)
	}
}

// indexRecipeLink indexes the JSON file at jsonFilePath, using the file name
// minus its extension as the document identifier.
func indexRecipeLink(i bleve.Index, jsonFilePath string) error {
	jsonBytes, err := ioutil.ReadFile(jsonFilePath)
	if err != nil {
		return err
	}
	fileName := filepath.Base(jsonFilePath)
	ext := filepath.Ext(fileName)
	docId := fileName[:len(fileName)-len(ext)]
	return i.Index(docId, jsonBytes)
}

// RegisterIndexName adds the index to the global name-to-index mapping,
// creating the map on first use, and records the index stats under the
// same name.
func RegisterIndexName(name string, idx bleve.Index) {
	indexNameMappingLock.Lock()
	defer indexNameMappingLock.Unlock()

	if indexNameMapping == nil {
		indexNameMapping = make(map[string]bleve.Index)
	}
	indexNameMapping[name] = idx
	indexStats[name] = idx.Stats()
}

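// A plausible read-side counterpart to RegisterIndexName, shown here only as
// a sketch, not part of the original registry: it assumes
// indexNameMappingLock is a sync.RWMutex and that returning nil for unknown
// names is acceptable.
func IndexByName(name string) bleve.Index {
	indexNameMappingLock.RLock()
	defer indexNameMappingLock.RUnlock()
	return indexNameMapping[name]
}
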
// listPoints returns the locations of offers satisfying the specified
// full-text query. If the query is empty, it returns all locations. If not
// nil, spatial is exploited as a cache to fetch indexed offers and their
// locations, which avoids store lookups.
func listPoints(store *Store, index bleve.Index, spatial *SpatialIndex, query string) ([]Point, error) {
	var ids []string
	if query == "" {
		if spatial != nil {
			ids = spatial.List()
		} else {
			list, err := store.List()
			if err != nil {
				return nil, err
			}
			ids = list
		}
	} else {
		q, err := makeSearchQuery(query, nil)
		if err != nil {
			return nil, err
		}
		rq := bleve.NewSearchRequest(q)
		rq.Size = 20000
		res, err := index.Search(rq)
		if err != nil {
			return nil, err
		}
		for _, doc := range res.Hits {
			ids = append(ids, doc.ID)
		}
	}
	points := make([]Point, 0, len(ids))
	for _, id := range ids {
		var p *Point
		if spatial != nil {
			offer := spatial.Get(id)
			if offer != nil {
				p = &offer.Point
			}
		}
		if p == nil {
			loc, _, err := store.GetLocation(id)
			if err != nil {
				return nil, err
			}
			if loc == nil {
				continue
			}
			p = &Point{
				Lat: loc.Lat,
				Lon: loc.Lon,
			}
		}
		points = append(points, *p)
	}
	return points, nil
}

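// Hedged call sketch for listPoints: store, index, and spatial are assumed to
// be initialized elsewhere, and the query term is invented.
func examplePoints(store *Store, index bleve.Index, spatial *SpatialIndex) ([]Point, error) {
	// with a non-nil spatial index, locations are served from memory;
	// with nil, every id falls back to store.GetLocation
	return listPoints(store, index, spatial, "bakery")
}
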
// processUpdate reindexes the wiki page at the given path, refreshing its
// git metadata first.
func processUpdate(index bleve.Index, repo *git.Repository, path string) {
	log.Printf("updated: %s", path)
	rp := relativePath(path)
	wiki, err := NewWikiFromFile(path)
	if err != nil {
		log.Print(err)
		return
	}
	doGitStuff(repo, rp, wiki)
	if err := index.Index(rp, wiki); err != nil {
		log.Print(err)
	}
}

// queryWorker runs the given search request repeat times, accumulating the
// request count and total time taken in the global counters.
func queryWorker(index bleve.Index, sr *bleve.SearchRequest, repeat int) {
	for termQueryCount := 0; termQueryCount < repeat; termQueryCount++ {
		termQueryStart := time.Now()
		_, err := index.Search(sr)
		if err != nil {
			log.Fatal(err)
		}
		atomic.AddUint64(&totalRequests, 1)
		atomic.AddUint64(&totalTimeTaken, uint64(time.Since(termQueryStart)))
	}
}

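// Minimal sketch of exercising queryWorker, not part of the original code;
// the query string and repeat count are invented for illustration.
func runQueryBenchmark(index bleve.Index) {
	sr := bleve.NewSearchRequest(bleve.NewQueryStringQuery("water"))
	queryWorker(index, sr, 100)
	log.Printf("issued %d search requests", atomic.LoadUint64(&totalRequests))
}
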
// indexBeer walks the JSON documents in *jsonDir and indexes them in batches
// of *batchSize, logging progress every 1000 documents.
func indexBeer(i bleve.Index) error {
	// open the directory
	dirEntries, err := ioutil.ReadDir(*jsonDir)
	if err != nil {
		return err
	}

	// walk the directory entries for indexing
	log.Printf("Indexing...")
	count := 0
	startTime := time.Now()
	batch := bleve.NewBatch()
	batchCount := 0
	for _, dirEntry := range dirEntries {
		filename := dirEntry.Name()
		// read the bytes
		jsonBytes, err := ioutil.ReadFile(*jsonDir + "/" + filename)
		if err != nil {
			return err
		}
		// shred them into a document
		ext := filepath.Ext(filename)
		docId := filename[:len(filename)-len(ext)]
		batch.Index(docId, jsonBytes)
		batchCount++

		if batchCount >= *batchSize {
			err = i.Batch(batch)
			if err != nil {
				return err
			}
			batch = bleve.NewBatch()
			batchCount = 0
		}
		count++
		if count%1000 == 0 {
			indexDuration := time.Since(startTime)
			indexDurationSeconds := float64(indexDuration) / float64(time.Second)
			timePerDoc := float64(indexDuration) / float64(count)
			log.Printf("Indexed %d documents, in %.2fs (average %.2fms/doc)", count, indexDurationSeconds, timePerDoc/float64(time.Millisecond))
		}
	}
	// flush the final partial batch, which the loop above leaves unindexed
	if batchCount > 0 {
		err = i.Batch(batch)
		if err != nil {
			return err
		}
	}
	indexDuration := time.Since(startTime)
	indexDurationSeconds := float64(indexDuration) / float64(time.Second)
	timePerDoc := float64(indexDuration) / float64(count)
	log.Printf("Indexed %d documents, in %.2fs (average %.2fms/doc)", count, indexDurationSeconds, timePerDoc/float64(time.Millisecond))
	return nil
}

// Index adds a single document to the index, creating the index on first use
// or opening it if it already exists, and returns the elapsed milliseconds.
func (be *BleveEngine) Index(document *Document) (int64, error) {
	start := time.Now().UnixNano() / int64(time.Millisecond)

	mapping := bleve.NewIndexMapping()
	index, err := bleve.New(INDEX, mapping)
	if err != nil {
		// the index already exists; open it instead
		index, err = bleve.Open(INDEX)
		if err != nil {
			return 0, err
		}
	}
	defer index.Close()

	if err := index.Index(document.Id, document.Data); err != nil {
		return 0, err
	}
	return time.Now().UnixNano()/int64(time.Millisecond) - start, nil
}

// indexSite walks the accession JSON files under htdocs/repositories and
// indexes them, doubling the batch size after each flush until it reaches
// maxBatchSize.
func indexSite(index bleve.Index, maxBatchSize int) error {
	startT := time.Now()
	count := 0
	batch := index.NewBatch()
	batchSize := 10
	log.Printf("Walking %s", path.Join(htdocs, "repositories"))
	err := filepath.Walk(path.Join(htdocs, "repositories"), func(p string, f os.FileInfo, err error) error {
		if strings.Contains(p, "/accessions/") && strings.HasSuffix(p, ".json") {
			src, err := ioutil.ReadFile(p)
			if err != nil {
				log.Printf("Can't read %s, %s", p, err)
				return nil
			}
			view := new(cait.NormalizedAccessionView)
			err = json.Unmarshal(src, &view)
			if err != nil {
				log.Printf("Can't parse %s, %s", p, err)
				return nil
			}
			// Trim the htdocs prefix and the trailing .json extension
			err = batch.Index(strings.TrimSuffix(strings.TrimPrefix(p, htdocs), ".json"), view)
			if err != nil {
				log.Printf("Indexing error %s, %s", p, err)
				return nil
			}
			if batch.Size() >= batchSize {
				log.Printf("Indexing %d items", batch.Size())
				err := index.Batch(batch)
				if err != nil {
					log.Fatal(err)
				}
				count += batch.Size()
				batch = index.NewBatch()
				log.Printf("Indexed: %d items, batch size %d, running %s\n", count, batchSize, time.Since(startT))
				// grow the batch size toward maxBatchSize
				if batchSize < maxBatchSize {
					batchSize = batchSize * 2
				}
				if batchSize > maxBatchSize {
					batchSize = maxBatchSize
				}
			}
		}
		return nil
	})
	// flush any remaining queued documents
	if batch.Size() > 0 {
		log.Printf("Indexing %d items", batch.Size())
		err := index.Batch(batch)
		if err != nil {
			log.Fatal(err)
		}
		count += batch.Size()
		log.Printf("Indexed: %d items, batch size %d, running %s\n", count, batchSize, time.Since(startT))
	}
	log.Printf("Total indexed: %d items, total run time %s\n", count, time.Since(startT))
	return err
}

// batchIndexingWorker consumes Work items from workChan until it is closed,
// applying either a prepared batch or a single document and updating the
// global indexing counters.
func batchIndexingWorker(index bleve.Index, workChan chan *Work, timeStart time.Time) {
	for work := range workChan {
		workSize := 1
		if work.batch != nil {
			if err := index.Batch(work.batch); err != nil {
				log.Fatalf("indexer worker fatal: %v", err)
			}
			workSize = work.batch.Size()
		} else {
			if err := index.Index(work.id, work.doc); err != nil {
				log.Fatalf("indexer worker fatal: %v", err)
			}
		}
		atomic.AddUint64(&totalIndexed, uint64(workSize))
		atomic.AddUint64(&totalPlainTextIndexed, work.plainTextBytes)
	}
}

// Query runs a bleve query against the namespace's index and returns the
// total hit count along with the matching document URIs.
func (indexes *BleveIndexes) Query(namespace string, query interface{}, offset int64, size int64) (int64, []string, error) {
	var total int64
	var uris []string
	indexes.RLock()
	defer indexes.RUnlock()
	node := indexes.nodes.Node(namespace)
	if node == nil {
		return total, uris, fmt.Errorf("no node for namespace: %s", namespace)
	}
	// lazily bind the bleve index to the node
	var idx bleve.Index
	if node.GetBind() == nil {
		index, err := indexes.index(namespace, false)
		if err != nil {
			return total, uris, err
		}
		if index == nil {
			return total, uris, nil
		}
		node.SetBind(index)
		idx = index
	} else {
		idx = node.GetBind().(bleve.Index)
	}
	q, ok := query.(bleve.Query)
	if !ok {
		return total, uris, fmt.Errorf("failed to convert query to bleve.Query")
	}
	// honor the caller's paging rather than fixed defaults
	request := bleve.NewSearchRequestOptions(q, int(size), int(offset), false)
	response, err := idx.Search(request)
	if err != nil {
		return total, uris, err
	}
	total = int64(response.Total)
	for _, doc := range response.Hits {
		uris = append(uris, doc.ID)
	}
	return total, uris, nil
}

// dumpDictionary prints every term and its count from the field's term
// dictionary.
func dumpDictionary(index bleve.Index, field string) {
	i, _, err := index.Advanced()
	if err != nil {
		log.Fatal(err)
	}
	r, err := i.Reader()
	if err != nil {
		log.Fatal(err)
	}
	defer r.Close()
	d, err := r.FieldDict(field)
	if err != nil {
		log.Fatal(err)
	}
	defer d.Close()
	de, err := d.Next()
	for err == nil && de != nil {
		fmt.Printf("%s - %d\n", de.Term, de.Count)
		de, err = d.Next()
	}
	if err != nil {
		log.Fatal(err)
	}
}

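// Sketch of calling dumpDictionary on an existing index, not part of the
// original code; the index path and field name are placeholders.
func exampleDumpDictionary() {
	index, err := bleve.Open("example.bleve") // placeholder path
	if err != nil {
		log.Fatal(err)
	}
	defer index.Close()
	dumpDictionary(index, "body") // placeholder field name
}
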
func main() {
	flag.Parse()
	if *indexPath == "" {
		log.Fatal("must specify index path")
	}

	// create a new default mapping
	mapping := bleve.NewIndexMapping()
	if *mappingFile != "" {
		mappingBytes, err := ioutil.ReadFile(*mappingFile)
		if err != nil {
			log.Fatal(err)
		}
		err = json.Unmarshal(mappingBytes, &mapping)
		if err != nil {
			log.Fatal(err)
		}
	}

	// create the index
	var index bleve.Index
	var err error
	if *storeType != "" {
		index, err = bleve.NewUsing(*indexPath, mapping, *storeType, nil)
	} else {
		index, err = bleve.New(*indexPath, mapping)
	}
	if err != nil {
		log.Fatal(err)
	}
	defer func() {
		cerr := index.Close()
		if cerr != nil {
			log.Fatalf("error closing index: %v", cerr)
		}
	}()

	log.Printf("Created bleve index at: %s", *indexPath)
}

// BatchIndex adds all documents to the index in a single batch, creating the
// index on first use or opening it if it already exists, and returns the
// elapsed milliseconds.
func (be *BleveEngine) BatchIndex(documents []*Document) (int64, error) {
	start := time.Now().UnixNano() / int64(time.Millisecond)

	mapping := bleve.NewIndexMapping()
	index, err := bleve.New(INDEX, mapping)
	if err != nil {
		// the index already exists; open it instead
		index, err = bleve.Open(INDEX)
		if err != nil {
			return 0, err
		}
	}
	defer index.Close()

	batch := index.NewBatch()
	for _, document := range documents {
		if err := batch.Index(document.Id, document.Data); err != nil {
			return 0, err
		}
	}
	if err := index.Batch(batch); err != nil {
		return 0, err
	}
	return time.Now().UnixNano()/int64(time.Millisecond) - start, nil
}

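// Hedged usage sketch for the engine methods above, not part of the original
// code: it assumes BleveEngine needs no configuration and that Document.Data
// accepts any value bleve can map; the sample values are invented.
func exampleEngineUsage() error {
	be := &BleveEngine{}
	doc := &Document{Id: "doc1", Data: map[string]interface{}{"body": "hello bleve"}}
	ms, err := be.Index(doc)
	if err != nil {
		return err
	}
	log.Printf("indexed one document in %dms", ms)
	return nil
}
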
// query runs the term query against the index, optionally restricted to the
// given feeds, and returns the matching user articles with any highlight
// fragments attached.
func query(term, highlight string, index bleve.Index, u content.User, feedIds []data.FeedId, paging ...int) (ua []content.UserArticle, err error) {
	var query bleve.Query
	query = bleve.NewQueryStringQuery(term)

	if len(feedIds) > 0 {
		// restrict the query to the given feeds: the string query AND
		// a disjunction of per-feed term queries
		queries := make([]bleve.Query, len(feedIds))
		conjunct := make([]bleve.Query, 2)
		for i, id := range feedIds {
			q := bleve.NewTermQuery(strconv.FormatInt(int64(id), 10))
			q.SetField("FeedId")
			queries[i] = q
		}
		disjunct := bleve.NewDisjunctionQuery(queries)
		conjunct[0] = query
		conjunct[1] = disjunct
		query = bleve.NewConjunctionQuery(conjunct)
	}

	searchRequest := bleve.NewSearchRequest(query)
	if highlight != "" {
		searchRequest.Highlight = bleve.NewHighlightWithStyle(highlight)
	}

	limit, offset := pagingLimit(paging)
	searchRequest.Size = limit
	searchRequest.From = offset

	searchResult, err := index.Search(searchRequest)
	if err != nil {
		return
	}
	if len(searchResult.Hits) == 0 {
		return
	}

	articleIds := []data.ArticleId{}
	hitMap := map[data.ArticleId]*search.DocumentMatch{}
	for _, hit := range searchResult.Hits {
		if articleId, err := strconv.ParseInt(hit.ID, 10, 64); err == nil {
			id := data.ArticleId(articleId)
			articleIds = append(articleIds, id)
			hitMap[id] = hit
		}
	}

	ua = u.ArticlesById(articleIds)
	if u.HasErr() {
		return ua, u.Err()
	}

	// attach highlight fragments to the matching articles
	for i := range ua {
		d := ua[i].Data()
		if hit := hitMap[d.Id]; len(hit.Fragments) > 0 {
			d.Hit.Fragments = hit.Fragments
			ua[i].Data(d)
		}
	}

	return
}

// readingWorker reads wikipedia articles from *source and feeds them to the
// indexing workers over the work channel, either as batches of *batchSize or
// as single documents, then closes the channel.
func readingWorker(index bleve.Index, work chan *Work) {
	wikiReader, err := blevebench.NewWikiReader(*source)
	if err != nil {
		log.Fatal(err)
	}
	defer wikiReader.Close()

	i := 0
	if *batchSize > 1 {
		batch := index.NewBatch()
		bytesInBatch := uint64(0)
		a, err := wikiReader.Next()
		for a != nil && err == nil && i < *count {
			err = batch.Index(strconv.Itoa(i), a)
			i++
			if err != nil {
				break
			}
			bytesInBatch += uint64(len(a.Title))
			bytesInBatch += uint64(len(a.Text))
			if batch.Size() >= *batchSize {
				work <- &Work{
					batch:          batch,
					plainTextBytes: bytesInBatch,
				}
				batch = index.NewBatch()
				bytesInBatch = 0
			}
			a, err = wikiReader.Next()
		}
		if err != nil {
			log.Fatalf("reading worker fatal: %v", err)
		}
		// close last batch
		if batch.Size() > 0 {
			work <- &Work{
				batch:          batch,
				plainTextBytes: bytesInBatch,
			}
		}
	} else {
		a, err := wikiReader.Next()
		for a != nil && err == nil && i <= *count {
			i++
			work <- &Work{
				doc:            a,
				id:             strconv.Itoa(i),
				plainTextBytes: uint64(len(a.Title) + len(a.Text)),
			}
			a, err = wikiReader.Next()
		}
		if err != nil {
			log.Fatalf("reading worker fatal: %v", err)
		}
	}
	close(work)

	// dump mem stats if requested
	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.WriteHeapProfile(f)
		f.Close()
	}
}

// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
	"fmt"
	"os"

	"github.com/blevesearch/bleve"
	"github.com/spf13/cobra"
)

var cfgFile string

var idx bleve.Index

// RootCmd represents the base command when called without any subcommands
var RootCmd = &cobra.Command{
	Use:   "bleve",
	Short: "command-line tool to interact with a bleve index",
	Long:  `Bleve is a command-line tool to interact with a bleve index.`,
	PersistentPreRunE: func(cmd *cobra.Command, args []string) error {
		if len(args) < 1 {
			return fmt.Errorf("must specify path to index")
		}
		var err error
		idx, err = bleve.Open(args[0])
		if err != nil {
			return fmt.Errorf("error opening bleve index: %v", err)
		}
		return nil
	},
}

// Index is used to add the event to the bleve index.
func (e *Event) Index(index bleve.Index) error {
	return index.Index(string(e.ID), e)
}

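// Hedged example of indexing an event and confirming it is searchable, not
// part of the original code; it assumes a bleve version that provides
// NewDocIDQuery.
func exampleIndexEvent(index bleve.Index, e *Event) error {
	if err := e.Index(index); err != nil {
		return err
	}
	// look the document back up by its identifier
	q := bleve.NewDocIDQuery([]string{string(e.ID)})
	res, err := index.Search(bleve.NewSearchRequest(q))
	if err != nil {
		return err
	}
	log.Printf("event indexed, %d hit(s)", res.Total)
	return nil
}
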
// indexTPB reads the gzipped, pipe-delimited dump file and indexes each
// torrent record in batches of *batchSize, up to *indexLimit documents.
func indexTPB(i bleve.Index) error {
	batch := bleve.NewBatch()
	batchCount := 0

	gzDumpFile, err := os.Open(*dump)
	if err != nil {
		return err
	}
	defer gzDumpFile.Close()

	dumpFile, err := gzip.NewReader(gzDumpFile)
	if err != nil {
		return err
	}

	reader := csv.NewReader(dumpFile)
	reader.FieldsPerRecord = 7
	reader.Comma = '|'

	count := 0
	startTime := time.Now()
	log.Printf("Indexing...")
	for {
		r, err := reader.Read()
		if err == io.EOF {
			break
		} else if err != nil {
			continue
		}
		size, err := strconv.ParseInt(r[1], 10, 0)
		if err != nil {
			log.Printf("could not parse size %q: %v", r[1], err)
			size = 0
		}
		batch.Index(r[2], tpbDoc{
			Name:     r[0],
			Size:     size,
			Hash:     r[2],
			Category: r[4],
			Type:     "torrent",
		})
		batchCount++

		if batchCount >= *batchSize {
			err = i.Batch(batch)
			if err != nil {
				return err
			}
			batch = bleve.NewBatch()
			batchCount = 0
		}
		count++
		if count%1000 == 0 {
			indexDuration := time.Since(startTime)
			indexDurationSeconds := float64(indexDuration) / float64(time.Second)
			timePerDoc := float64(indexDuration) / float64(count)
			log.Printf("Indexed %d documents in %.2fs (average %.2fms/doc)", count, indexDurationSeconds, timePerDoc/float64(time.Millisecond))
		}
		if *indexLimit > 0 && count >= *indexLimit {
			break
		}
	}
	// flush the last batch
	if batchCount > 0 {
		err := i.Batch(batch)
		if err != nil {
			log.Fatal(err)
		}
	}
	indexDuration := time.Since(startTime)
	indexDurationSeconds := float64(indexDuration) / float64(time.Second)
	timePerDoc := float64(indexDuration) / float64(count)
	log.Printf("Finished indexing %d documents in %.2fs (average %.2fms/doc)", count, indexDurationSeconds, timePerDoc/float64(time.Millisecond))
	log.Printf("Still listening on http://%v", bind)
	return nil
}