Beispiel #1
1
// findOffersFromText returns the id and date of every offer matching the
// full-text query, optionally restricted to the given ids. An empty query
// yields no results (nil slice, nil error).
func findOffersFromText(index bleve.Index, query string, ids []string) (
	[]datedOffer, error) {

	if query == "" {
		return nil, nil
	}
	q, err := makeSearchQuery(query, ids)
	if err != nil {
		return nil, err
	}
	req := bleve.NewSearchRequest(q)
	req.Size = 20000
	// Only the stored "date" field is needed from each hit.
	req.Fields = []string{"date"}
	result, err := index.Search(req)
	if err != nil {
		return nil, err
	}
	offers := []datedOffer{}
	for _, hit := range result.Hits {
		date, ok := hit.Fields["date"].(string)
		if !ok {
			return nil, fmt.Errorf("could not retrieve date for %s", hit.ID)
		}
		offers = append(offers, datedOffer{
			Date: date,
			Id:   hit.ID,
		})
	}
	return offers, nil
}
Beispiel #2
0
// batchIndexingWorker consumes Work items from workChan until the channel is
// closed, applying either a prepared batch or a single document to the index.
// It maintains the global indexed counters and periodically logs progress as
// "total,elapsed-ms". Indexing failures are fatal.
func batchIndexingWorker(index bleve.Index, workChan chan *Work, start time.Time) {
	// A for-range over the channel replaces the redundant single-case
	// select loop; it exits when workChan is closed.
	for work := range workChan {
		workSize := 1
		if work.batch != nil {
			if err := index.Batch(work.batch); err != nil {
				log.Fatalf("indexer worker fatal: %v", err)
			}
			workSize = work.batch.Size()
		} else {
			if err := index.Index(work.id, work.doc); err != nil {
				log.Fatalf("indexer worker fatal: %v", err)
			}
		}
		updatedTotal := atomic.AddUint64(&totalIndexed, uint64(workSize))
		atomic.AddUint64(&totalPlainTextIndexed, work.plainTextBytes)
		if updatedTotal%uint64(*printCount) == 0 {
			// Elapsed time is only needed when we actually log.
			elapsedTime := time.Since(start) / time.Millisecond
			log.Printf("%d,%d", updatedTotal, elapsedTime)
		}
	}
}
Beispiel #3
0
// listIndexIds returns the external document ID of every document currently
// stored in the index, by walking the low-level doc-ID reader.
func listIndexIds(index bleve.Index) ([]string, error) {
	internal, _, err := index.Advanced()
	if err != nil {
		return nil, err
	}
	reader, err := internal.Reader()
	if err != nil {
		return nil, err
	}
	defer reader.Close()
	docIds, err := reader.DocIDReaderAll()
	if err != nil {
		return nil, err
	}
	defer docIds.Close()
	ids := []string{}
	for {
		internalId, err := docIds.Next()
		if err != nil {
			return nil, err
		}
		// A nil internal ID marks the end of iteration.
		if internalId == nil {
			break
		}
		externalId, err := reader.ExternalID(internalId)
		if err != nil {
			return nil, err
		}
		ids = append(ids, externalId)
	}
	return ids, nil
}
Beispiel #4
0
// indexWorker reads wiki articles from *source and indexes them one at a
// time under sequential numeric IDs, up to *count articles. If *memprofile
// is non-empty, a heap profile is written once indexing finishes. Any
// failure is fatal.
func indexWorker(index bleve.Index) {
	wikiReader, err := blevebench.NewWikiReader(*source)
	if err != nil {
		log.Fatal(err)
	}
	defer wikiReader.Close()
	i := 0
	a, err := wikiReader.Next()
	for a != nil && err == nil && i <= *count {
		i++
		// Fix: the error from index.Index was previously discarded.
		if ierr := index.Index(strconv.Itoa(i), a); ierr != nil {
			log.Fatalf("indexing worker fatal: %v", ierr)
		}
		a, err = wikiReader.Next()
	}
	if err != nil {
		log.Fatalf("reading worker fatal: %v", err)
	}

	// dump mem stats if requested
	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal(err)
		}
		// Fix: check the profile write and close the file (it was leaked).
		if err := pprof.WriteHeapProfile(f); err != nil {
			log.Fatal(err)
		}
		if err := f.Close(); err != nil {
			log.Fatal(err)
		}
	}
}
// processDelete removes the document for path (by its repo-relative path)
// from the index, logging but not propagating any failure.
func processDelete(index bleve.Index, repo *git.Repository, path string) {
	log.Printf("delete: %s", path)
	if err := index.Delete(relativePath(path)); err != nil {
		log.Print(err)
	}
}
Beispiel #6
0
// indexRecipeLink indexes the raw contents of a JSON file, using the file's
// base name without its extension as the document ID.
func indexRecipeLink(i bleve.Index, jsonFilePath string) error {
	data, err := ioutil.ReadFile(jsonFilePath)
	if err != nil {
		return err
	}
	base := filepath.Base(jsonFilePath)
	// Strip the extension to obtain the document ID.
	id := base[:len(base)-len(filepath.Ext(base))]
	return i.Index(id, data)
}
Beispiel #7
0
// RegisterIndexName records idx under name in the global name mapping and
// snapshots its stats, lazily creating the mapping on first use. The global
// lock is held for the duration of the update.
func RegisterIndexName(name string, idx bleve.Index) {
	indexNameMappingLock.Lock()
	defer indexNameMappingLock.Unlock()
	// Lazy initialization: the map may not exist yet on first registration.
	if indexNameMapping == nil {
		indexNameMapping = make(map[string]bleve.Index)
	}
	indexNameMapping[name] = idx
	indexStats[name] = idx.Stats()
}
Beispiel #8
0
// listPoints returns the location of offers satisfying specified full-text
// query. If query is empty, it returns all locations. If not nil, spatial is
// exploited as a cache to fetch indexed offers and their locations, which
// avoid store lookups.
func listPoints(store *Store, index bleve.Index, spatial *SpatialIndex,
	query string) ([]Point, error) {

	var ids []string
	if query == "" {
		if spatial != nil {
			ids = spatial.List()
		} else {
			list, err := store.List()
			if err != nil {
				return nil, err
			}
			ids = list
		}
	} else {
		q, err := makeSearchQuery(query, nil)
		if err != nil {
			return nil, err
		}
		rq := bleve.NewSearchRequest(q)
		rq.Size = 20000
		res, err := index.Search(rq)
		if err != nil {
			return nil, err
		}
		for _, doc := range res.Hits {
			ids = append(ids, doc.ID)
		}
	}
	points := make([]Point, 0, len(ids))
	for _, id := range ids {
		var p *Point
		if spatial != nil {
			offer := spatial.Get(id)
			if offer != nil {
				p = &offer.Point
			}
		}
		if p == nil {
			loc, _, err := store.GetLocation(id)
			if err != nil {
				return nil, err
			}
			if loc == nil {
				continue
			}
			p = &Point{
				Lat: loc.Lat,
				Lon: loc.Lon,
			}
		}
		points = append(points, *p)
	}
	return points, nil
}
// processUpdate re-indexes the wiki page at path under its repo-relative
// path, after refreshing its git metadata. Failures are logged, not fatal.
func processUpdate(index bleve.Index, repo *git.Repository, path string) {
	log.Printf("updated: %s", path)
	rp := relativePath(path)
	wiki, err := NewWikiFromFile(path)
	if err != nil {
		log.Print(err)
		return
	}
	doGitStuff(repo, rp, wiki)
	// Fix: the indexing error was previously discarded.
	if err := index.Index(rp, wiki); err != nil {
		log.Print(err)
	}
}
Beispiel #10
0
// queryWorker runs sr against the index repeat times, folding each request
// into the global request and latency counters. A search failure is fatal.
func queryWorker(index bleve.Index, sr *bleve.SearchRequest, repeat int) {
	for n := 0; n < repeat; n++ {
		begin := time.Now()
		if _, err := index.Search(sr); err != nil {
			log.Fatal(err)
		}
		atomic.AddUint64(&totalRequests, 1)
		atomic.AddUint64(&totalTimeTaken, uint64(time.Since(begin)))
	}
}
Beispiel #11
0
// indexBeer indexes every JSON file in *jsonDir, using the file name minus
// its extension as the document ID. Documents are committed in batches of
// *batchSize; progress is logged every 1000 documents and once at the end.
func indexBeer(i bleve.Index) error {

	// open the directory
	dirEntries, err := ioutil.ReadDir(*jsonDir)
	if err != nil {
		return err
	}

	// walk the directory entries for indexing
	log.Printf("Indexing...")
	count := 0
	startTime := time.Now()
	batch := bleve.NewBatch()
	batchCount := 0
	// logProgress reports documents indexed and average time per document.
	logProgress := func() {
		indexDuration := time.Since(startTime)
		indexDurationSeconds := float64(indexDuration) / float64(time.Second)
		timePerDoc := float64(indexDuration) / float64(count)
		log.Printf("Indexed %d documents, in %.2fs (average %.2fms/doc)", count, indexDurationSeconds, timePerDoc/float64(time.Millisecond))
	}
	for _, dirEntry := range dirEntries {
		filename := dirEntry.Name()
		// read the bytes; filepath.Join is OS-safe, unlike "/" concatenation
		jsonBytes, err := ioutil.ReadFile(filepath.Join(*jsonDir, filename))
		if err != nil {
			return err
		}
		// shred them into a document
		ext := filepath.Ext(filename)
		docId := filename[:len(filename)-len(ext)]
		// Fix: the batch.Index error was previously discarded.
		if err := batch.Index(docId, jsonBytes); err != nil {
			return err
		}
		batchCount++

		if batchCount >= *batchSize {
			if err := i.Batch(batch); err != nil {
				return err
			}
			batch = bleve.NewBatch()
			batchCount = 0
		}
		count++
		if count%1000 == 0 {
			logProgress()
		}
	}
	// Fix: flush the final partial batch, which was previously dropped,
	// silently losing up to *batchSize-1 documents.
	if batchCount > 0 {
		if err := i.Batch(batch); err != nil {
			return err
		}
	}
	logProgress()
	return nil
}
Beispiel #12
0
// Index adds document to the engine's bleve index (creating or opening the
// index at INDEX as needed) and returns the elapsed time in milliseconds.
func (be *BleveEngine) Index(document *Document) (int64, error) {
	start := time.Now().UnixNano() / int64(time.Millisecond)

	mapping := bleve.NewIndexMapping()
	index, err := bleve.New(INDEX, mapping)
	if err != nil {
		// The index likely exists already; open it instead. Fix: propagate
		// the open error rather than continuing with a nil index (which
		// previously caused a nil-pointer dereference).
		index, err = bleve.Open(INDEX)
		if err != nil {
			return 0, err
		}
	}
	defer index.Close()
	// Fix: the indexing error was previously discarded despite the
	// error return value of this method.
	if err := index.Index(document.Id, document.Data); err != nil {
		return 0, err
	}

	return time.Now().UnixNano()/int64(time.Millisecond) - start, nil
}
Beispiel #13
0
// indexSite walks htdocs/repositories and indexes every accession JSON view
// it finds. Writes are committed in batches whose size starts at 10 and
// doubles after each commit up to maxBatchSize. Per-file read/parse/index
// errors are logged and skipped; a failed batch commit is fatal. Returns
// the error from the directory walk, if any.
func indexSite(index bleve.Index, maxBatchSize int) error {
	startT := time.Now()
	count := 0
	batch := index.NewBatch()
	batchSize := 10
	log.Printf("Walking %s", path.Join(htdocs, "repositories"))
	err := filepath.Walk(path.Join(htdocs, "repositories"), func(p string, f os.FileInfo, err error) error {
		// Only accession JSON files are indexed; everything else is skipped.
		if strings.Contains(p, "/accessions/") == true && strings.HasSuffix(p, ".json") == true {
			src, err := ioutil.ReadFile(p)
			if err != nil {
				log.Printf("Can't read %s, %s", p, err)
				return nil
			}
			view := new(cait.NormalizedAccessionView)
			err = json.Unmarshal(src, &view)
			if err != nil {
				log.Printf("Can't parse %s, %s", p, err)
				return nil
			}
			// Trim the htdocs and trailing .json extension
			//log.Printf("Queued %s", p)
			// NOTE(review): TrimSuffix strips "json" but not the dot, so
			// document IDs end with a trailing "." — confirm whether readers
			// of this index expect that exact form before changing it.
			err = batch.Index(strings.TrimSuffix(strings.TrimPrefix(p, htdocs), "json"), view)
			if err != nil {
				log.Printf("Indexing error %s, %s", p, err)
				return nil
			}
			// Commit once the batch is full, then grow the batch size
			// (doubling, clamped to maxBatchSize) to speed up later commits.
			if batch.Size() >= batchSize {
				log.Printf("Indexing %d items", batch.Size())
				err := index.Batch(batch)
				if err != nil {
					log.Fatal(err)
				}
				count += batch.Size()
				batch = index.NewBatch()
				log.Printf("Indexed: %d items, batch size %d, running %s\n", count, batchSize, time.Now().Sub(startT))
				if batchSize < maxBatchSize {
					batchSize = batchSize * 2
				}
				if batchSize > maxBatchSize {
					batchSize = maxBatchSize
				}
			}
		}
		return nil
	})
	// Flush whatever remains in the final partial batch.
	if batch.Size() > 0 {
		log.Printf("Indexing %d items", batch.Size())
		err := index.Batch(batch)
		if err != nil {
			log.Fatal(err)
		}
		count += batch.Size()
		log.Printf("Indexed: %d items, batch size %d, running %s\n", count, batchSize, time.Now().Sub(startT))
	}
	log.Printf("Total indexed: %d times, total run time %s\n", count, time.Now().Sub(startT))
	return err
}
Beispiel #14
0
// batchIndexingWorker drains workChan until it is closed, applying each
// item — a prepared batch or a single document — to the index and updating
// the global indexed counters. Indexing failures are fatal.
func batchIndexingWorker(index bleve.Index, workChan chan *Work, timeStart time.Time) {
	for w := range workChan {
		size := 1
		if w.batch == nil {
			if err := index.Index(w.id, w.doc); err != nil {
				log.Fatalf("indexer worker fatal: %v", err)
			}
		} else {
			if err := index.Batch(w.batch); err != nil {
				log.Fatalf("indexer worker fatal: %v", err)
			}
			size = w.batch.Size()
		}
		atomic.AddUint64(&totalIndexed, uint64(size))
		atomic.AddUint64(&totalPlainTextIndexed, w.plainTextBytes)
	}
}
Beispiel #15
0
// Query runs query against the namespace's index and returns the total hit
// count plus the URIs of the hits in the requested page. offset and size
// select the page of results. query must be a bleve.Query.
func (indexes *BleveIndexes) Query(namespace string, query interface{}, offset int64, size int64) (int64, []string, error) {
	var total int64 = 0
	var uris []string

	indexes.RLock()
	defer indexes.RUnlock()

	node := indexes.nodes.Node(namespace)
	if node == nil {
		return total, uris, fmt.Errorf("none node for namespace: %s", namespace)
	}

	// Resolve the bleve index for this namespace, caching it on the node.
	var idx bleve.Index
	if node.GetBind() == nil {
		index, err := indexes.index(namespace, false)
		if err != nil {
			return total, uris, err
		}
		if index == nil {
			return total, uris, nil
		}

		node.SetBind(index)
		idx = index
	} else {
		idx = node.GetBind().(bleve.Index)
	}

	q, ok := query.(bleve.Query)
	if !ok {
		return total, uris, fmt.Errorf("query type convert failed")
	}
	// Fix: the offset and size parameters were previously ignored in favor
	// of the package constants IndexSize/IndexOffset.
	// NewSearchRequestOptions takes (query, size, from, explain).
	request := bleve.NewSearchRequestOptions(q, int(size), int(offset), false)
	response, err := idx.Search(request)
	if err != nil {
		return total, uris, err
	}
	total = int64(response.Total)
	for _, doc := range response.Hits {
		uris = append(uris, doc.ID)
	}
	return total, uris, nil
}
Beispiel #16
0
// dumpDictionary prints every term and its count from the field's
// dictionary, one "term - count" line per entry. Any failure is fatal.
func dumpDictionary(index bleve.Index, field string) {
	i, _, err := index.Advanced()
	if err != nil {
		log.Fatal(err)
	}
	r, err := i.Reader()
	if err != nil {
		log.Fatal(err)
	}
	// Fix: the index reader was never closed.
	defer r.Close()
	d, err := r.FieldDict(field)
	if err != nil {
		log.Fatal(err)
	}
	// Fix: the field dictionary was never closed.
	defer d.Close()

	de, err := d.Next()
	for err == nil && de != nil {
		fmt.Printf("%s - %d\n", de.Term, de.Count)
		de, err = d.Next()
	}
	// Fix: an error terminating the iteration was silently dropped.
	if err != nil {
		log.Fatal(err)
	}
}
Beispiel #17
0
// main creates a new bleve index at *indexPath, optionally using a JSON
// mapping file (*mappingFile) and a specific KV store type (*storeType).
func main() {

	flag.Parse()

	if *indexPath == "" {
		log.Fatal("must specify index path")
	}

	// create a new default mapping, overridden by the mapping file if given
	mapping := bleve.NewIndexMapping()
	if *mappingFile != "" {
		mappingBytes, err := ioutil.ReadFile(*mappingFile)
		if err != nil {
			log.Fatal(err)
		}
		err = json.Unmarshal(mappingBytes, &mapping)
		if err != nil {
			log.Fatal(err)
		}
	}

	// create the index
	var index bleve.Index
	var err error
	if *storeType != "" {
		index, err = bleve.NewUsing(*indexPath, mapping, *storeType, nil)
	} else {
		index, err = bleve.New(*indexPath, mapping)
	}
	if err != nil {
		log.Fatal(err)
	}
	defer func() {
		// Fix: this previously logged the stale creation error `err`
		// (always nil here) instead of the close error.
		if cerr := index.Close(); cerr != nil {
			log.Fatalf("error closing index: %v", cerr)
		}
	}()

	log.Printf("Created bleve index at: %s", *indexPath)
}
Beispiel #18
0
// BatchIndex adds all documents to the engine's bleve index in a single
// batch (creating or opening the index at INDEX as needed) and returns the
// elapsed time in milliseconds.
func (be *BleveEngine) BatchIndex(documents []*Document) (int64, error) {
	start := time.Now().UnixNano() / int64(time.Millisecond)

	mapping := bleve.NewIndexMapping()
	index, err := bleve.New(INDEX, mapping)
	if err != nil {
		// The index likely exists already; open it instead. Fix: propagate
		// the open error rather than continuing with a nil index (which
		// previously caused a nil-pointer dereference).
		index, err = bleve.Open(INDEX)
		if err != nil {
			return 0, err
		}
	}
	defer index.Close()

	batch := index.NewBatch()
	for _, document := range documents {
		// Fix: batch.Index errors were previously discarded.
		if err := batch.Index(document.Id, document.Data); err != nil {
			return 0, err
		}
	}

	// Fix: the batch commit error was previously discarded despite the
	// error return value of this method.
	if err := index.Batch(batch); err != nil {
		return 0, err
	}

	return time.Now().UnixNano()/int64(time.Millisecond) - start, nil
}
Beispiel #19
0
// query runs a full-text search for term against the index and resolves the
// hits to the user's articles. highlight, when non-empty, selects a bleve
// highlight style for the request. feedIds, when non-empty, restricts
// matches to those feeds. paging is interpreted by pagingLimit as
// (limit, offset) for the result window.
func query(term, highlight string, index bleve.Index, u content.User, feedIds []data.FeedId, paging ...int) (ua []content.UserArticle, err error) {
	var query bleve.Query

	query = bleve.NewQueryStringQuery(term)

	if len(feedIds) > 0 {
		// Restrict the search: (term query) AND (any of the feed ids).
		queries := make([]bleve.Query, len(feedIds))
		conjunct := make([]bleve.Query, 2)

		for i, id := range feedIds {
			q := bleve.NewTermQuery(strconv.FormatInt(int64(id), 10))
			q.SetField("FeedId")

			queries[i] = q
		}

		disjunct := bleve.NewDisjunctionQuery(queries)

		conjunct[0] = query
		conjunct[1] = disjunct

		query = bleve.NewConjunctionQuery(conjunct)
	}

	searchRequest := bleve.NewSearchRequest(query)

	if highlight != "" {
		searchRequest.Highlight = bleve.NewHighlightWithStyle(highlight)
	}

	limit, offset := pagingLimit(paging)
	searchRequest.Size = limit
	searchRequest.From = offset

	searchResult, err := index.Search(searchRequest)

	if err != nil {
		return
	}

	if len(searchResult.Hits) == 0 {
		return
	}

	// Map hit document IDs (stringified article ids) back to ArticleIds,
	// remembering each hit so its fragments can be attached below. Hits
	// whose ID does not parse as an int64 are silently skipped.
	articleIds := []data.ArticleId{}
	hitMap := map[data.ArticleId]*search.DocumentMatch{}

	for _, hit := range searchResult.Hits {
		if articleId, err := strconv.ParseInt(hit.ID, 10, 64); err == nil {
			id := data.ArticleId(articleId)
			articleIds = append(articleIds, id)
			hitMap[id] = hit
		}
	}

	ua = u.ArticlesById(articleIds)
	if u.HasErr() {
		return ua, u.Err()
	}

	// Attach the highlight fragments from each matching hit to the
	// corresponding article's data.
	for i := range ua {
		data := ua[i].Data()

		hit := hitMap[data.Id]

		if len(hit.Fragments) > 0 {
			data.Hit.Fragments = hit.Fragments
			ua[i].Data(data)
		}
	}
	return
}
Beispiel #20
0
// readingWorker reads wiki articles from *source and feeds them to indexing
// workers over the work channel — grouped into batches of *batchSize when
// that is greater than one, or one document per Work item otherwise. It
// closes the channel when done and, if *memprofile is set, writes a heap
// profile. Read or batch failures are fatal.
func readingWorker(index bleve.Index, work chan *Work) {
	wikiReader, err := blevebench.NewWikiReader(*source)
	if err != nil {
		log.Fatal(err)
	}
	defer wikiReader.Close()

	i := 0

	if *batchSize > 1 {
		batch := index.NewBatch()
		bytesInBatch := uint64(0)
		a, err := wikiReader.Next()
		// NOTE(review): this branch stops at i < *count while the
		// single-document branch uses i <= *count — confirm whether the
		// off-by-one asymmetry is intentional.
		for a != nil && err == nil && i < *count {
			err = batch.Index(strconv.Itoa(i), a)
			i++
			if err != nil {
				break
			}
			bytesInBatch += uint64(len(a.Title))
			bytesInBatch += uint64(len(a.Text))
			if batch.Size() >= *batchSize {
				work <- &Work{
					batch:          batch,
					plainTextBytes: bytesInBatch,
				}
				batch = index.NewBatch()
				bytesInBatch = 0
			}

			a, err = wikiReader.Next()
		}
		if err != nil {
			log.Fatalf("reading worker fatal: %v", err)
		}
		// close last batch
		if batch.Size() > 0 {
			work <- &Work{
				batch:          batch,
				plainTextBytes: bytesInBatch,
			}
		}

	} else {
		a, err := wikiReader.Next()
		for a != nil && err == nil && i <= *count {
			i++
			work <- &Work{
				doc:            a,
				id:             strconv.Itoa(i),
				plainTextBytes: uint64(len(a.Title) + len(a.Text)),
			}
			a, err = wikiReader.Next()
		}
		if err != nil {
			log.Fatalf("reading worker fatal: %v", err)
		}
	}

	close(work)

	// dump mem stats if requested
	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal(err)
		}
		// Fix: check the profile write and close the file (it was leaked).
		if err := pprof.WriteHeapProfile(f); err != nil {
			log.Fatal(err)
		}
		if err := f.Close(); err != nil {
			log.Fatal(err)
		}
	}
}
Beispiel #21
0
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
	"fmt"
	"os"

	"github.com/blevesearch/bleve"
	"github.com/spf13/cobra"
)

var cfgFile string

var idx bleve.Index

// RootCmd represents the base command when called without any subcommands
var RootCmd = &cobra.Command{
	Use:   "bleve",
	Short: "command-line tool to interact with a bleve index",
	Long:  `Bleve is a command-line tool to interact with a bleve index.`,
	PersistentPreRunE: func(cmd *cobra.Command, args []string) error {
		if len(args) < 1 {
			return fmt.Errorf("must specify path to index")
		}
		var err error
		idx, err = bleve.Open(args[0])
		if err != nil {
			return fmt.Errorf("error opening bleve index: %v", err)
		}
Beispiel #22
0
// Index adds the event to the given bleve index, keyed by its ID.
func (e *Event) Index(index bleve.Index) error {
	return index.Index(string(e.ID), e)
}
Beispiel #23
0
// indexTPB streams the gzip-compressed, '|'-separated TPB dump at *dump
// and indexes each record as a tpbDoc keyed by its hash. Writes are
// committed in batches of *batchSize; progress is logged every 1000
// documents, and indexing stops early once *indexLimit is reached.
func indexTPB(i bleve.Index) error {
	batch := bleve.NewBatch()
	batchCount := 0

	gzDumpFile, err := os.Open(*dump)
	if err != nil {
		return err
	}
	defer gzDumpFile.Close()

	dumpFile, err := gzip.NewReader(gzDumpFile)
	if err != nil {
		return err
	}
	// Fix: the gzip reader was never closed.
	defer dumpFile.Close()

	reader := csv.NewReader(dumpFile)
	reader.FieldsPerRecord = 7
	reader.Comma = '|'

	count := 0
	startTime := time.Now()

	log.Printf("Indexing...")

	for {
		r, err := reader.Read()
		if err == io.EOF {
			break
		} else if err != nil {
			// Malformed records are skipped.
			continue
		}

		size, err := strconv.ParseInt(r[1], 10, 0)
		if err != nil {
			// Fix: Println printed the literal string "%#v"; Printf
			// actually formats the value.
			fmt.Printf("%#v\n", size)
			size = 0
		}

		batch.Index(r[2], tpbDoc{
			Name:     r[0],
			Size:     size,
			Hash:     r[2],
			Category: r[4],
			Type:     "torrent",
		})
		batchCount++

		if batchCount >= *batchSize {
			err = i.Batch(batch)
			if err != nil {
				return err
			}
			batch = bleve.NewBatch()
			batchCount = 0
		}

		count++

		if count%1000 == 0 {
			indexDuration := time.Since(startTime)
			indexDurationSeconds := float64(indexDuration) / float64(time.Second)
			timePerDoc := float64(indexDuration) / float64(count)
			log.Printf("Indexed %d documents in %.2fs (average %.2fms/doc)", count, indexDurationSeconds, timePerDoc/float64(time.Millisecond))
		}

		if *indexLimit > 0 && count >= *indexLimit {
			break
		}
	}

	// flush the last batch
	if batchCount > 0 {
		err := i.Batch(batch)
		if err != nil {
			log.Fatal(err)
		}
	}

	indexDuration := time.Since(startTime)
	indexDurationSeconds := float64(indexDuration) / float64(time.Second)
	timePerDoc := float64(indexDuration) / float64(count)
	log.Printf("Finished indexing %d documents in %.2fs (average %.2fms/doc)", count, indexDurationSeconds, timePerDoc/float64(time.Millisecond))

	log.Printf("Still listening on http://%v", bind)

	return nil
}