Esempio n. 1
0
// indexSite walks the "repositories" tree under htdocs and adds every
// accession JSON document it finds to index, in batches.
//
// A path is indexed when it contains "/accessions/" and ends in ".json".
// Entries that cannot be accessed, read, parsed or queued are logged and
// skipped so a single bad document does not stop the run. Batches start at 10
// documents (or maxBatchSize, when that is smaller and positive) and double
// after every flush until they reach maxBatchSize.
//
// It returns the first error reported by index.Batch, or nil.
func indexSite(index bleve.Index, maxBatchSize int) error {
	startT := time.Now()
	count := 0
	batch := index.NewBatch()
	batchSize := 10
	// Respect the caller's cap from the very first batch.
	if maxBatchSize > 0 && batchSize > maxBatchSize {
		batchSize = maxBatchSize
	}

	// flush writes the pending batch to the index and updates the running total.
	flush := func() error {
		log.Printf("Indexing %d items", batch.Size())
		if err := index.Batch(batch); err != nil {
			return err
		}
		count += batch.Size()
		log.Printf("Indexed: %d items, batch size %d, running %s\n", count, batchSize, time.Since(startT))
		return nil
	}

	root := path.Join(htdocs, "repositories")
	log.Printf("Walking %s", root)
	err := filepath.Walk(root, func(p string, _ os.FileInfo, walkErr error) error {
		if walkErr != nil {
			// Report entries the walker could not access, then keep walking.
			log.Printf("Can't access %s, %s", p, walkErr)
			return nil
		}
		if !strings.Contains(p, "/accessions/") || !strings.HasSuffix(p, ".json") {
			return nil
		}
		src, err := ioutil.ReadFile(p)
		if err != nil {
			log.Printf("Can't read %s, %s", p, err)
			return nil
		}
		view := new(cait.NormalizedAccessionView)
		if err := json.Unmarshal(src, view); err != nil {
			log.Printf("Can't parse %s, %s", p, err)
			return nil
		}
		// The document ID is the path with the htdocs prefix and the trailing
		// "json" removed.
		// NOTE(review): only "json" is trimmed, not ".json", so IDs end in ".".
		// Left unchanged because stored IDs may be relied on elsewhere —
		// confirm before switching the suffix to ".json".
		id := strings.TrimSuffix(strings.TrimPrefix(p, htdocs), "json")
		if err := batch.Index(id, view); err != nil {
			log.Printf("Indexing error %s, %s", p, err)
			return nil
		}
		if batch.Size() < batchSize {
			return nil
		}
		// A failed batch write aborts the walk; the error reaches the caller.
		if err := flush(); err != nil {
			return err
		}
		batch = index.NewBatch()
		// Grow the batch size geometrically, capped at maxBatchSize.
		if batchSize < maxBatchSize {
			batchSize = batchSize * 2
		}
		if batchSize > maxBatchSize {
			batchSize = maxBatchSize
		}
		return nil
	})
	if err != nil {
		return err
	}
	// Flush whatever is left over from the last partial batch.
	if batch.Size() > 0 {
		if err := flush(); err != nil {
			return err
		}
	}
	log.Printf("Total indexed: %d items, total run time %s\n", count, time.Since(startT))
	return nil
}
Esempio n. 2
0
// indexBeer indexes every regular entry found directly inside *jsonDir into i.
//
// Each file's raw bytes are queued under a document ID equal to the file name
// without its extension. Queued documents are written to the index whenever
// *batchSize of them have accumulated, and once more at the end for any
// remainder. Progress is logged every 1000 documents and when the run ends.
// Sub-directories are skipped.
//
// It returns the first error from listing the directory, reading a file,
// queueing a document or writing a batch.
func indexBeer(i bleve.Index) error {
	// open the directory
	dirEntries, err := ioutil.ReadDir(*jsonDir)
	if err != nil {
		return err
	}

	log.Printf("Indexing...")
	count := 0
	startTime := time.Now()

	// logProgress reports the running total and the average time per document.
	logProgress := func() {
		indexDuration := time.Since(startTime)
		indexDurationSeconds := float64(indexDuration) / float64(time.Second)
		timePerDoc := 0.0
		// Guard the average so an empty directory does not log +Inf.
		if count > 0 {
			timePerDoc = float64(indexDuration) / float64(count)
		}
		log.Printf("Indexed %d documents, in %.2fs (average %.2fms/doc)", count, indexDurationSeconds, timePerDoc/float64(time.Millisecond))
	}

	// walk the directory entries for indexing
	batch := i.NewBatch()
	batchCount := 0
	for _, dirEntry := range dirEntries {
		// Reading a directory as a file would fail and abort the whole run.
		if dirEntry.IsDir() {
			continue
		}
		filename := dirEntry.Name()
		// read the bytes
		jsonBytes, err := ioutil.ReadFile(filepath.Join(*jsonDir, filename))
		if err != nil {
			return err
		}
		// the document ID is the file name without its extension
		ext := filepath.Ext(filename)
		docID := filename[:len(filename)-len(ext)]
		if err := batch.Index(docID, jsonBytes); err != nil {
			return err
		}
		batchCount++

		if batchCount >= *batchSize {
			if err := i.Batch(batch); err != nil {
				return err
			}
			batch = i.NewBatch()
			batchCount = 0
		}
		count++
		if count%1000 == 0 {
			logProgress()
		}
	}
	// flush the last batch
	if batchCount > 0 {
		if err := i.Batch(batch); err != nil {
			return err
		}
	}
	logProgress()
	return nil
}
Esempio n. 3
0
// BatchIndex writes all documents to the bleve index at INDEX in a single
// batch and returns the elapsed wall-clock time in milliseconds.
//
// The index is created with a default mapping; if creation fails, the
// existing index at INDEX is opened instead. Each document is queued under its
// Id with its Data as the body. The index is closed before returning.
//
// On any failure (opening the index, queueing a document, writing the batch or
// closing the index) it returns 0 and the error.
func (be *BleveEngine) BatchIndex(documents []*Document) (int64, error) {
	start := time.Now()

	index, err := bleve.New(INDEX, bleve.NewIndexMapping())
	if err != nil {
		// Creation fails when the index already exists; fall back to opening it.
		index, err = bleve.Open(INDEX)
		if err != nil {
			return 0, err
		}
	}

	batch := index.NewBatch()
	for _, document := range documents {
		if err := batch.Index(document.Id, document.Data); err != nil {
			// Best-effort close: the queueing error is the one worth reporting.
			index.Close()
			return 0, err
		}
	}

	if err := index.Batch(batch); err != nil {
		// Best-effort close: the batch error is the one worth reporting.
		index.Close()
		return 0, err
	}
	if err := index.Close(); err != nil {
		return 0, err
	}

	return int64(time.Since(start) / time.Millisecond), nil
}
Esempio n. 4
0
// readingWorker reads articles from the wiki reader opened on *source and
// sends them to work, then closes work.
//
// When *batchSize is greater than 1, articles are queued into bleve batches of
// *batchSize documents with IDs "0", "1", ...; each full batch (and the final
// partial one) is sent as a single Work item together with the number of title
// and text bytes it holds. Otherwise each article is sent as its own Work item
// with IDs "1", "2", .... In both modes at most *count articles are sent.
//
// Failures to open the reader, read an article or queue a document terminate
// the process via log.Fatal. If *memprofile is set, a heap profile is written
// to that file after work has been closed.
func readingWorker(index bleve.Index, work chan *Work) {
	wikiReader, err := blevebench.NewWikiReader(*source)
	if err != nil {
		log.Fatal(err)
	}
	defer wikiReader.Close()

	i := 0

	if *batchSize > 1 {
		batch := index.NewBatch()
		bytesInBatch := uint64(0)
		a, err := wikiReader.Next()
		for a != nil && err == nil && i < *count {
			err = batch.Index(strconv.Itoa(i), a)
			i++
			if err != nil {
				break
			}
			bytesInBatch += uint64(len(a.Title))
			bytesInBatch += uint64(len(a.Text))
			if batch.Size() >= *batchSize {
				work <- &Work{
					batch:          batch,
					plainTextBytes: bytesInBatch,
				}
				batch = index.NewBatch()
				bytesInBatch = 0
			}

			a, err = wikiReader.Next()
		}
		if err != nil {
			log.Fatalf("reading worker fatal: %v", err)
		}
		// close last batch
		if batch.Size() > 0 {
			work <- &Work{
				batch:          batch,
				plainTextBytes: bytesInBatch,
			}
		}

	} else {
		a, err := wikiReader.Next()
		// Stop after *count articles, matching the bound used in batch mode.
		for a != nil && err == nil && i < *count {
			i++
			work <- &Work{
				doc:            a,
				id:             strconv.Itoa(i),
				plainTextBytes: uint64(len(a.Title) + len(a.Text)),
			}
			a, err = wikiReader.Next()
		}
		if err != nil {
			log.Fatalf("reading worker fatal: %v", err)
		}
	}

	close(work)

	// dump mem stats if requested
	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal(err)
		}
		// Write and close failures are only logged: work already sent may
		// still be in progress elsewhere in the process.
		if err := pprof.WriteHeapProfile(f); err != nil {
			log.Printf("could not write memory profile: %v", err)
		}
		if err := f.Close(); err != nil {
			log.Printf("could not close memory profile: %v", err)
		}
	}
}