Example #1
func (t *TextField) Analyze() (int, analysis.TokenFrequencies) {
	var tokens analysis.TokenStream
	if t.analyzer != nil {
		bytesToAnalyze := t.Value()
		if t.options.IsStored() {
			// need to copy: analyzers may modify the bytes in place,
			// and the stored original must stay intact
			bytesCopied := make([]byte, len(bytesToAnalyze))
			copy(bytesCopied, bytesToAnalyze)
			bytesToAnalyze = bytesCopied
		}
		tokens = t.analyzer.Analyze(bytesToAnalyze)
	} else {
		tokens = analysis.TokenStream{
			&analysis.Token{
				Start:    0,
				End:      len(t.value),
				Term:     t.value,
				Position: 1,
				Type:     analysis.AlphaNumeric,
			},
		}
	}
	fieldLength := len(tokens) // number of tokens in this doc field
	tokenFreqs := analysis.TokenFrequency(tokens, t.arrayPositions, t.options.IncludeTermVectors())
	return fieldLength, tokenFreqs
}
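
TextField.Analyze is normally driven by the index, but it can be exercised directly. A minimal sketch, assuming the older bleve API shown above (github.com/blevesearch/bleve/document); a field built with NewTextField has no analyzer attached, so Analyze takes the single-token fallback branch:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/document"
)

func main() {
	// no analyzer is attached here, so Analyze emits one
	// AlphaNumeric token covering the whole value
	f := document.NewTextField("body", nil, []byte("hello"))
	fieldLength, tokenFreqs := f.Analyze()
	fmt.Println(fieldLength, len(tokenFreqs)) // 1 1
}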
Example #2
func (n *DateTimeField) Analyze() (int, analysis.TokenFrequencies) {
	tokens := make(analysis.TokenStream, 0)
	tokens = append(tokens, &analysis.Token{
		Start:    0,
		End:      len(n.value),
		Term:     n.value,
		Position: 1,
		Type:     analysis.DateTime,
	})

	// decode the prefix-coded value back to its int64 timestamp
	original, err := n.value.Int64()
	if err == nil {
		// also index the timestamp at progressively coarser precisions;
		// these shorter prefix-coded terms let date range queries match
		// wide spans with few terms
		shift := DefaultDateTimePrecisionStep
		for shift < 64 {
			shiftEncoded, err := numeric_util.NewPrefixCodedInt64(original, shift)
			if err != nil {
				break
			}
			token := analysis.Token{
				Start:    0,
				End:      len(shiftEncoded),
				Term:     shiftEncoded,
				Position: 1,
				Type:     analysis.DateTime,
			}
			tokens = append(tokens, &token)
			shift += DefaultDateTimePrecisionStep
		}
	}

	fieldLength := len(tokens)
	tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions)
	return fieldLength, tokenFreqs
}
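
The shift loop emits one extra token per precision step, so fieldLength is fixed for every DateTimeField. A quick sanity check of that count, assuming DefaultDateTimePrecisionStep is 4 (bleve's default for numeric fields; treated as an assumption here):

package main

import "fmt"

func main() {
	const step = 4 // assumed value of DefaultDateTimePrecisionStep
	tokens := 1    // the full-precision token appended first
	for shift := step; shift < 64; shift += step {
		tokens++ // one prefix-coded token per coarser precision
	}
	fmt.Println(tokens) // 16 with step 4
}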
Example #3
func (b *BooleanField) Analyze() (int, analysis.TokenFrequencies) {
	tokens := make(analysis.TokenStream, 0)
	tokens = append(tokens, &analysis.Token{
		Start:    0,
		End:      len(b.value),
		Term:     b.value,
		Position: 1,
		Type:     analysis.Boolean,
	})

	fieldLength := len(tokens)
	tokenFreqs := analysis.TokenFrequency(tokens, b.arrayPositions, b.options.IncludeTermVectors())
	return fieldLength, tokenFreqs
}
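
A boolean field always produces exactly one token. A minimal sketch, again assuming the older bleve document API, where NewBooleanField encodes the value as the single byte "T" or "F":

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/document"
)

func main() {
	f := document.NewBooleanField("published", nil, true)
	fieldLength, tokenFreqs := f.Analyze()
	fmt.Println(fieldLength, len(tokenFreqs)) // 1 1: single token "T"
}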
Example #4
func BenchmarkAnalysis(b *testing.B) {
	for i := 0; i < b.N; i++ {

		cache := registry.NewCache()
		analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
		if err != nil {
			b.Fatal(err)
		}

		ts := analyzer.Analyze(bleveWikiArticle)
		freqs := analysis.TokenFrequency(ts, nil, true)
		if len(freqs) != 511 {
			b.Errorf("expected %d freqs, got %d", 511, len(freqs))
		}
	}
}
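
To reproduce this benchmark locally, run it through the standard tooling, e.g. go test -bench=BenchmarkAnalysis -benchmem in the package directory; the expected count of 511 distinct terms is specific to the bleveWikiArticle fixture used by the test.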
Example #5
func (t *TextField) Analyze() (int, analysis.TokenFrequencies) {
	var tokens analysis.TokenStream
	if t.analyzer != nil {
		tokens = t.analyzer.Analyze(t.Value())
	} else {
		tokens = analysis.TokenStream{
			&analysis.Token{
				Start:    0,
				End:      len(t.value),
				Term:     t.value,
				Position: 1,
				Type:     analysis.AlphaNumeric,
			},
		}
	}
	fieldLength := len(tokens) // number of tokens in this doc field
	tokenFreqs := analysis.TokenFrequency(tokens)
	return fieldLength, tokenFreqs
}
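
Note the differing arities across these examples: Examples 1, 3, and 4 call analysis.TokenFrequency with three arguments (tokens, array positions, and an includeTermVectors flag), Example 2 with two, and this example with the token stream alone. The examples come from different revisions of the library, so the extra parameters reflect the signature gaining arguments over time; match the arity of whichever version you build against.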
Example #6
func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult {
	rv := &index.AnalysisResult{
		DocID: d.ID,
		Rows:  make([]index.IndexRow, 0, 100),
	}

	docNumBytes := EncodeUvarintAscending(nil, d.Number)

	// track our back index entries
	backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)

	// information we collate as we merge fields with same name
	fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
	fieldLengths := make(map[uint16]int)
	fieldIncludeTermVectors := make(map[uint16]bool)
	fieldNames := make(map[uint16]string)

	// set the value for the _id field
	idBytes := []byte(d.ID)
	fieldTermFreqs[0] = analysis.TokenFrequency(analysis.TokenStream{
		&analysis.Token{
			Term:     idBytes,
			Position: 1,
			Start:    0,
			End:      len(d.ID),
		},
	}, nil, false)
	// store the _id field as well
	f := document.NewTextField("_id", nil, idBytes)
	rv.Rows, backIndexStoredEntries = udc.storeField(docNumBytes, f, 0, rv.Rows, backIndexStoredEntries)

	analyzeField := func(field document.Field, storable bool) {
		fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())
		if newFieldRow != nil {
			rv.Rows = append(rv.Rows, newFieldRow)
		}
		fieldNames[fieldIndex] = field.Name()

		if field.Options().IsIndexed() {
			fieldLength, tokenFreqs := field.Analyze()
			existingFreqs := fieldTermFreqs[fieldIndex]
			if existingFreqs == nil {
				fieldTermFreqs[fieldIndex] = tokenFreqs
			} else {
				existingFreqs.MergeAll(field.Name(), tokenFreqs)
				fieldTermFreqs[fieldIndex] = existingFreqs
			}
			fieldLengths[fieldIndex] += fieldLength
			fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
		}

		if storable && field.Options().IsStored() {
			rv.Rows, backIndexStoredEntries = udc.storeField(docNumBytes, field, fieldIndex, rv.Rows, backIndexStoredEntries)
		}
	}

	// walk all the fields, record stored fields now
	// place information about indexed fields into map
	// this collates information across fields with
	// same names (arrays)
	for _, field := range d.Fields {
		analyzeField(field, true)
	}

	if len(d.CompositeFields) > 0 {
		for fieldIndex, tokenFreqs := range fieldTermFreqs {
			if fieldIndex == 0 {
				// don't add the _id field to any composite field
				continue
			}
			// see if any of the composite fields need this
			for _, compositeField := range d.CompositeFields {
				compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
			}
		}

		for _, compositeField := range d.CompositeFields {
			analyzeField(compositeField, false)
		}
	}

	// reserve capacity for the term rows plus the final back index row
	rowsCapNeeded := len(rv.Rows) + 1
	for _, tokenFreqs := range fieldTermFreqs {
		rowsCapNeeded += len(tokenFreqs)
	}

	rv.Rows = append(make([]index.IndexRow, 0, rowsCapNeeded), rv.Rows...)

	backIndexTermsEntries := make([]*BackIndexTermsEntry, 0, len(fieldTermFreqs))

	// walk through the collated information and process
	// once for each indexed field (unique name)
	for fieldIndex, tokenFreqs := range fieldTermFreqs {
		fieldLength := fieldLengths[fieldIndex]
		includeTermVectors := fieldIncludeTermVectors[fieldIndex]

		// encode this field
		rv.Rows, backIndexTermsEntries = udc.indexField(docNumBytes, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermsEntries)
	}

	// build the back index row
	backIndexRow := NewBackIndexRow(docNumBytes, backIndexTermsEntries, backIndexStoredEntries)
	rv.Rows = append(rv.Rows, backIndexRow)

	return rv
}
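
The analyzeField closure above collates token frequencies when several fields share a name (array elements). A minimal sketch of that merge step in isolation, assuming the three-argument TokenFrequency signature used in this example:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis"
)

func main() {
	mk := func(term string) analysis.TokenStream {
		return analysis.TokenStream{&analysis.Token{
			Term: []byte(term), Position: 1, Start: 0, End: len(term),
		}}
	}
	// two array elements of the same field, analyzed separately
	first := analysis.TokenFrequency(mk("go"), []uint64{0}, false)
	second := analysis.TokenFrequency(mk("go"), []uint64{1}, false)
	// MergeAll folds the second element's frequencies into the first,
	// just as analyzeField does when a field name repeats
	first.MergeAll("tags", second)
	fmt.Println(len(first)) // 1: one unique term with combined frequency
}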