func (t *TextField) Analyze() (int, analysis.TokenFrequencies) {
	var tokens analysis.TokenStream
	if t.analyzer != nil {
		bytesToAnalyze := t.Value()
		if t.options.IsStored() {
			// need to copy
			bytesCopied := make([]byte, len(bytesToAnalyze))
			copy(bytesCopied, bytesToAnalyze)
			bytesToAnalyze = bytesCopied
		}
		tokens = t.analyzer.Analyze(bytesToAnalyze)
	} else {
		tokens = analysis.TokenStream{
			&analysis.Token{
				Start:    0,
				End:      len(t.value),
				Term:     t.value,
				Position: 1,
				Type:     analysis.AlphaNumeric,
			},
		}
	}
	fieldLength := len(tokens) // number of tokens in this doc field
	tokenFreqs := analysis.TokenFrequency(tokens, t.arrayPositions, t.options.IncludeTermVectors())
	return fieldLength, tokenFreqs
}
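// Illustrative sketch, not part of the original source: the "need to copy"
// branch above guards against aliasing. When the field is stored, the same
// byte slice backs both the stored value and the analyzer input, and an
// analyzer that rewrites term bytes in place would corrupt the stored copy.
// A minimal standalone demonstration of that hazard, using a hypothetical
// in-place lowercasing step (not bleve's actual filter):
package main

import "fmt"

func main() {
	stored := []byte("Hello World")
	toAnalyze := stored // no copy: both slices share the same backing array
	for i, c := range toAnalyze {
		if c >= 'A' && c <= 'Z' {
			toAnalyze[i] = c + ('a' - 'A') // in-place lowercasing
		}
	}
	fmt.Println(string(stored)) // prints "hello world": the stored value changed too
}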
func (n *DateTimeField) Analyze() (int, analysis.TokenFrequencies) {
	tokens := make(analysis.TokenStream, 0)
	tokens = append(tokens, &analysis.Token{
		Start:    0,
		End:      len(n.value),
		Term:     n.value,
		Position: 1,
		Type:     analysis.DateTime,
	})

	original, err := n.value.Int64()
	if err == nil {
		shift := DefaultDateTimePrecisionStep
		for shift < 64 {
			shiftEncoded, err := numeric_util.NewPrefixCodedInt64(original, shift)
			if err != nil {
				break
			}
			token := analysis.Token{
				Start:    0,
				End:      len(shiftEncoded),
				Term:     shiftEncoded,
				Position: 1,
				Type:     analysis.DateTime,
			}
			tokens = append(tokens, &token)
			shift += DefaultDateTimePrecisionStep
		}
	}

	fieldLength := len(tokens)
	tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions)
	return fieldLength, tokenFreqs
}
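// Conceptual sketch, not from the original source: the loop above emits one
// extra term per precision step, each representing the timestamp with more
// low-order bits dropped, which is what lets range queries match coarse terms
// instead of every individual value. The real numeric_util prefix coding also
// records the shift and uses its own byte layout; this standalone snippet
// only illustrates the "progressively coarser value" idea, with 4 standing in
// for DefaultDateTimePrecisionStep:
package main

import "fmt"

func main() {
	original := int64(1457459491000000000) // example nanosecond timestamp
	for shift := uint(4); shift < 64; shift += 4 {
		masked := original &^ (1<<shift - 1) // drop the low `shift` bits
		fmt.Printf("shift=%2d -> %d\n", shift, masked)
	}
}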
func (b *BooleanField) Analyze() (int, analysis.TokenFrequencies) {
	tokens := make(analysis.TokenStream, 0)
	tokens = append(tokens, &analysis.Token{
		Start:    0,
		End:      len(b.value),
		Term:     b.value,
		Position: 1,
		Type:     analysis.Boolean,
	})

	fieldLength := len(tokens)
	tokenFreqs := analysis.TokenFrequency(tokens, b.arrayPositions, b.options.IncludeTermVectors())
	return fieldLength, tokenFreqs
}
func BenchmarkAnalysis(b *testing.B) {
	for i := 0; i < b.N; i++ {
		cache := registry.NewCache()
		analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
		if err != nil {
			b.Fatal(err)
		}

		ts := analyzer.Analyze(bleveWikiArticle)
		freqs := analysis.TokenFrequency(ts, nil, true)
		if len(freqs) != 511 {
			b.Errorf("expected %d freqs, got %d", 511, len(freqs))
		}
	}
}
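// Usage note (assumption about the surrounding test package): the benchmark
// can be run from its package directory with the standard tooling, e.g.
//
//	go test -bench=BenchmarkAnalysis -benchmem
//
// Since TokenFrequencies is keyed by term, the expected count of 511 is the
// number of distinct terms produced from the bleveWikiArticle fixture,
// presumably defined elsewhere in the test file.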
func (t *TextField) Analyze() (int, analysis.TokenFrequencies) {
	var tokens analysis.TokenStream
	if t.analyzer != nil {
		tokens = t.analyzer.Analyze(t.Value())
	} else {
		tokens = analysis.TokenStream{
			&analysis.Token{
				Start:    0,
				End:      len(t.value),
				Term:     t.value,
				Position: 1,
				Type:     analysis.AlphaNumeric,
			},
		}
	}
	fieldLength := len(tokens) // number of tokens in this doc field
	tokenFreqs := analysis.TokenFrequency(tokens)
	return fieldLength, tokenFreqs
}
func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult {
	rv := &index.AnalysisResult{
		DocID: d.ID,
		Rows:  make([]index.IndexRow, 0, 100),
	}

	docNumBytes := EncodeUvarintAscending(nil, d.Number)

	// track our back index entries
	backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)

	// information we collate as we merge fields with same name
	fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
	fieldLengths := make(map[uint16]int)
	fieldIncludeTermVectors := make(map[uint16]bool)
	fieldNames := make(map[uint16]string)

	// set the value for the _id field
	idBytes := []byte(d.ID)
	fieldTermFreqs[0] = analysis.TokenFrequency(analysis.TokenStream{
		&analysis.Token{
			Term:     idBytes,
			Position: 1,
			Start:    0,
			End:      len(d.ID),
		},
	}, nil, false)

	// store the _id field as well
	f := document.NewTextField("_id", nil, []byte(idBytes))
	rv.Rows, backIndexStoredEntries = udc.storeField(docNumBytes, f, 0, rv.Rows, backIndexStoredEntries)

	analyzeField := func(field document.Field, storable bool) {
		fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())
		if newFieldRow != nil {
			rv.Rows = append(rv.Rows, newFieldRow)
		}
		fieldNames[fieldIndex] = field.Name()

		if field.Options().IsIndexed() {
			fieldLength, tokenFreqs := field.Analyze()
			existingFreqs := fieldTermFreqs[fieldIndex]
			if existingFreqs == nil {
				fieldTermFreqs[fieldIndex] = tokenFreqs
			} else {
				existingFreqs.MergeAll(field.Name(), tokenFreqs)
				fieldTermFreqs[fieldIndex] = existingFreqs
			}
			fieldLengths[fieldIndex] += fieldLength
			fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
		}

		if storable && field.Options().IsStored() {
			rv.Rows, backIndexStoredEntries = udc.storeField(docNumBytes, field, fieldIndex, rv.Rows, backIndexStoredEntries)
		}
	}

	// walk all the fields, record stored fields now
	// place information about indexed fields into map
	// this collates information across fields with
	// same names (arrays)
	for _, field := range d.Fields {
		analyzeField(field, true)
	}

	if len(d.CompositeFields) > 0 {
		for fieldIndex, tokenFreqs := range fieldTermFreqs {
			if fieldIndex == 0 {
				// dont add id to any composite field
				continue
			}
			// see if any of the composite fields need this
			for _, compositeField := range d.CompositeFields {
				compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
			}
		}

		for _, compositeField := range d.CompositeFields {
			analyzeField(compositeField, false)
		}
	}

	rowsCapNeeded := len(rv.Rows) + 1
	for _, tokenFreqs := range fieldTermFreqs {
		rowsCapNeeded += len(tokenFreqs)
	}

	rv.Rows = append(make([]index.IndexRow, 0, rowsCapNeeded), rv.Rows...)

	backIndexTermsEntries := make([]*BackIndexTermsEntry, 0, len(fieldTermFreqs))

	// walk through the collated information and process
	// once for each indexed field (unique name)
	for fieldIndex, tokenFreqs := range fieldTermFreqs {
		fieldLength := fieldLengths[fieldIndex]
		includeTermVectors := fieldIncludeTermVectors[fieldIndex]

		// encode this field
		rv.Rows, backIndexTermsEntries = udc.indexField(docNumBytes, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermsEntries)
	}

	// build the back index row
	backIndexRow := NewBackIndexRow(docNumBytes, backIndexTermsEntries, backIndexStoredEntries)
	rv.Rows = append(rv.Rows, backIndexRow)

	return rv
}
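// Illustrative sketch, not part of the original source: the fieldTermFreqs map
// above collates every document field that shares a name (for example the
// elements of an array field) into a single set of term frequencies per field
// index, via TokenFrequencies.MergeAll, before any index rows are encoded.
// A standalone analogue of that merge step using plain maps:
package main

import "fmt"

func main() {
	// term frequencies from two array elements of the same field
	elem0 := map[string]int{"red": 1, "blue": 2}
	elem1 := map[string]int{"blue": 1, "green": 3}

	merged := map[string]int{}
	for _, freqs := range []map[string]int{elem0, elem1} {
		for term, f := range freqs {
			merged[term] += f // analogous to TokenFrequencies.MergeAll
		}
	}
	fmt.Println(merged) // map[blue:3 green:3 red:1]
}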