func newNumericDocValuesWriter(fieldInfo *FieldInfo, iwBytesUsed util.Counter, trackDocsWithField bool) *NumericDocValuesWriter { ans := &NumericDocValuesWriter{ fieldInfo: fieldInfo, iwBytesUsed: iwBytesUsed, } if trackDocsWithField { ans.docsWithField = util.NewFixedBitSetOf(64) } ans.pending = packed.DeltaPackedBuilder(packed.PackedInts.COMPACT) ans.bytesUsed = ans.pending.RamBytesUsed() + ans.docsWithFieldBytesUsed() ans.iwBytesUsed.AddAndGet(ans.bytesUsed) return ans }
/*
Walk through all unique text tokens (Posting instances) found in this
field and serialize them into a single RAM segment.

fieldName is the indexed field's name; consumer receives the per-field
terms stream; state describes the segment being flushed (doc count,
buffered updates). Returns the first error from the codec consumers, or
nil on success.
*/
func (w *FreqProxTermsWriterPerField) flush(fieldName string, consumer FieldsConsumer, state *SegmentWriteState) error {
	if !w.fieldInfo.IsIndexed() {
		return nil // nothing to flush, don't bother the codec with the unindexed field
	}

	termsConsumer, err := consumer.AddField(w.fieldInfo)
	if err != nil {
		return err
	}
	termComp := termsConsumer.Comparator()

	// CONFUSING: this.indexOptions holds the index options that were
	// current when we first saw this field. But it's possible this has
	// changed, e.g. when other documents are indexed that cause a
	// "downgrade" of the IndexOptions. So we must decode the in-RAM
	// buffer according to this.indexOptions, but then write the new
	// segment to the directory according to currentFieldIndexOptions:
	currentFieldIndexOptions := w.fieldInfo.IndexOptions()
	assert(int(currentFieldIndexOptions) != 0)

	// What we write to the segment (current options)...
	writeTermFreq := int(currentFieldIndexOptions) >= int(INDEX_OPT_DOCS_AND_FREQS)
	writePositions := int(currentFieldIndexOptions) >= int(INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS)
	writeOffsets := int(currentFieldIndexOptions) >= int(INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

	// ...versus what the in-RAM buffer actually recorded.
	readTermFreq := w.hasFreq
	readPositions := w.hasProx
	readOffsets := w.hasOffsets

	// fmt.Printf("flush readTF=%v readPos=%v readOffs=%v\n",
	// 	readTermFreq, readPositions, readOffsets)

	// Make sure FieldInfo.update is working correctly: we can never
	// write more detail than we recorded, and offsets imply positions.
	assert(!writeTermFreq || readTermFreq)
	assert(!writePositions || readPositions)
	assert(!writeOffsets || readOffsets)
	assert(!writeOffsets || writePositions)

	var segUpdates map[*Term]int
	if state.SegUpdates != nil && len(state.SegUpdates.(*BufferedUpdates).terms) > 0 {
		segUpdates = state.SegUpdates.(*BufferedUpdates).terms
	}

	termIDs := w.sortPostings(termComp)
	numTerms := w.bytesHash.Size()
	text := new(util.BytesRef)
	postings := w.freqProxPostingsArray
	// One shared reader per stream (0 = doc/freq, 1 = prox), re-initialized
	// for each term below.
	freq := newByteSliceReader()
	prox := newByteSliceReader()

	visitedDocs := util.NewFixedBitSetOf(state.SegmentInfo.DocCount())
	sumTotalTermFreq := int64(0)
	sumDocFreq := int64(0)

	protoTerm := NewEmptyTerm(fieldName)
	for i := 0; i < numTerms; i++ {
		termId := termIDs[i]
		// fmt.Printf("term=%v\n", termId)
		// Get BytesRef for this term's text from the byte pool.
		textStart := postings.textStarts[termId]
		w.bytePool.SetBytesRef(text, textStart)

		w.initReader(freq, termId, 0)
		if readPositions || readOffsets {
			w.initReader(prox, termId, 1)
		}

		// TODO: really TermsHashPerField should take over most of this
		// loop, including merge sort of terms from multiple threads and
		// interacting with the TermsConsumer, only calling out to us
		// (passing us the DocConsumer) to handle delivery of docs/positions
		postingsConsumer, err := termsConsumer.StartTerm(text.ToBytes())
		if err != nil {
			return err
		}

		// delDocLimit: docIDs below this were superseded by a buffered
		// update on this exact term.
		// NOTE(review): segUpdates is map[*Term]int, so this lookup
		// compares *pointers* — a locally-built protoTerm can only hit an
		// entry stored under the identical pointer. Verify against how
		// BufferedUpdates populates .terms; this may never match.
		delDocLimit := 0
		if segUpdates != nil {
			protoTerm.Bytes = text.ToBytes()
			if docIDUpto, ok := segUpdates[protoTerm]; ok {
				delDocLimit = docIDUpto
			}
		}

		// Now termStates has numToMerge FieldMergeStates which all
		// share the same term. Now we must interleave the docID streams.
		docFreq := 0
		totalTermFreq := int64(0)
		docId := 0
		for {
			// fmt.Println("  cycle")
			var termFreq int
			if freq.eof() {
				if postings.lastDocCodes[termId] != -1 {
					// Stream is exhausted but the final doc is still held in
					// the parallel arrays: return last doc.
					docId = postings.lastDocIDs[termId]
					if readTermFreq {
						termFreq = postings.termFreqs[termId]
					} else {
						termFreq = -1
					}
					// -1 marks the last doc as consumed so the next
					// iteration terminates.
					postings.lastDocCodes[termId] = -1
				} else {
					// EOF
					break
				}
			} else {
				// Decode the next delta-coded doc entry. With freqs, the
				// low bit of the code flags an implicit freq of 1.
				code, err := freq.ReadVInt()
				if err != nil {
					return err
				}
				if !readTermFreq {
					docId += int(code)
					termFreq = -1
				} else {
					docId += int(uint(code) >> 1)
					if (code & 1) != 0 {
						termFreq = 1
					} else {
						n, err := freq.ReadVInt()
						if err != nil {
							return err
						}
						termFreq = int(n)
					}
				}

				assert(docId != postings.lastDocIDs[termId])
			}

			docFreq++
			assert2(docId < state.SegmentInfo.DocCount(),
				"doc=%v maxDoc=%v", docId, state.SegmentInfo.DocCount())

			// NOTE: we could check here if the docID was deleted, and skip
			// it. However, this is somewhat dangerous because it can yield
			// non-deterministic behavior since we may see the docID before
			// we see the term that caused it to be deleted. This would
			// mean some (but not all) of its postings may make it into the
			// index, which'd alter the docFreq for those terms. We could
			// fix this by doing two passes, i.e. first sweep marks all del
			// docs, and 2nd sweep does the real flush, but I suspect
			// that'd add too much time to flush.
			visitedDocs.Set(docId)
			// Branchless select: pass termFreq only when the segment keeps
			// freqs, else -1.
			err := postingsConsumer.StartDoc(docId, map[bool]int{true: termFreq, false: -1}[writeTermFreq])
			if err != nil {
				return err
			}
			if docId < delDocLimit {
				panic("not implemented yet")
			}

			totalTermFreq += int64(termFreq)

			// Carefully copy over the prox + payload info, changing the
			// format to match Lucene's segment format.
			if readPositions || readOffsets {
				// we did record positions (& maybe payload) and/or offsets
				position := 0
				// offset := 0
				for j := 0; j < termFreq; j++ {
					var thisPayload []byte

					if readPositions {
						// Low bit of the position code flags a payload.
						code, err := prox.ReadVInt()
						if err != nil {
							return err
						}
						position += int(uint(code) >> 1)

						if (code & 1) != 0 {
							panic("not implemented yet")
						}

						if readOffsets {
							panic("not implemented yet")
						} else if writePositions {
							err = postingsConsumer.AddPosition(position, thisPayload, -1, -1)
							if err != nil {
								return err
							}
						}
					}
				}
			}
			err = postingsConsumer.FinishDoc()
			if err != nil {
				return err
			}
		}
		err = termsConsumer.FinishTerm(text.ToBytes(),
			codec.NewTermStats(docFreq, map[bool]int64{true: totalTermFreq, false: -1}[writeTermFreq]))
		if err != nil {
			return err
		}
		sumTotalTermFreq += int64(totalTermFreq)
		sumDocFreq += int64(docFreq)
	}

	return termsConsumer.Finish(
		map[bool]int64{true: sumTotalTermFreq, false: -1}[writeTermFreq],
		sumDocFreq, visitedDocs.Cardinality())
}