Example #1
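// newNumericDocValuesWriter allocates a writer for a single numeric
// doc-values field, optionally tracking which documents actually have a
// value, and charges its initial RAM footprint to the IndexWriter's counter.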
func newNumericDocValuesWriter(fieldInfo *FieldInfo,
	iwBytesUsed util.Counter, trackDocsWithField bool) *NumericDocValuesWriter {
	ans := &NumericDocValuesWriter{
		fieldInfo:   fieldInfo,
		iwBytesUsed: iwBytesUsed,
	}
	if trackDocsWithField {
		ans.docsWithField = util.NewFixedBitSetOf(64)
	}
	ans.pending = packed.DeltaPackedBuilder(packed.PackedInts.COMPACT)
	ans.bytesUsed = ans.pending.RamBytesUsed() + ans.docsWithFieldBytesUsed()
	ans.iwBytesUsed.AddAndGet(ans.bytesUsed)
	return ans
}
/*
Walk through all unique text tokens (Posting instances) found in this
field and serialize them into a single RAM segment.
*/
func (w *FreqProxTermsWriterPerField) flush(fieldName string,
	consumer FieldsConsumer, state *SegmentWriteState) error {
	if !w.fieldInfo.IsIndexed() {
		return nil // nothing to flush, don't bother the codec with the unindexed field
	}

	termsConsumer, err := consumer.AddField(w.fieldInfo)
	if err != nil {
		return err
	}
	termComp := termsConsumer.Comparator()

	// CONFUSING: this.indexOptions holds the index options that were
	// current when we first saw this field. But it's possible this has
	// changed, e.g. when other documents are indexed that cause a
	// "downgrade" of the IndexOptions. So we must decode the in-RAM
	// buffer according to this.indexOptions, but then write the new
	// segment to the directory according to currentFieldIndexOptions:
	currentFieldIndexOptions := w.fieldInfo.IndexOptions()
	assert(int(currentFieldIndexOptions) != 0)

	writeTermFreq := int(currentFieldIndexOptions) >= int(INDEX_OPT_DOCS_AND_FREQS)
	writePositions := int(currentFieldIndexOptions) >= int(INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS)
	writeOffsets := int(currentFieldIndexOptions) >= int(INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

	readTermFreq := w.hasFreq
	readPositions := w.hasProx
	readOffsets := w.hasOffsets

	// fmt.Printf("flush readTF=%v readPos=%v readOffs=%v\n",
	// 	readTermFreq, readPositions, readOffsets)

	// Make sure FieldInfo.update is working correctly
	assert(!writeTermFreq || readTermFreq)
	assert(!writePositions || readPositions)
	assert(!writeOffsets || readOffsets)

	assert(!writeOffsets || writePositions)

	var segUpdates map[*Term]int
	if state.SegUpdates != nil && len(state.SegUpdates.(*BufferedUpdates).terms) > 0 {
		segUpdates = state.SegUpdates.(*BufferedUpdates).terms
	}

	termIDs := w.sortPostings(termComp)
	numTerms := w.bytesHash.Size()
	text := new(util.BytesRef)
	postings := w.freqProxPostingsArray
	freq := newByteSliceReader()
	prox := newByteSliceReader()

	visitedDocs := util.NewFixedBitSetOf(state.SegmentInfo.DocCount())
	sumTotalTermFreq := int64(0)
	sumDocFreq := int64(0)

	protoTerm := NewEmptyTerm(fieldName)
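	// Replay each term's in-RAM byte slices in sorted order, pushing its
	// docs, freqs and positions to the codec's consumers.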
	for i := 0; i < numTerms; i++ {
		termId := termIDs[i]
		// fmt.Printf("term=%v\n", termId)
		// Get BytesRef
		textStart := postings.textStarts[termId]
		w.bytePool.SetBytesRef(text, textStart)

		w.initReader(freq, termId, 0)
		if readPositions || readOffsets {
			w.initReader(prox, termId, 1)
		}

		// TODO: really TermsHashPerField should take over most of this
		// loop, including merge sort of terms from multiple threads and
		// interacting with the TermsConsumer, only calling out to us
		// (passing us the DocConsumer) to handle delivery of docs/positions

		postingsConsumer, err := termsConsumer.StartTerm(text.ToBytes())
		if err != nil {
			return err
		}

		delDocLimit := 0
		if segUpdates != nil {
			protoTerm.Bytes = text.ToBytes()
			if docIDUpto, ok := segUpdates[protoTerm]; ok {
				delDocLimit = docIDUpto
			}
		}

		// Now termStates has numToMerge FieldMergeStates which all
		// share the same term. Now we must interleave the docID streams.
		docFreq := 0
		totalTermFreq := int64(0)
		docId := 0

		for {
			// fmt.Println("  cycle")
			var termFreq int
			if freq.eof() {
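				// The freq slice is exhausted; the final document for this term
				// is still buffered in lastDocIDs/lastDocCodes, so emit it once.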
				if postings.lastDocCodes[termId] != -1 {
					// return last doc
					docId = postings.lastDocIDs[termId]
					if readTermFreq {
						termFreq = postings.termFreqs[termId]
					} else {
						termFreq = -1
					}
					postings.lastDocCodes[termId] = -1
				} else {
					// EOF
					break
				}
			} else {
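				// A buffered entry: when freqs are tracked the doc delta is
				// shifted left one bit and the low bit set means termFreq == 1;
				// otherwise the frequency follows as its own VInt.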
				code, err := freq.ReadVInt()
				if err != nil {
					return err
				}
				if !readTermFreq {
					docId += int(code)
					termFreq = -1
				} else {
					docId += int(uint(code) >> 1)
					if (code & 1) != 0 {
						termFreq = 1
					} else {
						n, err := freq.ReadVInt()
						if err != nil {
							return err
						}
						termFreq = int(n)
					}
				}

				assert(docId != postings.lastDocIDs[termId])
			}

			docFreq++
			assert2(docId < state.SegmentInfo.DocCount(),
				"doc=%v maxDoc=%v", docId, state.SegmentInfo.DocCount())

			// NOTE: we could check here if the docID was deleted, and skip
			// it. However, this is somewhat dangerous because it can yield
			// non-deterministic behavior since we may see the docID before
			// we see the term that caused it to be deleted. This would
			// mean some (but not all) of its postings may make it into the
			// index, which'd alter the docFreq for those terms. We could
			// fix this by doing two passes, i.e. first sweep marks all del
			// docs, and 2nd sweep does the real flush, but I suspect
			// that'd add too much time to flush.
			visitedDocs.Set(docId)
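			// Go has no ternary operator; the map literal below picks termFreq
			// when term frequencies are being written, and -1 otherwise.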
			err := postingsConsumer.StartDoc(docId,
				map[bool]int{true: termFreq, false: -1}[writeTermFreq])
			if err != nil {
				return err
			}
			if docId < delDocLimit {
				panic("not implemented yet")
			}

			totalTermFreq += int64(termFreq)

			// Carefully copy over the prox + payload info, changing the
			// format to match Lucene's segment format.

			if readPositions || readOffsets {
				// we did record positions (& maybe payload) and/or offsets
				position := 0
				// offset := 0
				for j := 0; j < termFreq; j++ {
					var thisPayload []byte

					if readPositions {
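						// Position deltas are shifted left one bit; the low bit
						// flags an attached payload, which this port does not
						// handle yet (hence the panic below).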
						code, err := prox.ReadVInt()
						if err != nil {
							return err
						}
						position += int(uint(code) >> 1)

						if (code & 1) != 0 {
							panic("not implemented yet")
						}

						if readOffsets {
							panic("not implemented yet")
						} else if writePositions {
							err = postingsConsumer.AddPosition(position, thisPayload, -1, -1)
							if err != nil {
								return err
							}
						}
					}
				}
			}
			err = postingsConsumer.FinishDoc()
			if err != nil {
				return err
			}
		}
		err = termsConsumer.FinishTerm(text.ToBytes(), codec.NewTermStats(docFreq,
			map[bool]int64{true: totalTermFreq, false: -1}[writeTermFreq]))
		if err != nil {
			return err
		}
		sumTotalTermFreq += totalTermFreq
		sumDocFreq += int64(docFreq)
	}

	return termsConsumer.Finish(
		map[bool]int64{true: sumTotalTermFreq, false: -1}[writeTermFreq],
		sumDocFreq, visitedDocs.Cardinality())
}
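
To make the freq-stream decoding in the loop above concrete, here is a minimal, self-contained sketch (not part of GoLucene; plain ints stand in for the VInt stream) of the bit layout it reads back: the doc delta is shifted left one bit and the low bit marks the common termFreq == 1 case.

package main

import "fmt"

// encode packs a doc delta and a term frequency the way the in-RAM freq
// stream does: termFreq == 1 is folded into the low bit of the delta so the
// common case needs no second value.
func encode(docDelta, termFreq int) []int {
	if termFreq == 1 {
		return []int{docDelta<<1 | 1}
	}
	return []int{docDelta << 1, termFreq}
}

// decode mirrors the branch in flush() above.
func decode(codes []int) (docDelta, termFreq int) {
	code := codes[0]
	docDelta = int(uint(code) >> 1)
	if code&1 != 0 {
		termFreq = 1
	} else {
		termFreq = codes[1]
	}
	return docDelta, termFreq
}

func main() {
	fmt.Println(decode(encode(3, 1))) // 3 1
	fmt.Println(decode(encode(5, 7))) // 5 7
}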