func newLucene49NormsConsumer(state *SegmentWriteState,
	dataCodec, dataExtension, metaCodec, metaExtension string) (nc *NormsConsumer, err error) {

	assert(packed.PackedFormat(packed.PACKED_SINGLE_BLOCK).IsSupported(1))
	assert(packed.PackedFormat(packed.PACKED_SINGLE_BLOCK).IsSupported(2))
	assert(packed.PackedFormat(packed.PACKED_SINGLE_BLOCK).IsSupported(4))

	nc = &NormsConsumer{maxDoc: state.SegmentInfo.DocCount()}
	var success = false
	defer func() {
		if !success {
			util.CloseWhileSuppressingError(nc)
		}
	}()

	dataName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, dataExtension)
	if nc.data, err = state.Directory.CreateOutput(dataName, state.Context); err != nil {
		return nil, err
	}
	if err = codec.WriteHeader(nc.data, dataCodec, VERSION_CURRENT); err != nil {
		return nil, err
	}

	metaName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, metaExtension)
	if nc.meta, err = state.Directory.CreateOutput(metaName, state.Context); err != nil {
		return nil, err
	}
	if err = codec.WriteHeader(nc.meta, metaCodec, VERSION_CURRENT); err != nil {
		return nil, err
	}

	success = true
	return nc, nil
}
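// A hedged usage sketch, not taken from this file: in upstream Lucene 4.9
// the norms format constructs its consumer with codec names
// "Lucene49NormsData"/"Lucene49NormsMetadata" and file extensions
// "nvd"/"nvm"; a port would presumably mirror that call shape, e.g.
//
//	consumer, err := newLucene49NormsConsumer(state,
//		"Lucene49NormsData", "nvd", "Lucene49NormsMetadata", "nvm")
//	if err != nil {
//		return nil, err
//	}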
func NewForUtil(in DataInput) (fu ForUtil, err error) {
	self := ForUtil{}
	packedIntsVersion, err := in.ReadVInt()
	if err != nil {
		return self, err
	}
	packed.CheckVersion(packedIntsVersion)

	self.encodedSizes = make([]int32, 33)
	self.encoders = make([]packed.PackedIntsEncoder, 33)
	self.decoders = make([]packed.PackedIntsDecoder, 33)
	self.iterations = make([]int32, 33)

	for bpv := 1; bpv <= 32; bpv++ {
		code, err := in.ReadVInt()
		if err != nil {
			return self, err
		}
		formatId := uint32(code) >> 5
		bitsPerValue := (uint32(code) & 31) + 1

		format := packed.PackedFormat(formatId)
		// assert format.isSupported(bitsPerValue)
		self.encodedSizes[bpv] = encodedSize(format, packedIntsVersion, bitsPerValue)
		self.encoders[bpv] = packed.GetPackedIntsEncoder(format, packedIntsVersion, bitsPerValue)
		self.decoders[bpv] = packed.GetPackedIntsDecoder(format, packedIntsVersion, bitsPerValue)
		self.iterations[bpv] = computeIterations(self.decoders[bpv])
	}
	return self, nil
}
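// A small illustration (hypothetical helper, not part of the codec) of how
// each per-bpv configuration VInt read above packs two fields into one
// code: the high bits carry the packed format id and the low five bits
// carry bitsPerValue-1. For instance, code 0x23 (0b100011) decodes to
// formatId=1 and bitsPerValue=4.
func decodeForUtilCode(code int32) (formatId, bitsPerValue uint32) {
	return uint32(code) >> 5, (uint32(code) & 31) + 1
}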
func (np *NormsProducer) loadNorms(field *FieldInfo) (NumericDocValues, error) {
	entry, ok := np.norms[int(field.Number)]
	assert(ok)
	switch entry.format {
	case CONST_COMPRESSED:
		return func(int) int64 { return entry.offset }, nil

	case UNCOMPRESSED:
		panic("not implemented yet")

	case DELTA_COMPRESSED:
		panic("not implemented yet")

	case TABLE_COMPRESSED:
		if err := np.data.Seek(entry.offset); err != nil {
			return nil, err
		}
		packedVersion, err := np.data.ReadVInt()
		if err != nil {
			return nil, err
		}
		size, err := int32ToInt(np.data.ReadVInt())
		if err != nil {
			return nil, err
		}
		if size > 256 {
			return nil, errors.New(fmt.Sprintf(
				"TABLE_COMPRESSED cannot have more than 256 distinct values, input=%v",
				np.data))
		}
		decode := make([]int64, size)
		for i := range decode {
			if decode[i], err = np.data.ReadLong(); err != nil {
				return nil, err
			}
		}
		formatId, err := int32ToInt(np.data.ReadVInt())
		if err != nil {
			return nil, err
		}
		bitsPerValue, err := np.data.ReadVInt()
		if err != nil {
			return nil, err
		}
		ordsReader, err := packed.ReaderNoHeader(np.data, packed.PackedFormat(formatId),
			packedVersion, int32(np.maxDoc), uint32(bitsPerValue))
		if err != nil {
			return nil, err
		}
		atomic.AddInt64(&np.ramBytesUsed, util.SizeOf(decode)+ordsReader.RamBytesUsed())
		return func(docId int) int64 {
			return decode[int(ordsReader.Get(docId))]
		}, nil

	default:
		panic("assert fail")
	}
}
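// For reference, the TABLE_COMPRESSED entry decoded above mirrors what the
// norms consumer writes into the data file at entry.offset (see
// NormsConsumer.AddNumericField):
//
//	VInt       packedVersion
//	VInt       size           -- power-of-two table length, at most 256
//	Long×size  decode table   -- ord -> norm value
//	VInt       formatId
//	VInt       bitsPerValue
//	packed     per-document ords, written without a header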
func computeMaxDataSize() int {
	maxDataSize := 0
	// for each version
	for version := packed.PACKED_VERSION_START; version <= packed.PACKED_VERSION_CURRENT; version++ {
		// for each packed format
		for format := packed.PACKED; format <= packed.PACKED_SINGLE_BLOCK; format++ {
			// for each number of bits per value
			for bpv := uint32(1); bpv <= 32; bpv++ {
				if !packed.PackedFormat(format).IsSupported(bpv) {
					continue
				}
				decoder := packed.GetPackedIntsDecoder(packed.PackedFormat(format), int32(version), bpv)
				iterations := int(computeIterations(decoder))
				if n := iterations * decoder.ByteValueCount(); n > maxDataSize {
					maxDataSize = n
				}
			}
		}
	}
	return maxDataSize
}
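// Hedged usage note: in upstream Lucene the analogous computation backs the
// MAX_DATA_SIZE constant that sizes the scratch int buffers block decoders
// write into; a port would presumably cache the result the same way, e.g.
//
//	var MAX_DATA_SIZE = computeMaxDataSize()
//	buffer := make([]int32, MAX_DATA_SIZE)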
func saveInts(values []int, out DataOutput) error {
	length := len(values)
	assert(length > 0)
	// A single value is stored as-is.
	if length == 1 {
		return out.WriteVInt(int32(values[0]))
	}

	var allEqual = true
	var sentinel = values[0]
	for _, v := range values[1:] {
		if v != sentinel {
			allEqual = false
			break
		}
	}
	// An all-equal run is stored as a 0 marker followed by the value.
	if allEqual {
		err := out.WriteVInt(0)
		if err == nil {
			err = out.WriteVInt(int32(values[0]))
		}
		return err
	}

	// Otherwise pack the values. For non-negative values, the bitwise OR of
	// all of them has the same highest set bit as the maximum, so one pass
	// yields the exact packed width.
	var max int64 = 0
	for _, v := range values {
		max |= int64(v)
	}
	var bitsRequired = packed.BitsRequired(max)
	err := out.WriteVInt(int32(bitsRequired))
	if err != nil {
		return err
	}
	w := packed.WriterNoHeader(out, packed.PackedFormat(packed.PACKED), length, bitsRequired, 1)
	for _, v := range values {
		if err = w.Add(int64(v)); err != nil {
			return err
		}
	}
	return w.Finish()
}
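// A minimal illustration (hypothetical function, assuming the same package
// as saveInts) of the OR trick above: OR-ing {3, 5, 6} yields 7 (0b111),
// so packed.BitsRequired reports 3, the same answer the true maximum (6)
// would give, without tracking the maximum separately.
func exampleBitsRequiredViaOr() int {
	var acc int64
	for _, v := range []int64{3, 5, 6} {
		acc |= v // acc ends up 7 (0b111)
	}
	return packed.BitsRequired(acc) // 3, same as packed.BitsRequired(6)
}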
func (w *StoredFieldsIndexWriter) writeBlock() error {
	assert(w.blockChunks > 0)
	err := w.fieldsIndexOut.WriteVInt(int32(w.blockChunks))
	if err != nil {
		return err
	}

	// The trick here is that we only store the difference from the average
	// start pointer or doc base, which helps save bits per value. And in
	// order to prevent a few chunks that are far from the average from
	// raising the number of bits per value for all of them, we only encode
	// blocks of 1024 chunks at once.
	// See LUCENE-4512

	// doc bases
	var avgChunkDocs int
	if w.blockChunks == 1 {
		avgChunkDocs = 0
	} else {
		avgChunkDocs = int(math.Floor(float64(w.blockDocs-w.docBaseDeltas[w.blockChunks-1])/
			float64(w.blockChunks-1) + 0.5))
	}
	err = w.fieldsIndexOut.WriteVInt(int32(w.totalDocs - w.blockDocs)) // doc base
	if err == nil {
		err = w.fieldsIndexOut.WriteVInt(int32(avgChunkDocs))
	}
	if err != nil {
		return err
	}

	docBase := 0
	var maxDelta int64 = 0
	for i := 0; i < w.blockChunks; i++ {
		delta := docBase - avgChunkDocs*i
		maxDelta |= moveSignToLowOrderBit(int64(delta))
		docBase += w.docBaseDeltas[i]
	}

	bitsPerDocBase := packed.BitsRequired(maxDelta)
	err = w.fieldsIndexOut.WriteVInt(int32(bitsPerDocBase))
	if err != nil {
		return err
	}
	writer := packed.WriterNoHeader(w.fieldsIndexOut,
		packed.PackedFormat(packed.PACKED), w.blockChunks, bitsPerDocBase, 1)
	docBase = 0
	for i := 0; i < w.blockChunks; i++ {
		delta := docBase - avgChunkDocs*i
		assert(packed.BitsRequired(moveSignToLowOrderBit(int64(delta))) <= writer.BitsPerValue())
		err = writer.Add(moveSignToLowOrderBit(int64(delta)))
		if err != nil {
			return err
		}
		docBase += w.docBaseDeltas[i]
	}
	err = writer.Finish()
	if err != nil {
		return err
	}

	// start pointers
	err = w.fieldsIndexOut.WriteVLong(w.firstStartPointer)
	if err != nil {
		return err
	}
	var avgChunkSize int64
	if w.blockChunks == 1 {
		avgChunkSize = 0
	} else {
		avgChunkSize = (w.maxStartPointer - w.firstStartPointer) / int64(w.blockChunks-1)
	}
	err = w.fieldsIndexOut.WriteVLong(avgChunkSize)
	if err != nil {
		return err
	}

	var startPointer int64 = 0
	maxDelta = 0
	for i := 0; i < w.blockChunks; i++ {
		startPointer += w.startPointerDeltas[i]
		delta := startPointer - avgChunkSize*int64(i)
		maxDelta |= moveSignToLowOrderBit(delta)
	}

	bitsPerStartPointer := packed.BitsRequired(maxDelta)
	err = w.fieldsIndexOut.WriteVInt(int32(bitsPerStartPointer))
	if err != nil {
		return err
	}
	writer = packed.WriterNoHeader(w.fieldsIndexOut,
		packed.PackedFormat(packed.PACKED), w.blockChunks, bitsPerStartPointer, 1)
	startPointer = 0
	for i := 0; i < w.blockChunks; i++ {
		startPointer += w.startPointerDeltas[i]
		delta := startPointer - avgChunkSize*int64(i)
		assert(packed.BitsRequired(moveSignToLowOrderBit(delta)) <= writer.BitsPerValue())
		err = writer.Add(moveSignToLowOrderBit(delta))
		if err != nil {
			return err
		}
	}
	return writer.Finish()
}
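// A hedged sketch (hypothetical function, assuming moveSignToLowOrderBit is
// the usual zigzag transform: 0=>0, -1=>1, 1=>2, -2=>3, ...) of why the
// delta-from-average encoding above stays narrow: doc bases {0, 100, 210}
// with avgChunkDocs=105 yield deltas {0, -5, 0}, so four bits per value
// suffice where the raw doc bases would need eight.
func exampleDocBaseDeltaWidth() int {
	docBases := []int{0, 100, 210}
	const avgChunkDocs = 105
	var maxDelta int64
	for i, base := range docBases {
		delta := int64(base - avgChunkDocs*i) // 0, -5, 0
		maxDelta |= moveSignToLowOrderBit(delta)
	}
	return packed.BitsRequired(maxDelta) // 4
}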
func (r *CompressingStoredFieldsReader) VisitDocument(docID int, visitor StoredFieldVisitor) error {
	err := r.fieldsStream.Seek(r.indexReader.startPointer(docID))
	if err != nil {
		return err
	}

	docBase, err := int32AsInt(r.fieldsStream.ReadVInt())
	if err != nil {
		return err
	}
	chunkDocs, err := int32AsInt(r.fieldsStream.ReadVInt())
	if err != nil {
		return err
	}
	if docID < docBase || docID >= docBase+chunkDocs || docBase+chunkDocs > r.numDocs {
		return errors.New(fmt.Sprintf(
			"Corrupted: docID=%v, docBase=%v, chunkDocs=%v, numDocs=%v (resource=%v)",
			docID, docBase, chunkDocs, r.numDocs, r.fieldsStream))
	}

	var numStoredFields, offset, length, totalLength int
	if chunkDocs == 1 {
		if numStoredFields, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
			return err
		}
		offset = 0
		if length, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
			return err
		}
		totalLength = length
	} else {
		bitsPerStoredFields, err := int32AsInt(r.fieldsStream.ReadVInt())
		if err != nil {
			return err
		}
		if bitsPerStoredFields == 0 {
			numStoredFields, err = int32AsInt(r.fieldsStream.ReadVInt())
			if err != nil {
				return err
			}
		} else if bitsPerStoredFields > 31 {
			return errors.New(fmt.Sprintf("bitsPerStoredFields=%v (resource=%v)",
				bitsPerStoredFields, r.fieldsStream))
		} else {
			panic("not implemented yet")
		}

		bitsPerLength, err := int32AsInt(r.fieldsStream.ReadVInt())
		if err != nil {
			return err
		}
		if bitsPerLength == 0 {
			// fixed-length fast path: every document in the chunk has the same length
			if length, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
				return err
			}
			offset = (docID - docBase) * length
			totalLength = chunkDocs * length
		} else if bitsPerLength > 31 {
			return errors.New(fmt.Sprintf("bitsPerLength=%v (resource=%v)",
				bitsPerLength, r.fieldsStream))
		} else {
			// walk the packed length stream to locate this doc within the chunk
			it := packed.ReaderIteratorNoHeader(
				r.fieldsStream, packed.PackedFormat(packed.PACKED),
				r.packedIntsVersion, chunkDocs, bitsPerLength, 1)
			var n int64
			off := 0
			for i := 0; i < docID-docBase; i++ {
				if n, err = it.Next(); err != nil {
					return err
				}
				off += int(n)
			}
			offset = off
			if n, err = it.Next(); err != nil {
				return err
			}
			length = int(n)
			off += length
			for i := docID - docBase + 1; i < chunkDocs; i++ {
				if n, err = it.Next(); err != nil {
					return err
				}
				off += int(n)
			}
			totalLength = off
		}
	}

	if (length == 0) != (numStoredFields == 0) {
		return errors.New(fmt.Sprintf("length=%v, numStoredFields=%v (resource=%v)",
			length, numStoredFields, r.fieldsStream))
	}
	if numStoredFields == 0 {
		// nothing to do
		return nil
	}

	var documentInput util.DataInput
	if r.version >= VERSION_BIG_CHUNKS && totalLength >= 2*r.chunkSize {
		panic("not implemented yet")
	} else {
		var bytes []byte
		if totalLength <= BUFFER_REUSE_THRESHOLD {
			bytes = r.bytes
		} else {
			bytes = make([]byte, 0)
		}
		bytes, err = r.decompressor(r.fieldsStream, totalLength, offset, length, bytes)
		if err != nil {
			return err
		}
		assert(len(bytes) == length)
		documentInput = store.NewByteArrayDataInput(bytes)
	}

	for fieldIDX := 0; fieldIDX < numStoredFields; fieldIDX++ {
		infoAndBits, err := documentInput.ReadVLong()
		if err != nil {
			return err
		}
		fieldNumber := int(uint64(infoAndBits) >> uint64(TYPE_BITS))
		fieldInfo := r.fieldInfos.FieldInfoByNumber(fieldNumber)

		bits := int(infoAndBits & int64(TYPE_MASK))
		assertWithMessage(bits <= NUMERIC_DOUBLE, fmt.Sprintf("bits=%x", bits))

		status, err := visitor.NeedsField(fieldInfo)
		if err != nil {
			return err
		}
		switch status {
		case STORED_FIELD_VISITOR_STATUS_YES:
			if err = r.readField(documentInput, visitor, fieldInfo, bits); err != nil {
				return err
			}
		case STORED_FIELD_VISITOR_STATUS_NO:
			panic("not implemented yet")
		case STORED_FIELD_VISITOR_STATUS_STOP:
			return nil
		}
	}
	return nil
}
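// Worked example (hypothetical function) of the fixed-length fast path in
// VisitDocument: with a 4-document chunk whose documents all have length
// 13, the third document (docID-docBase == 2) starts at offset 2*13 = 26
// and the whole chunk decompresses to 4*13 = 52 bytes, with no packed
// length stream to walk.
func exampleFixedLengthChunk() (offset, totalLength int) {
	const chunkDocs, docLen, docInChunk = 4, 13, 2
	return docInChunk * docLen, chunkDocs * docLen // 26, 52
}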
func (nc *NormsConsumer) AddNumericField(field *FieldInfo,
	iter func() func() (interface{}, bool)) (err error) {

	if err = nc.meta.WriteVInt(field.Number); err != nil {
		return
	}
	minValue, maxValue := int64(math.MaxInt64), int64(math.MinInt64)
	// TODO: more efficient?
	uniqueValues := newNormMap()

	count := int64(0)
	next := iter()
	for {
		nv, ok := next()
		if !ok {
			break
		}
		assert2(nv != nil, "illegal norms data for field %v, got null for value: %v",
			field.Name, count)
		v := nv.(int64)

		if v < minValue {
			minValue = v
		}
		if v > maxValue {
			maxValue = v
		}

		if uniqueValues != nil && uniqueValues.add(v) && uniqueValues.size > 256 {
			uniqueValues = nil
		}
		count++
	}
	assert2(count == int64(nc.maxDoc),
		"illegal norms data for field %v, expected %v values, got %v",
		field.Name, nc.maxDoc, count)

	if uniqueValues != nil && uniqueValues.size == 1 {
		// 0 bpv
		if err = nc.meta.WriteByte(CONST_COMPRESSED); err != nil {
			return
		}
		if err = nc.meta.WriteLong(minValue); err != nil {
			return
		}
	} else if uniqueValues != nil {
		// small number of unique values; this is the typical case:
		// we only use bpv=1,2,4,8
		format := packed.PackedFormat(packed.PACKED_SINGLE_BLOCK)
		bitsPerValue := packed.BitsRequired(int64(uniqueValues.size) - 1)
		if bitsPerValue == 3 {
			bitsPerValue = 4
		} else if bitsPerValue > 4 {
			bitsPerValue = 8
		}

		if bitsPerValue == 8 && minValue >= 0 && maxValue <= 255 {
			// uncompressed []byte
			if err = store.Stream(nc.meta).WriteByte(UNCOMPRESSED).
				WriteLong(nc.data.FilePointer()).
				Close(); err != nil {
				return err
			}
			next = iter()
			for {
				nv, ok := next()
				if !ok {
					break
				}
				n := byte(0)
				if nv != nil {
					n = byte(nv.(int64))
				}
				if err = nc.data.WriteByte(n); err != nil {
					return err
				}
			}
		} else {
			// table-compressed
			if err = store.Stream(nc.meta).WriteByte(TABLE_COMPRESSED).
				WriteLong(nc.data.FilePointer()).
				Close(); err != nil {
				return err
			}
			if err = nc.data.WriteVInt(packed.VERSION_CURRENT); err != nil {
				return err
			}

			decode := uniqueValues.decodeTable()
			// upgrade to power of two sized array
			size := 1 << uint(bitsPerValue)
			if err = nc.data.WriteVInt(int32(size)); err != nil {
				return err
			}
			for _, v := range decode {
				if err = nc.data.WriteLong(v); err != nil {
					return err
				}
			}
			for i := len(decode); i < size; i++ {
				if err = nc.data.WriteLong(0); err != nil {
					return err
				}
			}

			if err = store.Stream(nc.data).WriteVInt(int32(format.Id())).
				WriteVInt(int32(bitsPerValue)).
				Close(); err != nil {
				return err
			}

			writer := packed.WriterNoHeader(nc.data, format, nc.maxDoc, bitsPerValue,
				packed.DEFAULT_BUFFER_SIZE)
			next = iter()
			for {
				nv, ok := next()
				if !ok {
					break
				}
				if err = writer.Add(int64(uniqueValues.ord(nv.(int64)))); err != nil {
					return err
				}
			}
			if err = writer.Finish(); err != nil {
				return err
			}
		}
	} else {
		panic("not implemented yet")
	}
	return nil
}
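// Illustrative sketch (hypothetical helper, logic copied from the rounding
// above) of how the bits-per-value is snapped to the widths
// PACKED_SINGLE_BLOCK handles here: 3 bits rounds up to 4, and anything
// wider than 4 rounds up to a whole byte. E.g. 5 distinct values need 3
// bits and are stored with 4; 40 distinct values need 6 bits and are
// stored with 8.
func roundNormBitsPerValue(uniqueValueCount int) int {
	bpv := packed.BitsRequired(int64(uniqueValueCount) - 1)
	if bpv == 3 {
		bpv = 4
	} else if bpv > 4 {
		bpv = 8
	}
	return bpv
}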