func bitsRequired(data []int) int { or := int64(0) for _, v := range data[:LUCENE41_BLOCK_SIZE] { assert(v >= 0) or |= int64(v) } return packed.BitsRequired(or) }
func (h *LZ4HashTable) reset(length int) { bitsPerOffset := packed.BitsRequired(int64(length - LAST_LITERALS)) bitsPerOffsetLog := ceilLog2(bitsPerOffset) h.hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog assert(h.hashLog > 0) if h.hashTable == nil || h.hashTable.Size() < (1<<uint(h.hashLog)) || h.hashTable.BitsPerValue() < bitsPerOffset { h.hashTable = packed.MutableFor(1<<uint(h.hashLog), bitsPerOffset, packed.PackedInts.DEFAULT) } else { h.hashTable.Clear() } }
/*
Compress bytes into out using at most 16KB of memory. ht shouldn't be
shared across threads but can safely be reused.
*/
func LZ4Compress(bytes []byte, out DataOutput, ht *LZ4HashTable) error {
	offset, length := 0, len(bytes)
	base, end := offset, offset+length
	// anchor marks the start of the pending run of literals (bytes not
	// yet emitted as part of any match/literal sequence).
	anchor := offset
	offset++
	// Inputs at or below LAST_LITERALS+MIN_MATCH are too short to hold
	// any match; they are emitted entirely as trailing literals below.
	if length > LAST_LITERALS+MIN_MATCH {
		// limit: last offset at which a match may end (the final
		// LAST_LITERALS bytes must always be emitted as literals).
		limit := end - LAST_LITERALS
		// matchLimit: last offset at which a match may start.
		matchLimit := limit - MIN_MATCH
		ht.reset(length)
		hashLog := ht.hashLog
		hashTable := ht.hashTable
		for offset <= limit {
			// find a match
			var ref int
			var hasMore = offset < matchLimit
			for hasMore {
				v := readInt(bytes, offset)
				h := hash(v, hashLog)
				// The table stores positions relative to base; ref is the
				// previous position that hashed to the same bucket.
				ref = base + int(hashTable.Get(h))
				assert(packed.BitsRequired(int64(offset-base)) <= hashTable.BitsPerValue())
				// Record the current position before testing the candidate,
				// so future lookups see the most recent occurrence.
				hashTable.Set(h, int64(offset-base))
				// A candidate is a real match only if it is close enough to
				// encode (MAX_DISTANCE) and its first 4 bytes actually agree
				// (hash collisions are possible).
				if offset-ref < MAX_DISTANCE && readInt(bytes, ref) == v {
					break
				}
				offset++
				hasMore = offset < matchLimit
			}
			if !hasMore {
				break
			}
			// compute match length
			matchLen := MIN_MATCH + commonBytes(
				bytes[ref+MIN_MATCH:limit],
				bytes[offset+MIN_MATCH:limit])
			// Emit the pending literals plus this match as one sequence.
			err := encodeSequence(bytes[anchor:offset], offset-ref, matchLen, out)
			if err != nil {
				return err
			}
			offset += matchLen
			anchor = offset
		}
	}
	// last literals
	literalLen := end - anchor
	assert(literalLen >= LAST_LITERALS || literalLen == length)
	return encodeLastLiterals(bytes[anchor:], out)
}
func saveInts(values []int, out DataOutput) error { length := len(values) assert(length > 0) if length == 1 { return out.WriteVInt(int32(values[0])) } var allEqual = true var sentinel = values[0] for _, v := range values[1:] { if v != sentinel { allEqual = false break } } if allEqual { err := out.WriteVInt(0) if err == nil { err = out.WriteVInt(int32(values[0])) } return err } var max int64 = 0 for _, v := range values { max |= int64(v) } var bitsRequired = packed.BitsRequired(max) err := out.WriteVInt(int32(bitsRequired)) if err != nil { return err } w := packed.WriterNoHeader(out, packed.PackedFormat(packed.PACKED), length, bitsRequired, 1) for _, v := range values { if err = w.Add(int64(v)); err != nil { return err } } return w.Finish() }
func (w *StoredFieldsIndexWriter) writeBlock() error { assert(w.blockChunks > 0) err := w.fieldsIndexOut.WriteVInt(int32(w.blockChunks)) if err != nil { return err } // The trick here is that we only store the difference from the // average start pointer or doc base, this helps save bits per // value. And in order to prevent a few chunks that would be far // from the average to raise the number of bits per value for all // of them, we only encode blocks of 1024 chunks at once. // See LUCENE-4512 // doc bases var avgChunkDocs int if w.blockChunks == 1 { avgChunkDocs = 0 } else { avgChunkDocs = int(math.Floor(float64(w.blockDocs-w.docBaseDeltas[w.blockChunks-1])/float64(w.blockChunks-1) + 0.5)) } err = w.fieldsIndexOut.WriteVInt(int32(w.totalDocs - w.blockDocs)) // doc base if err == nil { err = w.fieldsIndexOut.WriteVInt(int32(avgChunkDocs)) } if err != nil { return err } var docBase int = 0 var maxDelta int64 = 0 for i := 0; i < w.blockChunks; i++ { delta := docBase - avgChunkDocs*i maxDelta |= moveSignToLowOrderBit(int64(delta)) docBase += w.docBaseDeltas[i] } bitsPerDocbase := packed.BitsRequired(maxDelta) err = w.fieldsIndexOut.WriteVInt(int32(bitsPerDocbase)) if err != nil { return err } writer := packed.WriterNoHeader(w.fieldsIndexOut, packed.PackedFormat(packed.PACKED), w.blockChunks, bitsPerDocbase, 1) docBase = 0 for i := 0; i < w.blockChunks; i++ { delta := docBase - avgChunkDocs*i assert(packed.BitsRequired(moveSignToLowOrderBit(int64(delta))) <= writer.BitsPerValue()) err = writer.Add(moveSignToLowOrderBit(int64(delta))) if err != nil { return err } docBase += w.docBaseDeltas[i] } err = writer.Finish() if err != nil { return err } // start pointers w.fieldsIndexOut.WriteVLong(w.firstStartPointer) var avgChunkSize int64 if w.blockChunks == 1 { avgChunkSize = 0 } else { avgChunkSize = (w.maxStartPointer - w.firstStartPointer) / int64(w.blockChunks-1) } err = w.fieldsIndexOut.WriteVLong(avgChunkSize) if err != nil { return err } var startPointer int64 = 0 
maxDelta = 0 for i := 0; i < w.blockChunks; i++ { startPointer += w.startPointerDeltas[i] delta := startPointer - avgChunkSize*int64(i) maxDelta |= moveSignToLowOrderBit(delta) } bitsPerStartPointer := packed.BitsRequired(maxDelta) err = w.fieldsIndexOut.WriteVInt(int32(bitsPerStartPointer)) if err != nil { return err } writer = packed.WriterNoHeader(w.fieldsIndexOut, packed.PackedFormat(packed.PACKED), w.blockChunks, bitsPerStartPointer, 1) startPointer = 0 for i := 0; i < w.blockChunks; i++ { startPointer += w.startPointerDeltas[i] delta := startPointer - avgChunkSize*int64(i) assert(packed.BitsRequired(moveSignToLowOrderBit(delta)) <= writer.BitsPerValue()) err = writer.Add(moveSignToLowOrderBit(delta)) if err != nil { return err } } return writer.Finish() }
)

/* hard limit on the maximum number of documents per chunk */
const MAX_DOCUMENTS_PER_CHUNK = 128

// Type codes identifying how a stored field value is encoded.
const (
	STRING         = 0x00
	BYTE_ARR       = 0x01
	NUMERIC_INT    = 0x02
	NUMERIC_FLOAT  = 0x03
	NUMERIC_LONG   = 0x04
	NUMERIC_DOUBLE = 0x05
)

var (
	// TYPE_BITS is the number of bits needed to encode the largest type
	// code (NUMERIC_DOUBLE); TYPE_MASK extracts a type code from a
	// packed value.
	TYPE_BITS = packed.BitsRequired(NUMERIC_DOUBLE)
	TYPE_MASK = int(packed.MaxValue(TYPE_BITS))
)

// Codec file-name suffixes and on-disk format versions.
const (
	CODEC_SFX_IDX = "Index"
	CODEC_SFX_DAT = "Data"

	VERSION_START      = 0
	VERSION_BIG_CHUNKS = 1
	VERSION_CHECKSUM   = 2
	VERSION_CURRENT    = VERSION_CHECKSUM
)

/* StoredFieldsWriter impl for CompressingStoredFieldsFormat */
type CompressingStoredFieldsWriter struct {
	directory store.Directory
// AddNumericField writes the norms for one field. iter is a factory for
// a pull-iterator over the field's per-document values; it is invoked
// again whenever a second pass over the values is needed.
//
// Encoding is chosen from what a first pass observes:
//   - exactly one distinct value  -> CONST_COMPRESSED (value in metadata only)
//   - <=256 distinct values, bpv==8 fitting in a byte -> UNCOMPRESSED bytes
//   - <=256 distinct values otherwise -> TABLE_COMPRESSED (lookup table + packed ords)
//   - >256 distinct values -> not implemented (panics)
func (nc *NormsConsumer) AddNumericField(field *FieldInfo, iter func() func() (interface{}, bool)) (err error) {
	if err = nc.meta.WriteVInt(field.Number); err != nil {
		return
	}
	minValue, maxValue := int64(math.MaxInt64), int64(math.MinInt64)
	// TODO: more efficient?
	uniqueValues := newNormMap()
	count := int64(0)
	// First pass: track min/max and collect distinct values; give up on
	// the distinct-value table (set it to nil) past 256 entries.
	next := iter()
	for {
		nv, ok := next()
		if !ok {
			break
		}
		assert2(nv != nil, "illegal norms data for field %v, got null for value: %v", field.Name, count)
		v := nv.(int64)
		if v < minValue {
			minValue = v
		}
		if v > maxValue {
			maxValue = v
		}
		if uniqueValues != nil && uniqueValues.add(v) && uniqueValues.size > 256 {
			uniqueValues = nil
		}
		count++
	}
	assert2(count == int64(nc.maxDoc), "illegal norms data for field %v, expected %v values, got %v", field.Name, nc.maxDoc, count)
	if uniqueValues != nil && uniqueValues.size == 1 {
		// 0 bpv
		if err = nc.meta.WriteByte(CONST_COMPRESSED); err != nil {
			return
		}
		if err = nc.meta.WriteLong(minValue); err != nil {
			return
		}
	} else if uniqueValues != nil {
		// small number of unique values; this is the typical case:
		// we only use bpv=1,2,4,8
		format := packed.PackedFormat(packed.PACKED_SINGLE_BLOCK)
		bitsPerValue := packed.BitsRequired(int64(uniqueValues.size) - 1)
		// Round bpv up to a width PACKED_SINGLE_BLOCK supports (1,2,4,8).
		if bitsPerValue == 3 {
			bitsPerValue = 4
		} else if bitsPerValue > 4 {
			bitsPerValue = 8
		}
		if bitsPerValue == 8 && minValue >= 0 && maxValue <= 255 {
			// Values already fit a raw byte each; skip the table.
			if err = store.Stream(nc.meta).WriteByte(UNCOMPRESSED). // uncompressed []byte
											WriteLong(nc.data.FilePointer()).
											Close(); err != nil {
				return err
			}
			// Second pass: dump each value as one byte.
			next = iter()
			for {
				nv, ok := next()
				if !ok {
					break
				}
				n := byte(0)
				if nv != nil {
					n = byte(nv.(int64))
				}
				if err = nc.data.WriteByte(byte(n)); err != nil {
					return err
				}
			}
		} else {
			if err = store.Stream(nc.meta).WriteByte(TABLE_COMPRESSED). // table-compressed
											WriteLong(nc.data.FilePointer()).
											Close(); err != nil {
				return err
			}
			if err = nc.data.WriteVInt(packed.VERSION_CURRENT); err != nil {
				return err
			}
			decode := uniqueValues.decodeTable()
			// upgrade to power of two sized array
			size := 1 << uint(bitsPerValue)
			if err = nc.data.WriteVInt(int32(size)); err != nil {
				return err
			}
			// Write the decode table, zero-padded to the power-of-two size.
			for _, v := range decode {
				if err = nc.data.WriteLong(v); err != nil {
					return err
				}
			}
			for i := len(decode); i < size; i++ {
				if err = nc.data.WriteLong(0); err != nil {
					return err
				}
			}
			if err = store.Stream(nc.data).WriteVInt(int32(format.Id())).
				WriteVInt(int32(bitsPerValue)).
				Close(); err != nil {
				return err
			}
			// Second pass: write each value's ordinal in the decode table.
			writer := packed.WriterNoHeader(nc.data, format, nc.maxDoc, bitsPerValue, packed.DEFAULT_BUFFER_SIZE)
			next = iter()
			for {
				nv, ok := next()
				if !ok {
					break
				}
				if err = writer.Add(int64(uniqueValues.ord(nv.(int64)))); err != nil {
					return err
				}
			}
			if err = writer.Finish(); err != nil {
				return err
			}
		}
	} else {
		panic("not implemented yet")
	}
	return nil
}