Пример #1
0
// bitsRequired returns the number of bits needed to store any of the
// first LUCENE41_BLOCK_SIZE entries of data. It ORs all values together
// so the result covers the largest one; every value must be non-negative.
func bitsRequired(data []int) int {
	var acc int64
	for i := 0; i < LUCENE41_BLOCK_SIZE; i++ {
		assert(data[i] >= 0)
		acc |= int64(data[i])
	}
	return packed.BitsRequired(acc)
}
Пример #2
0
// reset prepares the hash table for compressing an input of the given
// length. The existing backing storage is reused (and cleared) when it is
// large and wide enough; otherwise a new packed mutable is allocated.
func (h *LZ4HashTable) reset(length int) {
	bitsPerOffset := packed.BitsRequired(int64(length - LAST_LITERALS))
	h.hashLog = MEMORY_USAGE + 3 - ceilLog2(bitsPerOffset)
	assert(h.hashLog > 0)
	size := 1 << uint(h.hashLog)
	reusable := h.hashTable != nil &&
		h.hashTable.Size() >= size &&
		h.hashTable.BitsPerValue() >= bitsPerOffset
	if reusable {
		h.hashTable.Clear()
	} else {
		h.hashTable = packed.MutableFor(size, bitsPerOffset, packed.PackedInts.DEFAULT)
	}
}
Пример #3
0
/*
LZ4Compress compresses bytes into out using at most 16KB of memory. ht
shouldn't be shared across threads but can safely be reused.

The input is scanned for matches against earlier positions recorded in
ht; each match is emitted as a literal run followed by a back-reference,
and whatever remains is flushed as the final literal block.
*/
func LZ4Compress(bytes []byte, out DataOutput, ht *LZ4HashTable) error {
	offset, length := 0, len(bytes)
	base, end := offset, offset+length

	// anchor marks the start of the pending (not yet emitted) literals.
	anchor := offset
	offset++

	// Inputs shorter than LAST_LITERALS+MIN_MATCH cannot contain a match;
	// they are emitted entirely as last literals below.
	if length > LAST_LITERALS+MIN_MATCH {
		limit := end - LAST_LITERALS
		matchLimit := limit - MIN_MATCH
		ht.reset(length)
		hashLog := ht.hashLog
		hashTable := ht.hashTable

		for offset <= limit {
			// find a match
			var ref int
			var hasMore = offset < matchLimit
			for hasMore {
				v := readInt(bytes, offset)
				h := hash(v, hashLog)
				// ref is the last position that hashed to the same bucket;
				// record the current position before probing further.
				ref = base + int(hashTable.Get(h))
				assert(packed.BitsRequired(int64(offset-base)) <= hashTable.BitsPerValue())
				hashTable.Set(h, int64(offset-base))
				// Accept the candidate only if it is close enough to encode
				// and the 4 bytes actually match (hash collisions happen).
				if offset-ref < MAX_DISTANCE && readInt(bytes, ref) == v {
					break
				}
				offset++
				hasMore = offset < matchLimit
			}
			if !hasMore {
				break
			}

			// compute match length
			matchLen := MIN_MATCH + commonBytes(
				bytes[ref+MIN_MATCH:limit],
				bytes[offset+MIN_MATCH:limit])

			// Emit pending literals plus the (distance, length) match.
			err := encodeSequence(bytes[anchor:offset], offset-ref, matchLen, out)
			if err != nil {
				return err
			}
			offset += matchLen
			anchor = offset
		}
	}

	// last literals
	literalLen := end - anchor
	assert(literalLen >= LAST_LITERALS || literalLen == length)
	return encodeLastLiterals(bytes[anchor:], out)
}
Пример #4
0
// saveInts serializes values to out using one of three encodings:
// a single VInt when there is exactly one value, a 0 token followed by
// the repeated value when all entries are equal, or the bit width
// followed by a packed stream otherwise.
func saveInts(values []int, out DataOutput) error {
	length := len(values)
	assert(length > 0)

	// One value: write it directly.
	if length == 1 {
		return out.WriteVInt(int32(values[0]))
	}

	// All-equal run: a leading 0 token flags this case for the reader.
	allEqual := true
	for _, v := range values[1:] {
		if v != values[0] {
			allEqual = false
			break
		}
	}
	if allEqual {
		if err := out.WriteVInt(0); err != nil {
			return err
		}
		return out.WriteVInt(int32(values[0]))
	}

	// General case: OR all values to find the widest one, write the bit
	// count, then pack every value at that width.
	var or int64
	for _, v := range values {
		or |= int64(v)
	}
	bitsRequired := packed.BitsRequired(or)
	if err := out.WriteVInt(int32(bitsRequired)); err != nil {
		return err
	}
	w := packed.WriterNoHeader(out, packed.PackedFormat(packed.PACKED), length, bitsRequired, 1)
	for _, v := range values {
		if err := w.Add(int64(v)); err != nil {
			return err
		}
	}
	return w.Finish()
}
// writeBlock flushes the accumulated chunk index (up to 1024 chunks) to
// fieldsIndexOut: first the doc bases, then the start pointers, each as
// an average plus zigzag-encoded per-chunk deltas. The exact write order
// below is the on-disk format and must not change.
func (w *StoredFieldsIndexWriter) writeBlock() error {
	assert(w.blockChunks > 0)
	err := w.fieldsIndexOut.WriteVInt(int32(w.blockChunks))
	if err != nil {
		return err
	}

	// The trick here is that we only store the difference from the
	// average start pointer or doc base, this helps save bits per
	// value. And in order to prevent a few chunks that would be far
	// from the average to raise the number of bits per value for all
	// of them, we only encode blocks of 1024 chunks at once.
	// See LUCENE-4512

	// doc bases
	// avgChunkDocs is the rounded mean docs-per-chunk, excluding the last
	// chunk (whose delta is subtracted) so partial chunks don't skew it.
	var avgChunkDocs int
	if w.blockChunks == 1 {
		avgChunkDocs = 0
	} else {
		avgChunkDocs = int(math.Floor(float64(w.blockDocs-w.docBaseDeltas[w.blockChunks-1])/float64(w.blockChunks-1) + 0.5))
	}
	err = w.fieldsIndexOut.WriteVInt(int32(w.totalDocs - w.blockDocs)) // doc base
	if err == nil {
		err = w.fieldsIndexOut.WriteVInt(int32(avgChunkDocs))
	}
	if err != nil {
		return err
	}
	// First pass: OR all zigzag-encoded deltas to find the bit width.
	var docBase int = 0
	var maxDelta int64 = 0
	for i := 0; i < w.blockChunks; i++ {
		delta := docBase - avgChunkDocs*i
		maxDelta |= moveSignToLowOrderBit(int64(delta))
		docBase += w.docBaseDeltas[i]
	}

	bitsPerDocbase := packed.BitsRequired(maxDelta)
	err = w.fieldsIndexOut.WriteVInt(int32(bitsPerDocbase))
	if err != nil {
		return err
	}
	// Second pass: write each delta at the computed width.
	writer := packed.WriterNoHeader(w.fieldsIndexOut,
		packed.PackedFormat(packed.PACKED), w.blockChunks, bitsPerDocbase, 1)
	docBase = 0
	for i := 0; i < w.blockChunks; i++ {
		delta := docBase - avgChunkDocs*i
		assert(packed.BitsRequired(moveSignToLowOrderBit(int64(delta))) <= writer.BitsPerValue())
		err = writer.Add(moveSignToLowOrderBit(int64(delta)))
		if err != nil {
			return err
		}
		docBase += w.docBaseDeltas[i]
	}
	err = writer.Finish()
	if err != nil {
		return err
	}

	// start pointers
	// NOTE(review): this WriteVLong's error is not checked, unlike every
	// other write in this method — confirm whether that is intentional.
	w.fieldsIndexOut.WriteVLong(w.firstStartPointer)
	var avgChunkSize int64
	if w.blockChunks == 1 {
		avgChunkSize = 0
	} else {
		avgChunkSize = (w.maxStartPointer - w.firstStartPointer) / int64(w.blockChunks-1)
	}
	err = w.fieldsIndexOut.WriteVLong(avgChunkSize)
	if err != nil {
		return err
	}
	// Same two-pass scheme as the doc bases, applied to start pointers.
	var startPointer int64 = 0
	maxDelta = 0
	for i := 0; i < w.blockChunks; i++ {
		startPointer += w.startPointerDeltas[i]
		delta := startPointer - avgChunkSize*int64(i)
		maxDelta |= moveSignToLowOrderBit(delta)
	}

	bitsPerStartPointer := packed.BitsRequired(maxDelta)
	err = w.fieldsIndexOut.WriteVInt(int32(bitsPerStartPointer))
	if err != nil {
		return err
	}
	writer = packed.WriterNoHeader(w.fieldsIndexOut,
		packed.PackedFormat(packed.PACKED), w.blockChunks, bitsPerStartPointer, 1)
	startPointer = 0
	for i := 0; i < w.blockChunks; i++ {
		startPointer += w.startPointerDeltas[i]
		delta := startPointer - avgChunkSize*int64(i)
		assert(packed.BitsRequired(moveSignToLowOrderBit(delta)) <= writer.BitsPerValue())
		err = writer.Add(moveSignToLowOrderBit(delta))
		if err != nil {
			return err
		}
	}
	return writer.Finish()
}
Пример #6
0
)

/* hard limit on the maximum number of documents per chunk */
const MAX_DOCUMENTS_PER_CHUNK = 128

// Type flags for stored-field values, written alongside each field.
const (
	STRING         = 0x00
	BYTE_ARR       = 0x01
	NUMERIC_INT    = 0x02
	NUMERIC_FLOAT  = 0x03
	NUMERIC_LONG   = 0x04
	NUMERIC_DOUBLE = 0x05
)

var (
	// TYPE_BITS is the number of bits needed to encode the largest type
	// flag above; TYPE_MASK selects those bits.
	TYPE_BITS = packed.BitsRequired(NUMERIC_DOUBLE)
	TYPE_MASK = int(packed.MaxValue(TYPE_BITS))
)

// Codec file-name suffixes and on-disk format version history.
const (
	CODEC_SFX_IDX      = "Index"
	CODEC_SFX_DAT      = "Data"
	VERSION_START      = 0
	VERSION_BIG_CHUNKS = 1
	VERSION_CHECKSUM   = 2
	VERSION_CURRENT    = VERSION_CHECKSUM
)

/* StoredFieldsWriter impl for CompressingStoredFieldsFormat */
type CompressingStoredFieldsWriter struct {
	directory     store.Directory
Пример #7
0
// AddNumericField writes the norms for one field. It makes a first pass
// over the values (via iter) to collect min/max and up to 256 unique
// values, then picks an encoding: constant (0 bpv), raw bytes, or a
// table-compressed packed stream. iter must yield the same sequence on
// each call. The branch for >256 unique values is not implemented.
func (nc *NormsConsumer) AddNumericField(field *FieldInfo,
	iter func() func() (interface{}, bool)) (err error) {

	if err = nc.meta.WriteVInt(field.Number); err != nil {
		return
	}
	minValue, maxValue := int64(math.MaxInt64), int64(math.MinInt64)
	// TODO: more efficient?
	uniqueValues := newNormMap()

	// First pass: track min/max and collect unique values; give up on
	// the unique-value table (set it to nil) past 256 distinct values.
	count := int64(0)
	next := iter()
	for {
		nv, ok := next()
		if !ok {
			break
		}
		assert2(nv != nil, "illegal norms data for field %v, got null for value: %v", field.Name, count)
		v := nv.(int64)

		if v < minValue {
			minValue = v
		}
		if v > maxValue {
			maxValue = v
		}

		if uniqueValues != nil && uniqueValues.add(v) && uniqueValues.size > 256 {
			uniqueValues = nil
		}

		count++
	}
	assert2(count == int64(nc.maxDoc),
		"illegal norms data for field %v, expected %v values, got %v",
		field.Name, nc.maxDoc, count)

	if uniqueValues != nil && uniqueValues.size == 1 {
		// 0 bpv
		if err = nc.meta.WriteByte(CONST_COMPRESSED); err != nil {
			return
		}
		if err = nc.meta.WriteLong(minValue); err != nil {
			return
		}
	} else if uniqueValues != nil {
		// small number of unique values; this is the typical case:
		// we only use bpv=1,2,4,8
		format := packed.PackedFormat(packed.PACKED_SINGLE_BLOCK)
		bitsPerValue := packed.BitsRequired(int64(uniqueValues.size) - 1)
		// Round up to a width PACKED_SINGLE_BLOCK supports (1, 2, 4, 8).
		if bitsPerValue == 3 {
			bitsPerValue = 4
		} else if bitsPerValue > 4 {
			bitsPerValue = 8
		}

		if bitsPerValue == 8 && minValue >= 0 && maxValue <= 255 {
			// All values fit a byte anyway, so a table buys nothing:
			// store each value directly as one byte.
			if err = store.Stream(nc.meta).WriteByte(UNCOMPRESSED). // uncompressed []byte
										WriteLong(nc.data.FilePointer()).
										Close(); err != nil {
				return err
			}
			next = iter()
			for {
				nv, ok := next()
				if !ok {
					break
				}
				n := byte(0)
				if nv != nil {
					n = byte(nv.(int64))
				}
				if err = nc.data.WriteByte(byte(n)); err != nil {
					return err
				}
			}
		} else {
			// Table-compressed: write the decode table (padded to a
			// power-of-two size), then each value's ordinal packed at
			// bitsPerValue bits.
			if err = store.Stream(nc.meta).WriteByte(TABLE_COMPRESSED). // table-compressed
											WriteLong(nc.data.FilePointer()).
											Close(); err != nil {
				return err
			}
			if err = nc.data.WriteVInt(packed.VERSION_CURRENT); err != nil {
				return err
			}

			decode := uniqueValues.decodeTable()
			// upgrade to power of two sized array
			size := 1 << uint(bitsPerValue)
			if err = nc.data.WriteVInt(int32(size)); err != nil {
				return err
			}
			for _, v := range decode {
				if err = nc.data.WriteLong(v); err != nil {
					return err
				}
			}
			// Zero-pad the table up to the power-of-two size.
			for i := len(decode); i < size; i++ {
				if err = nc.data.WriteLong(0); err != nil {
					return err
				}
			}

			if err = store.Stream(nc.data).WriteVInt(int32(format.Id())).
				WriteVInt(int32(bitsPerValue)).
				Close(); err != nil {
				return err
			}

			// Second pass over the values: write each one's table ordinal.
			writer := packed.WriterNoHeader(nc.data, format, nc.maxDoc, bitsPerValue, packed.DEFAULT_BUFFER_SIZE)
			next = iter()
			for {
				nv, ok := next()
				if !ok {
					break
				}
				if err = writer.Add(int64(uniqueValues.ord(nv.(int64)))); err != nil {
					return err
				}
			}
			if err = writer.Finish(); err != nil {
				return err
			}
		}
	} else {
		panic("not implemented yet")
	}
	return nil
}