예제 #1
0
func NewCompoundFileDirectory(directory Directory, fileName string, context IOContext, openForWrite bool) (d *CompoundFileDirectory, err error) {
	self := &CompoundFileDirectory{
		Locker:         &sync.Mutex{},
		directory:      directory,
		fileName:       fileName,
		readBufferSize: bufferSize(context),
		openForWrite:   openForWrite}
	self.DirectoryImpl = NewDirectoryImpl(self)
	self.BaseDirectory = NewBaseDirectory(self)

	if !openForWrite {
		// log.Printf("Open for read.")
		success := false
		defer func() {
			if !success {
				util.CloseWhileSuppressingError(self.handle)
			}
		}()
		self.handle, err = directory.OpenInput(fileName, context)
		if err != nil {
			return nil, err
		}
		self.entries, err = self.readEntries(self.handle, directory, fileName)
		if err != nil {
			return nil, err
		}
		if self.version >= CFD_VERSION_CHECKSUM {
			if _, err = codec.CheckHeader(self.handle, CFD_DATA_CODEC,
				int32(self.version), int32(self.version)); err != nil {
				return nil, err
			}
			// NOTE: data file is too costly to verify checksum against all the
			// bytes on open, but for now we at least verify proper structure
			// of the checksum footer: which looks for FOOTER_MAGIC +
			// algorithmID. This is cheap and can detect some forms of
			// corruption such as file trucation.
			if _, err = codec.RetrieveChecksum(self.handle); err != nil {
				return nil, err
			}
		}
		success = true
		self.BaseDirectory.IsOpen = true
		return self, nil
	} else {
		assert2(reflect.TypeOf(directory).Name() != "CompoundFileDirectory",
			"compound file inside of compound file: %v", fileName)
		self.entries = SENTINEL
		self.IsOpen = true
		self.writer = newCompoundFileWriter(directory, fileName)
		self.handle = nil
		return self, nil
	}
}
예제 #2
0
// Sole constructor
func newCompressingStoredFieldsReader(d store.Directory,
	si *model.SegmentInfo, segmentSuffix string,
	fn model.FieldInfos, ctx store.IOContext, formatName string,
	compressionMode CompressionMode) (r *CompressingStoredFieldsReader, err error) {

	r = &CompressingStoredFieldsReader{}
	r.compressionMode = compressionMode
	segment := si.Name
	r.fieldInfos = fn
	r.numDocs = si.DocCount()

	var indexStream store.ChecksumIndexInput
	success := false
	defer func() {
		if !success {
			util.CloseWhileSuppressingError(r, indexStream)
		}
	}()

	indexStreamFN := util.SegmentFileName(segment, segmentSuffix, lucene40.FIELDS_INDEX_EXTENSION)
	fieldsStreamFN := util.SegmentFileName(segment, segmentSuffix, lucene40.FIELDS_EXTENSION)
	// Load the index into memory
	if indexStream, err = d.OpenChecksumInput(indexStreamFN, ctx); err != nil {
		return nil, err
	}
	codecNameIdx := formatName + CODEC_SFX_IDX
	if r.version, err = int32AsInt(codec.CheckHeader(indexStream, codecNameIdx,
		VERSION_START, VERSION_CURRENT)); err != nil {
		return nil, err
	}
	assert(int64(codec.HeaderLength(codecNameIdx)) == indexStream.FilePointer())
	if r.indexReader, err = newCompressingStoredFieldsIndexReader(indexStream, si); err != nil {
		return nil, err
	}

	var maxPointer int64 = -1

	if r.version >= VERSION_CHECKSUM {
		if maxPointer, err = indexStream.ReadVLong(); err != nil {
			return nil, err
		}
		if _, err = codec.CheckFooter(indexStream); err != nil {
			return nil, err
		}
	} else {
		if err = codec.CheckEOF(indexStream); err != nil {
			return nil, err
		}
	}

	if err = indexStream.Close(); err != nil {
		return nil, err
	}
	indexStream = nil

	// Open the data file and read metadata
	if r.fieldsStream, err = d.OpenInput(fieldsStreamFN, ctx); err != nil {
		return nil, err
	}
	if r.version >= VERSION_CHECKSUM {
		if maxPointer+codec.FOOTER_LENGTH != r.fieldsStream.Length() {
			return nil, errors.New(fmt.Sprintf(
				"Invalid fieldsStream maxPointer (file truncated?): maxPointer=%v, length=%v",
				maxPointer, r.fieldsStream.Length()))
		}
	} else {
		maxPointer = r.fieldsStream.Length()
	}
	r.maxPointer = maxPointer
	codecNameDat := formatName + CODEC_SFX_DAT
	var fieldsVersion int
	if fieldsVersion, err = int32AsInt(codec.CheckHeader(r.fieldsStream,
		codecNameDat, VERSION_START, VERSION_CURRENT)); err != nil {
		return nil, err
	}
	assert2(r.version == fieldsVersion,
		"Version mismatch between stored fields index and data: %v != %v",
		r.version, fieldsVersion)
	assert(int64(codec.HeaderLength(codecNameDat)) == r.fieldsStream.FilePointer())

	r.chunkSize = -1
	if r.version >= VERSION_BIG_CHUNKS {
		if r.chunkSize, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
			return nil, err
		}
	}

	if r.packedIntsVersion, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
		return nil, err
	}
	r.decompressor = compressionMode.NewDecompressor()
	r.bytes = make([]byte, 0)

	if r.version >= VERSION_CHECKSUM {
		// NOTE: data file is too costly to verify checksum against all the
		// bytes on open, but fo rnow we at least verify proper structure
		// of the checksum footer: which looks for FOOTER_MATIC +
		// algorithmID. This is cheap and can detect some forms of
		// corruption such as file trucation.
		if _, err = codec.RetrieveChecksum(r.fieldsStream); err != nil {
			return nil, err
		}
	}

	success = true
	return r, nil
}
예제 #3
0
func NewBlockTreeTermsReader(dir store.Directory,
	fieldInfos FieldInfos, info *SegmentInfo,
	postingsReader PostingsReaderBase, ctx store.IOContext,
	segmentSuffix string, indexDivisor int) (p FieldsProducer, err error) {

	// log.Print("Initializing BlockTreeTermsReader...")
	fp := &BlockTreeTermsReader{
		postingsReader: postingsReader,
		fields:         make(map[string]FieldReader),
		segment:        info.Name,
	}
	fp.in, err = dir.OpenInput(util.SegmentFileName(info.Name, segmentSuffix, TERMS_EXTENSION), ctx)
	if err != nil {
		return nil, err
	}

	success := false
	var indexIn store.IndexInput
	defer func() {
		if !success {
			fmt.Println("Failed to initialize BlockTreeTermsReader.")
			if err != nil {
				fmt.Println("DEBUG ", err)
			}
			// this.close() will close in:
			util.CloseWhileSuppressingError(indexIn, fp)
		}
	}()

	fp.version, err = fp.readHeader(fp.in)
	if err != nil {
		return nil, err
	}
	// log.Printf("Version: %v", fp.version)

	if indexDivisor != -1 {
		filename := util.SegmentFileName(info.Name, segmentSuffix, TERMS_INDEX_EXTENSION)
		indexIn, err = dir.OpenInput(filename, ctx)
		if err != nil {
			return nil, err
		}

		indexVersion, err := fp.readIndexHeader(indexIn)
		if err != nil {
			return nil, err
		}
		// log.Printf("Index version: %v", indexVersion)
		if int(indexVersion) != fp.version {
			return nil, errors.New(fmt.Sprintf("mixmatched version files: %v=%v,%v=%v", fp.in, fp.version, indexIn, indexVersion))
		}
	}

	// verify
	if indexIn != nil && fp.version >= TERMS_VERSION_CURRENT {
		if _, err = store.ChecksumEntireFile(indexIn); err != nil {
			return nil, err
		}
	}

	// Have PostingsReader init itself
	postingsReader.Init(fp.in)

	if fp.version >= TERMS_VERSION_CHECKSUM {
		// NOTE: data file is too costly to verify checksum against all the
		// bytes on open, but for now we at least verify proper structure
		// of the checksum footer: which looks for FOOTER_MAGIC +
		// algorithmID. This is cheap and can detect some forms of
		// corruption such as file trucation.
		if _, err = codec.RetrieveChecksum(fp.in); err != nil {
			return nil, err
		}
	}

	// Read per-field details
	fp.seekDir(fp.in, fp.dirOffset)
	if indexDivisor != -1 {
		fp.seekDir(indexIn, fp.indexDirOffset)
	}

	numFields, err := fp.in.ReadVInt()
	if err != nil {
		return nil, err
	}
	// log.Printf("Fields number: %v", numFields)
	if numFields < 0 {
		return nil, errors.New(fmt.Sprintf("invalid numFields: %v (resource=%v)", numFields, fp.in))
	}

	for i := int32(0); i < numFields; i++ {
		// log.Printf("Next field...")
		field, err := fp.in.ReadVInt()
		if err != nil {
			return nil, err
		}
		// log.Printf("Field: %v", field)

		numTerms, err := fp.in.ReadVLong()
		if err != nil {
			return nil, err
		}
		assert2(numTerms > 0,
			"Illegal numTerms for field number: %v (resource=%v)", field, fp.in)
		// log.Printf("Terms number: %v", numTerms)

		numBytes, err := fp.in.ReadVInt()
		if err != nil {
			return nil, err
		}
		assert2(numBytes >= 0,
			"invalid rootCode for field number: %v, numBytes=%v (resource=%v)",
			field, numBytes, fp.in)
		// log.Printf("Bytes number: %v", numBytes)

		rootCode := make([]byte, numBytes)
		err = fp.in.ReadBytes(rootCode)
		if err != nil {
			return nil, err
		}
		fieldInfo := fieldInfos.FieldInfoByNumber(int(field))
		assert2(fieldInfo != nil, "invalid field numebr: %v (resource=%v)", field, fp.in)
		var sumTotalTermFreq int64
		if fieldInfo.IndexOptions() == INDEX_OPT_DOCS_ONLY {
			sumTotalTermFreq = -1
		} else {
			sumTotalTermFreq, err = fp.in.ReadVLong()
			if err != nil {
				return nil, err
			}
		}
		sumDocFreq, err := fp.in.ReadVLong()
		if err != nil {
			return nil, err
		}
		var docCount int
		if docCount, err = asInt(fp.in.ReadVInt()); err != nil {
			return nil, err
		}
		// fmt.Printf("DocCount: %v\n", docCount)
		var longsSize int
		if fp.version >= TERMS_VERSION_META_ARRAY {
			if longsSize, err = asInt(fp.in.ReadVInt()); err != nil {
				return nil, err
			}
		}
		assert2(longsSize >= 0,
			"invalid longsSize for field: %v, longsSize=%v (resource=%v)",
			fieldInfo.Name, longsSize, fp.in)
		var minTerm, maxTerm []byte
		if fp.version >= TERMS_VERSION_MIN_MAX_TERMS {
			if minTerm, err = readBytesRef(fp.in); err != nil {
				return nil, err
			}
			if maxTerm, err = readBytesRef(fp.in); err != nil {
				return nil, err
			}
		}
		if docCount < 0 || int(docCount) > info.DocCount() { // #docs with field must be <= #docs
			return nil, errors.New(fmt.Sprintf(
				"invalid docCount: %v maxDoc: %v (resource=%v)",
				docCount, info.DocCount(), fp.in))
		}
		if sumDocFreq < int64(docCount) { // #postings must be >= #docs with field
			return nil, errors.New(fmt.Sprintf(
				"invalid sumDocFreq: %v docCount: %v (resource=%v)",
				sumDocFreq, docCount, fp.in))
		}
		if sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq { // #positions must be >= #postings
			return nil, errors.New(fmt.Sprintf(
				"invalid sumTotalTermFreq: %v sumDocFreq: %v (resource=%v)",
				sumTotalTermFreq, sumDocFreq, fp.in))
		}

		var indexStartFP int64
		if indexDivisor != -1 {
			if indexStartFP, err = indexIn.ReadVLong(); err != nil {
				return nil, err
			}
		}
		// log.Printf("indexStartFP: %v", indexStartFP)
		if _, ok := fp.fields[fieldInfo.Name]; ok {
			return nil, errors.New(fmt.Sprintf(
				"duplicate field: %v (resource=%v)", fieldInfo.Name, fp.in))
		}
		if fp.fields[fieldInfo.Name], err = newFieldReader(fp,
			fieldInfo, numTerms, rootCode, sumTotalTermFreq,
			sumDocFreq, docCount, indexStartFP, longsSize,
			indexIn, minTerm, maxTerm); err != nil {
			return nil, err
		}
	}

	if indexDivisor != -1 {
		if err = indexIn.Close(); err != nil {
			return nil, err
		}
	}

	success = true

	return fp, nil
}
예제 #4
0
func NewLucene41PostingsReader(dir store.Directory,
	fis FieldInfos, si *SegmentInfo,
	ctx store.IOContext, segmentSuffix string) (r PostingsReaderBase, err error) {

	// fmt.Println("Initializing Lucene41PostingsReader...")
	success := false
	var docIn, posIn, payIn store.IndexInput = nil, nil, nil
	defer func() {
		if !success {
			fmt.Println("Failed to initialize Lucene41PostingsReader.")
			util.CloseWhileSuppressingError(docIn, posIn, payIn)
		}
	}()

	docIn, err = dir.OpenInput(util.SegmentFileName(si.Name, segmentSuffix, LUCENE41_DOC_EXTENSION), ctx)
	if err != nil {
		return nil, err
	}
	var version int32
	version, err = codec.CheckHeader(docIn, LUCENE41_DOC_CODEC, LUCENE41_VERSION_START, LUCENE41_VERSION_CURRENT)
	if err != nil {
		return nil, err
	}
	forUtil, err := NewForUtilFrom(docIn)
	if err != nil {
		return nil, err
	}

	if version >= LUCENE41_VERSION_CHECKSUM {
		// NOTE: data file is too costly to verify checksum against all the
		// bytes on open, but for now we at least verify proper structure
		// of the checksum footer: which looks for FOOTER_MAGIC +
		// algorithmID. This is cheap and can detect some forms of
		// corruption such as file trucation.
		if _, err = codec.RetrieveChecksum(docIn); err != nil {
			return nil, err
		}
	}

	if fis.HasProx {
		posIn, err = dir.OpenInput(util.SegmentFileName(si.Name, segmentSuffix, LUCENE41_POS_EXTENSION), ctx)
		if err != nil {
			return nil, err
		}
		_, err = codec.CheckHeader(posIn, LUCENE41_POS_CODEC, version, version)
		if err != nil {
			return nil, err
		}

		if version >= LUCENE41_VERSION_CHECKSUM {
			// NOTE: data file is too costly to verify checksum against all the
			// bytes on open, but for now we at least verify proper structure
			// of the checksum footer: which looks for FOOTER_MAGIC +
			// algorithmID. This is cheap and can detect some forms of
			// corruption such as file trucation.
			if _, err = codec.RetrieveChecksum(posIn); err != nil {
				return nil, err
			}
		}

		if fis.HasPayloads || fis.HasOffsets {
			payIn, err = dir.OpenInput(util.SegmentFileName(si.Name, segmentSuffix, LUCENE41_PAY_EXTENSION), ctx)
			if err != nil {
				return nil, err
			}
			_, err = codec.CheckHeader(payIn, LUCENE41_PAY_CODEC, version, version)
			if err != nil {
				return nil, err
			}

			if version >= LUCENE41_VERSION_CHECKSUM {
				// NOTE: data file is too costly to verify checksum against all the
				// bytes on open, but for now we at least verify proper structure
				// of the checksum footer: which looks for FOOTER_MAGIC +
				// algorithmID. This is cheap and can detect some forms of
				// corruption such as file trucation.
				if _, err = codec.RetrieveChecksum(payIn); err != nil {
					return nil, err
				}

			}
		}
	}

	success = true
	return &Lucene41PostingsReader{docIn, posIn, payIn, forUtil, int(version)}, nil
}
예제 #5
0
func newLucene49NormsProducer(state SegmentReadState,
	dataCodec, dataExtension, metaCodec, metaExtension string) (np *NormsProducer, err error) {

	np = &NormsProducer{
		Locker:       new(sync.Mutex),
		norms:        make(map[int]*NormsEntry),
		instances:    make(map[int]NumericDocValues),
		maxDoc:       state.SegmentInfo.DocCount(),
		ramBytesUsed: util.ShallowSizeOfInstance(reflect.TypeOf(np)),
	}
	metaName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, metaExtension)
	// read in the entries from the metadta file.
	var in store.ChecksumIndexInput
	if in, err = state.Dir.OpenChecksumInput(metaName, state.Context); err != nil {
		return nil, err
	}

	if err = func() error {
		var success = false
		defer func() {
			if success {
				err = util.Close(in)
			} else {
				util.CloseWhileSuppressingError(in)
			}
		}()

		if np.version, err = codec.CheckHeader(in, metaCodec, VERSION_START, VERSION_CURRENT); err != nil {
			return err
		}
		if err = np.readFields(in, state.FieldInfos); err != nil {
			return err
		}
		if _, err = codec.CheckFooter(in); err != nil {
			return err
		}
		success = true
		return nil
	}(); err != nil {
		return nil, err
	}

	dataName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, dataExtension)
	if np.data, err = state.Dir.OpenInput(dataName, state.Context); err != nil {
		return nil, err
	}
	var success = false
	defer func() {
		if !success {
			util.CloseWhileSuppressingError(np.data)
		}
	}()

	var version2 int32
	if version2, err = codec.CheckHeader(np.data, dataCodec, VERSION_START, VERSION_CURRENT); err != nil {
		return nil, err
	}
	if version2 != np.version {
		return nil, errors.New("Format versions mismatch")
	}

	// NOTE: data file is too costly to verify checksum against all the
	// bytes on open, but fo rnow we at least verify proper structure
	// of the checksum footer: which looks for FOOTER_MATIC +
	// algorithmID. This is cheap and can detect some forms of
	// corruption such as file trucation.
	if _, err = codec.RetrieveChecksum(np.data); err != nil {
		return nil, err
	}

	success = true

	return np, nil
}