/* Clones the provided input, reads all bytes from the file, and calls CheckFooter(). Note that this method may be slow, as it must process the entire file. If you just need to extract the checksum value, call retrieveChecksum(). */ func ChecksumEntireFile(input IndexInput) (hash int64, err error) { clone := input.Clone() if err = clone.Seek(0); err != nil { return 0, err } in := newBufferedChecksumIndexInput(clone) assert(in.FilePointer() == 0) if err = in.Seek(in.Length() - codec.FOOTER_LENGTH); err != nil { return 0, err } return codec.CheckFooter(in) }
// Read a particular segmentFileName. Note that this may return IO error
// if a commit is in process.
//
// On success the receiver holds the generation, version, counter,
// per-segment commit infos, and user data decoded from the file. On any
// failure the receiver is cleared again so a retry starts from a clean
// slate.
func (sis *SegmentInfos) Read(directory store.Directory, segmentFileName string) (err error) {
	// fmt.Printf("Reading segment info from %v...\n", segmentFileName)

	// Clear any previous segments:
	sis.Clear()

	sis.generation = GenerationFromSegmentsFileName(segmentFileName)
	sis.lastGeneration = sis.generation

	var input store.ChecksumIndexInput
	if input, err = directory.OpenChecksumInput(segmentFileName, store.IO_CONTEXT_READ); err != nil {
		return
	}

	var success = false
	defer func() {
		if !success {
			// Clear any segment infos we had loaded so we
			// have a clean slate on retry:
			sis.Clear()
			util.CloseWhileSuppressingError(input)
		} else {
			// Happy path: surface any error from closing the input.
			err = input.Close()
		}
	}()

	// First int is either the codec magic (4.0+) or a legacy format tag.
	var format int
	if format, err = asInt(input.ReadInt()); err != nil {
		return
	}
	var actualFormat int
	if format == codec.CODEC_MAGIC { // 4.0+
		// Header (minus the magic already consumed), then global metadata.
		if actualFormat, err = asInt(codec.CheckHeaderNoMagic(input, "segments", VERSION_40, VERSION_49)); err != nil {
			return
		}
		if sis.version, err = input.ReadLong(); err != nil {
			return
		}
		if sis.counter, err = asInt(input.ReadInt()); err != nil {
			return
		}
		var numSegments int
		if numSegments, err = asInt(input.ReadInt()); err != nil {
			return
		} else if numSegments < 0 {
			return errors.New(fmt.Sprintf("invalid segment count: %v (resource: %v)", numSegments, input))
		}
		var segName, codecName string
		var fCodec Codec
		var delGen, fieldInfosGen, dvGen int64
		var delCount int
		// One record per segment: name, codec, per-segment info file,
		// deletion/field-infos/doc-values generations, and file sets.
		for seg := 0; seg < numSegments; seg++ {
			if segName, err = input.ReadString(); err != nil {
				return
			}
			if codecName, err = input.ReadString(); err != nil {
				return
			}
			fCodec = LoadCodec(codecName)
			assert2(fCodec != nil, "Invalid codec name: %v", codecName)
			// fmt.Printf("SIS.read seg=%v codec=%v\n", seg, fCodec)
			// The codec reads the segment's own .si file.
			var info *SegmentInfo
			if info, err = fCodec.SegmentInfoFormat().SegmentInfoReader().Read(directory, segName, store.IO_CONTEXT_READ); err != nil {
				return
			}
			info.SetCodec(fCodec)
			if delGen, err = input.ReadLong(); err != nil {
				return
			}
			if delCount, err = asInt(input.ReadInt()); err != nil {
				return
			} else if delCount < 0 || delCount > info.DocCount() {
				return errors.New(fmt.Sprintf(
					"invalid deletion count: %v vs docCount=%v (resource: %v)",
					delCount, info.DocCount(), input))
			}
			// Field-infos generation only exists from 4.6 on.
			fieldInfosGen = -1
			if actualFormat >= VERSION_46 {
				if fieldInfosGen, err = input.ReadLong(); err != nil {
					return
				}
			}
			// Doc-values generation: separate field from 4.9 on; before
			// that it tracks the field-infos generation.
			dvGen = -1
			if actualFormat >= VERSION_49 {
				if dvGen, err = input.ReadLong(); err != nil {
					return
				}
			} else {
				dvGen = fieldInfosGen
			}
			siPerCommit := NewSegmentCommitInfo(info, delCount, delGen, fieldInfosGen, dvGen)
			if actualFormat >= VERSION_46 {
				if actualFormat < VERSION_49 {
					panic("not implemented yet")
				} else {
					var ss map[string]bool
					if ss, err = input.ReadStringSet(); err != nil {
						return err
					}
					siPerCommit.SetFieldInfosFiles(ss)
					var dvUpdatesFiles map[int]map[string]bool
					var numDVFields int
					if numDVFields, err = asInt(input.ReadInt()); err != nil {
						return err
					}
					// Per-field doc-values update files; only the empty
					// case is supported by this port so far.
					if numDVFields == 0 {
						dvUpdatesFiles = make(map[int]map[string]bool)
					} else {
						panic("not implemented yet")
					}
					siPerCommit.SetDocValuesUpdatesFiles(dvUpdatesFiles)
				}
			}
			sis.Segments = append(sis.Segments, siPerCommit)
		}
		if sis.userData, err = input.ReadStringStringMap(); err != nil {
			return err
		}
	} else {
		// TODO support <4.0 index
		panic("Index format pre-4.0 not supported yet")
	}

	// Trailing integrity check: 4.8+ files carry a codec footer; older
	// ones store a raw checksum long followed by EOF.
	if actualFormat >= VERSION_48 {
		if _, err = codec.CheckFooter(input); err != nil {
			return
		}
	} else {
		var checksumNow = int64(input.Checksum())
		var checksumThen int64
		if checksumThen, err = input.ReadLong(); err != nil {
			return
		}
		if checksumNow != checksumThen {
			return errors.New(fmt.Sprintf(
				"checksum mismatch in segments file: %v vs %v (resource: %v)",
				checksumNow, checksumThen, input))
		}
		if err = codec.CheckEOF(input); err != nil {
			return
		}
	}

	success = true
	return nil
}
// TODO support IndexCommit func (fsf *FindSegmentsFile) run(commit IndexCommit) (interface{}, error) { // fmt.Println("Finding segments file...") if commit != nil { if fsf.directory != commit.Directory() { return nil, errors.New("the specified commit does not match the specified Directory") } return fsf.doBody(commit.SegmentsFileName()) } lastGen := int64(-1) gen := int64(0) genLookaheadCount := 0 var exc error retryCount := 0 useFirstMethod := true // Loop until we succeed in calling doBody() without // hitting an IOException. An IOException most likely // means a commit was in process and has finished, in // the time it took us to load the now-old infos files // (and segments files). It's also possible it's a // true error (corrupt index). To distinguish these, // on each retry we must see "forward progress" on // which generation we are trying to load. If we // don't, then the original error is real and we throw // it. // We have three methods for determining the current // generation. We try the first two in parallel (when // useFirstMethod is true), and fall back to the third // when necessary. for { // fmt.Println("Trying...") if useFirstMethod { // fmt.Println("Trying first method...") // List the directory and use the highest // segments_N file. This method works well as long // as there is no stale caching on the directory // contents (NOTE: NFS clients often have such stale // caching): genA := int64(-1) files, err := fsf.directory.ListAll() if err != nil { return nil, err } if files != nil { genA = LastCommitGeneration(files) } // message("directory listing genA=%v", genA) // Also open segments.gen and read its // contents. Then we take the larger of the two // gens. This way, if either approach is hitting // a stale cache (NFS) we have a better chance of // getting the right generation. 
genB := int64(-1) genInput, err := fsf.directory.OpenChecksumInput(INDEX_FILENAME_SEGMENTS_GEN, store.IO_CONTEXT_READ) if err != nil { message("segments.gen open: %v", err) } else { defer genInput.Close() // fmt.Println("Reading segments info...") var version int32 if version, err = genInput.ReadInt(); err != nil { return nil, err } // fmt.Printf("Version: %v\n", version) if version == FORMAT_SEGMENTS_GEN_47 || version == FORMAT_SEGMENTS_GEN_CURRENT { // fmt.Println("Version is current.") var gen0, gen1 int64 if gen0, err = genInput.ReadLong(); err != nil { return nil, err } if gen1, err = genInput.ReadLong(); err != nil { return nil, err } message("fallback check: %v; %v", gen0, gen1) if version == FORMAT_SEGMENTS_GEN_CHECKSUM { if _, err = codec.CheckFooter(genInput); err != nil { return nil, err } } else { if err = codec.CheckEOF(genInput); err != nil { return nil, err } } if gen0 == gen1 { // The file is consistent. genB = gen0 } } else { return nil, codec.NewIndexFormatTooNewError(genInput, version, FORMAT_SEGMENTS_GEN_CURRENT, FORMAT_SEGMENTS_GEN_CURRENT) } } message("%v check: genB=%v", INDEX_FILENAME_SEGMENTS_GEN, genB) // Pick the larger of the two gen's: gen = genA if genB > gen { gen = genB } if gen == -1 { // Neither approach found a generation return nil, errors.New(fmt.Sprintf("no segments* file found in %v: files: %#v", fsf.directory, files)) } } if useFirstMethod && lastGen == gen && retryCount >= 2 { // Give up on first method -- this is 3rd cycle on // listing directory and checking gen file to // attempt to locate the segments file. useFirstMethod = false } // Second method: since both directory cache and // file contents cache seem to be stale, just // advance the generation. 
if !useFirstMethod { if genLookaheadCount < fsf.defaultGenLookaheadCount { gen++ genLookaheadCount++ message("look ahead increment gen to %v", gen) } else { // All attempts have failed -- throw first exc: return nil, exc } } else if lastGen == gen { // This means we're about to try the same // segments_N last tried. retryCount++ } else { // Segment file has advanced since our last loop // (we made "progress"), so reset retryCount: retryCount = 0 } lastGen = gen segmentFileName := util.FileNameFromGeneration(INDEX_FILENAME_SEGMENTS, "", gen) // fmt.Printf("SegmentFileName: %v\n", segmentFileName) var v interface{} var err error if v, err = fsf.doBody(segmentFileName); err == nil { message("success on %v", segmentFileName) return v, nil } // Save the original root cause: if exc == nil { exc = err } message("primary Exception on '%v': %v; will retry: retryCount = %v; gen = %v", segmentFileName, err, retryCount, gen) if gen > 1 && useFirstMethod && retryCount == 1 { // This is our second time trying this same segments // file (because retryCount is 1), and, there is // possibly a segments_(N-1) (because gen > 1). // So, check if the segments_(N-1) exists and // try it if so: prevSegmentFileName := util.FileNameFromGeneration(INDEX_FILENAME_SEGMENTS, "", gen-1) if prevExists := fsf.directory.FileExists(prevSegmentFileName); prevExists { message("fallback to prior segment file '%v'", prevSegmentFileName) if v, err = fsf.doBody(prevSegmentFileName); err != nil { message("secondary Exception on '%v': %v; will retry", prevSegmentFileName, err) } else { message("success on fallback %v", prevSegmentFileName) return v, nil } } } } }
// Sole constructor.
//
// Opens the stored-fields index file (.fdx) and data file (.fdt) for
// the given segment, validates headers and (for VERSION_CHECKSUM+)
// footers, and loads the index into memory. On any error, everything
// opened so far is closed and a nil reader is returned.
func newCompressingStoredFieldsReader(d store.Directory, si *model.SegmentInfo,
	segmentSuffix string, fn model.FieldInfos, ctx store.IOContext,
	formatName string, compressionMode CompressionMode) (r *CompressingStoredFieldsReader, err error) {

	r = &CompressingStoredFieldsReader{}
	r.compressionMode = compressionMode
	segment := si.Name
	r.fieldInfos = fn
	r.numDocs = si.DocCount()

	var indexStream store.ChecksumIndexInput
	success := false
	defer func() {
		if !success {
			// Close the partially-built reader and (if still open)
			// the index stream, suppressing secondary errors.
			util.CloseWhileSuppressingError(r, indexStream)
		}
	}()

	indexStreamFN := util.SegmentFileName(segment, segmentSuffix, lucene40.FIELDS_INDEX_EXTENSION)
	fieldsStreamFN := util.SegmentFileName(segment, segmentSuffix, lucene40.FIELDS_EXTENSION)

	// Load the index into memory
	if indexStream, err = d.OpenChecksumInput(indexStreamFN, ctx); err != nil {
		return nil, err
	}

	codecNameIdx := formatName + CODEC_SFX_IDX
	if r.version, err = int32AsInt(codec.CheckHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT)); err != nil {
		return nil, err
	}
	assert(int64(codec.HeaderLength(codecNameIdx)) == indexStream.FilePointer())
	if r.indexReader, err = newCompressingStoredFieldsIndexReader(indexStream, si); err != nil {
		return nil, err
	}

	// maxPointer marks the end of the stored-fields data (before the
	// footer). From VERSION_CHECKSUM on it is recorded in the index
	// file; for older versions it is derived from the data file length.
	var maxPointer int64 = -1
	if r.version >= VERSION_CHECKSUM {
		if maxPointer, err = indexStream.ReadVLong(); err != nil {
			return nil, err
		}
		if _, err = codec.CheckFooter(indexStream); err != nil {
			return nil, err
		}
	} else {
		if err = codec.CheckEOF(indexStream); err != nil {
			return nil, err
		}
	}

	if err = indexStream.Close(); err != nil {
		return nil, err
	}
	// Cleared so the deferred cleanup won't try to close it again.
	indexStream = nil

	// Open the data file and read metadata
	if r.fieldsStream, err = d.OpenInput(fieldsStreamFN, ctx); err != nil {
		return nil, err
	}
	if r.version >= VERSION_CHECKSUM {
		// The recorded maxPointer plus the footer must account for the
		// whole data file, otherwise the file was truncated.
		if maxPointer+codec.FOOTER_LENGTH != r.fieldsStream.Length() {
			return nil, errors.New(fmt.Sprintf(
				"Invalid fieldsStream maxPointer (file truncated?): maxPointer=%v, length=%v",
				maxPointer, r.fieldsStream.Length()))
		}
	} else {
		maxPointer = r.fieldsStream.Length()
	}
	r.maxPointer = maxPointer

	codecNameDat := formatName + CODEC_SFX_DAT
	var fieldsVersion int
	if fieldsVersion, err = int32AsInt(codec.CheckHeader(r.fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT)); err != nil {
		return nil, err
	}
	assert2(r.version == fieldsVersion,
		"Version mismatch between stored fields index and data: %v != %v",
		r.version, fieldsVersion)
	assert(int64(codec.HeaderLength(codecNameDat)) == r.fieldsStream.FilePointer())

	// Chunk size is only stored from VERSION_BIG_CHUNKS on.
	r.chunkSize = -1
	if r.version >= VERSION_BIG_CHUNKS {
		if r.chunkSize, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
			return nil, err
		}
	}

	if r.packedIntsVersion, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
		return nil, err
	}
	r.decompressor = compressionMode.NewDecompressor()
	r.bytes = make([]byte, 0)

	if r.version >= VERSION_CHECKSUM {
		// NOTE: data file is too costly to verify checksum against all the
		// bytes on open, but for now we at least verify proper structure
		// of the checksum footer: which looks for FOOTER_MAGIC +
		// algorithmID. This is cheap and can detect some forms of
		// corruption such as file truncation.
		if _, err = codec.RetrieveChecksum(r.fieldsStream); err != nil {
			return nil, err
		}
	}

	success = true
	return r, nil
}
if normsType, err = getDocValuesType(input, (val>>4)&0x0F); err != nil { return } if dvGen, err = input.ReadLong(); err != nil { return } if attributes, err = input.ReadStringStringMap(); err != nil { return } infos = append(infos, NewFieldInfo(name, isIndexed, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, normsType, dvGen, attributes)) } if codecVersion >= FI_FORMAT_CHECKSUM { if _, err = codec.CheckFooter(input); err != nil { return } } else { if err = codec.CheckEOF(input); err != nil { return } } fis = NewFieldInfos(infos) success = true return fis, nil } func getDocValuesType(input store.IndexInput, b byte) (t DocValuesType, err error) { switch b { case 0:
func newLucene49NormsProducer(state SegmentReadState, dataCodec, dataExtension, metaCodec, metaExtension string) (np *NormsProducer, err error) { np = &NormsProducer{ Locker: new(sync.Mutex), norms: make(map[int]*NormsEntry), instances: make(map[int]NumericDocValues), maxDoc: state.SegmentInfo.DocCount(), ramBytesUsed: util.ShallowSizeOfInstance(reflect.TypeOf(np)), } metaName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, metaExtension) // read in the entries from the metadta file. var in store.ChecksumIndexInput if in, err = state.Dir.OpenChecksumInput(metaName, state.Context); err != nil { return nil, err } if err = func() error { var success = false defer func() { if success { err = util.Close(in) } else { util.CloseWhileSuppressingError(in) } }() if np.version, err = codec.CheckHeader(in, metaCodec, VERSION_START, VERSION_CURRENT); err != nil { return err } if err = np.readFields(in, state.FieldInfos); err != nil { return err } if _, err = codec.CheckFooter(in); err != nil { return err } success = true return nil }(); err != nil { return nil, err } dataName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, dataExtension) if np.data, err = state.Dir.OpenInput(dataName, state.Context); err != nil { return nil, err } var success = false defer func() { if !success { util.CloseWhileSuppressingError(np.data) } }() var version2 int32 if version2, err = codec.CheckHeader(np.data, dataCodec, VERSION_START, VERSION_CURRENT); err != nil { return nil, err } if version2 != np.version { return nil, errors.New("Format versions mismatch") } // NOTE: data file is too costly to verify checksum against all the // bytes on open, but fo rnow we at least verify proper structure // of the checksum footer: which looks for FOOTER_MATIC + // algorithmID. This is cheap and can detect some forms of // corruption such as file trucation. if _, err = codec.RetrieveChecksum(np.data); err != nil { return nil, err } success = true return np, nil }
func (d *CompoundFileDirectory) readEntries(handle IndexInput, dir Directory, name string) (mapping map[string]FileSlice, err error) { var stream IndexInput = nil var entriesStream ChecksumIndexInput = nil // read the first VInt. If it is negative, it's the version number // otherwise it's the count (pre-3.1 indexes) var success = false defer func() { if success { err = util.Close(stream, entriesStream) } else { util.CloseWhileSuppressingError(stream, entriesStream) } }() stream = handle.Clone() // fmt.Printf("Reading from stream: %v\n", stream) firstInt, err := stream.ReadVInt() if err != nil { return nil, err } // impossible for 3.0 to have 63 files in a .cfs, CFS writer was not visible // and separate norms/etc are outside of cfs. if firstInt == int32(CODEC_MAGIC_BYTE1) { if secondByte, err := stream.ReadByte(); err == nil { if thirdByte, err := stream.ReadByte(); err == nil { if fourthByte, err := stream.ReadByte(); err == nil { if secondByte != CODEC_MAGIC_BYTE2 || thirdByte != CODEC_MAGIC_BYTE3 || fourthByte != CODEC_MAGIC_BYTE4 { return nil, errors.New(fmt.Sprintf( "Illegal/impossible header for CFS file: %v,%v,%v", secondByte, thirdByte, fourthByte)) } } } } if err != nil { return nil, err } d.version, err = int32ToInt(codec.CheckHeaderNoMagic(stream, CFD_DATA_CODEC, CFD_VERSION_START, CFD_VERSION_CURRENT)) if err != nil { return nil, err } entriesFileName := util.SegmentFileName(util.StripExtension(name), "", COMPOUND_FILE_ENTRIES_EXTENSION) entriesStream, err = dir.OpenChecksumInput(entriesFileName, IO_CONTEXT_READONCE) if err != nil { return nil, err } _, err = codec.CheckHeader(entriesStream, CFD_ENTRY_CODEC, CFD_VERSION_START, CFD_VERSION_CURRENT) if err != nil { return nil, err } numEntries, err := entriesStream.ReadVInt() if err != nil { return nil, err } mapping = make(map[string]FileSlice) // fmt.Printf("Entries number: %v\n", numEntries) for i := int32(0); i < numEntries; i++ { id, err := entriesStream.ReadString() if err != nil { return nil, err 
} if _, ok := mapping[id]; ok { return nil, errors.New(fmt.Sprintf( "Duplicate cfs entry id=%v in CFS: %v", id, entriesStream)) } // log.Printf("Found entry: %v", id) offset, err := entriesStream.ReadLong() if err != nil { return nil, err } length, err := entriesStream.ReadLong() if err != nil { return nil, err } mapping[id] = FileSlice{offset, length} } if d.version >= CFD_VERSION_CHECKSUM { _, err = codec.CheckFooter(entriesStream) } else { err = codec.CheckEOF(entriesStream) } if err != nil { return nil, err } } else { // TODO remove once 3.x is not supported anymore panic("not supported yet; will also be obsolete soon") } success = true return mapping, nil }