Exemple #1
0
/*
Clones the provided input, reads all bytes from the file, and calls
CheckFooter().

Note that this method may be slow, as it must process the entire file.
If you just need to extract the checksum value, call retrieveChecksum().
*/
func ChecksumEntireFile(input IndexInput) (hash int64, err error) {
	clone := input.Clone()
	if err = clone.Seek(0); err != nil {
		return 0, err
	}
	in := newBufferedChecksumIndexInput(clone)
	assert(in.FilePointer() == 0)
	if err = in.Seek(in.Length() - codec.FOOTER_LENGTH); err != nil {
		return 0, err
	}
	return codec.CheckFooter(in)
}
Exemple #2
0
/*
Read a particular segmentFileName. Note that this may return IO error
if a commit is in process.
*/
func (sis *SegmentInfos) Read(directory store.Directory, segmentFileName string) (err error) {
	// fmt.Printf("Reading segment info from %v...\n", segmentFileName)

	// Clear any previous segments:
	sis.Clear()

	sis.generation = GenerationFromSegmentsFileName(segmentFileName)
	sis.lastGeneration = sis.generation

	var input store.ChecksumIndexInput
	if input, err = directory.OpenChecksumInput(segmentFileName, store.IO_CONTEXT_READ); err != nil {
		return
	}

	var success = false
	defer func() {
		if !success {
			// Clear any segment infos we had loaded so we
			// have a clean slate on retry:
			sis.Clear()
			util.CloseWhileSuppressingError(input)
		} else {
			err = input.Close()
		}
	}()

	var format int
	if format, err = asInt(input.ReadInt()); err != nil {
		return
	}

	var actualFormat int
	if format == codec.CODEC_MAGIC {
		// 4.0+
		if actualFormat, err = asInt(codec.CheckHeaderNoMagic(input, "segments", VERSION_40, VERSION_49)); err != nil {
			return
		}
		if sis.version, err = input.ReadLong(); err != nil {
			return
		}
		if sis.counter, err = asInt(input.ReadInt()); err != nil {
			return
		}
		var numSegments int
		if numSegments, err = asInt(input.ReadInt()); err != nil {
			return
		} else if numSegments < 0 {
			return errors.New(fmt.Sprintf("invalid segment count: %v (resource: %v)", numSegments, input))
		}
		var segName, codecName string
		var fCodec Codec
		var delGen, fieldInfosGen, dvGen int64
		var delCount int
		for seg := 0; seg < numSegments; seg++ {
			if segName, err = input.ReadString(); err != nil {
				return
			}
			if codecName, err = input.ReadString(); err != nil {
				return
			}
			fCodec = LoadCodec(codecName)
			assert2(fCodec != nil, "Invalid codec name: %v", codecName)
			// fmt.Printf("SIS.read seg=%v codec=%v\n", seg, fCodec)
			var info *SegmentInfo
			if info, err = fCodec.SegmentInfoFormat().SegmentInfoReader().Read(directory, segName, store.IO_CONTEXT_READ); err != nil {
				return
			}
			info.SetCodec(fCodec)
			if delGen, err = input.ReadLong(); err != nil {
				return
			}
			if delCount, err = asInt(input.ReadInt()); err != nil {
				return
			} else if delCount < 0 || delCount > info.DocCount() {
				return errors.New(fmt.Sprintf(
					"invalid deletion count: %v vs docCount=%v (resource: %v)",
					delCount, info.DocCount(), input))
			}
			fieldInfosGen = -1
			if actualFormat >= VERSION_46 {
				if fieldInfosGen, err = input.ReadLong(); err != nil {
					return
				}
			}
			dvGen = -1
			if actualFormat >= VERSION_49 {
				if dvGen, err = input.ReadLong(); err != nil {
					return
				}
			} else {
				dvGen = fieldInfosGen
			}
			siPerCommit := NewSegmentCommitInfo(info, delCount, delGen, fieldInfosGen, dvGen)
			if actualFormat >= VERSION_46 {
				if actualFormat < VERSION_49 {
					panic("not implemented yet")
				} else {
					var ss map[string]bool
					if ss, err = input.ReadStringSet(); err != nil {
						return err
					}
					siPerCommit.SetFieldInfosFiles(ss)
					var dvUpdatesFiles map[int]map[string]bool
					var numDVFields int
					if numDVFields, err = asInt(input.ReadInt()); err != nil {
						return err
					}
					if numDVFields == 0 {
						dvUpdatesFiles = make(map[int]map[string]bool)
					} else {
						panic("not implemented yet")
					}
					siPerCommit.SetDocValuesUpdatesFiles(dvUpdatesFiles)
				}
			}
			sis.Segments = append(sis.Segments, siPerCommit)
		}
		if sis.userData, err = input.ReadStringStringMap(); err != nil {
			return err
		}
	} else {
		// TODO support <4.0 index
		panic("Index format pre-4.0 not supported yet")
	}

	if actualFormat >= VERSION_48 {
		if _, err = codec.CheckFooter(input); err != nil {
			return
		}
	} else {
		var checksumNow = int64(input.Checksum())
		var checksumThen int64
		if checksumThen, err = input.ReadLong(); err != nil {
			return
		}
		if checksumNow != checksumThen {
			return errors.New(fmt.Sprintf(
				"checksum mismatch in segments file: %v vs %v (resource: %v)",
				checksumNow, checksumThen, input))
		}
		if err = codec.CheckEOF(input); err != nil {
			return
		}
	}

	success = true
	return nil
}
Exemple #3
0
// TODO support IndexCommit
func (fsf *FindSegmentsFile) run(commit IndexCommit) (interface{}, error) {
	// fmt.Println("Finding segments file...")
	if commit != nil {
		if fsf.directory != commit.Directory() {
			return nil, errors.New("the specified commit does not match the specified Directory")
		}
		return fsf.doBody(commit.SegmentsFileName())
	}

	lastGen := int64(-1)
	gen := int64(0)
	genLookaheadCount := 0
	var exc error
	retryCount := 0

	useFirstMethod := true

	// Loop until we succeed in calling doBody() without
	// hitting an IOException.  An IOException most likely
	// means a commit was in process and has finished, in
	// the time it took us to load the now-old infos files
	// (and segments files).  It's also possible it's a
	// true error (corrupt index).  To distinguish these,
	// on each retry we must see "forward progress" on
	// which generation we are trying to load.  If we
	// don't, then the original error is real and we throw
	// it.

	// We have three methods for determining the current
	// generation.  We try the first two in parallel (when
	// useFirstMethod is true), and fall back to the third
	// when necessary.

	for {
		// fmt.Println("Trying...")
		if useFirstMethod {
			// fmt.Println("Trying first method...")
			// List the directory and use the highest
			// segments_N file.  This method works well as long
			// as there is no stale caching on the directory
			// contents (NOTE: NFS clients often have such stale
			// caching):
			genA := int64(-1)

			files, err := fsf.directory.ListAll()
			if err != nil {
				return nil, err
			}
			if files != nil {
				genA = LastCommitGeneration(files)
			}

			// message("directory listing genA=%v", genA)

			// Also open segments.gen and read its
			// contents.  Then we take the larger of the two
			// gens.  This way, if either approach is hitting
			// a stale cache (NFS) we have a better chance of
			// getting the right generation.
			genB := int64(-1)
			genInput, err := fsf.directory.OpenChecksumInput(INDEX_FILENAME_SEGMENTS_GEN, store.IO_CONTEXT_READ)
			if err != nil {
				message("segments.gen open: %v", err)
			} else {
				defer genInput.Close()
				// fmt.Println("Reading segments info...")

				var version int32
				if version, err = genInput.ReadInt(); err != nil {
					return nil, err
				}
				// fmt.Printf("Version: %v\n", version)
				if version == FORMAT_SEGMENTS_GEN_47 || version == FORMAT_SEGMENTS_GEN_CURRENT {
					// fmt.Println("Version is current.")
					var gen0, gen1 int64
					if gen0, err = genInput.ReadLong(); err != nil {
						return nil, err
					}
					if gen1, err = genInput.ReadLong(); err != nil {
						return nil, err
					}
					message("fallback check: %v; %v", gen0, gen1)
					if version == FORMAT_SEGMENTS_GEN_CHECKSUM {
						if _, err = codec.CheckFooter(genInput); err != nil {
							return nil, err
						}
					} else {
						if err = codec.CheckEOF(genInput); err != nil {
							return nil, err
						}
					}
					if gen0 == gen1 {
						// The file is consistent.
						genB = gen0
					}
				} else {
					return nil, codec.NewIndexFormatTooNewError(genInput, version,
						FORMAT_SEGMENTS_GEN_CURRENT, FORMAT_SEGMENTS_GEN_CURRENT)
				}
			}

			message("%v check: genB=%v", INDEX_FILENAME_SEGMENTS_GEN, genB)

			// Pick the larger of the two gen's:
			gen = genA
			if genB > gen {
				gen = genB
			}

			if gen == -1 {
				// Neither approach found a generation
				return nil, errors.New(fmt.Sprintf("no segments* file found in %v: files: %#v", fsf.directory, files))
			}
		}

		if useFirstMethod && lastGen == gen && retryCount >= 2 {
			// Give up on first method -- this is 3rd cycle on
			// listing directory and checking gen file to
			// attempt to locate the segments file.
			useFirstMethod = false
		}

		// Second method: since both directory cache and
		// file contents cache seem to be stale, just
		// advance the generation.
		if !useFirstMethod {
			if genLookaheadCount < fsf.defaultGenLookaheadCount {
				gen++
				genLookaheadCount++
				message("look ahead increment gen to %v", gen)
			} else {
				// All attempts have failed -- throw first exc:
				return nil, exc
			}
		} else if lastGen == gen {
			// This means we're about to try the same
			// segments_N last tried.
			retryCount++
		} else {
			// Segment file has advanced since our last loop
			// (we made "progress"), so reset retryCount:
			retryCount = 0
		}

		lastGen = gen
		segmentFileName := util.FileNameFromGeneration(INDEX_FILENAME_SEGMENTS, "", gen)
		// fmt.Printf("SegmentFileName: %v\n", segmentFileName)

		var v interface{}
		var err error
		if v, err = fsf.doBody(segmentFileName); err == nil {
			message("success on %v", segmentFileName)
			return v, nil
		}
		// Save the original root cause:
		if exc == nil {
			exc = err
		}

		message("primary Exception on '%v': %v; will retry: retryCount = %v; gen = %v",
			segmentFileName, err, retryCount, gen)

		if gen > 1 && useFirstMethod && retryCount == 1 {
			// This is our second time trying this same segments
			// file (because retryCount is 1), and, there is
			// possibly a segments_(N-1) (because gen > 1).
			// So, check if the segments_(N-1) exists and
			// try it if so:
			prevSegmentFileName := util.FileNameFromGeneration(INDEX_FILENAME_SEGMENTS, "", gen-1)

			if prevExists := fsf.directory.FileExists(prevSegmentFileName); prevExists {
				message("fallback to prior segment file '%v'", prevSegmentFileName)
				if v, err = fsf.doBody(prevSegmentFileName); err != nil {
					message("secondary Exception on '%v': %v; will retry", prevSegmentFileName, err)
				} else {
					message("success on fallback %v", prevSegmentFileName)
					return v, nil
				}
			}
		}
	}
}
// Sole constructor
func newCompressingStoredFieldsReader(d store.Directory,
	si *model.SegmentInfo, segmentSuffix string,
	fn model.FieldInfos, ctx store.IOContext, formatName string,
	compressionMode CompressionMode) (r *CompressingStoredFieldsReader, err error) {

	r = &CompressingStoredFieldsReader{}
	r.compressionMode = compressionMode
	segment := si.Name
	r.fieldInfos = fn
	r.numDocs = si.DocCount()

	var indexStream store.ChecksumIndexInput
	success := false
	defer func() {
		if !success {
			util.CloseWhileSuppressingError(r, indexStream)
		}
	}()

	indexStreamFN := util.SegmentFileName(segment, segmentSuffix, lucene40.FIELDS_INDEX_EXTENSION)
	fieldsStreamFN := util.SegmentFileName(segment, segmentSuffix, lucene40.FIELDS_EXTENSION)
	// Load the index into memory
	if indexStream, err = d.OpenChecksumInput(indexStreamFN, ctx); err != nil {
		return nil, err
	}
	codecNameIdx := formatName + CODEC_SFX_IDX
	if r.version, err = int32AsInt(codec.CheckHeader(indexStream, codecNameIdx,
		VERSION_START, VERSION_CURRENT)); err != nil {
		return nil, err
	}
	assert(int64(codec.HeaderLength(codecNameIdx)) == indexStream.FilePointer())
	if r.indexReader, err = newCompressingStoredFieldsIndexReader(indexStream, si); err != nil {
		return nil, err
	}

	var maxPointer int64 = -1

	if r.version >= VERSION_CHECKSUM {
		if maxPointer, err = indexStream.ReadVLong(); err != nil {
			return nil, err
		}
		if _, err = codec.CheckFooter(indexStream); err != nil {
			return nil, err
		}
	} else {
		if err = codec.CheckEOF(indexStream); err != nil {
			return nil, err
		}
	}

	if err = indexStream.Close(); err != nil {
		return nil, err
	}
	indexStream = nil

	// Open the data file and read metadata
	if r.fieldsStream, err = d.OpenInput(fieldsStreamFN, ctx); err != nil {
		return nil, err
	}
	if r.version >= VERSION_CHECKSUM {
		if maxPointer+codec.FOOTER_LENGTH != r.fieldsStream.Length() {
			return nil, errors.New(fmt.Sprintf(
				"Invalid fieldsStream maxPointer (file truncated?): maxPointer=%v, length=%v",
				maxPointer, r.fieldsStream.Length()))
		}
	} else {
		maxPointer = r.fieldsStream.Length()
	}
	r.maxPointer = maxPointer
	codecNameDat := formatName + CODEC_SFX_DAT
	var fieldsVersion int
	if fieldsVersion, err = int32AsInt(codec.CheckHeader(r.fieldsStream,
		codecNameDat, VERSION_START, VERSION_CURRENT)); err != nil {
		return nil, err
	}
	assert2(r.version == fieldsVersion,
		"Version mismatch between stored fields index and data: %v != %v",
		r.version, fieldsVersion)
	assert(int64(codec.HeaderLength(codecNameDat)) == r.fieldsStream.FilePointer())

	r.chunkSize = -1
	if r.version >= VERSION_BIG_CHUNKS {
		if r.chunkSize, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
			return nil, err
		}
	}

	if r.packedIntsVersion, err = int32AsInt(r.fieldsStream.ReadVInt()); err != nil {
		return nil, err
	}
	r.decompressor = compressionMode.NewDecompressor()
	r.bytes = make([]byte, 0)

	if r.version >= VERSION_CHECKSUM {
		// NOTE: data file is too costly to verify checksum against all the
		// bytes on open, but fo rnow we at least verify proper structure
		// of the checksum footer: which looks for FOOTER_MATIC +
		// algorithmID. This is cheap and can detect some forms of
		// corruption such as file trucation.
		if _, err = codec.RetrieveChecksum(r.fieldsStream); err != nil {
			return nil, err
		}
	}

	success = true
	return r, nil
}
Exemple #5
0
		if normsType, err = getDocValuesType(input, (val>>4)&0x0F); err != nil {
			return
		}
		if dvGen, err = input.ReadLong(); err != nil {
			return
		}
		if attributes, err = input.ReadStringStringMap(); err != nil {
			return
		}
		infos = append(infos, NewFieldInfo(name, isIndexed, fieldNumber,
			storeTermVector, omitNorms, storePayloads, indexOptions,
			docValuesType, normsType, dvGen, attributes))
	}

	if codecVersion >= FI_FORMAT_CHECKSUM {
		if _, err = codec.CheckFooter(input); err != nil {
			return
		}
	} else {
		if err = codec.CheckEOF(input); err != nil {
			return
		}
	}
	fis = NewFieldInfos(infos)
	success = true
	return fis, nil
}

func getDocValuesType(input store.IndexInput, b byte) (t DocValuesType, err error) {
	switch b {
	case 0:
Exemple #6
0
func newLucene49NormsProducer(state SegmentReadState,
	dataCodec, dataExtension, metaCodec, metaExtension string) (np *NormsProducer, err error) {

	np = &NormsProducer{
		Locker:       new(sync.Mutex),
		norms:        make(map[int]*NormsEntry),
		instances:    make(map[int]NumericDocValues),
		maxDoc:       state.SegmentInfo.DocCount(),
		ramBytesUsed: util.ShallowSizeOfInstance(reflect.TypeOf(np)),
	}
	metaName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, metaExtension)
	// read in the entries from the metadta file.
	var in store.ChecksumIndexInput
	if in, err = state.Dir.OpenChecksumInput(metaName, state.Context); err != nil {
		return nil, err
	}

	if err = func() error {
		var success = false
		defer func() {
			if success {
				err = util.Close(in)
			} else {
				util.CloseWhileSuppressingError(in)
			}
		}()

		if np.version, err = codec.CheckHeader(in, metaCodec, VERSION_START, VERSION_CURRENT); err != nil {
			return err
		}
		if err = np.readFields(in, state.FieldInfos); err != nil {
			return err
		}
		if _, err = codec.CheckFooter(in); err != nil {
			return err
		}
		success = true
		return nil
	}(); err != nil {
		return nil, err
	}

	dataName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, dataExtension)
	if np.data, err = state.Dir.OpenInput(dataName, state.Context); err != nil {
		return nil, err
	}
	var success = false
	defer func() {
		if !success {
			util.CloseWhileSuppressingError(np.data)
		}
	}()

	var version2 int32
	if version2, err = codec.CheckHeader(np.data, dataCodec, VERSION_START, VERSION_CURRENT); err != nil {
		return nil, err
	}
	if version2 != np.version {
		return nil, errors.New("Format versions mismatch")
	}

	// NOTE: data file is too costly to verify checksum against all the
	// bytes on open, but fo rnow we at least verify proper structure
	// of the checksum footer: which looks for FOOTER_MATIC +
	// algorithmID. This is cheap and can detect some forms of
	// corruption such as file trucation.
	if _, err = codec.RetrieveChecksum(np.data); err != nil {
		return nil, err
	}

	success = true

	return np, nil
}
Exemple #7
0
func (d *CompoundFileDirectory) readEntries(handle IndexInput, dir Directory, name string) (mapping map[string]FileSlice, err error) {
	var stream IndexInput = nil
	var entriesStream ChecksumIndexInput = nil
	// read the first VInt. If it is negative, it's the version number
	// otherwise it's the count (pre-3.1 indexes)
	var success = false
	defer func() {
		if success {
			err = util.Close(stream, entriesStream)
		} else {
			util.CloseWhileSuppressingError(stream, entriesStream)
		}
	}()

	stream = handle.Clone()
	// fmt.Printf("Reading from stream: %v\n", stream)
	firstInt, err := stream.ReadVInt()
	if err != nil {
		return nil, err
	}
	// impossible for 3.0 to have 63 files in a .cfs, CFS writer was not visible
	// and separate norms/etc are outside of cfs.
	if firstInt == int32(CODEC_MAGIC_BYTE1) {
		if secondByte, err := stream.ReadByte(); err == nil {
			if thirdByte, err := stream.ReadByte(); err == nil {
				if fourthByte, err := stream.ReadByte(); err == nil {
					if secondByte != CODEC_MAGIC_BYTE2 ||
						thirdByte != CODEC_MAGIC_BYTE3 ||
						fourthByte != CODEC_MAGIC_BYTE4 {
						return nil, errors.New(fmt.Sprintf(
							"Illegal/impossible header for CFS file: %v,%v,%v",
							secondByte, thirdByte, fourthByte))
					}
				}
			}
		}
		if err != nil {
			return nil, err
		}

		d.version, err = int32ToInt(codec.CheckHeaderNoMagic(stream, CFD_DATA_CODEC, CFD_VERSION_START, CFD_VERSION_CURRENT))
		if err != nil {
			return nil, err
		}
		entriesFileName := util.SegmentFileName(util.StripExtension(name), "", COMPOUND_FILE_ENTRIES_EXTENSION)
		entriesStream, err = dir.OpenChecksumInput(entriesFileName, IO_CONTEXT_READONCE)
		if err != nil {
			return nil, err
		}
		_, err = codec.CheckHeader(entriesStream, CFD_ENTRY_CODEC, CFD_VERSION_START, CFD_VERSION_CURRENT)
		if err != nil {
			return nil, err
		}
		numEntries, err := entriesStream.ReadVInt()
		if err != nil {
			return nil, err
		}

		mapping = make(map[string]FileSlice)
		// fmt.Printf("Entries number: %v\n", numEntries)
		for i := int32(0); i < numEntries; i++ {
			id, err := entriesStream.ReadString()
			if err != nil {
				return nil, err
			}
			if _, ok := mapping[id]; ok {
				return nil, errors.New(fmt.Sprintf(
					"Duplicate cfs entry id=%v in CFS: %v", id, entriesStream))
			}
			// log.Printf("Found entry: %v", id)
			offset, err := entriesStream.ReadLong()
			if err != nil {
				return nil, err
			}
			length, err := entriesStream.ReadLong()
			if err != nil {
				return nil, err
			}
			mapping[id] = FileSlice{offset, length}
		}
		if d.version >= CFD_VERSION_CHECKSUM {
			_, err = codec.CheckFooter(entriesStream)
		} else {
			err = codec.CheckEOF(entriesStream)
		}
		if err != nil {
			return nil, err
		}
	} else {
		// TODO remove once 3.x is not supported anymore
		panic("not supported yet; will also be obsolete soon")
	}
	success = true
	return mapping, nil
}