// makeRoomForWrite ensures that there is room in d.mem for the next write. // // d.mu must be held when calling this, but the mutex may be dropped and // re-acquired during the course of this method. func (d *DB) makeRoomForWrite(force bool) error { allowDelay := !force for { // TODO: check any previous sticky error, if the paranoid option is set. if allowDelay && len(d.versions.currentVersion().files[0]) > l0SlowdownWritesTrigger { // We are getting close to hitting a hard limit on the number of // L0 files. Rather than delaying a single write by several // seconds when we hit the hard limit, start delaying each // individual write by 1ms to reduce latency variance. d.mu.Unlock() time.Sleep(1 * time.Millisecond) d.mu.Lock() allowDelay = false // TODO: how do we ensure we are still 'at the front of the writer queue'? continue } if !force && d.mem.ApproximateMemoryUsage() <= d.opts.GetWriteBufferSize() { // There is room in the current memtable. break } if d.imm != nil { // We have filled up the current memtable, but the previous // one is still being compacted, so we wait. d.compactionCond.Wait() continue } if len(d.versions.currentVersion().files[0]) > l0StopWritesTrigger { // There are too many level-0 files. d.compactionCond.Wait() continue } // Attempt to switch to a new memtable and trigger compaction of old // TODO: drop and re-acquire d.mu around the I/O. newLogNumber := d.versions.nextFileNum() newLogFile, err := d.opts.GetFileSystem().Create(dbFilename(d.dirname, fileTypeLog, newLogNumber)) if err != nil { return err } newLog := record.NewWriter(newLogFile) if err := d.log.Close(); err != nil { newLogFile.Close() return err } if err := d.logFile.Close(); err != nil { newLog.Close() newLogFile.Close() return err } d.logNumber, d.logFile, d.log = newLogNumber, newLogFile, newLog d.imm, d.mem = d.mem, memdb.New(&d.icmpOpts) force = false d.maybeScheduleCompaction() } return nil }
func createDB(dirname string, opts *db.Options) (retErr error) { const manifestFileNum = 1 ve := versionEdit{ comparatorName: opts.GetComparer().Name(), nextFileNumber: manifestFileNum + 1, } manifestFilename := dbFilename(dirname, fileTypeManifest, manifestFileNum) f, err := opts.GetFileSystem().Create(manifestFilename) if err != nil { return fmt.Errorf("leveldb: could not create %q: %v", manifestFilename, err) } defer func() { if retErr != nil { opts.GetFileSystem().Remove(manifestFilename) } }() defer f.Close() recWriter := record.NewWriter(f) w, err := recWriter.Next() if err != nil { return err } err = ve.encode(w) if err != nil { return err } err = recWriter.Close() if err != nil { return err } return setCurrentFile(dirname, opts.GetFileSystem(), manifestFileNum) }
// createManifest creates a manifest file that contains a snapshot of vs. func (vs *versionSet) createManifest(dirname string) (err error) { var ( filename = dbFilename(dirname, fileTypeManifest, vs.manifestFileNumber) manifestFile db.File manifest *record.Writer ) defer func() { if manifest != nil { manifest.Close() } if manifestFile != nil { manifestFile.Close() } if err != nil { vs.fs.Remove(filename) } }() manifestFile, err = vs.fs.Create(filename) if err != nil { return err } manifest = record.NewWriter(manifestFile) snapshot := versionEdit{ comparatorName: vs.ucmp.Name(), } // TODO: save compaction pointers. for level, fileMetadata := range vs.currentVersion().files { for _, meta := range fileMetadata { snapshot.newFiles = append(snapshot.newFiles, newFileEntry{ level: level, meta: meta, }) } } w, err1 := manifest.Next() if err1 != nil { return err1 } err1 = snapshot.encode(w) if err1 != nil { return err1 } vs.manifest, manifest = manifest, nil vs.manifestFile, manifestFile = manifestFile, nil return nil }
// 创建leveldb存储目录时,需要传入db.Options, 那么db.Options有哪些属性呢? func createDB(dirname string, opts *db.Options) (retErr error) { const manifestFileNum = 1 // versionEdit保存版本之间的差异,oldVersion _+ versionEdit = newVersion ve := versionEdit{ // 原始key比较器的名称 comparatorName: opts.GetComparer().Name(), // ? nextFileNumber: manifestFileNum + 1, } // 先创建manifest文件,往里面写一些元信息,如上面versionEdit的comparatorName、nextFileNumber字段 manifestFilename := dbFilename(dirname, fileTypeManifest, manifestFileNum) f, err := opts.GetFileSystem().Create(manifestFilename) if err != nil { return fmt.Errorf("leveldb: could not create %q: %v", manifestFilename, err) } defer func() { if retErr != nil { opts.GetFileSystem().Remove(manifestFilename) } }() defer f.Close() // 生成一个record.Writer对象 recWriter := record.NewWriter(f) // 返回一个record.SingleWriter对象w w, err := recWriter.Next() if err != nil { return err } // 将ve中的数据写入manifest文件中 err = ve.encode(w) if err != nil { return err } // 加入重启点, 压缩,crc校验? err = recWriter.Close() if err != nil { return err } // 创建current文件,其中写入了当前的manifest文件名 return setCurrentFile(dirname, opts.GetFileSystem(), manifestFileNum) }
// makeRoomForWrite ensures that there is room in d.mem for the next write. // // d.mu must be held when calling this, but the mutex may be dropped and // re-acquired during the course of this method. func (d *DB) makeRoomForWrite(force bool) error { // allowDelay:允许延迟写 // force = true表示强制立即写? allowDelay := !force // 无条件循环 for { // TODO: check any previous sticky error, if the paranoid option is set. // 若当前版本(即链表versions的最后一个元素)中level0的文件数大于l0SlowdownWritesTrigger(8),则对当前写操作延迟1毫秒 if allowDelay && len(d.versions.currentVersion().files[0]) > l0SlowdownWritesTrigger { // We are getting close to hitting a hard limit on the number of // L0 files. Rather than delaying a single write by several // seconds when we hit the hard limit, **start delaying each // individual write by 1ms to reduce latency variance.** d.mu.Unlock() time.Sleep(1 * time.Millisecond) d.mu.Lock() allowDelay = false // TODO: how do we ensure we are still 'at the front of the writer queue'? continue } // 若当前memtable中仍有写入的空间,则直接break返回 // d.mem.ApproximateMemoryUsage()返回len(d.mem.kvData) // d.opts.GetWriteBufferSize是memtable的临界大小,也是一个level0的db文件的临界大小 // 但从 <= 可以看到,这个临界大小是可以被超过的。。。 if !force && d.mem.ApproximateMemoryUsage() <= d.opts.GetWriteBufferSize() { // There is room in the current memtable. break } // 如果当前memtable(即d.mem)已写满,且immutable memtable(d.imm)存在,则需等待,因为说明上一次从imm到level0的compaction操作还没有结束 // 这也意味着从imm到level0的compaction过程结束时会将d.imm置为nil // 而什么时候d.imm会变成非nil呢?见下面,当d.mem满时,会将d.imm指向d.mem,而d.mem则指向一个新申请的内存空间 // 那么d.imm的存在时间最长为新d.mem从空到被写满的时间 if d.imm != nil { // We have filled up the current memtable, but the previous // one is still being compacted, so we wait. // d.compactionCond.Wait()类似于在等待一个唤醒的信号 d.compactionCond.Wait() continue } // 若当前版本(即链表versions的最后一个元素)中level0的文件数大于l0StopWritesTrigger(12),则需等待从level0到level1的compaction过程结束 if len(d.versions.currentVersion().files[0]) > l0StopWritesTrigger { // There are too many level-0 files. d.compactionCond.Wait() continue } // Attempt to switch to a new memtable and trigger compaction of old // 因为d.mem已经写满,而d.imm也已经被置为nil,那么可以将d.imm指向d.mem,并为d.mem申请一块新的内存空间 // 从以下代码中可以看到在从d.mem切换到d.imm之前,先创建了一个新的log文件,并打开。这说明一个log文件的生命周期和d.mem是一样的, // 即一个log记录的一个d.mem上的增删改操作 // TODO: drop and re-acquire d.mu around the I/O. // 不同的的log文件,通过文件名中不同的数字序号来区分 newLogNumber := d.versions.nextFileNum() newLogFile, err := d.opts.GetFileSystem().Create(dbFilename(d.dirname, fileTypeLog, newLogNumber)) if err != nil { return err } newLog := record.NewWriter(newLogFile) if err := d.log.Close(); err != nil { newLogFile.Close() return err } if err := d.logFile.Close(); err != nil { newLog.Close() newLogFile.Close() return err } // 设置d(DB)的属性 // 切换到新的日志,新的imm和mem d.logNumber, d.logFile, d.log = newLogNumber, newLogFile, newLog // memdb.New申请一个新的memtable内存空间 d.imm, d.mem = d.mem, memdb.New(&d.icmpOpts) force = false // 由于这时d.imm不为nil,则应该调度一次compaction,将d.imm中的数据写到level0磁盘文件中 d.maybeScheduleCompaction() } return nil }
// Open opens a LevelDB whose files live in the given directory. func Open(dirname string, opts *db.Options) (*DB, error) { d := &DB{ dirname: dirname, opts: opts, icmp: internalKeyComparer{opts.GetComparer()}, pendingOutputs: make(map[uint64]struct{}), } if opts != nil { d.icmpOpts = *opts } d.icmpOpts.Comparer = d.icmp tableCacheSize := opts.GetMaxOpenFiles() - numNonTableCacheFiles if tableCacheSize < minTableCacheSize { tableCacheSize = minTableCacheSize } // tableCache初始化 d.tableCache.init(dirname, opts.GetFileSystem(), &d.icmpOpts, tableCacheSize) // 初始化一个MemDB d.mem = memdb.New(&d.icmpOpts) // sync.Cond在Locker的基础上增加的一个消息通知的功能。 // Cond有三个方法:Wait,Signal,Broadcast。 // Wait添加一个计数,也就是添加一个阻塞的goroutine。 // Signal解除一个goroutine的阻塞,计数减一。 // Broadcast接触所有wait goroutine的阻塞。 d.compactionCond = sync.Cond{L: &d.mu} fs := opts.GetFileSystem() d.mu.Lock() defer d.mu.Unlock() // Lock the database directory. // If the directory already exists, MkdirAll does nothing and returns nil. // 如果目录已经存在,则MkdirAll啥都不干 err := fs.MkdirAll(dirname, 0755) if err != nil { return nil, err } // 创建LOCK文件,并加文件锁 fileLock, err := fs.Lock(dbFilename(dirname, fileTypeLock, 0)) if err != nil { return nil, err } defer func() { if fileLock != nil { fileLock.Close() } }() // 若CURRENT文件不存在,则调用createDB if _, err := fs.Stat(dbFilename(dirname, fileTypeCurrent, 0)); os.IsNotExist(err) { // Create the DB if it did not already exist. if err := createDB(dirname, opts); err != nil { return nil, err } } else if err != nil { return nil, fmt.Errorf("leveldb: database %q: %v", dirname, err) } else if opts.GetErrorIfDBExists() { return nil, fmt.Errorf("leveldb: database %q already exists", dirname) } // Load the version set. // 先读取CURRENT文件内容,获取manifest文件名,然后逐条记录读取manifest文件的内容,根据内容生成一个新version,放入d.versions中 err = d.versions.load(dirname, opts) if err != nil { return nil, err } // Replay any newer log files than the ones named in the manifest. var ve versionEdit ls, err := fs.List(dirname) if err != nil { return nil, err } var logFiles fileNumAndNameSlice for _, filename := range ls { ft, fn, ok := parseDBFilename(filename) if ok && ft == fileTypeLog && (fn >= d.versions.logNumber || fn == d.versions.prevLogNumber) { logFiles = append(logFiles, fileNumAndName{fn, filename}) } } sort.Sort(logFiles) for _, lf := range logFiles { // 根据日志文件重做日志中记录的操作,先将这些操作记录存入一个临时的memtable中,然后转存入磁盘上level0存储文件中 maxSeqNum, err := d.replayLogFile(&ve, fs, filepath.Join(dirname, lf.name)) if err != nil { return nil, err } d.versions.markFileNumUsed(lf.num) // 设置最新的操作序列号 if d.versions.lastSequence < maxSeqNum { d.versions.lastSequence = maxSeqNum } } // Create an empty .log file. // 创建一个新的空log文件 ve.logNumber = d.versions.nextFileNum() d.logNumber = ve.logNumber logFile, err := fs.Create(dbFilename(dirname, fileTypeLog, ve.logNumber)) if err != nil { return nil, err } defer func() { if logFile != nil { logFile.Close() } }() d.log = record.NewWriter(logFile) // Write a new manifest to disk. // 根据前面重做日志得到的ve的信息创建一个新的manifest文件 // 并在CURRENT文件中指向这个新manifest文件 if err := d.versions.logAndApply(dirname, &ve); err != nil { return nil, err } d.deleteObsoleteFiles() // 尝试调度compaction d.maybeScheduleCompaction() d.logFile, logFile = logFile, nil d.fileLock, fileLock = fileLock, nil return d, nil }
// Open opens a LevelDB whose files live in the given directory. func Open(dirname string, opts *db.Options) (*DB, error) { d := &DB{ dirname: dirname, opts: opts, icmp: internalKeyComparer{opts.GetComparer()}, } fs := opts.GetFileSystem() // Lock the database directory. err := fs.MkdirAll(dirname, 0755) if err != nil { return nil, err } fileLock, err := fs.Lock(dbFilename(dirname, fileTypeLock, 0)) if err != nil { return nil, err } defer func() { if fileLock != nil { fileLock.Close() } }() // TODO: add options for CreateIfMissing and ErrorIfExists, and check them here. // Load the version set. err = d.versions.load(dirname, opts) if err != nil { return nil, err } // Replay any newer log files than the ones named in the manifest. var ve versionEdit ls, err := fs.List(dirname) if err != nil { return nil, err } var logFiles fileNumAndNameSlice for _, filename := range ls { n := logFileNum(filename) if n != 0 && (n >= d.versions.logNumber || n == d.versions.prevLogNumber) { logFiles = append(logFiles, fileNumAndName{n, filename}) } } sort.Sort(logFiles) for _, lf := range logFiles { maxSeqNum, err := d.replayLogFile(&ve, fs, filepath.Join(dirname, lf.name)) if err != nil { return nil, err } d.versions.markFileNumUsed(lf.num) if d.versions.lastSequence < maxSeqNum { d.versions.lastSequence = maxSeqNum } } // Create an empty .log file. ve.logNumber = d.versions.nextFileNum() logFile, err := fs.Create(dbFilename(dirname, fileTypeLog, ve.logNumber)) if err != nil { return nil, err } defer func() { if logFile != nil { logFile.Close() } }() d.log = record.NewWriter(logFile) // Write a new manifest to disk. if err := d.versions.logAndApply(dirname, &ve); err != nil { return nil, err } // TODO: delete obsolete files. // TODO: maybe schedule compaction? d.logFile, logFile = logFile, nil d.fileLock, fileLock = fileLock, nil return d, nil }
// Open opens a LevelDB whose files live in the given directory. func Open(dirname string, opts *db.Options) (*DB, error) { d := &DB{ dirname: dirname, opts: opts, icmp: internalKeyComparer{opts.GetComparer()}, pendingOutputs: make(map[uint64]struct{}), } if opts != nil { d.icmpOpts = *opts } d.icmpOpts.Comparer = d.icmp tableCacheSize := opts.GetMaxOpenFiles() - numNonTableCacheFiles if tableCacheSize < minTableCacheSize { tableCacheSize = minTableCacheSize } d.tableCache.init(dirname, opts.GetFileSystem(), &d.icmpOpts, tableCacheSize) d.mem = memdb.New(&d.icmpOpts) d.compactionCond = sync.Cond{L: &d.mu} fs := opts.GetFileSystem() d.mu.Lock() defer d.mu.Unlock() // Lock the database directory. err := fs.MkdirAll(dirname, 0755) if err != nil { return nil, err } fileLock, err := fs.Lock(dbFilename(dirname, fileTypeLock, 0)) if err != nil { return nil, err } defer func() { if fileLock != nil { fileLock.Close() } }() if _, err := fs.Stat(dbFilename(dirname, fileTypeCurrent, 0)); os.IsNotExist(err) { // Create the DB if it did not already exist. if err := createDB(dirname, opts); err != nil { return nil, err } } else if err != nil { return nil, fmt.Errorf("leveldb: database %q: %v", dirname, err) } else if opts.GetErrorIfDBExists() { return nil, fmt.Errorf("leveldb: database %q already exists", dirname) } // Load the version set. err = d.versions.load(dirname, opts) if err != nil { return nil, err } // Replay any newer log files than the ones named in the manifest. var ve versionEdit ls, err := fs.List(dirname) if err != nil { return nil, err } var logFiles fileNumAndNameSlice for _, filename := range ls { ft, fn, ok := parseDBFilename(filename) if ok && ft == fileTypeLog && (fn >= d.versions.logNumber || fn == d.versions.prevLogNumber) { logFiles = append(logFiles, fileNumAndName{fn, filename}) } } sort.Sort(logFiles) for _, lf := range logFiles { maxSeqNum, err := d.replayLogFile(&ve, fs, filepath.Join(dirname, lf.name)) if err != nil { return nil, err } d.versions.markFileNumUsed(lf.num) if d.versions.lastSequence < maxSeqNum { d.versions.lastSequence = maxSeqNum } } // Create an empty .log file. ve.logNumber = d.versions.nextFileNum() d.logNumber = ve.logNumber logFile, err := fs.Create(dbFilename(dirname, fileTypeLog, ve.logNumber)) if err != nil { return nil, err } defer func() { if logFile != nil { logFile.Close() } }() d.log = record.NewWriter(logFile) // Write a new manifest to disk. if err := d.versions.logAndApply(dirname, &ve); err != nil { return nil, err } d.deleteObsoleteFiles() d.maybeScheduleCompaction() d.logFile, logFile = logFile, nil d.fileLock, fileLock = fileLock, nil return d, nil }