// writeLevel0Table writes a memtable to a level-0 on-disk table.
//
// If no error is returned, it adds the file number of that on-disk table to
// d.pendingOutputs. It is the caller's responsibility to remove that fileNum
// from that set when it has been applied to d.versions.
//
// d.mu must be held when calling this, but the mutex may be dropped and
// re-acquired during the course of this method.
func (d *DB) writeLevel0Table(fs db.FileSystem, mem *memdb.MemDB) (meta fileMetadata, err error) {
	// meta records the metadata of the newly created level-0 table file.
	meta.fileNum = d.versions.nextFileNum()
	// filename is the on-disk name of the new table file.
	filename := dbFilename(d.dirname, fileTypeTable, meta.fileNum)
	d.pendingOutputs[meta.fileNum] = struct{}{}
	defer func(fileNum uint64) {
		// On failure (err != nil), drop the new file's number from
		// d.pendingOutputs again. While the write is in progress, its
		// presence in that set is what keeps a concurrent sweep of obsolete
		// db files from deleting the half-written table; on success, the
		// caller removes the entry once the file is recorded in d.versions.
		if err != nil {
			delete(d.pendingOutputs, fileNum)
		}
	}(meta.fileNum)

	// Release the d.mu lock while doing I/O.
	// Note the unusual order: Unlock and then Lock.
	d.mu.Unlock()
	defer d.mu.Lock()

	var (
		file db.File
		tw   *table.Writer
		iter db.Iterator
	)
	defer func() {
		if iter != nil {
			err = firstError(err, iter.Close())
		}
		if tw != nil {
			err = firstError(err, tw.Close())
		}
		if file != nil {
			err = firstError(err, file.Close())
		}
		if err != nil {
			fs.Remove(filename)
			meta = fileMetadata{}
		}
	}()

	file, err = fs.Create(filename)
	if err != nil {
		return fileMetadata{}, err
	}
	// tw wraps the on-disk table file with the table-writing logic.
	tw = table.NewWriter(file, &db.Options{
		Comparer: d.icmp,
	})

	// Find returns an iterator over the data in mem (here, d.imm). A
	// memtable organizes its data in a skiplist, so iteration is ordered and
	// the first key yielded is the smallest key currently in the memtable.
	iter = mem.Find(nil, nil)
	iter.Next()
	// meta.smallest records the smallest internal key in the new table file.
	// User keys were already wrapped as internal keys when they were written
	// to the memtable, so the internalKey(...) conversion changes no bytes;
	// it is needed only to call the clone method. clone copies the key so
	// that meta does not alias a buffer owned by the iterator.
	meta.smallest = internalKey(iter.Key()).clone()
	for {
		// The key seen on the final pass of the loop is the largest key. No
		// conversion is needed here (unlike for meta.smallest above) because
		// the unnamed []byte returned by iter.Key() is directly assignable
		// to the named internalKey type of the meta.largest field.
		meta.largest = iter.Key()
		// Write the key/value pair to the new table file.
		if err1 := tw.Set(meta.largest, iter.Value(), nil); err1 != nil {
			return fileMetadata{}, err1
		}
		// Stop once every entry in the memtable has been written out.
		if !iter.Next() {
			break
		}
	}
	// Clone once, after the loop, rather than on every iteration.
	meta.largest = meta.largest.clone()

	if err1 := iter.Close(); err1 != nil {
		iter = nil
		return fileMetadata{}, err1
	}
	iter = nil

	if err1 := tw.Close(); err1 != nil {
		tw = nil
		return fileMetadata{}, err1
	}
	tw = nil

	// TODO: currently, closing a table.Writer closes its underlying file.
	// We have to re-open the file to Sync or Stat it, which seems stupid.
	file, err = fs.Open(filename)
	if err != nil {
		return fileMetadata{}, err
	}
	if err1 := file.Sync(); err1 != nil {
		return fileMetadata{}, err1
	}
	if stat, err1 := file.Stat(); err1 != nil {
		return fileMetadata{}, err1
	} else {
		size := stat.Size()
		if size < 0 {
			return fileMetadata{}, fmt.Errorf("leveldb: table file %q has negative size %d", filename, size)
		}
		// Record the file's size in meta.size.
		meta.size = uint64(size)
	}
	// TODO: compaction stats.

	// At this point all four fields of meta have been filled in:
	//   - fileNum
	//   - smallest
	//   - largest
	//   - size
	return meta, nil
}
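The deferred cleanup above funnels every Close error through firstError, so an error from the main body wins over any secondary error raised while closing. The helper itself is not shown in this walkthrough; a minimal sketch consistent with how it is used here (the body is assumed, not quoted from the package) would be:

func firstError(err0, err1 error) error {
	// Prefer the earlier error: a failure in the function body should not
	// be masked by a follow-on failure during cleanup.
	if err0 != nil {
		return err0
	}
	return err1
}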
For comparison, an earlier revision of writeLevel0Table from before the pendingOutputs set existed; its first TODO spells out exactly what that set protects against:

func (d *DB) writeLevel0Table(fs db.FileSystem, mem *memdb.MemDB) (meta fileMetadata, err error) {
	meta.fileNum = d.versions.nextFileNum()
	filename := dbFilename(d.dirname, fileTypeTable, meta.fileNum)

	// TODO: add meta.fileNum to a set of 'pending outputs' so that a
	// concurrent sweep of obsolete db files won't delete the fileNum file.
	// It is the caller's responsibility to remove that fileNum from the
	// set of pending outputs.

	var (
		file db.File
		tw   *table.Writer
		iter db.Iterator
	)
	defer func() {
		if iter != nil {
			err = firstError(err, iter.Close())
		}
		if tw != nil {
			err = firstError(err, tw.Close())
		}
		if file != nil {
			err = firstError(err, file.Close())
		}
		if err != nil {
			fs.Remove(filename)
			meta = fileMetadata{}
		}
	}()

	file, err = fs.Create(filename)
	if err != nil {
		return fileMetadata{}, err
	}
	tw = table.NewWriter(file, &db.Options{
		Comparer: d.icmp,
	})

	iter = mem.Find(nil, nil)
	iter.Next()
	meta.smallest = internalKey(iter.Key()).clone()
	for {
		meta.largest = iter.Key()
		if err1 := tw.Set(meta.largest, iter.Value(), nil); err1 != nil {
			return fileMetadata{}, err1
		}
		if !iter.Next() {
			break
		}
	}
	meta.largest = meta.largest.clone()

	if err1 := iter.Close(); err1 != nil {
		iter = nil
		return fileMetadata{}, err1
	}
	iter = nil

	if err1 := tw.Close(); err1 != nil {
		tw = nil
		return fileMetadata{}, err1
	}
	tw = nil

	// TODO: currently, closing a table.Writer closes its underlying file.
	// We have to re-open the file to Sync or Stat it, which seems stupid.
	file, err = fs.Open(filename)
	if err != nil {
		return fileMetadata{}, err
	}
	if err1 := file.Sync(); err1 != nil {
		return fileMetadata{}, err1
	}
	if stat, err1 := file.Stat(); err1 != nil {
		return fileMetadata{}, err1
	} else {
		size := stat.Size()
		if size < 0 {
			return fileMetadata{}, fmt.Errorf("leveldb: table file %q has negative size %d", filename, size)
		}
		meta.size = uint64(size)
	}
	// TODO: compaction stats.

	return meta, nil
}
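Both versions lean on the internalKey type and its clone method, neither of which is defined in this walkthrough. As a hedged sketch of what the code assumes (the real definitions live elsewhere in the package): an internal key is the user key followed by an 8-byte trailer packing the sequence number and record kind, and clone simply copies the bytes out of whatever buffer they alias.

// internalKey is a user key followed by an 8-byte trailer encoding the
// sequence number and record kind. Converting a []byte with internalKey(...)
// copies nothing and changes no bytes; it only attaches the type's methods.
type internalKey []byte

// clone copies the key so the result no longer aliases a buffer that the
// iterator or memtable may reuse or free.
func (k internalKey) clone() internalKey {
	c := make(internalKey, len(k))
	copy(c, k)
	return c
}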
// compactDiskTables runs a compaction that produces new on-disk tables from
// old on-disk tables.
//
// d.mu must be held when calling this, but the mutex may be dropped and
// re-acquired during the course of this method.
func (d *DB) compactDiskTables(c *compaction) (ve *versionEdit, pendingOutputs []uint64, retErr error) {
	defer func() {
		if retErr != nil {
			for _, fileNum := range pendingOutputs {
				delete(d.pendingOutputs, fileNum)
			}
			pendingOutputs = nil
		}
	}()

	// TODO: track snapshots.
	smallestSnapshot := d.versions.lastSequence

	// Release the d.mu lock while doing I/O.
	// Note the unusual order: Unlock and then Lock.
	d.mu.Unlock()
	defer d.mu.Lock()

	iter, err := compactionIterator(&d.tableCache, d.icmp, c)
	if err != nil {
		return nil, pendingOutputs, err
	}

	// TODO: output to more than one table, if it would otherwise be too large.
	var (
		fileNum  uint64
		filename string
		tw       *table.Writer
	)
	defer func() {
		if iter != nil {
			retErr = firstError(retErr, iter.Close())
		}
		if tw != nil {
			retErr = firstError(retErr, tw.Close())
		}
		if retErr != nil {
			d.opts.GetFileSystem().Remove(filename)
		}
	}()

	currentUkey := make([]byte, 0, 4096)
	hasCurrentUkey := false
	lastSeqNumForKey := internalKeySeqNumMax
	smallest, largest := internalKey(nil), internalKey(nil)
	for iter.Next() {
		// TODO: prioritize compacting d.imm.

		// TODO: support c.shouldStopBefore.

		ikey := internalKey(iter.Key())
		if !ikey.valid() {
			// Do not hide invalid keys.
			currentUkey = currentUkey[:0]
			hasCurrentUkey = false
			lastSeqNumForKey = internalKeySeqNumMax
		} else {
			ukey := ikey.ukey()
			if !hasCurrentUkey || d.icmp.userCmp.Compare(currentUkey, ukey) != 0 {
				// This is the first occurrence of this user key.
				currentUkey = append(currentUkey[:0], ukey...)
				hasCurrentUkey = true
				lastSeqNumForKey = internalKeySeqNumMax
			}

			drop, ikeySeqNum := false, ikey.seqNum()
			if lastSeqNumForKey <= smallestSnapshot {
				drop = true // Rule (A) referenced below.
			} else if ikey.kind() == internalKeyKindDelete &&
				ikeySeqNum <= smallestSnapshot &&
				c.isBaseLevelForUkey(d.icmp.userCmp, ukey) {

				// For this user key:
				// (1) there is no data in higher levels
				// (2) data in lower levels will have larger sequence numbers
				// (3) data in layers that are being compacted here and have
				//     smaller sequence numbers will be dropped in the next
				//     few iterations of this loop (by rule (A) above).
				// Therefore this deletion marker is obsolete and can be dropped.
				drop = true
			}

			lastSeqNumForKey = ikeySeqNum
			if drop {
				continue
			}
		}

		if tw == nil {
			d.mu.Lock()
			fileNum = d.versions.nextFileNum()
			d.pendingOutputs[fileNum] = struct{}{}
			pendingOutputs = append(pendingOutputs, fileNum)
			d.mu.Unlock()

			filename = dbFilename(d.dirname, fileTypeTable, fileNum)
			file, err := d.opts.GetFileSystem().Create(filename)
			if err != nil {
				return nil, pendingOutputs, err
			}
			tw = table.NewWriter(file, &d.icmpOpts)

			smallest = make(internalKey, len(ikey))
			copy(smallest, ikey)
			largest = make(internalKey, 0, 2*len(ikey))
		}
		largest = append(largest[:0], ikey...)
		if err := tw.Set(ikey, iter.Value(), nil); err != nil {
			return nil, pendingOutputs, err
		}
	}

	ve = &versionEdit{
		deletedFiles: map[deletedFileEntry]bool{},
		newFiles: []newFileEntry{
			{
				level: c.level + 1,
				meta: fileMetadata{
					fileNum:  fileNum,
					size:     1,
					smallest: smallest,
					largest:  largest,
				},
			},
		},
	}
	for i := 0; i < 2; i++ {
		for _, f := range c.inputs[i] {
			ve.deletedFiles[deletedFileEntry{
				level:   c.level + i,
				fileNum: f.fileNum,
			}] = true
		}
	}
	return ve, pendingOutputs, nil
}
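To make the two drop rules concrete, here is a self-contained sketch that replays the decision for a single user key, newest entry first, with the smallest snapshot at sequence 90. The entries and sequence numbers are invented for illustration; the two conditions mirror the loop above.

package main

import "fmt"

// entry is a simplified stand-in for one occurrence of a single user key in
// the compaction input, in the order the iterator yields them: newest first.
type entry struct {
	seqNum   uint64
	isDelete bool
}

func main() {
	const (
		seqNumMax        = uint64(1)<<56 - 1 // stand-in for internalKeySeqNumMax
		smallestSnapshot = 90
	)
	// One user key with a live value at seq 100, a deletion marker at
	// seq 80, and an older value at seq 70.
	entries := []entry{{100, false}, {80, true}, {70, false}}

	lastSeqNumForKey := seqNumMax
	isBaseLevel := true // assume no data for this user key in lower levels
	for _, e := range entries {
		drop := false
		if lastSeqNumForKey <= smallestSnapshot {
			// Rule (A): this entry is shadowed by a newer one that is
			// itself at or below the smallest snapshot, so no reader,
			// current or future, can observe it.
			drop = true
		} else if e.isDelete && e.seqNum <= smallestSnapshot && isBaseLevel {
			// The deletion marker is obsolete: everything it could delete
			// is in this compaction and will be dropped by rule (A).
			drop = true
		}
		lastSeqNumForKey = e.seqNum
		fmt.Printf("seq=%d delete=%v drop=%v\n", e.seqNum, e.isDelete, drop)
	}
	// Output:
	// seq=100 delete=false drop=false
	// seq=80 delete=true drop=true
	// seq=70 delete=false drop=true
}

A snapshot at sequence 90 saw this key as deleted (via the marker at 80); after compaction only the value at 100 remains, which that snapshot still cannot see, so the two dropped entries were indeed unobservable.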