// 读取数据,外部需要准备好够存放的desBuf func (this *BigFile) Read(i BigFileIndex, desBuf []byte) error { if i.FileNo >= this.bigfileStat.FileCnt { return log.Error("BigFile.Read FileNo[%d] Error", i.FileNo) } if i.Length > uint32(len(desBuf)) { return log.Error("BigFile.Read BigFileIndex.Length[%d] > len(desBuf)[%d]", i.Length, uint32(len(desBuf))) } var f *os.File if i.FileNo == this.bigfileStat.FileCnt-1 { f = this.readwriteFile } else { f = this.readOnlyFile[i.FileNo] } n, err := f.ReadAt(desBuf[:i.Length], int64(i.Offset)) if err == io.EOF { if uint32(n) == i.Length { // 刚刚好读完 return nil } } if uint32(n) != i.Length { return log.Error("Read Length Error offset[%d] destBuf len[%d],ReadAt len[%d]", i.Offset, i.Length, n) } if err != nil { return log.Error("ReadAt file", err.Error()) } return nil }
// 追加数据,返回追加数据的存储信息.不可并发进行写操作. func (this *BigFile) Append(buf []byte) (*BigFileIndex, error) { f, err := this.getRwFile() if err != nil { return nil, err } i := BigFileIndex{} i.FileNo = this.bigfileStat.FileCnt - 1 i.Length = uint32(len(buf)) off, err := this.readwriteFile.Seek(0, 1) i.Offset = uint32(off) if i.Offset != this.bigfileStat.LastFileOffset { return nil, log.Error("BigFile.Append getOffset[%d] LastFileOffset[%d]", i.Offset, this.bigfileStat.LastFileOffset) } n, err := f.Write(buf) if err != nil { return nil, log.Error("BigFile.Append write fail : %s", err.Error()) } if uint32(n) != i.Length { // 写成功,但是写入长度跟期望对不上 // 回滚文件指针 this.readwriteFile.Seek(int64(i.Offset), 0) return nil, log.Error("BigFile.Append write succ bug length error : %s", err.Error()) } // 更新状态文件 this.bigfileStat.LastFileOffset = i.Offset + i.Length this.saveStatFile() return &i, nil }
// 检索模式运行 func (this *Goose) searchModeRun() { log.Debug("run in search mode") if this.searchSty == nil { log.Error("Please set search strategy,see Goose.SetSearchStrategy()") return } if this.indexSty == nil { log.Warn("can't build index real time witout Index Strategy") } gooseSearch := NewGooseSearch() err := gooseSearch.Init(this.confPath, this.indexSty, this.searchSty) if err != nil { log.Error(err) return } log.Debug("goose search init succ") err = gooseSearch.Run() if err != nil { log.Error(err) return } }
// 读取Data数据,可以并发. func (this *DataManager) ReadData(inId InIdType, buf *Data) error { if inId < 1 || inId > this.dataStatus.MaxInId { return log.Error("inId [%d] illegal MaxInId[%d]", inId, this.dataStatus.MaxInId) } // 读一级索引 bigFileI, err := this.readData0(inId) if err != nil { return err } if bigFileI.Length == 0 { return log.Error("Read data0 inId[%d],fileNo[%d],length[%d],offset[%d]", inId, bigFileI.FileNo, bigFileI.Length, bigFileI.Offset) } // 读二级索引 if bigFileI.Length > uint32(buf.Len()) { *buf = NewData(int(bigFileI.Length)) } err = this.data1.Read(bigFileI, *buf) if err != nil { return err } return nil }
// 打开已存在的大文件,如果不存在,直接返回错误 func (this *BigFile) Open(path string, name string) error { // 是打开已有数据文件状态 this.fileModel = bigFileModelOpen this.filePath = path this.fileName = name this.statFileFullPath = filepath.Join(this.filePath, fmt.Sprintf("%s%s", this.fileName, statFileSuffix)) // 解析获取文件信息 err := this.parseStatFile() if err != nil { return log.Warn(err) } // 检验状态文件 if this.bigfileStat.SuggestFileSize == 0 { return log.Error("BigFile.Open stat file error") } // 除了最后一个文件,其它以只读方式打开 readOnlyFileCnt := uint8(0) if this.bigfileStat.FileCnt > 0 { readOnlyFileCnt = this.bigfileStat.FileCnt - 1 } this.readOnlyFile = make([]*os.File, readOnlyFileCnt) for i := 0; uint8(i) < readOnlyFileCnt; i++ { f, err := this.openRoFile(uint8(i)) if err != nil { return err } this.readOnlyFile[i] = f // 校验这些只读文件的大小,他们肯定是大于等于配置才对 // TODO } // 最后一个文件已读写方式打开 if this.bigfileStat.FileCnt > 0 { err = this.openRwFile(this.bigfileStat.FileCnt - 1) if err != nil { return err } // 设置文件指针 this.readwriteFile.Seek(int64(this.bigfileStat.LastFileOffset), 0) // 最后一个文件的文件指针应该就是文件大小 sz, _ := FileSize(this.readwriteFile) if sz != int64(this.bigfileStat.LastFileOffset) { return log.Error("BigFile.Open", "FileStatInfo Error LastFileOffset:[%d] != FileSize:[%d]", this.bigfileStat.LastFileOffset, sz) } } else { this.readwriteFile = nil } return nil }
// 根据唯一外部ID,分配内部ID,可并发内部有锁控制按顺序分配 func (this *DBSearcher) AllocID(outID OutIdType) (InIdType, error) { if this.varIndex == nil { return 0, log.Error("No Var Index") } if this.idMgr == nil { return 0, log.Error("no id manager") } return this.idMgr.AllocID(outID) }
// 写入Value数据,可并发写入. func (this *DBSearcher) WriteValue(InID InIdType, v Value) error { if this.varIndex == nil { return log.Error("No Var Index") } if this.valueMgr == nil { return log.Error("no value manager") } return this.valueMgr.WriteValue(InID, v) }
// 写入Data数据,可并发调用. func (this *DBSearcher) WriteData(InID InIdType, d Data) error { if this.varIndex == nil { return log.Error("No Var Index") } if this.dataMgr == nil { return log.Error("no data manager") } // dataMgr内部锁控制,并发写顺序写入 return this.dataMgr.Append(InID, d) }
// 创建全新的磁盘索引,初始化后只允许进行索引写入. // maxFileSz 索引大文件单个文件的最大大小. // MaxTermCnt 是预期要写入的term的总数量. func (this *DiskIndex) Init(path string, name string, maxFileSz uint32, MaxTermCnt int64) error { this.lock.Lock() defer this.lock.Unlock() if this.indexStatus != DiskIndexInit { return log.Error("index status error") } if len(path) == 0 || len(name) == 0 { return log.Error("path[%s] name[%s] error") } this.filePath = path this.fileName = name this.StatusFilePath = filepath.Join(this.filePath, fmt.Sprintf("%s.index.stat", this.fileName)) // 磁盘状态文件需要设置的两个步骤:(1)指示要写入的结构;(2)设置写入路径 this.SelfStatus = &this.diskStatus this.diskStatus.MaxTermCount = MaxTermCnt // 初始化三级索引 this.index3 = &BigFile{} ind3name := fmt.Sprintf("%s.index3", this.fileName) err := this.index3.Init(this.filePath, ind3name, maxFileSz) if err != nil { return log.Error(err) } // 打开二级索引 ind2name := filepath.Join(this.filePath, fmt.Sprintf("%s.index2", this.fileName)) // 打开新文件,创建|截断|只写 this.index2, err = os.OpenFile(ind2name, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) if err != nil { return log.Error(err) } // 计算预期一级索引大小 index1Sz := this.diskStatus.MaxTermCount * int64(binary.Size(TermSign(0))) // 打开一级索引 this.index1 = new(MmapFile) ind1name := fmt.Sprintf("%s.index1", this.fileName) err = this.index1.OpenFile(this.filePath, ind1name, uint32(index1Sz)) if err != nil { return log.Error("mmap open[%s] size[%d] fail : %s", ind1name, index1Sz, err) } this.indexStatus = DiskIndexWriteOnly return this.SaveJsonFile() }
func (this *GooseSearch) runIndexServer(listenPort int, requestBufSize int) error { if 0 == listenPort || 0 == requestBufSize { return log.Error("arg error istenPort[%d] requestBufSize[%d]", listenPort, requestBufSize) } if this.varIndexer == nil { return nil } listener, err := net.Listen("tcp", fmt.Sprintf("localhost:%d", listenPort)) if err != nil { log.Error("runIndexServer listen fail : %s", err.Error()) return err } // 简单一个协程完成接受请求和完成处理.索引更新不要求高并发性. go func() { reqbuf := make([]byte, requestBufSize) for { var reqlen int conn, err := listener.Accept() if err != nil { log.Warn("IndexServer accept fail : %s", err.Error()) goto LabelError } // receive data reqlen, err = conn.Read(reqbuf) if err != nil { log.Warn("IndexSearcher read fail : %s", err.Error()) goto LabelError } // index err = this.varIndexer.BuildIndex(NewBufferIterOnce(reqbuf[:reqlen])) if err != nil { log.Warn("IndexSearcher BuildIndex fail : %s", err.Error()) goto LabelError } LabelError: conn.Close() } }() return nil }
// 读取offset开始的destSz个字节作为数字返回 func (this *MmapFile) ReadNum(offset uint32, destSz uint32) (uint64, error) { if destSz == 0 { return 0, log.Error("Mmapfile.ReadNum not a basic num") } if int64(offset)+int64(destSz) > int64(len(this.fileMmap)) { return 0, log.Error("Mmapfile.ReadNum over length limit") } buf := this.fileMmap[offset : offset+uint32(destSz)] order := binary.BigEndian switch destSz { case 1: return uint64(buf[0]), nil case 2: return uint64(order.Uint16(buf)), nil case 4: return uint64(order.Uint32(buf)), nil case 8: return uint64(order.Uint64(buf)), nil default: return 0, log.Error("MmapFile.ReadNum Wrong Type") } return 0, nil /* switch v := n.(type) { case *int8: *v = int8(buf[0]) case *uint8: *v = buf[0] case *int16: *v = int16(order.Uint16(buf)) case *uint16: *v = uint16(order.Uint16(buf)) case *int32: *v = int32(order.Uint32(buf)) case *uint32: *v = uint32(order.Uint32(buf)) case *int64: *v = int64(order.Uint64(buf)) case *uint64: *v = uint64(order.Uint64(buf)) default: return NewGooseError("MmapFile.ReadNum","Wrong Type","") } return nil */ }
// 写入索引,不可并发写入 func (this *DBBuilder) WriteIndex(InID InIdType, termlist []TermInDoc) error { if this.transformMgr == nil { return log.Error("no transform manager") } return this.transformMgr.WriteIndex(InID, termlist) }
// 写入Value数据,可并发写入 func (this *DBBuilder) WriteValue(InID InIdType, v Value) error { if this.valueMgr == nil { return log.Error("no value manager") } return this.valueMgr.WriteValue(InID, v) }
func (this *StaticIndexer) parseDoc() { // context context := NewStyContext() // 一直从chan中获取doc,直到这个chan被close for doc := range this.parseDocChan { var err error // parse parseRes := &docParsed{} parseRes.outId, parseRes.termList, parseRes.value, parseRes.data, err = this.strategy.ParseDoc(doc, context) if err != nil { log.Error(err) parseRes = nil } // 打印策略日志 context.Log.PrintAllInfo() // toWriteDbQueue是待写入db的队列. // 阻塞等待队列有空余位置然后写入队列. this.writeDbQueue <- parseRes } log.Info("Finish parseDoc , goroutine exit.") }
// read bytes (reference) func (this *MmapFile) ReadBytes(offset uint32, length uint32) ([]byte, error) { if uint64(offset+length) > uint64(len(this.fileMmap)) { return nil, log.Error("Mmapfile.ReadBytes over length limit") } return this.fileMmap[offset : offset+length], nil }
// 分配内部id func (this *IdManager) AllocID(outId OutIdType) (InIdType, error) { this.lock.Lock() defer this.lock.Unlock() if outId == 0 { return 0, log.Warn("illegal outId [%d]", 0) } if this.idStatus.CurId >= this.idStatus.MaxInId { return 0, log.Error("InId [%d] out of limit MaxInId[%d]", this.idStatus.CurId, this.idStatus.MaxInId) } inID := this.idStatus.CurId // 分配信息,写入mmap offset := inID * idSize err := this.mfile.WriteNum(uint32(offset), uint32(outId)) if err != nil { return 0, err } // 确认分配成功才真正占用这个id this.idStatus.CurId++ return inID, nil }
// 全新初始化数据文件 func (this *DataManager) Init(path string, maxId InIdType, maxFileSz uint32) error { this.dataStatus.MaxInId = maxId this.maxDataFileSize = maxFileSz this.filePath = path // 磁盘状态文件需要设置的两个步骤:(1)指示要写入的结构;(2)设置写入路径 this.SelfStatus = &this.dataStatus this.StatusFilePath = filepath.Join(this.filePath, "data.stat") // 一级索引mmap打开 // id有效范围[1,MaxInId],0不使用导致后面要多分配一个空间 data0Size := uint32(1+this.dataStatus.MaxInId) * uint32(binary.Size(BigFileIndex{})) data0Name := fmt.Sprintf("data.d0") err := this.data0.OpenFile(this.filePath, data0Name, data0Size) if err != nil { return log.Error("mmap open[%s] size[%d] fail : %s", data0Name, data0Size, err) } // 二级索引BigFile打开 this.data1 = new(BigFile) data1Name := fmt.Sprintf("data.d1") err = this.data1.Init(this.filePath, data1Name, this.maxDataFileSize) if err != nil { return err } return this.SaveJsonFile() }
func (this *ValueManager) Init(path string, maxId InIdType, valueSz uint32) error { this.lock.Lock() defer this.lock.Unlock() this.filePath = path this.valueStatus.MaxInId = maxId this.valueStatus.ValueSize = valueSz // 磁盘状态文件需要设置的两个步骤:(1)指示要写入的结构;(2)设置写入路径 this.SelfStatus = &this.valueStatus this.StatusFilePath = filepath.Join(this.filePath, "value.stat") this.fileValueMaxCnt = uint32(maxValueFileSize / this.valueStatus.ValueSize) this.fileCnt = uint32(uint32(maxId)/this.fileValueMaxCnt) + 1 this.mfile = make([]MmapFile, this.fileCnt) // 分配磁盘空间 for i := 0; uint32(i) < this.fileCnt; i++ { tname := fmt.Sprintf("value.n%d", i) sz := uint32(this.fileValueMaxCnt * this.valueStatus.ValueSize) err := this.mfile[i].OpenFile(path, tname, sz) if err != nil { return log.Error("open mfile[%d],szie[%d] fail[%s]", i, sz, err.Error()) } } return this.SaveJsonFile() }
// 打开已存在的数据文件 func (this *DataManager) Open(path string) error { this.filePath = path // 磁盘状态文件需要设置的两个步骤:(1)指示要写入的结构;(2)设置写入路径 this.SelfStatus = &this.dataStatus this.StatusFilePath = filepath.Join(this.filePath, "data.stat") err := this.ParseJsonFile() if err != nil { return err } // 一级索引mmap打开 // id有效范围[1,MaxInId],0不使用导致后面要多分配一个空间 // 打开也要多打开一个单位的空间 data0Size := uint32(1+this.dataStatus.MaxInId) * uint32(binary.Size(BigFileIndex{})) data0Name := fmt.Sprintf("data.d0") err = this.data0.OpenFile(this.filePath, data0Name, data0Size) if err != nil { return log.Error("mmap open[%s] size[%d] fail : %s", data0Name, data0Size, err) } // 二级索引BigFile打开 this.data1 = new(BigFile) data1Name := fmt.Sprintf("data.d1") err = this.data1.Open(this.filePath, data1Name) if err != nil { return err } return nil }
// 写入Data数据,可并发调用,内部锁控制 func (this *DBBuilder) WriteData(InID InIdType, d Data) error { if this.dataMgr == nil { return log.Error("no data manager") } // dataMgr内部锁控制,并发写顺序写入 return this.dataMgr.Append(InID, d) }
// 根据配置文件进行初始化. // 需要外部指定索引策略,策略可以重新设计. // 需要外部知道被索引文件(这个易变信息不适合放配置) func (this *GooseBuild) Init(confPath string, indexSty IndexStrategy, toIndexFile string) (err error) { defer func() { if r := recover(); r != nil { err = log.Error(r) } }() // load conf this.conf, err = config.NewConf(confPath) if err != nil { return } // set max procs maxProcs := int(this.conf.Int64("GooseBuild.MaxProcs")) if maxProcs <= 0 { maxProcs = runtime.NumCPU() } runtime.GOMAXPROCS(maxProcs) // init dbbuilder dbPath := this.conf.String("GooseBuild.DataBase.DbPath") transformMaxTermCnt := this.conf.Int64("GooseBuild.DataBase.TransformMaxTermCnt") maxId := this.conf.Int64("GooseBuild.DataBase.MaxId") maxIndexFileSize := this.conf.Int64("GooseBuild.DataBase.MaxIndexFileSize") maxDataFileSize := this.conf.Int64("GooseBuild.DataBase.MaxDataFileSize") valueSize := this.conf.Int64("GooseBuild.DataBase.ValueSize") this.staticDB = NewDBBuilder() err = this.staticDB.Init(dbPath, int(transformMaxTermCnt), InIdType(maxId), uint32(valueSize), uint32(maxIndexFileSize), uint32(maxDataFileSize)) if err != nil { return } // index strategy global init err = indexSty.Init(this.conf) if err != nil { return } // static indexer this.staticIndexer, err = NewStaticIndexer(this.staticDB, indexSty) if err != nil { return } // open data file this.fileHd, err = os.OpenFile(toIndexFile, os.O_RDONLY, 0644) if err != nil { return } // file iter this.fileIter = NewFileIter(this.fileHd) return nil }
func (this *StaticIndexer) writeDoc() { for parseRes := range this.writeDbQueue { if parseRes == nil { log.Error("get nil pointer from queue") this.finishedWg.Done() continue } // id inId, err := this.db.AllocID(parseRes.outId) if err != nil { log.Error(err) this.finishedWg.Done() continue } // index err = this.db.WriteIndex(inId, parseRes.termList) if err != nil { log.Error(err) this.finishedWg.Done() continue } // value err = this.db.WriteValue(inId, parseRes.value) if err != nil { log.Error(err) this.finishedWg.Done() continue } // data err = this.db.WriteData(inId, parseRes.data) if err != nil { log.Error(err) this.finishedWg.Done() continue } this.finishedWg.Done() } log.Info("Finish writeDoc,goroutine exit.") }
// read bytes (copy slice) func (this *MmapFile) ReadBytesCopy(offset int32, length int32) ([]byte, error) { if int64(offset+length) > int64(len(this.fileMmap)) { return nil, log.Error("Mmapfile.ReadBytes over length limit") } newbuf := make([]byte, length) copy(newbuf, this.fileMmap[offset:offset+length]) return newbuf, nil }
func (this *BigFile) openRoFile(fileno uint8) (*os.File, error) { var err error tname := fmt.Sprintf("%s%s%d", this.fileName, dataFileSuffix, fileno) f, err := os.OpenFile(filepath.Join(this.filePath, tname), os.O_RDONLY, 0644) if err != nil { return nil, log.Error("open readonly file fail : %s", err.Error()) } return f, nil }
// 读取value的引用,value只能进行读操作,任何写操作都是非法的 func (this *ValueManager) ReadValue(inId InIdType) (Value, error) { if inId > this.valueStatus.MaxInId { return nil, log.Error("inId [%d] illegal MaxInId[%d]", inId, this.valueStatus.MaxInId) } fileNo := uint32(int64(inId) / int64(this.fileValueMaxCnt)) offset := uint32(int64(inId)%int64(this.fileValueMaxCnt)) * this.valueStatus.ValueSize if fileNo >= this.fileCnt { return nil, log.Error("inId out of limit") } v, err := this.mfile[fileNo].ReadBytes(offset, this.valueStatus.ValueSize) if err != nil { return nil, err } return v[:], nil }
// 读取索引,每次查询在内部分配一块内存返回 func (this *DiskIndex) ReadIndex(t TermSign) (*InvList, error) { // 读取不加锁 // 打开的磁盘只读索引下才允许读取 if this.indexStatus != DiskIndexReadOnly { return nil, log.Error("DiskIndex.Read status error") } return this.readIndex3(t) }
// 写入Value.可并发写 func (this *ValueManager) WriteValue(inId InIdType, v Value) error { if inId > this.valueStatus.MaxInId { return log.Error("inId [%d] illegal MaxInId[%d]", inId, this.valueStatus.MaxInId) } fileNo := uint32(int64(inId) / int64(this.fileValueMaxCnt)) offset := uint32(int64(inId)%int64(this.fileValueMaxCnt)) * this.valueStatus.ValueSize if fileNo >= this.fileCnt { return log.Error("inId out of limit") } // 最多写入this.valueStatus.ValueSize个字节 err := this.mfile[fileNo].WriteBytes(offset, v[:], this.valueStatus.ValueSize) if err != nil { return err } return nil }
// 写入索引,不可并发写入. func (this *DBSearcher) WriteIndex(InID InIdType, termlist []TermInDoc) error { if this.varIndex == nil { return log.Error("No Var Index") } for _, term := range termlist { l := NewInvList(1) l.Append(Index{InID: InID, Weight: term.Weight}) this.varIndex.WriteIndex(term.Sign, &l) } return nil }
// 根据文件数量打开最后一个可读写文件 func (this *BigFile) openRwFile(fileno uint8) error { var err error tname := fmt.Sprintf("%s%s%d", this.fileName, dataFileSuffix, fileno) this.readwriteFileFullPath = filepath.Join(this.filePath, tname) if this.fileModel == bigFileModelInit { // 全新初始化的,文件打开后直接做截断处理 this.readwriteFile, err = os.OpenFile(this.readwriteFileFullPath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644) } else if this.fileModel == bigFileModelOpen { // 已有文件,追加写 this.readwriteFile, err = os.OpenFile(this.readwriteFileFullPath, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644) } else { return log.Error("File Status Error : [%d]", this.fileModel) } if err != nil { return log.Error("open readwrite file fail : %s", err.Error()) } return nil }
func (this *BigFileIndex) Encode(buf []byte) error { order := binary.BigEndian if len(buf) < 9 { return log.Error("BigFileIndex.Decode buf length [%d] error", len(buf)) } buf[0] = this.FileNo order.PutUint32(buf[1:5], this.Offset) order.PutUint32(buf[5:9], this.Length) return nil }