func (ds *Dataset) validateRefAsCommit(r types.Ref) types.Struct { v := ds.store.ReadValue(r.TargetHash()) d.Exp.True(v != nil, "%v cannot be found", r) d.Exp.True(v.Type().Equals(datas.NewCommit().Type()), "Not a Commit: %+v", v) return v.(types.Struct) }
func (ds *Dataset) validateRefAsCommit(r types.Ref) types.Struct { v := ds.store.ReadValue(r.TargetHash()) if v == nil { panic(r.TargetHash().String() + " not found") } if !datas.IsCommitType(v.Type()) { panic("Not a commit: " + types.EncodedValue(v)) } return v.(types.Struct) }
func traverseSource(srcRef types.Ref, srcDB, sinkDB Database) traverseResult { h := srcRef.TargetHash() if !sinkDB.has(h) { srcBS := srcDB.validatingBatchStore() c := srcBS.Get(h) v := types.DecodeValue(c, srcDB) d.Chk.True(v != nil, "Expected decoded chunk to be non-nil.") sinkDB.validatingBatchStore().SchedulePut(c, srcRef.Height(), types.Hints{}) return traverseResult{h, v.Chunks(), len(c.Data())} } return traverseResult{} }
func traverseCommon(comRef, sinkHead types.Ref, db Database) traverseResult { if comRef.Height() > 1 && isRefOfCommitType(comRef.Type()) { commit := comRef.TargetValue(db).(types.Struct) // We don't want to traverse the parents of sinkHead, but we still want to traverse its Value on the sinkDB side. We also still want to traverse all children, in both the srcDB and sinkDB, of any common Commit that is not at the Head of sinkDB. exclusionSet := types.NewSet() if comRef.Equals(sinkHead) { exclusionSet = commit.Get(ParentsField).(types.Set) } chunks := types.RefSlice(commit.Chunks()) for i := 0; i < len(chunks); { if exclusionSet.Has(chunks[i]) { end := len(chunks) - 1 chunks.Swap(i, end) chunks = chunks[:end] continue } i++ } return traverseResult{comRef.TargetHash(), chunks, 0} } return traverseResult{} }
// SomeChunksP invokes callbacks on every unique chunk reachable from |r| in top-down order. Callbacks are invoked only once for each chunk regardless of how many times the chunk appears. // // |stopCb| is invoked for the types.Ref of every chunk. It can return true to stop SomeChunksP from descending any further. // |chunkCb| is optional, invoked with the chunks.Chunk referenced by |stopCb| if it didn't return true. func SomeChunksP(r types.Ref, bs types.BatchStore, stopCb SomeChunksStopCallback, chunkCb SomeChunksChunkCallback, concurrency int) { rq := newRefQueue() wg := sync.WaitGroup{} mu := sync.Mutex{} visitedRefs := map[hash.Hash]bool{} walkChunk := func(r types.Ref) { defer wg.Done() tr := r.TargetHash() mu.Lock() visited := visitedRefs[tr] visitedRefs[tr] = true mu.Unlock() if visited || stopCb(r) { return } // Try to avoid the cost of reading |c|. It's only necessary if the caller wants to know about every chunk, or if we need to descend below |c| (ref height > 1). var c chunks.Chunk if chunkCb != nil || r.Height() > 1 { c = bs.Get(tr) d.Chk.False(c.IsEmpty()) if chunkCb != nil { chunkCb(r, c) } } if r.Height() == 1 { return } v := types.DecodeValue(c, nil) for _, r1 := range v.Chunks() { wg.Add(1) rq.tail() <- r1 } } iter := func() { for r := range rq.head() { walkChunk(r) } } for i := 0; i < concurrency; i++ { go iter() } wg.Add(1) rq.tail() <- r wg.Wait() rq.close() }
func traverseSink(sinkRef types.Ref, db Database) traverseResult { if sinkRef.Height() > 1 { return traverseResult{sinkRef.TargetHash(), sinkRef.TargetValue(db).Chunks(), 0} } return traverseResult{} }
// Pull objects that descends from sourceRef from srcDB to sinkDB. sinkHeadRef should point to a Commit (in sinkDB) that's an ancestor of sourceRef. This allows the algorithm to figure out which portions of data are already present in sinkDB and skip copying them. func Pull(srcDB, sinkDB Database, sourceRef, sinkHeadRef types.Ref, concurrency int, progressCh chan PullProgress) { srcQ, sinkQ := &types.RefByHeight{sourceRef}, &types.RefByHeight{sinkHeadRef} // We generally expect that sourceRef descends from sinkHeadRef, so that walking down from sinkHeadRef yields useful hints. If it's not even in the srcDB, then just clear out sinkQ right now and don't bother. if !srcDB.has(sinkHeadRef.TargetHash()) { sinkQ.PopBack() } // Since we expect sourceRef to descend from sinkHeadRef, we assume srcDB has a superset of the data in sinkDB. There are some cases where, logically, the code wants to read data it knows to be in sinkDB. In this case, it doesn't actually matter which Database the data comes from, so as an optimization we use whichever is a LocalDatabase -- if either is. mostLocalDB := srcDB if _, ok := sinkDB.(*LocalDatabase); ok { mostLocalDB = sinkDB } // traverseWorker below takes refs off of {src,sink,com}Chan, processes them to figure out what reachable refs should be traversed, and then sends the results to {srcRes,sinkRes,comRes}Chan. // sending to (or closing) the 'done' channel causes traverseWorkers to exit. srcChan := make(chan types.Ref) sinkChan := make(chan types.Ref) comChan := make(chan types.Ref) srcResChan := make(chan traverseResult) sinkResChan := make(chan traverseResult) comResChan := make(chan traverseResult) done := make(chan struct{}) workerWg := &sync.WaitGroup{} defer func() { close(done) workerWg.Wait() close(srcChan) close(sinkChan) close(comChan) close(srcResChan) close(sinkResChan) close(comResChan) }() traverseWorker := func() { workerWg.Add(1) go func() { for { select { case srcRef := <-srcChan: srcResChan <- traverseSource(srcRef, srcDB, sinkDB) case sinkRef := <-sinkChan: sinkResChan <- traverseSink(sinkRef, mostLocalDB) case comRef := <-comChan: comResChan <- traverseCommon(comRef, sinkHeadRef, mostLocalDB) case <-done: workerWg.Done() return } } }() } for i := 0; i < concurrency; i++ { traverseWorker() } var doneCount, knownCount, doneBytes uint64 updateProgress := func(moreDone, moreKnown, moreBytes uint64) { if progressCh == nil { return } doneCount, knownCount, doneBytes = doneCount+moreDone, knownCount+moreKnown, doneBytes+moreBytes progressCh <- PullProgress{doneCount, knownCount + uint64(srcQ.Len()), doneBytes} } // hc and reachableChunks aren't goroutine-safe, so only write them here. hc := hintCache{} reachableChunks := hash.HashSet{} for !srcQ.Empty() { srcRefs, sinkRefs, comRefs := planWork(srcQ, sinkQ) srcWork, sinkWork, comWork := len(srcRefs), len(sinkRefs), len(comRefs) if srcWork+comWork > 0 { updateProgress(0, uint64(srcWork+comWork), 0) } // These goroutines send work to traverseWorkers, blocking when all are busy. They self-terminate when they've sent all they have. go sendWork(srcChan, srcRefs) go sendWork(sinkChan, sinkRefs) go sendWork(comChan, comRefs) // Don't use srcRefs, sinkRefs, or comRefs after this point. The goroutines above own them. for srcWork+sinkWork+comWork > 0 { select { case res := <-srcResChan: for _, reachable := range res.reachables { srcQ.PushBack(reachable) reachableChunks.Insert(reachable.TargetHash()) } if !res.readHash.IsEmpty() { reachableChunks.Remove(res.readHash) } srcWork-- updateProgress(1, 0, uint64(res.readBytes)) case res := <-sinkResChan: for _, reachable := range res.reachables { sinkQ.PushBack(reachable) hc[reachable.TargetHash()] = res.readHash } sinkWork-- case res := <-comResChan: isHeadOfSink := res.readHash == sinkHeadRef.TargetHash() for _, reachable := range res.reachables { sinkQ.PushBack(reachable) if !isHeadOfSink { srcQ.PushBack(reachable) } hc[reachable.TargetHash()] = res.readHash } comWork-- updateProgress(1, 0, uint64(res.readBytes)) } } sort.Sort(sinkQ) sort.Sort(srcQ) sinkQ.Unique() srcQ.Unique() } hints := types.Hints{} for hash := range reachableChunks { if hint, present := hc[hash]; present { hints[hint] = struct{}{} } } sinkDB.validatingBatchStore().AddHints(hints) }