func (c *Coordinator) onStepBarrierChange(step int, m *donut.SafeMap) { if m.Len() == c.workers.Len() { defer m.Clear() barrierName := "superstep-" + strconv.Itoa(step) // the barrier is full, collect information and launch the next step c.graph.globalStat.reset() c.graph.globalStat.step = step // collect and unmarshal data for all entries in the barrier lm := m.GetCopy() for k := range lm { if data, _, err := c.zk.Get(path.Join(c.barriersPath, barrierName, k)); err == nil { var info map[string]interface{} if err := json.Unmarshal([]byte(data), &info); err != nil { panic(err) } c.graph.globalStat.active += int(info["active"].(float64)) c.graph.globalStat.msgs += int(info["msgs"].(float64)) } else { panic(err) } } // kill the watcher on this barrier c.watchers[barrierName] <- 1 delete(c.watchers, barrierName) if c.graph.globalStat.active == 0 && c.graph.globalStat.msgs == 0 { atomic.StoreInt32(&c.state, WriteState) go c.createWriteWork() } else { go c.createStepWork(step + 1) } } else { log.Printf("step barrier change: %d entries out of %d", m.Len(), c.workers.Len()) } }
// XXX pulled this out of donut, maybe i should make a zk util lib? // Watch the children at path until a byte is sent on the returned channel // Uses the SafeMap more like a set, so you'll have to use Contains() for entries func watchZKChildren(zk *zookeeper.Conn, path string, children *donut.SafeMap, onChange func(*donut.SafeMap)) (chan byte, error) { initial, _, watch, err := zk.ChildrenW(path) if err != nil { return nil, err } m := children.RangeLock() for _, node := range initial { m[node] = nil } children.RangeUnlock() kill := make(chan byte, 1) log.Printf("watching "+path+" len is %d", children.Len()) go func() { defer close(kill) var nodes []string var err error for { select { case <-kill: // close(watch) log.Printf("got kill") return case event := <-watch: if !event.Ok() { continue } // close(watch) nodes, _, watch, err = zk.ChildrenW(path) if err != nil { log.Printf("Error in watchZkChildren: %v", err) // XXX I should really provide some way for the client to find out about this error... return } m := children.RangeLock() // mark all dead for k := range m { m[k] = 0 } for _, node := range nodes { m[node] = 1 } for k, v := range m { if v.(int) == 0 { delete(m, k) } } children.RangeUnlock() onChange(children) } } }() log.Printf("watcher setup on %s", path) return kill, nil }
func (c *Coordinator) onLoadBarrierChange(m *donut.SafeMap) { if m.Len() == len(c.graph.job.LoadPaths()) { log.Printf("load complete") c.watchers["load"] <- 1 delete(c.watchers, "load") if !atomic.CompareAndSwapInt32(&c.state, LoadState, RunState) { log.Println("Could not properly move from LoadState to RunState") return } go c.createStepWork(1) } else { log.Printf("Load barrier has %d/%d entries", m.Len(), len(c.graph.job.LoadPaths())) } }
func (c *Coordinator) onWorkersChange(m *donut.SafeMap) { log.Println("workers updated") if atomic.LoadInt32(&c.state) > SetupState { // invalidate current step // update partition mapping // roll back to last checkpoint } else { if m.Len() == c.config.InitialWorkers { // go into prepare state if !atomic.CompareAndSwapInt32(&c.state, SetupState, PrepareState) { log.Println("Could not properly move from SetupState to PrepareState") return } log.Printf("InitialWorkers met, preparing node for work") // everyone is here, create the partition mapping lm := m.RangeLock() var workers []string for k := range lm { workers = append(workers, k) } m.RangeUnlock() sort.Strings(workers) for i := 0; i < len(workers); i++ { c.partitions[i] = workers[i] if workers[i] == c.config.NodeId { c.graph.partitionId = i } } // set up connections to all the other nodes c.cachedWorkerInfo = make(map[string]map[string]interface{}) c.rpcClients = make(map[string]*rpc.Client) for _, w := range workers { // pull down worker info for all of the existing workers c.cachedWorkerInfo[w] = c.workerInfo(w) c.rpcClients[w], _ = rpc.DialHTTP("tcp", net.JoinHostPort(c.cachedWorkerInfo[w]["host"].(string), c.cachedWorkerInfo[w]["port"].(string))) } // go into loadstate if !atomic.CompareAndSwapInt32(&c.state, PrepareState, LoadState) { log.Println("Could not properly move from PrepareState to LoadState") return } go c.createLoadWork() } } }
func (c *Coordinator) onWriteBarrierChange(m *donut.SafeMap) { if m.Len() == c.workers.Len() { log.Println("Write barrier full, ending job") c.done <- 1 } }