// XXX pulled this out of donut, maybe i should make a zk util lib? // Watch the children at path until a byte is sent on the returned channel // Uses the SafeMap more like a set, so you'll have to use Contains() for entries func watchZKChildren(zk *zookeeper.Conn, path string, children *donut.SafeMap, onChange func(*donut.SafeMap)) (chan byte, error) { initial, _, watch, err := zk.ChildrenW(path) if err != nil { return nil, err } m := children.RangeLock() for _, node := range initial { m[node] = nil } children.RangeUnlock() kill := make(chan byte, 1) log.Printf("watching "+path+" len is %d", children.Len()) go func() { defer close(kill) var nodes []string var err error for { select { case <-kill: // close(watch) log.Printf("got kill") return case event := <-watch: if !event.Ok() { continue } // close(watch) nodes, _, watch, err = zk.ChildrenW(path) if err != nil { log.Printf("Error in watchZkChildren: %v", err) // XXX I should really provide some way for the client to find out about this error... return } m := children.RangeLock() // mark all dead for k := range m { m[k] = 0 } for _, node := range nodes { m[node] = 1 } for k, v := range m { if v.(int) == 0 { delete(m, k) } } children.RangeUnlock() onChange(children) } } }() log.Printf("watcher setup on %s", path) return kill, nil }
func (c *Coordinator) onWorkersChange(m *donut.SafeMap) { log.Println("workers updated") if atomic.LoadInt32(&c.state) > SetupState { // invalidate current step // update partition mapping // roll back to last checkpoint } else { if m.Len() == c.config.InitialWorkers { // go into prepare state if !atomic.CompareAndSwapInt32(&c.state, SetupState, PrepareState) { log.Println("Could not properly move from SetupState to PrepareState") return } log.Printf("InitialWorkers met, preparing node for work") // everyone is here, create the partition mapping lm := m.RangeLock() var workers []string for k := range lm { workers = append(workers, k) } m.RangeUnlock() sort.Strings(workers) for i := 0; i < len(workers); i++ { c.partitions[i] = workers[i] if workers[i] == c.config.NodeId { c.graph.partitionId = i } } // set up connections to all the other nodes c.cachedWorkerInfo = make(map[string]map[string]interface{}) c.rpcClients = make(map[string]*rpc.Client) for _, w := range workers { // pull down worker info for all of the existing workers c.cachedWorkerInfo[w] = c.workerInfo(w) c.rpcClients[w], _ = rpc.DialHTTP("tcp", net.JoinHostPort(c.cachedWorkerInfo[w]["host"].(string), c.cachedWorkerInfo[w]["port"].(string))) } // go into loadstate if !atomic.CompareAndSwapInt32(&c.state, PrepareState, LoadState) { log.Println("Could not properly move from PrepareState to LoadState") return } go c.createLoadWork() } } }