Example #1
0
// XXX pulled this out of donut, maybe i should make a zk util lib?

// watchZKChildren mirrors the children of the ZooKeeper node at path into
// children and calls onChange(children) after every refresh.
//
// children is used as a set: only its keys are meaningful, so callers should
// test membership (e.g. with Contains()) rather than read the stored values.
//
// The returned channel stops the watcher: send one byte on it. The channel has
// a buffer of one and is never closed by the watcher, so a single stop request
// neither blocks nor panics, even when the watcher goroutine has already
// exited because re-reading the children failed.
//
// An error is returned only when the initial ChildrenW call fails. Later
// failures are logged and terminate the watcher goroutine.
func watchZKChildren(zk *zookeeper.Conn, path string, children *donut.SafeMap, onChange func(*donut.SafeMap)) (chan byte, error) {
	initial, _, watch, err := zk.ChildrenW(path)
	if err != nil {
		return nil, err
	}

	// Seed the set with the children that exist right now.
	m := children.RangeLock()
	for _, node := range initial {
		m[node] = nil
	}
	children.RangeUnlock()

	kill := make(chan byte, 1)
	// path is passed as an argument so that a '%' in it cannot be interpreted
	// as a formatting verb.
	log.Printf("watching %s len is %d", path, children.Len())

	go func() {
		// kill is deliberately NOT closed here: callers send on it, and a send
		// on a closed channel panics.
		for {
			select {
			case <-kill:
				log.Printf("got kill")
				return
			case event := <-watch:
				if !event.Ok() {
					continue
				}
				// Re-read the children and re-arm the watch in one call.
				nodes, _, next, err := zk.ChildrenW(path)
				if err != nil {
					log.Printf("Error in watchZkChildren: %v", err)
					// XXX I should really provide some way for the client to find out about this error...
					return
				}
				watch = next

				// Mark and sweep: flag every known entry as dead (0), flag the
				// entries ZooKeeper reported as alive (1), then drop the dead.
				m := children.RangeLock()
				for k := range m {
					m[k] = 0
				}
				for _, node := range nodes {
					m[node] = 1
				}
				for k, v := range m {
					if v.(int) == 0 {
						delete(m, k)
					}
				}
				children.RangeUnlock()

				onChange(children)
			}
		}
	}()

	log.Printf("watcher setup on %s", path)
	return kill, nil
}
Example #2
0
// onLoadBarrierChange is the change callback for the load barrier; m holds the
// barrier's current entries.
//
// The barrier is complete once it has one entry per load path of the job. At
// that point the "load" watcher is stopped and forgotten, the coordinator
// moves from LoadState to RunState, and work for step 1 is created in a new
// goroutine. Until then only progress is logged.
func (c *Coordinator) onLoadBarrierChange(m *donut.SafeMap) {
	// Read both sizes once so the comparison and the log line agree.
	want := len(c.graph.job.LoadPaths())
	if got := m.Len(); got != want {
		log.Printf("Load barrier has %d/%d entries", got, want)
		return
	}

	log.Printf("load complete")

	// Signal the watcher only if it is registered: indexing a missing key
	// yields a nil channel, and a send on a nil channel blocks forever.
	if kill := c.watchers["load"]; kill != nil {
		kill <- 1
		delete(c.watchers, "load")
	}

	// The compare-and-swap lets only one caller perform the transition, so a
	// repeated notification cannot start step 1 twice.
	if !atomic.CompareAndSwapInt32(&c.state, LoadState, RunState) {
		log.Println("Could not properly move from LoadState to RunState")
		return
	}
	go c.createStepWork(1)
}
Example #3
0
// onStepBarrierChange is the change callback for the barrier of superstep
// step; m holds the barrier's current entries.
//
// While the barrier has fewer or more entries than there are workers, only
// progress is logged. Once the counts match, the global statistics are reset,
// each entry's JSON payload is fetched from ZooKeeper and its "active" and
// "msgs" counters are summed, the barrier's watcher is stopped and forgotten,
// and the next phase is launched: write work when nothing is active and no
// messages are pending, otherwise the next superstep. m is cleared on the way
// out of the full-barrier path.
//
// A ZooKeeper read error or malformed JSON panics, as does a payload whose
// "active" or "msgs" value is not a JSON number.
// NOTE(review): if no watcher is registered under the barrier name, the send
// below is on a nil channel and blocks forever — confirm callers always
// register the watcher before this can fire.
func (c *Coordinator) onStepBarrierChange(step int, m *donut.SafeMap) {
	if m.Len() != c.workers.Len() {
		log.Printf("step barrier change: %d entries out of %d", m.Len(), c.workers.Len())
		return
	}
	defer m.Clear()

	barrierName := "superstep-" + strconv.Itoa(step)

	// Barrier is full: start a fresh set of global statistics for this step.
	c.graph.globalStat.reset()
	c.graph.globalStat.step = step

	// Fetch and decode every barrier entry, accumulating its counters.
	for entry := range m.GetCopy() {
		raw, _, err := c.zk.Get(path.Join(c.barriersPath, barrierName, entry))
		if err != nil {
			panic(err)
		}
		var report map[string]interface{}
		if err := json.Unmarshal([]byte(raw), &report); err != nil {
			panic(err)
		}
		c.graph.globalStat.active += int(report["active"].(float64))
		c.graph.globalStat.msgs += int(report["msgs"].(float64))
	}

	// Stop the watcher for this barrier and drop its registration.
	c.watchers[barrierName] <- 1
	delete(c.watchers, barrierName)

	if c.graph.globalStat.active != 0 || c.graph.globalStat.msgs != 0 {
		// Still work to do: move on to the next superstep.
		go c.createStepWork(step + 1)
		return
	}
	// Nothing active and no messages in flight: the computation is finished.
	atomic.StoreInt32(&c.state, WriteState)
	go c.createWriteWork()
}
Example #4
0
// onWorkersChange is the change callback for the set of registered workers; m
// holds the current worker ids as keys.
//
// Once the coordinator is past SetupState, membership changes are currently
// ignored (see the TODO below). During setup nothing happens until exactly
// config.InitialWorkers workers are present. Then the coordinator moves to
// PrepareState, assigns partitions by sorted worker id, caches each worker's
// info, dials an RPC client to each worker, moves to LoadState, and creates
// the load work in a new goroutine.
//
// A failed dial is logged and leaves a nil client stored for that worker. The
// "host" and "port" fields of a worker's info must be strings; anything else
// panics on the type assertion.
func (c *Coordinator) onWorkersChange(m *donut.SafeMap) {
	log.Println("workers updated")

	if atomic.LoadInt32(&c.state) > SetupState {
		// TODO: handle membership changes after setup:
		// invalidate current step
		// update partition mapping
		// roll back to last checkpoint
		return
	}
	if m.Len() != c.config.InitialWorkers {
		return
	}

	// go into prepare state
	if !atomic.CompareAndSwapInt32(&c.state, SetupState, PrepareState) {
		log.Println("Could not properly move from SetupState to PrepareState")
		return
	}
	log.Printf("InitialWorkers met, preparing node for work")

	// Everyone is here: snapshot the worker ids under the lock, then sort them
	// so the partition mapping is built from an ordered list.
	lm := m.RangeLock()
	workers := make([]string, 0, len(lm))
	for k := range lm {
		workers = append(workers, k)
	}
	m.RangeUnlock()
	sort.Strings(workers)

	for i, w := range workers {
		c.partitions[i] = w
		if w == c.config.NodeId {
			c.graph.partitionId = i
		}
	}

	// Set up connections to all the other nodes.
	c.cachedWorkerInfo = make(map[string]map[string]interface{}, len(workers))
	c.rpcClients = make(map[string]*rpc.Client, len(workers))
	for _, w := range workers {
		// Pull down worker info for each existing worker.
		info := c.workerInfo(w)
		c.cachedWorkerInfo[w] = info

		addr := net.JoinHostPort(info["host"].(string), info["port"].(string))
		client, err := rpc.DialHTTP("tcp", addr)
		if err != nil {
			// Surface the failure instead of discarding it; the returned
			// client is still stored so the map contents are unchanged.
			log.Printf("could not connect to worker %s at %s: %v", w, addr, err)
		}
		c.rpcClients[w] = client
	}

	// go into loadstate
	if !atomic.CompareAndSwapInt32(&c.state, PrepareState, LoadState) {
		log.Println("Could not properly move from PrepareState to LoadState")
		return
	}
	go c.createLoadWork()
}
Example #5
0
// onWriteBarrierChange is the change callback for the write barrier; m holds
// the barrier's current entries. When the barrier has as many entries as there
// are workers, it logs that the job is ending and sends on c.done. Any other
// count is ignored.
func (c *Coordinator) onWriteBarrierChange(m *donut.SafeMap) {
	if m.Len() != c.workers.Len() {
		return
	}
	log.Println("Write barrier full, ending job")
	c.done <- 1
}