// A non-nil return signals that event processing should stop. func (agent *ActionAgent) dispatchAction(actionPath, data string) error { agent.actionMutex.Lock() defer agent.actionMutex.Unlock() log.Infof("action dispatch %v", actionPath) actionNode, err := actionnode.ActionNodeFromJson(data, actionPath) if err != nil { log.Errorf("action decode failed: %v %v", actionPath, err) return nil } cmd := []string{ agent.vtActionBinFile, "-action", actionNode.Action, "-action-node", actionPath, "-action-guid", actionNode.ActionGuid, } cmd = append(cmd, logutil.GetSubprocessFlags()...) cmd = append(cmd, topo.GetSubprocessFlags()...) cmd = append(cmd, dbconfigs.GetSubprocessFlags()...) cmd = append(cmd, mysqlctl.GetSubprocessFlags()...) log.Infof("action launch %v", cmd) vtActionCmd := exec.Command(cmd[0], cmd[1:]...) stdOut, vtActionErr := vtActionCmd.CombinedOutput() if vtActionErr != nil { log.Errorf("agent action failed: %v %v\n%s", actionPath, vtActionErr, stdOut) // If the action failed, preserve single execution path semantics. return vtActionErr } log.Infof("Agent action completed %v %s", actionPath, stdOut) agent.afterAction(actionPath, actionNode.Action == actionnode.TABLET_ACTION_APPLY_SCHEMA) return nil }
// startFakeTabletActionLoop will start the action loop for a fake tablet, // using mysqlDaemon as the backing mysqld. func startFakeTabletActionLoop(t *testing.T, wr *Wrangler, tabletAlias topo.TabletAlias, mysqlDaemon mysqlctl.MysqlDaemon, done chan struct{}) { go func() { f := func(actionPath, data string) error { actionNode, err := actionnode.ActionNodeFromJson(data, actionPath) if err != nil { t.Fatalf("ActionNodeFromJson failed: %v\n%v", err, data) } ta := tabletmanager.NewTabletActor(nil, mysqlDaemon, wr.ts, tabletAlias) if err := ta.HandleAction(actionPath, actionNode.Action, actionNode.ActionGuid, false); err != nil { // action may just fail for any good reason t.Logf("HandleAction failed for %v: %v", actionNode.Action, err) } // this part would also be done by the agent tablet, err := wr.ts.GetTablet(tabletAlias) if err != nil { t.Logf("Cannot get tablet: %v", err) } else { updatedTablet := tabletmanager.CheckTabletMysqlPort(wr.ts, mysqlDaemon, tablet) if updatedTablet != nil { t.Logf("Updated tablet record") } } return nil } wr.ts.ActionEventLoop(tabletAlias, f, done) }() }
func getActions(wr *wrangler.Wrangler, zconn zk.Conn, actionPath string) ([]*actionnode.ActionNode, error) { actions, _, err := zconn.Children(actionPath) if err != nil { return nil, fmt.Errorf("getActions failed: %v %v", actionPath, err) } sort.Strings(actions) wg := sync.WaitGroup{} mu := sync.Mutex{} nodes := make([]*actionnode.ActionNode, 0, len(actions)) for _, action := range actions { wg.Add(1) go func(action string) { defer wg.Done() actionNodePath := path.Join(actionPath, action) data, _, err := zconn.Get(actionNodePath) if err != nil && !zookeeper.IsError(err, zookeeper.ZNONODE) { wr.Logger().Warningf("getActions: %v %v", actionNodePath, err) return } actionNode, err := actionnode.ActionNodeFromJson(data, actionNodePath) if err != nil { wr.Logger().Warningf("getActions: %v %v", actionNodePath, err) return } mu.Lock() nodes = append(nodes, actionNode) mu.Unlock() }(action) } wg.Wait() return nodes, nil }
// StartActionLoop will start the action loop for a fake tablet, // using ft.FakeMysqlDaemon as the backing mysqld. func (ft *FakeTablet) StartActionLoop(t *testing.T, wr *wrangler.Wrangler) { if ft.Done != nil { t.Fatalf("ActionLoop for %v is already running", ft.Tablet.Alias) } ft.Done = make(chan struct{}, 1) go func() { wr.TopoServer().ActionEventLoop(ft.Tablet.Alias, func(actionPath, data string) error { actionNode, err := actionnode.ActionNodeFromJson(data, actionPath) if err != nil { t.Fatalf("ActionNodeFromJson failed: %v\n%v", err, data) } ta := actor.NewTabletActor(nil, ft.FakeMysqlDaemon, wr.TopoServer(), ft.Tablet.Alias) if err := ta.HandleAction(actionPath, actionNode.Action, actionNode.ActionGuid, false); err != nil { // action may just fail for any good reason t.Logf("HandleAction failed for %v: %v", actionNode.Action, err) } // this part would also be done by the agent tablet, err := wr.TopoServer().GetTablet(ft.Tablet.Alias) if err != nil { t.Logf("Cannot get tablet: %v", err) } else { updatedTablet := actor.CheckTabletMysqlPort(wr.TopoServer(), ft.FakeMysqlDaemon, tablet) if updatedTablet != nil { t.Logf("Updated tablet record") } } return nil }, ft.Done) }() }
// startFakeTabletActionLoop will start the action loop for a fake // tablet. func (fix *Fixture) startFakeTabletActionLoop(tablet *tabletPack) { go func() { f := func(actionPath, data string) error { actionNode, err := actionnode.ActionNodeFromJson(data, actionPath) if err != nil { fix.Fatalf("ActionNodeFromJson failed: %v\n%v", err, data) } ta := actor.NewTabletActor(nil, tablet.mysql, fix.Topo, tablet.Alias) if err := ta.HandleAction(actionPath, actionNode.Action, actionNode.ActionGuid, false); err != nil { // action may just fail for any good reason fix.Logf("HandleAction failed for %v: %v", actionNode.Action, err) } return nil } fix.Topo.ActionEventLoop(tablet.Alias, f, fix.done) }() }
func staleActions(zkts *zktopo.Server, zkActionPath string, maxStaleness time.Duration) ([]*actionnode.ActionNode, error) { // get the stale strings actionNodes, err := zkts.StaleActions(zkActionPath, maxStaleness, actionnode.ActionNodeIsStale) if err != nil { return nil, err } // convert to ActionNode staleActions := make([]*actionnode.ActionNode, len(actionNodes)) for i, actionNodeStr := range actionNodes { actionNode, err := actionnode.ActionNodeFromJson(actionNodeStr, "") if err != nil { return nil, err } staleActions[i] = actionNode } return staleActions, nil }
func WaitForCompletion(ts topo.Server, actionPath string, waitTime time.Duration) (interface{}, error) { // If there is no duration specified, block for a sufficiently long time if waitTime <= 0 { waitTime = 24 * time.Hour } data, err := ts.WaitForTabletAction(actionPath, waitTime, interrupted) if err != nil { return nil, err } // parse it actionNode, dataErr := actionnode.ActionNodeFromJson(data, "") if dataErr != nil { return nil, fmt.Errorf("action data error: %v %v %#v", actionPath, dataErr, data) } else if actionNode.Error != "" { return nil, fmt.Errorf("action failed: %v %v", actionPath, actionNode.Error) } return actionNode.Reply, nil }
// This function should be protected from unforseen panics, as // dispatchAction will catch everything. The rest of the code in this // function should not panic. func (ta *TabletActor) HandleAction(actionPath, action, actionGuid string, forceRerun bool) error { tabletAlias, data, version, err := ta.ts.ReadTabletActionPath(actionPath) ta.tabletAlias = tabletAlias actionNode, err := actionnode.ActionNodeFromJson(data, actionPath) if err != nil { log.Errorf("HandleAction failed unmarshaling %v: %v", actionPath, err) return err } switch actionNode.State { case actionnode.ACTION_STATE_RUNNING: // see if the process is still running, and if so, wait for it proc, _ := os.FindProcess(actionNode.Pid) if proc.Signal(syscall.Signal(0)) == syscall.ESRCH { // process is dead, either clean up or re-run if !forceRerun { actionErr := fmt.Errorf("Previous vtaction process died") if err := StoreActionResponse(ta.ts, actionNode, actionPath, actionErr); err != nil { log.Errorf("Dead process detector failed to update actionNode: %v", err) return actionErr } if err := ta.ts.UnblockTabletAction(actionPath); err != nil { log.Errorf("Dead process detector failed unblocking: %v", err) } return actionErr } } else { log.Warningf("HandleAction waiting for running action: %v", actionPath) _, err := initiator.WaitForCompletion(ta.ts, actionPath, 0) return err } case actionnode.ACTION_STATE_FAILED: // this happens only in a couple cases: // - vtaction was killed by a signal and we caught it // - vtaction died unexpectedly, and the next vtaction run detected it return fmt.Errorf(actionNode.Error) case actionnode.ACTION_STATE_DONE: // this is bad return fmt.Errorf("Unexpected finished ActionNode in action queue: %v", actionPath) } // Claim the action by this process. actionNode.State = actionnode.ACTION_STATE_RUNNING actionNode.Pid = os.Getpid() newData := actionNode.ToJson() err = ta.ts.UpdateTabletAction(actionPath, newData, version) if err != nil { if err == topo.ErrBadVersion { // The action is schedule by another // actor. Most likely the tablet restarted // during an action. Just wait for completion. log.Warningf("HandleAction waiting for scheduled action: %v", actionPath) _, err = initiator.WaitForCompletion(ta.ts, actionPath, 0) return err } else { return err } } // signal handler after we've signed up for the action c := make(chan os.Signal, 2) signal.Notify(c, syscall.SIGTERM, syscall.SIGINT) go func() { for sig := range c { err := StoreActionResponse(ta.ts, actionNode, actionPath, fmt.Errorf("vtaction interrupted by signal: %v", sig)) if err != nil { log.Errorf("Signal handler failed to update actionNode: %v", err) os.Exit(-2) } os.Exit(-1) } }() log.Infof("HandleAction: %v %v", actionPath, data) // validate actions, but don't write this back into topo.Server if actionNode.Action != action || actionNode.ActionGuid != actionGuid { log.Errorf("HandleAction validation failed %v: (%v,%v) (%v,%v)", actionPath, actionNode.Action, action, actionNode.ActionGuid, actionGuid) return TabletActorError("invalid action initiation: " + action + " " + actionGuid) } actionErr := ta.dispatchAction(actionNode) if err := StoreActionResponse(ta.ts, actionNode, actionPath, actionErr); err != nil { return err } // unblock in topo.Server on completion if err := ta.ts.UnblockTabletAction(actionPath); err != nil { log.Errorf("HandleAction failed unblocking: %v", err) return err } return actionErr }