func (s *HandlersSuite) TestHealingHistoryHandler(c *check.C) { evt1, err := event.NewInternal(&event.Opts{ Target: event.Target{Type: event.TargetTypeNode, Value: "addr1"}, InternalKind: "healer", CustomData: map[string]interface{}{"node": cluster.Node{Address: "addr1"}}, Allowed: event.Allowed(permission.PermPool), }) c.Assert(err, check.IsNil) evt1.DoneCustomData(nil, cluster.Node{Address: "addr2"}) time.Sleep(10 * time.Millisecond) evt2, err := event.NewInternal(&event.Opts{ Target: event.Target{Type: event.TargetTypeNode, Value: "addr3"}, InternalKind: "healer", CustomData: map[string]interface{}{"node": cluster.Node{Address: "addr3"}}, Allowed: event.Allowed(permission.PermPool), }) c.Assert(err, check.IsNil) evt2.DoneCustomData(errors.New("some error"), cluster.Node{}) time.Sleep(10 * time.Millisecond) evt3, err := event.NewInternal(&event.Opts{ Target: event.Target{Type: event.TargetTypeContainer, Value: "1234"}, InternalKind: "healer", CustomData: container.Container{ID: "1234"}, Allowed: event.Allowed(permission.PermApp), }) c.Assert(err, check.IsNil) evt3.DoneCustomData(nil, container.Container{ID: "9876"}) recorder := httptest.NewRecorder() request, err := http.NewRequest("GET", "/docker/healing", nil) c.Assert(err, check.IsNil) request.Header.Set("Authorization", "bearer "+s.token.GetValue()) server := api.RunServer(true) server.ServeHTTP(recorder, request) c.Assert(recorder.Code, check.Equals, http.StatusOK) c.Assert(recorder.Header().Get("Content-Type"), check.Equals, "application/json") body := recorder.Body.Bytes() var healings []healer.HealingEvent err = json.Unmarshal(body, &healings) c.Assert(err, check.IsNil) c.Assert(healings, check.HasLen, 3) c.Assert(healings[2].StartTime.UTC().Format(time.Stamp), check.Equals, evt1.StartTime.UTC().Format(time.Stamp)) c.Assert(healings[2].EndTime.UTC().Format(time.Stamp), check.Equals, evt1.EndTime.UTC().Format(time.Stamp)) c.Assert(healings[2].FailingNode.Address, check.Equals, "addr1") c.Assert(healings[2].CreatedNode.Address, check.Equals, "addr2") c.Assert(healings[2].Error, check.Equals, "") c.Assert(healings[2].Successful, check.Equals, true) c.Assert(healings[2].Action, check.Equals, "node-healing") c.Assert(healings[1].FailingNode.Address, check.Equals, "addr3") c.Assert(healings[1].CreatedNode.Address, check.Equals, "") c.Assert(healings[1].Error, check.Equals, "some error") c.Assert(healings[1].Successful, check.Equals, false) c.Assert(healings[1].Action, check.Equals, "node-healing") c.Assert(healings[0].FailingContainer.ID, check.Equals, "1234") c.Assert(healings[0].CreatedContainer.ID, check.Equals, "9876") c.Assert(healings[0].Successful, check.Equals, true) c.Assert(healings[0].Error, check.Equals, "") c.Assert(healings[0].Action, check.Equals, "container-healing") }
func (s *HandlersSuite) TestAutoScaleHistoryHandler(c *check.C) { evt1, err := event.NewInternal(&event.Opts{ Target: event.Target{Type: poolMetadataName, Value: "poolx"}, InternalKind: autoScaleEventKind, Allowed: event.Allowed(permission.PermPool), }) c.Assert(err, check.IsNil) evt1.Logf("my evt1") err = evt1.DoneCustomData(nil, evtCustomData{ Result: &scalerResult{ToAdd: 1, Reason: "r1"}, }) c.Assert(err, check.IsNil) time.Sleep(100 * time.Millisecond) evt2, err := event.NewInternal(&event.Opts{ Target: event.Target{Type: poolMetadataName, Value: "pooly"}, InternalKind: autoScaleEventKind, Allowed: event.Allowed(permission.PermPool), }) c.Assert(err, check.IsNil) evt2.Logf("my evt2") err = evt2.DoneCustomData(nil, evtCustomData{ Result: &scalerResult{ToRebalance: true, Reason: "r2"}, }) c.Assert(err, check.IsNil) recorder := httptest.NewRecorder() request, err := http.NewRequest("GET", "/docker/autoscale", nil) c.Assert(err, check.IsNil) request.Header.Set("Authorization", "bearer "+s.token.GetValue()) server := api.RunServer(true) server.ServeHTTP(recorder, request) c.Assert(recorder.Code, check.Equals, http.StatusOK) c.Assert(recorder.Header().Get("Content-Type"), check.Equals, "application/json") body := recorder.Body.Bytes() history := []autoScaleEvent{} err = json.Unmarshal(body, &history) c.Assert(err, check.IsNil) c.Assert(history, check.HasLen, 2) c.Assert(evt1.StartTime.Sub(history[1].StartTime) < time.Second, check.Equals, true) c.Assert(evt2.StartTime.Sub(history[0].StartTime) < time.Second, check.Equals, true) c.Assert(history[1].MetadataValue, check.Equals, "poolx") c.Assert(history[0].MetadataValue, check.Equals, "pooly") c.Assert(history[1].Action, check.Equals, "add") c.Assert(history[0].Action, check.Equals, "rebalance") c.Assert(history[1].Reason, check.Equals, "r1") c.Assert(history[0].Reason, check.Equals, "r2") c.Assert(history[1].Log, check.Equals, "my evt1\n") c.Assert(history[0].Log, check.Equals, "my evt2\n") }
func (h *NodeHealer) tryHealingNode(node provision.Node, reason string, lastCheck *NodeChecks) error { _, hasIaas := node.Metadata()["iaas"] if !hasIaas { log.Debugf("node %q doesn't have IaaS information, healing (%s) won't run on it.", node.Address(), reason) return nil } poolName := node.Metadata()[poolMetadataName] evt, err := event.NewInternal(&event.Opts{ Target: event.Target{Type: event.TargetTypeNode, Value: node.Address()}, InternalKind: "healer", CustomData: NodeHealerCustomData{ Node: provision.NodeToSpec(node), Reason: reason, LastCheck: lastCheck, }, Allowed: event.Allowed(permission.PermPoolReadEvents, permission.Context(permission.CtxPool, poolName)), }) if err != nil { if _, ok := err.(event.ErrEventLocked); ok { // Healing in progress. return nil } return errors.Wrap(err, "Error trying to insert node healing event, healing aborted") } var createdNode *provision.NodeSpec var evtErr error defer func() { var updateErr error if evtErr == nil && createdNode == nil { updateErr = evt.Abort() } else { updateErr = evt.DoneCustomData(evtErr, createdNode) } if updateErr != nil { log.Errorf("error trying to update healing event: %s", updateErr) } }() _, err = node.Provisioner().GetNode(node.Address()) if err != nil { if err == provision.ErrNodeNotFound { return nil } evtErr = errors.Wrap(err, "unable to check if node still exists") return evtErr } shouldHeal, err := h.shouldHealNode(node) if err != nil { evtErr = errors.Wrap(err, "unable to check if node should be healed") return evtErr } if !shouldHeal { return nil } log.Errorf("initiating healing process for node %q due to: %s", node.Address(), reason) createdNode, evtErr = h.healNode(node) return evtErr }
func (s *S) TestHealerHandleErrorThrottled(c *check.C) { factory, iaasInst := iaasTesting.NewHealerIaaSConstructorWithInst("addr1") iaas.RegisterIaasProvider("my-healer-iaas", factory) _, err := iaas.CreateMachineForIaaS("my-healer-iaas", map[string]string{}) c.Assert(err, check.IsNil) iaasInst.Addr = "addr2" config.Set("iaas:node-protocol", "http") config.Set("iaas:node-port", 2) defer config.Unset("iaas:node-protocol") defer config.Unset("iaas:node-port") p := provisiontest.ProvisionerInstance err = p.AddNode(provision.AddNodeOptions{ Address: "http://addr1:1", Metadata: map[string]string{"iaas": "my-healer-iaas"}, }) c.Assert(err, check.IsNil) node, err := p.GetNode("http://addr1:1") c.Assert(err, check.IsNil) healer := newNodeHealer(nodeHealerArgs{ FailuresBeforeHealing: 1, WaitTimeNewMachine: time.Minute, }) healer.Shutdown() healer.started = time.Now().Add(-3 * time.Second) conf := healerConfig() err = conf.SaveBase(NodeHealerConfig{Enabled: boolPtr(true), MaxUnresponsiveTime: intPtr(1)}) c.Assert(err, check.IsNil) err = healer.UpdateNodeData(node, []provision.NodeCheckResult{}) c.Assert(err, check.IsNil) time.Sleep(1200 * time.Millisecond) nodes, err := p.ListNodes(nil) c.Assert(err, check.IsNil) c.Assert(nodes, check.HasLen, 1) c.Assert(nodes[0].Address(), check.Equals, "http://addr1:1") machines, err := iaas.ListMachines() c.Assert(err, check.IsNil) c.Assert(machines, check.HasLen, 1) c.Assert(machines[0].Address, check.Equals, "addr1") for i := 0; i < 3; i++ { var evt *event.Event evt, err = event.NewInternal(&event.Opts{ Target: event.Target{Type: "node", Value: nodes[0].Address()}, InternalKind: "healer", Allowed: event.Allowed(permission.PermPoolReadEvents), }) c.Assert(err, check.IsNil) err = evt.Done(nil) c.Assert(err, check.IsNil) } err = healer.tryHealingNode(nodes[0], "myreason", nil) c.Assert(err, check.ErrorMatches, "Error trying to insert node healing event, healing aborted: event throttled, limit for healer on node \".*?\" is 3 every 5m0s") nodes, err = p.ListNodes(nil) c.Assert(err, check.IsNil) c.Assert(nodes, check.HasLen, 1) c.Assert(nodes[0].Address(), check.Equals, "http://addr1:1") }
func (h *ContainerHealer) healContainerIfNeeded(cont container.Container) error { if cont.LastSuccessStatusUpdate.IsZero() { if !cont.MongoID.Time().Before(time.Now().Add(-h.maxUnresponsiveTime)) { return nil } } isAsExpected, err := h.isAsExpected(cont) if err != nil { log.Errorf("Containers healing: couldn't verify running processes in container %q: %s", cont.ID, err) } if isAsExpected { cont.SetStatus(h.provisioner, cont.ExpectedStatus(), true) return nil } locked := h.locker.Lock(cont.AppName) if !locked { return errors.Errorf("Containers healing: unable to heal %q couldn't lock app %s", cont.ID, cont.AppName) } defer h.locker.Unlock(cont.AppName) // Sanity check, now we have a lock, let's find out if the container still exists _, err = h.provisioner.GetContainer(cont.ID) if err != nil { if _, isNotFound := err.(*provision.UnitNotFoundError); isNotFound { return nil } return errors.Wrapf(err, "Containers healing: unable to heal %q couldn't verify it still exists", cont.ID) } a, err := app.GetByName(cont.AppName) if err != nil { return errors.Wrapf(err, "Containers healing: unable to heal %q couldn't get app %q", cont.ID, cont.AppName) } log.Errorf("Initiating healing process for container %q, unresponsive since %s.", cont.ID, cont.LastSuccessStatusUpdate) evt, err := event.NewInternal(&event.Opts{ Target: event.Target{Type: event.TargetTypeContainer, Value: cont.ID}, InternalKind: "healer", CustomData: cont, Allowed: event.Allowed(permission.PermAppReadEvents, append(permission.Contexts(permission.CtxTeam, a.Teams), permission.Context(permission.CtxApp, a.Name), permission.Context(permission.CtxPool, a.Pool), )...), }) if err != nil { return errors.Wrap(err, "Error trying to insert container healing event, healing aborted") } newCont, healErr := h.healContainer(cont) if healErr != nil { healErr = errors.Errorf("Error healing container %q: %s", cont.ID, healErr.Error()) } err = evt.DoneCustomData(healErr, newCont) if err != nil { log.Errorf("Error trying to update containers healing event: %s", err) } return healErr }
func (s *S) TestListHealingHistory(c *check.C) { evt1, err := event.NewInternal(&event.Opts{ Target: event.Target{Type: "node", Value: "addr1"}, InternalKind: "healer", Allowed: event.Allowed(permission.PermPoolReadEvents), }) c.Assert(err, check.IsNil) err = evt1.Done(nil) c.Assert(err, check.IsNil) time.Sleep(100 * time.Millisecond) evt2, err := event.NewInternal(&event.Opts{ Target: event.Target{Type: "container", Value: "cont1"}, InternalKind: "healer", Allowed: event.Allowed(permission.PermPoolReadEvents), }) c.Assert(err, check.IsNil) err = evt2.Done(nil) c.Assert(err, check.IsNil) evts, err := ListHealingHistory("") c.Assert(err, check.IsNil) c.Assert(evts, check.DeepEquals, []HealingEvent{ { ID: evt2.UniqueID, StartTime: mongoTime(evt2.StartTime), EndTime: mongoTime(evt2.EndTime), Action: "container-healing", Successful: true, Error: "", }, { ID: evt1.UniqueID, StartTime: mongoTime(evt1.StartTime), EndTime: mongoTime(evt1.EndTime), Action: "node-healing", Successful: true, Error: "", }, }) }
func (s *S) TestRunContainerHealerThrottled(c *check.C) { p, err := dockertest.StartMultipleServersCluster() c.Assert(err, check.IsNil) defer p.Destroy() node1 := p.Servers()[0] app := newFakeAppInDB("myapp", "python", 0) _, err = p.StartContainers(dockertest.StartContainersArgs{ Endpoint: node1.URL(), App: app, Amount: map[string]int{"web": 2}, Image: "tsuru/python", PullImage: true, }) c.Assert(err, check.IsNil) containers := p.AllContainers() c.Assert(containers, check.HasLen, 2) c.Assert(containers[0].HostAddr, check.Equals, net.URLToHost(node1.URL())) c.Assert(containers[1].HostAddr, check.Equals, net.URLToHost(node1.URL())) node1.MutateContainer(containers[0].ID, docker.State{Running: false, Restarting: false}) node1.MutateContainer(containers[1].ID, docker.State{Running: false, Restarting: false}) toMoveCont := containers[1] toMoveCont.LastSuccessStatusUpdate = time.Now().Add(-5 * time.Minute) for i := 0; i < 3; i++ { var evt *event.Event evt, err = event.NewInternal(&event.Opts{ Target: event.Target{Type: "container", Value: toMoveCont.ID}, InternalKind: "healer", CustomData: toMoveCont, Allowed: event.Allowed(permission.PermAppReadEvents), }) c.Assert(err, check.IsNil) err = evt.DoneCustomData(nil, nil) c.Assert(err, check.IsNil) } healer := NewContainerHealer(ContainerHealerArgs{Provisioner: p, Locker: dockertest.NewFakeLocker()}) err = healer.healContainerIfNeeded(toMoveCont) c.Assert(err, check.ErrorMatches, "Error trying to insert container healing event, healing aborted: event throttled, limit for healer on container \".*?\" is 3 every 5m0s") }
func (a *autoScaleConfig) runScalerInNodes(pool string, nodes []*cluster.Node) { evt, err := event.NewInternal(&event.Opts{ Target: event.Target{Type: event.TargetTypePool, Value: pool}, InternalKind: autoScaleEventKind, Allowed: event.Allowed(permission.PermPoolReadEvents, permission.Context(permission.CtxPool, pool)), }) if err != nil { if _, ok := err.(event.ErrEventLocked); ok { a.logDebug("skipping already running for: %s", pool) } else { a.logError("error creating scale event %s: %s", pool, err.Error()) } return } evt.SetLogWriter(a.writer) var retErr error var sResult *scalerResult var evtNodes []cluster.Node var rule *autoScaleRule defer func() { if retErr != nil { evt.Logf(retErr.Error()) } if (sResult == nil && retErr == nil) || (sResult != nil && sResult.NoAction()) { evt.Logf("nothing to do for %q: %q", poolMetadataName, pool) evt.Abort() } else { evt.DoneCustomData(retErr, evtCustomData{ Result: sResult, Nodes: evtNodes, Rule: rule, }) } }() rule, err = autoScaleRuleForMetadata(pool) if err == mgo.ErrNotFound { rule, err = autoScaleRuleForMetadata("") } if err != nil { if err != mgo.ErrNotFound { retErr = errors.Wrapf(err, "unable to fetch auto scale rules for %s", pool) return } evt.Logf("no auto scale rule for %s", pool) return } if !rule.Enabled { evt.Logf("auto scale rule disabled for %s", pool) return } scaler, err := a.scalerForRule(rule) if err != nil { retErr = errors.Wrapf(err, "error getting scaler for %s", pool) return } evt.Logf("running scaler %T for %q: %q", scaler, poolMetadataName, pool) sResult, err = scaler.scale(pool, nodes) if err != nil { if _, ok := err.(errAppNotLocked); ok { evt.Logf("aborting scaler for now, gonna retry later: %s", err) return } retErr = errors.Wrapf(err, "error scaling group %s", pool) return } if sResult.ToAdd > 0 { evt.Logf("running event \"add\" for %q: %#v", pool, sResult) evtNodes, err = a.addMultipleNodes(evt, nodes, sResult.ToAdd) if err != nil { if len(evtNodes) == 0 { retErr = err return } evt.Logf("not all required nodes were created: %s", err) } } else if len(sResult.ToRemove) > 0 { evt.Logf("running event \"remove\" for %q: %#v", pool, sResult) evtNodes = sResult.ToRemove err = a.removeMultipleNodes(evt, sResult.ToRemove) if err != nil { retErr = err return } } if !rule.PreventRebalance { err := a.rebalanceIfNeeded(evt, pool, nodes, sResult) if err != nil { if sResult.IsRebalanceOnly() { retErr = err } else { evt.Logf("unable to rebalance: %s", err.Error()) } } } }
func (s *S) TestListFilterMany(c *check.C) { var allEvts []event.Event var create = func(opts *event.Opts) { evt, err := event.New(opts) c.Assert(err, check.IsNil) allEvts = append(allEvts, *evt) } var createi = func(opts *event.Opts) { evt, err := event.NewInternal(opts) c.Assert(err, check.IsNil) allEvts = append(allEvts, *evt) } var checkFilters = func(f *event.Filter, expected interface{}) { evts, err := event.List(f) c.Assert(err, check.IsNil) c.Assert(evts, eventtest.EvtEquals, expected) } create(&event.Opts{ Target: event.Target{Type: "app", Value: "myapp"}, Kind: permission.PermAppUpdateEnvSet, Owner: s.token, Allowed: event.Allowed(permission.PermAppReadEvents, permission.Context(permission.CtxApp, "myapp")), }) time.Sleep(100 * time.Millisecond) t0 := time.Now().UTC() create(&event.Opts{ Target: event.Target{Type: "app", Value: "myapp2"}, Kind: permission.PermAppUpdateEnvSet, Owner: s.token, Allowed: event.Allowed(permission.PermAppAdmin), }) t05 := time.Now().UTC() time.Sleep(100 * time.Millisecond) create(&event.Opts{ Target: event.Target{Type: "app2", Value: "myapp"}, Kind: permission.PermAppUpdateEnvSet, Owner: s.token, Allowed: event.Allowed(permission.PermAppAdmin), }) t1 := time.Now().UTC() time.Sleep(100 * time.Millisecond) createi(&event.Opts{ Target: event.Target{Type: "node", Value: "http://10.0.1.1"}, InternalKind: "healer", Allowed: event.Allowed(permission.PermAppAdmin), }) createi(&event.Opts{ Target: event.Target{Type: "node", Value: "http://10.0.1.2"}, InternalKind: "healer", Allowed: event.Allowed(permission.PermAppAdmin), }) createi(&event.Opts{ Target: event.Target{Type: "nodex", Value: "http://10.0.1.3"}, InternalKind: "healer", Allowed: event.Allowed(permission.PermAppAdmin), }) err := event.MarkAsRemoved(event.Target{Type: "nodex", Value: "http://10.0.1.3"}) c.Assert(err, check.IsNil) allEvts[len(allEvts)-2].Done(nil) allEvts[len(allEvts)-3].Done(errors.New("my err")) checkFilters(&event.Filter{Sort: "_id"}, allEvts[:len(allEvts)-1]) checkFilters(&event.Filter{Running: boolPtr(false), Sort: "_id"}, allEvts[len(allEvts)-3:len(allEvts)-1]) checkFilters(&event.Filter{Running: boolPtr(true), Sort: "_id"}, allEvts[:len(allEvts)-3]) checkFilters(&event.Filter{ErrorOnly: true, Sort: "_id"}, allEvts[len(allEvts)-3]) checkFilters(&event.Filter{Target: event.Target{Type: "app"}, Sort: "_id"}, []event.Event{allEvts[0], allEvts[1]}) checkFilters(&event.Filter{Target: event.Target{Type: "app", Value: "myapp"}}, allEvts[0]) checkFilters(&event.Filter{KindType: event.KindTypeInternal, Sort: "_id"}, allEvts[3:len(allEvts)-1]) checkFilters(&event.Filter{KindType: event.KindTypePermission, Sort: "_id"}, allEvts[:3]) checkFilters(&event.Filter{KindType: event.KindTypePermission, KindName: "kind"}, nil) checkFilters(&event.Filter{KindType: event.KindTypeInternal, KindName: "healer", Sort: "_id"}, allEvts[3:len(allEvts)-1]) checkFilters(&event.Filter{OwnerType: event.OwnerTypeUser, Sort: "_id"}, allEvts[:3]) checkFilters(&event.Filter{OwnerType: event.OwnerTypeInternal, Sort: "_id"}, allEvts[3:len(allEvts)-1]) checkFilters(&event.Filter{OwnerType: event.OwnerTypeUser, OwnerName: s.token.GetUserName(), Sort: "_id"}, allEvts[:3]) checkFilters(&event.Filter{Since: t0, Sort: "_id"}, allEvts[1:len(allEvts)-1]) checkFilters(&event.Filter{Until: t05, Sort: "_id"}, allEvts[:2]) checkFilters(&event.Filter{Since: t0, Until: t1, Sort: "_id"}, allEvts[1:3]) checkFilters(&event.Filter{Limit: 2, Sort: "_id"}, allEvts[:2]) checkFilters(&event.Filter{Limit: 1, Sort: "-_id"}, allEvts[len(allEvts)-2]) checkFilters(&event.Filter{Target: event.Target{Type: "nodex"}}, allEvts[:0]) checkFilters(&event.Filter{Target: event.Target{Type: "nodex"}, IncludeRemoved: true}, allEvts[5:6]) checkFilters(&event.Filter{AllowedTargets: []event.TargetFilter{}, Sort: "_id"}, allEvts[:0]) checkFilters(&event.Filter{AllowedTargets: []event.TargetFilter{ {Type: "app"}, }, Sort: "_id"}, allEvts[:2]) checkFilters(&event.Filter{AllowedTargets: []event.TargetFilter{ {Type: "app", Values: []string{}}, }, Sort: "_id"}, allEvts[:0]) checkFilters(&event.Filter{AllowedTargets: []event.TargetFilter{ {Type: "app", Values: []string{"myapp", "myapp2"}}, }, Sort: "_id"}, allEvts[:2]) checkFilters(&event.Filter{AllowedTargets: []event.TargetFilter{ {Type: "app", Values: []string{"myapp"}}, {Type: "node", Values: []string{"http://10.0.1.2"}}, }, Sort: "_id"}, []event.Event{allEvts[0], allEvts[4]}) checkFilters(&event.Filter{Permissions: []permission.Permission{ {Scheme: permission.PermAll, Context: permission.Context(permission.CtxGlobal, "")}, }, Sort: "_id"}, allEvts[:len(allEvts)-1]) checkFilters(&event.Filter{Permissions: []permission.Permission{ {Scheme: permission.PermAll}, }, Sort: "_id"}, allEvts[:0]) checkFilters(&event.Filter{Permissions: []permission.Permission{ {Scheme: permission.PermAppRead, Context: permission.Context(permission.CtxApp, "myapp")}, {Scheme: permission.PermAppRead, Context: permission.Context(permission.CtxApp, "invalid-app")}, }, Sort: "_id"}, allEvts[:1]) checkFilters(&event.Filter{Permissions: []permission.Permission{ {Scheme: permission.PermAppRead, Context: permission.Context(permission.CtxApp, "invalid-app")}, }, Sort: "_id"}, allEvts[:0]) }