// DFSVolumeMonitorPollUpdateFunc restarts nfs based on status of monitored remotes
func (m *Monitor) DFSVolumeMonitorPollUpdateFunc(mountpoint, remoteIP string, hasUpdatedFile bool) {
	// monitor dfs; log warnings each cycle; restart dfs if needed
	if hasUpdatedFile {
		return
	} else if len(m.getMonitorRemoteHosts()) == 0 {
		return
	}

	glog.Warningf("DFS NFS volume %s is not seen by remoteIP:%s - further action may be needed i.e: restart nfs", mountpoint, remoteIP)

	now := time.Now()
	since := now.Sub(m.previousRestart)
	if !m.shouldRestart {
		glog.Warningf("Not restarting DFS NFS service due to configuration setting: SERVICED_MONITOR_DFS_MASTER_RESTART=0")
		return
	} else if since < m.monitorInterval {
		glog.Warningf("Not restarting DFS NFS service - have not surpassed interval: %s since last restart", m.monitorInterval)
		return
	}

	m.previousRestart = now
	if err := m.driver.Restart(); err != nil {
		glog.Errorf("Error restarting DFS NFS service: %s", err)
	}
}
func (svc *IService) remove(notify chan<- int) {
	defer close(notify)
	ctr, err := docker.FindContainer(svc.name())
	if err == docker.ErrNoSuchContainer {
		return
	} else if err != nil {
		glog.Errorf("Could not get isvc container %s", svc.Name)
		return
	}

	// report the log output
	if output, err := exec.Command("docker", "logs", "--tail", "1000", ctr.ID).CombinedOutput(); err != nil {
		glog.Warningf("Could not get logs for container %s", ctr.Name)
	} else {
		glog.V(1).Infof("Exited isvc %s:\n %s", svc.Name, string(output))
	}

	// kill the container if it is running
	if ctr.IsRunning() {
		glog.Warningf("isvc %s is still running; killing", svc.Name)
		ctr.Kill()
	}

	// get the exit code
	rc, _ := ctr.Wait(time.Second)
	defer func() { notify <- rc }()

	// delete the container
	if err := ctr.Delete(true); err != nil {
		glog.Errorf("Could not remove isvc %s: %s", ctr.Name, err)
	}
}
func (d *daemon) addTemplates() {
	root := utils.LocalDir("templates")
	glog.V(1).Infof("Adding templates from %s", root)
	// Don't block startup for this. It's merely a convenience.
	go func() {
		err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
			if err != nil {
				return err
			}
			if info == nil || !strings.HasSuffix(info.Name(), ".json") {
				return nil
			}
			if info.IsDir() {
				return filepath.SkipDir
			}
			var reader io.ReadCloser
			if reader, err = os.Open(path); err != nil {
				glog.Warningf("Unable to open template %s", path)
				return nil
			}
			defer reader.Close()
			st := servicetemplate.ServiceTemplate{}
			if err := json.NewDecoder(reader).Decode(&st); err != nil {
				glog.Warningf("Unable to parse template file %s", path)
				return nil
			}
			glog.V(1).Infof("Adding service template %s", path)
			d.facade.AddServiceTemplate(d.dsContext, st)
			return nil
		})
		if err != nil {
			glog.Warningf("Not loading templates from %s: %s", root, err)
		}
	}()
}
// Post sends the list of stats to the TSDB.
func Post(destination string, stats []Sample) error {
	payload := map[string][]Sample{"metrics": stats}
	data, err := json.Marshal(payload)
	if err != nil {
		glog.Warningf("Couldn't marshal stats: %s", err)
		return err
	}

	statsreq, err := http.NewRequest("POST", destination, bytes.NewBuffer(data))
	if err != nil {
		glog.Warningf("Couldn't create stats request: %s", err)
		return err
	}
	statsreq.Header["User-Agent"] = []string{"Zenoss Metric Publisher"}
	statsreq.Header["Content-Type"] = []string{"application/json"}

	resp, reqerr := http.DefaultClient.Do(statsreq)
	if reqerr != nil {
		glog.Warningf("Couldn't post stats: %s", reqerr)
		return reqerr
	}
	defer resp.Body.Close()

	if !strings.Contains(resp.Status, "200 OK") {
		glog.Warningf("Couldn't post stats: %s", resp.Status)
		return nil
	}
	return nil
}
// UpdateRemoteMonitorFile is used by remote clients to write a tiny file to the DFS volume at the given cycle
func UpdateRemoteMonitorFile(localPath string, writeInterval time.Duration, ipAddr string, shutdown <-chan interface{}) {
	monitorPath := path.Join(localPath, monitorSubDir)
	remoteFile := path.Join(localPath, monitorSubDir, ipAddr)
	glog.Infof("updating DFS volume monitor file %s at write interval: %s", remoteFile, writeInterval)

	for {
		glog.V(2).Infof("checking DFS monitor path %s", monitorPath)
		_, err := os.Stat(monitorPath)
		if err != nil {
			glog.V(2).Infof("unable to stat DFS monitor path: %s %s", monitorPath, err)
			if err := os.MkdirAll(monitorPath, 0755); err != nil {
				glog.Warningf("unable to create DFS volume monitor path %s: %s", monitorPath, err)
			} else {
				glog.Infof("created DFS volume monitor path %s", monitorPath)
			}
		}

		glog.V(2).Infof("writing DFS file %s", remoteFile)
		if err := ioutil.WriteFile(remoteFile, []byte(ipAddr), 0600); err != nil {
			glog.Warningf("unable to write DFS file %s: %s", remoteFile, err)
		}

		// wait for next cycle or shutdown
		select {
		case <-time.After(writeInterval):
		case <-shutdown:
			glog.Infof("no longer writing remote monitor status for DFS volume %s to %s", localPath, remoteFile)
			return
		}
	}
}
// start schedules the given service instances with the provided instance IDs.
func (l *ServiceListener) start(svc *service.Service, instanceIDs []int) int {
	var i, id int
	for i, id = range instanceIDs {
		if success := func(instanceID int) bool {
			glog.V(2).Infof("Waiting to acquire scheduler lock for service %s (%s)", svc.Name, svc.ID)
			// only one service instance can be scheduled at a time
			l.Lock()
			defer l.Unlock()

			// If the service lock is enabled, do not try to start the service instance
			glog.V(2).Infof("Scheduler lock acquired for service %s (%s); checking service lock", svc.Name, svc.ID)
			if locked, err := IsServiceLocked(l.conn); err != nil {
				glog.Errorf("Could not check service lock: %s", err)
				return false
			} else if locked {
				glog.Warningf("Could not start instance %d; service %s (%s) is locked", instanceID, svc.Name, svc.ID)
				return false
			}

			glog.V(2).Infof("Service is not locked, selecting a host for service %s (%s) #%d", svc.Name, svc.ID, id)
			host, err := l.handler.SelectHost(svc)
			if err != nil {
				glog.Warningf("Could not assign a host to service %s (%s): %s", svc.Name, svc.ID, err)
				return false
			}

			glog.V(2).Infof("Host %s found, building service instance %d for %s (%s)", host.ID, id, svc.Name, svc.ID)
			state, err := servicestate.BuildFromService(svc, host.ID)
			if err != nil {
				glog.Warningf("Error creating service state for service %s (%s): %s", svc.Name, svc.ID, err)
				return false
			}

			state.HostIP = host.IPAddr
			state.InstanceID = instanceID
			if err := addInstance(l.conn, *state); err != nil {
				glog.Warningf("Could not add service instance %s for service %s (%s): %s", state.ID, svc.Name, svc.ID, err)
				return false
			}
			glog.V(2).Infof("Starting service instance %s for service %s (%s) on host %s", state.ID, svc.Name, svc.ID, host.ID)
			return true
		}(id); !success {
			// 'i' is the index of the unsuccessful instance ID, which equals the
			// number of successful instances. If 2 instances started successfully,
			// then i = 2 because the third attempt failed.
			glog.Warningf("Started %d of %d service instances for %s (%s)", i, len(instanceIDs), svc.Name, svc.ID)
			return i
		}
	}

	// add 1 because the index of the last instance 'i' is len(instanceIDs) - 1
	return i + 1
}
// Start starts a group of listeners that are governed by a master listener.
// When the master exits, it shuts down all of the child listeners and waits
// for all of the subprocesses to exit.
func Start(shutdown <-chan interface{}, conn client.Connection, master Listener, listeners ...Listener) {
	// shutdown the parent and child listeners
	_shutdown := make(chan interface{})

	// start the master
	masterDone := make(chan struct{})
	defer func() { <-masterDone }()
	masterReady := make(chan error, 1)
	go func() {
		defer close(masterDone)
		Listen(_shutdown, masterReady, conn, master)
	}()

	// wait for the master to be ready and then start the slaves
	var childDone chan struct{}
	select {
	case err := <-masterReady:
		if err != nil {
			glog.Errorf("master listener at %s failed to start: %s", master.GetPath(), err)
			return
		}
		childDone = make(chan struct{})
		defer func() { <-childDone }()
		go func() {
			defer close(childDone)
			// this handles restarts; retryLimit to reduce flapping
			for i := 0; i <= retryLimit; i++ {
				start(_shutdown, conn, listeners...)
				select {
				case <-_shutdown:
					return
				default:
					glog.Warningf("Restarting child listeners for master at %s", master.GetPath())
				}
			}
			glog.Warningf("Shutting down master listener at %s; child listeners exceeded retry limit", master.GetPath())
		}()
	case <-masterDone:
	case <-shutdown:
	}
	defer close(_shutdown)

	select {
	case <-masterDone:
		glog.Warningf("Master listener at %s died prematurely; shutting down", master.GetPath())
	case <-childDone:
		glog.Warningf("Child listeners for master %s died prematurely; shutting down", master.GetPath())
	case <-shutdown:
		glog.Infof("Received signal to shutdown for master listener %s", master.GetPath())
	}
}
// addEndpoint adds a mapping to the defined application; if a mapping does not
// exist, this method creates the list and adds the first element.
func (a *HostAgent) addEndpoint(key string, endpoint dao.ApplicationEndpoint, endpoints map[string][]dao.ApplicationEndpoint) {
	if _, ok := endpoints[key]; !ok {
		endpoints[key] = make([]dao.ApplicationEndpoint, 0)
	} else {
		if len(endpoints[key]) > 0 {
			glog.Warningf("Service %s has duplicate internal endpoint for key %s len(endpointList)=%d", endpoint.ServiceID, key, len(endpoints[key]))
			for _, ep := range endpoints[key] {
				glog.Warningf("    %+v", ep)
			}
		}
	}
	endpoints[key] = append(endpoints[key], endpoint)
}
func (this *ControlPlaneDao) UpdateService(svc service.Service, unused *int) error {
	if err := this.facade.UpdateService(datastore.Get(), svc); err != nil {
		return err
	}
	// Create the tenant volume
	if tenantID, err := this.facade.GetTenantID(datastore.Get(), svc.ID); err != nil {
		glog.Warningf("Could not get tenant for service %s: %s", svc.ID, err)
	} else if _, err := this.dfs.GetVolume(tenantID); err != nil {
		glog.Warningf("Could not create volume for tenant %s: %s", tenantID, err)
	}
	return nil
}
func (dfs *DistributedFilesystem) desynchronize(image *docker.Image) error {
	// inspect the image
	dImg, err := image.Inspect()
	if err != nil {
		glog.Errorf("Could not inspect image %s (%s): %s", image.ID, image.UUID, err)
		return err
	}

	// look up services for that tenant
	svcs, err := dfs.facade.GetServices(datastore.Get(), dao.ServiceRequest{TenantID: image.ID.User})
	if err != nil {
		glog.Errorf("Could not get services for tenant %s from %s (%s): %s", image.ID.User, image.ID, image.UUID, err)
		return err
	}

	for _, svc := range svcs {
		// figure out which services are using the provided image
		svcImageID, err := commons.ParseImageID(svc.ImageID)
		if err != nil {
			glog.Warningf("Could not parse image %s for %s (%s): %s", svc.ImageID, svc.Name, svc.ID, err)
			continue
		} else if !svcImageID.Equals(image.ID) {
			continue
		}

		// TODO: we need to switch to using dao.ControlPlane
		conn, err := zzk.GetLocalConnection(zzk.GeneratePoolPath(svc.PoolID))
		if err != nil {
			glog.Warningf("Could not acquire connection to the coordinator (%s): %s", svc.PoolID, err)
			continue
		}

		states, err := zkservice.GetServiceStates(conn, svc.ID)
		if err != nil {
			glog.Warningf("Could not get running services for %s (%s): %s", svc.Name, svc.ID, err)
			continue
		}

		for _, state := range states {
			// check if the instance has been running since before the commit
			if state.IsRunning() && state.Started.Before(dImg.Created) {
				state.InSync = false
				if err := zkservice.UpdateServiceState(conn, &state); err != nil {
					glog.Warningf("Could not update service state %s for %s (%s) as out of sync: %s", state.ID, svc.Name, svc.ID, err)
					continue
				}
			}
		}
	}
	return nil
}
// connect returns a connection object or times out trying
func (zconn *zconn) connect(timeout time.Duration) (client.Connection, error) {
	connC := make(chan client.Connection, 1)
	zconn.connC <- connC

	select {
	case conn := <-connC:
		return conn, nil
	case <-time.After(timeout):
		glog.Warningf("timed out waiting for connection")
		return nil, ErrTimeout
	case <-zconn.shutdownC:
		glog.Warningf("received signal to shutdown")
		return nil, ErrShutdown
	}
}
// monitor checks for changes in a path-based connection
func (zconn *zconn) monitor(path string) {
	var (
		connC chan<- client.Connection
		conn  client.Connection
		err   error
	)

	defer func() {
		if conn != nil {
			conn.Close()
		}
	}()

	for {
		// wait for someone to request a connection, or shutdown
		select {
		case connC = <-zconn.connC:
		case <-zconn.shutdownC:
			return
		}

	retry:
		// create a connection if it doesn't exist or ping the existing connection
		if conn == nil {
			conn, err = zconn.client.GetCustomConnection(path)
			if err != nil {
				glog.Warningf("Could not obtain a connection to %s: %s", path, err)
			}
		} else if _, err := conn.Children("/"); err == client.ErrConnectionClosed {
			glog.Warningf("Could not ping connection to %s: %s", path, err)
			conn = nil
		}

		// send the connection back
		if conn != nil {
			connC <- conn
			continue
		}

		// if conn is nil, try to create a new connection
		select {
		case <-time.After(time.Second):
			glog.Infof("Refreshing connection to zookeeper")
			goto retry
		case <-zconn.shutdownC:
			return
		}
	}
}
func (s *scheduler) startRemote(cancel <-chan struct{}, remote, local client.Connection) <-chan interface{} {
	var (
		shutdown = make(chan interface{})
		done     = make(chan interface{})
	)

	// wait to receive a cancel channel or a done channel and shutdown
	go func() {
		defer close(shutdown)
		select {
		case <-cancel:
		case <-done:
		}
	}()

	// start the listeners and wait for shutdown or for something to break
	go func() {
		defer close(done)
		glog.Infof("Remote connection established; synchronizing")
		zzk.Start(shutdown, remote, nil, s.getPoolSynchronizer(), s.getEndpointSynchronizer(local))
		glog.Warningf("Running in disconnected mode")
	}()

	// indicate when the listeners are finished
	return done
}
// StopServiceInstance stops a host state instance
func StopServiceInstance(conn client.Connection, hostID, stateID string) error {
	// verify that the host is active
	var isActive bool
	hostIDs, err := GetActiveHosts(conn)
	if err != nil {
		glog.Warningf("Could not verify if host %s is active: %s", hostID, err)
		isActive = false
	} else {
		for _, hid := range hostIDs {
			if isActive = hid == hostID; isActive {
				break
			}
		}
	}

	if isActive {
		// try to stop the instance nicely
		return updateInstance(conn, hostID, stateID, func(hsdata *HostState, _ *ss.ServiceState) {
			glog.V(2).Infof("Stopping service instance via %s host %s", stateID, hostID)
			hsdata.DesiredState = int(service.SVCStop)
		})
	} else {
		// if the host isn't active, then remove the instance
		var hs HostState
		if err := conn.Get(hostpath(hostID, stateID), &hs); err != nil {
			glog.Errorf("Could not look up host instance %s on host %s: %s", stateID, hostID, err)
			return err
		}
		return removeInstance(conn, hs.ServiceID, hs.HostID, hs.ServiceStateID)
	}
}
func (mux *TCPMux) acceptor(listener net.Listener, closing chan chan struct{}) {
	defer func() {
		close(mux.connections)
	}()
	for {
		conn, err := mux.listener.Accept()
		if err != nil {
			if strings.Contains(err.Error(), "too many open files") {
				glog.Warningf("error accepting connections, retrying in 50 ms: %s", err)
				select {
				case <-closing:
					glog.V(5).Info("shutting down acceptor")
					return
				case <-time.After(time.Millisecond * 50):
					continue
				}
			}
			glog.Errorf("shutting down acceptor: %s", err)
			return
		}
		glog.V(5).Infof("accepted connection: %s", conn)
		select {
		case <-closing:
			glog.V(5).Info("shutting down acceptor")
			conn.Close()
			return
		case mux.connections <- conn:
		}
	}
}
// getEnvMinDuration returns the time.Duration env var meeting minimum and default duration
func getEnvMinDuration(envvar string, def, min int32) time.Duration {
	duration := def
	envval := os.Getenv(envvar)
	if len(strings.TrimSpace(envval)) == 0 {
		// ignore unset envvar
	} else if intVal, intErr := strconv.ParseInt(envval, 0, 32); intErr != nil {
		glog.Warningf("ignoring invalid %s of '%s': %s", envvar, envval, intErr)
		duration = min
	} else if int32(intVal) < min {
		glog.Warningf("ignoring invalid %s of '%s' < minimum:%v seconds", envvar, envval, min)
	} else {
		duration = int32(intVal)
	}

	return time.Duration(duration) * time.Second
}
// GetHostEndpointInfo retrieves service container port info.
func (ss *ServiceState) GetHostEndpointInfo(applicationRegex *regexp.Regexp) (hostPort, containerPort uint16, protocol string, match bool) {
	for _, ep := range ss.Endpoints {
		if ep.Purpose == "export" {
			if applicationRegex.MatchString(ep.Application) {
				if ep.PortTemplate != "" {
					port, err := ss.evalPortTemplate(ep.PortTemplate)
					if err != nil {
						glog.Errorf("%+v", err)
						break
					}
					ep.PortNumber = uint16(port)
				}
				portS := fmt.Sprintf("%d/%s", ep.PortNumber, strings.ToLower(ep.Protocol))

				external := ss.PortMapping[portS]
				if len(external) == 0 {
					glog.Warningf("Found match for %s:%s, but no portmapping is available", applicationRegex, portS)
					break
				}

				extPort, err := strconv.ParseUint(external[0].HostPort, 10, 16)
				if err != nil {
					glog.Errorf("Portmap parsing failed for %s:%s %v", applicationRegex, portS, err)
					break
				}
				return uint16(extPort), ep.PortNumber, ep.Protocol, true
			}
		}
	}

	return 0, 0, "", false
}
// listenAndproxy listens locally on the proxy's specified Port. For each
// incoming connection, a goroutine running the prxy method is created.
func (p *proxy) listenAndproxy() {
	connections := make(chan net.Conn)
	go func(lsocket net.Listener, conns chan net.Conn) {
		for {
			conn, err := lsocket.Accept()
			if err != nil {
				glog.Fatal("Error (net.Accept): ", err)
			}
			conns <- conn
		}
	}(p.listener, connections)

	i := 0
	for {
		select {
		case conn := <-connections:
			if len(p.addresses) == 0 {
				glog.Warningf("No remote services available for prxying %v", p)
				conn.Close()
				continue
			}
			i++
			// round robin connections to list of addresses
			glog.V(1).Infof("choosing address from %v", p.addresses)
			go p.prxy(conn, p.addresses[i%len(p.addresses)])
		case p.addresses = <-p.newAddresses:
		case errc := <-p.closing:
			p.listener.Close()
			errc <- nil
			return
		}
	}
}
func updateServiceInstances(cpDao dao.ControlPlane, conn *zk.Conn, service *dao.Service, serviceStates []*dao.ServiceState) error {
	var err error
	// pick service instances to start
	if len(serviceStates) < service.Instances {
		instancesToStart := service.Instances - len(serviceStates)
		glog.V(2).Infof("updateServiceInstances wants to start %d instances", instancesToStart)
		var poolHosts []*dao.PoolHost
		err = cpDao.GetHostsForResourcePool(service.PoolId, &poolHosts)
		if err != nil {
			glog.Errorf("Leader unable to acquire hosts for pool %s: %v", service.PoolId, err)
			return err
		}
		if len(poolHosts) == 0 {
			glog.Warningf("Pool %s has no hosts", service.PoolId)
			return nil
		}
		return startServiceInstances(conn, service, poolHosts, instancesToStart)
	} else if len(serviceStates) > service.Instances {
		instancesToKill := len(serviceStates) - service.Instances
		glog.V(2).Infof("updateServiceInstances wants to kill %d instances", instancesToKill)
		shutdownServiceInstances(conn, serviceStates, instancesToKill)
	}
	return nil
}
func (f *Facade) RestoreIPs(ctx datastore.Context, svc service.Service) error {
	for _, ep := range svc.Endpoints {
		if ep.AddressAssignment.IPAddr != "" {
			if assign, err := f.FindAssignmentByServiceEndpoint(ctx, svc.ID, ep.Name); err != nil {
				glog.Errorf("Could not look up address assignment %s for service %s (%s): %s", ep.Name, svc.Name, svc.ID, err)
				return err
			} else if assign == nil || !assign.EqualIP(ep.AddressAssignment) {
				ip, err := f.getManualAssignment(ctx, svc.PoolID, ep.AddressAssignment.IPAddr, ep.AddressConfig.Port)
				if err != nil {
					glog.Warningf("Could not assign ip (%s) to endpoint %s for service %s (%s): %s", ep.AddressAssignment.IPAddr, ep.Name, svc.Name, svc.ID, err)
					continue
				}

				assign = &addressassignment.AddressAssignment{
					AssignmentType: ip.Type,
					HostID:         ip.HostID,
					PoolID:         svc.PoolID,
					IPAddr:         ip.IP,
					Port:           ep.AddressConfig.Port,
					ServiceID:      svc.ID,
					EndpointName:   ep.Name,
				}

				if _, err := f.assign(ctx, *assign); err != nil {
					glog.Errorf("Could not restore address assignment for %s of service %s at %s:%d: %s", assign.EndpointName, assign.ServiceID, assign.IPAddr, assign.Port, err)
					return err
				}
				glog.Infof("Restored address assignment for endpoint %s of service %s at %s:%d", assign.EndpointName, assign.ServiceID, assign.IPAddr, assign.Port)
			} else {
				glog.Infof("Endpoint %s for service %s already assigned; skipping", assign.EndpointName, assign.ServiceID)
			}
		}
	}
	return nil
}
func start(shutdown <-chan interface{}, conn client.Connection, listeners ...Listener) {
	var count int
	done := make(chan int)
	defer func() {
		glog.Infof("Shutting down %d child listeners", len(listeners))
		for count > 0 {
			count -= <-done
		}
	}()

	_shutdown := make(chan interface{})
	defer close(_shutdown)
	for i := range listeners {
		count++
		go func(l Listener) {
			defer func() { done <- 1 }()
			Listen(_shutdown, make(chan error, 1), conn, l)
			glog.Infof("Listener at %s exited", l.GetPath())
		}(listeners[i])
	}

	select {
	case i := <-done:
		glog.Warningf("Listener exited prematurely, stopping all listeners")
		count -= i
	case <-shutdown:
		glog.Infof("Received signal to shutdown")
	}
}
func (svc *IService) doHealthChecks(halt <-chan struct{}) {
	if len(svc.HealthChecks) == 0 {
		return
	}

	var found bool
	var checkDefinition healthCheckDefinition
	if checkDefinition, found = svc.HealthChecks[DEFAULT_HEALTHCHECK_NAME]; !found {
		glog.Warningf("Default healthcheck %q not found for isvc %s", DEFAULT_HEALTHCHECK_NAME, svc.Name)
		return
	}

	timer := time.Tick(checkDefinition.Interval)
	for {
		select {
		case <-halt:
			glog.Infof("Stopped healthchecks for %s", svc.Name)
			return

		case currentTime := <-timer:
			err := svc.runCheckOrTimeout(checkDefinition)
			svc.setHealthStatus(err, currentTime.Unix())
			if err != nil {
				glog.Errorf("Healthcheck for isvc %s failed: %s", svc.Name, err)
			}
		}
	}
}
// createSystemUser updates the running instance password as well as the user record in elastic
func createSystemUser(s *ControlPlaneDao) error {
	user := userdomain.User{}
	err := s.GetUser(SYSTEM_USER_NAME, &user)
	if err != nil {
		glog.Warningf("%s", err)
		glog.V(0).Info("'default' user not found; creating...")

		// create the system user
		user := userdomain.User{}
		user.Name = SYSTEM_USER_NAME
		userName := SYSTEM_USER_NAME

		if err := s.AddUser(user, &userName); err != nil {
			return err
		}
	}

	// update the instance password
	password, err := utils.NewUUID36()
	if err != nil {
		return err
	}
	user.Name = SYSTEM_USER_NAME
	user.Password = password
	INSTANCE_PASSWORD = password

	unused := 0
	return s.UpdateUser(user, &unused)
}
// startupHealthcheck runs the default healthchecks (if any) and returns the result.
// If the healthcheck fails, then this method will sleep 1 second, and then
// repeat the healthcheck, continuing that sleep/retry pattern until
// the healthcheck succeeds or 2 minutes has elapsed.
//
// An error is returned if no healthchecks succeed in the 2 minute interval,
// otherwise nil is returned.
func (svc *IService) startupHealthcheck() <-chan error {
	err := make(chan error, 1)
	go func() {
		var result error
		if len(svc.HealthChecks) > 0 {
			checkDefinition, found := svc.HealthChecks[DEFAULT_HEALTHCHECK_NAME]
			if !found {
				glog.Warningf("Default healthcheck %q not found for isvc %s", DEFAULT_HEALTHCHECK_NAME, svc.Name)
				err <- nil
				return
			}

			startCheck := time.Now()
			for {
				currentTime := time.Now()
				result = svc.runCheckOrTimeout(checkDefinition)
				svc.setHealthStatus(result, currentTime.Unix())
				if result == nil || time.Since(startCheck).Seconds() > WAIT_FOR_INITIAL_HEALTHCHECK.Seconds() {
					break
				}

				glog.Infof("waiting for %s to start, checking health status again in 1 second", svc.Name)
				time.Sleep(time.Second)
			}
			err <- result
		} else {
			svc.setHealthStatus(nil, time.Now().Unix())
			err <- nil
		}
	}()
	return err
}
// PostProcess deletes any orphaned data that exists locally
func (l *Synchronizer) PostProcess(processing map[string]struct{}) {
	// Get all locally stored data
	nodes, err := l.GetAll()
	if err != nil {
		glog.Warningf("Could not access locally stored data: %s", err)
		return
	}

	// Delete any local nodes that do not exist remotely and are not in process
	for _, node := range nodes {
		if _, ok := processing[node.GetID()]; ok {
			// pass
		} else if err := l.Delete(node.GetID()); err != nil {
			glog.Warningf("Could not delete %s from locally stored data: %s", node.GetID(), err)
		}
	}
}
func watchService(cpDao dao.ControlPlane, conn *zk.Conn, shutdown <-chan int, done chan<- string, serviceId string) {
	defer func() {
		glog.V(3).Info("Exiting function watchService ", serviceId)
		done <- serviceId
	}()
	for {
		var service dao.Service
		_, zkEvent, err := zzk.LoadServiceW(conn, serviceId, &service)
		if err != nil {
			glog.Errorf("Unable to load service %s: %v", serviceId, err)
			return
		}
		_, _, childEvent, err := conn.ChildrenW(zzk.ServicePath(serviceId))

		glog.V(1).Info("Leader watching for changes to service ", service.Name)

		// check current state
		var serviceStates []*dao.ServiceState
		err = zzk.GetServiceStates(conn, &serviceStates, serviceId)
		if err != nil {
			glog.Error("Unable to retrieve running service states: ", err)
			return
		}

		// Is the service supposed to be running at all?
		switch {
		case service.DesiredState == dao.SVC_STOP:
			shutdownServiceInstances(conn, serviceStates, len(serviceStates))
		case service.DesiredState == dao.SVC_RUN:
			updateServiceInstances(cpDao, conn, &service, serviceStates)
		default:
			glog.Warningf("Unexpected desired state %d for service %s", service.DesiredState, service.Name)
		}

		select {
		case evt := <-zkEvent:
			if evt.Type == zk.EventNodeDeleted {
				glog.V(0).Info("Shutting down due to node delete ", serviceId)
				shutdownServiceInstances(conn, serviceStates, len(serviceStates))
				return
			}
			glog.V(1).Infof("Service %s received event: %v", service.Name, evt)
			continue

		case evt := <-childEvent:
			glog.V(1).Infof("Service %s received child event: %v", service.Name, evt)
			continue

		case <-shutdown:
			glog.V(1).Info("Leader stopping watch on ", service.Name)
			return
		}
	}
}
// pause updates the state of the given service instance to paused
func (l *ServiceListener) pause(rss []dao.RunningService) {
	for _, state := range rss {
		// pauseInstance updates the service state ONLY if it has a RUN DesiredState
		if err := pauseInstance(l.conn, state.HostID, state.ID); err != nil {
			glog.Warningf("Could not pause service instance %s (%s) for service %s: %s", state.ID, state.Name, state.ServiceID, err)
			continue
		}
		glog.V(2).Infof("Pausing service instance %s (%s) for service %s on host %s", state.ID, state.Name, state.ServiceID, state.HostID)
	}
}
func (dfs *DistributedFilesystem) Lock() error {
	dfs.mutex.Lock()
	err := dfs.lock.Lock()
	if err != nil {
		glog.Warningf("Could not lock services! Operation may be unstable: %s", err)
	}
	dfs.logger = new(logger).init()
	return err
}
// stop unschedules the provided service instances
func (l *ServiceListener) stop(rss []dao.RunningService) {
	for _, state := range rss {
		if err := StopServiceInstance(l.conn, state.HostID, state.ID); err != nil {
			glog.Warningf("Service instance %s (%s) from service %s won't die: %s", state.ID, state.Name, state.ServiceID, err)
			removeInstance(l.conn, state.ServiceID, state.HostID, state.ID)
			continue
		}
		glog.V(2).Infof("Stopping service instance %s (%s) for service %s on host %s", state.ID, state.Name, state.ServiceID, state.HostID)
	}
}
// Rollback rolls back the volume to the given snapshot
func (c *BtrfsConn) Rollback(label string) error {
	if exists, err := c.snapshotExists(label); err != nil || !exists {
		if err != nil {
			return err
		} else {
			return fmt.Errorf("snapshot %s does not exist", label)
		}
	}

	c.Lock()
	defer c.Unlock()

	vd := path.Join(c.root, c.name)
	dirp, err := volume.IsDir(vd)
	if err != nil {
		return err
	}

	glog.Infof("starting rollback of snapshot %s", label)

	start := time.Now()
	if dirp {
		timeout := getEnvMinDuration("SERVICED_BTRFS_ROLLBACK_TIMEOUT", 300, 120)
		glog.Infof("rollback using env var SERVICED_BTRFS_ROLLBACK_TIMEOUT:%s", timeout)

		for {
			cmd := []string{"subvolume", "delete", vd}
			output, deleteError := runcmd(c.sudoer, cmd...)
			if deleteError == nil {
				break
			}

			now := time.Now()
			if now.Sub(start) > timeout {
				glog.Errorf("rollback of snapshot %s failed - btrfs subvolume deletes took %s for cmd:%s", label, timeout, cmd)
				return deleteError
			} else if strings.Contains(string(output), "Device or resource busy") {
				waitTime := time.Duration(5 * time.Second)
				glog.Warningf("retrying rollback subvolume delete in %s - unable to run cmd:%s output:%s error:%s", waitTime, cmd, string(output), deleteError)
				time.Sleep(waitTime)
			} else {
				return deleteError
			}
		}
	}

	cmd := []string{"subvolume", "snapshot", c.SnapshotPath(label), vd}
	_, err = runcmd(c.sudoer, cmd...)
	if err != nil {
		glog.Errorf("rollback of snapshot %s failed for cmd:%s", label, cmd)
	} else {
		duration := time.Now().Sub(start)
		glog.Infof("rollback of snapshot %s took %s", label, duration)
	}
	return err
}