// Fetch will attempt to retrieve a user from the token cache
// If cacheHit == true and u == nil and err == nil then we KNOW they don't
// exist (and so we don't have to bother looking them up via the login service)
func (c *memcacheCacher) Fetch(sessId string) (u *User, cacheHit bool, err error) {
	t := time.Now()
	u, hit, err := c.doFetch(sessId)
	instTiming("auth.cache.fetch", err, t)
	if hit {
		inst.Counter(1.0, "auth.cache.fetch.hit", 1)
	} else {
		inst.Counter(1.0, "auth.cache.fetch.miss", 1)
	}
	return u, hit, err
}
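// The snippets in this section lean on a small instTiming helper that isn't
// shown in the excerpt. The sketch below is hypothetical: it illustrates the
// wrap-and-instrument pattern (time the call, then bucket it by outcome),
// assuming inst.Timing takes (sampleRate, bucket, duration) as it does in the
// calls elsewhere in this section. The ".success"/".failure" suffixes are an
// assumption, not necessarily the platform's actual bucket names.
func instTiming(bucket string, err error, start time.Time) {
	suffix := ".success"
	if err != nil {
		suffix = ".failure"
	}
	inst.Timing(1.0, bucket+suffix, time.Since(start))
}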
// Authorise determines whether the given request is permitted to call this endpoint
func (a *simpleAuthoriser) Authorise(req *Request) errors.Error {
	// If we require neither a role nor a user, then there is no need to authorise
	if !a.requireUser && !a.requireRole {
		log.Debugf("Skipping auth from %s to %s, as neither user nor role required", req.From(), req.Destination())
		return nil
	}
	// Otherwise, authorise this request
	scope := req.Auth()
	log.Tracef("Scope user: %v", scope.AuthUser())
	if a.requireUser && !scope.IsAuth() {
		return errors.Forbidden("com.hailocab.kernel.auth.notsignedin",
			fmt.Sprintf("Must be signed in to call this endpoint[endpoint=%s, service=%s, from=%s]",
				req.Endpoint(), req.Service(), req.From()), "201")
	}
	if a.requireRole {
		matchesRole := false
		for _, r := range a.roles {
			if scope.HasAccess(r) {
				matchesRole = true
				break
			}
		}
		if !matchesRole {
			if scope.HasTriedAuth() {
				return errors.Forbidden("com.hailocab.kernel.auth.badrole",
					fmt.Sprintf("Must be signed in to call this endpoint[endpoint=%s, service=%s, from=%s]",
						req.Endpoint(), req.Service(), req.From()), "201")
			}
			// Instrument when service-to-service auth fails
			inst.Counter(1.0, "auth.servicetoservice.failed", 1)
			return BadRoleError(req)
		}
	}
	return nil
}
// Auth will pass the supplied details on to the login service in an attempt
// to authenticate a brand new session
func (s *realScope) Auth(mech, device string, creds map[string]string) error {
	t := time.Now()
	u, err := s.doAuth(mech, device, creds)
	instTiming("auth.auth", err, t)
	if s.IsAuth() {
		inst.Counter(0.01, "auth.authenticate.recovered", 1)
	} else {
		inst.Counter(1.0, "auth.authenticate.badCredentials", 1)
	}

	s.Lock()
	defer s.Unlock()
	s.authUser = u
	if err == nil || err == BadCredentialsError {
		s.triedAuth = true
	}

	return err
}
// RegionTimedLock attempts to achieve a regional lock on `id`, waiting for `waitFor` time in
// case of contention (before giving up) and reserving the lock for a maximum of `holdFor`
// in the event of failing to Unlock()
func RegionTimedLock(id []byte, waitFor, holdFor time.Duration) (Lock, error) {
	// Ensure we are reaping
	once.Do(startReaper)

	if int64(holdFor) < int64(minRegionHoldFor) {
		return nil, ErrRegionHoldFor
	}

	lockId := string(id) // we use []byte for function signature compatibility with the global lock

	// Our locks are namespaced per service
	path, err := constructLockPath(regionLockNamespace, lockId)
	if err != nil {
		return nil, err
	}
	log.Tracef("[Sync:RegionTimedLock] Attempting to acquire '%s'; held for %v", path, holdFor)

	// Create a new lock which we will then lock()
	lock := &regionLock{
		zkLock: zk.NewLock(path, gozk.WorldACL(gozk.PermAll)),
	}
	lock.zkLock.SetTTL(holdFor)
	lock.zkLock.SetTimeout(waitFor)

	// Acquire the lock
	startTime := time.Now()
	err = lock.zkLock.Lock()
	inst.Timing(1.0, "sync.regionlock.acquire", time.Since(startTime))
	defaultReaper.addPath(path) // only add path to reaper AFTER we've acquired the lock (or not)

	if err == nil {
		log.Tracef("[Sync:RegionTimedLock] Successfully acquired '%s'", path)
		inst.Counter(1.0, "sync.regionlock.acquire.success")
	} else {
		log.Errorf("[Sync:RegionTimedLock] Failed to acquire '%s': %s", path, err.Error())
		inst.Counter(1.0, "sync.regionlock.acquire.failure")
	}

	return lock, err
}
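// A minimal usage sketch (not part of the original source): acquire a
// region-scoped lock around a critical section. The package alias "ssync" and
// the lock name are hypothetical; it assumes the returned Lock exposes an
// Unlock() method, as the global lock code later in this section uses.
func doNightlyJob() error {
	lock, err := ssync.RegionTimedLock([]byte("nightly-job"), 5*time.Second, 30*time.Second)
	if err != nil {
		return err // couldn't acquire within waitFor (contended, or a ZK error)
	}
	defer lock.Unlock()

	// ... work that must only run in one place per region at a time ...
	return nil
}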
// RecoverSession will try to turn a sessId into a valid user/token, if possible.
// The error will be non-nil if something goes wrong during this process - if we
// can't find any valid user with this sessId that is *not* an error.
// If there is an error, the current state of the scope *will not have been changed*.
// If there is no error, then the state will be updated, either to the recovered
// user *or* to nil, if no user was recovered.
func (s *realScope) RecoverSession(sessId string) error {
	t := time.Now()
	u, err := s.doRecoverSession(sessId)
	instTiming("auth.recoverSession", err, t)
	if s.IsAuth() {
		inst.Counter(0.01, "auth.recoverSession.recovered", 1)
	} else {
		inst.Counter(1.0, "auth.recoverSession.badCredentials", 1)
	}

	s.Lock()
	defer s.Unlock()
	s.authUser = u
	if err == nil {
		s.triedAuth = true
	}

	return err
}
// Rescind should be called to indicate you no longer wish to be the leader
func (rl *regionLeader) Rescind() {
	rl.cleanup.Do(func() {
		log.Debugf("[Sync:RegionLeader] Cleaning up leadership of '%v'...", rl.lockNode)
		close(rl.rescinded)
		// keep trying to delete the ZK node (to release leadership) until we're sure it doesn't exist
		for {
			err := zookeeper.Delete(rl.lockNode, -1)
			if err == nil || err == gozk.ErrNoNode {
				log.Debugf("[Sync:RegionLeader] Have deleted leadership node '%v'", rl.lockNode)
				inst.Counter(1.0, "sync.regionleader.rescinded")
				break
			}
			log.Warnf("[Sync:RegionLeader] Failed to cleanup/rescind leadership (will retry): %v", err)
			time.Sleep(cleanupDelay)
		}
	})
}
func newRegionLeader(lockNode string) *regionLeader {
	rl := &regionLeader{
		active:    true,
		lockNode:  lockNode,
		rescinded: make(chan struct{}),
	}
	// establish a watch to cleanup
	go func() {
		_, _, watch, err := zookeeper.GetW(rl.lockNode)
		if err != nil {
			rl.Rescind()
			return
		}
		<-watch
		log.Debugf("[Sync:RegionLeader] Watch triggered on '%v', will rescind leadership", rl.lockNode)
		inst.Counter(1.0, "sync.regionleader.remotely-rescinded")
		rl.Rescind()
	}()
	return rl
}
// RegionLeader blocks indefinitely until this invocation has been elected the "leader" within the local operating region.
// It will then return a channel that will eventually be closed when leadership is rescinded.
func RegionLeader(id string) Leader {
	path := fmt.Sprintf(regionLeaderPath, id)
	prefix := path + "/lock-"
	var lockNode string
	for {
		// create our lock node -- retry until this is done
		for {
			var err error
			lockNode, err = zookeeper.CreateProtectedEphemeralSequential(prefix, []byte{}, gozk.WorldACL(gozk.PermAll))
			if err == gozk.ErrNoNode {
				createParents(path)
			} else if err == nil {
				break
			} else {
				log.Warnf("[Sync:RegionLeader] ZooKeeper error creating ephemeral lock node for leadership election: %s", err.Error())
			}
		}

		err := waitForWinner(path, lockNode)
		if err != nil {
			// try to cleanup - then go again
			zookeeper.Delete(lockNode, -1)
			time.Sleep(time.Second)
			continue
		}

		// we are the leader
		break
	}

	log.Infof("[Sync:RegionLeader] Elected leader of '%v'", id)
	inst.Counter(1.0, "sync.regionleader.elected")

	return newRegionLeader(lockNode)
}
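// A minimal usage sketch (not part of the original source): block until this
// instance wins the election, do the leader-only work, and rescind when done.
// The package alias "ssync" and the job name are hypothetical; it assumes the
// Leader interface exposes Rescind(), as regionLeader above implements.
func runReconciliation() {
	leader := ssync.RegionLeader("nightly-reconciliation")
	defer leader.Rescind()

	// ... leader-only work; other instances remain blocked in RegionLeader ...
}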
// doReq sends a request, with timeout options and retries, waits for the response and returns it
func (c *client) doReq(req *Request, options ...Options) (*Response, errors.Error) {
	if circuitbreaker.Open(req.service, req.endpoint) {
		inst.Counter(1.0, fmt.Sprintf("client.error.%s.%s.circuitbroken", req.service, req.endpoint), 1)
		log.Warnf("Broken Circuit for %s.%s", req.service, req.endpoint)
		return nil, errors.CircuitBroken("com.hailocab.kernel.platform.circuitbreaker", "Circuit is open")
	}

	retries := c.defaults["retries"].(int)
	var timeout time.Duration
	timeoutSupplied := false
	if len(options) == 1 {
		if _, ok := options[0]["retries"]; ok {
			retries = options[0]["retries"].(int)
		}
		if _, ok := options[0]["timeout"]; ok {
			timeout = options[0]["timeout"].(time.Duration)
			timeoutSupplied = true
		}
	}

	// setup the response channel
	rc := make(chan *Response, retries)
	c.responses.add(req, rc)
	defer c.responses.removeByRequest(req)

	instPrefix := fmt.Sprintf("client.%s.%s", req.service, req.endpoint)
	tAllRetries := time.Now()

	for i := 1; i <= retries+1; i++ {
		t := time.Now()

		c.RLock()
		con := c.listening
		c.RUnlock()
		if !con {
			log.Debug("[Client] not yet listening, establishing now...")
			ch := make(chan bool)
			go c.listen(ch)
			if online := <-ch; !online {
				log.Error("[Client] Listener failed")
				inst.Timing(1.0, fmt.Sprintf("%s.error", instPrefix), time.Since(t))
				inst.Counter(1.0, "client.error.com.hailocab.kernel.platform.client.listenfail", 1)
				return nil, errors.InternalServerError("com.hailocab.kernel.platform.client.listenfail", "Listener failed")
			}
			log.Info("[Client] Listener online")
		}

		// figure out what timeout to use
		if !timeoutSupplied {
			timeout = c.timeout.Get(req.service, req.endpoint, i)
		}
		log.Tracef("[Client] Sync request attempt %d for %s using timeout %v", i, req.MessageID(), timeout)

		// only bother sending the request if we are listening, otherwise allow to timeout
		if err := raven.SendRequest(req, c.instanceID); err != nil {
			log.Errorf("[Client] Failed to send request: %v", err)
		}

		select {
		case payload := <-rc:
			if payload.IsError() {
				inst.Timing(1.0, fmt.Sprintf("%s.error", instPrefix), time.Since(t))

				errorProto := &pe.PlatformError{}
				if err := payload.Unmarshal(errorProto); err != nil {
					inst.Counter(1.0, "client.error.com.hailocab.kernel.platform.badresponse", 1)
					return nil, errors.BadResponse("com.hailocab.kernel.platform.badresponse", err.Error())
				}

				err := errors.FromProtobuf(errorProto)
				inst.Counter(1.0, fmt.Sprintf("client.error.%s", err.Code()), 1)
				circuitbreaker.Result(req.service, req.endpoint, err)
				return nil, err
			}

			inst.Timing(1.0, fmt.Sprintf("%s.success", instPrefix), time.Since(t))
			circuitbreaker.Result(req.service, req.endpoint, nil)
			return payload, nil
		case <-time.After(timeout):
			// timeout
			log.Errorf("[Client] Timeout talking to %s.%s after %v for %s", req.Service(), req.Endpoint(), timeout, req.MessageID())
			inst.Timing(1.0, fmt.Sprintf("%s.error", instPrefix), time.Since(t))
			c.traceAttemptTimeout(req, i, timeout)

			circuitbreaker.Result(req.service, req.endpoint, errors.Timeout("com.hailocab.kernel.platform.timeout",
				fmt.Sprintf("Request timed out talking to %s.%s from %s (most recent timeout %v)", req.Service(), req.Endpoint(), req.From(), timeout),
				req.Service(),
				req.Endpoint()))
		}
	}

	inst.Timing(1.0, fmt.Sprintf("%s.error.timedOut", instPrefix), time.Since(tAllRetries))
	inst.Counter(1.0, "client.error.com.hailocab.kernel.platform.timeout", 1)

	return nil, errors.Timeout(
		"com.hailocab.kernel.platform.timeout",
		fmt.Sprintf("Request timed out talking to %s.%s from %s (most recent timeout %v)", req.Service(), req.Endpoint(), req.From(), timeout),
		req.Service(),
		req.Endpoint(),
	)
}
// HandleRequest handles an inbound request and sends back the response
func HandleRequest(req *Request) {
	defer func() {
		if r := recover(); r != nil {
			log.Criticalf("[Server] Panic \"%v\" when handling request: (id: %s, endpoint: %s, content-type: %s,"+
				" content-length: %d)", r, req.MessageID(), req.Destination(), req.delivery.ContentType, len(req.delivery.Body))
			inst.Counter(1.0, "runtime.panic", 1)
			publishFailure(r)
			debug.PrintStack()
		}
	}()

	if len(req.Service()) > 0 && req.Service() != Name {
		log.Criticalf(`[Server] Message meant for "%s" not "%s"`, req.Service(), Name)
		return
	}

reqProcessor:
	switch {
	case req.isHeartbeat():
		if dsc.IsConnected() {
			log.Tracef("[Server] Inbound heartbeat from: %s", req.ReplyTo())
			dsc.hb.beat()
			raven.SendResponse(PongResponse(req), InstanceID)
		} else {
			log.Warnf("[Server] Not connected but heartbeat from: %s", req.ReplyTo())
		}
	case req.IsPublication():
		log.Tracef("[Server] Inbound publication on topic: %s", req.Topic())
		if endpoint, ok := reg.find(req.Topic()); ok { // Match + call handler
			if data, err := endpoint.unmarshalRequest(req); err != nil {
				log.Warnf("[Server] Failed to unmarshal published message: %s", err.Error())
				break reqProcessor
			} else {
				req.unmarshaledData = data
			}

			if _, err := commonLogHandler(commonLogger, endpoint.instrumentedHandler)(req); err != nil {
				// don't do anything on error apart from log - it's a pub sub call so no response required
				log.Warnf("[Server] Failed to process published message: %v", err)
			}
		}
	default:
		log.Tracef("[Server] Inbound message %s from %s", req.MessageID(), req.ReplyTo())

		// Match a handler
		endpoint, ok := reg.find(req.Endpoint())
		if !ok {
			if rsp, err := ErrorResponse(req, errors.InternalServerError("com.hailocab.kernel.handler.missing",
				fmt.Sprintf("No handler registered for %s", req.Destination()))); err != nil {
				log.Criticalf("[Server] Unable to build response: %v", err)
			} else {
				raven.SendResponse(rsp, InstanceID)
			}
			return
		}

		// Unmarshal the request data
		var (
			reqData, rspData proto.Message
			err              errors.Error
		)
		if reqData, err = endpoint.unmarshalRequest(req); err == nil {
			req.unmarshaledData = reqData
		}

		// Call handler - constraining the max concurrent requests handled
		if err == nil {
			callerName := req.From()
			if callerName == "" {
				callerName = "unknown"
			}
			tokenBucketName := fmt.Sprintf("server.tokens.%s", callerName)
			reqsBucketName := fmt.Sprintf("server.inflightrequests.%s", callerName)

			tokC := tokensChan(callerName)
			select {
			case t := <-tokC:
				func() {
					defer func() {
						atomic.AddUint64(&inFlightRequests, ^uint64(0)) // This is actually a subtraction
						tokC <- t                                       // Return the token to the pool
					}()
					nowInFlight := atomic.AddUint64(&inFlightRequests, 1)

					// Update active request counters
					inst.Gauge(1.0, tokenBucketName, len(tokC))
					inst.Gauge(1.0, reqsBucketName, int(nowInFlight))

					rspData, err = commonLogHandler(commonLogger, endpoint.instrumentedHandler)(req)
				}()
			case <-time.After(time.Duration(endpoint.Mean) * time.Millisecond):
				inst.Gauge(1.0, tokenBucketName, len(tokC))
				inst.Counter(1.0, "server.error.capacity", 1)
				err = errors.InternalServerError("com.hailocab.kernel.server.capacity", fmt.Sprintf("Server %v out of capacity", Name))
			}
		}

		// Check response type matches what's registered
		if err == nil && rspData != nil {
			_, rspProtoT := endpoint.ProtoTypes()
			rspDataT := reflect.TypeOf(rspData)
			if rspProtoT != nil && rspProtoT != rspDataT {
				err = errors.InternalServerError("com.hailocab.kernel.server.mismatchedprotocol",
					fmt.Sprintf("Mismatched response protocol. %s != %s", rspDataT.String(), rspProtoT.String()))
			}
		}

		if err != nil {
			inst.Counter(1.0, fmt.Sprintf("server.error.%s", err.Code()), 1)
			switch err.Type() {
			case errors.ErrorBadRequest, errors.ErrorForbidden, errors.ErrorNotFound:
				log.Debugf("[Server] Handler error %s calling %v.%v from %v: %v", err.Type(), req.Service(), req.Endpoint(), req.From(), err)
			case errors.ErrorInternalServer:
				go publishError(req, err)
				fallthrough
			default:
				log.Errorf("[Server] Handler error %s calling %v.%v from %v: %v", err.Type(), req.Service(), req.Endpoint(), req.From(), err)
			}
			if rsp, err := ErrorResponse(req, err); err != nil {
				log.Criticalf("[Server] Unable to build response: %v", err)
			} else {
				raven.SendResponse(rsp, InstanceID)
			}
			return
		}

		if rsp, err := ReplyResponse(req, rspData); err != nil {
			if rsp, err2 := ErrorResponse(req, errors.InternalServerError("com.hailocab.kernel.marshal.error",
				fmt.Sprintf("Could not marshal response %v", err))); err2 != nil {
				log.Criticalf("[Server] Unable to build error response: %v", err2)
			} else {
				// Send the error response
				raven.SendResponse(rsp, InstanceID)
			}
		} else {
			// Send the successful response
			raven.SendResponse(rsp, InstanceID)
		}
	}
}
// Init is a local init call that handles setup
func Init() {
	// Parse flags and handle them. No other code should be calling flag.Parse()
	handleFlags()

	if len(Name) == 0 {
		log.Critical("No service name found")
		cleanupLogs()
		os.Exit(1)
	}

	// GO!
	log.Infof("Starting up %v (%v)\n%v\n", Name, Version, banner.PrintS(Name))

	rand.Seed(time.Now().UnixNano())

	tokens = make(map[string]chan bool, 100)

	servicesConfigFile := configDir + "/" + Name + "-seelog.xml"
	if _, err := os.Stat(servicesConfigFile); err == nil {
		loadLogConfig(servicesConfigFile)
	}

	// Try and open the access log file
	if accessLogDir := os.Getenv(accessLogEnv); accessLogDir != "" {
		var err error
		accessLogFilename := filepath.Join(accessLogDir, Name+"-access.log")
		commonLogger, err = os.OpenFile(accessLogFilename, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0666)
		if err != nil {
			log.Errorf("[Server] Error opening access log file: %v", err)
			commonLogger = nil
		}
	} else {
		log.Errorf("[Server] No access log env (%v) set", accessLogEnv)
	}

	log.Infof("[Server] Running with %d cores", runtime.GOMAXPROCS(0))

	uuid, _ := uuid.NewV4()
	InstanceID = fmt.Sprintf("server-%s-%s", Name, uuid)

	// Configure key service layer components, loading config from the config service automatically
	config.LoadFromService(Name)
	inst.SetNamespace(Name)
	ssync.SetRegionLockNamespace(Name)

	// Ping graphite that we have started
	inst.Counter(1.0, "runtime.started", 1)

	// Connect the raven and keep checking its status
	ch := raven.Connect()
	if online := <-ch; !online {
		log.Warn("[Server] Failed to connect the raven on first attempt")
	}
	go monitorRaven(ch)

	// Create a new registry for the endpoints
	reg = newRegistry()

	// Add default endpoints
	reg.add(&Endpoint{
		Name:             "health",
		Mean:             100,
		Upper95:          200,
		Handler:          healthHandler,
		RequestProtocol:  new(healthproto.Request),
		ResponseProtocol: new(healthproto.Response),
	})
	reg.add(&Endpoint{
		Name:             "stats",
		Mean:             100,
		Upper95:          200,
		Handler:          statsHandler,
		RequestProtocol:  new(statsproto.Request),
		ResponseProtocol: new(statsproto.PlatformStats),
	})
	reg.add(&Endpoint{
		Name:             "loadedconfig",
		Mean:             100,
		Upper95:          200,
		Handler:          loadedConfigHandler,
		RequestProtocol:  new(loadedconfigproto.Request),
		ResponseProtocol: new(loadedconfigproto.Response),
	})
	reg.add(&Endpoint{
		Name:             "jsonschema",
		Mean:             100,
		Upper95:          200,
		Handler:          jsonschemaHandler,
		RequestProtocol:  new(jsonschemaproto.Request),
		ResponseProtocol: new(jsonschemaproto.Response),
		Authoriser:       OpenToTheWorldAuthoriser(),
	})

	// Initialise platform healthchecks
	initHealthChecks()

	initialised = true
}
// GlobalTimedLock attempts to achieve a global lock on `id`, waiting for `waitFor` time in
// case of contention (before giving up) and reserving the lock for a maximum of `holdFor`
// in the event of failure
// NOTE: locks can and will be held for longer than `holdFor`, but in the case of failure
// (eg: binary crashes) then this is the maximum amount of time other programs will hang
// around contending for the now defunct lock
func GlobalTimedLock(id []byte, waitFor, holdFor time.Duration) (Lock, error) {
	if int64(holdFor) < int64(minHoldFor) {
		return nil, ErrHoldFor
	}

	u, err := gossie.NewTimeUUID()
	if err != nil {
		log.Warnf("[Sync:GlobalLock] Failed to generate time UUID: %v", err)
		return nil, ErrGenUuid
	}
	l := &globalLock{
		id:     id,
		lockId: u,
		exit:   make(chan struct{}),
	}

	// make my node in C*
	pool, err := cassandra.ConnectionPool(keyspace)
	if err != nil {
		return nil, fmt.Errorf("Error locking due to C*: %v", err)
	}
	writer := pool.Writer()
	writer.ConsistencyLevel(gossie.CONSISTENCY_QUORUM).Insert(cfGlobalLock, &gossie.Row{
		Key: l.id,
		Columns: []*gossie.Column{
			{
				Name:  []byte(l.lockId[:]),
				Value: []byte{}, // @todo could inject some data about who has the lock here
				Ttl:   durationToSeconds(holdFor, 1.0),
			},
		},
	})
	startTime := time.Now()
	err = writer.Run()
	if err != nil {
		inst.Timing(1.0, "sync.globaltimedlock.acquire", time.Since(startTime))
		inst.Counter(1.0, "sync.globaltimedlock.acquire.failure")
		return nil, err
	}

	// read all back and ensure i'm the lowest
	reader := pool.Reader().ConsistencyLevel(gossie.CONSISTENCY_QUORUM).Cf(cfGlobalLock)
	attempts := 0
	errs := multierror.New()
	start := time.Now()
	for {
		// break out if we've waited too long
		if attempts > 0 {
			if time.Now().After(start.Add(waitFor)) {
				inst.Timing(1.0, "sync.globaltimedlock.acquire", time.Since(startTime))
				inst.Counter(1.0, "sync.globaltimedlock.acquire.failure")
				l.Unlock()
				return nil, ErrContended
			}
			// delay a bit to avoid hammering C*
			time.Sleep(addJitter(delayFor))
		}
		attempts++

		row, err := reader.Get(l.id)
		if err != nil {
			errs.Add(fmt.Errorf("C* read back error: %v", err))
			continue
		}
		if row == nil || len(row.Columns) == 0 {
			errs.Add(fmt.Errorf("C* read back error: no columns returned from query"))
			continue
		}

		col := row.Columns[0]
		if bytes.Equal(col.Name, []byte(l.lockId[:])) {
			// we have the lock
			break
		}
	}
	inst.Timing(1.0, "sync.globaltimedlock.acquire", time.Since(startTime))
	inst.Counter(1.0, "sync.globaltimedlock.acquire.success")

	// put in place the refresher loop @todo
	go func() {
		for {
			log.Debug("[Sync:GlobalLock] Doing refresher loop…")
			refresh := time.Duration(float64(holdFor) * 0.75)
			select {
			case <-l.exit:
				log.Debugf("[Sync:GlobalLock] Breaking out of refresher loop")
				return
			case <-time.After(refresh):
				log.Debugf("[Sync:GlobalLock] Refreshing %s [%s]", string(l.id), l.lockId.String())
				writer.ConsistencyLevel(gossie.CONSISTENCY_QUORUM).Insert(cfGlobalLock, &gossie.Row{
					Key: l.id,
					Columns: []*gossie.Column{{
						Name:  []byte(l.lockId[:]),
						Value: []byte{}, // @todo could inject some data about who has the lock here
						Ttl:   durationToSeconds(holdFor, 1.5), // 1.5 is because we renew the lock earlier than the timeout, so we need to cover that extra bit
					}},
				})
				if err := writer.Run(); err != nil {
					// @todo we could inform clients of this, somehow, eg: via a channel
					log.Warnf("[Sync:GlobalLock] failed to refresh lock .. cannot guarantee exclusivity")
				}
			}
		}
	}()

	return l, nil
}
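// durationToSeconds isn't shown in this excerpt. A plausible sketch
// (hypothetical): convert the hold duration to a whole number of seconds for
// the Cassandra column TTL, scaled by a multiplier (1.5 on refresh so the TTL
// comfortably outlives the renewal interval). Rounding up and the minimum of
// one second are assumptions. Assumes the "math" package is imported.
func durationToSeconds(d time.Duration, multiplier float64) int32 {
	secs := d.Seconds() * multiplier
	if secs < 1 {
		return 1 // a TTL below one second isn't representable
	}
	return int32(math.Ceil(secs))
}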