// instrumentedHandler wraps the handler to provide instrumentation
func (ep *Endpoint) instrumentedHandler(req *Request) (proto.Message, perrors.Error) {
	start := time.Now()
	var err perrors.Error
	var m proto.Message
	// In a defer in case the handler panics
	defer func() {
		stats.Record(ep, err, time.Since(start))
		if err == nil {
			inst.Timing(1.0, "success."+ep.Name, time.Since(start))
			return
		}
		switch err.Type() {
		case perrors.ErrorBadRequest, perrors.ErrorNotFound:
			// Ignore errors that are caused by clients
			// TODO: consider a new stat for clienterror?
			inst.Timing(1.0, "success."+ep.Name, time.Since(start))
			return
		default:
			inst.Timing(1.0, "error."+ep.Name, time.Since(start))
		}
	}()

	traceIn(req)

	// check auth, only call handler if auth passes
	err = ep.Authoriser.Authorise(req)
	if err == nil {
		req.Auth().SetAuthorised(true)
		m, err = ep.Handler(req)
	}
	traceOut(req, m, err, time.Since(start))

	return m, err
}
// MultiPublish pubs X messages at once, synchronously, to N of M NSQs
func (p *HostpoolPublisher) MultiPublish(topic string, body [][]byte) error {
	if len(body) <= 0 {
		// This is a fatal error in the server that causes a disconnect.
		// Let's not send it
		return ErrEmptyBody
	}
	p.once.Do(p.setup)
	p.RLock()
	defer p.RUnlock()

	// must have some hosts configured
	if p.count == 0 {
		return fmt.Errorf("No NSQ pubHosts configured (hailo.service.nsq.pubHosts)")
	}

	// need to achieve N successful results; we'll round-robin from random starting point
	seen := make(map[string]bool)
	maxTries := p.count + p.count
	errarr := make([]error, 0)
	for try := 0; try < maxTries; try++ {
		// Get new host from pool
		hpr := p.hostpool.Get()
		host := hpr.Host()

		// have we already PUBbed to this?
		if _, exists := seen[host]; exists {
			continue
		}

		// Instrument payload body size
		inst.Timing(p.instrumentationSampleRate, "nsq.publish."+topic+".size", time.Duration(len(body)))

		// try this one
		err := p.producers[host].MultiPublish(topic, body)
		hpr.Mark(err) // Update hostpool
		if err != nil {
			errarr = append(errarr, err)
			continue
		}
		seen[host] = true

		// have we PUBbed to enough yet?
		if len(seen) >= p.n {
			break
		}
	}

	if len(seen) < p.n {
		errStr := "Errors"
		for _, anErr := range errarr {
			errStr += fmt.Sprintf(" :: %v", anErr)
		}
		return fmt.Errorf("Could not PUB to enough NSQs to consider this a success. Did PUB to %v out of %v attempts. %v", len(seen), maxTries, errStr)
	}

	return nil
}
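A minimal usage sketch for the publisher above; the `publisher` variable and the topic/payload values are illustrative assumptions, since construction of a HostpoolPublisher is not shown in this listing.

// Hypothetical usage of MultiPublish, assuming `publisher` is an already
// configured *HostpoolPublisher.
batch := [][]byte{
	[]byte(`{"event":"job.created","id":"123"}`),
	[]byte(`{"event":"job.created","id":"124"}`),
}
if err := publisher.MultiPublish("job-events", batch); err != nil {
	// err aggregates per-host failures when fewer than N hosts accepted the batch
	log.Errorf("failed to publish batch: %v", err)
}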
func instTiming(key string, err error, t time.Time) {
	if err == nil {
		key += ".success"
	} else {
		key += ".failure"
	}
	inst.Timing(1.0, key, time.Since(t))
}
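The helper above captures a common success/failure timing pattern. A hedged sketch of a caller follows; doSomething and the metric key are hypothetical and not part of the original source.

// Hypothetical caller: record one timing sample per call, suffixed with
// ".success" or ".failure" depending on the returned error.
func doSomethingInstrumented() error {
	t := time.Now()
	err := doSomething() // hypothetical operation being timed
	instTiming("myservice.dosomething", err, t)
	return err
}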
// RegionTimedLock attempts to achieve a regional lock on `id`, waiting for `waitFor` time in
// case of contention (before giving up) and reserving the lock for a maximum `holdFor`
// in the event of failing to Unlock()
func RegionTimedLock(id []byte, waitFor, holdFor time.Duration) (Lock, error) {
	// Ensure we are reaping
	once.Do(startReaper)

	if int64(holdFor) < int64(minRegionHoldFor) {
		return nil, ErrRegionHoldFor
	}

	lockId := string(id) // we use []byte for function signature compatibility with the global lock

	// Our locks are namespaced per service
	path, err := constructLockPath(regionLockNamespace, lockId)
	if err != nil {
		return nil, err
	}
	log.Tracef("[Sync:RegionTimedLock] Attempting to acquire '%s'; held for %v", path, holdFor)

	// Create a new lock which we will then attempt to lock()
	lock := &regionLock{
		zkLock: zk.NewLock(path, gozk.WorldACL(gozk.PermAll)),
	}
	lock.zkLock.SetTTL(holdFor)
	lock.zkLock.SetTimeout(waitFor)

	// Acquire the lock
	startTime := time.Now()
	err = lock.zkLock.Lock()
	inst.Timing(1.0, "sync.regionlock.acquire", time.Since(startTime))
	defaultReaper.addPath(path) // only add path to reaper AFTER we've acquired the lock (or not)

	if err == nil {
		log.Tracef("[Sync:RegionTimedLock] Successfully acquired '%s'", path)
		inst.Counter(1.0, "sync.regionlock.acquire.success")
	} else {
		log.Errorf("[Sync:RegionTimedLock] Failed to acquire '%s': %s", path, err.Error())
		inst.Counter(1.0, "sync.regionlock.acquire.failure")
	}

	return lock, err
}
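A hedged sketch of acquiring and releasing a regional lock with the function above; the lock id and the durations are illustrative only.

// Hypothetical usage: contend for up to 5s, and hold for at most 30s should we
// crash before calling Unlock().
lock, err := RegionTimedLock([]byte("billing-run"), 5*time.Second, 30*time.Second)
if err != nil {
	return err // e.g. ErrRegionHoldFor, or an acquisition failure from ZooKeeper
}
defer lock.Unlock()
// ... critical section ...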
// doReq sends a request, with timeout options and retries, waits for response and returns it
func (c *client) doReq(req *Request, options ...Options) (*Response, errors.Error) {
	if circuitbreaker.Open(req.service, req.endpoint) {
		inst.Counter(1.0, fmt.Sprintf("client.error.%s.%s.circuitbroken", req.service, req.endpoint), 1)
		log.Warnf("Broken Circuit for %s.%s", req.service, req.endpoint)
		return nil, errors.CircuitBroken("com.hailocab.kernel.platform.circuitbreaker", "Circuit is open")
	}

	retries := c.defaults["retries"].(int)
	var timeout time.Duration
	timeoutSupplied := false
	if len(options) == 1 {
		if _, ok := options[0]["retries"]; ok {
			retries = options[0]["retries"].(int)
		}
		if _, ok := options[0]["timeout"]; ok {
			timeout = options[0]["timeout"].(time.Duration)
			timeoutSupplied = true
		}
	}

	// setup the response channel
	rc := make(chan *Response, retries)
	c.responses.add(req, rc)
	defer c.responses.removeByRequest(req)

	instPrefix := fmt.Sprintf("client.%s.%s", req.service, req.endpoint)
	tAllRetries := time.Now()

	for i := 1; i <= retries+1; i++ {
		t := time.Now()

		c.RLock()
		con := c.listening
		c.RUnlock()
		if !con {
			log.Debug("[Client] not yet listening, establishing now...")
			ch := make(chan bool)
			go c.listen(ch)
			if online := <-ch; !online {
				log.Error("[Client] Listener failed")
				inst.Timing(1.0, fmt.Sprintf("%s.error", instPrefix), time.Since(t))
				inst.Counter(1.0, "client.error.com.hailocab.kernel.platform.client.listenfail", 1)
				return nil, errors.InternalServerError("com.hailocab.kernel.platform.client.listenfail", "Listener failed")
			}
			log.Info("[Client] Listener online")
		}

		// figure out what timeout to use
		if !timeoutSupplied {
			timeout = c.timeout.Get(req.service, req.endpoint, i)
		}
		log.Tracef("[Client] Sync request attempt %d for %s using timeout %v", i, req.MessageID(), timeout)

		// only bother sending the request if we are listening, otherwise allow to timeout
		if err := raven.SendRequest(req, c.instanceID); err != nil {
			log.Errorf("[Client] Failed to send request: %v", err)
		}

		select {
		case payload := <-rc:
			if payload.IsError() {
				inst.Timing(1.0, fmt.Sprintf("%s.error", instPrefix), time.Since(t))
				errorProto := &pe.PlatformError{}
				if err := payload.Unmarshal(errorProto); err != nil {
					inst.Counter(1.0, "client.error.com.hailocab.kernel.platform.badresponse", 1)
					return nil, errors.BadResponse("com.hailocab.kernel.platform.badresponse", err.Error())
				}
				err := errors.FromProtobuf(errorProto)
				inst.Counter(1.0, fmt.Sprintf("client.error.%s", err.Code()), 1)
				circuitbreaker.Result(req.service, req.endpoint, err)
				return nil, err
			}
			inst.Timing(1.0, fmt.Sprintf("%s.success", instPrefix), time.Since(t))
			circuitbreaker.Result(req.service, req.endpoint, nil)
			return payload, nil
		case <-time.After(timeout):
			// timeout
			log.Errorf("[Client] Timeout talking to %s.%s after %v for %s", req.Service(), req.Endpoint(), timeout, req.MessageID())
			inst.Timing(1.0, fmt.Sprintf("%s.error", instPrefix), time.Since(t))
			c.traceAttemptTimeout(req, i, timeout)
			circuitbreaker.Result(req.service, req.endpoint, errors.Timeout("com.hailocab.kernel.platform.timeout",
				fmt.Sprintf("Request timed out talking to %s.%s from %s (most recent timeout %v)", req.Service(), req.Endpoint(), req.From(), timeout),
				req.Service(),
				req.Endpoint()))
		}
	}

	inst.Timing(1.0, fmt.Sprintf("%s.error.timedOut", instPrefix), time.Since(tAllRetries))
	inst.Counter(1.0, "client.error.com.hailocab.kernel.platform.timeout", 1)

	return nil, errors.Timeout(
		"com.hailocab.kernel.platform.timeout",
		fmt.Sprintf("Request timed out talking to %s.%s from %s (most recent timeout %v)", req.Service(), req.Endpoint(), req.From(), timeout),
		req.Service(),
		req.Endpoint(),
	)
}
func Set(item *memcache.Item) error {
	start := time.Now()
	// Use a deferred closure so the elapsed time is measured when the function
	// returns; passing time.Since(start) directly to defer would evaluate it
	// immediately and always record ~0
	defer func() { inst.Timing(timingSampleRate, "memcached.set", time.Since(start)) }()
	return defaultClient.Set(item)
}

func Increment(key string, delta uint64) (newValue uint64, err error) {
	start := time.Now()
	defer func() { inst.Timing(timingSampleRate, "memcached.increment", time.Since(start)) }()
	return defaultClient.Increment(key, delta)
}

func GetMulti(keys []string) (map[string]*memcache.Item, error) {
	start := time.Now()
	defer func() { inst.Timing(timingSampleRate, "memcached.get-multi", time.Since(start)) }()
	return defaultClient.GetMulti(keys)
}

func Get(key string) (item *memcache.Item, err error) {
	start := time.Now()
	defer func() { inst.Timing(timingSampleRate, "memcached.get", time.Since(start)) }()
	return defaultClient.Get(key)
}

func Delete(key string) error {
	start := time.Now()
	defer func() { inst.Timing(timingSampleRate, "memcached.delete", time.Since(start)) }()
	return defaultClient.Delete(key)
}

func CompareAndSwap(item *memcache.Item) error {
	start := time.Now()
	defer func() { inst.Timing(timingSampleRate, "memcached.compare-and-swap", time.Since(start)) }()
	return defaultClient.CompareAndSwap(item)
}
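A brief sketch of the instrumented package-level wrappers above in use; the key names, value, and expiry are illustrative.

// Hypothetical usage of the instrumented helpers; each call records a
// "memcached.*" timing sample via the deferred closures above.
if err := Set(&memcache.Item{Key: "driver:42", Value: []byte("online"), Expiration: 60}); err != nil {
	log.Warnf("memcached set failed: %v", err)
}
item, err := Get("driver:42")
if err == nil {
	log.Debugf("got %s = %s", item.Key, item.Value)
}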
// GlobalTimedLock attempts to achieve a global lock on `id`, waiting for `waitFor` time in
// case of contention (before giving up) and reserving the lock for a maximum `holdFor`
// in the event of failure
// NOTE: locks can and will be held for longer than `holdFor`, but in the case of failure
// (eg: binary crashes) then this is the maximum amount of time other programs will hang
// around contending for the now defunct lock
func GlobalTimedLock(id []byte, waitFor, holdFor time.Duration) (Lock, error) {
	if int64(holdFor) < int64(minHoldFor) {
		return nil, ErrHoldFor
	}

	u, err := gossie.NewTimeUUID()
	if err != nil {
		log.Warnf("[Sync:GlobalLock] Failed to generate time UUID: %v", err)
		return nil, ErrGenUuid
	}

	l := &globalLock{
		id:     id,
		lockId: u,
		exit:   make(chan struct{}),
	}

	// make my node in C*
	pool, err := cassandra.ConnectionPool(keyspace)
	if err != nil {
		return nil, fmt.Errorf("Error locking due to C*: %v", err)
	}
	writer := pool.Writer()
	writer.ConsistencyLevel(gossie.CONSISTENCY_QUORUM).Insert(cfGlobalLock, &gossie.Row{
		Key: l.id,
		Columns: []*gossie.Column{
			{
				Name:  []byte(l.lockId[:]),
				Value: []byte{}, // @todo could inject some data about who has the lock here
				Ttl:   durationToSeconds(holdFor, 1.0),
			},
		},
	})
	startTime := time.Now()
	err = writer.Run()
	if err != nil {
		inst.Timing(1.0, "sync.globaltimedlock.acquire", time.Since(startTime))
		inst.Counter(1.0, "sync.globaltimedlock.acquire.failure")
		return nil, err
	}

	// read all back and ensure i'm the lowest
	reader := pool.Reader().ConsistencyLevel(gossie.CONSISTENCY_QUORUM).Cf(cfGlobalLock)
	attempts := 0
	errs := multierror.New()
	start := time.Now()
	for {
		// break out if we've waited too long
		if attempts > 0 {
			if time.Now().After(start.Add(waitFor)) {
				inst.Timing(1.0, "sync.globaltimedlock.acquire", time.Since(startTime))
				inst.Counter(1.0, "sync.globaltimedlock.acquire.failure")
				l.Unlock()
				return nil, ErrContended
			}
			// delay a bit to avoid hammering C*
			time.Sleep(addJitter(delayFor))
		}
		attempts++

		row, err := reader.Get(l.id)
		if err != nil {
			errs.Add(fmt.Errorf("C* read back error: %v", err))
			continue
		}
		if row == nil || len(row.Columns) == 0 {
			errs.Add(fmt.Errorf("C* read back error: no columns returned from query"))
			continue
		}

		col := row.Columns[0]
		if bytes.Equal(col.Name, []byte(l.lockId[:])) {
			// we have the lock
			break
		}
	}
	inst.Timing(1.0, "sync.globaltimedlock.acquire", time.Since(startTime))
	inst.Counter(1.0, "sync.globaltimedlock.acquire.success")

	// put in place the refresher loop @todo
	go func() {
		for {
			log.Debug("[Sync:GlobalLock] Doing refresher loop…")
			refresh := time.Duration(float64(holdFor) * 0.75)
			select {
			case <-l.exit:
				log.Debugf("[Sync:GlobalLock] Breaking out of refresher loop")
				return
			case <-time.After(refresh):
				log.Debugf("[Sync:GlobalLock] Refreshing %s [%s]", string(l.id), l.lockId.String())
				writer.ConsistencyLevel(gossie.CONSISTENCY_QUORUM).Insert(cfGlobalLock, &gossie.Row{
					Key: l.id,
					Columns: []*gossie.Column{{
						Name:  []byte(l.lockId[:]),
						Value: []byte{}, // @todo could inject some data about who has the lock here
						Ttl:   durationToSeconds(holdFor, 1.5), // 1.5 is because we renew the lock earlier than the timeout, so we need to cover that extra bit
					}},
				})
				if err := writer.Run(); err != nil {
					// @todo we could inform clients of this, somehow, eg: via a channel
					log.Warnf("[Sync:GlobalLock] failed to refresh lock .. cannot guarantee exclusivity")
				}
			}
		}
	}()

	return l, nil
}
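As with the regional variant, a hedged usage sketch; the id and durations are illustrative, and the work inside the critical section is a placeholder.

// Hypothetical usage: acquire a cluster-wide lock, letting the refresher
// goroutine above keep it alive until Unlock() releases it.
lock, err := GlobalTimedLock([]byte("daily-settlement"), 10*time.Second, time.Minute)
if err != nil {
	return err // e.g. ErrContended if another holder owns the lowest column
}
defer lock.Unlock()
// ... do the globally-exclusive work ...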