Example #1
// instrumentedHandler wraps the handler to provide instrumentation
func (ep *Endpoint) instrumentedHandler(req *Request) (proto.Message, perrors.Error) {
	start := time.Now()

	var err perrors.Error
	var m proto.Message

	// Deferred so the instrumentation is recorded even if the handler panics
	defer func() {
		stats.Record(ep, err, time.Since(start))
		if err == nil {
			inst.Timing(1.0, "success."+ep.Name, time.Since(start))
			return
		}
		switch err.Type() {
		case perrors.ErrorBadRequest, perrors.ErrorNotFound:
			// Ignore errors that are caused by clients
			// TODO: consider a new stat for clienterror?
			inst.Timing(1.0, "success."+ep.Name, time.Since(start))
			return
		default:
			inst.Timing(1.0, "error."+ep.Name, time.Since(start))
		}
	}()

	traceIn(req)
	// check auth, only call handler if auth passes
	err = ep.Authoriser.Authorise(req)
	if err == nil {
		req.Auth().SetAuthorised(true)
		m, err = ep.Handler(req)
	}
	traceOut(req, m, err, time.Since(start))
	return m, err
}
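Distilling the deferred-timing idiom from the handler above into a minimal, hedged sketch (timedCall and its work parameter are illustrative, not from the source): the deferred closure reads err at return time, so the sample is classified by whatever error is set when the function exits.

// Minimal sketch of the deferred success/error timing pattern (illustrative names).
func timedCall(name string, work func() error) error {
	start := time.Now()
	var err error
	defer func() {
		key := "success." + name
		if err != nil {
			key = "error." + name
		}
		inst.Timing(1.0, key, time.Since(start))
	}()

	err = work()
	return err
}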
Example #2
// MultiPublish pubs X messages at once, synchronously, to N of M NSQs
func (p *HostpoolPublisher) MultiPublish(topic string, body [][]byte) error {

	if len(body) == 0 {
		// Publishing an empty body is a fatal error on the NSQ server (it
		// triggers a disconnect), so don't send it
		return ErrEmptyBody
	}

	p.once.Do(p.setup)

	p.RLock()
	defer p.RUnlock()

	// must have some hosts configured
	if p.count == 0 {
		return fmt.Errorf("No NSQ pubHosts configured (hailo.service.nsq.pubHosts)")
	}

	// need to achieve N successful results; we'll round-robin from random starting point
	seen := make(map[string]bool)
	maxTries := p.count + p.count
	errarr := make([]error, 0)
	for try := 0; try < maxTries; try++ {
		// Get new host from pool
		hpr := p.hostpool.Get()
		host := hpr.Host()

		// have we already PUBbed to this?
		if _, exists := seen[host]; exists {
			continue
		}

		// Instrument the batch size (len(body) = number of messages), recorded via the timing interface
		inst.Timing(p.instrumentationSampleRate, "nsq.publish."+topic+".size", time.Duration(len(body)))

		// try this one
		err := p.producers[host].MultiPublish(topic, body)
		hpr.Mark(err) // Update hostpool
		if err != nil {
			errarr = append(errarr, err)
			continue
		}

		seen[host] = true
		// have we PUBbed to enough yet?
		if len(seen) >= p.n {
			break
		}
	}

	if len(seen) < p.n {
		errStr := "Errors"
		for _, anErr := range errarr {
			errStr += fmt.Sprintf(" :: %v", anErr)
		}
		return fmt.Errorf("Could not PUB to enough NSQs to consider this a success. Did PUB to %v out of %v attempts. %v", len(seen), maxTries, errStr)
	}

	return nil
}
Example #3
// instTiming records how long an operation took, suffixing the stat key with
// ".success" or ".failure" depending on whether err is nil
func instTiming(key string, err error, t time.Time) {
	if err == nil {
		key += ".success"
	} else {
		key += ".failure"
	}
	inst.Timing(1.0, key, time.Since(t))
}
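A hedged usage sketch for instTiming (the wrapping function and stat key are assumptions, not from the source): deferring a closure over a named error return means the ".success"/".failure" suffix reflects the final outcome, and the elapsed time is measured when the function returns.

// Hypothetical caller; function and key names are illustrative only.
func saveRecord(save func() error) (err error) {
	start := time.Now()
	// Closure over the named return err so the suffix matches the final result
	defer func() { instTiming("records.save", err, start) }()
	err = save()
	return err
}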
Example #4
// RegionTimedLock attempts to achieve a regional lock on `id`, waiting for `waitFor` time in
// case of contention (before giving up) and reserving the lock for a maximum `holdFor`
// in the event of failing to Unlock()
func RegionTimedLock(id []byte, waitFor, holdFor time.Duration) (Lock, error) {
	// Ensure we are reaping
	once.Do(startReaper)

	if int64(holdFor) < int64(minRegionHoldFor) {
		return nil, ErrRegionHoldFor
	}

	lockId := string(id) // we use []byte for function signature compatibility with the global lock

	// Our locks are namespaced per service
	path, err := constructLockPath(regionLockNamespace, lockId)
	if err != nil {
		return nil, err
	}
	log.Tracef("[Sync:RegionTimedLock] Attempting to acquire '%s'; held for %v", path, holdFor)

	// Create a new lock, which we will then Lock()
	lock := &regionLock{
		zkLock: zk.NewLock(path, gozk.WorldACL(gozk.PermAll)),
	}
	lock.zkLock.SetTTL(holdFor)
	lock.zkLock.SetTimeout(waitFor)
	// Acquire a lock
	startTime := time.Now()
	err = lock.zkLock.Lock()
	inst.Timing(1.0, "sync.regionlock.acquire", time.Since(startTime))
	defaultReaper.addPath(path) // only add path to reaper AFTER we've acquired the lock (or not)

	if err == nil {
		log.Tracef("[Sync:RegionTimedLock] Successfully acquired '%s'", path)
		inst.Counter(1.0, "sync.regionlock.acquire.success")
	} else {
		log.Errorf("[Sync:RegionTimedLock] Failed to acquire '%s': %s", path, err.Error())
		inst.Counter(1.0, "sync.regionlock.acquire.failure")
	}

	return lock, err
}
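A hedged caller sketch for RegionTimedLock (the lock id, durations, and runDailyReport are assumptions; Unlock() appears on locks elsewhere in these examples):

// Illustrative usage: wait up to 5s to acquire; if the process dies while
// holding the lock, it expires after 30s.
func runDailyReport(generate func() error) error {
	lock, err := RegionTimedLock([]byte("daily-report"), 5*time.Second, 30*time.Second)
	if err != nil {
		return err
	}
	defer lock.Unlock()
	return generate()
}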
Example #5
// doReq sends a request, with timeout options and retries, waits for response and returns it
func (c *client) doReq(req *Request, options ...Options) (*Response, errors.Error) {

	if circuitbreaker.Open(req.service, req.endpoint) {
		inst.Counter(1.0, fmt.Sprintf("client.error.%s.%s.circuitbroken", req.service, req.endpoint), 1)
		log.Warnf("Broken Circuit for %s.%s", req.service, req.endpoint)
		return nil, errors.CircuitBroken("com.hailocab.kernel.platform.circuitbreaker", "Circuit is open")
	}

	retries := c.defaults["retries"].(int)
	var timeout time.Duration
	timeoutSupplied := false
	if len(options) == 1 {
		if _, ok := options[0]["retries"]; ok {
			retries = options[0]["retries"].(int)
		}
		if _, ok := options[0]["timeout"]; ok {
			timeout = options[0]["timeout"].(time.Duration)
			timeoutSupplied = true
		}
	}

	// setup the response channel
	rc := make(chan *Response, retries)
	c.responses.add(req, rc)
	defer c.responses.removeByRequest(req)

	instPrefix := fmt.Sprintf("client.%s.%s", req.service, req.endpoint)
	tAllRetries := time.Now()

	for i := 1; i <= retries+1; i++ {
		t := time.Now()

		c.RLock()
		con := c.listening
		c.RUnlock()
		if !con {
			log.Debug("[Client] not yet listening, establishing now...")
			ch := make(chan bool)
			go c.listen(ch)
			if online := <-ch; !online {
				log.Error("[Client] Listener failed")
				inst.Timing(1.0, fmt.Sprintf("%s.error", instPrefix), time.Since(t))
				inst.Counter(1.0, "client.error.com.hailocab.kernel.platform.client.listenfail", 1)
				return nil, errors.InternalServerError("com.hailocab.kernel.platform.client.listenfail", "Listener failed")
			}

			log.Info("[Client] Listener online")
		}

		// figure out what timeout to use
		if !timeoutSupplied {
			timeout = c.timeout.Get(req.service, req.endpoint, i)
		}
		log.Tracef("[Client] Sync request attempt %d for %s using timeout %v", i, req.MessageID(), timeout)

		// send the request; if sending fails, just log it and let the timeout below trigger a retry
		if err := raven.SendRequest(req, c.instanceID); err != nil {
			log.Errorf("[Client] Failed to send request: %v", err)
		}

		select {
		case payload := <-rc:
			if payload.IsError() {
				inst.Timing(1.0, fmt.Sprintf("%s.error", instPrefix), time.Since(t))

				errorProto := &pe.PlatformError{}
				if err := payload.Unmarshal(errorProto); err != nil {
					inst.Counter(1.0, "client.error.com.hailocab.kernel.platform.badresponse", 1)
					return nil, errors.BadResponse("com.hailocab.kernel.platform.badresponse", err.Error())
				}

				err := errors.FromProtobuf(errorProto)
				inst.Counter(1.0, fmt.Sprintf("client.error.%s", err.Code()), 1)

				circuitbreaker.Result(req.service, req.endpoint, err)

				return nil, err
			}

			inst.Timing(1.0, fmt.Sprintf("%s.success", instPrefix), time.Since(t))
			circuitbreaker.Result(req.service, req.endpoint, nil)
			return payload, nil
		case <-time.After(timeout):
			// timeout
			log.Errorf("[Client] Timeout talking to %s.%s after %v for %s", req.Service(), req.Endpoint(), timeout, req.MessageID())
			inst.Timing(1.0, fmt.Sprintf("%s.error", instPrefix), time.Since(t))
			c.traceAttemptTimeout(req, i, timeout)

			circuitbreaker.Result(req.service, req.endpoint, errors.Timeout("com.hailocab.kernel.platform.timeout",
				fmt.Sprintf("Request timed out talking to %s.%s from %s (most recent timeout %v)", req.Service(), req.Endpoint(), req.From(), timeout),
				req.Service(),
				req.Endpoint()))
		}
	}

	inst.Timing(1.0, fmt.Sprintf("%s.error.timedOut", instPrefix), time.Since(tAllRetries))
	inst.Counter(1.0, "client.error.com.hailocab.kernel.platform.timeout", 1)

	return nil, errors.Timeout(
		"com.hailocab.kernel.platform.timeout",
		fmt.Sprintf("Request timed out talking to %s.%s from %s (most recent timeout %v)", req.Service(), req.Endpoint(), req.From(), timeout),
		req.Service(),
		req.Endpoint(),
	)
}
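The option reads above suggest Options is a string-keyed map of interface{} values; a hedged fragment of a call site overriding both knobs (the request construction is elided, and the composite-literal syntax is an assumption):

// Hypothetical call site, assuming Options is a string-keyed map type.
rsp, err := c.doReq(req, Options{
	"retries": 2,
	"timeout": 500 * time.Millisecond,
})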
Example #6
func Set(item *memcache.Item) error {
	start := time.Now()
	// Wrapped in a closure: a deferred call's arguments are evaluated immediately,
	// so a bare defer would record ~0 elapsed time rather than the call's duration.
	defer func() { inst.Timing(timingSampleRate, "memcached.set", time.Since(start)) }()
	return defaultClient.Set(item)
}
Example #7
func Increment(key string, delta uint64) (newValue uint64, err error) {
	start := time.Now()
	// Closure so time.Since runs when the function returns, not at defer time
	defer func() { inst.Timing(timingSampleRate, "memcached.increment", time.Since(start)) }()
	return defaultClient.Increment(key, delta)
}
Example #8
func GetMulti(keys []string) (map[string]*memcache.Item, error) {
	start := time.Now()
	// Closure so time.Since runs when the function returns, not at defer time
	defer func() { inst.Timing(timingSampleRate, "memcached.get-multi", time.Since(start)) }()
	return defaultClient.GetMulti(keys)
}
Example #9
func Get(key string) (item *memcache.Item, err error) {
	start := time.Now()
	// Closure so time.Since runs when the function returns, not at defer time
	defer func() { inst.Timing(timingSampleRate, "memcached.get", time.Since(start)) }()
	return defaultClient.Get(key)
}
Example #10
func Delete(key string) error {
	start := time.Now()
	// Closure so time.Since runs when the function returns, not at defer time
	defer func() { inst.Timing(timingSampleRate, "memcached.delete", time.Since(start)) }()
	return defaultClient.Delete(key)
}
Example #11
func CompareAndSwap(item *memcache.Item) error {
	start := time.Now()
	// Closure so time.Since runs when the function returns, not at defer time
	defer func() { inst.Timing(timingSampleRate, "memcached.compare-and-swap", time.Since(start)) }()
	return defaultClient.CompareAndSwap(item)
}
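A hedged caller sketch for the memcached helpers in Examples #6 through #11 (the key and payload are assumptions): each helper proxies to defaultClient and records a "memcached.*" timing sample around the call.

// Illustrative usage of the package-level helpers (names assumed).
func cacheAndRead(id string, payload []byte) ([]byte, error) {
	if err := Set(&memcache.Item{Key: "session:" + id, Value: payload}); err != nil {
		return nil, err
	}
	item, err := Get("session:" + id)
	if err != nil {
		return nil, err
	}
	return item.Value, nil
}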
Example #12
// GlobalTimedLock attempts to achieve a global lock on `id`, waiting for `waitFor` time in
// case of contention (before giving up) and reserving the lock for a maximum `holdFor`
// in the event of failure
// NOTE: locks can and will be held for longer than `holdFor`, but in the case of failure
// (e.g. the binary crashes) this is the maximum amount of time other programs will hang
// around contending for the now-defunct lock
func GlobalTimedLock(id []byte, waitFor, holdFor time.Duration) (Lock, error) {
	if int64(holdFor) < int64(minHoldFor) {
		return nil, ErrHoldFor
	}

	u, err := gossie.NewTimeUUID()
	if err != nil {
		log.Warnf("[Sync:GlobalLock] Failed to generate time UUID: %v", err)
		return nil, ErrGenUuid
	}
	l := &globalLock{
		id:     id,
		lockId: u,
		exit:   make(chan struct{}),
	}

	// make my node in C*
	pool, err := cassandra.ConnectionPool(keyspace)
	if err != nil {
		return nil, fmt.Errorf("Error locking due to C*: %v", err)
	}
	writer := pool.Writer()
	writer.ConsistencyLevel(gossie.CONSISTENCY_QUORUM).Insert(cfGlobalLock, &gossie.Row{
		Key: l.id,
		Columns: []*gossie.Column{
			{
				Name:  []byte(l.lockId[:]),
				Value: []byte{}, // @todo could inject some data about who has the lock here
				Ttl:   durationToSeconds(holdFor, 1.0),
			},
		},
	})
	startTime := time.Now()
	err = writer.Run()
	if err != nil {
		inst.Timing(1.0, "sync.globaltimedlock.acquire", time.Since(startTime))
		inst.Counter(1.0, "sync.globaltimedlock.acquire.failure")
		return nil, err
	}

	// read all back and ensure ours is the lowest (first) column
	reader := pool.Reader().ConsistencyLevel(gossie.CONSISTENCY_QUORUM).Cf(cfGlobalLock)
	attempts := 0
	errs := multierror.New()
	start := time.Now()
	for {
		// break out if we've waited too long
		if attempts > 0 {
			if time.Now().After(start.Add(waitFor)) {
				inst.Timing(1.0, "sync.globaltimedlock.acquire", time.Since(startTime))
				inst.Counter(1.0, "sync.globaltimedlock.acquire.failure")
				l.Unlock()
				return nil, ErrContended
			}
			// delay a bit to avoid hammering C*
			time.Sleep(addJitter(delayFor))
		}

		attempts++

		row, err := reader.Get(l.id)
		if err != nil {
			errs.Add(fmt.Errorf("C* read back error: %v", err))
			continue
		}
		if row == nil || len(row.Columns) == 0 {
			errs.Add(fmt.Errorf("C* read back error: no columns returned from query"))
			continue
		}

		col := row.Columns[0]
		if bytes.Equal(col.Name, []byte(l.lockId[:])) {
			// we have the lock
			break
		}
	}

	inst.Timing(1.0, "sync.globaltimedlock.acquire", time.Since(startTime))
	inst.Counter(1.0, "sync.globaltimedlock.acquire.success")

	// put in place the refresher loop @todo
	go func() {
		for {
			log.Debug("[Sync:GlobalLock] Doing refresher loop…")
			refresh := time.Duration(float64(holdFor) * 0.75)
			select {
			case <-l.exit:
				log.Debugf("[Sync:GlobalLock] Breaking out of refresher loop")
				return
			case <-time.After(refresh):
				log.Debugf("[Sync:GlobalLock] Refreshing %s [%s]", string(l.id), l.lockId.String())
				writer.ConsistencyLevel(gossie.CONSISTENCY_QUORUM).Insert(cfGlobalLock, &gossie.Row{
					Key: l.id,
					Columns: []*gossie.Column{{
						Name:  []byte(l.lockId[:]),
						Value: []byte{},                        // @todo could inject some data about who has the lock here
						Ttl:   durationToSeconds(holdFor, 1.5), // 1.5 is because we renew the lock earlier than the timeout, so we need to cover that extra bit
					}},
				})
				if err := writer.Run(); err != nil {
					// @todo we could inform clients of this, somehow, eg: via a channel
					log.Warnf("[Sync:GlobalLock] failed to refresh lock .. cannot guarantee exclusivity")
				}
			}
		}
	}()

	return l, nil
}
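A matching hedged sketch for GlobalTimedLock (the id, durations, and migrateOnce are assumptions; ErrContended and Unlock() come from the code above):

// Illustrative usage: skip the run if another process already holds the lock.
func migrateOnce(run func() error) error {
	lock, err := GlobalTimedLock([]byte("invoice-migration"), 10*time.Second, time.Minute)
	if err == ErrContended {
		return nil // another process holds the lock; nothing to do
	}
	if err != nil {
		return err
	}
	defer lock.Unlock()
	return run()
}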