Example #1
// @rest POST /v1/jobs/:appid/:topic/:ver
func (this *manServer) createJobHandler(w http.ResponseWriter, r *http.Request, params httprouter.Params) {
	topic := params.ByName(UrlParamTopic)
	if !manager.Default.ValidateTopicName(topic) {
		log.Warn("illegal topic: %s", topic)

		writeBadRequest(w, "illegal topic")
		return
	}

	realIp := getHttpRemoteIp(r)

	if !this.throttleAddTopic.Pour(realIp, 1) {
		writeQuotaExceeded(w)
		return
	}

	hisAppid := params.ByName(UrlParamAppid)
	appid := r.Header.Get(HttpHeaderAppid)
	pubkey := r.Header.Get(HttpHeaderPubkey)
	ver := params.ByName(UrlParamVersion)
	if !manager.Default.AuthAdmin(appid, pubkey) {
		log.Warn("suspicous create job %s(%s) {appid:%s pubkey:%s topic:%s ver:%s}",
			r.RemoteAddr, realIp, appid, pubkey, topic, ver)

		writeAuthFailure(w, manager.ErrAuthenticationFail)
		return
	}

	cluster, found := manager.Default.LookupCluster(hisAppid)
	if !found {
		log.Error("create job %s(%s) {appid:%s topic:%s ver:%s} invalid appid",
			r.RemoteAddr, realIp, hisAppid, topic, ver)

		writeBadRequest(w, "invalid appid")
		return
	}

	log.Info("create job[%s] %s(%s) {appid:%s topic:%s ver:%s}",
		appid, r.RemoteAddr, realIp, hisAppid, topic, ver)

	rawTopic := manager.Default.KafkaTopic(hisAppid, topic, ver)
	if err := job.Default.CreateJobQueue(Options.AssignJobShardId, hisAppid, rawTopic); err != nil {
		log.Error("create job[%s] %s(%s) {shard:%d appid:%s topic:%s ver:%s} %v",
			appid, r.RemoteAddr, realIp, Options.AssignJobShardId, hisAppid, topic, ver, err)

		writeServerError(w, err.Error())
		return
	}

	if err := this.gw.zkzone.CreateJobQueue(rawTopic, cluster); err != nil {
		log.Error("app[%s] %s(%s) create job: {shard:%d appid:%s topic:%s ver:%s} %v",
			appid, r.RemoteAddr, realIp, Options.AssignJobShardId, hisAppid, topic, ver, err)

		writeServerError(w, err.Error())
		return
	}

	w.WriteHeader(http.StatusCreated)
	w.Write(ResponseOk)
}
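A minimal wiring sketch (not part of the original source) showing how the @rest route above would typically be registered with httprouter; the `server` variable and the listen address are assumptions.

// Hypothetical setup, assuming `server` is the *manServer shown above.
// UrlParamAppid, UrlParamTopic and UrlParamVersion presumably resolve to
// the "appid", "topic" and "ver" placeholders in this pattern.
router := httprouter.New()
router.POST("/v1/jobs/:appid/:topic/:ver", server.createJobHandler)
if err := http.ListenAndServe(":9193", router); err != nil {
	log.Error("%v", err)
}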
Example #2
func (this *controller) RunForever() (err error) {
	log.Info("controller[%s] starting", this.Id())

	if err = this.orchestrator.RegisterActor(this.Id(), this.Bytes()); err != nil {
		return err
	}
	defer this.orchestrator.ResignActor(this.Id())

	if err = manager.Default.Start(); err != nil {
		return
	}
	log.Trace("manager[%s] started", manager.Default.Name())

	go this.runWebServer()

	jobDispatchQuit := make(chan struct{})
	go this.dispatchJobQueues(jobDispatchQuit)

	webhookDispatchQuit := make(chan struct{})
	go this.dispatchWebhooks(webhookDispatchQuit)

	select {
	case <-jobDispatchQuit:
		log.Warn("dispatchJobQueues quit")

	case <-webhookDispatchQuit:
		log.Warn("dispatchWebhooks quit")
	}

	manager.Default.Stop()
	log.Trace("manager[%s] stopped", manager.Default.Name())

	return
}
Example #3
func (this *WatchConsumers) frequentOffsetCommit() (n int64) {
	const frequentThreshold = time.Second * 10

	this.Zkzone.ForSortedClusters(func(zkcluster *zk.ZkCluster) {
		for group, consumers := range zkcluster.ConsumersByGroup("") {
			for _, c := range consumers {
				if !c.Online {
					continue
				}

				if c.ConsumerZnode == nil {
					log.Warn("cluster[%s] group[%s] topic[%s/%s] unrecognized consumer", zkcluster.Name(), group, c.Topic, c.PartitionId)

					continue
				}

				gtp := structs.GroupTopicPartition{Group: group, Topic: c.Topic, PartitionID: c.PartitionId}
				if t, present := this.offsetMtimeMap[gtp]; present {
					if interval := c.Mtime.Time().Sub(t); interval < frequentThreshold {
						if this.logFrequentConsumer {
							log.Warn("cluster[%s] group[%s] topic[%s/%s] too frequent offset commit: %s", zkcluster.Name(), group, c.Topic, c.PartitionId, interval)
						}

						n++
					}
				}

				this.offsetMtimeMap[gtp] = c.Mtime.Time()
			}
		}
	})

	return
}
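structs.GroupTopicPartition is used as a map key above but its definition is not shown; below is a sketch of the shape that usage implies (the real definition lives in the structs package and may carry more fields).

// Comparable key type implied by offsetMtimeMap above; field types are
// inferred from how group, c.Topic and c.PartitionId are used.
type GroupTopicPartition struct {
	Group       string
	Topic       string
	PartitionID string
}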
Example #4
func (this *mysql) Exec(query string, args ...interface{}) (affectedRows int64,
	lastInsertId int64, err error) {
	if this.db == nil {
		return 0, 0, ErrNotOpen
	}
	if this.breaker.Open() {
		return 0, 0, ErrCircuitOpen
	}

	var result sql.Result
	result, err = this.db.Exec(query, args...)
	if err != nil {
		if this.isSystemError(err) {
			log.Warn("mysql exec breaks: %s", err.Error())
			this.breaker.Fail()
		}

		return 0, 0, err
	}

	affectedRows, err = result.RowsAffected()
	if err != nil {
		if this.isSystemError(err) {
			log.Warn("mysql exec2 breaks: %s", err.Error())
			this.breaker.Fail()
		}
	} else {
		this.breaker.Succeed()
	}

	lastInsertId, _ = result.LastInsertId()
	return
}
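The breaker used above is not shown here; the sketch below only captures the contract that Open/Fail/Succeed imply, with a trivial consecutive-failure policy. The real implementation may differ (thresholds, timeouts, concurrency safety).

// Sketch only: not the project's breaker, just the call pattern it must
// support. Not goroutine-safe.
type circuitBreaker interface {
	Open() bool // circuit tripped: reject calls fast
	Fail()      // record a failure
	Succeed()   // record a success and close the circuit
}

type consecutiveBreaker struct {
	failures  int
	threshold int
}

func (b *consecutiveBreaker) Open() bool { return b.failures >= b.threshold }
func (b *consecutiveBreaker) Fail()      { b.failures++ }
func (b *consecutiveBreaker) Succeed()   { b.failures = 0 }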
Example #5
// watchTopicPartitionsChange watches partition changes on a topic.
func (cg *ConsumerGroup) watchTopicPartitionsChange(topic string, stopper <-chan struct{},
	topicPartitionsChanged chan<- string, outstanding *sync.WaitGroup) {
	defer outstanding.Done()

	_, ch, err := cg.kazoo.Topic(topic).WatchPartitions()
	if err != nil {
		if err == zk.ErrNoNode {
			err = ErrInvalidTopic
		}
		log.Error("[%s/%s] topic[%s] watch partitions: %s", cg.group.Name, cg.shortID(), topic, err)
		cg.emitError(err, topic, -1)
		return
	}

	var (
		backoff    = time.Duration(5)
		maxRetries = 3
	)
	select {
	case <-cg.stopper:
		return

	case <-stopper:
		return

	case <-ch:
		// when partitions scale up, the zk node might not be completely ready yet, so wait for it
		//
		// even if the zk node is ready, the kafka broker might not be:
		// kafka server: Request was for a topic or partition that does not exist on this broker
		// so we blindly wait, which should be enough for most cases
		// in rare cases that is still not enough (imagine partitions going 1 -> 1000, which takes long),
		// and then we simply return the error to the client to retry
		time.Sleep(time.Second * backoff)
		for retries := 0; retries < maxRetries; retries++ {
			// retrieve brokers/topics/{topic}/partitions/{partition}/state and find the leader broker id
			// the new partitions state znode might not be ready yet
			if partitions, err := cg.kazoo.Topic(topic).Partitions(); err == nil {
				if _, err = retrievePartitionLeaders(partitions); err == nil {
					log.Debug("[%s/%s] topic[%s] partitions change complete", cg.group.Name, cg.shortID(), topic)
					break
				} else {
					log.Warn("[%s/%s] topic[%s] partitions change retry#%d waiting: %v", cg.group.Name, cg.shortID(), topic, retries, err)
					backoff-- // don't worry if negative
					time.Sleep(time.Second * backoff)
				}
			} else {
				log.Warn("[%s/%s] topic[%s] partitions change retry#%d waiting: %v", cg.group.Name, cg.shortID(), topic, retries, err)
				backoff--
				time.Sleep(time.Second * backoff)
			}
		}

		// safe to trigger rebalance
		select {
		case topicPartitionsChanged <- topic:
		default:
		}
	}
}
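The wait-probe-shrink-backoff loop above can be read as a small reusable helper; here is a sketch under the assumption that a probe function reports readiness (the example's probe is the partition-leader lookup).

// awaitReady is a hypothetical helper mirroring the retry shape above:
// sleep, probe, and shrink the backoff between attempts.
func awaitReady(ready func() error, attempts int, backoff time.Duration) error {
	var err error
	for i := 0; i < attempts; i++ {
		time.Sleep(backoff)
		if err = ready(); err == nil {
			return nil
		}
		if backoff > time.Second {
			backoff -= time.Second
		}
	}
	return err
}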
Example #6
func (this *WatchSub) subLags() (lags int) {
	now := time.Now()
	// find sub lags
	for _, zkcluster := range this.zkclusters {
		for group, consumers := range zkcluster.ConsumersByGroup("") {
			for _, c := range consumers {
				if !c.Online {
					continue
				}

				if c.ConsumerZnode == nil {
					log.Warn("cluster[%s] group[%s] topic[%s/%s] unrecognized consumer", zkcluster.Name(), group, c.Topic, c.PartitionId)

					continue
				}

				if time.Since(c.ConsumerZnode.Uptime()) < time.Minute*2 {
					log.Info("cluster[%s] group[%s] just started, topic[%s/%s]", zkcluster.Name(), group, c.Topic, c.PartitionId)

					this.unsuspect(group, c.Topic, c.PartitionId)
					continue
				}

				// offsets are committed every 1m and the sub lag check runs every 1m, so the gap can reach 2m
				// TODO if the lag is too large, emit an alarm even if the consumer is still alive
				elapsed := time.Since(c.Mtime.Time())
				if c.Lag == 0 || elapsed < time.Minute*3 {
					this.unsuspect(group, c.Topic, c.PartitionId)
					continue
				}

				// it might be lagging, but need confirm with last round
				if !this.isSuspect(group, c.Topic, c.PartitionId) {
					// suspect it, next round if it is still lagging, put on trial
					log.Warn("cluster[%s] group[%s] suspected topic[%s/%s] %d - %d = %d, offset commit elapsed: %s",
						zkcluster.Name(), group, c.Topic, c.PartitionId, c.ProducerOffset, c.ConsumerOffset, c.Lag, elapsed.String())

					this.suspect(group, c.Topic, c.PartitionId, c.ProducerOffset, c.ConsumerOffset, now)
					continue
				}

				if this.isCriminal(group, c.Topic, c.PartitionId, c.ProducerOffset, c.ConsumerOffset, now) {
					// bingo! consumer is lagging and seems to be DEAD
					log.Error("cluster[%s] group[%s] confirmed topic[%s/%s] %d - %d = %d, offset commit elapsed: %s",
						zkcluster.Name(), group, c.Topic, c.PartitionId, c.ProducerOffset, c.ConsumerOffset, c.Lag, elapsed.String())

					lags++
				} else {
					log.Warn("cluster[%s] group[%s] lagging but still alive topic[%s/%s] %d - %d = %d, offset commit elapsed: %s",
						zkcluster.Name(), group, c.Topic, c.PartitionId, c.ProducerOffset, c.ConsumerOffset, c.Lag, elapsed.String())
				}

			}
		}

	}

	return
}
Example #7
func (this *ResourcePool) get(wait bool) (resource Resource, err error) {
	if this == nil || this.IsClosed() {
		return nil, CLOSED_ERR
	}

	var (
		wrapper   resourceWrapper
		stillOpen bool
	)
	select {
	case wrapper, stillOpen = <-this.resourcePool:
		if !stillOpen {
			return nil, CLOSED_ERR
		}

		this.waitCount.Set(0) // reset
		if wrapper.resource != nil {
			this.diagnosticTracker.BorrowResource(wrapper.resource)
		}

	default:
		if !wait {
			return nil, nil
		}

		this.waitCount.Add(1)
		log.Warn("ResourcePool[%s] busy, pending:%d waited:%s",
			this.name, this.WaitCount(), this.waitTime.Get())

		t1 := time.Now()
		wrapper = <-this.resourcePool
		this.waitTime.Add(time.Now().Sub(t1))
	}

	// Close the aged idle resource
	timeout := this.idleTimeout.Get()
	if wrapper.resource != nil && timeout > 0 &&
		wrapper.timeUsed.Add(timeout).Sub(time.Now()) < 0 {
		this.diagnosticTracker.ReturnResource(wrapper.resource)

		log.Warn("ResourcePool[%s] resource:%d idle too long: closed", this.name,
			wrapper.resource.Id())
		wrapper.resource.Close()
		wrapper.resource = nil
	}

	if wrapper.resource == nil {
		wrapper.resource, err = this.factory()
		if err != nil {
			this.resourcePool <- resourceWrapper{}
		} else {
			this.diagnosticTracker.BorrowResource(wrapper.resource)
		}
	}

	return wrapper.resource, err
}
Example #8
func (this *Start) main() {
	ctx.LoadFromHome()
	this.zkzone = zk.NewZkZone(zk.DefaultConfig(this.zone, ctx.ZoneZkAddrs(this.zone)))
	zkConnEvt, ok := this.zkzone.SessionEvents()
	if !ok {
		panic("someone stealing my events")
	}

	registry.Default = zkr.New(this.zkzone)

	log.Info("ehaproxy[%s] starting...", gafka.BuildId)
	go this.runMonitorServer(this.httpAddr)

	zkConnected := false
	for {
		instances, instancesChange, err := registry.Default.WatchInstances()
		if err != nil {
			log.Error("zone[%s] %s", this.zkzone.Name(), err)
			time.Sleep(time.Second)
			continue
		}

		if zkConnected {
			if len(instances) > 0 {
				this.reload(instances)
			} else {
				// resilience to zk problem by local cache
				log.Warn("backend all shutdown? skip this change")
				time.Sleep(time.Second)
				continue
			}
		}

		select {
		case <-this.quitCh:
			return

		case evt := <-zkConnEvt:
			if evt.State == zklib.StateHasSession && !zkConnected {
				log.Info("zk connected")
				zkConnected = true
			} else if zkConnected {
				log.Warn("zk jitter: %+v", evt)
			}

		case <-instancesChange:
			log.Info("instances changed!!")
		}
	}

}
Example #9
// consume topic: __consumer_offsets and process the message to get offsets of consumers
func (this *ZkCluster) processConsumerOffsetsMessage(msg *sarama.ConsumerMessage) {
	var keyver, valver uint16
	var partition uint32
	var offset, timestamp uint64

	buf := bytes.NewBuffer(msg.Key)
	err := binary.Read(buf, binary.BigEndian, &keyver)
	if (err != nil) || ((keyver != 0) && (keyver != 1)) {
		log.Warn("Failed to decode %s:%v offset %v: keyver", msg.Topic, msg.Partition, msg.Offset)
		return
	}
	group, err := readString(buf)
	if err != nil {
		log.Warn("Failed to decode %s:%v offset %v: group", msg.Topic, msg.Partition, msg.Offset)
		return
	}
	topic, err := readString(buf)
	if err != nil {
		log.Warn("Failed to decode %s:%v offset %v: topic", msg.Topic, msg.Partition, msg.Offset)
		return
	}
	err = binary.Read(buf, binary.BigEndian, &partition)
	if err != nil {
		log.Warn("Failed to decode %s:%v offset %v: partition", msg.Topic, msg.Partition, msg.Offset)
		return
	}

	buf = bytes.NewBuffer(msg.Value)
	err = binary.Read(buf, binary.BigEndian, &valver)
	if (err != nil) || ((valver != 0) && (valver != 1)) {
		log.Warn("Failed to decode %s:%v offset %v: valver", msg.Topic, msg.Partition, msg.Offset)
		return
	}
	err = binary.Read(buf, binary.BigEndian, &offset)
	if err != nil {
		log.Warn("Failed to decode %s:%v offset %v: offset", msg.Topic, msg.Partition, msg.Offset)
		return
	}
	_, err = readString(buf)
	if err != nil {
		log.Warn("Failed to decode %s:%v offset %v: metadata", msg.Topic, msg.Partition, msg.Offset)
		return
	}
	err = binary.Read(buf, binary.BigEndian, &timestamp)
	if err != nil {
		log.Warn("Failed to decode %s:%v offset %v: timestamp", msg.Topic, msg.Partition, msg.Offset)
		return
	}

	partitionOffset := &PartitionOffset{
		Cluster:   this.Name(),
		Topic:     topic,
		Partition: int32(partition),
		Group:     group,
		Timestamp: int64(timestamp),
		Offset:    int64(offset),
	}
	log.Debug("%+v", partitionOffset)
	return
}
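readString is referenced above but not shown. In the __consumer_offsets message format a string is a 16-bit big-endian length followed by that many bytes, so the helper presumably looks roughly like this sketch (uses bytes, encoding/binary and fmt).

// Hypothetical readString: length-prefixed string decoding as used by the
// __consumer_offsets key/value layout.
func readString(buf *bytes.Buffer) (string, error) {
	var strlen uint16
	if err := binary.Read(buf, binary.BigEndian, &strlen); err != nil {
		return "", err
	}
	strbytes := make([]byte, strlen)
	if n, err := buf.Read(strbytes); err != nil || n != int(strlen) {
		return "", fmt.Errorf("string underflow")
	}
	return string(strbytes), nil
}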
Example #10
func (this *mysql) Query(query string, args ...interface{}) (rows *sql.Rows,
	err error) {
	if this.db == nil {
		return nil, ErrNotOpen
	}
	if this.breaker.Open() {
		return nil, ErrCircuitOpen
	}

	var stmt *sql.Stmt = nil
	if this.stmtsStore != nil {
		if stmtc, present := this.stmtsStore.Get(query); present {
			stmt = stmtc.(*sql.Stmt)
		} else {
			// FIXME thundering herd
			stmt, err = this.db.Prepare(query)
			if err != nil {
				if this.isSystemError(err) {
					log.Warn("mysql prepare breaks: %s", err.Error())
					this.breaker.Fail()
				}

				return nil, err
			}

			this.mutex.Lock()
			this.stmtsStore.Set(query, stmt)
			this.mutex.Unlock()
			log.Debug("[%s] stmt[%s] open", this.dsn, query)
		}
	}

	// Under the hood, db.Query() actually prepares, executes, and closes
	// a prepared statement. That's three round-trips to the database.
	if stmt != nil {
		rows, err = stmt.Query(args...)
	} else {
		rows, err = this.db.Query(query, args...)
	}
	if err != nil {
		if this.isSystemError(err) {
			log.Warn("mysql query breaks: %s", err.Error())
			this.breaker.Fail()
		}
	} else {
		this.breaker.Succeed()
	}

	return
}
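The FIXME above notes that concurrent cache misses on the same query all hit db.Prepare at once. One common way to collapse such duplicate work is golang.org/x/sync/singleflight; the sketch below is one possible approach, not the project's actual fix (prepareGroup and prepareOnce are hypothetical names).

// Collapse concurrent Prepare calls for the same query into one.
var prepareGroup singleflight.Group

func (this *mysql) prepareOnce(query string) (*sql.Stmt, error) {
	v, err, _ := prepareGroup.Do(query, func() (interface{}, error) {
		return this.db.Prepare(query)
	})
	if err != nil {
		return nil, err
	}
	return v.(*sql.Stmt), nil
}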
Example #11
func dumpMaintainConfigPhp(info []string) {
	if config.maintainTargetFile == "" ||
		config.maintainTemplateFile == "" ||
		maintainTemplateContents == "" {
		log.Warn("Invalid maintain conf, disabled")
		return
	}

	templateData := make(map[string]string)
	for _, s := range info {
		// s is like "kingdom_1:30"
		parts := strings.SplitN(s, ":", 2)
		templateData[parts[0]] = parts[1]
	}

	t := template.Must(template.New("maintain").Parse(maintainTemplateContents))
	wr := new(bytes.Buffer)
	t.Execute(wr, templateData)

	err := ioutil.WriteFile(config.maintainTargetFile, wr.Bytes(), 0644)
	if err != nil {
		log.Error("dump[%s]: %s", config.maintainTargetFile, err.Error())
	} else {
		log.Info("dumped[%s]: %+v", config.maintainTargetFile, templateData)
	}

}
Example #12
// TODO from live meta or zk?
func (this *pubPool) RefreshBrokerList(brokerList []string) {
	if len(brokerList) == 0 {
		if len(this.brokerList) > 0 {
			log.Warn("%s meta store found empty broker list, refresh refused", this.cluster)
		}
		return
	}

	setOld, setNew := set.NewSet(), set.NewSet()
	for _, b := range this.brokerList {
		setOld.Add(b)
	}
	for _, b := range brokerList {
		setNew.Add(b)
	}

	if !setOld.Equal(setNew) {
		log.Info("%s broker list from %+v to %+v", this.cluster, this.brokerList, brokerList)

		// rebuild the kafka conn pool
		this.brokerList = brokerList
		this.Close()
		this.buildPools()
	}
}
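The refresh above depends on an external set package only for an order-insensitive equality check; the same check can be done with plain maps, as in this sketch (it assumes a broker list has no duplicate entries).

// sameBrokers reports whether two broker lists contain the same addresses,
// ignoring order.
func sameBrokers(prev, next []string) bool {
	if len(prev) != len(next) {
		return false
	}
	seen := make(map[string]struct{}, len(prev))
	for _, b := range prev {
		seen[b] = struct{}{}
	}
	for _, b := range next {
		if _, ok := seen[b]; !ok {
			return false
		}
	}
	return true
}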
Example #13
func (this *zkMetaStore) TopicPartitions(cluster, topic string) []int32 {
	ct := structs.ClusterTopic{Cluster: cluster, Topic: topic}

	this.pmapLock.RLock()
	if partitionIDs, present := this.partitionsMap[ct]; present {
		this.pmapLock.RUnlock()
		return partitionIDs
	}
	this.pmapLock.RUnlock()

	this.pmapLock.Lock()
	defer this.pmapLock.Unlock()

	// double check
	if partitionIDs, present := this.partitionsMap[ct]; present {
		return partitionIDs
	}

	// cache miss
	this.mu.RLock()
	c, ok := this.clusters[cluster]
	this.mu.RUnlock()
	if !ok {
		log.Warn("invalid cluster: %s", cluster)
		return nil
	}

	partitionIDs := c.Partitions(topic)
	// set cache
	this.partitionsMap[ct] = partitionIDs

	return partitionIDs
}
Example #14
func dumpFaeConfigPhp(servers []string) {
	if config.faeTargetFile == "" ||
		config.faeTemplateFile == "" ||
		faeTemplateContents == "" {
		log.Warn("Invalid fae conf, disabled")
		return
	}

	type templateVar struct {
		Servers []string
		Ports   []string
	}
	templateData := templateVar{Servers: make([]string, 0), Ports: make([]string, 0)}
	for _, s := range servers {
		// s is like "12.3.11.2:9001"
		parts := strings.SplitN(s, ":", 2)
		templateData.Servers = append(templateData.Servers, parts[0])
		templateData.Ports = append(templateData.Ports, parts[1])
	}

	t := template.Must(template.New("fae").Parse(faeTemplateContents))
	wr := new(bytes.Buffer)
	t.Execute(wr, templateData)

	err := ioutil.WriteFile(config.faeTargetFile, wr.Bytes(), 0644)
	if err != nil {
		log.Error("dump[%s]: %s", config.faeTargetFile, err.Error())
	} else {
		log.Info("dumped[%s]: %+v", config.faeTargetFile, templateData)
	}

}
Example #15
func (this *profiler) do(callName string, ctx *rpc.Context, format string,
	args ...interface{}) {
	if this == nil {
		return
	}

	elapsed := time.Since(this.t1)
	slow := elapsed > config.Engine.Servants.CallSlowThreshold
	if !(slow || this.on) {
		return
	}

	body := fmt.Sprintf(format, args...)
	if slow {
		svtStats.incCallSlow()

		header := fmt.Sprintf("SLOW=%s/%s Q=%s ",
			elapsed, time.Since(this.t0), callName)
		log.Warn(header + this.truncatedStr(body))
	} else if this.on {
		header := fmt.Sprintf("T=%s/%s Q=%s ",
			elapsed, time.Since(this.t0), callName)
		log.Trace(header + this.truncatedStr(body))
	}

}
Example #16
func newRpcDispatcher(prefork bool, maxOutstandingSessions int,
	handler rpcClientHandler) (this *rpcDispatcher) {
	this = &rpcDispatcher{
		handler:     handler,
		preforkMode: prefork,
	}

	if !this.preforkMode {
		this.throttleChan = make(chan null.NullStruct, maxOutstandingSessions)
		return
	}

	this.clientSocketChan = make(chan thrift.TTransport, maxOutstandingSessions)
	for i := 0; i < maxOutstandingSessions; i++ {
		// prefork
		go func() {
			for {
				// reuse goroutines to reduce GC
				this.handler(<-this.clientSocketChan)
			}

			log.Warn("dispatcher[%d] terminated", i)
		}()
	}

	return
}
Example #17
func (this *WatchConsumers) runSubQpsTimer() {
	this.Zkzone.ForSortedClusters(func(zkcluster *zk.ZkCluster) {
		consumerGroups := zkcluster.ConsumerGroups()
		for group, _ := range consumerGroups {
			offsetMap := zkcluster.ConsumerOffsetsOfGroup(group)
			for topic, m := range offsetMap {
				offsetOfGroupOnTopic := int64(0)
				for _, offset := range m {
					offsetOfGroupOnTopic += offset
				}

				// cluster, topic, group, offset
				tag := telemetry.Tag(zkcluster.Name(), strings.Replace(topic, ".", "_", -1), strings.Replace(group, ".", "_", -1))
				if _, present := this.consumerQps[tag]; !present {
					this.consumerQps[tag] = metrics.NewRegisteredMeter(tag+"consumer.qps", nil)
				}
				lastOffset := this.lastOffsets[tag]
				if lastOffset == 0 {
					// first run
					this.lastOffsets[tag] = offsetOfGroupOnTopic
				} else {
					delta := offsetOfGroupOnTopic - lastOffset
					if delta >= 0 {
						this.consumerQps[tag].Mark(delta)
						this.lastOffsets[tag] = offsetOfGroupOnTopic
					} else {
						log.Warn("cluster[%s] topic[%s] group[%s] offset rewinds: %d %d",
							zkcluster.Name(), topic, group, offsetOfGroupOnTopic, lastOffset)
					}

				}
			}
		}
	})
}
Example #18
func (this *WatchLoadAvg) highLoadCount() (n int64, err error) {
	const threshold = 6.

	cmd := pipestream.New("consul", "exec",
		"uptime", "|", "grep", "load")
	err = cmd.Open()
	if err != nil {
		return
	}
	defer cmd.Close()

	scanner := bufio.NewScanner(cmd.Reader())
	scanner.Split(bufio.ScanLines)
	for scanner.Scan() {
		line := scanner.Text()
		load1m, e := ctx.ExtractLoadAvg1m(line)
		if e != nil {
			continue
		}

		if load1m > threshold {
			log.Warn(line)
			n++
		}
	}

	return
}
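ctx.ExtractLoadAvg1m is not shown above; the sketch below is a hypothetical parser for a typical `uptime` line, just to illustrate what that helper presumably does (the real one may accept more formats).

// extractLoadAvg1m pulls the 1-minute load out of a line like
// "17:05:01 up 10 days, 2 users, load average: 0.52, 0.58, 0.59".
func extractLoadAvg1m(line string) (float64, error) {
	const marker = "load average:"
	i := strings.Index(line, marker)
	if i < 0 {
		return 0, fmt.Errorf("no load average in %q", line)
	}
	fields := strings.Split(line[i+len(marker):], ",")
	return strconv.ParseFloat(strings.TrimSpace(fields[0]), 64)
}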
Example #19
// nextSegmentID returns the next segment ID that is free
func (q *queue) nextSegmentID() (uint64, error) {
	segments, err := ioutil.ReadDir(q.dir)
	if err != nil {
		return 0, err
	}

	var maxID uint64
	for _, segment := range segments {
		if segment.IsDir() || segment.Name() == cursorFile {
			continue
		}

		// Segment file names are all numeric
		segmentID, err := strconv.ParseUint(segment.Name(), 10, 64)
		if err != nil {
			log.Warn("unexpected segment file: %s", filepath.Join(q.dir, segment.Name()))
			continue
		}

		if segmentID > maxID {
			maxID = segmentID
		}
	}

	return maxID + 1, nil
}
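Since nextSegmentID parses segment file names as plain decimal numbers, creating the next segment presumably just formats the ID back into such a name; a sketch with an assumed method name, flags and permissions.

// createNextSegment is hypothetical: allocate the next ID and create the
// correspondingly named file inside the queue directory.
func (q *queue) createNextSegment() (*os.File, error) {
	id, err := q.nextSegmentID()
	if err != nil {
		return nil, err
	}
	name := filepath.Join(q.dir, strconv.FormatUint(id, 10))
	return os.OpenFile(name, os.O_CREATE|os.O_RDWR|os.O_EXCL, 0644)
}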
Example #20
func (this *Proxy) refreshPeers(peers []string) {
	// add all latest peers
	for _, peerAddr := range peers {
		this.addRemotePeerIfNecessary(peerAddr)
	}

	// kill died peers
	for peerAddr, _ := range this.remotePeerPools {
		alive := false
		for _, p := range peers {
			if p == peerAddr {
				// still alive
				alive = true
				break
			}
		}

		if !alive {
			log.Warn("peer[%s] gone away", peerAddr)

			this.mutex.Lock()
			this.remotePeerPools[peerAddr].Close() // kill all conns in this pool
			delete(this.remotePeerPools, peerAddr)
			this.mutex.Unlock()
		}
	}
}
Example #21
func (this *WatchAppError) Run() {
	defer this.Wg.Done()

	return // disable for now TODO

	appError := metrics.NewRegisteredCounter("kateway.apperr", nil)
	msgChan := make(chan *sarama.ConsumerMessage, 2000)

	if err := this.consumeAppErrLogs(msgChan); err != nil {
		close(msgChan)

		log.Error("%v", err)
		return
	}

	for {
		select {
		case <-this.Stop:
			log.Info("kateway.apperr stopped")
			return

		case msg, ok := <-msgChan:
			if !ok {
				return
			}

			appError.Inc(1)
			log.Warn("%d/%d %s", msg.Partition, msg.Offset, string(msg.Value))
		}
	}
}
Example #22
func (this *WatchExec) Run() {
	defer this.Wg.Done()

	if this.confDir == "" {
		log.Warn("empty confd, external.exec disabled")
		return
	}

	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()

	if err := this.watchConfigDir(); err != nil {
		log.Error("%v", err)
		return
	}

	for {
		select {
		case <-this.Stop:
			log.Info("external.exec stopped")
			return

		case <-ticker.C:

		}
	}
}
Example #23
func (this *subServer) wsReadPump(clientGone chan struct{}, ws *websocket.Conn) {
	ws.SetReadLimit(this.wsReadLimit)
	ws.SetReadDeadline(time.Now().Add(this.wsPongWait))
	ws.SetPongHandler(func(string) error {
		ws.SetReadDeadline(time.Now().Add(this.wsPongWait))
		return nil
	})

	// if kateway shuts down while there are open ws conns, the shutdown will
	// wait up to this.subServer.wsPongWait (1m)
	for {
		_, message, err := ws.ReadMessage()
		if err != nil {
			if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway) {
				log.Warn("%s: %v", ws.RemoteAddr(), err)
			} else {
				log.Debug("%s: %v", ws.RemoteAddr(), err)
			}

			close(clientGone)
			break
		}

		log.Debug("ws[%s] read: %s", ws.RemoteAddr(), string(message))
	}
}
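The pong handler above implies a writer side that periodically pings the client (the usual gorilla/websocket convention); below is a sketch of that counterpart, where wsPingPeriod is a hypothetical field normally chosen smaller than wsPongWait.

// Hypothetical ping pump paired with wsReadPump above.
func (this *subServer) wsPingPump(clientGone chan struct{}, ws *websocket.Conn) {
	ticker := time.NewTicker(this.wsPingPeriod)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			if err := ws.WriteControl(websocket.PingMessage, nil, time.Now().Add(time.Second)); err != nil {
				log.Debug("%s: %v", ws.RemoteAddr(), err)
				return
			}

		case <-clientGone:
			return
		}
	}
}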
Example #24
func (this *Lock) Lock(key string) (success bool) {
	this.mutex.Lock()

	mtime, present := this.items.Get(key)
	if !present {
		this.items.Set(key, time.Now())

		this.mutex.Unlock()
		return true
	}

	// present, check expires
	elapsed := time.Since(mtime.(time.Time))
	if this.cf.Expires > 0 && elapsed > this.cf.Expires {
		log.Warn("lock[%s] expires: %s, kicked", key, elapsed)

		// ignore the aged lock, refresh the lock
		this.items.Set(key, time.Now())

		this.mutex.Unlock()
		return true
	}

	this.mutex.Unlock()
	return false
}
Example #25
func (this *pubStore) doRefresh() {
	if time.Since(this.lastRefreshedAt) <= time.Second*5 {
		log.Warn("ignored too frequent refresh: %s", time.Since(this.lastRefreshedAt))
		return
	}

	this.pubPoolsLock.Lock()
	defer this.pubPoolsLock.Unlock()

	// pub pool
	activeClusters := make(map[string]struct{})
	for _, cluster := range meta.Default.ClusterNames() {
		activeClusters[cluster] = struct{}{}
		if _, present := this.pubPools[cluster]; !present {
			// found a new cluster
			this.pubPools[cluster] = newPubPool(this, cluster,
				meta.Default.BrokerList(cluster), this.pubPoolsCapcity)
		} else {
			this.pubPools[cluster].RefreshBrokerList(meta.Default.BrokerList(cluster))
		}
	}

	// shutdown the dead clusters
	for cluster, pool := range this.pubPools {
		if _, present := activeClusters[cluster]; !present {
			// this cluster is dead or removed forever
			pool.Close()
			delete(this.pubPools, cluster)
		}
	}

	this.lastRefreshedAt = time.Now()
}
Example #26
func (this *controller) watchZk() {
	evtCh, ok := this.orchestrator.SessionEvents()
	if !ok {
		panic("someone else is stealing my zk events?")
	}

	// during connecting phase, the following events are fired:
	// StateConnecting -> StateConnected -> StateHasSession
	firstHandShaked := false
	for {
		select {
		case <-this.quiting:
			return

		case evt := <-evtCh:
			if !firstHandShaked {
				if evt.State == zklib.StateHasSession {
					firstHandShaked = true
				}

				continue
			}

			log.Warn("zk jitter: %+v", evt)

			if evt.State == zklib.StateHasSession {
				log.Warn("zk reconnected after session lost, watcher/ephemeral might be lost")

				registered, err := this.orchestrator.ActorRegistered(this.Id())
				if err != nil {
					log.Error("registry: %s", err)
					this.orchestrator.CallSOS(fmt.Sprintf("actord[%s]", this.Id()), "zk session expired")
				} else if !registered {
					if err = this.orchestrator.RegisterActor(this.Id(), this.Bytes()); err != nil {
						log.Error("registry: %s", err)
					} else {
						log.Info("registry re-register controller[%s] ok", this.ident)
					}
				} else {
					log.Info("registry lucky, ephemeral still present")
				}

			}
		}
	}
}
Example #27
// @rest GET /v1/partitions/:appid/:topic/:ver
func (this *manServer) partitionsHandler(w http.ResponseWriter, r *http.Request, params httprouter.Params) {
	topic := params.ByName(UrlParamTopic)
	hisAppid := params.ByName(UrlParamAppid)
	appid := r.Header.Get(HttpHeaderAppid)
	pubkey := r.Header.Get(HttpHeaderPubkey)
	ver := params.ByName(UrlParamVersion)
	realIp := getHttpRemoteIp(r)

	cluster, found := manager.Default.LookupCluster(hisAppid)
	if !found {
		log.Error("partitions[%s] %s(%s) {app:%s topic:%s ver:%s} invalid appid",
			appid, r.RemoteAddr, realIp, hisAppid, topic, ver)

		writeBadRequest(w, "invalid appid")
		return
	}

	if !manager.Default.AuthAdmin(appid, pubkey) {
		log.Warn("suspicous partitions call from %s(%s) {cluster:%s app:%s key:%s topic:%s ver:%s}",
			r.RemoteAddr, realIp, cluster, appid, pubkey, topic, ver)

		writeAuthFailure(w, manager.ErrAuthenticationFail)
		return
	}

	log.Info("partitions[%s] %s(%s) {cluster:%s app:%s topic:%s ver:%s}",
		appid, r.RemoteAddr, realIp, cluster, hisAppid, topic, ver)

	zkcluster := meta.Default.ZkCluster(cluster)
	if zkcluster == nil {
		log.Error("suspicous partitions call from %s(%s) {cluster:%s app:%s key:%s topic:%s ver:%s} undefined cluster",
			r.RemoteAddr, realIp, cluster, appid, pubkey, topic, ver)

		writeBadRequest(w, "undefined cluster")
		return
	}

	kfk, err := sarama.NewClient(zkcluster.BrokerList(), sarama.NewConfig())
	if err != nil {
		log.Error("cluster[%s] %v", zkcluster.Name(), err)

		writeServerError(w, err.Error())
		return
	}
	defer kfk.Close()

	partitions, err := kfk.Partitions(manager.Default.KafkaTopic(hisAppid, topic, ver))
	if err != nil {
		log.Error("cluster[%s] from %s(%s) {app:%s topic:%s ver:%s} %v",
			zkcluster.Name(), r.RemoteAddr, realIp, hisAppid, topic, ver, err)

		writeServerError(w, err.Error())
		return
	}

	w.Write([]byte(fmt.Sprintf(`{"num": %d}`, len(partitions))))
}
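The handler above answers with a tiny JSON body of the form {"num": N}; here is a client-side decode sketch, where httpResp is a hypothetical *http.Response obtained by calling this endpoint.

// Decode the partition-count response.
var out struct {
	Num int `json:"num"`
}
if err := json.NewDecoder(httpResp.Body).Decode(&out); err != nil {
	log.Error("%v", err)
} else {
	log.Info("topic has %d partitions", out.Num)
}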
Example #28
func (this *WatchReplicas) report() (deadPartitions, outOfSyncPartitions int64) {
	this.Zkzone.ForSortedClusters(func(zkcluster *zk.ZkCluster) {
		brokerList := zkcluster.BrokerList()
		if len(brokerList) == 0 {
			log.Warn("cluster[%s] empty brokers", zkcluster.Name())
			return
		}

		kfk, err := sarama.NewClient(brokerList, sarama.NewConfig())
		if err != nil {
			log.Error("cluster[%s] %v", zkcluster.Name(), err)
			return
		}
		defer kfk.Close()

		topics, err := kfk.Topics()
		if err != nil {
			log.Error("cluster[%s] %v", zkcluster.Name(), err)
			return
		}

		for _, topic := range topics {
			alivePartitions, err := kfk.WritablePartitions(topic)
			if err != nil {
				log.Error("cluster[%s] topic:%s %v", zkcluster.Name(), topic, err)
				continue
			}
			partitions, err := kfk.Partitions(topic)
			if err != nil {
				log.Error("cluster[%s] topic:%s %v", zkcluster.Name(), topic, err)
				continue
			}

			// some partitions are dead
			if len(alivePartitions) != len(partitions) {
				deadPartitions += 1
			}

			for _, partitionID := range alivePartitions {
				replicas, err := kfk.Replicas(topic, partitionID)
				if err != nil {
					log.Error("cluster[%s] topic:%s partition:%d %v",
						zkcluster.Name(), topic, partitionID, err)
					continue
				}

				isr, _, _ := zkcluster.Isr(topic, partitionID)
				if len(isr) != len(replicas) {
					outOfSyncPartitions += 1
				}
			}
		}
	})

	return
}
Example #29
func (this *FunServantImpl) Lock(ctx *rpc.Context,
	reason string, key string) (r bool, ex error) {
	const IDENT = "lock"

	svtStats.inc(IDENT)
	profiler, err := this.getSession(ctx).startProfiler()
	if err != nil {
		ex = err
		return
	}

	var peer string
	if ctx.IsSetSticky() && *ctx.Sticky {
		svtStats.incPeerCall()

		r = this.lk.Lock(key)
	} else {
		svt, err := this.proxy.ServantByKey(key) // FIXME add prefix?
		if err != nil {
			ex = err
			if svt != nil {
				if proxy.IsIoError(err) {
					svt.Close()
				}
				svt.Recycle()
			}
			return
		}

		if svt == proxy.Self {
			r = this.lk.Lock(key)
		} else {
			svtStats.incCallPeer()

			peer = svt.Addr()
			svt.HijackContext(ctx)
			r, ex = svt.Lock(ctx, reason, key)
			if ex != nil {
				if proxy.IsIoError(ex) {
					svt.Close()
				}
			}

			svt.Recycle()
		}
	}

	profiler.do(IDENT, ctx, "P=%s {reason^%s key^%s} {r^%v}",
		peer, reason, key, r)

	if !r {
		log.Warn("P=%s lock failed: {reason^%s key^%s}", peer, reason, key)
	}

	return
}
Example #30
func dumpActorConfigPhp(servers []string) {
	if config.actorTargetFile == "" ||
		config.actorTemplateFile == "" ||
		actorTemplateContents == "" {
		log.Warn("Invalid actor conf, disabled")
		return
	}

	log.Info("dumped[%s]: %+v", config.actorTargetFile, servers)
}