// @rest POST /v1/jobs/:appid/:topic/:ver
func (this *manServer) createJobHandler(w http.ResponseWriter, r *http.Request, params httprouter.Params) {
    topic := params.ByName(UrlParamTopic)
    if !manager.Default.ValidateTopicName(topic) {
        log.Warn("illegal topic: %s", topic)
        writeBadRequest(w, "illegal topic")
        return
    }

    realIp := getHttpRemoteIp(r)
    if !this.throttleAddTopic.Pour(realIp, 1) {
        writeQuotaExceeded(w)
        return
    }

    hisAppid := params.ByName(UrlParamAppid)
    appid := r.Header.Get(HttpHeaderAppid)
    pubkey := r.Header.Get(HttpHeaderPubkey)
    ver := params.ByName(UrlParamVersion)
    if !manager.Default.AuthAdmin(appid, pubkey) {
        log.Warn("suspicious create job %s(%s) {appid:%s pubkey:%s topic:%s ver:%s}",
            r.RemoteAddr, realIp, appid, pubkey, topic, ver)
        writeAuthFailure(w, manager.ErrAuthenticationFail)
        return
    }

    cluster, found := manager.Default.LookupCluster(hisAppid)
    if !found {
        log.Error("create job %s(%s) {appid:%s topic:%s ver:%s} invalid appid",
            r.RemoteAddr, realIp, hisAppid, topic, ver)
        writeBadRequest(w, "invalid appid")
        return
    }

    log.Info("create job[%s] %s(%s) {appid:%s topic:%s ver:%s}",
        appid, r.RemoteAddr, realIp, hisAppid, topic, ver)

    rawTopic := manager.Default.KafkaTopic(hisAppid, topic, ver)
    if err := job.Default.CreateJobQueue(Options.AssignJobShardId, hisAppid, rawTopic); err != nil {
        log.Error("create job[%s] %s(%s) {shard:%d appid:%s topic:%s ver:%s} %v",
            appid, r.RemoteAddr, realIp, Options.AssignJobShardId, hisAppid, topic, ver, err)
        writeServerError(w, err.Error())
        return
    }

    if err := this.gw.zkzone.CreateJobQueue(rawTopic, cluster); err != nil {
        log.Error("app[%s] %s(%s) create job: {shard:%d appid:%s topic:%s ver:%s} %v",
            appid, r.RemoteAddr, realIp, Options.AssignJobShardId, hisAppid, topic, ver, err)
        writeServerError(w, err.Error())
        return
    }

    w.WriteHeader(http.StatusCreated)
    w.Write(ResponseOk)
}
func (this *controller) RunForever() (err error) {
    log.Info("controller[%s] starting", this.Id())

    if err = this.orchestrator.RegisterActor(this.Id(), this.Bytes()); err != nil {
        return err
    }
    defer this.orchestrator.ResignActor(this.Id())

    if err = manager.Default.Start(); err != nil {
        return
    }
    log.Trace("manager[%s] started", manager.Default.Name())

    go this.runWebServer()

    jobDispatchQuit := make(chan struct{})
    go this.dispatchJobQueues(jobDispatchQuit)

    webhookDispatchQuit := make(chan struct{})
    go this.dispatchWebhooks(webhookDispatchQuit)

    select {
    case <-jobDispatchQuit:
        log.Warn("dispatchJobQueues quit")

    case <-webhookDispatchQuit:
        log.Warn("dispatchWebhooks quit")
    }

    manager.Default.Stop()
    log.Trace("manager[%s] stopped", manager.Default.Name())

    return
}
func (this *WatchConsumers) frequentOffsetCommit() (n int64) {
    const frequentThreshold = time.Second * 10

    this.Zkzone.ForSortedClusters(func(zkcluster *zk.ZkCluster) {
        for group, consumers := range zkcluster.ConsumersByGroup("") {
            for _, c := range consumers {
                if !c.Online {
                    continue
                }

                if c.ConsumerZnode == nil {
                    log.Warn("cluster[%s] group[%s] topic[%s/%s] unrecognized consumer",
                        zkcluster.Name(), group, c.Topic, c.PartitionId)
                    continue
                }

                gtp := structs.GroupTopicPartition{Group: group, Topic: c.Topic, PartitionID: c.PartitionId}
                if t, present := this.offsetMtimeMap[gtp]; present {
                    if interval := c.Mtime.Time().Sub(t); interval < frequentThreshold {
                        if this.logFrequentConsumer {
                            log.Warn("cluster[%s] group[%s] topic[%s/%s] too frequent offset commit: %s",
                                zkcluster.Name(), group, c.Topic, c.PartitionId, interval)
                        }

                        n++
                    }
                }

                this.offsetMtimeMap[gtp] = c.Mtime.Time()
            }
        }
    })

    return
}
func (this *mysql) Exec(query string, args ...interface{}) (affectedRows int64, lastInsertId int64, err error) {
    if this.db == nil {
        return 0, 0, ErrNotOpen
    }
    if this.breaker.Open() {
        return 0, 0, ErrCircuitOpen
    }

    var result sql.Result
    result, err = this.db.Exec(query, args...)
    if err != nil {
        if this.isSystemError(err) {
            log.Warn("mysql exec breaks: %s", err.Error())
            this.breaker.Fail()
        }

        return 0, 0, err
    }

    affectedRows, err = result.RowsAffected()
    if err != nil {
        if this.isSystemError(err) {
            log.Warn("mysql exec2 breaks: %s", err.Error())
            this.breaker.Fail()
        }
    } else {
        this.breaker.Succeed()
    }

    lastInsertId, _ = result.LastInsertId()
    return
}
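// NOTE: illustrative sketch, not part of the original source. It only shows the
// Open/Fail/Succeed contract the mysql wrapper above relies on, using a
// hypothetical minimal consecutive-failure breaker; the breaker actually used
// by this package may behave differently (e.g. time windows, half-open state).
package main

import "fmt"

type breaker struct {
    failures  int // consecutive failures observed so far
    threshold int // trip the breaker once this many failures pile up
}

func (b *breaker) Open() bool { return b.failures >= b.threshold }
func (b *breaker) Fail()      { b.failures++ }
func (b *breaker) Succeed()   { b.failures = 0 }

func main() {
    b := &breaker{threshold: 3}
    for i := 0; i < 5; i++ {
        if b.Open() {
            fmt.Println("call", i, "rejected: circuit open")
            continue
        }

        b.Fail() // pretend every call hits a system error
        fmt.Println("call", i, "failed")
    }
}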
// watchTopicPartitionsChange watches partition changes on a topic.
func (cg *ConsumerGroup) watchTopicPartitionsChange(topic string, stopper <-chan struct{},
    topicPartitionsChanged chan<- string, outstanding *sync.WaitGroup) {
    defer outstanding.Done()

    _, ch, err := cg.kazoo.Topic(topic).WatchPartitions()
    if err != nil {
        if err == zk.ErrNoNode {
            err = ErrInvalidTopic
        }
        log.Error("[%s/%s] topic[%s] watch partitions: %s", cg.group.Name, cg.shortID(), topic, err)
        cg.emitError(err, topic, -1)
        return
    }

    var (
        backoff    = time.Duration(5)
        maxRetries = 3
    )
    select {
    case <-cg.stopper:
        return

    case <-stopper:
        return

    case <-ch:
        // when partitions scale up, the zk node might not be completely ready, so wait for it.
        //
        // even if the zk node is ready, the kafka broker might not be:
        //   kafka server: Request was for a topic or partition that does not exist on this broker
        // so we blindly wait: that should be enough for most cases.
        // in rare cases it is still not enough: imagine partitions going 1->1000, which takes long.
        // ok, just return that err to the client to retry
        time.Sleep(time.Second * backoff)

        for retries := 0; retries < maxRetries; retries++ {
            // retrieve brokers/topics/{topic}/partitions/{partition}/state and find the leader broker id
            // the new partitions state znode might not be ready yet
            if partitions, err := cg.kazoo.Topic(topic).Partitions(); err == nil {
                if _, err = retrievePartitionLeaders(partitions); err == nil {
                    log.Debug("[%s/%s] topic[%s] partitions change complete", cg.group.Name, cg.shortID(), topic)
                    break
                } else {
                    log.Warn("[%s/%s] topic[%s] partitions change retry#%d waiting: %v", cg.group.Name, cg.shortID(), topic, retries, err)
                    backoff-- // don't worry if negative
                    time.Sleep(time.Second * backoff)
                }
            } else {
                log.Warn("[%s/%s] topic[%s] partitions change retry#%d waiting: %v", cg.group.Name, cg.shortID(), topic, retries, err)
                backoff--
                time.Sleep(time.Second * backoff)
            }
        }

        // safe to trigger rebalance
        select {
        case topicPartitionsChanged <- topic:
        default:
        }
    }
}
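// NOTE: illustrative sketch, not part of the original source. It isolates the
// bounded-retry-with-shrinking-backoff pattern used above while waiting for the
// new partition state znodes to settle; checkReady is a hypothetical stand-in
// for the Partitions()/retrievePartitionLeaders() probe, and millisecond delays
// replace the whole seconds the original sleeps, just to keep the demo fast.
package main

import (
    "errors"
    "fmt"
    "time"
)

func waitReady(checkReady func() error) error {
    backoff := 5 * time.Millisecond
    const maxRetries = 3

    var err error
    for retries := 0; retries < maxRetries; retries++ {
        if err = checkReady(); err == nil {
            return nil // the resource settled, safe to proceed
        }

        fmt.Printf("retry#%d waiting: %v\n", retries, err)
        backoff -= time.Millisecond // shrink the wait; a negative Sleep returns immediately
        time.Sleep(backoff)
    }
    return err // still not ready, let the caller decide (e.g. bubble up to the client)
}

func main() {
    attempts := 0
    err := waitReady(func() error {
        attempts++
        if attempts < 3 {
            return errors.New("partition state znode not ready")
        }
        return nil
    })
    fmt.Println("result:", err)
}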
func (this *WatchSub) subLags() (lags int) {
    now := time.Now()

    // find sub lags
    for _, zkcluster := range this.zkclusters {
        for group, consumers := range zkcluster.ConsumersByGroup("") {
            for _, c := range consumers {
                if !c.Online {
                    continue
                }

                if c.ConsumerZnode == nil {
                    log.Warn("cluster[%s] group[%s] topic[%s/%s] unrecognized consumer",
                        zkcluster.Name(), group, c.Topic, c.PartitionId)
                    continue
                }

                if time.Since(c.ConsumerZnode.Uptime()) < time.Minute*2 {
                    log.Info("cluster[%s] group[%s] just started, topic[%s/%s]",
                        zkcluster.Name(), group, c.Topic, c.PartitionId)
                    this.unsuspect(group, c.Topic, c.PartitionId)
                    continue
                }

                // offset commit every 1m, sublag runs every 1m, so the gap might be 2m
                // TODO lag too much, even if it's still alive, emit alarm
                elapsed := time.Since(c.Mtime.Time())
                if c.Lag == 0 || elapsed < time.Minute*3 {
                    this.unsuspect(group, c.Topic, c.PartitionId)
                    continue
                }

                // it might be lagging, but needs confirmation against the last round
                if !this.isSuspect(group, c.Topic, c.PartitionId) {
                    // suspect it; next round, if it is still lagging, put it on trial
                    log.Warn("cluster[%s] group[%s] suspected topic[%s/%s] %d - %d = %d, offset commit elapsed: %s",
                        zkcluster.Name(), group, c.Topic, c.PartitionId, c.ProducerOffset, c.ConsumerOffset, c.Lag, elapsed.String())
                    this.suspect(group, c.Topic, c.PartitionId, c.ProducerOffset, c.ConsumerOffset, now)
                    continue
                }

                if this.isCriminal(group, c.Topic, c.PartitionId, c.ProducerOffset, c.ConsumerOffset, now) {
                    // bingo! consumer is lagging and seems to be DEAD
                    log.Error("cluster[%s] group[%s] confirmed topic[%s/%s] %d - %d = %d, offset commit elapsed: %s",
                        zkcluster.Name(), group, c.Topic, c.PartitionId, c.ProducerOffset, c.ConsumerOffset, c.Lag, elapsed.String())
                    lags++
                } else {
                    log.Warn("cluster[%s] group[%s] lagging but still alive topic[%s/%s] %d - %d = %d, offset commit elapsed: %s",
                        zkcluster.Name(), group, c.Topic, c.PartitionId, c.ProducerOffset, c.ConsumerOffset, c.Lag, elapsed.String())
                }
            }
        }
    }

    return
}
func (this *ResourcePool) get(wait bool) (resource Resource, err error) {
    if this == nil || this.IsClosed() {
        return nil, CLOSED_ERR
    }

    var (
        wrapper   resourceWrapper
        stillOpen bool
    )
    select {
    case wrapper, stillOpen = <-this.resourcePool:
        if !stillOpen {
            return nil, CLOSED_ERR
        }

        this.waitCount.Set(0) // reset
        if wrapper.resource != nil {
            this.diagnosticTracker.BorrowResource(wrapper.resource)
        }

    default:
        if !wait {
            return nil, nil
        }

        this.waitCount.Add(1)
        log.Warn("ResourcePool[%s] busy, pending:%d waited:%s", this.name, this.WaitCount(), this.waitTime.Get())

        t1 := time.Now()
        wrapper = <-this.resourcePool
        this.waitTime.Add(time.Now().Sub(t1))
    }

    // Close the aged idle resource
    timeout := this.idleTimeout.Get()
    if wrapper.resource != nil && timeout > 0 && wrapper.timeUsed.Add(timeout).Sub(time.Now()) < 0 {
        this.diagnosticTracker.ReturnResource(wrapper.resource)
        log.Warn("ResourcePool[%s] resource:%d idle too long: closed", this.name, wrapper.resource.Id())
        wrapper.resource.Close()
        wrapper.resource = nil
    }

    if wrapper.resource == nil {
        wrapper.resource, err = this.factory()
        if err != nil {
            this.resourcePool <- resourceWrapper{}
        } else {
            this.diagnosticTracker.BorrowResource(wrapper.resource)
        }
    }

    return wrapper.resource, err
}
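// NOTE: illustrative sketch, not part of the original source. It demonstrates
// the "grab from a buffered channel, or block and record the wait" idiom the
// pool above is built on, with plain ints standing in for pooled resources; the
// idle-timeout and factory logic of the real pool is left out.
package main

import (
    "fmt"
    "time"
)

func get(pool chan int, wait bool) (int, bool) {
    select {
    case r := <-pool:
        return r, true // a resource was idle, no waiting needed
    default:
        if !wait {
            return 0, false // caller asked for non-blocking behavior
        }

        t1 := time.Now()
        r := <-pool // block until another goroutine puts a resource back
        fmt.Println("waited", time.Since(t1))
        return r, true
    }
}

func main() {
    pool := make(chan int, 1)
    pool <- 42

    if r, ok := get(pool, false); ok {
        fmt.Println("got", r)
        go func() {
            time.Sleep(10 * time.Millisecond)
            pool <- r // return the resource after use
        }()
    }

    r, _ := get(pool, true) // pool is empty now, so this blocks briefly
    fmt.Println("got", r)
}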
func (this *Start) main() {
    ctx.LoadFromHome()

    this.zkzone = zk.NewZkZone(zk.DefaultConfig(this.zone, ctx.ZoneZkAddrs(this.zone)))
    zkConnEvt, ok := this.zkzone.SessionEvents()
    if !ok {
        panic("someone stealing my events")
    }

    registry.Default = zkr.New(this.zkzone)

    log.Info("ehaproxy[%s] starting...", gafka.BuildId)

    go this.runMonitorServer(this.httpAddr)

    zkConnected := false
    for {
        instances, instancesChange, err := registry.Default.WatchInstances()
        if err != nil {
            log.Error("zone[%s] %s", this.zkzone.Name(), err)
            time.Sleep(time.Second)
            continue
        }

        if zkConnected {
            if len(instances) > 0 {
                this.reload(instances)
            } else {
                // resilience to zk problem by local cache
                log.Warn("backend all shutdown? skip this change")
                time.Sleep(time.Second)
                continue
            }
        }

        select {
        case <-this.quitCh:
            return

        case evt := <-zkConnEvt:
            if evt.State == zklib.StateHasSession && !zkConnected {
                log.Info("zk connected")
                zkConnected = true
            } else if zkConnected {
                log.Warn("zk jitter: %+v", evt)
            }

        case <-instancesChange:
            log.Info("instances changed!!")
        }
    }
}
// consume topic: __consumer_offsets and process the message to get offsets of consumers
func (this *ZkCluster) processConsumerOffsetsMessage(msg *sarama.ConsumerMessage) {
    var keyver, valver uint16
    var partition uint32
    var offset, timestamp uint64

    buf := bytes.NewBuffer(msg.Key)
    err := binary.Read(buf, binary.BigEndian, &keyver)
    if (err != nil) || ((keyver != 0) && (keyver != 1)) {
        log.Warn("Failed to decode %s:%v offset %v: keyver", msg.Topic, msg.Partition, msg.Offset)
        return
    }

    group, err := readString(buf)
    if err != nil {
        log.Warn("Failed to decode %s:%v offset %v: group", msg.Topic, msg.Partition, msg.Offset)
        return
    }

    topic, err := readString(buf)
    if err != nil {
        log.Warn("Failed to decode %s:%v offset %v: topic", msg.Topic, msg.Partition, msg.Offset)
        return
    }

    err = binary.Read(buf, binary.BigEndian, &partition)
    if err != nil {
        log.Warn("Failed to decode %s:%v offset %v: partition", msg.Topic, msg.Partition, msg.Offset)
        return
    }

    buf = bytes.NewBuffer(msg.Value)
    err = binary.Read(buf, binary.BigEndian, &valver)
    if (err != nil) || ((valver != 0) && (valver != 1)) {
        log.Warn("Failed to decode %s:%v offset %v: valver", msg.Topic, msg.Partition, msg.Offset)
        return
    }

    err = binary.Read(buf, binary.BigEndian, &offset)
    if err != nil {
        log.Warn("Failed to decode %s:%v offset %v: offset", msg.Topic, msg.Partition, msg.Offset)
        return
    }

    _, err = readString(buf)
    if err != nil {
        log.Warn("Failed to decode %s:%v offset %v: metadata", msg.Topic, msg.Partition, msg.Offset)
        return
    }

    err = binary.Read(buf, binary.BigEndian, &timestamp)
    if err != nil {
        log.Warn("Failed to decode %s:%v offset %v: timestamp", msg.Topic, msg.Partition, msg.Offset)
        return
    }

    partitionOffset := &PartitionOffset{
        Cluster:   this.Name(),
        Topic:     topic,
        Partition: int32(partition),
        Group:     group,
        Timestamp: int64(timestamp),
        Offset:    int64(offset),
    }
    log.Debug("%+v", partitionOffset)
    return
}
func (this *mysql) Query(query string, args ...interface{}) (rows *sql.Rows, err error) {
    if this.db == nil {
        return nil, ErrNotOpen
    }
    if this.breaker.Open() {
        return nil, ErrCircuitOpen
    }

    var stmt *sql.Stmt
    if this.stmtsStore != nil {
        if stmtc, present := this.stmtsStore.Get(query); present {
            stmt = stmtc.(*sql.Stmt)
        } else {
            // FIXME thundering herd
            stmt, err = this.db.Prepare(query)
            if err != nil {
                if this.isSystemError(err) {
                    log.Warn("mysql prepare breaks: %s", err.Error())
                    this.breaker.Fail()
                }

                return nil, err
            }

            this.mutex.Lock()
            this.stmtsStore.Set(query, stmt)
            this.mutex.Unlock()

            log.Debug("[%s] stmt[%s] open", this.dsn, query)
        }
    }

    // Under the hood, db.Query() actually prepares, executes, and closes
    // a prepared statement. That's three round-trips to the database.
    if stmt != nil {
        rows, err = stmt.Query(args...)
    } else {
        rows, err = this.db.Query(query, args...)
    }
    if err != nil {
        if this.isSystemError(err) {
            log.Warn("mysql query breaks: %s", err.Error())
            this.breaker.Fail()
        }
    } else {
        this.breaker.Succeed()
    }

    return
}
func dumpMaintainConfigPhp(info []string) {
    if config.maintainTargetFile == "" || config.maintainTemplateFile == "" || maintainTemplateContents == "" {
        log.Warn("Invalid maintain conf, disabled")
        return
    }

    templateData := make(map[string]string)
    for _, s := range info {
        // s is like "kingdom_1:30"
        parts := strings.SplitN(s, ":", 2)
        templateData[parts[0]] = parts[1]
    }

    t := template.Must(template.New("maintain").Parse(maintainTemplateContents))
    wr := new(bytes.Buffer)
    t.Execute(wr, templateData)

    err := ioutil.WriteFile(config.maintainTargetFile, wr.Bytes(), 0644)
    if err != nil {
        log.Error("dump[%s]: %s", config.maintainTargetFile, err.Error())
    } else {
        log.Info("dumped[%s]: %+v", config.maintainTargetFile, templateData)
    }
}
// TODO from live meta or zk?
func (this *pubPool) RefreshBrokerList(brokerList []string) {
    if len(brokerList) == 0 {
        if len(this.brokerList) > 0 {
            log.Warn("%s meta store found empty broker list, refresh refused", this.cluster)
        }
        return
    }

    setOld, setNew := set.NewSet(), set.NewSet()
    for _, b := range this.brokerList {
        setOld.Add(b)
    }
    for _, b := range brokerList {
        setNew.Add(b)
    }

    if !setOld.Equal(setNew) {
        log.Info("%s broker list from %+v to %+v", this.cluster, this.brokerList, brokerList)

        // rebuild the kafka conn pool
        this.brokerList = brokerList
        this.Close()
        this.buildPools()
    }
}
func (this *zkMetaStore) TopicPartitions(cluster, topic string) []int32 {
    ct := structs.ClusterTopic{Cluster: cluster, Topic: topic}

    this.pmapLock.RLock()
    if partitionIDs, present := this.partitionsMap[ct]; present {
        this.pmapLock.RUnlock()
        return partitionIDs
    }
    this.pmapLock.RUnlock()

    this.pmapLock.Lock()
    defer this.pmapLock.Unlock()

    // double check
    if partitionIDs, present := this.partitionsMap[ct]; present {
        return partitionIDs
    }

    // cache miss
    this.mu.RLock()
    c, ok := this.clusters[cluster]
    this.mu.RUnlock()
    if !ok {
        log.Warn("invalid cluster: %s", cluster)
        return nil
    }

    partitionIDs := c.Partitions(topic)

    // set cache
    this.partitionsMap[ct] = partitionIDs

    return partitionIDs
}
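// NOTE: illustrative sketch, not part of the original source. It shows the
// read-lock / upgrade-to-write-lock / double-check sequence that
// TopicPartitions uses for its partition cache, with a plain map[string][]int32
// and a stubbed loader in place of the zk lookup.
package main

import (
    "fmt"
    "sync"
)

type cache struct {
    mu   sync.RWMutex
    data map[string][]int32
    load func(key string) []int32 // slow path, e.g. a zk query
}

func (c *cache) Get(key string) []int32 {
    c.mu.RLock()
    if v, present := c.data[key]; present {
        c.mu.RUnlock()
        return v
    }
    c.mu.RUnlock()

    c.mu.Lock()
    defer c.mu.Unlock()

    // double check: another goroutine may have filled the entry while we
    // were waiting for the write lock
    if v, present := c.data[key]; present {
        return v
    }

    v := c.load(key)
    c.data[key] = v
    return v
}

func main() {
    c := &cache{
        data: make(map[string][]int32),
        load: func(key string) []int32 { return []int32{0, 1, 2} },
    }
    fmt.Println(c.Get("demo.topic")) // cache miss: hits the loader
    fmt.Println(c.Get("demo.topic")) // cache hit: served under the read lock
}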
func dumpFaeConfigPhp(servers []string) {
    if config.faeTargetFile == "" || config.faeTemplateFile == "" || faeTemplateContents == "" {
        log.Warn("Invalid fae conf, disabled")
        return
    }

    type templateVar struct {
        Servers []string
        Ports   []string
    }
    templateData := templateVar{Servers: make([]string, 0), Ports: make([]string, 0)}
    for _, s := range servers {
        // s is like "12.3.11.2:9001"
        parts := strings.SplitN(s, ":", 2)
        templateData.Servers = append(templateData.Servers, parts[0])
        templateData.Ports = append(templateData.Ports, parts[1])
    }

    t := template.Must(template.New("fae").Parse(faeTemplateContents))
    wr := new(bytes.Buffer)
    t.Execute(wr, templateData)

    err := ioutil.WriteFile(config.faeTargetFile, wr.Bytes(), 0644)
    if err != nil {
        log.Error("dump[%s]: %s", config.faeTargetFile, err.Error())
    } else {
        log.Info("dumped[%s]: %+v", config.faeTargetFile, templateData)
    }
}
func (this *profiler) do(callName string, ctx *rpc.Context, format string, args ...interface{}) {
    if this == nil {
        return
    }

    elapsed := time.Since(this.t1)
    slow := elapsed > config.Engine.Servants.CallSlowThreshold
    if !(slow || this.on) {
        return
    }

    body := fmt.Sprintf(format, args...)
    if slow {
        svtStats.incCallSlow()

        header := fmt.Sprintf("SLOW=%s/%s Q=%s ", elapsed, time.Since(this.t0), callName)
        log.Warn(header + this.truncatedStr(body))
    } else if this.on {
        header := fmt.Sprintf("T=%s/%s Q=%s ", elapsed, time.Since(this.t0), callName)
        log.Trace(header + this.truncatedStr(body))
    }
}
func newRpcDispatcher(prefork bool, maxOutstandingSessions int, handler rpcClientHandler) (this *rpcDispatcher) {
    this = &rpcDispatcher{
        handler:     handler,
        preforkMode: prefork,
    }
    if !this.preforkMode {
        this.throttleChan = make(chan null.NullStruct, maxOutstandingSessions)
        return
    }

    this.clientSocketChan = make(chan thrift.TTransport, maxOutstandingSessions)
    for i := 0; i < maxOutstandingSessions; i++ {
        // prefork
        go func() {
            for {
                // reuse goroutines to reduce GC
                this.handler(<-this.clientSocketChan)
            }

            // NOTE: unreachable; the loop above never exits, so this log is dead code
            log.Warn("dispatcher[%d] terminated", i)
        }()
    }

    return
}
func (this *WatchConsumers) runSubQpsTimer() {
    this.Zkzone.ForSortedClusters(func(zkcluster *zk.ZkCluster) {
        consumerGroups := zkcluster.ConsumerGroups()
        for group := range consumerGroups {
            offsetMap := zkcluster.ConsumerOffsetsOfGroup(group)
            for topic, m := range offsetMap {
                offsetOfGroupOnTopic := int64(0)
                for _, offset := range m {
                    offsetOfGroupOnTopic += offset
                }

                // cluster, topic, group, offset
                tag := telemetry.Tag(zkcluster.Name(), strings.Replace(topic, ".", "_", -1), strings.Replace(group, ".", "_", -1))
                if _, present := this.consumerQps[tag]; !present {
                    this.consumerQps[tag] = metrics.NewRegisteredMeter(tag+"consumer.qps", nil)
                }

                lastOffset := this.lastOffsets[tag]
                if lastOffset == 0 {
                    // first run
                    this.lastOffsets[tag] = offsetOfGroupOnTopic
                } else {
                    delta := offsetOfGroupOnTopic - lastOffset
                    if delta >= 0 {
                        this.consumerQps[tag].Mark(delta)
                        this.lastOffsets[tag] = offsetOfGroupOnTopic
                    } else {
                        log.Warn("cluster[%s] topic[%s] group[%s] offset rewinds: %d %d",
                            zkcluster.Name(), topic, group, offsetOfGroupOnTopic, lastOffset)
                    }
                }
            }
        }
    })
}
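// NOTE: illustrative sketch, not part of the original source. It isolates the
// "rate from a monotonically increasing counter" bookkeeping used above:
// remember the last total per tag, report the non-negative delta, and treat a
// negative delta as an offset rewind rather than feeding it to the meter. The
// qpsTracker type and tag string below are hypothetical.
package main

import "fmt"

type qpsTracker struct {
    lastOffsets map[string]int64
}

func (t *qpsTracker) observe(tag string, total int64) (delta int64, ok bool) {
    last, seen := t.lastOffsets[tag]
    if !seen || last == 0 {
        t.lastOffsets[tag] = total // first run: just remember the baseline
        return 0, false
    }

    delta = total - last
    if delta < 0 {
        return delta, false // offsets rewound; keep the old baseline and flag it
    }

    t.lastOffsets[tag] = total
    return delta, true
}

func main() {
    t := &qpsTracker{lastOffsets: make(map[string]int64)}
    for _, total := range []int64{100, 180, 150} {
        if delta, ok := t.observe("cluster.topic.group", total); ok {
            fmt.Println("consumed since last tick:", delta)
        } else {
            fmt.Println("skipped (first run or rewind), delta:", delta)
        }
    }
}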
func (this *WatchLoadAvg) highLoadCount() (n int64, err error) {
    const threshold = 6.

    cmd := pipestream.New("consul", "exec", "uptime", "|", "grep", "load")
    err = cmd.Open()
    if err != nil {
        return
    }
    defer cmd.Close()

    scanner := bufio.NewScanner(cmd.Reader())
    scanner.Split(bufio.ScanLines)
    for scanner.Scan() {
        line := scanner.Text()
        load1m, e := ctx.ExtractLoadAvg1m(line)
        if e != nil {
            continue
        }

        if load1m > threshold {
            log.Warn(line)
            n++
        }
    }

    return
}
// nextSegmentID returns the next segment ID that is free
func (q *queue) nextSegmentID() (uint64, error) {
    segments, err := ioutil.ReadDir(q.dir)
    if err != nil {
        return 0, err
    }

    var maxID uint64
    for _, segment := range segments {
        if segment.IsDir() || segment.Name() == cursorFile {
            continue
        }

        // Segments file names are all numeric
        segmentID, err := strconv.ParseUint(segment.Name(), 10, 64)
        if err != nil {
            log.Warn("unexpected segment file: %s", filepath.Join(q.dir, segment.Name()))
            continue
        }

        if segmentID > maxID {
            maxID = segmentID
        }
    }

    return maxID + 1, nil
}
func (this *Proxy) refreshPeers(peers []string) {
    // add all latest peers
    for _, peerAddr := range peers {
        this.addRemotePeerIfNecessary(peerAddr)
    }

    // kill dead peers
    for peerAddr := range this.remotePeerPools {
        alive := false
        for _, p := range peers {
            if p == peerAddr {
                // still alive
                alive = true
                break
            }
        }

        if !alive {
            log.Warn("peer[%s] gone away", peerAddr)

            this.mutex.Lock()
            this.remotePeerPools[peerAddr].Close() // kill all conns in this pool
            delete(this.remotePeerPools, peerAddr)
            this.mutex.Unlock()
        }
    }
}
func (this *WatchAppError) Run() {
    defer this.Wg.Done()

    return // disable for now TODO

    appError := metrics.NewRegisteredCounter("kateway.apperr", nil)
    msgChan := make(chan *sarama.ConsumerMessage, 2000)

    if err := this.consumeAppErrLogs(msgChan); err != nil {
        close(msgChan)

        log.Error("%v", err)
        return
    }

    for {
        select {
        case <-this.Stop:
            log.Info("kateway.apperr stopped")
            return

        case msg, ok := <-msgChan:
            if !ok {
                return
            }

            appError.Inc(1)
            log.Warn("%d/%d %s", msg.Partition, msg.Offset, string(msg.Value))
        }
    }
}
func (this *WatchExec) Run() {
    defer this.Wg.Done()

    if this.confDir == "" {
        log.Warn("empty confd, external.exec disabled")
        return
    }

    ticker := time.NewTicker(time.Minute)
    defer ticker.Stop()

    if err := this.watchConfigDir(); err != nil {
        log.Error("%v", err)
        return
    }

    for {
        select {
        case <-this.Stop:
            log.Info("external.exec stopped")
            return

        case <-ticker.C:
        }
    }
}
func (this *subServer) wsReadPump(clientGone chan struct{}, ws *websocket.Conn) {
    ws.SetReadLimit(this.wsReadLimit)
    ws.SetReadDeadline(time.Now().Add(this.wsPongWait))
    ws.SetPongHandler(func(string) error {
        ws.SetReadDeadline(time.Now().Add(this.wsPongWait))
        return nil
    })

    // if kateway shutdown while there are open ws conns, the shutdown will
    // wait 1m: this.subServer.wsPongWait
    for {
        _, message, err := ws.ReadMessage()
        if err != nil {
            if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway) {
                log.Warn("%s: %v", ws.RemoteAddr(), err)
            } else {
                log.Debug("%s: %v", ws.RemoteAddr(), err)
            }

            close(clientGone)
            break
        }

        log.Debug("ws[%s] read: %s", ws.RemoteAddr(), string(message))
    }
}
func (this *Lock) Lock(key string) (success bool) {
    this.mutex.Lock()

    mtime, present := this.items.Get(key)
    if !present {
        this.items.Set(key, time.Now())
        this.mutex.Unlock()
        return true
    }

    // present, check expires
    elapsed := time.Since(mtime.(time.Time))
    if this.cf.Expires > 0 && elapsed > this.cf.Expires {
        log.Warn("lock[%s] expires: %s, kicked", key, elapsed)

        // ignore the aged lock, refresh the lock
        this.items.Set(key, time.Now())
        this.mutex.Unlock()
        return true
    }

    this.mutex.Unlock()
    return false
}
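// NOTE: illustrative sketch, not part of the original source. It shows the
// expiring in-memory lock idea behind Lock.Lock above: the first caller wins,
// later callers fail until the entry ages past the configured expiry, at which
// point the lock is treated as leaked and taken over. The expiringLock type is
// hypothetical and uses a plain map instead of the real item store.
package main

import (
    "fmt"
    "sync"
    "time"
)

type expiringLock struct {
    mu      sync.Mutex
    items   map[string]time.Time
    expires time.Duration
}

func (l *expiringLock) Lock(key string) bool {
    l.mu.Lock()
    defer l.mu.Unlock()

    mtime, present := l.items[key]
    if present && (l.expires <= 0 || time.Since(mtime) <= l.expires) {
        return false // somebody holds it and it has not expired yet
    }

    l.items[key] = time.Now() // fresh acquire, or takeover of an aged lock
    return true
}

func main() {
    l := &expiringLock{items: make(map[string]time.Time), expires: 50 * time.Millisecond}
    fmt.Println(l.Lock("job:42")) // true: first acquire
    fmt.Println(l.Lock("job:42")) // false: still held
    time.Sleep(60 * time.Millisecond)
    fmt.Println(l.Lock("job:42")) // true: previous holder expired
}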
func (this *pubStore) doRefresh() {
    if time.Since(this.lastRefreshedAt) <= time.Second*5 {
        log.Warn("ignored too frequent refresh: %s", time.Since(this.lastRefreshedAt))
        return
    }

    this.pubPoolsLock.Lock()
    defer this.pubPoolsLock.Unlock()

    // pub pool
    activeClusters := make(map[string]struct{})
    for _, cluster := range meta.Default.ClusterNames() {
        activeClusters[cluster] = struct{}{}
        if _, present := this.pubPools[cluster]; !present {
            // found a new cluster
            this.pubPools[cluster] = newPubPool(this, cluster, meta.Default.BrokerList(cluster), this.pubPoolsCapcity)
        } else {
            this.pubPools[cluster].RefreshBrokerList(meta.Default.BrokerList(cluster))
        }
    }

    // shutdown the dead clusters
    for cluster, pool := range this.pubPools {
        if _, present := activeClusters[cluster]; !present {
            // this cluster is dead or removed forever
            pool.Close()
            delete(this.pubPools, cluster)
        }
    }

    this.lastRefreshedAt = time.Now()
}
func (this *controller) watchZk() {
    evtCh, ok := this.orchestrator.SessionEvents()
    if !ok {
        panic("someone else is stealing my zk events?")
    }

    // during connecting phase, the following events are fired:
    // StateConnecting -> StateConnected -> StateHasSession
    firstHandShaked := false
    for {
        select {
        case <-this.quiting:
            return

        case evt := <-evtCh:
            if !firstHandShaked {
                if evt.State == zklib.StateHasSession {
                    firstHandShaked = true
                }

                continue
            }

            log.Warn("zk jitter: %+v", evt)

            if evt.State == zklib.StateHasSession {
                log.Warn("zk reconnected after session lost, watcher/ephemeral might be lost")

                registered, err := this.orchestrator.ActorRegistered(this.Id())
                if err != nil {
                    log.Error("registry: %s", err)
                    this.orchestrator.CallSOS(fmt.Sprintf("actord[%s]", this.Id()), "zk session expired")
                } else if !registered {
                    if err = this.orchestrator.RegisterActor(this.Id(), this.Bytes()); err != nil {
                        log.Error("registry: %s", err)
                    } else {
                        log.Info("registry re-register controller[%s] ok", this.ident)
                    }
                } else {
                    log.Info("registry lucky, ephemeral still present")
                }
            }
        }
    }
}
// @rest GET /v1/partitions/:appid/:topic/:ver
func (this *manServer) partitionsHandler(w http.ResponseWriter, r *http.Request, params httprouter.Params) {
    topic := params.ByName(UrlParamTopic)
    hisAppid := params.ByName(UrlParamAppid)
    appid := r.Header.Get(HttpHeaderAppid)
    pubkey := r.Header.Get(HttpHeaderPubkey)
    ver := params.ByName(UrlParamVersion)
    realIp := getHttpRemoteIp(r)

    cluster, found := manager.Default.LookupCluster(hisAppid)
    if !found {
        log.Error("partitions[%s] %s(%s) {app:%s topic:%s ver:%s} invalid appid",
            appid, r.RemoteAddr, realIp, hisAppid, topic, ver)
        writeBadRequest(w, "invalid appid")
        return
    }

    if !manager.Default.AuthAdmin(appid, pubkey) {
        log.Warn("suspicious partitions call from %s(%s) {cluster:%s app:%s key:%s topic:%s ver:%s}",
            r.RemoteAddr, realIp, cluster, appid, pubkey, topic, ver)
        writeAuthFailure(w, manager.ErrAuthenticationFail)
        return
    }

    log.Info("partitions[%s] %s(%s) {cluster:%s app:%s topic:%s ver:%s}",
        appid, r.RemoteAddr, realIp, cluster, hisAppid, topic, ver)

    zkcluster := meta.Default.ZkCluster(cluster)
    if zkcluster == nil {
        log.Error("suspicious partitions call from %s(%s) {cluster:%s app:%s key:%s topic:%s ver:%s} undefined cluster",
            r.RemoteAddr, realIp, cluster, appid, pubkey, topic, ver)
        writeBadRequest(w, "undefined cluster")
        return
    }

    kfk, err := sarama.NewClient(zkcluster.BrokerList(), sarama.NewConfig())
    if err != nil {
        log.Error("cluster[%s] %v", zkcluster.Name(), err)
        writeServerError(w, err.Error())
        return
    }
    defer kfk.Close()

    partitions, err := kfk.Partitions(manager.Default.KafkaTopic(hisAppid, topic, ver))
    if err != nil {
        log.Error("cluster[%s] from %s(%s) {app:%s topic:%s ver:%s} %v",
            zkcluster.Name(), r.RemoteAddr, realIp, hisAppid, topic, ver, err)
        writeServerError(w, err.Error())
        return
    }

    w.Write([]byte(fmt.Sprintf(`{"num": %d}`, len(partitions))))
}
func (this *WatchReplicas) report() (deadPartitions, outOfSyncPartitions int64) {
    this.Zkzone.ForSortedClusters(func(zkcluster *zk.ZkCluster) {
        brokerList := zkcluster.BrokerList()
        if len(brokerList) == 0 {
            log.Warn("cluster[%s] empty brokers", zkcluster.Name())
            return
        }

        kfk, err := sarama.NewClient(brokerList, sarama.NewConfig())
        if err != nil {
            log.Error("cluster[%s] %v", zkcluster.Name(), err)
            return
        }
        defer kfk.Close()

        topics, err := kfk.Topics()
        if err != nil {
            log.Error("cluster[%s] %v", zkcluster.Name(), err)
            return
        }

        for _, topic := range topics {
            alivePartitions, err := kfk.WritablePartitions(topic)
            if err != nil {
                log.Error("cluster[%s] topic:%s %v", zkcluster.Name(), topic, err)
                continue
            }

            partitions, err := kfk.Partitions(topic)
            if err != nil {
                log.Error("cluster[%s] topic:%s %v", zkcluster.Name(), topic, err)
                continue
            }

            // some partitions are dead
            if len(alivePartitions) != len(partitions) {
                deadPartitions += 1
            }

            for _, partitionID := range alivePartitions {
                replicas, err := kfk.Replicas(topic, partitionID)
                if err != nil {
                    log.Error("cluster[%s] topic:%s partition:%d %v", zkcluster.Name(), topic, partitionID, err)
                    continue
                }

                isr, _, _ := zkcluster.Isr(topic, partitionID)
                if len(isr) != len(replicas) {
                    outOfSyncPartitions += 1
                }
            }
        }
    })

    return
}
func (this *FunServantImpl) Lock(ctx *rpc.Context, reason string, key string) (r bool, ex error) {
    const IDENT = "lock"

    svtStats.inc(IDENT)

    profiler, err := this.getSession(ctx).startProfiler()
    if err != nil {
        ex = err
        return
    }

    var peer string
    if ctx.IsSetSticky() && *ctx.Sticky {
        svtStats.incPeerCall()

        r = this.lk.Lock(key)
    } else {
        svt, err := this.proxy.ServantByKey(key) // FIXME add prefix?
        if err != nil {
            ex = err
            if svt != nil {
                if proxy.IsIoError(err) {
                    svt.Close()
                }
                svt.Recycle()
            }

            return
        }

        if svt == proxy.Self {
            r = this.lk.Lock(key)
        } else {
            svtStats.incCallPeer()

            peer = svt.Addr()
            svt.HijackContext(ctx)
            r, ex = svt.Lock(ctx, reason, key)
            if ex != nil {
                if proxy.IsIoError(ex) {
                    svt.Close()
                }
            }
            svt.Recycle()
        }
    }

    profiler.do(IDENT, ctx, "P=%s {reason^%s key^%s} {r^%v}", peer, reason, key, r)
    if !r {
        log.Warn("P=%s lock failed: {reason^%s key^%s}", peer, reason, key)
    }

    return
}
func dumpActorConfigPhp(servers []string) {
    if config.actorTargetFile == "" || config.actorTemplateFile == "" || actorTemplateContents == "" {
        log.Warn("Invalid actor conf, disabled")
        return
    }

    log.Info("dumped[%s]: %+v", config.actorTargetFile, servers)
}