func StartStatsD() {
	if !g.Config().StatsD.Enabled {
		return
	}

	addr := g.Config().StatsD.Listen
	udpAddr, err := net.ResolveUDPAddr("udp", addr)
	if err != nil {
		log.Fatalf("net.ResolveUDPAddr fail: %s", err)
	}

	listener, err := net.ListenUDP("udp", udpAddr)
	if err != nil {
		log.Fatalf("listen %s fail: %s", addr, err)
	} else {
		log.Println("statsd listening", addr)
	}
	defer listener.Close()

	go calc()

	for {
		message := make([]byte, 512)
		n, remaddr, err := listener.ReadFrom(message)
		if err != nil {
			continue
		}
		buf := bytes.NewBuffer(message[0:n])
		go handleMessage(listener, remaddr, buf)
	}
}
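// handleMessage is referenced above but not shown in this listing. Below is a
// minimal sketch, assuming the standard statsd wire format
// "bucket:value|type[|@sample]". The Metric struct and the metrics channel
// mirror the fields consumed by calc() further down, but their real names,
// types, and the channel buffer size are assumptions.
type Metric struct {
	Bucket   string
	Value    float64
	Modifier string  // "ms", "g", "s" or "c"
	Sampling float32 // sample rate parsed from the optional "@" suffix
}

var metrics = make(chan *Metric, 1024) // hypothetical buffer size

func handleMessage(listener *net.UDPConn, remaddr net.Addr, buf *bytes.Buffer) {
	// listener and remaddr are kept to match the call site; unused in this sketch.
	// One UDP packet may carry several newline-separated statsd lines.
	for _, line := range strings.Split(buf.String(), "\n") {
		if line == "" {
			continue
		}
		i := strings.Index(line, ":")
		if i < 0 {
			continue
		}
		bucket := line[:i]
		fields := strings.Split(line[i+1:], "|")
		if len(fields) < 2 {
			continue
		}
		value, err := strconv.ParseFloat(fields[0], 64)
		if err != nil {
			continue
		}
		sampling := float32(1.0)
		if len(fields) >= 3 && strings.HasPrefix(fields[2], "@") {
			if s, err := strconv.ParseFloat(fields[2][1:], 32); err == nil {
				sampling = float32(s)
			}
		}
		metrics <- &Metric{Bucket: bucket, Value: value, Modifier: fields[1], Sampling: sampling}
	}
}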
func StartSocket() {
	if !g.Config().Socket.Enabled {
		return
	}

	addr := g.Config().Socket.Listen
	tcpAddr, err := net.ResolveTCPAddr("tcp", addr)
	if err != nil {
		log.Fatalf("net.ResolveTCPAddr fail: %s", err)
	}

	listener, err := net.ListenTCP("tcp", tcpAddr)
	if err != nil {
		log.Fatalf("listen %s fail: %s", addr, err)
	} else {
		log.Println("socket listening", addr)
	}
	defer listener.Close()

	for {
		conn, err := listener.Accept()
		if err != nil {
			log.Println("listener.Accept error:", err)
			continue
		}
		go socketTelnetHandle(conn)
	}
}
func StartRpc() {
	if !g.Config().Rpc.Enabled {
		return
	}

	addr := g.Config().Rpc.Listen
	tcpAddr, err := net.ResolveTCPAddr("tcp", addr)
	if err != nil {
		log.Fatalf("net.ResolveTCPAddr fail: %s", err)
	}

	listener, err := net.ListenTCP("tcp", tcpAddr)
	if err != nil {
		log.Fatalf("listen %s fail: %s", addr, err)
	} else {
		log.Println("rpc listening", addr)
	}

	server := rpc.NewServer()
	server.Register(new(Transfer))

	for {
		conn, err := listener.Accept()
		if err != nil {
			log.Println("listener.Accept error:", err)
			continue
		}
		// go rpc.ServeConn(conn)
		go server.ServeCodec(jsonrpc.NewServerCodec(conn))
	}
}
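// A minimal client-side sketch for the JSON-RPC endpoint above, using the
// standard library's net/rpc/jsonrpc codec. It assumes the Transfer receiver
// exposes an Update method matching the RecvMetricValues signature shown at
// the end of this listing; the "Transfer.Update" method name is an assumption.
func sendOnce(addr string, values []*cmodel.MetricValue) error {
	client, err := jsonrpc.Dial("tcp", addr)
	if err != nil {
		return err
	}
	defer client.Close()

	var resp cmodel.TransferResponse
	return client.Call("Transfer.Update", values, &resp)
}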
func initConnPools() {
	cfg := g.Config()

	// judge
	judgeInstances := nset.NewStringSet()
	for _, instance := range cfg.Judge.Cluster {
		judgeInstances.Add(instance)
	}
	JudgeConnPools = cpool.CreateSafeRpcConnPools(cfg.Judge.MaxConns, cfg.Judge.MaxIdle,
		cfg.Judge.ConnTimeout, cfg.Judge.CallTimeout, judgeInstances.ToSlice())

	// graph
	graphInstances := nset.NewSafeSet()
	for _, nitem := range cfg.Graph.Cluster2 {
		for _, addr := range nitem.Addrs {
			graphInstances.Add(addr)
		}
	}
	GraphConnPools = cpool.CreateSafeRpcConnPools(cfg.Graph.MaxConns, cfg.Graph.MaxIdle,
		cfg.Graph.ConnTimeout, cfg.Graph.CallTimeout, graphInstances.ToSlice())

	// graph migrating
	if cfg.Graph.Migrating && cfg.Graph.ClusterMigrating != nil {
		graphMigratingInstances := nset.NewSafeSet()
		for _, cnode := range cfg.Graph.ClusterMigrating2 {
			for _, addr := range cnode.Addrs {
				graphMigratingInstances.Add(addr)
			}
		}
		GraphMigratingConnPools = cpool.CreateSafeRpcConnPools(cfg.Graph.MaxConns, cfg.Graph.MaxIdle,
			cfg.Graph.ConnTimeout, cfg.Graph.CallTimeout, graphMigratingInstances.ToSlice())
	}
}
func initSendQueues() {
	cfg := g.Config()

	for node := range cfg.Judge.Cluster {
		Q := nlist.NewSafeListLimited(DefaultSendQueueMaxSize)
		JudgeQueues[node] = Q
	}

	for node, nitem := range cfg.Graph.Cluster2 {
		for _, addr := range nitem.Addrs {
			Q := nlist.NewSafeListLimited(DefaultSendQueueMaxSize)
			GraphQueues[node+addr] = Q
		}
	}

	if cfg.Graph.Migrating && cfg.Graph.ClusterMigrating != nil {
		for node, cnode := range cfg.Graph.ClusterMigrating2 {
			for _, addr := range cnode.Addrs {
				Q := nlist.NewSafeListLimited(DefaultSendQueueMaxSize)
				GraphMigratingQueues[node+addr] = Q
			}
		}
	}

	if cfg.Tsdb.Enabled {
		TsdbQueue = nlist.NewSafeListLimited(DefaultSendQueueMaxSize)
	}
}
func initConnPools() {
	cfg := g.Config()

	judgeInstances := set.NewStringSet()
	for _, instance := range cfg.Judge.Cluster {
		judgeInstances.Add(instance)
	}
	JudgeConnPools = cpool.CreateSafeRpcConnPools(cfg.Judge.MaxConns, cfg.Judge.MaxIdle,
		cfg.Judge.ConnTimeout, cfg.Judge.CallTimeout, judgeInstances.ToSlice())

	graphInstances := set.NewStringSet()
	for _, instance := range cfg.Graph.Cluster {
		graphInstances.Add(instance)
	}
	GraphConnPools = cpool.CreateSafeRpcConnPools(cfg.Graph.MaxConns, cfg.Graph.MaxIdle,
		cfg.Graph.ConnTimeout, cfg.Graph.CallTimeout, graphInstances.ToSlice())

	if cfg.Graph.Migrating && cfg.Graph.ClusterMigrating != nil {
		graphMigratingInstances := set.NewStringSet()
		for _, instance := range cfg.Graph.ClusterMigrating {
			graphMigratingInstances.Add(instance)
		}
		GraphMigratingConnPools = cpool.CreateSafeRpcConnPools(cfg.Graph.MaxConns, cfg.Graph.MaxIdle,
			cfg.Graph.ConnTimeout, cfg.Graph.CallTimeout, graphMigratingInstances.ToSlice())
	}
}
// Push data into the send queue of a Graph node; which node receives it is
// decided by consistent hashing. During data migration, a copy is also sent to
// the new cluster in addition to the original one (deduplicated where old and
// new overlap, so one item is never pushed to the same Graph node twice).
func Push2GraphSendQueue(items []*cmodel.MetaData, migrating bool) {
	cfg := g.Config().Graph

	for _, item := range items {
		graphItem, err := convert2GraphItem(item)
		if err != nil {
			log.Println("E:", err)
			continue
		}
		pk := item.PK()

		// statistics: placed here for efficiency, so tracing only works when graph is enabled
		proc.RecvDataTrace.Trace(pk, item)
		proc.RecvDataFilter.Filter(pk, item.Value, item)

		node, err := GraphNodeRing.GetNode(pk)
		if err != nil {
			log.Println("E:", err)
			continue
		}

		cnode := cfg.Cluster2[node]
		errCnt := 0
		for _, addr := range cnode.Addrs {
			Q := GraphQueues[node+addr]
			if !Q.PushFront(graphItem) {
				errCnt += 1
			}
		}
		// statistics
		if errCnt > 0 {
			proc.SendToGraphDropCnt.Incr()
		}

		if migrating {
			migratingNode, err := GraphMigratingNodeRing.GetNode(pk)
			if err != nil {
				log.Println("E:", err)
				continue
			}

			if node != migratingNode { // dedupe between old and new clusters during migration
				cnodem := cfg.ClusterMigrating2[migratingNode]
				errCnt := 0
				for _, addr := range cnodem.Addrs {
					MQ := GraphMigratingQueues[migratingNode+addr]
					if !MQ.PushFront(graphItem) {
						errCnt += 1
					}
				}
				// statistics
				if errCnt > 0 {
					proc.SendToGraphMigratingDropCnt.Incr()
				}
			}
		}
	}
}
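// convert2GraphItem is not shown in this listing. A minimal sketch of the
// conversion, assuming cmodel.GraphItem mirrors the MetaData fields plus
// RRD-style metadata; the field names and the Heartbeat/Min/Max defaults here
// are assumptions.
func convert2GraphItem(d *cmodel.MetaData) (*cmodel.GraphItem, error) {
	if d.Step < 1 {
		return nil, fmt.Errorf("invalid step: %d", d.Step)
	}
	return &cmodel.GraphItem{
		Endpoint:  d.Endpoint,
		Metric:    d.Metric,
		Tags:      d.Tags,
		Timestamp: d.Timestamp,
		Value:     d.Value,
		DsType:    d.CounterType,
		Step:      d.Step,
		Heartbeat: d.Step * 2, // assumed default
		Min:       "U",        // "U" means unbounded, following rrdtool convention
		Max:       "U",
	}, nil
}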
func configCommonRoutes() {
	http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("ok\n"))
	})

	http.HandleFunc("/version", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte(fmt.Sprintf("%s\n", g.VERSION)))
	})

	http.HandleFunc("/workdir", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte(fmt.Sprintf("%s\n", file.SelfDir())))
	})

	http.HandleFunc("/config", func(w http.ResponseWriter, r *http.Request) {
		RenderDataJson(w, g.Config())
	})

	http.HandleFunc("/config/reload", func(w http.ResponseWriter, r *http.Request) {
		if strings.HasPrefix(r.RemoteAddr, "127.0.0.1") {
			g.ParseConfig(g.ConfigFile)
			RenderDataJson(w, "ok")
		} else {
			RenderDataJson(w, "no privilege")
		}
	})
}
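// RenderDataJson is used by the routes above but not shown here. A minimal
// sketch, assuming a {"msg","data"} envelope serialized with encoding/json;
// the exact field names are assumptions.
func RenderDataJson(w http.ResponseWriter, data interface{}) {
	w.Header().Set("Content-Type", "application/json; charset=utf-8")
	body, err := json.Marshal(map[string]interface{}{"msg": "success", "data": data})
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	w.Write(body)
}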
// TODO add control (e.g. stop) over the send tasks
func startSendTasks() {
	cfg := g.Config()

	// init concurrency limits
	judgeConcurrent := cfg.Judge.MaxIdle
	graphConcurrent := cfg.Graph.MaxIdle
	if judgeConcurrent < 1 {
		judgeConcurrent = 1
	}
	if graphConcurrent < 1 {
		graphConcurrent = 1
	}

	// init send go-routines
	for node := range cfg.Judge.Cluster {
		queue := JudgeQueues[node]
		go forward2JudgeTask(queue, node, judgeConcurrent)
	}

	for node, nitem := range cfg.Graph.Cluster2 {
		for _, addr := range nitem.Addrs {
			queue := GraphQueues[node+addr]
			go forward2GraphTask(queue, node, addr, graphConcurrent)
		}
	}

	if cfg.Graph.Migrating {
		for node, cnodem := range cfg.Graph.ClusterMigrating2 {
			for _, addr := range cnodem.Addrs {
				queue := GraphMigratingQueues[node+addr]
				go forward2GraphMigratingTask(queue, node, addr, graphConcurrent)
			}
		}
	}
}
func socketTelnetHandle(conn net.Conn) {
	defer conn.Close()

	items := []*cmodel.MetaData{}
	buf := bufio.NewReader(conn)

	cfg := g.Config()
	timeout := time.Duration(cfg.Socket.Timeout) * time.Second

	for {
		conn.SetReadDeadline(time.Now().Add(timeout))
		line, err := buf.ReadString('\n')
		if err != nil {
			break
		}

		line = strings.Trim(line, "\n")
		if line == "quit" {
			break
		}
		if line == "" {
			continue
		}

		t := strings.Fields(line)
		if len(t) < 2 {
			continue
		}

		cmd := t[0]
		if cmd != "update" {
			continue
		}

		item, err := convertLine2MetaData(t[1:])
		if err != nil {
			continue
		}
		items = append(items, item)
	}

	// statistics
	proc.SocketRecvCnt.IncrBy(int64(len(items)))
	proc.RecvCnt.IncrBy(int64(len(items)))

	if cfg.Graph.Enabled {
		sender.Push2GraphSendQueue(items, cfg.Graph.Migrating)
	}
	if cfg.Judge.Enabled {
		sender.Push2JudgeSendQueue(items)
	}
}
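// convertLine2MetaData is not shown in this listing. A minimal sketch,
// assuming the telnet line carries "endpoint metric timestamp step value type"
// after the "update" keyword; the field order is an assumption.
func convertLine2MetaData(fields []string) (*cmodel.MetaData, error) {
	if len(fields) != 6 {
		return nil, fmt.Errorf("bad fields: %v", fields)
	}
	ts, err := strconv.ParseInt(fields[2], 10, 64)
	if err != nil {
		return nil, err
	}
	step, err := strconv.ParseInt(fields[3], 10, 64)
	if err != nil {
		return nil, err
	}
	value, err := strconv.ParseFloat(fields[4], 64)
	if err != nil {
		return nil, err
	}
	return &cmodel.MetaData{
		Endpoint:    fields[0],
		Metric:      fields[1],
		Timestamp:   ts,
		Step:        step,
		Value:       value,
		CounterType: fields[5],
		Tags:        map[string]string{},
	}, nil
}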
func calc() {
	t := time.NewTicker(time.Duration(g.Config().StatsD.Interval) * time.Second)
	for {
		select {
		case <-t.C:
			transfer()
		case s := <-metrics:
			switch s.Modifier {
			case "ms":
				if _, ok := timers[s.Bucket]; !ok {
					timers[s.Bucket] = []float32{}
				}
				timers[s.Bucket] = append(timers[s.Bucket], float32(s.Value)*s.Sampling)
			case "g":
				gauges[s.Bucket] = s.Value
			case "s":
				if _, ok := sets[s.Bucket]; !ok {
					sets[s.Bucket] = mapset.NewSet()
				}
				sets[s.Bucket].Add(s.Value)
			case "c":
				if _, ok := counters[s.Bucket]; !ok {
					counters[s.Bucket] = 0
				}
				counters[s.Bucket] += float32(s.Value) * (1.0 / s.Sampling)
			}
		}
	}
}
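// The aggregation state consumed by calc() is declared elsewhere. A sketch of
// the shapes implied by the code above; the names match calc(), the rest is
// assumed (mapset refers to github.com/deckarep/golang-set).
var (
	timers   = make(map[string][]float32)  // "ms": raw observations per bucket
	gauges   = make(map[string]float64)    // "g": last value wins
	sets     = make(map[string]mapset.Set) // "s": unique values per bucket
	counters = make(map[string]float32)    // "c": sampling-corrected sums
)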
// TODO add control (e.g. stop) over the send tasks
func startSendTasks() {
	cfg := g.Config()

	// init semaphores
	judgeConcurrent := cfg.Judge.MaxIdle / 2
	graphConcurrent := cfg.Graph.MaxIdle / 2
	if judgeConcurrent < 1 {
		judgeConcurrent = 1
	}
	if graphConcurrent < 1 {
		graphConcurrent = 1
	}
	semaSendToJudge = nsema.NewSemaphore(judgeConcurrent)
	semaSendToGraph = nsema.NewSemaphore(graphConcurrent)
	semaSendToGraphMigrating = nsema.NewSemaphore(graphConcurrent)

	// init send go-routines
	for node := range cfg.Judge.Cluster {
		queue := JudgeQueues[node]
		go forward2JudgeTask(queue, node, judgeConcurrent)
	}
	for node := range cfg.Graph.Cluster {
		queue := GraphQueues[node]
		go forward2GraphTask(queue, node, graphConcurrent)
	}
	if cfg.Graph.Migrating {
		for node := range cfg.Graph.ClusterMigrating {
			queue := GraphMigratingQueues[node]
			go forward2GraphMigratingTask(queue, node, graphConcurrent)
		}
	}
}
// Judge send task: drains the Judge send queue and ships data to Judge
// through the RPC connection pool.
func forward2JudgeTask(Q *list.SafeLinkedListLimited, node string, concurrent int) {
	batch := g.Config().Judge.Batch // at most batch items per send
	addr := g.Config().Judge.Cluster[node]
	sema := nsema.NewSemaphore(concurrent)

	for {
		items := Q.PopBack(batch)
		count := len(items)
		if count == 0 {
			time.Sleep(DefaultSendTaskSleepInterval)
			continue
		}

		judgeItems := make([]*cmodel.JudgeItem, count)
		for i := 0; i < count; i++ {
			judgeItems[i] = items[i].(*cmodel.JudgeItem)
		}

		// synchronous call with bounded concurrency
		sema.Acquire()
		go func(addr string, judgeItems []*cmodel.JudgeItem, count int) {
			defer sema.Release()

			resp := &cmodel.SimpleRpcResponse{}
			var err error
			sendOk := false
			for i := 0; i < 3; i++ { // at most 3 attempts
				err = JudgeConnPools.Call(addr, "Judge.Send", judgeItems, resp)
				if err == nil {
					sendOk = true
					break
				}
				time.Sleep(time.Millisecond * 10)
			}

			// statistics
			if !sendOk {
				log.Printf("send judge %s fail: %v", addr, err)
				proc.SendToJudgeFailCnt.IncrBy(int64(count))
			} else {
				proc.SendToJudgeCnt.IncrBy(int64(count))
			}
		}(addr, judgeItems, count)
	}
}
// Graph send task: sends the redundant copy of the data to the new cluster
// while a data migration is in progress.
func forward2GraphMigratingTask(Q *list.SafeLinkedListLimited, node string, concurrent int) {
	batch := g.Config().Graph.Batch // at most batch items per send
	addr := g.Config().Graph.ClusterMigrating[node]
	sema := nsema.NewSemaphore(concurrent)

	for {
		items := Q.PopBack(batch)
		count := len(items)
		if count == 0 {
			time.Sleep(DefaultSendTaskSleepInterval)
			continue
		}

		graphItems := make([]*cmodel.GraphItem, count)
		for i := 0; i < count; i++ {
			graphItems[i] = items[i].(*cmodel.GraphItem)
		}

		sema.Acquire()
		go func(addr string, graphItems []*cmodel.GraphItem, count int) {
			defer sema.Release()

			resp := &cmodel.SimpleRpcResponse{}
			var err error
			sendOk := false
			for i := 0; i < 3; i++ { // at most 3 attempts
				err = GraphMigratingConnPools.Call(addr, "Graph.Send", graphItems, resp)
				if err == nil {
					sendOk = true
					break
				}
				time.Sleep(time.Millisecond * 10) // back off after a failed send
			}

			// statistics
			if !sendOk {
				log.Printf("send to graph migrating %s fail: %v", addr, err)
				proc.SendToGraphMigratingFailCnt.IncrBy(int64(count))
			} else {
				proc.SendToGraphMigratingCnt.IncrBy(int64(count))
			}
		}(addr, graphItems, count)
	}
}
func initNodeRings() {
	cfg := g.Config()

	JudgeNodeRing = newConsistentHashNodesRing(cfg.Judge.Replicas, KeysOfMap(cfg.Judge.Cluster))
	GraphNodeRing = newConsistentHashNodesRing(cfg.Graph.Replicas, KeysOfMap(cfg.Graph.Cluster))

	if cfg.Graph.Migrating && cfg.Graph.ClusterMigrating != nil {
		GraphMigratingNodeRing = newConsistentHashNodesRing(cfg.Graph.Replicas, KeysOfMap(cfg.Graph.ClusterMigrating))
	}
}
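// KeysOfMap is a small helper; a sketch consistent with the clusters being
// map[string]string of node name to address, as implied by the lookup
// g.Config().Judge.Cluster[node] in forward2JudgeTask above.
func KeysOfMap(m map[string]string) []string {
	keys := make([]string, 0, len(m))
	for k := range m {
		keys = append(keys, k)
	}
	return keys
}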
// Tsdb send task: forwards data to TSDB through its API.
func forward2TsdbTask(concurrent int) {
	batch := g.Config().Tsdb.Batch // at most batch items per send
	retry := g.Config().Tsdb.MaxRetry
	sema := nsema.NewSemaphore(concurrent)

	for {
		items := TsdbQueue.PopBackBy(batch)
		if len(items) == 0 {
			time.Sleep(DefaultSendTaskSleepInterval)
			continue
		}

		// synchronous call with bounded concurrency
		sema.Acquire()
		go func(itemList []interface{}) {
			defer sema.Release()

			var tsdbBuffer bytes.Buffer
			for i := 0; i < len(itemList); i++ {
				tsdbItem := itemList[i].(*cmodel.TsdbItem)
				tsdbBuffer.WriteString(tsdbItem.TsdbString())
				tsdbBuffer.WriteString("\n")
			}

			var err error
			for i := 0; i < retry; i++ {
				err = TsdbConnPoolHelper.Send(tsdbBuffer.Bytes())
				if err == nil {
					proc.SendToTsdbCnt.IncrBy(int64(len(itemList)))
					break
				}
				time.Sleep(100 * time.Millisecond)
			}

			if err != nil {
				proc.SendToTsdbFailCnt.IncrBy(int64(len(itemList)))
				log.Println(err)
				return
			}
		}(items)
	}
}
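// cmodel.TsdbItem's TsdbString method (used above) is expected to render one
// line of the OpenTSDB telnet "put" protocol. A free-function sketch of the
// likely output; the TsdbItem field names are assumptions.
func tsdbString(t *cmodel.TsdbItem) string {
	var buf bytes.Buffer
	fmt.Fprintf(&buf, "put %s %d %.6f", t.Metric, t.Timestamp, t.Value)
	for k, v := range t.Tags {
		fmt.Fprintf(&buf, " %s=%s", k, v)
	}
	return buf.String()
}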
func startHttpServer() {
	if !g.Config().Http.Enabled {
		return
	}

	addr := g.Config().Http.Listen
	if addr == "" {
		return
	}

	configCommonRoutes()
	configProcHttpRoutes()
	configDebugHttpRoutes()
	configApiHttpRoutes()

	s := &http.Server{
		Addr:           addr,
		MaxHeaderBytes: 1 << 30,
	}

	log.Println("http.startHttpServer ok, listening", addr)
	log.Fatalln(s.ListenAndServe())
}
// process new metric values
func RecvMetricValues(args []*cmodel.MetricValue, reply *cmodel.TransferResponse, from string) error {
	start := time.Now()
	reply.Invalid = 0

	items := []*cmodel.MetaData{}
	for _, v := range args {
		if v == nil {
			reply.Invalid += 1
			continue
		}

		// Legacy issue: old agents reported metric=kernel.hostname with a
		// string value, which is no longer supported, so it is hard-coded out here.
		if v.Metric == "kernel.hostname" {
			reply.Invalid += 1
			continue
		}

		if v.Metric == "" || v.Endpoint == "" {
			reply.Invalid += 1
			continue
		}

		if v.Type != g.COUNTER && v.Type != g.GAUGE && v.Type != g.DERIVE {
			reply.Invalid += 1
			continue
		}

		if v.Value == "" {
			reply.Invalid += 1
			continue
		}

		if v.Step <= 0 {
			reply.Invalid += 1
			continue
		}

		if len(v.Metric)+len(v.Tags) > 510 {
			reply.Invalid += 1
			continue
		}

		// TODO this deserves a more elegant treatment
		now := start.Unix()
		if v.Timestamp <= 0 || v.Timestamp > now*2 {
			v.Timestamp = now
		}

		fv := &cmodel.MetaData{
			Metric:      v.Metric,
			Endpoint:    v.Endpoint,
			Timestamp:   v.Timestamp,
			Step:        v.Step,
			CounterType: v.Type,
			Tags:        cutils.DictedTagstring(v.Tags), // TODO limit the number of tag key-value pairs
		}

		valid := true
		var vv float64
		var err error

		switch cv := v.Value.(type) {
		case string:
			vv, err = strconv.ParseFloat(cv, 64)
			if err != nil {
				valid = false
			}
		case float64:
			vv = cv
		case int64:
			vv = float64(cv)
		default:
			valid = false
		}

		if !valid {
			reply.Invalid += 1
			continue
		}

		fv.Value = vv
		items = append(items, fv)
	}

	// statistics
	cnt := int64(len(items))
	proc.RecvCnt.IncrBy(cnt)
	if from == "rpc" {
		proc.RpcRecvCnt.IncrBy(cnt)
	} else if from == "http" {
		proc.HttpRecvCnt.IncrBy(cnt)
	}

	cfg := g.Config()
	if cfg.Graph.Enabled {
		sender.Push2GraphSendQueue(items, cfg.Graph.Migrating)
	}
	if cfg.Judge.Enabled {
		sender.Push2JudgeSendQueue(items)
	}
	if cfg.Tsdb.Enabled {
		sender.Push2TsdbSendQueue(items)
	}

	reply.Message = "ok"
	reply.Total = len(args)
	reply.Latency = (time.Now().UnixNano() - start.UnixNano()) / 1000000
	return nil
}
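// cutils.DictedTagstring (used above) turns the tag string into a map. A
// minimal sketch, assuming the common "k1=v1,k2=v2" encoding of tags.
func DictedTagstring(s string) map[string]string {
	tags := make(map[string]string)
	if s == "" {
		return tags
	}
	for _, pair := range strings.Split(s, ",") {
		kv := strings.SplitN(strings.TrimSpace(pair), "=", 2)
		if len(kv) == 2 {
			tags[kv[0]] = kv[1]
		}
	}
	return tags
}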