// StartSocket starts the telnet-style TCP listener and hands each accepted
// connection to socketTelnetHandle.
func StartSocket() {
	if !g.Config().Socket.Enabled {
		return
	}

	addr := g.Config().Socket.Listen
	tcpAddr, err := net.ResolveTCPAddr("tcp", addr)
	if err != nil {
		log.Fatalf("net.ResolveTCPAddr fail: %s", err)
	}

	listener, err := net.ListenTCP("tcp", tcpAddr)
	if err != nil {
		log.Fatalf("listen %s fail: %s", addr, err)
	} else {
		log.Println("socket listening", addr)
	}
	defer listener.Close()

	for {
		conn, err := listener.Accept()
		if err != nil {
			log.Println("listener.Accept error:", err)
			continue
		}
		go socketTelnetHandle(conn)
	}
}

// newInfluxdbConnPool builds a ConnPool whose New factory dials InfluxDB
// with the credentials and database taken from the global config.
func newInfluxdbConnPool(name string, address string, connTimeout time.Duration, maxConns int, maxIdle int) *ConnPool {
	pool := NewConnPool(name, address, maxConns, maxIdle)
	pool.Username = g.Config().Influxdb.Username
	pool.Password = g.Config().Influxdb.Password
	pool.Database = g.Config().Influxdb.Database
	pool.Precision = "s"

	pool.New = func(connName string) (NConn, error) {
		nconn := InfluxdbClient{
			name:      connName,
			Address:   pool.Address,
			Username:  pool.Username,
			Password:  pool.Password,
			Database:  pool.Database,
			Precision: pool.Precision,
			Timeout:   connTimeout,
		}
		err := nconn.Connect()
		if err != nil {
			return nil, err
		}
		return nconn, nil
	}

	return pool
}

// StartRpc starts the RPC listener and serves each connection with a
// JSON-RPC codec.
func StartRpc() {
	if !g.Config().Rpc.Enabled {
		return
	}

	addr := g.Config().Rpc.Listen
	tcpAddr, err := net.ResolveTCPAddr("tcp", addr)
	if err != nil {
		log.Fatalf("net.ResolveTCPAddr fail: %s", err)
	}

	listener, err := net.ListenTCP("tcp", tcpAddr)
	if err != nil {
		log.Fatalf("listen %s fail: %s", addr, err)
	} else {
		log.Println("rpc listening", addr)
	}

	server := rpc.NewServer()
	server.Register(new(Transfer))

	for {
		conn, err := listener.Accept()
		if err != nil {
			log.Println("listener.Accept error:", err)
			continue
		}
		// go rpc.ServeConn(conn)
		go server.ServeCodec(jsonrpc.NewServerCodec(conn))
	}
}

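// A hedged sketch of a standalone client for the JSON-RPC endpoint above.
// The method name "Transfer.Update", the listen address, and the json field
// names are assumptions inferred from this section, not guaranteed by it.
//
//	package main
//
//	import (
//		"fmt"
//		"log"
//		"net/rpc/jsonrpc"
//	)
//
//	// MetricValue mirrors the wire shape RecvMetricValues (below) expects;
//	// the json tags are assumptions.
//	type MetricValue struct {
//		Endpoint  string      `json:"endpoint"`
//		Metric    string      `json:"metric"`
//		Value     interface{} `json:"value"`
//		Step      int64       `json:"step"`
//		Type      string      `json:"counterType"`
//		Tags      string      `json:"tags"`
//		Timestamp int64       `json:"timestamp"`
//	}
//
//	func main() {
//		client, err := jsonrpc.Dial("tcp", "127.0.0.1:8433") // address is an assumption
//		if err != nil {
//			log.Fatal(err)
//		}
//		defer client.Close()
//
//		args := []*MetricValue{{
//			Endpoint: "host01", Metric: "cpu.idle", Value: 97.5,
//			Step: 60, Type: "GAUGE", Timestamp: 0, // <=0 lets the server stamp "now"
//		}}
//		var reply struct {
//			Message string `json:"msg"`
//			Total   int    `json:"total"`
//			Invalid int    `json:"invalid"`
//			Latency int64  `json:"latency"`
//		}
//		if err := client.Call("Transfer.Update", args, &reply); err != nil {
//			log.Fatal(err)
//		}
//		fmt.Printf("total=%d invalid=%d\n", reply.Total, reply.Invalid)
//	}
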
// Push2GraphSendQueue pushes data into the send queue of one Graph node;
// which node receives it is decided by consistent hashing. During data
// migration, each datum is additionally sent to the new cluster
// configuration (deduplicated where old and new overlap, so one datum is
// never pushed to the same Graph node twice).
func Push2GraphSendQueue(items []*cmodel.MetaData, migrating bool) {
	cfg := g.Config().Graph

	for _, item := range items {
		graphItem, err := convert2GraphItem(item)
		if err != nil {
			log.Println("E:", err)
			continue
		}
		pk := item.PK()

		// statistics: placed here for efficiency, so tracing only works
		// when graph is enabled
		proc.RecvDataTrace.Trace(pk, item)
		proc.RecvDataFilter.Filter(pk, item.Value, item)

		node, err := GraphNodeRing.GetNode(pk)
		if err != nil {
			log.Println("E:", err)
			continue
		}

		cnode := cfg.Cluster2[node]
		errCnt := 0
		for _, addr := range cnode.Addrs {
			Q := GraphQueues[node+addr]
			if !Q.PushFront(graphItem) {
				errCnt += 1
			}
		}

		// statistics
		if errCnt > 0 {
			proc.SendToGraphDropCnt.Incr()
		}

		if migrating {
			migratingNode, err := GraphMigratingNodeRing.GetNode(pk)
			if err != nil {
				log.Println("E:", err)
				continue
			}

			if node != migratingNode { // dedupe between old and new clusters
				cnodem := cfg.ClusterMigrating2[migratingNode]
				errCnt := 0
				for _, addr := range cnodem.Addrs {
					MQ := GraphMigratingQueues[migratingNode+addr]
					if !MQ.PushFront(graphItem) {
						errCnt += 1
					}
				}

				// statistics
				if errCnt > 0 {
					proc.SendToGraphMigratingDropCnt.Incr()
				}
			}
		}
	}
}

func configCommonRoutes() {
	http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("ok\n"))
	})

	http.HandleFunc("/version", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte(fmt.Sprintf("%s\n", g.VERSION)))
	})

	http.HandleFunc("/workdir", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte(fmt.Sprintf("%s\n", file.SelfDir())))
	})

	http.HandleFunc("/config", func(w http.ResponseWriter, r *http.Request) {
		RenderDataJson(w, g.Config())
	})

	http.HandleFunc("/config/reload", func(w http.ResponseWriter, r *http.Request) {
		if strings.HasPrefix(r.RemoteAddr, "127.0.0.1") {
			g.ParseConfig(g.ConfigFile)
			RenderDataJson(w, "ok")
		} else {
			RenderDataJson(w, "no privilege")
		}
	})
}

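// Usage sketch for the reload route above: only callers whose RemoteAddr
// starts with 127.0.0.1 get through, so reloads must come from the box
// itself. The port is whatever Http.Listen is configured to; 6060 below is
// an assumption.
//
//	curl -s http://127.0.0.1:6060/config/reload
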
func initSendQueues() {
	cfg := g.Config()

	for node := range cfg.Judge.Cluster {
		Q := nlist.NewSafeListLimited(DefaultSendQueueMaxSize)
		JudgeQueues[node] = Q
	}

	if cfg.Influxdb.Enabled {
		for tnode := range cfg.Influxdb.Cluster {
			Q := nlist.NewSafeListLimited(DefaultSendQueueMaxSize)
			InfluxdbQueues[tnode] = Q
		}
	}

	for node, nitem := range cfg.Graph.Cluster2 {
		for _, addr := range nitem.Addrs {
			Q := nlist.NewSafeListLimited(DefaultSendQueueMaxSize)
			GraphQueues[node+addr] = Q
		}
	}

	if cfg.Graph.Migrating && cfg.Graph.ClusterMigrating != nil {
		for node, cnode := range cfg.Graph.ClusterMigrating2 {
			for _, addr := range cnode.Addrs {
				Q := nlist.NewSafeListLimited(DefaultSendQueueMaxSize)
				GraphMigratingQueues[node+addr] = Q
			}
		}
	}
}

// forward2JudgeTask drains the Judge send queue for one node and ships the
// data to Judge through the RPC connection pool.
func forward2JudgeTask(Q *list.SafeListLimited, node string, concurrent int) {
	batch := g.Config().Judge.Batch // at most batch items per send
	addr := g.Config().Judge.Cluster[node]
	sema := nsema.NewSemaphore(concurrent)

	for {
		items := Q.PopBackBy(batch)
		count := len(items)
		if count == 0 {
			time.Sleep(DefaultSendTaskSleepInterval)
			continue
		}

		judgeItems := make([]*cmodel.JudgeItem, count)
		for i := 0; i < count; i++ {
			judgeItems[i] = items[i].(*cmodel.JudgeItem)
		}

		// synchronous Call with bounded concurrency
		sema.Acquire()
		go func(addr string, judgeItems []*cmodel.JudgeItem, count int) {
			defer sema.Release()

			resp := &cmodel.SimpleRpcResponse{}
			var err error
			sendOk := false
			for i := 0; i < 3; i++ { // retry at most 3 times
				err = JudgeConnPools.Call(addr, "Judge.Send", judgeItems, resp)
				if err == nil {
					sendOk = true
					break
				}
				time.Sleep(time.Millisecond * 10)
			}

			// statistics
			if !sendOk {
				log.Printf("send judge %s:%s fail: %v", node, addr, err)
				proc.SendToJudgeFailCnt.IncrBy(int64(count))
			} else {
				proc.SendToJudgeCnt.IncrBy(int64(count))
			}
		}(addr, judgeItems, count)
	}
}

// initNodeRings builds the consistent hash rings that decide which Judge or
// Graph node a primary key maps to.
func initNodeRings() {
	cfg := g.Config()

	JudgeNodeRing = newConsistentHashNodesRing(cfg.Judge.Replicas, KeysOfMap(cfg.Judge.Cluster))
	GraphNodeRing = newConsistentHashNodesRing(cfg.Graph.Replicas, KeysOfMap(cfg.Graph.Cluster))

	if cfg.Graph.Migrating && cfg.Graph.ClusterMigrating != nil {
		GraphMigratingNodeRing = newConsistentHashNodesRing(cfg.Graph.Replicas, KeysOfMap(cfg.Graph.ClusterMigrating))
	}
}

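// newConsistentHashNodesRing is used above but not shown in this section.
// Below is a minimal standalone sketch of a classic replica-based consistent
// hash ring; the type names, the "#i" virtual-node scheme, and the crc32
// hash are illustrative assumptions, not the project's actual implementation
// (it uses a ring library).
//
//	package main
//
//	import (
//		"fmt"
//		"hash/crc32"
//		"sort"
//		"strconv"
//	)
//
//	type hashRing struct {
//		hashes []uint32          // sorted virtual-node hashes
//		nodes  map[uint32]string // virtual-node hash -> real node name
//	}
//
//	func newHashRing(replicas int, nodes []string) *hashRing {
//		r := &hashRing{nodes: make(map[uint32]string)}
//		for _, node := range nodes {
//			// each real node appears `replicas` times on the ring,
//			// which smooths the key distribution
//			for i := 0; i < replicas; i++ {
//				h := crc32.ChecksumIEEE([]byte(node + "#" + strconv.Itoa(i)))
//				r.hashes = append(r.hashes, h)
//				r.nodes[h] = node
//			}
//		}
//		sort.Slice(r.hashes, func(i, j int) bool { return r.hashes[i] < r.hashes[j] })
//		return r
//	}
//
//	// GetNode returns the first virtual node clockwise from the key's hash.
//	func (r *hashRing) GetNode(key string) string {
//		h := crc32.ChecksumIEEE([]byte(key))
//		i := sort.Search(len(r.hashes), func(i int) bool { return r.hashes[i] >= h })
//		if i == len(r.hashes) {
//			i = 0 // wrap around the ring
//		}
//		return r.nodes[r.hashes[i]]
//	}
//
//	func main() {
//		ring := newHashRing(500, []string{"graph-00", "graph-01", "graph-02"})
//		// stable mapping as long as the node set doesn't change
//		fmt.Println(ring.GetNode("host01/cpu.idle"))
//	}
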
func startHttpServer() {
	if !g.Config().Http.Enabled {
		return
	}

	addr := g.Config().Http.Listen
	if addr == "" {
		return
	}

	configCommonRoutes()
	configProcHttpRoutes()
	configDebugHttpRoutes()
	configApiHttpRoutes()

	s := &http.Server{
		Addr:           addr,
		MaxHeaderBytes: 1 << 30,
	}

	log.Println("http.startHttpServer ok, listening", addr)
	log.Fatalln(s.ListenAndServe())
}

// TODO: add control over the send tasks, e.g. stop
func startSendTasks() {
	cfg := g.Config()

	// init semaphores
	influxdbConcurrent := cfg.Influxdb.MaxIdle
	judgeConcurrent := cfg.Judge.MaxIdle
	graphConcurrent := cfg.Graph.MaxIdle

	if influxdbConcurrent < 1 {
		influxdbConcurrent = 1
	}
	if judgeConcurrent < 1 {
		judgeConcurrent = 1
	}
	if graphConcurrent < 1 {
		graphConcurrent = 1
	}

	// init send goroutines
	if cfg.Influxdb.Enabled {
		for node := range cfg.Influxdb.Cluster {
			queue := InfluxdbQueues[node]
			go forward2InfluxdbTask(queue, node, influxdbConcurrent)
		}
	}

	for node := range cfg.Judge.Cluster {
		queue := JudgeQueues[node]
		go forward2JudgeTask(queue, node, judgeConcurrent)
	}

	for node, nitem := range cfg.Graph.Cluster2 {
		for _, addr := range nitem.Addrs {
			queue := GraphQueues[node+addr]
			go forward2GraphTask(queue, node, addr, graphConcurrent)
		}
	}

	if cfg.Graph.Migrating {
		for node, cnodem := range cfg.Graph.ClusterMigrating2 {
			for _, addr := range cnodem.Addrs {
				queue := GraphMigratingQueues[node+addr]
				go forward2GraphMigratingTask(queue, node, addr, graphConcurrent)
			}
		}
	}
}

// forward2GraphMigratingTask handles the redundant sends to the new Graph
// cluster while data migration is in progress.
func forward2GraphMigratingTask(Q *list.SafeListLimited, node string, addr string, concurrent int) {
	batch := g.Config().Graph.Batch // at most batch items per send
	sema := nsema.NewSemaphore(concurrent)

	for {
		items := Q.PopBackBy(batch)
		count := len(items)
		if count == 0 {
			time.Sleep(DefaultSendTaskSleepInterval)
			continue
		}

		graphItems := make([]*cmodel.GraphItem, count)
		for i := 0; i < count; i++ {
			graphItems[i] = items[i].(*cmodel.GraphItem)
		}

		sema.Acquire()
		go func(addr string, graphItems []*cmodel.GraphItem, count int) {
			defer sema.Release()

			resp := &cmodel.SimpleRpcResponse{}
			var err error
			sendOk := false
			for i := 0; i < 3; i++ { // retry at most 3 times
				err = GraphMigratingConnPools.Call(addr, "Graph.Send", graphItems, resp)
				if err == nil {
					sendOk = true
					break
				}
				time.Sleep(time.Millisecond * 10) // send failed, back off 10ms
			}

			// statistics
			if !sendOk {
				log.Printf("send to graph migrating %s:%s fail: %v", node, addr, err)
				proc.SendToGraphMigratingFailCnt.IncrBy(int64(count))
			} else {
				proc.SendToGraphMigratingCnt.IncrBy(int64(count))
			}
		}(addr, graphItems, count)
	}
}

// initConnPools creates the connection pools for Influxdb, Judge and Graph
// (plus the migrating Graph cluster when migration is on).
func initConnPools() {
	cfg := g.Config()

	// influxdb
	if cfg.Influxdb.Enabled {
		influxdbInstances := nset.NewStringSet()
		for _, instance := range cfg.Influxdb.Cluster {
			influxdbInstances.Add(instance)
		}
		InfluxdbConnPools = cpool.CreateInfluxdbCliPools(cfg.Influxdb.MaxConns, cfg.Influxdb.MaxIdle,
			cfg.Influxdb.ConnTimeout, cfg.Influxdb.CallTimeout, influxdbInstances.ToSlice())
	}

	// judge
	judgeInstances := nset.NewStringSet()
	for _, instance := range cfg.Judge.Cluster {
		judgeInstances.Add(instance)
	}
	JudgeConnPools = cpool.CreateSafeRpcConnPools(cfg.Judge.MaxConns, cfg.Judge.MaxIdle,
		cfg.Judge.ConnTimeout, cfg.Judge.CallTimeout, judgeInstances.ToSlice())

	// graph
	graphInstances := nset.NewSafeSet()
	for _, nitem := range cfg.Graph.Cluster2 {
		for _, addr := range nitem.Addrs {
			graphInstances.Add(addr)
		}
	}
	GraphConnPools = cpool.CreateSafeRpcConnPools(cfg.Graph.MaxConns, cfg.Graph.MaxIdle,
		cfg.Graph.ConnTimeout, cfg.Graph.CallTimeout, graphInstances.ToSlice())

	// graph migrating
	if cfg.Graph.Migrating && cfg.Graph.ClusterMigrating != nil {
		graphMigratingInstances := nset.NewSafeSet()
		for _, cnode := range cfg.Graph.ClusterMigrating2 {
			for _, addr := range cnode.Addrs {
				graphMigratingInstances.Add(addr)
			}
		}
		GraphMigratingConnPools = cpool.CreateSafeRpcConnPools(cfg.Graph.MaxConns, cfg.Graph.MaxIdle,
			cfg.Graph.ConnTimeout, cfg.Graph.CallTimeout, graphMigratingInstances.ToSlice())
	}
}

// forward2InfluxdbTask drains the Tsdb send queue for one node and ships the
// data to influxdb through the API connection pool. A single cluster entry
// may list several influxdb addresses; they are written to concurrently.
func forward2InfluxdbTask(Q *list.SafeListLimited, node string, concurrent int) {
	cfg := g.Config()
	batch := cfg.Influxdb.Batch // at most batch items per send
	sema := nsema.NewSemaphore(concurrent * len(cfg.Influxdb.Cluster2[node].Addrs))
	retry := cfg.Influxdb.MaxRetry

	for {
		items := Q.PopBackBy(batch)
		count := len(items)
		if count == 0 {
			time.Sleep(DefaultSendTaskSleepInterval)
			continue
		}

		pts := make([]*client.Point, count)
		for i := 0; i < count; i++ {
			pts[i] = items[i].(*client.Point)
		}

		for _, addr := range cfg.Influxdb.Cluster2[node].Addrs {
			sema.Acquire()
			go coreSend2Influxdb(addr, sema, retry, pts)
		}
	}
}

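// coreSend2Influxdb is referenced above but not defined in this section. A
// minimal sketch of what it plausibly looks like, mirroring the retry
// pattern of the other forwarders; the pool method and the proc counters
// named below are assumptions, not confirmed by this section.
//
//	func coreSend2Influxdb(addr string, sema *nsema.Semaphore, retry int, pts []*client.Point) {
//		defer sema.Release() // paired with the Acquire in forward2InfluxdbTask
//
//		var err error
//		sendOk := false
//		for i := 0; i < retry; i++ {
//			// hypothetical pool call: write the batch of points to one address
//			err = InfluxdbConnPools.Call(addr, pts)
//			if err == nil {
//				sendOk = true
//				break
//			}
//			time.Sleep(time.Millisecond * 10) // send failed, back off 10ms
//		}
//
//		// statistics (counter names are assumptions)
//		if !sendOk {
//			log.Printf("send to influxdb %s fail: %v", addr, err)
//			proc.SendToInfluxdbFailCnt.IncrBy(int64(len(pts)))
//		} else {
//			proc.SendToInfluxdbCnt.IncrBy(int64(len(pts)))
//		}
//	}
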
// Push2TsdbSendQueue pushes data into every Tsdb send queue; the queues back
// each other up.
func Push2TsdbSendQueue(items []*cmodel.MetaData) {
	removeMetrics := g.Config().Influxdb.RemoveMetrics

	for _, item := range items {
		b, ok := removeMetrics[item.Metric]
		if b && ok {
			// metric is blacklisted in the config; skip it
			continue
		}
		influxPoint := Convert2InfluxPoint(item)

		errCnt := 0
		for _, Q := range InfluxdbQueues {
			if !Q.PushFront(influxPoint) {
				errCnt += 1
			}
		}

		// statistics
		if errCnt > 0 {
			proc.SendToInfluxdbDropCnt.Incr()
		}
	}
}

// socketTelnetHandle reads newline-terminated "update ..." commands from a
// telnet-style connection until "quit" or a read error, then pushes the
// parsed items to the enabled backends.
func socketTelnetHandle(conn net.Conn) {
	defer conn.Close()

	items := []*cmodel.MetaData{}
	buf := bufio.NewReader(conn)
	cfg := g.Config()
	timeout := time.Duration(cfg.Socket.Timeout) * time.Second

	for {
		conn.SetReadDeadline(time.Now().Add(timeout))
		line, err := buf.ReadString('\n')
		if err != nil {
			break
		}

		line = strings.Trim(line, "\n")
		if line == "quit" {
			break
		}
		if line == "" {
			continue
		}

		t := strings.Fields(line)
		if len(t) < 2 {
			continue
		}

		cmd := t[0]
		if cmd != "update" {
			continue
		}

		item, err := convertLine2MetaData(t[1:])
		if err != nil {
			continue
		}
		items = append(items, item)
	}

	// statistics
	proc.SocketRecvCnt.IncrBy(int64(len(items)))
	proc.RecvCnt.IncrBy(int64(len(items)))

	if cfg.Graph.Enabled {
		sender.Push2GraphSendQueue(items, cfg.Graph.Migrating)
	}
	if cfg.Judge.Enabled {
		sender.Push2JudgeSendQueue(items)
	}
	if cfg.Influxdb.Enabled {
		sender.Push2TsdbSendQueue(items)
	}
}

// RecvMetricValues validates incoming metric values, normalizes them into
// MetaData, and pushes them to the enabled backends.
func RecvMetricValues(args []*cmodel.MetricValue, reply *cmodel.TransferResponse, from string) error {
	start := time.Now()
	reply.Invalid = 0

	items := []*cmodel.MetaData{}
	for _, v := range args {
		if v == nil {
			reply.Invalid += 1
			continue
		}

		// Legacy issue: old agents reported metric=kernel.hostname with a
		// string value, which is no longer supported, so it is hard-coded
		// out here.
		if v.Metric == "kernel.hostname" {
			reply.Invalid += 1
			continue
		}

		if v.Metric == "" || v.Endpoint == "" {
			reply.Invalid += 1
			continue
		}

		if v.Type != g.COUNTER && v.Type != g.GAUGE && v.Type != g.DERIVE {
			reply.Invalid += 1
			continue
		}

		if v.Value == "" {
			reply.Invalid += 1
			continue
		}

		if v.Step <= 0 {
			reply.Invalid += 1
			continue
		}

		if len(v.Metric)+len(v.Tags) > 510 {
			reply.Invalid += 1
			continue
		}

		// TODO: this could be handled more elegantly
		now := start.Unix()
		if v.Timestamp <= 0 || v.Timestamp > now*2 {
			v.Timestamp = now
		}

		fv := &cmodel.MetaData{
			Metric:      v.Metric,
			Endpoint:    v.Endpoint,
			Timestamp:   v.Timestamp,
			Step:        v.Step,
			CounterType: v.Type,
			Tags:        cutils.DictedTagstring(v.Tags), // TODO: cap the number of tag key-value pairs
		}

		valid := true
		var vv float64
		var err error

		switch cv := v.Value.(type) {
		case string:
			vv, err = strconv.ParseFloat(cv, 64)
			if err != nil {
				valid = false
			}
		case float64:
			vv = cv
		case int64:
			vv = float64(cv)
		default:
			valid = false
		}

		if !valid {
			reply.Invalid += 1
			continue
		}

		fv.Value = vv
		items = append(items, fv)
	}

	// statistics
	cnt := int64(len(items))
	proc.RecvCnt.IncrBy(cnt)
	if from == "rpc" {
		proc.RpcRecvCnt.IncrBy(cnt)
	} else if from == "http" {
		proc.HttpRecvCnt.IncrBy(cnt)
	}

	cfg := g.Config()
	if cfg.Graph.Enabled {
		sender.Push2GraphSendQueue(items, cfg.Graph.Migrating)
	}
	if cfg.Influxdb.Enabled {
		sender.Push2TsdbSendQueue(items)
	}
	if cfg.Judge.Enabled {
		sender.Push2JudgeSendQueue(items)
	}

	reply.Message = "ok"
	reply.Total = len(args)
	reply.Latency = (time.Now().UnixNano() - start.UnixNano()) / 1000000
	return nil
}