func proxy(method string, args interface{}, reply interface{}) error { // 随机遍历hbs列表,直到数据发送成功 或者 遍历完 err := fmt.Errorf("proxy connections not available") sendOk := false rint := rand.Int() for i := 0; i < HbsNum && !sendOk; i++ { idx := (i + rint) % HbsNum host := HbsHostnames[idx] addr := HbsMap[host] // 过滤掉建连缓慢的host, 否则会严重影响发送速率 key := addr + "." + method cc := pfc.GetCounterCount(key) if cc >= HbsMaxConns { continue } pfc.Counter(key, 1) err = ConnPools.Call(addr, method, args, reply) pfc.Counter(key, -1) if err == nil { pfc.Meter(key+".ok", 1) sendOk = true } else { pfc.Meter(key+".error", 1) } } return err }
func socketTelnetHandle(conn net.Conn) { defer conn.Close() items := []*cmodel.MetaData{} buf := bufio.NewReader(conn) cfg := g.Config() timeout := time.Duration(cfg.Socket.Timeout) * time.Second for { conn.SetReadDeadline(time.Now().Add(timeout)) line, err := buf.ReadString('\n') if err != nil { break } line = strings.Trim(line, "\n") if line == "quit" { break } if line == "" { continue } t := strings.Fields(line) if len(t) < 2 { continue } cmd := t[0] if cmd != "update" { continue } item, err := convertLine2MetaData(t[1:]) if err != nil { continue } items = append(items, item) } // statistics count := int64(len(items)) pfc.Meter("SocketRecv", count) pfc.Meter("Recv", count) if cfg.Transfer.Enabled { sender.Push2SendQueue(items) } return }
func net_task_worker(idx int, ch chan *Net_task_t, client **rpc.Client, addr string) { var err error for { select { case task := <-ch: if task.Method == NET_TASK_M_SEND { if err = send_data(client, task.Key, addr); err != nil { pfc.Meter("migrate.send.err", 1) atomic.AddUint64(&stat_cnt[SEND_S_ERR], 1) } else { pfc.Meter("migrate.send.ok", 1) atomic.AddUint64(&stat_cnt[SEND_S_SUCCESS], 1) } } else if task.Method == NET_TASK_M_QUERY { if err = query_data(client, addr, task.Args, task.Reply); err != nil { pfc.Meter("migrate.query.err", 1) atomic.AddUint64(&stat_cnt[QUERY_S_ERR], 1) } else { pfc.Meter("migrate.query.ok", 1) atomic.AddUint64(&stat_cnt[QUERY_S_SUCCESS], 1) } } else if task.Method == NET_TASK_M_PULL { if atomic.LoadInt32(&flushrrd_timeout) != 0 { // hope this more faster than fetch_rrd if err = send_data(client, task.Key, addr); err != nil { pfc.Meter("migrate.sendbusy.err", 1) atomic.AddUint64(&stat_cnt[SEND_S_ERR], 1) } else { pfc.Meter("migrate.sendbusy.ok", 1) atomic.AddUint64(&stat_cnt[SEND_S_SUCCESS], 1) } } else { if err = fetch_rrd(client, task.Key, addr); err != nil { if os.IsNotExist(err) { pfc.Meter("migrate.scprrd.null", 1) //文件不存在时,直接将缓存数据刷入本地 atomic.AddUint64(&stat_cnt[FETCH_S_ISNOTEXIST], 1) store.GraphItems.SetFlag(task.Key, 0) CommitByKey(task.Key) } else { pfc.Meter("migrate.scprrd.err", 1) //warning:其他异常情况,缓存数据会堆积 atomic.AddUint64(&stat_cnt[FETCH_S_ERR], 1) } } else { pfc.Meter("migrate.scprrd.ok", 1) atomic.AddUint64(&stat_cnt[FETCH_S_SUCCESS], 1) } } } else { err = errors.New("error net task method") } if task.Done != nil { task.Done <- err } } } }
func basic() { for _ = range time.Tick(time.Second * time.Duration(10)) { // (常用) Meter,用于累加求和、计算变化率。使用场景如,统计首页访问次数、gvm的CG次数等。 pv := int64(rand.Int() % 100) pfc.Meter("test.meter", pv) pfc.Meter("test.meter.2", pv-50) // (常用) Gauge,用于保存数值类型的瞬时记录值。使用场景如,统计队列长度、统计CPU使用率等 queueSize := int64(rand.Int()%100 - 50) pfc.Gauge("test.gauge", queueSize) cpuUtil := float64(rand.Int()%10000) / float64(100) pfc.GaugeFloat64("test.gauge.float64", cpuUtil) } }
// status calc func monitor() { startTs := time.Now().Unix() _monitor() endTs := time.Now().Unix() log.Printf("monitor, startTs %s, time-consuming %d sec\n", ntime.FormatTs(startTs), endTs-startTs) // statistics pfc.Meter("MonitorCronCnt", 1) pfc.Gauge("MonitorCronTs", endTs-startTs) }
func Push2SendQueue(items []*cmodel.MetaData) { for _, item := range items { // statistics pk := item.PK() g.RecvDataTrace.Trace(pk, item) g.RecvDataFilter.Filter(pk, item.Value, item) isOk := SenderQueue.PushFront(item) // statistics if !isOk { pfc.Meter("SendDrop", 1) } } }
// TODO addr to node func reconnection(client **rpc.Client, addr string) { pfc.Meter("migrate.reconnection."+addr, 1) var err error atomic.AddUint64(&stat_cnt[CONN_S_ERR], 1) if *client != nil { (*client).Close() } *client, err = dial(addr, time.Second) atomic.AddUint64(&stat_cnt[CONN_S_DIAL], 1) for err != nil { //danger!! block routine time.Sleep(time.Millisecond * 500) *client, err = dial(addr, time.Second) atomic.AddUint64(&stat_cnt[CONN_S_DIAL], 1) } }
// alarm judge func alarmJudge() { interval := time.Duration(10) * time.Second for { time.Sleep(interval) var content bytes.Buffer keys := alarmCache.Keys() if len(keys) == 0 { continue } for _, key := range keys { aitem, found := alarmCache.GetAndRemove(key) if !found { continue } content.WriteString(aitem.(*Alarm).String() + "\n") } if content.Len() < 6 { return } cfg := g.Config() // mail if cfg.Mail.Enable { hn, _ := os.Hostname() mailContent := formAlarmMailContent(cfg.Mail.Receivers, "AntEye.Alarm.From.["+hn+"]", content.String(), "AntEye") err := sendMail(cfg.Mail.Url, mailContent) if err != nil { log.Println("alarm send mail error, mail:", mailContent, "", err) } else { // statistics pfc.Meter("MonitorAlarmMail", 1) } } // sms if cfg.Sms.Enable { smsContent := formAlarmSmsContent(cfg.Sms.Receivers, content.String(), "AntEye") err := sendSms(cfg.Sms.Url, smsContent) if err != nil { log.Println("alarm send sms error, sms:", smsContent, "", err) } else { // statistics pfc.Meter("MonitorAlarmSms", 1) } } // callback if cfg.Callback.Enable { cbc := content.String() err := alarmCallback(cfg.Callback.Url, cbc) if err != nil { log.Println("alarm callback error, callback:", cfg.Callback, ", content:", cbc, "", err) } else { // statistics pfc.Meter("MonitorAlarmCallback", 1) } } } }
// process new metric values func RecvMetricValues(args []*cmodel.MetricValue, reply *g.TransferResp, from string) error { start := time.Now() reply.ErrInvalid = 0 items := []*cmodel.MetaData{} for _, v := range args { if v == nil { reply.ErrInvalid += 1 continue } // 历史遗留问题. // 老版本agent上报的metric=kernel.hostname的数据,其取值为string类型,现在已经不支持了;所以,这里硬编码过滤掉 if v.Metric == "kernel.hostname" { reply.ErrInvalid += 1 continue } if v.Metric == "" || v.Endpoint == "" { reply.ErrInvalid += 1 continue } if v.Type != g.COUNTER && v.Type != g.GAUGE && v.Type != g.DERIVE { reply.ErrInvalid += 1 continue } if v.Value == "" { reply.ErrInvalid += 1 continue } if v.Step <= 0 { reply.ErrInvalid += 1 continue } if len(v.Metric)+len(v.Tags) > 510 { reply.ErrInvalid += 1 continue } errtags, tags := cutils.SplitTagsString(v.Tags) if errtags != nil { reply.ErrInvalid += 1 continue } // TODO 呵呵,这里需要再优雅一点 now := start.Unix() if v.Timestamp <= 0 || v.Timestamp > now*2 { v.Timestamp = now } fv := &cmodel.MetaData{ Metric: v.Metric, Endpoint: v.Endpoint, Timestamp: v.Timestamp, Step: v.Step, CounterType: v.Type, Tags: tags, //TODO tags键值对的个数,要做一下限制 } valid := true var vv float64 var err error switch cv := v.Value.(type) { case string: vv, err = strconv.ParseFloat(cv, 64) if err != nil { valid = false } case float64: vv = cv case int64: vv = float64(cv) default: valid = false } if !valid { reply.ErrInvalid += 1 continue } fv.Value = vv items = append(items, fv) } // statistics cnt := int64(len(items)) pfc.Meter("Recv", cnt) if from == "rpc" { pfc.Meter("RpcRecv", cnt) } else if from == "http" { pfc.Meter("HttpRecv", cnt) } cfg := g.Config() if cfg.Transfer.Enabled { sender.Push2SendQueue(items) } reply.Msg = "ok" reply.Total = len(args) reply.Latency = (time.Now().UnixNano() - start.UnixNano()) / 1000000 return nil }
func forward2TransferTask(Q *nlist.SafeListLimited, concurrent int32) { cfg := g.Config() batch := int(cfg.Transfer.Batch) maxConns := int64(cfg.Transfer.MaxConns) retry := int(cfg.Transfer.Retry) if retry < 1 { retry = 1 } sema := nsema.NewSemaphore(int(concurrent)) transNum := len(TransferHostnames) for { items := Q.PopBackBy(batch) count := len(items) if count == 0 { time.Sleep(time.Millisecond * 50) continue } transItems := make([]*cmodel.MetricValue, count) for i := 0; i < count; i++ { transItems[i] = convert(items[i].(*cmodel.MetaData)) } sema.Acquire() go func(transItems []*cmodel.MetricValue, count int) { defer sema.Release() var err error // 随机遍历transfer列表,直到数据发送成功 或者 遍历完;随机遍历,可以缓解慢transfer resp := &g.TransferResp{} sendOk := false for j := 0; j < retry && !sendOk; j++ { rint := rand.Int() for i := 0; i < transNum && !sendOk; i++ { idx := (i + rint) % transNum host := TransferHostnames[idx] addr := TransferMap[host] // 过滤掉建连缓慢的host, 否则会严重影响发送速率 cc := pfc.GetCounterCount(host) if cc >= maxConns { continue } pfc.Counter(host, 1) err = SenderConnPools.Call(addr, "Transfer.Update", transItems, resp) pfc.Counter(host, -1) if err == nil { sendOk = true // statistics TransferSendCnt[host].IncrBy(int64(count)) } else { // statistics TransferSendFailCnt[host].IncrBy(int64(count)) } } } // statistics if !sendOk { if cfg.Debug { log.Printf("send to transfer fail, connpool:%v", SenderConnPools.Proc()) } pfc.Meter("SendFail", int64(count)) } else { pfc.Meter("Send", int64(count)) } }(transItems, count) } }