// Clean waits for all running tasks to finish before the agent quits.
// After quit_time has elapsed, remaining subprocesses are killed with SIGUSR1.
func (agent *Agent) Clean() {
    start_quit := time.Now().Unix()
    for l := len(agent.Process); l > 0; {
        log.Warning("processes still running, we should quit after all TASKS FINISHED, please wait")
        log.Warning("running tasks are:")
        for task := range agent.Process {
            log.Warningf("%s ", task)
        }
        time.Sleep(5 * time.Second)
        l = len(agent.Process)
        if now := time.Now().Unix(); now-start_quit > agent.Conf.QuitTime {
            log.Warning("quit_time timeout, we will kill subprocesses by SIGUSR1")
            for task_id, p := range agent.Process {
                if err := p.Signal(syscall.SIGUSR1); err != nil {
                    log.Warningf("SIGUSR1 task:%s failed...", task_id)
                    continue
                }
                log.Warningf("SIGUSR1 task:%s OK...wait subprocess quit", task_id)
            }
            goto quit
        }
    }
quit:
    time.Sleep(2 * time.Second)
    close(agent.StatusLoopQuitChan)
    log.Warning("all processes DONE, we quit successfully.")
}
// LoadCronMetaData is only called from the GenJobs function when the store is
// unavailable. It must be called exactly once, so we needn't take the Lock.
func (agent *Agent) LoadCronMetaData() {
    cronSlice := make([]string, 0)
    meta_file := fmt.Sprintf("%s/dcms_agent.metadata", agent.Conf.WorkDir)
    f, err := os.Open(meta_file)
    if err != nil {
        log.Warningf("reading metadata file: %s failed %s", meta_file, err)
        return
    }
    defer f.Close()

    if data, err := ioutil.ReadAll(f); err != nil {
        log.Warningf("ioutil metadata file read all failed %s", err)
    } else {
        if err = json.Unmarshal(data, &cronSlice); err != nil {
            log.Warningf("json unmarshal meta data failed: %s", string(data))
            return
        }
        for _, v := range cronSlice {
            log.Debug("receive cron from metadata file:", v)
            var cj *CronJob
            if err = json.Unmarshal([]byte(v), &cj); err != nil {
                log.Warningf("json unmarshal failed for: %s", v)
                continue
            }
            cj.Dcms = agent
            agent.Jobs[cj.Id] = cj
        }
        for id, job := range agent.Jobs {
            log.Debug("now Agent has jobs:", id, job)
        }
    }
}
func (c *ClusterClient) reloadSlots() {
    defer atomic.StoreUint32(&c.reloading, 0)

    var (
        client *Client
        err    error
    )
    for i := 0; i < 3; i++ {
        client, err = c.randomClient()
        if err != nil {
            log.Warningf("redis: randomClient failed for %d times: %s", i+1, err)
            if i == 2 {
                return
            }
            continue
        }
        break
    }

    slots, err := client.ClusterSlots().Result()
    if err != nil {
        log.Warningf("redis: ClusterSlots failed: %s", err)
        return
    }
    c.setSlots(slots)
}
// DeleteTaskById kills the subprocess of the given task.
func (s *Server) DeleteTaskById(p martini.Params) (int, string) {
    log.Debug("Server dcms http_api DeleteTaskById")
    taskid, ok := p["taskid"]
    if !ok {
        return responseError(500, "DeleteTaskById without taskid")
    }

    // we kill the subprocess asynchronously, in its own goroutine
    go func(taskid string) {
        defer func() {
            if err := recover(); err != nil {
                log.Warningf("Delete Task By Id:%s panic: %s", taskid, err)
            }
        }()
        s.DCMS.Lock.Lock()
        defer s.DCMS.Lock.Unlock()
        for _, task := range s.DCMS.Running {
            if task.TaskId != taskid {
                continue
            }
            s.DCMS.KillTask(task)
            return
        }
        log.Warningf("Delete Task By Id:%s not exists or may be done", taskid)
    }(taskid)

    return responseSuccess(fmt.Sprintf("Task:%s will be killed async, or may already be done", taskid))
}
func (pc *ProxyConfig) apply() {
    log.SetLevelByString(pc.logLevel)
    if pc.logFile != "" {
        err := log.SetOutputByName(pc.logFile)
        if err != nil {
            log.Fatalf("ProxyConfig SetOutputByName %s failed %s", pc.logFile, err.Error())
        }
        log.SetRotateByDay()
    }

    if pc.name == "" {
        log.Fatal("ProxyConfig name must not be empty")
    }
    if pc.port == 0 {
        log.Fatal("ProxyConfig port must not be 0")
    }
    if pc.cpu > runtime.NumCPU() {
        log.Warningf("ProxyConfig cpu %d exceeds %d, adjust to %d", pc.cpu, runtime.NumCPU(), runtime.NumCPU())
        pc.cpu = runtime.NumCPU()
    }
    if pc.maxConn > 10000 {
        log.Warningf("ProxyConfig maxconn %d exceeds 10000, adjust to 10000", pc.maxConn)
        pc.maxConn = 10000
    }
    runtime.GOMAXPROCS(pc.cpu)

    if pc.poolSize <= 0 || pc.poolSize > 30 {
        log.Warningf("ProxyConfig poolSize %d, adjust to 10", pc.poolSize)
        pc.poolSize = 10
    }

    if pc.cpuFile != "" {
        f, err := os.Create(pc.cpuFile)
        if err != nil {
            log.Fatal(err)
        }
        log.Warning("Archer start CPUProfile ", pc.cpuFile)
        pprof.StartCPUProfile(f)
        defer pprof.StopCPUProfile()
    }

    if pc.memFile != "" {
        f, err := os.Create(pc.memFile)
        if err == nil {
            log.Warning("Archer start HeapProfile ", pc.memFile)
            pprof.WriteHeapProfile(f)
        }
    }

    go func() {
        log.Warning(http.ListenAndServe(":6061", nil))
    }()
}
// KillTask is not thread-safe; the caller must hold the Lock before killing.
func (agent *Agent) KillTask(t *Task) {
    p, ok := agent.Process[t.TaskId]
    if !ok {
        log.Warningf("In KillTask %s, can't find process in agent.Process", t.TaskId)
        return
    }

    pid := p.Pid
    log.Warning("KillTask: we try to kill pid ", pid)
    if err := p.Kill(); err != nil {
        // double check: fall back to killing by pid
        log.Warningf("kill err %s, we try again", err)
        util.KillTaskForceByPid(pid)
    }
}
// SaveCronMetaData saves the cron metadata periodically, currently every 5 minutes.
func (agent *Agent) SaveCronMetaData() {
    meta_file := fmt.Sprintf("%s/dcms_agent.metadata", agent.Conf.WorkDir)
    cronSlice := make([]string, 0)
    agent.Lock.Lock()
    defer agent.Lock.Unlock()

    for k, v := range agent.Jobs {
        if data, err := json.Marshal(v); err == nil {
            cronSlice = append(cronSlice, string(data))
        } else {
            log.Warningf("marshal job: %d failed: %s", k, err)
            return
        }
    }

    if data, err := json.Marshal(cronSlice); err != nil {
        log.Warning("json marshal cronSlice failed, ", err)
    } else {
        if len(cronSlice) == 0 {
            log.Warning("cronSlice is empty, just skip writing MetaData")
            return
        }
        log.Debug("len of marshaled data:", len(data), data)
        log.Debugf("cronSlice length:%d content:%s", len(cronSlice), cronSlice)
        if e := ioutil.WriteFile(meta_file, data, os.ModePerm); e != nil {
            log.Warning("ioutil write meta_file failed,", e)
        }
    }
}
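// A minimal sketch of the metadata round-trip assumed by SaveCronMetaData and
// LoadCronMetaData: the file holds a JSON array of strings, and each string is
// itself one marshaled CronJob. This helper and its inputs are illustrative
// assumptions, not part of the original code.
func cronMetaDataRoundTripSketch(jobs map[int64]*CronJob) {
    // marshal: one JSON string per job, then the whole slice as a JSON array
    cronSlice := make([]string, 0, len(jobs))
    for _, job := range jobs {
        if b, err := json.Marshal(job); err == nil {
            cronSlice = append(cronSlice, string(b))
        }
    }
    data, _ := json.Marshal(cronSlice)
    // data now looks like ["{\"id\":1,...}","{\"id\":2,...}"] and is what
    // would be written to dcms_agent.metadata

    // unmarshal: first the outer array, then each element into a CronJob,
    // mirroring what LoadCronMetaData does
    var decoded []string
    _ = json.Unmarshal(data, &decoded)
    for _, v := range decoded {
        var cj *CronJob
        if err := json.Unmarshal([]byte(v), &cj); err == nil {
            log.Debug("decoded cron job:", cj.Id)
        }
    }
}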
func (s *Server) handleConn(c net.Conn) {
    log.Info("new connection", c.RemoteAddr())

    s.counter.Add("connections", 1)
    client := &session{
        Conn:     c,
        r:        bufio.NewReader(c),
        CreateAt: time.Now(),
    }

    var err error
    defer func() {
        if err != nil {
            // todo: fix this ugly error check
            if GetOriginError(err.(*errors.Err)).Error() != io.EOF.Error() {
                log.Warningf("close connection %v, %+v, %v", c.RemoteAddr(), client, errors.ErrorStack(err))
            } else {
                log.Infof("close connection %v, %+v", c.RemoteAddr(), client)
            }
        } else {
            log.Infof("close connection %v, %+v", c.RemoteAddr(), client)
        }
        c.Close()
        s.counter.Add("connections", -1)
    }()

    for {
        err = s.redisTunnel(client)
        if err != nil {
            return
        }
        client.Ops++
    }
}
func (s *Server) resetSchemaInfo() error {
    for _, c := range s.clients {
        if len(c.txConns) > 0 {
            return errors.Errorf("transaction exist")
        }
    }

    cfg, err := config.ParseConfigFile(s.configFile)
    if err != nil {
        log.Error(err)
        return err
    }

    s.cleanup()
    s.autoSchamas = make(map[string]*tabletserver.SchemaInfo)
    for _, n := range s.shards {
        n.Close()
    }
    s.shards = nil
    s.schemas = nil

    log.Warningf("%#v", cfg)
    log.SetLevelByString(cfg.LogLevel)

    s.cfg = cfg
    return s.loadSchemaInfo()
}
func (t *Task) genLogFile() {
    defer func() {
        if e := recover(); e != nil {
            log.Warning("genLogFile fatal:", e)
        }
    }()

    d := time.Now().Format("20060102")
    filename := fmt.Sprintf("%s/DCMS-%s/%d-%s-%s.log", t.Job.Dcms.Conf.WorkDir, d, t.Job.Id, t.Job.Name, t.TaskId)
    log.Info("generate logfile:", filename)

    logdir := fmt.Sprintf("%s/DCMS-%s", t.Job.Dcms.Conf.WorkDir, d)
    if err := os.MkdirAll(logdir, os.ModePerm); err != nil {
        log.Warningf("in run exec goroutine, mkdir logdir %s failed: %s", logdir, err)
    }

    if f, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE|os.O_TRUNC, os.ModePerm); err != nil {
        log.Warning("in genLogFile os.OpenFile create failed: ", err)
        t.logfile = nil
        t.LogFilename = ""
    } else {
        t.logfile = f
        t.LogFilename = filename
    }
}
func (self *ResMan) addReadyTask(id string) (string, error) {
    if self.ready.Exist(id) {
        return "", fmt.Errorf("%s already exist: %+v", id, self.ready.Get(id))
    }

    job, err := scheduler.GetJobById(id)
    if err != nil {
        return "", err
    }

    persistentTask := &scheduler.Task{
        TaskId:  self.genTaskId(),
        Status:  scheduler.STATUS_READY,
        StartTs: time.Now().Unix(),
        JobName: job.Name,
    }
    log.Warningf("%+v", persistentTask)
    err = persistentTask.Save()
    if err != nil {
        log.Error(err)
        return "", err
    }

    job.LastTaskId = persistentTask.TaskId
    job.Save()

    t := &Task{Tid: persistentTask.TaskId, job: job, state: taskReady}
    self.ready.Add(t.Tid, t)
    log.Debugf("ready task %+v, total count:%d", t, self.ready.Length())

    return persistentTask.TaskId, nil
}
func getInsertPKValues(pkColumnNumbers []int, rowList sqlparser.Values, tableInfo *schema.Table) (pkValues []interface{}, err error) {
    pkValues = make([]interface{}, len(pkColumnNumbers))
    for index, columnNumber := range pkColumnNumbers {
        if columnNumber == -1 {
            pkValues[index] = tableInfo.GetPKColumn(index).Default
            continue
        }
        values := make([]interface{}, len(rowList))
        for j := 0; j < len(rowList); j++ {
            if _, ok := rowList[j].(*sqlparser.Subquery); ok {
                return nil, errors.New("row subquery not supported for inserts")
            }
            row := rowList[j].(sqlparser.ValTuple)
            if columnNumber >= len(row) {
                return nil, errors.New("column count doesn't match value count")
            }
            node := row[columnNumber]
            if !sqlparser.IsValue(node) {
                log.Warningf("insert is too complex %v", node)
                return nil, nil
            }
            var err error
            values[j], err = sqlparser.AsInterface(node)
            if err != nil {
                return nil, err
            }
        }
        if len(values) == 1 {
            pkValues[index] = values[0]
        } else {
            pkValues[index] = values
        }
    }
    return pkValues, nil
}
// HitFilter reports whether the log file matches the filter. The filter must
// be split by '|', for example "fatal|error|fail|failed".
func HitFilter(filename string, filter string) bool {
    log.Debug("HitFilter run:", filename, filter)
    filterExp, err := regexp.Compile(fmt.Sprintf(`(?i:(%s))`, filter))
    if err != nil {
        log.Warningf("HitFilter regexp.Compile for %s failed:%s", filter, err)
        return false
    }

    f, err := os.Open(filename)
    if err != nil {
        log.Warning("HitFilter open file failed ", filename, err)
        return false
    }
    defer f.Close()

    freader := bufio.NewReader(f)
    for {
        str, err := freader.ReadString('\n')
        if s := filterExp.FindStringSubmatch(str); len(s) > 0 {
            log.Debug("HitFilter hit msg_filter ", s, str)
            return true
        }
        if err != nil {
            if err != io.EOF {
                log.Warning("HitFilter read file failed ", filename, err)
            }
            break
        }
    }
    return false
}
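// A small usage sketch (hypothetical caller, not part of the original code)
// showing how a job's msg_filter string is passed to HitFilter; the file path
// and filter value below are illustrative assumptions.
func hitFilterUsageSketch() {
    logFile := "/data/dcms/DCMS-20060102/1-backup-task1.log" // hypothetical task log
    msgFilter := "fatal|error|fail|failed"                   // keywords split by '|'
    if HitFilter(logFile, msgFilter) {
        // the log contains one of the keywords (matched case-insensitively),
        // which is how HandleStatusSuccess decides to flip a task to Failed
        log.Warning("task output hit msg_filter, will be treated as failed")
    }
}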
// CheckAliveAndPromote pings each codis-server and finds crashed codis-servers.
func CheckAliveAndPromote(groups []models.ServerGroup) ([]models.Server, error) {
    errCh := make(chan interface{}, 100)
    var serverCnt int
    for _, group := range groups { // each group
        for _, s := range group.Servers { // each server
            serverCnt++
            rc := acf(s.Addr, 5*time.Second)
            news := s
            go PingServer(rc, news, errCh)
        }
    }

    // collect results
    var crashedServer []models.Server
    for i := 0; i < serverCnt; i++ {
        s := <-errCh
        if s == nil { // alive
            continue
        }

        log.Warningf("server maybe crashed %+v", s)
        crashedServer = append(crashedServer, *s.(*models.Server))

        err := handleCrashedServer(s.(*models.Server))
        if err != nil {
            return crashedServer, err
        }
    }

    return crashedServer, nil
}
// Interrupt releases a lock that's held.
func (zm *zMutex) Interrupt() {
    select {
    case zm.interrupted <- struct{}{}:
    default:
        log.Warningf("zmutex interrupt blocked")
    }
}
// HandleStatusSuccess checks the task's output file; if its content matches
// the job's msg_filter, the status is changed to Failed.
func (agent *Agent) HandleStatusSuccess(s *TaskStatus) {
    agent.Lock.Lock()
    defer agent.Lock.Unlock()

    if !util.HitFilter(s.TaskPtr.LogFilename, s.TaskPtr.Job.MsgFilter) {
        s.TaskPtr.Job.LastSuccessAt = s.CreateAt
        s.TaskPtr.Job.LastTaskId = s.TaskPtr.TaskId
        if agent.Running[s.TaskPtr.JobId].Status == StatusTimeout {
            s.TaskPtr.Job.LastStatus = JobTimeout
        } else {
            s.TaskPtr.Job.LastStatus = JobSuccess
        }
        delete(agent.Process, s.TaskPtr.TaskId)
        delete(agent.Running, s.TaskPtr.JobId)
        s.TaskPtr.Job.SuccessCnt += 1
        log.Warning("Task success: ", s.TaskPtr.TaskId, s.TaskPtr.Job.Name, s.TaskPtr.ExecDuration)
    } else {
        s.TaskPtr.Job.LastErrAt = s.CreateAt
        s.TaskPtr.Job.LastTaskId = s.TaskPtr.TaskId
        s.TaskPtr.Job.LastStatus = JobFail
        s.Status = StatusFailed
        delete(agent.Process, s.TaskPtr.TaskId)
        delete(agent.Running, s.TaskPtr.JobId)
        s.TaskPtr.Job.ErrCnt += 1
        log.Warningf("Task failed: hit msg_filter error %s %s %v", s.TaskPtr.TaskId, s.TaskPtr.Job.Name, s.TaskPtr.ExecDuration)
        s.Err = fmt.Errorf("Task: %s Job: %s failed. hit msg_filter error", s.TaskPtr.TaskId, s.TaskPtr.Job.Name)
    }

    s.Message = util.GetFileContent(s.TaskPtr.LogFilename, 65535, 1)
    if ok := agent.store.UpdateTaskStatus(s); !ok {
        log.Warning("Task status Store Or Update failed ", s)
    }
    agent.PostTaskStatus(s)
}
// heartbeat monitors the state of each shard in the ring.
func (ring *Ring) heartbeat() {
    ticker := time.NewTicker(100 * time.Millisecond)
    defer ticker.Stop()
    for range ticker.C {
        var rebalance bool

        ring.mx.RLock()

        if ring.closed {
            ring.mx.RUnlock()
            break
        }

        for _, shard := range ring.shards {
            err := shard.Client.Ping().Err()
            if shard.Vote(err == nil) {
                log.Warningf("redis: ring shard state changed: %s", shard)
                rebalance = true
            }
        }

        ring.mx.RUnlock()

        if rebalance {
            ring.rebalance()
        }
    }
}
func (si *SchemaInfo) override() {
    for _, override := range si.overrides {
        table, ok := si.tables[override.Name]
        if !ok {
            log.Warningf("Table not found for override: %v, %v", override, si.tables)
            continue
        }
        if override.PKColumns != nil {
            log.Infof("SetPK Table name %s, pk %v", override.Name, override.PKColumns)
            if err := table.SetPK(override.PKColumns); err != nil {
                log.Errorf("%s: %v", errors.ErrorStack(err), override)
                continue
            }
        }
        if si.cachePool.IsClosed() || override.Cache == nil {
            log.Infof("%+v", override)
            continue
        }
        switch override.Cache.Type {
        case "RW":
            table.CacheType = schema.CACHE_RW
            table.Cache = NewRowCache(table, si.cachePool)
        case "W":
            table.CacheType = schema.CACHE_W
            if len(override.Cache.Table) == 0 {
                log.Warningf("Incomplete cache specs: %v", override)
                continue
            }
            totable, ok := si.tables[override.Cache.Table]
            if !ok {
                log.Warningf("Table not found: %v", override)
                continue
            }
            if totable.Cache == nil {
                log.Warningf("Table has no cache: %v", override)
                continue
            }
            table.Cache = totable.Cache
        default:
            log.Warningf("Ignoring cache override: %+v", override)
        }
    }
}
func (c *Cluster) PutConn(cn Conn) {
    pool, ok := c.pools[cn.ID()]
    if !ok {
        log.Warningf("Cluster PutConn %s, belongs to no pool", cn.ID())
        return
    }
    pool.Put(cn)
}
func formatSec(dur time.Duration) string {
    if dur > 0 && dur < time.Second {
        log.Warningf(
            "redis: specified duration is %s, but minimal supported value is %s",
            dur, time.Second,
        )
    }
    return strconv.FormatInt(int64(dur/time.Second), 10)
}
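// An illustrative sketch (hypothetical helper, not from the original code) of
// formatSec's truncation behavior: durations are converted to whole seconds,
// and anything between 0 and 1s both logs a warning and formats as "0".
func formatSecExamplesSketch() {
    log.Debug(formatSec(2 * time.Minute))         // "120"
    log.Debug(formatSec(1500 * time.Millisecond)) // "1" (fraction truncated)
    log.Debug(formatSec(500 * time.Millisecond))  // "0" (warning logged above)
}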
func (top *Topology) doWatch(evtch <-chan topo.Event, evtbus chan interface{}) {
    e := <-evtch
    if e.State == topo.StateExpired || e.Type == topo.EventNotWatching {
        log.Fatalf("session expired: %+v", e)
    }

    log.Warningf("topo event %+v", e)

    switch e.Type {
    //case topo.EventNodeCreated:
    //case topo.EventNodeDataChanged:
    case topo.EventNodeChildrenChanged: //only care children changed
        //todo: get changed node and decode event
    default:
        log.Warningf("%+v", e)
    }

    evtbus <- e
}
func (self *ResMan) TimeoutCheck(sec int) {
    var timeoutTasks []string
    self.running.Each(func(key string, t *Task) bool {
        if t.state == taskRuning && time.Since(t.LastUpdate).Seconds() > float64(sec) {
            log.Warningf("%+v timeout", t)
            timeoutTasks = append(timeoutTasks, key)
        }
        return true
    })

    for _, taskId := range timeoutTasks {
        log.Warningf("remove timeout task %s", taskId)
        mid := &mesos.TaskID{}
        id := taskId
        mid.Value = &id
        self.driver.KillTask(mid)
        //self.running.Del(taskId)
    }
}
func validCmd(cmd uint32) bool {
    if cmd >= common.CAN_DO && cmd <= common.SUBMIT_JOB_EPOCH {
        return true
    }
    if cmd != 39 { //filter gearmand
        log.Warningf("invalid cmd %d", cmd)
    }
    return false
}
func (p *Proxy) Start() {
    for {
        c, err := p.l.Accept()
        if err != nil {
            log.Warning("got error when Accept network connection ", err)
            if nerr, ok := err.(net.Error); ok && nerr.Temporary() {
                log.Warningf("NOTICE: temporary Accept() failure - %s", err)
                runtime.Gosched()
                continue
            }
            // there's no direct way to detect this error because it is not exposed
            if !strings.Contains(err.Error(), "use of closed network connection") {
                log.Warningf("ERROR: listener.Accept() - %s", err)
            }
            break
        }
        go HandleConn(p, c)
    }
}
func (t *Topology) reloadSlots() {
    ss, err := t.getSlots()
    if err != nil {
        log.Warningf("ReloadLoop failed: %s", err)
        return
    }

    t.rw.Lock()
    t.slots = ss
    t.rw.Unlock()
}
func (p *connPool) Remove(cn *conn) error {
    // Replace existing connection with new one and unblock waiter.
    newcn, err := p.new()
    if err != nil {
        log.Warningf("redis: new failed: %s", err)
        return p.conns.Remove(cn)
    }
    err = p.conns.Replace(cn, newcn)
    p.freeConns <- newcn
    return err
}
func (p *connPool) Put(cn *conn) error {
    if cn.rd.Buffered() != 0 {
        b, _ := cn.rd.ReadN(cn.rd.Buffered())
        log.Warningf("redis: connection has unread data: %q", b)
        return p.Remove(cn)
    }
    if p.opt.getIdleTimeout() > 0 {
        cn.usedAt = time.Now()
    }
    p.freeConns <- cn
    return nil
}
func GetStmtExecPlan(stmt sqlparser.Statement, getTable TableGetter, alloc arena.ArenaAllocator) (plan *ExecPlan, err error) {
    plan, err = analyzeSQL(stmt, getTable, alloc)
    if err != nil {
        return nil, err
    }
    if plan.PlanId == PLAN_PASS_DML {
        log.Warningf("PASS_DML: %s", sqlparser.String(stmt, alloc))
    }
    return plan, nil
}
func (s *Server) handleMigrateState(slotIndex int, key []byte) error {
    shd := s.slots[slotIndex]
    if shd.slotInfo.State.Status != models.SLOT_STATUS_MIGRATE {
        return nil
    }

    if shd.migrateFrom == nil {
        log.Fatalf("migrateFrom not exist %+v", shd)
    }

    if shd.dst.Master() == shd.migrateFrom.Master() {
        log.Fatalf("the same migrate src and dst, %+v", shd)
    }

    redisConn, err := s.pools.GetConn(shd.migrateFrom.Master())
    if err != nil {
        return errors.Trace(err)
    }
    defer s.pools.ReleaseConn(redisConn)

    redisReader := redisConn.(*redispool.PooledConn).BufioReader()

    err = WriteMigrateKeyCmd(redisConn.(*redispool.PooledConn), shd.dst.Master(), 30*1000, key)
    if err != nil {
        redisConn.Close()
        log.Warningf("migrate key %s error, from %s to %s", string(key), shd.migrateFrom.Master(), shd.dst.Master())
        return errors.Trace(err)
    }

    //handle migrate result
    resp, err := parser.Parse(redisReader)
    if err != nil {
        redisConn.Close()
        return errors.Trace(err)
    }

    result, err := resp.Bytes()

    log.Debug("migrate", string(key), "from", shd.migrateFrom.Master(), "to", shd.dst.Master(), string(result))

    if resp.Type == parser.ErrorResp {
        redisConn.Close()
        log.Error(string(key), string(resp.Raw), "migrateFrom", shd.migrateFrom.Master())
        return errors.New(string(resp.Raw))
    }

    s.counter.Add("Migrate", 1)
    return nil
}
func ExpireBlackLists() {
    ticker := time.NewTicker(30 * time.Second)
    for {
        select {
        case <-ticker.C:
            for k, b := range BlackKeyLists {
                if b.Deadline.Before(time.Now()) {
                    log.Warningf("Black key: %s last: %s deadline: %s reached, will be expired...",
                        b.Name, b.Deadline.Sub(b.Startup), b.Deadline.String())
                    delete(BlackKeyLists, k)
                }
            }
        }
    }
}