Example #1
func (agent *Agent) Clean() {
	// Wait for all running tasks to finish; once quit_time has elapsed,
	// kill the remaining subprocesses with SIGUSR1.
	start_quit := time.Now().Unix()
	for l := len(agent.Process); l > 0; {
		log.Warning("process still running, we should quit after all TASK FINISHED, please wait")
		log.Warning("running task is:")
		for task := range agent.Process {
			log.Warningf("%s ", task)
		}
		time.Sleep(5 * time.Second)
		l = len(agent.Process)
		if now := time.Now().Unix(); now-start_quit > agent.Conf.QuitTime {
			log.Warning("quit_time timeout, we will kill subprocess by SIGUSR1")
			for task_id, p := range agent.Process {
				if err := p.Signal(syscall.SIGUSR1); err != nil {
					log.Warningf("SIGUSR1 task:%s failed...", task_id)
					continue
				}
				log.Warningf("SIGUSR1 task:%s OK...wait subprocess quit", task_id)
			}
			goto quit
		}

	}
quit:
	time.Sleep(2 * time.Second)
	close(agent.StatusLoopQuitChan)
	log.Warning("all process DONE, we quit success.")
}
Example #2
// LoadCronMetaData is only called from GenJobs when the store is
// unavailable, and it is called only once, so no lock is needed.
func (agent *Agent) LoadCronMetaData() {
	cronSlice := make([]string, 0)
	meta_file := fmt.Sprintf("%s/dcms_agent.metadata", agent.Conf.WorkDir)
	f, err := os.Open(meta_file)
	if err != nil {
		log.Warningf("reading metadata file: %s failed %s", meta_file, err)
		return
	}
	defer f.Close()
	if data, err := ioutil.ReadAll(f); err != nil {
		log.Warningf("ioutil metadata file read all failed %s", err)
	} else {
		if err = json.Unmarshal(data, &cronSlice); err != nil {
			log.Warningf("json unmarshal meta data failed: %s", string(data))
			return
		}
		for _, v := range cronSlice {
			log.Debug("receive cron from metadata file:", v)
			var cj *CronJob
			if err = json.Unmarshal([]byte(v), &cj); err != nil {
				log.Warningf("json unmarshal failed for:", v)
				continue
			}

			cj.Dcms = agent
			agent.Jobs[cj.Id] = cj
		}
		for id, job := range agent.Jobs {
			log.Debug("now Agent has jobs:", id, job)
		}
	}
}
Example #3
func (c *ClusterClient) reloadSlots() {
	defer atomic.StoreUint32(&c.reloading, 0)
	var (
		client *Client
		err    error
	)

	for i := 0; i < 3; i++ {
		client, err = c.randomClient()
		if err != nil {
			log.Warningf("redis: randomClient failed for %d times: %s", i+1, err)
			if i == 2 {
				return
			}
			continue
		}
		break
	}

	slots, err := client.ClusterSlots().Result()
	if err != nil {
		log.Warningf("redis: ClusterSlots failed: %s", err)
		return
	}
	c.setSlots(slots)
}
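The deferred atomic.StoreUint32(&c.reloading, 0) above suggests the caller guards reloads with a compare-and-swap on the same flag so that only one reload runs at a time. A minimal, self-contained sketch of that guard pattern, assuming a hypothetical lazyReloadSlots trigger (only the reloading field and the deferred reset are taken from the example):

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// clusterState is a stand-in for ClusterClient; it only carries the reloading flag.
type clusterState struct {
	reloading uint32
}

// lazyReloadSlots is a hypothetical trigger: it starts a reload in the
// background unless one is already in flight.
func (c *clusterState) lazyReloadSlots() {
	// CAS from 0 to 1: only the first caller wins, later callers return.
	if !atomic.CompareAndSwapUint32(&c.reloading, 0, 1) {
		return
	}
	go c.reloadSlots()
}

func (c *clusterState) reloadSlots() {
	// Mirrors the example: always clear the flag when the reload finishes.
	defer atomic.StoreUint32(&c.reloading, 0)
	time.Sleep(10 * time.Millisecond) // placeholder for the real slot refresh
	fmt.Println("slots reloaded")
}

func main() {
	c := &clusterState{}
	c.lazyReloadSlots()
	c.lazyReloadSlots() // no-op: a reload is already running
	time.Sleep(50 * time.Millisecond)
}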
Example #4
//DeleteTaskById will kill subprocess of task
func (s *Server) DeleteTaskById(p martini.Params) (int, string) {
	log.Debug("Server dcms http_api DeleteJobById")
	taskid, ok := p["taskid"]
	if !ok {
		return responseError(500, "DeleteTaskById without taskid")
	}

	//we will KILL subprocess async, in one goroutine
	go func(taskid string) {
		defer func() {
			if err := recover(); err != nil {
				log.Warningf("Delete Task By Id:%s panic: %s", taskid, err)
			}
		}()
		s.DCMS.Lock.Lock()
		defer s.DCMS.Lock.Unlock()
		for _, task := range s.DCMS.Running {
			if task.TaskId != taskid {
				continue
			}
			s.DCMS.KillTask(task)
			return
		}
		log.Warningf("Delete Task By Id:%s not exists or may be done", taskid)
	}(taskid)

	return responseSuccess(fmt.Sprintf("Task:%s will be killed async, or may already be done", taskid))

}
Example #5
func (pc *ProxyConfig) apply() {
	log.SetLevelByString(pc.logLevel)

	if pc.logFile != "" {
		err := log.SetOutputByName(pc.logFile)
		if err != nil {
			log.Fatalf("ProxyConfig SetOutputByName %s failed %s ", pc.logFile, err.Error())
		}
		log.SetRotateByDay()
	}

	if pc.name == "" {
		log.Fatal("ProxyConfig name must not empty")
	}

	if pc.port == 0 {
		log.Fatal("ProxyConfig port  must not 0")
	}

	if pc.cpu > runtime.NumCPU() {
		log.Warningf("ProxyConfig cpu  %d exceed %d, adjust to %d ", pc.cpu, runtime.NumCPU(), runtime.NumCPU())
		pc.cpu = runtime.NumCPU()
	}

	if pc.maxConn > 10000 {
		log.Warningf("ProxyConfig maxconn %d exceed 10000, adjust to 10000", pc.maxConn)
		pc.maxConn = 10000
	}

	runtime.GOMAXPROCS(pc.cpu)

	if pc.poolSize <= 0 || pc.poolSize > 30 {
		log.Warning("ProxyConfig poolSize %d , adjust to 10 ", pc.poolSize)
		pc.poolSize = 10
	}

	if pc.cpuFile != "" {
		f, err := os.Create(pc.cpuFile)
		if err != nil {
			log.Fatal(err)
		}
		log.Warning("Archer start CPUProfile ", pc.cpuFile)
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	if pc.memFile != "" {
		f, err := os.Create(pc.memFile)
		if err == nil {
			log.Warning("Archer start HeapProfile ", pc.memFile)
			pprof.WriteHeapProfile(f)
		}
	}

	go func() {
		log.Warning(http.ListenAndServe(":6061", nil))
	}()
}
Example #6
// not thread-safe; the caller must hold the agent lock before calling
func (agent *Agent) KillTask(t *Task) {
	p, ok := agent.Process[t.TaskId]
	if !ok {
		log.Warningf("In KillTask  %s, can't find process in agent.Process", t.TaskId)
	} else {
		pid := p.Pid
		log.Warning("KillTask pid is: we try to kill", pid)
		if err := p.Kill(); err != nil {
			// double check
			log.Warningf("kill err %s, we try again ", err)
			util.KillTaskForceByPid(pid)
		}
	}
}
Example #7
// SaveCronMetaData persists the cron metadata periodically (currently every 5 minutes).
func (agent *Agent) SaveCronMetaData() {
	meta_file := fmt.Sprintf("%s/dcms_agent.metadata", agent.Conf.WorkDir)
	cronSlice := make([]string, 0)

	agent.Lock.Lock()
	defer agent.Lock.Unlock()
	for k, v := range agent.Jobs {
		if data, err := json.Marshal(v); err == nil {
			cronSlice = append(cronSlice, string(data))
		} else {
			log.Warningf("marshal task: %d failed: %s", k, err)
			return
		}
	}

	if data, err := json.Marshal(cronSlice); err != nil {
		log.Warning("json marshal cronslice failed, ", err)
	} else {
		if len(cronSlice) == 0 {
			log.Warning("cronSlice json empty, just skip write MetaData")
			return
		}
		log.Debug("len of cronSlice:", len(data), data)
		log.Debugf("cronSlice length:%d content:%s", len(cronSlice), cronSlice)
		if e := ioutil.WriteFile(meta_file, data, os.ModePerm); e != nil {
			log.Warning("ioutil write meta_file failed,", e)
		}
	}
}
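Note the double encoding here: each job is marshaled to a JSON string, and the slice of those strings is marshaled again, which is why LoadCronMetaData in Example #2 has to unmarshal twice. A minimal round-trip sketch with a hypothetical two-field job struct (the real CronJob carries more fields):

package main

import (
	"encoding/json"
	"fmt"
)

// cronJob is a hypothetical stand-in; the real CronJob has more fields.
type cronJob struct {
	Id   int64  `json:"id"`
	Name string `json:"name"`
}

func main() {
	jobs := []cronJob{{Id: 1, Name: "backup"}, {Id: 2, Name: "report"}}

	// Save side: marshal each job, collect the JSON strings, marshal the slice.
	cronSlice := make([]string, 0, len(jobs))
	for _, j := range jobs {
		data, _ := json.Marshal(j)
		cronSlice = append(cronSlice, string(data))
	}
	fileData, _ := json.Marshal(cronSlice)
	fmt.Println(string(fileData)) // ["{\"id\":1,...}","{\"id\":2,...}"]

	// Load side: unmarshal the outer array, then each element.
	var loaded []string
	_ = json.Unmarshal(fileData, &loaded)
	for _, v := range loaded {
		var cj cronJob
		_ = json.Unmarshal([]byte(v), &cj)
		fmt.Printf("%+v\n", cj)
	}
}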
Example #8
func (s *Server) handleConn(c net.Conn) {
	log.Info("new connection", c.RemoteAddr())

	s.counter.Add("connections", 1)
	client := &session{
		Conn:     c,
		r:        bufio.NewReader(c),
		CreateAt: time.Now(),
	}

	var err error

	defer func() {
		if err != nil { //todo: fix this ugly error check
			if GetOriginError(err.(*errors.Err)).Error() != io.EOF.Error() {
				log.Warningf("close connection %v, %+v, %v", c.RemoteAddr(), client, errors.ErrorStack(err))
			} else {
				log.Infof("close connection %v, %+v", c.RemoteAddr(), client)
			}
		} else {
			log.Infof("close connection %v, %+v", c.RemoteAddr(), client)
		}

		c.Close()
		s.counter.Add("connections", -1)
	}()

	for {
		err = s.redisTunnel(client)
		if err != nil {
			return
		}
		client.Ops++
	}
}
Example #9
func (s *Server) resetSchemaInfo() error {
	for _, c := range s.clients {
		if len(c.txConns) > 0 {
			return errors.Errorf("transaction exist")
		}
	}

	cfg, err := config.ParseConfigFile(s.configFile)
	if err != nil {
		log.Error(err)
		return err
	}

	s.cleanup()
	s.autoSchamas = make(map[string]*tabletserver.SchemaInfo)
	for _, n := range s.shards {
		n.Close()
	}

	s.shards = nil
	s.schemas = nil

	log.Warningf("%#v", cfg)

	log.SetLevelByString(cfg.LogLevel)

	s.cfg = cfg
	return s.loadSchemaInfo()
}
Example #10
func (t *Task) genLogFile() {
	defer func() {
		if e := recover(); e != nil {
			log.Warning("genLogFile fatal:", e)
		}
	}()
	d := time.Now().Format("20060102")
	filename := fmt.Sprintf("%s/DCMS-%s/%d-%s-%s.log",
		t.Job.Dcms.Conf.WorkDir,
		d,
		t.Job.Id,
		t.Job.Name,
		t.TaskId)
	log.Info("generate logfile :", filename)

	logdir := fmt.Sprintf("%s/DCMS-%s", t.Job.Dcms.Conf.WorkDir, d)

	if err := os.MkdirAll(logdir, os.ModePerm); err != nil {
		log.Warningf("in run exec goroutine, mkdir workdir %s failed!!!! ", t.Job.Dcms.Conf.WorkDir)
	}

	if f, err := os.OpenFile(filename, os.O_RDWR|os.O_CREATE|os.O_TRUNC, os.ModePerm); err != nil {
		log.Warning("in genLogFile os.OpenFile create failed: ", f)
		t.logfile = nil
		t.LogFilename = ""
	} else {
		t.logfile = f
		t.LogFilename = filename
	}
}
Example #11
func (self *ResMan) addReadyTask(id string) (string, error) {
	if self.ready.Exist(id) {
		return "", fmt.Errorf("%s already exist: %+v", id, self.ready.Get(id))
	}

	job, err := scheduler.GetJobById(id)
	if err != nil {
		return "", err
	}

	persistentTask := &scheduler.Task{TaskId: self.genTaskId(), Status: scheduler.STATUS_READY,
		StartTs: time.Now().Unix(), JobName: job.Name}
	log.Warningf("%+v", persistentTask)
	err = persistentTask.Save()
	if err != nil {
		log.Error(err)
		return "", err
	}

	job.LastTaskId = persistentTask.TaskId
	job.Save()

	t := &Task{Tid: persistentTask.TaskId, job: job, state: taskReady}
	self.ready.Add(t.Tid, t)
	log.Debugf("ready task %+v, total count:%d", t, self.ready.Length())

	return persistentTask.TaskId, nil
}
Example #12
func getInsertPKValues(pkColumnNumbers []int, rowList sqlparser.Values, tableInfo *schema.Table) (pkValues []interface{}, err error) {
	pkValues = make([]interface{}, len(pkColumnNumbers))
	for index, columnNumber := range pkColumnNumbers {
		if columnNumber == -1 {
			pkValues[index] = tableInfo.GetPKColumn(index).Default
			continue
		}
		values := make([]interface{}, len(rowList))
		for j := 0; j < len(rowList); j++ {
			if _, ok := rowList[j].(*sqlparser.Subquery); ok {
				return nil, errors.New("row subquery not supported for inserts")
			}
			row := rowList[j].(sqlparser.ValTuple)
			if columnNumber >= len(row) {
				return nil, errors.New("column count doesn't match value count")
			}
			node := row[columnNumber]
			if !sqlparser.IsValue(node) {
				log.Warningf("insert is too complex %v", node)
				return nil, nil
			}
			var err error
			values[j], err = sqlparser.AsInterface(node)
			if err != nil {
				return nil, err
			}
		}
		if len(values) == 1 {
			pkValues[index] = values[0]
		} else {
			pkValues[index] = values
		}
	}
	return pkValues, nil
}
Example #13
// filter must split by '|', for example "fatal|error|fail|failed"
func HitFilter(filename string, filter string) bool {
	log.Debug("HitFilter run:", filename, filter)
	filterExp, err := regexp.Compile(fmt.Sprintf(`(?i:(%s))`, filter))
	if err != nil {
		log.Warningf("HitFilter regexp.Compile for %s failed:%s", filter, err)
		return false
	}

	if f, err := os.Open(filename); err != nil {
		log.Warning("HitFilter open file failed ", filename, err)
		return false
	} else {
		defer f.Close()
		freader := bufio.NewReader(f)
		for {
			var str string
			str, err = freader.ReadString('\n')
			s := filterExp.FindStringSubmatch(str)
			if len(s) > 0 {
				log.Debugf("HitFilter hit msg_filter ", s, str)
				return true
			}
			if err == io.EOF {
				break
			}
		}
	}
	return false
}
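For reference, the expression the function compiles is just a case-insensitive alternation over the '|'-separated filter words. A small sketch of how that regexp matches (the log lines are made up):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Same expression HitFilter builds for the filter "fatal|error|fail|failed".
	filterExp := regexp.MustCompile(fmt.Sprintf(`(?i:(%s))`, "fatal|error|fail|failed"))

	fmt.Println(filterExp.MatchString("2015/06/01 task FAILED: exit 1")) // true
	fmt.Println(filterExp.MatchString("2015/06/01 task finished ok"))    // false
}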
Example #14
// ping each codis-server to find crashed instances
func CheckAliveAndPromote(groups []models.ServerGroup) ([]models.Server, error) {
	errCh := make(chan interface{}, 100)
	var serverCnt int
	for _, group := range groups { //each group
		for _, s := range group.Servers { //each server
			serverCnt++
			rc := acf(s.Addr, 5*time.Second)
			news := s
			go PingServer(rc, news, errCh)
		}
	}

	//get result
	var crashedServer []models.Server
	for i := 0; i < serverCnt; i++ {
		s := <-errCh
		if s == nil { //alive
			continue
		}

		log.Warningf("server maybe crashed %+v", s)
		crashedServer = append(crashedServer, *s.(*models.Server))

		err := handleCrashedServer(s.(*models.Server))
		if err != nil {
			return crashedServer, err
		}
	}

	return crashedServer, nil
}
Example #15
// Interrupt releases a lock that's held.
func (zm *zMutex) Interrupt() {
	select {
	case zm.interrupted <- struct{}{}:
	default:
		log.Warningf("zmutex interrupt blocked")
	}
}
Example #16
// HandleStatusSuccess checks the task output file; if its content matches msg_filter, the status is changed to Failed.
func (agent *Agent) HandleStatusSuccess(s *TaskStatus) {
	agent.Lock.Lock()
	defer agent.Lock.Unlock()
	if !util.HitFilter(s.TaskPtr.LogFilename, s.TaskPtr.Job.MsgFilter) {
		s.TaskPtr.Job.LastSuccessAt = s.CreateAt
		s.TaskPtr.Job.LastTaskId = s.TaskPtr.TaskId
		if agent.Running[s.TaskPtr.JobId].Status == StatusTimeout {
			s.TaskPtr.Job.LastStatus = JobTimeout
		} else {
			s.TaskPtr.Job.LastStatus = JobSuccess
		}
		delete(agent.Process, s.TaskPtr.TaskId)
		delete(agent.Running, s.TaskPtr.JobId)
		s.TaskPtr.Job.SuccessCnt += 1

		log.Warning("Task success : ", s.TaskPtr.TaskId, s.TaskPtr.Job.Name, s.TaskPtr.ExecDuration)
	} else {
		s.TaskPtr.Job.LastErrAt = s.CreateAt
		s.TaskPtr.Job.LastTaskId = s.TaskPtr.TaskId
		s.TaskPtr.Job.LastStatus = JobFail
		s.Status = StatusFailed
		delete(agent.Process, s.TaskPtr.TaskId)
		delete(agent.Running, s.TaskPtr.JobId)
		s.TaskPtr.Job.ErrCnt += 1
		log.Warningf("Task failed : hit msg_filter error", s.TaskPtr.TaskId, s.TaskPtr.Job.Name, s.TaskPtr.ExecDuration)
		s.Err = errors.New(fmt.Sprintf("Task: %s  Job: %s failed.  hit msg_filter error", s.TaskPtr.TaskId, s.TaskPtr.Job.Name))
	}
	s.Message = util.GetFileContent(s.TaskPtr.LogFilename, 65535, 1)
	if ok := agent.store.UpdateTaskStatus(s); !ok {
		log.Warning("Task status Store Or Update failed ", s)
	}
	agent.PostTaskStatus(s)
}
Example #17
// heartbeat monitors state of each shard in the ring.
func (ring *Ring) heartbeat() {
	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()
	for range ticker.C {
		var rebalance bool

		ring.mx.RLock()

		if ring.closed {
			ring.mx.RUnlock()
			break
		}

		for _, shard := range ring.shards {
			err := shard.Client.Ping().Err()
			if shard.Vote(err == nil) {
				log.Warningf("redis: ring shard state changed: %s", shard)
				rebalance = true
			}
		}

		ring.mx.RUnlock()

		if rebalance {
			ring.rebalance()
		}
	}
}
Example #18
func (si *SchemaInfo) override() {
	for _, override := range si.overrides {
		table, ok := si.tables[override.Name]
		if !ok {
			log.Warningf("Table not found for override: %v, %v", override, si.tables)
			continue
		}
		if override.PKColumns != nil {
			log.Infof("SetPK Table name %s, pk %v", override.Name, override.PKColumns)
			if err := table.SetPK(override.PKColumns); err != nil {
				log.Errorf("%s: %v", errors.ErrorStack(err), override)
				continue
			}
		}
		if si.cachePool.IsClosed() || override.Cache == nil {
			log.Infof("%+v", override)
			continue
		}

		switch override.Cache.Type {
		case "RW":
			table.CacheType = schema.CACHE_RW
			table.Cache = NewRowCache(table, si.cachePool)
		case "W":
			table.CacheType = schema.CACHE_W
			if len(override.Cache.Table) == 0 {
				log.Warningf("Incomplete cache specs: %v", override)
				continue
			}

			totable, ok := si.tables[override.Cache.Table]
			if !ok {
				log.Warningf("Table not found: %v", override)
				continue
			}

			if totable.Cache == nil {
				log.Warningf("Table has no cache: %v", override)
				continue
			}

			table.Cache = totable.Cache
		default:
			log.Warningf("Ignoring cache override: %+v", override)
		}
	}
}
Example #19
func (c *Cluster) PutConn(cn Conn) {
	pool, ok := c.pools[cn.ID()]
	if !ok {
		log.Warningf("Cluster PutConn %s, belong no pool", cn.ID())
		return
	}
	pool.Put(cn)
}
Example #20
func formatSec(dur time.Duration) string {
	if dur > 0 && dur < time.Second {
		log.Warningf(
			"redis: specified duration is %s, but minimal supported value is %s",
			dur, time.Second,
		)
	}
	return strconv.FormatInt(int64(dur/time.Second), 10)
}
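Since the integer division truncates, any duration below one second formats as "0", which is exactly the case the warning covers. A tiny illustration (the helper is copied from the example above, minus the logging):

package main

import (
	"fmt"
	"strconv"
	"time"
)

func formatSec(dur time.Duration) string {
	return strconv.FormatInt(int64(dur/time.Second), 10)
}

func main() {
	fmt.Println(formatSec(500 * time.Millisecond))  // "0" -- the case the warning flags
	fmt.Println(formatSec(1500 * time.Millisecond)) // "1" -- truncated toward zero
	fmt.Println(formatSec(3 * time.Second))         // "3"
}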
Example #21
func (top *Topology) doWatch(evtch <-chan topo.Event, evtbus chan interface{}) {
	e := <-evtch
	if e.State == topo.StateExpired || e.Type == topo.EventNotWatching {
		log.Fatalf("session expired: %+v", e)
	}

	log.Warningf("topo event %+v", e)

	switch e.Type {
	//case topo.EventNodeCreated:
	//case topo.EventNodeDataChanged:
	case topo.EventNodeChildrenChanged: //only care children changed
		//todo:get changed node and decode event
	default:
		log.Warningf("%+v", e)
	}

	evtbus <- e
}
Example #22
func (self *ResMan) TimeoutCheck(sec int) {
	var timeoutTasks []string
	self.running.Each(func(key string, t *Task) bool {
		if t.state == taskRuning && time.Since(t.LastUpdate).Seconds() > float64(sec) {
			log.Warningf("%+v timeout", t)
			timeoutTasks = append(timeoutTasks, key)
		}
		return true
	})

	for _, taskId := range timeoutTasks {
		log.Warningf("remove timeout task %s", taskId)
		mid := &mesos.TaskID{}
		id := taskId
		mid.Value = &id
		self.driver.KillTask(mid)
		//self.running.Del(taskId)
	}
}
Example #23
func validCmd(cmd uint32) bool {
	if cmd >= common.CAN_DO && cmd <= common.SUBMIT_JOB_EPOCH {
		return true
	}

	if cmd != 39 { //filter gearmand
		log.Warningf("invalid cmd %d", cmd)
	}

	return false
}
Example #24
func (p *Proxy) Start() {
	for {
		c, err := p.l.Accept()
		if err != nil {
			log.Warning("got error when Accept network connect ", err)
			if nerr, ok := err.(net.Error); ok && nerr.Temporary() {
				log.Warningf("NOTICE: temporary Accept() failure - %s", err)
				runtime.Gosched()
				continue
			}
			// there is no direct way to detect this error because it is not exposed
			if !strings.Contains(err.Error(), "use of closed network connection") {
				log.Warningf("ERROR: listener.Accept() - %s", err)
			}
			break
		}

		go HandleConn(p, c)
	}
}
Example #25
func (t *Topology) reloadSlots() {
	ss, err := t.getSlots()
	if err != nil {
		log.Warningf("ReloadLoop failed ", err)
		return
	}

	t.rw.Lock()
	t.slots = ss
	t.rw.Unlock()
}
Example #26
func (p *connPool) Remove(cn *conn) error {
	// Replace existing connection with new one and unblock waiter.
	newcn, err := p.new()
	if err != nil {
		log.Warningf("redis: new failed: %s", err)
		return p.conns.Remove(cn)
	}
	err = p.conns.Replace(cn, newcn)
	p.freeConns <- newcn
	return err
}
Example #27
func (p *connPool) Put(cn *conn) error {
	if cn.rd.Buffered() != 0 {
		b, _ := cn.rd.ReadN(cn.rd.Buffered())
		log.Warningf("redis: connection has unread data: %q", b)
		return p.Remove(cn)
	}
	if p.opt.getIdleTimeout() > 0 {
		cn.usedAt = time.Now()
	}
	p.freeConns <- cn
	return nil
}
Example #28
func GetStmtExecPlan(stmt sqlparser.Statement, getTable TableGetter, alloc arena.ArenaAllocator) (plan *ExecPlan, err error) {
	plan, err = analyzeSQL(stmt, getTable, alloc)
	if err != nil {
		return nil, err
	}

	if plan.PlanId == PLAN_PASS_DML {
		log.Warningf("PASS_DML: %s", sqlparser.String(stmt, alloc))
	}

	return plan, nil
}
Example #29
func (s *Server) handleMigrateState(slotIndex int, key []byte) error {
	shd := s.slots[slotIndex]
	if shd.slotInfo.State.Status != models.SLOT_STATUS_MIGRATE {
		return nil
	}

	if shd.migrateFrom == nil {
		log.Fatalf("migrateFrom not exist %+v", shd)
	}

	if shd.dst.Master() == shd.migrateFrom.Master() {
		log.Fatalf("the same migrate src and dst, %+v", shd)
	}

	redisConn, err := s.pools.GetConn(shd.migrateFrom.Master())
	if err != nil {
		return errors.Trace(err)
	}

	defer s.pools.ReleaseConn(redisConn)

	redisReader := redisConn.(*redispool.PooledConn).BufioReader()

	err = WriteMigrateKeyCmd(redisConn.(*redispool.PooledConn), shd.dst.Master(), 30*1000, key)
	if err != nil {
		redisConn.Close()
		log.Warningf("migrate key %s error, from %s to %s",
			string(key), shd.migrateFrom.Master(), shd.dst.Master())
		return errors.Trace(err)
	}

	//handle migrate result
	resp, err := parser.Parse(redisReader)
	if err != nil {
		redisConn.Close()
		return errors.Trace(err)
	}

	result, err := resp.Bytes()

	log.Debug("migrate", string(key), "from", shd.migrateFrom.Master(), "to", shd.dst.Master(),
		string(result))

	if resp.Type == parser.ErrorResp {
		redisConn.Close()
		log.Error(string(key), string(resp.Raw), "migrateFrom", shd.migrateFrom.Master())
		return errors.New(string(resp.Raw))
	}

	s.counter.Add("Migrate", 1)
	return nil
}
Example #30
func ExpireBlackLists() {
	ticker := time.NewTicker(30 * time.Second)
	for range ticker.C {
		for k, b := range BlackKeyLists {
			if b.Deadline.Before(time.Now()) {
				log.Warningf("Black key: %s last: %s deadline: %s reached, will be expired...", b.Name, b.Deadline.Sub(b.Startup), b.Deadline.String())
				delete(BlackKeyLists, k)
			}
		}
	}
}