func convertUpdate(rule *config.Rule, rows [][]interface{}) ([]elastic.BulkableRequest, error) {
    if len(rows)%2 != 0 {
        return nil, errors.Errorf("invalid update rows event, must have an even number of rows, but got %d", len(rows))
    }

    reqs := make([]elastic.BulkableRequest, 0, len(rows))

    // Update events arrive as pairs of rows: the values before and after the change.
    for i := 0; i < len(rows); i += 2 {
        beforeID, err := rule.DocId(rows[i])
        if err != nil {
            log.Warnf("skipping row update due to problem with before-update values: %v", err)
            continue
        }

        afterID, err := rule.DocId(rows[i+1])
        if err != nil {
            log.Warnf("skipping row update due to problem with after-update values: %v", err)
            continue
        }

        beforeParentID, err := rule.ParentId(rows[i])
        if err != nil {
            return nil, errors.Trace(err)
        }

        afterParentID, err := rule.ParentId(rows[i+1])
        if err != nil {
            return nil, errors.Trace(err)
        }

        var req elastic.BulkableRequest
        if beforeID != afterID || beforeParentID != afterParentID {
            // The document id (or its routing parent) is changing: delete the old
            // document and index the new one instead of issuing a partial update.
            req = elastic.NewBulkDeleteRequest().Index(rule.Index).Type(rule.Type).Id(beforeID).Routing(beforeParentID)
            reqs = append(reqs, req)

            temp, err := convertInsert(rule, [][]interface{}{rows[i+1]})
            if err != nil {
                return nil, errors.Trace(err)
            }
            req = temp[0]
        } else {
            doc := convertUpdateRow(rule, rows[i], rows[i+1])
            req = elastic.NewBulkUpdateRequest().Index(rule.Index).Type(rule.Type).Parent(beforeParentID).Id(beforeID).Routing(beforeParentID).Doc(doc)
        }

        reqs = append(reqs, req)
    }

    return reqs, nil
}
// Interrupt releases a lock that's held.
func (zm *zMutex) Interrupt() {
    select {
    case zm.interrupted <- struct{}{}:
    default:
        log.Warnf("zmutex interrupt blocked")
    }
}
func (r *River) makeReqColumnData(col *schema.TableColumn, value interface{}) interface{} {
    switch col.Type {
    case schema.TYPE_ENUM:
        switch value := value.(type) {
        case int64:
            // For the binlog, ENUM may arrive as an int64 index, but for a dump it is already a string.
            eNum := value - 1
            if eNum < 0 || eNum >= int64(len(col.EnumValues)) {
                // An invalid enum value was inserted before, so return an empty string.
                log.Warnf("invalid binlog enum index %d, for enum %v", eNum, col.EnumValues)
                return ""
            }
            return col.EnumValues[eNum]
        }
    case schema.TYPE_SET:
        switch value := value.(type) {
        case int64:
            // For the binlog, SET may arrive as an int64 bitmask, but for a dump it is already a string.
            bitmask := value
            sets := make([]string, 0, len(col.SetValues))
            for i, s := range col.SetValues {
                if bitmask&int64(1<<uint(i)) > 0 {
                    sets = append(sets, s)
                }
            }
            return strings.Join(sets, ",")
        }
    }

    return value
}
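// The SET handling above is easiest to see with concrete numbers. Below is a minimal,
// standalone sketch of the bitmask decoding (decodeSet and the sample values are
// illustrative only, not part of the package API): bit i set in the binlog value
// selects col.SetValues[i].
package main

import (
    "fmt"
    "strings"
)

func decodeSet(bitmask int64, setValues []string) string {
    sets := make([]string, 0, len(setValues))
    for i, s := range setValues {
        if bitmask&int64(1<<uint(i)) > 0 {
            sets = append(sets, s)
        }
    }
    return strings.Join(sets, ",")
}

func main() {
    // SET('a','b','c') with binlog value 5 = 0b101 selects "a" and "c".
    fmt.Println(decodeSet(5, []string{"a", "b", "c"})) // prints "a,c"
}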
func (e *RotateEvent) Decode(data []byte) error {
    e.Position = binary.LittleEndian.Uint64(data[0:])
    e.NextLogName = data[8:]

    if e.Position != 4 {
        // FIXME: why is this happening?
        log.Warnf("RotateEvent(%s, %v) doesn't specify the expected offset 4; forcing it to 4 to continue", string(e.NextLogName), e.Position)
        e.Position = 4
    }

    return nil
}
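// A standalone sketch of the rotate event payload decoded above, assuming the usual
// layout: 8 bytes of little-endian position followed by the next binlog file name.
// The sample bytes are illustrative only.
package main

import (
    "encoding/binary"
    "fmt"
)

func main() {
    payload := make([]byte, 8)
    binary.LittleEndian.PutUint64(payload, 4)
    payload = append(payload, []byte("mysql-bin.000002")...)

    pos := binary.LittleEndian.Uint64(payload[0:8])
    name := string(payload[8:])
    fmt.Println(pos, name) // prints "4 mysql-bin.000002"
}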
func (r *River) createIndex(idx string, settings map[string]interface{}) error {
    exists, err := r.es.IndexExists(idx).Do()
    if err != nil {
        return err
    }

    if exists {
        log.Warnf("Index '%s' already exists; settings and mappings not updated", idx)
        return nil
    }

    log.Infof("Creating index %v with settings: %v", idx, settings)
    _, err = r.es.CreateIndex(idx).BodyJson(settings).Do()
    return err
}
func (t *electorTask) Stop() {
    t.z.noticeLeaderCh(false)

    t.interrupted.Set(false)

    select {
    case t.stop <- struct{}{}:
    default:
        log.Warnf("stop chan blocked")
    }
}
func (z *Zk) noticeLeaderCh(b bool) {
    z.isLeader.Set(b)

    for {
        select {
        case z.leaderCh <- b:
            return
        default:
            log.Warnf("%s leader chan blocked, leader: %v", z.c.Addr, b)
            // Drop the stale value, if one is still buffered, and retry so the
            // latest leadership state always wins.
            select {
            case <-z.leaderCh:
            default:
            }
        }
    }
}
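// noticeLeaderCh relies on a "drain the stale value, then publish" pattern so the
// newest leadership state always wins. A minimal, standalone sketch, assuming a
// buffered channel of capacity 1 (names here are illustrative only):
package main

import "fmt"

func publishLatest(ch chan bool, v bool) {
    for {
        select {
        case ch <- v:
            return
        default:
            // Channel is full: drop the unread value, if still there, and retry.
            select {
            case <-ch:
            default:
            }
        }
    }
}

func main() {
    ch := make(chan bool, 1)
    publishLatest(ch, true)
    publishLatest(ch, false) // replaces the unread "true"
    fmt.Println(<-ch)        // prints "false"
}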
func (t *electorTask) Run() error {
    t.z.wg.Add(1)
    defer t.z.wg.Done()

    log.Infof("begin leader %s, run", t.z.c.Addr)

    if err := t.z.getMasters(); err != nil {
        t.interrupted.Set(true)
        log.Errorf("get masters err %v", err)
        return err
    }

    t.z.noticeLeaderCh(true)

    for {
        select {
        case <-t.z.quit:
            log.Info("zk close, interrupt elector running task")
            t.z.noticeLeaderCh(false)
            t.interrupted.Set(true)
            return nil
        case <-t.stop:
            log.Info("stop elector running task")
            return nil
        case a := <-t.z.actionCh:
            if a.timeout.Get() {
                log.Warnf("wait action %s masters %v timeout, skip it", a.a.Cmd, a.a.Masters)
            } else {
                err := t.z.handleAction(a.a)
                a.ch <- err
            }
        }
    }
}
// convertAction builds bulk requests for insert and delete row events.
func convertAction(rule *config.Rule, action string, rows [][]interface{}) ([]elastic.BulkableRequest, error) {
    reqs := make([]elastic.BulkableRequest, 0, len(rows))

    for _, values := range rows {
        id, err := rule.DocId(values)
        if err != nil {
            log.Warnf("skipping row %s due to: %v", action, err)
            continue
        }

        parentID, err := rule.ParentId(values)
        if err != nil {
            return nil, err
        }

        var req elastic.BulkableRequest
        if action == canal.DeleteAction {
            req = elastic.NewBulkDeleteRequest().Index(rule.Index).Type(rule.Type).Id(id).Routing(parentID)
        } else {
            doc := convertRow(rule, values)
            req = elastic.NewBulkIndexRequest().Index(rule.Index).Type(rule.Type).Id(id).Parent(parentID).Doc(doc)
        }

        reqs = append(reqs, req)
    }

    return reqs, nil
}
func (c *Canal) startSyncBinlog() error {
    pos := mysql.Position{Name: c.master.Name, Pos: c.master.Position}

    log.Infof("Start syncing binlog from %v", pos)

    s, err := c.syncer.StartSync(pos)
    if err != nil {
        return errors.Errorf("Failed starting sync at %v: %v", pos, err)
    }

    originalTimeout := time.Second
    timeout := originalTimeout
    forceSavePos := false

    for {
        ev, err := s.GetEventTimeout(timeout)
        if err != nil && err != replication.ErrGetEventTimeout {
            return errors.Trace(err)
        } else if err == replication.ErrGetEventTimeout {
            if timeout == 2*originalTimeout {
                log.Debugf("Flushing event handlers since sync has gone idle")
                if err := c.flushEventHandlers(); err != nil {
                    log.Warnf("Error occurred during flush: %v", err)
                }
            }
            timeout = 2 * timeout
            continue
        }

        timeout = originalTimeout

        // next binlog position
        pos.Pos = ev.Header.LogPos

        forceSavePos = false

        log.Debugf("Syncing %v", ev)

        switch e := ev.Event.(type) {
        case *replication.RotateEvent:
            c.flushEventHandlers()
            pos.Name = string(e.NextLogName)
            pos.Pos = uint32(e.Position)
            // A rotate restarts the position, so always persist it.
            forceSavePos = true
            log.Infof("Rotate binlog to %v", pos)
        case *replication.RowsEvent:
            // we only care about row-based events
            if err = c.handleRowsEvent(ev); err != nil {
                log.Errorf("Error handling rows event: %v", err)
                return errors.Trace(err)
            }
        case *replication.QueryEvent:
            if err = c.handleQueryEvent(ev); err != nil {
                log.Errorf("Error handling query event: %v", err)
                return errors.Trace(err)
            }
        default:
            log.Debugf("Ignored event: %+v", e)
        }

        c.master.Update(pos.Name, pos.Pos)
        c.master.Save(forceSavePos)
    }
}
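// A standalone sketch of the idle back-off in startSyncBinlog: the read timeout
// doubles on every ErrGetEventTimeout and resets to one second once an event
// arrives; handlers are flushed once per idle stretch, on the second consecutive
// idle read. The values and names below are illustrative only.
package main

import (
    "fmt"
    "time"
)

func main() {
    originalTimeout := time.Second
    timeout := originalTimeout

    for i := 0; i < 4; i++ { // simulate four consecutive idle reads
        if timeout == 2*originalTimeout {
            fmt.Println("flush event handlers: sync has gone idle")
        }
        timeout = 2 * timeout
    }
    fmt.Println(timeout) // prints "16s"
}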
// LockWithTimeout returns nil when the lock is acquired. A lock is
// held if the file exists and you are the creator. Setting the wait
// to zero makes this a nonblocking lock check.
//
// FIXME(msolo) Disallow non-super users from removing the lock?
func (zm *zMutex) LockWithTimeout(wait time.Duration, desc string) (err error) {
    timer := time.NewTimer(wait)
    defer func() {
        if panicErr := recover(); panicErr != nil || err != nil {
            zm.deleteLock()
        }
    }()

    // Ensure the rendezvous node is here.
    // FIXME(msolo) Assuming locks are contended, it will be cheaper to assume this just exists.
    _, err = zkhelper.CreateRecursive(zm.zconn, zm.path, "", 0, zk.WorldACL(zkhelper.PERM_DIRECTORY))
    if err != nil && !zkhelper.ZkErrorEqual(err, zk.ErrNodeExists) {
        return err
    }

    lockPrefix := path.Join(zm.path, "lock-")
    zflags := zk.FlagSequence
    if zm.ephemeral {
        zflags = zflags | zk.FlagEphemeral
    }

    // Update the node content with the lock description.
    var lockContent map[string]interface{}
    err = json.Unmarshal([]byte(zm.contents), &lockContent)
    if err != nil {
        return err
    }
    lockContent["desc"] = desc
    newContent, err := json.Marshal(lockContent)
    if err != nil {
        return err
    }

createlock:
    lockCreated, err := zm.zconn.Create(lockPrefix, newContent, int32(zflags), zk.WorldACL(zkhelper.PERM_FILE))
    if err != nil {
        return err
    }
    name := path.Base(lockCreated)
    zm.mu.Lock()
    zm.name = name
    zm.mu.Unlock()

trylock:
    children, _, err := zm.zconn.Children(zm.path)
    if err != nil {
        return fmt.Errorf("zkutil: trylock failed %v", err)
    }
    sort.Strings(children)
    if len(children) == 0 {
        return fmt.Errorf("zkutil: empty lock: %v", zm.path)
    }

    if children[0] == name {
        // We are the lock owner.
        return nil
    }

    if zm.onRetryLock != nil {
        zm.onRetryLock()
    }

    // This is the degenerate case of a nonblocking lock check. It's not optimal, but
    // also probably not worth optimizing.
    if wait == 0 {
        return zkhelper.ErrTimeout
    }

    prevLock := ""
    for i := 1; i < len(children); i++ {
        if children[i] == name {
            prevLock = children[i-1]
            break
        }
    }
    if prevLock == "" {
        // This is an interesting case. The node disappeared
        // underneath us, probably due to a session loss. We can
        // recreate the lock node (with a new sequence number) and
        // keep trying.
        log.Warnf("zkutil: no lock node found: %v/%v", zm.path, zm.name)
        goto createlock
    }

    zkPrevLock := path.Join(zm.path, prevLock)
    exist, stat, watch, err := zm.zconn.ExistsW(zkPrevLock)
    if err != nil {
        // FIXME(msolo) Should this be a retry?
        return fmt.Errorf("zkutil: unable to watch previous lock node %v %v", zkPrevLock, err)
    }
    if stat == nil || !exist {
        goto trylock
    }

    select {
    case <-timer.C:
        return zkhelper.ErrTimeout
    case <-zm.interrupted:
        return zkhelper.ErrInterrupted
    case event := <-watch:
        log.Infof("zkutil: lock event: %v", event)
        // The precise event doesn't matter - try to read again regardless.
        goto trylock
    }
    panic("unexpected")
}
// RunTask returns nil when the underlying task ends, or the error it
// generated.
func (ze *zElector) RunTask(task *electorTask) error {
    leaderPath := path.Join(ze.path, "leader")
    for {
        _, err := zkhelper.CreateRecursive(ze.zconn, leaderPath, "", 0, zk.WorldACL(zkhelper.PERM_FILE))
        if err == nil || zkhelper.ZkErrorEqual(err, zk.ErrNodeExists) {
            break
        }
        log.Warnf("election leader create failed: %v", err)
        time.Sleep(500 * time.Millisecond)
    }

    for {
        err := ze.Lock("RunTask")
        if err != nil {
            log.Warnf("election lock failed: %v", err)
            if err == zkhelper.ErrInterrupted {
                return zkhelper.ErrInterrupted
            }
            continue
        }

        // Confirm your win and deliver acceptance speech. This notifies
        // listeners who will have been watching the leader node for
        // changes.
        _, err = ze.zconn.Set(leaderPath, []byte(ze.contents), -1)
        if err != nil {
            log.Warnf("election promotion failed: %v", err)
            continue
        }

        log.Infof("election promote leader %v", leaderPath)
        taskErrChan := make(chan error)
        go func() {
            taskErrChan <- task.Run()
        }()

    watchLeader:
        // Watch the leader so we can get notified if something goes wrong.
        data, _, watch, err := ze.zconn.GetW(leaderPath)
        if err != nil {
            log.Warnf("election unable to watch leader node %v %v", leaderPath, err)
            // FIXME(msolo) Add delay
            goto watchLeader
        }

        if string(data) != ze.contents {
            log.Warnf("election unable to promote leader")
            task.Stop()
            // We won the election, but we didn't become the leader. How is that possible?
            // (see Bush v. Gore for some inspiration)
            // It means:
            //   1. Someone isn't playing by the election rules (a bad actor).
            //      Hard to detect - let's assume we don't have this problem. :)
            //   2. We lost our connection somehow and the ephemeral lock was cleared,
            //      allowing someone else to win the election.
            continue
        }

        // This is where we start our target process and watch for its failure.
    waitForEvent:
        select {
        case <-ze.interrupted:
            log.Warn("election interrupted - stop child process")
            task.Stop()
            // Once the process dies from the signal, this will all tear down.
            goto waitForEvent
        case taskErr := <-taskErrChan:
            // If our code fails, unlock to trigger an election.
            log.Infof("election child process ended: %v", taskErr)
            ze.Unlock()
            if task.Interrupted() {
                log.Warnf("election child process interrupted - stepping down")
                return zkhelper.ErrInterrupted
            }
            continue
        case zevent := <-watch:
            // We had a zk connection hiccup. We have a few choices,
            // but it depends on the constraints and the events.
            //
            // If we get SESSION_EXPIRED our connection loss triggered an
            // election that we won't have won and thus the lock was
            // automatically freed. We have no choice but to start over.
            if zevent.State == zk.StateExpired {
                log.Warnf("election leader watch expired")
                task.Stop()
                continue
            }

            // Otherwise, we had an intermittent issue or something touched
            // the node. Either we lost our position or someone broke
            // protocol and touched the leader node. We just reconnect and
            // revalidate. In the meantime, assume we are still the leader
            // until we determine otherwise.
            //
            // On a reconnect we will be able to see the leader
            // information. If we still hold the position, great. If not, we
            // kill the associated process.
            //
            // On a leader node change, we need to perform the same
            // validation. It's possible an election completes without the
            // old leader realizing he is out of touch.
            log.Warnf("election leader watch event %v", zevent)
            goto watchLeader
        }
    }
    panic("unreachable")
}
// Close the done channel when you want to clean up nicely.
func CreatePidNode(zconn Conn, zkPath string, contents string, done chan struct{}) error {
    // On the first try, assume the cluster is up and running; that will
    // help hunt down any config issues present at startup.
    if _, err := zconn.Create(zkPath, []byte(contents), zk.FlagEphemeral, zk.WorldACL(PERM_FILE)); err != nil {
        if ZkErrorEqual(err, zk.ErrNodeExists) {
            err = zconn.Delete(zkPath, -1)
        }
        if err != nil {
            return fmt.Errorf("zkutil: failed deleting pid node: %v: %v", zkPath, err)
        }
        _, err = zconn.Create(zkPath, []byte(contents), zk.FlagEphemeral, zk.WorldACL(PERM_FILE))
        if err != nil {
            return fmt.Errorf("zkutil: failed creating pid node: %v: %v", zkPath, err)
        }
    }

    go func() {
        for {
            _, _, watch, err := zconn.GetW(zkPath)
            if err != nil {
                if ZkErrorEqual(err, zk.ErrNoNode) {
                    _, err = zconn.Create(zkPath, []byte(contents), zk.FlagEphemeral, zk.WorldACL(zk.PermAll))
                    if err != nil {
                        log.Warnf("failed recreating pid node: %v: %v", zkPath, err)
                    } else {
                        log.Infof("recreated pid node: %v", zkPath)
                        continue
                    }
                } else {
                    log.Warnf("failed reading pid node: %v", err)
                }
            } else {
                select {
                case event := <-watch:
                    if ZkEventOk(event) && event.Type == zk.EventNodeDeleted {
                        // Most likely another process has started up. However,
                        // there is a chance that an ephemeral node is deleted by
                        // the session expiring, yet that same session gets a watch
                        // notification. This seems like buggy behavior, but rather
                        // than race too hard on the node, just wait a bit and see
                        // if the situation resolves itself.
                        log.Warnf("pid deleted: %v", zkPath)
                    } else {
                        log.Infof("pid node event: %v", event)
                    }
                    // Break here and wait for a bit before attempting to recreate.
                case <-done:
                    log.Infof("pid watcher stopped on done: %v", zkPath)
                    return
                }
            }

            select {
            // No one likes a thundering herd, least of all zk.
            case <-time.After(5*time.Second + time.Duration(rand.Int63n(55e9))):
            case <-done:
                log.Infof("pid watcher stopped on done: %v", zkPath)
                return
            }
        }
    }()

    return nil
}
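// A standalone sketch of the jittered back-off used by the pid watcher above:
// rand.Int63n(55e9) adds up to 55s worth of nanoseconds, so each wait lands
// between 5s and 60s, spreading reconnect attempts out instead of creating a
// thundering herd.
package main

import (
    "fmt"
    "math/rand"
    "time"
)

func main() {
    wait := 5*time.Second + time.Duration(rand.Int63n(55e9))
    fmt.Println(wait >= 5*time.Second && wait < 60*time.Second) // always prints "true"
}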
func (a *App) checkMaster(wg *sync.WaitGroup, g *Group) {
    defer wg.Done()

    // Later, add a check strategy, e.g. fail over only after n failed checks
    // within n seconds. For now we only check once.
    err := g.Check()
    if err == nil {
        return
    }

    oldMaster := g.Master.Addr

    if err == ErrNodeType {
        log.Errorf("server %s is not a master now, skip it", oldMaster)
        // The server is no longer a master, so stop checking it.
        a.delMasters([]string{oldMaster})
        return
    }

    errNum := time.Duration(g.CheckErrNum.Get())
    downTime := errNum * time.Duration(a.c.CheckInterval) * time.Millisecond
    if downTime < time.Duration(a.c.MaxDownTime)*time.Second {
        log.Warnf("check master %s err %v, down time: %0.2fs, retry check", oldMaster, err, downTime.Seconds())
        return
    }

    // On a check error, remove the master from the saved masters and stop checking it.
    // This avoids repeated errors if the failover below fails; at that point,
    // handling it manually seems the better way. If you want to recheck it, add it again.
    a.delMasters([]string{oldMaster})

    log.Errorf("check master %s err %v, do failover", oldMaster, err)

    if err := a.onBeforeFailover(oldMaster); err != nil {
        // give up the failover
        return
    }

    // First elect a candidate.
    newMaster, err := g.Elect()
    if err != nil {
        // election error
        return
    }

    log.Errorf("master is down, elect %s as new master, do failover", newMaster)

    // Promote the candidate to master.
    err = g.Promote(newMaster)
    if err != nil {
        log.Fatalf("do master %s failover err: %v", oldMaster, err)
        return
    }

    a.addMasters([]string{newMaster})

    a.onAfterFailover(oldMaster, newMaster)
}
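// A standalone sketch of the down-time arithmetic in checkMaster, assuming
// CheckInterval is in milliseconds and MaxDownTime in seconds (the values here
// are illustrative only): failover triggers once the consecutive check failures
// span at least MaxDownTime.
package main

import (
    "fmt"
    "time"
)

func main() {
    checkInterval := int64(1000) // ms between checks
    maxDownTime := int64(3)      // seconds of down time tolerated
    errNum := int64(2)           // consecutive failed checks so far

    downTime := time.Duration(errNum) * time.Duration(checkInterval) * time.Millisecond
    fmt.Println(downTime < time.Duration(maxDownTime)*time.Second) // prints "true": retry, no failover yet
}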