func (self *NsqdCoordinator) putMessageOnSlave(coord *TopicCoordinator, logData CommitLogData, msg *nsqd.Message) *CoordErr { var logMgr *TopicCommitLogMgr var topic *nsqd.Topic var queueEnd nsqd.BackendQueueEnd checkDupOnSlave := func(tc *coordData) bool { if coordLog.Level() >= levellogger.LOG_DETAIL { topicName := tc.topicInfo.Name coordLog.Debugf("pub on slave : %v, msg %v", topicName, msg.ID) } logMgr = tc.logMgr if logMgr.IsCommitted(logData.LogID) { coordLog.Infof("pub the already committed log id : %v", logData.LogID) return true } return false } doLocalWriteOnSlave := func(tc *coordData) *CoordErr { var localErr error topicName := tc.topicInfo.Name partition := tc.topicInfo.Partition topic, localErr = self.localNsqd.GetExistingTopic(topicName, partition) if localErr != nil { coordLog.Infof("pub on slave missing topic : %v", topicName) // leave the isr and try re-sync with leader return &CoordErr{localErr.Error(), RpcErrTopicNotExist, CoordSlaveErr} } if topic.GetTopicPart() != partition { coordLog.Errorf("topic on slave has different partition : %v vs %v", topic.GetTopicPart(), partition) return &CoordErr{ErrLocalTopicPartitionMismatch.String(), RpcErrTopicNotExist, CoordSlaveErr} } topic.Lock() queueEnd, localErr = topic.PutMessageOnReplica(msg, nsqd.BackendOffset(logData.MsgOffset)) topic.Unlock() if localErr != nil { coordLog.Errorf("put message on slave failed: %v", localErr) return &CoordErr{localErr.Error(), RpcCommonErr, CoordSlaveErr} } return nil } doLocalCommit := func() error { localErr := logMgr.AppendCommitLog(&logData, true) if localErr != nil { coordLog.Errorf("write commit log on slave failed: %v", localErr) return localErr } topic.Lock() topic.UpdateCommittedOffset(queueEnd) topic.Unlock() return nil } doLocalExit := func(err *CoordErr) { if err != nil { coordLog.Infof("slave put message %v error: %v", logData, err) } } return self.doWriteOpOnSlave(coord, checkDupOnSlave, doLocalWriteOnSlave, doLocalCommit, doLocalExit) }
func (self *NsqdCoordinator) putMessagesOnSlave(coord *TopicCoordinator, logData CommitLogData, msgs []*nsqd.Message) *CoordErr { if len(msgs) == 0 { return ErrPubArgError } if logData.LogID != int64(msgs[0].ID) { return ErrPubArgError } var logMgr *TopicCommitLogMgr // this last log id should be used on slave to avoid the slave switch // override the leader's prev mpub message id. // While slave is chosen as leader, the next id should be larger than the last logid. // Because the mpub maybe already committed after the leader is down, the new leader should begin // with the last message id + 1 for next message. lastMsgLogID := int64(msgs[len(msgs)-1].ID) if logData.LastMsgLogID != lastMsgLogID { return ErrPubArgError } var queueEnd nsqd.BackendQueueEnd var topic *nsqd.Topic checkDupOnSlave := func(tc *coordData) bool { if coordLog.Level() >= levellogger.LOG_DETAIL { topicName := tc.topicInfo.Name coordLog.Debugf("pub on slave : %v, msg count: %v", topicName, len(msgs)) } logMgr = tc.logMgr if logMgr.IsCommitted(logData.LogID) { coordLog.Infof("put the already committed log id : %v", logData.LogID) return true } return false } doLocalWriteOnSlave := func(tc *coordData) *CoordErr { var localErr error var start time.Time checkCost := coordLog.Level() >= levellogger.LOG_DEBUG if self.enableBenchCost { checkCost = true } if checkCost { start = time.Now() } topicName := tc.topicInfo.Name partition := tc.topicInfo.Partition topic, localErr = self.localNsqd.GetExistingTopic(topicName, partition) if localErr != nil { coordLog.Infof("pub on slave missing topic : %v", topicName) // leave the isr and try re-sync with leader return &CoordErr{localErr.Error(), RpcErrTopicNotExist, CoordSlaveErr} } topic.Lock() var cost time.Duration if checkCost { cost = time.Now().Sub(start) if cost > time.Millisecond { coordLog.Infof("prepare write on slave local cost :%v", cost) } } queueEnd, localErr = topic.PutMessagesOnReplica(msgs, nsqd.BackendOffset(logData.MsgOffset)) if checkCost { cost2 := time.Now().Sub(start) if cost2 > time.Millisecond { coordLog.Infof("write local on slave cost :%v, %v", cost, cost2) } } topic.Unlock() if localErr != nil { logIndex, lastLogOffset, lastLog, _ := logMgr.GetLastCommitLogOffsetV2() coordLog.Errorf("put messages on slave failed: %v, slave last logid: %v, data: %v:%v, %v", localErr, logMgr.GetLastCommitLogID(), logIndex, lastLogOffset, lastLog) return &CoordErr{localErr.Error(), RpcCommonErr, CoordSlaveErr} } return nil } doLocalCommit := func() error { localErr := logMgr.AppendCommitLog(&logData, true) if localErr != nil { coordLog.Errorf("write commit log on slave failed: %v", localErr) return localErr } topic.Lock() topic.UpdateCommittedOffset(queueEnd) topic.Unlock() return nil } doLocalExit := func(err *CoordErr) { if err != nil { coordLog.Warningf("failed to batch put messages on slave: %v", err) } } return self.doWriteOpOnSlave(coord, checkDupOnSlave, doLocalWriteOnSlave, doLocalCommit, doLocalExit) }
func (self *NsqdCoordinator) PutMessageToCluster(topic *nsqd.Topic, body []byte, traceID uint64) (nsqd.MessageID, nsqd.BackendOffset, int32, nsqd.BackendQueueEnd, error) { var commitLog CommitLogData var queueEnd nsqd.BackendQueueEnd msg := nsqd.NewMessage(0, body) msg.TraceID = traceID topicName := topic.GetTopicName() partition := topic.GetTopicPart() coord, checkErr := self.getTopicCoord(topicName, partition) if checkErr != nil { return msg.ID, nsqd.BackendOffset(commitLog.MsgOffset), commitLog.MsgSize, queueEnd, checkErr.ToErrorType() } var logMgr *TopicCommitLogMgr doLocalWrite := func(d *coordData) *CoordErr { logMgr = d.logMgr topic.Lock() id, offset, writeBytes, qe, localErr := topic.PutMessageNoLock(msg) queueEnd = qe topic.Unlock() if localErr != nil { coordLog.Warningf("put message to local failed: %v", localErr) return &CoordErr{localErr.Error(), RpcNoErr, CoordLocalErr} } commitLog.LogID = int64(id) // epoch should not be changed. // leader epoch change means leadership change, leadership change // need disable write which should hold the write lock. // However, we are holding write lock while doing the cluster write replication. commitLog.Epoch = d.GetTopicEpochForWrite() commitLog.LastMsgLogID = commitLog.LogID commitLog.MsgOffset = int64(offset) commitLog.MsgSize = writeBytes commitLog.MsgCnt = queueEnd.TotalMsgCnt() commitLog.MsgNum = 1 return nil } doLocalExit := func(err *CoordErr) { if err != nil { coordLog.Infof("topic %v PutMessageToCluster msg %v error: %v", topic.GetFullName(), msg, err) if coord.IsWriteDisabled() { topic.DisableForSlave() } } } doLocalCommit := func() error { localErr := logMgr.AppendCommitLog(&commitLog, false) if localErr != nil { coordLog.Errorf("topic : %v, Generator %v failed write commit log : %v, logmgr: %v, %v", topic.GetFullName(), topic.GetMsgGenerator(), localErr, logMgr.pLogID, logMgr.nLogID) } topic.Lock() topic.UpdateCommittedOffset(queueEnd) topic.Unlock() return localErr } doLocalRollback := func() { coordLog.Warningf("failed write begin rollback : %v, %v", topic.GetFullName(), commitLog) topic.Lock() topic.RollbackNoLock(nsqd.BackendOffset(commitLog.MsgOffset), 1) topic.Unlock() } doRefresh := func(d *coordData) *CoordErr { logMgr = d.logMgr if d.GetTopicEpochForWrite() != commitLog.Epoch { coordLog.Warningf("write epoch changed during write: %v, %v", d.GetTopicEpochForWrite(), commitLog) return ErrEpochMismatch } self.requestNotifyNewTopicInfo(d.topicInfo.Name, d.topicInfo.Partition) return nil } doSlaveSync := func(c *NsqdRpcClient, nodeID string, tcData *coordData) *CoordErr { // should retry if failed, and the slave should keep the last success write to avoid the duplicated putErr := c.PutMessage(&tcData.topicLeaderSession, &tcData.topicInfo, commitLog, msg) if putErr != nil { coordLog.Infof("sync write to replica %v failed: %v. put offset:%v, logmgr: %v, %v", nodeID, putErr, commitLog, logMgr.pLogID, logMgr.nLogID) } return putErr } handleSyncResult := func(successNum int, tcData *coordData) bool { if successNum == len(tcData.topicInfo.ISR) { return true } return false } clusterErr := self.doSyncOpToCluster(true, coord, doLocalWrite, doLocalExit, doLocalCommit, doLocalRollback, doRefresh, doSlaveSync, handleSyncResult) var err error if clusterErr != nil { err = clusterErr.ToErrorType() } return msg.ID, nsqd.BackendOffset(commitLog.MsgOffset), commitLog.MsgSize, queueEnd, err }