// 获取下一个active状态的BackendConn func (s *BackServiceLB) nextBackendConn() *BackendConnLB { s.activeConnsLock.Lock() defer s.activeConnsLock.Unlock() // TODO: 暂时采用RoundRobin的方法,可以采用其他具有优先级排列的方法 var backSocket *BackendConnLB if len(s.activeConns) == 0 { if s.verbose { log.Printf(Cyan("[%s]ActiveConns Len 0"), s.serviceName) } backSocket = nil } else { if s.currentConnIndex >= len(s.activeConns) { s.currentConnIndex = 0 } backSocket = s.activeConns[s.currentConnIndex] s.currentConnIndex++ if s.verbose { log.Printf(Cyan("[%s]ActiveConns Len %d, CurrentIndex: %d"), s.serviceName, len(s.activeConns), s.currentConnIndex) } } return backSocket }
// 处理所有的等待中的请求 func (bc *BackendConn) flushRequests(err error) { // 告诉BackendService, 不再接受新的请求 bc.MarkConnActiveFalse() bc.Lock() seqRequest := bc.seqNum2Request bc.seqNum2Request = make(map[int32]*Request, 4096) bc.Unlock() threshold := time.Now().Add(-time.Second * 5) for _, request := range seqRequest { if request.Start > 0 { t := time.Unix(request.Start, 0) if t.After(threshold) { // 似乎在笔记本上,合上显示器之后出出现网络错误 log.Printf(Red("[%s]Handle Failed Request: %s, Started: %s"), request.Service, request.Request.Name, FormatYYYYmmDDHHMMSS(t)) } } else { log.Printf(Red("[%s]Handle Failed Request: %s"), request.Service, request.Request.Name) } request.Response.Err = err if request.Wait != nil { request.Wait.Done() } } }
// 配对 Request, resp, err // PARAM: resp []byte 为一帧完整的thrift数据包 func (bc *BackendConn) setResponse(r *Request, data []byte, err error) error { // 表示出现错误了 if data == nil { log.Printf("[%s]No Data From Server, error: %v", r.Service, err) r.Response.Err = err } else { // 从resp中读取基本的信息 typeId, seqId, err := DecodeThriftTypIdSeqId(data) // 解码错误,直接报错 if err != nil { return err } // 找到对应的Request bc.Lock() req, ok := bc.seqNum2Request[seqId] if ok { delete(bc.seqNum2Request, seqId) } bc.Unlock() // 如果是心跳,则OK if typeId == MESSAGE_TYPE_HEART_BEAT { // log.Printf(Magenta("Get Ping/Pang Back")) bc.hbLastTime.Set(time.Now().Unix()) return nil } if !ok { return errors.New("Invalid Response") } if bc.verbose { log.Printf("[%s]Data From Server, seqId: %d, Request: %d", req.Service, seqId, req.Request.SeqId) } r = req r.Response.TypeId = typeId } r.Response.Data, r.Response.Err = data, err // 还原SeqId if data != nil { r.RestoreSeqId() } // 设置几个控制用的channel if err != nil && r.Failed != nil { r.Failed.Set(true) } if r.Wait != nil { r.Wait.Done() } return err }
func (p *fakeServer) Dispatch(r *Request) error { log.Printf("Request SeqId: %d, MethodName: %s\n", r.Request.SeqId, r.Request.Name) r.Wait.Add(1) go func() { time.Sleep(time.Millisecond) r.Response.Data = []byte(string(r.Request.Data)) typeId, seqId, _ := DecodeThriftTypIdSeqId(r.Response.Data) log.Printf(Green("TypeId: %d, SeqId: %d\n"), typeId, seqId) r.Wait.Done() }() // r.RestoreSeqId() // r.Wait.Done() return nil }
// // 不断建立到后端的逻辑,负责: BackendConn#input到redis的数据的输入和返回 // func (bc *BackendConn) Run() { for k := 0; !bc.IsMarkOffline.Get(); k++ { // 1. 首先BackendConn将当前 input中的数据写到后端服务中 transport, err := bc.ensureConn() if err != nil { log.ErrorErrorf(err, "[%s]BackendConn#ensureConn error: %v", bc.service, err) return } c := NewTBufferedFramedTransport(transport, 100*time.Microsecond, 20) // 2. 将 bc.input 中的请求写入 后端的Rpc Server err = bc.loopWriter(c) // 同步 // 3. 停止接受Request bc.MarkConnActiveFalse() // 4. 将bc.input中剩余的 Request直接出错处理 if err == nil { log.Printf(Red("[%s]BackendConn#loopWriter normal Exit..."), bc.service) break } else { // 对于尚未处理的Request, 直接报错 for i := len(bc.input); i != 0; i-- { r := <-bc.input bc.setResponse(r, nil, err) } } } }
// 处理所有的等待中的请求 func (bc *BackendConnLB) flushRequests(err error) { // 告诉BackendService, 不再接受新的请求 bc.MarkConnActiveFalse() bc.Lock() seqRequest := bc.seqNum2Request bc.seqNum2Request = make(map[int32]*Request) bc.Unlock() for _, request := range seqRequest { if request.Request.TypeId == MESSAGE_TYPE_HEART_BEAT { // 心跳出错了,则直接直接跳过 } else { log.Printf(Red("Handle Failed Request: %s.%s"), request.Service, request.Request.Name) request.Response.Err = err if request.Wait != nil { request.Wait.Done() } } } // 关闭输入 close(bc.input) }
// // 后端如何处理一个Request, 处理完毕之后直接返回,因为Caller已经做好异步处理了 // func (s *BackServiceLB) Dispatch(r *Request) error { backendConn := s.nextBackendConn() r.Service = s.serviceName if backendConn == nil { // 没有后端服务 if s.verbose { log.Printf(Red("[%s]No BackSocket Found: %s"), s.serviceName, r.Request.Name) } // 从errMsg来构建异常 errMsg := GetWorkerNotFoundData(r, "BackServiceLB") // log.Printf(Magenta("---->Convert Error Back to Exception:[%d] %s\n"), len(errMsg), string(errMsg)) r.Response.Data = errMsg return nil } else { // if s.verbose { // log.Println("SendMessage With: ", backendConn.Addr4Log(), "For Service: ", s.serviceName) // } backendConn.PushBack(r) r.Wait.Wait() return nil } }
func (s *Session) loopWriter(tasks <-chan *Request) error { // Proxy: Session ---> Client for r := range tasks { // 1. 等待Request对应的Response // 出错了如何处理呢? s.handleResponse(r) // 2. 将结果写回给Client if s.verbose { log.Printf("[%s]Session#loopWriter --> client FrameSize: %d", r.Service, len(r.Response.Data)) } // r.Response.Data ---> Client _, err := s.TBufferedFramedTransport.Write(r.Response.Data) if err != nil { log.ErrorErrorf(err, "Write back Data Error: %v", err) return err } // 3. Flush err = s.TBufferedFramedTransport.FlushBuffer(true) // len(tasks) == 0 if err != nil { log.ErrorErrorf(err, "Write back Data Error: %v", err) return err } r.Recycle() } return nil }
// 配对 Request, resp, err // PARAM: resp []byte 为一帧完整的thrift数据包 func (bc *BackendConnLB) setResponse(r *Request, data []byte, err error) error { // 表示出现错误了 if data == nil { log.Printf("No Data From Server, error: %v\n", err) r.Response.Err = err } else { // 从resp中读取基本的信息 typeId, seqId, err := DecodeThriftTypIdSeqId(data) // 解码错误,直接报错 if err != nil { return err } if typeId == MESSAGE_TYPE_STOP { // 不再接受新的输入 // 直接来自后端的服务(不遵循: Request/Reply模型) bc.MarkConnActiveFalse() return nil } // 找到对应的Request bc.Lock() req, ok := bc.seqNum2Request[seqId] if ok { delete(bc.seqNum2Request, seqId) } bc.Unlock() // 如果是心跳,则OK if typeId == MESSAGE_TYPE_HEART_BEAT { bc.hbLastTime.Set(time.Now().Unix()) return nil } if !ok { return errors.New("Invalid Response") } // log.Printf("Data From Server, seqId: %d, Request: %d\n", seqId, req.Request.SeqId) r = req r.Response.TypeId = typeId } r.Response.Data, r.Response.Err = data, err // 还原SeqId if data != nil { r.RestoreSeqId() } // 设置几个控制用的channel if err != nil && r.Failed != nil { r.Failed.Set(true) } if r.Wait != nil { r.Wait.Done() } return err }
// // 确保Socket成功连接到后端服务器 // func (bc *BackendConn) ensureConn() (transport thrift.TTransport, err error) { // 1. 创建连接(只要IP没有问题, err一般就是空) timeout := time.Second * 5 if strings.Contains(bc.addr, ":") { transport, err = thrift.NewTSocketTimeout(bc.addr, timeout) } else { transport, err = NewTUnixDomainTimeout(bc.addr, timeout) } log.Printf(Cyan("[%s]Create Socket To: %s"), bc.service, bc.addr) if err != nil { log.ErrorErrorf(err, "[%s]Create Socket Failed: %v, Addr: %s", err, bc.service, bc.addr) // 连接不上,失败 return nil, err } // 2. 只要服务存在,一般不会出现err sleepInterval := 1 err = transport.Open() for err != nil && !bc.IsMarkOffline.Get() { log.ErrorErrorf(err, "[%s]Socket Open Failed: %v, Addr: %s", bc.service, err, bc.addr) // Sleep: 1, 2, 4这几个间隔 time.Sleep(time.Duration(sleepInterval) * time.Second) if sleepInterval < 4 { sleepInterval *= 2 } err = transport.Open() } return transport, err }
// // 两参数是必须的: ProductName, zkAddress, frontAddr可以用来测试 // func (p *ProxyServer) Run() { var transport thrift.TServerTransport var err error log.Printf(Magenta("Start Proxy at Address: %s"), p.proxyAddr) // 读取后端服务的配置 isUnixDomain := false if !strings.Contains(p.proxyAddr, ":") { if FileExist(p.proxyAddr) { os.Remove(p.proxyAddr) } transport, err = NewTServerUnixDomain(p.proxyAddr) isUnixDomain = true } else { transport, err = thrift.NewTServerSocket(p.proxyAddr) } if err != nil { log.ErrorErrorf(err, "Server Socket Create Failed: %v, Front: %s", err, p.proxyAddr) } // 开始监听 // transport.Open() transport.Listen() ch := make(chan thrift.TTransport, 4096) defer close(ch) go func() { var address string for c := range ch { // 为每个Connection建立一个Session socket, ok := c.(SocketAddr) if isUnixDomain { address = p.proxyAddr } else if ok { address = socket.Addr().String() } else { address = "unknow" } x := NewSession(c, address, p.verbose) // Session独立处理自己的请求 go x.Serve(p.router, 1000) } }() // Accept什么时候出错,出错之后如何处理呢? for { c, err := transport.Accept() if err != nil { log.ErrorErrorf(err, "Accept Error: %v", err) break } else { ch <- c } } }
func (s *BackService) StateChanged(conn *BackendConn) { log.Printf(Cyan("[%s]StateChanged: %s, Index: %d, Count: %d, IsConnActive: %t"), s.serviceName, conn.addr, conn.Index, len(s.activeConns), conn.IsConnActive.Get()) s.activeConnsLock.Lock() defer s.activeConnsLock.Unlock() if conn.IsConnActive.Get() { log.Printf(Cyan("[%s]MarkConnActiveOK: %s, Index: %d, Count: %d"), s.serviceName, conn.addr, conn.Index, len(s.activeConns)) if conn.Index == INVALID_ARRAY_INDEX { conn.Index = len(s.activeConns) s.activeConns = append(s.activeConns, conn) log.Printf(Green("[%s]Add BackendConn to activeConns: %s, Total Actives: %d"), s.serviceName, conn.Addr(), len(s.activeConns)) } } else { log.Printf(Red("[%s]Remove BackendConn From activeConns: %s, Index: %d"), s.serviceName, conn.Addr(), conn.Index) if conn.Index != INVALID_ARRAY_INDEX { lastIndex := len(s.activeConns) - 1 // 将最后一个元素和当前的元素交换位置 if lastIndex != conn.Index { lastConn := s.activeConns[lastIndex] s.activeConns[conn.Index] = lastConn lastConn.Index = conn.Index } s.activeConns[lastIndex] = nil conn.Index = INVALID_ARRAY_INDEX // slice s.activeConns = s.activeConns[0:lastIndex] log.Printf(Red("[%s]Remove BackendConn From activeConns: %s, Remains: %d"), s.serviceName, conn.Addr(), len(s.activeConns)) } } }
// // MarkOffline发生场景: // 1. 后端服务即将下线,预先通知 // 2. 后端服务已经挂了,zk检测到 // // BackendConn 在这里暂时理解关闭conn, 而是从 backend_service_proxy中下线当前的conn, // 然后conn的关闭根据 心跳&Conn的读写异常来判断; 因此 IsConnActive = false 情况下,心跳不能关闭 // func (bc *BackendConn) MarkOffline() { if !bc.IsMarkOffline.Get() { log.Printf(Magenta("[%s]BackendConn: %s MarkOffline"), bc.service, bc.addr) bc.IsMarkOffline.Set(true) // 不再接受(来自backend_service_proxy的)新的输入 bc.MarkConnActiveFalse() close(bc.input) } }
func (bc *BackendConn) MarkConnActiveFalse() { if bc.IsConnActive.Get() { log.Printf(Red("[%s]MarkConnActiveFalse: %s, %p"), bc.service, bc.addr, bc.delegate) // 从Active切换到非正常状态 bc.IsConnActive.Set(false) if bc.delegate != nil { bc.delegate.StateChanged(bc) // 通知其他人状态出现问题 } } }
// 只有在conn出现错误时才会调用 func (s *BackServiceLB) StateChanged(conn *BackendConnLB) { s.activeConnsLock.Lock() defer s.activeConnsLock.Unlock() log.Printf(Green("[%s]StateChanged: %s, Index: %d, Count: %d"), conn.serviceName, conn.addr4Log, conn.Index, len(s.activeConns)) if conn.IsConnActive.Get() { // BackServiceLB 只有一个状态转移: Active --> Not Active log.Printf(Magenta("Unexpected BackendConnLB State")) if s.verbose { panic("Unexpected BackendConnLB State") } } else { log.Printf(Red("Remove BackendConn From activeConns: %s, Index: %d, Count: %d"), conn.Addr4Log(), conn.Index, len(s.activeConns)) // 从数组中删除一个元素(O(1)的操作) if conn.Index != INVALID_ARRAY_INDEX { // 1. 和最后一个元素进行交换 lastIndex := len(s.activeConns) - 1 if lastIndex != conn.Index { lastConn := s.activeConns[lastIndex] // 将最后一个元素和当前的元素交换位置 s.activeConns[conn.Index] = lastConn lastConn.Index = conn.Index // 删除引用 s.activeConns[lastIndex] = nil conn.Index = INVALID_ARRAY_INDEX } log.Printf(Red("Remove BackendConn From activeConns: %s"), conn.Addr4Log()) // 2. slice s.activeConns = s.activeConns[0:lastIndex] } } }
// run之间 transport刚刚建立,因此服务的可靠性比较高 func (bc *BackendConnLB) Run() { log.Printf(Green("[%s]Add New BackendConnLB: %s\n"), bc.serviceName, bc.addr4Log) // 1. 首先BackendConn将当前 input中的数据写到后端服务中 err := bc.loopWriter() // 2. 从Active切换到非正常状态, 同时不再从backend_service_lb接受新的任务 // 可能出现异常,也可能正常退出(反正不干活了) bc.MarkConnActiveFalse() log.Printf(Red("[%s]Remove Faild BackendConnLB: %s\n"), bc.serviceName, bc.addr4Log) if err == nil { // bc.input被关闭了,应该就没有 Request 了 } else { // 如果出现err, 则将bc.input中现有的数据都flush回去(直接报错) for i := len(bc.input); i != 0; i-- { r := <-bc.input bc.setResponse(r, nil, err) } } }
// // // 等待Request请求的返回: Session最终被Block住 // func (s *Session) handleResponse(r *Request) { // 等待结果的出现 r.Wait.Wait() // 将Err转换成为Exception if r.Response.Err != nil { r.Response.Data = GetThriftException(r, "proxy_session") log.Printf(Magenta("---->Convert Error Back to Exception")) } // 如何处理Data和Err呢? incrOpStats(r.OpStr, microseconds()-r.Start) }
// 处理来自Client的请求 func (s *Session) handleRequest(request []byte, d Dispatcher) (*Request, error) { // 构建Request if s.verbose { log.Printf("HandleRequest: %s", string(request)) } r := NewRequest(request, true) // 增加统计 s.LastOpUnix = time.Now().Unix() s.Ops++ // 交给Dispatch // Router return r, d.Dispatch(r) }
// // 删除过期的Endpoints // func (p *BackSockets) PurgeEndpoints() { // 没有需要删除的对象 if p.Active == len(p.Sockets) { return } log.Printf(utils.Green("PurgeEndpoints %d vs. %d"), p.Active, len(p.Sockets)) p.Lock() defer p.Unlock() now := time.Now().Unix() nowStr := time.Now().Format("@2006-01-02 15:04:05") for i := p.Active; i < len(p.Sockets); i++ { // 逐步删除过期的Sockets current := p.Sockets[i] lastIndex := len(p.Sockets) - 1 if now-current.markedOfflineTime > 5 { // 将i和最后一个元素交换 p.swap(current, p.Sockets[lastIndex]) // 关闭 // current // 关闭旧的Socket log.Println(utils.Red("PurgeEndpoints#Purge Old Socket: "), current.Addr, nowStr) // 由Socket自己维护自己的状态 // current.Socket.Close() p.Sockets[lastIndex] = nil p.Sockets = p.Sockets[0:lastIndex] i-- // 保持原位 } } }
func NewThriftLoadBalanceServer(config *utils.Config) *ThriftLoadBalanceServer { log.Printf("FrontAddr: %s\n", Magenta(config.FrontendAddr)) // 前端对接rpc_proxy p := &ThriftLoadBalanceServer{ config: config, zkAddr: config.ZkAddr, productName: config.ProductName, serviceName: config.Service, frontendAddr: config.FrontendAddr, backendAddr: config.BackAddr, verbose: config.Verbose, exitEvt: make(chan bool), } p.topo = zk.NewTopology(p.productName, p.zkAddr) p.lbServiceName = GetServiceIdentity(p.frontendAddr) // 后端对接: 各种python的rpc server p.backendService = NewBackServiceLB(p.serviceName, p.backendAddr, p.verbose, p.exitEvt) return p }
func (s *BackService) Stop() { // 标志停止 s.stop.Set(true) // 触发一个事件(之后ServiceNodes也不再监控) s.evtbus <- true go func() { // TODO: for true { now := time.Now().Unix() if now-s.lastRequestTime.Get() > 10 { break } else { time.Sleep(time.Second) } } for len(s.activeConns) > 0 { s.activeConns[0].MarkOffline() } log.Printf(Red("Mark All Connections Off: %s"), s.serviceName) }() }
// 创建一个BackService func NewBackService(productName string, serviceName string, topo *zk.Topology, verbose bool) *BackService { service := &BackService{ productName: productName, serviceName: serviceName, activeConns: make([]*BackendConn, 0, 10), addr2Conn: make(map[string]*BackendConn), topo: topo, verbose: verbose, } service.WatchBackServiceNodes() go func() { for !service.stop.Get() { log.Printf(Blue("[Report]: %s --> %d backservice, coroutine: %d"), service.serviceName, service.Active(), runtime.NumGoroutine()) time.Sleep(time.Second * 10) } }() return service }
// mainBody runs a ZeroMQ ROUTER/ROUTER load balancer: it binds a frontend
// socket (clients) and a backend socket (workers), registers the endpoint in
// zk, and then loops forever polling both sockets, forwarding messages,
// tracking worker status and sending heartbeats. After a termination signal
// it unregisters from zk and exits once it has been idle past the scheduled
// time.
func mainBody(zkAddr string, productName string, serviceName string, frontendAddr string, backendAddr string) {
	// 1. Create the connection to zk.
	var topo *zk.Topology
	topo = zk.NewTopology(productName, zkAddr)

	// 2. Start the service.
	frontend, _ := zmq.NewSocket(zmq.ROUTER)
	backend, _ := zmq.NewSocket(zmq.ROUTER)
	defer frontend.Close()
	defer backend.Close()

	// ROUTER/ROUTER bound to the given endpoints.
	// tcp://127.0.0.1:5555 --> tcp://127_0_0_1:5555
	lbServiceName := GetServiceIdentity(frontendAddr)

	frontend.SetIdentity(lbServiceName)
	frontend.Bind(frontendAddr) // For clients "tcp://*:5555"
	backend.Bind(backendAddr) // For workers "tcp://*:5556"

	log.Printf("FrontAddr: %s, BackendAddr: %s\n", magenta(frontendAddr), magenta(backendAddr))

	// Queue of backend workers.
	workersQueue := queue.NewPPQueue()

	// Heartbeat ticker.
	heartbeat_at := time.Tick(HEARTBEAT_INTERVAL)

	// NOTE(review): poller1 is populated but never polled in this function.
	poller1 := zmq.NewPoller()
	poller1.Add(backend, zmq.POLLIN)

	poller2 := zmq.NewPoller()
	// Assumptions stated by the original author:
	// 1. When zeromq reports a message as readable, all of its parts are readable.
	// 2. Writes to zeromq are asynchronous, so they do not block (unless the data is huge).
	//
	poller2.Add(backend, zmq.POLLIN)
	poller2.Add(frontend, zmq.POLLIN)

	// 3. Register in zk.
	var endpointInfo map[string]interface{} = make(map[string]interface{})
	endpointInfo["frontend"] = frontendAddr
	endpointInfo["backend"] = backendAddr

	topo.AddServiceEndPoint(serviceName, lbServiceName, endpointInfo)

	// isAlive is shared with the watcher goroutine below and guarded by isAliveLock.
	isAlive := true
	isAliveLock := &sync.RWMutex{}

	go func() {
		servicePath := topo.ProductServicePath(serviceName)
		evtbus := make(chan interface{})
		for true {
			// Watch the node purely to observe session state.
			_, err := topo.WatchNode(servicePath, evtbus)

			if err == nil {
				// Wait for an event.
				e := (<-evtbus).(topozk.Event)
				if e.State == topozk.StateExpired || e.Type == topozk.EventNotWatching {
					// The session expired: delete the old endpoint data (it is
					// not owned by the current session) and register again.
					topo.DeleteServiceEndPoint(serviceName, lbServiceName)
					topo.AddServiceEndPoint(serviceName, lbServiceName, endpointInfo)
				}
			} else {
				time.Sleep(time.Second)
			}

			// Stop watching once the main loop has been told to exit.
			isAliveLock.RLock()
			isAlive1 := isAlive
			isAliveLock.RUnlock()
			if !isAlive1 {
				break
			}
		}
	}()

	ch := make(chan os.Signal, 1)

	// NOTE(review): os/signal documents that SIGKILL cannot be caught, so
	// listing it here presumably has no effect — confirm.
	signal.Notify(ch, syscall.SIGTERM, syscall.SIGINT, syscall.SIGKILL)
	// syscall.SIGKILL
	// kill -9 pid
	// kill -s SIGKILL pid is left to ops.

	//
	// Automatic exit condition:
	//

	// suideTime is the time after which an idle, no-longer-alive balancer exits.
	var suideTime time.Time

	for {
		var sockets []zmq.Polled
		var err error

		sockets, err = poller2.Poll(HEARTBEAT_INTERVAL)
		if err != nil {
			// break // Interrupted
			log.Errorf("Error When Pollling: %v\n", err)
			continue
		}

		// Set when this round carried real traffic (not just control messages).
		hasValidMsg := false
		for _, socket := range sockets {
			switch socket.Socket {
			case backend:
				// Format:
				// sent by the backend:
				//   <"", proxy_id, "", client_id, "", rpc_data>
				// as read from the backend socket:
				//   <wokerid, "", proxy_id, "", client_id, "", rpc_data>
				//
				msgs, err := backend.RecvMessage(0)
				if err != nil {
					log.Errorf("Error When RecvMessage from background: %v\n", err)
					continue
				}
				if config.VERBOSE {
					// log.Println("Message from backend: ", msgs)
				}
				// Message kinds:
				// msgs: <worker_id, "", proxy_id, "", client_id, "", rpc_data>
				//       <worker_id, "", rpc_control_data>
				worker_id, msgs := utils.Unwrap(msgs)

				// rpc_control_data: control information.
				// msgs: <rpc_control_data>
				if len(msgs) == 1 {
					// PPP_READY
					// PPP_HEARTBEAT
					controlMsg := msgs[0]

					// Skip invalid (empty) control messages.
					if len(controlMsg) == 0 {
						continue
					}
					if config.VERBOSE {
						// log.Println("Got Message From Backend...")
					}

					if controlMsg[0] == PPP_READY || controlMsg[0] == PPP_HEARTBEAT {
						// Remaining concurrency of the backend worker; taken
						// from the third byte when present, otherwise 1.
						var concurrency int
						if len(controlMsg) >= 3 {
							concurrency = int(controlMsg[2])
						} else {
							concurrency = 1
						}
						if config.VERBOSE {
							// utils.PrintZeromqMsgs(msgs, "control msg")
						}

						force_update := controlMsg[0] == PPP_READY
						workersQueue.UpdateWorkerStatus(worker_id, concurrency, force_update)
					} else if controlMsg[0] == PPP_STOP {
						// Stop the given backend worker.
						workersQueue.UpdateWorkerStatus(worker_id, -1, true)
					} else {
						log.Errorf("Unexpected Control Message: %d", controlMsg[0])
					}
				} else {
					hasValidMsg = true
					// Forward to the frontend; if the frontend is gone the message is lost.
					// log.Println("Send Message to frontend")
					workersQueue.UpdateWorkerStatus(worker_id, 0, false)
					// msgs: <proxy_id, "", client_id, "", rpc_data>
					frontend.SendMessage(msgs)
				}
			case frontend:
				hasValidMsg = true
				log.Println("----->Message from front: ")
				msgs, err := frontend.RecvMessage(0)
				if err != nil {
					log.Errorf("Error when reading from frontend: %v\n", err)
					continue
				}

				// msgs:
				// <proxy_id, "", client_id, "", rpc_data>
				if config.VERBOSE {
					utils.PrintZeromqMsgs(msgs, "frontend")
				}
				msgs = utils.TrimLeftEmptyMsg(msgs)

				// Hand msgs to a backend worker.
				worker := workersQueue.NextWorker()
				if worker != nil {
					if config.VERBOSE {
						log.Println("Send Msg to Backend worker: ", worker.Identity)
					}
					backend.SendMessage(worker.Identity, "", msgs)
				} else {
					// No worker: reply to the frontend with an error payload in
					// place of the last message part.
					if config.VERBOSE {
						log.Println("No backend worker found")
					}
					errMsg := proxy.GetWorkerNotFoundData("account", 0)

					// <proxy_id, "", client_id, "", rpc_data>
					frontend.SendMessage(msgs[0:(len(msgs)-1)], errMsg)
				}
			}
		}

		// If an exit has been scheduled, handle the exit deadline: traffic
		// pushes the deadline out by 3s, idleness past the deadline exits.
		isAliveLock.RLock()
		isAlive1 := isAlive
		isAliveLock.RUnlock()
		if !isAlive1 {
			if hasValidMsg {
				suideTime = time.Now().Add(time.Second * 3)
			} else {
				if time.Now().After(suideTime) {
					log.Println(utils.Green("Load Balance Suiside Gracefully"))
					break
				}
			}
		}

		// Heartbeats and signals (non-blocking).
		select {
		case <-heartbeat_at:
			now := time.Now()

			// Send a heartbeat to every worker that has not expired yet.
			for _, worker := range workersQueue.WorkerQueue {
				if worker.Expire.After(now) {
					// log.Println("Sending Hb to Worker: ", worker.Identity)
					backend.SendMessage(worker.Identity, "", PPP_HEARTBEAT_STR)
				}
			}

			workersQueue.PurgeExpired()
		case sig := <-ch:
			isAliveLock.Lock()
			isAlive1 := isAlive
			isAlive = false
			isAliveLock.Unlock()

			if isAlive1 {
				// Prepare to exit (after finishing the work at hand):
				// unregister from zk first.
				topo.DeleteServiceEndPoint(serviceName, lbServiceName)

				if sig == syscall.SIGKILL {
					log.Println(utils.Red("Got Kill Signal, Return Directly"))
					// NOTE(review): this break leaves only the select, not the
					// enclosing for loop, so the function does not return here.
					break
				} else {
					suideTime = time.Now().Add(time.Second * 3)
					log.Println(utils.Red("Schedule to suicide at: "), suideTime.Format("@2006-01-02 15:04:05"))
				}
			}
		default:
		}
	}
}
func (p *ThriftLoadBalanceServer) Run() { // // 1. 创建到zk的连接 // 127.0.0.1:5555 --> 127_0_0_1:5555 exitSignal := make(chan os.Signal, 1) signal.Notify(exitSignal, syscall.SIGTERM, syscall.SIGINT, syscall.SIGKILL) // syscall.SIGKILL // kill -9 pid // kill -s SIGKILL pid 还是留给运维吧 // // 注册服务 evtExit := make(chan interface{}) serviceEndpoint := RegisterService(p.serviceName, p.frontendAddr, p.lbServiceName, p.topo, evtExit, p.config.WorkDir, p.config.CodeUrlVersion) // var suideTime time.Time // isAlive := true // 3. 读取后端服务的配置 var transport thrift.TServerTransport var err error isUnixDomain := false // 127.0.0.1:9999(以:区分不同的类型) if !strings.Contains(p.frontendAddr, ":") { if FileExist(p.frontendAddr) { os.Remove(p.frontendAddr) } transport, err = NewTServerUnixDomain(p.frontendAddr) isUnixDomain = true } else { transport, err = thrift.NewTServerSocket(p.frontendAddr) } if err != nil { log.ErrorErrorf(err, "Server Socket Create Failed: %v", err) panic(fmt.Sprintf("Invalid FrontendAddress: %s", p.frontendAddr)) } err = transport.Listen() if err != nil { log.ErrorErrorf(err, "Server Socket Create Failed: %v", err) panic(fmt.Sprintf("Binding Error FrontendAddress: %s", p.frontendAddr)) } ch := make(chan thrift.TTransport, 4096) defer close(ch) // 强制退出? 
TODO: Graceful退出 go func() { <-exitSignal // 通知RegisterService终止循环 evtExit <- true log.Info(Green("Receive Exit Signals....")) serviceEndpoint.DeleteServiceEndpoint(p.topo) start := time.Now().Unix() for true { // 如果5s内没有接受到新的请求了,则退出 now := time.Now().Unix() if now-p.lastRequestTime.Get() > 5 { log.Printf(Red("[%s]Graceful Exit..."), p.serviceName) break } else { log.Printf(Cyan("[%s]Sleeping %d seconds before Exit...\n"), p.serviceName, now-start) time.Sleep(time.Second) } } transport.Interrupt() transport.Close() }() go func() { var address string for c := range ch { // 为每个Connection建立一个Session socket, ok := c.(SocketAddr) if ok { if isUnixDomain { address = p.frontendAddr } else { address = socket.Addr().String() } } else { address = "unknow" } x := NewNonBlockSession(c, address, p.verbose, &p.lastRequestTime) // Session独立处理自己的请求 go x.Serve(p.backendService, 1000) } }() // Accept什么时候出错,出错之后如何处理呢? for { c, err := transport.Accept() if err != nil { close(ch) break } else { ch <- c } } }
// // go test github.com/wfxiang08/rpc_proxy/proxy -v -run "TestSession" // func TestSession(t *testing.T) { // 作为一个Server transport, err := thrift.NewTServerSocket("127.0.0.1:0") assert.NoError(t, err) err = transport.Open() // 打开Transport assert.NoError(t, err) defer transport.Close() err = transport.Listen() // 开始监听 assert.NoError(t, err) addr := transport.Addr().String() fmt.Println("Addr: ", addr) var requestNum int32 = 10 requests := make([]*Request, 0, requestNum) var i int32 for i = 0; i < requestNum; i++ { buf := make([]byte, 100, 100) l := fakeData("Hello", thrift.CALL, i+1, buf[0:0]) buf = buf[0:l] req := NewRequest(buf, true) req.Wait.Add(1) // 因为go routine可能还没有执行,代码就跑到最后面进行校验了 assert.Equal(t, i+1, req.Request.SeqId, "Request SeqId是否靠谱") requests = append(requests, req) } go func() { // 模拟请求: // 客户端代码 bc := NewBackendConn(addr, nil, "test", true) bc.currentSeqId = 10 // 准备发送数据 var i int32 for i = 0; i < requestNum; i++ { fmt.Println("Sending Request to Backend Conn", i) bc.PushBack(requests[i]) requests[i].Wait.Done() } // 需要等待数据返回? time.Sleep(time.Second * 2) }() server := &fakeServer{} go func() { // 服务器端代码 tran, err := transport.Accept() defer tran.Close() if err != nil { log.ErrorErrorf(err, "Error: %v\n", err) } assert.NoError(t, err) session := NewSession(tran, "", true) session.Serve(server, 6) time.Sleep(time.Second * 2) }() for i = 0; i < requestNum; i++ { fmt.Println("===== Before Wait") requests[i].Wait.Wait() fmt.Println("===== Before After Wait") log.Printf("Request: %d, .....", i) assert.Equal(t, len(requests[i].Response.Data), len(requests[i].Request.Data)) } }
// // 如何处理后端服务的变化呢? // func (s *BackService) WatchBackServiceNodes() { s.evtbus = make(chan interface{}, 2) servicePath := s.topo.ProductServicePath(s.serviceName) go func() { for !s.stop.Get() { serviceIds, err := s.topo.WatchChildren(servicePath, s.evtbus) if err == nil { // 如何监听endpoints的变化呢? addressMap := make(map[string]bool, len(serviceIds)) for _, serviceId := range serviceIds { log.Printf(Green("---->Find Endpoint: %s for Service: %s"), serviceId, s.serviceName) endpointInfo, err := GetServiceEndpoint(s.topo, s.serviceName, serviceId) if err != nil { log.ErrorErrorf(err, "Service Endpoint Read Error: %v\n", err) } else { log.Printf(Green("---->Add endpoint %s To Service %s"), endpointInfo.Frontend, s.serviceName) if strings.Contains(endpointInfo.Frontend, ":") { addressMap[endpointInfo.Frontend] = true } else if s.productName == TEST_PRODUCT_NAME { // unix domain socket只在测试的时候可以使用(因为不能实现跨机器访问) addressMap[endpointInfo.Frontend] = true } } } for addr, _ := range addressMap { conn, ok := s.addr2Conn[addr] if ok && !conn.IsMarkOffline.Get() { continue } else { // 创建新的连接(心跳成功之后就自动加入到 s.activeConns 中 s.addr2Conn[addr] = NewBackendConn(addr, s, s.serviceName, s.verbose) } } for addr, conn := range s.addr2Conn { _, ok := addressMap[addr] if !ok { conn.MarkOffline() // 删除: 然后等待Conn自生自灭 delete(s.addr2Conn, addr) } } // 等待事件 <-s.evtbus } else { log.WarnErrorf(err, "zk read failed: %s", servicePath) // 如果读取失败则,则继续等待5s time.Sleep(time.Duration(5) * time.Second) } } }() }
// // go test github.com/wfxiang08/rpc_proxy/proxy -v -run "TestBackend" // func TestBackend(t *testing.T) { // 作为一个Server transport, err := thrift.NewTServerSocket("127.0.0.1:0") assert.NoError(t, err) err = transport.Open() // 打开Transport assert.NoError(t, err) defer transport.Close() err = transport.Listen() // 开始监听 assert.NoError(t, err) addr := transport.Addr().String() fmt.Println("Addr: ", addr) var requestNum int32 = 10 requests := make([]*Request, 0, requestNum) var i int32 for i = 0; i < requestNum; i++ { buf := make([]byte, 100, 100) l := fakeData("Hello", thrift.CALL, i+1, buf[0:0]) buf = buf[0:l] req := NewRequest(buf, false) req.Wait.Add(1) // 因为go routine可能还没有执行,代码就跑到最后面进行校验了 assert.Equal(t, i+1, req.Request.SeqId, "Request SeqId是否靠谱") requests = append(requests, req) } go func() { // 客户端代码 bc := NewBackendConn(addr, nil, "test", true) bc.currentSeqId = 10 // 准备发送数据 var i int32 for i = 0; i < requestNum; i++ { fmt.Println("Sending Request to Backend Conn", i) bc.PushBack(requests[i]) requests[i].Wait.Done() } // 需要等待数据返回? time.Sleep(time.Second * 2) }() go func() { // 服务器端代码 tran, err := transport.Accept() if err != nil { log.ErrorErrorf(err, "Error: %v\n", err) } assert.NoError(t, err) bt := NewTBufferedFramedTransport(tran, time.Microsecond*100, 2) // 在当前的这个t上读写数据 var i int32 for i = 0; i < requestNum; i++ { request, err := bt.ReadFrame() assert.NoError(t, err) req := NewRequest(request, false) assert.Equal(t, req.Request.SeqId, i+10) fmt.Printf("Server Got Request, and SeqNum OK, Id: %d, Frame Size: %d\n", i, len(request)) // 回写数据 bt.Write(request) bt.FlushBuffer(true) } tran.Close() }() fmt.Println("Requests Len: ", len(requests)) for idx, r := range requests { r.Wait.Wait() // r 原始的请求 req := NewRequest(r.Response.Data, false) log.Printf(Green("SeqMatch[%d]: Orig: %d, Return: %d\n"), idx, req.Request.SeqId, r.Request.SeqId) assert.Equal(t, req.Request.SeqId, r.Request.SeqId) } log.Println("OK") }
func RpcMain(binaryName string, serviceDesc string, configCheck ConfigCheck, serverFactory ServerFactorory, buildDate string, gitVersion string) { // 1. 准备解析参数 usage = fmt.Sprintf(usage, binaryName, binaryName) version := fmt.Sprintf("Version: %s\nBuildDate: %s\nDesc: %s\nAuthor: [email protected]", gitVersion, buildDate, serviceDesc) args, err := docopt.Parse(usage, nil, true, version, true) if err != nil { fmt.Println(err) os.Exit(1) } if s, ok := args["-V"].(bool); ok && s { fmt.Println(Green(version)) os.Exit(1) } // 这就是为什么 Codis 傻乎乎起一个 http server的目的 if s, ok := args["--profile-addr"].(string); ok && len(s) > 0 { go func() { log.Printf(Red("Profile Address: %s"), s) log.Println(http.ListenAndServe(s, nil)) }() } // 2. 解析Log相关的配置 log.SetLevel(log.LEVEL_INFO) var maxKeepDays int = 3 if s, ok := args["--log-keep-days"].(string); ok && s != "" { v, err := strconv.ParseInt(s, 10, 32) if err != nil { log.PanicErrorf(err, "invalid max log file keep days = %s", s) } maxKeepDays = int(v) } // set output log file if s, ok := args["-L"].(string); ok && s != "" { f, err := log.NewRollingFile(s, maxKeepDays) if err != nil { log.PanicErrorf(err, "open rolling log file failed: %s", s) } else { defer f.Close() log.StdLog = log.New(f, "") } } log.SetLevel(log.LEVEL_INFO) log.SetFlags(log.Flags() | log.Lshortfile) // set log level if s, ok := args["--log-level"].(string); ok && s != "" { SetLogLevel(s) } // 没有就没有 workDir, _ := args["--work-dir"].(string) codeUrlVersion, _ := args["--code-url-version"].(string) if len(workDir) == 0 { workDir, _ = os.Getwd() } log.Printf("WorkDir: %s, CodeUrl: %s, Wd: %s", workDir, codeUrlVersion) // 3. 
解析Config configFile := args["-c"].(string) conf, err := utils.LoadConf(configFile) if err != nil { log.PanicErrorf(err, "load config failed") } // 额外的配置信息 conf.WorkDir = workDir conf.CodeUrlVersion = codeUrlVersion if configCheck != nil { configCheck(conf) } else { log.Panic("No Config Check Given") } // 每次启动的时候都打印版本信息 log.Infof(Green("-----------------\n%s\n--------------------------------------------------------------------"), version) // 启动服务 server := serverFactory(conf) server.Run() }
func (s *BackServiceLB) run() { go func() { // 定时汇报当前的状态 for true { log.Printf(Green("[Report]: %s --> %d workers, coroutine: %d"), s.serviceName, s.Active(), runtime.NumGoroutine()) time.Sleep(time.Second * 10) } }() var transport thrift.TServerTransport var err error // 3. 读取后端服务的配置 isUnixDomain := false // 127.0.0.1:9999(以:区分不同的类型) if !strings.Contains(s.backendAddr, ":") { if FileExist(s.backendAddr) { os.Remove(s.backendAddr) } transport, err = NewTServerUnixDomain(s.backendAddr) isUnixDomain = true } else { transport, err = thrift.NewTServerSocket(s.backendAddr) } if err != nil { log.ErrorErrorf(err, "[%s]Server Socket Create Failed: %v", s.serviceName, err) panic("BackendAddr Invalid") } err = transport.Listen() if err != nil { log.ErrorErrorf(err, "[%s]Server Socket Open Failed: %v", s.serviceName, err) panic("Server Socket Open Failed") } // 和transport.open做的事情一样,如果Open没错,则Listen也不会有问题 log.Printf(Green("[%s]LB Backend Services listens at: %s"), s.serviceName, s.backendAddr) s.ch = make(chan thrift.TTransport, 4096) // 强制退出? TODO: Graceful退出 go func() { <-s.exitEvt log.Info(Red("Receive Exit Signals....")) transport.Interrupt() transport.Close() }() go func() { var backendAddr string for trans := range s.ch { // 为每个Connection建立一个Session socket, ok := trans.(SocketAddr) if ok { if isUnixDomain { backendAddr = s.backendAddr } else { backendAddr = socket.Addr().String() } conn := NewBackendConnLB(trans, s.serviceName, backendAddr, s, s.verbose) // 因为连接刚刚建立,可靠性还是挺高的,因此直接加入到列表中 s.activeConnsLock.Lock() conn.Index = len(s.activeConns) s.activeConns = append(s.activeConns, conn) s.activeConnsLock.Unlock() log.Printf(Green("%s --> %d workers"), s.serviceName, conn.Index) } else { panic("Invalid Socket Type") } } }() // Accept什么时候出错,出错之后如何处理呢? go func() { for { c, err := transport.Accept() if err != nil { return } else { s.ch <- c } } }() }
func (s *Session) Close() error { s.closed.Set(true) log.Printf(Red("Close Proxy Session")) return s.TBufferedFramedTransport.Close() }