func (f *fileOutputer) extract(bp *bytes.Buffer) {
	buf := make([]byte, 4)
	bp.Read(buf)
	l, _ := binary.Uvarint(buf)
	headerLen := int(l)
	//get pack header
	buf = make([]byte, headerLen)
	bp.Read(buf)
	header := tcp_pack.ParseHeader(buf)

	r, err := zlib.NewReader(bp)
	if err != nil {
		loglib.Error("zlib reader Error: " + err.Error())
	} else {
		lines, _ := strconv.Atoi(header["lines"])
		done := false
		if header["done"] == "1" {
			done = true
		}
		f.ic.Add(header["ip"], header["hour"], header["id"], lines, done)

		writerKey := header["ip"] + "_" + header["hour"]
		fout := f.getWriter(f.writers, f.dataDir, writerKey)

		// write the header info only at the head and the tail, to save disk space
		buf = append(buf, '\n')
		//fout.Write(buf)
		nn, err := io.Copy(fout, r)
		if err != nil {
			loglib.Warning(fmt.Sprintf("save %s_%s_%s error:%s, saved:%d", header["ip"], header["hour"], header["id"], err, nn))
		}
		//fout.Write(buf)

		// keep a separate copy of the header to make record counting easier
		fout = f.getWriter(f.headerWriters, f.headerDir, writerKey)
		n, err := fout.Write(buf)
		if err != nil {
			loglib.Info(fmt.Sprintf("write header %s %d %s", writerKey, n, err.Error()))
		}

		if done || time.Now().Unix() > f.checkTime.Unix() {
			hourFinish, _ := f.ic.Check()
			for ip, hours := range hourFinish {
				for _, hour := range hours {
					// the finished ip_hour keys are not handed off here, unlike runEtl
					writerKey = ip + "_" + hour
				}
			}
			f.closeWriters(f.writers)
			f.closeWriters(f.headerWriters)
			// schedule the next periodic check
			f.checkTime = time.Now().Add(2 * time.Minute)
		}
		r.Close()
	}
}
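// extract above routes pack bodies through getWriter, which the code uses as a
// lazily-populated cache of per-"ip_hour" writers that closeWriters later flushes
// and closes. The two functions below are a hypothetical sketch of one plausible
// shape for that cache, assuming plain append-mode files named after the key;
// the real getWriter/closeWriters live elsewhere in the repo and may differ.
func getWriterSketch(writers map[string]*os.File, dir string, key string) *os.File {
	if f, ok := writers[key]; ok {
		return f
	}
	// open (or create) an append-mode file for this ip_hour key
	f, err := os.OpenFile(filepath.Join(dir, key), os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
	if err != nil {
		loglib.Error("open writer for " + key + " error: " + err.Error())
		return nil
	}
	writers[key] = f
	return f
}

func closeWritersSketch(writers map[string]*os.File) {
	for key, f := range writers {
		f.Close()
		delete(writers, key)
	}
}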
func (this *MongoDbOutputer) extract(bp *bytes.Buffer) (r io.ReadCloser, packId string, date string, err error) {
	buf := make([]byte, 4)
	bp.Read(buf)
	l, _ := binary.Uvarint(buf)
	headerLen := int(l)
	//get pack header
	buf = make([]byte, headerLen)
	bp.Read(buf)
	header := tcp_pack.ParseHeader(buf)

	r, err = zlib.NewReader(bp)
	if err != nil {
		loglib.Error("zlib reader Error: " + err.Error())
	}
	date = header["hour"][0:8] // used to split databases by day
	packId = fmt.Sprintf("%s_%s_%s", header["ip"], header["hour"], header["id"])
	return
}
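// Both extract functions above (and the inline unpack in runEtl below) undo the
// same framing: a 4-byte prefix decoded with binary.Uvarint giving the header
// length, the pack header parsed by tcp_pack.ParseHeader, then a zlib stream
// holding the pack body. buildPackSketch is a minimal sketch of the matching
// pack side, using only packages the surrounding code already imports; the
// header bytes passed in are a stand-in, since the real header layout is
// whatever tcp_pack produces.
func buildPackSketch(header []byte, body []byte) *bytes.Buffer {
	bp := new(bytes.Buffer)

	// fixed 4-byte prefix holding the header length as a uvarint,
	// mirroring the 4-byte read + binary.Uvarint on the unpack side
	// (assumes the header is small enough to fit the 4-byte prefix)
	lenBuf := make([]byte, 4)
	binary.PutUvarint(lenBuf, uint64(len(header)))
	bp.Write(lenBuf)

	bp.Write(header)

	// the compressed body follows immediately after the header
	zw := zlib.NewWriter(bp)
	zw.Write(body)
	zw.Close()
	return bp
}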
func (e *etlOutputer) runEtl(spiderList string, colsFile string, hostsList string, ipBlackList string) {
	wg := &sync.WaitGroup{}
	fkeyChan := make(chan string, 100)
	defer func() {
		if err := recover(); err != nil {
			loglib.Error(fmt.Sprintf("runEtl() panic:%v", err))
		}
		e.ic.SaveStatus()
		e.closeWriters(e.writers)
		e.closeWriters(e.headerWriters)
		close(fkeyChan)
		// wait for the etl goroutines to finish
		wg.Wait()
	}()
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go e.doEtl(fkeyChan, e.dataDir, e.etlDir, e.etlDoneDir, e.etlFailDir, spiderList, colsFile, hostsList, ipBlackList, wg)
	}

	nextCheckTime := time.Now().Add(2 * time.Minute)
	// iterate with range so shutdown is safe: once the sender closes the channel on exit, this loop exits too
	for b := range e.buffer {
		loglib.Info(fmt.Sprintf("pack in chan: %d", len(e.buffer)))
		buf := make([]byte, 4)
		bp := &b
		bp.Read(buf)
		l, _ := binary.Uvarint(buf)
		headerLen := int(l)
		//get pack header
		buf = make([]byte, headerLen)
		bp.Read(buf)
		header := tcp_pack.ParseHeader(buf)

		r, err := zlib.NewReader(bp)
		if err != nil {
			loglib.Error("zlib reader Error: " + err.Error())
		} else {
			lines, _ := strconv.Atoi(header["lines"])
			done := false
			if header["done"] == "1" {
				done = true
			}
			e.ic.Add(header["ip"], header["hour"], header["id"], lines, done)

			writerKey := header["ip"] + "_" + header["hour"]
			fout := e.getWriter(e.writers, e.dataDir, writerKey)
			buf = append(buf, '\n')
			/*
				// write the header info only at the head and the tail, to save disk space
				n, err := fout.Write(buf)
				if err != nil {
					loglib.Info(fmt.Sprintf("write %s %d %s", writerKey, n, err.Error()))
				}
			*/
			nn, err := io.Copy(fout, r)
			if err != nil {
				loglib.Warning(fmt.Sprintf("save %s_%s_%s error:%s, saved:%d", header["ip"], header["hour"], header["id"], err, nn))
			}
			//fout.Write(buf)

			// keep a separate copy of the header to make record counting easier
			fout = e.getWriter(e.headerWriters, e.headerDir, writerKey)
			n, err := fout.Write(buf)
			if err != nil {
				loglib.Info(fmt.Sprintf("write header %s %d %s", writerKey, n, err.Error()))
			}

			// also check every 2 minutes: if the "done" pack arrives before the others,
			// waiting on packs alone could delay the check until the next hour
			if done || time.Now().Unix() > nextCheckTime.Unix() {
				hourFinish, _ := e.ic.Check()
				for ip, hours := range hourFinish {
					for _, hour := range hours {
						writerKey = ip + "_" + hour
						loglib.Info(fmt.Sprintf("fkeychan %d", len(fkeyChan)))
						fkeyChan <- writerKey
					}
				}
				e.closeWriters(e.writers)
				e.closeWriters(e.headerWriters)
				e.ic.SaveStatus()
				nextCheckTime = time.Now().Add(2 * time.Minute)
			}
			r.Close()
		}
	}
}
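// runEtl fans finished ip_hour keys out to five doEtl goroutines over fkeyChan
// and shuts them down by closing the channel and waiting on the WaitGroup in its
// deferred function. etlWorkerPoolSketch below shows that fan-out/close/wait
// pattern in isolation; the inline worker and the literal keys are hypothetical
// stand-ins for doEtl and real writer keys.
func etlWorkerPoolSketch() {
	wg := &sync.WaitGroup{}
	keys := make(chan string, 100)

	// stand-in for doEtl: range exits once the channel is closed and drained
	etlWorker := func(id int, keys <-chan string, wg *sync.WaitGroup) {
		defer wg.Done()
		for key := range keys {
			loglib.Info(fmt.Sprintf("worker %d processing %s", id, key))
		}
	}

	// fixed pool of workers, mirroring the five doEtl goroutines
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go etlWorker(i, keys, wg)
	}

	// producer side: push keys, then close the channel and wait,
	// just as runEtl's deferred function does on shutdown
	keys <- "10.0.0.1_2015010112"
	keys <- "10.0.0.2_2015010112"
	close(keys)
	wg.Wait()
}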
func (t *TcpReceiver) handleConnnection(conn net.Conn, wg *sync.WaitGroup) {
	defer func() {
		if err := recover(); err != nil {
			loglib.Error(fmt.Sprintf("tcp receiver connection panic:%v", err))
		}
		conn.Close()
		wg.Done()
	}()
	/*
		quit marks whether the goroutine may exit right after a quit signal.
		If the current pack has not been fully received when the signal arrives,
		exiting is safe: the sender caches the pack and retransmits it later.
		Once the pack has been fully received we must not exit immediately,
		because it may already have been handed to the next stage while the
		sender is told the send failed.
	*/
	var quit = false // whether to exit
	go lib.HandleQuitSignal(func() {
		// close the connection so we do not block on network io
		conn.Close()
		quit = true
	})

	request := make([]byte, 512*1024) // 512KB read buffer
	var packLen int = 0
	currLen := 0
	var b = new(bytes.Buffer)
	var content = new(bytes.Buffer)

	inAddr := conn.RemoteAddr().String()
	parts := strings.Split(inAddr, ":")
	inIp := parts[0]
	packId := "unknown"
	var routeInfo map[string]string
	var rePull = false // whether this pack is a re-pull; re-pulled packs skip the duplicate check
	loglib.Info("incoming: " + inAddr)
outer:
	for !quit {
		st := time.Now()
		if packLen == 0 {
			conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
			time1 := time.Now() // timing checkpoint
			// read zlib pack header length
			buf := make([]byte, 4)
			_, err := conn.Read(buf)
			if err != nil {
				loglib.Warning(fmt.Sprintf("conn:%s, get header len, tcp receiver read error:%s, elapse:%s", inAddr, err.Error(), time.Now().Sub(time1)))
				break
			}
			l, _ := binary.Uvarint(buf)
			headerLen := int(l)
			//get pack header
			headerBuf := make([]byte, headerLen)
			time2 := time.Now()
			_, err = conn.Read(headerBuf)
			if err != nil {
				loglib.Warning(fmt.Sprintf("conn:%s, get header, tcp receiver read error:%s, elapse:%s", inAddr, err.Error(), time.Now().Sub(time2)))
				break
			}
			// is this a re-pull?
			route0 := tcp_pack.ParseHeader(headerBuf)
			if v, ok := route0["repull"]; ok && v == "1" {
				rePull = true
			} else {
				rePull = false
			}

			buf = append(buf, headerBuf...)
			header, _, err := tcp_pack.ExtractHeader(buf)
			if err != nil {
				loglib.Error("wrong format header " + string(headerBuf) + " " + err.Error())
				conn.Write([]byte("wrong header"))
				break
			}

			packId = tcp_pack.GetPackId(buf)
			packLen = header.PackLen
			currLen = 0
			routeInfo = make(map[string]string)
			b = new(bytes.Buffer)
			content = new(bytes.Buffer)

			loglib.Info(fmt.Sprintf("conn:%s, start receive pack %s, pack len:%d, header len:%d, header elapse:%s", inAddr, packId, packLen, headerLen, time.Now().Sub(time1)))
			b.Write(buf)

			routeInfo["ip"] = lib.GetIp()
			routeInfo["stage"] = "tcp recv"
			routeInfo["st"] = st.Format("2006-01-02 15:04:05.000")
		}
		// read deadline for the pack body
		conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
		time3 := time.Now()
		//read enough bytes
		for currLen < packLen {
			requestLen, err := conn.Read(request)
			if requestLen == 0 || err != nil {
				// the sender retransmits, so this incomplete pack can be discarded
				packLen = 0 // reset so a new pack can be read
				ed := time.Now()
				loglib.Warning(fmt.Sprintf("conn:%s, not full! ip:%s, packid:%s, received:%d, end recv:%s, elapse:%s, body elapse:%s, error:%v", inAddr, inIp, packId, currLen, ed, ed.Sub(st), ed.Sub(time3), err))
				break outer // connection error, leave the outer loop
			}
			currLen += requestLen
			content.Write(request[:requestLen])
		}

		if packLen > 0 && currLen >= packLen {
			// acknowledge as soon as the pack is fully received
			_, err := conn.Write([]byte("ok"))
			if err != nil {
				loglib.Warning(fmt.Sprintf("ip:%s, packid:%s received, but response back error:%s", inIp, packId, err.Error()))
			} else {
				loglib.Info(fmt.Sprintf("conn:%s, response to packid:%s", inAddr, packId))
			}

			// drop duplicate packs (re-pulls are exempt)
			appeared, ok, code := t.hasAppeared(content)
			if !ok || rePull {
				ed := time.Now()
				routeInfo["ed"] = ed.Format("2006-01-02 15:04:05.000")
				routeInfo["elapse"] = ed.Sub(st).String()
				b.Write(content.Bytes())
				vbytes := tcp_pack.Packing(b.Bytes(), routeInfo, true)
				b = bytes.NewBuffer(vbytes)

				t.buffer <- *b

				packAppear := PackAppear{time.Now().Unix(), packId}
				t.mutex.Lock()
				t.footPrint[code] = packAppear // this assignment panicked here once
				t.mutex.Unlock()

				loglib.Info(fmt.Sprintf("conn:%s, finish ip:%s, packid:%s, repull:%v, received:%d, elapse:%s, body elapse:%s", inAddr, inIp, packId, rePull, currLen, ed.Sub(st), ed.Sub(time3)))
			} else {
				loglib.Info(fmt.Sprintf("conn:%s, pack %s repeat %s already appear at %s", inAddr, packId, appeared.Id, time.Unix(appeared.Time, 0)))
			}
			packLen = 0
		}
	}
	loglib.Info("conn finish: " + inAddr)
}
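// handleConnnection implies a simple request/response protocol: the sender writes
// one framed pack (length prefix + header + compressed body) and then waits for a
// short text reply, "ok" on success or "wrong header" on a malformed header.
// sendPackSketch below is a hypothetical sketch of that sender-side exchange only;
// the real sender, with its caching and retransmission on failure, is not part of
// this section.
func sendPackSketch(conn net.Conn, pack []byte) error {
	// write the whole frame; the receiver accumulates bytes until packLen is reached
	if _, err := conn.Write(pack); err != nil {
		return err
	}

	// wait for the receiver's acknowledgement, mirroring its 5-minute deadlines
	conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
	reply := make([]byte, 16)
	n, err := conn.Read(reply)
	if err != nil {
		return err
	}
	if string(reply[:n]) != "ok" {
		return fmt.Errorf("receiver rejected pack: %s", reply[:n])
	}
	return nil
}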