Example #1
0
// extract unpacks one received pack held in bp, appends its decompressed body
// to the data file for the pack's "<ip>_<hour>" key, and appends the raw pack
// header (one per line) to a separate header file for the same key.
//
// Data is read from bp in this order: a 4-byte uvarint-encoded header length,
// the header bytes, then the zlib-compressed body. Failures are logged and the
// pack is dropped or partially written; nothing is returned to the caller.
func (f *fileOutputer) extract(bp *bytes.Buffer) {
	buf := make([]byte, 4)
	// A short read leaves trailing zero bytes; a bogus length is rejected by
	// the bounds check below or by the zlib reader.
	bp.Read(buf)

	l, _ := binary.Uvarint(buf)
	headerLen := int(l)
	// A declared header longer than what is left in the buffer means the pack
	// is truncated or corrupt; give up before allocating a buffer for it.
	if headerLen > bp.Len() {
		loglib.Error(fmt.Sprintf("pack header length %d exceeds remaining %d bytes", headerLen, bp.Len()))
		return
	}
	// Read the pack header.
	buf = make([]byte, headerLen)
	bp.Read(buf)
	header := tcp_pack.ParseHeader(buf)

	r, err := zlib.NewReader(bp)
	if err != nil {
		loglib.Error("zlib reader Error: " + err.Error())
		return
	}
	// Deferred so the reader is released on every exit path, including a panic
	// raised by one of the writers.
	defer r.Close()

	// A missing or malformed "lines" value is counted as 0 lines.
	lines, _ := strconv.Atoi(header["lines"])
	done := header["done"] == "1"
	f.ic.Add(header["ip"], header["hour"], header["id"], lines, done)

	writerKey := header["ip"] + "_" + header["hour"]
	fout := f.getWriter(f.writers, f.dataDir, writerKey)

	// Only the decompressed body goes into the data file.
	nn, err := io.Copy(fout, r)
	if err != nil {
		loglib.Warning(fmt.Sprintf("save %s_%s_%s error:%s, saved:%d", header["ip"], header["hour"], header["id"], err, nn))
	}

	// Keep a separate copy of the header, one per line, to make counting
	// packs easier.
	buf = append(buf, '\n')
	fout = f.getWriter(f.headerWriters, f.headerDir, writerKey)
	n, err := fout.Write(buf)
	if err != nil {
		loglib.Info(fmt.Sprintf("writer header %s %d %s", writerKey, n, err.Error()))
	}

	if done || time.Now().Unix() > f.checkTime.Unix() {
		// The finished hours reported by Check are not consumed here; the
		// call is kept because the original code made it on this path.
		f.ic.Check()
		f.closeWriters(f.writers)
		f.closeWriters(f.headerWriters)
		// time.Time is immutable: Add returns a new value, so the result must
		// be stored or the periodic check would fire on every pack once the
		// first deadline has passed.
		f.checkTime = time.Now().Add(2 * time.Minute)
	}
}
Example #2
0
// extract parses the pack header at the front of bp and opens a zlib reader
// over the compressed body that follows it.
//
// Data is read from bp in this order: a 4-byte uvarint-encoded header length,
// the header bytes, then the zlib-compressed body.
//
// Returns:
//   - r: reader over the decompressed body; nil whenever err is non-nil. The
//     caller is responsible for closing it.
//   - packId: "<ip>_<hour>_<id>" built from the header fields; filled in even
//     on error so callers can log it.
//   - date: the first 8 bytes of the header's "hour" field, used to pick the
//     per-day database; empty when that field is too short.
//   - err: the zlib error, or an error describing a malformed "hour" field.
func (this *MongoDbOutputer) extract(bp *bytes.Buffer) (r io.ReadCloser, packId string, date string, err error) {
	buf := make([]byte, 4)
	bp.Read(buf)

	l, _ := binary.Uvarint(buf)
	headerLen := int(l)
	// Read the pack header.
	buf = make([]byte, headerLen)
	bp.Read(buf)
	header := tcp_pack.ParseHeader(buf)

	hour := header["hour"]
	packId = fmt.Sprintf("%s_%s_%s", header["ip"], hour, header["id"])

	r, err = zlib.NewReader(bp)
	if err != nil {
		loglib.Error("zlib reader Error: " + err.Error())
	}

	if len(hour) >= 8 {
		date = hour[0:8] // used to split databases by day
		return r, packId, date, err
	}

	// Slicing a missing or short "hour" value would panic; report it as an
	// error instead. Any earlier zlib error takes precedence.
	loglib.Error(fmt.Sprintf("pack %s: invalid hour %q in header", packId, hour))
	if err == nil {
		// Do not hand back a reader together with an error: the caller
		// would have no reason to close it.
		r.Close()
		r = nil
		err = fmt.Errorf("invalid hour %q in pack header", hour)
	}
	return r, packId, date, err
}
Example #3
0
// runEtl consumes packs from e.buffer until that channel is closed. For each
// pack it records the pack with the integrity checker, appends the
// decompressed body to the data file for "<ip>_<hour>", and appends the raw
// header line to the matching header file. Whenever a pack is flagged done, or
// more than two minutes have passed since the last check, the keys of the
// hours reported by the integrity checker are handed to five doEtl goroutines
// through a buffered channel.
//
// spiderList, colsFile, hostsList and ipBlackList are passed through to doEtl
// unchanged.
func (e *etlOutputer) runEtl(spiderList string, colsFile string, hostsList string, ipBlackList string) {
	workers := new(sync.WaitGroup)
	finishedKeys := make(chan string, 100)
	defer func() {
		if p := recover(); p != nil {
			loglib.Error(fmt.Sprintf("runEtl() panic:%v", p))
		}

		e.ic.SaveStatus()
		e.closeWriters(e.writers)
		e.closeWriters(e.headerWriters)
		close(finishedKeys)
		// Wait for the ETL goroutines to finish.
		workers.Wait()
	}()

	for started := 0; started < 5; started++ {
		workers.Add(1)
		go e.doEtl(finishedKeys, e.dataDir, e.etlDir, e.etlDoneDir, e.etlFailDir, spiderList, colsFile, hostsList, ipBlackList, workers)
	}

	nextCheckTime := time.Now().Add(2 * time.Minute)
	// Ranging over the channel gives a clean shutdown: once the sender closes
	// it on exit, this loop ends too.
	for pack := range e.buffer {
		loglib.Info(fmt.Sprintf("pack in chan: %d", len(e.buffer)))
		src := &pack

		// 4-byte uvarint-encoded header length.
		lenPrefix := make([]byte, 4)
		src.Read(lenPrefix)
		rawLen, _ := binary.Uvarint(lenPrefix)

		// Pack header.
		headerBytes := make([]byte, int(rawLen))
		src.Read(headerBytes)
		header := tcp_pack.ParseHeader(headerBytes)

		body, err := zlib.NewReader(src)
		if err != nil {
			loglib.Error("zlib reader Error: " + err.Error())
			continue
		}

		lineCount, _ := strconv.Atoi(header["lines"])
		done := header["done"] == "1"
		e.ic.Add(header["ip"], header["hour"], header["id"], lineCount, done)

		writerKey := header["ip"] + "_" + header["hour"]

		// The data file receives only the decompressed body.
		dataOut := e.getWriter(e.writers, e.dataDir, writerKey)
		copied, err := io.Copy(dataOut, body)
		if err != nil {
			loglib.Warning(fmt.Sprintf("save %s_%s_%s error:%s, saved:%d", header["ip"], header["hour"], header["id"], err, copied))
		}

		// Keep a separate copy of the header, one per line, to make counting
		// packs easier.
		headerLine := append(headerBytes, '\n')
		headerOut := e.getWriter(e.headerWriters, e.headerDir, writerKey)
		written, err := headerOut.Write(headerLine)
		if err != nil {
			loglib.Info(fmt.Sprintf("writer header %s %d %s", writerKey, written, err.Error()))
		}

		// Also check every two minutes: if the done pack arrives before the
		// other packs, the check might otherwise wait until the next hour.
		if done || time.Now().Unix() > nextCheckTime.Unix() {
			finished, _ := e.ic.Check()
			for ip, hours := range finished {
				for _, hour := range hours {
					loglib.Info(fmt.Sprintf("fkeychan %d", len(finishedKeys)))
					finishedKeys <- ip + "_" + hour
				}
			}
			e.closeWriters(e.writers)
			e.closeWriters(e.headerWriters)
			e.ic.SaveStatus()
			nextCheckTime = time.Now().Add(2 * time.Minute)
		}

		body.Close()
	}
}
Example #4
0
// handleConnnection serves one sender connection until it fails, times out, or
// a quit signal arrives. It repeatedly reads a pack (4-byte uvarint header
// length, header, then header.PackLen bytes of body), answers "ok" once the
// body is complete, and forwards the pack on t.buffer unless hasAppeared
// reports it as a duplicate (re-pull packs are always forwarded).
//
// conn is closed and wg.Done is called when the function returns; panics are
// recovered and logged.
func (t *TcpReceiver) handleConnnection(conn net.Conn, wg *sync.WaitGroup) {
	defer func() {
		if err := recover(); err != nil {
			loglib.Error(fmt.Sprintf("tcp receiver connection panic:%v", err))
		}
		conn.Close()
		wg.Done()
	}()
	/*
	   Marks whether the handler may exit right away after a quit signal.
	   As long as the current pack has not been fully received when the
	   signal arrives, exiting is safe: the sender caches the pack and
	   retransmits it later. Once a pack is fully received the handler must
	   not exit directly, otherwise the pack may already have been handed to
	   the next stage while the sender is told that sending failed.
	*/
	// NOTE(review): quit is written by the signal-handler goroutine and read
	// by the loop below without synchronization — consider an atomic flag.
	var quit = false // whether the handler should stop

	go lib.HandleQuitSignal(func() {
		// Close the connection so the loop does not stay blocked on network I/O.
		conn.Close()
		quit = true
	})

	request := make([]byte, 512*1024) // 512 KiB read buffer for the pack body

	var packLen int = 0
	currLen := 0
	var b = new(bytes.Buffer)
	var content = new(bytes.Buffer)
	inAddr := conn.RemoteAddr().String()
	parts := strings.Split(inAddr, ":")
	inIp := parts[0]

	packId := "unkown"

	var routeInfo map[string]string
	var rePull = false // re-pull packs skip the duplicate check

	loglib.Info("incoming: " + inAddr)

outer:
	for !quit {

		st := time.Now()
		if packLen == 0 {
			conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
			time1 := time.Now() // timing checkpoint
			// Read the length of the pack header. io.ReadFull is required
			// because a single conn.Read may return fewer bytes than asked
			// for, which would corrupt the framing.
			buf := make([]byte, 4)
			_, err := io.ReadFull(conn, buf)
			if err != nil {
				loglib.Warning(fmt.Sprintf("conn:%s, get header len, tcp receiver read error:%s, elapse:%s", inAddr, err.Error(), time.Now().Sub(time1)))
				break
			}
			l, _ := binary.Uvarint(buf)
			headerLen := int(l)
			// Read the pack header itself, again in full.
			headerBuf := make([]byte, headerLen)
			time2 := time.Now()
			_, err = io.ReadFull(conn, headerBuf)
			if err != nil {
				loglib.Warning(fmt.Sprintf("conn:%s, get header, tcp receiver read error:%s, elapse:%s", inAddr, err.Error(), time.Now().Sub(time2)))
				break
			}

			// Is this a re-pull?
			route0 := tcp_pack.ParseHeader(headerBuf)
			if v, ok := route0["repull"]; ok && v == "1" {
				rePull = true
			} else {
				rePull = false
			}

			buf = append(buf, headerBuf...)
			header, _, err := tcp_pack.ExtractHeader(buf)
			if err != nil {
				loglib.Error("wrong format header " + string(headerBuf) + " " + err.Error())
				conn.Write([]byte("wrong header"))
				break
			}

			packId = tcp_pack.GetPackId(buf)
			packLen = header.PackLen
			currLen = 0
			routeInfo = make(map[string]string)
			b = new(bytes.Buffer)
			content = new(bytes.Buffer)

			loglib.Info(fmt.Sprintf("conn:%s, start receive pack %s, pack len:%d, header len:%d, header elapse:%s", inAddr, packId, packLen, headerLen, time.Now().Sub(time1)))
			b.Write(buf)

			routeInfo["ip"] = lib.GetIp()
			routeInfo["stage"] = "tcp recv"
			routeInfo["st"] = st.Format("2006-01-02 15:04:05.000")
		}
		// Timeout for reading the pack body.
		conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
		time3 := time.Now()
		// Keep reading until the whole body has arrived.
		for currLen < packLen {
			requestLen, err := conn.Read(request)
			if requestLen == 0 || err != nil {
				// The sender retransmits, so the partial pack can be dropped.
				packLen = 0 // reset so that a new pack would be read

				ed := time.Now()
				// err may be nil here (zero-length read), so it is formatted
				// with %v rather than through err.Error().
				loglib.Warning(fmt.Sprintf("conn:%s, not full! ip:%s, packid:%s, received:%d, end recv:%s, elapse:%s, body elapse:%s, error:%v", inAddr, inIp, packId, currLen, ed, ed.Sub(st), ed.Sub(time3), err))
				break outer // connection failure: leave the outer loop
			}
			currLen += requestLen
			content.Write(request[:requestLen])
		}
		if packLen > 0 && currLen >= packLen {
			// Acknowledge as soon as the pack is complete.
			_, err := conn.Write([]byte("ok"))
			if err != nil {
				loglib.Warning(fmt.Sprintf("ip:%s, packid:%s received, but response back error:%s", inIp, packId, err.Error()))
			} else {
				loglib.Info(fmt.Sprintf("conn:%s, response to packid:%s", inAddr, packId))
			}
			// Skip packs that were already received (re-pulls excepted).
			appeared, ok, code := t.hasAppeared(content)
			if !ok || rePull {
				ed := time.Now()
				routeInfo["ed"] = ed.Format("2006-01-02 15:04:05.000")
				routeInfo["elapse"] = ed.Sub(st).String()
				b.Write(content.Bytes())
				vbytes := tcp_pack.Packing(b.Bytes(), routeInfo, true)
				b = bytes.NewBuffer(vbytes)
				t.buffer <- *b
				packAppear := PackAppear{time.Now().Unix(), packId}
				t.mutex.Lock()
				t.footPrint[code] = packAppear // a crash was once observed here
				t.mutex.Unlock()

				loglib.Info(fmt.Sprintf("conn:%s, finish ip:%s, packid:%s, repull:%v, received:%d, elapse:%s, body elapse:%s", inAddr, inIp, packId, rePull, currLen, ed.Sub(st), ed.Sub(time3)))
			} else {
				loglib.Info(fmt.Sprintf("conn:%s, pack %s repeat %s already appear at %s", inAddr, packId, appeared.Id, time.Unix(appeared.Time, 0)))
			}
			packLen = 0
		}

	}
	loglib.Info("conn finish: " + inAddr)
}