Пример #1
0
// reconnect tries to establish a fresh connection and store it in sc.conn.
//
// While the failure counter is below sc.max_try_times it dials
// sc.currentAddr: a failure bumps the counter, a success stores the new
// connection and resets the counter. Once the limit is reached, the current
// and backup addresses are swapped, the counter is reset, and a single
// attempt is made against the new current address.
//
// The conn parameter is not used by the body.
func (sc *SingleConnection) reconnect(conn *net.TCPConn) {
	if sc.failedTimes >= sc.max_try_times {
		// Too many consecutive failures: switch over to the backup address.
		sc.currentAddr, sc.bakAddr = sc.bakAddr, sc.currentAddr
		sc.failedTimes = 0
		loglib.Warning("try bakup address:" + sc.currentAddr)
		if fresh, err := createSingleConnection(sc.currentAddr); err == nil {
			loglib.Warning("use bakup address:" + sc.currentAddr)
			sc.conn = fresh
		}
		return
	}

	fresh, err := createSingleConnection(sc.currentAddr)
	if err != nil {
		sc.failedTimes++
		return
	}
	sc.conn = fresh
	sc.failedTimes = 0
}
Пример #2
0
// doEtl is an ETL worker: it consumes file keys from fkeyChan until the
// channel is closed, runs the ETL pipeline over logDataDir/<fkey>, and then
// drops an empty tag file named <fkey> into etlDoneDir (ParseFile returned no
// error) or etlFailDir (ParseFile returned an error).
//
// wg.Done is always called on exit; a panic inside the worker is recovered
// and logged, which also ends the worker.
func (e *etlOutputer) doEtl(fkeyChan chan string, logDataDir string, etlDir string, etlDoneDir string, etlFailDir string, spiderList string, colsFile string, hostsList string, ipBlackList string, wg *sync.WaitGroup) {
	defer func() {
		if r := recover(); r != nil {
			loglib.Error(fmt.Sprintf("doEtl() panic:%v", r))
		}

		wg.Done()
	}()

	// mark creates the empty tag file dir/fkey. It tries up to 5 times to
	// raise the chance that the tag is actually written, logging okMsg on
	// the first success and a warning prefixed with warnPrefix on each
	// failed attempt.
	mark := func(dir, fkey, okMsg, warnPrefix string) {
		tagPath := filepath.Join(dir, fkey)
		for attempt := 0; attempt < 5; attempt++ {
			fd, err := os.Create(tagPath)
			if err != nil {
				loglib.Warning(warnPrefix + " failed! error: " + err.Error())
				continue
			}
			fd.Close()
			loglib.Info(okMsg)
			return
		}
	}

	loglib.Info("etl routine start")
	for fkey := range fkeyChan {
		saver := etl.NewFileSaver(colsFile, etlDir, fkey)
		dispatcher := etl.NewDispatcher(saver, 6, hostsList, ipBlackList)
		g := etl.NewGlobalHao123(spiderList, 100, 200, 8, dispatcher)
		go g.Start(false)

		fname := filepath.Join(logDataDir, fkey)
		loglib.Info("start etl for " + fname)

		parseErr := g.ParseFile(fname)
		g.Wait()

		if parseErr != nil {
			mark(etlFailDir, fkey, "failed etl for "+fname, "mark etl fail for "+fname)
			continue
		}
		mark(etlDoneDir, fkey, "finish etl for "+fname, "mark etl done for "+fname)
	}
	loglib.Info("etl routine finish")
}
Пример #3
0
// extract unpacks one received pack from bp and persists it.
//
// Layout read from bp: a 4-byte uvarint header length, the header bytes
// (parsed with tcp_pack.ParseHeader), then a zlib stream. The decompressed
// stream is copied to the data writer keyed by "<ip>_<hour>", and the raw
// header plus a trailing newline is written to the matching header writer so
// counts can be checked separately.
//
// The pack is registered with f.ic. When the header marks the pack as done,
// or f.checkTime has passed, f.ic.Check is run, all writers are closed, and
// the next check is scheduled two minutes from now.
//
// Malformed packs (short length prefix, invalid or oversized header length,
// bad zlib stream) are logged and dropped.
func (f *fileOutputer) extract(bp *bytes.Buffer) {
	// Length prefix: 4 bytes holding a uvarint.
	lenBuf := make([]byte, 4)
	if _, err := io.ReadFull(bp, lenBuf); err != nil {
		loglib.Error("read pack header length error: " + err.Error())
		return
	}
	l, used := binary.Uvarint(lenBuf)
	headerLen := int(l)
	// used <= 0 means the prefix is not a valid uvarint; a header longer than
	// the remaining data means the pack is truncated.
	if used <= 0 || headerLen > bp.Len() {
		loglib.Error(fmt.Sprintf("invalid pack header length:%d, remaining:%d", headerLen, bp.Len()))
		return
	}

	// Pack header.
	buf := make([]byte, headerLen)
	if _, err := io.ReadFull(bp, buf); err != nil {
		loglib.Error("read pack header error: " + err.Error())
		return
	}
	header := tcp_pack.ParseHeader(buf)

	r, err := zlib.NewReader(bp)
	if err != nil {
		loglib.Error("zlib reader Error: " + err.Error())
		return
	}
	defer r.Close()

	// A missing or non-numeric "lines" field is counted as 0.
	lines, _ := strconv.Atoi(header["lines"])
	done := header["done"] == "1"
	f.ic.Add(header["ip"], header["hour"], header["id"], lines, done)

	writerKey := header["ip"] + "_" + header["hour"]
	fout := f.getWriter(f.writers, f.dataDir, writerKey)

	// Only the decompressed body goes into the data file.
	nn, err := io.Copy(fout, r)
	if err != nil {
		loglib.Warning(fmt.Sprintf("save %s_%s_%s error:%s, saved:%d", header["ip"], header["hour"], header["id"], err, nn))
	}

	// Keep a separate copy of the header, one per line, to make counting easy.
	buf = append(buf, '\n')
	fout = f.getWriter(f.headerWriters, f.headerDir, writerKey)
	n, err := fout.Write(buf)
	if err != nil {
		loglib.Info(fmt.Sprintf("writer header %s %d %s", writerKey, n, err.Error()))
	}

	if done || time.Now().Unix() > f.checkTime.Unix() {
		// The result of Check is not needed here; the call is kept for
		// whatever bookkeeping it performs.
		f.ic.Check()
		f.closeWriters(f.writers)
		f.closeWriters(f.headerWriters)
		// time.Time.Add returns a new value; it must be stored, otherwise
		// the check time never advances.
		f.checkTime = time.Now().Add(2 * time.Minute)
	}
}
Пример #4
0
// sendData writes data to conn and waits for the receiver's answer.
//
// It returns true when data is empty or when the peer answers exactly "ok".
// It returns false when conn is nil, when the write fails, or when the
// answer is anything else (including "wrong header" and a failed read).
// The write uses a 5-minute deadline and the answer read an 8-minute one.
func (s Sender) sendData(data []byte, conn *net.TCPConn) bool {
	switch {
	case len(data) == 0:
		return true
	case conn == nil:
		return false
	}

	begin := time.Now()
	packId := tcp_pack.GetPackId(data)

	// Bound the whole send with a timeout.
	conn.SetDeadline(time.Now().Add(5 * time.Minute))
	loglib.Info(fmt.Sprintf("sender%d start sending pack:%s length:%d", s.id, packId, len(data)))
	written, writeErr := conn.Write(data)
	finish := time.Now()
	loglib.Info(fmt.Sprintf("sender%d end sending pack:%s length:%d elapse:%s", s.id, packId, written, finish.Sub(begin)))

	lib.CheckError(writeErr)

	// If the write failed there is no point waiting for an answer.
	if writeErr != nil {
		loglib.Warning(fmt.Sprintf("write pack %s error:%s", packId, writeErr.Error()))
		return false
	}

	// Bound the wait for the answer with a timeout.
	conn.SetReadDeadline(time.Now().Add(8 * time.Minute))
	readStart := time.Now()
	reply := make([]byte, 128)
	count, readErr := conn.Read(reply)
	if readErr != nil {
		loglib.Info(fmt.Sprintf("sender%d get anwser data len:%d for pack:%s elapse:%s, error:%s", s.id, count, packId, time.Since(readStart), readErr.Error()))
	} else {
		loglib.Info(fmt.Sprintf("sender%d get anwser data len:%d for pack:%s elapse:%s", s.id, count, packId, time.Since(readStart)))
	}

	switch string(reply[:count]) {
	case "ok":
		// Delivered.
		return true
	case "wrong header":
		// The receiver rejected the pack header.
		loglib.Info(packId + " has wrong header, retry later!")
		return false
	default:
		// Any other answer counts as a failed send.
		return false
	}
}
Пример #5
0
// loadFootPrint reads the JSON footprint file fname into a map.
//
// The returned map is never nil. It is empty when the file does not exist or
// cannot be read; when the JSON cannot be decoded the map is returned in
// whatever state json.Unmarshal left it. Each failure is logged.
func (t *TcpReceiver) loadFootPrint(fname string) map[string]PackAppear {
	footprints := make(map[string]PackAppear)

	if !lib.FileExists(fname) {
		loglib.Warning("footprint file " + fname + " not found!")
		return footprints
	}

	raw, err := ioutil.ReadFile(fname)
	if err != nil {
		loglib.Error("read footprint file error:" + err.Error())
		return footprints
	}

	if err = json.Unmarshal(raw, &footprints); err != nil {
		loglib.Error("unmarshal footprint error:" + err.Error())
		return footprints
	}

	loglib.Info("load footprint success !")
	return footprints
}
Пример #6
0
// writeToFile saves the pack held in data to a new cache file whose name
// comes from createFileName(s.id). On success the file name is appended to
// fileList; on failure the error is logged and passed to lib.CheckError.
func (s *Sender) writeToFile(data bytes.Buffer) {
	filename := createFileName(s.id)

	// Create the file up front so a creation failure is reported through
	// lib.CheckError. The handle is closed right away: the content is written
	// by ioutil.WriteFile below, and leaving the handle open would leak one
	// file descriptor per call.
	fd, err := os.Create(filename)
	lib.CheckError(err)
	if err == nil {
		fd.Close()
	}

	d := data.Bytes()

	packId := tcp_pack.GetPackId(d)

	loglib.Info(fmt.Sprintf("sender%d save pack %s to file %s len:%d", s.id, packId, filename, len(d)))
	err = ioutil.WriteFile(filename, d, 0666)
	if err != nil {
		loglib.Warning("write to file " + filename + " error:" + err.Error())
		lib.CheckError(err)
		return
	}

	// Remember the cache file for later processing.
	fileList.PushBack(filename)
}
Пример #7
0
// runEtl is the main loop of the ETL outputer.
//
// It starts 5 doEtl workers fed through an internal channel of file keys,
// then consumes packs from e.buffer until that channel is closed. Each pack
// is a 4-byte uvarint header length, the header, and a zlib stream: the
// decompressed stream is written to the data writer keyed by "<ip>_<hour>"
// and the header (plus newline) to the matching header writer.
//
// When a pack is marked done, or at least every two minutes, e.ic.Check is
// consulted and every finished "<ip>_<hour>" key is handed to the workers;
// writers are then closed and the status saved.
//
// On exit (including a recovered panic) the status is saved, writers are
// closed, the key channel is closed, and the workers are awaited.
func (e *etlOutputer) runEtl(spiderList string, colsFile string, hostsList string, ipBlackList string) {
	wg := &sync.WaitGroup{}
	fkeyChan := make(chan string, 100)
	defer func() {
		if r := recover(); r != nil {
			loglib.Error(fmt.Sprintf("runEtl() panic:%v", r))
		}

		e.ic.SaveStatus()
		e.closeWriters(e.writers)
		e.closeWriters(e.headerWriters)
		close(fkeyChan)
		// Wait for the etl workers to drain the channel and finish.
		wg.Wait()
	}()

	for worker := 0; worker < 5; worker++ {
		wg.Add(1)
		go e.doEtl(fkeyChan, e.dataDir, e.etlDir, e.etlDoneDir, e.etlFailDir, spiderList, colsFile, hostsList, ipBlackList, wg)
	}

	nextCheckTime := time.Now().Add(2 * time.Minute)
	// Ranging over the channel gives a clean shutdown: once the producer
	// closes e.buffer the loop ends.
	for pack := range e.buffer {
		loglib.Info(fmt.Sprintf("pack in chan: %d", len(e.buffer)))
		bp := &pack

		// Length prefix, then the pack header itself.
		lenBuf := make([]byte, 4)
		bp.Read(lenBuf)
		l, _ := binary.Uvarint(lenBuf)
		headerBuf := make([]byte, int(l))
		bp.Read(headerBuf)
		header := tcp_pack.ParseHeader(headerBuf)

		r, err := zlib.NewReader(bp)
		if err != nil {
			loglib.Error("zlib reader Error: " + err.Error())
			continue
		}

		lines, _ := strconv.Atoi(header["lines"])
		done := header["done"] == "1"
		e.ic.Add(header["ip"], header["hour"], header["id"], lines, done)

		writerKey := header["ip"] + "_" + header["hour"]
		fout := e.getWriter(e.writers, e.dataDir, writerKey)

		headerBuf = append(headerBuf, '\n')
		nn, err := io.Copy(fout, r)
		if err != nil {
			loglib.Warning(fmt.Sprintf("save %s_%s_%s error:%s, saved:%d", header["ip"], header["hour"], header["id"], err, nn))
		}

		// Keep a separate copy of the header to make counting easy.
		fout = e.getWriter(e.headerWriters, e.headerDir, writerKey)
		n, err := fout.Write(headerBuf)
		if err != nil {
			loglib.Info(fmt.Sprintf("writer header %s %d %s", writerKey, n, err.Error()))
		}

		// Also check every 2 minutes: if the done pack arrives before the
		// other packs, the hour might otherwise not be checked until the
		// next hour.
		if done || time.Now().Unix() > nextCheckTime.Unix() {
			hourFinish, _ := e.ic.Check()
			for ip, hours := range hourFinish {
				for _, hour := range hours {
					loglib.Info(fmt.Sprintf("fkeychan %d", len(fkeyChan)))
					fkeyChan <- ip + "_" + hour
				}
			}
			e.closeWriters(e.writers)
			e.closeWriters(e.headerWriters)
			e.ic.SaveStatus()
			nextCheckTime = time.Now().Add(2 * time.Minute)
		}

		r.Close()
	}
}
Пример #8
0
// handleConnnection serves one sender connection until it fails, times out,
// or a quit signal arrives.
//
// For each pack it reads a 4-byte uvarint header length and the header, then
// reads body bytes until header.PackLen bytes have arrived, answers "ok", and
// (unless the pack was already seen and is not a re-pull) re-packs header and
// body together with route information and pushes the result onto t.buffer,
// recording the pack in t.footPrint. A header that cannot be parsed is
// answered with "wrong header" and ends the connection.
//
// conn is always closed and wg.Done always called on return; panics are
// recovered and logged.
func (t *TcpReceiver) handleConnnection(conn net.Conn, wg *sync.WaitGroup) {
	defer func() {
		if err := recover(); err != nil {
			loglib.Error(fmt.Sprintf("tcp receiver connection panic:%v", err))
		}
		conn.Close()
		wg.Done()
	}()
	/*
	   quit tells the loop to stop after a quit signal.
	   Exiting while a pack is only partially received is fine: the sender
	   caches the pack and retransmits it later. Once a pack has been fully
	   received we must not bail out midway, otherwise the pack may already
	   have been handed to the next stage while the sender is told that the
	   send failed.
	*/
	// NOTE(review): quit is written by the signal-handler goroutine and read
	// by this one without synchronization — confirm whether that is acceptable.
	var quit = false

	go lib.HandleQuitSignal(func() {
		// Close the connection so we do not stay blocked in network I/O.
		conn.Close()
		quit = true
	})

	// readFull keeps reading until p is completely filled. A single
	// conn.Read may legally return fewer bytes than requested, which would
	// leave the length prefix or header truncated.
	readFull := func(p []byte) error {
		for got := 0; got < len(p); {
			n, err := conn.Read(p[got:])
			got += n
			if got >= len(p) {
				return nil
			}
			if err != nil {
				return err
			}
			if n == 0 {
				// Guard against spinning on a reader that makes no progress.
				return fmt.Errorf("read returned no data")
			}
		}
		return nil
	}

	request := make([]byte, 512*1024) // 512k receive buffer

	var packLen int = 0
	currLen := 0
	var b = new(bytes.Buffer)
	var content = new(bytes.Buffer)
	inAddr := conn.RemoteAddr().String()
	parts := strings.Split(inAddr, ":")
	inIp := parts[0]

	packId := "unkown"

	var routeInfo map[string]string
	var rePull = false // re-pulled packs skip the duplicate check

	loglib.Info("incoming: " + inAddr)

outer:
	for !quit {

		st := time.Now()
		if packLen == 0 {
			conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
			time1 := time.Now() // timing checkpoint
			// Read the length of the pack header.
			buf := make([]byte, 4)
			err := readFull(buf)
			if err != nil {
				loglib.Warning(fmt.Sprintf("conn:%s, get header len, tcp receiver read error:%s, elapse:%s", inAddr, err.Error(), time.Now().Sub(time1)))
				break
			}
			l, _ := binary.Uvarint(buf)
			headerLen := int(l)
			// Read the pack header itself.
			headerBuf := make([]byte, headerLen)
			time2 := time.Now()
			err = readFull(headerBuf)
			if err != nil {
				loglib.Warning(fmt.Sprintf("conn:%s, get header, tcp receiver read error:%s, elapse:%s", inAddr, err.Error(), time.Now().Sub(time2)))
				break
			}

			// Is this a re-pull?
			route0 := tcp_pack.ParseHeader(headerBuf)
			if v, ok := route0["repull"]; ok && v == "1" {
				rePull = true
			} else {
				rePull = false
			}

			buf = append(buf, headerBuf...)
			header, _, err := tcp_pack.ExtractHeader(buf)
			if err != nil {
				loglib.Error("wrong format header " + string(headerBuf) + " " + err.Error())
				conn.Write([]byte("wrong header"))
				break
			}

			packId = tcp_pack.GetPackId(buf)
			packLen = header.PackLen
			currLen = 0
			routeInfo = make(map[string]string)
			b = new(bytes.Buffer)
			content = new(bytes.Buffer)

			loglib.Info(fmt.Sprintf("conn:%s, start receive pack %s, pack len:%d, header len:%d, header elapse:%s", inAddr, packId, packLen, headerLen, time.Now().Sub(time1)))
			b.Write(buf)

			routeInfo["ip"] = lib.GetIp()
			routeInfo["stage"] = "tcp recv"
			routeInfo["st"] = st.Format("2006-01-02 15:04:05.000")
		}
		// Timeout for reading the pack body.
		conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
		time3 := time.Now()
		// Read until the whole body has arrived.
		for currLen < packLen {
			requestLen, err := conn.Read(request)
			if requestLen == 0 || err != nil {
				// The sender retransmits, so the partial pack can be dropped.
				packLen = 0 // reset so a new pack would be read

				ed := time.Now()
				// err may be nil when the read returned zero bytes, so it is
				// formatted with %v instead of calling err.Error().
				loglib.Warning(fmt.Sprintf("conn:%s, not full! ip:%s, packid:%s, received:%d, end recv:%s, elapse:%s, body elapse:%s, error:%v", inAddr, inIp, packId, currLen, ed, ed.Sub(st), ed.Sub(time3), err))
				break outer // connection is broken: leave the outer loop
			}
			currLen += requestLen
			content.Write(request[:requestLen])
		}
		if packLen > 0 && currLen >= packLen {
			// Acknowledge as soon as the pack is complete.
			_, err := conn.Write([]byte("ok"))
			if err != nil {
				loglib.Warning(fmt.Sprintf("ip:%s, packid:%s received, but response back error:%s", inIp, packId, err.Error()))
			} else {
				loglib.Info(fmt.Sprintf("conn:%s, response to packid:%s", inAddr, packId))
			}
			// Drop duplicate packs (re-pulls are exempt).
			appeared, ok, code := t.hasAppeared(content)
			if !ok || rePull {
				ed := time.Now()
				routeInfo["ed"] = ed.Format("2006-01-02 15:04:05.000")
				routeInfo["elapse"] = ed.Sub(st).String()
				b.Write(content.Bytes())
				vbytes := tcp_pack.Packing(b.Bytes(), routeInfo, true)
				b = bytes.NewBuffer(vbytes)
				t.buffer <- *b
				packAppear := PackAppear{time.Now().Unix(), packId}
				t.mutex.Lock()
				t.footPrint[code] = packAppear // this line has crashed before
				t.mutex.Unlock()

				loglib.Info(fmt.Sprintf("conn:%s, finish ip:%s, packid:%s, repull:%v, received:%d, elapse:%s, body elapse:%s", inAddr, inIp, packId, rePull, currLen, ed.Sub(st), ed.Sub(time3)))
			} else {
				loglib.Info(fmt.Sprintf("conn:%s, pack %s repeat %s already appear at %s", inAddr, packId, appeared.Id, time.Unix(appeared.Time, 0)))
			}
			packLen = 0
		}

	}
	loglib.Info("conn finish: " + inAddr)
}
Пример #9
0
// parseLogLine converts one access-log line into a bson document.
//
// The code reads these pieces from the line: the text before the first "."
// (used, with a trailing "_", as the key prefix), the second space-separated
// field (converted with lib.IpToUint32), the text between "[" and "]"
// (parsed as "02/Jan/2006:15:04:05 -0700"), and the first double-quoted
// section, which must split into exactly three space-separated parts with
// the URL in the middle.
//
// The returned map is never nil. It is filled only when the URL parses and
// its query contains a non-empty this.transactionIdKey parameter: every
// query parameter is copied (prefixed, except the transaction id itself),
// plus <prefix>ipinlong, <prefix>time, <prefix>day and <prefix>hour. If the
// timestamp cannot be parsed, a warning is logged and time/day/hour are
// 0/0/-1. Lines that are empty or lack a "." yield an empty map instead of
// panicking.
func (this *MongoDbOutputer) parseLogLine(line string) (m bson.M) {
	m = make(bson.M)
	slen := len(line)
	if slen == 0 {
		return m
	}

	// IP: the field between the first and second space.
	p1 := strings.Index(line, " ")
	p2 := slen
	if p1 > 0 && p1 < slen-1 {
		// p is an index into the sub-slice, not into line.
		if p := strings.Index(line[p1+1:], " "); p > 0 {
			p2 = p + p1 + 1
		}
	} else {
		p1 = 0
	}
	ipInLong := lib.IpToUint32(line[p1+1 : p2])

	// First label of the host name.
	dot := strings.Index(line, ".")
	if dot < 0 {
		return m
	}
	hostPrefix := line[:dot] + "_"

	// Timestamp between "[" and "]". A missing or misplaced "]" leaves the
	// string empty, which takes the normal parse-failure path below.
	lb := strings.Index(line, "[")
	rb := strings.Index(line, "]")
	hourStr := ""
	if rb >= lb+1 {
		hourStr = line[lb+1 : rb]
	}
	var timestamp int64 = 0
	var day int = 0
	var hour int = -1
	tm, err := time.ParseInLocation("02/Jan/2006:15:04:05 -0700", hourStr, time.Local)
	if err != nil {
		loglib.Warning("parse time error" + err.Error())
	} else {
		timestamp = tm.Unix()
		dayStr := tm.Format("20060102")
		day, err = strconv.Atoi(dayStr)
		if err != nil {
			loglib.Error(fmt.Sprintf("conv %s to int error: %v", dayStr, err))
		}
		hour = tm.Hour()
	}

	// Request: the content of the first double-quoted section.
	reqStr := ""
	if q1 := strings.Index(line, "\""); q1 >= 0 {
		rest := line[q1+1:]
		if q2 := strings.Index(rest, "\""); q2 >= 0 {
			reqStr = rest[:q2]
		}
	}
	parts := strings.Split(reqStr, " ")
	if len(parts) != 3 {
		return m
	}

	u, err := url.Parse(parts[1])
	if err != nil {
		return m
	}
	q := u.Query()
	// Lines without a transaction id are not recorded.
	if q.Get(this.transactionIdKey) == "" {
		return m
	}

	// Copy the query parameters into the document.
	for k := range q {
		newK := k
		if k != this.transactionIdKey {
			newK = hostPrefix + k
		}
		m[newK] = q.Get(k)
	}
	m[hostPrefix+"ipinlong"] = ipInLong
	m[hostPrefix+"time"] = timestamp
	m[hostPrefix+"day"] = day
	m[hostPrefix+"hour"] = hour
	return m
}