Esempio n. 1
// reconnect tries to replace sc.conn with a freshly created connection.
//
// While sc.failedTimes is below sc.max_try_times it redials sc.currentAddr.
// Otherwise it swaps sc.currentAddr with sc.bakAddr, resets the failure
// counter and dials the new current address.
//
// NOTE(review): this excerpt has lost its closing braces and stops
// mid-function, so whatever follows the last visible line is unknown. The conn
// parameter is not referenced in the visible lines.
func (sc *SingleConnection) reconnect(conn *net.TCPConn) {

	if sc.failedTimes < sc.max_try_times {
		// Still under the retry budget: redial the address currently in use.
		newConn, err := createSingleConnection(sc.currentAddr)
		if err != nil {
			// NOTE(review): the failure branch is empty in this excerpt;
			// nothing visible here changes sc.failedTimes - confirm against
			// the complete source.

		} else {
			// Success: adopt the new connection and clear the failure count.
			sc.conn = newConn
			sc.failedTimes = 0
	} else {
		// Retry budget exhausted: swap the current and backup addresses.
		tmpAddr := sc.currentAddr
		sc.currentAddr = sc.bakAddr
		sc.bakAddr = tmpAddr
		sc.failedTimes = 0
		loglib.Warning("try bakup address:" + sc.currentAddr)
		newConn, err := createSingleConnection(sc.currentAddr)
		if err == nil {
			loglib.Warning("use bakup address:" + sc.currentAddr)
			sc.conn = newConn

Esempio n. 2
// doEtl is a worker loop: it receives file keys from fkeyChan until the
// channel is closed and runs the ETL pipeline on logDataDir/<fkey>.
//
// For every key it builds a file saver, a dispatcher and a GlobalHao123
// parser, starts the parser in its own goroutine and calls ParseFile. The
// outcome is recorded by creating an empty marker file named after the key in
// etlDoneDir (ParseFile returned nil) or etlFailDir (ParseFile returned an
// error); each marker creation sits in a loop of up to 5 iterations.
//
// A panic in the worker is recovered and logged.
//
// NOTE(review): closing braces and probably some short statements are missing
// from this excerpt. In the visible lines fd is never used, nothing leaves the
// marker loops early, and wg is never touched - confirm against the complete
// source.
func (e *etlOutputer) doEtl(fkeyChan chan string, logDataDir string, etlDir string, etlDoneDir string, etlFailDir string, spiderList string, colsFile string, hostsList string, ipBlackList string, wg *sync.WaitGroup) {
	defer func() {
		if err := recover(); err != nil {
			loglib.Error(fmt.Sprintf("doEtl() panic:%v", err))

	loglib.Info("etl routine start")
	for fkey := range fkeyChan {
		// One saver/dispatcher/parser chain is built per input file.
		sv := etl.NewFileSaver(colsFile, etlDir, fkey)
		d := etl.NewDispatcher(sv, 6, hostsList, ipBlackList)
		g := etl.NewGlobalHao123(spiderList, 100, 200, 8, d)
		go g.Start(false)

		fname := filepath.Join(logDataDir, fkey)
		loglib.Info("start etl for " + fname)

		err := g.ParseFile(fname)
		// ETL succeeded: create the "done" marker file.
		if err == nil {
			for i := 0; i < 5; i++ {
				fd, err := os.Create(filepath.Join(etlDoneDir, fkey))
				if err == nil {
					loglib.Info("finish etl for " + fname)
				} else {
					loglib.Warning("mark etl done for " + fname + " failed! error: " + err.Error())
		} else {
			// ETL failed: create the "fail" marker file instead.
			for i := 0; i < 5; i++ {
				fd, err := os.Create(filepath.Join(etlFailDir, fkey))
				if err == nil {
					loglib.Info("failed etl for " + fname)
				} else {
					loglib.Warning("mark etl fail for " + fname + " failed! error: " + err.Error())

	loglib.Info("etl routine finish")
Esempio n. 3
// extract unpacks one received pack from bp: it parses the pack header,
// records the pack with the integrity checker, streams the zlib-decompressed
// body to the data writer for "<ip>_<hour>" and appends the raw header line to
// the matching header writer.
//
// When the header marks the pack as done, or the current time is past
// f.checkTime, it also asks the integrity checker which hours are complete.
//
// NOTE(review): this excerpt has lost its closing braces and apparently some
// statements. In the visible lines buf is never filled from bp before it is
// decoded, the innermost loop only assigns writerKey, and the result of
// f.checkTime.Add is discarded - confirm against the complete source.
func (f *fileOutputer) extract(bp *bytes.Buffer) {
	// 4-byte buffer for the header-length prefix.
	buf := make([]byte, 4)

	l, _ := binary.Uvarint(buf)
	headerLen := int(l)
	//get pack header
	buf = make([]byte, headerLen)
	header := tcp_pack.ParseHeader(buf)

	// Whatever remains in bp is handed to zlib as the compressed body.
	r, err := zlib.NewReader(bp)
	if err != nil {
		loglib.Error("zlib reader Error: " + err.Error())
	} else {
		// A non-numeric "lines" value silently becomes 0 (Atoi error ignored).
		lines, _ := strconv.Atoi(header["lines"])
		done := false
		if header["done"] == "1" {
			done = true
		f.ic.Add(header["ip"], header["hour"], header["id"], lines, done)

		// Data and header writers are both keyed by "<ip>_<hour>".
		writerKey := header["ip"] + "_" + header["hour"]
		fout := f.getWriter(f.writers, f.dataDir, writerKey)

		// Terminate the header bytes with a newline before they are written out.
		buf = append(buf, '\n')
		nn, err := io.Copy(fout, r)
		if err != nil {
			loglib.Warning(fmt.Sprintf("save %s_%s_%s error:%s, saved:%d", header["ip"], header["hour"], header["id"], err, nn))

		fout = f.getWriter(f.headerWriters, f.headerDir, writerKey)
		n, err := fout.Write(buf)
		if err != nil {
			loglib.Info(fmt.Sprintf("writer header %s %d %s", writerKey, n, err.Error()))

		if done || time.Now().Unix() > f.checkTime.Unix() {
			hourFinish, _ := f.ic.Check()
			for ip, hours := range hourFinish {
				for _, hour := range hours {
					writerKey = ip + "_" + hour
			f.checkTime.Add(2 * time.Minute)

Esempio n. 4
// sendData writes one pack to conn and waits for the receiver's answer.
//
// It returns true for empty data and when the answer read back is exactly
// "ok". The visible false returns cover a nil conn, a "wrong header" answer
// and any other answer; the write-error branch only logs before the final
// return false.
//
// A 5-minute deadline is set before the write and an 8-minute read deadline
// before waiting for the answer, which is read into a 128-byte buffer.
//
// NOTE(review): this excerpt has lost its closing braces, and the "sender%d"
// format calls are missing their first argument (note the ",,"), so the block
// does not compile as shown.
func (s Sender) sendData(data []byte, conn *net.TCPConn) bool {
	if len(data) == 0 {
		return true

	if conn == nil {
		return false
	// NOTE(review): the four oddly indented lines below look as if they sat
	// inside a block comment whose delimiters were lost - confirm before
	// treating them as live code. As written they prepend a 4-byte uvarint
	// length to data.
	   lenBuf := make([]byte, 4)
	   nData := len(data)
	   binary.PutUvarint(lenBuf, uint64(nData))
	   data = append(lenBuf, data...)

	st := time.Now()
	packId := tcp_pack.GetPackId(data)

	conn.SetDeadline(time.Now().Add(5 * time.Minute)) //set timeout
	loglib.Info(fmt.Sprintf("sender%d start sending pack:%s length:%d",, packId, len(data)))
	n, err := conn.Write(data)
	ed := time.Now()
	loglib.Info(fmt.Sprintf("sender%d end sending pack:%s length:%d elapse:%s",, packId, n, ed.Sub(st)))


	if err == nil {
		conn.SetReadDeadline(time.Now().Add(8 * time.Minute)) //set timeout
		time1 := time.Now()
		var temp []byte = make([]byte, 128)
		// This err shadows the write error; a failed read is only logged and
		// the bytes read so far are still compared below.
		count, err := conn.Read(temp)
		if err == nil {
			loglib.Info(fmt.Sprintf("sender%d get anwser data len:%d for pack:%s elapse:%s",, count, packId, time.Now().Sub(time1)))
		} else {
			loglib.Info(fmt.Sprintf("sender%d get anwser data len:%d for pack:%s elapse:%s, error:%s",, count, packId, time.Now().Sub(time1), err.Error()))

		temp = temp[:count]
		if string(temp) == "ok" { //sent successfully
			return true
		} else if string(temp) == "wrong header" {
			loglib.Info(packId + " has wrong header, retry later!")
			return false
		} else { //send failed
			return false
	} else {
		loglib.Warning(fmt.Sprintf("write pack %s error:%s", packId, err.Error()))
	return false
Esempio n. 5
func (t *TcpReceiver) loadFootPrint(fname string) map[string]PackAppear {
	fp := make(map[string]PackAppear)
	if lib.FileExists(fname) {
		vbytes, err := ioutil.ReadFile(fname)
		if err != nil {
			loglib.Error("read footprint file error:" + err.Error())
		} else {
			err = json.Unmarshal(vbytes, &fp)
			if err != nil {
				loglib.Error("unmarshal footprint error:" + err.Error())
			} else {
				loglib.Info("load footprint success !")
	} else {
		loglib.Warning("footprint file " + fname + " not found!")
	return fp
Esempio n. 6
func (this *IntegrityChecker) LoadStatus(filename string) map[string]map[string]map[string]map[string]int {
	m := make(map[string]map[string]map[string]map[string]int)
	m["hour_received"] = make(map[string]map[string]map[string]int)
	m["day_received"] = make(map[string]map[string]map[string]int)
	if lib.FileExists(filename) {
		vbytes, err := ioutil.ReadFile(filename)
		if err != nil {
			loglib.Error("read log received file error:" + err.Error())
		} else {
			err = json.Unmarshal(vbytes, &m)
			if err != nil {
				loglib.Error("unmarshal log received error:" + err.Error())
			} else {
				loglib.Info("load log received success !")
	} else {
		loglib.Warning("log received file " + filename + " not found!")
	return m
Esempio n. 7
// writeToFile saves the bytes held in data to a file whose name comes from
// createFileName, logging the pack id, file name and length beforehand and a
// warning if the write fails.
//
// NOTE(review): this excerpt is damaged - the createFileName call is cut off
// before its arguments, the "sender%d" format call is missing its first
// argument (note the ",,"), and the body of the final else branch is not
// shown. The block does not compile as written.
func (s *Sender) writeToFile(data bytes.Buffer) {
	filename := createFileName(
	// The handle returned by os.Create is discarded here, so it is never
	// closed in the visible code; ioutil.WriteFile below opens the file again
	// itself.
	_, err := os.Create(filename)

	d := data.Bytes()

	packId := tcp_pack.GetPackId(d)

	loglib.Info(fmt.Sprintf("sender%d save pack %s to file %s len:%d",, packId, filename, len(d)))
	err = ioutil.WriteFile(filename, d, 0666)
	if err != nil {
		loglib.Warning("write to file " + filename + " error:" + err.Error())
	} else {
Esempio n. 8
// parseLogLine turns one access-log line into a bson.M document.
//
// From the line it slices out an IP field (converted with lib.IpToUint32), a
// host prefix (the text before the first "." plus "_"), the bracketed
// timestamp and the first double-quoted request string. If the request has
// exactly three space-separated parts, its URL parses, and the query carries
// this.transactionIdKey, every query parameter is copied into the result. All
// keys except the transaction id are prefixed with the host prefix, and
// "ipinlong", "time", "day" and "hour" entries are added under the same
// prefix.
//
// NOTE(review): this excerpt has lost its closing braces and at least one
// else branch, and it ends before any return statement. The slicing assumes
// ".", "[", "]" and two double quotes are present; strings.Index returns -1
// otherwise, and no guard for that is visible - confirm against the complete
// source.
func (this *MongoDbOutputer) parseLogLine(line string) (m bson.M) {
    slen := len(line)
    // Locate the first two spaces; the IP field is sliced relative to them.
    p1 := strings.Index(line, " ")
    p2 := slen
    if p1 > 0 && p1 < slen-1 {
        p := strings.Index(line[p1+1:], " ")
        if p > 0 {
            p2 = p + p1 + 1  //note: p is an index into the sub-slice, not into line
        p1 = 0
    ipInLong := lib.IpToUint32(line[p1+1 : p2])
    // first segment of the host name
    p1 = strings.Index(line, ".")
    hostPrefix := line[:p1] + "_"
    // The timestamp is the text between "[" and "]".
    p1 = strings.Index(line, "[")
    p2 = strings.Index(line, "]")
    hourStr := line[p1+1 : p2]
    var timestamp int64 = 0
    var day int = 0
    var hour int = -1
    tm, err := time.ParseInLocation("02/Jan/2006:15:04:05 -0700", hourStr, time.Local)
    if err != nil {
        loglib.Warning("parse time error" + err.Error())
        // NOTE(review): an else separating the error branch from the lines
        // below appears to be missing from this excerpt.
        timestamp = tm.Unix()
        dayStr := tm.Format("20060102")
        day, err = strconv.Atoi(dayStr)
        if err != nil {
            loglib.Error(fmt.Sprintf("conv %s to int error: %v", dayStr, err))
        hour = tm.Hour()
    urlStr := ""
    // The request string is the text between the first pair of double quotes.
    p3 := strings.Index(line, "\"")
    p4 := strings.Index(line[p3+1: ], "\"") + p3 + 1
    reqStr := line[p3+1 : p4]
    parts := strings.Split(reqStr, " ")

    m = make(bson.M)
    // Only requests with exactly three parts are processed; the middle part is the URL.
    if len(parts) == 3 {
        urlStr = parts[1]
        u, err := url.Parse(urlStr)
        if err == nil {
            q := u.Query()
            tid := q.Get( this.transactionIdKey )  //check whether a transaction id is present
            if tid != "" {
                for k, _ := range q {
                    newK := k
                    // Every key except the transaction id gets the host prefix.
                    if k != this.transactionIdKey {
                        newK = hostPrefix + k
                    m[newK] = q.Get(k)
                m[hostPrefix + "ipinlong"] = ipInLong
                m[hostPrefix + "time"] = timestamp
                m[hostPrefix + "day"] = day
                m[hostPrefix + "hour"] = hour
Esempio n. 9
// Check scans the received-pack bookkeeping and reports what is complete.
//
// Hours: for every ip/hour in this.hourReceived with a positive "total_packs"
// count, pack ids 1..total_packs are looked up. If none is missing and both
// makeHourTag and addHour return true, the hour is appended to hourFinish[ip]
// and removed from this.hourReceived; otherwise a warning is logged (it joins
// the missing ids).
//
// Days: an ip/day in this.dayReceived with exactly 24 entries for which
// makeDayTag returns true is appended to dayFinish[ip] and removed.
//
// In both passes an entry whose key does not parse as a time, or which is more
// than 4 days old, is deleted and logged as overtime.
//
// NOTE(review): this excerpt has lost its closing braces and ends before any
// return statement, so nesting is inferred from indentation and any trailing
// work (for example persisting state) is not shown.
func (this *IntegrityChecker) Check() (hourFinish map[string][]string, dayFinish map[string][]string) {
	hourFinish = make(map[string][]string)
	dayFinish = make(map[string][]string)
	interval := int64(86400 * 4) //incomplete data older than 4 days will be deleted
	now := time.Now().Unix()
	for ip, m1 := range this.hourReceived {
		for hour, m2 := range m1 {
			totalPacks, ok := m2["total_packs"]
			if ok && totalPacks > 0 {
				// Collect the ids in 1..totalPacks that have not been received yet.
				miss := make([]string, 0)
				var id = ""
				for i := 1; i <= totalPacks; i++ {
					id = strconv.Itoa(i)
					_, ok = m2[id]
					if !ok {
						miss = append(miss, id)
				// The && chain short-circuits: makeHourTag and addHour run only
				// when nothing is missing, and addHour only if makeHourTag
				// returned true.
				if len(miss) == 0 && this.makeHourTag(ip, hour, m2["total_lines"]) && this.addHour(ip, hour) {
					_, ok1 := hourFinish[ip]
					if !ok1 {
						hourFinish[ip] = make([]string, 0)
					hourFinish[ip] = append(hourFinish[ip], hour)

					// Drop the finished hour, and the ip once it has no hours left.
					delete(this.hourReceived[ip], hour)
					if len(this.hourReceived[ip]) == 0 {
						delete(this.hourReceived, ip)
				} else {
					loglib.Warning(fmt.Sprintf("%s_%s total %d, miss %s", ip, hour, totalPacks, strings.Join(miss, ",")))

			// Expire hours whose key is unparsable or older than the interval.
			tm, err := time.Parse("2006010215", hour)
			if err != nil || (now-tm.Unix()) > interval {
				delete(this.hourReceived[ip], hour)
				loglib.Info(fmt.Sprintf("hour integrity: %s %s overtime", ip, hour))

	for ip, m1 := range this.dayReceived {
		for day, m2 := range m1 {
			// A day counts as complete once it has 24 entries and makeDayTag succeeds.
			if len(m2) == 24 && this.makeDayTag(ip, day) {
				loglib.Info(ip + "_" + day + " all received")

				_, ok1 := dayFinish[ip]
				if !ok1 {
					dayFinish[ip] = make([]string, 0)
				dayFinish[ip] = append(dayFinish[ip], day)

				delete(this.dayReceived[ip], day)
				if len(this.dayReceived[ip]) == 0 {
					delete(this.dayReceived, ip)
			// Expire days whose key is unparsable or older than the interval.
			tm, err := time.Parse("20060102", day)
			if err != nil || (now-tm.Unix()) > interval {
				delete(this.dayReceived[ip], day)
				loglib.Info(fmt.Sprintf("day integrity: %s %s overtime", ip, day))

Esempio n. 10
// handleConnnection serves one inbound connection until a quit signal sets the
// quit flag.
//
// Per pack it does the following:
//  1. With packLen == 0, reads a 4-byte uvarint header length and then the
//     header, parses it (including the "repull" flag), and extracts the pack
//     id and pack length.
//  2. Reads from conn until currLen reaches packLen.
//  3. Answers "ok", then - unless t.hasAppeared reports the pack as already
//     seen and it is not a re-pull - repacks the data with routing info, sends
//     it on t.buffer and records it in t.footPrint.
//
// A panic in the handler is recovered and logged.
//
// NOTE(review): this excerpt has lost its closing braces and apparently some
// statements. The label targeted by "break outer" is not visible, wg is never
// used, the error branches after the header reads only log, and nothing
// visible copies the bytes read into b or content - confirm against the
// complete source.
func (t *TcpReceiver) handleConnnection(conn net.Conn, wg *sync.WaitGroup) {
	defer func() {
		if err := recover(); err != nil {
			loglib.Error(fmt.Sprintf("tcp receiver connection panic:%v", err))
	var quit = false //flags whether the handler should exit

	// The callback flips quit; it is written from another goroutine without
	// visible synchronization.
	go lib.HandleQuitSignal(func() {
		quit = true

	request := make([]byte, 512*1024) //512k read buffer

	var packLen int = 0
	currLen := 0
	var b = new(bytes.Buffer)
	var content = new(bytes.Buffer)
	inAddr := conn.RemoteAddr().String()
	// The peer IP is taken as the text before the first ":" of the remote address.
	parts := strings.Split(inAddr, ":")
	inIp := parts[0]

	packId := "unkown"

	var routeInfo map[string]string
	var rePull = false //whether this is a re-pull; re-pulled packs skip the duplicate check

	loglib.Info("incoming: " + inAddr)

	for !quit {

		st := time.Now()
		// packLen == 0 means no pack is in progress, so a new header is read.
		if packLen == 0 {
			conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
			time1 := time.Now() //timing mark
			// read zlib pack header length
			buf := make([]byte, 4)
			// The byte count returned by Read is discarded.
			_, err := conn.Read(buf)
			if err != nil {
				loglib.Warning(fmt.Sprintf("conn:%s, get header len, tcp receiver read error:%s, elapse:%s", inAddr, err.Error(), time.Now().Sub(time1)))
			l, _ := binary.Uvarint(buf)
			headerLen := int(l)
			//get pack header
			headerBuf := make([]byte, headerLen)
			time2 := time.Now()
			_, err = conn.Read(headerBuf)
			if err != nil {
				loglib.Warning(fmt.Sprintf("conn:%s, get header, tcp receiver read error:%s, elapse:%s", inAddr, err.Error(), time.Now().Sub(time2)))

			route0 := tcp_pack.ParseHeader(headerBuf)
			if v, ok := route0["repull"]; ok && v == "1" {
				rePull = true
			} else {
				rePull = false

			// Rebuild "length prefix + header" so the tcp_pack helpers can parse it.
			buf = append(buf, headerBuf...)
			header, _, err := tcp_pack.ExtractHeader(buf)
			if err != nil {
				loglib.Error("wrong format header " + string(headerBuf) + " " + err.Error())
				conn.Write([]byte("wrong header"))

			packId = tcp_pack.GetPackId(buf)
			packLen = header.PackLen
			// Reset the per-pack state.
			currLen = 0
			routeInfo = make(map[string]string)
			b = new(bytes.Buffer)
			content = new(bytes.Buffer)

			loglib.Info(fmt.Sprintf("conn:%s, start receive pack %s, pack len:%d, header len:%d, header elapse:%s", inAddr, packId, packLen, headerLen, time.Now().Sub(time1)))

			routeInfo["ip"] = lib.GetIp()
			routeInfo["stage"] = "tcp recv"
			routeInfo["st"] = st.Format("2006-01-02 15:04:05.000")
		conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
		time3 := time.Now()
		//read enough bytes
		for currLen < packLen {
			requestLen, err := conn.Read(request)
			if requestLen == 0 || err != nil {
				packLen = 0 //reset to 0 so that a new pack can be read

				ed := time.Now()
				// NOTE(review): err.Error() is called even when this branch
				// was entered with requestLen == 0 and a nil err.
				loglib.Warning(fmt.Sprintf("conn:%s, not full! ip:%s, packid:%s, received:%d, end recv:%s, elapse:%s, body elapse:%s, error:%s", inAddr, inIp, packId, currLen, ed, ed.Sub(st), ed.Sub(time3), err.Error()))
				break outer //connection error: jump straight out of the outer loop
			currLen += requestLen
		if packLen > 0 && currLen >= packLen {
			// Acknowledge first; duplicate detection happens afterwards.
			_, err := conn.Write([]byte("ok"))
			if err != nil {
				loglib.Warning(fmt.Sprintf("ip:%s, packid:%s received, but response back error:%s", inIp, packId, err.Error()))
			} else {
				loglib.Info(fmt.Sprintf("conn:%s, response to packid:%s", inAddr, packId))
			appeared, ok, code := t.hasAppeared(content)
			if !ok || rePull {
				ed := time.Now()
				routeInfo["ed"] = ed.Format("2006-01-02 15:04:05.000")
				routeInfo["elapse"] = ed.Sub(st).String()
				vbytes := tcp_pack.Packing(b.Bytes(), routeInfo, true)
				b = bytes.NewBuffer(vbytes)
				t.buffer <- *b
				packAppear := PackAppear{time.Now().Unix(), packId}
				t.footPrint[code] = packAppear //this line has crashed before

				loglib.Info(fmt.Sprintf("conn:%s, finish ip:%s, packid:%s, repull:%v, received:%d, elapse:%s, body elapse:%s", inAddr, inIp, packId, rePull, currLen, ed.Sub(st), ed.Sub(time3)))
			} else {
				loglib.Info(fmt.Sprintf("conn:%s, pack %s repeat %s already appear at %s", inAddr, packId, appeared.Id, time.Unix(appeared.Time, 0)))
			packLen = 0

	loglib.Info("conn finish: " + inAddr)
Esempio n. 11
// runEtl is the ETL coordinator. It starts 5 doEtl workers fed by a buffered
// channel of file keys (capacity 100) and then consumes packs from e.buffer.
//
// For each pack it parses the header, records the pack with the integrity
// checker, writes the decompressed body to the data writer for "<ip>_<hour>"
// and the header line to the header writer. When the pack is marked done, or
// nextCheckTime has passed, every hour the checker reports as finished is
// queued on fkeyChan for the workers and nextCheckTime is pushed 2 minutes
// ahead.
//
// A panic is recovered and logged in the deferred function.
//
// NOTE(review): this excerpt has lost its closing braces and apparently some
// statements. In the visible lines buf is never filled from the pack before
// it is decoded, wg is handed to the workers but never waited on, and the
// oddly indented write just before io.Copy redeclares n and err (it looks
// like formerly commented-out code) - confirm against the complete source.
func (e *etlOutputer) runEtl(spiderList string, colsFile string, hostsList string, ipBlackList string) {
	wg := &sync.WaitGroup{}
	fkeyChan := make(chan string, 100)
	defer func() {
		if err := recover(); err != nil {
			loglib.Error(fmt.Sprintf("runEtl() panic:%v", err))

		//wait for the etl routines to finish

	// Fixed pool of 5 ETL workers sharing one key channel.
	for i := 0; i < 5; i++ {
		go e.doEtl(fkeyChan, e.dataDir, e.etlDir, e.etlDoneDir, e.etlFailDir, spiderList, colsFile, hostsList, ipBlackList, wg)
	nextCheckTime := time.Now().Add(2 * time.Minute)
	for b := range e.buffer {
		loglib.Info(fmt.Sprintf("pack in chan: %d", len(e.buffer)))
		// 4-byte buffer for the header-length prefix.
		buf := make([]byte, 4)
		bp := &b

		l, _ := binary.Uvarint(buf)
		headerLen := int(l)
		//get pack header
		buf = make([]byte, headerLen)
		header := tcp_pack.ParseHeader(buf)

		// Whatever remains in bp is handed to zlib as the compressed body.
		r, err := zlib.NewReader(bp)
		if err != nil {
			loglib.Error("zlib reader Error: " + err.Error())
		} else {
			// A non-numeric "lines" value silently becomes 0 (Atoi error ignored).
			lines, _ := strconv.Atoi(header["lines"])
			done := false
			if header["done"] == "1" {
				done = true
			e.ic.Add(header["ip"], header["hour"], header["id"], lines, done)

			// Data and header writers are both keyed by "<ip>_<hour>".
			writerKey := header["ip"] + "_" + header["hour"]
			fout := e.getWriter(e.writers, e.dataDir, writerKey)

			buf = append(buf, '\n')
			   n, err := fout.Write(buf)
			   if err != nil {
			       loglib.Info(fmt.Sprintf("write %s %d %s", writerKey, n, err.Error()))
			nn, err := io.Copy(fout, r)
			if err != nil {
				loglib.Warning(fmt.Sprintf("save %s_%s_%s error:%s, saved:%d", header["ip"], header["hour"], header["id"], err, nn))
			fout = e.getWriter(e.headerWriters, e.headerDir, writerKey)
			n, err := fout.Write(buf)
			if err != nil {
				loglib.Info(fmt.Sprintf("writer header %s %d %s", writerKey, n, err.Error()))
			if done || time.Now().Unix() > nextCheckTime.Unix() {
				hourFinish, _ := e.ic.Check()
				for ip, hours := range hourFinish {
					for _, hour := range hours {
						// Queue each finished hour; this send blocks once
						// fkeyChan holds 100 pending keys.
						writerKey = ip + "_" + hour
						loglib.Info(fmt.Sprintf("fkeychan %d", len(fkeyChan)))
						fkeyChan <- writerKey
				nextCheckTime = time.Now().Add(2 * time.Minute)
