Пример #1
0
// reconnect tries to re-establish sc.conn.
//
// While fewer than sc.max_try_times consecutive attempts have failed, it
// dials sc.currentAddr; a failure bumps sc.failedTimes, a success installs
// the new connection and clears the counter. Once the limit is reached it
// swaps sc.currentAddr with sc.bakAddr, clears the counter and dials the
// (new) current address once. The conn argument is not used.
func (sc *SingleConnection) reconnect(conn *net.TCPConn) {
	if sc.failedTimes >= sc.max_try_times {
		// Too many failures on the current address: fail over to the backup.
		sc.currentAddr, sc.bakAddr = sc.bakAddr, sc.currentAddr
		sc.failedTimes = 0
		loglib.Warning("try bakup address:" + sc.currentAddr)
		if fallback, err := createSingleConnection(sc.currentAddr); err == nil {
			loglib.Warning("use bakup address:" + sc.currentAddr)
			sc.conn = fallback
		}
		return
	}

	fresh, err := createSingleConnection(sc.currentAddr)
	if err != nil {
		sc.failedTimes++
		return
	}
	sc.conn = fresh
	sc.failedTimes = 0
}
Пример #2
0
// doEtl is an ETL worker: for every file key received on fkeyChan it parses
// logDataDir/<fkey> through the etl pipeline and then records the outcome by
// creating an empty marker file named <fkey> in etlDoneDir (parse succeeded)
// or etlFailDir (parse returned an error).
//
// The worker runs until fkeyChan is closed. wg.Done is always called on exit,
// and a panic anywhere in the loop is logged and ends the worker.
func (e *etlOutputer) doEtl(fkeyChan chan string, logDataDir string, etlDir string, etlDoneDir string, etlFailDir string, spiderList string, colsFile string, hostsList string, ipBlackList string, wg *sync.WaitGroup) {
	defer func() {
		if r := recover(); r != nil {
			loglib.Error(fmt.Sprintf("doEtl() panic:%v", r))
		}

		wg.Done()
	}()

	// Creating the marker file is retried to raise the odds that the
	// outcome tag actually lands on disk.
	const markAttempts = 5

	loglib.Info("etl routine start")
	for fkey := range fkeyChan {
		saver := etl.NewFileSaver(colsFile, etlDir, fkey)
		dispatcher := etl.NewDispatcher(saver, 6, hostsList, ipBlackList)
		parser := etl.NewGlobalHao123(spiderList, 100, 200, 8, dispatcher)
		go parser.Start(false)

		fname := filepath.Join(logDataDir, fkey)
		loglib.Info("start etl for " + fname)

		parseErr := parser.ParseFile(fname)
		parser.Wait()

		// Pick the marker directory and log wording for this outcome.
		markerDir, markedMsg, retryMsg := etlDoneDir, "finish etl for ", "mark etl done for "
		if parseErr != nil {
			markerDir, markedMsg, retryMsg = etlFailDir, "failed etl for ", "mark etl fail for "
		}

		for attempt := 0; attempt < markAttempts; attempt++ {
			fd, err := os.Create(filepath.Join(markerDir, fkey))
			if err != nil {
				loglib.Warning(retryMsg + fname + " failed! error: " + err.Error())
				continue
			}
			fd.Close()
			loglib.Info(markedMsg + fname)
			break
		}
	}
	loglib.Info("etl routine finish")
}
Пример #3
0
// extract unpacks one received pack from bp and appends it to the per
// "<ip>_<hour>" output files.
//
// Layout read from bp: a 4-byte uvarint header length, the header itself
// (parsed with tcp_pack.ParseHeader), then a zlib stream with the payload.
// The payload is decompressed into the data writer and the raw header plus a
// newline is appended to the separate header writer. The pack is registered
// with the integrity checker, and when the pack is flagged done or the check
// deadline has passed, the checker is run and all writers are closed.
//
// The results of the two bp.Read calls are not inspected; a truncated pack
// typically surfaces as a zlib error, which is logged and ends the call.
func (f *fileOutputer) extract(bp *bytes.Buffer) {
	buf := make([]byte, 4)
	bp.Read(buf)

	l, _ := binary.Uvarint(buf)
	headerLen := int(l)
	// Read the pack header.
	buf = make([]byte, headerLen)
	bp.Read(buf)
	header := tcp_pack.ParseHeader(buf)

	r, err := zlib.NewReader(bp)
	if err != nil {
		loglib.Error("zlib reader Error: " + err.Error())
		return
	}
	defer r.Close()

	lines, _ := strconv.Atoi(header["lines"])
	done := header["done"] == "1"
	f.ic.Add(header["ip"], header["hour"], header["id"], lines, done)

	writerKey := header["ip"] + "_" + header["hour"]
	fout := f.getWriter(f.writers, f.dataDir, writerKey)

	// The header is not written around the payload (saves disk space); it
	// only goes to the dedicated header file below.
	buf = append(buf, '\n')
	nn, err := io.Copy(fout, r)
	if err != nil {
		loglib.Warning(fmt.Sprintf("save %s_%s_%s error:%s, saved:%d", header["ip"], header["hour"], header["id"], err, nn))
	}

	// Keep a separate copy of the header to make counting/auditing easier.
	fout = f.getWriter(f.headerWriters, f.headerDir, writerKey)
	n, err := fout.Write(buf)
	if err != nil {
		loglib.Info(fmt.Sprintf("writer header %s %d %s", writerKey, n, err.Error()))
	}

	if done || time.Now().Unix() > f.checkTime.Unix() {
		// Check is run for its side effects on the checker; the finished
		// hours it returns are not needed by this outputer.
		f.ic.Check()
		f.closeWriters(f.writers)
		f.closeWriters(f.headerWriters)
		// time.Time.Add returns a new value, so the result must be stored;
		// otherwise the deadline never advances.
		f.checkTime = time.Now().Add(2 * time.Minute)
	}
}
Пример #4
0
// sendData writes one pack to conn and waits for the receiver's answer.
//
// It returns true when data is empty or when the receiver answers exactly
// "ok". It returns false when conn is nil, when the write fails, or when the
// answer is anything else (including "wrong header", which is also logged).
// The combined deadline is set to 5 minutes before the write and the read
// deadline to 8 minutes before waiting for the answer.
func (s Sender) sendData(data []byte, conn *net.TCPConn) bool {
	if len(data) == 0 {
		return true
	}
	if conn == nil {
		return false
	}

	sendStart := time.Now()
	packId := tcp_pack.GetPackId(data)

	conn.SetDeadline(time.Now().Add(5 * time.Minute)) // I/O timeout for the write
	loglib.Info(fmt.Sprintf("sender%d start sending pack:%s length:%d", s.id, packId, len(data)))
	written, writeErr := conn.Write(data)
	sendEnd := time.Now()
	loglib.Info(fmt.Sprintf("sender%d end sending pack:%s length:%d elapse:%s", s.id, packId, written, sendEnd.Sub(sendStart)))

	lib.CheckError(writeErr)

	// If the write failed there is no point waiting for an answer.
	if writeErr != nil {
		loglib.Warning(fmt.Sprintf("write pack %s error:%s", packId, writeErr.Error()))
		return false
	}

	conn.SetReadDeadline(time.Now().Add(8 * time.Minute)) // timeout for the answer
	waitStart := time.Now()
	reply := make([]byte, 128)
	count, readErr := conn.Read(reply)
	if readErr != nil {
		loglib.Info(fmt.Sprintf("sender%d get anwser data len:%d for pack:%s elapse:%s, error:%s", s.id, count, packId, time.Since(waitStart), readErr.Error()))
	} else {
		loglib.Info(fmt.Sprintf("sender%d get anwser data len:%d for pack:%s elapse:%s", s.id, count, packId, time.Since(waitStart)))
	}

	switch string(reply[:count]) {
	case "ok":
		// Delivered successfully.
		return true
	case "wrong header":
		// The receiver rejected the pack header.
		loglib.Info(packId + " has wrong header, retry later!")
		return false
	default:
		// Any other answer (or none) counts as a failed send.
		return false
	}
}
Пример #5
0
// loadFootPrint reads the JSON footprint file fname into a map of
// PackAppear values. A missing file, a read error or a decode error is
// logged and the map built so far (empty unless decoding got partway) is
// returned; the result is never nil.
func (t *TcpReceiver) loadFootPrint(fname string) map[string]PackAppear {
	footprints := make(map[string]PackAppear)

	if !lib.FileExists(fname) {
		loglib.Warning("footprint file " + fname + " not found!")
		return footprints
	}

	raw, err := ioutil.ReadFile(fname)
	if err != nil {
		loglib.Error("read footprint file error:" + err.Error())
		return footprints
	}

	if err = json.Unmarshal(raw, &footprints); err != nil {
		loglib.Error("unmarshal footprint error:" + err.Error())
		return footprints
	}

	loglib.Info("load footprint success !")
	return footprints
}
Пример #6
0
// LoadStatus reads the persisted receive status from the JSON file filename.
//
// The returned map always contains the "hour_received" and "day_received"
// keys (empty by default); the file's content is decoded over that skeleton.
// A missing file, a read error or a decode error is logged and whatever has
// been built so far is returned.
func (this *IntegrityChecker) LoadStatus(filename string) map[string]map[string]map[string]map[string]int {
	status := map[string]map[string]map[string]map[string]int{
		"hour_received": make(map[string]map[string]map[string]int),
		"day_received":  make(map[string]map[string]map[string]int),
	}

	if !lib.FileExists(filename) {
		loglib.Warning("log received file " + filename + " not found!")
		return status
	}

	raw, err := ioutil.ReadFile(filename)
	if err != nil {
		loglib.Error("read log received file error:" + err.Error())
		return status
	}

	if err = json.Unmarshal(raw, &status); err != nil {
		loglib.Error("unmarshal log received error:" + err.Error())
		return status
	}

	loglib.Info("load log received success !")
	return status
}
Пример #7
0
// writeToFile saves an unsent pack to a new cache file and, on success,
// appends the file name to fileList.
//
// The file name comes from createFileName(s.id). Errors from creating or
// writing the file are passed to lib.CheckError; a write error is also
// logged as a warning and the file is then not added to fileList.
func (s *Sender) writeToFile(data bytes.Buffer) {
	filename := createFileName(s.id)

	// Create the file up front so a creation failure is reported before any
	// data is handled. The handle itself is not needed (ioutil.WriteFile
	// reopens the file), so close it right away instead of leaking it.
	fd, err := os.Create(filename)
	lib.CheckError(err)
	if err == nil {
		fd.Close()
	}

	d := data.Bytes()

	packId := tcp_pack.GetPackId(d)

	loglib.Info(fmt.Sprintf("sender%d save pack %s to file %s len:%d", s.id, packId, filename, len(d)))
	err = ioutil.WriteFile(filename, d, 0666)
	if err != nil {
		loglib.Warning("write to file " + filename + " error:" + err.Error())
		lib.CheckError(err)
		return
	}

	// Remember the cache file for later processing.
	fileList.PushBack(filename)
}
Пример #8
0
// parseLogLine turns one access-log line into a bson document.
//
// From the line it takes: the text between the first and second space
// (converted with lib.IpToUint32), the text before the first "." (used,
// with a trailing "_", as a key prefix), the timestamp between "[" and "]"
// and the request between the first pair of double quotes. If the request
// has three space-separated parts and its URL carries a non-empty
// this.transactionIdKey query parameter, every query parameter is copied
// into the result (all but the transaction id key get the prefix), together
// with the prefixed "ipinlong", "time", "day" and "hour" fields.
//
// In every other case, including malformed lines, an empty non-nil map is
// returned. A missing or unparsable timestamp leaves time=0, day=0, hour=-1.
func (this *MongoDbOutputer) parseLogLine(line string) (m bson.M) {
	m = make(bson.M)
	slen := len(line)
	if slen == 0 {
		// Nothing to parse; slicing below would be out of range.
		return m
	}

	// Extract the ip: the field between the first and second space.
	p1 := strings.Index(line, " ")
	p2 := slen
	if p1 > 0 && p1 < slen-1 {
		p := strings.Index(line[p1+1:], " ")
		if p > 0 {
			p2 = p + p1 + 1 // p is an index into the sub-slice, not into line
		}
	} else {
		p1 = 0
	}
	ipInLong := lib.IpToUint32(line[p1+1 : p2])

	// First label of the host name, used as a key prefix.
	p1 = strings.Index(line, ".")
	if p1 < 0 {
		loglib.Warning("parse host error: no '.' in log line")
		return m
	}
	hostPrefix := line[:p1] + "_"

	// Extract the timestamp between "[" and "]". If the brackets are
	// missing or out of order the empty string is parsed instead, which
	// takes the same path as any other unparsable timestamp.
	hourStr := ""
	p1 = strings.Index(line, "[")
	p2 = strings.Index(line, "]")
	if p2 > p1 {
		hourStr = line[p1+1 : p2]
	}
	var timestamp int64 = 0
	var day int = 0
	var hour int = -1
	tm, err := time.ParseInLocation("02/Jan/2006:15:04:05 -0700", hourStr, time.Local)
	if err != nil {
		loglib.Warning("parse time error" + err.Error())
	} else {
		timestamp = tm.Unix()
		dayStr := tm.Format("20060102")
		day, err = strconv.Atoi(dayStr)
		if err != nil {
			loglib.Error(fmt.Sprintf("conv %s to int error: %v", dayStr, err))
		}
		hour = tm.Hour()
	}

	// Extract the request between the first pair of double quotes; an
	// absent or unterminated quote leaves the request empty.
	reqStr := ""
	if p3 := strings.Index(line, "\""); p3 >= 0 {
		if rel := strings.Index(line[p3+1:], "\""); rel >= 0 {
			reqStr = line[p3+1 : p3+1+rel]
		}
	}
	parts := strings.Split(reqStr, " ")
	if len(parts) != 3 {
		return m
	}

	u, err := url.Parse(parts[1])
	if err != nil {
		return m
	}
	q := u.Query()
	// Only lines carrying a transaction id are of interest.
	if q.Get(this.transactionIdKey) == "" {
		return m
	}

	// Copy the query parameters into the document.
	for k := range q {
		newK := k
		if k != this.transactionIdKey {
			newK = hostPrefix + k
		}
		m[newK] = q.Get(k)
	}
	m[hostPrefix+"ipinlong"] = ipInLong
	m[hostPrefix+"time"] = timestamp
	m[hostPrefix+"day"] = day
	m[hostPrefix+"hour"] = hour
	return m
}
Пример #9
0
// Check verifies whether the received logs are complete and returns the
// hours and days that were completed by this particular check, keyed by ip.
//
// An hour is complete when its "total_packs" entry is positive, every pack
// id 1..total_packs is present, and both makeHourTag and addHour succeed; a
// day is complete when it has 24 entries and makeDayTag succeeds. Completed
// entries are removed from the checker. Entries whose hour/day string cannot
// be parsed or is more than 4 days old are removed as well, and per-ip maps
// left empty by either kind of removal are dropped.
func (this *IntegrityChecker) Check() (hourFinish map[string][]string, dayFinish map[string][]string) {
	hourFinish = make(map[string][]string)
	dayFinish = make(map[string][]string)
	interval := int64(86400 * 4) // incomplete data older than 4 days is discarded
	now := time.Now().Unix()

	// Check every hour for completeness.
	for ip, m1 := range this.hourReceived {
		for hour, m2 := range m1 {
			totalPacks, ok := m2["total_packs"]
			if ok && totalPacks > 0 {
				// The last pack of this hour has arrived, so the hour can be checked.
				var miss []string
				for i := 1; i <= totalPacks; i++ {
					id := strconv.Itoa(i)
					if _, got := m2[id]; !got {
						miss = append(miss, id)
					}
				}
				// The order of the conditions matters: the tag is only made
				// when nothing is missing, and the hour is only added when
				// the tag was made.
				if len(miss) == 0 && this.makeHourTag(ip, hour, m2["total_lines"]) && this.addHour(ip, hour) {
					hourFinish[ip] = append(hourFinish[ip], hour)
					delete(this.hourReceived[ip], hour)
				} else {
					loglib.Warning(fmt.Sprintf("%s_%s total %d, miss %s", ip, hour, totalPacks, strings.Join(miss, ",")))
				}
			}

			tm, err := time.Parse("2006010215", hour)
			if err != nil || (now-tm.Unix()) > interval {
				delete(this.hourReceived[ip], hour)
				loglib.Info(fmt.Sprintf("hour integrity: %s %s overtime", ip, hour))
			}
		}
		// Drop the ip once nothing is left for it, whether its hours were
		// completed or expired.
		if len(this.hourReceived[ip]) == 0 {
			delete(this.hourReceived, ip)
		}
	}

	// Check every day for completeness.
	for ip, m1 := range this.dayReceived {
		for day, m2 := range m1 {
			if len(m2) == 24 && this.makeDayTag(ip, day) {
				loglib.Info(ip + "_" + day + " all received")

				dayFinish[ip] = append(dayFinish[ip], day)
				delete(this.dayReceived[ip], day)
			}
			tm, err := time.Parse("20060102", day)
			if err != nil || (now-tm.Unix()) > interval {
				delete(this.dayReceived[ip], day)
				loglib.Info(fmt.Sprintf("day integrity: %s %s overtime", ip, day))
			}
		}
		if len(this.dayReceived[ip]) == 0 {
			delete(this.dayReceived, ip)
		}
	}

	return
}
Пример #10
0
// handleConnnection serves one sender connection until it fails, the peer
// sends a malformed header, or a quit signal arrives.
//
// For each pack it reads a 4-byte uvarint header length, the header, and
// then header.PackLen bytes of body. As soon as the body is complete it
// answers "ok"; a header that tcp_pack.ExtractHeader rejects is answered
// with "wrong header" and ends the connection. A complete pack is wrapped
// with route info and pushed to t.buffer unless t.hasAppeared reports it as
// a duplicate (packs whose header has repull=1 skip that check), and its
// code is recorded in t.footPrint.
//
// The connection is closed and wg.Done is called on return; panics are
// recovered and logged.
func (t *TcpReceiver) handleConnnection(conn net.Conn, wg *sync.WaitGroup) {
	defer func() {
		if err := recover(); err != nil {
			loglib.Error(fmt.Sprintf("tcp receiver connection panic:%v", err))
		}
		conn.Close()
		wg.Done()
	}()
	/*
	   quit marks whether we may exit right away after a quit signal.
	   As long as the pack has not been fully received when the signal
	   arrives it is safe to exit: the sender caches the pack and
	   retransmits it later. Once a pack has been fully received we must
	   not exit directly, because the pack may already have been handed to
	   the next stage while the sender would be told the send failed.
	*/
	// NOTE(review): quit is written from the signal callback's goroutine and
	// read by the loop below without synchronization — looks like a data
	// race; confirm and consider an atomic flag.
	var quit = false // set when the handler should stop

	go lib.HandleQuitSignal(func() {
		// Close the connection so we do not stay blocked in network I/O.
		conn.Close()
		quit = true
	})

	// readFull keeps reading until dst is completely filled. A single Read
	// on a stream connection may return fewer bytes than requested, which
	// would otherwise leave part of the header unread.
	readFull := func(dst []byte) error {
		for got := 0; got < len(dst); {
			n, err := conn.Read(dst[got:])
			got += n
			if err != nil && got < len(dst) {
				return err
			}
		}
		return nil
	}

	request := make([]byte, 512*1024) // 512k read buffer

	var packLen int = 0
	currLen := 0
	var b = new(bytes.Buffer)
	var content = new(bytes.Buffer)
	inAddr := conn.RemoteAddr().String()
	parts := strings.Split(inAddr, ":")
	inIp := parts[0]

	packId := "unkown"

	var routeInfo map[string]string
	var rePull = false // re-pulled packs skip the duplicate check

	loglib.Info("incoming: " + inAddr)

outer:
	for !quit {

		st := time.Now()
		if packLen == 0 {
			conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
			time1 := time.Now() // timing checkpoint
			// Read the length of the pack header.
			buf := make([]byte, 4)
			err := readFull(buf)
			if err != nil {
				loglib.Warning(fmt.Sprintf("conn:%s, get header len, tcp receiver read error:%s, elapse:%s", inAddr, err.Error(), time.Now().Sub(time1)))
				break
			}
			l, _ := binary.Uvarint(buf)
			headerLen := int(l)
			// Read the pack header itself.
			headerBuf := make([]byte, headerLen)
			time2 := time.Now()
			err = readFull(headerBuf)
			if err != nil {
				loglib.Warning(fmt.Sprintf("conn:%s, get header, tcp receiver read error:%s, elapse:%s", inAddr, err.Error(), time.Now().Sub(time2)))
				break
			}

			// Is this a re-pull?
			route0 := tcp_pack.ParseHeader(headerBuf)
			if v, ok := route0["repull"]; ok && v == "1" {
				rePull = true
			} else {
				rePull = false
			}

			buf = append(buf, headerBuf...)
			header, _, err := tcp_pack.ExtractHeader(buf)
			if err != nil {
				loglib.Error("wrong format header " + string(headerBuf) + " " + err.Error())
				conn.Write([]byte("wrong header"))
				break
			}

			packId = tcp_pack.GetPackId(buf)
			packLen = header.PackLen
			currLen = 0
			routeInfo = make(map[string]string)
			b = new(bytes.Buffer)
			content = new(bytes.Buffer)

			loglib.Info(fmt.Sprintf("conn:%s, start receive pack %s, pack len:%d, header len:%d, header elapse:%s", inAddr, packId, packLen, headerLen, time.Now().Sub(time1)))
			b.Write(buf)

			routeInfo["ip"] = lib.GetIp()
			routeInfo["stage"] = "tcp recv"
			routeInfo["st"] = st.Format("2006-01-02 15:04:05.000")
		}
		// Timeout for reading the pack body.
		conn.SetReadDeadline(time.Now().Add(5 * time.Minute))
		time3 := time.Now()
		// Read until the whole body has arrived.
		for currLen < packLen {
			requestLen, err := conn.Read(request)
			if requestLen == 0 || err != nil {
				// The sender retransmits, so the partial pack can be dropped.
				packLen = 0 // reset so a new pack would be read

				ed := time.Now()
				// err may be nil here (zero-length read), so format it with
				// %v instead of calling err.Error().
				loglib.Warning(fmt.Sprintf("conn:%s, not full! ip:%s, packid:%s, received:%d, end recv:%s, elapse:%s, body elapse:%s, error:%v", inAddr, inIp, packId, currLen, ed, ed.Sub(st), ed.Sub(time3), err))
				break outer // connection error: leave the outer loop
			}
			currLen += requestLen
			content.Write(request[:requestLen])
		}
		if packLen > 0 && currLen >= packLen {
			// Acknowledge as soon as the pack is complete.
			_, err := conn.Write([]byte("ok"))
			if err != nil {
				loglib.Warning(fmt.Sprintf("ip:%s, packid:%s received, but response back error:%s", inIp, packId, err.Error()))
			} else {
				loglib.Info(fmt.Sprintf("conn:%s, response to packid:%s", inAddr, packId))
			}
			// Skip duplicated packs (re-pulls are exempt).
			appeared, ok, code := t.hasAppeared(content)
			if !ok || rePull {
				ed := time.Now()
				routeInfo["ed"] = ed.Format("2006-01-02 15:04:05.000")
				routeInfo["elapse"] = ed.Sub(st).String()
				b.Write(content.Bytes())
				vbytes := tcp_pack.Packing(b.Bytes(), routeInfo, true)
				b = bytes.NewBuffer(vbytes)
				t.buffer <- *b
				packAppear := PackAppear{time.Now().Unix(), packId}
				t.mutex.Lock()
				t.footPrint[code] = packAppear // original author's note: this line has crashed before
				t.mutex.Unlock()

				loglib.Info(fmt.Sprintf("conn:%s, finish ip:%s, packid:%s, repull:%v, received:%d, elapse:%s, body elapse:%s", inAddr, inIp, packId, rePull, currLen, ed.Sub(st), ed.Sub(time3)))
			} else {
				loglib.Info(fmt.Sprintf("conn:%s, pack %s repeat %s already appear at %s", inAddr, packId, appeared.Id, time.Unix(appeared.Time, 0)))
			}
			packLen = 0
		}

	}
	loglib.Info("conn finish: " + inAddr)
}
Пример #11
0
// runEtl is the main loop of the ETL outputer.
//
// It starts five doEtl workers fed through a buffered channel of file keys,
// then consumes packs from e.buffer until that channel is closed. Each pack
// (4-byte uvarint header length, header, zlib payload) is decompressed into
// the "<ip>_<hour>" data file, its header plus a newline is appended to the
// matching header file, and the pack is registered with the integrity
// checker. When a pack is flagged done, or two minutes have passed since the
// last check, the checker is run, every hour it reports finished is queued
// for ETL, the writers are closed and the checker status is saved.
//
// On exit (including after a recovered panic) the status is saved, writers
// are closed, the key channel is closed and the workers are awaited.
func (e *etlOutputer) runEtl(spiderList string, colsFile string, hostsList string, ipBlackList string) {
	wg := &sync.WaitGroup{}
	fkeyChan := make(chan string, 100)
	defer func() {
		if r := recover(); r != nil {
			loglib.Error(fmt.Sprintf("runEtl() panic:%v", r))
		}

		e.ic.SaveStatus()
		e.closeWriters(e.writers)
		e.closeWriters(e.headerWriters)
		close(fkeyChan)
		// Wait for the etl routines to finish.
		wg.Wait()
	}()

	const etlWorkers = 5
	for worker := 0; worker < etlWorkers; worker++ {
		wg.Add(1)
		go e.doEtl(fkeyChan, e.dataDir, e.etlDir, e.etlDoneDir, e.etlFailDir, spiderList, colsFile, hostsList, ipBlackList, wg)
	}
	nextCheckTime := time.Now().Add(2 * time.Minute)
	// Ranging over the channel gives a clean shutdown: the loop ends once
	// the sending side closes e.buffer.
	for b := range e.buffer {
		loglib.Info(fmt.Sprintf("pack in chan: %d", len(e.buffer)))
		lenBuf := make([]byte, 4)
		pack := &b
		pack.Read(lenBuf)

		rawLen, _ := binary.Uvarint(lenBuf)
		// Read the pack header.
		headerBytes := make([]byte, int(rawLen))
		pack.Read(headerBytes)
		header := tcp_pack.ParseHeader(headerBytes)

		r, err := zlib.NewReader(pack)
		if err != nil {
			loglib.Error("zlib reader Error: " + err.Error())
			continue
		}

		lines, _ := strconv.Atoi(header["lines"])
		done := header["done"] == "1"
		e.ic.Add(header["ip"], header["hour"], header["id"], lines, done)

		writerKey := header["ip"] + "_" + header["hour"]
		fout := e.getWriter(e.writers, e.dataDir, writerKey)

		// The header is not written around the payload (saves disk space);
		// it only goes to the dedicated header file below.
		headerBytes = append(headerBytes, '\n')
		nn, err := io.Copy(fout, r)
		if err != nil {
			loglib.Warning(fmt.Sprintf("save %s_%s_%s error:%s, saved:%d", header["ip"], header["hour"], header["id"], err, nn))
		}

		// Keep a separate copy of the header to make counting/auditing easier.
		fout = e.getWriter(e.headerWriters, e.headerDir, writerKey)
		n, err := fout.Write(headerBytes)
		if err != nil {
			loglib.Info(fmt.Sprintf("writer header %s %d %s", writerKey, n, err.Error()))
		}

		// Besides the done flag, also check every 2 minutes: if the done
		// pack arrives before the others, the hour could otherwise stay
		// unchecked until the next hour.
		if done || time.Now().Unix() > nextCheckTime.Unix() {
			hourFinish, _ := e.ic.Check()
			for ip, hours := range hourFinish {
				for _, hour := range hours {
					fkey := ip + "_" + hour
					loglib.Info(fmt.Sprintf("fkeychan %d", len(fkeyChan)))
					fkeyChan <- fkey
				}
			}
			e.closeWriters(e.writers)
			e.closeWriters(e.headerWriters)
			e.ic.SaveStatus()
			nextCheckTime = time.Now().Add(2 * time.Minute)
		}

		r.Close()
	}
}