Example #1
File: config.go  Project: nix8/sequence
func ReadConfig(file string) error {
	var configInfo struct {
		Version     string
		TimeFormats []string
		Fields      []string

		Analyzer struct {
			Prekeys  map[string][]string
			Keywords map[string][]string
		}
	}

	if _, err := toml.DecodeFile(file, &configInfo); err != nil {
		return err
	}

	timeFsmRoot = buildTimeFSM(configInfo.TimeFormats)

	config.fieldIDs = make(map[string]FieldType, 30)
	config.fieldNames = config.fieldNames[:0]
	config.fieldTypes = config.fieldTypes[:0]

	keymaps.keywords = make(map[string]FieldType, 30)
	keymaps.prekeys = make(map[string][]FieldType, 30)

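	// Field type 0 is reserved for "funknown" (FieldUnknown), the field
	// assigned to tokens that cannot be classified.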
	var ftype FieldType = 0
	config.fieldIDs["funknown"] = ftype
	config.fieldNames = append(config.fieldNames, "funknown")
	config.fieldTypes = append(config.fieldTypes, TokenUnknown)
	ftype++

	for _, f := range configInfo.Fields {
		fs := strings.Split(f, ":")
		if len(fs) != 2 || fs[1] == "" {
			return fmt.Errorf("Error parsing field %q: missing token type", f)
		}

		// field type name, token type
		tt := name2TokenType(fs[1])
		if tt < TokenLiteral || tt > TokenString {
			return fmt.Errorf("Error parsing field %q: invalid token type", f)
		}

		config.fieldIDs[fs[0]] = ftype
		config.fieldNames = append(config.fieldNames, fs[0])
		config.fieldTypes = append(config.fieldTypes, tt)
		ftype++
	}

	for f, t := range config.fieldIDs {
		predefineAnalyzerFields(f, t)
	}

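	// Keywords are stemmed before being stored so that lookups against
	// stemmed message tokens (see Step 4 in analyzer.go) hit the same keys.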
	for w, list := range configInfo.Analyzer.Keywords {
		if f, ok := config.fieldIDs[w]; ok {
			for _, kw := range list {
				pw := porter2.Stem(kw)
				keymaps.keywords[pw] = f
			}
		}
	}

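	// A prekey may map to several candidate field types; names that do not
	// refer to configured fields are silently dropped.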
	for w, m := range configInfo.Analyzer.Prekeys {
		for _, fw := range m {
			if f, ok := config.fieldIDs[fw]; ok {
				keymaps.prekeys[w] = append(keymaps.prekeys[w], f)
			}
		}
	}

	FieldTypesCount = len(config.fieldNames)
	allTypesCount = TokenTypesCount + FieldTypesCount

	return nil
}
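
For reference, here is a small, hypothetical sketch of the configuration shape ReadConfig expects, inferred from the anonymous struct and the "name:tokentype" split above. The section and key names mirror the struct; all concrete values (the sample TOML document, the field names, the token type names) are illustrative assumptions, not taken from the project, and toml.Decode stands in for the toml.DecodeFile call that ReadConfig itself makes.

package main

import (
	"fmt"

	"github.com/BurntSushi/toml"
)

// sampleConfig is a hypothetical document matching the struct that ReadConfig
// decodes. Only the key names and the "fieldname:tokentype" shape of the
// fields entries follow from the code above; the values are made up.
const sampleConfig = `
version = "1.0"
timeformats = ["Jan _2 15:04:05"]
fields = ["msgtime:time", "srcip:ipv4", "action:string"]

[analyzer.prekeys]
from = ["srcip", "srchost"]

[analyzer.keywords]
action = ["accept", "deny"]
`

func main() {
	var cfg struct {
		Version     string
		TimeFormats []string
		Fields      []string

		Analyzer struct {
			Prekeys  map[string][]string
			Keywords map[string][]string
		}
	}
	if _, err := toml.Decode(sampleConfig, &cfg); err != nil {
		panic(err)
	}
	fmt.Println(cfg.Fields) // [msgtime:time srcip:ipv4 action:string]
}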
Example #2
File: analyzer.go  Project: nix8/sequence
func analyzeSequence(seq Sequence) Sequence {
	l := len(seq)
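	// fexists tracks which field types have already been assigned in this
	// sequence, so each field is used at most once.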
	var fexists = make([]bool, FieldTypesCount)

	defer func() {
		// Step 7: try to see if we can find any srcport and dstport fields
		for i, tok := range seq {
			if tok.Type == token__host__ || tok.Type == token__email__ {
				seq[i].Type = TokenString
			}

			if i < l-2 && tok.Type == TokenIPv4 && (seq[i+1].Value == "/" || seq[i+1].Value == ":") &&
				seq[i+2].Type == TokenInteger {

				switch tok.Field {
				case FieldSrcIP:
					seq[i+2].Field = FieldSrcPort
					seq[i+2].Type = seq[i+2].Field.TokenType()
					fexists[seq[i+2].Field] = true

				case FieldDstIP:
					seq[i+2].Field = FieldDstPort
					seq[i+2].Type = seq[i+2].Field.TokenType()
					fexists[seq[i+2].Field] = true

				case FieldSrcIPNAT:
					seq[i+2].Field = FieldSrcPortNAT
					seq[i+2].Type = seq[i+2].Field.TokenType()
					fexists[seq[i+2].Field] = true

				case FieldDstIPNAT:
					seq[i+2].Field = FieldDstPortNAT
					seq[i+2].Type = seq[i+2].Field.TokenType()
					fexists[seq[i+2].Field] = true
				}

			}
		}

		//glog.Debugf("7. %s", seq)

	}()

	// Step 1: mark all key=value pairs, as well as any prekey words, as keys
	seq = markSequenceKV(seq)

	for i, tok := range seq {
		if _, ok := keymaps.prekeys[tok.Value]; ok {
			seq[i].isKey = true
		}
	}

	// Step 2: lower case all literals, and try to recognize emails and host names
	for i, tok := range seq {
		if tok.Type == TokenLiteral && tok.Field == FieldUnknown {
			seq[i].Value = strings.ToLower(tok.Value)

			// Matching an effective top-level domain
			if etld.Match(tok.Value) > 0 {
				// Matching an email address
				if strings.Index(tok.Value, "@") > 0 {
					seq[i].Type = token__email__
				} else if strings.Index(tok.Value, ".") > 0 {
					seq[i].Type = token__host__
				}
			}
		}
	}

	//glog.Debugf("2. %s", seq.PrintTokens())

	// Step 3: try to recognize syslog headers (RFC5424 and RFC3164)
	// RFC5424
	// - "1 2003-10-11T22:14:15.003Z mymachine.example.com evntslog - ID47 ..."
	// - "1 2003-08-24T05:14:15.000003-07:00 192.0.2.1 myproc 8710 - ..."
	// - "1 2003-10-11T22:14:15.003Z mymachine.example.com su - ID47 ..."
	// RFC3164
	// - "Oct 11 22:14:15 mymachine su: ..."
	// - "Aug 24 05:34:00 CST 1987 mymachine myproc[10]: ..."
	if len(seq) >= 6 && seq[0].Type == TokenInteger && seq[1].Type == TokenTime &&
		(seq[2].Type == TokenIPv4 || seq[2].Type == TokenIPv6 || seq[2].Type == token__host__ || seq[2].Type == TokenLiteral || seq[2].Type == TokenString) &&
		seq[3].Type == TokenLiteral &&
		(seq[4].Type == TokenInteger || (seq[4].Type == TokenLiteral && seq[4].Value == "-")) &&
		(seq[5].Type == TokenLiteral) {

		// RFC5424 header format
		// message time
		seq[1].Field = FieldMsgTime
		seq[1].Type = seq[1].Field.TokenType()
		fexists[seq[1].Field] = true

		// app ip or hostname
		switch seq[2].Type {
		case TokenIPv4:
			seq[2].Field = FieldAppIP

		case token__host__, TokenLiteral, TokenString:
			seq[2].Field = FieldAppHost
		}

		seq[2].Type = seq[2].Field.TokenType()
		fexists[seq[2].Field] = true

		// appname
		seq[3].Field = FieldAppName
		seq[3].Type = seq[3].Field.TokenType()
		fexists[seq[3].Field] = true

		// session id (or proc id)
		seq[4].Field = FieldSessionID
		seq[4].Type = seq[4].Field.TokenType()
		fexists[seq[4].Field] = true

		// message id
		seq[5].Field = FieldMsgId
		seq[5].Type = seq[5].Field.TokenType()
		fexists[seq[5].Field] = true
	} else if len(seq) >= 4 && seq[0].Type == TokenTime &&
		(seq[1].Type == TokenIPv4 || seq[1].Type == TokenIPv6 || seq[1].Type == token__host__ || seq[1].Type == TokenLiteral || seq[1].Type == TokenString) &&
		(seq[2].Type == TokenLiteral || seq[2].Type == TokenString) &&
		(seq[3].Type == TokenLiteral && seq[3].Value == ":") {

		// RFC3164 format 1 - "Oct 11 22:14:15 mymachine su: ..."
		// message time
		seq[0].Field = FieldMsgTime
		seq[0].Type = seq[0].Field.TokenType()
		fexists[seq[0].Field] = true

		// app ip or hostname
		switch seq[1].Type {
		case TokenIPv4:
			seq[1].Field = FieldAppIP

		case token__host__, TokenLiteral, TokenString:
			seq[1].Field = FieldAppHost
		}

		seq[1].Type = seq[1].Field.TokenType()
		fexists[seq[1].Field] = true

		// appname
		seq[2].Field = FieldAppName
		seq[2].Type = seq[2].Field.TokenType()
		fexists[seq[2].Field] = true
	} else if len(seq) >= 7 && seq[0].Type == TokenTime &&
		(seq[1].Type == TokenIPv4 || seq[1].Type == TokenIPv6 || seq[1].Type == token__host__ || seq[1].Type == TokenLiteral || seq[1].Type == TokenString) &&
		(seq[2].Type == TokenLiteral || seq[2].Type == TokenString) &&
		(seq[3].Type == TokenLiteral && seq[3].Value == "[") &&
		(seq[4].Type == TokenInteger) &&
		(seq[5].Type == TokenLiteral && seq[5].Value == "]") &&
		(seq[6].Type == TokenLiteral && seq[6].Value == ":") {

		// RFC3164 format 2 - "Aug 24 05:34:00 CST 1987 mymachine myproc[10]: ..."
		// message time
		seq[0].Field = FieldMsgTime
		seq[0].Type = seq[0].Field.TokenType()
		fexists[seq[0].Field] = true

		// app ip or hostname
		switch seq[1].Type {
		case TokenIPv4:
			seq[1].Field = FieldAppIP

		case token__host__, TokenLiteral, TokenString:
			seq[1].Field = FieldAppHost
		}

		seq[1].Type = seq[1].Field.TokenType()
		fexists[seq[1].Field] = true

		// appname
		seq[2].Field = FieldAppName
		seq[2].Type = seq[2].Field.TokenType()
		fexists[seq[2].Field] = true

		// session id (or proc id)
		seq[4].Field = FieldSessionID
		seq[4].Type = seq[4].Field.TokenType()
		fexists[seq[4].Field] = true
	} else if len(seq) >= 7 && seq[0].Type == TokenTime &&
		(seq[1].Type == TokenIPv4 || seq[1].Type == TokenIPv6 || seq[1].Type == token__host__ || seq[1].Type == TokenLiteral || seq[1].Type == TokenString) &&
		seq[2].Value == "last" {

		// "jan 12 06:49:56 irc last message repeated 6 times"
		// message time
		seq[0].Field = FieldMsgTime
		seq[0].Type = seq[0].Field.TokenType()
		fexists[seq[0].Field] = true

		// app ip or hostname
		switch seq[1].Type {
		case TokenIPv4:
			seq[1].Field = FieldAppIP

		case token__host__, TokenLiteral, TokenString:
			seq[1].Field = FieldAppHost
		}

		seq[1].Type = seq[1].Field.TokenType()
		fexists[seq[1].Field] = true
	}

	// glog.Debugf("3. %s", seq)

	// Step 5: identify likely fields by their prekeys (literals that usually
	// appear before non-literal values). A value must be within 2 tokens of its
	// key, not counting single-character non-a-zA-Z tokens.
	distance := 2

LOOP:
	for i, tok := range seq {
		// Only mark unknown tokens
		if tok.Field != FieldUnknown {
			continue
		}

		//glog.Debugf("1. checking tok=%q", tok)

		if fields, ok := keymaps.prekeys[tok.Value]; ok {

			// This token is a matching prekey

			// Match any non-string fields first
			for _, f := range fields {

				if fexists[f] || f.TokenType() == TokenString || f.TokenType() == TokenUnknown {
					continue
				}

				var j int // j is the number of tokens away from the key

				// This is a specific (non-string) type, so look for a token of that
				// type within the next 2 tokens, not counting single-character
				// non-a-zA-Z tokens.
				for k := i + 1; k < l && j < distance; k++ {
					if !fexists[f] && seq[k].Field == FieldUnknown && f.TokenType() == seq[k].Type && !seq[k].isKey {
						seq[k].Field = f
						seq[k].Type = seq[k].Field.TokenType()
						fexists[seq[k].Field] = true

						//glog.Debugf("found something for tok=%q", tok)

						// Found what we need, let's go to the next token
						continue LOOP
					}

					if seq[k].Type != TokenLiteral ||
						(seq[k].Type == TokenLiteral && len(seq[k].Value) > 1) ||
						(seq[k].Type == TokenLiteral && len(seq[k].Value) == 1 &&
							((seq[k].Value[0] >= 'a' && seq[k].Value[0] <= 'z') ||
								(seq[k].Value[0] >= 'A' && seq[k].Value[0] <= 'Z'))) {

						j++
					}
				}
			}

			for _, f := range fields {

				//glog.Debugf("2. checking tok=%q", tok)

				// If the field type is already taken, move on.
				// At this point only TokenString fields should be left untouched.
				if fexists[f] || f.TokenType() != TokenString {
					continue
				}

				switch f {
				case FieldSrcHost, FieldDstHost, FieldSrcEmail, FieldDstEmail:
					for k := i + 1; k < l && k < i+distance; k++ {
						if !fexists[f] && seq[k].Field == FieldUnknown && !seq[k].isKey &&
							((seq[k].Type == token__host__ && (f == FieldSrcHost || f == FieldDstHost)) ||
								(seq[k].Type == token__email__ && (f == FieldSrcEmail || f == FieldDstEmail))) {

							seq[k].Field = f
							seq[k].Type = seq[k].Field.TokenType()
							fexists[seq[k].Field] = true
							continue LOOP
						}
					}

				default:
					var j int // j is the number of tokens away from the key

					// This is a regular string type, so look for a literal or string
					// token within the next 2 tokens
					for k := i + 1; k < l && j < distance; k++ {
						// if the value field type is a string, then we only look for
						// either TokenString or TokenLiteral tokens in the next one or
						// two tokens. The token should not include any single character
						// literals that are not a-zA-Z.
						if seq[k].Field == FieldUnknown && !seq[k].isKey &&
							(seq[k].Type == TokenString ||
								(seq[k].Type == TokenLiteral && len(seq[k].Value) > 1) ||
								(seq[k].Type == TokenLiteral && len(seq[k].Value) == 1 &&
									((seq[k].Value[0] >= 'a' && seq[k].Value[0] <= 'z') ||
										(seq[k].Value[0] >= 'A' && seq[k].Value[0] <= 'Z')))) {

							seq[k].Field = f
							seq[k].Type = seq[k].Field.TokenType()
							fexists[seq[k].Field] = true
							continue LOOP
						}

						if seq[k].Type != TokenLiteral ||
							(seq[k].Type == TokenLiteral && len(seq[k].Value) > 1) ||
							(seq[k].Type == TokenLiteral && len(seq[k].Value) == 1 &&
								((seq[k].Value[0] >= 'a' && seq[k].Value[0] <= 'z') ||
									(seq[k].Value[0] >= 'A' && seq[k].Value[0] <= 'Z'))) {

							j++
						}
					}
				}
			}
		}
	}

	//glog.Debugf("5. %s", seq)

	// Step 4: match any key actions, statuses, objects and other keywords, and
	// mark them accordingly. We do this step after the k=v step so we don't
	// mistakenly mark any keys.
	for i, tok := range seq {
		if !tok.isKey && !tok.isValue && (tok.Type == TokenLiteral || tok.Type == TokenString) && tok.Field == FieldUnknown {
			pw := porter2.Stem(tok.Value)
			if f, ok := keymaps.keywords[pw]; ok {
				if !fexists[f] {
					seq[i].Field = f
					seq[i].Type = f.TokenType()
					fexists[f] = true
				}
			}
		}
	}

	//glog.Debugf("4. %s", seq)
	// Step 6: look for the first and second occurrences of these types, and
	// mark them accordingly
	for i, tok := range seq {
		if tok.Field == FieldUnknown {
			switch tok.Type {
			case TokenTime:
				if !fexists[FieldMsgTime] {
					seq[i].Field = FieldMsgTime
					seq[i].Type = seq[i].Field.TokenType()
					fexists[FieldMsgTime] = true
				}

			case TokenURI:
				if !fexists[FieldObject] {
					seq[i].Field = FieldObject
					seq[i].Type = seq[i].Field.TokenType()
					fexists[FieldObject] = true
				}

			case TokenMac:
				if !fexists[FieldSrcMac] {
					seq[i].Field = FieldSrcMac
					seq[i].Type = seq[i].Field.TokenType()
					fexists[FieldSrcMac] = true
				} else if !fexists[FieldDstMac] {
					seq[i].Field = FieldDstMac
					seq[i].Type = seq[i].Field.TokenType()
					fexists[FieldDstMac] = true
				}

			case TokenIPv4:
				if !fexists[FieldSrcIP] {
					seq[i].Field = FieldSrcIP
					seq[i].Type = seq[i].Field.TokenType()
					fexists[FieldSrcIP] = true
				} else if !fexists[FieldDstIP] {
					seq[i].Field = FieldDstIP
					seq[i].Type = seq[i].Field.TokenType()
					fexists[FieldDstIP] = true
				}

			case token__host__:
				if !fexists[FieldSrcHost] {
					seq[i].Field = FieldSrcHost
					seq[i].Type = seq[i].Field.TokenType()
					fexists[FieldSrcHost] = true
				} else if !fexists[FieldDstHost] {
					seq[i].Field = FieldDstHost
					seq[i].Type = seq[i].Field.TokenType()
					fexists[FieldDstHost] = true
				}

			case token__email__:
				if !fexists[FieldSrcEmail] {
					seq[i].Field = FieldSrcEmail
					seq[i].Type = seq[i].Field.TokenType()
					fexists[FieldSrcEmail] = true
				} else if !fexists[FieldDstEmail] {
					seq[i].Field = FieldDstEmail
					seq[i].Type = seq[i].Field.TokenType()
					fexists[FieldDstEmail] = true
				}
			}
		}
	}

	//glog.Debugf("6. %s", seq)

	return seq
}
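
A minimal in-package usage sketch for analyzeSequence follows. It assumes ReadConfig has already run (so FieldTypesCount and keymaps are populated), that Sequence is a slice of token structs with the exported Type, Value and Field members used above, and that the zero Field value is FieldUnknown; the token values themselves are made up for illustration.

// exampleAnalyze is a hypothetical helper living in the same package. It
// hand-builds a tiny Sequence instead of using the project's scanner, purely
// to show the in/out contract: analyzeSequence returns the same tokens with
// Field values (e.g. FieldSrcIP, and FieldSrcPort via the deferred Step 7)
// filled in where it can identify them.
func exampleAnalyze() Sequence {
	seq := Sequence{
		{Type: TokenIPv4, Value: "10.0.0.1"},
		{Type: TokenLiteral, Value: ":"},
		{Type: TokenInteger, Value: "443"},
	}
	return analyzeSequence(seq)
}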