func ReadConfig(file string) error { var configInfo struct { Version string TimeFormats []string Fields []string Analyzer struct { Prekeys map[string][]string Keywords map[string][]string } } if _, err := toml.DecodeFile(file, &configInfo); err != nil { return err } timeFsmRoot = buildTimeFSM(configInfo.TimeFormats) config.fieldIDs = make(map[string]FieldType, 30) config.fieldNames = config.fieldNames[:0] config.fieldTypes = config.fieldTypes[:0] keymaps.keywords = make(map[string]FieldType, 30) keymaps.prekeys = make(map[string][]FieldType, 30) var ftype FieldType = 0 config.fieldIDs["funknown"] = ftype config.fieldNames = append(config.fieldNames, "funknown") config.fieldTypes = append(config.fieldTypes, TokenUnknown) ftype++ for _, f := range configInfo.Fields { fs := strings.Split(f, ":") if len(fs) != 2 || fs[1] == "" { return fmt.Errorf("Error parsing field %q: missing token type", f) } // field type name, token type tt := name2TokenType(fs[1]) if tt < TokenLiteral || tt > TokenString { return fmt.Errorf("Error parsing field %q: invalid token type", f) } config.fieldIDs[fs[0]] = ftype config.fieldNames = append(config.fieldNames, fs[0]) config.fieldTypes = append(config.fieldTypes, tt) ftype++ } for f, t := range config.fieldIDs { predefineAnalyzerFields(f, t) } for w, list := range configInfo.Analyzer.Keywords { if f, ok := config.fieldIDs[w]; ok { for _, kw := range list { pw := porter2.Stem(kw) keymaps.keywords[pw] = f } } } for w, m := range configInfo.Analyzer.Prekeys { for _, fw := range m { if f, ok := config.fieldIDs[fw]; ok { keymaps.prekeys[w] = append(keymaps.prekeys[w], f) } } } FieldTypesCount = len(config.fieldNames) allTypesCount = TokenTypesCount + FieldTypesCount return nil }
func analyzeSequence(seq Sequence) Sequence { l := len(seq) var fexists = make([]bool, FieldTypesCount) defer func() { // Step 7: try to see if we can find any srcport and dstport fields for i, tok := range seq { if tok.Type == token__host__ || tok.Type == token__email__ { seq[i].Type = TokenString } if i < l-2 && tok.Type == TokenIPv4 && (seq[i+1].Value == "/" || seq[i+1].Value == ":") && seq[i+2].Type == TokenInteger { switch tok.Field { case FieldSrcIP: seq[i+2].Field = FieldSrcPort seq[i+2].Type = seq[i+2].Field.TokenType() fexists[seq[i+2].Field] = true case FieldDstIP: seq[i+2].Field = FieldDstPort seq[i+2].Type = seq[i+2].Field.TokenType() fexists[seq[i+2].Field] = true case FieldSrcIPNAT: seq[i+2].Field = FieldSrcPortNAT seq[i+2].Type = seq[i+2].Field.TokenType() fexists[seq[i+2].Field] = true case FieldDstIPNAT: seq[i+2].Field = FieldDstPortNAT seq[i+2].Type = seq[i+2].Field.TokenType() fexists[seq[i+2].Field] = true } } } //glog.Debugf("7. %s", seq) }() // Step 1: mark all key=value pairs, as well as any prekey words as key seq = markSequenceKV(seq) for i, tok := range seq { if _, ok := keymaps.prekeys[tok.Value]; ok { seq[i].isKey = true } } // Step 2: lower case all literals, and try to recognize emails and host names for i, tok := range seq { if tok.Type == TokenLiteral && tok.Field == FieldUnknown { seq[i].Value = strings.ToLower(tok.Value) // Matching a effective top level domain if etld.Match(tok.Value) > 0 { // Matching an email address if strings.Index(tok.Value, "@") > 0 { seq[i].Type = token__email__ } else if strings.Index(tok.Value, ".") > 0 { seq[i].Type = token__host__ } } } } //glog.Debugf("2. %s", seq.PrintTokens()) // Step 3: try to recognize syslog headers (RFC5424 and RFC3164) // RFC5424 // - "1 2003-10-11T22:14:15.003Z mymachine.example.com evntslog - ID47 ..." // - "1 2003-08-24T05:14:15.000003-07:00 192.0.2.1 myproc 8710 - ..." // - "1 2003-10-11T22:14:15.003Z mymachine.example.com su - ID47 ..." // RFC3164 // - "Oct 11 22:14:15 mymachine su: ..." // - "Aug 24 05:34:00 CST 1987 mymachine myproc[10]: ..." if len(seq) >= 6 && seq[0].Type == TokenInteger && seq[1].Type == TokenTime && (seq[2].Type == TokenIPv4 || seq[2].Type == TokenIPv6 || seq[2].Type == token__host__ || seq[2].Type == TokenLiteral || seq[2].Type == TokenString) && seq[3].Type == TokenLiteral && (seq[4].Type == TokenInteger || (seq[4].Type == TokenLiteral && seq[4].Value == "-")) && (seq[5].Type == TokenLiteral) { // RFC5424 header format // message time seq[1].Field = FieldMsgTime seq[1].Type = seq[1].Field.TokenType() fexists[seq[1].Field] = true // app ip or hostname switch seq[2].Type { case TokenIPv4: seq[2].Field = FieldAppIP case token__host__, TokenLiteral, TokenString: seq[2].Field = FieldAppHost } seq[2].Type = seq[2].Field.TokenType() fexists[seq[2].Field] = true // appname seq[3].Field = FieldAppName seq[3].Type = seq[3].Field.TokenType() fexists[seq[3].Field] = true // session id (or proc id) seq[4].Field = FieldSessionID seq[4].Type = seq[4].Field.TokenType() fexists[seq[4].Field] = true // message id seq[5].Field = FieldMsgId seq[5].Type = seq[5].Field.TokenType() fexists[seq[5].Field] = true } else if len(seq) >= 4 && seq[0].Type == TokenTime && (seq[1].Type == TokenIPv4 || seq[1].Type == TokenIPv6 || seq[1].Type == token__host__ || seq[1].Type == TokenLiteral || seq[1].Type == TokenString) && (seq[2].Type == TokenLiteral || seq[2].Type == TokenString) && (seq[3].Type == TokenLiteral && seq[3].Value == ":") { // RFC3164 format 1 - "Oct 11 22:14:15 mymachine su: ..." // message time seq[0].Field = FieldMsgTime seq[0].Type = seq[0].Field.TokenType() fexists[seq[0].Field] = true // app ip or hostname switch seq[1].Type { case TokenIPv4: seq[1].Field = FieldAppIP case token__host__, TokenLiteral, TokenString: seq[1].Field = FieldAppHost } seq[1].Type = seq[1].Field.TokenType() fexists[seq[1].Field] = true // appname seq[2].Field = FieldAppName seq[2].Type = seq[2].Field.TokenType() fexists[seq[2].Field] = true } else if len(seq) >= 7 && seq[0].Type == TokenTime && (seq[1].Type == TokenIPv4 || seq[1].Type == TokenIPv6 || seq[1].Type == token__host__ || seq[1].Type == TokenLiteral || seq[1].Type == TokenString) && (seq[2].Type == TokenLiteral || seq[2].Type == TokenString) && (seq[3].Type == TokenLiteral && seq[3].Value == "[") && (seq[4].Type == TokenInteger) && (seq[5].Type == TokenLiteral && seq[5].Value == "]") && (seq[6].Type == TokenLiteral && seq[6].Value == ":") { // RFC3164 format 2 - "Aug 24 05:34:00 CST 1987 mymachine myproc[10]: ..." // message time seq[0].Field = FieldMsgTime seq[0].Type = seq[0].Field.TokenType() fexists[seq[0].Field] = true // app ip or hostname switch seq[1].Type { case TokenIPv4: seq[1].Field = FieldAppIP case token__host__, TokenLiteral, TokenString: seq[1].Field = FieldAppHost } seq[1].Type = seq[1].Field.TokenType() fexists[seq[1].Field] = true // appname seq[2].Field = FieldAppName seq[2].Type = seq[2].Field.TokenType() fexists[seq[2].Field] = true // session id (or proc id) seq[4].Field = FieldSessionID seq[4].Type = seq[4].Field.TokenType() fexists[seq[4].Field] = true } else if len(seq) >= 7 && seq[0].Type == TokenTime && (seq[1].Type == TokenIPv4 || seq[1].Type == TokenIPv6 || seq[1].Type == token__host__ || seq[1].Type == TokenLiteral || seq[1].Type == TokenString) && seq[2].Value == "last" { // "jan 12 06:49:56 irc last message repeated 6 times" // message time seq[0].Field = FieldMsgTime seq[0].Type = seq[0].Field.TokenType() fexists[seq[0].Field] = true // app ip or hostname switch seq[1].Type { case TokenIPv4: seq[1].Field = FieldAppIP case token__host__, TokenLiteral, TokenString: seq[1].Field = FieldAppHost } seq[1].Type = seq[1].Field.TokenType() fexists[seq[1].Field] = true } // glog.Debugf("3. %s", seq) // Step 5: identify the likely fields by their prekeys (literals that usually // exist before non-literals). All values must be within 2 tokens away, not // counting single character non-a-zA-Z tokens. distance := 2 LOOP: for i, tok := range seq { // Only mark unknown tokens if tok.Field != FieldUnknown { continue } //glog.Debugf("1. checking tok=%q", tok) if fields, ok := keymaps.prekeys[tok.Value]; ok { // This token is a matching prekey // Match anyting non-string fields first for _, f := range fields { if fexists[f] || f.TokenType() == TokenString || f.TokenType() == TokenUnknown { continue } var j int // j is the number of tokens away from the key // This is a specific type, so match the type, within the next 2 tokens // away, not counting single character non-a-zA-Z tokens. for k := i + 1; k < l && j < distance; k++ { if !fexists[f] && seq[k].Field == FieldUnknown && f.TokenType() == seq[k].Type && !seq[k].isKey { seq[k].Field = f seq[k].Type = seq[k].Field.TokenType() fexists[seq[k].Field] = true //glog.Debugf("found something for tok=%q", tok) // Found what we need, let's go to the next token continue LOOP } if seq[k].Type != TokenLiteral || (seq[k].Type == TokenLiteral && len(seq[k].Value) > 1) || (seq[k].Type == TokenLiteral && len(seq[k].Value) == 1 && ((seq[k].Value[0] >= 'a' && seq[k].Value[0] <= 'z') || (seq[k].Value[0] >= 'A' && seq[k].Value[0] <= 'Z'))) { j++ } } } for _, f := range fields { //glog.Debugf("2. checking tok=%q", tok) // If the field type is already taken, move on // Should ONLY have TokenString left not touched if fexists[f] || f.TokenType() != TokenString { continue } switch f { case FieldSrcHost, FieldDstHost, FieldSrcEmail, FieldDstEmail: for k := i + 1; k < l && k < i+distance; k++ { if !fexists[f] && seq[k].Field == FieldUnknown && !seq[k].isKey && (seq[k].Type == token__host__ && (f == FieldSrcHost || f == FieldDstHost)) || (seq[k].Type == token__email__ && (f == FieldSrcEmail || f == FieldDstEmail)) { seq[k].Field = f seq[k].Type = seq[k].Field.TokenType() fexists[seq[k].Field] = true continue LOOP } } default: var j int // j is the number of tokens away from the key // This is a regular string type, let's find a literal or string // token, within the next 2 tokens for k := i + 1; k < l && j < distance; k++ { // if the value field type is a string, then we only look for // either TokenString or TokenLiteral tokens in the next one or // two tokens. The token should not include any single character // literals that are not a-zA-Z. if seq[k].Field == FieldUnknown && !seq[k].isKey && (seq[k].Type == TokenString || (seq[k].Type == TokenLiteral && len(seq[k].Value) > 1) || (seq[k].Type == TokenLiteral && len(seq[k].Value) == 1 && ((seq[k].Value[0] >= 'a' && seq[k].Value[0] <= 'z') || (seq[k].Value[0] >= 'A' && seq[k].Value[0] <= 'Z')))) { seq[k].Field = f seq[k].Type = seq[k].Field.TokenType() fexists[seq[k].Field] = true continue LOOP } if seq[k].Type != TokenLiteral || (seq[k].Type == TokenLiteral && len(seq[k].Value) > 1) || (seq[k].Type == TokenLiteral && len(seq[k].Value) == 1 && ((seq[k].Value[0] >= 'a' && seq[k].Value[0] <= 'z') || (seq[k].Value[0] >= 'A' && seq[k].Value[0] <= 'Z'))) { j++ } } } } } } //glog.Debugf("5. %s", seq) // Step 4: match any key actions, statuses, objects and other keywords, and mark // accordingly We do seq step after the k=v step so we don't mistakenly mark // any keys for i, tok := range seq { if !tok.isKey && !tok.isValue && (tok.Type == TokenLiteral || tok.Type == TokenString) && tok.Field == FieldUnknown { pw := porter2.Stem(tok.Value) if f, ok := keymaps.keywords[pw]; ok { if !fexists[f] { seq[i].Field = f seq[i].Type = f.TokenType() fexists[f] = true } } } } //glog.Debugf("4. %s", seq) // Step 6: look for the first and second of these types, and mark accordingly for i, tok := range seq { if tok.Field == FieldUnknown { switch tok.Type { case TokenTime: if !fexists[FieldMsgTime] { seq[i].Field = FieldMsgTime seq[i].Type = seq[i].Field.TokenType() fexists[FieldMsgTime] = true } case TokenURI: if !fexists[FieldObject] { seq[i].Field = FieldObject seq[i].Type = seq[i].Field.TokenType() fexists[FieldObject] = true } case TokenMac: if !fexists[FieldSrcMac] { seq[i].Field = FieldSrcMac seq[i].Type = seq[i].Field.TokenType() fexists[FieldSrcMac] = true } else if !fexists[FieldDstMac] { seq[i].Field = FieldDstMac seq[i].Type = seq[i].Field.TokenType() fexists[FieldDstMac] = true } case TokenIPv4: if !fexists[FieldSrcIP] { seq[i].Field = FieldSrcIP seq[i].Type = seq[i].Field.TokenType() fexists[FieldSrcIP] = true } else if !fexists[FieldDstIP] { seq[i].Field = FieldDstIP seq[i].Type = seq[i].Field.TokenType() fexists[FieldDstIP] = true } case token__host__: if !fexists[FieldSrcHost] { seq[i].Field = FieldSrcHost seq[i].Type = seq[i].Field.TokenType() fexists[FieldSrcHost] = true } else if !fexists[FieldDstHost] { seq[i].Field = FieldDstHost seq[i].Type = seq[i].Field.TokenType() fexists[FieldDstHost] = true } case token__email__: if !fexists[FieldSrcEmail] { seq[i].Field = FieldSrcEmail seq[i].Type = seq[i].Field.TokenType() fexists[FieldSrcEmail] = true } else if !fexists[FieldDstEmail] { seq[i].Field = FieldDstEmail seq[i].Type = seq[i].Field.TokenType() fexists[FieldDstEmail] = true } } } } //glog.Debugf("6. %s", seq) return seq }