func compileSyntax(re *syntax.Regexp, expr string, longest bool) (*Regexp, error) { maxCap := re.MaxCap() capNames := re.CapNames() re = re.Simplify() prog, err := syntax.Compile(re) if err != nil { return nil, err } regexp := &Regexp{ expr: expr, prog: prog, onepass: compileOnePass(prog), numSubexp: maxCap, subexpNames: capNames, cond: prog.StartCond(), longest: longest, } if regexp.onepass == notOnePass { regexp.prefix, regexp.prefixComplete = prog.Prefix() } else { regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) } if regexp.prefix != "" { // TODO(rsc): Remove this allocation by adding // IndexString to package bytes. regexp.prefixBytes = []byte(regexp.prefix) regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) } return regexp, nil }
func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { re, err := syntax.Parse(expr, mode) if err != nil { return nil, err } maxCap := re.MaxCap() capNames := re.CapNames() re = re.Simplify() prog, err := syntax.Compile(re) if err != nil { return nil, err } regexp := &Regexp{ expr: expr, prog: prog, numSubexp: maxCap, subexpNames: capNames, cond: prog.StartCond(), longest: longest, } regexp.prefix, regexp.prefixComplete = prog.Prefix() if regexp.prefix != "" { // TODO(rsc): Remove this allocation by adding // IndexString to package bytes. regexp.prefixBytes = []byte(regexp.prefix) regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) } return regexp, nil }
/*
I'm sorry, dear reader. I really am.

The problem here is to take an arbitrary regular expression and:
 1. return a regular expression that is just like it, but left-anchored,
    preferring to return the original if possible.
 2. determine a string literal prefix that all matches of this regular
    expression have, much like regexp.Regexp.Prefix(). Unfortunately, Prefix()
    does not work in the presence of anchors, so we need to write it ourselves.

What this actually means is that we need to sketch on the internals of the
standard regexp library to forcefully extract the information we want.

Unfortunately, regexp.Regexp hides a lot of its state, so our abstraction is
going to be pretty leaky. The biggest leak is that we blindly assume that all
regular expressions are perl-style, not POSIX. This is probably Mostly True, and
I think most users of the library probably won't be able to notice.

Returns the (possibly re-compiled, left-anchored) regexp and the literal prefix.
If the expression cannot be re-parsed, compiled or anchored, a warning is logged
and the original regexp is returned with an empty prefix.
*/
func sketchOnRegex(re *regexp.Regexp) (*regexp.Regexp, string) {
	rawRe := re.String()
	sRe, err := syntax.Parse(rawRe, syntax.Perl)
	if err != nil {
		log.Printf("WARN(web): unable to parse regexp %v as perl. "+
			"This route might behave unexpectedly.", re)
		return re, ""
	}
	sRe = sRe.Simplify()
	p, err := syntax.Compile(sRe)
	if err != nil {
		log.Printf("WARN(web): unable to compile regexp %v. This "+
			"route might behave unexpectedly.", re)
		return re, ""
	}
	if p.StartCond()&syntax.EmptyBeginText == 0 {
		// The original expression is wrapped in a non-capturing group so
		// that the anchor binds to the whole expression. A bare `\A` prefix
		// would only anchor the first branch of a top-level alternation
		// (`\Afoo|bar` still matches "bar" anywhere). The group does not
		// capture, so submatch numbering is unchanged.
		newRe, err := regexp.Compile(`\A(?:` + rawRe + `)`)
		if err != nil {
			log.Printf("WARN(web): unable to create a left-"+
				"anchored regexp from %v. This route might "+
				"behave unexpectedly", re)
			return re, ""
		}
		re = newRe
	}

	// Run the regular expression more or less by hand :(
	pc := uint32(p.Start)
	atStart := true
	i := &p.Inst[pc]
	var buf bytes.Buffer
Sadness:
	for {
		switch i.Op {
		case syntax.InstEmptyWidth:
			// Empty-width assertions are only skipped while no rune has
			// been consumed yet.
			if !atStart {
				break Sadness
			}
		case syntax.InstCapture, syntax.InstNop:
			// nop!
		case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny,
			syntax.InstRuneAnyNotNL:

			atStart = false
			// Only an instruction carrying exactly one rune, without case
			// folding, contributes to the literal prefix.
			if len(i.Rune) != 1 ||
				syntax.Flags(i.Arg)&syntax.FoldCase != 0 {
				break Sadness
			}
			buf.WriteRune(i.Rune[0])
		default:
			// Alternation, match, fail: the prefix ends here.
			break Sadness
		}
		pc = i.Out
		i = &p.Inst[pc]
	}
	return re, buf.String()
}
// Check that one-pass cutoff does trigger. func TestOnePassCutoff(t *testing.T) { re, err := syntax.Parse(`^x{1,1000}y{1,1000}$`, syntax.Perl) if err != nil { t.Fatalf("parse: %v", err) } p, err := syntax.Compile(re.Simplify()) if err != nil { t.Fatalf("compile: %v", err) } if compileOnePass(p) != notOnePass { t.Fatalf("makeOnePass succeeded; wanted notOnePass") } }
func testRegex() { for k, v := range lexer_rules { fmt.Println("\nparsing ", k, " = ", v) re, err := syntax.Parse(lexer_rules["word"], syntax.Simple|syntax.UnicodeGroups|syntax.PerlX) if !tlog.Ok(err) { return } // fmt.Println(re) prog, err := syntax.Compile(re) if !tlog.Ok(err) { return } fmt.Println(prog) } }
// Compile parses a regular expression and returns, if successful, // a Regexp object that can be used to match against lines of text. func Compile(expr string) (*Regexp, error) { re, err := syntax.Parse(expr, syntax.Perl) if err != nil { return nil, err } sre := re.Simplify() prog, err := syntax.Compile(sre) if err != nil { return nil, err } if err := toByteProg(prog); err != nil { return nil, err } r := &Regexp{ Syntax: re, expr: expr, } if err := r.m.init(prog); err != nil { return nil, err } return r, nil }
func TestCompileOnePass(t *testing.T) { var ( p *syntax.Prog re *syntax.Regexp err error ) for _, test := range onePassTests { if re, err = syntax.Parse(test.re, syntax.Perl); err != nil { t.Errorf("Parse(%q) got err:%s, want success", test.re, err) continue } // needs to be done before compile... re = re.Simplify() if p, err = syntax.Compile(re); err != nil { t.Errorf("Compile(%q) got err:%s, want success", test.re, err) continue } onePass = compileOnePass(p) if (onePass == notOnePass) != (test.onePass == notOnePass) { t.Errorf("CompileOnePass(%q) got %v, expected %v", test.re, onePass, test.onePass) } } }
func compile(exprs ...string) (*matcher, error) { var progs []*syntax.Prog for _, expr := range exprs { re, err := syntax.Parse(expr, syntax.Perl) if err != nil { return nil, err } sre := re.Simplify() prog, err := syntax.Compile(sre) if err != nil { return nil, err } if err := toByteProg(prog); err != nil { return nil, err } progs = append(progs, prog) } m := &matcher{} if err := m.init(joinProgs(progs), len(progs)); err != nil { return nil, err } return m, nil }
func main() { var args []string argv := os.Args argc := len(argv) for n := 1; n < argc; n++ { if len(argv[n]) > 1 && argv[n][0] == '-' && argv[n][1] != '-' { switch argv[n][1] { case '8': utf8 = true case 'F': fixed = true case 'R': recursive = true case 'S': verbose = true case 'i': ignorecase = true case 'l': list = true case 'n': number = true case 'P': perl = true case 'G': basic = true case 'v': invert = true case 'o': only = true case 'f': if n < argc-1 { infile = argv[n+1] n++ continue } case 'V': fmt.Fprintf(os.Stdout, "%s\n", version) os.Exit(0) default: usage() } if len(argv[n]) > 2 { argv[n] = "-" + argv[n][2:] n-- } } else if len(argv[n]) > 1 && argv[n][0] == '-' && argv[n][1] == '-' { if n == argc-1 { usage() } switch argv[n] { case "--enc": encs = argv[n+1] case "--exclude": exclude = argv[n+1] default: usage() } n++ } else { args = append(args, argv[n]) } } if len(args) == 0 { usage() } var err error var errs *string var pattern interface{} if encs != "" { encodings = strings.Split(encs, ",") } else { enc_env := os.Getenv("JVGREP_ENCODINGS") if enc_env != "" { encodings = strings.Split(enc_env, ",") } } if runtime.GOOS == "windows" { // set dll name that is first to try to load by go-iconv. 
os.Setenv("ICONV_DLL", "jvgrep-iconv.dll") } var oc *iconv.Iconv if !utf8 { oc, err = iconv.Open("char", "utf-8") if err != nil { oc, err = iconv.Open("utf-8", "utf-8") } } defer func() { if oc != nil { oc.Close() } }() instr := "" argindex := 0 if len(infile) > 0 { b, err := ioutil.ReadFile(infile) if err != nil { errorline(err.Error()) os.Exit(-1) } instr = strings.TrimSpace(string(b)) } else { instr = args[0] argindex = 1 } if fixed { pattern = instr } else if perl { re, err := syntax.Parse(instr, syntax.Perl) if err != nil { errorline(err.Error()) os.Exit(-1) } rec, err := syntax.Compile(re) if err != nil { errorline(err.Error()) os.Exit(-1) } instr = rec.String() if ignorecase { instr = "(?i:" + instr + ")" } pattern, err = regexp.Compile(instr) if err != nil { errorline(err.Error()) os.Exit(-1) } } else { if ignorecase { instr = "(?i:" + instr + ")" } pattern, err = regexp.Compile(instr) if err != nil { errorline(err.Error()) os.Exit(-1) } } var ere *regexp.Regexp if exclude != "" { ere, err = regexp.Compile(exclude) if errs != nil { errorline(err.Error()) os.Exit(-1) } } if len(args) == 1 && argindex != 0 { Grep(&GrepArg{pattern, os.Stdin, oc, true}) return } envre := regexp.MustCompile(`^(\$[a-zA-Z][a-zA-Z0-9_]+|\$\([a-zA-Z][a-zA-Z0-9_]+\))$`) globmask := "" ch := make(chan *GrepArg) done := make(chan int) go GoGrep(ch, done) nargs := len(args[argindex:]) for ai, arg := range args[argindex:] { globmask = "" root := "" arg = strings.Trim(arg, `"`) for n, i := range strings.Split(filepath.ToSlash(arg), "/") { if root == "" && strings.Index(i, "*") != -1 { if globmask == "" { root = "." 
} else { root = filepath.ToSlash(globmask) } } if n == 0 && i == "~" { if runtime.GOOS == "windows" { i = os.Getenv("USERPROFILE") } else { i = os.Getenv("HOME") } } if envre.MatchString(i) { i = strings.Trim(strings.Trim(os.Getenv(i[1:]), "()"), `"`) } globmask = filepath.Join(globmask, i) if n == 0 { if runtime.GOOS == "windows" && filepath.VolumeName(i) != "" { globmask = i + "/" } else if len(globmask) == 0 { globmask = "/" } } } if root == "" { path, _ := filepath.Abs(arg) if !recursive { if verbose { println("search:", path) } println(ai, nargs-1) ch <- &GrepArg{pattern, path, oc, ai == nargs-1} continue } else { root = path globmask = "**/" + globmask } } if globmask == "" { globmask = "." } globmask = filepath.ToSlash(filepath.Clean(globmask)) if recursive { if strings.Index(globmask, "/") > -1 { globmask += "/" } else { globmask = "**/" + globmask } } if runtime.GOOS == "windows" { // keep double backslask windows UNC. if len(arg) > 2 && (arg[0:2] == `\\` || arg[0:2] == `//`) { root = "/" + root globmask = "/" + globmask } } cc := []rune(globmask) dirmask := "" filemask := "" for i := 0; i < len(cc); i++ { if cc[i] == '*' { if i < len(cc)-2 && cc[i+1] == '*' && cc[i+2] == '/' { filemask += "(.*/)?" 
dirmask = filemask i += 2 } else { filemask += "[^/]*" } } else { c := cc[i] if c == '/' || ('0' <= c && c <= '9') || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || 255 < c { filemask += string(c) } else { filemask += fmt.Sprintf("[\\x%x]", c) } if c == '/' && dirmask == "" && strings.Index(filemask, "*") != -1 { dirmask = filemask } } } if dirmask == "" { dirmask = filemask } if len(filemask) > 0 && filemask[len(filemask)-1] == '/' { if root == "" { root = filemask } filemask += "[^/]*" } if runtime.GOOS == "windows" || runtime.GOOS == "darwin" { dirmask = "(?i:" + dirmask + ")" filemask = "(?i:" + filemask + ")" } dre := regexp.MustCompile("^" + dirmask) fre := regexp.MustCompile("^" + filemask + "$") root = filepath.Clean(root) if verbose { println("dirmask:", dirmask) println("filemask:", filemask) println("root:", root) } filepath.Walk(root, func(path string, info os.FileInfo, err error) error { if info == nil { return err } path = filepath.ToSlash(path) if ere != nil && ere.MatchString(path) { if info.IsDir() { return filepath.SkipDir } return nil } if info.IsDir() { if path == "." || recursive || len(path) <= len(root) || dre.MatchString(path+"/") { return nil } return filepath.SkipDir } if fre.MatchString(path) { if verbose { println("search:", path) } //ch <- &GrepArg{pattern, path, oc, ai == nargs-1} ch <- &GrepArg{pattern, path, oc, false} } return nil }) } ch <- nil <-done }
func main() { var args []string argv := os.Args argc := len(argv) for n := 1; n < argc; n++ { if len(argv[n]) > 1 && argv[n][0] == '-' && argv[n][1] != '-' { switch argv[n][1] { case 'A': if n < argc-1 { after, _ = strconv.Atoi(argv[n+1]) n++ continue } case 'B': if n < argc-1 { before, _ = strconv.Atoi(argv[n+1]) n++ continue } case '8': utf8out = true case 'F': fixed = true case 'R': recursive = true case 'S': verbose = true case 'c': count = true case 'r': fullpath = false case 'i': ignorecase = true case 'I': ignorebinary = true case 'l': list = true case 'n': number = true case 'P': perl = true case 'G': basic = true case 'v': invert = true case 'o': only = true case 'f': if n < argc-1 { infile = argv[n+1] n++ continue } case 'z': zeroData = true case 'Z': zeroFile = true case 'V': fmt.Fprintf(os.Stdout, "%s\n", version) os.Exit(0) default: usage(true) } if len(argv[n]) > 2 { argv[n] = "-" + argv[n][2:] n-- } } else if len(argv[n]) > 1 && argv[n][0] == '-' && argv[n][1] == '-' { name := argv[n][2:] switch { case strings.HasPrefix(name, "enc="): encs = name[4:] case name == "enc" && n < argc-1: encs = argv[n+1] n++ case strings.HasPrefix(name, "exclude="): exclude = name[8:] case name == "exclude" && n < argc-1: exclude = argv[n+1] n++ case strings.HasPrefix(name, "color="): color = name[6:] case name == "color" && n < argc-1: color = argv[n+1] n++ case strings.HasPrefix(name, "separator="): separator = name[10:] case name == "separator": separator = argv[n+1] n++ case name == "null": zeroFile = true case name == "null-data": zeroData = true case name == "help": usage(false) default: usage(true) } } else { args = append(args, argv[n]) } } if len(args) == 0 { usage(true) } var err error var pattern interface{} if encs != "" { encodings = strings.Split(encs, ",") } else { enc_env := os.Getenv("JVGREP_ENCODINGS") if enc_env != "" { encodings = strings.Split(enc_env, ",") } } out_enc := os.Getenv("JVGREP_OUTPUT_ENCODING") if out_enc != "" { ee, _ := 
charset.Lookup(out_enc) if ee == nil { errorline(fmt.Sprintf("unknown encoding: %s", out_enc)) os.Exit(1) } oc = transform.NewWriter(os.Stdout, ee.NewEncoder()) } instr := "" argindex := 0 if len(infile) > 0 { b, err := ioutil.ReadFile(infile) if err != nil { errorline(err.Error()) os.Exit(1) } instr = strings.TrimSpace(string(b)) } else { instr = args[0] argindex = 1 } if fixed { pattern = instr } else if perl { re, err := syntax.Parse(instr, syntax.Perl) if err != nil { errorline(err.Error()) os.Exit(1) } rec, err := syntax.Compile(re) if err != nil { errorline(err.Error()) os.Exit(1) } instr = rec.String() if ignorecase { instr = "(?i:" + instr + ")" } pattern, err = regexp.Compile(instr) if err != nil { errorline(err.Error()) os.Exit(1) } } else { if ignorecase { instr = "(?i:" + instr + ")" } pattern, err = regexp.Compile(instr) if err != nil { errorline(err.Error()) os.Exit(1) } } if exclude == "" { exclude = os.Getenv("JVGREP_EXCLUDE") } if exclude == "" { exclude = excludeDefaults } ere, err := regexp.Compile(exclude) if err != nil { errorline(err.Error()) os.Exit(1) } atty := false if color == "" { color = os.Getenv("JVGREP_COLOR") } if color == "" || color == "auto" { atty = isatty.IsTerminal(os.Stdout.Fd()) } else if color == "always" { atty = true } else if color == "never" { atty = false } else { usage(true) } if atty { sc := make(chan os.Signal, 10) signal.Notify(sc, syscall.SIGTERM, syscall.SIGINT, syscall.SIGHUP) go func() { for _ = range sc { ct.ResetColor() os.Exit(0) } }() } if len(args) == 1 && argindex != 0 { Grep(&GrepArg{ pattern: pattern, input: os.Stdin, single: true, atty: atty, }) return } envre := regexp.MustCompile(`^(\$[a-zA-Z][a-zA-Z0-9_]+|\$\([a-zA-Z][a-zA-Z0-9_]+\))$`) globmask := "" ch := make(chan *GrepArg, 10) done := make(chan int) go GoGrep(ch, done) nargs := len(args[argindex:]) for _, arg := range args[argindex:] { globmask = "" root := "" arg = strings.Trim(arg, `"`) for n, i := range strings.Split(filepath.ToSlash(arg), 
"/") { if root == "" && strings.Index(i, "*") != -1 { if globmask == "" { root = "." } else { root = filepath.ToSlash(globmask) } } if n == 0 && i == "~" { if runtime.GOOS == "windows" { i = os.Getenv("USERPROFILE") } else { i = os.Getenv("HOME") } } if envre.MatchString(i) { i = strings.Trim(strings.Trim(os.Getenv(i[1:]), "()"), `"`) } globmask = filepath.Join(globmask, i) if n == 0 { if runtime.GOOS == "windows" && filepath.VolumeName(i) != "" { globmask = i + "/" } else if len(globmask) == 0 { globmask = "/" } } } if root == "" { path, _ := filepath.Abs(arg) fi, err := os.Stat(path) if err != nil { errorline(fmt.Sprintf("jvgrep: %s: No such file or directory", arg)) os.Exit(1) } if !fi.IsDir() { if verbose { println("search:", path) } ch <- &GrepArg{ pattern: pattern, input: path, single: nargs == 1, atty: atty, } continue } else { root = path if fi.IsDir() { globmask = "**/*" } else { globmask = "**/" + globmask } } } if globmask == "" { globmask = "." } globmask = filepath.ToSlash(filepath.Clean(globmask)) if recursive { if strings.Index(globmask, "/") > -1 { globmask += "/" } else { globmask = "**/" + globmask } } cc := []rune(globmask) dirmask := "" filemask := "" for i := 0; i < len(cc); i++ { if cc[i] == '*' { if i < len(cc)-2 && cc[i+1] == '*' && cc[i+2] == '/' { filemask += "(.*/)?" 
dirmask = filemask i += 2 } else { filemask += "[^/]*" } } else { c := cc[i] if c == '/' || ('0' <= c && c <= '9') || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || 255 < c { filemask += string(c) } else { filemask += fmt.Sprintf("[\\x%x]", c) } if c == '/' && dirmask == "" && strings.Index(filemask, "*") != -1 { dirmask = filemask } } } if dirmask == "" { dirmask = filemask } if len(filemask) > 0 && filemask[len(filemask)-1] == '/' { if root == "" { root = filemask } filemask += "[^/]*" } if runtime.GOOS == "windows" || runtime.GOOS == "darwin" { dirmask = "(?i:" + dirmask + ")" filemask = "(?i:" + filemask + ")" } dre := regexp.MustCompile("^" + dirmask) fre := regexp.MustCompile("^" + filemask + "$") root = filepath.Clean(root) if verbose { println("dirmask:", dirmask) println("filemask:", filemask) println("root:", root) } filepath.Walk(root, func(path string, info os.FileInfo, err error) error { if info == nil { return err } path = filepath.ToSlash(path) if ere != nil && ere.MatchString(path) { if info.IsDir() { return filepath.SkipDir } return nil } if info.IsDir() { if path == "." || recursive || len(path) <= len(root) || dre.MatchString(path+"/") { return nil } return filepath.SkipDir } if fre.MatchString(path) && info.Mode().IsRegular() { if verbose { println("search:", path) } ch <- &GrepArg{ pattern: pattern, input: path, single: false, atty: atty, } } return nil }) } ch <- nil if count { fmt.Println(countMatch) } <-done }
/*
I'm sorry, dear reader. I really am.

The problem here is to take an arbitrary regular expression and:
 1. return a regular expression that is just like it, but left-anchored,
    preferring to return the original if possible.
 2. determine a string literal prefix that all matches of this regular
    expression have, much like regexp.Regexp.Prefix(). Unfortunately, Prefix()
    does not work in the presence of anchors, so we need to write it ourselves.

What this actually means is that we need to sketch on the internals of the
standard regexp library to forcefully extract the information we want.

Unfortunately, regexp.Regexp hides a lot of its state, so our abstraction is
going to be pretty leaky. The biggest leak is that we blindly assume that all
regular expressions are perl-style, not POSIX. This is probably Mostly True, and
I think most users of the library probably won't be able to notice.

Returns the (possibly re-compiled, left-anchored) regexp and the literal prefix.
If the expression cannot be re-parsed, compiled or anchored, a warning is logged
and the original regexp is returned with an empty prefix.
*/
func sketchOnRegex(re *regexp.Regexp) (*regexp.Regexp, string) {
	// Re-parse the regex from the string representation.
	rawRe := re.String()
	sRe, err := syntax.Parse(rawRe, syntax.Perl)
	if err != nil {
		// TODO: better way to warn?
		log.Printf("WARN(router): unable to parse regexp %v as perl. "+
			"This route might behave unexpectedly.", re)
		return re, ""
	}

	// Simplify and then compile the regex.
	sRe = sRe.Simplify()
	p, err := syntax.Compile(sRe)
	if err != nil {
		// TODO: better way to warn?
		log.Printf("WARN(router): unable to compile regexp %v. This "+
			"route might behave unexpectedly.", re)
		return re, ""
	}

	// If it's not left-anchored, we add that now.
	if p.StartCond()&syntax.EmptyBeginText == 0 {
		// The original expression is wrapped in a non-capturing group so
		// that the anchor binds to the whole expression. A bare `\A` prefix
		// would only anchor the first branch of a top-level alternation
		// (`\Afoo|bar` still matches "bar" anywhere). The group does not
		// capture, so submatch numbering is unchanged.
		newRe, err := regexp.Compile(`\A(?:` + rawRe + `)`)
		if err != nil {
			// TODO: better way to warn?
			log.Printf("WARN(router): unable to create a left-"+
				"anchored regexp from %v. This route might "+
				"behave unexpectedly", re)
			return re, ""
		}
		re = newRe
	}

	// We run the regular expression more or less by hand in order to calculate
	// the prefix.
	pc := uint32(p.Start)
	atStart := true
	i := &p.Inst[pc]
	var buf bytes.Buffer
OuterLoop:
	for {
		switch i.Op {

		// There may be an 'empty' operation at the beginning of every regex,
		// due to OpBeginText.
		case syntax.InstEmptyWidth:
			if !atStart {
				break OuterLoop
			}

		// Captures and no-ops don't affect the prefix.
		case syntax.InstCapture, syntax.InstNop:
			// nop!

		// We handle runes.
		case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny,
			syntax.InstRuneAnyNotNL:

			atStart = false

			// If we don't have exactly one rune, or if the 'fold case' flag is
			// set, then we don't count this as part of the prefix. Due to
			// unicode case-craziness, it's too hard to deal with case
			// insensitivity...
			if len(i.Rune) != 1 ||
				syntax.Flags(i.Arg)&syntax.FoldCase != 0 {
				break OuterLoop
			}

			// Add to the prefix, continue.
			buf.WriteRune(i.Rune[0])

		// Any other instruction may affect the prefix, so we stop here.
		default:
			break OuterLoop
		}

		// Continue to the next instruction.
		pc = i.Out
		i = &p.Inst[pc]
	}
	return re, buf.String()
}