// PerformTokenization splits text on every match of splitToken and returns
// how often each resulting token occurs. Tokens are lower-cased before they
// are counted, and tokens that are two bytes long or shorter are ignored.
// The returned map is never nil.
func PerformTokenization(text string, splitToken *regexp.Regexp) (words map[string]int64) {
	counts := map[string]int64{}
	pieces := splitToken.Split(text, -1)
	for i := range pieces {
		piece := pieces[i]
		if len(piece) <= 2 {
			// len counts bytes, not runes; short tokens are skipped.
			continue
		}
		key := strings.ToLower(piece)
		counts[key] = counts[key] + 1
	}
	return counts
}
func Cut(sentence string, cut_all bool, HMM bool) []string { result := make([]string, 0) var re_han, re_skip *regexp.Regexp if cut_all { re_han = regexp.MustCompile(`\p{Han}+`) re_skip = regexp.MustCompile(`[^[:alnum:]+#\n]`) } else { re_han = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`) re_skip = regexp.MustCompile(`(\r\n|\s)`) } blocks := RegexpSplit(re_han, sentence) var cut_block cutAction if HMM { cut_block = cut_DAG } else { cut_block = cut_DAG_NO_HMM } if cut_all { cut_block = cut_All } for _, blk := range blocks { if len(blk) == 0 { continue } if re_han.MatchString(blk) { for _, word := range cut_block(blk) { result = append(result, word) } } else { type skipSplitFunc func(sentence string) []string var ssf skipSplitFunc if cut_all { ssf = func(sentence string) []string { return re_skip.Split(sentence, -1) } } else { ssf = func(sentence string) []string { return RegexpSplit(re_skip, sentence) } } for _, x := range ssf(blk) { if re_skip.MatchString(x) { result = append(result, x) } else if !cut_all { for _, xx := range x { result = append(result, string(xx)) } } else { result = append(result, x) } } } } return result }
// SplitTokenizer receives lines from the lines channel until it is closed,
// splits each line on matches of split_re, and sends every non-empty piece
// to the tokens channel. Once all lines have been consumed it closes tokens.
func SplitTokenizer(split_re *regexp.Regexp, lines <-chan string, tokens chan<- string) {
	for {
		line, open := <-lines
		if !open {
			break
		}
		for _, piece := range split_re.Split(line, -1) {
			if piece != "" {
				tokens <- piece
			}
		}
	}
	close(tokens)
}
// SeparateString is an abstraction of stringToSlice that takes two kinds of // separators, and splits a string into a 2D slice based on those separators func SeparateString(rowSep *regexp.Regexp, colSep *regexp.Regexp, str string) (output Table) { lines := rowSep.Split(str, -1) for _, line := range lines { rawRow := colSep.Split(line, -1) row := []string{} for _, cell := range rawRow { row = append(row, strings.TrimSpace(cell)) } if len(row) > 0 && HasNonEmpty(row) { output = append(output, row) } } return output }
// getColumnRegex is the core of the logic. It determines which regex most
// accurately splits the data into columns by testing the deviation in the
// row lengths using different regexps.
//
// str is the raw table text and rowSep is the regexp used to split it into
// rows. A fixed list of candidate column separators is tried in order (tabs,
// exactly four whitespace characters, two or more, one or more). The first
// candidate whose row-length variance is at most .1 is returned; if none
// qualifies, the candidate at the index chosen by extremaIndex(minFunc, ...)
// over the per-candidate variances is returned.
func getColumnRegex(str string, rowSep *regexp.Regexp) *regexp.Regexp {
	// matchesMost is used to ensure that our regexp actually is splitting the
	// lines of a table, instead of just returning them whole. It reports
	// whether at least len(rows)/2 (integer division) of the rows contain a
	// match.
	// NOTE(review): for a single row the threshold is 0, so every regexp
	// passes even without any match — confirm whether that is intended.
	matchesMost := func(re *regexp.Regexp, rows []string) bool {
		count := 0
		for _, row := range rows {
			if re.MatchString(row) {
				count++
			}
		}
		return count >= (len(rows) / 2)
	}

	// getRowLengths returns the number of cells in each row of the table.
	getRowLengths := func(table Table) (lengths []int) {
		for _, row := range table {
			lengths = append(lengths, len(row))
		}
		return lengths
	}

	// getVariance returns the variance of the split provided by a regexp,
	// after discarding a number of outliers. chauvenet is applied `outliers`
	// times to the row lengths before variance is computed.
	// (presumably each chauvenet call removes one outlier — TODO confirm)
	getVariance := func(colSep *regexp.Regexp, outliers int) float64 {
		table := SeparateString(rowSep, colSep, str)
		rowLengths := getRowLengths(table)
		for i := 0; i < outliers; i++ {
			rowLengths = chauvenet(rowLengths)
		}
		return variance(rowLengths)
	}

	// testRegexp determines whether or not a given regexp gives (nearly) even
	// line lengths, including discarding of a number of outliers. It tries
	// 0, 1, ..., outliers-1 chauvenet passes and succeeds as soon as one of
	// them yields a variance of at most .1.
	// NOTE(review): the loop bound is exclusive, so with outliers == 0 nothing
	// is tested and the result is always false — possibly an off-by-one.
	testRegexp := func(colSep *regexp.Regexp, outliers int) bool {
		for i := 0; i < outliers; i++ {
			// This local shadows the variance function inside the loop body.
			variance := getVariance(colSep, i)
			if variance <= .1 {
				return true
			}
		}
		return false
	}

	// different column separators to try out
	initialColSeps := []*regexp.Regexp{
		regexp.MustCompile(`\t+`),    // tabs
		regexp.MustCompile(`\s{4}`),  // exactly four whitespaces
		regexp.MustCompile(`\s{2,}`), // two+ whitespace (spaces in cols)
		regexp.MustCompile(`\s+`),    // any whitespace
	}

	// filter regexps that have no matches at all - they will always return
	// rows of even length (length 1).
	colSeps := []*regexp.Regexp{}
	rows := rowSep.Split(str, -1)
	for _, re := range initialColSeps {
		if matchesMost(re, rows) {
			colSeps = append(colSeps, re)
		}
	}
	// If every candidate was filtered out, warn and fall back to trying all
	// of them.
	if len(colSeps) < 1 {
		log.WithFields(log.Fields{
			"attempted": initialColSeps,
			"table":     str,
		}).Warn("ProbabalisticSplit couldn't find a column separator.")
		colSeps = initialColSeps
	}

	// Test each regexp for row length consistency, allowing more outliers to
	// be discarded on each pass. Candidates are tried in list order within a
	// pass, so earlier (stricter) separators win ties.
	passes := 3
	for i := 0; i < passes; i++ {
		for _, re := range colSeps {
			if testRegexp(re, i) {
				return re
			}
		}
	}

	// if still not done, just pick the one with the lowest variance
	log.WithFields(log.Fields{
		"attempted": initialColSeps,
		"outliers":  passes,
	}).Debug("ProbabalisticSplit couldn't find a consistent regexp")
	var variances []float64
	for _, colSep := range colSeps {
		variances = append(variances, getVariance(colSep, passes))
	}

	// ensure that index can be found in tables
	// (Fatal is presumably logrus-style and terminates the process — TODO
	// confirm; otherwise the index expression below would panic.)
	minVarianceIndex := extremaIndex(minFunc, variances)
	if len(colSeps) <= minVarianceIndex {
		msg := "Internal error: minVarianceIndex couldn't be found in colSeps"
		log.WithFields(log.Fields{
			"index":   minVarianceIndex,
			"colSeps": colSeps,
		}).Fatal(msg)
	}
	return colSeps[minVarianceIndex]
}
func main() { version := flag.Bool("version", false, "print version and exit") quote := flag.Bool("quote", false, "quote returned fields") unquote := flag.Bool("unquote", false, "quote returned fields") ifs := flag.String("ifs", " ", "input field separator") ire := flag.String("ifs-re", "", "input field separator (as regular expression)") ofs := flag.String("ofs", " ", "output field separator") re := flag.String("re", "", "regular expression for parsing input") grep := flag.String("grep", "", "output only lines that match the regular expression") format := flag.String("printf", "", "output is formatted according to specified format") matches := flag.String("matches", "", "return status code 100 if any line matches the specified pattern, 101 otherwise") after := flag.String("after", "", "process fields in line after specified tag") afterline := flag.String("after-line", "", "process lines after lines that matches") afterlinen := flag.Int("after-linen", 0, "process lines after n lines") printline := flag.Bool("line", false, "print line numbers") debug := flag.Bool("debug", false, "print debug info") flag.Parse() if *version { extra := "" if gitCommit != "" { extra = fmt.Sprintf(" (%.4v %v)", gitCommit, buildDate) } fmt.Printf("%s version %s%v\n", path.Base(os.Args[0]), VERSION, extra) return } pos := make([]Pos, len(flag.Args())) for i, arg := range flag.Args() { pos[i].Set(arg) } if len(*format) > 0 && !strings.HasSuffix(*format, "\n") { *format += "\n" } var split_re *regexp.Regexp var split_pattern *regexp.Regexp var match_pattern *regexp.Regexp var grep_pattern *regexp.Regexp status_code := OK if len(*matches) > 0 { match_pattern = regexp.MustCompile(*matches) status_code = MATCH_NOT_FOUND } if len(*grep) > 0 { grep_pattern = regexp.MustCompile(*grep) } if len(*re) > 0 { split_pattern = regexp.MustCompile(*re) } if len(*ire) > 0 { split_re = regexp.MustCompile(*ire) } scanner := bufio.NewScanner(os.Stdin) len_after := len(*after) len_afterline := len(*afterline) 
lineno := 0 for scanner.Scan() { if scanner.Err() != nil { log.Fatal(scanner.Err()) } line := scanner.Text() lineno += 1 if *afterlinen >= lineno { continue } if len_afterline > 0 { if strings.Contains(line, *afterline) { len_afterline = 0 } continue } if len_after > 0 { i := strings.Index(line, *after) if i < 0 { continue // no match } line = line[i+len_after:] } fields := []string{line} // $0 is the full line if grep_pattern != nil { if matches := grep_pattern.FindStringSubmatch(line); matches != nil { fields = matches } else { continue } } else if split_pattern != nil { if matches := split_pattern.FindStringSubmatch(line); matches != nil { fields = matches } } else if split_re != nil { // split line according to input regular expression fields = append(fields, split_re.Split(line, -1)...) } else if *ifs == " " { // split line on spaces (compact multiple spaces) fields = append(fields, SPACES.Split(strings.TrimSpace(line), -1)...) } else { // split line according to input field separator fields = append(fields, strings.Split(line, *ifs)...) } if *debug { log.Printf("input fields: %q\n", fields) if len(pos) > 0 { log.Printf("output fields: %q\n", pos) } } var result []string // do some processing if len(pos) > 0 { result = make([]string, 0) for _, p := range pos { result = append(result, Slice(fields, p)...) } } else { result = fields[1:] } if *unquote { result = Unquote(result) } if *quote { result = Quote(result) } if *printline { fmt.Printf("%d: ", lineno) } if len(*format) > 0 { Print(*format, result) } else { // join the result according to output field separator fmt.Println(strings.Join(result, *ofs)) } if match_pattern != nil && match_pattern.MatchString(line) { status_code = MATCH_FOUND } } os.Exit(status_code) }
// Parents is the public entry point of the package. It builds a list of
// directory paths derived from a starting directory and returns it as a
// slice of strings (presumably the directory itself followed by each of its
// ancestors — TODO confirm against array.Array's Slice/Reverse semantics).
//
// All arguments are optional:
//   - no arguments: the current working directory is used and the platform
//     is taken from runtime.GOOS;
//   - one argument: args[0] is the starting directory;
//   - two or more:  args[0] is the starting directory and Windows-style
//     handling is enabled when args[1] starts with "win".
func Parents(args ...string) []string {
	var (
		isWindows = runtime.GOOS == "windows"
		reg       *regexp.Regexp
		init      array.Array
		cwd       string
		sep       string
		c         string
	)

	if n := len(args); n == 0 {
		// NOTE(review): the error from os.Getwd is discarded; on failure cwd
		// stays empty.
		cwd, _ = os.Getwd()
	} else if n == 1 {
		cwd = args[0]
	} else {
		cwd = args[0]
		isWindows = strings.HasPrefix(args[1], "win")
	}

	// Choose the separator pattern and the seed of the reduction: Windows
	// paths split on either slash and start from "", others split on "/" and
	// start from the root.
	if isWindows {
		c = `[\\\/]`
		init = array.Array{""}
	} else {
		c = `/`
		init = array.Array{"/"}
	}

	reg = regexp.MustCompile(c)

	// join combines two path elements into a one-element array holding the
	// cleaned, separator-joined path. Elements that are not strings or are
	// empty strings are filtered out before joining.
	var join = func(x, y interface{}) array.Array {
		tmpArray := array.Array{x, y}
		var ps = tmpArray.Filter(func(p interface{}, args ...interface{}) bool {
			switch p.(type) {
			case string:
				if p.(string) != "" {
					return true
				}
				return false
			default:
				return false
			}
			// Unreachable: every switch branch above returns.
			return false
		})
		if isWindows {
			sep = "\\"
		} else {
			sep = "/"
		}
		// NOTE(review): path.Clean operates on slash-separated paths, so
		// backslash-joined Windows paths may not be normalized — confirm.
		return array.Array{path.Clean(ps.Join(sep))}
	}

	var res = path.Clean(cwd)
	arr := array.Array{}
	for _, v := range reg.Split(res, -1) {
		arr.Push(v)
	}

	// Fold over the path components: each step appends the join of the
	// accumulator entry at the current index with the current component.
	// (assumes ix[0] is the element index supplied by Reduce — TODO confirm)
	arrReduce := arr.Reduce(func(acc, dir interface{}, ix ...interface{}) interface{} {
		tmpAcc := acc.(array.Array)
		index := ix[0].(int)
		tmpAcc = tmpAcc.Concat(join(tmpAcc[index], dir))
		return tmpAcc
	}, init)

	// Unlike JavaScript, the array methods cannot be chained here, so each
	// intermediate result is assigned to its own variable.
	arrSlice := arrReduce.(array.Array)
	// Presumably drops the seed element — verify Slice(1, 0) semantics.
	arrReverse := arrSlice.Slice(1, 0)
	// NOTE(review): any return value of Reverse is ignored; this relies on
	// Reverse working in place — confirm.
	arrReverse.Reverse()

	// When the first two entries are equal, only a single entry is returned.
	if len(arrReverse) >= 2 {
		if arrReverse[0] == arrReverse[1] {
			return []string{arrReverse[0].(string)}
		}
	}

	// Windows paths that start with a backslash: make sure every returned
	// entry starts with a backslash as well.
	if isWindows && strings.HasPrefix(cwd, "\\") {
		cut := arrReverse.Slice(0, -1)
		// NOTE(review): the result of Map is not captured; if Map returns a
		// new array instead of mutating cut, this rewrite has no effect.
		// d.(string)[0] also panics on an empty string entry.
		cut.Map(func(d interface{}, args ...interface{}) interface{} {
			var ch = d.(string)[0]
			if ch == '\\' {
				return d
			} else if ch == '.' {
				return "\\" + d.(string)[1:]
			} else {
				return "\\" + d.(string)
			}
		})
		return cut.ToString()
	}

	return arrReverse.ToString()
}