Esempio n. 1
0
// PerformTokenization splits text on every match of splitToken and counts how
// often each resulting piece occurs, ignoring case. Pieces whose length is two
// bytes or fewer are skipped. The returned map is keyed by the lower-cased
// piece and is never nil.
func PerformTokenization(text string, splitToken *regexp.Regexp) (words map[string]int64) {
	words = map[string]int64{}
	pieces := splitToken.Split(text, -1)
	for i := range pieces {
		piece := pieces[i]
		if len(piece) <= 2 {
			continue
		}
		key := strings.ToLower(piece)
		words[key] = words[key] + 1
	}
	return words
}
Esempio n. 2
0
func Cut(sentence string, cut_all bool, HMM bool) []string {
	result := make([]string, 0)
	var re_han, re_skip *regexp.Regexp
	if cut_all {
		re_han = regexp.MustCompile(`\p{Han}+`)
		re_skip = regexp.MustCompile(`[^[:alnum:]+#\n]`)
	} else {
		re_han = regexp.MustCompile(`([\p{Han}+[:alnum:]+#&\._]+)`)
		re_skip = regexp.MustCompile(`(\r\n|\s)`)
	}
	blocks := RegexpSplit(re_han, sentence)
	var cut_block cutAction
	if HMM {
		cut_block = cut_DAG
	} else {
		cut_block = cut_DAG_NO_HMM
	}
	if cut_all {
		cut_block = cut_All
	}
	for _, blk := range blocks {
		if len(blk) == 0 {
			continue
		}
		if re_han.MatchString(blk) {
			for _, word := range cut_block(blk) {
				result = append(result, word)
			}
		} else {
			type skipSplitFunc func(sentence string) []string
			var ssf skipSplitFunc
			if cut_all {
				ssf = func(sentence string) []string {
					return re_skip.Split(sentence, -1)
				}
			} else {
				ssf = func(sentence string) []string {
					return RegexpSplit(re_skip, sentence)
				}
			}

			for _, x := range ssf(blk) {
				if re_skip.MatchString(x) {
					result = append(result, x)
				} else if !cut_all {
					for _, xx := range x {
						result = append(result, string(xx))
					}
				} else {
					result = append(result, x)
				}
			}
		}
	}
	return result
}
Esempio n. 3
0
// SplitTokenizer receives lines from the lines channel until that channel is
// closed, splits each line on every match of split_re and sends the non-empty
// pieces, in order, to tokens. Once lines has been drained, tokens is closed.
func SplitTokenizer(split_re *regexp.Regexp, lines <-chan string, tokens chan<- string) {
	for {
		text, open := <-lines
		if !open {
			break
		}
		for _, piece := range split_re.Split(text, -1) {
			if piece != "" {
				tokens <- piece
			}
		}
	}
	close(tokens)
}
Esempio n. 4
0
// SeparateString turns str into a Table: rowSep splits the text into rows and
// colSep splits each row into cells. Every cell has its surrounding whitespace
// trimmed. A row is kept only if it has at least one cell and HasNonEmpty
// reports true for it.
func SeparateString(rowSep *regexp.Regexp, colSep *regexp.Regexp, str string) (output Table) {
	for _, rawLine := range rowSep.Split(str, -1) {
		cells := colSep.Split(rawLine, -1)
		trimmed := make([]string, len(cells))
		for i, c := range cells {
			trimmed[i] = strings.TrimSpace(c)
		}
		if len(trimmed) == 0 || !HasNonEmpty(trimmed) {
			continue
		}
		output = append(output, trimmed)
	}
	return output
}
// getColumnRegex is the core of the logic. It determines which regex most
// accurately splits the data into columns by testing the deviation in the
// row lengths using different regexps.
func getColumnRegex(str string, rowSep *regexp.Regexp) *regexp.Regexp {
	// matchesMost is used to ensure that our regexp actually is splitting the
	// lines of a table, instead of just returning them whole.
	matchesMost := func(re *regexp.Regexp, rows []string) bool {
		count := 0
		for _, row := range rows {
			if re.MatchString(row) {
				count++
			}
		}
		return count >= (len(rows) / 2)
	}
	// getRowLengths returns row length counts for each table
	getRowLengths := func(table Table) (lengths []int) {
		for _, row := range table {
			lengths = append(lengths, len(row))
		}
		return lengths
	}
	// getVariance returns the variance of the split provided by a regexp,
	// after discarding a number of outliers
	getVariance := func(colSep *regexp.Regexp, outliers int) float64 {
		table := SeparateString(rowSep, colSep, str)
		rowLengths := getRowLengths(table)
		for i := 0; i < outliers; i++ {
			rowLengths = chauvenet(rowLengths)
		}
		return variance(rowLengths)
	}
	// testRegexp determines whether or not a given regexp gives perfectly even
	// line lengths, including discarding of a number of outliers
	testRegexp := func(colSep *regexp.Regexp, outliers int) bool {
		for i := 0; i < outliers; i++ {
			variance := getVariance(colSep, i)
			if variance <= .1 {
				return true
			}
		}
		return false
	}
	// different column separators to try out
	initialColSeps := []*regexp.Regexp{
		regexp.MustCompile(`\t+`),    // tabs
		regexp.MustCompile(`\s{4}`),  // exactly four whitespaces
		regexp.MustCompile(`\s{2,}`), // two+ whitespace (spaces in cols)
		regexp.MustCompile(`\s+`),    // any whitespace
	}
	// filter regexps that have no matches at all - they will always return
	// rows of even length (length 1).
	colSeps := []*regexp.Regexp{}
	rows := rowSep.Split(str, -1)
	for _, re := range initialColSeps {
		if matchesMost(re, rows) {
			colSeps = append(colSeps, re)
		}
	}
	if len(colSeps) < 1 {
		log.WithFields(log.Fields{
			"attempted": initialColSeps,
			"table":     str,
		}).Warn("ProbabalisticSplit couldn't find a column separator.")
		colSeps = initialColSeps
	}
	// discarding up to passes outliers, test each regexp for row length
	// consistency
	passes := 3
	for i := 0; i < passes; i++ {
		for _, re := range colSeps {
			if testRegexp(re, i) {
				return re
			}
		}
	}
	// if still not done, just pick the one with the lowest variance
	log.WithFields(log.Fields{
		"attempted": initialColSeps,
		"outliers":  passes,
	}).Debug("ProbabalisticSplit couldn't find a consistent regexp")
	var variances []float64
	for _, colSep := range colSeps {
		variances = append(variances, getVariance(colSep, passes))
	}
	// ensure that index can be found in tables
	minVarianceIndex := extremaIndex(minFunc, variances)
	if len(colSeps) <= minVarianceIndex {
		msg := "Internal error: minVarianceIndex couldn't be found in colSeps"
		log.WithFields(log.Fields{
			"index":   minVarianceIndex,
			"colSeps": colSeps,
		}).Fatal(msg)
	}
	return colSeps[minVarianceIndex]
}
Esempio n. 6
0
File: glin.go Progetto: raff/glin
// main parses the command-line flags, then reads standard input line by line,
// splits each line into fields, selects the fields named by the positional
// arguments (or all of them when none are given) and prints the result.
//
// Field splitting uses the first of these that is configured: -grep (lines
// that do not match are dropped), -re, -ifs-re, or the literal -ifs separator
// (a single space, the default, compacts runs of whitespace). fields[0] always
// holds the full line.
//
// The process exits with OK, or, when -matches is given, with MATCH_FOUND if
// any printed line matched the pattern and MATCH_NOT_FOUND otherwise. An
// invalid regular expression or a read error on standard input is reported
// through log.Fatal.
func main() {
	version := flag.Bool("version", false, "print version and exit")
	quote := flag.Bool("quote", false, "quote returned fields")
	unquote := flag.Bool("unquote", false, "unquote returned fields")
	ifs := flag.String("ifs", " ", "input field separator")
	ire := flag.String("ifs-re", "", "input field separator (as regular expression)")
	ofs := flag.String("ofs", " ", "output field separator")
	re := flag.String("re", "", "regular expression for parsing input")
	grep := flag.String("grep", "", "output only lines that match the regular expression")
	format := flag.String("printf", "", "output is formatted according to specified format")
	matches := flag.String("matches", "", "return status code 100 if any line matches the specified pattern, 101 otherwise")
	after := flag.String("after", "", "process fields in line after specified tag")
	afterline := flag.String("after-line", "", "process lines after lines that matches")
	afterlinen := flag.Int("after-linen", 0, "process lines after n lines")
	printline := flag.Bool("line", false, "print line numbers")
	debug := flag.Bool("debug", false, "print debug info")

	flag.Parse()

	if *version {
		extra := ""
		if gitCommit != "" {
			extra = fmt.Sprintf(" (%.4v %v)", gitCommit, buildDate)
		}

		fmt.Printf("%s version %s%v\n", path.Base(os.Args[0]), VERSION, extra)
		return
	}

	// Each positional argument describes one output field selection.
	pos := make([]Pos, len(flag.Args()))
	for i, arg := range flag.Args() {
		pos[i].Set(arg)
	}

	// A printf format always ends the output line.
	if len(*format) > 0 && !strings.HasSuffix(*format, "\n") {
		*format += "\n"
	}

	// compile turns a user-supplied pattern into a regexp. An empty pattern
	// means "not configured" and yields nil; an invalid one ends the program
	// with a readable message instead of a panic.
	compile := func(flagName, pattern string) *regexp.Regexp {
		if pattern == "" {
			return nil
		}
		compiled, err := regexp.Compile(pattern)
		if err != nil {
			log.Fatalf("invalid -%s pattern %q: %v", flagName, pattern, err)
		}
		return compiled
	}

	matchPattern := compile("matches", *matches)
	grepPattern := compile("grep", *grep)
	splitPattern := compile("re", *re)
	splitRE := compile("ifs-re", *ire)

	statusCode := OK
	if matchPattern != nil {
		statusCode = MATCH_NOT_FOUND
	}

	scanner := bufio.NewScanner(os.Stdin)
	lenAfter := len(*after)
	lenAfterline := len(*afterline)
	lineno := 0

	for scanner.Scan() {
		line := scanner.Text()

		lineno++

		// Skip the first -after-linen lines.
		if *afterlinen >= lineno {
			continue
		}

		// Skip everything up to and including the first line containing the
		// -after-line text; clearing the length disables this check for the
		// rest of the input.
		if lenAfterline > 0 {
			if strings.Contains(line, *afterline) {
				lenAfterline = 0
			}

			continue
		}

		// Keep only the part of the line that follows the -after tag.
		if lenAfter > 0 {
			i := strings.Index(line, *after)
			if i < 0 {
				continue // no match
			}

			line = line[i+lenAfter:]
		}

		fields := []string{line} // $0 is the full line

		switch {
		case grepPattern != nil:
			m := grepPattern.FindStringSubmatch(line)
			if m == nil {
				continue
			}
			fields = m
		case splitPattern != nil:
			if m := splitPattern.FindStringSubmatch(line); m != nil {
				fields = m
			}
		case splitRE != nil:
			// split line according to input regular expression
			fields = append(fields, splitRE.Split(line, -1)...)
		case *ifs == " ":
			// split line on spaces (compact multiple spaces)
			fields = append(fields, SPACES.Split(strings.TrimSpace(line), -1)...)
		default:
			// split line according to input field separator
			fields = append(fields, strings.Split(line, *ifs)...)
		}

		if *debug {
			log.Printf("input fields: %q\n", fields)
			if len(pos) > 0 {
				log.Printf("output fields: %q\n", pos)
			}
		}

		var result []string

		// Select the requested fields, or everything but $0 by default.
		if len(pos) > 0 {
			result = make([]string, 0)

			for _, p := range pos {
				result = append(result, Slice(fields, p)...)
			}
		} else {
			result = fields[1:]
		}

		if *unquote {
			result = Unquote(result)
		}

		if *quote {
			result = Quote(result)
		}

		if *printline {
			fmt.Printf("%d: ", lineno)
		}

		if len(*format) > 0 {
			Print(*format, result)
		} else {
			// join the result according to output field separator
			fmt.Println(strings.Join(result, *ofs))
		}

		if matchPattern != nil && matchPattern.MatchString(line) {
			statusCode = MATCH_FOUND
		}
	}

	// Scan returns false on a read error, so the error is only observable
	// once the loop has ended.
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}

	os.Exit(statusCode)
}
Esempio n. 7
0
// Parents splits a directory path on its separators and builds a list of
// paths from its components, which it returns as a slice of strings.
//
// All arguments are optional and positional:
//   - with no arguments the path is the current working directory as reported
//     by os.Getwd (its error is ignored);
//   - args[0], when present, is the path to use instead;
//   - args[1], when present, enables Windows-style handling if it starts with
//     "win" and disables it otherwise. Without it, Windows-style handling
//     follows runtime.GOOS.
func Parents(args ...string) []string {
	var (
		isWindows = runtime.GOOS == "windows"
		reg       *regexp.Regexp
		init      array.Array
		cwd       string
		sep       string
		c         string
	)
	if n := len(args); n == 0 {
		cwd, _ = os.Getwd()
	} else if n == 1 {
		cwd = args[0]
	} else {
		cwd = args[0]
		isWindows = strings.HasPrefix(args[1], "win")
	}

	// Choose the separator pattern and the seed of the reduction below:
	// Windows-style paths split on either slash and start from "", other
	// paths split on "/" and start from "/".
	if isWindows {
		c = `[\\\/]`
		init = array.Array{""}
	} else {
		c = `/`
		init = array.Array{"/"}
	}

	reg = regexp.MustCompile(c)

	// join combines x and y into a one-element Array holding the cleaned
	// path made of whichever of the two are non-empty strings.
	var join = func(x, y interface{}) array.Array {
		tmpArray := array.Array{x, y}
		// Keep only values that are non-empty strings.
		var ps = tmpArray.Filter(func(p interface{}, args ...interface{}) bool {
			switch p.(type) {
			case string:
				if p.(string) != "" {
					return true
				}
				return false
			default:
				return false
			}
			// NOTE(review): unreachable — every arm of the switch above returns.
			return false
		})
		if isWindows {
			sep = "\\"
		} else {
			sep = "/"
		}
		// NOTE(review): path.Clean only treats "/" as a separator, so it does
		// not normalise the "\\"-joined Windows form — confirm this is intended.
		return array.Array{path.Clean(ps.Join(sep))}
	}

	// Split the cleaned path into its components.
	var res = path.Clean(cwd)
	arr := array.Array{}
	for _, v := range reg.Split(res, -1) {
		arr.Push(v)
	}

	// Starting from init, append for each component the join of the
	// accumulator element at the callback's index with that component.
	// NOTE(review): relies on array.Reduce passing the element index as ix[0]
	// — confirm against the array package.
	arrReduce := arr.Reduce(func(acc, dir interface{}, ix ...interface{}) interface{} {
		tmpAcc := acc.(array.Array)
		index := ix[0].(int)
		tmpAcc = tmpAcc.Concat(join(tmpAcc[index], dir))
		return tmpAcc
	}, init)

	// Calls are not chained here the way they would be in JavaScript; each
	// intermediate result is assigned to its own variable.
	arrSlice := arrReduce.(array.Array)
	// NOTE(review): the meaning of Slice(1, 0) comes from the array package —
	// presumably "from index 1 to the end", dropping the seed; confirm.
	arrReverse := arrSlice.Slice(1, 0)
	// NOTE(review): any return value of Reverse is discarded, so this only has
	// an effect if Reverse works in place — confirm.
	arrReverse.Reverse()
	// When the first two entries are equal, only that single entry is returned.
	if len(arrReverse) >= 2 {
		if arrReverse[0] == arrReverse[1] {
			return []string{arrReverse[0].(string)}
		}
	}

	// For Windows-style paths given with a leading backslash, make sure each
	// entry starts with a backslash (replacing a leading "." if present).
	if isWindows && strings.HasPrefix(cwd, "\\") {
		cut := arrReverse.Slice(0, -1)
		// NOTE(review): the result of Map is discarded; if Map returns a new
		// Array instead of modifying cut, this rewrite has no effect — confirm.
		cut.Map(func(d interface{}, args ...interface{}) interface{} {
			// NOTE(review): indexing [0] panics if an entry is an empty string.
			var ch = d.(string)[0]
			if ch == '\\' {
				return d
			} else if ch == '.' {
				return "\\" + d.(string)[1:]
			} else {
				return "\\" + d.(string)
			}
		})
		return cut.ToString()
	}
	return arrReverse.ToString()
}