Esempio n. 1
0
// find_keywords counts dictionary keywords occurring in line.
//
// The line is upper-cased and converted to runes. Starting at every rune that
// is neither white space nor punctuation, candidates of two or more runes are
// passed to dict.CommonPrefixSearch; each candidate for which the search
// returns at least one result is counted in the returned map, keyed by the
// candidate's (upper-cased) text.
func find_keywords(dict darts.Darts, line string) map[string]int {
	arr := []rune(strings.ToUpper(line))
	result := make(map[string]int)
	for i := 0; i < len(arr); i++ {
		offset := i
		c := arr[offset]
		if unicode.IsSpace(c) || unicode.IsPunct(c) {
			continue
		}
		// NOTE(review): the bound offset+pos < len(arr) means a candidate never
		// includes the final rune of the line — confirm this is intended.
		for pos := 2; offset+pos < len(arr); pos++ {
			// This c shadows the outer one: it is the last rune of the candidate.
			c := arr[offset+pos-1]
			if unicode.IsPunct(c) {
				break
			}
			// log.Info(string(arr[offset : offset+pos]))
			exist, results := dict.CommonPrefixSearch(arr[offset:offset+pos], 0)
			if len(results) > 0 {
				key := string(arr[offset : offset+pos])
				result[key] = result[key] + 1
				// Move the window start to the last rune of the match. Only the
				// local offset changes; the outer loop still advances i by one.
				offset = offset + pos - 1
			} else if !exist {
				// presumably exist reports whether any dictionary entry can
				// still extend this candidate — TODO confirm against darts.
				break
			}
		}
	}
	return result
}
// owp solves the "odd word problem": it copies src to dst one byte at a time,
// reversing every second word. Words are runs of non-punctuation bytes, each
// followed by a single punctuation byte; the first word is copied as is, the
// second is reversed, and so on. Processing stops after the first '.' has
// been written, or as soon as src cannot supply another byte.
//
// Writes to dst are best-effort: write errors are not reported because the
// function has no error result.
func owp(dst io.Writer, src io.Reader) {
	// byteIn reads a single byte; ok is false once src is exhausted or fails.
	byteIn := func() (b byte, ok bool) {
		var bs [1]byte
		if _, err := io.ReadFull(src, bs[:]); err != nil {
			return 0, false
		}
		return bs[0], true
	}
	byteOut := func(b byte) { dst.Write([]byte{b}) }

	// odd consumes one word and returns the punctuation byte that ended it.
	// Each letter's output is deferred, so the letters are written in reverse
	// order when odd returns (deferred calls run last-in, first-out).
	odd := func() (byte, bool) {
		for {
			b, ok := byteIn()
			if !ok {
				return 0, false
			}
			if unicode.IsPunct(rune(b)) {
				return b, true
			}
			defer byteOut(b)
		}
	}

	for {
		// Even word: copy through, including its terminating punctuation.
		for {
			b, ok := byteIn()
			if !ok {
				return
			}
			byteOut(b)
			if b == '.' {
				return
			}
			if unicode.IsPunct(rune(b)) {
				break
			}
		}
		// Odd word: written reversed by odd, then its punctuation.
		b, ok := odd()
		if !ok {
			return
		}
		byteOut(b)
		if b == '.' {
			return
		}
	}
}
// owp solves the "odd word problem": it copies src to dst one byte at a time,
// reversing every second word. Words are runs of non-punctuation bytes, each
// followed by a single punctuation byte; the first word is copied as is, the
// second is reversed, and so on. Processing stops after the first '.' has
// been written, or as soon as src cannot supply another byte.
//
// Writes to dst are best-effort: write errors are not reported because the
// function has no error result.
func owp(dst io.Writer, src io.Reader) {
	// byteIn reads a single byte; ok is false once src is exhausted or fails.
	byteIn := func() (b byte, ok bool) {
		var bs [1]byte
		if _, err := io.ReadFull(src, bs[:]); err != nil {
			return 0, false
		}
		return bs[0], true
	}
	byteOut := func(b byte) { dst.Write([]byte{b}) }

	// odd consumes one word recursively and returns the punctuation byte that
	// ended it. Each level writes its own letter only after the deeper levels
	// have returned, which emits the word in reverse. Without the ok result
	// the recursion would never end on input that lacks a terminator.
	var odd func() (byte, bool)
	odd = func() (byte, bool) {
		s, ok := byteIn()
		if !ok {
			return 0, false
		}
		if unicode.IsPunct(rune(s)) {
			return s, true
		}
		b, ok := odd()
		byteOut(s)
		return b, ok
	}

	for {
		// Even word: copy through, including its terminating punctuation.
		for {
			b, ok := byteIn()
			if !ok {
				return
			}
			byteOut(b)
			if b == '.' {
				return
			}
			if unicode.IsPunct(rune(b)) {
				break
			}
		}
		// Odd word: written reversed by odd, then its punctuation.
		b, ok := odd()
		if !ok {
			return
		}
		byteOut(b)
		if b == '.' {
			return
		}
	}
}
Esempio n. 4
0
File: utils.go Progetto: akavel/vfmd
// DeEscapeProse returns a copy of p in which every backslash that is
// immediately followed by a punctuation or symbol rune has been removed.
//
// A run containing such a backslash is split: the bytes before the backslash
// become one output run and scanning resumes on the remainder, so the result
// may contain more runs than p. Every output run keeps the Line of the input
// run it came from. Backslashes followed by any other rune are left in place.
func DeEscapeProse(p md.Prose) md.Prose {
	result := make(md.Prose, 0, len(p))
	// buf is the not-yet-emitted tail of the current run; nil means "start a
	// fresh run at p[i]".
	var buf []byte
runs:
	for i := 0; i < len(p); i++ {
		if buf == nil {
			buf = p[i].Bytes
		}
		// j is the offset in buf from which the next backslash is searched.
		for j := 0; ; {
			k := bytes.IndexByte(buf[j:], '\\')
			if k == -1 {
				// No backslash left: emit the remainder and move to the next run.
				result = append(result, md.Run{
					Line:  p[i].Line,
					Bytes: buf,
				})
				buf = nil
				continue runs
			}
			j += k
			// NOTE(review): when the backslash is the last byte, buf[j+1:] is
			// empty and DecodeRune yields utf8.RuneError (U+FFFD), which looks
			// like it satisfies unicode.IsSymbol — so a trailing backslash
			// appears to be dropped. Confirm this is intended.
			r, _ := utf8.DecodeRune(buf[j+1:])
			if unicode.IsPunct(r) || unicode.IsSymbol(r) {
				// Emit everything before the backslash, keep the escaped rune
				// onward in buf, and revisit the same p[i] (i-- cancels the
				// i++ performed by "continue runs").
				// NOTE(review): the rescan starts at offset 0 of the new buf,
				// i.e. at the escaped rune itself, so an escaped backslash is
				// examined again as a possible escape — confirm.
				result = append(result, md.Run{
					Line:  p[i].Line,
					Bytes: buf[:j],
				})
				buf = buf[j+1:]
				i--
				continue runs
			}
			// Not an escape: keep the backslash and search after it.
			j++
		}
	}
	return result
}
Esempio n. 5
0
File: util.go Progetto: nmeum/cpod
// Escape converts name into a string that is safe to use as a filename.
// HTML entities in name are decoded first. Letters and numbers are kept,
// white space and punctuation become '-', and every other rune is dropped.
// Runs of '-' are then collapsed into one and a leading or trailing '-' is
// removed. An error is returned if nothing is left.
func Escape(name string) (escaped string, err error) {
	mapRune := func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			return r
		}
		if unicode.IsSpace(r) || unicode.IsPunct(r) {
			return '-'
		}
		return -1 // a negative result makes strings.Map drop the rune
	}

	escaped = strings.Map(mapRune, html.UnescapeString(name))

	// One replacement pass only halves a run of dashes, so repeat until
	// no doubled dash remains.
	for strings.Contains(escaped, "--") {
		escaped = strings.Replace(escaped, "--", "-", -1)
	}
	escaped = strings.TrimSuffix(strings.TrimPrefix(escaped, "-"), "-")

	if escaped == "" {
		return escaped, errors.New("couldn't escape title")
	}
	return escaped, nil
}
Esempio n. 6
0
// splitText splits t into words. Punctuation and white space separate words
// and are not part of the result, except that an apostrophe is treated as a
// word character so that contractions and possessives such as "boy's" stay
// in one piece. It returns nil when t contains no words.
func splitText(t string) (ws []string) {
	start := 0      // byte offset just past the most recent separator
	inWord := false // whether t[start:i] currently holds word characters

	for i, r := range t {
		if r == '\'' || !(unicode.IsPunct(r) || unicode.IsSpace(r)) {
			inWord = true
			continue
		}

		if inWord {
			ws = append(ws, t[start:i])
			inWord = false
		}
		// Skip the separator by its encoded width; separators such as '—'
		// occupy more than one byte.
		start = i + utf8.RuneLen(r)
	}

	if inWord {
		ws = append(ws, t[start:])
	}
	return ws
}
Esempio n. 7
0
// isSearchWordRune reports whether r may appear in an unquoted predicate
// argument or an unquoted literal. Every Unicode letter, digit and
// punctuation rune qualifies, except for ':', which marks predicates, and
// '(' and ')', which group predicates.
func isSearchWordRune(r rune) bool {
	if r == ':' || r == '(' || r == ')' {
		return false
	}
	switch {
	case unicode.IsLetter(r), unicode.IsDigit(r), unicode.IsPunct(r):
		return true
	}
	return false
}
Esempio n. 8
0
// CharCount reads runes from in until end of input and returns how many fell
// into each Unicode character class. The keys are "invalid", "control",
// "letter", "mark", "number", "punct", "space" and "symbol"; a rune is counted
// under the first class that matches, in that order, and runes matching none
// are not counted. On a read error other than io.EOF the error is printed to
// standard error and the process exits with status 1.
func CharCount(in *bufio.Reader) map[string]int {
	tally := map[string]int{}

	for {
		r, size, err := in.ReadRune()
		if err != nil {
			if err == io.EOF {
				return tally
			}
			fmt.Fprintf(os.Stderr, "charcount: %v\n", err)
			os.Exit(1)
		}

		class := ""
		switch {
		case r == unicode.ReplacementChar && size == 1:
			// A one-byte replacement character marks undecodable input.
			class = "invalid"
		case unicode.IsControl(r):
			class = "control"
		case unicode.IsLetter(r):
			class = "letter"
		case unicode.IsMark(r):
			class = "mark"
		case unicode.IsNumber(r):
			class = "number"
		case unicode.IsPunct(r):
			class = "punct"
		case unicode.IsSpace(r):
			class = "space"
		case unicode.IsSymbol(r):
			class = "symbol"
		}
		if class != "" {
			tally[class]++
		}
	}
}
Esempio n. 9
0
// MorseKeys converts in into the sequence of keys that transmits it.
//
// White space produces no keys of its own; it only marks that a word gap is
// due before the next rune. Before each translated rune a gap is inserted:
// punctGap for punctuation that follows an earlier rune, otherwise wordGap if
// white space was skipped, otherwise charGap if any rune came before. An
// error is returned for a rune that has no entry in runeToKeys.
func MorseKeys(in string) ([]key, error) {
	var (
		wordGapDue bool // white space seen since the last translated rune
		sentAny    bool // at least one rune has been translated
	)
	keys := []key{}
	for _, r := range in {
		if unicode.IsSpace(r) {
			wordGapDue = true
			continue
		}
		code, known := runeToKeys[r]
		if !known {
			return nil, fmt.Errorf("can't translate %c to morse", r)
		}
		switch {
		case sentAny && unicode.IsPunct(r):
			keys = append(keys, punctGap...)
		case wordGapDue:
			keys = append(keys, wordGap...)
		case sentAny:
			keys = append(keys, charGap...)
		}
		keys = append(keys, code...)
		sentAny, wordGapDue = true, false
	}
	return keys, nil
}
Esempio n. 10
0
// main reads UTF-8 text from standard input and prints three tables: the
// number of occurrences of every rune, the number of runes per UTF-8 encoding
// length, and the number of runes per Unicode category (Letter, Digit, Symbol,
// Punct, Space). Counts of invalid bytes and of runes matching no category
// are printed only when non-zero.
func main() {
	var (
		runeCounts     = map[rune]int{}     // occurrences of each rune
		lengthCounts   [utf8.UTFMax + 1]int // indexed by encoded length in bytes
		invalidCount   int                  // undecodable input bytes
		categoryCounts = map[string]int{}   // occurrences per category name
		unknownCount   int                  // runes that matched no category
	)

	// categoryOf names the first category r belongs to, or "" if none.
	categoryOf := func(r rune) string {
		switch {
		case unicode.IsLetter(r):
			return "Letter"
		case unicode.IsDigit(r):
			return "Digit"
		case unicode.IsSymbol(r):
			return "Symbol"
		case unicode.IsPunct(r):
			return "Punct"
		case unicode.IsSpace(r):
			return "Space"
		}
		return ""
	}

	stdin := bufio.NewReader(os.Stdin)
	for {
		r, size, err := stdin.ReadRune()
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Fprintf(os.Stderr, "charcount: %v\n", err)
			os.Exit(1)
		}
		if r == unicode.ReplacementChar && size == 1 {
			invalidCount++
			continue
		}
		runeCounts[r]++
		lengthCounts[size]++
		if name := categoryOf(r); name != "" {
			categoryCounts[name]++
		} else {
			unknownCount++
		}
	}

	fmt.Print("rune\tcount\n")
	for r, count := range runeCounts {
		fmt.Printf("%q\t%d\n", r, count)
	}

	fmt.Print("\nlen\tcount\n")
	for length, count := range lengthCounts {
		if length == 0 {
			continue // no rune is encoded in zero bytes
		}
		fmt.Printf("%d\t%d\n", length, count)
	}
	if invalidCount > 0 {
		fmt.Printf("\n%d invalid UTF-8 characters\n", invalidCount)
	}

	fmt.Print("\ncat\tcount\n")
	for name, count := range categoryCounts {
		fmt.Printf("%s\t%d\n", name, count)
	}
	if unknownCount > 0 {
		fmt.Printf("\n%d characters of unknown category\n", unknownCount)
	}
}
Esempio n. 11
0
// TestRune_IsIndependent checks that IsIndependent reports true for every
// letter, number and punctuation rune listed in unicode.Myanmar.R16, and for
// the white-space characters space, tab, carriage return and newline.
func TestRune_IsIndependent(t *testing.T) {
	var numbers, letters, marks, symbols, puncts, others []rune

	// Sort every rune of the 16-bit Myanmar ranges into its class.
	for _, rng := range unicode.Myanmar.R16 {
		for c := rng.Lo; c <= rng.Hi; c++ {
			mr := rune(c)
			switch {
			case unicode.IsLetter(mr):
				letters = append(letters, mr)
			case unicode.IsNumber(mr):
				numbers = append(numbers, mr)
			case unicode.IsMark(mr):
				marks = append(marks, mr)
			case unicode.IsPunct(mr):
				puncts = append(puncts, mr)
			case unicode.IsSymbol(mr):
				symbols = append(symbols, mr)
			default:
				others = append(others, mr)
			}
		}
	}

	independents := string(letters)
	independents += string(numbers)
	independents += string(puncts)
	independents += " \t\r\n"

	for _, consonant := range independents {
		ok, _ := Rune(consonant).IsIndependent()
		if ok {
			continue
		}
		t.Errorf("[%U] expected result is true, but it returns false", consonant)
	}
}
Esempio n. 12
0
// CharType returns the name of the first Unicode class that r belongs to.
// The classes are tried in this order: letter, space, punct, number, symbol,
// mark, digit, print, control, graphic. "invalid" is returned when none of
// them matches.
func CharType(r rune) string {
	classes := [...]struct {
		name string
		has  func(rune) bool
	}{
		{"letter", unicode.IsLetter},
		{"space", unicode.IsSpace},
		{"punct", unicode.IsPunct},
		{"number", unicode.IsNumber},
		{"symbol", unicode.IsSymbol},
		{"mark", unicode.IsMark},
		{"digit", unicode.IsDigit},
		{"print", unicode.IsPrint},
		{"control", unicode.IsControl},
		{"graphic", unicode.IsGraphic},
	}
	for _, class := range classes {
		if class.has(r) {
			return class.name
		}
	}
	return "invalid"
}
Esempio n. 13
0
// Stat reads runes from r until io.EOF and returns the accumulated counters.
// Every rune increments TotalN and, independently, one counter for each
// unicode predicate it satisfies; MultiByteN counts runes that were read from
// more than one byte. On any other read error the counters gathered so far
// are returned together with that error.
func (m *Main) Stat(r io.RuneReader) (Stats, error) {
	var stats Stats

	// Each predicate is paired with the counter it feeds. The classes overlap,
	// so a single rune may bump several counters.
	counters := []struct {
		matches func(rune) bool
		bump    func()
	}{
		{unicode.IsControl, func() { stats.ControlN++ }},
		{unicode.IsDigit, func() { stats.DigitN++ }},
		{unicode.IsGraphic, func() { stats.GraphicN++ }},
		{unicode.IsLetter, func() { stats.LetterN++ }},
		{unicode.IsLower, func() { stats.LowerN++ }},
		{unicode.IsMark, func() { stats.MarkN++ }},
		{unicode.IsNumber, func() { stats.NumberN++ }},
		{unicode.IsPrint, func() { stats.PrintN++ }},
		{unicode.IsPunct, func() { stats.PunctN++ }},
		{unicode.IsSpace, func() { stats.SpaceN++ }},
		{unicode.IsSymbol, func() { stats.SymbolN++ }},
		{unicode.IsTitle, func() { stats.TitleN++ }},
		{unicode.IsUpper, func() { stats.UpperN++ }},
	}

	for {
		ch, size, err := r.ReadRune()
		switch {
		case err == io.EOF:
			return stats, nil
		case err != nil:
			return stats, err
		}

		stats.TotalN++
		for _, c := range counters {
			if c.matches(ch) {
				c.bump()
			}
		}
		if size > 1 {
			stats.MultiByteN++
		}
	}
}
Esempio n. 14
0
// incrementCount adds one to the entry of counts that corresponds to the
// first class r belongs to. The classes are tried in the order control,
// number, digit, letter, mark, punct, space, symbol, print, graphic; counts
// is left unchanged when r matches none of them.
func incrementCount(r rune, counts map[int]int) {
	classes := []struct {
		matches func(rune) bool
		key     int
	}{
		{unicode.IsControl, isControl},
		{unicode.IsNumber, isNumber},
		{unicode.IsDigit, isDigit},
		{unicode.IsLetter, isLetter},
		{unicode.IsMark, isMark},
		{unicode.IsPunct, isPunct},
		{unicode.IsSpace, isSpace},
		{unicode.IsSymbol, isSymbol},
		{unicode.IsPrint, isPrint},
		{unicode.IsGraphic, isGraphic},
	}
	for _, class := range classes {
		if class.matches(r) {
			counts[class.key]++
			return
		}
	}
}
Esempio n. 15
0
// test_password reports whether pass satisfies the Windows AD complexity
// rule: at least 7 characters, drawn from at least three of these five
// categories:
//
//   - uppercase characters
//   - lowercase characters
//   - digit characters
//   - nonalphanumeric characters
//   - any Unicode character that is categorized as an alphabetic character
//     but is not uppercase or lowercase
func test_password(pass string) bool {
	var digit, lower, upper, otherLetter, nonAlnum int
	length := 0 // characters (runes), not bytes
	for _, c := range pass {
		length++
		switch {
		case unicode.IsDigit(c):
			digit = 1
		case unicode.IsLower(c):
			lower = 1
		case unicode.IsUpper(c):
			upper = 1
		case unicode.IsLetter(c):
			// Alphabetic but caseless, e.g. CJK ideographs.
			otherLetter = 1
		default:
			// Everything that is neither a letter nor a digit — punctuation,
			// symbols, spaces — belongs to the single nonalphanumeric
			// category, so "$" and "!" together count only once.
			nonAlnum = 1
		}
	}
	if length < 7 {
		return false
	}
	return digit+lower+upper+otherLetter+nonAlnum >= 3
}
Esempio n. 16
0
File: scanner.go Progetto: rread/rsi
// isLetter reports whether ch counts as a letter for the scanner: any Unicode
// letter, punctuation or symbol rune, except parentheses and the single and
// double quote characters.
func isLetter(ch rune) bool {
	switch ch {
	case '(', ')', '\'', '"':
		return false
	}
	switch {
	case unicode.IsLetter(ch), unicode.IsPunct(ch), unicode.IsSymbol(ch):
		return true
	}
	return false
}
Esempio n. 17
0
// synopsis returns the first sentence of the first paragraph of s, with each
// run of white space collapsed to a single space. A sentence ends at the
// first '.' that is followed by white space. Results longer than 400 bytes
// are cut back to a word boundary and suffixed with " ...". The empty string
// is returned when the result starts with a punctuation or symbol rune, or
// with one of badSynopsisPrefixes.
func synopsis(s string) string {
	// Only the first paragraph (the text before the first blank line) counts.
	if end := strings.Index(s, "\n\n"); end >= 0 {
		s = s[:end]
	}

	// Kind of the previously handled byte.
	const (
		other = iota
		period
		space
	)

	var out []byte
	prev := space // starting in "space" discards leading white space
scan:
	for _, b := range []byte(s) {
		switch {
		case b == ' ' || b == '\t' || b == '\r' || b == '\n':
			if prev == period {
				break scan // white space after a period ends the sentence
			}
			if prev == other {
				out = append(out, ' ')
				prev = space
			}
		case b == '.':
			out = append(out, b)
			prev = period
		default:
			out = append(out, b)
			prev = other
		}
	}

	// Ensure that synopsis fits an App Engine datastore text property.
	const limit = 400
	if len(out) > limit {
		out = out[:limit]
		if cut := bytes.LastIndex(out, []byte(" ")); cut >= 0 {
			out = out[:cut]
		}
		out = append(out, " ..."...)
	}
	s = string(out)

	// Ignore Markdown headings, editor settings, Go build constraints, and *
	// in poorly formatted block comments.
	if first, _ := utf8.DecodeRuneInString(s); unicode.IsPunct(first) || unicode.IsSymbol(first) {
		return ""
	}
	for _, prefix := range badSynopsisPrefixes {
		if strings.HasPrefix(s, prefix) {
			return ""
		}
	}
	return s
}
Esempio n. 18
0
// ValidatePassword checks value against the password rules and returns nil
// when it satisfies all of them:
//
//   - at least 7 bytes long
//   - at least 1 digit
//   - at least 1 upper-case letter
//   - at least 1 lower-case letter
//   - at least 1 special character (symbol or punctuation)
//
// Otherwise it returns an error whose message is the "text03" translation
// for the locale local.
func ValidatePassword(value, local string) error {
	// Log only the event. The candidate password itself must never be
	// written to standard output or to logs.
	fmt.Println("Validate password")
	if len(value) < 7 {
		return errors.New(i18n.Translate(local, i18nSec, "text03"))
	}

	var num, lower, upper, spec bool
	for _, r := range value {
		switch {
		case unicode.IsDigit(r):
			num = true
		case unicode.IsUpper(r):
			upper = true
		case unicode.IsLower(r):
			lower = true
		case unicode.IsSymbol(r), unicode.IsPunct(r):
			spec = true
		}
	}
	if num && lower && upper && spec {
		return nil
	}

	return errors.New(i18n.Translate(local, i18nSec, "text03"))
}
Esempio n. 19
0
// AllPunctOrSpace reports whether s consists solely of punctuation and white
// space runes. It returns true for the empty string.
func AllPunctOrSpace(s string) bool {
	for _, r := range s {
		if unicode.IsPunct(r) || unicode.IsSpace(r) {
			continue
		}
		return false
	}
	return true
}
Esempio n. 20
0
// sanitize returns s with every punctuation rune removed. All other runes,
// including symbols and white space, are kept in their original order.
func sanitize(s string) string {
	var kept bytes.Buffer
	for _, r := range s {
		if unicode.IsPunct(r) {
			continue
		}
		kept.WriteRune(r)
	}
	return kept.String()
}
Esempio n. 21
0
// IsNumber reports whether every rune of s is a Unicode number or a
// punctuation character. It returns true for the empty string.
func IsNumber(s string) bool {
	for _, r := range s {
		if !unicode.IsNumber(r) && !unicode.IsPunct(r) {
			return false
		}
	}
	return true
}
Esempio n. 22
0
// startPar starts a paragraph formatter that writes to out and returns a
// *parFmt built from the input channel rc and the completion channel ec.
//
// Strings sent on rc are split into white-space-separated words, which are
// written to out and wrapped before a word would take the line past max
// (widths are measured with len, i.e. in bytes). indent0 is written once at
// the start and indent after every wrap. A string equal to "\n" forces a line
// break. ec is closed after rc has been closed and all pending words have
// been written.
func startPar(out io.Writer, indent0, indent string, max int) *parFmt {
	rc := make(chan string)
	ec := make(chan bool, 1)
	wc := make(chan string)
	pf := &parFmt{rc, ec}
	// Splitter: turns each incoming string into words on wc. A lone "\n" is
	// passed through unchanged so the writer can see forced line breaks.
	go func() {
		for s := range rc {
			if s == "\n" {
				wc <- s
				continue
			}
			words := strings.Fields(strings.TrimSpace(s))
			for _, w := range words {
				wc <- w
			}
		}
		close(wc)
	}()
	// Writer: lays the words out on lines, tracking the current column in pos.
	go func() {
		pos, _ := fmt.Fprintf(out, "%s", indent0)
		firstword := true
		// Placeholder so lastword is never empty when its last byte is read.
		lastword := "x"
		for w := range wc {
			if len(w) == 0 {
				continue
			}
			if w == "\n" {
				// Forced break: no indent is written for the following line.
				fmt.Fprintf(out, "\n")
				firstword = true
				pos = 0
				continue
			}
			if pos+len(w)+1 > max {
				fmt.Fprintf(out, "\n")
				pos, _ = fmt.Fprintf(out, "%s", indent)
				firstword = true
			}
			// A separating space is written unless the word is first on its
			// line, starts with a punctuation byte, or follows a word ending
			// in an opening bracket. Only single bytes are inspected here.
			if !firstword && len(w)>0 && !unicode.IsPunct(rune(w[0])) {
				lastr := rune(lastword[len(lastword)-1])
				if !strings.ContainsRune("([{", lastr) {
					fmt.Fprintf(out, " ")
					pos++
				}
			}
			fmt.Fprintf(out, "%s", w)
			pos += len(w)
			firstword = false
			lastword = w
		}
		// Terminate a line that still has text on it.
		if !firstword {
			fmt.Fprintf(out, "\n")
		}
		close(ec)
	}()
	return pf
}
Esempio n. 23
0
// Sanitize returns a space for punctuation, mark and symbol runes and
// returns every other rune unchanged.
func Sanitize(r rune) rune {
	if unicode.IsPunct(r) || unicode.IsMark(r) || unicode.IsSymbol(r) {
		return ' '
	}
	return r
}
Esempio n. 24
0
// main reads UTF-8 text from standard input and prints two tables: the number
// of runes per Unicode category (Letter, Mark, Number, Punct, Symbol, Space,
// Other) and the number of runes per UTF-8 encoding length. The count of
// invalid bytes is printed only when non-zero.
func main() {
	var (
		categoryCounts = map[string]int{}   // occurrences per category name
		lengthCounts   [utf8.UTFMax + 1]int // indexed by encoded length in bytes
		invalidCount   int                  // undecodable input bytes
	)

	// categoryOf names the first category r belongs to.
	categoryOf := func(r rune) string {
		switch {
		case unicode.IsLetter(r):
			return "Letter"
		case unicode.IsMark(r):
			return "Mark"
		case unicode.IsNumber(r):
			return "Number"
		case unicode.IsPunct(r):
			return "Punct"
		case unicode.IsSymbol(r):
			return "Symbol"
		case unicode.IsSpace(r):
			return "Space"
		}
		return "Other"
	}

	stdin := bufio.NewReader(os.Stdin)
	for {
		r, size, err := stdin.ReadRune()
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Fprintf(os.Stderr, "charcount: %v\n", err)
			os.Exit(1)
		}
		if r == unicode.ReplacementChar && size == 1 {
			invalidCount++
			continue
		}

		lengthCounts[size]++
		categoryCounts[categoryOf(r)]++
	}

	fmt.Print("rune\tcount\n")
	for name, count := range categoryCounts {
		fmt.Printf("%s\t%d\n", name, count)
	}

	fmt.Print("\nlen\tcount\n")
	for length, count := range lengthCounts {
		if length == 0 {
			continue // no rune is encoded in zero bytes
		}
		fmt.Printf("%d\t%d\n", length, count)
	}
	if invalidCount > 0 {
		fmt.Printf("\n%d invalid UTF-8 characters\n", invalidCount)
	}
}
Esempio n. 25
0
File: ex8.go Progetto: yyBeta/gopl
// main reads UTF-8 text from standard input and prints the number of
// occurrences of every rune, the number of runes per UTF-8 encoding length,
// the number of runes per category (letter, digit, control, mark, punct,
// symbol) and the number of invalid bytes.
func main() {
	var (
		runeCounts   = map[rune]int{}   // occurrences of each rune
		lengthCounts [utf8.UTFMax]int   // index k holds encodings of k+1 bytes
		invalidCount int                // undecodable input bytes
		categories   = map[string]int{} // occurrences per category name
	)

	// In a terminal, use CTRL+Z at line start to signal EOF with ENTER.
	stdin := bufio.NewReader(os.Stdin)
	for {
		r, size, err := stdin.ReadRune()
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Fprintf(os.Stderr, "charcount: %v\n", err)
			os.Exit(1)
		}
		if r == unicode.ReplacementChar && size == 1 {
			invalidCount++
			continue
		}

		// Runes matching none of the categories are left uncategorised.
		name := ""
		switch {
		case unicode.IsLetter(r):
			name = "letter"
		case unicode.IsDigit(r):
			name = "digit"
		case unicode.IsControl(r):
			name = "control"
		case unicode.IsMark(r):
			name = "mark"
		case unicode.IsPunct(r):
			name = "punct"
		case unicode.IsSymbol(r):
			name = "symbol"
		}
		if name != "" {
			categories[name]++
		}
		runeCounts[r]++
		lengthCounts[size-1]++
	}

	fmt.Print("rune\tcount\n")
	for r, count := range runeCounts {
		fmt.Printf("%q\t%d\n", r, count)
	}

	fmt.Print("\nlen\tcount\n")
	for idx, count := range lengthCounts {
		fmt.Printf("%d\t%d\n", idx+1, count)
	}

	fmt.Print("\ncat\tcount\n")
	for name, count := range categories {
		fmt.Printf("%v\t%d\n", name, count)
	}

	fmt.Printf("\n%d invalid UTF-8 characters\n", invalidCount)
}
Esempio n. 26
0
// main reads UTF-8 text from standard input and prints the number of runes
// per Unicode character type (control, letter, mark, number, punct, space,
// symbol) and the number of runes per UTF-8 encoding length. The count of
// invalid bytes is printed only when non-zero.
func main() {
	var (
		typeCounts   = map[string]int{}   // occurrences per character type
		lengthCounts [utf8.UTFMax + 1]int // indexed by encoded length in bytes
		invalidCount int                  // undecodable input bytes
	)

	stdin := bufio.NewReader(os.Stdin)
	for {
		r, size, err := stdin.ReadRune()
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Fprintf(os.Stderr, "charcount: %v\n", err)
			os.Exit(1)
		}
		if r == unicode.ReplacementChar && size == 1 {
			invalidCount++
			continue
		}

		// The first matching type wins; runes matching none are not typed
		// but still count towards the length table.
		name := ""
		switch {
		case unicode.IsControl(r):
			name = "control"
		case unicode.IsLetter(r):
			name = "letter"
		case unicode.IsMark(r):
			name = "mark"
		case unicode.IsNumber(r):
			name = "number"
		case unicode.IsPunct(r):
			name = "punct"
		case unicode.IsSpace(r):
			name = "space"
		case unicode.IsSymbol(r):
			name = "symbol"
		}
		if name != "" {
			typeCounts[name]++
		}
		lengthCounts[size]++
	}

	fmt.Print("rune\tcount\n")
	for name, count := range typeCounts {
		fmt.Printf("%q\t%d\n", name, count)
	}

	fmt.Print("\nlen\tcount\n")
	for length, count := range lengthCounts {
		if length == 0 {
			continue // no rune is encoded in zero bytes
		}
		fmt.Printf("%d\t%d\n", length, count)
	}
	if invalidCount > 0 {
		fmt.Printf("\n%d invalid UTF-8 characters\n", invalidCount)
	}
}
Esempio n. 27
0
// atTerminator reports whether the next rune of the input may legally follow
// an identifier: end of input, white space, end of line, any punctuation or
// symbol rune, or the first rune of the right delimiter.
func (l *Scanner) atTerminator() bool {
	next := l.peek()
	switch {
	case next == eof, isSpace(next), isEndOfLine(next):
		return true
	case unicode.IsPunct(next), unicode.IsSymbol(next):
		return true
	}
	// Does next start the delimiter? This can be ambiguous (with delim=="//",
	// $x/2 will succeed but should fail) but only in extremely rare cases
	// caused by willfully bad choice of delimiter.
	delimStart, _ := utf8.DecodeRuneInString(l.rightDelim)
	return delimStart == next
}
Esempio n. 28
0
// delayPercent returns the display delay for word as a percentage of the base
// delay: 100 plus 10 for every point the word scores. A word scores
//
//   - 2 points if its last rune is punctuation,
//   - 2 points if its first rune is punctuation,
//   - 1 point if it has more than 8 letters or numbers,
//   - 1 further point if it has more than 12.
//
// The empty word scores nothing, so the result for it is 100.
func delayPercent(word string) int {
	rword := []rune(word)
	if len(rword) == 0 {
		// Nothing to inspect; indexing the first or last rune would panic.
		return 100
	}

	// Only letters and numbers count towards the length bonuses.
	alnum := 0
	for _, r := range rword {
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			alnum++
		}
	}

	wordScore := 0
	if unicode.IsPunct(rword[len(rword)-1]) {
		wordScore += 2
	}
	if unicode.IsPunct(rword[0]) {
		wordScore += 2
	}
	if alnum > 8 {
		wordScore++
	}
	if alnum > 12 {
		wordScore++
	}
	return 100 + 10*wordScore
}
Esempio n. 29
0
File: jee.go Progetto: ltt1987/gojee
// getIdent classifies r for the lexer. A rune with an entry in Ident maps to
// that entry; otherwise numbers yield CONST, letters, punctuation and symbols
// yield RESERVED, white space yields SPACE and anything else yields ZERO.
func getIdent(r rune) int {
	if id, found := Ident[r]; found {
		return id
	}
	if unicode.IsNumber(r) {
		return CONST
	}
	if unicode.IsLetter(r) || unicode.IsPunct(r) || unicode.IsSymbol(r) {
		return RESERVED
	}
	if unicode.IsSpace(r) {
		return SPACE
	}
	return ZERO
}
Esempio n. 30
0
// normalize returns a normalized copy of in: the text is decomposed (NFD),
// nonspacing marks are removed, and the remainder is recomposed (NFC); then
// every punctuation rune is replaced by a space and every other rune is
// lower-cased. The error from the transformation step, if any, is returned
// alongside whatever output that step produced.
func normalize(in []byte) ([]byte, error) {
	// Mn: nonspacing marks (to be removed).
	isNonspacingMark := func(r rune) bool {
		return unicode.Is(unicode.Mn, r)
	}
	// Replace punctuation with spaces; convert everything else to lower case.
	foldRune := func(r rune) rune {
		if !unicode.IsPunct(r) {
			return unicode.ToLower(r)
		}
		return ' '
	}

	// We need a new transformer for each input as it cannot be reused.
	chain := transform.Chain(norm.NFD, transform.RemoveFunc(isNonspacingMark), norm.NFC)
	stripped, _, err := transform.Bytes(chain, in)
	return bytes.Map(foldRune, stripped), err
}