Example #1
0
// matchBoundaryMode reports whether the boundary assertion selected by s.lr
// holds between the runes left and right. It only applies to instructions in
// iBoundaryCase mode and returns false for every other mode. If either left or
// right is not within the target string, then -1 should be provided.
//
// It panics if s.lr is not one of the boundary kinds handled below.
func (s *instr) matchBoundaryMode(left rune, right rune) bool {
	if s.mode != iBoundaryCase {
		return false
	}
	switch s.lr {
	case bBeginText:
		return left == -1
	case bBeginLine:
		return left == -1 || left == '\n'
	case bEndText:
		return right == -1
	case bEndLine:
		return right == -1 || right == '\n'
	case bWordBoundary, bNotWordBoundary:
		// TODO: This is ASCII-only at this point.
		//
		// A word boundary exists wherever exactly one of the two neighbours is
		// a word character. The other side may be any non-word rune
		// (whitespace, punctuation, ...) or the edge of the text: unicode.Is
		// reports false for the -1 sentinel, so an edge counts as non-word.
		wordRange := perl_groups['w']
		wb := unicode.Is(wordRange, left) != unicode.Is(wordRange, right)
		if s.lr == bWordBoundary {
			return wb
		}
		return !wb
	}
	panic("unexpected lr mode")
}
Example #2
0
File: xml.go Project: WXB506/golang
// name reads an XML name of the form /first(first|second)*/ from the input.
// It returns the name and true on success, or "" and false otherwise.
// Do not set p.err if the name is missing (unless unexpected EOF is received):
// let the caller provide better context.
func (p *Parser) name() (s string, ok bool) {
	c, got := p.mustgetc()
	if !got {
		return "", false
	}

	// First pass: collect the bytes [A-Za-z_:.-\x80-\xFF]*. Bytes at or above
	// utf8.RuneSelf are always accepted here and validated as runes below.
	endsName := func(b byte) bool {
		return b < utf8.RuneSelf && !isNameByte(b)
	}
	if endsName(c) {
		p.ungetc(c)
		return "", false
	}
	p.buf.Reset()
	for {
		p.buf.WriteByte(c)
		if c, got = p.mustgetc(); !got {
			return "", false
		}
		if endsName(c) {
			// The terminating byte belongs to whatever follows the name.
			p.ungetc(c)
			break
		}
	}

	// Second pass: every rune must be in first, or, past the first position,
	// in second.
	s = p.buf.String()
	for i, r := range s {
		if unicode.Is(first, r) {
			continue
		}
		if i > 0 && unicode.Is(second, r) {
			continue
		}
		p.err = p.syntaxError("invalid XML name: " + s)
		return "", false
	}
	return s, true
}
Example #3
0
// main fetches the document at the URL given as the first command-line
// argument, splits it into maximal runs of Japanese text (Han, Hiragana,
// Katakana and the long-vowel mark 'ー'), and prints every distinct run
// preceded by the number of times it occurred. The output order follows map
// iteration order and is therefore unspecified.
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "missing URL argument")
		os.Exit(1)
	}

	counter := make(map[string]int)

	data := FetchURL(os.Args[1])

	isJapanese := func(r rune) bool {
		return unicode.Is(unicode.Han, r) || unicode.Is(unicode.Hiragana, r) || unicode.Is(unicode.Katakana, r) || r == 'ー'
	}

	var run []rune

	// flush records the run collected so far, if any. Empty runs are skipped
	// so that text starting with a non-Japanese rune does not add an entry
	// for the empty string.
	flush := func() {
		if len(run) == 0 {
			return
		}
		counter[string(run)]++
		run = run[:0] // string(run) copied the runes, so the storage can be reused
	}

	for _, r := range data {
		if isJapanese(r) {
			run = append(run, r)
			continue
		}
		flush()
	}
	// The data may end in the middle of a run; count that one as well.
	flush()

	for k, v := range counter {
		fmt.Println(v, k)
	}
}
// reversePreservingCombiningCharacters returns s with its characters in
// reverse order, keeping every base rune together with the combining marks
// (categories Mn, Me and Mc) that follow it. The argument is interpreted as
// UTF-8; runes equal to utf8.RuneError, which is what invalid bytes decode
// to, are dropped. The return value is UTF-8.
func reversePreservingCombiningCharacters(s string) string {
	if len(s) == 0 {
		return ""
	}
	src := []rune(s)
	dst := make([]rune, len(src))
	// dst is filled from the back; head is the index of the first used slot.
	head := len(dst)
	for i := 0; i < len(src); {
		// quietly skip invalid UTF-8
		if src[i] == utf8.RuneError {
			i++
			continue
		}
		// Extend the cluster over all combining marks that follow the base.
		end := i + 1
		for end < len(src) && unicode.In(src[end], unicode.Mn, unicode.Me, unicode.Mc) {
			end++
		}
		// Place the cluster, in its original internal order, in front of
		// everything written so far.
		head -= end - i
		copy(dst[head:], src[i:end])
		i = end
	}
	return string(dst[head:])
}
Example #5
0
// EncCharacter reports whether the code point char may be emitted and, if so,
// returns it encoded as a UTF-8 string. Control (Cc), format (Cf),
// private-use (Co) and surrogate (Cs) code points, as well as line,
// paragraph and space separators (Zl, Zp, Zs), are rejected with
// (false, "").
//
// A value that does not fit in a rune is handled as unicode.ReplacementChar
// instead of being truncated to an unrelated code point.
func EncCharacter(char int) (bool, string) {
	r := rune(char)
	if int(r) != char {
		// char overflowed int32; avoid wrapping around to a valid rune.
		r = unicode.ReplacementChar
	}

	if unicode.In(r, unicode.Cc, unicode.Cf, unicode.Co, unicode.Cs, unicode.Zl, unicode.Zp, unicode.Zs) {
		return false, ""
	}

	return true, string(r)
}
// JconvCharset classifies the runes that JconvRune produces for str and
// returns a numeric code:
//
//	1 - every rune is Hiragana
//	2 - every rune is Katakana or the long-vowel mark 'ー'
//	3 - every rune lies in one of the CJK-related ranges listed below
//	0 - none of the above
//
// The checks run in that order and the first one that holds wins. Because
// each check is an "every rune" test, a rune sequence with no elements
// satisfies the first check and yields 1.
func JconvCharset(str string) int {
	runes := JconvRune(str)

	// every reports whether accept holds for all runes.
	every := func(accept func(r rune) bool) bool {
		for _, r := range runes {
			if !accept(r) {
				return false
			}
		}
		return true
	}

	// Full cjk range
	cjk := &unicode.RangeTable{
		R16: []unicode.Range16{
			{Lo: 0x3000, Hi: 0x303f, Stride: 1}, // Punctuation
			{Lo: 0x3040, Hi: 0x309f, Stride: 1}, // Hiragana
			{Lo: 0x30a0, Hi: 0x30ff, Stride: 1}, // Katakana
			{Lo: 0x3400, Hi: 0x4dbf, Stride: 1}, // CJK unified ext A
			{Lo: 0x4e00, Hi: 0x9faf, Stride: 1}, // CJK unified
			{Lo: 0xff00, Hi: 0xffef, Stride: 1}, // Romanji and hw-katakana
		},
	}

	switch {
	case every(func(r rune) bool { return unicode.Is(unicode.Hiragana, r) }):
		return 1
	case every(func(r rune) bool { return unicode.Is(unicode.Katakana, r) || r == 'ー' }):
		return 2
	case every(func(r rune) bool { return unicode.Is(cjk, r) }):
		return 3
	}

	// Failed to detect charset
	return 0
}
// processText returns a minified copy of in: every rune is lower-cased, runes
// classified as separators are dropped, and each run of dropped separators
// between two kept runes is replaced by a single space. No space is written
// before the first kept rune or after the last one.
//
// A rune is a separator when it is not in notSeparatorRT. The first separator
// after a kept rune gets a second chance: it is kept if m.isDigit accepts it
// given the previous (lower-cased) rune, the rune itself and the raw rune
// that follows it.
func (m *minificationText) processText(in string) string {
	var out bytes.Buffer
	var (
		lastWasSep bool // whether the previous rune was dropped
		wroteAny   bool // whether anything has been written to out yet
	)
	last := ' '
	for rest := in; len(rest) > 0; {
		raw, width := utf8.DecodeRuneInString(rest)
		cur := unicode.ToLower(raw)
		sep := !unicode.Is(notSeparatorRT, cur)

		// digits
		if sep && !lastWasSep {
			next, _ := utf8.DecodeRuneInString(rest[width:])
			sep = !m.isDigit(last, cur, next)
		}

		if !sep {
			if lastWasSep && wroteAny {
				_ = out.WriteByte(' ')
			}
			_, _ = out.WriteRune(cur)
			wroteAny = true
		}

		lastWasSep = sep
		last = cur
		rest = rest[width:]
	}

	return out.String()
}
Example #8
0
// main demonstrates a handful of functions from the strings package by
// printing their results for fixed inputs.
func main() {
	// Substring and character-set searches.
	fmt.Println(strings.Contains("Hello, world!", "wo"))    // true
	fmt.Println(strings.ContainsAny("Hello, world", "w o")) // true
	fmt.Println(strings.Count("Hello Helium", "He"))        // 2

	// Joining and splitting.
	fmt.Println(strings.Join([]string{"Hello,", "world"}, " ")) // Hello, world

	bySpace := strings.Split("Hello, world", " ")
	fmt.Println(bySpace[1]) // world

	byWhitespace := strings.Fields("Hello, world")
	fmt.Println(byWhitespace[1]) // world

	// Splitting on a custom predicate: Hangul runes act as separators.
	isHangul := func(r rune) bool {
		return unicode.Is(unicode.Hangul, r)
	}
	fmt.Println(strings.FieldsFunc("Hello안녕Hello", isHangul)) // [Hello Hello]

	// Repetition and replacement.
	fmt.Println(strings.Repeat("Hello", 10))
	fmt.Println(strings.Replace("Hello, world", "world", "go", 1)) // Hello, go
	fmt.Println(strings.Replace("Hello Hello", "llo", "Go", 2))    // HeGo HeGo
}
Example #9
0
func TestPredicate(t *testing.T) {
	testConditional(t, func(rt *unicode.RangeTable, t, f transform.Transformer) transform.Transformer {
		return If(Predicate(func(r rune) bool {
			return unicode.Is(rt, r)
		}), t, f)
	})
}
Example #10
0
// lexPackageName consumes a package name made of letters, '.' and '_',
// stopping at the first white-space rune, which is left unread. It emits
// TokenPackageName and continues with lexText. Any other rune, or a name
// whose last rune is '.' or '_', produces an error state instead.
func lexPackageName(l *Lexer) stateFn {
	// endsWithSep records whether the most recently accepted rune was '.' or
	// '_'.
	endsWithSep := false

	for scanning := true; scanning; {
		r := l.next()
		switch {
		case unicode.IsLetter(r):
			endsWithSep = false
		case r == '.', r == '_':
			endsWithSep = true
		case unicode.Is(unicode.White_Space, r):
			// The white space is not part of the name.
			l.backup()
			scanning = false
		default:
			l.backup()
			return l.errorf("expected newline after package name")
		}
	}

	if endsWithSep {
		return l.errorf("package names cannot end with a period or underscore")
	}

	// emit package name
	l.emit(TokenPackageName)

	return lexText
}
Example #11
0
File: data.go Project: bak1an/sre2
// matchUnicodeClass builds a RuneFilter that accepts the runes of the named
// Unicode class, or returns nil when the name matches nothing.
//
// A one-character name is shorthand for every entry of unicode.Categories
// whose key starts with that character (so 'N' matches 'Nd', 'Nl', 'No'
// etc). Any other name is looked up verbatim in unicode.Categories,
// unicode.Properties and unicode.Scripts, and every hit is included.
func matchUnicodeClass(class string) RuneFilter {
	var tables []*unicode.RangeTable

	if len(class) == 1 {
		prefix := class[0]
		for name, tab := range unicode.Categories {
			if name[0] == prefix {
				tables = append(tables, tab)
			}
		}
	} else {
		sources := []map[string]*unicode.RangeTable{
			unicode.Categories,
			unicode.Properties,
			unicode.Scripts,
		}
		for _, src := range sources {
			if tab, ok := src[class]; ok {
				tables = append(tables, tab)
			}
		}
	}

	if len(tables) == 0 {
		return nil
	}
	return func(r rune) bool {
		return unicode.In(r, tables...)
	}
}
Example #12
0
// print_rune_is prints, one per line and indented, the name of every table in
// props that contains char. Names appear in map iteration order, which is
// unspecified.
func print_rune_is(char rune, props map[string]*unicode.RangeTable) {
	for name := range props {
		if !unicode.Is(props[name], char) {
			continue
		}
		fmt.Println("  ", name)
	}
}
Example #13
0
// main demonstrates the Index* and LastIndex* families of the strings package
// by printing their results for fixed inputs. All indexes are byte offsets.
func main() {
	fmt.Println(strings.Index("Hello, world!", "He"))  // 0: "He" is at the very beginning
	fmt.Println(strings.Index("Hello, world!", "wor")) // 7: "wor" starts at the 8th byte
	fmt.Println(strings.Index("Hello, world!", "ow"))  // -1: "ow" does not occur

	fmt.Println(strings.IndexAny("Hello, world!", "eo")) // 1: 'e' is the 2nd byte
	fmt.Println(strings.IndexAny("Hello, world!", "f"))  // -1: 'f' does not occur

	// 'd' is the 12th byte, so 11 is printed; 'f' does not occur, so -1.
	for _, c := range []byte{'d', 'f'} {
		fmt.Println(strings.IndexByte("Hello, world!", c))
	}

	r := '언'
	fmt.Println(strings.IndexRune("고 언어", r)) // 4: the byte offset where "언" begins

	// isHangul reports whether r belongs to the Hangul script.
	isHangul := func(r rune) bool {
		return unicode.Is(unicode.Hangul, r)
	}
	fmt.Println(strings.IndexFunc("Go 언어", isHangul))       // 3: Hangul starts at the 4th byte
	fmt.Println(strings.IndexFunc("Go Language", isHangul)) // -1: no Hangul present

	// 12: the last "Hello" starts at the 13th byte
	fmt.Println(strings.LastIndex("Hello Hello Hello, world!", "Hello"))

	fmt.Println(strings.LastIndexAny("Hello, world", "ol")) // 10: the last 'l' is the 11th byte

	fmt.Println(strings.LastIndexFunc("Go 언어 안녕", isHangul)) // 13: byte offset where the last Hangul rune '녕' begins
}
Example #14
0
// Patterns used by NormalizeTitle, compiled once instead of on every call.
var (
	// normalizeTitleParenNumberRE matches a parenthesised run of digits,
	// e.g. "(2011)".
	normalizeTitleParenNumberRE = regexp.MustCompile(`\(\d+\)`)
	// normalizeTitleSpacesRE matches a run of white space.
	normalizeTitleSpacesRE = regexp.MustCompile(`\s+`)
)

// NormalizeTitle reduces title to a canonical form for comparison. The steps,
// in order, are:
//
//  1. pass the title, as given, through RomanizeHepburn and lower-case it;
//  2. apply RemoveTrailingApostrophe;
//  3. strip non-spacing marks (Mn) by decomposing to NFD, removing them and
//     recomposing to NFC, then lower-case again;
//  4. replace parenthesised numbers such as "(2011)" with a space;
//  5. replace every rune that is not a letter, a digit or '.' with a space;
//  6. collapse runs of white space and trim both ends.
func NormalizeTitle(title string) string {
	normalizedTitle := strings.ToLower(RomanizeHepburn(title))
	normalizedTitle = RemoveTrailingApostrophe(normalizedTitle)
	// The transform error is deliberately ignored: normalisation is
	// best-effort and continues with whatever transform.String returned.
	normalizedTitle, _, _ = transform.String(transform.Chain(
		norm.NFD,
		transform.RemoveFunc(func(r rune) bool {
			return unicode.Is(unicode.Mn, r)
		}),
		norm.NFC), normalizedTitle)
	normalizedTitle = strings.ToLower(normalizedTitle)
	normalizedTitle = normalizeTitleParenNumberRE.ReplaceAllString(normalizedTitle, " ")
	normalizedTitle = strings.Map(func(r rune) rune {
		if !unicode.IsLetter(r) && !unicode.IsDigit(r) && r != '.' {
			return ' '
		}
		return r
	}, normalizedTitle)
	normalizedTitle = normalizeTitleSpacesRE.ReplaceAllString(normalizedTitle, " ")

	return strings.TrimSpace(normalizedTitle)
}
Example #15
0
// loadSpoolFiles walks the spool tree rooted at dirname. depth is the number
// of directory levels still expected below dirname.
//
// While depth > 0 only sub-directories are considered: those named with a
// single ASCII hex digit are descended into, other single-character
// directories are reported with o.Warn, and everything else is ignored.
// At depth 0 only entries with no file-type bits set are considered; each one
// is handed to restoreSpoolFile.
func loadSpoolFiles(dirname string, depth int) {
	dh, err := os.Open(dirname)
	o.MightFail(err, "Couldn't open %s", dirname)
	nodes, err := dh.Readdir(-1)
	// The listing is complete, so release the handle now rather than holding
	// it across the recursion below. The Close error of a read-only
	// directory handle carries no useful information here.
	dh.Close()
	o.MightFail(err, "Couldn't readdir on %s", dirname)

	if depth > 0 {
		for _, n := range nodes {
			abspath := path.Join(dirname, n.Name())
			if (n.Mode() & os.ModeType) != os.ModeDir {
				continue
			}
			// if not a single character, it's not a spool node.
			if len(n.Name()) != 1 {
				continue
			}
			if n.Name() == "." {
				// we're not interested in .
				continue
			}
			nrunes := []rune(n.Name())
			if unicode.Is(unicode.ASCII_Hex_Digit, nrunes[0]) {
				loadSpoolFiles(abspath, depth-1)
			} else {
				o.Warn("Foreign dirent %s found in spool tree", abspath)
			}
		}
		return
	}

	// depth == 0 - only interested in files.
	for _, n := range nodes {
		if n.Mode()&os.ModeType != 0 {
			continue
		}
		restoreSpoolFile(path.Join(dirname, n.Name()), n.Name())
	}
}

// restoreSpoolFile loads the job request stored in the spool file abspath,
// whose base name is name, and registers it with RestoreJobState. The name
// must be 16 characters long and parse as a hexadecimal uint64 equal to the
// Id of the parsed request; a file failing any step is passed to
// shuffleToCorrupted with the reason.
//
// Handling each file in its own call means the deferred Close runs as soon as
// that file is done, instead of every file staying open until the whole
// directory has been processed.
func restoreSpoolFile(abspath, name string) {
	if len(name) != 16 {
		shuffleToCorrupted(abspath, "Filename incorrect length")
		return
	}
	id, err := strconv.ParseUint(name, 16, 64)
	if err != nil {
		shuffleToCorrupted(abspath, "Invalid Filename")
		return
	}
	fh, err := os.Open(abspath)
	if err != nil {
		shuffleToCorrupted(abspath, "Couldn't open")
		return
	}
	defer fh.Close()
	jr, err := JobRequestFromReader(fh)
	if err != nil || jr.Id != id {
		o.Warn("Couldn't parse?! %s", err)
		shuffleToCorrupted(abspath, "Parse Failure")
		return
	}
	// Add the request to the registry directly.
	if !RestoreJobState(jr) {
		shuffleToCorrupted(abspath, "Job State Invalid")
	}
}
Example #16
0
// IsChineseChar reports whether str contains at least one rune of the Han
// script.
func IsChineseChar(str string) bool {
	han := unicode.Scripts["Han"]
	for _, r := range str {
		if !unicode.Is(han, r) {
			continue
		}
		return true
	}
	return false
}
Example #17
0
// lookupScript returns a range table from scripts that contains r, or nil if
// none of them does.
func lookupScript(r rune) *unicode.RangeTable {
	for tab := range scripts {
		if !unicode.Is(tab, r) {
			continue
		}
		return tab
	}
	return nil
}
Example #18
0
// IsHalfwidth reports whether char is contained in any of the range tables
// returned by Halfwidth, i.e. whether it is an East Asian half-width
// character.
func IsHalfwidth(char rune) bool {
	for _, tab := range Halfwidth() {
		if !unicode.Is(tab, char) {
			continue
		}
		return true
	}
	return false
}
Example #19
0
// IsFullwidth reports whether char is contained in any of the range tables
// returned by Fullwidth, i.e. whether it is an East Asian full-width
// character.
func IsFullwidth(char rune) bool {
	for _, tab := range Fullwidth() {
		if !unicode.Is(tab, char) {
			continue
		}
		return true
	}
	return false
}
Example #20
0
// AcceptRange advances l's position if the current rune is in tab and reports
// whether it did. When the rune is not in tab the advance is undone with
// Backup, leaving the position where it was.
func (l *Lexer) AcceptRange(tab *unicode.RangeTable) (ok bool) {
	r, _ := l.Advance()
	if unicode.Is(tab, r) {
		return true
	}
	l.Backup()
	return false
}
Example #21
0
// TestMap exercises Map on strings: growing and shrinking the encoded size,
// rot13 and its inverse, dropping runes by returning -1, and the identity
// mapping.
func TestMap(t *testing.T) {
	// Run a couple of awful growth/shrinkage tests
	a := tenRunes('a')
	// 1.  Grow.  This triggers two reallocations in Map.
	maxRune := func(rune) rune { return unicode.MaxRune }
	m := Map(maxRune, a)
	expect := tenRunes(unicode.MaxRune)
	if m != expect {
		t.Errorf("growing: expected %q got %q", expect, m)
	}

	// 2. Shrink
	minRune := func(rune) rune { return 'a' }
	m = Map(minRune, tenRunes(unicode.MaxRune))
	expect = a
	if m != expect {
		t.Errorf("shrinking: expected %q got %q", expect, m)
	}

	// 3. Rot13
	m = Map(rot13, "a to zed")
	expect = "n gb mrq"
	if m != expect {
		t.Errorf("rot13: expected %q got %q", expect, m)
	}

	// 4. Rot13^2
	m = Map(rot13, Map(rot13, "a to zed"))
	expect = "a to zed"
	if m != expect {
		t.Errorf("rot13: expected %q got %q", expect, m)
	}

	// 5. Drop
	dropNotLatin := func(r rune) rune {
		if unicode.Is(unicode.Latin, r) {
			return r
		}
		return -1
	}
	m = Map(dropNotLatin, "Hello, 세계")
	expect = "Hello"
	if m != expect {
		t.Errorf("drop: expected %q got %q", expect, m)
	}

	// 6. Identity
	identity := func(r rune) rune {
		return r
	}
	orig := "Input string that we expect not to be copied."
	m = Map(identity, orig)
	// The no-copy check below is disabled, so at least verify that the
	// identity mapping leaves the contents unchanged.
	if m != orig {
		t.Errorf("identity: expected %q got %q", orig, m)
	}
	//Haxe does not store strings using StringHeader
	//if (*reflect.StringHeader)(unsafe.Pointer(&orig)).Data !=
	//	(*reflect.StringHeader)(unsafe.Pointer(&m)).Data {
	//	t.Error("unexpected copy during identity map")
	//}
}
Example #22
0
// Nsentences returns the number of sentences in text. A sentence is a maximal
// stretch of text that contains no sentence-terminating rune (unicode.STerm,
// e.g. '.', '!' or '?') and at least one non-space rune. Text after the last
// terminator counts as a sentence as well, so "no terminator" yields 1, while
// consecutive terminators ("Wait...") and fragments consisting only of white
// space ("Hello.\n") add nothing.
func Nsentences(text string) int64 {
	s := bufio.NewScanner(strings.NewReader(text))
	// Let a single token grow to the size of the whole input; with the
	// default limit a sentence longer than bufio.MaxScanTokenSize would end
	// the scan early and silently under-count.
	s.Buffer(nil, len(text)+1)

	s.Split(func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		// Skip terminators left over from the end of the previous sentence.
		start := 0
		for start < len(data) {
			r, width := utf8.DecodeRune(data[start:])
			if !unicode.Is(unicode.STerm, r) {
				break
			}
			start += width
		}

		// Look for the terminator that ends the sentence beginning at start.
		// The search must begin at start, not 0: otherwise a skipped leading
		// terminator would be found again and data[start:i] would be sliced
		// with start > i.
		for i := start; i < len(data); {
			r, width := utf8.DecodeRune(data[i:])
			if unicode.Is(unicode.STerm, r) {
				return i + width, data[start:i], nil
			}
			i += width
		}

		// No terminator in sight: at end of input the remainder is the final
		// sentence.
		if atEOF && start < len(data) {
			return len(data), data[start:], nil
		}

		// Need more data (or nothing is left); drop the skipped terminators.
		return start, nil, nil
	})

	var count int64

	for s.Scan() {
		// Fragments made only of white space are not sentences.
		if strings.TrimSpace(s.Text()) == "" {
			continue
		}
		count++
	}

	return count
}
Example #23
0
// verifyRange compares, for every index of chars, the answer of inCategory
// with the answer of unicode.Is for table, and reports each disagreement on
// standard error, labelled with name.
//
// NOTE(review): the []unicode.Range parameter type belongs to an early,
// pre-Go 1 version of the unicode package; confirm the toolchain this file
// targets before modernising the signature.
func verifyRange(name string, inCategory Op, table []unicode.Range) {
	for r := range chars {
		fromWeb, fromPkg := inCategory(r), unicode.Is(table, r)
		if fromWeb == fromPkg {
			continue
		}
		fmt.Fprintf(os.Stderr, "%s: U+%04X: web=%t pkg=%t\n", name, r, fromWeb, fromPkg)
	}
}
// BenchmarkMerged measures unicode.Is lookups against the single table that
// Merge builds from unicode.GraphicRanges, testing every element of runes
// once per benchmark iteration.
func BenchmarkMerged(t *testing.B) {
	merged := Merge(unicode.GraphicRanges...)

	for left := t.N; left > 0; left-- {
		for _, r := range runes {
			unicode.Is(merged, r)
		}
	}
}
Example #25
0
// needQuote reports whether str contains a rune that requires the string to
// be quoted: white space, a non-printable rune, or a quotation mark.
func needQuote(str string) bool {
	for _, r := range str {
		switch {
		case unicode.IsSpace(r),
			!unicode.IsPrint(r),
			unicode.Is(unicode.Quotation_Mark, r):
			return true
		}
	}

	return false
}
Example #26
0
// TestMap exercises Map on byte slices: growing and shrinking the encoded
// size, rot13 and its inverse, dropping runes by returning -1, and mapping to
// an invalid rune.
func TestMap(t *testing.T) {
	// check reports a failure for the named case when got, read as a string,
	// differs from want.
	check := func(label string, got []byte, want string) {
		if string(got) != want {
			t.Errorf("%s: expected %q got %q", label, want, got)
		}
	}

	// Run a couple of awful growth/shrinkage tests
	a := tenRunes('a')

	// 1.  Grow. This triggers two reallocations in Map.
	maxRune := func(r rune) rune { return unicode.MaxRune }
	check("growing", Map(maxRune, []byte(a)), tenRunes(unicode.MaxRune))

	// 2. Shrink
	minRune := func(r rune) rune { return 'a' }
	check("shrinking", Map(minRune, []byte(tenRunes(unicode.MaxRune))), a)

	// 3. Rot13
	check("rot13", Map(rot13, []byte("a to zed")), "n gb mrq")

	// 4. Rot13^2
	check("rot13", Map(rot13, Map(rot13, []byte("a to zed"))), "a to zed")

	// 5. Drop
	dropNotLatin := func(r rune) rune {
		if !unicode.Is(unicode.Latin, r) {
			return -1
		}
		return r
	}
	check("drop", Map(dropNotLatin, []byte("Hello, 세계")), "Hello")

	// 6. Invalid rune
	invalidRune := func(r rune) rune {
		return utf8.MaxRune + 1
	}
	check("invalidRune", Map(invalidRune, []byte("x")), "\uFFFD")
}
Example #27
0
// TestIsControl compares membership in unicode.Cc with a hand-written
// control-character predicate for the code points 0 through 255. The test is
// currently skipped unconditionally.
//
// NOTE(review): unicode.Cc also contains U+0080 through U+009F, which the
// predicate below does not cover, so the comparison would fail for those
// values if the skip were removed — confirm the intended expectation before
// re-enabling.
func TestIsControl(t *testing.T) {
	t.Skip()
	for i := 0; i < 256; i++ {
		control := i < 0x20 || i == 0x7f
		if lib := unicode.Is(unicode.Cc, rune(i)); control != lib {
			// lib is a bool, so it is formatted with %t.
			t.Errorf("%x: is control? %t", i, lib)
		}
	}
}
Example #28
0
// IndexFunc returns the index into s of the first Unicode code point
// satisfying f(c), or -1 if none do.
//
// Before returning it also prints, as a demonstration, the result of the same
// search for the first Han character in two fixed sample strings.
func IndexFunc(s string, f func(rune) bool) int {
	isHan := func(c rune) bool {
		return unicode.Is(unicode.Han, c)
	}

	// Prints 7 for the first sample and -1 for the second.
	for _, sample := range []string{"Hello, 世界", "Hello, world"} {
		fmt.Println(strings.IndexFunc(sample, isHan))
	}
	return strings.IndexFunc(s, f)
}
Example #29
0
// LastIndexFunc returns the index into s of the last Unicode code point
// satisfying f(c), or -1 if none do.
//
// Before returning it also prints, as a demonstration, the result of the same
// search for the last Han character in two fixed sample strings.
func LastIndexFunc(s string, f func(rune) bool) int {
	isHan := func(c rune) bool {
		return unicode.Is(unicode.Han, c)
	}

	// Prints 9 for the first sample (each Han character occupies 3 bytes, so
	// the last one starts at byte 9) and -1 for the second.
	for _, sample := range []string{"hello 世界", "hello world"} {
		fmt.Println(strings.LastIndexFunc(sample, isHan))
	}
	return strings.LastIndexFunc(s, f)
}
Example #30
0
// chinese reports whether words contains at least one rune of the Han script.
func chinese(words string) (zh bool) {
	han := unicode.Scripts["Han"]
	for _, r := range words {
		if unicode.Is(han, r) {
			return true
		}
	}
	return false
}