// Matcher method for iBoundaryCase. If either left or right is not within the // target string, then -1 should be provided. func (s *instr) matchBoundaryMode(left rune, right rune) bool { if s.mode != iBoundaryCase { return false } switch s.lr { case bBeginText: return left == -1 case bBeginLine: return left == -1 || left == '\n' case bEndText: return right == -1 case bEndLine: return right == -1 || right == '\n' case bWordBoundary, bNotWordBoundary: // TODO: This is ASCII-only at this point. word_range := perl_groups['w'] whitespace_range := perl_groups['s'] wb := (unicode.Is(word_range, left) && unicode.Is(whitespace_range, right)) || (unicode.Is(whitespace_range, left) && unicode.Is(word_range, right)) if s.lr == bWordBoundary { return wb } else { return !wb } } panic("unexpected lr mode") }
// Get name: /first(first|second)*/ // Do not set p.err if the name is missing (unless unexpected EOF is received): // let the caller provide better context. func (p *Parser) name() (s string, ok bool) { var b byte if b, ok = p.mustgetc(); !ok { return } // As a first approximation, we gather the bytes [A-Za-z_:.-\x80-\xFF]* if b < utf8.RuneSelf && !isNameByte(b) { p.ungetc(b) return "", false } p.buf.Reset() p.buf.WriteByte(b) for { if b, ok = p.mustgetc(); !ok { return } if b < utf8.RuneSelf && !isNameByte(b) { p.ungetc(b) break } p.buf.WriteByte(b) } // Then we check the characters. s = p.buf.String() for i, c := range s { if !unicode.Is(first, c) && (i == 0 || !unicode.Is(second, c)) { p.err = p.syntaxError("invalid XML name: " + s) return "", false } } return s, true }
func main() { counter := make(map[string]int, 0) data := FetchURL(os.Args[1]) var buffer []rune printed := false for _, r := range data { if unicode.Is(unicode.Han, r) || unicode.Is(unicode.Hiragana, r) || unicode.Is(unicode.Katakana, r) || r == 'ー' { buffer = append(buffer, r) printed = false } else if printed != true { printed = true counter[string(buffer)] += 1 buffer = make([]rune, 0) } } for k, v := range counter { fmt.Println(v, k) } }
// reversePreservingCombiningCharacters reverses s, read as UTF-8, while
// keeping every base rune together with the combining marks (categories Mn,
// Me, Mc) that follow it. Bytes that do not form valid UTF-8 are dropped
// (they decode to utf8.RuneError, which is skipped). The result is UTF-8.
func reversePreservingCombiningCharacters(s string) string {
	if len(s) == 0 {
		return ""
	}

	in := []rune(s)
	out := make([]rune, len(in))
	// out is filled from the back; pos is the index of the first used slot.
	pos := len(out)

	for i := 0; i < len(in); {
		// quietly skip invalid UTF-8
		if in[i] == utf8.RuneError {
			i++
			continue
		}

		// Extend the cluster over all combining marks following in[i].
		end := i + 1
		for end < len(in) && unicode.In(in[end], unicode.Mn, unicode.Me, unicode.Mc) {
			end++
		}

		// Copy the cluster, in its original internal order, in front of the
		// clusters already written.
		pos -= end - i
		copy(out[pos:], in[i:end])
		i = end
	}
	return string(out[pos:])
}
// EncCharacter converts the code point char to its UTF-8 string form.
//
// It returns (false, "") when char belongs to one of the categories Cc, Cf,
// Co, Cs, Zl, Zp or Zs (control, format, private use, surrogate and
// separator characters). Otherwise it returns true and the encoded
// character. Values outside the Unicode range encode as U+FFFD, matching
// what a plain integer-to-string conversion produces.
func EncCharacter(char int) (bool, string) {
	// Guard before narrowing to rune so that large ints cannot wrap around
	// into a valid code point.
	if char < 0 || char > unicode.MaxRune {
		return true, string(unicode.ReplacementChar)
	}

	// unicode.Is/In take a rune; an int argument no longer compiles.
	r := rune(char)
	if unicode.In(r, unicode.Cc, unicode.Cf, unicode.Co, unicode.Cs,
		unicode.Zl, unicode.Zp, unicode.Zs) {
		return false, ""
	}
	return true, string(r)
}
func JconvCharset(str string) int { arr := JconvRune(str) // Hiragana test is_hiragana := true for _, r := range arr { if !unicode.Is(unicode.Hiragana, r) { is_hiragana = false break } } if is_hiragana { return 1 } // Katakana test is_katakana := true for _, r := range arr { if !unicode.Is(unicode.Katakana, r) && r != 'ー' { is_katakana = false break } } if is_katakana { return 2 } // Full cjk range rt := unicode.RangeTable{ R16: []unicode.Range16{ {Lo: 0x3000, Hi: 0x303f, Stride: 1}, // Punctuation {Lo: 0x3040, Hi: 0x309f, Stride: 1}, // Hiragana {Lo: 0x30a0, Hi: 0x30ff, Stride: 1}, // Katakana {Lo: 0x3400, Hi: 0x4dbf, Stride: 1}, // CJK unified ext A {Lo: 0x4e00, Hi: 0x9faf, Stride: 1}, // CJK unified {Lo: 0xff00, Hi: 0xffef, Stride: 1}, // Romanji and hw-katakana }, R32: []unicode.Range32{}, LatinOffset: 0, } is_cjk := true for _, r := range arr { if !unicode.Is(&rt, r) { is_cjk = false break } } if is_cjk { return 3 } // Failed to detect charset return 0 }
func (m *minificationText) processText(in string) string { var buffer bytes.Buffer var rRaw, r rune var size int prevIsSeparator := false prevRune := ' ' isFirst := true for len(in) > 0 { rRaw, size = utf8.DecodeRuneInString(in) r = unicode.ToLower(rRaw) isSeparator := !unicode.Is(notSeparatorRT, r) // digits if isSeparator && !prevIsSeparator { rRaw, _ = utf8.DecodeRuneInString(in[size:]) isSeparator = !m.isDigit(prevRune, r, rRaw) } if !isSeparator && prevIsSeparator && !isFirst { _ = buffer.WriteByte(' ') } if !isSeparator { _, _ = buffer.WriteRune(r) isFirst = false } prevIsSeparator = isSeparator prevRune = r in = in[size:] } return buffer.String() }
// main demonstrates a handful of functions from the strings package by
// printing their results.
func main() {
	// Substring and character-set membership.
	fmt.Println(strings.Contains("Hello, world!", "wo"))
	fmt.Println(strings.ContainsAny("Hello, world", "w o"))

	// Non-overlapping occurrences of a substring.
	fmt.Println(strings.Count("Hello Helium", "He"))

	// Joining and splitting.
	words := []string{"Hello,", "world"}
	fmt.Println(strings.Join(words, " "))

	parts := strings.Split("Hello, world", " ")
	fmt.Println(parts[1])

	fields := strings.Fields("Hello, world")
	fmt.Println(fields[1])

	// Splitting on every Hangul rune.
	isHangul := func(r rune) bool {
		return unicode.Is(unicode.Hangul, r)
	}
	fmt.Println(strings.FieldsFunc("Hello안녕Hello", isHangul))

	// Repetition and bounded replacement.
	fmt.Println(strings.Repeat("Hello", 10))
	fmt.Println(strings.Replace("Hello, world", "world", "go", 1))
	fmt.Println(strings.Replace("Hello Hello", "llo", "Go", 2))
}
func TestPredicate(t *testing.T) { testConditional(t, func(rt *unicode.RangeTable, t, f transform.Transformer) transform.Transformer { return If(Predicate(func(r rune) bool { return unicode.Is(rt, r) }), t, f) }) }
func lexPackageName(l *Lexer) stateFn { // lex package name var lastPeriod bool OUTER: for { switch r := l.next(); { case unicode.IsLetter(r): lastPeriod = false case r == '.' || r == '_': lastPeriod = true case unicode.Is(unicode.White_Space, r): l.backup() break OUTER default: l.backup() lastPeriod = false return l.errorf("expected newline after package name") } } if lastPeriod { return l.errorf("package names cannot end with a period or underscore") } // emit package name l.emit(TokenPackageName) return lexText }
// Generate a RuneFilter matching a valid Unicode class. If no matching classes // are found, then this method will return nil. // Note that if just a single character is given, Categories will be searched // for this as a prefix (so that 'N' will match 'Nd', 'Nl', 'No' etc). func matchUnicodeClass(class string) RuneFilter { found := false match := make([]*unicode.RangeTable, 0) if len(class) == 1 { // A single character is a shorthand request for any category starting with this. for key, r := range unicode.Categories { if key[0] == class[0] { found = true match = append(match, r) } } } else { // Search for the unicode class name inside cats/props/scripts. options := []map[string]*unicode.RangeTable{ unicode.Categories, unicode.Properties, unicode.Scripts} for _, option := range options { if r, ok := option[class]; ok { found = true match = append(match, r) } } } if found { return func(r rune) bool { for _, table := range match { if unicode.Is(table, r) { return true } } return false } } return nil }
// print_rune_is prints, one per line and prefixed by a space, the name of
// every table in props that contains char. The order of the lines follows
// map iteration and is therefore unspecified.
func print_rune_is(char rune, props map[string]*unicode.RangeTable) {
	for name := range props {
		if !unicode.Is(props[name], char) {
			continue
		}
		fmt.Println(" ", name)
	}
}
// main demonstrates the Index family of the strings package. All indexes
// are byte offsets; each Hangul syllable below takes three bytes in UTF-8.
func main() {
	fmt.Println(strings.Index("Hello, world!", "He"))  // 0: "He" is at the very start
	fmt.Println(strings.Index("Hello, world!", "wor")) // 7: "wor" starts at the 8th byte
	fmt.Println(strings.Index("Hello, world!", "ow"))  // -1: "ow" does not occur

	fmt.Println(strings.IndexAny("Hello, world!", "eo")) // 1: 'e' is the 2nd byte
	fmt.Println(strings.IndexAny("Hello, world!", "f"))  // -1: no 'f'

	b := byte('d')
	fmt.Println(strings.IndexByte("Hello, world!", b)) // 11: 'd' is the 12th byte
	b = 'f'
	fmt.Println(strings.IndexByte("Hello, world!", b)) // -1: no 'f'

	target := '언'
	fmt.Println(strings.IndexRune("고 언어", target)) // 4: "언" starts at byte 4

	// isHangul reports whether r is a Hangul code point.
	isHangul := func(r rune) bool {
		return unicode.Is(unicode.Hangul, r)
	}
	fmt.Println(strings.IndexFunc("Go 언어", isHangul))      // 3: Hangul starts at the 4th byte
	fmt.Println(strings.IndexFunc("Go Language", isHangul)) // -1: no Hangul

	fmt.Println(strings.LastIndex("Hello Hello Hello, world!", "Hello")) // 12: the last "Hello" starts at the 13th byte
	fmt.Println(strings.LastIndexAny("Hello, world", "ol"))              // 10: the last 'l' is the 11th byte
	fmt.Println(strings.LastIndexFunc("Go 언어 안녕", isHangul))             // 13: the last Hangul rune '녕' starts at byte 13
}
func NormalizeTitle(title string) string { normalizedTitle := title normalizedTitle = strings.ToLower(normalizedTitle) normalizedTitle = RomanizeHepburn(title) normalizedTitle = strings.ToLower(normalizedTitle) normalizedTitle = RemoveTrailingApostrophe(normalizedTitle) normalizedTitle, _, _ = transform.String(transform.Chain( norm.NFD, transform.RemoveFunc(func(r rune) bool { return unicode.Is(unicode.Mn, r) }), norm.NFC), normalizedTitle) normalizedTitle = strings.ToLower(normalizedTitle) normalizedTitle = regexp.MustCompile(`\(\d+\)`).ReplaceAllString(normalizedTitle, " ") normalizedTitle = strings.Map(func(r rune) rune { if !unicode.IsLetter(r) && !unicode.IsDigit(r) && r != '.' { return ' ' } return r }, normalizedTitle) normalizedTitle = regexp.MustCompile(`\s+`).ReplaceAllString(normalizedTitle, " ") normalizedTitle = strings.TrimSpace(normalizedTitle) return normalizedTitle }
func loadSpoolFiles(dirname string, depth int) { dh, err := os.Open(dirname) o.MightFail(err, "Couldn't open %s", dirname) nodes, err := dh.Readdir(-1) o.MightFail(err, "Couldn't readdir on %s", dirname) if depth > 0 { for _, n := range nodes { abspath := path.Join(dirname, n.Name()) if (n.Mode() & os.ModeType) == os.ModeDir { // if not a single character, it's not a spool node. if len(n.Name()) != 1 { continue } if n.Name() == "." { // we're not interested in . continue } nrunes := []rune(n.Name()) if unicode.Is(unicode.ASCII_Hex_Digit, nrunes[0]) { loadSpoolFiles(abspath, depth-1) } else { o.Warn("Foreign dirent %s found in spool tree", abspath) } } } } else { // depth == 0 - only interested in files. for _, n := range nodes { abspath := path.Join(dirname, n.Name()) if n.Mode()&os.ModeType == 0 { if len(n.Name()) != 16 { shuffleToCorrupted(abspath, "Filename incorrect length") continue } id, err := strconv.ParseUint(n.Name(), 16, 64) if err != nil { shuffleToCorrupted(abspath, "Invalid Filename") continue } fh, err := os.Open(abspath) if err != nil { shuffleToCorrupted(abspath, "Couldn't open") continue } defer fh.Close() jr, err := JobRequestFromReader(fh) if err != nil || jr.Id != id { o.Warn("Couldn't parse?! %s", err) shuffleToCorrupted(abspath, "Parse Failure") continue } // Add the request to the registry directly. if !RestoreJobState(jr) { shuffleToCorrupted(abspath, "Job State Invalid") } } } } }
// IsChineseChar reports whether str contains at least one rune of the Han
// script. An empty string yields false.
func IsChineseChar(str string) bool {
	han := unicode.Scripts["Han"]
	for _, r := range str {
		if !unicode.Is(han, r) {
			continue
		}
		return true
	}
	return false
}
func lookupScript(r rune) *unicode.RangeTable { for script := range scripts { if unicode.Is(script, r) { return script } } return nil }
// IsHalfwidth reports whether the rune is in range of half width character of East Asian. func IsHalfwidth(char rune) bool { for _, halfwidthRangeTable := range Halfwidth() { if unicode.Is(halfwidthRangeTable, char) { return true } } return false }
// IsFullwidth reports whether the rune is in range of full width character of East Asian. func IsFullwidth(char rune) bool { for _, fullwidthRangeTable := range Fullwidth() { if unicode.Is(fullwidthRangeTable, char) { return true } } return false }
// AcceptRange advances l's position if the current rune is in tab. func (l *Lexer) AcceptRange(tab *unicode.RangeTable) (ok bool) { r, _ := l.Advance() ok = unicode.Is(tab, r) if !ok { l.Backup() } return }
func TestMap(t *testing.T) { // Run a couple of awful growth/shrinkage tests a := tenRunes('a') // 1. Grow. This triggers two reallocations in Map. maxRune := func(rune) rune { return unicode.MaxRune } m := Map(maxRune, a) expect := tenRunes(unicode.MaxRune) if m != expect { t.Errorf("growing: expected %q got %q", expect, m) } // 2. Shrink minRune := func(rune) rune { return 'a' } m = Map(minRune, tenRunes(unicode.MaxRune)) expect = a if m != expect { t.Errorf("shrinking: expected %q got %q", expect, m) } // 3. Rot13 m = Map(rot13, "a to zed") expect = "n gb mrq" if m != expect { t.Errorf("rot13: expected %q got %q", expect, m) } // 4. Rot13^2 m = Map(rot13, Map(rot13, "a to zed")) expect = "a to zed" if m != expect { t.Errorf("rot13: expected %q got %q", expect, m) } // 5. Drop dropNotLatin := func(r rune) rune { if unicode.Is(unicode.Latin, r) { return r } return -1 } m = Map(dropNotLatin, "Hello, 세계") expect = "Hello" if m != expect { t.Errorf("drop: expected %q got %q", expect, m) } // 6. Identity identity := func(r rune) rune { return r } orig := "Input string that we expect not to be copied." m = Map(identity, orig) //Haxe does not store strings using StringHeader //if (*reflect.StringHeader)(unsafe.Pointer(&orig)).Data != // (*reflect.StringHeader)(unsafe.Pointer(&m)).Data { // t.Error("unexpected copy during identity map") //} }
// Nsentences counts the sentences in text, where a sentence is a non-empty
// run of runes delimited by sentence terminators (unicode.STerm, e.g. '.',
// '!', '?', '。'). Consecutive terminators do not produce empty sentences.
// Any text between terminators counts, including white space only, so
// "Hello. World.\n" yields 3.
func Nsentences(text string) int64 {
	s := bufio.NewScanner(strings.NewReader(text))
	// The whole text is already in memory, so allow a single sentence to be
	// as long as the text itself; the default 64 KiB token limit would make
	// Scan stop early and silently under-count.
	s.Buffer(nil, len(text)+1)

	s.Split(func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		// Skip terminators left over at the front of the chunk.
		start := 0
		for start < len(data) {
			r, width := utf8.DecodeRune(data[start:])
			if !unicode.Is(unicode.STerm, r) {
				break
			}
			start += width
		}

		// Look for the terminator that ends the sentence. The search must
		// begin at start: beginning at 0 would find a skipped leading
		// terminator and slice data[start:0], which panics.
		for i := start; i < len(data); {
			r, width := utf8.DecodeRune(data[i:])
			if unicode.Is(unicode.STerm, r) {
				return i + width, data[start:i], nil
			}
			i += width
		}

		// No terminator: at end of input the remainder is the last sentence.
		if atEOF && len(data) > start {
			return len(data), data[start:], nil
		}
		// Otherwise ask for more data (or finish, at end of input).
		return 0, nil, nil
	})

	var count int64
	for s.Scan() {
		count++
	}
	return count
}
// verifyRange compares, for every index of chars, the answer of inCategory
// with unicode.Is for table, and reports each disagreement on standard
// error, labelled with name.
func verifyRange(name string, inCategory Op, table []unicode.Range) {
	for i := range chars {
		if web, pkg := inCategory(i), unicode.Is(table, i); web != pkg {
			fmt.Fprintf(os.Stderr, "%s: U+%04X: web=%t pkg=%t\n", name, i, web, pkg)
		}
	}
}
func BenchmarkMerged(t *testing.B) { rt := Merge(unicode.GraphicRanges...) for i := 0; i < t.N; i++ { for _, r := range runes { unicode.Is(rt, r) } } }
// needQuote reports whether str contains a rune that forces quoting: white
// space, a non-printable rune, or a rune with the Quotation_Mark property.
// An empty string yields false.
func needQuote(str string) bool {
	for _, r := range str {
		switch {
		case unicode.IsSpace(r),
			!unicode.IsPrint(r),
			unicode.Is(unicode.Quotation_Mark, r):
			return true
		}
	}
	return false
}
func TestMap(t *testing.T) { // Run a couple of awful growth/shrinkage tests a := tenRunes('a') // 1. Grow. This triggers two reallocations in Map. maxRune := func(r rune) rune { return unicode.MaxRune } m := Map(maxRune, []byte(a)) expect := tenRunes(unicode.MaxRune) if string(m) != expect { t.Errorf("growing: expected %q got %q", expect, m) } // 2. Shrink minRune := func(r rune) rune { return 'a' } m = Map(minRune, []byte(tenRunes(unicode.MaxRune))) expect = a if string(m) != expect { t.Errorf("shrinking: expected %q got %q", expect, m) } // 3. Rot13 m = Map(rot13, []byte("a to zed")) expect = "n gb mrq" if string(m) != expect { t.Errorf("rot13: expected %q got %q", expect, m) } // 4. Rot13^2 m = Map(rot13, Map(rot13, []byte("a to zed"))) expect = "a to zed" if string(m) != expect { t.Errorf("rot13: expected %q got %q", expect, m) } // 5. Drop dropNotLatin := func(r rune) rune { if unicode.Is(unicode.Latin, r) { return r } return -1 } m = Map(dropNotLatin, []byte("Hello, 세계")) expect = "Hello" if string(m) != expect { t.Errorf("drop: expected %q got %q", expect, m) } // 6. Invalid rune invalidRune := func(r rune) rune { return utf8.MaxRune + 1 } m = Map(invalidRune, []byte("x")) expect = "\uFFFD" if string(m) != expect { t.Errorf("invalidRune: expected %q got %q", expect, m) } }
// TestIsControl checks, for every Latin-1 code point, that membership in
// unicode.Cc matches the set of control characters.
//
// The expectation used to cover only U+0000-U+001F and U+007F, which made
// the check fail for the C1 controls, so the test was skipped. With the C1
// block included the test runs again.
func TestIsControl(t *testing.T) {
	for i := 0; i < 256; i++ {
		// Cc is C0 (U+0000-U+001F), DEL (U+007F) and C1 (U+0080-U+009F).
		control := i < 0x20 || (i >= 0x7f && i <= 0x9f)
		if lib := unicode.Is(unicode.Cc, rune(i)); control != lib {
			t.Errorf("%x: is control? %t", i, lib)
		}
	}
}
// IndexFunc returns the byte index in s of the first Unicode code point
// satisfying f(c), or -1 if none do.
//
// Before delegating to strings.IndexFunc it prints two sample lookups that
// use a Han-script predicate.
func IndexFunc(s string, f func(rune) bool) int {
	isHan := func(c rune) bool {
		return unicode.Is(unicode.Han, c)
	}
	// Prints 7 for the first sample and -1 for the second.
	for _, sample := range []string{"Hello, 世界", "Hello, world"} {
		fmt.Println(strings.IndexFunc(sample, isHan))
	}
	return strings.IndexFunc(s, f)
}
// LastIndexFunc returns the byte index in s of the last Unicode code point
// satisfying f(c), or -1 if none do.
//
// Before delegating to strings.LastIndexFunc it prints two sample lookups
// that use a Han-script predicate.
func LastIndexFunc(s string, f func(rune) bool) int {
	isHan := func(c rune) bool {
		return unicode.Is(unicode.Han, c)
	}
	// Prints 9 for the first sample (each Han character takes three bytes in
	// UTF-8, so the last one starts at byte 9) and -1 for the second.
	for _, sample := range []string{"hello 世界", "hello world"} {
		fmt.Println(strings.LastIndexFunc(sample, isHan))
	}
	return strings.LastIndexFunc(s, f)
}
// chinese reports whether words contains at least one rune of the Han
// script.
func chinese(words string) (zh bool) {
	han := unicode.Scripts["Han"]
	for _, r := range words {
		if unicode.Is(han, r) {
			return true
		}
	}
	return false
}