// String is a best-effort attempt to get a UTF-8 encoded version of // Value. Only MicrosoftUnicode (3,1 ,X), MacRomain (1,0,X) and Unicode platform // strings are supported. func (nameEntry *NameEntry) String() string { if nameEntry.PlatformID == PlatformUnicode || (nameEntry.PlatformID == PlatformMicrosoft && nameEntry.EncodingID == PlatformEncodingMicrosoftUnicode) { decoder := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder() outstr, _, err := transform.String(decoder, string(nameEntry.Value)) if err == nil { return outstr } } if nameEntry.PlatformID == PlatformMac && nameEntry.EncodingID == PlatformEncodingMacRoman { decoder := charmap.Macintosh.NewDecoder() outstr, _, err := transform.String(decoder, string(nameEntry.Value)) if err == nil { return outstr } } return string(nameEntry.Value) }
// Predict takes in a document, predicts the // class of the document based on the training // data passed so far, and returns the class // estimated for the document. func (b *NaiveBayes) Predict(sentence string) uint8 { sums := make([]float64, len(b.Count)) sentence, _, _ = transform.String(b.sanitize, sentence) w := strings.Split(strings.ToLower(sentence), " ") for _, word := range w { if _, ok := b.Words[word]; !ok { continue } for i := range sums { sums[i] += math.Log(float64(b.Words[word].Count[i]+1) / float64(b.Words[word].Seen+b.DictCount)) } } for i := range sums { sums[i] += math.Log(b.Probabilities[i]) } // find best class var maxI int for i := range sums { if sums[i] > sums[maxI] { maxI = i } } return uint8(maxI) }
func (e *Evaluator) funcConvert(f *ast.FuncConvertExpr) bool { value := f.Expr.GetValue() // Casting nil to any type returns nil if value == nil { f.SetValue(nil) return true } str, ok := value.(string) if !ok { return true } if strings.ToLower(f.Charset) == "ascii" { f.SetValue(value) return true } else if strings.ToLower(f.Charset) == "utf8mb4" { f.SetValue(value) return true } encoding, _ := charset.Lookup(f.Charset) if encoding == nil { e.err = ErrInvalidOperation.Gen("unknown encoding: %s", f.Charset) return false } target, _, err := transform.String(encoding.NewDecoder(), str) if err != nil { log.Errorf("Convert %s to %s with error: %v", str, f.Charset, err) e.err = errors.Trace(err) return false } f.SetValue(target) return true }
// See https://dev.mysql.com/doc/refman/5.7/en/cast-functions.html#function_convert func builtinConvert(args []types.Datum, _ context.Context) (d types.Datum, err error) { // Casting nil to any type returns nil if args[0].Kind() != types.KindString { return d, nil } str := args[0].GetString() Charset := args[1].GetString() if strings.ToLower(Charset) == "ascii" { d.SetString(str) return d, nil } else if strings.ToLower(Charset) == "utf8mb4" { d.SetString(str) return d, nil } encoding, _ := charset.Lookup(Charset) if encoding == nil { return d, errors.Errorf("unknown encoding: %s", Charset) } target, _, err := transform.String(encoding.NewDecoder(), str) if err != nil { log.Errorf("Convert %s to %s with error: %v", str, Charset, err) return d, errors.Trace(err) } d.SetString(target) return d, nil }
func TestNonRepertoire(t *testing.T) { testCases := []struct { init func(e encoding.Encoding) (string, transform.Transformer, error) e encoding.Encoding src, want string }{ {dec, EUCKR, "\xfe\xfe", "\ufffd"}, // {dec, EUCKR, "א", "\ufffd"}, // TODO: why is this different? {enc, EUCKR, "א", ""}, {enc, EUCKR, "aא", "a"}, {enc, EUCKR, "\uac00א", "\xb0\xa1"}, // TODO: should we also handle Jamo? } for _, tc := range testCases { dir, tr, wantErr := tc.init(tc.e) dst, _, err := transform.String(tr, tc.src) if err != wantErr { t.Errorf("%s %v(%q): got %v; want %v", dir, tc.e, tc.src, err, wantErr) } if got := string(dst); got != tc.want { t.Errorf("%s %v(%q):\ngot %q\nwant %q", dir, tc.e, tc.src, got, tc.want) } } }
func TestNonRepertoire(t *testing.T) { testCases := []struct { init func(e encoding.Encoding) (string, transform.Transformer, error) e encoding.Encoding src, want string }{ {dec, EUCJP, "\xfe\xfc", "\ufffd"}, {dec, ISO2022JP, "\x1b$B\x7e\x7e", "\ufffd"}, {dec, ShiftJIS, "\xef\xfc", "\ufffd"}, {enc, EUCJP, "갂", ""}, {enc, EUCJP, "a갂", "a"}, {enc, EUCJP, "丌갂", "\x8f\xb0\xa4"}, {enc, ISO2022JP, "갂", ""}, {enc, ISO2022JP, "a갂", "a"}, {enc, ISO2022JP, "朗갂", "\x1b$BzF\x1b(B"}, // switch back to ASCII mode at end {enc, ShiftJIS, "갂", ""}, {enc, ShiftJIS, "a갂", "a"}, {enc, ShiftJIS, "\u2190갂", "\x81\xa9"}, } for _, tc := range testCases { dir, tr, wantErr := tc.init(tc.e) dst, _, err := transform.String(tr, tc.src) if err != wantErr { t.Errorf("%s %v(%q): got %v; want %v", dir, tc.e, tc.src, err, wantErr) } if got := string(dst); got != tc.want { t.Errorf("%s %v(%q):\ngot %q\nwant %q", dir, tc.e, tc.src, got, tc.want) } } }
// String converts the given encoded string to UTF-8. It returns the converted // string or "", err if any error occurred. func (d *Decoder) String(s string) (string, error) { s, _, err := transform.String(d, s) if err != nil { return "", err } return s, nil }
func TestNonRepertoire(t *testing.T) { testCases := []struct { init func(e encoding.Encoding) (string, transform.Transformer, error) e encoding.Encoding src, want string }{ {dec, GBK, "a\xfe\xfeb", "a\ufffdb"}, {dec, HZGB2312, "~{z~", "\ufffd"}, {enc, GBK, "갂", ""}, {enc, GBK, "a갂", "a"}, {enc, GBK, "\u4e02갂", "\x81@"}, {enc, HZGB2312, "갂", ""}, {enc, HZGB2312, "a갂", "a"}, {enc, HZGB2312, "\u6cf5갂", "~{1C~}"}, } for _, tc := range testCases { dir, tr, wantErr := tc.init(tc.e) dst, _, err := transform.String(tr, tc.src) if err != wantErr { t.Errorf("%s %v(%q): got %v; want %v", dir, tc.e, tc.src, err, wantErr) } if got := string(dst); got != tc.want { t.Errorf("%s %v(%q):\ngot %q\nwant %q", dir, tc.e, tc.src, got, tc.want) } } }
// Predict takes in a document, predicts the // class of the document based on the training // data passed so far, and returns the class // estimated for the document. func (b *NaiveBayes) Predict(sentence string) uint8 { sums := make([]float64, len(b.Count)) sentence, _, _ = transform.String(b.sanitize, sentence) words := b.Tokenizer.Tokenize(sentence) for _, word := range words { w, ok := b.Words.Get(word) if !ok { continue } for i := range sums { sums[i] += math.Log(float64(w.Count[i]+1) / float64(w.Seen+b.DictCount)) } } for i := range sums { sums[i] += math.Log(b.Probabilities[i]) } // find best class var maxI int for i := range sums { if sums[i] > sums[maxI] { maxI = i } } return uint8(maxI) }
// Probability takes in a small document, returns the // estimated class of the document based on the model // as well as the probability that the model is part // of that class // // NOTE: you should only use this for small documents // because, as discussed in the docs for the model, the // probability will often times underflow because you // are multiplying together a bunch of probabilities // which range on [0,1]. As such, the returned float // could be NaN, and the predicted class could be // 0 always. // // Basically, use Predict to be robust for larger // documents. Use Probability only on relatively small // (MAX of maybe a dozen words - basically just // sentences and words) documents. func (b *NaiveBayes) Probability(sentence string) (uint8, float64) { sums := make([]float64, len(b.Count)) for i := range sums { sums[i] = 1 } sentence, _, _ = transform.String(b.sanitize, sentence) words := b.Tokenizer.Tokenize(sentence) for _, word := range words { w, ok := b.Words.Get(word) if !ok { continue } for i := range sums { sums[i] *= float64(w.Count[i]+1) / float64(w.Seen+b.DictCount) } } for i := range sums { sums[i] *= b.Probabilities[i] } var denom float64 var maxI int for i := range sums { if sums[i] > sums[maxI] { maxI = i } denom += sums[i] } return uint8(maxI), sums[maxI] / denom }
func ToUtf8WithErr(content []byte) (error, string) { charsetLabel, err := DetectEncoding(content) if err != nil { return err, "" } if charsetLabel == "utf8" { return nil, string(content) } encoding, _ := charset.Lookup(charsetLabel) if encoding == nil { return fmt.Errorf("unknow char decoder %s", charsetLabel), string(content) } result, n, err := transform.String(encoding.NewDecoder(), string(content)) // If there is an error, we concatenate the nicely decoded part and the // original left over. This way we won't loose data. if err != nil { result = result + string(content[n:]) } return err, result }
// String returns a string with the result of converting s using t. It calls // Reset on t. It returns the empty string if any error was found. This can only // happen if an error-producing Transformer is passed to If. func (t Transformer) String(s string) string { s, _, err := transform.String(t, s) if err != nil { return "" } return s }
func TestNormalize(t *testing.T) { testCases := []struct { in string want string }{ {"hello, world\r\n", "hello, world\n"}, {"hello, world\r", "hello, world\n"}, {"hello, world\n", "hello, world\n"}, {"", ""}, {"\r\n", "\n"}, {"hello,\r\nworld", "hello,\nworld"}, {"hello,\rworld", "hello,\nworld"}, {"hello,\nworld", "hello,\nworld"}, {"hello,\n\rworld", "hello,\n\nworld"}, {"hello,\r\n\r\nworld", "hello,\n\nworld"}, } n := new(Normalize) for _, c := range testCases { got, _, err := transform.String(n, c.in) if err != nil { t.Errorf("error transforming %q: %v", c.in, err) continue } if got != c.want { t.Errorf("transforming %q: got %q, want %q", c.in, got, c.want) } } }
// String returns a string with the result of applying the profile to s. func (p Profile) String(s string) (string, error) { s, _, err := transform.String(p.NewTransformer(), s) if err == nil && p.options.disallowEmpty && len(s) == 0 { return s, errors.New("enforce resulted in empty string") } return s, err }
// Eval implements the Expression Eval interface. func (f *FunctionConvert) Eval(ctx context.Context, args map[interface{}]interface{}) (interface{}, error) { value, err := f.Expr.Eval(ctx, args) if err != nil { return nil, err } // Casting nil to any type returns nil if value == nil { return nil, nil } str, ok := value.(string) if !ok { return nil, nil } if strings.ToLower(f.Charset) == "ascii" { return value, nil } else if strings.ToLower(f.Charset) == "utf8mb4" { return value, nil } encoding, _ := Lookup(f.Charset) if encoding == nil { return nil, fmt.Errorf("unknown encoding: %s", f.Charset) } target, _, err := transform.String(encoding.NewDecoder(), str) if err != nil { log.Errorf("Convert %s to %s with error: %v", str, f.Charset, err) return nil, errors.Trace(err) } return target, nil }
// Probability takes in a small document, returns the // estimated class of the document based on the model // as well as the probability that the model is part // of that class // // NOTE: you should only use this for small documents // because, as discussed in the docs for the model, the // probability will often times underflow because you // are multiplying together a bunch of probabilities // which range on [0,1]. As such, the returned float // could be NaN, and the predicted class could be // 0 always. // // Basically, use Predict to be robust for larger // documents. Use Probability only on relatively small // (MAX of maybe a dozen words - basically just // sentences and words) documents. func (b *NaiveBayes) Probability(sentence string) (uint8, float64) { sums := make([]float64, len(b.Count)) for i := range sums { sums[i] = 1 } sentence, _, _ = transform.String(b.sanitize, sentence) w := strings.Split(strings.ToLower(sentence), " ") for _, word := range w { if _, ok := b.Words[word]; !ok { continue } for i := range sums { sums[i] *= float64(b.Words[word].Count[i]+1) / float64(b.Words[word].Seen+b.DictCount) } } for i := range sums { sums[i] *= b.Probabilities[i] } var denom float64 var maxI int for i := range sums { if sums[i] > sums[maxI] { maxI = i } denom += sums[i] } return uint8(maxI), sums[maxI] / denom }
func NormalizeTitle(title string) string { normalizedTitle := title normalizedTitle = strings.ToLower(normalizedTitle) normalizedTitle = RomanizeHepburn(title) normalizedTitle = strings.ToLower(normalizedTitle) normalizedTitle = RemoveTrailingApostrophe(normalizedTitle) normalizedTitle, _, _ = transform.String(transform.Chain( norm.NFD, transform.RemoveFunc(func(r rune) bool { return unicode.Is(unicode.Mn, r) }), norm.NFC), normalizedTitle) normalizedTitle = strings.ToLower(normalizedTitle) normalizedTitle = regexp.MustCompile(`\(\d+\)`).ReplaceAllString(normalizedTitle, " ") normalizedTitle = strings.Map(func(r rune) rune { if !unicode.IsLetter(r) && !unicode.IsDigit(r) && r != '.' { return ' ' } return r }, normalizedTitle) normalizedTitle = regexp.MustCompile(`\s+`).ReplaceAllString(normalizedTitle, " ") normalizedTitle = strings.TrimSpace(normalizedTitle) return normalizedTitle }
func removeNlChars(str string) string { isOk := func(r rune) bool { return r < 32 || r >= 127 } t := transform.Chain(norm.NFKD, transform.RemoveFunc(isOk)) str, _, _ = transform.String(t, str) return str }
func TestLettersShouldPass1(t *testing.T) { s, _, _ := transform.String(transform.RemoveFunc(OnlyLetters), "THIS iz A L337 aNd Un'Sani~~~~tized sentence") sanitized := []rune(s) for i := range sanitized { assert.False(t, OnlyLetters(sanitized[i]), "Letter %v should be sanitized", sanitized[i]) } }
// stripAccents attempts to replace accented characters with an ASCII // equivalent. This is an extreme oversimplication, but since cobe // only uses this to create token equivalence (these strings are never // displayed) it gets a pass. func stripAccents(s string) string { s2, _, err := transform.String(stripT, s) if err != nil { return s } return s2 }
func QueryEscape(charset, content string) string { encoding := GetCharset(charset) new_content, _, err := transform.String(encoding.NewEncoder(), content) if err != nil { return content } return url.QueryEscape(new_content) }
func ExampleRemove() { t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) s, _, _ := transform.String(t, "résumé") fmt.Println(s) // Output: // resume }
// http://blog.golang.org/normalization#TOC_10. func normalize(normalizer transform.Transformer, in string) string { result, _, err := transform.String(normalizer, in) if err != nil { log.Printf("unable to transform text:\n\n%q\nerr: %s", in, err) return in } // replace non-breaking spaces! return strings.Replace(result, "\u00a0", " ", -1) }
func ExampleIn() { // Convert Latin characters to their canonical form, while keeping other // width distinctions. t := runes.If(runes.In(unicode.Latin), width.Fold, nil) s, _, _ := transform.String(t, "アルアノリウ tech / アルアノリウ tech") fmt.Println(s) // Output: // アルアノリウ tech / アルアノリウ tech }
func ExampleIf() { // Widen everything but ASCII. isASCII := func(r rune) bool { return r <= unicode.MaxASCII } t := runes.If(runes.Predicate(isASCII), nil, width.Widen) s, _, _ := transform.String(t, "アルアノリウ tech / 中國 / 5₩") fmt.Println(s) // Output: // アルアノリウ tech / 中國 / 5₩ }
//function to sanitize input //from: http://rosettacode.org/wiki/Strip_control_codes_and_extended_characters_from_a_string#Go func stripCtlAndExtFromUnicode(str string) string { isOk := func(r rune) bool { return r < 32 || r >= 127 } // The isOk filter is such that there is no need to chain to norm.NFC t := transform.Chain(norm.NFKD, transform.RemoveFunc(isOk)) // This Transformer could also trivially be applied as an io.Reader // or io.Writer filter to automatically do such filtering when reading // or writing data anywhere. str, _, _ = transform.String(t, str) return str }
func TestAsciiLetters(t *testing.T) { tests := []testCase{ {"THIS iz A L337 aNd Un'Sani~~~~tized sentence", "THISizALaNdUnSanitizedsentence"}, {"here're some unicode letters: --Æ.ÒÑ", "hereresomeunicodeletters"}, {")(*&^%$@!@#$%^&*(*&^%$#@#$%", ""}, } for _, test := range tests { s, _, _ := transform.String(transform.RemoveFunc(OnlyAsciiLetters), test.input) if s != test.expectedOutput { t.Errorf("got \"%s\" expected \"%s\"\n", s, test.expectedOutput) } } }
// DecodeCharset detects charset of str decodes it. func decodeCharset(str, label string) (nstr string, err error) { enc, _ := charset.Lookup(label) if enc == nil { enc, _, _ = charset.DetermineEncoding([]byte(str), "text/plain") } nstr, _, err = transform.String(enc.NewDecoder(), str) if err != nil { return nstr, err } return stripNonUTF8(nstr), nil }
func ExampleMap() { replaceHyphens := runes.Map(func(r rune) rune { if unicode.Is(unicode.Hyphen, r) { return '|' } return r }) s, _, _ := transform.String(replaceHyphens, "a-b‐c⸗d﹣e") fmt.Println(s) // Output: // a|b|c|d|e }
func normalize(name, src string) (string, error) { if name == "" { name = baseWithoutExt(src) } t := transform.Chain(norm.NFD, transform.RemoveFunc(remove), norm.NFC) name = strings.TrimSpace(name) name, _, err := transform.String(t, name) if err != nil { return "", err } name = strings.ToLower(name) name = strings.Replace(name, " ", "_", -1) return name, nil }