Example #1
// String is a best-effort attempt to get a UTF-8 encoded version of
// Value. Only MicrosoftUnicode (3,1,X), MacRoman (1,0,X) and Unicode platform
// strings are supported.
func (nameEntry *NameEntry) String() string {

	if nameEntry.PlatformID == PlatformUnicode || (nameEntry.PlatformID == PlatformMicrosoft &&
		nameEntry.EncodingID == PlatformEncodingMicrosoftUnicode) {

		decoder := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder()

		outstr, _, err := transform.String(decoder, string(nameEntry.Value))

		if err == nil {
			return outstr
		}
	}

	if nameEntry.PlatformID == PlatformMac &&
		nameEntry.EncodingID == PlatformEncodingMacRoman {

		decoder := charmap.Macintosh.NewDecoder()

		outstr, _, err := transform.String(decoder, string(nameEntry.Value))

		if err == nil {
			return outstr
		}
	}

	return string(nameEntry.Value)
}
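The decode path above can be exercised on its own. A minimal sketch, assuming only golang.org/x/text (the byte literal and variable names are illustrative, not taken from the font package):

package main

import (
	"fmt"

	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

func main() {
	// "Font" encoded as big-endian UTF-16, as it might appear in a name record.
	raw := "\x00F\x00o\x00n\x00t"

	decoder := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder()
	out, _, err := transform.String(decoder, raw)
	if err != nil {
		out = raw // fall back to the raw bytes, as NameEntry.String does
	}
	fmt.Println(out) // Font
}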
Example #2
// Predict takes in a document and, based on the
// training data passed so far, returns the
// estimated class of the document.
func (b *NaiveBayes) Predict(sentence string) uint8 {
	sums := make([]float64, len(b.Count))

	sentence, _, _ = transform.String(b.sanitize, sentence)
	w := strings.Split(strings.ToLower(sentence), " ")
	for _, word := range w {
		if _, ok := b.Words[word]; !ok {
			continue
		}

		for i := range sums {
			sums[i] += math.Log(float64(b.Words[word].Count[i]+1) / float64(b.Words[word].Seen+b.DictCount))
		}
	}

	for i := range sums {
		sums[i] += math.Log(b.Probabilities[i])
	}

	// find best class
	var maxI int
	for i := range sums {
		if sums[i] > sums[maxI] {
			maxI = i
		}
	}

	return uint8(maxI)
}
Example #3
func (e *Evaluator) funcConvert(f *ast.FuncConvertExpr) bool {
	value := f.Expr.GetValue()

	// Casting nil to any type returns nil
	if value == nil {
		f.SetValue(nil)
		return true
	}
	str, ok := value.(string)
	if !ok {
		return true
	}
	if strings.ToLower(f.Charset) == "ascii" {
		f.SetValue(value)
		return true
	} else if strings.ToLower(f.Charset) == "utf8mb4" {
		f.SetValue(value)
		return true
	}

	encoding, _ := charset.Lookup(f.Charset)
	if encoding == nil {
		e.err = ErrInvalidOperation.Gen("unknown encoding: %s", f.Charset)
		return false
	}

	target, _, err := transform.String(encoding.NewDecoder(), str)
	if err != nil {
		log.Errorf("Convert %s to %s with error: %v", str, f.Charset, err)
		e.err = errors.Trace(err)
		return false
	}
	f.SetValue(target)
	return true
}
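The label-to-encoding lookup followed by a transform.String decode is the core of this convert implementation. A standalone sketch of that flow, assuming golang.org/x/net/html/charset for the lookup (the GBK byte literal is illustrative):

package main

import (
	"fmt"

	"golang.org/x/net/html/charset"
	"golang.org/x/text/transform"
)

func main() {
	gbk := "\xd6\xd0\xce\xc4" // "中文" encoded as GBK

	enc, _ := charset.Lookup("gbk")
	if enc == nil {
		fmt.Println("unknown encoding: gbk")
		return
	}

	out, _, err := transform.String(enc.NewDecoder(), gbk)
	if err != nil {
		fmt.Println("convert error:", err)
		return
	}
	fmt.Println(out) // 中文
}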
Example #4
// See https://dev.mysql.com/doc/refman/5.7/en/cast-functions.html#function_convert
func builtinConvert(args []types.Datum, _ context.Context) (d types.Datum, err error) {
	// Casting nil to any type returns nil
	if args[0].Kind() != types.KindString {
		return d, nil
	}

	str := args[0].GetString()
	Charset := args[1].GetString()

	if strings.ToLower(Charset) == "ascii" {
		d.SetString(str)
		return d, nil
	} else if strings.ToLower(Charset) == "utf8mb4" {
		d.SetString(str)
		return d, nil
	}

	encoding, _ := charset.Lookup(Charset)
	if encoding == nil {
		return d, errors.Errorf("unknown encoding: %s", Charset)
	}

	target, _, err := transform.String(encoding.NewDecoder(), str)
	if err != nil {
		log.Errorf("Convert %s to %s with error: %v", str, Charset, err)
		return d, errors.Trace(err)
	}
	d.SetString(target)
	return d, nil
}
Example #5
func TestNonRepertoire(t *testing.T) {
	testCases := []struct {
		init      func(e encoding.Encoding) (string, transform.Transformer, error)
		e         encoding.Encoding
		src, want string
	}{
		{dec, EUCKR, "\xfe\xfe", "\ufffd"},
		// {dec, EUCKR, "א", "\ufffd"}, // TODO: why is this different?

		{enc, EUCKR, "א", ""},
		{enc, EUCKR, "aא", "a"},
		{enc, EUCKR, "\uac00א", "\xb0\xa1"},
		// TODO: should we also handle Jamo?
	}
	for _, tc := range testCases {
		dir, tr, wantErr := tc.init(tc.e)

		dst, _, err := transform.String(tr, tc.src)
		if err != wantErr {
			t.Errorf("%s %v(%q): got %v; want %v", dir, tc.e, tc.src, err, wantErr)
		}
		if got := string(dst); got != tc.want {
			t.Errorf("%s %v(%q):\ngot  %q\nwant %q", dir, tc.e, tc.src, got, tc.want)
		}
	}
}
Example #6
func TestNonRepertoire(t *testing.T) {
	testCases := []struct {
		init      func(e encoding.Encoding) (string, transform.Transformer, error)
		e         encoding.Encoding
		src, want string
	}{
		{dec, EUCJP, "\xfe\xfc", "\ufffd"},
		{dec, ISO2022JP, "\x1b$B\x7e\x7e", "\ufffd"},
		{dec, ShiftJIS, "\xef\xfc", "\ufffd"},

		{enc, EUCJP, "갂", ""},
		{enc, EUCJP, "a갂", "a"},
		{enc, EUCJP, "丌갂", "\x8f\xb0\xa4"},

		{enc, ISO2022JP, "갂", ""},
		{enc, ISO2022JP, "a갂", "a"},
		{enc, ISO2022JP, "朗갂", "\x1b$BzF\x1b(B"}, // switch back to ASCII mode at end

		{enc, ShiftJIS, "갂", ""},
		{enc, ShiftJIS, "a갂", "a"},
		{enc, ShiftJIS, "\u2190갂", "\x81\xa9"},
	}
	for _, tc := range testCases {
		dir, tr, wantErr := tc.init(tc.e)

		dst, _, err := transform.String(tr, tc.src)
		if err != wantErr {
			t.Errorf("%s %v(%q): got %v; want %v", dir, tc.e, tc.src, err, wantErr)
		}
		if got := string(dst); got != tc.want {
			t.Errorf("%s %v(%q):\ngot  %q\nwant %q", dir, tc.e, tc.src, got, tc.want)
		}
	}
}
Example #7
// String converts the given encoded string to UTF-8. It returns the converted
// string or "", err if any error occurred.
func (d *Decoder) String(s string) (string, error) {
	s, _, err := transform.String(d, s)
	if err != nil {
		return "", err
	}
	return s, nil
}
Example #8
func TestNonRepertoire(t *testing.T) {
	testCases := []struct {
		init      func(e encoding.Encoding) (string, transform.Transformer, error)
		e         encoding.Encoding
		src, want string
	}{
		{dec, GBK, "a\xfe\xfeb", "a\ufffdb"},
		{dec, HZGB2312, "~{z~", "\ufffd"},

		{enc, GBK, "갂", ""},
		{enc, GBK, "a갂", "a"},
		{enc, GBK, "\u4e02갂", "\x81@"},

		{enc, HZGB2312, "갂", ""},
		{enc, HZGB2312, "a갂", "a"},
		{enc, HZGB2312, "\u6cf5갂", "~{1C~}"},
	}
	for _, tc := range testCases {
		dir, tr, wantErr := tc.init(tc.e)

		dst, _, err := transform.String(tr, tc.src)
		if err != wantErr {
			t.Errorf("%s %v(%q): got %v; want %v", dir, tc.e, tc.src, err, wantErr)
		}
		if got := string(dst); got != tc.want {
			t.Errorf("%s %v(%q):\ngot  %q\nwant %q", dir, tc.e, tc.src, got, tc.want)
		}
	}
}
Example #9
// Predict takes in a document and, based on the
// training data passed so far, returns the
// estimated class of the document.
func (b *NaiveBayes) Predict(sentence string) uint8 {
	sums := make([]float64, len(b.Count))

	sentence, _, _ = transform.String(b.sanitize, sentence)
	words := b.Tokenizer.Tokenize(sentence)
	for _, word := range words {
		w, ok := b.Words.Get(word)
		if !ok {
			continue
		}

		for i := range sums {
			sums[i] += math.Log(float64(w.Count[i]+1) / float64(w.Seen+b.DictCount))
		}
	}

	for i := range sums {
		sums[i] += math.Log(b.Probabilities[i])
	}

	// find best class
	var maxI int
	for i := range sums {
		if sums[i] > sums[maxI] {
			maxI = i
		}
	}

	return uint8(maxI)
}
Example #10
// Probability takes in a small document, returns the
// estimated class of the document based on the model
// as well as the probability that the document is part
// of that class
//
// NOTE: you should only use this for small documents
// because, as discussed in the docs for the model, the
// probability will often underflow because you
// are multiplying together a bunch of probabilities
// which lie in [0,1]. As such, the returned float
// could be NaN, and the predicted class could be
// 0 always.
//
// Basically, use Predict to be robust for larger
// documents. Use Probability only on relatively small
// (MAX of maybe a dozen words - basically just
// sentences and words) documents.
func (b *NaiveBayes) Probability(sentence string) (uint8, float64) {
	sums := make([]float64, len(b.Count))
	for i := range sums {
		sums[i] = 1
	}

	sentence, _, _ = transform.String(b.sanitize, sentence)
	words := b.Tokenizer.Tokenize(sentence)
	for _, word := range words {
		w, ok := b.Words.Get(word)
		if !ok {
			continue
		}

		for i := range sums {
			sums[i] *= float64(w.Count[i]+1) / float64(w.Seen+b.DictCount)
		}
	}

	for i := range sums {
		sums[i] *= b.Probabilities[i]
	}

	var denom float64
	var maxI int
	for i := range sums {
		if sums[i] > sums[maxI] {
			maxI = i
		}

		denom += sums[i]
	}

	return uint8(maxI), sums[maxI] / denom
}
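The underflow the comment warns about is easy to see in isolation. A minimal sketch contrasting the raw product used by Probability with the log sum used by Predict (the per-word likelihood and word count are made-up values):

package main

import (
	"fmt"
	"math"
)

func main() {
	const p = 1e-4    // made-up per-word likelihood
	const words = 100 // a modest document length

	product := 1.0
	logSum := 0.0
	for i := 0; i < words; i++ {
		product *= p          // what Probability does
		logSum += math.Log(p) // what Predict does
	}

	fmt.Println(product) // 0: the true value 1e-400 underflows float64
	fmt.Println(logSum)  // about -921.03, still usable for comparing classes
}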
Example #11
func ToUtf8WithErr(content []byte) (error, string) {
	charsetLabel, err := DetectEncoding(content)
	if err != nil {
		return err, ""
	}

	if charsetLabel == "utf8" {
		return nil, string(content)
	}

	encoding, _ := charset.Lookup(charsetLabel)

	if encoding == nil {
		return fmt.Errorf("unknown char decoder %s", charsetLabel), string(content)
	}

	result, n, err := transform.String(encoding.NewDecoder(), string(content))

	// If there is an error, we concatenate the nicely decoded part and the
	// original leftover. This way we won't lose data.
	if err != nil {
		result = result + string(content[n:])
	}

	return err, result
}
Example #12
// String returns a string with the result of converting s using t. It calls
// Reset on t. It returns the empty string if any error was found. This can only
// happen if an error-producing Transformer is passed to If.
func (t Transformer) String(s string) string {
	s, _, err := transform.String(t, s)
	if err != nil {
		return ""
	}
	return s
}
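A quick usage sketch of this helper, reusing the hyphen-replacing mapper from the runes.Map example near the end of this page:

package main

import (
	"fmt"
	"unicode"

	"golang.org/x/text/runes"
)

func main() {
	replaceHyphens := runes.Map(func(r rune) rune {
		if unicode.Is(unicode.Hyphen, r) {
			return '|'
		}
		return r
	})

	// Transformer.String wraps transform.String and swallows errors
	// by returning "".
	fmt.Println(replaceHyphens.String("a-b‐c⸗d﹣e")) // a|b|c|d|e
}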
Example #13
func TestNormalize(t *testing.T) {
	testCases := []struct {
		in   string
		want string
	}{
		{"hello, world\r\n", "hello, world\n"},
		{"hello, world\r", "hello, world\n"},
		{"hello, world\n", "hello, world\n"},
		{"", ""},
		{"\r\n", "\n"},
		{"hello,\r\nworld", "hello,\nworld"},
		{"hello,\rworld", "hello,\nworld"},
		{"hello,\nworld", "hello,\nworld"},
		{"hello,\n\rworld", "hello,\n\nworld"},
		{"hello,\r\n\r\nworld", "hello,\n\nworld"},
	}

	n := new(Normalize)

	for _, c := range testCases {
		got, _, err := transform.String(n, c.in)
		if err != nil {
			t.Errorf("error transforming %q: %v", c.in, err)
			continue
		}
		if got != c.want {
			t.Errorf("transforming %q: got %q, want %q", c.in, got, c.want)
		}
	}
}
Example #14
// String returns a string with the result of applying the profile to s.
func (p Profile) String(s string) (string, error) {
	s, _, err := transform.String(p.NewTransformer(), s)
	if err == nil && p.options.disallowEmpty && len(s) == 0 {
		return s, errors.New("enforce resulted in empty string")
	}
	return s, err
}
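A brief usage sketch for this method, assuming the package is golang.org/x/text/secure/precis and using its predefined UsernameCaseMapped profile (the input string is illustrative):

package main

import (
	"fmt"

	"golang.org/x/text/secure/precis"
)

func main() {
	// UsernameCaseMapped case-folds its input and enforces the PRECIS
	// IdentifierClass; disallowed input returns an error.
	s, err := precis.UsernameCaseMapped.String("SuperUser")
	if err != nil {
		fmt.Println("enforce failed:", err)
		return
	}
	fmt.Println(s) // superuser
}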
Example #15
// Eval implements the Expression Eval interface.
func (f *FunctionConvert) Eval(ctx context.Context, args map[interface{}]interface{}) (interface{}, error) {
	value, err := f.Expr.Eval(ctx, args)
	if err != nil {
		return nil, err
	}

	// Casting nil to any type returns nil
	if value == nil {
		return nil, nil
	}
	str, ok := value.(string)
	if !ok {
		return nil, nil
	}
	if strings.ToLower(f.Charset) == "ascii" {
		return value, nil
	} else if strings.ToLower(f.Charset) == "utf8mb4" {
		return value, nil
	}

	encoding, _ := Lookup(f.Charset)
	if encoding == nil {
		return nil, fmt.Errorf("unknown encoding: %s", f.Charset)
	}

	target, _, err := transform.String(encoding.NewDecoder(), str)
	if err != nil {
		log.Errorf("Convert %s to %s with error: %v", str, f.Charset, err)
		return nil, errors.Trace(err)
	}
	return target, nil
}
Example #16
// Probability takes in a small document, returns the
// estimated class of the document based on the model
// as well as the probability that the document is part
// of that class
//
// NOTE: you should only use this for small documents
// because, as discussed in the docs for the model, the
// probability will often underflow because you
// are multiplying together a bunch of probabilities
// which lie in [0,1]. As such, the returned float
// could be NaN, and the predicted class could be
// 0 always.
//
// Basically, use Predict to be robust for larger
// documents. Use Probability only on relatively small
// (MAX of maybe a dozen words - basically just
// sentences and words) documents.
func (b *NaiveBayes) Probability(sentence string) (uint8, float64) {
	sums := make([]float64, len(b.Count))
	for i := range sums {
		sums[i] = 1
	}

	sentence, _, _ = transform.String(b.sanitize, sentence)
	w := strings.Split(strings.ToLower(sentence), " ")
	for _, word := range w {
		if _, ok := b.Words[word]; !ok {
			continue
		}

		for i := range sums {
			sums[i] *= float64(b.Words[word].Count[i]+1) / float64(b.Words[word].Seen+b.DictCount)
		}
	}

	for i := range sums {
		sums[i] *= b.Probabilities[i]
	}

	var denom float64
	var maxI int
	for i := range sums {
		if sums[i] > sums[maxI] {
			maxI = i
		}

		denom += sums[i]
	}

	return uint8(maxI), sums[maxI] / denom
}
Example #17
func NormalizeTitle(title string) string {
	normalizedTitle := RomanizeHepburn(title)
	normalizedTitle = strings.ToLower(normalizedTitle)
	normalizedTitle = RemoveTrailingApostrophe(normalizedTitle)
	normalizedTitle, _, _ = transform.String(transform.Chain(
		norm.NFD,
		transform.RemoveFunc(func(r rune) bool {
			return unicode.Is(unicode.Mn, r)
		}),
		norm.NFC), normalizedTitle)
	normalizedTitle = strings.ToLower(normalizedTitle)
	normalizedTitle = regexp.MustCompile(`\(\d+\)`).ReplaceAllString(normalizedTitle, " ")
	normalizedTitle = strings.Map(func(r rune) rune {
		if !unicode.IsLetter(r) && !unicode.IsDigit(r) && r != '.' {
			return ' '
		}
		return r
	}, normalizedTitle)
	normalizedTitle = regexp.MustCompile(`\s+`).ReplaceAllString(normalizedTitle, " ")
	normalizedTitle = strings.TrimSpace(normalizedTitle)

	return normalizedTitle
}
Example #18
func removeNlChars(str string) string {
	// isOk selects the runes to remove: control codes and anything non-ASCII.
	isOk := func(r rune) bool {
		return r < 32 || r >= 127
	}
	t := transform.Chain(norm.NFKD, transform.RemoveFunc(isOk))
	str, _, _ = transform.String(t, str)
	return str
}
Example #19
func TestLettersShouldPass1(t *testing.T) {
	s, _, _ := transform.String(transform.RemoveFunc(OnlyLetters), "THIS iz A L337 aNd Un'Sani~~~~tized sentence")
	sanitized := []rune(s)

	for i := range sanitized {
		assert.False(t, OnlyLetters(sanitized[i]), "Letter %v should be sanitized", sanitized[i])
	}
}
Example #20
// stripAccents attempts to replace accented characters with an ASCII
// equivalent. This is an extreme oversimplification, but since cobe
// only uses this to create token equivalence (these strings are never
// displayed) it gets a pass.
func stripAccents(s string) string {
	s2, _, err := transform.String(stripT, s)
	if err != nil {
		return s
	}

	return s2
}
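The stripT transformer is not shown here. A plausible definition, assuming it uses the same decompose, remove-marks, recompose chain that the runes.Remove example below demonstrates:

package main

import (
	"fmt"
	"unicode"

	"golang.org/x/text/runes"
	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"
)

// stripT (assumed definition): decompose input, drop combining marks
// (unicode.Mn), and recompose the rest.
var stripT = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)

func main() {
	s, _, err := transform.String(stripT, "résumé")
	if err != nil {
		s = "résumé" // fall back to the input, as stripAccents does
	}
	fmt.Println(s) // resume
}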
Example #21
func QueryEscape(charset, content string) string {
	encoding := GetCharset(charset)
	newContent, _, err := transform.String(encoding.NewEncoder(), content)
	if err != nil {
		return content
	}
	return url.QueryEscape(newContent)
}
Example #22
func ExampleRemove() {
	t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
	s, _, _ := transform.String(t, "résumé")
	fmt.Println(s)

	// Output:
	// resume
}
Example #23
// http://blog.golang.org/normalization#TOC_10.
func normalize(normalizer transform.Transformer, in string) string {
	result, _, err := transform.String(normalizer, in)
	if err != nil {
		log.Printf("unable to transform text:\n\n%q\nerr: %s", in, err)
		return in
	}
	// replace non-breaking spaces!
	return strings.Replace(result, "\u00a0", " ", -1)
}
Example #24
func ExampleIn() {
	// Convert Latin characters to their canonical form, while keeping other
	// width distinctions.
	t := runes.If(runes.In(unicode.Latin), width.Fold, nil)
	s, _, _ := transform.String(t, "アルアノリウ tech / アルアノリウ tech")
	fmt.Println(s)

	// Output:
	// アルアノリウ tech / アルアノリウ tech
}
Example #25
func ExampleIf() {
	// Widen everything but ASCII.
	isASCII := func(r rune) bool { return r <= unicode.MaxASCII }
	t := runes.If(runes.Predicate(isASCII), nil, width.Widen)
	s, _, _ := transform.String(t, "アルアノリウ tech / 中國 / 5₩")
	fmt.Println(s)

	// Output:
	// アルアノリウ tech / 中國 / 5₩
}
Example #26
// stripCtlAndExtFromUnicode sanitizes input by stripping control codes and extended characters.
// From: http://rosettacode.org/wiki/Strip_control_codes_and_extended_characters_from_a_string#Go
func stripCtlAndExtFromUnicode(str string) string {
	isOk := func(r rune) bool {
		return r < 32 || r >= 127
	}
	// The isOk filter is such that there is no need to chain to norm.NFC
	t := transform.Chain(norm.NFKD, transform.RemoveFunc(isOk))
	// This Transformer could also trivially be applied as an io.Reader
	// or io.Writer filter to automatically do such filtering when reading
	// or writing data anywhere.
	str, _, _ = transform.String(t, str)
	return str
}
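The comment's suggestion to apply the Transformer as an io.Reader filter looks roughly like this; a minimal sketch reusing the same chain (the input string is illustrative):

package main

import (
	"fmt"
	"io"
	"strings"

	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"
)

func main() {
	isOk := func(r rune) bool {
		return r < 32 || r >= 127
	}
	t := transform.Chain(norm.NFKD, transform.RemoveFunc(isOk))

	// Wrap any io.Reader so the filtering happens while reading.
	r := transform.NewReader(strings.NewReader("tab\there\tand\tcafé"), t)
	out, err := io.ReadAll(r)
	if err != nil {
		fmt.Println("read error:", err)
		return
	}
	fmt.Println(string(out)) // tabhereandcafe
}

Note that newer releases of golang.org/x/text mark transform.RemoveFunc as deprecated in favor of runes.Remove.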
Example #27
func TestAsciiLetters(t *testing.T) {
	tests := []testCase{
		{"THIS iz A L337 aNd Un'Sani~~~~tized sentence", "THISizALaNdUnSanitizedsentence"},
		{"here're some unicode letters: --Æ.ÒÑ", "hereresomeunicodeletters"},
		{")(*&^%$@!@#$%^&*(*&^%$#@#$%", ""},
	}
	for _, test := range tests {
		s, _, _ := transform.String(transform.RemoveFunc(OnlyAsciiLetters), test.input)
		if s != test.expectedOutput {
			t.Errorf("got \"%s\" expected \"%s\"\n", s, test.expectedOutput)
		}
	}
}
Example #28
// decodeCharset decodes str using the charset named by label, falling back to
// charset detection when the label is unknown.
func decodeCharset(str, label string) (nstr string, err error) {
	enc, _ := charset.Lookup(label)
	if enc == nil {
		enc, _, _ = charset.DetermineEncoding([]byte(str), "text/plain")
	}

	nstr, _, err = transform.String(enc.NewDecoder(), str)
	if err != nil {
		return nstr, err
	}

	return stripNonUTF8(nstr), nil
}
Example #29
func ExampleMap() {
	replaceHyphens := runes.Map(func(r rune) rune {
		if unicode.Is(unicode.Hyphen, r) {
			return '|'
		}
		return r
	})
	s, _, _ := transform.String(replaceHyphens, "a-b‐c⸗d﹣e")
	fmt.Println(s)

	// Output:
	// a|b|c|d|e
}
Example #30
func normalize(name, src string) (string, error) {
	if name == "" {
		name = baseWithoutExt(src)
	}
	t := transform.Chain(norm.NFD, transform.RemoveFunc(remove), norm.NFC)
	name = strings.TrimSpace(name)
	name, _, err := transform.String(t, name)
	if err != nil {
		return "", err
	}
	name = strings.ToLower(name)
	name = strings.Replace(name, " ", "_", -1)
	return name, nil
}