func Encode(code uint8, text string) []byte { switch code { // в зависимости от подходящей кодировки выбираем соответствующий метод кодирования case 8: // ucs8 enc := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewEncoder() es, _, _ := transform.Bytes(enc, []byte(text)) return es case 3: // latin1 es, _, _ := transform.Bytes(charmap.Windows1252.NewEncoder(), []byte(text)) return es case 0: // декодируем в GSM 03.38 var result bytes.Buffer for _, r := range text { if nr, ok := utf8GsmChars[r]; ok { // делаем замены известным символам result.WriteString(nr) continue } if r > '\u007F' { // удаляем все, что не входит в формат result.WriteRune('?') continue } result.WriteRune(r) // добавляем как есть } return result.Bytes() default: return []byte(text) } }
func Fuzz(data []byte) int { _, _, err := transform.Bytes(new(Normalize), data) if err != nil { panic(err) } _, _, err = transform.Bytes(ToCRLF{}, data) if err != nil { panic(err) } return 0 }
// Bytes returns a new byte slice with the result of converting b using t. It // calls Reset on t. It returns nil if any error was found. This can only happen // if an error-producing Transformer is passed to If. func (t Transformer) Bytes(b []byte) []byte { b, _, err := transform.Bytes(t, b) if err != nil { return nil } return b }
// Bytes returns a new byte slice with the result of applying the profile to b. func (p Profile) Bytes(b []byte) ([]byte, error) { b, _, err := transform.Bytes(p.NewTransformer(), b) if err == nil && p.options.disallowEmpty && len(b) == 0 { return b, errors.New("enforce resulted in empty string") } return b, err }
// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if // any error occurred. func (e *Encoder) Bytes(b []byte) ([]byte, error) { b, _, err := transform.Bytes(e, b) if err != nil { return nil, err } return b, nil }
func encodeText(e encoding.Encoding, text string) []byte { res, _, err := transform.Bytes(e.NewEncoder(), []byte(text)) if err != nil { panic(err) } return res }
// Decode from Latin1. func (s Latin1) Decode() []byte { e := charmap.Windows1252.NewDecoder() es, _, err := transform.Bytes(e, s) if err != nil { return s } return es }
// Decode from UCS2. func (s UCS2) Decode() []byte { e := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM) es, _, err := transform.Bytes(e.NewDecoder(), s) if err != nil { return s } return es }
func base64StringWithGB2312(txt string) string { buf := bytes.NewBufferString("=?GB2312?B?") bs, _, e := transform.Bytes(simplifiedchinese.GB18030.NewEncoder(), []byte(txt)) if nil != e { return qpString(txt) } buf.WriteString(base64.StdEncoding.EncodeToString(bs)) buf.WriteString("?=") return buf.String() }
func qpStringWithGB2312(txt string) string { buf := bytes.NewBufferString("=?GB2312?Q?") bs, _, e := transform.Bytes(simplifiedchinese.GB18030.NewEncoder(), []byte(txt)) if nil != e { return qpString(txt) } w := qp.NewWriter(buf) w.Write(bs) w.Close() buf.WriteString("?=") return buf.String() }
func Decode(code uint8, text []byte) string { switch code { case 8: // UCS2 es, _, _ := transform.Bytes( unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder(), text) return string(es) case 3: // latin1 (windows1252) es, _, _ := transform.Bytes(charmap.Windows1252.NewDecoder(), text) return string(es) case 0: // декодируем из формата GSM 03.38 var result bytes.Buffer for _, r := range text { if nr, ok := gsmUtf8Chars[rune(r)]; ok { // делаем замены известным символам result.WriteString(nr) continue } result.WriteByte(r) // добавляем как есть } return result.String() default: return string(text) } }
// normalize does unicode normalization. func normalize(in []byte) ([]byte, error) { // We need a new transformer for each input as it cannot be reused. filter := func(r rune) bool { return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks (to be removed) } transformer := transform.Chain(norm.NFD, transform.RemoveFunc(filter), norm.NFC) out, _, err := transform.Bytes(transformer, in) out = bytes.Map(func(r rune) rune { if unicode.IsPunct(r) { // Replace punctuations with spaces. return ' ' } return unicode.ToLower(r) // Convert to lower case. }, out) return out, err }
func removeNonAlphaNumeric(s string) string { in := []rune(s) res := make([]rune, len(in)) i := 0 for _, x := range s { if x == '-' { res[i] = ' ' i++ continue } if unicode.IsLetter(x) || unicode.IsDigit(x) || unicode.IsSpace(x) { res[i] = unicode.ToLower(x) i++ } } result, _, _ := transform.Bytes(transformer, []byte(string(res[:i]))) return string(result) }
// NewLine creates a new Line reader object func NewLine(input io.Reader, codec encoding.Encoding, bufferSize int) (*Line, error) { encoder := codec.NewEncoder() // Create newline char based on encoding nl, _, err := transform.Bytes(encoder, []byte{'\n'}) if err != nil { return nil, err } return &Line{ reader: input, codec: codec, bufferSize: bufferSize, nl: nl, decoder: codec.NewDecoder(), inBuffer: streambuf.New(nil), outBuffer: streambuf.New(nil), }, nil }
// ToUtf8 takes a page body, determines its character encoding, and converts // it to UTF8. func ToUtf8(html []byte) ([]byte, error) { r, err := charsetDetector.DetectBest(html) if err != nil { return nil, err } if r.Charset == "UTF-8" || r.Charset == "ISO-8859-1" { return html, nil } t, ok := charsetDetectors[r.Charset] if !ok { return nil, fmt.Errorf( "could not find charset decoder for `%s`", r.Charset) } html, _, err = transform.Bytes(t.NewDecoder(), html) return html, err }
// parseDirListLine parses a directory line in a format based on the output of // the MS-DOS DIR command. func parseDirListLine(line string) (*Entry, error) { e := &Entry{} var err error line = strings.TrimLeftFunc(line, unicode.IsSpace) // Try various time formats that DIR might use, and stop when one works. for _, format := range dirTimeFormats { e.Time, err = time.Parse(format, line[:len(format)]) if err == nil { line = line[len(format):] break } } if err != nil { // None of the time formats worked. return nil, errUnsupportedListLine } line = strings.TrimLeftFunc(line, unicode.IsSpace) if strings.HasPrefix(line, "<DIR>") { e.Type = EntryTypeFolder line = strings.TrimPrefix(line, "<DIR>") } else { space := strings.Index(line, " ") if space == -1 { return nil, errUnsupportedListLine } e.Size, err = strconv.ParseUint(line[:space], 10, 64) if err != nil { return nil, errUnsupportedListLine } e.Type = EntryTypeFile line = line[space:] } e.Name = strings.TrimSpace(line) if rb, _, err := transform.Bytes(gbk_decoder, []byte(e.Name)); nil == err { e.Name = string(rb) } return e, nil }
func (l *lineReader) init( input io.Reader, codec encoding.Encoding, bufferSize int, ) error { l.rawInput = input l.codec = codec l.bufferSize = bufferSize l.codec.NewEncoder() nl, _, err := transform.Bytes(l.codec.NewEncoder(), []byte{'\n'}) if err != nil { return err } l.nl = nl l.decoder = l.codec.NewDecoder() l.inBuffer = streambuf.New(nil) l.outBuffer = streambuf.New(nil) return nil }
func readFileAsUTF8String(filename string) (*string, error) { b, err := ioutil.ReadFile(filename) if err != nil { return nil, err } encoding, _, _ := charset.DetermineEncoding(b, mimeType) decoder := encoding.NewDecoder() decodedBytes, _, err := transform.Bytes(decoder, b) if err != nil { return nil, err } // Drop the UTF-8 BOM that may have been added. This isn't necessary, and // it's going to be written into another UTF-8 buffer anyway once it's JSON // serialized. // // The standard recommends omitting the BOM. See // http://www.unicode.org/versions/Unicode5.0.0/ch02.pdf decodedBytes = bytes.TrimPrefix(decodedBytes, utf8BOM) s := string(decodedBytes) return &s, nil }
func importWinFile(file1 string, file2 string) { end = "\n" mb, err := ioutil.ReadFile(".\\" + file1) check(err) n := len(mb) //For more granular writes, open a file for writing. f1, err := os.Create(file2) check(err) //It’s idiomatic to defer a Close immediately after opening a file. defer f1.Close() enc := charmap.Windows1252 //Nu krijg je een transformer trf := enc.NewDecoder() pb, i1, err := transform.Bytes(trf, mb) check(err) if i1 == n { fmt.Println("Alles ok") } else { fmt.Println("FOUT: v% - v%", n, i1) } writeBytes(f1, pb) }
// GetPageBody gets and returns a body of a page. func (bot *Bot) GetPageBody(urlinfo *UrlInfo, customHeaders map[string]string) error { if urlinfo.URL == "" { return errors.New("Empty URL") } // Build the request. req, err := http.NewRequest("GET", urlinfo.URL, nil) if err != nil { return err } if customHeaders["User-Agent"] == "" { customHeaders["User-Agent"] = bot.Config.HttpDefaultUserAgent } for k, v := range customHeaders { req.Header.Set(k, v) } // Get response. resp, err := bot.HTTPClient.Do(req) if err != nil { return err } defer resp.Body.Close() // Update the URL if it changed after redirects. final_link := resp.Request.URL.String() if final_link != "" && final_link != urlinfo.URL { bot.Log.Debugf("%s becomes %s", urlinfo.URL, final_link) urlinfo.URL = final_link } // Load the body up to PageBodyMaxSize. body := make([]byte, bot.Config.PageBodyMaxSize, bot.Config.PageBodyMaxSize) if num, err := io.ReadFull(resp.Body, body); err != nil && err != io.ErrUnexpectedEOF { return err } else { // Trim unneeded 0 bytes so that JSON unmarshaller won't complain. body = body[:num] } // Get the content-type contentType := resp.Header.Get("Content-Type") if contentType == "" { contentType = http.DetectContentType(body) } urlinfo.ContentType = contentType // If type is text, decode the body to UTF-8. if strings.Contains(contentType, "text/") { // Try to get more significant part for encoding detection. sample := bytes.Join(bot.webContentSampleRe.FindAll(body, -1), []byte{}) if len(sample) < 100 { sample = body } // Unescape HTML tokens. sample = []byte(html.UnescapeString(string(sample))) // Try to only get charset from content type. Needed because some pages serve broken Content-Type header. detectionContentType := contentType tokens := strings.Split(contentType, ";") for _, t := range tokens { if strings.Contains(strings.ToLower(t), "charset") { detectionContentType = "text/plain; " + t break } } // Detect encoding and transform. encoding, _, _ := charset.DetermineEncoding(sample, detectionContentType) decodedBody, _, _ := transform.Bytes(encoding.NewDecoder(), body) urlinfo.Body = decodedBody } else if strings.Contains(contentType, "application/json") { urlinfo.Body = body } else { bot.Log.Debugf("Not fetching the body for Content-Type: %s", contentType) } return nil }
func Bytes(b []byte) ([]byte, error) { t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC) res, _, err := transform.Bytes(t, b) return res, err }
// Bytes returns a new byte slice with the result of converting b to the case // form implemented by c. func (c Caser) Bytes(b []byte) []byte { b, _, _ = transform.Bytes(c.t, b) return b }
func transformBytes(e transform.Transformer, text []byte) (string, error) { res, _, err := transform.Bytes(e, text) return string(res), err }
// Bytes returns a new byte slice with the result of applying t to b. func (t Transformer) Bytes(b []byte) []byte { b, _, _ = transform.Bytes(t, b) return b }