Пример #1
0
// Encode converts text into the byte encoding selected by code:
//
//	8 - UCS-2, produced as big-endian UTF-16 without a byte order mark;
//	3 - Latin-1, produced with the Windows-1252 code page;
//	0 - GSM 03.38, using the utf8GsmChars substitution table.
//
// Any other code returns the raw bytes of text unchanged. Errors reported by
// the underlying encoders are discarded, and whatever bytes they produced are
// returned as is.
func Encode(code uint8, text string) []byte {
	switch code {
	case 8:
		encoder := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewEncoder()
		encoded, _, _ := transform.Bytes(encoder, []byte(text))
		return encoded
	case 3:
		encoder := charmap.Windows1252.NewEncoder()
		encoded, _, _ := transform.Bytes(encoder, []byte(text))
		return encoded
	case 0:
		var out bytes.Buffer
		for _, r := range text {
			switch repl, known := utf8GsmChars[r]; {
			case known:
				// The rune has an explicit replacement in the table.
				out.WriteString(repl)
			case r > '\u007F':
				// Everything else above 7-bit ASCII is replaced with '?'.
				out.WriteRune('?')
			default:
				// Plain ASCII is copied through untouched.
				out.WriteRune(r)
			}
		}
		return out.Bytes()
	}
	return []byte(text)
}
Пример #2
0
// Fuzz feeds data through a Normalize transformer and then a ToCRLF
// transformer, panicking if either one reports an error. It always returns 0.
func Fuzz(data []byte) int {
	if _, _, err := transform.Bytes(new(Normalize), data); err != nil {
		panic(err)
	}
	if _, _, err := transform.Bytes(ToCRLF{}, data); err != nil {
		panic(err)
	}
	return 0
}
Пример #3
0
// Bytes converts b using t and returns the result as a new byte slice. Reset
// is called on t. If any error was found the result is nil; this can only
// happen if an error-producing Transformer is passed to If.
func (t Transformer) Bytes(b []byte) []byte {
	out, _, err := transform.Bytes(t, b)
	if err == nil {
		return out
	}
	return nil
}
Пример #4
0
// Bytes applies the profile to b and returns the result as a new byte slice.
// When the profile disallows empty output and the transformation succeeded
// but produced no bytes, an error is returned alongside the empty result.
func (p Profile) Bytes(b []byte) ([]byte, error) {
	out, _, err := transform.Bytes(p.NewTransformer(), b)
	if err != nil {
		return out, err
	}
	if p.options.disallowEmpty && len(out) == 0 {
		return out, errors.New("enforce resulted in empty string")
	}
	return out, nil
}
Пример #5
0
// Bytes converts b from UTF-8 using e. On success it returns the converted
// bytes and a nil error; on failure it returns nil and the error.
func (e *Encoder) Bytes(b []byte) ([]byte, error) {
	out, _, err := transform.Bytes(e, b)
	if err == nil {
		return out, nil
	}
	return nil, err
}
Пример #6
0
// encodeText encodes text with a fresh encoder obtained from e and returns
// the encoded bytes. It panics if the encoder reports an error.
func encodeText(e encoding.Encoding, text string) []byte {
	encoded, _, err := transform.Bytes(e.NewEncoder(), []byte(text))
	if err == nil {
		return encoded
	}
	panic(err)
}
Пример #7
0
// Decode converts s from Latin1 using the Windows-1252 decoder. If decoding
// fails, the original bytes are returned unchanged.
func (s Latin1) Decode() []byte {
	decoded, _, err := transform.Bytes(charmap.Windows1252.NewDecoder(), s)
	if err == nil {
		return decoded
	}
	return s
}
Пример #8
0
// Decode converts s from UCS2, read as big-endian UTF-16 with any byte order
// mark ignored. If decoding fails, the original bytes are returned unchanged.
func (s UCS2) Decode() []byte {
	decoder := unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder()
	decoded, _, err := transform.Bytes(decoder, s)
	if err == nil {
		return decoded
	}
	return s
}
Пример #9
0
// base64StringWithGB2312 wraps txt as "=?GB2312?B?<base64>?=", where the
// base64 payload is txt encoded with the GB18030 encoder. If txt cannot be
// encoded, the result of qpString(txt) is returned instead.
func base64StringWithGB2312(txt string) string {
	encoded, _, err := transform.Bytes(simplifiedchinese.GB18030.NewEncoder(), []byte(txt))
	if err != nil {
		return qpString(txt)
	}

	var out bytes.Buffer
	out.WriteString("=?GB2312?B?")
	out.WriteString(base64.StdEncoding.EncodeToString(encoded))
	out.WriteString("?=")
	return out.String()
}
Пример #10
0
// qpStringWithGB2312 wraps txt as "=?GB2312?Q?<payload>?=", where the payload
// is txt encoded with the GB18030 encoder and then written through a qp
// writer. If txt cannot be encoded, the result of qpString(txt) is returned
// instead.
func qpStringWithGB2312(txt string) string {
	encoded, _, err := transform.Bytes(simplifiedchinese.GB18030.NewEncoder(), []byte(txt))
	if err != nil {
		return qpString(txt)
	}

	out := bytes.NewBufferString("=?GB2312?Q?")
	qw := qp.NewWriter(out)
	qw.Write(encoded)
	// Close before appending the trailer so the writer's output is complete.
	qw.Close()

	out.WriteString("?=")
	return out.String()
}
Пример #11
0
func Decode(code uint8, text []byte) string {
	switch code {
	case 8: // UCS2
		es, _, _ := transform.Bytes(
			unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder(), text)
		return string(es)
	case 3: // latin1 (windows1252)
		es, _, _ := transform.Bytes(charmap.Windows1252.NewDecoder(), text)
		return string(es)
	case 0: // декодируем из формата GSM 03.38
		var result bytes.Buffer
		for _, r := range text {
			if nr, ok := gsmUtf8Chars[rune(r)]; ok { // делаем замены известным символам
				result.WriteString(nr)
				continue
			}
			result.WriteByte(r) // добавляем как есть
		}
		return result.String()
	default:
		return string(text)
	}
}
Пример #12
0
// normalize does unicode normalization: it decomposes the input (NFD), drops
// nonspacing marks, recomposes (NFC), then replaces punctuation with spaces
// and lower-cases everything else. The error from the transformation chain,
// if any, is returned together with the mapped output.
func normalize(in []byte) ([]byte, error) {
	// Mn: nonspacing marks (to be removed).
	isNonspacingMark := func(r rune) bool { return unicode.Is(unicode.Mn, r) }

	// A fresh chain is built per input because a transformer cannot be reused.
	chain := transform.Chain(norm.NFD, transform.RemoveFunc(isNonspacingMark), norm.NFC)
	stripped, _, err := transform.Bytes(chain, in)

	fold := func(r rune) rune {
		if !unicode.IsPunct(r) {
			return unicode.ToLower(r) // Convert to lower case.
		}
		return ' ' // Replace punctuation with a space.
	}
	return bytes.Map(fold, stripped), err
}
Пример #13
0
// removeNonAlphaNumeric rewrites s so that hyphens become spaces, letters,
// digits and whitespace are kept in lower-cased form, and every other rune is
// dropped. The filtered text is then passed through the package-level
// transformer; an error from that step is ignored.
//
// NOTE(review): transformer is shared package state — confirm it is safe for
// concurrent callers.
func removeNonAlphaNumeric(s string) string {
	// len(s) is an upper bound on the number of runes that can be kept.
	kept := make([]rune, 0, len(s))
	for _, r := range s {
		switch {
		case r == '-':
			kept = append(kept, ' ')
		case unicode.IsLetter(r) || unicode.IsDigit(r) || unicode.IsSpace(r):
			kept = append(kept, unicode.ToLower(r))
		}
	}
	out, _, _ := transform.Bytes(transformer, []byte(string(kept)))
	return string(out)
}
Пример #14
0
// NewLine creates a new Line reader object that reads from input using codec
// and the given bufferSize. It returns an error if the codec cannot encode
// a newline character.
func NewLine(input io.Reader, codec encoding.Encoding, bufferSize int) (*Line, error) {
	// The newline sequence depends on the encoding, so encode '\n' once here.
	newline, _, err := transform.Bytes(codec.NewEncoder(), []byte{'\n'})
	if err != nil {
		return nil, err
	}

	line := &Line{
		reader:     input,
		codec:      codec,
		bufferSize: bufferSize,
		nl:         newline,
		decoder:    codec.NewDecoder(),
		inBuffer:   streambuf.New(nil),
		outBuffer:  streambuf.New(nil),
	}
	return line, nil
}
Пример #15
0
// ToUtf8 takes a page body, determines its character encoding, and converts
// it to UTF8.
//
// Bodies detected as "UTF-8" or "ISO-8859-1" are returned unchanged. For any
// other charset the matching entry in charsetDetectors is used to decode the
// body; an error is returned if detection fails or no decoder is registered
// for the detected charset.
func ToUtf8(html []byte) ([]byte, error) {
	best, err := charsetDetector.DetectBest(html)
	if err != nil {
		return nil, err
	}

	switch best.Charset {
	case "UTF-8", "ISO-8859-1":
		// These two are passed through without any conversion.
		return html, nil
	}

	enc, found := charsetDetectors[best.Charset]
	if !found {
		return nil, fmt.Errorf("could not find charset decoder for `%s`", best.Charset)
	}

	decoded, _, err := transform.Bytes(enc.NewDecoder(), html)
	return decoded, err
}
Пример #16
0
// parseDirListLine parses a directory line in a format based on the output of
// the MS-DOS DIR command.
//
// After leading whitespace, the line must start with a timestamp in one of
// dirTimeFormats, followed by either the "<DIR>" marker (a folder) or a
// decimal size and a space (a file), followed by the entry name. The name is
// additionally run through gbk_decoder; if that fails the trimmed name is
// kept as is. Lines that do not match this layout, including lines too short
// to hold a timestamp, yield errUnsupportedListLine.
func parseDirListLine(line string) (*Entry, error) {
	e := &Entry{}
	var err error

	line = strings.TrimLeftFunc(line, unicode.IsSpace)
	// Try various time formats that DIR might use, and stop when one works.
	for _, format := range dirTimeFormats {
		if len(line) < len(format) {
			// The line cannot hold this format; slicing it would panic.
			err = errUnsupportedListLine
			continue
		}
		e.Time, err = time.Parse(format, line[:len(format)])
		if err == nil {
			line = line[len(format):]
			break
		}
	}
	if err != nil {
		// None of the time formats worked.
		return nil, errUnsupportedListLine
	}

	line = strings.TrimLeftFunc(line, unicode.IsSpace)
	if strings.HasPrefix(line, "<DIR>") {
		e.Type = EntryTypeFolder
		line = strings.TrimPrefix(line, "<DIR>")
	} else {
		// A file entry: the size runs up to the first space.
		space := strings.Index(line, " ")
		if space == -1 {
			return nil, errUnsupportedListLine
		}
		e.Size, err = strconv.ParseUint(line[:space], 10, 64)
		if err != nil {
			return nil, errUnsupportedListLine
		}
		e.Type = EntryTypeFile
		line = line[space:]
	}

	e.Name = strings.TrimSpace(line)
	// Best effort: keep the raw name when it cannot be decoded.
	if rb, _, err := transform.Bytes(gbk_decoder, []byte(e.Name)); nil == err {
		e.Name = string(rb)
	}
	return e, nil
}
Пример #17
0
// init configures the line reader to read from input using codec and the
// given bufferSize. It encodes a newline character with the codec to obtain
// the line terminator, creates the decoder and allocates fresh input and
// output buffers. It returns an error if the newline cannot be encoded; in
// that case only rawInput, codec and bufferSize have been set.
func (l *lineReader) init(
	input io.Reader,
	codec encoding.Encoding,
	bufferSize int,
) error {
	l.rawInput = input
	l.codec = codec
	l.bufferSize = bufferSize

	// The newline sequence depends on the encoding, so encode '\n' once here.
	nl, _, err := transform.Bytes(l.codec.NewEncoder(), []byte{'\n'})
	if err != nil {
		return err
	}

	l.nl = nl
	l.decoder = l.codec.NewDecoder()
	l.inBuffer = streambuf.New(nil)
	l.outBuffer = streambuf.New(nil)
	return nil
}
Пример #18
0
// readFileAsUTF8String reads filename, decodes its contents with the encoding
// that charset.DetermineEncoding selects for them (given the package-level
// mimeType), and returns a pointer to the resulting string with any leading
// utf8BOM removed. A read or decode failure is returned as the error.
func readFileAsUTF8String(filename string) (*string, error) {
	raw, err := ioutil.ReadFile(filename)
	if err != nil {
		return nil, err
	}

	enc, _, _ := charset.DetermineEncoding(raw, mimeType)
	decoded, _, err := transform.Bytes(enc.NewDecoder(), raw)
	if err != nil {
		return nil, err
	}

	// Drop the UTF-8 BOM that may have been added. It is not needed, and the
	// text ends up in another UTF-8 buffer anyway once it is JSON serialized.
	//
	// The standard recommends omitting the BOM. See
	// http://www.unicode.org/versions/Unicode5.0.0/ch02.pdf
	text := string(bytes.TrimPrefix(decoded, utf8BOM))
	return &text, nil
}
Пример #19
0
// importWinFile reads file1 (looked up as ".\<file1>"), decodes its contents
// from Windows-1252, and passes the decoded bytes to writeBytes together with
// the newly created file2.
//
// It sets the package-level end variable to "\n", prints to stdout whether
// the decoder consumed the whole input, and hands every error to check.
func importWinFile(file1 string, file2 string) {
	end = "\n"
	mb, err := ioutil.ReadFile(".\\" + file1)
	check(err)
	n := len(mb)

	// For more granular writes, open a file for writing.
	f1, err := os.Create(file2)
	check(err)

	// It's idiomatic to defer a Close immediately after opening a file.
	defer f1.Close()
	enc := charmap.Windows1252
	// The decoder obtained here is the transformer applied below.
	trf := enc.NewDecoder()
	pb, i1, err := transform.Bytes(trf, mb)
	check(err)
	if i1 == n {
		fmt.Println("Alles ok")
	} else {
		// Printf is needed here: Println does not interpret format verbs.
		fmt.Printf("FOUT: %v - %v\n", n, i1)
	}
	writeBytes(f1, pb)
}
Пример #20
0
// GetPageBody gets and returns a body of a page.
//
// It issues a GET for urlinfo.URL through bot.HTTPClient with customHeaders
// applied, using bot.Config.HttpDefaultUserAgent when no User-Agent is
// supplied. If the URL of the final request differs from urlinfo.URL, the
// latter is updated. At most bot.Config.PageBodyMaxSize bytes of the response
// are read. The content type (from the response header, or sniffed from the
// body when the header is empty) is stored in urlinfo.ContentType.
//
// Bodies whose content type contains "text/" are decoded with the detected
// charset before being stored in urlinfo.Body; decoding errors are ignored.
// "application/json" bodies are stored unchanged. For any other content type
// urlinfo.Body is left untouched.
//
// customHeaders may be nil and is never modified.
func (bot *Bot) GetPageBody(urlinfo *UrlInfo, customHeaders map[string]string) error {
	if urlinfo.URL == "" {
		return errors.New("Empty URL")
	}
	// Build the request.
	req, err := http.NewRequest("GET", urlinfo.URL, nil)
	if err != nil {
		return err
	}
	for k, v := range customHeaders {
		req.Header.Set(k, v)
	}
	// Fall back to the default User-Agent on the request itself, so that a
	// nil customHeaders map does not panic and the caller's map is not
	// mutated.
	if req.Header.Get("User-Agent") == "" {
		req.Header.Set("User-Agent", bot.Config.HttpDefaultUserAgent)
	}

	// Get response.
	resp, err := bot.HTTPClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// Update the URL if it changed after redirects.
	finalLink := resp.Request.URL.String()
	if finalLink != "" && finalLink != urlinfo.URL {
		bot.Log.Debugf("%s becomes %s", urlinfo.URL, finalLink)
		urlinfo.URL = finalLink
	}

	// Load the body up to PageBodyMaxSize. A short read is expected whenever
	// the page is smaller than the limit, so ErrUnexpectedEOF is not an error.
	body := make([]byte, bot.Config.PageBodyMaxSize)
	num, err := io.ReadFull(resp.Body, body)
	if err != nil && err != io.ErrUnexpectedEOF {
		return err
	}
	// Trim unneeded 0 bytes so that JSON unmarshaller won't complain.
	body = body[:num]

	// Get the content-type.
	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		contentType = http.DetectContentType(body)
	}
	urlinfo.ContentType = contentType

	switch {
	case strings.Contains(contentType, "text/"):
		// If type is text, decode the body to UTF-8.
		// Try to get more significant part for encoding detection.
		sample := bytes.Join(bot.webContentSampleRe.FindAll(body, -1), []byte{})
		if len(sample) < 100 {
			sample = body
		}
		// Unescape HTML tokens.
		sample = []byte(html.UnescapeString(string(sample)))
		// Try to only get charset from content type. Needed because some
		// pages serve broken Content-Type header.
		detectionContentType := contentType
		for _, t := range strings.Split(contentType, ";") {
			if strings.Contains(strings.ToLower(t), "charset") {
				detectionContentType = "text/plain; " + t
				break
			}
		}
		// Detect encoding and transform. Decoding is best effort: errors are
		// ignored and whatever was decoded is stored.
		encoding, _, _ := charset.DetermineEncoding(sample, detectionContentType)
		decodedBody, _, _ := transform.Bytes(encoding.NewDecoder(), body)
		urlinfo.Body = decodedBody
	case strings.Contains(contentType, "application/json"):
		urlinfo.Body = body
	default:
		bot.Log.Debugf("Not fetching the body for Content-Type: %s", contentType)
	}
	return nil
}
Пример #21
0
// Bytes runs b through a chain of NFD decomposition, removal of every rune
// accepted by isMn, and NFC recomposition. It returns the transformed bytes
// together with any error reported by the chain.
func Bytes(b []byte) ([]byte, error) {
	chain := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
	out, _, err := transform.Bytes(chain, b)
	return out, err
}
Пример #22
0
// Bytes converts b to the case form implemented by c and returns the result
// as a new byte slice. Any error from the transformation is ignored.
func (c Caser) Bytes(b []byte) []byte {
	converted, _, _ := transform.Bytes(c.t, b)
	return converted
}
Пример #23
0
// transformBytes applies e to text and returns the output as a string,
// together with any error reported by the transformer.
func transformBytes(e transform.Transformer, text []byte) (string, error) {
	out, _, err := transform.Bytes(e, text)
	if err != nil {
		return string(out), err
	}
	return string(out), nil
}
Пример #24
0
// Bytes applies t to b and returns the result as a new byte slice. Any error
// from the transformation is ignored.
func (t Transformer) Bytes(b []byte) []byte {
	out, _, _ := transform.Bytes(t, b)
	return out
}