Example #1
0
// DecodeReader reads everything from s through a charset decoder selected by
// the label enc and returns the decoded content as a string.
//
// An error is returned when charset.NewReaderLabel rejects enc or when
// reading from the decoding reader fails; the returned string is empty in
// both cases.
func DecodeReader(s io.Reader, enc string) (string, error) {
	decoder, err := charset.NewReaderLabel(enc, s)
	if err == nil {
		var decoded []byte
		if decoded, err = ioutil.ReadAll(decoder); err == nil {
			return string(decoded), nil
		}
	}
	// Either building the decoder or draining it failed.
	return "", err
}
Example #2
0
// NewReaderLabel returns a reader that decodes input according to the charset
// identified by label. It delegates to charset.NewReaderLabel and returns a
// nil reader together with the error when that call fails.
//
// NOTE: the decoded stream used to be wrapped in an XML sanitizer
// (NewXMLSanitizerReader); that wrapping is currently disabled, so the decoder
// is returned as is.
func NewReaderLabel(label string, input io.Reader) (io.Reader, error) {
	decoded, err := charset.NewReaderLabel(label, input)
	if err == nil {
		return decoded, nil
	}
	return nil, err
}
Example #3
0
// initText reads the whole response body into self.text and closes the body.
//
// When the request was downloaded with the surf downloader, the charset
// declared in the response Content-Type header (falling back to the request
// Content-Type header) is inspected. If a charset is declared and it is not
// UTF-8, the body is converted to UTF-8 via golang.org/x/net/html/charset.
// When the conversion fails a warning is logged and the raw, unconverted
// bytes are kept.
//
// It panics with the error text if the response body cannot be read.
func (self *Context) initText() {
	// Read the raw body exactly once. Transcoding works on this in-memory
	// copy, so a failed conversion can fall back to the complete raw bytes
	// instead of whatever is left unread in a partially consumed stream.
	raw, err := ioutil.ReadAll(self.Response.Body)
	self.Response.Body.Close()
	if err != nil {
		panic(err.Error())
	}
	self.text = raw

	// Automatic transcoding is only attempted for the surf downloader.
	if self.Request.DownloaderID != request.SURF_ID {
		return
	}

	// Prefer the charset from the response header; fall back to the request
	// header when the response does not declare one.
	pageEncode := charsetFromContentType(self.Response.Header.Get("Content-Type"))
	if len(pageEncode) == 0 {
		pageEncode = charsetFromContentType(self.Request.Header.Get("Content-Type"))
	}

	switch pageEncode {
	case "", "utf8", "utf-8", "unicode-1-1-utf-8":
		// No charset declared, or already UTF-8: nothing to convert.
		return
	}

	// A non-UTF-8 charset was declared: convert the body to UTF-8.
	destReader, err := charset.NewReaderLabel(pageEncode, strings.NewReader(string(raw)))
	if err == nil {
		var converted []byte
		if converted, err = ioutil.ReadAll(destReader); err == nil {
			self.text = converted
			return
		}
	}
	logs.Log.Warning(" *     [convert][%v]: %v (ignore transcoding)\n", self.GetUrl(), err)
}

// charsetFromContentType extracts the charset parameter from a Content-Type
// header value, trimmed and lower-cased. It returns "" when the value cannot
// be parsed as a media type or carries no charset parameter.
func charsetFromContentType(contentType string) string {
	_, params, err := mime.ParseMediaType(contentType)
	if err != nil {
		return ""
	}
	cs, ok := params["charset"]
	if !ok {
		return ""
	}
	return strings.ToLower(strings.TrimSpace(cs))
}
Example #4
0
// ParseHTML parses rawHTML into an HTML node tree.
//
// The charset label is obtained from findCharset. When that label is anything
// other than "utf-8", the input is first routed through a decoder created by
// charset.NewReaderLabel; an error from creating that decoder is returned
// with a nil node.
func ParseHTML(rawHTML []byte) (*html.Node, error) {
	label := findCharset("", rawHTML)

	var src io.Reader = strings.NewReader(string(rawHTML))
	if label == "utf-8" {
		// Already UTF-8: parse the bytes untouched.
		return html.Parse(src)
	}

	// Any other label: translate to UTF-8 before parsing.
	decoded, err := charset.NewReaderLabel(label, src)
	if err != nil {
		return nil, err
	}
	return html.Parse(decoded)
}
Example #5
0
File: pcm.go Project: cfstras/pcm
// loadConns reads the connections file (connectionsPath after replaceHome),
// decodes it as "utf-16" labelled text and unmarshals the XML into a
// types.Configuration.
//
// After decoding, AllConnections is filled from listConnections and the root
// node is marked as expanded. Every step's error is handed to p together with
// a short description of the step.
// NOTE(review): p presumably aborts on a non-nil error — confirm against its
// definition.
func loadConns() (result types.Configuration) {
	path := replaceHome(connectionsPath)

	file, err := os.Open(path)
	p(err, "opening "+path)
	defer file.Close()

	decoded, err := charset.NewReaderLabel("utf-16", file)
	p(err, "loading charset")

	dec := xml.NewDecoder(decoded)
	// The input is already routed through the charset reader above, so the
	// XML decoder is given DummyReader as its CharsetReader.
	dec.CharsetReader = DummyReader
	err = dec.Decode(&result)
	p(err, "decoding xml")

	result.AllConnections = listConnections(&result, false)
	result.Root.Expanded = true

	return result
}
Example #6
0
// do performs the API call described by r and decodes the XML response body
// into result.
//
// The request URL is ROOT_URL + "/" + r.Method + ".xml". The client's API key
// and language, plus enableutf8=1, are added to r.Args and sent as the query
// string. When r.Auth is set, HTTP basic auth with the client's user and
// password is attached; an error is returned if either of them is empty.
//
// An error is returned when the URL or request cannot be built, the HTTP
// round trip fails, the response status is not 200 OK, or the XML cannot be
// decoded into result. The response body is always closed before returning.
func (c *Client) do(r request, result interface{}) error {
	u, err := url.Parse(ROOT_URL + "/" + r.Method + ".xml")
	if err != nil {
		return err
	}
	r.Args.Set("apikey", c.Apikey)
	r.Args.Set("lang", c.Lang)
	r.Args.Set("enableutf8", "1")
	u.RawQuery = r.Args.Encode()

	// HTTP method tokens are case-sensitive, so the canonical upper-case
	// form is used.
	req, err := http.NewRequest("GET", u.String(), nil)
	if err != nil {
		return err
	}
	if r.Auth {
		if c.user == "" || c.pass == "" {
			return errors.New("Not logged in: username or password empty")
		}
		req.SetBasicAuth(c.user, c.pass)
	}

	resp, err := c.http.Do(req)
	if err != nil {
		return err
	}
	// Register the close before any early return so the body is released on
	// the non-200 path as well.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return errors.New("Unexpected status code: " + resp.Status)
	}

	dec := xml.NewDecoder(resp.Body)
	// Let the decoder handle documents declared in a non-UTF-8 encoding.
	dec.CharsetReader = func(label string, in io.Reader) (io.Reader, error) {
		return charset.NewReaderLabel(label, in)
	}
	return dec.Decode(result)
}