func DecodeReader(s io.Reader, enc string) (string, error) { reader, err := charset.NewReaderLabel(enc, s) if err != nil { return "", err } bytes, err := ioutil.ReadAll(reader) if err != nil { return "", err } return string(bytes), nil }
func NewReaderLabel(label string, input io.Reader) (io.Reader, error) { conv, err := charset.NewReaderLabel(label, input) if err != nil { return nil, err } // Wrap the charset decoder reader with a XML sanitizer //clean := NewXMLSanitizerReader(conv) return conv, nil }
// GetBodyStr returns plain string crawled. func (self *Context) initText() { // 采用surf内核下载时,尝试自动转码 if self.Request.DownloaderID == request.SURF_ID { var contentType, pageEncode string // 优先从响应头读取编码类型 contentType = self.Response.Header.Get("Content-Type") if _, params, err := mime.ParseMediaType(contentType); err == nil { if cs, ok := params["charset"]; ok { pageEncode = strings.ToLower(strings.TrimSpace(cs)) } } // 响应头未指定编码类型时,从请求头读取 if len(pageEncode) == 0 { contentType = self.Request.Header.Get("Content-Type") if _, params, err := mime.ParseMediaType(contentType); err == nil { if cs, ok := params["charset"]; ok { pageEncode = strings.ToLower(strings.TrimSpace(cs)) } } } switch pageEncode { // 不做转码处理 case "", "utf8", "utf-8", "unicode-1-1-utf-8": default: // 指定了编码类型,但不是utf8时,自动转码为utf8 // get converter to utf-8 // Charset auto determine. Use golang.org/x/net/html/charset. Get response body and change it to utf-8 destReader, err := charset.NewReaderLabel(pageEncode, self.Response.Body) if err == nil { self.text, err = ioutil.ReadAll(destReader) if err == nil { self.Response.Body.Close() return } else { logs.Log.Warning(" * [convert][%v]: %v (ignore transcoding)\n", self.GetUrl(), err) } } else { logs.Log.Warning(" * [convert][%v]: %v (ignore transcoding)\n", self.GetUrl(), err) } } } // 不做转码处理 var err error self.text, err = ioutil.ReadAll(self.Response.Body) self.Response.Body.Close() if err != nil { panic(err.Error()) return } }
func ParseHTML(rawHTML []byte) (*html.Node, error) { enc := findCharset("", rawHTML) var r io.Reader r = strings.NewReader(string(rawHTML)) if enc != "utf-8" { // we'll be translating to utf-8 var err error r, err = charset.NewReaderLabel(enc, r) if err != nil { return nil, err } } return html.Parse(r) }
func loadConns() (result types.Configuration) { filename := replaceHome(connectionsPath) rd, err := os.Open(filename) p(err, "opening "+filename) defer rd.Close() rd2, err := charset.NewReaderLabel("utf-16", rd) p(err, "loading charset") decoder := xml.NewDecoder(rd2) decoder.CharsetReader = DummyReader p(decoder.Decode(&result), "decoding xml") result.AllConnections = listConnections(&result, false) result.Root.Expanded = true return }
func (c *Client) do(r request, result interface{}) error { u, err := url.Parse(ROOT_URL + "/" + r.Method + ".xml") if err != nil { return err } r.Args.Set("apikey", c.Apikey) r.Args.Set("lang", c.Lang) r.Args.Set("enableutf8", "1") u.RawQuery = r.Args.Encode() req, err := http.NewRequest("get", u.String(), nil) if err != nil { return err } if r.Auth { if c.user == "" || c.pass == "" { return errors.New("Not logged in: username or password empty") } req.SetBasicAuth(c.user, c.pass) } resp, err := c.http.Do(req) if err != nil { return err } if resp.StatusCode != http.StatusOK { return errors.New("Unexpected status code: " + resp.Status) } // b, _ := ioutil.ReadAll(resp.Body) // fmt.Println(string(b)) defer resp.Body.Close() dec := xml.NewDecoder(resp.Body) dec.CharsetReader = func(label string, in io.Reader) (io.Reader, error) { return charset.NewReaderLabel(label, in) } if err := dec.Decode(result); err != nil { return err } return nil }