func (this *HttpDownloader) changeCharsetEncodingAutoGzipSupport(contentTypeStr string, sor io.ReadCloser) string { var err error gzipReader, err := gzip.NewReader(sor) if err != nil { mlog.LogInst().LogError(err.Error()) return "" } defer gzipReader.Close() destReader, err := charset.NewReader(gzipReader, contentTypeStr) if err != nil { mlog.LogInst().LogError(err.Error()) destReader = sor } var sorbody []byte if sorbody, err = ioutil.ReadAll(destReader); err != nil { mlog.LogInst().LogError(err.Error()) // For gb2312, an error will be returned. // Error like: simplifiedchinese: invalid GBK encoding // return "" } //e,name,certain := charset.DetermineEncoding(sorbody,contentTypeStr) bodystr := string(sorbody) return bodystr }
func fetchHTML(url string) ([]byte, *url.URL, error) { r, e := get(url) if e != nil { return nil, nil, e } if !(r.StatusCode >= 200 && r.StatusCode < 300) { return nil, nil, errors.New("besticon: not found") } b, e := getBodyBytes(r) if e != nil { return nil, nil, e } if len(b) == 0 { return nil, nil, errors.New("besticon: empty response") } reader := bytes.NewReader(b) contentType := r.Header.Get("Content-Type") utf8reader, e := charset.NewReader(reader, contentType) if e != nil { return nil, nil, e } utf8bytes, e := ioutil.ReadAll(utf8reader) if e != nil { return nil, nil, e } return utf8bytes, r.Request.URL, nil }
// getHTMLPage - get html by http(s) as http.Response func getHTMLPage(url string, ua string, timeout int, dontDetectCharset bool) (htmlReader io.Reader, err error) { cookie, _ := cookiejar.New(nil) client := &http.Client{ Jar: cookie, Timeout: time.Duration(timeout) * time.Second, } request, err := http.NewRequest("GET", url, nil) if err != nil { return htmlReader, err } if ua != "" { request.Header.Set("User-Agent", ua) } response, err := client.Do(request) if err != nil { return htmlReader, err } if contentType := response.Header.Get("Content-Type"); contentType != "" && !dontDetectCharset { htmlReader, err = charset.NewReader(response.Body, contentType) if err != nil { return htmlReader, err } } else { return response.Body, nil } return htmlReader, nil }
func parseItemXml(client *http.Client, cookies []*http.Cookie, str string) *feeds.Item { var entry EntryXml /* print the item xml */ // fmt.Println(str) // change from gbk to utf8 d := xml.NewDecoder(bytes.NewReader([]byte(str))) d.CharsetReader = func(s string, r io.Reader) (io.Reader, error) { return charset.NewReader(r, s) } err := d.Decode(&entry) if err != nil { fmt.Printf("xml entryXml unmarshal failed: %v\n", err) return nil } url, err := fetchFeedUrl(client, cookies, BaseURL+entry.Item.Display.Url) if err != nil { return nil } return &feeds.Item{ Title: entry.Item.Display.Title, Link: &feeds.Link{Href: url}, Description: entry.Item.Display.Content, Id: entry.Item.Display.Docid, Author: &feeds.Author{Name: entry.Item.Display.Source}, //Created: entry.Item.Display.Date, Updated: modifyTime(entry.Item.Display.Update), } }
// send uses the given *http.Request to make an HTTP request.
// The response body is read into bow.body (decoded to UTF-8 when possible),
// parsed into a goquery document, and pushed onto the browser history.
func (bow *Browser) httpRequest(req *http.Request) error {
	bow.preSend()
	resp, err := bow.buildClient().Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	// A 503 served by "cloudflare-nginx" is treated as a Cloudflare
	// challenge page; hand it to solveCF before giving up.
	if resp.StatusCode == 503 && resp.Header.Get("Server") == "cloudflare-nginx" {
		if !bow.solveCF(resp, req.URL) {
			return fmt.Errorf("Page protected with cloudflare with unknown algorythm")
		} else {
			return nil
		}
	}
	content_type := resp.Header.Get("Content-Type")
	if resp.StatusCode != 403 {
		if content_type == "text/html; charset=GBK" {
			// Explicitly-declared GBK: decode through mahonia.
			enc := mahonia.NewDecoder("gbk")
			e := enc.NewReader(resp.Body)
			bow.body, err = ioutil.ReadAll(e)
			if err != nil {
				return err
			}
		} else {
			// Otherwise let x/net/html/charset pick the encoding from the
			// Content-Type header; on failure fall back to the raw body.
			// NOTE: err is intentionally shadowed inside this branch.
			fixedBody, err := charset.NewReader(resp.Body, content_type)
			if err == nil {
				bow.body, err = ioutil.ReadAll(fixedBody)
				if err != nil {
					return err
				}
			} else {
				bow.body, err = ioutil.ReadAll(resp.Body)
				if err != nil {
					return err
				}
			}
		}
		bow.contentConversion(content_type)
	} else {
		// 403: substitute an empty document instead of failing outright.
		bow.body = []byte(`<html></html>`)
	}
	buff := bytes.NewBuffer(bow.body)
	dom, err := goquery.NewDocumentFromReader(buff)
	if err != nil {
		return err
	}
	// Record the previous state and advance to the newly loaded page.
	bow.history.Push(bow.state)
	bow.state = jar.NewHistoryState(req, resp, dom)
	bow.postSend()
	return nil
}
// forecast builds a fixed-width textual hourly forecast report for loc by
// downloading and parsing the forecast XML feed for the location's
// latitude/longitude. Returns errFc on any download/parse problem.
func forecast(loc location) (string, error) {
	doc := xmlx.New()
	url := fmt.Sprintf(fcURLFmt, loc.Lat, loc.Lon)
	// Load the XML, letting x/net/html/charset transcode non-UTF-8 feeds
	// (str is the encoding label handed to us by the XML parser).
	err := doc.LoadUri(url, func(str string, rdr io.Reader) (io.Reader, error) { return charset.NewReader(rdr, str) })
	if err != nil {
		return "", errFc
	}
	startTimeNodes := doc.SelectNodes("", "start-valid-time")
	endTimeNodes := doc.SelectNodes("", "end-valid-time")
	if len(startTimeNodes) == 0 || len(endTimeNodes) == 0 {
		return "", errFc
	}
	// Cap the report at maxHours worth of data points.
	if len(endTimeNodes) > maxHours {
		endTimeNodes = endTimeNodes[:maxHours]
	}
	// Parse errors are ignored here; a zero time.Time simply yields an
	// epoch-ish timestamp in the header line.
	startTime, _ := time.Parse(time.RFC3339, startTimeNodes[0].GetValue())
	endTime, _ := time.Parse(time.RFC3339, endTimeNodes[len(endTimeNodes)-1].GetValue())
	// Pull each data series out of the document.
	temps := findVals("temperature", "hourly", doc)
	humids := findVals("humidity", "", doc)
	precips := findVals("probability-of-precipitation", "", doc)
	speeds := findVals("wind-speed", "sustained", doc)
	dirs := findVals("direction", "", doc)
	// Each graph is a sparkline-style string plus its min/max bounds.
	minTemp, maxTemp, tempGraph := makeGraph(temps)
	minHumid, maxHumid, humidGraph := makeGraph(humids)
	minPrecip, maxPrecip, precipGraph := makeGraph(precips)
	minSpeed, maxSpeed, speedGraph := makeGraph(speeds)
	// Wind direction is rendered as one arrow rune per sample.
	dirGraph := ""
	for _, dir := range dirs {
		idx := dirIndex(dir)
		dirGraph += string([]rune(arrows)[idx])
	}
	timeFmt := "2006-01-02 15:04"
	start, end := startTime.Format(timeFmt), endTime.Format(timeFmt)
	tempRange := fmt.Sprintf("%3d %3d", minTemp, maxTemp)
	humidRange := fmt.Sprintf("%3d %3d", minHumid, maxHumid)
	precipRange := fmt.Sprintf("%3d %3d", minPrecip, maxPrecip)
	speedRange := fmt.Sprintf("%3d %3d", minSpeed, maxSpeed)
	// Assemble the final report, one aligned row per series.
	out := fmt.Sprintf("Forecast for %s\n", loc.Name)
	out += fmt.Sprintf(" min max %-24s%24s\n", start, end)
	out += fmt.Sprintf("Temp °F %7s %s\n", tempRange, tempGraph)
	out += fmt.Sprintf("Humid %% %7s %s\n", humidRange, humidGraph) // esc % 2X for later fmt use
	out += fmt.Sprintf("Precip %% %7s %s\n", precipRange, precipGraph)
	out += fmt.Sprintf("Wind mph %7s %s\n", speedRange, speedGraph)
	out += fmt.Sprintf("Wind dir %s\n", dirGraph)
	return out, nil
}
// Parse return information about page // @param s - contains page source // @params pageURL - contains URL from where the data was taken [optional] // @params contentType - contains Content-Type header value [optional] // if no url is given then parser won't attempt to parse oembed info func (info *HTMLInfo) Parse(s io.Reader, pageURL *string, contentType *string) error { contentTypeStr := "text/html" if contentType != nil && len(*contentType) > 0 { contentTypeStr = *contentType } utf8s, err := charset.NewReader(s, contentTypeStr) if err != nil { return err } if pageURL != nil { tu, _ := url.Parse(*pageURL) info.url = tu } doc, err := html.Parse(utf8s) if err != nil { return err } var f func(*html.Node) f = func(n *html.Node) { for c := n.FirstChild; c != nil; c = c.NextSibling { if c.Type == html.ElementNode { if c.Data == "head" { info.parseHead(c) continue } else if c.Data == "body" { info.parseBody(c) continue } } f(c) } } f(doc) if info.AllowOembedFetching && pageURL != nil && len(info.OembedJSONURL) > 0 { pu, _ := url.Parse(info.OembedJSONURL) siteName := info.OGInfo.SiteName siteURL := strings.ToLower(pu.Scheme) + "://" + pu.Host if len(siteName) == 0 { siteName = pu.Host } oiItem := &oembed.Item{EndpointURL: info.OembedJSONURL, ProviderName: siteName, ProviderURL: siteURL, IsEndpointURLComplete: true} oi, _ := oiItem.FetchOembed(*pageURL, info.Client) if oi != nil && oi.Status < 300 { info.OembedInfo = oi } } return nil }
func (p *Page) Iconv(reader io.Reader) (io.Reader, error) { contentType := p.ContentType switch { case contain(contentType, "text"): return charset.NewReader(reader, contentType) } return reader, nil }
// 采用surf内核下载时,可以尝试自动转码为utf8 // 采用phantomjs内核时,无需转码(已是utf8) func AutoToUTF8(resp *http.Response) error { destReader, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type")) if err == nil { resp.Body = &Body{ ReadCloser: resp.Body, Reader: destReader, } } return err }
func parseXML(content []byte, v interface{}) error { d := xml.NewDecoder(bytes.NewReader(content)) d.CharsetReader = func(s string, r io.Reader) (io.Reader, error) { //converts GBK to UTF-8. if s == "GBK" { return transform.NewReader(r, simplifiedchinese.GB18030.NewDecoder()), nil } return charset.NewReader(r, s) } err := d.Decode(v) return err }
// ParseResponse - wrapps sequence of URL fate functions // user is response to handle: defer response.Body.Close() func ParseResponse(response *http.Response, toks ...Tok) error { contentType := response.Header.Get("Content-Type") if !IsTextHTML(contentType) { return ErrResponseBodyIsNotHTML } if response.Body == nil { return ErrResponseBodyIsEmpty } r, err := charset.NewReader(response.Body, contentType) if err != nil { return err } return ParseReader(r, toks...) }
// Charset auto determine. Use golang.org/x/net/html/charset. Get page body and change it to utf-8 // 自动转码 func (self *HttpDownloader) changeCharsetEncodingAuto(contentTypeStr string, sor io.ReadCloser) string { if len(strings.Split(contentTypeStr, " ")) < 2 { contentTypeStr = self.DefaultContentType } destReader, err := charset.NewReader(sor, contentTypeStr) if err != nil { mlog.LogInst().LogError(err.Error()) destReader = sor } var sorbody []byte sorbody, err = ioutil.ReadAll(destReader) if err != nil { mlog.LogInst().LogError(err.Error()) } bodystr := string(sorbody) return bodystr }
func changeCharsetEncodingAuto(sor io.ReadCloser, contentTypeStr string) string { var err error destReader, err := charset.NewReader(sor, contentTypeStr) if err != nil { log.Error(err) destReader = sor } var sorbody []byte if sorbody, err = ioutil.ReadAll(destReader); err != nil { log.Error(err) } bodystr := string(sorbody) return bodystr }
func main() { if lang != "ru" { prefix = "Title" unkPrefix = "HTTP Header" } if len(flag.Args()) > 0 && len(flag.Arg(0)) > 0 { link = flag.Arg(0) } if len(link) <= 4 || link[0:4] != "http" { usage(lang) return } res, err := http.Get(link) contType = res.Header.Get("Content-Type") if showHeaders || len(contType) >= 9 && contType[0:9] != "text/html" { if ok, _ := regexp.MatchString(exclude, contType); ok && exclude != "" { return } fmt.Print(unkPrefix + ":") for k, v := range res.Header { if showHeaders || (k == "Content-Type" || k == "Content-Length") { fmt.Printf("\n%s: %s", k, v) } } return } if err != nil { log.Fatal(err) } var title string if text, err := charset.NewReader(res.Body, contType); err == nil { title, _ = getTag(text, "title") } title = strings.Trim(title, "\n ") if len(title) > 0 { fmt.Print(prefix + ": " + title) } res.Body.Close() if err != nil { log.Fatal(err) } }
// Fetch get contents and extract it. func (c *Crawler) Fetch(rawurl string) (*Article, error) { client := http.DefaultClient client.Timeout = c.config.timeout req, err := http.NewRequest("GET", rawurl, nil) if err != nil { return nil, err } req.Header.Set("User-Agent", c.config.browserUserAgent) resp, err := client.Do(req) if err != nil { return nil, err } defer resp.Body.Close() reader, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type")) if err != nil { return nil, err } return c.Extract(reader, rawurl) }
//получение страницы из урла url func gethtmlpage(url string) []byte { resp, err := http.Get(url) if err != nil { LogFile.Println("HTTP error:", err) panic("HTTP error") } defer resp.Body.Close() // вот здесь и начинается самое интересное utf8, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type")) if err != nil { LogFile.Println("Encoding error:", err) panic("Encoding error") } body, err := ioutil.ReadAll(utf8) if err != nil { LogFile.Println("IO error:", err) panic("IO error") } return body }
// Charset auto determine. Use golang.org/x/net/html/charset. Get response body and change it to utf-8 func changeCharsetEncodingAuto(sor io.ReadCloser, contentTypeStr string) string { var err error destReader, err := charset.NewReader(sor, contentTypeStr) if err != nil { logs.Log.Error("%v", err) destReader = sor } var sorbody []byte if sorbody, err = ioutil.ReadAll(destReader); err != nil { logs.Log.Error("%v", err) // For gb2312, an error will be returned. // Error like: simplifiedchinese: invalid GBK encoding // return "" } //e,name,certain := charset.DetermineEncoding(sorbody,contentTypeStr) bodystr := string(sorbody) return bodystr }
// GetBodyStr returns plain string crawled. func (self *Context) initText() { defer self.Response.Body.Close() // get converter to utf-8 // Charset auto determine. Use golang.org/x/net/html/charset. Get response body and change it to utf-8 destReader, err := charset.NewReader(self.Response.Body, self.Response.Header.Get("Content-Type")) if err != nil { logs.Log.Warning(err.Error()) destReader = self.Response.Body } sorbody, err := ioutil.ReadAll(destReader) if err != nil { logs.Log.Error(err.Error()) return // For gb2312, an error will be returned. // Error like: simplifiedchinese: invalid GBK encoding } //e,name,certain := charset.DetermineEncoding(sorbody,self.Response.Header.Get("Content-Type")) self.text = util.Bytes2String(sorbody) }
// Charset auto determine. Use golang.org/x/net/html/charset. Get response body and change it to utf-8 func (self *HttpDownloader) changeCharsetEncodingAuto(contentTypeStr string, sor io.ReadCloser) string { var err error destReader, err := charset.NewReader(sor, contentTypeStr) if err != nil { reporter.Log.Println(err.Error()) destReader = sor } var sorbody []byte if sorbody, err = ioutil.ReadAll(destReader); err != nil { reporter.Log.Println(err.Error()) // For gb2312, an error will be returned. // Error like: simplifiedchinese: invalid GBK encoding // return "" } //e,name,certain := charset.DetermineEncoding(sorbody,contentTypeStr) bodystr := string(sorbody) return bodystr }
func (this *Curl) Do() (*Response, error) { resp, err := this.client.Do(this.req.Request) if err != nil { return NewResponse(nil, this.req.Url, ""), err } defer resp.Body.Close() var body string if resp.StatusCode == 200 { if resp.Header.Get("Content-Encoding") == "gzip" { reader, _ := gzip.NewReader(resp.Body) for { buf := make([]byte, 1024) n, err := reader.Read(buf) if err != nil && err != io.EOF { return NewResponse(nil, this.req.Url, ""), err } if n == 0 { break } body += string(buf) } } else { contentType := resp.Header.Get("Content-Type") newBody, err := charset.NewReader(resp.Body, contentType) if err != nil { return NewResponse(nil, this.req.Url, ""), err } bodyByte, err := ioutil.ReadAll(newBody) if err != nil { return NewResponse(nil, this.req.Url, ""), err } body = string(bodyByte) } } else { return NewResponse(nil, this.req.Url, ""), errors.New(fmt.Sprintf("Response StatusCode: %d", resp.StatusCode)) } return NewResponse(resp, this.req.Url, body), nil }
//получение страницы из урла url func gethtmlpage(url string) ([]byte, bool) { resp, err := http.Get(url) if err != nil { fmt.Println("HTTP error:", err) // panic("HTTP error") return make([]byte, 0), false } defer resp.Body.Close() // вот здесь и начинается самое интересное utf8, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type")) if err != nil { fmt.Println("Encoding error:", err) return make([]byte, 0), false // panic("Encoding error") } body, err := ioutil.ReadAll(utf8) if err != nil { fmt.Println("IO error:", err) return make([]byte, 0), false // panic("IO error") } return body, true }
// decodeSection attempts to decode the data from reader using the algorithm listed in // the Content-Transfer-Encoding header, returning the raw data if it does not known // the encoding type. func decodeSection(transfer_encoding string, content_type string, mediatype string, reader io.Reader) ([]byte, error) { // Default is to just read input into bytes decoder := reader switch strings.ToLower(transfer_encoding) { case "quoted-printable": decoder = qprintable.NewDecoder(qprintable.WindowsTextEncoding, reader) case "base64": cleaner := NewBase64Cleaner(reader) decoder = base64.NewDecoder(base64.StdEncoding, cleaner) } if len(mediatype) > 4 && mediatype[0:5] == "text/" { // Decode text to utf-8 readerInUTF8, err := charset.NewReader(decoder, content_type) if err != nil { return nil, err } buf := new(bytes.Buffer) _, err = buf.ReadFrom(readerInUTF8) if err != nil { return nil, err } return buf.Bytes(), nil } else { // Pass raw data buf := new(bytes.Buffer) _, err := buf.ReadFrom(decoder) if err != nil { return nil, err } return buf.Bytes(), nil } // Read bytes into buffer }
// simple xml to string support utf8 func XML2mapstr(xmldoc string) map[string]string { var t xml.Token var err error inputReader := strings.NewReader(xmldoc) decoder := xml.NewDecoder(inputReader) decoder.CharsetReader = func(s string, r io.Reader) (io.Reader, error) { return charset.NewReader(r, s) } m := make(map[string]string, 32) key := "" for t, err = decoder.Token(); err == nil; t, err = decoder.Token() { switch token := t.(type) { case xml.StartElement: key = token.Name.Local case xml.CharData: content := string([]byte(token)) m[key] = content default: // ... } } return m }
// WrappedCharsetReader adapts charset.NewReader to the (label, reader)
// argument order used by callbacks such as xml.Decoder.CharsetReader.
// NOTE(review): charset.NewReader's second argument is a Content-Type
// value, not a bare charset label — a bare label presumably falls back to
// content sniffing; confirm against the callers.
func WrappedCharsetReader(s string, i io.Reader) (io.Reader, error) {
	return charset.NewReader(i, s)
}
// initText reads the response body into self.text, transcoding it to UTF-8
// when a non-UTF-8 charset can be determined. (Original header comment said
// "GetBodyStr returns plain string crawled".)
func (self *Context) initText() {
	var err error

	// When downloaded with the surf engine, attempt automatic transcoding.
	// (The phantomjs engine path skips this block and reads the body raw.)
	if self.Request.DownloaderID == request.SURF_ID {
		var contentType, pageEncode string
		// Prefer the charset declared in the response header.
		// NOTE: err is intentionally shadowed in these two if-statements.
		contentType = self.Response.Header.Get("Content-Type")
		if _, params, err := mime.ParseMediaType(contentType); err == nil {
			if cs, ok := params["charset"]; ok {
				pageEncode = strings.ToLower(strings.TrimSpace(cs))
			}
		}
		// When the response header names no charset, fall back to the
		// request header's Content-Type.
		if len(pageEncode) == 0 {
			contentType = self.Request.Header.Get("Content-Type")
			if _, params, err := mime.ParseMediaType(contentType); err == nil {
				if cs, ok := params["charset"]; ok {
					pageEncode = strings.ToLower(strings.TrimSpace(cs))
				}
			}
		}

		switch pageEncode {
		// Already UTF-8: no transcoding needed.
		case "utf8", "utf-8", "unicode-1-1-utf-8":
		default:
			// A non-UTF-8 (or unknown) encoding: transcode to UTF-8.
			// get converter to utf-8
			// Charset auto determine. Use golang.org/x/net/html/charset. Get response body and change it to utf-8
			var destReader io.Reader

			if len(pageEncode) == 0 {
				// Unknown charset: let the library sniff it from the bytes.
				destReader, err = charset.NewReader(self.Response.Body, "")
			} else {
				// Known charset label: build the matching decoder directly.
				destReader, err = charset.NewReaderLabel(pageEncode, self.Response.Body)
			}

			if err == nil {
				self.text, err = ioutil.ReadAll(destReader)
				if err == nil {
					self.Response.Body.Close()
					return
				} else {
					// Transcoding read failed: log and fall through to the
					// raw read below.
					logs.Log.Warning(" * [convert][%v]: %v (ignore transcoding)\n", self.GetUrl(), err)
				}
			} else {
				// Converter construction failed: log and fall through.
				logs.Log.Warning(" * [convert][%v]: %v (ignore transcoding)\n", self.GetUrl(), err)
			}
		}
	}

	// No transcoding: read the body as-is.
	self.text, err = ioutil.ReadAll(self.Response.Body)
	self.Response.Body.Close()
	if err != nil {
		panic(err.Error())
		// NOTE(review): this return is unreachable after panic.
		return
	}
}
func (t *URLTitleExtractor) WriteURLTitle(event *irc.Event) { var urls []string = FindURLs(event.Arguments[1]) var err error var resp *http.Response var contentType string var foundcharset string var ureader io.Reader var htmlnode *html.Node for _, oneurl := range urls { // URL valid? _, err = url.Parse(oneurl) if err != nil { continue } resp, err = http.Head(oneurl) if err != nil { log.Print("Error getting Head: ", err) continue } // No HTML? contentType = resp.Header.Get("Content-Type") // Content type does not start with "text/html" or "application/xhtml+xml"? if !strings.HasPrefix(contentType, "text/html") && !strings.HasPrefix(contentType, "application/xhtml+xml") { log.Print("Wrong content type: ", contentType, " Expecting application/xhtml+xml or text/html") continue } // Get the charset foundcharset = ExtractCharset(contentType) // Get the Body resp, err = http.Get(oneurl) if err != nil { log.Print("Error during HTTP GET: ", err) continue } // Close later defer resp.Body.Close() if foundcharset != "" && strings.ToLower(foundcharset) != "utf-8" && strings.ToLower(foundcharset) != "utf8" { log.Print("Converting from ", foundcharset, " to UTF-8") ureader, err = charset.NewReader(resp.Body, foundcharset) if err != nil { log.Print("Error during utf-8 transformation: ", err) continue } } else { ureader = resp.Body } // Get the top HTML node htmlnode, err = html.Parse(ureader) if err != nil { log.Print("Error parsing HTML file: ", err) continue } var htmltag *html.Node = htmlnode.FirstChild // doctype, if well formed // Advance until we find the html tag or until no elements are left. 
for htmltag != nil && (htmltag.Type != html.ElementNode || htmltag.DataAtom != atom.Html) { htmltag = htmltag.NextSibling } // In case of broken HTML where everything is a top level element: if htmltag == nil { htmltag = htmlnode.FirstChild } else { htmlnode = htmltag // If head is missing we can continue from here htmltag = htmltag.FirstChild } for htmltag != nil && (htmltag.Type != html.ElementNode || htmltag.DataAtom != atom.Head) { htmltag = htmltag.NextSibling } // In case of even more broken HTML where even the Head is missing if htmltag == nil { htmltag = htmlnode.FirstChild } else { htmlnode = htmltag htmltag = htmltag.FirstChild // Go into head's first child } // Continue until finding title element or no elements are left for htmltag != nil && (htmltag.Type != html.ElementNode || htmltag.DataAtom != atom.Title) { htmltag = htmltag.NextSibling } if htmltag != nil && htmltag.FirstChild != nil && htmltag.FirstChild.Type == html.TextNode { log.Print(htmltag.FirstChild.Data) // Add a new message to the buffer to be delivered when it's time. t.msgbuffer.AddMessage(event.Arguments[0], "Title: "+strings.TrimSpace(htmltag.FirstChild.Data)) } } }