func testCodepage(t *testing.T, name string, inReader, outReader func(io.Reader) io.Reader) {
	// Build a buffer holding every possible byte value 0x00-0xff.
	data := make([]byte, 256)
	for i := range data {
		data[i] = byte(i)
	}
	inr := inReader(bytes.NewBuffer(data))
	r, err := charset.NewReader(name, inr)
	if err != nil {
		t.Fatalf("cannot make reader for charset %q: %v", name, err)
	}
	outr := outReader(r)
	r = outr
	var outbuf bytes.Buffer
	w, err := charset.NewWriter(name, &outbuf)
	if err != nil {
		t.Fatalf("cannot make writer for charset %q: %v", name, err)
	}
	// Decode from the codepage to UTF-8 and encode straight back again.
	_, err = io.Copy(w, r)
	if err != nil {
		t.Fatalf("copy failed: %v", err)
	}
	err = w.Close()
	if err != nil {
		t.Fatalf("close failed: %v", err)
	}
	if outbuf.Len() != len(data) {
		t.Fatalf("short result of roundtrip, charset %q, readers %T, %T; expected 256, got %d", name, inr, outr, outbuf.Len())
	}
	// The round trip must reproduce every byte exactly.
	for i, x := range outbuf.Bytes() {
		if data[i] != x {
			t.Fatalf("charset %q, round trip at index %d: expected %d, got %d", name, i, data[i], x)
		}
	}
}
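// A hypothetical invocation of testCodepage (the charset name and the
// pass-through wrappers are illustrative; the real tests presumably install
// buffering or chunking readers here). latin1 maps all 256 byte values, so
// the full round trip is well defined.
func TestLatin1RoundTrip(t *testing.T) {
	identity := func(r io.Reader) io.Reader { return r }
	testCodepage(t, "latin1", identity, identity)
}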
// Will receive an input stream in which the response has been converted to UTF-8.
// The given function must close the reader r in order to close the response body.
func HandleStringReader(f func(r io.Reader, ctx *goproxy.ProxyCtx) io.Reader) goproxy.RespHandler {
	return goproxy.FuncRespHandler(func(resp *http.Response, ctx *goproxy.ProxyCtx) *http.Response {
		if ctx.Error != nil {
			return nil
		}
		charsetName := ctx.Charset()
		if charsetName == "" {
			charsetName = "utf-8"
		}
		if strings.ToLower(charsetName) != "utf-8" {
			// Decode the body from its declared charset to UTF-8 before handing it to f.
			r, err := charset.NewReader(charsetName, resp.Body)
			if err != nil {
				ctx.Warnf("Cannot convert from %v to utf-8: %v", charsetName, err)
				return resp
			}
			// Re-encode f's output back to the original charset.
			tr, err := charset.TranslatorTo(charsetName)
			if err != nil {
				ctx.Warnf("Can't translate to %v from utf-8: %v", charsetName, err)
				return resp
			}
			newr := charset.NewTranslatingReader(f(r, ctx), tr)
			resp.Body = &readFirstCloseBoth{ioutil.NopCloser(newr), resp.Body}
		} else {
			// No translation is needed; the body is already UTF-8.
			resp.Body = &readFirstCloseBoth{ioutil.NopCloser(f(resp.Body, ctx)), resp.Body}
		}
		return resp
	})
}
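// A minimal usage sketch, assuming a standard goproxy setup (exampleProxy,
// the handler body, and the listen address are illustrative, not from the
// original code): install HandleStringReader on every response so the wrapped
// function always sees UTF-8.
func exampleProxy() {
	proxy := goproxy.NewProxyHttpServer()
	proxy.OnResponse().Do(HandleStringReader(
		func(r io.Reader, ctx *goproxy.ProxyCtx) io.Reader {
			// r is already decoded to UTF-8 here; wrap or filter it as needed.
			return r
		}))
	log.Fatal(http.ListenAndServe(":8080", proxy))
}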
func ExampleNewReader() {
	r, err := charset.NewReader("latin1", strings.NewReader("\xa35 for Pepp\xe9"))
	if err != nil {
		log.Fatal(err)
	}
	result, err := ioutil.ReadAll(r)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s\n", result)
	// Output: £5 for Peppé
}
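// A complementary sketch, not part of the original examples: the inverse of
// ExampleNewReader, encoding UTF-8 text back to latin1 with charset.NewWriter
// (its signature is assumed from the uses elsewhere in this section).
func ExampleNewWriter() {
	var buf bytes.Buffer
	w, err := charset.NewWriter("latin1", &buf)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Fprint(w, "£5 for Peppé")
	w.Close() // Close flushes any buffered conversion state.
	fmt.Printf("% x\n", buf.Bytes())
	// Output: a3 35 20 66 6f 72 20 50 65 70 70 e9
}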
func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "usage: tcs [-l] [-v] [charset]\n")
		fmt.Fprintf(os.Stderr, "\ttcs [-f charset] [-t charset] [file]\n")
	}
	flag.Parse()
	if *listFlag {
		cs := ""
		switch flag.NArg() {
		case 1:
			cs = flag.Arg(0)
		case 0:
		default:
			flag.Usage()
			os.Exit(2)
		}
		listCharsets(*verboseFlag, cs)
		return
	}
	var f *os.File
	switch flag.NArg() {
	case 0:
		f = os.Stdin
	case 1:
		var err error
		f, err = os.Open(flag.Arg(0))
		if err != nil {
			fatalf("cannot open %q: %v", flag.Arg(0), err)
		}
	default:
		flag.Usage()
		os.Exit(2)
	}
	r, err := charset.NewReader(*fromCharset, f)
	if err != nil {
		fatalf("cannot translate from %q: %v", *fromCharset, err)
	}
	w, err := charset.NewWriter(*toCharset, os.Stdout)
	if err != nil {
		fatalf("cannot translate to %q: %v", *toCharset, err)
	}
	_, err = io.Copy(w, r)
	if err != nil {
		fatalf("%v", err)
	}
	// Close the writer to flush any remaining encoded output.
	w.Close()
}
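// fatalf is referenced above but not shown in this snippet; a minimal sketch
// of what it presumably looks like (the message prefix and exit code are
// assumptions):
func fatalf(format string, args ...interface{}) {
	fmt.Fprintf(os.Stderr, "tcs: "+format+"\n", args...)
	os.Exit(2)
}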
// Crawl fetches the HTML body and returns an Article
func (c Crawler) Crawl() (*Article, error) {
	article := new(Article)
	c.assignParseCandidate()
	err := c.assignHTML()
	if err != nil {
		return nil, err
	}
	// This was supposed to have been set by assignHTML, so something went wrong
	if c.RawHTML == "" {
		return article, nil
	}
	reader := strings.NewReader(c.RawHTML)
	document, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return nil, err
	}
	// Look for a <meta http-equiv="Content-Type"> tag and capture its content
	// attribute inside the callback; EachWithBreak stops at the first match.
	var contentType string
	document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool {
		attr, exists := s.Attr("http-equiv")
		if exists && attr == "Content-Type" {
			contentType, _ = s.Attr("content")
			return false
		}
		return true
	})
	contentType = strings.Replace(contentType, " ", "", -1)
	if strings.HasPrefix(contentType, "text/html;charset=") {
		cs := strings.ToLower(strings.TrimPrefix(contentType, "text/html;charset="))
		if cs != "utf-8" {
			// Re-decode the raw HTML from its declared charset to UTF-8, then parse again.
			r, err1 := charset.NewReader(cs, strings.NewReader(c.RawHTML))
			if err1 != nil {
				// On error, skip the read
				c.RawHTML = ""
			} else {
				utf8, _ := ioutil.ReadAll(r)
				c.RawHTML = string(utf8)
			}
			reader = strings.NewReader(c.RawHTML)
			document, err = goquery.NewDocumentFromReader(reader)
			if err != nil {
				return nil, err
			}
		}
	}
	extractor := NewExtractor(c.config)
	html, err := document.Html()
	if err != nil {
		return nil, err
	}
	startTime := time.Now().UnixNano()
	article.RawHTML = html
	article.FinalURL = c.helper.url
	article.LinkHash = c.helper.linkHash
	article.Doc = document
	article.Title = extractor.getTitle(article)
	article.MetaLang = extractor.getMetaLanguage(article)
	article.MetaFavicon = extractor.getFavicon(article)
	article.MetaDescription = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)^description$]")
	article.MetaKeywords = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)^keywords$]")
	article.CanonicalLink = extractor.getCanonicalLink(article)
	article.Domain = extractor.getDomain(article)
	article.Tags = extractor.getTags(article)
	cleaner := NewCleaner(c.config)
	article.Doc = cleaner.clean(article)
	article.TopImage = OpenGraphResolver(article)
	if article.TopImage == "" {
		article.TopImage = WebPageResolver(article)
	}
	article.TopNode = extractor.calculateBestNode(article)
	if article.TopNode != nil {
		article.TopNode = extractor.postCleanup(article.TopNode)
		outputFormatter := new(outputFormatter)
		article.CleanedText, article.Links = outputFormatter.getFormattedText(article)
		videoExtractor := NewVideoExtractor()
		article.Movies = videoExtractor.GetVideos(article)
	}
	article.Delta = time.Now().UnixNano() - startTime
	return article, nil
}
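// The re-decoding step in the middle of Crawl is the part that touches
// charset.NewReader. Isolated as a hedged sketch (decodeToUTF8 is a
// hypothetical helper name, not part of the original code): given raw HTML
// and a non-UTF-8 charset name, return the UTF-8 text, or "" on failure,
// mirroring Crawl's skip-on-error behaviour.
func decodeToUTF8(rawHTML, cs string) string {
	r, err := charset.NewReader(cs, strings.NewReader(rawHTML))
	if err != nil {
		// Unknown charset: no registered decoder, so give up.
		return ""
	}
	utf8Bytes, err := ioutil.ReadAll(r)
	if err != nil {
		return ""
	}
	return string(utf8Bytes)
}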
func (this Crawler) Crawl() *Article {
	article := new(Article)
	this.assignParseCandidate()
	this.assignHtml()
	if this.rawHtml == "" {
		return article
	}
	reader := strings.NewReader(this.rawHtml)
	document, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		panic(err.Error())
	}
	// Look for a <meta http-equiv="Content-Type"> tag and capture its content
	// attribute inside the callback; EachWithBreak stops at the first match.
	attr := ""
	document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool {
		httpEquiv, exists := s.Attr("http-equiv")
		if exists && httpEquiv == "Content-Type" {
			attr, _ = s.Attr("content")
			return false
		}
		return true
	})
	attr = strings.Replace(attr, " ", "", -1)
	if strings.HasPrefix(attr, "text/html;charset=") {
		cs := strings.ToLower(strings.TrimPrefix(attr, "text/html;charset="))
		if cs != "utf-8" {
			// Re-decode the raw HTML from its declared charset and parse again.
			// The NewReader error gets its own variable so that the re-parse
			// assigns the outer err checked below (the original shadowed it).
			r, err1 := charset.NewReader(cs, strings.NewReader(this.rawHtml))
			if err1 != nil {
				// On error, skip the read
				this.rawHtml = ""
			} else {
				utf8, _ := ioutil.ReadAll(r)
				this.rawHtml = string(utf8)
			}
			reader = strings.NewReader(this.rawHtml)
			document, err = goquery.NewDocumentFromReader(reader)
		}
	}
	if err == nil {
		extractor := NewExtractor(this.config)
		html, _ := document.Html()
		start := TimeInNanoseconds()
		article.RawHtml = html
		article.FinalUrl = this.helper.url
		article.LinkHash = this.helper.linkHash
		article.Doc = document
		article.Title = extractor.getTitle(article)
		article.MetaLang = extractor.getMetaLanguage(article)
		article.MetaFavicon = extractor.getFavicon(article)
		article.MetaDescription = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)^description$]")
		article.MetaKeywords = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)^keywords$]")
		article.CanonicalLink = extractor.getCanonicalLink(article)
		article.Domain = extractor.getDomain(article)
		article.Tags = extractor.getTags(article)
		cleaner := NewCleaner(this.config)
		article.Doc = cleaner.clean(article)
		article.TopImage = OpenGraphResolver(article)
		if article.TopImage == "" {
			article.TopImage = WebPageResolver(article)
		}
		article.TopNode = extractor.calculateBestNode(article)
		if article.TopNode != nil {
			article.TopNode = extractor.postCleanup(article.TopNode)
			outputFormatter := new(outputFormatter)
			article.CleanedText = outputFormatter.getFormattedText(article)
			videoExtractor := NewVideoExtractor()
			article.Movies = videoExtractor.GetVideos(article)
		}
		stop := TimeInNanoseconds()
		article.Delta = stop - start
	} else {
		panic(err.Error())
	}
	return article
}