func testCodepage(t *testing.T, name string, inReader, outReader func(io.Reader) io.Reader) {
	data := make([]byte, 256)
	for i := range data {
		data[i] = byte(i)
	}
	inr := inReader(bytes.NewBuffer(data))
	r, err := charset.NewReader(name, inr)
	if err != nil {
		t.Fatalf("cannot make reader for charset %q: %v", name, err)
	}
	outr := outReader(r)
	r = outr

	var outbuf bytes.Buffer
	w, err := charset.NewWriter(name, &outbuf)
	if err != nil {
		t.Fatalf("cannot make writer  for charset %q: %v", name, err)
	}
	_, err = io.Copy(w, r)
	if err != nil {
		t.Fatalf("copy failed: %v", err)
	}
	err = w.Close()
	if err != nil {
		t.Fatalf("close failed: %v", err)
	}
	if len(outbuf.Bytes()) != len(data) {
		t.Fatalf("short result of roundtrip, charset %q, readers %T, %T; expected 256, got %d", name, inr, outr, len(outbuf.Bytes()))
	}
	for i, x := range outbuf.Bytes() {
		if data[i] != x {
			t.Fatalf("charset %q, round trip expected %d, got %d", name, i, data[i])
		}
	}
}
Beispiel #2
0
// Will receive an input stream which would convert the response to utf-8
// The given function must close the reader r, in order to close the response body.
func HandleStringReader(f func(r io.Reader, ctx *goproxy.ProxyCtx) io.Reader) goproxy.RespHandler {
	return goproxy.FuncRespHandler(func(resp *http.Response, ctx *goproxy.ProxyCtx) *http.Response {
		if ctx.Error != nil {
			return nil
		}
		charsetName := ctx.Charset()
		if charsetName == "" {
			charsetName = "utf-8"
		}

		if strings.ToLower(charsetName) != "utf-8" {
			r, err := charset.NewReader(charsetName, resp.Body)
			if err != nil {
				ctx.Warnf("Cannot convert from %v to utf-8: %v", charsetName, err)
				return resp
			}
			tr, err := charset.TranslatorTo(charsetName)
			if err != nil {
				ctx.Warnf("Can't translate to %v from utf-8: %v", charsetName, err)
				return resp
			}
			if err != nil {
				ctx.Warnf("Cannot translate to %v: %v", charsetName, err)
				return resp
			}
			newr := charset.NewTranslatingReader(f(r, ctx), tr)
			resp.Body = &readFirstCloseBoth{ioutil.NopCloser(newr), resp.Body}
		} else {
			//no translation is needed, already at utf-8
			resp.Body = &readFirstCloseBoth{ioutil.NopCloser(f(resp.Body, ctx)), resp.Body}
		}
		return resp
	})
}
func ExampleNewReader() {
	r, err := charset.NewReader("latin1", strings.NewReader("\xa35 for Pepp\xe9"))
	if err != nil {
		log.Fatal(err)
	}
	result, err := ioutil.ReadAll(r)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s\n", result)
	// Output: £5 for Peppé
}
func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "usage: tcs [-l] [-v] [charset]\n")
		fmt.Fprintf(os.Stderr, "\ttcs [-f charset] [-t charset] [file]\n")
	}
	flag.Parse()
	if *listFlag {
		cs := ""
		switch flag.NArg() {
		case 1:
			cs = flag.Arg(0)
		case 0:
		default:
			flag.Usage()
		}
		listCharsets(*verboseFlag, cs)
		return
	}
	var f *os.File
	switch flag.NArg() {
	case 0:
		f = os.Stdin
	case 1:
		var err error
		f, err = os.Open(flag.Arg(0))
		if err != nil {
			fatalf("cannot open %q: %v", err)
		}
	}
	r, err := charset.NewReader(*fromCharset, f)
	if err != nil {
		fatalf("cannot translate from %q: %v", *fromCharset, err)
	}
	w, err := charset.NewWriter(*toCharset, os.Stdout)
	if err != nil {
		fatalf("cannot translate to %q: ", err)
	}
	_, err = io.Copy(w, r)
	if err != nil {
		fatalf("%v", err)
	}
}
Beispiel #5
0
// Crawl fetches the HTML body and returns an Article
func (c Crawler) Crawl() (*Article, error) {

	article := new(Article)
	c.assignParseCandidate()
	err := c.assignHTML()
	if err != nil {
		return nil, err
	}

	// This was supposed to have been set by assignHTML, so something went wrong
	if c.RawHTML == "" {
		return article, nil
	}

	reader := strings.NewReader(c.RawHTML)
	document, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		return nil, err
	}

	selection := document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool {
		attr, exists := s.Attr("http-equiv")
		if exists && attr == "Content-Type" {
			return false
		}
		return true
	})

	if selection != nil {
		attr, _ := selection.Attr("content")
		attr = strings.Replace(attr, " ", "", -1)

		if strings.HasPrefix(attr, "text/html;charset=") {
			cs := strings.TrimPrefix(attr, "text/html;charset=")
			cs = strings.ToLower(cs)

			if cs != "utf-8" {
				r, err1 := charset.NewReader(cs, strings.NewReader(c.RawHTML))
				if err1 != nil {
					// On error, skip the read
					c.RawHTML = ""
				} else {
					utf8, _ := ioutil.ReadAll(r)
					c.RawHTML = string(utf8)
				}
				reader = strings.NewReader(c.RawHTML)
				document, err = goquery.NewDocumentFromReader(reader)
			}
		}
	}

	if err != nil {
		return nil, err
	}

	extractor := NewExtractor(c.config)
	html, err := document.Html()
	if err != nil {
		return nil, err
	}

	startTime := time.Now().UnixNano()
	article.RawHTML = html
	article.FinalURL = c.helper.url
	article.LinkHash = c.helper.linkHash
	article.Doc = document
	article.Title = extractor.getTitle(article)
	article.MetaLang = extractor.getMetaLanguage(article)
	article.MetaFavicon = extractor.getFavicon(article)

	article.MetaDescription = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)^description$]")
	article.MetaKeywords = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)^keywords$]")
	article.CanonicalLink = extractor.getCanonicalLink(article)
	article.Domain = extractor.getDomain(article)
	article.Tags = extractor.getTags(article)

	cleaner := NewCleaner(c.config)
	article.Doc = cleaner.clean(article)

	article.TopImage = OpenGraphResolver(article)
	if article.TopImage == "" {
		article.TopImage = WebPageResolver(article)
	}
	article.TopNode = extractor.calculateBestNode(article)
	if article.TopNode != nil {
		article.TopNode = extractor.postCleanup(article.TopNode)

		outputFormatter := new(outputFormatter)
		article.CleanedText, article.Links = outputFormatter.getFormattedText(article)

		videoExtractor := NewVideoExtractor()
		article.Movies = videoExtractor.GetVideos(article)
	}

	article.Delta = time.Now().UnixNano() - startTime

	return article, nil
}
Beispiel #6
0
func (this Crawler) Crawl() *Article {

	article := new(Article)
	this.assignParseCandidate()
	this.assignHtml()

	if this.rawHtml == "" {
		return article
	}

	reader := strings.NewReader(this.rawHtml)
	document, err := goquery.NewDocumentFromReader(reader)

	if err != nil {
		panic(err.Error())
	}

	attr := ""
	selection := document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool {
		attr, exists := s.Attr("http-equiv")
		if exists && attr == "Content-Type" {
			return false
		}
		return true
	})

	if selection != nil {
		attr, _ = selection.Attr("content")
		attr = strings.Replace(attr, " ", "", -1)

		if strings.HasPrefix(attr, "text/html;charset=") {
			cs := strings.TrimPrefix(attr, "text/html;charset=")
			cs = strings.ToLower(cs)

			if cs != "utf-8" {
				r, err := charset.NewReader(cs, strings.NewReader(this.rawHtml))
				if err != nil {
					// On error, skip the read
					this.rawHtml = ""
				} else {
					utf8, _ := ioutil.ReadAll(r)
					this.rawHtml = string(utf8)
				}
				reader = strings.NewReader(this.rawHtml)
				document, err = goquery.NewDocumentFromReader(reader)
			}
		}
	}

	if err == nil {
		extractor := NewExtractor(this.config)
		html, _ := document.Html()
		start := TimeInNanoseconds()
		article.RawHtml = html
		article.FinalUrl = this.helper.url
		article.LinkHash = this.helper.linkHash
		article.Doc = document
		article.Title = extractor.getTitle(article)
		article.MetaLang = extractor.getMetaLanguage(article)
		article.MetaFavicon = extractor.getFavicon(article)

		article.MetaDescription = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)^description$]")
		article.MetaKeywords = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)^keywords$]")
		article.CanonicalLink = extractor.getCanonicalLink(article)
		article.Domain = extractor.getDomain(article)
		article.Tags = extractor.getTags(article)

		cleaner := NewCleaner(this.config)
		article.Doc = cleaner.clean(article)

		article.TopImage = OpenGraphResolver(article)
		if article.TopImage == "" {
			article.TopImage = WebPageResolver(article)
		}
		article.TopNode = extractor.calculateBestNode(article)
		if article.TopNode != nil {
			article.TopNode = extractor.postCleanup(article.TopNode)

			outputFormatter := new(outputFormatter)
			article.CleanedText = outputFormatter.getFormattedText(article)

			videoExtractor := NewVideoExtractor()
			article.Movies = videoExtractor.GetVideos(article)
		}

		stop := TimeInNanoseconds()
		delta := stop - start
		article.Delta = delta

	} else {
		panic(err.Error())
	}
	return article
}