コード例 #1
0
ファイル: charset.go プロジェクト: jmptrader/goexample
// main demonstrates that a Go string is just a sequence of bytes: it decodes
// CP932 bytes to UTF-8 with charset.NewReader, inspects the results with the
// utf8 package, and finally translates UTF-8 text to windows-1252.
// Any conversion error is printed and the program exits with status 1.
func main() {
	// CP932 encoded version of "Hello, 世界". s is still an ordinary string,
	// which shows that the string type has no notion of a character set.
	s := "Hello, \x90\xA2\x8A\x45"

	// Convert from CP932 to UTF-8.
	r, err := charset.NewReader("CP932", strings.NewReader(s))
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	decoded, err := ioutil.ReadAll(r)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	s2 := string(decoded)
	fmt.Println(s2)                         // => Hello, 世界
	fmt.Println(len(s2))                    // => 13
	fmt.Println(utf8.RuneCountInString(s2)) // => 9
	fmt.Println(utf8.ValidString(s2))       // => true
	fmt.Println(utf8.ValidString(s))        // => false
	fmt.Printf("%T|%#v\n", s, s)            // note the difference between %v and %#v

	ss := "This is not utf-8 string \xa1"
	fmt.Println(utf8.ValidString(ss)) // => false

	pice := []int32{20, 30, 40, 90}
	// []int32 has the same underlying type as []rune, so this conversion
	// builds a new string holding the UTF-8 encoding of each code point.
	sss := string(pice)
	// %p accepts the slice directly, but a string is a plain value, so its
	// address has to be taken explicitly with &.
	fmt.Printf("%T:%p %T:%p:%d\n", pice, pice, sss, &sss, len(sss))

	// The charset name must be one the charset package knows about.
	tr, err := charset.TranslatorTo("windows-1252")
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	_, encoded, err := tr.Translate([]byte("utf-8汉字"), true)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	fmt.Println(encoded)
}
コード例 #2
0
ファイル: html.go プロジェクト: Clarifai/kubernetes
// HandleStringReader returns a response handler that lets f process the
// response body as UTF-8 text.
//
// When the response charset (ctx.Charset, defaulting to "utf-8") is not UTF-8,
// the body is decoded to UTF-8 before being handed to f, and whatever f
// returns is translated back to the original charset. When the body already
// is UTF-8, f receives it unchanged.
//
// The given function must close the reader r, in order to close the response
// body.
//
// If ctx.Error is set the handler returns nil; if the charset cannot be
// converted a warning is logged and the response is returned untouched.
func HandleStringReader(f func(r io.Reader, ctx *goproxy.ProxyCtx) io.Reader) goproxy.RespHandler {
	return goproxy.FuncRespHandler(func(resp *http.Response, ctx *goproxy.ProxyCtx) *http.Response {
		if ctx.Error != nil {
			return nil
		}
		charsetName := ctx.Charset()
		if charsetName == "" {
			charsetName = "utf-8"
		}

		if strings.EqualFold(charsetName, "utf-8") {
			// No translation is needed, the body is already UTF-8.
			resp.Body = &readFirstCloseBoth{ioutil.NopCloser(f(resp.Body, ctx)), resp.Body}
			return resp
		}

		// Decoder: original charset -> UTF-8, feeding f.
		r, err := charset.NewReader(charsetName, resp.Body)
		if err != nil {
			ctx.Warnf("Cannot convert from %v to utf-8: %v", charsetName, err)
			return resp
		}
		// Encoder: UTF-8 -> original charset, applied to the output of f.
		tr, err := charset.TranslatorTo(charsetName)
		if err != nil {
			ctx.Warnf("Can't translate to %v from utf-8: %v", charsetName, err)
			return resp
		}
		newr := charset.NewTranslatingReader(f(r, ctx), tr)
		resp.Body = &readFirstCloseBoth{ioutil.NopCloser(newr), resp.Body}
		return resp
	})
}
コード例 #3
0
// testCodepage checks that every byte value 0..255 survives a round trip
// through the named charset: the bytes are decoded with charset.NewReader and
// re-encoded with charset.NewWriter, and the output must equal the input.
//
// inReader and outReader wrap the raw input and the decoded stream
// respectively, so callers can exercise the conversion with different reader
// behaviours.
func testCodepage(t *testing.T, name string, inReader, outReader func(io.Reader) io.Reader) {
	data := make([]byte, 256)
	for i := range data {
		data[i] = byte(i)
	}
	inr := inReader(bytes.NewBuffer(data))
	r, err := charset.NewReader(name, inr)
	if err != nil {
		t.Fatalf("cannot make reader for charset %q: %v", name, err)
	}
	outr := outReader(r)
	r = outr

	var outbuf bytes.Buffer
	w, err := charset.NewWriter(name, &outbuf)
	if err != nil {
		t.Fatalf("cannot make writer  for charset %q: %v", name, err)
	}
	if _, err = io.Copy(w, r); err != nil {
		t.Fatalf("copy failed: %v", err)
	}
	// Close must be called so the writer emits any pending output.
	if err = w.Close(); err != nil {
		t.Fatalf("close failed: %v", err)
	}

	got := outbuf.Bytes()
	if len(got) != len(data) {
		t.Fatalf("short result of roundtrip, charset %q, readers %T, %T; expected 256, got %d", name, inr, outr, len(got))
	}
	for i, x := range got {
		if data[i] != x {
			// Report the expected and the actual byte, plus where they differ.
			t.Fatalf("charset %q, round trip at offset %d: expected %d, got %d", name, i, data[i], x)
		}
	}
}
コード例 #4
0
ファイル: article.go プロジェクト: kedorlaomer/loread
// decodeHeader decodes an RFC 2047 encoded word of the form
// "=?charset?encoding?text?=" and returns the text converted by the charset
// reader. TODO: duplication of code in FormatArticle?
//
// The encoding field selects quoted-printable ("Q") or base64 ("B"); any other
// value yields a "<<Couldn't decode ...>>" placeholder as text. A header with
// too few '?'-separated fields, an unknown charset, or a failing conversion
// returns a "<<Couldn't decode header ...>>" placeholder instead.
//
// It panics when the Q/B payload itself cannot be decoded.
func decodeHeader(header string) string {
	undecodable := "<<Couldn't decode header '" + header + "'>>"

	parts := strings.Split(header, "?")
	if len(parts) < 4 {
		// Not an encoded word; indexing parts[1..3] would panic.
		return undecodable
	}
	contentCharset, encoding, text := parts[1], parts[2], parts[3]

	var (
		raw []byte
		err error
	)
	switch strings.ToUpper(encoding) {
	case "Q": // quoted-printable
		raw, err = DecodeQuotedPrintable(text)
	case "B": // base64
		raw, err = base64.StdEncoding.DecodeString(text)
	default:
		raw = []byte(fmt.Sprintf("<<Couldn't decode '%s'>>", encoding))
	}
	if err != nil {
		panic(fmt.Sprintf("Fehler (?): %s bei Header: %s\n", err, header))
	}

	r, err := charset.NewReader(contentCharset, strings.NewReader(string(raw)))
	if err != nil {
		return undecodable
	}
	rv, err := ioutil.ReadAll(r)
	if err != nil {
		return undecodable
	}
	return string(rv)
}
コード例 #5
0
ファイル: noreload.go プロジェクト: tanaton/noreload2ch
// sjisToUtf8 runs data through a "cp932" charset reader and returns everything
// the reader produces. It returns a nil slice and the error when the reader
// cannot be created.
func sjisToUtf8(data []byte) ([]byte, error) {
	decoder, err := charset.NewReader("cp932", bytes.NewReader(data))
	if err != nil {
		return nil, err
	}
	return ioutil.ReadAll(decoder)
}
コード例 #6
0
ファイル: utils.go プロジェクト: ruslanfirefly/parsetoys
// encode_string reads str through a reader for the named charset, prints the
// converted text followed by a newline, and returns it.
//
// Both the reader-creation error and the read error are passed to error_log.
// If the reader cannot be created, an empty string is returned.
func encode_string(str string, charset string) string {
	r, err := goCharset.NewReader(charset, strings.NewReader(str))
	error_log(err)
	if err != nil {
		// NOTE(review): assumes error_log may return for a non-nil error; in
		// that case r must not be read from.
		return ""
	}
	result, err := ioutil.ReadAll(r)
	error_log(err)
	fmt.Printf("%s\n", result)
	return string(result)
}
コード例 #7
0
ファイル: wiener.go プロジェクト: tanaton/wiener2ch
// sjisToUtf8 feeds data through a "cp932" charset reader and returns the bytes
// it yields. When the reader cannot be constructed, ret is nil and err holds
// the construction error.
func sjisToUtf8(data []byte) (ret []byte, err error) {
	src := bytes.NewReader(data)
	conv, err := charset.NewReader("cp932", src)
	if err != nil {
		return nil, err
	}
	return ioutil.ReadAll(conv)
}
コード例 #8
0
ファイル: util.go プロジェクト: johan--/scrapegoat
// charsetReader returns a reader over resp.Body. If getCharset reports an
// error for the response headers, the body is returned as-is; otherwise the
// body is wrapped in a charset reader for the reported charset.
//
// It panics when no reader can be built for that charset.
// NOTE(review): the charset comes from the response, so a remote server can
// trigger this panic — consider whether callers expect that.
func charsetReader(resp *http.Response) io.Reader {
	name, err := getCharset(resp.Header)
	if err != nil {
		return resp.Body
	}

	decoded, convErr := charset.NewReader(name, resp.Body)
	if convErr == nil {
		return decoded
	}
	panic("Charset error " + name)
}
コード例 #9
0
// ExampleNewReader reads a latin1 encoded string through charset.NewReader
// and prints the converted text.
func ExampleNewReader() {
	src := strings.NewReader("\xa35 for Pepp\xe9")

	r, err := charset.NewReader("latin1", src)
	if err != nil {
		log.Fatal(err)
	}

	converted, err := ioutil.ReadAll(r)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%s\n", converted)
	// Output: £5 for Peppé
}
コード例 #10
0
ファイル: tcs.go プロジェクト: Bosh-for-Cpi/bosh-2605
// main implements the tcs command: with the list flag it lists charsets
// (optionally restricted to the one named on the command line); otherwise it
// copies the named file, or standard input, to standard output, translating
// from *fromCharset to *toCharset.
func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "usage: tcs [-l] [-v] [charset]\n")
		fmt.Fprintf(os.Stderr, "\ttcs [-f charset] [-t charset] [file]\n")
	}
	flag.Parse()
	if *listFlag {
		cs := ""
		switch flag.NArg() {
		case 1:
			cs = flag.Arg(0)
		case 0:
		default:
			flag.Usage()
		}
		listCharsets(*verboseFlag, cs)
		return
	}
	var f *os.File
	switch flag.NArg() {
	case 0:
		f = os.Stdin
	case 1:
		var err error
		f, err = os.Open(flag.Arg(0))
		if err != nil {
			fatalf("cannot open %q: %v", flag.Arg(0), err)
		}
	default:
		// More than one file is not supported; without this branch f would
		// stay nil and the copy below would fail obscurely.
		flag.Usage()
		os.Exit(2)
	}
	r, err := charset.NewReader(*fromCharset, f)
	if err != nil {
		fatalf("cannot translate from %q: %v", *fromCharset, err)
	}
	w, err := charset.NewWriter(*toCharset, os.Stdout)
	if err != nil {
		fatalf("cannot translate to %q: %v", *toCharset, err)
	}
	if _, err = io.Copy(w, r); err != nil {
		fatalf("%v", err)
	}
	// Close the translating writer so any pending output is written out.
	if err = w.Close(); err != nil {
		fatalf("%v", err)
	}
}
コード例 #11
0
ファイル: document.go プロジェクト: andradeandrey/go-pkg-xmlx
// LoadStream loads the contents of this document from the supplied reader.
//
// The XML token stream is turned into a tree of nodes rooted at this.Root.
// Non-UTF-8 input is handled by handing the declared encoding to
// charset.NewReader. The standalone value of the <?xml ...?> declaration, if
// present and properly quoted, is stored in this.StandAlone.
//
// It returns nil at end of input (or once the element stack unwinds past the
// root) and the decoder's error otherwise.
func (this *Document) LoadStream(r io.Reader) (err error) {
	xp := xml.NewDecoder(r)
	xp.Entity = this.Entity
	xp.CharsetReader = func(enc string, input io.Reader) (io.Reader, error) {
		return charset.NewReader(enc, input)
	}

	this.Root = NewNode(NT_ROOT)
	// ct is the node currently being filled; it moves down on every start
	// element and back up on every end element.
	ct := this.Root

	const standaloneKey = `standalone="`

	for {
		var tok xml.Token
		if tok, err = xp.Token(); err != nil {
			if err == io.EOF {
				return nil
			}
			return err
		}

		switch tt := tok.(type) {
		case xml.SyntaxError:
			return errors.New(tt.Error())
		case xml.CharData:
			// Each text token replaces the node's previous value.
			ct.Value = strings.TrimSpace(string(tt))
		case xml.Comment:
			t := NewNode(NT_COMMENT)
			t.Value = strings.TrimSpace(string(tt))
			ct.AddChild(t)
		case xml.Directive:
			t := NewNode(NT_DIRECTIVE)
			t.Value = strings.TrimSpace(string(tt))
			ct.AddChild(t)
		case xml.StartElement:
			t := NewNode(NT_ELEMENT)
			t.Name = tt.Name
			t.Attributes = make([]*Attr, len(tt.Attr))
			for i, v := range tt.Attr {
				t.Attributes[i] = new(Attr)
				t.Attributes[i].Name = v.Name
				t.Attributes[i].Value = v.Value
			}
			ct.AddChild(t)
			ct = t
		case xml.ProcInst:
			if tt.Target == "xml" { // xml doctype
				doctype := strings.TrimSpace(string(tt.Inst))
				if i := strings.Index(doctype, standaloneKey); i > -1 {
					rest := doctype[i+len(standaloneKey):]
					// Only accept a properly terminated value; slicing with a
					// missing closing quote would panic.
					if end := strings.Index(rest, `"`); end > -1 {
						this.StandAlone = rest[:end]
					}
				}
			} else {
				t := NewNode(NT_PROCINST)
				t.Target = strings.TrimSpace(tt.Target)
				t.Value = strings.TrimSpace(string(tt.Inst))
				ct.AddChild(t)
			}
		case xml.EndElement:
			if ct = ct.Parent; ct == nil {
				return nil
			}
		}
	}
}
コード例 #12
0
ファイル: crawler.go プロジェクト: minond/GoOse
// Crawl fetches the raw HTML for this crawler, parses it and returns the
// extracted Article. An empty Article is returned when no HTML is available.
//
// If the page declares a non-UTF-8 charset in a
// <meta http-equiv="Content-Type" content="text/html; charset=..."> tag, the
// raw HTML is converted to UTF-8 and parsed again before extraction. When the
// charset is unknown the HTML is discarded and an empty document is used.
//
// It panics if the HTML cannot be parsed.
func (this Crawler) Crawl() *Article {

	article := new(Article)
	this.assignParseCandidate()
	this.assignHtml()

	if this.rawHtml == "" {
		return article
	}

	document, err := goquery.NewDocumentFromReader(strings.NewReader(this.rawHtml))
	if err != nil {
		panic(err.Error())
	}

	// Capture the content attribute of the matching meta tag inside the
	// callback: EachWithBreak returns the whole selection, not the element
	// the iteration stopped at.
	contentType := ""
	document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool {
		if equiv, ok := s.Attr("http-equiv"); ok && equiv == "Content-Type" {
			contentType, _ = s.Attr("content")
			return false
		}
		return true
	})
	contentType = strings.Replace(contentType, " ", "", -1)

	if strings.HasPrefix(contentType, "text/html;charset=") {
		cs := strings.ToLower(strings.TrimPrefix(contentType, "text/html;charset="))

		if cs != "utf-8" {
			converted := ""
			// On error, skip the read and continue with empty HTML.
			if r, convErr := charset.NewReader(cs, strings.NewReader(this.rawHtml)); convErr == nil {
				// Best effort: keep whatever could be decoded.
				decoded, _ := ioutil.ReadAll(r)
				converted = string(decoded)
			}
			this.rawHtml = converted

			// Assign to the outer err so a failed re-parse is not lost.
			document, err = goquery.NewDocumentFromReader(strings.NewReader(this.rawHtml))
			if err != nil {
				panic(err.Error())
			}
		}
	}

	extractor := NewExtractor(this.config)
	html, _ := document.Html()
	start := TimeInNanoseconds()
	article.RawHtml = html
	article.FinalUrl = this.helper.url
	article.LinkHash = this.helper.linkHash
	article.Doc = document
	article.Title = extractor.getTitle(article)
	article.MetaLang = extractor.getMetaLanguage(article)
	article.MetaFavicon = extractor.getFavicon(article)

	article.MetaDescription = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)description]")
	article.MetaKeywords = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)keywords]")
	article.CanonicalLink = extractor.getCanonicalLink(article)
	article.Domain = extractor.getDomain(article)
	article.Tags = extractor.getTags(article)

	cleaner := NewCleaner(this.config)
	article.Doc = cleaner.clean(article)

	article.TopImage = OpenGraphResolver(article)
	if article.TopImage == "" {
		article.TopImage = WebPageResolver(article)
	}
	article.TopNode = extractor.calculateBestNode(article)
	if article.TopNode != nil {
		article.TopNode = extractor.postCleanup(article.TopNode)

		formatter := new(outputFormatter)
		article.CleanedText = formatter.getFormattedText(article)

		videoExtractor := NewVideoExtractor()
		article.Movies = videoExtractor.GetVideos(article)
	}

	// Delta is the time spent on extraction, taken from TimeInNanoseconds.
	article.Delta = TimeInNanoseconds() - start

	return article
}
コード例 #13
0
ファイル: reader.go プロジェクト: ivan1993spb/ru-supplier
// ReadOrders parses the orders contained in the body of resp.
//
// The body is read through a charset reader for _STREAM_CHARSET. The first
// line is skipped; the second line (the "newest chunk") is compared with the
// chunk remembered in p.HashStore for the request URL, and when the stored
// value equals the MD5 sum of that line the feed is considered unchanged and
// (nil, nil) is returned. Otherwise the newest chunk is stored and every
// remaining line is handed to ParseOrder. Lines that fail to parse are logged
// and skipped.
//
// It panics when resp is nil and returns an error for a non-200 status or a
// read failure (together with the orders parsed so far in the latter case).
func (p *OrderReader) ReadOrders(resp *http.Response) (
	[]*Order, error) {

	if resp == nil {
		panic("ReadOrders(): passed nil response")
	}
	if resp.StatusCode != http.StatusOK {
		return nil, errors.New("Server return status " + resp.Status)
	}

	decoded, err := charset.NewReader(_STREAM_CHARSET, resp.Body)
	if err != nil {
		return nil, err
	}
	lines := bufio.NewReaderSize(decoded, _BUFFER_SIZE)

	// The first line only carries the topics; drop it.
	_, err = lines.ReadString('\n')
	switch {
	case err == io.EOF:
		return nil, nil
	case err != nil:
		return nil, errors.New("Skip first line err: " + err.Error())
	}

	// The next line is the newest chunk of the stream. It is checked against
	// the stored chunk now and becomes the stored chunk for the next call.
	newest, err := lines.ReadBytes('\n')
	if err != nil && err != io.EOF {
		return nil, err
	}
	if err == io.EOF && len(newest) == 0 {
		// No orders at all: empty result.
		return nil, nil
	}
	// Drop the trailing delimiter, if present.
	newest = bytes.TrimSuffix(newest, []byte{'\n'})

	// The stored chunk is looked up by request URL.
	rawurl := resp.Request.URL.String()
	stored, known := p.HashStore.GetHashChunk(rawurl)
	if known {
		sum := md5.Sum(newest)
		if bytes.Equal(stored, sum[:]) {
			// Feed was not updated.
			return nil, nil
		}
	}

	// Remember the newest chunk.
	p.HashStore.SetHashChunk(rawurl, newest)
	if saveErr := p.HashStore.Save(); saveErr != nil {
		log.Println("Can't save cache:", saveErr)
	}

	var orders []*Order
	// collect parses one row and appends the order, logging parse failures.
	collect := func(row []byte) {
		order, parseErr := ParseOrder(row)
		if parseErr != nil {
			log.Println("Parsing order error:", parseErr)
			return
		}
		orders = append(orders, order)
	}
	collect(newest)

	// With a stored chunk available, read through a cache reader built from
	// the remaining stream, the line delimiter and that chunk.
	if known {
		lines = bufio.NewReader(NewCacheReader(
			lines, []byte{'\n'}, stored,
		))
	}

	for {
		row, readErr := lines.ReadBytes('\n')
		atEOF := readErr == io.EOF
		if readErr != nil && !atEOF {
			return orders, readErr
		}
		if atEOF && len(row) == 0 {
			break
		}
		collect(row)
		if atEOF {
			break
		}
	}

	return orders, nil
}
コード例 #14
0
ファイル: article.go プロジェクト: kedorlaomer/loread
// FormatArticle separates body and headers of a raw article, determines
// subject, references, message id and date, and deals with encoding and
// charset issues.
//
// Headers are split from the body at the first blank line; folded
// (multi-line) headers are joined. Subject and From are run through
// decodeHeader when they start with "=?" (RFC 2047). The body is decoded
// according to Content-Transfer-Encoding (base64 or quoted-printable) and then
// converted from the charset named in Content-Type unless that charset
// normalises to "utf8". An unknown charset leaves the body unconverted and is
// logged.
//
// It panics when the body cannot be decoded with its declared transfer
// encoding.
func FormatArticle(article RawArticle) ParsedArticle {
	rawHeaders, body := firstAndRest(string(article), "\n\n")
	body = TrimWhite(body)

	// every element is one header line
	var joinedHeaders []string

	buf := ""

	// some headers are multiline (see RFC 3977, 3.6, "folded")
	for _, line := range strings.Split(rawHeaders, "\n") {
		if line == "" {
			// nothing to join; indexing line[0] would panic
			continue
		}
		// a line that does not start with white space begins a new header
		if line[0] != '\t' && line[0] != ' ' && len(buf) > 0 {
			joinedHeaders = append(joinedHeaders, TrimWhite(buf))
			buf = ""
		}

		buf = buf + line + "\n"
	}
	// flush the last header, which no following line terminates
	if len(buf) > 0 {
		joinedHeaders = append(joinedHeaders, TrimWhite(buf))
	}

	// all headers
	headers := make(map[string]string)

	for _, headerLine := range joinedHeaders {
		key, value := firstAndRest(headerLine, ": ")
		key = http.CanonicalHeaderKey(key)
		headers[key] = value
	}

	/*
	 * some important headers
	 */

	// References, In-Reply-To
	references := headers["References"]
	inReplyTo := headers["In-Reply-To"]
	rawRefs := references + " " + inReplyTo

	if references != "" && inReplyTo != "" {
		first := ""
		// take first that looks like a message id
		for _, ref := range SplitByWhite(inReplyTo) {
			if looksLikedMessageId(ref) {
				first = ref
				break
			}
		}

		rawRefs = references + " " + first
	}

	delete(headers, "References")
	delete(headers, "In-Reply-To")
	refs := make([]MessageId, 0)

	for _, ref := range SplitByWhite(rawRefs) {
		if ref != "" {
			refs = append(refs, MessageId(TrimWhite(ref)))
		}
	}

	// Subject
	subj := headers["Subject"]
	// base64 or quoted-printable encoded; see RFC 2047
	if strings.HasPrefix(subj, "=?") {
		subj = decodeHeader(subj)
	}
	delete(headers, "Subject")

	// "From" is not important, but nevertheless we have to deal
	// with it, since it may be base64 or quoted-printable
	// encoded
	if from := headers["From"]; strings.HasPrefix(from, "=?") {
		headers["From"] = decodeHeader(from)
	}

	// Id
	msgId := headers["Message-Id"]
	delete(headers, "Message-Id")

	/*
	 * encoding/charset issues
	 */

	// Content-Transfer-Encoding
	var err error
	var decoded []byte

	switch headers["Content-Transfer-Encoding"] {
	case "base64":
		decoded, err = base64.StdEncoding.DecodeString(body)

	case "quoted-printable":
		decoded, err = DecodeQuotedPrintable(body)

	default:
		// 7bit, 8bit, other unknown types or none: use the body as it is
		decoded = []byte(body)
	}

	if err != nil {
		panic(fmt.Sprintf("Fehler (?): %s bei Id: %s und Inhalt '%s'\n", err, msgId, body))
	}

	// determine encoding ("charset") from Content-Type
	contentType := headers["Content-Type"]
	contentCharset := "UTF-8" // default charset

	// contentType looks like "text/plain; charset=UTF-8"
	for _, entry := range strings.Split(contentType, ";") {
		entry = TrimWhite(entry)
		if !strings.Contains(entry, "charset") {
			continue
		}
		i := strings.Index(entry, "=")
		if i < 0 {
			continue
		}
		// maybe the charset is specified with "quotes"; an empty value
		// keeps the default
		if cs := strings.Trim(entry[i+1:], `"`); cs != "" {
			contentCharset = cs
		}
		break
	}

	// apply contentCharset
	if normaliseCharset(contentCharset) != "utf8" {
		r, err := charset.NewReader(contentCharset, strings.NewReader(string(decoded)))
		if err != nil {
			// unknown encoding: keep the bytes unconverted
			log.Printf("encoding error: %s", err)
		} else {
			converted, err := ioutil.ReadAll(r)
			if err != nil {
				// keep whatever could be converted
				log.Printf("encoding error: %s", err)
			}
			decoded = converted
		}
	}
	body = string(decoded)

	// aTime stays the zero time when there is no Date header or no layout
	// matches.
	var aTime time.Time
	if date, ok := headers["Date"]; ok {
		// we found all these date formats in our corpus,
		// containing 40000+ messages from comp.lang.forth
		// comp.lang.lisp, comp.lang.haskell and
		// rec.games.abstract
		layouts := []string{
			"Mon, 2 Jan 2006 15:04:05 -0700 (MST)",
			"Mon, 2 Jan 2006 15:04:05 -0700",
			"Mon, 2 Jan 2006 15:04:05 MST",
			"Mon, 2 Jan 2006 15:04:05 -0700 (MST-07:00)",
			"2 Jan 2006 15:04:05 -0700",
			"2 Jan 2006 15:04:05 MST",
			"Mon, 2 Jan 2006 15:04 -0700",
		}

		for _, layout := range layouts {
			aTime, err = time.Parse(layout, date)
			if err == nil {
				break
			}
		}
	}

	return ParsedArticle{
		References:   refs,
		Subject:      subj,
		OtherHeaders: headers,
		Id:           MessageId(msgId),
		Body:         body,
		Date:         aTime,
	}
}
コード例 #15
0
ファイル: main.go プロジェクト: stroborobo/csv
// main reads CSV data from the file named on the command line (or standard
// input), optionally converting it from the -e input encoding, and prints it
// as a table whose columns are padded to the widest value and separated by
// the -s string. Output is converted to the -o encoding unless that is
// "UTF-8".
//
// Exit codes: 5 bad separator, 10 cannot open file, 20 bad input encoding,
// 30 CSV read error, 40 output encoding error, 0 otherwise (including empty
// input).
func main() {
	var fileEncoding,
		outputEncoding,
		parseSeperator,
		printSeperator string
	var debug bool
	flag.StringVar(&fileEncoding, "e", "", "input encoding, e.g. latin9, defaults to UTF-8")
	flag.StringVar(&outputEncoding, "o", "", "output encoding, e.g. latin9, defaults to LC_ALL/LANG or UTF-8")
	flag.StringVar(&parseSeperator, "c", ";", "seperator char used for parsing")
	flag.StringVar(&printSeperator, "s", "|", "seperator string used for printing")
	flag.BoolVar(&debug, "d", false, "debug output")
	// TODO
	//var alignRight bool
	//flag.BoolVar(&alignRight, "r", false, "align values to the right instead to the left")

	flag.Parse()

	// Exactly one rune is required; an empty separator would otherwise be
	// handed to the CSV reader as an invalid rune.
	if utf8.RuneCountInString(parseSeperator) != 1 {
		fmt.Fprintln(os.Stderr, "The parse seperator must be a single char.")
		flag.Usage()
		os.Exit(5)
	}

	if outputEncoding == "" {
		outputEncoding = getOutputEnc()
	}

	f := os.Stdin
	if flag.NArg() != 0 {
		opened, err := os.Open(flag.Arg(0))
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(10)
		}
		f = opened
	}

	var inputReader io.Reader = f
	if fileEncoding != "" {
		decoded, err := charset.NewReader(fileEncoding, f)
		if err != nil {
			fmt.Fprintf(os.Stderr, "input encoding: %s\n", err)
			os.Exit(20)
		}
		inputReader = decoded
	}

	r := csv.NewReader(inputReader)
	r.Comma, _ = utf8.DecodeLastRuneInString(parseSeperator)
	r.TrailingComma = true
	r.TrimLeadingSpace = true
	r.LazyQuotes = true

	data, err := r.ReadAll()
	// Close the input only if this function opened it, regardless of how
	// many flags were given.
	if f != os.Stdin {
		f.Close()
	}
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(30)
	}
	if len(data) == 0 || len(data[0]) == 0 {
		os.Exit(0)
	}

	if debug {
		fmt.Fprintf(os.Stderr, "DEBUG columns: %d\n", len(data[0]))
	}

	// colLens maps a column index to the widest value (in runes) seen in it.
	colLens := make(map[int]int)
	for ri, row := range data {
		for ci, col := range row {
			col = strings.Trim(col, " \t")
			data[ri][ci] = col
			if cl := utf8.RuneCountInString(col); cl > colLens[ci] {
				colLens[ci] = cl
			}
		}
	}

	var out io.Writer = os.Stdout
	var encoder io.Closer // set when output goes through a charset writer
	if outputEncoding != "UTF-8" {
		w, err := charset.NewWriter(outputEncoding, os.Stdout)
		if err != nil {
			fmt.Fprintf(os.Stderr, "output encoding: %s\n", err)
			os.Exit(40)
		}
		out, encoder = w, w
	}

	for _, row := range data {
		for i, col := range row {
			// left-align, padded to the column width plus one space
			fmt.Fprintf(out, "%-*s", colLens[i]+1, col)
			if i != len(colLens)-1 {
				fmt.Fprintf(out, "%s ", printSeperator)
			}
		}
		fmt.Fprint(out, "\n")
	}

	// Close the charset writer so any pending output is written out.
	if encoder != nil {
		if err := encoder.Close(); err != nil {
			fmt.Fprintf(os.Stderr, "output encoding: %s\n", err)
			os.Exit(40)
		}
	}
}
コード例 #16
0
ファイル: scraper.go プロジェクト: oyiptong/dmozscrape
// fetchPage downloads url and returns the response body as a string. It
// returns "" when the request fails, the status is not 200, or no usable
// encoding is found.
//
// The charset is taken from the Content-Type header. A "utf-8" body is used
// directly; a charset listed in chartypeSet is converted with a charset
// reader; anything else is guessed from the body with chardet and converted
// when the guess is listed in chartypeSet.
func fetchPage(url string) string {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		// A malformed URL yields no request; using it would panic.
		log.Println("HTTP_ERROR:", err)
		return ""
	}
	req.Header.Set("User-Agent", scraperConfig.UserAgent)

	httpClient := http.Client{
		Transport: &http.Transport{
			Dial:              timeoutDialler(time.Duration(10 * time.Second)),
			DisableKeepAlives: true,
		},
	}

	resp, err := httpClient.Do(req)
	if err != nil {
		log.Println("HTTP_ERROR:", err)
		return ""
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return ""
	}

	// dataStream stays nil when no conversion could be set up.
	var dataStream io.Reader

	switch charType := fetchCharset(resp.Header.Get("Content-Type")); {

	case charType == "utf-8":
		dataStream = resp.Body

	case chartypeSet[charType]:
		// charset in available list for conversion
		charsetStream, err := charset.NewReader(charType, resp.Body)
		if err != nil {
			log.Println("ENCODING_ERROR:", err)
		} else {
			dataStream = charsetStream
		}

	default:
		// need to guess chartype
		bodyBytes, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			// detection continues with whatever was read
			log.Println("IO_ERROR:", err)
		}

		detector := chardet.NewHtmlDetector()
		result, err := detector.DetectBest(bodyBytes)
		if err != nil {
			log.Println("ENCODING_ERROR no_known_encoding", url)
			return ""
		}

		// NOTE(review): a detected charset that is not in chartypeSet
		// (possibly including "utf-8" — verify the set's contents) leaves
		// dataStream nil and the page is dropped.
		charType = strings.ToLower(result.Charset)
		if chartypeSet[charType] {
			dataStream = bytes.NewReader(bodyBytes)
			charsetStream, err := charset.NewReader(charType, dataStream)
			if err != nil {
				log.Println("ENCODING_ERROR:", err)
			} else {
				dataStream = charsetStream
			}
		}
	}

	if dataStream == nil {
		log.Println("ENCODING_ERROR: no suitable encoding found for", url)
		return ""
	}

	bodyBytes, err := ioutil.ReadAll(dataStream)
	if err != nil {
		// return whatever could be read
		log.Println("ERROR:", err)
	}
	return string(bodyBytes)
}