func main() { s := "Hello, \x90\xA2\x8A\x45" // CP932 encoded version of "Hello, 世界" , 这里的 s 是string类型,说明string没有字符集的概念 r, _ := charset.NewReader("CP932", strings.NewReader(s)) // convert from CP932 to UTF-8 s2_, _ := ioutil.ReadAll(r) s2 := string(s2_) fmt.Println(s2) // => Hello, 世界 fmt.Println(len(s2)) // => 13 fmt.Println(utf8.RuneCountInString(s2)) // => 9 fmt.Println(utf8.ValidString(s2)) // => true fmt.Println(utf8.ValidString(s)) // => false fmt.Printf("%T|%#v\n", s, s) // 注意 %v 与 %#v 的区别 ss := "This is not utf-8 string \xa1" fmt.Println(utf8.ValidString(ss)) // => false pice := []int32{20, 30, 40, 90} sss := string(pice) // string 似乎执行了内存拷贝,但是不会涉及到字符集的处理(转换或校验) fmt.Printf("%T:%p %T:%p:%d\n", pice, pice, sss, &sss, len(sss)) // 为什么打印字符串变量的地址还需要取地址符 tr, err := charset.TranslatorTo("windows-1252") //需要检查字符集列表 if err != nil { fmt.Println(err) os.Exit(1) } _, gbk, err2 := tr.Translate([]byte("utf-8汉字"), true) if err2 != nil { fmt.Println(err2) os.Exit(1) } fmt.Println(gbk) }
// Will receive an input stream which would convert the response to utf-8 // The given function must close the reader r, in order to close the response body. func HandleStringReader(f func(r io.Reader, ctx *goproxy.ProxyCtx) io.Reader) goproxy.RespHandler { return goproxy.FuncRespHandler(func(resp *http.Response, ctx *goproxy.ProxyCtx) *http.Response { if ctx.Error != nil { return nil } charsetName := ctx.Charset() if charsetName == "" { charsetName = "utf-8" } if strings.ToLower(charsetName) != "utf-8" { r, err := charset.NewReader(charsetName, resp.Body) if err != nil { ctx.Warnf("Cannot convert from %v to utf-8: %v", charsetName, err) return resp } tr, err := charset.TranslatorTo(charsetName) if err != nil { ctx.Warnf("Can't translate to %v from utf-8: %v", charsetName, err) return resp } if err != nil { ctx.Warnf("Cannot translate to %v: %v", charsetName, err) return resp } newr := charset.NewTranslatingReader(f(r, ctx), tr) resp.Body = &readFirstCloseBoth{ioutil.NopCloser(newr), resp.Body} } else { //no translation is needed, already at utf-8 resp.Body = &readFirstCloseBoth{ioutil.NopCloser(f(resp.Body, ctx)), resp.Body} } return resp }) }
func testCodepage(t *testing.T, name string, inReader, outReader func(io.Reader) io.Reader) { data := make([]byte, 256) for i := range data { data[i] = byte(i) } inr := inReader(bytes.NewBuffer(data)) r, err := charset.NewReader(name, inr) if err != nil { t.Fatalf("cannot make reader for charset %q: %v", name, err) } outr := outReader(r) r = outr var outbuf bytes.Buffer w, err := charset.NewWriter(name, &outbuf) if err != nil { t.Fatalf("cannot make writer for charset %q: %v", name, err) } _, err = io.Copy(w, r) if err != nil { t.Fatalf("copy failed: %v", err) } err = w.Close() if err != nil { t.Fatalf("close failed: %v", err) } if len(outbuf.Bytes()) != len(data) { t.Fatalf("short result of roundtrip, charset %q, readers %T, %T; expected 256, got %d", name, inr, outr, len(outbuf.Bytes())) } for i, x := range outbuf.Bytes() { if data[i] != x { t.Fatalf("charset %q, round trip expected %d, got %d", name, i, data[i]) } } }
// see RFC 2047; TODO: duplication of code in FormatArticle? func decodeHeader(header string) string { parts := strings.Split(header, "?") contentCharset, encoding, text := parts[1], parts[2], parts[3] err := error(nil) bytes := []byte{} switch strings.ToUpper(encoding) { // quoted-printable case "Q": bytes, err = DecodeQuotedPrintable(text) case "B": bytes, err = base64.StdEncoding.DecodeString(text) default: bytes = []byte(fmt.Sprintf("<<Couldn't decode '%s'>>", encoding)) } if err != nil { panic(fmt.Sprintf("Fehler (?): %s bei Header: %s\n", header)) } r, err := charset.NewReader(contentCharset, strings.NewReader(string(bytes))) if err != nil { return "<<Couldn't decode header '" + header + "'>>" } rv, _ := ioutil.ReadAll(r) return string(rv) }
func sjisToUtf8(data []byte) ([]byte, error) { r, err := charset.NewReader("cp932", bytes.NewReader(data)) if err != nil { return nil, err } result, err := ioutil.ReadAll(r) return result, err }
func encode_string(str string, charset string) string { r, err := goCharset.NewReader(charset, strings.NewReader(str)) error_log(err) result, err := ioutil.ReadAll(r) error_log(err) fmt.Printf("%s\n", result) return string(result) }
func sjisToUtf8(data []byte) (ret []byte, err error) { r, e := charset.NewReader("cp932", bytes.NewReader(data)) if e == nil { ret, err = ioutil.ReadAll(r) } else { err = e } return }
// Wraps the reader in charset.Reader if charset header found in resp func charsetReader(resp *http.Response) io.Reader { cs, err := getCharset(resp.Header) if err != nil { return resp.Body } r, err := charset.NewReader(cs, resp.Body) if err != nil { panic("Charset error " + cs) } return r }
func ExampleNewReader() { r, err := charset.NewReader("latin1", strings.NewReader("\xa35 for Pepp\xe9")) if err != nil { log.Fatal(err) } result, err := ioutil.ReadAll(r) if err != nil { log.Fatal(err) } fmt.Printf("%s\n", result) // Output: £5 for Peppé }
func main() { flag.Usage = func() { fmt.Fprintf(os.Stderr, "usage: tcs [-l] [-v] [charset]\n") fmt.Fprintf(os.Stderr, "\ttcs [-f charset] [-t charset] [file]\n") } flag.Parse() if *listFlag { cs := "" switch flag.NArg() { case 1: cs = flag.Arg(0) case 0: default: flag.Usage() } listCharsets(*verboseFlag, cs) return } var f *os.File switch flag.NArg() { case 0: f = os.Stdin case 1: var err error f, err = os.Open(flag.Arg(0)) if err != nil { fatalf("cannot open %q: %v", err) } } r, err := charset.NewReader(*fromCharset, f) if err != nil { fatalf("cannot translate from %q: %v", *fromCharset, err) } w, err := charset.NewWriter(*toCharset, os.Stdout) if err != nil { fatalf("cannot translate to %q: ", err) } _, err = io.Copy(w, r) if err != nil { fatalf("%v", err) } }
// Load the contents of this document from the supplied reader.
func (this *Document) LoadStream(r io.Reader) (err error) {
	// Decoder configured with the document's entity map and a charset
	// converter so non-UTF-8 encodings declared in the prolog decode.
	xp := xml.NewDecoder(r)
	xp.Entity = this.Entity
	xp.CharsetReader = func(enc string, input io.Reader) (io.Reader, error) {
		return charset.NewReader(enc, input)
	}
	this.Root = NewNode(NT_ROOT)
	// ct tracks the current node: StartElement descends, EndElement ascends.
	ct := this.Root
	var tok xml.Token
	var t *Node
	var doctype string
	for {
		if tok, err = xp.Token(); err != nil {
			// EOF is the normal end of the document, not an error.
			if err == io.EOF {
				return nil
			}
			return err
		}
		switch tt := tok.(type) {
		case xml.SyntaxError:
			return errors.New(tt.Error())
		case xml.CharData:
			// NOTE(review): this overwrites rather than appends, so only
			// the last CharData chunk within an element is kept — confirm
			// that is intended.
			ct.Value = strings.TrimSpace(string([]byte(tt)))
		case xml.Comment:
			t := NewNode(NT_COMMENT)
			t.Value = strings.TrimSpace(string([]byte(tt)))
			ct.AddChild(t)
		case xml.Directive:
			t = NewNode(NT_DIRECTIVE)
			t.Value = strings.TrimSpace(string([]byte(tt)))
			ct.AddChild(t)
		case xml.StartElement:
			// New element becomes the current node until its EndElement.
			t = NewNode(NT_ELEMENT)
			t.Name = tt.Name
			t.Attributes = make([]*Attr, len(tt.Attr))
			for i, v := range tt.Attr {
				t.Attributes[i] = new(Attr)
				t.Attributes[i].Name = v.Name
				t.Attributes[i].Value = v.Value
			}
			ct.AddChild(t)
			ct = t
		case xml.ProcInst:
			if tt.Target == "xml" { // xml doctype
				// Pull the value of standalone="..." out of the prolog.
				doctype = strings.TrimSpace(string(tt.Inst))
				if i := strings.Index(doctype, `standalone="`); i > -1 {
					this.StandAlone = doctype[i+len(`standalone="`) : len(doctype)]
					i = strings.Index(this.StandAlone, `"`)
					this.StandAlone = this.StandAlone[0:i]
				}
			} else {
				t = NewNode(NT_PROCINST)
				t.Target = strings.TrimSpace(tt.Target)
				t.Value = strings.TrimSpace(string(tt.Inst))
				ct.AddChild(t)
			}
		case xml.EndElement:
			// Ascend; a nil parent means the root element just closed.
			if ct = ct.Parent; ct == nil {
				return
			}
		}
	}
	return
}
// Crawl fetches and parses the candidate page and extracts an Article
// from it: title, metadata, cleaned text, top node, images and videos.
func (this Crawler) Crawl() *Article {
	article := new(Article)
	this.assignParseCandidate()
	this.assignHtml()
	if this.rawHtml == "" {
		return article
	}
	reader := strings.NewReader(this.rawHtml)
	document, err := goquery.NewDocumentFromReader(reader)
	if err != nil {
		panic(err.Error())
	}
	attr := ""
	// Look for <meta http-equiv="Content-Type" ...> to detect a declared
	// charset. EachWithBreak stops at the first match.
	selection := document.Find("meta").EachWithBreak(func(i int, s *goquery.Selection) bool {
		// NOTE(review): this attr shadows the outer attr — the outer one
		// is only assigned from the "content" attribute below.
		attr, exists := s.Attr("http-equiv")
		if exists && attr == "Content-Type" {
			return false
		}
		return true
	})
	if selection != nil {
		attr, _ = selection.Attr("content")
		attr = strings.Replace(attr, " ", "", -1)
		if strings.HasPrefix(attr, "text/html;charset=") {
			cs := strings.TrimPrefix(attr, "text/html;charset=")
			cs = strings.ToLower(cs)
			if cs != "utf-8" {
				// Re-decode the raw HTML from the declared charset to
				// UTF-8, then re-parse the document.
				r, err := charset.NewReader(cs, strings.NewReader(this.rawHtml))
				if err != nil {
					// On error, skip the read
					this.rawHtml = ""
				} else {
					utf8, _ := ioutil.ReadAll(r)
					this.rawHtml = string(utf8)
				}
				reader = strings.NewReader(this.rawHtml)
				// NOTE(review): err here is the shadowed inner err, so a
				// re-parse failure never reaches the outer `if err == nil`
				// check below — confirm intended.
				document, err = goquery.NewDocumentFromReader(reader)
			}
		}
	}
	if err == nil {
		extractor := NewExtractor(this.config)
		html, _ := document.Html()
		start := TimeInNanoseconds()
		// Populate the article fields from the (possibly re-decoded) DOM.
		article.RawHtml = html
		article.FinalUrl = this.helper.url
		article.LinkHash = this.helper.linkHash
		article.Doc = document
		article.Title = extractor.getTitle(article)
		article.MetaLang = extractor.getMetaLanguage(article)
		article.MetaFavicon = extractor.getFavicon(article)
		article.MetaDescription = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)description]")
		article.MetaKeywords = extractor.getMetaContentWithSelector(article, "meta[name#=(?i)keywords]")
		article.CanonicalLink = extractor.getCanonicalLink(article)
		article.Domain = extractor.getDomain(article)
		article.Tags = extractor.getTags(article)
		cleaner := NewCleaner(this.config)
		article.Doc = cleaner.clean(article)
		// Prefer the OpenGraph image; fall back to scanning the page.
		article.TopImage = OpenGraphResolver(article)
		if article.TopImage == "" {
			article.TopImage = WebPageResolver(article)
		}
		article.TopNode = extractor.calculateBestNode(article)
		if article.TopNode != nil {
			article.TopNode = extractor.postCleanup(article.TopNode)
			outputFormatter := new(outputFormatter)
			article.CleanedText = outputFormatter.getFormattedText(article)
			videoExtractor := NewVideoExtractor()
			article.Movies = videoExtractor.GetVideos(article)
		}
		// Record how long extraction took, in nanoseconds.
		stop := TimeInNanoseconds()
		delta := stop - start
		article.Delta = delta
	} else {
		panic(err.Error())
	}
	return article
}
// ReadOrders reads the order feed from resp (converting it from the
// stream charset), detects whether the feed changed since the last call
// via p.HashStore keyed by the request URL, and returns the new orders.
func (p *OrderReader) ReadOrders(resp *http.Response) ( []*Order, error) {
	if resp == nil {
		panic("ReadOrders(): passed nil response")
	}
	if resp.StatusCode != 200 {
		return nil, errors.New("Server return status " + resp.Status)
	}
	// Decode the body from the feed's charset (windows-1251, per the
	// variable name) to UTF-8.
	w1251rdr, err := charset.NewReader(_STREAM_CHARSET, resp.Body)
	if err != nil {
		return nil, err
	}
	brdr := bufio.NewReaderSize(w1251rdr, _BUFFER_SIZE)
	// skip first line with topics
	if _, err = brdr.ReadString('\n'); err != nil {
		if err == io.EOF {
			return nil, nil
		}
		return nil, errors.New("Skip first line err: " + err.Error())
	}
	// Below get newest chunk from stream and checking chunk
	// from store. Current newest chunk will checking chunk at next
	// time
	newestChunk, err := brdr.ReadBytes('\n')
	if err != nil && err != io.EOF {
		return nil, err
	}
	// There is no orders. Empty result
	if len(newestChunk) == 0 && err == io.EOF {
		return nil, nil
	}
	if newestChunk[len(newestChunk)-1] == '\n' {
		// cut delim \n if there is
		newestChunk = newestChunk[:len(newestChunk)-1]
	}
	// below get checking chunk by url
	rawurl := resp.Request.URL.String()
	// checking chunk was newest chunk at last time
	checkingChunk, exists := p.HashStore.GetHashChunk(rawurl)
	// check for updates if exists data in hashstore by comparing
	// newest chunk and checking chunk
	if exists {
		hash := md5.New()
		hash.Write(newestChunk)
		if bytes.Compare(checkingChunk, hash.Sum(nil)) == 0 {
			// return if feed was not updated
			return nil, nil
		}
	}
	// save newest chunk in cache
	// NOTE(review): the raw chunk is stored here while the comparison
	// above is against its MD5 sum — presumably SetHashChunk hashes
	// internally; verify against the HashStore implementation.
	p.HashStore.SetHashChunk(rawurl, newestChunk)
	if err = p.HashStore.Save(); err != nil {
		log.Println("Can't save cache:", err)
	}
	var orders []*Order
	if order, err := ParseOrder(newestChunk); err == nil {
		orders = append(orders, order)
	} else {
		log.Println("Parsing order error:", err)
	}
	// if exists checking chunk read while does not find matched chunk
	// NOTE(review): checkingChunk is passed as the stop marker — if the
	// store holds a hash (see above), NewCacheReader must hash each line
	// to compare; confirm.
	if exists {
		brdr = bufio.NewReader(NewCacheReader(
			brdr, []byte{'\n'}, checkingChunk,
		))
	}
	// Parse the remaining lines until EOF (or until the cache reader
	// stops at the previously-seen chunk).
	for {
		rowData, err := brdr.ReadBytes('\n')
		if err != nil && err != io.EOF {
			return orders, err
		}
		if err == io.EOF && len(rowData) == 0 {
			break
		}
		if order, err := ParseOrder(rowData); err == nil {
			orders = append(orders, order)
		} else {
			log.Println("Parsing order error:", err)
		}
		if err == io.EOF {
			break
		}
	}
	return orders, nil
}
// Separates body and headers; determines subject, references // etc.; deals with encoding and charset issues. func FormatArticle(article RawArticle) ParsedArticle { rawHeaders, body := firstAndRest(string(article), "\n\n") body = TrimWhite(body) // every element is one header line joinedHeaders := make([]string, 0) buf := "" // some headers are multiline (see RFC 3977, 3.6, „folded“) for _, line := range strings.Split(rawHeaders, "\n") { firstChar := line[0] // line for itself if firstChar != '\t' && firstChar != ' ' && len(buf) > 0 { joinedHeaders = append(joinedHeaders, TrimWhite(buf)) buf = "" } buf = buf + line + "\n" } // all headers headers := make(map[string]string) for _, headerLine := range joinedHeaders { key, value := firstAndRest(headerLine, ": ") key = http.CanonicalHeaderKey(key) headers[key] = value } /* * some important headers */ // References, In-Reply-To rawRefs := headers["References"] + " " + headers["In-Reply-To"] references := headers["References"] inReplyTo := headers["In-Reply-To"] if references != "" && inReplyTo != "" { first := "" // take first that looks like a message id for _, ref := range SplitByWhite(inReplyTo) { if looksLikedMessageId(ref) { first = ref break } } rawRefs = references + " " + first } delete(headers, "References") delete(headers, "In-Reply-To") refs := make([]MessageId, 0) for _, ref := range SplitByWhite(rawRefs) { if ref != "" { refs = append(refs, MessageId(TrimWhite(ref))) } } // Subject subj := headers["Subject"] // base64 or quoted-printable encoded; see RFC 2047 if len(subj) > 0 && subj[0:2] == "=?" { subj = decodeHeader(subj) } delete(headers, "Subject") // „From“ is not important, but nevertheless we have do deal // with it, since it may be base64 or quoted-printable // encoded if from := headers["From"]; len(from) > 0 && from[0:2] == "=?" 
{ headers["From"] = decodeHeader(from) } // Id msgId := headers["Message-Id"] delete(headers, "Message-Id") /* * encoding/charset issues */ // Content-Transfer-Encoding var err error encoding := headers["Content-Transfer-Encoding"] var decoded []byte switch encoding { case "base64": decoded, err = base64.StdEncoding.DecodeString(body) case "quoted-printable": decoded, err = DecodeQuotedPrintable(body) // 7bit, 8bit, other unknown types or nil default: err = nil decoded = []byte(body) } if err != nil { panic(fmt.Sprintf("Fehler (?): %s bei Id: %s und Inhalt '%s'\n", err, msgId, body)) } // determine encoding („charset“) from Content-Type contentType := headers["Content-Type"] contentCharset := "UTF-8" // default charset // contentType looks like „text/plain; charset=UTF-8“ for _, entry := range strings.Split(contentType, ";") { entry = TrimWhite(entry) if len(entry) > 0 && strings.Index(entry, "charset") >= 0 { i := strings.Index(entry, "=") if i >= 0 { contentCharset = entry[i+1:] // maybe the charset is specified with "quotes" if contentCharset[0] == '"' { contentCharset = contentCharset[1 : len(contentCharset)-1] } break } } } // apply contentCharset for { // „decoded“ is nil if the Content-Transfer-Encoding was not // base64 or quoted-printable if normaliseCharset(contentCharset) != "utf8" { r, err := charset.NewReader(contentCharset, strings.NewReader(string(decoded))) // copy bytes for unknown encoding if err != nil { body = string(decoded) log.Printf("encoding error: %s", err) break } decoded, _ = ioutil.ReadAll(r) } body = string(decoded) break } var aTime time.Time if date, ok := headers["Date"]; ok { // we found all these date formats in our corpus, // containing 40000+ messages from comp.lang.forth // comp.lang.lisp, comp.lang.haskell and // rec.games.abstract layouts := []string{ "Mon, 2 Jan 2006 15:04:05 -0700 (MST)", "Mon, 2 Jan 2006 15:04:05 -0700", "Mon, 2 Jan 2006 15:04:05 MST", "Mon, 2 Jan 2006 15:04:05 -0700 (MST-07:00)", "2 Jan 2006 15:04:05 
-0700", "2 Jan 2006 15:04:05 MST", "Mon, 2 Jan 2006 15:04 -0700", } for _, layout := range layouts { aTime, err = time.Parse(layout, date) if err == nil { break } } } return ParsedArticle{ References: refs, Subject: subj, OtherHeaders: headers, Id: MessageId(msgId), Body: body, Date: aTime, } }
func main() { var fileEncoding, outputEncoding, parseSeperator, printSeperator string var debug bool flag.StringVar(&fileEncoding, "e", "", "input encoding, e.g. latin9, defaults to UTF-8") flag.StringVar(&outputEncoding, "o", "", "output encoding, e.g. latin9, defaults to LC_ALL/LANG or UTF-8") flag.StringVar(&parseSeperator, "c", ";", "seperator char used for parsing") flag.StringVar(&printSeperator, "s", "|", "seperator string used for printing") flag.BoolVar(&debug, "d", false, "debug output") // TODO //var alignRight bool //flag.BoolVar(&alignRight, "r", false, "align values to the right instead to the left") flag.Parse() if utf8.RuneCountInString(parseSeperator) > 1 { fmt.Fprintln(os.Stderr, "The parse seperator must be a single char.") flag.Usage() os.Exit(5) } if outputEncoding == "" { outputEncoding = getOutputEnc() } var f *os.File var err error if len(flag.Args()) != 0 { f, err = os.Open(flag.Arg(0)) if err != nil { fmt.Fprintln(os.Stderr, err) os.Exit(10) } } else { f = os.Stdin } var inputReader io.Reader if fileEncoding != "" { inputReader, err = charset.NewReader(fileEncoding, f) if err != nil { fmt.Fprintf(os.Stderr, "input encoding: %s\n", err) os.Exit(20) } } else { inputReader = f } r := csv.NewReader(inputReader) r.Comma, _ = utf8.DecodeLastRuneInString(parseSeperator) r.TrailingComma = true r.TrimLeadingSpace = true r.LazyQuotes = true data, err := r.ReadAll() if len(os.Args) == 2 { f.Close() } if err != nil { fmt.Fprintln(os.Stderr, err) os.Exit(30) } if len(data) == 0 || len(data[0]) == 0 { os.Exit(0) } if debug { fmt.Fprintf(os.Stderr, "DEBUG columns: %d\n", len(data[0])) } colLens := make(map[int]int) for ri, row := range data { for ci, col := range row { col = strings.Trim(col, " \t") data[ri][ci] = col cl := utf8.RuneCountInString(col) l, ex := colLens[ci] if !ex || cl > l { colLens[ci] = cl } } } var out io.Writer = os.Stdout if outputEncoding != "UTF-8" { out, err = charset.NewWriter(outputEncoding, out) if err != nil { 
fmt.Fprintf(os.Stderr, "output encoding: %s\n", err) os.Exit(40) } } for _, row := range data { for i, col := range row { fmt.Fprintf(out, fmt.Sprint("%-", colLens[i]+1, "s"), col) if i != len(colLens)-1 { fmt.Fprintf(out, "%s ", printSeperator) } } fmt.Fprint(out, "\n") } }
func fetchPage(url string) string { req, err := http.NewRequest("GET", url, nil) req.Header.Set("User-Agent", scraperConfig.UserAgent) httpClient := http.Client{ Transport: &http.Transport{ Dial: timeoutDialler(time.Duration(10 * time.Second)), DisableKeepAlives: true, }, } resp, err := httpClient.Do(req) if err != nil { log.Println("HTTP_ERROR:", err) return "" } defer resp.Body.Close() if resp.StatusCode == 200 { var dataStream io.Reader switch charType := fetchCharset(resp.Header.Get("Content-Type")); { case charType == "utf-8": dataStream = resp.Body case chartypeSet[charType]: // charset in available list for conversion charsetStream, err := charset.NewReader(charType, resp.Body) if err != nil { log.Println("ENCODING_ERROR:", err) } else { dataStream = charsetStream } default: //need to guess chartype bodyBytes, err := ioutil.ReadAll(resp.Body) if err != nil { log.Println("IO_ERROR:", err) } detector := chardet.NewHtmlDetector() result, err := detector.DetectBest(bodyBytes) if err != nil { log.Println("ENCODING_ERROR no_known_encoding", url) return "" } charType = strings.ToLower(result.Charset) if chartypeSet[charType] { dataStream = bytes.NewReader(bodyBytes) charsetStream, err := charset.NewReader(charType, dataStream) if err != nil { log.Println("ENCODING_ERROR:", err) } else { dataStream = charsetStream } } } if dataStream != nil { var bodyBytes []byte bodyBytes, err := ioutil.ReadAll(dataStream) if err != nil { log.Println("ERROR:", err) } return string(bodyBytes) } else { log.Println("ENCODING_ERROR: no suitable encoding found for", url) } } return "" }