// DetectCharset stores the best guess for the response body's character set
// in res.Charset.
func (res *Response) DetectCharset() {
	// Detect via BOM / HTML meta tag.
	_, cs1, ok1 := charset.DetermineEncoding(res.Body, res.MediaType)

	// Detect via ICU.
	cs2, ok2, html := "", false, false
	var det *chardet.Detector
	if strings.Contains(res.MediaType, "html") {
		det = chardet.NewHtmlDetector()
		html = true
	} else {
		det = chardet.NewTextDetector()
	}
	r, err := det.DetectAll(res.Body)
	if err == nil && len(r) > 0 {
		cs2 = strings.ToLower(r[0].Charset)
		ok2 = r[0].Confidence > 50
	}

	// Prefer the charset-package result for HTML, otherwise trust ICU.
	if !ok2 && (ok1 || html) {
		res.Charset = cs1
	} else {
		res.Charset = cs2
	}
}
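// A standalone sketch (not from any of these projects) of the
// charset.DetermineEncoding contract that the snippets in this collection
// rely on: a BOM or an explicit charset parameter in the media type is
// reported with certain == true, while anything the function has to guess
// (an HTML <meta> tag, UTF-8 sniffing, or the windows-1252 default) comes
// back with certain == false.
package main

import (
	"fmt"

	"golang.org/x/net/html/charset"
)

func main() {
	page := []byte(`<html><head><meta charset="gbk"></head><body></body></html>`)

	// Detection from the meta tag: name is "gbk", but certain is false.
	_, name, certain := charset.DetermineEncoding(page, "text/html")
	fmt.Printf("%s %t\n", name, certain)

	// An explicit charset parameter takes precedence and is certain.
	_, name, certain = charset.DetermineEncoding(page, "text/html; charset=utf-8")
	fmt.Printf("%s %t\n", name, certain)
}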
func GetUTF8HtmlTitle(str string) string {
	e, name, _ := charset.DetermineEncoding([]byte(str), "text/html")
	// windows-1252 is the package's uncertain fallback; retry with a GBK
	// hint rather than trusting it.
	if name == "windows-1252" {
		e, _, _ = charset.DetermineEncoding([]byte(str), "text/html;charset=gbk")
	}
	r := transform.NewReader(strings.NewReader(str), e.NewDecoder())
	b, err := ioutil.ReadAll(r)
	if err != nil {
		return ""
	}
	return getHtmlTitle(string(b))
}
// DecodedBody returns the body of resp as a decoded byte slice, detecting
// its encoding from the body and the Content-Type header.
func DecodedBody(resp *http.Response) (content []byte, encoding string, err error) {
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		content = body
		return
	}
	e, encoding, _ := charset.DetermineEncoding(body, resp.Header.Get("Content-Type"))
	t := e.NewDecoder()
	content = make([]byte, len(body))
	start := 0
	for {
		var nDst, nSrc int
		nDst, nSrc, err = t.Transform(content[start:], body, true)
		body = body[nSrc:]
		start += nDst
		switch err {
		case transform.ErrShortDst:
			// The destination buffer is full; grow it and keep going.
			newContent := make([]byte, len(content)*2)
			copy(newContent, content)
			content = newContent
		case transform.ErrShortSrc:
			return
		default:
			content = content[:start]
			return
		}
	}
}
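// The manual Transform loop in DecodedBody above hand-rolls destination
// buffer growth on transform.ErrShortDst. A shorter equivalent (a sketch,
// not the original project's code) lets transform.NewReader do the
// buffering; the fake response in main is made up for illustration.
package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"

	"golang.org/x/net/html/charset"
	"golang.org/x/text/transform"
)

func decodedBodySketch(resp *http.Response) (content []byte, name string, err error) {
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return body, "", err
	}
	e, name, _ := charset.DetermineEncoding(body, resp.Header.Get("Content-Type"))
	// transform.NewReader grows its buffers internally, so the explicit
	// ErrShortDst/ErrShortSrc bookkeeping disappears.
	content, err = ioutil.ReadAll(transform.NewReader(bytes.NewReader(body), e.NewDecoder()))
	return content, name, err
}

func main() {
	resp := &http.Response{
		Header: http.Header{"Content-Type": []string{"text/html; charset=gbk"}},
		Body:   ioutil.NopCloser(bytes.NewReader([]byte{0xc4, 0xe3, 0xba, 0xc3})),
	}
	content, name, err := decodedBodySketch(resp)
	fmt.Println(string(content), name, err) // 你好 gbk <nil>
}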
func bodyToUTF8(body []byte, contentType string) (*transform.Reader, error) {
	enc, _, _ := charset.DetermineEncoding(body, contentType)
	if enc == encoding.Nop {
		return nil, werrors.New(ErrEncodingNotFound)
	}
	return transform.NewReader(bytes.NewReader(body), enc.NewDecoder()), nil
}
func parseReader(r io.Reader, url *url.URL) (*Document, error) {
	b, err := ioutil.ReadAll(r)
	if err != nil {
		return nil, err
	}
	enc, _, _ := charset.DetermineEncoding(b, "text/html")
	root, err := html.Parse(bytes.NewReader(b))
	if err != nil {
		return nil, err
	}
	return newDocument(root, url, enc.NewDecoder()), nil
}
// ReEncodeReader re-encodes a reader based on the HTTP content type provided.
func ReEncodeReader(input io.ReadCloser, contentType string) io.ReadCloser {
	if e, _, _ := charset.DetermineEncoding([]byte{}, contentType); e != encoding.Nop {
		// Keep the transformed stream readable while still closing the
		// original body when the caller calls Close.
		type closer struct {
			io.Reader
			io.Closer
		}
		tr := transform.NewReader(input, e.NewDecoder())
		return closer{tr, input}
	}
	return input
}
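// ReEncodeReader above passes an empty preview, so the decision rests
// entirely on the charset parameter of the Content-Type; without one,
// DetermineEncoding falls back to windows-1252, which is not encoding.Nop,
// so the body would be transcoded. A quick standalone check of that
// behaviour (illustrative only, not part of the original package):
package main

import (
	"fmt"

	"golang.org/x/net/html/charset"
	"golang.org/x/text/encoding"
)

func main() {
	for _, ct := range []string{
		"text/html; charset=utf-8",
		"text/html; charset=gbk",
		"text/html",
	} {
		e, name, _ := charset.DetermineEncoding(nil, ct)
		fmt.Printf("%-26s -> %s (nop=%t)\n", ct, name, e == encoding.Nop)
	}
}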
// decodeCharset detects the charset of str and decodes it to UTF-8.
func decodeCharset(str, label string) (nstr string, err error) {
	enc, _ := charset.Lookup(label)
	if enc == nil {
		// Unknown label: fall back to detection from the bytes themselves.
		enc, _, _ = charset.DetermineEncoding([]byte(str), "text/plain")
	}
	nstr, _, err = transform.String(enc.NewDecoder(), str)
	if err != nil {
		return nstr, err
	}
	return stripNonUTF8(nstr), nil
}
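// A minimal demonstration (illustrative only) of the lookup-then-detect
// fallback used by decodeCharset: a known label is honoured directly, and an
// unknown label falls through to detection from the bytes themselves.
package main

import (
	"fmt"

	"golang.org/x/net/html/charset"
	"golang.org/x/text/transform"
)

func main() {
	raw := string([]byte{0xE9, 0x74, 0xE9}) // "été" in Latin-1 bytes

	enc, _ := charset.Lookup("iso-8859-1") // per the HTML spec this maps to windows-1252
	if enc == nil {
		enc, _, _ = charset.DetermineEncoding([]byte(raw), "text/plain")
	}
	out, _, err := transform.String(enc.NewDecoder(), raw)
	fmt.Println(out, err) // été <nil>
}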
func main() {
	content := Gethtml("http://www.jb51.net/")
	file.WriteStringToFile(string(content), "./gbk.html")
	//content := Gethtml("http://www.baidu.com")
	e, n, c := charset.DetermineEncoding(content, "text/html")
	println(e)
	println(n)
	println(c)
	if n != "utf-8" && e != nil {
		s, err := transformString(e.NewDecoder(), string(content))
		if err == nil {
			file.WriteStringToFile(s, "./out.html")
		}
	}
}
func (r *Response) convToUTF8(preview []byte, query func(*url.URL) string) {
	// Convert to UTF-8.
	if media.IsHTML(r.ContentType) {
		e, name, certain := charset.DetermineEncoding(
			preview,
			r.ContentType,
		)
		// According to the charset package source, the default for an
		// unknown charset is windows-1252; in that case, try a charset hint
		// taken from the URL query string, keeping the detected values if
		// the lookup fails.
		if !certain && name == "windows-1252" {
			if e2, name2 := charset.Lookup(query(r.URL)); e2 != nil {
				e, name, certain = e2, name2, true
			}
		}
		r.Charset, r.CertainCharset, r.Encoding = name, certain, e
		if name != "" && e != nil {
			r.Body, _ = util.NewUTF8Reader(name, r.Body)
		}
	}
}
func encodingReader(body []byte, contentType string) (encoding.Encoding, error) {
	preview := make([]byte, 1024)
	var r io.Reader = bytes.NewReader(body)
	n, err := io.ReadFull(r, preview)
	switch {
	case err == io.ErrUnexpectedEOF:
		preview = preview[:n]
		r = bytes.NewReader(preview)
	case err != nil:
		return nil, err
	default:
		r = io.MultiReader(bytes.NewReader(preview), r)
	}
	e, _, certain := charset.DetermineEncoding(preview, contentType)
	// DetermineEncoding falls back to windows-1252 when it is unsure; if the
	// whole body is already valid UTF-8, leave it untouched instead.
	if !certain && e == charmap.Windows1252 && utf8.Valid(body) {
		e = encoding.Nop
	}
	return e, nil
}
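// DetermineEncoding only ever examines the first 1024 bytes, which is why
// encodingReader above re-checks the whole body with utf8.Valid: a page
// whose first kilobyte is plain ASCII falls back to windows-1252 even when
// valid multi-byte UTF-8 appears later. A standalone sketch of that case:
package main

import (
	"bytes"
	"fmt"
	"unicode/utf8"

	"golang.org/x/net/html/charset"
	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/charmap"
)

func main() {
	body := append(bytes.Repeat([]byte("a"), 1024), []byte("café")...)
	preview := body[:1024]

	e, name, certain := charset.DetermineEncoding(preview, "text/plain")
	fmt.Println(name, certain) // windows-1252 false

	if !certain && e == charmap.Windows1252 && utf8.Valid(body) {
		e = encoding.Nop // the whole body is valid UTF-8; don't transcode
	}
	fmt.Println(e == encoding.Nop) // true
}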
func readFileAsUTF8String(filename string) (*string, error) {
	b, err := ioutil.ReadFile(filename)
	if err != nil {
		return nil, err
	}
	encoding, _, _ := charset.DetermineEncoding(b, mimeType)
	decoder := encoding.NewDecoder()
	decodedBytes, _, err := transform.Bytes(decoder, b)
	if err != nil {
		return nil, err
	}

	// Drop the UTF-8 BOM that the decoder may have added. The BOM isn't
	// necessary here, and the string is going to be written into another
	// UTF-8 buffer anyway once it's JSON-serialized.
	//
	// The standard recommends omitting the BOM. See
	// http://www.unicode.org/versions/Unicode5.0.0/ch02.pdf
	decodedBytes = bytes.TrimPrefix(decodedBytes, utf8BOM)

	s := string(decodedBytes)
	return &s, nil
}
func extractTest(test test, writeextract bool) ([]byte, string) {
	in, err := test.input.Open()
	must(err)
	defer in.Close()
	body, err := ioutil.ReadAll(in)
	must(err)
	e, _, _ := charset.DetermineEncoding(body, "UTF-8")
	r := transform.NewReader(bytes.NewReader(body), e.NewDecoder())
	node, err := html.Parse(r)
	must(err)
	_, output, simplified, flattened, cleaned, err := sandblast.ExtractEx(node, 0)
	must(err)
	if writeextract {
		fmt.Printf("SIMPLIFIED:\n%s\n", simplified.DebugString())
		fmt.Printf("FLATTENED:\n%s\n", flattened.DebugString())
		fmt.Printf("CLEANED:\n%s\n", cleaned.DebugString())
	}
	return body, output
}
// GetPageBody fetches a page and stores its body and content type in urlinfo.
func (bot *Bot) GetPageBody(urlinfo *UrlInfo, customHeaders map[string]string) error {
	if urlinfo.URL == "" {
		return errors.New("empty URL")
	}

	// Build the request.
	req, err := http.NewRequest("GET", urlinfo.URL, nil)
	if err != nil {
		return err
	}
	if customHeaders["User-Agent"] == "" {
		customHeaders["User-Agent"] = bot.Config.HttpDefaultUserAgent
	}
	for k, v := range customHeaders {
		req.Header.Set(k, v)
	}

	// Get the response.
	resp, err := bot.HTTPClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// Update the URL if it changed after redirects.
	finalLink := resp.Request.URL.String()
	if finalLink != "" && finalLink != urlinfo.URL {
		bot.Log.Debugf("%s becomes %s", urlinfo.URL, finalLink)
		urlinfo.URL = finalLink
	}

	// Load the body up to PageBodyMaxSize.
	body := make([]byte, bot.Config.PageBodyMaxSize)
	num, err := io.ReadFull(resp.Body, body)
	if err != nil && err != io.ErrUnexpectedEOF {
		return err
	}
	// Trim unneeded zero bytes so that the JSON unmarshaller won't complain.
	body = body[:num]

	// Get the content type.
	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		contentType = http.DetectContentType(body)
	}
	urlinfo.ContentType = contentType

	// If the type is text, decode the body to UTF-8.
	if strings.Contains(contentType, "text/") {
		// Try to get a more significant sample for encoding detection.
		sample := bytes.Join(bot.webContentSampleRe.FindAll(body, -1), []byte{})
		if len(sample) < 100 {
			sample = body
		}
		// Unescape HTML tokens.
		sample = []byte(html.UnescapeString(string(sample)))

		// Try to take only the charset from the content type, because some
		// pages serve a broken Content-Type header.
		detectionContentType := contentType
		tokens := strings.Split(contentType, ";")
		for _, t := range tokens {
			if strings.Contains(strings.ToLower(t), "charset") {
				detectionContentType = "text/plain; " + t
				break
			}
		}

		// Detect the encoding and transform.
		encoding, _, _ := charset.DetermineEncoding(sample, detectionContentType)
		decodedBody, _, _ := transform.Bytes(encoding.NewDecoder(), body)
		urlinfo.Body = decodedBody
	} else if strings.Contains(contentType, "application/json") {
		urlinfo.Body = body
	} else {
		bot.Log.Debugf("Not fetching the body for Content-Type: %s", contentType)
	}
	return nil
}
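// The Content-Type normalization step from GetPageBody above, in isolation
// (a sketch; the header value is made up): only the charset token is kept
// and the media type is replaced with text/plain, so a broken media type
// cannot derail detection.
package main

import (
	"fmt"
	"strings"
)

func main() {
	contentType := "bogus/type;; charset=ISO-8859-2" // hypothetical broken header
	detectionContentType := contentType
	for _, t := range strings.Split(contentType, ";") {
		if strings.Contains(strings.ToLower(t), "charset") {
			detectionContentType = "text/plain; " + t
			break
		}
	}
	fmt.Println(detectionContentType) // text/plain;  charset=ISO-8859-2
}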
func (h ProxyHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	activeConnections.Add(1)
	defer activeConnections.Done()

	conf := GetConfig()

	if !conf.ACLsLoaded {
		http.Error(w, "Redwood proxy configuration needs to be updated for this version of Redwood.\n(Use ACLs)", http.StatusInternalServerError)
		return
	}

	if len(r.URL.String()) > 10000 {
		http.Error(w, "URL too long", http.StatusRequestURITooLong)
		return
	}

	client := r.RemoteAddr
	host, _, err := net.SplitHostPort(client)
	if err == nil {
		client = host
	}

	if conf.AuthCacheTime > 0 {
		auth := r.Header.Get("Proxy-Authorization")
		if auth == "" {
			authCacheLock.RLock()
			ar, ok := authCache[client]
			authCacheLock.RUnlock()
			if ok && time.Since(ar.Time) < time.Duration(conf.AuthCacheTime)*time.Second {
				r.Header.Set("Proxy-Authorization", ar.ProxyAuthorization)
			}
		} else {
			authCacheLock.Lock()
			authCache[client] = authRecord{
				ProxyAuthorization: auth,
				Time:               time.Now(),
			}
			authCacheLock.Unlock()
		}
	}

	if r.Header.Get("Proxy-Authorization") != "" {
		user, pass := ProxyCredentials(r)
		if !conf.ValidCredentials(user, pass) {
			log.Printf("Incorrect username or password from %v: %q:%q", r.RemoteAddr, user, pass)
			r.Header.Del("Proxy-Authorization")
		}
	}

	// Reconstruct the URL if it is incomplete (i.e. on a transparent proxy).
	if r.URL.Host == "" {
		r.URL.Host = r.Host
	}
	if r.URL.Scheme == "" {
		if h.TLS {
			r.URL.Scheme = "https"
		} else {
			r.URL.Scheme = "http"
		}
	}

	var userAgent string
	if conf.LogUserAgent {
		userAgent = r.Header.Get("User-Agent")
	}

	if realHost, ok := conf.VirtualHosts[r.Host]; ok {
		r.Host = realHost
		r.URL.Host = realHost
	}

	user := client
	var authUser string
	if h.user != "" {
		authUser = h.user
	} else if u, _ := ProxyCredentials(r); u != "" {
		authUser = u
	}
	if authUser != "" {
		user = authUser
	}

	tally := conf.URLRules.MatchingRules(r.URL)
	scores := conf.categoryScores(tally)
	categories := conf.significantCategories(scores)

	reqACLs := conf.ACLs.requestACLs(r, authUser)

	possibleActions := []string{
		"allow",
		"block",
		"block-invisible",
	}
	if r.Header.Get("Proxy-Authorization") == "" && !h.TLS {
		possibleActions = append(possibleActions, "require-auth")
	}
	if r.Method == "CONNECT" && conf.TLSReady {
		possibleActions = append(possibleActions, "ssl-bump")
	}

	thisRule, ignored := conf.ChooseACLCategoryAction(reqACLs, categories, possibleActions...)
	if r.Method == "CONNECT" && conf.TLSReady && thisRule.Action == "" {
		// If the result is unclear, go ahead and start to bump the connection.
		// The ACLs will be checked one more time anyway.
		thisRule.Action = "ssl-bump"
	}

	switch thisRule.Action {
	case "require-auth":
		conf.send407(w)
		log.Printf("Missing required proxy authentication from %v to %v", r.RemoteAddr, r.URL)
		return
	case "block":
		conf.showBlockPage(w, r, user, tally, scores, thisRule)
		logAccess(r, nil, 0, false, user, tally, scores, thisRule, "", ignored, userAgent)
		return
	case "block-invisible":
		showInvisibleBlock(w)
		logAccess(r, nil, 0, false, user, tally, scores, thisRule, "", ignored, userAgent)
		return
	case "ssl-bump":
		conn, err := newHijackedConn(w)
		if err != nil {
			fmt.Fprintln(conn, "HTTP/1.1 500 Internal Server Error")
			fmt.Fprintln(conn)
			fmt.Fprintln(conn, err)
			conn.Close()
			return
		}
		fmt.Fprint(conn, "HTTP/1.1 200 Connection Established\r\n\r\n")
		SSLBump(conn, r.URL.Host, user, authUser)
		return
	}

	if r.Host == localServer {
		conf.ServeMux.ServeHTTP(w, r)
		return
	}

	if r.Method == "CONNECT" {
		conn, err := newHijackedConn(w)
		if err != nil {
			fmt.Fprintln(conn, "HTTP/1.1 500 Internal Server Error")
			fmt.Fprintln(conn)
			fmt.Fprintln(conn, err)
			conn.Close()
			return
		}
		fmt.Fprint(conn, "HTTP/1.1 200 Connection Established\r\n\r\n")
		logAccess(r, nil, 0, false, user, tally, scores, thisRule, "", ignored, userAgent)
		connectDirect(conn, r.URL.Host, nil)
		return
	}

	if r.Header.Get("Upgrade") == "websocket" {
		h.makeWebsocketConnection(w, r)
		return
	}

	r.Header.Add("Via", r.Proto+" Redwood")
	r.Header.Add("X-Forwarded-For", client)

	gzipOK := !conf.DisableGZIP && strings.Contains(r.Header.Get("Accept-Encoding"), "gzip") && !lanAddress(client)
	r.Header.Del("Accept-Encoding")

	urlChanged := conf.changeQuery(r.URL)

	if !urlChanged {
		// Rebuild the URL in a way that will preserve which characters are escaped
		// and which aren't, for compatibility with broken servers.
		rawURL := r.RequestURI
		if strings.HasPrefix(rawURL, r.URL.Scheme) {
			rawURL = rawURL[len(r.URL.Scheme):]
			rawURL = strings.TrimPrefix(rawURL, "://")
			slash := strings.Index(rawURL, "/")
			if slash == -1 {
				rawURL = "/"
			} else {
				rawURL = rawURL[slash:]
			}
		}
		q := strings.Index(rawURL, "?")
		if q != -1 {
			rawURL = rawURL[:q]
		}
		if strings.HasPrefix(rawURL, "//") {
			// The path should start with a single slash, not two.
			rawURL = rawURL[1:]
		}
		r.URL.Opaque = rawURL
	}

	proxied := false
	var rt http.RoundTripper
	if h.rt == nil {
		if r.URL.Opaque != "" && transport.Proxy != nil {
			if p, _ := transport.Proxy(r); p != nil {
				// If the request is going through a proxy, the host needs to be
				// included in the opaque element.
				r.URL.Opaque = "//" + r.URL.Host + r.URL.Opaque
				proxied = true
			}
		}
		rt = &transport
	} else {
		rt = h.rt
	}

	if !proxied {
		r.Header.Del("Proxy-Authorization")
	}

	resp, err := rt.RoundTrip(r)
	r.URL.Opaque = ""

	if err != nil {
		http.Error(w, err.Error(), http.StatusServiceUnavailable)
		log.Printf("error fetching %s: %s", r.URL, err)
		logAccess(r, nil, 0, false, user, tally, scores, thisRule, "", ignored, userAgent)
		return
	}
	defer resp.Body.Close()

	// Prevent switching to QUIC.
	resp.Header.Del("Alternate-Protocol")

	originalContentType := resp.Header.Get("Content-Type")
	fixContentType(resp)

	respACLs := conf.ACLs.responseACLs(resp)
	acls := unionACLSets(reqACLs, respACLs)

	thisRule, ignored = conf.ChooseACLCategoryAction(acls, categories, "allow", "block", "block-invisible", "hash-image", "phrase-scan")
	if thisRule.Action == "" {
		thisRule.Action = "allow"
	}

	switch thisRule.Action {
	case "allow":
		resp.Header.Set("Content-Type", originalContentType)
		copyResponseHeader(w, resp)
		n, err := io.Copy(w, resp.Body)
		if err != nil {
			log.Printf("error while copying response (URL: %s): %s", r.URL, err)
		}
		logAccess(r, resp, int(n), false, user, tally, scores, thisRule, "", ignored, userAgent)
		return
	case "block":
		conf.showBlockPage(w, r, user, tally, scores, thisRule)
		logAccess(r, resp, 0, false, user, tally, scores, thisRule, "", ignored, userAgent)
		return
	case "block-invisible":
		showInvisibleBlock(w)
		logAccess(r, resp, 0, false, user, tally, scores, thisRule, "", ignored, userAgent)
		return
	}

	lr := &io.LimitedReader{
		R: resp.Body,
		N: 1e6,
	}
	content, err := ioutil.ReadAll(lr)
	if err != nil {
		log.Printf("error while reading response body (URL: %s): %s", r.URL, err)
	}
	if lr.N == 0 {
		log.Println("response body too long to filter:", r.URL)
		resp.Header.Set("Content-Type", originalContentType)
		var dest io.Writer = w
		if gzipOK {
			resp.Header.Set("Content-Encoding", "gzip")
			resp.Header.Del("Content-Length")
			gzw := gzip.NewWriter(w)
			defer gzw.Close()
			dest = gzw
		}
		copyResponseHeader(w, resp)
		dest.Write(content)
		n, err := io.Copy(dest, resp.Body)
		if err != nil {
			log.Printf("error while copying response (URL: %s): %s", r.URL, err)
		}
		logAccess(r, resp, int(n)+len(content), false, user, tally, scores, ACLActionRule{Action: "allow", Needed: []string{"too-long-to-filter"}}, "", ignored, userAgent)
		return
	}

	modified := false
	pageTitle := ""

	switch thisRule.Action {
	case "phrase-scan":
		contentType := resp.Header.Get("Content-Type")
		_, cs, _ := charset.DetermineEncoding(content, contentType)
		if strings.Contains(contentType, "html") {
			var doc *html.Node
			if conf.LogTitle {
				doc, err = parseHTML(content, cs)
				if err != nil {
					log.Printf("Error parsing HTML from %s: %s", r.URL, err)
				} else {
					t := titleSelector.MatchFirst(doc)
					if t != nil {
						if titleText := t.FirstChild; titleText != nil && titleText.Type == html.TextNode {
							pageTitle = titleText.Data
						}
					}
				}
			}
			modified = conf.pruneContent(r.URL, &content, cs, acls, doc)
			if modified {
				resp.Header.Set("Content-Type", "text/html; charset=utf-8")
				cs = "utf-8"
				resp.Header.Del("Content-Length")
			}
		}
		conf.scanContent(content, contentType, cs, tally)

	case "hash-image":
		img, _, err := image.Decode(bytes.NewReader(content))
		if err != nil {
			log.Printf("Error decoding image from %v: %v", r.URL, err)
			break
		}
		hash := dhash.New(img)
		for _, h := range conf.ImageHashes {
			if dhash.Distance(hash, h) <= conf.DhashThreshold {
				tally[rule{imageHash, h.String()}]++
			}
		}
	}

	scores = conf.categoryScores(tally)
	categories = conf.significantCategories(scores)
	thisRule, ignored = conf.ChooseACLCategoryAction(acls, categories, "allow", "block", "block-invisible")
	if thisRule.Action == "" {
		thisRule.Action = "allow"
	}

	switch thisRule.Action {
	case "block":
		conf.showBlockPage(w, r, user, tally, scores, thisRule)
		logAccess(r, resp, len(content), modified, user, tally, scores, thisRule, pageTitle, ignored, userAgent)
		return
	case "block-invisible":
		showInvisibleBlock(w)
		logAccess(r, resp, len(content), modified, user, tally, scores, thisRule, pageTitle, ignored, userAgent)
		return
	}

	if !modified {
		resp.Header.Set("Content-Type", originalContentType)
	}

	if gzipOK && len(content) > 1000 {
		resp.Header.Set("Content-Encoding", "gzip")
		resp.Header.Del("Content-Length")
		copyResponseHeader(w, resp)
		gzw := gzip.NewWriter(w)
		gzw.Write(content)
		gzw.Close()
	} else {
		copyResponseHeader(w, resp)
		w.Write(content)
	}

	logAccess(r, resp, len(content), modified, user, tally, scores, thisRule, pageTitle, ignored, userAgent)
}
// runURLTest prints debugging information about how the URL and its content
// would be rated.
func runURLTest(u string) {
	conf := getConfig()

	URL, err := url.Parse(u)
	if err != nil {
		fmt.Println("Could not parse the URL.")
		return
	}

	if URL.Scheme == "" {
		url2, err := url.Parse("http://" + u)
		if err == nil {
			URL = url2
		}
	}

	fmt.Println("URL:", URL)
	fmt.Println()

	tally := conf.URLRules.MatchingRules(URL)
	scores := conf.categoryScores(tally)
	categories := conf.significantCategories(scores)

	if len(tally) == 0 {
		fmt.Println("No URL rules match.")
	} else {
		fmt.Println("The following URL rules match:")
		for s := range tally {
			fmt.Println(s)
		}
	}

	if len(scores) > 0 {
		fmt.Println()
		fmt.Println("The request has the following category scores:")
		printSortedTally(scores)
	}

	req := &http.Request{
		Method: "GET",
		URL:    URL,
		Header: make(http.Header),
	}
	reqACLs := conf.ACLs.requestACLs(req, "")
	if len(reqACLs) > 0 {
		fmt.Println()
		fmt.Println("The request matches the following ACLs:")
		for acl := range reqACLs {
			fmt.Println(acl)
		}
	}

	thisRule, ignored := conf.ChooseACLCategoryAction(reqACLs, categories, "allow", "block", "block-invisible")
	fmt.Println()
	if thisRule.Action == "" {
		fmt.Println("No ACL rule was triggered.")
	} else {
		fmt.Println("Triggered rule:", thisRule.Action, thisRule.Conditions())
		if len(ignored) > 0 {
			fmt.Println("Ignored categories:", strings.Join(ignored, ", "))
		}
	}

	if conf.changeQuery(URL) {
		fmt.Println()
		fmt.Println("URL modified to:", URL)
	}

	fmt.Println()
	fmt.Println("Downloading content...")

	resp, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer resp.Body.Close()

	fmt.Println(resp.Status)
	fmt.Println()

	fixContentType(resp)
	respACLs := conf.ACLs.responseACLs(resp)
	acls := unionACLSets(reqACLs, respACLs)
	if len(respACLs) > 0 {
		fmt.Println("The response matches the following ACLs:")
		for acl := range respACLs {
			fmt.Println(acl)
		}
		fmt.Println()
	}

	thisRule, ignored = conf.ChooseACLCategoryAction(acls, categories, "allow", "block", "block-invisible", "hash-image", "phrase-scan")
	if thisRule.Action == "" {
		fmt.Println("No ACL rule was triggered.")
	} else {
		fmt.Println("Triggered rule:", thisRule.Action, thisRule.Conditions())
		if len(ignored) > 0 {
			fmt.Println("Ignored categories:", strings.Join(ignored, ", "))
		}
	}

	if thisRule.Action != "phrase-scan" && thisRule.Action != "hash-image" {
		return
	}

	fmt.Println()

	contentType := resp.Header.Get("Content-Type")
	content, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("Error while reading response body:", err)
		return
	}

	var doc *html.Node
	switch thisRule.Action {
	case "phrase-scan":
		modified := false
		_, cs, _ := charset.DetermineEncoding(content, resp.Header.Get("Content-Type"))
		if strings.Contains(contentType, "html") {
			modified = conf.pruneContent(URL, &content, cs, acls, doc)
		}
		if modified {
			cs = "utf-8"
			fmt.Println("Performed content pruning.")
			fmt.Println()
		}

		conf.scanContent(content, contentType, cs, tally)
		if len(tally) == 0 {
			fmt.Println("No content phrases match.")
		} else {
			fmt.Println("The following rules match:")
			printSortedTally(stringTally(tally))
		}

	case "hash-image":
		img, _, err := image.Decode(bytes.NewReader(content))
		if err != nil {
			fmt.Printf("Error decoding image: %v\n", err)
			return
		}
		hash := dhash.New(img)
		fmt.Println("The image's hash is", hash)
		for _, h := range conf.ImageHashes {
			distance := dhash.Distance(hash, h.Hash)
			if distance <= h.Threshold || h.Threshold == -1 && distance <= conf.DhashThreshold {
				tally[rule{imageHash, h.String()}]++
				fmt.Printf("Matching image hash found: %v (%d bits difference)\n", h, distance)
			}
		}
	}

	scores = conf.categoryScores(tally)
	categories = conf.significantCategories(scores)
	if len(scores) > 0 {
		fmt.Println()
		fmt.Println("The response has the following category scores:")
		printSortedTally(scores)
	}

	fmt.Println()
	thisRule, ignored = conf.ChooseACLCategoryAction(acls, categories, "allow", "block", "block-invisible")
	if thisRule.Action == "" {
		fmt.Println("No ACL rule was triggered.")
	} else {
		fmt.Println("Triggered rule:", thisRule.Action, thisRule.Conditions())
		if len(ignored) > 0 {
			fmt.Println("Ignored categories:", strings.Join(ignored, ", "))
		}
	}
}
// DetectEncoding returns the name of the character encoding detected in content.
func DetectEncoding(content []byte) string {
	_, name, _ := charset.DetermineEncoding(content, setting.Repository.AnsiCharset)
	return name
}
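// A caveat worth noting for DetectEncoding's AnsiCharset hint (standalone
// sketch with hypothetical values): DetermineEncoding reads the charset from
// the parameters of a media type, so a bare label such as "gbk" is silently
// ignored; the hint only takes effect as a full media type like
// "text/plain; charset=gbk". The same applies to the "UTF-8" content type
// passed in extractTest above.
package main

import (
	"fmt"

	"golang.org/x/net/html/charset"
)

func main() {
	content := []byte{0xc4, 0xe3, 0xba, 0xc3} // "你好" in GBK bytes

	_, name1, _ := charset.DetermineEncoding(content, "gbk")
	_, name2, _ := charset.DetermineEncoding(content, "text/plain; charset=gbk")
	fmt.Println(name1, name2) // windows-1252 gbk
}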