// goodAsText applies some heuristics to make the data look good when displayed
// as simple text. For example, if the data is escaped HTML, heuristics are
// applied to remove the HTML; if the data contains an HTML image tag,
// goodAsText returns the alt text. If nothing good is found, an empty slice is
// returned.
func goodAsText(d []byte) []byte {
	unesc := html.UnescapeString(string(d))
	nodes, err := html.ParseFragment(strings.NewReader(unesc), bodyNode)
	if err != nil {
		log.Printf("failed to parse [%s] as HTML: %v", unesc, err)
		return d
	}
	var buf bytes.Buffer
	for _, root := range nodes {
		walk(root, func(n *html.Node) {
			if n.Type == html.TextNode {
				buf.WriteString(strings.TrimSpace(n.Data))
				return
			}
			if n := buf.Len(); n > 0 && buf.Bytes()[n-1] != ' ' {
				buf.WriteString(" ")
			}
			if n.DataAtom == atom.Img {
				if alt := altTextOrEmpty(n); alt != "" {
					buf.WriteString(alt)
				}
			}
		})
	}
	return buf.Bytes()
}
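// A hedged usage sketch for goodAsText, assuming the unshown helpers behave as
// their names suggest (walk does a depth-first traversal; altTextOrEmpty returns
// an img node's alt attribute). An escaped fragment collapses to its visible
// text plus any image alt text:
//
//	d := []byte("&lt;p&gt;Hello &lt;img alt=&quot;world&quot; src=&quot;x.png&quot;&gt;&lt;/p&gt;")
//	fmt.Printf("%s\n", goodAsText(d)) // expected, roughly: "Hello world"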
func (c atomContent) Data() []byte {
	unesc := c.Contents
	if c.Type != "xhtml" {
		unesc = []byte(html.UnescapeString(string(c.Contents)))
	}
	return unesc
}
func pageTitle(body []byte) string {
	match := matchTitle(body, 1)
	if len(match) == 0 {
		return ""
	}
	return html.UnescapeString(string(match[0][1]))
}
// fullyHTMLUnescape tries (up to 10 times) to unescape a string.
// Some feeds are double escaped with things like: &amp;
func fullyHTMLUnescape(orig string) string {
	mod := orig
	for i := 0; i < 10; i++ {
		mod = html.UnescapeString(orig)
		if orig == mod {
			return mod
		}
		orig = mod
	}
	return mod
}
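// A minimal, hedged check of fullyHTMLUnescape on a double-escaped title,
// written in Go's testable-example style (the example name and the "fmt"
// import are illustrative additions, not part of the original source):
func ExampleFullyHTMLUnescape() {
	// "&amp;amp;" becomes "&amp;" on the first pass and "&" on the second.
	fmt.Println(fullyHTMLUnescape("Tom &amp;amp; Jerry"))
	// Output: Tom & Jerry
}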
func (n *Node) AllText(pat ...string) *string {
	ss := []string{}
	for _, n := range n.Descendants(TextNode).All() {
		if text := n.Text(pat...); text != nil && *text != "" {
			ss = append(ss, *text)
		}
	}
	s := html.UnescapeString(strings.Join(ss, " "))
	if s != "" {
		return &s
	}
	return nil
}
func SnipText(s string, length int) string {
	s = snipRe.ReplaceAllString(strings.TrimSpace(s), " ")
	s = html.UnescapeString(s)
	if len(s) <= length {
		return s
	}
	s = s[:length]
	i := strings.LastIndexAny(s, " .-!?")
	if i != -1 {
		return s[:i]
	}
	return CleanNonUTF8(s)
}
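// A hedged sketch of SnipText's truncation behavior, assuming snipRe matches
// runs of whitespace and CleanNonUTF8 trims a trailing partial rune (neither
// helper is shown here):
//
//	SnipText("one  two\nthree four", 12)
//	// whitespace collapses to "one two three four"; the 12-byte prefix is
//	// "one two thre", which is cut back to the last space: "one two"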
func cleanNode(c *Config, n *html.Node) *html.Node {
	allowedAttr, ok1 := c.elem[n.DataAtom]
	customAttr, ok2 := c.elemCustom[n.Data]
	if ok1 || ok2 {
		cleanChildren(c, n)
		haveSrc := false
		attrs := n.Attr
		n.Attr = make([]html.Attribute, 0, len(attrs))
		for _, attr := range attrs {
			a := atom.Lookup([]byte(attr.Key))
			re1, ok1 := allowedAttr[a]
			re2, ok2 := customAttr[attr.Key]
			_, ok3 := c.attr[a]
			_, ok4 := c.attrCustom[attr.Key]
			if attr.Namespace != "" || (!ok1 && !ok2 && !ok3 && !ok4) {
				continue
			}
			if !cleanURL(c, a, &attr) {
				continue
			}
			if re1 != nil && !re1.MatchString(attr.Val) {
				continue
			}
			if re2 != nil && !re2.MatchString(attr.Val) {
				continue
			}
			haveSrc = haveSrc || a == atom.Src
			n.Attr = append(n.Attr, attr)
		}
		if n.DataAtom == atom.Img && !haveSrc {
			// replace it with an empty text node
			return &html.Node{Type: html.TextNode}
		}
		return n
	}
	return text(html.UnescapeString(Render(n)))
}
func (entityDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for nSrc < len(src) && nDst < len(dst) {
		if c := src[nSrc]; c != '&' {
			dst[nDst] = c
			nSrc++
			nDst++
			continue
		}
		// Try to decode a character entity.
		entityLen := 1
		for entityLen < 32 {
			if nSrc+entityLen == len(src) {
				if atEOF {
					break
				} else {
					err = transform.ErrShortSrc
					return
				}
			}
			if b := src[nSrc+entityLen]; 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || '0' <= b && b <= '9' || entityLen == 1 && b == '#' || b == ';' {
				entityLen++
				if b == ';' {
					break
				}
			} else {
				break
			}
		}
		e := string(src[nSrc : nSrc+entityLen])
		decoded := html.UnescapeString(e)
		n := copy(dst[nDst:], decoded)
		if n < len(decoded) {
			err = transform.ErrShortDst
			return
		}
		nSrc += entityLen
		nDst += len(decoded)
	}
	if nSrc < len(src) && err == nil {
		err = transform.ErrShortDst
	}
	return
}
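// A hedged usage sketch: streaming entity decoding with x/text's transform
// package, assuming entityDecoder also satisfies transform.Transformer's Reset
// method (e.g. by embedding transform.NopResetter; that part is not shown above):
//
//	r := transform.NewReader(strings.NewReader("fish &amp; chips"), entityDecoder{})
//	decoded, _ := io.ReadAll(r)
//	fmt.Println(string(decoded)) // fish & chips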
func GetTrending(useragent string) (games_result []Trending, success bool) {
	games := []Trending{}
	client := &http.Client{}
	req, err := http.NewRequest("GET", "http://steamcharts.com/", nil)
	if err != nil {
		log.Error(err.Error())
		return games, false
	}
	req.Header.Set("User-Agent", useragent)
	resp, err := client.Do(req)
	if err != nil {
		log.Error(err.Error())
		return games, false
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error(err.Error())
		return games, false
	}
	s_body := string(body)
	s_body = strings.Replace(s_body, "\n", "", -1)
	re := regexp.MustCompile("td class=\"game-name left\">.+?href=\"/app/(\\d+?)\">(.+?)</a>.+?\"gain\">(.+?)</td>.+? class=\"num\">(.+?)</td>")
	matches := re.FindAllStringSubmatch(s_body, -1)
	if matches != nil {
		for _, match := range matches {
			log.Debugf("Found match: %s, %s, %s, %s", match[1], match[2], match[3], match[4])
			app_s := match[1]
			name := strings.TrimSpace(match[2])
			gain := html.UnescapeString(match[3])
			num_s := strings.Replace(match[4], ",", "", -1)
			app, _ := strconv.Atoi(app_s)
			num, _ := strconv.Atoi(num_s)
			games = append(games, Trending{app, name, gain, num})
		}
		return games, true
	}
	return games, false
}
func parseRdf(u string, b []byte) (*Feed, []*Story, error) {
	rd := rdf.RDF{}
	d := xml.NewDecoder(bytes.NewReader(b))
	d.CharsetReader = charset.NewReaderLabel
	d.Strict = false
	d.Entity = xml.HTMLEntity
	err := d.Decode(&rd)
	if err != nil {
		return nil, nil, err
	}
	f := Feed{
		URL: u,
	}
	s := []*Story{}
	if rd.Channel != nil {
		f.Title = rd.Channel.Title
		f.Link = rd.Channel.Link
		if t, err := parseDate(rd.Channel.Date); err == nil {
			f.Updated = t
		}
	}
	for _, i := range rd.Item {
		st := Story{
			ID:     i.About,
			Title:  i.Title,
			Link:   i.Link,
			Author: i.Creator,
			Feed:   &f,
		}
		st.Content = html.UnescapeString(i.Description)
		if t, err := parseDate(i.Date); err == nil {
			st.Published = t
			st.Updated = t
		}
		s = append(s, &st)
	}
	return parseFix(&f, s)
}
func (t Text) HTML() (s string, err error) {
	switch t.Type {
	case "html":
		unesc := html.UnescapeString(t.Content)
		s = fmt.Sprintf("<div>%s</div>", unesc)
	case "xhtml":
		r := strings.NewReader(t.Content)
		tokenizer := html.NewTokenizer(r)
		err = nextToken(tokenizer)
		if err != nil {
			return
		}
		s, err = buildHTML(tokenizer)
	case "text":
		s = fmt.Sprintf("<pre>%s</pre>", t.Content)
	default:
		s = fmt.Sprintf("<pre>%s</pre>", t.Content)
	}
	return
}
func getGroupKeyword(n *html.Node) string {
	s, _ := selector.Selector("#news_detail .icon_box .icon-name")
	nodes := s.Find(n)
	if len(nodes) > 0 {
		s := strings.TrimSpace(
			html.UnescapeString(
				extractNodeString(nodes[0]),
			),
		)
		// we don't need the year suffix.
		if strings.HasPrefix(s, "モーニング娘。") {
			return "モーニング娘。"
		}
		// TODO: any other imports than ハロコン?
		if s == "HELLO! PROJECT" {
			return "ハロコン"
		}
		return s
	}
	return ""
}
func (info *HTMLInfo) parseBody(n *html.Node) {
	if !info.AllowMainContentExtraction {
		return
	}
	buf := new(bytes.Buffer)
	err := html.Render(buf, n)
	if err != nil {
		return
	}
	bufStr := buf.String()
	doc, err := readability.NewDocument(bufStr)
	if err != nil {
		return
	}
	doc.WhitelistTags = []string{"div", "p", "img"}
	doc.WhitelistAttrs["img"] = []string{"src", "title", "alt"}
	content := doc.Content()
	content = html.UnescapeString(content)
	info.MainContent = strings.Trim(content, "\r\n\t ")
}
func main() {
	flag.Parse()
	// panic("Just Quit")
	getHostConfig()
	// runtime.GOMAXPROCS(2)
	timeout = 1000
	fmt.Println("Feeds")
	//http://careers.stackoverflow.com/jobs/feed?searchTerm=big+data&location=san+francisco&range=100&distanceUnits=Miles
	// feeds = append(feeds, Feed{index: 0, url: "http://careers.stackoverflow.com/jobs/feed?searchTerm=big+data&location=san+francisco&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 0, url: "http://careers.stackoverflow.com/jobs/feed?location=san+francisco%2c+ca&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 1, url: "http://careers.stackoverflow.com/jobs/feed?location=new+york+city%2c+ny&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 2, url: "http://careers.stackoverflow.com/jobs/feed?location=los+angeles%2c+ca&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 3, url: "http://careers.stackoverflow.com/jobs/feed?location=boston%2c+ma&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 4, url: "http://careers.stackoverflow.com/jobs/feed?location=seattle%2cwa&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 5, url: "http://careers.stackoverflow.com/jobs/feed?location=austin%2ctx&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 6, url: "http://careers.stackoverflow.com/jobs/feed?location=chicago%2cil&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	mutex = &sync.Mutex{}
	skillMap = make(map[string]int, 200)
	loadSkillMapFile(skillMap)
	fmt.Println("GetRSS")
	getRSS2()
	saveSkillMapFile(skillMap)
	if conf.hbaseZkURL != "" {
		saveSkillsMapHBase(skillMap)
	}
	for i := 0; i < len(guidList); i++ {
		fmt.Println(guidList[i])
	}
	// guidList := make([]string, 4)
	// guidList[0] = "http://careers.stackoverflow.com/jobs/103310/senior-software-engineer-american-society-of-clinical"
	// guidList[1] = "http://careers.stackoverflow.com/jobs/94152/senior-software-engineer-platform-flixster"
	// guidList[2] = "http://careers.stackoverflow.com/jobs/103328/senior-full-stack-engineer-data-science-adroll"
	// guidList[3] = "http://careers.stackoverflow.com/jobs/104086/enterprise-architect-new-relic"
	// fmt.Printf("%v\n", s)

	// map random times & make s3names
	fw.Slice(guidList).Map(func(sURL string) URLTuple {
		fmt.Printf("Map1: %v\n", sURL)
		fName := "jobs_sof/" + strings.Replace(strings.TrimPrefix(sURL, "http://careers.stackoverflow.com/jobs/"), "/", "_", -1)
		ms := rand.Intn(3000)
		return URLTuple{sURL, fName, ms}
		// Filter already-acquired URLs
	}).Filter(func(uTuple URLTuple) bool {
		// is file already stored in S3?
		//fmt.Printf("Filter:%s, %v\n", uTuple.s3Name, uTuple)
		svcS3 := s3.New(session.New(&aws.Config{Region: aws.String("us-east-1")}))
		params := &s3.HeadObjectInput{
			Bucket: aws.String("opps"),        // Required
			Key:    aws.String(uTuple.s3Name), // Required
		}
		hobj, _ := svcS3.HeadObject(params)
		fmt.Printf("Filter: %s => %v\n", uTuple.s3Name, hobj.ContentLength == nil)
		return hobj.ContentLength == nil
		// get the URLs
	}).Map(func(uTuple URLTuple) statusTuple {
		fmt.Printf("Map3: %v\n", uTuple)
		// random sleep
		time.Sleep(time.Duration(uTuple.msWait) * time.Millisecond)
		// get URL
		resp, err := http.Get(uTuple.gURL)
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()
		// fmt.Println("Body:", resp.Body)
		// fmt.Println("Proto:", resp.Proto)
		// fmt.Printf("response Status = <%s> / Length = %d\n", resp.Status, resp.ContentLength)
		// fmt.Println("response Headers:", resp.Header)
		// fmt.Printf("response %+v:\n", resp)
		// fmt.Println("response Body:", string(body))
		failed := 0
		passed := 0
		if resp.StatusCode == 200 {
			passed = 1
		} else {
			failed = 1
		}
		// store in S3
		if passed == 1 {
			body, _ := ioutil.ReadAll(resp.Body)
			reader := strings.NewReader(string(body))
			root, err := html.Parse(reader)
			if err != nil {
				fmt.Printf("%+v\n", err)
			}
			var b bytes.Buffer
			html.Render(&b, root)
			fixedHtml := b.String()
			isOk := func(r rune) bool {
				return r < 32 || r >= 127
			}
			// The isOk filter is such that there is no need to chain to norm.NFC
			t2 := transform.Chain(norm.NFKD, transform.RemoveFunc(isOk))
			// This Transformer could also trivially be applied as an io.Reader
			// or io.Writer filter to automatically do such filtering when reading
			// or writing data anywhere.
			fixedUnicodeNFKD, _, _ := transform.String(t2, fixedHtml)
			// fmt.Println("\n\n\n" + fixedUnicodeNFKD)
			reader = strings.NewReader(fixedUnicodeNFKD)
			xmlroot, xmlerr := xmlpath.ParseHTML(reader)
			if xmlerr != nil {
				log.Fatal(xmlerr)
			}
			// fmt.Printf("xml root = %+v\n------\n", xmlroot)
			pstr := `/html/head/title`
			path := xmlpath.MustCompile(pstr)
			var ok bool
			title := ""
			if title, ok = path.String(xmlroot); ok {
				// fmt.Printf("%s: %s\n", pstr, title)
			}
			fmt.Printf("**** Title: %s\n", title)
			var iter *xmlpath.Iter
			var list *xmlpath.Path
			var cnt int
			// Location - needs Trim
			pstr = `//*[@id="hed"]/ul[1]/li/text()`
			path = xmlpath.MustCompile(pstr)
			location := ""
			if location, ok = path.String(xmlroot); ok {
				// fmt.Printf("Location - %s: %s\n", pstr, strings.Trim(location, " \n"))
				location = strings.Trim(location, " \n")
			}
			// Base Skills - LOOP from 1 until not ok
			var skills []string
			list = xmlpath.MustCompile(`//*[@id="hed"]/div[2]/p/a`)
			iter = list.Iter(xmlroot)
			for iter.Next() {
				ele := iter.Node().String()
				skills = append(skills, ele)
				// fmt.Printf("Sk-Desc: %s\n", ele)
			}
			var desc []string
			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[2]/p`)
			iter = list.Iter(xmlroot)
			for iter.Next() {
				ele := iter.Node().String()
				desc = append(desc, ele)
				// fmt.Printf("it-Desc1: %s\n", ele)
			}
			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[2]/ul/li`)
			iter = list.Iter(xmlroot)
			for iter.Next() {
				ele := iter.Node().String()
				desc = append(desc, ele)
				// fmt.Printf("it-Desc2: %s\n", ele)
			}
			var sSNR []string
			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[3]/p`)
			iter = list.Iter(xmlroot)
			cnt = 0
			for iter.Next() {
				ele := iter.Node().String()
				sSNR = append(sSNR, ele)
				// fmt.Printf("Skills1 (%d): %s\n", cnt, ele)
				cnt++
			}
			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[3]/ul/li/text()`)
			iter = list.Iter(xmlroot)
			cnt = 0
			for iter.Next() {
				ele := iter.Node().String()
				sSNR = append(sSNR, ele)
				// fmt.Printf("Skills2(%d): %s\n", cnt, ele)
				cnt++
			}
			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[3]/ul/li/ul/li/text()`)
			iter = list.Iter(xmlroot)
			cnt = 0
			for iter.Next() {
				ele := iter.Node().String()
				sSNR = append(sSNR, ele)
				// fmt.Printf("Skills3(%d): %s\n", cnt, ele)
				cnt++
			}
			// about company -
			// pstr = `//*[@id="jobdetailpage"]/div[2]/div[1]/div[4]/p/text()`
			// //*[@id="jobdetailpage"]/div[2]/div[1]/div[2]/p[2]/text()[1]
			// path = xmlpath.MustCompile(pstr)
			// about := ""
			// if about, ok = path.String(xmlroot); ok {
			// 	fmt.Printf("About: %s - %s\n", pstr, about)
			// }
			var about []string
			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[4]/p`)
			//*[@id="jobdetailpage"]/div[2]/div[1]/div[4]/p[2]/text()[1]
			iter = list.Iter(xmlroot)
			cnt = 0
			for iter.Next() {
				ele := iter.Node().String()
				about = append(about, ele)
				// fmt.Printf("About(%d): %s\n", cnt, ele)
				cnt++
			}
			var sep string
			baseAbout := "ABOUT: "
			sep = ""
			for i := 0; i < len(about); i++ {
				baseAbout += sep + about[i]
				sep = "\n"
			}
			baseSkills := "BASESKILLS: "
			sep = ""
			// fmt.Printf("base skills = %+v\n", skills)
			for i := 0; i < len(skills); i++ {
				baseSkills += sep + skills[i]
				sep = " "
			}
			baseReqs := "REQUIREMENTS: "
			sep = ""
			for i := 0; i < len(sSNR); i++ {
				baseReqs += sep + sSNR[i]
				sep = "\n"
			}
			baseDesc := "DESCRIPTION: "
			sep = ""
			for i := 0; i < len(desc); i++ {
				baseDesc += sep + desc[i]
				sep = "\n"
			}
			storage := uTuple.gURL + "\n\n" +
				"DATE: " + time.Now().Format(time.RFC850) + "\n\n" +
				"TITLE: " + html.UnescapeString(title) + "\n\n" +
				"LOCATION: " + html.UnescapeString(location) + "\n\n" +
				html.UnescapeString(baseSkills) + "\n\n" +
				html.UnescapeString(baseAbout) + "\n\n" +
				html.UnescapeString(baseDesc) + "\n\n" + // no second slash
				html.UnescapeString(baseReqs) + "\n"
			fmt.Printf("Storing (len = %d):\n***\n%s\n***\n", len(storage), storage)
			svcS3 := s3.New(session.New(&aws.Config{Region: aws.String("us-east-1")}))
			bucket := "opps"
			key := uTuple.s3Name
			_, err = svcS3.PutObject(&s3.PutObjectInput{
				Body:   strings.NewReader(storage),
				Bucket: &bucket,
				Key:    &key,
			})
			if err != nil {
				fmt.Printf("Failed to upload data to %s/%s, %s\n", bucket, key, err)
				failed = 1
				passed = 0
			}
		}
		// return statusTuple{passed, failed}
		return statusTuple{passed, failed}
		// count URLs
	}).Reduce(func(x statusTuple, y statusTuple) statusTuple {
		fmt.Printf("Red1: x= %v, y = %v\n", x, y)
		return statusTuple{x.pass + y.pass, x.fail + y.fail}
	}).Map(func(x statusTuple) {
		fmt.Printf("Map4 Result: passed = %d, failed = %d\n", x.pass, x.fail)
	}).Run()
}
func processWords(feed Feed) {
	//placeholder
	session, _ := mgo.Dial("localhost")
	feeds := session.DB("wcproc").C("feeds")
	resp, err := http.Get(feed.Link)
	// there's no reason to panic on this
	if err != nil {
		fmt.Printf("Couldn't reach URL: %v \n\n", feed.Link)
		return
	}
	doc, err := html.Parse(resp.Body)
	checkError(err)
	body := cascadia.MustCompile(feed.ArticleId).MatchAll(doc)
	var strBuffer bytes.Buffer
	re := regexp.MustCompile("\\<[^>]*\\>")
	for _, element := range body {
		var buf bytes.Buffer
		html.Render(&buf, element)
		strBuffer.WriteString(" " + re.ReplaceAllString(html.UnescapeString(buf.String()), ""))
		//fmt.Printf("... %v ... \n", re.ReplaceAllString(html.UnescapeString(buf.String()), ""))
	}
	// Note: the result of this FieldsFunc call is discarded; the word split
	// actually used is the strings.Fields call below.
	f := func(c rune) bool {
		return !unicode.IsLetter(c) && unicode.IsNumber(c)
	}
	strings.FieldsFunc(strBuffer.String(), f)
	words := make(map[string]int)
	for _, w := range strings.Fields(strBuffer.String()) {
		words[w]++
	}
	omitWords := []string{"the", "of", "a", "at", "as", "with", "been", "in", "that", "and", "with",
		"from", "more", "been", "we", "not", "by", "he", "who", "were", "so", "just", "also", "his",
		"will", "up", "had", "out", "if", "an", "to", "on", "which", "just", "they", "is", "it",
		"but", "its", "could", "us", "him", "next", "time", "like", "...", "both", "stil", "why",
		"it", "even", "no", "do", "first", "two", "for", "or", "our", "did", "very", "yet", "most",
		"new", "how", "you", "i", "we", "sure", "move", "close", "until", "my", "get", "go", "those",
		"though", "be", "me", "met", "recent", "rest", "end", "put", "seen", "else", "should", "met",
		"center", "over", "would", "much", "lot", "room", "three", "four", "five", "six", "seven",
		"eight", "nine", "ten", "see", "set", "mr", "few", "old", "key", "sent", "tell", "ever",
		"under", "through", "led", "own", "such", "people", "due", "role", "never", "look", "full",
		"try", "was", "said", "this", "are", "their", "when", "can", "now", "after", "than", "some",
		"when", "her", "image", "about", "she", "i", "all", "one", "have", "has", "your", "what",
		"other", "there", "caption", "copyright"}
	//fmt.Printf("OMITTING:")
	for key, value := range words {
		// get rid of words that have these in them
		if !strings.ContainsAny(key, "-<>/_{}=;#&()*%$@1234567890") {
			if !containWords(key, omitWords) {
				// keep these words but trim off these chars
				item := Word{Name: strings.ToLower(strings.Trim(key, ". ,\"")), Count: value}
				feed.Words = append(feed.Words, item)
			} else {
				//fmt.Printf("%v \n", key)
			}
		} else {
			//fmt.Printf("%v \n", key)
		}
	}
	feed.Processed = true
	feeds.Update(bson.M{"_id": feed.Id}, feed)
	session.Close()
}
func get_appinfo_steampowered(appid int, useragent string) (SteamApp, bool) {
	s_appid := strconv.Itoa(appid)
	app := SteamApp{}
	app.Id = appid
	client := &http.Client{}
	req, err := http.NewRequest("GET", "http://store.steampowered.com/app/"+s_appid+"/", nil)
	if err != nil {
		log.Error(err.Error())
		return app, false
	}
	req.Header.Set("User-Agent", useragent)
	resp, err := client.Do(req)
	if err != nil {
		log.Error(err.Error())
		return app, false
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error(err.Error())
		return app, false
	}
	s_body := string(body)
	s_body_nocr := strings.Replace(s_body, "\n", "", -1)
	re_name := regexp.MustCompile("<span itemprop=\"name\">(.+?)</span>")
	re_releasedate := regexp.MustCompile("<span class=\"date\">(.+?)</span>")
	release := re_releasedate.FindStringSubmatch(s_body)
	if release != nil {
		date := strings.Replace(release[1], ",", "", -1)
		date_p := strings.Split(date, " ")
		app.ReleaseDate = release[1]
		app.ReleaseYear = date_p[2]
	} else {
		log.Debug("Unable to parse release date.")
	}
	name := re_name.FindStringSubmatch(s_body)
	if name != nil {
		app.Name = name[1]
	}
	// Parse rating
	re_rating := regexp.MustCompile("(\\d+?\\.*\\d+?)% of the (\\d+,*\\d*?) user reviews for this game")
	re_rating_m := re_rating.FindStringSubmatch(s_body)
	if re_rating_m != nil {
		log.Debug(re_rating_m[0])
		f_rating, _err := strconv.ParseFloat(re_rating_m[1], 32)
		if _err == nil {
			app.Rating = float32(f_rating)
		}
		i_reviews, _err := strconv.Atoi(strings.Replace(re_rating_m[2], ",", "", -1))
		if _err == nil {
			app.Reviews = i_reviews
		}
	}
	re_dev := regexp.MustCompile("\\?developer.+\">(.+?)</a>")
	re_pub := regexp.MustCompile("\\?publisher.+\">(.+?)</a>")
	re_price := regexp.MustCompile("<div class=\"game_purchase_price price\">(.+?)</div>")
	re_price_orig := regexp.MustCompile("<div class=\"discount_original_price\">(.+?)</div>")
	re_price_discount := regexp.MustCompile("<div class=\"discount_final_price\">(.+?)</div>")
	price := re_price.FindStringSubmatch(s_body_nocr)
	price_orig := re_price_orig.FindStringSubmatch(s_body)
	price_discount := re_price_discount.FindStringSubmatch(s_body)
	if price != nil {
		app.Price = strings.TrimSpace(price[1])
	}
	if price_orig != nil {
		app.Price = strings.TrimSpace(price_orig[1])
	}
	if price_discount != nil {
		app.PriceDiscount = strings.TrimSpace(price_discount[1])
	}
	dev := re_dev.FindStringSubmatch(s_body)
	if dev != nil {
		app.Developer = html.UnescapeString(dev[1])
	}
	pub := re_pub.FindStringSubmatch(s_body)
	if pub != nil {
		app.Publisher = html.UnescapeString(pub[1])
	}
	// OS
	app.Linux = strings.Contains(s_body, "platform_img linux")
	app.Windows = strings.Contains(s_body, "platform_img win")
	app.OSX = strings.Contains(s_body, "platform_img mac")
	// Features
	app.SteamCloud = strings.Contains(s_body, ">Steam Cloud</a>")
	app.SinglePlayer = strings.Contains(s_body, ">Single-player</a>")
	app.MultiPlayer = strings.Contains(s_body, ">Multi-player</a>")
	app.Coop = strings.Contains(s_body, ">Local Co-op</a>")
	app.MMO = strings.Contains(s_body, ">MMO</a>")
	app.VAC = strings.Contains(s_body, ">Valve Anti-Cheat enabled</a>")
	app.EarlyAccess = strings.Contains(s_body, "<h1 class=\"inset\">Early Access Game</h1>")
	app.TradingCards = strings.Contains(s_body, ">Steam Trading Cards</a>")
	app.Achievements = strings.Contains(s_body, ">Steam Achievements</a>")
	app.Workshop = strings.Contains(s_body, ">Steam Workshop</a>")
	return app, true
}
func get_appinfo_steamdb(appid int, useragent string) (SteamApp, bool) {
	s_appid := strconv.Itoa(appid)
	app := SteamApp{}
	app.Id = appid
	client := &http.Client{}
	req, err := http.NewRequest("GET", "https://steamdb.info/app/"+s_appid+"/info/", nil)
	if err != nil {
		log.Error(err.Error())
		return app, false
	}
	req.Header.Set("User-Agent", useragent)
	resp, err := client.Do(req)
	if err != nil {
		log.Error(err.Error())
		return app, false
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error(err.Error())
		return app, false
	}
	s_body := string(body)
	s_body = strings.Replace(s_body, "\n", "", -1)
	re := regexp.MustCompile("<table class=\"table table-bordered table-hover table-dark\">(.+?)</table>")
	match := re.FindStringSubmatch(s_body)
	if match == nil {
		log.Debug("Unable to find table.")
		return app, false
	}
	//fmt.Println(match[1])
	table := match[1]
	// Parse release date
	re_releasedate := regexp.MustCompile("Release Date</td><td>(.+?)<i")
	re_inner := regexp.MustCompile("<.*?>(.+?)<")
	re_cells := regexp.MustCompile("<td.*?>(.+?)</td>")
	cells := re_cells.FindAllStringSubmatch(table, -1)
	release := re_releasedate.FindStringSubmatch(s_body)
	if release != nil {
		date := strings.Replace(release[1], ",", "", -1)
		date_p := strings.Split(date, " ")
		app.ReleaseDate = release[1]
		app.ReleaseYear = date_p[2]
	} else {
		log.Debug("Unable to parse release date.")
	}
	// Parse rating
	re_rating := regexp.MustCompile("(\\d+?\\.*\\d+?)% of the (\\d+,*\\d*?) user reviews for this game")
	re_rating_m := re_rating.FindStringSubmatch(s_body)
	if re_rating_m != nil {
		log.Debug(re_rating_m[0])
		f_rating, _err := strconv.ParseFloat(re_rating_m[1], 32)
		if _err == nil {
			app.Rating = float32(f_rating)
		}
		i_reviews, _err := strconv.Atoi(strings.Replace(re_rating_m[2], ",", "", -1))
		if _err == nil {
			app.Reviews = i_reviews
		}
	}
	for i, cell := range cells {
		content := ""
		if i != len(cells)-1 {
			content = cells[i+1][1]
			content = strings.Replace(content, "®", "", -1)
			content = strings.TrimSpace(content)
		}
		if strings.Contains(cell[1], "App Type") {
			app.AppType = content
		}
		if strings.Contains(cell[1], "Name") && !strings.Contains(cell[1], "Store") { // discard "Store Name"
			app.Name = html.UnescapeString(content)
		}
		if strings.Contains(cell[1], "Developer") {
			dev := re_inner.FindStringSubmatch(content)
			if dev != nil {
				app.Developer = strings.TrimSpace(html.UnescapeString(dev[1]))
			}
		}
		if strings.Contains(cell[1], "Publisher") {
			publisher := re_inner.FindStringSubmatch(content)
			if publisher != nil {
				app.Publisher = strings.TrimSpace(html.UnescapeString(publisher[1]))
			}
		}
	}
	// OS
	app.Linux = strings.Contains(table, "icon-linux")
	app.Windows = strings.Contains(table, "icon-windows")
	app.OSX = strings.Contains(table, "icon-macos")
	// Features
	app.SteamCloud = strings.Contains(s_body, "aria-label=\"Steam Cloud\"")
	app.SinglePlayer = strings.Contains(s_body, "aria-label=\"Single-player\"")
	app.MultiPlayer = strings.Contains(s_body, "aria-label=\"Multi-player\"")
	app.Coop = strings.Contains(s_body, "aria-label=\"Co-op\"")
	app.MMO = strings.Contains(s_body, "aria-label=\"MMO\"")
	app.VAC = strings.Contains(s_body, "aria-label=\"Valve Anti-Cheat enabled\"")
	app.EarlyAccess = strings.Contains(s_body, "aria-label=\"Early Access\"")
	app.TradingCards = strings.Contains(s_body, "aria-label=\"Steam Trading Cards\"")
	app.Achievements = strings.Contains(s_body, "aria-label=\"Steam Achievements\"")
	app.Workshop = strings.Contains(s_body, "aria-label=\"Steam Workshop\"")
	log.Debug("Done collecting info.")
	return app, true
}
func parseVenue(venuestr string) string {
	m := venueRegxp.FindStringSubmatch(venuestr)
	if len(m) < 3 {
		// no match: avoid indexing a nil slice
		return ""
	}
	return strings.TrimSpace(html.UnescapeString(m[2]))
}
func parseFix(f *Feed, ss []*Story) (*Feed, []*Story, error) {
	f.Checked = time.Now()
	f.Link = strings.TrimSpace(f.Link)
	f.Title = html.UnescapeString(strings.TrimSpace(f.Title))
	if u, err := url.Parse(f.URL); err == nil {
		if ul, err := u.Parse(f.Link); err == nil {
			f.Link = ul.String()
		}
	}
	base, err := url.Parse(f.Link)
	if err != nil {
		logrus.Infof("unable to parse link: %v", f.Link)
	}
	for _, s := range ss {
		s.Created = f.Checked
		s.Link = strings.TrimSpace(s.Link)
		if !s.Updated.IsZero() && s.Published.IsZero() {
			s.Published = s.Updated
		}
		if s.Published.IsZero() || f.Checked.Before(s.Published) {
			s.Published = f.Checked
		}
		if !s.Updated.IsZero() {
			s.Date = s.Updated.Unix()
		} else {
			s.Date = s.Published.Unix()
		}
		if s.ID == "" {
			if s.Link != "" {
				s.ID = s.Link
			} else if s.Title != "" {
				s.ID = s.Title
			} else {
				logrus.Infof("feed: story has no id: %v", s)
				return nil, nil, fmt.Errorf("story has no id: %v", s)
			}
		}
		s.Title = fullyHTMLUnescape(s.Title)
		// if a story doesn't have a link, see if its id is a URL
		if s.Link == "" {
			if u, err := url.Parse(s.ID); err == nil {
				s.Link = u.String()
			}
		}
		if base != nil && s.Link != "" {
			link, err := base.Parse(s.Link)
			if err == nil {
				s.Link = link.String()
			} else {
				logrus.Infof("feed: unable to resolve link: %s: %v", err, s.Link)
			}
		}
		if _, serr := url.Parse(s.Link); serr != nil {
			s.Link = ""
		}
		// Most mail readers disallow IFRAMEs in mail content. This breaks
		// embedding of things like YouTube videos. By changing them to anchor
		// tags, things like Gmail will do their own embedding when reading the
		// mail.
		//
		// The following ends up parsing each of the feed items at least 3 times,
		// which seems excessive - but meh.
		s.Content, err = cleanFeedContent(s.Content)
		if err != nil {
			logrus.Errorf("feed: error cleaning up content: %s", err)
		}
		p := bluemonday.UGCPolicy()
		s.Content = fullyHTMLUnescape(p.Sanitize(s.Content))
		s.Content, err = rewriteFeedContent(s.Content)
		if err != nil {
			logrus.Errorf("feed: error cleaning up content: %s", err)
		}
	}
	return f, ss, nil
}
func filter(x string) string {
	return strings.Replace(html.UnescapeString(x), "\u200B", "", -1)
}
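// A brief note on filter: because it unescapes entities before stripping,
// zero-width spaces written as numeric entities are removed too. For example:
//
//	filter("foo&#8203;bar") // -> "foobar" ("&#8203;" decodes to U+200B, which is then stripped)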