Example #1
// goodAsText applies heuristics to make the data look good when displayed
// as plain text. If the data is escaped HTML, the markup is stripped; if
// the data contains an HTML image tag, goodAsText returns the image's alt
// text instead. If nothing good is found, an empty slice is returned; if
// the data cannot be parsed as HTML at all, it is returned unchanged.
func goodAsText(d []byte) []byte {
	unesc := html.UnescapeString(string(d))
	nodes, err := html.ParseFragment(strings.NewReader(unesc), bodyNode)
	if err != nil {
		log.Printf("failed to parse [%s] as HTML: %v", unesc, err)
		return d
	}

	var buf bytes.Buffer
	for _, root := range nodes {
		walk(root, func(n *html.Node) {
			if n.Type == html.TextNode {
				buf.WriteString(strings.TrimSpace(n.Data))
				return
			}

			// Separate chunks with a single space before appending more text.
			if n := buf.Len(); n > 0 && buf.Bytes()[n-1] != ' ' {
				buf.WriteString(" ")
			}
			if n.DataAtom == atom.Img {
				if alt := altTextOrEmpty(n); alt != "" {
					buf.WriteString(alt)
				}
			}
		})
	}
	return buf.Bytes()
}
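
The walk helper used above is not shown in the example (bodyNode and altTextOrEmpty are likewise project helpers). A minimal sketch of what walk presumably looks like: a plain depth-first traversal that calls the visitor on each node.

// walk is an assumed helper, not part of the original example: it visits
// n and all of its descendants in depth-first order, calling f on each.
func walk(n *html.Node, f func(*html.Node)) {
	f(n)
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		walk(c, f)
	}
}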
Example #2
// Data returns the entry contents as raw bytes. XHTML content is literal
// markup and is used as-is; all other content types arrive escaped and
// must be unescaped first.
func (c atomContent) Data() []byte {
	unesc := c.Contents
	if c.Type != "xhtml" {
		unesc = []byte(html.UnescapeString(string(c.Contents)))
	}
	return unesc
}
Example #3
File: scrap.go Project: husio/apps
// pageTitle extracts the first title match from body and returns it with
// HTML entities unescaped, or "" if no title is found.
func pageTitle(body []byte) string {
	match := matchTitle(body, 1)
	if len(match) == 0 {
		return ""
	}
	return html.UnescapeString(string(match[0][1]))
}
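
matchTitle is a project helper that is not shown; presumably it matches the document's <title> element and returns up to n submatch groups. A hypothetical sketch, assuming a simple regular expression:

// titleRe and this matchTitle are assumptions about the missing helper;
// the real project may locate titles differently.
var titleRe = regexp.MustCompile(`(?is)<title[^>]*>(.*?)</title>`)

func matchTitle(body []byte, n int) [][][]byte {
	return titleRe.FindAllSubmatch(body, n)
}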
Example #4
// fullyHTMLUnescape tries (up to 10 times) to unescape a string.
// Some feeds are double escaped, e.g. &amp;amp; where &amp; was meant.
func fullyHTMLUnescape(orig string) string {
	mod := orig
	for i := 0; i < 10; i++ {
		mod = html.UnescapeString(orig)
		if orig == mod {
			return mod
		}
		orig = mod
	}
	return mod
}
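
A quick usage sketch showing why the loop is needed; the input below is escaped twice:

func main() {
	// "&amp;amp;" unescapes to "&amp;" on the first pass and to "&" on
	// the second; the loop stops once a pass changes nothing.
	fmt.Println(fullyHTMLUnescape("&amp;amp;")) // prints: &
}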
Example #5
// AllText joins the text of all descendant text nodes matching pat with
// single spaces, unescapes the result, and returns nil if it is empty.
func (n *Node) AllText(pat ...string) *string {
	ss := []string{}
	for _, n := range n.Descendants(TextNode).All() {
		if text := n.Text(pat...); text != nil && *text != "" {
			ss = append(ss, *text)
		}
	}
	s := html.UnescapeString(strings.Join(ss, " "))
	if s != "" {
		return &s
	}
	return nil
}
Example #6
// SnipText collapses whitespace, unescapes HTML entities, and truncates s
// to at most length bytes, preferring to cut at a word or sentence boundary.
func SnipText(s string, length int) string {
	s = snipRe.ReplaceAllString(strings.TrimSpace(s), " ")
	s = html.UnescapeString(s)
	if len(s) <= length {
		return s
	}
	s = s[:length]
	i := strings.LastIndexAny(s, " .-!?")
	if i != -1 {
		return s[:i]
	}
	return CleanNonUTF8(s)
}
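
Note that length counts bytes, not runes, so multi-byte UTF-8 input can be cut mid-character, which is presumably why the fallback path runs CleanNonUTF8. A usage sketch, assuming snipRe matches runs of whitespace (e.g. regexp.MustCompile(`\s+`)):

func main() {
	// The 15-byte prefix is "Hello & goodbye"; the cut is then moved
	// back to the last space within it.
	fmt.Println(SnipText("Hello &amp; goodbye, world", 15)) // prints: Hello &
}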
Example #7
func cleanNode(c *Config, n *html.Node) *html.Node {
	allowedAttr, ok1 := c.elem[n.DataAtom]
	customAttr, ok2 := c.elemCustom[n.Data]
	if ok1 || ok2 {
		cleanChildren(c, n)

		haveSrc := false

		attrs := n.Attr
		n.Attr = make([]html.Attribute, 0, len(attrs))
		for _, attr := range attrs {
			a := atom.Lookup([]byte(attr.Key))

			re1, ok1 := allowedAttr[a]
			re2, ok2 := customAttr[attr.Key]
			_, ok3 := c.attr[a]
			_, ok4 := c.attrCustom[attr.Key]

			if attr.Namespace != "" || (!ok1 && !ok2 && !ok3 && !ok4) {
				continue
			}

			if !cleanURL(c, a, &attr) {
				continue
			}

			if re1 != nil && !re1.MatchString(attr.Val) {
				continue
			}
			if re2 != nil && !re2.MatchString(attr.Val) {
				continue
			}

			haveSrc = haveSrc || a == atom.Src

			n.Attr = append(n.Attr, attr)
		}

		if n.DataAtom == atom.Img && !haveSrc {
			// replace it with an empty text node
			return &html.Node{Type: html.TextNode}
		}

		return n
	}
	// Disallowed element: render it and keep only its unescaped text.
	return text(html.UnescapeString(Render(n)))
}
Example #8
// Transform copies src to dst, decoding HTML character entities as it goes.
func (entityDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for nSrc < len(src) && nDst < len(dst) {
		if c := src[nSrc]; c != '&' {
			dst[nDst] = c
			nSrc++
			nDst++
			continue
		}

		// Try to decode a character entity.
		entityLen := 1
		for entityLen < 32 {
			if nSrc+entityLen == len(src) {
				if atEOF {
					break
				} else {
					err = transform.ErrShortSrc
					return
				}
			}
			if b := src[nSrc+entityLen]; 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || '0' <= b && b <= '9' || entityLen == 1 && b == '#' || b == ';' {
				entityLen++
				if b == ';' {
					break
				}
			} else {
				break
			}
		}

		e := string(src[nSrc : nSrc+entityLen])
		decoded := html.UnescapeString(e)
		n := copy(dst[nDst:], decoded)
		if n < len(decoded) {
			err = transform.ErrShortDst
			return
		}
		nSrc += entityLen
		nDst += len(decoded)
	}

	if nSrc < len(src) && err == nil {
		err = transform.ErrShortDst
	}
	return
}
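
A usage sketch, assuming entityDecoder also satisfies transform.Transformer's Reset requirement elsewhere in the package (for example by embedding transform.NopResetter):

// decodeEntities runs the entity decoder over a whole string at once.
func decodeEntities(s string) (string, error) {
	out, _, err := transform.String(entityDecoder{}, s)
	return out, err
}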
Example #9
func GetTrending(useragent string) ([]Trending, bool) {
	games := []Trending{}

	client := &http.Client{}
	req, err := http.NewRequest("GET", "http://steamcharts.com/", nil)
	if err != nil {
		log.Error(err.Error())
		return games, false
	}
	req.Header.Set("User-Agent", useragent)
	resp, err := client.Do(req)
	if err != nil {
		log.Error(err.Error())
		return games, false
	}

	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error(err.Error())
		return games, false
	}
	s_body := string(body)
	s_body = strings.Replace(s_body, "\n", "", -1)

	re := regexp.MustCompile("td class=\"game-name left\">.+?href=\"/app/(\\d+?)\">(.+?)</a>.+?\"gain\">(.+?)</td>.+? class=\"num\">(.+?)</td>")
	matches := re.FindAllStringSubmatch(s_body, -1)
	if matches != nil {
		for _, match := range matches {
			log.Debugf("Found match: %s, %s, %s, %s", match[1], match[2], match[3], match[4])
			app_s := match[1]
			name := strings.TrimSpace(match[2])
			gain := html.UnescapeString(match[3])
			num_s := strings.Replace(match[4], ",", "", -1)

			app, _ := strconv.Atoi(app_s)
			num, _ := strconv.Atoi(num_s)

			games = append(games, Trending{app, name, gain, num})
		}
		return games, true
	}
	return games, false
}
Example #10
func parseRdf(u string, b []byte) (*Feed, []*Story, error) {
	rd := rdf.RDF{}

	d := xml.NewDecoder(bytes.NewReader(b))
	d.CharsetReader = charset.NewReaderLabel
	d.Strict = false
	d.Entity = xml.HTMLEntity
	err := d.Decode(&rd)
	if err != nil {
		return nil, nil, err
	}

	f := Feed{
		URL: u,
	}
	s := []*Story{}

	if rd.Channel != nil {
		f.Title = rd.Channel.Title
		f.Link = rd.Channel.Link
		if t, err := parseDate(rd.Channel.Date); err == nil {
			f.Updated = t
		}
	}

	for _, i := range rd.Item {
		st := Story{
			ID:     i.About,
			Title:  i.Title,
			Link:   i.Link,
			Author: i.Creator,
			Feed:   &f,
		}
		st.Content = html.UnescapeString(i.Description)
		if t, err := parseDate(i.Date); err == nil {
			st.Published = t
			st.Updated = t
		}
		s = append(s, &st)
	}

	return parseFix(&f, s)
}
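
The detail that makes this work on real-world feeds is the decoder setup: Strict = false tolerates minor XML violations, and Entity = xml.HTMLEntity accepts HTML named entities (such as &nbsp;) that plain XML does not define. A minimal sketch isolating just that setup:

// decodeLenient decodes XML the way parseRdf does, minus the charset
// handling; the helper name is illustrative, not from the original.
func decodeLenient(b []byte, v interface{}) error {
	d := xml.NewDecoder(bytes.NewReader(b))
	d.Strict = false
	d.Entity = xml.HTMLEntity
	return d.Decode(v)
}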
Example #11
File: atom.go Project: lufia/news
func (t Text) HTML() (s string, err error) {
	switch t.Type {
	case "html":
		u := html.UnescapeString(t.Content)
		s = fmt.Sprintf("<div>%s</div>", u)
	case "xhtml":
		r := strings.NewReader(t.Content)
		tokenizer := html.NewTokenizer(r)
		err = nextToken(tokenizer)
		if err != nil {
			return
		}
		s, err = buildHTML(tokenizer)
	case "text":
		s = fmt.Sprintf("<pre>%s</pre>", t.Content)
	default:
		s = fmt.Sprintf("<pre>%s</pre>", t.Content)
	}
	return
}
Example #12
func getGroupKeyword(n *html.Node) string {
	s, _ := selector.Selector("#news_detail .icon_box .icon-name")
	nodes := s.Find(n)
	if len(nodes) > 0 {
		s := strings.TrimSpace(
			html.UnescapeString(
				extractNodeString(nodes[0]),
			),
		)
		// we don't need the year suffix.
		if strings.HasPrefix(s, "モーニング娘。") {
			return "モーニング娘。"
		}
		// TODO: any other imports than ハロコン?
		if s == "HELLO! PROJECT" {
			return "ハロコン"
		}
		return s
	}
	return ""
}
Example #13
func (info *HTMLInfo) parseBody(n *html.Node) {
	if !info.AllowMainContentExtraction {
		return
	}

	buf := new(bytes.Buffer)
	err := html.Render(buf, n)
	if err != nil {
		return
	}
	bufStr := buf.String()
	doc, err := readability.NewDocument(bufStr)
	if err != nil {
		return
	}

	doc.WhitelistTags = []string{"div", "p", "img"}
	doc.WhitelistAttrs["img"] = []string{"src", "title", "alt"}

	content := doc.Content()
	content = html.UnescapeString(content)

	info.MainContent = strings.Trim(content, "\r\n\t ")
}
Example #14
func main() {
	flag.Parse()
	//	panic("Just Quit")
	getHostConfig()
	//	runtime.GOMAXPROCS(2)
	timeout = 1000
	fmt.Println("Feeds")
	//http://careers.stackoverflow.com/jobs/feed?searchTerm=big+data&location=san+francisco&range=100&distanceUnits=Miles
	//	feeds = append(feeds, Feed{index: 0, url: "http://careers.stackoverflow.com/jobs/feed?searchTerm=big+data&location=san+francisco&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false })

	feeds = append(feeds, Feed{index: 0, url: "http://careers.stackoverflow.com/jobs/feed?location=san+francisco%2c+ca&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 1, url: "http://careers.stackoverflow.com/jobs/feed?location=new+york+city%2c+ny&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 2, url: "http://careers.stackoverflow.com/jobs/feed?location=los+angeles%2c+ca&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 3, url: "http://careers.stackoverflow.com/jobs/feed?location=boston%2c+ma&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 4, url: "http://careers.stackoverflow.com/jobs/feed?location=seattle%2cwa&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 5, url: "http://careers.stackoverflow.com/jobs/feed?location=austin%2ctx&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 6, url: "http://careers.stackoverflow.com/jobs/feed?location=chicago%2cil&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	mutex = &sync.Mutex{}
	skillMap = make(map[string]int, 200)
	loadSkillMapFile(skillMap)
	fmt.Println("GetRSS")
	getRSS2()
	saveSkillMapFile(skillMap)
	if conf.hbaseZkURL != "" {
		saveSkillsMapHBase(skillMap)
	}

	for i := 0; i < len(guidList); i++ {
		fmt.Println(guidList[i])
	}

	//	guidList := make([]string, 4)
	//	guidList[0] = "http://careers.stackoverflow.com/jobs/103310/senior-software-engineer-american-society-of-clinical"
	//	guidList[1] = "http://careers.stackoverflow.com/jobs/94152/senior-software-engineer-platform-flixster"
	//	guidList[2] = "http://careers.stackoverflow.com/jobs/103328/senior-full-stack-engineer-data-science-adroll"
	//	guidList[3] = "http://careers.stackoverflow.com/jobs/104086/enterprise-architect-new-relic"
	//	fmt.Printf("%v\n", s)

	// map random times & make s3names
	fw.Slice(guidList).Map(func(sURL string) URLTuple {
		fmt.Printf("Map1: %v\n", sURL)
		fName := "jobs_sof/" + strings.Replace(strings.TrimPrefix(sURL, "http://careers.stackoverflow.com/jobs/"), "/", "_", -1)
		ms := rand.Intn(3000)
		return URLTuple{sURL, fName, ms}
		//	Filter already-acquired URLs
	}).Filter(func(uTuple URLTuple) bool {
		// is file already stored in S3?
		//fmt.Printf("Filter:%s, %v\n", uTuple.s3Name, uTuple)
		svcS3 := s3.New(session.New(&aws.Config{Region: aws.String("us-east-1")}))
		var params *s3.HeadObjectInput

		params = &s3.HeadObjectInput{
			Bucket: aws.String("opps"),        // Required
			Key:    aws.String(uTuple.s3Name), // Required
		}
		hobj, _ := svcS3.HeadObject(params)

		fmt.Printf("Filter: %s => %v\n", uTuple.s3Name, hobj.ContentLength == nil)
		return hobj.ContentLength == nil
		//	get the URLs
	}).Map(func(uTuple URLTuple) statusTuple {
		fmt.Printf("Map3: %v\n", uTuple)
		// random sleep
		time.Sleep(time.Duration(uTuple.msWait) * time.Millisecond)

		// get URL
		resp, err := http.Get(uTuple.gURL)
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()

		//		fmt.Println("Body:", resp.Body)
		//		fmt.Println("Proto:", resp.Proto)
		//		fmt.Printf("response Status = <%s> / Length = %d\n", resp.Status, resp.ContentLength)
		//		fmt.Println("response Headers:", resp.Header)
		//		fmt.Printf("response %+v:\n", resp)
		//		fmt.Println("response Body:", string(body))
		failed := 0
		passed := 0
		if resp.StatusCode == 200 {
			passed = 1
		} else {
			failed = 1
		}
		// store in S3
		if passed == 1 {
			body, _ := ioutil.ReadAll(resp.Body)
			reader := strings.NewReader(string(body))
			root, err := html.Parse(reader)

			if err != nil {
				fmt.Printf("%+v\n", err)
			}

			var b bytes.Buffer
			html.Render(&b, root)
			fixedHtml := b.String()

			isOk := func(r rune) bool {
				return r < 32 || r >= 127
			}
			// The isOk filter is such that there is no need to chain to norm.NFC
			t2 := transform.Chain(norm.NFKD, transform.RemoveFunc(isOk))
			// This Transformer could also trivially be applied as an io.Reader
			// or io.Writer filter to automatically do such filtering when reading
			// or writing data anywhere.
			fixedUnicodeNFKD, _, _ := transform.String(t2, fixedHtml)

			//			fmt.Println("\n\n\n"+fixedUnicodeNFKD)
			reader = strings.NewReader(fixedUnicodeNFKD)

			xmlroot, xmlerr := xmlpath.ParseHTML(reader)
			if xmlerr != nil {
				log.Fatal(xmlerr)
			}
			//	fmt.Printf("xml root = %+v\n------\n", xmlroot)
			path := &xmlpath.Path{}
			pstr := string("")

			pstr = `/html/head/title`
			path = xmlpath.MustCompile(pstr)
			var ok bool

			title := ""
			if title, ok = path.String(xmlroot); ok {
				//		fmt.Printf("%s: %s\n", pstr, title)
			}
			fmt.Printf("**** Title: %s\n", title)
			var iter *xmlpath.Iter
			var list *xmlpath.Path
			var cnt int

			// Location - needs Trim
			pstr = `//*[@id="hed"]/ul[1]/li/text()`
			path = xmlpath.MustCompile(pstr)
			location := ""
			if location, ok = path.String(xmlroot); ok {
				//		fmt.Printf("Location - %s: %s\n", pstr, strings.Trim(location, " \n"))
				location = strings.Trim(location, " \n")
			}

			// Base Skills - LOOP from 1 until not ok
			var skills []string

			list = xmlpath.MustCompile(`//*[@id="hed"]/div[2]/p/a`)
			iter = list.Iter(xmlroot)
			for iter.Next() {
				ele := iter.Node().String()
				skills = append(skills, ele)
				//		fmt.Printf("Sk-Desc: %s\n", ele)
			}

			var desc []string
			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[2]/p`)
			iter = list.Iter(xmlroot)
			for iter.Next() {
				ele := iter.Node().String()
				desc = append(desc, ele)
				//		fmt.Printf("it-Desc1: %s\n", ele)
			}

			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[2]/ul/li`)
			iter = list.Iter(xmlroot)
			for iter.Next() {
				ele := iter.Node().String()
				desc = append(desc, ele)
				//		fmt.Printf("it-Desc2: %s\n", ele)
			}

			var sSNR []string
			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[3]/p`)
			iter = list.Iter(xmlroot)
			cnt = 0
			for iter.Next() {
				ele := iter.Node().String()
				sSNR = append(sSNR, ele)
				//		fmt.Printf("Skills1 (%d): %s\n", cnt, ele)
				cnt++
			}

			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[3]/ul/li/text()`)
			iter = list.Iter(xmlroot)
			cnt = 0
			for iter.Next() {
				ele := iter.Node().String()
				sSNR = append(sSNR, ele)
				//		fmt.Printf("Skills2(%d): %s\n", cnt, ele)
				cnt++
			}

			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[3]/ul/li/ul/li/text()`)
			iter = list.Iter(xmlroot)
			cnt = 0
			for iter.Next() {
				ele := iter.Node().String()
				sSNR = append(sSNR, ele)
				//		fmt.Printf("Skills3(%d): %s\n", cnt, ele)
				cnt++
			}
			//
			//    // about company -
			//	pstr = `//*[@id="jobdetailpage"]/div[2]/div[1]/div[4]/p/text()`
			//	//*[@id="jobdetailpage"]/div[2]/div[1]/div[2]/p[2]/text()[1]
			//	path = xmlpath.MustCompile(pstr)
			//	about := ""
			//	if about, ok = path.String(xmlroot); ok {
			//		fmt.Printf("About: %s - %s\n", pstr, about)
			//	}

			var about []string
			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[4]/p`)
			//*[@id="jobdetailpage"]/div[2]/div[1]/div[4]/p[2]/text()[1]
			iter = list.Iter(xmlroot)
			cnt = 0
			for iter.Next() {
				ele := iter.Node().String()
				about = append(about, ele)
				//		fmt.Printf("About(%d): %s\n", cnt, ele)
				cnt++
			}

			var sep string

			baseAbout := "ABOUT: "
			sep = ""
			for i := 0; i < len(about); i++ {
				baseAbout += sep + about[i]
				sep = "\n"
			}

			baseSkills := "BASESKILLS: "
			sep = ""
			//	fmt.Printf("base skills = %+v\n", skills)
			for i := 0; i < len(skills); i++ {
				baseSkills += sep + skills[i]
				sep = " "
			}

			baseReqs := "REQUIREMENTS: "
			sep = ""
			for i := 0; i < len(sSNR); i++ {
				baseReqs += sep + sSNR[i]
				sep = "\n"
			}

			baseDesc := "DESCRIPTION: "
			sep = ""
			for i := 0; i < len(desc); i++ {
				baseDesc += sep + desc[i]
				sep = "\n"
			}

			var storage string
			storage =
				uTuple.gURL + "\n\n" +
					"DATE: " + time.Now().Format(time.RFC850) + "\n\n" +
					"TITLE: " + html.UnescapeString(title) + "\n\n" +
					"LOCATION: " + html.UnescapeString(location) + "\n\n" +
					html.UnescapeString(baseSkills) + "\n\n" +
					html.UnescapeString(baseAbout) + "\n\n" +
					html.UnescapeString(baseDesc) + "\n\n" + // no second slash
					html.UnescapeString(baseReqs) + "\n"

			fmt.Printf("Storing (len = %d):\n***\n%s\n***\n", len(storage), storage)

			svcS3 := s3.New(session.New(&aws.Config{Region: aws.String("us-east-1")}))
			bucket := "opps"
			key := uTuple.s3Name
			_, err = svcS3.PutObject(&s3.PutObjectInput{
				Body:   strings.NewReader(string(storage)),
				Bucket: &bucket,
				Key:    &key,
			})
			if err != nil {
				fmt.Printf("Failed to upload data to %s/%s, %s\n", bucket, key, err)
				failed = 1
				passed = 0
			}
		}
		//		return statusTuple{passed, failed}
		return statusTuple{passed, failed}
		// count URLs
	}).Reduce(func(x statusTuple, y statusTuple) statusTuple {
		fmt.Printf("Red1: x= %v, y = %v\n", x, y)
		return statusTuple{x.pass + y.pass, x.fail + y.fail}
	}).Map(func(x statusTuple) {
		fmt.Printf("Map4 Result: passed = %d, failed = %d\n", x.pass, x.fail)
	}).Run()

}
Example #15
func processWords(feed Feed) {

	//placeholder
	session, _ := mgo.Dial("localhost")
	feeds := session.DB("wcproc").C("feeds")

	resp, err := http.Get(feed.Link)
	//there's no reason to panic on this
	if err != nil {
		fmt.Printf("Couldn't reach URL: %v \n\n", feed.Link)
		return
	}

	doc, err := html.Parse(resp.Body)

	checkError(err)

	body := cascadia.MustCompile(feed.ArticleId).MatchAll(doc)

	var strBuffer bytes.Buffer
	re := regexp.MustCompile("\\<[^>]*\\>")

	for _, element := range body {
		var buf bytes.Buffer
		html.Render(&buf, element)

		strBuffer.WriteString(" " + re.ReplaceAllString(html.UnescapeString(buf.String()), ""))
		//fmt.Printf("... %v ... \n", re.ReplaceAllString(html.UnescapeString(buf.String()), ""))
	}

	// Split on anything that is not a letter or a number.
	f := func(c rune) bool {
		return !unicode.IsLetter(c) && !unicode.IsNumber(c)
	}

	words := make(map[string]int)

	for _, w := range strings.FieldsFunc(strBuffer.String(), f) {
		words[w]++
	}

	omitWords := []string{"the", "of", "a", "at", "as", "with", "been", "in", "that", "and", "with", "from", "more", "been", "we", "not", "by", "he", "who", "were",
		"so", "just", "also", "his", "will", "up", "had", "out", "if", "an", "to", "on", "which", "just", "they", "is", "it", "but", "its", "could", "us",
		"him", "next", "time", "like", "...", "both", "stil", "why", "it", "even", "no", "do", "first", "two", "for", "or", "our", "did", "very", "yet",
		"most", "new", "how", "you", "i", "we", "sure", "move", "close", "until", "my", "get", "go", "those", "though", "be", "me", "met", "recent",
		"rest", "end", "put", "seen", "else", "should", "met", "center", "over", "would", "much", "lot", "room", "three", "four", "five", "six", "seven",
		"eight", "nine", "ten", "see", "set", "mr", "few", "old", "key", "sent", "tell", "ever", "under", "through", "led", "own", "such", "people",
		"due", "role", "never", "look", "full", "try", "was", "said", "this", "are", "their", "when", "can", "now", "after", "than", "some", "when",
		"her", "image", "about", "she", "i", "all", "one", "have", "has", "your", "what", "other", "there", "caption", "copyright"}

	//fmt.Printf("OMITTING:")
	for key, value := range words {
		//get rid of words that have these in them
		if !strings.ContainsAny(key, "-<>/_{}=;#&()*%$@1234567890") {
			if !containWords(key, omitWords) {

				//keep these words but trim off these chars
				item := Word{Name: strings.ToLower(strings.Trim(key, ". ,\"")), Count: value}
				feed.Words = append(feed.Words, item)
			} else {
				//fmt.Printf("%v \n", key)
			}
		} else {
			//fmt.Printf("%v \n", key)
		}
	}

	feed.Processed = true
	feeds.Update(bson.M{"_id": feed.Id}, feed)
	session.Close()

}
Example #16
func get_appinfo_steampowered(appid int, useragent string) (SteamApp, bool) {
	s_appid := strconv.Itoa(appid)
	app := SteamApp{}
	app.Id = appid

	client := &http.Client{}
	req, err := http.NewRequest("GET", "http://store.steampowered.com/app/"+s_appid+"/", nil)
	if err != nil {
		log.Error(err.Error())
		return app, false
	}
	req.Header.Set("User-Agent", useragent)
	resp, err := client.Do(req)
	if err != nil {
		log.Error(err.Error())
		return app, false
	}

	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error(err.Error())
		return app, false
	}
	s_body := string(body)
	s_body_nocr := strings.Replace(s_body, "\n", "", -1)

	re_name := regexp.MustCompile("<span itemprop=\"name\">(.+?)</span>")
	re_releasedate := regexp.MustCompile("<span class=\"date\">(.+?)</span>")
	release := re_releasedate.FindStringSubmatch(s_body)
	if release != nil {
		date := strings.Replace(release[1], ",", "", -1)
		date_p := strings.Split(date, " ")
		app.ReleaseDate = release[1]
		app.ReleaseYear = date_p[2]
	} else {
		log.Debug("Unable to parse release date.")
	}

	name := re_name.FindStringSubmatch(s_body)
	if name != nil {
		app.Name = name[1]
	}

	// Parse rating
	re_rating := regexp.MustCompile("(\\d+?\\.*\\d+?)% of the (\\d+,*\\d*?) user reviews for this game")
	re_rating_m := re_rating.FindStringSubmatch(s_body)
	if re_rating_m != nil {
		log.Debug(re_rating_m[0])
		f_rating, _err := strconv.ParseFloat(re_rating_m[1], 32)
		if _err == nil {
			app.Rating = float32(f_rating)
		}
		i_reviews, _err := strconv.Atoi(strings.Replace(re_rating_m[2], ",", "", -1))
		if _err == nil {
			app.Reviews = i_reviews
		}
	}
	re_dev := regexp.MustCompile("\\?developer.+\">(.+?)</a>")
	re_pub := regexp.MustCompile("\\?publisher.+\">(.+?)</a>")
	re_price := regexp.MustCompile("<div class=\"game_purchase_price price\">(.+?)</div>")
	re_price_orig := regexp.MustCompile("<div class=\"discount_original_price\">(.+?)</div>")
	re_price_discount := regexp.MustCompile("<div class=\"discount_final_price\">(.+?)</div>")

	price := re_price.FindStringSubmatch(s_body_nocr)
	price_orig := re_price_orig.FindStringSubmatch(s_body)
	price_discount := re_price_discount.FindStringSubmatch(s_body)
	if price != nil {
		app.Price = strings.TrimSpace(price[1])
	}
	if price_orig != nil {
		app.Price = strings.TrimSpace(price_orig[1])
	}
	if price_discount != nil {
		app.PriceDiscount = strings.TrimSpace(price_discount[1])
	}

	dev := re_dev.FindStringSubmatch(s_body)
	if dev != nil {
		app.Developer = html.UnescapeString(dev[1])
	}
	pub := re_pub.FindStringSubmatch(s_body)
	if pub != nil {
		app.Publisher = html.UnescapeString(pub[1])
	}

	// OS
	app.Linux = strings.Contains(s_body, "platform_img linux")
	app.Windows = strings.Contains(s_body, "platform_img win")
	app.OSX = strings.Contains(s_body, "platform_img mac")

	// Features
	app.SteamCloud = strings.Contains(s_body, ">Steam Cloud</a>")
	app.SinglePlayer = strings.Contains(s_body, ">Single-player</a>")
	app.MultiPlayer = strings.Contains(s_body, ">Multi-player</a>")
	app.Coop = strings.Contains(s_body, ">Local Co-op</a>")
	app.MMO = strings.Contains(s_body, ">MMO</a>")
	app.VAC = strings.Contains(s_body, ">Valve Anti-Cheat enabled</a>")
	app.EarlyAccess = strings.Contains(s_body, "<h1 class=\"inset\">Early Access Game</h1>")
	app.TradingCards = strings.Contains(s_body, ">Steam Trading Cards</a>")
	app.Achievements = strings.Contains(s_body, ">Steam Achievements</a>")
	app.Workshop = strings.Contains(s_body, ">Steam Workshop</a>")

	return app, true
}
Example #17
func get_appinfo_steamdb(appid int, useragent string) (SteamApp, bool) {
	s_appid := strconv.Itoa(appid)
	app := SteamApp{}
	app.Id = appid

	client := &http.Client{}
	req, err := http.NewRequest("GET", "https://steamdb.info/app/"+s_appid+"/info/", nil)
	if err != nil {
		log.Error(err.Error())
		return app, false
	}
	req.Header.Set("User-Agent", useragent)
	resp, err := client.Do(req)
	if err != nil {
		log.Error(err.Error())
		return app, false
	}

	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error(err.Error())
		return app, false
	}
	s_body := string(body)
	s_body = strings.Replace(s_body, "\n", "", -1)
	re := regexp.MustCompile("<table class=\"table table-bordered table-hover table-dark\">(.+?)</table>")
	match := re.FindStringSubmatch(s_body)
	if match == nil {
		log.Debug("Unable to find table.")
		return app, false
	}
	table := match[1]

	// Parse release date
	re_releasedate := regexp.MustCompile("Release Date</td><td>(.+?)<i")
	re_inner := regexp.MustCompile("<.*?>(.+?)<")
	re_cells := regexp.MustCompile("<td.*?>(.+?)</td>")
	cells := re_cells.FindAllStringSubmatch(table, -1)
	release := re_releasedate.FindStringSubmatch(s_body)
	if release != nil {
		date := strings.Replace(release[1], ",", "", -1)
		date_p := strings.Split(date, " ")
		app.ReleaseDate = release[1]
		app.ReleaseYear = date_p[2]
	} else {
		log.Debug("Unable to parse release date.")
	}

	// Parse rating
	re_rating := regexp.MustCompile("(\\d+?\\.*\\d+?)% of the (\\d+,*\\d*?) user reviews for this game")
	re_rating_m := re_rating.FindStringSubmatch(s_body)
	if re_rating_m != nil {
		log.Debug(re_rating_m[0])
		f_rating, _err := strconv.ParseFloat(re_rating_m[1], 32)
		if _err == nil {
			app.Rating = float32(f_rating)
		}
		i_reviews, _err := strconv.Atoi(strings.Replace(re_rating_m[2], ",", "", -1))
		if _err == nil {
			app.Reviews = i_reviews
		}
	}
	for i, cell := range cells {
		content := ""
		if i != len(cells)-1 {
			content = cells[i+1][1]
			content = strings.Replace(content, "&reg;", "", -1)
			content = strings.TrimSpace(content)
		}
		if strings.Contains(cell[1], "App Type") {
			app.AppType = content
		}
		if strings.Contains(cell[1], "Name") && !strings.Contains(cell[1], "Store") { // discard "Store Name"
			app.Name = html.UnescapeString(content)
		}
		if strings.Contains(cell[1], "Developer") {
			dev := re_inner.FindStringSubmatch(content)
			if dev != nil {
				app.Developer = strings.TrimSpace(html.UnescapeString(dev[1]))
			}
		}
		if strings.Contains(cell[1], "Publisher") {
			publisher := re_inner.FindStringSubmatch(content)
			if publisher != nil {
				app.Publisher = strings.TrimSpace(html.UnescapeString(publisher[1]))
			}
		}
	}

	// OS
	app.Linux = strings.Contains(table, "icon-linux")
	app.Windows = strings.Contains(table, "icon-windows")
	app.OSX = strings.Contains(table, "icon-macos")

	// Features
	app.SteamCloud = strings.Contains(s_body, "aria-label=\"Steam Cloud\"")
	app.SinglePlayer = strings.Contains(s_body, "aria-label=\"Single-player\"")
	app.MultiPlayer = strings.Contains(s_body, "aria-label=\"Multi-player\"")
	app.Coop = strings.Contains(s_body, "aria-label=\"Co-op\"")
	app.MMO = strings.Contains(s_body, "aria-label=\"MMO\"")
	app.VAC = strings.Contains(s_body, "aria-label=\"Valve Anti-Cheat enabled\"")
	app.EarlyAccess = strings.Contains(s_body, "aria-label=\"Early Access\"")
	app.TradingCards = strings.Contains(s_body, "aria-label=\"Steam Trading Cards\"")
	app.Achievements = strings.Contains(s_body, "aria-label=\"Steam Achievements\"")
	app.Workshop = strings.Contains(s_body, "aria-label=\"Steam Workshop\"")

	log.Debug("Done collecting info.")

	return app, true
}
Example #18
func parseVenue(venuestr string) string {
	m := venueRegxp.FindStringSubmatch(venuestr)
	return strings.TrimSpace(html.UnescapeString(m[2]))
}
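
FindStringSubmatch returns nil when nothing matches, so m[2] panics on unexpected input. A hypothetical nil-checked variant:

// parseVenueSafe is an illustrative variant, not from the original project.
func parseVenueSafe(venuestr string) string {
	m := venueRegxp.FindStringSubmatch(venuestr)
	if len(m) < 3 {
		return ""
	}
	return strings.TrimSpace(html.UnescapeString(m[2]))
}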
Example #19
func parseFix(f *Feed, ss []*Story) (*Feed, []*Story, error) {
	f.Checked = time.Now()

	f.Link = strings.TrimSpace(f.Link)
	f.Title = html.UnescapeString(strings.TrimSpace(f.Title))

	if u, err := url.Parse(f.URL); err == nil {
		if ul, err := u.Parse(f.Link); err == nil {
			f.Link = ul.String()
		}
	}
	base, err := url.Parse(f.Link)
	if err != nil {
		logrus.Infof("unable to parse link: %v", f.Link)
	}

	for _, s := range ss {
		s.Created = f.Checked
		s.Link = strings.TrimSpace(s.Link)
		if !s.Updated.IsZero() && s.Published.IsZero() {
			s.Published = s.Updated
		}
		if s.Published.IsZero() || f.Checked.Before(s.Published) {
			s.Published = f.Checked
		}
		if !s.Updated.IsZero() {
			s.Date = s.Updated.Unix()
		} else {
			s.Date = s.Published.Unix()
		}
		if s.ID == "" {
			if s.Link != "" {
				s.ID = s.Link
			} else if s.Title != "" {
				s.ID = s.Title
			} else {
				logrus.Infof("feed: story has no id: %v", s)
				return nil, nil, fmt.Errorf("story has no id: %v", s)
			}
		}
		s.Title = fullyHTMLUnescape(s.Title)
		// if a story doesn't have a link, see if its id is a URL
		if s.Link == "" {
			if u, err := url.Parse(s.ID); err == nil {
				s.Link = u.String()
			}
		}
		if base != nil && s.Link != "" {
			link, err := base.Parse(s.Link)
			if err == nil {
				s.Link = link.String()
			} else {
				logrus.Infof("feed: unable to resolve link: %s: %v", err, s.Link)
			}
		}
		_, serr := url.Parse(s.Link)
		if serr != nil {
			s.Link = ""
		}

		// Most mail readers disallow IFRAMES in mail content.  This breaks
		// embedding of things like youtube videos.  By changing them to anchor
		// tags things like Gmail will do their own embedding when reading the
		// mail.
		//
		// The following ends up parsing each of the feed items at least 3 times
		// which seems excessive - but meh.
		s.Content, err = cleanFeedContent(s.Content)
		if err != nil {
			logrus.Errorf("feed: error cleaning up content: %s", err)
		}

		p := bluemonday.UGCPolicy()
		s.Content = fullyHTMLUnescape(p.Sanitize(s.Content))

		s.Content, err = rewriteFeedContent(s.Content)
		if err != nil {
			logrus.Errorf("feed: error cleaning up content: %s", err)
		}

	}

	return f, ss, nil
}
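
The sanitize-then-unescape step is easy to get backwards: sanitizing can leave entities escaped, which is presumably why the result is passed through fullyHTMLUnescape afterwards. A small sketch of that step in isolation (the helper name is illustrative; bluemonday.UGCPolicy and Sanitize are the library's real API):

func sanitizeStory(content string) string {
	p := bluemonday.UGCPolicy()
	// Sanitize strips unsafe markup; fullyHTMLUnescape then collapses
	// any double escaping the feed or sanitizer introduced.
	return fullyHTMLUnescape(p.Sanitize(content))
}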
Example #20
// filter unescapes HTML entities and strips zero-width spaces (U+200B).
func filter(x string) string {
	return strings.Replace(html.UnescapeString(x), "\u200B", "", -1)
}
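
A quick usage sketch:

func main() {
	// "&lt;hi&gt;" with an invisible zero-width space in the middle
	// comes back as plain "<hi>".
	fmt.Println(filter("&lt;hi\u200B&gt;")) // prints: <hi>
}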