func scrape(language string, filename string) {
	var doc *goquery.Document
	var e error
	// var w *bufio.Writer
	f, err := os.OpenFile(filename, os.O_APPEND|os.O_WRONLY, 0600)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if _, err = f.WriteString(fmt.Sprintf("\n####%s\n", language)); err != nil {
		panic(err)
	}
	if doc, e = goquery.NewDocument(fmt.Sprintf("https://github.com/trending?l=%s", language)); e != nil {
		panic(e.Error())
	}
	doc.Find("li.repo-leaderboard-list-item").Each(func(i int, s *goquery.Selection) {
		title := s.Find("div h2 a").Text()
		owner := s.Find("span.owner-name").Text()
		repoName := s.Find("strong").Text()
		description := s.Find("p.repo-leaderboard-description").Text()
		url, _ := s.Find("h2 a").Attr("href")
		url = "https://github.com" + url
		fmt.Println("owner: ", owner)
		fmt.Println("repo: ", repoName)
		fmt.Println("URL: ", url)
		if _, err = f.WriteString("* [" + title + "](" + url + "): " + description + "\n"); err != nil {
			panic(err)
		}
	})
}
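// NOTE: goquery.NewDocument, used above and in several functions below, is
// deprecated in recent goquery releases in favor of parsing an io.Reader.
// A minimal sketch of the non-deprecated fetch pattern; the helper name
// fetchDocument is illustrative, not from the original, and it assumes the
// net/http and github.com/PuerkitoBio/goquery imports.
func fetchDocument(pageURL string) (*goquery.Document, error) {
	resp, err := http.Get(pageURL)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	// NewDocumentFromReader parses the response body directly.
	return goquery.NewDocumentFromReader(resp.Body)
}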
func (this *cleaner) cleanBadTags(doc *goquery.Document) *goquery.Document {
	body := doc.Find("body")
	children := body.Children()
	selectors := [3]string{"id", "class", "name"}
	for _, selector := range selectors {
		children.Each(func(i int, s *goquery.Selection) {
			naughtyList := s.Find("*[" + selector + "]")
			cont := 0
			naughtyList.Each(func(j int, e *goquery.Selection) {
				attribute, _ := e.Attr(selector)
				if this.matchNodeRegEx(attribute, REMOVENODES_RE) {
					if this.config.debug {
						log.Printf("Cleaning: Removing node with %s: %s\n", selector, this.config.parser.name(selector, e))
					}
					this.config.parser.removeNode(e)
					cont++
				}
			})
			if this.config.debug {
				log.Printf("%d naughty %s elements found", cont, selector)
			}
		})
	}
	return doc
}
func (this *cleaner) cleanDivs(doc *goquery.Document) *goquery.Document {
	frames := make(map[string]int)
	framesNodes := make(map[string]*list.List)
	divs := doc.Find("div")
	divs.Each(func(i int, s *goquery.Selection) {
		children := s.Children()
		if children.Size() == 0 {
			text := s.Text()
			text = strings.Trim(text, " ")
			text = strings.Trim(text, "\t")
			text = strings.ToLower(text)
			frames[text]++
			if framesNodes[text] == nil {
				framesNodes[text] = list.New()
			}
			framesNodes[text].PushBack(s)
		}
	})
	for text, freq := range frames {
		if freq > 1 {
			selections := framesNodes[text]
			for s := selections.Front(); s != nil; s = s.Next() {
				selection := s.Value.(*goquery.Selection)
				this.config.parser.removeNode(selection)
			}
		}
	}
	return doc
}
// defaultHTML extracts the og:title, og:image, ... tags from a webpage.
func defaultHTML(i *data.Item, sourceURL string, doc *goquery.Document) {
	fmt.Println("Running OG extract.")
	selection := doc.Find("title")
	if len(selection.Nodes) != 0 {
		i.Caption = selection.Nodes[0].FirstChild.Data
	}
	selection = doc.Find("meta[property*='og']")
	for _, e := range selection.Nodes {
		m := htmlAttributeToMap(e.Attr)
		if m["property"] == "og:title" {
			i.Caption = m["content"]
		}
		if m["property"] == "og:image" {
			if !govalidator.IsRequestURL(m["content"]) {
				log.Println("Invalid url in og:image. " + sourceURL)
				continue
			}
			i.ImageURL = m["content"]
		}
		if m["property"] == "og:url" {
			if !govalidator.IsRequestURL(m["content"]) {
				log.Println("Invalid url in og:url. " + sourceURL)
				continue
			}
			i.URL = m["content"]
		}
		if m["property"] == "og:description" {
			i.Description = m["content"]
		}
	}
}
func parseOrderListPage(s *goquery.Document) ([]Order, bool, error) {
	c := s.Find(".container").First()
	t := c.Find("div").First().Text()
	if t != ">注文情報(一覧)<" && t != ">注文情報(検索)<" {
		return nil, false, fmt.Errorf("cannot open \"注文情報(一覧)\", but %#v", t)
	}
	// Remove the title rows.
	c.Find("hr").First().Next().PrevAll().Remove()
	results := []Order{}
	c.Find("a").Each(func(_ int, s *goquery.Selection) {
		href, ok := s.Attr("href")
		if !ok || !strings.HasPrefix(href, "../otc/C003.html?") {
			return
		}
		u, err := url.Parse(href)
		if err != nil || u.RawQuery == "" {
			return
		}
		v, err := url.ParseQuery(u.RawQuery)
		if err != nil {
			return
		}
		results = append(results, Order{
			OrderId:     v.Get("order_id"),
			OrderMethod: v.Get("order_method"),
		})
	})
	return results, c.Find("a[accesskey=\"#\"]").Length() == 1, nil
}
// GetShopName returns the shop name.
func GetShopName(p *goquery.Document) string {
	name := p.Find(".tb-shop-name").Text()
	if name == "" {
		name = p.Find(".slogo-shopname").Text()
	}
	return strings.TrimSpace(name)
}
func feedsFromDoc(doc *goquery.Document, text string) []string {
	sel := "link[type='application/rss+xml']"
	sel += ", link[type='application/atom+xml']"
	matches := doc.Find(sel)
	if matches.Length() > 0 {
		feeds := make([]string, matches.Length())
		matches.Each(func(i int, s *goquery.Selection) {
			url, _ := s.Attr("href")
			feeds[i] = url
		})
		return feeds
	}
	rx := regexp.MustCompile(`href=['"]([^'"]*(rss|atom|feed|xml)[^'"]*)['"]`)
	if rx.FindString(text) != "" {
		matches := rx.FindAllStringSubmatch(text, -1)
		feeds := make([]string, len(matches))
		for i, e := range matches {
			feeds[i] = e[1]
		}
		return feeds
	}
	return make([]string, 0)
}
/* Get the two teams in a match */
func getTeamsId(d *goquery.Document) ([2]int, error) {
	var ids [2]int
	url1, ok := d.Find("div.container.left h3 a").Attr("href")
	if !ok {
		return ids, errors.New("could not find team a")
	}
	idA, err := parseTeam(BASE + url1)
	if err != nil {
		return ids, err
	}
	url2, ok := d.Find("div.container.right h3 a").Attr("href")
	if !ok {
		return ids, errors.New("could not find team b")
	}
	idB, err := parseTeam(BASE + url2)
	if err != nil {
		return ids, err
	}
	ids[0] = idA
	ids[1] = idB
	return ids, nil
}
// perseHTML parses the HTML document and collects applicable dates.
func perseHTML(htmldata *goquery.Document) []string {
	var dates []string
	htmldata.Find("a.bt-open").Each(func(_ int, s *goquery.Selection) {
		if jsonData, ok := s.Attr("id"); ok {
			// decode
			htmlStringDecode(&jsonData)
			// analyze the JSON object
			var jsonObject map[string]interface{}
			//json.JsonAnalyze(jsonData, &jsonObject)
			json.Unmarshal([]byte(jsonData), &jsonObject)
			// Extract the date from the JSON object, e.g. 2016-02-27 03:30:00.
			// Guard the type assertion so a missing or non-string field
			// does not panic.
			strDate, ok := jsonObject["field19"].(string)
			if ok && isTimeApplicable(strDate) {
				dates = append(dates, strDate)
			}
		}
	})
	return dates
}
/*
** get friends' friends info
*/
func (w *SocialWorker) GetFFInfo(query *goquery.Document) {
	var user User
	// var uid string
	var usex string
	// var usersId []string
	// var usersName []string
	// uidString, _ := query.Find("div.c").Eq(1).Find("a").Attr("href")
	// var digitsRegexp = regexp.MustCompile(`(^|&|\?)uid=([^&]*)(&|$)`)
	/*
	** get the uid (string) of the fans' fans
	*/
	// str := digitsRegexp.FindStringSubmatch(uidString)
	// uid = crawlUrl.Id
	// usersId = append(usersId, uid)
	uStr := query.Find("div.c").Eq(2).Text()
	nameStr_1 := GetBetweenStr(uStr, ":", "性别")
	nameStr_2 := GetBetweenStr(nameStr_1, ":", "认证")
	nameStr_3 := strings.Split(nameStr_2, ":")
	uname := nameStr_3[1]
	sexStr_1 := GetBetweenStr(uStr, "性别", "地区")
	sexStr_2 := strings.Split(sexStr_1, ":")
	if sexStr_2[1] == "男" {
		usex = "male"
	} else {
		usex = "female"
	}
	user.uid = crawlUrl.FatherId
	user.friendid = crawlUrl.Id
	user.uname = uname
	user.usex = usex
	glog.Infoln(user)
	w.putItems(user)
}
func getItems(doc *goquery.Document) (items []item, maxWidth int) {
	doc.Find("td.title a").EachWithBreak(func(i int, s *goquery.Selection) bool {
		if i == maxItems {
			return false
		}
		if s.Text() == "More" {
			return true
		}
		href, _ := s.Attr("href")
		title := s.Text()
		points := s.Parent().Parent().Next().Find("span").Text()
		a, b := len(fmt.Sprintf("%s (%s)", title, points)), len(href)
		maxWidth = max(a, b, maxWidth)
		items = append(items, item{
			title:  title,
			url:    href,
			points: points,
		})
		return true
	})
	return
}
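// NOTE: the three-argument max call above only compiles as a built-in on
// Go 1.21 and later; on older toolchains the snippet presumably relied on
// its own helper. A minimal sketch of what such a helper could look like
// (assumed, not taken from the original source):
func max(nums ...int) int {
	// Panics if called with no arguments; callers here always pass at least two.
	m := nums[0]
	for _, n := range nums[1:] {
		if n > m {
			m = n
		}
	}
	return m
}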
/*
**get friends url
*/
func (w *SocialWorker) GetFriendsUrl(query *goquery.Document, p *page.Page) {
	var str_1 string
	// newCrawlUrl := models.CrawlUrl{}
	query.Find("div.c").Find("table").Find("tbody").Find("tr").Find("a:last-child").Each(func(j int, s *goquery.Selection) {
		if j%2 != 0 {
			friendsUrlString, _ := s.Attr("href")
			var digitsRegexp = regexp.MustCompile(`(^|&|\?)uid=([^&]*)(&|$)`)
			str := digitsRegexp.FindStringSubmatch(friendsUrlString)
			if str == nil {
				str_1 = "1"
			} else {
				str_1 = str[2]
			}
			friendsInfoUrl := "http://weibo.cn/" + str_1 + "/info"
			// newCrawlUrl.Url = "http://weibo.cn/" + str_1 + "/fans"
			// p.AddTargetRequestWithHeaderFile(friendsInfoUrl, "html", "./header.json")
			// newCrawlUrl.Id = str_1
			// newCrawlUrl.Layer = crawlUrl.Layer + 1
			// newCrawlUrl.FatherId = crawlUrl.Id
			// w.SendMessageToSQS(newCrawlUrl)
			Urls = append(Urls, friendsInfoUrl)
			UrlsLevel = append(UrlsLevel, UrlsLevel[i]+1)
		}
	})
}
// Parse fetches the resource at the given URL and parses it according to the rules.
func (this *RedditLogic) Parse(redditUrl string) error {
	redditUrl = strings.TrimSpace(redditUrl)
	if redditUrl == "" {
		redditUrl = this.domain + this.golang
	} else if !strings.HasPrefix(redditUrl, "https") {
		redditUrl = "https://" + redditUrl
	}
	var (
		doc *goquery.Document
		err error
	)
	// if doc, err = goquery.NewDocument(redditUrl); err != nil {
	if doc, err = this.newDocumentFromResp(redditUrl); err != nil {
		logger.Errorln("goquery reddit newdocument error:", err)
		return err
	}
	// Process the last entries first, so they are stored first.
	resourcesSelection := doc.Find("#siteTable .link")
	for i := resourcesSelection.Length() - 1; i >= 0; i-- {
		err = this.dealRedditOneResource(goquery.NewDocumentFromNode(resourcesSelection.Get(i)).Selection)
		if err != nil {
			logger.Errorln(err)
		}
	}
	return err
}
func doWork(links <-chan string, results chan<- string) {
	for link := range links {
		var doc *goquery.Document
		for i := 1; ; i++ {
			var err error
			doc, err = goquery.NewDocument(link)
			if err == nil {
				break
			}
			fmt.Fprintf(os.Stderr, "[Attempt %d] Error trying to process server page: %s. Error: %q", i, link, err)
			if i == maxRetries {
				fmt.Fprintf(os.Stderr, "Page not processed: %s", link)
				return
			}
			time.Sleep(time.Duration(i) * time.Duration(rand.Intn(5)) * time.Second)
		}
		var row []string
		doc.Find("td.desc").Each(func(i int, s *goquery.Selection) {
			cell := strings.Replace(
				strings.Trim(s.Next().Text(), " \n"),
				",", ".", 1)
			row = append(row, cell)
		})
		if len(row) > 0 {
			results <- strings.Join(row, *sep)
		} else {
			fmt.Fprintf(os.Stderr, "td.desc not found: %s\n", link)
		}
	}
}
// ARSOPotresi returns a slice of Potres structs.
func ARSOPotresi() []Potres {
	var potresi []Potres
	var doc *goquery.Document
	var e error
	if res, found := cacheArso.Get("potresi"); found {
		return res.([]Potres)
	}
	if doc, e = goquery.NewDocument("http://www.arso.gov.si/potresi/obvestila%20o%20potresih/aip/"); e != nil {
		return potresi
	}
	doc.Find("#glavna td.vsebina table tr").Each(func(i int, s *goquery.Selection) {
		// strconv.ParseFloat expects a bit size of 32 or 64; the original
		// passed 2 and 3 here, which ParseFloat treats as 64 anyway.
		magnituda, err := strconv.ParseFloat(s.Find("td:nth-child(4)").Text(), 64)
		if magnituda > 0 && err == nil {
			potres := Potres{}
			potres.Magnituda = magnituda
			potres.Lat, _ = strconv.ParseFloat(s.Find("td:nth-child(2)").Text(), 64)
			potres.Lon, _ = strconv.ParseFloat(s.Find("td:nth-child(3)").Text(), 64)
			potres.Lokacija = s.Find("td:nth-child(6)").Text()
			potres.Datum = s.Find("td:nth-child(1)").Text()
			potresi = append(potresi, potres)
		}
	})
	cacheArso.Set("potresi", potresi, cache.DefaultExpiration)
	return potresi
}
func garfield(i *data.Item, sourceURL string, doc *goquery.Document) {
	if !strings.Contains(sourceURL, "www.gocomics.com/garfield") {
		return
	}
	fmt.Println("Running Garfield plugin.")
	// update title
	selection := doc.Find(".strip")
	if len(selection.Nodes) == 0 {
		fmt.Println("Garfield plugin found no .strip. " + sourceURL)
	} else {
		if len(selection.Nodes) > 1 {
			fmt.Println("Garfield plugin found >1 .strip. " + sourceURL)
		}
		m := htmlAttributeToMap(selection.Nodes[0].Attr)
		if govalidator.IsRequestURL(m["src"]) {
			i.Description = "<img src=\""
			i.Description += m["src"]
			i.Description += "\" />"
		} else {
			fmt.Println("Garfield plugin invalid url. " + m["src"])
		}
		i.ImageURL = ""
	}
}
// GetAttrbuites returns the Taobao attribute information.
func GetAttrbuites(p *goquery.Document) string {
	attribute := make([]string, 0, 20)
	p.Find("#J_AttrUL li").Each(func(index int, element *goquery.Selection) {
		as := strings.Split(element.Text(), ":")
		if len(as) < 2 {
			// Fall back to the fullwidth colon used on some pages.
			as = strings.Split(element.Text(), "：")
		}
		b := ""
		if len(as) >= 2 && utf8.ValidString(as[1]) {
			b = as[1]
		}
		attribute = append(attribute, as[0]+":"+b)
	})
	if len(attribute) == 0 {
		p.Find("#attributes .attributes-list li").Each(func(index int, element *goquery.Selection) {
			attribute = append(attribute, element.Text())
		})
	}
	return strings.Join(attribute, "##")
}
func scrapeSearch(document *goquery.Document, url string) {
	pagesStr := document.Find("a.next_page").Prev().Text()
	pages, _ := strconv.Atoi(pagesStr)
	page := 1
	for page <= pages {
		pageURL := url + "&p=" + strconv.Itoa(page)
		fmt.Println("Analyzing page: " + pageURL)
		doc := downloadURL(pageURL)
		doc.Find(".user-list-item").Each(func(i int, s *goquery.Selection) {
			email := s.Find("a.email").Text()
			profileURL, _ := s.Find("a").Eq(1).Attr("href")
			username := profileURL[1:]
			profileURL = "http://github.com" + profileURL
			info := s.Find(".user-list-info")
			_ = info.Find("ul.user-list-meta").Remove()
			_ = info.Find("a").Remove()
			name := strings.TrimSpace(info.Text())
			fmt.Println("Parsed user: " + username)
			user := user{name: name, email: email, url: profileURL, username: username}
			dumpToCSV(user)
		})
		page = page + 1
	}
}
// GetShopUrl returns the shop URL.
func GetShopUrl(p *goquery.Document) string {
	href, _ := p.Find(".tb-seller-name").Attr("href")
	if href == "" {
		href, _ = p.Find(".slogo-shopname").Attr("href")
	}
	return strings.TrimSpace("https:" + href)
}
func getTerms(doc *goquery.Document) ([]string, error) {
	terms := make([]string, 0)
	doc.Find("p").Each(func(i int, s *goquery.Selection) {
		// Decode any HTML-encoded characters so they can be parsed correctly.
		bdy := html.UnescapeString(s.Text())
		// TODO: condense into a regex?
		bdy = strings.Replace(bdy, "-", " ", -1)
		bdy = strings.Replace(bdy, ",", " ", -1)
		bdy = strings.Replace(bdy, ".", " ", -1)
		bdy = strings.Replace(bdy, ";", " ", -1)
		bdy = strings.Replace(bdy, "\"", " ", -1)
		terms = append(terms, strings.Fields(bdy)...)
	})
	re, err := regexp.Compile("[^A-Za-z0-9]+")
	if err != nil {
		log.Println("Unexpected regex compilation error: " + err.Error())
		return []string{}, err
	}
	for i := 0; i < len(terms); i++ {
		terms[i] = re.ReplaceAllString(terms[i], "")
	}
	return terms, nil
}
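// NOTE: since the pattern above is a fixed literal, a common alternative
// (a standard Go idiom, not the original author's code) is to hoist it to
// package scope with regexp.MustCompile, which compiles once and panics at
// init time if the literal is invalid, removing the runtime error path:
var nonAlnumRE = regexp.MustCompile("[^A-Za-z0-9]+")

func cleanTerm(term string) string {
	// Strip everything except ASCII letters and digits.
	return nonAlnumRE.ReplaceAllString(term, "")
}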
func descriptionFromDoc(doc *goquery.Document) string {
	sel := "meta[property='og:description']"
	sel += ", meta[name='twitter:description']"
	sel += ", meta[name='description']"
	desc, _ := doc.Find(sel).First().Attr("content")
	return desc
}
func (t *TownClient) getSValue() (sValue string) {
	log.WithField("tag", TAG).Info("getting sValue for town login")
	sValue = ""
	var doc *goquery.Document
	var e error
	log.WithField("tag", TAG).Infof("GET %v", ROOT)
	if doc, e = goquery.NewDocument(ROOT); e != nil {
		log.WithField("tag", TAG).Errorf("%s", e.Error())
		return
	}
	doc.Find("input").Each(func(i int, s *goquery.Selection) {
		attr, exists := s.Attr("name")
		if exists && attr == "s" {
			if value, ok := s.Attr("value"); ok {
				sValue = value
			}
		}
	})
	log.WithField("tag", TAG).Infof("sValue: %v", sValue)
	return sValue
}
func main() {
	fmt.Println("starting...")
	flag.Parse()
	var doc *goquery.Document
	var err error
	if doc, err = goquery.NewDocument("http://science.nasa.gov/missions/?group=all"); err != nil {
		log.Fatal("Failed to fetch page")
	}
	doc.Find(".missions").Find("tbody").Children().Each(func(i int, s *goquery.Selection) {
		m := unpackMission(s)
		if m.Phase == "Operating" {
			missions = append(missions, m)
		}
	})
	if *asJson {
		b, err := json.Marshal(missions)
		if err != nil {
			log.Fatal(err)
		}
		os.Stdout.Write(b)
	} else {
		for _, m := range missions {
			fmt.Println(m)
		}
	}
}
func parse(d *goquery.Document) []Result {
	// Select the tales table.
	rowsSel := d.Find("#tale_list > tbody:nth-child(2) > tr")
	// var rows []Result
	rows := make([]Result, rowsSel.Length())
	rowsSel.Each(func(i int, s *goquery.Selection) {
		// Get all of the row's child td tags.
		tdSel := s.Children()
		rows[i] = Result{
			ID:           getMovieID(tdSel.Eq(1)),
			Name:         getMovieName(tdSel.Eq(1)),
			SubName:      getMovieSubName(tdSel.Eq(1)),
			Views:        getViews(tdSel.Eq(4)),
			Author:       getAuthor(tdSel.Eq(8)),
			FPS:          getFPS(tdSel.Eq(6)),
			SubtitleLink: getSubtitleLink(tdSel.Eq(1)),
			Links:        getMovieLinks(tdSel.Eq(3)),
			Genres:       getGenres(tdSel.Eq(2)),
			Created:      getDate(tdSel.Eq(0)),
		}
	})
	return rows
}
func parseTrendingRepos(doc *goquery.Document) []GithubRepo {
	var repos []GithubRepo
	var regStars = regexp.MustCompile("[0-9]+")
	doc.Find("li.repo-list-item").Each(func(i int, s *goquery.Selection) {
		title := strings.Trim(s.Find("h3.repo-list-name a").Text(), "\n\t ")
		title = strings.Replace(title, " ", "", -1)
		title = strings.Replace(title, "\n", "", -1)
		description := strings.Trim(s.Find("p.repo-list-description").Text(), "\n\t ")
		url, _ := s.Find("h3.repo-list-name a").Attr("href")
		url = "https://github.com" + url
		starsString := s.Find("p.repo-list-meta").Text()
		starsString = strings.Replace(starsString, ",", "", -1)
		starsString = regStars.FindString(starsString)
		if starsString == "" {
			starsString = "0"
		}
		stars, _ := strconv.Atoi(starsString)
		repo := GithubRepo{
			Title:       title,
			Description: description,
			Url:         url,
			Stars:       stars,
			Forks:       0,
			Date:        time.Now().UTC().Unix(),
		}
		repos = append(repos, repo)
	})
	return repos
}
func HtmlUlLiToRst(doc *goquery.Document) *goquery.Document {
	for ul := doc.Find("ul").First(); ul.Length() != 0; ul = doc.Find("ul").First() {
		processUl(ul, 0)
	}
	return doc
}
func (this *cleaner) cleanCites(doc *goquery.Document) *goquery.Document {
	cites := doc.Find("cite")
	cites.Each(func(i int, s *goquery.Selection) {
		this.config.parser.removeNode(s)
	})
	return doc
}
// extractLinks from a document.
func (h *HttpFetcher) extractLinks(doc *goquery.Document) ([]*url.URL, error) {
	// Blank slice to hold the links on this page.
	urls := make([]*url.URL, 0)
	// Extract all 'a' elements from the document.
	sel := doc.Find("a")
	if sel == nil {
		// Assume zero links on failure.
		return nil, nil
	}
	// Range over links, and add them to the list if valid.
	for _, n := range sel.Nodes {
		// Validate the node is a link, and extract the target URL.
		href, err := h.extractValidHref(n)
		if err != nil || href == "" {
			continue
		}
		// Normalise the URL and add if valid.
		if uri := h.normaliseUrl(doc.Url, href); uri != nil {
			urls = append(urls, uri)
		}
	}
	return h.dedupeUrls(urls), nil
}
func GetStudent(d *goquery.Document, rollno int) (s Student, ok bool) {
	// sanity check on the document
	// if v := d.Find(".titlehead").Children().Text(); v != "JEE (Advanced) - 2013 Result" {
	// 	return Student{}, false
	// }
	dtext := strings.Trim(d.Text(), " ")
	dfields := strings.Fields(dtext)
	for _, v := range dfields {
		s.Plaintext += v + " "
	}
	s.Plaintext = strings.Trim(s.Plaintext, " ")
	if isInvalid(dtext) {
		return s, false
	}
	ok = true
	s.Rollno = rollno
	s.Region = s.Rollno / 10000
	if !isSelected(dtext) {
		return
	}
	s.Selected = true
	s.Rank, _ = strconv.Atoi(d.Find(".style7").First().Text())
	text, _ := d.Find(".titlehead").First().Parent().Next().Children().Children().First().Html()
	tokens := strings.Split(text, "<br/>")
	nameToks := strings.Fields(tokens[1])
	nameToks = nameToks[2:]
	for _, v := range nameToks {
		s.Name += v + " "
	}
	s.Name = strings.Trim(s.Name, " ")
	s.Q = GetQuota(dtext)
	return
}
// loginStatus determines whether the client is currently logged in, based on a goquery.Document.
func loginStatus(doc *goquery.Document) bool {
	return doc.Find(`#user-controls`).Length() > 0
}