// getSelectionSignature returns a jQuery-style signature for a selection
// (e.g. "html body div.article p.lead") so the node can be easily found
// again to get its data quickly next time.
func getSelectionSignature(s *goquery.Selection) string {
    var signature string
    tag, _ := goquery.OuterHtml(s)
    pos := strings.Index(tag, ">")
    if pos > -1 {
        tag = tag[1:pos]
    } else {
        return ""
    }
    signature = convertTagToJqueryFormat(tag, s)
    s.Parents().Each(func(i int, sec *goquery.Selection) {
        ohtml, _ := goquery.OuterHtml(sec)
        pos := strings.Index(ohtml, ">")
        if pos > -1 {
            ohtml = ohtml[1:pos]
        }
        tag := convertTagToJqueryFormat(ohtml, sec)
        signature = tag + " " + signature
    })
    return signature
}
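// A minimal usage sketch, not from the original source: it shows how the
// signature produced above can be fed back into Find to relocate the same
// node. The example function name and the HTML snippet are illustrative only.
func exampleGetSelectionSignature() {
    doc, _ := goquery.NewDocumentFromReader(strings.NewReader(
        `<div class="article main"><p class="lead">hello</p></div>`))
    sig := getSelectionSignature(doc.Find("p").First())
    // sig comes out as something like "html body div.article p.lead",
    // which works as a descendant selector for a later lookup.
    fmt.Println(doc.Find(sig).Text()) // "hello"
}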
// isHighLinkDensity checks the density of links within a node: if there is
// not much text and most of it consists of links, the node is no good.
func (this *contentExtractor) isHighLinkDensity(node *goquery.Selection) bool {
    links := node.Find("a")
    if links == nil || links.Size() == 0 {
        return false
    }
    text := node.Text()
    words := strings.Split(text, " ")
    nwords := len(words)
    sb := make([]string, 0)
    links.Each(func(i int, s *goquery.Selection) {
        linkText := s.Text()
        sb = append(sb, linkText)
    })
    linkText := strings.Join(sb, "")
    linkWords := strings.Split(linkText, " ")
    nlinkWords := len(linkWords)
    nlinks := links.Size()
    linkDivisor := float64(nlinkWords) / float64(nwords)
    score := linkDivisor * float64(nlinks)
    if this.config.debug {
        logText := ""
        if len(node.Text()) >= 51 {
            logText = node.Text()[0:50]
        } else {
            logText = node.Text()
        }
        log.Printf("Calculated link density score as %1.5f for node %s\n", score, logText)
    }
    return score > 1.0
}
func (b baiduNews) commonPrase(ctx *Context) (infoStr string) {
    body := ctx.GetDom().Find("body")
    var info *goquery.Selection
    if h1s := body.Find("h1"); len(h1s.Nodes) != 0 {
        for i := 0; i < len(h1s.Nodes); i++ {
            info = b.findP(h1s.Eq(i))
        }
    } else if h2s := body.Find("h2"); len(h2s.Nodes) != 0 {
        for i := 0; i < len(h2s.Nodes); i++ {
            info = b.findP(h2s.Eq(i))
        }
    } else if h3s := body.Find("h3"); len(h3s.Nodes) != 0 {
        for i := 0; i < len(h3s.Nodes); i++ {
            info = b.findP(h3s.Eq(i))
        }
    } else {
        info = body.Find("body")
    }
    infoStr, _ = info.Html()
    // clean the HTML
    infoStr = CleanHtml(infoStr, 5)
    return
}
// parseColors concatenates the text of every element in the selection.
func parseColors(s *goquery.Selection) string {
    colors := ""
    s.Each(func(i int, s *goquery.Selection) {
        colors += s.Text()
    })
    return colors
}
// attrOrDefault reads an attribute and returns its value, or the default
// value when the attribute is not present.
func (bow *Browser) attrOrDefault(name, def string, sel *goquery.Selection) string {
    a, ok := sel.Attr(name)
    if ok {
        return a
    }
    return def
}
func (b baiduNews) commonPrase(resp *context.Response) (infoStr string) {
    body := resp.GetDom().Find("body")
    var info *goquery.Selection
    if h1s := body.Find("h1"); len(h1s.Nodes) != 0 {
        for i := 0; i < len(h1s.Nodes); i++ {
            info = b.findP(h1s.Eq(i))
        }
    } else if h2s := body.Find("h2"); len(h2s.Nodes) != 0 {
        for i := 0; i < len(h2s.Nodes); i++ {
            info = b.findP(h2s.Eq(i))
        }
    } else if h3s := body.Find("h3"); len(h3s.Nodes) != 0 {
        for i := 0; i < len(h3s.Nodes); i++ {
            info = b.findP(h3s.Eq(i))
        }
    } else {
        info = body.Find("body")
    }
    // remove tags
    // info.RemoveFiltered("script")
    // info.RemoveFiltered("style")
    infoStr, _ = info.Html()
    // clean the HTML
    infoStr = CleanHtml(infoStr, 5)
    return
}
func scrapPayload(s *goquery.Selection, n int) string {
    url, ok := s.Find("a").Attr("href")
    if !ok {
        die("unable to find URL for scraping")
    }
    return scrapPayloadURL("https://developer.github.com"+url, n)
}
// JoinNodesWithSpace joins the text of every element in the selection with a single space.
func JoinNodesWithSpace(s *goquery.Selection) string {
    texts := []string{}
    s.Each(func(i int, s *goquery.Selection) {
        texts = append(texts, s.Text())
    })
    return strings.Join(texts, " ")
}
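// A minimal usage sketch, not from the original source; the example function
// name and the markup are illustrative only.
func exampleJoinNodesWithSpace() {
    doc, _ := goquery.NewDocumentFromReader(strings.NewReader(`<p>Hello</p><p>world</p>`))
    fmt.Println(JoinNodesWithSpace(doc.Find("p"))) // "Hello world"
}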
// dropTag converts each selected element into a plain text node holding its text content.
func (this *parser) dropTag(selection *goquery.Selection) {
    selection.Each(func(i int, s *goquery.Selection) {
        node := s.Get(0)
        node.Data = s.Text()
        node.Type = html.TextNode
    })
}
// parseGamePosition parses the integer position from the selection's first child.
func parseGamePosition(selection *goquery.Selection) (position int) {
    positionString := strings.TrimSpace(selection.Children().First().Text())
    var err error
    position, err = strconv.Atoi(positionString)
    helper.HandleFatalError("parsing game position failed:", err)
    return
}
// ScrapeExamples collects the text of every span.h element within the selection.
func ScrapeExamples(s *goquery.Selection) []string {
    examples := []string{}
    s.Find("span.h").Each(func(i int, s *goquery.Selection) {
        examples = append(examples, s.Text())
    })
    return examples
}
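// A minimal usage sketch, not from the original source; the example function
// name and the markup are illustrative only.
func exampleScrapeExamples() {
    doc, _ := goquery.NewDocumentFromReader(strings.NewReader(
        `<div><span class="h">first example</span><span class="h">second example</span></div>`))
    examples := ScrapeExamples(doc.Find("div"))
    fmt.Println(examples) // [first example second example]
}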
func extractCredits(selection *goquery.Selection) string {
    result := trim(selection.Find(".credits").Text())
    if strings.Contains(result, "#") {
        return "0"
    }
    return result
}
// extractCourseDescription requests the catalog service (passing the course's
// catalog URL as the Referer) and extracts the course description from the
// returned courseblockdesc block.
func extractCourseDescription(selection *goquery.Selection) string {
    url := trim(fmt.Sprintln(selection.Find(".catalogdescription a").AttrOr("href", "")))
    fmt.Println("LOGGING URL", url)
    client := http.Client{}
    req, _ := http.NewRequest("GET", "http://catalog.njit.edu/ribbit/index.cgi?format=html&page=fsinjector.rjs&fullpage=true", nil)
    req.Header.Add("Referer", url)
    resp, err := client.Do(req)
    if err != nil {
        return ""
    }
    if resp != nil {
        defer resp.Body.Close()
    }
    body, _ := ioutil.ReadAll(resp.Body)
    //checkError(err)
    result := substringAfter(string(body), "courseblockdesc")
    if len(result) < 4 {
        return ""
    }
    result = substringBefore(result[3:], "<b")
    if string(result[0]) == "<" || strings.Contains(result, "at SISConnxService") {
        return ""
    }
    result = strings.Replace(result, "\\\"", "\"", -1)
    doc, _ := goquery.NewDocumentFromReader(strings.NewReader(result))
    return trim(doc.Text())
}
func convertTagToJqueryFormat(tag string, s *goquery.Selection) string {
    tagitself := tag
    pos := strings.Index(tag, " ")
    if pos > -1 {
        tagitself = tag[0:pos]
    } else {
        return tag
    }
    class, found := s.Attr("class")
    if found && class != "" {
        pos := strings.Index(class, " ")
        // keep only the first class from the list
        if pos > -1 {
            class = class[0:pos]
        }
        tagitself = tagitself + "." + class
    }
    return tagitself
}
// testList recursively checks every nested <ul>, then verifies the remaining
// items of this list are in alphabetic order.
func testList(t *testing.T, list *goquery.Selection) {
    list.Find("ul").Each(func(_ int, items *goquery.Selection) {
        testList(t, items)
        items.RemoveFiltered("ul")
    })
    checkAlphabeticOrder(t, list)
}
// delAttr removes the named attribute from the selection's first node, if present.
func (this *parser) delAttr(selection *goquery.Selection, attr string) {
    idx := this.indexOfAttribute(selection, attr)
    if idx > -1 {
        node := selection.Get(0)
        node.Attr = append(node.Attr[:idx], node.Attr[idx+1:]...)
    }
}
// toPage is a helper function that accepts an anchor
// tag referencing a markdown file, parsing the markdown
// file and returning a page to be included in our docs.
func toPage(site *Site, el *goquery.Selection) (*Page, error) {
    // follow the link to see if this is a page
    // that should be added to our documentation.
    href, ok := el.Attr("href")
    if !ok || href == "#" {
        return nil, nil
    }

    // read the markdown file, convert to html and
    // read into a dom element.
    doc, err := toDocument(filepath.Join(site.base, href))
    if err != nil {
        return nil, err
    }

    // convert the extension from markdown to
    // html, in preparation for type conversion.
    href = strings.Replace(href, ".md", ".html", -1)
    el.SetAttr("href", href)

    page := &Page{}
    page.Href = href
    page.html, err = doc.Html()
    return page, err
}
// name returns the value of the given attribute, or the empty string when it does not exist.
func (this *parser) name(selector string, selection *goquery.Selection) string {
    value, exists := selection.Attr(selector)
    if exists {
        return value
    }
    return ""
}
// classWeight scores an element by matching its class and id attributes
// against the positive and negative patterns; it returns 0 when class
// weighting is disabled.
func (d *Document) classWeight(s *goquery.Selection) int {
    weight := 0
    if !d.WeightClasses {
        return weight
    }
    class, _ := s.Attr("class")
    id, _ := s.Attr("id")
    if class != "" {
        if negativeRegexp.MatchString(class) {
            weight -= 25
        }
        if positiveRegexp.MatchString(class) {
            weight += 25
        }
    }
    if id != "" {
        if negativeRegexp.MatchString(id) {
            weight -= 25
        }
        if positiveRegexp.MatchString(id) {
            weight += 25
        }
    }
    return weight
}
// parseTranslations builds a Translation for each element in the selection.
func parseTranslations(elements *goquery.Selection) (results []Translation) {
    elements.Each(func(index int, element *goquery.Selection) {
        results = append(results, Translation{parseMeaning(element), parseHref(element), parsePhrase(element)})
    })
    return
}
// ParseCourse builds a Course from the cells of a table row.
func ParseCourse(s *goquery.Selection) Course {
    subject := strings.TrimSpace(s.Find("td").Eq(0).Text())
    catalog := strings.TrimSpace(s.Find("td").Eq(1).Text())
    termStr := strings.TrimSpace(s.Find("td").Eq(2).Text())
    class := strings.TrimSpace(s.Find("td").Eq(3).Text())
    title := strings.TrimSpace(s.Find("td").Eq(4).Text())
    instructor := strings.TrimSpace(s.Find("td").Eq(5).Text())
    credits := strings.TrimSpace(s.Find("td").Eq(6).Text())

    catalogNum, _ := strconv.Atoi(catalog)
    classNum, _ := strconv.Atoi(strings.TrimSpace(class))

    // Damn you unicode NBSP!!!
    filter := strings.Replace(termStr, "\u0020", "", -1)
    termCleaned := strings.Split(filter, "\u00A0")[0]

    course := Course{
        Subject:       subject,
        CatalogNumber: catalogNum,
        ClassNumber:   classNum,
        Title:         title,
        Instructor:    instructor,
        Credits:       credits,
        Term:          NewTerm(termCleaned),
    }
    return course
}
// parsePhrase collects the trimmed text of each .translation_item element.
func parsePhrase(selection *goquery.Selection) (result []string) {
    selection.Find(".translation_item").Each(func(index int, meaning *goquery.Selection) {
        result = append(result, strings.TrimSpace(meaning.Text()))
    })
    return
}
// unpackMission builds a Mission from the cells of a table row.
func unpackMission(s *goquery.Selection) *Mission {
    m := Mission{}
    tds := s.Children()

    r, err := tds.First().Html()
    if err != nil {
        log.Printf("Error parsing HTML: %+v\n", err)
    } else {
        m.Division = r
    }

    node := tds.Next().Children()
    name, err := node.Html()
    if err != nil {
        log.Println("Error getting name: ", err)
    }
    m.Name = strings.TrimSpace(name)

    href, ok := node.Attr("href")
    if !ok {
        log.Println("No href")
    }
    m.Url = href

    node = tds.Next()
    desc, err := node.Find(".desc").Children().Html()
    if err != nil {
        log.Println("Err getting desc", err)
    }
    m.Description = strings.TrimSpace(desc)

    node = tds.Next()
    date, err := node.Next().Children().Html()
    if err != nil {
        log.Println("Err getting launch date", err)
    }
    m.LaunchDate = date

    date2 := strings.Trim(node.Next().First().Text(), "1234567890")
    m.LaunchDateHuman = strings.TrimSpace(date2)
    m.Phase = strings.TrimLeft(tds.Last().Text(), "1234567890")
    return &m
}
// node returns a string representation of the selection.
func node(i int, s *goquery.Selection) string {
    switch node := s.Get(0); {
    case node.Data == "h1":
        return fmt.Sprintf(" \033[%dm# %s\033[0m\n\n", blue, text(s))
    case node.Data == "h2":
        return fmt.Sprintf(" \033[%dm## %s\033[0m\n\n", blue, text(s))
    case node.Data == "h3":
        return fmt.Sprintf(" \033[%dm### %s\033[0m\n\n", blue, text(s))
    case node.Data == "p":
        return fmt.Sprintf("\033[%dm%s\033[0m\n\n", none, indent(text(s), 1))
    case node.Data == "pre" || s.HasClass("highlight"):
        return fmt.Sprintf("\033[1m%s\033[0m\n\n", indent(text(s), 2))
    case node.Data == "a":
        return fmt.Sprintf("%s (%s) ", s.Text(), s.AttrOr("href", "missing link"))
    case node.Data == "li":
        return fmt.Sprintf(" • %s\n", contents(s))
    case node.Data == "ul":
        return fmt.Sprintf("%s\n", nodes(s))
    case node.Data == "code":
        return fmt.Sprintf("\033[1m%s\033[0m ", s.Text())
    case node.Type == html.TextNode:
        return strings.TrimSpace(node.Data)
    default:
        return ""
    }
}
// getSrc returns the node's src attribute, or the empty string when it is absent.
func (ve *VideoExtractor) getSrc(node *goquery.Selection) string {
    value, exists := node.Attr("src")
    if exists {
        return value
    }
    return ""
}
// score rates an image tag by matching its src (or lazy-load src) against the
// configured rules, with small adjustments for the alt text and id attributes.
func score(tag *goquery.Selection) int {
    src, _ := tag.Attr("src")
    if src == "" {
        src, _ = tag.Attr("data-src")
    }
    if src == "" {
        src, _ = tag.Attr("data-lazy-src")
    }
    if src == "" {
        return -1
    }
    tagScore := 0
    for rule, score := range rules {
        if rule.MatchString(src) {
            tagScore += score
        }
    }
    alt, exists := tag.Attr("alt")
    if exists {
        if strings.Contains(alt, "thumbnail") {
            tagScore--
        }
    }
    id, exists := tag.Attr("id")
    if exists {
        if id == "fbPhotoImage" {
            tagScore++
        }
    }
    return tagScore
}
// replaceWithPara rewrites the first node of the selection into a <p> element.
func (this *cleaner) replaceWithPara(div *goquery.Selection) {
    if div.Size() > 0 {
        node := div.Get(0)
        node.Data = atom.P.String()
        node.DataAtom = atom.P
    }
}
func (rc *TwitterChecker) findSigInTweet(h SigHint, s *goquery.Selection) ProofError {
    inside := s.Text()
    html, err := s.Html()
    checkText := h.checkText
    if err != nil {
        return NewProofError(keybase1.ProofStatus_CONTENT_FAILURE, "No HTML tweet found: %s", err)
    }
    G.Log.Debug("+ Checking tweet '%s' for signature '%s'", inside, checkText)
    G.Log.Debug("| HTML is: %s", html)
    // strip any leading @mentions before checking for the signature text
    rxx := regexp.MustCompile(`^(@[a-zA-Z0-9_-]+\s+)`)
    for {
        if m := rxx.FindStringSubmatchIndex(inside); m == nil {
            break
        } else {
            prefix := inside[m[2]:m[3]]
            inside = inside[m[3]:]
            G.Log.Debug("| Stripping off @prefix: %s", prefix)
        }
    }
    if strings.HasPrefix(inside, checkText) {
        return nil
    }
    return NewProofError(keybase1.ProofStatus_DELETED, "Could not find '%s' in '%s'", checkText, inside)
}
// serializeForm converts the form fields into url.Values.
// It returns two url.Values: the first holds the form field values, and the
// second holds the form button values.
func serializeForm(sel *goquery.Selection) (url.Values, url.Values) {
    input := sel.Find("input,button,textarea")
    if input.Length() == 0 {
        return url.Values{}, url.Values{}
    }
    fields := make(url.Values)
    buttons := make(url.Values)
    input.Each(func(_ int, s *goquery.Selection) {
        name, ok := s.Attr("name")
        if ok {
            typ, ok := s.Attr("type")
            if ok || s.Is("textarea") {
                if typ == "submit" {
                    val, ok := s.Attr("value")
                    if ok {
                        buttons.Add(name, val)
                    } else {
                        buttons.Add(name, "")
                    }
                } else {
                    val, ok := s.Attr("value")
                    if !ok {
                        val = ""
                    }
                    fields.Add(name, val)
                }
            }
        }
    })
    return fields, buttons
}
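// A minimal usage sketch, not from the original source; the example function
// name and the form markup are illustrative only.
func exampleSerializeForm() {
    doc, _ := goquery.NewDocumentFromReader(strings.NewReader(
        `<form><input type="text" name="q" value="go"><button type="submit" name="go" value="Search">Search</button></form>`))
    fields, buttons := serializeForm(doc.Find("form").First())
    fmt.Println(fields)  // map[q:[go]]
    fmt.Println(buttons) // map[go:[Search]]
}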
func GetText(s *goquery.Selection) string {
    texts, _ := s.Find("td").Html()
    texts = TrimLinefeed(texts)
    texts = strings.Replace(texts, "<br/>", ",", -1)
    // strip the "デッキレベル0再録" (deck level 0 reprint) marker
    texts = strings.Replace(texts, "デッキレベル0再録", "", -1)
    return ReplaceIcon(texts)
}