Пример #1
0
// getSelectionSignature builds a space-separated, jQuery-like path for s:
// one step per ancestor followed by the step for s itself. Each step is
// produced by convertTagToJqueryFormat from the element's opening tag. The
// signature can be stored and used later to look the element up again.
// It returns "" when the outer HTML of s contains no ">".
func getSelectionSignature(s *goquery.Selection) string {
	// Rendering errors are ignored; an empty rendering falls into the "" case.
	outer, _ := goquery.OuterHtml(s)

	end := strings.Index(outer, ">")
	if end < 0 {
		return ""
	}

	// outer[1:end] is the opening tag without its angle brackets.
	signature := convertTagToJqueryFormat(outer[1:end], s)

	// Every ancestor's step is prepended, so ancestors visited later end up
	// further to the left of the signature.
	s.Parents().Each(func(_ int, parent *goquery.Selection) {
		markup, _ := goquery.OuterHtml(parent)
		if end := strings.Index(markup, ">"); end > -1 {
			markup = markup[1:end]
		}
		signature = convertTagToJqueryFormat(markup, parent) + " " + signature
	})

	return signature
}
Пример #2
0
// isHighLinkDensity reports whether node consists mostly of link text.
//
// The score is (words inside <a> descendants / words in the whole node)
// multiplied by the number of links; words are obtained by splitting on a
// single space. A score above 1.0 means the node is considered link-heavy.
// Nodes without any links are never link-heavy.
func (this *contentExtractor) isHighLinkDensity(node *goquery.Selection) bool {
	links := node.Find("a")
	if links == nil || links.Size() == 0 {
		return false
	}

	// Collect the node text once; it is reused for the debug output below.
	text := node.Text()
	nwords := len(strings.Split(text, " "))

	sb := make([]string, 0, links.Size())
	links.Each(func(i int, s *goquery.Selection) {
		sb = append(sb, s.Text())
	})
	linkText := strings.Join(sb, "")
	nlinkWords := len(strings.Split(linkText, " "))
	nlinks := links.Size()

	// strings.Split always returns at least one element, so nwords >= 1.
	linkDivisor := float64(nlinkWords) / float64(nwords)
	score := linkDivisor * float64(nlinks)

	if this.config.debug {
		logText := text
		if len(text) >= 51 {
			// Truncate to at most 50 bytes, backing off to the closest rune
			// boundary so a multi-byte character is never cut in half.
			cut := 0
			for i := range text {
				if i > 50 {
					break
				}
				cut = i
			}
			logText = text[:cut]
		}
		log.Printf("Calculated link density score as %1.5f for node %s\n", score, logText)
	}

	return score > 1.0
}
Пример #3
0
// commonPrase extracts the main content of the page held by ctx and returns
// it as cleaned HTML.
//
// It looks for headings in priority order (h1, then h2, then h3). For the
// first level that has any matches, b.findP is called on every heading and
// the result for the last heading is kept. When the page has no headings, or
// findP yields nil, the whole <body> is used instead.
func (b baiduNews) commonPrase(ctx *Context) (infoStr string) {
	body := ctx.GetDom().Find("body")

	var info *goquery.Selection
	for _, tag := range []string{"h1", "h2", "h3"} {
		headings := body.Find(tag)
		if len(headings.Nodes) == 0 {
			continue
		}
		for i := range headings.Nodes {
			info = b.findP(headings.Eq(i))
		}
		// Only the first heading level that is present is considered.
		break
	}

	// Find only searches descendants, so body.Find("body") can never match;
	// fall back to the body selection itself. This also guards against a nil
	// selection, which would panic in Html().
	if info == nil {
		info = body
	}

	// The Html error is ignored on purpose: an empty string is cleaned below.
	infoStr, _ = info.Html()

	// Clean up the HTML.
	return CleanHtml(infoStr, 5)
}
Пример #4
0
// parseColors returns the text of every element in s, concatenated in
// selection order with no separator.
func parseColors(s *goquery.Selection) string {
	// Selection.Text already concatenates the text of all matched elements,
	// which avoids rebuilding the string with += once per element.
	return s.Text()
}
Пример #5
0
// attrOrDefault returns the value of attribute name on sel, or def when the
// attribute is not present. An attribute that is present but empty is
// returned as the empty string.
func (bow *Browser) attrOrDefault(name, def string, sel *goquery.Selection) string {
	value, present := sel.Attr(name)
	if !present {
		return def
	}
	return value
}
Пример #6
0
// commonPrase extracts the main content of the page held by resp and returns
// it as cleaned HTML.
//
// It looks for headings in priority order (h1, then h2, then h3). For the
// first level that has any matches, b.findP is called on every heading and
// the result for the last heading is kept. When the page has no headings, or
// findP yields nil, the whole <body> is used instead.
func (b baiduNews) commonPrase(resp *context.Response) (infoStr string) {
	body := resp.GetDom().Find("body")

	var info *goquery.Selection
	for _, tag := range []string{"h1", "h2", "h3"} {
		headings := body.Find(tag)
		if len(headings.Nodes) == 0 {
			continue
		}
		for i := range headings.Nodes {
			info = b.findP(headings.Eq(i))
		}
		// Only the first heading level that is present is considered.
		break
	}

	// Find only searches descendants, so body.Find("body") can never match;
	// fall back to the body selection itself. This also guards against a nil
	// selection, which would panic in Html().
	if info == nil {
		info = body
	}

	// Tag removal (currently disabled):
	// info.RemoveFiltered("script")
	// info.RemoveFiltered("style")

	// The Html error is ignored on purpose: an empty string is cleaned below.
	infoStr, _ = info.Html()

	// Clean up the HTML.
	return CleanHtml(infoStr, 5)
}
Пример #7
0
// scrapPayload reads the href of the first link found under s, prefixes it
// with the GitHub developer site host and passes the resulting URL, together
// with n, to scrapPayloadURL. die is called when no href is present.
func scrapPayload(s *goquery.Selection, n int) string {
	const host = "https://developer.github.com"

	href, found := s.Find("a").Attr("href")
	if !found {
		die("unable to find URL for scrapping")
	}
	return scrapPayloadURL(host+href, n)
}
Пример #8
0
// JoinNodesWithSpace returns the text of every element in s joined by a
// single space. An empty selection yields the empty string.
func JoinNodesWithSpace(s *goquery.Selection) string {
	parts := s.Map(func(_ int, item *goquery.Selection) string {
		return item.Text()
	})
	return strings.Join(parts, " ")
}
Пример #9
0
// dropTag turns every element of selection into a text node whose data is
// the element's text content. The nodes are modified in place.
func (this *parser) dropTag(selection *goquery.Selection) {
	selection.Each(func(_ int, match *goquery.Selection) {
		// Read the text before the node itself is mutated.
		content := match.Text()

		target := match.Get(0)
		target.Data = content
		target.Type = html.TextNode
	})
}
Пример #10
0
// parseGamePosition reads the text of the first child of selection and
// converts it to an integer. The conversion error (nil on success) is handed
// to helper.HandleFatalError.
func parseGamePosition(selection *goquery.Selection) (position int) {
	raw := selection.Children().First().Text()

	parsed, err := strconv.Atoi(strings.TrimSpace(raw))
	helper.HandleFatalError("parsing game position failed:", err)

	return parsed
}
Пример #11
0
// ScrapeExamples returns the text of every "span.h" element found under s,
// in document order. The result is an empty, non-nil slice when nothing
// matches.
func ScrapeExamples(s *goquery.Selection) []string {
	spans := s.Find("span.h")

	examples := make([]string, 0, len(spans.Nodes))
	spans.Each(func(_ int, span *goquery.Selection) {
		examples = append(examples, span.Text())
	})
	return examples
}
Пример #12
0
// extractCredits returns the trimmed text of the ".credits" element under
// selection. When that text contains a "#" the credits are reported as "0".
func extractCredits(selection *goquery.Selection) string {
	credits := trim(selection.Find(".credits").Text())
	if strings.Contains(credits, "#") {
		return "0"
	}
	return credits
}
Пример #13
0
// extractCourseDescription fetches the description of a course from the NJIT
// catalog.
//
// The href of the ".catalogdescription a" link under selection is sent as
// the Referer header of a GET request to the catalog's fsinjector endpoint.
// The description is the text that follows the "courseblockdesc" marker in
// the response, up to the next "<b". It returns "" when the request or any
// parsing step fails, or when the response carries no usable description.
func extractCourseDescription(selection *goquery.Selection) string {
	referer := trim(fmt.Sprintln(selection.Find(".catalogdescription a").AttrOr("href", "")))
	fmt.Println("LOGGING URL", referer)

	req, err := http.NewRequest("GET", "http://catalog.njit.edu/ribbit/index.cgi?format=html&page=fsinjector.rjs&fullpage=true", nil)
	if err != nil {
		return ""
	}
	req.Header.Add("Referer", referer)

	client := http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		return ""
	}
	// resp is non-nil whenever Do returns a nil error.
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		// A partially read body could produce a truncated description.
		return ""
	}

	result := substringAfter(string(body), "courseblockdesc")
	if len(result) < 4 {
		return ""
	}
	// Skip the three characters that follow the marker.
	result = substringBefore(result[3:], "<b")
	// Guard the index below: the helper may return an empty string.
	if result == "" || result[0] == '<' || strings.Contains(result, "at SISConnxService") {
		return ""
	}

	// The payload carries backslash-escaped quotes; unescape them before
	// parsing it as HTML.
	result = strings.Replace(result, "\\\"", "\"", -1)
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(result))
	if err != nil {
		return ""
	}

	return trim(doc.Text())
}
Пример #14
0
// convertTagToJqueryFormat turns the contents of an opening tag (the text
// between "<" and ">") into a jQuery-style selector step.
//
// A tag without a space, i.e. without attributes, is returned unchanged.
// Otherwise the result is the tag name, followed by "." and the first class
// of s when s has a non-blank class attribute.
func convertTagToJqueryFormat(tag string, s *goquery.Selection) string {
	pos := strings.Index(tag, " ")
	if pos < 0 {
		return tag
	}
	step := tag[:pos]

	// Attr yields "" for a missing attribute. Splitting with Fields handles
	// any whitespace between classes and ignores leading whitespace, so a
	// blank or padded class attribute never produces a dangling "tag.".
	class, _ := s.Attr("class")
	if classes := strings.Fields(class); len(classes) > 0 {
		// Keep only the first class of the list.
		step += "." + classes[0]
	}

	return step
}
Пример #15
0
// testList checks list and every "ul" nested inside it. Each nested list is
// checked recursively and then passed to RemoveFiltered("ul") before list
// itself is handed to checkAlphabeticOrder.
func testList(t *testing.T, list *goquery.Selection) {
	nested := list.Find("ul")
	nested.Each(func(_ int, sub *goquery.Selection) {
		testList(t, sub)
		sub.RemoveFiltered("ul")
	})

	checkAlphabeticOrder(t, list)
}
Пример #16
0
// delAttr removes the attribute located by indexOfAttribute from the first
// node of selection. Nothing happens when the reported index is negative.
func (this *parser) delAttr(selection *goquery.Selection, attr string) {
	idx := this.indexOfAttribute(selection, attr)
	if idx < 0 {
		return
	}

	target := selection.Get(0)
	attrs := target.Attr
	// Shift the tail one slot to the left and drop the last element.
	copy(attrs[idx:], attrs[idx+1:])
	target.Attr = attrs[:len(attrs)-1]
}
Пример #17
0
// toPage turns an anchor element that points at a markdown file into a Page
// for the documentation site.
//
// Anchors without an href, or whose href is "#", are skipped and produce
// (nil, nil). Otherwise the file at site.base/href is loaded with
// toDocument, the anchor's href is rewritten from ".md" to ".html", and a
// Page carrying the new href and the document's HTML is returned.
func toPage(site *Site, el *goquery.Selection) (*Page, error) {
	href, ok := el.Attr("href")
	if !ok {
		return nil, nil
	}
	if href == "#" {
		return nil, nil
	}

	// Load the markdown file behind the link as a DOM document.
	source := filepath.Join(site.base, href)
	doc, err := toDocument(source)
	if err != nil {
		return nil, err
	}

	// The generated site links to the rendered page, not the markdown file.
	htmlHref := strings.Replace(href, ".md", ".html", -1)
	el.SetAttr("href", htmlHref)

	page := &Page{Href: htmlHref}
	content, err := doc.Html()
	page.html = content
	return page, err
}
Пример #18
0
// name returns the value of the attribute called selector on selection, or
// "" when the attribute is absent.
func (this *parser) name(selector string, selection *goquery.Selection) string {
	if value, exists := selection.Attr(selector); exists {
		return value
	}
	return ""
}
Пример #19
0
// classWeight scores s by its class and id attributes. Each non-empty
// attribute value loses 25 points when it matches negativeRegexp and gains 25
// points when it matches positiveRegexp. The weight is always 0 when
// d.WeightClasses is disabled.
func (d *Document) classWeight(s *goquery.Selection) int {
	if !d.WeightClasses {
		return 0
	}

	className, _ := s.Attr("class")
	elementID, _ := s.Attr("id")

	total := 0
	for _, value := range []string{className, elementID} {
		if value == "" {
			continue
		}
		if negativeRegexp.MatchString(value) {
			total -= 25
		}
		if positiveRegexp.MatchString(value) {
			total += 25
		}
	}
	return total
}
// parseTranslations builds one Translation per element of elements from the
// element's meaning, href and phrase. The result is nil when elements is
// empty.
func parseTranslations(elements *goquery.Selection) (results []Translation) {
	collect := func(_ int, element *goquery.Selection) {
		meaning := parseMeaning(element)
		href := parseHref(element)
		phrase := parsePhrase(element)
		results = append(results, Translation{meaning, href, phrase})
	}
	elements.Each(collect)

	return results
}
Пример #21
0
// ParseCourse builds a Course from the "td" cells found under s. The cells
// are read by position: subject, catalog number, term, class number, title,
// instructor and credits. Numbers that fail to parse are stored as 0.
func ParseCourse(s *goquery.Selection) Course {
	cells := s.Find("td")
	cell := func(col int) string {
		return strings.TrimSpace(cells.Eq(col).Text())
	}

	subject := cell(0)
	catalog := cell(1)
	termStr := cell(2)
	class := cell(3)
	title := cell(4)
	instructor := cell(5)
	credits := cell(6)

	// Conversion errors are ignored; Atoi yields 0 in that case.
	catalogNum, _ := strconv.Atoi(catalog)
	classNum, _ := strconv.Atoi(class)

	// The term cell contains non-breaking spaces: drop the regular spaces and
	// keep only what precedes the first NBSP.
	withoutSpaces := strings.Replace(termStr, "\u0020", "", -1)
	termCleaned := strings.Split(withoutSpaces, "\u00A0")[0]

	return Course{
		Subject:       subject,
		CatalogNumber: catalogNum,
		ClassNumber:   classNum,
		Title:         title,
		Instructor:    instructor,
		Credits:       credits,
		Term:          NewTerm(termCleaned),
	}
}
// parsePhrase returns the whitespace-trimmed text of every
// ".translation_item" element found under selection. The result is nil when
// nothing matches.
func parsePhrase(selection *goquery.Selection) (result []string) {
	items := selection.Find(".translation_item")
	items.Each(func(_ int, item *goquery.Selection) {
		text := strings.TrimSpace(item.Text())
		result = append(result, text)
	})

	return result
}
Пример #23
0
// unpackMission fills a Mission from the children of s: division, name,
// link target, description, launch date (raw and human readable) and phase.
// Extraction problems are logged and leave the affected field at its zero
// value; a Mission is always returned.
func unpackMission(s *goquery.Selection) *Mission {
	cells := s.Children()
	// Next siblings of the cells; several fields are read from this set.
	following := cells.Next()
	mission := &Mission{}

	if division, err := cells.First().Html(); err != nil {
		log.Printf("Error parsing HTML: %+v\n", err)
	} else {
		mission.Division = division
	}

	link := following.Children()
	name, err := link.Html()
	if err != nil {
		log.Println("Error getting name: ", err)
	}
	mission.Name = strings.TrimSpace(name)

	href, found := link.Attr("href")
	if !found {
		log.Println("No href")
	}
	mission.Url = href

	desc, err := following.Find(".desc").Children().Html()
	if err != nil {
		log.Println("Err getting desc", err)
	}
	mission.Description = strings.TrimSpace(desc)

	launch := following.Next()
	// The error of this lookup is not reported.
	date, _ := launch.Children().Html()
	mission.LaunchDate = date

	// Strip the digits that surround the human readable date.
	human := strings.Trim(launch.First().Text(), "1234567890")
	mission.LaunchDateHuman = strings.TrimSpace(human)

	mission.Phase = strings.TrimLeft(cells.Last().Text(), "1234567890")
	return mission
}
Пример #24
0
// node returns a terminal representation of the first node of s.
//
// Text nodes are returned with surrounding whitespace trimmed. Headings,
// paragraphs, preformatted or highlighted blocks, links, list items, lists
// and inline code get their own formatting, partly using ANSI escape
// sequences. Every other node, and an empty selection, yields "".
// The index i is not used.
func node(i int, s *goquery.Selection) string {
	if len(s.Nodes) == 0 {
		return ""
	}

	n := s.Get(0)

	// Decide on the node type first. For a text node Data holds the text
	// itself, so text such as "a" or "li" must not be mistaken for a tag name.
	switch n.Type {
	case html.TextNode:
		return strings.TrimSpace(n.Data)
	case html.ElementNode:
		// Handled by tag name below.
	default:
		return ""
	}

	switch {
	case n.Data == "h1":
		return fmt.Sprintf(" \033[%dm# %s\033[0m\n\n", blue, text(s))
	case n.Data == "h2":
		return fmt.Sprintf(" \033[%dm## %s\033[0m\n\n", blue, text(s))
	case n.Data == "h3":
		return fmt.Sprintf(" \033[%dm### %s\033[0m\n\n", blue, text(s))
	case n.Data == "p":
		return fmt.Sprintf("\033[%dm%s\033[0m\n\n", none, indent(text(s), 1))
	case n.Data == "pre" || s.HasClass("highlight"):
		// Checked before the remaining tags: any other element carrying the
		// "highlight" class is rendered as a preformatted block.
		return fmt.Sprintf("\033[1m%s\033[0m\n\n", indent(text(s), 2))
	case n.Data == "a":
		return fmt.Sprintf("%s (%s) ", s.Text(), s.AttrOr("href", "missing link"))
	case n.Data == "li":
		return fmt.Sprintf("  • %s\n", contents(s))
	case n.Data == "ul":
		return fmt.Sprintf("%s\n", nodes(s))
	case n.Data == "code":
		return fmt.Sprintf("\033[1m%s\033[0m ", s.Text())
	default:
		return ""
	}
}
Пример #25
0
// getSrc returns the "src" attribute of node, or "" when it is absent.
func (ve *VideoExtractor) getSrc(node *goquery.Selection) string {
	if src, found := node.Attr("src"); found {
		return src
	}
	return ""
}
Пример #26
0
// score rates tag by its source URL and a few attributes.
//
// The URL is the first non-empty value among the src, data-src and
// data-lazy-src attributes; without one the score is -1. Otherwise the score
// is the sum of the weights of all entries in rules whose pattern matches the
// URL, minus one when the alt text contains "thumbnail", plus one when the id
// is "fbPhotoImage".
func score(tag *goquery.Selection) int {
	var src string
	for _, attr := range []string{"src", "data-src", "data-lazy-src"} {
		if src, _ = tag.Attr(attr); src != "" {
			break
		}
	}
	if src == "" {
		return -1
	}

	total := 0
	for pattern, weight := range rules {
		if pattern.MatchString(src) {
			total += weight
		}
	}

	if alt, ok := tag.Attr("alt"); ok && strings.Contains(alt, "thumbnail") {
		total--
	}
	if id, ok := tag.Attr("id"); ok && id == "fbPhotoImage" {
		total++
	}
	return total
}
Пример #27
0
// replaceWithPara renames the first node of div to a "p" element in place.
// An empty selection is left untouched.
func (this *cleaner) replaceWithPara(div *goquery.Selection) {
	if div.Size() <= 0 {
		return
	}

	first := div.Get(0)
	first.DataAtom = atom.P
	first.Data = atom.P.String()
}
Пример #28
0
// tweetMentionPrefixRxx matches one "@name" mention, including its trailing
// whitespace, at the very start of a tweet. It is compiled once instead of on
// every call to findSigInTweet.
var tweetMentionPrefixRxx = regexp.MustCompile(`^(@[a-zA-Z0-9_-]+\s+)`)

// findSigInTweet checks that the text of the tweet in s starts with the
// signature text of h, after any leading "@name" mentions have been
// stripped.
//
// It returns nil on success, a CONTENT_FAILURE proof error when the HTML of
// the tweet cannot be rendered, and a DELETED proof error when the signature
// text is not at the start of the tweet.
func (rc *TwitterChecker) findSigInTweet(h SigHint, s *goquery.Selection) ProofError {
	inside := s.Text()
	html, err := s.Html()
	if err != nil {
		return NewProofError(keybase1.ProofStatus_CONTENT_FAILURE, "No HTML tweet found: %s", err)
	}

	checkText := h.checkText

	G.Log.Debug("+ Checking tweet '%s' for signature '%s'", inside, checkText)
	G.Log.Debug("| HTML is: %s", html)

	// Strip leading mentions one at a time until none is left.
	for {
		m := tweetMentionPrefixRxx.FindStringSubmatchIndex(inside)
		if m == nil {
			break
		}
		prefix := inside[m[2]:m[3]]
		inside = inside[m[3]:]
		G.Log.Debug("| Stripping off @prefx: %s", prefix)
	}

	if strings.HasPrefix(inside, checkText) {
		return nil
	}

	return NewProofError(keybase1.ProofStatus_DELETED, "Could not find '%s' in '%s'",
		checkText, inside)
}
Пример #29
0
Файл: form.go Проект: kekek/test
// serializeForm converts the named input, button and textarea elements under
// sel into url.Values.
//
// It returns two sets of values: the first holds the form fields, the second
// the submit buttons. Elements without a name attribute are skipped. Both
// results are empty, non-nil maps when nothing matches.
func serializeForm(sel *goquery.Selection) (url.Values, url.Values) {
	fields := make(url.Values)
	buttons := make(url.Values)

	sel.Find("input,button,textarea").Each(func(_ int, s *goquery.Selection) {
		name, ok := s.Attr("name")
		if !ok {
			return
		}

		// A textarea has no value attribute; its value is its text content.
		if s.Is("textarea") {
			fields.Add(name, s.Text())
			return
		}

		// Apply the HTML defaults when the type attribute is missing so such
		// controls are not silently dropped: "submit" for <button>, "text"
		// for <input>.
		typ, ok := s.Attr("type")
		if !ok {
			if s.Is("button") {
				typ = "submit"
			} else {
				typ = "text"
			}
		}

		// A missing value attribute serializes as the empty string.
		val, _ := s.Attr("value")
		if typ == "submit" {
			buttons.Add(name, val)
		} else {
			fields.Add(name, val)
		}
	})

	return fields, buttons
}
Пример #30
0
// GetText returns the inner HTML of the first "td" under s after passing it
// through TrimLinefeed, replacing every "<br/>" with a comma, removing the
// "デッキレベル0再録" marker and finally applying ReplaceIcon.
func GetText(s *goquery.Selection) string {
	// An Html error leaves the markup empty and is otherwise ignored.
	markup, _ := s.Find("td").Html()

	cleaned := strings.Replace(TrimLinefeed(markup), "<br/>", ",", -1)
	cleaned = strings.Replace(cleaned, "デッキレベル0再録", "", -1)

	return ReplaceIcon(cleaned)
}