Beispiel #1
0
// serializeForm converts the named controls found under sel into url.Values.
//
// Two sets are returned: the first holds regular field values, the second
// holds the values of submit buttons (elements whose type attribute is
// "submit"). Elements without a name attribute are skipped, as are elements
// other than textarea that carry no type attribute. A missing value attribute
// is recorded as the empty string.
func serializeForm(sel *goquery.Selection) (url.Values, url.Values) {
	fields := make(url.Values)
	buttons := make(url.Values)

	sel.Find("input,button,textarea").Each(func(_ int, s *goquery.Selection) {
		name, ok := s.Attr("name")
		if !ok {
			return
		}

		isTextarea := s.Is("textarea")
		typ, hasType := s.Attr("type")
		if !hasType && !isTextarea {
			return
		}

		switch {
		case typ == "submit":
			// Attr yields "" for an absent attribute, which is the value wanted here.
			val, _ := s.Attr("value")
			buttons.Add(name, val)
		case isTextarea:
			// A textarea keeps its content as child text rather than in a
			// value attribute, so reading the attribute would always give "".
			fields.Add(name, s.Text())
		default:
			val, _ := s.Attr("value")
			fields.Add(name, val)
		}
	})

	return fields, buttons
}
Beispiel #2
0
// encuentraHorarios builds one Horario for every "tr" found under tabla.
//
// The first five "td" cells of a row are read as whitespace-trimmed text and
// mapped, in order, to puesto, nombre and the three arguments of generaDia
// (days, hours and room strings). Cells the row does not have are treated as
// empty strings.
func encuentraHorarios(tabla *goquery.Selection) (horarios []Horario) {
	tabla.Find("tr").Each(func(_ int, fila *goquery.Selection) {
		celdas := fila.Find("td")
		total := celdas.Length()

		// texto returns the trimmed text of cell idx, or "" when the row is too short.
		texto := func(idx int) string {
			if idx >= total {
				return ""
			}
			return strings.TrimSpace(celdas.Eq(idx).Text())
		}

		horarios = append(horarios, Horario{
			puesto: texto(0),
			nombre: texto(1),
			dias:   generaDia(texto(2), texto(3), texto(4)),
		})
	})
	return horarios
}
// parsePhrase collects the whitespace-trimmed text of every
// ".translation_item" element under selection, in the order Each visits them.
// The result is nil when nothing matches.
func parsePhrase(selection *goquery.Selection) (result []string) {
	items := selection.Find(".translation_item")
	items.Each(func(_ int, item *goquery.Selection) {
		text := strings.TrimSpace(item.Text())
		result = append(result, text)
	})
	return result
}
Beispiel #4
0
// testList runs checkAlphabeticOrder on list after handling every "ul" found
// beneath it: each such sub-list is passed to testList recursively and then
// has RemoveFiltered("ul") applied to it.
func testList(t *testing.T, list *goquery.Selection) {
	nested := list.Find("ul")
	nested.Each(func(_ int, sub *goquery.Selection) {
		// Order matters: the sub-list is checked before RemoveFiltered is applied to it.
		testList(t, sub)
		sub.RemoveFiltered("ul")
	})

	checkAlphabeticOrder(t, list)
}
// Parse fills tweet from a div.tweet selection.
//
// The attributes data-item-id, data-screen-name and data-name are mandatory:
// if any of them is absent, Success is set to 0, no other field is modified
// and an error naming the missing attribute is returned. Otherwise ItemID,
// ScreenName and Name are populated and Success is set to 1. Time and Text are
// optional and may end up empty.
func (tweet *Tweet) Parse(s *goquery.Selection) (err error) {
	required := []string{
		"data-item-id",
		"data-screen-name",
		"data-name",
	}

	// Gather every mandatory value first so a missing attribute leaves the
	// identity fields unmodified.
	found := make([]string, len(required))
	for i, name := range required {
		value, present := s.Attr(name)
		if !present {
			tweet.Success = 0
			return fmt.Errorf("not having %s attribute", name)
		}
		found[i] = value
	}

	tweet.ItemID, tweet.ScreenName, tweet.Name = found[0], found[1], found[2]
	tweet.Success = 1

	// With the mandatory attributes in place, the remaining values may be blank.
	tweet.Time, _ = s.Find("._timestamp").Attr("data-time")
	tweet.Text = s.Find(".tweet-text").Text()
	return nil
}
Beispiel #6
0
// ScrapeExamples returns the text of every "span.h" element under s.
// The returned slice is never nil; it is empty when nothing matches.
func ScrapeExamples(s *goquery.Selection) []string {
	examples := make([]string, 0)
	spans := s.Find("span.h")
	spans.Each(func(_ int, span *goquery.Selection) {
		examples = append(examples, span.Text())
	})
	return examples
}
Beispiel #7
0
// scrapPayload looks up the href attribute of the "a" elements under s,
// prefixes it with the developer.github.com origin and passes the resulting
// URL together with n to scrapPayloadURL.
//
// When no href is found die is called; if die returns, the lookup proceeds
// with an empty path.
func scrapPayload(s *goquery.Selection, n int) string {
	href, found := s.Find("a").Attr("href")
	if !found {
		die("unable to find URL for scrapping")
	}
	target := "https://developer.github.com" + href
	return scrapPayloadURL(target, n)
}
Beispiel #8
0
// extractCredits returns the trimmed text of the ".credits" elements under
// selection, or "0" when that text contains a '#'.
func extractCredits(selection *goquery.Selection) string {
	credits := trim(selection.Find(".credits").Text())
	if strings.Contains(credits, "#") {
		return "0"
	}
	return credits
}
Beispiel #9
0
// ParseCourse builds a Course from the "td" cells under s.
//
// Cells 0 through 6 are read as subject, catalog number, term, class number,
// title, instructor and credits. The two numeric cells are converted with
// strconv.Atoi and become 0 when they do not parse. The term cell has its
// ordinary spaces removed and is cut at the first non-breaking space before
// being handed to NewTerm.
func ParseCourse(s *goquery.Selection) Course {
	cells := s.Find("td")
	// cell returns the whitespace-trimmed text of the i-th cell.
	cell := func(i int) string {
		return strings.TrimSpace(cells.Eq(i).Text())
	}

	subject := cell(0)
	catalog := cell(1)
	rawTerm := cell(2)
	class := cell(3)
	title := cell(4)
	instructor := cell(5)
	credits := cell(6)

	// Conversion failures are deliberately ignored; the number is then 0.
	catalogNum, _ := strconv.Atoi(catalog)
	classNum, _ := strconv.Atoi(class)

	// The term cell mixes plain spaces with U+00A0: drop the former, then
	// keep only what precedes the first of the latter.
	noSpaces := strings.Replace(rawTerm, "\u0020", "", -1)
	termCleaned := strings.Split(noSpaces, "\u00A0")[0]

	return Course{
		Subject:       subject,
		CatalogNumber: catalogNum,
		ClassNumber:   classNum,
		Title:         title,
		Instructor:    instructor,
		Credits:       credits,
		Term:          NewTerm(termCleaned),
	}
}
Beispiel #10
0
// isHighLinkDensity reports whether node is dominated by link text.
//
// The score is (space-separated words in the concatenated link texts /
// space-separated words in the node text) multiplied by the number of "a"
// elements; a score above 1.0 counts as high density. A node without links
// is never high density. When debugging is enabled the score is logged with
// the first 50 bytes of the node text.
func (this *contentExtractor) isHighLinkDensity(node *goquery.Selection) bool {
	anchors := node.Find("a")
	if anchors == nil || anchors.Size() == 0 {
		return false
	}

	nodeText := node.Text()
	wordCount := len(strings.Split(nodeText, " "))

	linkTexts := make([]string, 0)
	anchors.Each(func(_ int, a *goquery.Selection) {
		linkTexts = append(linkTexts, a.Text())
	})
	// Link texts are joined without a separator before being split into words.
	linkWordCount := len(strings.Split(strings.Join(linkTexts, ""), " "))

	ratio := float64(linkWordCount) / float64(wordCount)
	score := ratio * float64(anchors.Size())

	if this.config.debug {
		preview := nodeText
		if len(preview) >= 51 {
			preview = preview[0:50]
		}
		log.Printf("Calculated link density score as %1.5f for node %s\n", score, preview)
	}

	return score > 1.0
}
Beispiel #11
0
// extractCourseDescription fetches the catalog description for the course
// linked from selection.
//
// The href of the ".catalogdescription a" element is sent as the Referer of a
// GET request to the catalog's fsinjector endpoint. The part of the response
// after "courseblockdesc" (skipping three bytes) and before the next "<b" is
// un-escaped, parsed as HTML and returned as trimmed text.
//
// An empty string is returned when the request or the read of its body fails,
// when the expected markers are missing, when the extracted fragment is empty
// or starts with '<', when it contains "at SISConnxService", or when the
// fragment cannot be parsed.
func extractCourseDescription(selection *goquery.Selection) string {
	referer := trim(fmt.Sprintln(selection.Find(".catalogdescription a").AttrOr("href", "")))
	fmt.Println("LOGGING URL", referer)

	req, err := http.NewRequest("GET", "http://catalog.njit.edu/ribbit/index.cgi?format=html&page=fsinjector.rjs&fullpage=true", nil)
	if err != nil {
		return ""
	}
	req.Header.Add("Referer", referer)

	client := http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		return ""
	}
	// A nil error from Do guarantees a non-nil response.
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return ""
	}

	result := substringAfter(string(body), "courseblockdesc")
	if len(result) < 4 {
		return ""
	}
	result = substringBefore(result[3:], "<b")
	// The fragment can be empty here; test for that before inspecting its
	// first byte, otherwise indexing it would panic.
	if result == "" || strings.HasPrefix(result, "<") || strings.Contains(result, "at SISConnxService") {
		return ""
	}

	result = strings.Replace(result, "\\\"", "\"", -1)
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(result))
	if err != nil {
		// No document is available to read text from after a parse failure.
		return ""
	}

	return trim(doc.Text())
}
Beispiel #12
0
// getSiblingsContent returns the paragraphs of currentSibling worth keeping.
//
// If the first node of currentSibling is itself a non-empty "p", it is
// returned as the only element. Otherwise every non-empty "p" beneath it is
// scored: a paragraph is kept when its stop-word count exceeds 0.30 times
// baselinescoreSiblingsPara and it is not link-dense. Each kept paragraph is
// returned as a fresh single-node selection holding only its text.
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection {
	if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 {
		return []*goquery.Selection{currentSibling}
	}

	kept := make([]*goquery.Selection, 0)
	siblingBaselineScore := 0.30
	threshold := siblingBaselineScore * baselinescoreSiblingsPara

	currentSibling.Find("p").Each(func(_ int, para *goquery.Selection) {
		text := para.Text()
		if len(text) == 0 {
			return
		}

		stats := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text)
		// The density check is evaluated for every non-empty paragraph,
		// independently of how the paragraph scores.
		dense := this.isHighLinkDensity(para)

		if threshold < float64(stats.stopWordCount) && !dense {
			// Wrap the paragraph text in a detached text node tagged as "p".
			textNode := &html.Node{
				Type:     html.TextNode,
				Data:     text,
				DataAtom: atom.P,
			}
			wrapped := &goquery.Selection{Nodes: []*html.Node{textNode}}
			kept = append(kept, wrapped)
		}
	})

	return kept
}
Beispiel #13
0
// GetText returns the inner HTML of the "td" elements under s after passing
// it through TrimLinefeed, turning each "<br/>" into a comma, removing the
// "デッキレベル0再録" marker and finally applying ReplaceIcon.
func GetText(s *goquery.Selection) string {
	// An error from Html is ignored; whatever markup came back is processed.
	markup, _ := s.Find("td").Html()

	withCommas := strings.Replace(TrimLinefeed(markup), "<br/>", ",", -1)
	withoutMarker := strings.Replace(withCommas, "デッキレベル0再録", "", -1)

	return ReplaceIcon(withoutMarker)
}
Beispiel #14
0
// ScrapeNotes returns the title attribute of every "abbr" under s, excluding
// those matching "abbr:first-of-type" and those with class "c". An element
// without a title contributes an empty string. The returned slice is never
// nil.
func ScrapeNotes(s *goquery.Selection) []string {
	notes := make([]string, 0)

	candidates := s.Find("abbr").Not("abbr:first-of-type").Not("abbr.c")
	candidates.Each(func(_ int, abbr *goquery.Selection) {
		title, _ := abbr.Attr("title")
		notes = append(notes, title)
	})

	return notes
}
Beispiel #15
0
// ScrapeOrigins returns the title attribute of every "abbr.c" element under
// s. An element without a title contributes an empty string. The returned
// slice is never nil.
func ScrapeOrigins(s *goquery.Selection) []string {
	origins := make([]string, 0)

	marked := s.Find("abbr.c")
	marked.Each(func(_ int, abbr *goquery.Selection) {
		title, _ := abbr.Attr("title")
		origins = append(origins, title)
	})

	return origins
}
// processMagnet sets t.Magnet to the whitespace-trimmed href of the first
// ".download a" element under torrentData, or to "" when there is no href.
func (t *TorrentEntry) processMagnet(torrentData *goquery.Selection) {
	link := torrentData.Find(".download a").First()

	// Attr yields "" for a missing attribute, and trimming "" leaves it empty,
	// so the absent case needs no separate branch.
	href, _ := link.Attr("href")
	t.Magnet = strings.TrimSpace(href)
}
Beispiel #17
0
// getCaptionFromClass is a workaround that makes captions work for
// http://www.bloomberg.com/graphics/2015-paul-ford-what-is-code/
//
// It returns the text of the ".photoCaption" elements under doc and removes
// those elements, so the caption no longer shows up in the regular body text.
func getCaptionFromClass(doc *goquery.Selection) string {
	captions := doc.Find(".photoCaption")
	text := captions.Text()

	// Detach the caption nodes only after their text has been captured.
	captions.Remove()

	return text
}
Beispiel #18
0
// parse_station looks up the station whose label is the text of the name_div
// elements under node. It returns the matching entry from cb.stations, or a
// zero Station and an error when the label is not a known key.
func (cb *citibike) parse_station(node *goquery.Selection, name_div string) (Station, error) {
	label := node.Find(name_div).Text()

	known, found := (*cb.stations)[label]
	if !found {
		return Station{}, fmt.Errorf("Unknown station: %s", label)
	}
	return known, nil
}
Beispiel #19
0
// getDate parses the trimmed text of the last "font" element under td as a
// day.month.year date ("02.01.2006" layout). The zero time.Time is returned
// when the text does not parse.
func getDate(td *goquery.Selection) (date time.Time) {
	raw := td.Find("font").Last().Text()

	parsed, err := time.Parse("02.01.2006", strings.TrimSpace(raw))
	if err != nil {
		return time.Time{}
	}
	return parsed
}
Beispiel #20
0
// extractRoomNum returns the trimmed text of the ".room" element under
// selection, with every "<br/>" in its markup turned into a newline first.
//
// If the rewritten markup cannot be parsed, the error is printed and an empty
// string is returned.
func extractRoomNum(selection *goquery.Selection) string {
	// An error from Html is ignored; whatever markup came back is processed.
	s, _ := selection.Find(".room").Html()
	s = strings.Replace(s, "<br/>", "\n", -1)

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(s))
	if err != nil {
		fmt.Print(err)
		// No document is available after a parse failure; stop here rather
		// than dereference a nil document.
		return ""
	}
	return trim(doc.Text())
}
Beispiel #21
0
// getTimerValue reads the class attribute of the "span.timer" element under s
// and strips a trailing " timer" and a leading "timeleft_" from it, so a
// class of "timeleft_42 timer" yields "42". A missing attribute yields "".
func getTimerValue(s *goquery.Selection) string {
	class := s.Find("span.timer").AttrOr("class", "")
	withoutSuffix := strings.TrimSuffix(class, " timer")
	return strings.TrimPrefix(withoutSuffix, "timeleft_")
}
Beispiel #22
0
// parseArticleSection runs parseArticle on every "article" element under
// section and returns the results in the order they were visited.
func parseArticleSection(section *goquery.Selection) Articles {
	parsed := Articles{}

	articles := section.Find("article")
	articles.Each(func(_ int, article *goquery.Selection) {
		parsed = append(parsed, parseArticle(article))
	})

	return parsed
}
Beispiel #23
0
// getElementsByTags returns the union of the elements under div matched by
// each selector in tags. The accumulation starts from a new, empty selection.
func (this *parser) getElementsByTags(div *goquery.Selection, tags []string) *goquery.Selection {
	combined := new(goquery.Selection)
	for _, tag := range tags {
		if matched := div.Find(tag); matched != nil {
			combined = combined.Union(matched)
		}
	}
	return combined
}
Beispiel #24
0
// parseHeader fills info from the "span" elements under element: the text of
// the first span goes through parseTrainDenomination to set Category, Number
// and Name, and the text of the third span goes through parseTrainRoute to
// set From and To. All other spans are ignored.
func parseHeader(element *goquery.Selection, info *TrainInfo) {
	element.Find("span").Each(func(pos int, span *goquery.Selection) {
		if pos == 0 {
			info.Category, info.Number, info.Name = parseTrainDenomination(span.Text())
		} else if pos == 2 {
			info.From, info.To = parseTrainRoute(span.Text())
		}
	})
}
Beispiel #25
0
// getLinkDensity returns the byte length of the text inside the "a" elements
// under s divided by the byte length of all text in s. It returns 0 when s
// has no text.
func (d *Document) getLinkDensity(s *goquery.Selection) float32 {
	total := len(s.Text())
	if total == 0 {
		return 0
	}

	linked := len(s.Find("a").Text())
	return float32(linked) / float32(total)
}
Beispiel #26
0
// FetchTexts returns the text of every element under doc that matches query,
// one entry per element. The returned error is always nil.
func FetchTexts(doc *goquery.Selection, query string) ([]string, *Error) {
	matches := doc.Find(query)
	texts := make([]string, matches.Size())

	// Each supplies the position of every element, which is used as the slot.
	matches.Each(func(pos int, match *goquery.Selection) {
		texts[pos] = match.Text()
	})

	return texts, nil
}
Beispiel #27
0
// getLimit reads the text of the ".limit" element under s, strips a leading
// "/ " and converts the remainder to an int. It returns 0 when the remainder
// is not a valid integer.
func getLimit(s *goquery.Selection) (limit int) {
	raw := s.Find(".limit").Text()
	digits := strings.TrimPrefix(raw, "/ ")

	parsed, err := strconv.Atoi(digits)
	if err != nil {
		return 0
	}
	return parsed
}
Beispiel #28
0
// ScrapeDefinition builds a Definition from s.
//
// Category is the title of the first "span[title]" element. Definition is the
// trimmed text of a copy of the "span.b" elements from which every child
// other than "a" has been removed; s itself is not modified by that step.
// Origin, Notes and Examples come from ScrapeOrigins, ScrapeNotes and
// ScrapeExamples.
func ScrapeDefinition(s *goquery.Selection) *Definition {
	category, _ := s.Find("span[title]").First().Attr("title")

	// Prune a clone so the removal does not touch the original nodes.
	body := s.Find("span.b").Clone()
	body.Children().Not("a").Remove()
	definition := strings.TrimSpace(body.Text())

	return &Definition{
		Category:   category,
		Definition: definition,
		Origin:     ScrapeOrigins(s),
		Notes:      ScrapeNotes(s),
		Examples:   ScrapeExamples(s),
	}
}
Beispiel #29
0
// ScrapeDefinition builds a Definition from s.
//
// Category is the title of the first "abbr" element. Definition is produced
// by JoinNodesWithSpace from the siblings that follow the first child of s,
// leaving out "abbr" and "span.h" elements. Origin, Notes and Examples come
// from ScrapeOrigins, ScrapeNotes and ScrapeExamples.
func ScrapeDefinition(s *goquery.Selection) *Definition {
	category, _ := s.Find("abbr").First().Attr("title")

	afterFirst := s.Children().First().NextAll()
	definitionNodes := afterFirst.Not("abbr").Not("span.h")
	definition := JoinNodesWithSpace(definitionNodes)

	return &Definition{
		Category:   category,
		Definition: definition,
		Origin:     ScrapeOrigins(s),
		Notes:      ScrapeNotes(s),
		Examples:   ScrapeExamples(s),
	}
}
Beispiel #30
0
// processTr writes the cells of the table row tr to fRstOutput.
//
// The text of every "td" is split with StringToLines; each line is trimmed
// and written on its own output line, preceded by the prefix that
// rstListTablePrefixOfEachLine returns for the cell index and line index.
// Write errors are not reported.
func processTr(tr *goquery.Selection, fRstOutput *os.File) {
	tr.Find("td").Each(func(indexOfTd int, td *goquery.Selection) {
		for indexOfLine, line := range StringToLines(td.Text()) {
			prefix := rstListTablePrefixOfEachLine(indexOfTd, indexOfLine)
			// The prefix and cell text are data, not format strings: pass
			// them as arguments so a '%' in them is written verbatim.
			fmt.Fprintf(fRstOutput, "%s%s\n", prefix, strings.TrimSpace(line))
		}
	})
}