// Serialize converts the form fields into a url.Values type. // Returns two url.Value types. The first is the form field values, and the // second is the form button values. func serializeForm(sel *goquery.Selection) (url.Values, url.Values) { input := sel.Find("input,button,textarea") if input.Length() == 0 { return url.Values{}, url.Values{} } fields := make(url.Values) buttons := make(url.Values) input.Each(func(_ int, s *goquery.Selection) { name, ok := s.Attr("name") if ok { typ, ok := s.Attr("type") if ok || s.Is("textarea") { if typ == "submit" { val, ok := s.Attr("value") if ok { buttons.Add(name, val) } else { buttons.Add(name, "") } } else { val, ok := s.Attr("value") if !ok { val = "" } fields.Add(name, val) } } } }) return fields, buttons }
func encuentraHorarios(tabla *goquery.Selection) (horarios []Horario) { trs := tabla.Find("tr") trs.Each(func(i int, tr *goquery.Selection) { tds := tr.Find("td") var puesto, nombre, diasStr, horaStr, salonStr string if tds.Length() > 0 { puesto = strings.TrimSpace(tds.Eq(0).Text()) } if tds.Length() > 1 { nombre = strings.TrimSpace(tds.Eq(1).Text()) } if tds.Length() > 2 { diasStr = strings.TrimSpace(tds.Eq(2).Text()) } if tds.Length() > 3 { horaStr = strings.TrimSpace(tds.Eq(3).Text()) } if tds.Length() > 4 { salonStr = strings.TrimSpace(tds.Eq(4).Text()) } dia := generaDia(diasStr, horaStr, salonStr) horario := Horario{puesto: puesto, nombre: nombre, dias: dia} horarios = append(horarios, horario) }) return }
func parsePhrase(selection *goquery.Selection) (result []string) { selection.Find(".translation_item").Each(func(index int, meaning *goquery.Selection) { result = append(result, strings.TrimSpace(meaning.Text())) }) return }
func testList(t *testing.T, list *goquery.Selection) { list.Find("ul").Each(func(_ int, items *goquery.Selection) { testList(t, items) items.RemoveFiltered("ul") }) checkAlphabeticOrder(t, list) }
// Parse from div.tweet func (tweet *Tweet) Parse(s *goquery.Selection) (err error) { success := false attrs := []string{ "data-item-id", "data-screen-name", "data-name", } data := map[string]string{} for _, attr := range attrs { var value string if value, success = s.Attr(attr); !success { tweet.Success = 0 err = fmt.Errorf("not having %s attribute", attr) return } data[attr] = value } tweet.ItemID = data["data-item-id"] tweet.ScreenName = data["data-screen-name"] tweet.Name = data["data-name"] tweet.Success = 1 // if could get the above attribues, allow the following values to be blank. tweet.Time, _ = s.Find("._timestamp").Attr("data-time") tweet.Text = s.Find(".tweet-text").Text() return }
func ScrapeExamples(s *goquery.Selection) []string { examples := []string{} s.Find("span.h").Each(func(i int, s *goquery.Selection) { examples = append(examples, s.Text()) }) return examples }
func scrapPayload(s *goquery.Selection, n int) string { url, ok := s.Find("a").Attr("href") if !ok { die("unable to find URL for scrapping") } return scrapPayloadURL("https://developer.github.com"+url, n) }
// extractCredits returns the trimmed ".credits" text; any value
// containing "#" is reported as "0".
func extractCredits(selection *goquery.Selection) string {
	credits := trim(selection.Find(".credits").Text())
	if strings.Contains(credits, "#") {
		return "0"
	}
	return credits
}
func ParseCourse(s *goquery.Selection) Course { subject := strings.TrimSpace(s.Find("td").Eq(0).Text()) catalog := strings.TrimSpace(s.Find("td").Eq(1).Text()) termStr := strings.TrimSpace(s.Find("td").Eq(2).Text()) class := strings.TrimSpace(s.Find("td").Eq(3).Text()) title := strings.TrimSpace(s.Find("td").Eq(4).Text()) instructor := strings.TrimSpace(s.Find("td").Eq(5).Text()) credits := strings.TrimSpace(s.Find("td").Eq(6).Text()) catalogNum, _ := strconv.Atoi(catalog) classNum, _ := strconv.Atoi(strings.TrimSpace(class)) // Damn you unicode NBSP!!! filter := strings.Replace(termStr, "\u0020", "", -1) termCleaned := strings.Split(filter, "\u00A0")[0] course := Course{ Subject: subject, CatalogNumber: catalogNum, ClassNumber: classNum, Title: title, Instructor: instructor, Credits: credits, Term: NewTerm(termCleaned), } return course }
//checks the density of links within a node, is there not much text and most of it contains bad links? //if so it's no good func (this *contentExtractor) isHighLinkDensity(node *goquery.Selection) bool { links := node.Find("a") if links == nil || links.Size() == 0 { return false } text := node.Text() words := strings.Split(text, " ") nwords := len(words) sb := make([]string, 0) links.Each(func(i int, s *goquery.Selection) { linkText := s.Text() sb = append(sb, linkText) }) linkText := strings.Join(sb, "") linkWords := strings.Split(linkText, " ") nlinkWords := len(linkWords) nlinks := links.Size() linkDivisor := float64(nlinkWords) / float64(nwords) score := linkDivisor * float64(nlinks) if this.config.debug { logText := "" if len(node.Text()) >= 51 { logText = node.Text()[0:50] } else { logText = node.Text() } log.Printf("Calculated link density score as %1.5f for node %s\n", score, logText) } if score > 1.0 { return true } return false }
func extractCourseDescription(selection *goquery.Selection) string { url := trim(fmt.Sprintln(selection.Find(".catalogdescription a").AttrOr("href", ""))) fmt.Println("LOGGING URL", url) client := http.Client{} req, _ := http.NewRequest("GET", "http://catalog.njit.edu/ribbit/index.cgi?format=html&page=fsinjector.rjs&fullpage=true", nil) req.Header.Add("Referer", url) resp, err := client.Do(req) if err != nil { return "" } if resp != nil { defer resp.Body.Close() } body, _ := ioutil.ReadAll(resp.Body) //checkError(err) result := substringAfter(string(body), "courseblockdesc") if len(result) < 4 { return "" } result = substringBefore(result[3:], "<b") if string(result[0]) == "<" || strings.Contains(result, "at SISConnxService") { return "" } result = strings.Replace(result, "\\\"", "\"", -1) doc, _ := goquery.NewDocumentFromReader(strings.NewReader(result)) return trim(doc.Text()) }
// getSiblingsContent collects paragraph content from a sibling node that
// is worth merging into the extracted text.
//
// If the sibling is itself a non-empty <p>, it is returned directly.
// Otherwise each <p> inside the sibling is kept when its stop-word count
// exceeds 30% of baselinescoreSiblingsPara and it is not link-heavy;
// kept paragraphs are wrapped in freshly built text nodes rather than
// referencing the live document.
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection {
	ps := make([]*goquery.Selection, 0)
	// Fast path: the sibling itself is a paragraph with text.
	if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 {
		ps = append(ps, currentSibling)
		return ps
	} else {
		potentialParagraphs := currentSibling.Find("p")
		potentialParagraphs.Each(func(i int, s *goquery.Selection) {
			text := s.Text()
			if len(text) > 0 {
				ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text)
				paragraphScore := ws.stopWordCount
				siblingBaselineScore := 0.30
				highLinkDensity := this.isHighLinkDensity(s)
				score := siblingBaselineScore * baselinescoreSiblingsPara
				// Keep the paragraph only if it beats the scaled baseline
				// and is not mostly links.
				if score < float64(paragraphScore) && !highLinkDensity {
					// Build a detached text node carrying the paragraph
					// text instead of pointing at the document node.
					node := new(html.Node)
					node.Type = html.TextNode
					node.Data = text
					node.DataAtom = atom.P
					nodes := make([]*html.Node, 1)
					nodes[0] = node
					newSelection := new(goquery.Selection)
					newSelection.Nodes = nodes
					ps = append(ps, newSelection)
				}
			}
		})
	}
	return ps
}
func GetText(s *goquery.Selection) string { texts, _ := s.Find("td").Html() texts = TrimLinefeed(texts) texts = strings.Replace(texts, "<br/>", ",", -1) texts = strings.Replace(texts, "デッキレベル0再録", "", -1) return ReplaceIcon(texts) }
func ScrapeNotes(s *goquery.Selection) []string { notes := []string{} s.Find("abbr").Not("abbr:first-of-type").Not("abbr.c").Each(func(i int, s *goquery.Selection) { note, _ := s.Attr("title") notes = append(notes, note) }) return notes }
func ScrapeOrigins(s *goquery.Selection) []string { origins := []string{} s.Find("abbr.c").Each(func(i int, s *goquery.Selection) { origin, _ := s.Attr("title") origins = append(origins, origin) }) return origins }
func (t *TorrentEntry) processMagnet(torrentData *goquery.Selection) { u, pU := torrentData.Find(".download a").First().Attr("href") if pU { t.Magnet = strings.TrimSpace(u) } else { t.Magnet = "" } }
// getCaptionFromClass is a hack to get captions working for // http://www.bloomberg.com/graphics/2015-paul-ford-what-is-code/ func getCaptionFromClass(doc *goquery.Selection) string { caption := doc.Find(".photoCaption").Text() // Don't want caption text to appear in normal text body anymore: doc.Find(".photoCaption").Remove() return caption }
func (cb *citibike) parse_station(node *goquery.Selection, name_div string) (Station, error) { station_label := node.Find(name_div).Text() if station, ok := (*cb.stations)[station_label]; ok { return station, nil } return Station{}, fmt.Errorf("Unknown station: %s", station_label) }
func getDate(td *goquery.Selection) (date time.Time) { rawData := strings.TrimSpace(td.Find("font").Last().Text()) date, err := time.Parse("02.01.2006", rawData) if err != nil { date = time.Time{} } return date }
func extractRoomNum(selection *goquery.Selection) string { s, _ := selection.Find(".room").Html() s = strings.Replace(s, "<br/>", "\n", -1) doc, err := goquery.NewDocumentFromReader(strings.NewReader(s)) if err != nil { fmt.Print(err) } return trim(doc.Text()) }
func getTimerValue(s *goquery.Selection) string { timeVal := s.Find("span.timer").AttrOr("class", "") timeVal = strings.TrimSuffix(timeVal, " timer") timeVal = strings.TrimPrefix(timeVal, "timeleft_") return timeVal }
func parseArticleSection(section *goquery.Selection) Articles { result := Articles{} section.Find("article").Each(func(i int, s *goquery.Selection) { result = append(result, parseArticle(s)) }) return result }
func (this *parser) getElementsByTags(div *goquery.Selection, tags []string) *goquery.Selection { selection := new(goquery.Selection) for _, tag := range tags { selections := div.Find(tag) if selections != nil { selection = selection.Union(selections) } } return selection }
func parseHeader(element *goquery.Selection, info *TrainInfo) { element.Find("span").Each(func(i int, element *goquery.Selection) { switch i { case 0: info.Category, info.Number, info.Name = parseTrainDenomination(element.Text()) case 2: info.From, info.To = parseTrainRoute(element.Text()) } }) }
func (d *Document) getLinkDensity(s *goquery.Selection) float32 { linkLength := len(s.Find("a").Text()) textLength := len(s.Text()) if textLength == 0 { return 0 } return float32(linkLength) / float32(textLength) }
func FetchTexts(doc *goquery.Selection, query string) ([]string, *Error) { elems := doc.Find(query) resArray := make([]string, elems.Size()) i := 0 elems.Each(func(_ int, elem *goquery.Selection) { resArray[i] = elem.Text() i++ }) return resArray, nil }
func getLimit(s *goquery.Selection) (limit int) { limitText := s.Find(".limit").Text() num, err := strconv.Atoi(strings.TrimPrefix(limitText, "/ ")) if err != nil { num = 0 } return num }
func ScrapeDefinition(s *goquery.Selection) *Definition { category, _ := s.Find("span[title]").First().Attr("title") return &Definition{ Category: category, Definition: strings.TrimSpace(s.Find("span.b").Clone().Children().Not("a").Remove().End().End().Text()), Origin: ScrapeOrigins(s), Notes: ScrapeNotes(s), Examples: ScrapeExamples(s), } }
func ScrapeDefinition(s *goquery.Selection) *Definition { category, _ := s.Find("abbr").First().Attr("title") return &Definition{ Category: category, Definition: JoinNodesWithSpace(s.Children().First().NextAll().Not("abbr").Not("span.h")), Origin: ScrapeOrigins(s), Notes: ScrapeNotes(s), Examples: ScrapeExamples(s), } }
func processTr(tr *goquery.Selection, fRstOutput *os.File) { tr.Find("td").Each(func(indexOfTd int, td *goquery.Selection) { lines := StringToLines(td.Text()) for indexOfLine, line := range lines { line = strings.TrimSpace(line) fmt.Fprintf(fRstOutput, rstListTablePrefixOfEachLine(indexOfTd, indexOfLine)) fmt.Fprintf(fRstOutput, line) fmt.Fprintf(fRstOutput, "\n") } }) }