// Add street to result and find street information (to WikipediaMoscow.result) func (parser *WikipediaMoscow) processLink(_ int, s *goquery.Selection, done chan<- *StreetInfo) { name := strings.TrimSpace(s.Text()) if len(name) == 0 { done <- parser.getEmptyInfo("") return } href, exists := s.Attr("href") if !exists { done <- parser.getEmptyInfo("") return } var ( class string info *StreetInfo ) class, exists = s.Attr("class") if exists && class == "new" { info = parser.getEmptyInfo(name) } else if resp, err := http.Get(parser.baseURL + href); err != nil { info = parser.getEmptyInfo(name) } else { streetparser := NewWikipediaStreetParser() info, err = streetparser.ParseStreetInfo(name, resp.Body) if err != nil { info = parser.getEmptyInfo(name) } } done <- info }
// node returns a string representation of the selection. func node(i int, s *goquery.Selection) string { switch node := s.Get(0); { case node.Data == "h1": return fmt.Sprintf(" \033[%dm# %s\033[0m\n\n", blue, text(s)) case node.Data == "h2": return fmt.Sprintf(" \033[%dm## %s\033[0m\n\n", blue, text(s)) case node.Data == "h3": return fmt.Sprintf(" \033[%dm### %s\033[0m\n\n", blue, text(s)) case node.Data == "p": return fmt.Sprintf("\033[%dm%s\033[0m\n\n", none, indent(text(s), 1)) case node.Data == "pre" || s.HasClass("highlight"): return fmt.Sprintf("\033[1m%s\033[0m\n\n", indent(text(s), 2)) case node.Data == "a": return fmt.Sprintf("%s (%s) ", s.Text(), s.AttrOr("href", "missing link")) case node.Data == "li": return fmt.Sprintf(" • %s\n", contents(s)) case node.Data == "ul": return fmt.Sprintf("%s\n", nodes(s)) case node.Data == "code": return fmt.Sprintf("\033[1m%s\033[0m ", s.Text()) case node.Type == html.TextNode: return strings.TrimSpace(node.Data) default: return "" } }
func ScrapeExamples(s *goquery.Selection) []string { examples := []string{} s.Find("span.h").Each(func(i int, s *goquery.Selection) { examples = append(examples, s.Text()) }) return examples }
func JoinNodesWithSpace(s *goquery.Selection) string { texts := []string{} s.Each(func(i int, s *goquery.Selection) { texts = append(texts, s.Text()) }) return strings.Join(texts, " ") }
func parseColors(s *goquery.Selection) string { colors := "" s.Each(func(i int, s *goquery.Selection) { colors += s.Text() }) return colors }
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection { ps := make([]*goquery.Selection, 0) if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 { ps = append(ps, currentSibling) return ps } else { potentialParagraphs := currentSibling.Find("p") potentialParagraphs.Each(func(i int, s *goquery.Selection) { text := s.Text() if len(text) > 0 { ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text) paragraphScore := ws.stopWordCount siblingBaselineScore := 0.30 highLinkDensity := this.isHighLinkDensity(s) score := siblingBaselineScore * baselinescoreSiblingsPara if score < float64(paragraphScore) && !highLinkDensity { node := new(html.Node) node.Type = html.TextNode node.Data = text node.DataAtom = atom.P nodes := make([]*html.Node, 1) nodes[0] = node newSelection := new(goquery.Selection) newSelection.Nodes = nodes ps = append(ps, newSelection) } } }) } return ps }
func (rc *TwitterChecker) findSigInTweet(h SigHint, s *goquery.Selection) ProofError { inside := s.Text() html, err := s.Html() checkText := h.checkText if err != nil { return NewProofError(keybase1.ProofStatus_CONTENT_FAILURE, "No HTML tweet found: %s", err) } G.Log.Debug("+ Checking tweet '%s' for signature '%s'", inside, checkText) G.Log.Debug("| HTML is: %s", html) rxx := regexp.MustCompile(`^(@[a-zA-Z0-9_-]+\s+)`) for { if m := rxx.FindStringSubmatchIndex(inside); m == nil { break } else { prefix := inside[m[2]:m[3]] inside = inside[m[3]:] G.Log.Debug("| Stripping off @prefx: %s", prefix) } } if strings.HasPrefix(inside, checkText) { return nil } return NewProofError(keybase1.ProofStatus_DELETED, "Could not find '%s' in '%s'", checkText, inside) }
//a lot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to //boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs //so we'll want to make sure that the next sibling is a paragraph and has at least some substatial weight to it func (this *contentExtractor) isBoostable(node *goquery.Selection) bool { stepsAway := 0 next := node.Next() for next != nil && stepsAway < node.Siblings().Length() { currentNodeTag := node.Get(0).DataAtom.String() if currentNodeTag == "p" { if stepsAway >= 3 { if this.config.debug { log.Println("Next paragraph is too far away, not boosting") } return false } paraText := node.Text() ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, paraText) if ws.stopWordCount > 5 { if this.config.debug { log.Println("We're gonna boost this node, seems content") } return true } } stepsAway++ next = next.Next() } return false }
func getDataFromDOM(s *gq.Selection, arr []string, code string) string { var dt string if arr[0] == "text" { dt = s.Text() } else { dt, _ = s.Attr(arr[0]) } return encode_string(dt, code) }
func (e extractContent) noParasWithoutTable(s *goquery.Selection) bool { s.FindMatcher(pTags).Each(func(i int, s *goquery.Selection) { if len(s.Text()) < 25 { s.Remove() } }) return s.FindMatcher(pTags).Length() == 0 && !nodeIs(s.Nodes[0], atom.Td) }
func (d *Document) getLinkDensity(s *goquery.Selection) float32 { linkLength := len(s.Find("a").Text()) textLength := len(s.Text()) if textLength == 0 { return 0 } return float32(linkLength) / float32(textLength) }
func parseHeader(element *goquery.Selection, info *TrainInfo) { element.Find("span").Each(func(i int, element *goquery.Selection) { switch i { case 0: info.Category, info.Number, info.Name = parseTrainDenomination(element.Text()) case 2: info.From, info.To = parseTrainRoute(element.Text()) } }) }
func displayDetails(single *goquery.Selection) { text := strings.TrimSpace(single.Text()) href, _ := single.Attr("href") length := utf8.RuneCountInString(text) if ((length > 5) && wordExists(text, "keywords")) || ((length > 5) && wordExists(href, "keywords")) { if wordExists(text, "products") { fmt.Println("Link", single.Text(), "--->", href) } } }
func parseResource(s *goquery.Selection) (_production, _stored, _capacity int) { productionStr, _ := s.Attr("title") production, _ := strconv.Atoi(productionStr) status := s.Text() split := strings.Split(status, "/") stored, _ := strconv.Atoi(split[0]) capacity, _ := strconv.Atoi(split[1]) return production, stored, capacity }
func addIngredient(ingredients []data.Ingredient, a *goquery.Selection) []data.Ingredient { if href, ok := a.Attr("href"); ok { glog.V(2).Info(" href: " + href) id, err := strconv.Atoi(strings.Split(href, "/")[2]) if err != nil { glog.Errorf("Failed to extract id from %s: %v", href, err) } else { ingredients = append(ingredients, data.Ingredient{Name: a.Text(), Id: id}) } } return ingredients }
// describeSentences tokenizes the selection's text into sentences and
// aggregates statistics: total sentence count, average words per sentence,
// and counts of long (>3 words), good (4-24 words) and correctly terminated
// sentences.
func describeSentences(s *goquery.Selection) TextDescription {
	var d TextDescription
	var text string
	// get text of this node and then split for sentences
	if s.Children().Length() > 0 {
		text = getTextFromHtml(s)
	} else {
		text = s.Text()
	}
	sentences := tokenizer.Tokenize(text)
	d.CountSentences = len(sentences)
	//fmt.Println("==============================================")
	// NOTE: the loop variable shadows the outer selection `s`; inside the
	// loop `s` is a tokenizer sentence.
	for _, s := range sentences {
		sentence := s.Text
		if len(sentence) == 0 {
			continue
		}
		c := len(get_words_from(sentence))
		//fmt.Println(sentence)
		// AverageWords accumulates the running total here; it is divided by
		// the sentence count after the loop.
		d.AverageWords += c
		if c > 3 { // presume a normal sentence usually has more than 3 words
			d.CountLongSentences++
			if c < 25 {
				// but a sentence should not have more than 25 words; we will
				// not consider such a sentence as a good one
				d.CountGoodSentences++
			}
			// NOTE(review): this slices the last byte, not the last rune —
			// fine for the ASCII terminators checked below.
			lastsymbol := sentence[len(sentence)-1:]
			if strings.ContainsAny(lastsymbol, ".?!") {
				d.CountCorrectSentences++
			}
		}
	}
	if d.CountSentences > 0 {
		d.AverageWords = int(d.AverageWords / d.CountSentences)
	}
	return d
}
// isHighLinkDensity checks the density of links within a node: is there not
// much text and most of it contains links? If so the node is no good for
// content extraction. The score is (link words / total words) * link count;
// anything above 1.0 counts as link-dominated.
func (this *contentExtractor) isHighLinkDensity(node *goquery.Selection) bool {
	links := node.Find("a")
	// NOTE(review): Find never returns nil, so only the Size()==0 half of
	// this condition can ever trigger.
	if links == nil || links.Size() == 0 {
		return false
	}
	text := node.Text()
	words := strings.Split(text, " ")
	nwords := len(words)
	sb := make([]string, 0)
	links.Each(func(i int, s *goquery.Selection) {
		linkText := s.Text()
		sb = append(sb, linkText)
	})
	// NOTE(review): joining with "" fuses the last word of one link to the
	// first word of the next, slightly undercounting link words — confirm
	// whether a " " separator was intended before changing the scoring.
	linkText := strings.Join(sb, "")
	linkWords := strings.Split(linkText, " ")
	nlinkWords := len(linkWords)
	nlinks := links.Size()
	linkDivisor := float64(nlinkWords) / float64(nwords)
	score := linkDivisor * float64(nlinks)
	if this.config.debug {
		// Log at most the first 50 characters of the node text.
		logText := ""
		if len(node.Text()) >= 51 {
			logText = node.Text()[0:50]
		} else {
			logText = node.Text()
		}
		log.Printf("Calculated link density score as %1.5f for node %s\n", score, logText)
	}
	if score > 1.0 {
		return true
	}
	return false
}
func IsAllCode(bodyContent *goquery.Selection) bool { preDom := bodyContent.Find("pre") if preDom != nil { for index := 0; index < preDom.Length(); index++ { dom := preDom.Eq(index) dom.Remove() } } conStr := bodyContent.Text() conStr = strings.TrimSpace(conStr) if len(conStr) < 100 { return true } return false }
func getText(s *goquery.Selection, includeDecendents bool) string { if s.Length() == 0 { return "" } if includeDecendents { return strings.TrimSpace(s.Text()) } var buff []string for node := s.First().Nodes[0].FirstChild; node != nil; node = node.NextSibling { if node.Type == html.TextNode { buff = append(buff, node.Data) } } return strings.TrimSpace(strings.Join(buff, "")) }
func (s *StateMachine) ProcessSelection(sl *goquery.Selection) { if strings.HasPrefix(sl.Text(), "備註") { s.State = InFootnote } if strings.HasPrefix(sl.Text(), "資訊更新日期") { s.State = NotInFootnote } if s.State == InFootnote { if sl.Size() != 1 { panic("element size is not 1") } s.ProcessNode(sl.Nodes[0]) } }
// return the chapter number and the line number for the sentence func getLineNumber(sel *goquery.Selection) (chapterNumber int, lineNumber int) { rawtext := sel.Text() textArray := strings.Split(rawtext, ":") var err error = nil chapterNumber, err = strconv.Atoi(textArray[0]) if err != nil { log.Fatal(err) } lineNumber, err = strconv.Atoi(textArray[1]) if err != nil { log.Fatal(err) } return }
func addEntryType(typename string, s *goquery.Selection) { linkLabel := s.Find(".memItemRight").Find("a") linkLabel.Each(func(i int, s *goquery.Selection) { structName := s.Text() structName = strings.Trim(structName, "\r\n ") link, _ := s.Attr("href") link = strings.Trim(link, "\r\n ") if len(structName) != 0 && len(link) != 0 { _, err := db.Exec("insert or ignore into searchIndex(name,type,path) VALUES('" + structName + "','" + typename + "','" + link + "')") if err != nil { log.Fatal("Insert " + typename + " " + structName + "Failed!") return } log.Print("Insert " + typename + structName) } }) }
func ScrapeFileLink(s *goquery.Selection) { i := Images{Created_at: time.Now(), Updated_at: time.Now()} i.Name = s.Text() href, _ := s.Attr("href") isImage := i.IsImageType() if isImage { i.Source = fmt.Sprintf("http://rghost.ru%s/image.png", href) downloaded := i.DownloadImage() if downloaded { i.Uploaded_to = "yes" i.Archived = false err := i.InsertImage() if err != nil { log.Fatal(err) } } } }
func processUl(ul *goquery.Selection, depth int) { ul.Find("li").Each(func(_ int, li *goquery.Selection) { li.Find("ul").Each(func(_ int, childUl *goquery.Selection) { processUl(childUl, depth+1) }) lines := StringToLines(li.Text()) var indentedLines []string for i, line := range lines { if i == 0 { liMarkIndex := depth % 2 mark := liMark[liMarkIndex] indentedLines = append(indentedLines, "\n"+mark+" "+line) } else { indentedLines = append(indentedLines, " "+line) } } li.ReplaceWithHtml(strings.Join(indentedLines, "\n")) }) ul.ReplaceWithHtml(ul.Text()) }
func (e extractContent) getSiblingContent( a *Article, s *goquery.Selection, baseScore uint) []*html.Node { var ret []*html.Node if nodeIs(s.Nodes[0], atom.P) && len(s.Text()) > 0 { return s.Nodes } ps := s.FindMatcher(pTags) for _, n := range ps.Nodes { cc := a.getCCache(n) if len(cc.text) > 0 { if cc.stopwords > baseScore && !cc.highLinkDensity { ret = append(ret, createNode(atom.P, "p", cc.text)) } } } return ret }
// printSelectionTextWithTitle logs the selection's text under the given title.
func printSelectionTextWithTitle(title string, sel *goquery.Selection) {
	text := sel.Text()
	Debug("%v selection: %v", title, text)
}
// printSelectionText logs the selection's text content.
func printSelectionText(sel *goquery.Selection) {
	text := sel.Text()
	Debug("selection: %v", text)
}
func UpdatePageSummary(docName, PageFilePath, PreviewFilePath string) error { var pageSummaryData PageSummaryData html, err := readPageAsHtml(docName, PageFilePath) if err != nil { return err } htmlreader := bytes.NewReader(html) //htmlString := string(html) //log.Info("HtmlString " + htmlString) doc, err := gq.NewDocumentFromReader(htmlreader) if err != nil { return err } var SelectedNodes *gq.Selection //======= work out the document heading ======== DocTitle := "" if DocTitle == "" { SelectedNodes = doc.Find("h1, h2, h3, h4").First() if len(SelectedNodes.Nodes) == 1 { DocTitle = strings.TrimSpace(SelectedNodes.Text()) } } if DocTitle == "" { DocTitle = docName } //DocTitle = base64.StdEncoding.EncodeToString([]byte(DocTitle)) DocTitle = malkovich.FolderNameToDocName(DocTitle) pageSummaryData.PageTitle = DocTitle //======== look for an image ========= DocImage := "" SelectedNodes = doc.Find("img").First() if len(SelectedNodes.Nodes) == 1 { for _, nodeAttr := range SelectedNodes.Nodes[0].Attr { if nodeAttr.Key == "src" { DocImage = nodeAttr.Val break } } } //DocImage = base64.StdEncoding.EncodeToString([]byte(DocImage)) pageSummaryData.FirstImage = DocImage //======== look for the first paragraph ========= FirstParagraph := "" SelectedNodes = doc.Find("p").First() if len(SelectedNodes.Nodes) == 1 { FirstParagraph = strings.TrimSpace(SelectedNodes.Text()) } //TODO:HIGH Maybe limit to a set number of charactors here. //FirstParagraph = base64.StdEncoding.EncodeToString([]byte(FirstParagraph)) pageSummaryData.FirstParagraph = FirstParagraph jsonData, err := json.Marshal(pageSummaryData) if err != nil { return err } // TODO:MED what would the best file permissions be here? err = ioutil.WriteFile(PreviewFilePath, jsonData, os.FileMode(0644)) if err != nil { panic(err.Error()) } return nil }
// return the sentence pure content without comment number func getPureContent(sel *goquery.Selection) string { pureText := sel.Text() return pureText }
// getEmbedCode returns the embed code, i.e. the node's text content.
func (ve *VideoExtractor) getEmbedCode(node *goquery.Selection) string {
	code := node.Text()
	return code
}