Ejemplo n.º 1
0
// processLink resolves one street link and sends exactly one *StreetInfo on done.
//
// A placeholder from getEmptyInfo is sent when the link has no text, has no
// href, carries the class "new", cannot be fetched, or cannot be parsed.
// The placeholder is built with an empty name in the first two cases and with
// the trimmed link text otherwise. The first parameter (the index supplied by
// the selection iterator) is unused.
func (parser *WikipediaMoscow) processLink(_ int, s *goquery.Selection, done chan<- *StreetInfo) {
	name := strings.TrimSpace(s.Text())
	if len(name) == 0 {
		done <- parser.getEmptyInfo("")
		return
	}

	href, exists := s.Attr("href")
	if !exists {
		done <- parser.getEmptyInfo("")
		return
	}

	// Links marked class="new" are not fetched at all.
	// NOTE(review): presumably these point at pages that do not exist yet — confirm.
	if class, ok := s.Attr("class"); ok && class == "new" {
		done <- parser.getEmptyInfo(name)
		return
	}

	resp, err := http.Get(parser.baseURL + href)
	if err != nil {
		done <- parser.getEmptyInfo(name)
		return
	}

	info, err := NewWikipediaStreetParser().ParseStreetInfo(name, resp.Body)
	// Close the body before the (possibly blocking) channel send so the
	// connection is released as soon as parsing is finished.
	resp.Body.Close()
	if err != nil {
		info = parser.getEmptyInfo(name)
	}
	done <- info
}
Ejemplo n.º 2
0
// node renders the first node of s as terminal text.
//
// Headings and paragraphs are wrapped in ANSI colour escapes, "pre" elements
// (or anything with the "highlight" class) and "code" are rendered bold, links
// are followed by their href, and list items get a bullet. Bare text nodes are
// returned trimmed; anything else yields the empty string. The index i is unused.
func node(i int, s *goquery.Selection) string {
	first := s.Get(0)

	// Headings and paragraphs take precedence over every other rule.
	switch first.Data {
	case "h1":
		return fmt.Sprintf(" \033[%dm# %s\033[0m\n\n", blue, text(s))
	case "h2":
		return fmt.Sprintf(" \033[%dm## %s\033[0m\n\n", blue, text(s))
	case "h3":
		return fmt.Sprintf(" \033[%dm### %s\033[0m\n\n", blue, text(s))
	case "p":
		return fmt.Sprintf("\033[%dm%s\033[0m\n\n", none, indent(text(s), 1))
	}

	// Code blocks are recognised by tag or by the "highlight" class.
	if first.Data == "pre" || s.HasClass("highlight") {
		return fmt.Sprintf("\033[1m%s\033[0m\n\n", indent(text(s), 2))
	}

	switch first.Data {
	case "a":
		return fmt.Sprintf("%s (%s) ", s.Text(), s.AttrOr("href", "missing link"))
	case "li":
		return fmt.Sprintf("  • %s\n", contents(s))
	case "ul":
		return fmt.Sprintf("%s\n", nodes(s))
	case "code":
		return fmt.Sprintf("\033[1m%s\033[0m ", s.Text())
	}

	if first.Type == html.TextNode {
		return strings.TrimSpace(first.Data)
	}
	return ""
}
Ejemplo n.º 3
0
Archivo: scrape.go Proyecto: squat/drae
// ScrapeExamples returns the text of every "span.h" descendant of s, one
// entry per matched element, in match order. The result is never nil.
func ScrapeExamples(s *goquery.Selection) []string {
	spans := s.Find("span.h")
	examples := make([]string, 0, spans.Length())
	for idx := range spans.Nodes {
		examples = append(examples, spans.Eq(idx).Text())
	}
	return examples
}
Ejemplo n.º 4
0
Archivo: scrape.go Proyecto: squat/drae
// JoinNodesWithSpace returns the text of each node in s, joined with a
// single space between consecutive nodes.
func JoinNodesWithSpace(s *goquery.Selection) string {
	parts := make([]string, 0, s.Length())
	for idx := range s.Nodes {
		parts = append(parts, s.Eq(idx).Text())
	}
	return strings.Join(parts, " ")
}
Ejemplo n.º 5
0
// parseColors returns the concatenated text of every node in s.
//
// Selection.Text already combines the text of all matched elements in order,
// so no per-node loop (and no repeated string concatenation) is needed.
func parseColors(s *goquery.Selection) string {
	return s.Text()
}
Ejemplo n.º 6
0
// getSiblingsContent collects paragraph content from a sibling node.
//
// If currentSibling is itself a non-empty <p>, it is returned as the only
// element. Otherwise each non-empty <p> descendant is scored: a paragraph is
// kept when its stop-word count exceeds 0.30 * baselinescoreSiblingsPara and it
// is not link-heavy. Kept paragraphs are returned as fresh single-node
// selections wrapping a text node that carries the paragraph text.
// The returned slice is never nil.
func (this *contentExtractor) getSiblingsContent(currentSibling *goquery.Selection, baselinescoreSiblingsPara float64) []*goquery.Selection {
	if currentSibling.Get(0).DataAtom.String() == "p" && len(currentSibling.Text()) > 0 {
		return []*goquery.Selection{currentSibling}
	}

	const siblingBaselineScore = 0.30
	threshold := siblingBaselineScore * baselinescoreSiblingsPara

	kept := make([]*goquery.Selection, 0)
	currentSibling.Find("p").Each(func(_ int, para *goquery.Selection) {
		text := para.Text()
		if len(text) == 0 {
			return
		}

		stats := this.config.stopWords.stopWordsCount(this.config.targetLanguage, text)
		linkHeavy := this.isHighLinkDensity(para)
		if threshold < float64(stats.stopWordCount) && !linkHeavy {
			// Wrap the paragraph text in a synthetic node tagged as <p>.
			synthetic := &html.Node{
				Type:     html.TextNode,
				Data:     text,
				DataAtom: atom.P,
			}
			kept = append(kept, &goquery.Selection{Nodes: []*html.Node{synthetic}})
		}
	})
	return kept
}
Ejemplo n.º 7
0
// findSigInTweet checks whether the tweet text in s starts with the signature
// text from h, after stripping any leading "@mention " prefixes.
//
// It returns nil on a match, a CONTENT_FAILURE proof error when the tweet's
// HTML cannot be rendered, and a DELETED proof error when the signature text
// is not at the start of the stripped tweet.
func (rc *TwitterChecker) findSigInTweet(h SigHint, s *goquery.Selection) ProofError {
	inside := s.Text()
	markup, err := s.Html()
	if err != nil {
		return NewProofError(keybase1.ProofStatus_CONTENT_FAILURE, "No HTML tweet found: %s", err)
	}

	checkText := h.checkText

	G.Log.Debug("+ Checking tweet '%s' for signature '%s'", inside, checkText)
	G.Log.Debug("| HTML is: %s", markup)

	// Peel off every leading "@name " mention, one per iteration.
	mention := regexp.MustCompile(`^(@[a-zA-Z0-9_-]+\s+)`)
	for loc := mention.FindStringSubmatchIndex(inside); loc != nil; loc = mention.FindStringSubmatchIndex(inside) {
		prefix := inside[loc[2]:loc[3]]
		inside = inside[loc[3]:]
		G.Log.Debug("| Stripping off @prefx: %s", prefix)
	}

	if !strings.HasPrefix(inside, checkText) {
		return NewProofError(keybase1.ProofStatus_DELETED, "Could not find '%s' in '%s'",
			checkText, inside)
	}
	return nil
}
Ejemplo n.º 8
0
// isBoostable reports whether node is followed closely enough by a
// substantial paragraph to justify boosting its parent.
//
// The first paragraph is often just an image caption, so the following
// siblings are walked looking for a <p>. If a <p> is found three or more
// steps away the answer is false; if a <p> with more than five stop words is
// found sooner the answer is true. The walk stops when the siblings run out.
//
// The walk inspects each following sibling in turn (not node itself), and it
// ends on an empty selection because Next never returns nil.
func (this *contentExtractor) isBoostable(node *goquery.Selection) bool {
	siblingCount := node.Siblings().Length()
	stepsAway := 0

	for next := node.Next(); next.Length() > 0 && stepsAway < siblingCount; next = next.Next() {
		if next.Get(0).DataAtom.String() == "p" {
			if stepsAway >= 3 {
				if this.config.debug {
					log.Println("Next paragraph is too far away, not boosting")
				}
				return false
			}

			ws := this.config.stopWords.stopWordsCount(this.config.targetLanguage, next.Text())
			if ws.stopWordCount > 5 {
				if this.config.debug {
					log.Println("We're gonna boost this node, seems content")
				}
				return true
			}
		}
		stepsAway++
	}

	return false
}
Ejemplo n.º 9
0
// getDataFromDOM extracts one value from s and passes it through
// encode_string with the given code.
//
// arr[0] selects the source: the literal "text" means the selection's text;
// any other value is treated as an attribute name (a missing attribute yields
// the empty string). arr must have at least one element.
func getDataFromDOM(s *gq.Selection, arr []string, code string) string {
	if key := arr[0]; key != "text" {
		value, _ := s.Attr(key)
		return encode_string(value, code)
	}
	return encode_string(s.Text(), code)
}
Ejemplo n.º 10
0
// noParasWithoutTable removes every pTags match under s whose text is shorter
// than 25 bytes, then reports whether no such matches remain and the first
// node of s is not a <td>.
//
// Note that this mutates the document: short paragraphs are detached.
func (e extractContent) noParasWithoutTable(s *goquery.Selection) bool {
	s.FindMatcher(pTags).Each(func(_ int, para *goquery.Selection) {
		if len(para.Text()) >= 25 {
			return
		}
		para.Remove()
	})

	if s.FindMatcher(pTags).Length() != 0 {
		return false
	}
	return !nodeIs(s.Nodes[0], atom.Td)
}
Ejemplo n.º 11
0
// getLinkDensity returns the ratio of the byte length of the text inside <a>
// descendants of s to the byte length of all text in s. It returns 0 when s
// has no text.
func (d *Document) getLinkDensity(s *goquery.Selection) float32 {
	total := len(s.Text())
	if total == 0 {
		return 0
	}

	linked := len(s.Find("a").Text())
	return float32(linked) / float32(total)
}
Ejemplo n.º 12
0
// parseHeader fills info from the <span> descendants of element: the first
// span supplies the category, number and name; the third span supplies the
// route endpoints. All other spans are ignored.
func parseHeader(element *goquery.Selection, info *TrainInfo) {
	element.Find("span").Each(func(idx int, span *goquery.Selection) {
		if idx == 0 {
			info.Category, info.Number, info.Name = parseTrainDenomination(span.Text())
		} else if idx == 2 {
			info.From, info.To = parseTrainRoute(span.Text())
		}
	})
}
Ejemplo n.º 13
0
// displayDetails prints a link when its trimmed text is longer than five
// runes, wordExists matches "keywords" against either the text or the href,
// and wordExists also matches "products" against the text.
//
// The printed label is the untrimmed selection text.
func displayDetails(single *goquery.Selection) {
	text := strings.TrimSpace(single.Text())
	href, _ := single.Attr("href")

	if utf8.RuneCountInString(text) <= 5 {
		return
	}
	if !wordExists(text, "keywords") && !wordExists(href, "keywords") {
		return
	}
	if !wordExists(text, "products") {
		return
	}
	fmt.Println("Link", single.Text(), "--->", href)
}
Ejemplo n.º 14
0
// parseResource reads a resource indicator from s.
//
// The production value comes from the "title" attribute; the stored and
// capacity values come from the element text, which is expected to look like
// "stored/capacity". Any value that is missing or not a valid integer is
// reported as 0; in particular, text without a "/" yields capacity 0 instead
// of panicking.
func parseResource(s *goquery.Selection) (_production, _stored, _capacity int) {
	// Parse errors are deliberately ignored: Atoi returns 0 on failure, which
	// is the documented fallback for unreadable values.
	productionStr, _ := s.Attr("title")
	production, _ := strconv.Atoi(productionStr)

	split := strings.Split(s.Text(), "/")

	// strings.Split always returns at least one element.
	stored, _ := strconv.Atoi(split[0])

	capacity := 0
	if len(split) > 1 {
		capacity, _ = strconv.Atoi(split[1])
	}

	return production, stored, capacity
}
Ejemplo n.º 15
0
// addIngredient appends the ingredient described by link a to ingredients and
// returns the (possibly extended) slice.
//
// The ingredient id is the third "/"-separated segment of the href. Links
// without an href are skipped silently; links whose href has fewer than three
// segments, or whose third segment is not an integer, are logged and skipped.
func addIngredient(ingredients []data.Ingredient, a *goquery.Selection) []data.Ingredient {
	href, ok := a.Attr("href")
	if !ok {
		return ingredients
	}
	glog.V(2).Info("    href: " + href)

	segments := strings.Split(href, "/")
	if len(segments) < 3 {
		// Guard the index below: a short href used to panic here.
		glog.Errorf("Failed to extract id from %s: expected at least 3 path segments, got %d", href, len(segments))
		return ingredients
	}

	id, err := strconv.Atoi(segments[2])
	if err != nil {
		glog.Errorf("Failed to extract id from %s: %v", href, err)
		return ingredients
	}
	return append(ingredients, data.Ingredient{Name: a.Text(), Id: id})
}
Ejemplo n.º 16
0
// describeSentences tokenizes the text of s into sentences and returns
// summary counters for them.
//
// For elements with children the text comes from getTextFromHtml, otherwise
// from the selection's own text. Empty sentences are skipped but still count
// towards CountSentences. A sentence with more than 3 words is "long"; a long
// sentence with fewer than 25 words is "good"; a long sentence ending in '.',
// '?' or '!' is "correct". AverageWords is the integer mean of words per
// sentence.
func describeSentences(s *goquery.Selection) TextDescription {
	var text string
	if s.Children().Length() > 0 {
		text = getTextFromHtml(s)
	} else {
		text = s.Text()
	}

	sentences := tokenizer.Tokenize(text)

	var d TextDescription
	d.CountSentences = len(sentences)

	for _, tok := range sentences {
		sentence := tok.Text
		if len(sentence) == 0 {
			continue
		}

		words := len(get_words_from(sentence))
		// Accumulate the total here; it is turned into a mean after the loop.
		d.AverageWords += words

		if words <= 3 {
			continue
		}

		// A normal sentence is presumed to have more than 3 words.
		d.CountLongSentences++

		// ...but fewer than 25; longer ones are not counted as good.
		if words < 25 {
			d.CountGoodSentences++
		}

		switch sentence[len(sentence)-1] {
		case '.', '?', '!':
			d.CountCorrectSentences++
		}
	}

	if d.CountSentences > 0 {
		d.AverageWords = d.AverageWords / d.CountSentences
	}

	return d
}
Ejemplo n.º 17
0
// isHighLinkDensity reports whether node is dominated by link text.
//
// The score is (words in link text / words in all text) * number of links,
// where words are counted by splitting on single spaces. A score above 1.0
// means high link density. Nodes without links are never high density.
func (this *contentExtractor) isHighLinkDensity(node *goquery.Selection) bool {
	links := node.Find("a")
	if links == nil || links.Size() == 0 {
		return false
	}

	text := node.Text()
	wordCount := len(strings.Split(text, " "))

	linkTexts := make([]string, 0)
	links.Each(func(_ int, link *goquery.Selection) {
		linkTexts = append(linkTexts, link.Text())
	})
	linkWordCount := len(strings.Split(strings.Join(linkTexts, ""), " "))

	ratio := float64(linkWordCount) / float64(wordCount)
	score := ratio * float64(links.Size())

	if this.config.debug {
		// Log at most the first 50 bytes of the node text.
		preview := text
		if len(preview) >= 51 {
			preview = preview[0:50]
		}
		log.Printf("Calculated link density score as %1.5f for node %s\n", score, preview)
	}

	return score > 1.0
}
Ejemplo n.º 18
0
// IsAllCode reports whether bodyContent is essentially nothing but code: it
// removes every <pre> descendant and returns true when fewer than 100 bytes
// of trimmed text remain.
//
// Note that this mutates bodyContent: the <pre> elements stay removed.
func IsAllCode(bodyContent *goquery.Selection) bool {
	bodyContent.Find("pre").Remove()

	remaining := strings.TrimSpace(bodyContent.Text())
	return len(remaining) < 100
}
Ejemplo n.º 19
0
// getText returns the trimmed text of s.
//
// With includeDecendents set, the text of all matched nodes and their
// descendants is used. Otherwise only the text nodes that are direct children
// of the first node in s are concatenated. An empty selection yields "".
func getText(s *goquery.Selection, includeDecendents bool) string {
	switch {
	case s.Length() == 0:
		return ""
	case includeDecendents:
		return strings.TrimSpace(s.Text())
	}

	var direct strings.Builder
	for child := s.Get(0).FirstChild; child != nil; child = child.NextSibling {
		if child.Type != html.TextNode {
			continue
		}
		direct.WriteString(child.Data)
	}
	return strings.TrimSpace(direct.String())
}
Ejemplo n.º 20
0
// ProcessSelection advances the footnote state machine with one selection.
//
// Text starting with "備註" switches the state to InFootnote; text starting
// with "資訊更新日期" switches it to NotInFootnote. While in the InFootnote
// state the selection's single node is passed to ProcessNode; a selection
// that does not hold exactly one node causes a panic.
func (s *StateMachine) ProcessSelection(sl *goquery.Selection) {
	text := sl.Text()
	switch {
	case strings.HasPrefix(text, "備註"):
		s.State = InFootnote
	case strings.HasPrefix(text, "資訊更新日期"):
		s.State = NotInFootnote
	}

	if s.State != InFootnote {
		return
	}
	if sl.Size() != 1 {
		panic("element size is not 1")
	}
	s.ProcessNode(sl.Nodes[0])
}
Ejemplo n.º 21
0
// getLineNumber returns the chapter number and the line number for the
// sentence, parsed from text of the form "<chapter>:<line>".
//
// Malformed input is fatal: if the ":" separator is missing or either part is
// not an integer, the error is logged and the process exits via log.Fatal.
func getLineNumber(sel *goquery.Selection) (chapterNumber int, lineNumber int) {
	rawtext := sel.Text()
	parts := strings.Split(rawtext, ":")
	if len(parts) < 2 {
		// Report the bad input explicitly instead of panicking on parts[1].
		log.Fatalf("malformed line reference %q: expected <chapter>:<line>", rawtext)
	}

	var err error
	if chapterNumber, err = strconv.Atoi(parts[0]); err != nil {
		log.Fatal(err)
	}
	if lineNumber, err = strconv.Atoi(parts[1]); err != nil {
		log.Fatal(err)
	}
	return chapterNumber, lineNumber
}
Ejemplo n.º 22
0
// addEntryType indexes every link found under ".memItemRight" in s as an
// entry of the given type.
//
// For each <a> with non-empty trimmed text and href, a row (name, type, path)
// is inserted into searchIndex, ignoring duplicates. A failed insert is fatal.
//
// Values are bound as query parameters rather than spliced into the SQL
// text, so names or links containing quotes cannot break or alter the
// statement.
// NOTE(review): assumes db follows the database/sql Exec(query, args...)
// signature — confirm against its declaration.
func addEntryType(typename string, s *goquery.Selection) {
	s.Find(".memItemRight").Find("a").Each(func(_ int, a *goquery.Selection) {
		structName := strings.Trim(a.Text(), "\r\n ")
		link, _ := a.Attr("href") // a missing href leaves link empty and is skipped below
		link = strings.Trim(link, "\r\n ")
		if len(structName) == 0 || len(link) == 0 {
			return
		}

		_, err := db.Exec(
			"insert or ignore into searchIndex(name,type,path) VALUES(?,?,?)",
			structName, typename, link,
		)
		if err != nil {
			log.Fatalf("Insert %s %s Failed: %v", typename, structName, err)
		}
		log.Print("Insert " + typename + structName)
	})
}
Ejemplo n.º 23
0
// ScrapeFileLink handles one file link: when the link text names an image
// type, the image is downloaded from rghost.ru and, on success, recorded via
// InsertImage. A failed insert is fatal. Non-image links and failed downloads
// are ignored.
func ScrapeFileLink(s *goquery.Selection) {
	img := Images{Created_at: time.Now(), Updated_at: time.Now()}
	img.Name = s.Text()
	href, _ := s.Attr("href")

	if !img.IsImageType() {
		return
	}

	img.Source = fmt.Sprintf("http://rghost.ru%s/image.png", href)
	if !img.DownloadImage() {
		return
	}

	img.Uploaded_to = "yes"
	img.Archived = false
	if err := img.InsertImage(); err != nil {
		log.Fatal(err)
	}
}
Ejemplo n.º 24
0
// processUl flattens a <ul> into plain text in place.
//
// For every <li> under ul, nested lists are flattened first (one level
// deeper), then the item is replaced by its text: the first line is prefixed
// with a newline and the bullet mark for this depth (liMark alternates on
// depth parity), and continuation lines are indented by two spaces. Finally
// the <ul> itself is replaced by its resulting text.
func processUl(ul *goquery.Selection, depth int) {
	ul.Find("li").Each(func(_ int, item *goquery.Selection) {
		item.Find("ul").Each(func(_ int, nested *goquery.Selection) {
			processUl(nested, depth+1)
		})

		var rendered []string
		for idx, line := range StringToLines(item.Text()) {
			if idx > 0 {
				rendered = append(rendered, "  "+line)
				continue
			}
			rendered = append(rendered, "\n"+liMark[depth%2]+" "+line)
		}
		item.ReplaceWithHtml(strings.Join(rendered, "\n"))
	})

	ul.ReplaceWithHtml(ul.Text())
}
Ejemplo n.º 25
0
// getSiblingContent collects paragraph nodes from a sibling selection.
//
// If the first node of s is a non-empty <p>, the selection's own nodes are
// returned. Otherwise every pTags match under s is looked up in the article's
// cache and kept when it has text, its stop-word count exceeds baseScore, and
// it is not link-heavy; each kept paragraph is returned as a new <p> node
// built from the cached text. The result is nil when nothing qualifies.
func (e extractContent) getSiblingContent(
	a *Article,
	s *goquery.Selection,
	baseScore uint) []*html.Node {

	if nodeIs(s.Nodes[0], atom.P) && len(s.Text()) > 0 {
		return s.Nodes
	}

	var paragraphs []*html.Node
	for _, candidate := range s.FindMatcher(pTags).Nodes {
		cached := a.getCCache(candidate)
		if len(cached.text) == 0 || cached.stopwords <= baseScore || cached.highLinkDensity {
			continue
		}
		paragraphs = append(paragraphs, createNode(atom.P, "p", cached.text))
	}

	return paragraphs
}
Ejemplo n.º 26
0
// printSelectionTextWithTitle emits the text of sel through Debug, prefixed
// with the given title.
func printSelectionTextWithTitle(title string, sel *goquery.Selection) {
	text := sel.Text()
	Debug("%v selection: %v", title, text)
}
Ejemplo n.º 27
0
// printSelectionText emits the text of sel through Debug.
func printSelectionText(sel *goquery.Selection) {
	text := sel.Text()
	Debug("selection: %v", text)
}
Ejemplo n.º 28
0
func UpdatePageSummary(docName, PageFilePath, PreviewFilePath string) error {
	var pageSummaryData PageSummaryData

	html, err := readPageAsHtml(docName, PageFilePath)
	if err != nil {
		return err
	}
	htmlreader := bytes.NewReader(html)

	//htmlString := string(html)
	//log.Info("HtmlString " + htmlString)

	doc, err := gq.NewDocumentFromReader(htmlreader)
	if err != nil {
		return err
	}

	var SelectedNodes *gq.Selection

	//======= work out the document heading ========
	DocTitle := ""

	if DocTitle == "" {
		SelectedNodes = doc.Find("h1, h2, h3, h4").First()
		if len(SelectedNodes.Nodes) == 1 {
			DocTitle = strings.TrimSpace(SelectedNodes.Text())
		}
	}
	if DocTitle == "" {
		DocTitle = docName
	}
	//DocTitle = base64.StdEncoding.EncodeToString([]byte(DocTitle))
	DocTitle = malkovich.FolderNameToDocName(DocTitle)
	pageSummaryData.PageTitle = DocTitle

	//======== look for an image =========
	DocImage := ""
	SelectedNodes = doc.Find("img").First()
	if len(SelectedNodes.Nodes) == 1 {
		for _, nodeAttr := range SelectedNodes.Nodes[0].Attr {
			if nodeAttr.Key == "src" {
				DocImage = nodeAttr.Val
				break
			}
		}
	}
	//DocImage = base64.StdEncoding.EncodeToString([]byte(DocImage))
	pageSummaryData.FirstImage = DocImage

	//======== look for the first paragraph =========
	FirstParagraph := ""
	SelectedNodes = doc.Find("p").First()
	if len(SelectedNodes.Nodes) == 1 {
		FirstParagraph = strings.TrimSpace(SelectedNodes.Text())
	}
	//TODO:HIGH Maybe limit to a set number of charactors here.
	//FirstParagraph = base64.StdEncoding.EncodeToString([]byte(FirstParagraph))
	pageSummaryData.FirstParagraph = FirstParagraph

	jsonData, err := json.Marshal(pageSummaryData)
	if err != nil {
		return err
	}

	// TODO:MED what would the best file permissions be here?
	err = ioutil.WriteFile(PreviewFilePath, jsonData, os.FileMode(0644))
	if err != nil {
		panic(err.Error())
	}

	return nil
}
Ejemplo n.º 29
0
// getPureContent returns the text of sel unchanged.
// NOTE(review): the original comment says the comment number is excluded, but
// nothing here strips it — presumably the caller passes a selection that
// already omits it; confirm.
func getPureContent(sel *goquery.Selection) string {
	return sel.Text()
}
Ejemplo n.º 30
0
// getEmbedCode returns the combined text content of node, which is used as
// the embed code.
func (ve *VideoExtractor) getEmbedCode(node *goquery.Selection) string {
	embed := node.Text()
	return embed
}