Пример #1
0
/*
 * Extracts the first HTML comment of the document as a map. The comment body
 * is expected to be a valid YAML map.
 *
 * Returns a nil map and a nil error when the document contains no comment.
 * A failed XPath search or a YAML decoding failure is returned as err.
 */
func (gen *Generator) extractPageConfig(doc *html.HtmlDocument) (config map[interface{}]interface{}, err error) {
	result, err := doc.Search("//comment()")
	if err != nil {
		// A failed search must not be mistaken for "page has no config".
		return nil, err
	}
	if len(result) == 0 {
		return nil, nil
	}
	err = yaml.Unmarshal([]byte(result[0].Content()), &config)
	return config, err
}
Пример #2
0
Файл: bi.go Проект: iorme/kurs
// parseBiHtml collects currency rows from the table with id
// ctl00_PlaceHolderMain_biWebKursTransaksiBI_GridView1.
//
// Row 0 is skipped (presumably the header row; verify against the page). In
// every other row the non-empty cells are read in order as currency code,
// Nilai, KursJual and KursBeli; further non-empty cells are ignored. The
// result is keyed by currency code.
//
// Cell values are scoped to a single row, so a row with fewer than four
// non-empty cells yields empty fields instead of inheriting the values of the
// previous row. Rows without any non-empty cell are not stored.
//
// If the XPath search fails, the (empty) map is returned with the error.
func parseBiHtml(document *html.HtmlDocument) (map[string]Currency, error) {
	kurs := make(map[string]Currency)

	rows, err := document.Search("//table[@id='ctl00_PlaceHolderMain_biWebKursTransaksiBI_GridView1']/tr")
	if err != nil {
		return kurs, err
	}

	for i, tr := range rows {
		if i == 0 {
			continue
		}

		var matauang, nilai, kursjual, kursbeli string
		// t counts the non-empty cells seen so far in this row; whitespace-only
		// nodes between cells therefore do not shift the column index.
		t := 0
		for td := tr.FirstChild(); td != nil; td = td.NextSibling() {
			teks := strings.TrimSpace(td.Content())
			if teks == "" {
				continue
			}
			switch t {
			case 0:
				matauang = teks
			case 1:
				nilai = teks
			case 2:
				kursjual = teks
			case 3:
				kursbeli = teks
			}
			t++
		}

		// Store the row once, after all of its cells have been read.
		if t > 0 {
			kurs[matauang] = Currency{Nilai: nilai, KursJual: kursjual, KursBeli: kursbeli}
		}
	}

	return kurs, nil
}
Пример #3
0
/*
 * Handles <embed> tags.
 *
 * For every <embed> element the value of its "type" attribute is resolved to a
 * plugin name via resolveMIMETypePlugin. If the Generator has an exported
 * method whose name equals the title-cased plugin name (like Markdown), that
 * method is called through reflection with the element and the document;
 * otherwise the element is passed to handleMIMETypePlugin.
 *
 * An <embed> without a "type" attribute is resolved with an empty MIME type
 * instead of causing a nil pointer dereference.
 *
 * Processing stops at the first error, which is returned.
 *
 * NOTE(review): the reflected method is assumed to accept (element, document)
 * and to return an error as its first result; a method with another signature
 * still panics inside Call.
 */
func (gen *Generator) handleEmbedTags(doc *html.HtmlDocument) (err error) {
	result, err := doc.Search("//embed")
	if err != nil {
		return err
	}
	for _, e := range result {
		mimeType := ""
		if attr := e.Attribute("type"); attr != nil {
			mimeType = attr.Value()
		}
		plugin := gen.resolveMIMETypePlugin(mimeType)

		// strings.Title is deprecated in current Go releases; it is kept because
		// its replacement is not part of the standard library.
		method := reflect.ValueOf(gen).MethodByName(strings.Title(plugin))
		if !method.IsValid() {
			// MethodByName returns the zero Value when no such method exists.
			err = gen.handleMIMETypePlugin(e, doc)
		} else {
			out := method.Call([]reflect.Value{reflect.ValueOf(e), reflect.ValueOf(doc)})
			if len(out) > 0 {
				if callErr, ok := out[0].Interface().(error); ok {
					err = callErr
				}
			}
		}
		if err != nil {
			return err
		}
	}
	return nil
}
Пример #4
0
/*
 * Returns the content of the first child node of the first H1 tag as page
 * title.
 *
 * The title is empty when the search fails, when the document has no <h1>, or
 * when the first <h1> has no child nodes.
 */
func (gen *Generator) getTitle(doc *html.HtmlDocument) (title string) {
	// The signature has no error result, so a failed search simply yields an
	// empty title.
	result, err := doc.Search("//h1")
	if err != nil || len(result) == 0 {
		return ""
	}
	first := result[0].FirstChild()
	if first == nil {
		// Empty heading such as <h1></h1>.
		return ""
	}
	return first.Content()
}
Пример #5
0
/*
 * Removes unnecessary paragraph HTML tags generated during Markdown processing by
 * deleting any <p> without non-blank child text nodes (just to avoid deletion if
 * semantic tags are inside).
 *
 * The children of a deleted <p> are handed to its parent with AddChild before
 * the <p> itself is removed. The first error reported by the search or by
 * AddChild is returned.
 *
 * NOTE(review): AddChild appears to append at the end of the parent rather than
 * at the position of the removed <p>; confirm this ordering is intended.
 */
func (gen *Generator) cleanUnnecessaryPTags(doc *html.HtmlDocument) (err error) {
	ps, err := doc.Search("//p")
	if err != nil {
		return err
	}
	for _, p := range ps {
		hasText := false
		for child := p.FirstChild(); child != nil; child = child.NextSibling() {
			if child.NodeType() != xml.XML_TEXT_NODE {
				continue
			}
			// Little heuristic to treat visually empty text nodes as absent.
			if strings.TrimSpace(child.Content()) != "" {
				hasText = true
				break
			}
		}
		if hasText {
			continue
		}

		// No meaningful text: move every child to the parent, then drop the <p>.
		parent := p.Parent()
		child := p.FirstChild()
		for child != nil {
			// The next sibling must be read before the move: once the child has
			// been re-parented its siblings are those of the new location, and
			// the remaining children of <p> would be lost with p.Remove().
			next := child.NextSibling()
			if err = parent.AddChild(child); err != nil {
				return err
			}
			child = next
		}
		p.Remove()
	}
	return nil
}
Пример #6
0
// parseMandiriHtml collects currency rows from the table with class
// "tbl-view".
//
// Only rows 1 through 15 are read; row 0 and everything after row 15 are
// skipped (presumably header and trailing non-rate rows; verify against the
// page). Within a row the non-empty cells are counted from zero: cell 1 is the
// currency code, cell 2 is KursBeli and cell 3 is KursJual. Cell 0 and any
// cells after the fourth are ignored. Nilai is always "1.00".
//
// A row is stored only when both KursJual and KursBeli were found in that same
// row; the values are scoped per row so an incomplete row cannot pick up the
// rates of the row before it.
//
// If the XPath search fails, the (empty) map is returned with the error.
func parseMandiriHtml(document *html.HtmlDocument) (map[string]Currency, error) {
	const nilai = "1.00"

	kurs := make(map[string]Currency)

	rows, err := document.Search("//table[@class='tbl-view']/tr")
	if err != nil {
		return kurs, err
	}

	for i, tr := range rows {
		if i == 0 || i > 15 {
			continue
		}

		var matauang, kursjual, kursbeli string
		// t counts the non-empty cells seen so far in this row.
		t := 0
		for td := tr.FirstChild(); td != nil; td = td.NextSibling() {
			teks := strings.TrimSpace(td.Content())
			if teks == "" {
				continue
			}
			switch t {
			case 1:
				matauang = teks
			case 2:
				kursbeli = teks
			case 3:
				kursjual = teks
			}
			t++
		}

		if kursjual != "" && kursbeli != "" {
			kurs[matauang] = Currency{Nilai: nilai, KursJual: kursjual, KursBeli: kursbeli}
		}
	}

	return kurs, nil
}
Пример #7
0
// getLastUpdatedMandiri extracts the "last updated" text from the first
// <p class="catatan"> element.
//
// The inner HTML up to the first "<br>" is split on single spaces and the
// words at positions 2 through 6 are returned joined by single spaces.
//
// An empty string is returned when the search fails, when the paragraph is
// missing, or when that first line has fewer than seven space-separated words.
func getLastUpdatedMandiri(document *html.HtmlDocument) string {
	notes, err := document.Search("//p[@class='catatan']")
	if err != nil || len(notes) == 0 {
		return ""
	}

	firstLine := strings.SplitN(notes[0].InnerHtml(), "<br>", 2)[0]
	words := strings.Split(firstLine, " ")
	if len(words) < 7 {
		return ""
	}

	return strings.Join(words[2:7], " ")
}
Пример #8
0
// matchDocNode checks that the first node selected by xpath matches the
// regular expression str.
//
// The node is serialized with String() and tested with regexp.MatchString. On
// a match c.Success(1.0) is called; otherwise c.Fail is called with a message
// describing the reason: search error, no node found, invalid pattern, or no
// match. The document is returned unchanged in every case.
func (c *Client) matchDocNode(doc *gokogirihtml.HtmlDocument, xpath string, str string) *gokogirihtml.HtmlDocument {
	nodes, nodeerr := doc.Search(xpath)
	if nodeerr != nil {
		c.Fail("element search error")
		return doc
	}
	if len(nodes) == 0 {
		c.Fail("element is not found: %s", xpath)
		return doc
	}

	matched, matcherr := regexp.MatchString(str, nodes[0].String())
	if matcherr != nil {
		// An uncompilable pattern is a different failure than a mismatch;
		// report it as such instead of hiding it behind the generic message.
		c.Fail("invalid pattern %s: %s", str, matcherr.Error())
		return doc
	}
	if !matched {
		c.Fail("%s match %s", xpath, str)
		return doc
	}

	c.Success(1.0)
	return doc
}
Пример #9
0
// getCandidates scores the parents and grandparents of all <p> and <td>
// elements whose text is at least minLen bytes long.
//
// Each qualifying element contributes a score of 1, plus the number of
// comma-separated pieces of its text, plus one point per full 100 bytes of
// text (at most 3). The parent receives the whole score, a valid grandparent
// receives half of it. Candidates are keyed by the String() form of their
// node.
//
// After all elements have been scored, every candidate's score is scaled once
// by (1 - linkDensity(node)). Doing this inside the element loop would
// re-scale all existing candidates for every element, shrinking scores
// repeatedly and making the function quadratic.
//
// An error from the XPath search is returned with a nil map.
func getCandidates(doc *html.HtmlDocument, minLen int) (map[string]*Candidate, error) {
	candidates := make(map[string]*Candidate)

	paragraphs, err := doc.Search(`//p|//td`)
	if err != nil {
		return nil, err
	}

	for _, elem := range paragraphs {
		text := elem.Content()
		if len(text) < minLen {
			continue
		}

		sc := 1.0
		sc += float64(len(strings.Split(text, ",")))
		// Integer division: only complete blocks of 100 bytes count.
		sc += math.Min(float64(len(text)/100), 3.0)

		parent := elem.Parent()
		// Build the key once; String() is reused for lookup and insert.
		parentKey := parent.String()
		parentCand, found := candidates[parentKey]
		if !found {
			parentCand = newCadidate(parent)
			candidates[parentKey] = parentCand
		}
		parentCand.score += sc

		grandParent := parent.Parent()
		if grandParent != nil && grandParent.IsValid() {
			grandKey := grandParent.String()
			grandCand, found := candidates[grandKey]
			if !found {
				grandCand = newCadidate(grandParent)
				candidates[grandKey] = grandCand
			}
			grandCand.score += sc / 2.0
		}
	}

	// Scale the final scores by link density, exactly once per candidate.
	for _, candidate := range candidates {
		candidate.score = candidate.score * (1 - linkDensity(candidate.node))
	}

	return candidates, nil
}
Пример #10
0
// CheckAssets calls s.GetAsset concurrently for every asset referenced by the
// document and returns once all of those calls have finished.
//
// The inspected elements are <link> (href), <script> (src) and <img> (src);
// elements whose attribute is empty are skipped. s.Path is parsed as the base
// URL handed to GetAsset; if it cannot be parsed nothing is done. A failed
// search for one element kind only skips that kind.
func (s *Scenario) CheckAssets(w *Worker, doc *html.HtmlDocument) {
	base, err := url.Parse(s.Path)
	if err != nil {
		return
	}

	// XPath query and the attribute holding the asset location, in the order
	// in which they are processed.
	targets := []struct {
		query string
		attr  string
	}{
		{"//link", "href"},
		{"//script", "src"},
		{"//img", "src"},
	}

	var pending sync.WaitGroup
	for _, target := range targets {
		nodes, searchErr := doc.Search(target.query)
		if searchErr != nil {
			continue
		}
		for _, node := range nodes {
			if node.Attr(target.attr) == "" {
				continue
			}
			pending.Add(1)
			// Node and attribute are passed as arguments so each goroutine
			// works on its own copies.
			go func(node xml.Node, attr string) {
				s.GetAsset(w, base, node, attr)
				pending.Done()
			}(node, target.attr)
		}
	}

	pending.Wait()
}
Пример #11
0
Файл: bi.go Проект: iorme/kurs
// getLastUpdatedBi returns the inner HTML of the span with id
// ctl00_PlaceHolderMain_biWebKursTransaksiBI_lblUpdate.
//
// An empty string is returned when the search fails or the span is not present
// in the document.
func getLastUpdatedBi(document *html.HtmlDocument) string {
	span, err := document.Search("//span[@id='ctl00_PlaceHolderMain_biWebKursTransaksiBI_lblUpdate']")
	if err != nil || len(span) == 0 {
		return ""
	}

	return span[0].InnerHtml()
}