Esempio n. 1
0
// Tokenize asks util.GetFilesByExtension for the ".html" files of the
// directory "" and prints the resulting list. The first hit is then read,
// passed through cleanseHtml, handed to util.WriteBytesToFilename under the
// name "xx_"+<file name>, and finally fed to decomposeHtml.
//
// Errors are reported through pf. A file that cannot be read or cleansed is
// skipped, so no later step runs on missing data.
func Tokenize() {

	extension := ".html"
	directory := ""

	ss := util.GetFilesByExtension(directory, extension, false)
	pss := stringspb.IndentedDump(ss)
	pf("%v \n\n", *pss)

	if len(ss) < 1 {
		pf("did not find any files with %q\n", extension)
		return
	}

	// Restrict processing to the first file found.
	ss = ss[0:1]

	for _, fn := range ss {
		sb, err := ioutil.ReadFile(fn)
		if err != nil {
			pf("%v \n", err)
			continue // nothing to cleanse without the file content
		}

		b, err := cleanseHtml(bytes.NewReader(sb))
		if err != nil {
			pf("%v \n", err)
			continue // b is not usable after a failure; do not write or decompose it
		}

		util.WriteBytesToFilename("xx_"+fn, b)

		pf("\n\n")
		// Decompose the cleansed output, not the raw file content.
		decomposeHtml(bytes.NewReader(b.Bytes()))
	}
}
Esempio n. 2
0
// ParseHtmlFiles processes every file matching "./test*.html". Each file is
// parsed as HTML, the children of every <a> element are replaced by a single
// text node, and the modified tree is rendered and handed to
// util.WriteBytesToFilename under the name "yy_"+<matched path>.
//
// The text of the replacement node is the whitespace-trimmed result of
// fNodeModify for that <a> element. Errors are reported through pf; a file
// that cannot be opened, parsed, or rendered is skipped.
func ParseHtmlFiles() {

	testDataDir := "./"
	testFiles, err := filepath.Glob(testDataDir + "test*.html")
	if err != nil {
		pf("%v \n", err)
		return
	}

	for _, tf := range testFiles {
		pf("%v\n", tf)

		f, err := os.Open(tf)
		if err != nil {
			pf("1 %v \n", err)
			continue
		}

		docRoot, err := html.Parse(bufio.NewReader(f))
		// Close as soon as parsing is done. A defer here would keep every
		// file open until the whole function returns.
		if cerr := f.Close(); cerr != nil {
			pf("2 %v \n", cerr)
		}
		if err != nil {
			pf("3 %v \n", err)
			continue
		}

		// fRecurse is declared outside this function, which lets the
		// function literal refer to itself for the recursive descent.
		fRecurse = func(n *html.Node) {
			if n.Type == html.ElementNode && n.Data == "a" {
				// Compute the replacement text while the original children
				// are still attached.
				s := strings.TrimSpace(fNodeModify(n))
				nNew := new(html.Node)
				nNew.Type = html.TextNode
				nNew.Data = s

				// RemoveChild resets the sibling links of the removed node,
				// so walking via NextSibling is not possible. Instead detach
				// the current first child until no child is left.
				for c := n.FirstChild; c != nil; c = n.FirstChild {
					n.RemoveChild(c)
				}
				n.AppendChild(nNew)
			}
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				fRecurse(c)
			}
		}
		fRecurse(docRoot)

		var b bytes.Buffer
		if err := html.Render(&b, docRoot); err != nil {
			pf("4 %v \n", err)
			continue // do not write a partially rendered document
		}
		util.WriteBytesToFilename("yy_"+tf, &b)
	}
}