func Tokenize() { extension := ".html" directory := "" ss := util.GetFilesByExtension(directory, extension, false) pss := stringspb.IndentedDump(ss) pf("%v \n\n", *pss) if len(ss) < 1 { pf("did not find any files with %q\n", extension) return } ss = ss[0:1] for i := 0; i < len(ss); i++ { sb, err := ioutil.ReadFile(ss[i]) if err != nil { pf("%v \n", err) } r := bytes.NewReader(sb) b, err := cleanseHtml(r) if err != nil { pf("%v \n", err) } util.WriteBytesToFilename("xx_"+ss[i], b) // pf("\n\n") r = bytes.NewReader(b.Bytes()) decomposeHtml(r) } }
func ParseHtmlFiles() { testDataDir := "./" testFiles, err := filepath.Glob(testDataDir + "test*.html") if err != nil { pf("%v \n", err) } for _, tf := range testFiles { pf("%v\n", tf) f, err := os.Open(tf) if err != nil { pf("1 %v \n", err) } defer f.Close() r1 := bufio.NewReader(f) var docRoot *html.Node docRoot, err = html.Parse(r1) if err != nil { pf("3 %v \n", err) } fRecurse = func(n *html.Node) { if n.Type == html.ElementNode && n.Data == "a" { s := strings.TrimSpace(fNodeModify(n)) //pf("found %v\n", s) nNew := new(html.Node) nNew.Type = html.TextNode nNew.Data = s // We want to remove all children. // Direct loop impossible, since "NextSibling" is set to nil // during Remove(). // Therefore first assembling separately, then removing. children := map[*html.Node]string{} for c := n.FirstChild; c != nil; c = c.NextSibling { children[c] = "xx" } for k, _ := range children { n.RemoveChild(k) // pf(" removed %q\n", strings.TrimSpace(k.Data)) } n.AppendChild(nNew) } for c := n.FirstChild; c != nil; c = c.NextSibling { fRecurse(c) } } fRecurse(docRoot) var b bytes.Buffer html.Render(&b, docRoot) util.WriteBytesToFilename("yy_"+tf, &b) //fixedHtml := b.String() //fmt.Printf("%s \n", spew.Sdump(docRoot)) } }