func Tokenize() { extension := ".html" directory := "" ss := util.GetFilesByExtension(directory, extension, false) pss := util.IndentedDump(ss) pf("%v \n\n", *pss) if len(ss) < 1 { pf("did not find any files with %q\n", extension) return } ss = ss[0:1] for i := 0; i < len(ss); i++ { sb, err := ioutil.ReadFile(ss[i]) if err != nil { pf("%v \n", err) } r := bytes.NewReader(sb) b, err := cleanseHtml(r) if err != nil { pf("%v \n", err) } util.WriteBytesToFilename("xx_"+ss[i], b) // pf("\n\n") r = bytes.NewReader(b.Bytes()) decomposeHtml(r) } }
func Fetch(amount int) { go func() { for { pfa := <-c fa := *pfa fullArticles = append(fullArticles, fa) pf("done fetching %v \n", fa.URL[27:]) } }() // cx := appengine.NewContext(r) // cl := urlfetch.Client(cx) cl := http.DefaultClient resp, err := cl.Get("http://www.handelsblatt.com/contentexport/feed/schlagzeilen") if err != nil { pf("%v\n", err) } bcntent, err := ioutil.ReadAll(resp.Body) defer resp.Body.Close() if err != nil { pf("%v\n", err) } bcntent = bytes.Replace(bcntent, []byte("content:encoded>"), []byte("content-encoded>S"), -1) // scntent := string(bcntent) // pf("size: %v \n%v\n", len(scntent), util.Ellipsoider(scntent, 1450)) var rssDoc RSS err = xml.Unmarshal(bcntent, &rssDoc) if err != nil { pf("%v\n", err) } ps := util.IndentedDump(rssDoc) s := *ps pf("- %v - \n%v\n", len(s), s[:util.Min(1600, len(s)-1)]) items := rssDoc.Items for i := 0; i < len(items.ItemList); i++ { lpItem := items.ItemList[i] pf("%v: %v - %v\n", i, lpItem.Published[5:22], lpItem.Link) go func(argURL string) { cl := http.DefaultClient resp, err := cl.Get(argURL) if err != nil { pf(" full art %v %v\n", argURL, err) } bcntent, err := ioutil.ReadAll(resp.Body) defer resp.Body.Close() if err != nil { pf(" full art %v %v\n", argURL, err) } fa := FullArticle{} fa.URL = argURL fa.Body = &bcntent c <- &fa }(lpItem.Link) if i+1 >= amount { break } } time.Sleep(4 * time.Second) pf("\n\n\n") for i := 0; i < len(fullArticles); i++ { lpFa := fullArticles[i] indir := *fullArticles[i].Body // pf("%v: %v\n\n", lpFa.URL[27:], util.Ellipsoider(string(indir), 200)) fileName := lpFa.URL fileName = strings.Replace(fileName, "https://", "", -1) fileName = strings.Replace(fileName, "http://", "", -1) pf("%v\n", fileName) fileName = fileName[strings.Index(fileName, "/")+1:] fileName = strings.Replace(fileName, "/", "--", 1) pf("%v\n", fileName) nextSlash := strings.Index(fileName, "/") if nextSlash > 0 { fileName = fileName[:strings.Index(fileName, "/")] fileName += ".html" } pf("%v\n", fileName) f, err := os.Create(fileName) if err != nil { pf(" file open %v %v\n", fileName, err) } defer f.Close() n2, err := f.Write(indir) pf("wrote %d bytes - err |%v| \n", n2, err) } }