Example 1
// WriteFile writes b to fn, creating the containing directory first.
func WriteFile(fs fsi.FileSystem, fn string, b []byte) error {

	dir, _ := fs.SplitX(fn)

	err := fs.MkdirAll(dir, os.ModePerm)
	if err != nil && err != fsi.ErrFileExists {
		return err
	}

	return fs.WriteFile(fn, b, 0)
}
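
For comparison, here is the same ensure-directory-then-write pattern against the standard library alone; a minimal sketch, where the helper name writeFileOS and the 0o755/0o644 permission bits are illustrative choices, not part of the fsi API.

package main

import (
	"os"
	"path/filepath"
)

// writeFileOS mirrors WriteFile above using only the standard library:
// create the parent directory, then write the file.
func writeFileOS(fn string, b []byte) error {
	// MkdirAll is a no-op when the directory already exists.
	if err := os.MkdirAll(filepath.Dir(fn), 0o755); err != nil {
		return err
	}
	return os.WriteFile(fn, b, 0o644)
}

func main() {
	if err := writeFileOS("tmp/demo/hello.txt", []byte("hello")); err != nil {
		panic(err)
	}
}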
Example 2
// saveDigest stamps treeX with the current time, marshals it to
// indented JSON and writes it to fnDigest, snappy-compressing the
// payload and switching the extension to .json.snappy.
func saveDigest(lg loghttp.FuncBufUniv, fs fsi.FileSystem, fnDigest string, treeX *DirTree) {

	treeX.LastFound = time.Now()

	b, err := json.MarshalIndent(treeX, "", "\t")
	lg(err)

	if len(b) > 1024*1024-1 || true { // `|| true` forces compression on, regardless of the 1 MB threshold
		b1 := snappy.Encode(nil, b)
		lg("digest encoded from %vkB to %vkB", len(b)/1024, len(b1)/1024)
		b = b1
		fnDigest = strings.Replace(fnDigest, ".json", ".json.snappy", -1)
	}

	err = fs.MkdirAll(path.Dir(fnDigest), 0755)
	lg(err)

	err = fs.WriteFile(fnDigest, b, 0755)
	lg(err)

}
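
Reading the digest back reverses these steps. A minimal sketch, assuming the same github.com/golang/snappy package; decodeDigest is a hypothetical helper, not part of the original code.

package main

import (
	"fmt"
	"strings"

	"github.com/golang/snappy"
)

// decodeDigest undoes the compression step above: a *.json.snappy
// payload is snappy-decoded, a plain *.json payload passes through.
func decodeDigest(fn string, b []byte) ([]byte, error) {
	if strings.HasSuffix(fn, ".json.snappy") {
		return snappy.Decode(nil, b)
	}
	return b, nil
}

func main() {
	enc := snappy.Encode(nil, []byte(`{"Name":"/"}`))
	dec, err := decodeDigest("digest2.json.snappy", enc)
	fmt.Println(string(dec), err) // {"Name":"/"} <nil>
}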
Example 3
// rssXMLFile fetches the RSS XML file from rssUrl, unmarshals it,
// and saves an indented dump of the result under docRoot.
func rssXMLFile(w http.ResponseWriter, r *http.Request, fs fsi.FileSystem, rssUrl string) (RSS, *url.URL) {

	lg, lge := loghttp.Logger(w, r)

	bts, respInf, err := fetch.UrlGetter(r, fetch.Options{URL: rssUrl})
	lge(err)

	bts = bytes.Replace(bts, []byte("content:encoded>"), []byte("content-encoded>S"), -1) // hack: flatten the content: namespace prefix so xml.Unmarshal can match the tag

	rssDoc := RSS{}
	err = xml.Unmarshal(bts, &rssDoc)
	lge(err)

	// save it
	bdmp := stringspb.IndentedDumpBytes(rssDoc)
	err = fs.MkdirAll(path.Join(docRoot, respInf.URL.Host), 0755)
	lge(err)
	err = fs.WriteFile(path.Join(docRoot, respInf.URL.Host, "outp_rss.xml"), bdmp, 0755)
	lge(err)
	lg("RSS resp size %5.2vkB, saved to %v", len(bdmp)/1024, respInf.URL.Host+"/outp_rss.xml")

	return rssDoc, respInf.URL
}
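
The bytes.Replace hack sidesteps the content: namespace prefix. As a hedged alternative sketch: encoding/xml can match a namespaced element directly when the struct tag pairs the namespace URL (here the standard RSS content module) with the local name, so no byte-level rewriting is needed. The item type below is illustrative, not the package's RSS type.

package main

import (
	"encoding/xml"
	"fmt"
)

// item matches <content:encoded> via the "namespace-URL local-name"
// form of the xml struct tag.
type item struct {
	Title   string `xml:"title"`
	Encoded string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
}

func main() {
	raw := []byte(`<item xmlns:content="http://purl.org/rss/1.0/modules/content/">
		<title>t</title><content:encoded>body</content:encoded></item>`)
	it := item{}
	err := xml.Unmarshal(raw, &it)
	fmt.Println(it.Encoded, err) // body <nil>
}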
Example 4
// CreateSys exercises a FileSystem implementation: it creates a
// directory tree, writes files into it and reads everything back.
// It returns the log buffer and a report of got/want mismatches;
// an empty report means success.
func CreateSys(fs fsi.FileSystem) (*bytes.Buffer, string) {

	bb := new(bytes.Buffer)
	wpf(bb, "--------create-dirs---------\n")

	fc1 := func(p []string) {
		path := pth.Join(p...)
		err := fs.MkdirAll(relOpt+path, os.ModePerm)
		if err != nil {
			wpf(bb, "MkdirAll failed %v\n", err)
		}
	}

	fc1([]string{"ch1"})
	fc1([]string{"ch1", "ch2"})
	fc1([]string{"ch1", "ch2", "ch3"})
	fc1([]string{"ch1", "ch2", "ch3", "ch4"})
	fc1([]string{"ch1", "ch2a"})
	fc1([]string{"ch1A"})
	fc1([]string{"ch1B"})
	fc1([]string{"d1", "d2", "d3_secretdir", "neverwalked"})
	fc1([]string{"d1", "d2", "d3a", "willwalk"})

	wpf(bb, "\n--------retrieve-dirs---------\n")

	// retrieval
	gotByPath := 0
	wntByPath := 5
	fc2 := func(p []string) {
		path := pth.Join(p...)
		wpf(bb, "searching... %q\n", path)
		f, err := fs.Lstat(relOpt + path)
		if err != nil {
			wpf(bb, "   nothing retrieved - err %v\n", err)
		} else {
			wpf(bb, "   fnd %v \n", pth.Join(path, f.Name()))
			gotByPath++
		}
	}
	fc2([]string{"ch1"})
	fc2([]string{"ch1", "ch2"})
	fc2([]string{"ch1", "non-exist-dir"})
	fc2([]string{"ch1", "ch2", "ch3"})
	fc2([]string{"ch1A"})
	fc2([]string{rel})

	wpf(bb, "\nfnd %v of %v dirs \n", gotByPath, wntByPath)

	wpf(bb, "\n-------create and save some files----\n")

	fc4a := func(name, content string) {
		err := fs.WriteFile(relOpt+name, []byte(content), os.ModePerm)
		if err != nil {
			wpf(bb, "WriteFile %v failed %v\n", name, err)
		}
	}
	fc4b := func(name, content string) {
		f, err := fs.Create(relOpt + name)
		if err != nil {
			wpf(bb, "Create %v failed %v\n", name, err)
			return
		}
		_, err = f.WriteString(content)
		if err != nil {
			wpf(bb, "WriteString %v failed %v\n", name, err)
		}
		err = f.Close()
		if err != nil {
			wpf(bb, "Close %v failed %v\n", name, err)
		}
	}

	fc4a("ch1/ch2/file_1", "content 1")
	fc4b("ch1/ch2/file_2", "content 2")
	fc4a("ch1/ch2/ch3/file3", "another content")
	fc4b(relPsep+"file4", "chq content 2")

	// fsc, ok := memfs.Unwrap(fs)
	// if ok {
	// 	fsc.Dump()
	// }
	// return bb, ""

	wpf(bb, "\n-------retrieve files again----\n\n")

	gotNumFiles := 0
	wntNumFiles := 4
	gotSizeFiles := 0
	wntSizeFiles := 9 + 9 + 15 + 13 // lengths of the four file contents written above

	fc5 := func(path string) {
		files, err := fs.ReadDir(relOpt + path)
		if err != nil {
			wpf(bb, "filesByPath %v failed %v\n", path, err)
		}
		wpf(bb, " srch %-20q yielded %v dirs+files\n", relOpt+path, len(files))

		for k, v := range files {
			if v.IsDir() {
				wpf(bb, "   skip dir %v \n", v.Name())
				continue
			}
			data, err := fs.ReadFile(pth.Join(path, v.Name()))
			if err != nil {
				wpf(bb, "could not get content of %v =>  %v\n", pth.Join(path, v.Name()), err)
			}
			wpf(bb, "     %v  -  %v %s\n", k, pth.Join(path, v.Name()), data)
			gotNumFiles++
			gotSizeFiles += len(data)
		}
	}

	fc5("ch1/ch2")
	fc5("ch1/ch2/ch3")
	fc5(rel)

	wpf(bb, "\n")

	wpf(bb, "fnd %2v of %2v fils \n", gotNumFiles, wntNumFiles)
	wpf(bb, "fnd %2v of %2v fsize \n", gotSizeFiles, wntSizeFiles)
	wpf(bb, "\n")

	testRes := ""
	if gotNumFiles != wntNumFiles {
		testRes += spf("Create files num :   wnt %2v - got %v\n", wntNumFiles, gotNumFiles)
	}
	if gotSizeFiles != wntSizeFiles {
		testRes += spf("Create files size:   wnt %2v - got %v\n", wntSizeFiles, gotSizeFiles)
	}
	return bb, testRes
}
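
A minimal sketch of how this self-check might be driven from a Go test, assuming the test lives alongside CreateSys; the memfs.New() constructor is only suggested by the commented-out memfs.Unwrap call above, not a confirmed API.

package fstests // assumed: the package containing CreateSys

import (
	"testing"

	"github.com/pbberlin/tools/os/fsi/memfs" // assumed import path
)

// TestCreateSys runs the self-check against an in-memory filesystem.
// A non-empty second return value is a got/want mismatch report.
func TestCreateSys(t *testing.T) {
	fs := memfs.New() // hypothetical constructor for a memory-backed fsi.FileSystem
	bb, mismatch := CreateSys(fs)
	t.Log(bb.String()) // full trace: dirs created, files written and read back
	if mismatch != "" {
		t.Errorf("CreateSys: %s", mismatch)
	}
}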
Example 5
// FetchUsingRSS takes an RSS XML URI and fetches some of its documents.
// It uses a three-stage pipeline for parallel fetching.
// Results are stored into the given filesystem fs.
// Config points to the source of the RSS XML
// and has some rules for conflating URI directories.
// config.SearchPrefix and config.DesiredNumber tell the func
// which subdirs of the RSS dir should be fetched - and how many at most.
func FetchUsingRSS(w http.ResponseWriter, r *http.Request,
	fs fsi.FileSystem, config FetchCommand,
) {

	lg, b := loghttp.BuffLoggerUniversal(w, r)
	closureOverBuf := func(bUnused *bytes.Buffer) {
		loghttp.Pf(w, r, b.String())
	}
	defer closureOverBuf(b) // the argument is ignored; the closure reads b directly

	if config.Host == "" {
		lg(" empty host; returning")
		return
	}

	config = addDefaults(config)

	// Fetching the rssXML takes time.
	// We do it before the timeouts of the pipeline stages are set off.
	lg(" ")
	lg(config.Host)
	if config.Host == "test.economist.com" {
		switchTData(w, r)
	}

	// lg(stringspb.IndentedDump(config))
	dirTree := &DirTree{Name: "/", Dirs: map[string]DirTree{}, EndPoint: true}

	fnDigest := path.Join(docRoot, config.Host, "digest2.json")
	loadDigest(w, r, lg, fs, fnDigest, dirTree) // previous

	age := time.Since(dirTree.LastFound)
	lg("DirTree is %5.2v hours old (%v)", age.Hours(), dirTree.LastFound.Format(time.ANSIC))
	if age.Hours() > 0.001 {

		rssUrl := matchingRSSURI(w, r, config)
		if rssUrl == "" {
			m := new(MyWorker)
			m.r = r
			m.lg = lg
			m.fs1 = fs
			m.SURL = path.Join(config.Host, config.SearchPrefix)
			_, _, _, err := fetchSave(m)
			lg(err)
			if err != nil {
				return
			}
		} else {
			rssUrl = path.Join(config.Host, rssUrl)
			rssDoc, rssUrlObj := rssXMLFile(w, r, fs, rssUrl)
			_ = rssUrlObj
			rssDoc2DirTree(w, r, dirTree, rssDoc, config.Host)
		}

		saveDigest(lg, fs, fnDigest, dirTree)
	}

	// lg(dirTree.String())
	//
	//
	// setting up a 3 staged pipeline from bottom up
	//
	var fullArticles []FullArticle

	inn := make(chan *FullArticle) // jobs are stuffed in here
	out := make(chan *FullArticle) // completed jobs are delivered here
	fin := make(chan struct{})     // downstream signals end to upstream
	var stage3Wait sync.WaitGroup

	// stage 3
	// fire up the "collector", a fan-in
	stage3Wait.Add(1) // register before spawning, so Wait() below cannot return early
	go func() {
		// 400 is a good value; critical point at 35.
		// economist.com required 800 ms.
		const delayInitial = 1200
		const delayRefresh = 800
		cout := time.After(time.Millisecond * delayInitial)
		for {
			select {

			case fa := <-out:
				fullArticles = append(fullArticles, *fa)
				pth := fetch.PathFromStringUrl(fa.Url)
				lg("    fetched   %v - %v ", fa.Mod.Format("15:04:05"), stringspb.Ellipsoider(pth, 50))
				cout = time.After(time.Millisecond * delayRefresh) // refresh timeout
			case <-cout:
				lg("timeout after %v articles", len(fullArticles))
				// we are using channel == nil - channel closed combinations
				// inspired by http://dave.cheney.net/2013/04/30/curious-channels
				out = nil // not close(out) => case above is now blocked
				close(fin)
				lg("fin closed; out nilled")
				stage3Wait.Done()
				return
			}
		}
	}()

	//
	// stage 2
	for i := 0; i < numWorkers; i++ {
		// fire up a dedicated fetcher routine, a worker
		// we are using channel == nil - channel closed combinations
		// inspired by http://dave.cheney.net/2013/04/30/curious-channels
		go func() {
			var a *FullArticle
			for {
				select {
				case a = <-inn:
					var err error
					var inf fetch.Info
					a.Body, inf, err = fetch.UrlGetter(r, fetch.Options{URL: a.Url})
					lg(err)
					if a.Mod.IsZero() {
						a.Mod = inf.Mod
					}
					select {
					case out <- a:
					case <-fin:
						lg("    worker spinning down; branch 1; abandoning %v", a.Url)
						return
					}
					a = new(FullArticle)
				case <-fin:
					if a != nil && a.Url != "" {
						u, _ := url.Parse(a.Url)
						lg("    abandoned %v", u.Path)
					} else {
						lg("    worker spinning down; branch 2")
					}
					return
				}
			}
		}()
	}

	//
	//
	//
	// loading stage 1
	uriPrefix := config.SearchPrefix
	found := 0
	uriPrefixExcl := "impossible"
	for i := 0; i < 15; i++ {
		lg("  searching for prefix   %v    - excl %q    - %v of %v", uriPrefix, uriPrefixExcl, found, config.DesiredNumber)
		found += stuffStage1(w, r, config, inn, fin, dirTree,
			uriPrefixExcl, uriPrefix, config.DesiredNumber-found)

		if found >= config.DesiredNumber {
			break
		}

		if uriPrefix == "/" || uriPrefix == "." {
			lg("  root exhausted")
			break
		}

		newPrefix := path.Dir(uriPrefix)
		uriPrefixExcl = uriPrefix
		uriPrefix = newPrefix
	}
	lg("  found %v of %v", found, config.DesiredNumber)

	//
	lg("stage3Wait.Wait() before")
	stage3Wait.Wait()
	lg("stage3Wait.Wait() after")

	// workers spin down earlier -
	// but the appengine log writer and the response writer need some time
	// to record the spin-down messages
	time.Sleep(120 * time.Millisecond)

	// compile out directory statistics
	histoDir := map[string]int{}
	for _, a := range fullArticles {
		u, err := url.Parse(a.Url)
		lg(err)
		semanticUri := condenseTrailingDir(u.Path, config.CondenseTrailingDirs)
		dir := path.Dir(semanticUri)
		histoDir[dir]++
	}
	sr := sortmap.SortMapByCount(histoDir)
	_ = sr

	// Create dirs
	for k := range histoDir {
		dir := path.Join(docRoot, k) // config.Host already contained in k
		err := fs.MkdirAll(dir, 0755)
		lg(err)
		err = fs.Chtimes(dir, time.Now(), time.Now())
		lg(err)
	}

	// Saving as files
	for _, a := range fullArticles {
		if len(a.Body) == 0 {
			continue
		}
		u, err := url.Parse(a.Url)
		lg(err)
		if err != nil {
			continue
		}
		u.Fragment = ""
		u.RawQuery = ""
		semanticUri := condenseTrailingDir(u.RequestURI(), config.CondenseTrailingDirs)
		p := path.Join(docRoot, semanticUri)
		err = fs.WriteFile(p, a.Body, 0644)
		lg(err)
		err = fs.Chtimes(p, a.Mod, a.Mod)
		lg(err)
	}

	{
		b, err := json.MarshalIndent(histoDir, "  ", "\t")
		lg(err)
		fnDigest := path.Join(docRoot, config.Host, "fetchDigest.json")
		err = fs.WriteFile(fnDigest, b, 0755)
		lg(err)
	}

	// fsm, ok := memfs.Unwrap(fs)
	// if ok {
	// 	fsm.Dump()
	// }

}
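
Both the collector and the workers lean on the nil-channel / closed-channel combinations from the Dave Cheney post cited in the comments: receiving from a closed channel never blocks, so close(fin) broadcasts shutdown, while setting a channel variable to nil disables its case in a select. A self-contained sketch of that idiom in its canonical fan-in form; merge is an illustrative helper, not part of the package.

package main

import "fmt"

// merge fans in two channels; when a source is drained it is set to
// nil, which disables its case in the select instead of spinning on
// the zero values a closed channel would yield.
func merge(a, b <-chan int) <-chan int {
	out := make(chan int)
	go func() {
		defer close(out)
		for a != nil || b != nil {
			select {
			case v, ok := <-a:
				if !ok {
					a = nil // closed: disable this case
					continue
				}
				out <- v
			case v, ok := <-b:
				if !ok {
					b = nil // closed: disable this case
					continue
				}
				out <- v
			}
		}
	}()
	return out
}

func main() {
	a, b := make(chan int), make(chan int)
	go func() { a <- 1; a <- 2; close(a) }()
	go func() { b <- 10; close(b) }()
	for v := range merge(a, b) {
		fmt.Println(v)
	}
}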