func DiveToDeepestMatch(dirTree *DirTree, uriPrefixIncl string) (*DirTree, string) { var subtree *DirTree subtree = dirTree head, dir, remainder := "", "", uriPrefixIncl if uriPrefixIncl == "/" || uriPrefixIncl == "" || uriPrefixIncl == "." { // exception for root head = "" // not "/" } else { // recur deeper for { dir, remainder, _ = osutilpb.PathDirReverse(remainder) head += dir // lg(" %-10q %-10q %-10q - head - dir - remainder", head, dir, remainder) if newSubtr, ok := subtree.Dirs[dir]; ok { subtree = &newSubtr // lg(" recursion found %-10v %-10v for %v (%v)", dir, subtree.Name, uriPrefixIncl, subtree.Name) } else { // lg(" recursion failed %-10v %-10v for %v (%v)", dir, subtree.Name, uriPrefixIncl, subtree.Name) // Calling off searching on this level // StuffStage() itsself can step up if it wants to // // *not* setting subtree = nil // would keep us one level higher than this level. subtree = nil break } if remainder == "" { break } } } return subtree, head }
func path2DirTree(lg loghttp.FuncBufUniv, treeX *DirTree, articles []FullArticle, domain string, IsRSS bool) { if treeX == nil { return } var trLp *DirTree trLp = treeX pfx1 := "http://" + domain pfx2 := "https://" + domain for _, art := range articles { href := art.Url if art.Mod.IsZero() { art.Mod = time.Now() } href = strings.TrimPrefix(href, pfx1) href = strings.TrimPrefix(href, pfx2) if strings.HasPrefix(href, "/") { // ignore other domains parsed, err := url.Parse(href) lg(err) href = parsed.Path // lg("%v", href) trLp = treeX // lg("trLp is %v", trLp.String()) dir, remainder, remDirs := "", href, []string{} lvl := 0 for { dir, remainder, remDirs = osutilpb.PathDirReverse(remainder) if dir == "/" && remainder == "" { // skip root break } if lvl > 0 { trLp.Name = dir // lvl==0 => root } trLp.LastFound = art.Mod.Truncate(time.Minute) // lg(" %v, %v", dir, remainder) // New creation if _, ok := trLp.Dirs[dir]; !ok { if IsRSS { trLp.Dirs[dir] = DirTree{Name: dir, Dirs: map[string]DirTree{}, SrcRSS: true} } else { trLp.Dirs[dir] = DirTree{Name: dir, Dirs: map[string]DirTree{}} } } // We "cannot assign" to map struct directly: // trLp.Dirs[dir].LastFound = art.Mod // fails with "cannot assign" addressable := trLp.Dirs[dir] addressable.LastFound = art.Mod.Truncate(time.Minute) // We can rely that the *last* dir or html is an endpoint. // We cannot tell about higher paths, unless explicitly linked somewhere // Previous distinction between RSS URLs and crawl URLs dropped if len(remDirs) < 1 { addressable.EndPoint = true } if dir == "/2015" || dir == "/08" || dir == "/09" { addressable.EndPoint = true } trLp.Dirs[dir] = addressable trLp = &addressable if remainder == "" { // lg("break\n") break } lvl++ } } } }