Example #1
func DiveToDeepestMatch(dirTree *DirTree, uriPrefixIncl string) (*DirTree, string) {

	subtree := dirTree
	head, dir, remainder := "", "", uriPrefixIncl

	if uriPrefixIncl == "/" || uriPrefixIncl == "" || uriPrefixIncl == "." {
		// exception for root
		head = "" // not "/"
	} else {
		// recur deeper
		for {
			dir, remainder, _ = osutilpb.PathDirReverse(remainder)
			head += dir
			// lg("    %-10q %-10q %-10q - head - dir - remainder", head, dir, remainder)
			if newSubtr, ok := subtree.Dirs[dir]; ok {
				subtree = &newSubtr
				// lg("    recursion found  %-10v %-10v for %v (%v)", dir, subtree.Name, uriPrefixIncl, subtree.Name)
			} else {
				// lg("    recursion failed %-10v %-10v for %v (%v)", dir, subtree.Name, uriPrefixIncl, subtree.Name)

				// Call off the search on this level;
				// StuffStage() itself can step up if it wants to.
				//
				// *not* setting subtree = nil
				// would keep us one level higher than this level.
				subtree = nil

				break
			}

			if remainder == "" {
				break
			}
		}
	}

	return subtree, head
}
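
Both examples peel path segments off the front of a URI with osutilpb.PathDirReverse. The sketch below is a minimal, hypothetical stand-in (splitOneDir) that only illustrates the contract assumed here — the leading segment with its slash, the remaining path, and the remaining segments as a slice; the real package may handle roots and trailing slashes differently.

// A minimal, hypothetical stand-in for osutilpb.PathDirReverse,
// shown only to illustrate the splitting contract the functions rely on.
package main

import (
	"fmt"
	"strings"
)

// splitOneDir returns the leading segment (with its slash), the remaining
// path, and the remaining segments as a slice.
func splitOneDir(p string) (dir, remainder string, remDirs []string) {
	parts := strings.Split(strings.TrimPrefix(p, "/"), "/")
	dir = "/" + parts[0]
	for _, s := range parts[1:] {
		remDirs = append(remDirs, "/"+s)
	}
	if len(parts) > 1 {
		remainder = strings.Join(remDirs, "")
	}
	return
}

func main() {
	head, remainder := "", "/2015/08/article.html"
	for {
		dir, rest, _ := splitOneDir(remainder)
		head += dir
		fmt.Printf("dir %-16q remainder %-22q head %q\n", dir, rest, head)
		if rest == "" {
			break
		}
		remainder = rest
	}
	// dir "/2015"          remainder "/08/article.html"   head "/2015"
	// dir "/08"            remainder "/article.html"      head "/2015/08"
	// dir "/article.html"  remainder ""                   head "/2015/08/article.html"
}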
Example #2
func path2DirTree(lg loghttp.FuncBufUniv, treeX *DirTree, articles []FullArticle, domain string, IsRSS bool) {

	if treeX == nil {
		return
	}
	trLp := treeX

	pfx1 := "http://" + domain
	pfx2 := "https://" + domain

	for _, art := range articles {
		href := art.Url
		if art.Mod.IsZero() {
			art.Mod = time.Now()
		}
		href = strings.TrimPrefix(href, pfx1)
		href = strings.TrimPrefix(href, pfx2)
		if strings.HasPrefix(href, "/") { // ignore other domains
			parsed, err := url.Parse(href)
			lg(err)
			if err != nil {
				continue // skip articles whose URL does not parse
			}
			href = parsed.Path
			// lg("%v", href)
			trLp = treeX
			// lg("trLp is %v", trLp.String())
			dir, remainder, remDirs := "", href, []string{}
			lvl := 0
			for {

				dir, remainder, remDirs = osutilpb.PathDirReverse(remainder)

				if dir == "/" && remainder == "" {
					// skip root
					break
				}

				if lvl > 0 { // lvl==0 => root
					trLp.Name = dir
				}
				trLp.LastFound = art.Mod.Truncate(time.Minute)

				// lg("   %v, %v", dir, remainder)

				// New creation
				if _, ok := trLp.Dirs[dir]; !ok {
					if IsRSS {
						trLp.Dirs[dir] = DirTree{Name: dir, Dirs: map[string]DirTree{}, SrcRSS: true}
					} else {
						trLp.Dirs[dir] = DirTree{Name: dir, Dirs: map[string]DirTree{}}
					}
				}

				// We "cannot assign" to map struct directly:
				// trLp.Dirs[dir].LastFound = art.Mod   // fails with "cannot assign"
				addressable := trLp.Dirs[dir]
				addressable.LastFound = art.Mod.Truncate(time.Minute)

				// We can rely on the *last* dir or html file being an endpoint.
				// We cannot tell for higher-level paths, unless they are explicitly linked somewhere.
				// The previous distinction between RSS URLs and crawl URLs has been dropped.
				if len(remDirs) < 1 {
					addressable.EndPoint = true
				}

				if dir == "/2015" || dir == "/08" || dir == "/09" {
					addressable.EndPoint = true
				}

				trLp.Dirs[dir] = addressable // write the modified copy back
				// Descend into the local copy; its Dirs map is shared with the
				// entry just stored, so deeper children still end up in the stored tree.
				trLp = &addressable

				if remainder == "" {
					// lg("break\n")
					break
				}

				lvl++
			}

		}
	}

}
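
The "cannot assign" comment above refers to a general Go restriction: fields of a struct value stored in a map are not addressable, so the value has to be copied out, modified, and written back. Below is a self-contained illustration of that pattern, using a hypothetical node type rather than the project's DirTree.

package main

import (
	"fmt"
	"time"
)

// node is a hypothetical stand-in for DirTree, used only to demonstrate
// the copy / modify / write-back pattern.
type node struct {
	Name      string
	LastFound time.Time
	EndPoint  bool
	Dirs      map[string]node
}

func main() {
	root := node{Name: "/", Dirs: map[string]node{}}
	root.Dirs["/2015"] = node{Name: "/2015", Dirs: map[string]node{}}

	// root.Dirs["/2015"].LastFound = time.Now() // compile error: cannot assign to struct field in map

	n := root.Dirs["/2015"] // copy the struct value out of the map
	n.LastFound = time.Now().Truncate(time.Minute)
	n.EndPoint = true
	root.Dirs["/2015"] = n // write the modified copy back

	fmt.Printf("%+v\n", root.Dirs["/2015"])
}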