// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher crawler.Fetcher, visitUrl func(string, string)) {
	// TODO: Fetch URLs in parallel.
	// TODO: Don't fetch the same URL twice.
	// This implementation doesn't do either:
	if depth <= 0 {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	visitUrl(url, body)
	for _, u := range urls {
		Crawl(u, depth-1, fetcher, visitUrl)
	}
}
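For context, the serial version above takes its Fetcher from a crawler package that is not shown in this section. Based on how fetcher.Fetch is called (returning a body, the child URLs, and an error), the interface presumably mirrors the one from the Go tour's web-crawler exercise. The sketch below is an assumption, not the package's actual definition.

// Assumed shape of crawler.Fetcher (hypothetical sketch; the real package
// may define it differently).
package crawler

// Fetcher fetches a page and reports the URLs found on it.
type Fetcher interface {
	// Fetch returns the body of url and a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}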
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher crawler.Fetcher, visitUrl func(string, string)) {
	if depth <= 0 {
		fmt.Printf("<- Done with %v, depth 0.\n", url)
		return
	}

	fetched.Lock()
	if _, ok := fetched.m[url]; ok {
		fetched.Unlock()
		fmt.Printf("<- Done with %v, already fetched.\n", url)
		return
	}
	// Mark the URL as in flight so other goroutines don't fetch it again
	// while this fetch is still running.
	fetched.m[url] = loading
	fetched.Unlock()

	// Fetch the page without holding the lock, so other crawls can proceed.
	body, urls, err := fetcher.Fetch(url)

	// Record the result under the lock.
	fetched.Lock()
	fetched.m[url] = err
	fetched.Unlock()

	if err != nil {
		fmt.Printf("<- Error on %v: %v\n", url, err)
		return
	}
	visitUrl(url, body)

	// Crawl the children concurrently and wait for all of them to finish.
	done := make(chan bool)
	for i, u := range urls {
		fmt.Printf("-> Crawling child %v/%v of %v : %v.\n", i, len(urls), url, u)
		go func(url string) {
			Crawl(url, depth-1, fetcher, visitUrl)
			done <- true
		}(u)
	}
	for i, u := range urls {
		fmt.Printf("<- [%v] %v/%v Waiting for child %v.\n", url, i, len(urls), u)
		<-done
	}
	fmt.Printf("<- Done with %v\n", url)
}
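The concurrent version relies on two package-level identifiers, fetched and loading, whose declarations are not part of this section. A minimal sketch of what they might look like, assuming a mutex-guarded map from URL to fetch result and a sentinel error marking in-flight fetches (requires the standard errors and sync packages):

// Minimal sketch of the assumed shared state (not the repository's actual
// declarations): a mutex-guarded map recording each URL's fetch result.
var fetched = struct {
	sync.Mutex
	m map[string]error
}{m: make(map[string]error)}

// loading is a sentinel stored in fetched.m while a URL is still being
// fetched, so concurrent callers skip it instead of fetching it twice.
var loading = errors.New("url load in progress")

Storing the error as the map value does double duty: it deduplicates URLs and remembers why a fetch failed, while the loading sentinel keeps two goroutines from racing to fetch the same page.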