func UrlParser(sf types.SetupFunction) { in, out := sf.SetName("UrlParser").AsFilter("UrlProducer").Build() defer close(out) for urlBM := range in { if sf.Cancelled() { continue } urlStr := ToString(urlBM) if path.Ext(urlStr) == "" && urlStr[len(urlStr)-1] != '/' { urlStr += "/" } url, err := urlParser.Parse(urlStr) if err != nil { continue } body := Download(url.String()) for link := range fetchLinks(body) { joinedLink, err := url.Parse(link) if err != nil { continue } out <- NewStringMarshaler(joinedLink.String()) } } }
func MimeDetector(sf types.SetupFunction) { in, out := sf.SetName("MimeDetector").AsFilter("UrlParser").Build() defer close(out) buffer := make([]byte, 512) for urlBM := range in { if sf.Cancelled() { continue } url := ToString(urlBM) body := Download(url) if body == nil { continue } func(body io.ReadCloser) { defer body.Close() n, err := body.Read(buffer) if err == nil || err == io.EOF { encoded := http.DetectContentType(buffer[:n]) + "->" + url out <- NewStringMarshaler(encoded) } }(body) } }
func MimeSplitterHtml(sf types.SetupFunction) { in, out := sf.SetName("MimeSplitterHtml").AsFilter("MimeDetector").Build() defer close(out) for urlBM := range in { if sf.Cancelled() { continue } url := ToString(urlBM) mime, u := decodeMimeUrl(url) if strings.Contains(mime, "html") { out <- NewStringMarshaler(u) } } }
func UrlProducer(sf types.SetupFunction, parent string) { in, out := sf.SetName("UrlProducer").AsFilter(parent).Build() defer close(out) visitedUrls := make(map[string]interface{}) for urlBm := range in { if sf.Cancelled() { continue } url := ToString(urlBm) if _, visited := visitedUrls[url]; !visited { visitedUrls[url] = nil out <- NewStringMarshaler(url) } } }