Example #1
0
func UrlParser(sf types.SetupFunction) {
	in, out := sf.SetName("UrlParser").AsFilter("UrlProducer").Build()
	defer close(out)

	for urlBM := range in {
		if sf.Cancelled() {
			continue
		}
		urlStr := ToString(urlBM)
		if path.Ext(urlStr) == "" && urlStr[len(urlStr)-1] != '/' {
			urlStr += "/"
		}
		url, err := urlParser.Parse(urlStr)
		if err != nil {
			continue
		}

		body := Download(url.String())
		for link := range fetchLinks(body) {
			joinedLink, err := url.Parse(link)
			if err != nil {
				continue
			}

			out <- NewStringMarshaler(joinedLink.String())
		}
	}
}
Example #2
0
func MimeDetector(sf types.SetupFunction) {
	in, out := sf.SetName("MimeDetector").AsFilter("UrlParser").Build()
	defer close(out)

	buffer := make([]byte, 512)

	for urlBM := range in {
		if sf.Cancelled() {
			continue
		}
		url := ToString(urlBM)
		body := Download(url)
		if body == nil {
			continue
		}
		func(body io.ReadCloser) {
			defer body.Close()
			n, err := body.Read(buffer)

			if err == nil || err == io.EOF {
				encoded := http.DetectContentType(buffer[:n]) + "->" + url
				out <- NewStringMarshaler(encoded)
			}
		}(body)
	}
}
Example #3
0
func MimeSplitterHtml(sf types.SetupFunction) {
	in, out := sf.SetName("MimeSplitterHtml").AsFilter("MimeDetector").Build()
	defer close(out)
	for urlBM := range in {
		if sf.Cancelled() {
			continue
		}
		url := ToString(urlBM)
		mime, u := decodeMimeUrl(url)
		if strings.Contains(mime, "html") {
			out <- NewStringMarshaler(u)
		}
	}
}
Example #4
0
func UrlProducer(sf types.SetupFunction, parent string) {
	in, out := sf.SetName("UrlProducer").AsFilter(parent).Build()
	defer close(out)

	visitedUrls := make(map[string]interface{})

	for urlBm := range in {
		if sf.Cancelled() {
			continue
		}
		url := ToString(urlBm)
		if _, visited := visitedUrls[url]; !visited {
			visitedUrls[url] = nil
			out <- NewStringMarshaler(url)
		}
	}
}