func MimeDetector(sf types.SetupFunction) { in, out := sf.SetName("MimeDetector").AsFilter("UrlParser").Build() defer close(out) buffer := make([]byte, 512) for urlBM := range in { if sf.Cancelled() { continue } url := ToString(urlBM) body := Download(url) if body == nil { continue } func(body io.ReadCloser) { defer body.Close() n, err := body.Read(buffer) if err == nil || err == io.EOF { encoded := http.DetectContentType(buffer[:n]) + "->" + url out <- NewStringMarshaler(encoded) } }(body) } }
func UrlParser(sf types.SetupFunction) { in, out := sf.SetName("UrlParser").AsFilter("UrlProducer").Build() defer close(out) for urlBM := range in { if sf.Cancelled() { continue } urlStr := ToString(urlBM) if path.Ext(urlStr) == "" && urlStr[len(urlStr)-1] != '/' { urlStr += "/" } url, err := urlParser.Parse(urlStr) if err != nil { continue } body := Download(url.String()) for link := range fetchLinks(body) { joinedLink, err := url.Parse(link) if err != nil { continue } out <- NewStringMarshaler(joinedLink.String()) } } }
func pathProducer(sf types.SetupFunction, argv []string) { out := sf.SetName("PathProducer").AsProducer().Build() defer close(out) for _, path := range argv { out <- NewStringMarshaler(path) } }
func PathValidator(sf types.SetupFunction, parent string) { in, out := sf.SetName("PathValidator").AsFilter(parent).Build() defer close(out) for path := range in { if _, err := os.Stat(ToString(path)); err == nil { out <- path } } }
func urlLooper(sf types.SetupFunction, feeder chan<- interface{}, done types.AtomicBool) { in := sf.AsConsumer("MimeSplitterHtml").Build() for url := range in { if !done.Get() { u := ToString(url) feeder <- u } } }
func urlFeeder(sf types.SetupFunction, feeder <-chan interface{}, done types.AtomicBool) { out := sf.SetName("UrlFeeder").AsProducer().Build() defer close(out) for url := range feeder { if !done.Get() { out <- NewStringMarshaler(url.(string)) } } }
func wordPrinter(sf types.SetupFunction, done chan struct{}) { defer close(done) in := sf.AsConsumer("github.com/apoydence/hydra/examples/wordCount.FinalWordCounter").Build() for wordMap := range in { for k, v := range ToMap(wordMap) { println(k, v) } } }
func WordCounter(sf types.SetupFunction) { in, out := sf.AsFilter("github.com/apoydence/hydra/examples/wordCount.SymbolRemover").Build() defer close(out) m := make(map[string]uint32) for word := range in { incMap(ToString(word), 1, m) } out <- NewWordCountMarshaler(m) }
func FinalWordCounter(sf types.SetupFunction) { in, out := sf.AsFilter("github.com/apoydence/hydra/examples/wordCount.WordCounter").Build() defer close(out) m := make(map[string]uint32) for wordMap := range in { for k, v := range ToMap(wordMap) { incMap(k, v, m) } } out <- NewWordCountMarshaler(m) }
func MimeSplitterHtml(sf types.SetupFunction) { in, out := sf.SetName("MimeSplitterHtml").AsFilter("MimeDetector").Build() defer close(out) for urlBM := range in { if sf.Cancelled() { continue } url := ToString(urlBM) mime, u := decodeMimeUrl(url) if strings.Contains(mime, "html") { out <- NewStringMarshaler(u) } } }
func SymbolRemover(sf types.SetupFunction) { in, out := sf.AsFilter("github.com/apoydence/hydra/examples/wordCount.WordExtractor").Build() defer close(out) for word := range in { str := make([]byte, 0) bytes := []byte(strings.ToLower(ToString(word))) for _, x := range bytes { if (x >= 0x30 && x <= 0x39) || (x >= 0x61 && x <= 0x7a) { str = append(str, x) } } if len(str) > 0 { out <- NewStringMarshaler(string(str)) } } }
func WordExtractor(sf types.SetupFunction) { in, out := sf.AsFilter("PathValidator").Build() defer close(out) for path := range in { p := ToString(path) f, err := os.Open(p) if err == nil { scanner := bufio.NewScanner(f) scanner.Split(bufio.ScanWords) for scanner.Scan() { out <- NewStringMarshaler(scanner.Text()) } } } }
func UrlProducer(sf types.SetupFunction, parent string) { in, out := sf.SetName("UrlProducer").AsFilter(parent).Build() defer close(out) visitedUrls := make(map[string]interface{}) for urlBm := range in { if sf.Cancelled() { continue } url := ToString(urlBm) if _, visited := visitedUrls[url]; !visited { visitedUrls[url] = nil out <- NewStringMarshaler(url) } } }
func textDownloader(sf types.SetupFunction, download string, mb int, done chan struct{}, closer func()) { in := sf.AsConsumer("MimeSplitterText").Build() var totalSize int64 = 0 for url := range in { if totalSize >= int64(mb)*1024*1024 { closer() continue } u := ToString(url) println("download", u, path.Base(u)) totalSize += saveToFile(Download(u), path.Join(download, path.Base(u))) println("Size", totalSize) } close(done) }