/****************** INDEX RELATED FUNCS ******************/ func fetchPage(payload *gabs.Container) error { //request the http url url := payload.Path("initial_input.url").Data().(string) payload.SetP(url, "return_value.url") //Example of forcing a halting condition and returning an error up to the bolt engine and api client //payload.SetP(boltsdk.HaltCallCommandName, "nextCommand") //payload.SetP("FAKE ERROR", "error") //return errors.New("FAKE ERROR") resp, err := http.Get(url) if err != nil { payload.SetP("Error getting url content: "+err.Error(), "error.fetchPage") return errors.New("Error getting url content: " + err.Error()) } defer resp.Body.Close() body, err := ioutil.ReadAll(resp.Body) if err != nil { payload.SetP("Error getting url content: "+err.Error(), "error.fetchPage") return errors.New("Error getting url content: " + err.Error()) } payload.SetP(string(body), "return_value.html") return nil }
func parseKeywords(payload *gabs.Container) error { //parse through html to extract unique 5+ letter a-zA-Z keywords html := "" if payload.Path("return_value.html").Data() != nil { html = payload.Path("return_value.html").Data().(string) } //super inefficient way to do this, a full app would parse the html tree //properly, treat certain elements differently, etc tmptokens := strings.Split(html, " ") //easy way to de-dupe keywords we'll be filling this map using the keys as the keyword //could also use it to up the # of occurences of the keyword to add weights keywords := make(map[string]int) for i := range tmptokens { //here we make sure to only include tokens that are alpha w/ no numerics or special chars matched, err := regexp.MatchString("^[a-zA-Z]+$", tmptokens[i]) if err == nil && len(tmptokens[i]) >= 5 && matched { keywords[strings.ToLower(tmptokens[i])] = 1 } } //fill a slice with the unique keywords kwarray := []string{} for kw := range keywords { kwarray = append(kwarray, kw) } //cheapo extract title of page without full html parse title := "(Unknown)" i := strings.Index(html, "<title>") e := strings.Index(html, "</title>") if i >= 0 && e >= 0 && i < e { title = html[i+7 : e] } //add to the payload payload.SetP(kwarray, "return_value.keywords") payload.SetP(title, "return_value.title") payload.SetP("", "return_value.html") //clear out the html, no need to send it to next workers return nil }
/****************** SEARCH RELATED FUNCS ******************/ func getBaseResults(payload *gabs.Container) error { //do some work/db queries, etc (in this case, 'query' the search index) stopAt := int(payload.Path("params.stopAt").Data().(float64)) results := doSearch(payload.Path("initial_input.searchtext").Data().(string), stopAt) _, err := payload.SetP(results, "return_value.results") return err }
func saveToIndex(payload *gabs.Container) error { //loop through keywords, adding the url to keyword maps //(in a real application, this would be some sort of real datastore, of course) kw := payload.Path("return_value.keywords").Data().([]interface{}) url := payload.Path("return_value.url").Data().(string) title := payload.Path("return_value.title").Data().(string) //fmt.Println(kw, url, title) sites[url] = &ResultInfo{Title: title, URL: url, Meta: ""} //with multiple worker threads running, this might cause locking and need a mutex, but for example purpose its ok. //we look through all keywords, and store the association to the url for keywordi := range kw { keyword := kw[keywordi].(string) urls, ok := keywords[keyword] if ok { skip := false for u := range urls.C { if url == urls.C[u] { //site already in this keyword assoc skip = true } } if !skip { urls.C = append(urls.C, url) } } else { keywords[keyword] = &URLCollection{C: []string{url}} } } return nil }