// dehtmlize converts the HTML held in the input buffer to plain text.
// On conversion failure, the returned buffer contains the error message instead.
func dehtmlize(in *bytes.Buffer) *bytes.Buffer {
	out, err := html2text.FromReader(in)
	if err != nil {
		ret := &bytes.Buffer{}
		ret.WriteString(err.Error())
		ret.WriteString("\n")
		return ret
	}
	return bytes.NewBufferString(out)
}
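// Usage sketch (hypothetical, not part of the original source): dehtmlize is
// self-contained, so it can be exercised with any HTML held in a bytes.Buffer.
// Assumes "bytes", "fmt", and the same html2text package referenced below
// ("github.com/jaytaylor/html2text") are imported.
func exampleDehtmlize() {
	in := bytes.NewBufferString("<h1>Hello</h1><p>This is <b>HTML</b>.</p>")
	out := dehtmlize(in)
	fmt.Println(out.String()) // plain-text rendering of the HTML input
}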
// CrawlEventURL extracts an event from the given URL using the AYLIEN Text
// Analysis API, then fetches the raw page to look for begin/end dates.
func CrawlEventURL(url string, lang string, baseURL string) (models.Event, error) {
	auth := textapi.Auth{constants.AYLIEN_ID, constants.AYLIEN_KEY}
	client, err := textapi.NewClient(auth, true)
	if err != nil {
		return models.Event{}, err
	}

	params := &textapi.ExtractParams{URL: utils.GetFullURL(url, baseURL), BestImage: true, Language: lang}
	article, err := client.Extract(params)
	if err != nil {
		return models.Event{}, err
	}

	eventObject := models.Event{
		URL:         url,
		Description: article.Article,
		Title:       article.Title,
		Image:       article.Image,
		Lang:        lang,
	}

	// Fetch the raw page so we can scan its plain text for dates.
	resp, err := http.Get(utils.GetFullURL(url, baseURL))
	if err != nil {
		return eventObject, err
	}
	defer resp.Body.Close()

	bodyStr, err := html2text.FromReader(resp.Body)
	if err != nil {
		fmt.Print(err)
		return eventObject, err
	}

	// Guard against articles shorter than 50 characters before slicing.
	prefix := article.Article
	if len(prefix) > 50 {
		prefix = prefix[:50]
	}

	dates := utils.RetrieveDatesFromString(bodyStr, lang, prefix)
	if len(dates) > 0 {
		eventObject.DateBegin = dates[0]
		if len(dates) > 1 {
			eventObject.DateEnd = dates[1]
		}
	}
	return eventObject, nil
}
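// Usage sketch (hypothetical caller, not from the original source): crawl a
// single event page and print what was extracted. Assumes valid AYLIEN
// credentials in the constants package; the URL values below are placeholders.
func exampleCrawlEvent() {
	event, err := CrawlEventURL("/events/some-event", "en", "http://example.com")
	if err != nil {
		fmt.Println("crawl failed:", err)
		return
	}
	fmt.Println(event.Title, event.DateBegin, event.DateEnd)
}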
// search takes a search term and a slice of URLs, fetches the
// page content for each URL, performs a search, and then returns
// a slice of results containing the result and any errors encountered.
func search(term string, urls []string) []result {
	// If no search term was provided, exit.
	if term == "" {
		log.Fatal("go-search", "No search term was provided. Expected arguments: '-search=searchTerm'.")
	} else {
		// Lowercase the search term so our comparisons will be case-insensitive.
		term = strings.ToLower(term)
	}

	// Create a chan of strings to send work to be processed (urls).
	// Create a chan of type result to send results.
	// Set up a WaitGroup so we can track when all goroutines have finished processing.
	ch := make(chan string)
	done := make(chan result)
	var wg sync.WaitGroup

	// Create a single http Client with an 8 second timeout.
	// From the docs: "Clients should be reused instead of created as
	// needed. Clients are safe for concurrent use by multiple goroutines."
	client := &http.Client{
		Timeout: 8 * time.Second,
	}

	// If there are fewer than 20 urls in urls.txt, decrease maxReqs to
	// the number of urls to avoid spinning up unnecessary goroutines.
	if maxReqs > len(urls) {
		maxReqs = len(urls)
	}

	log.Info("go-search", "Fetching and searching urls...")
	log.Info("go-search", "Go ahead, queue up your favorite jam: this will take ~30 seconds")

	// Spin up 'maxReqs' number of goroutines.
	wg.Add(maxReqs)
	for i := 0; i < maxReqs; i++ {
		go func() {
			for {
				// Receive work from the chan of strings (urls).
				site, ok := <-ch
				if !ok {
					// If the channel is closed, there is no more work to be done and we can return.
					wg.Done()
					return
				}

				// Provide some visual feedback to the user for each url processed.
				v := flag.Lookup("verbose")
				if v.Value.String() == "false" {
					fmt.Print(".")
				}

				// Fetch the page content.
				response, err := client.Get("http://" + site)
				if err != nil {
					// If there are errors, try again with the 'www' host prefix.
					log.Debug("go-search", fmt.Sprintf("Initial request failed for %s, attempting 'www' prefix.", site), "error", err)
					response, err = client.Get("http://www." + site)
				}

				// If there are still errors, return the error message and continue looping.
				if err != nil {
					log.Debug("go-search", fmt.Sprintf("Both requests failed for %s, returning an error.", site), "error", err)
					done <- result{site, false, err}
					continue
				}

				// Extract the human-readable text from the response.
				// Note that FromReader uses html.Parse under the hood,
				// which reads to EOF in the same manner as ioutil.ReadAll.
				// https://github.com/jaytaylor/html2text/blob/master/html2text.go#L167
				text, err := html2text.FromReader(response.Body)
				response.Body.Close()
				if err != nil {
					done <- result{site, false, err}
					continue
				}

				// Search for the search term in the page text and return the final result.
				found := strings.Contains(strings.ToLower(text), term)
				done <- result{site, found, nil}
			}
		}()
	}

	// Send work to be processed as goroutines become available.
	go func() {
		for _, site := range urls {
			log.Debug("go-search", fmt.Sprintf("Sending work: %s", site))
			ch <- site
		}
	}()

	// Receive the results on the done chan.
	results := []result{}
	for i := 0; i < len(urls); i++ {
		result := <-done
		log.Debug("go-search", fmt.Sprintf("Receiving result: %s", result.site))
		results = append(results, result)
	}

	// Close the channel as a signal to the goroutines that no additional work needs to be processed.
	close(ch)

	// Wait for the goroutines to be done processing.
	wg.Wait()

	fmt.Print("Done!\n")
	return results
}
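// Context sketch (assumptions inferred from the function body, not from the
// original source): search exchanges values of an unexported result type and
// caps concurrency with a package-level maxReqs. Minimal definitions that
// would let the function above compile look roughly like this; the "site"
// field name appears in the function, the other field names and the maxReqs
// value of 20 are guesses.
type result struct {
	site  string
	found bool
	err   error
}

var maxReqs = 20

// Hypothetical caller that prints one line per searched site.
func exampleSearch() {
	urls := []string{"golang.org", "example.com"}
	for _, r := range search("gopher", urls) {
		if r.err != nil {
			fmt.Printf("%s: error: %v\n", r.site, r.err)
			continue
		}
		fmt.Printf("%s: found=%v\n", r.site, r.found)
	}
}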