Example #1
// dehtmlize converts the HTML held in the input buffer to plain text.
// On conversion failure it returns a buffer containing the error message
// instead, so callers always get something printable back.
func dehtmlize(in *bytes.Buffer) *bytes.Buffer {
	out, err := html2text.FromReader(in)
	if err != nil {
		ret := &bytes.Buffer{}
		ret.WriteString(err.Error())
		ret.WriteString("\n")
		// A disabled fallback once echoed the original input here:
		//	ret.Write(in.Bytes())
		return ret
	}
	return bytes.NewBufferString(out)
}
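A hypothetical call site for the example above, assuming dehtmlize and its jaytaylor/html2text import live in the same package; the HTML fragment is invented for illustration.

package main

import (
	"bytes"
	"fmt"
)

func main() {
	in := bytes.NewBufferString("<p>Hello, <b>world</b>!</p>")
	// Prints the plain-text rendering on success, the error text otherwise.
	fmt.Println(dehtmlize(in).String())
}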
Example #2
// CrawlEventURL extracts the article at url (resolved against baseURL)
// with the AYLIEN Text Analysis API, then fetches the raw page and scans
// its plain text for event dates.
func CrawlEventURL(url string, lang string, baseURL string) (models.Event, error) {

	auth := textapi.Auth{constants.AYLIEN_ID, constants.AYLIEN_KEY}
	client, err := textapi.NewClient(auth, true)
	if err != nil {
		// Return the error instead of panicking; the signature already
		// carries an error result.
		return models.Event{}, err
	}

	params := &textapi.ExtractParams{URL: utils.GetFullURL(url, baseURL), BestImage: true, Language: lang}
	article, err := client.Extract(params)

	if err != nil {
		return models.Event{}, err
	}

	eventObject := models.Event{
		URL:         url,
		Description: article.Article,
		Title:       article.Title,
		Image:       article.Image,
		Lang:        lang,
	}

	resp, err := http.Get(utils.GetFullURL(url, baseURL))

	if err != nil {
		return eventObject, err
	}
	defer resp.Body.Close()

	bodyStr, err := html2text.FromReader(resp.Body)
	if err != nil {
		return eventObject, err
	}

	// article.Article may be shorter than 50 bytes, in which case a blind
	// slice would panic, so clamp the hint passed to the date parser.
	hint := article.Article
	if len(hint) > 50 {
		hint = hint[:50]
	}
	dates := utils.RetrieveDatesFromString(bodyStr, lang, hint)

	if len(dates) > 0 {
		eventObject.DateBegin = dates[0]
		if len(dates) > 1 {
			eventObject.DateEnd = dates[1]
		}
	}

	return eventObject, nil
}
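A sketch of a caller, assuming the AYLIEN credentials in constants are configured; the URL, language, and base URL values here are invented.

event, err := CrawlEventURL("/events/spring-gala", "en", "https://example.org")
if err != nil {
	log.Fatal(err)
}
fmt.Printf("%s (%v - %v)\n", event.Title, event.DateBegin, event.DateEnd)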
Example #3
// search takes a search term and a slice of URLs, fetches the
// page content for each URL, performs a search, and then returns
// a slice of results containing the result and any errors encountered.
func search(term string, urls []string) []result {

	// If no search term was provided, exit.
	if term == "" {
		log.Fatal("go-search", "No search term was provided. Expected arguments: '-search=searchTerm'.")
	}

	// Lowercase the search term so our comparisons will be case-insensitive.
	term = strings.ToLower(term)

	// Create a chan of strings to send work to be processed (urls).
	// Create a chan of type result to send results.
	// Set up a WaitGroup so we can track when all goroutines have finished processing.
	ch := make(chan string)
	done := make(chan result)
	var wg sync.WaitGroup

	// Create a single http Client with an 8 second timeout.
	// From the docs: "Clients should be reused instead of created as
	// needed. Clients are safe for concurrent use by multiple goroutines."
	client := &http.Client{
		Timeout: 8 * time.Second,
	}

	// If there are fewer urls than maxReqs, decrease maxReqs to the
	// number of urls to avoid spinning up idle goroutines.
	if maxReqs > len(urls) {
		maxReqs = len(urls)
	}

	log.Info("go-search", "Fetching and searching urls...")
	log.Info("go-search", "Go ahead, queue up your favorite jam: this will take ~30 seconds")

	// Spin up 'maxReqs' number of goroutines.
	wg.Add(maxReqs)
	for i := 0; i < maxReqs; i++ {
		go func() {
			for {
				// Receive work from the chan of strings (urls).
				site, ok := <-ch
				if !ok {
					// If the channel is closed, there is no more work to be done and we can return.
					wg.Done()
					return
				}

				// In non-verbose mode, print a dot per url as progress
				// feedback; verbose mode logs the details via log.Debug instead.
				if v := flag.Lookup("verbose"); v != nil && v.Value.String() == "false" {
					fmt.Print(".")
				}

				// Fetch the page content.
				response, err := client.Get("http://" + site)
				if err != nil {
					// If there are errors, try again with the 'www' host prefix.
					log.Debug("go-search", fmt.Sprintf("Initial request failed for %s, attempting 'www' prefix.", site), "error", err)

					response, err = client.Get("http://www." + site)
				}

				// If there are still errors, send a result carrying the
				// error and move on to the next url.
				if err != nil {
					log.Debug("go-search", fmt.Sprintf("Both requests failed for %s, returning an error.", site), "error", err)

					done <- result{site, false, err}
					continue
				}

				// Extract the human-readable text from the response.
				// Note that FromReader uses html.Parse under the hood,
				// which reads to EOF in the same manner as ioutil.ReadAll.
				// https://github.com/jaytaylor/html2text/blob/master/html2text.go#L167
				text, err := html2text.FromReader(response.Body)
				response.Body.Close()
				if err != nil {
					done <- result{site, false, err}
					continue
				}

				// Search for the search term in the page text and return the final result.
				found := strings.Contains(strings.ToLower(text), term)
				done <- result{site, found, nil}
			}
		}()
	}

	// Send work to be processed as goroutines become available.
	go func() {
		for _, site := range urls {
			log.Debug("go-search", fmt.Sprintf("Sending work: %s", site))
			ch <- site
		}
	}()

	// Receive one result per url on the done chan.
	results := []result{}
	for i := 0; i < len(urls); i++ {
		res := <-done
		log.Debug("go-search", fmt.Sprintf("Receiving result: %s", res.site))
		results = append(results, res)
	}

	// Close the channel to signal the goroutines that no more work is coming.
	close(ch)

	// Wait for the goroutines to be done processing.
	wg.Wait()

	fmt.Print("Done!\n")
	return results
}
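The example references a few names it does not declare: result, maxReqs, and a structured log package. A plausible minimal sketch of the missing declarations, assuming the default request cap of 20 implied by the sizing comment above (the log calls would map onto whatever structured logger the project actually uses):

// result pairs a site with the search outcome and any error encountered.
type result struct {
	site  string
	found bool
	err   error
}

// maxReqs caps the number of concurrent fetch goroutines.
var maxReqs = 20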