func (r Repository) SaveAllArticles(articles []*entity.Article) error { tx := r.DB.Begin() for i, a := range articles { article := entity.Article{ID: a.ID} tx.Preload("Images").Preload("Crawl").FirstOrCreate(&article) article.Ordering = len(articles) - 1 - i article.Title = a.Title article.URL = a.URL article.Preview = a.Preview author := entity.Author{} tx.First(&author, "name = ?", strings.TrimSpace(a.Author.Name)) if author.ID == 0 { r.Logger.Log("msg", "Can't find author for article ", "author", a.Author.Name, "article", a.URL) continue } article.AuthorID = author.ID for _, i := range a.Images { article.AddImage(i) } if article.Crawl.ID == 0 { article.Crawl = entity.Crawl{Next: time.Now()} } tx.Save(&article) } tx.Commit() return nil }
func (scraper Scraper) scrapeArticle(a *entity.Article) error { log.Println(scraper.host + a.URL) doc, err := goquery.NewDocument(scraper.host + a.URL) if err != nil { return err } articleNode := doc.Find("main article.article") contentNode := articleNode.Find(".article-content") if articleNode.Length() == 0 { log.Printf("article %s has no content", a.URL) } contentHTML, err := contentNode.Html() if err != nil { return err } a.Headline = strings.TrimSpace(articleNode.Find(".article--title").Text()) a.Excerpt = strings.TrimSpace(contentNode.Find(".article--teaser").Text()) a.Content = strings.TrimSpace(contentHTML) authorNode := articleNode.Find(".author .author--link") authorURL, _ := authorNode.Attr("href") authorName := strings.TrimSpace(authorNode.Text()) idMatches := idRegex.FindStringSubmatch(authorURL) if len(idMatches) != 2 { log.Printf("couldn't parse id for author %s\n", authorURL) } // ID authorID, err := strconv.Atoi(idMatches[1]) if err != nil { log.Printf("couldn't parse id for author %s\n", authorURL) } author := entity.Author{ ID: authorID, Name: authorName, URL: authorURL, } scraper.db.Preload("Images").Preload("Crawl").FirstOrCreate(&author) if author.Crawl.ID == 0 { author.Crawl = entity.Crawl{Next: time.Now()} } scraper.db.Save(&author) a.Crawl.Next = time.Now().Add(time.Duration(float64(rand.Intn(18000))+30*time.Minute.Seconds()) * time.Second) a.AuthorID = author.ID scraper.db.Save(&a) return nil }