// handleCharEncForTextFields handles the character encoding for all article // fields that where retrieved using doc.Text() in Search() and so contain // no meta html chars. func (a *Article) handleTextFieldCharEncoding() { // first escape Latex Meta chars because in html2tex.Unicode() a lot // of '\' occur, which would be escaped twice html2tex.EscapeLatexMetaChars(&a.Title) //html2tex.EscapeLatexMetaChars(&a.Abstract) html2tex.EscapeLatexMetaChars(&a.Date) html2tex.Unicode(&a.Title) //html2tex.Unicode(&a.Abstract) html2tex.Unicode(&a.Date) for i, _ := range a.Authors { html2tex.EscapeLatexMetaChars(&a.Authors[i]) html2tex.Unicode(&a.Authors[i]) } }
func extractArticle(url string) (Article, error) { doc, err := getDocument(url) if err != nil { return Article{}, err } removeHtmlCommentsFrom(doc) var authors, journal, title, abstract, htmlBody, date string var authorsList []string scraper, err := getScraper(url) if err != nil { return Article{}, err } journal = scraper.Journal title = strings.TrimSpace(Search(doc, scraper.Items["title"])) abstract = strings.TrimSpace(Search(doc, scraper.Items["abstract"])) authors = strings.TrimSpace(Search(doc, scraper.Items["authors"])) date = strings.TrimSpace(Search(doc, scraper.Items["date"])) // Always fetch body last htmlBody = strings.TrimSpace(Search(doc, scraper.Items["body"])) LatexEscUrl := url html2tex.EscapeLatexMetaChars(&LatexEscUrl) //htmlBody = html2Latex(htmlBody) authorsList = CreateAuthorList(&authors) date = sanatizeDate(date) title = sanatizeTitle(title) log.Println("Authors:", authorsList) log.Println("Date:", date) baseName := baseFileName(authorsList, journal, title) return Article{baseName, url, LatexEscUrl, journal, title, abstract, authorsList, date, htmlBody}, err }