Example #1
// handleTextFieldCharEncoding handles the character encoding for all article
// fields that were retrieved using doc.Text() in Search() and therefore
// contain no HTML meta characters.
func (a *Article) handleTextFieldCharEncoding() {
	// Escape the LaTeX meta characters first: html2tex.Unicode() inserts
	// many '\' characters, which would otherwise be escaped twice.
	html2tex.EscapeLatexMetaChars(&a.Title)
	//html2tex.EscapeLatexMetaChars(&a.Abstract)
	html2tex.EscapeLatexMetaChars(&a.Date)

	html2tex.Unicode(&a.Title)
	//html2tex.Unicode(&a.Abstract)
	html2tex.Unicode(&a.Date)

	for i := range a.Authors {
		html2tex.EscapeLatexMetaChars(&a.Authors[i])
		html2tex.Unicode(&a.Authors[i])
	}
}
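
The ordering comment above is the crux of this function, so here is a self-contained sketch of the hazard. The replacement tables below are illustrative assumptions, not html2tex's real ones; they only mimic the shape of EscapeLatexMetaChars and Unicode to show why swapping the two passes double-escapes the output.

package main

import (
	"fmt"
	"strings"
)

// escapeLatexMetaChars mimics, hypothetically, what
// html2tex.EscapeLatexMetaChars does: it escapes LaTeX meta
// characters, including any '\' already present in the string.
func escapeLatexMetaChars(s *string) {
	*s = strings.NewReplacer(
		`\`, `\textbackslash{}`,
		`&`, `\&`,
		`%`, `\%`,
	).Replace(*s)
}

// unicodeToLatex mimics, hypothetically, html2tex.Unicode: it maps
// non-ASCII runes to LaTeX commands, introducing fresh backslashes.
func unicodeToLatex(s *string) {
	*s = strings.NewReplacer(
		"ö", `\"o`,
		"é", `\'e`,
	).Replace(*s)
}

func main() {
	// Correct order, as in handleTextFieldCharEncoding: escape the
	// meta characters first, then convert Unicode.
	right := "Jörg & René"
	escapeLatexMetaChars(&right)
	unicodeToLatex(&right)
	fmt.Println(right) // J\"org \& Ren\'e

	// Reversed order: the backslashes emitted by the Unicode pass are
	// escaped a second time, corrupting the output.
	wrong := "Jörg & René"
	unicodeToLatex(&wrong)
	escapeLatexMetaChars(&wrong)
	fmt.Println(wrong) // J\textbackslash{}"org \& Ren\textbackslash{}'e
}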
Example #2
func extractArticle(url string) (Article, error) {
	doc, err := getDocument(url)
	if err != nil {
		return Article{}, err
	}

	removeHtmlCommentsFrom(doc)

	scraper, err := getScraper(url)
	if err != nil {
		return Article{}, err
	}

	journal := scraper.Journal
	title := strings.TrimSpace(Search(doc, scraper.Items["title"]))
	abstract := strings.TrimSpace(Search(doc, scraper.Items["abstract"]))
	authors := strings.TrimSpace(Search(doc, scraper.Items["authors"]))
	date := strings.TrimSpace(Search(doc, scraper.Items["date"]))

	// Always fetch the body last.
	htmlBody := strings.TrimSpace(Search(doc, scraper.Items["body"]))

	latexEscURL := url
	html2tex.EscapeLatexMetaChars(&latexEscURL)

	//htmlBody = html2Latex(htmlBody)
	authorsList := CreateAuthorList(&authors)
	date = sanatizeDate(date)
	title = sanatizeTitle(title)

	log.Println("Authors:", authorsList)
	log.Println("Date:", date)

	baseName := baseFileName(authorsList, journal, title)

	return Article{baseName, url, latexEscURL, journal, title, abstract, authorsList, date, htmlBody}, nil
}
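
For orientation, a hypothetical caller might look like this; the URL is made up, and Title, Date, and Authors are the only Article fields confirmed by Example #1.

package main

import "log"

func main() {
	// Hypothetical usage of extractArticle; the URL is illustrative.
	article, err := extractArticle("https://example.com/journal/some-paper")
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("extracted %q (%s) by %v", article.Title, article.Date, article.Authors)
}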