// Convert returns a latex text body. It takes the html source and the url // of the source to create a goquery document and work with that. // Additionally, a codeLanguage to use as highlight for code blocks can be // provided. A codeLanguage can be an empty string. // When excludeFigures is true, figures are not included in the latex // source. func Convert(htmlStr string, articleUrl string, codeLang string, excludeFigures bool) string { doc := generateDoc(htmlStr, articleUrl) escapeDocLatexMetaChars(doc) convertDocUniCode(doc) unwrapDrop(doc) // wrap all html environments by corresponding latex environments. convertImages(doc, excludeFigures) convertVideos(doc) convertCode(doc, codeLang) // code blocks include latex metachars {,} convertFootnotes(doc) convertLinks(doc) convertHeading(doc) convertList(doc) convertQuotations(doc) // inside tex macros transform <br> linebreaks with \\ so they are handled // gracefully by the text engine wrapElementsAndDeleteLinebreak(doc, "em", "\\emph{", "}") wrapElementsAndDeleteLinebreak(doc, "i", "\\textit{", "}") wrapElementsAndDeleteLinebreak(doc, "b", "\\textbf{", "}") wrapElementsAndDeleteLinebreak(doc, "strong", "\\textbf{", "}") wrapElementsAndDeleteLinebreak(doc, "u", "\\underline{", "}") wrapElementsAndDeleteLinebreak(doc, "sup", "\\textsuperscript{", "}") wrapElementsAndDeleteLinebreak(doc, "sub", "\\textsubscript{", "}") wrapElementsAndDeleteLinebreak(doc, "strike", "\\sout{", "}") wrapElementsAndDeleteLinebreak(doc, "span[style=\"text-decoration: line-through\"]", "\\sout{", "}") // outside tex macros use double line break to separate paragraphs etc wrapElementsAndKeepLinebreak(doc, "p", "\n\n", "") wrapElementsAndKeepLinebreak(doc, "br", "", "\n\n") // When .Text() is called, the latex code survives. t := doc.Text() t = convertInlineMath(t) t = strtrans.LinebreaksToTwoLinebreaks(t) t = strdel.EmptyBrackets(t) t = deleteSpaceBeforeClosingBrackets(t) //Unicode(&t) return t /* sample := " talking to Amazon’s Web services" fmt.Printf("%+q\n", sample) h, _ := doc.Html() fmt.Println(t) */ }
func (a Article) Text() string { // // Titel -- Subtitle // ================= // By Authors // // Abstract // // --------------------- // Content // --------------------- // // From Journal on Date // Source: Url // Title title := a.Title title += "\n" title += strings.Repeat("=", len(title)-1) title += "\n" // Authors authors := "By " for i, auth := range a.Authors { authors += auth if i == len(a.Authors)-1 { authors += "\n" break } authors += " and " } authors += "\n\n" // Abstract var abstract string if a.Abstract != "" { abstract = a.Abstract + "\n\n" } // Content tmp := "From " + a.Journal + " on " + a.Date content := strings.Repeat("-", len(tmp)) + "\n" content += a.PlainTextContent() + "\n\n" content += strings.Repeat("-", len(tmp)) + "\n\n" // Reference ref := "From " + a.Journal + " on " + a.Date + "\n" ref += "Source: " + a.Url return strtrans.LinebreaksToTwoLinebreaks(title + authors + abstract + content + ref) }