func sanatizeCaptions(caption string) string { // captions may contain spurious <p> and \n's, which lead // to pdflatex fails because of line breaks. Hence, we remove them. caption = strings.Replace(caption, "\n", " ", -1) // captions may contain spurious <p> and \n's, which lead // to pdflatex fails because of line breaks. Hence, we remove them. caption = strdel.RegExp(caption, `Figure \d+: `) caption = strdel.RegExp(caption, `Fig. \d+: `) //Debug("total : |%q|\n", total) return caption }
func sanatizeExtractedAuthor(a string) string { a = removeDateFromAuthor(a) // remove by-words a = strdel.RegExp(a, "[Bb]y") a = strdel.Word(a, "on") a = strdel.Word(a, "at") a = strdel.RegExp(a, "[Pp]osted") a = strdel.RegExp(a, "[Ee]ditor") a = strdel.RegExp(a, "Guest Contributor") // remove Twitter a = strdel.RegExp(a, `Follow|@[[:alpha:]]+|([a-zA-Z]+)Twitter`) // Remove punctation a = strings.Replace(a, ".", "", -1) return strings.TrimSpace(a) }
// removeNonRegChars deletes any number of non-regular chars, except for _- // from string s func removeNonRegChars(s string) string { return strdel.RegExp(s, "[^A-Za-z0-9_\\- ]+") }