Пример #1
0
// sanatizeCaptions prepares a figure caption for LaTeX output. It replaces
// every newline with a space and strips figure labels of the form
// "Figure N: " and "Fig. N: " from the caption.
func sanatizeCaptions(caption string) string {
	// Captions may contain spurious line breaks, which make pdflatex fail.
	// Replace each newline with a single space.
	caption = strings.Replace(caption, "\n", " ", -1)

	// Strip figure labels such as "Figure 3: ".
	// NOTE(review): assumes strdel.RegExp deletes every match of the given
	// pattern from the string — confirm against the strdel package.
	caption = strdel.RegExp(caption, `Figure \d+: `)

	// Same for the abbreviated label "Fig. 3: ". The dot is escaped so that
	// only a literal "Fig." is matched; an unescaped dot would match any
	// character (e.g. "Figs 3: ").
	caption = strdel.RegExp(caption, `Fig\. \d+: `)

	return caption
}
Пример #2
0
// sanatizeExtractedAuthor cleans up an extracted author string. It first
// passes the string through removeDateFromAuthor, then removes by-line filler
// words, Twitter references and dots, and finally trims surrounding
// whitespace.
func sanatizeExtractedAuthor(a string) string {

	a = removeDateFromAuthor(a)

	// Remove by-words. "by" is matched only as a whole word so that names
	// containing these letters (e.g. "Abby", "Byron") are left intact, in
	// line with the whole-word removal used for "on" and "at" below.
	// NOTE(review): assumes strdel.RegExp accepts Go regexp syntax (\b is an
	// ASCII word boundary) — confirm against the strdel package.
	a = strdel.RegExp(a, `\b[Bb]y\b`)
	a = strdel.Word(a, "on")
	a = strdel.Word(a, "at")
	a = strdel.RegExp(a, "[Pp]osted")
	a = strdel.RegExp(a, "[Ee]ditor")
	a = strdel.RegExp(a, "Guest Contributor")

	// Remove Twitter references: the word "Follow", @handles, and any run of
	// letters immediately followed by "Twitter".
	a = strdel.RegExp(a, `Follow|@[[:alpha:]]+|([a-zA-Z]+)Twitter`)

	// Remove punctuation (only dots are stripped).
	a = strings.Replace(a, ".", "", -1)

	return strings.TrimSpace(a)
}
Пример #3
0
// removeNonRegChars strips runs of "non-regular" characters from s. The
// pattern treats ASCII letters, digits, underscore, hyphen and space as
// regular; every run of one or more other characters is handed to
// strdel.RegExp for removal.
func removeNonRegChars(s string) string {
	// Negated character class: anything that is not A-Z, a-z, 0-9, '_', '-'
	// or a space.
	const nonRegularRun = "[^A-Za-z0-9_\\- ]+"

	cleaned := strdel.RegExp(s, nonRegularRun)
	return cleaned
}