Ejemplo n.º 1
0
func (doc *Document) Authors() (authors []finc.Author) {
	for _, ra := range doc.Author {
		authors = append(authors, finc.Author{
			FirstName: AuthorReplacer.Replace(span.UnescapeTrim(ra.Given)),
			LastName:  AuthorReplacer.Replace(span.UnescapeTrim(ra.Family)),
		})
	}
	return authors
}
Ejemplo n.º 2
0
// CombinedTitle returns a longish title.
func (doc *Document) CombinedTitle() string {
	if len(doc.Title) > 0 {
		if len(doc.Subtitle) > 0 {
			return span.UnescapeTrim(fmt.Sprintf("%s : %s", strings.Join(doc.Title, " "), strings.Join(doc.Subtitle, " ")))
		}
		return span.UnescapeTrim(strings.Join(doc.Title, " "))
	}
	if len(doc.Subtitle) > 0 {
		return span.UnescapeTrim(strings.Join(doc.Subtitle, " "))
	}
	return ""
}
Ejemplo n.º 3
0
// ToIntermediateSchema converts a crossref document into IS.
func (doc *Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	var err error
	output := finc.NewIntermediateSchema()

	output.Date, err = doc.Issued.Date()

	if err != nil {
		return output, err
	}

	output.RawDate = output.Date.Format("2006-01-02")

	if doc.URL == "" {
		return output, errNoURL
	}

	output.RecordID = doc.RecordID()
	if len(output.RecordID) > span.KeyLengthLimit {
		return output, span.Skip{Reason: fmt.Sprintf("ID_TOO_LONG %s", output.RecordID)}
	}

	if output.Date.After(Future) {
		return output, span.Skip{Reason: fmt.Sprintf("TOO_FUTURISTIC %s", output.RecordID)}
	}

	if doc.Type == "journal-issue" {
		return output, span.Skip{Reason: fmt.Sprintf("JOURNAL_ISSUE %s", output.RecordID)}
	}

	output.ArticleTitle = doc.CombinedTitle()
	if len(output.ArticleTitle) == 0 {
		return output, span.Skip{Reason: fmt.Sprintf("NO_ATITLE %s", output.RecordID)}
	}

	for _, title := range ArticleTitleBlocker {
		if output.ArticleTitle == title {
			return output, span.Skip{Reason: fmt.Sprintf("BLOCKED_ATITLE %s", output.RecordID)}
		}
	}

	for _, p := range ArticleTitleCleanerPatterns {
		output.ArticleTitle = p.ReplaceAllString(output.ArticleTitle, "")
	}

	// refs. #8428
	if len(output.ArticleTitle) > 32000 {
		return output, span.Skip{Reason: fmt.Sprintf("TOO_LONG_TITLE %s", output.RecordID)}
	}

	output.DOI = doc.DOI
	output.Format = Formats.LookupDefault(doc.Type, DefaultFormat)
	output.Genre = Genres.LookupDefault(doc.Type, "unknown")
	output.ISSN = doc.ISSN
	output.Issue = doc.Issue
	output.Languages = []string{"eng"}
	output.Publishers = append(output.Publishers, doc.Publisher)
	output.RefType = RefTypes.LookupDefault(doc.Type, "GEN")
	output.SourceID = SourceID
	output.Subjects = doc.Subjects
	output.Type = doc.Type
	output.URL = append(output.URL, doc.URL)
	output.Volume = doc.Volume

	if len(doc.ContainerTitle) > 0 {
		output.JournalTitle = span.UnescapeTrim(doc.ContainerTitle[0])
	} else {
		return output, span.Skip{Reason: fmt.Sprintf("NO_JTITLE %s", output.RecordID)}
	}

	if len(doc.Subtitle) > 0 {
		output.ArticleSubtitle = span.UnescapeTrim(doc.Subtitle[0])
	}

	output.Authors = doc.Authors()

	// TODO(miku): do we need a config for these things?
	// Maybe a generic filter (in js?) that will gather exclusion rules?
	// if len(output.Authors) == 0 {
	// 	return output, span.Skip{Reason: fmt.Sprintf("NO_AUTHORS %s", output.RecordID)}
	// }

	pi := doc.PageInfo()
	output.StartPage = fmt.Sprintf("%d", pi.StartPage)
	output.EndPage = fmt.Sprintf("%d", pi.EndPage)
	output.Pages = pi.RawMessage
	output.PageCount = fmt.Sprintf("%d", pi.PageCount())

	if doc.Publisher == "" {
		output.MegaCollection = fmt.Sprintf("X-U (CrossRef)")
	} else {
		output.MegaCollection = fmt.Sprintf("%s (CrossRef)", doc.Publisher)
	}

	return output, nil
}
Ejemplo n.º 4
0
// ShortTitle returns the first main title only.
func (doc *Document) ShortTitle() (s string) {
	if len(doc.Title) > 0 {
		s = span.UnescapeTrim(doc.Title[0])
	}
	return
}