Example #1
0
// ToIntermediateSchema maps the article onto an intermediate schema record.
// An error is returned if the article has no DOI or if its date cannot be
// parsed; in both cases the record filled so far is returned alongside it.
func (article Article) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	is := finc.NewIntermediateSchema()

	// Identifiers come first: a record without a DOI is rejected right away.
	is.RecordID = article.RecordID()
	is.SourceID = SourceID
	is.DOI = article.DOI()
	if is.DOI == "" {
		return is, fmt.Errorf("empty DOI")
	}

	// Source-wide constants and the DOI resolver link.
	is.MegaCollection = Collection
	is.Genre = Genre
	is.Format = Format
	is.URL = []string{fmt.Sprintf("http://doi.org/%s", is.DOI)}

	// Bibliographic fields taken from the article and journal front matter.
	meta := article.Front.ArticleMeta
	journal := article.Front.JournalMeta
	is.Volume = meta.Volume
	is.Issue = meta.Issue
	is.ArticleTitle = meta.TitleGroup.ArticleTitle
	is.JournalTitle = journal.JournalTitleGroup.Title
	is.ISSN = journal.Issn
	is.StartPage = meta.Fpage
	is.EndPage = meta.Lpage
	is.Abstract = meta.Abstract.Text

	parsed, err := article.ParseTime()
	if err != nil {
		return is, err
	}
	is.Date = parsed
	is.RawDate = parsed.Format("2006-01-02")
	is.RefType = DefaultRefType

	// Only record a publisher if one is actually named.
	if name := journal.Publisher.Name; name != "" {
		is.Publishers = []string{name}
	}
	return is, nil
}
Example #2
0
// ToIntermediateSchema converts a jats article into an intermediate schema
// record. This is a basic implementation; individual sources might implement
// their own. The returned error is always nil.
func (article *Article) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	is := finc.NewIntermediateSchema()
	meta := article.Front.Article

	// Date, plus its ISO-style string form.
	is.Date = article.Date()
	is.RawDate = is.Date.Format("2006-01-02")

	// Fixed classification for this basic converter.
	is.Genre = "article"
	is.RefType = "JOUR"

	// Values computed by the article's accessor methods.
	is.ArticleTitle = article.CombinedTitle()
	is.Authors = article.Authors()
	is.Headings = article.Headings()
	is.ISSN = article.ISSN()
	is.JournalTitle = article.JournalTitle()
	is.Languages = article.Languages()
	is.Subjects = article.Subjects()
	is.PageCount = article.PageCount()

	// Values copied straight from the parsed document.
	is.Abstract = string(meta.Abstract.Value)
	is.Fulltext = article.Body.Section.Value
	is.Issue = meta.Issue.Value
	is.Volume = meta.Volume.Value
	is.Publishers = append(is.Publishers, article.Front.Journal.Publisher.Name.Value)

	// Page range is rendered as "<first>-<last>".
	is.StartPage = meta.FirstPage.Value
	is.EndPage = meta.LastPage.Value
	is.Pages = fmt.Sprintf("%s-%s", is.StartPage, is.EndPage)

	return is, nil
}
Example #3
0
// ToIntermediateSchema converts the publication into an intermediate schema
// record. Publications without any ISSN or without a usable date are reported
// as span.Skip; publications without an Amsid yield ErrNoIdentifier. In every
// error case the record filled so far is returned as well.
func (p Publication) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	output := finc.NewIntermediateSchema()
	info := p.Volume.Article.Articleinfo

	output.JournalTitle = p.Title
	output.ArticleTitle = p.Volume.Article.Title
	output.ISSN = p.PaperISSN()
	output.EISSN = p.OnlineISSN()
	if len(output.ISSN)+len(output.EISSN) == 0 {
		// TODO(miku): sort out the various types
		return output, span.Skip{Reason: "no ISSN"}
	}

	output.Abstract = info.Abstract

	published, err := p.Date()
	if err != nil {
		return output, span.Skip{Reason: err.Error()}
	}
	output.Date = published
	output.RawDate = published.Format("2006-01-02")

	output.Authors = p.Authors()
	output.URL = []string{}
	output.SourceID = SourceID
	output.MegaCollection = Collection
	output.Format = Format

	// The Amsid is the only identifier the record ID is derived from.
	if info.Amsid == "" {
		return output, ErrNoIdentifier
	}
	output.URL = append(output.URL, fmt.Sprintf("http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=%s", info.Amsid))
	output.RecordID = fmt.Sprintf("ai-89-%s", base64.RawURLEncoding.EncodeToString([]byte(info.Amsid)))

	// A DOI is optional; when present it also contributes a resolver link.
	if doi := info.Articledoi; doi != "" {
		output.DOI = doi
		output.URL = append(output.URL, fmt.Sprintf("http://doi.org/%s", output.DOI))
	}

	output.Volume = p.Volume.Volumeinfo.Volumenum
	output.Issue = info.Issuenum
	output.Pages = info.Numpages
	output.Publishers = []string{"IEEE"}

	// Start from an empty, non-nil slice, then collect the keyword terms.
	output.Subjects = []string{}
	for _, keyword := range info.Keywordset.Keyword {
		output.Subjects = append(output.Subjects, keyword.Term)
	}

	output.RefType = DefaultRefType
	return output, nil
}
Example #4
0
File: dc.go Project: ubleipzig/span
// ToIntermediateSchema converts the record, using its Dc metadata, into an
// intermediate schema record. This is a basic implementation; individual
// sources might implement their own.
//
// If the record has no usable date, the date error is returned together with
// the still empty record, so that callers do not mistake a blank record for a
// successful conversion.
func (r *Record) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	output := finc.NewIntermediateSchema()

	date, err := r.Date()
	if err != nil {
		return output, err
	}

	output.Date = date
	output.RawDate = output.Date.Format("2006-01-02")
	output.Abstract = strings.TrimSpace(r.Description())
	output.ArticleTitle = r.Title()
	output.Publishers = r.Metadata.Dc.Publisher
	output.Authors = r.Authors()
	output.URL = r.Links()
	output.Subjects = r.Metadata.Dc.Subject
	// TODO(miku): normalize
	output.Languages = r.Metadata.Dc.Language
	output.DOI = r.DOI()

	// All source entries are collapsed into one comma separated series string.
	output.Series = strings.Join(r.Metadata.Dc.Source, ", ")

	return output, nil
}
Example #5
0
// ToIntermediateSchema converts a genios document into an intermediate schema
// document. Records with unusable dates, an article title longer than 16384
// bytes or a record ID longer than span.KeyLengthLimit are skipped (span.Skip).
//
// If the document has no usable abstract, the leading part of the fulltext
// (at most textAsAbstractCutoff bytes, never ending inside a multi-byte UTF-8
// character) is used instead.
func (doc Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	var err error
	output := finc.NewIntermediateSchema()

	output.Date, err = doc.Date()
	if err != nil {
		return output, span.Skip{Reason: err.Error()}
	}
	output.RawDate = output.Date.Format("2006-01-02")

	output.Authors = doc.Authors()

	output.URL = append(output.URL, doc.URL())

	if isNomenNescio(doc.Abstract) {
		cutoff := len(doc.Text)
		if cutoff > textAsAbstractCutoff {
			cutoff = textAsAbstractCutoff
			// A plain byte cut may land inside a multi-byte UTF-8 sequence and
			// leave invalid UTF-8 at the end of the abstract. Step back while
			// the byte at the cut is a continuation byte (10xxxxxx).
			for cutoff > 0 && doc.Text[cutoff]&0xC0 == 0x80 {
				cutoff--
			}
		}
		output.Abstract = strings.TrimSpace(doc.Text[:cutoff])
	} else {
		output.Abstract = strings.TrimSpace(doc.Abstract)
	}

	output.ArticleTitle = strings.TrimSpace(doc.Title)
	if len(output.ArticleTitle) > 16384 {
		return output, span.Skip{Reason: fmt.Sprintf("article title too long: %d", len(output.ArticleTitle))}
	}
	output.JournalTitle = strings.Replace(strings.TrimSpace(doc.PublicationTitle), "\n", " ", -1)

	output.ISSN = doc.ISSNList()

	if !isNomenNescio(doc.Issue) {
		output.Issue = strings.TrimSpace(doc.Issue)
	}

	if !isNomenNescio(doc.Volume) {
		output.Volume = strings.TrimSpace(doc.Volume)
	}

	output.Fulltext = doc.Text
	output.Format = Format
	output.Genre = Genre
	output.Languages = doc.Languages()

	var packageNames = dbmap.LookupDefault(doc.DB, []string{})

	var prefixedPackageNames []string
	for _, name := range packageNames {
		prefixedPackageNames = append(prefixedPackageNames, fmt.Sprintf("Genios (%s)", name))
	}

	// hack, to move Genios (LIT) further down
	sort.Sort(sort.Reverse(sort.StringSlice(prefixedPackageNames)))

	// Note DB name as well as package name (Wiwi, Sowi, Recht, etc.) as well
	// as kind, which - a bit confusingly - is also package in licensing terms (FZS).
	output.Packages = append([]string{doc.DB}, prefixedPackageNames...)

	if len(prefixedPackageNames) > 0 {
		output.MegaCollection = prefixedPackageNames[0]
	} else {
		log.Printf("genios: db is not associated with package: %s, using generic default", doc.DB)
		output.MegaCollection = "Genios"
	}

	id := doc.RecordID()
	// 250 is a limit on memcached keys; offending key was:
	// ai-48-R1JFUl9fU2NoZWliIEVsZWt0cm90ZWNobmlrIEdtYkggwr\
	// dTdGV1ZXJ1bmdzYmF1IMK3SW5kdXN0cmllLUVsZWt0cm9uaWsgwr\
	// dFbGVrdHJvbWFzY2hpbmVuYmF1IMK3SW5kdXN0cmllLVNlcnZpY2\
	// UgwrdEYW5mb3NzLVN5c3RlbXBhcnRuZXIgwrdEYW5mb3NzIERyaX\
	// ZlcyBDZW50ZXIgwrdNYXJ0aW4gU2ljaGVyaGVpdHN0ZWNobmlr
	if len(id) > span.KeyLengthLimit {
		return output, span.Skip{Reason: fmt.Sprintf("id too long: %s", id)}
	}
	output.RecordID = id
	output.SourceID = SourceID
	output.Subjects = doc.Headings()

	// keep the date indicator, so we can create an update order
	output.Indicator = doc.XIssue
	output.RefType = DefaultRefType

	return output, nil
}
Example #6
0
// BatchConvert converts all items for a shipment into importable objects.
//
// The dataset's journal issues are walked in order. An issue or article that
// is referenced but not present in the shipment's caches is logged and
// skipped, and so is an article whose date cannot be determined. The returned
// error is always nil.
func (s Shipment) BatchConvert() ([]span.Importer, error) {
	var outputs []span.Importer

	for _, ji := range s.dataset.DatasetContent.JournalIssue {
		pii := ji.JournalIssueUniqueIds.Pii
		si, ok := s.issues[pii]
		if !ok {
			log.Printf("skipping, issue referenced %s, but not cached", pii)
			continue
		}
		for _, sec := range si.IssueBody.IssueSec {
			for _, ii := range sec.IncludeItem {
				article, ok := s.articles[ii.Pii]
				if !ok {
					log.Printf("skipping, article referenced %s, but not cached", ii.Pii)
					continue
				}

				// Run every check that can reject the item before allocating
				// and filling the record, so skipped items cost nothing.
				date, err := article.Date()
				if err != nil {
					log.Printf("%+v: %s", article.Head, err)
					continue
				}

				doi := article.ItemInfo.Doi

				output := finc.NewIntermediateSchema()
				output.Authors = article.Authors()
				output.DOI = doi
				output.Format = Format
				output.Genre = Genre
				output.ISSN = []string{si.IssueInfo.Issn}
				output.Issue = si.IssueInfo.VolumeIssueNumber.IssFirst
				output.Languages = []string{"eng"}
				output.MegaCollection = Collection
				// The record ID embeds the URL-safe base64 form of the DOI.
				output.RecordID = fmt.Sprintf("ai-%s-%s", SourceID, base64.RawURLEncoding.EncodeToString([]byte(doi)))
				output.RefType = DefaultRefType
				output.SourceID = SourceID
				output.Volume = si.IssueInfo.VolumeIssueNumber.VolFirst

				output.ArticleTitle = article.Title()
				output.JournalTitle = ji.JournalIssueProperties.CollectionTitle

				output.StartPage = ii.Pages.FirstPage
				output.EndPage = ii.Pages.LastPage
				output.Pages = ii.Pages.Total()

				output.URL = []string{
					fmt.Sprintf("http://doi.org/%s", doi),
				}

				output.Date = date
				output.RawDate = date.Format("2006-01-02")

				// All abstract parts are sanitized and concatenated.
				var buf bytes.Buffer
				for _, abs := range article.Head.Abstract {
					buf.WriteString(sanitize.HTML(abs.Text))
				}
				output.Abstract = buf.String()

				outputs = append(outputs, span.Importer(SchemaFunc(*output)))
			}
		}
	}
	return outputs, nil
}
Example #7
0
// ToIntermediateSchema is a toy converter: it copies the mock's name into the
// article title of a fresh record and never returns an error.
func (s mockSchema) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	schema := finc.NewIntermediateSchema()
	schema.ArticleTitle = s.Name
	return schema, nil
}
Example #8
0
// ToIntermediateSchema converts a crossref document into IS.
//
// A date error and a missing URL are returned as plain errors. The remaining
// rejections are reported as span.Skip with a reason of the form
// "<TAG> <record id>": over-long IDs, dates after Future, journal issues,
// empty, blocked or over-long titles, and a missing container title. The
// record filled so far accompanies every error.
func (doc *Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	output := finc.NewIntermediateSchema()

	// skip builds the uniform "<TAG> <record id>" skip result.
	skip := func(tag string) (*finc.IntermediateSchema, error) {
		return output, span.Skip{Reason: fmt.Sprintf("%s %s", tag, output.RecordID)}
	}

	issued, err := doc.Issued.Date()
	output.Date = issued
	if err != nil {
		return output, err
	}
	output.RawDate = output.Date.Format("2006-01-02")

	if doc.URL == "" {
		return output, errNoURL
	}

	output.RecordID = doc.RecordID()
	switch {
	case len(output.RecordID) > span.KeyLengthLimit:
		return skip("ID_TOO_LONG")
	case output.Date.After(Future):
		return skip("TOO_FUTURISTIC")
	case doc.Type == "journal-issue":
		return skip("JOURNAL_ISSUE")
	}

	output.ArticleTitle = doc.CombinedTitle()
	if output.ArticleTitle == "" {
		return skip("NO_ATITLE")
	}
	// The block list is matched against the title before it is cleaned.
	for _, blocked := range ArticleTitleBlocker {
		if blocked == output.ArticleTitle {
			return skip("BLOCKED_ATITLE")
		}
	}
	for _, pattern := range ArticleTitleCleanerPatterns {
		output.ArticleTitle = pattern.ReplaceAllString(output.ArticleTitle, "")
	}
	// refs. #8428
	if len(output.ArticleTitle) > 32000 {
		return skip("TOO_LONG_TITLE")
	}

	output.DOI = doc.DOI
	output.Format = Formats.LookupDefault(doc.Type, DefaultFormat)
	output.Genre = Genres.LookupDefault(doc.Type, "unknown")
	output.ISSN = doc.ISSN
	output.Issue = doc.Issue
	output.Languages = []string{"eng"}
	output.Publishers = append(output.Publishers, doc.Publisher)
	output.RefType = RefTypes.LookupDefault(doc.Type, "GEN")
	output.SourceID = SourceID
	output.Subjects = doc.Subjects
	output.Type = doc.Type
	output.URL = append(output.URL, doc.URL)
	output.Volume = doc.Volume

	// A container (journal) title is mandatory; only the first one is used.
	if len(doc.ContainerTitle) == 0 {
		return skip("NO_JTITLE")
	}
	output.JournalTitle = span.UnescapeTrim(doc.ContainerTitle[0])

	if len(doc.Subtitle) > 0 {
		output.ArticleSubtitle = span.UnescapeTrim(doc.Subtitle[0])
	}

	output.Authors = doc.Authors()

	// TODO(miku): do we need a config for these things?
	// Maybe a generic filter (in js?) that will gather exclusion rules?
	// Records without authors are currently NOT skipped (a NO_AUTHORS skip
	// was considered but is disabled).

	pages := doc.PageInfo()
	output.StartPage = fmt.Sprintf("%d", pages.StartPage)
	output.EndPage = fmt.Sprintf("%d", pages.EndPage)
	output.Pages = pages.RawMessage
	output.PageCount = fmt.Sprintf("%d", pages.PageCount())

	// The collection is named after the publisher, with a fixed fallback.
	output.MegaCollection = "X-U (CrossRef)"
	if doc.Publisher != "" {
		output.MegaCollection = fmt.Sprintf("%s (CrossRef)", doc.Publisher)
	}

	return output, nil
}
Example #9
0
// ToIntermediateSchema converts a doaj document to intermediate schema. For
// now any record that has no usable date will be skipped, as will any record
// whose ID exceeds span.KeyLengthLimit.
//
// Subjects and languages are both collected through string sets and emitted
// in sorted order, so repeated conversions of one document give the same
// output.
func (doc Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	var err error

	output := finc.NewIntermediateSchema()
	output.Date, err = doc.Date()
	if err != nil {
		return output, span.Skip{Reason: err.Error()}
	}
	output.RawDate = output.Date.Format("2006-01-02")

	id := fmt.Sprintf("ai-%s-%s", SourceID, doc.ID)
	if len(id) > span.KeyLengthLimit {
		return output, span.Skip{Reason: fmt.Sprintf("id too long: %s", id)}
	}
	output.RecordID = id
	output.Genre = Genre

	output.DOI = doc.DOI()
	output.Format = Format
	output.MegaCollection = Collection
	output.SourceID = SourceID

	output.ISSN = doc.Index.ISSN
	output.ArticleTitle = doc.BibJson.Title
	output.JournalTitle = doc.BibJson.Journal.Title
	output.Volume = doc.BibJson.Journal.Volume
	output.Publishers = append(output.Publishers, doc.BibJson.Journal.Publisher)

	for _, link := range doc.BibJson.Link {
		output.URL = append(output.URL, link.URL)
	}

	output.StartPage = doc.BibJson.StartPage
	output.EndPage = doc.BibJson.EndPage

	// Page count and page range are only derived when both page values are
	// plain integers; the count is the difference between end and start page.
	sp, startErr := strconv.Atoi(doc.BibJson.StartPage)
	ep, endErr := strconv.Atoi(doc.BibJson.EndPage)
	if startErr == nil && endErr == nil {
		output.PageCount = fmt.Sprintf("%d", ep-sp)
		output.Pages = fmt.Sprintf("%d-%d", sp, ep)
	}

	// Map schema codes (with any "LCC:" marker removed) to subject classes,
	// dropping codes without an assigned class.
	subjects := container.NewStringSet()
	for _, s := range doc.Index.SchemaCode {
		class := LCCPatterns.LookupDefault(strings.Replace(s, "LCC:", "", -1), finc.NOT_ASSIGNED)
		if class != finc.NOT_ASSIGNED {
			subjects.Add(class)
		}
	}
	if subjects.Size() == 0 {
		output.Subjects = []string{finc.NOT_ASSIGNED}
	} else {
		output.Subjects = subjects.SortedValues()
	}

	// Unknown languages are mapped to "und". The sorted form is used, as for
	// subjects, so the language order is reproducible.
	languages := container.NewStringSet()
	for _, l := range doc.Index.Language {
		languages.Add(LanguageMap.LookupDefault(l, "und"))
	}
	output.Languages = languages.SortedValues()

	for _, author := range doc.BibJson.Author {
		output.Authors = append(output.Authors, finc.Author{Name: author.Name})
	}

	return output, nil
}
Example #10
0
// ToIntermediateSchema converts a genios document into an intermediate schema
// document. Records with unusable dates or with a record ID longer than
// span.KeyLengthLimit are skipped (span.Skip).
//
// If the document has no usable abstract, the leading part of the text (at
// most textAsAbstractCutoff bytes, never ending inside a multi-byte UTF-8
// character) is used instead.
func (doc Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	var err error
	output := finc.NewIntermediateSchema()

	output.Date, err = doc.Date()
	if err != nil {
		return output, span.Skip{Reason: err.Error()}
	}
	output.RawDate = output.Date.Format("2006-01-02")

	for _, author := range doc.Authors() {
		output.Authors = append(output.Authors, finc.Author{Name: author})
	}

	output.URL = append(output.URL, doc.URL())

	if !NomenNescio(doc.Abstract) {
		output.Abstract = strings.TrimSpace(doc.Abstract)
	} else {
		cutoff := len(doc.Text)
		if cutoff > textAsAbstractCutoff {
			cutoff = textAsAbstractCutoff
			// A plain byte cut may land inside a multi-byte UTF-8 sequence and
			// leave invalid UTF-8 at the end of the abstract. Step back while
			// the byte at the cut is a continuation byte (10xxxxxx).
			for cutoff > 0 && doc.Text[cutoff]&0xC0 == 0x80 {
				cutoff--
			}
		}
		output.Abstract = strings.TrimSpace(doc.Text[:cutoff])
	}

	output.ArticleTitle = strings.TrimSpace(doc.Title)
	output.JournalTitle = strings.TrimSpace(doc.PublicationTitle)

	if !NomenNescio(doc.ISSN) {
		output.ISSN = append(output.ISSN, strings.TrimSpace(doc.ISSN))
	}

	if !NomenNescio(doc.Issue) {
		output.Issue = strings.TrimSpace(doc.Issue)
	}

	if !NomenNescio(doc.Volume) {
		output.Volume = strings.TrimSpace(doc.Volume)
	}

	output.Format = Format
	output.Genre = Genre
	output.Languages = doc.Languages()
	// The collection name is looked up by the document's group.
	output.MegaCollection = fmt.Sprintf("Genios (%s)", collections[doc.Group])
	id := doc.RecordID()
	// 250 is a limit on memcached keys; offending key was:
	// ai-48-R1JFUl9fU2NoZWliIEVsZWt0cm90ZWNobmlrIEdtYkggwr\
	// dTdGV1ZXJ1bmdzYmF1IMK3SW5kdXN0cmllLUVsZWt0cm9uaWsgwr\
	// dFbGVrdHJvbWFzY2hpbmVuYmF1IMK3SW5kdXN0cmllLVNlcnZpY2\
	// UgwrdEYW5mb3NzLVN5c3RlbXBhcnRuZXIgwrdEYW5mb3NzIERyaX\
	// ZlcyBDZW50ZXIgwrdNYXJ0aW4gU2ljaGVyaGVpdHN0ZWNobmlr
	if len(id) > span.KeyLengthLimit {
		return output, span.Skip{Reason: fmt.Sprintf("id too long: %s", id)}
	}
	output.RecordID = id
	output.SourceID = SourceID
	output.Subjects = doc.Headings()

	return output, nil
}
Example #11
0
File: tm.go Project: ubleipzig/span
// ToIntermediateSchema converts the document into an intermediate schema
// record.
//
// Errors from the record ID or DOI lookup are returned unchanged. A document
// without a usable date or without a journal title is reported as span.Skip.
// In every error case the record filled so far is returned as well.
func (doc Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	output := finc.NewIntermediateSchema()

	id, err := doc.RecordID()
	if err != nil {
		return output, err
	}
	output.RecordID = id
	output.SourceID = SourceID
	output.MegaCollection = Collection
	output.Genre = Genre
	output.Format = Format

	doi, err := doc.DOI()
	if err != nil {
		return output, err
	}
	output.DOI = doi

	date, err := doc.Date()
	if err != nil {
		return output, span.Skip{Reason: err.Error()}
	}
	output.Date = date
	// Keep the string form of the date in sync with the parsed date.
	output.RawDate = date.Format("2006-01-02")

	journal := doc.Journal
	if journal.PublisherName != "" {
		output.Publishers = append(output.Publishers, journal.PublisherName)
	}

	if journal.JournalTitle == "" {
		return output, span.Skip{Reason: fmt.Sprintf("NO_JTITLE %s", output.RecordID)}
	}

	output.JournalTitle = journal.JournalTitle
	// Only record ISSN values that are actually present, so the lists never
	// contain empty strings.
	if journal.ISSN != "" {
		output.ISSN = append(output.ISSN, journal.ISSN)
	}
	if journal.EISSN != "" {
		output.EISSN = append(output.EISSN, journal.EISSN)
	}
	output.Volume = journal.Volume
	output.Issue = journal.Issue

	// Title and abstract fall back to their vernacular variants.
	output.ArticleTitle = doc.ArticleTitle
	if output.ArticleTitle == "" {
		output.ArticleTitle = doc.VernacularTitle
	}

	output.Abstract = doc.Abstract
	if output.Abstract == "" {
		output.Abstract = doc.VernacularAbstract
	}

	for _, link := range doc.Links {
		output.URL = append(output.URL, link)
	}

	// Drop subjects that are empty or consist of whitespace only.
	var subjects []string
	for _, s := range doc.Subject {
		if len(strings.TrimSpace(s)) > 0 {
			subjects = append(subjects, s)
		}
	}
	output.Subjects = subjects

	// Prefer the document language, fall back to the vernacular language;
	// values missing from LanguageMap become "und".
	if doc.Language != "" {
		output.Languages = append(output.Languages, LanguageMap.LookupDefault(strings.ToUpper(doc.Language), "und"))
	} else if doc.VernacularLanguage != "" {
		output.Languages = append(output.Languages, LanguageMap.LookupDefault(strings.ToUpper(doc.VernacularLanguage), "und"))
	}

	var authors []finc.Author
	for _, author := range doc.AuthorList.Authors {
		authors = append(authors, finc.Author{FirstName: author.FirstName, LastName: author.LastName})
	}
	output.Authors = authors
	output.RefType = DefaultRefType

	return output, nil
}