func (article Article) ToIntermediateSchema() (*finc.IntermediateSchema, error) { output := finc.NewIntermediateSchema() output.RecordID = article.RecordID() output.SourceID = SourceID output.DOI = article.DOI() if len(output.DOI) == 0 { return output, fmt.Errorf("empty DOI") } output.MegaCollection = Collection output.Genre = Genre output.Format = Format output.URL = []string{fmt.Sprintf("http://doi.org/%s", output.DOI)} output.Volume = article.Front.ArticleMeta.Volume output.Issue = article.Front.ArticleMeta.Issue output.ArticleTitle = article.Front.ArticleMeta.TitleGroup.ArticleTitle output.JournalTitle = article.Front.JournalMeta.JournalTitleGroup.Title output.ISSN = article.Front.JournalMeta.Issn output.StartPage = article.Front.ArticleMeta.Fpage output.EndPage = article.Front.ArticleMeta.Lpage output.Abstract = article.Front.ArticleMeta.Abstract.Text t, err := article.ParseTime() if err != nil { return output, err } output.Date = t output.RawDate = output.Date.Format("2006-01-02") output.RefType = DefaultRefType if article.Front.JournalMeta.Publisher.Name != "" { output.Publishers = []string{article.Front.JournalMeta.Publisher.Name} } return output, nil }
// ToInternalSchema converts a jats article into an internal schema. // This is a basic implementation, different source might implement their own. func (article *Article) ToIntermediateSchema() (*finc.IntermediateSchema, error) { output := finc.NewIntermediateSchema() output.Date = article.Date() output.RawDate = output.Date.Format("2006-01-02") output.Abstract = string(article.Front.Article.Abstract.Value) output.ArticleTitle = article.CombinedTitle() output.Authors = article.Authors() output.Fulltext = article.Body.Section.Value output.Genre = "article" output.RefType = "JOUR" output.Headings = article.Headings() output.ISSN = article.ISSN() output.Issue = article.Front.Article.Issue.Value output.JournalTitle = article.JournalTitle() output.Languages = article.Languages() output.Publishers = append(output.Publishers, article.Front.Journal.Publisher.Name.Value) output.Subjects = article.Subjects() output.Volume = article.Front.Article.Volume.Value output.StartPage = article.Front.Article.FirstPage.Value output.EndPage = article.Front.Article.LastPage.Value output.PageCount = article.PageCount() output.Pages = fmt.Sprintf("%s-%s", output.StartPage, output.EndPage) return output, nil }
// ToIntermediateSchema does a type conversion only. func (p Publication) ToIntermediateSchema() (*finc.IntermediateSchema, error) { is := finc.NewIntermediateSchema() is.JournalTitle = p.Title is.ArticleTitle = p.Volume.Article.Title is.ISSN = p.PaperISSN() is.EISSN = p.OnlineISSN() if len(is.ISSN) == 0 && len(is.EISSN) == 0 { // TODO(miku): sort out the various types return is, span.Skip{Reason: "no ISSN"} } is.Abstract = p.Volume.Article.Articleinfo.Abstract date, err := p.Date() if err != nil { return is, span.Skip{Reason: err.Error()} } is.Date = date is.RawDate = date.Format("2006-01-02") is.Authors = p.Authors() is.URL = []string{} is.SourceID = SourceID is.MegaCollection = Collection is.Format = Format if p.Volume.Article.Articleinfo.Amsid != "" { is.URL = append(is.URL, fmt.Sprintf("http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=%s", p.Volume.Article.Articleinfo.Amsid)) is.RecordID = fmt.Sprintf("ai-89-%s", base64.RawURLEncoding.EncodeToString([]byte(p.Volume.Article.Articleinfo.Amsid))) } else { return is, ErrNoIdentifier } if p.Volume.Article.Articleinfo.Articledoi != "" { is.DOI = p.Volume.Article.Articleinfo.Articledoi is.URL = append(is.URL, fmt.Sprintf("http://doi.org/%s", is.DOI)) } is.Volume = p.Volume.Volumeinfo.Volumenum is.Issue = p.Volume.Article.Articleinfo.Issuenum is.Pages = p.Volume.Article.Articleinfo.Numpages is.Publishers = []string{"IEEE"} is.Subjects = []string{} for _, kw := range p.Volume.Article.Articleinfo.Keywordset.Keyword { is.Subjects = append(is.Subjects, kw.Term) } is.RefType = DefaultRefType return is, nil }
// ToInternalSchema converts a jats article into an internal schema. // This is a basic implementation, different source might implement their own. func (r *Record) ToIntermediateSchema() (*finc.IntermediateSchema, error) { output := finc.NewIntermediateSchema() date, err := r.Date() if err != nil { return output, nil } output.Date = date output.RawDate = output.Date.Format("2006-01-02") output.Abstract = strings.TrimSpace(r.Description()) output.ArticleTitle = r.Title() output.Publishers = r.Metadata.Dc.Publisher output.Authors = r.Authors() output.URL = r.Links() output.Subjects = r.Metadata.Dc.Subject // TODO(miku): normalize output.Languages = r.Metadata.Dc.Language output.DOI = r.DOI() output.Series = strings.Join(r.Metadata.Dc.Source, ", ") return output, nil }
// ToIntermediateSchema converts a genios document into an intermediate schema document. // Will fail/skip records with unusable dates. func (doc Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) { var err error output := finc.NewIntermediateSchema() output.Date, err = doc.Date() if err != nil { return output, span.Skip{Reason: err.Error()} } output.RawDate = output.Date.Format("2006-01-02") output.Authors = doc.Authors() output.URL = append(output.URL, doc.URL()) if isNomenNescio(doc.Abstract) { cutoff := len(doc.Text) if cutoff > textAsAbstractCutoff { cutoff = textAsAbstractCutoff } output.Abstract = strings.TrimSpace(doc.Text[:cutoff]) } else { output.Abstract = strings.TrimSpace(doc.Abstract) } output.ArticleTitle = strings.TrimSpace(doc.Title) if len(output.ArticleTitle) > 16384 { return output, span.Skip{Reason: fmt.Sprintf("article title too long: %d", len(output.ArticleTitle))} } output.JournalTitle = strings.Replace(strings.TrimSpace(doc.PublicationTitle), "\n", " ", -1) output.ISSN = doc.ISSNList() if !isNomenNescio(doc.Issue) { output.Issue = strings.TrimSpace(doc.Issue) } if !isNomenNescio(doc.Volume) { output.Volume = strings.TrimSpace(doc.Volume) } output.Fulltext = doc.Text output.Format = Format output.Genre = Genre output.Languages = doc.Languages() var packageNames = dbmap.LookupDefault(doc.DB, []string{}) var prefixedPackageNames []string for _, name := range packageNames { prefixedPackageNames = append(prefixedPackageNames, fmt.Sprintf("Genios (%s)", name)) } // hack, to move Genios (LIT) further down sort.Sort(sort.Reverse(sort.StringSlice(prefixedPackageNames))) // Note DB name as well as package name (Wiwi, Sowi, Recht, etc.) as well // as kind, which - a bit confusingly - is also package in licensing terms (FZS). output.Packages = append([]string{doc.DB}, prefixedPackageNames...) if len(prefixedPackageNames) > 0 { output.MegaCollection = prefixedPackageNames[0] } else { log.Printf("genios: db is not associated with package: %s, using generic default", doc.DB) output.MegaCollection = fmt.Sprintf("Genios") } id := doc.RecordID() // 250 is a limit on memcached keys; offending key was: // ai-48-R1JFUl9fU2NoZWliIEVsZWt0cm90ZWNobmlrIEdtYkggwr\ // dTdGV1ZXJ1bmdzYmF1IMK3SW5kdXN0cmllLUVsZWt0cm9uaWsgwr\ // dFbGVrdHJvbWFzY2hpbmVuYmF1IMK3SW5kdXN0cmllLVNlcnZpY2\ // UgwrdEYW5mb3NzLVN5c3RlbXBhcnRuZXIgwrdEYW5mb3NzIERyaX\ // ZlcyBDZW50ZXIgwrdNYXJ0aW4gU2ljaGVyaGVpdHN0ZWNobmlr if len(id) > span.KeyLengthLimit { return output, span.Skip{Reason: fmt.Sprintf("id too long: %s", id)} } output.RecordID = id output.SourceID = SourceID output.Subjects = doc.Headings() // keep the date indicator, so we can create an update order output.Indicator = doc.XIssue output.RefType = DefaultRefType return output, nil }
// BatchConvert converts all items for a shipment into importable objects. func (s Shipment) BatchConvert() ([]span.Importer, error) { var outputs []span.Importer for _, ji := range s.dataset.DatasetContent.JournalIssue { pii := ji.JournalIssueUniqueIds.Pii si, ok := s.issues[pii] if !ok { log.Println(fmt.Sprintf("skipping, issue referenced %s, but not cached", pii)) continue } for _, sec := range si.IssueBody.IssueSec { for _, ii := range sec.IncludeItem { output := finc.NewIntermediateSchema() article, ok := s.articles[ii.Pii] if !ok { log.Println(fmt.Sprintf("skipping, article referenced %s, but not cached", ii.Pii)) continue } output.Authors = article.Authors() output.DOI = article.ItemInfo.Doi output.Format = Format output.Genre = Genre output.ISSN = []string{si.IssueInfo.Issn} output.Issue = si.IssueInfo.VolumeIssueNumber.IssFirst output.Languages = []string{"eng"} output.MegaCollection = Collection output.RecordID = fmt.Sprintf("ai-%s-%s", SourceID, base64.RawURLEncoding.EncodeToString([]byte(article.ItemInfo.Doi))) output.RefType = DefaultRefType output.SourceID = SourceID output.Volume = si.IssueInfo.VolumeIssueNumber.VolFirst output.ArticleTitle = article.Title() output.JournalTitle = ji.JournalIssueProperties.CollectionTitle output.StartPage = ii.Pages.FirstPage output.EndPage = ii.Pages.LastPage output.Pages = ii.Pages.Total() output.URL = []string{ fmt.Sprintf("http://doi.org/%s", article.ItemInfo.Doi), } date, err := article.Date() if err != nil { log.Printf("%+v: %s", article.Head, err) continue } output.Date = date output.RawDate = date.Format("2006-01-02") var buf bytes.Buffer for _, abs := range article.Head.Abstract { buf.WriteString(sanitize.HTML(abs.Text)) } output.Abstract = buf.String() outputs = append(outputs, span.Importer(SchemaFunc(*output))) } } } return outputs, nil }
// ToIntermediateSchema is a toy converter. func (s mockSchema) ToIntermediateSchema() (*finc.IntermediateSchema, error) { is := finc.NewIntermediateSchema() is.ArticleTitle = s.Name return is, nil }
// ToIntermediateSchema converts a crossref document into IS. func (doc *Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) { var err error output := finc.NewIntermediateSchema() output.Date, err = doc.Issued.Date() if err != nil { return output, err } output.RawDate = output.Date.Format("2006-01-02") if doc.URL == "" { return output, errNoURL } output.RecordID = doc.RecordID() if len(output.RecordID) > span.KeyLengthLimit { return output, span.Skip{Reason: fmt.Sprintf("ID_TOO_LONG %s", output.RecordID)} } if output.Date.After(Future) { return output, span.Skip{Reason: fmt.Sprintf("TOO_FUTURISTIC %s", output.RecordID)} } if doc.Type == "journal-issue" { return output, span.Skip{Reason: fmt.Sprintf("JOURNAL_ISSUE %s", output.RecordID)} } output.ArticleTitle = doc.CombinedTitle() if len(output.ArticleTitle) == 0 { return output, span.Skip{Reason: fmt.Sprintf("NO_ATITLE %s", output.RecordID)} } for _, title := range ArticleTitleBlocker { if output.ArticleTitle == title { return output, span.Skip{Reason: fmt.Sprintf("BLOCKED_ATITLE %s", output.RecordID)} } } for _, p := range ArticleTitleCleanerPatterns { output.ArticleTitle = p.ReplaceAllString(output.ArticleTitle, "") } // refs. #8428 if len(output.ArticleTitle) > 32000 { return output, span.Skip{Reason: fmt.Sprintf("TOO_LONG_TITLE %s", output.RecordID)} } output.DOI = doc.DOI output.Format = Formats.LookupDefault(doc.Type, DefaultFormat) output.Genre = Genres.LookupDefault(doc.Type, "unknown") output.ISSN = doc.ISSN output.Issue = doc.Issue output.Languages = []string{"eng"} output.Publishers = append(output.Publishers, doc.Publisher) output.RefType = RefTypes.LookupDefault(doc.Type, "GEN") output.SourceID = SourceID output.Subjects = doc.Subjects output.Type = doc.Type output.URL = append(output.URL, doc.URL) output.Volume = doc.Volume if len(doc.ContainerTitle) > 0 { output.JournalTitle = span.UnescapeTrim(doc.ContainerTitle[0]) } else { return output, span.Skip{Reason: fmt.Sprintf("NO_JTITLE %s", output.RecordID)} } if len(doc.Subtitle) > 0 { output.ArticleSubtitle = span.UnescapeTrim(doc.Subtitle[0]) } output.Authors = doc.Authors() // TODO(miku): do we need a config for these things? // Maybe a generic filter (in js?) that will gather exclusion rules? // if len(output.Authors) == 0 { // return output, span.Skip{Reason: fmt.Sprintf("NO_AUTHORS %s", output.RecordID)} // } pi := doc.PageInfo() output.StartPage = fmt.Sprintf("%d", pi.StartPage) output.EndPage = fmt.Sprintf("%d", pi.EndPage) output.Pages = pi.RawMessage output.PageCount = fmt.Sprintf("%d", pi.PageCount()) if doc.Publisher == "" { output.MegaCollection = fmt.Sprintf("X-U (CrossRef)") } else { output.MegaCollection = fmt.Sprintf("%s (CrossRef)", doc.Publisher) } return output, nil }
// ToIntermediateSchema converts a doaj document to intermediate schema. For // now any record, that has no usable date will be skipped. func (doc Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) { var err error output := finc.NewIntermediateSchema() output.Date, err = doc.Date() if err != nil { return output, span.Skip{Reason: err.Error()} } output.RawDate = output.Date.Format("2006-01-02") id := fmt.Sprintf("ai-%s-%s", SourceID, doc.ID) if len(id) > span.KeyLengthLimit { return output, span.Skip{Reason: fmt.Sprintf("id too long: %s", id)} } output.RecordID = id output.Genre = Genre output.DOI = doc.DOI() output.Format = Format output.MegaCollection = Collection output.SourceID = SourceID output.ISSN = doc.Index.ISSN output.ArticleTitle = doc.BibJson.Title output.JournalTitle = doc.BibJson.Journal.Title output.Volume = doc.BibJson.Journal.Volume output.Publishers = append(output.Publishers, doc.BibJson.Journal.Publisher) for _, link := range doc.BibJson.Link { output.URL = append(output.URL, link.URL) } output.StartPage = doc.BibJson.StartPage output.EndPage = doc.BibJson.EndPage if sp, err := strconv.Atoi(doc.BibJson.StartPage); err == nil { if ep, err := strconv.Atoi(doc.BibJson.EndPage); err == nil { output.PageCount = fmt.Sprintf("%d", ep-sp) output.Pages = fmt.Sprintf("%d-%d", sp, ep) } } subjects := container.NewStringSet() for _, s := range doc.Index.SchemaCode { class := LCCPatterns.LookupDefault(strings.Replace(s, "LCC:", "", -1), finc.NOT_ASSIGNED) if class != finc.NOT_ASSIGNED { subjects.Add(class) } } if subjects.Size() == 0 { output.Subjects = []string{finc.NOT_ASSIGNED} } else { output.Subjects = subjects.SortedValues() } languages := container.NewStringSet() for _, l := range doc.Index.Language { languages.Add(LanguageMap.LookupDefault(l, "und")) } output.Languages = languages.Values() for _, author := range doc.BibJson.Author { output.Authors = append(output.Authors, finc.Author{Name: author.Name}) } return output, nil }
// ToIntermediateSchema converts a genios document into an intermediate schema document. // Will fail/skip records with unusable dates. func (doc Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) { var err error output := finc.NewIntermediateSchema() output.Date, err = doc.Date() if err != nil { return output, span.Skip{Reason: err.Error()} } output.RawDate = output.Date.Format("2006-01-02") for _, author := range doc.Authors() { output.Authors = append(output.Authors, finc.Author{Name: author}) } output.URL = append(output.URL, doc.URL()) if !NomenNescio(doc.Abstract) { output.Abstract = strings.TrimSpace(doc.Abstract) } else { cutoff := len(doc.Text) if cutoff > textAsAbstractCutoff { cutoff = textAsAbstractCutoff } output.Abstract = strings.TrimSpace(doc.Text[:cutoff]) } output.ArticleTitle = strings.TrimSpace(doc.Title) output.JournalTitle = strings.TrimSpace(doc.PublicationTitle) if !NomenNescio(doc.ISSN) { output.ISSN = append(output.ISSN, strings.TrimSpace(doc.ISSN)) } if !NomenNescio(doc.Issue) { output.Issue = strings.TrimSpace(doc.Issue) } if !NomenNescio(doc.Volume) { output.Volume = strings.TrimSpace(doc.Volume) } output.Format = Format output.Genre = Genre output.Languages = doc.Languages() output.MegaCollection = fmt.Sprintf("Genios (%s)", collections[doc.Group]) id := doc.RecordID() // 250 is a limit on memcached keys; offending key was: // ai-48-R1JFUl9fU2NoZWliIEVsZWt0cm90ZWNobmlrIEdtYkggwr\ // dTdGV1ZXJ1bmdzYmF1IMK3SW5kdXN0cmllLUVsZWt0cm9uaWsgwr\ // dFbGVrdHJvbWFzY2hpbmVuYmF1IMK3SW5kdXN0cmllLVNlcnZpY2\ // UgwrdEYW5mb3NzLVN5c3RlbXBhcnRuZXIgwrdEYW5mb3NzIERyaX\ // ZlcyBDZW50ZXIgwrdNYXJ0aW4gU2ljaGVyaGVpdHN0ZWNobmlr if len(id) > span.KeyLengthLimit { return output, span.Skip{Reason: fmt.Sprintf("id too long: %s", id)} } output.RecordID = id output.SourceID = SourceID output.Subjects = doc.Headings() return output, nil }
func (doc Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) { output := finc.NewIntermediateSchema() id, err := doc.RecordID() if err != nil { return output, err } output.RecordID = id output.SourceID = SourceID output.MegaCollection = Collection output.Genre = Genre output.Format = Format doi, err := doc.DOI() if err != nil { return output, err } output.DOI = doi date, err := doc.Date() if err != nil { return output, span.Skip{Reason: err.Error()} } output.Date = date journal := doc.Journal if journal.PublisherName != "" { output.Publishers = append(output.Publishers, journal.PublisherName) } if journal.JournalTitle == "" { return output, span.Skip{Reason: fmt.Sprintf("NO_JTITLE %s", output.RecordID)} } output.JournalTitle = journal.JournalTitle output.ISSN = append(output.ISSN, journal.ISSN) output.EISSN = append(output.EISSN, journal.EISSN) output.Volume = journal.Volume output.Issue = journal.Issue output.ArticleTitle = doc.ArticleTitle if output.ArticleTitle == "" { output.ArticleTitle = doc.VernacularTitle } output.Abstract = doc.Abstract if output.Abstract == "" { output.Abstract = doc.VernacularAbstract } for _, link := range doc.Links { output.URL = append(output.URL, link) } var subjects []string for _, s := range doc.Subject { if len(strings.TrimSpace(s)) > 0 { subjects = append(subjects, s) } } output.Subjects = subjects if doc.Language != "" { output.Languages = append(output.Languages, LanguageMap.LookupDefault(strings.ToUpper(doc.Language), "und")) } else { if doc.VernacularLanguage != "" { output.Languages = append(output.Languages, LanguageMap.LookupDefault(strings.ToUpper(doc.VernacularLanguage), "und")) } } var authors []finc.Author for _, author := range doc.AuthorList.Authors { authors = append(authors, finc.Author{FirstName: author.FirstName, LastName: author.LastName}) } output.Authors = authors output.RefType = DefaultRefType return output, nil }