// Languages returns the given and guessed languages // found in abstract and fulltext. Note: This is slow. // Skip detection on too short strings. func (article *Article) Languages() []string { set := container.NewStringSet() if article.Front.Article.Abstract.Lang != "" { base, err := language.ParseBase(article.Front.Article.Abstract.Lang) if err == nil { set.Add(base.ISO3()) } } vals := []string{ article.Front.Article.Abstract.Value, article.Front.Article.TranslatedAbstract.Title.Value, article.Body.Section.Value, } for _, s := range vals { if len(s) < 20 { continue } lang, err := span.DetectLang3(s) if err != nil || lang == "und" { continue } if !acceptedLanguages.Contains(lang) { continue } set.Add(lang) } return set.Values() }
// ISSNList returns a list of ISSN. func (doc Document) ISSNList() []string { issns := container.NewStringSet() for _, s := range span.ISSNPattern.FindAllString(doc.ISSN, -1) { issns.Add(s) } return issns.Values() }
func MustLoadStringSet(paths ...string) *container.StringSet { s := container.NewStringSet() for _, path := range paths { b, err := Asset(path) if err != nil { panic(err) } rdr := bufio.NewReader(bytes.NewReader(b)) for { line, err := rdr.ReadString('\n') if err == io.EOF { break } if err != nil { panic(err) } line = strings.TrimSpace(line) if line == "" { continue } s.Add(line) } } return s }
// UnmarshalJSON turns a config fragment into a ISSN filter. func (f *CollectionFilter) UnmarshalJSON(p []byte) error { var s struct { Collections []string `json:"collection"` } if err := json.Unmarshal(p, &s); err != nil { return err } f.values = *container.NewStringSet(s.Collections...) return nil }
// UnmarshalJSON turns a config fragment into a filter. func (f *PackageFilter) UnmarshalJSON(p []byte) error { var s struct { Packages []string `json:"package"` } if err := json.Unmarshal(p, &s); err != nil { return err } f.values = *container.NewStringSet(s.Packages...) return nil }
// Tags returns all ISILs that could be attached to a given intermediate // schema record. func (t ISILTagger) Tags(is finc.IntermediateSchema) []string { isils := container.NewStringSet() for isil, filters := range t { for _, f := range filters { if f.Apply(is) { isils.Add(isil) } } } return isils.Values() }
// Languages returns a list of language in 3-letter format. func (article *Article) Languages() []string { set := container.NewStringSet() for _, cm := range article.Front.Article.CustomMetaGroup.CustomMeta { if cm.Name.Value == "lang" { base, err := language.ParseBase(cm.Value.Value) if err == nil { set.Add(base.ISO3()) } } } return set.Values() }
// Export method from intermediate schema to solr 4/13 schema. func (s *Solr4Vufind13v1) Convert(is finc.IntermediateSchema) error { s.Allfields = is.Allfields() s.Formats = append(s.Formats, is.Format) s.Fullrecord = "blob:" + is.RecordID s.Fulltext = is.Fulltext s.HierarchyParentTitle = append(s.HierarchyParentTitle, is.JournalTitle) s.ID = is.RecordID s.Imprint = is.Imprint() s.ISSN = is.ISSNList() s.MegaCollections = append(s.MegaCollections, is.MegaCollection) s.PublishDateSort = is.Date.Year() s.Publishers = is.Publishers s.RecordType = finc.AIRecordType s.Series = append(s.Series, is.JournalTitle) s.SourceID = is.SourceID s.Subtitle = is.ArticleSubtitle s.TitleSort = is.SortableTitle() s.Topics = is.Subjects s.URL = is.URL classes := container.NewStringSet() for _, s := range is.Subjects { for _, class := range SubjectMapping.LookupDefault(s, []string{}) { classes.Add(class) } } s.FincClassFacet = classes.Values() sanitized := sanitize.HTML(is.ArticleTitle) s.Title, s.TitleFull, s.TitleShort = sanitized, sanitized, sanitized for _, lang := range is.Languages { s.Languages = append(s.Languages, LanguageMap.LookupDefault(lang, lang)) } for _, author := range is.Authors { s.SecondaryAuthors = append(s.SecondaryAuthors, author.String()) s.AuthorFacet = append(s.AuthorFacet, author.String()) } if len(s.SecondaryAuthors) > 0 { s.Author = s.SecondaryAuthors[0] } s.AccessFacet = AIAccessFacet s.FormatDe15 = []string{FormatDe15.LookupDefault(is.Format, "")} return nil }
// NewAttachByList reads one record per line from reader. Empty lines are ignored. func NewListFilter(r io.Reader) (ListFilter, error) { br := bufio.NewReader(r) f := ListFilter{Set: container.NewStringSet()} for { line, err := br.ReadString('\n') if err == io.EOF { break } if err != nil { return f, err } line = strings.TrimSpace(line) if line != "" { f.Set.Add(line) } } return f, nil }
// Languages returns the given and guessed languages found in abstract and // fulltext. Note: This is slow. Skip detection on too short strings. func (doc Document) Languages() []string { set := container.NewStringSet() vals := []string{doc.Title, doc.Text} for _, s := range vals { if len(s) < 20 { continue } lang, err := span.DetectLang3(s) if err != nil { continue } if !acceptedLanguages.Contains(lang) { continue } if lang == "und" { continue } set.Add(lang) } return set.Values() }
"time" "github.com/miku/span" "github.com/miku/span/container" "github.com/miku/span/finc" "golang.org/x/text/language" ) var ( errNoDOI = errors.New("DOI is missing") errNotImplemented = errors.New("not implemented") ) var ( // Restricts the possible languages for detection. acceptedLanguages = container.NewStringSet("deu", "eng", "fra", "ita", "spa") // Candidate patterns for parsing publishing dates. datePatterns = []string{ "2006", "2006-", "2006-1", "2006-01", "2006-1-2", "2006-1-02", "2006-01-2", "2006-01-02", "2006-Jan", "2006-January", "2006-Jan-2", "2006-Jan-02",
// Export method from intermediate schema to solr 4/13 schema. func (s *Solr5Vufind3) Convert(is finc.IntermediateSchema, withFullrecord bool) error { s.Allfields = is.Allfields() s.Formats = append(s.Formats, is.Format) s.Fullrecord = "blob:" + is.RecordID s.Fulltext = is.Fulltext s.ID = is.RecordID s.Imprint = is.Imprint() s.ISSN = is.ISSNList() s.MegaCollections = append(s.MegaCollections, is.MegaCollection) s.PublishDateSort = is.Date.Year() s.PublishDate = []string{is.Date.Format("2006-01-02")} s.Publishers = is.Publishers s.RecordType = finc.AIRecordType s.Series = append(s.Series, is.JournalTitle) s.SourceID = is.SourceID s.Subtitle = is.ArticleSubtitle s.TitleSort = is.SortableTitle() s.Topics = is.Subjects s.URL = is.URL classes := container.NewStringSet() for _, s := range is.Subjects { for _, class := range SubjectMapping.LookupDefault(s, []string{}) { classes.Add(class) } } s.FincClassFacet = classes.Values() sanitized := sanitize.HTML(is.ArticleTitle) s.Title, s.TitleFull, s.TitleShort = sanitized, sanitized, sanitized // is we do not have a title yet be rft.btitle is non-empty, use that if s.Title == "" && is.BookTitle != "" { sanitized := sanitize.HTML(is.BookTitle) s.Title, s.TitleFull, s.TitleShort = sanitized, sanitized, sanitized } for _, lang := range is.Languages { s.Languages = append(s.Languages, LanguageMap.LookupDefault(lang, lang)) } // collect sanizized authors var authors []string for _, author := range is.Authors { sanitized := AuthorReplacer.Replace(author.String()) if sanitized == "" { continue } authors = append(authors, sanitized) // first, random author goes into author field, others into secondary field, refs. 
#5778 if s.VF1Author == "" { s.VF1Author = sanitized } else { s.VF1SecondaryAuthors = append(s.VF1SecondaryAuthors, sanitized) } s.AuthorFacet = append(s.AuthorFacet, sanitized) } if s.VF1Author == "" { s.VF1Author = finc.NOT_ASSIGNED } if len(authors) == 0 { s.Authors = []string{finc.NOT_ASSIGNED} } else { s.Authors = authors } s.AccessFacet = AIAccessFacet // site specific formats s.FormatDe105 = []string{FormatDe105.LookupDefault(is.Format, "")} s.FormatDe14 = []string{FormatDe14.LookupDefault(is.Format, "")} s.FormatDe15 = []string{FormatDe15.LookupDefault(is.Format, "")} s.FormatDe520 = []string{FormatDe520.LookupDefault(is.Format, "")} s.FormatDe540 = []string{FormatDe540.LookupDefault(is.Format, "")} s.FormatDeCh1 = []string{FormatDeCh1.LookupDefault(is.Format, "")} s.FormatDed117 = []string{FormatDed117.LookupDefault(is.Format, "")} s.FormatDeGla1 = []string{FormatDeGla1.LookupDefault(is.Format, "")} s.FormatDel152 = []string{FormatDel152.LookupDefault(is.Format, "")} s.FormatDel189 = []string{FormatDel189.LookupDefault(is.Format, "")} s.FormatDeZi4 = []string{FormatDeZi4.LookupDefault(is.Format, "")} s.FormatDeZwi2 = []string{FormatDeZwi2.LookupDefault(is.Format, "")} s.FormatNrw = []string{FormatNrw.LookupDefault(is.Format, "")} s.ContainerVolume = is.Volume s.ContainerIssue = is.Issue s.ContainerStartPage = is.StartPage s.ContainerTitle = is.JournalTitle s.Institutions = is.Labels if withFullrecord { // refs. #8031 b, err := json.Marshal(is) if err != nil { return err } s.Fullrecord = string(b) } return nil }
RawDate string `xml:"Date"` Volume string `xml:"Volume"` Issue string `xml:"Issue"` RawAuthors []string `xml:"Authors>Author"` Language string `xml:"Language"` Abstract string `xml:"Abstract"` Descriptors string `xml:"Descriptors>Descriptor"` Text string `xml:"Text"` XGroup string `xml:"x-group"` XIssue string `xml:"x-issue"` } var ( rawDateReplacer = strings.NewReplacer(`"`, "", "\n", "", "\t", "") // acceptedLanguages restricts the possible languages for detection. acceptedLanguages = container.NewStringSet("deu", "eng") // dbmap maps a database name to one or more "package names" dbmap = assetutil.MustLoadStringSliceMap("assets/genios/dbmap.json") ) type Genios struct{} // Iterate emits Converter elements via XML decoding. func (s Genios) Iterate(r io.Reader) (<-chan []span.Importer, error) { return span.FromXML(r, "Document", func(d *xml.Decoder, se xml.StartElement) (span.Importer, error) { doc := new(Document) err := d.DecodeElement(&doc, &se) return doc, err }) }
// ReadEntries loads entries from a reader, accumulating licenses per
// identifier (print/online ISSN plus any ISSN found in the anchor field).
// Depending on the Skip* flags, malformed rows are either skipped with a
// log message or abort the read with the corresponding error.
func (r *Reader) ReadEntries() (holdings.Entries, error) {
	entries := make(holdings.Entries)
	for {
		cols, entry, err := r.Read()
		if err == io.EOF {
			break
		}
		// Classify known row-level errors; each may be tolerated via its
		// corresponding Skip* flag, otherwise it ends the read.
		switch err {
		case ErrMissingIdentifiers:
			if r.SkipMissingIdentifiers {
				log.Println("skipping line with missing identifiers")
				continue
			} else {
				return entries, err
			}
		case ErrIncompleteLine:
			if r.SkipIncompleteLines {
				log.Println("skipping incomplete line")
				continue
			} else {
				return entries, err
			}
		case ErrInvalidEmbargo:
			if r.SkipInvalidEmbargo {
				log.Println("skipping invalid embargo")
				continue
			} else {
				return entries, err
			}
		}
		pi := strings.TrimSpace(cols.PrintIdentifier)
		oi := strings.TrimSpace(cols.OnlineIdentifier)
		// Slight ISSN restoration (e.g. http://www.jstor.org/kbart/collections/as):
		// insert the dash into 8-character, dashless identifiers.
		if len(pi) == 8 {
			pi = fmt.Sprintf("%s-%s", pi[:4], pi[4:])
		}
		if len(oi) == 8 {
			oi = fmt.Sprintf("%s-%s", oi[:4], oi[4:])
		}
		// Collect all identifiers.
		identifiers := container.NewStringSet()
		if pi != "" {
			identifiers.Add(pi)
		}
		if oi != "" {
			identifiers.Add(oi)
		}
		// Extract ISSN from anchor field.
		for _, issn := range span.ISSNPattern.FindAllString(cols.Anchor, -1) {
			identifiers.Add(issn)
		}
		if identifiers.Size() == 0 {
			// No identifier at all: either fail, or (with the skip flag
			// set) fall through — the loop below is then a no-op.
			if !r.SkipMissingIdentifiers {
				return entries, ErrMissingIdentifiers
			}
		}
		// The same license is registered under every identifier found.
		for _, id := range identifiers.Values() {
			entries[id] = append(entries[id], holdings.License(entry))
		}
	}
	return entries, nil
}
// UnmarshalJSON turns a config fragment into a filter. func (f *ISSNFilter) UnmarshalJSON(p []byte) error { var s struct { ISSN struct { Values []string `json:"list"` File string `json:"file"` Link string `json:"url"` } `json:"issn"` } if err := json.Unmarshal(p, &s); err != nil { return err } f.values = *container.NewStringSet() if s.ISSN.Link != "" { f, err := ioutil.TempFile("", "span-") if err != nil { return err } log.Printf("ISSNFilter: fetching: %s", s.ISSN.Link) resp, err := http.Get(s.ISSN.Link) if err != nil { return err } defer resp.Body.Close() if _, err := io.Copy(f, resp.Body); err != nil { return err } if err := f.Close(); err != nil { return err } s.ISSN.File = f.Name() } if s.ISSN.File != "" { file, err := os.Open(s.ISSN.File) if err != nil { return err } defer file.Close() reader := bufio.NewReader(file) for { line, err := reader.ReadString('\n') if err == io.EOF { break } if err != nil { return err } line = strings.TrimSpace(line) if line == "" { continue } // valid ISSN can contain x, normalize to uppercase line = strings.ToUpper(line) // sniff ISSNs issns := container.NewStringSet() for _, s := range span.ISSNPattern.FindAllString(line, -1) { issns.Add(s) } if issns.Size() == 0 { log.Printf("warning: no ISSNs found on line: %s", line) } for _, issn := range issns.Values() { f.values.Add(issn) } } } for _, v := range s.ISSN.Values { f.values.Add(v) } return nil }
// ToIntermediateSchema converts a doaj document to intermediate schema. For
// now any record, that has no usable date will be skipped.
func (doc Document) ToIntermediateSchema() (*finc.IntermediateSchema, error) {
	var err error
	output := finc.NewIntermediateSchema()
	output.Date, err = doc.Date()
	if err != nil {
		// No usable date: signal a skip rather than a hard error.
		return output, span.Skip{Reason: err.Error()}
	}
	output.RawDate = output.Date.Format("2006-01-02")
	id := fmt.Sprintf("ai-%s-%s", SourceID, doc.ID)
	if len(id) > span.KeyLengthLimit {
		// Downstream systems impose a key length limit; skip such records.
		return output, span.Skip{Reason: fmt.Sprintf("id too long: %s", id)}
	}
	output.RecordID = id
	output.Genre = Genre
	output.DOI = doc.DOI()
	output.Format = Format
	output.MegaCollection = Collection
	output.SourceID = SourceID
	output.ISSN = doc.Index.ISSN
	output.ArticleTitle = doc.BibJson.Title
	output.JournalTitle = doc.BibJson.Journal.Title
	output.Volume = doc.BibJson.Journal.Volume
	output.Publishers = append(output.Publishers, doc.BibJson.Journal.Publisher)
	for _, link := range doc.BibJson.Link {
		output.URL = append(output.URL, link.URL)
	}
	output.StartPage = doc.BibJson.StartPage
	output.EndPage = doc.BibJson.EndPage
	// Derive page count and range only when both pages parse as integers.
	if sp, err := strconv.Atoi(doc.BibJson.StartPage); err == nil {
		if ep, err := strconv.Atoi(doc.BibJson.EndPage); err == nil {
			// NOTE(review): ep-sp excludes the first page (e.g. 10..12
			// yields 2, not 3) — confirm this is the intended convention.
			output.PageCount = fmt.Sprintf("%d", ep-sp)
			output.Pages = fmt.Sprintf("%d-%d", sp, ep)
		}
	}
	// Map LCC schema codes to finc subjects; unknown codes are dropped.
	subjects := container.NewStringSet()
	for _, s := range doc.Index.SchemaCode {
		class := LCCPatterns.LookupDefault(strings.Replace(s, "LCC:", "", -1), finc.NOT_ASSIGNED)
		if class != finc.NOT_ASSIGNED {
			subjects.Add(class)
		}
	}
	if subjects.Size() == 0 {
		output.Subjects = []string{finc.NOT_ASSIGNED}
	} else {
		output.Subjects = subjects.SortedValues()
	}
	// Languages are mapped through LanguageMap; unknown ones become "und".
	languages := container.NewStringSet()
	for _, l := range doc.Index.Language {
		languages.Add(LanguageMap.LookupDefault(l, "und"))
	}
	output.Languages = languages.Values()
	for _, author := range doc.BibJson.Author {
		output.Authors = append(output.Authors, finc.Author{Name: author.Name})
	}
	return output, nil
}
func main() { filename := flag.String("file", "", "path to holdings file") format := flag.String("format", "kbart", "holding file format, kbart, google, ovid") permissiveMode := flag.Bool("permissive", false, "if we cannot check, we allow") ignoreUnmarshalErrors := flag.Bool("ignore-unmarshal-errors", false, "keep using what could be unmarshalled") version := flag.Bool("version", false, "show version") flag.Parse() if *version { fmt.Println(istools.Version) os.Exit(0) } if *filename == "" { log.Fatal("holding -file required") } var r *bufio.Reader if flag.NArg() == 0 { r = bufio.NewReader(os.Stdin) } else { file, err := os.Open(flag.Arg(0)) if err != nil { log.Fatal(err) } r = bufio.NewReader(file) } hfile, err := os.Open(*filename) if err != nil { log.Fatal(err) } var hr holdings.File switch *format { case "kbart": hr = kbart.NewReader(hfile) case "ovid": hr = ovid.NewReader(hfile) case "google": hr = google.NewReader(hfile) default: log.Fatalf("invalid holding file format: %s", *format) } entries, err := hr.ReadAll() if err != nil { switch err.(type) { case holdings.ParseError: if *ignoreUnmarshalErrors { log.Println(err) } else { log.Fatal(err) } default: log.Fatal(err) } } for { b, err := r.ReadBytes('\n') if err == io.EOF { break } if err != nil { log.Fatal(err) } var is finc.IntermediateSchema if err := json.Unmarshal(b, &is); err != nil { log.Fatal(err) } signature := holdings.Signature{ Date: is.Date.Format("2006-01-02"), Volume: is.Volume, Issue: is.Issue, } // validate record, if at least one license allows this item var valid bool var messages = container.NewStringSet() LOOP: for _, issn := range append(is.ISSN, is.EISSN...) 
{ licenses := entries.Licenses(issn) if len(licenses) == 0 { messages.Add(fmt.Sprintf("ISSN not in holdings")) } if len(licenses) == 0 && *permissiveMode { messages.Add("PERMISSIVE_OK") valid = true break LOOP } for _, license := range licenses { if err := license.Covers(signature); err != nil { messages.Add(err.Error()) } else { if err := license.TimeRestricted(is.Date); err != nil { messages.Add(err.Error()) } else { messages.Add("OK") valid = true break LOOP } } } } if len(is.ISSN) == 0 && len(is.EISSN) == 0 { messages.Add("Record has no ISSN") } if len(is.ISSN) == 0 && len(is.EISSN) == 0 && *permissiveMode { messages.Add("PERMISSIVE_OK") valid = true } values := messages.Values() sort.Strings(values) fmt.Printf("%s\t%v\t%v\n", is.RecordID, valid, strings.Join(values, ", ")) } }
// Export method from intermediate schema to solr 4/13 schema. func (s *Solr4Vufind13v3) Convert(is finc.IntermediateSchema) error { s.Allfields = is.Allfields() s.Formats = append(s.Formats, is.Format) s.Fullrecord = "blob:" + is.RecordID s.Fulltext = is.Fulltext s.ID = is.RecordID s.Imprint = is.Imprint() s.ISSN = is.ISSNList() s.MegaCollections = append(s.MegaCollections, is.MegaCollection) s.PublishDateSort = is.Date.Year() s.Publishers = is.Publishers s.RecordType = finc.AIRecordType s.Series = append(s.Series, is.JournalTitle) s.SourceID = is.SourceID s.Subtitle = is.ArticleSubtitle s.TitleSort = is.SortableTitle() s.Topics = is.Subjects s.URL = is.URL classes := container.NewStringSet() for _, s := range is.Subjects { for _, class := range SubjectMapping.LookupDefault(s, []string{}) { classes.Add(class) } } s.FincClassFacet = classes.Values() sanitized := sanitize.HTML(is.ArticleTitle) s.Title, s.TitleFull, s.TitleShort = sanitized, sanitized, sanitized for _, lang := range is.Languages { s.Languages = append(s.Languages, LanguageMap.LookupDefault(lang, lang)) } for _, author := range is.Authors { s.SecondaryAuthors = append(s.SecondaryAuthors, author.String()) s.AuthorFacet = append(s.AuthorFacet, author.String()) } if len(s.SecondaryAuthors) > 0 { s.Author = s.SecondaryAuthors[0] } s.AccessFacet = AIAccessFacet // site specific formats s.FormatDe105 = []string{FormatDe105.LookupDefault(is.Format, "")} s.FormatDe14 = []string{FormatDe14.LookupDefault(is.Format, "")} s.FormatDe15 = []string{FormatDe15.LookupDefault(is.Format, "")} s.FormatDe520 = []string{FormatDe520.LookupDefault(is.Format, "")} s.FormatDe540 = []string{FormatDe540.LookupDefault(is.Format, "")} s.FormatDeCh1 = []string{FormatDeCh1.LookupDefault(is.Format, "")} s.FormatDed117 = []string{FormatDed117.LookupDefault(is.Format, "")} s.FormatDeGla1 = []string{FormatDeGla1.LookupDefault(is.Format, "")} s.FormatDel152 = []string{FormatDel152.LookupDefault(is.Format, "")} s.FormatDel189 = 
[]string{FormatDel189.LookupDefault(is.Format, "")} s.FormatDeZi4 = []string{FormatDeZi4.LookupDefault(is.Format, "")} s.FormatDeZwi2 = []string{FormatDeZwi2.LookupDefault(is.Format, "")} s.ContainerVolume = is.Volume s.ContainerIssue = is.Issue s.ContainerStartPage = is.StartPage s.ContainerTitle = is.JournalTitle return nil }