func main() { showVersion := flag.Bool("v", false, "prints current program version") size := flag.Int("b", 20000, "batch size") numWorkers := flag.Int("w", runtime.NumCPU(), "number of workers") flag.Parse() if *showVersion { fmt.Println(span.AppVersion) os.Exit(0) } var readers []io.Reader if flag.NArg() == 0 { readers = append(readers, os.Stdin) } else { for _, filename := range flag.Args() { file, err := os.Open(filename) if err != nil { log.Fatal(err) } defer file.Close() readers = append(readers, file) } } for _, r := range readers { p := bytebatch.NewLineProcessor(r, os.Stdout, func(b []byte) ([]byte, error) { is := finc.IntermediateSchema{} if err := json.Unmarshal(b, &is); err != nil { log.Printf("failed to unmarshal: %s", string(b)) return b, err } // Redact full text. is.Fulltext = "" bb, err := json.Marshal(is) if err != nil { return bb, err } bb = append(bb, '\n') return bb, nil }) p.NumWorkers = *numWorkers p.BatchSize = *size if err := p.Run(); err != nil { log.Fatal(err) } } }
// Tag takes an intermediate schema and returns a labeled version of that // schema. func (t *Tagger) Tag(is finc.IntermediateSchema) finc.IntermediateSchema { var tags []string for tag, filter := range t.filtermap { if filter.Apply(is) { tags = append(tags, tag) } } is.Labels = tags return is }
// Export method from intermediate schema to solr 4/13 schema. func (s *Solr4Vufind13v1) Convert(is finc.IntermediateSchema) error { s.Allfields = is.Allfields() s.Formats = append(s.Formats, is.Format) s.Fullrecord = "blob:" + is.RecordID s.Fulltext = is.Fulltext s.HierarchyParentTitle = append(s.HierarchyParentTitle, is.JournalTitle) s.ID = is.RecordID s.Imprint = is.Imprint() s.ISSN = is.ISSNList() s.MegaCollections = append(s.MegaCollections, is.MegaCollection) s.PublishDateSort = is.Date.Year() s.Publishers = is.Publishers s.RecordType = finc.AIRecordType s.Series = append(s.Series, is.JournalTitle) s.SourceID = is.SourceID s.Subtitle = is.ArticleSubtitle s.TitleSort = is.SortableTitle() s.Topics = is.Subjects s.URL = is.URL classes := container.NewStringSet() for _, s := range is.Subjects { for _, class := range SubjectMapping.LookupDefault(s, []string{}) { classes.Add(class) } } s.FincClassFacet = classes.Values() sanitized := sanitize.HTML(is.ArticleTitle) s.Title, s.TitleFull, s.TitleShort = sanitized, sanitized, sanitized for _, lang := range is.Languages { s.Languages = append(s.Languages, LanguageMap.LookupDefault(lang, lang)) } for _, author := range is.Authors { s.SecondaryAuthors = append(s.SecondaryAuthors, author.String()) s.AuthorFacet = append(s.AuthorFacet, author.String()) } if len(s.SecondaryAuthors) > 0 { s.Author = s.SecondaryAuthors[0] } s.AccessFacet = AIAccessFacet s.FormatDe15 = []string{FormatDe15.LookupDefault(is.Format, "")} return nil }
// Export method from intermediate schema to solr 4/13 schema. func (s *Solr5Vufind3) Convert(is finc.IntermediateSchema, withFullrecord bool) error { s.Allfields = is.Allfields() s.Formats = append(s.Formats, is.Format) s.Fullrecord = "blob:" + is.RecordID s.Fulltext = is.Fulltext s.ID = is.RecordID s.Imprint = is.Imprint() s.ISSN = is.ISSNList() s.MegaCollections = append(s.MegaCollections, is.MegaCollection) s.PublishDateSort = is.Date.Year() s.PublishDate = []string{is.Date.Format("2006-01-02")} s.Publishers = is.Publishers s.RecordType = finc.AIRecordType s.Series = append(s.Series, is.JournalTitle) s.SourceID = is.SourceID s.Subtitle = is.ArticleSubtitle s.TitleSort = is.SortableTitle() s.Topics = is.Subjects s.URL = is.URL classes := container.NewStringSet() for _, s := range is.Subjects { for _, class := range SubjectMapping.LookupDefault(s, []string{}) { classes.Add(class) } } s.FincClassFacet = classes.Values() sanitized := sanitize.HTML(is.ArticleTitle) s.Title, s.TitleFull, s.TitleShort = sanitized, sanitized, sanitized // is we do not have a title yet be rft.btitle is non-empty, use that if s.Title == "" && is.BookTitle != "" { sanitized := sanitize.HTML(is.BookTitle) s.Title, s.TitleFull, s.TitleShort = sanitized, sanitized, sanitized } for _, lang := range is.Languages { s.Languages = append(s.Languages, LanguageMap.LookupDefault(lang, lang)) } // collect sanizized authors var authors []string for _, author := range is.Authors { sanitized := AuthorReplacer.Replace(author.String()) if sanitized == "" { continue } authors = append(authors, sanitized) // first, random author goes into author field, others into secondary field, refs. #5778 if s.VF1Author == "" { s.VF1Author = sanitized } else { s.VF1SecondaryAuthors = append(s.VF1SecondaryAuthors, sanitized) } s.AuthorFacet = append(s.AuthorFacet, sanitized) } if s.VF1Author == "" { s.VF1Author = finc.NOT_ASSIGNED } if len(authors) == 0 { s.Authors = []string{finc.NOT_ASSIGNED} } else { s.Authors = authors } s.AccessFacet = AIAccessFacet // site specific formats s.FormatDe105 = []string{FormatDe105.LookupDefault(is.Format, "")} s.FormatDe14 = []string{FormatDe14.LookupDefault(is.Format, "")} s.FormatDe15 = []string{FormatDe15.LookupDefault(is.Format, "")} s.FormatDe520 = []string{FormatDe520.LookupDefault(is.Format, "")} s.FormatDe540 = []string{FormatDe540.LookupDefault(is.Format, "")} s.FormatDeCh1 = []string{FormatDeCh1.LookupDefault(is.Format, "")} s.FormatDed117 = []string{FormatDed117.LookupDefault(is.Format, "")} s.FormatDeGla1 = []string{FormatDeGla1.LookupDefault(is.Format, "")} s.FormatDel152 = []string{FormatDel152.LookupDefault(is.Format, "")} s.FormatDel189 = []string{FormatDel189.LookupDefault(is.Format, "")} s.FormatDeZi4 = []string{FormatDeZi4.LookupDefault(is.Format, "")} s.FormatDeZwi2 = []string{FormatDeZwi2.LookupDefault(is.Format, "")} s.FormatNrw = []string{FormatNrw.LookupDefault(is.Format, "")} s.ContainerVolume = is.Volume s.ContainerIssue = is.Issue s.ContainerStartPage = is.StartPage s.ContainerTitle = is.JournalTitle s.Institutions = is.Labels if withFullrecord { // refs. #8031 b, err := json.Marshal(is) if err != nil { return err } s.Fullrecord = string(b) } return nil }
func main() { filename := flag.String("file", "", "path to holdings file") format := flag.String("format", "kbart", "holding file format, kbart, google, ovid") permissiveMode := flag.Bool("permissive", false, "if we cannot check, we allow") ignoreUnmarshalErrors := flag.Bool("ignore-unmarshal-errors", false, "keep using what could be unmarshalled") version := flag.Bool("version", false, "show version") label := flag.String("label", "X", "label to add") var tags istools.TagSlice flag.Var(&tags, "x", "ISIL:/path/to/kbart.txt") flag.Parse() if *version { fmt.Println(istools.Version) os.Exit(0) } if *filename == "" { log.Fatal("holding -file required") } var r *bufio.Reader if flag.NArg() == 0 { r = bufio.NewReader(os.Stdin) } else { file, err := os.Open(flag.Arg(0)) if err != nil { log.Fatal(err) } r = bufio.NewReader(file) } hfile, err := os.Open(*filename) if err != nil { log.Fatal(err) } var hr holdings.File switch *format { case "kbart": hr = kbart.NewReader(hfile) case "ovid": hr = ovid.NewReader(hfile) case "google": hr = google.NewReader(hfile) default: log.Fatalf("invalid holding file format: %s", *format) } entries, err := hr.ReadAll() if err != nil { switch err.(type) { case holdings.ParseError: if *ignoreUnmarshalErrors { log.Println(err) } else { log.Fatal(err) } default: log.Fatal(err) } } for { b, err := r.ReadBytes('\n') if err == io.EOF { break } if err != nil { log.Fatal(err) } var is = new(finc.IntermediateSchema) if err := json.Unmarshal(b, is); err != nil { log.Fatal(err) } signature := holdings.Signature{ Date: is.Date.Format("2006-01-02"), Volume: is.Volume, Issue: is.Issue, } // validate record, if at least one license allows this item var valid bool var messages = container.NewStringSet() LOOP: for _, issn := range append(is.ISSN, is.EISSN...) { licenses := entries.Licenses(issn) if len(licenses) == 0 { messages.Add(fmt.Sprintf("ISSN not in holdings")) } if len(licenses) == 0 && *permissiveMode { messages.Add("PERMISSIVE_OK") valid = true break LOOP } for _, license := range licenses { if err := license.Covers(signature); err != nil { messages.Add(err.Error()) } else { if err := license.TimeRestricted(is.Date); err != nil { messages.Add(err.Error()) } else { messages.Add("OK") valid = true break LOOP } } } } if len(is.ISSN) == 0 && len(is.EISSN) == 0 { messages.Add("Record has no ISSN") } if len(is.ISSN) == 0 && len(is.EISSN) == 0 && *permissiveMode { messages.Add("PERMISSIVE_OK") valid = true } if valid { is.Labels = []string{*label} } bs, err := json.Marshal(is) if err != nil { log.Fatal(err) } fmt.Println(string(bs)) // values := messages.Values() // sort.Strings(values) // fmt.Printf("%s\t%v\t%v\n", is.RecordID, valid, strings.Join(values, ", ")) } }
// Export method from intermediate schema to solr 4/13 schema. func (s *Solr4Vufind13v3) Convert(is finc.IntermediateSchema) error { s.Allfields = is.Allfields() s.Formats = append(s.Formats, is.Format) s.Fullrecord = "blob:" + is.RecordID s.Fulltext = is.Fulltext s.ID = is.RecordID s.Imprint = is.Imprint() s.ISSN = is.ISSNList() s.MegaCollections = append(s.MegaCollections, is.MegaCollection) s.PublishDateSort = is.Date.Year() s.Publishers = is.Publishers s.RecordType = finc.AIRecordType s.Series = append(s.Series, is.JournalTitle) s.SourceID = is.SourceID s.Subtitle = is.ArticleSubtitle s.TitleSort = is.SortableTitle() s.Topics = is.Subjects s.URL = is.URL classes := container.NewStringSet() for _, s := range is.Subjects { for _, class := range SubjectMapping.LookupDefault(s, []string{}) { classes.Add(class) } } s.FincClassFacet = classes.Values() sanitized := sanitize.HTML(is.ArticleTitle) s.Title, s.TitleFull, s.TitleShort = sanitized, sanitized, sanitized for _, lang := range is.Languages { s.Languages = append(s.Languages, LanguageMap.LookupDefault(lang, lang)) } for _, author := range is.Authors { s.SecondaryAuthors = append(s.SecondaryAuthors, author.String()) s.AuthorFacet = append(s.AuthorFacet, author.String()) } if len(s.SecondaryAuthors) > 0 { s.Author = s.SecondaryAuthors[0] } s.AccessFacet = AIAccessFacet // site specific formats s.FormatDe105 = []string{FormatDe105.LookupDefault(is.Format, "")} s.FormatDe14 = []string{FormatDe14.LookupDefault(is.Format, "")} s.FormatDe15 = []string{FormatDe15.LookupDefault(is.Format, "")} s.FormatDe520 = []string{FormatDe520.LookupDefault(is.Format, "")} s.FormatDe540 = []string{FormatDe540.LookupDefault(is.Format, "")} s.FormatDeCh1 = []string{FormatDeCh1.LookupDefault(is.Format, "")} s.FormatDed117 = []string{FormatDed117.LookupDefault(is.Format, "")} s.FormatDeGla1 = []string{FormatDeGla1.LookupDefault(is.Format, "")} s.FormatDel152 = []string{FormatDel152.LookupDefault(is.Format, "")} s.FormatDel189 = []string{FormatDel189.LookupDefault(is.Format, "")} s.FormatDeZi4 = []string{FormatDeZi4.LookupDefault(is.Format, "")} s.FormatDeZwi2 = []string{FormatDeZwi2.LookupDefault(is.Format, "")} s.ContainerVolume = is.Volume s.ContainerIssue = is.Issue s.ContainerStartPage = is.StartPage s.ContainerTitle = is.JournalTitle return nil }