예제 #1
0
파일: main.go 프로젝트: ubleipzig/span
func main() {
	showVersion := flag.Bool("v", false, "prints current program version")
	size := flag.Int("b", 20000, "batch size")
	numWorkers := flag.Int("w", runtime.NumCPU(), "number of workers")

	flag.Parse()

	if *showVersion {
		fmt.Println(span.AppVersion)
		os.Exit(0)
	}

	var readers []io.Reader

	if flag.NArg() == 0 {
		readers = append(readers, os.Stdin)
	} else {
		for _, filename := range flag.Args() {
			file, err := os.Open(filename)
			if err != nil {
				log.Fatal(err)
			}
			defer file.Close()
			readers = append(readers, file)
		}
	}

	for _, r := range readers {
		p := bytebatch.NewLineProcessor(r, os.Stdout, func(b []byte) ([]byte, error) {
			is := finc.IntermediateSchema{}

			if err := json.Unmarshal(b, &is); err != nil {
				log.Printf("failed to unmarshal: %s", string(b))
				return b, err
			}

			// Redact full text.
			is.Fulltext = ""

			bb, err := json.Marshal(is)
			if err != nil {
				return bb, err
			}
			bb = append(bb, '\n')
			return bb, nil
		})

		p.NumWorkers = *numWorkers
		p.BatchSize = *size

		if err := p.Run(); err != nil {
			log.Fatal(err)
		}
	}
}
예제 #2
0
파일: filter.go 프로젝트: ubleipzig/span
// Tag takes an intermediate schema and returns a labeled version of that
// schema.
func (t *Tagger) Tag(is finc.IntermediateSchema) finc.IntermediateSchema {
	var tags []string
	for tag, filter := range t.filtermap {
		if filter.Apply(is) {
			tags = append(tags, tag)
		}
	}
	is.Labels = tags
	return is
}
예제 #3
0
파일: solr4vu13v1.go 프로젝트: voxadam/span
// Export method from intermediate schema to solr 4/13 schema.
func (s *Solr4Vufind13v1) Convert(is finc.IntermediateSchema) error {
	s.Allfields = is.Allfields()
	s.Formats = append(s.Formats, is.Format)
	s.Fullrecord = "blob:" + is.RecordID
	s.Fulltext = is.Fulltext
	s.HierarchyParentTitle = append(s.HierarchyParentTitle, is.JournalTitle)
	s.ID = is.RecordID
	s.Imprint = is.Imprint()
	s.ISSN = is.ISSNList()
	s.MegaCollections = append(s.MegaCollections, is.MegaCollection)
	s.PublishDateSort = is.Date.Year()
	s.Publishers = is.Publishers
	s.RecordType = finc.AIRecordType
	s.Series = append(s.Series, is.JournalTitle)
	s.SourceID = is.SourceID
	s.Subtitle = is.ArticleSubtitle
	s.TitleSort = is.SortableTitle()
	s.Topics = is.Subjects
	s.URL = is.URL

	classes := container.NewStringSet()
	for _, s := range is.Subjects {
		for _, class := range SubjectMapping.LookupDefault(s, []string{}) {
			classes.Add(class)
		}
	}
	s.FincClassFacet = classes.Values()

	sanitized := sanitize.HTML(is.ArticleTitle)
	s.Title, s.TitleFull, s.TitleShort = sanitized, sanitized, sanitized

	for _, lang := range is.Languages {
		s.Languages = append(s.Languages, LanguageMap.LookupDefault(lang, lang))
	}

	for _, author := range is.Authors {
		s.SecondaryAuthors = append(s.SecondaryAuthors, author.String())
		s.AuthorFacet = append(s.AuthorFacet, author.String())
	}

	if len(s.SecondaryAuthors) > 0 {
		s.Author = s.SecondaryAuthors[0]
	}

	s.AccessFacet = AIAccessFacet
	s.FormatDe15 = []string{FormatDe15.LookupDefault(is.Format, "")}

	return nil
}
예제 #4
0
파일: solr5vu3.go 프로젝트: ubleipzig/span
// Export method from intermediate schema to solr 4/13 schema.
func (s *Solr5Vufind3) Convert(is finc.IntermediateSchema, withFullrecord bool) error {
	s.Allfields = is.Allfields()
	s.Formats = append(s.Formats, is.Format)
	s.Fullrecord = "blob:" + is.RecordID
	s.Fulltext = is.Fulltext
	s.ID = is.RecordID
	s.Imprint = is.Imprint()
	s.ISSN = is.ISSNList()
	s.MegaCollections = append(s.MegaCollections, is.MegaCollection)
	s.PublishDateSort = is.Date.Year()
	s.PublishDate = []string{is.Date.Format("2006-01-02")}
	s.Publishers = is.Publishers
	s.RecordType = finc.AIRecordType
	s.Series = append(s.Series, is.JournalTitle)
	s.SourceID = is.SourceID
	s.Subtitle = is.ArticleSubtitle
	s.TitleSort = is.SortableTitle()
	s.Topics = is.Subjects
	s.URL = is.URL

	classes := container.NewStringSet()
	for _, s := range is.Subjects {
		for _, class := range SubjectMapping.LookupDefault(s, []string{}) {
			classes.Add(class)
		}
	}
	s.FincClassFacet = classes.Values()

	sanitized := sanitize.HTML(is.ArticleTitle)
	s.Title, s.TitleFull, s.TitleShort = sanitized, sanitized, sanitized

	// is we do not have a title yet be rft.btitle is non-empty, use that
	if s.Title == "" && is.BookTitle != "" {
		sanitized := sanitize.HTML(is.BookTitle)
		s.Title, s.TitleFull, s.TitleShort = sanitized, sanitized, sanitized
	}

	for _, lang := range is.Languages {
		s.Languages = append(s.Languages, LanguageMap.LookupDefault(lang, lang))
	}

	// collect sanizized authors
	var authors []string
	for _, author := range is.Authors {
		sanitized := AuthorReplacer.Replace(author.String())
		if sanitized == "" {
			continue
		}
		authors = append(authors, sanitized)

		// first, random author goes into author field, others into secondary field, refs. #5778
		if s.VF1Author == "" {
			s.VF1Author = sanitized
		} else {
			s.VF1SecondaryAuthors = append(s.VF1SecondaryAuthors, sanitized)
		}
		s.AuthorFacet = append(s.AuthorFacet, sanitized)
	}

	if s.VF1Author == "" {
		s.VF1Author = finc.NOT_ASSIGNED
	}

	if len(authors) == 0 {
		s.Authors = []string{finc.NOT_ASSIGNED}
	} else {
		s.Authors = authors
	}

	s.AccessFacet = AIAccessFacet

	// site specific formats
	s.FormatDe105 = []string{FormatDe105.LookupDefault(is.Format, "")}
	s.FormatDe14 = []string{FormatDe14.LookupDefault(is.Format, "")}
	s.FormatDe15 = []string{FormatDe15.LookupDefault(is.Format, "")}
	s.FormatDe520 = []string{FormatDe520.LookupDefault(is.Format, "")}
	s.FormatDe540 = []string{FormatDe540.LookupDefault(is.Format, "")}
	s.FormatDeCh1 = []string{FormatDeCh1.LookupDefault(is.Format, "")}
	s.FormatDed117 = []string{FormatDed117.LookupDefault(is.Format, "")}
	s.FormatDeGla1 = []string{FormatDeGla1.LookupDefault(is.Format, "")}
	s.FormatDel152 = []string{FormatDel152.LookupDefault(is.Format, "")}
	s.FormatDel189 = []string{FormatDel189.LookupDefault(is.Format, "")}
	s.FormatDeZi4 = []string{FormatDeZi4.LookupDefault(is.Format, "")}
	s.FormatDeZwi2 = []string{FormatDeZwi2.LookupDefault(is.Format, "")}
	s.FormatNrw = []string{FormatNrw.LookupDefault(is.Format, "")}

	s.ContainerVolume = is.Volume
	s.ContainerIssue = is.Issue
	s.ContainerStartPage = is.StartPage
	s.ContainerTitle = is.JournalTitle

	s.Institutions = is.Labels

	if withFullrecord {
		// refs. #8031
		b, err := json.Marshal(is)
		if err != nil {
			return err
		}
		s.Fullrecord = string(b)
	}
	return nil
}
예제 #5
0
파일: main.go 프로젝트: miku/istools
func main() {
	filename := flag.String("file", "", "path to holdings file")
	format := flag.String("format", "kbart", "holding file format, kbart, google, ovid")
	permissiveMode := flag.Bool("permissive", false, "if we cannot check, we allow")
	ignoreUnmarshalErrors := flag.Bool("ignore-unmarshal-errors", false, "keep using what could be unmarshalled")
	version := flag.Bool("version", false, "show version")
	label := flag.String("label", "X", "label to add")

	var tags istools.TagSlice
	flag.Var(&tags, "x", "ISIL:/path/to/kbart.txt")

	flag.Parse()

	if *version {
		fmt.Println(istools.Version)
		os.Exit(0)
	}

	if *filename == "" {
		log.Fatal("holding -file required")
	}

	var r *bufio.Reader
	if flag.NArg() == 0 {
		r = bufio.NewReader(os.Stdin)
	} else {
		file, err := os.Open(flag.Arg(0))
		if err != nil {
			log.Fatal(err)
		}
		r = bufio.NewReader(file)
	}

	hfile, err := os.Open(*filename)
	if err != nil {
		log.Fatal(err)
	}

	var hr holdings.File

	switch *format {
	case "kbart":
		hr = kbart.NewReader(hfile)
	case "ovid":
		hr = ovid.NewReader(hfile)
	case "google":
		hr = google.NewReader(hfile)
	default:
		log.Fatalf("invalid holding file format: %s", *format)
	}

	entries, err := hr.ReadAll()
	if err != nil {
		switch err.(type) {
		case holdings.ParseError:
			if *ignoreUnmarshalErrors {
				log.Println(err)
			} else {
				log.Fatal(err)
			}
		default:
			log.Fatal(err)
		}
	}

	for {
		b, err := r.ReadBytes('\n')
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		var is = new(finc.IntermediateSchema)
		if err := json.Unmarshal(b, is); err != nil {
			log.Fatal(err)
		}
		signature := holdings.Signature{
			Date:   is.Date.Format("2006-01-02"),
			Volume: is.Volume,
			Issue:  is.Issue,
		}

		// validate record, if at least one license allows this item
		var valid bool
		var messages = container.NewStringSet()

	LOOP:
		for _, issn := range append(is.ISSN, is.EISSN...) {
			licenses := entries.Licenses(issn)

			if len(licenses) == 0 {
				messages.Add(fmt.Sprintf("ISSN not in holdings"))
			}

			if len(licenses) == 0 && *permissiveMode {
				messages.Add("PERMISSIVE_OK")
				valid = true
				break LOOP
			}

			for _, license := range licenses {
				if err := license.Covers(signature); err != nil {
					messages.Add(err.Error())
				} else {
					if err := license.TimeRestricted(is.Date); err != nil {
						messages.Add(err.Error())
					} else {
						messages.Add("OK")
						valid = true
						break LOOP
					}
				}
			}
		}

		if len(is.ISSN) == 0 && len(is.EISSN) == 0 {
			messages.Add("Record has no ISSN")
		}

		if len(is.ISSN) == 0 && len(is.EISSN) == 0 && *permissiveMode {
			messages.Add("PERMISSIVE_OK")
			valid = true
		}

		if valid {
			is.Labels = []string{*label}
		}

		bs, err := json.Marshal(is)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println(string(bs))

		// values := messages.Values()
		// sort.Strings(values)
		// fmt.Printf("%s\t%v\t%v\n", is.RecordID, valid, strings.Join(values, ", "))
	}
}
예제 #6
0
파일: solr4vu13v3.go 프로젝트: voxadam/span
// Export method from intermediate schema to solr 4/13 schema.
func (s *Solr4Vufind13v3) Convert(is finc.IntermediateSchema) error {
	s.Allfields = is.Allfields()
	s.Formats = append(s.Formats, is.Format)
	s.Fullrecord = "blob:" + is.RecordID
	s.Fulltext = is.Fulltext
	s.ID = is.RecordID
	s.Imprint = is.Imprint()
	s.ISSN = is.ISSNList()
	s.MegaCollections = append(s.MegaCollections, is.MegaCollection)
	s.PublishDateSort = is.Date.Year()
	s.Publishers = is.Publishers
	s.RecordType = finc.AIRecordType
	s.Series = append(s.Series, is.JournalTitle)
	s.SourceID = is.SourceID
	s.Subtitle = is.ArticleSubtitle
	s.TitleSort = is.SortableTitle()
	s.Topics = is.Subjects
	s.URL = is.URL

	classes := container.NewStringSet()
	for _, s := range is.Subjects {
		for _, class := range SubjectMapping.LookupDefault(s, []string{}) {
			classes.Add(class)
		}
	}
	s.FincClassFacet = classes.Values()

	sanitized := sanitize.HTML(is.ArticleTitle)
	s.Title, s.TitleFull, s.TitleShort = sanitized, sanitized, sanitized

	for _, lang := range is.Languages {
		s.Languages = append(s.Languages, LanguageMap.LookupDefault(lang, lang))
	}

	for _, author := range is.Authors {
		s.SecondaryAuthors = append(s.SecondaryAuthors, author.String())
		s.AuthorFacet = append(s.AuthorFacet, author.String())
	}

	if len(s.SecondaryAuthors) > 0 {
		s.Author = s.SecondaryAuthors[0]
	}

	s.AccessFacet = AIAccessFacet

	// site specific formats
	s.FormatDe105 = []string{FormatDe105.LookupDefault(is.Format, "")}
	s.FormatDe14 = []string{FormatDe14.LookupDefault(is.Format, "")}
	s.FormatDe15 = []string{FormatDe15.LookupDefault(is.Format, "")}
	s.FormatDe520 = []string{FormatDe520.LookupDefault(is.Format, "")}
	s.FormatDe540 = []string{FormatDe540.LookupDefault(is.Format, "")}
	s.FormatDeCh1 = []string{FormatDeCh1.LookupDefault(is.Format, "")}
	s.FormatDed117 = []string{FormatDed117.LookupDefault(is.Format, "")}
	s.FormatDeGla1 = []string{FormatDeGla1.LookupDefault(is.Format, "")}
	s.FormatDel152 = []string{FormatDel152.LookupDefault(is.Format, "")}
	s.FormatDel189 = []string{FormatDel189.LookupDefault(is.Format, "")}
	s.FormatDeZi4 = []string{FormatDeZi4.LookupDefault(is.Format, "")}
	s.FormatDeZwi2 = []string{FormatDeZwi2.LookupDefault(is.Format, "")}

	s.ContainerVolume = is.Volume
	s.ContainerIssue = is.Issue
	s.ContainerStartPage = is.StartPage
	s.ContainerTitle = is.JournalTitle

	return nil
}