Пример #1
0
// main parses the command line, assembles an ISIL tagger from the given
// filter files and flags, then reads newline-delimited input from the named
// files (or from stdin when no file is given) and hands it to a pool of
// workers in batches of at most -b lines. Worker output is passed to a sink
// goroutine attached to os.Stdout, gzip-compressed when -z is set.
func main() {

	// anyISILs is named to avoid shadowing the predeclared identifier "any".
	var hfiles, lfiles, cfiles, anyISILs, source container.StringSlice
	flag.Var(&hfiles, "f", "ISIL:/path/to/ovid.xml")
	flag.Var(&lfiles, "l", "ISIL:/path/to/list.txt")
	flag.Var(&cfiles, "c", "ISIL:/path/to/collections.txt")
	flag.Var(&anyISILs, "any", "ISIL")
	flag.Var(&source, "source", "ISIL:SID")

	skip := flag.Bool("skip", false, "skip errors")
	showVersion := flag.Bool("v", false, "prints current program version")
	dumpFilters := flag.Bool("dump", false, "dump filters and exit")
	size := flag.Int("b", 20000, "batch size")
	numWorkers := flag.Int("w", runtime.NumCPU(), "number of workers")
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
	format := flag.String("o", "solr4vu13v4", "output format")
	listFormats := flag.Bool("list", false, "list output formats")
	gzipOutput := flag.Bool("z", false, "gzip output")
	doiBlacklist := flag.String("doi-blacklist", "", "a list of DOIs to skip")

	flag.Parse()

	runtime.GOMAXPROCS(*numWorkers)

	if *showVersion {
		fmt.Println(span.AppVersion)
		os.Exit(0)
	}

	if *listFormats {
		for k := range Exporters {
			fmt.Println(k)
		}
		os.Exit(0)
	}

	// A non-positive batch size can never be filled, and without at least one
	// worker nothing would ever receive from the queue.
	if *size < 1 {
		log.Fatal("batch size must be positive")
	}
	if *numWorkers < 1 {
		log.Fatal("number of workers must be positive")
	}

	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		// Deferred calls run in LIFO order: the profile is stopped (and
		// flushed) before the file is closed.
		defer f.Close()
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		defer pprof.StopCPUProfile()
	}

	tagger := make(filter.ISILTagger)

	// Holdings filters, one per "ISIL:path" argument of -f.
	// NOTE(review): with -skip a constructor error is ignored and the returned
	// filter is still registered; confirm the constructors return a usable
	// value alongside a non-nil error.
	for _, s := range hfiles {
		isil, file, err := parseTagPath(s)
		if err != nil {
			log.Fatal(err)
		}
		defer file.Close()
		f, err := filter.NewHoldingFilter(file)
		if err != nil && !*skip {
			log.Fatal(err)
		}
		tagger[isil] = append(tagger[isil], f)
	}

	// Collection filters, one per "ISIL:path" argument of -c.
	for _, s := range cfiles {
		isil, file, err := parseTagPath(s)
		if err != nil {
			log.Fatal(err)
		}
		defer file.Close()
		f, err := filter.NewCollectionFilter(file)
		if err != nil && !*skip {
			log.Fatal(err)
		}
		tagger[isil] = append(tagger[isil], f)
	}

	// List filters, one per "ISIL:path" argument of -l.
	for _, s := range lfiles {
		isil, file, err := parseTagPath(s)
		if err != nil {
			log.Fatal(err)
		}
		defer file.Close()
		f, err := filter.NewListFilter(file)
		if err != nil && !*skip {
			log.Fatal(err)
		}
		tagger[isil] = append(tagger[isil], f)
	}

	// Source filters, given as "ISIL:SID".
	for _, s := range source {
		ss := strings.Split(s, ":")
		if len(ss) != 2 {
			log.Fatal("use ISIL:SID")
		}
		isil, sid := ss[0], ss[1]
		tagger[isil] = append(tagger[isil], filter.SourceFilter{SourceID: sid})
	}

	// -any replaces every filter registered so far for the ISIL.
	for _, isil := range anyISILs {
		tagger[isil] = []filter.Filter{filter.Any{}}
	}

	if *dumpFilters {
		b, err := json.Marshal(tagger)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println(string(b))
		// Return rather than os.Exit so that the deferred profile stop and
		// file closes still run; no goroutines have been started yet.
		return
	}

	// TODO(miku): stutter less
	var filters []filter.Filter

	if *doiBlacklist != "" {
		file, err := os.Open(*doiBlacklist)
		if err != nil {
			log.Fatal(err)
		}
		defer file.Close()
		f, err := filter.NewDOIFilter(bufio.NewReader(file))
		if err != nil {
			log.Fatal(err)
		}
		filters = append(filters, f)
	}

	exportSchemaFunc, ok := Exporters[*format]
	if !ok {
		log.Fatal("unknown export schema")
	}
	opts := options{tagger: tagger, exportSchemaFunc: exportSchemaFunc, filters: filters}

	queue := make(chan []string)
	out := make(chan []byte)
	done := make(chan bool)

	if *gzipOutput {
		go span.GzipSink(os.Stdout, out, done)
	} else {
		go span.ByteSink(os.Stdout, out, done)
	}

	var wg sync.WaitGroup

	for i := 0; i < *numWorkers; i++ {
		wg.Add(1)
		go worker(queue, out, opts, &wg)
	}

	var batch []string

	var readers []io.Reader

	if flag.NArg() == 0 {
		readers = append(readers, os.Stdin)
	} else {
		for _, filename := range flag.Args() {
			file, err := os.Open(filename)
			if err != nil {
				log.Fatal(err)
			}
			defer file.Close()
			readers = append(readers, file)
		}
	}

	for _, r := range readers {
		br := bufio.NewReader(r)
		for {
			line, err := br.ReadString('\n')
			if err != nil && err != io.EOF {
				log.Fatal(err)
			}
			// ReadString returns the data read so far together with io.EOF
			// when the input does not end in a newline; keep that last line.
			if len(line) > 0 {
				batch = append(batch, line)
				if len(batch) >= *size {
					// Send a copy, since batch's backing array is reused.
					b := make([]string, len(batch))
					copy(b, batch)
					queue <- b
					batch = batch[:0]
				}
			}
			if err == io.EOF {
				break
			}
		}
	}

	// Flush the remainder; batch is not reused afterwards, so no copy needed.
	if len(batch) > 0 {
		queue <- batch
	}

	// Shut down in order: no more input, wait for workers, then let the sink
	// finish before exiting.
	close(queue)
	wg.Wait()
	close(out)
	<-done
}
Пример #2
0
// main converts the input file given as the first positional argument, using
// the format selected with -i. Items that implement span.Importer are
// converted and JSON-encoded inline; items that implement span.Batcher are
// handed to a pool of workers. All output is passed to a sink goroutine
// attached to os.Stdout, gzip-compressed when -z is set.
func main() {
	inputFormat := flag.String("i", "", "input format")
	listFormats := flag.Bool("list", false, "list formats")
	members := flag.String("members", "", "path to LDJ file, one member per line")
	numWorkers := flag.Int("w", runtime.NumCPU(), "number of workers")
	logfile := flag.String("log", "", "if given log to file")
	showVersion := flag.Bool("v", false, "prints current program version")
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
	verbose := flag.Bool("verbose", false, "more output")
	gzipOutput := flag.Bool("z", false, "gzip output")

	flag.Parse()

	if *showVersion {
		fmt.Println(span.AppVersion)
		os.Exit(0)
	}

	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		// Deferred calls run in LIFO order: the profile is stopped (and
		// flushed) before the file is closed.
		defer f.Close()
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		defer pprof.StopCPUProfile()
	}

	runtime.GOMAXPROCS(*numWorkers)

	if *listFormats {
		for k := range formats {
			fmt.Println(k)
		}
		// Return rather than os.Exit so that a running CPU profile is still
		// stopped and flushed; no goroutines have been started yet.
		return
	}

	if *inputFormat == "" {
		log.Fatal(errFormatRequired)
	}

	source, ok := formats[*inputFormat]
	if !ok {
		log.Fatal(errFormatUnsupported)
	}

	if *members != "" {
		err := crossref.PopulateMemberNameCache(*members)
		if err != nil {
			log.Fatal(err)
		}
	}

	if flag.Arg(0) == "" {
		log.Fatal("input file required")
	}

	// Install the file-backed logger before any goroutine is started, so the
	// package-level logger is not reassigned while workers are running.
	if *logfile != "" {
		ff, err := os.Create(*logfile)
		if err != nil {
			log.Fatal(err)
		}
		bw := bufio.NewWriter(ff)
		logger = log.New(bw, "", 0)
		// LIFO: the buffered writer is flushed before the file is closed.
		defer ff.Close()
		defer bw.Flush()
	}

	queue := make(chan span.Batcher)
	out := make(chan []byte)
	done := make(chan bool)

	if *gzipOutput {
		go span.GzipSink(os.Stdout, out, done)
	} else {
		go span.ByteSink(os.Stdout, out, done)
	}

	var wg sync.WaitGroup
	opts := options{verbose: *verbose}

	for i := 0; i < *numWorkers; i++ {
		wg.Add(1)
		go batcherWorker(queue, out, opts, &wg)
	}

	filename := flag.Arg(0)
	file, err := os.Open(filename)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	ch, err := source.Iterate(file)
	if err != nil {
		log.Fatal(err)
	}

	for item := range ch {
		switch v := item.(type) {
		case span.Importer:
			// Single documents are converted and serialized right here.
			output, err := v.ToIntermediateSchema()
			if err != nil {
				log.Fatal(err)
			}
			b, err := json.Marshal(output)
			if err != nil {
				log.Fatal(err)
			}
			out <- b
		case span.Batcher:
			// Batches are processed by the worker pool.
			queue <- v
		default:
			log.Fatal(errCannotConvert)
		}
	}

	// Shut down in order: no more batches, wait for workers, then let the
	// sink finish before exiting.
	close(queue)
	wg.Wait()
	close(out)
	<-done
}