func main() {
	var hfiles, lfiles, cfiles, any, source container.StringSlice
	flag.Var(&hfiles, "f", "ISIL:/path/to/ovid.xml")
	flag.Var(&lfiles, "l", "ISIL:/path/to/list.txt")
	flag.Var(&cfiles, "c", "ISIL:/path/to/collections.txt")
	flag.Var(&any, "any", "ISIL")
	flag.Var(&source, "source", "ISIL:SID")

	skip := flag.Bool("skip", false, "skip errors")
	showVersion := flag.Bool("v", false, "prints current program version")
	dumpFilters := flag.Bool("dump", false, "dump filters and exit")
	size := flag.Int("b", 20000, "batch size")
	numWorkers := flag.Int("w", runtime.NumCPU(), "number of workers")
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
	format := flag.String("o", "solr4vu13v4", "output format")
	listFormats := flag.Bool("list", false, "list output formats")
	gzipOutput := flag.Bool("z", false, "gzip output")
	doiBlacklist := flag.String("doi-blacklist", "", "a list of DOIs to skip")

	flag.Parse()

	runtime.GOMAXPROCS(*numWorkers)

	if *showVersion {
		fmt.Println(span.AppVersion)
		os.Exit(0)
	}

	if *listFormats {
		for k := range Exporters {
			fmt.Println(k)
		}
		os.Exit(0)
	}

	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	// Build one filter chain per ISIL from holdings, collection and list files.
	tagger := make(filter.ISILTagger)

	for _, s := range hfiles {
		isil, file, err := parseTagPath(s)
		if err != nil {
			log.Fatal(err)
		}
		defer file.Close()
		f, err := filter.NewHoldingFilter(file)
		if err != nil && !*skip {
			log.Fatal(err)
		}
		tagger[isil] = append(tagger[isil], f)
	}

	for _, s := range cfiles {
		isil, file, err := parseTagPath(s)
		if err != nil {
			log.Fatal(err)
		}
		defer file.Close()
		f, err := filter.NewCollectionFilter(file)
		if err != nil && !*skip {
			log.Fatal(err)
		}
		tagger[isil] = append(tagger[isil], f)
	}

	for _, s := range lfiles {
		isil, file, err := parseTagPath(s)
		if err != nil {
			log.Fatal(err)
		}
		defer file.Close()
		f, err := filter.NewListFilter(file)
		if err != nil && !*skip {
			log.Fatal(err)
		}
		tagger[isil] = append(tagger[isil], f)
	}

	for _, s := range source {
		ss := strings.Split(s, ":")
		if len(ss) != 2 {
			log.Fatal("use ISIL:SID")
		}
		isil, sid := ss[0], ss[1]
		tagger[isil] = append(tagger[isil], filter.SourceFilter{SourceID: sid})
	}

	for _, isil := range any {
		tagger[isil] = []filter.Filter{filter.Any{}}
	}

	if *dumpFilters {
		b, err := json.Marshal(tagger)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println(string(b))
		os.Exit(0)
	}

	// TODO(miku): stutter less
	var filters []filter.Filter

	if *doiBlacklist != "" {
		file, err := os.Open(*doiBlacklist)
		if err != nil {
			log.Fatal(err)
		}
		f, err := filter.NewDOIFilter(bufio.NewReader(file))
		if err != nil {
			log.Fatal(err)
		}
		filters = append(filters, f)
	}

	exportSchemaFunc, ok := Exporters[*format]
	if !ok {
		log.Fatal("unknown export schema")
	}
	opts := options{tagger: tagger, exportSchemaFunc: exportSchemaFunc, filters: filters}

	queue := make(chan []string)
	out := make(chan []byte)
	done := make(chan bool)

	if *gzipOutput {
		go span.GzipSink(os.Stdout, out, done)
	} else {
		go span.ByteSink(os.Stdout, out, done)
	}

	var wg sync.WaitGroup

	for i := 0; i < *numWorkers; i++ {
		wg.Add(1)
		go worker(queue, out, opts, &wg)
	}

	var batch []string
	var i int
	var readers []io.Reader

	if flag.NArg() == 0 {
		readers = append(readers, os.Stdin)
	} else {
		for _, filename := range flag.Args() {
			file, err := os.Open(filename)
			if err != nil {
				log.Fatal(err)
			}
			defer file.Close()
			readers = append(readers, file)
		}
	}

	for _, r := range readers {
		br := bufio.NewReader(r)
		for {
			line, err := br.ReadString('\n')
			if err == io.EOF {
				break
			}
			if err != nil {
				log.Fatal(err)
			}
			batch = append(batch, line)
			if i%*size == 0 {
				b := make([]string, len(batch))
				copy(b, batch)
				queue <- b
				batch = batch[:0]
			}
			i++
		}
	}

	// Flush the last, possibly partial batch, then shut the pipeline down in order.
	b := make([]string, len(batch))
	copy(b, batch)
	queue <- b

	close(queue)
	wg.Wait()
	close(out)
	<-done
}
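// All of the tools in this section share the same fan-out/fan-in shape: a
// queue channel of batches, a pool of workers, an output channel and a single
// sink goroutine that owns the writer. The following is a standalone sketch of
// that pattern only; sink and worker here are illustrative stand-ins, not the
// span package's actual ByteSink/GzipSink or worker implementations.
package main

import (
	"bufio"
	"io"
	"os"
	"strings"
	"sync"
)

// sink drains byte slices from out, writes them through one buffered writer
// and signals done once out has been closed and drained.
func sink(w io.Writer, out chan []byte, done chan bool) {
	bw := bufio.NewWriter(w)
	defer bw.Flush()
	for b := range out {
		bw.Write(b)
	}
	done <- true
}

// worker consumes batches of lines, applies a placeholder transformation and
// forwards the results to the sink.
func worker(queue chan []string, out chan []byte, wg *sync.WaitGroup) {
	defer wg.Done()
	for batch := range queue {
		for _, line := range batch {
			out <- []byte(strings.ToUpper(line))
		}
	}
}

func main() {
	queue := make(chan []string)
	out := make(chan []byte)
	done := make(chan bool)

	go sink(os.Stdout, out, done)

	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go worker(queue, out, &wg)
	}

	queue <- []string{"hello\n", "world\n"}

	// Shutdown order matters: close the queue, wait for the workers, only then
	// close out so the sink can drain, and finally wait for the sink to flush.
	close(queue)
	wg.Wait()
	close(out)
	<-done
}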
func main() {
	inputFormat := flag.String("i", "", "input format")
	listFormats := flag.Bool("list", false, "list formats")
	numWorkers := flag.Int("w", runtime.NumCPU(), "number of workers")
	logfile := flag.String("log", "", "if given log to file")
	showVersion := flag.Bool("v", false, "prints current program version")
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
	verbose := flag.Bool("verbose", false, "more output")

	flag.Parse()

	if *showVersion {
		fmt.Println(span.AppVersion)
		os.Exit(0)
	}

	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	runtime.GOMAXPROCS(*numWorkers)

	if *listFormats {
		var names []string
		for k := range formats {
			names = append(names, k)
		}
		sort.Strings(names)
		for _, name := range names {
			fmt.Println(name)
		}
		os.Exit(0)
	}

	if *inputFormat == "" {
		log.Fatal(errFormatRequired)
	}

	if _, ok := formats[*inputFormat]; !ok {
		log.Fatal(errFormatUnsupported)
	}

	if flag.Arg(0) == "" {
		log.Fatal("input file required")
	}

	queue := make(chan []span.Importer)
	out := make(chan []byte)
	done := make(chan bool)

	go span.ByteSink(os.Stdout, out, done)

	var wg sync.WaitGroup
	opts := options{verbose: *verbose}

	for i := 0; i < *numWorkers; i++ {
		wg.Add(1)
		go worker(queue, out, opts, &wg)
	}

	if *logfile != "" {
		ff, err := os.Create(*logfile)
		if err != nil {
			log.Fatal(err)
		}
		bw := bufio.NewWriter(ff)
		logger = log.New(bw, "", 0)
		defer ff.Close()
		defer bw.Flush()
	}

	filename := flag.Arg(0)

	file, err := os.Open(filename)
	if err != nil {
		log.Fatal(err)
	}

	source, _ := formats[*inputFormat]
	ch, err := source.Iterate(file)
	if err != nil {
		log.Fatal(err)
	}

	for batch := range ch {
		queue <- batch
	}

	close(queue)
	wg.Wait()
	close(out)
	<-done
}
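// The loop above relies on an Iterate-style contract: a source turns an
// io.Reader into a channel of record batches that main simply drains into the
// worker queue. The lineSource type below is a hypothetical stand-in used only
// to illustrate that contract; it is not the span package's source interface.
package main

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

type lineSource struct {
	batchSize int
}

// Iterate reads lines from r and emits them in fixed-size batches on the
// returned channel, closing it when the input is exhausted.
func (s lineSource) Iterate(r io.Reader) (chan []string, error) {
	ch := make(chan []string)
	go func() {
		defer close(ch)
		var batch []string
		scanner := bufio.NewScanner(r)
		for scanner.Scan() {
			batch = append(batch, scanner.Text())
			if len(batch) == s.batchSize {
				ch <- batch
				batch = nil
			}
		}
		if len(batch) > 0 {
			ch <- batch
		}
	}()
	return ch, nil
}

func main() {
	source := lineSource{batchSize: 2}
	ch, err := source.Iterate(strings.NewReader("a\nb\nc\n"))
	if err != nil {
		panic(err)
	}
	for batch := range ch {
		fmt.Println(batch) // [a b], then [c]
	}
}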
func main() {
	inputFormat := flag.String("i", "", "input format")
	listFormats := flag.Bool("list", false, "list formats")
	members := flag.String("members", "", "path to LDJ file, one member per line")
	numWorkers := flag.Int("w", runtime.NumCPU(), "number of workers")
	logfile := flag.String("log", "", "if given log to file")
	showVersion := flag.Bool("v", false, "prints current program version")
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
	verbose := flag.Bool("verbose", false, "more output")
	gzipOutput := flag.Bool("z", false, "gzip output")

	flag.Parse()

	if *showVersion {
		fmt.Println(span.AppVersion)
		os.Exit(0)
	}

	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	runtime.GOMAXPROCS(*numWorkers)

	if *listFormats {
		for k := range formats {
			fmt.Println(k)
		}
		os.Exit(0)
	}

	if *inputFormat == "" {
		log.Fatal(errFormatRequired)
	}

	if _, ok := formats[*inputFormat]; !ok {
		log.Fatal(errFormatUnsupported)
	}

	if *members != "" {
		err := crossref.PopulateMemberNameCache(*members)
		if err != nil {
			log.Fatal(err)
		}
	}

	if flag.Arg(0) == "" {
		log.Fatal("input file required")
	}

	queue := make(chan span.Batcher)
	out := make(chan []byte)
	done := make(chan bool)

	if *gzipOutput {
		go span.GzipSink(os.Stdout, out, done)
	} else {
		go span.ByteSink(os.Stdout, out, done)
	}

	var wg sync.WaitGroup
	opts := options{verbose: *verbose}

	for i := 0; i < *numWorkers; i++ {
		wg.Add(1)
		go batcherWorker(queue, out, opts, &wg)
	}

	if *logfile != "" {
		ff, err := os.Create(*logfile)
		if err != nil {
			log.Fatal(err)
		}
		bw := bufio.NewWriter(ff)
		logger = log.New(bw, "", 0)
		defer ff.Close()
		defer bw.Flush()
	}

	filename := flag.Arg(0)

	file, err := os.Open(filename)
	if err != nil {
		log.Fatal(err)
	}

	source, _ := formats[*inputFormat]
	ch, err := source.Iterate(file)
	if err != nil {
		log.Fatal(err)
	}

	// Single records are converted inline; whole batches go to the worker pool.
	for item := range ch {
		switch v := item.(type) {
		case span.Importer:
			output, err := v.ToIntermediateSchema()
			if err != nil {
				log.Fatal(err)
			}
			b, err := json.Marshal(output)
			if err != nil {
				log.Fatal(err)
			}
			out <- b
		case span.Batcher:
			queue <- v
		default:
			log.Fatal(errCannotConvert)
		}
	}

	close(queue)
	wg.Wait()
	close(out)
	<-done
}
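// The type switch above treats single records and whole batches differently:
// a record is converted and written inline, a batch is handed to the worker
// pool. Below is a toy version of that dispatch; converter, record and batch
// are hypothetical stand-ins for span.Importer and span.Batcher, whose exact
// definitions are not shown here.
package main

import (
	"encoding/json"
	"fmt"
	"log"
)

// converter mirrors the single-record conversion step used above.
type converter interface {
	ToIntermediateSchema() (interface{}, error)
}

type record struct {
	Title string
}

func (r record) ToIntermediateSchema() (interface{}, error) {
	return map[string]string{"rft.atitle": r.Title}, nil
}

// batch stands in for a unit of work that would be handed to a worker pool.
type batch []record

func main() {
	items := []interface{}{
		record{Title: "single"},
		batch{{Title: "a"}, {Title: "b"}},
	}
	for _, item := range items {
		switch v := item.(type) {
		case converter:
			// Single records: convert and emit immediately.
			output, err := v.ToIntermediateSchema()
			if err != nil {
				log.Fatal(err)
			}
			b, err := json.Marshal(output)
			if err != nil {
				log.Fatal(err)
			}
			fmt.Println(string(b))
		case batch:
			// Batches: in the real tool this is where the queue send happens.
			fmt.Printf("would queue a batch of %d records\n", len(v))
		default:
			log.Fatal("cannot convert item")
		}
	}
}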
func main() {
	verbose := flag.Bool("verbose", false, "be verbose")
	showVersion := flag.Bool("v", false, "prints current program version")
	size := flag.Int("b", 20000, "batch size")
	numWorkers := flag.Int("w", runtime.NumCPU(), "number of workers")

	flag.Parse()

	if *showVersion {
		fmt.Println(span.AppVersion)
		os.Exit(0)
	}

	var readers []io.Reader

	if flag.NArg() == 0 {
		readers = append(readers, os.Stdin)
	} else {
		for _, filename := range flag.Args() {
			file, err := os.Open(filename)
			if err != nil {
				log.Fatal(err)
			}
			defer file.Close()
			readers = append(readers, file)
		}
	}

	errc := make(chan string)
	done := make(chan bool)
	go statsCounter(errc, done)

	out := make(chan []byte)
	go span.ByteSink(os.Stdout, out, done)

	for _, r := range readers {
		p := bytebatch.NewLineProcessor(r, os.Stdout, func(b []byte) ([]byte, error) {
			var is finc.IntermediateSchema
			if err := json.Unmarshal(b, &is); err != nil {
				return b, err
			}
			for _, t := range qa.TestSuite {
				if err := t.TestRecord(is); err != nil {
					issue, ok := err.(qa.Issue)
					if !ok {
						log.Fatalf("unexpected error type: %s", err)
					}
					errc <- issue.Err.Error()
					if *verbose {
						b, err := json.Marshal(issue)
						if err != nil {
							log.Fatal(err)
						}
						out <- b
					}
				}
			}
			return nil, nil
		})

		p.NumWorkers = *numWorkers
		p.BatchSize = *size

		if err := p.Run(); err != nil {
			log.Fatal(err)
		}
	}

	close(errc)
	close(out)

	// Wait for both the stats counter and the output writer to finish.
	<-done
	<-done

	b, err := json.Marshal(map[string]interface{}{"stats": stats})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Fprintln(os.Stderr, string(b))
}
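// The QA loop above type-asserts errors to qa.Issue so that one error value
// can feed both the plain-text stats counter and the verbose JSON output. The
// sketch below shows that structured-error pattern in isolation; Issue and
// testNonEmptyTitle are hypothetical and only mirror the idea, they are not
// the qa package's actual types.
package main

import (
	"encoding/json"
	"errors"
	"fmt"
)

// Issue is an error that also carries structured context about the record.
type Issue struct {
	Record map[string]string `json:"record"`
	Reason string            `json:"reason"`
	Err    error             `json:"-"`
}

func (i Issue) Error() string { return i.Reason }

// testNonEmptyTitle is a stand-in for a single record-level QA check.
func testNonEmptyTitle(record map[string]string) error {
	if record["rft.atitle"] == "" {
		err := errors.New("empty title")
		return Issue{Record: record, Reason: err.Error(), Err: err}
	}
	return nil
}

func main() {
	record := map[string]string{"finc.record_id": "1"}
	if err := testNonEmptyTitle(record); err != nil {
		issue, ok := err.(Issue)
		if !ok {
			fmt.Println("unexpected error type:", err)
			return
		}
		// The bare message would go to the stats counter, the JSON to stdout.
		b, _ := json.Marshal(issue)
		fmt.Println(issue.Err, string(b))
	}
}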