// IDList returns a slice of strings, containing all ids of the given marc file func IDList(filename string) []string { fallback := false yaz, err := exec.LookPath("yaz-marcdump") if err != nil { fallback = true } awk, err := exec.LookPath("awk") if err != nil { fallback = true } var ids []string if fallback { // use slower iteration over records fi, err := os.Open(filename) if err != nil { log.Fatal(err) } defer func() { if err := fi.Close(); err != nil { log.Fatal(err) } }() for { record, err := marc22.ReadRecord(fi) if err == io.EOF { break } if err != nil { log.Fatal(err) } fields := record.GetControlFields("001") if len(fields) != 1 { log.Fatalf("invalid 001 field count: %d\n", len(fields)) } ids = append(ids, strings.TrimSpace(fields[0].Data)) } } else { // fast version using yaz and awk command := fmt.Sprintf("%s '%s' | %s ' /^001 / {print $2}'", yaz, filename, awk) out, err := exec.Command("bash", "-c", command).Output() if err != nil { log.Fatal(err) } for _, line := range strings.Split(string(out), "\n") { line = strings.TrimSpace(line) if len(line) == 0 { continue } ids = append(ids, strings.TrimSpace(line)) } } return ids }
func main() { version := flag.Bool("v", false, "prints current program version") cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file") var PrintUsage = func() { fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS] MARCFILE\n", os.Args[0]) flag.PrintDefaults() } flag.Parse() if *cpuprofile != "" { f, err := os.Create(*cpuprofile) if err != nil { log.Fatal(err) } pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } if *version { fmt.Println(marctools.AppVersion) os.Exit(0) } if flag.NArg() != 1 { PrintUsage() os.Exit(1) } fi, err := os.Open(flag.Args()[0]) if err != nil { log.Fatalf("%s\n", err) } defer func() { if err := fi.Close(); err != nil { log.Fatalf("%s\n", err) } }() for { record, err := marc22.ReadRecord(fi) if err == io.EOF { break } if err != nil { log.Fatalf("%s\n", err) } fmt.Printf("%s\n", record.String()) } return }
func TestRecordToTSV(t *testing.T) { for _, tt := range recordToTSVTests { reader := strings.NewReader(tt.record) record, err := marc22.ReadRecord(reader) if err != nil { t.Error(err) } result := RecordToTSV(record, tt.tags, tt.fillNA, tt.separator, tt.skipIncompleteLines) if result != tt.out { t.Errorf("RecordToTSV(%s, %v, %s, %s, %t) => %+v, want: %+v", record, tt.tags, tt.fillNA, tt.separator, tt.skipIncompleteLines, result, tt.out) } } }
func TestRecordToMap(t *testing.T) { for _, tt := range recordMapTests { reader := strings.NewReader(tt.record) record, err := marc22.ReadRecord(reader) if err != nil { t.Error(err) } result := RecordMap(record, tt.filterMap, tt.includeLeader) if result == nil { t.Error("RecordToMap should not return nil") } b, err := json.Marshal(result) if err != nil { t.Error("RecordToMap should return something JSON-serializable") } if string(b) != tt.out { t.Errorf("RecordToMap(%s, %+v, %v) => %+v, want: %+v", tt.record, tt.filterMap, tt.includeLeader, string(b), tt.out) } } }
func main() { ignore := flag.Bool("i", false, "ignore marc errors (not recommended)") version := flag.Bool("v", false, "prints current program version") outfile := flag.String("o", "", "output file (or stdout if none given)") exclude := flag.String("x", "", "comma separated list of ids to exclude (or filename with one id per line)") var PrintUsage = func() { fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS] MARCFILE\n", os.Args[0]) flag.PrintDefaults() } flag.Parse() // display version and exit if *version { fmt.Println(marctools.AppVersion) os.Exit(0) } if flag.NArg() != 1 { PrintUsage() os.Exit(1) } // input file fi, err := os.Open(flag.Args()[0]) if err != nil { log.Fatalln(err) } defer func() { if err := fi.Close(); err != nil { log.Fatalln(err) } }() // output file or stdout var output *os.File if *outfile == "" { output = os.Stdout } else { output, err = os.Create(*outfile) if err != nil { log.Fatalln(err) } defer func() { if err := output.Close(); err != nil { log.Fatalln(err) } }() } // exclude list excludedIds := marctools.NewStringSet() if *exclude != "" { if _, err := os.Stat(*exclude); err != nil { if os.IsNotExist(err) { fmt.Fprintf(os.Stderr, "excluded ids interpreted as string\n") for _, value := range strings.Split(*exclude, ",") { excludedIds.Add(strings.TrimSpace(value)) } } else if err != nil { log.Fatalln(err) } } else { fmt.Fprintf(os.Stderr, "excluded ids interpreted as file\n") // read one id per line from file file, err := os.Open(*exclude) if err != nil { log.Fatalln(err) } defer func() { if err := file.Close(); err != nil { log.Fatalln(err) } }() scanner := bufio.NewScanner(file) for scanner.Scan() { excludedIds.Add(strings.TrimSpace(scanner.Text())) } } fmt.Fprintf(os.Stderr, "%d ids to exclude loaded\n", excludedIds.Size()) } // collect the excluded ids here excluded := make([]string, 0, 0) // keep track of all ids ids := marctools.NewStringSet() // collect the duplicate ids; array, since same id may occur many times // skipped could be an integer for now, because we do not display the skipped // records (TODO: add flag to display skipped records) skipped := make([]string, 0, 0) // just count the total records and those without id var counter, without_id int for { head, _ := fi.Seek(0, os.SEEK_CUR) record, err := marc22.ReadRecord(fi) if err == io.EOF { break } if err != nil { if *ignore { fmt.Fprintf(os.Stderr, "skipping error: %s\n", err) continue } else { log.Fatalln(err) } } tail, _ := fi.Seek(0, os.SEEK_CUR) length := tail - head fields := record.GetControlFields("001") if len(fields) > 0 { id := fields[0].Data if ids.Contains(id) { skipped = append(skipped, id) } else if excludedIds.Contains(id) { excluded = append(excluded, id) } else { ids.Add(id) fi.Seek(head, 0) buf := make([]byte, length) n, err := fi.Read(buf) if err != nil { log.Fatalln(err) } if _, err := output.Write(buf[:n]); err != nil { log.Fatalln(err) } } } else if len(fields) == 0 { without_id += 1 } counter += 1 } fmt.Fprintf(os.Stderr, "%d records read\n", counter) fmt.Fprintf(os.Stderr, "%d records written, %d skipped, %d excluded, %d without ID (001)\n", ids.Size(), len(skipped), len(excluded), without_id) }
func main() { cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file") ignoreErrors := flag.Bool("i", false, "ignore marc errors (not recommended)") numWorkers := flag.Int("w", runtime.NumCPU(), "number of workers") version := flag.Bool("v", false, "prints current program version and exit") fillna := flag.String("f", "<NULL>", "fill missing values with this") separator := flag.String("s", "", "separator to use for multiple values") skipIncompleteLines := flag.Bool("k", false, "skip incomplete lines (missing values)") var PrintUsage = func() { fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS] MARCFILE TAG [TAG, TAG, ...]\n", os.Args[0]) flag.PrintDefaults() } flag.Parse() if *numWorkers > 0 { runtime.GOMAXPROCS(*numWorkers) } if *cpuprofile != "" { f, err := os.Create(*cpuprofile) if err != nil { log.Fatal(err) } pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } if *version { fmt.Println(marctools.AppVersion) os.Exit(0) } if flag.NArg() < 1 { PrintUsage() os.Exit(1) } file, err := os.Open(flag.Args()[0]) if err != nil { log.Fatalln(err) } defer func() { if err := file.Close(); err != nil { log.Fatalln(err) } }() tags := flag.Args()[1:] if len(tags) == 0 { log.Fatalln("at least one tag is required") } queue := make(chan Work) results := make(chan string) done := make(chan bool) writer := bufio.NewWriter(os.Stdout) defer writer.Flush() go FanInWriter(writer, results, done) var wg sync.WaitGroup for i := 0; i < *numWorkers; i++ { wg.Add(1) go Worker(queue, results, &wg) } for { record, err := marc22.ReadRecord(file) if err == io.EOF { break } if err != nil { if *ignoreErrors { log.Printf("[EE] %s\n", err) continue } else { log.Fatalln(err) } } work := Work{Record: record, Tags: tags, FillNA: *fillna, Separator: *separator, SkipIncompleteLines: *skipIncompleteLines} queue <- work } close(queue) wg.Wait() close(results) <-done }
func main() { cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file") ignoreErrors := flag.Bool("i", false, "ignore marc errors (not recommended)") numWorkers := flag.Int("w", runtime.NumCPU(), "number of workers") version := flag.Bool("v", false, "prints current program version and exit") filterVar := flag.String("r", "", "only dump the given tags (e.g. 001,003)") includeLeader := flag.Bool("l", false, "dump the leader as well") metaVar := flag.String("m", "", "a key=value pair to pass to meta") recordKey := flag.String("recordkey", "record", "key name of the record") plainMode := flag.Bool("p", false, "plain mode: dump without content and meta") batchSize := flag.Int("b", 10000, "batch size for intercom") var PrintUsage = func() { fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS] MARCFILE\n", os.Args[0]) flag.PrintDefaults() } flag.Parse() if *numWorkers > 0 { runtime.GOMAXPROCS(*numWorkers) } if *cpuprofile != "" { f, err := os.Create(*cpuprofile) if err != nil { log.Fatal(err) } pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } if *version { fmt.Println(marctools.AppVersion) os.Exit(0) } if flag.NArg() < 1 { PrintUsage() os.Exit(1) } file, err := os.Open(flag.Args()[0]) if err != nil { log.Fatal(err) } defer func() { if err := file.Close(); err != nil { log.Fatal(err) } }() filterMap := marctools.StringToMapSet(*filterVar) metaMap, err := marctools.KeyValueStringToMap(*metaVar) if err != nil { log.Fatal(err) } queue := make(chan []*marc22.Record) results := make(chan []byte) done := make(chan bool) writer := bufio.NewWriter(os.Stdout) defer writer.Flush() go marctools.FanInWriter(writer, results, done) var wg sync.WaitGroup options := marctools.JsonConversionOptions{ FilterMap: filterMap, MetaMap: metaMap, IncludeLeader: *includeLeader, PlainMode: *plainMode, IgnoreErrors: *ignoreErrors, RecordKey: *recordKey, } for i := 0; i < *numWorkers; i++ { wg.Add(1) go marctools.BatchWorker(queue, results, &wg, options) } counter := 0 var records []*marc22.Record for { record, err := marc22.ReadRecord(file) if err == io.EOF { break } if err != nil { if *ignoreErrors { log.Println(err) continue } else { log.Fatal(err) } } records = append(records, record) counter += 1 if counter%*batchSize == 0 { queue <- records records = records[:0] } } queue <- records close(queue) wg.Wait() close(results) <-done }