Exemplo n.º 1
0
// IDList returns a slice of strings, containing all ids of the given marc file.
//
// When both yaz-marcdump and awk are found on PATH, the ids are extracted by
// piping the yaz-marcdump output through awk (run via bash). Otherwise the
// file is read record by record and the single 001 control field of every
// record is collected.
//
// Any error (unreadable file, malformed record, a record whose 001 field
// count is not exactly one, failing external command) terminates the process
// via log.Fatal.
func IDList(filename string) []string {
	yaz, yazErr := exec.LookPath("yaz-marcdump")
	awk, awkErr := exec.LookPath("awk")

	var ids []string

	if yazErr != nil || awkErr != nil {
		// use slower iteration over records
		fi, err := os.Open(filename)
		if err != nil {
			log.Fatal(err)
		}
		defer func() {
			if err := fi.Close(); err != nil {
				log.Fatal(err)
			}
		}()

		for {
			record, err := marc22.ReadRecord(fi)
			if err == io.EOF {
				break
			}
			if err != nil {
				log.Fatal(err)
			}

			fields := record.GetControlFields("001")
			if len(fields) != 1 {
				log.Fatalf("invalid 001 field count: %d\n", len(fields))
			}
			ids = append(ids, strings.TrimSpace(fields[0].Data))
		}
		return ids
	}

	// fast version using yaz and awk
	//
	// The filename is handed to bash as positional parameter $1 instead of
	// being spliced into the command string, so quotes or shell
	// metacharacters in the name cannot break or alter the pipeline. The
	// "$2" inside the single-quoted awk program is awk's second field, not a
	// shell parameter.
	command := fmt.Sprintf(`%s "$1" | %s ' /^001 / {print $2}'`, yaz, awk)
	out, err := exec.Command("bash", "-c", command, "bash", filename).Output()
	if err != nil {
		log.Fatal(err)
	}

	for _, line := range strings.Split(string(out), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		ids = append(ids, line)
	}

	return ids
}
Exemplo n.º 2
0
// main prints the string representation of every record in the MARC file
// given as the single positional argument, one record per line on stdout.
//
// Flags:
//
//	-v           print the program version and exit
//	-cpuprofile  write a CPU profile to the given file
//
// Any I/O or parse error terminates the program via log.Fatal.
func main() {

	version := flag.Bool("v", false, "prints current program version")
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")

	var PrintUsage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS] MARCFILE\n", os.Args[0])
		flag.PrintDefaults()
	}

	flag.Parse()

	// Handle the exits that need no input before profiling starts:
	// os.Exit does not run deferred functions, so exiting after
	// StartCPUProfile would leave an unfinished profile behind.
	if *version {
		fmt.Println(marctools.AppVersion)
		os.Exit(0)
	}

	if flag.NArg() != 1 {
		PrintUsage()
		os.Exit(1)
	}

	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		defer func() {
			pprof.StopCPUProfile()
			if err := f.Close(); err != nil {
				log.Println(err)
			}
		}()
	}

	fi, err := os.Open(flag.Args()[0])
	if err != nil {
		log.Fatalf("%s\n", err)
	}
	defer func() {
		if err := fi.Close(); err != nil {
			log.Fatalf("%s\n", err)
		}
	}()

	for {
		record, err := marc22.ReadRecord(fi)
		if err == io.EOF {
			break
		}
		if err != nil {
			// NOTE: log.Fatalf exits immediately; deferred cleanup
			// (including the CPU profile) is skipped on this path.
			log.Fatalf("%s\n", err)
		}

		fmt.Printf("%s\n", record.String())
	}
}
Exemplo n.º 3
0
// TestRecordToTSV runs RecordToTSV against every case in recordToTSVTests
// and compares the produced line with the expected output.
func TestRecordToTSV(t *testing.T) {
	for _, tt := range recordToTSVTests {
		record, err := marc22.ReadRecord(strings.NewReader(tt.record))
		if err != nil {
			// The record could not be parsed; there is nothing valid to
			// convert, so report and move on to the next case.
			t.Error(err)
			continue
		}
		result := RecordToTSV(record, tt.tags, tt.fillNA, tt.separator, tt.skipIncompleteLines)
		if result != tt.out {
			t.Errorf("RecordToTSV(%s, %v, %s, %s, %t) => %+v, want: %+v", record, tt.tags, tt.fillNA, tt.separator, tt.skipIncompleteLines, result, tt.out)
		}
	}
}
Exemplo n.º 4
0
// TestRecordToMap runs RecordMap against every case in recordMapTests,
// serializes the result to JSON and compares it with the expected output.
func TestRecordToMap(t *testing.T) {
	for _, tt := range recordMapTests {
		record, err := marc22.ReadRecord(strings.NewReader(tt.record))
		if err != nil {
			// Without a parsed record the remaining checks are meaningless.
			t.Error(err)
			continue
		}
		result := RecordMap(record, tt.filterMap, tt.includeLeader)
		if result == nil {
			t.Error("RecordToMap should not return nil")
			continue
		}
		b, err := json.Marshal(result)
		if err != nil {
			// Skip the comparison: b holds no usable JSON at this point.
			t.Error("RecordToMap should return something JSON-serializable")
			continue
		}
		if string(b) != tt.out {
			t.Errorf("RecordToMap(%s, %+v, %v) => %+v, want: %+v", tt.record, tt.filterMap, tt.includeLeader, string(b), tt.out)
		}
	}
}
Exemplo n.º 5
0
// main copies the records of a MARC file to the output, keeping only the
// first record for every id (control field 001) and dropping records whose
// id is on the exclude list. Records without a 001 field are not written.
//
// Flags:
//
//	-i  ignore marc errors (log and continue instead of aborting)
//	-v  print the program version and exit
//	-o  output file (stdout if empty)
//	-x  comma separated list of ids to exclude, or the name of an existing
//	    file with one id per line
//
// Records are copied as raw bytes: the file offsets before and after each
// ReadRecord call delimit the record, which is then re-read from the input
// file and written unchanged. A summary is printed to stderr at the end.
func main() {

	ignore := flag.Bool("i", false, "ignore marc errors (not recommended)")
	version := flag.Bool("v", false, "prints current program version")
	outfile := flag.String("o", "", "output file (or stdout if none given)")
	exclude := flag.String("x", "", "comma separated list of ids to exclude (or filename with one id per line)")

	var PrintUsage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS] MARCFILE\n", os.Args[0])
		flag.PrintDefaults()
	}

	flag.Parse()

	// display version and exit
	if *version {
		fmt.Println(marctools.AppVersion)
		os.Exit(0)
	}

	if flag.NArg() != 1 {
		PrintUsage()
		os.Exit(1)
	}

	// input file
	fi, err := os.Open(flag.Args()[0])
	if err != nil {
		log.Fatalln(err)
	}

	defer func() {
		if err := fi.Close(); err != nil {
			log.Fatalln(err)
		}
	}()

	// output file or stdout
	var output *os.File
	if *outfile == "" {
		output = os.Stdout
	} else {
		output, err = os.Create(*outfile)
		if err != nil {
			log.Fatalln(err)
		}
		defer func() {
			if err := output.Close(); err != nil {
				log.Fatalln(err)
			}
		}()
	}

	// exclude list
	excludedIds := marctools.NewStringSet()

	if *exclude != "" {
		_, statErr := os.Stat(*exclude)
		switch {
		case statErr == nil:
			fmt.Fprintf(os.Stderr, "excluded ids interpreted as file\n")

			// read one id per line from file
			file, err := os.Open(*exclude)
			if err != nil {
				log.Fatalln(err)
			}

			scanner := bufio.NewScanner(file)
			for scanner.Scan() {
				excludedIds.Add(strings.TrimSpace(scanner.Text()))
			}
			// Scan returns false on read errors as well as on EOF; make
			// sure a partially read exclude list is not used silently.
			if err := scanner.Err(); err != nil {
				log.Fatalln(err)
			}
			if err := file.Close(); err != nil {
				log.Fatalln(err)
			}
		case os.IsNotExist(statErr):
			fmt.Fprintf(os.Stderr, "excluded ids interpreted as string\n")
			for _, value := range strings.Split(*exclude, ",") {
				excludedIds.Add(strings.TrimSpace(value))
			}
		default:
			log.Fatalln(statErr)
		}
		fmt.Fprintf(os.Stderr, "%d ids to exclude loaded\n", excludedIds.Size())
	}

	// collect the excluded ids here
	var excluded []string

	// keep track of all ids
	ids := marctools.NewStringSet()
	// collect the duplicate ids; array, since same id may occur many times
	// skipped could be an integer for now, because we do not display the skipped
	// records (TODO: add flag to display skipped records)
	var skipped []string
	// just count the total records and those without id
	var counter, withoutID int

	for {
		// remember where the record starts, so its raw bytes can be copied
		head, err := fi.Seek(0, os.SEEK_CUR)
		if err != nil {
			log.Fatalln(err)
		}
		record, err := marc22.ReadRecord(fi)
		if err == io.EOF {
			break
		}
		if err != nil {
			if *ignore {
				fmt.Fprintf(os.Stderr, "skipping error: %s\n", err)
				continue
			}
			log.Fatalln(err)
		}
		tail, err := fi.Seek(0, os.SEEK_CUR)
		if err != nil {
			log.Fatalln(err)
		}

		fields := record.GetControlFields("001")
		if len(fields) == 0 {
			withoutID++
			counter++
			continue
		}

		id := fields[0].Data
		switch {
		case ids.Contains(id):
			skipped = append(skipped, id)
		case excludedIds.Contains(id):
			excluded = append(excluded, id)
		default:
			ids.Add(id)
			if _, err := fi.Seek(head, os.SEEK_SET); err != nil {
				log.Fatalln(err)
			}
			// io.ReadFull guarantees the whole record is read (a single
			// Read may legally return fewer bytes) and leaves the file
			// offset at tail, ready for the next ReadRecord.
			buf := make([]byte, tail-head)
			if _, err := io.ReadFull(fi, buf); err != nil {
				log.Fatalln(err)
			}
			if _, err := output.Write(buf); err != nil {
				log.Fatalln(err)
			}
		}
		counter++
	}

	fmt.Fprintf(os.Stderr, "%d records read\n", counter)
	fmt.Fprintf(os.Stderr, "%d records written, %d skipped, %d excluded, %d without ID (001)\n",
		ids.Size(), len(skipped), len(excluded), withoutID)
}
Exemplo n.º 6
0
// main reads the MARC file given as first positional argument and hands
// every record, together with the requested tags and formatting options, to
// a pool of Worker goroutines; their results are written to stdout by
// FanInWriter.
//
// Flags:
//
//	-cpuprofile  write a CPU profile to the given file
//	-i           ignore marc errors (log and continue instead of aborting)
//	-w           number of workers; values below 1 fall back to the number
//	             of CPUs
//	-v           print the program version and exit
//	-f           fill value for missing values
//	-s           separator for multiple values
//	-k           skip incomplete lines
//
// Worker and FanInWriter are defined elsewhere; main only relies on Worker
// calling wg.Done when queue is closed and on FanInWriter signalling on done
// once results is closed.
func main() {

	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
	ignoreErrors := flag.Bool("i", false, "ignore marc errors (not recommended)")
	numWorkers := flag.Int("w", runtime.NumCPU(), "number of workers")
	version := flag.Bool("v", false, "prints current program version and exit")

	fillna := flag.String("f", "<NULL>", "fill missing values with this")
	separator := flag.String("s", "", "separator to use for multiple values")
	skipIncompleteLines := flag.Bool("k", false, "skip incomplete lines (missing values)")

	var PrintUsage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS] MARCFILE TAG [TAG, TAG, ...]\n", os.Args[0])
		flag.PrintDefaults()
	}

	flag.Parse()

	// Handle the exits that need no input before profiling starts:
	// os.Exit does not run deferred functions.
	if *version {
		fmt.Println(marctools.AppVersion)
		os.Exit(0)
	}

	if flag.NArg() < 1 {
		PrintUsage()
		os.Exit(1)
	}

	// With zero workers nobody would receive from queue and the first send
	// below would block forever, so never run with fewer than one.
	workers := *numWorkers
	if workers < 1 {
		workers = runtime.NumCPU()
	}
	runtime.GOMAXPROCS(workers)

	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		defer func() {
			pprof.StopCPUProfile()
			if err := f.Close(); err != nil {
				log.Println(err)
			}
		}()
	}

	file, err := os.Open(flag.Args()[0])
	if err != nil {
		log.Fatalln(err)
	}

	defer func() {
		if err := file.Close(); err != nil {
			log.Fatalln(err)
		}
	}()

	tags := flag.Args()[1:]

	if len(tags) == 0 {
		log.Fatalln("at least one tag is required")
	}

	queue := make(chan Work)
	results := make(chan string)
	done := make(chan bool)

	writer := bufio.NewWriter(os.Stdout)
	defer writer.Flush()
	go FanInWriter(writer, results, done)

	var wg sync.WaitGroup
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go Worker(queue, results, &wg)
	}

	for {
		record, err := marc22.ReadRecord(file)
		if err == io.EOF {
			break
		}
		if err != nil {
			if *ignoreErrors {
				log.Printf("[EE] %s\n", err)
				continue
			}
			// NOTE: log.Fatalln exits immediately; output still buffered
			// in writer or in flight in the workers is lost on this path.
			log.Fatalln(err)
		}

		queue <- Work{
			Record:              record,
			Tags:                tags,
			FillNA:              *fillna,
			Separator:           *separator,
			SkipIncompleteLines: *skipIncompleteLines,
		}
	}

	// Shut down in pipeline order: no more work, wait for the workers to
	// finish, no more results, wait for the writer to finish.
	close(queue)
	wg.Wait()
	close(results)
	<-done
}
Exemplo n.º 7
0
// main reads the MARC file given as first positional argument, groups the
// records into batches and hands each batch to a pool of
// marctools.BatchWorker goroutines; their results are written to stdout by
// marctools.FanInWriter.
//
// Flags:
//
//	-cpuprofile  write a CPU profile to the given file
//	-i           ignore marc errors (log and continue instead of aborting)
//	-w           number of workers; values below 1 fall back to the number
//	             of CPUs
//	-v           print the program version and exit
//	-r           only dump the given tags
//	-l           dump the leader as well
//	-m           key=value pairs passed as meta
//	-recordkey   key name of the record
//	-p           plain mode
//	-b           batch size; must be at least 1
//
// BatchWorker and FanInWriter are defined in marctools; main only relies on
// BatchWorker calling wg.Done when queue is closed and on FanInWriter
// signalling on done once results is closed.
func main() {

	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
	ignoreErrors := flag.Bool("i", false, "ignore marc errors (not recommended)")
	numWorkers := flag.Int("w", runtime.NumCPU(), "number of workers")
	version := flag.Bool("v", false, "prints current program version and exit")

	filterVar := flag.String("r", "", "only dump the given tags (e.g. 001,003)")
	includeLeader := flag.Bool("l", false, "dump the leader as well")
	metaVar := flag.String("m", "", "a key=value pair to pass to meta")
	recordKey := flag.String("recordkey", "record", "key name of the record")
	plainMode := flag.Bool("p", false, "plain mode: dump without content and meta")
	batchSize := flag.Int("b", 10000, "batch size for intercom")

	var PrintUsage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS] MARCFILE\n", os.Args[0])
		flag.PrintDefaults()
	}

	flag.Parse()

	// Handle the exits that need no input before profiling starts:
	// os.Exit does not run deferred functions.
	if *version {
		fmt.Println(marctools.AppVersion)
		os.Exit(0)
	}

	if flag.NArg() < 1 {
		PrintUsage()
		os.Exit(1)
	}

	// A batch size of zero would divide by zero and a negative one would
	// never flush a batch, so reject both up front.
	if *batchSize < 1 {
		log.Fatal("batch size must be at least 1")
	}

	// With zero workers nobody would receive from queue and the first send
	// below would block forever, so never run with fewer than one.
	workers := *numWorkers
	if workers < 1 {
		workers = runtime.NumCPU()
	}
	runtime.GOMAXPROCS(workers)

	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		defer func() {
			pprof.StopCPUProfile()
			if err := f.Close(); err != nil {
				log.Println(err)
			}
		}()
	}

	file, err := os.Open(flag.Args()[0])
	if err != nil {
		log.Fatal(err)
	}

	defer func() {
		if err := file.Close(); err != nil {
			log.Fatal(err)
		}
	}()

	filterMap := marctools.StringToMapSet(*filterVar)
	metaMap, err := marctools.KeyValueStringToMap(*metaVar)
	if err != nil {
		log.Fatal(err)
	}

	queue := make(chan []*marc22.Record)
	results := make(chan []byte)
	done := make(chan bool)

	writer := bufio.NewWriter(os.Stdout)
	defer writer.Flush()
	go marctools.FanInWriter(writer, results, done)

	var wg sync.WaitGroup
	options := marctools.JsonConversionOptions{
		FilterMap:     filterMap,
		MetaMap:       metaMap,
		IncludeLeader: *includeLeader,
		PlainMode:     *plainMode,
		IgnoreErrors:  *ignoreErrors,
		RecordKey:     *recordKey,
	}
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go marctools.BatchWorker(queue, results, &wg, options)
	}

	var records []*marc22.Record

	for {
		record, err := marc22.ReadRecord(file)
		if err == io.EOF {
			break
		}
		if err != nil {
			if *ignoreErrors {
				log.Println(err)
				continue
			}
			// NOTE: log.Fatal exits immediately; output still buffered in
			// writer or in flight in the workers is lost on this path.
			log.Fatal(err)
		}
		records = append(records, record)
		if len(records) == *batchSize {
			queue <- records
			// Start a fresh slice: the worker now owns the batch just
			// sent. Truncating with records[:0] would keep the same
			// backing array and overwrite records the worker is still
			// processing.
			records = nil
		}
	}
	// Send the remainder (possibly empty, as before) as the final batch.
	queue <- records

	// Shut down in pipeline order: no more work, wait for the workers to
	// finish, no more results, wait for the writer to finish.
	close(queue)
	wg.Wait()
	close(results)
	<-done
}