Пример #1
0
// readCsv streams CSV records into ch and closes ch once reading stops,
// either at end of input or at the first read error (which is printed).
//
// Records are read from the file named by inputFn, or from os.Stdin when
// inputFn is empty. The reader is configured from the package-level settings
// strictLen, inputSep and lazyQuotes. If the input file cannot be opened, the
// error is printed and the process exits with status 1.
func readCsv(ch chan []string) {
	var reader *csv.Reader
	if inputFn != "" {
		file, err := os.Open(inputFn)
		if err != nil {
			fmt.Println("Error:", err)
			os.Exit(1)
		}
		defer file.Close()
		reader = csv.NewReader(file)
	} else {
		reader = csv.NewReader(os.Stdin)
	}

	// The first rune of inputSep is the field delimiter.
	reader.Comma, _ = utf8.DecodeRuneInString(inputSep)
	reader.LazyQuotes = lazyQuotes
	if !strictLen {
		// A negative value disables the per-record field count check.
		reader.FieldsPerRecord = -1
	}

	for {
		record, err := reader.Read()
		if err != nil {
			// End of input is the normal way out; anything else is reported.
			if err != io.EOF {
				fmt.Println("Error:", err)
			}
			close(ch)
			return
		}
		ch <- record
	}
}
Пример #2
0
// start reads the header record from reader and then sends one converted row
// per subsequent record on t.rows until t.stopped is set or a read fails.
//
// Any read error (end of input included) is passed to t.handleErr. On return,
// t.rows is closed and then t.Stop is called.
func (t *table) start(reader *csv.Reader) {
	defer t.Stop()
	defer close(t.rows)

	headers, err := reader.Read()
	if err != nil {
		if parseErr, isParseErr := err.(*csv.ParseError); isParseErr {
			// parseErr points at the same value as err, so the hint added here
			// is part of what handleErr receives.
			parseErr.Err = fmt.Errorf("%s. %s", parseErr.Err, "This can happen when the CSV is malformed, or when the wrong delimiter is used")
		}
		t.handleErr(err)
		return
	}

	// Every following record must have as many fields as the header.
	reader.FieldsPerRecord = len(headers)
	for !t.stopped {
		line, readErr := reader.Read()
		if readErr != nil {
			t.handleErr(readErr)
			return
		}
		t.rows <- convertLineToRow(line, headers)
	}
}
Пример #3
0
// countFields reads every record from csvReader and tallies the field counts
// in histogram: histogram[k] is incremented once for each record that has
// exactly k fields.
//
// The reader's FieldsPerRecord is set to -1 so that records of differing
// lengths are accepted. A record whose field count is len(histogram) or more
// cannot be tallied; a warning is printed to standard output for it instead.
// Reading stops at the first error returned by the reader (end of input
// included); that error is not reported to the caller.
func countFields(csvReader *csv.Reader, histogram []int64) {
	histogramLen := len(histogram)
	csvReader.FieldsPerRecord = -1 // Tell the CSV reader to expect an unknown field count
	for {
		strs, err := csvReader.Read()
		if err != nil {
			return
		}
		switch f := len(strs); {
		case f >= histogramLen:
			// Printf rather than Print: Print inserts no space between a
			// string operand and a number, which ran the message together.
			fmt.Printf("\nWARNING: %d < %d histogram length.", histogramLen, f)
		case f > 0:
			// There's no such thing as a 0 length field record.
			histogram[f]++
		}
	}
}
Пример #4
0
// Run is the block's main loop. It serves the block's channels until a
// message arrives on b.quit:
//
//   - b.inrule:    reads "Path" and "Headers" from the rule and compiles Path
//     into an expression tree.
//   - b.in:        evaluates the tree against the message and starts a new CSV
//     reader over the resulting string or []byte.
//   - b.inpoll:    reads one record from the current CSV reader and sends it
//     on b.out as a map keyed by header name.
//   - b.queryrule: replies with the current Path and Headers.
func (b *ParseCSV) Run() {
	var (
		tree      *jee.TokenTree
		path      string
		err       error
		headers   []string
		csvReader *csv.Reader
	)

	for {
		select {
		case <-b.quit:
			// quit the block
			return

		case rule := <-b.inrule:
			// set a parameter of the block
			if path, err = util.ParseString(rule, "Path"); err != nil {
				b.Error(err)
				continue
			}
			token, lexErr := jee.Lexer(path)
			if lexErr != nil {
				b.Error(lexErr)
				continue
			}
			if tree, err = jee.Parser(token); err != nil {
				b.Error(err)
				continue
			}
			if headers, err = util.ParseArrayString(rule, "Headers"); err != nil {
				b.Error(err)
				continue
			}

		case msg := <-b.in:
			// deal with inbound data; nothing can be evaluated before a rule
			// has produced a tree
			if tree == nil {
				continue
			}
			evaluated, evalErr := jee.Eval(tree, msg)
			if evalErr != nil {
				b.Error(evalErr)
				continue
			}

			var data string
			switch value := evaluated.(type) {
			case string:
				data = value
			case []byte:
				data = string(value)
			default:
				b.Error("data should be a string or a []byte")
				continue
			}

			next := csv.NewReader(strings.NewReader(data))
			next.TrimLeadingSpace = true
			// allow records to have variable numbers of fields
			next.FieldsPerRecord = -1
			csvReader = next

		case <-b.inpoll:
			if csvReader == nil {
				b.Error("this block needs data to be pollable")
				continue
			}
			record, readErr := csvReader.Read()
			if readErr != nil && readErr != io.EOF {
				b.Error(readErr)
				continue
			}
			// At io.EOF record is empty, so an empty row is emitted.
			row := make(map[string]interface{})
			for fieldIndex, field := range record {
				if fieldIndex < len(headers) {
					row[headers[fieldIndex]] = field
				} else {
					// No header for this column: key it by its index.
					row[strconv.Itoa(fieldIndex)] = field
				}
			}
			b.out <- row

		case reply := <-b.queryrule:
			// deal with a query request
			reply <- map[string]interface{}{
				"Path":    path,
				"Headers": headers,
			}
		}
	}
}
Пример #5
0
// importCSV copies CSV rows from filename, or from os.Stdin when filename is
// empty, into schema.tableName on the database reached through connStr.
//
// The first rune of delimiter is the field separator. The column list comes
// from parseColumns, and every record must have that many fields. When
// reading from a file a progress bar is fed with the bytes read. On success a
// summary is printed and the import is committed. On failure the returned
// error is prefixed with a line number derived from the success and failure
// counts.
func importCSV(filename string, connStr string, schema string, tableName string, ignoreErrors bool, skipHeader bool, fields string, delimiter string) error {
	db, err := connect(connStr, schema)
	if err != nil {
		return err
	}
	defer db.Close()

	fromFile := filename != ""

	var (
		reader *csv.Reader
		bar    *pb.ProgressBar
	)
	if fromFile {
		file, openErr := os.Open(filename)
		if openErr != nil {
			return openErr
		}
		defer file.Close()

		// Everything the CSV reader consumes is also written to the bar.
		bar = NewProgressBar(file)
		reader = csv.NewReader(io.TeeReader(file, bar))
	} else {
		reader = csv.NewReader(os.Stdin)
	}

	comma, _ := utf8.DecodeRuneInString(delimiter)
	reader.Comma = comma
	reader.LazyQuotes = true

	columns, err := parseColumns(reader, skipHeader, fields)
	if err != nil {
		return err
	}
	reader.FieldsPerRecord = len(columns)

	importer, err := NewCSVImport(db, schema, tableName, columns)
	if err != nil {
		return err
	}

	var success, failed int
	if fromFile {
		bar.Start()
	}
	err, success, failed = copyCSVRows(importer, reader, ignoreErrors, delimiter, columns)
	if fromFile {
		bar.Finish()
	}

	if err != nil {
		lineNumber := success + failed
		if !skipHeader {
			lineNumber++
		}
		return errors.New(fmt.Sprintf("line %d: %s", lineNumber, err))
	}

	summary := fmt.Sprintf("%d rows imported into %s.%s", success, schema, tableName)
	fmt.Println(summary)
	if ignoreErrors && failed > 0 {
		skipped := fmt.Sprintf("%d rows could not be imported into %s.%s and have been written to stderr.", failed, schema, tableName)
		fmt.Println(skipped)
	}
	return importer.Commit()
}
Пример #6
0
// StreamCSV reads the CSV file filename and sends each data record on
// csvChannel as a map from column name to value, then closes csvChannel.
//
// The file is decompressed according to its extension (compared
// case-insensitively): ".bz2" with bzip2, ".xz" with xz; anything else is
// read as plain CSV. The first record supplies the column names. Empty values
// are left out of the emitted maps.
//
// Failing to open the file, to set up xz decompression, or to read the header
// record terminates the process through log.Fatal. A read error after the
// header is logged and ends the stream early.
func StreamCSV(csvChannel chan<- map[string]string, filename string) {
	file, err := os.Open(filename)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// Pick the decoder from the file extension.
	var ioReader io.Reader
	switch lower := strings.ToLower(filename); {
	case strings.HasSuffix(lower, ".bz2"):
		ioReader = bzip2.NewReader(file)
	case strings.HasSuffix(lower, ".xz"):
		// Reads from the file are buffered before they reach the decoder.
		if ioReader, err = xz.NewReader(bufio.NewReader(file), 0); err != nil {
			log.Fatal(err)
		}
	default:
		ioReader = file
	}

	csvReader := csv.NewReader(ioReader)
	// 0 makes the first record fix the field count for every later record,
	// so indexing header by a record's field position below is always valid.
	csvReader.FieldsPerRecord = 0

	// The first record holds the column names.
	header, err := csvReader.Read()
	if err != nil {
		log.Fatalf("ERROR: %v\n", err)
	}

	for {
		rec, err := csvReader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			// Report the failure instead of ending the stream silently, which
			// would look like a complete read to the consumer.
			log.Printf("ERROR: reading %s: %v", filename, err)
			break
		}
		line := make(map[string]string)
		for col, value := range rec {
			if value == "" {
				continue
			}
			line[header[col]] = value
		}
		csvChannel <- line
	}
	close(csvChannel)
}
Пример #7
0
// ImportDictionaries loads every "*.txt" and "*.txt.gz" file found directly in
// DataDirectory and returns the parsed items grouped by dictionary name (the
// file name without its ".txt" / ".txt.gz" suffix).
//
// A file that cannot be opened, decompressed or parsed is logged and skipped.
// If DataDirectory itself cannot be listed, the error is logged and an empty
// map is returned.
func ImportDictionaries() map[string][]*models.SuggestItem {
	var itemMap = make(map[string][]*models.SuggestItem)

	fileInfo, err := ioutil.ReadDir(DataDirectory)
	if err != nil {
		log.Print(err)
		return itemMap
	}
	numberOfDictionaries := 0
	for _, file := range fileInfo {
		name := file.Name()
		gzipped := strings.HasSuffix(name, ".txt.gz")
		if file.IsDir() || !(gzipped || strings.HasSuffix(name, ".txt")) {
			continue
		}
		dictionaryFile := fmt.Sprintf("%s%s%s", DataDirectory, string(os.PathSeparator), name)
		dictionaryName := strings.TrimSuffix(strings.TrimSuffix(name, ".gz"), ".txt")
		log.Printf("Importing dictionary %s from file %s", dictionaryName, dictionaryFile)

		// Each file is handled in its own call so its handles are released as
		// soon as it has been read, not when the whole directory is done.
		items, err := readDictionaryFile(dictionaryFile, gzipped)
		if err != nil {
			log.Print(err)
			continue
		}
		// Only touch the map when there is something to add, so a dictionary
		// without any valid row does not get an entry.
		if len(items) > 0 {
			itemMap[dictionaryName] = append(itemMap[dictionaryName], items...)
		}
		numberOfDictionaries++
	}

	log.Printf("Imported %d dictionaries", numberOfDictionaries)
	return itemMap
}

// readDictionaryFile parses one dictionary file of '|'-separated
// "term|weight" records, decompressing it with gzip first when gzipped is
// true. Records whose weight is not an integer are skipped.
func readDictionaryFile(path string, gzipped bool) ([]*models.SuggestItem, error) {
	csvFile, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer csvFile.Close()

	var csvReader *csv.Reader
	if gzipped {
		gzipReader, gzerr := gzip.NewReader(csvFile)
		if gzerr != nil {
			return nil, gzerr
		}
		defer gzipReader.Close()
		csvReader = csv.NewReader(gzipReader)
	} else {
		csvReader = csv.NewReader(csvFile)
	}

	csvReader.FieldsPerRecord = 2
	csvReader.Comma = '|'
	csvReader.LazyQuotes = true
	csvReader.TrimLeadingSpace = true

	rawCSVdata, err := csvReader.ReadAll()
	if err != nil {
		return nil, err
	}

	items := make([]*models.SuggestItem, 0, len(rawCSVdata))
	for _, each := range rawCSVdata {
		weight, err := strconv.Atoi(each[1])
		if err != nil {
			continue
		}
		suggestItem := new(models.SuggestItem)
		suggestItem.Term = each[0]
		suggestItem.Weight = weight
		items = append(items, suggestItem)
	}
	return items, nil
}
Пример #8
0
// load bulk-inserts the records read from cr into table tbl of db.
//
// The first record is taken as the list of column names; every later record
// must have the same number of fields. Records are grouped into blocks of
// batchLen rows and handed to runtime.GOMAXPROCS(-1) worker goroutines. Each
// worker inserts through its own transaction and commits after every 1000
// rows it has written, and once more when the input is exhausted.
//
// Malformed records (*csv.ParseError) are logged and skipped. Any other read
// error stops the reading. A throughput line is written to os.Stderr at the
// end. The first worker error is returned if there is one, otherwise the
// read error that stopped the input, otherwise nil.
func load(db *sql.DB, tbl string, cr *csv.Reader) error {
	head, err := cr.Read()
	if err != nil {
		return err
	}
	cr.FieldsPerRecord = len(head)
	marks := make([]string, len(head))
	for i := range marks {
		marks[i] = "?"
	}
	// NOTE(review): tbl and the header names go into the statement unescaped;
	// they must come from a trusted source.
	qry := "INSERT INTO " + tbl + " (" + strings.Join(head, ",") + ") VALUES (" + strings.Join(marks, ",") + ")"
	Log.Info("insert", "qry", qry)

	var wg sync.WaitGroup
	conc := runtime.GOMAXPROCS(-1)
	blocks := make(chan [][]string, conc)
	errs := make(chan error, conc) // one slot per worker, so sends never block
	rowCount := new(int32)

	// worker drains blocks. Its err is a named result local to each call so
	// that workers do not race on a shared variable, and so that the deferred
	// cleanup can see how the worker ended.
	worker := func() (err error) {
		var (
			tx *sql.Tx
			st *sql.Stmt
		)
		defer func() {
			if err == nil {
				return
			}
			// Do not leave a statement or an open transaction behind.
			if st != nil {
				st.Close()
			}
			if tx != nil {
				tx.Rollback()
			}
		}()

		n := 0
		values := make([]interface{}, len(marks))
		for block := range blocks {
			for _, row := range block {
				for i, v := range row {
					values[i] = v
				}
				if tx == nil {
					// Statement of the previous, already committed transaction.
					if st != nil {
						st.Close()
						st = nil
					}
					if tx, err = db.Begin(); err != nil {
						return err
					}
					if st, err = tx.Prepare(qry); err != nil {
						return err
					}
				}
				if _, err = st.Exec(values...); err != nil {
					return fmt.Errorf("error inserting %q with %q: %v", row, qry, err)
				}
				n++
				atomic.AddInt32(rowCount, 1)
				if n%1000 == 0 {
					if err = tx.Commit(); err != nil {
						return err
					}
					tx = nil
					Log.Info("commit", "n", n, "rowCount", atomic.LoadInt32(rowCount))
				}
			}
		}
		Log.Info("commit", "n", n, "rowCount", atomic.LoadInt32(rowCount))
		if st != nil {
			st.Close()
			st = nil
		}
		if tx != nil {
			return tx.Commit()
		}
		return nil
	}

	for i := 0; i < conc; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			errs <- worker()
		}()
	}

	// workersDone is closed once every worker has returned. Before blocks is
	// closed that can only mean all of them failed, and the reader must then
	// stop instead of blocking forever on a channel nobody receives from.
	workersDone := make(chan struct{})
	go func() {
		wg.Wait()
		close(workersDone)
	}()
	send := func(block [][]string) bool {
		select {
		case blocks <- block:
			return true
		case <-workersDone:
			return false
		}
	}

	var (
		block   [][]string
		readErr error
	)
	t := time.Now()
	for {
		row, err := cr.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			Log.Error("read row", "error", err)
			if _, malformed := err.(*csv.ParseError); malformed {
				// A bad record: skip it and carry on with the next one.
				continue
			}
			// Any other error comes from the underlying reader; retrying
			// could repeat it forever, so stop and report it.
			readErr = err
			break
		}
		if block == nil {
			block = make([][]string, 0, batchLen)
		}
		block = append(block, row)
		if len(block) == batchLen {
			delivered := send(block)
			block = nil
			if !delivered {
				break
			}
		}
	}
	if len(block) > 0 {
		send(block)
	}
	close(blocks)
	wg.Wait()

	n, d := atomic.LoadInt32(rowCount), time.Since(t)
	// d.Seconds() keeps the fractional part; dividing by whole seconds
	// truncated it and divided by zero for runs shorter than a second.
	fmt.Fprintf(os.Stderr, "Written %d rows under %s: %.3f rows/sec\n",
		n, d, float64(n)/d.Seconds())

	close(errs)
	for err := range errs {
		if err != nil {
			return err
		}
	}
	return readErr
}