Example #1
0
// createMetadata builds the thrift.FileMetaData value describing the file.
//
// At present the result holds only the root schema element together with
// empty row-group and key/value lists; the receiver's columns are not
// consulted.
//
// TODO: append one thrift.SchemaElement per column (name, type, repetition
// type) and record the row groups once column chunks are written.
func (s *Schema) createMetadata() *thrift.FileMetaData {
	// NOTE(review): the root advertises one child, but no child element is
	// appended to the schema list below — confirm this is intended.
	rootChildren := int32(1)

	// The root of the schema does not have to have a repetition type.
	// All the other elements do.
	root := thrift.NewSchemaElement()
	root.Name = "root"
	root.NumChildren = &rootChildren

	return &thrift.FileMetaData{
		Version:          0,
		Schema:           []*thrift.SchemaElement{root},
		RowGroups:        []*thrift.RowGroup{},
		KeyValueMetadata: []*thrift.KeyValue{},
		CreatedBy:        strptr("go-0.1"), // go-parquet version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
	}
}
Example #2
0
// AddColumnFromSpec adds a column described by a textual specification.
//
// The expected format is
//
//	name: type [converted type] repetition
//
// where the converted type is optional. The name is everything before the
// first ':' with surrounding whitespace removed; the remaining elements are
// separated by any amount of whitespace.
//
// It returns ErrBadFormat when the ':' separator is missing or the name is
// empty, and a descriptive error when an element cannot be parsed or the
// number of elements is not 2 or 3. On error the schema is left unmodified.
//
// NOTE(review): assumes s.columns has already been initialized — confirm
// against the Schema constructor.
func (s *Schema) AddColumnFromSpec(format string) error {
	parts := strings.SplitN(format, ":", 2)
	if len(parts) != 2 {
		return ErrBadFormat
	}

	// Trim so that "name : type ..." is not stored under the key "name ".
	name := strings.TrimSpace(parts[0])
	if name == "" {
		return ErrBadFormat
	}

	// strings.Fields tolerates repeated spaces and tabs between elements,
	// which a plain split on " " would turn into empty elements.
	fields := strings.Fields(parts[1])
	if len(fields) == 0 {
		return fmt.Errorf("could not add column: invalid number of elements in format")
	}

	el := thrift.NewSchemaElement()
	el.Name = name

	columnType, err := thrift.TypeFromString(normalizeType(fields[0]))
	if err != nil {
		return fmt.Errorf("could not add column: bad type: %s (%s)", err, fields[0])
	}
	el.Type = &columnType

	switch len(fields) {
	case 2:
		// type + repetition
		repetitionType, err := thrift.FieldRepetitionTypeFromString(normalizeType(fields[1]))
		if err != nil {
			return fmt.Errorf("could not add column: bad repetition type: %s", err)
		}
		el.RepetitionType = &repetitionType

	case 3:
		// type + converted type + repetition
		convertedType, err := thrift.ConvertedTypeFromString(normalizeType(fields[1]))
		if err != nil {
			return fmt.Errorf("could not add column: bad converted type: %s", err)
		}

		repetitionType, err := thrift.FieldRepetitionTypeFromString(normalizeType(fields[2]))
		if err != nil {
			return fmt.Errorf("could not add column: bad repetition type: %s", err)
		}

		el.ConvertedType = &convertedType
		el.RepetitionType = &repetitionType

	default:
		return fmt.Errorf("could not add column: invalid number of elements in format")
	}

	s.columns[el.Name] = ColumnDescriptor{
		SchemaElement: el,
	}

	return nil
}
Example #3
0
// AddColumnFromThriftSchema adds a column described by a generic spec map.
//
// The map must contain the keys "name" (a string) and "type". The type is
// matched against the primitive type names listed at
// https://avro.apache.org/docs/1.8.0/spec.html#schema_primitive
// and mapped to the corresponding schema element type and, where set below,
// converted type.
//
// An error is returned when a required key is missing, when the name is not
// a string, or when the type is not one of the supported names. On error the
// schema is left unmodified.
func (s *Schema) AddColumnFromThriftSchema(spec map[string]interface{}) error {
	typ, ok := spec["type"]
	if !ok {
		return fmt.Errorf("invalid spec: key 'type', not found")
	}

	rawName, ok := spec["name"]
	if !ok {
		return fmt.Errorf("invalid spec: key 'name', not found")
	}

	// A checked assertion: a non-string name is a caller error, not a panic.
	name, ok := rawName.(string)
	if !ok {
		return fmt.Errorf("invalid spec: key 'name' must be a string, got %T", rawName)
	}

	el := thrift.NewSchemaElement()
	el.Name = name

	switch typ {
	case "null":
		// Accepted, but the element's Type is left unset.
	case "boolean":
		el.Type = typeBoolean
	case "int":
		el.Type = typeInt32
		el.ConvertedType = ctInt32
	case "long":
		el.Type = typeInt64
		el.ConvertedType = ctInt64
	case "float":
		el.Type = typeFloat
	case "double":
		el.Type = typeDouble
	case "bytes":
		el.Type = typeByteArray
	case "string":
		el.Type = typeByteArray
		el.ConvertedType = ctUTF8
	default:
		// %v rather than %s: typ is an interface value and need not be a string.
		return fmt.Errorf("unsupported type: %v", typ)
	}

	s.columns[el.Name] = ColumnDescriptor{
		SchemaElement: el,
	}

	return nil
}