func (schema *Schema) createMetadata() *thrift.FileMetaData { root_children := int32(1) root := thrift.NewSchemaElement() root.Name = "root" root.NumChildren = &root_children // the root of the schema does not have to have a repetition type. // All the other elements do. elements := []*thrift.SchemaElement{root} //typeint := thrift.Type_INT32 //offset := len(PARQUET_MAGIC) // for row group // for idx, cc := range schema.columns { // cc.FileOffset = int64(offset) // // n, err := cc.Write(w) // // if err != nil { // // return fmt.Errorf("chunk writer: could not write chunk for column %d: %s", idx, err) // // } // // offset += n // cc.MetaData.DataPageOffset = int64(offset) // n1, err := io.Copy(w, &chunks[0]) // if err != nil { // return fmt.Errorf("chunk writer: could not write chunk for column %d: %s", idx, err) // } // log.Println("wrote:", n1) // offset += int(n1) // group.AddColumn(cc) // columnDescriptor := thrift.NewSchemaElement() // columnDescriptor.Name = cc.GetMetaData().PathInSchema[0] // columnDescriptor.NumChildren = nil // columnDescriptor.Type = &typeint // required := thrift.FieldRepetitionType_REQUIRED // columnDescriptor.RepetitionType = &required // schema = append(schema, columnDescriptor) // } // write metadata at then end of the file in thrift format meta := thrift.FileMetaData{ Version: 0, Schema: elements, RowGroups: []*thrift.RowGroup{}, KeyValueMetadata: []*thrift.KeyValue{}, CreatedBy: strptr("go-0.1"), // go-parquet version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) } return &meta }
// AddColumn adds a column with the given specifications format // format is // name: type [original type] REQUIRED func (s *Schema) AddColumnFromSpec(format string) error { values := strings.SplitN(format, ":", 2) if len(values) != 2 { return ErrBadFormat } name := values[0] spec := values[1] el := thrift.NewSchemaElement() el.Name = name values = strings.Split(strings.TrimSpace(spec), " ") originalType, err := thrift.TypeFromString(normalizeType(values[0])) if err != nil { return fmt.Errorf("could not add column: bad type: %s (%s)", err, values[0]) } el.Type = &originalType switch len(values) { case 3: convertedType, err := thrift.ConvertedTypeFromString(normalizeType(values[1])) if err != nil { return fmt.Errorf("could not add column: bad converted type: %s", err) } repetitionType, err := thrift.FieldRepetitionTypeFromString(normalizeType(values[2])) if err != nil { return fmt.Errorf("could not add column: bad repetition type: %s", err) } el.ConvertedType = &convertedType el.RepetitionType = &repetitionType case 2: repetitionType, err := thrift.FieldRepetitionTypeFromString(normalizeType(values[1])) if err != nil { return fmt.Errorf("could not add column: bad repetition type: %s", err) } el.RepetitionType = &repetitionType default: return fmt.Errorf("could not add column: invalid number of elements in format") } s.columns[el.Name] = ColumnDescriptor{ SchemaElement: el, } return nil }
// name, type func (s *Schema) AddColumnFromThriftSchema(spec map[string]interface{}) error { el := thrift.NewSchemaElement() type_, ok := spec["type"] if !ok { return fmt.Errorf("invalid spec: key 'type', not found") } name, ok := spec["name"] if !ok { return fmt.Errorf("invalid spec: key 'name', not found") } el.Name = name.(string) // https://avro.apache.org/docs/1.8.0/spec.html#schema_primitive switch type_ { case "null": case "boolean": el.Type = typeBoolean case "int": el.Type = typeInt32 el.ConvertedType = ctInt32 case "long": el.Type = typeInt64 el.ConvertedType = ctInt64 case "float": el.Type = typeFloat case "double": el.Type = typeDouble case "bytes": el.Type = typeByteArray case "string": el.Type = typeByteArray el.ConvertedType = ctUTF8 default: return fmt.Errorf("unsupported type: %s", type_) } s.columns[el.Name] = ColumnDescriptor{ SchemaElement: el, } return nil }