Skip to content

Commit c938fc7

Browse files
author
Jiayi Wang
committed
address review comments from emkornfield and adamreeve
- Rename parquet3.fbs to parquet.fbs
- Comment out deprecated PLAIN_DICTIONARY encoding (like BIT_PACKED)
- Add distinct_count back to Statistics
- Remove ConvertedType forward-compat constraint from DecimalOptions
- Add backward-compat note for LogicalType Empty union types
- Reorder SchemaElement fields to match Thrift ordering
- Expand is_fully_dict_encoded documentation
- Rename meta_data to metadata in ColumnChunk
- Clarify total_byte_size in RowGroup
- Remove FileCryptoMetaData (encrypted footer layout not yet specified)
1 parent 5a0baf2 commit c938fc7

1 file changed

Lines changed: 30 additions & 41 deletions

File tree

Lines changed: 30 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -35,8 +35,6 @@ namespace parquet.format;
3535
// ColumnMetaData.is_fully_dict_encoded.
3636
// 4. ColumnMetaData.path_in_schema is removed since it can be derived from the schema.
3737
// 5. ConvertedType is fully dropped as it is superseded by LogicalType.
38-
// 6. Offset and column indexes are removed since they are small and their offsets
39-
// alone take comparable space.
4038

4139
/**
4240
* Types supported by Parquet. These types are intended to be used in combination
@@ -95,19 +93,19 @@ enum Encoding : byte {
9593
/**
9694
* Deprecated: Dictionary encoding. The values in the dictionary are encoded in the
9795
* plain type.
98-
* in a data page use RLE_DICTIONARY instead.
99-
* in a Dictionary page use PLAIN instead
96+
* In a data page use RLE_DICTIONARY instead.
97+
* In a Dictionary page use PLAIN instead.
10098
*/
101-
PLAIN_DICTIONARY = 2,
99+
// PLAIN_DICTIONARY = 2,
102100

103101
/** Group packed run length encoding. Usable for definition/repetition levels
104102
* encoding and Booleans (on one bit: 0 is false; 1 is true.)
105103
*/
106104
RLE = 3,
107105

108-
/** Bit packed encoding. This can only be used if the data has a known max
106+
/** Deprecated: Bit packed encoding. This can only be used if the data has a known max
109107
* width. Usable for definition/repetition levels encoding.
110-
* This encoding is deprecated and is replaced by the RLE/bit-packing hybrid encoding.
108+
* This encoding is replaced by the RLE/bit-packing hybrid encoding.
111109
*/
112110
// BIT_PACKED = 4,
113111

@@ -176,9 +174,6 @@ table Empty {}
176174
* Scale must be zero or a positive integer less than or equal to the precision.
177175
* Precision must be a non-zero positive integer.
178176
*
179-
* To maintain forward-compatibility in v1, implementations using this logical
180-
* type must also set scale and precision on the annotated SchemaElement.
181-
*
182177
* Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY.
183178
*/
184179
table DecimalOptions {
@@ -275,9 +270,13 @@ table GeographyType {
275270

276271
/**
277272
* LogicalType annotations to replace ConvertedType.
273+
*
274+
* Types with no parameters use `Empty`. To add parameters later, append a new
275+
* union member (e.g., StringTypeV2:StringOptions); new readers remain
276+
* backward-compatible with old files.
278277
*/
279278
union LogicalType {
280-
StringType:Empty,
279+
StringType:Empty,
281280
MapType:Empty,
282281
ListType:Empty,
283282
EnumType:Empty,
@@ -298,6 +297,8 @@ union LogicalType {
298297

299298
table Statistics {
300299
null_count: long = null;
300+
/** count of distinct values occurring */
301+
distinct_count: long = null;
301302
// Store min/max values as fixed-width entities depending on the physical type.
302303
// If min_len/max_len is present then the corresponding min/max value is present.
303304
//
@@ -411,26 +412,23 @@ union ColumnOrder {
411412
* the nodes are listed in depth first traversal order.
412413
*/
413414
table SchemaElement {
414-
/** Name of the field in the schema */
415-
name: string;
416-
417415
/** Data type for this field. Not set if the current element is a non-leaf node */
418416
type: Type = null;
419417

420-
/** repetition of the field. The root of the schema does not have a repetition_type.
421-
* All other nodes must have one */
422-
repetition_type: FieldRepetitionType = null;
423-
424-
/** The logical type of this SchemaElement */
425-
logical_type: LogicalType;
426-
427418
/** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
428419
* Otherwise, if specified, this is the maximum bit length to store any of the values.
429420
* (e.g. a low cardinality INT col could have this set to 3). Note that this is
430421
* in the schema, and therefore fixed for the entire file.
431422
*/
432423
type_length: int = null;
433424

425+
/** repetition of the field. The root of the schema does not have a repetition_type.
426+
* All other nodes must have one */
427+
repetition_type: FieldRepetitionType = null;
428+
429+
/** Name of the field in the schema */
430+
name: string;
431+
434432
/** Nested fields. Since thrift does not support nested fields,
435433
* the nesting is flattened to a single list by a depth-first traversal.
436434
* The children count is used to construct the nested relationship.
@@ -442,7 +440,12 @@ table SchemaElement {
442440
* original field id in the parquet schema
443441
*/
444442
field_id: int = null;
445-
column_order: ColumnOrder; // only present for leaf nodes
443+
444+
/** The logical type of this SchemaElement */
445+
logical_type: LogicalType;
446+
447+
/** Column ordering for leaf nodes, used to interpret min/max statistics */
448+
column_order: ColumnOrder;
446449
}
447450

448451
enum PageType : byte {
@@ -489,7 +492,9 @@ table ColumnMetadata {
489492
/** optional statistics for this column chunk */
490493
statistics: Statistics;
491494

492-
/** Indicates whether the column chunk pages are fully dictionary encoded. */
495+
/** True if every data page in this column chunk is dictionary-encoded
496+
* (no fallback). Replaces Thrift encoding_stats.
497+
*/
493498
is_fully_dict_encoded: bool;
494499

495500
/** Optional Bloom filter information for this column chunk */
@@ -517,7 +522,7 @@ table ColumnChunk {
517522
* Note: while marked as optional, this field is in fact required by most major
518523
* Parquet implementations. As such, writers MUST populate this field.
519524
**/
520-
meta_data: ColumnMetadata;
525+
metadata: ColumnMetadata;
521526

522527
/** Crypto metadata of encrypted columns **/
523528
crypto_metadata: ColumnCryptoMetadata;
@@ -547,7 +552,7 @@ table RowGroup {
547552
**/
548553
columns: [ColumnChunk];
549554

550-
/** Total byte size of all the uncompressed column data in this row group **/
555+
/** Sum of total_uncompressed_size across all columns (uncompressed, encoded) **/
551556
total_byte_size: long;
552557

553558
/** Number of rows in this row group **/
@@ -570,22 +575,6 @@ table RowGroup {
570575
ordinal: short = null;
571576
}
572577

573-
/**
574-
* Crypto metadata for files with encrypted footer.
575-
*/
576-
table FileCryptoMetaData {
577-
/**
578-
* Encryption algorithm. This field is only used for files
579-
* with encrypted footer. Files with plaintext footer store algorithm id
580-
* inside footer (FileMetaData structure).
581-
*/
582-
encryption_algorithm: EncryptionAlgorithm;
583-
584-
/** Retrieval metadata of key used for encryption of footer,
585-
* and (possibly) columns **/
586-
key_metadata: [byte];
587-
}
588-
589578
/**
590579
* Description for file metadata
591580
*/

0 commit comments

Comments
 (0)