@@ -35,8 +35,6 @@ namespace parquet.format;
3535// ColumnMetaData.is_fully_dict_encoded.
3636// 4. ColumnMetaData.path_in_schema is removed since it can be derived from the schema.
3737// 5. ConvertedType is fully dropped as it is superseded by LogicalType.
38- // 6. Offset and column indexes are removed since they are small and their offsets
39- // alone take comparable space.
4038
4139/* *
4240 * Types supported by Parquet. These types are intended to be used in combination
@@ -95,19 +93,19 @@ enum Encoding : byte {
9593 /* *
9694 * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the
9795 * plain type.
98- * in a data page use RLE_DICTIONARY instead.
99- * in a Dictionary page use PLAIN instead
96+ * In a data page use RLE_DICTIONARY instead.
97+ * In a Dictionary page use PLAIN instead.
10098 */
101- PLAIN_DICTIONARY = 2 ,
99+ // PLAIN_DICTIONARY = 2,
102100
103101 /* * Group packed run length encoding. Usable for definition/repetition levels
104102 * encoding and Booleans (on one bit: 0 is false; 1 is true.)
105103 */
106104 RLE = 3 ,
107105
108- /* * Bit packed encoding. This can only be used if the data has a known max
106+ /* * Deprecated: Bit packed encoding. This can only be used if the data has a known max
109107 * width. Usable for definition/repetition levels encoding.
110- * This encoding is deprecated and is replaced by the RLE/bit-packing hybrid encoding.
108+ * This encoding is replaced by the RLE/bit-packing hybrid encoding.
111109 */
112110 // BIT_PACKED = 4,
113111
@@ -176,9 +174,6 @@ table Empty {}
176174 * Scale must be zero or a positive integer less than or equal to the precision.
177175 * Precision must be a non-zero positive integer.
178176 *
179- * To maintain forward-compatibility in v1, implementations using this logical
180- * type must also set scale and precision on the annotated SchemaElement.
181- *
182177 * Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY.
183178 */
184179table DecimalOptions {
@@ -275,9 +270,13 @@ table GeographyType {
275270
276271/* *
277272 * LogicalType annotations to replace ConvertedType.
273+ *
274+ * Types with no parameters use `Empty`. To add parameters later, append a new
275+ * union member (e.g., StringTypeV2:StringOptions); new readers remain
276+ * backward-compatible with old files.
278277 */
279278union LogicalType {
280- StringType :Empty ,
279+ StringType :Empty ,
281280 MapType :Empty ,
282281 ListType :Empty ,
283282 EnumType :Empty ,
@@ -298,6 +297,8 @@ union LogicalType {
298297
299298table Statistics {
300299 null_count : long = null ;
300+ /* * Count of distinct values occurring. */
301+ distinct_count : long = null ;
301302 // Store min/max values as fixed-width entities depending on the physical type.
302303 // If min_len/max_len is present then the corresponding min/max value is present.
303304 //
@@ -411,26 +412,23 @@ union ColumnOrder {
411412 * the nodes are listed in depth first traversal order.
412413 */
413414table SchemaElement {
414- /* * Name of the field in the schema */
415- name : string ;
416-
417415 /* * Data type for this field. Not set if the current element is a non-leaf node */
418416 type : Type = null ;
419417
420- /* * repetition of the field. The root of the schema does not have a repetition_type.
421- * All other nodes must have one */
422- repetition_type : FieldRepetitionType = null ;
423-
424- /* * The logical type of this SchemaElement */
425- logical_type : LogicalType ;
426-
427418 /* * If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
428419 * Otherwise, if specified, this is the maximum bit length to store any of the values.
429420 * (e.g. a low cardinality INT col could have this set to 3). Note that this is
430421 * in the schema, and therefore fixed for the entire file.
431422 */
432423 type_length : int = null ;
433424
425+ /* * Repetition of the field. The root of the schema does not have a repetition_type.
426+ * All other nodes must have one. */
427+ repetition_type : FieldRepetitionType = null ;
428+
429+ /* * Name of the field in the schema */
430+ name : string ;
431+
434432 /* * Nested fields. Since thrift does not support nested fields,
435433 * the nesting is flattened to a single list by a depth-first traversal.
436434 * The children count is used to construct the nested relationship.
@@ -442,7 +440,12 @@ table SchemaElement {
442440 * original field id in the parquet schema
443441 */
444442 field_id : int = null ;
445- column_order : ColumnOrder ; // only present for leaf nodes
443+
444+ /* * The logical type of this SchemaElement */
445+ logical_type : LogicalType ;
446+
447+ /* * Column ordering for leaf nodes, used to interpret min/max statistics */
448+ column_order : ColumnOrder ;
446449}
447450
448451enum PageType : byte {
@@ -489,7 +492,9 @@ table ColumnMetadata {
489492 /* * optional statistics for this column chunk */
490493 statistics : Statistics ;
491494
492- /* * Indicates whether the column chunk pages are fully dictionary encoded. */
495+ /* * True if every data page in this column chunk is dictionary-encoded
496+ * (no fallback). Replaces Thrift encoding_stats.
497+ */
493498 is_fully_dict_encoded : bool ;
494499
495500 /* * Optional Bloom filter information for this column chunk */
@@ -517,7 +522,7 @@ table ColumnChunk {
517522 * Note: while marked as optional, this field is in fact required by most major
518523 * Parquet implementations. As such, writers MUST populate this field.
519524 **/
520- meta_data : ColumnMetadata ;
525+ metadata : ColumnMetadata ;
521526
522527 /* * Crypto metadata of encrypted columns **/
523528 crypto_metadata : ColumnCryptoMetadata ;
@@ -547,7 +552,7 @@ table RowGroup {
547552 **/
548553 columns : [ColumnChunk ];
549554
550- /* * Total byte size of all the uncompressed column data in this row group **/
555+ /* * Sum of total_uncompressed_size across all columns (uncompressed, encoded) **/
551556 total_byte_size : long ;
552557
553558 /* * Number of rows in this row group **/
@@ -570,22 +575,6 @@ table RowGroup {
570575 ordinal : short = null ;
571576}
572577
573- /* *
574- * Crypto metadata for files with encrypted footer.
575- */
576- table FileCryptoMetaData {
577- /* *
578- * Encryption algorithm. This field is only used for files
579- * with encrypted footer. Files with plaintext footer store algorithm id
580- * inside footer (FileMetaData structure).
581- */
582- encryption_algorithm : EncryptionAlgorithm ;
583-
584- /* * Retrieval metadata of key used for encryption of footer,
585- * and (possibly) columns **/
586- key_metadata : [byte ];
587- }
588-
589578/* *
590579 * Description for file metadata
591580 */
0 commit comments