@@ -45,7 +45,7 @@ use crate::utils::{
4545 grouping_set_expr_count, grouping_set_to_exprlist, split_conjunction,
4646} ;
4747use crate :: {
48- BinaryExpr , CreateMemoryTable , CreateView , Execute , Expr , ExprSchemable ,
48+ BinaryExpr , CreateMemoryTable , CreateView , Execute , Expr , ExprSchemable , GroupingSet ,
4949 LogicalPlanBuilder , Operator , Prepare , TableProviderFilterPushDown , TableSource ,
5050 WindowFunctionDefinition , build_join_schema, expr_vec_fmt, requalify_sides_if_needed,
5151} ;
@@ -3595,11 +3595,12 @@ impl Aggregate {
35953595 . into_iter ( )
35963596 . map ( |( q, f) | ( q, f. as_ref ( ) . clone ( ) . with_nullable ( true ) . into ( ) ) )
35973597 . collect :: < Vec < _ > > ( ) ;
3598+ let max_ordinal = max_grouping_set_duplicate_ordinal ( & group_expr) ;
35983599 qualified_fields. push ( (
35993600 None ,
36003601 Field :: new (
36013602 Self :: INTERNAL_GROUPING_ID ,
3602- Self :: grouping_id_type ( qualified_fields. len ( ) ) ,
3603+ Self :: grouping_id_type ( qualified_fields. len ( ) , max_ordinal ) ,
36033604 false ,
36043605 )
36053606 . into ( ) ,
@@ -3685,15 +3686,24 @@ impl Aggregate {
36853686 }
36863687
36873688 /// Returns the data type of the grouping id.
3688- /// The grouping ID value is a bitmask where each set bit
3689- /// indicates that the corresponding grouping expression is
3690- /// null
3691- pub fn grouping_id_type ( group_exprs : usize ) -> DataType {
3692- if group_exprs <= 8 {
3689+ ///
3690+ /// The grouping ID packs two pieces of information into a single integer:
3691+ /// - The low `group_exprs` bits are the semantic bitmask (a set bit means the
3692+ /// corresponding grouping expression is NULL for this grouping set).
3693+ /// - The bits at position `group_exprs` and above encode a duplicate ordinal that
3694+ /// distinguishes multiple occurrences of the same grouping set pattern.
3695+ ///
3696+ /// `max_ordinal` is the highest ordinal value that will appear (0 when there
3697+ /// are no duplicate grouping sets). The type is chosen to be the smallest
3698+ /// unsigned integer that can represent both parts.
3699+ pub fn grouping_id_type ( group_exprs : usize , max_ordinal : usize ) -> DataType {
3700+ let ordinal_bits = usize:: BITS as usize - max_ordinal. leading_zeros ( ) as usize ;
3701+ let total_bits = group_exprs + ordinal_bits;
3702+ if total_bits <= 8 {
36933703 DataType :: UInt8
3694- } else if group_exprs <= 16 {
3704+ } else if total_bits <= 16 {
36953705 DataType :: UInt16
3696- } else if group_exprs <= 32 {
3706+ } else if total_bits <= 32 {
36973707 DataType :: UInt32
36983708 } else {
36993709 DataType :: UInt64
@@ -3702,21 +3712,36 @@ impl Aggregate {
37023712
37033713 /// Internal column used when the aggregation is a grouping set.
37043714 ///
3705- /// This column contains a bitmask where each bit represents a grouping
3706- /// expression. The least significant bit corresponds to the rightmost
3707- /// grouping expression. A bit value of 0 indicates that the corresponding
3708- /// column is included in the grouping set, while a value of 1 means it is excluded.
3715+ /// This column packs two values into a single unsigned integer:
3716+ ///
3717+ /// - **Low bits (positions 0 .. n-1)**: a semantic bitmask where each bit
3718+ /// represents one of the `n` grouping expressions. The least significant
3719+ /// bit corresponds to the rightmost grouping expression. A `1` bit means
3720+ /// the corresponding column is replaced with `NULL` for this grouping set;
3721+ /// a `0` bit means it is included.
3722+ /// - **High bits (positions n and above)**: a *duplicate ordinal* that
3723+ /// distinguishes multiple occurrences of the same semantic grouping set
3724+ /// pattern within a single query. The ordinal is `0` for the first
3725+ /// occurrence, `1` for the second, and so on.
3726+ ///
3727+ /// The integer type is chosen by [`Self::grouping_id_type`] to be the
3728+ /// smallest `UInt8 / UInt16 / UInt32 / UInt64` that can represent both
3729+ /// parts.
37093730 ///
3710- /// For example, for the grouping expressions CUBE(a, b), the grouping ID
3711- /// column will have the following values:
3731+ /// For example, for the grouping expressions CUBE(a, b) (no duplicates),
3732+ /// the grouping ID column will have the following values:
37123733 /// 0b00: Both `a` and `b` are included
37133734 /// 0b01: `b` is excluded
37143735 /// 0b10: `a` is excluded
37153736 /// 0b11: Both `a` and `b` are excluded
37163737 ///
3717- /// This internal column is necessary because excluded columns are replaced
3718- /// with `NULL` values. To handle these cases correctly, we must distinguish
3719- /// between an actual `NULL` value in a column and a column being excluded from the set.
3738+ /// When the same set appears twice and `n = 2`, the duplicate ordinal is
3739+ /// packed into bit 2:
3740+ /// first occurrence: `0b0_01` (ordinal = 0, mask = 0b01)
3741+ /// second occurrence: `0b1_01` (ordinal = 1, mask = 0b01)
3742+ ///
3743+ /// The GROUPING function always masks the value with `(1 << n) - 1` before
3744+ /// interpreting it, so the ordinal bits are invisible to user-facing SQL.
37203745 pub const INTERNAL_GROUPING_ID : & ' static str = "__grouping_id" ;
37213746}
37223747
@@ -3737,6 +3762,24 @@ impl PartialOrd for Aggregate {
37373762 }
37383763}
37393764
3765+ /// Returns the highest duplicate ordinal across all grouping sets in `group_expr`.
3766+ ///
3767+ /// The ordinal for each occurrence of a grouping set pattern is its 0-based
3768+ /// index among identical entries. For example, if the same set appears three
3769+ /// times, the ordinals are 0, 1, 2 and this function returns 2.
3770+ /// Returns 0 when no grouping set is duplicated.
3771+ fn max_grouping_set_duplicate_ordinal ( group_expr : & [ Expr ] ) -> usize {
3772+ if let Some ( Expr :: GroupingSet ( GroupingSet :: GroupingSets ( sets) ) ) = group_expr. first ( ) {
3773+ let mut counts: HashMap < & [ Expr ] , usize > = HashMap :: new ( ) ;
3774+ for set in sets {
3775+ * counts. entry ( set) . or_insert ( 0 ) += 1 ;
3776+ }
3777+ counts. into_values ( ) . max ( ) . unwrap_or ( 0 ) . saturating_sub ( 1 )
3778+ } else {
3779+ 0
3780+ }
3781+ }
3782+
37403783/// Checks whether any expression in `group_expr` contains `Expr::GroupingSet`.
37413784fn contains_grouping_set ( group_expr : & [ Expr ] ) -> bool {
37423785 group_expr
@@ -5053,6 +5096,14 @@ mod tests {
50535096 ) ;
50545097 }
50555098
#[test]
fn grouping_id_type_accounts_for_duplicate_ordinal_bits() {
    // Each case is (grouping columns, max duplicate ordinal, expected id type).
    // Eight columns alone fill a UInt8 exactly; a single extra bit for the
    // duplicate ordinal no longer fits and forces the next wider type.
    let cases = [(8, 0, DataType::UInt8), (8, 1, DataType::UInt16)];

    for (group_exprs, max_ordinal, expected) in cases {
        assert_eq!(
            Aggregate::grouping_id_type(group_exprs, max_ordinal),
            expected
        );
    }
}
5106+
50565107 #[ test]
50575108 fn test_filter_is_scalar ( ) {
50585109 // test empty placeholder
0 commit comments