Skip to content

Commit ccfb079

Browse files
committed
- Fixed corner case in Parquet page v1 that didn't account for padded trailing values in bit-packed runs.
1 parent 2d7689a commit ccfb079

2 files changed

Lines changed: 59 additions & 5 deletions

File tree

src/processing/parquet_utils.cpp

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,10 +76,22 @@ uint32_t ReadV1RunHeaderUleb128(tcb::span<const uint8_t> bytes, size_t& offset)
7676
// - present_count = number of decoded definition levels equal to max_def_level.
7777
//
7878
size_t CountPresentFromDefinitionLevelsV1(tcb::span<const uint8_t> def_payload, int32_t num_values, int32_t max_def_level) {
79+
if (num_values < 0) {
80+
throw InvalidInputException("Invalid V1 definition levels: num_values must be non-negative, got " +
81+
std::to_string(num_values));
82+
}
83+
if (max_def_level <= 0) {
84+
throw InvalidInputException("Invalid V1 definition levels: max_def_level must be positive, got " +
85+
std::to_string(max_def_level));
86+
}
87+
7988
// Definition level bit width is ceil(log2(max_def_level + 1)).
89+
// Computes the minimum number of bits needed to represent definition levels from 0..max_def_level.
8090
int bit_width = 0;
81-
while ((1u << bit_width) <= static_cast<uint32_t>(max_def_level)) {
91+
uint32_t def_level_domain = static_cast<uint32_t>(max_def_level);
92+
while (def_level_domain > 0) {
8293
++bit_width;
94+
def_level_domain >>= 1;
8395
}
8496
if (bit_width <= 0) {
8597
throw InvalidInputException("Invalid V1 definition levels: computed bit_width must be positive");
@@ -124,7 +136,7 @@ size_t CountPresentFromDefinitionLevelsV1(tcb::span<const uint8_t> def_payload,
124136
const size_t num_groups = static_cast<size_t>(header >> 1);
125137
const size_t run_len = num_groups * 8;
126138
const size_t remaining = static_cast<size_t>(num_values) - decoded_values;
127-
if (num_groups == 0 || run_len > remaining) {
139+
if (num_groups == 0) {
128140
throw InvalidInputException("Invalid DATA_PAGE_V1 definition levels: invalid bit-packed run length");
129141
}
130142

@@ -148,7 +160,10 @@ size_t CountPresentFromDefinitionLevelsV1(tcb::span<const uint8_t> def_payload,
148160
return v;
149161
};
150162

151-
for (size_t i = 0; i < run_len; ++i) {
163+
// A final bit-packed run may include padded trailing values to complete
164+
// 8-value groups. Decode only the logical values still remaining.
165+
const size_t values_to_decode = std::min(run_len, remaining);
166+
for (size_t i = 0; i < values_to_decode; ++i) {
152167
uint32_t level = ReadPacked(i * static_cast<size_t>(bit_width));
153168
if (level > static_cast<uint32_t>(max_def_level)) {
154169
throw InvalidInputException("Invalid DATA_PAGE_V1 definition levels: decoded level exceeds max_def_level");
@@ -157,9 +172,12 @@ size_t CountPresentFromDefinitionLevelsV1(tcb::span<const uint8_t> def_payload,
157172
++present_count;
158173
}
159174
}
160-
decoded_values += run_len;
175+
decoded_values += values_to_decode;
161176
}
162177
}
178+
if (def_offset != def_payload.size()) {
179+
throw InvalidInputException("Invalid DATA_PAGE_V1 definition levels: trailing bytes after decoding");
180+
}
163181
return present_count;
164182
}
165183

src/processing/parquet_utils_test.cpp

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,10 +279,21 @@ TEST(ParquetUtils, CountPresentFromDefinitionLevelsV1_BitPackedCanonical_0to7_88
279279
// Canonical example from Parquet Encodings.md:
280280
// values 0..7 with bit_width=3 are packed as bytes 0x88, 0xC6, 0xFA.
281281
// Bit-packed header for one group of 8 values is varint((1 << 1) | 1) = 0x03.
282+
//
283+
// Important detail: decoder bit_width is derived from max_def_level at runtime.
284+
// So this payload is valid only when max_def_level implies bit_width=3 (e.g. 7).
285+
// Reusing these same bytes with max_def_level=3 (bit_width=2) is a different
286+
// encoding contract and can leave unread trailing bytes by design.
282287
std::vector<uint8_t> def_payload = {0x03, 0x88, 0xC6, 0xFA};
283288

284289
EXPECT_EQ(1u, CountPresentFromDefinitionLevelsV1(def_payload, 8, 7)); // only value 7
285-
EXPECT_EQ(1u, CountPresentFromDefinitionLevelsV1(def_payload, 8, 3)); // only value 3
290+
291+
// Separate bit_width=2 payload for max_def_level=3.
292+
// This keeps payload bit-width consistent with decoder configuration and
293+
// avoids mixing a 3-bit packed stream with a 2-bit decode expectation.
294+
std::vector<uint32_t> levels_bw2 = {0, 1, 2, 3, 0, 1, 2, 0}; // one value at level 3
295+
auto def_payload_bw2 = MakeBitPackedDefPayload(levels_bw2, 2);
296+
EXPECT_EQ(1u, CountPresentFromDefinitionLevelsV1(def_payload_bw2, 8, 3));
286297
}
287298

288299
TEST(ParquetUtils, CountPresentFromDefinitionLevelsV1_ManualBytes_RleRunLen4_Level1) {
@@ -338,6 +349,31 @@ TEST(ParquetUtils, CountPresentFromDefinitionLevelsV1_RejectsZeroBitPackedGroups
338349
EXPECT_THROW(CountPresentFromDefinitionLevelsV1(def_payload, 8, 1), InvalidInputException);
339350
}
340351

352+
TEST(ParquetUtils, CountPresentFromDefinitionLevelsV1_BitPackedFinalRunAllowsPadding) {
353+
// Corner case: a final bit-packed run is encoded as a full 8-value group,
354+
// while logical num_values ends mid-group. Decode only the logical values
355+
// and ignore padded trailing values in the last group.
356+
// Payload: header=0x03 (1 bit-packed group => 8 values), packed=0x07 (bits 1,1,1,0,0,0,0,0).
357+
std::vector<uint8_t> def_payload = {0x03, 0x07};
358+
EXPECT_EQ(3u, CountPresentFromDefinitionLevelsV1(def_payload, 3, 1));
359+
}
360+
361+
TEST(ParquetUtils, CountPresentFromDefinitionLevelsV1_RejectsTrailingBytesAfterDecoding) {
362+
// One full bit-packed group (8 values) plus extra trailing byte that must be rejected.
363+
std::vector<uint8_t> def_payload = {0x03, 0xAA, 0xFF};
364+
EXPECT_THROW(CountPresentFromDefinitionLevelsV1(def_payload, 8, 1), InvalidInputException);
365+
}
366+
367+
TEST(ParquetUtils, CountPresentFromDefinitionLevelsV1_RejectsNonPositiveMaxDefLevel) {
368+
auto def_payload = MakeRleDefPayload(1, 0, 1);
369+
EXPECT_THROW(CountPresentFromDefinitionLevelsV1(def_payload, 1, 0), InvalidInputException);
370+
}
371+
372+
TEST(ParquetUtils, CountPresentFromDefinitionLevelsV1_RejectsNegativeNumValues) {
373+
auto def_payload = MakeRleDefPayload(1, 1, 1);
374+
EXPECT_THROW(CountPresentFromDefinitionLevelsV1(def_payload, -1, 1), InvalidInputException);
375+
}
376+
341377
// -----------------------------------------------------------------------------
342378
// Tests for DecompressAndSplit function.
343379
// -----------------------------------------------------------------------------

0 commit comments

Comments
 (0)