Skip to content

Commit 7e8778d

Browse files
authored
Merge pull request #225 from protegrity/av_typelist_optimizing_061
Implementing zero-copy Parquet conversion of bytes to TypedValues
2 parents 0895a45 + 82c5adc commit 7e8778d

3 files changed

Lines changed: 415 additions & 0 deletions

File tree

src/processing/parquet_utils.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@
1818
#include "parquet_utils.h"
1919
#include "enum_utils.h"
2020
#include "compression_utils.h"
21+
#include "typed_buffer_values.h"
2122
#include <cstring>
2223
#include <iostream>
2324

2425
using namespace dbps::external;
2526
using namespace dbps::enum_utils;
2627
using namespace dbps::compression;
28+
using namespace dbps::processing;
2729

2830
// -----------------------------------------------------------------------------
2931
// Process Parquet formatted Dictionary and Data pages
@@ -377,3 +379,63 @@ std::vector<uint8_t> GetTypedListAsValueBytes(
377379
}
378380

379381
// -----------------------------------------------------------------------------
382+
// Build Parquet formatted value bytes into TypedValuesBuffer
383+
// -----------------------------------------------------------------------------
384+
385+
// Wraps `value_bytes` in the TypedValuesBuffer alternative that corresponds to
// the given Parquet physical `datatype`.
//
// Throws DBPSUnsupportedException when `encoding` is anything other than PLAIN
// (RLE_DICTIONARY gets a dedicated message) or when `datatype` is BOOLEAN.
// Throws InvalidInputException when FIXED_LEN_BYTE_ARRAY is requested without a
// positive `datatype_length`, or when `datatype` is not one of the handled types.
TypedValuesBuffer ReinterpretValueBytesAsTypedValuesBuffer(tcb::span<const uint8_t> value_bytes,
                                                           Type::type datatype,
                                                           const std::optional<int>& datatype_length,
                                                           Encoding::type encoding) {
    // Encoding gate: everything except PLAIN is rejected before the datatype is inspected.
    if (encoding != Encoding::PLAIN) {
        if (encoding == Encoding::RLE_DICTIONARY) {
            throw DBPSUnsupportedException(
                "Unsupported encoding: RLE_DICTIONARY is not supported for per-value operations "
                "since values are not present in the data, only references to them.");
        }
        throw DBPSUnsupportedException(
            "On ReinterpretValueBytesAsTypedValuesBuffer, unsupported encoding: "
            + std::string(to_string(encoding)));
    }

    // Datatype dispatch: each supported physical type maps to one buffer alternative.
    switch (datatype) {
        case Type::BOOLEAN:
            throw DBPSUnsupportedException("On ReinterpretValueBytesAsTypedValuesBuffer, BOOLEAN datatype "
                "values are bit-encoded and not expanded as bytes, so BOOLEAN is not supported.");

        case Type::INT32:
            return TypedBufferI32{value_bytes};

        case Type::INT64:
            return TypedBufferI64{value_bytes};

        case Type::FLOAT:
            return TypedBufferFloat{value_bytes};

        case Type::DOUBLE:
            return TypedBufferDouble{value_bytes};

        case Type::INT96:
            return TypedBufferInt96{value_bytes};

        case Type::FIXED_LEN_BYTE_ARRAY: {
            // The element width is supplied by the caller and must be strictly positive.
            const bool has_positive_length = datatype_length.has_value() && *datatype_length > 0;
            if (!has_positive_length) {
                throw InvalidInputException("FIXED_LEN_BYTE_ARRAY requires a positive datatype_length");
            }
            return TypedBufferRawBytesFixedSized{
                value_bytes, 0, RawBytesFixedSizedCodec{static_cast<size_t>(*datatype_length)}};
        }

        case Type::BYTE_ARRAY:
            return TypedBufferRawBytesVariableSized{value_bytes};

        default:
            throw InvalidInputException(
                "Invalid datatype: " + std::string(to_string(datatype)));
    }
}
432+
433+
// Finalizes whichever buffer alternative `buffer` currently holds and returns
// the bytes produced by that alternative's FinalizeAndTakeBuffer().
std::vector<uint8_t> GetTypedValuesBufferAsValueBytes(TypedValuesBuffer buffer) {
    // TypedValuesBuffer is a variant over several ByteBuffer types, so the call
    // has to be dispatched through std::visit to reach the active alternative.
    // The same member function is invoked regardless of which one is active.
    const auto finalize_active = [](auto& active_buffer) -> std::vector<uint8_t> {
        return active_buffer.FinalizeAndTakeBuffer();
    };
    return std::visit(finalize_active, buffer);
}
440+
441+
// -----------------------------------------------------------------------------

src/processing/parquet_utils.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "../common/exceptions.h"
2929
#include "enum_utils.h"
3030
#include "typed_list_values.h"
31+
#include "typed_buffer_values.h"
3132
#include "../common/bytes_utils.h"
3233

3334
struct LevelAndValueBytes {
@@ -131,3 +132,31 @@ std::vector<uint8_t> GetTypedListAsValueBytes(
131132
Type::type datatype,
132133
const std::optional<int>& datatype_length,
133134
Encoding::type encoding);
135+
136+
/**
137+
* Zero-copy reinterpretation of raw value bytes into a typed buffer.
138+
* Returns a TypedValuesBuffer variant holding the appropriate ByteBuffer<Codec>
139+
* for the given Parquet datatype.
140+
*
141+
* The returned buffer holds a non-owning view into value_bytes.
142+
* The caller must keep the backing data alive for as long as the buffer is used.
143+
*
144+
* @param value_bytes Raw value bytes (span must outlive the returned buffer)
145+
* @param datatype Parquet physical type
146+
* @param datatype_length Required for FIXED_LEN_BYTE_ARRAY (must be > 0)
147+
* @param encoding Only PLAIN is currently supported
148+
* @throws DBPSUnsupportedException for RLE_DICTIONARY, BOOLEAN, or non-PLAIN encodings
149+
* @throws InvalidInputException for invalid datatype_length on FIXED_LEN_BYTE_ARRAY, or for an unrecognized datatype
150+
*/
151+
dbps::processing::TypedValuesBuffer ReinterpretValueBytesAsTypedValuesBuffer(
152+
tcb::span<const uint8_t> value_bytes,
153+
Type::type datatype,
154+
const std::optional<int>& datatype_length,
155+
Encoding::type encoding);
156+
157+
/**
158+
* Finalize a typed buffer and return the raw value bytes.
159+
* Consumes the buffer; caller must std::move() it in to pass ownership.
160+
*/
161+
std::vector<uint8_t> GetTypedValuesBufferAsValueBytes(
162+
dbps::processing::TypedValuesBuffer buffer);

0 commit comments

Comments
 (0)