Merge pull request #225 from protegrity/av_typelist_optimizing_061

avalerio-tkd · web-flow · commit 7e8778daa83e · 2026-03-07T09:44:34.000-06:00
Implementing zero-copy Parquet conversion of bytes to TypedValues
diff --git a/src/processing/parquet_utils.cpp b/src/processing/parquet_utils.cpp
@@ -18,12 +18,14 @@
 #include "parquet_utils.h"
 #include "enum_utils.h"
 #include "compression_utils.h"
+#include "typed_buffer_values.h"
 #include <cstring>
 #include <iostream>
 
 using namespace dbps::external;
 using namespace dbps::enum_utils;
 using namespace dbps::compression;
+using namespace dbps::processing;
 
 // -----------------------------------------------------------------------------
 // Process Parquet formatted Dictionary and Data pages
@@ -377,3 +379,63 @@ std::vector<uint8_t> GetTypedListAsValueBytes(
 }
 
 // -----------------------------------------------------------------------------
+// Convert between Parquet formatted value bytes and TypedValuesBuffer
+// -----------------------------------------------------------------------------
+
+TypedValuesBuffer ReinterpretValueBytesAsTypedValuesBuffer(tcb::span<const uint8_t> value_bytes,
+    Type::type datatype,
+    const std::optional<int>& datatype_length,
+    Encoding::type encoding) {
+    // Picks the typed buffer alternative for `datatype` and constructs it from value_bytes; PLAIN encoding only.
+    if (encoding == Encoding::RLE_DICTIONARY) {
+        throw DBPSUnsupportedException(
+            "Unsupported encoding: RLE_DICTIONARY is not supported for per-value operations "
+            "since values are not present in the data, only references to them.");
+    }
+    // Any remaining non-PLAIN encoding is rejected as well; its name is appended to the message.
+    if (encoding != Encoding::PLAIN) {
+        throw DBPSUnsupportedException(
+            "On ReinterpretValueBytesAsTypedValuesBuffer, unsupported encoding: "
+            + std::string(to_string(encoding)));
+    }
+    // BOOLEAN is rejected up front with its own message rather than falling into the switch default.
+    if (datatype == Type::BOOLEAN) {
+        throw DBPSUnsupportedException("On ReinterpretValueBytesAsTypedValuesBuffer, BOOLEAN datatype "
+            "values are bit-encoded and not expanded as bytes, so BOOLEAN is not supported.");
+    }
+    // Every case passes value_bytes straight to the matching buffer type; only FIXED_LEN_BYTE_ARRAY needs a length.
+    switch (datatype) {
+        case Type::INT32:
+            return TypedBufferI32{value_bytes};
+        case Type::INT64:
+            return TypedBufferI64{value_bytes};
+        case Type::FLOAT:
+            return TypedBufferFloat{value_bytes};
+        case Type::DOUBLE:
+            return TypedBufferDouble{value_bytes};
+        case Type::INT96:
+            return TypedBufferInt96{value_bytes};
+        case Type::FIXED_LEN_BYTE_ARRAY: {
+            if (!datatype_length.has_value() || datatype_length.value() <= 0) {
+                throw InvalidInputException("FIXED_LEN_BYTE_ARRAY requires a positive datatype_length");
+            }
+            return TypedBufferRawBytesFixedSized{  // NOTE(review): the literal 0 below is not explained here — confirm its meaning
+                value_bytes, 0, RawBytesFixedSizedCodec{static_cast<size_t>(datatype_length.value())}};
+        }
+        case Type::BYTE_ARRAY:
+            return TypedBufferRawBytesVariableSized{value_bytes};
+        default:  // any datatype not handled by a case above
+            throw InvalidInputException(
+                "Invalid datatype: " + std::string(to_string(datatype)));
+    }
+}
+
+std::vector<uint8_t> GetTypedValuesBufferAsValueBytes(TypedValuesBuffer buffer) {
+    // Takes the buffer by value and returns whatever FinalizeAndTakeBuffer() yields for the alternative it holds.
+    // std::visit dispatches on the active alternative; the generic lambda makes the same call for every buffer type.
+    return std::visit([](auto& buf) -> std::vector<uint8_t> {
+        return buf.FinalizeAndTakeBuffer();
+    }, buffer);
+}
+
+// -----------------------------------------------------------------------------
diff --git a/src/processing/parquet_utils.h b/src/processing/parquet_utils.h
@@ -28,6 +28,7 @@
 #include "../common/exceptions.h"
 #include "enum_utils.h"
 #include "typed_list_values.h"
+#include "typed_buffer_values.h"
 #include "../common/bytes_utils.h"
 
 struct LevelAndValueBytes {
@@ -131,3 +132,31 @@ std::vector<uint8_t> GetTypedListAsValueBytes(
     Type::type datatype,
     const std::optional<int>& datatype_length,
     Encoding::type encoding);
+
+/**
+ * Zero-copy reinterpretation of raw value bytes into a typed buffer.
+ * Returns a TypedValuesBuffer variant holding the appropriate ByteBuffer<Codec>
+ * for the given Parquet datatype.
+ *
+ * The returned buffer holds a non-owning view into value_bytes.
+ * The caller must keep the backing data alive for as long as the buffer is used.
+ *
+ * @param value_bytes Raw value bytes (the underlying data must outlive the returned buffer)
+ * @param datatype Parquet physical type
+ * @param datatype_length Required for FIXED_LEN_BYTE_ARRAY (must be > 0)
+ * @param encoding Only PLAIN is currently supported
+ * @throws DBPSUnsupportedException for RLE_DICTIONARY, BOOLEAN, or non-PLAIN encodings
+ * @throws InvalidInputException for invalid datatype_length on FIXED_LEN_BYTE_ARRAY, or an unhandled datatype
+ */
+dbps::processing::TypedValuesBuffer ReinterpretValueBytesAsTypedValuesBuffer(
+    tcb::span<const uint8_t> value_bytes,
+    Type::type datatype,
+    const std::optional<int>& datatype_length,
+    Encoding::type encoding);
+
+/**
+ * Finalize a typed buffer and return the raw value bytes.
+ * The buffer is taken by value; callers are expected to std::move() it in to pass ownership.
+ */
+std::vector<uint8_t> GetTypedValuesBufferAsValueBytes(
+    dbps::processing::TypedValuesBuffer buffer);
diff --git a/src/processing/parquet_utils_test.cpp b/src/processing/parquet_utils_test.cpp