Skip to content
Merged
24 changes: 24 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,18 @@ target_include_directories(dbps_byte_buffer_lib INTERFACE
)
target_link_libraries(dbps_byte_buffer_lib INTERFACE tcb_span)

# TODO: Update this when deprecating BasicEncryptor
# Standalone XOR encryptor library (kept parallel to legacy basic_encryptor path).
# Built as a static library from the single basic_xor_encryptor.cpp source file.
add_library(dbps_xor_encryptor_lib STATIC
src/processing/encryptors/basic_xor_encryptor.cpp
)
# PUBLIC: the link dependencies and include directories below also propagate to
# every target that links dbps_xor_encryptor_lib, so consumers can include the
# encryptor header (which itself includes tcb/span.hpp and project headers).
target_link_libraries(dbps_xor_encryptor_lib PUBLIC dbps_common_lib tcb_span)
target_include_directories(dbps_xor_encryptor_lib PUBLIC
src/processing
src/processing/encryptors
src/common
)

# Server components library
add_library(dbps_server_lib STATIC
src/processing/encryption_sequencer.cpp
Expand Down Expand Up @@ -358,6 +370,15 @@ if(BUILD_TESTS)
)
target_include_directories(basic_encryptor_test PRIVATE src/processing src/processing/encryptors)

# Basic XOR encryptor tests (parallel path; does not replace BasicEncryptor callers)
# Single-source GoogleTest executable for the XOR encryptor.
add_executable(basic_xor_encryptor_test src/processing/encryptors/basic_xor_encryptor_test.cpp)
# Links the library under test plus gtest_main (presumably supplies the test entry point).
# NOTE(review): dbps_common_lib is already a PUBLIC dependency of dbps_xor_encryptor_lib,
# so listing it again here looks redundant but harmless - confirm before removing.
target_link_libraries(basic_xor_encryptor_test
dbps_xor_encryptor_lib
dbps_common_lib
gtest_main
)
# Include paths are PRIVATE because nothing links against a test executable.
target_include_directories(basic_xor_encryptor_test PRIVATE src/processing src/processing/encryptors)

# Auth utils tests
add_executable(auth_utils_test src/server/auth_utils_test.cpp)
target_link_libraries(auth_utils_test
Expand Down Expand Up @@ -513,6 +534,7 @@ add_custom_target(libraries
dbps_local_lib
COMMENT "Building all libraries"
)
add_dependencies(libraries dbps_xor_encryptor_lib)

add_custom_target(shared_libraries
DEPENDS
Expand Down Expand Up @@ -556,6 +578,7 @@ if(BUILD_TESTS)
http_client_base_test
COMMENT "Building all tests"
)
add_dependencies(tests basic_xor_encryptor_test)

# Register tests with CTest via GoogleTest discovery
gtest_discover_tests(json_request_test)
Expand All @@ -569,6 +592,7 @@ if(BUILD_TESTS)
gtest_discover_tests(typed_buffer_test)
gtest_discover_tests(typed_buffer_values_test)
gtest_discover_tests(basic_encryptor_test)
gtest_discover_tests(basic_xor_encryptor_test)
gtest_discover_tests(auth_utils_test)
gtest_discover_tests(dbpa_interface_test)
gtest_discover_tests(dbpa_utils_test)
Expand Down
7 changes: 7 additions & 0 deletions src/common/bytes_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,13 @@ inline void append_f64_le(std::vector<uint8_t>& out, double v) {
append_u64_le(out, bits);
}

// Overwrites four consecutive bytes of `buf`, starting at `offset`, with the
// little-endian encoding of `v` (least-significant byte first).
// The caller must guarantee that buf.size() >= offset + 4; no bounds check is done.
inline void write_u32_le_at(std::vector<uint8_t>& buf, size_t offset, uint32_t v) {
    for (size_t byte_index = 0; byte_index < sizeof(uint32_t); ++byte_index) {
        const uint32_t shifted = v >> (8 * byte_index);
        buf[offset + byte_index] = static_cast<uint8_t>(shifted & 0xFF);
    }
}

inline uint32_t read_u32_le(const std::vector<uint8_t>& in, size_t offset) {
return static_cast<uint32_t>(in[offset]) |
(static_cast<uint32_t>(in[offset + 1]) << 8) |
Expand Down
208 changes: 208 additions & 0 deletions src/processing/encryptors/basic_xor_encryptor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "basic_xor_encryptor.h"
#include "encryptor_utils.h"
#include "../../common/exceptions.h"
#include "../../common/enum_utils.h"
#include <cstring>
#include <functional>
#include <iostream>
#include <type_traits>

using namespace dbps::processing;
using namespace dbps::external;

// ---------------------------------------------------------------------------
// Functions for encrypting and decrypting byte arrays.
// ---------------------------------------------------------------------------

// XORs every byte of `data` with a one-byte mask taken from a rolling value that
// is seeded with std::hash<std::string> of `key_id`.
//
// Returns a vector of the same length as `data`; empty input yields an empty
// vector regardless of `key_id`.
// Throws std::invalid_argument when `data` is non-empty and `key_id` is empty.
//
// NOTE(review): the mask update shifts left by 1 and ORs in the value shifted
// right by 31. When size_t is wider than 32 bits this is not a true rotate; it is
// kept exactly as-is because changing it would change the produced bytes.
static std::vector<uint8_t> EncryptByteArray(tcb::span<const uint8_t> data, const std::string& key_id) {
    std::vector<uint8_t> result;
    if (data.empty()) {
        return result;
    }
    if (key_id.empty()) {
        throw std::invalid_argument("EncryptByteArray: key must not be empty for non-empty data");
    }

    result.reserve(data.size());
    size_t rolling_mask = std::hash<std::string>{}(key_id);
    for (const uint8_t input_byte : data) {
        // Only the low byte of the rolling value participates in the XOR.
        result.push_back(static_cast<uint8_t>(input_byte ^ (rolling_mask & 0xFF)));
        rolling_mask = (rolling_mask << 1) | (rolling_mask >> 31);
    }
    return result;
}

// Decryption replays the encryption routine: applying the same key_id-derived
// XOR mask a second time restores the original bytes.
static std::vector<uint8_t> DecryptByteArray(tcb::span<const uint8_t> data, const std::string& key_id) {
    auto restored_bytes = EncryptByteArray(data, key_id);
    return restored_bytes;
}

// ---------------------------------------------------------------------------
// Block encryption
// ---------------------------------------------------------------------------

// Encrypts one opaque block of bytes using the key id this encryptor was built with.
// Returns a new vector of the same length as `data`.
std::vector<uint8_t> BasicXorEncryptor::EncryptBlock(tcb::span<const uint8_t> data) {
    const std::string& block_key_id = key_id_;
    return EncryptByteArray(data, block_key_id);
}

// Decrypts one opaque block of bytes using the key id this encryptor was built with.
// Returns a new vector of the same length as `data`.
std::vector<uint8_t> BasicXorEncryptor::DecryptBlock(tcb::span<const uint8_t> data) {
    const std::string& block_key_id = key_id_;
    return DecryptByteArray(data, block_key_id);
}

// ---------------------------------------------------------------------------
// Value-level encryption (TypedValuesBuffer in -> bytes out)
//
// Every element of the input buffer is encrypted on its own with key_id_ and
// written to the output at the same index. A header describing the payload is
// written at the front of the returned bytes.
//
// Output layout:
// Fixed: [0x01][uint32 count][uint32 elem_size] <contiguous encrypted elements>
// Variable: [0x00][uint32 count] <length-prefixed encrypted elements>
//
// An input with zero elements yields only the header, with count and
// elem_size both written as 0.
// ---------------------------------------------------------------------------

std::vector<uint8_t> BasicXorEncryptor::EncryptValueList(
const TypedValuesBuffer& typed_buffer) {

// Logs the call context to stdout on every invocation.
// NOTE(review): this prints key_id_, which is the value the XOR mask is derived
// from, and std::endl flushes the stream each call - confirm this is intended.
std::cout << "EncryptValueList context: column=" << column_name_
<< " user=" << user_id_ << " key=" << key_id_
<< " datatype=" << dbps::enum_utils::to_string(datatype_) << std::endl;

// std::visit invokes the generic lambda with the concrete buffer alternative
// currently held by typed_buffer.
return std::visit([&](const auto& input_buffer) -> std::vector<uint8_t> {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we simplify this code? (e.g. remove the use of lambdas?) BasicEncryptor is supposed to be an example encryptor - this implementation makes legibility a bit hard.

@avalerio-tkd avalerio-tkd Mar 8, 2026

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great comment. Totally agree on keeping BasicEncryptor as readable as possible.

However, we can't remove this one since it's a needed visit due to TypeBuffer overloading. There are workarounds but all end up doing a visit somewhere, just placed somewhere else. This is in general a known boilerplate pattern for accessing variant types in cpp.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline. Added a TODO note in-place for this and will address it in a followup cleanup.

// The buffer type itself reports whether it is fixed-size; that choice selects
// which header length is reserved at the front of the output.
using BufferType = std::decay_t<decltype(input_buffer)>;
constexpr bool is_fixed = BufferType::is_fixed_sized;
const size_t num_elements = input_buffer.GetNumElements();
constexpr size_t prefix_length = is_fixed ? kFixedHeaderLength : kVariableHeaderLength;

// Empty buffer, return empty vector with header.
if (num_elements == 0) {
std::vector<uint8_t> result(prefix_length);
WriteHeader(result, {is_fixed, 0, 0});
return result;
}

// Shared encryption loop for both output buffer kinds: encrypts each raw
// element separately and stores it at the matching index, then hands back
// the bytes produced by the output buffer.
auto encrypt_into = [&](auto& output_buffer) -> std::vector<uint8_t> {
size_t output_index = 0;
for (const auto raw_bytes : input_buffer.raw_elements()) {
auto encrypted = EncryptByteArray(raw_bytes, key_id_);
output_buffer.SetElement(output_index, tcb::span<const uint8_t>(encrypted));
output_index++;
}
return output_buffer.FinalizeAndTakeBuffer();
};

std::vector<uint8_t> final_buffer;
// Stays 0 for variable-size buffers; only the fixed-size branch sets it.
size_t element_size = 0;
if constexpr (is_fixed) {
element_size = input_buffer.GetElementSize();
TypedBufferRawBytesFixedSized output_buffer{
num_elements, prefix_length, RawBytesFixedSizedCodec{element_size}};
final_buffer = encrypt_into(output_buffer);
} else {
// The input span size is passed as a capacity hint for the output.
// NOTE(review): the meaning of the bare true argument is not visible in this
// file - confirm against the TypedBufferRawBytesVariableSized constructor.
auto reserved_bytes_hint = input_buffer.GetSpanSize();
TypedBufferRawBytesVariableSized output_buffer{
num_elements, reserved_bytes_hint, true, prefix_length};
final_buffer = encrypt_into(output_buffer);
}
// The header is written last, into the finalized buffer.
// NOTE(review): both size_t values are narrowed to uint32_t without a range check.
WriteHeader(final_buffer, {is_fixed,
static_cast<uint32_t>(num_elements),
static_cast<uint32_t>(element_size)});
return final_buffer;

}, typed_buffer);
}

// ---------------------------------------------------------------------------
// Value-level decryption (bytes in -> TypedValuesBuffer out)
//
// Parses the header, then wraps the full span (with prefix_size) as a
// TypedBufferRawBytes... read buffer so the buffer skips the header
// automatically. Output is the correctly-typed buffer matching datatype_.
//
// Each element is decrypted on its own with key_id_ and stored at the same
// index in the output buffer.
// Throws InvalidInputException when datatype_ is not one of the types handled
// for the fixed-size or variable-size layout indicated by the header.
// ---------------------------------------------------------------------------

TypedValuesBuffer BasicXorEncryptor::DecryptValueList(
tcb::span<const uint8_t> encrypted_bytes) {

// The header tells which layout follows and how many elements it holds.
auto header = ReadHeader(encrypted_bytes);
auto num_elements = static_cast<size_t>(header.num_elements);

if (header.is_fixed) {
// Read view over the encrypted payload; the element size comes from the header.
TypedBufferRawBytesFixedSized encrypted_buffer{
encrypted_bytes, kFixedHeaderLength,
RawBytesFixedSizedCodec{header.element_size}};

// Fills a freshly constructed typed output buffer (taken by value) with the
// decrypted raw bytes of every element and returns it.
auto decrypt_fixed_into = [&](auto output) {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

simillar than for the encrypt. Can we optimize for legibility here?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved it to a separate helper function.

size_t output_index = 0;
for (const auto raw_bytes : encrypted_buffer.raw_elements()) {
auto decrypted_bytes = DecryptByteArray(raw_bytes, key_id_);
output.SetRawElement(output_index, tcb::span<const uint8_t>(decrypted_bytes));
output_index++;
}
return output;
};

// TODO: This is leaking Parquet-specific types into the encryptor, which should be agnostic of Parquet.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for the call out. why was this not needed before?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The previous implementation had the same dependency, just that it was harder to read because it was used indirectly by a helper function on parquet_utils, so harder to detect. I didn't realize it either.

I have a possible solution in mind that we can discuss offline. The gist is we can add a type annotation to the output. This can come from the Codec that generates it, could be as simple as a unique byte value. This would be protected by the version check if the Codec code changes.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline. The TODO note capture the pending item. Will leave as-is for this PR and will address in a followup.

// The switch on datatype_ is needed because the serialized bytes carry no type information.
// A possible follow-up is to record the type in the first byte of the header.
switch (datatype_) {
case Type::INT32:
return decrypt_fixed_into(TypedBufferI32{num_elements});
case Type::INT64:
return decrypt_fixed_into(TypedBufferI64{num_elements});
case Type::INT96:
return decrypt_fixed_into(TypedBufferInt96{num_elements});
case Type::FLOAT:
return decrypt_fixed_into(TypedBufferFloat{num_elements});
case Type::DOUBLE:
return decrypt_fixed_into(TypedBufferDouble{num_elements});
case Type::FIXED_LEN_BYTE_ARRAY: {
// Raw-bytes output with no header prefix (prefix argument 0). Unlike the
// numeric cases, this iterates the buffer directly and stores elements
// with SetElement rather than SetRawElement.
TypedBufferRawBytesFixedSized output_buffer{
num_elements, 0, RawBytesFixedSizedCodec{header.element_size}};
size_t output_index = 0;
for (const auto element : encrypted_buffer) {
auto decrypted_bytes = DecryptByteArray(element, key_id_);
output_buffer.SetElement(output_index, tcb::span<const uint8_t>(decrypted_bytes));
output_index++;
}
return output_buffer;
}
default:
throw InvalidInputException(
std::string("DecryptValueList: unsupported fixed-size datatype: ")
+ std::string(dbps::enum_utils::to_string(datatype_)));
}
} else {
// Variable-size layout: read view over the payload that follows the header.
TypedBufferRawBytesVariableSized encrypted_buffer{
encrypted_bytes, kVariableHeaderLength};

// BYTE_ARRAY is the only datatype handled for the variable-size layout.
switch (datatype_) {
case Type::BYTE_ARRAY: {
// The encrypted span size is passed as a capacity hint for the output.
// NOTE(review): the meaning of the bare true argument is not visible in
// this file - confirm against the TypedBufferRawBytesVariableSized constructor.
auto reserved_bytes_hint = encrypted_buffer.GetSpanSize();
TypedBufferRawBytesVariableSized output_buffer{num_elements, reserved_bytes_hint, true};
size_t output_index = 0;
for (const auto element : encrypted_buffer) {
auto decrypted_bytes = DecryptByteArray(element, key_id_);
output_buffer.SetElement(output_index, tcb::span<const uint8_t>(decrypted_bytes));
output_index++;
}
return output_buffer;
}
default:
throw InvalidInputException(
std::string("DecryptValueList: unsupported variable-size datatype: ")
+ std::string(dbps::enum_utils::to_string(datatype_)));
}
}
}
109 changes: 109 additions & 0 deletions src/processing/encryptors/basic_xor_encryptor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

// TODO: Remove these includes when deprecating BasicEncryptor.
#include <cstdint>
#include <string>
#include <tcb/span.hpp>
#include <vector>
#include "../typed_buffer_values.h"
#include "../../common/enums.h"

#ifndef DBPS_EXPORT
#define DBPS_EXPORT
#endif

using namespace dbps::processing;

/**
 * TODO: Remove this when deprecating BasicEncryptor.
 * Temporary interface for the XOR encryptor during migration.
 * Keeps XOR implementation independent from DBPSEncryptor while both paths coexist.
 *
 * The constructor stores copies of the context parameters in protected members
 * so that derived classes can read them. All four encryption and decryption
 * operations are pure virtual and must be provided by the derived class.
 */
class DBPS_EXPORT XorEncryptorInterface {
public:
/**
 * Stores the encryption context for later use by derived classes.
 *
 * @param key_id The encryption key identifier
 * @param column_name The name of the column being encrypted/decrypted
 * @param user_id The user identifier for context
 * @param application_context Additional application context information
 * @param datatype The data type of the column being encrypted/decrypted
 */
XorEncryptorInterface(
const std::string& key_id,
const std::string& column_name,
const std::string& user_id,
const std::string& application_context,
dbps::external::Type::type datatype)
: key_id_(key_id),
column_name_(column_name),
user_id_(user_id),
application_context_(application_context),
datatype_(datatype) {}

// Virtual so that derived encryptors can be destroyed through a base pointer.
virtual ~XorEncryptorInterface() = default;

// Block-level operations: bytes in, bytes out.
virtual std::vector<uint8_t> EncryptBlock(tcb::span<const uint8_t> data) = 0;
virtual std::vector<uint8_t> DecryptBlock(tcb::span<const uint8_t> data) = 0;
// Value-level operations: typed buffer in and bytes out, and the reverse.
virtual std::vector<uint8_t> EncryptValueList(const TypedValuesBuffer& typed_buffer) = 0;
virtual TypedValuesBuffer DecryptValueList(tcb::span<const uint8_t> encrypted_bytes) = 0;

protected:
// Context captured at construction time; each member is a copy of the constructor argument.
std::string key_id_;
std::string column_name_;
std::string user_id_;
std::string application_context_;
dbps::external::Type::type datatype_;
};

/**
 * Basic implementation of the temporary XOR encryptor interface.
 *
 * This implementation provides:
 * - Block encryption/decryption using XOR with key_id hash (same as current encryption_sequencer)
 * - Value-list encryption/decryption that converts between a TypedValuesBuffer
 *   and a byte vector
 *
 * This is a simple, default encryption implementation that can be replaced with more
 * sophisticated encryption providers (e.g., Protegrity) in the future.
 */
class DBPS_EXPORT BasicXorEncryptor : public XorEncryptorInterface {
public:
/**
 * Constructor that initializes the encryptor with context parameters.
 * All arguments are forwarded unchanged to the XorEncryptorInterface constructor.
 *
 * @param key_id The encryption key identifier
 * @param column_name The name of the column being encrypted/decrypted
 * @param user_id The user identifier for context
 * @param application_context Additional application context information
 * @param datatype The data type of the column being encrypted/decrypted.
 * It is needed for correct type specific parsing during the DecryptValueList call.
 */
BasicXorEncryptor(
const std::string& key_id,
const std::string& column_name,
const std::string& user_id,
const std::string& application_context,
dbps::external::Type::type datatype)
: XorEncryptorInterface(key_id, column_name, user_id, application_context, datatype) {}

~BasicXorEncryptor() override = default;

// Block encryption methods
// Both take a read-only byte span and return a newly allocated byte vector.
std::vector<uint8_t> EncryptBlock(tcb::span<const uint8_t> data) override;

std::vector<uint8_t> DecryptBlock(tcb::span<const uint8_t> data) override;

// Value encryption methods
// EncryptValueList turns a typed buffer into bytes; DecryptValueList turns
// such bytes back into a typed buffer.
std::vector<uint8_t> EncryptValueList(const TypedValuesBuffer& typed_buffer) override;

TypedValuesBuffer DecryptValueList(tcb::span<const uint8_t> encrypted_bytes) override;
};

Loading