-
Notifications
You must be signed in to change notification settings - Fork 0
Implementing BasicEncryptor with the optimized TypedBuffers #226
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
e23b8a6
df77cdd
f891206
349db98
abce28a
34b186d
cb5b655
38a5879
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,208 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #include "basic_xor_encryptor.h" | ||
| #include "encryptor_utils.h" | ||
| #include "../../common/exceptions.h" | ||
| #include "../../common/enum_utils.h" | ||
| #include <cstring> | ||
| #include <functional> | ||
| #include <iostream> | ||
| #include <type_traits> | ||
|
|
||
| using namespace dbps::processing; | ||
| using namespace dbps::external; | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // Functions for encrypting and decrypting byte arrays. | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| // XORs each byte of `data` with a keystream seeded from std::hash of `key_id`. | ||
| // Returns an empty vector for empty input; throws std::invalid_argument when | ||
| // `key_id` is empty while `data` is not. | ||
| static std::vector<uint8_t> EncryptByteArray(tcb::span<const uint8_t> data, const std::string& key_id) { | ||
|     const size_t byte_count = data.size(); | ||
|     if (byte_count == 0) { | ||
|         return {}; | ||
|     } | ||
|     if (key_id.empty()) { | ||
|         throw std::invalid_argument("EncryptByteArray: key must not be empty for non-empty data"); | ||
|     } | ||
|     std::vector<uint8_t> encrypted; | ||
|     encrypted.reserve(byte_count); | ||
|     size_t keystream = std::hash<std::string>{}(key_id); | ||
|     for (const uint8_t plain_byte : data) { | ||
|         // Only the low byte of the keystream is mixed in; the state is then shifted for the next byte. | ||
|         encrypted.push_back(static_cast<uint8_t>(plain_byte ^ (keystream & 0xFF))); | ||
|         keystream = (keystream << 1) | (keystream >> 31); | ||
|     } | ||
|     return encrypted; | ||
| } | ||
|
|
||
| // Applying the same XOR keystream a second time restores the input, so decryption reuses EncryptByteArray. | ||
| static std::vector<uint8_t> DecryptByteArray(tcb::span<const uint8_t> data, const std::string& key_id) { | ||
|     std::vector<uint8_t> decrypted = EncryptByteArray(data, key_id); | ||
|     return decrypted; | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // Block encryption | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| // Encrypts a whole block of bytes using this encryptor's key_id_. | ||
| std::vector<uint8_t> BasicXorEncryptor::EncryptBlock(tcb::span<const uint8_t> data) { | ||
|     std::vector<uint8_t> encrypted_block = EncryptByteArray(data, key_id_); | ||
|     return encrypted_block; | ||
| } | ||
|
|
||
| // Decrypts a whole block of bytes using this encryptor's key_id_. | ||
| std::vector<uint8_t> BasicXorEncryptor::DecryptBlock(tcb::span<const uint8_t> data) { | ||
|     std::vector<uint8_t> decrypted_block = DecryptByteArray(data, key_id_); | ||
|     return decrypted_block; | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // Value-level encryption (TypedValuesBuffer in -> bytes out) | ||
| // | ||
| // Output layout: | ||
| // Fixed: [0x01][uint32 count][uint32 elem_size] <contiguous encrypted elements> | ||
| // Variable: [0x00][uint32 count] <length-prefixed encrypted elements> | ||
| // | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| // Encrypts every element of `typed_buffer` with key_id_ and returns the serialized bytes: | ||
| // a header (fixed/variable flag, element count, element size) followed by the encrypted elements. | ||
| // An empty buffer yields a header-only result with count 0 and size 0. | ||
| // Throws InvalidInputException when the element count or the fixed element size does not fit | ||
| // in the uint32 header fields; without this check the values would be silently truncated and | ||
| // the header would no longer describe the payload. | ||
| std::vector<uint8_t> BasicXorEncryptor::EncryptValueList( | ||
|     const TypedValuesBuffer& typed_buffer) { | ||
|     // NOTE(review): this prints key_id_, which is the sole input to the XOR keystream; | ||
|     // consider dropping it from the log before any non-demo use. | ||
|     std::cout << "EncryptValueList context: column=" << column_name_ | ||
|               << " user=" << user_id_ << " key=" << key_id_ | ||
|               << " datatype=" << dbps::enum_utils::to_string(datatype_) << std::endl; | ||
|     // std::visit is required because TypedValuesBuffer is a variant over the concrete buffer types. | ||
|     return std::visit([&](const auto& input_buffer) -> std::vector<uint8_t> { | ||
|         using BufferType = std::decay_t<decltype(input_buffer)>; | ||
|         constexpr bool is_fixed = BufferType::is_fixed_sized; | ||
|         constexpr size_t prefix_length = is_fixed ? kFixedHeaderLength : kVariableHeaderLength; | ||
|         const size_t num_elements = input_buffer.GetNumElements(); | ||
|         // Empty buffer: return a vector that holds only the header. | ||
|         if (num_elements == 0) { | ||
|             std::vector<uint8_t> result(prefix_length); | ||
|             WriteHeader(result, {is_fixed, 0, 0}); | ||
|             return result; | ||
|         } | ||
|         // The header stores the count as uint32; reject counts that would be truncated. | ||
|         if (num_elements > UINT32_MAX) { | ||
|             throw InvalidInputException( | ||
|                 std::string("EncryptValueList: element count does not fit in the uint32 header field")); | ||
|         } | ||
|         // Encrypts each raw element of the input into `output_buffer`, then takes its backing bytes. | ||
|         auto encrypt_into = [&](auto& output_buffer) -> std::vector<uint8_t> { | ||
|             size_t output_index = 0; | ||
|             for (const auto raw_bytes : input_buffer.raw_elements()) { | ||
|                 auto encrypted = EncryptByteArray(raw_bytes, key_id_); | ||
|                 output_buffer.SetElement(output_index, tcb::span<const uint8_t>(encrypted)); | ||
|                 output_index++; | ||
|             } | ||
|             return output_buffer.FinalizeAndTakeBuffer(); | ||
|         }; | ||
|         std::vector<uint8_t> final_buffer; | ||
|         size_t element_size = 0; | ||
|         if constexpr (is_fixed) { | ||
|             element_size = input_buffer.GetElementSize(); | ||
|             // The header stores the element size as uint32 as well. | ||
|             if (element_size > UINT32_MAX) { | ||
|                 throw InvalidInputException( | ||
|                     std::string("EncryptValueList: element size does not fit in the uint32 header field")); | ||
|             } | ||
|             TypedBufferRawBytesFixedSized output_buffer{ | ||
|                 num_elements, prefix_length, RawBytesFixedSizedCodec{element_size}}; | ||
|             final_buffer = encrypt_into(output_buffer); | ||
|         } else { | ||
|             auto reserved_bytes_hint = input_buffer.GetSpanSize(); | ||
|             TypedBufferRawBytesVariableSized output_buffer{ | ||
|                 num_elements, reserved_bytes_hint, true, prefix_length}; | ||
|             final_buffer = encrypt_into(output_buffer); | ||
|         } | ||
|         // The output buffer was created with prefix_length reserved; the header is written last. | ||
|         WriteHeader(final_buffer, {is_fixed, | ||
|                                    static_cast<uint32_t>(num_elements), | ||
|                                    static_cast<uint32_t>(element_size)}); | ||
|         return final_buffer; | ||
|     }, typed_buffer); | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // Value-level decryption (bytes in -> TypedValuesBuffer out) | ||
| // | ||
| // Reads the header, then wraps the full span, together with the header length, | ||
| // in a TypedBufferRawBytes... read buffer so that the buffer skips the header | ||
| // automatically. Each element is decrypted into an output buffer whose type is chosen from datatype_; an unsupported datatype_ throws InvalidInputException. | ||
| // --------------------------------------------------------------------------- | ||
|
|
||
| TypedValuesBuffer BasicXorEncryptor::DecryptValueList( | ||
| tcb::span<const uint8_t> encrypted_bytes) { | ||
|
|
||
| auto header = ReadHeader(encrypted_bytes); | ||
| auto num_elements = static_cast<size_t>(header.num_elements); | ||
|
|
||
| if (header.is_fixed) { | ||
| TypedBufferRawBytesFixedSized encrypted_buffer{ | ||
| encrypted_bytes, kFixedHeaderLength, | ||
| RawBytesFixedSizedCodec{header.element_size}}; | ||
|
|
||
| auto decrypt_fixed_into = [&](auto output) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. simillar than for the
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Moved it to a separate helper function. |
||
| size_t output_index = 0; | ||
| for (const auto raw_bytes : encrypted_buffer.raw_elements()) { | ||
| auto decrypted_bytes = DecryptByteArray(raw_bytes, key_id_); | ||
| output.SetRawElement(output_index, tcb::span<const uint8_t>(decrypted_bytes)); | ||
| output_index++; | ||
| } | ||
| return output; | ||
| }; | ||
|
|
||
| // TODO: This leaks Parquet-specific types into the encryptor, which should be agnostic of Parquet. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks for the call out. why was this not needed before?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The previous implementation had the same dependency, just that it was harder to read because it was used indirectly by a helper function on parquet_utils, so harder to detect. I didn't realize it either. I have a possible solution in mind that we can discuss offline. The gist is we can add a type annotation to the output. This can come from the Codec that generates it, could be as simple as a unique byte value. This would be protected by the version check if the Codec code changes.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Discussed offline. The TODO note capture the pending item. Will leave as-is for this PR and will address in a followup. |
||
| // This switch is needed because the encrypted bytes do not carry any type information, so the output buffer type is selected from datatype_. | ||
| // We could annotate the generated bytes by simply updating the first byte of the header to also indicate the type. | ||
| switch (datatype_) { | ||
| case Type::INT32: | ||
| return decrypt_fixed_into(TypedBufferI32{num_elements}); | ||
| case Type::INT64: | ||
| return decrypt_fixed_into(TypedBufferI64{num_elements}); | ||
| case Type::INT96: | ||
| return decrypt_fixed_into(TypedBufferInt96{num_elements}); | ||
| case Type::FLOAT: | ||
| return decrypt_fixed_into(TypedBufferFloat{num_elements}); | ||
| case Type::DOUBLE: | ||
| return decrypt_fixed_into(TypedBufferDouble{num_elements}); | ||
| case Type::FIXED_LEN_BYTE_ARRAY: { | ||
| TypedBufferRawBytesFixedSized output_buffer{ | ||
| num_elements, 0, RawBytesFixedSizedCodec{header.element_size}}; | ||
| size_t output_index = 0; | ||
| for (const auto element : encrypted_buffer) { | ||
| auto decrypted_bytes = DecryptByteArray(element, key_id_); | ||
| output_buffer.SetElement(output_index, tcb::span<const uint8_t>(decrypted_bytes)); | ||
| output_index++; | ||
| } | ||
| return output_buffer; | ||
| } | ||
| default: | ||
| throw InvalidInputException( | ||
| std::string("DecryptValueList: unsupported fixed-size datatype: ") | ||
| + std::string(dbps::enum_utils::to_string(datatype_))); | ||
| } | ||
| } else { | ||
| TypedBufferRawBytesVariableSized encrypted_buffer{ | ||
| encrypted_bytes, kVariableHeaderLength}; | ||
|
|
||
| switch (datatype_) { | ||
| case Type::BYTE_ARRAY: { | ||
| auto reserved_bytes_hint = encrypted_buffer.GetSpanSize(); | ||
| TypedBufferRawBytesVariableSized output_buffer{num_elements, reserved_bytes_hint, true}; | ||
| size_t output_index = 0; | ||
| for (const auto element : encrypted_buffer) { | ||
| auto decrypted_bytes = DecryptByteArray(element, key_id_); | ||
| output_buffer.SetElement(output_index, tcb::span<const uint8_t>(decrypted_bytes)); | ||
| output_index++; | ||
| } | ||
| return output_buffer; | ||
| } | ||
| default: | ||
| throw InvalidInputException( | ||
| std::string("DecryptValueList: unsupported variable-size datatype: ") | ||
| + std::string(dbps::enum_utils::to_string(datatype_))); | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,109 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #pragma once | ||
|
|
||
| // TODO: Remove these includes when deprecating BasicEncryptor. | ||
| #include <cstdint> | ||
| #include <string> | ||
| #include <tcb/span.hpp> | ||
| #include <vector> | ||
| #include "../typed_buffer_values.h" | ||
| #include "../../common/enums.h" | ||
|
|
||
| #ifndef DBPS_EXPORT | ||
| #define DBPS_EXPORT | ||
| #endif | ||
|
|
||
| using namespace dbps::processing; | ||
|
|
||
| /** | ||
|  * TODO: Remove this when deprecating BasicEncryptor. | ||
|  * Temporary abstract base for the XOR encryptor during the migration. | ||
|  * It keeps the XOR implementation independent from DBPSEncryptor while both paths coexist, | ||
|  * and stores the context given at construction for use by derived classes. | ||
|  */ | ||
| class DBPS_EXPORT XorEncryptorInterface { | ||
| public: | ||
|     /** Copies the key identifier, column/user/application context and column datatype into the object. */ | ||
|     XorEncryptorInterface( | ||
|         const std::string& key_id, | ||
|         const std::string& column_name, | ||
|         const std::string& user_id, | ||
|         const std::string& application_context, | ||
|         dbps::external::Type::type datatype) | ||
|         : key_id_{key_id}, | ||
|           column_name_{column_name}, | ||
|           user_id_{user_id}, | ||
|           application_context_{application_context}, | ||
|           datatype_{datatype} {} | ||
|     virtual ~XorEncryptorInterface() = default; | ||
|     // Block-level operations: bytes in, bytes out. | ||
|     virtual std::vector<uint8_t> EncryptBlock(tcb::span<const uint8_t> data) = 0; | ||
|     virtual std::vector<uint8_t> DecryptBlock(tcb::span<const uint8_t> data) = 0; | ||
|     // Value-level operations: typed buffer in / bytes out, and the reverse. | ||
|     virtual std::vector<uint8_t> EncryptValueList(const TypedValuesBuffer& typed_buffer) = 0; | ||
|     virtual TypedValuesBuffer DecryptValueList(tcb::span<const uint8_t> encrypted_bytes) = 0; | ||
| protected: | ||
|     // Context captured at construction; available to derived encryptors. | ||
|     std::string key_id_; | ||
|     std::string column_name_; | ||
|     std::string user_id_; | ||
|     std::string application_context_; | ||
|     dbps::external::Type::type datatype_; | ||
| }; | ||
|
|
||
| /** | ||
|  * Basic implementation of the temporary XOR encryptor interface. | ||
|  * | ||
|  * It provides block encryption/decryption using XOR with the key_id hash | ||
|  * (same as current encryption_sequencer), and overrides the value-list operations. | ||
|  * | ||
|  * This is a simple, default encryption implementation that can be replaced with more | ||
|  * sophisticated encryption providers (e.g., Protegrity) in the future. | ||
|  */ | ||
| class DBPS_EXPORT BasicXorEncryptor : public XorEncryptorInterface { | ||
| public: | ||
|     /** | ||
|      * Initializes the encryptor with its context parameters, which are forwarded to the base class. | ||
|      * | ||
|      * @param key_id The encryption key identifier | ||
|      * @param column_name The name of the column being encrypted/decrypted | ||
|      * @param user_id The user identifier for context | ||
|      * @param application_context Additional application context information | ||
|      * @param datatype The data type of the column being encrypted/decrypted. | ||
|      *                 It is needed for correct type specific parsing during the DecryptValueList call. | ||
|      */ | ||
|     BasicXorEncryptor( | ||
|         const std::string& key_id, | ||
|         const std::string& column_name, | ||
|         const std::string& user_id, | ||
|         const std::string& application_context, | ||
|         dbps::external::Type::type datatype) | ||
|         : XorEncryptorInterface{key_id, column_name, user_id, application_context, datatype} {} | ||
|     ~BasicXorEncryptor() override = default; | ||
|     // Encryption: whole block, then typed value list. | ||
|     std::vector<uint8_t> EncryptBlock(tcb::span<const uint8_t> data) override; | ||
|     std::vector<uint8_t> EncryptValueList(const TypedValuesBuffer& typed_buffer) override; | ||
|     // Decryption: whole block, then typed value list. | ||
|     std::vector<uint8_t> DecryptBlock(tcb::span<const uint8_t> data) override; | ||
|     TypedValuesBuffer DecryptValueList(tcb::span<const uint8_t> encrypted_bytes) override; | ||
| }; | ||
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we simplify this code? (e.g. remove the use of lambdas?) BasicEncryptor is supposed to be an example encryptor - this implementation makes legibility a bit hard.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great comment. Totally agree on keeping BasicEncryptor as readable as possible.
However, we can't remove this one since it's a needed
`visit` due to TypeBuffer overloading. There are workarounds but all end up doing a `visit` somewhere, just placed somewhere else. This is in general a known boilerplate pattern for accessing variant types in cpp. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Discussed offline. Added a TODO note in-place for this and will address it in a followup cleanup.