Skip to content

Commit e788277

Browse files
committed
Add HLL unit tests to build
Adds hll_tests target to CMakeLists.txt to compile and run the new HyperLogLog unit test suite alongside other test targets.
1 parent 3fc257b commit e788277

5 files changed

Lines changed: 811 additions & 0 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ if(BUILD_TESTS)
127127
add_cloudsql_test(transaction_coverage_tests tests/transaction_coverage_tests.cpp)
128128
add_cloudsql_test(utils_coverage_tests tests/utils_coverage_tests.cpp)
129129
add_cloudsql_test(bloom_filter_tests tests/bloom_filter_test.cpp)
130+
add_cloudsql_test(hll_tests tests/hll_test.cpp)
130131
add_cloudsql_test(cloudSQL_tests tests/cloudSQL_tests.cpp)
131132
add_cloudsql_test(server_tests tests/server_tests.cpp)
132133
add_cloudsql_test(statement_tests tests/statement_tests.cpp)

include/common/hll.hpp

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
/**
2+
* @file hll.hpp
3+
* @brief HyperLogLog probabilistic cardinality estimator
4+
*/
5+
6+
#pragma once
7+
8+
#include <algorithm>
9+
#include <array>
10+
#include <cmath>
11+
#include <cstdint>
12+
13+
namespace cloudsql {
14+
namespace common {
15+
16+
/**
 * @brief HyperLogLog - memory-bounded NDV (number of distinct values) estimator
 *
 * Uses a fixed array of kNumRegisters (2048) one-byte registers, i.e. 2 KiB of
 * register state regardless of cardinality. The relative standard error of the
 * estimate is about 1.04 / sqrt(m), i.e. roughly 2.3% for m = 2048, once the
 * cardinality is well above kNumRegisters.
 *
 * Algorithm (Flajolet, Fusy, Gandouet, Meunier: "HyperLogLog", 2007):
 *   - The caller hashes each item to 64 bits.
 *   - Register index: the BOTTOM kIndexBits bits (p = 11, m = 2^p = 2048).
 *   - Register value: (trailing zeros of the remaining 53 upper bits) + 1;
 *     each register keeps the maximum value it has seen.
 *   - Raw estimate: E = alpha_m * m^2 / sum_j 2^(-M[j]) over ALL registers
 *     (an empty register contributes 2^0 = 1).
 *   - Small-range correction: while E <= 2.5 * m and V > 0 registers are still
 *     empty, linear counting m * ln(m / V) is used instead of E.
 *
 * No large-range correction is applied because the hash is 64 bits wide.
 * Sketches are only comparable/mergeable when they were built with the same
 * seed and the same hash function.
 */
class HyperLogLog {
public:
    static constexpr size_t kNumRegisters = 2048;  // 2^11 for 11-bit index
    static constexpr double kPowBase = 2.0;        // base of the 2^(-reg) terms (kept for API compatibility)
    static constexpr int kIndexBits = 11;          // bits used for register index

    static_assert(kNumRegisters == (static_cast<size_t>(1) << kIndexBits),
                  "kNumRegisters must equal 2^kIndexBits");

    /**
     * @brief Construct an empty sketch with an optional seed for reproducible hashing
     * @param seed Value XOR-ed into every inserted hash
     */
    explicit HyperLogLog(int seed = 0) noexcept : registers_{}, seed_(seed) {}

    /**
     * @brief Insert a pre-hashed 64-bit value
     * @param hash 64-bit hash of the item; its bits should be well mixed
     */
    void insert(uint64_t hash) noexcept {
        hash ^= static_cast<uint64_t>(seed_);

        // Register index from the BOTTOM kIndexBits of the hash. The mask keeps
        // the index strictly below kNumRegisters, so unchecked access is safe.
        const size_t idx = static_cast<size_t>(hash & (kNumRegisters - 1));

        // Rank = trailing zeros of the remaining upper bits, plus one. Only
        // kRankBits bits remain after the shift, so an all-zero remainder is
        // capped at kRankBits + 1 rather than 64 + 1.
        const uint64_t remaining = hash >> kIndexBits;
        const int rank = std::min(count_trailing_zeros(remaining), kRankBits) + 1;

        const uint8_t new_val = static_cast<uint8_t>(rank);
        registers_[idx] = std::max(registers_[idx], new_val);
    }

    /**
     * @brief Estimate cardinality using the HyperLogLog estimator
     * @return 0 for an empty sketch, otherwise the estimate rounded to the
     *         nearest integer (at least 1, saturating at UINT64_MAX)
     */
    [[nodiscard]] uint64_t cardinality() const noexcept {
        double inverse_sum = 0.0;  // sum over all registers of 2^(-M[j])
        size_t empty_count = 0;    // V: number of registers never written
        for (const uint8_t reg : registers_) {
            if (reg == 0) {
                ++empty_count;
            }
            // ldexp(1.0, -r) is exactly 2^(-r); an empty register adds 1.0.
            inverse_sum += std::ldexp(1.0, -static_cast<int>(reg));
        }

        // Empty HLL -> cardinality 0
        if (empty_count == kNumRegisters) {
            return 0;
        }

        const double m = static_cast<double>(kNumRegisters);
        // Bias-correction constant alpha_m for m >= 128.
        const double alpha = 0.7213 / (1.0 + 1.079 / m);
        double estimate = alpha * m * m / inverse_sum;

        // Small-range correction: the raw estimator is biased while many
        // registers are still empty, so fall back to linear counting
        // (natural logarithm, not log2).
        if (estimate <= 2.5 * m && empty_count > 0) {
            estimate = m * std::log(m / static_cast<double>(empty_count));
        }

        // UINT64_MAX converts to 2^64 as a double; anything at or above that
        // would be an out-of-range (undefined) conversion, so saturate first.
        if (estimate >= static_cast<double>(kMaxCardinality)) {
            return kMaxCardinality;
        }
        // A non-empty sketch has seen at least one value.
        return static_cast<uint64_t>(std::max(1.0, std::round(estimate)));
    }

    /**
     * @brief Reset all registers to zero
     */
    void reset() noexcept { registers_.fill(0); }

    /**
     * @brief Merge another HLL into this one (element-wise max of registers)
     * @param other Sketch to fold in; expected to use the same seed and hash
     */
    void merge(const HyperLogLog& other) noexcept {
        for (size_t i = 0; i < kNumRegisters; ++i) {
            registers_[i] = std::max(registers_[i], other.registers_[i]);
        }
    }

    /**
     * @brief Hash a byte buffer to uint64_t (FNV-1a hash)
     *
     * FNV-1a is used instead of djb2 because djb2 doesn't distribute
     * upper bits well for strings with common prefixes (many strings map
     * to the same top bits, causing HLL register collisions).
     *
     * @param data Pointer to the bytes to hash (not dereferenced when len == 0)
     * @param len  Number of bytes to hash
     * @return 64-bit FNV-1a hash; the offset basis for an empty buffer
     */
    [[nodiscard]] static uint64_t hash_bytes(const void* data, size_t len) noexcept {
        // FNV-1a constants for 64-bit hash
        static constexpr uint64_t kFnvOffsetBasis = 14695981039346656037ULL;
        static constexpr uint64_t kFnvPrime = 1099511628211ULL;

        const uint8_t* bytes = static_cast<const uint8_t*>(data);
        uint64_t hash = kFnvOffsetBasis;
        for (size_t i = 0; i < len; ++i) {
            hash ^= bytes[i];
            hash *= kFnvPrime;
        }
        return hash;
    }

private:
    static constexpr uint64_t kMaxCardinality = UINT64_MAX;
    static constexpr int kRankBits = 64 - kIndexBits;  // hash bits left after the index

    std::array<uint8_t, kNumRegisters> registers_;
    int seed_;

    /**
     * @brief Count trailing zero bits in a 64-bit value
     * @return Number of trailing zero bits, or 64 when v == 0
     */
    [[nodiscard]] static int count_trailing_zeros(uint64_t v) noexcept {
        if (v == 0) {
            return 64;  // the builtin is undefined for 0
        }
#if defined(__GNUC__) || defined(__clang__)
        return __builtin_ctzll(v);
#else
        // Portable fallback for compilers without the GCC/Clang builtin.
        int count = 0;
        while ((v & 1U) == 0) {
            v >>= 1;
            ++count;
        }
        return count;
#endif
    }
};
165+
166+
} // namespace common
167+
} // namespace cloudsql

src/executor/query_executor.cpp

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
#include "catalog/catalog.hpp"
2222
#include "common/cluster_manager.hpp"
23+
#include "common/hll.hpp"
2324
#include "common/value.hpp"
2425
#include "distributed/raft_group.hpp"
2526
#include "distributed/raft_manager.hpp"
@@ -913,6 +914,133 @@ QueryResult QueryExecutor::execute_update(const parser::UpdateStatement& stmt,
913914
return result;
914915
}
915916

917+
<<<<<<< Updated upstream
918+
=======
919+
QueryResult QueryExecutor::execute_analyze(const parser::AnalyzeStatement& stmt) {
920+
QueryResult result;
921+
922+
auto table_meta_opt = catalog_.get_table_by_name(stmt.table_name());
923+
if (!table_meta_opt.has_value()) {
924+
result.set_error("Table not found: " + stmt.table_name());
925+
return result;
926+
}
927+
const auto* table_meta = table_meta_opt.value();
928+
929+
Schema schema;
930+
for (const auto& col : table_meta->columns) {
931+
schema.add_column(col.name, col.type);
932+
}
933+
934+
storage::HeapTable table(stmt.table_name(), bpm_, schema);
935+
936+
// Collect per-column stats by scanning the table (single pass)
937+
std::vector<ColumnInfo> col_stats(table_meta->columns.size());
938+
std::vector<common::HyperLogLog> ndv_estimators(table_meta->columns.size());
939+
940+
auto iter = table.scan();
941+
Tuple tuple;
942+
uint64_t row_count = 0;
943+
944+
while (iter.next(tuple)) {
945+
row_count++;
946+
for (size_t col_idx = 0; col_idx < table_meta->columns.size(); ++col_idx) {
947+
const auto& col_info = table_meta->columns[col_idx];
948+
const auto& val = tuple.get(col_idx);
949+
950+
if (val.is_null()) {
951+
col_stats[col_idx].null_count++;
952+
} else {
953+
// Collect NDV via HyperLogLog — memory-bounded vs unbounded unordered_set
954+
uint64_t hash = 0;
955+
if (col_info.type == common::ValueType::TYPE_TEXT ||
956+
col_info.type == common::ValueType::TYPE_VARCHAR ||
957+
col_info.type == common::ValueType::TYPE_CHAR) {
958+
// Use 64-char prefix for text hashing
959+
const std::string& s = val.as_text();
960+
size_t prefix_len = std::min(s.size(), size_t(64));
961+
hash = common::HyperLogLog::hash_bytes(s.data(), prefix_len);
962+
} else {
963+
// Use common::Value::Hash for numeric and other types
964+
hash = static_cast<uint64_t>(common::Value::Hash{}(val));
965+
}
966+
ndv_estimators[col_idx].insert(hash);
967+
968+
switch (col_info.type) {
969+
case common::ValueType::TYPE_INT64:
970+
case common::ValueType::TYPE_INT32:
971+
case common::ValueType::TYPE_INT16:
972+
case common::ValueType::TYPE_INT8:
973+
case common::ValueType::TYPE_BOOL: {
974+
int64_t v = val.to_int64();
975+
if (!col_stats[col_idx].min_int.has_value() ||
976+
v < col_stats[col_idx].min_int.value()) {
977+
col_stats[col_idx].min_int = v;
978+
}
979+
if (!col_stats[col_idx].max_int.has_value() ||
980+
v > col_stats[col_idx].max_int.value()) {
981+
col_stats[col_idx].max_int = v;
982+
}
983+
break;
984+
}
985+
case common::ValueType::TYPE_FLOAT64:
986+
case common::ValueType::TYPE_FLOAT32:
987+
case common::ValueType::TYPE_DECIMAL: {
988+
double v = val.to_float64();
989+
if (!col_stats[col_idx].min_double.has_value() ||
990+
v < col_stats[col_idx].min_double.value()) {
991+
col_stats[col_idx].min_double = v;
992+
}
993+
if (!col_stats[col_idx].max_double.has_value() ||
994+
v > col_stats[col_idx].max_double.value()) {
995+
col_stats[col_idx].max_double = v;
996+
}
997+
break;
998+
}
999+
case common::ValueType::TYPE_TEXT:
1000+
case common::ValueType::TYPE_VARCHAR:
1001+
case common::ValueType::TYPE_CHAR: {
1002+
const std::string& s = val.as_text();
1003+
uint64_t len = static_cast<uint64_t>(s.size());
1004+
if (!col_stats[col_idx].min_str_len.has_value() ||
1005+
len < col_stats[col_idx].min_str_len.value()) {
1006+
col_stats[col_idx].min_str_len = len;
1007+
}
1008+
if (!col_stats[col_idx].max_str_len.has_value() ||
1009+
len > col_stats[col_idx].max_str_len.value()) {
1010+
col_stats[col_idx].max_str_len = len;
1011+
}
1012+
break;
1013+
}
1014+
default:
1015+
break;
1016+
}
1017+
}
1018+
}
1019+
}
1020+
1021+
// Compute NDV from HLL estimators collected in single pass
1022+
for (size_t col_idx = 0; col_idx < table_meta->columns.size(); ++col_idx) {
1023+
col_stats[col_idx].ndv = ndv_estimators[col_idx].cardinality();
1024+
}
1025+
1026+
// Update table-level stats
1027+
catalog_.update_table_stats(table_meta->table_id, row_count);
1028+
1029+
// Update per-column stats
1030+
for (size_t col_idx = 0; col_idx < table_meta->columns.size(); ++col_idx) {
1031+
col_stats[col_idx].has_stats = true;
1032+
col_stats[col_idx].name = table_meta->columns[col_idx].name;
1033+
col_stats[col_idx].type = table_meta->columns[col_idx].type;
1034+
col_stats[col_idx].position = table_meta->columns[col_idx].position;
1035+
catalog_.update_column_stats(table_meta->table_id, table_meta->columns[col_idx].name,
1036+
col_stats[col_idx]);
1037+
}
1038+
1039+
result.set_rows_affected(1);
1040+
return result;
1041+
}
1042+
1043+
>>>>>>> Stashed changes
9161044
std::unique_ptr<Operator> QueryExecutor::build_plan(const parser::SelectStatement& stmt,
9171045
transaction::Transaction* txn) {
9181046
/* 1. Base: Initial table access (Sequential Scan or Index Scan) */

0 commit comments

Comments
 (0)