|
| 1 | +/** |
| 2 | + * @file hll.hpp |
| 3 | + * @brief HyperLogLog probabilistic cardinality estimator |
| 4 | + */ |
| 5 | + |
| 6 | +#pragma once |
| 7 | + |
| 8 | +#include <algorithm> |
| 9 | +#include <array> |
| 10 | +#include <cmath> |
| 11 | +#include <cstdint> |
| 12 | + |
| 13 | +namespace cloudsql { |
| 14 | +namespace common { |
| 15 | + |
/**
 * @brief HyperLogLog — memory-bounded NDV (number of distinct values) estimator
 *
 * Keeps kNumRegisters (2048) one-byte registers, i.e. 2 KiB of register state
 * regardless of how many items are inserted. The relative standard error of
 * the estimate is about 1.04 / sqrt(m), which is ≈ 2.3% for m = 2048.
 *
 * Algorithm (Flajolet, Fusy, Gandouet, Meunier: "HyperLogLog", 2007):
 * - The caller hashes each item to 64 bits (see hash_bytes()).
 * - Register index: the BOTTOM kIndexBits bits of the hash (p = 11, m = 2048).
 * - Register value: 1 + number of trailing zeros in the remaining upper bits,
 *   capped at kMaxRank; each register keeps the maximum value it has seen.
 * - Raw estimate: alpha_m * m^2 / sum_j 2^(-reg_j), where the sum runs over
 *   ALL registers (an empty register contributes 2^0 = 1).
 * - Small-range correction: when the raw estimate is <= 2.5 * m and V > 0
 *   registers are still empty, linear counting m * ln(m / V) is used instead.
 *
 * Because the hash is 64 bits wide, no large-range (hash collision)
 * correction is applied.
 */
class HyperLogLog {
 public:
    static constexpr size_t kNumRegisters = 2048;  // m = 2^11 registers
    static constexpr int kIndexBits = 11;          // p: hash bits used as register index

    static_assert(kNumRegisters == (size_t{1} << kIndexBits),
                  "kNumRegisters must equal 2^kIndexBits");

    // Small-range boundary from the HyperLogLog paper: when the raw estimate is
    // <= kBiasCorrectionBoundary * m (and at least one register is empty), the
    // raw estimator is biased and linear counting is used instead.
    static constexpr double kBiasCorrectionBoundary = 2.5;

    // The three constants below were tuning knobs of an earlier ad-hoc estimator.
    // They are no longer read by this class and are kept only so that existing
    // code referring to them keeps compiling.
    static constexpr double kPowBase = 2.0;
    static constexpr double kLinearCountingThreshold = 20.0;
    static constexpr double kBiasAdjustmentFactor = 10.0;

    /**
     * @brief Construct an empty sketch.
     * @param seed Value XOR-ed into every inserted hash. Sketches that will be
     *             merged must be built with the same seed.
     */
    explicit HyperLogLog(int seed = 0) noexcept : registers_{}, seed_(seed) {}

    /**
     * @brief Insert a pre-hashed 64-bit value.
     * @param hash Hash of the item; should be well mixed in all 64 bits.
     */
    void insert(uint64_t hash) noexcept {
        hash ^= static_cast<uint64_t>(seed_);

        // The low kIndexBits bits select the register; the mask keeps the
        // index strictly below kNumRegisters, so unchecked indexing is safe.
        const size_t idx = static_cast<size_t>(hash & (kNumRegisters - 1));

        // Rank = 1-based position of the lowest set bit in the remaining
        // (64 - kIndexBits) bits. An all-zero remainder gets the maximum rank.
        const uint64_t remaining = hash >> kIndexBits;
        const int rank = std::min(count_trailing_zeros(remaining) + 1, kMaxRank);

        const uint8_t new_val = static_cast<uint8_t>(rank);
        registers_[idx] = std::max(registers_[idx], new_val);
    }

    /**
     * @brief Estimate the number of distinct values inserted so far.
     * @return 0 for an empty sketch; otherwise the HyperLogLog estimate
     *         (linear counting for small cardinalities), truncated to an
     *         integer and saturated at UINT64_MAX.
     */
    [[nodiscard]] uint64_t cardinality() const noexcept {
        double harmonic_sum = 0.0;
        size_t empty_count = 0;
        for (const uint8_t reg : registers_) {
            // ldexp(1.0, -reg) is an exact 2^(-reg); empty registers add 1.0.
            harmonic_sum += std::ldexp(1.0, -static_cast<int>(reg));
            if (reg == 0) {
                ++empty_count;
            }
        }

        // Nothing inserted → cardinality 0
        if (empty_count == kNumRegisters) {
            return 0;
        }

        constexpr double m = static_cast<double>(kNumRegisters);
        double estimate = kAlpha * m * m / harmonic_sum;

        // Small-range correction: linear counting on the number of empty
        // registers. Note the NATURAL logarithm.
        if (estimate <= kBiasCorrectionBoundary * m && empty_count != 0) {
            estimate = m * std::log(m / static_cast<double>(empty_count));
        }

        // static_cast<double>(UINT64_MAX) rounds up to 2^64, which itself is
        // not representable as uint64_t, hence >= rather than >.
        if (estimate >= static_cast<double>(kMaxCardinality)) {
            return kMaxCardinality;
        }
        return static_cast<uint64_t>(estimate);
    }

    /**
     * @brief Reset all registers to zero (the seed is kept).
     */
    void reset() noexcept { registers_.fill(0); }

    /**
     * @brief Merge another sketch into this one (element-wise max of registers).
     *
     * The result estimates the cardinality of the union of both inputs. This
     * is only meaningful when both sketches were built with the same seed and
     * hash function; that precondition is not checked here.
     */
    void merge(const HyperLogLog& other) noexcept {
        std::transform(registers_.begin(), registers_.end(), other.registers_.begin(),
                       registers_.begin(),
                       [](uint8_t mine, uint8_t theirs) { return std::max(mine, theirs); });
    }

    /**
     * @brief Hash a byte buffer to uint64_t (64-bit FNV-1a).
     *
     * FNV-1a is used instead of djb2 because djb2 doesn't distribute
     * upper bits well for strings with common prefixes.
     *
     * @param data Pointer to the first byte; not dereferenced when len == 0.
     * @param len  Number of bytes to hash.
     * @return The FNV-1a hash; the FNV offset basis for an empty buffer.
     */
    [[nodiscard]] static uint64_t hash_bytes(const void* data, size_t len) noexcept {
        constexpr uint64_t kFnvOffsetBasis = 14695981039346656037ULL;
        constexpr uint64_t kFnvPrime = 1099511628211ULL;

        const uint8_t* bytes = static_cast<const uint8_t*>(data);
        uint64_t hash = kFnvOffsetBasis;
        for (size_t i = 0; i < len; ++i) {
            hash ^= bytes[i];
            hash *= kFnvPrime;
        }
        return hash;
    }

 private:
    static constexpr uint64_t kMaxCardinality = UINT64_MAX;

    // Largest register value: (64 - kIndexBits) rank bits, plus one.
    static constexpr int kMaxRank = 64 - kIndexBits + 1;

    // Bias-correction constant alpha_m from the HyperLogLog paper (m >= 128).
    static constexpr double kAlpha =
        0.7213 / (1.0 + 1.079 / static_cast<double>(kNumRegisters));

    std::array<uint8_t, kNumRegisters> registers_;
    int seed_;

    /**
     * @brief Count trailing zero bits in a 64-bit value.
     * @return Number of consecutive zero bits starting at bit 0; 64 when v == 0.
     */
    [[nodiscard]] static int count_trailing_zeros(uint64_t v) noexcept {
        if (v == 0) {
            return 64;
        }
#if defined(__GNUC__) || defined(__clang__)
        return __builtin_ctzll(v);
#else
        // Portable fallback for compilers without the GCC/Clang builtin.
        int count = 0;
        while ((v & 1U) == 0) {
            v >>= 1;
            ++count;
        }
        return count;
#endif
    }
};
| 173 | + |
| 174 | +} // namespace common |
| 175 | +} // namespace cloudsql |
0 commit comments