|
20 | 20 |
|
21 | 21 | #include "catalog/catalog.hpp" |
22 | 22 | #include "common/cluster_manager.hpp" |
| 23 | +#include "common/hll.hpp" |
23 | 24 | #include "common/value.hpp" |
24 | 25 | #include "distributed/raft_group.hpp" |
25 | 26 | #include "distributed/raft_manager.hpp" |
@@ -913,6 +914,133 @@ QueryResult QueryExecutor::execute_update(const parser::UpdateStatement& stmt, |
913 | 914 | return result; |
914 | 915 | } |
915 | 916 |
|
| 917 | +<<<<<<< Updated upstream |
| 918 | +======= |
| 919 | +QueryResult QueryExecutor::execute_analyze(const parser::AnalyzeStatement& stmt) { |
| 920 | + QueryResult result; |
| 921 | + |
| 922 | + auto table_meta_opt = catalog_.get_table_by_name(stmt.table_name()); |
| 923 | + if (!table_meta_opt.has_value()) { |
| 924 | + result.set_error("Table not found: " + stmt.table_name()); |
| 925 | + return result; |
| 926 | + } |
| 927 | + const auto* table_meta = table_meta_opt.value(); |
| 928 | + |
| 929 | + Schema schema; |
| 930 | + for (const auto& col : table_meta->columns) { |
| 931 | + schema.add_column(col.name, col.type); |
| 932 | + } |
| 933 | + |
| 934 | + storage::HeapTable table(stmt.table_name(), bpm_, schema); |
| 935 | + |
| 936 | + // Collect per-column stats by scanning the table (single pass) |
| 937 | + std::vector<ColumnInfo> col_stats(table_meta->columns.size()); |
| 938 | + std::vector<common::HyperLogLog> ndv_estimators(table_meta->columns.size()); |
| 939 | + |
| 940 | + auto iter = table.scan(); |
| 941 | + Tuple tuple; |
| 942 | + uint64_t row_count = 0; |
| 943 | + |
| 944 | + while (iter.next(tuple)) { |
| 945 | + row_count++; |
| 946 | + for (size_t col_idx = 0; col_idx < table_meta->columns.size(); ++col_idx) { |
| 947 | + const auto& col_info = table_meta->columns[col_idx]; |
| 948 | + const auto& val = tuple.get(col_idx); |
| 949 | + |
| 950 | + if (val.is_null()) { |
| 951 | + col_stats[col_idx].null_count++; |
| 952 | + } else { |
| 953 | + // Collect NDV via HyperLogLog — memory-bounded vs unbounded unordered_set |
| 954 | + uint64_t hash = 0; |
| 955 | + if (col_info.type == common::ValueType::TYPE_TEXT || |
| 956 | + col_info.type == common::ValueType::TYPE_VARCHAR || |
| 957 | + col_info.type == common::ValueType::TYPE_CHAR) { |
| 958 | + // Use 64-char prefix for text hashing |
| 959 | + const std::string& s = val.as_text(); |
| 960 | + size_t prefix_len = std::min(s.size(), size_t(64)); |
| 961 | + hash = common::HyperLogLog::hash_bytes(s.data(), prefix_len); |
| 962 | + } else { |
| 963 | + // Use common::Value::Hash for numeric and other types |
| 964 | + hash = static_cast<uint64_t>(common::Value::Hash{}(val)); |
| 965 | + } |
| 966 | + ndv_estimators[col_idx].insert(hash); |
| 967 | + |
| 968 | + switch (col_info.type) { |
| 969 | + case common::ValueType::TYPE_INT64: |
| 970 | + case common::ValueType::TYPE_INT32: |
| 971 | + case common::ValueType::TYPE_INT16: |
| 972 | + case common::ValueType::TYPE_INT8: |
| 973 | + case common::ValueType::TYPE_BOOL: { |
| 974 | + int64_t v = val.to_int64(); |
| 975 | + if (!col_stats[col_idx].min_int.has_value() || |
| 976 | + v < col_stats[col_idx].min_int.value()) { |
| 977 | + col_stats[col_idx].min_int = v; |
| 978 | + } |
| 979 | + if (!col_stats[col_idx].max_int.has_value() || |
| 980 | + v > col_stats[col_idx].max_int.value()) { |
| 981 | + col_stats[col_idx].max_int = v; |
| 982 | + } |
| 983 | + break; |
| 984 | + } |
| 985 | + case common::ValueType::TYPE_FLOAT64: |
| 986 | + case common::ValueType::TYPE_FLOAT32: |
| 987 | + case common::ValueType::TYPE_DECIMAL: { |
| 988 | + double v = val.to_float64(); |
| 989 | + if (!col_stats[col_idx].min_double.has_value() || |
| 990 | + v < col_stats[col_idx].min_double.value()) { |
| 991 | + col_stats[col_idx].min_double = v; |
| 992 | + } |
| 993 | + if (!col_stats[col_idx].max_double.has_value() || |
| 994 | + v > col_stats[col_idx].max_double.value()) { |
| 995 | + col_stats[col_idx].max_double = v; |
| 996 | + } |
| 997 | + break; |
| 998 | + } |
| 999 | + case common::ValueType::TYPE_TEXT: |
| 1000 | + case common::ValueType::TYPE_VARCHAR: |
| 1001 | + case common::ValueType::TYPE_CHAR: { |
| 1002 | + const std::string& s = val.as_text(); |
| 1003 | + uint64_t len = static_cast<uint64_t>(s.size()); |
| 1004 | + if (!col_stats[col_idx].min_str_len.has_value() || |
| 1005 | + len < col_stats[col_idx].min_str_len.value()) { |
| 1006 | + col_stats[col_idx].min_str_len = len; |
| 1007 | + } |
| 1008 | + if (!col_stats[col_idx].max_str_len.has_value() || |
| 1009 | + len > col_stats[col_idx].max_str_len.value()) { |
| 1010 | + col_stats[col_idx].max_str_len = len; |
| 1011 | + } |
| 1012 | + break; |
| 1013 | + } |
| 1014 | + default: |
| 1015 | + break; |
| 1016 | + } |
| 1017 | + } |
| 1018 | + } |
| 1019 | + } |
| 1020 | + |
| 1021 | + // Compute NDV from HLL estimators collected in single pass |
| 1022 | + for (size_t col_idx = 0; col_idx < table_meta->columns.size(); ++col_idx) { |
| 1023 | + col_stats[col_idx].ndv = ndv_estimators[col_idx].cardinality(); |
| 1024 | + } |
| 1025 | + |
| 1026 | + // Update table-level stats |
| 1027 | + catalog_.update_table_stats(table_meta->table_id, row_count); |
| 1028 | + |
| 1029 | + // Update per-column stats |
| 1030 | + for (size_t col_idx = 0; col_idx < table_meta->columns.size(); ++col_idx) { |
| 1031 | + col_stats[col_idx].has_stats = true; |
| 1032 | + col_stats[col_idx].name = table_meta->columns[col_idx].name; |
| 1033 | + col_stats[col_idx].type = table_meta->columns[col_idx].type; |
| 1034 | + col_stats[col_idx].position = table_meta->columns[col_idx].position; |
| 1035 | + catalog_.update_column_stats(table_meta->table_id, table_meta->columns[col_idx].name, |
| 1036 | + col_stats[col_idx]); |
| 1037 | + } |
| 1038 | + |
| 1039 | + result.set_rows_affected(1); |
| 1040 | + return result; |
| 1041 | +} |
| 1042 | + |
| 1043 | +>>>>>>> Stashed changes |
916 | 1044 | std::unique_ptr<Operator> QueryExecutor::build_plan(const parser::SelectStatement& stmt, |
917 | 1045 | transaction::Transaction* txn) { |
918 | 1046 | /* 1. Base: Initial table access (Sequential Scan or Index Scan) */ |
|
0 commit comments