|
| 1 | +/* |
| 2 | + * Copyright (c) Meta Platforms, Inc. and affiliates. |
| 3 | + * All rights reserved. |
| 4 | + * |
| 5 | + * This source code is licensed under the BSD-style license found in the |
| 6 | + * LICENSE file in the root directory of this source tree. |
| 7 | + */ |
| 8 | + |
| 9 | +// GPU no-bleed integration proof for the CUDA V2 per-session mutable-state |
| 10 | +// rebind -- the REAL guard for mutable-buffer completeness (an under-declared |
| 11 | +// buffer would be shared across sessions; only behavior catches that, not the |
| 12 | +// declared-subset-of-discovered bookkeeping check). This is the automated form |
| 13 | +// of the manual "A solo / A inter" proof in the V2 foundation commit. |
| 14 | +// |
| 15 | +// CRITICAL: sessions are interleaved at EXECUTE granularity (A prefill, B |
| 16 | +// prefill, A decode, B decode, ...). The mechanism under test is the |
| 17 | +// per-execute rebind, so running A-to-completion then B would pass even with a |
| 18 | +// broken rebind. |
| 19 | +// |
| 20 | +// GPU-gated: requires a CUDA device + an exported model. Set QWEN_MODEL_PATH, |
| 21 | +// QWEN_DATA_PATH, QWEN_TOKENIZER_PATH. Skips cleanly (exit 0) if unset or the |
| 22 | +// engine cannot be created (no device) -- so it is safe in CI; the real run is |
| 23 | +// the nightly/manual GPU job. |
| 24 | + |
| 25 | +#include <executorch/examples/models/qwen3_5_moe/qwen35_moe_engine.h> |
| 26 | + |
| 27 | +#include <executorch/backends/cuda/runtime/cuda_mutable_state.h> |
| 28 | + |
| 29 | +#include <cstdint> |
| 30 | +#include <cstdio> |
| 31 | +#include <cstdlib> |
| 32 | +#include <memory> |
| 33 | +#include <string> |
| 34 | +#include <vector> |
| 35 | + |
| 36 | +#include <cuda_runtime.h> |
| 37 | + |
| 38 | +namespace llm = ::executorch::extension::llm; |
| 39 | +using ::executorch::runtime::Error; |
| 40 | + |
| 41 | +namespace { |
| 42 | +int g_failures = 0; |
| 43 | +void check(const char* name, bool ok) { |
| 44 | + printf(" [%s] %s\n", ok ? "PASS" : "FAIL", name); |
| 45 | + if (!ok) { |
| 46 | + ++g_failures; |
| 47 | + } |
| 48 | +} |
| 49 | + |
| 50 | +const char* env(const char* k) { |
| 51 | + const char* v = std::getenv(k); |
| 52 | + return (v && *v) ? v : nullptr; |
| 53 | +} |
| 54 | + |
| 55 | +std::vector<uint64_t> encode(llm::Qwen35MoEEngine& e, const std::string& s) { |
| 56 | + auto r = e.tokenizer()->encode(s); |
| 57 | + return r.ok() ? std::move(*r) : std::vector<uint64_t>{}; |
| 58 | +} |
| 59 | + |
| 60 | +// Greedy-decode `n` non-terminal tokens from a freshly-prefilled session. |
| 61 | +std::vector<uint64_t> |
| 62 | +solo_decode(llm::LLMSession& s, std::vector<uint64_t> prompt, int n) { |
| 63 | + llm::SamplingConfig samp; // temperature -1 => greedy/argmax (deterministic) |
| 64 | + std::vector<uint64_t> out; |
| 65 | + if (s.prefill_tokens(prompt, &samp) != Error::Ok) { |
| 66 | + return out; |
| 67 | + } |
| 68 | + for (int i = 0; i < n; ++i) { |
| 69 | + auto r = s.decode_one(samp); |
| 70 | + if (r.error() != Error::Ok || r.get().is_terminal) { |
| 71 | + break; |
| 72 | + } |
| 73 | + out.push_back(r.get().token_id); |
| 74 | + } |
| 75 | + return out; |
| 76 | +} |
| 77 | + |
| 78 | +int64_t gpu_free() { |
| 79 | + size_t free = 0, total = 0; |
| 80 | + return cudaMemGetInfo(&free, &total) == cudaSuccess |
| 81 | + ? static_cast<int64_t>(free) |
| 82 | + : -1; |
| 83 | +} |
| 84 | + |
| 85 | +// GPU-FREE fall-closed DEFAULTS of cuda_mutable_state (no device, no handle). |
| 86 | +// Covers only the safety defaults -- the descriptor build, positive coverage, |
| 87 | +// bytes_per_session sum, and symbols_available AND-fold are exercised |
| 88 | +// BEHAVIORALLY by the no-bleed integration test below (the real guard); a |
| 89 | +// GPU-free unit test of those branches would need a build_descriptors allocator |
| 90 | +// seam / fake-handle harness and is a knowingly-deferred follow-up. |
| 91 | +namespace cu = ::executorch::backends::cuda; |
| 92 | +void test_mutable_state_fallclosed_defaults() { |
| 93 | + printf("cuda_mutable_state fall-closed defaults (GPU-free):\n"); |
| 94 | + const cu::MutableStateContext bad = 999999; // never created |
| 95 | + cu::MutableStateContext c1 = cu::mutable_state_create_context(); |
| 96 | + cu::MutableStateContext c2 = cu::mutable_state_create_context(); |
| 97 | + check("context ids are distinct/monotonic", c2 > c1); |
| 98 | + check( |
| 99 | + "fresh context: rebinding unavailable (no handle)", |
| 100 | + !cu::mutable_state_available(c1)); |
| 101 | + check( |
| 102 | + "bytes_per_session: 0 for fresh and unknown", |
| 103 | + cu::mutable_state_bytes_per_session(c1) == 0 && |
| 104 | + cu::mutable_state_bytes_per_session(bad) == 0); |
| 105 | + check( |
| 106 | + "validate_coverage: unknown ctx -> InvalidArgument", |
| 107 | + cu::mutable_state_validate_coverage(bad) == Error::InvalidArgument); |
| 108 | + check( |
| 109 | + "validate_coverage: no symbols -> NotSupported (fall closed)", |
| 110 | + cu::mutable_state_validate_coverage(c1) == Error::NotSupported); |
| 111 | + // Declaring FQNs without symbols still falls closed (the check is gated on |
| 112 | + // symbols, so it never wrongly passes coverage with nothing discovered). |
| 113 | + cu::mutable_state_register_fqns(c1, {"a.b", "c.d"}); |
| 114 | + check( |
| 115 | + "validate_coverage: declared-but-no-symbols still NotSupported", |
| 116 | + cu::mutable_state_validate_coverage(c1) == Error::NotSupported); |
| 117 | + check( |
| 118 | + "create_session: unknown ctx -> InvalidArgument", |
| 119 | + cu::mutable_state_create_session(bad).error() == Error::InvalidArgument); |
| 120 | + check( |
| 121 | + "create_session: no symbols -> NotSupported", |
| 122 | + cu::mutable_state_create_session(c1).error() == Error::NotSupported); |
| 123 | + cu::mutable_state_destroy_session(bad, 0); // no-op, must not crash |
| 124 | + cu::mutable_state_destroy_context(bad); // no-op, must not crash |
| 125 | + cu::mutable_state_destroy_context(c1); |
| 126 | + cu::mutable_state_destroy_context(c2); |
| 127 | + check("destroy of unknown ctx/session is a safe no-op", true); |
| 128 | +} |
| 129 | + |
| 130 | +} // namespace |
| 131 | + |
| 132 | +int main() { |
| 133 | + // GPU-free fall-closed defaults always run (even when the integration part |
| 134 | + // skips for lack of a device). |
| 135 | + test_mutable_state_fallclosed_defaults(); |
| 136 | + |
| 137 | + const char* model = env("QWEN_MODEL_PATH"); |
| 138 | + const char* tok = env("QWEN_TOKENIZER_PATH"); |
| 139 | + if (!model || !tok) { |
| 140 | + printf( |
| 141 | + "SKIP: integration proof needs QWEN_MODEL_PATH / QWEN_TOKENIZER_PATH " |
| 142 | + "(+ QWEN_DATA_PATH) on a CUDA box.\n"); |
| 143 | + return g_failures ? 1 : 0; |
| 144 | + } |
| 145 | + llm::Qwen35MoEConfig config; |
| 146 | + config.model_path = model; |
| 147 | + config.data_path = env("QWEN_DATA_PATH") ? env("QWEN_DATA_PATH") : ""; |
| 148 | + config.tokenizer_path = tok; |
| 149 | + config.max_sessions = 4; |
| 150 | + |
| 151 | + auto engine_r = llm::Qwen35MoEEngine::create(config); |
| 152 | + if (engine_r.error() != Error::Ok) { |
| 153 | + printf("SKIP: engine create failed (no CUDA device / bad paths).\n"); |
| 154 | + return 0; |
| 155 | + } |
| 156 | + auto engine = std::move(engine_r.get()); |
| 157 | + printf("no-bleed integration proof:\n"); |
| 158 | + |
| 159 | + const int kN = 24; |
| 160 | + auto prompt_a = encode(*engine, "List three colors:"); |
| 161 | + auto prompt_b = |
| 162 | + encode(*engine, "Name two countries in Europe and explain why."); |
| 163 | + check("prompts encoded", !prompt_a.empty() && !prompt_b.empty()); |
| 164 | + |
| 165 | + // (1) Session A solo -> baseline greedy ids. |
| 166 | + auto sa_r = engine->create_session(); |
| 167 | + check("create session A", sa_r.error() == Error::Ok); |
| 168 | + std::vector<uint64_t> ids_solo; |
| 169 | + if (sa_r.error() == Error::Ok) { |
| 170 | + auto sa = std::move(sa_r.get()); |
| 171 | + ids_solo = solo_decode(*sa, prompt_a, kN); |
| 172 | + } |
| 173 | + check("solo produced tokens", !ids_solo.empty()); |
| 174 | + |
| 175 | + // (2) A2 and B interleaved at EXECUTE granularity. |
| 176 | + auto a2_r = engine->create_session(); |
| 177 | + auto b_r = engine->create_session(); |
| 178 | + check("create A2 + B", a2_r.error() == Error::Ok && b_r.error() == Error::Ok); |
| 179 | + std::vector<uint64_t> ids_a2, ids_b; |
| 180 | + if (a2_r.error() == Error::Ok && b_r.error() == Error::Ok) { |
| 181 | + auto a2 = std::move(a2_r.get()); |
| 182 | + auto b = std::move(b_r.get()); |
| 183 | + llm::SamplingConfig samp; |
| 184 | + bool ok = a2->prefill_tokens(prompt_a, &samp) == Error::Ok && // A prefill |
| 185 | + b->prefill_tokens(prompt_b, &samp) == Error::Ok; // then B prefill |
| 186 | + check("interleaved prefills", ok); |
| 187 | + bool a_done = false, b_done = false; |
| 188 | + for (int i = 0; i < kN && ok; ++i) { |
| 189 | + if (!a_done) { // A decode |
| 190 | + auto r = a2->decode_one(samp); |
| 191 | + if (r.error() != Error::Ok || r.get().is_terminal) { |
| 192 | + a_done = true; |
| 193 | + } else { |
| 194 | + ids_a2.push_back(r.get().token_id); |
| 195 | + } |
| 196 | + } |
| 197 | + if (!b_done) { // B decode (between A's steps) |
| 198 | + auto r = b->decode_one(samp); |
| 199 | + if (r.error() != Error::Ok || r.get().is_terminal) { |
| 200 | + b_done = true; |
| 201 | + } else { |
| 202 | + ids_b.push_back(r.get().token_id); |
| 203 | + } |
| 204 | + } |
| 205 | + } |
| 206 | + } |
| 207 | + |
| 208 | + // THE no-bleed assertion: A's interleaved output is bit-identical to A solo |
| 209 | + // (greedy is deterministic), so B's interleaved session state did not corrupt |
| 210 | + // A's -- i.e. each session's mutable buffers are truly isolated. |
| 211 | + check( |
| 212 | + "no bleed: A interleaved == A solo (bit-identical)", ids_a2 == ids_solo); |
| 213 | + // Sanity that B actually ran a different conversation (else the test is |
| 214 | + // vacuous). |
| 215 | + check("B ran a distinct conversation", !ids_b.empty() && ids_b != ids_solo); |
| 216 | + |
| 217 | + // (3) Per-extra-session memory is STATE-sized, not a second model load. |
| 218 | + // Per-session buffers are allocated LAZILY on first execute (rebind), not at |
| 219 | + // create_session(), so measure the free-memory delta around a fresh session's |
| 220 | + // first prefill. |
| 221 | + const int64_t est = engine->serving_capacity().estimated_bytes_per_session; |
| 222 | + { |
| 223 | + int64_t free_before = gpu_free(); |
| 224 | + auto extra_r = engine->create_session(); |
| 225 | + if (extra_r.error() == Error::Ok) { |
| 226 | + auto extra = std::move(extra_r.get()); |
| 227 | + llm::SamplingConfig samp; |
| 228 | + extra->prefill_tokens( |
| 229 | + prompt_a, &samp); // first execute -> allocates state |
| 230 | + int64_t free_after = gpu_free(); |
| 231 | + if (free_before > 0 && free_after > 0) { |
| 232 | + const int64_t delta = free_before - free_after; |
| 233 | + printf( |
| 234 | + " extra-session GPU delta=%lld bytes (est/session=%lld)\n", |
| 235 | + (long long)delta, |
| 236 | + (long long)est); |
| 237 | + check( |
| 238 | + "extra session is state-sized (>0, < 4 GB, not an ~18 GB reload)", |
| 239 | + delta > 0 && delta < (4LL << 30)); |
| 240 | + if (est > 0) { |
| 241 | + check( |
| 242 | + "memory delta within 2x of estimated_bytes_per_session", |
| 243 | + delta <= est * 2 + (256LL << 20)); |
| 244 | + } |
| 245 | + } |
| 246 | + } |
| 247 | + } // extra released here -> frees its slot before the capacity test |
| 248 | + |
| 249 | + // (4) Capacity: the (max_sessions+1)th create_session fails (no silent |
| 250 | + // share). The sessions above already hold slots; create up to capacity then |
| 251 | + // one more. |
| 252 | + std::vector<std::unique_ptr<llm::LLMSession>> held; |
| 253 | + while (true) { |
| 254 | + auto r = engine->create_session(); |
| 255 | + if (r.error() != Error::Ok) { |
| 256 | + break; |
| 257 | + } |
| 258 | + held.push_back(std::move(r.get())); |
| 259 | + if (held.size() > (size_t)config.max_sessions + 2) { |
| 260 | + break; // guard against a non-enforcing backend |
| 261 | + } |
| 262 | + } |
| 263 | + check( |
| 264 | + "capacity enforced: create_session fails past max_sessions", |
| 265 | + held.size() <= (size_t)config.max_sessions); |
| 266 | + |
| 267 | + printf( |
| 268 | + "\n%s (%d failure(s))\n", |
| 269 | + g_failures ? "FAILURES" : "ALL PASS", |
| 270 | + g_failures); |
| 271 | + return g_failures ? 1 : 0; |
| 272 | +} |
0 commit comments