Chia-Network · richardkiss · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 30, 2026
diff --git a/benches/deserialize.rs b/benches/deserialize.rs
@@ -4,6 +4,9 @@ use clvmr::serde::{
     node_to_bytes_backrefs, serialized_length_from_bytes, serialized_length_from_bytes_trusted,
     tree_hash_from_stream,
 };
+use clvmr::serde_2026::{deserialize_2026_body, serialize_2026};
+
+const BENCH_MAX_ATOM_LEN: usize = 1 << 20;
 use criterion::{Criterion, criterion_group, criterion_main};
 use std::include_bytes;
 use std::time::Instant;
@@ -76,6 +79,34 @@ fn deserialize_benchmark(c: &mut Criterion) {
         });
     }
 
+    {
+        let mut a = Allocator::new();
+        let node = node_from_bytes(&mut a, block).expect("node_from_bytes");
+        let serialized_2026 = serialize_2026(&a, node).expect("serialize_2026");
+
+        let mut a = Allocator::new();
+        let node = node_from_bytes_backrefs(&mut a, compressed_block.as_ref())
+            .expect("node_from_bytes_backrefs");
+        let serialized_2026_compressed = serialize_2026(&a, node).expect("serialize_2026");
+
+        for (data, name_suffix) in &[
+            (serialized_2026.as_slice(), ""),
+            (serialized_2026_compressed.as_slice(), "-compressed-src"),
+        ] {
+            let mut a = Allocator::new();
+            let iter_checkpoint = a.checkpoint();
+            group.bench_function(format!("deserialize_2026_body{name_suffix}"), |b| {
+                b.iter(|| {
+                    a.restore_checkpoint(&iter_checkpoint);
+                    let start = Instant::now();
+                    deserialize_2026_body(&mut a, data, BENCH_MAX_ATOM_LEN, false)
+                        .expect("deserialize_2026_body");
+                    start.elapsed()
+                })
+            });
+        }
+    }
+
     let mut a = Allocator::new();
     let iter_checkpoint = a.checkpoint();
     group.bench_function("node_from_bytes", |b| {

diff --git a/benches/serialize.rs b/benches/serialize.rs
@@ -3,6 +3,7 @@ use clvmr::serde::{
     Serializer, node_from_bytes, node_from_bytes_backrefs, node_to_bytes_backrefs,
     node_to_bytes_limit,
 };
+use clvmr::serde_2026::serialize_2026;
 use criterion::black_box;
 use criterion::{Criterion, criterion_group, criterion_main};
 use std::include_bytes;
@@ -56,6 +57,14 @@ fn serialize_benchmark(c: &mut Criterion) {
                 start.elapsed()
             })
         });
+
+        group.bench_function(format!("serialize_2026 {name}"), |b| {
+            b.iter(|| {
+                let start = Instant::now();
+                black_box(serialize_2026(&a, node).expect("serialize_2026"));
+                start.elapsed()
+            })
+        });
     }
 
     group.finish();

diff --git a/docs/serde-2026.md b/docs/serde-2026.md
@@ -0,0 +1,145 @@
+# 2026 Serialization Format
+
+## Magic Header
+
+Every 2026-format blob begins with a 6-byte magic prefix:
+
+```
+0xfd 0xff 0x32 0x30 0x32 0x36
+```
+
+Rationale:
+
+- `0xfd 0xff` drives legacy/backrefs decoders down an invalid atom-size path,
+  causing immediate failure.
+- `0x32 0x30 0x32 0x36` is ASCII `"2026"` for readable hexdumps.
+
+### Detection
+
+The magic prefix allows helper APIs to distinguish 2026-format blobs from
+legacy/backrefs blobs:
+
+- If the blob starts with `0xfd 0xff 0x32 0x30 0x32 0x36`, it is 2026-format.
+- Otherwise, parse with the legacy/backrefs path.
+
+Consensus callers do not need to rely on auto-detection. They can select the
+expected format from fork height or consensus flags and call the corresponding
+deserializer directly.
+
+### Backward Compatibility
+
+When a 2026-format blob is handed to a legacy deserializer unaware of the new
+format, it should fail quickly due to the deliberately invalid size prefix.
+
+## Payload Format
+
+After the 6-byte magic header, the payload consists of two sections:
+
+1. **Atom table** — all unique atoms (except nil), grouped by length
+2. **Instruction stream** — stack-based operations to reconstruct the tree
+
+### Atom Table
+
+Nil (the empty atom) is **not** included in the atom table — it has a dedicated
+opcode (`0`) in the instruction stream.
+
+The atom table begins with a varint encoding the number of atom groups.
+
+For each group (in stream order):
+
+- If the group contains **one** atom: a positive varint encoding the atom's byte
+  length, followed by the atom's raw bytes.
+- If the group contains **multiple** atoms of the same length: a negative varint
+  encoding the negated byte length, then a positive varint encoding the count,
+  then the raw bytes of each atom concatenated (each is exactly `length` bytes).
+
+Atom lengths must be non-zero because nil is excluded from the atom table.
+Deserializers enforce a configurable maximum atom length (default: 1 MiB) and a
+maximum input byte budget (default: 10 MiB). Separate atom-group, atom-count,
+instruction-count, stack-size, and pair-count limits are not needed for DoS
+protection: every declared item must consume at least one input byte before it
+can produce parser work or allocate a CLVM node. The input byte budget therefore
+bounds all of those quantities.
+
+Atoms are assigned indices starting from 0, in the order they appear in the
+table.
+
+The decoder accepts groups in any order. Multiple groups with the same byte
+length are valid (they contribute separate atom indices). A serializer may
+choose a specific ordering strategy (for example, sorting by frequency so
+commonly-referenced atoms land in lower index ranges whose varint encodings are
+shorter).
+
+### Instruction Stream
+
+The instruction stream begins with a varint encoding the total number of
+instructions.
+
+Each instruction is a varint:
+
+| Varint value | Meaning                                                          |
+| ------------ | ---------------------------------------------------------------- |
+| `0`          | Push nil (the empty atom)                                        |
+| `1`          | Pop two items (left was pushed first), cons them, push the pair  |
+| `-1`         | Pop two items (right was pushed first), cons them, push the pair |
+| `N >= 2`     | Push the atom at index N-2 onto the stack                        |
+| `N <= -2`    | Push the already-constructed pair at index -N-2 onto the stack   |
+
+Pairs are indexed in construction order (the first pair cons'd is index 0, the
+second is index 1, etc.). An instruction `N <= -2` references a pair that was
+previously constructed during this same decode, enabling shared sub-trees
+without re-encoding them.
+
+The current serializer emits left-first cons instructions (`1`). Decoders accept
+right-first cons instructions (`-1`) so future serializers can choose different
+pair visit orders without changing the wire format.
+
+After all instructions execute, the stack must contain exactly one item: the
+root node.
+
+### Varint Encoding
+
+Signed integers are encoded with a variable-length prefix scheme:
+
+```
+0xxxxxxx                          →  7-bit value, range [-64, 63]
+10xxxxxx xxxxxxxx                 → 14-bit value, range [-8192, 8191]
+110xxxxx xxxxxxxx xxxxxxxx        → 21-bit value, range [-1048576, 1048575]
+1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx
+                                   → 28-bit value
+11110xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
+                                   → 35-bit value
+111110xx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
+                                   → 42-bit value
+1111110x xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
+                                   → 49-bit value
+11111110 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
+                                   → 56-bit value
+```
+
+The number of leading `1` bits determines how many additional bytes follow,
+similar to UTF-8 prefix-length coding. A `0` separator bit follows the leading
+`1`s. The remaining bits (across all bytes) form a two's-complement signed
+integer in big-endian order.
+
+A prefix of 8 leading `1` bits (`0xFF`) is invalid.
+
+The deserializer has a `strict` mode that rejects overlong varint encodings. In
+strict mode, every varint must use the shortest encoding that can represent its
+value. Lenient mode accepts overlong encodings for tooling/backward-compatible
+parsing.
+
+## Size Bound
+
+For the current instruction-stream format, the analysis in
+`generator-identity-hf-analysis/docs/SERDE2026_UPPER_BOUND.md` proves:
+
+```
+serde_2026_bytes <= atom_bytes + 2 * unique_atoms + 3 * unique_pairs + 5
+```
+
+assuming all atom lengths fit in a 4-byte varint (`length <= 2^27 - 1`). This
+condition is far weaker than the default 1 MiB atom limit. Because the hard fork
+cost formula charges this same size component, consensus callers can derive
+their accepted serde_2026 byte budget from the block cost limit (11 billion)
+instead of choosing an arbitrary message-size cap.
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
@@ -148,3 +148,15 @@ name = "modpow"
 path = "fuzz_targets/modpow.rs"
 test = false
 doc = false
+
+[[bin]]
+name = "serde-2026"
+path = "fuzz_targets/serde_2026.rs"
+test = false
+doc = false
+
+[[bin]]
+name = "serde-2026-varint"
+path = "fuzz_targets/serde_2026_varint.rs"
+test = false
+doc = false
diff --git a/fuzz/fuzz_targets/serde_2026.rs b/fuzz/fuzz_targets/serde_2026.rs
@@ -0,0 +1,79 @@
+#![no_main]
+
+// Fuzz target for the 2026 serialization format. Input is either raw bytes fed
+// to the decoder or a generated CLVM tree; every tree obtained is serialized
+// and must decode back to an identical tree.
+
+use clvm_fuzzing::ArbitraryClvmTree;
+use clvmr::serde::node_to_bytes;
+use clvmr::serde_2026::{deserialize_2026_body, serialize_2026_level};
+use clvmr::{Allocator, allocator::NodePtr};
+use libfuzzer_sys::{Corpus, fuzz_target};
+
+/// Maximum atom length handed to the decoder (1 MiB).
+const FUZZ_MAX_ATOM_LEN: usize = 1 << 20;
+
+/// The two shapes of input this target accepts.
+#[derive(arbitrary::Arbitrary, Debug)]
+enum FuzzInput {
+    /// Arbitrary bytes passed straight to the 2026 decoder.
+    Bytes(Vec<u8>),
+    /// A generated CLVM tree to serialize and decode again.
+    Tree(Box<ArbitraryClvmTree<10_000, true>>),
+}
+
+/// Serializes `node` with `node_to_bytes` so that trees held in different
+/// allocators can be compared byte-for-byte.
+// NOTE(review): presumably this writes shared sub-trees out in full; confirm a
+// small blob with many pair references cannot make the output excessively large.
+fn canonical(a: &Allocator, node: NodePtr) -> Vec<u8> {
+    node_to_bytes(a, node).expect("node_to_bytes failed")
+}
+
+/// Decodes `blob` into a fresh allocator and asserts that the result matches
+/// `original` (which lives in `a`). `label` prefixes any panic message.
+fn roundtrip_check(label: &str, a: &Allocator, original: NodePtr, blob: &[u8]) {
+    let mut a2 = Allocator::new();
+    let decoded = deserialize_2026_body(&mut a2, blob, FUZZ_MAX_ATOM_LEN, false)
+        .unwrap_or_else(|e| panic!("{label}: deserialize failed: {e:?}"));
+    assert_eq!(
+        canonical(a, original),
+        canonical(&a2, decoded),
+        "{label}: tree mismatch"
+    );
+}
+
+/// Serializes `node` with every strategy from `serialization_strategies` and
+/// checks that each resulting blob decodes back to the same tree.
+fn check_tree(a: &Allocator, node: NodePtr) {
+    for (label, level) in serialization_strategies() {
+        // Include the serializer's error in the panic so failures are diagnosable.
+        let blob = serialize_2026_level(a, node, level)
+            .unwrap_or_else(|e| panic!("{label} failed: {e:?}"));
+        roundtrip_check(label, a, node, &blob);
+    }
+}
+
+/// Yields the `(label, level)` pairs exercised by `check_tree`; currently only
+/// level 0, labelled "fast".
+fn serialization_strategies() -> impl Iterator<Item = (&'static str, u32)> {
+    std::iter::once(("fast", 0))
+}
+
+fuzz_target!(|input: FuzzInput| -> Corpus {
+    match input {
+        FuzzInput::Bytes(data) => {
+            let mut a = Allocator::new();
+            // Inputs the decoder rejects are kept out of the corpus.
+            let Ok(node) = deserialize_2026_body(&mut a, &data, FUZZ_MAX_ATOM_LEN, false) else {
+                return Corpus::Reject;
+            };
+            check_tree(&a, node);
+        }
+        // `check_tree` already performs the level-0 serialize/decode/compare round
+        // trip, so no separate pass is needed for generated trees.
+        FuzzInput::Tree(program) => check_tree(&program.allocator, program.tree),
+    }
+
+    Corpus::Keep
+});
diff --git a/fuzz/fuzz_targets/serde_2026_varint.rs b/fuzz/fuzz_targets/serde_2026_varint.rs
@@ -0,0 +1,58 @@
+#![no_main]
+
+use clvmr::serde_2026::{decode_varint, encode_varint};
+use libfuzzer_sys::fuzz_target;
+use std::io::Cursor;
+
+// Cross-checks `decode_varint` in non-strict and strict mode against
+// `encode_varint` on arbitrary input bytes.
+fuzz_target!(|data: &[u8]| {
+    // Non-strict decode must never panic on arbitrary input.
+    let mut cur = Cursor::new(data);
+    let non_strict = decode_varint(&mut cur, false);
+
+    if let Ok(v) = non_strict {
+        // Number of input bytes the non-strict decoder read for this varint.
+        let consumed = cur.position() as usize;
+
+        // The canonical (minimal) encoding must:
+        //  - decode in strict mode to the same value
+        //  - never be longer than the bytes we just consumed
+        //    (strict-mode rejections are exactly the "overlong" case)
+        let canonical = encode_varint(v);
+        assert!(
+            canonical.len() <= consumed,
+            "canonical encoding of {v} is {} bytes but input took {consumed}",
+            canonical.len(),
+        );
+        let strict_canonical = decode_varint(&mut Cursor::new(&canonical[..]), true)
+            .expect("canonical encoding must decode under strict");
+        assert_eq!(v, strict_canonical, "canonical roundtrip mismatch");
+
+        // Strict-mode decode of the original input either matches the
+        // non-strict value (canonical input) or errors (overlong / non-canonical).
+        match decode_varint(&mut Cursor::new(data), true) {
+            Ok(strict_v) => {
+                assert_eq!(strict_v, v, "strict and non-strict disagree");
+                // Strict mode exists to reject overlong encodings, so anything it
+                // accepts must be exactly as long as the canonical form.
+                assert_eq!(
+                    canonical.len(),
+                    consumed,
+                    "strict accepted an overlong encoding of {v}"
+                );
+            }
+            Err(_) => {
+                // Overlong-but-valid-as-non-strict encoding. Confirm it really is
+                // non-canonical: the canonical form must be shorter.
+                assert!(
+                    canonical.len() < consumed,
+                    "strict rejected a canonical-length encoding"
+                );
+            }
+        }
+    } else {
+        // If non-strict failed, strict must also fail (strict is stricter).
+        assert!(decode_varint(&mut Cursor::new(data), true).is_err());
+    }
+});
diff --git a/src/lib.rs b/src/lib.rs
@@ -15,6 +15,7 @@ pub mod run_program;
 pub mod runtime_dialect;
 pub mod secp_ops;
 pub mod serde;
+pub mod serde_2026;
 pub mod sha_tree_op;
 pub mod traverse_path;
 pub mod treehash;

diff --git a/src/serde/mod.rs b/src/serde/mod.rs
@@ -41,3 +41,10 @@ pub use tools::{
     tree_hash_from_stream,
 };
 pub use tree_cache::{TreeCache, TreeCacheCheckpoint};
+
+pub use crate::serde_2026::{
+    SERDE_2026_MAGIC_PREFIX, deserialize_2026_body, deserialize_2026_body_from_stream,
+    node_from_bytes_serde_2026, node_to_bytes_serde_2026, node_to_bytes_serde_2026_level,
+    serialize_2026, serialize_2026_level, serialize_2026_to_stream, serialize_2026_to_stream_level,
+    serialized_length_serde_2026,
+};