Tencent · nihui · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -72,6 +72,7 @@ option(NCNN_THREADS "build with threads" ON)
 option(NCNN_BENCHMARK "print benchmark information for every layer" OFF)
 option(NCNN_C_API "build with C api" ON)
 option(NCNN_PLATFORM_API "build with platform api candy" ON)
+option(NCNN_BATCH "batch inference support" ON)
 option(NCNN_WINXP "build with windows xp compatibility" OFF)
 option(NCNN_PIXEL "convert and resize from/to image pixel" ON)
 option(NCNN_PIXEL_ROTATE "rotate image pixel orientation" ON)

diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake
@@ -133,6 +133,13 @@ macro(ncnn_add_layer class)
         file(GLOB NCNN_SHADER_SRCS "layer/vulkan/shader/${name}.comp")
         file(GLOB NCNN_SHADER_SUBSRCS "layer/vulkan/shader/${name}_*.comp")
         list(APPEND NCNN_SHADER_SRCS ${NCNN_SHADER_SUBSRCS})
+        if(NOT NCNN_BATCH AND name STREQUAL "reshape")
+            # drop the reshape batch reorder shaders from the list when batch support is off
+            set(_reshape_shader_dir ${CMAKE_CURRENT_SOURCE_DIR}/layer/vulkan/shader)
+            foreach(_variant reorder reorder_pack1to4 reorder_pack4 reorder_pack4to1)
+                list(REMOVE_ITEM NCNN_SHADER_SRCS ${_reshape_shader_dir}/reshape_batch_${_variant}.comp)
+            endforeach()
+        endif()
         foreach(NCNN_SHADER_SRC ${NCNN_SHADER_SRCS})
             ncnn_add_shader(${NCNN_SHADER_SRC})
         endforeach()

diff --git a/python/src/main.cpp b/python/src/main.cpp
@@ -6,6 +6,8 @@
 #include <pybind11/numpy.h>
 #include <pybind11/functional.h>
 
+#include <string.h>
+
 #include <cpu.h>
 #include <gpu.h>
 #include <net.h>
@@ -248,9 +250,17 @@ PYBIND11_MODULE(ncnn, m)
 
     .def(py::init<const Mat&>(), py::arg("m"))
 
-    .def(py::init([](py::buffer const b) {
+    .def(py::init([](py::buffer const b, int batch_index) {
         py::buffer_info info = b.request();
-        if (info.ndim > 4)
+#if !NCNN_BATCH
+        if (batch_index != 233)
+        {
+            std::stringstream ss;
+            ss << "ncnn batch support disabled";
+            pybind11::pybind11_fail(ss.str());
+        }
+#endif
+        if (batch_index == 233 && info.ndim > 4)
         {
             std::stringstream ss;
             ss << "convert numpy.ndarray to ncnn.Mat only dims <=4 support now, but given " << info.ndim;
@@ -259,6 +269,85 @@ PYBIND11_MODULE(ncnn, m)
 
         size_t elemsize = info.itemsize;
 
+        // Batched conversion (compiled out without NCNN_BATCH, where a batch_index was rejected above):
+        // the batch_index axis becomes the batch, the other axes feed create() last-axis-first.
+#if NCNN_BATCH
+        if (batch_index != 233)
+        {
+            // one batch axis plus 1..4 sample axes, hence 2..5 dims overall
+            if (info.ndim > 5)
+            {
+                std::stringstream ss;
+                ss << "convert numpy.ndarray to ncnn.Mat with batch only dims <=5 support now, but given " << info.ndim;
+                pybind11::pybind11_fail(ss.str());
+            }
+
+            if (info.ndim < 2)
+            {
+                std::stringstream ss;
+                ss << "convert numpy.ndarray to ncnn.Mat with batch only dims >=2 support now, but given " << info.ndim;
+                pybind11::pybind11_fail(ss.str());
+            }
+
+            // a negative batch_index counts back from the last axis
+            if (batch_index < 0)
+                batch_index += info.ndim;
+
+            if (batch_index < 0 || batch_index >= info.ndim)
+                pybind11::pybind11_fail("batch_index out of range");
+
+            const int batch = (int)info.shape[batch_index];
+
+            // per-sample shape: every axis except the batch axis, in original order
+            std::vector<int> shape;
+            for (int i = 0; i < info.ndim; i++)
+            {
+                if (i != batch_index)
+                    shape.push_back((int)info.shape[i]);
+            }
+
+            // held by unique_ptr from the start so the Mat is released if any of the
+            // python calls below throws
+            std::unique_ptr<Mat> v(new Mat);
+            if (shape.size() == 1)
+                v->create(shape[0], elemsize, 1, batch);
+            else if (shape.size() == 2)
+                v->create(shape[1], shape[0], elemsize, 1, batch);
+            else if (shape.size() == 3)
+                v->create(shape[2], shape[1], shape[0], elemsize, 1, batch);
+            else if (shape.size() == 4)
+                v->create(shape[3], shape[2], shape[1], shape[0], elemsize, 1, batch);
+
+            py::object src = py::reinterpret_borrow<py::object>(b);
+            for (int i = 0; i < v->n; i++)
+            {
+                // take(i, axis).copy() hands back sample i as its own array to read from
+                py::array slice = src.attr("take")(i, py::arg("axis") = batch_index).attr("copy")();
+                py::buffer_info slice_info = slice.request();
+
+                Mat mb = v->batch(i);
+                const unsigned char* sptr = (const unsigned char*)slice_info.ptr;
+
+                if (mb.dims <= 2)
+                {
+                    memcpy(mb.data, sptr, (size_t)mb.w * mb.h * elemsize);
+                }
+                else
+                {
+                    // copy one channel at a time to the address Mat::channel() returns
+                    size_t channel_size = (size_t)mb.w * mb.h * mb.d * elemsize;
+                    for (int q = 0; q < mb.c; q++)
+                    {
+                        Mat mbq = mb.channel(q);
+                        memcpy(mbq.data, sptr + channel_size * q, channel_size);
+                    }
+                }
+            }
+
+            return v;
+        }
+#endif // NCNN_BATCH
+
         Mat* v = nullptr;
         if (info.ndim == 1)
         {
@@ -288,16 +377,32 @@ PYBIND11_MODULE(ncnn, m)
         }
         return std::unique_ptr<Mat>(v);
     }),
-    py::arg("array"))
+    py::arg("array"), py::arg("batch_index") = 233)
     .def_buffer([](Mat& m) -> py::buffer_info {
         return to_buffer_info(m);
     })
     .def(
-    "numpy", [](py::object obj, const std::string& format = "") -> py::array {
+    "numpy", [](py::object obj, const std::string& format = "", int batch_index = 233) -> py::array {
         auto* m = obj.cast<Mat*>();
+        if (batch_index != 233)
+        {
+#if NCNN_BATCH
+            // wrap each batch entry as an array, then numpy.stack adds the batch axis at batch_index
+            py::object numpy = py::module_::import("numpy");
+            py::list batch_slices;
+            for (int i = 0; i < m->n; i++)
+            {
+                Mat mb = m->batch(i);
+                batch_slices.append(py::array(to_buffer_info(mb, format), obj));
+            }
+            return numpy.attr("stack")(batch_slices, py::arg("axis") = batch_index).cast<py::array>();
+#else
+            pybind11::pybind11_fail("ncnn batch support disabled");
+#endif
+        }
         return py::array(to_buffer_info(*m, format), obj);
     },
-    py::arg("format") = "", "i for int32, f for float32, d for double")
+    py::arg("format") = "", py::arg("batch_index") = 233, "i for int32, f for float32, d for double")
     //.def("fill", (void (Mat::*)(int))(&Mat::fill), py::arg("v"))
     .def("fill", (void (Mat::*)(float))(&Mat::fill), py::arg("v"))
     .def("clone", &Mat::clone, py::arg("allocator") = nullptr)
@@ -328,36 +433,38 @@ PYBIND11_MODULE(ncnn, m)
     .def("reshape", (Mat(Mat::*)(int, int, int, int, Allocator*) const) & Mat::reshape, py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::kw_only(), py::arg("allocator") = nullptr)
 
     .def(
-    "create", [](Mat& mat, py::tuple shape, size_t elemsize, int elempack, Allocator* allocator) {
+    "create", [](Mat& mat, py::tuple shape, size_t elemsize, int elempack, int batch, Allocator* allocator) {
         switch (shape.size())
         {
         case 1:
-            return mat.create(shape[0].cast<int>(), elemsize, elempack, allocator);
+            return mat.create(shape[0].cast<int>(), elemsize, elempack, batch, allocator);
         case 2:
-            return mat.create(shape[0].cast<int>(), shape[1].cast<int>(), elemsize, elempack, allocator);
+            return mat.create(shape[0].cast<int>(), shape[1].cast<int>(), elemsize, elempack, batch, allocator);
         case 3:
-            return mat.create(shape[0].cast<int>(), shape[1].cast<int>(), shape[2].cast<int>(), elemsize, elempack, allocator);
+            return mat.create(shape[0].cast<int>(), shape[1].cast<int>(), shape[2].cast<int>(), elemsize, elempack, batch, allocator);
         case 4:
-            return mat.create(shape[0].cast<int>(), shape[1].cast<int>(), shape[2].cast<int>(), shape[3].cast<int>(), elemsize, elempack, allocator);
+            return mat.create(shape[0].cast<int>(), shape[1].cast<int>(), shape[2].cast<int>(), shape[3].cast<int>(), elemsize, elempack, batch, allocator);
         default:
             std::stringstream ss;
             ss << "shape must be 1, 2, 3 or 4 dims, not " << shape.size();
             pybind11::pybind11_fail(ss.str());
         }
         return;
     },
-    py::arg("shape"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
-    .def("create", (void (Mat::*)(int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
-    .def("create", (void (Mat::*)(int, int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
-    .def("create", (void (Mat::*)(int, int, int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
-    .def("create", (void (Mat::*)(int, int, int, int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
+    py::arg("shape"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("batch") = 1, py::arg("allocator") = nullptr)
+    .def("create", (void (Mat::*)(int, size_t, int, int, Allocator*)) & Mat::create, py::arg("w"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("batch") = 1, py::arg("allocator") = nullptr)
+    .def("create", (void (Mat::*)(int, int, size_t, int, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("batch") = 1, py::arg("allocator") = nullptr)
+    .def("create", (void (Mat::*)(int, int, int, size_t, int, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("batch") = 1, py::arg("allocator") = nullptr)
+    .def("create", (void (Mat::*)(int, int, int, int, size_t, int, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("batch") = 1, py::arg("allocator") = nullptr)
     .def("create_like", (void (Mat::*)(const Mat&, Allocator*)) & Mat::create_like, py::arg("m"), py::arg("allocator") = nullptr)
+    .def("create_like", (void (Mat::*)(const Mat&, int, Allocator*)) & Mat::create_like, py::arg("m"), py::arg("batch"), py::arg("allocator") = nullptr)
     .def("addref", &Mat::addref)
     .def("release", &Mat::release)
     .def("empty", &Mat::empty)
     .def("total", &Mat::total)
     .def("elembits", &Mat::elembits)
     .def("shape", &Mat::shape)
+    .def("batch", (Mat(Mat::*)(int)) & Mat::batch, py::arg("b"))
     .def("channel", (Mat(Mat::*)(int)) & Mat::channel, py::arg("c"))
     //.def("channel", (const Mat (Mat::*)(int) const) & Mat::channel, py::arg("c"))
     .def("depth", (Mat(Mat::*)(int)) & Mat::depth, py::arg("z"))
@@ -471,6 +578,13 @@ PYBIND11_MODULE(ncnn, m)
     .def_readwrite("d", &Mat::d)
     .def_readwrite("c", &Mat::c)
     .def_readwrite("cstep", &Mat::cstep)
+#if NCNN_BATCH
+    .def_readwrite("n", &Mat::n)
+#else
+    .def_property_readonly("n", [](const Mat&) {
+        return 1;
+    })
+#endif
     .def("__repr__", [](const Mat& m) {
         std::stringstream ss;
         ss << "<ncnn.Mat w=" << m.w << " h=" << m.h << " d=" << m.d << " c=" << m.c << " dims=" << m.dims

diff --git a/python/tests/test_mat.py b/python/tests/test_mat.py
@@ -283,6 +283,21 @@ def test_numpy():
     array2[0] = 100
     assert array[0] == 100
 
+def test_numpy_batch_index():
+    array = np.arange(3 * 5 * 2 * 7, dtype=np.float32).reshape(3, 5, 2, 7)
+    mat = ncnn.Mat(array, batch_index=2)
+    array2 = mat.numpy(batch_index=2)
+    assert np.array_equal(array, array2)
+    # array_equal compares shape as well as values; a clone must round-trip too
+    mat2 = mat.clone()
+    array3 = mat2.numpy(batch_index=2)
+    assert np.array_equal(array, array3)
+    # 5-d input with a negative batch_index on both conversions
+    array = np.arange(2 * 3 * 4 * 5 * 6, dtype=np.float32).reshape(2, 3, 4, 5, 6)
+    mat = ncnn.Mat(array, batch_index=-2)
+    array2 = mat.numpy(batch_index=-2)
+    assert np.array_equal(array, array2)
+
 def test_fill():
     mat = ncnn.Mat(1)
     mat.fill(1.0)

diff --git a/src/c_api.cpp b/src/c_api.cpp
@@ -507,33 +507,35 @@ ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t alloc
     return (ncnn_mat_t)(new Mat(w, h, d, c, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL));
 }
 
+#if NCNN_BATCH
 ncnn_mat_t ncnn_mat_create_1d_batch(int w, int batch, ncnn_allocator_t allocator)
 {
     Mat* m = new Mat();
-    m->create_batch(w, batch, (size_t)4u, 1, allocator ? (Allocator*)allocator->pthis : NULL);
+    m->create(w, (size_t)4u, 1, batch, allocator ? (Allocator*)allocator->pthis : NULL);
     return (ncnn_mat_t)m;
 }
 
 ncnn_mat_t ncnn_mat_create_2d_batch(int w, int h, int batch, ncnn_allocator_t allocator)
 {
     Mat* m = new Mat();
-    m->create_batch(w, h, batch, (size_t)4u, 1, allocator ? (Allocator*)allocator->pthis : NULL);
+    m->create(w, h, (size_t)4u, 1, batch, allocator ? (Allocator*)allocator->pthis : NULL);
     return (ncnn_mat_t)m;
 }
 
 ncnn_mat_t ncnn_mat_create_3d_batch(int w, int h, int c, int batch, ncnn_allocator_t allocator)
 {
     Mat* m = new Mat();
-    m->create_batch(w, h, c, batch, (size_t)4u, 1, allocator ? (Allocator*)allocator->pthis : NULL);
+    m->create(w, h, c, (size_t)4u, 1, batch, allocator ? (Allocator*)allocator->pthis : NULL);
     return (ncnn_mat_t)m;
 }
 
 ncnn_mat_t ncnn_mat_create_4d_batch(int w, int h, int d, int c, int batch, ncnn_allocator_t allocator)
 {
     Mat* m = new Mat();
-    m->create_batch(w, h, d, c, batch, (size_t)4u, 1, allocator ? (Allocator*)allocator->pthis : NULL);
+    m->create(w, h, d, c, (size_t)4u, 1, batch, allocator ? (Allocator*)allocator->pthis : NULL);
     return (ncnn_mat_t)m;
 }
+#endif // NCNN_BATCH
 
 ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator)
 {
@@ -575,33 +577,35 @@ ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize,
     return (ncnn_mat_t)(new Mat(w, h, d, c, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL));
 }
 
-ncnn_mat_t ncnn_mat_create_1d_batch_elem(int w, int batch, size_t elemsize, int elempack, ncnn_allocator_t allocator)
+#if NCNN_BATCH
+ncnn_mat_t ncnn_mat_create_1d_elem_batch(int w, size_t elemsize, int elempack, int batch, ncnn_allocator_t allocator)
 {
     Mat* m = new Mat();
-    m->create_batch(w, batch, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL);
+    m->create(w, elemsize, elempack, batch, allocator ? (Allocator*)allocator->pthis : NULL);
     return (ncnn_mat_t)m;
 }
 
-ncnn_mat_t ncnn_mat_create_2d_batch_elem(int w, int h, int batch, size_t elemsize, int elempack, ncnn_allocator_t allocator)
+ncnn_mat_t ncnn_mat_create_2d_elem_batch(int w, int h, size_t elemsize, int elempack, int batch, ncnn_allocator_t allocator)
 {
     Mat* m = new Mat();
-    m->create_batch(w, h, batch, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL);
+    m->create(w, h, elemsize, elempack, batch, allocator ? (Allocator*)allocator->pthis : NULL);
     return (ncnn_mat_t)m;
 }
 
-ncnn_mat_t ncnn_mat_create_3d_batch_elem(int w, int h, int c, int batch, size_t elemsize, int elempack, ncnn_allocator_t allocator)
+ncnn_mat_t ncnn_mat_create_3d_elem_batch(int w, int h, int c, size_t elemsize, int elempack, int batch, ncnn_allocator_t allocator)
 {
     Mat* m = new Mat();
-    m->create_batch(w, h, c, batch, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL);
+    m->create(w, h, c, elemsize, elempack, batch, allocator ? (Allocator*)allocator->pthis : NULL);
     return (ncnn_mat_t)m;
 }
 
-ncnn_mat_t ncnn_mat_create_4d_batch_elem(int w, int h, int d, int c, int batch, size_t elemsize, int elempack, ncnn_allocator_t allocator)
+ncnn_mat_t ncnn_mat_create_4d_elem_batch(int w, int h, int d, int c, size_t elemsize, int elempack, int batch, ncnn_allocator_t allocator)
 {
     Mat* m = new Mat();
-    m->create_batch(w, h, d, c, batch, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL);
+    m->create(w, h, d, c, elemsize, elempack, batch, allocator ? (Allocator*)allocator->pthis : NULL);
     return (ncnn_mat_t)m;
 }
+#endif // NCNN_BATCH
 
 ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator)
 {
@@ -703,20 +707,24 @@ size_t ncnn_mat_get_cstep(const ncnn_mat_t mat)
     return ((const Mat*)mat)->cstep;
 }
 
+#if NCNN_BATCH
 size_t ncnn_mat_get_nstep(const ncnn_mat_t mat)
 {
     return ((const Mat*)mat)->nstep;
 }
+#endif // NCNN_BATCH
 
 void* ncnn_mat_get_data(const ncnn_mat_t mat)
 {
     return ((const Mat*)mat)->data;
 }
 
-ncnn_mat_t ncnn_mat_get_batch(const ncnn_mat_t mat, int b)
+#if NCNN_BATCH
+void* ncnn_mat_get_batch_data(const ncnn_mat_t mat, int b)
 {
-    return (ncnn_mat_t)(new Mat(((const Mat*)mat)->batch(b)));
+    return ((const Mat*)mat)->batch(b).data;
 }
+#endif // NCNN_BATCH
 
 void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c)
 {