1- From 617486784d5394fbb54f4d99a4860a050318a4e8 Mon Sep 17 00:00:00 2001
2- From: Gian Marco Iodice <gianmarco.iodice@arm.com>
3- Date: Tue, 16 Jul 2024 17:28:50 +0100
1+ From 25ba8dfa43e2b4b101b890c88464b638427d3d42 Mon Sep 17 00:00:00 2001
2+ From: Charles Xu <charles.xu@arm.com>
3+ Date: Wed, 17 Jul 2024 13:28:18 +0200
44Subject: [PATCH] Use KleidiAI Int4 Matmul micro-kernels in llama.cpp
55
66- Update CMake file to fetch the Int4 micro-kernels from the KleidiAI
77repository
88- Implement a KleidiAI backend for llama.cpp
9+ - Add weight caching feature for KleidiAI
910
10- Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
11+ Signed-off-by: Charles Xu <charles.xu@arm.com>
1112---
12- CMakeLists.txt | 48 ++++
13- ggml-alloc.c | 13 ++
14- ggml-kleidiai.cpp | 560 ++++++++++++++++++++++++++++++++++++++++++++++
13+ CMakeLists.txt | 52 ++++
14+ ggml-alloc.c | 13 +
15+ ggml-kleidiai.cpp | 675 ++++++++++++++++++++++++++++++++++++++++++++++
1516 ggml-kleidiai.h | 45 ++++
16- ggml.c | 27 +++
17+ ggml.c | 27 ++
1718 llama.cpp | 19 +-
18- 6 files changed, 711 insertions(+), 1 deletion(-)
19+ 6 files changed, 830 insertions(+), 1 deletion(-)
1920 create mode 100644 ggml-kleidiai.cpp
2021 create mode 100644 ggml-kleidiai.h
2122
2223diff --git a/CMakeLists.txt b/CMakeLists.txt
23- index 08481334..22504ad2 100644
24+ index 08481334..07f8f601 100644
2425--- a/CMakeLists.txt
2526+++ b/CMakeLists.txt
26- @@ -548,6 +548,53 @@ if (LLAMA_VULKAN)
27+ @@ -548,6 +548,57 @@ if (LLAMA_VULKAN)
2728 endif()
2829 endif()
2930
@@ -72,12 +73,16 @@ index 08481334..22504ad2 100644
7273+ add_compile_definitions(GGML_USE_KLEIDIAI)
7374+ add_compile_definitions(GGML_KLEIDIAI_REUSE_MEMORY)
7475+
76+ + if (LLAMA_KLEIDIAI_CACHE)
77+ + add_compile_definitions(GGML_KLEIDIAI_USE_CACHE)
78+ + endif()
79+ +
7580+ endif()
7681+
7782 if (LLAMA_HIPBLAS)
7883 if (NOT EXISTS $ENV{ROCM_PATH})
7984 if (NOT EXISTS /opt/rocm)
80- @@ -1268,6 +1315,7 @@ add_library(ggml OBJECT
85+ @@ -1268,6 +1319,7 @@ add_library(ggml OBJECT
8186 ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
8287 ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
8388 ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
@@ -118,10 +123,10 @@ index bd367c42..ed4ce0ae 100644
118123 if (this_size > max_size) {
119124diff --git a/ggml-kleidiai.cpp b/ggml-kleidiai.cpp
120125new file mode 100644
121- index 00000000..6800f63e
126+ index 00000000..257a0d4c
122127--- /dev/null
123128+++ b/ggml-kleidiai.cpp
124- @@ -0,0 +1,560 @@
129+ @@ -0,0 +1,675 @@
125130+ /*
126131+ * Copyright (c) 2024 Arm Limited.
127132+ *
@@ -160,6 +165,13 @@ index 00000000..6800f63e
160165+ #include <string.h>
161166+ #include <asm/hwcap.h>
162167+ #include <sys/auxv.h>
168+ + #if defined(GGML_KLEIDIAI_USE_CACHE)
169+ + #include <cstring>
170+ + #include <sys/mman.h>
171+ + #include <sys/stat.h>
172+ + #include <fcntl.h>
173+ + #include <unistd.h>
174+ + #endif
163175+
164176+ // KleidiAI micro-kernels
165177+ #include "kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h"
@@ -213,6 +225,85 @@ index 00000000..6800f63e
213225+ unsigned long int getauxval(unsigned long int __type) __INTRODUCED_IN(18);
214226+ #endif
215227+
228+ + #if defined(GGML_KLEIDIAI_USE_CACHE)
229+ + struct binary_data { // A memory region described by its start address and its length.
230+ + void *ptr; // start of the region
231+ + size_t size; // length of the region in bytes
232+ + };
233+ +
234+ + struct cached_weight { // State of the on-disk weight cache file.
235+ + int fd; // descriptor returned by open() for the cache file
236+ + binary_data data; // mmap'ed file contents; size stays 0 when nothing is mapped
237+ + };
238+ +
239+ + static const char *g_cache_filename = "kai_transformed_weights.cache"; // relative path: resolved against the current working directory
240+ + static const size_t g_cache_key_size = 16; // number of key bytes stored in front of every cache record
241+ +
242+ + static struct cached_weight g_kai_cached_weight = { -1, { NULL, 0 } }; // fd starts at -1: a zero-initialised fd would be 0 (stdin), and a close() issued before the cache is opened would then close stdin
243+ +
244+ + static void ggml_kai_open_cached_weight() { // Open the weight cache file: create it if it does not exist, otherwise map its contents read-only.
245+ + if (access(g_cache_filename, F_OK) != 0) { // no cache file yet: create an empty, writable one
246+ + g_kai_cached_weight.fd = open(g_cache_filename, O_RDWR | O_CREAT, 0644);
247+ + if (g_kai_cached_weight.fd == -1) {
248+ + GGML_ASSERT(false);
249+ + }
250+ + g_kai_cached_weight.data.size = 0; // nothing mapped; other code tests size > 0 to detect a loaded cache
251+ + }
252+ + else {
253+ + struct stat file_info;
254+ + g_kai_cached_weight.fd = open(g_cache_filename, O_RDONLY); // result is not checked here; a failed open shows up as the fstat() failure below
255+ + if (fstat(g_kai_cached_weight.fd, &file_info) == -1) {
256+ + GGML_ASSERT(false);
257+ + }
258+ +
259+ + g_kai_cached_weight.data.size = file_info.st_size;
260+ +
261+ + if (g_kai_cached_weight.data.size > 0) { // an empty file is left unmapped. NOTE(review): fd is O_RDONLY in this branch, so a later write() on it would fail - confirm
262+ + g_kai_cached_weight.data.ptr = mmap(NULL, g_kai_cached_weight.data.size, PROT_READ, MAP_PRIVATE, g_kai_cached_weight.fd, 0);
263+ + if (g_kai_cached_weight.data.ptr == MAP_FAILED) {
264+ + GGML_ASSERT(false);
265+ + }
266+ + }
267+ +
268+ + }
269+ + }
270+ +
271+ + static void ggml_kai_write_cache_weight(int fd, void *key, size_t key_size, void *data, size_t data_size) { // Write one cache record to fd: the key bytes, then data_size as a raw size_t, then the payload.
272+ + if (write(fd, key, key_size) != static_cast<ssize_t>(key_size)) { // a short write is treated as a failure too
273+ + GGML_ASSERT(false);
274+ + }
275+ +
276+ + if (write(fd, &data_size, sizeof(size_t)) != sizeof(size_t)) { // length field, in the writer's native size_t layout
277+ + GGML_ASSERT(false);
278+ + }
279+ +
280+ + if (write(fd, data, data_size) != static_cast<ssize_t>(data_size)) {
281+ + GGML_ASSERT(false);
282+ + }
283+ + }
284+ +
285+ + static bool ggml_kai_match_cached_weight(void *token, struct binary_data *data) { // Scan the mapped cache for the record whose key equals the g_cache_key_size bytes at token; on a match *data describes its payload and true is returned.
286+ + char* data_ptr = static_cast<char*>(g_kai_cached_weight.data.ptr);
287+ + char* end_ptr = data_ptr + g_kai_cached_weight.data.size;
288+ +
289+ + while (static_cast<std::size_t>(end_ptr - data_ptr) >= g_cache_key_size + sizeof(std::size_t)) { // stop when no complete record header (key + length) is left
290+ + void *key = data_ptr;
291+ + data_ptr += g_cache_key_size;
292+ +
293+ + memcpy(&data->size, data_ptr, sizeof(std::size_t)); // copy instead of a cast: the length field may be unaligned inside the mapping
294+ + data_ptr += sizeof(std::size_t);
295+ +
296+ + data->ptr = data_ptr;
297+ + if (data->size > static_cast<std::size_t>(end_ptr - data_ptr)) { break; } // payload would run past the mapping: the cache is truncated or corrupt
298+ + data_ptr += data->size;
299+ + if (memcmp(token, key, g_cache_key_size) == 0) { // compare exactly as many key bytes as the writer stored
300+ + return true;
301+ + }
302+ + }
303+ + return false;
304+ + }
305+ + #endif
306+ +
216307+ inline bool is_feature_supported(uint64_t features, uint64_t feature_mask) { // True when features has at least one of the bits in feature_mask set.
217308+ return (features & feature_mask);
218309+ }
@@ -240,6 +331,10 @@ index 00000000..6800f63e
240331+ ggml_kai_free_extra_mem();
241332+ initialized = true;
242333+ g_kai_loaded = true;
334+ +
335+ + #if defined(GGML_KLEIDIAI_USE_CACHE)
336+ + ggml_kai_open_cached_weight();
337+ + #endif
243338+ }
244339+ }
245340+
@@ -523,6 +618,20 @@ index 00000000..6800f63e
523618+ if (cur->extra == NULL) {
524619+ if(cur->type == GGML_TYPE_Q4_0) {
525620+
621+ + #if defined(GGML_KLEIDIAI_USE_CACHE)
622+ + if (g_kai_cached_weight.data.size > 0) { // a non-empty cache file was mapped: look this weight up there
623+ + struct binary_data data;
624+ + bool matched = ggml_kai_match_cached_weight(cur->data, &data);
625+ + if (matched) {
626+ + cur->extra = data.ptr;
627+ + }
628+ + else {
629+ + fprintf(stderr, "No match found, please remove the cache file and try again!\n"); // not perror(): errno is unrelated to a cache miss and would append a misleading system error text
630+ + GGML_ASSERT(false);
631+ + }
632+ + return;
633+ + }
634+ + #endif
526635+ const size_t original_data_size = ggml_nbytes(cur);
527636+ const size_t reshaped_data_sz = rhs_packing_params.packed_size;
528637+
@@ -545,6 +654,10 @@ index 00000000..6800f63e
545654+ 0,
546655+ &params);
547656+
657+ + #if defined(GGML_KLEIDIAI_USE_CACHE)
658+ + ggml_kai_write_cache_weight(g_kai_cached_weight.fd, cur->data, g_cache_key_size, reshaped_data, reshaped_data_sz);
659+ + #endif
660+ +
548661+ #if defined(GGML_KLEIDIAI_REUSE_MEMORY)
549662+ GGML_ASSERT(reshaped_data_sz <= original_data_size);
550663+ memcpy(cur->data, (void *)reshaped_data, ggml_nbytes(cur));
@@ -570,9 +683,9 @@ index 00000000..6800f63e
570683+ // tensor->src[1] = second source tensor
571684+
572685+ ggml_kai_func_t func;
573- + const bool is_cpu_only = tensor->backend == GGML_BACKEND_TYPE_CPU
574- + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU ))
575- + || (tensor->src[1] != nullptr && tensor->src[1 ]->backend == GGML_BACKEND_TYPE_CPU );
686+ + const bool is_cpu_only = ggml_backend_buffer_is_host(tensor->buffer)
687+ + || (tensor->src[0] != nullptr && ggml_backend_buffer_is_host(tensor->src[0]->buffer))
688+ + || (tensor->src[1] != nullptr && ggml_backend_buffer_is_host(tensor->src[1]->buffer)); // query src[1]'s own buffer, the pointer that was just null-checked
576689+
577690+ if (!is_cpu_only) {
578691+ return false;
@@ -604,9 +717,9 @@ index 00000000..6800f63e
604717+ // tensor->src[0] = first source tensor
605718+ // tensor->src[1] = second source tensor
606719+
607- + const bool is_cpu_only = tensor->backend == GGML_BACKEND_TYPE_CPU
608- + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU ))
609- + || (tensor->src[1] != nullptr && tensor->src[1 ]->backend == GGML_BACKEND_TYPE_CPU );
720+ + const bool is_cpu_only = ggml_backend_buffer_is_host(tensor->buffer)
721+ + || (tensor->src[0] != nullptr && ggml_backend_buffer_is_host(tensor->src[0]->buffer))
722+ + || (tensor->src[1] != nullptr && ggml_backend_buffer_is_host(tensor->src[1]->buffer)); // query src[1]'s own buffer, the pointer that was just null-checked
610723+
611724+ if (!is_cpu_only) {
612725+ return false;
@@ -680,6 +793,13 @@ index 00000000..6800f63e
680793+ free(g_extra_mem[i]);
681794+ }
682795+ g_extra_mem_idx = 0;
796+ +
797+ + #if defined(GGML_KLEIDIAI_USE_CACHE)
798+ + if (g_kai_cached_weight.data.size > 0) {
799+ + munmap(g_kai_cached_weight.data.ptr, g_kai_cached_weight.data.size);
800+ + }
801+ + close(g_kai_cached_weight.fd);
802+ + #endif
683803+ }
684804+ #endif // defined(__aarch64__) && (defined(__ANDROID__) || defined(__linux__))
685805diff --git a/ggml-kleidiai.h b/ggml-kleidiai.h
@@ -845,5 +965,5 @@ index 05591aa4..735dde04 100644
845965 }
846966
847967- -
848- 2.25.1
968+ 2.34.1
849969
0 commit comments