Skip to content

Commit eaf7a15

Browse files
committed
feat: support allocating memory outside for RKNPU2, breaking 4GB limit.
1 parent 32ccc10 commit eaf7a15

File tree

2 files changed

+125
-5
lines changed

2 files changed

+125
-5
lines changed

ggml-rknpu2.c

+122-3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@
99
#include "rknn_api.h"
1010
#include "rknn_matmul_api.h"
1111

12+
#include <sys/ioctl.h>
13+
#include <sys/mman.h>
14+
#include <fcntl.h>
15+
#include <errno.h>
16+
#include <unistd.h>
17+
1218
#include <arm_neon.h>
1319

1420

@@ -114,11 +120,107 @@ struct ggml_rknpu2_matmul_kernel
114120
rknn_tensor_mem* C;
115121
};
116122

123+
#define GGML_RKNPU2_USE_OUTSIDE_ALLOC 1
124+
125+
#if GGML_RKNPU2_USE_OUTSIDE_ALLOC
126+
struct dma_heap_allocation_data {
127+
uint64_t len;
128+
uint32_t fd;
129+
uint32_t fd_flags;
130+
uint64_t heap_flags;
131+
};
132+
133+
#define DMA_HEAP_IOC_MAGIC 'H'
134+
#define DMA_HEAP_IOCTL_ALLOC _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
135+
struct dma_heap_allocation_data)
136+
137+
#define DMA_BUF_SYNC_READ (1 << 0)
138+
#define DMA_BUF_SYNC_WRITE (2 << 0)
139+
#define DMA_BUF_SYNC_RW (DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE)
140+
#define DMA_BUF_SYNC_START (0 << 2)
141+
#define DMA_BUF_SYNC_END (1 << 2)
142+
#define DMA_BUF_BASE 'b'
143+
#define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, uint64_t)
144+
#define CMA_HEAP_SIZE (1024 * 1024)
145+
146+
//Helper function to manually allocate buffer from dma_heap for RKNPU2
147+
//The internal RKNPU2 API will allocate buffer from DMA32 heap, which is only 4GiB, not enough for large models.
148+
//WARNING: Memory leak will not be released on exit!! But it will be released on next run...?
149+
int dma_alloc(size_t size, int *fd, void **va) {
150+
int ret;
151+
int prot;
152+
void *mmap_va;
153+
int dma_heap_fd = -1;
154+
struct dma_heap_allocation_data buf_data;
155+
const char* path = "/dev/dma_heap/system";
156+
157+
/* open dma_heap fd */
158+
dma_heap_fd = open(path, O_RDWR);
159+
if (dma_heap_fd < 0) {
160+
printf("open %s fail!\n", path);
161+
return dma_heap_fd;
162+
}
163+
164+
/* alloc buffer */
165+
memset(&buf_data, 0x0, sizeof(struct dma_heap_allocation_data));
166+
167+
buf_data.len = size;
168+
buf_data.fd_flags = O_CLOEXEC | O_RDWR;
169+
ret = ioctl(dma_heap_fd, DMA_HEAP_IOCTL_ALLOC, &buf_data);
170+
if (ret < 0) {
171+
printf("RK_DMA_HEAP_ALLOC_BUFFER failed\n");
172+
return ret;
173+
}
174+
175+
/* mmap va */
176+
if (fcntl(buf_data.fd, F_GETFL) & O_RDWR)
177+
prot = PROT_READ | PROT_WRITE;
178+
else
179+
prot = PROT_READ;
180+
181+
/* mmap contiguors buffer to user */
182+
mmap_va = (void *)mmap(NULL, buf_data.len, prot, MAP_SHARED, buf_data.fd, 0);
183+
if (mmap_va == MAP_FAILED) {
184+
printf("mmap failed: %s\n", strerror(errno));
185+
return -errno;
186+
}
187+
188+
*va = mmap_va;
189+
*fd = buf_data.fd;
190+
191+
close(dma_heap_fd);
192+
193+
return 0;
194+
}
195+
196+
int dma_sync_device_to_cpu(int fd) {
197+
uint64_t flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_RW;
198+
return ioctl(fd, DMA_BUF_IOCTL_SYNC, &flags);
199+
}
200+
201+
int dma_sync_cpu_to_device(int fd) {
202+
uint64_t flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW;
203+
return ioctl(fd, DMA_BUF_IOCTL_SYNC, &flags);
204+
}
205+
void dma_buf_free(size_t size, int *fd, void *va) {
206+
int len;
207+
208+
len = size;
209+
munmap(va, len);
210+
211+
close(*fd);
212+
*fd = -1;
213+
}
214+
215+
#endif
216+
117217
// Pool of RKNPU2 matmul kernels so we can reuse them
118218
#define GGML_RKNPU2_MAX_MATMUL_KERNELS 16
119219
static struct ggml_rknpu2_matmul_kernel matmul_kernels[GGML_RKNPU2_MAX_MATMUL_KERNELS];
120220
static int matmul_kernels_count = 0;
121221

222+
static uint64_t rknpu2_allocated_bytes = 0;
223+
122224
static struct ggml_rknpu2_matmul_kernel *
123225
ggml_rknpu2_matmul_kernel_find(int m, int k, int n, rknn_tensor_type type) {
124226
for (int i = 0; i < matmul_kernels_count; i++) {
@@ -184,10 +286,27 @@ void ggml_rknpu2_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
184286
// First time called. Initialize RKNPU2 API structs
185287
if(pack->initialized == 0) {
186288
struct ggml_rknpu2_matmul_kernel* kernel = ggml_rknpu2_matmul_kernel_create(m, k, n, pack->type);
187-
188-
pack->B = rknn_create_mem(kernel->matmul_ctx, kernel->matmul_io_attr.B.size);
189-
memcpy(pack->B->virt_addr, pack->ordered_data, kernel->matmul_io_attr.B.size);
289+
// allocate B
290+
#if GGML_RKNPU2_USE_OUTSIDE_ALLOC
291+
int fd = -1;
292+
uint8_t *va = NULL;
293+
dma_alloc(kernel->matmul_io_attr.B.size, &fd, (void *)&va);
294+
dma_sync_device_to_cpu(fd);
295+
pack->B = rknn_create_mem_from_fd(kernel->matmul_ctx, fd, va,
296+
kernel->matmul_io_attr.B.size, 0);
297+
memcpy(pack->B->virt_addr, pack->ordered_data,
298+
kernel->matmul_io_attr.B.size);
299+
dma_sync_cpu_to_device(fd);
300+
#else
301+
pack->B =
302+
rknn_create_mem(kernel->matmul_ctx, kernel->matmul_io_attr.B.size);
303+
memcpy(pack->B->virt_addr, pack->ordered_data,
304+
kernel->matmul_io_attr.B.size);
305+
#endif
190306
free(pack->ordered_data);
307+
rknpu2_allocated_bytes += kernel->matmul_io_attr.B.size;
308+
printf("RKNPU2 allocated %f MiB\n",
309+
rknpu2_allocated_bytes / 1024.0F / 1024.0F);
191310
pack->ordered_data = NULL;
192311
pack->initialized = 1;
193312
}

ggml-rknpu2.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55
The experimental RKNPU2 backend for GGML.
66
77
LIMITATIONS:
8-
- Only supports running in FLOAT16 mode (uses Q8_0 GGML tensors)
8+
- Only supports Q8_0 GGML quantization
99
- Only MatMul is supported
1010
- RK3588 only
11-
- Not faster than just running on RK3588 CPU (4x A76)
11+
- Please only use 1 CPU thread for best efficiency. More threads will not run faster.
12+
- Not faster than running on a single A76 core, but offloads work and saves some CPU time
1213
*/
1314

1415
#include "ggml.h"

0 commit comments

Comments
 (0)