 #include "rknn_api.h"
 #include "rknn_matmul_api.h"
 
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+
 #include <arm_neon.h>
 
 
@@ -114,11 +120,107 @@ struct ggml_rknpu2_matmul_kernel
     rknn_tensor_mem* C;
 };
 
+#define GGML_RKNPU2_USE_OUTSIDE_ALLOC 1
+
+#if GGML_RKNPU2_USE_OUTSIDE_ALLOC
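+// The struct and ioctl definitions below mirror the kernel UAPI headers
+// <linux/dma-heap.h> and <linux/dma-buf.h>, inlined here so the build does
+// not depend on those headers being installed.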
+struct dma_heap_allocation_data {
+    uint64_t len;
+    uint32_t fd;
+    uint32_t fd_flags;
+    uint64_t heap_flags;
+};
+
+#define DMA_HEAP_IOC_MAGIC 'H'
+#define DMA_HEAP_IOCTL_ALLOC _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
+                                   struct dma_heap_allocation_data)
+
+#define DMA_BUF_SYNC_READ (1 << 0)
+#define DMA_BUF_SYNC_WRITE (2 << 0)
+#define DMA_BUF_SYNC_RW (DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE)
+#define DMA_BUF_SYNC_START (0 << 2)
+#define DMA_BUF_SYNC_END (1 << 2)
+#define DMA_BUF_BASE 'b'
+#define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, uint64_t)
+#define CMA_HEAP_SIZE (1024 * 1024)
+
+// Helper to manually allocate a buffer from a dma_heap for RKNPU2.
+// The internal RKNPU2 API allocates from the DMA32 heap, which is capped at
+// 4 GiB and therefore too small for large models.
+// WARNING: buffers allocated here are never explicitly freed; the kernel
+// only reclaims them when the process exits.
+int dma_alloc(size_t size, int *fd, void **va) {
+    int ret;
+    int prot;
+    void *mmap_va;
+    int dma_heap_fd = -1;
+    struct dma_heap_allocation_data buf_data;
+    const char* path = "/dev/dma_heap/system";
+
+    /* open dma_heap fd */
+    dma_heap_fd = open(path, O_RDWR);
+    if (dma_heap_fd < 0) {
+        printf("open %s failed!\n", path);
+        return dma_heap_fd;
+    }
+
+    /* alloc buffer */
+    memset(&buf_data, 0x0, sizeof(struct dma_heap_allocation_data));
+
+    buf_data.len = size;
+    buf_data.fd_flags = O_CLOEXEC | O_RDWR;
+    ret = ioctl(dma_heap_fd, DMA_HEAP_IOCTL_ALLOC, &buf_data);
+    if (ret < 0) {
+        printf("DMA_HEAP_IOCTL_ALLOC failed\n");
+        close(dma_heap_fd);
+        return ret;
+    }
+
+    /* pick mmap protection flags matching the fd's access mode */
+    if (fcntl(buf_data.fd, F_GETFL) & O_RDWR)
+        prot = PROT_READ | PROT_WRITE;
+    else
+        prot = PROT_READ;
+
+    /* mmap the contiguous buffer into user space */
+    mmap_va = mmap(NULL, buf_data.len, prot, MAP_SHARED, buf_data.fd, 0);
+    if (mmap_va == MAP_FAILED) {
+        ret = -errno;
+        printf("mmap failed: %s\n", strerror(errno));
+        close(buf_data.fd);
+        close(dma_heap_fd);
+        return ret;
+    }
+
+    *va = mmap_va;
+    *fd = buf_data.fd;
+
+    close(dma_heap_fd);
+
+    return 0;
+}
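+// Usage sketch (illustrative only; the 4096-byte size is arbitrary):
+//
+//   int fd = -1; void *buf = NULL;
+//   if (dma_alloc(4096, &fd, &buf) == 0) {
+//       dma_sync_device_to_cpu(fd);  /* begin CPU access */
+//       memset(buf, 0, 4096);        /* CPU fills the buffer */
+//       dma_sync_cpu_to_device(fd);  /* end CPU access */
+//       dma_buf_free(4096, &fd, buf);
+//   }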
+
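+// DMA_BUF_IOCTL_SYNC brackets CPU access to the buffer: SYNC_START before
+// the CPU touches it, SYNC_END when it is done, keeping CPU caches coherent
+// with the device.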
+int dma_sync_device_to_cpu(int fd) {
+    uint64_t flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_RW;
+    return ioctl(fd, DMA_BUF_IOCTL_SYNC, &flags);
+}
+
+int dma_sync_cpu_to_device(int fd) {
+    uint64_t flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW;
+    return ioctl(fd, DMA_BUF_IOCTL_SYNC, &flags);
+}
+
+void dma_buf_free(size_t size, int *fd, void *va) {
+    munmap(va, size);
+
+    close(*fd);
+    *fd = -1;
+}
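+// Note: the kernel keeps the dma-buf alive until every mapping and fd
+// referring to it is gone, so both the munmap() and the close() above matter.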
+
+#endif
+
 // Pool of RKNPU2 matmul kernels so we can reuse them
 #define GGML_RKNPU2_MAX_MATMUL_KERNELS 16
 static struct ggml_rknpu2_matmul_kernel matmul_kernels[GGML_RKNPU2_MAX_MATMUL_KERNELS];
 static int matmul_kernels_count = 0;
 
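+// Running total of NPU buffer bytes allocated, reported by the log line in
+// ggml_rknpu2_mul_mat().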
+static uint64_t rknpu2_allocated_bytes = 0;
+
 static struct ggml_rknpu2_matmul_kernel *
 ggml_rknpu2_matmul_kernel_find(int m, int k, int n, rknn_tensor_type type) {
     for (int i = 0; i < matmul_kernels_count; i++) {
@@ -184,10 +286,27 @@ void ggml_rknpu2_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     // First time called. Initialize RKNPU2 API structs
     if(pack->initialized == 0) {
         struct ggml_rknpu2_matmul_kernel* kernel = ggml_rknpu2_matmul_kernel_create(m, k, n, pack->type);
-
-        pack->B = rknn_create_mem(kernel->matmul_ctx, kernel->matmul_io_attr.B.size);
-        memcpy(pack->B->virt_addr, pack->ordered_data, kernel->matmul_io_attr.B.size);
+        // allocate B; going through the system dma_heap avoids the 4 GiB
+        // DMA32 heap the internal RKNPU2 allocator would otherwise use
+#if GGML_RKNPU2_USE_OUTSIDE_ALLOC
+        int fd = -1;
+        uint8_t *va = NULL;
+        int ret = dma_alloc(kernel->matmul_io_attr.B.size, &fd, (void **)&va);
+        GGML_ASSERT(ret == 0);
+        dma_sync_device_to_cpu(fd);
+        pack->B = rknn_create_mem_from_fd(kernel->matmul_ctx, fd, va,
+                                          kernel->matmul_io_attr.B.size, 0);
+        memcpy(pack->B->virt_addr, pack->ordered_data,
+               kernel->matmul_io_attr.B.size);
+        dma_sync_cpu_to_device(fd);
+#else
+        pack->B =
+            rknn_create_mem(kernel->matmul_ctx, kernel->matmul_io_attr.B.size);
+        memcpy(pack->B->virt_addr, pack->ordered_data,
+               kernel->matmul_io_attr.B.size);
+#endif
         free(pack->ordered_data);
+        rknpu2_allocated_bytes += kernel->matmul_io_attr.B.size;
+        printf("RKNPU2 allocated %.2f MiB\n",
+               rknpu2_allocated_bytes / 1024.0 / 1024.0);
         pack->ordered_data = NULL;
         pack->initialized = 1;
     }