@@ -6,6 +6,13 @@
 #include "rwkv_v3.h"
 #include "ggml.h"

+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+#if defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
+#endif
+
 #include <string>
 #include <vector>
 #include <cstring>
@@ -1058,7 +1065,11 @@ struct rwkv_future_tensor rwkv_future_graph_work(struct rwkv_future_ctx & ctx,
     const size_t n_threads,
     const size_t sequence_len = 1
 ) {
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+    enum ggml_type mul_mat_type = type == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16;
+#else
     enum ggml_type mul_mat_type = ggml_is_quantized(type) ? GGML_TYPE_Q8_1 : type;
+#endif
     return ctx.alloc(GGML_TYPE_I8, rwkv_future_tensor::size(mul_mat_type, ffn_key_height, sequence_len) * n_threads + 64 * (n_threads - 1));
 }

@@ -1545,7 +1556,38 @@ struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32
 }

 bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) {
+#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_CUBLAS)
+    printf("\nOffloading %u (or fewer) layers...", n_layers);
+    const auto offload = [&](struct ggml_tensor * tensor) {
+        // TODO support multi-GPU
+        tensor->backend = GGML_BACKEND_GPU;
+    #if defined(GGML_USE_CLBLAST)
+        ggml_cl_transform_tensor(tensor->data, tensor);
+    #else
+        ggml_cuda_transform_tensor(tensor->data, tensor);
+    #endif
+    };
+
+    const size_t n_gpu = std::min(n_layers, ctx->instance->model.header.n_layer);
+
+    if (ctx->gpu_layers < n_gpu) {
+        for (size_t & i = ctx->gpu_layers; i < n_gpu; i++) {
+            const struct rwkv_layer & layer = ctx->instance->model.layers[i];
+
+            // TODO also offload other operations to GPU with ggml_cuda_assign_buffers
+            offload(layer.att_key);
+            offload(layer.att_value);
+            offload(layer.att_receptance);
+            offload(layer.att_output);
+
+            offload(layer.ffn_key);
+            offload(layer.ffn_value);
+            offload(layer.ffn_receptance);
+        }

+        return true;
+    }
+#endif
     return false;
 }

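For context, here is a minimal caller sketch; it is not part of the commit above. The only signature taken directly from this diff is rwkv_gpu_offload_layers; the loader and cleanup entry points (rwkv_init_from_file, rwkv_free) follow the upstream rwkv.cpp API and may be named differently in this vendored rwkv_v3 copy, and the model path and counts are placeholders. Note that the call clamps the request to the model's layer count, and because the loop variable aliases ctx->gpu_layers, repeated calls only offload layers that are not already on the GPU; it returns false on CPU-only builds or when nothing new was offloaded.

// Hypothetical usage sketch, assuming the upstream rwkv.cpp loader API.
#include <cstdio>
#include "rwkv_v3.h"

int main() {
    // Placeholder model path and thread count.
    struct rwkv_context * ctx = rwkv_init_from_file("model.bin", 8);
    if (!ctx) {
        return 1;
    }

    // Request up to 24 layers on the GPU; the request is clamped to the
    // model's layer count and is a no-op (returns false) on CPU-only builds.
    if (!rwkv_gpu_offload_layers(ctx, 24)) {
        fprintf(stderr, "GPU offload unavailable or already complete\n");
    }

    // ... evaluate tokens with rwkv_eval() as usual ...

    rwkv_free(ctx);
    return 0;
}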