Commit c0fe629

fix CUDA split buffers

1 parent 3a77442

File tree: ggml-backend.c, ggml-cuda.cu, llama.cpp

3 files changed: +21 -11 lines changed

ggml-backend.c

+10 -2
@@ -1051,8 +1051,9 @@ struct ggml_backend_sched {
     struct ggml_cgraph * graph;

     // graph splits
-    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
+    struct ggml_backend_sched_split * splits;
     int n_splits;
+    int splits_capacity;

     // pipeline parallelism support
     int n_copies;
@@ -1443,6 +1444,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         if (node_backend_id != cur_backend_id || offload) {
             split->i_end = i;
             i_split++;
+            if (i_split >= sched->splits_capacity) {
+                sched->splits_capacity *= 2;
+                sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+            }
             GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
             split = &sched->splits[i_split];
             split->backend_id = node_backend_id;
@@ -1711,7 +1716,9 @@ ggml_backend_sched_t ggml_backend_sched_new(

     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

-    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+    const int initial_splits_capacity = 16;
+    sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+    sched->splits_capacity = initial_splits_capacity;

     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
@@ -1742,6 +1749,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
+    free(sched->splits);
     free(sched->hash_set.keys);
     free(sched->tensor_backend_id);
     free(sched->tensor_copies);
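The change above replaces the fixed splits[GGML_SCHED_MAX_SPLITS] array with a heap allocation that starts at 16 entries and doubles whenever the graph produces more splits than currently fit. Below is a minimal C sketch of the same grow-on-demand pattern in isolation; the split_list_* names and the placeholder struct split are illustrative only and not part of the ggml API.

#include <assert.h>
#include <stdlib.h>

// Placeholder element type; in ggml-backend.c this role is played by
// struct ggml_backend_sched_split.
struct split {
    int backend_id;
    int i_start;
    int i_end;
};

struct split_list {
    struct split * data; // heap-allocated, grows on demand
    int n;               // splits currently in use
    int capacity;        // allocated slots
};

static void split_list_init(struct split_list * list, int initial_capacity) {
    list->data     = calloc(initial_capacity, sizeof(list->data[0]));
    list->n        = 0;
    list->capacity = initial_capacity;
}

static struct split * split_list_push(struct split_list * list) {
    if (list->n >= list->capacity) {
        // doubling keeps the amortized cost of a push constant
        list->capacity *= 2;
        list->data = realloc(list->data, list->capacity * sizeof(list->data[0]));
        assert(list->data != NULL);
    }
    return &list->data[list->n++];
}

static void split_list_free(struct split_list * list) {
    // mirrors the new free(sched->splits) in ggml_backend_sched_free
    free(list->data);
}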

ggml-cuda.cu

+6 -8
@@ -10755,6 +10755,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
 };

 GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    ggml_init_cublas();
+
     // FIXME: this is not thread safe
     if (device >= ggml_backend_cuda_get_device_count()) {
         return nullptr;
@@ -11039,6 +11041,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
 };

 GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+    ggml_init_cublas();
+
     // FIXME: this is not thread safe
     static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;

@@ -11389,15 +11393,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
 }

 GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
-    const ggml_tensor * dst = op;
-
     const int min_batch_size = 32;

-    if (dst->ne[1] > min_batch_size && dst->op != GGML_OP_GET_ROWS) {
-        return true;
-    }
-
-    return false;
+    return op->ne[1] > min_batch_size && op->op != GGML_OP_GET_ROWS;
 }

 static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
@@ -11476,7 +11474,7 @@ static ggml_guid_t ggml_backend_cuda_guid() {
 }

 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
-    ggml_init_cublas(); // TODO: remove from ggml.c
+    ggml_init_cublas();

     if (device < 0 || device >= ggml_cuda_get_device_count()) {
         fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
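With this change the two buffer-type getters call ggml_init_cublas() themselves, so CUDA state is initialized even when a caller requests a (split) buffer type before creating a CUDA backend with ggml_backend_cuda_init(). A small usage sketch under that assumption; pick_cuda_buft is a hypothetical helper, not part of the API, and ggml-cuda.h is assumed to declare the getters shown in the diff.

#include "ggml-cuda.h" // assumed header for the CUDA backend declarations

// Hypothetical helper: choose a CUDA buffer type before any backend exists.
// With this commit, both getters initialize cuBLAS on first use, so the
// call order relative to ggml_backend_cuda_init() no longer matters.
static ggml_backend_buffer_type_t pick_cuda_buft(const float * tensor_split, int n_devices) {
    if (n_devices > 1) {
        // spreads each tensor's rows across devices according to tensor_split
        return ggml_backend_cuda_split_buffer_type(tensor_split);
    }
    // plain single-device buffer type on device 0
    return ggml_backend_cuda_buffer_type(0);
}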

llama.cpp

+5 -1
@@ -5039,7 +5039,11 @@ static bool llm_load_tensors(
             ml.get_mapping_range(&first, &last, ctx);
             buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
 #ifdef GGML_USE_CUBLAS
-            ggml_backend_cuda_register_host_buffer((char *) ml.mapping->addr + first, last - first);
+            if (n_layer >= n_gpu_layers) {
+                ggml_backend_cuda_register_host_buffer(
+                    ggml_backend_buffer_get_base(buf),
+                    ggml_backend_buffer_get_size(buf));
+            }
 #endif
         }
 #ifdef GGML_USE_METAL
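The llama.cpp hunk now registers the mmapped CPU buffer with CUDA (pinning it for faster host-to-device transfers) only when the model is not fully offloaded, and it passes the buffer's own base and size instead of recomputing the mapping range. A sketch of the same guard as a standalone helper, assuming the public ggml-backend and ggml-cuda headers; maybe_pin_host_buffer is a hypothetical name, and the rationale in the comment is an inference from the condition, not stated in the commit.

#include "ggml-backend.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif

// Hypothetical helper wrapping the new guard: pin the mmapped weight buffer
// for CUDA only when some layers stay on the CPU, presumably because a fully
// offloaded model no longer reads weights from host memory during inference.
static void maybe_pin_host_buffer(ggml_backend_buffer_t buf, int n_layer, int n_gpu_layers) {
#ifdef GGML_USE_CUBLAS
    if (n_layer >= n_gpu_layers) {
        ggml_backend_cuda_register_host_buffer(
            ggml_backend_buffer_get_base(buf),
            ggml_backend_buffer_get_size(buf));
    }
#else
    (void) buf; (void) n_layer; (void) n_gpu_layers;
#endif
}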
