
Commit 0867742

using abort_callback from ggml to stop llama computation
1 parent aa7ab99 commit 0867742

File tree

5 files changed: +52 −12 lines changed


ggml-backend.c (+23 −5)

@@ -653,6 +653,9 @@ struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
     size_t work_size;
+
+    ggml_abort_callback abort_callback;
+    void * abort_callback_data;
 };
 
 GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
@@ -691,6 +694,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
     }
 
+    cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     return cpu_plan;
 }
 
@@ -721,9 +727,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
         cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
         cpu_ctx->work_size = cplan.work_size;
     }
-
     cplan.work_data = cpu_ctx->work_data;
 
+    cplan.abort_callback = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     ggml_graph_compute(cgraph, &cplan);
     return true;
 }
@@ -759,9 +767,11 @@ static struct ggml_backend_i cpu_backend_i = {
 ggml_backend_t ggml_backend_cpu_init(void) {
     struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
 
-    ctx->n_threads = GGML_DEFAULT_N_THREADS;
-    ctx->work_data = NULL;
-    ctx->work_size = 0;
+    ctx->n_threads = GGML_DEFAULT_N_THREADS;
+    ctx->work_data = NULL;
+    ctx->work_size = 0;
+    ctx->abort_callback = NULL;
+    ctx->abort_callback_data = NULL;
 
     ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
 
@@ -776,13 +786,21 @@ GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
     return backend && backend->iface.get_name == ggml_backend_cpu_name;
 }
 
-void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
+GGML_CALL void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
 
     struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
     ctx->n_threads = n_threads;
 }
 
+GGML_CALL void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
 GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }
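
For reference, the CPU backend now stores the callback in its context and copies it into every ggml_cplan it computes, so ggml_graph_compute can bail out between graph nodes. A minimal sketch of driving the new setter directly (the g_stop flag and the init_cpu_backend helper are illustrative, not part of this commit):

    #include <atomic>
    #include "ggml-backend.h"

    // Illustrative stop flag, flipped from another thread (signal handler, UI, ...).
    static std::atomic<bool> g_stop{false};

    // Matches typedef bool (*ggml_abort_callback)(void * data);
    // returning true asks the backend to abort the graph it is computing.
    static bool should_abort(void * /*data*/) {
        return g_stop.load();
    }

    ggml_backend_t init_cpu_backend(int n_threads) {
        ggml_backend_t backend = ggml_backend_cpu_init();
        ggml_backend_cpu_set_n_threads(backend, n_threads);
        ggml_backend_cpu_set_abort_callback(backend, should_abort, /*abort_callback_data=*/nullptr);
        return backend; // graphs computed on this backend stop once g_stop is set
    }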

ggml-backend.h (+3 −3)

@@ -80,11 +80,11 @@ extern "C" {
     //
     // CPU backend
     //
-
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);
 
-    GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+    GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
+    GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
     // Create a backend buffer from an existing pointer
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);

ggml.h (+4 −2)

@@ -567,6 +567,8 @@ extern "C" {
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    typedef bool (*ggml_abort_callback)(void * data);
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -576,8 +578,8 @@ extern "C" {
         int n_threads;
 
         // abort ggml_graph_compute when true
-        bool (*abort_callback)(void * data);
-        void * abort_callback_data;
+        ggml_abort_callback abort_callback;
+        void * abort_callback_data;
     };
 
     enum ggml_cgraph_eval_order {
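
The typedef only names the function-pointer signature that ggml_cplan already carried; behaviour is unchanged. As a rough sketch of how the field is consumed when calling ggml_graph_compute directly (the deadline payload and helper are illustrative, not part of this commit):

    #include <chrono>
    #include <cstdint>
    #include <vector>
    #include "ggml.h"

    // Illustrative payload: abort once a wall-clock deadline has passed.
    struct deadline {
        std::chrono::steady_clock::time_point until;
    };

    // Signature matches ggml_abort_callback; returning true means "stop computing".
    static bool past_deadline(void * data) {
        const auto * d = static_cast<const deadline *>(data);
        return std::chrono::steady_clock::now() > d->until;
    }

    void compute_with_deadline(struct ggml_cgraph * graph, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

        std::vector<uint8_t> work(plan.work_size); // work buffer required by the plan
        plan.work_data = work.data();

        deadline d = { std::chrono::steady_clock::now() + std::chrono::seconds(5) };
        plan.abort_callback      = past_deadline;
        plan.abort_callback_data = &d;

        ggml_graph_compute(graph, &plan);
    }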

llama.cpp (+16 −2)

@@ -1842,6 +1842,9 @@ struct llama_context {
     // allocator for the input tensors
     ggml_tallocr * alloc = nullptr;
 
+    ggml_abort_callback abort_callback = nullptr;
+    void * abort_callback_data = nullptr;
+
     // input tensors
     ggml_backend_buffer_t buf_input = nullptr;
     ggml_context * ctx_input = nullptr;
@@ -7300,6 +7303,7 @@ static int llama_decode_internal(
 
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
     ggml_backend_sched_graph_compute(lctx.sched, gf);
 
@@ -10482,6 +10486,8 @@ struct llama_context_params llama_context_default_params() {
         /*.logits_all =*/ false,
         /*.embedding =*/ false,
         /*.offload_kqv =*/ true,
+        /*.abort_callback =*/ nullptr,
+        /*.abort_callback_data =*/ nullptr,
     };
 
     return result;
@@ -10670,8 +10676,11 @@ struct llama_context * llama_new_context_with_model(
     LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
 
-    ctx->rng = std::mt19937(params.seed);
-    ctx->logits_all = params.logits_all;
+    ctx->abort_callback = params.abort_callback;
+    ctx->abort_callback_data = params.abort_callback_data;
+
+    ctx->rng = std::mt19937(params.seed);
+    ctx->logits_all = params.logits_all;
 
     const ggml_type type_k = params.type_k;
     const ggml_type type_v = params.type_v;
@@ -11575,6 +11584,11 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
+void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
 struct llama_batch llama_batch_get_one(
         llama_token * tokens,
         int32_t n_tokens,

llama.h (+6 −0)

@@ -235,6 +235,9 @@ extern "C" {
         bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embedding;   // embedding mode only
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+
+        ggml_abort_callback abort_callback;
+        void * abort_callback_data;
     };
 
     // model quantization parameters
@@ -612,6 +615,9 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
+    // Set abort callback
+    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
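
Taken together, an application can now cancel a long-running llama_decode from outside: either fill the two new llama_context_params fields before creating the context, or swap the callback later with llama_set_abort_callback. A hedged usage sketch (the g_cancel flag and the make_context helper are illustrative; only the params fields and the setter come from this commit):

    #include <atomic>
    #include "llama.h"

    // Illustrative flag, e.g. flipped by a SIGINT handler or a UI "stop" button.
    static std::atomic<bool> g_cancel{false};

    // Returning true makes the CPU backend abort the graph of the current decode.
    static bool cancel_requested(void * /*data*/) {
        return g_cancel.load();
    }

    llama_context * make_context(llama_model * model) {
        llama_context_params params = llama_context_default_params();
        params.abort_callback      = cancel_requested;
        params.abort_callback_data = nullptr;
        return llama_new_context_with_model(model, params);
    }

    // The callback can also be replaced at runtime:
    //     llama_set_abort_callback(ctx, cancel_requested, nullptr);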
