
Commit 506177d

Xarbirus authored and ggerganov committed
llama : add abort_callback to interrupt computation (ggml-org#5409)
* using abort_callback from ggml to stop llama computation
* format fix
* a brief explaining comment

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 1a5ed7a commit 506177d
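
This commit threads ggml's abort callback through the llama API so a caller can interrupt a running llama_decode() (per the header comment in the diff below, currently only for CPU execution). A minimal usage sketch, assuming a hypothetical atomic flag set from another thread (the flag is not part of this commit):

    #include <stdatomic.h>
    #include "llama.h"

    static atomic_bool g_should_abort;       // hypothetical flag, set elsewhere

    // Signature matches ggml_abort_callback: return true to stop computation.
    static bool my_abort_cb(void * data) {
        (void) data;                         // no per-callback state needed here
        return atomic_load(&g_should_abort);
    }

    // After creating a context:
    //     llama_set_abort_callback(ctx, my_abort_cb, NULL);
    // Storing true into g_should_abort from another thread interrupts the
    // next graph computation inside llama_decode().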

2 files changed: +27 additions, -4 deletions

llama.cpp

Lines changed: 16 additions & 2 deletions
@@ -1987,6 +1987,9 @@ struct llama_context {
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;
 
+    ggml_abort_callback abort_callback      = nullptr;
+    void *              abort_callback_data = nullptr;
+
     // input tensors
     ggml_backend_buffer_t buf_input = nullptr;
     ggml_context * ctx_input = nullptr;
@@ -8071,6 +8074,7 @@ static void llama_graph_compute(
 
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
 
     ggml_backend_sched_graph_compute(lctx.sched, gf);
@@ -11856,6 +11860,8 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding           =*/ false,
         /*.offload_kqv         =*/ true,
         /*.do_pooling          =*/ true,
+        /*.abort_callback      =*/ nullptr,
+        /*.abort_callback_data =*/ nullptr,
     };
 
     return result;
@@ -12038,8 +12044,11 @@ struct llama_context * llama_new_context_with_model(
     LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n",   __func__, cparams.rope_freq_scale);
 
-    ctx->rng                 = std::mt19937(params.seed);
-    ctx->logits_all          = params.logits_all;
+    ctx->abort_callback      = params.abort_callback;
+    ctx->abort_callback_data = params.abort_callback_data;
+
+    ctx->rng                 = std::mt19937(params.seed);
+    ctx->logits_all          = params.logits_all;
 
     const ggml_type type_k = params.type_k;
     const ggml_type type_v = params.type_v;
@@ -12989,6 +12998,11 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
+void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
 struct llama_batch llama_batch_get_one(
              llama_token * tokens,
                  int32_t   n_tokens,
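
As the hunks above show, the callback can also be installed at context creation via llama_context_params, in addition to llama_set_abort_callback(). A sketch assuming my_abort_cb from the earlier example and an already-loaded model:

    struct llama_context_params cparams = llama_context_default_params();
    cparams.abort_callback      = my_abort_cb;   // from the sketch above
    cparams.abort_callback_data = NULL;

    // model: a llama_model * obtained earlier (assumed, not shown in this diff)
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);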

llama.h

Lines changed: 11 additions & 2 deletions
@@ -255,10 +255,16 @@ extern "C" {
         enum ggml_type type_v; // data type for V cache
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embedding;   // embedding mode only
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+
+        // Abort callback
+        // if it returns true, execution of llama_decode() will be aborted
+        // currently works only with CPU execution
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };
 
     // model quantization parameters
@@ -632,7 +638,10 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
-    // Token logits obtained from the last call to llama_eval()
+    // Set abort callback
+    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+    // Token logits obtained from the last call to llama_decode()
     // The logits for the last token are stored in the last row
     // Logits for which llama_batch.logits[i] == 0 are undefined
     // Rows: n_tokens provided with llama_batch
