Skip to content

Commit 5931c1f

Browse files
slaren and ggerganov
authored
ggml : add support for dynamic loading of backends (#10469)
* ggml : add support for dynamic loading of backends --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent f6d12e7 commit 5931c1f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+728
-272
lines changed

Makefile

+2-1
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ endif
251251
#
252252

253253
# keep standard at C11 and C++11
254-
MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
254+
MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
255255
MK_CFLAGS = -std=c11 -fPIC
256256
MK_CXXFLAGS = -std=c++11 -fPIC
257257
MK_NVCCFLAGS = -std=c++11
@@ -290,6 +290,7 @@ endif
290290
# some memory allocation are available on Linux through GNU extensions in libc
291291
ifeq ($(UNAME_S),Linux)
292292
MK_CPPFLAGS += -D_GNU_SOURCE
293+
MK_LDFLAGS += -ldl
293294
endif
294295

295296
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,

Package.swift

+2-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ linkerSettings.append(.linkedFramework("Accelerate"))
4343
cSettings.append(
4444
contentsOf: [
4545
.define("GGML_USE_ACCELERATE"),
46-
.define("GGML_USE_METAL")
46+
.define("GGML_USE_METAL"),
47+
.define("GGML_USE_CPU")
4748
]
4849
)
4950
#endif

common/common.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,9 @@ void common_init() {
377377
#endif
378378

379379
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
380+
381+
// load dynamic backends
382+
ggml_backend_load_all();
380383
}
381384

382385
std::string common_params_get_system_info(const common_params & params) {

examples/CMakeLists.txt

+15-12
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
1212

1313
if (EMSCRIPTEN)
1414
else()
15-
add_subdirectory(cvector-generator)
1615
add_subdirectory(batched-bench)
1716
add_subdirectory(batched)
18-
add_subdirectory(convert-llama2c-to-ggml)
1917
add_subdirectory(embedding)
2018
add_subdirectory(eval-callback)
21-
add_subdirectory(export-lora)
2219
add_subdirectory(gbnf-validator)
2320
add_subdirectory(gguf-hash)
2421
add_subdirectory(gguf-split)
@@ -27,29 +24,35 @@ else()
2724
add_subdirectory(imatrix)
2825
add_subdirectory(infill)
2926
add_subdirectory(llama-bench)
30-
add_subdirectory(llava)
3127
add_subdirectory(lookahead)
3228
add_subdirectory(lookup)
3329
add_subdirectory(main)
3430
add_subdirectory(parallel)
3531
add_subdirectory(passkey)
3632
add_subdirectory(perplexity)
37-
add_subdirectory(quantize-stats)
3833
add_subdirectory(quantize)
3934
add_subdirectory(retrieval)
40-
if (GGML_RPC)
41-
add_subdirectory(rpc)
42-
endif()
4335
if (LLAMA_BUILD_SERVER)
44-
add_subdirectory(server)
45-
endif()
46-
if (GGML_SYCL)
47-
add_subdirectory(sycl)
36+
add_subdirectory(server)
4837
endif()
4938
add_subdirectory(save-load-state)
5039
add_subdirectory(simple)
5140
add_subdirectory(simple-chat)
5241
add_subdirectory(speculative)
5342
add_subdirectory(speculative-simple)
5443
add_subdirectory(tokenize)
44+
if (NOT GGML_BACKEND_DL)
45+
# these examples use the backends directly and cannot be built with dynamic loading
46+
add_subdirectory(convert-llama2c-to-ggml)
47+
add_subdirectory(cvector-generator)
48+
add_subdirectory(export-lora)
49+
add_subdirectory(quantize-stats)
50+
add_subdirectory(llava)
51+
if (GGML_RPC)
52+
add_subdirectory(rpc)
53+
endif()
54+
if (GGML_SYCL)
55+
add_subdirectory(sycl)
56+
endif()
57+
endif()
5558
endif()

examples/eval-callback/CMakeLists.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
55
target_compile_features(${TARGET} PRIVATE cxx_std_11)
66

77
set(TEST_TARGET test-eval-callback)
8-
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
8+
add_test(NAME ${TEST_TARGET}
9+
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
910
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

examples/llama-bench/llama-bench.cpp

+13-2
Original file line numberDiff line numberDiff line change
@@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) {
14771477

14781478
cmd_params params = parse_cmd_params(argc, argv);
14791479

1480+
// initialize backends
1481+
ggml_backend_load_all();
1482+
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
1483+
if (!cpu_dev) {
1484+
fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
1485+
return 1;
1486+
}
1487+
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
1488+
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
1489+
auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
1490+
14801491
// initialize llama.cpp
14811492
if (!params.verbose) {
14821493
llama_log_set(llama_null_log_callback, NULL);
@@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) {
15511562
tpp.poll = t.poll;
15521563
tpp.prio = params.prio;
15531564

1554-
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
1565+
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
15551566
if (!threadpool) {
15561567
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
15571568
exit(1);
@@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) {
16121623

16131624
llama_free(ctx);
16141625

1615-
ggml_threadpool_free(threadpool);
1626+
ggml_threadpool_free_fn(threadpool);
16161627
}
16171628

16181629
llama_free_model(lmodel);

examples/main/main.cpp

+8-4
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,10 @@ int main(int argc, char ** argv) {
165165

166166
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
167167

168+
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
169+
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
170+
auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
171+
168172
struct ggml_threadpool_params tpp_batch =
169173
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
170174
struct ggml_threadpool_params tpp =
@@ -174,7 +178,7 @@ int main(int argc, char ** argv) {
174178

175179
struct ggml_threadpool * threadpool_batch = NULL;
176180
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
177-
threadpool_batch = ggml_threadpool_new(&tpp_batch);
181+
threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
178182
if (!threadpool_batch) {
179183
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
180184
return 1;
@@ -184,7 +188,7 @@ int main(int argc, char ** argv) {
184188
tpp.paused = true;
185189
}
186190

187-
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
191+
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
188192
if (!threadpool) {
189193
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
190194
return 1;
@@ -890,8 +894,8 @@ int main(int argc, char ** argv) {
890894

891895
llama_backend_free();
892896

893-
ggml_threadpool_free(threadpool);
894-
ggml_threadpool_free(threadpool_batch);
897+
ggml_threadpool_free_fn(threadpool);
898+
ggml_threadpool_free_fn(threadpool_batch);
895899

896900
return 0;
897901
}

examples/simple-chat/simple-chat.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
6262
}
6363
}, nullptr);
6464

65+
// load dynamic backends
66+
ggml_backend_load_all();
67+
6568
// initialize the model
6669
llama_model_params model_params = llama_model_default_params();
6770
model_params.n_gpu_layers = ngl;

examples/simple/simple.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
7474
}
7575
}
7676

77+
// load dynamic backends
78+
79+
ggml_backend_load_all();
80+
7781
// initialize the model
7882

7983
llama_model_params model_params = llama_model_default_params();

ggml/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ else()
3333
endif()
3434

3535
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
36+
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
3637

3738
#
3839
# option list

ggml/include/ggml-backend.h

+15
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,14 @@ extern "C" {
190190
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
191191
// Get additional buffer types provided by the device (returns a NULL-terminated array)
192192
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
193+
// Set the abort callback for the backend
194+
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
195+
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
196+
struct ggml_backend_feature {
197+
const char * name;
198+
const char * value;
199+
};
200+
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
193201

194202
//
195203
// Backend registry
@@ -214,6 +222,13 @@ extern "C" {
214222
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
215223
GGML_API ggml_backend_t ggml_backend_init_best(void);
216224

225+
// Load a backend from a dynamic library and register it
226+
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
227+
// Unload a backend if loaded dynamically and unregister it
228+
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
229+
// Load all known backends from dynamic libraries
230+
GGML_API void ggml_backend_load_all(void);
231+
217232
//
218233
// Backend scheduler
219234
//

ggml/include/ggml-cpu.h

+6-32
Original file line numberDiff line numberDiff line change
@@ -7,29 +7,6 @@
77
extern "C" {
88
#endif
99

10-
// Scheduling priorities
11-
enum ggml_sched_priority {
12-
GGML_SCHED_PRIO_NORMAL,
13-
GGML_SCHED_PRIO_MEDIUM,
14-
GGML_SCHED_PRIO_HIGH,
15-
GGML_SCHED_PRIO_REALTIME
16-
};
17-
18-
// Threadpool params
19-
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
20-
struct ggml_threadpool_params {
21-
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
22-
int n_threads; // number of threads
23-
enum ggml_sched_priority prio; // thread priority
24-
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
25-
bool strict_cpu; // strict cpu placement
26-
bool paused; // start in paused state
27-
};
28-
29-
struct ggml_threadpool; // forward declaration, see ggml.c
30-
31-
typedef struct ggml_threadpool * ggml_threadpool_t;
32-
3310
// the compute plan that needs to be prepared for ggml_graph_compute()
3411
// since https://github.com/ggerganov/ggml/issues/287
3512
struct ggml_cplan {
@@ -75,14 +52,11 @@ extern "C" {
7552
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
7653
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
7754

78-
GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
79-
GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
80-
GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
81-
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
82-
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
83-
GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
84-
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
85-
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
55+
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
56+
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
57+
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
58+
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
59+
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
8660

8761
// ggml_graph_plan() has to be called before ggml_graph_compute()
8862
// when plan.work_size > 0, caller must allocate memory for plan.work_data
@@ -104,10 +78,10 @@ extern "C" {
10478
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
10579
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
10680
GGML_BACKEND_API int ggml_cpu_has_avx (void);
81+
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
10782
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
10883
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
10984
GGML_BACKEND_API int ggml_cpu_has_fma (void);
110-
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
11185
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
11286
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
11387
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);

ggml/include/ggml.h

+31
Original file line numberDiff line numberDiff line change
@@ -2215,6 +2215,37 @@ extern "C" {
22152215

22162216
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
22172217

2218+
// ggml threadpool
2219+
// TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
2220+
// the goal should be to create an API that other backends can use move everything to the ggml base
2221+
2222+
// scheduling priorities
2223+
enum ggml_sched_priority {
2224+
GGML_SCHED_PRIO_NORMAL,
2225+
GGML_SCHED_PRIO_MEDIUM,
2226+
GGML_SCHED_PRIO_HIGH,
2227+
GGML_SCHED_PRIO_REALTIME
2228+
};
2229+
2230+
// threadpool params
2231+
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
2232+
struct ggml_threadpool_params {
2233+
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
2234+
int n_threads; // number of threads
2235+
enum ggml_sched_priority prio; // thread priority
2236+
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
2237+
bool strict_cpu; // strict cpu placement
2238+
bool paused; // start in paused state
2239+
};
2240+
2241+
struct ggml_threadpool; // forward declaration, see ggml.c
2242+
2243+
typedef struct ggml_threadpool * ggml_threadpool_t;
2244+
2245+
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
2246+
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
2247+
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
2248+
22182249
#ifdef __cplusplus
22192250
}
22202251
#endif

0 commit comments

Comments (0)