Commit 8a793ce

llama : accept a list of devices to use to offload a model
1 parent 9ca2e67 commit 8a793ce

File tree

7 files changed, +99 -24 lines


common/arg.cpp

+53-4
@@ -1312,6 +1312,40 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            auto devices = string_split<std::string>(value, ',');
+            if (devices.empty()) {
+                throw std::invalid_argument("no devices specified");
+            }
+            for (const auto & device : devices) {
+                auto * dev = ggml_backend_dev_by_name(device.c_str());
+                if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+                }
+                params.devices.push_back(dev);
+            }
+            params.devices.push_back(nullptr);
+        }
+    ).set_env("LLAMA_ARG_DEVICES"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    size_t free, total;
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    printf("%s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                }
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1336,10 +1370,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-                exit(1);
-#endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 throw std::invalid_argument("invalid value");
@@ -2042,6 +2072,25 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.n_ctx = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading the draft model\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            auto devices = string_split<std::string>(value, ',');
+            if (devices.empty()) {
+                throw std::invalid_argument("no devices specified");
+            }
+            for (const auto & device : devices) {
+                auto * dev = ggml_backend_dev_by_name(device.c_str());
+                if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+                }
+                params.speculative.devices.push_back(dev);
+            }
+            params.speculative.devices.push_back(nullptr);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",

common/common.cpp

+4-1
@@ -982,9 +982,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
     }
 }
 
-struct llama_model_params common_model_params_to_llama(const common_params & params) {
+struct llama_model_params common_model_params_to_llama(common_params & params) {
     auto mparams = llama_model_default_params();
 
+    if (!params.devices.empty()) {
+        mparams.devices = params.devices.data();
+    }
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
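
Note: passing params.devices.data() directly works because the argument parser above appends a trailing nullptr to the vector, so the buffer already satisfies the NULL-terminated contract of llama_model_params::devices.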

common/common.h

+9-5
@@ -156,6 +156,7 @@ struct common_params_sampling {
 };
 
 struct common_params_speculative {
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
     int32_t n_ctx = 0; // draft context size
     int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@@ -178,9 +179,6 @@ struct common_params {
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -193,6 +191,13 @@ struct common_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = 0.1f; // KV cache defragmentation threshold
 
+    // offload params
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
@@ -201,7 +206,6 @@ struct common_params {
 
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
@@ -462,7 +466,7 @@ struct common_init_result {
 
 struct common_init_result common_init_from_params(common_params & params);
 
-struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_model_params common_model_params_to_llama ( common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

examples/speculative/speculative.cpp

+1
@@ -76,6 +76,7 @@ int main(int argc, char ** argv) {
     ctx_tgt = llama_init_tgt.context;
 
     // load the draft model
+    params.devices = params.speculative.devices;
     params.model = params.speculative.model;
    params.n_gpu_layers = params.speculative.n_gpu_layers;
     if (params.speculative.cpuparams.n_threads > 0) {
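
Copying params.speculative.devices into params.devices before loading the draft model makes the -devd list take effect; if -devd was not given the vector is empty, common_model_params_to_llama leaves mparams.devices as NULL, and the draft model falls back to using all available devices.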

ggml/src/ggml-backend-reg.cpp

+11-2
@@ -253,6 +253,15 @@ void ggml_backend_device_register(ggml_backend_dev_t device) {
 }
 
 // Backend (reg) enumeration
+static bool striequals(const char * a, const char * b) {
+    for (; *a && *b; a++, b++) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+    }
+    return *a == *b;
+}
+
 size_t ggml_backend_reg_count() {
     return get_reg().backends.size();
 }
@@ -265,7 +274,7 @@ ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
 ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
     for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
         ggml_backend_reg_t reg = ggml_backend_reg_get(i);
-        if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) {
+        if (striequals(ggml_backend_reg_name(reg), name)) {
             return reg;
         }
     }
@@ -285,7 +294,7 @@ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
 ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
     for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-        if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
+        if (striequals(ggml_backend_dev_name(dev), name)) {
             return dev;
         }
     }
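
With strcmp replaced by striequals, backend and device lookups by name become case-insensitive. A small sketch of the effect, assuming a build that registers a device named CUDA0:

    // both lookups now resolve to the same device; before this change only the exact-case name matched
    ggml_backend_dev_t a = ggml_backend_dev_by_name("CUDA0");
    ggml_backend_dev_t b = ggml_backend_dev_by_name("cuda0"); // previously NULL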

include/llama.h

+3
@@ -272,6 +272,9 @@ extern "C" {
     };
 
     struct llama_model_params {
+        // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
+        ggml_backend_dev_t * devices;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
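
A minimal sketch of using the new field through the public API (the device name "CUDA0" and the model path are placeholders, and error handling is reduced to the essentials; leaving devices as NULL keeps the previous behavior of using all available GPUs):

    #include "llama.h"
    #include "ggml-backend.h"

    int main() {
        // resolve a device by name and build a NULL-terminated list
        ggml_backend_dev_t dev = ggml_backend_dev_by_name("CUDA0"); // placeholder name
        if (!dev) {
            return 1;
        }
        ggml_backend_dev_t devs[] = { dev, nullptr };

        llama_model_params mparams = llama_model_default_params();
        mparams.devices      = devs; // only this device will be offloaded to
        mparams.n_gpu_layers = 99;

        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
        if (!model) {
            return 1;
        }
        llama_free_model(model);
        return 0;
    }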

src/llama.cpp

+18-12
@@ -19364,6 +19364,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
+        /*.devices =*/ nullptr,
         /*.n_gpu_layers =*/ 0,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
@@ -19576,19 +19577,24 @@ struct llama_model * llama_load_model_from_file(
     }
 
     // create list of devices to use with this model
-    // currently, we use all available devices
-    // TODO: rework API to give user more control over device selection
-    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-        switch (ggml_backend_dev_type(dev)) {
-            case GGML_BACKEND_DEVICE_TYPE_CPU:
-            case GGML_BACKEND_DEVICE_TYPE_ACCEL:
-                // skip CPU backends since they are handled separately
-                break;
+    if (params.devices) {
+        for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
+            model->devices.push_back(*dev);
+        }
+    } else {
+        // use all available devices
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            switch (ggml_backend_dev_type(dev)) {
+                case GGML_BACKEND_DEVICE_TYPE_CPU:
+                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                    // skip CPU backends since they are handled separately
+                    break;
 
-            case GGML_BACKEND_DEVICE_TYPE_GPU:
-                model->devices.push_back(dev);
-                break;
+                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                    model->devices.push_back(dev);
+                    break;
+            }
         }
     }
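
When params.devices is non-NULL, llama_load_model_from_file takes the list as-is and skips the GPU-type filtering of the fallback path, so the caller is expected to supply GPU devices; the common argument parser above already enforces this for -dev and -devd.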
