Commit 54ec8a9

llama : add llama_init_backend() API (close ggml-org#1527)
1 parent f401d5f commit 54ec8a9

7 files changed: +48 -29 lines

examples/benchmark/benchmark-matmult.cpp

Lines changed: 2 additions & 1 deletion

@@ -1,6 +1,7 @@
-#include <locale.h>
 #include "ggml.h"
 #include "build-info.h"
+
+#include <locale.h>
 #include <assert.h>
 #include <math.h>
 #include <cstring>

examples/embedding/embedding.cpp

Lines changed: 2 additions & 0 deletions

@@ -31,6 +31,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
+    llama_init_backend();
+
     llama_context * ctx;
 
     // load the model

examples/main/main.cpp

Lines changed: 1 addition & 2 deletions

@@ -96,8 +96,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    // params.prompt = R"(// this function checks if the number n is prime
-//bool is_prime(int n) {)";
+    llama_init_backend();
 
     llama_context * ctx;
     g_ctx = &ctx;

examples/perplexity/perplexity.cpp

Lines changed: 2 additions & 0 deletions

@@ -143,6 +143,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
+    llama_init_backend();
+
     llama_context * ctx;
 
     // load the model and apply lora adapter, if any

examples/quantize/quantize.cpp

Lines changed: 7 additions & 14 deletions

@@ -1,7 +1,7 @@
-#include "ggml.h"
-#include "llama.h"
 #include "build-info.h"
 
+#include "llama.h"
+
 #include <cstdio>
 #include <map>
 #include <string>
@@ -42,8 +42,6 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
 // ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 int main(int argc, char ** argv) {
-    ggml_time_init();
-
     if (argc < 3) {
         fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
         for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
@@ -52,12 +50,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
+    llama_init_backend();
 
     // parse command line arguments
     const std::string fname_inp = argv[1];
@@ -116,25 +109,25 @@ int main(int argc, char ** argv) {
     }
     fprintf(stderr, "\n");
 
-    const int64_t t_main_start_us = ggml_time_us();
+    const int64_t t_main_start_us = llama_time_us();
 
     int64_t t_quantize_us = 0;
 
     // load the model
     {
-        const int64_t t_start_us = ggml_time_us();
+        const int64_t t_start_us = llama_time_us();
 
         if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
 
-        t_quantize_us = ggml_time_us() - t_start_us;
+        t_quantize_us = llama_time_us() - t_start_us;
     }
 
     // report timing
     {
-        const int64_t t_main_end_us = ggml_time_us();
+        const int64_t t_main_end_us = llama_time_us();
 
         printf("\n");
         printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);

llama.cpp

Lines changed: 15 additions & 0 deletions

@@ -847,6 +847,21 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //

llama.h

Lines changed: 19 additions & 12 deletions

@@ -40,9 +40,9 @@ extern "C" {
     typedef int llama_token;
 
     typedef struct llama_token_data {
-        llama_token id; // token id
-        float logit; // log-odds of the token
-        float p; // probability of the token
+        llama_token id; // token id
+        float logit;    // log-odds of the token
+        float p;        // probability of the token
     } llama_token_data;
 
     typedef struct llama_token_data_array {
@@ -73,23 +73,30 @@ extern "C" {
 
     // model file types
     enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32 = 0,
-        LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
 
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
 
+    // TODO: not great API - very likely to change
+    // Initialize the llama + ggml backend
+    // Call once at the start of the program
+    LLAMA_API void llama_init_backend();
+
+    LLAMA_API int64_t llama_time_us();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
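
For context (not part of the commit itself): a minimal sketch of how a caller might use the new API, mirroring what the updated examples above do. The timing printout and the overall program are illustrative placeholders only; llama_init_backend() and llama_time_us() are the functions added by this commit.

    #include <cstdint>
    #include <cstdio>

    #include "llama.h"

    int main() {
        // initialize the llama + ggml backend once, at program start,
        // before any llama_context is created
        llama_init_backend();

        const int64_t t_start_us = llama_time_us();

        // ... load a model and do work here ...

        const int64_t t_elapsed_us = llama_time_us() - t_start_us;
        printf("elapsed: %8.2f ms\n", t_elapsed_us / 1000.0);

        return 0;
    }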
