llama : update API names to use correct prefix #11174

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged 4 commits on Jan 11, 2025
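All of the changes follow one renaming pattern: vocab queries move from the `llama_token_*` / `llama_*` prefixes to `llama_vocab_*` and operate on a `llama_vocab` pointer obtained with `llama_model_get_vocab`, per-model queries gain the `llama_model_` prefix, and `llama_new_context_with_model` becomes `llama_init_from_model`. The sketch below is illustrative only; the `example()` wrapper and its arguments are not part of this PR, it simply pairs old calls with their replacements as they appear in the diff.

```cpp
// Minimal sketch of the rename pattern (illustrative, not part of the diff).
#include "llama.h"

static void example(llama_model * model, llama_context_params cparams) {
    // the vocab handle is now requested from the model explicitly
    const llama_vocab * vocab = llama_model_get_vocab(model);        // was llama_get_vocab(model)

    // token/vocab queries: llama_token_* / llama_* -> llama_vocab_*
    llama_token bos   = llama_vocab_bos(vocab);                      // was llama_token_bos(vocab)
    llama_token eos   = llama_vocab_eos(vocab);                      // was llama_token_eos(vocab)
    int         n_voc = llama_vocab_n_vocab(vocab);                  // was llama_n_vocab(vocab)

    // per-model queries: llama_n_* -> llama_model_n_*
    int n_embd  = llama_model_n_embd(model);                         // was llama_n_embd(model)
    int n_layer = llama_model_n_layer(model);                        // was llama_n_layer(model)

    // context creation
    llama_context * ctx = llama_init_from_model(model, cparams);     // was llama_new_context_with_model(model, cparams)

    (void) bos; (void) eos; (void) n_voc; (void) n_embd; (void) n_layer; (void) ctx;
}
```

In the diff below, removed lines are prefixed with `-` and their replacements with `+`.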
Changes from all commits
28 changes: 14 additions & 14 deletions common/common.cpp
@@ -857,22 +857,22 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}

- const llama_vocab * vocab = llama_get_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);

if (params.reranking) {
bool ok = true;

- if (llama_token_bos(vocab) == LLAMA_TOKEN_NULL) {
+ if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
ok = false;
}

- if (llama_token_eos(vocab) == LLAMA_TOKEN_NULL) {
+ if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
ok = false;
}

- if (llama_token_sep(vocab) == LLAMA_TOKEN_NULL) {
+ if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
@@ -886,7 +886,7 @@ struct common_init_result common_init_from_params(common_params & params) {

auto cparams = common_context_params_to_llama(params);

- llama_context * lctx = llama_new_context_with_model(model, cparams);
+ llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_model_free(model);
@@ -900,7 +900,7 @@ struct common_init_result common_init_from_params(common_params & params) {

if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
- if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+ if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);

const auto cvec = common_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) {
@@ -943,14 +943,14 @@ struct common_init_result common_init_from_params(common_params & params) {
common_lora_adapters_apply(lctx, params.lora_adapters);
}

- if (params.sampling.ignore_eos && llama_token_eos(vocab) == LLAMA_TOKEN_NULL) {
+ if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sampling.ignore_eos = false;
}

if (params.sampling.ignore_eos) {
- for (llama_token i = 0; i < llama_n_vocab(vocab); i++) {
- if (llama_token_is_eog(vocab, i)) {
+ for (llama_token i = 0; i < llama_vocab_n_vocab(vocab); i++) {
+ if (llama_vocab_is_eog(vocab, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias.push_back({i, -INFINITY});
}
@@ -971,8 +971,8 @@ struct common_init_result common_init_from_params(common_params & params) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

std::vector<llama_token> tmp;
- llama_token bos = llama_token_bos(vocab);
- llama_token eos = llama_token_eos(vocab);
+ llama_token bos = llama_vocab_bos(vocab);
+ llama_token eos = llama_vocab_eos(vocab);

// some models (e.g. T5) don't have a BOS token
if (bos != LLAMA_TOKEN_NULL) {
@@ -1563,7 +1563,7 @@ std::vector<llama_token> common_tokenize(
bool add_special,
bool parse_special) {
const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_get_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
return common_tokenize(vocab, text, add_special, parse_special);
}

@@ -1588,7 +1588,7 @@ std::vector<llama_token> common_tokenize(

std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_get_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
return common_token_to_piece(vocab, token, special);
}

@@ -1610,7 +1610,7 @@ std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token

std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_get_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
return common_detokenize(vocab, tokens, special);
}
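Note that the context-based common_* wrappers above keep their public signatures; only the internal vocab lookup switches to llama_model_get_vocab. A hypothetical caller (not taken from this PR) therefore compiles unchanged, as in the sketch below.

```cpp
#include <string>
#include <vector>
#include "common.h"

// Illustrative caller: the wrapper signatures are untouched by the rename.
static void caller_example(llama_context * ctx) {
    std::vector<llama_token> toks = common_tokenize(ctx, "hello world", /*add_special=*/true, /*parse_special=*/false);
    std::string text = common_detokenize(ctx, toks, /*special=*/false);
    (void) text;
}
```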

12 changes: 6 additions & 6 deletions common/sampling.cpp
@@ -114,9 +114,9 @@ struct common_sampler {
const auto * logits = llama_get_logits_ith(ctx, idx);

const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_get_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);

- const int n_vocab = llama_n_vocab(vocab);
+ const int n_vocab = llama_vocab_n_vocab(vocab);

cur.resize(n_vocab);

@@ -145,7 +145,7 @@ std::string common_params_sampling::print() const {
}

struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
- const llama_vocab * vocab = llama_get_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);

llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

@@ -162,7 +162,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

llama_sampler_chain_add(result->chain,
llama_sampler_init_logit_bias(
- llama_n_vocab(vocab),
+ llama_vocab_n_vocab(vocab),
params.logit_bias.size(),
params.logit_bias.data()));

@@ -177,7 +177,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
c_breakers.push_back(str.c_str());
}

- llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+ llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
}
break;
case COMMON_SAMPLER_TYPE_TOP_K:
@@ -211,7 +211,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
- llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+ llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_vocab(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
26 changes: 13 additions & 13 deletions common/speculative.cpp
@@ -79,8 +79,8 @@ bool common_speculative_are_compatible(
const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
const struct llama_model * model_dft = llama_get_model(ctx_dft);

- const struct llama_vocab * vocab_tgt = llama_get_vocab(model_tgt);
- const struct llama_vocab * vocab_dft = llama_get_vocab(model_dft);
+ const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
+ const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);

const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
@@ -94,32 +94,32 @@ bool common_speculative_are_compatible(
return false;
}

- if (llama_add_bos_token(vocab_tgt) != llama_add_bos_token(vocab_dft) ||
- llama_add_eos_token(vocab_tgt) != llama_add_eos_token(vocab_dft) ||
- llama_token_bos(vocab_tgt) != llama_token_bos(vocab_dft) ||
- llama_token_eos(vocab_tgt) != llama_token_eos(vocab_dft)) {
+ if (llama_vocab_add_bos(vocab_tgt) != llama_vocab_add_bos(vocab_dft) ||
+ llama_vocab_add_eos(vocab_tgt) != llama_vocab_add_eos(vocab_dft) ||
+ llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
+ llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(vocab_tgt), llama_add_bos_token(vocab_tgt), llama_token_eos(vocab_tgt), llama_add_eos_token(vocab_tgt));
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(vocab_dft), llama_add_bos_token(vocab_dft), llama_token_eos(vocab_dft), llama_add_eos_token(vocab_dft));
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_add_eos(vocab_tgt));
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_add_eos(vocab_dft));
return false;
}

{
- const int n_vocab_tgt = llama_n_vocab(vocab_tgt);
- const int n_vocab_dft = llama_n_vocab(vocab_dft);
+ const int n_vocab_tgt = llama_vocab_n_vocab(vocab_tgt);
+ const int n_vocab_dft = llama_vocab_n_vocab(vocab_dft);

const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);

if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
"target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
- __func__, n_vocab_tgt, llama_n_vocab(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+ __func__, n_vocab_tgt, llama_vocab_n_vocab(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return false;
}

for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
- const char * token_text_tgt = llama_token_get_text(vocab_tgt, i);
- const char * token_text_dft = llama_token_get_text(vocab_dft, i);
+ const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
+ const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
"token %d content differs - target '%s', draft '%s'\n", __func__, i,
2 changes: 1 addition & 1 deletion examples/batched-bench/batched-bench.cpp
@@ -50,7 +50,7 @@ int main(int argc, char ** argv) {
// ensure enough sequences are available
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());

- llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+ llama_context * ctx = llama_init_from_model(model, ctx_params);

if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
2 changes: 1 addition & 1 deletion examples/batched.swift/Sources/main.swift
@@ -141,7 +141,7 @@ while n_cur <= n_len {
let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])

// is it an end of stream? -> mark the stream as finished
- if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
+ if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {
8 changes: 4 additions & 4 deletions examples/batched/batched.cpp
@@ -48,7 +48,7 @@ int main(int argc, char ** argv) {
return 1;
}

- const llama_vocab * vocab = llama_get_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);

// tokenize the prompt

@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_predict, n_parallel);

- llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+ llama_context * ctx = llama_init_from_model(model, ctx_params);

auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false;
@@ -123,7 +123,7 @@ int main(int argc, char ** argv) {

llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
- decoder_start_token_id = llama_token_bos(vocab);
+ decoder_start_token_id = llama_vocab_bos(vocab);
}

common_batch_clear(batch);
@@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);

// is it an end of generation? -> mark the stream as finished
- if (llama_token_is_eog(vocab, new_token_id) || n_cur == n_predict) {
+ if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;
LOG("\n");
if (n_parallel > 1) {
@@ -911,7 +911,7 @@ int main(int argc, char ** argv) {
load_vocab(params.fn_vocab_model, &config, &vocab);

struct my_llama_model model;
- model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
+ model.hparams.n_vocab = config.vocab_size; //llama_vocab_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
8 changes: 4 additions & 4 deletions examples/cvector-generator/cvector-generator.cpp
@@ -274,8 +274,8 @@ struct tokenized_prompt {

tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_get_vocab(model);
- const bool add_bos = llama_add_bos_token(vocab);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+ const bool add_bos = llama_vocab_add_bos(vocab);
tokens_pos = common_tokenize(ctx, pos, add_bos, true);
tokens_neg = common_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@@ -423,8 +423,8 @@ int main(int argc, char ** argv) {
llama_context * ctx = llama_init.context.get();

// int n_ctx = llama_n_ctx(ctx);
- int n_layers = llama_n_layer(model);
- int n_embd = llama_n_embd(model);
+ int n_layers = llama_model_n_layer(model);
+ int n_embd = llama_model_n_embd(model);

// get model hint param (a.k.a model arch name)
char model_hint[128];
8 changes: 4 additions & 4 deletions examples/embedding/embedding.cpp
@@ -105,9 +105,9 @@ int main(int argc, char ** argv) {
return 1;
}

- const llama_vocab * vocab = llama_get_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);

- const int n_ctx_train = llama_n_ctx_train(model);
+ const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);

const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
@@ -150,7 +150,7 @@ int main(int argc, char ** argv) {
// check if the last token is SEP
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) {
- if (inp.empty() || inp.back() != llama_token_sep(vocab)) {
+ if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
}
@@ -183,7 +183,7 @@ int main(int argc, char ** argv) {
}

// allocate output
- const int n_embd = llama_n_embd(model);
+ const int n_embd = llama_model_n_embd(model);
std::vector<float> embeddings(n_embd_count * n_embd, 0);
float * emb = embeddings.data();

4 changes: 2 additions & 2 deletions examples/eval-callback/eval-callback.cpp
@@ -128,9 +128,9 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {

static bool run(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_get_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);

- const bool add_bos = llama_add_bos_token(vocab);
+ const bool add_bos = llama_vocab_add_bos(vocab);

std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

1 change: 0 additions & 1 deletion examples/export-lora/export-lora.cpp
@@ -8,7 +8,6 @@
#include <map>
#include <vector>
#include <string>
#include <thread>
#include <fstream>

static bool g_verbose = false;
14 changes: 7 additions & 7 deletions examples/gritlm/gritlm.cpp
@@ -11,7 +11,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
std::vector<std::vector<float>> result;

const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_get_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);

llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);

@@ -26,7 +26,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

// GritLM seems to have EOS = ""
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
- // inputs.push_back(llama_token_eos(vocab));
+ // inputs.push_back(llama_vocab_eos(vocab));

// we want to ignore instruction tokens for mean pooling
const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size();
@@ -53,7 +53,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
llama_decode(ctx, batch);

// get embedding dimensions
- uint64_t n_embd = llama_n_embd(model);
+ uint64_t n_embd = llama_model_n_embd(model);

// allocate embedding output
std::vector<float> emb_unorm(n_embd, 0.0f);
@@ -98,9 +98,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
std::string result;

const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_get_vocab(model);
+ const llama_vocab * vocab = llama_model_get_vocab(model);

- llama_token eos_token = llama_token_eos(vocab);
+ llama_token eos_token = llama_vocab_eos(vocab);

llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, false);
@@ -171,7 +171,7 @@ int main(int argc, char * argv[]) {
llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);

// create generation context
- llama_context * ctx = llama_new_context_with_model(model, cparams);
+ llama_context * ctx = llama_init_from_model(model, cparams);

auto sparams = llama_sampler_chain_default_params();

@@ -200,7 +200,7 @@ const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));

- const int n_embd = llama_n_embd(model);
+ const int n_embd = llama_model_n_embd(model);

const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);