
Commit f47fd17

Merge branch 'ggerganov:master' into master

2 parents: 4e5c5c4 + 11bff29

File tree

7 files changed: +75 -30 lines

  convert-mpt-hf-to-gguf.py
  examples/finetune/finetune.cpp
  examples/llava/llava.cpp
  ggml-opencl.cpp
  ggml.c
  ggml.h
  llama.cpp

convert-mpt-hf-to-gguf.py  (+2)

@@ -98,6 +98,8 @@ def parse_args() -> argparse.Namespace:
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
 gguf_writer.add_head_count(hparams["n_heads"])
+if kv_n_heads := hparams["attn_config"].get("kv_n_heads"):
+    gguf_writer.add_head_count_kv(kv_n_heads)
 gguf_writer.add_layer_norm_eps(1e-05)
 if hparams["attn_config"]["clip_qkv"] is not None:
     gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])

examples/finetune/finetune.cpp  (+9, -10)

@@ -529,13 +529,14 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
     set_param_lora(lora);

     // measure data size
-    struct ggml_allocr * alloc = NULL;
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    alloc_lora(alloc, lora);
+    size_t size = 0;
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
+    }

     // allocate data
-    lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
-    ggml_allocr_free(alloc);
+    struct ggml_allocr * alloc = NULL;
+    lora->data.resize(size + tensor_alignment);
     alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
     alloc_lora(alloc, lora);
     ggml_allocr_free(alloc);

@@ -1714,11 +1715,9 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);

     // measure required memory for input tensors
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    ggml_allocr_alloc(alloc, tokens_input);
-    ggml_allocr_alloc(alloc, target_probs);
-    size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
-    ggml_allocr_free(alloc);
+    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
+                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
+                            tensor_alignment;
     printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));

     // allocate input tensors
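The finetune change swaps the ggml_allocr measuring pass for a direct walk over the context's tensors. A minimal sketch of the same measurement as a standalone helper, using only the ggml.h API that appears in the diff (the helper name ctx_size_aligned is ours, not part of the commit):

    #include "ggml.h"

    // Same arithmetic as the new init_lora() measurement: every tensor's byte size,
    // rounded up to the allocator alignment, summed over the whole context.
    static size_t ctx_size_aligned(struct ggml_context * ctx, size_t alignment) {
        size_t size = 0;
        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            size += GGML_PAD(ggml_nbytes(t), alignment);
        }
        return size;
    }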

examples/llava/llava.cpp  (+7, -1)

@@ -79,7 +79,13 @@ int main(int argc, char ** argv) {

     llama_backend_init(params.numa);

-    llama_model_params model_params = llama_model_default_params();
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = params.n_gpu_layers;
+    model_params.main_gpu = params.main_gpu;
+    model_params.tensor_split = params.tensor_split;
+    model_params.use_mmap = params.use_mmap;
+    model_params.use_mlock = params.use_mlock;
+
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);

ggml-opencl.cpp  (+17, -15)

@@ -19,7 +19,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#define CL_DMMV_BLOCK_SIZE 32
+#define CL_DMMV_LOCAL_SIZE 32

 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 1

@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
     const int row = get_group_id(0);

     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

     __global const struct block_q2_K * x = xx + ib0;

@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
     const int row = get_group_id(0);

     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

     __global const struct block_q3_K * x = xx + ib0;

@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,

     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

     const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
     const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;

@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,

     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

     const int tid = get_local_id(0)/2; // 0...15
     const int ix = get_local_id(0)%2;

@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int row = get_group_id(0);

     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);

     __global const struct block_q6_K * x = xx + ib0;

@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {

 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
+    const int local_size = get_local_size(0);
     const int row = get_group_id(0);
     const int tid = get_local_id(0);

     const uint qk = QUANT_K;
     const uint qr = QUANT_R;

+    const int col_step = local_size * 2;
     const int y_offset = qr == 1 ? 1 : qk/2;

+    x += get_global_offset(0);
+
     tmp[tid] = 0;

-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
+    for (int col = tid*2; col < ncols; col += col_step) {
         const int ib = (row*ncols + col)/qk; // block index
         const int iqs = (col%qk)/qr; // quant index
         const int iybs = col - col%qk; // y block start index

@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float

     // sum up partial sums and write back result
     barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
+    for (int s=local_size/2; s>0; s>>=1) {
         if (tid < s) {
             tmp[tid] += tmp[tid + s];
         }

@@ -1704,7 +1706,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
-    const bool mul_mat_vec = ne11 == 1;
+    const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;

     const int64_t r2 = ne12 / ne02;
     const int64_t r3 = ne13 / ne03;

@@ -1737,7 +1739,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     GGML_ASSERT(to_fp32_cl != nullptr);

     const size_t global_denom = ggml_cl_global_denom(type);
-    const size_t local = ggml_cl_local_size(type);
+    const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);

     size_t ev_idx = 0;
     std::vector<cl_event> events;

@@ -1770,16 +1772,16 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));

             // compute
-            const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
-            const size_t local = CL_DMMV_BLOCK_SIZE;
+            const size_t global = ne01 * local;
+            const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
             const cl_int ncols = ne00;
             events.emplace_back();
             CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
             CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
             CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
             CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
             CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-            CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
         } else { // general dequantization kernel + CLBlast matrix matrix multiplication
             // convert src0 to fp32 on device
             const size_t global = x_ne / global_denom;
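The offloaded path above hinges on OpenCL's global work offset: when src0 already lives on the GPU, the host passes (i03 * ne02 + i02) * x_bps as the offset to clEnqueueNDRangeKernel, and the kernels pick it up again with get_global_offset(0). Below is a minimal, self-contained sketch of just that mechanism with a toy kernel and toy sizes; nothing in it comes from ggml-opencl.cpp, and error handling is collapsed into one macro:

    #include <stdio.h>
    #include <stdlib.h>
    #ifdef __APPLE__
    #include <OpenCL/opencl.h>
    #else
    #include <CL/cl.h>
    #endif

    #define CHECK(err) do { if ((err) != CL_SUCCESS) { fprintf(stderr, "OpenCL error %d (line %d)\n", (int) (err), __LINE__); exit(1); } } while (0)

    // Toy kernel: each work item records the enqueue-time global offset.
    // get_global_id(0) already includes that offset, so subtracting it gives a 0-based index.
    static const char * k_src =
        "__kernel void probe(__global int * out) {\n"
        "    out[get_global_id(0) - get_global_offset(0)] = (int) get_global_offset(0);\n"
        "}\n";

    int main(void) {
        cl_int err;
        cl_platform_id platform;
        cl_device_id device;
        CHECK(clGetPlatformIDs(1, &platform, NULL));
        CHECK(clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL));

        cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err); CHECK(err);
        cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &err); CHECK(err);

        cl_program prog = clCreateProgramWithSource(ctx, 1, &k_src, NULL, &err); CHECK(err);
        CHECK(clBuildProgram(prog, 1, &device, NULL, NULL, NULL));
        cl_kernel kernel = clCreateKernel(prog, "probe", &err); CHECK(err);

        const size_t global = 8, local = 4, offset = 32;   // offset plays the role of the per-slice shift
        cl_mem out = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, global * sizeof(int), NULL, &err); CHECK(err);
        CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &out));
        CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, &offset, &global, &local, 0, NULL, NULL));

        int host[8];
        CHECK(clEnqueueReadBuffer(queue, out, CL_TRUE, 0, sizeof(host), host, 0, NULL, NULL));
        printf("kernel saw get_global_offset(0) == %d\n", host[0]);   // prints 32

        clReleaseMemObject(out);
        clReleaseKernel(kernel);
        clReleaseProgram(prog);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 0;
    }

Replace the toy offset with the per-slice block offset and the probe kernel with a dmmv kernel and you get the enqueue shown in the diff.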

ggml.c  (+34)

@@ -5494,6 +5494,39 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }

+struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
+struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+    struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
+    obj = obj->next;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        if (obj->type == GGML_OBJECT_TENSOR) {
+            return (struct ggml_tensor *)(mem_buffer + obj->offs);
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
     struct ggml_object * obj = ctx->objects_begin;

@@ -8647,6 +8680,7 @@ void ggml_set_param(

     GGML_ASSERT(tensor->grad == NULL);
     tensor->grad = ggml_dup_tensor(ctx, tensor);
+    ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
 }

 // ggml_compute_forward_dup
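The two new functions let callers walk every tensor allocated in a ggml context without knowing their names, which is exactly what the finetune change uses for size measurement. A small usage sketch with made-up tensor names and sizes (not from the commit):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128);
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
        ggml_set_name(a, "a");
        ggml_set_name(b, "b");

        // Enumerate every tensor in the context, regardless of name.
        size_t total = 0;
        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            printf("%-4s %zu bytes\n", ggml_get_name(t), ggml_nbytes(t));
            total += ggml_nbytes(t);
        }
        printf("total: %zu bytes\n", total);

        ggml_free(ctx);
        return 0;
    }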

ggml.h  (+3)

@@ -704,6 +704,9 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);

+    // Context tensor enumeration and lookup
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);

llama.cpp  (+3, -4)

@@ -2839,8 +2839,8 @@ static void llm_load_tensors(
     auto & layer = model.layers[i];

     layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
-    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
-    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);

     layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);

@@ -5368,7 +5368,7 @@ static struct ggml_cgraph * llm_build_mpt(
     const int64_t n_layer = hparams.n_layer;
     const int64_t n_ctx = cparams.n_ctx;
     const int64_t n_head = hparams.n_head;
-    const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
+    const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa = hparams.n_embd_gqa();

@@ -5721,7 +5721,6 @@ static struct ggml_cgraph * llama_build_graph(
 //
 //   - lctx:      llama context
 //   - batch:     batch to evaluate
-//   - n_threads: number of threads to use
 //
 //   - return 0 on success
 //   - return positive int on warning
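The wqkv shape change is what lets MPT models with grouped-query attention load: instead of assuming Q, K and V are each n_embd wide, the K/V part is sized from n_embd_gqa. A worked example with hypothetical MPT-style hyperparameters (none of these numbers come from the commit):

    #include <stdio.h>

    int main(void) {
        const int n_embd    = 4096;
        const int n_head    = 32;
        const int n_head_kv = 8;                          // read from the new head_count_kv GGUF key

        const int n_embd_head = n_embd / n_head;          // 128 per head
        const int n_embd_gqa  = n_embd_head * n_head_kv;  // 1024 for K, and again for V

        // Packed QKV projection: n_embd columns for Q plus 2*n_embd_gqa for K and V,
        // i.e. 6144 here instead of the old 3*n_embd = 12288.
        printf("wqkv shape: {%d, %d}\n", n_embd, n_embd + 2*n_embd_gqa);
        return 0;
    }

When the conversion script finds no kv_n_heads (no MQA/GQA), n_head_kv stays equal to n_head, n_embd_gqa equals n_embd, and the expression reduces to the old 3*n_embd.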
