From ef9afe1540bf8f877443de488f95a6a0f44e41d6 Mon Sep 17 00:00:00 2001
From: Slaren <2141330+slaren@users.noreply.github.com>
Date: Wed, 29 Mar 2023 02:03:43 +0200
Subject: [PATCH 1/6] Add mmap support for model files

---
 ggml.c    |  9 ++++--
 ggml.h    |  1 +
 llama.cpp | 86 +++++++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 74 insertions(+), 22 deletions(-)

diff --git a/ggml.c b/ggml.c
index efe9316bb01ad..ad5fdabe98ab0 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2419,8 +2419,9 @@ struct ggml_context {
     void * mem_buffer;
     bool   mem_buffer_owned;
     bool   mem_buffer_mlocked;
+    bool   no_alloc;
 
-    int n_objects;
+    int    n_objects;
 
     struct ggml_object * objects_begin;
     struct ggml_object * objects_end;
@@ -2702,6 +2703,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
         /*.mem_buffer_mlocked =*/ false,
+        /*.no_alloc           =*/ params.no_alloc,
         /*.n_objects          =*/ 0,
         /*.objects_begin      =*/ NULL,
         /*.objects_end        =*/ NULL,
@@ -2817,7 +2819,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     size_t size_needed = 0;
 
-    if (data == NULL) {
+    if (data == NULL && !ctx->no_alloc) {
         size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
         for (int i = 1; i < n_dims; i++) {
             size_needed *= ne[i];
@@ -2901,7 +2903,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
-        /*.data         =*/ data == NULL ? (void *)(result + 1) : data,
+        /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.pad          =*/ { 0 },
     };
 
@@ -10164,6 +10166,7 @@ enum ggml_opt_result ggml_opt(
         struct ggml_init_params params_ctx = {
             .mem_size   = 16*1024*1024,
             .mem_buffer = NULL,
+            .no_alloc   = false,
         };
 
         ctx = ggml_init(params_ctx);

diff --git a/ggml.h b/ggml.h
index 335230f9f0bb2..058dfe2306516 100644
--- a/ggml.h
+++ b/ggml.h
@@ -316,6 +316,7 @@ struct ggml_init_params {
     // memory pool
     size_t mem_size;   // bytes
     void * mem_buffer; // if NULL, memory will be allocated internally
+    bool   no_alloc;   // don't allocate memory for the tensor data
 };
 
 void ggml_time_init(void); // call this once at the beginning of the program

diff --git a/llama.cpp b/llama.cpp
index 2d0279258740a..8791d16b8dbf4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12,6 +12,13 @@
 #include <cassert>
 #include <cstring>
 
+// headers for POSIX mmap
+#if defined (__unix__) || defined (__APPLE__)
+# include <sys/mman.h>
+# include <fcntl.h>
+# include <unistd.h>
+#endif
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -246,6 +253,7 @@ static bool kv_cache_init(
     struct ggml_init_params params;
     params.mem_size   = cache.buf.size();
     params.mem_buffer = cache.buf.data();
+    params.no_alloc   = false;
 
     cache.ctx = ggml_init(params);
@@ -288,6 +296,26 @@ struct llama_context_params llama_context_default_params() {
 // model loading
 //
 
+void * mmap_file(const char* fname) {
+#if defined(MAP_FAILED)
+    // POSIX mmap
+    int fd = open(fname, O_RDONLY);
+    size_t len = lseek(fd, 0, SEEK_END);
+    void * mm_addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+    if (mm_addr == MAP_FAILED) {
+        perror("mmap failed");
+        mm_addr = NULL;
+    }
+    close(fd);
+    return mm_addr;
+#else
+    // TODO: windows support
+    (void)(fname); // suppress warnings
+    return NULL;
+#endif
+}
+
+
 static bool llama_model_load(
     const std::string & fname,
     llama_context & lctx,
@@ -303,6 +331,7 @@ static bool llama_model_load(
 
     lctx.t_start_us = t_start_us;
 
+    // TODO: this could probably be smaller when using mmap
     std::vector<char> f_buf(1024*1024);
 
     auto & model = lctx.model;
@@ -449,39 +478,49 @@ static bool llama_model_load(
         }
     }
 
+    bool use_mmap = (n_parts == 1);
+
+    // try to memory map the model file
+    void* mm_addr = NULL;
+    if (use_mmap) {
+        mm_addr = mmap_file(fname.c_str());
+        if (mm_addr == NULL) {
+            use_mmap = false;
+        }
+    }
+
+
     auto & ctx = model.ctx;
 
     size_t ctx_size = 0;
-
     {
         const auto & hparams = model.hparams;
 
         const int n_embd  = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
         const int n_vocab = hparams.n_vocab;
 
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
+        if (!use_mmap) {
+            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
 
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
+            ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
 
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
+            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
+            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
 
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
+            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
 
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
-
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
+        }
 
         ctx_size += (5 + 10*n_layer)*256; // object overhead
@@ -514,6 +553,7 @@ static bool llama_model_load(
         struct ggml_init_params params = {
             /*.mem_size   =*/ lctx.model.buf.size(),
             /*.mem_buffer =*/ lctx.model.buf.data(),
+            /*.no_alloc   =*/ use_mmap,
         };
 
         model.ctx = ggml_init(params);
@@ -595,7 +635,7 @@ static bool llama_model_load(
             fname_part += "." + std::to_string(i);
         }
 
-        fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+        fprintf(stderr, "%s: loading model part %d/%d from '%s'%s\n", __func__, i+1, n_parts, fname_part.c_str(), use_mmap ? " (memory mapped)" : "");
 
         fin = std::ifstream(fname_part, std::ios::binary);
         fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
@@ -736,7 +776,14 @@ static bool llama_model_load(
                 }
 
                 if (part_id == 0) {
-                    fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                    if (mm_addr) {
+                        off_t offset = fin.tellg();
+                        tensor->data = (char *) mm_addr + offset;
+                        fin.seekg(ggml_nbytes(tensor), std::ios::cur);
+                    }
+                    else {
+                        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                    }
                 } else {
                     fin.seekg(ggml_nbytes(tensor), std::ios::cur);
                 }
@@ -849,6 +896,7 @@ static bool llama_eval_internal(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size(),
         /*.mem_buffer =*/ buf_compute.data(),
+        /*.no_alloc   =*/ false,
     };
 
     struct ggml_context * ctx0 = ggml_init(params);
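Note on the core pattern in patch 1: the model file is mapped read-only once, and each tensor's data pointer is later aimed into that mapping, so "loading" becomes page-cache faults instead of read() copies into heap buffers. A self-contained sketch of the POSIX half follows; the function name and the open() failure check are illustrative additions, not code from the patch:

    // Sketch of the read-only POSIX mapping pattern behind mmap_file().
    // map_readonly() and the open() error check are illustrative.
    #include <cstdio>
    #include <sys/types.h>
    #include <sys/mman.h>
    #include <fcntl.h>
    #include <unistd.h>

    static void * map_readonly(const char * fname, size_t & length) {
        int fd = open(fname, O_RDONLY);
        if (fd == -1) {
            perror("open failed");
            return NULL;
        }
        off_t len = lseek(fd, 0, SEEK_END);  // file size, no separate fstat needed
        void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
        close(fd);                           // the mapping keeps its own reference
        if (addr == MAP_FAILED) {
            perror("mmap failed");
            return NULL;
        }
        length = (size_t) len;
        return addr;
    }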
From 7961493a401b28937358e6558d135596e73712ec Mon Sep 17 00:00:00 2001
From: Slaren <2141330+slaren@users.noreply.github.com>
Date: Wed, 29 Mar 2023 05:38:57 +0200
Subject: [PATCH 2/6] Fix ggml_init_params in quantize

---
 examples/quantize/quantize.cpp | 2 +-
 llama.cpp                      | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index b444328acc6aa..680757c6bf356 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -19,7 +19,7 @@ int main(int argc, char ** argv) {
 
     // needed to initialize f16 tables
     {
-        struct ggml_init_params params = { 0, NULL };
+        struct ggml_init_params params = { 0, NULL, false };
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }

diff --git a/llama.cpp b/llama.cpp
index 8791d16b8dbf4..d9a5954c6d2d6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -315,7 +315,6 @@ void * mmap_file(const char* fname) {
 #endif
 }
 
-
 static bool llama_model_load(
     const std::string & fname,
     llama_context & lctx,
@@ -489,8 +488,6 @@ static bool llama_model_load(
         }
     }
-
-
 
     auto & ctx = model.ctx;
 
     size_t ctx_size = 0;

From e6f1c199378e4d693688d49386f6b4e5ca4eb9ca Mon Sep 17 00:00:00 2001
From: Slaren <2141330+slaren@users.noreply.github.com>
Date: Wed, 29 Mar 2023 06:18:18 +0200
Subject: [PATCH 3/6] Make mmap_file static

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index d9a5954c6d2d6..886ba8361fbca 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -296,7 +296,7 @@ struct llama_context_params llama_context_default_params() {
 // model loading
 //
 
-void * mmap_file(const char* fname) {
+static void * mmap_file(const char* fname) {
 #if defined(MAP_FAILED)
     // POSIX mmap
     int fd = open(fname, O_RDONLY);
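How a loader is meant to consume the no_alloc flag from patch 1: with no_alloc set, ggml_new_tensor_* writes only the tensor header into the context pool and leaves the data pointer NULL for the caller to fill. A hedged sketch follows; the helper and its file_offset parameter are illustrative (the patch itself derives the offset from fin.tellg()):

    // Sketch of consuming a no_alloc context: the tensor header lands in the
    // pool, data stays NULL, and the loader points it into the mapped file.
    // map_tensor_2d and file_offset are illustrative names.
    #include "ggml.h"

    static struct ggml_tensor * map_tensor_2d(struct ggml_context * ctx, // created with no_alloc = true
                                              enum ggml_type type, int ne0, int ne1,
                                              void * mm_addr, size_t file_offset) {
        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, type, ne0, ne1); // header only, t->data == NULL
        t->data = (char *) mm_addr + file_offset;                         // point into the mapping
        return t;
    }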
From baa529e9c097dcb3675951eaa0f01cd58e3fa101 Mon Sep 17 00:00:00 2001
From: Slaren <2141330+slaren@users.noreply.github.com>
Date: Wed, 29 Mar 2023 08:31:26 +0200
Subject: [PATCH 4/6] Unmap the file in llama_free

---
 llama.cpp | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 886ba8361fbca..00ab5dba23761 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -149,6 +149,10 @@ struct llama_model {
     // the model memory buffer
     std::vector<uint8_t> buf;
 
+    // model memory mapped file
+    void * mm_addr;
+    size_t mm_length;
+
     // tensors
     int n_loaded;
     std::unordered_map<std::string, struct ggml_tensor *> tensors;
@@ -300,22 +304,32 @@ struct llama_context_params llama_context_default_params() {
 // model loading
 //
 
-static void * mmap_file(const char* fname) {
+static void mmap_file(const char* fname, void * &mm_addr, size_t &mm_length) {
 #if defined(MAP_FAILED)
-    // POSIX mmap
+    // POSIX
     int fd = open(fname, O_RDONLY);
-    size_t len = lseek(fd, 0, SEEK_END);
-    void * mm_addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+    mm_length = lseek(fd, 0, SEEK_END);
+    mm_addr = mmap(NULL, mm_length, PROT_READ, MAP_SHARED, fd, 0);
+    close(fd);
     if (mm_addr == MAP_FAILED) {
         perror("mmap failed");
         mm_addr = NULL;
+        mm_length = 0;
     }
-    close(fd);
-    return mm_addr;
 #else
     // TODO: windows support
     (void)(fname); // suppress warnings
-    return NULL;
+#endif
+}
+
+static void munmap_file(void * addr, size_t length) {
+#if defined(MAP_FAILED)
+    // POSIX
+    munmap(addr, length);
+#else
+    // TODO: windows support
+    (void)(addr); // suppress warnings
+    (void)(length);
 #endif
 }
@@ -480,12 +494,15 @@ static bool llama_model_load(
     bool use_mmap = (n_parts == 1);
 
     // try to memory map the model file
-    void* mm_addr = NULL;
+    void * mm_addr = NULL;
     if (use_mmap) {
-        mm_addr = mmap_file(fname.c_str());
-        if (mm_addr == NULL) {
+        mmap_file(fname.c_str(), model.mm_addr, model.mm_length);
+        if (model.mm_addr == NULL) {
             use_mmap = false;
         }
+        else {
+            mm_addr = model.mm_addr;
+        }
     }
 
     auto & ctx = model.ctx;
@@ -1750,6 +1767,10 @@ void llama_free(struct llama_context * ctx) {
         ggml_free(ctx->model.ctx);
     }
 
+    if (ctx->model.mm_addr) {
+        munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
+    }
+
     delete ctx;
 }

From 3f5f4286dd414eb2b540d4a6c7c02edca13a4c79 Mon Sep 17 00:00:00 2001
From: Slaren <2141330+slaren@users.noreply.github.com>
Date: Wed, 29 Mar 2023 08:53:14 +0200
Subject: [PATCH 5/6] Always initialize mm_addr and mm_length in llama_model

---
 llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 00ab5dba23761..60b72b6584f84 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -150,8 +150,8 @@ struct llama_model {
     std::vector<uint8_t> buf;
 
     // model memory mapped file
-    void * mm_addr;
-    size_t mm_length;
+    void * mm_addr = NULL;
+    size_t mm_length = 0;
 
     // tensors
     int n_loaded;
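Patches 4 and 5 manage the mapping's lifetime by hand: the pointer and length live in llama_model, start zeroed, and are released explicitly in llama_free. For comparison only (not code from the series), a RAII holder would give the same unmap-on-free behavior without the explicit cleanup call:

    // Illustrative alternative, not part of the series: tie munmap() to
    // object lifetime so the owner needs no explicit cleanup call.
    #include <cstddef>
    #include <sys/mman.h>

    struct mapped_file {
        void * addr   = nullptr;
        size_t length = 0;
        ~mapped_file() {
            if (addr) {
                munmap(addr, length); // runs automatically when the owner is destroyed
            }
        }
    };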
From f789d8d50aab00e4f938d58f3c41bacbcd91c087 Mon Sep 17 00:00:00 2001
From: Slaren <2141330+slaren@users.noreply.github.com>
Date: Wed, 29 Mar 2023 22:22:36 +0200
Subject: [PATCH 6/6] Initial windows support (untested)

---
 llama.cpp | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 60b72b6584f84..8ffb530e69a87 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12,11 +12,15 @@
 #include <cassert>
 #include <cstring>
 
-// headers for POSIX mmap
+// mmap
 #if defined (__unix__) || defined (__APPLE__)
 # include <sys/mman.h>
 # include <fcntl.h>
 # include <unistd.h>
+#elif defined(_WIN32)
+# define WIN32_LEAN_AND_MEAN
+# include <windows.h>
+//#include
 #endif
 
 #define LLAMA_USE_SCRATCH
@@ -312,8 +316,31 @@ static void mmap_file(const char* fname, void * &mm_addr, size_t &mm_length) {
         mm_addr = NULL;
         mm_length = 0;
     }
+#elif defined(_WIN32)
+    mm_addr = NULL;
+
+    HANDLE hFile = CreateFileA(fname, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+    if (hFile == INVALID_HANDLE_VALUE) {
+        return;
+    }
+
+    // not really necessary
+    LARGE_INTEGER fileSize;
+    GetFileSizeEx(hFile, &fileSize);
+    mm_length = (size_t) fileSize.QuadPart;
+
+    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+    CloseHandle(hFile);
+
+    if (hMapping == NULL) {
+        return;
+    }
+
+    mm_addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+    CloseHandle(hMapping);
 #else
-    // TODO: windows support
+    mm_addr = NULL;
+    mm_length = 0;
     (void)(fname); // suppress warnings
 #endif
 }
@@ -322,8 +349,9 @@ static void munmap_file(void * addr, size_t length) {
 #if defined(MAP_FAILED)
     // POSIX
     munmap(addr, length);
+#elif defined(_WIN32)
+    UnmapViewOfFile(addr);
 #else
-    // TODO: windows support
     (void)(addr); // suppress warnings
     (void)(length);
 #endif
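The Win32 path in patch 6 is explicitly untested. For reference, a sketch of the same CreateFileA -> CreateFileMappingA -> MapViewOfFile sequence with its error checks spelled out; identifiers mirror the patch, and the added checks plus the wrapper function are illustrative:

    // Sketch of the Win32 read-only mapping sequence with error handling.
    #define WIN32_LEAN_AND_MEAN
    #include <windows.h>

    static void * win32_map_readonly(const char * fname, size_t & length) {
        HANDLE hFile = CreateFileA(fname, GENERIC_READ, FILE_SHARE_READ, NULL,
                                   OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
        if (hFile == INVALID_HANDLE_VALUE) {
            return NULL;
        }
        LARGE_INTEGER fileSize;
        if (!GetFileSizeEx(hFile, &fileSize)) {
            CloseHandle(hFile);
            return NULL;
        }
        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
        CloseHandle(hFile);        // the mapping object keeps the file open
        if (hMapping == NULL) {
            return NULL;
        }
        void * addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
        CloseHandle(hMapping);     // the view keeps the mapping alive
        if (addr != NULL) {
            length = (size_t) fileSize.QuadPart;
        }
        return addr;
    }

Closing hFile right after CreateFileMappingA and hMapping right after MapViewOfFile is safe because each successor object holds a reference to its predecessor until the view is unmapped, which is why the patch can release both handles inside mmap_file.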