Commit 9f13623

Author: ochafik
Merge remote-tracking branch 'origin/master' into grammar-fast
2 parents: 05efa34 + 928e0b7

52 files changed (+3427, -8403 lines)

.github/workflows/bench.yml (+1, -1)

@@ -32,7 +32,7 @@ on:
     - cron: '04 2 * * *'
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
   cancel-in-progress: true
 
 jobs:

.github/workflows/server.yml (+8, -4)

@@ -23,7 +23,7 @@ on:
     - cron: '2 4 * * *'
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 
 jobs:
@@ -58,6 +58,7 @@ jobs:
             git \
             cmake \
             python3-pip \
+            python3-venv \
             curl \
             wget \
             language-pack-en \
@@ -100,10 +101,13 @@ jobs:
               -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
           cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
 
-      - name: Tests dependencies
-        id: test_dependencies
+      - name: Setup python env
+        id: pipenv
         run: |
-          pip install -r examples/server/tests/requirements.txt
+          cd examples/server/tests
+          python3 -m venv venv
+          . venv/bin/activate
+          pip install -r requirements.txt
 
       - name: Tests
         id: server_integration_tests
.gitignore (+4)

@@ -34,6 +34,7 @@ lcov-report/
 gcovr-report/
 
 build*
+!build.zig
 cmake-build-*
 out/
 tmp/
@@ -100,6 +101,9 @@ qnt-*.txt
 perf-*.txt
 
 examples/jeopardy/results.txt
+examples/server/*.html.hpp
+examples/server/*.js.hpp
+examples/server/*.mjs.hpp
 
 poetry.lock
 poetry.toml

CMakeLists.txt (+1, -11)

@@ -43,17 +43,7 @@ else()
     set(LLAMA_METAL_DEFAULT OFF)
 endif()
 
-# TODO: fix this for Android CI
-#       https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191
-#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
-#    set(LLAMA_LLAMAFILE_DEFAULT OFF)
-#else()
-#    set(LLAMA_LLAMAFILE_DEFAULT ON)
-#endif()
-
-# TODO: temporary disable until MoE is fixed
-#       https://github.com/ggerganov/llama.cpp/pull/6716
-set(LLAMA_LLAMAFILE_DEFAULT OFF)
+set(LLAMA_LLAMAFILE_DEFAULT ON)
 
 # general
 option(BUILD_SHARED_LIBS "build shared libraries" OFF)

Makefile (+11, -6)

@@ -384,10 +384,6 @@ ifdef LLAMA_OPENBLAS
     MK_LDFLAGS += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS
 
-# TODO: temporary disable until MoE is fixed
-#       https://github.com/ggerganov/llama.cpp/pull/6716
-LLAMA_NO_LLAMAFILE := 1
-
 ifndef LLAMA_NO_LLAMAFILE
     MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
     OBJS += sgemm.o
@@ -772,7 +768,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -800,10 +796,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% Makefile
+	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+		echo "unsigned char $${NAME}[] = {" && \
+		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+		echo "};" && \
+		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+	) > $@
+
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
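The new pattern rule above hex-dumps each web asset under examples/server/public/ into a C header, deriving the symbol name from the filename with '-' and '.' replaced by '_'. As a rough sketch of the shape of a generated header and how such an array can be consumed (the byte values below are illustrative, not the real contents of index.html):

```cpp
#include <cstdio>

// Illustrative only: mimics the layout emitted by the Makefile rule above for
// examples/server/public/index.html. The symbol name index_html comes from the
// '-'/'.' to '_' substitution; these bytes are made up for the example.
unsigned char index_html[] = {
    0x3c, 0x68, 0x74, 0x6d, 0x6c, 0x3e, // "<html>"
};
unsigned int index_html_len = 6;

int main() {
    // The server target lists these generated .hpp files as dependencies, so
    // the embedded bytes can be served without filesystem access at runtime.
    std::fwrite(index_html, 1, index_html_len, stdout);
    return 0;
}
```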

README-sycl.md (+12, -8)

@@ -229,12 +229,12 @@ source /opt/intel/oneapi/setvars.sh
 # Build LLAMA with MKL BLAS acceleration for intel GPU
 mkdir -p build && cd build
 
-# Option 1: Use FP16 for better performance in long-prompt inference
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-# Option 2: Use FP32 by default
+# Option 1: Use FP32 (recommended for better performance in most cases)
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
+# Option 2: Use FP16
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
 #build all binary
 cmake --build . --config Release -j -v
 ```
@@ -250,12 +250,12 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 # Build LLAMA with Nvidia BLAS acceleration through SYCL
 mkdir -p build && cd build
 
-# Option 1: Use FP16 for better performance in long-prompt inference
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-# Option 2: Use FP32 by default
+# Option 1: Use FP32 (recommended for better performance in most cases)
 cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
+# Option 2: Use FP16
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
 #build all binary
 cmake --build . --config Release -j -v
 
@@ -416,6 +416,10 @@ mkdir -p build
 cd build
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
+
+# Option 2: Or FP16
 cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
 
 make -j

README.md (+8, -2)

@@ -93,6 +93,7 @@ Typically finetunes of the base models below are supported as well.
 
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
+- [x] LLaMA 3 🦙🦙🦙
 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
 - [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
@@ -119,8 +120,9 @@ Typically finetunes of the base models below are supported as well.
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
 - [x] [Xverse](https://huggingface.co/models?search=xverse)
-- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
+- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
@@ -135,6 +137,8 @@ Typically finetunes of the base models below are supported as well.
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
+- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
+- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 
 **HTTP server**
 
@@ -1117,7 +1121,9 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
-- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+
+![matmul](media/matmul.png)
 
 ### Docs
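The rewritten contributing note is easy to sanity-check with a dimension count. Assuming row-major tensors as stated above, take $A \in \mathbb{R}^{m \times k}$ and $B \in \mathbb{R}^{n \times k}$, so both operands agree in dimension 0 (the shared length-$k$ rows). Then

$$C = B A^T \in \mathbb{R}^{n \times m}, \qquad C^T = (B A^T)^T = A B^T \in \mathbb{R}^{m \times n},$$

which is exactly the identity the new wording states: one row of the result per row of $B$.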

build.zig (+29)

@@ -140,4 +140,33 @@ pub fn build(b: *std.build.Builder) !void {
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
+
+    const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
+    for (server_assets) |asset| {
+        const input_path = b.fmt("examples/server/public/{s}", .{asset});
+        const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
+
+        // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path })`:
+
+        const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
+        defer b.allocator.free(input);
+
+        var buf = std.ArrayList(u8).init(b.allocator);
+        defer buf.deinit();
+
+        for (input) |byte| {
+            try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
+        }
+
+        var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
+        defer b.allocator.free(name);
+        std.mem.replaceScalar(u8, name, '.', '_');
+
+        try std.fs.cwd().writeFile(output_path, b.fmt(
+            "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
+            .{ name, buf.items, name, input.len },
+        ));
+
+        std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
+    }
 }
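Both generators (the Makefile rule and this Zig block) emit the same array-plus-length header shape. A minimal sketch of how such an embedded asset can be served with the cpp-httplib header the server already vendors; the route and wiring here are illustrative, not the actual server.cpp code:

```cpp
#include "httplib.h"       // vendored as examples/server/httplib.h
#include "index.html.hpp"  // generated: unsigned char index_html[]; unsigned int index_html_len;

int main() {
    httplib::Server svr;

    // Serve the embedded bytes directly; no file I/O at request time.
    svr.Get("/", [](const httplib::Request &, httplib::Response & res) {
        res.set_content(reinterpret_cast<const char *>(index_html),
                        index_html_len, "text/html");
    });

    svr.listen("127.0.0.1", 8080);
    return 0;
}
```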

ci/run.sh (+3)

@@ -160,7 +160,9 @@ function gg_run_test_scripts_debug {
 
     set -e
 
+    # TODO: too slow, run on dedicated node
     (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    #(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
 
     set +e
 }
@@ -184,6 +186,7 @@ function gg_run_test_scripts_release {
     set -e
 
     (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
 
     set +e
 }
