Commit d6ea1a6

Merge branch 'master' into ci/public-runner
2 parents (4c145b0 + 7a59129), commit d6ea1a6

89 files changed: +1245 / -1548 lines


.github/workflows/dependabot_auto.yml (+1 -1)

@@ -14,7 +14,7 @@ jobs:
     steps:
       - name: Dependabot metadata
         id: metadata
-        uses: dependabot/fetch-metadata@v2.2.0
+        uses: dependabot/fetch-metadata@v2.3.0
         with:
           github-token: "${{ secrets.GITHUB_TOKEN }}"
           skip-commit-verification: true

.github/workflows/notify-models.yaml (+2 -2)

@@ -18,7 +18,7 @@ jobs:
         with:
           model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
       # Check the PR diff using the current branch and the base branch of the PR
-      - uses: GrantBirki/git-diff-action@v2.7.0
+      - uses: GrantBirki/git-diff-action@v2.8.0
        id: git-diff-action
        with:
          json_diff_file_output: diff.json
@@ -99,7 +99,7 @@ jobs:
           docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
           until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready"; docker logs --tail 10 local-ai; sleep 2; done
       # Check the PR diff using the current branch and the base branch of the PR
-      - uses: GrantBirki/git-diff-action@v2.7.0
+      - uses: GrantBirki/git-diff-action@v2.8.0
        id: git-diff-action
        with:
          json_diff_file_output: diff.json

Dockerfile (+1 -1)

@@ -303,7 +303,7 @@ RUN make prepare
 ## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
 ## (both will use CUDA or hipblas for the actual computation)
 RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-    SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
+    SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
     else \
     make build; \
     fi

Makefile (+14 -38)

@@ -6,9 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 
 # llama.cpp versions
-GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
-GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=6152129d05870cb38162c422c6ba80434e021e9f
+CPPLLAMA_VERSION?=d2fe216fb2fb7ca8627618c9ea3a2e7886325780
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -24,7 +22,7 @@ BARKCPP_VERSION?=v1.0.0
 
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=5eb15ef4d022bef4a391de4f5f6556e81fbb5024
+STABLEDIFFUSION_GGML_VERSION?=d46ed5e184b97c2018dc2e8105925bdb8775e02c
 
 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -151,7 +149,6 @@ ifeq ($(BUILD_TYPE),hipblas)
     LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
     export CXX=$(ROCM_HOME)/llvm/bin/clang++
     export CC=$(ROCM_HOME)/llvm/bin/clang
-    # llama-ggml has no hipblas support, so override it here.
     export STABLE_BUILD_TYPE=
     export GGML_HIP=1
     GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -186,8 +183,8 @@ endif
 ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -221,19 +218,6 @@ endif
 
 all: help
 
-## go-llama.cpp
-sources/go-llama.cpp:
-    mkdir -p sources/go-llama.cpp
-    cd sources/go-llama.cpp && \
-    git init && \
-    git remote add origin $(GOLLAMA_REPO) && \
-    git fetch origin && \
-    git checkout $(GOLLAMA_VERSION) && \
-    git submodule update --init --recursive --depth 1 --single-branch
-
-sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
-    $(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
-
 ## bark.cpp
 sources/bark.cpp:
     git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -309,27 +293,24 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
     cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
 
-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
 
 replace:
     $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
     $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
     $(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
-    $(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 
 dropreplace:
     $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
     $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
     $(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
-    $(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 
 prepare-sources: get-sources replace
     $(GOCMD) mod download
 
 ## GENERIC
 rebuild: ## Rebuilds the project
     $(GOCMD) clean -cache
-    $(MAKE) -C sources/go-llama.cpp clean
     $(MAKE) -C sources/whisper.cpp clean
     $(MAKE) -C sources/go-piper clean
     $(MAKE) build
@@ -433,7 +414,7 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
     mkdir test-models
     mkdir test-dir
-    wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
+    wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
     wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
     wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
     wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -448,8 +429,7 @@ test: prepare test-models/testmodel.ggml grpcs
     export GO_TAGS="tts debug"
     $(MAKE) prepare-test
     HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
-    $(MAKE) test-llama
+    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
     $(MAKE) test-llama-gguf
     $(MAKE) test-tts
     $(MAKE) test-stablediffusion
@@ -478,10 +458,6 @@ teardown-e2e:
     rm -rf $(TEST_DIR) || true
     docker stop $$(docker ps -q --filter ancestor=localai-tests)
 
-test-llama: prepare-test
-    TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
-
 test-llama-gguf: prepare-test
     TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
     $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -699,6 +675,13 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.
     CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
     cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
 
+backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
+    cp -rf backend/cpp/llama backend/cpp/llama-avx512
+    $(MAKE) -C backend/cpp/llama-avx512 purge
+    $(info ${GREEN}I llama-cpp build info:avx512${RESET})
+    CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
+    cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
+
 backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
     cp -rf backend/cpp/llama backend/cpp/llama-avx
     $(MAKE) -C backend/cpp/llama-avx purge
@@ -752,13 +735,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
     mkdir -p backend-assets/util/
     cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
 
-backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
-    CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
-    $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
-ifneq ($(UPX),)
-    $(UPX) backend-assets/grpc/llama-ggml
-endif
-
 backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
     CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
     $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
@@ -853,7 +829,7 @@ swagger:
 
 .PHONY: gen-assets
 gen-assets:
-    $(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
+    $(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
 
 ## Documentation
 docs/layouts/_default:

backend/backend.proto (+7)

@@ -163,6 +163,11 @@ message Reply {
   double timing_token_generation = 5;
 }
 
+message GrammarTrigger {
+  string word = 1;
+  bool at_start = 2;
+}
+
 message ModelOptions {
   string Model = 1;
   int32 ContextSize = 2;
@@ -247,6 +252,8 @@ message ModelOptions {
 
   string CacheTypeKey = 63;
  string CacheTypeValue = 64;
+
+  repeated GrammarTrigger GrammarTriggers = 65;
 }
 
 message Result {
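
For reference, a gRPC client would populate the new repeated field through the protoc-generated accessors. The snippet below is an illustrative sketch only: it assumes the standard protobuf C++ codegen for backend.proto (backend::ModelOptions, backend::GrammarTrigger, a backend.pb.h header), and the model name and trigger word are placeholders that do not appear in this commit. The lowercased accessor names match the grammartriggers() getters read back in grpc-server.cpp further down.

// Illustrative sketch: assumes the protoc-generated C++ classes for backend.proto.
#include "backend.pb.h"  // assumed name of the generated header

backend::ModelOptions options_with_grammar_triggers() {
    backend::ModelOptions opts;
    opts.set_model("example-model.gguf");      // placeholder model name

    // Each trigger is a word that switches the lazy grammar on;
    // grpc-server.cpp sets grammar_lazy = true when any trigger is present.
    backend::GrammarTrigger* trigger = opts.add_grammartriggers();
    trigger->set_word("<tool_call>");          // placeholder trigger word
    trigger->set_at_start(true);
    return opts;
}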

backend/cpp/llama/grpc-server.cpp (+32)

@@ -468,6 +468,9 @@ struct llama_server_context
     bool add_bos_token = true;
     bool has_eos_token = true;
 
+    bool grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_trigger_words;
+
     int32_t n_ctx; // total context for all clients / slots
 
     // system prompt
@@ -706,6 +709,8 @@ struct llama_server_context
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
         slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+        slot->sparams.grammar_trigger_words = grammar_trigger_words;
+        slot->sparams.grammar_lazy = grammar_lazy;
 
         if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
             // Might be better to reject the request with a 400 ?
@@ -2374,6 +2379,21 @@ static void params_parse(const backend::ModelOptions* request,
     if ( request->ropefreqscale() != 0.0f ) {
         params.rope_freq_scale = request->ropefreqscale();
     }
+
+    if (request->grammartriggers_size() > 0) {
+        LOG_INFO("configuring grammar triggers", {});
+        llama.grammar_lazy = true;
+        for (int i = 0; i < request->grammartriggers_size(); i++) {
+            common_grammar_trigger trigger;
+            trigger.word = request->grammartriggers(i).word();
+            trigger.at_start = request->grammartriggers(i).at_start();
+            llama.grammar_trigger_words.push_back(trigger);
+            LOG_INFO("grammar trigger", {
+                { "word", trigger.word },
+                { "at_start", trigger.at_start }
+            });
+        }
+    }
 }
 
 
@@ -2522,6 +2542,18 @@ class BackendServiceImpl final : public backend::Backend::Service {
         return grpc::Status::OK;
     }
 
+    grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
+        json data = parse_options(false, request, llama);
+
+        std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
+
+        for (int i=0 ; i< tokens.size(); i++){
+            response->add_tokens(tokens[i]);
+        }
+
+        return grpc::Status::OK;
+    }
+
     grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
         llama_client_slot* active_slot = llama.get_active_slot();
