feat: Vulkan support #171

Merged
13 commits merged on Feb 24, 2024
4 changes: 4 additions & 0 deletions .github/ISSUE_TEMPLATE/bug-report.yml
@@ -76,8 +76,12 @@ body:
required: false
- label: CUDA support
required: false
- label: Vulkan support
required: false
- label: Grammar
required: false
- label: Function calling
required: false
- type: dropdown
id: pr
attributes:
29 changes: 26 additions & 3 deletions .github/workflows/build.yml
@@ -1,7 +1,10 @@
name: Build
on:
push:

branches:
- master
- beta
pull_request:
workflow_dispatch:

jobs:
@@ -116,6 +119,24 @@ jobs:
cuda: '12.2.0'
method: 'network'

- name: Install Vulkan SDK on Windows
if: startsWith(matrix.config.os, 'windows')
env:
VULKAN_VERSION: 1.3.261.1
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

- name: Install Vulkan SDK on Ubuntu
if: startsWith(matrix.config.name, 'Ubuntu GCC')
run: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt update
sudo apt install vulkan-sdk

- name: Install dependencies on macOS
if: startsWith(matrix.config.os, 'macos')
run: |
@@ -179,10 +200,12 @@ jobs:
if (process.env.ARTIFACT_NAME === "win") {
await buildBinary("x64");
await buildBinary("x64", ["--cuda"]);
await buildBinary("x64", ["--vulkan"]);
// await buildBinary("arm64", [], windowsOnArmNodeVersion); // disabled arm64 for now as compilation doesn't work
} else if (process.env.ARTIFACT_NAME === "linux") {
await buildBinary("x64");
await buildBinary("x64", ["--cuda"]);
await buildBinary("x64", ["--vulkan"]);
await buildBinary("arm64");
await buildBinary("armv7l");
} else if (process.env.ARTIFACT_NAME === "mac") {
@@ -299,7 +322,7 @@ jobs:

release:
name: Release
if: github.ref == 'refs/heads/master' || github.ref == 'refs/heads/beta'
if: github.event_name == 'push' && (github.ref == 'refs/heads/master' || github.ref == 'refs/heads/beta')
runs-on: ubuntu-latest
concurrency: release-${{ github.ref }}
environment:
@@ -367,7 +390,7 @@ jobs:
npm run docs:build
- name: Upload docs to GitHub Pages
if: steps.set-npm-url.outputs.npm-url != '' && github.ref == 'refs/heads/master'
uses: actions/upload-artifact@v4
uses: actions/upload-pages-artifact@v3
with:
name: pages-docs
path: docs-site
1 change: 1 addition & 0 deletions .github/workflows/prLint.yml
@@ -3,6 +3,7 @@ on:
pull_request:
pull_request_target:
types: [opened, reopened, edited, synchronize]

jobs:
lint:
name: Lint
9 changes: 8 additions & 1 deletion .github/workflows/test.yml
@@ -1,5 +1,12 @@
name: Test
on: [push]
on:
push:
branches:
- master
- beta
pull_request:
workflow_dispatch:

jobs:
test:
name: Test
2 changes: 1 addition & 1 deletion README.md
@@ -19,7 +19,7 @@

## Features
* Run a text generation model locally on your machine
* Metal and CUDA support
* Metal, CUDA and Vulkan support
* Pre-built binaries are provided, with a fallback to building from source without `node-gyp` or Python
* Chat with a model using a chat wrapper
* Use the CLI to chat with a model without writing any code
4 changes: 2 additions & 2 deletions docs/guide/chat-prompt-wrapper.md
@@ -7,8 +7,8 @@ and parse its response to know whether it finished answering, or should we tell
For example, to prompt a model with "Where do llamas come from?" we can give the model a text like this to predict the completion of:
```txt
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
If you don't know the answer to a question, please don't share false information.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly.
If you don't know the answer to a question, don't share false information.

### Human
Where do llamas come from?
2 changes: 1 addition & 1 deletion docs/index.md
@@ -20,7 +20,7 @@ hero:

features:
- icon: 🚀
title: Metal and CUDA support
title: Metal, CUDA and Vulkan support
details: Utilize the power of your GPU to run AI models faster
link: /guide/#cuda-and-metal-support
linkText: Learn more
20 changes: 20 additions & 0 deletions llama/CMakeLists.txt
@@ -58,6 +58,26 @@ if (LLAMA_CUBLAS)
endif()
endif()

if (LLAMA_VULKAN OR LLAMA_KOMPUTE)
find_package(Vulkan)
if (Vulkan_FOUND)
if (LLAMA_VULKAN)
message(STATUS "Using Vulkan for GPU info")
elseif (LLAMA_KOMPUTE)
message(STATUS "Using Vulkan for GPU info because Kompute is enabled")
endif()

set(GPU_INFO_HEADERS ${GPU_INFO_HEADERS} gpuInfo/vulkan-gpu-info.h)
set(GPU_INFO_SOURCES ${GPU_INFO_SOURCES} gpuInfo/vulkan-gpu-info.cpp)

add_compile_definitions(GPU_INFO_USE_VULKAN)

set(GPU_INFO_EXTRA_LIBS ${GPU_INFO_EXTRA_LIBS} Vulkan::Vulkan)
else()
message(WARNING "Vulkan not found. Not using it for GPU info")
endif()
endif()

if (LLAMA_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)

37 changes: 34 additions & 3 deletions llama/addon.cpp
@@ -12,6 +12,9 @@
#ifdef GPU_INFO_USE_CUBLAS
# include "gpuInfo/cuda-gpu-info.h"
#endif
#ifdef GPU_INFO_USE_VULKAN
# include "gpuInfo/vulkan-gpu-info.h"
#endif
#ifdef GPU_INFO_USE_METAL
# include "gpuInfo/metal-gpu-info.h"
#endif
@@ -35,6 +38,7 @@ using AddonThreadSafeLogCallbackFunction =
AddonThreadSafeLogCallbackFunction addonThreadSafeLoggerCallback;
bool addonJsLoggerCallbackSet = false;
int addonLoggerLogLevel = 5;
bool backendInitialized = false;

std::string addon_model_token_to_piece(const struct llama_model* model, llama_token token) {
std::vector<char> result(8, 0);
@@ -51,10 +55,15 @@ std::string addon_model_token_to_piece(const struct llama_model* model, llama_to
}

#ifdef GPU_INFO_USE_CUBLAS
void lodCudaError(const char* message) {
void logCudaError(const char* message) {
addonLlamaCppLogCallback(GGML_LOG_LEVEL_ERROR, (std::string("CUDA error: ") + std::string(message)).c_str(), nullptr);
}
#endif
#ifdef GPU_INFO_USE_VULKAN
void logVulkanWarning(const char* message) {
addonLlamaCppLogCallback(GGML_LOG_LEVEL_WARN, (std::string("Vulkan warning: ") + std::string(message)).c_str(), nullptr);
}
#endif

Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info) {
uint64_t total = 0;
@@ -63,14 +72,25 @@ Napi::Value getGpuVramInfo(const Napi::CallbackInfo& info) {
#ifdef GPU_INFO_USE_CUBLAS
size_t cudaDeviceTotal = 0;
size_t cudaDeviceUsed = 0;
bool cudeGetInfoSuccess = gpuInfoGetTotalCudaDevicesInfo(&cudaDeviceTotal, &cudaDeviceUsed, lodCudaError);
bool cudeGetInfoSuccess = gpuInfoGetTotalCudaDevicesInfo(&cudaDeviceTotal, &cudaDeviceUsed, logCudaError);

if (cudeGetInfoSuccess) {
total += cudaDeviceTotal;
used += cudaDeviceUsed;
}
#endif

#ifdef GPU_INFO_USE_VULKAN
uint64_t vulkanDeviceTotal = 0;
uint64_t vulkanDeviceUsed = 0;
const bool vulkanDeviceSupportsMemoryBudgetExtension = gpuInfoGetTotalVulkanDevicesInfo(&vulkanDeviceTotal, &vulkanDeviceUsed, logVulkanWarning);

if (vulkanDeviceSupportsMemoryBudgetExtension) {
total += vulkanDeviceTotal;
used += vulkanDeviceUsed;
}
#endif

#ifdef GPU_INFO_USE_METAL
uint64_t metalDeviceTotal = 0;
uint64_t metalDeviceUsed = 0;
@@ -950,7 +970,7 @@ void addonCallJsLogCallback(
called = false;
}
}

if (!called && data != nullptr) {
if (data->logLevelNumber == 2) {
fputs(data->stringStream->str().c_str(), stderr);
@@ -1046,8 +1066,17 @@ Napi::Value setLoggerLogLevel(const Napi::CallbackInfo& info) {
return info.Env().Undefined();
}

static void addonFreeLlamaBackend(Napi::Env env, int* data) {
if (backendInitialized) {
llama_backend_free();
backendInitialized = false;
}
}

Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
llama_backend_init();
backendInitialized = true;

exports.DefineProperties({
Napi::PropertyDescriptor::Function("systemInfo", systemInfo),
Napi::PropertyDescriptor::Function("setLogger", setLogger),
@@ -1061,6 +1090,8 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {

llama_log_set(addonLlamaCppLogCallback, nullptr);

exports.AddFinalizer(addonFreeLlamaBackend, static_cast<int*>(nullptr));

return exports;
}

10 changes: 5 additions & 5 deletions llama/gpuInfo/cuda-gpu-info.cu
@@ -15,9 +15,9 @@
#endif


typedef void (*gpuInfoErrorLogCallback_t)(const char* message);
typedef void (*gpuInfoCudaErrorLogCallback_t)(const char* message);

bool gpuInfoSetCudaDevice(const int device, gpuInfoErrorLogCallback_t errorLogCallback) {
bool gpuInfoSetCudaDevice(const int device, gpuInfoCudaErrorLogCallback_t errorLogCallback) {
int current_device;
auto getDeviceResult = cudaGetDevice(&current_device);

@@ -40,7 +40,7 @@ bool gpuInfoSetCudaDevice(const int device, gpuInfoErrorLogCa
return true;
}

bool gpuInfoGetCudaDeviceInfo(int device, size_t * total, size_t * used, gpuInfoErrorLogCallback_t errorLogCallback) {
bool gpuInfoGetCudaDeviceInfo(int device, size_t * total, size_t * used, gpuInfoCudaErrorLogCallback_t errorLogCallback) {
gpuInfoSetCudaDevice(device, errorLogCallback);

size_t freeMem;
@@ -58,7 +58,7 @@ bool gpuInfoGetCudaDeviceInfo(int device, size_t * total, size_t * used, gpuInfo
return true;
}

int gpuInfoGetCudaDeviceCount(gpuInfoErrorLogCallback_t errorLogCallback) {
int gpuInfoGetCudaDeviceCount(gpuInfoCudaErrorLogCallback_t errorLogCallback) {
int deviceCount;
auto getDeviceCountResult = cudaGetDeviceCount(&deviceCount);

@@ -70,7 +70,7 @@ int gpuInfoGetCudaDeviceCount(gpuInfoErrorLogCallback_t errorLogCallback) {
return deviceCount;
}

bool gpuInfoGetTotalCudaDevicesInfo(size_t * total, size_t * used, gpuInfoErrorLogCallback_t errorLogCallback) {
bool gpuInfoGetTotalCudaDevicesInfo(size_t * total, size_t * used, gpuInfoCudaErrorLogCallback_t errorLogCallback) {
int deviceCount = gpuInfoGetCudaDeviceCount(errorLogCallback);

if (deviceCount < 0) {
4 changes: 2 additions & 2 deletions llama/gpuInfo/cuda-gpu-info.h
@@ -2,6 +2,6 @@

#include <stddef.h>

typedef void (*gpuInfoErrorLogCallback_t)(const char* message);
typedef void (*gpuInfoCudaErrorLogCallback_t)(const char* message);

bool gpuInfoGetTotalCudaDevicesInfo(size_t * total, size_t * used, gpuInfoErrorLogCallback_t errorLogCallback);
bool gpuInfoGetTotalCudaDevicesInfo(size_t * total, size_t * used, gpuInfoCudaErrorLogCallback_t errorLogCallback);
65 changes: 65 additions & 0 deletions llama/gpuInfo/vulkan-gpu-info.cpp
@@ -0,0 +1,65 @@
#include <stddef.h>

#include <vulkan/vulkan.hpp>

typedef void (*gpuInfoVulkanWarningLogCallback_t)(const char* message);

bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, gpuInfoVulkanWarningLogCallback_t warningLogCallback) {
vk::ApplicationInfo appInfo("node-llama-cpp GPU info", 1, "llama.cpp", 1, VK_API_VERSION_1_2);
vk::InstanceCreateInfo createInfo(vk::InstanceCreateFlags(), &appInfo, {}, {});
vk::Instance instance = vk::createInstance(createInfo);

auto physicalDevices = instance.enumeratePhysicalDevices();

size_t usedMem = 0;
size_t totalMem = 0;

for (size_t i = 0; i < physicalDevices.size(); i++) {
vk::PhysicalDevice physicalDevice = physicalDevices[i];
vk::PhysicalDeviceMemoryProperties memProps = physicalDevice.getMemoryProperties();
vk::PhysicalDeviceProperties deviceProps = physicalDevice.getProperties();

if (deviceProps.deviceType == vk::PhysicalDeviceType::eCpu) {
// ignore CPU devices, as we don't want to count RAM from the CPU as VRAM
continue;
}

std::vector<vk::ExtensionProperties> extensionProperties = physicalDevice.enumerateDeviceExtensionProperties();
bool memoryBudgetExtensionSupported =
std::any_of(
extensionProperties.begin(),
extensionProperties.end(),
[](const vk::ExtensionProperties& ext) { return std::string(ext.extensionName.data()) == VK_EXT_MEMORY_BUDGET_EXTENSION_NAME;}
);

if (memoryBudgetExtensionSupported) {
vk::PhysicalDeviceMemoryBudgetPropertiesEXT memoryBudgetProperties;
vk::PhysicalDeviceMemoryProperties2 memProps2 = {};
memProps2.pNext = &memoryBudgetProperties;

physicalDevice.getMemoryProperties2(&memProps2);

for (uint32_t i = 0; i < memProps.memoryHeapCount; ++i) {
if (memProps.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
totalMem += memProps.memoryHeaps[i].size;
usedMem += memoryBudgetProperties.heapUsage[i];
break;
}
}
} else {
// VK_EXT_memory_budget extension is not supported, so we cannot determine used memory
warningLogCallback(
(
"Vulkan VK_EXT_memory_budget extension not supported for device \"" +
std::string(deviceProps.deviceName.data()) + "\", so VRAM info cannot be determined for it"
)
.c_str()
);
return false;
}
}

*total = totalMem;
*used = usedMem;
return true;
}
7 changes: 7 additions & 0 deletions llama/gpuInfo/vulkan-gpu-info.h
@@ -0,0 +1,7 @@
#pragma once

#include <stddef.h>

typedef void (*gpuInfoVulkanWarningLogCallback_t)(const char* message);

bool gpuInfoGetTotalVulkanDevicesInfo(size_t* total, size_t* used, gpuInfoVulkanWarningLogCallback_t warningLogCallback);
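For context, a minimal standalone sketch of how the helper declared in llama/gpuInfo/vulkan-gpu-info.h can be exercised outside the addon. The `main` wrapper, the `printVulkanWarning` callback, and the printf logging are illustrative additions, not part of this PR; the sketch assumes it is compiled with the Vulkan SDK available and linked together with vulkan-gpu-info.cpp.

```cpp
// Usage sketch (not part of the diff): aggregate VRAM info across Vulkan
// devices using the new helper and print the result.
#include <cstdio>

#include "gpuInfo/vulkan-gpu-info.h"

// Hypothetical callback for this sketch; the addon itself routes warnings
// through addonLlamaCppLogCallback instead.
static void printVulkanWarning(const char* message) {
    std::fprintf(stderr, "Vulkan warning: %s\n", message);
}

int main() {
    size_t total = 0;
    size_t used = 0;

    // Returns false when a device lacks VK_EXT_memory_budget, in which case
    // the reported numbers should not be trusted.
    const bool ok = gpuInfoGetTotalVulkanDevicesInfo(&total, &used, printVulkanWarning);

    if (ok) {
        std::printf("Vulkan VRAM: %zu bytes used of %zu bytes total\n", used, total);
    }
    return ok ? 0 : 1;
}
```

The `false` return path mirrors how getGpuVramInfo in llama/addon.cpp skips adding the Vulkan numbers when the memory-budget extension is unavailable.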
1 change: 1 addition & 0 deletions package.json
@@ -82,6 +82,7 @@
"gguf",
"metal",
"cuda",
"vulkan",
"grammar",
"json-grammar",
"json-schema-grammar",