Skip to content

llava : introduce libmtmd #12849

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Apr 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion examples/llava/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# llava (legacy)

add_library(llava OBJECT
llava.cpp
llava.h
Expand All @@ -22,12 +24,41 @@ if (BUILD_SHARED_LIBS)
install(TARGETS llava_shared LIBRARY)
endif()

# mtmd

# Compile the mtmd sources (together with the clip sources they build on) once
# as an OBJECT library; the resulting object files are reused by the static and
# shared variants declared below.
add_library(mtmd OBJECT
mtmd.cpp
mtmd.h
clip.cpp
clip.h
clip-impl.h
)

# Link against ggml, llama and the platform thread library.
target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})

# "." is PUBLIC so that targets linking mtmd can include its headers;
# the remaining include paths are only needed while compiling mtmd itself.
target_include_directories(mtmd PUBLIC .)
target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h

target_compile_features(mtmd PRIVATE cxx_std_17)

# Static archive assembled from the object files compiled above.
add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
if (BUILD_SHARED_LIBS)
# Objects that end up in a shared library must be position independent.
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
# NOTE(review): presumably these definitions switch the API macros to "export" mode - confirm against clip.h / llama.h
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
# Shared library assembled from the same object files; only this variant is installed.
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS mtmd_shared LIBRARY)
endif()

Comment on lines +45 to +53
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not really sure why we need to build both static and shared libs here. This is probably something to revisit and simplify.

# -Wno-cast-qual is a GCC/Clang-style flag, so it is skipped for MSVC.
# It silences the cast-qual warnings attributed to stb_image.h in both libraries.
if (NOT MSVC)
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
endif()

# When a BUILD_INFO target exists, make sure it is built before either library.
if(TARGET BUILD_INFO)
add_dependencies(llava BUILD_INFO)
add_dependencies(mtmd BUILD_INFO)
endif()

set(TARGET llama-llava-cli)
Expand Down Expand Up @@ -55,7 +86,7 @@ set(TARGET llama-gemma3-cli)
add_executable(${TARGET} gemma3-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

set(TARGET llama-llava-clip-quantize-cli)
Expand Down
62 changes: 62 additions & 0 deletions examples/llava/clip-impl.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
#include "ggml.h"
#include "gguf.h"

#include "clip.h"

#include <climits>
#include <cstdarg>
#include <string>
#include <map>
#include <sstream>
#include <vector>
#include <memory>

// Internal header for clip.cpp

Expand Down Expand Up @@ -120,6 +123,23 @@ static projector_type clip_projector_type_from_string(const std::string & str) {
return PROJECTOR_TYPE_UNKNOWN;
}

// RGB uint8 image
// nx / ny are the image dimensions (presumably width and height in pixels - confirm
// against the callers); buf holds the raw pixel bytes.
// The dimensions default to 0 so that a default-constructed image (e.g. one
// declared on the stack) never exposes indeterminate values.
struct clip_image_u8 {
    int nx = 0;
    int ny = 0;

    std::vector<uint8_t> buf;
};

// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
// nx / ny are the image dimensions; buf holds the float pixel data.
// The dimensions default to 0 so that a default-constructed image (e.g. one
// declared on the stack) never exposes indeterminate values.
struct clip_image_f32 {
    int nx = 0;
    int ny = 0;

    std::vector<float> buf;
};

//
// logging
//
Expand Down Expand Up @@ -178,6 +198,28 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, __VA_ARGS__)

//
// cpp wrappers
//

// Deleter functor for std::unique_ptr: releases the image through clip_image_u8_free().
struct clip_image_u8_deleter {
    void operator()(clip_image_u8 * img) {
        clip_image_u8_free(img);
    }
};

// Deleter functor for std::unique_ptr: releases the image through clip_image_f32_free().
struct clip_image_f32_deleter {
    void operator()(clip_image_f32 * img) {
        clip_image_f32_free(img);
    }
};

// Deleter functor for std::unique_ptr: releases the batch through clip_image_f32_batch_free().
struct clip_image_f32_batch_deleter {
    void operator()(clip_image_f32_batch * batch) {
        clip_image_f32_batch_free(batch);
    }
};

typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;

// TODO @ngxson : there is currently a naming clash between struct clip_image_size and function clip_image_size()
Comment on lines +205 to +221
Copy link
Collaborator Author

@ngxson ngxson Apr 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what I was talking about in #12834 (comment)

In a follow-up PR, I'll use this inside clip.cpp


//
// common utils
//
Expand Down Expand Up @@ -214,6 +256,20 @@ static void string_replace_all(std::string & s, const std::string & search, cons
s = std::move(builder);
}

// Split `s` on every occurrence of `delimiter`, which is a whole string rather
// than a single `char`.
//
// Returns the pieces in order. Empty pieces are kept, so a leading, trailing or
// doubled delimiter yields "" entries, and the result always holds at least one
// element (an input without any delimiter comes back as a single token).
// An empty delimiter cannot split anything; `s` is then returned unchanged as
// the only token.
static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
    std::vector<std::string> tokens;

    // find("") matches at every position, so without this guard the loop below
    // would never advance and would append empty tokens forever
    if (delimiter.empty()) {
        tokens.push_back(std::move(s));
        return tokens;
    }

    // walk the string with a moving offset instead of erasing the consumed
    // prefix, which keeps the scan linear in the length of `s`
    size_t start = 0;
    size_t pos   = 0;
    while ((pos = s.find(delimiter, start)) != std::string::npos) {
        tokens.push_back(s.substr(start, pos - start));
        start = pos + delimiter.length();
    }

    // whatever follows the last delimiter (possibly empty) is the final token
    tokens.push_back(s.substr(start));
    return tokens;
}

//
// gguf utils
//
Expand Down Expand Up @@ -271,3 +327,9 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
}
}

//
// API used internally with mtmd
//

// Returns the projector type recorded in `ctx` (its proj_type field).
// `ctx` is dereferenced by the implementation without a null check.
projector_type clip_get_projector_type(const struct clip_ctx * ctx);
33 changes: 16 additions & 17 deletions examples/llava/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,23 +32,6 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callbac

//#define CLIP_DEBUG_FUNCTIONS

// RGB uint8 image
// nx / ny are the image dimensions (presumably width and height in pixels - confirm
// against the callers); buf holds the raw pixel bytes.
// The dimensions default to 0 so that a default-constructed image (e.g. one
// declared on the stack) never exposes indeterminate values.
struct clip_image_u8 {
    int nx = 0;
    int ny = 0;

    std::vector<uint8_t> buf;
};

// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
// nx / ny are the image dimensions; buf holds the float pixel data.
// The dimensions default to 0 so that a default-constructed image (e.g. one
// declared on the stack) never exposes indeterminate values.
struct clip_image_f32 {
    int nx = 0;
    int ny = 0;

    std::vector<float> buf;
};

#ifdef CLIP_DEBUG_FUNCTIONS
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary);
Expand Down Expand Up @@ -1614,6 +1597,12 @@ struct clip_image_f32 * clip_image_f32_init() {
return new clip_image_f32();
}

// Gives access to the pixel storage of `img`.
//
// nx, ny: optional outputs; each one that is non-null receives the matching
//         image dimension (img->nx / img->ny).
// Returns a pointer to the first element of img->buf. `img` itself is
// dereferenced unconditionally.
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
    if (nx != nullptr) {
        *nx = static_cast<uint32_t>(img->nx);
    }
    if (ny != nullptr) {
        *ny = static_cast<uint32_t>(img->ny);
    }
    return img->buf.data();
}

void clip_image_size_free(struct clip_image_size * load_image_size) {
if (load_image_size == nullptr) {
return;
Expand Down Expand Up @@ -2346,6 +2335,8 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
n_patches = x_patch * y_patch;
} else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
n_patches = 256;
}

return n_patches;
Expand Down Expand Up @@ -2893,3 +2884,11 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
clip_image_encode(ctx, n_threads, &clip_img, vec);
return true;
}

//
// API used internally with mtmd
//

// Accessor for mtmd: reports the projector type recorded in `ctx`.
// `ctx` is dereferenced without a null check.
projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
    const projector_type type = ctx->proj_type;
    return type;
}
3 changes: 3 additions & 0 deletions examples/llava/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();

// Returns a pointer to the pixel buffer held by img.
// nx, ny are optional outputs: each one that is non-NULL receives the corresponding image dimension.
CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);

CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
Expand Down
Loading
Loading