From 23c648e98db7a47085c6407699a2f0e8f79137ec Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Sun, 28 Jan 2024 18:03:59 +0100
Subject: [PATCH] ggml : add Vulkan backend (llama/2059)

* Vulkan loader code
* Fix matmul kernel, continue implementation
* Continue implementation
* Vulkan memory management
* Vulkan development
* Matmul call
* Add aligned malloc and free for VMA
* Continue implementation
* First matmul success
* GEMM Kernel optimization
* 1D Blocktiling
* 2D Blocktiling
* Write coalescing
* Continue vulkan implementation and optimization
* First FP16 attempt, disabled for now
* Code abstraction, FP16 implementation, fix kernel, add FP16 to FP32 kernel
* Enable device extensions properly, restore fp16 matmul op
* Fix mulmat_f16
* Output FP32 in fp16 matmul shader
* Fix f16_to_f32 kernel
* dequant_q4_0 kernel
* Add VMA library
* Avoid requesting dedicated memory, VMA can decide that by itself
* Add bounds checking to matmul kernels, improve implementation, fix command buffers not freed properly
* add cmake commands
* Add 2d write operation, profiling code
* Fix 2d write
* Fix queue selection for AMD RADV
* Fix trailing whitespace in vk_mem_alloc.h
* Add WIP warp tile mat mul shaders
* Disable glslc optimization
* Disable glslc optimization for CMake
* Optimize warptile matmul shader, replace blocktile with it
* Add split-k optimization for small matrix multiplication

  Use semaphores for synchronization instead of fences or waitidle

  Rework async write/read for synchronization

* Fix validation errors, improve compatibility with AMD GPUs
* Rework command buffer handling
* Variable matmul kernel using specialization constants
* Fix synchronization on AMD, add barriers for buffer ownership transfer, add debug flag and prints
* Reuse semaphores
* Handle stage flags during command buffer submission properly
* Increase matmul test runs for consistent results
* Fix F32 matmul
* Add vectorized loading and zeropadding for matrix multiplication
* Use pinned memory for f16 preprocessing
* Don't force aligned matmul
* Don't free before queue done
* Replace VMA library with native Vulkan buffer management
* Basic offloading support with mul_f32 and dmmv for q4_0
* Run glslc commands in parallel
* Unroll loops in dmmv shader
* Reduce usage of waitIdle
* Reuse pinned allocation for f16 conversion
* Handle devices with only a single queue
* Fix trailing whitespace in CMakeLists.txt
* Allow parallel execution of kernels, parallelize third and fourth dimension calls
* Add fallback for devices only supporting one DescriptorSet per DescriptorPool
* Move to graph function similar to CUDA implementation
* Use F16 kernel for most things, replace q_f32 with mul_mat_q_f16 function
* Add F32 dmmv shaders
* Batch submissions
* Add .spv to gitignore
* Split off matrix vector multiplication for separate optimization
* Use single command buffer for matrix vector multiplication ops
* Reduce overhead of mul_f32 calls by using a single command buffer
* Add submission batching to mul_f32
* Fix tests
* Add missing barrier
* Add further missing barrier
* Add further ops
* Replace vk::QueueFamilyIgnored with VK_QUEUE_FAMILY_IGNORED to support more Vulkan header versions
* Remove unnecessary cblas link
* Fix descriptor set pre-allocation assert
* Add runtime shader compilation, start transferring shaders to this approach
* Transfer remaining shaders to header and compile on runtime
* Fix fp32 fallback if device doesn't support fp16, add force disable env var GGML_VULKAN_DISABLE_F16
* Add support for q4_1, q5_0, q5_1 and q8_0
* Remove unnecessary scalar layout extension
* Parse graph early to pre-record command buffers
* Add q6_k support
* Add multi-submit for command buffers
* Fix q6_k dequant shader for AMD
* Fix q6_k for GPUs without fp16 support
* Simplify q6_k fp16 fix
* Minor fixes
* Fix wg_denom of m-mulmat shaders
* Add Python-based Vulkan shader generator
* Replace shaderc dependency with precompiled shaders

  Fix python script to generate shaders

* Clean up code
* Fix shader generator script Windows compatibility

  Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>

* Close file before deletion
* Fix vulkan shader fp32 name
* Add q2_k and q3_k support

  Add validation check to compare shader results to cpu results

* Add q4_k support
* Add q5_k support
* Bake SPIR-V bytecode into the library instead of loading shaders from file
* Switch to signal semaphores for flexibility

  Prepare broadcasting support for mul mat

* Finish broadcasting mul mat support for GQA
* Clean up unused functions

  Add repeat op

* Add further ops, not yet enabled. Improve semaphore code
* Reduce number of used semaphores by utilizing timelines more properly
* Remove queue information
* Reuse timeline semaphores, allow parallel operation with binary semaphores to work around nvidia driver limitations
* Add Vulkan to llama-bench
* Remove cblas dependency
* Fix matmul k-split bug
* Fix q4_k dmmv K_QUANTS_PER_ITERATION 1 shader
* Add RMS Norm shader, rework op_f32 shader setup, fix matmul bug
* Fix issues with float16 overflows in shaders
* Fix issues with older Vulkan headers on Ubuntu 22.04
* Allow multi-op partial offloading by parsing the graph to preallocate enough between-op buffers
* Implement further ops, rework op_f32 calls, fix bugs
* Finish full offloading support, add last remaining ops, fix bugs, remove redundant code
* Upload generated file ggml-vulkan-shaders.hpp, remove redundant shaders
* Merge upstream changes, fix conflicts, adapt soft_max op
* Fix Python and shader header format
* Free model gpu buffers on exit
* Use single queue per device to simplify code
* Add matmul shader support for running multiple calculations in parallel
* Switch from semaphore-synchronized multiple command buffers per op to single command buffer for multiple ops, whole graph if possible
* Fix missing event cast
* Replace uint64_t(-1) with UINT64_MAX, rename function for clarity
* Fix warning about empty C function parameters
* Fix compiler warnings
* Properly implement Vulkan backend buffer handling
* Fix oversized host staging buffers
* Simplify barrier synchronization calls
* Fix gcc warnings
* Implement max_size for backend buffer types to limit the size of a single allocation
* Use min of maxMemoryAllocationSize and maxBufferSize for device max allocation size
* refactor multi buf
* Disable unsupported ops to fix tests
* Check for maintenance4 support before using it
* Handle devices with only a single queue
* Fix single queue logic
* propagate buffer usage in multi buffers
* Implement rope_neox op
* Cleanup header and other files
* Simplify gpu_extras by removing events and putting staging memcpys into contexts
* Move queue into context

  Add not-yet-enabled async backend ops

* Simplify context use, optimize matmul shader for warp size 64 (AMD GCN), fix split_k matmul shader optimization
* Add get_max_size to SYCL backend.

Co-authored-by: Georgi Gerganov

* llama : fix trailing whitespace

---------

Co-authored-by: Henri Vasserman
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren
Co-authored-by: Georgi Gerganov
---
 ggml-alloc.c        | 106 ++++++++++++++++++++++++++++++++++----------
 ggml-backend-impl.h |   6 +++
 ggml-backend.c      | 104 ++++++++++++++++++++++++++++++++++++++++++-
 ggml-backend.h      |   3 ++
 ggml-cuda.cu        |   3 ++
 ggml-metal.m        |   1 +
 ggml-opencl.cpp     |   2 +
 ggml.c              |  45 +++++++++++++++++--
 ggml.h              |   1 +
 9 files changed, 242 insertions(+), 29 deletions(-)
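Beyond the diff below, the patch also extends ggml's capability reporting, so an application can tell at run time whether this build of ggml was compiled with the Vulkan backend. A minimal sketch using only the functions this patch adds or updates in ggml.h/ggml.c (the surrounding program and its output strings are illustrative, not part of the patch):

    #include <stdio.h>
    #include "ggml.h"

    // Print whether this ggml build can offload work to a GPU backend.
    // ggml_cpu_has_vulkan() is new in this patch; ggml_cpu_has_gpublas()
    // now also reports 1 for Vulkan-enabled builds.
    int main(void) {
        printf("vulkan:  %d\n", ggml_cpu_has_vulkan());
        printf("gpublas: %d\n", ggml_cpu_has_gpublas());
        return 0;
    }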
diff --git a/ggml-alloc.c b/ggml-alloc.c
index 95a93c9..dfe5ba2 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -778,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
 }
 
 // utils
 
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-    size_t alignment = ggml_backend_buft_get_alignment(buft);
-
-    size_t nbytes = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL && t->view_src == NULL) {
-            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-        }
-    }
-
-    if (nbytes == 0) {
-        // all the tensors in the context are already allocated
-#ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
-        return NULL;
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+static bool alloc_tensor_range(struct ggml_context * ctx,
+        struct ggml_tensor * first, struct ggml_tensor * last,
+        ggml_backend_buffer_type_t buft, size_t size,
+        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
-        // failed to allocate buffer
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
-        return NULL;
+        for (size_t i = 0; i < *n_buffers; i++) {
+            ggml_backend_buffer_free(*buffers[i]);
+        }
+        free(buffers);
+        return false;
     }
 
     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
 
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(tallocr, t);
@@ -826,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
     ggml_tallocr_free(tallocr);
 
+    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+    (*buffers)[(*n_buffers)++] = buffer;
+
+    return true;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+    size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+    ggml_backend_buffer_t * buffers = NULL;
+    size_t n_buffers = 0;
+
+    size_t cur_buf_size = 0;
+    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size_t this_size = 0;
+        if (t->data == NULL && t->view_src == NULL) {
+            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+
+        if (this_size > max_size) {
+            // tensor is too large to fit in a single buffer
+            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+                    __func__, t->name,
+                    ggml_backend_buft_name(buft),
+                    this_size, max_size);
+            for (size_t i = 0; i < n_buffers; i++) {
+                ggml_backend_buffer_free(buffers[i]);
+            }
+            free(buffers);
+            return NULL;
+        }
+
+        if ((cur_buf_size + this_size) > max_size) {
+            // allocate tensors in the current buffer
+            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+                return NULL;
+            }
+            first = t;
+            cur_buf_size = this_size;
+        } else {
+            cur_buf_size += this_size;
+        }
+    }
+
+    // allocate remaining tensors
+    if (cur_buf_size > 0) {
+        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+            return NULL;
+        }
+    }
+
+    if (n_buffers == 0) {
+        // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer;
+    if (n_buffers == 1) {
+        buffer = buffers[0];
+    } else {
+        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+    }
+    free(buffers);
     return buffer;
 }
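The rewritten allocator above splits a context across several backend buffers when the buffer type reports a maximum allocation size: padded tensor sizes are accumulated greedily, and alloc_tensor_range closes the current buffer whenever adding the next tensor would exceed max_size. A self-contained sketch of the same grouping idea over a plain array of sizes (illustrative only, not the ggml API; count_buffers is a made-up name):

    #include <stddef.h>
    #include <stdio.h>

    // Greedily split a list of tensor sizes into groups whose sums stay within
    // max_size, mirroring how the patched ggml_backend_alloc_ctx_tensors_from_buft
    // decides where one backend buffer ends and the next begins.
    static size_t count_buffers(const size_t * sizes, size_t n, size_t max_size) {
        size_t n_buffers = 0;
        size_t cur = 0;
        for (size_t i = 0; i < n; i++) {
            if (sizes[i] > max_size) {
                return 0; // a single tensor larger than max_size cannot be allocated at all
            }
            if (cur + sizes[i] > max_size) {
                n_buffers++;    // close the current group (the real code calls alloc_tensor_range here)
                cur = sizes[i]; // the tensor that did not fit starts the next group
            } else {
                cur += sizes[i];
            }
        }
        if (cur > 0) {
            n_buffers++;        // remaining tensors form the last buffer
        }
        return n_buffers;
    }

    int main(void) {
        const size_t sizes[4] = { 400, 300, 500, 200 };
        printf("%zu buffers needed\n", count_buffers(sizes, 4, 1000)); // prints: 2 buffers needed
        return 0;
    }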
diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
index 1397828..f95df47 100644
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -19,6 +19,7 @@ extern "C" {
         const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
         size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
         size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory
@@ -63,6 +64,11 @@ extern "C" {
     // do not use directly, use ggml_backend_tensor_copy instead
     bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
+    // buffer that contains a collection of buffers
+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
     //
     // Backend
     //
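get_max_size is an optional entry in the buffer-type interface: backends that leave it NULL keep the previous unlimited behaviour (SIZE_MAX), while a device backend can report the largest single allocation it supports so that ggml-alloc splits larger contexts. A hypothetical implementation for a device-backed buffer type might look like the following; the example_* helpers and the ggml_backend_example_* name are invented for illustration (per the commit notes, the real Vulkan backend derives its limit from the minimum of maxMemoryAllocationSize and maxBufferSize):

    #include <stddef.h>
    #include "ggml-backend-impl.h"

    // Stand-ins for whatever limits the real backend queries from its driver,
    // e.g. Vulkan's maxMemoryAllocationSize and maxBufferSize.
    static size_t example_device_max_allocation(void) { return (size_t) 4 << 30; }
    static size_t example_device_max_buffer    (void) { return (size_t) 2 << 30; }

    // Candidate get_max_size callback: report the largest single allocation the
    // device can service; the allocator then groups tensors to stay below it.
    static size_t ggml_backend_example_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
        (void) buft; // a real backend would read per-device limits from buft->context
        size_t max_alloc  = example_device_max_allocation();
        size_t max_buffer = example_device_max_buffer();
        return max_alloc < max_buffer ? max_alloc : max_buffer;
    }

A backend would then place such a callback in its struct ggml_backend_buffer_type_i table instead of the NULL defaults added to the CPU, CUDA, Metal and OpenCL tables further down.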
diff --git a/ggml-backend.c b/ggml-backend.c
index 897a4cb..8b6cf7c 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -27,6 +27,14 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_alignment(buft);
 }
 
+size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
+    // get_max_size is optional, defaults to SIZE_MAX
+    if (buft->iface.get_max_size) {
+        return buft->iface.get_max_size(buft);
+    }
+    return SIZE_MAX;
+}
+
 GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
@@ -57,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
         size_t size) {
     ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
 
-    GGML_ASSERT(iface.get_base != NULL);
-
     (*buffer) = (struct ggml_backend_buffer) {
         /* .interface = */ iface,
         /* .buft = */ buft,
@@ -108,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }
 
+size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
+}
+
 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
@@ -122,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
 
 void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
     buffer->usage = usage;
+
+    // FIXME: add a generic callback to the buffer interface
+    if (ggml_backend_buffer_is_multi_buffer(buffer)) {
+        ggml_backend_multi_buffer_set_usage(buffer, usage);
+    }
 }
 
 ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
@@ -171,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
     return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
 }
 
+size_t ggml_backend_get_max_size(ggml_backend_t backend) {
+    return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
+}
+
 void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@@ -349,6 +368,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
     ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
 #endif
+
+#ifdef GGML_USE_VULKAN
+    extern GGML_CALL int ggml_backend_vk_reg_devices(void);
+    ggml_backend_vk_reg_devices();
+#endif
 }
 
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
@@ -552,6 +576,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
             /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@@ -607,6 +632,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
            /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
            /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
            /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+           /* .get_max_size = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
            /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@@ -763,6 +789,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
 
     GGML_UNUSED(user_data);
 }
 
+// multi-buffer buffer
+
+struct ggml_backend_multi_buffer_context {
+    ggml_backend_buffer_t * buffers;
+    size_t n_buffers;
+};
+
+typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
+
+GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+
+    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_free(ctx->buffers[i]);
+    }
+
+    free(ctx->buffers);
+    free(ctx);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_clear(ctx->buffers[i], value);
+    }
+}
+
+static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
+    static struct ggml_backend_buffer_i multi_backend_buffer_i = {
+        /* .get_name = */ ggml_backend_multi_buffer_get_name,
+        /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
+        /* .get_base = */ NULL,
+        /* .init_tensor = */ NULL,
+        /* .set_tensor = */ NULL,
+        /* .get_tensor = */ NULL,
+        /* .cpy_tensor = */ NULL,
+        /* .clear = */ ggml_backend_multi_buffer_clear,
+        /* .reset = */ NULL,
+    };
+
+    return multi_backend_buffer_i;
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+    ctx->n_buffers = n_buffers;
+    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+    size_t total_size = 0;
+    for (size_t i = 0; i < n_buffers; i++) {
+        ctx->buffers[i] = buffers[i];
+        total_size += ggml_backend_buffer_get_size(buffers[i]);
+    }
+
+    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+}
+
+GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+}
+
+GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
+    }
+}
+
 // scheduler
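ggml_backend_multi_buffer_alloc_buffer above wraps an array of already-allocated buffers into a single ggml_backend_buffer_t whose reported size is the sum of its parts; only get_name, free_buffer, clear and (via the ggml_backend_buffer_set_usage hook) usage propagation are forwarded to the children, since a combined buffer has no single base pointer. A brief usage sketch, assuming the internal header is available to the caller (the helper name and sizes are arbitrary):

    #include "ggml-backend-impl.h"

    // Combine two CPU buffers into one logical buffer. In the patch itself this
    // is done internally by ggml_backend_alloc_ctx_tensors_from_buft when a
    // context does not fit into a single allocation.
    static ggml_backend_buffer_t make_combined_cpu_buffer(void) {
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

        ggml_backend_buffer_t parts[2] = {
            ggml_backend_buft_alloc_buffer(buft, 16u * 1024 * 1024),
            ggml_backend_buft_alloc_buffer(buft, 16u * 1024 * 1024),
        };

        // the multi buffer takes ownership: freeing it frees both parts
        return ggml_backend_multi_buffer_alloc_buffer(parts, 2);
    }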
diff --git a/ggml-backend.h b/ggml-backend.h
index ab4ad77..8b8160f 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -20,6 +20,7 @@ extern "C" {
     GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
     GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
     GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
     GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
@@ -36,6 +37,7 @@ extern "C" {
     GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
     GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
@@ -54,6 +56,7 @@ extern "C" {
     GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
     GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
     GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
 
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 0d599e2..7695b86 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -10440,6 +10440,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
     /* .is_host = */ NULL,
@@ -10715,6 +10716,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
+    /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
     /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
@@ -10794,6 +10796,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
         /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
         /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
         /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+        /* .get_max_size = */ NULL, // defaults to SIZE_MAX
        /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
        /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
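The new public getters above let callers ask how large a single allocation may be before requesting one; buffer types that leave the callback unset report SIZE_MAX. A short sketch of clamping a request against that limit (the alloc_capped helper is hypothetical; the backend handle is assumed to have been created elsewhere, e.g. via ggml_backend_cpu_init()):

    #include <stdio.h>
    #include "ggml-backend.h"

    // Clamp a requested buffer size to the backend's reported allocation ceiling.
    static ggml_backend_buffer_t alloc_capped(ggml_backend_t backend, size_t wanted) {
        size_t max_size = ggml_backend_get_max_size(backend); // SIZE_MAX unless the backend reports a limit
        if (wanted > max_size) {
            fprintf(stderr, "request of %zu bytes exceeds backend max of %zu, clamping\n", wanted, max_size);
            wanted = max_size;
        }
        return ggml_backend_alloc_buffer(backend, wanted);
    }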
diff --git a/ggml-metal.m b/ggml-metal.m
index ab3c84f..a0efda0 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -2400,6 +2400,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
             /* .get_name = */ ggml_backend_metal_buffer_type_get_name,
             /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
+            /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
             /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index bf9ad96..d406635 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -2136,6 +2136,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
     /* .get_name = */ ggml_backend_opencl_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
+    /* .get_max_size = */ NULL, // TODO: return from device info
     /* .get_alloc_size = */ NULL,
     /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
     /* .is_host = */ NULL,
@@ -2192,6 +2193,7 @@ ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
         /* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
         /* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
         /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+        /* .get_max_size = */ NULL, // defaults to SIZE_MAX
        /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
        /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
diff --git a/ggml.c b/ggml.c
index 8236ff5..5b37487 100644
--- a/ggml.c
+++ b/ggml.c
@@ -248,6 +248,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+#include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
 #include "ggml-sycl.h"
 #endif
@@ -2295,6 +2297,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         ggml_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
         ggml_cl_init();
+#elif defined(GGML_USE_VULKAN)
+        ggml_vk_init();
 #elif defined(GGML_USE_SYCL)
         ggml_init_sycl();
 #endif
@@ -8019,7 +8023,7 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-#ifdef GGML_USE_CLBLAST
+#if defined(GGML_USE_CLBLAST)
     if (src1->backend == GGML_BACKEND_GPU) {
         // TODO: OpenCL kernel support full broadcast
         GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
@@ -14703,6 +14707,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         }
         GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
         GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
+#elif defined(GGML_USE_VULKAN)
+    const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+    if (skip_cpu) {
+        ggml_vk_check_results_1(params, tensor);
+    }
+#endif
+    if (skip_cpu) {
+        return;
+    }
+    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
 
 #ifdef GGML_USE_SYCL
@@ -17105,6 +17121,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
+#ifdef GGML_USE_VULKAN
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
+    }
+    ggml_vk_preallocate_buffers();
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
+    }
+#endif
+
     const int n_threads = cplan->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
@@ -17156,6 +17183,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
+#ifdef GGML_USE_VULKAN
+    ggml_vk_graph_cleanup();
+#endif
+
     // performance stats (graph)
     {
         int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@@ -20290,7 +20321,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;
@@ -20313,6 +20344,14 @@ int ggml_cpu_has_clblast(void) {
 #endif
 }
 
+int ggml_cpu_has_vulkan(void) {
+#if defined(GGML_USE_VULKAN)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_sycl(void) {
 #if defined(GGML_USE_SYCL)
     return 1;
@@ -20322,7 +20361,7 @@ int ggml_cpu_has_sycl(void) {
 }
 
 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_sycl();
+    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_sycl();
 }
 
 int ggml_cpu_has_sse3(void) {
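When ggml is built with GGML_VULKAN_CHECK_RESULTS, every op the Vulkan path handles is also checked against the CPU result via ggml_vk_check_results_1 (its implementation lives in the Vulkan backend sources, not in this diff). The essence of such a check is an element-wise comparison with a tolerance, along these lines (a standalone sketch, not the backend's actual code; results_match is a made-up helper):

    #include <math.h>
    #include <stdbool.h>
    #include <stddef.h>

    // Compare a GPU-computed tensor against the CPU reference within a tolerance,
    // the kind of validation GGML_VULKAN_CHECK_RESULTS enables per op.
    static bool results_match(const float * gpu, const float * cpu, size_t n, float eps) {
        for (size_t i = 0; i < n; i++) {
            float diff = fabsf(gpu[i] - cpu[i]);
            if (diff > eps && diff > eps * fabsf(cpu[i])) { // absolute and relative check
                return false;
            }
        }
        return true;
    }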
diff --git a/ggml.h b/ggml.h
index 3d8d6f2..d697fd2 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2263,6 +2263,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_blas       (void);
     GGML_API int ggml_cpu_has_cublas     (void);
     GGML_API int ggml_cpu_has_clblast    (void);
+    GGML_API int ggml_cpu_has_vulkan     (void);
     GGML_API int ggml_cpu_has_gpublas    (void);
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3      (void);