From 1d48457aa61d784719899d18c46ede86635c4bad Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 30 Oct 2024 02:01:23 +0100 Subject: [PATCH] llama : refactor model loader with backend registry (llama/10026) --- ggml/include/ggml-backend.h | 19 +-- ggml/include/ggml-cuda.h | 2 +- ggml/src/ggml-amx.cpp | 33 ++--- ggml/src/ggml-backend-impl.h | 19 +-- ggml/src/ggml-backend.cpp | 235 ++++++++++++++++++----------------- ggml/src/ggml-blas.cpp | 20 +-- ggml/src/ggml-cann.cpp | 50 +------- ggml/src/ggml-cuda.cu | 142 ++++++++++----------- ggml/src/ggml-kompute.cpp | 15 --- ggml/src/ggml-metal.m | 44 ++++--- ggml/src/ggml-rpc.cpp | 20 +-- ggml/src/ggml-sycl.cpp | 54 +++----- ggml/src/ggml-vulkan.cpp | 26 ++-- ggml/src/ggml.c | 4 +- 14 files changed, 273 insertions(+), 410 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 5933b8e..c11eb41 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -114,11 +114,12 @@ extern "C" { // enum ggml_backend_dev_type { + // CPU device using system memory GGML_BACKEND_DEVICE_TYPE_CPU, + // GPU device using dedicated memory GGML_BACKEND_DEVICE_TYPE_GPU, - // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication) - GGML_BACKEND_DEVICE_TYPE_CPU_FULL, - GGML_BACKEND_DEVICE_TYPE_GPU_FULL + // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX) + GGML_BACKEND_DEVICE_TYPE_ACCEL }; // functionality supported by the device @@ -167,10 +168,14 @@ extern "C" { GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index); GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name); + // Common functions that may be obtained using ggml_backend_reg_get_proc_address - // Functions that may be obtained using ggml_backend_reg_get_proc_address - typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *); - typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int); + // Split buffer type for tensor parallelism + typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split); + // Set the number of threads for the backend + typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads); + // Get additional buffer types provided by the device (returns a NULL-terminated array) + typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device); // // Backend registry @@ -192,7 +197,7 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params); // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params) GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params); - // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL) + // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) GGML_API ggml_backend_t ggml_backend_init_best(void); // diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index f44d8f4..305d0b6 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); // split tensor buffer that splits matrices by rows across multiple devices -GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); diff --git a/ggml/src/ggml-amx.cpp b/ggml/src/ggml-amx.cpp index ac6ec23..144dc9d 100644 --- a/ggml/src/ggml-amx.cpp +++ b/ggml/src/ggml-amx.cpp @@ -16,12 +16,6 @@ #if defined(__AMX_INT8__) // AMX buffer interface -static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) { - return "AMX"; - - GGML_UNUSED(buffer); -} - static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { free(buffer->context); } @@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t } static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { - /* .get_name = */ ggml_backend_amx_buffer_get_name, /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, /* .get_base = */ ggml_backend_amx_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required @@ -121,14 +114,14 @@ static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { /* .iface = */ { - /* .get_name = */ ggml_backend_amx_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, - /* .is_host = */ ggml_backend_amx_buffer_type_is_host, + /* .get_name = */ ggml_backend_amx_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, + /* .is_host = */ ggml_backend_amx_buffer_type_is_host, }, - /* .device = */ NULL, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0), /* .context = */ NULL, }; @@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) { delete backend; } -static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_amx_buffer_type(); - - GGML_UNUSED(backend); -} - static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context; @@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, s static struct ggml_backend_i ggml_backend_amx_i = { /* .get_name = */ ggml_backend_amx_name, /* .free = */ ggml_backend_amx_free, - /* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL, @@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_amx_graph_compute, - /* .supports_op = */ NULL, - /* .supports_buft = */ NULL, - /* .offload_op = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, }; @@ -279,7 +262,7 @@ static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * } static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) { - return GGML_BACKEND_DEVICE_TYPE_CPU; + return GGML_BACKEND_DEVICE_TYPE_ACCEL; GGML_UNUSED(dev); } diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index fd3deae..fa8d5b7 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -22,7 +22,7 @@ extern "C" { size_t (*get_max_size) (ggml_backend_buffer_type_t buft); // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes) size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); - // (optional) check if tensor data is in host memory (defaults to false) + // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false) bool (*is_host) (ggml_backend_buffer_type_t buft); }; @@ -37,7 +37,6 @@ extern "C" { // struct ggml_backend_buffer_i { - const char * (*get_name) (ggml_backend_buffer_t buffer); // (optional) free the buffer void (*free_buffer) (ggml_backend_buffer_t buffer); // base address of the buffer @@ -88,19 +87,16 @@ extern "C" { void (*free)(ggml_backend_t backend); - // Will be moved to the device interface - // buffer allocation - ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend); - // (optional) asynchronous tensor data access void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); - // (optional) complete all pending operations + // (optional) complete all pending operations (required if the backend supports async operations) void (*synchronize)(ggml_backend_t backend); - // (optional) compute graph with a plan (not used currently) + // (optional) graph plans (not used currently) + // compute graph with a plan ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology @@ -111,13 +107,6 @@ extern "C" { // compute graph (always async if supported by the backend) enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); - // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface - // new backends should implement the device interface instead - // These functions are being moved to the device interface - bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op); - bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft); - bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op); - // (optional) event synchronization // record an event on this stream void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 7d7b63a..fd57488 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -34,6 +34,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { } ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + if (size == 0) { + // return a dummy buffer for zero-sized allocations + return ggml_backend_buffer_init(buft, {}, NULL, 0); + } + return buft->iface.alloc_buffer(buft, size); } @@ -89,7 +94,7 @@ ggml_backend_buffer_t ggml_backend_buffer_init( } const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name(buffer); + return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer)); } void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { @@ -108,6 +113,11 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { } void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { + // get_base is optional if the buffer is zero-sized + if (buffer->size == 0) { + return NULL; + } + void * base = buffer->iface.get_base(buffer); GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); @@ -122,6 +132,15 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t } } +void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + // clear is optional if the buffer is zero-sized + if (buffer->size == 0) { + return; + } + + buffer->iface.clear(buffer, value); +} + size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); } @@ -134,10 +153,6 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); } -void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - buffer->iface.clear(buffer, value); -} - bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer)); } @@ -198,7 +213,7 @@ void ggml_backend_free(ggml_backend_t backend) { } ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) { - return backend->iface.get_default_buffer_type(backend); + return ggml_backend_dev_buffer_type(backend->device); } ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { @@ -238,43 +253,42 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + if (size == 0) { + return; + } + GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - if (!size) { - return; - } - buf->iface.set_tensor(buf, tensor, data, offset, size); } void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + if (size == 0) { + return; + } + GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - if (!size) { - return; - } - buf->iface.get_tensor(buf, tensor, data, offset, size); } GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; - GGML_ASSERT(buf != NULL && "tensor buffer not set"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - - if (!size) { + if (size == 0) { return; } - GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer"); + GGML_ASSERT(buf != NULL && "tensor buffer not set"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer"); buf->iface.memset_tensor(buf, tensor, value, offset, size); } @@ -316,32 +330,15 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - // helper to ease transition to device interface - if (backend->device) { - return ggml_backend_dev_supports_op(backend->device, op); - } - - return backend->iface.supports_op(backend, op); + return ggml_backend_dev_supports_op(backend->device, op); } bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - // helper to ease transition to device interface - if (backend->device) { - return ggml_backend_dev_supports_buft(backend->device, buft); - } - return backend->iface.supports_buft(backend, buft); + return ggml_backend_dev_supports_buft(backend->device, buft); } bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { - // helper to ease transition to device interface - if (backend->device) { - return ggml_backend_dev_offload_op(backend->device, op); - } - - if (backend->iface.offload_op != NULL) { - return backend->iface.offload_op(backend, op); - } - return false; + return ggml_backend_dev_offload_op(backend->device, op); } ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) { @@ -582,6 +579,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_VULKAN register_backend(ggml_backend_vk_reg()); #endif +#ifdef GGML_USE_CANN + register_backend(ggml_backend_cann_reg()); +#endif #ifdef GGML_USE_BLAS register_backend(ggml_backend_blas_reg()); #endif @@ -591,9 +591,6 @@ struct ggml_backend_registry { #ifdef GGML_USE_AMX register_backend(ggml_backend_amx_reg()); #endif -#ifdef GGML_USE_CANN - register_backend(ggml_backend_cann_reg()); -#endif // TODO: kompute @@ -701,9 +698,9 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const } ggml_backend_t ggml_backend_init_best(void) { - ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL); + ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); if (!dev) { - dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL); + dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); } if (!dev) { return NULL; @@ -711,13 +708,7 @@ ggml_backend_t ggml_backend_init_best(void) { return ggml_backend_dev_init(dev, NULL); } -// backend CPU - -static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) { - return "CPU"; - - GGML_UNUSED(buffer); -} +// CPU backend - buffer static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { uintptr_t data = (uintptr_t)buffer->context; @@ -767,7 +758,6 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t } static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { - /* .get_name = */ ggml_backend_cpu_buffer_get_name, /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required @@ -780,7 +770,6 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { }; static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { - /* .get_name = */ ggml_backend_cpu_buffer_get_name, /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required @@ -792,6 +781,8 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { /* .reset = */ NULL, }; +// CPU backend - buffer type + static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU"; @@ -799,19 +790,14 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty } static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - auto alloc_size = size; - if (alloc_size == 0) { - alloc_size = 1; - } - - void * data = ggml_aligned_malloc(alloc_size); + void * data = ggml_aligned_malloc(size); if (data == NULL) { - GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size); + GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); return NULL; } - return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size); + return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size); } static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -843,6 +829,29 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { return &ggml_backend_cpu_buffer_type; } +static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_Mapped"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name, + /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ NULL, + }; + + return &ggml_backend_cpu_buffer_type; +} + #ifdef GGML_USE_CPU_HBM // buffer type HBM @@ -855,18 +864,11 @@ static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffe GGML_UNUSED(buft); } -static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { - return "CPU_HBM"; - - GGML_UNUSED(buf); -} - static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { hbw_free(buffer->context); } static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - //void * ptr = hbw_malloc(size); void * ptr; int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); if (result != 0) { @@ -876,7 +878,6 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; - buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name; buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; return buffer; @@ -899,6 +900,21 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { } #endif +static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) { + static ggml_backend_buffer_type_t bufts[] = { +#ifdef GGML_USE_CPU_HBM + ggml_backend_cpu_hbm_buffer_type(), +#endif + NULL + }; + + return bufts; + + GGML_UNUSED(device); +} + +// CPU backend - backend (stream) + struct ggml_backend_cpu_context { int n_threads; ggml_threadpool_t threadpool; @@ -923,12 +939,6 @@ static void ggml_backend_cpu_free(ggml_backend_t backend) { delete backend; } -static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_cpu_buffer_type(); - - GGML_UNUSED(backend); -} - struct ggml_backend_plan_cpu { struct ggml_cplan cplan; struct ggml_cgraph cgraph; @@ -998,7 +1008,6 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s static const struct ggml_backend_i ggml_backend_cpu_i = { /* .get_name = */ ggml_backend_cpu_get_name, /* .free = */ ggml_backend_cpu_free, - /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL, @@ -1008,9 +1017,6 @@ static const struct ggml_backend_i ggml_backend_cpu_i = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, - /* .supports_op = */ NULL, - /* .supports_buft = */ NULL, - /* .offload_op = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, }; @@ -1081,10 +1087,10 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); - return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); + return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); } -//////////////////////// +// CPU backend - device struct ggml_backend_cpu_device_context { std::string description = "CPU"; @@ -1171,7 +1177,7 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * } static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) { - return GGML_BACKEND_DEVICE_TYPE_CPU_FULL; + return GGML_BACKEND_DEVICE_TYPE_CPU; GGML_UNUSED(dev); } @@ -1189,7 +1195,7 @@ static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggm }; } -static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) { +static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) { return ggml_backend_cpu_init(); GGML_UNUSED(dev); @@ -1202,7 +1208,7 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_b GGML_UNUSED(dev); } -static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { +static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { return ggml_backend_cpu_buffer_from_ptr(ptr, size); GGML_UNUSED(dev); @@ -1244,10 +1250,10 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = { /* .get_memory = */ ggml_backend_cpu_device_get_memory, /* .get_type = */ ggml_backend_cpu_device_get_type, /* .get_props = */ ggml_backend_cpu_device_get_props, - /* .init_backend = */ ggml_backend_cpu_device_init, + /* .init_backend = */ ggml_backend_cpu_device_init_backend, /* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type, /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr, + /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr, /* .supports_op = */ ggml_backend_cpu_device_supports_op, /* .supports_buft = */ ggml_backend_cpu_device_supports_buft, /* .offload_op = */ NULL, @@ -1256,7 +1262,7 @@ static const struct ggml_backend_device_i ggml_backend_cpu_device_i = { /* .event_synchronize = */ NULL, }; -//////////////////////// +// CPU backend - backend (reg) static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) { return "CPU"; @@ -1287,6 +1293,10 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch if (strcmp(name, "ggml_backend_set_n_threads") == 0) { return (void *)ggml_backend_cpu_set_n_threads; } + if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { + return (void *)ggml_backend_cpu_get_extra_bufts; + } + return NULL; GGML_UNUSED(reg); @@ -1315,12 +1325,6 @@ struct ggml_backend_multi_buffer_context { size_t n_buffers; }; -static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) { - ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; - - return ctx->buffers[0]->iface.get_name(ctx->buffers[0]); -} - static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context; for (size_t i = 0; i < ctx->n_buffers; i++) { @@ -1339,7 +1343,6 @@ static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_ } static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = { - /* .get_name = */ ggml_backend_multi_buffer_get_name, /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer, /* .get_base = */ NULL, /* .init_tensor = */ NULL, @@ -1368,7 +1371,7 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer } bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_multi_buffer_get_name; + return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer; } void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { @@ -1460,7 +1463,7 @@ struct ggml_backend_sched { char * context_buffer; size_t context_buffer_size; - bool debug; + int debug; }; #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) @@ -1500,7 +1503,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co return -1; } -#if 0 +#if 1 #define GGML_SCHED_MAX_SPLITS_DEBUG 4096 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) @@ -1548,7 +1551,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st if (src == NULL) { continue; } - if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + // skip ROPE since the rope freqs tensor is too small to choose a backend based on it + // not an ideal solution + if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op if (src_backend_id == sched->n_backends - 1) { @@ -1595,19 +1600,21 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str if (ggml_is_view_op(node->op)) { continue; } - ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, - fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; + if (sched->debug > 1) { + ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); + GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, + fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); + GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name, + fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } - ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); - GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name, - fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); + GGML_LOG_DEBUG("\n"); } - GGML_LOG_DEBUG("\n"); } } @@ -1899,11 +1906,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (src == NULL) { continue; } - // check if a weight is on a different backend + // check if a weight is on a different and incompatible backend // by starting a new split, the memory of the previously offloaded weights can be reused if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = tensor_backend_id(src); - if (src_backend_id != cur_backend_id) { + if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { need_new_split = true; break; } @@ -1915,7 +1922,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg int src_backend_id = sched->hv_tensor_backend_ids[id]; bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) { - //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); need_new_split = true; break; } @@ -2240,7 +2246,8 @@ ggml_backend_sched_t ggml_backend_sched_new( struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched)); - sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; + const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG"); + sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0; sched->n_backends = n_backends; sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1; diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index 7875ec8..8d96220 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -224,12 +224,6 @@ static void ggml_backend_blas_free(ggml_backend_t backend) { delete backend; } -static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_cpu_buffer_type(); - - GGML_UNUSED(backend); -} - static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; @@ -265,7 +259,6 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, static struct ggml_backend_i blas_backend_i = { /* .get_name = */ ggml_backend_blas_get_name, /* .free = */ ggml_backend_blas_free, - /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL, @@ -275,9 +268,6 @@ static struct ggml_backend_i blas_backend_i = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_blas_graph_compute, - /* .supports_op = */ NULL, - /* .supports_buft = */ NULL, - /* .offload_op = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, }; @@ -356,7 +346,7 @@ static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * } static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) { - return GGML_BACKEND_DEVICE_TYPE_CPU; + return GGML_BACKEND_DEVICE_TYPE_ACCEL; GGML_UNUSED(dev); } @@ -374,7 +364,7 @@ static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct gg }; } -static ggml_backend_t ggml_backend_blas_device_init(ggml_backend_dev_t dev, const char * params) { +static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) { return ggml_backend_blas_init(); GGML_UNUSED(dev); @@ -387,7 +377,7 @@ static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_ GGML_UNUSED(dev); } -static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { +static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { return ggml_backend_cpu_buffer_from_ptr(ptr, size); GGML_UNUSED(dev); @@ -456,10 +446,10 @@ static const struct ggml_backend_device_i ggml_backend_blas_device_i = { /* .get_memory = */ ggml_backend_blas_device_get_memory, /* .get_type = */ ggml_backend_blas_device_get_type, /* .get_props = */ ggml_backend_blas_device_get_props, - /* .init_backend = */ ggml_backend_blas_device_init, + /* .init_backend = */ ggml_backend_blas_device_init_backend, /* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type, /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_ptr, + /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr, /* .supports_op = */ ggml_backend_blas_device_supports_op, /* .supports_buft = */ ggml_backend_blas_device_supports_buft, /* .offload_op = */ NULL, diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index af0fb60..f8ac11e 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -489,23 +489,6 @@ struct ggml_backend_cann_buffer_context { ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); } }; -/** - * @brief Retrieve the name associated with a CANN buffer. - * - * This function returns the name of a CANN buffer, which is stored in the - * context of the buffer. - * - * @param buffer The CANN buffer whose name is to be retrieved. - * @return A pointer to a C-string containing the name of the buffer. - */ - -static const char* ggml_backend_cann_buffer_get_name( - ggml_backend_buffer_t buffer) { - return "CANN"; - - GGML_UNUSED(buffer); -} - /** * @brief Check if a buffer is a CANN buffer. * @@ -515,9 +498,10 @@ static const char* ggml_backend_cann_buffer_get_name( * @param buffer The buffer to check. * @return true if the buffer is a CANN buffer, false otherwise. */ +static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft); static bool ggml_backend_buffer_is_cann( ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_cann_buffer_get_name; + return ggml_backend_buft_is_cann(buffer->buft); } /** @@ -965,7 +949,6 @@ static void ggml_backend_cann_buffer_clear( * on a CANN buffer within the backend. */ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = { - /* .get_name = */ ggml_backend_cann_buffer_get_name, /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer, /* .get_base = */ ggml_backend_cann_buffer_get_base, /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor, @@ -999,9 +982,10 @@ struct ggml_backend_cann_buffer_type_context { */ static const char* ggml_backend_cann_buffer_type_name( ggml_backend_buffer_type_t buft) { - return "CANN"; + ggml_backend_cann_buffer_type_context* buft_ctx = + (ggml_backend_cann_buffer_type_context*)buft->context; - GGML_UNUSED(buft); + return buft_ctx->name.c_str(); } /** @@ -1465,24 +1449,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) { delete backend; } -/** - * @brief Retrieves the default buffer type associated with the CANN backend. - * - * This function returns the buffer type specific to the device associated - * with the CANN backend. It is used to allocate buffers for computations - * performed by the backend. - * - * @param backend Pointer to the CANN backend structure. - * @return Pointer to the buffer type structure for the CANN backend. - */ -static ggml_backend_buffer_type_t -ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)backend->context; - - return ggml_backend_cann_buffer_type(cann_ctx->device); -} - /** * @brief Sets tensor data asynchronously in the CANN backend. * @@ -1863,7 +1829,6 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend, static const ggml_backend_i ggml_backend_cann_interface = { /* .get_name = */ ggml_backend_cann_name, /* .free = */ ggml_backend_cann_free, - /* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type, /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async, /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async, @@ -1873,9 +1838,6 @@ static const ggml_backend_i ggml_backend_cann_interface = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cann_graph_compute, - /* .supports_op = */ NULL, // moved to device - /* .supports_buft = */ NULL, // moved to device - /* .offload_op = */ NULL, // moved to device /* .event_record = */ ggml_backend_cann_event_record, /* .event_wait = */ ggml_backend_cann_event_wait, }; @@ -1918,7 +1880,7 @@ static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) { GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; + return GGML_BACKEND_DEVICE_TYPE_GPU; } static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 217df96..0870915 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -421,20 +421,15 @@ struct ggml_backend_cuda_buffer_context { } }; -static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { - ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - return ctx->name.c_str(); -} - -static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name; -} - static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; delete ctx; } +static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { + return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer; +} + static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->dev_ptr; @@ -515,7 +510,6 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t } static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { - /* .get_name = */ ggml_backend_cuda_buffer_get_name, /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, @@ -548,8 +542,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac ggml_cuda_set_device(buft_ctx->device); - size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0 - void * dev_ptr; cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device); if (err != cudaSuccess) { @@ -657,7 +649,9 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl } struct ggml_backend_cuda_split_buffer_type_context { + int main_device; std::array tensor_split; + std::string name; }; struct ggml_backend_cuda_split_buffer_context { @@ -680,16 +674,6 @@ struct ggml_backend_cuda_split_buffer_context { std::vector tensor_extras; }; -static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { - return GGML_CUDA_NAME "_Split"; - - GGML_UNUSED(buffer); -} - -static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; - GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds -} static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; @@ -833,7 +817,6 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u } static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { - /* .get_name = */ ggml_backend_cuda_split_buffer_get_name, /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_split_buffer_get_base, /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor, @@ -848,9 +831,9 @@ static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { // cuda split buffer type static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return GGML_CUDA_NAME "_Split"; + ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; - GGML_UNUSED(buft); + return ctx->name.c_str(); } static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { @@ -915,11 +898,11 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, }; -ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { +ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) { static std::mutex mutex; std::lock_guard lock(mutex); - static std::map, struct ggml_backend_buffer_type> buft_map; + static std::map>, struct ggml_backend_buffer_type> buft_map; std::array tensor_split_arr = {}; @@ -937,18 +920,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten } } - auto it = buft_map.find(tensor_split_arr); + auto it = buft_map.find({main_device, tensor_split_arr}); if (it != buft_map.end()) { return &it->second; } + auto * ctx = new ggml_backend_cuda_split_buffer_type_context{ + main_device, + tensor_split_arr, + GGML_CUDA_NAME + std::to_string(main_device) + "_Split", + }; struct ggml_backend_buffer_type buft { /* .iface = */ ggml_backend_cuda_split_buffer_type_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0), - /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device), + /* .context = */ ctx, }; - auto result = buft_map.emplace(tensor_split_arr, buft); + auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft); return &result.first->second; } @@ -960,12 +948,6 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_ GGML_UNUSED(buft); } -static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { - return GGML_CUDA_NAME "_Host"; - - GGML_UNUSED(buffer); -} - static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { CUDA_CHECK(cudaFreeHost(buffer->context)); } @@ -998,7 +980,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; - buffer->iface.get_name = ggml_backend_cuda_host_buffer_name; buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; return buffer; @@ -1400,7 +1381,7 @@ static void ggml_cuda_op_mul_mat( const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); - const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); + const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); @@ -1890,7 +1871,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co } static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); + const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 @@ -2017,7 +1998,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers"); + GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers"); cudaStream_t stream = ctx.stream(); @@ -2150,7 +2131,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { // why is this here instead of mul_mat? - if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) { + if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) { ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device); } @@ -2371,12 +2352,6 @@ static void ggml_backend_cuda_free(ggml_backend_t backend) { delete backend; } -static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - - return ggml_backend_cuda_buffer_type(cuda_ctx->device); -} - static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; @@ -2582,7 +2557,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, continue; } - if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { + if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) { use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__); @@ -2669,7 +2644,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { assert(node->src[j]->buffer); - assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); + assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || + ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft)); } } #endif @@ -2762,7 +2738,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info); if (stat == cudaErrorGraphExecUpdateFailure) { #ifndef NDEBUG - GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__); + GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__); #endif // The pre-existing graph exec cannot be updated due to violated constraints // so instead clear error and re-instantiate @@ -2811,7 +2787,6 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev static const ggml_backend_i ggml_backend_cuda_interface = { /* .get_name = */ ggml_backend_cuda_get_name, /* .free = */ ggml_backend_cuda_free, - /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async, @@ -2821,9 +2796,6 @@ static const ggml_backend_i ggml_backend_cuda_interface = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, - /* .supports_op = */ NULL, // moved to device - /* .supports_buft = */ NULL, // moved to device - /* .offload_op = */ NULL, // moved to device /* .event_record = */ ggml_backend_cuda_event_record, /* .event_wait = */ ggml_backend_cuda_event_wait, }; @@ -2913,7 +2885,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) { GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; + return GGML_BACKEND_DEVICE_TYPE_GPU; } static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { @@ -2937,7 +2909,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back }; } -static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) { +static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; return ggml_backend_cuda_init(ctx->device); @@ -2953,18 +2925,29 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type( return ggml_backend_cuda_host_buffer_type(); } -static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - GGML_UNUSED(dev); - GGML_UNUSED(ptr); - GGML_UNUSED(size); - GGML_UNUSED(max_tensor_size); - return nullptr; -} - // TODO: move these functions here static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context; + // split buffers can only be used with GGML_OP_MUL_MAT + if (op->op != GGML_OP_MUL_MAT) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) { + return false; + } + } + } + + // check if all the sources are allocated on this device + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) { + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context; + if (buft_ctx->device != dev_ctx->device) { + return false; + } + } + } + switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -3190,24 +3173,27 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - if (ggml_backend_buft_is_cuda_split(buft)) { - return true; - } + return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev; +} - if (ggml_backend_buft_is_cuda(buft)) { - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context; - ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; - return buft_ctx->device == dev_ctx->device; +static int64_t get_op_batch_size(const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_GET_ROWS: + return 0; + case GGML_OP_MUL_MAT: + return op->ne[1]; + case GGML_OP_MUL_MAT_ID: + case GGML_OP_ROPE: + return op->ne[2]; + default: + return ggml_nrows(op); } - - return false; } static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { const int min_batch_size = 32; - return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || - (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); + return get_op_batch_size(op) >= min_batch_size; GGML_UNUSED(dev); } @@ -3248,10 +3234,10 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = { /* .get_memory = */ ggml_backend_cuda_device_get_memory, /* .get_type = */ ggml_backend_cuda_device_get_type, /* .get_props = */ ggml_backend_cuda_device_get_props, - /* .init_backend = */ ggml_backend_cuda_device_init, + /* .init_backend = */ ggml_backend_cuda_device_init_backend, /* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type, /* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type, - /* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr, + /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_cuda_device_supports_op, /* .supports_buft = */ ggml_backend_cuda_device_supports_buft, /* .offload_op = */ ggml_backend_cuda_device_offload_op, diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index 2c926aa..1f22202 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -1820,11 +1820,6 @@ static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) { } } -static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) { - auto * ctx = static_cast(buffer->buft->context); - return ctx->name.c_str(); -} - static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) { auto * memory = (ggml_vk_memory *)buffer->context; if (ggml_vk_has_device()) { @@ -1868,7 +1863,6 @@ static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint } static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = { - /* .get_name = */ ggml_backend_kompute_buffer_get_name, /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer, /* .get_base = */ ggml_backend_kompute_buffer_get_base, /* .init_tensor = */ NULL, @@ -1953,11 +1947,6 @@ static void ggml_backend_kompute_free(ggml_backend_t backend) { delete backend; } -static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) { - auto * ctx = static_cast(backend->context); - return ggml_backend_kompute_buffer_type(ctx->device); -} - static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { auto * ctx = static_cast(backend->context); ggml_vk_graph_compute(ctx, cgraph); @@ -1977,7 +1966,6 @@ static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_back static struct ggml_backend_i kompute_backend_i = { /* .get_name = */ ggml_backend_kompute_name, /* .free = */ ggml_backend_kompute_free, - /* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL, @@ -1987,9 +1975,6 @@ static struct ggml_backend_i kompute_backend_i = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_kompute_graph_compute, - /* .supports_op = */ ggml_backend_kompute_supports_op, - /* .supports_buft = */ ggml_backend_kompute_supports_buft, - /* .offload_op = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, }; diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index fb2efc6..f9bd6fa 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -3254,12 +3254,6 @@ static enum ggml_status ggml_metal_graph_compute( // backend interface -static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { - return "Metal"; - - UNUSED(buffer); -} - static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @@ -3314,7 +3308,6 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ } static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { - /* .get_name = */ ggml_backend_metal_buffer_get_name, /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, /* .get_base = */ ggml_backend_metal_buffer_get_base, /* .init_tensor = */ NULL, @@ -3439,6 +3432,29 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { return &ggml_backend_buffer_type_metal; } +static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) { + return "Metal_Mapped"; + + UNUSED(buft); +} + +static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) { + static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = { + /* .iface = */ { + /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name, + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_metal_buffer_type_is_host, + }, + /* .device = */ &g_ggml_backend_metal_device, + /* .context = */ NULL, + }; + + return &ggml_backend_buffer_from_ptr_type_metal; +} + // TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); @@ -3515,7 +3531,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz } } - return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); + return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); } // backend @@ -3536,12 +3552,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) { free(backend); } -static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_metal_buffer_type(); - - UNUSED(backend); -} - static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return ggml_metal_graph_compute(backend, cgraph); } @@ -3608,7 +3618,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, - /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL, @@ -3618,9 +3627,6 @@ static struct ggml_backend_i ggml_backend_metal_i = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, - /* .supports_op = */ NULL, - /* .supports_buft = */ NULL, - /* .offload_op = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, }; @@ -3715,7 +3721,7 @@ static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t } static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) { - return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; + return GGML_BACKEND_DEVICE_TYPE_GPU; GGML_UNUSED(dev); } diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index 0e936b3..2778009 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -178,7 +178,6 @@ struct ggml_backend_rpc_buffer_context { std::shared_ptr sock; std::unordered_map base_cache; uint64_t remote_ptr; - std::string name; }; // RPC helper functions @@ -409,11 +408,6 @@ static std::shared_ptr get_socket(const std::string & endpoint) { return sock; } -static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) { - ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; - return ctx->name.c_str(); -} - static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; rpc_msg_free_buffer_req request = {ctx->remote_ptr}; @@ -524,7 +518,6 @@ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t } static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = { - /* .get_name = */ ggml_backend_rpc_buffer_get_name, /* .free_buffer = */ ggml_backend_rpc_buffer_free_buffer, /* .get_base = */ ggml_backend_rpc_buffer_get_base, /* .init_tensor = */ ggml_backend_rpc_buffer_init_tensor, @@ -551,7 +544,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back if (response.remote_ptr != 0) { ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_rpc_buffer_interface, - new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"}, + new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr}, response.remote_size); return buffer; } else { @@ -609,11 +602,6 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) { delete backend; } -static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context; - return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str()); -} - static void ggml_backend_rpc_synchronize(ggml_backend_t backend) { UNUSED(backend); // this is no-op because we don't have any async operations @@ -670,7 +658,6 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g static ggml_backend_i ggml_backend_rpc_interface = { /* .get_name = */ ggml_backend_rpc_name, /* .free = */ ggml_backend_rpc_free, - /* .get_default_buffer_type = */ ggml_backend_rpc_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, /* .cpy_tensor_async = */ NULL, @@ -680,9 +667,6 @@ static ggml_backend_i ggml_backend_rpc_interface = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_rpc_graph_compute, - /* .supports_op = */ NULL, - /* .supports_buft = */ NULL, - /* .offload_op = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, }; @@ -1278,7 +1262,7 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t * static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) { // TODO: obtain value from the server - return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; + return GGML_BACKEND_DEVICE_TYPE_GPU; UNUSED(dev); } diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 4d91ee4..a62c67f 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -249,13 +249,10 @@ struct ggml_backend_sycl_buffer_context { } }; -static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) { - ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; - return ctx->name.c_str(); -} +static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft); static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name; + return buffer->buft->iface.get_name == ggml_backend_sycl_buffer_type_get_name; } static void @@ -440,7 +437,6 @@ catch (sycl::exception const &exc) { } static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = { - /* .get_name = */ ggml_backend_sycl_buffer_get_name, /* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer, /* .get_base = */ ggml_backend_sycl_buffer_get_base, /* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor, @@ -698,16 +694,6 @@ struct ggml_backend_sycl_split_buffer_context { std::vector streams; }; -static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) { - return GGML_SYCL_NAME "_Split"; - - GGML_UNUSED(buffer); -} - -static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name; -} - static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; delete ctx; @@ -915,7 +901,6 @@ static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, u } static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = { - /* .get_name = */ ggml_backend_sycl_split_buffer_get_name, /* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer, /* .get_base = */ ggml_backend_sycl_split_buffer_get_base, /* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor, @@ -935,6 +920,10 @@ static const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_bu GGML_UNUSED(buft); } +static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) { + return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name; +} + static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // instead, we allocate them for each tensor separately in init_tensor @@ -1040,12 +1029,6 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ GGML_UNUSED(buft); } -static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) { - return GGML_SYCL_NAME "_Host"; - - GGML_UNUSED(buffer); -} - static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_sycl_host_free(buffer->context); } @@ -1061,7 +1044,6 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm // FIXME: this is a hack to avoid having to implement a new buffer type ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; - buffer->iface.get_name = ggml_backend_sycl_host_buffer_name; buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer; return buffer; @@ -4889,12 +4871,6 @@ static void ggml_backend_sycl_free(ggml_backend_t backend) { delete backend; } - -static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; - return ggml_backend_sycl_buffer_type(sycl_ctx->device); -} - static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, ggml_tensor *tensor, const void *data, size_t offset, @@ -5031,7 +5007,6 @@ static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_ev static ggml_backend_i ggml_backend_sycl_interface = { /* .get_name = */ ggml_backend_sycl_get_name, /* .free = */ ggml_backend_sycl_free, - /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type, /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async, /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async, /* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async, @@ -5043,9 +5018,6 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, - /* .supports_op = */ NULL, // moved to device - /* .supports_buft = */ NULL, // moved to device - /* .offload_op = */ NULL, // moved to device /* .event_record = */ ggml_backend_sycl_event_record, /* .event_wait = */ ggml_backend_sycl_event_wait, }; @@ -5092,7 +5064,7 @@ static void ggml_backend_sycl_device_get_memory(ggml_backend_dev_t dev, size_t * static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) { GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; + return GGML_BACKEND_DEVICE_TYPE_GPU; } static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { @@ -5388,12 +5360,14 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re return ctx->devices[index]; } -static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) -{ +static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) { GGML_UNUSED(reg); - if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { - return (void *)ggml_backend_sycl_split_buffer_type; - } + + // TODO: update to the current function signature + //if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { + // return (void *)ggml_backend_sycl_split_buffer_type; + //} + // SYCL doesn't support registering host memory, left here for reference // "ggml_backend_register_host_buffer" // "ggml_backend_unregister_host_buffer" diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index ecae13a..3c2c7c1 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -6247,13 +6247,8 @@ static void ggml_vk_get_device_description(int device, char * description, size_ // device backend -static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) { - ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; - return ctx->name.c_str(); -} - static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_vk_buffer_get_name; + return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name; } static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -6317,7 +6312,6 @@ static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t v } static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { - /* .get_name = */ ggml_backend_vk_buffer_get_name, /* .free_buffer = */ ggml_backend_vk_buffer_free_buffer, /* .get_base = */ ggml_backend_vk_buffer_get_base, /* .init_tensor = */ ggml_backend_vk_buffer_init_tensor, @@ -6413,7 +6407,6 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; - buffer->iface.get_name = ggml_backend_vk_host_buffer_name; buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer; return buffer; @@ -6646,7 +6639,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg static ggml_backend_i ggml_backend_vk_interface = { /* .get_name = */ ggml_backend_vk_name, /* .free = */ ggml_backend_vk_free, - /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type, /* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async, /* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async, /* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async, @@ -6656,9 +6648,6 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_vk_graph_compute, - /* .supports_op = */ NULL, - /* .supports_buft = */ NULL, - /* .offload_op = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, }; @@ -6717,7 +6706,7 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total ////////////////////////// struct ggml_backend_vk_device_context { - int device; + size_t device; std::string name; std::string description; }; @@ -6749,7 +6738,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) { UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; + return GGML_BACKEND_DEVICE_TYPE_GPU; } static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { @@ -6758,9 +6747,10 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml props->type = ggml_backend_vk_device_get_type(dev); ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = { - /* async */ false, - /* host_buffer */ true, - /* events */ false, + /* .async = */ false, + /* .host_buffer = */ true, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, }; } @@ -6949,7 +6939,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, static std::mutex mutex; std::lock_guard lock(mutex); if (!initialized) { - for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) { + for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) { ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context; char desc[256]; ggml_backend_vk_get_device_description(i, desc, sizeof(desc)); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index a4359e7..74f73c8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3999,7 +3999,9 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size); - assert(false); +#ifndef NDEBUG + GGML_ABORT("not enough space in the context's memory pool"); +#endif return NULL; }