ggml : backport llama.cpp updates (close #709)

- About 2x overall performance improvement on Apple Silicon
- Results should now be the same for different numbers of threads (not tested)
Georgi Gerganov 2023-04-10 22:28:54 +03:00
parent 0a2d1210bc
commit 69b8503935
3 changed files with 1658 additions and 1202 deletions

ggml.c (2614 changes)

File diff suppressed because it is too large

ggml.h (113 changes)

@@ -236,6 +236,7 @@ enum ggml_op {
     GGML_OP_SCALE,
     GGML_OP_CPY,
+    GGML_OP_CONT,
     GGML_OP_RESHAPE,
     GGML_OP_VIEW,
     GGML_OP_PERMUTE,
@@ -253,16 +254,29 @@ enum ggml_op {
     GGML_OP_COUNT,
 };

+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
     enum ggml_type type;

     int n_dims;
-    int     ne[GGML_MAX_DIMS]; // number of elements
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
     size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                // nb[0] = sizeof(type)
                                // nb[1] = nb[0]   * ne[0] + padding
                                // nb[i] = nb[i-1] * ne[i-1]

     // compute data
     enum ggml_op op;
@@ -316,6 +330,7 @@ struct ggml_init_params {
     // memory pool
     size_t mem_size;   // bytes
     void * mem_buffer; // if NULL, memory will be allocated internally
+    bool   no_alloc;   // don't allocate memory for the tensor data
 };

 void ggml_time_init(void); // call this once at the beginning of the program
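The new no_alloc flag turns a context into a measurement tool: tensors get their shape and stride metadata, but no data buffer is carved out for them, so callers can size a compute buffer before committing memory (llama.cpp uses this for its mmap path). A minimal sketch under that assumption:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,   // holds tensor structs only
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,           // do not allocate tensor data
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1024, 1024);

        // shape and size queries are valid; t->data is not backed by memory
        printf("tensor would need %zu bytes\n", ggml_nbytes(t));

        ggml_free(ctx);
        return 0;
    }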
@@ -327,8 +342,8 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);

-int     ggml_nelements(const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
 size_t  ggml_nbytes   (const struct ggml_tensor * tensor);

 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -343,40 +358,37 @@ size_t ggml_used_mem(const struct ggml_context * ctx);

 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

-bool ggml_mlock_supported(void);
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
         int    n_dims,
-        const int *ne);
+        const int64_t *ne);

 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);

 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
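Widening ne to int64_t moves the overflow point for element counts from 2^31 to 2^63. A toy check of the new int64_t-returning ggml_nelements (hypothetical sizes; no_alloc avoids actually allocating 12 GB):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,   // metadata only
        };
        struct ggml_context * ctx = ggml_init(params);

        // ~3e9 elements: would have overflowed the old int-based API
        const int64_t ne[2] = { 3000000000LL, 1 };
        struct ggml_tensor * t = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);

        printf("nelements = %lld\n", (long long) ggml_nelements(t));

        ggml_free(ctx);
        return 0;
    }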
@@ -514,6 +526,11 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_tensor  * a,
         struct ggml_tensor  * b);

+// make contiguous
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape(
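ggml_cont copies a strided view back into contiguous, row-major memory, which operations that assume contiguity need after a transpose or permute. A sketch, assuming a live context ctx:

    struct ggml_tensor * x  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    struct ggml_tensor * xt = ggml_transpose(ctx, x); // view: swapped ne/nb, same data
    struct ggml_tensor * xc = ggml_cont(ctx, xt);     // fresh contiguous tensor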
@@ -526,33 +543,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1);
+        int64_t               ne0,
+        int64_t               ne1);

 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
-        int                   ne2);
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2);

 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
+        int64_t               ne0,
         size_t                offset);

 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
+        int64_t               ne0,
+        int64_t               ne1,
         size_t                nb1, // row stride in bytes
         size_t                offset);

+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        size_t                nb1, // row   stride in bytes
+        size_t                nb2, // slice stride in bytes
+        size_t                offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
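The view functions reinterpret existing data through explicit byte strides, so a slice of a larger tensor can be addressed with no copy. A sketch of the stride arithmetic for extracting one head from a contiguous [n, d, H] tensor (names and sizes are illustrative; assumes a live context ctx):

    const int64_t n = 64, d = 32, H = 8;    // positions, head dim, heads
    struct ggml_tensor * kv = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n, d, H);

    const size_t es = ggml_element_size(kv);
    const int    h  = 3;                    // which head to view

    struct ggml_tensor * head = ggml_view_3d(ctx, kv,
            n, d, 1,                        // ne0, ne1, ne2
            n*es,                           // nb1: one row of n elements
            n*d*es,                         // nb2: one [n, d] slice
            h*n*d*es);                      // offset: skip h slices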
@@ -748,8 +775,8 @@ enum ggml_opt_result ggml_opt(
 // quantization
 //

-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);

 //
 // system info
@@ -768,6 +795,30 @@ int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);

+//
+// Internal types and functions exposed for tests and benchmarks
+//
+
+#ifdef  __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+
+typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
 #ifdef __cplusplus
 }
 #endif
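The dispatch table gives tests and benchmarks a single entry point per quantization type. A sketch of a round-trip accuracy check, assuming the table is indexed by enum ggml_type and that k is a multiple of the Q4_0 block size of 32:

    #include <math.h>
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        const int k = 64;
        float src[64], dst[64];
        unsigned char quant[64];  // larger than the packed Q4_0 row needs

        for (int i = 0; i < k; ++i) {
            src[i] = sinf(0.1f*i);
        }

        quantize_fns_t fns = ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);

        fns.quantize_row_q  (src,   quant, k);
        fns.dequantize_row_q(quant, dst,   k);

        float max_err = 0.0f;
        for (int i = 0; i < k; ++i) {
            const float err = fabsf(dst[i] - src[i]);
            if (err > max_err) max_err = err;
        }
        printf("Q4_0 round-trip max error: %f\n", max_err);
        return 0;
    }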

whisper.cpp

@@ -654,9 +654,11 @@ static bool kv_cache_init(
         int   n_ctx) {
     cache.buf.resize(mem_bytes);

-    struct ggml_init_params params;
-    params.mem_size   = cache.buf.size();
-    params.mem_buffer = cache.buf.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ cache.buf.size(),
+        /*.mem_buffer =*/ cache.buf.data(),
+        /*.no_alloc   =*/ false,
+    };

     cache.ctx = ggml_init(params);
@@ -688,9 +690,11 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
     WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));

-    struct ggml_init_params params;
-    params.mem_size   = cache.buf.size();
-    params.mem_buffer = cache.buf.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ cache.buf.size(),
+        /*.mem_buffer =*/ cache.buf.data(),
+        /*.no_alloc   =*/ false,
+    };

     cache.ctx = ggml_init(params);
@@ -1028,9 +1032,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
     // create the ggml context
     {
-        struct ggml_init_params params;
-        params.mem_size   = wctx.model.buf->size();
-        params.mem_buffer = wctx.model.buf->data();
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ wctx.model.buf->size(),
+            /*.mem_buffer =*/ wctx.model.buf->data(),
+            /*.no_alloc   =*/ false,
+        };

         model.ctx = ggml_init(params);

         if (!model.ctx) {
@@ -1254,10 +1260,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                 break;
             }

-            int32_t nelements = 1;
-            int32_t ne[3] = { 1, 1, 1 };
+            int64_t nelements = 1;
+            int64_t ne[3] = { 1, 1, 1 };
             for (int i = 0; i < n_dims; ++i) {
-                read_safe(loader, ne[i]);
+                int32_t ne_cur;
+                read_safe(loader, ne_cur);
+                ne[i] = ne_cur;
                 nelements *= ne[i];
             }
@@ -1278,7 +1286,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             }

             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld, %lld], expected [%lld, %lld, %lld]\n",
                         __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]);
                 return false;
             }
@@ -1286,7 +1294,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);

             if (nelements*bpe != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %llu\n",
                         __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                 return false;
             }
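The %lld/%llu specifiers match int64_t on platforms where it is long long (macOS, Windows); on LP64 Linux int64_t is long, so the strictly portable spelling goes through <inttypes.h>:

    #include <inttypes.h>
    fprintf(stderr, "got [%" PRId64 ", %" PRId64 ", %" PRId64 "]\n", ne[0], ne[1], ne[2]);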
@@ -1344,9 +1352,11 @@ static bool whisper_encode_internal(
     const int n_mels = hparams.n_mels;
     assert(mel_inp.n_mel == n_mels);

-    struct ggml_init_params params;
-    params.mem_size   = wstate.buf_compute.size();
-    params.mem_buffer = wstate.buf_compute.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ wstate.buf_compute.size(),
+        /*.mem_buffer =*/ wstate.buf_compute.data(),
+        /*.no_alloc   =*/ false,
+    };

     struct ggml_context * ctx0 = ggml_init(params);
@@ -1501,8 +1511,7 @@ static bool whisper_encode_internal(
                         Vcur,
                         n_state/n_head, n_head, n_ctx),
                     1, 2, 0, 3),
-                ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)
-            );
+                ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));

             struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
 #else
@@ -1726,10 +1735,12 @@ static bool whisper_encode_internal(
         wstate.use_buf(ctx0, -1);

-        //struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
-        //struct ggml_tensor * v = ggml_view_1d(ctx0, wstate.kv_cross.v, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.v)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
-        struct ggml_tensor* k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
-        struct ggml_tensor* v = ggml_view_1d(ctx0, wstate.kv_cross.v, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.v)*n_state)*(il*n_ctx));
+        Vcross = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));
+
+        struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
+        struct ggml_tensor * v = ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
+                (   n_ctx)*ggml_element_size(wstate.kv_cross.v),
+                (il*n_ctx)*ggml_element_size(wstate.kv_cross.v)*n_state);

         ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcross, k));
         ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
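Storing Vcross transposed puts all positions for a given state dimension in one contiguous row of the cache, which is what lets the decoder later read V as a strided view instead of permuting and copying it. The offsets in the ggml_view_2d call follow from that layout; in element units (multiply by ggml_element_size for byte offsets):

    // transposed cross-attention V cache, per layer: v[n_state][n_ctx]
    // element (s, t) of layer il lives at:
    //     il*n_ctx*n_state   // start of this layer's slab
    //   + s*n_ctx            // row s: one state dimension, all positions
    //   + t                  // position t
    // hence nb1 = n_ctx*esize is the "next state dimension" stride used above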
@@ -1797,9 +1808,11 @@ static bool whisper_decode_internal(
     //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);

-    struct ggml_init_params params;
-    params.mem_size   = wstate.buf_compute.size();
-    params.mem_buffer = wstate.buf_compute.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ wstate.buf_compute.size(),
+        /*.mem_buffer =*/ wstate.buf_compute.data(),
+        /*.no_alloc   =*/ false,
+    };

     struct ggml_context * ctx0 = ggml_init(params);
@@ -1862,20 +1875,24 @@ static bool whisper_decode_internal(
             Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
-                    layer.attn_v_w,
-                    cur);
-
-            Vcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
-                        layer.attn_v_b,
-                        Vcur),
-                    Vcur);
-
             // store key and value to memory
             {
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
+                        layer.attn_v_w,
+                        cur);
+
+                Vcur = ggml_add(ctx0,
+                        ggml_repeat(ctx0,
+                            layer.attn_v_b,
+                            Vcur),
+                        Vcur);
+
+                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, N));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_state, (ggml_element_size(kv_self.v)*n_state)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_state,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + n_past*ggml_element_size(kv_self.v));

                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
@@ -1914,16 +1931,14 @@ static bool whisper_decode_internal(
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
-                            n_state/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_state/n_head, n_head));
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_state/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_state/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_state);

-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
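The old path rebuilt V_trans on every decoded token: reshape, permute, then a cpy into a freshly allocated tensor. Because the self-attention cache now stores V transposed (see the store hunk above), the same elements can be addressed with a single ggml_view_3d and no copy, which accounts for a good share of the decoder speedup. The stride bookkeeping, in element units (sketch):

    // per layer il the cache slab is v[n_state][n_ctx] (transposed storage);
    // viewed as [n_past + N, n_state/n_head, n_head]:
    //   nb1 = n_ctx*esize                    // next row within a head
    //   nb2 = n_ctx*esize*n_state/n_head     // next head = n_state/n_head rows
    // element (t, s, h) = offset + h*nb2 + s*nb1 + t*esize,
    // the same byte the old reshape+permute+cpy chain would have copied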
@@ -1986,15 +2001,22 @@ static bool whisper_decode_internal(
                         ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*ggml_element_size(wstate.kv_cross.k)*n_state),
                         n_state/n_head, n_head, M);

-            struct ggml_tensor * Vcross =
-                ggml_reshape_3d(ctx0,
-                    ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
-                    n_state/n_head, n_head, M);
-
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
+            //struct ggml_tensor * Vcross =
+            //    ggml_reshape_3d(ctx0,
+            //            ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
+            //            n_state/n_head, n_head, M);
+
+            //struct ggml_tensor * V_trans =
+            //    ggml_cpy(ctx0,
+            //            ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
+            //            ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, wstate.kv_cross.v,
+                        M, n_state/n_head, n_head,
+                        M*ggml_element_size(wstate.kv_cross.v),
+                        M*ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
+                        il*M*ggml_element_size(wstate.kv_cross.v)*n_state);

             // ------
@@ -2021,7 +2043,7 @@ static bool whisper_decode_internal(
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);

-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -4726,6 +4748,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
     struct ggml_init_params gparams = {
         /*.mem_size   =*/ buf.size(),
         /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ false,
     };

     struct ggml_context * ctx0 = ggml_init(gparams);