ggml : backport llama.cpp updates (close #709)
- About 2x overall performance improvement on Apple Silicon
- Results should now be the same for different numbers of threads (not tested)
parent 0a2d1210bc
commit 69b8503935

ggml.h (113 lines changed)
@@ -236,6 +236,7 @@ enum ggml_op {
     GGML_OP_SCALE,
     GGML_OP_CPY,
+    GGML_OP_CONT,
     GGML_OP_RESHAPE,
     GGML_OP_VIEW,
     GGML_OP_PERMUTE,
@@ -253,16 +254,29 @@ enum ggml_op {
     GGML_OP_COUNT,
 };
 
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
     enum ggml_type type;
 
     int     n_dims;
-    int     ne[GGML_MAX_DIMS]; // number of elements
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
     size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                // nb[0] = sizeof(type)
                                // nb[1] = nb[0] * ne[0] + padding
                                // nb[i] = nb[i-1] * ne[i-1]
 
     // compute data
     enum ggml_op op;
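A minimal sketch (not part of the patch) of what the widened ne field buys: per-tensor element counts can now be accumulated in 64-bit arithmetic, matching the int64_t ggml_nelements() declaration further down.

// Illustrative helper over the struct above; mirrors what ggml_nelements()
// computes, assuming t comes from an existing ggml context.
static int64_t count_elements(const struct ggml_tensor * t) {
    int64_t n = 1;
    for (int i = 0; i < t->n_dims; ++i) {
        n *= t->ne[i]; // ne[] is int64_t now, so the product no longer truncates to int
    }
    return n;
}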
@@ -316,6 +330,7 @@ struct ggml_init_params {
     // memory pool
     size_t mem_size;   // bytes
     void * mem_buffer; // if NULL, memory will be allocated internally
+    bool   no_alloc;   // don't allocate memory for the tensor data
 };
 
 void ggml_time_init(void); // call this once at the beginning of the program
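The new no_alloc field is presumably also why the whisper.cpp call sites below move from per-field assignment to aggregate initialization: with plain assignments the added member would be left uninitialized. A hedged usage sketch (the pool size here is illustrative):

struct ggml_init_params params = {
    /*.mem_size   =*/ 16*1024*1024, // bytes
    /*.mem_buffer =*/ NULL,         // NULL: ggml allocates the pool internally
    /*.no_alloc   =*/ false,        // true: build graphs without allocating tensor data
};
struct ggml_context * ctx = ggml_init(params);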
@@ -327,8 +342,8 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);
 
-int     ggml_nelements(const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
 size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
 
 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -343,40 +358,37 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
-bool ggml_mlock_supported(void);
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,
         int    n_dims,
-        const int *ne);
+        const int64_t *ne);
 
 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);
 
 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);
 
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);
 
 struct ggml_tensor * ggml_new_tensor_4d(
         struct ggml_context * ctx,
         enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);
 
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
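For reference, a short sketch of the updated creation API (assumes a ctx obtained from ggml_init()); shapes are now passed as int64_t:

const int64_t ne[2] = { 4096, 4096 };
struct ggml_tensor * a = ggml_new_tensor   (ctx, GGML_TYPE_F32, 2, ne);
struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);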
@@ -514,6 +526,11 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_tensor  * a,
         struct ggml_tensor  * b);
 
+// make contiguous
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape(
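A sketch of where the new ggml_cont() is typically useful (assumed usage, not taken from this patch): a permuted tensor is only a strided view, and ggml_cont() materializes it as a contiguous tensor for operations that require contiguous input.

struct ggml_tensor * p = ggml_permute(ctx, a, 1, 0, 2, 3); // non-contiguous view of a
struct ggml_tensor * c = ggml_cont(ctx, p);                // contiguous copy of that view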
@@ -526,33 +543,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);
 
 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);
 
 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
+        int64_t ne0,
         size_t offset);
 
 struct ggml_tensor * ggml_view_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        int    ne0,
-        int    ne1,
+        int64_t ne0,
+        int64_t ne1,
         size_t nb1, // row stride in bytes
         size_t offset);
 
+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        size_t  nb1, // row stride in bytes
+        size_t  nb2, // slice stride in bytes
+        size_t  offset);
+
 struct ggml_tensor * ggml_permute(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
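Sketch of the new 3-D view (tensor and shape names here are illustrative, and ctx/kv are assumed to exist already): nb1 and nb2 are byte strides, which is what lets whisper.cpp below read V straight out of the KV cache without a permute-and-copy.

// Expose a ne0 x ne1 x ne2 window of an existing tensor kv, starting at byte 0.
const int64_t ne0 = 64, ne1 = 48, ne2 = 8;
struct ggml_tensor * V = ggml_view_3d(ctx, kv,
        ne0, ne1, ne2,
        ne0*ggml_element_size(kv),     // nb1: row stride in bytes
        ne0*ne1*ggml_element_size(kv), // nb2: slice stride in bytes
        0);                            // offset in bytes from the start of kv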
@@ -748,8 +775,8 @@ enum ggml_opt_result ggml_opt(
 // quantization
 //
 
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
 
 //
 // system info
@@ -768,6 +795,30 @@ int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
+//
+// Internal types and functions exposed for tests and benchmarks
+//
+
+#ifdef  __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
 #ifdef  __cplusplus
 }
 #endif
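A hedged sketch of how tests and benchmarks might use the newly exposed table; the type index and the buffer sizes are assumptions, not taken from this commit.

float   src[64], out[64];
uint8_t q[64]; // comfortably larger than 64 values quantized to 4 bits plus block scales
quantize_fns_t fns = ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
fns.quantize_row_q  (src, q,   64); // quantize one row of 64 floats
fns.dequantize_row_q(q,   out, 64); // and expand it back to float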
whisper.cpp (133 lines changed)
@@ -654,9 +654,11 @@ static bool kv_cache_init(
         int   n_ctx) {
     cache.buf.resize(mem_bytes);
 
-    struct ggml_init_params params;
-    params.mem_size   = cache.buf.size();
-    params.mem_buffer = cache.buf.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ cache.buf.size(),
+        /*.mem_buffer =*/ cache.buf.data(),
+        /*.no_alloc   =*/ false,
+    };
 
     cache.ctx = ggml_init(params);
 
@@ -688,9 +690,11 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
 
     WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));
 
-    struct ggml_init_params params;
-    params.mem_size   = cache.buf.size();
-    params.mem_buffer = cache.buf.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ cache.buf.size(),
+        /*.mem_buffer =*/ cache.buf.data(),
+        /*.no_alloc   =*/ false,
+    };
 
     cache.ctx = ggml_init(params);
 
@@ -1028,9 +1032,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     // create the ggml context
     {
-        struct ggml_init_params params;
-        params.mem_size   = wctx.model.buf->size();
-        params.mem_buffer = wctx.model.buf->data();
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ wctx.model.buf->size(),
+            /*.mem_buffer =*/ wctx.model.buf->data(),
+            /*.no_alloc   =*/ false,
+        };
 
         model.ctx = ggml_init(params);
         if (!model.ctx) {
@@ -1254,10 +1260,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                 break;
             }
 
-            int32_t nelements = 1;
-            int32_t ne[3] = { 1, 1, 1 };
+            int64_t nelements = 1;
+            int64_t ne[3] = { 1, 1, 1 };
             for (int i = 0; i < n_dims; ++i) {
-                read_safe(loader, ne[i]);
+                int32_t ne_cur;
+                read_safe(loader, ne_cur);
+                ne[i] = ne_cur;
                 nelements *= ne[i];
             }
 
@@ -1278,7 +1286,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             }
 
             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld, %lld], expected [%lld, %lld, %lld]\n",
                         __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]);
                 return false;
             }
@@ -1286,7 +1294,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
 
             if (nelements*bpe != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %llu\n",
                         __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                 return false;
             }
@@ -1344,9 +1352,11 @@ static bool whisper_encode_internal(
     const int n_mels = hparams.n_mels;
     assert(mel_inp.n_mel == n_mels);
 
-    struct ggml_init_params params;
-    params.mem_size   = wstate.buf_compute.size();
-    params.mem_buffer = wstate.buf_compute.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ wstate.buf_compute.size(),
+        /*.mem_buffer =*/ wstate.buf_compute.data(),
+        /*.no_alloc   =*/ false,
+    };
 
     struct ggml_context * ctx0 = ggml_init(params);
 
@@ -1501,8 +1511,7 @@ static bool whisper_encode_internal(
                             Vcur,
                             n_state/n_head, n_head, n_ctx),
                         1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)
-                );
+                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
 
             struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
 #else
@@ -1726,10 +1735,12 @@ static bool whisper_encode_internal(
 
         wstate.use_buf(ctx0, -1);
 
-        //struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
-        //struct ggml_tensor * v = ggml_view_1d(ctx0, wstate.kv_cross.v, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.v)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
-        struct ggml_tensor* k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
-        struct ggml_tensor* v = ggml_view_1d(ctx0, wstate.kv_cross.v, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.v)*n_state)*(il*n_ctx));
+        Vcross = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));
+
+        struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
+        struct ggml_tensor * v = ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
+                (   n_ctx)*ggml_element_size(wstate.kv_cross.v),
+                (il*n_ctx)*ggml_element_size(wstate.kv_cross.v)*n_state);
 
         ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcross, k));
         ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
@@ -1797,9 +1808,11 @@ static bool whisper_decode_internal(
 
     //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
 
-    struct ggml_init_params params;
-    params.mem_size   = wstate.buf_compute.size();
-    params.mem_buffer = wstate.buf_compute.data();
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ wstate.buf_compute.size(),
+        /*.mem_buffer =*/ wstate.buf_compute.data(),
+        /*.no_alloc   =*/ false,
+    };
 
     struct ggml_context * ctx0 = ggml_init(params);
 
@@ -1862,20 +1875,24 @@ static bool whisper_decode_internal(
 
             Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
-                    layer.attn_v_w,
-                    cur);
-
-            Vcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
-                        layer.attn_v_b,
-                        Vcur),
-                    Vcur);
-
             // store key and value to memory
             {
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
+                        layer.attn_v_w,
+                        cur);
+
+                Vcur = ggml_add(ctx0,
+                        ggml_repeat(ctx0,
+                            layer.attn_v_b,
+                            Vcur),
+                        Vcur);
+
+                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, N));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_state, (ggml_element_size(kv_self.v)*n_state)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_state,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + n_past*ggml_element_size(kv_self.v));
 
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
@@ -1914,16 +1931,14 @@ static bool whisper_decode_internal(
 
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
 
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
-                            n_state/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_state/n_head, n_head));
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_state/n_head, n_head,
+                        n_ctx*ggml_element_size(kv_self.v),
+                        n_ctx*ggml_element_size(kv_self.v)*n_state/n_head,
+                        il*n_ctx*ggml_element_size(kv_self.v)*n_state);
 
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
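The strides above imply a specific cache layout; a hypothetical helper (not in the patch) makes the arithmetic explicit and shows why consecutive positions are contiguous in memory, so ggml_view_3d can expose V directly instead of the old permute-and-copy. This is presumably a large part of the speedup quoted in the commit message.

// Byte offset of cached value V[t][c][h] for decoder layer il, where
// t = position, c = channel within a head, h = head index,
// esz = ggml_element_size(kv_self.v).
static size_t v_cache_offset(size_t esz, int il, int n_ctx, int n_state,
                             int n_head, int t, int c, int h) {
    return (size_t) il * n_ctx * n_state * esz                // per-layer slab
         + ((size_t) h * (n_state/n_head) + c) * n_ctx * esz  // row for (head, channel)
         + (size_t) t * esz;                                  // step along time
}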
@@ -1986,15 +2001,22 @@ static bool whisper_decode_internal(
                         ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*ggml_element_size(wstate.kv_cross.k)*n_state),
                         n_state/n_head, n_head, M);
 
-            struct ggml_tensor * Vcross =
-                ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
-                        n_state/n_head, n_head, M);
-
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
+            //struct ggml_tensor * Vcross =
+            //    ggml_reshape_3d(ctx0,
+            //            ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
+            //            n_state/n_head, n_head, M);
+
+            //struct ggml_tensor * V_trans =
+            //    ggml_cpy(ctx0,
+            //            ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
+            //            ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
+
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, wstate.kv_cross.v,
+                        M, n_state/n_head, n_head,
+                        M*ggml_element_size(wstate.kv_cross.v),
+                        M*ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
+                        il*M*ggml_element_size(wstate.kv_cross.v)*n_state);
 
             // ------
 
@@ -2021,7 +2043,7 @@ static bool whisper_decode_internal(
 
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
 
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
@@ -4726,6 +4748,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
     struct ggml_init_params gparams = {
         /*.mem_size   =*/ buf.size(),
        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ false,
     };
 
     struct ggml_context * ctx0 = ggml_init(gparams);