sync : ggml (ggml-alloc + linker + gguf fixes) (#1501)
parent bebf0da983
commit d4353e48f7

ggml-alloc.c (23 changes)
@@ -446,12 +446,14 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
     return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
 }
 
-static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view) {
+static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
     ggml_tallocr_t alloc = node_tallocr(galloc, view);
 
     //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
     GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
-    view->backend = view->view_src->backend;
+    if (update_backend) {
+        view->backend = view->view_src->backend;
+    }
     view->buffer  = view->view_src->buffer;
     view->data    = (char *)view->view_src->data + view->view_offs;
 
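The new update_backend flag controls whether a view inherits the backend of its view_src: the call sites in the hunks below pass true for freshly discovered views and false for the in-place reuse paths in allocate_node(), where the node's backend was already assigned and must be preserved. A minimal sketch of the semantics (stand-in struct, not the real ggml_tensor, which has many more fields):

    #include <stdbool.h>

    struct view_sketch {
        int backend;      // backend currently assigned to this tensor
        int src_backend;  // backend of the tensor it is a view of
    };

    static void init_view_sketch(struct view_sketch * v, bool update_backend) {
        if (update_backend) {
            v->backend = v->src_backend; // fresh view: inherit from the source
        }
        // in-place reuse: keep whatever backend the caller already assigned
    }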
@@ -469,7 +471,7 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
 
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            init_view(galloc, node);
+            init_view(galloc, node, true);
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -499,15 +501,14 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
                     AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                     node->view_src = view_src;
                     view_src_hn->n_views += 1;
-                    init_view(galloc, node);
+                    init_view(galloc, node, false);
                     return;
                 }
-            }
-            else {
+            } else {
                 AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                 node->view_src = parent;
                 p_hn->n_views += 1;
-                init_view(galloc, node);
+                init_view(galloc, node, false);
                 return;
             }
         }
@@ -537,7 +538,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
             hash_get(galloc, view_src)->n_views += 1;
             if (node->buffer == NULL && node->data != NULL) {
                 // view of a pre-allocated tensor, didn't call init_view() yet
-                init_view(galloc, node);
+                init_view(galloc, node, true);
             }
         }
 
@@ -548,7 +549,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
             }
             hash_get(galloc, parent)->n_children += 1;
             if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                init_view(galloc, parent);
+                init_view(galloc, parent, true);
             }
         }
     }
@@ -663,7 +664,7 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st
     return max_size;
 }
 
-void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_alloct) {
+void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
     const size_t hash_size = hash_set.size;
 
     GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
@@ -686,7 +687,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
     // reset hash values
     memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
 
-    galloc->hash_allocs = hash_node_alloct;
+    galloc->hash_allocs = hash_node_talloc;
 
     ggml_tallocr_alloc_graph_impl(galloc, graph);
 
@@ -1368,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
     float max = x[0];
     float sum_w = weights[0];
     float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
     for (int i = 1; i < n; ++i) {
+#endif
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
         float w = weights[i];
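This is the "linker" part of the sync: with HAVE_BUGGY_APPLE_LINKER defined, the induction variable becomes volatile, which (per the in-diff comment) keeps the compiler from unrolling the hot quantization loop and thereby sidesteps a bug in Apple ld64 1015.7. A self-contained sketch of the same pattern (illustrative, not the ggml code):

    float sum_rolled(const float * x, int n) {
        float s = 0.0f;
    #ifdef HAVE_BUGGY_APPLE_LINKER
        // every access to a volatile i is an observable side effect,
        // so the compiler must keep the loop in its rolled form
        for (volatile int i = 0; i < n; ++i) {
            s += x[i];
        }
    #else
        for (int i = 0; i < n; ++i) {
            s += x[i];
        }
    #endif
        return s;
    }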
ggml.c (341 changes)
@@ -5024,8 +5024,13 @@ struct ggml_tensor * ggml_rope_back(
         int                   n_dims,
         int                   mode,
         int                   n_ctx,
+        int                   n_orig_ctx,
         float                 freq_base,
         float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow,
         float                 xpos_base,
         bool                  xpos_down) {
     GGML_ASSERT(ggml_is_vector(b));
@@ -5042,11 +5047,15 @@ struct ggml_tensor * ggml_rope_back(
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
-    int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
-    memcpy(params + 4, &freq_base,  sizeof(float));
-    memcpy(params + 5, &freq_scale, sizeof(float));
-    memcpy(params + 6, &xpos_base,  sizeof(float));
-    memcpy(params + 7, &xpos_down,  sizeof(bool));
+    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+    memcpy(params +  5, &freq_base,   sizeof(float));
+    memcpy(params +  6, &freq_scale,  sizeof(float));
+    memcpy(params +  7, &ext_factor,  sizeof(float));
+    memcpy(params +  8, &attn_factor, sizeof(float));
+    memcpy(params +  9, &beta_fast,   sizeof(float));
+    memcpy(params + 10, &beta_slow,   sizeof(float));
+    memcpy(params + 11, &xpos_base,   sizeof(float));
+    memcpy(params + 12, &xpos_down,   sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_ROPE_BACK;
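The hunk above re-packs the RoPE backward parameters: op_params is an int32_t array, the integer options occupy slots 0-4, and the float (and bool) options are bit-copied into slots 5-12 with memcpy, since a plain cast would convert the value rather than preserve its bits. A small round-trip demo of the technique (standalone sketch, not ggml API):

    #include <stdint.h>
    #include <string.h>

    int main(void) {
        int32_t params[13] = { /*n_past*/ 0 };
        float freq_base = 10000.0f;

        memcpy(params + 5, &freq_base, sizeof(float)); // pack: copy raw bits

        float out;
        memcpy(&out, params + 5, sizeof(float));       // unpack: bit-exact
        return out == 10000.0f ? 0 : 1;
    }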
@@ -9376,7 +9385,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif
 
-
 static void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -10946,7 +10954,8 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
+        const bool forward) {
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
@@ -11005,6 +11014,11 @@ static void ggml_compute_forward_rope_f32(
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
 
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
     const int32_t * pos = (const int32_t *) src1->data;
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
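The sin_sign trick introduced above rests on a standard identity: a 2x2 rotation matrix is orthogonal, so its inverse is its transpose, and transposing only moves the sign of the sine terms:

\[
R(\theta) = \begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix},
\qquad
R(\theta)^{-1} = R(\theta)^{\mathsf T} = \begin{pmatrix} \cos\theta & \sin\theta \\ -\sin\theta & \cos\theta \end{pmatrix} = R(-\theta).
\]

Since cos is even and sin is odd, multiplying sin_theta by -1.0f applies the inverse rotation, which is exactly what the backward pass needs.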
@@ -11021,9 +11035,9 @@ static void ggml_compute_forward_rope_f32(
                 float block_theta = MAX(p - (n_ctx - 2), 0);
                 for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                     const float cos_theta = cosf(theta_base);
-                    const float sin_theta = sinf(theta_base);
+                    const float sin_theta = sinf(theta_base) * sin_sign;
                     const float cos_block_theta = cosf(block_theta);
-                    const float sin_block_theta = sinf(block_theta);
+                    const float sin_block_theta = sinf(block_theta) * sin_sign;
 
                     theta_base *= theta_scale;
                     block_theta *= theta_scale;
@@ -11047,6 +11061,7 @@ static void ggml_compute_forward_rope_f32(
                     rope_yarn(
                         theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                     );
+                    sin_theta *= sin_sign;
 
                     // zeta scaling for xPos only:
                     float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@@ -11077,6 +11092,7 @@ static void ggml_compute_forward_rope_f32(
                             theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                             &cos_theta, &sin_theta
                         );
+                        sin_theta *= sin_sign;
 
                         theta_base *= theta_scale;
 
@@ -11102,7 +11118,8 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
+        const bool forward) {
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
@@ -11154,6 +11171,11 @@ static void ggml_compute_forward_rope_f16(
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
 
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
     const int32_t * pos = (const int32_t *) src1->data;
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11170,9 +11192,9 @@ static void ggml_compute_forward_rope_f16(
                 float block_theta = MAX(p - (n_ctx - 2), 0);
                 for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                     const float cos_theta = cosf(theta_base);
-                    const float sin_theta = sinf(theta_base);
+                    const float sin_theta = sinf(theta_base) * sin_sign;
                     const float cos_block_theta = cosf(block_theta);
-                    const float sin_block_theta = sinf(block_theta);
+                    const float sin_block_theta = sinf(block_theta) * sin_sign;
 
                     theta_base *= theta_scale;
                     block_theta *= theta_scale;
@@ -11196,6 +11218,7 @@ static void ggml_compute_forward_rope_f16(
                     rope_yarn(
                         theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                     );
+                    sin_theta *= sin_sign;
 
                     theta_base *= theta_scale;
 
@@ -11222,6 +11245,7 @@ static void ggml_compute_forward_rope_f16(
                             theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                             &cos_theta, &sin_theta
                         );
+                        sin_theta *= sin_sign;
 
                         theta_base *= theta_scale;
 
@@ -11251,11 +11275,11 @@ static void ggml_compute_forward_rope(
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_rope_f16(params, src0, src1, dst);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
             } break;
         case GGML_TYPE_F32:
            {
-                ggml_compute_forward_rope_f32(params, src0, src1, dst);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
            } break;
        default:
            {
@@ -11266,216 +11290,6 @@ static void ggml_compute_forward_rope(
 
 // ggml_compute_forward_rope_back
 
-static void ggml_compute_forward_rope_back_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-
-    float freq_base;
-    float freq_scale;
-
-    // these two only relevant for xPos RoPE:
-    float xpos_base;
-    bool xpos_down;
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-    const int n_ctx  = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
-    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
-    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
-    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    assert(nb0 == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
-    const bool is_neox = mode & 2;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = freq_scale * (float)p;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
-                        // zeta scaling for xPos only:
-                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
-                        if (xpos_down) zeta = 1.0f / zeta;
-
-                        theta_base *= theta_scale;
-
-                        const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              float *       dx = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float dy0 = dy[0];
-                        const float dy1 = dy[1];
-
-                        dx[0] =   dy0*cos_theta*zeta + dy1*sin_theta*zeta;
-                        dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  float *       dx = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float dy0 = dy[0];
-                            const float dy1 = dy[n_dims/2];
-
-                            dx[0]        =   dy0*cos_theta + dy1*sin_theta;
-                            dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope_back_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    assert(nb0 == sizeof(ggml_fp16_t));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-
-    const bool is_neox = mode & 2;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = (float)p;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
-                        theta_base *= theta_scale;
-
-                        const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              ggml_fp16_t *       dx = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float dy0 = GGML_FP16_TO_FP32(dy[0]);
-                        const float dy1 = GGML_FP16_TO_FP32(dy[1]);
-
-                        dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                        dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  ggml_fp16_t *       dx = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float dy0 = GGML_FP16_TO_FP32(dy[0]);
-                            const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
-
-                            dx[0]        = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                            dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
 static void ggml_compute_forward_rope_back(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -11484,11 +11298,11 @@ static void ggml_compute_forward_rope_back(
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-                ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
             } break;
         case GGML_TYPE_F32:
            {
-                ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
            } break;
        default:
            {
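With the two hunks above, the dedicated ggml_compute_forward_rope_back_f32/f16 kernels are deleted and the backward pass reuses the forward kernels with forward = false, i.e. with sin negated. A self-contained check of the round-trip property that makes this valid (illustrative only, not ggml code):

    #include <assert.h>
    #include <math.h>

    int main(void) {
        const float theta = 0.3f;
        const float x0 = 1.0f, x1 = 2.0f;

        // forward rotation (sin_sign = +1)
        const float y0 = x0*cosf(theta) - x1*sinf(theta);
        const float y1 = x0*sinf(theta) + x1*cosf(theta);

        // same kernel with sin_sign = -1 applies the inverse rotation
        const float s  = -sinf(theta);
        const float z0 = y0*cosf(theta) - y1*s;
        const float z1 = y0*s + y1*cosf(theta);

        assert(fabsf(z0 - x0) < 1e-5f && fabsf(z1 - x1) < 1e-5f);
        return 0;
    }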
@@ -14923,17 +14737,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             // necessary for llama
             if (src0->grad) {
                 //const int n_past = ((int32_t *) tensor->op_params)[0];
                 const int n_dims = ((int32_t *) tensor->op_params)[1];
                 const int mode   = ((int32_t *) tensor->op_params)[2];
                 const int n_ctx  = ((int32_t *) tensor->op_params)[3];
-                float freq_base;
-                float freq_scale;
-                float xpos_base;
-                bool  xpos_down;
-                memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+
+                memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
 
                 src0->grad = ggml_add_or_set(ctx,
                         src0->grad,
@@ -14943,8 +14760,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             n_dims,
                             mode,
                             n_ctx,
+                            n_orig_ctx,
                             freq_base,
                             freq_scale,
+                            ext_factor,
+                            attn_factor,
+                            beta_fast,
+                            beta_slow,
                             xpos_base,
                             xpos_down),
                         zero_table);
@@ -14954,17 +14776,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 if (src0->grad) {
                     //const int n_past = ((int32_t *) tensor->op_params)[0];
                     const int n_dims = ((int32_t *) tensor->op_params)[1];
                     const int mode   = ((int32_t *) tensor->op_params)[2];
                     const int n_ctx  = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
-                    float freq_scale;
-                    float xpos_base;
-                    bool xpos_down;
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
 
                     src0->grad = ggml_add_or_set(ctx,
                             src0->grad,
@@ -14973,14 +14798,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         src1,
                         n_dims,
                         mode,
-                        0,
                         n_ctx,
+                        n_orig_ctx,
                         freq_base,
                         freq_scale,
-                        0.0f,
-                        1.0f,
-                        0.0f,
-                        0.0f,
+                        ext_factor,
+                        attn_factor,
+                        beta_fast,
+                        beta_slow,
                         xpos_base,
                         xpos_down,
                         false),
@@ -18248,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
 
-        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -18295,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                     case GGUF_TYPE_STRING:
                         {
                             kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
-                            for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                            for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                 ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                             }
                         } break;
@@ -18323,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
 
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
 
             for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@@ -18370,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     // compute the total size of the data section, taking into account the alignment
     {
         ctx->size = 0;
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
 
             const int64_t ne =
@@ -18439,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         ggml_set_no_alloc(ctx_data, true);
 
         // create the tensors
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
            const int64_t ne[GGML_MAX_DIMS] = {
                ctx->infos[i].ne[0],
                ctx->infos[i].ne[1],
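These uint32_t to uint64_t loop-counter changes are the "gguf fixes": as of GGUF v2 the header stores n_kv and n_tensors as 64-bit counts, and comparing a 32-bit index against a 64-bit bound lets the index wrap before it ever reaches a count above UINT32_MAX, producing an infinite loop. A minimal illustration of the hazard (sketch, not the gguf code):

    #include <stdint.h>

    // correct: the index type matches the 64-bit count
    uint64_t visit_all(uint64_t n) {
        uint64_t seen = 0;
        for (uint64_t i = 0; i < n; ++i) {
            seen++;
        }
        return seen;
    }
    // with `for (uint32_t i = 0; i < n; ++i)` and n > UINT32_MAX,
    // i wraps back to 0 at 2^32 and the loop never terminates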
ggml.h (5 changes)
@@ -1371,8 +1371,13 @@ extern "C" {
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
+            int                   n_orig_ctx,
             float                 freq_base,
             float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow,
             float                 xpos_base,
             bool                  xpos_down);
 