From d73a63629e5b15ff1271d34cb3604cd28edea9f2 Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Mon, 11 Mar 2024 10:28:51 +0100 Subject: [PATCH] ggml, ci : Windows ARM runner and build fixes (llama/5979) * windows arm ci * fix `error C2078: too many initializers` with ggml_vld1q_u32 macro for MSVC ARM64 * fix `warning C4146: unary minus operator applied to unsigned type, result still unsigned` * fix `error C2065: '__fp16': undeclared identifier` --- ggml-impl.h | 8 ++++++-- ggml-quants.c | 16 ++++++++-------- ggml.c | 4 ++-- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/ggml-impl.h b/ggml-impl.h index c5637e4..e68b728 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -53,26 +53,30 @@ extern "C" { // #include +typedef __fp16 ggml_fp16_internal_t; + #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - __fp16 tmp; + ggml_fp16_internal_t tmp; memcpy(&tmp, &h, sizeof(ggml_fp16_t)); return (float)tmp; } static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { ggml_fp16_t res; - __fp16 tmp = f; + ggml_fp16_internal_t tmp = f; memcpy(&res, &tmp, sizeof(ggml_fp16_t)); return res; } #else +typedef uint16_t ggml_fp16_internal_t; + #ifdef __wasm_simd128__ #include #else diff --git a/ggml-quants.c b/ggml-quants.c index f9a3d9f..86b0764 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -9374,15 +9374,15 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * const uint8x16_t idx_l = vld1q_u8(qs); qs += 16; idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256)); - const uint32x4_t aux32x4_0 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], - iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]}; - const uint32x4_t aux32x4_1 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], - iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]}; + const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], + iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]); + const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], + iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256)); - const uint32x4_t aux32x4_2 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], - iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]}; - const uint32x4_t aux32x4_3 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], - iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]}; + const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], + iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]); + const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], + iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16))); diff --git a/ggml.c b/ggml.c index 80efa6f..9a7bd1d 100644 --- a/ggml.c +++ b/ggml.c @@ -857,7 +857,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F16x8 float16x8_t #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) #define GGML_F16x8_SET1(x) vdupq_n_f16(x) - #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x)) + #define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x)) #define GGML_F16x8_STORE vst1q_f16 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) #define GGML_F16x8_ADD vaddq_f16 @@ -900,7 +900,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F32Cx4 float32x4_t #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) - #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) + #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x))) #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) #define GGML_F32Cx4_ADD vaddq_f32