From eb2b086584b44351a2efdd9f09ee5c02bf1939d4 Mon Sep 17 00:00:00 2001 From: Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com> Date: Sun, 19 May 2024 19:18:39 -0700 Subject: [PATCH] Add provisions for windows support for BF16 code including CMake provision for enabling AVX512_BF16 (llama/7258) --- ggml-impl.h | 12 ++++++++++++ ggml.c | 24 ++++++++++++++++-------- ggml.h | 1 + 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/ggml-impl.h b/ggml-impl.h index 59684fa..5ff014f 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -17,6 +17,18 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +#if defined(_WIN32) + +#define m512bh(p) p +#define m512i(p) p + +#else + +#define m512bh(p) (__m512bh)(p) +#define m512i(p) (__m512i)(p) + +#endif + /** * Converts brain16 to float32. * diff --git a/ggml.c b/ggml.c index 3a104c4..53da231 100644 --- a/ggml.c +++ b/ggml.c @@ -406,10 +406,10 @@ void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) { int i = 0; #if defined(__AVX512BF16__) for (; i + 32 <= n; i += 32) { - _mm512_storeu_ps( - (__m512 *)(y + i), - (__m512)_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16), - _mm512_loadu_ps(x + i))); + _mm512_storeu_si512( + (__m512i *)(y + i), + m512i(_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16), + _mm512_loadu_ps(x + i)))); } #endif for (; i < n; i++) { @@ -1666,10 +1666,10 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t __m512 c1 = _mm512_setzero_ps(); __m512 c2 = _mm512_setzero_ps(); for (; i + 64 <= n; i += 64) { - c1 = _mm512_dpbf16_ps(c1, (__m512bh)_mm512_loadu_ps((const float *)(x + i)), - (__m512bh)_mm512_loadu_ps((const float *)(y + i))); - c2 = _mm512_dpbf16_ps(c2, (__m512bh)_mm512_loadu_ps((const float *)(x + i + 32)), - (__m512bh)_mm512_loadu_ps((const float *)(y + i + 32))); + c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))), + m512bh(_mm512_loadu_si512((y + i)))); + c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))), + m512bh(_mm512_loadu_si512((y + i + 32)))); } sumf += (ggml_float)_mm512_reduce_add_ps(c1); sumf += (ggml_float)_mm512_reduce_add_ps(c2); @@ -23137,6 +23137,14 @@ int ggml_cpu_has_avx512_vnni(void) { #endif } +int ggml_cpu_has_avx512_bf16(void) { +#if defined(__AVX512BF16__) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_fma(void) { #if defined(__FMA__) return 1; diff --git a/ggml.h b/ggml.h index 8c13f4b..7747571 100644 --- a/ggml.h +++ b/ggml.h @@ -2390,6 +2390,7 @@ extern "C" { GGML_API int ggml_cpu_has_avx512 (void); GGML_API int ggml_cpu_has_avx512_vbmi(void); GGML_API int ggml_cpu_has_avx512_vnni(void); + GGML_API int ggml_cpu_has_avx512_bf16(void); GGML_API int ggml_cpu_has_fma (void); GGML_API int ggml_cpu_has_neon (void); GGML_API int ggml_cpu_has_arm_fma (void);