952 lines
39 KiB
Diff
952 lines
39 KiB
Diff
From 273f23698c887b52e66c2abec8101b7398f0f9c4 Mon Sep 17 00:00:00 2001
|
|
From: "Benjamin A. Beasley" <code@musicinmybrain.net>
|
|
Date: Wed, 5 Jun 2024 11:06:02 -0400
|
|
Subject: [PATCH] Patch for sleef 3.6
|
|
|
|
---
|
|
...ectorization-on-windows-submodule-sl.patch | 910 ++++++++++++++++++
|
|
python-torch.spec | 11 +
|
|
2 files changed, 921 insertions(+)
|
|
create mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
|
|
|
|
diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
|
|
new file mode 100644
|
|
index 000000000000..562f55b742c2
|
|
--- /dev/null
|
|
+++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
|
|
@@ -0,0 +1,910 @@
|
|
+From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001
|
|
+From: Xu Han <xu.han@intel.com>
|
|
+Date: Sun, 31 Mar 2024 03:07:32 +0000
|
|
+Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef]
|
|
+ (#118980)
|
|
+
|
|
+Enable VEC on Windows OS.
|
|
+1. Fix some type definition gaps between Windows and Linux.
|
|
+2. Fix some operators not supported on Windows, such as [] and /.
|
|
+3. Enable static sleef library build on Windows.
|
|
+4. Disable unsupported function overloading on MSVC.
|
|
+5. Upgrade submodule sleef lib, which fixed build issue on Windows.
|
|
+6. Fixed bazel build issues.
|
|
+7. Fix test app not linking to sleef on Windows.
|
|
+
|
|
+Note: If the rebuild fails after pulling this PR, please sync the `sleef` submodule by running:
|
|
+```cmd
|
|
+git submodule sync
|
|
+git submodule update --init --recursive
|
|
+```
|
|
+
|
|
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980
|
|
+Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet
|
|
+---
|
|
+ aten/src/ATen/CMakeLists.txt | 48 ++++++--------
|
|
+ aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++--
|
|
+ .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++--
|
|
+ .../cpu/vec/vec256/vec256_complex_double.h | 7 +-
|
|
+ .../cpu/vec/vec256/vec256_complex_float.h | 7 +-
|
|
+ aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +-
|
|
+ aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++--
|
|
+ aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++-
|
|
+ aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++--
|
|
+ .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++--
|
|
+ .../cpu/vec/vec512/vec512_complex_double.h | 7 +-
|
|
+ .../cpu/vec/vec512/vec512_complex_float.h | 7 +-
|
|
+ aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +-
|
|
+ aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++--
|
|
+ aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++-
|
|
+ aten/src/ATen/cpu/vec/vec_base.h | 6 ++
|
|
+ caffe2/CMakeLists.txt | 2 +-
|
|
+ third_party/sleef.BUILD | 3 +-
|
|
+ 18 files changed, 194 insertions(+), 93 deletions(-)
|
|
+
|
|
+diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
|
|
+index bf425af5fa9..58d5828e8ca 100644
|
|
+--- a/aten/src/ATen/CMakeLists.txt
|
|
++++ b/aten/src/ATen/CMakeLists.txt
|
|
+@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$")
|
|
+ list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo)
|
|
+ endif()
|
|
+
|
|
+-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
|
|
+- # Preserve values for the main build
|
|
+- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS})
|
|
+- set(__aten_sleef_build_tests ${BUILD_TESTS})
|
|
+-
|
|
+- # Unset our restrictive C++ flags here and reset them later.
|
|
+- # Remove this once we use proper target_compile_options.
|
|
+- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
|
|
+- set(CMAKE_CXX_FLAGS)
|
|
+-
|
|
+- # Bump up optimization level for sleef to -O1, since at -O0 the compiler
|
|
+- # excessively spills intermediate vector registers to the stack
|
|
+- # and makes things run impossibly slowly
|
|
+- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
|
|
+- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0")
|
|
+- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
|
|
+- else()
|
|
+- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1")
|
|
++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
|
|
++ if(NOT MSVC)
|
|
++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler
|
|
++ # excessively spills intermediate vector registers to the stack
|
|
++ # and makes things run impossibly slowly
|
|
++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
|
|
++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0")
|
|
++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
|
|
++ else()
|
|
++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1")
|
|
++ endif()
|
|
+ endif()
|
|
+
|
|
+ if(NOT USE_SYSTEM_SLEEF)
|
|
+- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE)
|
|
+- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE)
|
|
+- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE)
|
|
+- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE)
|
|
+- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE})
|
|
++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE)
|
|
++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE)
|
|
++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE)
|
|
++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE)
|
|
++ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." FORCE)
|
|
+ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
|
|
+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
|
|
+ set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE)
|
|
+@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
|
|
+ endif()
|
|
+ list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef)
|
|
+
|
|
+- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
|
|
+- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
|
|
+-
|
|
+- # Set these back. TODO: Use SLEEF_ to pass these instead
|
|
+- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE)
|
|
+- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE)
|
|
++ if(NOT MSVC)
|
|
++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
|
|
++ endif()
|
|
+ endif()
|
|
+
|
|
+ if(USE_CUDA AND NOT USE_ROCM)
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h
|
|
+index 800b027e469..c431fa3c605 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec256/vec256.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h
|
|
+@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
|
+ }
|
|
+
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
+
|
|
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
+
|
|
+@@ -94,7 +94,8 @@ inline Vectorized<double> cast<double, int64_t>(const Vectorized<int64_t>& src)
|
|
+ }
|
|
+
|
|
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
+-
|
|
++#ifndef _MSC_VER
|
|
++// MSVC is not working well on complex function overload.
|
|
+ template<int64_t scale = 1>
|
|
+ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
|
|
+ inline gather(const double* base_addr, const Vectorized<int64_t>& vindex) {
|
|
+@@ -106,9 +107,10 @@ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorize
|
|
+ inline gather(const float* base_addr, const Vectorized<int32_t>& vindex) {
|
|
+ return _mm256_i32gather_ps(base_addr, vindex, scale);
|
|
+ }
|
|
+-
|
|
++#endif
|
|
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
+-
|
|
++#ifndef _MSC_VER
|
|
++// MSVC is not working well on complex function overload.
|
|
+ template<int64_t scale = 1>
|
|
+ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
|
|
+ inline mask_gather(const Vectorized<double>& src, const double* base_addr,
|
|
+@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized<float>& src, const float* base_addr,
|
|
+ const Vectorized<int32_t>& vindex, Vectorized<float>& mask) {
|
|
+ return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale);
|
|
+ }
|
|
+-
|
|
++#endif
|
|
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
+
|
|
+ // Only works for inputs in the range: [-2^51, 2^51]
|
|
+@@ -302,6 +304,6 @@ inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
|
|
+ return flip8(v);
|
|
+ }
|
|
+
|
|
+-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#endif // (defined(CPU_CAPABILITY_AVX2)
|
|
+
|
|
+ }} // namepsace at::vec::CPU_CAPABILITY
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
|
|
+index 3e26213d6d2..66557436c70 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
|
|
+@@ -7,7 +7,8 @@
|
|
+ #include <ATen/cpu/vec/vec_base.h>
|
|
+ #include <c10/util/irange.h>
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
++#define SLEEF_STATIC_LIBS
|
|
+ #include <sleef.h>
|
|
+ #endif
|
|
+
|
|
+@@ -18,7 +19,18 @@ namespace at::vec {
|
|
+ // See Note [CPU_CAPABILITY namespace]
|
|
+ inline namespace CPU_CAPABILITY {
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
++
|
|
++#ifndef SLEEF_CONST
|
|
++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER)
|
|
++#define SLEEF_CONST const
|
|
++#else
|
|
++#define SLEEF_CONST
|
|
++#endif
|
|
++#define SLEEF_CONST_OLD SLEEF_CONST
|
|
++#else
|
|
++#define SLEEF_CONST_OLD
|
|
++#endif
|
|
+
|
|
+ // bfloat16 conversion
|
|
+ static inline void cvtbf16_fp32(const __m128i& a, __m256& o) {
|
|
+@@ -265,7 +277,8 @@ public:
|
|
+ }
|
|
+ return b;
|
|
+ }
|
|
+- Vectorized<T> map(const __m256 (*const vop)(__m256)) const {
|
|
++
|
|
++ Vectorized<T> map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const {
|
|
+ __m256 lo, hi;
|
|
+ cvt_to_fp32<T>(values, lo, hi);
|
|
+ const auto o1 = vop(lo);
|
|
+@@ -1026,7 +1039,7 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
|
|
+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16);
|
|
+ CONVERT_VECTORIZED_INIT(Half, half);
|
|
+
|
|
+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#else // defined(CPU_CAPABILITY_AVX2)
|
|
+
|
|
+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \
|
|
+ inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
|
|
+@@ -1051,9 +1064,9 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
|
|
+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
|
|
+ CONVERT_NON_VECTORIZED_INIT(Half, half);
|
|
+
|
|
+-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#endif // defined(CPU_CAPABILITY_AVX2)
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \
|
|
+ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
|
|
+ auto values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data)); \
|
|
+@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vec
|
|
+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16);
|
|
+ LOAD_FP32_VECTORIZED_INIT(Half, fp16);
|
|
+
|
|
+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#else // defined(CPU_CAPABILITY_AVX2)
|
|
+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
|
|
+ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
|
|
+ __at_align__ float values[Vectorized<float>::size()]; \
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
|
|
+index f93ea1e63c3..6c198fb37d3 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
|
|
+@@ -8,7 +8,8 @@
|
|
+ #include <ATen/cpu/vec/intrinsics.h>
|
|
+ #include <ATen/cpu/vec/vec_base.h>
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
++#define SLEEF_STATIC_LIBS
|
|
+ #include <sleef.h>
|
|
+ #endif
|
|
+
|
|
+@@ -16,7 +17,7 @@ namespace at::vec {
|
|
+ // See Note [CPU_CAPABILITY namespace]
|
|
+ inline namespace CPU_CAPABILITY {
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
+
|
|
+ template <> class Vectorized<c10::complex<double>> {
|
|
+ private:
|
|
+@@ -145,7 +146,7 @@ public:
|
|
+ auto abs = abs_();
|
|
+ auto zero = _mm256_setzero_pd();
|
|
+ auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ);
|
|
+- auto div = values / abs;
|
|
++ auto div = _mm256_div_pd(values, abs);
|
|
+ return _mm256_blendv_pd(div, zero, mask);
|
|
+ }
|
|
+ __m256d real_() const {
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
|
|
+index 7c142c04b79..c72d4d49274 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
|
|
+@@ -7,7 +7,8 @@
|
|
+ #include <c10/util/irange.h>
|
|
+ #include <ATen/cpu/vec/intrinsics.h>
|
|
+ #include <ATen/cpu/vec/vec_base.h>
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
++#define SLEEF_STATIC_LIBS
|
|
+ #include <sleef.h>
|
|
+ #endif
|
|
+
|
|
+@@ -15,7 +16,7 @@ namespace at::vec {
|
|
+ // See Note [CPU_CAPABILITY namespace]
|
|
+ inline namespace CPU_CAPABILITY {
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
+
|
|
+ template <> class Vectorized<c10::complex<float>> {
|
|
+ private:
|
|
+@@ -180,7 +181,7 @@ public:
|
|
+ auto abs = abs_();
|
|
+ auto zero = _mm256_setzero_ps();
|
|
+ auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ);
|
|
+- auto div = values / abs;
|
|
++ auto div = _mm256_div_ps(values, abs);
|
|
+ return _mm256_blendv_ps(div, zero, mask);
|
|
+ }
|
|
+ __m256 real_() const {
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h
|
|
+index bc82d07edd1..bed6da627af 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h
|
|
+@@ -6,7 +6,8 @@
|
|
+ #include <ATen/cpu/vec/intrinsics.h>
|
|
+ #include <ATen/cpu/vec/vec_base.h>
|
|
+ #include <c10/util/irange.h>
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
++#define SLEEF_STATIC_LIBS
|
|
+ #include <sleef.h>
|
|
+ #endif
|
|
+
|
|
+@@ -15,7 +16,7 @@ namespace at::vec {
|
|
+ inline namespace CPU_CAPABILITY {
|
|
+
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
+
|
|
+ template <> class Vectorized<double> {
|
|
+ private:
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h
|
|
+index 886809a0b8a..0e3664cd37b 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h
|
|
+@@ -6,7 +6,8 @@
|
|
+ #include <ATen/cpu/vec/intrinsics.h>
|
|
+ #include <ATen/cpu/vec/vec_base.h>
|
|
+ #include <c10/util/irange.h>
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
++#define SLEEF_STATIC_LIBS
|
|
+ #include <sleef.h>
|
|
+ #endif
|
|
+
|
|
+@@ -14,7 +15,7 @@ namespace at::vec {
|
|
+ // See Note [CPU_CAPABILITY namespace]
|
|
+ inline namespace CPU_CAPABILITY {
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
+
|
|
+ template <> class Vectorized<float> {
|
|
+ private:
|
|
+@@ -226,14 +227,14 @@ public:
|
|
+ static __m256 vec_factorial_5 =
|
|
+ _mm256_set1_ps(0.00828929059f); // 1/factorial(5)
|
|
+ static __m256 vec_exp_log2ef =
|
|
+- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e)
|
|
++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e)
|
|
+ static __m256 vec_half = _mm256_set1_ps(0.5f);
|
|
+ static __m256 vec_one = _mm256_set1_ps(1.f);
|
|
+ static __m256 vec_zero = _mm256_set1_ps(0.f);
|
|
+ static __m256 vec_two = _mm256_set1_ps(2.f);
|
|
+- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2)
|
|
+- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50);
|
|
+- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218);
|
|
++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2)
|
|
++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50));
|
|
++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218));
|
|
+ static __m256i vec_127 = _mm256_set1_epi32(0x0000007f);
|
|
+ static int n_mantissa_bits = 23;
|
|
+
|
|
+@@ -266,7 +267,7 @@ public:
|
|
+ auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number);
|
|
+ auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127);
|
|
+ vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits);
|
|
+- auto vec_two_pow_n = (__m256)vec_two_pow_n_i;
|
|
++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i);
|
|
+ vec_two_pow_n =
|
|
+ _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask);
|
|
+
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
|
|
+index 4128841701a..85e099904cd 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
|
|
+@@ -41,11 +41,17 @@
|
|
+ namespace at::vec {
|
|
+ inline namespace CPU_CAPABILITY {
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX2)
|
|
+
|
|
++#ifdef _MSC_VER
|
|
++__declspec(align(64)) struct Vectorizedqi {
|
|
++ protected:
|
|
++ __m256i vals;
|
|
++#else
|
|
+ struct Vectorizedqi {
|
|
+ protected:
|
|
+ __m256i vals __attribute__((aligned(64)));
|
|
++#endif
|
|
+
|
|
+ public:
|
|
+ Vectorizedqi() {}
|
|
+@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized<float> src) {
|
|
+ }
|
|
+
|
|
+ template <typename T>
|
|
+-inline void __attribute__((always_inline)) QuantizeAvx2(
|
|
++__FORCE_INLINE void QuantizeAvx2(
|
|
+ const float* src,
|
|
+ T* dst,
|
|
+ int len,
|
|
+@@ -1331,5 +1337,5 @@ Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const V
|
|
+ return a.maximum(b);
|
|
+ }
|
|
+
|
|
+-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
|
++#endif // if defined(CPU_CAPABILITY_AVX2)
|
|
+ }} // namespace at::vec::CPU_CAPABILITY
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h
|
|
+index fe96d123e64..87f723d782c 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec512/vec512.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h
|
|
+@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
|
+ }
|
|
+
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX512)
|
|
+
|
|
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
+
|
|
+@@ -80,7 +80,8 @@ inline Vectorized<double> cast<double, int64_t>(const Vectorized<int64_t>& src)
|
|
+ }
|
|
+
|
|
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
+-
|
|
++#ifndef _MSC_VER
|
|
++// MSVC is not working well on complex function overload.
|
|
+ template<int64_t scale = 1>
|
|
+ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
|
|
+ inline gather(const double* base_addr, const Vectorized<int64_t>& vindex) {
|
|
+@@ -92,9 +93,10 @@ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorize
|
|
+ inline gather(const float* base_addr, const Vectorized<int32_t>& vindex) {
|
|
+ return _mm512_i32gather_ps(vindex, base_addr, scale);
|
|
+ }
|
|
+-
|
|
++#endif
|
|
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
+-
|
|
++#ifndef _MSC_VER
|
|
++// MSVC is not working well on complex function overload.
|
|
+ template<int64_t scale = 1>
|
|
+ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
|
|
+ inline mask_gather(const Vectorized<double>& src, const double* base_addr,
|
|
+@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized<float>& src, const float* base_addr,
|
|
+ auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ);
|
|
+ return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale);
|
|
+ }
|
|
+-
|
|
++#endif
|
|
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
+
|
|
+ template<>
|
|
+@@ -270,6 +272,6 @@ inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
|
|
+ return flip8(v);
|
|
+ }
|
|
+
|
|
+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#endif // defined(CPU_CAPABILITY_AVX512)
|
|
+
|
|
+ }}}
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
|
|
+index f9fc92d52bf..eb3b6a72240 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
|
|
+@@ -7,7 +7,8 @@
|
|
+ #include <ATen/cpu/vec/vec_base.h>
|
|
+ #include <c10/util/irange.h>
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX512)
|
|
++#define SLEEF_STATIC_LIBS
|
|
+ #include <sleef.h>
|
|
+ #endif
|
|
+
|
|
+@@ -16,7 +17,18 @@ namespace vec {
|
|
+ // See Note [CPU_CAPABILITY namespace]
|
|
+ inline namespace CPU_CAPABILITY {
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX512)
|
|
++
|
|
++#ifndef SLEEF_CONST
|
|
++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER)
|
|
++#define SLEEF_CONST const
|
|
++#else
|
|
++#define SLEEF_CONST
|
|
++#endif
|
|
++#define SLEEF_CONST_OLD SLEEF_CONST
|
|
++#else
|
|
++#define SLEEF_CONST_OLD
|
|
++#endif
|
|
+
|
|
+ // bfloat16 conversion
|
|
+ static inline void cvtbf16_fp32(const __m256i& a, __m512& o) {
|
|
+@@ -362,7 +374,8 @@ public:
|
|
+ }
|
|
+ #pragma clang diagnostic push
|
|
+ #pragma clang diagnostic ignored "-Wignored-qualifiers"
|
|
+- Vectorized<T> map(const __m512 (*const vop)(__m512)) const {
|
|
++
|
|
++ Vectorized<T> map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const {
|
|
+ __m512 lo, hi;
|
|
+ cvt_to_fp32<T>(values, lo, hi);
|
|
+ const auto o1 = vop(lo);
|
|
+@@ -1571,7 +1584,7 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
|
|
+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16);
|
|
+ CONVERT_VECTORIZED_INIT(Half, half);
|
|
+
|
|
+-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#else //defined(CPU_CAPABILITY_AVX512)
|
|
+
|
|
+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \
|
|
+ inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
|
|
+@@ -1601,9 +1614,9 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
|
|
+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
|
|
+ CONVERT_NON_VECTORIZED_INIT(Half, half);
|
|
+
|
|
+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#endif // defined(CPU_CAPABILITY_AVX512)
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX512)
|
|
+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \
|
|
+ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
|
|
+ auto values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)); \
|
|
+@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vec
|
|
+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16);
|
|
+ LOAD_FP32_VECTORIZED_INIT(Half, fp16);
|
|
+
|
|
+-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#else // defined(CPU_CAPABILITY_AVX512)
|
|
+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
|
|
+ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
|
|
+ __at_align__ float values[Vectorized<float>::size()]; \
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
|
|
+index 02aa3a87cc1..c35204f9da2 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
|
|
+@@ -7,7 +7,8 @@
|
|
+ #include <c10/util/irange.h>
|
|
+ #include <ATen/cpu/vec/intrinsics.h>
|
|
+ #include <ATen/cpu/vec/vec_base.h>
|
|
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX512)
|
|
++#define SLEEF_STATIC_LIBS
|
|
+ #include <sleef.h>
|
|
+ #endif
|
|
+
|
|
+@@ -16,7 +17,7 @@ namespace vec {
|
|
+ // See Note [CPU_CAPABILITY namespace]
|
|
+ inline namespace CPU_CAPABILITY {
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX512)
|
|
+
|
|
+ template <> class Vectorized<c10::complex<double>> {
|
|
+ private:
|
|
+@@ -203,7 +204,7 @@ public:
|
|
+ auto abs = abs_();
|
|
+ auto zero = _mm512_setzero_pd();
|
|
+ auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ);
|
|
+- auto div = values / abs;
|
|
++ auto div = _mm512_div_pd(values, abs);
|
|
+ return _mm512_mask_blend_pd(mask, div, zero);
|
|
+ }
|
|
+ __m512d real_() const {
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
|
|
+index a5d790c98b2..2801e484d94 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
|
|
+@@ -7,7 +7,8 @@
|
|
+ #include <c10/util/irange.h>
|
|
+ #include <ATen/cpu/vec/intrinsics.h>
|
|
+ #include <ATen/cpu/vec/vec_base.h>
|
|
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX512)
|
|
++#define SLEEF_STATIC_LIBS
|
|
+ #include <sleef.h>
|
|
+ #endif
|
|
+
|
|
+@@ -16,7 +17,7 @@ namespace vec {
|
|
+ // See Note [CPU_CAPABILITY namespace]
|
|
+ inline namespace CPU_CAPABILITY {
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX512)
|
|
+
|
|
+ template <> class Vectorized<c10::complex<float>> {
|
|
+ private:
|
|
+@@ -708,7 +709,7 @@ public:
|
|
+ auto abs = abs_();
|
|
+ auto zero = _mm512_setzero_ps();
|
|
+ auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ);
|
|
+- auto div = values / abs;
|
|
++ auto div = _mm512_div_ps(values, abs);
|
|
+ return _mm512_mask_blend_ps(mask, div, zero);
|
|
+ }
|
|
+ __m512 real_() const {
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
|
|
+index 27b2753c903..508ab257e60 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
|
|
+@@ -6,7 +6,8 @@
|
|
+ #include <ATen/cpu/vec/intrinsics.h>
|
|
+ #include <ATen/cpu/vec/vec_base.h>
|
|
+ #include <c10/util/irange.h>
|
|
+-#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
|
|
++#if (defined(CPU_CAPABILITY_AVX512))
|
|
++#define SLEEF_STATIC_LIBS
|
|
+ #include <sleef.h>
|
|
+ #endif
|
|
+
|
|
+@@ -15,7 +16,7 @@ namespace vec {
|
|
+ // See Note [CPU_CAPABILITY namespace]
|
|
+ inline namespace CPU_CAPABILITY {
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX512)
|
|
+
|
|
+ template <> class Vectorized<double> {
|
|
+ private:
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h
|
|
+index ba5738687fd..a08df3c141a 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h
|
|
+@@ -6,7 +6,8 @@
|
|
+ #include <ATen/cpu/vec/intrinsics.h>
|
|
+ #include <ATen/cpu/vec/vec_base.h>
|
|
+ #include <c10/util/irange.h>
|
|
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX512)
|
|
++#define SLEEF_STATIC_LIBS
|
|
+ #include <sleef.h>
|
|
+ #endif
|
|
+
|
|
+@@ -15,7 +16,7 @@ namespace vec {
|
|
+ // See Note [CPU_CAPABILITY namespace]
|
|
+ inline namespace CPU_CAPABILITY {
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX512)
|
|
+
|
|
+ template <> class Vectorized<float> {
|
|
+ private:
|
|
+@@ -246,14 +247,14 @@ public:
|
|
+ static __m512 vec_factorial_5 =
|
|
+ _mm512_set1_ps(0.00828929059f); // 1/factorial(5)
|
|
+ static __m512 vec_exp_log2ef =
|
|
+- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e)
|
|
++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
|
|
+ static __m512 vec_half = _mm512_set1_ps(0.5f);
|
|
+ static __m512 vec_one = _mm512_set1_ps(1.f);
|
|
+ static __m512 vec_zero = _mm512_set1_ps(0.f);
|
|
+ static __m512 vec_two = _mm512_set1_ps(2.f);
|
|
+- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2)
|
|
+- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50);
|
|
+- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218);
|
|
++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2)
|
|
++ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
|
|
++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
|
|
+ static __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
|
|
+ static int n_mantissa_bits = 23;
|
|
+
|
|
+@@ -288,7 +289,7 @@ public:
|
|
+ auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number);
|
|
+ auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127);
|
|
+ vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits);
|
|
+- auto vec_two_pow_n = (__m512)vec_two_pow_n_i;
|
|
++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i);
|
|
+ vec_two_pow_n =
|
|
+ _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero);
|
|
+
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
|
|
+index e0713d01312..a5671ed4a50 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
|
|
+@@ -42,11 +42,17 @@ namespace at {
|
|
+ namespace vec {
|
|
+ inline namespace CPU_CAPABILITY {
|
|
+
|
|
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
|
++#if defined(CPU_CAPABILITY_AVX512)
|
|
+
|
|
++#ifdef _MSC_VER
|
|
++__declspec(align(64)) struct Vectorizedqi {
|
|
++ protected:
|
|
++ __m512i vals;
|
|
++#else
|
|
+ struct Vectorizedqi {
|
|
+ protected:
|
|
+ __m512i vals __attribute__((aligned(64)));
|
|
++#endif
|
|
+
|
|
+ public:
|
|
+ Vectorizedqi() {}
|
|
+@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized<float> src) {
|
|
+ }
|
|
+
|
|
+ template <typename T>
|
|
+-inline void __attribute__((always_inline)) QuantizeAvx512(
|
|
++__FORCE_INLINE void QuantizeAvx512(
|
|
+ const float* src,
|
|
+ T* dst,
|
|
+ int len,
|
|
+@@ -525,10 +531,17 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
|
|
+ Vectorized<float> scale,
|
|
+ Vectorized<float> zero_point,
|
|
+ Vectorized<float> scale_neg_zp_premul) const {
|
|
++ #if defined(_MSC_VER) && !defined(__clang__)
|
|
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
|
|
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
|
|
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
|
|
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
|
|
++ #else
|
|
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
|
|
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
|
|
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
|
|
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
|
|
++ #endif
|
|
+
|
|
+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
|
|
+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
|
|
+@@ -549,10 +562,17 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
|
|
+ float_vec_return_type dequantize(
|
|
+ Vectorized<float> scale,
|
|
+ Vectorized<float> zero_point) const {
|
|
++ #if defined(_MSC_VER) && !defined(__clang__)
|
|
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
|
|
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
|
|
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
|
|
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
|
|
++ #else
|
|
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
|
|
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
|
|
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
|
|
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
|
|
++ #endif
|
|
+
|
|
+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
|
|
+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
|
|
+@@ -598,20 +618,34 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
|
|
+ }
|
|
+
|
|
+ int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
|
|
++ #if defined(_MSC_VER) && !defined(__clang__)
|
|
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
|
|
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
|
|
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
|
|
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
|
|
++ #else
|
|
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
|
|
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
|
|
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
|
|
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
|
|
++ #endif
|
|
+
|
|
+ __m512i int32_val0 = cvtepi8_epi32(int_val0);
|
|
+ __m512i int32_val1 = cvtepi8_epi32(int_val1);
|
|
+ __m512i int32_val2 = cvtepi8_epi32(int_val2);
|
|
+ __m512i int32_val3 = cvtepi8_epi32(int_val3);
|
|
+
|
|
++ #if defined(_MSC_VER) && !defined(__clang__)
|
|
++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]);
|
|
++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]);
|
|
++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]);
|
|
++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]);
|
|
++ #else
|
|
+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
|
|
+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
|
|
+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
|
|
+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
|
|
++ #endif
|
|
+
|
|
+ __m512i int32_b0 = cvtepi8_epi32(int_b0);
|
|
+ __m512i int32_b1 = cvtepi8_epi32(int_b1);
|
|
+@@ -721,10 +755,17 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
|
|
+ Vectorized<float> scale,
|
|
+ Vectorized<float> zero_point,
|
|
+ Vectorized<float> scale_zp_premul) const {
|
|
++ #if defined(_MSC_VER) && !defined(__clang__)
|
|
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
|
|
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
|
|
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
|
|
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
|
|
++ #else
|
|
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
|
|
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
|
|
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
|
|
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
|
|
++ #endif
|
|
+
|
|
+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
|
|
+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
|
|
+@@ -746,10 +787,17 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
|
|
+ float_vec_return_type dequantize(
|
|
+ Vectorized<float> scale,
|
|
+ Vectorized<float> zero_point) const {
|
|
++ #if defined(_MSC_VER) && !defined(__clang__)
|
|
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
|
|
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
|
|
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
|
|
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
|
|
++ #else
|
|
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
|
|
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
|
|
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
|
|
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
|
|
++ #endif
|
|
+
|
|
+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
|
|
+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
|
|
+@@ -796,20 +844,34 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
|
|
+ }
|
|
+
|
|
+ int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
|
|
++ #if defined(_MSC_VER) && !defined(__clang__)
|
|
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
|
|
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
|
|
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
|
|
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
|
|
++ #else
|
|
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
|
|
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
|
|
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
|
|
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
|
|
++ #endif
|
|
+
|
|
+ __m512i int32_val0 = cvtepu8_epi32(int_val0);
|
|
+ __m512i int32_val1 = cvtepu8_epi32(int_val1);
|
|
+ __m512i int32_val2 = cvtepu8_epi32(int_val2);
|
|
+ __m512i int32_val3 = cvtepu8_epi32(int_val3);
|
|
+
|
|
++ #if defined(_MSC_VER) && !defined(__clang__)
|
|
++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]);
|
|
++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]);
|
|
++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]);
|
|
++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]);
|
|
++ #else
|
|
+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
|
|
+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
|
|
+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
|
|
+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
|
|
++ #endif
|
|
+
|
|
+ __m512i int32_b0 = cvtepu8_epi32(int_b0);
|
|
+ __m512i int32_b1 = cvtepu8_epi32(int_b1);
|
|
+diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h
|
|
+index adf81dd915c..20cb8ef6dbc 100644
|
|
+--- a/aten/src/ATen/cpu/vec/vec_base.h
|
|
++++ b/aten/src/ATen/cpu/vec/vec_base.h
|
|
+@@ -36,6 +36,12 @@
|
|
+ #include <c10/util/irange.h>
|
|
+ #include <c10/util/Load.h>
|
|
+
|
|
++#if defined(__GNUC__)
|
|
++#define __FORCE_INLINE __attribute__((always_inline)) inline
|
|
++#elif defined(_MSC_VER)
|
|
++#define __FORCE_INLINE __forceinline
|
|
++#endif
|
|
++
|
|
+ // These macros helped us unify vec_base.h
|
|
+ #ifdef CPU_CAPABILITY_AVX512
|
|
+ #if defined(__GNUC__)
|
|
+diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
|
|
+index a6b6f0f7d1d..15d37cf4861 100644
|
|
+--- a/caffe2/CMakeLists.txt
|
|
++++ b/caffe2/CMakeLists.txt
|
|
+@@ -1787,7 +1787,7 @@ if(BUILD_TEST)
|
|
+ endif()
|
|
+ else()
|
|
+ add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")
|
|
+- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main)
|
|
++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main)
|
|
+ endif()
|
|
+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $<INSTALL_INTERFACE:include>)
|
|
+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include>)
|
|
+diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD
|
|
+index 573f9c5b54a..f22a6e905e2 100644
|
|
+--- a/third_party/sleef.BUILD
|
|
++++ b/third_party/sleef.BUILD
|
|
+@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [
|
|
+ SLEEF_PRIVATE_INCLUDES = [
|
|
+ "-Iexternal/sleef/src/arch",
|
|
+ "-Iexternal/sleef/src/common",
|
|
++ "-Iexternal/sleef/src/libm",
|
|
+ ]
|
|
+
|
|
+ SLEEF_PUBLIC_INCLUDES = [
|
|
+@@ -201,8 +202,6 @@ cc_library(
|
|
+ srcs = [
|
|
+ "src/libm/rempitab.c",
|
|
+ "src/libm/sleefdp.c",
|
|
+- "src/libm/sleefld.c",
|
|
+- "src/libm/sleefqp.c",
|
|
+ "src/libm/sleefsp.c",
|
|
+ ],
|
|
+ hdrs = SLEEF_PUBLIC_HEADERS,
|
|
+--
|
|
+2.45.1
|
|
+
|
|
diff --git a/python-torch.spec b/python-torch.spec
|
|
index d50687a5174a..63600c2e8c39 100644
|
|
--- a/python-torch.spec
|
|
+++ b/python-torch.spec
|
|
@@ -176,6 +176,17 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch
|
|
Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch
|
|
%endif
|
|
|
|
+# Enable x86 CPU vectorization on windows [submodule sleef] (#118980)
|
|
+# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370
|
|
+#
|
|
+# Despite the title, this patch fixes compatibility with sleef 3.6 by including
|
|
+# a backwards-compatible version of the fix from
|
|
+# https://github.com/pytorch/pytorch/pull/122723.
|
|
+# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef
|
|
+# git submodule (because the release archive contains an actual sleef source
|
|
+# tree instead, so this would not apply.)
|
|
+Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
|
|
+
|
|
%if %{with rocm}
|
|
# ROCm patches
|
|
# https://github.com/pytorch/pytorch/pull/120551
|
|
--
|
|
2.45.1
|
|
|