diff --git a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch deleted file mode 100644 index 413c60d..0000000 --- a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 20 Jul 2024 05:37:15 -0600 -Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM - -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - cmake/Dependencies.cmake | 3 ++- - 2 files changed, 3 insertions(+), 1 deletion(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index c4cd4b2c2a98..2068f7c6c4f2 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF - "USE_CUDNN" OFF) - cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) - option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) -+option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) - option(USE_KINETO "Use Kineto profiling library" ON) - option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) - option(USE_FAKELOWP "Use FakeLowp operators" OFF) -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..192dac46f13b 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -706,6 +706,7 @@ endif() - - # ---[ FBGEMM - if(USE_FBGEMM) -+ if (NOT USE_SYSTEM_FBGEMM) - set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") - if(NOT DEFINED FBGEMM_SOURCE_DIR) - set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") -@@ -746,7 +747,7 @@ if(USE_FBGEMM) - target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) - endif() - endif() -- -+ endif() - if(USE_FBGEMM) - list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) - endif() --- -2.45.1 - diff --git a/0001-Changes-to-compile-with-3.13-126033.patch b/0001-Changes-to-compile-with-3.13-126033.patch deleted file mode 100644 index ddc0dcf..0000000 --- a/0001-Changes-to-compile-with-3.13-126033.patch +++ /dev/null @@ -1,222 +0,0 @@ -From 655a06444b261cb28e71a0973c0ab67aaa8261ab Mon Sep 17 00:00:00 2001 -From: albanD -Date: Tue, 14 May 2024 02:14:53 +0000 -Subject: [PATCH] Changes to compile with 3.13 (#126033) - -This is mainly: -- Fix refcount access macro -- Hide all the Dynamo code that needs update as usual -- Add _PyWeakref_ClearRef as an extern provided by CPython. Including the pycore header that defines it would require raw c include shenanigans that I don't think are worth it. -This allows to build both with regular and nogil version of cpython. Both - -Note that this requires the 3.13 branch at least past [d3094744d40de2deefbda9b1996d5029c9ebf0b0](https://github.com/python/cpython/commit/d3094744d40de2deefbda9b1996d5029c9ebf0b0) which we need for mimalloc include and weakref function being exposed. - -debug-only issues in pybind11 with PyMem_MALLOC vs PyObject_MALLOC being should be synced either by updating pybind or cpython. @colesbury I can send a PR to ifdef the proper use in pybind if you think that this is the best solution here? - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/126033 -Approved by: https://github.com/colesbury ---- - torch/csrc/Storage.cpp | 2 +- - torch/csrc/autograd/python_variable.cpp | 2 +- - torch/csrc/dynamo/cpython_defs.c | 15 +++++- - torch/csrc/dynamo/cpython_defs.h | 2 + - torch/csrc/dynamo/eval_frame.c | 67 ++++++++++++++++++------- - torch/csrc/utils/python_compat.h | 4 ++ - 6 files changed, 70 insertions(+), 22 deletions(-) - -diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp -index 93dbc9c09bb2..b22bbac35981 100644 ---- a/torch/csrc/Storage.cpp -+++ b/torch/csrc/Storage.cpp -@@ -236,7 +236,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { - if (type->tp_del) { - PyObject_GC_Track(self); - type->tp_del(self); -- if (self->ob_refcnt > 0) { -+ if (Py_REFCNT(self) > 0) { - // Resurrected (see above comment about resurrection from `__del__`) - return; - } -diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp -index 9e85f0026b35..8fd1129da63c 100644 ---- a/torch/csrc/autograd/python_variable.cpp -+++ b/torch/csrc/autograd/python_variable.cpp -@@ -1910,7 +1910,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { - if (type->tp_del) { - PyObject_GC_Track(self); - type->tp_del(self); -- if (self->ob_refcnt > 0) { -+ if (Py_REFCNT(self) > 0) { - /* Resurrected */ - return; - } -diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c -index 4a1dba63009a..5e0945a052ae 100644 ---- a/torch/csrc/dynamo/cpython_defs.c -+++ b/torch/csrc/dynamo/cpython_defs.c -@@ -13,6 +13,17 @@ - } else { \ - } - -+#if IS_PYTHON_3_13_PLUS -+// Gave up after fixing a few of these -+// pycore_opcode.h is gone (new is pycore_opcode_metadata.h ?) -+// f_code is gone (new is f_executable?) -+ -+// Fake definitions for what we removed -+const uint8_t* THP_PyOpcode_Caches = NULL; -+const int THP_PyOpcode_Caches_size = 0; -+ -+#else -+ - // NOTE: all `assert`s below are converted to `CHECK`s - - #if IS_PYTHON_3_11_PLUS -@@ -29,8 +40,8 @@ - #define NEED_OPCODE_TABLES // To get _PyOpcode_Deopt - #include - #undef NEED_OPCODE_TABLES --#undef Py_BUILD_CORE - #include -+#undef Py_BUILD_CORE - - // As a simple way to reduce the impact of ABI changes on the CPython side, this check forces - // us to manually re-check that the function didn't change on the next major version -@@ -364,3 +375,5 @@ THP_PyFrame_Clear(_PyInterpreterFrame *frame) - } - - #endif -+ -+#endif // CPython 3.13 -\ No newline at end of file -diff --git a/torch/csrc/dynamo/cpython_defs.h b/torch/csrc/dynamo/cpython_defs.h -index a897c3e6c6e7..3b6c9667f8c9 100644 ---- a/torch/csrc/dynamo/cpython_defs.h -+++ b/torch/csrc/dynamo/cpython_defs.h -@@ -8,7 +8,9 @@ - - #if IS_PYTHON_3_11_PLUS - -+#define Py_BUILD_CORE - #include -+#undef Py_BUILD_CORE - - int THP_PyFrame_FastToLocalsWithError( - _PyInterpreterFrame* frame, -diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c -index c286e821f09d..e13cb5af2a0e 100644 ---- a/torch/csrc/dynamo/eval_frame.c -+++ b/torch/csrc/dynamo/eval_frame.c -@@ -8,6 +8,31 @@ - #include - #include - -+ -+ -+PyObject* guard_error_hook = NULL; -+const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; -+ -+static int active_dynamo_threads = 0; -+ -+static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; -+ -+inline static PyObject* eval_frame_callback_get(void) { -+ void* result = PyThread_tss_get(&eval_frame_callback_key); -+ if (unlikely(result == NULL)) { -+ return (PyObject*)Py_None; -+ } else { -+ return (PyObject*)result; -+ } -+} -+ -+inline static void eval_frame_callback_set(PyObject* obj) { -+ PyThread_tss_set(&eval_frame_callback_key, obj); -+} -+ -+// 3.13 Not supported at all. See cpython_defs.c for hints -+#if !(IS_PYTHON_3_13_PLUS) -+ - // Problem in CPython includes when mixing core and non-core build - // The fix was not backported to 3.12 so this is needed here - // https://github.com/python/cpython/issues/105268 -@@ -138,24 +163,6 @@ THP_PyFrame_FastToLocalsWithError(THP_EVAL_API_FRAME_OBJECT *frame, int *free_va - } - #endif - --PyObject* guard_error_hook = NULL; --const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; -- --static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; -- --inline static PyObject* eval_frame_callback_get(void) { -- void* result = PyThread_tss_get(&eval_frame_callback_key); -- if (unlikely(result == NULL)) { -- return (PyObject*)Py_None; -- } else { -- return (PyObject*)result; -- } --} -- --inline static void eval_frame_callback_set(PyObject* obj) { -- PyThread_tss_set(&eval_frame_callback_key, obj); --} -- - static PyObject* _custom_eval_frame_shim( - PyThreadState* tstate, - THP_EVAL_API_FRAME_OBJECT* frame, -@@ -627,7 +634,29 @@ static PyObject* _custom_eval_frame( - } - } - --static int active_dynamo_threads = 0; -+#else // IS_PYTHON_3_13_PLUS -+ -+// Fake definitions for everything we removed -+ -+typedef struct THPPyInterpreterFrame { -+ PyObject_HEAD -+ _PyInterpreterFrame* frame; // Borrowed reference -+} THPPyInterpreterFrame; -+ -+inline static void enable_eval_frame_shim(PyThreadState* tstate) {} -+inline static void enable_eval_frame_default(PyThreadState* tstate) {} -+ -+static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; -+ -+static PyTypeObject THPPyInterpreterFrameType = { -+ PyVarObject_HEAD_INIT(NULL, 0) -+ .tp_name = "torch._C.dynamo.eval_frame._PyInterpreterFrame", -+ .tp_basicsize = sizeof(THPPyInterpreterFrame), -+ .tp_flags = Py_TPFLAGS_DEFAULT, -+ .tp_getset = THPPyInterpreterFrame_properties, -+}; -+ -+#endif // CPython 3.13 - - static PyObject* increment_working_threads(PyThreadState* tstate) { - active_dynamo_threads = active_dynamo_threads + 1; -diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h -index 73b991cf3fbf..b060db00db73 100644 ---- a/torch/csrc/utils/python_compat.h -+++ b/torch/csrc/utils/python_compat.h -@@ -11,6 +11,7 @@ extern "C" { - - #define IS_PYTHON_3_11_PLUS PY_VERSION_HEX >= 0x030B00C1 - #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 -+#define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 - - PYCAPI_COMPAT_STATIC_INLINE(int) - PyCode_GetNCellvars(PyCodeObject* code) { -@@ -32,6 +33,9 @@ PyCode_GetNFreevars(PyCodeObject* code) { - #endif - } - -+// Provided by CPython but getting the header for them is very hard -+extern void _PyWeakref_ClearRef(PyWeakReference* self); -+ - #ifdef __cplusplus - } - #endif --- -2.45.1 - diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch deleted file mode 100644 index 562f55b..0000000 --- a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch +++ /dev/null @@ -1,910 +0,0 @@ -From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 -From: Xu Han -Date: Sun, 31 Mar 2024 03:07:32 +0000 -Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] - (#118980) - -Enable VEC on Windows OS. -1. Fix some type defination gap between Windows and Linux. -2. Fix some operator not support on Windows, such as [], /. -3. Enable static sleef library build on Windows. -4. Disable unsupported function overloading on MSVC. -5. Upgrade submodule sleef lib, which fixed build issue on Windows. -6. Fixed bazel build issues. -7. Fix test app not link to sleef on Windows. - -Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: -```cmd -git submodule sync -git submodule update --init --recursive -``` - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 -Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet ---- - aten/src/ATen/CMakeLists.txt | 48 ++++++-------- - aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- - .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- - .../cpu/vec/vec256/vec256_complex_double.h | 7 +- - .../cpu/vec/vec256/vec256_complex_float.h | 7 +- - aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- - aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- - aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- - aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- - .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- - .../cpu/vec/vec512/vec512_complex_double.h | 7 +- - .../cpu/vec/vec512/vec512_complex_float.h | 7 +- - aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- - aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- - aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- - aten/src/ATen/cpu/vec/vec_base.h | 6 ++ - caffe2/CMakeLists.txt | 2 +- - third_party/sleef.BUILD | 3 +- - 18 files changed, 194 insertions(+), 93 deletions(-) - -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index bf425af5fa9..58d5828e8ca 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") - list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) - endif() - --if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -- # Preserve values for the main build -- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) -- set(__aten_sleef_build_tests ${BUILD_TESTS}) -- -- # Unset our restrictive C++ flags here and reset them later. -- # Remove this once we use proper target_compile_options. -- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -- set(CMAKE_CXX_FLAGS) -- -- # Bump up optimization level for sleef to -O1, since at -O0 the compiler -- # excessively spills intermediate vector registers to the stack -- # and makes things run impossibly slowly -- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -- else() -- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -+if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+ if(NOT MSVC) -+ # Bump up optimization level for sleef to -O1, since at -O0 the compiler -+ # excessively spills intermediate vector registers to the stack -+ # and makes things run impossibly slowly -+ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -+ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -+ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+ else() -+ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -+ endif() - endif() - - if(NOT USE_SYSTEM_SLEEF) -- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) -+ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -+ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -+ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -+ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -+ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." FORCE) - if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") - set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) -@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) - endif() - list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) - -- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) -- -- # Set these back. TODO: Use SLEEF_ to pass these instead -- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) -- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) -+ if(NOT MSVC) -+ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+ endif() - endif() - - if(USE_CUDA AND NOT USE_ROCM) -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h -index 800b027e469..c431fa3c605 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h -@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { - } - - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline gather(const double* base_addr, const Vectorized& vindex) { -@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { - return _mm256_i32gather_ps(base_addr, vindex, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline mask_gather(const Vectorized& src, const double* base_addr, -@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, - const Vectorized& vindex, Vectorized& mask) { - return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - // Only works for inputs in the range: [-2^51, 2^51] -@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { - return flip8(v); - } - --#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // (defined(CPU_CAPABILITY_AVX2) - - }} // namepsace at::vec::CPU_CAPABILITY -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -index 3e26213d6d2..66557436c70 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -@@ -7,7 +7,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -18,7 +19,18 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+ -+#ifndef SLEEF_CONST -+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -+#define SLEEF_CONST const -+#else -+#define SLEEF_CONST -+#endif -+#define SLEEF_CONST_OLD SLEEF_CONST -+#else -+#define SLEEF_CONST_OLD -+#endif - - // bfloat16 conversion - static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { -@@ -265,7 +277,8 @@ public: - } - return b; - } -- Vectorized map(const __m256 (*const vop)(__m256)) const { -+ -+ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { - __m256 lo, hi; - cvt_to_fp32(values, lo, hi); - const auto o1 = vop(lo); -@@ -1026,7 +1039,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_VECTORIZED_INIT(Half, half); - --#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX2) - - #define CONVERT_NON_VECTORIZED_INIT(type, name) \ - inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_NON_VECTORIZED_INIT(Half, half); - --#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX2) - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - #define LOAD_FP32_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - auto values = _mm_loadu_si128(reinterpret_cast(data)); \ -@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec - LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); - LOAD_FP32_VECTORIZED_INIT(Half, fp16); - --#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX2) - #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - __at_align__ float values[Vectorized::size()]; \ -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -index f93ea1e63c3..6c198fb37d3 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -@@ -8,7 +8,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized> { - private: -@@ -145,7 +146,7 @@ public: - auto abs = abs_(); - auto zero = _mm256_setzero_pd(); - auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm256_div_pd(values, abs); - return _mm256_blendv_pd(div, zero, mask); - } - __m256d real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -index 7c142c04b79..c72d4d49274 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized> { - private: -@@ -180,7 +181,7 @@ public: - auto abs = abs_(); - auto zero = _mm256_setzero_ps(); - auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm256_div_ps(values, abs); - return _mm256_blendv_ps(div, zero, mask); - } - __m256 real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -index bc82d07edd1..bed6da627af 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace at::vec { - inline namespace CPU_CAPABILITY { - - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized { - private: -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -index 886809a0b8a..0e3664cd37b 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -14,7 +15,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized { - private: -@@ -226,14 +227,14 @@ public: - static __m256 vec_factorial_5 = - _mm256_set1_ps(0.00828929059f); // 1/factorial(5) - static __m256 vec_exp_log2ef = -- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) -+ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) - static __m256 vec_half = _mm256_set1_ps(0.5f); - static __m256 vec_one = _mm256_set1_ps(1.f); - static __m256 vec_zero = _mm256_set1_ps(0.f); - static __m256 vec_two = _mm256_set1_ps(2.f); -- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) -- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50); -- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); -+ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) -+ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); -+ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); - static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); - static int n_mantissa_bits = 23; - -@@ -266,7 +267,7 @@ public: - auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); - auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); - vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; -+ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); - vec_two_pow_n = - _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); - -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -index 4128841701a..85e099904cd 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -@@ -41,11 +41,17 @@ - namespace at::vec { - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - -+#ifdef _MSC_VER -+__declspec(align(64)) struct Vectorizedqi { -+ protected: -+ __m256i vals; -+#else - struct Vectorizedqi { - protected: - __m256i vals __attribute__((aligned(64))); -+#endif - - public: - Vectorizedqi() {} -@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { - } - - template --inline void __attribute__((always_inline)) QuantizeAvx2( -+__FORCE_INLINE void QuantizeAvx2( - const float* src, - T* dst, - int len, -@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V - return a.maximum(b); - } - --#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // if defined(CPU_CAPABILITY_AVX2) - }} // namespace at::vec::CPU_CAPABILITY -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h -index fe96d123e64..87f723d782c 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h -@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { - } - - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline gather(const double* base_addr, const Vectorized& vindex) { -@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { - return _mm512_i32gather_ps(vindex, base_addr, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline mask_gather(const Vectorized& src, const double* base_addr, -@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, - auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); - return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - template<> -@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { - return flip8(v); - } - --#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX512) - - }}} -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -index f9fc92d52bf..eb3b6a72240 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -@@ -7,7 +7,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,18 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+ -+#ifndef SLEEF_CONST -+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -+#define SLEEF_CONST const -+#else -+#define SLEEF_CONST -+#endif -+#define SLEEF_CONST_OLD SLEEF_CONST -+#else -+#define SLEEF_CONST_OLD -+#endif - - // bfloat16 conversion - static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { -@@ -362,7 +374,8 @@ public: - } - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wignored-qualifiers" -- Vectorized map(const __m512 (*const vop)(__m512)) const { -+ -+ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { - __m512 lo, hi; - cvt_to_fp32(values, lo, hi); - const auto o1 = vop(lo); -@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_VECTORIZED_INIT(Half, half); - --#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#else //defined(CPU_CAPABILITY_AVX512) - - #define CONVERT_NON_VECTORIZED_INIT(type, name) \ - inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_NON_VECTORIZED_INIT(Half, half); - --#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX512) - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - #define LOAD_FP32_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ -@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec - LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); - LOAD_FP32_VECTORIZED_INIT(Half, fp16); - --#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX512) - #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - __at_align__ float values[Vectorized::size()]; \ -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -index 02aa3a87cc1..c35204f9da2 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized> { - private: -@@ -203,7 +204,7 @@ public: - auto abs = abs_(); - auto zero = _mm512_setzero_pd(); - auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm512_div_pd(values, abs); - return _mm512_mask_blend_pd(mask, div, zero); - } - __m512d real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -index a5d790c98b2..2801e484d94 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized> { - private: -@@ -708,7 +709,7 @@ public: - auto abs = abs_(); - auto zero = _mm512_setzero_ps(); - auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm512_div_ps(values, abs); - return _mm512_mask_blend_ps(mask, div, zero); - } - __m512 real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -index 27b2753c903..508ab257e60 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) -+#if (defined(CPU_CAPABILITY_AVX512)) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized { - private: -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -index ba5738687fd..a08df3c141a 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized { - private: -@@ -246,14 +247,14 @@ public: - static __m512 vec_factorial_5 = - _mm512_set1_ps(0.00828929059f); // 1/factorial(5) - static __m512 vec_exp_log2ef = -- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) -+ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) - static __m512 vec_half = _mm512_set1_ps(0.5f); - static __m512 vec_one = _mm512_set1_ps(1.f); - static __m512 vec_zero = _mm512_set1_ps(0.f); - static __m512 vec_two = _mm512_set1_ps(2.f); -- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) -- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); -- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); -+ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) -+ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); -+ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); - static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); - static int n_mantissa_bits = 23; - -@@ -288,7 +289,7 @@ public: - auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); - auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); - vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; -+ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); - vec_two_pow_n = - _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); - -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -index e0713d01312..a5671ed4a50 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -@@ -42,11 +42,17 @@ namespace at { - namespace vec { - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - -+#ifdef _MSC_VER -+__declspec(align(64)) struct Vectorizedqi { -+ protected: -+ __m512i vals; -+#else - struct Vectorizedqi { - protected: - __m512i vals __attribute__((aligned(64))); -+#endif - - public: - Vectorizedqi() {} -@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { - } - - template --inline void __attribute__((always_inline)) QuantizeAvx512( -+__FORCE_INLINE void QuantizeAvx512( - const float* src, - T* dst, - int len, -@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { - Vectorized scale, - Vectorized zero_point, - Vectorized scale_neg_zp_premul) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { - float_vec_return_type dequantize( - Vectorized scale, - Vectorized zero_point) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { - } - - int_vec_return_type widening_subtract(Vectorized b) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512i int32_val0 = cvtepi8_epi32(int_val0); - __m512i int32_val1 = cvtepi8_epi32(int_val1); - __m512i int32_val2 = cvtepi8_epi32(int_val2); - __m512i int32_val3 = cvtepi8_epi32(int_val3); - -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -+ #else - __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); - __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); - __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); - __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -+ #endif - - __m512i int32_b0 = cvtepi8_epi32(int_b0); - __m512i int32_b1 = cvtepi8_epi32(int_b1); -@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { - Vectorized scale, - Vectorized zero_point, - Vectorized scale_zp_premul) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -@@ -746,10 +787,17 @@ struct Vectorized : public Vectorizedqi { - float_vec_return_type dequantize( - Vectorized scale, - Vectorized zero_point) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { - } - - int_vec_return_type widening_subtract(Vectorized b) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512i int32_val0 = cvtepu8_epi32(int_val0); - __m512i int32_val1 = cvtepu8_epi32(int_val1); - __m512i int32_val2 = cvtepu8_epi32(int_val2); - __m512i int32_val3 = cvtepu8_epi32(int_val3); - -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -+ #else - __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); - __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); - __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); - __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -+ #endif - - __m512i int32_b0 = cvtepu8_epi32(int_b0); - __m512i int32_b1 = cvtepu8_epi32(int_b1); -diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h -index adf81dd915c..20cb8ef6dbc 100644 ---- a/aten/src/ATen/cpu/vec/vec_base.h -+++ b/aten/src/ATen/cpu/vec/vec_base.h -@@ -36,6 +36,12 @@ - #include - #include - -+#if defined(__GNUC__) -+#define __FORCE_INLINE __attribute__((always_inline)) inline -+#elif defined(_MSC_VER) -+#define __FORCE_INLINE __forceinline -+#endif -+ - // These macros helped us unify vec_base.h - #ifdef CPU_CAPABILITY_AVX512 - #if defined(__GNUC__) -diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index a6b6f0f7d1d..15d37cf4861 100644 ---- a/caffe2/CMakeLists.txt -+++ b/caffe2/CMakeLists.txt -@@ -1787,7 +1787,7 @@ if(BUILD_TEST) - endif() - else() - add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") -- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) -+ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) - endif() - target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) - target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) -diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD -index 573f9c5b54a..f22a6e905e2 100644 ---- a/third_party/sleef.BUILD -+++ b/third_party/sleef.BUILD -@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ - SLEEF_PRIVATE_INCLUDES = [ - "-Iexternal/sleef/src/arch", - "-Iexternal/sleef/src/common", -+ "-Iexternal/sleef/src/libm", - ] - - SLEEF_PUBLIC_INCLUDES = [ -@@ -201,8 +202,6 @@ cc_library( - srcs = [ - "src/libm/rempitab.c", - "src/libm/sleefdp.c", -- "src/libm/sleefld.c", -- "src/libm/sleefqp.c", - "src/libm/sleefsp.c", - ], - hdrs = SLEEF_PUBLIC_HEADERS, --- -2.45.1 - diff --git a/0001-Improve-finding-and-using-the-rocm_version.h.patch b/0001-Improve-finding-and-using-the-rocm_version.h.patch deleted file mode 100644 index b8232c7..0000000 --- a/0001-Improve-finding-and-using-the-rocm_version.h.patch +++ /dev/null @@ -1,142 +0,0 @@ -From 201ac4618a1526e048a0d6c02d9bc4cf30bf0ee1 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Wed, 14 Aug 2024 17:18:38 -0700 -Subject: [PATCH] Improve finding and using the rocm_version.h - -On Fedora, the rocm_version.h's path is /usr/include/rocm_version.h -So we have this build error -pytorch/aten/src/ATen/hip/tunable/Tunable.cpp:40:10: fatal error: - rocm-core/rocm_version.h: No such file or directory - 40 | #include - | ^~~~~~~~~~~~~~~~~~~~~~~~~~ - -In other cases, depending on the rocm release either -/opt/rocm/include or /opt/rocm/include/rocm-core - -Convert the EXISTS() checks into a find_path. -Add a -I${ROCM_VERSION_DIR} to the compile options so it can be -found by Tunable.cpp - -Signed-off-by: Tom Rix ---- - aten/src/ATen/cuda/tunable/Tunable.cpp | 2 +- - cmake/Dependencies.cmake | 1 + - cmake/public/LoadHIP.cmake | 72 ++++++++++---------------- - 3 files changed, 30 insertions(+), 45 deletions(-) - -diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp -index 1b7c89875855..32c1d70f3152 100644 ---- a/aten/src/ATen/cuda/tunable/Tunable.cpp -+++ b/aten/src/ATen/cuda/tunable/Tunable.cpp -@@ -36,7 +36,7 @@ - - // for validators - #ifdef USE_ROCM --#include -+#include - #define ROCBLAS_BETA_FEATURES_API - #include - #include -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 7ef8eabb5162..61bc4d7a54b6 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1063,6 +1063,7 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - list(APPEND HIP_CXX_FLAGS -std=c++17) - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) -+ list(APPEND HIP_CXX_FLAGS -I${ROCM_VERSION_DIR}) - if(HIP_NEW_TYPE_ENUMS) - list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) - endif() -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1c0d3a203991..6a7e3bd163f5 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -42,55 +42,39 @@ find_package_and_print_version(HIP 1.0) - - if(HIP_FOUND) - set(PYTORCH_FOUND_HIP TRUE) -- set(FOUND_ROCM_VERSION_H FALSE) -- - set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}") -- set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") - - # Find ROCM version for checks - # ROCM 5.0 and later will have header api for version management -- if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h) -- set(FOUND_ROCM_VERSION_H TRUE) -- file(WRITE ${file} "" -- "#include \n" -- ) -- elseif(EXISTS ${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h) -- set(FOUND_ROCM_VERSION_H TRUE) -- file(WRITE ${file} "" -- "#include \n" -- ) -- else() -- message("********************* rocm_version.h couldnt be found ******************\n") -- endif() -- -- if(FOUND_ROCM_VERSION_H) -- file(APPEND ${file} "" -- "#include \n" -- -- "#ifndef ROCM_VERSION_PATCH\n" -- "#define ROCM_VERSION_PATCH 0\n" -- "#endif\n" -- "#define STRINGIFYHELPER(x) #x\n" -- "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" -- "int main() {\n" -- " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" -- " return 0;\n" -- "}\n" -- ) -- -- try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} -- CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}" -- RUN_OUTPUT_VARIABLE rocm_version_from_header -- COMPILE_OUTPUT_VARIABLE output_var -- ) -- # We expect the compile to be successful if the include directory exists. -- if(NOT compile_result) -- message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) -- endif() -- message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) -- set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) -- message("\n***** ROCm version from rocm_version.h ****\n") -+ find_path(ROCM_VERSION_DIR rocm_version.h HINTS ${ROCM_INCLUDE_DIRS} ${ROCM_INCLUDE_DIRS}/rocm-core) -+ set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") -+ file(WRITE ${file} "" -+ "#include \n" -+ "#include \n" -+ -+ "#ifndef ROCM_VERSION_PATCH\n" -+ "#define ROCM_VERSION_PATCH 0\n" -+ "#endif\n" -+ "#define STRINGIFYHELPER(x) #x\n" -+ "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" -+ "int main() {\n" -+ " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" -+ " return 0;\n" -+ "}\n" -+ ) -+ -+ try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} -+ CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_VERSION_DIR}" -+ RUN_OUTPUT_VARIABLE rocm_version_from_header -+ COMPILE_OUTPUT_VARIABLE output_var -+ ) -+ # We expect the compile to be successful if the include directory exists. -+ if(NOT compile_result) -+ message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) - endif() -+ message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) -+ set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) -+ message("\n***** ROCm version from rocm_version.h ****\n") - - string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+).*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) - --- -2.46.0 - diff --git a/0001-Optionally-use-hipblaslt.patch b/0001-Optionally-use-hipblaslt.patch deleted file mode 100644 index 1e5ca4b..0000000 --- a/0001-Optionally-use-hipblaslt.patch +++ /dev/null @@ -1,506 +0,0 @@ -From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 04:18:34 -0700 -Subject: [PATCH] Optionally use hipblaslt - ---- - aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ - aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ - aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- - aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- - aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- - cmake/Dependencies.cmake | 3 ++ - cmake/public/LoadHIP.cmake | 2 +- - 7 files changed, 82 insertions(+), 19 deletions(-) - -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index ce991a9bcad4..3f0d17b52778 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -14,7 +14,9 @@ - #include - - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - // until hipblas has an API to accept flags, we must use rocblas here - #include - #include -@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { - static size_t _parseChosenWorkspaceSize() { - const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); - #ifdef USE_ROCM -+#ifndef USE_HIPBLASLT -+ return 0; -+#endif - if (!val) { - // accept either env var - val = getenv("HIPBLASLT_WORKSPACE_SIZE"); -@@ -235,6 +240,7 @@ namespace at::cuda::blas { - } while (0) - - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - namespace { - // Following the pattern of CuSparseDescriptor - // Defined here for now because this is the only place cublas_lt interface is -@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< - }; - } // namespace - -- - template - inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - cudaDataType_t abcType = CUDA_R_32F; -@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - " scaleType ", - scaleType); - } -- -+#endif - - template - inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { -@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); - } - } -@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); - } - } -@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } - } -@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { - } - } - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { - // forward to bgemm implementation but set strides and batches to 0 - bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); - } -+#endif - - template - inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { -@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); - } - } -@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); - } - } -@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } - } -@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { - } - } - -- -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - void gemm_and_bias( - bool transpose_mat1, -@@ -1410,7 +1435,7 @@ void scaled_gemm( - ScalarType result_dtype, - void* amax_ptr, - bool use_fast_accum) { --#if CUDA_VERSION >= 11080 || defined(USE_ROCM) -+#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - const auto computeType = CUBLAS_COMPUTE_32F; - const auto scaleType = CUDA_R_32F; - const int8_t fastAccuMode = use_fast_accum ? 1 : 0; -@@ -1681,6 +1706,7 @@ void int8_gemm( - " scaleType ", - scaleType); - } -+#endif - - template <> - void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { -diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h -index f2b657ced51b..f0ee613c4208 100644 ---- a/aten/src/ATen/cuda/CUDAContextLight.h -+++ b/aten/src/ATen/cuda/CUDAContextLight.h -@@ -9,7 +9,9 @@ - - // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also - // added bf16 support -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - #include -+#endif - - #ifdef CUDART_VERSION - #include -@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); - /* Handles */ - TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); - TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); -+#endif - - TORCH_CUDA_CPP_API void clearCublasWorkspaces(); - -diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index 8eac525b3695..abfdf7a23847 100644 ---- a/aten/src/ATen/cuda/CublasHandlePool.cpp -+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp -@@ -29,7 +29,7 @@ namespace at::cuda { - - namespace { - --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - void createCublasLtHandle(cublasLtHandle_t *handle) { - TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); - } -@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { - return handle; - } - --cublasLtHandle_t getCurrentCUDABlasLtHandle() { - #ifdef USE_ROCM -+#if defined(USE_HIPBLASLT) -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - c10::DeviceIndex device = 0; - AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); - -@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { - - auto handle = myPoolWindow->reserve(device); - return handle; -+} -+#endif - #else -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - return reinterpret_cast(getCurrentCUDABlasHandle()); --#endif - } -+#endif - - } // namespace at::cuda -diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h -index 53e6154120c9..fa1d664696db 100644 ---- a/aten/src/ATen/cuda/tunable/TunableGemm.h -+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h -@@ -11,7 +11,9 @@ - - #include - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - #include - #endif - #include -@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class DefaultScaledGemmOp : public Callable> { - public: -@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { - return OK; - } - }; -+#endif - - template - inline bool IsZero(T v) { -@@ -191,6 +195,7 @@ static void AddRocblasValidator() { - } - } - -+#ifdef USE_HIPBLASLT - static void AddHipblasltValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - if (validators.find("HIPBLASLT_VERSION") == validators.end()) { -@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { - [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); - } - } -+#endif - - static void AddRocmValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); -@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class ScaledGemmTunableOp : public TunableOp, StreamTimer> { - public: -@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - - #if defined(USE_ROCM) -+#ifdef USE_HIPBLASLT - for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { - this->RegisterOp(std::move(name), std::move(op)); - } - AddHipblasltValidator(); -+#endif - AddRocmValidator(); - #endif - } -@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); - } - }; -+#endif - - #undef XSTRINGIFY - #undef STRINGIFY -diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 84c59a4fd0d7..56ad5de3bf2d 100644 ---- a/aten/src/ATen/native/cuda/Blas.cpp -+++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa - } - - static bool getDisableAddmmCudaLt() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); - #ifdef USE_ROCM - // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) -@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { - } - return false; - #endif -+#else -+ return true; -+#endif - } - - #ifdef USE_ROCM - static bool isSupportedHipLtROCmArch(int index) { -+#ifdef USE_HIPBLASLT - hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); - std::string device_arch = prop->gcnArchName; - static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { - } - } - TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); -+#endif - return false; - } - #endif -@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - at::ScalarType scalar_type = self.scalar_type(); - c10::MaybeOwned self_; - if (&result != &self) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) - // Strangely, if mat2 has only 1 row or column, we get - // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. -@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - scalar_type != at::ScalarType::BFloat16)); - #endif - } -+#endif - #endif - if (!useLtInterface) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - } - self__sizes = self_->sizes(); - } else { --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - useLtInterface = !disable_addmm_cuda_lt && - result.dim() == 2 && result.is_contiguous() && - isSupportedHipLtROCmArch(self.device().index()) && -@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); - - if (useLtInterface) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if defined(USE_ROCM) - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, -@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - activation_epilogue - ); - }); -+#endif - #endif - } else - { -@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { - } - - static bool _scaled_mm_allowed_device() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - auto dprops = at::cuda::getCurrentDeviceProperties(); - #ifdef USE_ROCM - std::string device_arch = dprops->gcnArchName; -@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { - #else - return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); - #endif -+#else -+ return false; -+#endif - } - - // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax -@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - // Check sizes - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); - TORCH_CHECK( -@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 - // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately - amax = at::max(at::abs(out.to(kFloat))); -+#endif - #endif - - return {out, amax}; -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..8d05e834bbc5 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1052,6 +1052,9 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - list(APPEND HIP_CXX_FLAGS -std=c++17) - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) -+ if(hipblast_FOUND) -+ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) -+ endif() - if(HIP_NEW_TYPE_ENUMS) - list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) - endif() -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index fa39156031ff..df4836847fdf 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -155,7 +155,7 @@ if(HIP_FOUND) - find_package_and_print_version(hiprand REQUIRED) - find_package_and_print_version(rocblas REQUIRED) - find_package_and_print_version(hipblas REQUIRED) -- find_package_and_print_version(hipblaslt REQUIRED) -+ find_package_and_print_version(hipblaslt) - find_package_and_print_version(miopen REQUIRED) - find_package_and_print_version(hipfft REQUIRED) - find_package_and_print_version(hipsparse REQUIRED) --- -2.45.2 - diff --git a/0001-Patch-for-sleef-3.6.patch b/0001-Patch-for-sleef-3.6.patch deleted file mode 100644 index 13aa208..0000000 --- a/0001-Patch-for-sleef-3.6.patch +++ /dev/null @@ -1,952 +0,0 @@ -From 273f23698c887b52e66c2abec8101b7398f0f9c4 Mon Sep 17 00:00:00 2001 -From: "Benjamin A. Beasley" -Date: Wed, 5 Jun 2024 11:06:02 -0400 -Subject: [PATCH] Patch for sleef 3.6 - ---- - ...ectorization-on-windows-submodule-sl.patch | 910 ++++++++++++++++++ - python-torch.spec | 11 + - 2 files changed, 921 insertions(+) - create mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch - -diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch -new file mode 100644 -index 000000000000..562f55b742c2 ---- /dev/null -+++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch -@@ -0,0 +1,910 @@ -+From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 -+From: Xu Han -+Date: Sun, 31 Mar 2024 03:07:32 +0000 -+Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] -+ (#118980) -+ -+Enable VEC on Windows OS. -+1. Fix some type defination gap between Windows and Linux. -+2. Fix some operator not support on Windows, such as [], /. -+3. Enable static sleef library build on Windows. -+4. Disable unsupported function overloading on MSVC. -+5. Upgrade submodule sleef lib, which fixed build issue on Windows. -+6. Fixed bazel build issues. -+7. Fix test app not link to sleef on Windows. -+ -+Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: -+```cmd -+git submodule sync -+git submodule update --init --recursive -+``` -+ -+Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 -+Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet -+--- -+ aten/src/ATen/CMakeLists.txt | 48 ++++++-------- -+ aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- -+ .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- -+ .../cpu/vec/vec256/vec256_complex_double.h | 7 +- -+ .../cpu/vec/vec256/vec256_complex_float.h | 7 +- -+ aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- -+ aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- -+ aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- -+ aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- -+ .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- -+ .../cpu/vec/vec512/vec512_complex_double.h | 7 +- -+ .../cpu/vec/vec512/vec512_complex_float.h | 7 +- -+ aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- -+ aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- -+ aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- -+ aten/src/ATen/cpu/vec/vec_base.h | 6 ++ -+ caffe2/CMakeLists.txt | 2 +- -+ third_party/sleef.BUILD | 3 +- -+ 18 files changed, 194 insertions(+), 93 deletions(-) -+ -+diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -+index bf425af5fa9..58d5828e8ca 100644 -+--- a/aten/src/ATen/CMakeLists.txt -++++ b/aten/src/ATen/CMakeLists.txt -+@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") -+ list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) -+ endif() -+ -+-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+- # Preserve values for the main build -+- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) -+- set(__aten_sleef_build_tests ${BUILD_TESTS}) -+- -+- # Unset our restrictive C++ flags here and reset them later. -+- # Remove this once we use proper target_compile_options. -+- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -+- set(CMAKE_CXX_FLAGS) -+- -+- # Bump up optimization level for sleef to -O1, since at -O0 the compiler -+- # excessively spills intermediate vector registers to the stack -+- # and makes things run impossibly slowly -+- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -+- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -+- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+- else() -+- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -++ if(NOT MSVC) -++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler -++ # excessively spills intermediate vector registers to the stack -++ # and makes things run impossibly slowly -++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -++ else() -++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -++ endif() -+ endif() -+ -+ if(NOT USE_SYSTEM_SLEEF) -+- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -+- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -+- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -+- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -+- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) -++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -++ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." FORCE) -+ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") -+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") -+ set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) -+@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+ endif() -+ list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) -+ -+- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) -+- -+- # Set these back. TODO: Use SLEEF_ to pass these instead -+- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) -+- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) -++ if(NOT MSVC) -++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -++ endif() -+ endif() -+ -+ if(USE_CUDA AND NOT USE_ROCM) -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h -+index 800b027e469..c431fa3c605 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h -+@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { -+ } -+ -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) -+ } -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline gather(const double* base_addr, const Vectorized& vindex) { -+@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { -+ return _mm256_i32gather_ps(base_addr, vindex, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline mask_gather(const Vectorized& src, const double* base_addr, -+@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, -+ const Vectorized& vindex, Vectorized& mask) { -+ return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+ // Only works for inputs in the range: [-2^51, 2^51] -+@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { -+ return flip8(v); -+ } -+ -+-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // (defined(CPU_CAPABILITY_AVX2) -+ -+ }} // namepsace at::vec::CPU_CAPABILITY -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+index 3e26213d6d2..66557436c70 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -18,7 +19,18 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++ -++#ifndef SLEEF_CONST -++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -++#define SLEEF_CONST const -++#else -++#define SLEEF_CONST -++#endif -++#define SLEEF_CONST_OLD SLEEF_CONST -++#else -++#define SLEEF_CONST_OLD -++#endif -+ -+ // bfloat16 conversion -+ static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { -+@@ -265,7 +277,8 @@ public: -+ } -+ return b; -+ } -+- Vectorized map(const __m256 (*const vop)(__m256)) const { -++ -++ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { -+ __m256 lo, hi; -+ cvt_to_fp32(values, lo, hi); -+ const auto o1 = vop(lo); -+@@ -1026,7 +1039,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_VECTORIZED_INIT(Half, half); -+ -+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX2) -+ -+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ -+ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -+@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_NON_VECTORIZED_INIT(Half, half); -+ -+-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX2) -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ auto values = _mm_loadu_si128(reinterpret_cast(data)); \ -+@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec -+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); -+ LOAD_FP32_VECTORIZED_INIT(Half, fp16); -+ -+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX2) -+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ __at_align__ float values[Vectorized::size()]; \ -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+index f93ea1e63c3..6c198fb37d3 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+@@ -8,7 +8,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized> { -+ private: -+@@ -145,7 +146,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm256_setzero_pd(); -+ auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm256_div_pd(values, abs); -+ return _mm256_blendv_pd(div, zero, mask); -+ } -+ __m256d real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+index 7c142c04b79..c72d4d49274 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized> { -+ private: -+@@ -180,7 +181,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm256_setzero_ps(); -+ auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm256_div_ps(values, abs); -+ return _mm256_blendv_ps(div, zero, mask); -+ } -+ __m256 real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+index bc82d07edd1..bed6da627af 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace at::vec { -+ inline namespace CPU_CAPABILITY { -+ -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized { -+ private: -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+index 886809a0b8a..0e3664cd37b 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -14,7 +15,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized { -+ private: -+@@ -226,14 +227,14 @@ public: -+ static __m256 vec_factorial_5 = -+ _mm256_set1_ps(0.00828929059f); // 1/factorial(5) -+ static __m256 vec_exp_log2ef = -+- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) -++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) -+ static __m256 vec_half = _mm256_set1_ps(0.5f); -+ static __m256 vec_one = _mm256_set1_ps(1.f); -+ static __m256 vec_zero = _mm256_set1_ps(0.f); -+ static __m256 vec_two = _mm256_set1_ps(2.f); -+- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) -+- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50); -+- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); -++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) -++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); -++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); -+ static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); -+ static int n_mantissa_bits = 23; -+ -+@@ -266,7 +267,7 @@ public: -+ auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); -+ auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); -+ vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -+- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; -++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); -+ vec_two_pow_n = -+ _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); -+ -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+index 4128841701a..85e099904cd 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+@@ -41,11 +41,17 @@ -+ namespace at::vec { -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -++#ifdef _MSC_VER -++__declspec(align(64)) struct Vectorizedqi { -++ protected: -++ __m256i vals; -++#else -+ struct Vectorizedqi { -+ protected: -+ __m256i vals __attribute__((aligned(64))); -++#endif -+ -+ public: -+ Vectorizedqi() {} -+@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { -+ } -+ -+ template -+-inline void __attribute__((always_inline)) QuantizeAvx2( -++__FORCE_INLINE void QuantizeAvx2( -+ const float* src, -+ T* dst, -+ int len, -+@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V -+ return a.maximum(b); -+ } -+ -+-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // if defined(CPU_CAPABILITY_AVX2) -+ }} // namespace at::vec::CPU_CAPABILITY -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h -+index fe96d123e64..87f723d782c 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h -+@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { -+ } -+ -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) -+ } -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline gather(const double* base_addr, const Vectorized& vindex) { -+@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { -+ return _mm512_i32gather_ps(vindex, base_addr, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline mask_gather(const Vectorized& src, const double* base_addr, -+@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, -+ auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); -+ return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+ template<> -+@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { -+ return flip8(v); -+ } -+ -+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX512) -+ -+ }}} -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+index f9fc92d52bf..eb3b6a72240 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,18 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++ -++#ifndef SLEEF_CONST -++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -++#define SLEEF_CONST const -++#else -++#define SLEEF_CONST -++#endif -++#define SLEEF_CONST_OLD SLEEF_CONST -++#else -++#define SLEEF_CONST_OLD -++#endif -+ -+ // bfloat16 conversion -+ static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { -+@@ -362,7 +374,8 @@ public: -+ } -+ #pragma clang diagnostic push -+ #pragma clang diagnostic ignored "-Wignored-qualifiers" -+- Vectorized map(const __m512 (*const vop)(__m512)) const { -++ -++ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { -+ __m512 lo, hi; -+ cvt_to_fp32(values, lo, hi); -+ const auto o1 = vop(lo); -+@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_VECTORIZED_INIT(Half, half); -+ -+-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#else //defined(CPU_CAPABILITY_AVX512) -+ -+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ -+ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -+@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_NON_VECTORIZED_INIT(Half, half); -+ -+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX512) -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ -+@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec -+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); -+ LOAD_FP32_VECTORIZED_INIT(Half, fp16); -+ -+-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX512) -+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ __at_align__ float values[Vectorized::size()]; \ -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+index 02aa3a87cc1..c35204f9da2 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized> { -+ private: -+@@ -203,7 +204,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm512_setzero_pd(); -+ auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm512_div_pd(values, abs); -+ return _mm512_mask_blend_pd(mask, div, zero); -+ } -+ __m512d real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+index a5d790c98b2..2801e484d94 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized> { -+ private: -+@@ -708,7 +709,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm512_setzero_ps(); -+ auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm512_div_ps(values, abs); -+ return _mm512_mask_blend_ps(mask, div, zero); -+ } -+ __m512 real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+index 27b2753c903..508ab257e60 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) -++#if (defined(CPU_CAPABILITY_AVX512)) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized { -+ private: -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+index ba5738687fd..a08df3c141a 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized { -+ private: -+@@ -246,14 +247,14 @@ public: -+ static __m512 vec_factorial_5 = -+ _mm512_set1_ps(0.00828929059f); // 1/factorial(5) -+ static __m512 vec_exp_log2ef = -+- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) -++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) -+ static __m512 vec_half = _mm512_set1_ps(0.5f); -+ static __m512 vec_one = _mm512_set1_ps(1.f); -+ static __m512 vec_zero = _mm512_set1_ps(0.f); -+ static __m512 vec_two = _mm512_set1_ps(2.f); -+- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) -+- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); -+- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); -++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) -++ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); -++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); -+ static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); -+ static int n_mantissa_bits = 23; -+ -+@@ -288,7 +289,7 @@ public: -+ auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); -+ auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); -+ vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -+- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; -++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); -+ vec_two_pow_n = -+ _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); -+ -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+index e0713d01312..a5671ed4a50 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+@@ -42,11 +42,17 @@ namespace at { -+ namespace vec { -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -++#ifdef _MSC_VER -++__declspec(align(64)) struct Vectorizedqi { -++ protected: -++ __m512i vals; -++#else -+ struct Vectorizedqi { -+ protected: -+ __m512i vals __attribute__((aligned(64))); -++#endif -+ -+ public: -+ Vectorizedqi() {} -+@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { -+ } -+ -+ template -+-inline void __attribute__((always_inline)) QuantizeAvx512( -++__FORCE_INLINE void QuantizeAvx512( -+ const float* src, -+ T* dst, -+ int len, -+@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { -+ Vectorized scale, -+ Vectorized zero_point, -+ Vectorized scale_neg_zp_premul) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -+@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { -+ float_vec_return_type dequantize( -+ Vectorized scale, -+ Vectorized zero_point) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -+@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { -+ } -+ -+ int_vec_return_type widening_subtract(Vectorized b) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512i int32_val0 = cvtepi8_epi32(int_val0); -+ __m512i int32_val1 = cvtepi8_epi32(int_val1); -+ __m512i int32_val2 = cvtepi8_epi32(int_val2); -+ __m512i int32_val3 = cvtepi8_epi32(int_val3); -+ -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -++ #else -+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -++ #endif -+ -+ __m512i int32_b0 = cvtepi8_epi32(int_b0); -+ __m512i int32_b1 = cvtepi8_epi32(int_b1); -+@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { -+ Vectorized scale, -+ Vectorized zero_point, -+ Vectorized scale_zp_premul) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -+@@ -746,10 +787,17 @@ struct Vectorized : public Vectorizedqi { -+ float_vec_return_type dequantize( -+ Vectorized scale, -+ Vectorized zero_point) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -+@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { -+ } -+ -+ int_vec_return_type widening_subtract(Vectorized b) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512i int32_val0 = cvtepu8_epi32(int_val0); -+ __m512i int32_val1 = cvtepu8_epi32(int_val1); -+ __m512i int32_val2 = cvtepu8_epi32(int_val2); -+ __m512i int32_val3 = cvtepu8_epi32(int_val3); -+ -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -++ #else -+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -++ #endif -+ -+ __m512i int32_b0 = cvtepu8_epi32(int_b0); -+ __m512i int32_b1 = cvtepu8_epi32(int_b1); -+diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h -+index adf81dd915c..20cb8ef6dbc 100644 -+--- a/aten/src/ATen/cpu/vec/vec_base.h -++++ b/aten/src/ATen/cpu/vec/vec_base.h -+@@ -36,6 +36,12 @@ -+ #include -+ #include -+ -++#if defined(__GNUC__) -++#define __FORCE_INLINE __attribute__((always_inline)) inline -++#elif defined(_MSC_VER) -++#define __FORCE_INLINE __forceinline -++#endif -++ -+ // These macros helped us unify vec_base.h -+ #ifdef CPU_CAPABILITY_AVX512 -+ #if defined(__GNUC__) -+diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -+index a6b6f0f7d1d..15d37cf4861 100644 -+--- a/caffe2/CMakeLists.txt -++++ b/caffe2/CMakeLists.txt -+@@ -1787,7 +1787,7 @@ if(BUILD_TEST) -+ endif() -+ else() -+ add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") -+- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) -++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) -+ endif() -+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) -+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) -+diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD -+index 573f9c5b54a..f22a6e905e2 100644 -+--- a/third_party/sleef.BUILD -++++ b/third_party/sleef.BUILD -+@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ -+ SLEEF_PRIVATE_INCLUDES = [ -+ "-Iexternal/sleef/src/arch", -+ "-Iexternal/sleef/src/common", -++ "-Iexternal/sleef/src/libm", -+ ] -+ -+ SLEEF_PUBLIC_INCLUDES = [ -+@@ -201,8 +202,6 @@ cc_library( -+ srcs = [ -+ "src/libm/rempitab.c", -+ "src/libm/sleefdp.c", -+- "src/libm/sleefld.c", -+- "src/libm/sleefqp.c", -+ "src/libm/sleefsp.c", -+ ], -+ hdrs = SLEEF_PUBLIC_HEADERS, -+-- -+2.45.1 -+ -diff --git a/python-torch.spec b/python-torch.spec -index d50687a5174a..63600c2e8c39 100644 ---- a/python-torch.spec -+++ b/python-torch.spec -@@ -176,6 +176,17 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch - Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch - %endif - -+# Enable x86 CPU vectorization on windows [submodule sleef] (#118980) -+# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370 -+# -+# Despite the title, this patch fixes compatibility with sleef 3.6 by including -+# a backwards-compatible version of the fix from -+# https://github.com/pytorch/pytorch/pull/122723. -+# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef -+# git submodule (because the release archive contains an actual sleef source -+# tree instead, so this would not apply.) -+Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch -+ - %if %{with rocm} - # ROCm patches - # https://github.com/pytorch/pytorch/pull/120551 --- -2.45.1 - diff --git a/0001-Reenable-dim-for-python-3.12.patch b/0001-Reenable-dim-for-python-3.12.patch deleted file mode 100644 index 138b5d4..0000000 --- a/0001-Reenable-dim-for-python-3.12.patch +++ /dev/null @@ -1,115 +0,0 @@ -From ee3fb343a376cdba6f4ce188cac90023f13e2aea Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 4 Apr 2024 14:21:38 -0600 -Subject: [PATCH] Reenable dim for python 3.12 - -In 3.12: - -_PyArg_Parser added an element to the start of the structure. -So existing positional initialization is off. Switch to element -initialization. - -_Py_CODEUNIT changed to from an int to a union, but relevant_op -is passed an int for the return of decoder.opcode, so the parameter -type is wrong, switch it to int. - -The opcode PRECALL was removed, so reduce its handling to 3.11 - -Signed-off-by: Tom Rix ---- - functorch/csrc/dim/dim.cpp | 24 +++++------------------- - functorch/csrc/dim/minpybind.h | 4 ++-- - 2 files changed, 7 insertions(+), 21 deletions(-) - -diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp -index 4cc027504c77..e48b0d58081f 100644 ---- a/functorch/csrc/dim/dim.cpp -+++ b/functorch/csrc/dim/dim.cpp -@@ -6,20 +6,6 @@ - - #include - -- --// Many APIs have changed/don't exist anymore --#if IS_PYTHON_3_12_PLUS -- --#include "dim.h" -- --// Re-enable this some day --PyObject* Dim_init() { -- PyErr_SetString(PyExc_RuntimeError, "First class dim doesn't work with python 3.12"); -- return nullptr; --} -- --#else -- - #include "minpybind.h" - #include - #include -@@ -441,7 +427,7 @@ static PyObject* DimList_bind(DimList *self, - PY_BEGIN - mpy::handle sizes; - static const char * const _keywords[] = {"sizes", nullptr}; -- static _PyArg_Parser parser = {"O", _keywords, 0}; -+ static _PyArg_Parser parser = { .format = "O", .keywords = _keywords}; - if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, &sizes)) { - return nullptr; - } -@@ -465,7 +451,7 @@ static PyObject* DimList_bind_len(DimList *self, - PY_BEGIN - int size; - static const char * const _keywords[] = {"N", nullptr}; -- static _PyArg_Parser parser = {"i", _keywords, 0}; -+ static _PyArg_Parser parser = { .format = "i", .keywords = _keywords}; - if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, &size)) { - return nullptr; - } -@@ -1468,7 +1454,7 @@ PyTypeObject Tensor::Type = { - - // dim() -------------------- - --static bool relevant_op(_Py_CODEUNIT c) { -+static bool relevant_op(int c) { - switch(c) { - case STORE_NAME: - case STORE_GLOBAL: -@@ -1587,7 +1573,7 @@ static PyObject* _dims(PyObject *self, - auto c = mpy::obj::steal(PyFrame_GetCode(f.ptr())); - auto lasti = PyFrame_GetLasti(f.ptr()); - auto decoder = PyInstDecoder(c.ptr(), lasti); -- #if IS_PYTHON_3_11_PLUS -+ #if IS_PYTHON_3_11 - // When py3.11 adapts bytecode lasti points to the precall - // rather than the call instruction after it - if (decoder.opcode() == PRECALL) { -@@ -3268,4 +3254,4 @@ PyObject* Dim_init() { - } - } - --#endif -+ -diff --git a/functorch/csrc/dim/minpybind.h b/functorch/csrc/dim/minpybind.h -index de82b5af95a4..d76d4828bf80 100644 ---- a/functorch/csrc/dim/minpybind.h -+++ b/functorch/csrc/dim/minpybind.h -@@ -621,7 +621,7 @@ struct vector_args { - PyObject *dummy = NULL; - _PyArg_ParseStackAndKeywords((PyObject*const*)args, nargs, kwnames.ptr(), _parser, &dummy, &dummy, &dummy, &dummy, &dummy); - #else -- _PyArg_Parser* _parser = new _PyArg_Parser{NULL, &names_buf[0], fname_cstr, 0}; -+ _PyArg_Parser* _parser = new _PyArg_Parser{ .keywords = &names_buf[0], .fname = fname_cstr}; - std::unique_ptr buf(new PyObject*[names.size()]); - _PyArg_UnpackKeywords((PyObject*const*)args, nargs, NULL, kwnames.ptr(), _parser, required, (Py_ssize_t)values.size() - kwonly, 0, &buf[0]); - #endif -@@ -706,7 +706,7 @@ inline object handle::call_vector(vector_args args) { - #define MPY_PARSE_ARGS_KWNAMES(fmt, FORALL_ARGS) \ - static const char * const kwlist[] = { FORALL_ARGS(MPY_ARGS_NAME) nullptr}; \ - FORALL_ARGS(MPY_ARGS_DECLARE) \ -- static _PyArg_Parser parser = {fmt, kwlist, 0}; \ -+ static _PyArg_Parser parser = { .format = fmt, .keywords = kwlist}; \ - if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, FORALL_ARGS(MPY_ARGS_POINTER) nullptr)) { \ - throw mpy::exception_set(); \ - } --- -2.44.0 - diff --git a/0001-Regenerate-flatbuffer-header.patch b/0001-Regenerate-flatbuffer-header.patch deleted file mode 100644 index 4eec491..0000000 --- a/0001-Regenerate-flatbuffer-header.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 5b8e51b24513fa851eeff42f23d942bde301e321 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 29 Sep 2023 06:19:29 -0700 -Subject: [PATCH] Regenerate flatbuffer header - -For this error -torch/csrc/jit/serialization/mobile_bytecode_generated.h:12:41: -error: static assertion failed: Non-compatible flatbuffers version included - 12 | FLATBUFFERS_VERSION_MINOR == 3 && - -PyTorch is expecting 23.3.3, what f38 has -Rawhide is at 23.5.26 - -Regenerate with -flatc --cpp --gen-mutable --no-prefix --scoped-enums mobile_bytecode.fbs - -Signed-off-by: Tom Rix ---- - torch/csrc/jit/serialization/mobile_bytecode_generated.h | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/torch/csrc/jit/serialization/mobile_bytecode_generated.h b/torch/csrc/jit/serialization/mobile_bytecode_generated.h -index cffe8bc7a6..83575e4c19 100644 ---- a/torch/csrc/jit/serialization/mobile_bytecode_generated.h -+++ b/torch/csrc/jit/serialization/mobile_bytecode_generated.h -@@ -9,8 +9,8 @@ - // Ensure the included flatbuffers.h is the same version as when this file was - // generated, otherwise it may not be compatible. - static_assert(FLATBUFFERS_VERSION_MAJOR == 23 && -- FLATBUFFERS_VERSION_MINOR == 3 && -- FLATBUFFERS_VERSION_REVISION == 3, -+ FLATBUFFERS_VERSION_MINOR == 5 && -+ FLATBUFFERS_VERSION_REVISION == 26, - "Non-compatible flatbuffers version included"); - - namespace torch { --- -2.43.0 - diff --git a/0001-Stub-in-kineto-ActivityType.patch b/0001-Stub-in-kineto-ActivityType.patch deleted file mode 100644 index f088645..0000000 --- a/0001-Stub-in-kineto-ActivityType.patch +++ /dev/null @@ -1,73 +0,0 @@ -From 3ef82b814179da571b2478f61d4279717ab0b23a Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 29 Sep 2023 06:25:23 -0700 -Subject: [PATCH] Stub in kineto ActivityType - -There is an error with kineto is not used, the shim still -requires the ActivityTYpe.h header to get the enum Activity type. -So cut-n-paste just enough of the header in to do this. - -Signed-off-by: Tom Rix ---- - torch/csrc/profiler/kineto_shim.h | 44 +++++++++++++++++++++++++++++++ - 1 file changed, 44 insertions(+) - -diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h -index e92cbf003d..68985ab7d0 100644 ---- a/torch/csrc/profiler/kineto_shim.h -+++ b/torch/csrc/profiler/kineto_shim.h -@@ -12,7 +12,51 @@ - #undef USE_KINETO - #endif - -+#ifdef USE_KINETO - #include -+#else -+namespace libkineto { -+// copied from header -+/* -+ * Copyright (c) Meta Platforms, Inc. and affiliates. -+ * All rights reserved. -+ * -+ * This source code is licensed under the BSD-style license found in the -+ * LICENSE file in the root directory of this source tree. -+ */ -+ -+// Note : All activity types are not enabled by default. Please add them -+// at correct position in the enum -+enum class ActivityType { -+ // Activity types enabled by default -+ CPU_OP = 0, // cpu side ops -+ USER_ANNOTATION, -+ GPU_USER_ANNOTATION, -+ GPU_MEMCPY, -+ GPU_MEMSET, -+ CONCURRENT_KERNEL, // on-device kernels -+ EXTERNAL_CORRELATION, -+ CUDA_RUNTIME, // host side cuda runtime events -+ CUDA_DRIVER, // host side cuda driver events -+ CPU_INSTANT_EVENT, // host side point-like events -+ PYTHON_FUNCTION, -+ OVERHEAD, // CUPTI induced overhead events sampled from its overhead API. -+ -+ // Optional Activity types -+ CUDA_SYNC, // synchronization events between runtime and kernels -+ GLOW_RUNTIME, // host side glow runtime events -+ MTIA_RUNTIME, // host side MTIA runtime events -+ CUDA_PROFILER_RANGE, // CUPTI Profiler range for performance metrics -+ MTIA_CCP_EVENTS, // MTIA ondevice CCP events -+ HPU_OP, // HPU host side runtime event -+ XPU_RUNTIME, // host side xpu runtime events -+ -+ ENUM_COUNT, // This is to add buffer and not used for any profiling logic. Add your new type before it. -+ OPTIONAL_ACTIVITY_TYPE_START = CUDA_SYNC, -+}; -+} -+ -+#endif - - #include - #include --- -2.43.0 - diff --git a/0001-can-not-use-with-c-files.patch b/0001-can-not-use-with-c-files.patch deleted file mode 100644 index 719737c..0000000 --- a/0001-can-not-use-with-c-files.patch +++ /dev/null @@ -1,25 +0,0 @@ -From a5dff521691a17701b5a02ec75e84cfe1bf605f7 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 06:41:49 -0500 -Subject: [PATCH] can not use with c files - ---- - cmake/Dependencies.cmake | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 4dd8042058..5f91f3ffab 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1269,7 +1269,7 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier) - list(APPEND HIP_CXX_FLAGS -DCAFFE2_USE_MIOPEN) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) -- list(APPEND HIP_CXX_FLAGS -std=c++17) -+# list(APPEND HIP_CXX_FLAGS -std=c++17) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "6.0.0") - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) - endif() --- -2.43.0 - diff --git a/0001-cuda-hip-signatures.patch b/0001-cuda-hip-signatures.patch deleted file mode 100644 index a258737..0000000 --- a/0001-cuda-hip-signatures.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 214dc959acc809e1959643272c344ee5335d5a69 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 1 Feb 2024 11:29:47 -0500 -Subject: [PATCH] cuda - hip signatures - ---- - aten/src/ATen/cuda/detail/LazyNVRTC.cpp | 9 +++++++++ - 1 file changed, 9 insertions(+) - -diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -index 1b85e7776e..bb6f88783a 100644 ---- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -+++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -@@ -134,8 +134,13 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, - const char *src, - const char *name, - int numHeaders, -+#if !defined(USE_ROCM) - const char * const *headers, - const char * const *includeNames) { -+#else -+ const char **headers, -+ const char **includeNames) { -+#endif - auto fn = reinterpret_cast(getNVRTCLibrary().sym(__func__)); - if (!fn) - throw std::runtime_error("Can't get nvrtcCreateProgram"); -@@ -150,7 +155,11 @@ NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); - NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *); - NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *); - #endif -+#if !defined(USE_ROCM) - NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *); -+#else -+NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char **); -+#endif - _STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult); - NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*); - NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *); --- -2.43.0 - diff --git a/0001-disable-use-of-aotriton.patch b/0001-disable-use-of-aotriton.patch deleted file mode 100644 index 61ffd1e..0000000 --- a/0001-disable-use-of-aotriton.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 07:06:09 -0700 -Subject: [PATCH] disable use of aotriton - ---- - .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- - 1 file changed, 15 insertions(+), 2 deletions(-) - -diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -index 214b02d8262e..7b3eb9dcd8cd 100644 ---- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -@@ -19,9 +19,12 @@ - #include - #include - -+#ifdef USE_FLASH_ATTENTION - #if USE_ROCM - #include - #endif -+#endif -+ - - /** - * Note [SDPA Runtime Dispatch] -@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { - - bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { - // Check that the gpu is capable of running flash attention -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - using sm80 = SMVersion<8, 0>; - using sm90 = SMVersion<9, 0>; - #if USE_ROCM -@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug - } - #endif - return true; -+#endif - } - - bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - // Mem Efficient attention supports hardware in the range [sm_50, sm_90] - using sm50 = SMVersion<5, 0>; - using sm90 = SMVersion<9, 0>; -@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) - } - #endif - return true; -+#endif - } - - bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( -@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - #ifndef USE_FLASH_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); - return false; --#endif -+#else - - // Define gate functions that determine if a flash kernel can be ran - // Replace with std::to_array when we migrate to c++20 -@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - } - } - return true; -+#endif - } - - bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - #ifndef USE_MEM_EFF_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); - return false; --#endif -+#else - // Constraints specific to mem efficient attention - constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = - array_of(at::kHalf, at::kFloat, at::kBFloat16); -@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - } - #endif - return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); -+#endif - } - - SDPBackend select_sdp_backend(sdp_params const& kernel_params) { --- -2.45.2 - diff --git a/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch b/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch deleted file mode 100644 index 0ce5b1f..0000000 --- a/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch +++ /dev/null @@ -1,226 +0,0 @@ -From b9d45eb1cc90696a4de76676221219e24423c709 Mon Sep 17 00:00:00 2001 -From: William Wen -Date: Wed, 3 Apr 2024 17:58:46 -0700 -Subject: [PATCH] [dynamo, 3.12] enable dynamo on 3.12, enable most dynamo - unittests on 3.12 (#123216) - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/123216 -Approved by: https://github.com/jansel, https://github.com/malfet ---- - test/dynamo/test_autograd_function.py | 3 ++ - test/dynamo/test_misc.py | 63 +++++++++++++++++++++++++ - test/functorch/test_eager_transforms.py | 7 ++- - test/run_test.py | 3 -- - torch/__init__.py | 5 +- - torch/_dynamo/eval_frame.py | 4 +- - torch/_dynamo/test_case.py | 8 +--- - 7 files changed, 74 insertions(+), 19 deletions(-) - -diff --git a/test/dynamo/test_autograd_function.py b/test/dynamo/test_autograd_function.py -index d23fec607afa..bc5ebc767038 100644 ---- a/test/dynamo/test_autograd_function.py -+++ b/test/dynamo/test_autograd_function.py -@@ -2,6 +2,8 @@ - - import copy - import math -+import sys -+import unittest - - import torch - -@@ -528,6 +530,7 @@ class AutogradFunctionTests(torch._dynamo.test_case.TestCase): - # I pulled all of these test cases from test_autograd.py - # In the future, we should make the Dynamo test suite actually - # run on test_autograd.py (it's disabled right now) and delete these. -+ @unittest.skipIf(sys.version_info >= (3, 12), "invalid free in 3.12+") - def test_smoke_from_test_autograd(self): - class Func(torch.autograd.Function): - @staticmethod -diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py -index a73de8b1c7e9..8f54e0564e6b 100644 ---- a/test/dynamo/test_misc.py -+++ b/test/dynamo/test_misc.py -@@ -9760,6 +9760,69 @@ fn - lambda mod: mod, - ) - -+ @xfailIfPy311 -+ def test_outside_linear_module_free(self): -+ # Compared to test_linear_module_free, the linear -+ # layer is not the code object that is directly compiled. -+ def model_inp_ctr(): -+ fc = torch.nn.Linear(100, 100) -+ -+ class Mod(torch.nn.Module): -+ def __init__(self): -+ super().__init__() -+ self.fc_ref = fc -+ -+ def forward(self, x): -+ return fc(x[0]) -+ -+ # return fc to keep it alive in _test_compile_model_free -+ return Mod(), (torch.randn(100, 100), fc) -+ -+ self._test_compile_model_free(model_inp_ctr, lambda mod: mod.fc_ref) -+ -+ @unittest.skipIf(sys.version_info >= (3, 12), "leaks in 3.12+") -+ def test_parameter_free(self): -+ def model_inp_ctr(): -+ param = torch.nn.Parameter(torch.randn(100, 100)) -+ -+ class Mod(torch.nn.Module): -+ def __init__(self): -+ super().__init__() -+ self.param = param -+ -+ def forward(self, x): -+ return self.param * x[0] -+ -+ # return param to keep it alive in _test_compile_model_free -+ return Mod(), (torch.randn(100, 100), param) -+ -+ self._test_compile_model_free(model_inp_ctr, lambda mod: mod.param) -+ -+ def test_raises_importerror1(self): -+ @torch.compile(backend="eager") -+ def fn(x): -+ try: -+ import some_module_that_surely_does_not_exist -+ -+ return -+ except ImportError: -+ pass -+ return x.sin() -+ -+ x = torch.randn(8) -+ self.assertEqual(fn(x), x.sin()) -+ -+ def test_raises_importerror2(self): -+ @torch.compile(backend="eager") -+ def fn(x): -+ import some_module_that_surely_does_not_exist -+ -+ return x + 1 -+ -+ x = torch.randn(8) -+ with self.assertRaises(ImportError): -+ fn(x) -+ - def test_dynamo_cache_move_to_front(self): - class Mod(torch.nn.Module): - def __init__(self): -diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py -index 09415cf8f48e..60790ec06059 100644 ---- a/test/functorch/test_eager_transforms.py -+++ b/test/functorch/test_eager_transforms.py -@@ -4762,8 +4762,7 @@ class TestCompileTransforms(TestCase): - # Triton only supports GPU with SM70 or later. - @expectedFailureIf((IS_ARM64 and not IS_MACOS) or - IS_WINDOWS or -- (TEST_CUDA and not SM70OrLater) or -- (sys.version_info >= (3, 12))) -+ (TEST_CUDA and not SM70OrLater)) - def test_compile_vmap_hessian(self, device): - # The model and inputs are a smaller version - # of code at benchmark repo: -@@ -4792,8 +4791,8 @@ class TestCompileTransforms(TestCase): - actual = opt_fn(params_and_buffers, x) - self.assertEqual(actual, expected) - -- # torch.compile is not supported on Windows or on Python 3.12+ -- @expectedFailureIf(IS_WINDOWS or (sys.version_info >= (3, 12))) -+ # torch.compile is not supported on Windows -+ @expectedFailureIf(IS_WINDOWS) - @torch._dynamo.config.patch(suppress_errors=False) - @torch._dynamo.config.patch(capture_func_transforms=True) - @skipIfTorchDynamo("Do not test torch.compile on top of torch.compile") -diff --git a/test/run_test.py b/test/run_test.py -index e86af9623042..ebb14df4167d 100755 ---- a/test/run_test.py -+++ b/test/run_test.py -@@ -74,7 +74,6 @@ sys.path.remove(str(REPO_ROOT)) - RERUN_DISABLED_TESTS = os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1" - DISTRIBUTED_TEST_PREFIX = "distributed" - INDUCTOR_TEST_PREFIX = "inductor" --DYNAMO_TEST_PREFIX = "dynamo" - - - # Note [ROCm parallel CI testing] -@@ -324,7 +323,6 @@ JIT_EXECUTOR_TESTS = [ - ] - - INDUCTOR_TESTS = [test for test in TESTS if test.startswith(INDUCTOR_TEST_PREFIX)] --DYNAMO_TESTS = [test for test in TESTS if test.startswith(DYNAMO_TEST_PREFIX)] - DISTRIBUTED_TESTS = [test for test in TESTS if test.startswith(DISTRIBUTED_TEST_PREFIX)] - TORCH_EXPORT_TESTS = [test for test in TESTS if test.startswith("export")] - FUNCTORCH_TESTS = [test for test in TESTS if test.startswith("functorch")] -@@ -1361,7 +1359,6 @@ def get_selected_tests(options) -> List[str]: - # these tests failing in Python 3.12 temporarily disabling - if sys.version_info >= (3, 12): - options.exclude.extend(INDUCTOR_TESTS) -- options.exclude.extend(DYNAMO_TESTS) - options.exclude.extend( - [ - "functorch/test_dims", -diff --git a/torch/__init__.py b/torch/__init__.py -index d381712b4a35..26cdffe81d29 100644 ---- a/torch/__init__.py -+++ b/torch/__init__.py -@@ -1861,9 +1861,8 @@ def compile(model: Optional[Callable] = None, *, - - """ - _C._log_api_usage_once("torch.compile") -- # Temporary until we get proper support for python 3.12 -- if sys.version_info >= (3, 12): -- raise RuntimeError("Dynamo is not supported on Python 3.12+") -+ if sys.version_info >= (3, 13): -+ raise RuntimeError("Dynamo is not supported on Python 3.13+") - - # Decorator mode - if model is None: -diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py -index 53ab0df3a947..0a80eeea99ed 100644 ---- a/torch/_dynamo/eval_frame.py -+++ b/torch/_dynamo/eval_frame.py -@@ -589,8 +589,8 @@ class _NullDecorator(contextlib.nullcontext): # type: ignore[type-arg] - - - def check_if_dynamo_supported(): -- if sys.version_info >= (3, 12): -- raise RuntimeError("Python 3.12+ not yet supported for torch.compile") -+ if sys.version_info >= (3, 13): -+ raise RuntimeError("Python 3.13+ not yet supported for torch.compile") - - - def is_dynamo_supported(): -diff --git a/torch/_dynamo/test_case.py b/torch/_dynamo/test_case.py -index e3cbef09eaae..297ea6e2bc2a 100644 ---- a/torch/_dynamo/test_case.py -+++ b/torch/_dynamo/test_case.py -@@ -1,7 +1,6 @@ - import contextlib - import importlib - import logging --import sys - - import torch - import torch.testing -@@ -20,12 +19,7 @@ log = logging.getLogger(__name__) - def run_tests(needs=()): - from torch.testing._internal.common_utils import run_tests - -- if ( -- TEST_WITH_TORCHDYNAMO -- or IS_WINDOWS -- or TEST_WITH_CROSSREF -- or sys.version_info >= (3, 12) -- ): -+ if TEST_WITH_TORCHDYNAMO or IS_WINDOWS or TEST_WITH_CROSSREF: - return # skip testing - - if isinstance(needs, str): --- -2.44.0 - diff --git a/0001-include-fmt-ranges.h-for-using-fmt-join.patch b/0001-include-fmt-ranges.h-for-using-fmt-join.patch deleted file mode 100644 index f7f6c7d..0000000 --- a/0001-include-fmt-ranges.h-for-using-fmt-join.patch +++ /dev/null @@ -1,54 +0,0 @@ -From ba2cf11d1bf1dd5086c8e793198a697d4179cca7 Mon Sep 17 00:00:00 2001 -From: Kefu Chai -Date: Tue, 16 Jul 2024 08:00:22 +0800 -Subject: [PATCH] include fmt/ranges.h for using fmt::join() - -fmt::join() was moved into fmt/ranges.h in fmt 11, so include this -header for using it. - -Signed-off-by: Kefu Chai ---- - torch/csrc/distributed/c10d/socket.cpp | 1 + - torch/csrc/profiler/standalone/execution_trace_observer.cpp | 1 + - torch/csrc/profiler/util.cpp | 1 + - 3 files changed, 3 insertions(+) - -diff --git a/torch/csrc/distributed/c10d/socket.cpp b/torch/csrc/distributed/c10d/socket.cpp -index 5013f2540..cbcd33a19 100644 ---- a/torch/csrc/distributed/c10d/socket.cpp -+++ b/torch/csrc/distributed/c10d/socket.cpp -@@ -31,6 +31,7 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wdeprecated") - #include - C10_DIAGNOSTIC_POP() - #include -+#include - - #include - #include -diff --git a/torch/csrc/profiler/standalone/execution_trace_observer.cpp b/torch/csrc/profiler/standalone/execution_trace_observer.cpp -index 2ef2e5423..fb053e916 100644 ---- a/torch/csrc/profiler/standalone/execution_trace_observer.cpp -+++ b/torch/csrc/profiler/standalone/execution_trace_observer.cpp -@@ -10,6 +10,7 @@ - #endif // _WIN32 - - #include -+#include - #include - #include - #include -diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp -index 896bf606c..c229ce130 100644 ---- a/torch/csrc/profiler/util.cpp -+++ b/torch/csrc/profiler/util.cpp -@@ -5,6 +5,7 @@ - #include - #include - #include -+#include - - #ifdef USE_KINETO - #include --- -2.45.2 - diff --git a/0001-no-third_party-FXdiv.patch b/0001-no-third_party-FXdiv.patch deleted file mode 100644 index 71404e3..0000000 --- a/0001-no-third_party-FXdiv.patch +++ /dev/null @@ -1,54 +0,0 @@ -From b3b307add5724ee5730f161e16594fa702f34a19 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:20:28 -0500 -Subject: [PATCH] no third_party FXdiv - ---- - caffe2/CMakeLists.txt | 24 ++++++++++++------------ - 1 file changed, 12 insertions(+), 12 deletions(-) - -diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index b2f3adbfae..80a5625c8d 100644 ---- a/caffe2/CMakeLists.txt -+++ b/caffe2/CMakeLists.txt -@@ -110,15 +110,15 @@ endif() - # Note: the folders that are being commented out have not been properly - # addressed yet. - --if(NOT MSVC AND USE_XNNPACK) -- if(NOT TARGET fxdiv) -- set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") -- set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") -- add_subdirectory( -- "${FXDIV_SOURCE_DIR}" -- "${CMAKE_BINARY_DIR}/FXdiv") -- endif() --endif() -+#if(NOT MSVC AND USE_XNNPACK) -+# if(NOT TARGET fxdiv) -+# set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") -+# set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") -+# add_subdirectory( -+# "${FXDIV_SOURCE_DIR}" -+# "${CMAKE_BINARY_DIR}/FXdiv") -+# endif() -+#endif() - - add_subdirectory(core) - add_subdirectory(serialize) -@@ -1081,9 +1081,9 @@ if(USE_XPU) - target_compile_definitions(torch_xpu PRIVATE USE_XPU) - endif() - --if(NOT MSVC AND USE_XNNPACK) -- TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) --endif() -+#if(NOT MSVC AND USE_XNNPACK) -+# TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) -+#endif() - - # ========================================================== - # formerly-libtorch flags --- -2.43.0 - diff --git a/0001-no-third_party-fmt.patch b/0001-no-third_party-fmt.patch deleted file mode 100644 index 6e82af2..0000000 --- a/0001-no-third_party-fmt.patch +++ /dev/null @@ -1,65 +0,0 @@ -From 2ce255b75760a0a513fb1706629b416f76a5c822 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:16:04 -0500 -Subject: [PATCH] no third_party fmt - ---- - c10/CMakeLists.txt | 2 +- - cmake/Dependencies.cmake | 6 +++--- - torch/CMakeLists.txt | 2 +- - 3 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt -index 1f742f4c176..4fa08913bdd 100644 ---- a/c10/CMakeLists.txt -+++ b/c10/CMakeLists.txt -@@ -87,7 +87,7 @@ endif() - if(C10_USE_GLOG) - target_link_libraries(c10 PUBLIC glog::glog) - endif() --target_link_libraries(c10 PRIVATE fmt::fmt-header-only) -+target_link_libraries(c10 PRIVATE fmt) - - if(C10_USE_NUMA) - message(STATUS "NUMA paths:") -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 6f5a2d5feff..42fbf80f6e8 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1837,7 +1837,7 @@ endif() - # - set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) - set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE) --add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) -+# add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) - - # Disable compiler feature checks for `fmt`. - # -@@ -1846,9 +1846,9 @@ add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) - # CMAKE_CXX_FLAGS in ways that break feature checks. Since we already know - # `fmt` is compatible with a superset of the compilers that PyTorch is, it - # shouldn't be too bad to just disable the checks. --set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") -+# set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") - --list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) -+# list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) - set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) - - # ---[ Kineto -diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt -index 97a72eed55b..9e5014d1980 100644 ---- a/torch/CMakeLists.txt -+++ b/torch/CMakeLists.txt -@@ -80,7 +80,7 @@ set(TORCH_PYTHON_LINK_LIBRARIES - python::python - pybind::pybind11 - shm -- fmt::fmt-header-only -+ fmt - ATEN_CPU_FILES_GEN_LIB) - - if(USE_ASAN AND TARGET Sanitizer::address) --- -2.43.2 - diff --git a/0001-no-third_party-foxi.patch b/0001-no-third_party-foxi.patch deleted file mode 100644 index ba1ec40..0000000 --- a/0001-no-third_party-foxi.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 8cb61cf9282102ac225645fcc9fb4a1bb7cb15a2 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:11:55 -0500 -Subject: [PATCH] no third_party foxi - ---- - cmake/Dependencies.cmake | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 5f91f3ffab..8e1461af81 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1567,7 +1567,7 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - set_target_properties(onnx_proto PROPERTIES CXX_STANDARD 17) - endif() - endif() -- add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL) -+ # add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL) - - add_definitions(-DONNX_NAMESPACE=${ONNX_NAMESPACE}) - if(NOT USE_SYSTEM_ONNX) -@@ -1600,8 +1600,8 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - message("-- Found onnx: ${ONNX_LIBRARY} ${ONNX_PROTO_LIBRARY}") - list(APPEND Caffe2_DEPENDENCY_LIBS onnx_proto onnx) - endif() -- include_directories(${FOXI_INCLUDE_DIRS}) -- list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) -+# include_directories(${FOXI_INCLUDE_DIRS}) -+# list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) - # Recover the build shared libs option. - set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) - endif() --- -2.43.0 - diff --git a/0001-python-torch-disable-ck.patch b/0001-python-torch-disable-ck.patch deleted file mode 100644 index e8fd9c2..0000000 --- a/0001-python-torch-disable-ck.patch +++ /dev/null @@ -1,112 +0,0 @@ -From 027dad1eaed51c1172e2497da611e3267d42d2f0 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 28 Mar 2025 09:16:03 -0700 -Subject: [PATCH] python-torch: disable ck - ---- - aten/src/ATen/CMakeLists.txt | 7 +++---- - aten/src/ATen/Context.cpp | 1 + - aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- - 3 files changed, 9 insertions(+), 9 deletions(-) - -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index 085af373ec22..84808880e51c 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -134,7 +134,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu") - file(GLOB native_cuda_cpp "native/cuda/*.cpp") - file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") - file(GLOB native_cuda_linalg_cpp "native/cuda/linalg/*.cpp") --file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" "native/hip/bgemm_kernels/*.h") -+file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" ) - file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") - file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") - file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") -@@ -145,7 +145,7 @@ file(GLOB native_nested_h "native/nested/*.h") - file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") - file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") - --file(GLOB native_hip_hip "native/hip/*.hip" "native/hip/bgemm_kernels/*.hip") -+file(GLOB native_hip_hip "native/hip/*.hip" ) - file(GLOB native_hip_cpp "native/hip/*.cpp") - file(GLOB native_hip_linalg_cpp "native/hip/linalg/*.cpp") - file(GLOB native_miopen_cpp "native/miopen/*.cpp") -@@ -361,13 +361,12 @@ endif() - ${native_quantized_hip_hip} - ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} - ) -- if(WIN32) # Windows doesn't support Composable Kernels and Triton - file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") - file(GLOB native_hip_ck "native/hip/ck*.hip") - exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" - ${native_hip_bgemm} ${native_hip_ck} - ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) -- endif() -+ - # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) - list(APPEND all_hip_cpp - ${native_nested_hip_cpp} -diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp -index f598fc3a39d3..03dab6ff38fe 100644 ---- a/aten/src/ATen/Context.cpp -+++ b/aten/src/ATen/Context.cpp -@@ -355,6 +355,7 @@ at::BlasBackend Context::blasPreferredBackend() { - } - - void Context::setBlasPreferredBackend(at::BlasBackend b) { -+ return; - #ifdef _MSC_VER - TORCH_WARN_ONCE( - "torch.backends.cuda.preferred_blas_library is an experimental feature. " -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index a62b028fd4ff..cba38426ea1f 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); - #endif - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); - } -@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); - } -@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); - } -@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --- -2.48.1 - diff --git a/0001-reenable-foxi-linking.patch b/0001-reenable-foxi-linking.patch deleted file mode 100644 index 8e39795..0000000 --- a/0001-reenable-foxi-linking.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 58ccda271e8f51c3fa5b7518cf6ee52ce204fd37 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 22 Feb 2024 09:28:11 -0500 -Subject: [PATCH] reenable foxi linking - ---- - cmake/Dependencies.cmake | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 42fbf80f6e8..bc3a2dc6fee 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1604,7 +1604,7 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - list(APPEND Caffe2_DEPENDENCY_LIBS onnx_proto onnx) - endif() - # include_directories(${FOXI_INCLUDE_DIRS}) --# list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) -+ list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) - # Recover the build shared libs option. - set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) - endif() --- -2.43.2 - diff --git a/0001-silence-an-assert.patch b/0001-silence-an-assert.patch deleted file mode 100644 index 0b20dcf..0000000 --- a/0001-silence-an-assert.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 04dd33db93b852fdfd7ea408813080b2e2026650 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 06:41:20 -0500 -Subject: [PATCH] silence an assert - ---- - aten/src/ATen/native/cuda/IndexKernel.cu | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu -index 657c0c77b3..b406aa6687 100644 ---- a/aten/src/ATen/native/cuda/IndexKernel.cu -+++ b/aten/src/ATen/native/cuda/IndexKernel.cu -@@ -249,7 +249,7 @@ void index_put_kernel_quantized_cuda(TensorIterator& iter, const IntArrayRef ind - - gpu_index_kernel(iter, index_size, index_stride, [inv_scale, zero_point, qmin, qmax]C10_DEVICE(char* const out_data, const char* const in_data, const int64_t offset) { - int64_t qvalue = static_cast(zero_point + nearbyintf(*(float*)in_data * inv_scale)); -- qvalue = std::clamp(qvalue, qmin, qmax); -+ //qvalue = std::clamp(qvalue, qmin, qmax); - *(scalar_t*)(out_data + offset) = static_cast(qvalue); - }); - }); --- -2.43.0 - diff --git a/0001-torch-paper-over-c-assert.patch b/0001-torch-paper-over-c-assert.patch deleted file mode 100644 index b7e55ce..0000000 --- a/0001-torch-paper-over-c-assert.patch +++ /dev/null @@ -1,88 +0,0 @@ -From f646e0f04ae591c8f2d8a0cd24b035725c57659b Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 23 Jan 2025 08:24:22 -0800 -Subject: [PATCH] torch: paper over c++ assert - ---- - aten/src/ATen/native/sparse/FlattenIndicesCommon.h | 2 ++ - .../ATen/native/sparse/SparseBinaryOpIntersectionCommon.h | 5 +++++ - .../src/ATen/native/sparse/ValidateCompressedIndicesCommon.h | 2 ++ - 3 files changed, 9 insertions(+) - -diff --git a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h -index 0e79ed809ae6..a3cec8aaf78b 100644 ---- a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h -+++ b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h -@@ -69,11 +69,13 @@ Tensor _flatten_indices_impl(const Tensor& indices, IntArrayRef size) { - [=] FUNCAPI (int64_t nnz_idx) -> int64_t { - const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; - auto hash = static_cast(0); -+#if 0 - for (int64_t dim = 0; dim < sparse_dim; ++dim) { - const auto dim_hash_coeff = hash_coeffs[dim]; - const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; - hash += dim_index * dim_hash_coeff; - } -+#endif - return hash; - }); - } -diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h -index c0b94bf39d54..8de4900b7a01 100644 ---- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h -+++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h -@@ -279,12 +279,15 @@ void _sparse_binary_op_intersection_kernel_impl( - if (!ptr_indices) { - return hash; - } -+#if 0 -+// /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/array:219:2: error: reference to __host__ function '__glibcxx_assert_fail' in __host__ __device__ function - const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; - for (int64_t dim = 0; dim < sparse_dim; ++dim) { - const auto dim_hash_coeff = hash_coeffs[dim]; - const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; - hash += dim_index * dim_hash_coeff; - } -+#endif - return hash; - }); - } -@@ -364,6 +367,7 @@ void _sparse_binary_op_intersection_kernel_impl( - if (hash_ptr) { - hash = hash_ptr[nnz_idx]; - } else if (sparse_dim) { -+#if 0 - // Compute hash value - const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; - for (int64_t dim = 0; dim < sparse_dim; ++dim) { -@@ -371,6 +375,7 @@ void _sparse_binary_op_intersection_kernel_impl( - const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; - hash += dim_index * dim_hash_coeff; - } -+#endif - } - - // Perform hash values intersection -diff --git a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h -index ec4c084a39cc..9bc9655b0afa 100644 ---- a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h -+++ b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h -@@ -341,6 +341,7 @@ void _validate_compressed_sparse_indices_kernel( - // assuming idx contiguity per batch: - int64_t tmp = batch_idx * nnz; - // `nnz == idx_sizes[idx_ndims - 1]` is checked above as `nnz == idx.size(-1)` -+#if 0 - for (int i = idx_ndims - 1; - i >= 0 && nnz > 0; // break early when nnz==0 - i--) { -@@ -348,6 +349,7 @@ void _validate_compressed_sparse_indices_kernel( - idx_offset += (tmp - div * idx_sizes[i]) * idx_strides[i]; - tmp = div; - } -+#endif - const auto* RESTRICT ptr_idx_batch = ptr_idx + idx_offset; - _check_idx_sorted_distinct_vals_slices_with_cidx< - cdim_name, --- -2.48.1 - diff --git a/0001-use-any-hip.patch b/0001-use-any-hip.patch deleted file mode 100644 index dca86ea..0000000 --- a/0001-use-any-hip.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 4248211ce9a9de81bb3ade5d421ba709b19ead08 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 15:01:28 -0500 -Subject: [PATCH] use any hip - ---- - cmake/public/LoadHIP.cmake | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1abeb06228..28458c4146 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -30,7 +30,7 @@ endif() - message("Building PyTorch for GPU arch: ${PYTORCH_ROCM_ARCH}") - - # Add HIP to the CMAKE Module Path --set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip ${CMAKE_MODULE_PATH}) -+set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib64/cmake/hip ${CMAKE_MODULE_PATH}) - - macro(find_package_and_print_version PACKAGE_NAME) - find_package("${PACKAGE_NAME}" ${ARGN}) -@@ -38,7 +38,7 @@ macro(find_package_and_print_version PACKAGE_NAME) - endmacro() - - # Find the HIP Package --find_package_and_print_version(HIP 1.0) -+find_package_and_print_version(HIP MODULE) - - if(HIP_FOUND) - set(PYTORCH_FOUND_HIP TRUE) --- -2.43.0 - diff --git a/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch deleted file mode 100644 index 413c60d..0000000 --- a/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 20 Jul 2024 05:37:15 -0600 -Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM - -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - cmake/Dependencies.cmake | 3 ++- - 2 files changed, 3 insertions(+), 1 deletion(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index c4cd4b2c2a98..2068f7c6c4f2 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF - "USE_CUDNN" OFF) - cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) - option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) -+option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) - option(USE_KINETO "Use Kineto profiling library" ON) - option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) - option(USE_FAKELOWP "Use FakeLowp operators" OFF) -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..192dac46f13b 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -706,6 +706,7 @@ endif() - - # ---[ FBGEMM - if(USE_FBGEMM) -+ if (NOT USE_SYSTEM_FBGEMM) - set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") - if(NOT DEFINED FBGEMM_SOURCE_DIR) - set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") -@@ -746,7 +747,7 @@ if(USE_FBGEMM) - target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) - endif() - endif() -- -+ endif() - if(USE_FBGEMM) - list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) - endif() --- -2.45.1 - diff --git a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch new file mode 100644 index 0000000..1afe692 --- /dev/null +++ b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch @@ -0,0 +1,149 @@ +From 4cc5d88dfe7a45ab245648dc874645d32a24b98b Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 27 Jun 2025 13:52:51 -0700 +Subject: [PATCH] Add cmake variable USE_ROCM_CK + +--- + CMakeLists.txt | 1 + + aten/src/ATen/CMakeLists.txt | 40 ++++++++++++++++----------------- + aten/src/ATen/cuda/CUDABlas.cpp | 10 ++++----- + cmake/Dependencies.cmake | 3 +++ + 4 files changed, 29 insertions(+), 25 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 99c0b9e0ea0c..4c632e42f531 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -240,6 +240,7 @@ cmake_dependent_option( + BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON + "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) + cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) ++cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) + option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) + cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) + cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index c9cfd74b501e..59f6178218ee 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -373,26 +373,26 @@ if(USE_ROCM) + # is header only, so this should be ok, except that the CMake build generates + # a ck/config.h. We just do that part here. Without this, the ck.h from the + # ROCM SDK may get accidentally used instead. +- function(_pytorch_rocm_generate_ck_conf) +- set(CK_ENABLE_INT8 "ON") +- set(CK_ENABLE_FP16 "ON") +- set(CK_ENABLE_FP32 "ON") +- set(CK_ENABLE_FP64 "ON") +- set(CK_ENABLE_BF16 "ON") +- set(CK_ENABLE_FP8 "ON") +- set(CK_ENABLE_BF8 "ON") +- set(CK_USE_XDL "ON") +- set(CK_USE_WMMA "ON") +- configure_file( +- "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" +- "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" +- ) +- endfunction() ++# function(_pytorch_rocm_generate_ck_conf) ++# set(CK_ENABLE_INT8 "ON") ++# set(CK_ENABLE_FP16 "ON") ++# set(CK_ENABLE_FP32 "ON") ++# set(CK_ENABLE_FP64 "ON") ++# set(CK_ENABLE_BF16 "ON") ++# set(CK_ENABLE_FP8 "ON") ++# set(CK_ENABLE_BF8 "ON") ++# set(CK_USE_XDL "ON") ++# set(CK_USE_WMMA "ON") ++# configure_file( ++# "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" ++# "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" ++# ) ++# endfunction() + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) +- _pytorch_rocm_generate_ck_conf() ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) ++# _pytorch_rocm_generate_ck_conf() + + # Next two lines are needed because TunableOp uses third-party/fmt + list(APPEND ATen_HIP_INCLUDE $) +@@ -409,7 +409,7 @@ endif() + ${native_quantized_hip_hip} + ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels ++ if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index 89350a11bea7..33e5f2808057 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -752,7 +752,7 @@ template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support double gemm yet + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); + #else +@@ -1103,7 +1103,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( + void * beta_ptr = &fbeta; + _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); + GEMM_CHECK_ARGVALUES(at::Half); +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + int flag = 0; + #if USE_GEMM_FLAGS_FP16_ALT_IMPL + flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; +@@ -1270,7 +1270,7 @@ template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support double gemm yet + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); + #else +@@ -1311,7 +1311,7 @@ template <> + void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + #else +@@ -1327,7 +1327,7 @@ template <> + void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + #else +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index a93386c27f8d..be1368999d38 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1031,6 +1031,9 @@ if(USE_ROCM) + if(HIPBLASLT_VEC_EXT) + list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) + endif() ++ if(USE_ROCM_CK) ++ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) ++ endif() + list(APPEND HIP_HIPCC_FLAGS --offload-compress) + if(WIN32) + add_definitions(-DROCM_ON_WINDOWS) +-- +2.49.0 + diff --git a/next/0001-Optionally-use-hipblaslt.patch b/next/0001-Optionally-use-hipblaslt.patch deleted file mode 100644 index 1e5ca4b..0000000 --- a/next/0001-Optionally-use-hipblaslt.patch +++ /dev/null @@ -1,506 +0,0 @@ -From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 04:18:34 -0700 -Subject: [PATCH] Optionally use hipblaslt - ---- - aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ - aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ - aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- - aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- - aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- - cmake/Dependencies.cmake | 3 ++ - cmake/public/LoadHIP.cmake | 2 +- - 7 files changed, 82 insertions(+), 19 deletions(-) - -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index ce991a9bcad4..3f0d17b52778 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -14,7 +14,9 @@ - #include - - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - // until hipblas has an API to accept flags, we must use rocblas here - #include - #include -@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { - static size_t _parseChosenWorkspaceSize() { - const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); - #ifdef USE_ROCM -+#ifndef USE_HIPBLASLT -+ return 0; -+#endif - if (!val) { - // accept either env var - val = getenv("HIPBLASLT_WORKSPACE_SIZE"); -@@ -235,6 +240,7 @@ namespace at::cuda::blas { - } while (0) - - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - namespace { - // Following the pattern of CuSparseDescriptor - // Defined here for now because this is the only place cublas_lt interface is -@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< - }; - } // namespace - -- - template - inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - cudaDataType_t abcType = CUDA_R_32F; -@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - " scaleType ", - scaleType); - } -- -+#endif - - template - inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { -@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); - } - } -@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); - } - } -@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } - } -@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { - } - } - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { - // forward to bgemm implementation but set strides and batches to 0 - bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); - } -+#endif - - template - inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { -@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); - } - } -@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); - } - } -@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } - } -@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { - } - } - -- -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - void gemm_and_bias( - bool transpose_mat1, -@@ -1410,7 +1435,7 @@ void scaled_gemm( - ScalarType result_dtype, - void* amax_ptr, - bool use_fast_accum) { --#if CUDA_VERSION >= 11080 || defined(USE_ROCM) -+#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - const auto computeType = CUBLAS_COMPUTE_32F; - const auto scaleType = CUDA_R_32F; - const int8_t fastAccuMode = use_fast_accum ? 1 : 0; -@@ -1681,6 +1706,7 @@ void int8_gemm( - " scaleType ", - scaleType); - } -+#endif - - template <> - void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { -diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h -index f2b657ced51b..f0ee613c4208 100644 ---- a/aten/src/ATen/cuda/CUDAContextLight.h -+++ b/aten/src/ATen/cuda/CUDAContextLight.h -@@ -9,7 +9,9 @@ - - // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also - // added bf16 support -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - #include -+#endif - - #ifdef CUDART_VERSION - #include -@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); - /* Handles */ - TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); - TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); -+#endif - - TORCH_CUDA_CPP_API void clearCublasWorkspaces(); - -diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index 8eac525b3695..abfdf7a23847 100644 ---- a/aten/src/ATen/cuda/CublasHandlePool.cpp -+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp -@@ -29,7 +29,7 @@ namespace at::cuda { - - namespace { - --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - void createCublasLtHandle(cublasLtHandle_t *handle) { - TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); - } -@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { - return handle; - } - --cublasLtHandle_t getCurrentCUDABlasLtHandle() { - #ifdef USE_ROCM -+#if defined(USE_HIPBLASLT) -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - c10::DeviceIndex device = 0; - AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); - -@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { - - auto handle = myPoolWindow->reserve(device); - return handle; -+} -+#endif - #else -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - return reinterpret_cast(getCurrentCUDABlasHandle()); --#endif - } -+#endif - - } // namespace at::cuda -diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h -index 53e6154120c9..fa1d664696db 100644 ---- a/aten/src/ATen/cuda/tunable/TunableGemm.h -+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h -@@ -11,7 +11,9 @@ - - #include - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - #include - #endif - #include -@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class DefaultScaledGemmOp : public Callable> { - public: -@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { - return OK; - } - }; -+#endif - - template - inline bool IsZero(T v) { -@@ -191,6 +195,7 @@ static void AddRocblasValidator() { - } - } - -+#ifdef USE_HIPBLASLT - static void AddHipblasltValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - if (validators.find("HIPBLASLT_VERSION") == validators.end()) { -@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { - [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); - } - } -+#endif - - static void AddRocmValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); -@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class ScaledGemmTunableOp : public TunableOp, StreamTimer> { - public: -@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - - #if defined(USE_ROCM) -+#ifdef USE_HIPBLASLT - for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { - this->RegisterOp(std::move(name), std::move(op)); - } - AddHipblasltValidator(); -+#endif - AddRocmValidator(); - #endif - } -@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); - } - }; -+#endif - - #undef XSTRINGIFY - #undef STRINGIFY -diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 84c59a4fd0d7..56ad5de3bf2d 100644 ---- a/aten/src/ATen/native/cuda/Blas.cpp -+++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa - } - - static bool getDisableAddmmCudaLt() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); - #ifdef USE_ROCM - // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) -@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { - } - return false; - #endif -+#else -+ return true; -+#endif - } - - #ifdef USE_ROCM - static bool isSupportedHipLtROCmArch(int index) { -+#ifdef USE_HIPBLASLT - hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); - std::string device_arch = prop->gcnArchName; - static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { - } - } - TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); -+#endif - return false; - } - #endif -@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - at::ScalarType scalar_type = self.scalar_type(); - c10::MaybeOwned self_; - if (&result != &self) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) - // Strangely, if mat2 has only 1 row or column, we get - // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. -@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - scalar_type != at::ScalarType::BFloat16)); - #endif - } -+#endif - #endif - if (!useLtInterface) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - } - self__sizes = self_->sizes(); - } else { --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - useLtInterface = !disable_addmm_cuda_lt && - result.dim() == 2 && result.is_contiguous() && - isSupportedHipLtROCmArch(self.device().index()) && -@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); - - if (useLtInterface) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if defined(USE_ROCM) - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, -@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - activation_epilogue - ); - }); -+#endif - #endif - } else - { -@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { - } - - static bool _scaled_mm_allowed_device() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - auto dprops = at::cuda::getCurrentDeviceProperties(); - #ifdef USE_ROCM - std::string device_arch = dprops->gcnArchName; -@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { - #else - return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); - #endif -+#else -+ return false; -+#endif - } - - // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax -@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - // Check sizes - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); - TORCH_CHECK( -@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 - // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately - amax = at::max(at::abs(out.to(kFloat))); -+#endif - #endif - - return {out, amax}; -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..8d05e834bbc5 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1052,6 +1052,9 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - list(APPEND HIP_CXX_FLAGS -std=c++17) - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) -+ if(hipblast_FOUND) -+ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) -+ endif() - if(HIP_NEW_TYPE_ENUMS) - list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) - endif() -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index fa39156031ff..df4836847fdf 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -155,7 +155,7 @@ if(HIP_FOUND) - find_package_and_print_version(hiprand REQUIRED) - find_package_and_print_version(rocblas REQUIRED) - find_package_and_print_version(hipblas REQUIRED) -- find_package_and_print_version(hipblaslt REQUIRED) -+ find_package_and_print_version(hipblaslt) - find_package_and_print_version(miopen REQUIRED) - find_package_and_print_version(hipfft REQUIRED) - find_package_and_print_version(hipsparse REQUIRED) --- -2.45.2 - diff --git a/next/0001-disable-use-of-aotriton.patch b/next/0001-disable-use-of-aotriton.patch deleted file mode 100644 index 61ffd1e..0000000 --- a/next/0001-disable-use-of-aotriton.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 07:06:09 -0700 -Subject: [PATCH] disable use of aotriton - ---- - .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- - 1 file changed, 15 insertions(+), 2 deletions(-) - -diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -index 214b02d8262e..7b3eb9dcd8cd 100644 ---- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -@@ -19,9 +19,12 @@ - #include - #include - -+#ifdef USE_FLASH_ATTENTION - #if USE_ROCM - #include - #endif -+#endif -+ - - /** - * Note [SDPA Runtime Dispatch] -@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { - - bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { - // Check that the gpu is capable of running flash attention -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - using sm80 = SMVersion<8, 0>; - using sm90 = SMVersion<9, 0>; - #if USE_ROCM -@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug - } - #endif - return true; -+#endif - } - - bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - // Mem Efficient attention supports hardware in the range [sm_50, sm_90] - using sm50 = SMVersion<5, 0>; - using sm90 = SMVersion<9, 0>; -@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) - } - #endif - return true; -+#endif - } - - bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( -@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - #ifndef USE_FLASH_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); - return false; --#endif -+#else - - // Define gate functions that determine if a flash kernel can be ran - // Replace with std::to_array when we migrate to c++20 -@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - } - } - return true; -+#endif - } - - bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - #ifndef USE_MEM_EFF_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); - return false; --#endif -+#else - // Constraints specific to mem efficient attention - constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = - array_of(at::kHalf, at::kFloat, at::kBFloat16); -@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - } - #endif - return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); -+#endif - } - - SDPBackend select_sdp_backend(sdp_params const& kernel_params) { --- -2.45.2 - diff --git a/python-torch.spec b/python-torch.spec index 90c908c..91a82a9 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.8.0-rc3 -%global commit0 3d53a53e504089a52a149791fd33d7fc898bd055 +# v2.8.0-rc6 +%global commit0 f2b69a083d15e3d0083bb304302a3fd0b5fb8705 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250625 +%global date0 20250718 %global pypi_version 2.8.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 @@ -357,11 +357,9 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPE sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt -%if %{without gitcommit} # https://github.com/pytorch/pytorch/issues/149803 # Tries to checkout nccl -sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py -%endif +sed -i -e 's@ checkout_nccl()@ True@' tools/build_pytorch_libs.py # Disable the use of check_submodule's in the setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -541,6 +539,7 @@ export USE_SYSTEM_EIGEN_INSTALL=ON export USE_SYSTEM_ONNX=ON export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF +export USE_SYSTEM_NCCL=OFF export USE_TENSORPIPE=OFF export USE_XNNPACK=OFF export USE_XPU=OFF