diff --git a/.gitignore b/.gitignore index c424df5..cdf142f 100644 --- a/.gitignore +++ b/.gitignore @@ -19,15 +19,3 @@ /pytorch-v2.4.1.tar.gz /pytorch-v2.5.0.tar.gz /pytorch-v2.5.1.tar.gz -/pytorch-v2.7.0.tar.gz -/v2.13.6.tar.gz -/pytorch-a1cb3cc.tar.gz -/v24.12.23.tar.gz -/kineto-5e75018.tar.gz -/pytorch-v2.8.0.tar.gz -/v1.18.0.tar.gz -/pytorch-715dca6.tar.gz -/pytorch-fd36458.tar.gz -/pytorch-0fabc3b.tar.gz -/pytorch-v2.9.0.tar.gz -/pytorch-v2.9.1.tar.gz diff --git a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch new file mode 100644 index 0000000..413c60d --- /dev/null +++ b/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch @@ -0,0 +1,47 @@ +From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 20 Jul 2024 05:37:15 -0600 +Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM + +Signed-off-by: Tom Rix +--- + CMakeLists.txt | 1 + + cmake/Dependencies.cmake | 3 ++- + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index c4cd4b2c2a98..2068f7c6c4f2 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF + "USE_CUDNN" OFF) + cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) + option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) ++option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) + option(USE_KINETO "Use Kineto profiling library" ON) + option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) + option(USE_FAKELOWP "Use FakeLowp operators" OFF) +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index f1f2eb7cec31..192dac46f13b 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -706,6 +706,7 @@ endif() + + # ---[ FBGEMM + if(USE_FBGEMM) ++ if (NOT USE_SYSTEM_FBGEMM) + set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") + if(NOT DEFINED FBGEMM_SOURCE_DIR) + 
set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") +@@ -746,7 +747,7 @@ if(USE_FBGEMM) + target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) + endif() + endif() +- ++ endif() + if(USE_FBGEMM) + list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) + endif() +-- +2.45.1 + diff --git a/0001-Add-cmake-variable-USE_ROCM_CK.patch b/0001-Add-cmake-variable-USE_ROCM_CK.patch deleted file mode 100644 index 925e03b..0000000 --- a/0001-Add-cmake-variable-USE_ROCM_CK.patch +++ /dev/null @@ -1,202 +0,0 @@ -From 193854993cd939de186de19589c1add4c4b2cf66 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Mon, 21 Jul 2025 11:35:03 -0700 -Subject: [PATCH] Add cmake variable USE_ROCM_CK - ---- - CMakeLists.txt | 1 + - aten/src/ATen/CMakeLists.txt | 40 ++++++++++++++++----------------- - aten/src/ATen/cuda/CUDABlas.cpp | 22 +++++++++--------- - cmake/Dependencies.cmake | 3 +++ - 4 files changed, 35 insertions(+), 31 deletions(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index a5d25e6afa0f..afc1b53efa64 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -240,6 +240,7 @@ cmake_dependent_option( - BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON - "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) - cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) -+cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) - option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) - cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) - cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index c9cfd74b501e..59f6178218ee 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -373,26 +373,26 @@ if(USE_ROCM) - # is header only, so this should be ok, except that the CMake build generates - # a ck/config.h. We just do that part here. 
Without this, the ck.h from the - # ROCM SDK may get accidentally used instead. -- function(_pytorch_rocm_generate_ck_conf) -- set(CK_ENABLE_INT8 "ON") -- set(CK_ENABLE_FP16 "ON") -- set(CK_ENABLE_FP32 "ON") -- set(CK_ENABLE_FP64 "ON") -- set(CK_ENABLE_BF16 "ON") -- set(CK_ENABLE_FP8 "ON") -- set(CK_ENABLE_BF8 "ON") -- set(CK_USE_XDL "ON") -- set(CK_USE_WMMA "ON") -- configure_file( -- "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" -- "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" -- ) -- endfunction() -+# function(_pytorch_rocm_generate_ck_conf) -+# set(CK_ENABLE_INT8 "ON") -+# set(CK_ENABLE_FP16 "ON") -+# set(CK_ENABLE_FP32 "ON") -+# set(CK_ENABLE_FP64 "ON") -+# set(CK_ENABLE_BF16 "ON") -+# set(CK_ENABLE_FP8 "ON") -+# set(CK_ENABLE_BF8 "ON") -+# set(CK_USE_XDL "ON") -+# set(CK_USE_WMMA "ON") -+# configure_file( -+# "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" -+# "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" -+# ) -+# endfunction() - list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) -- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) -- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) -- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) -- _pytorch_rocm_generate_ck_conf() -+# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) -+# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) -+# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) -+# _pytorch_rocm_generate_ck_conf() - - # Next two lines are needed because TunableOp uses third-party/fmt - list(APPEND ATen_HIP_INCLUDE $) -@@ -409,7 +409,7 @@ endif() - ${native_quantized_hip_hip} - ${native_transformers_hip_hip} 
${native_transformers_src_hip_hip} - ) -- if(WIN32) # Windows doesn't support Composable Kernels -+ if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels - file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") - file(GLOB native_hip_ck "native/hip/ck*.hip") - exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index 89350a11bea7..e5b7960177cf 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -752,7 +752,7 @@ template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) - { - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - // hipblaslt does not support double gemm yet - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); - #else -@@ -836,7 +836,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } - } --#if defined(USE_ROCM) && !defined(_MSC_VER) -+#if defined(USE_ROCM) && defined(USE_ROCM_CK) - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -@@ -1270,14 +1270,14 @@ template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - { - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - // hipblaslt does not support double gemm yet - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); - #else - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); - #endif - } --#if defined(USE_ROCM) && !defined(_MSC_VER) -+#if defined(USE_ROCM) && defined(USE_ROCM_CK) - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); - } -@@ -1293,7 +1293,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - 
gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } --#if defined(USE_ROCM) && !defined(_MSC_VER) -+#if defined(USE_ROCM) && defined(USE_ROCM_CK) - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); -@@ -1311,7 +1311,7 @@ template <> - void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) - { - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - // hipblaslt does not support complex gemm yet - gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); - #else -@@ -1327,7 +1327,7 @@ template <> - void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) - { - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - // hipblaslt does not support complex gemm yet - gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); - #else -@@ -1345,7 +1345,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } --#if defined(USE_ROCM) && !defined(_MSC_VER) -+#if defined(USE_ROCM) && defined(USE_ROCM_CK) - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); - } -@@ -1361,7 +1361,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --#if defined(USE_ROCM) && !defined(_MSC_VER) -+#if defined(USE_ROCM) && defined(USE_ROCM_CK) - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -@@ -1382,7 +1382,7 @@ void 
gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } --#if defined(USE_ROCM) && !defined(_MSC_VER) -+#if defined(USE_ROCM) && defined(USE_ROCM_CK) - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); - } -@@ -1398,7 +1398,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::B - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --#if defined(USE_ROCM) && !defined(_MSC_VER) -+#if defined(USE_ROCM) && defined(USE_ROCM_CK) - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); - } -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index a93386c27f8d..be1368999d38 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1031,6 +1031,9 @@ if(USE_ROCM) - if(HIPBLASLT_VEC_EXT) - list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) - endif() -+ if(USE_ROCM_CK) -+ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) -+ endif() - list(APPEND HIP_HIPCC_FLAGS --offload-compress) - if(WIN32) - add_definitions(-DROCM_ON_WINDOWS) --- -2.49.0 - diff --git a/0001-Changes-to-compile-with-3.13-126033.patch b/0001-Changes-to-compile-with-3.13-126033.patch new file mode 100644 index 0000000..ddc0dcf --- /dev/null +++ b/0001-Changes-to-compile-with-3.13-126033.patch @@ -0,0 +1,222 @@ +From 655a06444b261cb28e71a0973c0ab67aaa8261ab Mon Sep 17 00:00:00 2001 +From: albanD +Date: Tue, 14 May 2024 02:14:53 +0000 +Subject: [PATCH] Changes to compile with 3.13 (#126033) + +This is mainly: +- Fix refcount access macro +- Hide all the Dynamo code that needs update as usual +- Add _PyWeakref_ClearRef as an 
extern provided by CPython. Including the pycore header that defines it would require raw c include shenanigans that I don't think are worth it. +This allows to build both with regular and nogil version of cpython. Both + +Note that this requires the 3.13 branch at least past [d3094744d40de2deefbda9b1996d5029c9ebf0b0](https://github.com/python/cpython/commit/d3094744d40de2deefbda9b1996d5029c9ebf0b0) which we need for mimalloc include and weakref function being exposed. + +debug-only issues in pybind11 with PyMem_MALLOC vs PyObject_MALLOC being should be synced either by updating pybind or cpython. @colesbury I can send a PR to ifdef the proper use in pybind if you think that this is the best solution here? + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/126033 +Approved by: https://github.com/colesbury +--- + torch/csrc/Storage.cpp | 2 +- + torch/csrc/autograd/python_variable.cpp | 2 +- + torch/csrc/dynamo/cpython_defs.c | 15 +++++- + torch/csrc/dynamo/cpython_defs.h | 2 + + torch/csrc/dynamo/eval_frame.c | 67 ++++++++++++++++++------- + torch/csrc/utils/python_compat.h | 4 ++ + 6 files changed, 70 insertions(+), 22 deletions(-) + +diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp +index 93dbc9c09bb2..b22bbac35981 100644 +--- a/torch/csrc/Storage.cpp ++++ b/torch/csrc/Storage.cpp +@@ -236,7 +236,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { + if (type->tp_del) { + PyObject_GC_Track(self); + type->tp_del(self); +- if (self->ob_refcnt > 0) { ++ if (Py_REFCNT(self) > 0) { + // Resurrected (see above comment about resurrection from `__del__`) + return; + } +diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp +index 9e85f0026b35..8fd1129da63c 100644 +--- a/torch/csrc/autograd/python_variable.cpp ++++ b/torch/csrc/autograd/python_variable.cpp +@@ -1910,7 +1910,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { + if (type->tp_del) { + PyObject_GC_Track(self); + 
type->tp_del(self); +- if (self->ob_refcnt > 0) { ++ if (Py_REFCNT(self) > 0) { + /* Resurrected */ + return; + } +diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c +index 4a1dba63009a..5e0945a052ae 100644 +--- a/torch/csrc/dynamo/cpython_defs.c ++++ b/torch/csrc/dynamo/cpython_defs.c +@@ -13,6 +13,17 @@ + } else { \ + } + ++#if IS_PYTHON_3_13_PLUS ++// Gave up after fixing a few of these ++// pycore_opcode.h is gone (new is pycore_opcode_metadata.h ?) ++// f_code is gone (new is f_executable?) ++ ++// Fake definitions for what we removed ++const uint8_t* THP_PyOpcode_Caches = NULL; ++const int THP_PyOpcode_Caches_size = 0; ++ ++#else ++ + // NOTE: all `assert`s below are converted to `CHECK`s + + #if IS_PYTHON_3_11_PLUS +@@ -29,8 +40,8 @@ + #define NEED_OPCODE_TABLES // To get _PyOpcode_Deopt + #include + #undef NEED_OPCODE_TABLES +-#undef Py_BUILD_CORE + #include ++#undef Py_BUILD_CORE + + // As a simple way to reduce the impact of ABI changes on the CPython side, this check forces + // us to manually re-check that the function didn't change on the next major version +@@ -364,3 +375,5 @@ THP_PyFrame_Clear(_PyInterpreterFrame *frame) + } + + #endif ++ ++#endif // CPython 3.13 +\ No newline at end of file +diff --git a/torch/csrc/dynamo/cpython_defs.h b/torch/csrc/dynamo/cpython_defs.h +index a897c3e6c6e7..3b6c9667f8c9 100644 +--- a/torch/csrc/dynamo/cpython_defs.h ++++ b/torch/csrc/dynamo/cpython_defs.h +@@ -8,7 +8,9 @@ + + #if IS_PYTHON_3_11_PLUS + ++#define Py_BUILD_CORE + #include ++#undef Py_BUILD_CORE + + int THP_PyFrame_FastToLocalsWithError( + _PyInterpreterFrame* frame, +diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c +index c286e821f09d..e13cb5af2a0e 100644 +--- a/torch/csrc/dynamo/eval_frame.c ++++ b/torch/csrc/dynamo/eval_frame.c +@@ -8,6 +8,31 @@ + #include + #include + ++ ++ ++PyObject* guard_error_hook = NULL; ++const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; ++ ++static 
int active_dynamo_threads = 0; ++ ++static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; ++ ++inline static PyObject* eval_frame_callback_get(void) { ++ void* result = PyThread_tss_get(&eval_frame_callback_key); ++ if (unlikely(result == NULL)) { ++ return (PyObject*)Py_None; ++ } else { ++ return (PyObject*)result; ++ } ++} ++ ++inline static void eval_frame_callback_set(PyObject* obj) { ++ PyThread_tss_set(&eval_frame_callback_key, obj); ++} ++ ++// 3.13 Not supported at all. See cpython_defs.c for hints ++#if !(IS_PYTHON_3_13_PLUS) ++ + // Problem in CPython includes when mixing core and non-core build + // The fix was not backported to 3.12 so this is needed here + // https://github.com/python/cpython/issues/105268 +@@ -138,24 +163,6 @@ THP_PyFrame_FastToLocalsWithError(THP_EVAL_API_FRAME_OBJECT *frame, int *free_va + } + #endif + +-PyObject* guard_error_hook = NULL; +-const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; +- +-static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; +- +-inline static PyObject* eval_frame_callback_get(void) { +- void* result = PyThread_tss_get(&eval_frame_callback_key); +- if (unlikely(result == NULL)) { +- return (PyObject*)Py_None; +- } else { +- return (PyObject*)result; +- } +-} +- +-inline static void eval_frame_callback_set(PyObject* obj) { +- PyThread_tss_set(&eval_frame_callback_key, obj); +-} +- + static PyObject* _custom_eval_frame_shim( + PyThreadState* tstate, + THP_EVAL_API_FRAME_OBJECT* frame, +@@ -627,7 +634,29 @@ static PyObject* _custom_eval_frame( + } + } + +-static int active_dynamo_threads = 0; ++#else // IS_PYTHON_3_13_PLUS ++ ++// Fake definitions for everything we removed ++ ++typedef struct THPPyInterpreterFrame { ++ PyObject_HEAD ++ _PyInterpreterFrame* frame; // Borrowed reference ++} THPPyInterpreterFrame; ++ ++inline static void enable_eval_frame_shim(PyThreadState* tstate) {} ++inline static void enable_eval_frame_default(PyThreadState* tstate) {} ++ ++static struct 
PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; ++ ++static PyTypeObject THPPyInterpreterFrameType = { ++ PyVarObject_HEAD_INIT(NULL, 0) ++ .tp_name = "torch._C.dynamo.eval_frame._PyInterpreterFrame", ++ .tp_basicsize = sizeof(THPPyInterpreterFrame), ++ .tp_flags = Py_TPFLAGS_DEFAULT, ++ .tp_getset = THPPyInterpreterFrame_properties, ++}; ++ ++#endif // CPython 3.13 + + static PyObject* increment_working_threads(PyThreadState* tstate) { + active_dynamo_threads = active_dynamo_threads + 1; +diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h +index 73b991cf3fbf..b060db00db73 100644 +--- a/torch/csrc/utils/python_compat.h ++++ b/torch/csrc/utils/python_compat.h +@@ -11,6 +11,7 @@ extern "C" { + + #define IS_PYTHON_3_11_PLUS PY_VERSION_HEX >= 0x030B00C1 + #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 ++#define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 + + PYCAPI_COMPAT_STATIC_INLINE(int) + PyCode_GetNCellvars(PyCodeObject* code) { +@@ -32,6 +33,9 @@ PyCode_GetNFreevars(PyCodeObject* code) { + #endif + } + ++// Provided by CPython but getting the header for them is very hard ++extern void _PyWeakref_ClearRef(PyWeakReference* self); ++ + #ifdef __cplusplus + } + #endif +-- +2.45.1 + diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch new file mode 100644 index 0000000..562f55b --- /dev/null +++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch @@ -0,0 +1,910 @@ +From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 +From: Xu Han +Date: Sun, 31 Mar 2024 03:07:32 +0000 +Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] + (#118980) + +Enable VEC on Windows OS. +1. Fix some type defination gap between Windows and Linux. +2. Fix some operator not support on Windows, such as [], /. +3. Enable static sleef library build on Windows. +4. 
Disable unsupported function overloading on MSVC. +5. Upgrade submodule sleef lib, which fixed build issue on Windows. +6. Fixed bazel build issues. +7. Fix test app not link to sleef on Windows. + +Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: +```cmd +git submodule sync +git submodule update --init --recursive +``` + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 +Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet +--- + aten/src/ATen/CMakeLists.txt | 48 ++++++-------- + aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- + .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- + .../cpu/vec/vec256/vec256_complex_double.h | 7 +- + .../cpu/vec/vec256/vec256_complex_float.h | 7 +- + aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- + aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- + aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- + aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- + .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- + .../cpu/vec/vec512/vec512_complex_double.h | 7 +- + .../cpu/vec/vec512/vec512_complex_float.h | 7 +- + aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- + aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- + aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- + aten/src/ATen/cpu/vec/vec_base.h | 6 ++ + caffe2/CMakeLists.txt | 2 +- + third_party/sleef.BUILD | 3 +- + 18 files changed, 194 insertions(+), 93 deletions(-) + +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index bf425af5fa9..58d5828e8ca 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") + list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) + endif() + +-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) +- # Preserve values for the main build +- set(__aten_sleef_build_shared_libs 
${BUILD_SHARED_LIBS}) +- set(__aten_sleef_build_tests ${BUILD_TESTS}) +- +- # Unset our restrictive C++ flags here and reset them later. +- # Remove this once we use proper target_compile_options. +- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) +- set(CMAKE_CXX_FLAGS) +- +- # Bump up optimization level for sleef to -O1, since at -O0 the compiler +- # excessively spills intermediate vector registers to the stack +- # and makes things run impossibly slowly +- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) +- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") +- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) +- else() +- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") ++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) ++ if(NOT MSVC) ++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler ++ # excessively spills intermediate vector registers to the stack ++ # and makes things run impossibly slowly ++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) ++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") ++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) ++ else() ++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") ++ endif() + endif() + + if(NOT USE_SYSTEM_SLEEF) +- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) +- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) +- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) +- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) +- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) ++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) ++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) ++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) ++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) ++ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." 
FORCE) + if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) +@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) + endif() + list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) + +- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) +- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) +- +- # Set these back. TODO: Use SLEEF_ to pass these instead +- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) +- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) ++ if(NOT MSVC) ++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) ++ endif() + endif() + + if(USE_CUDA AND NOT USE_ROCM) +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h +index 800b027e469..c431fa3c605 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h +@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { + } + + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- ++#ifndef _MSC_VER ++// MSVC is not working well on complex function overload. + template + std::enable_if_t> + inline gather(const double* base_addr, const Vectorized& vindex) { +@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { + return _mm256_i32gather_ps(base_addr, vindex, scale); + } +- ++#endif + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- ++#ifndef _MSC_VER ++// MSVC is not working well on complex function overload. 
+ template + std::enable_if_t> + inline mask_gather(const Vectorized& src, const double* base_addr, +@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, + const Vectorized& vindex, Vectorized& mask) { + return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); + } +- ++#endif + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + // Only works for inputs in the range: [-2^51, 2^51] +@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { + return flip8(v); + } + +-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#endif // (defined(CPU_CAPABILITY_AVX2) + + }} // namepsace at::vec::CPU_CAPABILITY +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +index 3e26213d6d2..66557436c70 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +@@ -7,7 +7,8 @@ + #include + #include + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -18,7 +19,18 @@ namespace at::vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) ++ ++#ifndef SLEEF_CONST ++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) ++#define SLEEF_CONST const ++#else ++#define SLEEF_CONST ++#endif ++#define SLEEF_CONST_OLD SLEEF_CONST ++#else ++#define SLEEF_CONST_OLD ++#endif + + // bfloat16 conversion + static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { +@@ -265,7 +277,8 @@ public: + } + return b; + } +- Vectorized map(const __m256 (*const vop)(__m256)) const { ++ ++ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + const auto o1 = vop(lo); +@@ -1026,7 +1039,7 @@ inline 
Vectorized convert_float_##name(const Vectorized& a, const V + CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); + CONVERT_VECTORIZED_INIT(Half, half); + +-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#else // defined(CPU_CAPABILITY_AVX2) + + #define CONVERT_NON_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ +@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V + CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); + CONVERT_NON_VECTORIZED_INIT(Half, half); + +-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#endif // defined(CPU_CAPABILITY_AVX2) + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + #define LOAD_FP32_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ + auto values = _mm_loadu_si128(reinterpret_cast(data)); \ +@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec + LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); + LOAD_FP32_VECTORIZED_INIT(Half, fp16); + +-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#else // defined(CPU_CAPABILITY_AVX2) + #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ + __at_align__ float values[Vectorized::size()]; \ +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +index f93ea1e63c3..6c198fb37d3 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +@@ -8,7 +8,8 @@ + #include + #include + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -16,7 +17,7 @@ namespace at::vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY 
{ + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + + template <> class Vectorized> { + private: +@@ -145,7 +146,7 @@ public: + auto abs = abs_(); + auto zero = _mm256_setzero_pd(); + auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); +- auto div = values / abs; ++ auto div = _mm256_div_pd(values, abs); + return _mm256_blendv_pd(div, zero, mask); + } + __m256d real_() const { +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +index 7c142c04b79..c72d4d49274 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +@@ -7,7 +7,8 @@ + #include + #include + #include +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -15,7 +16,7 @@ namespace at::vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + + template <> class Vectorized> { + private: +@@ -180,7 +181,7 @@ public: + auto abs = abs_(); + auto zero = _mm256_setzero_ps(); + auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); +- auto div = values / abs; ++ auto div = _mm256_div_ps(values, abs); + return _mm256_blendv_ps(div, zero, mask); + } + __m256 real_() const { +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h +index bc82d07edd1..bed6da627af 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h +@@ -6,7 +6,8 @@ + #include + #include + #include +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -15,7 +16,7 @@ namespace at::vec { + inline namespace CPU_CAPABILITY { + + +-#if defined(CPU_CAPABILITY_AVX2) && 
!defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + + template <> class Vectorized { + private: +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h +index 886809a0b8a..0e3664cd37b 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h +@@ -6,7 +6,8 @@ + #include + #include + #include +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -14,7 +15,7 @@ namespace at::vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + + template <> class Vectorized { + private: +@@ -226,14 +227,14 @@ public: + static __m256 vec_factorial_5 = + _mm256_set1_ps(0.00828929059f); // 1/factorial(5) + static __m256 vec_exp_log2ef = +- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) ++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) + static __m256 vec_half = _mm256_set1_ps(0.5f); + static __m256 vec_one = _mm256_set1_ps(1.f); + static __m256 vec_zero = _mm256_set1_ps(0.f); + static __m256 vec_two = _mm256_set1_ps(2.f); +- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) +- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50); +- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); ++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) ++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); ++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); + static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); + static int n_mantissa_bits = 23; + +@@ -266,7 +267,7 @@ public: + auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); + auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); + 
vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); +- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; ++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); + vec_two_pow_n = + _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); + +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +index 4128841701a..85e099904cd 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +@@ -41,11 +41,17 @@ + namespace at::vec { + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + ++#ifdef _MSC_VER ++__declspec(align(64)) struct Vectorizedqi { ++ protected: ++ __m256i vals; ++#else + struct Vectorizedqi { + protected: + __m256i vals __attribute__((aligned(64))); ++#endif + + public: + Vectorizedqi() {} +@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { + } + + template +-inline void __attribute__((always_inline)) QuantizeAvx2( ++__FORCE_INLINE void QuantizeAvx2( + const float* src, + T* dst, + int len, +@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V + return a.maximum(b); + } + +-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#endif // if defined(CPU_CAPABILITY_AVX2) + }} // namespace at::vec::CPU_CAPABILITY +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h +index fe96d123e64..87f723d782c 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h +@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { + } + + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) + } + + // 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- ++#ifndef _MSC_VER ++// MSVC is not working well on complex function overload. + template + std::enable_if_t> + inline gather(const double* base_addr, const Vectorized& vindex) { +@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { + return _mm512_i32gather_ps(vindex, base_addr, scale); + } +- ++#endif + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- ++#ifndef _MSC_VER ++// MSVC is not working well on complex function overload. + template + std::enable_if_t> + inline mask_gather(const Vectorized& src, const double* base_addr, +@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, + auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); + return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); + } +- ++#endif + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + template<> +@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { + return flip8(v); + } + +-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#endif // defined(CPU_CAPABILITY_AVX512) + + }}} +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +index f9fc92d52bf..eb3b6a72240 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +@@ -7,7 +7,8 @@ + #include + #include + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -16,7 +17,18 @@ namespace vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) ++ ++#ifndef SLEEF_CONST ++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) ++#define SLEEF_CONST const ++#else ++#define 
SLEEF_CONST ++#endif ++#define SLEEF_CONST_OLD SLEEF_CONST ++#else ++#define SLEEF_CONST_OLD ++#endif + + // bfloat16 conversion + static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { +@@ -362,7 +374,8 @@ public: + } + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wignored-qualifiers" +- Vectorized map(const __m512 (*const vop)(__m512)) const { ++ ++ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + const auto o1 = vop(lo); +@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V + CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); + CONVERT_VECTORIZED_INIT(Half, half); + +-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#else //defined(CPU_CAPABILITY_AVX512) + + #define CONVERT_NON_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ +@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V + CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); + CONVERT_NON_VECTORIZED_INIT(Half, half); + +-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#endif // defined(CPU_CAPABILITY_AVX512) + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + #define LOAD_FP32_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ + auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ +@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec + LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); + LOAD_FP32_VECTORIZED_INIT(Half, fp16); + +-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#else // defined(CPU_CAPABILITY_AVX512) + #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ + __at_align__ float values[Vectorized::size()]; 
\ +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +index 02aa3a87cc1..c35204f9da2 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +@@ -7,7 +7,8 @@ + #include + #include + #include +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -16,7 +17,7 @@ namespace vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + + template <> class Vectorized> { + private: +@@ -203,7 +204,7 @@ public: + auto abs = abs_(); + auto zero = _mm512_setzero_pd(); + auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); +- auto div = values / abs; ++ auto div = _mm512_div_pd(values, abs); + return _mm512_mask_blend_pd(mask, div, zero); + } + __m512d real_() const { +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +index a5d790c98b2..2801e484d94 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +@@ -7,7 +7,8 @@ + #include + #include + #include +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -16,7 +17,7 @@ namespace vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + + template <> class Vectorized> { + private: +@@ -708,7 +709,7 @@ public: + auto abs = abs_(); + auto zero = _mm512_setzero_ps(); + auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); +- auto div = values / abs; ++ auto div = _mm512_div_ps(values, abs); + return 
_mm512_mask_blend_ps(mask, div, zero); + } + __m512 real_() const { +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h +index 27b2753c903..508ab257e60 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h +@@ -6,7 +6,8 @@ + #include + #include + #include +-#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) ++#if (defined(CPU_CAPABILITY_AVX512)) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -15,7 +16,7 @@ namespace vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + + template <> class Vectorized { + private: +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h +index ba5738687fd..a08df3c141a 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h +@@ -6,7 +6,8 @@ + #include + #include + #include +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -15,7 +16,7 @@ namespace vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + + template <> class Vectorized { + private: +@@ -246,14 +247,14 @@ public: + static __m512 vec_factorial_5 = + _mm512_set1_ps(0.00828929059f); // 1/factorial(5) + static __m512 vec_exp_log2ef = +- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) ++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) + static __m512 vec_half = _mm512_set1_ps(0.5f); + static __m512 vec_one = _mm512_set1_ps(1.f); + static __m512 vec_zero = _mm512_set1_ps(0.f); + static __m512 vec_two = _mm512_set1_ps(2.f); +- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // 
ln(2) +- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); +- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); ++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) ++ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); ++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); + static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); + static int n_mantissa_bits = 23; + +@@ -288,7 +289,7 @@ public: + auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); + auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); + vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); +- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; ++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); + vec_two_pow_n = + _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); + +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +index e0713d01312..a5671ed4a50 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +@@ -42,11 +42,17 @@ namespace at { + namespace vec { + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + ++#ifdef _MSC_VER ++__declspec(align(64)) struct Vectorizedqi { ++ protected: ++ __m512i vals; ++#else + struct Vectorizedqi { + protected: + __m512i vals __attribute__((aligned(64))); ++#endif + + public: + Vectorizedqi() {} +@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { + } + + template +-inline void __attribute__((always_inline)) QuantizeAvx512( ++__FORCE_INLINE void QuantizeAvx512( + const float* src, + T* dst, + int len, +@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { + Vectorized scale, + Vectorized zero_point, + Vectorized scale_neg_zp_premul) const { ++ #if 
defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); ++ #else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); ++ #endif + + __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); + __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); +@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); ++ #else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); ++ #endif + + __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); + __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); +@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { + } + + int_vec_return_type widening_subtract(Vectorized b) const { ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], 
vals.m512i_u64[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); ++ #else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); ++ #endif + + __m512i int32_val0 = cvtepi8_epi32(int_val0); + __m512i int32_val1 = cvtepi8_epi32(int_val1); + __m512i int32_val2 = cvtepi8_epi32(int_val2); + __m512i int32_val3 = cvtepi8_epi32(int_val3); + ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); ++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); ++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); ++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); ++ #else + __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); ++ #endif + + __m512i int32_b0 = cvtepi8_epi32(int_b0); + __m512i int32_b1 = cvtepi8_epi32(int_b1); +@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul) const { ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); ++ #else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); ++ #endif + + __m512 float_val0 = 
_mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); + __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); +@@ -746,10 +787,17 @@ struct Vectorized : public Vectorizedqi { + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); ++ #else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); ++ #endif + + __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); + __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); +@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { + } + + int_vec_return_type widening_subtract(Vectorized b) const { ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); ++ #else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); ++ #endif + + __m512i int32_val0 = cvtepu8_epi32(int_val0); + __m512i int32_val1 = cvtepu8_epi32(int_val1); + __m512i int32_val2 = cvtepu8_epi32(int_val2); + __m512i int32_val3 = cvtepu8_epi32(int_val3); + ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_b0 = 
_mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); ++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); ++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); ++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); ++ #else + __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); ++ #endif + + __m512i int32_b0 = cvtepu8_epi32(int_b0); + __m512i int32_b1 = cvtepu8_epi32(int_b1); +diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h +index adf81dd915c..20cb8ef6dbc 100644 +--- a/aten/src/ATen/cpu/vec/vec_base.h ++++ b/aten/src/ATen/cpu/vec/vec_base.h +@@ -36,6 +36,12 @@ + #include + #include + ++#if defined(__GNUC__) ++#define __FORCE_INLINE __attribute__((always_inline)) inline ++#elif defined(_MSC_VER) ++#define __FORCE_INLINE __forceinline ++#endif ++ + // These macros helped us unify vec_base.h + #ifdef CPU_CAPABILITY_AVX512 + #if defined(__GNUC__) +diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt +index a6b6f0f7d1d..15d37cf4861 100644 +--- a/caffe2/CMakeLists.txt ++++ b/caffe2/CMakeLists.txt +@@ -1787,7 +1787,7 @@ if(BUILD_TEST) + endif() + else() + add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") +- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) ++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) + endif() + target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) + target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) +diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD +index 573f9c5b54a..f22a6e905e2 100644 +--- a/third_party/sleef.BUILD ++++ b/third_party/sleef.BUILD +@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ + SLEEF_PRIVATE_INCLUDES = [ + 
"-Iexternal/sleef/src/arch", + "-Iexternal/sleef/src/common", ++ "-Iexternal/sleef/src/libm", + ] + + SLEEF_PUBLIC_INCLUDES = [ +@@ -201,8 +202,6 @@ cc_library( + srcs = [ + "src/libm/rempitab.c", + "src/libm/sleefdp.c", +- "src/libm/sleefld.c", +- "src/libm/sleefqp.c", + "src/libm/sleefsp.c", + ], + hdrs = SLEEF_PUBLIC_HEADERS, +-- +2.45.1 + diff --git a/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch b/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch deleted file mode 100644 index b6a282c..0000000 --- a/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch +++ /dev/null @@ -1,359 +0,0 @@ -From f2a544b2e3a5bdc04985f6e06223c0c1700120a0 Mon Sep 17 00:00:00 2001 -From: albanD -Date: Sat, 12 Jul 2025 03:42:33 -0400 -Subject: [PATCH] Fix compilation and "import torch" issues for cpython 3.14 - -Imported from -https://github.com/albanD/pytorch/tree/cpython314_build -commit 88bb9cdb72449f4277829e20d94ad8aec1894216 - -Signed-off-by: Tom Rix ---- - torch/_dynamo/bytecode_analysis.py | 2 +- - torch/ao/quantization/__init__.py | 5 +++- - torch/ao/quantization/qconfig.py | 4 ++- - torch/ao/quantization/utils.py | 7 +++-- - torch/csrc/dynamo/cpython_defs.c | 16 +++++++++++ - torch/csrc/dynamo/cpython_includes.h | 17 ++++++++++++ - torch/csrc/dynamo/eval_frame.c | 34 +++++++++++++++-------- - torch/csrc/dynamo/framelocals_mapping.cpp | 14 ++++++++++ - torch/csrc/utils/python_compat.h | 1 + - torch/onnx/__init__.py | 1 - - torch/utils/weak.py | 29 +++++++++++++++++-- - 11 files changed, 111 insertions(+), 19 deletions(-) - -diff --git a/torch/_dynamo/bytecode_analysis.py b/torch/_dynamo/bytecode_analysis.py -index 3252ea91409f..2de74ee5bf8d 100644 ---- a/torch/_dynamo/bytecode_analysis.py -+++ b/torch/_dynamo/bytecode_analysis.py -@@ -33,7 +33,7 @@ if sys.version_info >= (3, 11): - TERMINAL_OPCODES.add(dis.opmap["JUMP_FORWARD"]) - else: - TERMINAL_OPCODES.add(dis.opmap["JUMP_ABSOLUTE"]) --if sys.version_info >= (3, 12): -+if (3, 12) <= 
sys.version_info < (3, 14): - TERMINAL_OPCODES.add(dis.opmap["RETURN_CONST"]) - if sys.version_info >= (3, 13): - TERMINAL_OPCODES.add(dis.opmap["JUMP_BACKWARD_NO_INTERRUPT"]) -diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py -index ffc1792fd23f..cf5a8b99a894 100644 ---- a/torch/ao/quantization/__init__.py -+++ b/torch/ao/quantization/__init__.py -@@ -1,5 +1,6 @@ - # mypy: allow-untyped-defs - -+import sys - from typing import Callable, Optional, Union - - import torch -@@ -33,7 +34,9 @@ from .stubs import * # noqa: F403 - - # ensure __module__ is set correctly for public APIs - ObserverOrFakeQuantize = Union[ObserverBase, FakeQuantizeBase] --ObserverOrFakeQuantize.__module__ = "torch.ao.quantization" -+if sys.version_info < (3, 14): -+ ObserverOrFakeQuantize.__module__ = "torch.ao.quantization" -+ - for _f in [ - compare_results, - extract_results_from_loggers, -diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py -index efee5302ad42..d9a8fc78bab4 100644 ---- a/torch/ao/quantization/qconfig.py -+++ b/torch/ao/quantization/qconfig.py -@@ -1,5 +1,6 @@ - # mypy: allow-untyped-defs - import copy -+import sys - import warnings - from collections import namedtuple - from typing import Any, Optional, Union -@@ -568,7 +569,8 @@ def _assert_valid_qconfig(qconfig: Optional[QConfig], mod: torch.nn.Module) -> N - - - QConfigAny = Optional[QConfig] --QConfigAny.__module__ = "torch.ao.quantization.qconfig" -+if sys.version_info < (3, 14): -+ QConfigAny.__module__ = "torch.ao.quantization.qconfig" - - - def _add_module_to_qconfig_obs_ctr( -diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py -index 4ac3112ec072..3b1503e01701 100644 ---- a/torch/ao/quantization/utils.py -+++ b/torch/ao/quantization/utils.py -@@ -4,6 +4,7 @@ Utils shared by different modes of quantization (eager/graph) - """ - - import functools -+import sys - import warnings - from collections import OrderedDict - from inspect 
import getfullargspec, signature -@@ -16,7 +17,8 @@ from torch.nn.utils.parametrize import is_parametrized - - - NodePattern = Union[tuple[Node, Node], tuple[Node, tuple[Node, Node]], Any] --NodePattern.__module__ = "torch.ao.quantization.utils" -+if sys.version_info < (3, 14): -+ NodePattern.__module__ = "torch.ao.quantization.utils" - - # This is the Quantizer class instance from torch/quantization/fx/quantize.py. - # Define separately to prevent circular imports. -@@ -31,7 +33,8 @@ QuantizerCls = Any - Pattern = Union[ - Callable, tuple[Callable, Callable], tuple[Callable, tuple[Callable, Callable]], Any - ] --Pattern.__module__ = "torch.ao.quantization.utils" -+if sys.version_info < (3, 14): -+ Pattern.__module__ = "torch.ao.quantization.utils" - - - # TODO: maybe rename this to MatchInputNode -diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c -index b68ef894aeaa..244d4165d5e8 100644 ---- a/torch/csrc/dynamo/cpython_defs.c -+++ b/torch/csrc/dynamo/cpython_defs.c -@@ -2,6 +2,20 @@ - #include - #include - -+#if IS_PYTHON_3_14_PLUS -+ -+const uint8_t* THP_PyOpcode_Caches = NULL; -+const int THP_PyOpcode_Caches_size = 0; -+ -+void -+THP_PyThreadState_PopFrame(PyThreadState *tstate, _PyInterpreterFrame * frame) -+{} -+void -+THP_PyFrame_Clear(_PyInterpreterFrame *frame) -+{} -+ -+#else -+ - #if IS_PYTHON_3_11_PLUS - - #define Py_BUILD_CORE -@@ -360,3 +374,5 @@ const uint8_t* THP_PyOpcode_Caches = NULL; - const int THP_PyOpcode_Caches_size = 0; - - #endif -+ -+#endif // IS_PYTHON_3_14_PLUS -\ No newline at end of file -diff --git a/torch/csrc/dynamo/cpython_includes.h b/torch/csrc/dynamo/cpython_includes.h -index 6b99c1d5aec8..616be16563cf 100644 ---- a/torch/csrc/dynamo/cpython_includes.h -+++ b/torch/csrc/dynamo/cpython_includes.h -@@ -21,6 +21,14 @@ - - #if IS_PYTHON_3_11_PLUS - #include -+#if IS_PYTHON_3_14_PLUS -+#include -+#include -+#endif -+#endif -+ -+#if IS_PYTHON_3_14_PLUS -+#include - #endif - - #undef Py_BUILD_CORE -@@ 
-30,6 +38,13 @@ - extern "C" { - #endif - -+#if IS_PYTHON_3_14_PLUS -+ -+#define F_CODE(x) (PyCodeObject*)PyStackRef_AsPyObjectBorrow(x->f_executable) -+#define PREV_INSTR(x) (x)->instr_ptr -+ -+#else -+ - #if IS_PYTHON_3_13_PLUS - #define F_CODE(x) ((PyCodeObject*)(x)->f_executable) - #define PREV_INSTR(x) (x)->instr_ptr -@@ -38,6 +53,8 @@ extern "C" { - #define PREV_INSTR(x) (x)->prev_instr - #endif - -+#endif // IS_PYTHON_3_14_PLUS -+ - #if IS_PYTHON_3_12_PLUS - #define FUNC(x) ((x)->f_funcobj) - #else -diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c -index f413782b2d30..72bb8839bac3 100644 ---- a/torch/csrc/dynamo/eval_frame.c -+++ b/torch/csrc/dynamo/eval_frame.c -@@ -224,17 +224,6 @@ const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) { - return PyUnicode_AsUTF8(F_CODE(frame)->co_name); - } - --void clear_old_frame_if_python_312_plus( -- PyThreadState* tstate, -- THP_EVAL_API_FRAME_OBJECT* frame) { --#if IS_PYTHON_3_12_PLUS -- -- THP_PyFrame_Clear(frame); -- THP_PyThreadState_PopFrame(tstate, frame); -- --#endif --} -- - static PyObject* dynamo_eval_custom_code_impl( - PyThreadState* tstate, - THP_EVAL_API_FRAME_OBJECT* frame, -@@ -485,6 +474,18 @@ static PyObject* dynamo__custom_eval_frame_shim( - - static void enable_eval_frame_shim(PyThreadState* tstate) {} - static void enable_eval_frame_default(PyThreadState* tstate) {} -+PyObject* dynamo_eval_custom_code( -+ PyThreadState* tstate, -+ THP_EVAL_API_FRAME_OBJECT* frame, -+ PyCodeObject* code, -+ const char* trace_annotation, -+ int throw_flag) {} -+THPPyInterpreterFrame* THPPyInterpreterFrame_New( -+ THP_EVAL_API_FRAME_OBJECT* frame) {} -+PyObject* dynamo_eval_frame_default( -+ PyThreadState* tstate, -+ THP_EVAL_API_FRAME_OBJECT* frame, -+ int throw_flag) {} - - static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; - -@@ -498,6 +499,17 @@ static PyTypeObject THPPyInterpreterFrameType = { - - #endif // !(IS_PYTHON_3_14_PLUS) - -+void 
clear_old_frame_if_python_312_plus( -+ PyThreadState* tstate, -+ THP_EVAL_API_FRAME_OBJECT* frame) { -+#if IS_PYTHON_3_12_PLUS -+ -+ THP_PyFrame_Clear(frame); -+ THP_PyThreadState_PopFrame(tstate, frame); -+ -+#endif -+} -+ - static PyObject* increment_working_threads( - PyThreadState* tstate, - PyObject* module) { -diff --git a/torch/csrc/dynamo/framelocals_mapping.cpp b/torch/csrc/dynamo/framelocals_mapping.cpp -index b839fb26fc91..c4ee36d87767 100644 ---- a/torch/csrc/dynamo/framelocals_mapping.cpp -+++ b/torch/csrc/dynamo/framelocals_mapping.cpp -@@ -26,9 +26,13 @@ FrameLocalsMapping::FrameLocalsMapping(FrameLocalsFrameType* frame) - PyCodeObject* co = F_CODE(frame); - _framelocals.resize(co->co_nlocalsplus, nullptr); - -+#if IS_PYTHON_3_14_PLUS -+ TORCH_CHECK(false, "Python 3.14+ not supported"); -+#else - if (!frame->stacktop) { - return; - } -+#endif - - auto update_framelocals = [&](int i, PyObject* value) { - _PyLocals_Kind kind = _PyLocals_GetKind(co->co_localspluskinds, i); -@@ -53,11 +57,21 @@ FrameLocalsMapping::FrameLocalsMapping(FrameLocalsFrameType* frame) - }; - - auto offset = co->co_nlocalsplus - co->co_nfreevars; -+#if IS_PYTHON_3_14_PLUS -+ TORCH_CHECK(false, "Python 3.14+ not supported"); -+#else - for (int i = 0; i < offset; i++) { - update_framelocals(i, frame->localsplus[i]); - } -+#endif -+ - // Get references to closure variables -+#if IS_PYTHON_3_14_PLUS -+ PyObject* closure; -+ TORCH_CHECK(false, "Python 3.14+ not supported"); -+#else - PyObject* closure = ((PyFunctionObject*)FUNC(frame))->func_closure; -+#endif - for (int i = 0; i < co->co_nfreevars; i++) { - update_framelocals(offset + i, PyTuple_GET_ITEM(closure, i)); - } -diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h -index a1537611cc47..16292e4fd030 100644 ---- a/torch/csrc/utils/python_compat.h -+++ b/torch/csrc/utils/python_compat.h -@@ -13,6 +13,7 @@ extern "C" { - #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 - #define 
IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 - #define IS_PYTHON_3_14_PLUS PY_VERSION_HEX >= 0x030E0000 -+#define IS_PYTHON_3_15_PLUS PY_VERSION_HEX >= 0x030F0000 - - static inline int PyCode_GetNCellvars(PyCodeObject* code) { - // gh-26364 added co_ncellvars to Python 3.11.0rc1 -diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py -index 345ffd2a065b..ceeadde5365b 100644 ---- a/torch/onnx/__init__.py -+++ b/torch/onnx/__init__.py -@@ -104,7 +104,6 @@ ONNXProgram.__module__ = "torch.onnx" - OnnxExporterError.__module__ = "torch.onnx" - _OrtBackend.__module__ = "torch.onnx" - _OrtBackendOptions.__module__ = "torch.onnx" --_OrtExecutionProvider.__module__ = "torch.onnx" - enable_fake_mode.__module__ = "torch.onnx" - is_onnxrt_backend_supported.__module__ = "torch.onnx" - -diff --git a/torch/utils/weak.py b/torch/utils/weak.py -index 8bf2ba5ed02b..9c7218cb2ad3 100644 ---- a/torch/utils/weak.py -+++ b/torch/utils/weak.py -@@ -3,8 +3,6 @@ from __future__ import annotations - - import collections.abc as _collections_abc - import weakref -- --from _weakrefset import _IterationGuard # type: ignore[attr-defined] - from collections.abc import Mapping, MutableMapping - from weakref import ref - -@@ -22,6 +20,33 @@ __all__ = [ - ] - - -+# TODO: make weakref properly thread safe following -+# https://github.com/python/cpython/pull/125325 -+class _IterationGuard: -+ # This context manager registers itself in the current iterators of the -+ # weak container, such as to delay all removals until the context manager -+ # exits. -+ # This technique should be relatively thread-safe (since sets are). 
-+ -+ def __init__(self, weakcontainer): -+ # Don't create cycles -+ self.weakcontainer = ref(weakcontainer) -+ -+ def __enter__(self): -+ w = self.weakcontainer() -+ if w is not None: -+ w._iterating.add(self) -+ return self -+ -+ def __exit__(self, e, t, b): -+ w = self.weakcontainer() -+ if w is not None: -+ s = w._iterating -+ s.remove(self) -+ if not s: -+ w._commit_removals() -+ -+ - # This file defines a variant of WeakKeyDictionary that overrides the hashing - # behavior of the key to use object identity, rather than the builtin - # __eq__/__hash__ functions. This is useful for Tensor weak keys, as their --- -2.49.0 - diff --git a/0001-Improve-finding-and-using-the-rocm_version.h.patch b/0001-Improve-finding-and-using-the-rocm_version.h.patch new file mode 100644 index 0000000..b8232c7 --- /dev/null +++ b/0001-Improve-finding-and-using-the-rocm_version.h.patch @@ -0,0 +1,142 @@ +From 201ac4618a1526e048a0d6c02d9bc4cf30bf0ee1 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Wed, 14 Aug 2024 17:18:38 -0700 +Subject: [PATCH] Improve finding and using the rocm_version.h + +On Fedora, the rocm_version.h's path is /usr/include/rocm_version.h +So we have this build error +pytorch/aten/src/ATen/hip/tunable/Tunable.cpp:40:10: fatal error: + rocm-core/rocm_version.h: No such file or directory + 40 | #include + | ^~~~~~~~~~~~~~~~~~~~~~~~~~ + +In other cases, depending on the rocm release either +/opt/rocm/include or /opt/rocm/include/rocm-core + +Convert the EXISTS() checks into a find_path. 
+Add a -I${ROCM_VERSION_DIR} to the compile options so it can be +found by Tunable.cpp + +Signed-off-by: Tom Rix +--- + aten/src/ATen/cuda/tunable/Tunable.cpp | 2 +- + cmake/Dependencies.cmake | 1 + + cmake/public/LoadHIP.cmake | 72 ++++++++++---------------- + 3 files changed, 30 insertions(+), 45 deletions(-) + +diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp +index 1b7c89875855..32c1d70f3152 100644 +--- a/aten/src/ATen/cuda/tunable/Tunable.cpp ++++ b/aten/src/ATen/cuda/tunable/Tunable.cpp +@@ -36,7 +36,7 @@ + + // for validators + #ifdef USE_ROCM +-#include ++#include + #define ROCBLAS_BETA_FEATURES_API + #include + #include +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 7ef8eabb5162..61bc4d7a54b6 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1063,6 +1063,7 @@ if(USE_ROCM) + list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) + list(APPEND HIP_CXX_FLAGS -std=c++17) + list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) ++ list(APPEND HIP_CXX_FLAGS -I${ROCM_VERSION_DIR}) + if(HIP_NEW_TYPE_ENUMS) + list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) + endif() +diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake +index 1c0d3a203991..6a7e3bd163f5 100644 +--- a/cmake/public/LoadHIP.cmake ++++ b/cmake/public/LoadHIP.cmake +@@ -42,55 +42,39 @@ find_package_and_print_version(HIP 1.0) + + if(HIP_FOUND) + set(PYTORCH_FOUND_HIP TRUE) +- set(FOUND_ROCM_VERSION_H FALSE) +- + set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}") +- set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") + + # Find ROCM version for checks + # ROCM 5.0 and later will have header api for version management +- if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h) +- set(FOUND_ROCM_VERSION_H TRUE) +- file(WRITE ${file} "" +- "#include \n" +- ) +- elseif(EXISTS ${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h) +- set(FOUND_ROCM_VERSION_H TRUE) +- file(WRITE ${file} "" +- "#include \n" 
+- ) +- else() +- message("********************* rocm_version.h couldnt be found ******************\n") +- endif() +- +- if(FOUND_ROCM_VERSION_H) +- file(APPEND ${file} "" +- "#include \n" +- +- "#ifndef ROCM_VERSION_PATCH\n" +- "#define ROCM_VERSION_PATCH 0\n" +- "#endif\n" +- "#define STRINGIFYHELPER(x) #x\n" +- "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" +- "int main() {\n" +- " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" +- " return 0;\n" +- "}\n" +- ) +- +- try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} +- CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}" +- RUN_OUTPUT_VARIABLE rocm_version_from_header +- COMPILE_OUTPUT_VARIABLE output_var +- ) +- # We expect the compile to be successful if the include directory exists. +- if(NOT compile_result) +- message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) +- endif() +- message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) +- set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) +- message("\n***** ROCm version from rocm_version.h ****\n") ++ find_path(ROCM_VERSION_DIR rocm_version.h HINTS ${ROCM_INCLUDE_DIRS} ${ROCM_INCLUDE_DIRS}/rocm-core) ++ set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") ++ file(WRITE ${file} "" ++ "#include \n" ++ "#include \n" ++ ++ "#ifndef ROCM_VERSION_PATCH\n" ++ "#define ROCM_VERSION_PATCH 0\n" ++ "#endif\n" ++ "#define STRINGIFYHELPER(x) #x\n" ++ "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" ++ "int main() {\n" ++ " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" ++ " return 0;\n" ++ "}\n" ++ ) ++ ++ try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} ++ CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_VERSION_DIR}" ++ RUN_OUTPUT_VARIABLE rocm_version_from_header ++ COMPILE_OUTPUT_VARIABLE output_var ++ ) ++ # We expect the compile to be successful if the include directory exists. 
++ if(NOT compile_result) ++ message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) + endif() ++ message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) ++ set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) ++ message("\n***** ROCm version from rocm_version.h ****\n") + + string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+).*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) + +-- +2.46.0 + diff --git a/0001-Optionally-use-hipblaslt.patch b/0001-Optionally-use-hipblaslt.patch new file mode 100644 index 0000000..1e5ca4b --- /dev/null +++ b/0001-Optionally-use-hipblaslt.patch @@ -0,0 +1,506 @@ +From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 29 Jun 2024 04:18:34 -0700 +Subject: [PATCH] Optionally use hipblaslt + +--- + aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ + aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ + aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- + aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- + aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- + cmake/Dependencies.cmake | 3 ++ + cmake/public/LoadHIP.cmake | 2 +- + 7 files changed, 82 insertions(+), 19 deletions(-) + +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index ce991a9bcad4..3f0d17b52778 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -14,7 +14,9 @@ + #include + + #ifdef USE_ROCM ++#ifdef USE_HIPBLASLT + #include ++#endif + // until hipblas has an API to accept flags, we must use rocblas here + #include + #include +@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { + static size_t _parseChosenWorkspaceSize() { + const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); + #ifdef USE_ROCM ++#ifndef USE_HIPBLASLT ++ return 0; ++#endif + if (!val) { + // accept either env var + val = getenv("HIPBLASLT_WORKSPACE_SIZE"); +@@ -235,6 +240,7 @@ namespace at::cuda::blas { + } while (0) + + 
++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + namespace { + // Following the pattern of CuSparseDescriptor + // Defined here for now because this is the only place cublas_lt interface is +@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< + }; + } // namespace + +- + template + inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + cudaDataType_t abcType = CUDA_R_32F; +@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + " scaleType ", + scaleType); + } +- ++#endif + + template + inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { +@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) + template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); + } + } +@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + } + } +@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) + template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +- else { ++ else ++#endif ++ { + 
bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { + } + } + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + // forward to bgemm implementation but set strides and batches to 0 + bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); + } ++#endif + + template + inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { +@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); + } + } +@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); + } + } +@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } + } +@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) 
{ + } + } + +- ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + void gemm_and_bias( + bool transpose_mat1, +@@ -1410,7 +1435,7 @@ void scaled_gemm( + ScalarType result_dtype, + void* amax_ptr, + bool use_fast_accum) { +-#if CUDA_VERSION >= 11080 || defined(USE_ROCM) ++#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + const auto computeType = CUBLAS_COMPUTE_32F; + const auto scaleType = CUDA_R_32F; + const int8_t fastAccuMode = use_fast_accum ? 1 : 0; +@@ -1681,6 +1706,7 @@ void int8_gemm( + " scaleType ", + scaleType); + } ++#endif + + template <> + void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { +diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h +index f2b657ced51b..f0ee613c4208 100644 +--- a/aten/src/ATen/cuda/CUDAContextLight.h ++++ b/aten/src/ATen/cuda/CUDAContextLight.h +@@ -9,7 +9,9 @@ + + // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also + // added bf16 support ++#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) + #include ++#endif + + #ifdef CUDART_VERSION + #include +@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); + /* Handles */ + TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); + TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); ++#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) + TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); ++#endif + + TORCH_CUDA_CPP_API void clearCublasWorkspaces(); + +diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp +index 8eac525b3695..abfdf7a23847 100644 +--- a/aten/src/ATen/cuda/CublasHandlePool.cpp ++++ b/aten/src/ATen/cuda/CublasHandlePool.cpp +@@ -29,7 +29,7 @@ namespace at::cuda { + + namespace { + +-#if defined(USE_ROCM) ++#if defined(USE_ROCM) && defined(USE_HIPBLASLT) + void createCublasLtHandle(cublasLtHandle_t *handle) { + 
TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); + } +@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { + return handle; + } + +-cublasLtHandle_t getCurrentCUDABlasLtHandle() { + #ifdef USE_ROCM ++#if defined(USE_HIPBLASLT) ++cublasLtHandle_t getCurrentCUDABlasLtHandle() { + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + +@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { + + auto handle = myPoolWindow->reserve(device); + return handle; ++} ++#endif + #else ++cublasLtHandle_t getCurrentCUDABlasLtHandle() { + return reinterpret_cast(getCurrentCUDABlasHandle()); +-#endif + } ++#endif + + } // namespace at::cuda +diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h +index 53e6154120c9..fa1d664696db 100644 +--- a/aten/src/ATen/cuda/tunable/TunableGemm.h ++++ b/aten/src/ATen/cuda/tunable/TunableGemm.h +@@ -11,7 +11,9 @@ + + #include + #ifdef USE_ROCM ++#ifdef USE_HIPBLASLT + #include ++#endif + #include + #endif + #include +@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> + } + }; + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + class DefaultScaledGemmOp : public Callable> { + public: +@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { + return OK; + } + }; ++#endif + + template + inline bool IsZero(T v) { +@@ -191,6 +195,7 @@ static void AddRocblasValidator() { + } + } + ++#ifdef USE_HIPBLASLT + static void AddHipblasltValidator() { + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + if (validators.find("HIPBLASLT_VERSION") == validators.end()) { +@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { + [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? 
OK : FAIL; }); + } + } ++#endif + + static void AddRocmValidator() { + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); +@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { + } + AddRocblasValidator(); + } +- ++#ifdef USE_HIPBLASLT + static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { + rocm_validators = true; +@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { + } + AddHipblasltValidator(); + } +- ++#endif + if (rocm_validators) { + AddRocmValidator(); + } +@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + AddRocblasValidator(); + } +- ++#ifdef USE_HIPBLASLT + static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { + rocm_validators = true; +@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + AddHipblasltValidator(); + } +- ++#endif + if (rocm_validators) { + AddRocmValidator(); + } +@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + }; + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + class ScaledGemmTunableOp : public TunableOp, StreamTimer> { + public: +@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + + #if defined(USE_ROCM) ++#ifdef USE_HIPBLASLT + for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + AddHipblasltValidator(); ++#endif + AddRocmValidator(); + #endif + } +@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> + "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); + } + }; ++#endif + + #undef XSTRINGIFY + #undef STRINGIFY 
+diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp +index 84c59a4fd0d7..56ad5de3bf2d 100644 +--- a/aten/src/ATen/native/cuda/Blas.cpp ++++ b/aten/src/ATen/native/cuda/Blas.cpp +@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa + } + + static bool getDisableAddmmCudaLt() { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); + #ifdef USE_ROCM + // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) +@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { + } + return false; + #endif ++#else ++ return true; ++#endif + } + + #ifdef USE_ROCM + static bool isSupportedHipLtROCmArch(int index) { ++#ifdef USE_HIPBLASLT + hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); + std::string device_arch = prop->gcnArchName; + static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; +@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { + } + } + TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); ++#endif + return false; + } + #endif +@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + at::ScalarType scalar_type = self.scalar_type(); + c10::MaybeOwned self_; + if (&result != &self) { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) + // Strangely, if mat2 has only 1 row or column, we get + // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. 
+@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + scalar_type != at::ScalarType::BFloat16)); + #endif + } ++#endif + #endif + if (!useLtInterface) { + self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); + } + self__sizes = self_->sizes(); + } else { +-#if defined(USE_ROCM) ++#if defined(USE_ROCM) && defined(USE_HIPBLASLT) + useLtInterface = !disable_addmm_cuda_lt && + result.dim() == 2 && result.is_contiguous() && + isSupportedHipLtROCmArch(self.device().index()) && +@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); + + if (useLtInterface) { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + #if defined(USE_ROCM) + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, +@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + activation_epilogue + ); + }); ++#endif + #endif + } else + { +@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { + } + + static bool _scaled_mm_allowed_device() { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + auto dprops = at::cuda::getCurrentDeviceProperties(); + #ifdef USE_ROCM + std::string device_arch = dprops->gcnArchName; +@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { + #else + return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); + #endif ++#else ++ return false; ++#endif + } + + // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax +@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + // Check sizes + bool allowed_device = _scaled_mm_allowed_device(); + TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); ++#if 
!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + TORCH_CHECK( +@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 + // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately + amax = at::max(at::abs(out.to(kFloat))); ++#endif + #endif + + return {out, amax}; +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index f1f2eb7cec31..8d05e834bbc5 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1052,6 +1052,9 @@ if(USE_ROCM) + list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) + list(APPEND HIP_CXX_FLAGS -std=c++17) + list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) ++ if(hipblast_FOUND) ++ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) ++ endif() + if(HIP_NEW_TYPE_ENUMS) + list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) + endif() +diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake +index fa39156031ff..df4836847fdf 100644 +--- a/cmake/public/LoadHIP.cmake ++++ b/cmake/public/LoadHIP.cmake +@@ -155,7 +155,7 @@ if(HIP_FOUND) + find_package_and_print_version(hiprand REQUIRED) + find_package_and_print_version(rocblas REQUIRED) + find_package_and_print_version(hipblas REQUIRED) +- find_package_and_print_version(hipblaslt REQUIRED) ++ find_package_and_print_version(hipblaslt) + find_package_and_print_version(miopen REQUIRED) + find_package_and_print_version(hipfft REQUIRED) + find_package_and_print_version(hipsparse REQUIRED) +-- +2.45.2 + diff --git a/0001-Patch-for-sleef-3.6.patch b/0001-Patch-for-sleef-3.6.patch new file mode 100644 index 0000000..13aa208 --- /dev/null +++ b/0001-Patch-for-sleef-3.6.patch @@ -0,0 +1,952 @@ +From 273f23698c887b52e66c2abec8101b7398f0f9c4 Mon Sep 17 00:00:00 2001 +From: "Benjamin A. 
Beasley" +Date: Wed, 5 Jun 2024 11:06:02 -0400 +Subject: [PATCH] Patch for sleef 3.6 + +--- + ...ectorization-on-windows-submodule-sl.patch | 910 ++++++++++++++++++ + python-torch.spec | 11 + + 2 files changed, 921 insertions(+) + create mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch + +diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch +new file mode 100644 +index 000000000000..562f55b742c2 +--- /dev/null ++++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch +@@ -0,0 +1,910 @@ ++From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 ++From: Xu Han ++Date: Sun, 31 Mar 2024 03:07:32 +0000 ++Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] ++ (#118980) ++ ++Enable VEC on Windows OS. ++1. Fix some type defination gap between Windows and Linux. ++2. Fix some operator not support on Windows, such as [], /. ++3. Enable static sleef library build on Windows. ++4. Disable unsupported function overloading on MSVC. ++5. Upgrade submodule sleef lib, which fixed build issue on Windows. ++6. Fixed bazel build issues. ++7. Fix test app not link to sleef on Windows. 
++ ++Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: ++```cmd ++git submodule sync ++git submodule update --init --recursive ++``` ++ ++Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 ++Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet ++--- ++ aten/src/ATen/CMakeLists.txt | 48 ++++++-------- ++ aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- ++ .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- ++ .../cpu/vec/vec256/vec256_complex_double.h | 7 +- ++ .../cpu/vec/vec256/vec256_complex_float.h | 7 +- ++ aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- ++ aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- ++ aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- ++ aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- ++ .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- ++ .../cpu/vec/vec512/vec512_complex_double.h | 7 +- ++ .../cpu/vec/vec512/vec512_complex_float.h | 7 +- ++ aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- ++ aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- ++ aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- ++ aten/src/ATen/cpu/vec/vec_base.h | 6 ++ ++ caffe2/CMakeLists.txt | 2 +- ++ third_party/sleef.BUILD | 3 +- ++ 18 files changed, 194 insertions(+), 93 deletions(-) ++ ++diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt ++index bf425af5fa9..58d5828e8ca 100644 ++--- a/aten/src/ATen/CMakeLists.txt +++++ b/aten/src/ATen/CMakeLists.txt ++@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") ++ list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) ++ endif() ++ ++-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) ++- # Preserve values for the main build ++- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) ++- set(__aten_sleef_build_tests ${BUILD_TESTS}) ++- ++- # Unset our restrictive C++ flags here and reset them later. 
++- # Remove this once we use proper target_compile_options. ++- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) ++- set(CMAKE_CXX_FLAGS) ++- ++- # Bump up optimization level for sleef to -O1, since at -O0 the compiler ++- # excessively spills intermediate vector registers to the stack ++- # and makes things run impossibly slowly ++- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) ++- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") ++- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) ++- else() ++- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") +++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) +++ if(NOT MSVC) +++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler +++ # excessively spills intermediate vector registers to the stack +++ # and makes things run impossibly slowly +++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) +++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") +++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) +++ else() +++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") +++ endif() ++ endif() ++ ++ if(NOT USE_SYSTEM_SLEEF) ++- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) ++- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) ++- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) ++- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) ++- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) +++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) +++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) +++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) +++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) +++ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." 
FORCE) ++ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") ++ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") ++ set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) ++@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) ++ endif() ++ list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) ++ ++- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) ++- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) ++- ++- # Set these back. TODO: Use SLEEF_ to pass these instead ++- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) ++- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) +++ if(NOT MSVC) +++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) +++ endif() ++ endif() ++ ++ if(USE_CUDA AND NOT USE_ROCM) ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h ++index 800b027e469..c431fa3c605 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h ++@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { ++ } ++ ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) ++ } ++ ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++- +++#ifndef _MSC_VER +++// MSVC is not working well on complex function overload. 
++ template ++ std::enable_if_t> ++ inline gather(const double* base_addr, const Vectorized& vindex) { ++@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { ++ return _mm256_i32gather_ps(base_addr, vindex, scale); ++ } ++- +++#endif ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++- +++#ifndef _MSC_VER +++// MSVC is not working well on complex function overload. ++ template ++ std::enable_if_t> ++ inline mask_gather(const Vectorized& src, const double* base_addr, ++@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, ++ const Vectorized& vindex, Vectorized& mask) { ++ return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); ++ } ++- +++#endif ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++ // Only works for inputs in the range: [-2^51, 2^51] ++@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { ++ return flip8(v); ++ } ++ ++-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#endif // (defined(CPU_CAPABILITY_AVX2) ++ ++ }} // namepsace at::vec::CPU_CAPABILITY ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h ++index 3e26213d6d2..66557436c70 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -18,7 +19,18 @@ namespace at::vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++ +++#ifndef SLEEF_CONST +++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +++#define SLEEF_CONST const +++#else +++#define SLEEF_CONST +++#endif +++#define SLEEF_CONST_OLD 
SLEEF_CONST +++#else +++#define SLEEF_CONST_OLD +++#endif ++ ++ // bfloat16 conversion ++ static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { ++@@ -265,7 +277,8 @@ public: ++ } ++ return b; ++ } ++- Vectorized map(const __m256 (*const vop)(__m256)) const { +++ +++ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { ++ __m256 lo, hi; ++ cvt_to_fp32(values, lo, hi); ++ const auto o1 = vop(lo); ++@@ -1026,7 +1039,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V ++ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); ++ CONVERT_VECTORIZED_INIT(Half, half); ++ ++-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#else // defined(CPU_CAPABILITY_AVX2) ++ ++ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ ++ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ ++@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V ++ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); ++ CONVERT_NON_VECTORIZED_INIT(Half, half); ++ ++-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#endif // defined(CPU_CAPABILITY_AVX2) ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ ++ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ ++ auto values = _mm_loadu_si128(reinterpret_cast(data)); \ ++@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec ++ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); ++ LOAD_FP32_VECTORIZED_INIT(Half, fp16); ++ ++-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#else // defined(CPU_CAPABILITY_AVX2) ++ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ ++ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ ++ __at_align__ float values[Vectorized::size()]; \ ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h 
b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h ++index f93ea1e63c3..6c198fb37d3 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h ++@@ -8,7 +8,8 @@ ++ #include ++ #include ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -16,7 +17,7 @@ namespace at::vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ template <> class Vectorized> { ++ private: ++@@ -145,7 +146,7 @@ public: ++ auto abs = abs_(); ++ auto zero = _mm256_setzero_pd(); ++ auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); ++- auto div = values / abs; +++ auto div = _mm256_div_pd(values, abs); ++ return _mm256_blendv_pd(div, zero, mask); ++ } ++ __m256d real_() const { ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h ++index 7c142c04b79..c72d4d49274 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -15,7 +16,7 @@ namespace at::vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ template <> class Vectorized> { ++ private: ++@@ -180,7 +181,7 @@ public: ++ auto abs = abs_(); ++ auto zero = _mm256_setzero_ps(); ++ auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); ++- auto div = values / abs; +++ auto div = _mm256_div_ps(values, abs); ++ return _mm256_blendv_ps(div, zero, mask); ++ } ++ __m256 real_() 
const { ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h ++index bc82d07edd1..bed6da627af 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h ++@@ -6,7 +6,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -15,7 +16,7 @@ namespace at::vec { ++ inline namespace CPU_CAPABILITY { ++ ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ template <> class Vectorized { ++ private: ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h ++index 886809a0b8a..0e3664cd37b 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h ++@@ -6,7 +6,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -14,7 +15,7 @@ namespace at::vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ template <> class Vectorized { ++ private: ++@@ -226,14 +227,14 @@ public: ++ static __m256 vec_factorial_5 = ++ _mm256_set1_ps(0.00828929059f); // 1/factorial(5) ++ static __m256 vec_exp_log2ef = ++- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) +++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) ++ static __m256 vec_half = _mm256_set1_ps(0.5f); ++ static __m256 vec_one = _mm256_set1_ps(1.f); ++ static __m256 vec_zero = _mm256_set1_ps(0.f); ++ static __m256 vec_two = _mm256_set1_ps(2.f); ++- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) ++- static __m256 vec_ln_flt_min = 
(__m256)_mm256_set1_epi32(0xc2aeac50); ++- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); +++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) +++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); +++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); ++ static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); ++ static int n_mantissa_bits = 23; ++ ++@@ -266,7 +267,7 @@ public: ++ auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); ++ auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); ++ vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); ++- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; +++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); ++ vec_two_pow_n = ++ _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); ++ ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h ++index 4128841701a..85e099904cd 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h ++@@ -41,11 +41,17 @@ ++ namespace at::vec { ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ +++#ifdef _MSC_VER +++__declspec(align(64)) struct Vectorizedqi { +++ protected: +++ __m256i vals; +++#else ++ struct Vectorizedqi { ++ protected: ++ __m256i vals __attribute__((aligned(64))); +++#endif ++ ++ public: ++ Vectorizedqi() {} ++@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { ++ } ++ ++ template ++-inline void __attribute__((always_inline)) QuantizeAvx2( +++__FORCE_INLINE void QuantizeAvx2( ++ const float* src, ++ T* dst, ++ int len, ++@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V ++ return a.maximum(b); ++ } ++ ++-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) 
+++#endif // if defined(CPU_CAPABILITY_AVX2) ++ }} // namespace at::vec::CPU_CAPABILITY ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h ++index fe96d123e64..87f723d782c 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h ++@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { ++ } ++ ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) ++ } ++ ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++- +++#ifndef _MSC_VER +++// MSVC is not working well on complex function overload. ++ template ++ std::enable_if_t> ++ inline gather(const double* base_addr, const Vectorized& vindex) { ++@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { ++ return _mm512_i32gather_ps(vindex, base_addr, scale); ++ } ++- +++#endif ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++- +++#ifndef _MSC_VER +++// MSVC is not working well on complex function overload. 
++ template ++ std::enable_if_t> ++ inline mask_gather(const Vectorized& src, const double* base_addr, ++@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, ++ auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); ++ return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); ++ } ++- +++#endif ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++ template<> ++@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { ++ return flip8(v); ++ } ++ ++-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#endif // defined(CPU_CAPABILITY_AVX512) ++ ++ }}} ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h ++index f9fc92d52bf..eb3b6a72240 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -16,7 +17,18 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) +++ +++#ifndef SLEEF_CONST +++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +++#define SLEEF_CONST const +++#else +++#define SLEEF_CONST +++#endif +++#define SLEEF_CONST_OLD SLEEF_CONST +++#else +++#define SLEEF_CONST_OLD +++#endif ++ ++ // bfloat16 conversion ++ static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { ++@@ -362,7 +374,8 @@ public: ++ } ++ #pragma clang diagnostic push ++ #pragma clang diagnostic ignored "-Wignored-qualifiers" ++- Vectorized map(const __m512 (*const vop)(__m512)) const { +++ +++ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { ++ __m512 lo, hi; ++ 
cvt_to_fp32(values, lo, hi); ++ const auto o1 = vop(lo); ++@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V ++ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); ++ CONVERT_VECTORIZED_INIT(Half, half); ++ ++-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#else //defined(CPU_CAPABILITY_AVX512) ++ ++ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ ++ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ ++@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V ++ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); ++ CONVERT_NON_VECTORIZED_INIT(Half, half); ++ ++-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#endif // defined(CPU_CAPABILITY_AVX512) ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ ++ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ ++ auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ ++@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec ++ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); ++ LOAD_FP32_VECTORIZED_INIT(Half, fp16); ++ ++-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#else // defined(CPU_CAPABILITY_AVX512) ++ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ ++ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ ++ __at_align__ float values[Vectorized::size()]; \ ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h ++index 02aa3a87cc1..c35204f9da2 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) 
+++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -16,7 +17,7 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ template <> class Vectorized> { ++ private: ++@@ -203,7 +204,7 @@ public: ++ auto abs = abs_(); ++ auto zero = _mm512_setzero_pd(); ++ auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); ++- auto div = values / abs; +++ auto div = _mm512_div_pd(values, abs); ++ return _mm512_mask_blend_pd(mask, div, zero); ++ } ++ __m512d real_() const { ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h ++index a5d790c98b2..2801e484d94 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -16,7 +17,7 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ template <> class Vectorized> { ++ private: ++@@ -708,7 +709,7 @@ public: ++ auto abs = abs_(); ++ auto zero = _mm512_setzero_ps(); ++ auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); ++- auto div = values / abs; +++ auto div = _mm512_div_ps(values, abs); ++ return _mm512_mask_blend_ps(mask, div, zero); ++ } ++ __m512 real_() const { ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h ++index 27b2753c903..508ab257e60 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h ++@@ -6,7 +6,8 @@ ++ #include ++ #include ++ #include ++-#if 
(defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) +++#if (defined(CPU_CAPABILITY_AVX512)) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -15,7 +16,7 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ template <> class Vectorized { ++ private: ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h ++index ba5738687fd..a08df3c141a 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h ++@@ -6,7 +6,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -15,7 +16,7 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ template <> class Vectorized { ++ private: ++@@ -246,14 +247,14 @@ public: ++ static __m512 vec_factorial_5 = ++ _mm512_set1_ps(0.00828929059f); // 1/factorial(5) ++ static __m512 vec_exp_log2ef = ++- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) +++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) ++ static __m512 vec_half = _mm512_set1_ps(0.5f); ++ static __m512 vec_one = _mm512_set1_ps(1.f); ++ static __m512 vec_zero = _mm512_set1_ps(0.f); ++ static __m512 vec_two = _mm512_set1_ps(2.f); ++- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) ++- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); ++- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); +++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) +++ static __m512 vec_ln_flt_min = 
_mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); +++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); ++ static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); ++ static int n_mantissa_bits = 23; ++ ++@@ -288,7 +289,7 @@ public: ++ auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); ++ auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); ++ vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); ++- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; +++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); ++ vec_two_pow_n = ++ _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); ++ ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h ++index e0713d01312..a5671ed4a50 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h ++@@ -42,11 +42,17 @@ namespace at { ++ namespace vec { ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ +++#ifdef _MSC_VER +++__declspec(align(64)) struct Vectorizedqi { +++ protected: +++ __m512i vals; +++#else ++ struct Vectorizedqi { ++ protected: ++ __m512i vals __attribute__((aligned(64))); +++#endif ++ ++ public: ++ Vectorizedqi() {} ++@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { ++ } ++ ++ template ++-inline void __attribute__((always_inline)) QuantizeAvx512( +++__FORCE_INLINE void QuantizeAvx512( ++ const float* src, ++ T* dst, ++ int len, ++@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { ++ Vectorized scale, ++ Vectorized zero_point, ++ Vectorized scale_neg_zp_premul) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = 
_mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); ++ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); ++@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { ++ float_vec_return_type dequantize( ++ Vectorized scale, ++ Vectorized zero_point) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); ++ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); ++@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { ++ } ++ ++ int_vec_return_type widening_subtract(Vectorized b) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = 
_mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512i int32_val0 = cvtepi8_epi32(int_val0); ++ __m512i int32_val1 = cvtepi8_epi32(int_val1); ++ __m512i int32_val2 = cvtepi8_epi32(int_val2); ++ __m512i int32_val3 = cvtepi8_epi32(int_val3); ++ +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); +++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); +++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); +++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); +++ #else ++ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); ++ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); ++ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); ++ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); +++ #endif ++ ++ __m512i int32_b0 = cvtepi8_epi32(int_b0); ++ __m512i int32_b1 = cvtepi8_epi32(int_b1); ++@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { ++ Vectorized scale, ++ Vectorized zero_point, ++ Vectorized scale_zp_premul) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); ++ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); ++@@ -746,10 +787,17 @@ struct 
Vectorized : public Vectorizedqi { ++ float_vec_return_type dequantize( ++ Vectorized scale, ++ Vectorized zero_point) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); ++ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); ++@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { ++ } ++ ++ int_vec_return_type widening_subtract(Vectorized b) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512i int32_val0 = cvtepu8_epi32(int_val0); ++ __m512i int32_val1 = cvtepu8_epi32(int_val1); ++ __m512i int32_val2 = cvtepu8_epi32(int_val2); ++ __m512i int32_val3 = cvtepu8_epi32(int_val3); ++ +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); +++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], 
b.vals.m512i_u64[2]); +++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); +++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); +++ #else ++ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); ++ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); ++ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); ++ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); +++ #endif ++ ++ __m512i int32_b0 = cvtepu8_epi32(int_b0); ++ __m512i int32_b1 = cvtepu8_epi32(int_b1); ++diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h ++index adf81dd915c..20cb8ef6dbc 100644 ++--- a/aten/src/ATen/cpu/vec/vec_base.h +++++ b/aten/src/ATen/cpu/vec/vec_base.h ++@@ -36,6 +36,12 @@ ++ #include ++ #include ++ +++#if defined(__GNUC__) +++#define __FORCE_INLINE __attribute__((always_inline)) inline +++#elif defined(_MSC_VER) +++#define __FORCE_INLINE __forceinline +++#endif +++ ++ // These macros helped us unify vec_base.h ++ #ifdef CPU_CAPABILITY_AVX512 ++ #if defined(__GNUC__) ++diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt ++index a6b6f0f7d1d..15d37cf4861 100644 ++--- a/caffe2/CMakeLists.txt +++++ b/caffe2/CMakeLists.txt ++@@ -1787,7 +1787,7 @@ if(BUILD_TEST) ++ endif() ++ else() ++ add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") ++- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) +++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) ++ endif() ++ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) ++ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) ++diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD ++index 573f9c5b54a..f22a6e905e2 100644 ++--- a/third_party/sleef.BUILD +++++ b/third_party/sleef.BUILD ++@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ ++ SLEEF_PRIVATE_INCLUDES = [ ++ "-Iexternal/sleef/src/arch", ++ "-Iexternal/sleef/src/common", +++ 
"-Iexternal/sleef/src/libm", ++ ] ++ ++ SLEEF_PUBLIC_INCLUDES = [ ++@@ -201,8 +202,6 @@ cc_library( ++ srcs = [ ++ "src/libm/rempitab.c", ++ "src/libm/sleefdp.c", ++- "src/libm/sleefld.c", ++- "src/libm/sleefqp.c", ++ "src/libm/sleefsp.c", ++ ], ++ hdrs = SLEEF_PUBLIC_HEADERS, ++-- ++2.45.1 ++ +diff --git a/python-torch.spec b/python-torch.spec +index d50687a5174a..63600c2e8c39 100644 +--- a/python-torch.spec ++++ b/python-torch.spec +@@ -176,6 +176,17 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch + Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch + %endif + ++# Enable x86 CPU vectorization on windows [submodule sleef] (#118980) ++# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370 ++# ++# Despite the title, this patch fixes compatibility with sleef 3.6 by including ++# a backwards-compatible version of the fix from ++# https://github.com/pytorch/pytorch/pull/122723. ++# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef ++# git submodule (because the release archive contains an actual sleef source ++# tree instead, so this would not apply.) ++Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch ++ + %if %{with rocm} + # ROCm patches + # https://github.com/pytorch/pytorch/pull/120551 +-- +2.45.1 + diff --git a/0001-Reenable-dim-for-python-3.12.patch b/0001-Reenable-dim-for-python-3.12.patch new file mode 100644 index 0000000..138b5d4 --- /dev/null +++ b/0001-Reenable-dim-for-python-3.12.patch @@ -0,0 +1,115 @@ +From ee3fb343a376cdba6f4ce188cac90023f13e2aea Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Thu, 4 Apr 2024 14:21:38 -0600 +Subject: [PATCH] Reenable dim for python 3.12 + +In 3.12: + +_PyArg_Parser added an element to the start of the structure. +So existing positional initialization is off. Switch to element +initialization. 
+ +_Py_CODEUNIT changed to from an int to a union, but relevant_op +is passed an int for the return of decoder.opcode, so the parameter +type is wrong, switch it to int. + +The opcode PRECALL was removed, so reduce its handling to 3.11 + +Signed-off-by: Tom Rix +--- + functorch/csrc/dim/dim.cpp | 24 +++++------------------- + functorch/csrc/dim/minpybind.h | 4 ++-- + 2 files changed, 7 insertions(+), 21 deletions(-) + +diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp +index 4cc027504c77..e48b0d58081f 100644 +--- a/functorch/csrc/dim/dim.cpp ++++ b/functorch/csrc/dim/dim.cpp +@@ -6,20 +6,6 @@ + + #include + +- +-// Many APIs have changed/don't exist anymore +-#if IS_PYTHON_3_12_PLUS +- +-#include "dim.h" +- +-// Re-enable this some day +-PyObject* Dim_init() { +- PyErr_SetString(PyExc_RuntimeError, "First class dim doesn't work with python 3.12"); +- return nullptr; +-} +- +-#else +- + #include "minpybind.h" + #include + #include +@@ -441,7 +427,7 @@ static PyObject* DimList_bind(DimList *self, + PY_BEGIN + mpy::handle sizes; + static const char * const _keywords[] = {"sizes", nullptr}; +- static _PyArg_Parser parser = {"O", _keywords, 0}; ++ static _PyArg_Parser parser = { .format = "O", .keywords = _keywords}; + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, &sizes)) { + return nullptr; + } +@@ -465,7 +451,7 @@ static PyObject* DimList_bind_len(DimList *self, + PY_BEGIN + int size; + static const char * const _keywords[] = {"N", nullptr}; +- static _PyArg_Parser parser = {"i", _keywords, 0}; ++ static _PyArg_Parser parser = { .format = "i", .keywords = _keywords}; + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, &size)) { + return nullptr; + } +@@ -1468,7 +1454,7 @@ PyTypeObject Tensor::Type = { + + // dim() -------------------- + +-static bool relevant_op(_Py_CODEUNIT c) { ++static bool relevant_op(int c) { + switch(c) { + case STORE_NAME: + case STORE_GLOBAL: +@@ -1587,7 +1573,7 @@ static PyObject* 
_dims(PyObject *self, + auto c = mpy::obj::steal(PyFrame_GetCode(f.ptr())); + auto lasti = PyFrame_GetLasti(f.ptr()); + auto decoder = PyInstDecoder(c.ptr(), lasti); +- #if IS_PYTHON_3_11_PLUS ++ #if IS_PYTHON_3_11 + // When py3.11 adapts bytecode lasti points to the precall + // rather than the call instruction after it + if (decoder.opcode() == PRECALL) { +@@ -3268,4 +3254,4 @@ PyObject* Dim_init() { + } + } + +-#endif ++ +diff --git a/functorch/csrc/dim/minpybind.h b/functorch/csrc/dim/minpybind.h +index de82b5af95a4..d76d4828bf80 100644 +--- a/functorch/csrc/dim/minpybind.h ++++ b/functorch/csrc/dim/minpybind.h +@@ -621,7 +621,7 @@ struct vector_args { + PyObject *dummy = NULL; + _PyArg_ParseStackAndKeywords((PyObject*const*)args, nargs, kwnames.ptr(), _parser, &dummy, &dummy, &dummy, &dummy, &dummy); + #else +- _PyArg_Parser* _parser = new _PyArg_Parser{NULL, &names_buf[0], fname_cstr, 0}; ++ _PyArg_Parser* _parser = new _PyArg_Parser{ .keywords = &names_buf[0], .fname = fname_cstr}; + std::unique_ptr buf(new PyObject*[names.size()]); + _PyArg_UnpackKeywords((PyObject*const*)args, nargs, NULL, kwnames.ptr(), _parser, required, (Py_ssize_t)values.size() - kwonly, 0, &buf[0]); + #endif +@@ -706,7 +706,7 @@ inline object handle::call_vector(vector_args args) { + #define MPY_PARSE_ARGS_KWNAMES(fmt, FORALL_ARGS) \ + static const char * const kwlist[] = { FORALL_ARGS(MPY_ARGS_NAME) nullptr}; \ + FORALL_ARGS(MPY_ARGS_DECLARE) \ +- static _PyArg_Parser parser = {fmt, kwlist, 0}; \ ++ static _PyArg_Parser parser = { .format = fmt, .keywords = kwlist}; \ + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, FORALL_ARGS(MPY_ARGS_POINTER) nullptr)) { \ + throw mpy::exception_set(); \ + } +-- +2.44.0 + diff --git a/0001-Regenerate-flatbuffer-header.patch b/0001-Regenerate-flatbuffer-header.patch new file mode 100644 index 0000000..4eec491 --- /dev/null +++ b/0001-Regenerate-flatbuffer-header.patch @@ -0,0 +1,39 @@ +From 5b8e51b24513fa851eeff42f23d942bde301e321 
Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 29 Sep 2023 06:19:29 -0700 +Subject: [PATCH] Regenerate flatbuffer header + +For this error +torch/csrc/jit/serialization/mobile_bytecode_generated.h:12:41: +error: static assertion failed: Non-compatible flatbuffers version included + 12 | FLATBUFFERS_VERSION_MINOR == 3 && + +PyTorch is expecting 23.3.3, what f38 has +Rawhide is at 23.5.26 + +Regenerate with +flatc --cpp --gen-mutable --no-prefix --scoped-enums mobile_bytecode.fbs + +Signed-off-by: Tom Rix +--- + torch/csrc/jit/serialization/mobile_bytecode_generated.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/torch/csrc/jit/serialization/mobile_bytecode_generated.h b/torch/csrc/jit/serialization/mobile_bytecode_generated.h +index cffe8bc7a6..83575e4c19 100644 +--- a/torch/csrc/jit/serialization/mobile_bytecode_generated.h ++++ b/torch/csrc/jit/serialization/mobile_bytecode_generated.h +@@ -9,8 +9,8 @@ + // Ensure the included flatbuffers.h is the same version as when this file was + // generated, otherwise it may not be compatible. + static_assert(FLATBUFFERS_VERSION_MAJOR == 23 && +- FLATBUFFERS_VERSION_MINOR == 3 && +- FLATBUFFERS_VERSION_REVISION == 3, ++ FLATBUFFERS_VERSION_MINOR == 5 && ++ FLATBUFFERS_VERSION_REVISION == 26, + "Non-compatible flatbuffers version included"); + + namespace torch { +-- +2.43.0 + diff --git a/0001-Stub-in-kineto-ActivityType.patch b/0001-Stub-in-kineto-ActivityType.patch new file mode 100644 index 0000000..f088645 --- /dev/null +++ b/0001-Stub-in-kineto-ActivityType.patch @@ -0,0 +1,73 @@ +From 3ef82b814179da571b2478f61d4279717ab0b23a Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 29 Sep 2023 06:25:23 -0700 +Subject: [PATCH] Stub in kineto ActivityType + +There is an error with kineto is not used, the shim still +requires the ActivityTYpe.h header to get the enum Activity type. +So cut-n-paste just enough of the header in to do this. 
+ +Signed-off-by: Tom Rix +--- + torch/csrc/profiler/kineto_shim.h | 44 +++++++++++++++++++++++++++++++ + 1 file changed, 44 insertions(+) + +diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h +index e92cbf003d..68985ab7d0 100644 +--- a/torch/csrc/profiler/kineto_shim.h ++++ b/torch/csrc/profiler/kineto_shim.h +@@ -12,7 +12,51 @@ + #undef USE_KINETO + #endif + ++#ifdef USE_KINETO + #include ++#else ++namespace libkineto { ++// copied from header ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under the BSD-style license found in the ++ * LICENSE file in the root directory of this source tree. ++ */ ++ ++// Note : All activity types are not enabled by default. Please add them ++// at correct position in the enum ++enum class ActivityType { ++ // Activity types enabled by default ++ CPU_OP = 0, // cpu side ops ++ USER_ANNOTATION, ++ GPU_USER_ANNOTATION, ++ GPU_MEMCPY, ++ GPU_MEMSET, ++ CONCURRENT_KERNEL, // on-device kernels ++ EXTERNAL_CORRELATION, ++ CUDA_RUNTIME, // host side cuda runtime events ++ CUDA_DRIVER, // host side cuda driver events ++ CPU_INSTANT_EVENT, // host side point-like events ++ PYTHON_FUNCTION, ++ OVERHEAD, // CUPTI induced overhead events sampled from its overhead API. ++ ++ // Optional Activity types ++ CUDA_SYNC, // synchronization events between runtime and kernels ++ GLOW_RUNTIME, // host side glow runtime events ++ MTIA_RUNTIME, // host side MTIA runtime events ++ CUDA_PROFILER_RANGE, // CUPTI Profiler range for performance metrics ++ MTIA_CCP_EVENTS, // MTIA ondevice CCP events ++ HPU_OP, // HPU host side runtime event ++ XPU_RUNTIME, // host side xpu runtime events ++ ++ ENUM_COUNT, // This is to add buffer and not used for any profiling logic. Add your new type before it. 
++ OPTIONAL_ACTIVITY_TYPE_START = CUDA_SYNC, ++}; ++} ++ ++#endif + + #include + #include +-- +2.43.0 + diff --git a/0001-can-not-use-with-c-files.patch b/0001-can-not-use-with-c-files.patch new file mode 100644 index 0000000..719737c --- /dev/null +++ b/0001-can-not-use-with-c-files.patch @@ -0,0 +1,25 @@ +From a5dff521691a17701b5a02ec75e84cfe1bf605f7 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 3 Feb 2024 06:41:49 -0500 +Subject: [PATCH] can not use with c files + +--- + cmake/Dependencies.cmake | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 4dd8042058..5f91f3ffab 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1269,7 +1269,7 @@ if(USE_ROCM) + list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier) + list(APPEND HIP_CXX_FLAGS -DCAFFE2_USE_MIOPEN) + list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) +- list(APPEND HIP_CXX_FLAGS -std=c++17) ++# list(APPEND HIP_CXX_FLAGS -std=c++17) + if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "6.0.0") + list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) + endif() +-- +2.43.0 + diff --git a/0001-cuda-hip-signatures.patch b/0001-cuda-hip-signatures.patch new file mode 100644 index 0000000..a258737 --- /dev/null +++ b/0001-cuda-hip-signatures.patch @@ -0,0 +1,42 @@ +From 214dc959acc809e1959643272c344ee5335d5a69 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Thu, 1 Feb 2024 11:29:47 -0500 +Subject: [PATCH] cuda - hip signatures + +--- + aten/src/ATen/cuda/detail/LazyNVRTC.cpp | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp +index 1b85e7776e..bb6f88783a 100644 +--- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp ++++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp +@@ -134,8 +134,13 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, + const char *src, + const char *name, + int numHeaders, ++#if !defined(USE_ROCM) + const 
char * const *headers, + const char * const *includeNames) { ++#else ++ const char **headers, ++ const char **includeNames) { ++#endif + auto fn = reinterpret_cast(getNVRTCLibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get nvrtcCreateProgram"); +@@ -150,7 +155,11 @@ NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); + NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *); + NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *); + #endif ++#if !defined(USE_ROCM) + NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *); ++#else ++NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char **); ++#endif + _STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult); + NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*); + NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *); +-- +2.43.0 + diff --git a/0001-disable-use-of-aotriton.patch b/0001-disable-use-of-aotriton.patch new file mode 100644 index 0000000..61ffd1e --- /dev/null +++ b/0001-disable-use-of-aotriton.patch @@ -0,0 +1,94 @@ +From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 29 Jun 2024 07:06:09 -0700 +Subject: [PATCH] disable use of aotriton + +--- + .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) + +diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +index 214b02d8262e..7b3eb9dcd8cd 100644 +--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp ++++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +@@ -19,9 +19,12 @@ + #include + #include + ++#ifdef USE_FLASH_ATTENTION + #if USE_ROCM + #include + #endif ++#endif ++ + + /** + * Note [SDPA Runtime Dispatch] +@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { + + bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { + // Check that the gpu is capable of running 
flash attention ++#ifndef USE_FLASH_ATTENTION ++ return false; ++#else + using sm80 = SMVersion<8, 0>; + using sm90 = SMVersion<9, 0>; + #if USE_ROCM +@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug + } + #endif + return true; ++#endif + } + + bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { ++#ifndef USE_FLASH_ATTENTION ++ return false; ++#else + // Mem Efficient attention supports hardware in the range [sm_50, sm_90] + using sm50 = SMVersion<5, 0>; + using sm90 = SMVersion<9, 0>; +@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) + } + #endif + return true; ++#endif + } + + bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( +@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { + #ifndef USE_FLASH_ATTENTION + TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); + return false; +-#endif ++#else + + // Define gate functions that determine if a flash kernel can be ran + // Replace with std::to_array when we migrate to c++20 +@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { + } + } + return true; ++#endif + } + + bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { + #ifndef USE_MEM_EFF_ATTENTION + TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); + return false; +-#endif ++#else + // Constraints specific to mem efficient attention + constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = + array_of(at::kHalf, at::kFloat, at::kBFloat16); +@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { + } + #endif + return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); ++#endif + } + + SDPBackend select_sdp_backend(sdp_params const& kernel_params) { +-- +2.45.2 + diff --git 
a/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch b/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch new file mode 100644 index 0000000..0ce5b1f --- /dev/null +++ b/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch @@ -0,0 +1,226 @@ +From b9d45eb1cc90696a4de76676221219e24423c709 Mon Sep 17 00:00:00 2001 +From: William Wen +Date: Wed, 3 Apr 2024 17:58:46 -0700 +Subject: [PATCH] [dynamo, 3.12] enable dynamo on 3.12, enable most dynamo + unittests on 3.12 (#123216) + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/123216 +Approved by: https://github.com/jansel, https://github.com/malfet +--- + test/dynamo/test_autograd_function.py | 3 ++ + test/dynamo/test_misc.py | 63 +++++++++++++++++++++++++ + test/functorch/test_eager_transforms.py | 7 ++- + test/run_test.py | 3 -- + torch/__init__.py | 5 +- + torch/_dynamo/eval_frame.py | 4 +- + torch/_dynamo/test_case.py | 8 +--- + 7 files changed, 74 insertions(+), 19 deletions(-) + +diff --git a/test/dynamo/test_autograd_function.py b/test/dynamo/test_autograd_function.py +index d23fec607afa..bc5ebc767038 100644 +--- a/test/dynamo/test_autograd_function.py ++++ b/test/dynamo/test_autograd_function.py +@@ -2,6 +2,8 @@ + + import copy + import math ++import sys ++import unittest + + import torch + +@@ -528,6 +530,7 @@ class AutogradFunctionTests(torch._dynamo.test_case.TestCase): + # I pulled all of these test cases from test_autograd.py + # In the future, we should make the Dynamo test suite actually + # run on test_autograd.py (it's disabled right now) and delete these. 
++ @unittest.skipIf(sys.version_info >= (3, 12), "invalid free in 3.12+") + def test_smoke_from_test_autograd(self): + class Func(torch.autograd.Function): + @staticmethod +diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py +index a73de8b1c7e9..8f54e0564e6b 100644 +--- a/test/dynamo/test_misc.py ++++ b/test/dynamo/test_misc.py +@@ -9760,6 +9760,69 @@ fn + lambda mod: mod, + ) + ++ @xfailIfPy311 ++ def test_outside_linear_module_free(self): ++ # Compared to test_linear_module_free, the linear ++ # layer is not the code object that is directly compiled. ++ def model_inp_ctr(): ++ fc = torch.nn.Linear(100, 100) ++ ++ class Mod(torch.nn.Module): ++ def __init__(self): ++ super().__init__() ++ self.fc_ref = fc ++ ++ def forward(self, x): ++ return fc(x[0]) ++ ++ # return fc to keep it alive in _test_compile_model_free ++ return Mod(), (torch.randn(100, 100), fc) ++ ++ self._test_compile_model_free(model_inp_ctr, lambda mod: mod.fc_ref) ++ ++ @unittest.skipIf(sys.version_info >= (3, 12), "leaks in 3.12+") ++ def test_parameter_free(self): ++ def model_inp_ctr(): ++ param = torch.nn.Parameter(torch.randn(100, 100)) ++ ++ class Mod(torch.nn.Module): ++ def __init__(self): ++ super().__init__() ++ self.param = param ++ ++ def forward(self, x): ++ return self.param * x[0] ++ ++ # return param to keep it alive in _test_compile_model_free ++ return Mod(), (torch.randn(100, 100), param) ++ ++ self._test_compile_model_free(model_inp_ctr, lambda mod: mod.param) ++ ++ def test_raises_importerror1(self): ++ @torch.compile(backend="eager") ++ def fn(x): ++ try: ++ import some_module_that_surely_does_not_exist ++ ++ return ++ except ImportError: ++ pass ++ return x.sin() ++ ++ x = torch.randn(8) ++ self.assertEqual(fn(x), x.sin()) ++ ++ def test_raises_importerror2(self): ++ @torch.compile(backend="eager") ++ def fn(x): ++ import some_module_that_surely_does_not_exist ++ ++ return x + 1 ++ ++ x = torch.randn(8) ++ with self.assertRaises(ImportError): ++ fn(x) ++ + def 
test_dynamo_cache_move_to_front(self): + class Mod(torch.nn.Module): + def __init__(self): +diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py +index 09415cf8f48e..60790ec06059 100644 +--- a/test/functorch/test_eager_transforms.py ++++ b/test/functorch/test_eager_transforms.py +@@ -4762,8 +4762,7 @@ class TestCompileTransforms(TestCase): + # Triton only supports GPU with SM70 or later. + @expectedFailureIf((IS_ARM64 and not IS_MACOS) or + IS_WINDOWS or +- (TEST_CUDA and not SM70OrLater) or +- (sys.version_info >= (3, 12))) ++ (TEST_CUDA and not SM70OrLater)) + def test_compile_vmap_hessian(self, device): + # The model and inputs are a smaller version + # of code at benchmark repo: +@@ -4792,8 +4791,8 @@ class TestCompileTransforms(TestCase): + actual = opt_fn(params_and_buffers, x) + self.assertEqual(actual, expected) + +- # torch.compile is not supported on Windows or on Python 3.12+ +- @expectedFailureIf(IS_WINDOWS or (sys.version_info >= (3, 12))) ++ # torch.compile is not supported on Windows ++ @expectedFailureIf(IS_WINDOWS) + @torch._dynamo.config.patch(suppress_errors=False) + @torch._dynamo.config.patch(capture_func_transforms=True) + @skipIfTorchDynamo("Do not test torch.compile on top of torch.compile") +diff --git a/test/run_test.py b/test/run_test.py +index e86af9623042..ebb14df4167d 100755 +--- a/test/run_test.py ++++ b/test/run_test.py +@@ -74,7 +74,6 @@ sys.path.remove(str(REPO_ROOT)) + RERUN_DISABLED_TESTS = os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1" + DISTRIBUTED_TEST_PREFIX = "distributed" + INDUCTOR_TEST_PREFIX = "inductor" +-DYNAMO_TEST_PREFIX = "dynamo" + + + # Note [ROCm parallel CI testing] +@@ -324,7 +323,6 @@ JIT_EXECUTOR_TESTS = [ + ] + + INDUCTOR_TESTS = [test for test in TESTS if test.startswith(INDUCTOR_TEST_PREFIX)] +-DYNAMO_TESTS = [test for test in TESTS if test.startswith(DYNAMO_TEST_PREFIX)] + DISTRIBUTED_TESTS = [test for test in TESTS if 
test.startswith(DISTRIBUTED_TEST_PREFIX)] + TORCH_EXPORT_TESTS = [test for test in TESTS if test.startswith("export")] + FUNCTORCH_TESTS = [test for test in TESTS if test.startswith("functorch")] +@@ -1361,7 +1359,6 @@ def get_selected_tests(options) -> List[str]: + # these tests failing in Python 3.12 temporarily disabling + if sys.version_info >= (3, 12): + options.exclude.extend(INDUCTOR_TESTS) +- options.exclude.extend(DYNAMO_TESTS) + options.exclude.extend( + [ + "functorch/test_dims", +diff --git a/torch/__init__.py b/torch/__init__.py +index d381712b4a35..26cdffe81d29 100644 +--- a/torch/__init__.py ++++ b/torch/__init__.py +@@ -1861,9 +1861,8 @@ def compile(model: Optional[Callable] = None, *, + + """ + _C._log_api_usage_once("torch.compile") +- # Temporary until we get proper support for python 3.12 +- if sys.version_info >= (3, 12): +- raise RuntimeError("Dynamo is not supported on Python 3.12+") ++ if sys.version_info >= (3, 13): ++ raise RuntimeError("Dynamo is not supported on Python 3.13+") + + # Decorator mode + if model is None: +diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py +index 53ab0df3a947..0a80eeea99ed 100644 +--- a/torch/_dynamo/eval_frame.py ++++ b/torch/_dynamo/eval_frame.py +@@ -589,8 +589,8 @@ class _NullDecorator(contextlib.nullcontext): # type: ignore[type-arg] + + + def check_if_dynamo_supported(): +- if sys.version_info >= (3, 12): +- raise RuntimeError("Python 3.12+ not yet supported for torch.compile") ++ if sys.version_info >= (3, 13): ++ raise RuntimeError("Python 3.13+ not yet supported for torch.compile") + + + def is_dynamo_supported(): +diff --git a/torch/_dynamo/test_case.py b/torch/_dynamo/test_case.py +index e3cbef09eaae..297ea6e2bc2a 100644 +--- a/torch/_dynamo/test_case.py ++++ b/torch/_dynamo/test_case.py +@@ -1,7 +1,6 @@ + import contextlib + import importlib + import logging +-import sys + + import torch + import torch.testing +@@ -20,12 +19,7 @@ log = logging.getLogger(__name__) + def 
run_tests(needs=()): + from torch.testing._internal.common_utils import run_tests + +- if ( +- TEST_WITH_TORCHDYNAMO +- or IS_WINDOWS +- or TEST_WITH_CROSSREF +- or sys.version_info >= (3, 12) +- ): ++ if TEST_WITH_TORCHDYNAMO or IS_WINDOWS or TEST_WITH_CROSSREF: + return # skip testing + + if isinstance(needs, str): +-- +2.44.0 + diff --git a/0001-include-fmt-ranges.h-for-using-fmt-join.patch b/0001-include-fmt-ranges.h-for-using-fmt-join.patch new file mode 100644 index 0000000..f7f6c7d --- /dev/null +++ b/0001-include-fmt-ranges.h-for-using-fmt-join.patch @@ -0,0 +1,54 @@ +From ba2cf11d1bf1dd5086c8e793198a697d4179cca7 Mon Sep 17 00:00:00 2001 +From: Kefu Chai +Date: Tue, 16 Jul 2024 08:00:22 +0800 +Subject: [PATCH] include fmt/ranges.h for using fmt::join() + +fmt::join() was moved into fmt/ranges.h in fmt 11, so include this +header for using it. + +Signed-off-by: Kefu Chai +--- + torch/csrc/distributed/c10d/socket.cpp | 1 + + torch/csrc/profiler/standalone/execution_trace_observer.cpp | 1 + + torch/csrc/profiler/util.cpp | 1 + + 3 files changed, 3 insertions(+) + +diff --git a/torch/csrc/distributed/c10d/socket.cpp b/torch/csrc/distributed/c10d/socket.cpp +index 5013f2540..cbcd33a19 100644 +--- a/torch/csrc/distributed/c10d/socket.cpp ++++ b/torch/csrc/distributed/c10d/socket.cpp +@@ -31,6 +31,7 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wdeprecated") + #include + C10_DIAGNOSTIC_POP() + #include ++#include + + #include + #include +diff --git a/torch/csrc/profiler/standalone/execution_trace_observer.cpp b/torch/csrc/profiler/standalone/execution_trace_observer.cpp +index 2ef2e5423..fb053e916 100644 +--- a/torch/csrc/profiler/standalone/execution_trace_observer.cpp ++++ b/torch/csrc/profiler/standalone/execution_trace_observer.cpp +@@ -10,6 +10,7 @@ + #endif // _WIN32 + + #include ++#include + #include + #include + #include +diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp +index 896bf606c..c229ce130 100644 +--- 
a/torch/csrc/profiler/util.cpp ++++ b/torch/csrc/profiler/util.cpp +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + #ifdef USE_KINETO + #include +-- +2.45.2 + diff --git a/0001-no-third_party-FXdiv.patch b/0001-no-third_party-FXdiv.patch new file mode 100644 index 0000000..71404e3 --- /dev/null +++ b/0001-no-third_party-FXdiv.patch @@ -0,0 +1,54 @@ +From b3b307add5724ee5730f161e16594fa702f34a19 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 3 Feb 2024 08:20:28 -0500 +Subject: [PATCH] no third_party FXdiv + +--- + caffe2/CMakeLists.txt | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt +index b2f3adbfae..80a5625c8d 100644 +--- a/caffe2/CMakeLists.txt ++++ b/caffe2/CMakeLists.txt +@@ -110,15 +110,15 @@ endif() + # Note: the folders that are being commented out have not been properly + # addressed yet. + +-if(NOT MSVC AND USE_XNNPACK) +- if(NOT TARGET fxdiv) +- set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") +- set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") +- add_subdirectory( +- "${FXDIV_SOURCE_DIR}" +- "${CMAKE_BINARY_DIR}/FXdiv") +- endif() +-endif() ++#if(NOT MSVC AND USE_XNNPACK) ++# if(NOT TARGET fxdiv) ++# set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") ++# set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") ++# add_subdirectory( ++# "${FXDIV_SOURCE_DIR}" ++# "${CMAKE_BINARY_DIR}/FXdiv") ++# endif() ++#endif() + + add_subdirectory(core) + add_subdirectory(serialize) +@@ -1081,9 +1081,9 @@ if(USE_XPU) + target_compile_definitions(torch_xpu PRIVATE USE_XPU) + endif() + +-if(NOT MSVC AND USE_XNNPACK) +- TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) +-endif() ++#if(NOT MSVC AND USE_XNNPACK) ++# TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) ++#endif() + + # ========================================================== + # formerly-libtorch flags +-- +2.43.0 + diff --git a/0001-no-third_party-fmt.patch b/0001-no-third_party-fmt.patch new file mode 100644 index 
0000000..6e82af2 --- /dev/null +++ b/0001-no-third_party-fmt.patch @@ -0,0 +1,65 @@ +From 2ce255b75760a0a513fb1706629b416f76a5c822 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 3 Feb 2024 08:16:04 -0500 +Subject: [PATCH] no third_party fmt + +--- + c10/CMakeLists.txt | 2 +- + cmake/Dependencies.cmake | 6 +++--- + torch/CMakeLists.txt | 2 +- + 3 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt +index 1f742f4c176..4fa08913bdd 100644 +--- a/c10/CMakeLists.txt ++++ b/c10/CMakeLists.txt +@@ -87,7 +87,7 @@ endif() + if(C10_USE_GLOG) + target_link_libraries(c10 PUBLIC glog::glog) + endif() +-target_link_libraries(c10 PRIVATE fmt::fmt-header-only) ++target_link_libraries(c10 PRIVATE fmt) + + if(C10_USE_NUMA) + message(STATUS "NUMA paths:") +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 6f5a2d5feff..42fbf80f6e8 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1837,7 +1837,7 @@ endif() + # + set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) + set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE) +-add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) ++# add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) + + # Disable compiler feature checks for `fmt`. + # +@@ -1846,9 +1846,9 @@ add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) + # CMAKE_CXX_FLAGS in ways that break feature checks. Since we already know + # `fmt` is compatible with a superset of the compilers that PyTorch is, it + # shouldn't be too bad to just disable the checks. 
+-set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") ++# set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") + +-list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) ++# list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) + set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) + + # ---[ Kineto +diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt +index 97a72eed55b..9e5014d1980 100644 +--- a/torch/CMakeLists.txt ++++ b/torch/CMakeLists.txt +@@ -80,7 +80,7 @@ set(TORCH_PYTHON_LINK_LIBRARIES + python::python + pybind::pybind11 + shm +- fmt::fmt-header-only ++ fmt + ATEN_CPU_FILES_GEN_LIB) + + if(USE_ASAN AND TARGET Sanitizer::address) +-- +2.43.2 + diff --git a/0001-no-third_party-foxi.patch b/0001-no-third_party-foxi.patch new file mode 100644 index 0000000..ba1ec40 --- /dev/null +++ b/0001-no-third_party-foxi.patch @@ -0,0 +1,36 @@ +From 8cb61cf9282102ac225645fcc9fb4a1bb7cb15a2 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 3 Feb 2024 08:11:55 -0500 +Subject: [PATCH] no third_party foxi + +--- + cmake/Dependencies.cmake | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 5f91f3ffab..8e1461af81 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1567,7 +1567,7 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) + set_target_properties(onnx_proto PROPERTIES CXX_STANDARD 17) + endif() + endif() +- add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL) ++ # add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL) + + add_definitions(-DONNX_NAMESPACE=${ONNX_NAMESPACE}) + if(NOT USE_SYSTEM_ONNX) +@@ -1600,8 +1600,8 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) + message("-- Found onnx: ${ONNX_LIBRARY} ${ONNX_PROTO_LIBRARY}") + list(APPEND Caffe2_DEPENDENCY_LIBS 
onnx_proto onnx) + endif() +- include_directories(${FOXI_INCLUDE_DIRS}) +- list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) ++# include_directories(${FOXI_INCLUDE_DIRS}) ++# list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) + # Recover the build shared libs option. + set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) + endif() +-- +2.43.0 + diff --git a/0001-reenable-foxi-linking.patch b/0001-reenable-foxi-linking.patch new file mode 100644 index 0000000..8e39795 --- /dev/null +++ b/0001-reenable-foxi-linking.patch @@ -0,0 +1,25 @@ +From 58ccda271e8f51c3fa5b7518cf6ee52ce204fd37 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Thu, 22 Feb 2024 09:28:11 -0500 +Subject: [PATCH] reenable foxi linking + +--- + cmake/Dependencies.cmake | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 42fbf80f6e8..bc3a2dc6fee 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1604,7 +1604,7 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) + list(APPEND Caffe2_DEPENDENCY_LIBS onnx_proto onnx) + endif() + # include_directories(${FOXI_INCLUDE_DIRS}) +-# list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) ++ list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) + # Recover the build shared libs option. 
+ set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) + endif() +-- +2.43.2 + diff --git a/0001-silence-an-assert.patch b/0001-silence-an-assert.patch new file mode 100644 index 0000000..0b20dcf --- /dev/null +++ b/0001-silence-an-assert.patch @@ -0,0 +1,25 @@ +From 04dd33db93b852fdfd7ea408813080b2e2026650 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 3 Feb 2024 06:41:20 -0500 +Subject: [PATCH] silence an assert + +--- + aten/src/ATen/native/cuda/IndexKernel.cu | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu +index 657c0c77b3..b406aa6687 100644 +--- a/aten/src/ATen/native/cuda/IndexKernel.cu ++++ b/aten/src/ATen/native/cuda/IndexKernel.cu +@@ -249,7 +249,7 @@ void index_put_kernel_quantized_cuda(TensorIterator& iter, const IntArrayRef ind + + gpu_index_kernel(iter, index_size, index_stride, [inv_scale, zero_point, qmin, qmax]C10_DEVICE(char* const out_data, const char* const in_data, const int64_t offset) { + int64_t qvalue = static_cast(zero_point + nearbyintf(*(float*)in_data * inv_scale)); +- qvalue = std::clamp(qvalue, qmin, qmax); ++ //qvalue = std::clamp(qvalue, qmin, qmax); + *(scalar_t*)(out_data + offset) = static_cast(qvalue); + }); + }); +-- +2.43.0 + diff --git a/0001-torch-paper-over-c-assert.patch b/0001-torch-paper-over-c-assert.patch new file mode 100644 index 0000000..b7e55ce --- /dev/null +++ b/0001-torch-paper-over-c-assert.patch @@ -0,0 +1,88 @@ +From f646e0f04ae591c8f2d8a0cd24b035725c57659b Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Thu, 23 Jan 2025 08:24:22 -0800 +Subject: [PATCH] torch: paper over c++ assert + +--- + aten/src/ATen/native/sparse/FlattenIndicesCommon.h | 2 ++ + .../ATen/native/sparse/SparseBinaryOpIntersectionCommon.h | 5 +++++ + .../src/ATen/native/sparse/ValidateCompressedIndicesCommon.h | 2 ++ + 3 files changed, 9 insertions(+) + +diff --git a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h 
b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h +index 0e79ed809ae6..a3cec8aaf78b 100644 +--- a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h ++++ b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h +@@ -69,11 +69,13 @@ Tensor _flatten_indices_impl(const Tensor& indices, IntArrayRef size) { + [=] FUNCAPI (int64_t nnz_idx) -> int64_t { + const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; + auto hash = static_cast(0); ++#if 0 + for (int64_t dim = 0; dim < sparse_dim; ++dim) { + const auto dim_hash_coeff = hash_coeffs[dim]; + const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; + hash += dim_index * dim_hash_coeff; + } ++#endif + return hash; + }); + } +diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h +index c0b94bf39d54..8de4900b7a01 100644 +--- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h ++++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h +@@ -279,12 +279,15 @@ void _sparse_binary_op_intersection_kernel_impl( + if (!ptr_indices) { + return hash; + } ++#if 0 ++// /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/array:219:2: error: reference to __host__ function '__glibcxx_assert_fail' in __host__ __device__ function + const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; + for (int64_t dim = 0; dim < sparse_dim; ++dim) { + const auto dim_hash_coeff = hash_coeffs[dim]; + const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; + hash += dim_index * dim_hash_coeff; + } ++#endif + return hash; + }); + } +@@ -364,6 +367,7 @@ void _sparse_binary_op_intersection_kernel_impl( + if (hash_ptr) { + hash = hash_ptr[nnz_idx]; + } else if (sparse_dim) { ++#if 0 + // Compute hash value + const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; + for (int64_t dim = 0; dim < sparse_dim; ++dim) { +@@ -371,6 +375,7 @@ void 
_sparse_binary_op_intersection_kernel_impl( + const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; + hash += dim_index * dim_hash_coeff; + } ++#endif + } + + // Perform hash values intersection +diff --git a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h +index ec4c084a39cc..9bc9655b0afa 100644 +--- a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h ++++ b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h +@@ -341,6 +341,7 @@ void _validate_compressed_sparse_indices_kernel( + // assuming idx contiguity per batch: + int64_t tmp = batch_idx * nnz; + // `nnz == idx_sizes[idx_ndims - 1]` is checked above as `nnz == idx.size(-1)` ++#if 0 + for (int i = idx_ndims - 1; + i >= 0 && nnz > 0; // break early when nnz==0 + i--) { +@@ -348,6 +349,7 @@ void _validate_compressed_sparse_indices_kernel( + idx_offset += (tmp - div * idx_sizes[i]) * idx_strides[i]; + tmp = div; + } ++#endif + const auto* RESTRICT ptr_idx_batch = ptr_idx + idx_offset; + _check_idx_sorted_distinct_vals_slices_with_cidx< + cdim_name, +-- +2.48.1 + diff --git a/0001-use-any-hip.patch b/0001-use-any-hip.patch new file mode 100644 index 0000000..dca86ea --- /dev/null +++ b/0001-use-any-hip.patch @@ -0,0 +1,34 @@ +From 4248211ce9a9de81bb3ade5d421ba709b19ead08 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 3 Feb 2024 15:01:28 -0500 +Subject: [PATCH] use any hip + +--- + cmake/public/LoadHIP.cmake | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake +index 1abeb06228..28458c4146 100644 +--- a/cmake/public/LoadHIP.cmake ++++ b/cmake/public/LoadHIP.cmake +@@ -30,7 +30,7 @@ endif() + message("Building PyTorch for GPU arch: ${PYTORCH_ROCM_ARCH}") + + # Add HIP to the CMAKE Module Path +-set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip ${CMAKE_MODULE_PATH}) ++set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib64/cmake/hip 
${CMAKE_MODULE_PATH}) + + macro(find_package_and_print_version PACKAGE_NAME) + find_package("${PACKAGE_NAME}" ${ARGN}) +@@ -38,7 +38,7 @@ macro(find_package_and_print_version PACKAGE_NAME) + endmacro() + + # Find the HIP Package +-find_package_and_print_version(HIP 1.0) ++find_package_and_print_version(HIP MODULE) + + if(HIP_FOUND) + set(PYTORCH_FOUND_HIP TRUE) +-- +2.43.0 + diff --git a/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch new file mode 100644 index 0000000..413c60d --- /dev/null +++ b/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch @@ -0,0 +1,47 @@ +From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 20 Jul 2024 05:37:15 -0600 +Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM + +Signed-off-by: Tom Rix +--- + CMakeLists.txt | 1 + + cmake/Dependencies.cmake | 3 ++- + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index c4cd4b2c2a98..2068f7c6c4f2 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF + "USE_CUDNN" OFF) + cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) + option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) ++option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) + option(USE_KINETO "Use Kineto profiling library" ON) + option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) + option(USE_FAKELOWP "Use FakeLowp operators" OFF) +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index f1f2eb7cec31..192dac46f13b 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -706,6 +706,7 @@ endif() + + # ---[ FBGEMM + if(USE_FBGEMM) ++ if (NOT USE_SYSTEM_FBGEMM) + set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") + if(NOT DEFINED FBGEMM_SOURCE_DIR) + set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE 
STRING "FBGEMM source directory") +@@ -746,7 +747,7 @@ if(USE_FBGEMM) + target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) + endif() + endif() +- ++ endif() + if(USE_FBGEMM) + list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) + endif() +-- +2.45.1 + diff --git a/next/0001-Optionally-use-hipblaslt.patch b/next/0001-Optionally-use-hipblaslt.patch new file mode 100644 index 0000000..1e5ca4b --- /dev/null +++ b/next/0001-Optionally-use-hipblaslt.patch @@ -0,0 +1,506 @@ +From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 29 Jun 2024 04:18:34 -0700 +Subject: [PATCH] Optionally use hipblaslt + +--- + aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ + aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ + aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- + aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- + aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- + cmake/Dependencies.cmake | 3 ++ + cmake/public/LoadHIP.cmake | 2 +- + 7 files changed, 82 insertions(+), 19 deletions(-) + +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index ce991a9bcad4..3f0d17b52778 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -14,7 +14,9 @@ + #include + + #ifdef USE_ROCM ++#ifdef USE_HIPBLASLT + #include ++#endif + // until hipblas has an API to accept flags, we must use rocblas here + #include + #include +@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { + static size_t _parseChosenWorkspaceSize() { + const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); + #ifdef USE_ROCM ++#ifndef USE_HIPBLASLT ++ return 0; ++#endif + if (!val) { + // accept either env var + val = getenv("HIPBLASLT_WORKSPACE_SIZE"); +@@ -235,6 +240,7 @@ namespace at::cuda::blas { + } while (0) + + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + namespace { + // Following the pattern of CuSparseDescriptor + // Defined here for now 
because this is the only place cublas_lt interface is +@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< + }; + } // namespace + +- + template + inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + cudaDataType_t abcType = CUDA_R_32F; +@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + " scaleType ", + scaleType); + } +- ++#endif + + template + inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { +@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) + template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); + } + } +@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + } + } +@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) + template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { + } + } + ++#if !defined(USE_ROCM) || 
(defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + // forward to bgemm implementation but set strides and batches to 0 + bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); + } ++#endif + + template + inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { +@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); + } + } +@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); + } + } +@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } + } +@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { + } + } + +- ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + void gemm_and_bias( + bool transpose_mat1, +@@ -1410,7 +1435,7 @@ void 
scaled_gemm( + ScalarType result_dtype, + void* amax_ptr, + bool use_fast_accum) { +-#if CUDA_VERSION >= 11080 || defined(USE_ROCM) ++#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + const auto computeType = CUBLAS_COMPUTE_32F; + const auto scaleType = CUDA_R_32F; + const int8_t fastAccuMode = use_fast_accum ? 1 : 0; +@@ -1681,6 +1706,7 @@ void int8_gemm( + " scaleType ", + scaleType); + } ++#endif + + template <> + void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { +diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h +index f2b657ced51b..f0ee613c4208 100644 +--- a/aten/src/ATen/cuda/CUDAContextLight.h ++++ b/aten/src/ATen/cuda/CUDAContextLight.h +@@ -9,7 +9,9 @@ + + // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also + // added bf16 support ++#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) + #include ++#endif + + #ifdef CUDART_VERSION + #include +@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); + /* Handles */ + TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); + TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); ++#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) + TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); ++#endif + + TORCH_CUDA_CPP_API void clearCublasWorkspaces(); + +diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp +index 8eac525b3695..abfdf7a23847 100644 +--- a/aten/src/ATen/cuda/CublasHandlePool.cpp ++++ b/aten/src/ATen/cuda/CublasHandlePool.cpp +@@ -29,7 +29,7 @@ namespace at::cuda { + + namespace { + +-#if defined(USE_ROCM) ++#if defined(USE_ROCM) && defined(USE_HIPBLASLT) + void createCublasLtHandle(cublasLtHandle_t *handle) { + TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); + } +@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { + return handle; + } + +-cublasLtHandle_t 
getCurrentCUDABlasLtHandle() { + #ifdef USE_ROCM ++#if defined(USE_HIPBLASLT) ++cublasLtHandle_t getCurrentCUDABlasLtHandle() { + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + +@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { + + auto handle = myPoolWindow->reserve(device); + return handle; ++} ++#endif + #else ++cublasLtHandle_t getCurrentCUDABlasLtHandle() { + return reinterpret_cast(getCurrentCUDABlasHandle()); +-#endif + } ++#endif + + } // namespace at::cuda +diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h +index 53e6154120c9..fa1d664696db 100644 +--- a/aten/src/ATen/cuda/tunable/TunableGemm.h ++++ b/aten/src/ATen/cuda/tunable/TunableGemm.h +@@ -11,7 +11,9 @@ + + #include + #ifdef USE_ROCM ++#ifdef USE_HIPBLASLT + #include ++#endif + #include + #endif + #include +@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> + } + }; + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + class DefaultScaledGemmOp : public Callable> { + public: +@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { + return OK; + } + }; ++#endif + + template + inline bool IsZero(T v) { +@@ -191,6 +195,7 @@ static void AddRocblasValidator() { + } + } + ++#ifdef USE_HIPBLASLT + static void AddHipblasltValidator() { + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + if (validators.find("HIPBLASLT_VERSION") == validators.end()) { +@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { + [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? 
OK : FAIL; }); + } + } ++#endif + + static void AddRocmValidator() { + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); +@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { + } + AddRocblasValidator(); + } +- ++#ifdef USE_HIPBLASLT + static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { + rocm_validators = true; +@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { + } + AddHipblasltValidator(); + } +- ++#endif + if (rocm_validators) { + AddRocmValidator(); + } +@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + AddRocblasValidator(); + } +- ++#ifdef USE_HIPBLASLT + static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { + rocm_validators = true; +@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + AddHipblasltValidator(); + } +- ++#endif + if (rocm_validators) { + AddRocmValidator(); + } +@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + }; + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + class ScaledGemmTunableOp : public TunableOp, StreamTimer> { + public: +@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + + #if defined(USE_ROCM) ++#ifdef USE_HIPBLASLT + for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + AddHipblasltValidator(); ++#endif + AddRocmValidator(); + #endif + } +@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> + "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); + } + }; ++#endif + + #undef XSTRINGIFY + #undef STRINGIFY 
+diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp +index 84c59a4fd0d7..56ad5de3bf2d 100644 +--- a/aten/src/ATen/native/cuda/Blas.cpp ++++ b/aten/src/ATen/native/cuda/Blas.cpp +@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa + } + + static bool getDisableAddmmCudaLt() { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); + #ifdef USE_ROCM + // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) +@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { + } + return false; + #endif ++#else ++ return true; ++#endif + } + + #ifdef USE_ROCM + static bool isSupportedHipLtROCmArch(int index) { ++#ifdef USE_HIPBLASLT + hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); + std::string device_arch = prop->gcnArchName; + static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; +@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { + } + } + TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); ++#endif + return false; + } + #endif +@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + at::ScalarType scalar_type = self.scalar_type(); + c10::MaybeOwned self_; + if (&result != &self) { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) + // Strangely, if mat2 has only 1 row or column, we get + // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. 
+@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + scalar_type != at::ScalarType::BFloat16)); + #endif + } ++#endif + #endif + if (!useLtInterface) { + self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); + } + self__sizes = self_->sizes(); + } else { +-#if defined(USE_ROCM) ++#if defined(USE_ROCM) && defined(USE_HIPBLASLT) + useLtInterface = !disable_addmm_cuda_lt && + result.dim() == 2 && result.is_contiguous() && + isSupportedHipLtROCmArch(self.device().index()) && +@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); + + if (useLtInterface) { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + #if defined(USE_ROCM) + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, +@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + activation_epilogue + ); + }); ++#endif + #endif + } else + { +@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { + } + + static bool _scaled_mm_allowed_device() { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + auto dprops = at::cuda::getCurrentDeviceProperties(); + #ifdef USE_ROCM + std::string device_arch = dprops->gcnArchName; +@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { + #else + return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); + #endif ++#else ++ return false; ++#endif + } + + // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax +@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + // Check sizes + bool allowed_device = _scaled_mm_allowed_device(); + TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); ++#if 
!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + TORCH_CHECK( +@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 + // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately + amax = at::max(at::abs(out.to(kFloat))); ++#endif + #endif + + return {out, amax}; +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index f1f2eb7cec31..8d05e834bbc5 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1052,6 +1052,9 @@ if(USE_ROCM) + list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) + list(APPEND HIP_CXX_FLAGS -std=c++17) + list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) ++ if(hipblaslt_FOUND) ++ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) ++ endif() + if(HIP_NEW_TYPE_ENUMS) + list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) + endif() +diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake +index fa39156031ff..df4836847fdf 100644 +--- a/cmake/public/LoadHIP.cmake ++++ b/cmake/public/LoadHIP.cmake +@@ -155,7 +155,7 @@ if(HIP_FOUND) + find_package_and_print_version(hiprand REQUIRED) + find_package_and_print_version(rocblas REQUIRED) + find_package_and_print_version(hipblas REQUIRED) +- find_package_and_print_version(hipblaslt REQUIRED) ++ find_package_and_print_version(hipblaslt) + find_package_and_print_version(miopen REQUIRED) + find_package_and_print_version(hipfft REQUIRED) + find_package_and_print_version(hipsparse REQUIRED) +-- +2.45.2 + diff --git a/next/0001-Use-horrible-dynamo-stub.patch b/next/0001-Use-horrible-dynamo-stub.patch deleted file mode 100644 index 1900519..0000000 --- a/next/0001-Use-horrible-dynamo-stub.patch +++ /dev/null @@ -1,85 +0,0 @@ -From fd535f7bf44f2034cca2a66b4cc7d68d962341df Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date:
Sun, 20 Jul 2025 12:47:58 -0700 -Subject: [PATCH] Use horrible dynamo stub - -Rawhide's update of python is too fast for dynamo -So paper of the problem with a horrible stub that throws -runtime exceptions if dynamo is used. - -Signed-off-by: Tom Rix ---- - build_variables.bzl | 26 ++++++++++++---------- - torch/csrc/dynamo/horrible_dynamo_stub.cpp | 16 +++++++++++++ - 2 files changed, 30 insertions(+), 12 deletions(-) - create mode 100644 torch/csrc/dynamo/horrible_dynamo_stub.cpp - -diff --git a/build_variables.bzl b/build_variables.bzl -index b266c80e8843..a3be6893349b 100644 ---- a/build_variables.bzl -+++ b/build_variables.bzl -@@ -140,7 +140,8 @@ core_trainer_sources = [ - "torch/csrc/autograd/variable.cpp", - "torch/csrc/autograd/utils/warnings.cpp", - "torch/csrc/autograd/jit_decomp_interface.cpp", -- "torch/csrc/dynamo/compiled_autograd.cpp", -+# "torch/csrc/dynamo/compiled_autograd.cpp", -+ "torch/csrc/dynamo/horrible_dynamo_stub.cpp", - "torch/csrc/jit/frontend/name_mangler.cpp", - "torch/csrc/jit/ir/type_hashing.cpp", - "torch/csrc/jit/serialization/pickler.cpp", -@@ -868,17 +869,18 @@ libtorch_python_core_sources = [ - "torch/csrc/autograd/python_torch_functions_manual.cpp", - "torch/csrc/autograd/python_variable.cpp", - "torch/csrc/autograd/python_variable_indexing.cpp", -- "torch/csrc/dynamo/python_compiled_autograd.cpp", -- "torch/csrc/dynamo/cache_entry.cpp", -- "torch/csrc/dynamo/cpp_shim.cpp", -- "torch/csrc/dynamo/cpython_defs.c", -- "torch/csrc/dynamo/eval_frame.c", -- "torch/csrc/dynamo/eval_frame_cpp.cpp", -- "torch/csrc/dynamo/extra_state.cpp", -- "torch/csrc/dynamo/framelocals_mapping.cpp", -- "torch/csrc/dynamo/guards.cpp", -- "torch/csrc/dynamo/utils.cpp", -- "torch/csrc/dynamo/init.cpp", -+# "torch/csrc/dynamo/python_compiled_autograd.cpp", -+# "torch/csrc/dynamo/cache_entry.cpp", -+# "torch/csrc/dynamo/cpp_shim.cpp", -+# "torch/csrc/dynamo/cpython_defs.c", -+# "torch/csrc/dynamo/eval_frame.c", -+# "torch/csrc/dynamo/eval_frame_cpp.cpp", 
-+# "torch/csrc/dynamo/extra_state.cpp", -+# "torch/csrc/dynamo/framelocals_mapping.cpp", -+# "torch/csrc/dynamo/guards.cpp", -+# "torch/csrc/dynamo/utils.cpp", -+# "torch/csrc/dynamo/init.cpp", -+ "torch/csrc/dynamo/horrible_dynamo_stub.cpp", - "torch/csrc/functorch/init.cpp", - "torch/csrc/fx/node.cpp", - "torch/csrc/mps/Module.cpp", -diff --git a/torch/csrc/dynamo/horrible_dynamo_stub.cpp b/torch/csrc/dynamo/horrible_dynamo_stub.cpp -new file mode 100644 -index 000000000000..3ac1324d4557 ---- /dev/null -+++ b/torch/csrc/dynamo/horrible_dynamo_stub.cpp -@@ -0,0 +1,16 @@ -+#include -+#include -+ -+namespace torch::dynamo::autograd { -+const std::unique_ptr& getPyCompilerInterface() { -+ throw std::runtime_error("Dynamo not supported"); -+ return nullptr; -+} -+std::vector> get_input_metadata( -+ const edge_list& edges) { -+ std::vector> r; -+ throw std::runtime_error("Dynamo not supported"); -+ return r; -+} -+ -+} --- -2.49.0 - diff --git a/next/0001-disable-use-of-aotriton.patch b/next/0001-disable-use-of-aotriton.patch new file mode 100644 index 0000000..61ffd1e --- /dev/null +++ b/next/0001-disable-use-of-aotriton.patch @@ -0,0 +1,94 @@ +From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 29 Jun 2024 07:06:09 -0700 +Subject: [PATCH] disable use of aotriton + +--- + .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) + +diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +index 214b02d8262e..7b3eb9dcd8cd 100644 +--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp ++++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +@@ -19,9 +19,12 @@ + #include + #include + ++#ifdef USE_FLASH_ATTENTION + #if USE_ROCM + #include + #endif ++#endif ++ + + /** + * Note [SDPA Runtime Dispatch] +@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { + + bool 
check_flash_attention_hardware_support(sdp_params const& params, bool debug) { + // Check that the gpu is capable of running flash attention ++#ifndef USE_FLASH_ATTENTION ++ return false; ++#else + using sm80 = SMVersion<8, 0>; + using sm90 = SMVersion<9, 0>; + #if USE_ROCM +@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug + } + #endif + return true; ++#endif + } + + bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { ++#ifndef USE_FLASH_ATTENTION ++ return false; ++#else + // Mem Efficient attention supports hardware in the range [sm_50, sm_90] + using sm50 = SMVersion<5, 0>; + using sm90 = SMVersion<9, 0>; +@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) + } + #endif + return true; ++#endif + } + + bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( +@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { + #ifndef USE_FLASH_ATTENTION + TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); + return false; +-#endif ++#else + + // Define gate functions that determine if a flash kernel can be ran + // Replace with std::to_array when we migrate to c++20 +@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { + } + } + return true; ++#endif + } + + bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { + #ifndef USE_MEM_EFF_ATTENTION + TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); + return false; +-#endif ++#else + // Constraints specific to mem efficient attention + constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = + array_of(at::kHalf, at::kFloat, at::kBFloat16); +@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { + } + #endif + return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); 
++#endif + } + + SDPBackend select_sdp_backend(sdp_params const& kernel_params) { +-- +2.45.2 + diff --git a/pyproject.toml b/pyproject.toml index 925742b..9508ad0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,165 +1,46 @@ -# Package ###################################################################### - [build-system] requires = [ - # 70.1.0: min version for integrated bdist_wheel command from wheel package - # 77.0.0: min version for SPDX expression support for project.license - "setuptools>=70.1.0,<80.0", - "cmake>=3.27", - "ninja", - "numpy", - "packaging", - "pyyaml", - "requests", - "six", # dependency chain: NNPACK -> PeachPy -> six - "typing-extensions>=4.10.0", -] -build-backend = "setuptools.build_meta" - -[dependency-groups] -dev = [ - # This list should be kept in sync with the requirements-build.txt - # in PyTorch root until the project fully migrates to pyproject.toml - # after which this can be removed as it is already specified in the - # [build-system] section - "setuptools>=70.1.0,<80.0", # setuptools develop deprecated on 80.0 - "cmake>=3.27", - "ninja", - "numpy", - "packaging", - "pyyaml", - "requests", - "six", # dependency chain: NNPACK -> PeachPy -> six - "typing-extensions>=4.10.0", - - # This list should be kept in sync with the requirements.txt in - # PyTorch root until the project fully migrates to pyproject.toml - "build[uv]", - "expecttest>=0.3.0", - "filelock", - "fsspec>=0.8.5", - "hypothesis", - "jinja2", - "lintrunner; platform_machine != 's390x' and platform_machine != 'riscv64'", - "networkx>=2.5.1", - "optree>=0.13.0", - "psutil", - "sympy>=1.13.3", - "typing-extensions>=4.13.2", + "setuptools", "wheel", + "astunparse", + "numpy", + "ninja", + "pyyaml", + "cmake", + "typing-extensions", + "requests", ] +# Use legacy backend to import local packages in setup.py +build-backend = "setuptools.build_meta:__legacy__" -[project] -name = "torch" -description = "Tensors and Dynamic neural networks in Python with strong GPU 
acceleration" -readme = "README.md" -requires-python = ">=3.10" -# TODO: change to `license = "BSD-3-Clause"` and enable PEP 639 after pinning setuptools>=77 -# FIXME: As of 2025.06.20, it is hard to ensure the minimum version of setuptools in our CI environment. -# TOML-table-based license deprecated in setuptools>=77, and the deprecation warning will be changed -# to an error on 2026.02.18. See also: https://github.com/pypa/setuptools/issues/4903 -license = { text = "BSD-3-Clause" } -authors = [{ name = "PyTorch Team", email = "packages@pytorch.org" }] -keywords = ["pytorch", "machine learning"] -classifiers = [ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Education", - "Intended Audience :: Science/Research", - "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Mathematics", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Topic :: Software Development", - "Topic :: Software Development :: Libraries", - "Topic :: Software Development :: Libraries :: Python Modules", - "Programming Language :: C++", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Programming Language :: Python :: 3.14", -] -dynamic = [ - "entry-points", - "dependencies", - "scripts", - "version", -] -[project.urls] -Homepage = "https://pytorch.org" -Repository = "https://github.com/pytorch/pytorch" -Documentation = "https://pytorch.org/docs" -"Issue Tracker" = "https://github.com/pytorch/pytorch/issues" -Forum = "https://discuss.pytorch.org" +[tool.black] +# Uncomment if pyproject.toml worked fine to ensure consistency with flake8 +# line-length = 120 +target-version = ["py38", "py39", "py310", "py311"] -[project.optional-dependencies] -optree = ["optree>=0.13.0"] -opt-einsum = ["opt-einsum>=3.3"] -pyyaml = 
["pyyaml"] - -# Linter tools ################################################################# - -[tool.isort] -src_paths = ["caffe2", "torch", "torchgen", "functorch", "test"] -extra_standard_library = ["typing_extensions"] -skip_gitignore = true -skip_glob = ["third_party/*"] -atomic = true -profile = "black" -indent = 4 -line_length = 88 -lines_after_imports = 2 -multi_line_output = 3 -include_trailing_comma = true -combine_as_imports = true - -[tool.usort.known] -first_party = ["caffe2", "torch", "torchgen", "functorch", "test"] -standard_library = ["typing_extensions"] [tool.ruff] -line-length = 88 -src = ["caffe2", "torch", "torchgen", "functorch", "test"] +target-version = "py38" -[tool.ruff.format] -docstring-code-format = true -quote-style = "double" - -[tool.ruff.lint] # NOTE: Synchoronize the ignores with .flake8 -external = [ - "B001", - "B902", - "B950", - "E121", - "E122", - "E128", - "E131", - "E704", - "E723", - "F723", - "F812", - "P201", - "P204", - "T484", - "TOR901", -] ignore = [ # these ignores are from flake8-bugbear; please fix! "B007", "B008", "B017", "B018", # Useless expression + "B019", "B023", "B028", # No explicit `stacklevel` keyword argument found + "B904", "E402", "C408", # C408 ignored because we like the dict keyword argument syntax "E501", # E501 is not flexible enough, we're using B950 instead "E721", + "E731", # Assign lambda expression "E741", "EXE001", "F405", - "FURB122", # writelines + "F841", # these ignores are from flake8-logging-format; please fix! "G101", # these ignores are from ruff NPY; please fix! @@ -167,49 +48,39 @@ ignore = [ # these ignores are from ruff PERF; please fix! "PERF203", "PERF401", + "PERF403", # these ignores are from PYI; please fix! + "PYI019", "PYI024", "PYI036", "PYI041", "PYI056", "SIM102", "SIM103", "SIM112", # flake8-simplify code styles "SIM105", # these ignores are from flake8-simplify. 
please fix or ignore with commented reason - "SIM108", # SIM108 ignored because we prefer if-else-block instead of ternary expression + "SIM108", "SIM110", "SIM114", # Combine `if` branches using logical `or` operator "SIM115", "SIM116", # Disable Use a dictionary instead of consecutive `if` statements "SIM117", "SIM118", + "UP006", # keep-runtime-typing "UP007", # keep-runtime-typing - "UP045", # keep-runtime-typing - "TC006", - # TODO: Remove Python-3.10 specific suppressions - "B905", - "UP035", - "UP036", - "UP038", - "UP041", - "FURB161", ] +line-length = 120 select = [ "B", - "B904", # Re-raised error without specifying the cause via the from keyword "C4", "G", "E", "EXE", "F", "SIM1", - "SIM911", "W", # Not included in flake8 - "FURB", - "LOG", "NPY", "PERF", "PGH004", - "PIE790", "PIE794", "PIE800", "PIE804", @@ -218,96 +89,40 @@ select = [ "PLC0131", # type bivariance "PLC0132", # type param mismatch "PLC0205", # string as __slots__ - "PLC3002", # unnecessary-direct-lambda-call "PLE", "PLR0133", # constant comparison "PLR0206", # property with params "PLR1722", # use sys exit - "PLR1736", # unnecessary list index "PLW0129", # assert on string literal - "PLW0131", # named expr without context - "PLW0133", # useless exception statement - "PLW0245", # super without brackets "PLW0406", # import self "PLW0711", # binary op exception - "PLW1501", # bad open mode - "PLW1507", # shallow copy os.environ "PLW1509", # preexec_fn not safe with threads - "PLW2101", # useless lock statement "PLW3301", # nested min max "PT006", # TODO: enable more PT rules - "PT014", # duplicate parameterize case "PT022", "PT023", "PT024", "PT025", "PT026", "PYI", - "Q003", # avoidable escaped quote - "Q004", # unnecessary escaped quote - "RSE", "RUF008", # mutable dataclass default - "RUF013", # ban implicit optional "RUF015", # access first ele in constant time "RUF016", # type error non-integer index "RUF017", - "RUF018", # no assignment in assert - "RUF019", # unnecessary-key-check - 
"RUF020", # never union - "RUF024", # from keys mutable - "RUF026", # default factory kwarg - "RUF030", # No print statement in assert - "RUF033", # default values __post_init__ dataclass - "RUF041", # simplify nested Literal - "RUF048", # properly parse `__version__` - "RUF200", # validate pyproject.toml - "S324", # for hashlib FIPS compliance - "SLOT", - "TC", - "TRY002", # ban vanilla raise (todo fix NOQAs) - "TRY203", - "TRY401", # verbose-log-message + "TRY200", + "TRY302", "UP", - "YTT", ] -[tool.ruff.lint.pyupgrade] -# Preserve types, even if a file imports `from __future__ import annotations`. -keep-runtime-typing = true - -[tool.ruff.lint.per-file-ignores] +[tool.ruff.per-file-ignores] "__init__.py" = [ "F401", ] -"*.pyi" = [ - "PYI011", # typed-argument-default-in-stub - "PYI021", # docstring-in-stub - "PYI053", # string-or-bytes-too-long -] -"functorch/notebooks/**" = [ - "F401", -] -"test/export/**" = [ - "PGH004" -] -"test/typing/**" = [ - "PGH004" -] "test/typing/reveal/**" = [ "F821", ] "test/torch_np/numpy_tests/**" = [ "F821", - "NPY201", -] -"test/dynamo/test_bytecode_utils.py" = [ - "F821", -] -"test/dynamo/test_debug_utils.py" = [ - "UP037", -] -"test/dynamo/test_misc.py" = [ - "PGH004", ] "test/jit/**" = [ "PLR0133", # tests require this for JIT @@ -321,33 +136,19 @@ keep-runtime-typing = true "RUF015", "UP", # We don't want to modify the jit test as they test specify syntax ] -"test/inductor/s429861_repro.py" = [ - "PGH004", -] -"test/inductor/test_torchinductor.py" = [ - "UP037", -] -# autogenerated #TODO figure out why file level noqa is ignored -"torch/_appdirs.py" = ["PGH004"] -"torch/jit/_shape_functions.py" = ["PGH004"] -"torch/_inductor/fx_passes/serialized_patterns/**" = ["F401", "F501"] -"torch/_inductor/autoheuristic/artifacts/**" = ["F401", "F501"] -"torch/_inductor/codegen/**" = [ - "PGH004" + +"torch/onnx/**" = [ + "UP037", # ONNX does runtime type checking ] + "torchgen/api/types/__init__.py" = [ "F401", "F403", ] 
+"torchgen/executorch/api/types/__init__.py" = [ + "F401", + "F403", +] "torch/utils/collect_env.py" = [ "UP", # collect_env.py needs to work with older versions of Python ] -"torch/_vendor/**" = [ - "UP", # No need to mess with _vendor -] -"tools/linter/**" = [ - "LOG015" # please fix -] - -[tool.codespell] -ignore-words = "tools/linter/dictionary.txt" diff --git a/python-torch.spec b/python-torch.spec index d3c31d7..86794c5 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,20 +6,13 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.9.0-rc9 -%global commit0 0fabc3ba44823f257e70ce397d989c8de5e362c1 +# v2.5.0-rc9 +%global commit0 417a0763a7d69f6ce80719ac89c1d2deeee78163 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20251008 -%global pypi_version 2.9.0 -%global flatbuffers_version 24.12.23 -%global miniz_version 3.0.2 -%global pybind11_version 2.13.6 -%global rc_tag -rc9 +%global date0 2024103 +%global pypi_version 2.5.0 %else -%global pypi_version 2.9.1 -%global flatbuffers_version 24.12.23 -%global miniz_version 3.0.2 -%global pybind11_version 2.13.6 +%global pypi_version 2.5.1 %endif # For -test subpackage @@ -32,9 +25,12 @@ %ifarch x86_64 %bcond_without rocm %endif +%bcond_with rocm_loop +%global rocm_default_gpu default +%global rocm_gpu_list gfx9 # For testing distributed+rccl etc. 
-%bcond_without rccl +%bcond_with rccl %bcond_with gloo %bcond_without mpi %bcond_without tensorpipe @@ -49,12 +45,6 @@ %bcond_with httplib %bcond_with kineto -%if 0%{?fedora} -%bcond_without onnx -%else -%bcond_with onnx -%endif - Name: python-%{pypi_name} %if %{with gitcommit} Version: %{pypi_version}^git%{date0}.%{shortcommit0} @@ -73,8 +63,8 @@ Source1000: pyproject.toml %else Source0: %{forgeurl}/releases/download/v%{version}/pytorch-v%{version}.tar.gz %endif -Source1: https://github.com/google/flatbuffers/archive/refs/tags/v%{flatbuffers_version}.tar.gz -Source2: https://github.com/pybind/pybind11/archive/refs/tags/v%{pybind11_version}.tar.gz +Source1: https://github.com/google/flatbuffers/archive/refs/tags/v23.3.3.tar.gz +Source2: https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz # Developement on tensorpipe has stopped, repo made read only July 1, 2023, this is the last commit %global tp_commit 52791a2fd214b2a9dc5759d36725909c1daa7f2e @@ -99,20 +89,26 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- %endif %if %{without kineto} -%global ki_commit 5e7501833f1021ce6f618572d3baf657b6319658 +%global ki_commit be1317644c68b4bfc4646024a6b221066e430031 %global ki_scommit %(c=%{ki_commit}; echo ${c:0:7}) Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif -%global ox_ver 1.18.0 -Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz +Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch -%global pt_arches x86_64 aarch64 -ExclusiveArch: %pt_arches +# ROCm patches +# Patches need to be refactored for ToT +# These are ROCm packages +Patch101: 0001-cuda-hip-signatures.patch +# https://github.com/pytorch/pytorch/issues/145608 +Patch102: 0001-torch-paper-over-c-assert.patch + +ExclusiveArch: x86_64 aarch64 %global toolchain gcc %global _lto_cflags %nil BuildRequires: cmake +BuildRequires: binutils-gold BuildRequires: eigen3-devel BuildRequires: 
flexiblas-devel BuildRequires: fmt-devel @@ -126,12 +122,9 @@ BuildRequires: gloo-devel BuildRequires: json-devel BuildRequires: libomp-devel -BuildRequires: moodycamel-concurrentqueue-devel BuildRequires: numactl-devel BuildRequires: ninja-build -%if %{with onnx} BuildRequires: onnx-devel -%endif %if %{with mpi} BuildRequires: openmpi-devel %endif @@ -152,7 +145,6 @@ BuildRequires: python3dist(filelock) BuildRequires: python3dist(jinja2) BuildRequires: python3dist(networkx) BuildRequires: python3dist(numpy) -BuildRequires: python3dist(pip) BuildRequires: python3dist(pyyaml) BuildRequires: python3dist(setuptools) BuildRequires: python3dist(sphinx) @@ -171,10 +163,8 @@ BuildRequires: hipcub-devel BuildRequires: hipfft-devel BuildRequires: hiprand-devel BuildRequires: hipsparse-devel -BuildRequires: hipsparselt-devel BuildRequires: hipsolver-devel -# Magma is broken on ROCm 7 -# BuildRequires: magma-devel +BuildRequires: magma-devel BuildRequires: miopen-devel BuildRequires: rocblas-devel BuildRequires: rocrand-devel @@ -190,12 +180,12 @@ BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros -BuildRequires: rocsolver-devel -BuildRequires: rocm-smi-devel +BuildRequires: rocm-rpm-macros-modules BuildRequires: rocthrust-devel BuildRequires: roctracer-devel Requires: amdsmi +Requires: rocm-rpm-macros-modules %endif @@ -204,7 +194,6 @@ BuildRequires: google-benchmark-devel %endif Requires: python3dist(dill) -Requires: python3dist(yaml) Obsoletes: caffe = 1.0^git20200212.9b89154 @@ -224,10 +213,10 @@ Summary: %{summary} Provides: pytorch # Apache-2.0 -Provides: bundled(flatbuffers) = %{flatbuffers_version} +Provides: bundled(flatbuffers) = 22.3.3 # MIT -Provides: bundled(miniz) = %{miniz_version} -Provides: bundled(pybind11) = %{pybind11_version} +Provides: bundled(miniz) = 2.1.0 +Provides: bundled(pybind11) = 2.11.1 %if %{with tensorpipe} # BSD-3-Clause @@ -247,6 +236,15 @@ PyTorch is a Python package 
that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. +%if %{with rocm_loop} +%package -n python3-%{pypi_name}-rocm-gfx9 +Summary: %{name} for ROCm gfx9 + +%description -n python3-%{pypi_name}-rocm-gfx9 +%{summary} + +%endif + %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -273,11 +271,11 @@ rm -rf %{pypi_name}.egg-info tar xf %{SOURCE1} rm -rf third_party/flatbuffers/* -cp -r flatbuffers-%{flatbuffers_version}/* third_party/flatbuffers/ +cp -r flatbuffers-23.3.3/* third_party/flatbuffers/ tar xf %{SOURCE2} rm -rf third_party/pybind11/* -cp -r pybind11-%{pybind11_version}/* third_party/pybind11/ +cp -r pybind11-2.11.1/* third_party/pybind11/ %if %{with tensorpipe} tar xf %{SOURCE20} @@ -313,14 +311,8 @@ rm -rf third_party/kineto/* cp -r kineto-*/* third_party/kineto/ %endif -%if %{without onnx} -tar xf %{SOURCE90} -rm -rf third_party/onnx/* -cp -r onnx-*/* third_party/onnx/ -%endif - -# Adjust for the hipblaslt's we build -sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a", "gfx1103", "gfx1150", "gfx1151", "gfx1100", "gfx1101", "gfx1200", "gfx1201"@' aten/src/ATen/native/cuda/Blas.cpp +# hipblaslt only building with gfx90a +sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a"@' aten/src/ATen/native/cuda/Blas.cpp %if 0%{?rhel} # In RHEL but too old @@ -343,17 +335,9 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-pass sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unused-command-line-argument@' cmake/Dependencies.cmake sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unused-result@' cmake/Dependencies.cmake sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-deprecated-declarations@' cmake/Dependencies.cmake -# Use parallel jobs -sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc 
-parallel-jobs=4@' cmake/Dependencies.cmake -# Need to link with librocm_smi64 -sed -i -e 's@hiprtc::hiprtc@hiprtc::hiprtc rocm_smi64@' cmake/Dependencies.cmake # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt -sed -i -e 's@fmt::fmt-header-only@fmt@' aten/src/ATen/CMakeLists.txt -sed -i -e 's@list(APPEND ATen_HIP_INCLUDE $)@@' aten/src/ATen/CMakeLists.txt - -sed -i -e 's@fmt::fmt-header-only@fmt@' third_party/kineto/libkineto/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' c10/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' torch/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' cmake/Dependencies.cmake @@ -367,10 +351,6 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPE sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt -# https://github.com/pytorch/pytorch/issues/149803 -# Tries to checkout nccl -sed -i -e 's@ checkout_nccl()@ True@' tools/build_pytorch_libs.py - # Disable the use of check_submodule's in the setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -382,7 +362,7 @@ sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py # the third_party dir to compile the file. # mimiz is licensed MIT # https://github.com/richgel999/miniz/blob/master/LICENSE -mv third_party/miniz-%{miniz_version} . +mv third_party/miniz-2.1.0 . # # setup.py depends on this script mv third_party/build_bundled.py . @@ -408,10 +388,6 @@ mv third_party/cpp-httplib . mv third_party/kineto . %endif -%if %{without onnx} -mv third_party/onnx . -%endif - %if %{with test} mv third_party/googletest . %endif @@ -420,7 +396,7 @@ mv third_party/googletest . 
rm -rf third_party/* # Put stuff back mv build_bundled.py third_party -mv miniz-%{miniz_version} third_party +mv miniz-2.1.0 third_party mv flatbuffers third_party mv pybind11 third_party @@ -440,10 +416,6 @@ mv cpp-httplib third_party mv kineto third_party %endif -%if %{without onnx} -mv onnx third_party -%endif - %if %{with test} mv googletest third_party %endif @@ -451,7 +423,6 @@ mv googletest third_party # # Fake out pocketfft, and system header will be used mkdir third_party/pocketfft -cp /usr/include/pocketfft_hdronly.h third_party/pocketfft/ # # Use the system valgrind headers @@ -464,23 +435,11 @@ sed -i -e 's@DESTINATION ${PYTHON_LIB_REL_PATH}@DESTINATION ${CMAKE_INSTALL_PREF # reenable foxi linking sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@#list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@' cmake/Dependencies.cmake -# cmake version changed -sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' third_party/tensorpipe/third_party/libuv/CMakeLists.txt -sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' libuv*/CMakeLists.txt -%if %{without opentelemtry} -sed -i -e 's@cmake_minimum_required(VERSION 3.1)@cmake_minimum_required(VERSION 3.5)@' third_party/opentelemetry-cpp/CMakeLists.txt -%endif - %if %{with rocm} # hipify ./tools/amd_build/build_amd.py # Fedora installs to /usr/include, not /usr/include/rocm-core sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/TunableGemm.h -# https://github.com/pytorch/pytorch/issues/149805 -sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' cmake/public/LoadHIP.cmake -# Fedora installs to /usr/include, not /usr/include/rocm-core -sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/Tunable.cpp -sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/cuda/tunable/Tunable.cpp # use any hip, correct CMAKE_MODULE_PATH sed -i -e 's@lib/cmake/hip@lib64/cmake/hip@' 
cmake/public/LoadHIP.cmake sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake @@ -489,27 +448,16 @@ sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake %endif -# moodycamel include path needs adjusting to use the system's -sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/concurrentqueue@' cmake/Dependencies.cmake - %build -# Export the arches -# echo "%%pytorch_arches %pt_arches" > macros.pytorch - # # Control the number of jobs # # The build can fail if too many threads exceed the physical memory -# Run at least one thread, more if CPU & memory resources are available. +# So count core and and memory and increase the build memory util the build succeeds # -%ifarch x86_64 # Real cores, No hyperthreading COMPILE_JOBS=`cat /proc/cpuinfo | grep -m 1 'cpu cores' | awk '{ print $4 }'` -%else -# cpuinfo format varies on other arches, fall back to nproc -COMPILE_JOBS=`nproc` -%endif if [ ${COMPILE_JOBS}x = x ]; then COMPILE_JOBS=1 fi @@ -549,10 +497,10 @@ export USE_CUDA=OFF export USE_FAKELOWP=OFF export USE_FBGEMM=OFF export USE_FLASH_ATTENTION=OFF +export USE_GOLD_LINKER=ON export USE_GLOO=OFF export USE_ITT=OFF export USE_KINETO=OFF -export USE_KLEIDIAI=OFF export USE_LITE_INTERPRETER_PROFILER=OFF export USE_LITE_PROTO=OFF export USE_MAGMA=OFF @@ -567,21 +515,18 @@ export USE_PYTORCH_QNNPACK=OFF export USE_ROCM=OFF export USE_SYSTEM_SLEEF=ON export USE_SYSTEM_EIGEN_INSTALL=ON -%if %{with onnx} export USE_SYSTEM_ONNX=ON -%endif export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF -export USE_SYSTEM_NCCL=OFF export USE_TENSORPIPE=OFF -export USE_XNNPACK=OFF +export USE_XNNPACK=ON export USE_XPU=OFF export USE_SYSTEM_PTHREADPOOL=ON export USE_SYSTEM_CPUINFO=ON export USE_SYSTEM_FP16=ON export USE_SYSTEM_FXDIV=ON export USE_SYSTEM_PSIMD=ON -export USE_SYSTEM_XNNPACK=OFF +export USE_SYSTEM_XNNPACK=ON export USE_DISTRIBUTED=ON %if %{with tensorpipe} @@ -610,73 +555,110 @@ export BUILD_TEST=ON # # See BZ 2244862 + %if %{with 
rocm} export USE_ROCM=ON -export USE_ROCM_CK_SDPA=OFF -export USE_ROCM_CK_GEMM=OFF -export USE_FBGEMM_GENAI=OFF - -# Magma is broken on ROCm 7 -# export USE_MAGMA=ON +export USE_MAGMA=ON export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` -#RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` -#export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode +RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` +export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} -export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%endif - -%if 0%{?fedora} -%pyproject_wheel -%else +gpu=%{rocm_default_gpu} +module load rocm/$gpu +export PYTORCH_ROCM_ARCH=$ROCM_GPUS %py3_build +mv build build-${gpu} +module purge + +%if %{with rocm_loop} +for gpu in %{rocm_gpu_list} +do + module load rocm/$gpu + export PYTORCH_ROCM_ARCH=$ROCM_GPUS + %py3_build + mv build build-${gpu} + module purge +done %endif +%else + +%py3_build + +%endif %install -# pytorch rpm macros -# install -Dpm 644 macros.pytorch \ -# %{buildroot}%{_rpmmacrodir}/macros.pytorch - %if %{with rocm} export USE_ROCM=ON -export USE_ROCM_CK=OFF export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` -# RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` -# export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode +RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` +export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} -export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%endif - -%if 0%{?fedora} -%pyproject_install -%pyproject_save_files '*torch*' -%else +gpu=%{rocm_default_gpu} +module load rocm/$gpu +export PYTORCH_ROCM_ARCH=$ROCM_GPUS +mv build-${gpu} build %py3_install +mv build build-${gpu} +module purge + +%if %{with rocm_loop} +for gpu in %{rocm_gpu_list} +do + module load rocm/$gpu + export PYTORCH_ROCM_ARCH=$ROCM_GPUS + mv build-${gpu} build + 
# need to customize the install location, so replace py3_install + %{__python3} %{py_setup} %{?py_setup_args} install -O1 --skip-build --root %{buildroot} --prefix /usr/lib64/rocm/${gpu} %{?*} + rm -rfv %{buildroot}/usr/lib/rocm/${gpu}/bin/__pycache__ + mv build build-${gpu} + module purge +done %endif +%else + +%py3_install + + +%endif %check -# Not working yet -# pyproject_check_import torch +%py3_check_import torch # Do not remote the empty files %files -n python3-%{pypi_name} %license LICENSE %doc README.md +%{_bindir}/convert-caffe2-to-onnx +%{_bindir}/convert-onnx-to-caffe2 %{_bindir}/torchrun -%{python3_sitearch}/%{pypi_name}* +%{_bindir}/torchfrtrace +%{python3_sitearch}/%{pypi_name} +%{python3_sitearch}/%{pypi_name}-*.egg-info %{python3_sitearch}/functorch +%{python3_sitearch}/torchgen + +%if %{with rocm} +%if %{with rocm_loop} + +%files -n python3-%{pypi_name}-rocm-gfx9 +%{_libdir}/rocm/gfx9/bin/* +%{_libdir}/rocm/gfx9/lib64/* + +%endif +%endif %changelog %autochangelog diff --git a/sources b/sources index 9a3681f..aa1ed3c 100644 --- a/sources +++ b/sources @@ -1,19 +1,11 @@ -SHA512 (pytorch-v2.7.0.tar.gz) = 17e875a66f1669901f5f770c9d829ba5bfa3967296cfb71550e8a92507181db742548eaf7cc9a2c478c4b91e366f27cc480e2e1bbb328db8501d30e1649839e6 SHA512 (v23.3.3.tar.gz) = 4066c94f2473c7ea16917d29a613e16f840a329089c88e0bdbdb999aef3442ba00abfd2aa92266fa9c067e399dc88e6f0ccac40dc151378857e665638e78bbf0 -SHA512 (v2.13.6.tar.gz) = 497c25b33b09a9c42f67131ab82e35d689e8ce089dd7639be997305ff9a6d502447b79c824508c455d559e61f0186335b54dd2771d903a7c1621833930622d1a +SHA512 (v2.11.1.tar.gz) = ed1512ff0bca3bc0a45edc2eb8c77f8286ab9389f6ff1d5cb309be24bc608abbe0df6a7f5cb18c8f80a3bfa509058547c13551c3cd6a759af708fd0cdcdd9e95 SHA512 (tensorpipe-52791a2.tar.gz) = 1e5faf17a7236c5506c08cb28be16069b11bb929bbca64ed9745ce4277d46739186ab7d6597da7437d90ed2d166d4c37ef2f3bceabe8083ef3adbb0e8e5f227e -SHA512 (v1.41.0.tar.gz) = 
bb08a1970a10e8d9571ffea3d021643de30ec212cd51317b98d6cf0cfe55d6877992921fb01d1188a6d466687335b77885685d924f8cb7200a0bec30eee05c65 SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3522470b1ecc4798ff43d344a013cee2f6901e83267c6167072947b754e63f1552ae7044cffe234c36 +SHA512 (v1.41.0.tar.gz) = bb08a1970a10e8d9571ffea3d021643de30ec212cd51317b98d6cf0cfe55d6877992921fb01d1188a6d466687335b77885685d924f8cb7200a0bec30eee05c65 SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7 SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a -SHA512 (pytorch-a1cb3cc.tar.gz) = 92bf8b2c2ef0b459406b60169ecebdc50652c75943e3d6087e4d261f6e308dbad365529561e0f07ea3f0b71790efb68b5e4ab2f44e270462097208d924dc2d95 -SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185c9ef602294b67d73aea7ae371035579a1419cbfbeba7c3e88b31b5a5848db98f5e8a03b982b1 -SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 -SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 -SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d -SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 -SHA512 (pytorch-fd36458.tar.gz) = 
acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc -SHA512 (pytorch-0fabc3b.tar.gz) = 2e87975de0bf6f3dcede168b379e1928712bca16170c2a8ee7d63459f53086c01baac05e0763e4d5d28cdaf1c7d8912225ee06adeff96ead4f6f456ee174b341 -SHA512 (pytorch-v2.9.0.tar.gz) = ae989e3a7fe30f9ea90944dc25e21ca92f2a94ee40d8de974a168c292d82c16ee8920624eff91a85755469ad05473dce0f85893e3ed7794ec5c6bdd89cbd2023 -SHA512 (pytorch-v2.9.1.tar.gz) = 88de0289fa2760abd69bef505b5ae3b6d7ff176b415cbb31bbc89ce5476a3800b322a97c4490f270f8b89657aff931bf9a5516202b268e0bb8b1f63dbb87b34a +SHA512 (pytorch-v2.4.1.tar.gz) = fc364112a88540035f213274b526b4c8ad9be335ff214161091e8a8d3efa89ebef5a5b4d33b20b5b659896d164dcbe739f0d7d2a293d3afc0efcfaa2bf62fc2c +SHA512 (pytorch-v2.5.0.tar.gz) = 6ccf1ac9f191f5bd757ef7fbfc1dcd81d591577f2d3df7313c6ed32790c592aaffd253e18dc778a2fcc707e4533299817dfdf9fae108636ce5c29c1b8ff8bba6 +SHA512 (pytorch-v2.5.1.tar.gz) = a8882608c2ab6467a58d60c6df84c9f1004b43eafeba57db499dbbfdecc09db2e221b9d4c344c8af7c0bea6252e874c400483502dca24a0b474c376b9fef1dd4