diff --git a/.gitignore b/.gitignore index 1ab81b0..c424df5 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,18 @@ /v1.14.2.tar.gz /cpp-httplib-3b6597b.tar.gz /kineto-be13176.tar.gz +/pytorch-v2.4.1.tar.gz +/pytorch-v2.5.0.tar.gz +/pytorch-v2.5.1.tar.gz +/pytorch-v2.7.0.tar.gz +/v2.13.6.tar.gz +/pytorch-a1cb3cc.tar.gz +/v24.12.23.tar.gz +/kineto-5e75018.tar.gz +/pytorch-v2.8.0.tar.gz +/v1.18.0.tar.gz +/pytorch-715dca6.tar.gz +/pytorch-fd36458.tar.gz +/pytorch-0fabc3b.tar.gz +/pytorch-v2.9.0.tar.gz +/pytorch-v2.9.1.tar.gz diff --git a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch deleted file mode 100644 index 413c60d..0000000 --- a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 20 Jul 2024 05:37:15 -0600 -Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM - -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - cmake/Dependencies.cmake | 3 ++- - 2 files changed, 3 insertions(+), 1 deletion(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index c4cd4b2c2a98..2068f7c6c4f2 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF - "USE_CUDNN" OFF) - cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) - option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) -+option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) - option(USE_KINETO "Use Kineto profiling library" ON) - option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) - option(USE_FAKELOWP "Use FakeLowp operators" OFF) -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..192dac46f13b 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -706,6 +706,7 @@ endif() - - # ---[ FBGEMM - if(USE_FBGEMM) -+ if (NOT USE_SYSTEM_FBGEMM) - set(CAFFE2_THIRD_PARTY_ROOT 
"${PROJECT_SOURCE_DIR}/third_party") - if(NOT DEFINED FBGEMM_SOURCE_DIR) - set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") -@@ -746,7 +747,7 @@ if(USE_FBGEMM) - target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) - endif() - endif() -- -+ endif() - if(USE_FBGEMM) - list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) - endif() --- -2.45.1 - diff --git a/0001-Add-cmake-variable-USE_ROCM_CK.patch b/0001-Add-cmake-variable-USE_ROCM_CK.patch new file mode 100644 index 0000000..925e03b --- /dev/null +++ b/0001-Add-cmake-variable-USE_ROCM_CK.patch @@ -0,0 +1,202 @@ +From 193854993cd939de186de19589c1add4c4b2cf66 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Mon, 21 Jul 2025 11:35:03 -0700 +Subject: [PATCH] Add cmake variable USE_ROCM_CK + +--- + CMakeLists.txt | 1 + + aten/src/ATen/CMakeLists.txt | 40 ++++++++++++++++----------------- + aten/src/ATen/cuda/CUDABlas.cpp | 22 +++++++++--------- + cmake/Dependencies.cmake | 3 +++ + 4 files changed, 35 insertions(+), 31 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index a5d25e6afa0f..afc1b53efa64 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -240,6 +240,7 @@ cmake_dependent_option( + BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON + "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) + cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) ++cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) + option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) + cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) + cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index c9cfd74b501e..59f6178218ee 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -373,26 +373,26 @@ if(USE_ROCM) + # is header only, so this should be ok, except that the CMake build 
generates + # a ck/config.h. We just do that part here. Without this, the ck.h from the + # ROCM SDK may get accidentally used instead. +- function(_pytorch_rocm_generate_ck_conf) +- set(CK_ENABLE_INT8 "ON") +- set(CK_ENABLE_FP16 "ON") +- set(CK_ENABLE_FP32 "ON") +- set(CK_ENABLE_FP64 "ON") +- set(CK_ENABLE_BF16 "ON") +- set(CK_ENABLE_FP8 "ON") +- set(CK_ENABLE_BF8 "ON") +- set(CK_USE_XDL "ON") +- set(CK_USE_WMMA "ON") +- configure_file( +- "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" +- "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" +- ) +- endfunction() ++# function(_pytorch_rocm_generate_ck_conf) ++# set(CK_ENABLE_INT8 "ON") ++# set(CK_ENABLE_FP16 "ON") ++# set(CK_ENABLE_FP32 "ON") ++# set(CK_ENABLE_FP64 "ON") ++# set(CK_ENABLE_BF16 "ON") ++# set(CK_ENABLE_FP8 "ON") ++# set(CK_ENABLE_BF8 "ON") ++# set(CK_USE_XDL "ON") ++# set(CK_USE_WMMA "ON") ++# configure_file( ++# "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" ++# "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" ++# ) ++# endfunction() + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) +- _pytorch_rocm_generate_ck_conf() ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) ++# _pytorch_rocm_generate_ck_conf() + + # Next two lines are needed because TunableOp uses third-party/fmt + list(APPEND ATen_HIP_INCLUDE $) +@@ -409,7 +409,7 @@ endif() + ${native_quantized_hip_hip} 
+ ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels ++ if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index 89350a11bea7..e5b7960177cf 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -752,7 +752,7 @@ template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support double gemm yet + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); + #else +@@ -836,7 +836,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +@@ -1270,14 +1270,14 @@ template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support double gemm yet + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); + #else + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); + #endif + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); + } +@@ -1293,7 +1293,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + if (at::globalContext().blasPreferredBackend() 
== BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); +@@ -1311,7 +1311,7 @@ template <> + void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + #else +@@ -1327,7 +1327,7 @@ template <> + void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + #else +@@ -1345,7 +1345,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); + } +@@ -1361,7 +1361,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +@@ -1382,7 +1382,7 @@ void 
gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } +@@ -1398,7 +1398,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::B + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index a93386c27f8d..be1368999d38 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1031,6 +1031,9 @@ if(USE_ROCM) + if(HIPBLASLT_VEC_EXT) + list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) + endif() ++ if(USE_ROCM_CK) ++ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) ++ endif() + list(APPEND HIP_HIPCC_FLAGS --offload-compress) + if(WIN32) + add_definitions(-DROCM_ON_WINDOWS) +-- +2.49.0 + diff --git a/0001-Changes-to-compile-with-3.13-126033.patch b/0001-Changes-to-compile-with-3.13-126033.patch deleted file mode 100644 index ddc0dcf..0000000 --- a/0001-Changes-to-compile-with-3.13-126033.patch +++ /dev/null @@ -1,222 +0,0 @@ -From 655a06444b261cb28e71a0973c0ab67aaa8261ab Mon Sep 17 00:00:00 2001 -From: albanD -Date: Tue, 14 May 2024 02:14:53 +0000 -Subject: [PATCH] Changes to compile with 3.13 (#126033) - -This is mainly: -- Fix refcount access macro -- Hide all the Dynamo code that needs update as usual -- Add _PyWeakref_ClearRef as 
an extern provided by CPython. Including the pycore header that defines it would require raw c include shenanigans that I don't think are worth it. -This allows to build both with regular and nogil version of cpython. Both - -Note that this requires the 3.13 branch at least past [d3094744d40de2deefbda9b1996d5029c9ebf0b0](https://github.com/python/cpython/commit/d3094744d40de2deefbda9b1996d5029c9ebf0b0) which we need for mimalloc include and weakref function being exposed. - -debug-only issues in pybind11 with PyMem_MALLOC vs PyObject_MALLOC being should be synced either by updating pybind or cpython. @colesbury I can send a PR to ifdef the proper use in pybind if you think that this is the best solution here? - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/126033 -Approved by: https://github.com/colesbury ---- - torch/csrc/Storage.cpp | 2 +- - torch/csrc/autograd/python_variable.cpp | 2 +- - torch/csrc/dynamo/cpython_defs.c | 15 +++++- - torch/csrc/dynamo/cpython_defs.h | 2 + - torch/csrc/dynamo/eval_frame.c | 67 ++++++++++++++++++------- - torch/csrc/utils/python_compat.h | 4 ++ - 6 files changed, 70 insertions(+), 22 deletions(-) - -diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp -index 93dbc9c09bb2..b22bbac35981 100644 ---- a/torch/csrc/Storage.cpp -+++ b/torch/csrc/Storage.cpp -@@ -236,7 +236,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { - if (type->tp_del) { - PyObject_GC_Track(self); - type->tp_del(self); -- if (self->ob_refcnt > 0) { -+ if (Py_REFCNT(self) > 0) { - // Resurrected (see above comment about resurrection from `__del__`) - return; - } -diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp -index 9e85f0026b35..8fd1129da63c 100644 ---- a/torch/csrc/autograd/python_variable.cpp -+++ b/torch/csrc/autograd/python_variable.cpp -@@ -1910,7 +1910,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { - if (type->tp_del) { - PyObject_GC_Track(self); - 
type->tp_del(self); -- if (self->ob_refcnt > 0) { -+ if (Py_REFCNT(self) > 0) { - /* Resurrected */ - return; - } -diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c -index 4a1dba63009a..5e0945a052ae 100644 ---- a/torch/csrc/dynamo/cpython_defs.c -+++ b/torch/csrc/dynamo/cpython_defs.c -@@ -13,6 +13,17 @@ - } else { \ - } - -+#if IS_PYTHON_3_13_PLUS -+// Gave up after fixing a few of these -+// pycore_opcode.h is gone (new is pycore_opcode_metadata.h ?) -+// f_code is gone (new is f_executable?) -+ -+// Fake definitions for what we removed -+const uint8_t* THP_PyOpcode_Caches = NULL; -+const int THP_PyOpcode_Caches_size = 0; -+ -+#else -+ - // NOTE: all `assert`s below are converted to `CHECK`s - - #if IS_PYTHON_3_11_PLUS -@@ -29,8 +40,8 @@ - #define NEED_OPCODE_TABLES // To get _PyOpcode_Deopt - #include - #undef NEED_OPCODE_TABLES --#undef Py_BUILD_CORE - #include -+#undef Py_BUILD_CORE - - // As a simple way to reduce the impact of ABI changes on the CPython side, this check forces - // us to manually re-check that the function didn't change on the next major version -@@ -364,3 +375,5 @@ THP_PyFrame_Clear(_PyInterpreterFrame *frame) - } - - #endif -+ -+#endif // CPython 3.13 -\ No newline at end of file -diff --git a/torch/csrc/dynamo/cpython_defs.h b/torch/csrc/dynamo/cpython_defs.h -index a897c3e6c6e7..3b6c9667f8c9 100644 ---- a/torch/csrc/dynamo/cpython_defs.h -+++ b/torch/csrc/dynamo/cpython_defs.h -@@ -8,7 +8,9 @@ - - #if IS_PYTHON_3_11_PLUS - -+#define Py_BUILD_CORE - #include -+#undef Py_BUILD_CORE - - int THP_PyFrame_FastToLocalsWithError( - _PyInterpreterFrame* frame, -diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c -index c286e821f09d..e13cb5af2a0e 100644 ---- a/torch/csrc/dynamo/eval_frame.c -+++ b/torch/csrc/dynamo/eval_frame.c -@@ -8,6 +8,31 @@ - #include - #include - -+ -+ -+PyObject* guard_error_hook = NULL; -+const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; -+ -+static 
int active_dynamo_threads = 0; -+ -+static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; -+ -+inline static PyObject* eval_frame_callback_get(void) { -+ void* result = PyThread_tss_get(&eval_frame_callback_key); -+ if (unlikely(result == NULL)) { -+ return (PyObject*)Py_None; -+ } else { -+ return (PyObject*)result; -+ } -+} -+ -+inline static void eval_frame_callback_set(PyObject* obj) { -+ PyThread_tss_set(&eval_frame_callback_key, obj); -+} -+ -+// 3.13 Not supported at all. See cpython_defs.c for hints -+#if !(IS_PYTHON_3_13_PLUS) -+ - // Problem in CPython includes when mixing core and non-core build - // The fix was not backported to 3.12 so this is needed here - // https://github.com/python/cpython/issues/105268 -@@ -138,24 +163,6 @@ THP_PyFrame_FastToLocalsWithError(THP_EVAL_API_FRAME_OBJECT *frame, int *free_va - } - #endif - --PyObject* guard_error_hook = NULL; --const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; -- --static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; -- --inline static PyObject* eval_frame_callback_get(void) { -- void* result = PyThread_tss_get(&eval_frame_callback_key); -- if (unlikely(result == NULL)) { -- return (PyObject*)Py_None; -- } else { -- return (PyObject*)result; -- } --} -- --inline static void eval_frame_callback_set(PyObject* obj) { -- PyThread_tss_set(&eval_frame_callback_key, obj); --} -- - static PyObject* _custom_eval_frame_shim( - PyThreadState* tstate, - THP_EVAL_API_FRAME_OBJECT* frame, -@@ -627,7 +634,29 @@ static PyObject* _custom_eval_frame( - } - } - --static int active_dynamo_threads = 0; -+#else // IS_PYTHON_3_13_PLUS -+ -+// Fake definitions for everything we removed -+ -+typedef struct THPPyInterpreterFrame { -+ PyObject_HEAD -+ _PyInterpreterFrame* frame; // Borrowed reference -+} THPPyInterpreterFrame; -+ -+inline static void enable_eval_frame_shim(PyThreadState* tstate) {} -+inline static void enable_eval_frame_default(PyThreadState* tstate) {} -+ -+static struct 
PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; -+ -+static PyTypeObject THPPyInterpreterFrameType = { -+ PyVarObject_HEAD_INIT(NULL, 0) -+ .tp_name = "torch._C.dynamo.eval_frame._PyInterpreterFrame", -+ .tp_basicsize = sizeof(THPPyInterpreterFrame), -+ .tp_flags = Py_TPFLAGS_DEFAULT, -+ .tp_getset = THPPyInterpreterFrame_properties, -+}; -+ -+#endif // CPython 3.13 - - static PyObject* increment_working_threads(PyThreadState* tstate) { - active_dynamo_threads = active_dynamo_threads + 1; -diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h -index 73b991cf3fbf..b060db00db73 100644 ---- a/torch/csrc/utils/python_compat.h -+++ b/torch/csrc/utils/python_compat.h -@@ -11,6 +11,7 @@ extern "C" { - - #define IS_PYTHON_3_11_PLUS PY_VERSION_HEX >= 0x030B00C1 - #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 -+#define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 - - PYCAPI_COMPAT_STATIC_INLINE(int) - PyCode_GetNCellvars(PyCodeObject* code) { -@@ -32,6 +33,9 @@ PyCode_GetNFreevars(PyCodeObject* code) { - #endif - } - -+// Provided by CPython but getting the header for them is very hard -+extern void _PyWeakref_ClearRef(PyWeakReference* self); -+ - #ifdef __cplusplus - } - #endif --- -2.45.1 - diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch deleted file mode 100644 index 562f55b..0000000 --- a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch +++ /dev/null @@ -1,910 +0,0 @@ -From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 -From: Xu Han -Date: Sun, 31 Mar 2024 03:07:32 +0000 -Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] - (#118980) - -Enable VEC on Windows OS. -1. Fix some type defination gap between Windows and Linux. -2. Fix some operator not support on Windows, such as [], /. -3. Enable static sleef library build on Windows. -4. 
Disable unsupported function overloading on MSVC. -5. Upgrade submodule sleef lib, which fixed build issue on Windows. -6. Fixed bazel build issues. -7. Fix test app not link to sleef on Windows. - -Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: -```cmd -git submodule sync -git submodule update --init --recursive -``` - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 -Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet ---- - aten/src/ATen/CMakeLists.txt | 48 ++++++-------- - aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- - .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- - .../cpu/vec/vec256/vec256_complex_double.h | 7 +- - .../cpu/vec/vec256/vec256_complex_float.h | 7 +- - aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- - aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- - aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- - aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- - .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- - .../cpu/vec/vec512/vec512_complex_double.h | 7 +- - .../cpu/vec/vec512/vec512_complex_float.h | 7 +- - aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- - aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- - aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- - aten/src/ATen/cpu/vec/vec_base.h | 6 ++ - caffe2/CMakeLists.txt | 2 +- - third_party/sleef.BUILD | 3 +- - 18 files changed, 194 insertions(+), 93 deletions(-) - -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index bf425af5fa9..58d5828e8ca 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") - list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) - endif() - --if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -- # Preserve values for the main build -- set(__aten_sleef_build_shared_libs 
${BUILD_SHARED_LIBS}) -- set(__aten_sleef_build_tests ${BUILD_TESTS}) -- -- # Unset our restrictive C++ flags here and reset them later. -- # Remove this once we use proper target_compile_options. -- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -- set(CMAKE_CXX_FLAGS) -- -- # Bump up optimization level for sleef to -O1, since at -O0 the compiler -- # excessively spills intermediate vector registers to the stack -- # and makes things run impossibly slowly -- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -- else() -- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -+if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+ if(NOT MSVC) -+ # Bump up optimization level for sleef to -O1, since at -O0 the compiler -+ # excessively spills intermediate vector registers to the stack -+ # and makes things run impossibly slowly -+ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -+ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -+ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+ else() -+ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -+ endif() - endif() - - if(NOT USE_SYSTEM_SLEEF) -- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) -+ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -+ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -+ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -+ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -+ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." 
FORCE) - if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") - set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) -@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) - endif() - list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) - -- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) -- -- # Set these back. TODO: Use SLEEF_ to pass these instead -- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) -- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) -+ if(NOT MSVC) -+ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+ endif() - endif() - - if(USE_CUDA AND NOT USE_ROCM) -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h -index 800b027e469..c431fa3c605 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h -@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { - } - - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline gather(const double* base_addr, const Vectorized& vindex) { -@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { - return _mm256_i32gather_ps(base_addr, vindex, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. 
- template - std::enable_if_t> - inline mask_gather(const Vectorized& src, const double* base_addr, -@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, - const Vectorized& vindex, Vectorized& mask) { - return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - // Only works for inputs in the range: [-2^51, 2^51] -@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { - return flip8(v); - } - --#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // (defined(CPU_CAPABILITY_AVX2) - - }} // namepsace at::vec::CPU_CAPABILITY -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -index 3e26213d6d2..66557436c70 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -@@ -7,7 +7,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -18,7 +19,18 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+ -+#ifndef SLEEF_CONST -+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -+#define SLEEF_CONST const -+#else -+#define SLEEF_CONST -+#endif -+#define SLEEF_CONST_OLD SLEEF_CONST -+#else -+#define SLEEF_CONST_OLD -+#endif - - // bfloat16 conversion - static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { -@@ -265,7 +277,8 @@ public: - } - return b; - } -- Vectorized map(const __m256 (*const vop)(__m256)) const { -+ -+ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { - __m256 lo, hi; - cvt_to_fp32(values, lo, hi); - const auto o1 = vop(lo); -@@ -1026,7 +1039,7 @@ inline 
Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_VECTORIZED_INIT(Half, half); - --#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX2) - - #define CONVERT_NON_VECTORIZED_INIT(type, name) \ - inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_NON_VECTORIZED_INIT(Half, half); - --#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX2) - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - #define LOAD_FP32_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - auto values = _mm_loadu_si128(reinterpret_cast(data)); \ -@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec - LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); - LOAD_FP32_VECTORIZED_INIT(Half, fp16); - --#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX2) - #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - __at_align__ float values[Vectorized::size()]; \ -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -index f93ea1e63c3..6c198fb37d3 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -@@ -8,7 +8,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY 
{ - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized> { - private: -@@ -145,7 +146,7 @@ public: - auto abs = abs_(); - auto zero = _mm256_setzero_pd(); - auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm256_div_pd(values, abs); - return _mm256_blendv_pd(div, zero, mask); - } - __m256d real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -index 7c142c04b79..c72d4d49274 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized> { - private: -@@ -180,7 +181,7 @@ public: - auto abs = abs_(); - auto zero = _mm256_setzero_ps(); - auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm256_div_ps(values, abs); - return _mm256_blendv_ps(div, zero, mask); - } - __m256 real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -index bc82d07edd1..bed6da627af 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace at::vec { - inline namespace CPU_CAPABILITY { - - --#if defined(CPU_CAPABILITY_AVX2) && 
!defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized { - private: -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -index 886809a0b8a..0e3664cd37b 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -14,7 +15,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized { - private: -@@ -226,14 +227,14 @@ public: - static __m256 vec_factorial_5 = - _mm256_set1_ps(0.00828929059f); // 1/factorial(5) - static __m256 vec_exp_log2ef = -- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) -+ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) - static __m256 vec_half = _mm256_set1_ps(0.5f); - static __m256 vec_one = _mm256_set1_ps(1.f); - static __m256 vec_zero = _mm256_set1_ps(0.f); - static __m256 vec_two = _mm256_set1_ps(2.f); -- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) -- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50); -- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); -+ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) -+ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); -+ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); - static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); - static int n_mantissa_bits = 23; - -@@ -266,7 +267,7 @@ public: - auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); - auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); - 
vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; -+ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); - vec_two_pow_n = - _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); - -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -index 4128841701a..85e099904cd 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -@@ -41,11 +41,17 @@ - namespace at::vec { - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - -+#ifdef _MSC_VER -+__declspec(align(64)) struct Vectorizedqi { -+ protected: -+ __m256i vals; -+#else - struct Vectorizedqi { - protected: - __m256i vals __attribute__((aligned(64))); -+#endif - - public: - Vectorizedqi() {} -@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { - } - - template --inline void __attribute__((always_inline)) QuantizeAvx2( -+__FORCE_INLINE void QuantizeAvx2( - const float* src, - T* dst, - int len, -@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V - return a.maximum(b); - } - --#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // if defined(CPU_CAPABILITY_AVX2) - }} // namespace at::vec::CPU_CAPABILITY -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h -index fe96d123e64..87f723d782c 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h -@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { - } - - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) - } - - // 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline gather(const double* base_addr, const Vectorized& vindex) { -@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { - return _mm512_i32gather_ps(vindex, base_addr, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline mask_gather(const Vectorized& src, const double* base_addr, -@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, - auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); - return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - template<> -@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { - return flip8(v); - } - --#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX512) - - }}} -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -index f9fc92d52bf..eb3b6a72240 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -@@ -7,7 +7,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,18 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+ -+#ifndef SLEEF_CONST -+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -+#define SLEEF_CONST const -+#else -+#define 
SLEEF_CONST -+#endif -+#define SLEEF_CONST_OLD SLEEF_CONST -+#else -+#define SLEEF_CONST_OLD -+#endif - - // bfloat16 conversion - static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { -@@ -362,7 +374,8 @@ public: - } - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wignored-qualifiers" -- Vectorized map(const __m512 (*const vop)(__m512)) const { -+ -+ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { - __m512 lo, hi; - cvt_to_fp32(values, lo, hi); - const auto o1 = vop(lo); -@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_VECTORIZED_INIT(Half, half); - --#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#else //defined(CPU_CAPABILITY_AVX512) - - #define CONVERT_NON_VECTORIZED_INIT(type, name) \ - inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_NON_VECTORIZED_INIT(Half, half); - --#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX512) - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - #define LOAD_FP32_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ -@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec - LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); - LOAD_FP32_VECTORIZED_INIT(Half, fp16); - --#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX512) - #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - __at_align__ float values[Vectorized::size()]; 
\ -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -index 02aa3a87cc1..c35204f9da2 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized> { - private: -@@ -203,7 +204,7 @@ public: - auto abs = abs_(); - auto zero = _mm512_setzero_pd(); - auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm512_div_pd(values, abs); - return _mm512_mask_blend_pd(mask, div, zero); - } - __m512d real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -index a5d790c98b2..2801e484d94 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized> { - private: -@@ -708,7 +709,7 @@ public: - auto abs = abs_(); - auto zero = _mm512_setzero_ps(); - auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm512_div_ps(values, abs); - return 
_mm512_mask_blend_ps(mask, div, zero); - } - __m512 real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -index 27b2753c903..508ab257e60 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) -+#if (defined(CPU_CAPABILITY_AVX512)) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized { - private: -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -index ba5738687fd..a08df3c141a 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized { - private: -@@ -246,14 +247,14 @@ public: - static __m512 vec_factorial_5 = - _mm512_set1_ps(0.00828929059f); // 1/factorial(5) - static __m512 vec_exp_log2ef = -- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) -+ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) - static __m512 vec_half = _mm512_set1_ps(0.5f); - static __m512 vec_one = _mm512_set1_ps(1.f); - static __m512 vec_zero = _mm512_set1_ps(0.f); - static __m512 vec_two = _mm512_set1_ps(2.f); -- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // 
ln(2) -- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); -- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); -+ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) -+ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); -+ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); - static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); - static int n_mantissa_bits = 23; - -@@ -288,7 +289,7 @@ public: - auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); - auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); - vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; -+ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); - vec_two_pow_n = - _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); - -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -index e0713d01312..a5671ed4a50 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -@@ -42,11 +42,17 @@ namespace at { - namespace vec { - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - -+#ifdef _MSC_VER -+__declspec(align(64)) struct Vectorizedqi { -+ protected: -+ __m512i vals; -+#else - struct Vectorizedqi { - protected: - __m512i vals __attribute__((aligned(64))); -+#endif - - public: - Vectorizedqi() {} -@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { - } - - template --inline void __attribute__((always_inline)) QuantizeAvx512( -+__FORCE_INLINE void QuantizeAvx512( - const float* src, - T* dst, - int len, -@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { - Vectorized scale, - Vectorized zero_point, - Vectorized scale_neg_zp_premul) const { -+ #if 
defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { - float_vec_return_type dequantize( - Vectorized scale, - Vectorized zero_point) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { - } - - int_vec_return_type widening_subtract(Vectorized b) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], 
vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512i int32_val0 = cvtepi8_epi32(int_val0); - __m512i int32_val1 = cvtepi8_epi32(int_val1); - __m512i int32_val2 = cvtepi8_epi32(int_val2); - __m512i int32_val3 = cvtepi8_epi32(int_val3); - -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -+ #else - __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); - __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); - __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); - __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -+ #endif - - __m512i int32_b0 = cvtepi8_epi32(int_b0); - __m512i int32_b1 = cvtepi8_epi32(int_b1); -@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { - Vectorized scale, - Vectorized zero_point, - Vectorized scale_zp_premul) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = 
_mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -@@ -746,10 +787,17 @@ struct Vectorized : public Vectorizedqi { - float_vec_return_type dequantize( - Vectorized scale, - Vectorized zero_point) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { - } - - int_vec_return_type widening_subtract(Vectorized b) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512i int32_val0 = cvtepu8_epi32(int_val0); - __m512i int32_val1 = cvtepu8_epi32(int_val1); - __m512i int32_val2 = cvtepu8_epi32(int_val2); - __m512i int32_val3 = cvtepu8_epi32(int_val3); - -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_b0 = 
_mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -+ #else - __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); - __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); - __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); - __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -+ #endif - - __m512i int32_b0 = cvtepu8_epi32(int_b0); - __m512i int32_b1 = cvtepu8_epi32(int_b1); -diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h -index adf81dd915c..20cb8ef6dbc 100644 ---- a/aten/src/ATen/cpu/vec/vec_base.h -+++ b/aten/src/ATen/cpu/vec/vec_base.h -@@ -36,6 +36,12 @@ - #include - #include - -+#if defined(__GNUC__) -+#define __FORCE_INLINE __attribute__((always_inline)) inline -+#elif defined(_MSC_VER) -+#define __FORCE_INLINE __forceinline -+#endif -+ - // These macros helped us unify vec_base.h - #ifdef CPU_CAPABILITY_AVX512 - #if defined(__GNUC__) -diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index a6b6f0f7d1d..15d37cf4861 100644 ---- a/caffe2/CMakeLists.txt -+++ b/caffe2/CMakeLists.txt -@@ -1787,7 +1787,7 @@ if(BUILD_TEST) - endif() - else() - add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") -- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) -+ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) - endif() - target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) - target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) -diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD -index 573f9c5b54a..f22a6e905e2 100644 ---- a/third_party/sleef.BUILD -+++ b/third_party/sleef.BUILD -@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ - SLEEF_PRIVATE_INCLUDES = [ - 
"-Iexternal/sleef/src/arch", - "-Iexternal/sleef/src/common", -+ "-Iexternal/sleef/src/libm", - ] - - SLEEF_PUBLIC_INCLUDES = [ -@@ -201,8 +202,6 @@ cc_library( - srcs = [ - "src/libm/rempitab.c", - "src/libm/sleefdp.c", -- "src/libm/sleefld.c", -- "src/libm/sleefqp.c", - "src/libm/sleefsp.c", - ], - hdrs = SLEEF_PUBLIC_HEADERS, --- -2.45.1 - diff --git a/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch b/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch new file mode 100644 index 0000000..b6a282c --- /dev/null +++ b/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch @@ -0,0 +1,359 @@ +From f2a544b2e3a5bdc04985f6e06223c0c1700120a0 Mon Sep 17 00:00:00 2001 +From: albanD +Date: Sat, 12 Jul 2025 03:42:33 -0400 +Subject: [PATCH] Fix compilation and "import torch" issues for cpython 3.14 + +Imported from +https://github.com/albanD/pytorch/tree/cpython314_build +commit 88bb9cdb72449f4277829e20d94ad8aec1894216 + +Signed-off-by: Tom Rix +--- + torch/_dynamo/bytecode_analysis.py | 2 +- + torch/ao/quantization/__init__.py | 5 +++- + torch/ao/quantization/qconfig.py | 4 ++- + torch/ao/quantization/utils.py | 7 +++-- + torch/csrc/dynamo/cpython_defs.c | 16 +++++++++++ + torch/csrc/dynamo/cpython_includes.h | 17 ++++++++++++ + torch/csrc/dynamo/eval_frame.c | 34 +++++++++++++++-------- + torch/csrc/dynamo/framelocals_mapping.cpp | 14 ++++++++++ + torch/csrc/utils/python_compat.h | 1 + + torch/onnx/__init__.py | 1 - + torch/utils/weak.py | 29 +++++++++++++++++-- + 11 files changed, 111 insertions(+), 19 deletions(-) + +diff --git a/torch/_dynamo/bytecode_analysis.py b/torch/_dynamo/bytecode_analysis.py +index 3252ea91409f..2de74ee5bf8d 100644 +--- a/torch/_dynamo/bytecode_analysis.py ++++ b/torch/_dynamo/bytecode_analysis.py +@@ -33,7 +33,7 @@ if sys.version_info >= (3, 11): + TERMINAL_OPCODES.add(dis.opmap["JUMP_FORWARD"]) + else: + TERMINAL_OPCODES.add(dis.opmap["JUMP_ABSOLUTE"]) +-if sys.version_info >= (3, 12): ++if (3, 12) <= 
sys.version_info < (3, 14): + TERMINAL_OPCODES.add(dis.opmap["RETURN_CONST"]) + if sys.version_info >= (3, 13): + TERMINAL_OPCODES.add(dis.opmap["JUMP_BACKWARD_NO_INTERRUPT"]) +diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py +index ffc1792fd23f..cf5a8b99a894 100644 +--- a/torch/ao/quantization/__init__.py ++++ b/torch/ao/quantization/__init__.py +@@ -1,5 +1,6 @@ + # mypy: allow-untyped-defs + ++import sys + from typing import Callable, Optional, Union + + import torch +@@ -33,7 +34,9 @@ from .stubs import * # noqa: F403 + + # ensure __module__ is set correctly for public APIs + ObserverOrFakeQuantize = Union[ObserverBase, FakeQuantizeBase] +-ObserverOrFakeQuantize.__module__ = "torch.ao.quantization" ++if sys.version_info < (3, 14): ++ ObserverOrFakeQuantize.__module__ = "torch.ao.quantization" ++ + for _f in [ + compare_results, + extract_results_from_loggers, +diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py +index efee5302ad42..d9a8fc78bab4 100644 +--- a/torch/ao/quantization/qconfig.py ++++ b/torch/ao/quantization/qconfig.py +@@ -1,5 +1,6 @@ + # mypy: allow-untyped-defs + import copy ++import sys + import warnings + from collections import namedtuple + from typing import Any, Optional, Union +@@ -568,7 +569,8 @@ def _assert_valid_qconfig(qconfig: Optional[QConfig], mod: torch.nn.Module) -> N + + + QConfigAny = Optional[QConfig] +-QConfigAny.__module__ = "torch.ao.quantization.qconfig" ++if sys.version_info < (3, 14): ++ QConfigAny.__module__ = "torch.ao.quantization.qconfig" + + + def _add_module_to_qconfig_obs_ctr( +diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py +index 4ac3112ec072..3b1503e01701 100644 +--- a/torch/ao/quantization/utils.py ++++ b/torch/ao/quantization/utils.py +@@ -4,6 +4,7 @@ Utils shared by different modes of quantization (eager/graph) + """ + + import functools ++import sys + import warnings + from collections import OrderedDict + from inspect 
import getfullargspec, signature +@@ -16,7 +17,8 @@ from torch.nn.utils.parametrize import is_parametrized + + + NodePattern = Union[tuple[Node, Node], tuple[Node, tuple[Node, Node]], Any] +-NodePattern.__module__ = "torch.ao.quantization.utils" ++if sys.version_info < (3, 14): ++ NodePattern.__module__ = "torch.ao.quantization.utils" + + # This is the Quantizer class instance from torch/quantization/fx/quantize.py. + # Define separately to prevent circular imports. +@@ -31,7 +33,8 @@ QuantizerCls = Any + Pattern = Union[ + Callable, tuple[Callable, Callable], tuple[Callable, tuple[Callable, Callable]], Any + ] +-Pattern.__module__ = "torch.ao.quantization.utils" ++if sys.version_info < (3, 14): ++ Pattern.__module__ = "torch.ao.quantization.utils" + + + # TODO: maybe rename this to MatchInputNode +diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c +index b68ef894aeaa..244d4165d5e8 100644 +--- a/torch/csrc/dynamo/cpython_defs.c ++++ b/torch/csrc/dynamo/cpython_defs.c +@@ -2,6 +2,20 @@ + #include + #include + ++#if IS_PYTHON_3_14_PLUS ++ ++const uint8_t* THP_PyOpcode_Caches = NULL; ++const int THP_PyOpcode_Caches_size = 0; ++ ++void ++THP_PyThreadState_PopFrame(PyThreadState *tstate, _PyInterpreterFrame * frame) ++{} ++void ++THP_PyFrame_Clear(_PyInterpreterFrame *frame) ++{} ++ ++#else ++ + #if IS_PYTHON_3_11_PLUS + + #define Py_BUILD_CORE +@@ -360,3 +374,5 @@ const uint8_t* THP_PyOpcode_Caches = NULL; + const int THP_PyOpcode_Caches_size = 0; + + #endif ++ ++#endif // IS_PYTHON_3_14_PLUS +\ No newline at end of file +diff --git a/torch/csrc/dynamo/cpython_includes.h b/torch/csrc/dynamo/cpython_includes.h +index 6b99c1d5aec8..616be16563cf 100644 +--- a/torch/csrc/dynamo/cpython_includes.h ++++ b/torch/csrc/dynamo/cpython_includes.h +@@ -21,6 +21,14 @@ + + #if IS_PYTHON_3_11_PLUS + #include ++#if IS_PYTHON_3_14_PLUS ++#include ++#include ++#endif ++#endif ++ ++#if IS_PYTHON_3_14_PLUS ++#include + #endif + + #undef Py_BUILD_CORE +@@ 
-30,6 +38,13 @@ + extern "C" { + #endif + ++#if IS_PYTHON_3_14_PLUS ++ ++#define F_CODE(x) (PyCodeObject*)PyStackRef_AsPyObjectBorrow(x->f_executable) ++#define PREV_INSTR(x) (x)->instr_ptr ++ ++#else ++ + #if IS_PYTHON_3_13_PLUS + #define F_CODE(x) ((PyCodeObject*)(x)->f_executable) + #define PREV_INSTR(x) (x)->instr_ptr +@@ -38,6 +53,8 @@ extern "C" { + #define PREV_INSTR(x) (x)->prev_instr + #endif + ++#endif // IS_PYTHON_3_14_PLUS ++ + #if IS_PYTHON_3_12_PLUS + #define FUNC(x) ((x)->f_funcobj) + #else +diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c +index f413782b2d30..72bb8839bac3 100644 +--- a/torch/csrc/dynamo/eval_frame.c ++++ b/torch/csrc/dynamo/eval_frame.c +@@ -224,17 +224,6 @@ const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) { + return PyUnicode_AsUTF8(F_CODE(frame)->co_name); + } + +-void clear_old_frame_if_python_312_plus( +- PyThreadState* tstate, +- THP_EVAL_API_FRAME_OBJECT* frame) { +-#if IS_PYTHON_3_12_PLUS +- +- THP_PyFrame_Clear(frame); +- THP_PyThreadState_PopFrame(tstate, frame); +- +-#endif +-} +- + static PyObject* dynamo_eval_custom_code_impl( + PyThreadState* tstate, + THP_EVAL_API_FRAME_OBJECT* frame, +@@ -485,6 +474,18 @@ static PyObject* dynamo__custom_eval_frame_shim( + + static void enable_eval_frame_shim(PyThreadState* tstate) {} + static void enable_eval_frame_default(PyThreadState* tstate) {} ++PyObject* dynamo_eval_custom_code( ++ PyThreadState* tstate, ++ THP_EVAL_API_FRAME_OBJECT* frame, ++ PyCodeObject* code, ++ const char* trace_annotation, ++ int throw_flag) {} ++THPPyInterpreterFrame* THPPyInterpreterFrame_New( ++ THP_EVAL_API_FRAME_OBJECT* frame) {} ++PyObject* dynamo_eval_frame_default( ++ PyThreadState* tstate, ++ THP_EVAL_API_FRAME_OBJECT* frame, ++ int throw_flag) {} + + static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; + +@@ -498,6 +499,17 @@ static PyTypeObject THPPyInterpreterFrameType = { + + #endif // !(IS_PYTHON_3_14_PLUS) + ++void 
clear_old_frame_if_python_312_plus( ++ PyThreadState* tstate, ++ THP_EVAL_API_FRAME_OBJECT* frame) { ++#if IS_PYTHON_3_12_PLUS ++ ++ THP_PyFrame_Clear(frame); ++ THP_PyThreadState_PopFrame(tstate, frame); ++ ++#endif ++} ++ + static PyObject* increment_working_threads( + PyThreadState* tstate, + PyObject* module) { +diff --git a/torch/csrc/dynamo/framelocals_mapping.cpp b/torch/csrc/dynamo/framelocals_mapping.cpp +index b839fb26fc91..c4ee36d87767 100644 +--- a/torch/csrc/dynamo/framelocals_mapping.cpp ++++ b/torch/csrc/dynamo/framelocals_mapping.cpp +@@ -26,9 +26,13 @@ FrameLocalsMapping::FrameLocalsMapping(FrameLocalsFrameType* frame) + PyCodeObject* co = F_CODE(frame); + _framelocals.resize(co->co_nlocalsplus, nullptr); + ++#if IS_PYTHON_3_14_PLUS ++ TORCH_CHECK(false, "Python 3.14+ not supported"); ++#else + if (!frame->stacktop) { + return; + } ++#endif + + auto update_framelocals = [&](int i, PyObject* value) { + _PyLocals_Kind kind = _PyLocals_GetKind(co->co_localspluskinds, i); +@@ -53,11 +57,21 @@ FrameLocalsMapping::FrameLocalsMapping(FrameLocalsFrameType* frame) + }; + + auto offset = co->co_nlocalsplus - co->co_nfreevars; ++#if IS_PYTHON_3_14_PLUS ++ TORCH_CHECK(false, "Python 3.14+ not supported"); ++#else + for (int i = 0; i < offset; i++) { + update_framelocals(i, frame->localsplus[i]); + } ++#endif ++ + // Get references to closure variables ++#if IS_PYTHON_3_14_PLUS ++ PyObject* closure; ++ TORCH_CHECK(false, "Python 3.14+ not supported"); ++#else + PyObject* closure = ((PyFunctionObject*)FUNC(frame))->func_closure; ++#endif + for (int i = 0; i < co->co_nfreevars; i++) { + update_framelocals(offset + i, PyTuple_GET_ITEM(closure, i)); + } +diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h +index a1537611cc47..16292e4fd030 100644 +--- a/torch/csrc/utils/python_compat.h ++++ b/torch/csrc/utils/python_compat.h +@@ -13,6 +13,7 @@ extern "C" { + #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 + #define 
IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 + #define IS_PYTHON_3_14_PLUS PY_VERSION_HEX >= 0x030E0000 ++#define IS_PYTHON_3_15_PLUS PY_VERSION_HEX >= 0x030F0000 + + static inline int PyCode_GetNCellvars(PyCodeObject* code) { + // gh-26364 added co_ncellvars to Python 3.11.0rc1 +diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py +index 345ffd2a065b..ceeadde5365b 100644 +--- a/torch/onnx/__init__.py ++++ b/torch/onnx/__init__.py +@@ -104,7 +104,6 @@ ONNXProgram.__module__ = "torch.onnx" + OnnxExporterError.__module__ = "torch.onnx" + _OrtBackend.__module__ = "torch.onnx" + _OrtBackendOptions.__module__ = "torch.onnx" +-_OrtExecutionProvider.__module__ = "torch.onnx" + enable_fake_mode.__module__ = "torch.onnx" + is_onnxrt_backend_supported.__module__ = "torch.onnx" + +diff --git a/torch/utils/weak.py b/torch/utils/weak.py +index 8bf2ba5ed02b..9c7218cb2ad3 100644 +--- a/torch/utils/weak.py ++++ b/torch/utils/weak.py +@@ -3,8 +3,6 @@ from __future__ import annotations + + import collections.abc as _collections_abc + import weakref +- +-from _weakrefset import _IterationGuard # type: ignore[attr-defined] + from collections.abc import Mapping, MutableMapping + from weakref import ref + +@@ -22,6 +20,33 @@ __all__ = [ + ] + + ++# TODO: make weakref properly thread safe following ++# https://github.com/python/cpython/pull/125325 ++class _IterationGuard: ++ # This context manager registers itself in the current iterators of the ++ # weak container, such as to delay all removals until the context manager ++ # exits. ++ # This technique should be relatively thread-safe (since sets are). 
++ ++ def __init__(self, weakcontainer): ++ # Don't create cycles ++ self.weakcontainer = ref(weakcontainer) ++ ++ def __enter__(self): ++ w = self.weakcontainer() ++ if w is not None: ++ w._iterating.add(self) ++ return self ++ ++ def __exit__(self, e, t, b): ++ w = self.weakcontainer() ++ if w is not None: ++ s = w._iterating ++ s.remove(self) ++ if not s: ++ w._commit_removals() ++ ++ + # This file defines a variant of WeakKeyDictionary that overrides the hashing + # behavior of the key to use object identity, rather than the builtin + # __eq__/__hash__ functions. This is useful for Tensor weak keys, as their +-- +2.49.0 + diff --git a/0001-Optionally-use-hipblaslt.patch b/0001-Optionally-use-hipblaslt.patch deleted file mode 100644 index 1e5ca4b..0000000 --- a/0001-Optionally-use-hipblaslt.patch +++ /dev/null @@ -1,506 +0,0 @@ -From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 04:18:34 -0700 -Subject: [PATCH] Optionally use hipblaslt - ---- - aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ - aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ - aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- - aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- - aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- - cmake/Dependencies.cmake | 3 ++ - cmake/public/LoadHIP.cmake | 2 +- - 7 files changed, 82 insertions(+), 19 deletions(-) - -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index ce991a9bcad4..3f0d17b52778 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -14,7 +14,9 @@ - #include - - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - // until hipblas has an API to accept flags, we must use rocblas here - #include - #include -@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { - static size_t _parseChosenWorkspaceSize() { - const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); - #ifdef USE_ROCM 
-+#ifndef USE_HIPBLASLT -+ return 0; -+#endif - if (!val) { - // accept either env var - val = getenv("HIPBLASLT_WORKSPACE_SIZE"); -@@ -235,6 +240,7 @@ namespace at::cuda::blas { - } while (0) - - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - namespace { - // Following the pattern of CuSparseDescriptor - // Defined here for now because this is the only place cublas_lt interface is -@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< - }; - } // namespace - -- - template - inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - cudaDataType_t abcType = CUDA_R_32F; -@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - " scaleType ", - scaleType); - } -- -+#endif - - template - inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { -@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); - } - } -@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); - } - } -@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - 
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } - } -@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { - } - } - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { - // forward to bgemm implementation but set strides and batches to 0 - bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); - } -+#endif - - template - inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { -@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); - } - } -@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); - } - } -@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - 
gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } - } -@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { - } - } - -- -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - void gemm_and_bias( - bool transpose_mat1, -@@ -1410,7 +1435,7 @@ void scaled_gemm( - ScalarType result_dtype, - void* amax_ptr, - bool use_fast_accum) { --#if CUDA_VERSION >= 11080 || defined(USE_ROCM) -+#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - const auto computeType = CUBLAS_COMPUTE_32F; - const auto scaleType = CUDA_R_32F; - const int8_t fastAccuMode = use_fast_accum ? 1 : 0; -@@ -1681,6 +1706,7 @@ void int8_gemm( - " scaleType ", - scaleType); - } -+#endif - - template <> - void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { -diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h -index f2b657ced51b..f0ee613c4208 100644 ---- a/aten/src/ATen/cuda/CUDAContextLight.h -+++ b/aten/src/ATen/cuda/CUDAContextLight.h -@@ -9,7 +9,9 @@ - - // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also - // added bf16 support -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - #include -+#endif - - #ifdef CUDART_VERSION - #include -@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); - /* Handles */ - TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); - TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); -+#endif - - TORCH_CUDA_CPP_API void clearCublasWorkspaces(); - -diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index 8eac525b3695..abfdf7a23847 100644 ---- a/aten/src/ATen/cuda/CublasHandlePool.cpp -+++ 
b/aten/src/ATen/cuda/CublasHandlePool.cpp -@@ -29,7 +29,7 @@ namespace at::cuda { - - namespace { - --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - void createCublasLtHandle(cublasLtHandle_t *handle) { - TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); - } -@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { - return handle; - } - --cublasLtHandle_t getCurrentCUDABlasLtHandle() { - #ifdef USE_ROCM -+#if defined(USE_HIPBLASLT) -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - c10::DeviceIndex device = 0; - AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); - -@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { - - auto handle = myPoolWindow->reserve(device); - return handle; -+} -+#endif - #else -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - return reinterpret_cast(getCurrentCUDABlasHandle()); --#endif - } -+#endif - - } // namespace at::cuda -diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h -index 53e6154120c9..fa1d664696db 100644 ---- a/aten/src/ATen/cuda/tunable/TunableGemm.h -+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h -@@ -11,7 +11,9 @@ - - #include - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - #include - #endif - #include -@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class DefaultScaledGemmOp : public Callable> { - public: -@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { - return OK; - } - }; -+#endif - - template - inline bool IsZero(T v) { -@@ -191,6 +195,7 @@ static void AddRocblasValidator() { - } - } - -+#ifdef USE_HIPBLASLT - static void AddHipblasltValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - if (validators.find("HIPBLASLT_VERSION") == validators.end()) { -@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { - 
[hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); - } - } -+#endif - - static void AddRocmValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); -@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class ScaledGemmTunableOp : public TunableOp, StreamTimer> { - public: -@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - - #if defined(USE_ROCM) -+#ifdef USE_HIPBLASLT - for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { - this->RegisterOp(std::move(name), std::move(op)); - } - AddHipblasltValidator(); -+#endif - AddRocmValidator(); - #endif - } -@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - "_", BlasOpToString(ALayout), 
BlasOpToString(BLayout)); - } - }; -+#endif - - #undef XSTRINGIFY - #undef STRINGIFY -diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 84c59a4fd0d7..56ad5de3bf2d 100644 ---- a/aten/src/ATen/native/cuda/Blas.cpp -+++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa - } - - static bool getDisableAddmmCudaLt() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); - #ifdef USE_ROCM - // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) -@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { - } - return false; - #endif -+#else -+ return true; -+#endif - } - - #ifdef USE_ROCM - static bool isSupportedHipLtROCmArch(int index) { -+#ifdef USE_HIPBLASLT - hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); - std::string device_arch = prop->gcnArchName; - static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { - } - } - TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); -+#endif - return false; - } - #endif -@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - at::ScalarType scalar_type = self.scalar_type(); - c10::MaybeOwned self_; - if (&result != &self) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) - // Strangely, if mat2 has only 1 row or column, we get - // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. 
-@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - scalar_type != at::ScalarType::BFloat16)); - #endif - } -+#endif - #endif - if (!useLtInterface) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - } - self__sizes = self_->sizes(); - } else { --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - useLtInterface = !disable_addmm_cuda_lt && - result.dim() == 2 && result.is_contiguous() && - isSupportedHipLtROCmArch(self.device().index()) && -@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); - - if (useLtInterface) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if defined(USE_ROCM) - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, -@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - activation_epilogue - ); - }); -+#endif - #endif - } else - { -@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { - } - - static bool _scaled_mm_allowed_device() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - auto dprops = at::cuda::getCurrentDeviceProperties(); - #ifdef USE_ROCM - std::string device_arch = dprops->gcnArchName; -@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { - #else - return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); - #endif -+#else -+ return false; -+#endif - } - - // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax -@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - // Check sizes - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); -+#if 
!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); - TORCH_CHECK( -@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 - // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately - amax = at::max(at::abs(out.to(kFloat))); -+#endif - #endif - - return {out, amax}; -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..8d05e834bbc5 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1052,6 +1052,9 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - list(APPEND HIP_CXX_FLAGS -std=c++17) - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) -+ if(hipblast_FOUND) -+ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) -+ endif() - if(HIP_NEW_TYPE_ENUMS) - list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) - endif() -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index fa39156031ff..df4836847fdf 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -155,7 +155,7 @@ if(HIP_FOUND) - find_package_and_print_version(hiprand REQUIRED) - find_package_and_print_version(rocblas REQUIRED) - find_package_and_print_version(hipblas REQUIRED) -- find_package_and_print_version(hipblaslt REQUIRED) -+ find_package_and_print_version(hipblaslt) - find_package_and_print_version(miopen REQUIRED) - find_package_and_print_version(hipfft REQUIRED) - find_package_and_print_version(hipsparse REQUIRED) --- -2.45.2 - diff --git a/0001-Patch-for-sleef-3.6.patch b/0001-Patch-for-sleef-3.6.patch deleted file mode 100644 index 13aa208..0000000 --- a/0001-Patch-for-sleef-3.6.patch +++ /dev/null @@ -1,952 +0,0 @@ -From 273f23698c887b52e66c2abec8101b7398f0f9c4 Mon Sep 17 00:00:00 2001 -From: "Benjamin A. 
Beasley" -Date: Wed, 5 Jun 2024 11:06:02 -0400 -Subject: [PATCH] Patch for sleef 3.6 - ---- - ...ectorization-on-windows-submodule-sl.patch | 910 ++++++++++++++++++ - python-torch.spec | 11 + - 2 files changed, 921 insertions(+) - create mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch - -diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch -new file mode 100644 -index 000000000000..562f55b742c2 ---- /dev/null -+++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch -@@ -0,0 +1,910 @@ -+From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 -+From: Xu Han -+Date: Sun, 31 Mar 2024 03:07:32 +0000 -+Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] -+ (#118980) -+ -+Enable VEC on Windows OS. -+1. Fix some type defination gap between Windows and Linux. -+2. Fix some operator not support on Windows, such as [], /. -+3. Enable static sleef library build on Windows. -+4. Disable unsupported function overloading on MSVC. -+5. Upgrade submodule sleef lib, which fixed build issue on Windows. -+6. Fixed bazel build issues. -+7. Fix test app not link to sleef on Windows. 
-+ -+Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: -+```cmd -+git submodule sync -+git submodule update --init --recursive -+``` -+ -+Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 -+Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet -+--- -+ aten/src/ATen/CMakeLists.txt | 48 ++++++-------- -+ aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- -+ .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- -+ .../cpu/vec/vec256/vec256_complex_double.h | 7 +- -+ .../cpu/vec/vec256/vec256_complex_float.h | 7 +- -+ aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- -+ aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- -+ aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- -+ aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- -+ .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- -+ .../cpu/vec/vec512/vec512_complex_double.h | 7 +- -+ .../cpu/vec/vec512/vec512_complex_float.h | 7 +- -+ aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- -+ aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- -+ aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- -+ aten/src/ATen/cpu/vec/vec_base.h | 6 ++ -+ caffe2/CMakeLists.txt | 2 +- -+ third_party/sleef.BUILD | 3 +- -+ 18 files changed, 194 insertions(+), 93 deletions(-) -+ -+diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -+index bf425af5fa9..58d5828e8ca 100644 -+--- a/aten/src/ATen/CMakeLists.txt -++++ b/aten/src/ATen/CMakeLists.txt -+@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") -+ list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) -+ endif() -+ -+-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+- # Preserve values for the main build -+- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) -+- set(__aten_sleef_build_tests ${BUILD_TESTS}) -+- -+- # Unset our restrictive C++ flags here and reset them later. 
-+- # Remove this once we use proper target_compile_options. -+- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -+- set(CMAKE_CXX_FLAGS) -+- -+- # Bump up optimization level for sleef to -O1, since at -O0 the compiler -+- # excessively spills intermediate vector registers to the stack -+- # and makes things run impossibly slowly -+- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -+- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -+- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+- else() -+- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -++ if(NOT MSVC) -++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler -++ # excessively spills intermediate vector registers to the stack -++ # and makes things run impossibly slowly -++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -++ else() -++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -++ endif() -+ endif() -+ -+ if(NOT USE_SYSTEM_SLEEF) -+- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -+- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -+- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -+- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -+- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) -++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -++ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." 
FORCE) -+ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") -+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") -+ set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) -+@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+ endif() -+ list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) -+ -+- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) -+- -+- # Set these back. TODO: Use SLEEF_ to pass these instead -+- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) -+- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) -++ if(NOT MSVC) -++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -++ endif() -+ endif() -+ -+ if(USE_CUDA AND NOT USE_ROCM) -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h -+index 800b027e469..c431fa3c605 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h -+@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { -+ } -+ -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) -+ } -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. 
-+ template -+ std::enable_if_t> -+ inline gather(const double* base_addr, const Vectorized& vindex) { -+@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { -+ return _mm256_i32gather_ps(base_addr, vindex, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline mask_gather(const Vectorized& src, const double* base_addr, -+@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, -+ const Vectorized& vindex, Vectorized& mask) { -+ return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+ // Only works for inputs in the range: [-2^51, 2^51] -+@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { -+ return flip8(v); -+ } -+ -+-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // (defined(CPU_CAPABILITY_AVX2) -+ -+ }} // namepsace at::vec::CPU_CAPABILITY -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+index 3e26213d6d2..66557436c70 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -18,7 +19,18 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++ -++#ifndef SLEEF_CONST -++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -++#define SLEEF_CONST const -++#else -++#define SLEEF_CONST -++#endif -++#define SLEEF_CONST_OLD 
SLEEF_CONST -++#else -++#define SLEEF_CONST_OLD -++#endif -+ -+ // bfloat16 conversion -+ static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { -+@@ -265,7 +277,8 @@ public: -+ } -+ return b; -+ } -+- Vectorized map(const __m256 (*const vop)(__m256)) const { -++ -++ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { -+ __m256 lo, hi; -+ cvt_to_fp32(values, lo, hi); -+ const auto o1 = vop(lo); -+@@ -1026,7 +1039,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_VECTORIZED_INIT(Half, half); -+ -+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX2) -+ -+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ -+ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -+@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_NON_VECTORIZED_INIT(Half, half); -+ -+-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX2) -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ auto values = _mm_loadu_si128(reinterpret_cast(data)); \ -+@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec -+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); -+ LOAD_FP32_VECTORIZED_INIT(Half, fp16); -+ -+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX2) -+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ __at_align__ float values[Vectorized::size()]; \ -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h 
b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+index f93ea1e63c3..6c198fb37d3 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+@@ -8,7 +8,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized> { -+ private: -+@@ -145,7 +146,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm256_setzero_pd(); -+ auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm256_div_pd(values, abs); -+ return _mm256_blendv_pd(div, zero, mask); -+ } -+ __m256d real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+index 7c142c04b79..c72d4d49274 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized> { -+ private: -+@@ -180,7 +181,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm256_setzero_ps(); -+ auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm256_div_ps(values, abs); -+ return _mm256_blendv_ps(div, zero, mask); -+ } -+ __m256 real_() 
const { -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+index bc82d07edd1..bed6da627af 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace at::vec { -+ inline namespace CPU_CAPABILITY { -+ -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized { -+ private: -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+index 886809a0b8a..0e3664cd37b 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -14,7 +15,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized { -+ private: -+@@ -226,14 +227,14 @@ public: -+ static __m256 vec_factorial_5 = -+ _mm256_set1_ps(0.00828929059f); // 1/factorial(5) -+ static __m256 vec_exp_log2ef = -+- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) -++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) -+ static __m256 vec_half = _mm256_set1_ps(0.5f); -+ static __m256 vec_one = _mm256_set1_ps(1.f); -+ static __m256 vec_zero = _mm256_set1_ps(0.f); -+ static __m256 vec_two = _mm256_set1_ps(2.f); -+- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) -+- static __m256 vec_ln_flt_min = 
(__m256)_mm256_set1_epi32(0xc2aeac50); -+- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); -++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) -++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); -++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); -+ static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); -+ static int n_mantissa_bits = 23; -+ -+@@ -266,7 +267,7 @@ public: -+ auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); -+ auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); -+ vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -+- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; -++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); -+ vec_two_pow_n = -+ _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); -+ -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+index 4128841701a..85e099904cd 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+@@ -41,11 +41,17 @@ -+ namespace at::vec { -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -++#ifdef _MSC_VER -++__declspec(align(64)) struct Vectorizedqi { -++ protected: -++ __m256i vals; -++#else -+ struct Vectorizedqi { -+ protected: -+ __m256i vals __attribute__((aligned(64))); -++#endif -+ -+ public: -+ Vectorizedqi() {} -+@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { -+ } -+ -+ template -+-inline void __attribute__((always_inline)) QuantizeAvx2( -++__FORCE_INLINE void QuantizeAvx2( -+ const float* src, -+ T* dst, -+ int len, -+@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V -+ return a.maximum(b); -+ } -+ -+-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) 
-++#endif // if defined(CPU_CAPABILITY_AVX2) -+ }} // namespace at::vec::CPU_CAPABILITY -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h -+index fe96d123e64..87f723d782c 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h -+@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { -+ } -+ -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) -+ } -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline gather(const double* base_addr, const Vectorized& vindex) { -+@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { -+ return _mm512_i32gather_ps(vindex, base_addr, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. 
-+ template -+ std::enable_if_t> -+ inline mask_gather(const Vectorized& src, const double* base_addr, -+@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, -+ auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); -+ return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+ template<> -+@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { -+ return flip8(v); -+ } -+ -+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX512) -+ -+ }}} -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+index f9fc92d52bf..eb3b6a72240 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,18 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++ -++#ifndef SLEEF_CONST -++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -++#define SLEEF_CONST const -++#else -++#define SLEEF_CONST -++#endif -++#define SLEEF_CONST_OLD SLEEF_CONST -++#else -++#define SLEEF_CONST_OLD -++#endif -+ -+ // bfloat16 conversion -+ static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { -+@@ -362,7 +374,8 @@ public: -+ } -+ #pragma clang diagnostic push -+ #pragma clang diagnostic ignored "-Wignored-qualifiers" -+- Vectorized map(const __m512 (*const vop)(__m512)) const { -++ -++ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { -+ __m512 lo, hi; -+ 
cvt_to_fp32(values, lo, hi); -+ const auto o1 = vop(lo); -+@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_VECTORIZED_INIT(Half, half); -+ -+-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#else //defined(CPU_CAPABILITY_AVX512) -+ -+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ -+ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -+@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_NON_VECTORIZED_INIT(Half, half); -+ -+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX512) -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ -+@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec -+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); -+ LOAD_FP32_VECTORIZED_INIT(Half, fp16); -+ -+-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX512) -+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ __at_align__ float values[Vectorized::size()]; \ -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+index 02aa3a87cc1..c35204f9da2 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) 
-++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized> { -+ private: -+@@ -203,7 +204,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm512_setzero_pd(); -+ auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm512_div_pd(values, abs); -+ return _mm512_mask_blend_pd(mask, div, zero); -+ } -+ __m512d real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+index a5d790c98b2..2801e484d94 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized> { -+ private: -+@@ -708,7 +709,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm512_setzero_ps(); -+ auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm512_div_ps(values, abs); -+ return _mm512_mask_blend_ps(mask, div, zero); -+ } -+ __m512 real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+index 27b2753c903..508ab257e60 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if 
(defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) -++#if (defined(CPU_CAPABILITY_AVX512)) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized { -+ private: -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+index ba5738687fd..a08df3c141a 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized { -+ private: -+@@ -246,14 +247,14 @@ public: -+ static __m512 vec_factorial_5 = -+ _mm512_set1_ps(0.00828929059f); // 1/factorial(5) -+ static __m512 vec_exp_log2ef = -+- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) -++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) -+ static __m512 vec_half = _mm512_set1_ps(0.5f); -+ static __m512 vec_one = _mm512_set1_ps(1.f); -+ static __m512 vec_zero = _mm512_set1_ps(0.f); -+ static __m512 vec_two = _mm512_set1_ps(2.f); -+- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) -+- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); -+- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); -++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) -++ static __m512 vec_ln_flt_min = 
_mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); -++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); -+ static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); -+ static int n_mantissa_bits = 23; -+ -+@@ -288,7 +289,7 @@ public: -+ auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); -+ auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); -+ vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -+- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; -++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); -+ vec_two_pow_n = -+ _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); -+ -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+index e0713d01312..a5671ed4a50 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+@@ -42,11 +42,17 @@ namespace at { -+ namespace vec { -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -++#ifdef _MSC_VER -++__declspec(align(64)) struct Vectorizedqi { -++ protected: -++ __m512i vals; -++#else -+ struct Vectorizedqi { -+ protected: -+ __m512i vals __attribute__((aligned(64))); -++#endif -+ -+ public: -+ Vectorizedqi() {} -+@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { -+ } -+ -+ template -+-inline void __attribute__((always_inline)) QuantizeAvx512( -++__FORCE_INLINE void QuantizeAvx512( -+ const float* src, -+ T* dst, -+ int len, -+@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { -+ Vectorized scale, -+ Vectorized zero_point, -+ Vectorized scale_neg_zp_premul) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = 
_mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -+@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { -+ float_vec_return_type dequantize( -+ Vectorized scale, -+ Vectorized zero_point) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -+@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { -+ } -+ -+ int_vec_return_type widening_subtract(Vectorized b) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = 
_mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512i int32_val0 = cvtepi8_epi32(int_val0); -+ __m512i int32_val1 = cvtepi8_epi32(int_val1); -+ __m512i int32_val2 = cvtepi8_epi32(int_val2); -+ __m512i int32_val3 = cvtepi8_epi32(int_val3); -+ -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -++ #else -+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -++ #endif -+ -+ __m512i int32_b0 = cvtepi8_epi32(int_b0); -+ __m512i int32_b1 = cvtepi8_epi32(int_b1); -+@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { -+ Vectorized scale, -+ Vectorized zero_point, -+ Vectorized scale_zp_premul) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -+@@ -746,10 +787,17 @@ struct 
Vectorized : public Vectorizedqi { -+ float_vec_return_type dequantize( -+ Vectorized scale, -+ Vectorized zero_point) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -+@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { -+ } -+ -+ int_vec_return_type widening_subtract(Vectorized b) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512i int32_val0 = cvtepu8_epi32(int_val0); -+ __m512i int32_val1 = cvtepu8_epi32(int_val1); -+ __m512i int32_val2 = cvtepu8_epi32(int_val2); -+ __m512i int32_val3 = cvtepu8_epi32(int_val3); -+ -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], 
b.vals.m512i_u64[2]); -++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -++ #else -+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -++ #endif -+ -+ __m512i int32_b0 = cvtepu8_epi32(int_b0); -+ __m512i int32_b1 = cvtepu8_epi32(int_b1); -+diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h -+index adf81dd915c..20cb8ef6dbc 100644 -+--- a/aten/src/ATen/cpu/vec/vec_base.h -++++ b/aten/src/ATen/cpu/vec/vec_base.h -+@@ -36,6 +36,12 @@ -+ #include -+ #include -+ -++#if defined(__GNUC__) -++#define __FORCE_INLINE __attribute__((always_inline)) inline -++#elif defined(_MSC_VER) -++#define __FORCE_INLINE __forceinline -++#endif -++ -+ // These macros helped us unify vec_base.h -+ #ifdef CPU_CAPABILITY_AVX512 -+ #if defined(__GNUC__) -+diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -+index a6b6f0f7d1d..15d37cf4861 100644 -+--- a/caffe2/CMakeLists.txt -++++ b/caffe2/CMakeLists.txt -+@@ -1787,7 +1787,7 @@ if(BUILD_TEST) -+ endif() -+ else() -+ add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") -+- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) -++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) -+ endif() -+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) -+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) -+diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD -+index 573f9c5b54a..f22a6e905e2 100644 -+--- a/third_party/sleef.BUILD -++++ b/third_party/sleef.BUILD -+@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ -+ SLEEF_PRIVATE_INCLUDES = [ -+ "-Iexternal/sleef/src/arch", -+ "-Iexternal/sleef/src/common", -++ 
"-Iexternal/sleef/src/libm", -+ ] -+ -+ SLEEF_PUBLIC_INCLUDES = [ -+@@ -201,8 +202,6 @@ cc_library( -+ srcs = [ -+ "src/libm/rempitab.c", -+ "src/libm/sleefdp.c", -+- "src/libm/sleefld.c", -+- "src/libm/sleefqp.c", -+ "src/libm/sleefsp.c", -+ ], -+ hdrs = SLEEF_PUBLIC_HEADERS, -+-- -+2.45.1 -+ -diff --git a/python-torch.spec b/python-torch.spec -index d50687a5174a..63600c2e8c39 100644 ---- a/python-torch.spec -+++ b/python-torch.spec -@@ -176,6 +176,17 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch - Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch - %endif - -+# Enable x86 CPU vectorization on windows [submodule sleef] (#118980) -+# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370 -+# -+# Despite the title, this patch fixes compatibility with sleef 3.6 by including -+# a backwards-compatible version of the fix from -+# https://github.com/pytorch/pytorch/pull/122723. -+# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef -+# git submodule (because the release archive contains an actual sleef source -+# tree instead, so this would not apply.) -+Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch -+ - %if %{with rocm} - # ROCm patches - # https://github.com/pytorch/pytorch/pull/120551 --- -2.45.1 - diff --git a/0001-Reenable-dim-for-python-3.12.patch b/0001-Reenable-dim-for-python-3.12.patch deleted file mode 100644 index 138b5d4..0000000 --- a/0001-Reenable-dim-for-python-3.12.patch +++ /dev/null @@ -1,115 +0,0 @@ -From ee3fb343a376cdba6f4ce188cac90023f13e2aea Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 4 Apr 2024 14:21:38 -0600 -Subject: [PATCH] Reenable dim for python 3.12 - -In 3.12: - -_PyArg_Parser added an element to the start of the structure. -So existing positional initialization is off. Switch to element -initialization. 
- -_Py_CODEUNIT changed to from an int to a union, but relevant_op -is passed an int for the return of decoder.opcode, so the parameter -type is wrong, switch it to int. - -The opcode PRECALL was removed, so reduce its handling to 3.11 - -Signed-off-by: Tom Rix ---- - functorch/csrc/dim/dim.cpp | 24 +++++------------------- - functorch/csrc/dim/minpybind.h | 4 ++-- - 2 files changed, 7 insertions(+), 21 deletions(-) - -diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp -index 4cc027504c77..e48b0d58081f 100644 ---- a/functorch/csrc/dim/dim.cpp -+++ b/functorch/csrc/dim/dim.cpp -@@ -6,20 +6,6 @@ - - #include - -- --// Many APIs have changed/don't exist anymore --#if IS_PYTHON_3_12_PLUS -- --#include "dim.h" -- --// Re-enable this some day --PyObject* Dim_init() { -- PyErr_SetString(PyExc_RuntimeError, "First class dim doesn't work with python 3.12"); -- return nullptr; --} -- --#else -- - #include "minpybind.h" - #include - #include -@@ -441,7 +427,7 @@ static PyObject* DimList_bind(DimList *self, - PY_BEGIN - mpy::handle sizes; - static const char * const _keywords[] = {"sizes", nullptr}; -- static _PyArg_Parser parser = {"O", _keywords, 0}; -+ static _PyArg_Parser parser = { .format = "O", .keywords = _keywords}; - if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, &sizes)) { - return nullptr; - } -@@ -465,7 +451,7 @@ static PyObject* DimList_bind_len(DimList *self, - PY_BEGIN - int size; - static const char * const _keywords[] = {"N", nullptr}; -- static _PyArg_Parser parser = {"i", _keywords, 0}; -+ static _PyArg_Parser parser = { .format = "i", .keywords = _keywords}; - if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, &size)) { - return nullptr; - } -@@ -1468,7 +1454,7 @@ PyTypeObject Tensor::Type = { - - // dim() -------------------- - --static bool relevant_op(_Py_CODEUNIT c) { -+static bool relevant_op(int c) { - switch(c) { - case STORE_NAME: - case STORE_GLOBAL: -@@ -1587,7 +1573,7 @@ static PyObject* 
_dims(PyObject *self, - auto c = mpy::obj::steal(PyFrame_GetCode(f.ptr())); - auto lasti = PyFrame_GetLasti(f.ptr()); - auto decoder = PyInstDecoder(c.ptr(), lasti); -- #if IS_PYTHON_3_11_PLUS -+ #if IS_PYTHON_3_11 - // When py3.11 adapts bytecode lasti points to the precall - // rather than the call instruction after it - if (decoder.opcode() == PRECALL) { -@@ -3268,4 +3254,4 @@ PyObject* Dim_init() { - } - } - --#endif -+ -diff --git a/functorch/csrc/dim/minpybind.h b/functorch/csrc/dim/minpybind.h -index de82b5af95a4..d76d4828bf80 100644 ---- a/functorch/csrc/dim/minpybind.h -+++ b/functorch/csrc/dim/minpybind.h -@@ -621,7 +621,7 @@ struct vector_args { - PyObject *dummy = NULL; - _PyArg_ParseStackAndKeywords((PyObject*const*)args, nargs, kwnames.ptr(), _parser, &dummy, &dummy, &dummy, &dummy, &dummy); - #else -- _PyArg_Parser* _parser = new _PyArg_Parser{NULL, &names_buf[0], fname_cstr, 0}; -+ _PyArg_Parser* _parser = new _PyArg_Parser{ .keywords = &names_buf[0], .fname = fname_cstr}; - std::unique_ptr buf(new PyObject*[names.size()]); - _PyArg_UnpackKeywords((PyObject*const*)args, nargs, NULL, kwnames.ptr(), _parser, required, (Py_ssize_t)values.size() - kwonly, 0, &buf[0]); - #endif -@@ -706,7 +706,7 @@ inline object handle::call_vector(vector_args args) { - #define MPY_PARSE_ARGS_KWNAMES(fmt, FORALL_ARGS) \ - static const char * const kwlist[] = { FORALL_ARGS(MPY_ARGS_NAME) nullptr}; \ - FORALL_ARGS(MPY_ARGS_DECLARE) \ -- static _PyArg_Parser parser = {fmt, kwlist, 0}; \ -+ static _PyArg_Parser parser = { .format = fmt, .keywords = kwlist}; \ - if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, FORALL_ARGS(MPY_ARGS_POINTER) nullptr)) { \ - throw mpy::exception_set(); \ - } --- -2.44.0 - diff --git a/0001-Regenerate-flatbuffer-header.patch b/0001-Regenerate-flatbuffer-header.patch deleted file mode 100644 index 4eec491..0000000 --- a/0001-Regenerate-flatbuffer-header.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 
5b8e51b24513fa851eeff42f23d942bde301e321 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 29 Sep 2023 06:19:29 -0700 -Subject: [PATCH] Regenerate flatbuffer header - -For this error -torch/csrc/jit/serialization/mobile_bytecode_generated.h:12:41: -error: static assertion failed: Non-compatible flatbuffers version included - 12 | FLATBUFFERS_VERSION_MINOR == 3 && - -PyTorch is expecting 23.3.3, what f38 has -Rawhide is at 23.5.26 - -Regenerate with -flatc --cpp --gen-mutable --no-prefix --scoped-enums mobile_bytecode.fbs - -Signed-off-by: Tom Rix ---- - torch/csrc/jit/serialization/mobile_bytecode_generated.h | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/torch/csrc/jit/serialization/mobile_bytecode_generated.h b/torch/csrc/jit/serialization/mobile_bytecode_generated.h -index cffe8bc7a6..83575e4c19 100644 ---- a/torch/csrc/jit/serialization/mobile_bytecode_generated.h -+++ b/torch/csrc/jit/serialization/mobile_bytecode_generated.h -@@ -9,8 +9,8 @@ - // Ensure the included flatbuffers.h is the same version as when this file was - // generated, otherwise it may not be compatible. - static_assert(FLATBUFFERS_VERSION_MAJOR == 23 && -- FLATBUFFERS_VERSION_MINOR == 3 && -- FLATBUFFERS_VERSION_REVISION == 3, -+ FLATBUFFERS_VERSION_MINOR == 5 && -+ FLATBUFFERS_VERSION_REVISION == 26, - "Non-compatible flatbuffers version included"); - - namespace torch { --- -2.43.0 - diff --git a/0001-Stub-in-kineto-ActivityType.patch b/0001-Stub-in-kineto-ActivityType.patch deleted file mode 100644 index f088645..0000000 --- a/0001-Stub-in-kineto-ActivityType.patch +++ /dev/null @@ -1,73 +0,0 @@ -From 3ef82b814179da571b2478f61d4279717ab0b23a Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 29 Sep 2023 06:25:23 -0700 -Subject: [PATCH] Stub in kineto ActivityType - -There is an error with kineto is not used, the shim still -requires the ActivityTYpe.h header to get the enum Activity type. -So cut-n-paste just enough of the header in to do this. 
- -Signed-off-by: Tom Rix ---- - torch/csrc/profiler/kineto_shim.h | 44 +++++++++++++++++++++++++++++++ - 1 file changed, 44 insertions(+) - -diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h -index e92cbf003d..68985ab7d0 100644 ---- a/torch/csrc/profiler/kineto_shim.h -+++ b/torch/csrc/profiler/kineto_shim.h -@@ -12,7 +12,51 @@ - #undef USE_KINETO - #endif - -+#ifdef USE_KINETO - #include -+#else -+namespace libkineto { -+// copied from header -+/* -+ * Copyright (c) Meta Platforms, Inc. and affiliates. -+ * All rights reserved. -+ * -+ * This source code is licensed under the BSD-style license found in the -+ * LICENSE file in the root directory of this source tree. -+ */ -+ -+// Note : All activity types are not enabled by default. Please add them -+// at correct position in the enum -+enum class ActivityType { -+ // Activity types enabled by default -+ CPU_OP = 0, // cpu side ops -+ USER_ANNOTATION, -+ GPU_USER_ANNOTATION, -+ GPU_MEMCPY, -+ GPU_MEMSET, -+ CONCURRENT_KERNEL, // on-device kernels -+ EXTERNAL_CORRELATION, -+ CUDA_RUNTIME, // host side cuda runtime events -+ CUDA_DRIVER, // host side cuda driver events -+ CPU_INSTANT_EVENT, // host side point-like events -+ PYTHON_FUNCTION, -+ OVERHEAD, // CUPTI induced overhead events sampled from its overhead API. -+ -+ // Optional Activity types -+ CUDA_SYNC, // synchronization events between runtime and kernels -+ GLOW_RUNTIME, // host side glow runtime events -+ MTIA_RUNTIME, // host side MTIA runtime events -+ CUDA_PROFILER_RANGE, // CUPTI Profiler range for performance metrics -+ MTIA_CCP_EVENTS, // MTIA ondevice CCP events -+ HPU_OP, // HPU host side runtime event -+ XPU_RUNTIME, // host side xpu runtime events -+ -+ ENUM_COUNT, // This is to add buffer and not used for any profiling logic. Add your new type before it. 
-+ OPTIONAL_ACTIVITY_TYPE_START = CUDA_SYNC, -+}; -+} -+ -+#endif - - #include - #include --- -2.43.0 - diff --git a/0001-can-not-use-with-c-files.patch b/0001-can-not-use-with-c-files.patch deleted file mode 100644 index 719737c..0000000 --- a/0001-can-not-use-with-c-files.patch +++ /dev/null @@ -1,25 +0,0 @@ -From a5dff521691a17701b5a02ec75e84cfe1bf605f7 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 06:41:49 -0500 -Subject: [PATCH] can not use with c files - ---- - cmake/Dependencies.cmake | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 4dd8042058..5f91f3ffab 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1269,7 +1269,7 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier) - list(APPEND HIP_CXX_FLAGS -DCAFFE2_USE_MIOPEN) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) -- list(APPEND HIP_CXX_FLAGS -std=c++17) -+# list(APPEND HIP_CXX_FLAGS -std=c++17) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "6.0.0") - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) - endif() --- -2.43.0 - diff --git a/0001-cuda-hip-signatures.patch b/0001-cuda-hip-signatures.patch deleted file mode 100644 index a258737..0000000 --- a/0001-cuda-hip-signatures.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 214dc959acc809e1959643272c344ee5335d5a69 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 1 Feb 2024 11:29:47 -0500 -Subject: [PATCH] cuda - hip signatures - ---- - aten/src/ATen/cuda/detail/LazyNVRTC.cpp | 9 +++++++++ - 1 file changed, 9 insertions(+) - -diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -index 1b85e7776e..bb6f88783a 100644 ---- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -+++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -@@ -134,8 +134,13 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, - const char *src, - const char *name, - int numHeaders, -+#if !defined(USE_ROCM) 
- const char * const *headers, - const char * const *includeNames) { -+#else -+ const char **headers, -+ const char **includeNames) { -+#endif - auto fn = reinterpret_cast(getNVRTCLibrary().sym(__func__)); - if (!fn) - throw std::runtime_error("Can't get nvrtcCreateProgram"); -@@ -150,7 +155,11 @@ NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); - NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *); - NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *); - #endif -+#if !defined(USE_ROCM) - NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *); -+#else -+NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char **); -+#endif - _STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult); - NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*); - NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *); --- -2.43.0 - diff --git a/0001-disable-use-of-aotriton.patch b/0001-disable-use-of-aotriton.patch deleted file mode 100644 index 61ffd1e..0000000 --- a/0001-disable-use-of-aotriton.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 07:06:09 -0700 -Subject: [PATCH] disable use of aotriton - ---- - .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- - 1 file changed, 15 insertions(+), 2 deletions(-) - -diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -index 214b02d8262e..7b3eb9dcd8cd 100644 ---- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -@@ -19,9 +19,12 @@ - #include - #include - -+#ifdef USE_FLASH_ATTENTION - #if USE_ROCM - #include - #endif -+#endif -+ - - /** - * Note [SDPA Runtime Dispatch] -@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { - - bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { - // Check that the gpu is capable of 
running flash attention -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - using sm80 = SMVersion<8, 0>; - using sm90 = SMVersion<9, 0>; - #if USE_ROCM -@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug - } - #endif - return true; -+#endif - } - - bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - // Mem Efficient attention supports hardware in the range [sm_50, sm_90] - using sm50 = SMVersion<5, 0>; - using sm90 = SMVersion<9, 0>; -@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) - } - #endif - return true; -+#endif - } - - bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( -@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - #ifndef USE_FLASH_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); - return false; --#endif -+#else - - // Define gate functions that determine if a flash kernel can be ran - // Replace with std::to_array when we migrate to c++20 -@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - } - } - return true; -+#endif - } - - bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - #ifndef USE_MEM_EFF_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); - return false; --#endif -+#else - // Constraints specific to mem efficient attention - constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = - array_of(at::kHalf, at::kFloat, at::kBFloat16); -@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - } - #endif - return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); -+#endif - } - - SDPBackend select_sdp_backend(sdp_params const& kernel_params) { --- -2.45.2 - diff --git 
a/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch b/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch deleted file mode 100644 index 0ce5b1f..0000000 --- a/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch +++ /dev/null @@ -1,226 +0,0 @@ -From b9d45eb1cc90696a4de76676221219e24423c709 Mon Sep 17 00:00:00 2001 -From: William Wen -Date: Wed, 3 Apr 2024 17:58:46 -0700 -Subject: [PATCH] [dynamo, 3.12] enable dynamo on 3.12, enable most dynamo - unittests on 3.12 (#123216) - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/123216 -Approved by: https://github.com/jansel, https://github.com/malfet ---- - test/dynamo/test_autograd_function.py | 3 ++ - test/dynamo/test_misc.py | 63 +++++++++++++++++++++++++ - test/functorch/test_eager_transforms.py | 7 ++- - test/run_test.py | 3 -- - torch/__init__.py | 5 +- - torch/_dynamo/eval_frame.py | 4 +- - torch/_dynamo/test_case.py | 8 +--- - 7 files changed, 74 insertions(+), 19 deletions(-) - -diff --git a/test/dynamo/test_autograd_function.py b/test/dynamo/test_autograd_function.py -index d23fec607afa..bc5ebc767038 100644 ---- a/test/dynamo/test_autograd_function.py -+++ b/test/dynamo/test_autograd_function.py -@@ -2,6 +2,8 @@ - - import copy - import math -+import sys -+import unittest - - import torch - -@@ -528,6 +530,7 @@ class AutogradFunctionTests(torch._dynamo.test_case.TestCase): - # I pulled all of these test cases from test_autograd.py - # In the future, we should make the Dynamo test suite actually - # run on test_autograd.py (it's disabled right now) and delete these. 
-+ @unittest.skipIf(sys.version_info >= (3, 12), "invalid free in 3.12+") - def test_smoke_from_test_autograd(self): - class Func(torch.autograd.Function): - @staticmethod -diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py -index a73de8b1c7e9..8f54e0564e6b 100644 ---- a/test/dynamo/test_misc.py -+++ b/test/dynamo/test_misc.py -@@ -9760,6 +9760,69 @@ fn - lambda mod: mod, - ) - -+ @xfailIfPy311 -+ def test_outside_linear_module_free(self): -+ # Compared to test_linear_module_free, the linear -+ # layer is not the code object that is directly compiled. -+ def model_inp_ctr(): -+ fc = torch.nn.Linear(100, 100) -+ -+ class Mod(torch.nn.Module): -+ def __init__(self): -+ super().__init__() -+ self.fc_ref = fc -+ -+ def forward(self, x): -+ return fc(x[0]) -+ -+ # return fc to keep it alive in _test_compile_model_free -+ return Mod(), (torch.randn(100, 100), fc) -+ -+ self._test_compile_model_free(model_inp_ctr, lambda mod: mod.fc_ref) -+ -+ @unittest.skipIf(sys.version_info >= (3, 12), "leaks in 3.12+") -+ def test_parameter_free(self): -+ def model_inp_ctr(): -+ param = torch.nn.Parameter(torch.randn(100, 100)) -+ -+ class Mod(torch.nn.Module): -+ def __init__(self): -+ super().__init__() -+ self.param = param -+ -+ def forward(self, x): -+ return self.param * x[0] -+ -+ # return param to keep it alive in _test_compile_model_free -+ return Mod(), (torch.randn(100, 100), param) -+ -+ self._test_compile_model_free(model_inp_ctr, lambda mod: mod.param) -+ -+ def test_raises_importerror1(self): -+ @torch.compile(backend="eager") -+ def fn(x): -+ try: -+ import some_module_that_surely_does_not_exist -+ -+ return -+ except ImportError: -+ pass -+ return x.sin() -+ -+ x = torch.randn(8) -+ self.assertEqual(fn(x), x.sin()) -+ -+ def test_raises_importerror2(self): -+ @torch.compile(backend="eager") -+ def fn(x): -+ import some_module_that_surely_does_not_exist -+ -+ return x + 1 -+ -+ x = torch.randn(8) -+ with self.assertRaises(ImportError): -+ fn(x) -+ - def 
test_dynamo_cache_move_to_front(self): - class Mod(torch.nn.Module): - def __init__(self): -diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py -index 09415cf8f48e..60790ec06059 100644 ---- a/test/functorch/test_eager_transforms.py -+++ b/test/functorch/test_eager_transforms.py -@@ -4762,8 +4762,7 @@ class TestCompileTransforms(TestCase): - # Triton only supports GPU with SM70 or later. - @expectedFailureIf((IS_ARM64 and not IS_MACOS) or - IS_WINDOWS or -- (TEST_CUDA and not SM70OrLater) or -- (sys.version_info >= (3, 12))) -+ (TEST_CUDA and not SM70OrLater)) - def test_compile_vmap_hessian(self, device): - # The model and inputs are a smaller version - # of code at benchmark repo: -@@ -4792,8 +4791,8 @@ class TestCompileTransforms(TestCase): - actual = opt_fn(params_and_buffers, x) - self.assertEqual(actual, expected) - -- # torch.compile is not supported on Windows or on Python 3.12+ -- @expectedFailureIf(IS_WINDOWS or (sys.version_info >= (3, 12))) -+ # torch.compile is not supported on Windows -+ @expectedFailureIf(IS_WINDOWS) - @torch._dynamo.config.patch(suppress_errors=False) - @torch._dynamo.config.patch(capture_func_transforms=True) - @skipIfTorchDynamo("Do not test torch.compile on top of torch.compile") -diff --git a/test/run_test.py b/test/run_test.py -index e86af9623042..ebb14df4167d 100755 ---- a/test/run_test.py -+++ b/test/run_test.py -@@ -74,7 +74,6 @@ sys.path.remove(str(REPO_ROOT)) - RERUN_DISABLED_TESTS = os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1" - DISTRIBUTED_TEST_PREFIX = "distributed" - INDUCTOR_TEST_PREFIX = "inductor" --DYNAMO_TEST_PREFIX = "dynamo" - - - # Note [ROCm parallel CI testing] -@@ -324,7 +323,6 @@ JIT_EXECUTOR_TESTS = [ - ] - - INDUCTOR_TESTS = [test for test in TESTS if test.startswith(INDUCTOR_TEST_PREFIX)] --DYNAMO_TESTS = [test for test in TESTS if test.startswith(DYNAMO_TEST_PREFIX)] - DISTRIBUTED_TESTS = [test for test in TESTS if 
test.startswith(DISTRIBUTED_TEST_PREFIX)] - TORCH_EXPORT_TESTS = [test for test in TESTS if test.startswith("export")] - FUNCTORCH_TESTS = [test for test in TESTS if test.startswith("functorch")] -@@ -1361,7 +1359,6 @@ def get_selected_tests(options) -> List[str]: - # these tests failing in Python 3.12 temporarily disabling - if sys.version_info >= (3, 12): - options.exclude.extend(INDUCTOR_TESTS) -- options.exclude.extend(DYNAMO_TESTS) - options.exclude.extend( - [ - "functorch/test_dims", -diff --git a/torch/__init__.py b/torch/__init__.py -index d381712b4a35..26cdffe81d29 100644 ---- a/torch/__init__.py -+++ b/torch/__init__.py -@@ -1861,9 +1861,8 @@ def compile(model: Optional[Callable] = None, *, - - """ - _C._log_api_usage_once("torch.compile") -- # Temporary until we get proper support for python 3.12 -- if sys.version_info >= (3, 12): -- raise RuntimeError("Dynamo is not supported on Python 3.12+") -+ if sys.version_info >= (3, 13): -+ raise RuntimeError("Dynamo is not supported on Python 3.13+") - - # Decorator mode - if model is None: -diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py -index 53ab0df3a947..0a80eeea99ed 100644 ---- a/torch/_dynamo/eval_frame.py -+++ b/torch/_dynamo/eval_frame.py -@@ -589,8 +589,8 @@ class _NullDecorator(contextlib.nullcontext): # type: ignore[type-arg] - - - def check_if_dynamo_supported(): -- if sys.version_info >= (3, 12): -- raise RuntimeError("Python 3.12+ not yet supported for torch.compile") -+ if sys.version_info >= (3, 13): -+ raise RuntimeError("Python 3.13+ not yet supported for torch.compile") - - - def is_dynamo_supported(): -diff --git a/torch/_dynamo/test_case.py b/torch/_dynamo/test_case.py -index e3cbef09eaae..297ea6e2bc2a 100644 ---- a/torch/_dynamo/test_case.py -+++ b/torch/_dynamo/test_case.py -@@ -1,7 +1,6 @@ - import contextlib - import importlib - import logging --import sys - - import torch - import torch.testing -@@ -20,12 +19,7 @@ log = logging.getLogger(__name__) - def 
run_tests(needs=()): - from torch.testing._internal.common_utils import run_tests - -- if ( -- TEST_WITH_TORCHDYNAMO -- or IS_WINDOWS -- or TEST_WITH_CROSSREF -- or sys.version_info >= (3, 12) -- ): -+ if TEST_WITH_TORCHDYNAMO or IS_WINDOWS or TEST_WITH_CROSSREF: - return # skip testing - - if isinstance(needs, str): --- -2.44.0 - diff --git a/0001-include-fmt-ranges.h-for-using-fmt-join.patch b/0001-include-fmt-ranges.h-for-using-fmt-join.patch deleted file mode 100644 index f7f6c7d..0000000 --- a/0001-include-fmt-ranges.h-for-using-fmt-join.patch +++ /dev/null @@ -1,54 +0,0 @@ -From ba2cf11d1bf1dd5086c8e793198a697d4179cca7 Mon Sep 17 00:00:00 2001 -From: Kefu Chai -Date: Tue, 16 Jul 2024 08:00:22 +0800 -Subject: [PATCH] include fmt/ranges.h for using fmt::join() - -fmt::join() was moved into fmt/ranges.h in fmt 11, so include this -header for using it. - -Signed-off-by: Kefu Chai ---- - torch/csrc/distributed/c10d/socket.cpp | 1 + - torch/csrc/profiler/standalone/execution_trace_observer.cpp | 1 + - torch/csrc/profiler/util.cpp | 1 + - 3 files changed, 3 insertions(+) - -diff --git a/torch/csrc/distributed/c10d/socket.cpp b/torch/csrc/distributed/c10d/socket.cpp -index 5013f2540..cbcd33a19 100644 ---- a/torch/csrc/distributed/c10d/socket.cpp -+++ b/torch/csrc/distributed/c10d/socket.cpp -@@ -31,6 +31,7 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wdeprecated") - #include - C10_DIAGNOSTIC_POP() - #include -+#include - - #include - #include -diff --git a/torch/csrc/profiler/standalone/execution_trace_observer.cpp b/torch/csrc/profiler/standalone/execution_trace_observer.cpp -index 2ef2e5423..fb053e916 100644 ---- a/torch/csrc/profiler/standalone/execution_trace_observer.cpp -+++ b/torch/csrc/profiler/standalone/execution_trace_observer.cpp -@@ -10,6 +10,7 @@ - #endif // _WIN32 - - #include -+#include - #include - #include - #include -diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp -index 896bf606c..c229ce130 100644 ---- 
a/torch/csrc/profiler/util.cpp -+++ b/torch/csrc/profiler/util.cpp -@@ -5,6 +5,7 @@ - #include - #include - #include -+#include - - #ifdef USE_KINETO - #include --- -2.45.2 - diff --git a/0001-no-third_party-FXdiv.patch b/0001-no-third_party-FXdiv.patch deleted file mode 100644 index 71404e3..0000000 --- a/0001-no-third_party-FXdiv.patch +++ /dev/null @@ -1,54 +0,0 @@ -From b3b307add5724ee5730f161e16594fa702f34a19 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:20:28 -0500 -Subject: [PATCH] no third_party FXdiv - ---- - caffe2/CMakeLists.txt | 24 ++++++++++++------------ - 1 file changed, 12 insertions(+), 12 deletions(-) - -diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index b2f3adbfae..80a5625c8d 100644 ---- a/caffe2/CMakeLists.txt -+++ b/caffe2/CMakeLists.txt -@@ -110,15 +110,15 @@ endif() - # Note: the folders that are being commented out have not been properly - # addressed yet. - --if(NOT MSVC AND USE_XNNPACK) -- if(NOT TARGET fxdiv) -- set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") -- set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") -- add_subdirectory( -- "${FXDIV_SOURCE_DIR}" -- "${CMAKE_BINARY_DIR}/FXdiv") -- endif() --endif() -+#if(NOT MSVC AND USE_XNNPACK) -+# if(NOT TARGET fxdiv) -+# set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") -+# set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") -+# add_subdirectory( -+# "${FXDIV_SOURCE_DIR}" -+# "${CMAKE_BINARY_DIR}/FXdiv") -+# endif() -+#endif() - - add_subdirectory(core) - add_subdirectory(serialize) -@@ -1081,9 +1081,9 @@ if(USE_XPU) - target_compile_definitions(torch_xpu PRIVATE USE_XPU) - endif() - --if(NOT MSVC AND USE_XNNPACK) -- TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) --endif() -+#if(NOT MSVC AND USE_XNNPACK) -+# TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) -+#endif() - - # ========================================================== - # formerly-libtorch flags --- -2.43.0 - diff --git a/0001-no-third_party-fmt.patch b/0001-no-third_party-fmt.patch deleted file mode 100644 index 
6e82af2..0000000 --- a/0001-no-third_party-fmt.patch +++ /dev/null @@ -1,65 +0,0 @@ -From 2ce255b75760a0a513fb1706629b416f76a5c822 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:16:04 -0500 -Subject: [PATCH] no third_party fmt - ---- - c10/CMakeLists.txt | 2 +- - cmake/Dependencies.cmake | 6 +++--- - torch/CMakeLists.txt | 2 +- - 3 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt -index 1f742f4c176..4fa08913bdd 100644 ---- a/c10/CMakeLists.txt -+++ b/c10/CMakeLists.txt -@@ -87,7 +87,7 @@ endif() - if(C10_USE_GLOG) - target_link_libraries(c10 PUBLIC glog::glog) - endif() --target_link_libraries(c10 PRIVATE fmt::fmt-header-only) -+target_link_libraries(c10 PRIVATE fmt) - - if(C10_USE_NUMA) - message(STATUS "NUMA paths:") -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 6f5a2d5feff..42fbf80f6e8 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1837,7 +1837,7 @@ endif() - # - set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) - set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE) --add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) -+# add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) - - # Disable compiler feature checks for `fmt`. - # -@@ -1846,9 +1846,9 @@ add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) - # CMAKE_CXX_FLAGS in ways that break feature checks. Since we already know - # `fmt` is compatible with a superset of the compilers that PyTorch is, it - # shouldn't be too bad to just disable the checks. 
--set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") -+# set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") - --list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) -+# list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) - set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) - - # ---[ Kineto -diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt -index 97a72eed55b..9e5014d1980 100644 ---- a/torch/CMakeLists.txt -+++ b/torch/CMakeLists.txt -@@ -80,7 +80,7 @@ set(TORCH_PYTHON_LINK_LIBRARIES - python::python - pybind::pybind11 - shm -- fmt::fmt-header-only -+ fmt - ATEN_CPU_FILES_GEN_LIB) - - if(USE_ASAN AND TARGET Sanitizer::address) --- -2.43.2 - diff --git a/0001-no-third_party-foxi.patch b/0001-no-third_party-foxi.patch deleted file mode 100644 index ba1ec40..0000000 --- a/0001-no-third_party-foxi.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 8cb61cf9282102ac225645fcc9fb4a1bb7cb15a2 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:11:55 -0500 -Subject: [PATCH] no third_party foxi - ---- - cmake/Dependencies.cmake | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 5f91f3ffab..8e1461af81 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1567,7 +1567,7 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - set_target_properties(onnx_proto PROPERTIES CXX_STANDARD 17) - endif() - endif() -- add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL) -+ # add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL) - - add_definitions(-DONNX_NAMESPACE=${ONNX_NAMESPACE}) - if(NOT USE_SYSTEM_ONNX) -@@ -1600,8 +1600,8 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - message("-- Found onnx: ${ONNX_LIBRARY} ${ONNX_PROTO_LIBRARY}") - list(APPEND Caffe2_DEPENDENCY_LIBS 
onnx_proto onnx) - endif() -- include_directories(${FOXI_INCLUDE_DIRS}) -- list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) -+# include_directories(${FOXI_INCLUDE_DIRS}) -+# list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) - # Recover the build shared libs option. - set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) - endif() --- -2.43.0 - diff --git a/0001-reenable-foxi-linking.patch b/0001-reenable-foxi-linking.patch deleted file mode 100644 index 8e39795..0000000 --- a/0001-reenable-foxi-linking.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 58ccda271e8f51c3fa5b7518cf6ee52ce204fd37 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 22 Feb 2024 09:28:11 -0500 -Subject: [PATCH] reenable foxi linking - ---- - cmake/Dependencies.cmake | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 42fbf80f6e8..bc3a2dc6fee 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1604,7 +1604,7 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - list(APPEND Caffe2_DEPENDENCY_LIBS onnx_proto onnx) - endif() - # include_directories(${FOXI_INCLUDE_DIRS}) --# list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) -+ list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) - # Recover the build shared libs option. 
- set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) - endif() --- -2.43.2 - diff --git a/0001-silence-an-assert.patch b/0001-silence-an-assert.patch deleted file mode 100644 index 0b20dcf..0000000 --- a/0001-silence-an-assert.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 04dd33db93b852fdfd7ea408813080b2e2026650 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 06:41:20 -0500 -Subject: [PATCH] silence an assert - ---- - aten/src/ATen/native/cuda/IndexKernel.cu | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu -index 657c0c77b3..b406aa6687 100644 ---- a/aten/src/ATen/native/cuda/IndexKernel.cu -+++ b/aten/src/ATen/native/cuda/IndexKernel.cu -@@ -249,7 +249,7 @@ void index_put_kernel_quantized_cuda(TensorIterator& iter, const IntArrayRef ind - - gpu_index_kernel(iter, index_size, index_stride, [inv_scale, zero_point, qmin, qmax]C10_DEVICE(char* const out_data, const char* const in_data, const int64_t offset) { - int64_t qvalue = static_cast(zero_point + nearbyintf(*(float*)in_data * inv_scale)); -- qvalue = std::clamp(qvalue, qmin, qmax); -+ //qvalue = std::clamp(qvalue, qmin, qmax); - *(scalar_t*)(out_data + offset) = static_cast(qvalue); - }); - }); --- -2.43.0 - diff --git a/0001-use-any-hip.patch b/0001-use-any-hip.patch deleted file mode 100644 index dca86ea..0000000 --- a/0001-use-any-hip.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 4248211ce9a9de81bb3ade5d421ba709b19ead08 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 15:01:28 -0500 -Subject: [PATCH] use any hip - ---- - cmake/public/LoadHIP.cmake | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1abeb06228..28458c4146 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -30,7 +30,7 @@ endif() - message("Building PyTorch for GPU arch: ${PYTORCH_ROCM_ARCH}") - - # Add HIP to 
the CMAKE Module Path --set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip ${CMAKE_MODULE_PATH}) -+set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib64/cmake/hip ${CMAKE_MODULE_PATH}) - - macro(find_package_and_print_version PACKAGE_NAME) - find_package("${PACKAGE_NAME}" ${ARGN}) -@@ -38,7 +38,7 @@ macro(find_package_and_print_version PACKAGE_NAME) - endmacro() - - # Find the HIP Package --find_package_and_print_version(HIP 1.0) -+find_package_and_print_version(HIP MODULE) - - if(HIP_FOUND) - set(PYTORCH_FOUND_HIP TRUE) --- -2.43.0 - diff --git a/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch deleted file mode 100644 index 413c60d..0000000 --- a/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 20 Jul 2024 05:37:15 -0600 -Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM - -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - cmake/Dependencies.cmake | 3 ++- - 2 files changed, 3 insertions(+), 1 deletion(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index c4cd4b2c2a98..2068f7c6c4f2 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF - "USE_CUDNN" OFF) - cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) - option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) -+option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) - option(USE_KINETO "Use Kineto profiling library" ON) - option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) - option(USE_FAKELOWP "Use FakeLowp operators" OFF) -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..192dac46f13b 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -706,6 +706,7 @@ endif() - - # ---[ FBGEMM - if(USE_FBGEMM) -+ if (NOT USE_SYSTEM_FBGEMM) - 
set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") - if(NOT DEFINED FBGEMM_SOURCE_DIR) - set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") -@@ -746,7 +747,7 @@ if(USE_FBGEMM) - target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) - endif() - endif() -- -+ endif() - if(USE_FBGEMM) - list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) - endif() --- -2.45.1 - diff --git a/next/0001-Optionally-use-hipblaslt.patch b/next/0001-Optionally-use-hipblaslt.patch deleted file mode 100644 index 1e5ca4b..0000000 --- a/next/0001-Optionally-use-hipblaslt.patch +++ /dev/null @@ -1,506 +0,0 @@ -From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 04:18:34 -0700 -Subject: [PATCH] Optionally use hipblaslt - ---- - aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ - aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ - aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- - aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- - aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- - cmake/Dependencies.cmake | 3 ++ - cmake/public/LoadHIP.cmake | 2 +- - 7 files changed, 82 insertions(+), 19 deletions(-) - -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index ce991a9bcad4..3f0d17b52778 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -14,7 +14,9 @@ - #include - - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - // until hipblas has an API to accept flags, we must use rocblas here - #include - #include -@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { - static size_t _parseChosenWorkspaceSize() { - const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); - #ifdef USE_ROCM -+#ifndef USE_HIPBLASLT -+ return 0; -+#endif - if (!val) { - // accept either env var - val = getenv("HIPBLASLT_WORKSPACE_SIZE"); -@@ -235,6 +240,7 @@ namespace at::cuda::blas { - } while (0) 
- - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - namespace { - // Following the pattern of CuSparseDescriptor - // Defined here for now because this is the only place cublas_lt interface is -@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< - }; - } // namespace - -- - template - inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - cudaDataType_t abcType = CUDA_R_32F; -@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - " scaleType ", - scaleType); - } -- -+#endif - - template - inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { -@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); - } - } -@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); - } - } -@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - 
bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } - } -@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { - } - } - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { - // forward to bgemm implementation but set strides and batches to 0 - bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); - } -+#endif - - template - inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { -@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); - } - } -@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); - } - } -@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } - } -@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) 
{ - } - } - -- -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - void gemm_and_bias( - bool transpose_mat1, -@@ -1410,7 +1435,7 @@ void scaled_gemm( - ScalarType result_dtype, - void* amax_ptr, - bool use_fast_accum) { --#if CUDA_VERSION >= 11080 || defined(USE_ROCM) -+#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - const auto computeType = CUBLAS_COMPUTE_32F; - const auto scaleType = CUDA_R_32F; - const int8_t fastAccuMode = use_fast_accum ? 1 : 0; -@@ -1681,6 +1706,7 @@ void int8_gemm( - " scaleType ", - scaleType); - } -+#endif - - template <> - void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { -diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h -index f2b657ced51b..f0ee613c4208 100644 ---- a/aten/src/ATen/cuda/CUDAContextLight.h -+++ b/aten/src/ATen/cuda/CUDAContextLight.h -@@ -9,7 +9,9 @@ - - // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also - // added bf16 support -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - #include -+#endif - - #ifdef CUDART_VERSION - #include -@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); - /* Handles */ - TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); - TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); -+#endif - - TORCH_CUDA_CPP_API void clearCublasWorkspaces(); - -diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index 8eac525b3695..abfdf7a23847 100644 ---- a/aten/src/ATen/cuda/CublasHandlePool.cpp -+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp -@@ -29,7 +29,7 @@ namespace at::cuda { - - namespace { - --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - void createCublasLtHandle(cublasLtHandle_t *handle) { - 
TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); - } -@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { - return handle; - } - --cublasLtHandle_t getCurrentCUDABlasLtHandle() { - #ifdef USE_ROCM -+#if defined(USE_HIPBLASLT) -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - c10::DeviceIndex device = 0; - AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); - -@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { - - auto handle = myPoolWindow->reserve(device); - return handle; -+} -+#endif - #else -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - return reinterpret_cast(getCurrentCUDABlasHandle()); --#endif - } -+#endif - - } // namespace at::cuda -diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h -index 53e6154120c9..fa1d664696db 100644 ---- a/aten/src/ATen/cuda/tunable/TunableGemm.h -+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h -@@ -11,7 +11,9 @@ - - #include - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - #include - #endif - #include -@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class DefaultScaledGemmOp : public Callable> { - public: -@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { - return OK; - } - }; -+#endif - - template - inline bool IsZero(T v) { -@@ -191,6 +195,7 @@ static void AddRocblasValidator() { - } - } - -+#ifdef USE_HIPBLASLT - static void AddHipblasltValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - if (validators.find("HIPBLASLT_VERSION") == validators.end()) { -@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { - [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? 
OK : FAIL; }); - } - } -+#endif - - static void AddRocmValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); -@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class ScaledGemmTunableOp : public TunableOp, StreamTimer> { - public: -@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - - #if defined(USE_ROCM) -+#ifdef USE_HIPBLASLT - for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { - this->RegisterOp(std::move(name), std::move(op)); - } - AddHipblasltValidator(); -+#endif - AddRocmValidator(); - #endif - } -@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); - } - }; -+#endif - - #undef XSTRINGIFY - #undef STRINGIFY 
-diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 84c59a4fd0d7..56ad5de3bf2d 100644 ---- a/aten/src/ATen/native/cuda/Blas.cpp -+++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa - } - - static bool getDisableAddmmCudaLt() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); - #ifdef USE_ROCM - // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) -@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { - } - return false; - #endif -+#else -+ return true; -+#endif - } - - #ifdef USE_ROCM - static bool isSupportedHipLtROCmArch(int index) { -+#ifdef USE_HIPBLASLT - hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); - std::string device_arch = prop->gcnArchName; - static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { - } - } - TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); -+#endif - return false; - } - #endif -@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - at::ScalarType scalar_type = self.scalar_type(); - c10::MaybeOwned self_; - if (&result != &self) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) - // Strangely, if mat2 has only 1 row or column, we get - // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. 
-@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - scalar_type != at::ScalarType::BFloat16)); - #endif - } -+#endif - #endif - if (!useLtInterface) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - } - self__sizes = self_->sizes(); - } else { --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - useLtInterface = !disable_addmm_cuda_lt && - result.dim() == 2 && result.is_contiguous() && - isSupportedHipLtROCmArch(self.device().index()) && -@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); - - if (useLtInterface) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if defined(USE_ROCM) - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, -@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - activation_epilogue - ); - }); -+#endif - #endif - } else - { -@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { - } - - static bool _scaled_mm_allowed_device() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - auto dprops = at::cuda::getCurrentDeviceProperties(); - #ifdef USE_ROCM - std::string device_arch = dprops->gcnArchName; -@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { - #else - return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); - #endif -+#else -+ return false; -+#endif - } - - // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax -@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - // Check sizes - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); -+#if 
!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); - TORCH_CHECK( -@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 - // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately - amax = at::max(at::abs(out.to(kFloat))); -+#endif - #endif - - return {out, amax}; -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..8d05e834bbc5 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1052,6 +1052,9 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - list(APPEND HIP_CXX_FLAGS -std=c++17) - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) -+ if(hipblast_FOUND) -+ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) -+ endif() - if(HIP_NEW_TYPE_ENUMS) - list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) - endif() -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index fa39156031ff..df4836847fdf 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -155,7 +155,7 @@ if(HIP_FOUND) - find_package_and_print_version(hiprand REQUIRED) - find_package_and_print_version(rocblas REQUIRED) - find_package_and_print_version(hipblas REQUIRED) -- find_package_and_print_version(hipblaslt REQUIRED) -+ find_package_and_print_version(hipblaslt) - find_package_and_print_version(miopen REQUIRED) - find_package_and_print_version(hipfft REQUIRED) - find_package_and_print_version(hipsparse REQUIRED) --- -2.45.2 - diff --git a/next/0001-Use-horrible-dynamo-stub.patch b/next/0001-Use-horrible-dynamo-stub.patch new file mode 100644 index 0000000..1900519 --- /dev/null +++ b/next/0001-Use-horrible-dynamo-stub.patch @@ -0,0 +1,85 @@ +From fd535f7bf44f2034cca2a66b4cc7d68d962341df Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sun, 
20 Jul 2025 12:47:58 -0700 +Subject: [PATCH] Use horrible dynamo stub + +Rawhide's update of python is too fast for dynamo +So paper of the problem with a horrible stub that throws +runtime exceptions if dynamo is used. + +Signed-off-by: Tom Rix +--- + build_variables.bzl | 26 ++++++++++++---------- + torch/csrc/dynamo/horrible_dynamo_stub.cpp | 16 +++++++++++++ + 2 files changed, 30 insertions(+), 12 deletions(-) + create mode 100644 torch/csrc/dynamo/horrible_dynamo_stub.cpp + +diff --git a/build_variables.bzl b/build_variables.bzl +index b266c80e8843..a3be6893349b 100644 +--- a/build_variables.bzl ++++ b/build_variables.bzl +@@ -140,7 +140,8 @@ core_trainer_sources = [ + "torch/csrc/autograd/variable.cpp", + "torch/csrc/autograd/utils/warnings.cpp", + "torch/csrc/autograd/jit_decomp_interface.cpp", +- "torch/csrc/dynamo/compiled_autograd.cpp", ++# "torch/csrc/dynamo/compiled_autograd.cpp", ++ "torch/csrc/dynamo/horrible_dynamo_stub.cpp", + "torch/csrc/jit/frontend/name_mangler.cpp", + "torch/csrc/jit/ir/type_hashing.cpp", + "torch/csrc/jit/serialization/pickler.cpp", +@@ -868,17 +869,18 @@ libtorch_python_core_sources = [ + "torch/csrc/autograd/python_torch_functions_manual.cpp", + "torch/csrc/autograd/python_variable.cpp", + "torch/csrc/autograd/python_variable_indexing.cpp", +- "torch/csrc/dynamo/python_compiled_autograd.cpp", +- "torch/csrc/dynamo/cache_entry.cpp", +- "torch/csrc/dynamo/cpp_shim.cpp", +- "torch/csrc/dynamo/cpython_defs.c", +- "torch/csrc/dynamo/eval_frame.c", +- "torch/csrc/dynamo/eval_frame_cpp.cpp", +- "torch/csrc/dynamo/extra_state.cpp", +- "torch/csrc/dynamo/framelocals_mapping.cpp", +- "torch/csrc/dynamo/guards.cpp", +- "torch/csrc/dynamo/utils.cpp", +- "torch/csrc/dynamo/init.cpp", ++# "torch/csrc/dynamo/python_compiled_autograd.cpp", ++# "torch/csrc/dynamo/cache_entry.cpp", ++# "torch/csrc/dynamo/cpp_shim.cpp", ++# "torch/csrc/dynamo/cpython_defs.c", ++# "torch/csrc/dynamo/eval_frame.c", ++# "torch/csrc/dynamo/eval_frame_cpp.cpp", ++# 
"torch/csrc/dynamo/extra_state.cpp", ++# "torch/csrc/dynamo/framelocals_mapping.cpp", ++# "torch/csrc/dynamo/guards.cpp", ++# "torch/csrc/dynamo/utils.cpp", ++# "torch/csrc/dynamo/init.cpp", ++ "torch/csrc/dynamo/horrible_dynamo_stub.cpp", + "torch/csrc/functorch/init.cpp", + "torch/csrc/fx/node.cpp", + "torch/csrc/mps/Module.cpp", +diff --git a/torch/csrc/dynamo/horrible_dynamo_stub.cpp b/torch/csrc/dynamo/horrible_dynamo_stub.cpp +new file mode 100644 +index 000000000000..3ac1324d4557 +--- /dev/null ++++ b/torch/csrc/dynamo/horrible_dynamo_stub.cpp +@@ -0,0 +1,16 @@ ++#include ++#include ++ ++namespace torch::dynamo::autograd { ++const std::unique_ptr& getPyCompilerInterface() { ++ throw std::runtime_error("Dynamo not supported"); ++ return nullptr; ++} ++std::vector> get_input_metadata( ++ const edge_list& edges) { ++ std::vector> r; ++ throw std::runtime_error("Dynamo not supported"); ++ return r; ++} ++ ++} +-- +2.49.0 + diff --git a/next/0001-disable-use-of-aotriton.patch b/next/0001-disable-use-of-aotriton.patch deleted file mode 100644 index 61ffd1e..0000000 --- a/next/0001-disable-use-of-aotriton.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 07:06:09 -0700 -Subject: [PATCH] disable use of aotriton - ---- - .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- - 1 file changed, 15 insertions(+), 2 deletions(-) - -diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -index 214b02d8262e..7b3eb9dcd8cd 100644 ---- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -@@ -19,9 +19,12 @@ - #include - #include - -+#ifdef USE_FLASH_ATTENTION - #if USE_ROCM - #include - #endif -+#endif -+ - - /** - * Note [SDPA Runtime Dispatch] -@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { - - bool 
check_flash_attention_hardware_support(sdp_params const& params, bool debug) { - // Check that the gpu is capable of running flash attention -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - using sm80 = SMVersion<8, 0>; - using sm90 = SMVersion<9, 0>; - #if USE_ROCM -@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug - } - #endif - return true; -+#endif - } - - bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - // Mem Efficient attention supports hardware in the range [sm_50, sm_90] - using sm50 = SMVersion<5, 0>; - using sm90 = SMVersion<9, 0>; -@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) - } - #endif - return true; -+#endif - } - - bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( -@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - #ifndef USE_FLASH_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); - return false; --#endif -+#else - - // Define gate functions that determine if a flash kernel can be ran - // Replace with std::to_array when we migrate to c++20 -@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - } - } - return true; -+#endif - } - - bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - #ifndef USE_MEM_EFF_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); - return false; --#endif -+#else - // Constraints specific to mem efficient attention - constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = - array_of(at::kHalf, at::kFloat, at::kBFloat16); -@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - } - #endif - return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); 
-+#endif - } - - SDPBackend select_sdp_backend(sdp_params const& kernel_params) { --- -2.45.2 - diff --git a/pyproject.toml b/pyproject.toml index 9508ad0..925742b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,46 +1,165 @@ +# Package ###################################################################### + [build-system] requires = [ - "setuptools", - "wheel", - "astunparse", - "numpy", + # 70.1.0: min version for integrated bdist_wheel command from wheel package + # 77.0.0: min version for SPDX expression support for project.license + "setuptools>=70.1.0,<80.0", + "cmake>=3.27", "ninja", + "numpy", + "packaging", "pyyaml", - "cmake", - "typing-extensions", "requests", + "six", # dependency chain: NNPACK -> PeachPy -> six + "typing-extensions>=4.10.0", ] -# Use legacy backend to import local packages in setup.py -build-backend = "setuptools.build_meta:__legacy__" +build-backend = "setuptools.build_meta" +[dependency-groups] +dev = [ + # This list should be kept in sync with the requirements-build.txt + # in PyTorch root until the project fully migrates to pyproject.toml + # after which this can be removed as it is already specified in the + # [build-system] section + "setuptools>=70.1.0,<80.0", # setuptools develop deprecated on 80.0 + "cmake>=3.27", + "ninja", + "numpy", + "packaging", + "pyyaml", + "requests", + "six", # dependency chain: NNPACK -> PeachPy -> six + "typing-extensions>=4.10.0", -[tool.black] -# Uncomment if pyproject.toml worked fine to ensure consistency with flake8 -# line-length = 120 -target-version = ["py38", "py39", "py310", "py311"] + # This list should be kept in sync with the requirements.txt in + # PyTorch root until the project fully migrates to pyproject.toml + "build[uv]", + "expecttest>=0.3.0", + "filelock", + "fsspec>=0.8.5", + "hypothesis", + "jinja2", + "lintrunner; platform_machine != 's390x' and platform_machine != 'riscv64'", + "networkx>=2.5.1", + "optree>=0.13.0", + "psutil", + "sympy>=1.13.3", + 
"typing-extensions>=4.13.2", + "wheel", +] +[project] +name = "torch" +description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" +readme = "README.md" +requires-python = ">=3.10" +# TODO: change to `license = "BSD-3-Clause"` and enable PEP 639 after pinning setuptools>=77 +# FIXME: As of 2025.06.20, it is hard to ensure the minimum version of setuptools in our CI environment. +# TOML-table-based license deprecated in setuptools>=77, and the deprecation warning will be changed +# to an error on 2026.02.18. See also: https://github.com/pypa/setuptools/issues/4903 +license = { text = "BSD-3-Clause" } +authors = [{ name = "PyTorch Team", email = "packages@pytorch.org" }] +keywords = ["pytorch", "machine learning"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: C++", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] +dynamic = [ + "entry-points", + "dependencies", + "scripts", + "version", +] + +[project.urls] +Homepage = "https://pytorch.org" +Repository = "https://github.com/pytorch/pytorch" +Documentation = "https://pytorch.org/docs" +"Issue Tracker" = "https://github.com/pytorch/pytorch/issues" +Forum = "https://discuss.pytorch.org" + +[project.optional-dependencies] +optree = ["optree>=0.13.0"] +opt-einsum = ["opt-einsum>=3.3"] +pyyaml = ["pyyaml"] + +# Linter 
tools ################################################################# + +[tool.isort] +src_paths = ["caffe2", "torch", "torchgen", "functorch", "test"] +extra_standard_library = ["typing_extensions"] +skip_gitignore = true +skip_glob = ["third_party/*"] +atomic = true +profile = "black" +indent = 4 +line_length = 88 +lines_after_imports = 2 +multi_line_output = 3 +include_trailing_comma = true +combine_as_imports = true + +[tool.usort.known] +first_party = ["caffe2", "torch", "torchgen", "functorch", "test"] +standard_library = ["typing_extensions"] [tool.ruff] -target-version = "py38" +line-length = 88 +src = ["caffe2", "torch", "torchgen", "functorch", "test"] +[tool.ruff.format] +docstring-code-format = true +quote-style = "double" + +[tool.ruff.lint] # NOTE: Synchoronize the ignores with .flake8 +external = [ + "B001", + "B902", + "B950", + "E121", + "E122", + "E128", + "E131", + "E704", + "E723", + "F723", + "F812", + "P201", + "P204", + "T484", + "TOR901", +] ignore = [ # these ignores are from flake8-bugbear; please fix! "B007", "B008", "B017", "B018", # Useless expression - "B019", "B023", "B028", # No explicit `stacklevel` keyword argument found - "B904", "E402", "C408", # C408 ignored because we like the dict keyword argument syntax "E501", # E501 is not flexible enough, we're using B950 instead "E721", - "E731", # Assign lambda expression "E741", "EXE001", "F405", - "F841", + "FURB122", # writelines # these ignores are from flake8-logging-format; please fix! "G101", # these ignores are from ruff NPY; please fix! @@ -48,39 +167,49 @@ ignore = [ # these ignores are from ruff PERF; please fix! "PERF203", "PERF401", - "PERF403", # these ignores are from PYI; please fix! - "PYI019", "PYI024", "PYI036", "PYI041", "PYI056", "SIM102", "SIM103", "SIM112", # flake8-simplify code styles "SIM105", # these ignores are from flake8-simplify. 
please fix or ignore with commented reason - "SIM108", + "SIM108", # SIM108 ignored because we prefer if-else-block instead of ternary expression "SIM110", "SIM114", # Combine `if` branches using logical `or` operator "SIM115", "SIM116", # Disable Use a dictionary instead of consecutive `if` statements "SIM117", "SIM118", - "UP006", # keep-runtime-typing "UP007", # keep-runtime-typing + "UP045", # keep-runtime-typing + "TC006", + # TODO: Remove Python-3.10 specific suppressions + "B905", + "UP035", + "UP036", + "UP038", + "UP041", + "FURB161", ] -line-length = 120 select = [ "B", + "B904", # Re-raised error without specifying the cause via the from keyword "C4", "G", "E", "EXE", "F", "SIM1", + "SIM911", "W", # Not included in flake8 + "FURB", + "LOG", "NPY", "PERF", "PGH004", + "PIE790", "PIE794", "PIE800", "PIE804", @@ -89,40 +218,96 @@ select = [ "PLC0131", # type bivariance "PLC0132", # type param mismatch "PLC0205", # string as __slots__ + "PLC3002", # unnecessary-direct-lambda-call "PLE", "PLR0133", # constant comparison "PLR0206", # property with params "PLR1722", # use sys exit + "PLR1736", # unnecessary list index "PLW0129", # assert on string literal + "PLW0131", # named expr without context + "PLW0133", # useless exception statement + "PLW0245", # super without brackets "PLW0406", # import self "PLW0711", # binary op exception + "PLW1501", # bad open mode + "PLW1507", # shallow copy os.environ "PLW1509", # preexec_fn not safe with threads + "PLW2101", # useless lock statement "PLW3301", # nested min max "PT006", # TODO: enable more PT rules + "PT014", # duplicate parameterize case "PT022", "PT023", "PT024", "PT025", "PT026", "PYI", + "Q003", # avoidable escaped quote + "Q004", # unnecessary escaped quote + "RSE", "RUF008", # mutable dataclass default + "RUF013", # ban implicit optional "RUF015", # access first ele in constant time "RUF016", # type error non-integer index "RUF017", - "TRY200", - "TRY302", + "RUF018", # no assignment in assert + "RUF019", # 
unnecessary-key-check + "RUF020", # never union + "RUF024", # from keys mutable + "RUF026", # default factory kwarg + "RUF030", # No print statement in assert + "RUF033", # default values __post_init__ dataclass + "RUF041", # simplify nested Literal + "RUF048", # properly parse `__version__` + "RUF200", # validate pyproject.toml + "S324", # for hashlib FIPS compliance + "SLOT", + "TC", + "TRY002", # ban vanilla raise (todo fix NOQAs) + "TRY203", + "TRY401", # verbose-log-message "UP", + "YTT", ] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.pyupgrade] +# Preserve types, even if a file imports `from __future__ import annotations`. +keep-runtime-typing = true + +[tool.ruff.lint.per-file-ignores] "__init__.py" = [ "F401", ] +"*.pyi" = [ + "PYI011", # typed-argument-default-in-stub + "PYI021", # docstring-in-stub + "PYI053", # string-or-bytes-too-long +] +"functorch/notebooks/**" = [ + "F401", +] +"test/export/**" = [ + "PGH004" +] +"test/typing/**" = [ + "PGH004" +] "test/typing/reveal/**" = [ "F821", ] "test/torch_np/numpy_tests/**" = [ "F821", + "NPY201", +] +"test/dynamo/test_bytecode_utils.py" = [ + "F821", +] +"test/dynamo/test_debug_utils.py" = [ + "UP037", +] +"test/dynamo/test_misc.py" = [ + "PGH004", ] "test/jit/**" = [ "PLR0133", # tests require this for JIT @@ -136,19 +321,33 @@ select = [ "RUF015", "UP", # We don't want to modify the jit test as they test specify syntax ] - -"torch/onnx/**" = [ - "UP037", # ONNX does runtime type checking +"test/inductor/s429861_repro.py" = [ + "PGH004", +] +"test/inductor/test_torchinductor.py" = [ + "UP037", +] +# autogenerated #TODO figure out why file level noqa is ignored +"torch/_appdirs.py" = ["PGH004"] +"torch/jit/_shape_functions.py" = ["PGH004"] +"torch/_inductor/fx_passes/serialized_patterns/**" = ["F401", "F501"] +"torch/_inductor/autoheuristic/artifacts/**" = ["F401", "F501"] +"torch/_inductor/codegen/**" = [ + "PGH004" ] - "torchgen/api/types/__init__.py" = [ "F401", "F403", ] 
-"torchgen/executorch/api/types/__init__.py" = [ - "F401", - "F403", -] "torch/utils/collect_env.py" = [ "UP", # collect_env.py needs to work with older versions of Python ] +"torch/_vendor/**" = [ + "UP", # No need to mess with _vendor +] +"tools/linter/**" = [ + "LOG015" # please fix +] + +[tool.codespell] +ignore-words = "tools/linter/dictionary.txt" diff --git a/python-torch.spec b/python-torch.spec index 9945a66..d3c31d7 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,12 +6,20 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -%global commit0 d01a7a9faa5a742a3df7374b97bbc1db1205b6ed +# v2.9.0-rc9 +%global commit0 0fabc3ba44823f257e70ce397d989c8de5e362c1 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20240828 -%global pypi_version 2.5.0 +%global date0 20251008 +%global pypi_version 2.9.0 +%global flatbuffers_version 24.12.23 +%global miniz_version 3.0.2 +%global pybind11_version 2.13.6 +%global rc_tag -rc9 %else -%global pypi_version 2.4.0 +%global pypi_version 2.9.1 +%global flatbuffers_version 24.12.23 +%global miniz_version 3.0.2 +%global pybind11_version 2.13.6 %endif # For -test subpackage @@ -24,90 +32,29 @@ %ifarch x86_64 %bcond_without rocm %endif -%bcond_without hipblaslt -%bcond_without magma -%bcond_without rocm_loop -%global rocm_default_gpu default -%global rocm_gpu_list gfx9 -# Caffe2 support came in F41 -%if 0%{?fedora} > 40 -%bcond_without caffe2 -%else -%bcond_with caffe2 -%endif - -# Distributed support came in F41 -%if 0%{?fedora} > 40 -%bcond_without distributed # For testing distributed+rccl etc. 
%bcond_without rccl %bcond_with gloo %bcond_without mpi %bcond_without tensorpipe -%else -%bcond_with distributed -%endif - -# Do no confuse xnnpack versions -%if 0%{?fedora} > 40 -%bcond_without xnnpack -%else -%bcond_with xnnpack -%endif - -%bcond_without pthreadpool -%bcond_without pocketfft - -%ifarch x86_64 - %if %{with rocm} - %bcond_with fbgemm - %else - %bcond_without fbgemm - %endif -%else - %bcond_with fbgemm -%endif - -# For testing cuda -%ifarch x86_64 -%bcond_with cuda -%endif - -# Pick a version that works -%if %{with cuda} -%if 0%{?fedora} < 40 -%global cuda_ver 12.5 -%global cudart_ver 12-5 -%global cublas_ver 12-5 -%global cufft_ver 12-5 -%global curand_ver 12-5 -%global cusparse_ver 12-5 -%else -%global cuda_ver 12.5 -%endif -%endif - -# For testing compat-gcc -%global compat_gcc_major 13 -%bcond_with compat_gcc # Disable dwz with rocm because memory can be exhausted %if %{with rocm} %define _find_debuginfo_dwz_opts %{nil} %endif -%if %{with cuda} -# workaround problems with -pie -%global build_cxxflags %{nil} -%global build_ldflags %{nil} -%endif - # These came in 2.4 and not yet in Fedora %bcond_with opentelemetry %bcond_with httplib %bcond_with kineto +%if 0%{?fedora} +%bcond_without onnx +%else +%bcond_with onnx +%endif + Name: python-%{pypi_name} %if %{with gitcommit} Version: %{pypi_version}^git%{date0}.%{shortcommit0} @@ -126,15 +73,8 @@ Source1000: pyproject.toml %else Source0: %{forgeurl}/releases/download/v%{version}/pytorch-v%{version}.tar.gz %endif -Source1: https://github.com/google/flatbuffers/archive/refs/tags/v23.3.3.tar.gz -Source2: https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz - -%if %{with cuda} -%global cuf_ver 1.1.2 -Source10: https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v%{cuf_ver}.tar.gz -%global cul_ver 3.4.1 -Source11: https://github.com/NVIDIA/cutlass/archive/refs/tags/v%{cul_ver}.tar.gz -%endif +Source1: 
https://github.com/google/flatbuffers/archive/refs/tags/v%{flatbuffers_version}.tar.gz +Source2: https://github.com/pybind/pybind11/archive/refs/tags/v%{pybind11_version}.tar.gz # Developement on tensorpipe has stopped, repo made read only July 1, 2023, this is the last commit %global tp_commit 52791a2fd214b2a9dc5759d36725909c1daa7f2e @@ -147,36 +87,6 @@ Source21: https://github.com/libuv/libuv/archive/refs/tags/v1.41.0.tar.gz %global nop_scommit %(c=%{nop_commit}; echo ${c:0:7}) Source22: https://github.com/google/libnop/archive/%{nop_commit}/libnop-%{nop_scommit}.tar.gz -%if %{without xnnpack} -%global xnn_commit fcbf55af6cf28a4627bcd1f703ab7ad843f0f3a2 -%global xnn_scommit %(c=%{xnn_commit}; echo ${c:0:7}) -Source30: https://github.com/google/xnnpack/archive/%{xnn_commit}/xnnpack-%{xnn_scommit}.tar.gz -%global fx_commit 63058eff77e11aa15bf531df5dd34395ec3017c8 -%global fx_scommit %(c=%{fx_commit}; echo ${c:0:7}) -Source31: https://github.com/Maratyszcza/fxdiv/archive/%{fx_commit}/FXdiv-%{fx_scommit}.tar.gz -%global fp_commit 0a92994d729ff76a58f692d3028ca1b64b145d91 -%global fp_scommit %(c=%{fp_commit}; echo ${c:0:7}) -Source32: https://github.com/Maratyszcza/FP16/archive/%{fp_commit}/FP16-%{fp_scommit}.tar.gz -%global ps_commit 072586a71b55b7f8c584153d223e95687148a900 -%global ps_scommit %(c=%{ps_commit}; echo ${c:0:7}) -Source33: https://github.com/Maratyszcza/psimd/archive/%{ps_commit}/psimd-%{ps_scommit}.tar.gz -%global ci_commit 16bfc1622c6902d6f91d316ec54894910c620325 -%global ci_scommit %(c=%{ci_commit}; echo ${c:0:7}) -Source34: https://github.com/pytorch/cpuinfo/archive/%{ci_commit}/cpuinfo-%{ci_scommit}.tar.gz -%endif - -%if %{without pthreadpool} -%global pt_commit 4fe0e1e183925bf8cfa6aae24237e724a96479b8 -%global pt_scommit %(c=%{pt_commit}; echo ${c:0:7}) -Source40: https://github.com/Maratyszcza/pthreadpool/archive/%{pt_commit}/pthreadpool-%{pt_scommit}.tar.gz -%endif - -%if %{without pocketfft} -%global pf_commit 
076cb3d2536b7c5d0629093ad886e10ac05f3623 -%global pf_scommit %(c=%{pf_commit}; echo ${c:0:7}) -Source50: https://github.com/mreineck/pocketfft/archive/%{pf_commit}/pocketfft-%{pf_scommit}.tar.gz -%endif - %if %{without opentelemetry} %global ot_ver 1.14.2 Source60: https://github.com/open-telemetry/opentelemetry-cpp/archive/refs/tags/v%{ot_ver}.tar.gz @@ -189,115 +99,60 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- %endif %if %{without kineto} -%global ki_commit be1317644c68b4bfc4646024a6b221066e430031 +%global ki_commit 5e7501833f1021ce6f618572d3baf657b6319658 %global ki_scommit %(c=%{ki_commit}; echo ${c:0:7}) Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif -%if %{without gitcommit} -Patch0: 0001-no-third_party-foxi.patch -# https://github.com/pytorch/pytorch/pull/131282 -Patch1: 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch +%global ox_ver 1.18.0 +Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz -%endif - - -%if %{with gitcommit} -Patch11: 0002-Improve-finding-and-using-the-rocm_version.h.patch -%endif - -# ROCm patches -# Patches need to be refactored for ToT -# These are ROCm packages -%if %{without cuda} -# https://github.com/pytorch/pytorch/pull/120551 -%if %{without hipblaslt} -Patch100: 0001-Optionally-use-hipblaslt.patch -%endif -Patch101: 0001-cuda-hip-signatures.patch -%if %{without gitcommit} -Patch102: 0001-silence-an-assert.patch -%endif -%if %{without gitcommit} -Patch105: 0001-disable-use-of-aotriton.patch -%endif -%endif - -%if %{without gitcommit} -Patch106: 0001-include-fmt-ranges.h-for-using-fmt-join.patch -%endif - -ExclusiveArch: x86_64 aarch64 +%global pt_arches x86_64 aarch64 +ExclusiveArch: %pt_arches %global toolchain gcc %global _lto_cflags %nil BuildRequires: cmake BuildRequires: eigen3-devel -%if %{with fbgemm} -BuildRequires: asmjit-devel -BuildRequires: fbgemm-devel -%endif BuildRequires: flexiblas-devel BuildRequires: 
fmt-devel -%if %{with caffe2} BuildRequires: foxi-devel -%endif - -%if %{with compat_gcc} -BuildRequires: gcc%{compat_gcc_major}-c++ -BuildRequires: gcc%{compat_gcc_major}-gfortran -%else BuildRequires: gcc-c++ BuildRequires: gcc-gfortran -%endif -%if %{with distributed} %if %{with gloo} BuildRequires: gloo-devel %endif -%endif -%if %{with gitcommit} BuildRequires: json-devel -%endif BuildRequires: libomp-devel +BuildRequires: moodycamel-concurrentqueue-devel BuildRequires: numactl-devel BuildRequires: ninja-build +%if %{with onnx} BuildRequires: onnx-devel -%if %{with distributed} +%endif %if %{with mpi} BuildRequires: openmpi-devel %endif -%endif BuildRequires: protobuf-devel BuildRequires: sleef-devel BuildRequires: valgrind-devel - -%if %{with pocketfft} BuildRequires: pocketfft-devel -%endif - -%if %{with pthreadpool} BuildRequires: pthreadpool-devel -%endif -%if %{with xnnpack} BuildRequires: cpuinfo-devel BuildRequires: FP16-devel BuildRequires: fxdiv-devel BuildRequires: psimd-devel -%if %{with gitcommit} BuildRequires: xnnpack-devel = 0.0^git20240814.312eb7e -%else -BuildRequires: xnnpack-devel = 0.0^git20240229.fcbf55a -%endif -%endif BuildRequires: python3-devel BuildRequires: python3dist(filelock) BuildRequires: python3dist(jinja2) BuildRequires: python3dist(networkx) BuildRequires: python3dist(numpy) +BuildRequires: python3dist(pip) BuildRequires: python3dist(pyyaml) BuildRequires: python3dist(setuptools) BuildRequires: python3dist(sphinx) @@ -311,100 +166,47 @@ BuildRequires: python3dist(sympy) %if %{with rocm} BuildRequires: hipblas-devel -%if %{with hipblaslt} BuildRequires: hipblaslt-devel -%endif BuildRequires: hipcub-devel BuildRequires: hipfft-devel BuildRequires: hiprand-devel BuildRequires: hipsparse-devel +BuildRequires: hipsparselt-devel BuildRequires: hipsolver-devel -%if %{with magma} -BuildRequires: magma-devel -%endif +# Magma is broken on ROCm 7 +# BuildRequires: magma-devel BuildRequires: miopen-devel BuildRequires: rocblas-devel 
BuildRequires: rocrand-devel BuildRequires: rocfft-devel -%if %{with distributed} %if %{with rccl} BuildRequires: rccl-devel %endif -%endif BuildRequires: rocprim-devel BuildRequires: rocm-cmake BuildRequires: rocm-comgr-devel +BuildRequires: rocm-compilersupport-macros BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros -BuildRequires: rocm-rpm-macros-modules +BuildRequires: rocsolver-devel +BuildRequires: rocm-smi-devel BuildRequires: rocthrust-devel BuildRequires: roctracer-devel Requires: amdsmi -Requires: rocm-rpm-macros-modules %endif -%if %{with cuda} -BuildRequires: cuda-cudart-devel-%{cudart_ver} -BuildRequires: libcublas-devel-%{cublas_ver} -BuildRequires: libcufft-devel-%{cufft_ver} -BuildRequires: libcurand-devel-%{curand_ver} -BuildRequires: libcusparse-devel-%{cusparse_ver} -%endif - %if %{with test} BuildRequires: google-benchmark-devel %endif Requires: python3dist(dill) +Requires: python3dist(yaml) -# For convience -Provides: pytorch - -# Apache-2.0 -Provides: bundled(flatbuffers) = 22.3.3 -# MIT -Provides: bundled(miniz) = 2.1.0 -Provides: bundled(pybind11) = 2.11.1 - -%if %{with tensorpipe} -# BSD-3-Clause -Provides: bundled(tensorpipe) -# Apache-2.0 -Provides: bundled(libnop) -# MIT AND CC-BY-4.0 AND ISC AND BSD-2-Clause -Provides: bundled(libuv) = 1.41.0 -%endif - -# These are already in Fedora -%if %{without xnnpack} -# BSD-3-Clause -Provides: bundled(xnnpack) -# MIT -Provides: bundled(FP16) -# MIT -Provides: bundled(fxdiv) -# MIT -Provides: bundled(psimd) -# BSD-2-Clause -Provides: bundled(cpuinfo) -%endif - -%if %{without pthreadpool} -# BSD-2-Clause -Provides: bundled(pthreadpool) -%endif - -%if %{without pocketfft} -# BSD-3-Clause -Provides: bundled(pocketfft) -%endif - -# For convience -Provides: pytorch +Obsoletes: caffe = 1.0^git20200212.9b89154 %description PyTorch is a Python package that provides two high-level features: @@ -418,6 +220,24 @@ and Cython to extend 
PyTorch when needed. %package -n python3-%{pypi_name} Summary: %{summary} +# For convenience +Provides: pytorch + +# Apache-2.0 +Provides: bundled(flatbuffers) = %{flatbuffers_version} +# MIT +Provides: bundled(miniz) = %{miniz_version} +Provides: bundled(pybind11) = %{pybind11_version} + +%if %{with tensorpipe} +# BSD-3-Clause +Provides: bundled(tensorpipe) +# Apache-2.0 +Provides: bundled(libnop) +# MIT AND CC-BY-4.0 AND ISC AND BSD-2-Clause +Provides: bundled(libuv) = 1.41.0 +%endif + %description -n python3-%{pypi_name} PyTorch is a Python package that provides two high-level features: @@ -427,20 +247,6 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. -%if %{with cuda} -%package -n python3-%{pypi_name}-cuda-%{cudart_ver} -Summary: %{name} for CUDA - -%description -n python3-%{pypi_name}-cuda-%{cudart_ver} -%{summary} -%endif - -%package -n python3-%{pypi_name}-rocm-gfx9 -Summary: %{name} for ROCm gfx9 - -%description -n python3-%{pypi_name}-rocm-gfx9 -%{summary} - %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -467,20 +273,11 @@ rm -rf %{pypi_name}.egg-info tar xf %{SOURCE1} rm -rf third_party/flatbuffers/* -cp -r flatbuffers-23.3.3/* third_party/flatbuffers/ +cp -r flatbuffers-%{flatbuffers_version}/* third_party/flatbuffers/ tar xf %{SOURCE2} rm -rf third_party/pybind11/* -cp -r pybind11-2.11.1/* third_party/pybind11/ - -%if %{with cuda} -tar xf %{SOURCE10} -rm -rf third_party/cudnn_frontend/* -cp -r cudnn-frontend-%{cuf_ver}/* third_party/cudnn_frontend/ -tar xf %{SOURCE11} -rm -rf third_party/cutlass/* -cp -r cutlass-%{cul_ver}/* third_party/cutlass/ -%endif +cp -r pybind11-%{pybind11_version}/* third_party/pybind11/ %if %{with tensorpipe} tar xf %{SOURCE20} rm -rf 
third_party/tensorpipe/third_party/libnop/* cp -r libnop-*/* third_party/tensorpipe/third_party/libnop/ -%endif -%if %{without xnnpack} -tar xf %{SOURCE30} -rm -rf third_party/XNNPACK/* -cp -r XNNPACK-*/* third_party/XNNPACK/ -tar xf %{SOURCE31} -rm -rf third_party/FXdiv/* -cp -r FXdiv-*/* third_party/FXdiv/ -tar xf %{SOURCE32} -rm -rf third_party/FP16/* -cp -r FP16-*/* third_party/FP16/ -tar xf %{SOURCE33} -rm -rf third_party/psimd/* -cp -r psimd-*/* third_party/psimd/ -tar xf %{SOURCE34} -rm -rf third_party/cpuinfo/* -cp -r cpuinfo-*/* third_party/cpuinfo/ -%endif - -%if %{without pthreadpool} -tar xf %{SOURCE40} -rm -rf third_party/pthreadpool/* -cp -r pthreadpool-*/* third_party/pthreadpool/ -%endif - -%if %{without pocketfft} -tar xf %{SOURCE50} -rm -rf third_party/pocketfft/* -cp -r pocketfft-*/* third_party/pocketfft/ +# gcc 15 include cstdint +sed -i '/#include ' third_party/tensorpipe/tensorpipe/common/allocator.h +sed -i '/#include ' third_party/tensorpipe/tensorpipe/common/memory.h %endif %if %{without opentelemtry} @@ -542,40 +313,63 @@ rm -rf third_party/kineto/* cp -r kineto-*/* third_party/kineto/ %endif -# hipblaslt only building with gfx90a -%if %{with hipblaslt} -sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a"@' aten/src/ATen/native/cuda/Blas.cpp +%if %{without onnx} +tar xf %{SOURCE90} +rm -rf third_party/onnx/* +cp -r onnx-*/* third_party/onnx/ %endif +# Adjust for the hipblaslt's we build +sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a", "gfx1103", "gfx1150", "gfx1151", "gfx1100", "gfx1101", "gfx1200", "gfx1201"@' aten/src/ATen/native/cuda/Blas.cpp + %if 0%{?rhel} # In RHEL but too old sed -i -e '/typing-extensions/d' setup.py # Need to pip these sed -i -e '/sympy/d' setup.py sed -i -e '/fsspec/d' setup.py +%else +# for 2.5.0 +sed -i -e 's@sympy==1.13.1@sympy>=1.13.1@' setup.py %endif # A new dependency # Connected to USE_FLASH_ATTENTION, since this is off, do not need it sed -i -e '/aotriton.cmake/d' 
cmake/Dependencies.cmake +# Compress hip +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc --offload-compress@' cmake/Dependencies.cmake +# Silence noisy warning +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-pass-failed@' cmake/Dependencies.cmake +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unused-command-line-argument@' cmake/Dependencies.cmake +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unused-result@' cmake/Dependencies.cmake +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-deprecated-declarations@' cmake/Dependencies.cmake +# Use parallel jobs +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel-jobs=4@' cmake/Dependencies.cmake +# Need to link with librocm_smi64 +sed -i -e 's@hiprtc::hiprtc@hiprtc::hiprtc rocm_smi64@' cmake/Dependencies.cmake # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt +sed -i -e 's@fmt::fmt-header-only@fmt@' aten/src/ATen/CMakeLists.txt +sed -i -e 's@list(APPEND ATen_HIP_INCLUDE $)@@' aten/src/ATen/CMakeLists.txt + +sed -i -e 's@fmt::fmt-header-only@fmt@' third_party/kineto/libkineto/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' c10/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' torch/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' cmake/Dependencies.cmake -%if %{with gitcommit} sed -i -e 's@fmt::fmt-header-only@fmt@' caffe2/CMakeLists.txt -%endif + sed -i -e 's@add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt)@#add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt)@' cmake/Dependencies.cmake sed -i -e 's@set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "")@#set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "")@' cmake/Dependencies.cmake sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPEND Caffe2_DEPENDENCY_LIBS 
fmt::fmt-header-only)@' cmake/Dependencies.cmake # No third_party FXdiv -%if %{with xnnpack} sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt -%endif + +# https://github.com/pytorch/pytorch/issues/149803 +# Tries to checkout nccl +sed -i -e 's@ checkout_nccl()@ True@' tools/build_pytorch_libs.py # Disable the use of check_submodule's in the setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -588,7 +382,7 @@ sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py # the third_party dir to compile the file. # mimiz is licensed MIT # https://github.com/richgel999/miniz/blob/master/LICENSE -mv third_party/miniz-2.1.0 . +mv third_party/miniz-%{miniz_version} . # # setup.py depends on this script mv third_party/build_bundled.py . @@ -598,31 +392,10 @@ mv third_party/flatbuffers . mv third_party/pybind11 . -%if %{with cuda} -mv third_party/cudnn_frontend . -mv third_party/cutlass . -%endif - %if %{with tensorpipe} mv third_party/tensorpipe . %endif -%if %{without xnnpack} -mv third_party/XNNPACK . -mv third_party/FXdiv . -mv third_party/FP16 . -mv third_party/psimd . -mv third_party/cpuinfo . -%endif - -%if %{without pthreadpool} -mv third_party/pthreadpool . -%endif - -%if %{without pocketfft} -mv third_party/pocketfft . -%endif - %if %{without opentelemetry} mv third_party/opentelemetry-cpp . %endif @@ -635,6 +408,10 @@ mv third_party/cpp-httplib . mv third_party/kineto . %endif +%if %{without onnx} +mv third_party/onnx . +%endif + %if %{with test} mv third_party/googletest . %endif @@ -643,35 +420,14 @@ mv third_party/googletest . 
rm -rf third_party/* # Put stuff back mv build_bundled.py third_party -mv miniz-2.1.0 third_party +mv miniz-%{miniz_version} third_party mv flatbuffers third_party mv pybind11 third_party -%if %{with cuda} -mv cudnn_frontend third_party -mv cutlass third_party -%endif - %if %{with tensorpipe} mv tensorpipe third_party %endif -%if %{without xnnpack} -mv XNNPACK third_party -mv FXdiv third_party -mv FP16 third_party -mv psimd third_party -mv cpuinfo third_party -%endif - -%if %{without pthreadpool} -mv pthreadpool third_party -%endif - -%if %{without pocketfft} -mv pocketfft third_party -%endif - %if %{without opentelemetry} mv opentelemetry-cpp third_party %endif @@ -684,15 +440,18 @@ mv cpp-httplib third_party mv kineto third_party %endif +%if %{without onnx} +mv onnx third_party +%endif + %if %{with test} mv googletest third_party %endif -%if %{with pocketfft} # # Fake out pocketfft, and system header will be used mkdir third_party/pocketfft -%endif +cp /usr/include/pocketfft_hdronly.h third_party/pocketfft/ # # Use the system valgrind headers @@ -705,11 +464,23 @@ sed -i -e 's@DESTINATION ${PYTHON_LIB_REL_PATH}@DESTINATION ${CMAKE_INSTALL_PREF # reenable foxi linking sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@#list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@' cmake/Dependencies.cmake +# cmake version changed +sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' third_party/tensorpipe/third_party/libuv/CMakeLists.txt +sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' libuv*/CMakeLists.txt +%if %{without opentelemetry} +sed -i -e 's@cmake_minimum_required(VERSION 3.1)@cmake_minimum_required(VERSION 3.5)@' third_party/opentelemetry-cpp/CMakeLists.txt +%endif + %if %{with rocm} # hipify ./tools/amd_build/build_amd.py # Fedora installs to /usr/include, not /usr/include/rocm-core sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/TunableGemm.h +# 
https://github.com/pytorch/pytorch/issues/149805 +sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' cmake/public/LoadHIP.cmake +# Fedora installs to /usr/include, not /usr/include/rocm-core +sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/Tunable.cpp +sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/cuda/tunable/Tunable.cpp # use any hip, correct CMAKE_MODULE_PATH sed -i -e 's@lib/cmake/hip@lib64/cmake/hip@' cmake/public/LoadHIP.cmake sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake @@ -718,31 +489,32 @@ sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake %endif -%if %{with cuda} - -# TBD - -%endif +# moodycamel include path needs adjusting to use the system's +sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/concurrentqueue@' cmake/Dependencies.cmake %build +# Export the arches +# echo "%%pytorch_arches %pt_arches" > macros.pytorch + # # Control the number of jobs # # The build can fail if too many threads exceed the physical memory -# So count core and and memory and increase the build memory util the build succeeds +# Run at least one thread, more if CPU & memory resources are available. 
# +%ifarch x86_64 # Real cores, No hyperthreading COMPILE_JOBS=`cat /proc/cpuinfo | grep -m 1 'cpu cores' | awk '{ print $4 }'` +%else +# cpuinfo format varies on other arches, fall back to nproc +COMPILE_JOBS=`nproc` +%endif if [ ${COMPILE_JOBS}x = x ]; then COMPILE_JOBS=1 fi # Take into account memmory usage per core, do not thrash real memory -%if %{with cuda} -BUILD_MEM=4 -%else BUILD_MEM=2 -%endif MEM_KB=0 MEM_KB=`cat /proc/meminfo | grep MemTotal | awk '{ print $2 }'` MEM_MB=`eval "expr ${MEM_KB} / 1024"` @@ -753,12 +525,6 @@ if [ "$COMPILE_JOBS_MEM" -lt "$COMPILE_JOBS" ]; then fi export MAX_JOBS=$COMPILE_JOBS -%if %{with compat_gcc} -export CC=%{_bindir}/gcc-%{compat_gcc_major} -export CXX=%{_bindir}/g++-%{compat_gcc_major} -export FC=%{_bindir}/gfortran-%{compat_gcc_major} -%endif - # For debugging setup.py # export SETUPTOOLS_SCM_DEBUG=1 @@ -768,10 +534,7 @@ export FC=%{_bindir}/gfortran-%{compat_gcc_major} # export CMAKE_SHARED_LINKER_FLAGS=-Wl,--verbose # Manually set this hardening flag -# CUDA is unhappy with pie, so do not use it -%if %{without cuda} export CMAKE_EXE_LINKER_FLAGS=-pie -%endif export BUILD_CUSTOM_PROTOBUF=OFF export BUILD_NVFUSER=OFF @@ -784,17 +547,12 @@ export INTERN_BUILD_MOBILE=OFF export USE_DISTRIBUTED=OFF export USE_CUDA=OFF export USE_FAKELOWP=OFF -%if %{with fbgemm} -export USE_FBGEMM=ON -export USE_SYSTEM_FBGEMM=ON -%else export USE_FBGEMM=OFF -%endif export USE_FLASH_ATTENTION=OFF -export USE_GOLD_LINKER=ON export USE_GLOO=OFF export USE_ITT=OFF export USE_KINETO=OFF +export USE_KLEIDIAI=OFF export USE_LITE_INTERPRETER_PROFILER=OFF export USE_LITE_PROTO=OFF export USE_MAGMA=OFF @@ -809,37 +567,22 @@ export USE_PYTORCH_QNNPACK=OFF export USE_ROCM=OFF export USE_SYSTEM_SLEEF=ON export USE_SYSTEM_EIGEN_INSTALL=ON +%if %{with onnx} export USE_SYSTEM_ONNX=ON +%endif export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF +export USE_SYSTEM_NCCL=OFF export USE_TENSORPIPE=OFF -export USE_XNNPACK=ON +export USE_XNNPACK=OFF export 
USE_XPU=OFF - -%if %{with pthreadpool} export USE_SYSTEM_PTHREADPOOL=ON -%endif - -%if %{with xnnpack} export USE_SYSTEM_CPUINFO=ON export USE_SYSTEM_FP16=ON export USE_SYSTEM_FXDIV=ON export USE_SYSTEM_PSIMD=ON -export USE_SYSTEM_XNNPACK=ON -%endif +export USE_SYSTEM_XNNPACK=OFF -%if %{with cuda} -%if %{without rocm} -export CPLUS_INCLUDE_PATH=/usr/local/cuda-%{cuda_ver}/include -export CUDACXX=/usr/local/cuda-%{cuda_ver}/bin/nvcc -export CUDA_HOME=/usr/local/cuda-%{cuda_ver}/ -export USE_CUDA=ON -# The arches to build for -export TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0" -%endif -%endif - -%if %{with distributed} export USE_DISTRIBUTED=ON %if %{with tensorpipe} export USE_TENSORPIPE=ON @@ -853,7 +596,6 @@ export USE_SYSTEM_GLOO=ON %if %{with mpi} export USE_MPI=ON %endif -%endif %if %{with test} export BUILD_TEST=ON @@ -868,112 +610,73 @@ export BUILD_TEST=ON # # See BZ 2244862 - %if %{with rocm} export USE_ROCM=ON -%if %{with magma} -export USE_MAGMA=ON -%endif +export USE_ROCM_CK_SDPA=OFF +export USE_ROCM_CK_GEMM=OFF +export USE_FBGEMM_GENAI=OFF + +# Magma is broken on ROCm 7 +# export USE_MAGMA=ON export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` -export HIP_CLANG_PATH=`hipconfig -l` -RESOURCE_DIR=`${HIP_CLANG_PATH}/clang -print-resource-dir` -export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode +#RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` +#export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode -gpu=%{rocm_default_gpu} -module load rocm/$gpu -export PYTORCH_ROCM_ARCH=$ROCM_GPUS -%py3_build -mv build build-${gpu} -module purge +# pytorch uses clang, not hipcc +export HIP_CLANG_PATH=%{rocmllvm_bindir} +export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%if %{with rocm_loop} -for gpu in %{rocm_gpu_list} -do - module load rocm/$gpu - export PYTORCH_ROCM_ARCH=$ROCM_GPUS - %py3_build - mv build build-${gpu} - module purge -done %endif +%if 0%{?fedora} +%pyproject_wheel %else - %py3_build - %endif + %install -%if %{with compat_gcc} -export 
CC=%{_bindir}/gcc%{compat_gcc_major} -export CXX=%{_bindir}/g++%{compat_gcc_major} -export FC=%{_bindir}/gfortran%{compat_gcc_major} -%endif +# pytorch rpm macros +# install -Dpm 644 macros.pytorch \ +# %{buildroot}%{_rpmmacrodir}/macros.pytorch %if %{with rocm} export USE_ROCM=ON +export USE_ROCM_CK=OFF export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` -export HIP_CLANG_PATH=`hipconfig -l` -RESOURCE_DIR=`${HIP_CLANG_PATH}/clang -print-resource-dir` -export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode +# RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` +# export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode -gpu=%{rocm_default_gpu} -module load rocm/$gpu -export PYTORCH_ROCM_ARCH=$ROCM_GPUS -mv build-${gpu} build -%py3_install -mv build build-${gpu} -module purge +# pytorch uses clang, not hipcc +export HIP_CLANG_PATH=%{rocmllvm_bindir} +export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%if %{with rocm_loop} -for gpu in %{rocm_gpu_list} -do - module load rocm/$gpu - export PYTORCH_ROCM_ARCH=$ROCM_GPUS - mv build-${gpu} build - # need to customize the install location, so replace py3_install - %{__python3} %{py_setup} %{?py_setup_args} install -O1 --skip-build --root %{buildroot} --prefix /usr/lib64/rocm/${gpu} %{?*} - rm -rfv %{buildroot}/usr/lib/rocm/${gpu}/bin/__pycache__ - mv build build-${gpu} - module purge -done %endif +%if 0%{?fedora} +%pyproject_install +%pyproject_save_files '*torch*' %else - %py3_install - - %endif + +%check +# Not working yet +# pyproject_check_import torch + # Do not remote the empty files -%if %{with cuda} -%files -n python3-%{pypi_name}-cuda-%{cudart_ver} -%else %files -n python3-%{pypi_name} -%endif %license LICENSE %doc README.md -%{_bindir}/convert-caffe2-to-onnx -%{_bindir}/convert-onnx-to-caffe2 %{_bindir}/torchrun -%{python3_sitearch}/%{pypi_name} -%{python3_sitearch}/%{pypi_name}-*.egg-info +%{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch -%{python3_sitearch}/torchgen - -%if %{with 
rocm} - -%files -n python3-%{pypi_name}-rocm-gfx9 -%{_libdir}/rocm/gfx9/bin/* -%{_libdir}/rocm/gfx9/lib64/* - -%endif %changelog %autochangelog diff --git a/sources b/sources index a4dbc9a..9a3681f 100644 --- a/sources +++ b/sources @@ -1,18 +1,19 @@ -SHA512 (pytorch-v2.1.0.tar.gz) = 59421bf6cea6661d61ed66ab16526e3a07162e70e53381cbd5987042917610ec993d2f151fb086f0f98e5a396fe69e82bbc76f840bebffe4ebe7f50458c3aa44 -SHA512 (pytorch-v2.1.2.tar.gz) = b7305407ad9dda877d277a0e7009f65f6d69f39370f2231b8bb8c6a9b711022d2129febdb00f5c83751b6664e01000fe2d30c5e5c13757de89fb8b2b99197a28 -SHA512 (pytorch-975d428.tar.gz) = a02195b18d832db9a739c3eeecd0cd0c8868d8b92e4a2fca42e4bdd20735f0745d84573df28d9ae1db014cf79ffd005a8409b3e8bb92f9db2a446f784ef46ff4 +SHA512 (pytorch-v2.7.0.tar.gz) = 17e875a66f1669901f5f770c9d829ba5bfa3967296cfb71550e8a92507181db742548eaf7cc9a2c478c4b91e366f27cc480e2e1bbb328db8501d30e1649839e6 SHA512 (v23.3.3.tar.gz) = 4066c94f2473c7ea16917d29a613e16f840a329089c88e0bdbdb999aef3442ba00abfd2aa92266fa9c067e399dc88e6f0ccac40dc151378857e665638e78bbf0 -SHA512 (v2.11.1.tar.gz) = ed1512ff0bca3bc0a45edc2eb8c77f8286ab9389f6ff1d5cb309be24bc608abbe0df6a7f5cb18c8f80a3bfa509058547c13551c3cd6a759af708fd0cdcdd9e95 -SHA512 (pytorch-6a89a75.tar.gz) = 6978acc6f37d7c5adc71517a6f379c7133b2bbd040189deddba7753acde41f6ddba2e9f2e397928e89c776d6a5458b8a74f8e04beb312d71fd30b072687ba98f -SHA512 (pytorch-74832f1.tar.gz) = bd553bfbbb422d353bbbf616c201251b2517b905e2621fa05bfe3d97726b078caad377583adccdc0cca234235a11fcb4730a93e834907b2ca4c06d552b2a2683 -SHA512 (pytorch-4bb5cb5.tar.gz) = 430ae996ddee560537787646ae9f7aa01498f37c99c2e3fe4c5f66ee732ee3fe4ecf337fdf857bc0c7fe27634af75cee3ce576bbe2576463b81e27dbbfacf6ef +SHA512 (v2.13.6.tar.gz) = 497c25b33b09a9c42f67131ab82e35d689e8ce089dd7639be997305ff9a6d502447b79c824508c455d559e61f0186335b54dd2771d903a7c1621833930622d1a SHA512 (tensorpipe-52791a2.tar.gz) = 
1e5faf17a7236c5506c08cb28be16069b11bb929bbca64ed9745ce4277d46739186ab7d6597da7437d90ed2d166d4c37ef2f3bceabe8083ef3adbb0e8e5f227e SHA512 (v1.41.0.tar.gz) = bb08a1970a10e8d9571ffea3d021643de30ec212cd51317b98d6cf0cfe55d6877992921fb01d1188a6d466687335b77885685d924f8cb7200a0bec30eee05c65 SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3522470b1ecc4798ff43d344a013cee2f6901e83267c6167072947b754e63f1552ae7044cffe234c36 -SHA512 (pytorch-97ff6cf.tar.gz) = 105ebcba298558fe833f90e7e40b003d35a74609e777f9dc4c47f5668c884f603455113ac0ff252a62b83c81137ae66ceb1a862d351203925dcfc3dcf9f73580 -SHA512 (pytorch-v2.3.0.tar.gz) = 0c2ffc7bf2fd86070e9958c34eca1f03a0248a011ac6ffaeb69f65306ff856edd5359986f02af25888433187e6d7f29b60edded092e2ac30c8cec49023166eda -SHA512 (pytorch-v2.3.1.tar.gz) = fe132251b2bae87b70ba3d95dc32f6a4545970d11893118b0ebe6ca129732e516ef4d6cc4f380b3db9bb2277d1db8ce78a401c40149bb1dfbab76eab9e3992c4 -SHA512 (pytorch-v2.4.0.tar.gz) = bcfca6aefee00d15d1c0a2456cd9d802d9a8e96816d421d10e3eed5c608bfdf23dfae492ca3638c0bae99ef5bb8c98f4774c0b9f1a8b94d4dc36a52226033314 SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7 SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a +SHA512 (pytorch-a1cb3cc.tar.gz) = 92bf8b2c2ef0b459406b60169ecebdc50652c75943e3d6087e4d261f6e308dbad365529561e0f07ea3f0b71790efb68b5e4ab2f44e270462097208d924dc2d95 +SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185c9ef602294b67d73aea7ae371035579a1419cbfbeba7c3e88b31b5a5848db98f5e8a03b982b1 +SHA512 (kineto-5e75018.tar.gz) = 
921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 +SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 +SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d +SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 +SHA512 (pytorch-fd36458.tar.gz) = acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc +SHA512 (pytorch-0fabc3b.tar.gz) = 2e87975de0bf6f3dcede168b379e1928712bca16170c2a8ee7d63459f53086c01baac05e0763e4d5d28cdaf1c7d8912225ee06adeff96ead4f6f456ee174b341 +SHA512 (pytorch-v2.9.0.tar.gz) = ae989e3a7fe30f9ea90944dc25e21ca92f2a94ee40d8de974a168c292d82c16ee8920624eff91a85755469ad05473dce0f85893e3ed7794ec5c6bdd89cbd2023 +SHA512 (pytorch-v2.9.1.tar.gz) = 88de0289fa2760abd69bef505b5ae3b6d7ff176b415cbb31bbc89ce5476a3800b322a97c4490f270f8b89657aff931bf9a5516202b268e0bb8b1f63dbb87b34a