From 3c49a148a5de2274007c910e0bb1268298720dcd Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 11 Jun 2024 05:27:49 -0600 Subject: [PATCH 01/88] Reduce amd gpu list on F40 Signed-off-by: Tom Rix --- python-torch.spec | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python-torch.spec b/python-torch.spec index 6f102ca..ea60d27 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -33,7 +33,12 @@ # hipblaslt is in development %bcond_with hipblaslt # Which families gpu build for +%if 0%{?fedora} > 40 %global rocm_gpu_list gfx8 gfx9 gfx10 gfx11 gfx90a gfx942 gfx1100 +%else +# F40 +%global rocm_gpu_list gfx8 gfx9 gfx10 gfx11 +%endif %global rocm_default_gpu default %bcond_without rocm_loop @@ -436,6 +441,7 @@ Summary: %{name} for ROCm gfx11 %description -n python3-%{pypi_name}-rocm-gfx11 %{summary} +%if 0%{?fedora} > 40 %package -n python3-%{pypi_name}-rocm-gfx90a Summary: %{name} for ROCm MI200 %description -n python3-%{pypi_name}-rocm-gfx90a @@ -450,6 +456,7 @@ Summary: %{name} for ROCm MI300 Summary: %{name} for W7900 %description -n python3-%{pypi_name}-rocm-gfx1100 %{summary} +%endif %endif @@ -985,6 +992,7 @@ done %{_libdir}/rocm/gfx11/bin/* %{_libdir}/rocm/gfx11/lib64/* +%if 0%{?fedora} > 40 %files -n python3-%{pypi_name}-rocm-gfx90a %{_libdir}/rocm/gfx90a/bin/* %{_libdir}/rocm/gfx90a/lib64/* @@ -996,6 +1004,7 @@ done %files -n python3-%{pypi_name}-rocm-gfx1100 %{_libdir}/rocm/gfx1100/bin/* %{_libdir}/rocm/gfx1100/lib64/* +%endif %endif From af5905971ba0fc355070ae33ca0b923e729b535e Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 11 Jun 2024 14:39:24 -0600 Subject: [PATCH 02/88] Fix broken cpuinfo for aarch64 Signed-off-by: Tom Rix --- python-torch.spec | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index ea60d27..29db936 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -172,6 +172,9 @@ Source32: https://github.com/Maratyszcza/FP16/archive/%{fp_commit}/FP16-%{ %global 
ps_commit 072586a71b55b7f8c584153d223e95687148a900 %global ps_scommit %(c=%{ps_commit}; echo ${c:0:7}) Source33: https://github.com/Maratyszcza/psimd/archive/%{ps_commit}/psimd-%{ps_scommit}.tar.gz +%global ci_commit d6860c477c99f1fce9e28eb206891af3c0e1a1d7 +%global ci_scommit %(c=%{ci_commit}; echo ${c:0:7}) +Source34: https://github.com/pytorch/cpuinfo/archive/%{ci_commit}/cpuinfo-%{ci_scommit}.tar.gz %endif %if %{without pthreadpool} @@ -245,7 +248,6 @@ ExclusiveArch: x86_64 aarch64 %global _lto_cflags %nil BuildRequires: cmake -BuildRequires: cpuinfo-devel BuildRequires: eigen3-devel BuildRequires: fmt-devel %if %{with caffe2} @@ -287,6 +289,7 @@ BuildRequires: pthreadpool-devel %endif %if %{with xnnpack} +BuildRequires: cpuinfo-devel BuildRequires: FP16-devel BuildRequires: fxdiv-devel BuildRequires: psimd-devel @@ -380,6 +383,8 @@ Provides: bundled(FP16) Provides: bundled(fxdiv) # MIT Provides: bundled(psimd) +# BSD-2-Clause +Provides: bundled(cpuinfo) %endif %if %{without pthreadpool} @@ -525,6 +530,9 @@ cp -r FP16-*/* third_party/FP16/ tar xf %{SOURCE33} rm -rf third_party/psimd/* cp -r psimd-*/* third_party/psimd/ +tar xf %{SOURCE34} +rm -rf third_party/cpuinfo/* +cp -r cpuinfo-*/* third_party/cpuinfo/ %endif %if %{without pthreadpool} @@ -630,6 +638,7 @@ mv third_party/XNNPACK . mv third_party/FXdiv . mv third_party/FP16 . mv third_party/psimd . +mv third_party/cpuinfo . 
%endif %if %{without pthreadpool} @@ -678,6 +687,7 @@ mv XNNPACK third_party mv FXdiv third_party mv FP16 third_party mv psimd third_party +mv cpuinfo third_party %endif %if %{without pthreadpool} @@ -814,7 +824,6 @@ export USE_PYTORCH_QNNPACK=OFF export USE_QNNPACK=OFF %endif export USE_ROCM=OFF -export USE_SYSTEM_CPUINFO=ON export USE_SYSTEM_SLEEF=ON export USE_SYSTEM_EIGEN_INSTALL=ON export USE_SYSTEM_ONNX=ON @@ -828,6 +837,7 @@ export USE_SYSTEM_PTHREADPOOL=ON %endif %if %{with xnnpack} +export USE_SYSTEM_CPUINFO=ON export USE_SYSTEM_FP16=ON export USE_SYSTEM_FXDIV=ON export USE_SYSTEM_PSIMD=ON From 1311b02e130c02fdeab8faf67852f4a1e8ac2b2d Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 13 Jun 2024 09:35:46 -0400 Subject: [PATCH 03/88] Use specific version of CUDA base on disto release Signed-off-by: Tom Rix --- python-torch.spec | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 29db936..06fe5c1 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -92,6 +92,15 @@ %bcond_with cuda %endif +# Pick a version that works +%if %{with cuda} +%if 0%{?fedora} < 40 +%global cuda_ver 12.4 +%else +%global cuda_ver 12.5 +%endif +%endif + # For testing compat-gcc %global compat_gcc_major 13 %bcond_with compat_gcc @@ -850,8 +859,9 @@ export BUILD_CAFFE2=ON %if %{with cuda} %if %{without rocm} -export CUDACXX=/usr/local/cuda/bin/nvcc -export CPLUS_INCLUDE_PATH=/usr/local/cuda/include +export CPLUS_INCLUDE_PATH=/usr/local/cuda-%{cuda_ver}/include +export CUDACXX=/usr/local/cuda-%{cuda_ver}/bin/nvcc +export CUDA_HOME=/usr/local/cuda-%{cuda_ver}/ export USE_CUDA=ON # The arches to build for export TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0" From 751813c99af359a8b3c4534dbc5f18c74a786af4 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 13 Jun 2024 15:49:35 -0400 Subject: [PATCH 04/88] Update gitcommit Signed-off-by: Tom Rix --- python-torch.spec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/python-torch.spec b/python-torch.spec index 06fe5c1..91eb9db 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -7,9 +7,9 @@ %bcond_with gitcommit %if %{with gitcommit} # ToT -%global commit0 75b0720a97ac5d82e8a7a1a6ae7c5f7a87d7183d +%global commit0 2b9465d62ab8733a36226f0d8e236a8a9bd60c23 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20240609 +%global date0 20240612 %global pypi_version 2.4.0 %else %global pypi_version 2.3.1 From 5c8e5d96d2fc35a376efc18d891f873c7390ccda Mon Sep 17 00:00:00 2001 From: Python Maint Date: Fri, 14 Jun 2024 10:41:08 +0200 Subject: [PATCH 05/88] Rebuilt for Python 3.13 From dc54ab1a71ed46b41b7651363d7fd66ecc294ef1 Mon Sep 17 00:00:00 2001 From: "Benjamin A. Beasley" Date: Wed, 5 Jun 2024 11:06:02 -0400 Subject: [PATCH 06/88] Patch for sleef 3.6 --- ...ectorization-on-windows-submodule-sl.patch | 910 ++++++++++++++++++ python-torch.spec | 13 + 2 files changed, 923 insertions(+) create mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch new file mode 100644 index 0000000..562f55b --- /dev/null +++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch @@ -0,0 +1,910 @@ +From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 +From: Xu Han +Date: Sun, 31 Mar 2024 03:07:32 +0000 +Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] + (#118980) + +Enable VEC on Windows OS. +1. Fix some type defination gap between Windows and Linux. +2. Fix some operator not support on Windows, such as [], /. +3. Enable static sleef library build on Windows. +4. Disable unsupported function overloading on MSVC. +5. Upgrade submodule sleef lib, which fixed build issue on Windows. +6. Fixed bazel build issues. +7. Fix test app not link to sleef on Windows. 
+ +Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: +```cmd +git submodule sync +git submodule update --init --recursive +``` + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 +Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet +--- + aten/src/ATen/CMakeLists.txt | 48 ++++++-------- + aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- + .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- + .../cpu/vec/vec256/vec256_complex_double.h | 7 +- + .../cpu/vec/vec256/vec256_complex_float.h | 7 +- + aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- + aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- + aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- + aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- + .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- + .../cpu/vec/vec512/vec512_complex_double.h | 7 +- + .../cpu/vec/vec512/vec512_complex_float.h | 7 +- + aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- + aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- + aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- + aten/src/ATen/cpu/vec/vec_base.h | 6 ++ + caffe2/CMakeLists.txt | 2 +- + third_party/sleef.BUILD | 3 +- + 18 files changed, 194 insertions(+), 93 deletions(-) + +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index bf425af5fa9..58d5828e8ca 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") + list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) + endif() + +-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) +- # Preserve values for the main build +- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) +- set(__aten_sleef_build_tests ${BUILD_TESTS}) +- +- # Unset our restrictive C++ flags here and reset them later. +- # Remove this once we use proper target_compile_options. 
+- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) +- set(CMAKE_CXX_FLAGS) +- +- # Bump up optimization level for sleef to -O1, since at -O0 the compiler +- # excessively spills intermediate vector registers to the stack +- # and makes things run impossibly slowly +- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) +- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") +- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) +- else() +- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") ++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) ++ if(NOT MSVC) ++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler ++ # excessively spills intermediate vector registers to the stack ++ # and makes things run impossibly slowly ++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) ++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") ++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) ++ else() ++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") ++ endif() + endif() + + if(NOT USE_SYSTEM_SLEEF) +- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) +- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) +- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) +- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) +- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) ++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) ++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) ++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) ++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) ++ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." 
FORCE) + if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) +@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) + endif() + list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) + +- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) +- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) +- +- # Set these back. TODO: Use SLEEF_ to pass these instead +- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) +- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) ++ if(NOT MSVC) ++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) ++ endif() + endif() + + if(USE_CUDA AND NOT USE_ROCM) +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h +index 800b027e469..c431fa3c605 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h +@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { + } + + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- ++#ifndef _MSC_VER ++// MSVC is not working well on complex function overload. + template + std::enable_if_t> + inline gather(const double* base_addr, const Vectorized& vindex) { +@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { + return _mm256_i32gather_ps(base_addr, vindex, scale); + } +- ++#endif + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- ++#ifndef _MSC_VER ++// MSVC is not working well on complex function overload. 
+ template + std::enable_if_t> + inline mask_gather(const Vectorized& src, const double* base_addr, +@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, + const Vectorized& vindex, Vectorized& mask) { + return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); + } +- ++#endif + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + // Only works for inputs in the range: [-2^51, 2^51] +@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { + return flip8(v); + } + +-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#endif // (defined(CPU_CAPABILITY_AVX2) + + }} // namepsace at::vec::CPU_CAPABILITY +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +index 3e26213d6d2..66557436c70 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +@@ -7,7 +7,8 @@ + #include + #include + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -18,7 +19,18 @@ namespace at::vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) ++ ++#ifndef SLEEF_CONST ++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) ++#define SLEEF_CONST const ++#else ++#define SLEEF_CONST ++#endif ++#define SLEEF_CONST_OLD SLEEF_CONST ++#else ++#define SLEEF_CONST_OLD ++#endif + + // bfloat16 conversion + static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { +@@ -265,7 +277,8 @@ public: + } + return b; + } +- Vectorized map(const __m256 (*const vop)(__m256)) const { ++ ++ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { + __m256 lo, hi; + cvt_to_fp32(values, lo, hi); + const auto o1 = vop(lo); +@@ -1026,7 +1039,7 @@ inline 
Vectorized convert_float_##name(const Vectorized& a, const V + CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); + CONVERT_VECTORIZED_INIT(Half, half); + +-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#else // defined(CPU_CAPABILITY_AVX2) + + #define CONVERT_NON_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ +@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V + CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); + CONVERT_NON_VECTORIZED_INIT(Half, half); + +-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#endif // defined(CPU_CAPABILITY_AVX2) + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + #define LOAD_FP32_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ + auto values = _mm_loadu_si128(reinterpret_cast(data)); \ +@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec + LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); + LOAD_FP32_VECTORIZED_INIT(Half, fp16); + +-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#else // defined(CPU_CAPABILITY_AVX2) + #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ + __at_align__ float values[Vectorized::size()]; \ +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +index f93ea1e63c3..6c198fb37d3 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +@@ -8,7 +8,8 @@ + #include + #include + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -16,7 +17,7 @@ namespace at::vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY 
{ + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + + template <> class Vectorized> { + private: +@@ -145,7 +146,7 @@ public: + auto abs = abs_(); + auto zero = _mm256_setzero_pd(); + auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); +- auto div = values / abs; ++ auto div = _mm256_div_pd(values, abs); + return _mm256_blendv_pd(div, zero, mask); + } + __m256d real_() const { +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +index 7c142c04b79..c72d4d49274 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +@@ -7,7 +7,8 @@ + #include + #include + #include +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -15,7 +16,7 @@ namespace at::vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + + template <> class Vectorized> { + private: +@@ -180,7 +181,7 @@ public: + auto abs = abs_(); + auto zero = _mm256_setzero_ps(); + auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); +- auto div = values / abs; ++ auto div = _mm256_div_ps(values, abs); + return _mm256_blendv_ps(div, zero, mask); + } + __m256 real_() const { +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h +index bc82d07edd1..bed6da627af 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h +@@ -6,7 +6,8 @@ + #include + #include + #include +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -15,7 +16,7 @@ namespace at::vec { + inline namespace CPU_CAPABILITY { + + +-#if defined(CPU_CAPABILITY_AVX2) && 
!defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + + template <> class Vectorized { + private: +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h +index 886809a0b8a..0e3664cd37b 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h +@@ -6,7 +6,8 @@ + #include + #include + #include +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -14,7 +15,7 @@ namespace at::vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + + template <> class Vectorized { + private: +@@ -226,14 +227,14 @@ public: + static __m256 vec_factorial_5 = + _mm256_set1_ps(0.00828929059f); // 1/factorial(5) + static __m256 vec_exp_log2ef = +- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) ++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) + static __m256 vec_half = _mm256_set1_ps(0.5f); + static __m256 vec_one = _mm256_set1_ps(1.f); + static __m256 vec_zero = _mm256_set1_ps(0.f); + static __m256 vec_two = _mm256_set1_ps(2.f); +- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) +- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50); +- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); ++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) ++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); ++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); + static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); + static int n_mantissa_bits = 23; + +@@ -266,7 +267,7 @@ public: + auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); + auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); + 
vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); +- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; ++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); + vec_two_pow_n = + _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); + +diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +index 4128841701a..85e099904cd 100644 +--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h ++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +@@ -41,11 +41,17 @@ + namespace at::vec { + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX2) + ++#ifdef _MSC_VER ++__declspec(align(64)) struct Vectorizedqi { ++ protected: ++ __m256i vals; ++#else + struct Vectorizedqi { + protected: + __m256i vals __attribute__((aligned(64))); ++#endif + + public: + Vectorizedqi() {} +@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { + } + + template +-inline void __attribute__((always_inline)) QuantizeAvx2( ++__FORCE_INLINE void QuantizeAvx2( + const float* src, + T* dst, + int len, +@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V + return a.maximum(b); + } + +-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) ++#endif // if defined(CPU_CAPABILITY_AVX2) + }} // namespace at::vec::CPU_CAPABILITY +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h +index fe96d123e64..87f723d782c 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h +@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { + } + + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) + } + + // 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- ++#ifndef _MSC_VER ++// MSVC is not working well on complex function overload. + template + std::enable_if_t> + inline gather(const double* base_addr, const Vectorized& vindex) { +@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { + return _mm512_i32gather_ps(vindex, base_addr, scale); + } +- ++#endif + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- ++#ifndef _MSC_VER ++// MSVC is not working well on complex function overload. + template + std::enable_if_t> + inline mask_gather(const Vectorized& src, const double* base_addr, +@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, + auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); + return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); + } +- ++#endif + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + template<> +@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { + return flip8(v); + } + +-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#endif // defined(CPU_CAPABILITY_AVX512) + + }}} +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +index f9fc92d52bf..eb3b6a72240 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +@@ -7,7 +7,8 @@ + #include + #include + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -16,7 +17,18 @@ namespace vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) ++ ++#ifndef SLEEF_CONST ++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) ++#define SLEEF_CONST const ++#else ++#define 
SLEEF_CONST ++#endif ++#define SLEEF_CONST_OLD SLEEF_CONST ++#else ++#define SLEEF_CONST_OLD ++#endif + + // bfloat16 conversion + static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { +@@ -362,7 +374,8 @@ public: + } + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wignored-qualifiers" +- Vectorized map(const __m512 (*const vop)(__m512)) const { ++ ++ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { + __m512 lo, hi; + cvt_to_fp32(values, lo, hi); + const auto o1 = vop(lo); +@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V + CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); + CONVERT_VECTORIZED_INIT(Half, half); + +-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#else //defined(CPU_CAPABILITY_AVX512) + + #define CONVERT_NON_VECTORIZED_INIT(type, name) \ + inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ +@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V + CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); + CONVERT_NON_VECTORIZED_INIT(Half, half); + +-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#endif // defined(CPU_CAPABILITY_AVX512) + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + #define LOAD_FP32_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ + auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ +@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec + LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); + LOAD_FP32_VECTORIZED_INIT(Half, fp16); + +-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#else // defined(CPU_CAPABILITY_AVX512) + #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ + inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ + __at_align__ float values[Vectorized::size()]; 
\ +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +index 02aa3a87cc1..c35204f9da2 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +@@ -7,7 +7,8 @@ + #include + #include + #include +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -16,7 +17,7 @@ namespace vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + + template <> class Vectorized> { + private: +@@ -203,7 +204,7 @@ public: + auto abs = abs_(); + auto zero = _mm512_setzero_pd(); + auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); +- auto div = values / abs; ++ auto div = _mm512_div_pd(values, abs); + return _mm512_mask_blend_pd(mask, div, zero); + } + __m512d real_() const { +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +index a5d790c98b2..2801e484d94 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +@@ -7,7 +7,8 @@ + #include + #include + #include +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -16,7 +17,7 @@ namespace vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + + template <> class Vectorized> { + private: +@@ -708,7 +709,7 @@ public: + auto abs = abs_(); + auto zero = _mm512_setzero_ps(); + auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); +- auto div = values / abs; ++ auto div = _mm512_div_ps(values, abs); + return 
_mm512_mask_blend_ps(mask, div, zero); + } + __m512 real_() const { +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h +index 27b2753c903..508ab257e60 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h +@@ -6,7 +6,8 @@ + #include + #include + #include +-#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) ++#if (defined(CPU_CAPABILITY_AVX512)) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -15,7 +16,7 @@ namespace vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + + template <> class Vectorized { + private: +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h +index ba5738687fd..a08df3c141a 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h +@@ -6,7 +6,8 @@ + #include + #include + #include +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) ++#define SLEEF_STATIC_LIBS + #include + #endif + +@@ -15,7 +16,7 @@ namespace vec { + // See Note [CPU_CAPABILITY namespace] + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + + template <> class Vectorized { + private: +@@ -246,14 +247,14 @@ public: + static __m512 vec_factorial_5 = + _mm512_set1_ps(0.00828929059f); // 1/factorial(5) + static __m512 vec_exp_log2ef = +- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) ++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) + static __m512 vec_half = _mm512_set1_ps(0.5f); + static __m512 vec_one = _mm512_set1_ps(1.f); + static __m512 vec_zero = _mm512_set1_ps(0.f); + static __m512 vec_two = _mm512_set1_ps(2.f); +- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // 
ln(2) +- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); +- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); ++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) ++ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); ++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); + static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); + static int n_mantissa_bits = 23; + +@@ -288,7 +289,7 @@ public: + auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); + auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); + vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); +- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; ++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); + vec_two_pow_n = + _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); + +diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +index e0713d01312..a5671ed4a50 100644 +--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h ++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +@@ -42,11 +42,17 @@ namespace at { + namespace vec { + inline namespace CPU_CAPABILITY { + +-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) ++#if defined(CPU_CAPABILITY_AVX512) + ++#ifdef _MSC_VER ++__declspec(align(64)) struct Vectorizedqi { ++ protected: ++ __m512i vals; ++#else + struct Vectorizedqi { + protected: + __m512i vals __attribute__((aligned(64))); ++#endif + + public: + Vectorizedqi() {} +@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { + } + + template +-inline void __attribute__((always_inline)) QuantizeAvx512( ++__FORCE_INLINE void QuantizeAvx512( + const float* src, + T* dst, + int len, +@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { + Vectorized scale, + Vectorized zero_point, + Vectorized scale_neg_zp_premul) const { ++ #if 
defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); ++ #else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); ++ #endif + + __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); + __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); +@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); ++ #else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); ++ #endif + + __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); + __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); +@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { + } + + int_vec_return_type widening_subtract(Vectorized b) const { ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], 
vals.m512i_u64[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); ++ #else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); ++ #endif + + __m512i int32_val0 = cvtepi8_epi32(int_val0); + __m512i int32_val1 = cvtepi8_epi32(int_val1); + __m512i int32_val2 = cvtepi8_epi32(int_val2); + __m512i int32_val3 = cvtepi8_epi32(int_val3); + ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); ++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); ++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); ++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); ++ #else + __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); ++ #endif + + __m512i int32_b0 = cvtepi8_epi32(int_b0); + __m512i int32_b1 = cvtepi8_epi32(int_b1); +@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { + Vectorized scale, + Vectorized zero_point, + Vectorized scale_zp_premul) const { ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); ++ #else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); ++ #endif + + __m512 float_val0 = 
_mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); + __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); +@@ -746,10 +787,17 @@ struct Vectorized : public Vectorizedqi { + float_vec_return_type dequantize( + Vectorized scale, + Vectorized zero_point) const { ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); ++ #else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); ++ #endif + + __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); + __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); +@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { + } + + int_vec_return_type widening_subtract(Vectorized b) const { ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); ++ #else + __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); + __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); + __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); + __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); ++ #endif + + __m512i int32_val0 = cvtepu8_epi32(int_val0); + __m512i int32_val1 = cvtepu8_epi32(int_val1); + __m512i int32_val2 = cvtepu8_epi32(int_val2); + __m512i int32_val3 = cvtepu8_epi32(int_val3); + ++ #if defined(_MSC_VER) && !defined(__clang__) ++ __m128i int_b0 = 
_mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); ++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); ++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); ++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); ++ #else + __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); ++ #endif + + __m512i int32_b0 = cvtepu8_epi32(int_b0); + __m512i int32_b1 = cvtepu8_epi32(int_b1); +diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h +index adf81dd915c..20cb8ef6dbc 100644 +--- a/aten/src/ATen/cpu/vec/vec_base.h ++++ b/aten/src/ATen/cpu/vec/vec_base.h +@@ -36,6 +36,12 @@ + #include + #include + ++#if defined(__GNUC__) ++#define __FORCE_INLINE __attribute__((always_inline)) inline ++#elif defined(_MSC_VER) ++#define __FORCE_INLINE __forceinline ++#endif ++ + // These macros helped us unify vec_base.h + #ifdef CPU_CAPABILITY_AVX512 + #if defined(__GNUC__) +diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt +index a6b6f0f7d1d..15d37cf4861 100644 +--- a/caffe2/CMakeLists.txt ++++ b/caffe2/CMakeLists.txt +@@ -1787,7 +1787,7 @@ if(BUILD_TEST) + endif() + else() + add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") +- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) ++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) + endif() + target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) + target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) +diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD +index 573f9c5b54a..f22a6e905e2 100644 +--- a/third_party/sleef.BUILD ++++ b/third_party/sleef.BUILD +@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ + SLEEF_PRIVATE_INCLUDES = [ + 
"-Iexternal/sleef/src/arch", + "-Iexternal/sleef/src/common", ++ "-Iexternal/sleef/src/libm", + ] + + SLEEF_PUBLIC_INCLUDES = [ +@@ -201,8 +202,6 @@ cc_library( + srcs = [ + "src/libm/rempitab.c", + "src/libm/sleefdp.c", +- "src/libm/sleefld.c", +- "src/libm/sleefqp.c", + "src/libm/sleefsp.c", + ], + hdrs = SLEEF_PUBLIC_HEADERS, +-- +2.45.1 + diff --git a/python-torch.spec b/python-torch.spec index 91eb9db..b3fb7bf 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -237,6 +237,19 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch # Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch %endif +%if %{without gitcommit} +# Enable x86 CPU vectorization on windows [submodule sleef] (#118980) +# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370 +# +# Despite the title, this patch fixes compatibility with sleef 3.6 by including +# a backwards-compatible version of the fix from +# https://github.com/pytorch/pytorch/pull/122723. +# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef +# git submodule (because the release archive contains an actual sleef source +# tree instead, so this would not apply.) 
+Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch +%endif + # ROCm patches # Patches need to be refactored for ToT %if %{without gitcommit} From 4e1bebf65fc723186ecf5fdf055bfd4e05116ad5 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 21 Jun 2024 05:33:36 -0600 Subject: [PATCH 07/88] Update gitcommit to 2.4.0-rc3 Signed-off-by: Tom Rix --- ...-Changes-to-compile-with-3.13-126033.patch | 222 ++++++++++++++++++ python-torch.spec | 23 +- 2 files changed, 241 insertions(+), 4 deletions(-) create mode 100644 0001-Changes-to-compile-with-3.13-126033.patch diff --git a/0001-Changes-to-compile-with-3.13-126033.patch b/0001-Changes-to-compile-with-3.13-126033.patch new file mode 100644 index 0000000..ddc0dcf --- /dev/null +++ b/0001-Changes-to-compile-with-3.13-126033.patch @@ -0,0 +1,222 @@ +From 655a06444b261cb28e71a0973c0ab67aaa8261ab Mon Sep 17 00:00:00 2001 +From: albanD +Date: Tue, 14 May 2024 02:14:53 +0000 +Subject: [PATCH] Changes to compile with 3.13 (#126033) + +This is mainly: +- Fix refcount access macro +- Hide all the Dynamo code that needs update as usual +- Add _PyWeakref_ClearRef as an extern provided by CPython. Including the pycore header that defines it would require raw c include shenanigans that I don't think are worth it. +This allows to build both with regular and nogil version of cpython. Both + +Note that this requires the 3.13 branch at least past [d3094744d40de2deefbda9b1996d5029c9ebf0b0](https://github.com/python/cpython/commit/d3094744d40de2deefbda9b1996d5029c9ebf0b0) which we need for mimalloc include and weakref function being exposed. + +debug-only issues in pybind11 with PyMem_MALLOC vs PyObject_MALLOC being should be synced either by updating pybind or cpython. @colesbury I can send a PR to ifdef the proper use in pybind if you think that this is the best solution here? 
+ +Pull Request resolved: https://github.com/pytorch/pytorch/pull/126033 +Approved by: https://github.com/colesbury +--- + torch/csrc/Storage.cpp | 2 +- + torch/csrc/autograd/python_variable.cpp | 2 +- + torch/csrc/dynamo/cpython_defs.c | 15 +++++- + torch/csrc/dynamo/cpython_defs.h | 2 + + torch/csrc/dynamo/eval_frame.c | 67 ++++++++++++++++++------- + torch/csrc/utils/python_compat.h | 4 ++ + 6 files changed, 70 insertions(+), 22 deletions(-) + +diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp +index 93dbc9c09bb2..b22bbac35981 100644 +--- a/torch/csrc/Storage.cpp ++++ b/torch/csrc/Storage.cpp +@@ -236,7 +236,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { + if (type->tp_del) { + PyObject_GC_Track(self); + type->tp_del(self); +- if (self->ob_refcnt > 0) { ++ if (Py_REFCNT(self) > 0) { + // Resurrected (see above comment about resurrection from `__del__`) + return; + } +diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp +index 9e85f0026b35..8fd1129da63c 100644 +--- a/torch/csrc/autograd/python_variable.cpp ++++ b/torch/csrc/autograd/python_variable.cpp +@@ -1910,7 +1910,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { + if (type->tp_del) { + PyObject_GC_Track(self); + type->tp_del(self); +- if (self->ob_refcnt > 0) { ++ if (Py_REFCNT(self) > 0) { + /* Resurrected */ + return; + } +diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c +index 4a1dba63009a..5e0945a052ae 100644 +--- a/torch/csrc/dynamo/cpython_defs.c ++++ b/torch/csrc/dynamo/cpython_defs.c +@@ -13,6 +13,17 @@ + } else { \ + } + ++#if IS_PYTHON_3_13_PLUS ++// Gave up after fixing a few of these ++// pycore_opcode.h is gone (new is pycore_opcode_metadata.h ?) ++// f_code is gone (new is f_executable?) 
++ ++// Fake definitions for what we removed ++const uint8_t* THP_PyOpcode_Caches = NULL; ++const int THP_PyOpcode_Caches_size = 0; ++ ++#else ++ + // NOTE: all `assert`s below are converted to `CHECK`s + + #if IS_PYTHON_3_11_PLUS +@@ -29,8 +40,8 @@ + #define NEED_OPCODE_TABLES // To get _PyOpcode_Deopt + #include + #undef NEED_OPCODE_TABLES +-#undef Py_BUILD_CORE + #include ++#undef Py_BUILD_CORE + + // As a simple way to reduce the impact of ABI changes on the CPython side, this check forces + // us to manually re-check that the function didn't change on the next major version +@@ -364,3 +375,5 @@ THP_PyFrame_Clear(_PyInterpreterFrame *frame) + } + + #endif ++ ++#endif // CPython 3.13 +\ No newline at end of file +diff --git a/torch/csrc/dynamo/cpython_defs.h b/torch/csrc/dynamo/cpython_defs.h +index a897c3e6c6e7..3b6c9667f8c9 100644 +--- a/torch/csrc/dynamo/cpython_defs.h ++++ b/torch/csrc/dynamo/cpython_defs.h +@@ -8,7 +8,9 @@ + + #if IS_PYTHON_3_11_PLUS + ++#define Py_BUILD_CORE + #include ++#undef Py_BUILD_CORE + + int THP_PyFrame_FastToLocalsWithError( + _PyInterpreterFrame* frame, +diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c +index c286e821f09d..e13cb5af2a0e 100644 +--- a/torch/csrc/dynamo/eval_frame.c ++++ b/torch/csrc/dynamo/eval_frame.c +@@ -8,6 +8,31 @@ + #include + #include + ++ ++ ++PyObject* guard_error_hook = NULL; ++const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; ++ ++static int active_dynamo_threads = 0; ++ ++static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; ++ ++inline static PyObject* eval_frame_callback_get(void) { ++ void* result = PyThread_tss_get(&eval_frame_callback_key); ++ if (unlikely(result == NULL)) { ++ return (PyObject*)Py_None; ++ } else { ++ return (PyObject*)result; ++ } ++} ++ ++inline static void eval_frame_callback_set(PyObject* obj) { ++ PyThread_tss_set(&eval_frame_callback_key, obj); ++} ++ ++// 3.13 Not supported at all. 
See cpython_defs.c for hints ++#if !(IS_PYTHON_3_13_PLUS) ++ + // Problem in CPython includes when mixing core and non-core build + // The fix was not backported to 3.12 so this is needed here + // https://github.com/python/cpython/issues/105268 +@@ -138,24 +163,6 @@ THP_PyFrame_FastToLocalsWithError(THP_EVAL_API_FRAME_OBJECT *frame, int *free_va + } + #endif + +-PyObject* guard_error_hook = NULL; +-const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; +- +-static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; +- +-inline static PyObject* eval_frame_callback_get(void) { +- void* result = PyThread_tss_get(&eval_frame_callback_key); +- if (unlikely(result == NULL)) { +- return (PyObject*)Py_None; +- } else { +- return (PyObject*)result; +- } +-} +- +-inline static void eval_frame_callback_set(PyObject* obj) { +- PyThread_tss_set(&eval_frame_callback_key, obj); +-} +- + static PyObject* _custom_eval_frame_shim( + PyThreadState* tstate, + THP_EVAL_API_FRAME_OBJECT* frame, +@@ -627,7 +634,29 @@ static PyObject* _custom_eval_frame( + } + } + +-static int active_dynamo_threads = 0; ++#else // IS_PYTHON_3_13_PLUS ++ ++// Fake definitions for everything we removed ++ ++typedef struct THPPyInterpreterFrame { ++ PyObject_HEAD ++ _PyInterpreterFrame* frame; // Borrowed reference ++} THPPyInterpreterFrame; ++ ++inline static void enable_eval_frame_shim(PyThreadState* tstate) {} ++inline static void enable_eval_frame_default(PyThreadState* tstate) {} ++ ++static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; ++ ++static PyTypeObject THPPyInterpreterFrameType = { ++ PyVarObject_HEAD_INIT(NULL, 0) ++ .tp_name = "torch._C.dynamo.eval_frame._PyInterpreterFrame", ++ .tp_basicsize = sizeof(THPPyInterpreterFrame), ++ .tp_flags = Py_TPFLAGS_DEFAULT, ++ .tp_getset = THPPyInterpreterFrame_properties, ++}; ++ ++#endif // CPython 3.13 + + static PyObject* increment_working_threads(PyThreadState* tstate) { + active_dynamo_threads = active_dynamo_threads 
+ 1; +diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h +index 73b991cf3fbf..b060db00db73 100644 +--- a/torch/csrc/utils/python_compat.h ++++ b/torch/csrc/utils/python_compat.h +@@ -11,6 +11,7 @@ extern "C" { + + #define IS_PYTHON_3_11_PLUS PY_VERSION_HEX >= 0x030B00C1 + #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 ++#define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 + + PYCAPI_COMPAT_STATIC_INLINE(int) + PyCode_GetNCellvars(PyCodeObject* code) { +@@ -32,6 +33,9 @@ PyCode_GetNFreevars(PyCodeObject* code) { + #endif + } + ++// Provided by CPython but getting the header for them is very hard ++extern void _PyWeakref_ClearRef(PyWeakReference* self); ++ + #ifdef __cplusplus + } + #endif +-- +2.45.1 + diff --git a/python-torch.spec b/python-torch.spec index b3fb7bf..d773fc0 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# ToT -%global commit0 2b9465d62ab8733a36226f0d8e236a8a9bd60c23 +# v2.4.0-rc3 +%global commit0 50e57d4f3fdbd51cea0ef9979bc61abd1a4e96c8 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20240612 +%global date0 20240620 %global pypi_version 2.4.0 %else %global pypi_version 2.3.1 @@ -63,8 +63,13 @@ # OpenCV support came in F41 %if 0%{?fedora} > 40 +%if %{without gitcommit} %bcond_without opencv %else +# USE_OPENCV removed in 2.4.0+ +%bcond_with opencv +%endif +%else %bcond_with opencv %endif @@ -230,7 +235,8 @@ Patch6: 0001-reenable-foxi-linking.patch # Bring some patches forward %if %{without gitcommit} # https://github.com/pytorch/pytorch/pull/123384 -Patch7: 0001-Reenable-dim-for-python-3.12.patch +# Breaks on python 3.13 +# Patch7: 0001-Reenable-dim-for-python-3.12.patch # Dynamo/Inductor on 3.12 # Fails to apply on 2.3.1 @@ -248,6 +254,11 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch # git submodule (because the release archive contains an actual sleef source # tree instead, so 
this would not apply.) Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch + +# For Python 3.13 +# https://github.com/pytorch/pytorch/pull/126033 +Patch10: 0001-Changes-to-compile-with-3.13-126033.patch + %endif # ROCm patches @@ -503,6 +514,7 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release} %autosetup -p1 -n pytorch-%{commit0} # Overwrite with a git checkout of the pyproject.toml cp %{SOURCE1000} . + %else %autosetup -p1 -n pytorch-v%{version} %endif @@ -826,6 +838,7 @@ export CAFFE2_LINK_LOCAL_PROTOBUF=OFF export INTERN_BUILD_MOBILE=OFF export USE_DISTRIBUTED=OFF export USE_CUDA=OFF +export USE_FAKELOWP=OFF export USE_FBGEMM=OFF export USE_FLASH_ATTENTION=OFF export USE_GOLD_LINKER=OFF @@ -837,6 +850,7 @@ export USE_LITE_PROTO=OFF export USE_MAGMA=OFF export USE_MKLDNN=OFF export USE_MPI=OFF +export USE_MKLDNN=OFF export USE_NCCL=OFF export USE_NNPACK=OFF export USE_NUMPY=ON @@ -853,6 +867,7 @@ export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF export USE_TENSORPIPE=OFF export USE_XNNPACK=ON +export USE_XPU=OFF %if %{with pthreadpool} export USE_SYSTEM_PTHREADPOOL=ON From c7679453c9e64d416cbc118dd519f5d093404a00 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 24 Jun 2024 16:21:55 -0400 Subject: [PATCH 08/88] Update gitcommit to 2.4.0-rc5 Signed-off-by: Tom Rix --- python-torch.spec | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index d773fc0..39e5c44 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.4.0-rc3 -%global commit0 50e57d4f3fdbd51cea0ef9979bc61abd1a4e96c8 +# v2.4.0-rc5 +%global commit0 93c51dc84bb02bb3faafd217a12a1ff6264cc569 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20240620 +%global date0 20240621 %global pypi_version 2.4.0 %else %global pypi_version 2.3.1 From bcdd1822e39ef62207489ac40554d02f21115c3c Mon Sep 17 
00:00:00 2001 From: Tom Rix Date: Tue, 25 Jun 2024 08:09:08 -0400 Subject: [PATCH 09/88] Add CUDA BuildRequires Signed-off-by: Tom Rix --- python-torch.spec | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/python-torch.spec b/python-torch.spec index 39e5c44..2c0e093 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -100,7 +100,12 @@ # Pick a version that works %if %{with cuda} %if 0%{?fedora} < 40 -%global cuda_ver 12.4 +%global cuda_ver 12.5 +%global cudart_ver 12-5 +%global cublas_ver 12-5 +%global cufft_ver 12-5 +%global curand_ver 12-5 +%global cusparse_ver 12-5 %else %global cuda_ver 12.5 %endif @@ -378,6 +383,14 @@ BuildRequires: roctracer-devel Requires: rocm-rpm-macros-modules %endif +%if %{with cuda} +BuildRequires: cuda-cudart-devel-%{cudart_ver} +BuildRequires: libcublas-devel-%{cublas_ver} +BuildRequires: libcufft-devel-%{cufft_ver} +BuildRequires: libcurand-devel-%{curand_ver} +BuildRequires: libcusparse-devel-%{cusparse_ver} +%endif + %if %{with opencv} BuildRequires: opencv-devel %endif From 3a008de5e56a5dd4b786b860fb5d7ce0491b0c83 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 26 Jun 2024 05:50:40 -0600 Subject: [PATCH 10/88] Update gitcommit to v2.4.0-rc6 Signed-off-by: Tom Rix --- python-torch.spec | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 2c0e093..8e5a265 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,8 +6,8 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.4.0-rc5 -%global commit0 93c51dc84bb02bb3faafd217a12a1ff6264cc569 +# v2.4.0-rc6 +%global commit0 699c05647931d3e4731828f07a3f34c4e0623eb9 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) %global date0 20240621 %global pypi_version 2.4.0 @@ -779,6 +779,12 @@ rm caffe2/mobile/contrib/libopencl-stub/include/CL/*.h rm caffe2/mobile/contrib/libopencl-stub/include/CL/*.hpp %endif +# Fix installing to /usr/lib64 +%if %{with 
gitcommit} +sed -i -e 's@DESTINATION ${PYTHON_LIB_REL_PATH}@DESTINATION ${CMAKE_INSTALL_PREFIX}/${PYTHON_LIB_REL_PATH}@' caffe2/CMakeLists.txt +%endif + + %if %{with rocm} # hipify ./tools/amd_build/build_amd.py @@ -1015,8 +1021,10 @@ done %endif %else + %py3_install + %endif # Do not remote the empty files @@ -1032,9 +1040,11 @@ done %{python3_sitearch}/%{pypi_name}-*.egg-info %{python3_sitearch}/functorch %{python3_sitearch}/torchgen +%if %{without gitcommit} %if %{with caffe2} %{python3_sitearch}/caffe2 %endif +%endif %if %{with rocm} %files -n python3-%{pypi_name}-rocm-gfx8 From 3f2aad9a21b054e23ffcd1ed211dfc624b6240c0 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 26 Jun 2024 09:46:41 -0400 Subject: [PATCH 11/88] Add a CUDA subpackage Signed-off-by: Tom Rix --- python-torch.spec | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 8e5a265..ec84f9a 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -467,6 +467,14 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. 
+%if %{with cuda} +%package -n python3-%{pypi_name}-cuda-%{cudart_ver} +Summary: %{name} for CUDA + +%description -n python3-%{pypi_name}-cuda-%{cudart_ver} +%{summary} +%endif + %if %{with rocm} %package -n python3-%{pypi_name}-rocm-gfx8 Summary: %{name} for ROCm gfx8 @@ -1029,8 +1037,11 @@ done # Do not remote the empty files - -%files -n python3-%{pypi_name} +%if %{with cuda} +%files -n python3-%{pypi_name}-cuda-%{cudart_ver} +%else +%files -n python3-%{pypi_name} +%endif %license LICENSE %doc README.md %{_bindir}/convert-caffe2-to-onnx From 854533551e561f040c91e3cb77d3724fbc3401a9 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 4 Jul 2024 07:29:25 -0600 Subject: [PATCH 12/88] Revisions of patches for 2.4 Signed-off-by: Tom Rix --- next/0001-Optionally-use-hipblaslt.patch | 506 +++++++++++++++++++++++ next/0001-disable-use-of-aotriton.patch | 94 +++++ 2 files changed, 600 insertions(+) create mode 100644 next/0001-Optionally-use-hipblaslt.patch create mode 100644 next/0001-disable-use-of-aotriton.patch diff --git a/next/0001-Optionally-use-hipblaslt.patch b/next/0001-Optionally-use-hipblaslt.patch new file mode 100644 index 0000000..1e5ca4b --- /dev/null +++ b/next/0001-Optionally-use-hipblaslt.patch @@ -0,0 +1,506 @@ +From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 29 Jun 2024 04:18:34 -0700 +Subject: [PATCH] Optionally use hipblaslt + +--- + aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ + aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ + aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- + aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- + aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- + cmake/Dependencies.cmake | 3 ++ + cmake/public/LoadHIP.cmake | 2 +- + 7 files changed, 82 insertions(+), 19 deletions(-) + +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index ce991a9bcad4..3f0d17b52778 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ 
b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -14,7 +14,9 @@ + #include + + #ifdef USE_ROCM ++#ifdef USE_HIPBLASLT + #include ++#endif + // until hipblas has an API to accept flags, we must use rocblas here + #include + #include +@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { + static size_t _parseChosenWorkspaceSize() { + const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); + #ifdef USE_ROCM ++#ifndef USE_HIPBLASLT ++ return 0; ++#endif + if (!val) { + // accept either env var + val = getenv("HIPBLASLT_WORKSPACE_SIZE"); +@@ -235,6 +240,7 @@ namespace at::cuda::blas { + } while (0) + + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + namespace { + // Following the pattern of CuSparseDescriptor + // Defined here for now because this is the only place cublas_lt interface is +@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< + }; + } // namespace + +- + template + inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + cudaDataType_t abcType = CUDA_R_32F; +@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + " scaleType ", + scaleType); + } +- ++#endif + + template + inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { +@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) + template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); + } + } +@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == 
BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + } + } +@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) + template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { + } + } + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + // forward to bgemm implementation but set strides and batches to 0 + bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); + } ++#endif + + template + inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { +@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); + } + } +@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +- else { ++ else ++#endif ++ { + 
gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); + } + } +@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } + } +@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { + } + } + +- ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + void gemm_and_bias( + bool transpose_mat1, +@@ -1410,7 +1435,7 @@ void scaled_gemm( + ScalarType result_dtype, + void* amax_ptr, + bool use_fast_accum) { +-#if CUDA_VERSION >= 11080 || defined(USE_ROCM) ++#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + const auto computeType = CUBLAS_COMPUTE_32F; + const auto scaleType = CUDA_R_32F; + const int8_t fastAccuMode = use_fast_accum ? 
1 : 0; +@@ -1681,6 +1706,7 @@ void int8_gemm( + " scaleType ", + scaleType); + } ++#endif + + template <> + void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { +diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h +index f2b657ced51b..f0ee613c4208 100644 +--- a/aten/src/ATen/cuda/CUDAContextLight.h ++++ b/aten/src/ATen/cuda/CUDAContextLight.h +@@ -9,7 +9,9 @@ + + // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also + // added bf16 support ++#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) + #include ++#endif + + #ifdef CUDART_VERSION + #include +@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); + /* Handles */ + TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); + TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); ++#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) + TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); ++#endif + + TORCH_CUDA_CPP_API void clearCublasWorkspaces(); + +diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp +index 8eac525b3695..abfdf7a23847 100644 +--- a/aten/src/ATen/cuda/CublasHandlePool.cpp ++++ b/aten/src/ATen/cuda/CublasHandlePool.cpp +@@ -29,7 +29,7 @@ namespace at::cuda { + + namespace { + +-#if defined(USE_ROCM) ++#if defined(USE_ROCM) && defined(USE_HIPBLASLT) + void createCublasLtHandle(cublasLtHandle_t *handle) { + TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); + } +@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { + return handle; + } + +-cublasLtHandle_t getCurrentCUDABlasLtHandle() { + #ifdef USE_ROCM ++#if defined(USE_HIPBLASLT) ++cublasLtHandle_t getCurrentCUDABlasLtHandle() { + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + +@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { + + auto handle = myPoolWindow->reserve(device); + return handle; ++} 
++#endif + #else ++cublasLtHandle_t getCurrentCUDABlasLtHandle() { + return reinterpret_cast(getCurrentCUDABlasHandle()); +-#endif + } ++#endif + + } // namespace at::cuda +diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h +index 53e6154120c9..fa1d664696db 100644 +--- a/aten/src/ATen/cuda/tunable/TunableGemm.h ++++ b/aten/src/ATen/cuda/tunable/TunableGemm.h +@@ -11,7 +11,9 @@ + + #include + #ifdef USE_ROCM ++#ifdef USE_HIPBLASLT + #include ++#endif + #include + #endif + #include +@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> + } + }; + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + class DefaultScaledGemmOp : public Callable> { + public: +@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { + return OK; + } + }; ++#endif + + template + inline bool IsZero(T v) { +@@ -191,6 +195,7 @@ static void AddRocblasValidator() { + } + } + ++#ifdef USE_HIPBLASLT + static void AddHipblasltValidator() { + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + if (validators.find("HIPBLASLT_VERSION") == validators.end()) { +@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { + [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? 
OK : FAIL; }); + } + } ++#endif + + static void AddRocmValidator() { + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); +@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { + } + AddRocblasValidator(); + } +- ++#ifdef USE_HIPBLASLT + static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { + rocm_validators = true; +@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { + } + AddHipblasltValidator(); + } +- ++#endif + if (rocm_validators) { + AddRocmValidator(); + } +@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + AddRocblasValidator(); + } +- ++#ifdef USE_HIPBLASLT + static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { + rocm_validators = true; +@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + AddHipblasltValidator(); + } +- ++#endif + if (rocm_validators) { + AddRocmValidator(); + } +@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + }; + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + class ScaledGemmTunableOp : public TunableOp, StreamTimer> { + public: +@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + + #if defined(USE_ROCM) ++#ifdef USE_HIPBLASLT + for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + AddHipblasltValidator(); ++#endif + AddRocmValidator(); + #endif + } +@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> + "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); + } + }; ++#endif + + #undef XSTRINGIFY + #undef STRINGIFY 
+diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp +index 84c59a4fd0d7..56ad5de3bf2d 100644 +--- a/aten/src/ATen/native/cuda/Blas.cpp ++++ b/aten/src/ATen/native/cuda/Blas.cpp +@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa + } + + static bool getDisableAddmmCudaLt() { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); + #ifdef USE_ROCM + // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) +@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { + } + return false; + #endif ++#else ++ return true; ++#endif + } + + #ifdef USE_ROCM + static bool isSupportedHipLtROCmArch(int index) { ++#ifdef USE_HIPBLASLT + hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); + std::string device_arch = prop->gcnArchName; + static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; +@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { + } + } + TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); ++#endif + return false; + } + #endif +@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + at::ScalarType scalar_type = self.scalar_type(); + c10::MaybeOwned self_; + if (&result != &self) { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) + // Strangely, if mat2 has only 1 row or column, we get + // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. 
+@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + scalar_type != at::ScalarType::BFloat16)); + #endif + } ++#endif + #endif + if (!useLtInterface) { + self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); + } + self__sizes = self_->sizes(); + } else { +-#if defined(USE_ROCM) ++#if defined(USE_ROCM) && defined(USE_HIPBLASLT) + useLtInterface = !disable_addmm_cuda_lt && + result.dim() == 2 && result.is_contiguous() && + isSupportedHipLtROCmArch(self.device().index()) && +@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); + + if (useLtInterface) { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + #if defined(USE_ROCM) + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, +@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + activation_epilogue + ); + }); ++#endif + #endif + } else + { +@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { + } + + static bool _scaled_mm_allowed_device() { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + auto dprops = at::cuda::getCurrentDeviceProperties(); + #ifdef USE_ROCM + std::string device_arch = dprops->gcnArchName; +@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { + #else + return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); + #endif ++#else ++ return false; ++#endif + } + + // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax +@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + // Check sizes + bool allowed_device = _scaled_mm_allowed_device(); + TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); ++#if 
!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + TORCH_CHECK( +@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 + // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately + amax = at::max(at::abs(out.to(kFloat))); ++#endif + #endif + + return {out, amax}; +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index f1f2eb7cec31..8d05e834bbc5 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1052,6 +1052,9 @@ if(USE_ROCM) + list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) + list(APPEND HIP_CXX_FLAGS -std=c++17) + list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) ++ if(hipblaslt_FOUND) ++ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) ++ endif() + if(HIP_NEW_TYPE_ENUMS) + list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) + endif() +diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake +index fa39156031ff..df4836847fdf 100644 +--- a/cmake/public/LoadHIP.cmake ++++ b/cmake/public/LoadHIP.cmake +@@ -155,7 +155,7 @@ if(HIP_FOUND) + find_package_and_print_version(hiprand REQUIRED) + find_package_and_print_version(rocblas REQUIRED) + find_package_and_print_version(hipblas REQUIRED) +- find_package_and_print_version(hipblaslt REQUIRED) ++ find_package_and_print_version(hipblaslt) + find_package_and_print_version(miopen REQUIRED) + find_package_and_print_version(hipfft REQUIRED) + find_package_and_print_version(hipsparse REQUIRED) +-- +2.45.2 + diff --git a/next/0001-disable-use-of-aotriton.patch b/next/0001-disable-use-of-aotriton.patch new file mode 100644 index 0000000..61ffd1e --- /dev/null +++ b/next/0001-disable-use-of-aotriton.patch @@ -0,0 +1,94 @@ +From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 29 
Jun 2024 07:06:09 -0700 +Subject: [PATCH] disable use of aotriton + +--- + .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) + +diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +index 214b02d8262e..7b3eb9dcd8cd 100644 +--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp ++++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +@@ -19,9 +19,12 @@ + #include + #include + ++#ifdef USE_FLASH_ATTENTION + #if USE_ROCM + #include + #endif ++#endif ++ + + /** + * Note [SDPA Runtime Dispatch] +@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { + + bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { + // Check that the gpu is capable of running flash attention ++#ifndef USE_FLASH_ATTENTION ++ return false; ++#else + using sm80 = SMVersion<8, 0>; + using sm90 = SMVersion<9, 0>; + #if USE_ROCM +@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug + } + #endif + return true; ++#endif + } + + bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { ++#ifndef USE_FLASH_ATTENTION ++ return false; ++#else + // Mem Efficient attention supports hardware in the range [sm_50, sm_90] + using sm50 = SMVersion<5, 0>; + using sm90 = SMVersion<9, 0>; +@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) + } + #endif + return true; ++#endif + } + + bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( +@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { + #ifndef USE_FLASH_ATTENTION + TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); + return false; +-#endif ++#else + + // Define gate functions that determine if a flash kernel can be ran + // Replace with std::to_array when we migrate to c++20 +@@ 
-597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { + } + } + return true; ++#endif + } + + bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { + #ifndef USE_MEM_EFF_ATTENTION + TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); + return false; +-#endif ++#else + // Constraints specific to mem efficient attention + constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = + array_of(at::kHalf, at::kFloat, at::kBFloat16); +@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { + } + #endif + return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); ++#endif + } + + SDPBackend select_sdp_backend(sdp_params const& kernel_params) { +-- +2.45.2 + From d5247c7f63c2bbfeeeba5a24b09824b1f184c15a Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 4 Jul 2024 09:35:32 -0600 Subject: [PATCH 13/88] Show use of hipblaslt package PyTorch+ROCm requires hipblaslt. Instead of patching to work around this requirement, use hipblaslt. 
Signed-off-by: Tom Rix --- python-torch.spec | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index ec84f9a..78823ba 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.4.0-rc6 -%global commit0 699c05647931d3e4731828f07a3f34c4e0623eb9 +# v2.4.0-rc7 +%global commit0 499621e7bbd30f9c7600f26a5fba0cf065faad5e %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20240621 +%global date0 20240702 %global pypi_version 2.4.0 %else %global pypi_version 2.3.1 @@ -268,17 +268,19 @@ Patch10: 0001-Changes-to-compile-with-3.13-126033.patch # ROCm patches # Patches need to be refactored for ToT -%if %{without gitcommit} # These are ROCm packages %if %{without cuda} # https://github.com/pytorch/pytorch/pull/120551 +%if %{without hipblaslt} Patch100: 0001-Optionally-use-hipblaslt.patch +%endif Patch101: 0001-cuda-hip-signatures.patch Patch102: 0001-silence-an-assert.patch +%if %{without gitcommit} Patch103: 0001-can-not-use-with-c-files.patch -Patch104: 0001-use-any-hip.patch -Patch105: 0001-disable-use-of-aotriton.patch %endif +# Patch104: 0001-use-any-hip.patch +Patch105: 0001-disable-use-of-aotriton.patch %endif ExclusiveArch: x86_64 aarch64 @@ -631,6 +633,11 @@ sed -i -e 's/STATUS/FATAL/' caffe2/image/CMakeLists.txt %endif %endif +# hipblaslt only building with gfx90a +%if %{with hipblaslt} +sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a"@' aten/src/ATen/native/cuda/Blas.cpp +%endif + %if 0%{?rhel} # In RHEL but too old sed -i -e '/typing-extensions/d' setup.py @@ -798,6 +805,12 @@ sed -i -e 's@DESTINATION ${PYTHON_LIB_REL_PATH}@DESTINATION ${CMAKE_INSTALL_PREF ./tools/amd_build/build_amd.py # Fedora installs to /usr/include, not /usr/include/rocm-core sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/TunableGemm.h +# use any hip, correct 
CMAKE_MODULE_PATH +sed -i -e 's@lib/cmake/hip@lib64/cmake/hip@' cmake/public/LoadHIP.cmake +sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake +# silence an assert +# sed -i -e '/qvalue = std::clamp(qvalue, qmin, qmax);/d' aten/src/ATen/native/cuda/IndexKernel.cu + %endif %if %{with cuda} @@ -875,6 +888,7 @@ export USE_KINETO=OFF export USE_LITE_INTERPRETER_PROFILER=OFF export USE_LITE_PROTO=OFF export USE_MAGMA=OFF +export USE_MEM_EFF_ATTENTION=OFF export USE_MKLDNN=OFF export USE_MPI=OFF export USE_MKLDNN=OFF @@ -909,8 +923,10 @@ export USE_SYSTEM_XNNPACK=ON %endif %if %{with caffe2} +%if %{without gitcommit} export BUILD_CAFFE2=ON %endif +%endif %if %{with cuda} %if %{without rocm} From 9028e14b856f03778ee32d7ea8a95d665f494f65 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 5 Jul 2024 10:20:14 -0600 Subject: [PATCH 14/88] Switch from openblas to flexiblas (rhbz#2295953) Suggested-by: Inaki Ucar Signed-off-by: Tom Rix --- python-torch.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-torch.spec b/python-torch.spec index 78823ba..65893dd 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -289,6 +289,7 @@ ExclusiveArch: x86_64 aarch64 BuildRequires: cmake BuildRequires: eigen3-devel +BuildRequires: flexiblas-devel BuildRequires: fmt-devel %if %{with caffe2} BuildRequires: foxi-devel @@ -315,7 +316,6 @@ BuildRequires: libomp-devel BuildRequires: openmpi-devel %endif %endif -BuildRequires: openblas-devel BuildRequires: protobuf-devel BuildRequires: sleef-devel BuildRequires: valgrind-devel From 6b67ee18046a9bdd77f53dec5a83cb1aa4923e1b Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 10 Jul 2024 06:35:10 -0600 Subject: [PATCH 15/88] Update to 2.4-rc8 Also change logic to use compat gcc Add logic to these magma package Signed-off-by: Tom Rix --- python-torch.spec | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 65893dd..0747835 100644 
--- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.4.0-rc7 -%global commit0 499621e7bbd30f9c7600f26a5fba0cf065faad5e +# v2.4.0-rc8 +%global commit0 e4ee3be4063b7c430974252fdf7db42273388d86 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20240702 +%global date0 20240709 %global pypi_version 2.4.0 %else %global pypi_version 2.3.1 @@ -32,6 +32,8 @@ %endif # hipblaslt is in development %bcond_with hipblaslt +# magma is in development +%bcond_with magma # Which families gpu build for %if 0%{?fedora} > 40 %global rocm_gpu_list gfx8 gfx9 gfx10 gfx11 gfx90a gfx942 gfx1100 @@ -296,8 +298,8 @@ BuildRequires: foxi-devel %endif %if %{with compat_gcc} -BuildRequires: compat-gcc-%{compat_gcc_major}-c++ -BuildRequires: compat-gcc-%{compat_gcc_major}-gfortran +BuildRequires: gcc%{compat_gcc_major}-c++ +BuildRequires: gcc%{compat_gcc_major}-gfortran %else BuildRequires: gcc-c++ BuildRequires: gcc-gfortran @@ -362,6 +364,9 @@ BuildRequires: hipfft-devel BuildRequires: hiprand-devel BuildRequires: hipsparse-devel BuildRequires: hipsolver-devel +%if %{with magma} +BuildRequires: magma-devel +%endif BuildRequires: miopen-devel BuildRequires: rocblas-devel BuildRequires: rocrand-devel @@ -849,9 +854,9 @@ fi export MAX_JOBS=$COMPILE_JOBS %if %{with compat_gcc} -export CC=%{_bindir}/gcc%{compat_gcc_major} -export CXX=%{_bindir}/g++%{compat_gcc_major} -export FC=%{_bindir}/gfortran%{compat_gcc_major} +export CC=%{_bindir}/gcc-%{compat_gcc_major} +export CXX=%{_bindir}/g++-%{compat_gcc_major} +export FC=%{_bindir}/gfortran-%{compat_gcc_major} %endif # For debugging setup.py @@ -976,6 +981,9 @@ export BUILD_TEST=ON %if %{with rocm} export USE_ROCM=ON +%if %{with magma} +export USE_MAGMA=ON +%endif export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` export HIP_CLANG_PATH=`hipconfig -l` From 5a34ef7cfdd0b818e1fad33001a538ecded012bb Mon Sep 17 00:00:00 2001 From: Kefu 
Chai Date: Mon, 15 Jul 2024 21:29:26 +0800 Subject: [PATCH 16/88] Rebuilt for fmt 11 --- ...lude-fmt-ranges.h-for-using-fmt-join.patch | 54 +++++++++++++++++++ python-torch.spec | 1 + 2 files changed, 55 insertions(+) create mode 100644 0001-include-fmt-ranges.h-for-using-fmt-join.patch diff --git a/0001-include-fmt-ranges.h-for-using-fmt-join.patch b/0001-include-fmt-ranges.h-for-using-fmt-join.patch new file mode 100644 index 0000000..f7f6c7d --- /dev/null +++ b/0001-include-fmt-ranges.h-for-using-fmt-join.patch @@ -0,0 +1,54 @@ +From ba2cf11d1bf1dd5086c8e793198a697d4179cca7 Mon Sep 17 00:00:00 2001 +From: Kefu Chai +Date: Tue, 16 Jul 2024 08:00:22 +0800 +Subject: [PATCH] include fmt/ranges.h for using fmt::join() + +fmt::join() was moved into fmt/ranges.h in fmt 11, so include this +header for using it. + +Signed-off-by: Kefu Chai +--- + torch/csrc/distributed/c10d/socket.cpp | 1 + + torch/csrc/profiler/standalone/execution_trace_observer.cpp | 1 + + torch/csrc/profiler/util.cpp | 1 + + 3 files changed, 3 insertions(+) + +diff --git a/torch/csrc/distributed/c10d/socket.cpp b/torch/csrc/distributed/c10d/socket.cpp +index 5013f2540..cbcd33a19 100644 +--- a/torch/csrc/distributed/c10d/socket.cpp ++++ b/torch/csrc/distributed/c10d/socket.cpp +@@ -31,6 +31,7 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wdeprecated") + #include + C10_DIAGNOSTIC_POP() + #include ++#include + + #include + #include +diff --git a/torch/csrc/profiler/standalone/execution_trace_observer.cpp b/torch/csrc/profiler/standalone/execution_trace_observer.cpp +index 2ef2e5423..fb053e916 100644 +--- a/torch/csrc/profiler/standalone/execution_trace_observer.cpp ++++ b/torch/csrc/profiler/standalone/execution_trace_observer.cpp +@@ -10,6 +10,7 @@ + #endif // _WIN32 + + #include ++#include + #include + #include + #include +diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp +index 896bf606c..c229ce130 100644 +--- a/torch/csrc/profiler/util.cpp ++++ b/torch/csrc/profiler/util.cpp 
+@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + #ifdef USE_KINETO + #include +-- +2.45.2 + diff --git a/python-torch.spec b/python-torch.spec index 0747835..f27f099 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -284,6 +284,7 @@ Patch103: 0001-can-not-use-with-c-files.patch # Patch104: 0001-use-any-hip.patch Patch105: 0001-disable-use-of-aotriton.patch %endif +Patch106: 0001-include-fmt-ranges.h-for-using-fmt-join.patch ExclusiveArch: x86_64 aarch64 %global toolchain gcc From 9bb05d0f36b1955bd460820cf2735e3979a55866 Mon Sep 17 00:00:00 2001 From: Fedora Release Engineering Date: Fri, 19 Jul 2024 16:35:07 +0000 Subject: [PATCH 17/88] Rebuilt for https://fedoraproject.org/wiki/Fedora_41_Mass_Rebuild From 2d9b5647c30ff5f2f6fe073130b49e03e56093a6 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 20 Jul 2024 06:16:21 -0600 Subject: [PATCH 18/88] Use fbgemm on 2.4 Signed-off-by: Tom Rix --- ...1-Add-cmake-option-USE_SYSTEM_FBGEMM.patch | 47 +++++++++++++++++++ python-torch.spec | 31 ++++++++++-- 2 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch diff --git a/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch new file mode 100644 index 0000000..413c60d --- /dev/null +++ b/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch @@ -0,0 +1,47 @@ +From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 20 Jul 2024 05:37:15 -0600 +Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM + +Signed-off-by: Tom Rix +--- + CMakeLists.txt | 1 + + cmake/Dependencies.cmake | 3 ++- + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index c4cd4b2c2a98..2068f7c6c4f2 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF + "USE_CUDNN" OFF) + 
cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) + option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) ++option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) + option(USE_KINETO "Use Kineto profiling library" ON) + option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) + option(USE_FAKELOWP "Use FakeLowp operators" OFF) +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index f1f2eb7cec31..192dac46f13b 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -706,6 +706,7 @@ endif() + + # ---[ FBGEMM + if(USE_FBGEMM) ++ if (NOT USE_SYSTEM_FBGEMM) + set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") + if(NOT DEFINED FBGEMM_SOURCE_DIR) + set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") +@@ -746,7 +747,7 @@ if(USE_FBGEMM) + target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) + endif() + endif() +- ++ endif() + if(USE_FBGEMM) + list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) + endif() +-- +2.45.1 + diff --git a/python-torch.spec b/python-torch.spec index f27f099..3e74505 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -32,14 +32,14 @@ %endif # hipblaslt is in development %bcond_with hipblaslt -# magma is in development -%bcond_with magma # Which families gpu build for %if 0%{?fedora} > 40 %global rocm_gpu_list gfx8 gfx9 gfx10 gfx11 gfx90a gfx942 gfx1100 +%bcond_without magma %else # F40 %global rocm_gpu_list gfx8 gfx9 gfx10 gfx11 +%bcond_with magma %endif %global rocm_default_gpu default %bcond_without rocm_loop @@ -94,6 +94,16 @@ %bcond_with pocketfft %endif +%if 0%{?fedora} > 40 +%if %{with gitcommit} +%bcond_without fbgemm +%else +%bcond_with fbgemm +%endif +%else +%bcond_with fbgemm +%endif + # For testing cuda %ifarch x86_64 %bcond_with cuda @@ -230,6 +240,10 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ %endif Patch0: 0001-no-third_party-foxi.patch +%if %{with 
gitcommit} +# https://github.com/pytorch/pytorch/pull/131282 +Patch1: 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch +%endif %if %{without gitcommit} Patch3: 0001-Stub-in-kineto-ActivityType.patch @@ -265,7 +279,6 @@ Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch # For Python 3.13 # https://github.com/pytorch/pytorch/pull/126033 Patch10: 0001-Changes-to-compile-with-3.13-126033.patch - %endif # ROCm patches @@ -281,7 +294,6 @@ Patch102: 0001-silence-an-assert.patch %if %{without gitcommit} Patch103: 0001-can-not-use-with-c-files.patch %endif -# Patch104: 0001-use-any-hip.patch Patch105: 0001-disable-use-of-aotriton.patch %endif Patch106: 0001-include-fmt-ranges.h-for-using-fmt-join.patch @@ -292,6 +304,10 @@ ExclusiveArch: x86_64 aarch64 BuildRequires: cmake BuildRequires: eigen3-devel +%if %{with fbgemm} +BuildRequires: asmjit-devel +BuildRequires: fbgemm-devel +%endif BuildRequires: flexiblas-devel BuildRequires: fmt-devel %if %{with caffe2} @@ -885,9 +901,14 @@ export INTERN_BUILD_MOBILE=OFF export USE_DISTRIBUTED=OFF export USE_CUDA=OFF export USE_FAKELOWP=OFF +%if %{with fbgemm} +export USE_FBGEMM=ON +export USE_SYSTEM_FBGEMM=ON +%else export USE_FBGEMM=OFF +%endif export USE_FLASH_ATTENTION=OFF -export USE_GOLD_LINKER=OFF +export USE_GOLD_LINKER=ON export USE_GLOO=OFF export USE_ITT=OFF export USE_KINETO=OFF From 2debc89ffde27e50372829c19a94c0b9e46fcb38 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 20 Jul 2024 15:43:46 -0600 Subject: [PATCH 19/88] Fix USE_NUMA Needed a BuildRequires: numactl-devel Signed-off-by: Tom Rix --- python-torch.spec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 3e74505..1dad38f 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -327,9 +327,10 @@ BuildRequires: gcc-gfortran BuildRequires: gloo-devel %endif %endif +BuildRequires: libomp-devel +BuildRequires: numactl-devel BuildRequires: ninja-build BuildRequires: onnx-devel 
-BuildRequires: libomp-devel %if %{with distributed} %if %{with mpi} BuildRequires: openmpi-devel @@ -918,7 +919,6 @@ export USE_MAGMA=OFF export USE_MEM_EFF_ATTENTION=OFF export USE_MKLDNN=OFF export USE_MPI=OFF -export USE_MKLDNN=OFF export USE_NCCL=OFF export USE_NNPACK=OFF export USE_NUMPY=ON From 86185b46a2a46c8e9b3f1dfc9e2ecfe8d13a7fc4 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 25 Jul 2024 16:27:17 -0600 Subject: [PATCH 20/88] PyTorch 2.4 Signed-off-by: Tom Rix --- .gitignore | 4 + 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch | 47 + 0001-Optionally-use-hipblaslt.patch | 550 +++++++--- 0001-Patch-for-sleef-3.6.patch | 952 ++++++++++++++++++ 0001-disable-use-of-aotriton.patch | 78 +- python-torch.spec | 106 +- sources | 4 + 7 files changed, 1468 insertions(+), 273 deletions(-) create mode 100644 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch create mode 100644 0001-Patch-for-sleef-3.6.patch diff --git a/.gitignore b/.gitignore index 315fe1c..1ab81b0 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ /pytorch-97ff6cf.tar.gz /pytorch-v2.3.0.tar.gz /pytorch-v2.3.1.tar.gz +/pytorch-v2.4.0.tar.gz +/v1.14.2.tar.gz +/cpp-httplib-3b6597b.tar.gz +/kineto-be13176.tar.gz diff --git a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch new file mode 100644 index 0000000..413c60d --- /dev/null +++ b/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch @@ -0,0 +1,47 @@ +From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 20 Jul 2024 05:37:15 -0600 +Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM + +Signed-off-by: Tom Rix +--- + CMakeLists.txt | 1 + + cmake/Dependencies.cmake | 3 ++- + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index c4cd4b2c2a98..2068f7c6c4f2 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF + "USE_CUDNN" OFF) + 
cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) + option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) ++option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) + option(USE_KINETO "Use Kineto profiling library" ON) + option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) + option(USE_FAKELOWP "Use FakeLowp operators" OFF) +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index f1f2eb7cec31..192dac46f13b 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -706,6 +706,7 @@ endif() + + # ---[ FBGEMM + if(USE_FBGEMM) ++ if (NOT USE_SYSTEM_FBGEMM) + set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") + if(NOT DEFINED FBGEMM_SOURCE_DIR) + set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") +@@ -746,7 +747,7 @@ if(USE_FBGEMM) + target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) + endif() + endif() +- ++ endif() + if(USE_FBGEMM) + list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) + endif() +-- +2.45.1 + diff --git a/0001-Optionally-use-hipblaslt.patch b/0001-Optionally-use-hipblaslt.patch index 56434a7..1e5ca4b 100644 --- a/0001-Optionally-use-hipblaslt.patch +++ b/0001-Optionally-use-hipblaslt.patch @@ -1,174 +1,398 @@ -From d77e05d90df006322cda021f1a8affdcc2c7eaef Mon Sep 17 00:00:00 2001 +From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 From: Tom Rix -Date: Fri, 23 Feb 2024 08:27:30 -0500 +Date: Sat, 29 Jun 2024 04:18:34 -0700 Subject: [PATCH] Optionally use hipblaslt -The hipblaslt package is not available on Fedora. -Instead of requiring the package, make it optional. 
-If it is found, define the preprocessor variable HIPBLASLT -Convert the checks for ROCM_VERSION >= 507000 to HIPBLASLT checks - -Signed-off-by: Tom Rix --- - aten/src/ATen/cuda/CUDABlas.cpp | 7 ++++--- - aten/src/ATen/cuda/CUDABlas.h | 2 +- - aten/src/ATen/cuda/CUDAContextLight.h | 4 ++-- - aten/src/ATen/cuda/CublasHandlePool.cpp | 4 ++-- - aten/src/ATen/cuda/tunable/TunableGemm.h | 6 +++--- - aten/src/ATen/native/cuda/Blas.cpp | 14 ++++++++------ - cmake/Dependencies.cmake | 3 +++ - cmake/public/LoadHIP.cmake | 4 ++-- - 8 files changed, 25 insertions(+), 19 deletions(-) + aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ + aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ + aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- + aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- + aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- + cmake/Dependencies.cmake | 3 ++ + cmake/public/LoadHIP.cmake | 2 +- + 7 files changed, 82 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index d534ec5a178..e815463f630 100644 +index ce991a9bcad4..3f0d17b52778 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -14,7 +14,7 @@ +@@ -14,7 +14,9 @@ #include #ifdef USE_ROCM --#if ROCM_VERSION >= 60000 -+#ifdef HIPBLASLT ++#ifdef USE_HIPBLASLT #include - #endif ++#endif // until hipblas has an API to accept flags, we must use rocblas here -@@ -781,7 +781,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { - } - } + #include + #include +@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { + static size_t _parseChosenWorkspaceSize() { + const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); + #ifdef USE_ROCM ++#ifndef USE_HIPBLASLT ++ return 0; ++#endif + if (!val) { + // accept either env var + val = getenv("HIPBLASLT_WORKSPACE_SIZE"); +@@ -235,6 +240,7 @@ namespace at::cuda::blas { + } while (0) --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && 
ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) - #if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000 - // only for rocm 5.7 where we first supported hipblaslt, it was difficult -@@ -912,6 +912,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + namespace { + // Following the pattern of CuSparseDescriptor + // Defined here for now because this is the only place cublas_lt interface is +@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< }; } // namespace -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) +- + template + inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + cudaDataType_t abcType = CUDA_R_32F; +@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + " scaleType ", + scaleType); + } +- ++#endif + + template + inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { +@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) + template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); + } + } +@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + 
} + } +@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) + template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { + } + } + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + // forward to bgemm implementation but set strides and batches to 0 + bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); + } ++#endif + + template + inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { +@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); + } + } +@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); + } + } +@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + template <> + void 
gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } + } +@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { + } + } + +- ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) template void gemm_and_bias( bool transpose_mat1, -@@ -1124,7 +1125,7 @@ template void gemm_and_bias( - at::BFloat16* result_ptr, - int64_t result_ld, - GEMMAndBiasActivationEpilogue activation); -- +@@ -1410,7 +1435,7 @@ void scaled_gemm( + ScalarType result_dtype, + void* amax_ptr, + bool use_fast_accum) { +-#if CUDA_VERSION >= 11080 || defined(USE_ROCM) ++#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + const auto computeType = CUBLAS_COMPUTE_32F; + const auto scaleType = CUDA_R_32F; + const int8_t fastAccuMode = use_fast_accum ? 
1 : 0; +@@ -1681,6 +1706,7 @@ void int8_gemm( + " scaleType ", + scaleType); + } +#endif - void scaled_gemm( - char transa, - char transb, -diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h -index eb12bb350c5..068607467dd 100644 ---- a/aten/src/ATen/cuda/CUDABlas.h -+++ b/aten/src/ATen/cuda/CUDABlas.h -@@ -82,7 +82,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)); - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) - enum GEMMAndBiasActivationEpilogue { - None, - RELU, + template <> + void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h -index 4ec35f59a21..e28dc42034f 100644 +index f2b657ced51b..f0ee613c4208 100644 --- a/aten/src/ATen/cuda/CUDAContextLight.h +++ b/aten/src/ATen/cuda/CUDAContextLight.h -@@ -9,7 +9,7 @@ +@@ -9,7 +9,9 @@ // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also // added bf16 support --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) ++#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) #include - #endif ++#endif -@@ -82,7 +82,7 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); + #ifdef CUDART_VERSION + #include +@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); /* Handles */ TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) 
++#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); - #endif ++#endif + + TORCH_CUDA_CPP_API void clearCublasWorkspaces(); diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index 6913d2cd95e..3d4276be372 100644 +index 8eac525b3695..abfdf7a23847 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -29,7 +29,7 @@ namespace at::cuda { namespace { --#if defined(USE_ROCM) && ROCM_VERSION >= 50700 -+#if defined(USE_ROCM) && defined(HIPBLASLT) +-#if defined(USE_ROCM) ++#if defined(USE_ROCM) && defined(USE_HIPBLASLT) void createCublasLtHandle(cublasLtHandle_t *handle) { TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); } -@@ -190,7 +190,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { +@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { return handle; } --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) - cublasLtHandle_t getCurrentCUDABlasLtHandle() { +-cublasLtHandle_t getCurrentCUDABlasLtHandle() { #ifdef USE_ROCM ++#if defined(USE_HIPBLASLT) ++cublasLtHandle_t getCurrentCUDABlasLtHandle() { c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + +@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { + + auto handle = myPoolWindow->reserve(device); + return handle; ++} ++#endif + #else ++cublasLtHandle_t getCurrentCUDABlasLtHandle() { + return reinterpret_cast(getCurrentCUDABlasHandle()); +-#endif + } ++#endif + + } // namespace at::cuda diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h -index 3ba0d761277..dde1870cfbf 100644 +index 53e6154120c9..fa1d664696db 100644 --- a/aten/src/ATen/cuda/tunable/TunableGemm.h +++ b/aten/src/ATen/cuda/tunable/TunableGemm.h 
-@@ -11,7 +11,7 @@ +@@ -11,7 +11,9 @@ #include #ifdef USE_ROCM --#if ROCM_VERSION >= 50700 -+#ifdef HIPBLASLT ++#ifdef USE_HIPBLASLT #include - #endif ++#endif #include -@@ -166,7 +166,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } #endif - --#if defined(USE_ROCM) && ROCM_VERSION >= 50700 -+#if defined(USE_ROCM) && defined(HIPBLASLT) - static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env == nullptr || strcmp(env, "1") == 0) { - // disallow tuning of hipblaslt with c10::complex -@@ -240,7 +240,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + #include +@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> } - #endif - --#if defined(USE_ROCM) && ROCM_VERSION >= 50700 -+#if defined(USE_ROCM) && defined(HIPBLASLT) - static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env == nullptr || strcmp(env, "1") == 0) { - // disallow tuning of hipblaslt with c10::complex -diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 29e5c5e3cf1..df56f3d7f1d 100644 ---- a/aten/src/ATen/native/cuda/Blas.cpp -+++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -155,7 +155,7 @@ enum class Activation { - GELU, }; --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) - cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activation a) { - switch (a) { - case Activation::None: -@@ -193,6 +193,7 @@ static bool getDisableAddmmCudaLt() { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + class DefaultScaledGemmOp : public Callable> { + public: +@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { + return OK; + } + }; ++#endif + + template + inline bool IsZero(T v) { +@@ -191,6 +195,7 @@ static void AddRocblasValidator() { + } + } + ++#ifdef USE_HIPBLASLT + 
static void AddHipblasltValidator() { + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + if (validators.find("HIPBLASLT_VERSION") == validators.end()) { +@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { + [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); + } + } ++#endif + + static void AddRocmValidator() { + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); +@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { + } + AddRocblasValidator(); + } +- ++#ifdef USE_HIPBLASLT + static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { + rocm_validators = true; +@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { + } + AddHipblasltValidator(); + } +- ++#endif + if (rocm_validators) { + AddRocmValidator(); + } +@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + AddRocblasValidator(); + } +- ++#ifdef USE_HIPBLASLT + static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { + rocm_validators = true; +@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + AddHipblasltValidator(); + } +- ++#endif + if (rocm_validators) { + AddRocmValidator(); + } +@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + }; + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + class ScaledGemmTunableOp : public TunableOp, StreamTimer> { + public: +@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + + #if defined(USE_ROCM) ++#ifdef USE_HIPBLASLT + for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { + 
this->RegisterOp(std::move(name), std::move(op)); + } + AddHipblasltValidator(); ++#endif + AddRocmValidator(); + #endif + } +@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> + "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); + } + }; ++#endif + + #undef XSTRINGIFY + #undef STRINGIFY +diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp +index 84c59a4fd0d7..56ad5de3bf2d 100644 +--- a/aten/src/ATen/native/cuda/Blas.cpp ++++ b/aten/src/ATen/native/cuda/Blas.cpp +@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa + } + + static bool getDisableAddmmCudaLt() { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); + #ifdef USE_ROCM + // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) +@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { + } + return false; + #endif ++#else ++ return true; ++#endif + } #ifdef USE_ROCM static bool isSupportedHipLtROCmArch(int index) { -+#if defined(HIPBLASLT) ++#ifdef USE_HIPBLASLT hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); std::string device_arch = prop->gcnArchName; static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -203,6 +204,7 @@ static bool isSupportedHipLtROCmArch(int index) { +@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { } } TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); @@ -176,87 +400,107 @@ index 29e5c5e3cf1..df56f3d7f1d 100644 return false; } #endif -@@ -228,7 +230,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma +@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma at::ScalarType scalar_type = self.scalar_type(); c10::MaybeOwned self_; if (&result != &self) { --#if (defined(CUDA_VERSION) && 
CUDA_VERSION >= 11040 && !defined(_MSC_VER)) || defined(USE_ROCM) && ROCM_VERSION >= 50700 -+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER)) || defined(USE_ROCM) && defined(HIPBLASLT) ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) // Strangely, if mat2 has only 1 row or column, we get // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. - // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] -@@ -271,7 +273,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma +@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + scalar_type != at::ScalarType::BFloat16)); + #endif + } ++#endif + #endif + if (!useLtInterface) { + self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); } self__sizes = self_->sizes(); } else { --#if defined(USE_ROCM) && ROCM_VERSION >= 50700 -+#if defined(USE_ROCM) && defined(HIPBLASLT) +-#if defined(USE_ROCM) ++#if defined(USE_ROCM) && defined(USE_HIPBLASLT) useLtInterface = !disable_addmm_cuda_lt && result.dim() == 2 && result.is_contiguous() && isSupportedHipLtROCmArch(self.device().index()) && -@@ -322,7 +324,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - +@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) if (useLtInterface) { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + #if defined(USE_ROCM) AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, -@@ -876,7 +878,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const 
Tensor& mat2, - at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]}); - at::native::resize_output(amax, {}); +@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + activation_epilogue + ); + }); ++#endif + #endif + } else + { +@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { + } --#if !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) && ROCM_VERSION >= 60000) -+#if !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) && defined(HIPBLASLT)) - cublasCommonArgs args(mat1, mat2, out); - const auto out_dtype_ = args.result->scalar_type(); - TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt"); -@@ -906,7 +908,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - TORCH_CHECK(false, "_scaled_mm_out_cuda is not compiled for this platform."); + static bool _scaled_mm_allowed_device() { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + auto dprops = at::cuda::getCurrentDeviceProperties(); + #ifdef USE_ROCM + std::string device_arch = dprops->gcnArchName; +@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { + #else + return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); + #endif ++#else ++ return false; ++#endif + } + + // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax +@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + // Check sizes + bool allowed_device = _scaled_mm_allowed_device(); + TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + TORCH_CHECK( +@@ 
-1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 + // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately + amax = at::max(at::abs(out.to(kFloat))); ++#endif #endif --#if defined(USE_ROCM) && ROCM_VERSION >= 60000 -+#if defined(USE_ROCM) && defined(HIPBLASLT) - // rocm's hipblaslt does not yet support amax, so calculate separately - auto out_float32 = out.to(kFloat); - out_float32.abs_(); + return {out, amax}; diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index b7ffbeb07dc..2b6c3678984 100644 +index f1f2eb7cec31..8d05e834bbc5 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake -@@ -1273,6 +1273,9 @@ if(USE_ROCM) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "6.0.0") - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) - endif() +@@ -1052,6 +1052,9 @@ if(USE_ROCM) + list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) + list(APPEND HIP_CXX_FLAGS -std=c++17) + list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) + if(hipblast_FOUND) -+ list(APPEND HIP_CXX_FLAGS -DHIPBLASLT) ++ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) + endif() - if(HIPBLASLT_CUSTOM_DATA_TYPE) - list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_CUSTOM_DATA_TYPE) + if(HIP_NEW_TYPE_ENUMS) + list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) endif() diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index f6ca263c5e5..53eb0b63c1a 100644 +index fa39156031ff..df4836847fdf 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake -@@ -156,7 +156,7 @@ if(HIP_FOUND) +@@ -155,7 +155,7 @@ if(HIP_FOUND) + find_package_and_print_version(hiprand REQUIRED) find_package_and_print_version(rocblas REQUIRED) find_package_and_print_version(hipblas REQUIRED) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0") -- find_package_and_print_version(hipblaslt REQUIRED) -+ find_package_and_print_version(hipblaslt) - endif() +- 
find_package_and_print_version(hipblaslt REQUIRED) ++ find_package_and_print_version(hipblaslt) find_package_and_print_version(miopen REQUIRED) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0") -@@ -191,7 +191,7 @@ if(HIP_FOUND) - # roctx is part of roctracer - find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib) - -- if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0") -+ if(hipblastlt_FOUND) - # check whether hipblaslt is using its own datatype - set(file "${PROJECT_BINARY_DIR}/hipblaslt_test_data_type.cc") - file(WRITE ${file} "" + find_package_and_print_version(hipfft REQUIRED) + find_package_and_print_version(hipsparse REQUIRED) -- -2.43.2 +2.45.2 diff --git a/0001-Patch-for-sleef-3.6.patch b/0001-Patch-for-sleef-3.6.patch new file mode 100644 index 0000000..13aa208 --- /dev/null +++ b/0001-Patch-for-sleef-3.6.patch @@ -0,0 +1,952 @@ +From 273f23698c887b52e66c2abec8101b7398f0f9c4 Mon Sep 17 00:00:00 2001 +From: "Benjamin A. Beasley" +Date: Wed, 5 Jun 2024 11:06:02 -0400 +Subject: [PATCH] Patch for sleef 3.6 + +--- + ...ectorization-on-windows-submodule-sl.patch | 910 ++++++++++++++++++ + python-torch.spec | 11 + + 2 files changed, 921 insertions(+) + create mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch + +diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch +new file mode 100644 +index 000000000000..562f55b742c2 +--- /dev/null ++++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch +@@ -0,0 +1,910 @@ ++From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 ++From: Xu Han ++Date: Sun, 31 Mar 2024 03:07:32 +0000 ++Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] ++ (#118980) ++ ++Enable VEC on Windows OS. ++1. Fix some type defination gap between Windows and Linux. ++2. Fix some operator not support on Windows, such as [], /. ++3. Enable static sleef library build on Windows. ++4. 
Disable unsupported function overloading on MSVC. ++5. Upgrade submodule sleef lib, which fixed build issue on Windows. ++6. Fixed bazel build issues. ++7. Fix test app not link to sleef on Windows. ++ ++Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: ++```cmd ++git submodule sync ++git submodule update --init --recursive ++``` ++ ++Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 ++Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet ++--- ++ aten/src/ATen/CMakeLists.txt | 48 ++++++-------- ++ aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- ++ .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- ++ .../cpu/vec/vec256/vec256_complex_double.h | 7 +- ++ .../cpu/vec/vec256/vec256_complex_float.h | 7 +- ++ aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- ++ aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- ++ aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- ++ aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- ++ .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- ++ .../cpu/vec/vec512/vec512_complex_double.h | 7 +- ++ .../cpu/vec/vec512/vec512_complex_float.h | 7 +- ++ aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- ++ aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- ++ aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- ++ aten/src/ATen/cpu/vec/vec_base.h | 6 ++ ++ caffe2/CMakeLists.txt | 2 +- ++ third_party/sleef.BUILD | 3 +- ++ 18 files changed, 194 insertions(+), 93 deletions(-) ++ ++diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt ++index bf425af5fa9..58d5828e8ca 100644 ++--- a/aten/src/ATen/CMakeLists.txt +++++ b/aten/src/ATen/CMakeLists.txt ++@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") ++ list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) ++ endif() ++ ++-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) ++- # Preserve values for the main build ++- 
set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) ++- set(__aten_sleef_build_tests ${BUILD_TESTS}) ++- ++- # Unset our restrictive C++ flags here and reset them later. ++- # Remove this once we use proper target_compile_options. ++- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) ++- set(CMAKE_CXX_FLAGS) ++- ++- # Bump up optimization level for sleef to -O1, since at -O0 the compiler ++- # excessively spills intermediate vector registers to the stack ++- # and makes things run impossibly slowly ++- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) ++- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") ++- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) ++- else() ++- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") +++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) +++ if(NOT MSVC) +++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler +++ # excessively spills intermediate vector registers to the stack +++ # and makes things run impossibly slowly +++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) +++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") +++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) +++ else() +++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") +++ endif() ++ endif() ++ ++ if(NOT USE_SYSTEM_SLEEF) ++- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) ++- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) ++- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) ++- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) ++- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) +++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) +++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) +++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) +++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) +++ set(SLEEF_BUILD_SCALAR_LIB 
OFF CACHE BOOL "libsleefscalar will be built." FORCE) ++ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") ++ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") ++ set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) ++@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) ++ endif() ++ list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) ++ ++- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) ++- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) ++- ++- # Set these back. TODO: Use SLEEF_ to pass these instead ++- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) ++- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) +++ if(NOT MSVC) +++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) +++ endif() ++ endif() ++ ++ if(USE_CUDA AND NOT USE_ROCM) ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h ++index 800b027e469..c431fa3c605 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h ++@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { ++ } ++ ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) ++ } ++ ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++- +++#ifndef _MSC_VER +++// MSVC is not working well on complex function overload. 
++ template ++ std::enable_if_t> ++ inline gather(const double* base_addr, const Vectorized& vindex) { ++@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { ++ return _mm256_i32gather_ps(base_addr, vindex, scale); ++ } ++- +++#endif ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++- +++#ifndef _MSC_VER +++// MSVC is not working well on complex function overload. ++ template ++ std::enable_if_t> ++ inline mask_gather(const Vectorized& src, const double* base_addr, ++@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, ++ const Vectorized& vindex, Vectorized& mask) { ++ return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); ++ } ++- +++#endif ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++ // Only works for inputs in the range: [-2^51, 2^51] ++@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { ++ return flip8(v); ++ } ++ ++-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#endif // (defined(CPU_CAPABILITY_AVX2) ++ ++ }} // namepsace at::vec::CPU_CAPABILITY ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h ++index 3e26213d6d2..66557436c70 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -18,7 +19,18 @@ namespace at::vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++ +++#ifndef SLEEF_CONST +++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +++#define SLEEF_CONST const +++#else +++#define SLEEF_CONST +++#endif +++#define SLEEF_CONST_OLD 
SLEEF_CONST +++#else +++#define SLEEF_CONST_OLD +++#endif ++ ++ // bfloat16 conversion ++ static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { ++@@ -265,7 +277,8 @@ public: ++ } ++ return b; ++ } ++- Vectorized map(const __m256 (*const vop)(__m256)) const { +++ +++ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { ++ __m256 lo, hi; ++ cvt_to_fp32(values, lo, hi); ++ const auto o1 = vop(lo); ++@@ -1026,7 +1039,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V ++ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); ++ CONVERT_VECTORIZED_INIT(Half, half); ++ ++-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#else // defined(CPU_CAPABILITY_AVX2) ++ ++ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ ++ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ ++@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V ++ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); ++ CONVERT_NON_VECTORIZED_INIT(Half, half); ++ ++-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#endif // defined(CPU_CAPABILITY_AVX2) ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ ++ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ ++ auto values = _mm_loadu_si128(reinterpret_cast(data)); \ ++@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec ++ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); ++ LOAD_FP32_VECTORIZED_INIT(Half, fp16); ++ ++-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#else // defined(CPU_CAPABILITY_AVX2) ++ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ ++ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ ++ __at_align__ float values[Vectorized::size()]; \ ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h 
b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h ++index f93ea1e63c3..6c198fb37d3 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h ++@@ -8,7 +8,8 @@ ++ #include ++ #include ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -16,7 +17,7 @@ namespace at::vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ template <> class Vectorized> { ++ private: ++@@ -145,7 +146,7 @@ public: ++ auto abs = abs_(); ++ auto zero = _mm256_setzero_pd(); ++ auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); ++- auto div = values / abs; +++ auto div = _mm256_div_pd(values, abs); ++ return _mm256_blendv_pd(div, zero, mask); ++ } ++ __m256d real_() const { ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h ++index 7c142c04b79..c72d4d49274 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -15,7 +16,7 @@ namespace at::vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ template <> class Vectorized> { ++ private: ++@@ -180,7 +181,7 @@ public: ++ auto abs = abs_(); ++ auto zero = _mm256_setzero_ps(); ++ auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); ++- auto div = values / abs; +++ auto div = _mm256_div_ps(values, abs); ++ return _mm256_blendv_ps(div, zero, mask); ++ } ++ __m256 real_() 
const { ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h ++index bc82d07edd1..bed6da627af 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h ++@@ -6,7 +6,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -15,7 +16,7 @@ namespace at::vec { ++ inline namespace CPU_CAPABILITY { ++ ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ template <> class Vectorized { ++ private: ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h ++index 886809a0b8a..0e3664cd37b 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h ++@@ -6,7 +6,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -14,7 +15,7 @@ namespace at::vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ template <> class Vectorized { ++ private: ++@@ -226,14 +227,14 @@ public: ++ static __m256 vec_factorial_5 = ++ _mm256_set1_ps(0.00828929059f); // 1/factorial(5) ++ static __m256 vec_exp_log2ef = ++- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) +++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) ++ static __m256 vec_half = _mm256_set1_ps(0.5f); ++ static __m256 vec_one = _mm256_set1_ps(1.f); ++ static __m256 vec_zero = _mm256_set1_ps(0.f); ++ static __m256 vec_two = _mm256_set1_ps(2.f); ++- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) ++- static __m256 vec_ln_flt_min = 
(__m256)_mm256_set1_epi32(0xc2aeac50); ++- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); +++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) +++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); +++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); ++ static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); ++ static int n_mantissa_bits = 23; ++ ++@@ -266,7 +267,7 @@ public: ++ auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); ++ auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); ++ vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); ++- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; +++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); ++ vec_two_pow_n = ++ _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); ++ ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h ++index 4128841701a..85e099904cd 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h ++@@ -41,11 +41,17 @@ ++ namespace at::vec { ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ +++#ifdef _MSC_VER +++__declspec(align(64)) struct Vectorizedqi { +++ protected: +++ __m256i vals; +++#else ++ struct Vectorizedqi { ++ protected: ++ __m256i vals __attribute__((aligned(64))); +++#endif ++ ++ public: ++ Vectorizedqi() {} ++@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { ++ } ++ ++ template ++-inline void __attribute__((always_inline)) QuantizeAvx2( +++__FORCE_INLINE void QuantizeAvx2( ++ const float* src, ++ T* dst, ++ int len, ++@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V ++ return a.maximum(b); ++ } ++ ++-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) 
+++#endif // if defined(CPU_CAPABILITY_AVX2) ++ }} // namespace at::vec::CPU_CAPABILITY ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h ++index fe96d123e64..87f723d782c 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h ++@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { ++ } ++ ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) ++ } ++ ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++- +++#ifndef _MSC_VER +++// MSVC is not working well on complex function overload. ++ template ++ std::enable_if_t> ++ inline gather(const double* base_addr, const Vectorized& vindex) { ++@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { ++ return _mm512_i32gather_ps(vindex, base_addr, scale); ++ } ++- +++#endif ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++- +++#ifndef _MSC_VER +++// MSVC is not working well on complex function overload. 
++ template ++ std::enable_if_t> ++ inline mask_gather(const Vectorized& src, const double* base_addr, ++@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, ++ auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); ++ return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); ++ } ++- +++#endif ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++ template<> ++@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { ++ return flip8(v); ++ } ++ ++-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#endif // defined(CPU_CAPABILITY_AVX512) ++ ++ }}} ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h ++index f9fc92d52bf..eb3b6a72240 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -16,7 +17,18 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) +++ +++#ifndef SLEEF_CONST +++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +++#define SLEEF_CONST const +++#else +++#define SLEEF_CONST +++#endif +++#define SLEEF_CONST_OLD SLEEF_CONST +++#else +++#define SLEEF_CONST_OLD +++#endif ++ ++ // bfloat16 conversion ++ static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { ++@@ -362,7 +374,8 @@ public: ++ } ++ #pragma clang diagnostic push ++ #pragma clang diagnostic ignored "-Wignored-qualifiers" ++- Vectorized map(const __m512 (*const vop)(__m512)) const { +++ +++ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { ++ __m512 lo, hi; ++ 
cvt_to_fp32(values, lo, hi); ++ const auto o1 = vop(lo); ++@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V ++ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); ++ CONVERT_VECTORIZED_INIT(Half, half); ++ ++-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#else //defined(CPU_CAPABILITY_AVX512) ++ ++ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ ++ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ ++@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V ++ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); ++ CONVERT_NON_VECTORIZED_INIT(Half, half); ++ ++-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#endif // defined(CPU_CAPABILITY_AVX512) ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ ++ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ ++ auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ ++@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec ++ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); ++ LOAD_FP32_VECTORIZED_INIT(Half, fp16); ++ ++-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#else // defined(CPU_CAPABILITY_AVX512) ++ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ ++ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ ++ __at_align__ float values[Vectorized::size()]; \ ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h ++index 02aa3a87cc1..c35204f9da2 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) 
+++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -16,7 +17,7 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ template <> class Vectorized> { ++ private: ++@@ -203,7 +204,7 @@ public: ++ auto abs = abs_(); ++ auto zero = _mm512_setzero_pd(); ++ auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); ++- auto div = values / abs; +++ auto div = _mm512_div_pd(values, abs); ++ return _mm512_mask_blend_pd(mask, div, zero); ++ } ++ __m512d real_() const { ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h ++index a5d790c98b2..2801e484d94 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -16,7 +17,7 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ template <> class Vectorized> { ++ private: ++@@ -708,7 +709,7 @@ public: ++ auto abs = abs_(); ++ auto zero = _mm512_setzero_ps(); ++ auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); ++- auto div = values / abs; +++ auto div = _mm512_div_ps(values, abs); ++ return _mm512_mask_blend_ps(mask, div, zero); ++ } ++ __m512 real_() const { ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h ++index 27b2753c903..508ab257e60 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h ++@@ -6,7 +6,8 @@ ++ #include ++ #include ++ #include ++-#if 
(defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) +++#if (defined(CPU_CAPABILITY_AVX512)) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -15,7 +16,7 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ template <> class Vectorized { ++ private: ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h ++index ba5738687fd..a08df3c141a 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h ++@@ -6,7 +6,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -15,7 +16,7 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ template <> class Vectorized { ++ private: ++@@ -246,14 +247,14 @@ public: ++ static __m512 vec_factorial_5 = ++ _mm512_set1_ps(0.00828929059f); // 1/factorial(5) ++ static __m512 vec_exp_log2ef = ++- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) +++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) ++ static __m512 vec_half = _mm512_set1_ps(0.5f); ++ static __m512 vec_one = _mm512_set1_ps(1.f); ++ static __m512 vec_zero = _mm512_set1_ps(0.f); ++ static __m512 vec_two = _mm512_set1_ps(2.f); ++- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) ++- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); ++- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); +++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) +++ static __m512 vec_ln_flt_min = 
_mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); +++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); ++ static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); ++ static int n_mantissa_bits = 23; ++ ++@@ -288,7 +289,7 @@ public: ++ auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); ++ auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); ++ vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); ++- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; +++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); ++ vec_two_pow_n = ++ _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); ++ ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h ++index e0713d01312..a5671ed4a50 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h ++@@ -42,11 +42,17 @@ namespace at { ++ namespace vec { ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ +++#ifdef _MSC_VER +++__declspec(align(64)) struct Vectorizedqi { +++ protected: +++ __m512i vals; +++#else ++ struct Vectorizedqi { ++ protected: ++ __m512i vals __attribute__((aligned(64))); +++#endif ++ ++ public: ++ Vectorizedqi() {} ++@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { ++ } ++ ++ template ++-inline void __attribute__((always_inline)) QuantizeAvx512( +++__FORCE_INLINE void QuantizeAvx512( ++ const float* src, ++ T* dst, ++ int len, ++@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { ++ Vectorized scale, ++ Vectorized zero_point, ++ Vectorized scale_neg_zp_premul) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = 
_mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); ++ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); ++@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { ++ float_vec_return_type dequantize( ++ Vectorized scale, ++ Vectorized zero_point) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); ++ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); ++@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { ++ } ++ ++ int_vec_return_type widening_subtract(Vectorized b) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = 
_mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512i int32_val0 = cvtepi8_epi32(int_val0); ++ __m512i int32_val1 = cvtepi8_epi32(int_val1); ++ __m512i int32_val2 = cvtepi8_epi32(int_val2); ++ __m512i int32_val3 = cvtepi8_epi32(int_val3); ++ +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); +++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); +++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); +++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); +++ #else ++ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); ++ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); ++ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); ++ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); +++ #endif ++ ++ __m512i int32_b0 = cvtepi8_epi32(int_b0); ++ __m512i int32_b1 = cvtepi8_epi32(int_b1); ++@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { ++ Vectorized scale, ++ Vectorized zero_point, ++ Vectorized scale_zp_premul) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); ++ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); ++@@ -746,10 +787,17 @@ struct 
Vectorized : public Vectorizedqi { ++ float_vec_return_type dequantize( ++ Vectorized scale, ++ Vectorized zero_point) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); ++ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); ++@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { ++ } ++ ++ int_vec_return_type widening_subtract(Vectorized b) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512i int32_val0 = cvtepu8_epi32(int_val0); ++ __m512i int32_val1 = cvtepu8_epi32(int_val1); ++ __m512i int32_val2 = cvtepu8_epi32(int_val2); ++ __m512i int32_val3 = cvtepu8_epi32(int_val3); ++ +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); +++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], 
b.vals.m512i_u64[2]); +++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); +++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); +++ #else ++ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); ++ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); ++ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); ++ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); +++ #endif ++ ++ __m512i int32_b0 = cvtepu8_epi32(int_b0); ++ __m512i int32_b1 = cvtepu8_epi32(int_b1); ++diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h ++index adf81dd915c..20cb8ef6dbc 100644 ++--- a/aten/src/ATen/cpu/vec/vec_base.h +++++ b/aten/src/ATen/cpu/vec/vec_base.h ++@@ -36,6 +36,12 @@ ++ #include ++ #include ++ +++#if defined(__GNUC__) +++#define __FORCE_INLINE __attribute__((always_inline)) inline +++#elif defined(_MSC_VER) +++#define __FORCE_INLINE __forceinline +++#endif +++ ++ // These macros helped us unify vec_base.h ++ #ifdef CPU_CAPABILITY_AVX512 ++ #if defined(__GNUC__) ++diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt ++index a6b6f0f7d1d..15d37cf4861 100644 ++--- a/caffe2/CMakeLists.txt +++++ b/caffe2/CMakeLists.txt ++@@ -1787,7 +1787,7 @@ if(BUILD_TEST) ++ endif() ++ else() ++ add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") ++- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) +++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) ++ endif() ++ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) ++ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) ++diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD ++index 573f9c5b54a..f22a6e905e2 100644 ++--- a/third_party/sleef.BUILD +++++ b/third_party/sleef.BUILD ++@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ ++ SLEEF_PRIVATE_INCLUDES = [ ++ "-Iexternal/sleef/src/arch", ++ "-Iexternal/sleef/src/common", +++ 
"-Iexternal/sleef/src/libm", ++ ] ++ ++ SLEEF_PUBLIC_INCLUDES = [ ++@@ -201,8 +202,6 @@ cc_library( ++ srcs = [ ++ "src/libm/rempitab.c", ++ "src/libm/sleefdp.c", ++- "src/libm/sleefld.c", ++- "src/libm/sleefqp.c", ++ "src/libm/sleefsp.c", ++ ], ++ hdrs = SLEEF_PUBLIC_HEADERS, ++-- ++2.45.1 ++ +diff --git a/python-torch.spec b/python-torch.spec +index d50687a5174a..63600c2e8c39 100644 +--- a/python-torch.spec ++++ b/python-torch.spec +@@ -176,6 +176,17 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch + Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch + %endif + ++# Enable x86 CPU vectorization on windows [submodule sleef] (#118980) ++# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370 ++# ++# Despite the title, this patch fixes compatibility with sleef 3.6 by including ++# a backwards-compatible version of the fix from ++# https://github.com/pytorch/pytorch/pull/122723. ++# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef ++# git submodule (because the release archive contains an actual sleef source ++# tree instead, so this would not apply.) 
++Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch ++ + %if %{with rocm} + # ROCm patches + # https://github.com/pytorch/pytorch/pull/120551 +-- +2.45.1 + diff --git a/0001-disable-use-of-aotriton.patch b/0001-disable-use-of-aotriton.patch index 34a1704..61ffd1e 100644 --- a/0001-disable-use-of-aotriton.patch +++ b/0001-disable-use-of-aotriton.patch @@ -1,46 +1,94 @@ -From 33d48f71db7530f00dbd8cff281b65aa8b355b2a Mon Sep 17 00:00:00 2001 +From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 From: Tom Rix -Date: Tue, 19 Mar 2024 11:32:37 -0400 +Date: Sat, 29 Jun 2024 07:06:09 -0700 Subject: [PATCH] disable use of aotriton --- - aten/src/ATen/native/transformers/cuda/sdp_utils.cpp | 6 ++++++ - 1 file changed, 6 insertions(+) + .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -index 96b839820efd..2d3dd0cb4b0f 100644 +index 214b02d8262e..7b3eb9dcd8cd 100644 --- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -@@ -21,9 +21,11 @@ - #include - #include +@@ -19,9 +19,12 @@ + #include + #include +#ifdef USE_FLASH_ATTENTION #if USE_ROCM #include #endif +#endif ++ /** * Note [SDPA Runtime Dispatch] -@@ -183,6 +185,7 @@ bool check_sm_version(cudaDeviceProp * dprops) { - } +@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { -+#ifdef USE_FLASH_ATTENTION // Check that the gpu is capable of running flash attention ++#ifndef USE_FLASH_ATTENTION ++ return false; ++#else using sm80 = SMVersion<8, 0>; using sm90 = SMVersion<9, 0>; -@@ -211,6 +214,9 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug + #if USE_ROCM +@@ -209,9 +215,13 @@ bool 
check_flash_attention_hardware_support(sdp_params const& params, bool debug } #endif return true; -+#else -+ return false; +#endif } bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { ++#ifndef USE_FLASH_ATTENTION ++ return false; ++#else + // Mem Efficient attention supports hardware in the range [sm_50, sm_90] + using sm50 = SMVersion<5, 0>; + using sm90 = SMVersion<9, 0>; +@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) + } + #endif + return true; ++#endif + } + + bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( +@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { + #ifndef USE_FLASH_ATTENTION + TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); + return false; +-#endif ++#else + + // Define gate functions that determine if a flash kernel can be ran + // Replace with std::to_array when we migrate to c++20 +@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { + } + } + return true; ++#endif + } + + bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { + #ifndef USE_MEM_EFF_ATTENTION + TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); + return false; +-#endif ++#else + // Constraints specific to mem efficient attention + constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = + array_of(at::kHalf, at::kFloat, at::kBFloat16); +@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { + } + #endif + return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); ++#endif + } + + SDPBackend select_sdp_backend(sdp_params const& kernel_params) { -- -2.44.0 +2.45.2 diff --git a/python-torch.spec b/python-torch.spec index 1dad38f..7bcc4bf 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -12,7 +12,7 @@ %global date0 20240709 %global pypi_version 
2.4.0 %else -%global pypi_version 2.3.1 +%global pypi_version 2.4.0 %endif # For -test subpackage @@ -63,18 +63,6 @@ %bcond_with distributed %endif -# OpenCV support came in F41 -%if 0%{?fedora} > 40 -%if %{without gitcommit} -%bcond_without opencv -%else -# USE_OPENCV removed in 2.4.0+ -%bcond_with opencv -%endif -%else -%bcond_with opencv -%endif - # Do no confuse xnnpack versions %if 0%{?fedora} > 40 %bcond_without xnnpack @@ -95,14 +83,10 @@ %endif %if 0%{?fedora} > 40 -%if %{with gitcommit} %bcond_without fbgemm %else %bcond_with fbgemm %endif -%else -%bcond_with fbgemm -%endif # For testing cuda %ifarch x86_64 @@ -139,15 +123,9 @@ %endif # These came in 2.4 and not yet in Fedora -%if %{with gitcommit} %bcond_with opentelemetry %bcond_with httplib %bcond_with kineto -%else -%bcond_without opentelemetry -%bcond_without httplib -%bcond_without kineto -%endif Name: python-%{pypi_name} %if %{with gitcommit} @@ -220,7 +198,6 @@ Source40: https://github.com/Maratyszcza/pthreadpool/archive/%{pt_commit}/ Source50: https://github.com/mreineck/pocketfft/archive/%{pf_commit}/pocketfft-%{pf_scommit}.tar.gz %endif -%if %{with gitcommit} %if %{without opentelemetry} %global ot_ver 1.14.2 Source60: https://github.com/open-telemetry/opentelemetry-cpp/archive/refs/tags/v%{ot_ver}.tar.gz @@ -237,50 +214,15 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- %global ki_scommit %(c=%{ki_commit}; echo ${c:0:7}) Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif -%endif Patch0: 0001-no-third_party-foxi.patch -%if %{with gitcommit} # https://github.com/pytorch/pytorch/pull/131282 Patch1: 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch -%endif - -%if %{without gitcommit} -Patch3: 0001-Stub-in-kineto-ActivityType.patch -%endif %if %{with caffe2} Patch6: 0001-reenable-foxi-linking.patch %endif -# Bring some patches forward -%if %{without gitcommit} -# https://github.com/pytorch/pytorch/pull/123384 -# Breaks on 
python 3.13 -# Patch7: 0001-Reenable-dim-for-python-3.12.patch - -# Dynamo/Inductor on 3.12 -# Fails to apply on 2.3.1 -# Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch -%endif - -%if %{without gitcommit} -# Enable x86 CPU vectorization on windows [submodule sleef] (#118980) -# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370 -# -# Despite the title, this patch fixes compatibility with sleef 3.6 by including -# a backwards-compatible version of the fix from -# https://github.com/pytorch/pytorch/pull/122723. -# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef -# git submodule (because the release archive contains an actual sleef source -# tree instead, so this would not apply.) -Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch - -# For Python 3.13 -# https://github.com/pytorch/pytorch/pull/126033 -Patch10: 0001-Changes-to-compile-with-3.13-126033.patch -%endif - # ROCm patches # Patches need to be refactored for ToT # These are ROCm packages @@ -291,9 +233,6 @@ Patch100: 0001-Optionally-use-hipblaslt.patch %endif Patch101: 0001-cuda-hip-signatures.patch Patch102: 0001-silence-an-assert.patch -%if %{without gitcommit} -Patch103: 0001-can-not-use-with-c-files.patch -%endif Patch105: 0001-disable-use-of-aotriton.patch %endif Patch106: 0001-include-fmt-ranges.h-for-using-fmt-join.patch @@ -416,10 +355,6 @@ BuildRequires: libcurand-devel-%{curand_ver} BuildRequires: libcusparse-devel-%{cusparse_ver} %endif -%if %{with opencv} -BuildRequires: opencv-devel -%endif - %if %{with test} BuildRequires: google-benchmark-devel %endif @@ -627,7 +562,6 @@ rm -rf third_party/pocketfft/* cp -r pocketfft-*/* third_party/pocketfft/ %endif -%if %{with gitcommit} %if %{without opentelemtry} tar xf %{SOURCE60} rm -rf third_party/opentelemetry-cpp/* @@ -645,16 +579,6 @@ tar xf %{SOURCE80} rm -rf third_party/kineto/* cp -r kineto-*/* third_party/kineto/ %endif -%endif - -%if %{with 
opencv} -%if %{without gitcommit} -# Reduce requirements, *FOUND is not set -sed -i -e 's/USE_OPENCV AND OpenCV_FOUND AND USE_FFMPEG AND FFMPEG_FOUND/USE_OPENCV AND USE_FFMPEG/' caffe2/video/CMakeLists.txt -sed -i -e 's/USE_OPENCV AND OpenCV_FOUND/USE_OPENCV/' caffe2/image/CMakeLists.txt -sed -i -e 's/STATUS/FATAL/' caffe2/image/CMakeLists.txt -%endif -%endif # hipblaslt only building with gfx90a %if %{with hipblaslt} @@ -810,18 +734,8 @@ mkdir third_party/pocketfft mkdir third_party/valgrind-headers cp %{_includedir}/valgrind/* third_party/valgrind-headers -%if %{without gitcommit} -# Remove unneeded OpenCL files that confuse the lincense scanner -rm caffe2/contrib/opencl/OpenCL/cl.hpp -rm caffe2/mobile/contrib/libopencl-stub/include/CL/*.h -rm caffe2/mobile/contrib/libopencl-stub/include/CL/*.hpp -%endif - # Fix installing to /usr/lib64 -%if %{with gitcommit} sed -i -e 's@DESTINATION ${PYTHON_LIB_REL_PATH}@DESTINATION ${CMAKE_INSTALL_PREFIX}/${PYTHON_LIB_REL_PATH}@' caffe2/CMakeLists.txt -%endif - %if %{with rocm} # hipify @@ -924,9 +838,6 @@ export USE_NNPACK=OFF export USE_NUMPY=ON export USE_OPENMP=ON export USE_PYTORCH_QNNPACK=OFF -%if %{without gitcommit} -export USE_QNNPACK=OFF -%endif export USE_ROCM=OFF export USE_SYSTEM_SLEEF=ON export USE_SYSTEM_EIGEN_INSTALL=ON @@ -949,12 +860,6 @@ export USE_SYSTEM_PSIMD=ON export USE_SYSTEM_XNNPACK=ON %endif -%if %{with caffe2} -%if %{without gitcommit} -export BUILD_CAFFE2=ON -%endif -%endif - %if %{with cuda} %if %{without rocm} export CPLUS_INCLUDE_PATH=/usr/local/cuda-%{cuda_ver}/include @@ -982,10 +887,6 @@ export USE_MPI=ON %endif %endif -%if %{with opencv} -export USE_OPENCV=ON -%endif - %if %{with test} export BUILD_TEST=ON %endif @@ -1097,11 +998,6 @@ done %{python3_sitearch}/%{pypi_name}-*.egg-info %{python3_sitearch}/functorch %{python3_sitearch}/torchgen -%if %{without gitcommit} -%if %{with caffe2} -%{python3_sitearch}/caffe2 -%endif -%endif %if %{with rocm} %files -n python3-%{pypi_name}-rocm-gfx8 diff 
--git a/sources b/sources index 60cce58..a4dbc9a 100644 --- a/sources +++ b/sources @@ -12,3 +12,7 @@ SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3 SHA512 (pytorch-97ff6cf.tar.gz) = 105ebcba298558fe833f90e7e40b003d35a74609e777f9dc4c47f5668c884f603455113ac0ff252a62b83c81137ae66ceb1a862d351203925dcfc3dcf9f73580 SHA512 (pytorch-v2.3.0.tar.gz) = 0c2ffc7bf2fd86070e9958c34eca1f03a0248a011ac6ffaeb69f65306ff856edd5359986f02af25888433187e6d7f29b60edded092e2ac30c8cec49023166eda SHA512 (pytorch-v2.3.1.tar.gz) = fe132251b2bae87b70ba3d95dc32f6a4545970d11893118b0ebe6ca129732e516ef4d6cc4f380b3db9bb2277d1db8ce78a401c40149bb1dfbab76eab9e3992c4 +SHA512 (pytorch-v2.4.0.tar.gz) = bcfca6aefee00d15d1c0a2456cd9d802d9a8e96816d421d10e3eed5c608bfdf23dfae492ca3638c0bae99ef5bb8c98f4774c0b9f1a8b94d4dc36a52226033314 +SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7 +SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 +SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a From f30087e5e738000876d4eb83610ecae43863d915 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9rgio=20M=2E=20Basto?= Date: Thu, 25 Jul 2024 23:51:21 +0100 Subject: [PATCH 21/88] Rebuild for opencv 4.10.0 From 2efd1f0d79ce0f352537a3387e7714f850a2e832 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 27 Jul 2024 08:58:44 -0400 Subject: [PATCH 22/88] Fbgemm not available on aarch64 Signed-off-by: Tom Rix --- python-torch.spec | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python-torch.spec b/python-torch.spec index 7bcc4bf..d3b1156 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -83,10 +83,14 @@ %endif %if 0%{?fedora} > 40 +%ifarch x86_64 %bcond_without 
fbgemm %else %bcond_with fbgemm %endif +%else +%bcond_with fbgemm +%endif # For testing cuda %ifarch x86_64 From 5b0238639fecc7fa8d9182a355b5b268bc9109be Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 3 Aug 2024 20:47:55 -0700 Subject: [PATCH 23/88] Simplify ROCm gpu list gfx8 is obsolete gfx110,gfx942 and gfx90a are redundant. Signed-off-by: Tom Rix --- python-torch.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-torch.spec b/python-torch.spec index d3b1156..d19b477 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -34,7 +34,7 @@ %bcond_with hipblaslt # Which families gpu build for %if 0%{?fedora} > 40 -%global rocm_gpu_list gfx8 gfx9 gfx10 gfx11 gfx90a gfx942 gfx1100 +%global rocm_gpu_list gfx9 gfx10 gfx11 %bcond_without magma %else # F40 From da9c85c23d2da82221c3bbfe50e4e328408d562b Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 4 Aug 2024 06:10:29 -0700 Subject: [PATCH 24/88] Remove the packages --- python-torch.spec | 49 ----------------------------------------------- 1 file changed, 49 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index d19b477..8cdb28a 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -33,14 +33,8 @@ # hipblaslt is in development %bcond_with hipblaslt # Which families gpu build for -%if 0%{?fedora} > 40 %global rocm_gpu_list gfx9 gfx10 gfx11 %bcond_without magma -%else -# F40 -%global rocm_gpu_list gfx8 gfx9 gfx10 gfx11 -%bcond_with magma -%endif %global rocm_default_gpu default %bcond_without rocm_loop @@ -439,13 +433,6 @@ Summary: %{name} for CUDA %{summary} %endif -%if %{with rocm} -%package -n python3-%{pypi_name}-rocm-gfx8 -Summary: %{name} for ROCm gfx8 - -%description -n python3-%{pypi_name}-rocm-gfx8 -%{summary} - %package -n python3-%{pypi_name}-rocm-gfx9 Summary: %{name} for ROCm gfx9 @@ -464,25 +451,6 @@ Summary: %{name} for ROCm gfx11 %description -n python3-%{pypi_name}-rocm-gfx11 %{summary} -%if 0%{?fedora} > 40 -%package -n python3-%{pypi_name}-rocm-gfx90a 
-Summary: %{name} for ROCm MI200 -%description -n python3-%{pypi_name}-rocm-gfx90a -%{summary} - -%package -n python3-%{pypi_name}-rocm-gfx942 -Summary: %{name} for ROCm MI300 -%description -n python3-%{pypi_name}-rocm-gfx942 -%{summary} - -%package -n python3-%{pypi_name}-rocm-gfx1100 -Summary: %{name} for W7900 -%description -n python3-%{pypi_name}-rocm-gfx1100 -%{summary} -%endif - -%endif - %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -1004,9 +972,6 @@ done %{python3_sitearch}/torchgen %if %{with rocm} -%files -n python3-%{pypi_name}-rocm-gfx8 -%{_libdir}/rocm/gfx8/bin/* -%{_libdir}/rocm/gfx8/lib64/* %files -n python3-%{pypi_name}-rocm-gfx9 %{_libdir}/rocm/gfx9/bin/* @@ -1020,20 +985,6 @@ done %{_libdir}/rocm/gfx11/bin/* %{_libdir}/rocm/gfx11/lib64/* -%if 0%{?fedora} > 40 -%files -n python3-%{pypi_name}-rocm-gfx90a -%{_libdir}/rocm/gfx90a/bin/* -%{_libdir}/rocm/gfx90a/lib64/* - -%files -n python3-%{pypi_name}-rocm-gfx942 -%{_libdir}/rocm/gfx942/bin/* -%{_libdir}/rocm/gfx942/lib64/* - -%files -n python3-%{pypi_name}-rocm-gfx1100 -%{_libdir}/rocm/gfx1100/bin/* -%{_libdir}/rocm/gfx1100/lib64/* -%endif - %endif %changelog From 548a9ad819e634a47915c4672601cf77931421df Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 5 Aug 2024 05:02:57 -0700 Subject: [PATCH 25/88] Enable hipblaslt Remove gfx10,gfx11 - koji taking too long to build. 
Signed-off-by: Tom Rix --- python-torch.spec | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 8cdb28a..fd46384 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -23,20 +23,13 @@ %bcond_with test %ifarch x86_64 -# ROCm support came in F40 -%if 0%{?fedora} > 39 %bcond_without rocm -%else -%bcond_with rocm %endif -%endif -# hipblaslt is in development -%bcond_with hipblaslt -# Which families gpu build for -%global rocm_gpu_list gfx9 gfx10 gfx11 +%bcond_without hipblaslt %bcond_without magma -%global rocm_default_gpu default %bcond_without rocm_loop +%global rocm_default_gpu default +%global rocm_gpu_list gfx9 # Caffe2 support came in F41 %if 0%{?fedora} > 40 @@ -439,18 +432,6 @@ Summary: %{name} for ROCm gfx9 %description -n python3-%{pypi_name}-rocm-gfx9 %{summary} -%package -n python3-%{pypi_name}-rocm-gfx10 -Summary: %{name} for ROCm gfx10 - -%description -n python3-%{pypi_name}-rocm-gfx10 -%{summary} - -%package -n python3-%{pypi_name}-rocm-gfx11 -Summary: %{name} for ROCm gfx11 - -%description -n python3-%{pypi_name}-rocm-gfx11 -%{summary} - %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -977,14 +958,6 @@ done %{_libdir}/rocm/gfx9/bin/* %{_libdir}/rocm/gfx9/lib64/* -%files -n python3-%{pypi_name}-rocm-gfx10 -%{_libdir}/rocm/gfx10/bin/* -%{_libdir}/rocm/gfx10/lib64/* - -%files -n python3-%{pypi_name}-rocm-gfx11 -%{_libdir}/rocm/gfx11/bin/* -%{_libdir}/rocm/gfx11/lib64/* - %endif %changelog From c5c84ab4b2aa5a0f8b51b1ce537e3715409d7ea2 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 7 Aug 2024 11:13:43 -0700 Subject: [PATCH 26/88] Disable fbgemm with rocm With building python-torchvision, there is this link error. 
trix@fedora:~/fedora/python-torchvision$ c++filt _ZN2at4cuda4blas9int8_gemmEbblllPKalS3_lPil at::cuda::blas::int8_gemm(bool, bool, long, long, long, signed char const*, long, signed char const*, long,\ int*, long) So disable fbgemm with rocm is enabled. Signed-off-by: Tom Rix --- python-torch.spec | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index fd46384..6eb7447 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -69,14 +69,14 @@ %bcond_with pocketfft %endif -%if 0%{?fedora} > 40 %ifarch x86_64 -%bcond_without fbgemm + %if %{with rocm} + %bcond_with fbgemm + %else + %bcond_without fbgemm + %endif %else -%bcond_with fbgemm -%endif -%else -%bcond_with fbgemm + %bcond_with fbgemm %endif # For testing cuda From fdb139d12bcc6c8d085aaa5f4381ae3bbc56e430 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 15 Aug 2024 10:24:43 -0700 Subject: [PATCH 27/88] Start tracking 2.5 Signed-off-by: Tom Rix --- python-torch.spec | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 6eb7447..4a1b2d2 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,11 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.4.0-rc8 -%global commit0 e4ee3be4063b7c430974252fdf7db42273388d86 +%global commit0 9876aa39c02861b1ad250e36a25a4f85fe3a6800 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20240709 -%global pypi_version 2.4.0 +%global date0 20240813 +%global pypi_version 2.5.0 %else %global pypi_version 2.4.0 %endif @@ -206,7 +205,9 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif +%if %{without gitcommit} Patch0: 0001-no-third_party-foxi.patch +%endif # https://github.com/pytorch/pytorch/pull/131282 Patch1: 
0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch @@ -214,6 +215,12 @@ Patch1: 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch Patch6: 0001-reenable-foxi-linking.patch %endif +%if %{with gitcommit} +# https://github.com/pytorch/pytorch/pull/133451 +Patch10: 0001-Only-include-aotriton-if-it-is-needed.patch +Patch11: 0002-Improve-finding-and-using-the-rocm_version.h.patch +%endif + # ROCm patches # Patches need to be refactored for ToT # These are ROCm packages @@ -223,10 +230,17 @@ Patch6: 0001-reenable-foxi-linking.patch Patch100: 0001-Optionally-use-hipblaslt.patch %endif Patch101: 0001-cuda-hip-signatures.patch +%if %{without gitcommit} Patch102: 0001-silence-an-assert.patch +%endif +%if %{without gitcommit} Patch105: 0001-disable-use-of-aotriton.patch %endif +%endif + +%if %{without gitcommit} Patch106: 0001-include-fmt-ranges.h-for-using-fmt-join.patch +%endif ExclusiveArch: x86_64 aarch64 %global toolchain gcc @@ -283,8 +297,12 @@ BuildRequires: cpuinfo-devel BuildRequires: FP16-devel BuildRequires: fxdiv-devel BuildRequires: psimd-devel +%if %{with gitcommit} +BuildRequires: xnnpack-devel = 0.0^git20240814.312eb7e +%else BuildRequires: xnnpack-devel = 0.0^git20240229.fcbf55a %endif +%endif BuildRequires: python3-devel BuildRequires: python3dist(filelock) @@ -951,6 +969,9 @@ done %{python3_sitearch}/%{pypi_name}-*.egg-info %{python3_sitearch}/functorch %{python3_sitearch}/torchgen +%if %{with gitcommit} +%{python3_sitearch}/benchmarks +%endif %if %{with rocm} From 9d8df35466b1933c1c30a7b0fc0068901c9d0ec6 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 30 Aug 2024 15:31:17 -0700 Subject: [PATCH 28/88] Update the gitcommit Signed-off-by: Tom Rix --- python-torch.spec | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 4a1b2d2..9463683 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,9 +6,9 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with 
gitcommit} -%global commit0 9876aa39c02861b1ad250e36a25a4f85fe3a6800 +%global commit0 d01a7a9faa5a742a3df7374b97bbc1db1205b6ed %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20240813 +%global date0 20240828 %global pypi_version 2.5.0 %else %global pypi_version 2.4.0 @@ -56,17 +56,8 @@ %bcond_with xnnpack %endif -%if 0%{?fedora} > 39 %bcond_without pthreadpool -%else -%bcond_with pthreadpool -%endif - -%if 0%{?fedora} > 39 %bcond_without pocketfft -%else -%bcond_with pocketfft -%endif %ifarch x86_64 %if %{with rocm} @@ -145,7 +136,6 @@ Source10: https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v%{cu Source11: https://github.com/NVIDIA/cutlass/archive/refs/tags/v%{cul_ver}.tar.gz %endif -%if %{with tensorpipe} # Developement on tensorpipe has stopped, repo made read only July 1, 2023, this is the last commit %global tp_commit 52791a2fd214b2a9dc5759d36725909c1daa7f2e %global tp_scommit %(c=%{tp_commit}; echo ${c:0:7}) @@ -156,7 +146,6 @@ Source21: https://github.com/libuv/libuv/archive/refs/tags/v1.41.0.tar.gz %global nop_commit 910b55815be16109f04f4180e9adee14fb4ce281 %global nop_scommit %(c=%{nop_commit}; echo ${c:0:7}) Source22: https://github.com/google/libnop/archive/%{nop_commit}/libnop-%{nop_scommit}.tar.gz -%endif %if %{without xnnpack} %global xnn_commit fcbf55af6cf28a4627bcd1f703ab7ad843f0f3a2 @@ -171,7 +160,7 @@ Source32: https://github.com/Maratyszcza/FP16/archive/%{fp_commit}/FP16-%{ %global ps_commit 072586a71b55b7f8c584153d223e95687148a900 %global ps_scommit %(c=%{ps_commit}; echo ${c:0:7}) Source33: https://github.com/Maratyszcza/psimd/archive/%{ps_commit}/psimd-%{ps_scommit}.tar.gz -%global ci_commit d6860c477c99f1fce9e28eb206891af3c0e1a1d7 +%global ci_commit 16bfc1622c6902d6f91d316ec54894910c620325 %global ci_scommit %(c=%{ci_commit}; echo ${c:0:7}) Source34: https://github.com/pytorch/cpuinfo/archive/%{ci_commit}/cpuinfo-%{ci_scommit}.tar.gz %endif @@ -207,17 +196,13 @@ Source80: 
https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ %if %{without gitcommit} Patch0: 0001-no-third_party-foxi.patch -%endif # https://github.com/pytorch/pytorch/pull/131282 Patch1: 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch -%if %{with caffe2} -Patch6: 0001-reenable-foxi-linking.patch %endif + %if %{with gitcommit} -# https://github.com/pytorch/pytorch/pull/133451 -Patch10: 0001-Only-include-aotriton-if-it-is-needed.patch Patch11: 0002-Improve-finding-and-using-the-rocm_version.h.patch %endif @@ -271,6 +256,10 @@ BuildRequires: gcc-gfortran BuildRequires: gloo-devel %endif %endif +%if %{with gitcommit} +BuildRequires: json-devel +%endif + BuildRequires: libomp-devel BuildRequires: numactl-devel BuildRequires: ninja-build @@ -573,6 +562,9 @@ sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' c10/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' torch/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' cmake/Dependencies.cmake +%if %{with gitcommit} +sed -i -e 's@fmt::fmt-header-only@fmt@' caffe2/CMakeLists.txt +%endif sed -i -e 's@add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt)@#add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt)@' cmake/Dependencies.cmake sed -i -e 's@set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "")@#set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "")@' cmake/Dependencies.cmake sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@' cmake/Dependencies.cmake @@ -708,6 +700,9 @@ cp %{_includedir}/valgrind/* third_party/valgrind-headers # Fix installing to /usr/lib64 sed -i -e 's@DESTINATION ${PYTHON_LIB_REL_PATH}@DESTINATION ${CMAKE_INSTALL_PREFIX}/${PYTHON_LIB_REL_PATH}@' caffe2/CMakeLists.txt +# reenable foxi linking +sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@#list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@' 
cmake/Dependencies.cmake + %if %{with rocm} # hipify ./tools/amd_build/build_amd.py @@ -969,9 +964,6 @@ done %{python3_sitearch}/%{pypi_name}-*.egg-info %{python3_sitearch}/functorch %{python3_sitearch}/torchgen -%if %{with gitcommit} -%{python3_sitearch}/benchmarks -%endif %if %{with rocm} From 2904630b42f0cdbefe8fbdd7a3e42ed34e7ffb79 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 3 Sep 2024 05:30:39 -0700 Subject: [PATCH 29/88] amdsmi is a runtime dependency for ROCm Signed-off-by: Tom Rix --- python-torch.spec | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python-torch.spec b/python-torch.spec index 9463683..9945a66 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -342,7 +342,9 @@ BuildRequires: rocm-rpm-macros-modules BuildRequires: rocthrust-devel BuildRequires: roctracer-devel +Requires: amdsmi Requires: rocm-rpm-macros-modules + %endif %if %{with cuda} From d6f80bf26e54a534b5c41c908facf2426cbb4c35 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 9 Sep 2024 09:43:45 -0700 Subject: [PATCH 30/88] Update to 2.4.1 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 2 +- sources | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1ab81b0..2491511 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ /v1.14.2.tar.gz /cpp-httplib-3b6597b.tar.gz /kineto-be13176.tar.gz +/pytorch-v2.4.1.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 9945a66..a724fb5 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -11,7 +11,7 @@ %global date0 20240828 %global pypi_version 2.5.0 %else -%global pypi_version 2.4.0 +%global pypi_version 2.4.1 %endif # For -test subpackage diff --git a/sources b/sources index a4dbc9a..50611eb 100644 --- a/sources +++ b/sources @@ -16,3 +16,4 @@ SHA512 (pytorch-v2.4.0.tar.gz) = bcfca6aefee00d15d1c0a2456cd9d802d9a8e96816d421d SHA512 (v1.14.2.tar.gz) = 
97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7 SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a +SHA512 (pytorch-v2.4.1.tar.gz) = fc364112a88540035f213274b526b4c8ad9be335ff214161091e8a8d3efa89ebef5a5b4d33b20b5b659896d164dcbe739f0d7d2a293d3afc0efcfaa2bf62fc2c From e6c59f266585a859a6dd744b48c5c1b948dddc9e Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 15 Sep 2024 06:27:11 -0700 Subject: [PATCH 31/88] Update gitcommit Signed-off-by: Tom Rix --- python-torch.spec | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index a724fb5..c41fad2 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,9 +6,9 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -%global commit0 d01a7a9faa5a742a3df7374b97bbc1db1205b6ed +%global commit0 8df01c82587f80b2800c6cfcbabe168d96cba731 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20240828 +%global date0 20240914 %global pypi_version 2.5.0 %else %global pypi_version 2.4.1 @@ -962,6 +962,9 @@ done %{_bindir}/convert-caffe2-to-onnx %{_bindir}/convert-onnx-to-caffe2 %{_bindir}/torchrun +%if %{with gitcommit} +%{_bindir}/torchfrtrace +%endif %{python3_sitearch}/%{pypi_name} %{python3_sitearch}/%{pypi_name}-*.egg-info %{python3_sitearch}/functorch From 93a8fc8c0dab721d6a80a9b370e5cb8d5152e7fd Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 15 Sep 2024 11:04:21 -0700 Subject: [PATCH 32/88] Simplify cuda versions Signed-off-by: Tom Rix --- python-torch.spec | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/python-torch.spec 
b/python-torch.spec index c41fad2..b72b118 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -74,19 +74,8 @@ %bcond_with cuda %endif -# Pick a version that works -%if %{with cuda} -%if 0%{?fedora} < 40 -%global cuda_ver 12.5 -%global cudart_ver 12-5 -%global cublas_ver 12-5 -%global cufft_ver 12-5 -%global curand_ver 12-5 -%global cusparse_ver 12-5 -%else +# Pick a CUDA version that works %global cuda_ver 12.5 -%endif -%endif # For testing compat-gcc %global compat_gcc_major 13 @@ -348,11 +337,11 @@ Requires: rocm-rpm-macros-modules %endif %if %{with cuda} -BuildRequires: cuda-cudart-devel-%{cudart_ver} -BuildRequires: libcublas-devel-%{cublas_ver} -BuildRequires: libcufft-devel-%{cufft_ver} -BuildRequires: libcurand-devel-%{curand_ver} -BuildRequires: libcusparse-devel-%{cusparse_ver} +BuildRequires: cuda-cudart-devel-%{cuda_ver} +BuildRequires: libcublas-devel-%{cuda_ver} +BuildRequires: libcufft-devel-%{cuda_ver} +BuildRequires: libcurand-devel-%{cuda_ver} +BuildRequires: libcusparse-devel-%{cuda_ver} %endif %if %{with test} @@ -428,10 +417,10 @@ You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. 
%if %{with cuda} -%package -n python3-%{pypi_name}-cuda-%{cudart_ver} +%package -n python3-%{pypi_name}-cuda-%{cuda_ver} Summary: %{name} for CUDA -%description -n python3-%{pypi_name}-cuda-%{cudart_ver} +%description -n python3-%{pypi_name}-cuda-%{cuda_ver} %{summary} %endif @@ -953,7 +942,7 @@ done # Do not remote the empty files %if %{with cuda} -%files -n python3-%{pypi_name}-cuda-%{cudart_ver} +%files -n python3-%{pypi_name}-cuda-%{cuda_ver} %else %files -n python3-%{pypi_name} %endif From b9295a009bceb5c731505627efde8ac8d0e8f244 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 29 Sep 2024 17:22:33 -0700 Subject: [PATCH 33/88] Update gitcommit Signed-off-by: Tom Rix --- ...finding-and-using-the-rocm_version.h.patch | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 0001-Improve-finding-and-using-the-rocm_version.h.patch diff --git a/0001-Improve-finding-and-using-the-rocm_version.h.patch b/0001-Improve-finding-and-using-the-rocm_version.h.patch new file mode 100644 index 0000000..b8232c7 --- /dev/null +++ b/0001-Improve-finding-and-using-the-rocm_version.h.patch @@ -0,0 +1,142 @@ +From 201ac4618a1526e048a0d6c02d9bc4cf30bf0ee1 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Wed, 14 Aug 2024 17:18:38 -0700 +Subject: [PATCH] Improve finding and using the rocm_version.h + +On Fedora, the rocm_version.h's path is /usr/include/rocm_version.h +So we have this build error +pytorch/aten/src/ATen/hip/tunable/Tunable.cpp:40:10: fatal error: + rocm-core/rocm_version.h: No such file or directory + 40 | #include + | ^~~~~~~~~~~~~~~~~~~~~~~~~~ + +In other cases, depending on the rocm release either +/opt/rocm/include or /opt/rocm/include/rocm-core + +Convert the EXISTS() checks into a find_path. 
+Add a -I${ROCM_VERSION_DIR} to the compile options so it can be +found by Tunable.cpp + +Signed-off-by: Tom Rix +--- + aten/src/ATen/cuda/tunable/Tunable.cpp | 2 +- + cmake/Dependencies.cmake | 1 + + cmake/public/LoadHIP.cmake | 72 ++++++++++---------------- + 3 files changed, 30 insertions(+), 45 deletions(-) + +diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp +index 1b7c89875855..32c1d70f3152 100644 +--- a/aten/src/ATen/cuda/tunable/Tunable.cpp ++++ b/aten/src/ATen/cuda/tunable/Tunable.cpp +@@ -36,7 +36,7 @@ + + // for validators + #ifdef USE_ROCM +-#include ++#include + #define ROCBLAS_BETA_FEATURES_API + #include + #include +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 7ef8eabb5162..61bc4d7a54b6 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1063,6 +1063,7 @@ if(USE_ROCM) + list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) + list(APPEND HIP_CXX_FLAGS -std=c++17) + list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) ++ list(APPEND HIP_CXX_FLAGS -I${ROCM_VERSION_DIR}) + if(HIP_NEW_TYPE_ENUMS) + list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) + endif() +diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake +index 1c0d3a203991..6a7e3bd163f5 100644 +--- a/cmake/public/LoadHIP.cmake ++++ b/cmake/public/LoadHIP.cmake +@@ -42,55 +42,39 @@ find_package_and_print_version(HIP 1.0) + + if(HIP_FOUND) + set(PYTORCH_FOUND_HIP TRUE) +- set(FOUND_ROCM_VERSION_H FALSE) +- + set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}") +- set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") + + # Find ROCM version for checks + # ROCM 5.0 and later will have header api for version management +- if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h) +- set(FOUND_ROCM_VERSION_H TRUE) +- file(WRITE ${file} "" +- "#include \n" +- ) +- elseif(EXISTS ${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h) +- set(FOUND_ROCM_VERSION_H TRUE) +- file(WRITE ${file} "" +- "#include \n" 
+- ) +- else() +- message("********************* rocm_version.h couldnt be found ******************\n") +- endif() +- +- if(FOUND_ROCM_VERSION_H) +- file(APPEND ${file} "" +- "#include \n" +- +- "#ifndef ROCM_VERSION_PATCH\n" +- "#define ROCM_VERSION_PATCH 0\n" +- "#endif\n" +- "#define STRINGIFYHELPER(x) #x\n" +- "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" +- "int main() {\n" +- " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" +- " return 0;\n" +- "}\n" +- ) +- +- try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} +- CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}" +- RUN_OUTPUT_VARIABLE rocm_version_from_header +- COMPILE_OUTPUT_VARIABLE output_var +- ) +- # We expect the compile to be successful if the include directory exists. +- if(NOT compile_result) +- message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) +- endif() +- message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) +- set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) +- message("\n***** ROCm version from rocm_version.h ****\n") ++ find_path(ROCM_VERSION_DIR rocm_version.h HINTS ${ROCM_INCLUDE_DIRS} ${ROCM_INCLUDE_DIRS}/rocm-core) ++ set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") ++ file(WRITE ${file} "" ++ "#include \n" ++ "#include \n" ++ ++ "#ifndef ROCM_VERSION_PATCH\n" ++ "#define ROCM_VERSION_PATCH 0\n" ++ "#endif\n" ++ "#define STRINGIFYHELPER(x) #x\n" ++ "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" ++ "int main() {\n" ++ " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" ++ " return 0;\n" ++ "}\n" ++ ) ++ ++ try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} ++ CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_VERSION_DIR}" ++ RUN_OUTPUT_VARIABLE rocm_version_from_header ++ COMPILE_OUTPUT_VARIABLE output_var ++ ) ++ # We expect the compile to be successful if the include directory exists. 
++ if(NOT compile_result) ++ message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) + endif() ++ message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) ++ set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) ++ message("\n***** ROCm version from rocm_version.h ****\n") + + string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+).*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) + +-- +2.46.0 + From 905814b8c0a7c445300a3f0b0a6f07e8f0890381 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 7 Oct 2024 12:27:39 -0700 Subject: [PATCH 34/88] Some help finding llvm18 Signed-off-by: Tom Rix --- python-torch.spec | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index b72b118..a5d915c 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -323,6 +323,7 @@ BuildRequires: rccl-devel BuildRequires: rocprim-devel BuildRequires: rocm-cmake BuildRequires: rocm-comgr-devel +BuildRequires: rocm-compilersupport-macros BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel @@ -866,8 +867,7 @@ export USE_MAGMA=ON %endif export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` -export HIP_CLANG_PATH=`hipconfig -l` -RESOURCE_DIR=`${HIP_CLANG_PATH}/clang -print-resource-dir` +RESOURCE_DIR=`%{_libdir}/llvm%{rocmllvm_version}/bin/clang -print-resource-dir` export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode gpu=%{rocm_default_gpu} @@ -906,8 +906,7 @@ export FC=%{_bindir}/gfortran%{compat_gcc_major} export USE_ROCM=ON export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` -export HIP_CLANG_PATH=`hipconfig -l` -RESOURCE_DIR=`${HIP_CLANG_PATH}/clang -print-resource-dir` +RESOURCE_DIR=`%{_libdir}/llvm%{rocmllvm_version}/bin/clang -print-resource-dir` export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode gpu=%{rocm_default_gpu} From 07745618d1dacdc09d6f82a47416aac9d80644da Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 10 Oct 
2024 04:07:13 -0700 Subject: [PATCH 35/88] Update for llvm18 Signed-off-by: Tom Rix --- python-torch.spec | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python-torch.spec b/python-torch.spec index a5d915c..ed34d64 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -548,6 +548,8 @@ sed -i -e '/fsspec/d' setup.py # A new dependency # Connected to USE_FLASH_ATTENTION, since this is off, do not need it sed -i -e '/aotriton.cmake/d' cmake/Dependencies.cmake +# Compress hip +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc --offload-compress@' cmake/Dependencies.cmake # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt @@ -870,6 +872,14 @@ export ROCM_PATH=`hipconfig -R` RESOURCE_DIR=`%{_libdir}/llvm%{rocmllvm_version}/bin/clang -print-resource-dir` export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode +# pytorch uses clang, not hipcc +LLVM_BINDIR=`llvm-config-%{rocmllvm_version} --bindir` +if [ ! -x ${LLVM_BINDIR}/clang ]; then + echo "Something wrong with llvm-config" + false +fi +export HIP_CLANG_PATH=${LLVM_BINDIR} + gpu=%{rocm_default_gpu} module load rocm/$gpu export PYTORCH_ROCM_ARCH=$ROCM_GPUS @@ -909,6 +919,14 @@ export ROCM_PATH=`hipconfig -R` RESOURCE_DIR=`%{_libdir}/llvm%{rocmllvm_version}/bin/clang -print-resource-dir` export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode +# pytorch uses clang, not hipcc +LLVM_BINDIR=`llvm-config-%{rocmllvm_version} --bindir` +if [ ! 
-x ${LLVM_BINDIR}/clang ]; then + echo "Something wrong with llvm-config" + false +fi +export HIP_CLANG_PATH=${LLVM_BINDIR} + gpu=%{rocm_default_gpu} module load rocm/$gpu export PYTORCH_ROCM_ARCH=$ROCM_GPUS From 1c67f87710aad9ce01d3dff356a21181868c6f47 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 11 Oct 2024 06:41:52 -0700 Subject: [PATCH 36/88] Update gitcommit to v2.5.0-rc9 Signed-off-by: Tom Rix --- python-torch.spec | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index ed34d64..c432d5e 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,9 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -%global commit0 8df01c82587f80b2800c6cfcbabe168d96cba731 +# v2.5.0-rc9 +%global commit0 417a0763a7d69f6ce80719ac89c1d2deeee78163 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20240914 +%global date0 2024103 %global pypi_version 2.5.0 %else %global pypi_version 2.4.1 @@ -192,7 +193,7 @@ Patch1: 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch %if %{with gitcommit} -Patch11: 0002-Improve-finding-and-using-the-rocm_version.h.patch +Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch %endif # ROCm patches From 30effbf7a8c2a83a13265ba68d86589a47ce8832 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 29 Oct 2024 12:48:21 -0700 Subject: [PATCH 37/88] Use the new xnnpack Signed-off-by: Tom Rix --- python-torch.spec | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index c432d5e..4eb44e3 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -276,11 +276,7 @@ BuildRequires: cpuinfo-devel BuildRequires: FP16-devel BuildRequires: fxdiv-devel BuildRequires: psimd-devel -%if %{with gitcommit} BuildRequires: xnnpack-devel = 0.0^git20240814.312eb7e -%else -BuildRequires: xnnpack-devel = 0.0^git20240229.fcbf55a -%endif %endif BuildRequires: python3-devel From f6e9c1478553500932ec686bc4ee44058a1fc08a Mon Sep 
17 00:00:00 2001 From: Peter Robinson Date: Sun, 1 Sep 2024 10:20:25 +0100 Subject: [PATCH 38/88] Fix various Provides including the pytotch provides The Provides need to be in the python3-torch package as the root package isn't one that users see, it's purely the src.rpm so move all the Provides to the right location and drop duplicate pytotch provides while we're at it. Fixes: RHBZ #2272064 --- python-torch.spec | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 4eb44e3..133dea3 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -348,6 +348,18 @@ BuildRequires: google-benchmark-devel Requires: python3dist(dill) +%description +PyTorch is a Python package that provides two high-level features: + + * Tensor computation (like NumPy) with strong GPU acceleration + * Deep neural networks built on a tape-based autograd system + +You can reuse your favorite Python packages such as NumPy, SciPy, +and Cython to extend PyTorch when needed. + +%package -n python3-%{pypi_name} +Summary: %{summary} + # For convience Provides: pytorch @@ -390,21 +402,6 @@ Provides: bundled(pthreadpool) Provides: bundled(pocketfft) %endif -# For convience -Provides: pytorch - -%description -PyTorch is a Python package that provides two high-level features: - - * Tensor computation (like NumPy) with strong GPU acceleration - * Deep neural networks built on a tape-based autograd system - -You can reuse your favorite Python packages such as NumPy, SciPy, -and Cython to extend PyTorch when needed. 
- -%package -n python3-%{pypi_name} -Summary: %{summary} - %description -n python3-%{pypi_name} PyTorch is a Python package that provides two high-level features: From 8dce5406c1057378b50aee60c38bb8713b3efede Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Sun, 29 Sep 2024 13:07:03 +0100 Subject: [PATCH 39/88] Add binutils-gold build dep We explicitly set the USE_GOLD_LINKER optoin but we don't add the binutils-gold build dep so it errors, add the dep to fix that. --- python-torch.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/python-torch.spec b/python-torch.spec index 133dea3..2fff2e1 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -222,6 +222,7 @@ ExclusiveArch: x86_64 aarch64 %global _lto_cflags %nil BuildRequires: cmake +BuildRequires: binutils-gold BuildRequires: eigen3-devel %if %{with fbgemm} BuildRequires: asmjit-devel From 0d47a8ae9c010d4df3660cbd762f1e2f843a63b7 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Thu, 31 Oct 2024 11:21:08 +0000 Subject: [PATCH 40/88] drop old versions of pytorch from sources Saves quite a lot of unnecessary downloads when building locally. 
--- sources | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/sources b/sources index 50611eb..107c6e2 100644 --- a/sources +++ b/sources @@ -1,18 +1,8 @@ -SHA512 (pytorch-v2.1.0.tar.gz) = 59421bf6cea6661d61ed66ab16526e3a07162e70e53381cbd5987042917610ec993d2f151fb086f0f98e5a396fe69e82bbc76f840bebffe4ebe7f50458c3aa44 -SHA512 (pytorch-v2.1.2.tar.gz) = b7305407ad9dda877d277a0e7009f65f6d69f39370f2231b8bb8c6a9b711022d2129febdb00f5c83751b6664e01000fe2d30c5e5c13757de89fb8b2b99197a28 -SHA512 (pytorch-975d428.tar.gz) = a02195b18d832db9a739c3eeecd0cd0c8868d8b92e4a2fca42e4bdd20735f0745d84573df28d9ae1db014cf79ffd005a8409b3e8bb92f9db2a446f784ef46ff4 SHA512 (v23.3.3.tar.gz) = 4066c94f2473c7ea16917d29a613e16f840a329089c88e0bdbdb999aef3442ba00abfd2aa92266fa9c067e399dc88e6f0ccac40dc151378857e665638e78bbf0 SHA512 (v2.11.1.tar.gz) = ed1512ff0bca3bc0a45edc2eb8c77f8286ab9389f6ff1d5cb309be24bc608abbe0df6a7f5cb18c8f80a3bfa509058547c13551c3cd6a759af708fd0cdcdd9e95 -SHA512 (pytorch-6a89a75.tar.gz) = 6978acc6f37d7c5adc71517a6f379c7133b2bbd040189deddba7753acde41f6ddba2e9f2e397928e89c776d6a5458b8a74f8e04beb312d71fd30b072687ba98f -SHA512 (pytorch-74832f1.tar.gz) = bd553bfbbb422d353bbbf616c201251b2517b905e2621fa05bfe3d97726b078caad377583adccdc0cca234235a11fcb4730a93e834907b2ca4c06d552b2a2683 -SHA512 (pytorch-4bb5cb5.tar.gz) = 430ae996ddee560537787646ae9f7aa01498f37c99c2e3fe4c5f66ee732ee3fe4ecf337fdf857bc0c7fe27634af75cee3ce576bbe2576463b81e27dbbfacf6ef SHA512 (tensorpipe-52791a2.tar.gz) = 1e5faf17a7236c5506c08cb28be16069b11bb929bbca64ed9745ce4277d46739186ab7d6597da7437d90ed2d166d4c37ef2f3bceabe8083ef3adbb0e8e5f227e -SHA512 (v1.41.0.tar.gz) = bb08a1970a10e8d9571ffea3d021643de30ec212cd51317b98d6cf0cfe55d6877992921fb01d1188a6d466687335b77885685d924f8cb7200a0bec30eee05c65 SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3522470b1ecc4798ff43d344a013cee2f6901e83267c6167072947b754e63f1552ae7044cffe234c36 -SHA512 (pytorch-97ff6cf.tar.gz) = 
105ebcba298558fe833f90e7e40b003d35a74609e777f9dc4c47f5668c884f603455113ac0ff252a62b83c81137ae66ceb1a862d351203925dcfc3dcf9f73580 -SHA512 (pytorch-v2.3.0.tar.gz) = 0c2ffc7bf2fd86070e9958c34eca1f03a0248a011ac6ffaeb69f65306ff856edd5359986f02af25888433187e6d7f29b60edded092e2ac30c8cec49023166eda -SHA512 (pytorch-v2.3.1.tar.gz) = fe132251b2bae87b70ba3d95dc32f6a4545970d11893118b0ebe6ca129732e516ef4d6cc4f380b3db9bb2277d1db8ce78a401c40149bb1dfbab76eab9e3992c4 -SHA512 (pytorch-v2.4.0.tar.gz) = bcfca6aefee00d15d1c0a2456cd9d802d9a8e96816d421d10e3eed5c608bfdf23dfae492ca3638c0bae99ef5bb8c98f4774c0b9f1a8b94d4dc36a52226033314 +SHA512 (v1.41.0.tar.gz) = bb08a1970a10e8d9571ffea3d021643de30ec212cd51317b98d6cf0cfe55d6877992921fb01d1188a6d466687335b77885685d924f8cb7200a0bec30eee05c65 SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7 SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a From 2819806afc308b5a1f8e2389eca5a6a1c0b6ca50 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 14 Nov 2024 11:23:54 -0800 Subject: [PATCH 41/88] Use rocmllvm_bindir Signed-off-by: Tom Rix --- python-torch.spec | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 2fff2e1..b6b1d36 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -864,16 +864,11 @@ export USE_MAGMA=ON %endif export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` -RESOURCE_DIR=`%{_libdir}/llvm%{rocmllvm_version}/bin/clang -print-resource-dir` +RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses 
clang, not hipcc -LLVM_BINDIR=`llvm-config-%{rocmllvm_version} --bindir` -if [ ! -x ${LLVM_BINDIR}/clang ]; then - echo "Something wrong with llvm-config" - false -fi -export HIP_CLANG_PATH=${LLVM_BINDIR} +export HIP_CLANG_PATH=%{rocmllvm_bindir} gpu=%{rocm_default_gpu} module load rocm/$gpu @@ -911,16 +906,11 @@ export FC=%{_bindir}/gfortran%{compat_gcc_major} export USE_ROCM=ON export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` -RESOURCE_DIR=`%{_libdir}/llvm%{rocmllvm_version}/bin/clang -print-resource-dir` +RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc -LLVM_BINDIR=`llvm-config-%{rocmllvm_version} --bindir` -if [ ! -x ${LLVM_BINDIR}/clang ]; then - echo "Something wrong with llvm-config" - false -fi -export HIP_CLANG_PATH=${LLVM_BINDIR} +export HIP_CLANG_PATH=%{rocmllvm_bindir} gpu=%{rocm_default_gpu} module load rocm/$gpu From 0cf3ac43fb97415287f99f7010cc25a5be256a55 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 26 Nov 2024 15:53:19 -0800 Subject: [PATCH 42/88] Update for 2.5.0 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 41 ++++++++++++----------------------------- sources | 1 + 3 files changed, 14 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index 2491511..3d21da1 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ /cpp-httplib-3b6597b.tar.gz /kineto-be13176.tar.gz /pytorch-v2.4.1.tar.gz +/pytorch-v2.5.0.tar.gz diff --git a/python-torch.spec b/python-torch.spec index b6b1d36..1f1cdc0 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -12,7 +12,7 @@ %global date0 2024103 %global pypi_version 2.5.0 %else -%global pypi_version 2.4.1 +%global pypi_version 2.5.0 %endif # For -test subpackage @@ -27,7 +27,7 @@ %endif %bcond_without hipblaslt %bcond_without magma -%bcond_without rocm_loop +%bcond_with rocm_loop %global rocm_default_gpu default %global rocm_gpu_list gfx9 @@ -42,7 +42,7 @@ %if 
0%{?fedora} > 40 %bcond_without distributed # For testing distributed+rccl etc. -%bcond_without rccl +%bcond_with rccl %bcond_with gloo %bcond_without mpi %bcond_without tensorpipe @@ -184,17 +184,7 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif -%if %{without gitcommit} -Patch0: 0001-no-third_party-foxi.patch -# https://github.com/pytorch/pytorch/pull/131282 -Patch1: 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch - -%endif - - -%if %{with gitcommit} Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch -%endif # ROCm patches # Patches need to be refactored for ToT @@ -205,16 +195,6 @@ Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch Patch100: 0001-Optionally-use-hipblaslt.patch %endif Patch101: 0001-cuda-hip-signatures.patch -%if %{without gitcommit} -Patch102: 0001-silence-an-assert.patch -%endif -%if %{without gitcommit} -Patch105: 0001-disable-use-of-aotriton.patch -%endif -%endif - -%if %{without gitcommit} -Patch106: 0001-include-fmt-ranges.h-for-using-fmt-join.patch %endif ExclusiveArch: x86_64 aarch64 @@ -247,9 +227,7 @@ BuildRequires: gcc-gfortran BuildRequires: gloo-devel %endif %endif -%if %{with gitcommit} BuildRequires: json-devel -%endif BuildRequires: libomp-devel BuildRequires: numactl-devel @@ -420,12 +398,15 @@ Summary: %{name} for CUDA %{summary} %endif +%if %{with rocm_loop} %package -n python3-%{pypi_name}-rocm-gfx9 Summary: %{name} for ROCm gfx9 %description -n python3-%{pypi_name}-rocm-gfx9 %{summary} +%endif + %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -538,6 +519,9 @@ sed -i -e '/typing-extensions/d' setup.py # Need to pip these sed -i -e '/sympy/d' setup.py sed -i -e '/fsspec/d' setup.py +%else +# for 2.5.0 +sed -i -e 's@sympy==1.13.1@sympy>=1.13.1@' setup.py %endif # A new dependency @@ -551,9 +535,8 @@ sed -i -e 's@fmt::fmt-header-only@fmt@' 
CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' c10/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' torch/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' cmake/Dependencies.cmake -%if %{with gitcommit} sed -i -e 's@fmt::fmt-header-only@fmt@' caffe2/CMakeLists.txt -%endif + sed -i -e 's@add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt)@#add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt)@' cmake/Dependencies.cmake sed -i -e 's@set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "")@#set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "")@' cmake/Dependencies.cmake sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@' cmake/Dependencies.cmake @@ -953,20 +936,20 @@ done %{_bindir}/convert-caffe2-to-onnx %{_bindir}/convert-onnx-to-caffe2 %{_bindir}/torchrun -%if %{with gitcommit} %{_bindir}/torchfrtrace -%endif %{python3_sitearch}/%{pypi_name} %{python3_sitearch}/%{pypi_name}-*.egg-info %{python3_sitearch}/functorch %{python3_sitearch}/torchgen %if %{with rocm} +%if %{with rocm_loop} %files -n python3-%{pypi_name}-rocm-gfx9 %{_libdir}/rocm/gfx9/bin/* %{_libdir}/rocm/gfx9/lib64/* +%endif %endif %changelog diff --git a/sources b/sources index 107c6e2..babeda2 100644 --- a/sources +++ b/sources @@ -7,3 +7,4 @@ SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86c SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a SHA512 (pytorch-v2.4.1.tar.gz) = fc364112a88540035f213274b526b4c8ad9be335ff214161091e8a8d3efa89ebef5a5b4d33b20b5b659896d164dcbe739f0d7d2a293d3afc0efcfaa2bf62fc2c +SHA512 (pytorch-v2.5.0.tar.gz) = 
6ccf1ac9f191f5bd757ef7fbfc1dcd81d591577f2d3df7313c6ed32790c592aaffd253e18dc778a2fcc707e4533299817dfdf9fae108636ce5c29c1b8ff8bba6 From 91a938b757b8a75ae93ed68ee06fdfbcdb234a26 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 21 Dec 2024 07:04:12 -0800 Subject: [PATCH 43/88] Update to 2.5.1 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 2 +- sources | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3d21da1..cdf142f 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ /kineto-be13176.tar.gz /pytorch-v2.4.1.tar.gz /pytorch-v2.5.0.tar.gz +/pytorch-v2.5.1.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 1f1cdc0..aafe65d 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -12,7 +12,7 @@ %global date0 2024103 %global pypi_version 2.5.0 %else -%global pypi_version 2.5.0 +%global pypi_version 2.5.1 %endif # For -test subpackage diff --git a/sources b/sources index babeda2..aa1ed3c 100644 --- a/sources +++ b/sources @@ -8,3 +8,4 @@ SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf9 SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a SHA512 (pytorch-v2.4.1.tar.gz) = fc364112a88540035f213274b526b4c8ad9be335ff214161091e8a8d3efa89ebef5a5b4d33b20b5b659896d164dcbe739f0d7d2a293d3afc0efcfaa2bf62fc2c SHA512 (pytorch-v2.5.0.tar.gz) = 6ccf1ac9f191f5bd757ef7fbfc1dcd81d591577f2d3df7313c6ed32790c592aaffd253e18dc778a2fcc707e4533299817dfdf9fae108636ce5c29c1b8ff8bba6 +SHA512 (pytorch-v2.5.1.tar.gz) = a8882608c2ab6467a58d60c6df84c9f1004b43eafeba57db499dbbfdecc09db2e221b9d4c344c8af7c0bea6252e874c400483502dca24a0b474c376b9fef1dd4 From 64906bb61ce31d751d167bdc7ee19e6d9d3edf81 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 23 Dec 2024 10:12:09 -0800 Subject: [PATCH 44/88] Obsolete caffe Signed-off-by: Tom Rix --- python-torch.spec | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/python-torch.spec b/python-torch.spec index aafe65d..4891072 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -327,6 +327,8 @@ BuildRequires: google-benchmark-devel Requires: python3dist(dill) +Obsoletes: caffe = 1.0^git20200212.9b89154 + %description PyTorch is a Python package that provides two high-level features: From 3400ff60245ca66c0171092b1122a5c9249aabfd Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 24 Dec 2024 04:59:37 -0800 Subject: [PATCH 45/88] Remove many options These options were not being tested so they are being removed. Signed-off-by: Tom Rix --- python-torch.spec | 311 +--------------------------------------------- 1 file changed, 3 insertions(+), 308 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 4891072..2261c9e 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -25,74 +25,21 @@ %ifarch x86_64 %bcond_without rocm %endif -%bcond_without hipblaslt -%bcond_without magma %bcond_with rocm_loop %global rocm_default_gpu default %global rocm_gpu_list gfx9 -# Caffe2 support came in F41 -%if 0%{?fedora} > 40 -%bcond_without caffe2 -%else -%bcond_with caffe2 -%endif - -# Distributed support came in F41 -%if 0%{?fedora} > 40 -%bcond_without distributed # For testing distributed+rccl etc. 
%bcond_with rccl %bcond_with gloo %bcond_without mpi %bcond_without tensorpipe -%else -%bcond_with distributed -%endif - -# Do no confuse xnnpack versions -%if 0%{?fedora} > 40 -%bcond_without xnnpack -%else -%bcond_with xnnpack -%endif - -%bcond_without pthreadpool -%bcond_without pocketfft - -%ifarch x86_64 - %if %{with rocm} - %bcond_with fbgemm - %else - %bcond_without fbgemm - %endif -%else - %bcond_with fbgemm -%endif - -# For testing cuda -%ifarch x86_64 -%bcond_with cuda -%endif - -# Pick a CUDA version that works -%global cuda_ver 12.5 - -# For testing compat-gcc -%global compat_gcc_major 13 -%bcond_with compat_gcc # Disable dwz with rocm because memory can be exhausted %if %{with rocm} %define _find_debuginfo_dwz_opts %{nil} %endif -%if %{with cuda} -# workaround problems with -pie -%global build_cxxflags %{nil} -%global build_ldflags %{nil} -%endif - # These came in 2.4 and not yet in Fedora %bcond_with opentelemetry %bcond_with httplib @@ -119,13 +66,6 @@ Source0: %{forgeurl}/releases/download/v%{version}/pytorch-v%{version}.ta Source1: https://github.com/google/flatbuffers/archive/refs/tags/v23.3.3.tar.gz Source2: https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz -%if %{with cuda} -%global cuf_ver 1.1.2 -Source10: https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v%{cuf_ver}.tar.gz -%global cul_ver 3.4.1 -Source11: https://github.com/NVIDIA/cutlass/archive/refs/tags/v%{cul_ver}.tar.gz -%endif - # Developement on tensorpipe has stopped, repo made read only July 1, 2023, this is the last commit %global tp_commit 52791a2fd214b2a9dc5759d36725909c1daa7f2e %global tp_scommit %(c=%{tp_commit}; echo ${c:0:7}) @@ -137,36 +77,6 @@ Source21: https://github.com/libuv/libuv/archive/refs/tags/v1.41.0.tar.gz %global nop_scommit %(c=%{nop_commit}; echo ${c:0:7}) Source22: https://github.com/google/libnop/archive/%{nop_commit}/libnop-%{nop_scommit}.tar.gz -%if %{without xnnpack} -%global xnn_commit fcbf55af6cf28a4627bcd1f703ab7ad843f0f3a2 
-%global xnn_scommit %(c=%{xnn_commit}; echo ${c:0:7}) -Source30: https://github.com/google/xnnpack/archive/%{xnn_commit}/xnnpack-%{xnn_scommit}.tar.gz -%global fx_commit 63058eff77e11aa15bf531df5dd34395ec3017c8 -%global fx_scommit %(c=%{fx_commit}; echo ${c:0:7}) -Source31: https://github.com/Maratyszcza/fxdiv/archive/%{fx_commit}/FXdiv-%{fx_scommit}.tar.gz -%global fp_commit 0a92994d729ff76a58f692d3028ca1b64b145d91 -%global fp_scommit %(c=%{fp_commit}; echo ${c:0:7}) -Source32: https://github.com/Maratyszcza/FP16/archive/%{fp_commit}/FP16-%{fp_scommit}.tar.gz -%global ps_commit 072586a71b55b7f8c584153d223e95687148a900 -%global ps_scommit %(c=%{ps_commit}; echo ${c:0:7}) -Source33: https://github.com/Maratyszcza/psimd/archive/%{ps_commit}/psimd-%{ps_scommit}.tar.gz -%global ci_commit 16bfc1622c6902d6f91d316ec54894910c620325 -%global ci_scommit %(c=%{ci_commit}; echo ${c:0:7}) -Source34: https://github.com/pytorch/cpuinfo/archive/%{ci_commit}/cpuinfo-%{ci_scommit}.tar.gz -%endif - -%if %{without pthreadpool} -%global pt_commit 4fe0e1e183925bf8cfa6aae24237e724a96479b8 -%global pt_scommit %(c=%{pt_commit}; echo ${c:0:7}) -Source40: https://github.com/Maratyszcza/pthreadpool/archive/%{pt_commit}/pthreadpool-%{pt_scommit}.tar.gz -%endif - -%if %{without pocketfft} -%global pf_commit 076cb3d2536b7c5d0629093ad886e10ac05f3623 -%global pf_scommit %(c=%{pf_commit}; echo ${c:0:7}) -Source50: https://github.com/mreineck/pocketfft/archive/%{pf_commit}/pocketfft-%{pf_scommit}.tar.gz -%endif - %if %{without opentelemetry} %global ot_ver 1.14.2 Source60: https://github.com/open-telemetry/opentelemetry-cpp/archive/refs/tags/v%{ot_ver}.tar.gz @@ -189,13 +99,7 @@ Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # ROCm patches # Patches need to be refactored for ToT # These are ROCm packages -%if %{without cuda} -# https://github.com/pytorch/pytorch/pull/120551 -%if %{without hipblaslt} -Patch100: 0001-Optionally-use-hipblaslt.patch -%endif Patch101: 
0001-cuda-hip-signatures.patch -%endif ExclusiveArch: x86_64 aarch64 %global toolchain gcc @@ -204,59 +108,35 @@ ExclusiveArch: x86_64 aarch64 BuildRequires: cmake BuildRequires: binutils-gold BuildRequires: eigen3-devel -%if %{with fbgemm} -BuildRequires: asmjit-devel -BuildRequires: fbgemm-devel -%endif BuildRequires: flexiblas-devel BuildRequires: fmt-devel -%if %{with caffe2} BuildRequires: foxi-devel -%endif - -%if %{with compat_gcc} -BuildRequires: gcc%{compat_gcc_major}-c++ -BuildRequires: gcc%{compat_gcc_major}-gfortran -%else BuildRequires: gcc-c++ BuildRequires: gcc-gfortran -%endif -%if %{with distributed} %if %{with gloo} BuildRequires: gloo-devel %endif -%endif BuildRequires: json-devel BuildRequires: libomp-devel BuildRequires: numactl-devel BuildRequires: ninja-build BuildRequires: onnx-devel -%if %{with distributed} %if %{with mpi} BuildRequires: openmpi-devel %endif -%endif BuildRequires: protobuf-devel BuildRequires: sleef-devel BuildRequires: valgrind-devel - -%if %{with pocketfft} BuildRequires: pocketfft-devel -%endif - -%if %{with pthreadpool} BuildRequires: pthreadpool-devel -%endif -%if %{with xnnpack} BuildRequires: cpuinfo-devel BuildRequires: FP16-devel BuildRequires: fxdiv-devel BuildRequires: psimd-devel BuildRequires: xnnpack-devel = 0.0^git20240814.312eb7e -%endif BuildRequires: python3-devel BuildRequires: python3dist(filelock) @@ -276,26 +156,20 @@ BuildRequires: python3dist(sympy) %if %{with rocm} BuildRequires: hipblas-devel -%if %{with hipblaslt} BuildRequires: hipblaslt-devel -%endif BuildRequires: hipcub-devel BuildRequires: hipfft-devel BuildRequires: hiprand-devel BuildRequires: hipsparse-devel BuildRequires: hipsolver-devel -%if %{with magma} BuildRequires: magma-devel -%endif BuildRequires: miopen-devel BuildRequires: rocblas-devel BuildRequires: rocrand-devel BuildRequires: rocfft-devel -%if %{with distributed} %if %{with rccl} BuildRequires: rccl-devel %endif -%endif BuildRequires: rocprim-devel BuildRequires: rocm-cmake 
BuildRequires: rocm-comgr-devel @@ -313,14 +187,6 @@ Requires: rocm-rpm-macros-modules %endif -%if %{with cuda} -BuildRequires: cuda-cudart-devel-%{cuda_ver} -BuildRequires: libcublas-devel-%{cuda_ver} -BuildRequires: libcufft-devel-%{cuda_ver} -BuildRequires: libcurand-devel-%{cuda_ver} -BuildRequires: libcusparse-devel-%{cuda_ver} -%endif - %if %{with test} BuildRequires: google-benchmark-devel %endif @@ -359,30 +225,6 @@ Provides: bundled(libnop) Provides: bundled(libuv) = 1.41.0 %endif -# These are already in Fedora -%if %{without xnnpack} -# BSD-3-Clause -Provides: bundled(xnnpack) -# MIT -Provides: bundled(FP16) -# MIT -Provides: bundled(fxdiv) -# MIT -Provides: bundled(psimd) -# BSD-2-Clause -Provides: bundled(cpuinfo) -%endif - -%if %{without pthreadpool} -# BSD-2-Clause -Provides: bundled(pthreadpool) -%endif - -%if %{without pocketfft} -# BSD-3-Clause -Provides: bundled(pocketfft) -%endif - %description -n python3-%{pypi_name} PyTorch is a Python package that provides two high-level features: @@ -392,14 +234,6 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. 
-%if %{with cuda} -%package -n python3-%{pypi_name}-cuda-%{cuda_ver} -Summary: %{name} for CUDA - -%description -n python3-%{pypi_name}-cuda-%{cuda_ver} -%{summary} -%endif - %if %{with rocm_loop} %package -n python3-%{pypi_name}-rocm-gfx9 Summary: %{name} for ROCm gfx9 @@ -441,15 +275,6 @@ tar xf %{SOURCE2} rm -rf third_party/pybind11/* cp -r pybind11-2.11.1/* third_party/pybind11/ -%if %{with cuda} -tar xf %{SOURCE10} -rm -rf third_party/cudnn_frontend/* -cp -r cudnn-frontend-%{cuf_ver}/* third_party/cudnn_frontend/ -tar xf %{SOURCE11} -rm -rf third_party/cutlass/* -cp -r cutlass-%{cul_ver}/* third_party/cutlass/ -%endif - %if %{with tensorpipe} tar xf %{SOURCE20} rm -rf third_party/tensorpipe/* @@ -462,36 +287,6 @@ rm -rf third_party/tensorpipe/third_party/libnop/* cp -r libnop-*/* third_party/tensorpipe/third_party/libnop/ %endif -%if %{without xnnpack} -tar xf %{SOURCE30} -rm -rf third_party/XNNPACK/* -cp -r XNNPACK-*/* third_party/XNNPACK/ -tar xf %{SOURCE31} -rm -rf third_party/FXdiv/* -cp -r FXdiv-*/* third_party/FXdiv/ -tar xf %{SOURCE32} -rm -rf third_party/FP16/* -cp -r FP16-*/* third_party/FP16/ -tar xf %{SOURCE33} -rm -rf third_party/psimd/* -cp -r psimd-*/* third_party/psimd/ -tar xf %{SOURCE34} -rm -rf third_party/cpuinfo/* -cp -r cpuinfo-*/* third_party/cpuinfo/ -%endif - -%if %{without pthreadpool} -tar xf %{SOURCE40} -rm -rf third_party/pthreadpool/* -cp -r pthreadpool-*/* third_party/pthreadpool/ -%endif - -%if %{without pocketfft} -tar xf %{SOURCE50} -rm -rf third_party/pocketfft/* -cp -r pocketfft-*/* third_party/pocketfft/ -%endif - %if %{without opentelemtry} tar xf %{SOURCE60} rm -rf third_party/opentelemetry-cpp/* @@ -511,9 +306,7 @@ cp -r kineto-*/* third_party/kineto/ %endif # hipblaslt only building with gfx90a -%if %{with hipblaslt} sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a"@' aten/src/ATen/native/cuda/Blas.cpp -%endif %if 0%{?rhel} # In RHEL but too old @@ -544,10 +337,8 @@ sed -i -e 
's@set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@' cmake/Dependencies.cmake # No third_party FXdiv -%if %{with xnnpack} sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt -%endif # Disable the use of check_submodule's in the setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -570,31 +361,10 @@ mv third_party/flatbuffers . mv third_party/pybind11 . -%if %{with cuda} -mv third_party/cudnn_frontend . -mv third_party/cutlass . -%endif - %if %{with tensorpipe} mv third_party/tensorpipe . %endif -%if %{without xnnpack} -mv third_party/XNNPACK . -mv third_party/FXdiv . -mv third_party/FP16 . -mv third_party/psimd . -mv third_party/cpuinfo . -%endif - -%if %{without pthreadpool} -mv third_party/pthreadpool . -%endif - -%if %{without pocketfft} -mv third_party/pocketfft . -%endif - %if %{without opentelemetry} mv third_party/opentelemetry-cpp . 
%endif @@ -619,31 +389,10 @@ mv miniz-2.1.0 third_party mv flatbuffers third_party mv pybind11 third_party -%if %{with cuda} -mv cudnn_frontend third_party -mv cutlass third_party -%endif - %if %{with tensorpipe} mv tensorpipe third_party %endif -%if %{without xnnpack} -mv XNNPACK third_party -mv FXdiv third_party -mv FP16 third_party -mv psimd third_party -mv cpuinfo third_party -%endif - -%if %{without pthreadpool} -mv pthreadpool third_party -%endif - -%if %{without pocketfft} -mv pocketfft third_party -%endif - %if %{without opentelemetry} mv opentelemetry-cpp third_party %endif @@ -660,11 +409,9 @@ mv kineto third_party mv googletest third_party %endif -%if %{with pocketfft} # # Fake out pocketfft, and system header will be used mkdir third_party/pocketfft -%endif # # Use the system valgrind headers @@ -690,12 +437,6 @@ sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake %endif -%if %{with cuda} - -# TBD - -%endif - %build # @@ -710,11 +451,7 @@ if [ ${COMPILE_JOBS}x = x ]; then COMPILE_JOBS=1 fi # Take into account memmory usage per core, do not thrash real memory -%if %{with cuda} -BUILD_MEM=4 -%else BUILD_MEM=2 -%endif MEM_KB=0 MEM_KB=`cat /proc/meminfo | grep MemTotal | awk '{ print $2 }'` MEM_MB=`eval "expr ${MEM_KB} / 1024"` @@ -725,12 +462,6 @@ if [ "$COMPILE_JOBS_MEM" -lt "$COMPILE_JOBS" ]; then fi export MAX_JOBS=$COMPILE_JOBS -%if %{with compat_gcc} -export CC=%{_bindir}/gcc-%{compat_gcc_major} -export CXX=%{_bindir}/g++-%{compat_gcc_major} -export FC=%{_bindir}/gfortran-%{compat_gcc_major} -%endif - # For debugging setup.py # export SETUPTOOLS_SCM_DEBUG=1 @@ -740,10 +471,7 @@ export FC=%{_bindir}/gfortran-%{compat_gcc_major} # export CMAKE_SHARED_LINKER_FLAGS=-Wl,--verbose # Manually set this hardening flag -# CUDA is unhappy with pie, so do not use it -%if %{without cuda} export CMAKE_EXE_LINKER_FLAGS=-pie -%endif export BUILD_CUSTOM_PROTOBUF=OFF export BUILD_NVFUSER=OFF @@ -756,12 +484,7 @@ export INTERN_BUILD_MOBILE=OFF export 
USE_DISTRIBUTED=OFF export USE_CUDA=OFF export USE_FAKELOWP=OFF -%if %{with fbgemm} -export USE_FBGEMM=ON -export USE_SYSTEM_FBGEMM=ON -%else export USE_FBGEMM=OFF -%endif export USE_FLASH_ATTENTION=OFF export USE_GOLD_LINKER=ON export USE_GLOO=OFF @@ -787,31 +510,13 @@ export USE_SYSTEM_LIBS=OFF export USE_TENSORPIPE=OFF export USE_XNNPACK=ON export USE_XPU=OFF - -%if %{with pthreadpool} export USE_SYSTEM_PTHREADPOOL=ON -%endif - -%if %{with xnnpack} export USE_SYSTEM_CPUINFO=ON export USE_SYSTEM_FP16=ON export USE_SYSTEM_FXDIV=ON export USE_SYSTEM_PSIMD=ON export USE_SYSTEM_XNNPACK=ON -%endif -%if %{with cuda} -%if %{without rocm} -export CPLUS_INCLUDE_PATH=/usr/local/cuda-%{cuda_ver}/include -export CUDACXX=/usr/local/cuda-%{cuda_ver}/bin/nvcc -export CUDA_HOME=/usr/local/cuda-%{cuda_ver}/ -export USE_CUDA=ON -# The arches to build for -export TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0" -%endif -%endif - -%if %{with distributed} export USE_DISTRIBUTED=ON %if %{with tensorpipe} export USE_TENSORPIPE=ON @@ -825,7 +530,6 @@ export USE_SYSTEM_GLOO=ON %if %{with mpi} export USE_MPI=ON %endif -%endif %if %{with test} export BUILD_TEST=ON @@ -844,9 +548,7 @@ export BUILD_TEST=ON %if %{with rocm} export USE_ROCM=ON -%if %{with magma} export USE_MAGMA=ON -%endif export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` @@ -881,12 +583,6 @@ done %install -%if %{with compat_gcc} -export CC=%{_bindir}/gcc%{compat_gcc_major} -export CXX=%{_bindir}/g++%{compat_gcc_major} -export FC=%{_bindir}/gfortran%{compat_gcc_major} -%endif - %if %{with rocm} export USE_ROCM=ON export HIP_PATH=`hipconfig -p` @@ -926,13 +622,12 @@ done %endif +%check +%py3_check_import torch + # Do not remote the empty files -%if %{with cuda} -%files -n python3-%{pypi_name}-cuda-%{cuda_ver} -%else %files -n python3-%{pypi_name} -%endif %license LICENSE %doc README.md %{_bindir}/convert-caffe2-to-onnx From 749438b8bbc43fdaeb7ffbef885b238a817952df 
Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 1 Jan 2025 16:54:46 -0800 Subject: [PATCH 46/88] Rebuild for onnx Signed-off-by: Tom Rix From d8b4f3d9d965c2c1336e6bd77b97ecf4e6e3af98 Mon Sep 17 00:00:00 2001 From: Fedora Release Engineering Date: Sat, 18 Jan 2025 20:11:05 +0000 Subject: [PATCH 47/88] Rebuilt for https://fedoraproject.org/wiki/Fedora_42_Mass_Rebuild From 14a9d711181acb231c1fae9504780e21c08b131d Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 23 Jan 2025 12:11:47 -0800 Subject: [PATCH 48/88] triage build break gcc 15 libstdc++ change broke things. Signed-off-by: Tom Rix --- 0001-torch-paper-over-c-assert.patch | 88 ++++++++++++++++++++++++++++ python-torch.spec | 10 ++++ 2 files changed, 98 insertions(+) create mode 100644 0001-torch-paper-over-c-assert.patch diff --git a/0001-torch-paper-over-c-assert.patch b/0001-torch-paper-over-c-assert.patch new file mode 100644 index 0000000..b7e55ce --- /dev/null +++ b/0001-torch-paper-over-c-assert.patch @@ -0,0 +1,88 @@ +From f646e0f04ae591c8f2d8a0cd24b035725c57659b Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Thu, 23 Jan 2025 08:24:22 -0800 +Subject: [PATCH] torch: paper over c++ assert + +--- + aten/src/ATen/native/sparse/FlattenIndicesCommon.h | 2 ++ + .../ATen/native/sparse/SparseBinaryOpIntersectionCommon.h | 5 +++++ + .../src/ATen/native/sparse/ValidateCompressedIndicesCommon.h | 2 ++ + 3 files changed, 9 insertions(+) + +diff --git a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h +index 0e79ed809ae6..a3cec8aaf78b 100644 +--- a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h ++++ b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h +@@ -69,11 +69,13 @@ Tensor _flatten_indices_impl(const Tensor& indices, IntArrayRef size) { + [=] FUNCAPI (int64_t nnz_idx) -> int64_t { + const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; + auto hash = static_cast(0); ++#if 0 + for (int64_t dim = 0; dim < sparse_dim; ++dim) { 
+ const auto dim_hash_coeff = hash_coeffs[dim]; + const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; + hash += dim_index * dim_hash_coeff; + } ++#endif + return hash; + }); + } +diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h +index c0b94bf39d54..8de4900b7a01 100644 +--- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h ++++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h +@@ -279,12 +279,15 @@ void _sparse_binary_op_intersection_kernel_impl( + if (!ptr_indices) { + return hash; + } ++#if 0 ++// /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/array:219:2: error: reference to __host__ function '__glibcxx_assert_fail' in __host__ __device__ function + const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; + for (int64_t dim = 0; dim < sparse_dim; ++dim) { + const auto dim_hash_coeff = hash_coeffs[dim]; + const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; + hash += dim_index * dim_hash_coeff; + } ++#endif + return hash; + }); + } +@@ -364,6 +367,7 @@ void _sparse_binary_op_intersection_kernel_impl( + if (hash_ptr) { + hash = hash_ptr[nnz_idx]; + } else if (sparse_dim) { ++#if 0 + // Compute hash value + const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; + for (int64_t dim = 0; dim < sparse_dim; ++dim) { +@@ -371,6 +375,7 @@ void _sparse_binary_op_intersection_kernel_impl( + const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; + hash += dim_index * dim_hash_coeff; + } ++#endif + } + + // Perform hash values intersection +diff --git a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h +index ec4c084a39cc..9bc9655b0afa 100644 +--- a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h ++++ 
b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h +@@ -341,6 +341,7 @@ void _validate_compressed_sparse_indices_kernel( + // assuming idx contiguity per batch: + int64_t tmp = batch_idx * nnz; + // `nnz == idx_sizes[idx_ndims - 1]` is checked above as `nnz == idx.size(-1)` ++#if 0 + for (int i = idx_ndims - 1; + i >= 0 && nnz > 0; // break early when nnz==0 + i--) { +@@ -348,6 +349,7 @@ void _validate_compressed_sparse_indices_kernel( + idx_offset += (tmp - div * idx_sizes[i]) * idx_strides[i]; + tmp = div; + } ++#endif + const auto* RESTRICT ptr_idx_batch = ptr_idx + idx_offset; + _check_idx_sorted_distinct_vals_slices_with_cidx< + cdim_name, +-- +2.48.1 + diff --git a/python-torch.spec b/python-torch.spec index 2261c9e..267cbb3 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -100,6 +100,7 @@ Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # Patches need to be refactored for ToT # These are ROCm packages Patch101: 0001-cuda-hip-signatures.patch +Patch102: 0001-torch-paper-over-c-assert.patch ExclusiveArch: x86_64 aarch64 %global toolchain gcc @@ -285,6 +286,10 @@ cp -r libuv-*/* third_party/tensorpipe/third_party/libuv/ tar xf %{SOURCE22} rm -rf third_party/tensorpipe/third_party/libnop/* cp -r libnop-*/* third_party/tensorpipe/third_party/libnop/ + +# gcc 15 include cstdint +sed -i '/#include ' third_party/tensorpipe/tensorpipe/common/allocator.h +sed -i '/#include ' third_party/tensorpipe/tensorpipe/common/memory.h %endif %if %{without opentelemtry} @@ -324,6 +329,11 @@ sed -i -e 's@sympy==1.13.1@sympy>=1.13.1@' setup.py sed -i -e '/aotriton.cmake/d' cmake/Dependencies.cmake # Compress hip sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc --offload-compress@' cmake/Dependencies.cmake +# Silence noisy warning +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-pass-failed@' cmake/Dependencies.cmake +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc 
-Wno-unused-command-line-argument@' cmake/Dependencies.cmake +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unused-result@' cmake/Dependencies.cmake +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-deprecated-declarations@' cmake/Dependencies.cmake # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt From c36179d4207c790c41bfcc98efaa64b08bc3ed48 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 24 Jan 2025 04:39:34 -0800 Subject: [PATCH 49/88] Document the issue for c++ asserts in upstream Signed-off-by: Tom Rix --- python-torch.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/python-torch.spec b/python-torch.spec index 267cbb3..86794c5 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -100,6 +100,7 @@ Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # Patches need to be refactored for ToT # These are ROCm packages Patch101: 0001-cuda-hip-signatures.patch +# https://github.com/pytorch/pytorch/issues/145608 Patch102: 0001-torch-paper-over-c-assert.patch ExclusiveArch: x86_64 aarch64 From 34290486726d9e6bcb7b92f67a542b59fe2dc8e0 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 31 Jan 2025 04:02:13 -0800 Subject: [PATCH 50/88] Rebuild From 9c39544f2ccbc19b4a11c0c0709e3313d5711cf0 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 17 Feb 2025 05:15:28 -0800 Subject: [PATCH 51/88] Remove rocm loop Signed-off-by: Tom Rix --- python-torch.spec | 64 ++--------------------------------------------- 1 file changed, 2 insertions(+), 62 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 86794c5..7d6d26e 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -25,9 +25,6 @@ %ifarch x86_64 %bcond_without rocm %endif -%bcond_with rocm_loop -%global rocm_default_gpu default -%global rocm_gpu_list gfx9 # For testing distributed+rccl etc. 
%bcond_with rccl @@ -180,12 +177,10 @@ BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros -BuildRequires: rocm-rpm-macros-modules BuildRequires: rocthrust-devel BuildRequires: roctracer-devel Requires: amdsmi -Requires: rocm-rpm-macros-modules %endif @@ -236,15 +231,6 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. -%if %{with rocm_loop} -%package -n python3-%{pypi_name}-rocm-gfx9 -Summary: %{name} for ROCm gfx9 - -%description -n python3-%{pypi_name}-rocm-gfx9 -%{summary} - -%endif - %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -567,24 +553,8 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} - -gpu=%{rocm_default_gpu} -module load rocm/$gpu -export PYTORCH_ROCM_ARCH=$ROCM_GPUS +export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %py3_build -mv build build-${gpu} -module purge - -%if %{with rocm_loop} -for gpu in %{rocm_gpu_list} -do - module load rocm/$gpu - export PYTORCH_ROCM_ARCH=$ROCM_GPUS - %py3_build - mv build build-${gpu} - module purge -done -%endif %else @@ -603,28 +573,8 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} - -gpu=%{rocm_default_gpu} -module load rocm/$gpu -export PYTORCH_ROCM_ARCH=$ROCM_GPUS -mv build-${gpu} build +export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %py3_install -mv build build-${gpu} -module purge - -%if %{with rocm_loop} -for gpu in %{rocm_gpu_list} -do - module load rocm/$gpu - export PYTORCH_ROCM_ARCH=$ROCM_GPUS - mv build-${gpu} build - # need to customize the install location, so replace py3_install - %{__python3} %{py_setup} %{?py_setup_args} install -O1 --skip-build --root %{buildroot} --prefix /usr/lib64/rocm/${gpu} 
%{?*} - rm -rfv %{buildroot}/usr/lib/rocm/${gpu}/bin/__pycache__ - mv build build-${gpu} - module purge -done -%endif %else @@ -650,16 +600,6 @@ done %{python3_sitearch}/functorch %{python3_sitearch}/torchgen -%if %{with rocm} -%if %{with rocm_loop} - -%files -n python3-%{pypi_name}-rocm-gfx9 -%{_libdir}/rocm/gfx9/bin/* -%{_libdir}/rocm/gfx9/lib64/* - -%endif -%endif - %changelog %autochangelog From 2508009c1f33f1a5ff742b9bad321afc1d2ddc0b Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 26 Feb 2025 07:32:31 -0800 Subject: [PATCH 52/88] Remove gold linker Signed-off-by: Tom Rix --- python-torch.spec | 2 -- 1 file changed, 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 7d6d26e..6ceceea 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -105,7 +105,6 @@ ExclusiveArch: x86_64 aarch64 %global _lto_cflags %nil BuildRequires: cmake -BuildRequires: binutils-gold BuildRequires: eigen3-devel BuildRequires: flexiblas-devel BuildRequires: fmt-devel @@ -483,7 +482,6 @@ export USE_CUDA=OFF export USE_FAKELOWP=OFF export USE_FBGEMM=OFF export USE_FLASH_ATTENTION=OFF -export USE_GOLD_LINKER=ON export USE_GLOO=OFF export USE_ITT=OFF export USE_KINETO=OFF From 7569831b203df17daeb25bc64ca9806449b10bb7 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 1 Mar 2025 07:52:12 -0800 Subject: [PATCH 53/88] cmake version changed Signed-off-by: Tom Rix --- python-torch.spec | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python-torch.spec b/python-torch.spec index 6ceceea..48096a4 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -420,6 +420,9 @@ sed -i -e 's@DESTINATION ${PYTHON_LIB_REL_PATH}@DESTINATION ${CMAKE_INSTALL_PREF # reenable foxi linking sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@#list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@' cmake/Dependencies.cmake +# cmake version changed +sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' 
third_party/tensorpipe/third_party/libuv/CMakeLists.txt + %if %{with rocm} # hipify ./tools/amd_build/build_amd.py From dd353fd56b4b082675582145e25af4d7568459b0 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 12 Mar 2025 08:07:40 -0700 Subject: [PATCH 54/88] Remove papering over c++ assert problem. Signed-off-by: Tom Rix --- python-torch.spec | 2 -- 1 file changed, 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 48096a4..7787533 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -97,8 +97,6 @@ Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # Patches need to be refactored for ToT # These are ROCm packages Patch101: 0001-cuda-hip-signatures.patch -# https://github.com/pytorch/pytorch/issues/145608 -Patch102: 0001-torch-paper-over-c-assert.patch ExclusiveArch: x86_64 aarch64 %global toolchain gcc From 23f5d1192643bdfc836fb031dac1dcc63e075e99 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 13 Mar 2025 05:07:43 -0700 Subject: [PATCH 55/88] Update gitcommit Signed-off-by: Tom Rix --- python-torch.spec | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 7787533..95584ca 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,13 +6,19 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.5.0-rc9 -%global commit0 417a0763a7d69f6ce80719ac89c1d2deeee78163 +# v2.7.0-rc1 +%global commit0 cdd7a2c72bbf0a72faf6fe4b4903c053f0465a2e %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 2024103 -%global pypi_version 2.5.0 +%global date0 20250412 +%global pypi_version 2.7.0 +%global flatbuffers_version 23.3.3 +%global miniz_version 3.0.2 +%global pybind11_version 2.13.6 %else %global pypi_version 2.5.1 +%global flatbuffers_version 23.3.3 +%global miniz_version 2.1.0 +%global pybind11_version 2.11.1 %endif # For -test subpackage @@ -60,8 +66,8 @@ Source1000: pyproject.toml %else Source0: 
%{forgeurl}/releases/download/v%{version}/pytorch-v%{version}.tar.gz %endif -Source1: https://github.com/google/flatbuffers/archive/refs/tags/v23.3.3.tar.gz -Source2: https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz +Source1: https://github.com/google/flatbuffers/archive/refs/tags/v%{flatbuffers_version}.tar.gz +Source2: https://github.com/pybind/pybind11/archive/refs/tags/v%{pybind11_version}.tar.gz # Developement on tensorpipe has stopped, repo made read only July 1, 2023, this is the last commit %global tp_commit 52791a2fd214b2a9dc5759d36725909c1daa7f2e @@ -91,12 +97,14 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif +%if %{without gitcommit} Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # ROCm patches # Patches need to be refactored for ToT # These are ROCm packages Patch101: 0001-cuda-hip-signatures.patch +%endif ExclusiveArch: x86_64 aarch64 %global toolchain gcc @@ -205,10 +213,10 @@ Summary: %{summary} Provides: pytorch # Apache-2.0 -Provides: bundled(flatbuffers) = 22.3.3 +Provides: bundled(flatbuffers) = %{flatbuffers_version} # MIT -Provides: bundled(miniz) = 2.1.0 -Provides: bundled(pybind11) = 2.11.1 +Provides: bundled(miniz) = %{miniz_version} +Provides: bundled(pybind11) = %{pybind11_version} %if %{with tensorpipe} # BSD-3-Clause @@ -254,11 +262,11 @@ rm -rf %{pypi_name}.egg-info tar xf %{SOURCE1} rm -rf third_party/flatbuffers/* -cp -r flatbuffers-23.3.3/* third_party/flatbuffers/ +cp -r flatbuffers-%{flatbuffers_version}/* third_party/flatbuffers/ tar xf %{SOURCE2} rm -rf third_party/pybind11/* -cp -r pybind11-2.11.1/* third_party/pybind11/ +cp -r pybind11-%{pybind11_version}/* third_party/pybind11/ %if %{with tensorpipe} tar xf %{SOURCE20} @@ -345,7 +353,7 @@ sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py # the third_party dir to compile the file. 
# mimiz is licensed MIT # https://github.com/richgel999/miniz/blob/master/LICENSE -mv third_party/miniz-2.1.0 . +mv third_party/miniz-%{miniz_version} . # # setup.py depends on this script mv third_party/build_bundled.py . @@ -379,7 +387,7 @@ mv third_party/googletest . rm -rf third_party/* # Put stuff back mv build_bundled.py third_party -mv miniz-2.1.0 third_party +mv miniz-%{miniz_version} third_party mv flatbuffers third_party mv pybind11 third_party From bd11f4aa1a27d626aae19b7278b790e753e587bc Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 22 Mar 2025 11:58:26 -0700 Subject: [PATCH 56/88] Update gitcommit to v2.7.0-rc2 Signed-off-by: Tom Rix --- python-torch.spec | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 95584ca..8a9a9f7 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc1 -%global commit0 cdd7a2c72bbf0a72faf6fe4b4903c053f0465a2e +# v2.7.0-rc2 +%global commit0 b1940b5867e40e40ebdce4db76f76d3d0b71d3f4 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250412 +%global date0 20250413 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 @@ -253,6 +253,10 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release} # Overwrite with a git checkout of the pyproject.toml cp %{SOURCE1000} . 
+# https://github.com/pytorch/pytorch/issues/149803 +# Tries to checkout nccl +sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py + %else %autosetup -p1 -n pytorch-v%{version} %endif @@ -329,6 +333,12 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-depr # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt +%if %{with gitcommit} +sed -i -e 's@fmt::fmt-header-only@fmt@' aten/src/ATen/CMakeLists.txt +sed -i -e 's@list(APPEND ATen_HIP_INCLUDE $)@@' aten/src/ATen/CMakeLists.txt + +sed -i -e 's@fmt::fmt-header-only@fmt@' third_party/kineto/libkineto/CMakeLists.txt +%endif sed -i -e 's@fmt::fmt-header-only@fmt@' c10/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' torch/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' cmake/Dependencies.cmake @@ -434,6 +444,10 @@ sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION ./tools/amd_build/build_amd.py # Fedora installs to /usr/include, not /usr/include/rocm-core sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/TunableGemm.h +%if %{with gitcommit} +# https://github.com/pytorch/pytorch/issues/149805 +sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' cmake/public/LoadHIP.cmake +%endif # use any hip, correct CMAKE_MODULE_PATH sed -i -e 's@lib/cmake/hip@lib64/cmake/hip@' cmake/public/LoadHIP.cmake sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake @@ -512,14 +526,22 @@ export USE_SYSTEM_ONNX=ON export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF export USE_TENSORPIPE=OFF +%if %{with gitcommit} +export USE_XNNPACK=OFF +%else export USE_XNNPACK=ON +%endif export USE_XPU=OFF export USE_SYSTEM_PTHREADPOOL=ON export USE_SYSTEM_CPUINFO=ON export USE_SYSTEM_FP16=ON export USE_SYSTEM_FXDIV=ON export USE_SYSTEM_PSIMD=ON +%if %{with gitcommit} +export USE_SYSTEM_XNNPACK=OFF +%else export USE_SYSTEM_XNNPACK=ON +%endif export USE_DISTRIBUTED=ON %if %{with 
tensorpipe} From e80f34f74dac2bbc813c0914a1b4afd74abbe057 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 29 Mar 2025 05:11:02 -0700 Subject: [PATCH 57/88] Update gitcommit to 2.7-rc3 Signed-off-by: Tom Rix --- 0001-Add-cmake-varaible-USE_ROCM_CK.patch | 120 ++++++++++++++++++++++ 0001-python-torch-disable-ck.patch | 112 ++++++++++++++++++++ python-torch.spec | 25 ++++- 3 files changed, 254 insertions(+), 3 deletions(-) create mode 100644 0001-Add-cmake-varaible-USE_ROCM_CK.patch create mode 100644 0001-python-torch-disable-ck.patch diff --git a/0001-Add-cmake-varaible-USE_ROCM_CK.patch b/0001-Add-cmake-varaible-USE_ROCM_CK.patch new file mode 100644 index 0000000..b34e07a --- /dev/null +++ b/0001-Add-cmake-varaible-USE_ROCM_CK.patch @@ -0,0 +1,120 @@ +From 0f33e0a7bbd1522ee74f8fc1fbe3af7563318c79 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 28 Mar 2025 15:33:09 -0700 +Subject: [PATCH] Add cmake varaible USE_ROCM_CK + +To control the use of ROCm Composable Kernel usage. + +CK is not compatible with all rocBLAS gpu's, so the user +must explicitly choose to use CK. 
+ +Signed-off-by: Tom Rix +--- + CMakeLists.txt | 1 + + aten/src/ATen/CMakeLists.txt | 8 ++++++-- + aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- + cmake/Dependencies.cmake | 3 +++ + 4 files changed, 15 insertions(+), 7 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index f3fee2f7ffc2..73903acce452 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -249,6 +249,7 @@ cmake_dependent_option( + BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON + "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) + cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) ++cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) + option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) + cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) + cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index 085af373ec22..af268ab88572 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -361,13 +361,17 @@ endif() + ${native_quantized_hip_hip} + ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels and Triton ++ if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels and Triton + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" +- ${native_hip_bgemm} ${native_hip_ck} ++ ${native_hip_bgemm} ${native_hip_ck}) ++ endif() ++ if(WIN32) # Windows doesn't support Composable Kernels and Triton ++ exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" + ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) + endif() ++ + # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) + list(APPEND all_hip_cpp + ${native_nested_hip_cpp} +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp 
b/aten/src/ATen/cuda/CUDABlas.cpp +index a62b028fd4ff..a3dbf76848ea 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); + #endif + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); + } +@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); + } +@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); + } +@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + 
at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 30917bdf39f5..2ca6091030f1 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1046,6 +1046,9 @@ if(USE_ROCM) + if(HIPBLASLT_VEC_EXT) + list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) + endif() ++ if(USE_ROCM_CK) ++ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) ++ endif() + list(APPEND HIP_HIPCC_FLAGS --offload-compress) + if(WIN32) + add_definitions(-DROCM_ON_WINDOWS) +-- +2.48.1 + diff --git a/0001-python-torch-disable-ck.patch b/0001-python-torch-disable-ck.patch new file mode 100644 index 0000000..e8fd9c2 --- /dev/null +++ b/0001-python-torch-disable-ck.patch @@ -0,0 +1,112 @@ +From 027dad1eaed51c1172e2497da611e3267d42d2f0 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 28 Mar 2025 09:16:03 -0700 +Subject: [PATCH] python-torch: disable ck + +--- + aten/src/ATen/CMakeLists.txt | 7 +++---- + aten/src/ATen/Context.cpp | 1 + + aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- + 3 files changed, 9 insertions(+), 9 deletions(-) + +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index 085af373ec22..84808880e51c 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -134,7 +134,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu") + file(GLOB native_cuda_cpp "native/cuda/*.cpp") + file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") + file(GLOB native_cuda_linalg_cpp "native/cuda/linalg/*.cpp") +-file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" "native/hip/bgemm_kernels/*.h") ++file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" ) + file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") + file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") + file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") +@@ -145,7 +145,7 @@ file(GLOB native_nested_h "native/nested/*.h") + file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") + 
file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") + +-file(GLOB native_hip_hip "native/hip/*.hip" "native/hip/bgemm_kernels/*.hip") ++file(GLOB native_hip_hip "native/hip/*.hip" ) + file(GLOB native_hip_cpp "native/hip/*.cpp") + file(GLOB native_hip_linalg_cpp "native/hip/linalg/*.cpp") + file(GLOB native_miopen_cpp "native/miopen/*.cpp") +@@ -361,13 +361,12 @@ endif() + ${native_quantized_hip_hip} + ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels and Triton + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" + ${native_hip_bgemm} ${native_hip_ck} + ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) +- endif() ++ + # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) + list(APPEND all_hip_cpp + ${native_nested_hip_cpp} +diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp +index f598fc3a39d3..03dab6ff38fe 100644 +--- a/aten/src/ATen/Context.cpp ++++ b/aten/src/ATen/Context.cpp +@@ -355,6 +355,7 @@ at::BlasBackend Context::blasPreferredBackend() { + } + + void Context::setBlasPreferredBackend(at::BlasBackend b) { ++ return; + #ifdef _MSC_VER + TORCH_WARN_ONCE( + "torch.backends.cuda.preferred_blas_library is an experimental feature. 
" +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index a62b028fd4ff..cba38426ea1f 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); + #endif + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); + } +@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); + } +@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); + } +@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if 
(at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-- +2.48.1 + diff --git a/python-torch.spec b/python-torch.spec index 8a9a9f7..8f5ed02 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc2 -%global commit0 b1940b5867e40e40ebdce4db76f76d3d0b71d3f4 +# v2.7.0-rc3 +%global commit0 b04d8358d959925bee0adfd67cc17987af9fbb9d %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250413 +%global date0 20250326 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 @@ -104,6 +104,13 @@ Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # Patches need to be refactored for ToT # These are ROCm packages Patch101: 0001-cuda-hip-signatures.patch +%else +# https://github.com/pytorch/pytorch/issues/150187 +# The hack job +# Patch11: 0001-python-torch-disable-ck.patch +# Cleaned up hack job +Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch + %endif ExclusiveArch: x86_64 aarch64 @@ -159,6 +166,9 @@ BuildRequires: python3dist(sympy) %endif %if %{with rocm} +%if %{with gitcommit} +BuildRequires: composable_kernel-devel +%endif BuildRequires: hipblas-devel BuildRequires: hipblaslt-devel BuildRequires: hipcub-devel @@ -330,6 +340,8 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-pass sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unused-command-line-argument@' cmake/Dependencies.cmake sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unused-result@' cmake/Dependencies.cmake sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-deprecated-declarations@' cmake/Dependencies.cmake +# Use parallel jobs +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel-jobs=4@' cmake/Dependencies.cmake # No 
third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt @@ -447,6 +459,9 @@ sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable %if %{with gitcommit} # https://github.com/pytorch/pytorch/issues/149805 sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' cmake/public/LoadHIP.cmake +# Fedora installs to /usr/include, not /usr/include/rocm-core +sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/Tunable.cpp +sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/cuda/tunable/Tunable.cpp %endif # use any hip, correct CMAKE_MODULE_PATH sed -i -e 's@lib/cmake/hip@lib64/cmake/hip@' cmake/public/LoadHIP.cmake @@ -574,6 +589,7 @@ export BUILD_TEST=ON %if %{with rocm} export USE_ROCM=ON +export USE_ROCM_CK=OFF export USE_MAGMA=ON export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` @@ -595,6 +611,7 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %if %{with rocm} export USE_ROCM=ON +export USE_ROCM_CK=OFF export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` @@ -620,8 +637,10 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %files -n python3-%{pypi_name} %license LICENSE %doc README.md +%if %{without gitcommit} %{_bindir}/convert-caffe2-to-onnx %{_bindir}/convert-onnx-to-caffe2 +%endif %{_bindir}/torchrun %{_bindir}/torchfrtrace %{python3_sitearch}/%{pypi_name} From 96edd6c2ece9753ac856ef3809f735fc1174783d Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 5 Apr 2025 08:56:18 -0700 Subject: [PATCH 58/88] Update gitcommit to v2.7.0-rc6 Signed-off-by: Tom Rix --- python-torch.spec | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 8f5ed02..a217432 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc3 -%global commit0 
b04d8358d959925bee0adfd67cc17987af9fbb9d +# v2.7.0-rc6 +%global commit0 06c6a81a987e271d35a5da9501b4a17915bb8206 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250326 +%global date0 20250403 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 @@ -166,9 +166,6 @@ BuildRequires: python3dist(sympy) %endif %if %{with rocm} -%if %{with gitcommit} -BuildRequires: composable_kernel-devel -%endif BuildRequires: hipblas-devel BuildRequires: hipblaslt-devel BuildRequires: hipcub-devel From e3c2449e4fac3865f272f20e24f20948b2d9b208 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 10 Apr 2025 05:27:21 -0700 Subject: [PATCH 59/88] Update gitcomit to 2.7.0-rc8 Signed-off-by: Tom Rix --- python-torch.spec | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index a217432..8ea791e 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc6 -%global commit0 06c6a81a987e271d35a5da9501b4a17915bb8206 +# v2.7.0-rc8 +%global commit0 c7ff78dfc0c38847bf5daa78ab8b3669e1734246 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250403 +%global date0 20250408 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 From f0eda9ace1c0f50ab511cfd9fe524b50053e09c1 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 13 Apr 2025 07:11:27 -0700 Subject: [PATCH 60/88] Update gitcommit to 2.7.0-rc9 Signed-off-by: Tom Rix --- python-torch.spec | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 8ea791e..f03837b 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc8 -%global commit0 c7ff78dfc0c38847bf5daa78ab8b3669e1734246 +# v2.7.0-rc9 +%global commit0 
073912749d667fcfb2de1c15e1e664dc0ccd3460 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250408 +%global date0 20250410 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 From fb64b28d49af6ca8bd72c63bc83166a0fd737b32 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 19 Apr 2025 08:20:35 -0700 Subject: [PATCH 61/88] Update gitcommit to 2.7.0-rc10 Signed-off-by: Tom Rix --- python-torch.spec | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index f03837b..5a9f08f 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc9 -%global commit0 073912749d667fcfb2de1c15e1e664dc0ccd3460 +# v2.7.0-rc10 +%global commit0 134179474539648ba7dee1317959529fbd0e7f89 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250410 +%global date0 20250415 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 From 2f3d92b7c5b36d8afb0846f4af6871bcb7a7fc1d Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 25 Apr 2025 12:39:04 -0700 Subject: [PATCH 62/88] Update to 2.7.0 Signed-off-by: Tom Rix --- .gitignore | 2 ++ python-torch.spec | 37 +++++++++++-------------------------- sources | 8 +++----- 3 files changed, 16 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index cdf142f..25abff5 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ /pytorch-v2.4.1.tar.gz /pytorch-v2.5.0.tar.gz /pytorch-v2.5.1.tar.gz +/pytorch-v2.7.0.tar.gz +/v2.13.6.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 5a9f08f..bc15924 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -15,10 +15,10 @@ %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %else -%global pypi_version 2.5.1 +%global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 -%global miniz_version 2.1.0 -%global pybind11_version 
2.11.1 +%global miniz_version 3.0.2 +%global pybind11_version 2.13.6 %endif # For -test subpackage @@ -98,19 +98,20 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ %endif %if %{without gitcommit} -Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch +# Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # ROCm patches # Patches need to be refactored for ToT # These are ROCm packages -Patch101: 0001-cuda-hip-signatures.patch -%else +# Patch101: 0001-cuda-hip-signatures.patch + # https://github.com/pytorch/pytorch/issues/150187 # The hack job # Patch11: 0001-python-torch-disable-ck.patch # Cleaned up hack job Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch +%else %endif ExclusiveArch: x86_64 aarch64 @@ -260,10 +261,6 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release} # Overwrite with a git checkout of the pyproject.toml cp %{SOURCE1000} . -# https://github.com/pytorch/pytorch/issues/149803 -# Tries to checkout nccl -sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py - %else %autosetup -p1 -n pytorch-v%{version} %endif @@ -342,12 +339,10 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt -%if %{with gitcommit} sed -i -e 's@fmt::fmt-header-only@fmt@' aten/src/ATen/CMakeLists.txt sed -i -e 's@list(APPEND ATen_HIP_INCLUDE $)@@' aten/src/ATen/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' third_party/kineto/libkineto/CMakeLists.txt -%endif sed -i -e 's@fmt::fmt-header-only@fmt@' c10/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' torch/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' cmake/Dependencies.cmake @@ -361,6 +356,10 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPE sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu 
PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt +# https://github.com/pytorch/pytorch/issues/149803 +# Tries to checkout nccl +sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py + # Disable the use of check_submodule's in the setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -453,13 +452,11 @@ sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION ./tools/amd_build/build_amd.py # Fedora installs to /usr/include, not /usr/include/rocm-core sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/TunableGemm.h -%if %{with gitcommit} # https://github.com/pytorch/pytorch/issues/149805 sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' cmake/public/LoadHIP.cmake # Fedora installs to /usr/include, not /usr/include/rocm-core sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/Tunable.cpp sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/cuda/tunable/Tunable.cpp -%endif # use any hip, correct CMAKE_MODULE_PATH sed -i -e 's@lib/cmake/hip@lib64/cmake/hip@' cmake/public/LoadHIP.cmake sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake @@ -538,22 +535,14 @@ export USE_SYSTEM_ONNX=ON export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF export USE_TENSORPIPE=OFF -%if %{with gitcommit} export USE_XNNPACK=OFF -%else -export USE_XNNPACK=ON -%endif export USE_XPU=OFF export USE_SYSTEM_PTHREADPOOL=ON export USE_SYSTEM_CPUINFO=ON export USE_SYSTEM_FP16=ON export USE_SYSTEM_FXDIV=ON export USE_SYSTEM_PSIMD=ON -%if %{with gitcommit} export USE_SYSTEM_XNNPACK=OFF -%else -export USE_SYSTEM_XNNPACK=ON -%endif export USE_DISTRIBUTED=ON %if %{with tensorpipe} @@ -634,10 +623,6 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %files -n python3-%{pypi_name} %license LICENSE %doc README.md -%if %{without gitcommit} -%{_bindir}/convert-caffe2-to-onnx 
-%{_bindir}/convert-onnx-to-caffe2 -%endif %{_bindir}/torchrun %{_bindir}/torchfrtrace %{python3_sitearch}/%{pypi_name} diff --git a/sources b/sources index aa1ed3c..4021d40 100644 --- a/sources +++ b/sources @@ -1,11 +1,9 @@ +SHA512 (pytorch-v2.7.0.tar.gz) = 17e875a66f1669901f5f770c9d829ba5bfa3967296cfb71550e8a92507181db742548eaf7cc9a2c478c4b91e366f27cc480e2e1bbb328db8501d30e1649839e6 SHA512 (v23.3.3.tar.gz) = 4066c94f2473c7ea16917d29a613e16f840a329089c88e0bdbdb999aef3442ba00abfd2aa92266fa9c067e399dc88e6f0ccac40dc151378857e665638e78bbf0 -SHA512 (v2.11.1.tar.gz) = ed1512ff0bca3bc0a45edc2eb8c77f8286ab9389f6ff1d5cb309be24bc608abbe0df6a7f5cb18c8f80a3bfa509058547c13551c3cd6a759af708fd0cdcdd9e95 +SHA512 (v2.13.6.tar.gz) = 497c25b33b09a9c42f67131ab82e35d689e8ce089dd7639be997305ff9a6d502447b79c824508c455d559e61f0186335b54dd2771d903a7c1621833930622d1a SHA512 (tensorpipe-52791a2.tar.gz) = 1e5faf17a7236c5506c08cb28be16069b11bb929bbca64ed9745ce4277d46739186ab7d6597da7437d90ed2d166d4c37ef2f3bceabe8083ef3adbb0e8e5f227e -SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3522470b1ecc4798ff43d344a013cee2f6901e83267c6167072947b754e63f1552ae7044cffe234c36 SHA512 (v1.41.0.tar.gz) = bb08a1970a10e8d9571ffea3d021643de30ec212cd51317b98d6cf0cfe55d6877992921fb01d1188a6d466687335b77885685d924f8cb7200a0bec30eee05c65 +SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3522470b1ecc4798ff43d344a013cee2f6901e83267c6167072947b754e63f1552ae7044cffe234c36 SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7 SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a -SHA512 
(pytorch-v2.4.1.tar.gz) = fc364112a88540035f213274b526b4c8ad9be335ff214161091e8a8d3efa89ebef5a5b4d33b20b5b659896d164dcbe739f0d7d2a293d3afc0efcfaa2bf62fc2c -SHA512 (pytorch-v2.5.0.tar.gz) = 6ccf1ac9f191f5bd757ef7fbfc1dcd81d591577f2d3df7313c6ed32790c592aaffd253e18dc778a2fcc707e4533299817dfdf9fae108636ce5c29c1b8ff8bba6 -SHA512 (pytorch-v2.5.1.tar.gz) = a8882608c2ab6467a58d60c6df84c9f1004b43eafeba57db499dbbfdecc09db2e221b9d4c344c8af7c0bea6252e874c400483502dca24a0b474c376b9fef1dd4 From aeb5b118d5303d5cd147d573df87021f0db9ba2a Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 1 May 2025 08:40:21 -0700 Subject: [PATCH 63/88] Turn off kleidai Breaks aarch64 Signed-off-by: Tom Rix --- python-torch.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/python-torch.spec b/python-torch.spec index bc15924..ab500c5 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -517,6 +517,7 @@ export USE_FLASH_ATTENTION=OFF export USE_GLOO=OFF export USE_ITT=OFF export USE_KINETO=OFF +export USE_KLEIDIAI=OFF export USE_LITE_INTERPRETER_PROFILER=OFF export USE_LITE_PROTO=OFF export USE_MAGMA=OFF From e6d73d7c4909bb23cb0dfad74464ecb3a81cb286 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 4 May 2025 07:36:00 -0700 Subject: [PATCH 64/88] Rebuild for magma Signed-off-by: Tom Rix From edfa2c25e3ffc54addd3cc4e18e972f7e2a6cf89 Mon Sep 17 00:00:00 2001 From: Python Maint Date: Fri, 6 Jun 2025 16:14:25 +0200 Subject: [PATCH 65/88] Rebuilt for Python 3.14 From 27593d78b34de046e4d8ebcf0c242343742bfc95 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 27 Jun 2025 14:44:29 -0700 Subject: [PATCH 66/88] update gitcommit to 2.8-rc3 Signed-off-by: Tom Rix --- pyproject.toml | 184 +++++++++++++++++++++++++++++++++++++++------- python-torch.spec | 35 ++++++--- 2 files changed, 181 insertions(+), 38 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9508ad0..ccf9c2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,41 @@ +[project] +name = "torch" +requires-python = ">=3.9" 
+license = {text = "BSD-3-Clause"} +dynamic = [ + "authors", + "classifiers", + "entry-points", + "dependencies", + "description", + "keywords", + "optional-dependencies", + "readme", + "scripts", + "version", +] + +[project.urls] +Homepage = "https://pytorch.org/" +Documentation = "https://pytorch.org/docs/" +Source = "https://github.com/pytorch/pytorch" +Forum = "https://discuss.pytorch.org/" + + [build-system] requires = [ - "setuptools", + # After 75.8.2 dropped dep disttools API. Please fix + # API temporarily restored and shim used. Please fix + # Setuptools will drop support for setup.py past 80 + # min version for recursive glob package data support + "setuptools>=62.3.0,<80.0", "wheel", "astunparse", "numpy", "ninja", "pyyaml", "cmake", - "typing-extensions", + "typing-extensions>=4.10.0", "requests", ] # Use legacy backend to import local packages in setup.py @@ -15,32 +43,68 @@ build-backend = "setuptools.build_meta:__legacy__" [tool.black] -# Uncomment if pyproject.toml worked fine to ensure consistency with flake8 -# line-length = 120 -target-version = ["py38", "py39", "py310", "py311"] +line-length = 88 + +[tool.isort] +src_paths = ["caffe2", "torch", "torchgen", "functorch", "test"] +extra_standard_library = ["typing_extensions"] +skip_gitignore = true +skip_glob = ["third_party/*"] +atomic = true +profile = "black" +indent = 4 +line_length = 88 +lines_after_imports = 2 +multi_line_output = 3 +include_trailing_comma = true +combine_as_imports = true + + +[tool.usort.known] +first_party = ["caffe2", "torch", "torchgen", "functorch", "test"] +standard_library = ["typing_extensions"] [tool.ruff] -target-version = "py38" +line-length = 88 +src = ["caffe2", "torch", "torchgen", "functorch", "test"] +[tool.ruff.format] +docstring-code-format = true +quote-style = "double" + +[tool.ruff.lint] # NOTE: Synchoronize the ignores with .flake8 +external = [ + "B001", + "B902", + "B950", + "E121", + "E122", + "E128", + "E131", + "E704", + "E723", + "F723", + 
"F812", + "P201", + "P204", + "T484", + "TOR901", +] ignore = [ # these ignores are from flake8-bugbear; please fix! "B007", "B008", "B017", "B018", # Useless expression - "B019", "B023", "B028", # No explicit `stacklevel` keyword argument found - "B904", "E402", "C408", # C408 ignored because we like the dict keyword argument syntax "E501", # E501 is not flexible enough, we're using B950 instead "E721", - "E731", # Assign lambda expression "E741", "EXE001", "F405", - "F841", # these ignores are from flake8-logging-format; please fix! "G101", # these ignores are from ruff NPY; please fix! @@ -48,39 +112,41 @@ ignore = [ # these ignores are from ruff PERF; please fix! "PERF203", "PERF401", - "PERF403", # these ignores are from PYI; please fix! - "PYI019", "PYI024", "PYI036", "PYI041", "PYI056", "SIM102", "SIM103", "SIM112", # flake8-simplify code styles "SIM105", # these ignores are from flake8-simplify. please fix or ignore with commented reason - "SIM108", + "SIM108", # SIM108 ignored because we prefer if-else-block instead of ternary expression "SIM110", "SIM114", # Combine `if` branches using logical `or` operator "SIM115", "SIM116", # Disable Use a dictionary instead of consecutive `if` statements "SIM117", "SIM118", - "UP006", # keep-runtime-typing "UP007", # keep-runtime-typing + "TC006", ] -line-length = 120 select = [ "B", + "B904", # Re-raised error without specifying the cause via the from keyword "C4", "G", "E", "EXE", "F", "SIM1", + "SIM911", "W", # Not included in flake8 + "FURB", + "LOG", "NPY", "PERF", "PGH004", + "PIE790", "PIE794", "PIE800", "PIE804", @@ -89,40 +155,92 @@ select = [ "PLC0131", # type bivariance "PLC0132", # type param mismatch "PLC0205", # string as __slots__ + "PLC3002", # unnecessary-direct-lambda-call "PLE", "PLR0133", # constant comparison "PLR0206", # property with params "PLR1722", # use sys exit + "PLR1736", # unnecessary list index "PLW0129", # assert on string literal + "PLW0131", # named expr without context + "PLW0133", 
# useless exception statement + "PLW0245", # super without brackets "PLW0406", # import self "PLW0711", # binary op exception + "PLW1501", # bad open mode + "PLW1507", # shallow copy os.environ "PLW1509", # preexec_fn not safe with threads + "PLW2101", # useless lock statement "PLW3301", # nested min max "PT006", # TODO: enable more PT rules + "PT014", # duplicate parameterize case "PT022", "PT023", "PT024", "PT025", "PT026", "PYI", + "Q003", # avoidable escaped quote + "Q004", # unnecessary escaped quote + "RSE", "RUF008", # mutable dataclass default + "RUF013", # ban implicit optional "RUF015", # access first ele in constant time "RUF016", # type error non-integer index "RUF017", - "TRY200", - "TRY302", + "RUF018", # no assignment in assert + "RUF019", # unnecessary-key-check + "RUF020", # never union + "RUF024", # from keys mutable + "RUF026", # default factory kwarg + "RUF030", # No print statement in assert + "RUF033", # default values __post_init__ dataclass + "RUF041", # simplify nested Literal + "RUF048", # properly parse `__version__` + "RUF200", # validate pyproject.toml + "S324", # for hashlib FIPS compliance + "SLOT", + "TC", + "TRY002", # ban vanilla raise (todo fix NOQAs) + "TRY203", + "TRY401", # verbose-log-message "UP", + "YTT", ] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "__init__.py" = [ "F401", ] +"*.pyi" = [ + "PYI011", # typed-argument-default-in-stub + "PYI021", # docstring-in-stub + "PYI053", # string-or-bytes-too-long +] +"functorch/notebooks/**" = [ + "F401", +] +"test/export/**" = [ + "PGH004" +] +"test/typing/**" = [ + "PGH004" +] "test/typing/reveal/**" = [ "F821", ] "test/torch_np/numpy_tests/**" = [ "F821", + "NPY201", +] +"test/dynamo/test_bytecode_utils.py" = [ + "F821", +] +"test/dynamo/test_debug_utils.py" = [ + "UP037", +] +"test/dynamo/test_misc.py" = [ + "PGH004", ] "test/jit/**" = [ "PLR0133", # tests require this for JIT @@ -136,19 +254,33 @@ select = [ "RUF015", "UP", # We don't want to modify the jit 
test as they test specify syntax ] - -"torch/onnx/**" = [ - "UP037", # ONNX does runtime type checking +"test/inductor/s429861_repro.py" = [ + "PGH004", +] +"test/inductor/test_torchinductor.py" = [ + "UP037", +] +# autogenerated #TODO figure out why file level noqa is ignored +"torch/_appdirs.py" = ["PGH004"] +"torch/jit/_shape_functions.py" = ["PGH004"] +"torch/_inductor/fx_passes/serialized_patterns/**" = ["F401", "F501"] +"torch/_inductor/autoheuristic/artifacts/**" = ["F401", "F501"] +"torch/_inductor/codegen/**" = [ + "PGH004" ] - "torchgen/api/types/__init__.py" = [ "F401", "F403", ] -"torchgen/executorch/api/types/__init__.py" = [ - "F401", - "F403", -] "torch/utils/collect_env.py" = [ "UP", # collect_env.py needs to work with older versions of Python ] +"torch/_vendor/**" = [ + "UP", # No need to mess with _vendor +] +"tools/linter/**" = [ + "LOG015" # please fix +] + +[tool.codespell] +ignore-words = "tools/linter/dictionary.txt" diff --git a/python-torch.spec b/python-torch.spec index ab500c5..90c908c 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,12 +6,12 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc10 -%global commit0 134179474539648ba7dee1317959529fbd0e7f89 +# v2.8.0-rc3 +%global commit0 3d53a53e504089a52a149791fd33d7fc898bd055 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250415 -%global pypi_version 2.7.0 -%global flatbuffers_version 23.3.3 +%global date0 20250625 +%global pypi_version 2.8.0 +%global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %else @@ -92,19 +92,16 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- %endif %if %{without kineto} +%if %{with gitcommit} +%global ki_commit 5e7501833f1021ce6f618572d3baf657b6319658 +%else %global ki_commit be1317644c68b4bfc4646024a6b221066e430031 +%endif %global ki_scommit %(c=%{ki_commit}; echo ${c:0:7}) Source80: 
https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif %if %{without gitcommit} -# Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch - -# ROCm patches -# Patches need to be refactored for ToT -# These are ROCm packages -# Patch101: 0001-cuda-hip-signatures.patch - # https://github.com/pytorch/pytorch/issues/150187 # The hack job # Patch11: 0001-python-torch-disable-ck.patch @@ -112,6 +109,7 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch %else +Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch %endif ExclusiveArch: x86_64 aarch64 @@ -190,6 +188,9 @@ BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros +%if %{with gitcommit} +BuildRequires: rocsolver-devel +%endif BuildRequires: rocthrust-devel BuildRequires: roctracer-devel @@ -356,9 +357,11 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPE sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt +%if %{without gitcommit} # https://github.com/pytorch/pytorch/issues/149803 # Tries to checkout nccl sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py +%endif # Disable the use of check_submodule's in the setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -432,6 +435,9 @@ mv googletest third_party # # Fake out pocketfft, and system header will be used mkdir third_party/pocketfft +%if %{with gitcommit} +cp /usr/include/pocketfft_hdronly.h third_party/pocketfft/ +%endif # # Use the system valgrind headers @@ -585,7 +591,12 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export 
HIP_CLANG_PATH=%{rocmllvm_bindir} +%if %{?fedora} <= 43 +export PYTORCH_ROCM_ARCH="gfx1100;gfx1201" +%else export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} +%endif + %py3_build %else From 42c33b8dcd0de27e96cf0871c8087aa571d63f20 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 20 Jul 2025 12:44:41 -0700 Subject: [PATCH 67/88] Update the next gitcommit to v2.8.0-rc6 Remove old patches. Signed-off-by: Tom Rix --- 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch | 47 - ...-Changes-to-compile-with-3.13-126033.patch | 222 ---- ...ectorization-on-windows-submodule-sl.patch | 910 ----------------- ...finding-and-using-the-rocm_version.h.patch | 142 --- 0001-Optionally-use-hipblaslt.patch | 506 ---------- 0001-Patch-for-sleef-3.6.patch | 952 ------------------ 0001-Reenable-dim-for-python-3.12.patch | 115 --- 0001-Regenerate-flatbuffer-header.patch | 39 - 0001-Stub-in-kineto-ActivityType.patch | 73 -- 0001-can-not-use-with-c-files.patch | 25 - 0001-cuda-hip-signatures.patch | 42 - 0001-disable-use-of-aotriton.patch | 94 -- ...le-dynamo-on-3.12-enable-most-dynamo.patch | 226 ----- ...lude-fmt-ranges.h-for-using-fmt-join.patch | 54 - 0001-no-third_party-FXdiv.patch | 54 - 0001-no-third_party-fmt.patch | 65 -- 0001-no-third_party-foxi.patch | 36 - 0001-python-torch-disable-ck.patch | 112 --- 0001-reenable-foxi-linking.patch | 25 - 0001-silence-an-assert.patch | 25 - 0001-torch-paper-over-c-assert.patch | 88 -- 0001-use-any-hip.patch | 34 - ...1-Add-cmake-option-USE_SYSTEM_FBGEMM.patch | 47 - .../0001-Add-cmake-variable-USE_ROCM_CK.patch | 149 +++ next/0001-Optionally-use-hipblaslt.patch | 506 ---------- next/0001-disable-use-of-aotriton.patch | 94 -- python-torch.spec | 11 +- 27 files changed, 154 insertions(+), 4539 deletions(-) delete mode 100644 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch delete mode 100644 0001-Changes-to-compile-with-3.13-126033.patch delete mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch delete mode 100644 
0001-Improve-finding-and-using-the-rocm_version.h.patch delete mode 100644 0001-Optionally-use-hipblaslt.patch delete mode 100644 0001-Patch-for-sleef-3.6.patch delete mode 100644 0001-Reenable-dim-for-python-3.12.patch delete mode 100644 0001-Regenerate-flatbuffer-header.patch delete mode 100644 0001-Stub-in-kineto-ActivityType.patch delete mode 100644 0001-can-not-use-with-c-files.patch delete mode 100644 0001-cuda-hip-signatures.patch delete mode 100644 0001-disable-use-of-aotriton.patch delete mode 100644 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch delete mode 100644 0001-include-fmt-ranges.h-for-using-fmt-join.patch delete mode 100644 0001-no-third_party-FXdiv.patch delete mode 100644 0001-no-third_party-fmt.patch delete mode 100644 0001-no-third_party-foxi.patch delete mode 100644 0001-python-torch-disable-ck.patch delete mode 100644 0001-reenable-foxi-linking.patch delete mode 100644 0001-silence-an-assert.patch delete mode 100644 0001-torch-paper-over-c-assert.patch delete mode 100644 0001-use-any-hip.patch delete mode 100644 next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch create mode 100644 next/0001-Add-cmake-variable-USE_ROCM_CK.patch delete mode 100644 next/0001-Optionally-use-hipblaslt.patch delete mode 100644 next/0001-disable-use-of-aotriton.patch diff --git a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch deleted file mode 100644 index 413c60d..0000000 --- a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 20 Jul 2024 05:37:15 -0600 -Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM - -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - cmake/Dependencies.cmake | 3 ++- - 2 files changed, 3 insertions(+), 1 deletion(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index c4cd4b2c2a98..2068f7c6c4f2 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ 
-253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF - "USE_CUDNN" OFF) - cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) - option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) -+option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) - option(USE_KINETO "Use Kineto profiling library" ON) - option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) - option(USE_FAKELOWP "Use FakeLowp operators" OFF) -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..192dac46f13b 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -706,6 +706,7 @@ endif() - - # ---[ FBGEMM - if(USE_FBGEMM) -+ if (NOT USE_SYSTEM_FBGEMM) - set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") - if(NOT DEFINED FBGEMM_SOURCE_DIR) - set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") -@@ -746,7 +747,7 @@ if(USE_FBGEMM) - target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) - endif() - endif() -- -+ endif() - if(USE_FBGEMM) - list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) - endif() --- -2.45.1 - diff --git a/0001-Changes-to-compile-with-3.13-126033.patch b/0001-Changes-to-compile-with-3.13-126033.patch deleted file mode 100644 index ddc0dcf..0000000 --- a/0001-Changes-to-compile-with-3.13-126033.patch +++ /dev/null @@ -1,222 +0,0 @@ -From 655a06444b261cb28e71a0973c0ab67aaa8261ab Mon Sep 17 00:00:00 2001 -From: albanD -Date: Tue, 14 May 2024 02:14:53 +0000 -Subject: [PATCH] Changes to compile with 3.13 (#126033) - -This is mainly: -- Fix refcount access macro -- Hide all the Dynamo code that needs update as usual -- Add _PyWeakref_ClearRef as an extern provided by CPython. Including the pycore header that defines it would require raw c include shenanigans that I don't think are worth it. -This allows to build both with regular and nogil version of cpython. 
Both - -Note that this requires the 3.13 branch at least past [d3094744d40de2deefbda9b1996d5029c9ebf0b0](https://github.com/python/cpython/commit/d3094744d40de2deefbda9b1996d5029c9ebf0b0) which we need for mimalloc include and weakref function being exposed. - -debug-only issues in pybind11 with PyMem_MALLOC vs PyObject_MALLOC being should be synced either by updating pybind or cpython. @colesbury I can send a PR to ifdef the proper use in pybind if you think that this is the best solution here? - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/126033 -Approved by: https://github.com/colesbury ---- - torch/csrc/Storage.cpp | 2 +- - torch/csrc/autograd/python_variable.cpp | 2 +- - torch/csrc/dynamo/cpython_defs.c | 15 +++++- - torch/csrc/dynamo/cpython_defs.h | 2 + - torch/csrc/dynamo/eval_frame.c | 67 ++++++++++++++++++------- - torch/csrc/utils/python_compat.h | 4 ++ - 6 files changed, 70 insertions(+), 22 deletions(-) - -diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp -index 93dbc9c09bb2..b22bbac35981 100644 ---- a/torch/csrc/Storage.cpp -+++ b/torch/csrc/Storage.cpp -@@ -236,7 +236,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { - if (type->tp_del) { - PyObject_GC_Track(self); - type->tp_del(self); -- if (self->ob_refcnt > 0) { -+ if (Py_REFCNT(self) > 0) { - // Resurrected (see above comment about resurrection from `__del__`) - return; - } -diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp -index 9e85f0026b35..8fd1129da63c 100644 ---- a/torch/csrc/autograd/python_variable.cpp -+++ b/torch/csrc/autograd/python_variable.cpp -@@ -1910,7 +1910,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { - if (type->tp_del) { - PyObject_GC_Track(self); - type->tp_del(self); -- if (self->ob_refcnt > 0) { -+ if (Py_REFCNT(self) > 0) { - /* Resurrected */ - return; - } -diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c -index 4a1dba63009a..5e0945a052ae 
100644 ---- a/torch/csrc/dynamo/cpython_defs.c -+++ b/torch/csrc/dynamo/cpython_defs.c -@@ -13,6 +13,17 @@ - } else { \ - } - -+#if IS_PYTHON_3_13_PLUS -+// Gave up after fixing a few of these -+// pycore_opcode.h is gone (new is pycore_opcode_metadata.h ?) -+// f_code is gone (new is f_executable?) -+ -+// Fake definitions for what we removed -+const uint8_t* THP_PyOpcode_Caches = NULL; -+const int THP_PyOpcode_Caches_size = 0; -+ -+#else -+ - // NOTE: all `assert`s below are converted to `CHECK`s - - #if IS_PYTHON_3_11_PLUS -@@ -29,8 +40,8 @@ - #define NEED_OPCODE_TABLES // To get _PyOpcode_Deopt - #include - #undef NEED_OPCODE_TABLES --#undef Py_BUILD_CORE - #include -+#undef Py_BUILD_CORE - - // As a simple way to reduce the impact of ABI changes on the CPython side, this check forces - // us to manually re-check that the function didn't change on the next major version -@@ -364,3 +375,5 @@ THP_PyFrame_Clear(_PyInterpreterFrame *frame) - } - - #endif -+ -+#endif // CPython 3.13 -\ No newline at end of file -diff --git a/torch/csrc/dynamo/cpython_defs.h b/torch/csrc/dynamo/cpython_defs.h -index a897c3e6c6e7..3b6c9667f8c9 100644 ---- a/torch/csrc/dynamo/cpython_defs.h -+++ b/torch/csrc/dynamo/cpython_defs.h -@@ -8,7 +8,9 @@ - - #if IS_PYTHON_3_11_PLUS - -+#define Py_BUILD_CORE - #include -+#undef Py_BUILD_CORE - - int THP_PyFrame_FastToLocalsWithError( - _PyInterpreterFrame* frame, -diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c -index c286e821f09d..e13cb5af2a0e 100644 ---- a/torch/csrc/dynamo/eval_frame.c -+++ b/torch/csrc/dynamo/eval_frame.c -@@ -8,6 +8,31 @@ - #include - #include - -+ -+ -+PyObject* guard_error_hook = NULL; -+const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; -+ -+static int active_dynamo_threads = 0; -+ -+static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; -+ -+inline static PyObject* eval_frame_callback_get(void) { -+ void* result = PyThread_tss_get(&eval_frame_callback_key); -+ if 
(unlikely(result == NULL)) { -+ return (PyObject*)Py_None; -+ } else { -+ return (PyObject*)result; -+ } -+} -+ -+inline static void eval_frame_callback_set(PyObject* obj) { -+ PyThread_tss_set(&eval_frame_callback_key, obj); -+} -+ -+// 3.13 Not supported at all. See cpython_defs.c for hints -+#if !(IS_PYTHON_3_13_PLUS) -+ - // Problem in CPython includes when mixing core and non-core build - // The fix was not backported to 3.12 so this is needed here - // https://github.com/python/cpython/issues/105268 -@@ -138,24 +163,6 @@ THP_PyFrame_FastToLocalsWithError(THP_EVAL_API_FRAME_OBJECT *frame, int *free_va - } - #endif - --PyObject* guard_error_hook = NULL; --const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; -- --static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; -- --inline static PyObject* eval_frame_callback_get(void) { -- void* result = PyThread_tss_get(&eval_frame_callback_key); -- if (unlikely(result == NULL)) { -- return (PyObject*)Py_None; -- } else { -- return (PyObject*)result; -- } --} -- --inline static void eval_frame_callback_set(PyObject* obj) { -- PyThread_tss_set(&eval_frame_callback_key, obj); --} -- - static PyObject* _custom_eval_frame_shim( - PyThreadState* tstate, - THP_EVAL_API_FRAME_OBJECT* frame, -@@ -627,7 +634,29 @@ static PyObject* _custom_eval_frame( - } - } - --static int active_dynamo_threads = 0; -+#else // IS_PYTHON_3_13_PLUS -+ -+// Fake definitions for everything we removed -+ -+typedef struct THPPyInterpreterFrame { -+ PyObject_HEAD -+ _PyInterpreterFrame* frame; // Borrowed reference -+} THPPyInterpreterFrame; -+ -+inline static void enable_eval_frame_shim(PyThreadState* tstate) {} -+inline static void enable_eval_frame_default(PyThreadState* tstate) {} -+ -+static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; -+ -+static PyTypeObject THPPyInterpreterFrameType = { -+ PyVarObject_HEAD_INIT(NULL, 0) -+ .tp_name = "torch._C.dynamo.eval_frame._PyInterpreterFrame", -+ .tp_basicsize = 
sizeof(THPPyInterpreterFrame), -+ .tp_flags = Py_TPFLAGS_DEFAULT, -+ .tp_getset = THPPyInterpreterFrame_properties, -+}; -+ -+#endif // CPython 3.13 - - static PyObject* increment_working_threads(PyThreadState* tstate) { - active_dynamo_threads = active_dynamo_threads + 1; -diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h -index 73b991cf3fbf..b060db00db73 100644 ---- a/torch/csrc/utils/python_compat.h -+++ b/torch/csrc/utils/python_compat.h -@@ -11,6 +11,7 @@ extern "C" { - - #define IS_PYTHON_3_11_PLUS PY_VERSION_HEX >= 0x030B00C1 - #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 -+#define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 - - PYCAPI_COMPAT_STATIC_INLINE(int) - PyCode_GetNCellvars(PyCodeObject* code) { -@@ -32,6 +33,9 @@ PyCode_GetNFreevars(PyCodeObject* code) { - #endif - } - -+// Provided by CPython but getting the header for them is very hard -+extern void _PyWeakref_ClearRef(PyWeakReference* self); -+ - #ifdef __cplusplus - } - #endif --- -2.45.1 - diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch deleted file mode 100644 index 562f55b..0000000 --- a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch +++ /dev/null @@ -1,910 +0,0 @@ -From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 -From: Xu Han -Date: Sun, 31 Mar 2024 03:07:32 +0000 -Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] - (#118980) - -Enable VEC on Windows OS. -1. Fix some type defination gap between Windows and Linux. -2. Fix some operator not support on Windows, such as [], /. -3. Enable static sleef library build on Windows. -4. Disable unsupported function overloading on MSVC. -5. Upgrade submodule sleef lib, which fixed build issue on Windows. -6. Fixed bazel build issues. -7. Fix test app not link to sleef on Windows. 
- -Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: -```cmd -git submodule sync -git submodule update --init --recursive -``` - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 -Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet ---- - aten/src/ATen/CMakeLists.txt | 48 ++++++-------- - aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- - .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- - .../cpu/vec/vec256/vec256_complex_double.h | 7 +- - .../cpu/vec/vec256/vec256_complex_float.h | 7 +- - aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- - aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- - aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- - aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- - .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- - .../cpu/vec/vec512/vec512_complex_double.h | 7 +- - .../cpu/vec/vec512/vec512_complex_float.h | 7 +- - aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- - aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- - aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- - aten/src/ATen/cpu/vec/vec_base.h | 6 ++ - caffe2/CMakeLists.txt | 2 +- - third_party/sleef.BUILD | 3 +- - 18 files changed, 194 insertions(+), 93 deletions(-) - -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index bf425af5fa9..58d5828e8ca 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") - list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) - endif() - --if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -- # Preserve values for the main build -- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) -- set(__aten_sleef_build_tests ${BUILD_TESTS}) -- -- # Unset our restrictive C++ flags here and reset them later. -- # Remove this once we use proper target_compile_options. 
-- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -- set(CMAKE_CXX_FLAGS) -- -- # Bump up optimization level for sleef to -O1, since at -O0 the compiler -- # excessively spills intermediate vector registers to the stack -- # and makes things run impossibly slowly -- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -- else() -- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -+if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+ if(NOT MSVC) -+ # Bump up optimization level for sleef to -O1, since at -O0 the compiler -+ # excessively spills intermediate vector registers to the stack -+ # and makes things run impossibly slowly -+ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -+ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -+ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+ else() -+ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -+ endif() - endif() - - if(NOT USE_SYSTEM_SLEEF) -- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) -+ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -+ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -+ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -+ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -+ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." 
FORCE) - if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") - set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) -@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) - endif() - list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) - -- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) -- -- # Set these back. TODO: Use SLEEF_ to pass these instead -- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) -- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) -+ if(NOT MSVC) -+ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+ endif() - endif() - - if(USE_CUDA AND NOT USE_ROCM) -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h -index 800b027e469..c431fa3c605 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h -@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { - } - - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline gather(const double* base_addr, const Vectorized& vindex) { -@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { - return _mm256_i32gather_ps(base_addr, vindex, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. 
- template - std::enable_if_t> - inline mask_gather(const Vectorized& src, const double* base_addr, -@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, - const Vectorized& vindex, Vectorized& mask) { - return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - // Only works for inputs in the range: [-2^51, 2^51] -@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { - return flip8(v); - } - --#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // (defined(CPU_CAPABILITY_AVX2) - - }} // namepsace at::vec::CPU_CAPABILITY -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -index 3e26213d6d2..66557436c70 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -@@ -7,7 +7,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -18,7 +19,18 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+ -+#ifndef SLEEF_CONST -+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -+#define SLEEF_CONST const -+#else -+#define SLEEF_CONST -+#endif -+#define SLEEF_CONST_OLD SLEEF_CONST -+#else -+#define SLEEF_CONST_OLD -+#endif - - // bfloat16 conversion - static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { -@@ -265,7 +277,8 @@ public: - } - return b; - } -- Vectorized map(const __m256 (*const vop)(__m256)) const { -+ -+ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { - __m256 lo, hi; - cvt_to_fp32(values, lo, hi); - const auto o1 = vop(lo); -@@ -1026,7 +1039,7 @@ inline 
Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_VECTORIZED_INIT(Half, half); - --#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX2) - - #define CONVERT_NON_VECTORIZED_INIT(type, name) \ - inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_NON_VECTORIZED_INIT(Half, half); - --#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX2) - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - #define LOAD_FP32_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - auto values = _mm_loadu_si128(reinterpret_cast(data)); \ -@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec - LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); - LOAD_FP32_VECTORIZED_INIT(Half, fp16); - --#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX2) - #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - __at_align__ float values[Vectorized::size()]; \ -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -index f93ea1e63c3..6c198fb37d3 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -@@ -8,7 +8,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY 
{ - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized> { - private: -@@ -145,7 +146,7 @@ public: - auto abs = abs_(); - auto zero = _mm256_setzero_pd(); - auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm256_div_pd(values, abs); - return _mm256_blendv_pd(div, zero, mask); - } - __m256d real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -index 7c142c04b79..c72d4d49274 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized> { - private: -@@ -180,7 +181,7 @@ public: - auto abs = abs_(); - auto zero = _mm256_setzero_ps(); - auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm256_div_ps(values, abs); - return _mm256_blendv_ps(div, zero, mask); - } - __m256 real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -index bc82d07edd1..bed6da627af 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace at::vec { - inline namespace CPU_CAPABILITY { - - --#if defined(CPU_CAPABILITY_AVX2) && 
!defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized { - private: -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -index 886809a0b8a..0e3664cd37b 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -14,7 +15,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized { - private: -@@ -226,14 +227,14 @@ public: - static __m256 vec_factorial_5 = - _mm256_set1_ps(0.00828929059f); // 1/factorial(5) - static __m256 vec_exp_log2ef = -- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) -+ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) - static __m256 vec_half = _mm256_set1_ps(0.5f); - static __m256 vec_one = _mm256_set1_ps(1.f); - static __m256 vec_zero = _mm256_set1_ps(0.f); - static __m256 vec_two = _mm256_set1_ps(2.f); -- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) -- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50); -- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); -+ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) -+ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); -+ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); - static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); - static int n_mantissa_bits = 23; - -@@ -266,7 +267,7 @@ public: - auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); - auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); - 
vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; -+ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); - vec_two_pow_n = - _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); - -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -index 4128841701a..85e099904cd 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -@@ -41,11 +41,17 @@ - namespace at::vec { - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - -+#ifdef _MSC_VER -+__declspec(align(64)) struct Vectorizedqi { -+ protected: -+ __m256i vals; -+#else - struct Vectorizedqi { - protected: - __m256i vals __attribute__((aligned(64))); -+#endif - - public: - Vectorizedqi() {} -@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { - } - - template --inline void __attribute__((always_inline)) QuantizeAvx2( -+__FORCE_INLINE void QuantizeAvx2( - const float* src, - T* dst, - int len, -@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V - return a.maximum(b); - } - --#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // if defined(CPU_CAPABILITY_AVX2) - }} // namespace at::vec::CPU_CAPABILITY -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h -index fe96d123e64..87f723d782c 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h -@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { - } - - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) - } - - // 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline gather(const double* base_addr, const Vectorized& vindex) { -@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { - return _mm512_i32gather_ps(vindex, base_addr, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline mask_gather(const Vectorized& src, const double* base_addr, -@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, - auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); - return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - template<> -@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { - return flip8(v); - } - --#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX512) - - }}} -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -index f9fc92d52bf..eb3b6a72240 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -@@ -7,7 +7,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,18 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+ -+#ifndef SLEEF_CONST -+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -+#define SLEEF_CONST const -+#else -+#define 
SLEEF_CONST -+#endif -+#define SLEEF_CONST_OLD SLEEF_CONST -+#else -+#define SLEEF_CONST_OLD -+#endif - - // bfloat16 conversion - static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { -@@ -362,7 +374,8 @@ public: - } - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wignored-qualifiers" -- Vectorized map(const __m512 (*const vop)(__m512)) const { -+ -+ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { - __m512 lo, hi; - cvt_to_fp32(values, lo, hi); - const auto o1 = vop(lo); -@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_VECTORIZED_INIT(Half, half); - --#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#else //defined(CPU_CAPABILITY_AVX512) - - #define CONVERT_NON_VECTORIZED_INIT(type, name) \ - inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_NON_VECTORIZED_INIT(Half, half); - --#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX512) - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - #define LOAD_FP32_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ -@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec - LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); - LOAD_FP32_VECTORIZED_INIT(Half, fp16); - --#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX512) - #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - __at_align__ float values[Vectorized::size()]; 
\ -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -index 02aa3a87cc1..c35204f9da2 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized> { - private: -@@ -203,7 +204,7 @@ public: - auto abs = abs_(); - auto zero = _mm512_setzero_pd(); - auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm512_div_pd(values, abs); - return _mm512_mask_blend_pd(mask, div, zero); - } - __m512d real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -index a5d790c98b2..2801e484d94 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized> { - private: -@@ -708,7 +709,7 @@ public: - auto abs = abs_(); - auto zero = _mm512_setzero_ps(); - auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm512_div_ps(values, abs); - return 
_mm512_mask_blend_ps(mask, div, zero); - } - __m512 real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -index 27b2753c903..508ab257e60 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) -+#if (defined(CPU_CAPABILITY_AVX512)) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized { - private: -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -index ba5738687fd..a08df3c141a 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized { - private: -@@ -246,14 +247,14 @@ public: - static __m512 vec_factorial_5 = - _mm512_set1_ps(0.00828929059f); // 1/factorial(5) - static __m512 vec_exp_log2ef = -- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) -+ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) - static __m512 vec_half = _mm512_set1_ps(0.5f); - static __m512 vec_one = _mm512_set1_ps(1.f); - static __m512 vec_zero = _mm512_set1_ps(0.f); - static __m512 vec_two = _mm512_set1_ps(2.f); -- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // 
ln(2) -- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); -- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); -+ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) -+ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); -+ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); - static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); - static int n_mantissa_bits = 23; - -@@ -288,7 +289,7 @@ public: - auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); - auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); - vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; -+ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); - vec_two_pow_n = - _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); - -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -index e0713d01312..a5671ed4a50 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -@@ -42,11 +42,17 @@ namespace at { - namespace vec { - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - -+#ifdef _MSC_VER -+__declspec(align(64)) struct Vectorizedqi { -+ protected: -+ __m512i vals; -+#else - struct Vectorizedqi { - protected: - __m512i vals __attribute__((aligned(64))); -+#endif - - public: - Vectorizedqi() {} -@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { - } - - template --inline void __attribute__((always_inline)) QuantizeAvx512( -+__FORCE_INLINE void QuantizeAvx512( - const float* src, - T* dst, - int len, -@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { - Vectorized scale, - Vectorized zero_point, - Vectorized scale_neg_zp_premul) const { -+ #if 
defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { - float_vec_return_type dequantize( - Vectorized scale, - Vectorized zero_point) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { - } - - int_vec_return_type widening_subtract(Vectorized b) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], 
vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512i int32_val0 = cvtepi8_epi32(int_val0); - __m512i int32_val1 = cvtepi8_epi32(int_val1); - __m512i int32_val2 = cvtepi8_epi32(int_val2); - __m512i int32_val3 = cvtepi8_epi32(int_val3); - -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -+ #else - __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); - __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); - __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); - __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -+ #endif - - __m512i int32_b0 = cvtepi8_epi32(int_b0); - __m512i int32_b1 = cvtepi8_epi32(int_b1); -@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { - Vectorized scale, - Vectorized zero_point, - Vectorized scale_zp_premul) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = 
_mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -@@ -746,10 +787,17 @@ struct Vectorized : public Vectorizedqi { - float_vec_return_type dequantize( - Vectorized scale, - Vectorized zero_point) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { - } - - int_vec_return_type widening_subtract(Vectorized b) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512i int32_val0 = cvtepu8_epi32(int_val0); - __m512i int32_val1 = cvtepu8_epi32(int_val1); - __m512i int32_val2 = cvtepu8_epi32(int_val2); - __m512i int32_val3 = cvtepu8_epi32(int_val3); - -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_b0 = 
_mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -+ #else - __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); - __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); - __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); - __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -+ #endif - - __m512i int32_b0 = cvtepu8_epi32(int_b0); - __m512i int32_b1 = cvtepu8_epi32(int_b1); -diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h -index adf81dd915c..20cb8ef6dbc 100644 ---- a/aten/src/ATen/cpu/vec/vec_base.h -+++ b/aten/src/ATen/cpu/vec/vec_base.h -@@ -36,6 +36,12 @@ - #include - #include - -+#if defined(__GNUC__) -+#define __FORCE_INLINE __attribute__((always_inline)) inline -+#elif defined(_MSC_VER) -+#define __FORCE_INLINE __forceinline -+#endif -+ - // These macros helped us unify vec_base.h - #ifdef CPU_CAPABILITY_AVX512 - #if defined(__GNUC__) -diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index a6b6f0f7d1d..15d37cf4861 100644 ---- a/caffe2/CMakeLists.txt -+++ b/caffe2/CMakeLists.txt -@@ -1787,7 +1787,7 @@ if(BUILD_TEST) - endif() - else() - add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") -- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) -+ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) - endif() - target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) - target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) -diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD -index 573f9c5b54a..f22a6e905e2 100644 ---- a/third_party/sleef.BUILD -+++ b/third_party/sleef.BUILD -@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ - SLEEF_PRIVATE_INCLUDES = [ - 
"-Iexternal/sleef/src/arch", - "-Iexternal/sleef/src/common", -+ "-Iexternal/sleef/src/libm", - ] - - SLEEF_PUBLIC_INCLUDES = [ -@@ -201,8 +202,6 @@ cc_library( - srcs = [ - "src/libm/rempitab.c", - "src/libm/sleefdp.c", -- "src/libm/sleefld.c", -- "src/libm/sleefqp.c", - "src/libm/sleefsp.c", - ], - hdrs = SLEEF_PUBLIC_HEADERS, --- -2.45.1 - diff --git a/0001-Improve-finding-and-using-the-rocm_version.h.patch b/0001-Improve-finding-and-using-the-rocm_version.h.patch deleted file mode 100644 index b8232c7..0000000 --- a/0001-Improve-finding-and-using-the-rocm_version.h.patch +++ /dev/null @@ -1,142 +0,0 @@ -From 201ac4618a1526e048a0d6c02d9bc4cf30bf0ee1 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Wed, 14 Aug 2024 17:18:38 -0700 -Subject: [PATCH] Improve finding and using the rocm_version.h - -On Fedora, the rocm_version.h's path is /usr/include/rocm_version.h -So we have this build error -pytorch/aten/src/ATen/hip/tunable/Tunable.cpp:40:10: fatal error: - rocm-core/rocm_version.h: No such file or directory - 40 | #include - | ^~~~~~~~~~~~~~~~~~~~~~~~~~ - -In other cases, depending on the rocm release either -/opt/rocm/include or /opt/rocm/include/rocm-core - -Convert the EXISTS() checks into a find_path. 
-Add a -I${ROCM_VERSION_DIR} to the compile options so it can be -found by Tunable.cpp - -Signed-off-by: Tom Rix ---- - aten/src/ATen/cuda/tunable/Tunable.cpp | 2 +- - cmake/Dependencies.cmake | 1 + - cmake/public/LoadHIP.cmake | 72 ++++++++++---------------- - 3 files changed, 30 insertions(+), 45 deletions(-) - -diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp -index 1b7c89875855..32c1d70f3152 100644 ---- a/aten/src/ATen/cuda/tunable/Tunable.cpp -+++ b/aten/src/ATen/cuda/tunable/Tunable.cpp -@@ -36,7 +36,7 @@ - - // for validators - #ifdef USE_ROCM --#include -+#include - #define ROCBLAS_BETA_FEATURES_API - #include - #include -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 7ef8eabb5162..61bc4d7a54b6 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1063,6 +1063,7 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - list(APPEND HIP_CXX_FLAGS -std=c++17) - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) -+ list(APPEND HIP_CXX_FLAGS -I${ROCM_VERSION_DIR}) - if(HIP_NEW_TYPE_ENUMS) - list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) - endif() -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1c0d3a203991..6a7e3bd163f5 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -42,55 +42,39 @@ find_package_and_print_version(HIP 1.0) - - if(HIP_FOUND) - set(PYTORCH_FOUND_HIP TRUE) -- set(FOUND_ROCM_VERSION_H FALSE) -- - set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}") -- set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") - - # Find ROCM version for checks - # ROCM 5.0 and later will have header api for version management -- if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h) -- set(FOUND_ROCM_VERSION_H TRUE) -- file(WRITE ${file} "" -- "#include \n" -- ) -- elseif(EXISTS ${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h) -- set(FOUND_ROCM_VERSION_H TRUE) -- file(WRITE ${file} "" -- "#include \n" 
-- ) -- else() -- message("********************* rocm_version.h couldnt be found ******************\n") -- endif() -- -- if(FOUND_ROCM_VERSION_H) -- file(APPEND ${file} "" -- "#include \n" -- -- "#ifndef ROCM_VERSION_PATCH\n" -- "#define ROCM_VERSION_PATCH 0\n" -- "#endif\n" -- "#define STRINGIFYHELPER(x) #x\n" -- "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" -- "int main() {\n" -- " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" -- " return 0;\n" -- "}\n" -- ) -- -- try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} -- CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}" -- RUN_OUTPUT_VARIABLE rocm_version_from_header -- COMPILE_OUTPUT_VARIABLE output_var -- ) -- # We expect the compile to be successful if the include directory exists. -- if(NOT compile_result) -- message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) -- endif() -- message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) -- set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) -- message("\n***** ROCm version from rocm_version.h ****\n") -+ find_path(ROCM_VERSION_DIR rocm_version.h HINTS ${ROCM_INCLUDE_DIRS} ${ROCM_INCLUDE_DIRS}/rocm-core) -+ set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") -+ file(WRITE ${file} "" -+ "#include \n" -+ "#include \n" -+ -+ "#ifndef ROCM_VERSION_PATCH\n" -+ "#define ROCM_VERSION_PATCH 0\n" -+ "#endif\n" -+ "#define STRINGIFYHELPER(x) #x\n" -+ "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" -+ "int main() {\n" -+ " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" -+ " return 0;\n" -+ "}\n" -+ ) -+ -+ try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} -+ CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_VERSION_DIR}" -+ RUN_OUTPUT_VARIABLE rocm_version_from_header -+ COMPILE_OUTPUT_VARIABLE output_var -+ ) -+ # We expect the compile to be successful if the include directory exists. 
-+ if(NOT compile_result) -+ message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) - endif() -+ message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) -+ set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) -+ message("\n***** ROCm version from rocm_version.h ****\n") - - string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+).*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) - --- -2.46.0 - diff --git a/0001-Optionally-use-hipblaslt.patch b/0001-Optionally-use-hipblaslt.patch deleted file mode 100644 index 1e5ca4b..0000000 --- a/0001-Optionally-use-hipblaslt.patch +++ /dev/null @@ -1,506 +0,0 @@ -From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 04:18:34 -0700 -Subject: [PATCH] Optionally use hipblaslt - ---- - aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ - aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ - aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- - aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- - aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- - cmake/Dependencies.cmake | 3 ++ - cmake/public/LoadHIP.cmake | 2 +- - 7 files changed, 82 insertions(+), 19 deletions(-) - -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index ce991a9bcad4..3f0d17b52778 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -14,7 +14,9 @@ - #include - - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - // until hipblas has an API to accept flags, we must use rocblas here - #include - #include -@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { - static size_t _parseChosenWorkspaceSize() { - const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); - #ifdef USE_ROCM -+#ifndef USE_HIPBLASLT -+ return 0; -+#endif - if (!val) { - // accept either env var - val = getenv("HIPBLASLT_WORKSPACE_SIZE"); -@@ -235,6 +240,7 @@ namespace at::cuda::blas { - } while (0) - - 
-+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - namespace { - // Following the pattern of CuSparseDescriptor - // Defined here for now because this is the only place cublas_lt interface is -@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< - }; - } // namespace - -- - template - inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - cudaDataType_t abcType = CUDA_R_32F; -@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - " scaleType ", - scaleType); - } -- -+#endif - - template - inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { -@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); - } - } -@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); - } - } -@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - 
bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } - } -@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { - } - } - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { - // forward to bgemm implementation but set strides and batches to 0 - bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); - } -+#endif - - template - inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { -@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); - } - } -@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); - } - } -@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } - } -@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) 
{ - } - } - -- -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - void gemm_and_bias( - bool transpose_mat1, -@@ -1410,7 +1435,7 @@ void scaled_gemm( - ScalarType result_dtype, - void* amax_ptr, - bool use_fast_accum) { --#if CUDA_VERSION >= 11080 || defined(USE_ROCM) -+#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - const auto computeType = CUBLAS_COMPUTE_32F; - const auto scaleType = CUDA_R_32F; - const int8_t fastAccuMode = use_fast_accum ? 1 : 0; -@@ -1681,6 +1706,7 @@ void int8_gemm( - " scaleType ", - scaleType); - } -+#endif - - template <> - void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { -diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h -index f2b657ced51b..f0ee613c4208 100644 ---- a/aten/src/ATen/cuda/CUDAContextLight.h -+++ b/aten/src/ATen/cuda/CUDAContextLight.h -@@ -9,7 +9,9 @@ - - // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also - // added bf16 support -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - #include -+#endif - - #ifdef CUDART_VERSION - #include -@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); - /* Handles */ - TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); - TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); -+#endif - - TORCH_CUDA_CPP_API void clearCublasWorkspaces(); - -diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index 8eac525b3695..abfdf7a23847 100644 ---- a/aten/src/ATen/cuda/CublasHandlePool.cpp -+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp -@@ -29,7 +29,7 @@ namespace at::cuda { - - namespace { - --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - void createCublasLtHandle(cublasLtHandle_t *handle) { - 
TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); - } -@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { - return handle; - } - --cublasLtHandle_t getCurrentCUDABlasLtHandle() { - #ifdef USE_ROCM -+#if defined(USE_HIPBLASLT) -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - c10::DeviceIndex device = 0; - AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); - -@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { - - auto handle = myPoolWindow->reserve(device); - return handle; -+} -+#endif - #else -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - return reinterpret_cast(getCurrentCUDABlasHandle()); --#endif - } -+#endif - - } // namespace at::cuda -diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h -index 53e6154120c9..fa1d664696db 100644 ---- a/aten/src/ATen/cuda/tunable/TunableGemm.h -+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h -@@ -11,7 +11,9 @@ - - #include - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - #include - #endif - #include -@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class DefaultScaledGemmOp : public Callable> { - public: -@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { - return OK; - } - }; -+#endif - - template - inline bool IsZero(T v) { -@@ -191,6 +195,7 @@ static void AddRocblasValidator() { - } - } - -+#ifdef USE_HIPBLASLT - static void AddHipblasltValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - if (validators.find("HIPBLASLT_VERSION") == validators.end()) { -@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { - [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? 
OK : FAIL; }); - } - } -+#endif - - static void AddRocmValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); -@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class ScaledGemmTunableOp : public TunableOp, StreamTimer> { - public: -@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - - #if defined(USE_ROCM) -+#ifdef USE_HIPBLASLT - for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { - this->RegisterOp(std::move(name), std::move(op)); - } - AddHipblasltValidator(); -+#endif - AddRocmValidator(); - #endif - } -@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); - } - }; -+#endif - - #undef XSTRINGIFY - #undef STRINGIFY 
-diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 84c59a4fd0d7..56ad5de3bf2d 100644 ---- a/aten/src/ATen/native/cuda/Blas.cpp -+++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa - } - - static bool getDisableAddmmCudaLt() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); - #ifdef USE_ROCM - // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) -@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { - } - return false; - #endif -+#else -+ return true; -+#endif - } - - #ifdef USE_ROCM - static bool isSupportedHipLtROCmArch(int index) { -+#ifdef USE_HIPBLASLT - hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); - std::string device_arch = prop->gcnArchName; - static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { - } - } - TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); -+#endif - return false; - } - #endif -@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - at::ScalarType scalar_type = self.scalar_type(); - c10::MaybeOwned self_; - if (&result != &self) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) - // Strangely, if mat2 has only 1 row or column, we get - // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. 
-@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - scalar_type != at::ScalarType::BFloat16)); - #endif - } -+#endif - #endif - if (!useLtInterface) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - } - self__sizes = self_->sizes(); - } else { --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - useLtInterface = !disable_addmm_cuda_lt && - result.dim() == 2 && result.is_contiguous() && - isSupportedHipLtROCmArch(self.device().index()) && -@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); - - if (useLtInterface) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if defined(USE_ROCM) - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, -@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - activation_epilogue - ); - }); -+#endif - #endif - } else - { -@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { - } - - static bool _scaled_mm_allowed_device() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - auto dprops = at::cuda::getCurrentDeviceProperties(); - #ifdef USE_ROCM - std::string device_arch = dprops->gcnArchName; -@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { - #else - return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); - #endif -+#else -+ return false; -+#endif - } - - // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax -@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - // Check sizes - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); -+#if 
!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); - TORCH_CHECK( -@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 - // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately - amax = at::max(at::abs(out.to(kFloat))); -+#endif - #endif - - return {out, amax}; -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..8d05e834bbc5 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1052,6 +1052,9 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - list(APPEND HIP_CXX_FLAGS -std=c++17) - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) -+ if(hipblast_FOUND) -+ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) -+ endif() - if(HIP_NEW_TYPE_ENUMS) - list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) - endif() -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index fa39156031ff..df4836847fdf 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -155,7 +155,7 @@ if(HIP_FOUND) - find_package_and_print_version(hiprand REQUIRED) - find_package_and_print_version(rocblas REQUIRED) - find_package_and_print_version(hipblas REQUIRED) -- find_package_and_print_version(hipblaslt REQUIRED) -+ find_package_and_print_version(hipblaslt) - find_package_and_print_version(miopen REQUIRED) - find_package_and_print_version(hipfft REQUIRED) - find_package_and_print_version(hipsparse REQUIRED) --- -2.45.2 - diff --git a/0001-Patch-for-sleef-3.6.patch b/0001-Patch-for-sleef-3.6.patch deleted file mode 100644 index 13aa208..0000000 --- a/0001-Patch-for-sleef-3.6.patch +++ /dev/null @@ -1,952 +0,0 @@ -From 273f23698c887b52e66c2abec8101b7398f0f9c4 Mon Sep 17 00:00:00 2001 -From: "Benjamin A. 
Beasley" -Date: Wed, 5 Jun 2024 11:06:02 -0400 -Subject: [PATCH] Patch for sleef 3.6 - ---- - ...ectorization-on-windows-submodule-sl.patch | 910 ++++++++++++++++++ - python-torch.spec | 11 + - 2 files changed, 921 insertions(+) - create mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch - -diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch -new file mode 100644 -index 000000000000..562f55b742c2 ---- /dev/null -+++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch -@@ -0,0 +1,910 @@ -+From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 -+From: Xu Han -+Date: Sun, 31 Mar 2024 03:07:32 +0000 -+Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] -+ (#118980) -+ -+Enable VEC on Windows OS. -+1. Fix some type defination gap between Windows and Linux. -+2. Fix some operator not support on Windows, such as [], /. -+3. Enable static sleef library build on Windows. -+4. Disable unsupported function overloading on MSVC. -+5. Upgrade submodule sleef lib, which fixed build issue on Windows. -+6. Fixed bazel build issues. -+7. Fix test app not link to sleef on Windows. 
-+ -+Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: -+```cmd -+git submodule sync -+git submodule update --init --recursive -+``` -+ -+Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 -+Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet -+--- -+ aten/src/ATen/CMakeLists.txt | 48 ++++++-------- -+ aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- -+ .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- -+ .../cpu/vec/vec256/vec256_complex_double.h | 7 +- -+ .../cpu/vec/vec256/vec256_complex_float.h | 7 +- -+ aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- -+ aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- -+ aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- -+ aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- -+ .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- -+ .../cpu/vec/vec512/vec512_complex_double.h | 7 +- -+ .../cpu/vec/vec512/vec512_complex_float.h | 7 +- -+ aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- -+ aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- -+ aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- -+ aten/src/ATen/cpu/vec/vec_base.h | 6 ++ -+ caffe2/CMakeLists.txt | 2 +- -+ third_party/sleef.BUILD | 3 +- -+ 18 files changed, 194 insertions(+), 93 deletions(-) -+ -+diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -+index bf425af5fa9..58d5828e8ca 100644 -+--- a/aten/src/ATen/CMakeLists.txt -++++ b/aten/src/ATen/CMakeLists.txt -+@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") -+ list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) -+ endif() -+ -+-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+- # Preserve values for the main build -+- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) -+- set(__aten_sleef_build_tests ${BUILD_TESTS}) -+- -+- # Unset our restrictive C++ flags here and reset them later. 
-+- # Remove this once we use proper target_compile_options. -+- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -+- set(CMAKE_CXX_FLAGS) -+- -+- # Bump up optimization level for sleef to -O1, since at -O0 the compiler -+- # excessively spills intermediate vector registers to the stack -+- # and makes things run impossibly slowly -+- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -+- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -+- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+- else() -+- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -++ if(NOT MSVC) -++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler -++ # excessively spills intermediate vector registers to the stack -++ # and makes things run impossibly slowly -++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -++ else() -++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -++ endif() -+ endif() -+ -+ if(NOT USE_SYSTEM_SLEEF) -+- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -+- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -+- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -+- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -+- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) -++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -++ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." 
FORCE) -+ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") -+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") -+ set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) -+@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+ endif() -+ list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) -+ -+- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) -+- -+- # Set these back. TODO: Use SLEEF_ to pass these instead -+- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) -+- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) -++ if(NOT MSVC) -++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -++ endif() -+ endif() -+ -+ if(USE_CUDA AND NOT USE_ROCM) -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h -+index 800b027e469..c431fa3c605 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h -+@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { -+ } -+ -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) -+ } -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. 
-+ template -+ std::enable_if_t> -+ inline gather(const double* base_addr, const Vectorized& vindex) { -+@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { -+ return _mm256_i32gather_ps(base_addr, vindex, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline mask_gather(const Vectorized& src, const double* base_addr, -+@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, -+ const Vectorized& vindex, Vectorized& mask) { -+ return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+ // Only works for inputs in the range: [-2^51, 2^51] -+@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { -+ return flip8(v); -+ } -+ -+-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // (defined(CPU_CAPABILITY_AVX2) -+ -+ }} // namepsace at::vec::CPU_CAPABILITY -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+index 3e26213d6d2..66557436c70 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -18,7 +19,18 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++ -++#ifndef SLEEF_CONST -++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -++#define SLEEF_CONST const -++#else -++#define SLEEF_CONST -++#endif -++#define SLEEF_CONST_OLD 
SLEEF_CONST -++#else -++#define SLEEF_CONST_OLD -++#endif -+ -+ // bfloat16 conversion -+ static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { -+@@ -265,7 +277,8 @@ public: -+ } -+ return b; -+ } -+- Vectorized map(const __m256 (*const vop)(__m256)) const { -++ -++ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { -+ __m256 lo, hi; -+ cvt_to_fp32(values, lo, hi); -+ const auto o1 = vop(lo); -+@@ -1026,7 +1039,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_VECTORIZED_INIT(Half, half); -+ -+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX2) -+ -+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ -+ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -+@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_NON_VECTORIZED_INIT(Half, half); -+ -+-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX2) -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ auto values = _mm_loadu_si128(reinterpret_cast(data)); \ -+@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec -+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); -+ LOAD_FP32_VECTORIZED_INIT(Half, fp16); -+ -+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX2) -+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ __at_align__ float values[Vectorized::size()]; \ -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h 
b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+index f93ea1e63c3..6c198fb37d3 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+@@ -8,7 +8,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized> { -+ private: -+@@ -145,7 +146,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm256_setzero_pd(); -+ auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm256_div_pd(values, abs); -+ return _mm256_blendv_pd(div, zero, mask); -+ } -+ __m256d real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+index 7c142c04b79..c72d4d49274 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized> { -+ private: -+@@ -180,7 +181,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm256_setzero_ps(); -+ auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm256_div_ps(values, abs); -+ return _mm256_blendv_ps(div, zero, mask); -+ } -+ __m256 real_() 
const { -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+index bc82d07edd1..bed6da627af 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace at::vec { -+ inline namespace CPU_CAPABILITY { -+ -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized { -+ private: -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+index 886809a0b8a..0e3664cd37b 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -14,7 +15,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized { -+ private: -+@@ -226,14 +227,14 @@ public: -+ static __m256 vec_factorial_5 = -+ _mm256_set1_ps(0.00828929059f); // 1/factorial(5) -+ static __m256 vec_exp_log2ef = -+- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) -++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) -+ static __m256 vec_half = _mm256_set1_ps(0.5f); -+ static __m256 vec_one = _mm256_set1_ps(1.f); -+ static __m256 vec_zero = _mm256_set1_ps(0.f); -+ static __m256 vec_two = _mm256_set1_ps(2.f); -+- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) -+- static __m256 vec_ln_flt_min = 
(__m256)_mm256_set1_epi32(0xc2aeac50); -+- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); -++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) -++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); -++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); -+ static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); -+ static int n_mantissa_bits = 23; -+ -+@@ -266,7 +267,7 @@ public: -+ auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); -+ auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); -+ vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -+- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; -++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); -+ vec_two_pow_n = -+ _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); -+ -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+index 4128841701a..85e099904cd 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+@@ -41,11 +41,17 @@ -+ namespace at::vec { -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -++#ifdef _MSC_VER -++__declspec(align(64)) struct Vectorizedqi { -++ protected: -++ __m256i vals; -++#else -+ struct Vectorizedqi { -+ protected: -+ __m256i vals __attribute__((aligned(64))); -++#endif -+ -+ public: -+ Vectorizedqi() {} -+@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { -+ } -+ -+ template -+-inline void __attribute__((always_inline)) QuantizeAvx2( -++__FORCE_INLINE void QuantizeAvx2( -+ const float* src, -+ T* dst, -+ int len, -+@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V -+ return a.maximum(b); -+ } -+ -+-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) 
-++#endif // if defined(CPU_CAPABILITY_AVX2) -+ }} // namespace at::vec::CPU_CAPABILITY -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h -+index fe96d123e64..87f723d782c 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h -+@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { -+ } -+ -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) -+ } -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline gather(const double* base_addr, const Vectorized& vindex) { -+@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { -+ return _mm512_i32gather_ps(vindex, base_addr, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. 
-+ template -+ std::enable_if_t> -+ inline mask_gather(const Vectorized& src, const double* base_addr, -+@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, -+ auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); -+ return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+ template<> -+@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { -+ return flip8(v); -+ } -+ -+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX512) -+ -+ }}} -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+index f9fc92d52bf..eb3b6a72240 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,18 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++ -++#ifndef SLEEF_CONST -++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -++#define SLEEF_CONST const -++#else -++#define SLEEF_CONST -++#endif -++#define SLEEF_CONST_OLD SLEEF_CONST -++#else -++#define SLEEF_CONST_OLD -++#endif -+ -+ // bfloat16 conversion -+ static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { -+@@ -362,7 +374,8 @@ public: -+ } -+ #pragma clang diagnostic push -+ #pragma clang diagnostic ignored "-Wignored-qualifiers" -+- Vectorized map(const __m512 (*const vop)(__m512)) const { -++ -++ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { -+ __m512 lo, hi; -+ 
cvt_to_fp32(values, lo, hi); -+ const auto o1 = vop(lo); -+@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_VECTORIZED_INIT(Half, half); -+ -+-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#else //defined(CPU_CAPABILITY_AVX512) -+ -+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ -+ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -+@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_NON_VECTORIZED_INIT(Half, half); -+ -+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX512) -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ -+@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec -+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); -+ LOAD_FP32_VECTORIZED_INIT(Half, fp16); -+ -+-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX512) -+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ __at_align__ float values[Vectorized::size()]; \ -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+index 02aa3a87cc1..c35204f9da2 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) 
-++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized> { -+ private: -+@@ -203,7 +204,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm512_setzero_pd(); -+ auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm512_div_pd(values, abs); -+ return _mm512_mask_blend_pd(mask, div, zero); -+ } -+ __m512d real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+index a5d790c98b2..2801e484d94 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized> { -+ private: -+@@ -708,7 +709,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm512_setzero_ps(); -+ auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm512_div_ps(values, abs); -+ return _mm512_mask_blend_ps(mask, div, zero); -+ } -+ __m512 real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+index 27b2753c903..508ab257e60 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if 
(defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) -++#if (defined(CPU_CAPABILITY_AVX512)) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized { -+ private: -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+index ba5738687fd..a08df3c141a 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized { -+ private: -+@@ -246,14 +247,14 @@ public: -+ static __m512 vec_factorial_5 = -+ _mm512_set1_ps(0.00828929059f); // 1/factorial(5) -+ static __m512 vec_exp_log2ef = -+- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) -++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) -+ static __m512 vec_half = _mm512_set1_ps(0.5f); -+ static __m512 vec_one = _mm512_set1_ps(1.f); -+ static __m512 vec_zero = _mm512_set1_ps(0.f); -+ static __m512 vec_two = _mm512_set1_ps(2.f); -+- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) -+- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); -+- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); -++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) -++ static __m512 vec_ln_flt_min = 
_mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); -++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); -+ static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); -+ static int n_mantissa_bits = 23; -+ -+@@ -288,7 +289,7 @@ public: -+ auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); -+ auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); -+ vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -+- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; -++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); -+ vec_two_pow_n = -+ _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); -+ -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+index e0713d01312..a5671ed4a50 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+@@ -42,11 +42,17 @@ namespace at { -+ namespace vec { -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -++#ifdef _MSC_VER -++__declspec(align(64)) struct Vectorizedqi { -++ protected: -++ __m512i vals; -++#else -+ struct Vectorizedqi { -+ protected: -+ __m512i vals __attribute__((aligned(64))); -++#endif -+ -+ public: -+ Vectorizedqi() {} -+@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { -+ } -+ -+ template -+-inline void __attribute__((always_inline)) QuantizeAvx512( -++__FORCE_INLINE void QuantizeAvx512( -+ const float* src, -+ T* dst, -+ int len, -+@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { -+ Vectorized scale, -+ Vectorized zero_point, -+ Vectorized scale_neg_zp_premul) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = 
_mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -+@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { -+ float_vec_return_type dequantize( -+ Vectorized scale, -+ Vectorized zero_point) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -+@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { -+ } -+ -+ int_vec_return_type widening_subtract(Vectorized b) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = 
_mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512i int32_val0 = cvtepi8_epi32(int_val0); -+ __m512i int32_val1 = cvtepi8_epi32(int_val1); -+ __m512i int32_val2 = cvtepi8_epi32(int_val2); -+ __m512i int32_val3 = cvtepi8_epi32(int_val3); -+ -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -++ #else -+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -++ #endif -+ -+ __m512i int32_b0 = cvtepi8_epi32(int_b0); -+ __m512i int32_b1 = cvtepi8_epi32(int_b1); -+@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { -+ Vectorized scale, -+ Vectorized zero_point, -+ Vectorized scale_zp_premul) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -+@@ -746,10 +787,17 @@ struct 
Vectorized : public Vectorizedqi { -+ float_vec_return_type dequantize( -+ Vectorized scale, -+ Vectorized zero_point) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -+@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { -+ } -+ -+ int_vec_return_type widening_subtract(Vectorized b) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512i int32_val0 = cvtepu8_epi32(int_val0); -+ __m512i int32_val1 = cvtepu8_epi32(int_val1); -+ __m512i int32_val2 = cvtepu8_epi32(int_val2); -+ __m512i int32_val3 = cvtepu8_epi32(int_val3); -+ -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], 
b.vals.m512i_u64[2]); -++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -++ #else -+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -++ #endif -+ -+ __m512i int32_b0 = cvtepu8_epi32(int_b0); -+ __m512i int32_b1 = cvtepu8_epi32(int_b1); -+diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h -+index adf81dd915c..20cb8ef6dbc 100644 -+--- a/aten/src/ATen/cpu/vec/vec_base.h -++++ b/aten/src/ATen/cpu/vec/vec_base.h -+@@ -36,6 +36,12 @@ -+ #include -+ #include -+ -++#if defined(__GNUC__) -++#define __FORCE_INLINE __attribute__((always_inline)) inline -++#elif defined(_MSC_VER) -++#define __FORCE_INLINE __forceinline -++#endif -++ -+ // These macros helped us unify vec_base.h -+ #ifdef CPU_CAPABILITY_AVX512 -+ #if defined(__GNUC__) -+diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -+index a6b6f0f7d1d..15d37cf4861 100644 -+--- a/caffe2/CMakeLists.txt -++++ b/caffe2/CMakeLists.txt -+@@ -1787,7 +1787,7 @@ if(BUILD_TEST) -+ endif() -+ else() -+ add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") -+- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) -++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) -+ endif() -+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) -+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) -+diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD -+index 573f9c5b54a..f22a6e905e2 100644 -+--- a/third_party/sleef.BUILD -++++ b/third_party/sleef.BUILD -+@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ -+ SLEEF_PRIVATE_INCLUDES = [ -+ "-Iexternal/sleef/src/arch", -+ "-Iexternal/sleef/src/common", -++ 
"-Iexternal/sleef/src/libm", -+ ] -+ -+ SLEEF_PUBLIC_INCLUDES = [ -+@@ -201,8 +202,6 @@ cc_library( -+ srcs = [ -+ "src/libm/rempitab.c", -+ "src/libm/sleefdp.c", -+- "src/libm/sleefld.c", -+- "src/libm/sleefqp.c", -+ "src/libm/sleefsp.c", -+ ], -+ hdrs = SLEEF_PUBLIC_HEADERS, -+-- -+2.45.1 -+ -diff --git a/python-torch.spec b/python-torch.spec -index d50687a5174a..63600c2e8c39 100644 ---- a/python-torch.spec -+++ b/python-torch.spec -@@ -176,6 +176,17 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch - Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch - %endif - -+# Enable x86 CPU vectorization on windows [submodule sleef] (#118980) -+# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370 -+# -+# Despite the title, this patch fixes compatibility with sleef 3.6 by including -+# a backwards-compatible version of the fix from -+# https://github.com/pytorch/pytorch/pull/122723. -+# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef -+# git submodule (because the release archive contains an actual sleef source -+# tree instead, so this would not apply.) -+Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch -+ - %if %{with rocm} - # ROCm patches - # https://github.com/pytorch/pytorch/pull/120551 --- -2.45.1 - diff --git a/0001-Reenable-dim-for-python-3.12.patch b/0001-Reenable-dim-for-python-3.12.patch deleted file mode 100644 index 138b5d4..0000000 --- a/0001-Reenable-dim-for-python-3.12.patch +++ /dev/null @@ -1,115 +0,0 @@ -From ee3fb343a376cdba6f4ce188cac90023f13e2aea Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 4 Apr 2024 14:21:38 -0600 -Subject: [PATCH] Reenable dim for python 3.12 - -In 3.12: - -_PyArg_Parser added an element to the start of the structure. -So existing positional initialization is off. Switch to element -initialization. 
- -_Py_CODEUNIT changed to from an int to a union, but relevant_op -is passed an int for the return of decoder.opcode, so the parameter -type is wrong, switch it to int. - -The opcode PRECALL was removed, so reduce its handling to 3.11 - -Signed-off-by: Tom Rix ---- - functorch/csrc/dim/dim.cpp | 24 +++++------------------- - functorch/csrc/dim/minpybind.h | 4 ++-- - 2 files changed, 7 insertions(+), 21 deletions(-) - -diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp -index 4cc027504c77..e48b0d58081f 100644 ---- a/functorch/csrc/dim/dim.cpp -+++ b/functorch/csrc/dim/dim.cpp -@@ -6,20 +6,6 @@ - - #include - -- --// Many APIs have changed/don't exist anymore --#if IS_PYTHON_3_12_PLUS -- --#include "dim.h" -- --// Re-enable this some day --PyObject* Dim_init() { -- PyErr_SetString(PyExc_RuntimeError, "First class dim doesn't work with python 3.12"); -- return nullptr; --} -- --#else -- - #include "minpybind.h" - #include - #include -@@ -441,7 +427,7 @@ static PyObject* DimList_bind(DimList *self, - PY_BEGIN - mpy::handle sizes; - static const char * const _keywords[] = {"sizes", nullptr}; -- static _PyArg_Parser parser = {"O", _keywords, 0}; -+ static _PyArg_Parser parser = { .format = "O", .keywords = _keywords}; - if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, &sizes)) { - return nullptr; - } -@@ -465,7 +451,7 @@ static PyObject* DimList_bind_len(DimList *self, - PY_BEGIN - int size; - static const char * const _keywords[] = {"N", nullptr}; -- static _PyArg_Parser parser = {"i", _keywords, 0}; -+ static _PyArg_Parser parser = { .format = "i", .keywords = _keywords}; - if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, &size)) { - return nullptr; - } -@@ -1468,7 +1454,7 @@ PyTypeObject Tensor::Type = { - - // dim() -------------------- - --static bool relevant_op(_Py_CODEUNIT c) { -+static bool relevant_op(int c) { - switch(c) { - case STORE_NAME: - case STORE_GLOBAL: -@@ -1587,7 +1573,7 @@ static PyObject* 
_dims(PyObject *self, - auto c = mpy::obj::steal(PyFrame_GetCode(f.ptr())); - auto lasti = PyFrame_GetLasti(f.ptr()); - auto decoder = PyInstDecoder(c.ptr(), lasti); -- #if IS_PYTHON_3_11_PLUS -+ #if IS_PYTHON_3_11 - // When py3.11 adapts bytecode lasti points to the precall - // rather than the call instruction after it - if (decoder.opcode() == PRECALL) { -@@ -3268,4 +3254,4 @@ PyObject* Dim_init() { - } - } - --#endif -+ -diff --git a/functorch/csrc/dim/minpybind.h b/functorch/csrc/dim/minpybind.h -index de82b5af95a4..d76d4828bf80 100644 ---- a/functorch/csrc/dim/minpybind.h -+++ b/functorch/csrc/dim/minpybind.h -@@ -621,7 +621,7 @@ struct vector_args { - PyObject *dummy = NULL; - _PyArg_ParseStackAndKeywords((PyObject*const*)args, nargs, kwnames.ptr(), _parser, &dummy, &dummy, &dummy, &dummy, &dummy); - #else -- _PyArg_Parser* _parser = new _PyArg_Parser{NULL, &names_buf[0], fname_cstr, 0}; -+ _PyArg_Parser* _parser = new _PyArg_Parser{ .keywords = &names_buf[0], .fname = fname_cstr}; - std::unique_ptr buf(new PyObject*[names.size()]); - _PyArg_UnpackKeywords((PyObject*const*)args, nargs, NULL, kwnames.ptr(), _parser, required, (Py_ssize_t)values.size() - kwonly, 0, &buf[0]); - #endif -@@ -706,7 +706,7 @@ inline object handle::call_vector(vector_args args) { - #define MPY_PARSE_ARGS_KWNAMES(fmt, FORALL_ARGS) \ - static const char * const kwlist[] = { FORALL_ARGS(MPY_ARGS_NAME) nullptr}; \ - FORALL_ARGS(MPY_ARGS_DECLARE) \ -- static _PyArg_Parser parser = {fmt, kwlist, 0}; \ -+ static _PyArg_Parser parser = { .format = fmt, .keywords = kwlist}; \ - if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, FORALL_ARGS(MPY_ARGS_POINTER) nullptr)) { \ - throw mpy::exception_set(); \ - } --- -2.44.0 - diff --git a/0001-Regenerate-flatbuffer-header.patch b/0001-Regenerate-flatbuffer-header.patch deleted file mode 100644 index 4eec491..0000000 --- a/0001-Regenerate-flatbuffer-header.patch +++ /dev/null @@ -1,39 +0,0 @@ -From 
5b8e51b24513fa851eeff42f23d942bde301e321 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 29 Sep 2023 06:19:29 -0700 -Subject: [PATCH] Regenerate flatbuffer header - -For this error -torch/csrc/jit/serialization/mobile_bytecode_generated.h:12:41: -error: static assertion failed: Non-compatible flatbuffers version included - 12 | FLATBUFFERS_VERSION_MINOR == 3 && - -PyTorch is expecting 23.3.3, what f38 has -Rawhide is at 23.5.26 - -Regenerate with -flatc --cpp --gen-mutable --no-prefix --scoped-enums mobile_bytecode.fbs - -Signed-off-by: Tom Rix ---- - torch/csrc/jit/serialization/mobile_bytecode_generated.h | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/torch/csrc/jit/serialization/mobile_bytecode_generated.h b/torch/csrc/jit/serialization/mobile_bytecode_generated.h -index cffe8bc7a6..83575e4c19 100644 ---- a/torch/csrc/jit/serialization/mobile_bytecode_generated.h -+++ b/torch/csrc/jit/serialization/mobile_bytecode_generated.h -@@ -9,8 +9,8 @@ - // Ensure the included flatbuffers.h is the same version as when this file was - // generated, otherwise it may not be compatible. - static_assert(FLATBUFFERS_VERSION_MAJOR == 23 && -- FLATBUFFERS_VERSION_MINOR == 3 && -- FLATBUFFERS_VERSION_REVISION == 3, -+ FLATBUFFERS_VERSION_MINOR == 5 && -+ FLATBUFFERS_VERSION_REVISION == 26, - "Non-compatible flatbuffers version included"); - - namespace torch { --- -2.43.0 - diff --git a/0001-Stub-in-kineto-ActivityType.patch b/0001-Stub-in-kineto-ActivityType.patch deleted file mode 100644 index f088645..0000000 --- a/0001-Stub-in-kineto-ActivityType.patch +++ /dev/null @@ -1,73 +0,0 @@ -From 3ef82b814179da571b2478f61d4279717ab0b23a Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 29 Sep 2023 06:25:23 -0700 -Subject: [PATCH] Stub in kineto ActivityType - -There is an error with kineto is not used, the shim still -requires the ActivityTYpe.h header to get the enum Activity type. -So cut-n-paste just enough of the header in to do this. 
- -Signed-off-by: Tom Rix ---- - torch/csrc/profiler/kineto_shim.h | 44 +++++++++++++++++++++++++++++++ - 1 file changed, 44 insertions(+) - -diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h -index e92cbf003d..68985ab7d0 100644 ---- a/torch/csrc/profiler/kineto_shim.h -+++ b/torch/csrc/profiler/kineto_shim.h -@@ -12,7 +12,51 @@ - #undef USE_KINETO - #endif - -+#ifdef USE_KINETO - #include -+#else -+namespace libkineto { -+// copied from header -+/* -+ * Copyright (c) Meta Platforms, Inc. and affiliates. -+ * All rights reserved. -+ * -+ * This source code is licensed under the BSD-style license found in the -+ * LICENSE file in the root directory of this source tree. -+ */ -+ -+// Note : All activity types are not enabled by default. Please add them -+// at correct position in the enum -+enum class ActivityType { -+ // Activity types enabled by default -+ CPU_OP = 0, // cpu side ops -+ USER_ANNOTATION, -+ GPU_USER_ANNOTATION, -+ GPU_MEMCPY, -+ GPU_MEMSET, -+ CONCURRENT_KERNEL, // on-device kernels -+ EXTERNAL_CORRELATION, -+ CUDA_RUNTIME, // host side cuda runtime events -+ CUDA_DRIVER, // host side cuda driver events -+ CPU_INSTANT_EVENT, // host side point-like events -+ PYTHON_FUNCTION, -+ OVERHEAD, // CUPTI induced overhead events sampled from its overhead API. -+ -+ // Optional Activity types -+ CUDA_SYNC, // synchronization events between runtime and kernels -+ GLOW_RUNTIME, // host side glow runtime events -+ MTIA_RUNTIME, // host side MTIA runtime events -+ CUDA_PROFILER_RANGE, // CUPTI Profiler range for performance metrics -+ MTIA_CCP_EVENTS, // MTIA ondevice CCP events -+ HPU_OP, // HPU host side runtime event -+ XPU_RUNTIME, // host side xpu runtime events -+ -+ ENUM_COUNT, // This is to add buffer and not used for any profiling logic. Add your new type before it. 
-+ OPTIONAL_ACTIVITY_TYPE_START = CUDA_SYNC, -+}; -+} -+ -+#endif - - #include - #include --- -2.43.0 - diff --git a/0001-can-not-use-with-c-files.patch b/0001-can-not-use-with-c-files.patch deleted file mode 100644 index 719737c..0000000 --- a/0001-can-not-use-with-c-files.patch +++ /dev/null @@ -1,25 +0,0 @@ -From a5dff521691a17701b5a02ec75e84cfe1bf605f7 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 06:41:49 -0500 -Subject: [PATCH] can not use with c files - ---- - cmake/Dependencies.cmake | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 4dd8042058..5f91f3ffab 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1269,7 +1269,7 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier) - list(APPEND HIP_CXX_FLAGS -DCAFFE2_USE_MIOPEN) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) -- list(APPEND HIP_CXX_FLAGS -std=c++17) -+# list(APPEND HIP_CXX_FLAGS -std=c++17) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "6.0.0") - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) - endif() --- -2.43.0 - diff --git a/0001-cuda-hip-signatures.patch b/0001-cuda-hip-signatures.patch deleted file mode 100644 index a258737..0000000 --- a/0001-cuda-hip-signatures.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 214dc959acc809e1959643272c344ee5335d5a69 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 1 Feb 2024 11:29:47 -0500 -Subject: [PATCH] cuda - hip signatures - ---- - aten/src/ATen/cuda/detail/LazyNVRTC.cpp | 9 +++++++++ - 1 file changed, 9 insertions(+) - -diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -index 1b85e7776e..bb6f88783a 100644 ---- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -+++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -@@ -134,8 +134,13 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, - const char *src, - const char *name, - int numHeaders, -+#if !defined(USE_ROCM) 
- const char * const *headers, - const char * const *includeNames) { -+#else -+ const char **headers, -+ const char **includeNames) { -+#endif - auto fn = reinterpret_cast(getNVRTCLibrary().sym(__func__)); - if (!fn) - throw std::runtime_error("Can't get nvrtcCreateProgram"); -@@ -150,7 +155,11 @@ NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); - NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *); - NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *); - #endif -+#if !defined(USE_ROCM) - NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *); -+#else -+NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char **); -+#endif - _STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult); - NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*); - NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *); --- -2.43.0 - diff --git a/0001-disable-use-of-aotriton.patch b/0001-disable-use-of-aotriton.patch deleted file mode 100644 index 61ffd1e..0000000 --- a/0001-disable-use-of-aotriton.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 07:06:09 -0700 -Subject: [PATCH] disable use of aotriton - ---- - .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- - 1 file changed, 15 insertions(+), 2 deletions(-) - -diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -index 214b02d8262e..7b3eb9dcd8cd 100644 ---- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -@@ -19,9 +19,12 @@ - #include - #include - -+#ifdef USE_FLASH_ATTENTION - #if USE_ROCM - #include - #endif -+#endif -+ - - /** - * Note [SDPA Runtime Dispatch] -@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { - - bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { - // Check that the gpu is capable of 
running flash attention -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - using sm80 = SMVersion<8, 0>; - using sm90 = SMVersion<9, 0>; - #if USE_ROCM -@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug - } - #endif - return true; -+#endif - } - - bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - // Mem Efficient attention supports hardware in the range [sm_50, sm_90] - using sm50 = SMVersion<5, 0>; - using sm90 = SMVersion<9, 0>; -@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) - } - #endif - return true; -+#endif - } - - bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( -@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - #ifndef USE_FLASH_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); - return false; --#endif -+#else - - // Define gate functions that determine if a flash kernel can be ran - // Replace with std::to_array when we migrate to c++20 -@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - } - } - return true; -+#endif - } - - bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - #ifndef USE_MEM_EFF_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); - return false; --#endif -+#else - // Constraints specific to mem efficient attention - constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = - array_of(at::kHalf, at::kFloat, at::kBFloat16); -@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - } - #endif - return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); -+#endif - } - - SDPBackend select_sdp_backend(sdp_params const& kernel_params) { --- -2.45.2 - diff --git 
a/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch b/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch deleted file mode 100644 index 0ce5b1f..0000000 --- a/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch +++ /dev/null @@ -1,226 +0,0 @@ -From b9d45eb1cc90696a4de76676221219e24423c709 Mon Sep 17 00:00:00 2001 -From: William Wen -Date: Wed, 3 Apr 2024 17:58:46 -0700 -Subject: [PATCH] [dynamo, 3.12] enable dynamo on 3.12, enable most dynamo - unittests on 3.12 (#123216) - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/123216 -Approved by: https://github.com/jansel, https://github.com/malfet ---- - test/dynamo/test_autograd_function.py | 3 ++ - test/dynamo/test_misc.py | 63 +++++++++++++++++++++++++ - test/functorch/test_eager_transforms.py | 7 ++- - test/run_test.py | 3 -- - torch/__init__.py | 5 +- - torch/_dynamo/eval_frame.py | 4 +- - torch/_dynamo/test_case.py | 8 +--- - 7 files changed, 74 insertions(+), 19 deletions(-) - -diff --git a/test/dynamo/test_autograd_function.py b/test/dynamo/test_autograd_function.py -index d23fec607afa..bc5ebc767038 100644 ---- a/test/dynamo/test_autograd_function.py -+++ b/test/dynamo/test_autograd_function.py -@@ -2,6 +2,8 @@ - - import copy - import math -+import sys -+import unittest - - import torch - -@@ -528,6 +530,7 @@ class AutogradFunctionTests(torch._dynamo.test_case.TestCase): - # I pulled all of these test cases from test_autograd.py - # In the future, we should make the Dynamo test suite actually - # run on test_autograd.py (it's disabled right now) and delete these. 
-+ @unittest.skipIf(sys.version_info >= (3, 12), "invalid free in 3.12+") - def test_smoke_from_test_autograd(self): - class Func(torch.autograd.Function): - @staticmethod -diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py -index a73de8b1c7e9..8f54e0564e6b 100644 ---- a/test/dynamo/test_misc.py -+++ b/test/dynamo/test_misc.py -@@ -9760,6 +9760,69 @@ fn - lambda mod: mod, - ) - -+ @xfailIfPy311 -+ def test_outside_linear_module_free(self): -+ # Compared to test_linear_module_free, the linear -+ # layer is not the code object that is directly compiled. -+ def model_inp_ctr(): -+ fc = torch.nn.Linear(100, 100) -+ -+ class Mod(torch.nn.Module): -+ def __init__(self): -+ super().__init__() -+ self.fc_ref = fc -+ -+ def forward(self, x): -+ return fc(x[0]) -+ -+ # return fc to keep it alive in _test_compile_model_free -+ return Mod(), (torch.randn(100, 100), fc) -+ -+ self._test_compile_model_free(model_inp_ctr, lambda mod: mod.fc_ref) -+ -+ @unittest.skipIf(sys.version_info >= (3, 12), "leaks in 3.12+") -+ def test_parameter_free(self): -+ def model_inp_ctr(): -+ param = torch.nn.Parameter(torch.randn(100, 100)) -+ -+ class Mod(torch.nn.Module): -+ def __init__(self): -+ super().__init__() -+ self.param = param -+ -+ def forward(self, x): -+ return self.param * x[0] -+ -+ # return param to keep it alive in _test_compile_model_free -+ return Mod(), (torch.randn(100, 100), param) -+ -+ self._test_compile_model_free(model_inp_ctr, lambda mod: mod.param) -+ -+ def test_raises_importerror1(self): -+ @torch.compile(backend="eager") -+ def fn(x): -+ try: -+ import some_module_that_surely_does_not_exist -+ -+ return -+ except ImportError: -+ pass -+ return x.sin() -+ -+ x = torch.randn(8) -+ self.assertEqual(fn(x), x.sin()) -+ -+ def test_raises_importerror2(self): -+ @torch.compile(backend="eager") -+ def fn(x): -+ import some_module_that_surely_does_not_exist -+ -+ return x + 1 -+ -+ x = torch.randn(8) -+ with self.assertRaises(ImportError): -+ fn(x) -+ - def 
test_dynamo_cache_move_to_front(self): - class Mod(torch.nn.Module): - def __init__(self): -diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py -index 09415cf8f48e..60790ec06059 100644 ---- a/test/functorch/test_eager_transforms.py -+++ b/test/functorch/test_eager_transforms.py -@@ -4762,8 +4762,7 @@ class TestCompileTransforms(TestCase): - # Triton only supports GPU with SM70 or later. - @expectedFailureIf((IS_ARM64 and not IS_MACOS) or - IS_WINDOWS or -- (TEST_CUDA and not SM70OrLater) or -- (sys.version_info >= (3, 12))) -+ (TEST_CUDA and not SM70OrLater)) - def test_compile_vmap_hessian(self, device): - # The model and inputs are a smaller version - # of code at benchmark repo: -@@ -4792,8 +4791,8 @@ class TestCompileTransforms(TestCase): - actual = opt_fn(params_and_buffers, x) - self.assertEqual(actual, expected) - -- # torch.compile is not supported on Windows or on Python 3.12+ -- @expectedFailureIf(IS_WINDOWS or (sys.version_info >= (3, 12))) -+ # torch.compile is not supported on Windows -+ @expectedFailureIf(IS_WINDOWS) - @torch._dynamo.config.patch(suppress_errors=False) - @torch._dynamo.config.patch(capture_func_transforms=True) - @skipIfTorchDynamo("Do not test torch.compile on top of torch.compile") -diff --git a/test/run_test.py b/test/run_test.py -index e86af9623042..ebb14df4167d 100755 ---- a/test/run_test.py -+++ b/test/run_test.py -@@ -74,7 +74,6 @@ sys.path.remove(str(REPO_ROOT)) - RERUN_DISABLED_TESTS = os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1" - DISTRIBUTED_TEST_PREFIX = "distributed" - INDUCTOR_TEST_PREFIX = "inductor" --DYNAMO_TEST_PREFIX = "dynamo" - - - # Note [ROCm parallel CI testing] -@@ -324,7 +323,6 @@ JIT_EXECUTOR_TESTS = [ - ] - - INDUCTOR_TESTS = [test for test in TESTS if test.startswith(INDUCTOR_TEST_PREFIX)] --DYNAMO_TESTS = [test for test in TESTS if test.startswith(DYNAMO_TEST_PREFIX)] - DISTRIBUTED_TESTS = [test for test in TESTS if 
test.startswith(DISTRIBUTED_TEST_PREFIX)] - TORCH_EXPORT_TESTS = [test for test in TESTS if test.startswith("export")] - FUNCTORCH_TESTS = [test for test in TESTS if test.startswith("functorch")] -@@ -1361,7 +1359,6 @@ def get_selected_tests(options) -> List[str]: - # these tests failing in Python 3.12 temporarily disabling - if sys.version_info >= (3, 12): - options.exclude.extend(INDUCTOR_TESTS) -- options.exclude.extend(DYNAMO_TESTS) - options.exclude.extend( - [ - "functorch/test_dims", -diff --git a/torch/__init__.py b/torch/__init__.py -index d381712b4a35..26cdffe81d29 100644 ---- a/torch/__init__.py -+++ b/torch/__init__.py -@@ -1861,9 +1861,8 @@ def compile(model: Optional[Callable] = None, *, - - """ - _C._log_api_usage_once("torch.compile") -- # Temporary until we get proper support for python 3.12 -- if sys.version_info >= (3, 12): -- raise RuntimeError("Dynamo is not supported on Python 3.12+") -+ if sys.version_info >= (3, 13): -+ raise RuntimeError("Dynamo is not supported on Python 3.13+") - - # Decorator mode - if model is None: -diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py -index 53ab0df3a947..0a80eeea99ed 100644 ---- a/torch/_dynamo/eval_frame.py -+++ b/torch/_dynamo/eval_frame.py -@@ -589,8 +589,8 @@ class _NullDecorator(contextlib.nullcontext): # type: ignore[type-arg] - - - def check_if_dynamo_supported(): -- if sys.version_info >= (3, 12): -- raise RuntimeError("Python 3.12+ not yet supported for torch.compile") -+ if sys.version_info >= (3, 13): -+ raise RuntimeError("Python 3.13+ not yet supported for torch.compile") - - - def is_dynamo_supported(): -diff --git a/torch/_dynamo/test_case.py b/torch/_dynamo/test_case.py -index e3cbef09eaae..297ea6e2bc2a 100644 ---- a/torch/_dynamo/test_case.py -+++ b/torch/_dynamo/test_case.py -@@ -1,7 +1,6 @@ - import contextlib - import importlib - import logging --import sys - - import torch - import torch.testing -@@ -20,12 +19,7 @@ log = logging.getLogger(__name__) - def 
run_tests(needs=()): - from torch.testing._internal.common_utils import run_tests - -- if ( -- TEST_WITH_TORCHDYNAMO -- or IS_WINDOWS -- or TEST_WITH_CROSSREF -- or sys.version_info >= (3, 12) -- ): -+ if TEST_WITH_TORCHDYNAMO or IS_WINDOWS or TEST_WITH_CROSSREF: - return # skip testing - - if isinstance(needs, str): --- -2.44.0 - diff --git a/0001-include-fmt-ranges.h-for-using-fmt-join.patch b/0001-include-fmt-ranges.h-for-using-fmt-join.patch deleted file mode 100644 index f7f6c7d..0000000 --- a/0001-include-fmt-ranges.h-for-using-fmt-join.patch +++ /dev/null @@ -1,54 +0,0 @@ -From ba2cf11d1bf1dd5086c8e793198a697d4179cca7 Mon Sep 17 00:00:00 2001 -From: Kefu Chai -Date: Tue, 16 Jul 2024 08:00:22 +0800 -Subject: [PATCH] include fmt/ranges.h for using fmt::join() - -fmt::join() was moved into fmt/ranges.h in fmt 11, so include this -header for using it. - -Signed-off-by: Kefu Chai ---- - torch/csrc/distributed/c10d/socket.cpp | 1 + - torch/csrc/profiler/standalone/execution_trace_observer.cpp | 1 + - torch/csrc/profiler/util.cpp | 1 + - 3 files changed, 3 insertions(+) - -diff --git a/torch/csrc/distributed/c10d/socket.cpp b/torch/csrc/distributed/c10d/socket.cpp -index 5013f2540..cbcd33a19 100644 ---- a/torch/csrc/distributed/c10d/socket.cpp -+++ b/torch/csrc/distributed/c10d/socket.cpp -@@ -31,6 +31,7 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wdeprecated") - #include - C10_DIAGNOSTIC_POP() - #include -+#include - - #include - #include -diff --git a/torch/csrc/profiler/standalone/execution_trace_observer.cpp b/torch/csrc/profiler/standalone/execution_trace_observer.cpp -index 2ef2e5423..fb053e916 100644 ---- a/torch/csrc/profiler/standalone/execution_trace_observer.cpp -+++ b/torch/csrc/profiler/standalone/execution_trace_observer.cpp -@@ -10,6 +10,7 @@ - #endif // _WIN32 - - #include -+#include - #include - #include - #include -diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp -index 896bf606c..c229ce130 100644 ---- 
a/torch/csrc/profiler/util.cpp -+++ b/torch/csrc/profiler/util.cpp -@@ -5,6 +5,7 @@ - #include - #include - #include -+#include - - #ifdef USE_KINETO - #include --- -2.45.2 - diff --git a/0001-no-third_party-FXdiv.patch b/0001-no-third_party-FXdiv.patch deleted file mode 100644 index 71404e3..0000000 --- a/0001-no-third_party-FXdiv.patch +++ /dev/null @@ -1,54 +0,0 @@ -From b3b307add5724ee5730f161e16594fa702f34a19 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:20:28 -0500 -Subject: [PATCH] no third_party FXdiv - ---- - caffe2/CMakeLists.txt | 24 ++++++++++++------------ - 1 file changed, 12 insertions(+), 12 deletions(-) - -diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index b2f3adbfae..80a5625c8d 100644 ---- a/caffe2/CMakeLists.txt -+++ b/caffe2/CMakeLists.txt -@@ -110,15 +110,15 @@ endif() - # Note: the folders that are being commented out have not been properly - # addressed yet. - --if(NOT MSVC AND USE_XNNPACK) -- if(NOT TARGET fxdiv) -- set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") -- set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") -- add_subdirectory( -- "${FXDIV_SOURCE_DIR}" -- "${CMAKE_BINARY_DIR}/FXdiv") -- endif() --endif() -+#if(NOT MSVC AND USE_XNNPACK) -+# if(NOT TARGET fxdiv) -+# set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") -+# set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") -+# add_subdirectory( -+# "${FXDIV_SOURCE_DIR}" -+# "${CMAKE_BINARY_DIR}/FXdiv") -+# endif() -+#endif() - - add_subdirectory(core) - add_subdirectory(serialize) -@@ -1081,9 +1081,9 @@ if(USE_XPU) - target_compile_definitions(torch_xpu PRIVATE USE_XPU) - endif() - --if(NOT MSVC AND USE_XNNPACK) -- TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) --endif() -+#if(NOT MSVC AND USE_XNNPACK) -+# TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) -+#endif() - - # ========================================================== - # formerly-libtorch flags --- -2.43.0 - diff --git a/0001-no-third_party-fmt.patch b/0001-no-third_party-fmt.patch deleted file mode 100644 index 
6e82af2..0000000 --- a/0001-no-third_party-fmt.patch +++ /dev/null @@ -1,65 +0,0 @@ -From 2ce255b75760a0a513fb1706629b416f76a5c822 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:16:04 -0500 -Subject: [PATCH] no third_party fmt - ---- - c10/CMakeLists.txt | 2 +- - cmake/Dependencies.cmake | 6 +++--- - torch/CMakeLists.txt | 2 +- - 3 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt -index 1f742f4c176..4fa08913bdd 100644 ---- a/c10/CMakeLists.txt -+++ b/c10/CMakeLists.txt -@@ -87,7 +87,7 @@ endif() - if(C10_USE_GLOG) - target_link_libraries(c10 PUBLIC glog::glog) - endif() --target_link_libraries(c10 PRIVATE fmt::fmt-header-only) -+target_link_libraries(c10 PRIVATE fmt) - - if(C10_USE_NUMA) - message(STATUS "NUMA paths:") -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 6f5a2d5feff..42fbf80f6e8 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1837,7 +1837,7 @@ endif() - # - set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) - set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE) --add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) -+# add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) - - # Disable compiler feature checks for `fmt`. - # -@@ -1846,9 +1846,9 @@ add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) - # CMAKE_CXX_FLAGS in ways that break feature checks. Since we already know - # `fmt` is compatible with a superset of the compilers that PyTorch is, it - # shouldn't be too bad to just disable the checks. 
--set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") -+# set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") - --list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) -+# list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) - set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) - - # ---[ Kineto -diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt -index 97a72eed55b..9e5014d1980 100644 ---- a/torch/CMakeLists.txt -+++ b/torch/CMakeLists.txt -@@ -80,7 +80,7 @@ set(TORCH_PYTHON_LINK_LIBRARIES - python::python - pybind::pybind11 - shm -- fmt::fmt-header-only -+ fmt - ATEN_CPU_FILES_GEN_LIB) - - if(USE_ASAN AND TARGET Sanitizer::address) --- -2.43.2 - diff --git a/0001-no-third_party-foxi.patch b/0001-no-third_party-foxi.patch deleted file mode 100644 index ba1ec40..0000000 --- a/0001-no-third_party-foxi.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 8cb61cf9282102ac225645fcc9fb4a1bb7cb15a2 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:11:55 -0500 -Subject: [PATCH] no third_party foxi - ---- - cmake/Dependencies.cmake | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 5f91f3ffab..8e1461af81 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1567,7 +1567,7 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - set_target_properties(onnx_proto PROPERTIES CXX_STANDARD 17) - endif() - endif() -- add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL) -+ # add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL) - - add_definitions(-DONNX_NAMESPACE=${ONNX_NAMESPACE}) - if(NOT USE_SYSTEM_ONNX) -@@ -1600,8 +1600,8 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - message("-- Found onnx: ${ONNX_LIBRARY} ${ONNX_PROTO_LIBRARY}") - list(APPEND Caffe2_DEPENDENCY_LIBS 
onnx_proto onnx) - endif() -- include_directories(${FOXI_INCLUDE_DIRS}) -- list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) -+# include_directories(${FOXI_INCLUDE_DIRS}) -+# list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) - # Recover the build shared libs option. - set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) - endif() --- -2.43.0 - diff --git a/0001-python-torch-disable-ck.patch b/0001-python-torch-disable-ck.patch deleted file mode 100644 index e8fd9c2..0000000 --- a/0001-python-torch-disable-ck.patch +++ /dev/null @@ -1,112 +0,0 @@ -From 027dad1eaed51c1172e2497da611e3267d42d2f0 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 28 Mar 2025 09:16:03 -0700 -Subject: [PATCH] python-torch: disable ck - ---- - aten/src/ATen/CMakeLists.txt | 7 +++---- - aten/src/ATen/Context.cpp | 1 + - aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- - 3 files changed, 9 insertions(+), 9 deletions(-) - -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index 085af373ec22..84808880e51c 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -134,7 +134,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu") - file(GLOB native_cuda_cpp "native/cuda/*.cpp") - file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") - file(GLOB native_cuda_linalg_cpp "native/cuda/linalg/*.cpp") --file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" "native/hip/bgemm_kernels/*.h") -+file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" ) - file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") - file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") - file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") -@@ -145,7 +145,7 @@ file(GLOB native_nested_h "native/nested/*.h") - file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") - file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") - --file(GLOB native_hip_hip "native/hip/*.hip" "native/hip/bgemm_kernels/*.hip") -+file(GLOB native_hip_hip "native/hip/*.hip" ) - file(GLOB 
native_hip_cpp "native/hip/*.cpp") - file(GLOB native_hip_linalg_cpp "native/hip/linalg/*.cpp") - file(GLOB native_miopen_cpp "native/miopen/*.cpp") -@@ -361,13 +361,12 @@ endif() - ${native_quantized_hip_hip} - ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} - ) -- if(WIN32) # Windows doesn't support Composable Kernels and Triton - file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") - file(GLOB native_hip_ck "native/hip/ck*.hip") - exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" - ${native_hip_bgemm} ${native_hip_ck} - ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) -- endif() -+ - # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) - list(APPEND all_hip_cpp - ${native_nested_hip_cpp} -diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp -index f598fc3a39d3..03dab6ff38fe 100644 ---- a/aten/src/ATen/Context.cpp -+++ b/aten/src/ATen/Context.cpp -@@ -355,6 +355,7 @@ at::BlasBackend Context::blasPreferredBackend() { - } - - void Context::setBlasPreferredBackend(at::BlasBackend b) { -+ return; - #ifdef _MSC_VER - TORCH_WARN_ONCE( - "torch.backends.cuda.preferred_blas_library is an experimental feature. 
" -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index a62b028fd4ff..cba38426ea1f 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); - #endif - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); - } -@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); - } -@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); - } -@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if 
(at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --- -2.48.1 - diff --git a/0001-reenable-foxi-linking.patch b/0001-reenable-foxi-linking.patch deleted file mode 100644 index 8e39795..0000000 --- a/0001-reenable-foxi-linking.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 58ccda271e8f51c3fa5b7518cf6ee52ce204fd37 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 22 Feb 2024 09:28:11 -0500 -Subject: [PATCH] reenable foxi linking - ---- - cmake/Dependencies.cmake | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 42fbf80f6e8..bc3a2dc6fee 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1604,7 +1604,7 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - list(APPEND Caffe2_DEPENDENCY_LIBS onnx_proto onnx) - endif() - # include_directories(${FOXI_INCLUDE_DIRS}) --# list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) -+ list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) - # Recover the build shared libs option. 
- set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) - endif() --- -2.43.2 - diff --git a/0001-silence-an-assert.patch b/0001-silence-an-assert.patch deleted file mode 100644 index 0b20dcf..0000000 --- a/0001-silence-an-assert.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 04dd33db93b852fdfd7ea408813080b2e2026650 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 06:41:20 -0500 -Subject: [PATCH] silence an assert - ---- - aten/src/ATen/native/cuda/IndexKernel.cu | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu -index 657c0c77b3..b406aa6687 100644 ---- a/aten/src/ATen/native/cuda/IndexKernel.cu -+++ b/aten/src/ATen/native/cuda/IndexKernel.cu -@@ -249,7 +249,7 @@ void index_put_kernel_quantized_cuda(TensorIterator& iter, const IntArrayRef ind - - gpu_index_kernel(iter, index_size, index_stride, [inv_scale, zero_point, qmin, qmax]C10_DEVICE(char* const out_data, const char* const in_data, const int64_t offset) { - int64_t qvalue = static_cast(zero_point + nearbyintf(*(float*)in_data * inv_scale)); -- qvalue = std::clamp(qvalue, qmin, qmax); -+ //qvalue = std::clamp(qvalue, qmin, qmax); - *(scalar_t*)(out_data + offset) = static_cast(qvalue); - }); - }); --- -2.43.0 - diff --git a/0001-torch-paper-over-c-assert.patch b/0001-torch-paper-over-c-assert.patch deleted file mode 100644 index b7e55ce..0000000 --- a/0001-torch-paper-over-c-assert.patch +++ /dev/null @@ -1,88 +0,0 @@ -From f646e0f04ae591c8f2d8a0cd24b035725c57659b Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 23 Jan 2025 08:24:22 -0800 -Subject: [PATCH] torch: paper over c++ assert - ---- - aten/src/ATen/native/sparse/FlattenIndicesCommon.h | 2 ++ - .../ATen/native/sparse/SparseBinaryOpIntersectionCommon.h | 5 +++++ - .../src/ATen/native/sparse/ValidateCompressedIndicesCommon.h | 2 ++ - 3 files changed, 9 insertions(+) - -diff --git a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h 
b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h -index 0e79ed809ae6..a3cec8aaf78b 100644 ---- a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h -+++ b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h -@@ -69,11 +69,13 @@ Tensor _flatten_indices_impl(const Tensor& indices, IntArrayRef size) { - [=] FUNCAPI (int64_t nnz_idx) -> int64_t { - const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; - auto hash = static_cast(0); -+#if 0 - for (int64_t dim = 0; dim < sparse_dim; ++dim) { - const auto dim_hash_coeff = hash_coeffs[dim]; - const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; - hash += dim_index * dim_hash_coeff; - } -+#endif - return hash; - }); - } -diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h -index c0b94bf39d54..8de4900b7a01 100644 ---- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h -+++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h -@@ -279,12 +279,15 @@ void _sparse_binary_op_intersection_kernel_impl( - if (!ptr_indices) { - return hash; - } -+#if 0 -+// /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/array:219:2: error: reference to __host__ function '__glibcxx_assert_fail' in __host__ __device__ function - const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; - for (int64_t dim = 0; dim < sparse_dim; ++dim) { - const auto dim_hash_coeff = hash_coeffs[dim]; - const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; - hash += dim_index * dim_hash_coeff; - } -+#endif - return hash; - }); - } -@@ -364,6 +367,7 @@ void _sparse_binary_op_intersection_kernel_impl( - if (hash_ptr) { - hash = hash_ptr[nnz_idx]; - } else if (sparse_dim) { -+#if 0 - // Compute hash value - const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; - for (int64_t dim = 0; dim < sparse_dim; ++dim) { -@@ -371,6 +375,7 @@ void 
_sparse_binary_op_intersection_kernel_impl( - const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; - hash += dim_index * dim_hash_coeff; - } -+#endif - } - - // Perform hash values intersection -diff --git a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h -index ec4c084a39cc..9bc9655b0afa 100644 ---- a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h -+++ b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h -@@ -341,6 +341,7 @@ void _validate_compressed_sparse_indices_kernel( - // assuming idx contiguity per batch: - int64_t tmp = batch_idx * nnz; - // `nnz == idx_sizes[idx_ndims - 1]` is checked above as `nnz == idx.size(-1)` -+#if 0 - for (int i = idx_ndims - 1; - i >= 0 && nnz > 0; // break early when nnz==0 - i--) { -@@ -348,6 +349,7 @@ void _validate_compressed_sparse_indices_kernel( - idx_offset += (tmp - div * idx_sizes[i]) * idx_strides[i]; - tmp = div; - } -+#endif - const auto* RESTRICT ptr_idx_batch = ptr_idx + idx_offset; - _check_idx_sorted_distinct_vals_slices_with_cidx< - cdim_name, --- -2.48.1 - diff --git a/0001-use-any-hip.patch b/0001-use-any-hip.patch deleted file mode 100644 index dca86ea..0000000 --- a/0001-use-any-hip.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 4248211ce9a9de81bb3ade5d421ba709b19ead08 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 15:01:28 -0500 -Subject: [PATCH] use any hip - ---- - cmake/public/LoadHIP.cmake | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1abeb06228..28458c4146 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -30,7 +30,7 @@ endif() - message("Building PyTorch for GPU arch: ${PYTORCH_ROCM_ARCH}") - - # Add HIP to the CMAKE Module Path --set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip ${CMAKE_MODULE_PATH}) -+set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib64/cmake/hip 
${CMAKE_MODULE_PATH}) - - macro(find_package_and_print_version PACKAGE_NAME) - find_package("${PACKAGE_NAME}" ${ARGN}) -@@ -38,7 +38,7 @@ macro(find_package_and_print_version PACKAGE_NAME) - endmacro() - - # Find the HIP Package --find_package_and_print_version(HIP 1.0) -+find_package_and_print_version(HIP MODULE) - - if(HIP_FOUND) - set(PYTORCH_FOUND_HIP TRUE) --- -2.43.0 - diff --git a/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch deleted file mode 100644 index 413c60d..0000000 --- a/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 20 Jul 2024 05:37:15 -0600 -Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM - -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - cmake/Dependencies.cmake | 3 ++- - 2 files changed, 3 insertions(+), 1 deletion(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index c4cd4b2c2a98..2068f7c6c4f2 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF - "USE_CUDNN" OFF) - cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) - option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) -+option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) - option(USE_KINETO "Use Kineto profiling library" ON) - option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) - option(USE_FAKELOWP "Use FakeLowp operators" OFF) -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..192dac46f13b 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -706,6 +706,7 @@ endif() - - # ---[ FBGEMM - if(USE_FBGEMM) -+ if (NOT USE_SYSTEM_FBGEMM) - set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") - if(NOT DEFINED FBGEMM_SOURCE_DIR) - set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE 
STRING "FBGEMM source directory") -@@ -746,7 +747,7 @@ if(USE_FBGEMM) - target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) - endif() - endif() -- -+ endif() - if(USE_FBGEMM) - list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) - endif() --- -2.45.1 - diff --git a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch new file mode 100644 index 0000000..1afe692 --- /dev/null +++ b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch @@ -0,0 +1,149 @@ +From 4cc5d88dfe7a45ab245648dc874645d32a24b98b Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 27 Jun 2025 13:52:51 -0700 +Subject: [PATCH] Add cmake variable USE_ROCM_CK + +--- + CMakeLists.txt | 1 + + aten/src/ATen/CMakeLists.txt | 40 ++++++++++++++++----------------- + aten/src/ATen/cuda/CUDABlas.cpp | 10 ++++----- + cmake/Dependencies.cmake | 3 +++ + 4 files changed, 29 insertions(+), 25 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 99c0b9e0ea0c..4c632e42f531 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -240,6 +240,7 @@ cmake_dependent_option( + BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON + "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) + cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) ++cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) + option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) + cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) + cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index c9cfd74b501e..59f6178218ee 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -373,26 +373,26 @@ if(USE_ROCM) + # is header only, so this should be ok, except that the CMake build generates + # a ck/config.h. We just do that part here. Without this, the ck.h from the + # ROCM SDK may get accidentally used instead. 
+- function(_pytorch_rocm_generate_ck_conf) +- set(CK_ENABLE_INT8 "ON") +- set(CK_ENABLE_FP16 "ON") +- set(CK_ENABLE_FP32 "ON") +- set(CK_ENABLE_FP64 "ON") +- set(CK_ENABLE_BF16 "ON") +- set(CK_ENABLE_FP8 "ON") +- set(CK_ENABLE_BF8 "ON") +- set(CK_USE_XDL "ON") +- set(CK_USE_WMMA "ON") +- configure_file( +- "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" +- "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" +- ) +- endfunction() ++# function(_pytorch_rocm_generate_ck_conf) ++# set(CK_ENABLE_INT8 "ON") ++# set(CK_ENABLE_FP16 "ON") ++# set(CK_ENABLE_FP32 "ON") ++# set(CK_ENABLE_FP64 "ON") ++# set(CK_ENABLE_BF16 "ON") ++# set(CK_ENABLE_FP8 "ON") ++# set(CK_ENABLE_BF8 "ON") ++# set(CK_USE_XDL "ON") ++# set(CK_USE_WMMA "ON") ++# configure_file( ++# "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" ++# "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" ++# ) ++# endfunction() + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) +- _pytorch_rocm_generate_ck_conf() ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) ++# _pytorch_rocm_generate_ck_conf() + + # Next two lines are needed because TunableOp uses third-party/fmt + list(APPEND ATen_HIP_INCLUDE $) +@@ -409,7 +409,7 @@ endif() + ${native_quantized_hip_hip} + ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels ++ 
if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index 89350a11bea7..33e5f2808057 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -752,7 +752,7 @@ template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support double gemm yet + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); + #else +@@ -1103,7 +1103,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( + void * beta_ptr = &fbeta; + _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); + GEMM_CHECK_ARGVALUES(at::Half); +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + int flag = 0; + #if USE_GEMM_FLAGS_FP16_ALT_IMPL + flag = at::ROCmBackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; +@@ -1270,7 +1270,7 @@ template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support double gemm yet + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); + #else +@@ -1311,7 +1311,7 @@ template <> + void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + #else +@@ -1327,7 +1327,7 @@ template <> + void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + #else +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index a93386c27f8d..be1368999d38 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1031,6 +1031,9 @@ if(USE_ROCM) + if(HIPBLASLT_VEC_EXT) + list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) + endif() ++ if(USE_ROCM_CK) ++ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) ++ endif() + list(APPEND HIP_HIPCC_FLAGS --offload-compress) + if(WIN32) + add_definitions(-DROCM_ON_WINDOWS) +-- +2.49.0 + diff --git a/next/0001-Optionally-use-hipblaslt.patch b/next/0001-Optionally-use-hipblaslt.patch deleted file mode 100644 index 1e5ca4b..0000000 --- a/next/0001-Optionally-use-hipblaslt.patch +++ /dev/null @@ -1,506 +0,0 @@ -From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 04:18:34 -0700 -Subject: [PATCH] Optionally use hipblaslt - ---- - aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ - aten/src/ATen/cuda/CUDAContextLight.h | 
4 +++ - aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- - aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- - aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- - cmake/Dependencies.cmake | 3 ++ - cmake/public/LoadHIP.cmake | 2 +- - 7 files changed, 82 insertions(+), 19 deletions(-) - -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index ce991a9bcad4..3f0d17b52778 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -14,7 +14,9 @@ - #include - - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - // until hipblas has an API to accept flags, we must use rocblas here - #include - #include -@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { - static size_t _parseChosenWorkspaceSize() { - const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); - #ifdef USE_ROCM -+#ifndef USE_HIPBLASLT -+ return 0; -+#endif - if (!val) { - // accept either env var - val = getenv("HIPBLASLT_WORKSPACE_SIZE"); -@@ -235,6 +240,7 @@ namespace at::cuda::blas { - } while (0) - - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - namespace { - // Following the pattern of CuSparseDescriptor - // Defined here for now because this is the only place cublas_lt interface is -@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< - }; - } // namespace - -- - template - inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - cudaDataType_t abcType = CUDA_R_32F; -@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - " scaleType ", - scaleType); - } -- -+#endif - - template - inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { -@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if 
(at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); - } - } -@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); - } - } -@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } - } -@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { - } - } - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { - // forward to bgemm implementation but set strides and batches to 0 - bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); - } -+#endif - - template - inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { -@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - 
gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); - } - } -@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); - } - } -@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } - } -@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { - } - } - -- -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - void gemm_and_bias( - bool transpose_mat1, -@@ -1410,7 +1435,7 @@ void scaled_gemm( - ScalarType result_dtype, - void* amax_ptr, - bool use_fast_accum) { --#if CUDA_VERSION >= 11080 || defined(USE_ROCM) -+#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - const auto computeType = CUBLAS_COMPUTE_32F; - const auto scaleType = CUDA_R_32F; - const int8_t fastAccuMode = use_fast_accum ? 
1 : 0; -@@ -1681,6 +1706,7 @@ void int8_gemm( - " scaleType ", - scaleType); - } -+#endif - - template <> - void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { -diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h -index f2b657ced51b..f0ee613c4208 100644 ---- a/aten/src/ATen/cuda/CUDAContextLight.h -+++ b/aten/src/ATen/cuda/CUDAContextLight.h -@@ -9,7 +9,9 @@ - - // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also - // added bf16 support -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - #include -+#endif - - #ifdef CUDART_VERSION - #include -@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); - /* Handles */ - TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); - TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); -+#endif - - TORCH_CUDA_CPP_API void clearCublasWorkspaces(); - -diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index 8eac525b3695..abfdf7a23847 100644 ---- a/aten/src/ATen/cuda/CublasHandlePool.cpp -+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp -@@ -29,7 +29,7 @@ namespace at::cuda { - - namespace { - --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - void createCublasLtHandle(cublasLtHandle_t *handle) { - TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); - } -@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { - return handle; - } - --cublasLtHandle_t getCurrentCUDABlasLtHandle() { - #ifdef USE_ROCM -+#if defined(USE_HIPBLASLT) -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - c10::DeviceIndex device = 0; - AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); - -@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { - - auto handle = myPoolWindow->reserve(device); - return handle; -+} 
-+#endif - #else -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - return reinterpret_cast(getCurrentCUDABlasHandle()); --#endif - } -+#endif - - } // namespace at::cuda -diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h -index 53e6154120c9..fa1d664696db 100644 ---- a/aten/src/ATen/cuda/tunable/TunableGemm.h -+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h -@@ -11,7 +11,9 @@ - - #include - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - #include - #endif - #include -@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class DefaultScaledGemmOp : public Callable> { - public: -@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { - return OK; - } - }; -+#endif - - template - inline bool IsZero(T v) { -@@ -191,6 +195,7 @@ static void AddRocblasValidator() { - } - } - -+#ifdef USE_HIPBLASLT - static void AddHipblasltValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - if (validators.find("HIPBLASLT_VERSION") == validators.end()) { -@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { - [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? 
OK : FAIL; }); - } - } -+#endif - - static void AddRocmValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); -@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class ScaledGemmTunableOp : public TunableOp, StreamTimer> { - public: -@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - - #if defined(USE_ROCM) -+#ifdef USE_HIPBLASLT - for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { - this->RegisterOp(std::move(name), std::move(op)); - } - AddHipblasltValidator(); -+#endif - AddRocmValidator(); - #endif - } -@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); - } - }; -+#endif - - #undef XSTRINGIFY - #undef STRINGIFY 
-diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 84c59a4fd0d7..56ad5de3bf2d 100644 ---- a/aten/src/ATen/native/cuda/Blas.cpp -+++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa - } - - static bool getDisableAddmmCudaLt() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); - #ifdef USE_ROCM - // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) -@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { - } - return false; - #endif -+#else -+ return true; -+#endif - } - - #ifdef USE_ROCM - static bool isSupportedHipLtROCmArch(int index) { -+#ifdef USE_HIPBLASLT - hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); - std::string device_arch = prop->gcnArchName; - static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { - } - } - TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); -+#endif - return false; - } - #endif -@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - at::ScalarType scalar_type = self.scalar_type(); - c10::MaybeOwned self_; - if (&result != &self) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) - // Strangely, if mat2 has only 1 row or column, we get - // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. 
-@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - scalar_type != at::ScalarType::BFloat16)); - #endif - } -+#endif - #endif - if (!useLtInterface) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - } - self__sizes = self_->sizes(); - } else { --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - useLtInterface = !disable_addmm_cuda_lt && - result.dim() == 2 && result.is_contiguous() && - isSupportedHipLtROCmArch(self.device().index()) && -@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); - - if (useLtInterface) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if defined(USE_ROCM) - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, -@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - activation_epilogue - ); - }); -+#endif - #endif - } else - { -@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { - } - - static bool _scaled_mm_allowed_device() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - auto dprops = at::cuda::getCurrentDeviceProperties(); - #ifdef USE_ROCM - std::string device_arch = dprops->gcnArchName; -@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { - #else - return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); - #endif -+#else -+ return false; -+#endif - } - - // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax -@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - // Check sizes - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); -+#if 
!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); - TORCH_CHECK( -@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 - // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately - amax = at::max(at::abs(out.to(kFloat))); -+#endif - #endif - - return {out, amax}; -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..8d05e834bbc5 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1052,6 +1052,9 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - list(APPEND HIP_CXX_FLAGS -std=c++17) - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) -+ if(hipblast_FOUND) -+ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) -+ endif() - if(HIP_NEW_TYPE_ENUMS) - list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) - endif() -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index fa39156031ff..df4836847fdf 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -155,7 +155,7 @@ if(HIP_FOUND) - find_package_and_print_version(hiprand REQUIRED) - find_package_and_print_version(rocblas REQUIRED) - find_package_and_print_version(hipblas REQUIRED) -- find_package_and_print_version(hipblaslt REQUIRED) -+ find_package_and_print_version(hipblaslt) - find_package_and_print_version(miopen REQUIRED) - find_package_and_print_version(hipfft REQUIRED) - find_package_and_print_version(hipsparse REQUIRED) --- -2.45.2 - diff --git a/next/0001-disable-use-of-aotriton.patch b/next/0001-disable-use-of-aotriton.patch deleted file mode 100644 index 61ffd1e..0000000 --- a/next/0001-disable-use-of-aotriton.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 
29 Jun 2024 07:06:09 -0700 -Subject: [PATCH] disable use of aotriton - ---- - .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- - 1 file changed, 15 insertions(+), 2 deletions(-) - -diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -index 214b02d8262e..7b3eb9dcd8cd 100644 ---- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -@@ -19,9 +19,12 @@ - #include - #include - -+#ifdef USE_FLASH_ATTENTION - #if USE_ROCM - #include - #endif -+#endif -+ - - /** - * Note [SDPA Runtime Dispatch] -@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { - - bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { - // Check that the gpu is capable of running flash attention -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - using sm80 = SMVersion<8, 0>; - using sm90 = SMVersion<9, 0>; - #if USE_ROCM -@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug - } - #endif - return true; -+#endif - } - - bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - // Mem Efficient attention supports hardware in the range [sm_50, sm_90] - using sm50 = SMVersion<5, 0>; - using sm90 = SMVersion<9, 0>; -@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) - } - #endif - return true; -+#endif - } - - bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( -@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - #ifndef USE_FLASH_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); - return false; --#endif -+#else - - // Define gate functions that determine if a flash kernel can be ran - // Replace with std::to_array when we migrate to c++20 -@@ 
-597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - } - } - return true; -+#endif - } - - bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - #ifndef USE_MEM_EFF_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); - return false; --#endif -+#else - // Constraints specific to mem efficient attention - constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = - array_of(at::kHalf, at::kFloat, at::kBFloat16); -@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - } - #endif - return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); -+#endif - } - - SDPBackend select_sdp_backend(sdp_params const& kernel_params) { --- -2.45.2 - diff --git a/python-torch.spec b/python-torch.spec index 90c908c..91a82a9 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.8.0-rc3 -%global commit0 3d53a53e504089a52a149791fd33d7fc898bd055 +# v2.8.0-rc6 +%global commit0 f2b69a083d15e3d0083bb304302a3fd0b5fb8705 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250625 +%global date0 20250718 %global pypi_version 2.8.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 @@ -357,11 +357,9 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPE sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt -%if %{without gitcommit} # https://github.com/pytorch/pytorch/issues/149803 # Tries to checkout nccl -sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py -%endif +sed -i -e 's@ checkout_nccl()@ True@' tools/build_pytorch_libs.py # Disable the use of check_submodule's in the 
setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -541,6 +539,7 @@ export USE_SYSTEM_EIGEN_INSTALL=ON export USE_SYSTEM_ONNX=ON export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF +export USE_SYSTEM_NCCL=OFF export USE_TENSORPIPE=OFF export USE_XNNPACK=OFF export USE_XPU=OFF From 61ccf033a8aece458e7df6573367bde478952636 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 24 Jul 2025 06:07:03 -0700 Subject: [PATCH 68/88] Update gitcommit to 2.8.0-rc8 Patch problem with 3.14 Start converting over py3 macros Handle new dependency on rocmsmi Signed-off-by: Tom Rix --- .../0001-Add-cmake-variable-USE_ROCM_CK.patch | 85 ++++- ...and-import-torch-issues-for-cpython-.patch | 359 ++++++++++++++++++ next/0001-Use-horrible-dynamo-stub.patch | 85 +++++ python-torch.spec | 60 ++- 4 files changed, 560 insertions(+), 29 deletions(-) create mode 100644 next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch create mode 100644 next/0001-Use-horrible-dynamo-stub.patch diff --git a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch index 1afe692..925e03b 100644 --- a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch +++ b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch @@ -1,17 +1,17 @@ -From 4cc5d88dfe7a45ab245648dc874645d32a24b98b Mon Sep 17 00:00:00 2001 +From 193854993cd939de186de19589c1add4c4b2cf66 Mon Sep 17 00:00:00 2001 From: Tom Rix -Date: Fri, 27 Jun 2025 13:52:51 -0700 +Date: Mon, 21 Jul 2025 11:35:03 -0700 Subject: [PATCH] Add cmake variable USE_ROCM_CK --- CMakeLists.txt | 1 + aten/src/ATen/CMakeLists.txt | 40 ++++++++++++++++----------------- - aten/src/ATen/cuda/CUDABlas.cpp | 10 ++++----- + aten/src/ATen/cuda/CUDABlas.cpp | 22 +++++++++--------- cmake/Dependencies.cmake | 3 +++ - 4 files changed, 29 insertions(+), 25 deletions(-) + 4 files changed, 35 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt -index 
99c0b9e0ea0c..4c632e42f531 100644 +index a5d25e6afa0f..afc1b53efa64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -240,6 +240,7 @@ cmake_dependent_option( @@ -82,7 +82,7 @@ index c9cfd74b501e..59f6178218ee 100644 file(GLOB native_hip_ck "native/hip/ck*.hip") exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index 89350a11bea7..33e5f2808057 100644 +index 89350a11bea7..e5b7960177cf 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -752,7 +752,7 @@ template <> @@ -94,16 +94,16 @@ index 89350a11bea7..33e5f2808057 100644 // hipblaslt does not support double gemm yet bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); #else -@@ -1103,7 +1103,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( - void * beta_ptr = &fbeta; - _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); - GEMM_CHECK_ARGVALUES(at::Half); --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - int flag = 0; - #if USE_GEMM_FLAGS_FP16_ALT_IMPL - flag = at::ROCmBackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; -@@ -1270,7 +1270,7 @@ template <> +@@ -836,7 +836,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +@@ -1270,14 +1270,14 @@ template <> void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { @@ -112,6 +112,23 @@ index 89350a11bea7..33e5f2808057 100644 // hipblaslt does not support double gemm yet gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); #else + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); + #endif + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); + } +@@ -1293,7 +1293,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); @@ -1311,7 +1311,7 @@ template <> void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { @@ -130,6 +147,42 @@ index 89350a11bea7..33e5f2808057 100644 // hipblaslt does not support complex gemm yet gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); #else +@@ -1345,7 +1345,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + if 
(at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); + } +@@ -1361,7 +1361,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +@@ -1382,7 +1382,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } +@@ -1398,7 +1398,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::B + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a93386c27f8d..be1368999d38 100644 --- a/cmake/Dependencies.cmake diff --git 
a/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch b/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch new file mode 100644 index 0000000..b6a282c --- /dev/null +++ b/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch @@ -0,0 +1,359 @@ +From f2a544b2e3a5bdc04985f6e06223c0c1700120a0 Mon Sep 17 00:00:00 2001 +From: albanD +Date: Sat, 12 Jul 2025 03:42:33 -0400 +Subject: [PATCH] Fix compilation and "import torch" issues for cpython 3.14 + +Imported from +https://github.com/albanD/pytorch/tree/cpython314_build +commit 88bb9cdb72449f4277829e20d94ad8aec1894216 + +Signed-off-by: Tom Rix +--- + torch/_dynamo/bytecode_analysis.py | 2 +- + torch/ao/quantization/__init__.py | 5 +++- + torch/ao/quantization/qconfig.py | 4 ++- + torch/ao/quantization/utils.py | 7 +++-- + torch/csrc/dynamo/cpython_defs.c | 16 +++++++++++ + torch/csrc/dynamo/cpython_includes.h | 17 ++++++++++++ + torch/csrc/dynamo/eval_frame.c | 34 +++++++++++++++-------- + torch/csrc/dynamo/framelocals_mapping.cpp | 14 ++++++++++ + torch/csrc/utils/python_compat.h | 1 + + torch/onnx/__init__.py | 1 - + torch/utils/weak.py | 29 +++++++++++++++++-- + 11 files changed, 111 insertions(+), 19 deletions(-) + +diff --git a/torch/_dynamo/bytecode_analysis.py b/torch/_dynamo/bytecode_analysis.py +index 3252ea91409f..2de74ee5bf8d 100644 +--- a/torch/_dynamo/bytecode_analysis.py ++++ b/torch/_dynamo/bytecode_analysis.py +@@ -33,7 +33,7 @@ if sys.version_info >= (3, 11): + TERMINAL_OPCODES.add(dis.opmap["JUMP_FORWARD"]) + else: + TERMINAL_OPCODES.add(dis.opmap["JUMP_ABSOLUTE"]) +-if sys.version_info >= (3, 12): ++if (3, 12) <= sys.version_info < (3, 14): + TERMINAL_OPCODES.add(dis.opmap["RETURN_CONST"]) + if sys.version_info >= (3, 13): + TERMINAL_OPCODES.add(dis.opmap["JUMP_BACKWARD_NO_INTERRUPT"]) +diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py +index ffc1792fd23f..cf5a8b99a894 100644 +--- a/torch/ao/quantization/__init__.py ++++ 
b/torch/ao/quantization/__init__.py +@@ -1,5 +1,6 @@ + # mypy: allow-untyped-defs + ++import sys + from typing import Callable, Optional, Union + + import torch +@@ -33,7 +34,9 @@ from .stubs import * # noqa: F403 + + # ensure __module__ is set correctly for public APIs + ObserverOrFakeQuantize = Union[ObserverBase, FakeQuantizeBase] +-ObserverOrFakeQuantize.__module__ = "torch.ao.quantization" ++if sys.version_info < (3, 14): ++ ObserverOrFakeQuantize.__module__ = "torch.ao.quantization" ++ + for _f in [ + compare_results, + extract_results_from_loggers, +diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py +index efee5302ad42..d9a8fc78bab4 100644 +--- a/torch/ao/quantization/qconfig.py ++++ b/torch/ao/quantization/qconfig.py +@@ -1,5 +1,6 @@ + # mypy: allow-untyped-defs + import copy ++import sys + import warnings + from collections import namedtuple + from typing import Any, Optional, Union +@@ -568,7 +569,8 @@ def _assert_valid_qconfig(qconfig: Optional[QConfig], mod: torch.nn.Module) -> N + + + QConfigAny = Optional[QConfig] +-QConfigAny.__module__ = "torch.ao.quantization.qconfig" ++if sys.version_info < (3, 14): ++ QConfigAny.__module__ = "torch.ao.quantization.qconfig" + + + def _add_module_to_qconfig_obs_ctr( +diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py +index 4ac3112ec072..3b1503e01701 100644 +--- a/torch/ao/quantization/utils.py ++++ b/torch/ao/quantization/utils.py +@@ -4,6 +4,7 @@ Utils shared by different modes of quantization (eager/graph) + """ + + import functools ++import sys + import warnings + from collections import OrderedDict + from inspect import getfullargspec, signature +@@ -16,7 +17,8 @@ from torch.nn.utils.parametrize import is_parametrized + + + NodePattern = Union[tuple[Node, Node], tuple[Node, tuple[Node, Node]], Any] +-NodePattern.__module__ = "torch.ao.quantization.utils" ++if sys.version_info < (3, 14): ++ NodePattern.__module__ = "torch.ao.quantization.utils" + + # This 
is the Quantizer class instance from torch/quantization/fx/quantize.py. + # Define separately to prevent circular imports. +@@ -31,7 +33,8 @@ QuantizerCls = Any + Pattern = Union[ + Callable, tuple[Callable, Callable], tuple[Callable, tuple[Callable, Callable]], Any + ] +-Pattern.__module__ = "torch.ao.quantization.utils" ++if sys.version_info < (3, 14): ++ Pattern.__module__ = "torch.ao.quantization.utils" + + + # TODO: maybe rename this to MatchInputNode +diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c +index b68ef894aeaa..244d4165d5e8 100644 +--- a/torch/csrc/dynamo/cpython_defs.c ++++ b/torch/csrc/dynamo/cpython_defs.c +@@ -2,6 +2,20 @@ + #include + #include + ++#if IS_PYTHON_3_14_PLUS ++ ++const uint8_t* THP_PyOpcode_Caches = NULL; ++const int THP_PyOpcode_Caches_size = 0; ++ ++void ++THP_PyThreadState_PopFrame(PyThreadState *tstate, _PyInterpreterFrame * frame) ++{} ++void ++THP_PyFrame_Clear(_PyInterpreterFrame *frame) ++{} ++ ++#else ++ + #if IS_PYTHON_3_11_PLUS + + #define Py_BUILD_CORE +@@ -360,3 +374,5 @@ const uint8_t* THP_PyOpcode_Caches = NULL; + const int THP_PyOpcode_Caches_size = 0; + + #endif ++ ++#endif // IS_PYTHON_3_14_PLUS +\ No newline at end of file +diff --git a/torch/csrc/dynamo/cpython_includes.h b/torch/csrc/dynamo/cpython_includes.h +index 6b99c1d5aec8..616be16563cf 100644 +--- a/torch/csrc/dynamo/cpython_includes.h ++++ b/torch/csrc/dynamo/cpython_includes.h +@@ -21,6 +21,14 @@ + + #if IS_PYTHON_3_11_PLUS + #include ++#if IS_PYTHON_3_14_PLUS ++#include ++#include ++#endif ++#endif ++ ++#if IS_PYTHON_3_14_PLUS ++#include + #endif + + #undef Py_BUILD_CORE +@@ -30,6 +38,13 @@ + extern "C" { + #endif + ++#if IS_PYTHON_3_14_PLUS ++ ++#define F_CODE(x) (PyCodeObject*)PyStackRef_AsPyObjectBorrow(x->f_executable) ++#define PREV_INSTR(x) (x)->instr_ptr ++ ++#else ++ + #if IS_PYTHON_3_13_PLUS + #define F_CODE(x) ((PyCodeObject*)(x)->f_executable) + #define PREV_INSTR(x) (x)->instr_ptr +@@ -38,6 +53,8 @@ extern 
"C" { + #define PREV_INSTR(x) (x)->prev_instr + #endif + ++#endif // IS_PYTHON_3_14_PLUS ++ + #if IS_PYTHON_3_12_PLUS + #define FUNC(x) ((x)->f_funcobj) + #else +diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c +index f413782b2d30..72bb8839bac3 100644 +--- a/torch/csrc/dynamo/eval_frame.c ++++ b/torch/csrc/dynamo/eval_frame.c +@@ -224,17 +224,6 @@ const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) { + return PyUnicode_AsUTF8(F_CODE(frame)->co_name); + } + +-void clear_old_frame_if_python_312_plus( +- PyThreadState* tstate, +- THP_EVAL_API_FRAME_OBJECT* frame) { +-#if IS_PYTHON_3_12_PLUS +- +- THP_PyFrame_Clear(frame); +- THP_PyThreadState_PopFrame(tstate, frame); +- +-#endif +-} +- + static PyObject* dynamo_eval_custom_code_impl( + PyThreadState* tstate, + THP_EVAL_API_FRAME_OBJECT* frame, +@@ -485,6 +474,18 @@ static PyObject* dynamo__custom_eval_frame_shim( + + static void enable_eval_frame_shim(PyThreadState* tstate) {} + static void enable_eval_frame_default(PyThreadState* tstate) {} ++PyObject* dynamo_eval_custom_code( ++ PyThreadState* tstate, ++ THP_EVAL_API_FRAME_OBJECT* frame, ++ PyCodeObject* code, ++ const char* trace_annotation, ++ int throw_flag) {} ++THPPyInterpreterFrame* THPPyInterpreterFrame_New( ++ THP_EVAL_API_FRAME_OBJECT* frame) {} ++PyObject* dynamo_eval_frame_default( ++ PyThreadState* tstate, ++ THP_EVAL_API_FRAME_OBJECT* frame, ++ int throw_flag) {} + + static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; + +@@ -498,6 +499,17 @@ static PyTypeObject THPPyInterpreterFrameType = { + + #endif // !(IS_PYTHON_3_14_PLUS) + ++void clear_old_frame_if_python_312_plus( ++ PyThreadState* tstate, ++ THP_EVAL_API_FRAME_OBJECT* frame) { ++#if IS_PYTHON_3_12_PLUS ++ ++ THP_PyFrame_Clear(frame); ++ THP_PyThreadState_PopFrame(tstate, frame); ++ ++#endif ++} ++ + static PyObject* increment_working_threads( + PyThreadState* tstate, + PyObject* module) { +diff --git 
a/torch/csrc/dynamo/framelocals_mapping.cpp b/torch/csrc/dynamo/framelocals_mapping.cpp +index b839fb26fc91..c4ee36d87767 100644 +--- a/torch/csrc/dynamo/framelocals_mapping.cpp ++++ b/torch/csrc/dynamo/framelocals_mapping.cpp +@@ -26,9 +26,13 @@ FrameLocalsMapping::FrameLocalsMapping(FrameLocalsFrameType* frame) + PyCodeObject* co = F_CODE(frame); + _framelocals.resize(co->co_nlocalsplus, nullptr); + ++#if IS_PYTHON_3_14_PLUS ++ TORCH_CHECK(false, "Python 3.14+ not supported"); ++#else + if (!frame->stacktop) { + return; + } ++#endif + + auto update_framelocals = [&](int i, PyObject* value) { + _PyLocals_Kind kind = _PyLocals_GetKind(co->co_localspluskinds, i); +@@ -53,11 +57,21 @@ FrameLocalsMapping::FrameLocalsMapping(FrameLocalsFrameType* frame) + }; + + auto offset = co->co_nlocalsplus - co->co_nfreevars; ++#if IS_PYTHON_3_14_PLUS ++ TORCH_CHECK(false, "Python 3.14+ not supported"); ++#else + for (int i = 0; i < offset; i++) { + update_framelocals(i, frame->localsplus[i]); + } ++#endif ++ + // Get references to closure variables ++#if IS_PYTHON_3_14_PLUS ++ PyObject* closure; ++ TORCH_CHECK(false, "Python 3.14+ not supported"); ++#else + PyObject* closure = ((PyFunctionObject*)FUNC(frame))->func_closure; ++#endif + for (int i = 0; i < co->co_nfreevars; i++) { + update_framelocals(offset + i, PyTuple_GET_ITEM(closure, i)); + } +diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h +index a1537611cc47..16292e4fd030 100644 +--- a/torch/csrc/utils/python_compat.h ++++ b/torch/csrc/utils/python_compat.h +@@ -13,6 +13,7 @@ extern "C" { + #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 + #define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 + #define IS_PYTHON_3_14_PLUS PY_VERSION_HEX >= 0x030E0000 ++#define IS_PYTHON_3_15_PLUS PY_VERSION_HEX >= 0x030F0000 + + static inline int PyCode_GetNCellvars(PyCodeObject* code) { + // gh-26364 added co_ncellvars to Python 3.11.0rc1 +diff --git a/torch/onnx/__init__.py 
b/torch/onnx/__init__.py +index 345ffd2a065b..ceeadde5365b 100644 +--- a/torch/onnx/__init__.py ++++ b/torch/onnx/__init__.py +@@ -104,7 +104,6 @@ ONNXProgram.__module__ = "torch.onnx" + OnnxExporterError.__module__ = "torch.onnx" + _OrtBackend.__module__ = "torch.onnx" + _OrtBackendOptions.__module__ = "torch.onnx" +-_OrtExecutionProvider.__module__ = "torch.onnx" + enable_fake_mode.__module__ = "torch.onnx" + is_onnxrt_backend_supported.__module__ = "torch.onnx" + +diff --git a/torch/utils/weak.py b/torch/utils/weak.py +index 8bf2ba5ed02b..9c7218cb2ad3 100644 +--- a/torch/utils/weak.py ++++ b/torch/utils/weak.py +@@ -3,8 +3,6 @@ from __future__ import annotations + + import collections.abc as _collections_abc + import weakref +- +-from _weakrefset import _IterationGuard # type: ignore[attr-defined] + from collections.abc import Mapping, MutableMapping + from weakref import ref + +@@ -22,6 +20,33 @@ __all__ = [ + ] + + ++# TODO: make weakref properly thread safe following ++# https://github.com/python/cpython/pull/125325 ++class _IterationGuard: ++ # This context manager registers itself in the current iterators of the ++ # weak container, such as to delay all removals until the context manager ++ # exits. ++ # This technique should be relatively thread-safe (since sets are). ++ ++ def __init__(self, weakcontainer): ++ # Don't create cycles ++ self.weakcontainer = ref(weakcontainer) ++ ++ def __enter__(self): ++ w = self.weakcontainer() ++ if w is not None: ++ w._iterating.add(self) ++ return self ++ ++ def __exit__(self, e, t, b): ++ w = self.weakcontainer() ++ if w is not None: ++ s = w._iterating ++ s.remove(self) ++ if not s: ++ w._commit_removals() ++ ++ + # This file defines a variant of WeakKeyDictionary that overrides the hashing + # behavior of the key to use object identity, rather than the builtin + # __eq__/__hash__ functions. 
This is useful for Tensor weak keys, as their +-- +2.49.0 + diff --git a/next/0001-Use-horrible-dynamo-stub.patch b/next/0001-Use-horrible-dynamo-stub.patch new file mode 100644 index 0000000..1900519 --- /dev/null +++ b/next/0001-Use-horrible-dynamo-stub.patch @@ -0,0 +1,85 @@ +From fd535f7bf44f2034cca2a66b4cc7d68d962341df Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sun, 20 Jul 2025 12:47:58 -0700 +Subject: [PATCH] Use horrible dynamo stub + +Rawhide's update of python is too fast for dynamo +So paper of the problem with a horrible stub that throws +runtime exceptions if dynamo is used. + +Signed-off-by: Tom Rix +--- + build_variables.bzl | 26 ++++++++++++---------- + torch/csrc/dynamo/horrible_dynamo_stub.cpp | 16 +++++++++++++ + 2 files changed, 30 insertions(+), 12 deletions(-) + create mode 100644 torch/csrc/dynamo/horrible_dynamo_stub.cpp + +diff --git a/build_variables.bzl b/build_variables.bzl +index b266c80e8843..a3be6893349b 100644 +--- a/build_variables.bzl ++++ b/build_variables.bzl +@@ -140,7 +140,8 @@ core_trainer_sources = [ + "torch/csrc/autograd/variable.cpp", + "torch/csrc/autograd/utils/warnings.cpp", + "torch/csrc/autograd/jit_decomp_interface.cpp", +- "torch/csrc/dynamo/compiled_autograd.cpp", ++# "torch/csrc/dynamo/compiled_autograd.cpp", ++ "torch/csrc/dynamo/horrible_dynamo_stub.cpp", + "torch/csrc/jit/frontend/name_mangler.cpp", + "torch/csrc/jit/ir/type_hashing.cpp", + "torch/csrc/jit/serialization/pickler.cpp", +@@ -868,17 +869,18 @@ libtorch_python_core_sources = [ + "torch/csrc/autograd/python_torch_functions_manual.cpp", + "torch/csrc/autograd/python_variable.cpp", + "torch/csrc/autograd/python_variable_indexing.cpp", +- "torch/csrc/dynamo/python_compiled_autograd.cpp", +- "torch/csrc/dynamo/cache_entry.cpp", +- "torch/csrc/dynamo/cpp_shim.cpp", +- "torch/csrc/dynamo/cpython_defs.c", +- "torch/csrc/dynamo/eval_frame.c", +- "torch/csrc/dynamo/eval_frame_cpp.cpp", +- "torch/csrc/dynamo/extra_state.cpp", +- 
"torch/csrc/dynamo/framelocals_mapping.cpp", +- "torch/csrc/dynamo/guards.cpp", +- "torch/csrc/dynamo/utils.cpp", +- "torch/csrc/dynamo/init.cpp", ++# "torch/csrc/dynamo/python_compiled_autograd.cpp", ++# "torch/csrc/dynamo/cache_entry.cpp", ++# "torch/csrc/dynamo/cpp_shim.cpp", ++# "torch/csrc/dynamo/cpython_defs.c", ++# "torch/csrc/dynamo/eval_frame.c", ++# "torch/csrc/dynamo/eval_frame_cpp.cpp", ++# "torch/csrc/dynamo/extra_state.cpp", ++# "torch/csrc/dynamo/framelocals_mapping.cpp", ++# "torch/csrc/dynamo/guards.cpp", ++# "torch/csrc/dynamo/utils.cpp", ++# "torch/csrc/dynamo/init.cpp", ++ "torch/csrc/dynamo/horrible_dynamo_stub.cpp", + "torch/csrc/functorch/init.cpp", + "torch/csrc/fx/node.cpp", + "torch/csrc/mps/Module.cpp", +diff --git a/torch/csrc/dynamo/horrible_dynamo_stub.cpp b/torch/csrc/dynamo/horrible_dynamo_stub.cpp +new file mode 100644 +index 000000000000..3ac1324d4557 +--- /dev/null ++++ b/torch/csrc/dynamo/horrible_dynamo_stub.cpp +@@ -0,0 +1,16 @@ ++#include ++#include ++ ++namespace torch::dynamo::autograd { ++const std::unique_ptr& getPyCompilerInterface() { ++ throw std::runtime_error("Dynamo not supported"); ++ return nullptr; ++} ++std::vector> get_input_metadata( ++ const edge_list& edges) { ++ std::vector> r; ++ throw std::runtime_error("Dynamo not supported"); ++ return r; ++} ++ ++} +-- +2.49.0 + diff --git a/python-torch.spec b/python-torch.spec index 91a82a9..44c1199 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.8.0-rc6 -%global commit0 f2b69a083d15e3d0083bb304302a3fd0b5fb8705 +# v2.8.0-rc8 +%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250718 +%global date0 20250723 %global pypi_version 2.8.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 @@ -33,7 +33,11 @@ %endif # For testing distributed+rccl etc. 
+%if %{with gitcommit} +%bcond_without rccl +%else %bcond_with rccl +%endif %bcond_with gloo %bcond_without mpi %bcond_without tensorpipe @@ -103,13 +107,13 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ %if %{without gitcommit} # https://github.com/pytorch/pytorch/issues/150187 -# The hack job -# Patch11: 0001-python-torch-disable-ck.patch -# Cleaned up hack job Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch - %else +# https://github.com/pytorch/pytorch/issues/150187 Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch +# https://github.com/pytorch/pytorch/issues/156595 +# Patch12: 0001-Use-horrible-dynamo-stub.patch +Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch %endif ExclusiveArch: x86_64 aarch64 @@ -153,6 +157,9 @@ BuildRequires: python3dist(filelock) BuildRequires: python3dist(jinja2) BuildRequires: python3dist(networkx) BuildRequires: python3dist(numpy) +%if %{with gitcommit} +BuildRequires: python3dist(pip) +%endif BuildRequires: python3dist(pyyaml) BuildRequires: python3dist(setuptools) BuildRequires: python3dist(sphinx) @@ -171,6 +178,9 @@ BuildRequires: hipcub-devel BuildRequires: hipfft-devel BuildRequires: hiprand-devel BuildRequires: hipsparse-devel +%if %{with gitcommit} +BuildRequires: hipsparselt-devel +%endif BuildRequires: hipsolver-devel BuildRequires: magma-devel BuildRequires: miopen-devel @@ -190,6 +200,7 @@ BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros %if %{with gitcommit} BuildRequires: rocsolver-devel +BuildRequires: rocm-smi-devel %endif BuildRequires: rocthrust-devel BuildRequires: roctracer-devel @@ -337,6 +348,10 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unus sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-deprecated-declarations@' cmake/Dependencies.cmake # Use parallel jobs sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel-jobs=4@' cmake/Dependencies.cmake +%if 
%{with gitcommit} +# Need to link with librocm_smi64 +sed -i -e 's@hiprtc::hiprtc@hiprtc::hiprtc rocm_smi64@' cmake/Dependencies.cmake +%endif # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt @@ -590,17 +605,21 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} -%if %{?fedora} <= 43 -export PYTORCH_ROCM_ARCH="gfx1100;gfx1201" -%else export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} + +%if %{with gitcommit} +%pyproject_wheel +%else +%py3_build %endif -%py3_build - %else +%if %{with gitcommit} +%pyproject_wheel +%else %py3_build +%endif %endif @@ -617,17 +636,32 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} +%if %{with gitcommit} +%pyproject_install +%else %py3_install +%endif %else +%if %{with gitcommit} +%pyproject_install +%pyproject_save_files torch +%else %py3_install - +%endif %endif + + %check +%if %{with gitcommit} +# Not working yet +# pyproject_check_import torch +%else %py3_check_import torch +%endif # Do not remote the empty files From 6158e4810ccdce8ccbe332db1ac0b02e87dc4615 Mon Sep 17 00:00:00 2001 From: Fedora Release Engineering Date: Fri, 25 Jul 2025 10:49:07 +0000 Subject: [PATCH 69/88] Rebuilt for https://fedoraproject.org/wiki/Fedora_43_Mass_Rebuild From 72ad1f0389043a5f26735e3ca2a2a88398daba23 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 26 Jul 2025 17:15:59 -0700 Subject: [PATCH 70/88] Fix some issues with switching to pyproject macros Signed-off-by: Tom Rix --- python-torch.spec | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 44c1199..03fbf30 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -214,6 +214,7 @@ BuildRequires: google-benchmark-devel %endif Requires: python3dist(dill) +Requires: python3dist(yaml) 
Obsoletes: caffe = 1.0^git20200212.9b89154 @@ -638,6 +639,7 @@ export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %if %{with gitcommit} %pyproject_install +%pyproject_save_files '*torch*' %else %py3_install %endif @@ -646,7 +648,7 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %if %{with gitcommit} %pyproject_install -%pyproject_save_files torch +%pyproject_save_files '*torch*' %else %py3_install %endif @@ -670,10 +672,8 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %doc README.md %{_bindir}/torchrun %{_bindir}/torchfrtrace -%{python3_sitearch}/%{pypi_name} -%{python3_sitearch}/%{pypi_name}-*.egg-info +%{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch -%{python3_sitearch}/torchgen %changelog %autochangelog From cec8b79644fdf9b63ba227c506c9bccdb36b1618 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 31 Jul 2025 05:52:50 -0700 Subject: [PATCH 71/88] Update to 2.8.0-rc8 Signed-off-by: Tom Rix --- .gitignore | 3 + 0001-Add-cmake-varaible-USE_ROCM_CK.patch | 120 ------------------ ... 
0001-Add-cmake-variable-USE_ROCM_CK.patch | 0 ...and-import-torch-issues-for-cpython-.patch | 0 python-torch.spec | 59 ++------- sources | 3 + 6 files changed, 17 insertions(+), 168 deletions(-) delete mode 100644 0001-Add-cmake-varaible-USE_ROCM_CK.patch rename next/0001-Add-cmake-variable-USE_ROCM_CK.patch => 0001-Add-cmake-variable-USE_ROCM_CK.patch (100%) rename next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch => 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch (100%) diff --git a/.gitignore b/.gitignore index 25abff5..a4ed35b 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,6 @@ /pytorch-v2.5.1.tar.gz /pytorch-v2.7.0.tar.gz /v2.13.6.tar.gz +/pytorch-a1cb3cc.tar.gz +/v24.12.23.tar.gz +/kineto-5e75018.tar.gz diff --git a/0001-Add-cmake-varaible-USE_ROCM_CK.patch b/0001-Add-cmake-varaible-USE_ROCM_CK.patch deleted file mode 100644 index b34e07a..0000000 --- a/0001-Add-cmake-varaible-USE_ROCM_CK.patch +++ /dev/null @@ -1,120 +0,0 @@ -From 0f33e0a7bbd1522ee74f8fc1fbe3af7563318c79 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 28 Mar 2025 15:33:09 -0700 -Subject: [PATCH] Add cmake varaible USE_ROCM_CK - -To control the use of ROCm Composable Kernel usage. - -CK is not compatible with all rocBLAS gpu's, so the user -must explicitly choose to use CK. 
- -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - aten/src/ATen/CMakeLists.txt | 8 ++++++-- - aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- - cmake/Dependencies.cmake | 3 +++ - 4 files changed, 15 insertions(+), 7 deletions(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index f3fee2f7ffc2..73903acce452 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -249,6 +249,7 @@ cmake_dependent_option( - BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON - "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) - cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) -+cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) - option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) - cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) - cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index 085af373ec22..af268ab88572 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -361,13 +361,17 @@ endif() - ${native_quantized_hip_hip} - ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} - ) -- if(WIN32) # Windows doesn't support Composable Kernels and Triton -+ if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels and Triton - file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") - file(GLOB native_hip_ck "native/hip/ck*.hip") - exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" -- ${native_hip_bgemm} ${native_hip_ck} -+ ${native_hip_bgemm} ${native_hip_ck}) -+ endif() -+ if(WIN32) # Windows doesn't support Composable Kernels and Triton -+ exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" - ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) - endif() -+ - # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) - list(APPEND all_hip_cpp - ${native_nested_hip_cpp} -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp 
b/aten/src/ATen/cuda/CUDABlas.cpp -index a62b028fd4ff..a3dbf76848ea 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); - #endif - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); - } -@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); - } -@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); - } -@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - 
at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 30917bdf39f5..2ca6091030f1 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1046,6 +1046,9 @@ if(USE_ROCM) - if(HIPBLASLT_VEC_EXT) - list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) - endif() -+ if(USE_ROCM_CK) -+ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) -+ endif() - list(APPEND HIP_HIPCC_FLAGS --offload-compress) - if(WIN32) - add_definitions(-DROCM_ON_WINDOWS) --- -2.48.1 - diff --git a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch b/0001-Add-cmake-variable-USE_ROCM_CK.patch similarity index 100% rename from next/0001-Add-cmake-variable-USE_ROCM_CK.patch rename to 0001-Add-cmake-variable-USE_ROCM_CK.patch diff --git a/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch b/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch similarity index 100% rename from next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch rename to 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch diff --git a/python-torch.spec b/python-torch.spec index 03fbf30..1fbad8e 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -15,8 +15,11 @@ %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %else -%global pypi_version 2.7.0 -%global flatbuffers_version 23.3.3 +%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7 +%global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) +%global date0 20250723 +%global pypi_version 2.8.0 +%global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %endif @@ -33,11 +36,7 @@ %endif # For testing distributed+rccl etc. 
-%if %{with gitcommit} %bcond_without rccl -%else -%bcond_with rccl -%endif %bcond_with gloo %bcond_without mpi %bcond_without tensorpipe @@ -56,7 +55,7 @@ Name: python-%{pypi_name} %if %{with gitcommit} Version: %{pypi_version}^git%{date0}.%{shortcommit0} %else -Version: %{pypi_version} +Version: %{pypi_version}.rc8 %endif Release: %autorelease Summary: PyTorch AI/ML framework @@ -68,7 +67,8 @@ URL: https://pytorch.org/ Source0: %{forgeurl}/archive/%{commit0}/pytorch-%{shortcommit0}.tar.gz Source1000: pyproject.toml %else -Source0: %{forgeurl}/releases/download/v%{version}/pytorch-v%{version}.tar.gz +Source0: %{forgeurl}/archive/%{commit0}/pytorch-%{shortcommit0}.tar.gz +Source1000: pyproject.toml %endif Source1: https://github.com/google/flatbuffers/archive/refs/tags/v%{flatbuffers_version}.tar.gz Source2: https://github.com/pybind/pybind11/archive/refs/tags/v%{pybind11_version}.tar.gz @@ -96,25 +96,16 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- %endif %if %{without kineto} -%if %{with gitcommit} %global ki_commit 5e7501833f1021ce6f618572d3baf657b6319658 -%else -%global ki_commit be1317644c68b4bfc4646024a6b221066e430031 -%endif %global ki_scommit %(c=%{ki_commit}; echo ${c:0:7}) Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif -%if %{without gitcommit} -# https://github.com/pytorch/pytorch/issues/150187 -Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch -%else # https://github.com/pytorch/pytorch/issues/150187 Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch # https://github.com/pytorch/pytorch/issues/156595 # Patch12: 0001-Use-horrible-dynamo-stub.patch Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch -%endif ExclusiveArch: x86_64 aarch64 %global toolchain gcc @@ -157,9 +148,7 @@ BuildRequires: python3dist(filelock) BuildRequires: python3dist(jinja2) BuildRequires: python3dist(networkx) BuildRequires: python3dist(numpy) -%if %{with gitcommit} 
BuildRequires: python3dist(pip) -%endif BuildRequires: python3dist(pyyaml) BuildRequires: python3dist(setuptools) BuildRequires: python3dist(sphinx) @@ -178,9 +167,7 @@ BuildRequires: hipcub-devel BuildRequires: hipfft-devel BuildRequires: hiprand-devel BuildRequires: hipsparse-devel -%if %{with gitcommit} BuildRequires: hipsparselt-devel -%endif BuildRequires: hipsolver-devel BuildRequires: magma-devel BuildRequires: miopen-devel @@ -198,10 +185,8 @@ BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros -%if %{with gitcommit} BuildRequires: rocsolver-devel BuildRequires: rocm-smi-devel -%endif BuildRequires: rocthrust-devel BuildRequires: roctracer-devel @@ -275,7 +260,9 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release} cp %{SOURCE1000} . %else -%autosetup -p1 -n pytorch-v%{version} +%autosetup -p1 -n pytorch-%{commit0} +# Overwrite with a git checkout of the pyproject.toml +cp %{SOURCE1000} . %endif # Remove bundled egg-info @@ -349,10 +336,8 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unus sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-deprecated-declarations@' cmake/Dependencies.cmake # Use parallel jobs sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel-jobs=4@' cmake/Dependencies.cmake -%if %{with gitcommit} # Need to link with librocm_smi64 sed -i -e 's@hiprtc::hiprtc@hiprtc::hiprtc rocm_smi64@' cmake/Dependencies.cmake -%endif # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt @@ -449,9 +434,7 @@ mv googletest third_party # # Fake out pocketfft, and system header will be used mkdir third_party/pocketfft -%if %{with gitcommit} cp /usr/include/pocketfft_hdronly.h third_party/pocketfft/ -%endif # # Use the system valgrind headers @@ -608,19 +591,11 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode export HIP_CLANG_PATH=%{rocmllvm_bindir} 
export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%if %{with gitcommit} %pyproject_wheel -%else -%py3_build -%endif %else -%if %{with gitcommit} %pyproject_wheel -%else -%py3_build -%endif %endif @@ -637,33 +612,21 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%if %{with gitcommit} %pyproject_install %pyproject_save_files '*torch*' -%else -%py3_install -%endif %else -%if %{with gitcommit} %pyproject_install %pyproject_save_files '*torch*' -%else -%py3_install -%endif %endif %check -%if %{with gitcommit} # Not working yet # pyproject_check_import torch -%else -%py3_check_import torch -%endif # Do not remote the empty files diff --git a/sources b/sources index 4021d40..c7eae22 100644 --- a/sources +++ b/sources @@ -7,3 +7,6 @@ SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3 SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7 SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a +SHA512 (pytorch-a1cb3cc.tar.gz) = 92bf8b2c2ef0b459406b60169ecebdc50652c75943e3d6087e4d261f6e308dbad365529561e0f07ea3f0b71790efb68b5e4ab2f44e270462097208d924dc2d95 +SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185c9ef602294b67d73aea7ae371035579a1419cbfbeba7c3e88b31b5a5848db98f5e8a03b982b1 +SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 From eaa9838b3ced49bdfa6d8c094d4e72cbb2406ec4 Mon Sep 17 00:00:00 2001 From: 
Tom Rix Date: Mon, 4 Aug 2025 16:23:41 -0700 Subject: [PATCH 72/88] Change a couple cmake mins Signed-off-by: Tom Rix --- python-torch.spec | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python-torch.spec b/python-torch.spec index 1fbad8e..e3cfb6d 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -449,6 +449,11 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@#list(APPEND Caffe2 # cmake version changed sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' third_party/tensorpipe/third_party/libuv/CMakeLists.txt +sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' libuv*/CMakeLists.txt +%if %{without opentelemtry} +sed -i -e 's@cmake_minimum_required(VERSION 3.1)@cmake_minimum_required(VERSION 3.5)@' third_party/opentelemetry-cpp/CMakeLists.txt +%endif + %if %{with rocm} # hipify From d67e1e127ac97f1557e8c2ea8b392318ded073f0 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 8 Aug 2025 14:00:11 -0700 Subject: [PATCH 73/88] Update to 2.8.0 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 16 +++++----------- sources | 1 + 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index a4ed35b..7832594 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,4 @@ /pytorch-a1cb3cc.tar.gz /v24.12.23.tar.gz /kineto-5e75018.tar.gz +/pytorch-v2.8.0.tar.gz diff --git a/python-torch.spec b/python-torch.spec index e3cfb6d..442480f 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -15,9 +15,6 @@ %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %else -%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7 -%global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250723 %global pypi_version 2.8.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 @@ -55,7 +52,7 @@ Name: python-%{pypi_name} %if %{with gitcommit} Version: %{pypi_version}^git%{date0}.%{shortcommit0} %else -Version: %{pypi_version}.rc8 
+Version: %{pypi_version} %endif Release: %autorelease Summary: PyTorch AI/ML framework @@ -67,8 +64,7 @@ URL: https://pytorch.org/ Source0: %{forgeurl}/archive/%{commit0}/pytorch-%{shortcommit0}.tar.gz Source1000: pyproject.toml %else -Source0: %{forgeurl}/archive/%{commit0}/pytorch-%{shortcommit0}.tar.gz -Source1000: pyproject.toml +Source0: %{forgeurl}/releases/download/v%{version}/pytorch-v%{version}.tar.gz %endif Source1: https://github.com/google/flatbuffers/archive/refs/tags/v%{flatbuffers_version}.tar.gz Source2: https://github.com/pybind/pybind11/archive/refs/tags/v%{pybind11_version}.tar.gz @@ -260,9 +256,7 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release} cp %{SOURCE1000} . %else -%autosetup -p1 -n pytorch-%{commit0} -# Overwrite with a git checkout of the pyproject.toml -cp %{SOURCE1000} . +%autosetup -p1 -n pytorch-v%{version} %endif # Remove bundled egg-info @@ -310,8 +304,8 @@ rm -rf third_party/kineto/* cp -r kineto-*/* third_party/kineto/ %endif -# hipblaslt only building with gfx90a -sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a"@' aten/src/ATen/native/cuda/Blas.cpp +# Adjust for the hipblaslt's we build +sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a", "gfx1103", "gfx1150", "gfx1151", "gfx1100", "gfx1101", "gfx1200", "gfx1201"@' aten/src/ATen/native/cuda/Blas.cpp %if 0%{?rhel} # In RHEL but too old diff --git a/sources b/sources index c7eae22..335b8a8 100644 --- a/sources +++ b/sources @@ -10,3 +10,4 @@ SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b SHA512 (pytorch-a1cb3cc.tar.gz) = 92bf8b2c2ef0b459406b60169ecebdc50652c75943e3d6087e4d261f6e308dbad365529561e0f07ea3f0b71790efb68b5e4ab2f44e270462097208d924dc2d95 SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185c9ef602294b67d73aea7ae371035579a1419cbfbeba7c3e88b31b5a5848db98f5e8a03b982b1 SHA512 (kineto-5e75018.tar.gz) = 
921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 +SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 From 1b986b49932fabd64e4dad05277398d9629a57d3 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 14 Aug 2025 09:10:18 -0700 Subject: [PATCH 74/88] Build on EPEL Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 57 ++++++++++++++++++++++++++++++++++------------- sources | 1 + 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 7832594..5fda907 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,4 @@ /v24.12.23.tar.gz /kineto-5e75018.tar.gz /pytorch-v2.8.0.tar.gz +/v1.18.0.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 442480f..6034593 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -48,6 +48,12 @@ %bcond_with httplib %bcond_with kineto +%if 0%{?fedora} +%bcond_without onnx +%else +%bcond_with onnx +%endif + Name: python-%{pypi_name} %if %{with gitcommit} Version: %{pypi_version}^git%{date0}.%{shortcommit0} @@ -97,6 +103,11 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif +%if %{without onnx} +%global ox_ver 1.18.0 +Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz +%endif + # https://github.com/pytorch/pytorch/issues/150187 Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch # https://github.com/pytorch/pytorch/issues/156595 @@ -123,7 +134,9 @@ BuildRequires: json-devel BuildRequires: libomp-devel BuildRequires: numactl-devel BuildRequires: ninja-build +%if %{with onnx} BuildRequires: onnx-devel +%endif %if %{with mpi} BuildRequires: openmpi-devel %endif @@ -304,6 +317,12 @@ rm -rf third_party/kineto/* cp -r kineto-*/* third_party/kineto/ %endif 
+%if %{without onnx} +tar xf %{SOURCE90} +rm -rf third_party/onnx/* +cp -r onnx-*/* third_party/onnx/ +%endif + # Adjust for the hipblaslt's we build sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a", "gfx1103", "gfx1150", "gfx1151", "gfx1100", "gfx1101", "gfx1200", "gfx1201"@' aten/src/ATen/native/cuda/Blas.cpp @@ -393,6 +412,10 @@ mv third_party/cpp-httplib . mv third_party/kineto . %endif +%if %{without onnx} +mv third_party/onnx . +%endif + %if %{with test} mv third_party/googletest . %endif @@ -421,6 +444,10 @@ mv cpp-httplib third_party mv kineto third_party %endif +%if %{without onnx} +mv onnx third_party +%endif + %if %{with test} mv googletest third_party %endif @@ -448,7 +475,6 @@ sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION sed -i -e 's@cmake_minimum_required(VERSION 3.1)@cmake_minimum_required(VERSION 3.5)@' third_party/opentelemetry-cpp/CMakeLists.txt %endif - %if %{with rocm} # hipify ./tools/amd_build/build_amd.py @@ -534,7 +560,9 @@ export USE_PYTORCH_QNNPACK=OFF export USE_ROCM=OFF export USE_SYSTEM_SLEEF=ON export USE_SYSTEM_EIGEN_INSTALL=ON +%if %{with onnx} export USE_SYSTEM_ONNX=ON +%endif export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF export USE_SYSTEM_NCCL=OFF @@ -575,7 +603,6 @@ export BUILD_TEST=ON # # See BZ 2244862 - %if %{with rocm} export USE_ROCM=ON @@ -590,14 +617,15 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%pyproject_wheel - -%else - -%pyproject_wheel - %endif +%if 0%{?fedora} +%pyproject_wheel +%else +%py3_build +%endif + + %install %if %{with rocm} @@ -611,16 +639,15 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%pyproject_install -%pyproject_save_files '*torch*' - -%else - -%pyproject_install -%pyproject_save_files '*torch*' %endif +%if 
0%{?fedora} +%pyproject_install +%pyproject_save_files '*torch*' +%else +%py3_install +%endif %check diff --git a/sources b/sources index 335b8a8..5c15ab4 100644 --- a/sources +++ b/sources @@ -11,3 +11,4 @@ SHA512 (pytorch-a1cb3cc.tar.gz) = 92bf8b2c2ef0b459406b60169ecebdc50652c75943e3d6 SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185c9ef602294b67d73aea7ae371035579a1419cbfbeba7c3e88b31b5a5848db98f5e8a03b982b1 SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 +SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d From a6dcc4b8d8fd8903c09cc93f87a6bc2aff357bdf Mon Sep 17 00:00:00 2001 From: Python Maint Date: Fri, 15 Aug 2025 15:02:18 +0200 Subject: [PATCH 75/88] Rebuilt for Python 3.14.0rc2 bytecode From 95f1f6fe22c05bf4a53479100fdfbaf48e1e90c3 Mon Sep 17 00:00:00 2001 From: Python Maint Date: Fri, 19 Sep 2025 14:37:44 +0200 Subject: [PATCH 76/88] Rebuilt for Python 3.14.0rc3 bytecode From 89daf765fd18ce8d6ecf40a8c6f8fe275c542091 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 26 Sep 2025 14:24:46 -0700 Subject: [PATCH 77/88] Disable magma Magma is broken on ROCm 7. 
Signed-off-by: Tom Rix --- python-torch.spec | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 6034593..210aee2 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -178,7 +178,8 @@ BuildRequires: hiprand-devel BuildRequires: hipsparse-devel BuildRequires: hipsparselt-devel BuildRequires: hipsolver-devel -BuildRequires: magma-devel +# Magma is broken on ROCm 7 +# BuildRequires: magma-devel BuildRequires: miopen-devel BuildRequires: rocblas-devel BuildRequires: rocrand-devel @@ -607,7 +608,8 @@ export BUILD_TEST=ON export USE_ROCM=ON export USE_ROCM_CK=OFF -export USE_MAGMA=ON +# Magma is broken on ROCm 7 +# export USE_MAGMA=ON export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` From f29cac5d83e262d22372de0576f7e89b85abab3e Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 30 Sep 2025 07:35:43 -0700 Subject: [PATCH 78/88] Update to 2.9.0-rc4 Work around ROCm 7 build issue in 2.8.0 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 34 ++++++++++++++++++++++++++++------ sources | 1 + 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 5fda907..0986c30 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ /kineto-5e75018.tar.gz /pytorch-v2.8.0.tar.gz /v1.18.0.tar.gz +/pytorch-715dca6.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 210aee2..e25a665 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -4,13 +4,13 @@ %global forgeurl https://github.com/pytorch/pytorch # So pre releases can be tried -%bcond_with gitcommit +%bcond_without gitcommit %if %{with gitcommit} -# v2.8.0-rc8 -%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7 +# v2.9.0-rc4 +%global commit0 715dca672526a20322d07c2e67772cfe4400a20f %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250723 -%global pypi_version 2.8.0 +%global date0 20250923 +%global 
pypi_version 2.9.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 @@ -108,11 +108,13 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz %endif +%if %{with gitcommit} +%else # https://github.com/pytorch/pytorch/issues/150187 Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch # https://github.com/pytorch/pytorch/issues/156595 -# Patch12: 0001-Use-horrible-dynamo-stub.patch Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch +%endif ExclusiveArch: x86_64 aarch64 %global toolchain gcc @@ -200,6 +202,10 @@ BuildRequires: rocm-smi-devel BuildRequires: rocthrust-devel BuildRequires: roctracer-devel +%if %{with gitcommit} +BuildRequires: moodycamel-concurrentqueue-devel +%endif + Requires: amdsmi %endif @@ -492,6 +498,13 @@ sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake # silence an assert # sed -i -e '/qvalue = std::clamp(qvalue, qmin, qmax);/d' aten/src/ATen/native/cuda/IndexKernel.cu +%endif + +%if %{with gitcommit} +# moodycamel include path needs adjusting to use the system's +sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/concurrentqueue@' cmake/Dependencies.cmake + + %endif %build @@ -607,7 +620,14 @@ export BUILD_TEST=ON %if %{with rocm} export USE_ROCM=ON +%if %{with gitcommit} +export USE_ROCM_CK_SDPA=OFF +export USE_ROCM_CK_GEMM=OFF +export USE_FBGEMM_GENAI=OFF +%else export USE_ROCM_CK=OFF +%endif + # Magma is broken on ROCm 7 # export USE_MAGMA=ON export HIP_PATH=`hipconfig -p` @@ -662,7 +682,9 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %license LICENSE %doc README.md %{_bindir}/torchrun +%if %{without gitcommit} %{_bindir}/torchfrtrace +%endif %{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch diff --git a/sources b/sources index 5c15ab4..0fdf299 100644 --- a/sources +++ b/sources @@ -12,3 +12,4 @@ SHA512 (v24.12.23.tar.gz) 
= f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185 SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d +SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 From 1509cbcd60c545c5b19d00033c4538f6763bf4cc Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 6 Oct 2025 14:27:41 -0700 Subject: [PATCH 79/88] Update to 2.9.0-rc6 aarch64 is not building, so disable. Signed-off-by: Tom Rix --- .gitignore | 1 + pyproject.toml | 139 ++++++++++++++++++++++++++++++++++------------ python-torch.spec | 11 ++-- sources | 1 + 4 files changed, 112 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index 0986c30..2918194 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ /pytorch-v2.8.0.tar.gz /v1.18.0.tar.gz /pytorch-715dca6.tar.gz +/pytorch-fd36458.tar.gz diff --git a/pyproject.toml b/pyproject.toml index ccf9c2a..925742b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,49 +1,105 @@ +# Package ###################################################################### + +[build-system] +requires = [ + # 70.1.0: min version for integrated bdist_wheel command from wheel package + # 77.0.0: min version for SPDX expression support for project.license + "setuptools>=70.1.0,<80.0", + "cmake>=3.27", + "ninja", + "numpy", + "packaging", + "pyyaml", + "requests", + "six", # dependency chain: NNPACK -> PeachPy -> six + "typing-extensions>=4.10.0", +] +build-backend = "setuptools.build_meta" + +[dependency-groups] +dev = [ + # This list 
should be kept in sync with the requirements-build.txt + # in PyTorch root until the project fully migrates to pyproject.toml + # after which this can be removed as it is already specified in the + # [build-system] section + "setuptools>=70.1.0,<80.0", # setuptools develop deprecated on 80.0 + "cmake>=3.27", + "ninja", + "numpy", + "packaging", + "pyyaml", + "requests", + "six", # dependency chain: NNPACK -> PeachPy -> six + "typing-extensions>=4.10.0", + + # This list should be kept in sync with the requirements.txt in + # PyTorch root until the project fully migrates to pyproject.toml + "build[uv]", + "expecttest>=0.3.0", + "filelock", + "fsspec>=0.8.5", + "hypothesis", + "jinja2", + "lintrunner; platform_machine != 's390x' and platform_machine != 'riscv64'", + "networkx>=2.5.1", + "optree>=0.13.0", + "psutil", + "sympy>=1.13.3", + "typing-extensions>=4.13.2", + "wheel", +] + [project] name = "torch" -requires-python = ">=3.9" -license = {text = "BSD-3-Clause"} +description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" +readme = "README.md" +requires-python = ">=3.10" +# TODO: change to `license = "BSD-3-Clause"` and enable PEP 639 after pinning setuptools>=77 +# FIXME: As of 2025.06.20, it is hard to ensure the minimum version of setuptools in our CI environment. +# TOML-table-based license deprecated in setuptools>=77, and the deprecation warning will be changed +# to an error on 2026.02.18. 
See also: https://github.com/pypa/setuptools/issues/4903 +license = { text = "BSD-3-Clause" } +authors = [{ name = "PyTorch Team", email = "packages@pytorch.org" }] +keywords = ["pytorch", "machine learning"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: C++", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] dynamic = [ - "authors", - "classifiers", "entry-points", "dependencies", - "description", - "keywords", - "optional-dependencies", - "readme", "scripts", "version", ] [project.urls] -Homepage = "https://pytorch.org/" -Documentation = "https://pytorch.org/docs/" -Source = "https://github.com/pytorch/pytorch" -Forum = "https://discuss.pytorch.org/" +Homepage = "https://pytorch.org" +Repository = "https://github.com/pytorch/pytorch" +Documentation = "https://pytorch.org/docs" +"Issue Tracker" = "https://github.com/pytorch/pytorch/issues" +Forum = "https://discuss.pytorch.org" +[project.optional-dependencies] +optree = ["optree>=0.13.0"] +opt-einsum = ["opt-einsum>=3.3"] +pyyaml = ["pyyaml"] -[build-system] -requires = [ - # After 75.8.2 dropped dep disttools API. Please fix - # API temporarily restored and shim used. 
Please fix - # Setuptools will drop support for setup.py past 80 - # min version for recursive glob package data support - "setuptools>=62.3.0,<80.0", - "wheel", - "astunparse", - "numpy", - "ninja", - "pyyaml", - "cmake", - "typing-extensions>=4.10.0", - "requests", -] -# Use legacy backend to import local packages in setup.py -build-backend = "setuptools.build_meta:__legacy__" - - -[tool.black] -line-length = 88 +# Linter tools ################################################################# [tool.isort] src_paths = ["caffe2", "torch", "torchgen", "functorch", "test"] @@ -59,12 +115,10 @@ multi_line_output = 3 include_trailing_comma = true combine_as_imports = true - [tool.usort.known] first_party = ["caffe2", "torch", "torchgen", "functorch", "test"] standard_library = ["typing_extensions"] - [tool.ruff] line-length = 88 src = ["caffe2", "torch", "torchgen", "functorch", "test"] @@ -105,6 +159,7 @@ ignore = [ "E741", "EXE001", "F405", + "FURB122", # writelines # these ignores are from flake8-logging-format; please fix! "G101", # these ignores are from ruff NPY; please fix! @@ -127,7 +182,15 @@ ignore = [ "SIM117", "SIM118", "UP007", # keep-runtime-typing + "UP045", # keep-runtime-typing "TC006", + # TODO: Remove Python-3.10 specific suppressions + "B905", + "UP035", + "UP036", + "UP038", + "UP041", + "FURB161", ] select = [ "B", @@ -208,6 +271,10 @@ select = [ "YTT", ] +[tool.ruff.lint.pyupgrade] +# Preserve types, even if a file imports `from __future__ import annotations`. 
+keep-runtime-typing = true + [tool.ruff.lint.per-file-ignores] "__init__.py" = [ "F401", diff --git a/python-torch.spec b/python-torch.spec index e25a665..3dad39d 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,14 +6,15 @@ # So pre releases can be tried %bcond_without gitcommit %if %{with gitcommit} -# v2.9.0-rc4 -%global commit0 715dca672526a20322d07c2e67772cfe4400a20f +# v2.9.0-rc6 +%global commit0 fd364580a94079854f2f32d463c118afaefe62e0 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250923 +%global date0 20251002 %global pypi_version 2.9.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 +%global rc_tag -rc6 %else %global pypi_version 2.8.0 %global flatbuffers_version 24.12.23 @@ -116,7 +117,9 @@ Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch %endif -ExclusiveArch: x86_64 aarch64 +# ExclusiveArch: x86_64 aarch64 +# aarch64 not building on 2.9.0-rc6 +ExclusiveArch: x86_64 %global toolchain gcc %global _lto_cflags %nil diff --git a/sources b/sources index 0fdf299..701e6b4 100644 --- a/sources +++ b/sources @@ -13,3 +13,4 @@ SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab415 SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 +SHA512 (pytorch-fd36458.tar.gz) = acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc From b615a5f89b4a35fc47b600ed1c75ebd5e3a21863 Mon Sep 17 00:00:00 2001 
From: Tom Rix Date: Mon, 13 Oct 2025 10:17:32 -0700 Subject: [PATCH 80/88] Update to 2.9.0-rc9 Introduce pytorch-rpm-macros package. Add %pytorch_arches to the macros, set to aarch64 and x86_64 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 43 ++++++++++++++++++++++++++++++++----------- sources | 1 + 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 2918194..2dab732 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ /v1.18.0.tar.gz /pytorch-715dca6.tar.gz /pytorch-fd36458.tar.gz +/pytorch-0fabc3b.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 3dad39d..f7f7f0c 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,15 +6,15 @@ # So pre releases can be tried %bcond_without gitcommit %if %{with gitcommit} -# v2.9.0-rc6 -%global commit0 fd364580a94079854f2f32d463c118afaefe62e0 +# v2.9.0-rc9 +%global commit0 0fabc3ba44823f257e70ce397d989c8de5e362c1 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20251002 +%global date0 20251008 %global pypi_version 2.9.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 -%global rc_tag -rc6 +%global rc_tag -rc9 %else %global pypi_version 2.8.0 %global flatbuffers_version 24.12.23 @@ -117,9 +117,13 @@ Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch %endif -# ExclusiveArch: x86_64 aarch64 -# aarch64 not building on 2.9.0-rc6 -ExclusiveArch: x86_64 +%if 0%{?fedora} >= 45 +# drop aarch64 in 45 +%global pt_arches x86_64 +%else +%global pt_arches x86_64 aarch64 +%endif +ExclusiveArch: %pt_arches %global toolchain gcc %global _lto_cflags %nil @@ -137,6 +141,9 @@ BuildRequires: gloo-devel BuildRequires: json-devel BuildRequires: libomp-devel +%if %{with gitcommit} +BuildRequires: moodycamel-concurrentqueue-devel +%endif BuildRequires: numactl-devel BuildRequires: ninja-build %if %{with onnx} @@ -205,10 +212,6 @@ 
BuildRequires: rocm-smi-devel BuildRequires: rocthrust-devel BuildRequires: roctracer-devel -%if %{with gitcommit} -BuildRequires: moodycamel-concurrentqueue-devel -%endif - Requires: amdsmi %endif @@ -261,6 +264,14 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. +%package -n pytorch-rpm-macros +Summary: PyTorch RPM macros +BuildArch: noarch + +%description -n pytorch-rpm-macros +This package contains PyTorch related RPM macros. + + %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -512,6 +523,9 @@ sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/conc %build +# Export the arches +echo "%%pytorch_arches %pt_arches" > macros.pytorch + # # Control the number of jobs # @@ -653,6 +667,10 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %install +# pytorch rpm macros +install -Dpm 644 macros.pytorch \ + %{buildroot}%{_rpmmacrodir}/macros.pytorch + %if %{with rocm} export USE_ROCM=ON export USE_ROCM_CK=OFF @@ -691,6 +709,9 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch +%files -n pytorch-rpm-macros +%{_rpmmacrodir}/macros.pytorch + %changelog %autochangelog diff --git a/sources b/sources index 701e6b4..89e2a95 100644 --- a/sources +++ b/sources @@ -14,3 +14,4 @@ SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60 SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 SHA512 (pytorch-fd36458.tar.gz) = 
acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc +SHA512 (pytorch-0fabc3b.tar.gz) = 2e87975de0bf6f3dcede168b379e1928712bca16170c2a8ee7d63459f53086c01baac05e0763e4d5d28cdaf1c7d8912225ee06adeff96ead4f6f456ee174b341 From ef01976cf4687ab7bce860c23297cf9b6d105940 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 15 Oct 2025 16:27:01 -0700 Subject: [PATCH 81/88] Update to 2.9.0 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 25 ++----------------------- sources | 1 + 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 2dab732..444b9ca 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,4 @@ /pytorch-715dca6.tar.gz /pytorch-fd36458.tar.gz /pytorch-0fabc3b.tar.gz +/pytorch-v2.9.0.tar.gz diff --git a/python-torch.spec b/python-torch.spec index f7f7f0c..0fd7ebd 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -4,7 +4,7 @@ %global forgeurl https://github.com/pytorch/pytorch # So pre releases can be tried -%bcond_without gitcommit +%bcond_with gitcommit %if %{with gitcommit} # v2.9.0-rc9 %global commit0 0fabc3ba44823f257e70ce397d989c8de5e362c1 @@ -16,7 +16,7 @@ %global pybind11_version 2.13.6 %global rc_tag -rc9 %else -%global pypi_version 2.8.0 +%global pypi_version 2.9.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 @@ -109,14 +109,6 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz %endif -%if %{with gitcommit} -%else -# https://github.com/pytorch/pytorch/issues/150187 -Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch -# https://github.com/pytorch/pytorch/issues/156595 -Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch -%endif - %if 0%{?fedora} >= 45 # drop aarch64 in 45 %global pt_arches x86_64 @@ -141,9 +133,7 @@ BuildRequires: gloo-devel 
BuildRequires: json-devel BuildRequires: libomp-devel -%if %{with gitcommit} BuildRequires: moodycamel-concurrentqueue-devel -%endif BuildRequires: numactl-devel BuildRequires: ninja-build %if %{with onnx} @@ -514,13 +504,9 @@ sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake %endif -%if %{with gitcommit} # moodycamel include path needs adjusting to use the system's sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/concurrentqueue@' cmake/Dependencies.cmake - -%endif - %build # Export the arches @@ -637,13 +623,9 @@ export BUILD_TEST=ON %if %{with rocm} export USE_ROCM=ON -%if %{with gitcommit} export USE_ROCM_CK_SDPA=OFF export USE_ROCM_CK_GEMM=OFF export USE_FBGEMM_GENAI=OFF -%else -export USE_ROCM_CK=OFF -%endif # Magma is broken on ROCm 7 # export USE_MAGMA=ON @@ -703,9 +685,6 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %license LICENSE %doc README.md %{_bindir}/torchrun -%if %{without gitcommit} -%{_bindir}/torchfrtrace -%endif %{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch diff --git a/sources b/sources index 89e2a95..05c1f96 100644 --- a/sources +++ b/sources @@ -15,3 +15,4 @@ SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e7 SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 SHA512 (pytorch-fd36458.tar.gz) = acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc SHA512 (pytorch-0fabc3b.tar.gz) = 2e87975de0bf6f3dcede168b379e1928712bca16170c2a8ee7d63459f53086c01baac05e0763e4d5d28cdaf1c7d8912225ee06adeff96ead4f6f456ee174b341 +SHA512 (pytorch-v2.9.0.tar.gz) = ae989e3a7fe30f9ea90944dc25e21ca92f2a94ee40d8de974a168c292d82c16ee8920624eff91a85755469ad05473dce0f85893e3ed7794ec5c6bdd89cbd2023 From 741c412249afb0dabfb37cdc6f3bbd05ce3ec176 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 17 Oct 
2025 10:05:50 -0700 Subject: [PATCH 82/88] Remove pytorch-rpm-macros package. This does not work when building on a general arch Signed-off-by: Tom Rix --- python-torch.spec | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 0fd7ebd..b623dc6 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -254,14 +254,6 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. -%package -n pytorch-rpm-macros -Summary: PyTorch RPM macros -BuildArch: noarch - -%description -n pytorch-rpm-macros -This package contains PyTorch related RPM macros. - - %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -510,7 +502,7 @@ sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/conc %build # Export the arches -echo "%%pytorch_arches %pt_arches" > macros.pytorch +# echo "%%pytorch_arches %pt_arches" > macros.pytorch # # Control the number of jobs @@ -650,8 +642,8 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %install # pytorch rpm macros -install -Dpm 644 macros.pytorch \ - %{buildroot}%{_rpmmacrodir}/macros.pytorch +# install -Dpm 644 macros.pytorch \ +# %{buildroot}%{_rpmmacrodir}/macros.pytorch %if %{with rocm} export USE_ROCM=ON @@ -688,9 +680,6 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch -%files -n pytorch-rpm-macros -%{_rpmmacrodir}/macros.pytorch - %changelog %autochangelog From e0030b3ec55af789d1db27c223bb58a4c102498b Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 17 Nov 2025 14:11:32 -0800 Subject: [PATCH 83/88] Rebuild for ROCm 7.1 Signed-off-by: Tom Rix From b3977567d226ae906a3f253b8e782d47373dd578 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 24 Nov 2025 07:03:31 -0800 Subject: [PATCH 84/88] Always include onnx src Signed-off-by: Tom Rix 
--- python-torch.spec | 2 -- 1 file changed, 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index b623dc6..7bd82df 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -104,10 +104,8 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif -%if %{without onnx} %global ox_ver 1.18.0 Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz -%endif %if 0%{?fedora} >= 45 # drop aarch64 in 45 From 7908450a47cf1c4210b8ab7ab0132587580e1d72 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 18 Dec 2025 13:52:00 -0800 Subject: [PATCH 85/88] Update to 2.9.1 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 2 +- sources | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 444b9ca..c424df5 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ /pytorch-fd36458.tar.gz /pytorch-0fabc3b.tar.gz /pytorch-v2.9.0.tar.gz +/pytorch-v2.9.1.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 7bd82df..e493b97 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -16,7 +16,7 @@ %global pybind11_version 2.13.6 %global rc_tag -rc9 %else -%global pypi_version 2.9.0 +%global pypi_version 2.9.1 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 diff --git a/sources b/sources index 05c1f96..9a3681f 100644 --- a/sources +++ b/sources @@ -16,3 +16,4 @@ SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2c SHA512 (pytorch-fd36458.tar.gz) = acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc SHA512 (pytorch-0fabc3b.tar.gz) = 2e87975de0bf6f3dcede168b379e1928712bca16170c2a8ee7d63459f53086c01baac05e0763e4d5d28cdaf1c7d8912225ee06adeff96ead4f6f456ee174b341 SHA512 (pytorch-v2.9.0.tar.gz) = 
ae989e3a7fe30f9ea90944dc25e21ca92f2a94ee40d8de974a168c292d82c16ee8920624eff91a85755469ad05473dce0f85893e3ed7794ec5c6bdd89cbd2023 +SHA512 (pytorch-v2.9.1.tar.gz) = 88de0289fa2760abd69bef505b5ae3b6d7ff176b415cbb31bbc89ce5476a3800b322a97c4490f270f8b89657aff931bf9a5516202b268e0bb8b1f63dbb87b34a From 7ddebb112b1931a495e78fba092b77e4f91022df Mon Sep 17 00:00:00 2001 From: "Alexander F. Lent" Date: Thu, 18 Dec 2025 09:08:09 -0500 Subject: [PATCH 86/88] Improve build times on non-x86 systems Signed-off-by: Alexander F. Lent --- python-torch.spec | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python-torch.spec b/python-torch.spec index e493b97..abacce9 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -506,10 +506,15 @@ sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/conc # Control the number of jobs # # The build can fail if too many threads exceed the physical memory -# So count core and and memory and increase the build memory util the build succeeds +# Run at least one thread, more if CPU & memory resources are available. # +%ifarch x86_64 # Real cores, No hyperthreading COMPILE_JOBS=`cat /proc/cpuinfo | grep -m 1 'cpu cores' | awk '{ print $4 }'` +%else +# cpuinfo format varies on other arches, fall back to nproc +COMPILE_JOBS=`nproc` +%endif if [ ${COMPILE_JOBS}x = x ]; then COMPILE_JOBS=1 fi From 767d576d1df8b3041f11c86243bcf3503a6d559a Mon Sep 17 00:00:00 2001 From: "Alexander F. Lent" Date: Sat, 20 Dec 2025 21:24:48 -0500 Subject: [PATCH 87/88] Continue to support aarch64 with myself maintaining Signed-off-by: Alexander F. 
Lent --- python-torch.spec | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index abacce9..640adb3 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -107,12 +107,7 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ %global ox_ver 1.18.0 Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz -%if 0%{?fedora} >= 45 -# drop aarch64 in 45 -%global pt_arches x86_64 -%else %global pt_arches x86_64 aarch64 -%endif ExclusiveArch: %pt_arches %global toolchain gcc %global _lto_cflags %nil From 294accd75d5a894c11d48bbea7a5d2dd70bebd15 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 12 Jan 2026 16:38:15 -0800 Subject: [PATCH 88/88] Fix hip device lib path is no longer needed Signed-off-by: Tom Rix --- python-torch.spec | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 640adb3..d3c31d7 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -621,8 +621,8 @@ export USE_FBGEMM_GENAI=OFF # export USE_MAGMA=ON export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` -RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` -export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode +#RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` +#export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} @@ -648,8 +648,8 @@ export USE_ROCM=ON export USE_ROCM_CK=OFF export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` -RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` -export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode +# RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` +# export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir}