From 9c39544f2ccbc19b4a11c0c0709e3313d5711cf0 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 17 Feb 2025 05:15:28 -0800 Subject: [PATCH 01/38] Remove rocm loop Signed-off-by: Tom Rix --- python-torch.spec | 64 ++--------------------------------------------- 1 file changed, 2 insertions(+), 62 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 86794c5..7d6d26e 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -25,9 +25,6 @@ %ifarch x86_64 %bcond_without rocm %endif -%bcond_with rocm_loop -%global rocm_default_gpu default -%global rocm_gpu_list gfx9 # For testing distributed+rccl etc. %bcond_with rccl @@ -180,12 +177,10 @@ BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros -BuildRequires: rocm-rpm-macros-modules BuildRequires: rocthrust-devel BuildRequires: roctracer-devel Requires: amdsmi -Requires: rocm-rpm-macros-modules %endif @@ -236,15 +231,6 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. -%if %{with rocm_loop} -%package -n python3-%{pypi_name}-rocm-gfx9 -Summary: %{name} for ROCm gfx9 - -%description -n python3-%{pypi_name}-rocm-gfx9 -%{summary} - -%endif - %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -567,24 +553,8 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} - -gpu=%{rocm_default_gpu} -module load rocm/$gpu -export PYTORCH_ROCM_ARCH=$ROCM_GPUS +export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %py3_build -mv build build-${gpu} -module purge - -%if %{with rocm_loop} -for gpu in %{rocm_gpu_list} -do - module load rocm/$gpu - export PYTORCH_ROCM_ARCH=$ROCM_GPUS - %py3_build - mv build build-${gpu} - module purge -done -%endif %else @@ -603,28 +573,8 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} - -gpu=%{rocm_default_gpu} -module load rocm/$gpu -export PYTORCH_ROCM_ARCH=$ROCM_GPUS -mv build-${gpu} build +export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %py3_install -mv build build-${gpu} -module purge - -%if %{with rocm_loop} -for gpu in %{rocm_gpu_list} -do - module load rocm/$gpu - export PYTORCH_ROCM_ARCH=$ROCM_GPUS - mv build-${gpu} build - # need to customize the install location, so replace py3_install - %{__python3} %{py_setup} %{?py_setup_args} install -O1 --skip-build --root %{buildroot} --prefix /usr/lib64/rocm/${gpu} %{?*} - rm -rfv %{buildroot}/usr/lib/rocm/${gpu}/bin/__pycache__ - mv build build-${gpu} - module purge -done -%endif %else @@ -650,16 +600,6 @@ done %{python3_sitearch}/functorch %{python3_sitearch}/torchgen -%if %{with rocm} -%if %{with rocm_loop} - -%files -n python3-%{pypi_name}-rocm-gfx9 -%{_libdir}/rocm/gfx9/bin/* -%{_libdir}/rocm/gfx9/lib64/* - -%endif -%endif - %changelog %autochangelog From 2508009c1f33f1a5ff742b9bad321afc1d2ddc0b Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 26 Feb 2025 07:32:31 -0800 Subject: [PATCH 02/38] Remove gold linker Signed-off-by: Tom Rix --- python-torch.spec | 2 -- 1 file changed, 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 7d6d26e..6ceceea 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -105,7 +105,6 @@ ExclusiveArch: x86_64 aarch64 %global _lto_cflags %nil BuildRequires: cmake -BuildRequires: binutils-gold BuildRequires: eigen3-devel 
BuildRequires: flexiblas-devel BuildRequires: fmt-devel @@ -483,7 +482,6 @@ export USE_CUDA=OFF export USE_FAKELOWP=OFF export USE_FBGEMM=OFF export USE_FLASH_ATTENTION=OFF -export USE_GOLD_LINKER=ON export USE_GLOO=OFF export USE_ITT=OFF export USE_KINETO=OFF From 7569831b203df17daeb25bc64ca9806449b10bb7 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 1 Mar 2025 07:52:12 -0800 Subject: [PATCH 03/38] cmake version changed Signed-off-by: Tom Rix --- python-torch.spec | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python-torch.spec b/python-torch.spec index 6ceceea..48096a4 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -420,6 +420,9 @@ sed -i -e 's@DESTINATION ${PYTHON_LIB_REL_PATH}@DESTINATION ${CMAKE_INSTALL_PREF # reenable foxi linking sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@#list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@' cmake/Dependencies.cmake +# cmake version changed +sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' third_party/tensorpipe/third_party/libuv/CMakeLists.txt + %if %{with rocm} # hipify ./tools/amd_build/build_amd.py From dd353fd56b4b082675582145e25af4d7568459b0 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 12 Mar 2025 08:07:40 -0700 Subject: [PATCH 04/38] Remove papering over c++ assert problem. Signed-off-by: Tom Rix --- python-torch.spec | 2 -- 1 file changed, 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 48096a4..7787533 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -97,8 +97,6 @@ Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # Patches need to be refactored for ToT # These are ROCm packages Patch101: 0001-cuda-hip-signatures.patch -# https://github.com/pytorch/pytorch/issues/145608 -Patch102: 0001-torch-paper-over-c-assert.patch ExclusiveArch: x86_64 aarch64 %global toolchain gcc From 23f5d1192643bdfc836fb031dac1dcc63e075e99 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 13 Mar 2025 05:07:43 -0700 Subject: [PATCH 05/38] Update gitcommit Signed-off-by: Tom Rix --- python-torch.spec | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 7787533..95584ca 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,13 +6,19 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.5.0-rc9 -%global commit0 417a0763a7d69f6ce80719ac89c1d2deeee78163 +# v2.7.0-rc1 +%global commit0 cdd7a2c72bbf0a72faf6fe4b4903c053f0465a2e %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 2024103 -%global pypi_version 2.5.0 +%global date0 20250412 +%global pypi_version 2.7.0 +%global flatbuffers_version 23.3.3 +%global miniz_version 3.0.2 +%global pybind11_version 2.13.6 %else %global pypi_version 2.5.1 +%global flatbuffers_version 23.3.3 +%global miniz_version 2.1.0 +%global pybind11_version 2.11.1 %endif # For -test subpackage @@ -60,8 +66,8 @@ Source1000: pyproject.toml %else Source0: %{forgeurl}/releases/download/v%{version}/pytorch-v%{version}.tar.gz %endif -Source1: https://github.com/google/flatbuffers/archive/refs/tags/v23.3.3.tar.gz -Source2: https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz +Source1: https://github.com/google/flatbuffers/archive/refs/tags/v%{flatbuffers_version}.tar.gz +Source2: https://github.com/pybind/pybind11/archive/refs/tags/v%{pybind11_version}.tar.gz # Developement on tensorpipe has stopped, repo made read only July 1, 2023, this is the last commit %global tp_commit 
52791a2fd214b2a9dc5759d36725909c1daa7f2e @@ -91,12 +97,14 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif +%if %{without gitcommit} Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # ROCm patches # Patches need to be refactored for ToT # These are ROCm packages Patch101: 0001-cuda-hip-signatures.patch +%endif ExclusiveArch: x86_64 aarch64 %global toolchain gcc @@ -205,10 +213,10 @@ Summary: %{summary} Provides: pytorch # Apache-2.0 -Provides: bundled(flatbuffers) = 22.3.3 +Provides: bundled(flatbuffers) = %{flatbuffers_version} # MIT -Provides: bundled(miniz) = 2.1.0 -Provides: bundled(pybind11) = 2.11.1 +Provides: bundled(miniz) = %{miniz_version} +Provides: bundled(pybind11) = %{pybind11_version} %if %{with tensorpipe} # BSD-3-Clause @@ -254,11 +262,11 @@ rm -rf %{pypi_name}.egg-info tar xf %{SOURCE1} rm -rf third_party/flatbuffers/* -cp -r flatbuffers-23.3.3/* third_party/flatbuffers/ +cp -r flatbuffers-%{flatbuffers_version}/* third_party/flatbuffers/ tar xf %{SOURCE2} rm -rf third_party/pybind11/* -cp -r pybind11-2.11.1/* third_party/pybind11/ +cp -r pybind11-%{pybind11_version}/* third_party/pybind11/ %if %{with tensorpipe} tar xf %{SOURCE20} @@ -345,7 +353,7 @@ sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py # the third_party dir to compile the file. # mimiz is licensed MIT # https://github.com/richgel999/miniz/blob/master/LICENSE -mv third_party/miniz-2.1.0 . +mv third_party/miniz-%{miniz_version} . # # setup.py depends on this script mv third_party/build_bundled.py . @@ -379,7 +387,7 @@ mv third_party/googletest . rm -rf third_party/* # Put stuff back mv build_bundled.py third_party -mv miniz-2.1.0 third_party +mv miniz-%{miniz_version} third_party mv flatbuffers third_party mv pybind11 third_party From bd11f4aa1a27d626aae19b7278b790e753e587bc Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 22 Mar 2025 11:58:26 -0700 Subject: [PATCH 06/38] Update gitcommit to v2.7.0-rc2 Signed-off-by: Tom Rix --- python-torch.spec | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 95584ca..8a9a9f7 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc1 -%global commit0 cdd7a2c72bbf0a72faf6fe4b4903c053f0465a2e +# v2.7.0-rc2 +%global commit0 b1940b5867e40e40ebdce4db76f76d3d0b71d3f4 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250412 +%global date0 20250413 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 @@ -253,6 +253,10 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release} # Overwrite with a git checkout of the pyproject.toml cp %{SOURCE1000} . 
+# https://github.com/pytorch/pytorch/issues/149803 +# Tries to checkout nccl +sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py + %else %autosetup -p1 -n pytorch-v%{version} %endif @@ -329,6 +333,12 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-depr # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt +%if %{with gitcommit} +sed -i -e 's@fmt::fmt-header-only@fmt@' aten/src/ATen/CMakeLists.txt +sed -i -e 's@list(APPEND ATen_HIP_INCLUDE $)@@' aten/src/ATen/CMakeLists.txt + +sed -i -e 's@fmt::fmt-header-only@fmt@' third_party/kineto/libkineto/CMakeLists.txt +%endif sed -i -e 's@fmt::fmt-header-only@fmt@' c10/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' torch/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' cmake/Dependencies.cmake @@ -434,6 +444,10 @@ sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION ./tools/amd_build/build_amd.py # Fedora installs to /usr/include, not /usr/include/rocm-core sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/TunableGemm.h +%if %{with gitcommit} +# https://github.com/pytorch/pytorch/issues/149805 +sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' cmake/public/LoadHIP.cmake +%endif # use any hip, correct CMAKE_MODULE_PATH sed -i -e 's@lib/cmake/hip@lib64/cmake/hip@' cmake/public/LoadHIP.cmake sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake @@ -512,14 +526,22 @@ export USE_SYSTEM_ONNX=ON export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF export USE_TENSORPIPE=OFF +%if %{with gitcommit} +export USE_XNNPACK=OFF +%else export USE_XNNPACK=ON +%endif export USE_XPU=OFF export USE_SYSTEM_PTHREADPOOL=ON export USE_SYSTEM_CPUINFO=ON export USE_SYSTEM_FP16=ON export USE_SYSTEM_FXDIV=ON export USE_SYSTEM_PSIMD=ON +%if %{with gitcommit} +export USE_SYSTEM_XNNPACK=OFF +%else export USE_SYSTEM_XNNPACK=ON +%endif export USE_DISTRIBUTED=ON %if %{with tensorpipe} From e80f34f74dac2bbc813c0914a1b4afd74abbe057 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 29 Mar 2025 05:11:02 -0700 Subject: [PATCH 07/38] Update gitcommit to 2.7-rc3 Signed-off-by: Tom Rix --- 0001-Add-cmake-varaible-USE_ROCM_CK.patch | 120 ++++++++++++++++++++++ 0001-python-torch-disable-ck.patch | 112 ++++++++++++++++++++ python-torch.spec | 25 ++++- 3 files changed, 254 insertions(+), 3 deletions(-) create mode 100644 0001-Add-cmake-varaible-USE_ROCM_CK.patch create mode 100644 0001-python-torch-disable-ck.patch diff --git a/0001-Add-cmake-varaible-USE_ROCM_CK.patch b/0001-Add-cmake-varaible-USE_ROCM_CK.patch new file mode 100644 index 0000000..b34e07a --- /dev/null +++ b/0001-Add-cmake-varaible-USE_ROCM_CK.patch @@ -0,0 +1,120 @@ +From 0f33e0a7bbd1522ee74f8fc1fbe3af7563318c79 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 28 Mar 2025 15:33:09 -0700 +Subject: [PATCH] Add cmake varaible USE_ROCM_CK + +To control the use of ROCm Composable Kernel usage. + +CK is not compatible with all rocBLAS gpu's, so the user +must explicitly choose to use CK. 
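+
+For example, a build can opt in with something like:
+
+  # opt in to CK kernels (USE_* variables are forwarded to CMake
+  # by setup.py, as the spec file's own exports rely on)
+  USE_ROCM=ON USE_ROCM_CK=ON python setup.py develop
+
+or, when invoking CMake directly, with -DUSE_ROCM_CK=ON.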
+ +Signed-off-by: Tom Rix +--- + CMakeLists.txt | 1 + + aten/src/ATen/CMakeLists.txt | 8 ++++++-- + aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- + cmake/Dependencies.cmake | 3 +++ + 4 files changed, 15 insertions(+), 7 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index f3fee2f7ffc2..73903acce452 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -249,6 +249,7 @@ cmake_dependent_option( + BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON + "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) + cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) ++cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) + option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) + cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) + cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index 085af373ec22..af268ab88572 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -361,13 +361,17 @@ endif() + ${native_quantized_hip_hip} + ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels and Triton ++ if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels and Triton + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" +- ${native_hip_bgemm} ${native_hip_ck} ++ ${native_hip_bgemm} ${native_hip_ck}) ++ endif() ++ if(WIN32) # Windows doesn't support Composable Kernels and Triton ++ exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" + ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) + endif() ++ + # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) + list(APPEND all_hip_cpp + ${native_nested_hip_cpp} +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index a62b028fd4ff..a3dbf76848ea 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); + #endif + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); + } +@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); + } +@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if 
(at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); + } +@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 30917bdf39f5..2ca6091030f1 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1046,6 +1046,9 @@ if(USE_ROCM) + if(HIPBLASLT_VEC_EXT) + list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) + endif() ++ if(USE_ROCM_CK) ++ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) ++ endif() + list(APPEND HIP_HIPCC_FLAGS --offload-compress) + if(WIN32) + add_definitions(-DROCM_ON_WINDOWS) +-- +2.48.1 + diff --git a/0001-python-torch-disable-ck.patch b/0001-python-torch-disable-ck.patch new file mode 100644 index 0000000..e8fd9c2 --- /dev/null +++ b/0001-python-torch-disable-ck.patch @@ -0,0 +1,112 @@ +From 027dad1eaed51c1172e2497da611e3267d42d2f0 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 28 Mar 2025 09:16:03 -0700 +Subject: [PATCH] python-torch: disable ck + +--- + aten/src/ATen/CMakeLists.txt | 7 +++---- + aten/src/ATen/Context.cpp | 1 + + aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- + 3 files changed, 9 insertions(+), 9 deletions(-) + +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index 085af373ec22..84808880e51c 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -134,7 +134,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu") + file(GLOB native_cuda_cpp "native/cuda/*.cpp") + file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") + file(GLOB native_cuda_linalg_cpp "native/cuda/linalg/*.cpp") +-file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" "native/hip/bgemm_kernels/*.h") ++file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" ) + file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") + file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") + file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") +@@ -145,7 +145,7 @@ file(GLOB native_nested_h "native/nested/*.h") + file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") + file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") + +-file(GLOB native_hip_hip "native/hip/*.hip" "native/hip/bgemm_kernels/*.hip") ++file(GLOB native_hip_hip "native/hip/*.hip" ) + file(GLOB native_hip_cpp "native/hip/*.cpp") + file(GLOB native_hip_linalg_cpp "native/hip/linalg/*.cpp") + file(GLOB native_miopen_cpp "native/miopen/*.cpp") +@@ -361,13 +361,12 @@ endif() + ${native_quantized_hip_hip} + ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels and Triton + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" + ${native_hip_bgemm} ${native_hip_ck} + ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) +- endif() ++ + # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) + list(APPEND all_hip_cpp + ${native_nested_hip_cpp} +diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp +index f598fc3a39d3..03dab6ff38fe 
100644 +--- a/aten/src/ATen/Context.cpp ++++ b/aten/src/ATen/Context.cpp +@@ -355,6 +355,7 @@ at::BlasBackend Context::blasPreferredBackend() { + } + + void Context::setBlasPreferredBackend(at::BlasBackend b) { ++ return; + #ifdef _MSC_VER + TORCH_WARN_ONCE( + "torch.backends.cuda.preferred_blas_library is an experimental feature. " +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index a62b028fd4ff..cba38426ea1f 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); + #endif + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); + } +@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); + } +@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); + } +@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-- +2.48.1 + diff --git a/python-torch.spec b/python-torch.spec index 8a9a9f7..8f5ed02 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc2 -%global commit0 b1940b5867e40e40ebdce4db76f76d3d0b71d3f4 +# v2.7.0-rc3 +%global commit0 b04d8358d959925bee0adfd67cc17987af9fbb9d %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250413 +%global date0 20250326 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 @@ -104,6 +104,13 @@ Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # Patches need to be refactored for ToT # These are ROCm packages Patch101: 0001-cuda-hip-signatures.patch +%else +# https://github.com/pytorch/pytorch/issues/150187 +# The hack job +# Patch11: 0001-python-torch-disable-ck.patch +# Cleaned up hack job +Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch + %endif ExclusiveArch: x86_64 aarch64 @@ -159,6 +166,9 @@ BuildRequires: python3dist(sympy) %endif %if %{with rocm} +%if %{with 
gitcommit}
-BuildRequires: composable_kernel-devel
-%endif
 BuildRequires: hipblas-devel
 BuildRequires: hipblaslt-devel
 BuildRequires: hipcub-devel

From e3c2449e4fac3865f272f20e24f20948b2d9b208 Mon Sep 17 00:00:00 2001
From: Tom Rix
Date: Thu, 10 Apr 2025 05:27:21 -0700
Subject: [PATCH 09/38] Update gitcommit to 2.7.0-rc8

Signed-off-by: Tom Rix
---
 python-torch.spec | 6 +++---
 1 file changed, 3
insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index a217432..8ea791e 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc6 -%global commit0 06c6a81a987e271d35a5da9501b4a17915bb8206 +# v2.7.0-rc8 +%global commit0 c7ff78dfc0c38847bf5daa78ab8b3669e1734246 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250403 +%global date0 20250408 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 From f0eda9ace1c0f50ab511cfd9fe524b50053e09c1 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 13 Apr 2025 07:11:27 -0700 Subject: [PATCH 10/38] Update gitcommit to 2.7.0-rc9 Signed-off-by: Tom Rix --- python-torch.spec | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 8ea791e..f03837b 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc8 -%global commit0 c7ff78dfc0c38847bf5daa78ab8b3669e1734246 +# v2.7.0-rc9 +%global commit0 073912749d667fcfb2de1c15e1e664dc0ccd3460 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250408 +%global date0 20250410 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 From fb64b28d49af6ca8bd72c63bc83166a0fd737b32 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 19 Apr 2025 08:20:35 -0700 Subject: [PATCH 11/38] Update gitcommit to 2.7.0-rc10 Signed-off-by: Tom Rix --- python-torch.spec | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index f03837b..5a9f08f 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc9 -%global commit0 073912749d667fcfb2de1c15e1e664dc0ccd3460 +# v2.7.0-rc10 +%global commit0 134179474539648ba7dee1317959529fbd0e7f89 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250410 +%global date0 20250415 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 From 2f3d92b7c5b36d8afb0846f4af6871bcb7a7fc1d Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 25 Apr 2025 12:39:04 -0700 Subject: [PATCH 12/38] Update to 2.7.0 Signed-off-by: Tom Rix --- .gitignore | 2 ++ python-torch.spec | 37 +++++++++++-------------------------- sources | 8 +++----- 3 files changed, 16 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index cdf142f..25abff5 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ /pytorch-v2.4.1.tar.gz /pytorch-v2.5.0.tar.gz /pytorch-v2.5.1.tar.gz +/pytorch-v2.7.0.tar.gz +/v2.13.6.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 5a9f08f..bc15924 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -15,10 +15,10 @@ %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %else -%global pypi_version 2.5.1 +%global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 -%global miniz_version 2.1.0 -%global pybind11_version 2.11.1 +%global miniz_version 3.0.2 +%global pybind11_version 2.13.6 %endif # For -test subpackage @@ -98,19 +98,20 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ %endif %if %{without gitcommit} -Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch +# Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # 
ROCm patches # Patches need to be refactored for ToT # These are ROCm packages -Patch101: 0001-cuda-hip-signatures.patch -%else +# Patch101: 0001-cuda-hip-signatures.patch + # https://github.com/pytorch/pytorch/issues/150187 # The hack job # Patch11: 0001-python-torch-disable-ck.patch # Cleaned up hack job Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch +%else %endif ExclusiveArch: x86_64 aarch64 @@ -260,10 +261,6 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release} # Overwrite with a git checkout of the pyproject.toml cp %{SOURCE1000} . -# https://github.com/pytorch/pytorch/issues/149803 -# Tries to checkout nccl -sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py - %else %autosetup -p1 -n pytorch-v%{version} %endif @@ -342,12 +339,10 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt -%if %{with gitcommit} sed -i -e 's@fmt::fmt-header-only@fmt@' aten/src/ATen/CMakeLists.txt sed -i -e 's@list(APPEND ATen_HIP_INCLUDE $)@@' aten/src/ATen/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' third_party/kineto/libkineto/CMakeLists.txt -%endif sed -i -e 's@fmt::fmt-header-only@fmt@' c10/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' torch/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' cmake/Dependencies.cmake @@ -361,6 +356,10 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPE sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt +# https://github.com/pytorch/pytorch/issues/149803 +# Tries to checkout nccl +sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py + # Disable the use of check_submodule's in the setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -453,13 +452,11 @@ sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION ./tools/amd_build/build_amd.py # Fedora installs to /usr/include, not /usr/include/rocm-core sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/TunableGemm.h -%if %{with gitcommit} # https://github.com/pytorch/pytorch/issues/149805 sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' cmake/public/LoadHIP.cmake # Fedora installs to /usr/include, not /usr/include/rocm-core sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/Tunable.cpp sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/cuda/tunable/Tunable.cpp -%endif # use any hip, correct CMAKE_MODULE_PATH sed -i -e 's@lib/cmake/hip@lib64/cmake/hip@' cmake/public/LoadHIP.cmake sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake @@ -538,22 +535,14 @@ export USE_SYSTEM_ONNX=ON export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF export USE_TENSORPIPE=OFF -%if %{with gitcommit} export USE_XNNPACK=OFF -%else -export USE_XNNPACK=ON -%endif export USE_XPU=OFF export USE_SYSTEM_PTHREADPOOL=ON export USE_SYSTEM_CPUINFO=ON export USE_SYSTEM_FP16=ON export USE_SYSTEM_FXDIV=ON export USE_SYSTEM_PSIMD=ON -%if %{with gitcommit} export USE_SYSTEM_XNNPACK=OFF -%else -export USE_SYSTEM_XNNPACK=ON -%endif export USE_DISTRIBUTED=ON %if %{with tensorpipe} @@ -634,10 +623,6 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %files -n python3-%{pypi_name} %license 
LICENSE %doc README.md -%if %{without gitcommit} -%{_bindir}/convert-caffe2-to-onnx -%{_bindir}/convert-onnx-to-caffe2 -%endif %{_bindir}/torchrun %{_bindir}/torchfrtrace %{python3_sitearch}/%{pypi_name} diff --git a/sources b/sources index aa1ed3c..4021d40 100644 --- a/sources +++ b/sources @@ -1,11 +1,9 @@ +SHA512 (pytorch-v2.7.0.tar.gz) = 17e875a66f1669901f5f770c9d829ba5bfa3967296cfb71550e8a92507181db742548eaf7cc9a2c478c4b91e366f27cc480e2e1bbb328db8501d30e1649839e6 SHA512 (v23.3.3.tar.gz) = 4066c94f2473c7ea16917d29a613e16f840a329089c88e0bdbdb999aef3442ba00abfd2aa92266fa9c067e399dc88e6f0ccac40dc151378857e665638e78bbf0 -SHA512 (v2.11.1.tar.gz) = ed1512ff0bca3bc0a45edc2eb8c77f8286ab9389f6ff1d5cb309be24bc608abbe0df6a7f5cb18c8f80a3bfa509058547c13551c3cd6a759af708fd0cdcdd9e95 +SHA512 (v2.13.6.tar.gz) = 497c25b33b09a9c42f67131ab82e35d689e8ce089dd7639be997305ff9a6d502447b79c824508c455d559e61f0186335b54dd2771d903a7c1621833930622d1a SHA512 (tensorpipe-52791a2.tar.gz) = 1e5faf17a7236c5506c08cb28be16069b11bb929bbca64ed9745ce4277d46739186ab7d6597da7437d90ed2d166d4c37ef2f3bceabe8083ef3adbb0e8e5f227e -SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3522470b1ecc4798ff43d344a013cee2f6901e83267c6167072947b754e63f1552ae7044cffe234c36 SHA512 (v1.41.0.tar.gz) = bb08a1970a10e8d9571ffea3d021643de30ec212cd51317b98d6cf0cfe55d6877992921fb01d1188a6d466687335b77885685d924f8cb7200a0bec30eee05c65 +SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3522470b1ecc4798ff43d344a013cee2f6901e83267c6167072947b754e63f1552ae7044cffe234c36 SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7 SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a -SHA512 (pytorch-v2.4.1.tar.gz) = fc364112a88540035f213274b526b4c8ad9be335ff214161091e8a8d3efa89ebef5a5b4d33b20b5b659896d164dcbe739f0d7d2a293d3afc0efcfaa2bf62fc2c -SHA512 (pytorch-v2.5.0.tar.gz) = 6ccf1ac9f191f5bd757ef7fbfc1dcd81d591577f2d3df7313c6ed32790c592aaffd253e18dc778a2fcc707e4533299817dfdf9fae108636ce5c29c1b8ff8bba6 -SHA512 (pytorch-v2.5.1.tar.gz) = a8882608c2ab6467a58d60c6df84c9f1004b43eafeba57db499dbbfdecc09db2e221b9d4c344c8af7c0bea6252e874c400483502dca24a0b474c376b9fef1dd4 From aeb5b118d5303d5cd147d573df87021f0db9ba2a Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 1 May 2025 08:40:21 -0700 Subject: [PATCH 13/38] Turn off kleidai Breaks aarch64 Signed-off-by: Tom Rix --- python-torch.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/python-torch.spec b/python-torch.spec index bc15924..ab500c5 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -517,6 +517,7 @@ export USE_FLASH_ATTENTION=OFF export USE_GLOO=OFF export USE_ITT=OFF export USE_KINETO=OFF +export USE_KLEIDIAI=OFF export USE_LITE_INTERPRETER_PROFILER=OFF export USE_LITE_PROTO=OFF export USE_MAGMA=OFF From e6d73d7c4909bb23cb0dfad74464ecb3a81cb286 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 4 May 2025 07:36:00 -0700 Subject: [PATCH 14/38] Rebuild for magma Signed-off-by: Tom Rix From edfa2c25e3ffc54addd3cc4e18e972f7e2a6cf89 Mon Sep 17 00:00:00 2001 From: Python Maint Date: Fri, 6 Jun 2025 16:14:25 +0200 Subject: [PATCH 15/38] Rebuilt for Python 3.14 From 
27593d78b34de046e4d8ebcf0c242343742bfc95 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 27 Jun 2025 14:44:29 -0700 Subject: [PATCH 16/38] update gitcommit to 2.8-rc3 Signed-off-by: Tom Rix --- pyproject.toml | 184 +++++++++++++++++++++++++++++++++++++++------- python-torch.spec | 35 ++++++--- 2 files changed, 181 insertions(+), 38 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9508ad0..ccf9c2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,41 @@ +[project] +name = "torch" +requires-python = ">=3.9" +license = {text = "BSD-3-Clause"} +dynamic = [ + "authors", + "classifiers", + "entry-points", + "dependencies", + "description", + "keywords", + "optional-dependencies", + "readme", + "scripts", + "version", +] + +[project.urls] +Homepage = "https://pytorch.org/" +Documentation = "https://pytorch.org/docs/" +Source = "https://github.com/pytorch/pytorch" +Forum = "https://discuss.pytorch.org/" + + [build-system] requires = [ - "setuptools", + # After 75.8.2 dropped dep disttools API. Please fix + # API temporarily restored and shim used. Please fix + # Setuptools will drop support for setup.py past 80 + # min version for recursive glob package data support + "setuptools>=62.3.0,<80.0", "wheel", "astunparse", "numpy", "ninja", "pyyaml", "cmake", - "typing-extensions", + "typing-extensions>=4.10.0", "requests", ] # Use legacy backend to import local packages in setup.py @@ -15,32 +43,68 @@ build-backend = "setuptools.build_meta:__legacy__" [tool.black] -# Uncomment if pyproject.toml worked fine to ensure consistency with flake8 -# line-length = 120 -target-version = ["py38", "py39", "py310", "py311"] +line-length = 88 + +[tool.isort] +src_paths = ["caffe2", "torch", "torchgen", "functorch", "test"] +extra_standard_library = ["typing_extensions"] +skip_gitignore = true +skip_glob = ["third_party/*"] +atomic = true +profile = "black" +indent = 4 +line_length = 88 +lines_after_imports = 2 +multi_line_output = 3 +include_trailing_comma = true +combine_as_imports = true + + +[tool.usort.known] +first_party = ["caffe2", "torch", "torchgen", "functorch", "test"] +standard_library = ["typing_extensions"] [tool.ruff] -target-version = "py38" +line-length = 88 +src = ["caffe2", "torch", "torchgen", "functorch", "test"] +[tool.ruff.format] +docstring-code-format = true +quote-style = "double" + +[tool.ruff.lint] # NOTE: Synchoronize the ignores with .flake8 +external = [ + "B001", + "B902", + "B950", + "E121", + "E122", + "E128", + "E131", + "E704", + "E723", + "F723", + "F812", + "P201", + "P204", + "T484", + "TOR901", +] ignore = [ # these ignores are from flake8-bugbear; please fix! "B007", "B008", "B017", "B018", # Useless expression - "B019", "B023", "B028", # No explicit `stacklevel` keyword argument found - "B904", "E402", "C408", # C408 ignored because we like the dict keyword argument syntax "E501", # E501 is not flexible enough, we're using B950 instead "E721", - "E731", # Assign lambda expression "E741", "EXE001", "F405", - "F841", # these ignores are from flake8-logging-format; please fix! "G101", # these ignores are from ruff NPY; please fix! @@ -48,39 +112,41 @@ ignore = [ # these ignores are from ruff PERF; please fix! "PERF203", "PERF401", - "PERF403", # these ignores are from PYI; please fix! - "PYI019", "PYI024", "PYI036", "PYI041", "PYI056", "SIM102", "SIM103", "SIM112", # flake8-simplify code styles "SIM105", # these ignores are from flake8-simplify. 
please fix or ignore with commented reason - "SIM108", + "SIM108", # SIM108 ignored because we prefer if-else-block instead of ternary expression "SIM110", "SIM114", # Combine `if` branches using logical `or` operator "SIM115", "SIM116", # Disable Use a dictionary instead of consecutive `if` statements "SIM117", "SIM118", - "UP006", # keep-runtime-typing "UP007", # keep-runtime-typing + "TC006", ] -line-length = 120 select = [ "B", + "B904", # Re-raised error without specifying the cause via the from keyword "C4", "G", "E", "EXE", "F", "SIM1", + "SIM911", "W", # Not included in flake8 + "FURB", + "LOG", "NPY", "PERF", "PGH004", + "PIE790", "PIE794", "PIE800", "PIE804", @@ -89,40 +155,92 @@ select = [ "PLC0131", # type bivariance "PLC0132", # type param mismatch "PLC0205", # string as __slots__ + "PLC3002", # unnecessary-direct-lambda-call "PLE", "PLR0133", # constant comparison "PLR0206", # property with params "PLR1722", # use sys exit + "PLR1736", # unnecessary list index "PLW0129", # assert on string literal + "PLW0131", # named expr without context + "PLW0133", # useless exception statement + "PLW0245", # super without brackets "PLW0406", # import self "PLW0711", # binary op exception + "PLW1501", # bad open mode + "PLW1507", # shallow copy os.environ "PLW1509", # preexec_fn not safe with threads + "PLW2101", # useless lock statement "PLW3301", # nested min max "PT006", # TODO: enable more PT rules + "PT014", # duplicate parameterize case "PT022", "PT023", "PT024", "PT025", "PT026", "PYI", + "Q003", # avoidable escaped quote + "Q004", # unnecessary escaped quote + "RSE", "RUF008", # mutable dataclass default + "RUF013", # ban implicit optional "RUF015", # access first ele in constant time "RUF016", # type error non-integer index "RUF017", - "TRY200", - "TRY302", + "RUF018", # no assignment in assert + "RUF019", # unnecessary-key-check + "RUF020", # never union + "RUF024", # from keys mutable + "RUF026", # default factory kwarg + "RUF030", # No print statement in assert + "RUF033", # default values __post_init__ dataclass + "RUF041", # simplify nested Literal + "RUF048", # properly parse `__version__` + "RUF200", # validate pyproject.toml + "S324", # for hashlib FIPS compliance + "SLOT", + "TC", + "TRY002", # ban vanilla raise (todo fix NOQAs) + "TRY203", + "TRY401", # verbose-log-message "UP", + "YTT", ] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "__init__.py" = [ "F401", ] +"*.pyi" = [ + "PYI011", # typed-argument-default-in-stub + "PYI021", # docstring-in-stub + "PYI053", # string-or-bytes-too-long +] +"functorch/notebooks/**" = [ + "F401", +] +"test/export/**" = [ + "PGH004" +] +"test/typing/**" = [ + "PGH004" +] "test/typing/reveal/**" = [ "F821", ] "test/torch_np/numpy_tests/**" = [ "F821", + "NPY201", +] +"test/dynamo/test_bytecode_utils.py" = [ + "F821", +] +"test/dynamo/test_debug_utils.py" = [ + "UP037", +] +"test/dynamo/test_misc.py" = [ + "PGH004", ] "test/jit/**" = [ "PLR0133", # tests require this for JIT @@ -136,19 +254,33 @@ select = [ "RUF015", "UP", # We don't want to modify the jit test as they test specify syntax ] - -"torch/onnx/**" = [ - "UP037", # ONNX does runtime type checking +"test/inductor/s429861_repro.py" = [ + "PGH004", +] +"test/inductor/test_torchinductor.py" = [ + "UP037", +] +# autogenerated #TODO figure out why file level noqa is ignored +"torch/_appdirs.py" = ["PGH004"] +"torch/jit/_shape_functions.py" = ["PGH004"] +"torch/_inductor/fx_passes/serialized_patterns/**" = ["F401", "F501"] 
+"torch/_inductor/autoheuristic/artifacts/**" = ["F401", "F501"] +"torch/_inductor/codegen/**" = [ + "PGH004" ] - "torchgen/api/types/__init__.py" = [ "F401", "F403", ] -"torchgen/executorch/api/types/__init__.py" = [ - "F401", - "F403", -] "torch/utils/collect_env.py" = [ "UP", # collect_env.py needs to work with older versions of Python ] +"torch/_vendor/**" = [ + "UP", # No need to mess with _vendor +] +"tools/linter/**" = [ + "LOG015" # please fix +] + +[tool.codespell] +ignore-words = "tools/linter/dictionary.txt" diff --git a/python-torch.spec b/python-torch.spec index ab500c5..90c908c 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,12 +6,12 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc10 -%global commit0 134179474539648ba7dee1317959529fbd0e7f89 +# v2.8.0-rc3 +%global commit0 3d53a53e504089a52a149791fd33d7fc898bd055 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250415 -%global pypi_version 2.7.0 -%global flatbuffers_version 23.3.3 +%global date0 20250625 +%global pypi_version 2.8.0 +%global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %else @@ -92,19 +92,16 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- %endif %if %{without kineto} +%if %{with gitcommit} +%global ki_commit 5e7501833f1021ce6f618572d3baf657b6319658 +%else %global ki_commit be1317644c68b4bfc4646024a6b221066e430031 +%endif %global ki_scommit %(c=%{ki_commit}; echo ${c:0:7}) Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif %if %{without gitcommit} -# Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch - -# ROCm patches -# Patches need to be refactored for ToT -# These are ROCm packages -# Patch101: 0001-cuda-hip-signatures.patch - # https://github.com/pytorch/pytorch/issues/150187 # The hack job # Patch11: 0001-python-torch-disable-ck.patch @@ -112,6 +109,7 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch %else +Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch %endif ExclusiveArch: x86_64 aarch64 @@ -190,6 +188,9 @@ BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros +%if %{with gitcommit} +BuildRequires: rocsolver-devel +%endif BuildRequires: rocthrust-devel BuildRequires: roctracer-devel @@ -356,9 +357,11 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPE sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt +%if %{without gitcommit} # https://github.com/pytorch/pytorch/issues/149803 # Tries to checkout nccl sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py +%endif # Disable the use of check_submodule's in the setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -432,6 +435,9 @@ mv googletest third_party # # Fake out pocketfft, and system header will be used mkdir third_party/pocketfft +%if %{with gitcommit} +cp /usr/include/pocketfft_hdronly.h third_party/pocketfft/ +%endif # # Use the system valgrind headers @@ -585,7 +591,12 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} +%if %{?fedora} <= 43 
+export PYTORCH_ROCM_ARCH="gfx1100;gfx1201" +%else export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} +%endif + %py3_build %else From 42c33b8dcd0de27e96cf0871c8087aa571d63f20 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 20 Jul 2025 12:44:41 -0700 Subject: [PATCH 17/38] Update the next gitcommit to v2.8.0-rc6 Remove old patches. Signed-off-by: Tom Rix --- 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch | 47 - ...-Changes-to-compile-with-3.13-126033.patch | 222 ---- ...ectorization-on-windows-submodule-sl.patch | 910 ----------------- ...finding-and-using-the-rocm_version.h.patch | 142 --- 0001-Optionally-use-hipblaslt.patch | 506 ---------- 0001-Patch-for-sleef-3.6.patch | 952 ------------------ 0001-Reenable-dim-for-python-3.12.patch | 115 --- 0001-Regenerate-flatbuffer-header.patch | 39 - 0001-Stub-in-kineto-ActivityType.patch | 73 -- 0001-can-not-use-with-c-files.patch | 25 - 0001-cuda-hip-signatures.patch | 42 - 0001-disable-use-of-aotriton.patch | 94 -- ...le-dynamo-on-3.12-enable-most-dynamo.patch | 226 ----- ...lude-fmt-ranges.h-for-using-fmt-join.patch | 54 - 0001-no-third_party-FXdiv.patch | 54 - 0001-no-third_party-fmt.patch | 65 -- 0001-no-third_party-foxi.patch | 36 - 0001-python-torch-disable-ck.patch | 112 --- 0001-reenable-foxi-linking.patch | 25 - 0001-silence-an-assert.patch | 25 - 0001-torch-paper-over-c-assert.patch | 88 -- 0001-use-any-hip.patch | 34 - ...1-Add-cmake-option-USE_SYSTEM_FBGEMM.patch | 47 - .../0001-Add-cmake-variable-USE_ROCM_CK.patch | 149 +++ next/0001-Optionally-use-hipblaslt.patch | 506 ---------- next/0001-disable-use-of-aotriton.patch | 94 -- python-torch.spec | 11 +- 27 files changed, 154 insertions(+), 4539 deletions(-) delete mode 100644 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch delete mode 100644 0001-Changes-to-compile-with-3.13-126033.patch delete mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch delete mode 100644 0001-Improve-finding-and-using-the-rocm_version.h.patch delete mode 100644 0001-Optionally-use-hipblaslt.patch delete mode 100644 0001-Patch-for-sleef-3.6.patch delete mode 100644 0001-Reenable-dim-for-python-3.12.patch delete mode 100644 0001-Regenerate-flatbuffer-header.patch delete mode 100644 0001-Stub-in-kineto-ActivityType.patch delete mode 100644 0001-can-not-use-with-c-files.patch delete mode 100644 0001-cuda-hip-signatures.patch delete mode 100644 0001-disable-use-of-aotriton.patch delete mode 100644 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch delete mode 100644 0001-include-fmt-ranges.h-for-using-fmt-join.patch delete mode 100644 0001-no-third_party-FXdiv.patch delete mode 100644 0001-no-third_party-fmt.patch delete mode 100644 0001-no-third_party-foxi.patch delete mode 100644 0001-python-torch-disable-ck.patch delete mode 100644 0001-reenable-foxi-linking.patch delete mode 100644 0001-silence-an-assert.patch delete mode 100644 0001-torch-paper-over-c-assert.patch delete mode 100644 0001-use-any-hip.patch delete mode 100644 next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch create mode 100644 next/0001-Add-cmake-variable-USE_ROCM_CK.patch delete mode 100644 next/0001-Optionally-use-hipblaslt.patch delete mode 100644 next/0001-disable-use-of-aotriton.patch diff --git a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch deleted file mode 100644 index 413c60d..0000000 --- a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 -From: Tom 
Rix -Date: Sat, 20 Jul 2024 05:37:15 -0600 -Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM - -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - cmake/Dependencies.cmake | 3 ++- - 2 files changed, 3 insertions(+), 1 deletion(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index c4cd4b2c2a98..2068f7c6c4f2 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF - "USE_CUDNN" OFF) - cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) - option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) -+option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) - option(USE_KINETO "Use Kineto profiling library" ON) - option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) - option(USE_FAKELOWP "Use FakeLowp operators" OFF) -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..192dac46f13b 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -706,6 +706,7 @@ endif() - - # ---[ FBGEMM - if(USE_FBGEMM) -+ if (NOT USE_SYSTEM_FBGEMM) - set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") - if(NOT DEFINED FBGEMM_SOURCE_DIR) - set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") -@@ -746,7 +747,7 @@ if(USE_FBGEMM) - target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) - endif() - endif() -- -+ endif() - if(USE_FBGEMM) - list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) - endif() --- -2.45.1 - diff --git a/0001-Changes-to-compile-with-3.13-126033.patch b/0001-Changes-to-compile-with-3.13-126033.patch deleted file mode 100644 index ddc0dcf..0000000 --- a/0001-Changes-to-compile-with-3.13-126033.patch +++ /dev/null @@ -1,222 +0,0 @@ -From 655a06444b261cb28e71a0973c0ab67aaa8261ab Mon Sep 17 00:00:00 2001 -From: albanD -Date: Tue, 14 May 2024 02:14:53 +0000 -Subject: [PATCH] Changes to compile with 3.13 (#126033) - -This is mainly: -- Fix refcount access macro -- Hide all the Dynamo code that needs update as usual -- Add _PyWeakref_ClearRef as an extern provided by CPython. Including the pycore header that defines it would require raw c include shenanigans that I don't think are worth it. -This allows to build both with regular and nogil version of cpython. Both - -Note that this requires the 3.13 branch at least past [d3094744d40de2deefbda9b1996d5029c9ebf0b0](https://github.com/python/cpython/commit/d3094744d40de2deefbda9b1996d5029c9ebf0b0) which we need for mimalloc include and weakref function being exposed. - -debug-only issues in pybind11 with PyMem_MALLOC vs PyObject_MALLOC being should be synced either by updating pybind or cpython. @colesbury I can send a PR to ifdef the proper use in pybind if you think that this is the best solution here? 
- -Pull Request resolved: https://github.com/pytorch/pytorch/pull/126033 -Approved by: https://github.com/colesbury ---- - torch/csrc/Storage.cpp | 2 +- - torch/csrc/autograd/python_variable.cpp | 2 +- - torch/csrc/dynamo/cpython_defs.c | 15 +++++- - torch/csrc/dynamo/cpython_defs.h | 2 + - torch/csrc/dynamo/eval_frame.c | 67 ++++++++++++++++++------- - torch/csrc/utils/python_compat.h | 4 ++ - 6 files changed, 70 insertions(+), 22 deletions(-) - -diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp -index 93dbc9c09bb2..b22bbac35981 100644 ---- a/torch/csrc/Storage.cpp -+++ b/torch/csrc/Storage.cpp -@@ -236,7 +236,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { - if (type->tp_del) { - PyObject_GC_Track(self); - type->tp_del(self); -- if (self->ob_refcnt > 0) { -+ if (Py_REFCNT(self) > 0) { - // Resurrected (see above comment about resurrection from `__del__`) - return; - } -diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp -index 9e85f0026b35..8fd1129da63c 100644 ---- a/torch/csrc/autograd/python_variable.cpp -+++ b/torch/csrc/autograd/python_variable.cpp -@@ -1910,7 +1910,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { - if (type->tp_del) { - PyObject_GC_Track(self); - type->tp_del(self); -- if (self->ob_refcnt > 0) { -+ if (Py_REFCNT(self) > 0) { - /* Resurrected */ - return; - } -diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c -index 4a1dba63009a..5e0945a052ae 100644 ---- a/torch/csrc/dynamo/cpython_defs.c -+++ b/torch/csrc/dynamo/cpython_defs.c -@@ -13,6 +13,17 @@ - } else { \ - } - -+#if IS_PYTHON_3_13_PLUS -+// Gave up after fixing a few of these -+// pycore_opcode.h is gone (new is pycore_opcode_metadata.h ?) -+// f_code is gone (new is f_executable?) 
-+ -+// Fake definitions for what we removed -+const uint8_t* THP_PyOpcode_Caches = NULL; -+const int THP_PyOpcode_Caches_size = 0; -+ -+#else -+ - // NOTE: all `assert`s below are converted to `CHECK`s - - #if IS_PYTHON_3_11_PLUS -@@ -29,8 +40,8 @@ - #define NEED_OPCODE_TABLES // To get _PyOpcode_Deopt - #include - #undef NEED_OPCODE_TABLES --#undef Py_BUILD_CORE - #include -+#undef Py_BUILD_CORE - - // As a simple way to reduce the impact of ABI changes on the CPython side, this check forces - // us to manually re-check that the function didn't change on the next major version -@@ -364,3 +375,5 @@ THP_PyFrame_Clear(_PyInterpreterFrame *frame) - } - - #endif -+ -+#endif // CPython 3.13 -\ No newline at end of file -diff --git a/torch/csrc/dynamo/cpython_defs.h b/torch/csrc/dynamo/cpython_defs.h -index a897c3e6c6e7..3b6c9667f8c9 100644 ---- a/torch/csrc/dynamo/cpython_defs.h -+++ b/torch/csrc/dynamo/cpython_defs.h -@@ -8,7 +8,9 @@ - - #if IS_PYTHON_3_11_PLUS - -+#define Py_BUILD_CORE - #include -+#undef Py_BUILD_CORE - - int THP_PyFrame_FastToLocalsWithError( - _PyInterpreterFrame* frame, -diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c -index c286e821f09d..e13cb5af2a0e 100644 ---- a/torch/csrc/dynamo/eval_frame.c -+++ b/torch/csrc/dynamo/eval_frame.c -@@ -8,6 +8,31 @@ - #include - #include - -+ -+ -+PyObject* guard_error_hook = NULL; -+const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; -+ -+static int active_dynamo_threads = 0; -+ -+static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; -+ -+inline static PyObject* eval_frame_callback_get(void) { -+ void* result = PyThread_tss_get(&eval_frame_callback_key); -+ if (unlikely(result == NULL)) { -+ return (PyObject*)Py_None; -+ } else { -+ return (PyObject*)result; -+ } -+} -+ -+inline static void eval_frame_callback_set(PyObject* obj) { -+ PyThread_tss_set(&eval_frame_callback_key, obj); -+} -+ -+// 3.13 Not supported at all. 
See cpython_defs.c for hints -+#if !(IS_PYTHON_3_13_PLUS) -+ - // Problem in CPython includes when mixing core and non-core build - // The fix was not backported to 3.12 so this is needed here - // https://github.com/python/cpython/issues/105268 -@@ -138,24 +163,6 @@ THP_PyFrame_FastToLocalsWithError(THP_EVAL_API_FRAME_OBJECT *frame, int *free_va - } - #endif - --PyObject* guard_error_hook = NULL; --const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; -- --static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; -- --inline static PyObject* eval_frame_callback_get(void) { -- void* result = PyThread_tss_get(&eval_frame_callback_key); -- if (unlikely(result == NULL)) { -- return (PyObject*)Py_None; -- } else { -- return (PyObject*)result; -- } --} -- --inline static void eval_frame_callback_set(PyObject* obj) { -- PyThread_tss_set(&eval_frame_callback_key, obj); --} -- - static PyObject* _custom_eval_frame_shim( - PyThreadState* tstate, - THP_EVAL_API_FRAME_OBJECT* frame, -@@ -627,7 +634,29 @@ static PyObject* _custom_eval_frame( - } - } - --static int active_dynamo_threads = 0; -+#else // IS_PYTHON_3_13_PLUS -+ -+// Fake definitions for everything we removed -+ -+typedef struct THPPyInterpreterFrame { -+ PyObject_HEAD -+ _PyInterpreterFrame* frame; // Borrowed reference -+} THPPyInterpreterFrame; -+ -+inline static void enable_eval_frame_shim(PyThreadState* tstate) {} -+inline static void enable_eval_frame_default(PyThreadState* tstate) {} -+ -+static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; -+ -+static PyTypeObject THPPyInterpreterFrameType = { -+ PyVarObject_HEAD_INIT(NULL, 0) -+ .tp_name = "torch._C.dynamo.eval_frame._PyInterpreterFrame", -+ .tp_basicsize = sizeof(THPPyInterpreterFrame), -+ .tp_flags = Py_TPFLAGS_DEFAULT, -+ .tp_getset = THPPyInterpreterFrame_properties, -+}; -+ -+#endif // CPython 3.13 - - static PyObject* increment_working_threads(PyThreadState* tstate) { - active_dynamo_threads = active_dynamo_threads + 1; -diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h -index 73b991cf3fbf..b060db00db73 100644 ---- a/torch/csrc/utils/python_compat.h -+++ b/torch/csrc/utils/python_compat.h -@@ -11,6 +11,7 @@ extern "C" { - - #define IS_PYTHON_3_11_PLUS PY_VERSION_HEX >= 0x030B00C1 - #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 -+#define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 - - PYCAPI_COMPAT_STATIC_INLINE(int) - PyCode_GetNCellvars(PyCodeObject* code) { -@@ -32,6 +33,9 @@ PyCode_GetNFreevars(PyCodeObject* code) { - #endif - } - -+// Provided by CPython but getting the header for them is very hard -+extern void _PyWeakref_ClearRef(PyWeakReference* self); -+ - #ifdef __cplusplus - } - #endif --- -2.45.1 - diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch deleted file mode 100644 index 562f55b..0000000 --- a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch +++ /dev/null @@ -1,910 +0,0 @@ -From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 -From: Xu Han -Date: Sun, 31 Mar 2024 03:07:32 +0000 -Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] - (#118980) - -Enable VEC on Windows OS. -1. Fix some type defination gap between Windows and Linux. -2. Fix some operator not support on Windows, such as [], /. -3. Enable static sleef library build on Windows. -4. Disable unsupported function overloading on MSVC. -5. 
Upgrade submodule sleef lib, which fixed build issue on Windows. -6. Fixed bazel build issues. -7. Fix test app not link to sleef on Windows. - -Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: -```cmd -git submodule sync -git submodule update --init --recursive -``` - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 -Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet ---- - aten/src/ATen/CMakeLists.txt | 48 ++++++-------- - aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- - .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- - .../cpu/vec/vec256/vec256_complex_double.h | 7 +- - .../cpu/vec/vec256/vec256_complex_float.h | 7 +- - aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- - aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- - aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- - aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- - .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- - .../cpu/vec/vec512/vec512_complex_double.h | 7 +- - .../cpu/vec/vec512/vec512_complex_float.h | 7 +- - aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- - aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- - aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- - aten/src/ATen/cpu/vec/vec_base.h | 6 ++ - caffe2/CMakeLists.txt | 2 +- - third_party/sleef.BUILD | 3 +- - 18 files changed, 194 insertions(+), 93 deletions(-) - -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index bf425af5fa9..58d5828e8ca 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") - list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) - endif() - --if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -- # Preserve values for the main build -- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) -- set(__aten_sleef_build_tests ${BUILD_TESTS}) -- -- # Unset our restrictive C++ flags here and reset them later. -- # Remove this once we use proper target_compile_options. 
-- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -- set(CMAKE_CXX_FLAGS) -- -- # Bump up optimization level for sleef to -O1, since at -O0 the compiler -- # excessively spills intermediate vector registers to the stack -- # and makes things run impossibly slowly -- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -- else() -- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -+if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+ if(NOT MSVC) -+ # Bump up optimization level for sleef to -O1, since at -O0 the compiler -+ # excessively spills intermediate vector registers to the stack -+ # and makes things run impossibly slowly -+ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -+ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -+ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+ else() -+ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -+ endif() - endif() - - if(NOT USE_SYSTEM_SLEEF) -- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) -+ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -+ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -+ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -+ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -+ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." FORCE) - if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") - set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) -@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) - endif() - list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) - -- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) -- -- # Set these back. TODO: Use SLEEF_ to pass these instead -- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) -- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) -+ if(NOT MSVC) -+ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+ endif() - endif() - - if(USE_CUDA AND NOT USE_ROCM) -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h -index 800b027e469..c431fa3c605 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h -@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { - } - - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. 
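[Editor's sketch] The `gather`/`mask_gather` overloads that follow stay behind `#ifndef _MSC_VER`: MSVC trips over the templated overload resolution, not the intrinsic itself. For orientation, a reduced sketch of what the float specialization wraps — `gather_f32` is a hypothetical name, and the scale immediate (which must be 1, 2, 4, or 8) is fixed at 4 here, i.e. sizeof(float), since plain C has no template parameter:

```c
#include <immintrin.h>

/* Load eight floats from base[vindex[i]] for the eight int32 indices in
 * vindex; the scale immediate converts indices to byte offsets. */
static inline __m256 gather_f32(const float* base, __m256i vindex) {
  return _mm256_i32gather_ps(base, vindex, 4);
}
```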
- template - std::enable_if_t> - inline gather(const double* base_addr, const Vectorized& vindex) { -@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { - return _mm256_i32gather_ps(base_addr, vindex, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline mask_gather(const Vectorized& src, const double* base_addr, -@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, - const Vectorized& vindex, Vectorized& mask) { - return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - // Only works for inputs in the range: [-2^51, 2^51] -@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { - return flip8(v); - } - --#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // (defined(CPU_CAPABILITY_AVX2) - - }} // namepsace at::vec::CPU_CAPABILITY -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -index 3e26213d6d2..66557436c70 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -@@ -7,7 +7,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -18,7 +19,18 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+ -+#ifndef SLEEF_CONST -+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -+#define SLEEF_CONST const -+#else -+#define SLEEF_CONST -+#endif -+#define SLEEF_CONST_OLD SLEEF_CONST -+#else -+#define SLEEF_CONST_OLD -+#endif - - // bfloat16 conversion - static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { -@@ -265,7 +277,8 @@ public: - } - return b; - } -- Vectorized map(const __m256 (*const vop)(__m256)) const { -+ -+ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { - __m256 lo, hi; - cvt_to_fp32(values, lo, hi); - const auto o1 = vop(lo); -@@ -1026,7 +1039,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_VECTORIZED_INIT(Half, half); - --#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX2) - - #define CONVERT_NON_VECTORIZED_INIT(type, name) \ - inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_NON_VECTORIZED_INIT(Half, half); - --#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX2) - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - #define LOAD_FP32_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - auto values = _mm_loadu_si128(reinterpret_cast(data)); \ -@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec - LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); - LOAD_FP32_VECTORIZED_INIT(Half, fp16); - --#else // 
defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX2) - #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - __at_align__ float values[Vectorized::size()]; \ -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -index f93ea1e63c3..6c198fb37d3 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -@@ -8,7 +8,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized> { - private: -@@ -145,7 +146,7 @@ public: - auto abs = abs_(); - auto zero = _mm256_setzero_pd(); - auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm256_div_pd(values, abs); - return _mm256_blendv_pd(div, zero, mask); - } - __m256d real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -index 7c142c04b79..c72d4d49274 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized> { - private: -@@ -180,7 +181,7 @@ public: - auto abs = abs_(); - auto zero = _mm256_setzero_ps(); - auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm256_div_ps(values, abs); - return _mm256_blendv_ps(div, zero, mask); - } - __m256 real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -index bc82d07edd1..bed6da627af 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace at::vec { - inline namespace CPU_CAPABILITY { - - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized { - private: -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -index 886809a0b8a..0e3664cd37b 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -14,7 +15,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if 
defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized { - private: -@@ -226,14 +227,14 @@ public: - static __m256 vec_factorial_5 = - _mm256_set1_ps(0.00828929059f); // 1/factorial(5) - static __m256 vec_exp_log2ef = -- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) -+ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) - static __m256 vec_half = _mm256_set1_ps(0.5f); - static __m256 vec_one = _mm256_set1_ps(1.f); - static __m256 vec_zero = _mm256_set1_ps(0.f); - static __m256 vec_two = _mm256_set1_ps(2.f); -- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) -- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50); -- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); -+ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) -+ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); -+ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); - static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); - static int n_mantissa_bits = 23; - -@@ -266,7 +267,7 @@ public: - auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); - auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); - vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; -+ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); - vec_two_pow_n = - _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); - -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -index 4128841701a..85e099904cd 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -@@ -41,11 +41,17 @@ - namespace at::vec { - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - -+#ifdef _MSC_VER -+__declspec(align(64)) struct Vectorizedqi { -+ protected: -+ __m256i vals; -+#else - struct Vectorizedqi { - protected: - __m256i vals __attribute__((aligned(64))); -+#endif - - public: - Vectorizedqi() {} -@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { - } - - template --inline void __attribute__((always_inline)) QuantizeAvx2( -+__FORCE_INLINE void QuantizeAvx2( - const float* src, - T* dst, - int len, -@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V - return a.maximum(b); - } - --#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // if defined(CPU_CAPABILITY_AVX2) - }} // namespace at::vec::CPU_CAPABILITY -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h -index fe96d123e64..87f723d782c 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h -@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { - } - - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. 
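[Editor's sketch] Stepping back to the vec256_float.h hunk above: the `(__m256)` casts around `_mm256_set1_epi32` are a GNU extension that MSVC rejects, so they become the zero-cost bit-reinterpret intrinsics (`_mm256_castsi256_ps` here, `_mm512_castsi512_ps` in the vec512 hunks below). A reduced sketch — `ln2_vec` is a hypothetical name; 0x3f317218 is the IEEE-754 bit pattern of ln 2, exactly as in the hunk:

```c
#include <immintrin.h>

/* Reinterpret integer lanes as floats without moving data; compiles on
 * every toolchain, unlike the GNU-style (__m256) cast. */
static inline __m256 ln2_vec(void) {
  return _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218));
}
```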
- template - std::enable_if_t> - inline gather(const double* base_addr, const Vectorized& vindex) { -@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { - return _mm512_i32gather_ps(vindex, base_addr, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline mask_gather(const Vectorized& src, const double* base_addr, -@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, - auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); - return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - template<> -@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { - return flip8(v); - } - --#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX512) - - }}} -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -index f9fc92d52bf..eb3b6a72240 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -@@ -7,7 +7,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,18 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+ -+#ifndef SLEEF_CONST -+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -+#define SLEEF_CONST const -+#else -+#define SLEEF_CONST -+#endif -+#define SLEEF_CONST_OLD SLEEF_CONST -+#else -+#define SLEEF_CONST_OLD -+#endif - - // bfloat16 conversion - static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { -@@ -362,7 +374,8 @@ public: - } - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wignored-qualifiers" -- Vectorized map(const __m512 (*const vop)(__m512)) const { -+ -+ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { - __m512 lo, hi; - cvt_to_fp32(values, lo, hi); - const auto o1 = vop(lo); -@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_VECTORIZED_INIT(Half, half); - --#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#else //defined(CPU_CAPABILITY_AVX512) - - #define CONVERT_NON_VECTORIZED_INIT(type, name) \ - inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_NON_VECTORIZED_INIT(Half, half); - --#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX512) - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - #define LOAD_FP32_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ -@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec - LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); - LOAD_FP32_VECTORIZED_INIT(Half, fp16); - 
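[Editor's sketch] All of these bfloat16 helpers rest on one observation: a bfloat16 is the top 16 bits of a float32, so widening is a zero-extend followed by a 16-bit left shift. A reduced AVX-512 sketch of the idea — `bf16_to_f32` is a hypothetical name, not the `cvtbf16_fp32` from the hunks above:

```c
#include <immintrin.h>

/* Widen 16 bfloat16 values (packed in a __m256i) to 16 floats. */
static inline __m512 bf16_to_f32(__m256i bf16x16) {
  __m512i widened = _mm512_cvtepu16_epi32(bf16x16);   /* zero-extend lanes */
  return _mm512_castsi512_ps(_mm512_slli_epi32(widened, 16));
}
```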
--#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX512) - #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - __at_align__ float values[Vectorized::size()]; \ -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -index 02aa3a87cc1..c35204f9da2 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized> { - private: -@@ -203,7 +204,7 @@ public: - auto abs = abs_(); - auto zero = _mm512_setzero_pd(); - auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm512_div_pd(values, abs); - return _mm512_mask_blend_pd(mask, div, zero); - } - __m512d real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -index a5d790c98b2..2801e484d94 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized> { - private: -@@ -708,7 +709,7 @@ public: - auto abs = abs_(); - auto zero = _mm512_setzero_ps(); - auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm512_div_ps(values, abs); - return _mm512_mask_blend_ps(mask, div, zero); - } - __m512 real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -index 27b2753c903..508ab257e60 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) -+#if (defined(CPU_CAPABILITY_AVX512)) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized { - private: -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -index ba5738687fd..a08df3c141a 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - 
inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized { - private: -@@ -246,14 +247,14 @@ public: - static __m512 vec_factorial_5 = - _mm512_set1_ps(0.00828929059f); // 1/factorial(5) - static __m512 vec_exp_log2ef = -- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) -+ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) - static __m512 vec_half = _mm512_set1_ps(0.5f); - static __m512 vec_one = _mm512_set1_ps(1.f); - static __m512 vec_zero = _mm512_set1_ps(0.f); - static __m512 vec_two = _mm512_set1_ps(2.f); -- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) -- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); -- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); -+ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) -+ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); -+ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); - static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); - static int n_mantissa_bits = 23; - -@@ -288,7 +289,7 @@ public: - auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); - auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); - vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; -+ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); - vec_two_pow_n = - _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); - -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -index e0713d01312..a5671ed4a50 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -@@ -42,11 +42,17 @@ namespace at { - namespace vec { - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - -+#ifdef _MSC_VER -+__declspec(align(64)) struct Vectorizedqi { -+ protected: -+ __m512i vals; -+#else - struct Vectorizedqi { - protected: - __m512i vals __attribute__((aligned(64))); -+#endif - - public: - Vectorizedqi() {} -@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { - } - - template --inline void __attribute__((always_inline)) QuantizeAvx512( -+__FORCE_INLINE void QuantizeAvx512( - const float* src, - T* dst, - int len, -@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { - Vectorized scale, - Vectorized zero_point, - Vectorized scale_neg_zp_premul) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { - float_vec_return_type dequantize( - Vectorized scale, - 
Vectorized zero_point) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { - } - - int_vec_return_type widening_subtract(Vectorized b) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512i int32_val0 = cvtepi8_epi32(int_val0); - __m512i int32_val1 = cvtepi8_epi32(int_val1); - __m512i int32_val2 = cvtepi8_epi32(int_val2); - __m512i int32_val3 = cvtepi8_epi32(int_val3); - -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -+ #else - __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); - __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); - __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); - __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -+ #endif - - __m512i int32_b0 = cvtepi8_epi32(int_b0); - __m512i int32_b1 = cvtepi8_epi32(int_b1); -@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { - Vectorized scale, - Vectorized zero_point, - Vectorized scale_zp_premul) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -@@ -746,10 +787,17 @@ struct Vectorized : public Vectorizedqi { - float_vec_return_type dequantize( - Vectorized scale, - Vectorized zero_point) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = 
_mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { - } - - int_vec_return_type widening_subtract(Vectorized b) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512i int32_val0 = cvtepu8_epi32(int_val0); - __m512i int32_val1 = cvtepu8_epi32(int_val1); - __m512i int32_val2 = cvtepu8_epi32(int_val2); - __m512i int32_val3 = cvtepu8_epi32(int_val3); - -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -+ #else - __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); - __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); - __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); - __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -+ #endif - - __m512i int32_b0 = cvtepu8_epi32(int_b0); - __m512i int32_b1 = cvtepu8_epi32(int_b1); -diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h -index adf81dd915c..20cb8ef6dbc 100644 ---- a/aten/src/ATen/cpu/vec/vec_base.h -+++ b/aten/src/ATen/cpu/vec/vec_base.h -@@ -36,6 +36,12 @@ - #include - #include - -+#if defined(__GNUC__) -+#define __FORCE_INLINE __attribute__((always_inline)) inline -+#elif defined(_MSC_VER) -+#define __FORCE_INLINE __forceinline -+#endif -+ - // These macros helped us unify vec_base.h - #ifdef CPU_CAPABILITY_AVX512 - #if defined(__GNUC__) -diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index a6b6f0f7d1d..15d37cf4861 100644 ---- a/caffe2/CMakeLists.txt -+++ b/caffe2/CMakeLists.txt -@@ -1787,7 +1787,7 @@ if(BUILD_TEST) - endif() - else() - add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") -- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) -+ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) - endif() - target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) - target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) -diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD -index 573f9c5b54a..f22a6e905e2 100644 ---- a/third_party/sleef.BUILD -+++ b/third_party/sleef.BUILD -@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ - SLEEF_PRIVATE_INCLUDES = [ - 
"-Iexternal/sleef/src/arch", - "-Iexternal/sleef/src/common", -+ "-Iexternal/sleef/src/libm", - ] - - SLEEF_PUBLIC_INCLUDES = [ -@@ -201,8 +202,6 @@ cc_library( - srcs = [ - "src/libm/rempitab.c", - "src/libm/sleefdp.c", -- "src/libm/sleefld.c", -- "src/libm/sleefqp.c", - "src/libm/sleefsp.c", - ], - hdrs = SLEEF_PUBLIC_HEADERS, --- -2.45.1 - diff --git a/0001-Improve-finding-and-using-the-rocm_version.h.patch b/0001-Improve-finding-and-using-the-rocm_version.h.patch deleted file mode 100644 index b8232c7..0000000 --- a/0001-Improve-finding-and-using-the-rocm_version.h.patch +++ /dev/null @@ -1,142 +0,0 @@ -From 201ac4618a1526e048a0d6c02d9bc4cf30bf0ee1 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Wed, 14 Aug 2024 17:18:38 -0700 -Subject: [PATCH] Improve finding and using the rocm_version.h - -On Fedora, the rocm_version.h's path is /usr/include/rocm_version.h -So we have this build error -pytorch/aten/src/ATen/hip/tunable/Tunable.cpp:40:10: fatal error: - rocm-core/rocm_version.h: No such file or directory - 40 | #include - | ^~~~~~~~~~~~~~~~~~~~~~~~~~ - -In other cases, depending on the rocm release either -/opt/rocm/include or /opt/rocm/include/rocm-core - -Convert the EXISTS() checks into a find_path. -Add a -I${ROCM_VERSION_DIR} to the compile options so it can be -found by Tunable.cpp - -Signed-off-by: Tom Rix ---- - aten/src/ATen/cuda/tunable/Tunable.cpp | 2 +- - cmake/Dependencies.cmake | 1 + - cmake/public/LoadHIP.cmake | 72 ++++++++++---------------- - 3 files changed, 30 insertions(+), 45 deletions(-) - -diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp -index 1b7c89875855..32c1d70f3152 100644 ---- a/aten/src/ATen/cuda/tunable/Tunable.cpp -+++ b/aten/src/ATen/cuda/tunable/Tunable.cpp -@@ -36,7 +36,7 @@ - - // for validators - #ifdef USE_ROCM --#include -+#include - #define ROCBLAS_BETA_FEATURES_API - #include - #include -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 7ef8eabb5162..61bc4d7a54b6 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1063,6 +1063,7 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - list(APPEND HIP_CXX_FLAGS -std=c++17) - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) -+ list(APPEND HIP_CXX_FLAGS -I${ROCM_VERSION_DIR}) - if(HIP_NEW_TYPE_ENUMS) - list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) - endif() -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1c0d3a203991..6a7e3bd163f5 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -42,55 +42,39 @@ find_package_and_print_version(HIP 1.0) - - if(HIP_FOUND) - set(PYTORCH_FOUND_HIP TRUE) -- set(FOUND_ROCM_VERSION_H FALSE) -- - set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}") -- set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") - - # Find ROCM version for checks - # ROCM 5.0 and later will have header api for version management -- if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h) -- set(FOUND_ROCM_VERSION_H TRUE) -- file(WRITE ${file} "" -- "#include \n" -- ) -- elseif(EXISTS ${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h) -- set(FOUND_ROCM_VERSION_H TRUE) -- file(WRITE ${file} "" -- "#include \n" -- ) -- else() -- message("********************* rocm_version.h couldnt be found ******************\n") -- endif() -- -- if(FOUND_ROCM_VERSION_H) -- file(APPEND ${file} "" -- "#include \n" -- -- "#ifndef ROCM_VERSION_PATCH\n" -- "#define ROCM_VERSION_PATCH 0\n" -- "#endif\n" -- "#define STRINGIFYHELPER(x) #x\n" -- "#define 
STRINGIFY(x) STRINGIFYHELPER(x)\n" -- "int main() {\n" -- " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" -- " return 0;\n" -- "}\n" -- ) -- -- try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} -- CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}" -- RUN_OUTPUT_VARIABLE rocm_version_from_header -- COMPILE_OUTPUT_VARIABLE output_var -- ) -- # We expect the compile to be successful if the include directory exists. -- if(NOT compile_result) -- message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) -- endif() -- message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) -- set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) -- message("\n***** ROCm version from rocm_version.h ****\n") -+ find_path(ROCM_VERSION_DIR rocm_version.h HINTS ${ROCM_INCLUDE_DIRS} ${ROCM_INCLUDE_DIRS}/rocm-core) -+ set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") -+ file(WRITE ${file} "" -+ "#include \n" -+ "#include \n" -+ -+ "#ifndef ROCM_VERSION_PATCH\n" -+ "#define ROCM_VERSION_PATCH 0\n" -+ "#endif\n" -+ "#define STRINGIFYHELPER(x) #x\n" -+ "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" -+ "int main() {\n" -+ " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" -+ " return 0;\n" -+ "}\n" -+ ) -+ -+ try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} -+ CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_VERSION_DIR}" -+ RUN_OUTPUT_VARIABLE rocm_version_from_header -+ COMPILE_OUTPUT_VARIABLE output_var -+ ) -+ # We expect the compile to be successful if the include directory exists. -+ if(NOT compile_result) -+ message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) - endif() -+ message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) -+ set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) -+ message("\n***** ROCm version from rocm_version.h ****\n") - - string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+).*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) - --- -2.46.0 - diff --git a/0001-Optionally-use-hipblaslt.patch b/0001-Optionally-use-hipblaslt.patch deleted file mode 100644 index 1e5ca4b..0000000 --- a/0001-Optionally-use-hipblaslt.patch +++ /dev/null @@ -1,506 +0,0 @@ -From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 04:18:34 -0700 -Subject: [PATCH] Optionally use hipblaslt - ---- - aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ - aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ - aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- - aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- - aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- - cmake/Dependencies.cmake | 3 ++ - cmake/public/LoadHIP.cmake | 2 +- - 7 files changed, 82 insertions(+), 19 deletions(-) - -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index ce991a9bcad4..3f0d17b52778 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -14,7 +14,9 @@ - #include - - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - // until hipblas has an API to accept flags, we must use rocblas here - #include - #include -@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { - static size_t _parseChosenWorkspaceSize() { - const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); - #ifdef USE_ROCM -+#ifndef USE_HIPBLASLT -+ return 0; -+#endif - if (!val) { - // accept either env var 
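[Editor's sketch] The early `return 0` added above turns the workspace query into a no-op when ROCm is built without hipBLASLt; otherwise execution falls through to the dual environment-variable lookup that follows. Condensed, that lookup amounts to the sketch below — `lt_workspace_env` is a hypothetical name, not the PyTorch function:

```c
#include <stdlib.h>

/* Prefer the cuBLASLt spelling of the variable; accept the hipBLASLt
 * spelling as an alias. */
static const char* lt_workspace_env(void) {
  const char* val = getenv("CUBLASLT_WORKSPACE_SIZE");
  return val ? val : getenv("HIPBLASLT_WORKSPACE_SIZE");
}
```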
- val = getenv("HIPBLASLT_WORKSPACE_SIZE"); -@@ -235,6 +240,7 @@ namespace at::cuda::blas { - } while (0) - - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - namespace { - // Following the pattern of CuSparseDescriptor - // Defined here for now because this is the only place cublas_lt interface is -@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< - }; - } // namespace - -- - template - inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - cudaDataType_t abcType = CUDA_R_32F; -@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - " scaleType ", - scaleType); - } -- -+#endif - - template - inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { -@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); - } - } -@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); - } - } -@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } - } -@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { - } - } - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { - // forward to bgemm implementation but set strides and batches to 0 - bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); - } -+#endif - - template - inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { -@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); - } - } -@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); - } - } -@@ 
-1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } - } -@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { - } - } - -- -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - void gemm_and_bias( - bool transpose_mat1, -@@ -1410,7 +1435,7 @@ void scaled_gemm( - ScalarType result_dtype, - void* amax_ptr, - bool use_fast_accum) { --#if CUDA_VERSION >= 11080 || defined(USE_ROCM) -+#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - const auto computeType = CUBLAS_COMPUTE_32F; - const auto scaleType = CUDA_R_32F; - const int8_t fastAccuMode = use_fast_accum ? 1 : 0; -@@ -1681,6 +1706,7 @@ void int8_gemm( - " scaleType ", - scaleType); - } -+#endif - - template <> - void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { -diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h -index f2b657ced51b..f0ee613c4208 100644 ---- a/aten/src/ATen/cuda/CUDAContextLight.h -+++ b/aten/src/ATen/cuda/CUDAContextLight.h -@@ -9,7 +9,9 @@ - - // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also - // added bf16 support -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - #include -+#endif - - #ifdef CUDART_VERSION - #include -@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); - /* Handles */ - TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); - TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); -+#endif - - TORCH_CUDA_CPP_API void clearCublasWorkspaces(); - -diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index 8eac525b3695..abfdf7a23847 100644 ---- a/aten/src/ATen/cuda/CublasHandlePool.cpp -+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp -@@ -29,7 +29,7 @@ namespace at::cuda { - - namespace { - --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - void createCublasLtHandle(cublasLtHandle_t *handle) { - TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); - } -@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { - return handle; - } - --cublasLtHandle_t getCurrentCUDABlasLtHandle() { - #ifdef USE_ROCM -+#if defined(USE_HIPBLASLT) -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - c10::DeviceIndex device = 0; - AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); - -@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { - - auto handle = myPoolWindow->reserve(device); - return handle; -+} -+#endif - #else -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - return reinterpret_cast(getCurrentCUDABlasHandle()); --#endif - } -+#endif - - } // namespace at::cuda -diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h -index 53e6154120c9..fa1d664696db 100644 ---- a/aten/src/ATen/cuda/tunable/TunableGemm.h -+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h -@@ -11,7 +11,9 @@ - - #include - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - #include - 
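[Editor's sketch] Throughout this patch the hipBLASLt-only paths are bracketed by one condition, spelled in the hunks as `!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))` — logically equivalent to `!defined(USE_ROCM) || defined(USE_HIPBLASLT)`, i.e. "CUDA, or ROCm with hipBLASLt found". Reduced to a skeleton with a hypothetical helper:

```c
/* Resolves to 1 on CUDA builds and on ROCm builds where CMake found
 * hipblaslt (which appends -DUSE_HIPBLASLT to the HIP C++ flags in the
 * Dependencies.cmake hunk below); 0 forces the rocBLAS-only fallback. */
static int lt_backend_available(void) {
#if !defined(USE_ROCM) || defined(USE_HIPBLASLT)
  return 1;
#else
  return 0;
#endif
}
```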
#endif - #include -@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class DefaultScaledGemmOp : public Callable> { - public: -@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { - return OK; - } - }; -+#endif - - template - inline bool IsZero(T v) { -@@ -191,6 +195,7 @@ static void AddRocblasValidator() { - } - } - -+#ifdef USE_HIPBLASLT - static void AddHipblasltValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - if (validators.find("HIPBLASLT_VERSION") == validators.end()) { -@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { - [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); - } - } -+#endif - - static void AddRocmValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); -@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class ScaledGemmTunableOp : public TunableOp, StreamTimer> { - public: -@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - - #if defined(USE_ROCM) -+#ifdef USE_HIPBLASLT - for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { - this->RegisterOp(std::move(name), std::move(op)); - } - AddHipblasltValidator(); -+#endif - AddRocmValidator(); - #endif - } -@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); - } - }; -+#endif - - #undef XSTRINGIFY - #undef STRINGIFY -diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 84c59a4fd0d7..56ad5de3bf2d 100644 ---- a/aten/src/ATen/native/cuda/Blas.cpp -+++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa - } - - static bool getDisableAddmmCudaLt() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); - #ifdef USE_ROCM - // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) -@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { - } - return false; - #endif -+#else -+ return true; -+#endif - } - - #ifdef USE_ROCM - static bool 
isSupportedHipLtROCmArch(int index) { -+#ifdef USE_HIPBLASLT - hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); - std::string device_arch = prop->gcnArchName; - static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { - } - } - TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); -+#endif - return false; - } - #endif -@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - at::ScalarType scalar_type = self.scalar_type(); - c10::MaybeOwned self_; - if (&result != &self) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) - // Strangely, if mat2 has only 1 row or column, we get - // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. -@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - scalar_type != at::ScalarType::BFloat16)); - #endif - } -+#endif - #endif - if (!useLtInterface) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - } - self__sizes = self_->sizes(); - } else { --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - useLtInterface = !disable_addmm_cuda_lt && - result.dim() == 2 && result.is_contiguous() && - isSupportedHipLtROCmArch(self.device().index()) && -@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); - - if (useLtInterface) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if defined(USE_ROCM) - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, -@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - activation_epilogue - ); - }); -+#endif - #endif - } else - { -@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { - } - - static bool _scaled_mm_allowed_device() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - auto dprops = at::cuda::getCurrentDeviceProperties(); - #ifdef USE_ROCM - std::string device_arch = dprops->gcnArchName; -@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { - #else - return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); - #endif -+#else -+ return false; -+#endif - } - - // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax -@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - // Check sizes - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); - TORCH_CHECK( -@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 - // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately - amax = at::max(at::abs(out.to(kFloat))); -+#endif - #endif - - return {out, amax}; -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..8d05e834bbc5 100644 ---- 
a/cmake/Dependencies.cmake
-+++ b/cmake/Dependencies.cmake
-@@ -1052,6 +1052,9 @@ if(USE_ROCM)
- list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
- list(APPEND HIP_CXX_FLAGS -std=c++17)
- list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2)
-+ if(hipblaslt_FOUND)
-+ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT)
-+ endif()
- if(HIP_NEW_TYPE_ENUMS)
- list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS)
- endif()
-diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
-index fa39156031ff..df4836847fdf 100644
----- a/cmake/public/LoadHIP.cmake
-+++ b/cmake/public/LoadHIP.cmake
-@@ -155,7 +155,7 @@ if(HIP_FOUND)
- find_package_and_print_version(hiprand REQUIRED)
- find_package_and_print_version(rocblas REQUIRED)
- find_package_and_print_version(hipblas REQUIRED)
-- find_package_and_print_version(hipblaslt REQUIRED)
-+ find_package_and_print_version(hipblaslt)
- find_package_and_print_version(miopen REQUIRED)
- find_package_and_print_version(hipfft REQUIRED)
- find_package_and_print_version(hipsparse REQUIRED)
---
-2.45.2
-
diff --git a/0001-Patch-for-sleef-3.6.patch b/0001-Patch-for-sleef-3.6.patch
deleted file mode 100644
index 13aa208..0000000
--- a/0001-Patch-for-sleef-3.6.patch
+++ /dev/null
@@ -1,952 +0,0 @@
-From 273f23698c887b52e66c2abec8101b7398f0f9c4 Mon Sep 17 00:00:00 2001
-From: "Benjamin A. Beasley"
-Date: Wed, 5 Jun 2024 11:06:02 -0400
-Subject: [PATCH] Patch for sleef 3.6
-
----
- ...ectorization-on-windows-submodule-sl.patch | 910 ++++++++++++++++++
- python-torch.spec | 11 +
- 2 files changed, 921 insertions(+)
- create mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
-
-diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
-new file mode 100644
-index 000000000000..562f55b742c2
---- /dev/null
-+++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
-@@ -0,0 +1,910 @@
-+From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001
-+From: Xu Han
-+Date: Sun, 31 Mar 2024 03:07:32 +0000
-+Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef]
-+ (#118980)
-+
-+Enable VEC on Windows OS.
-+1. Fix some type definition gaps between Windows and Linux.
-+2. Fix some operators not supported on Windows, such as [] and /.
-+3. Enable static sleef library build on Windows.
-+4. Disable unsupported function overloading on MSVC.
-+5. Upgrade the sleef submodule, which fixes the build issue on Windows.
-+6. Fixed bazel build issues.
-+7. Fix test apps not linking to sleef on Windows.
-+
-+Note: If the rebuild fails after pulling this PR, please sync the `sleef` submodule by running:
-+```cmd
-+git submodule sync
-+git submodule update --init --recursive
-+```
-+
-+Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980
-+Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet
-+---
-+ aten/src/ATen/CMakeLists.txt | 48 ++++++--------
-+ aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++--
-+ .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++--
-+ .../cpu/vec/vec256/vec256_complex_double.h | 7 +-
-+ .../cpu/vec/vec256/vec256_complex_float.h | 7 +-
-+ aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +-
-+ aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++--
-+ aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++-
-+ aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++--
-+ .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++--
-+ .../cpu/vec/vec512/vec512_complex_double.h | 7 +-
-+ .../cpu/vec/vec512/vec512_complex_float.h | 7 +-
-+ aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +-
-+ aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++--
-+ aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++-
-+ aten/src/ATen/cpu/vec/vec_base.h | 6 ++
-+ caffe2/CMakeLists.txt | 2 +-
-+ third_party/sleef.BUILD | 3 +-
-+ 18 files changed, 194 insertions(+), 93 deletions(-)
-+
-+diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
-+index bf425af5fa9..58d5828e8ca 100644
-+--- a/aten/src/ATen/CMakeLists.txt
-++++ b/aten/src/ATen/CMakeLists.txt
-+@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$")
-+ list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo)
-+ endif()
-+
-+-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
-+- # Preserve values for the main build
-+- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS})
-+- set(__aten_sleef_build_tests ${BUILD_TESTS})
-+-
-+- # Unset our restrictive C++ flags here and reset them later.
-+- # Remove this once we use proper target_compile_options.
-+- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -+- set(CMAKE_CXX_FLAGS) -+- -+- # Bump up optimization level for sleef to -O1, since at -O0 the compiler -+- # excessively spills intermediate vector registers to the stack -+- # and makes things run impossibly slowly -+- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -+- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -+- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+- else() -+- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -++ if(NOT MSVC) -++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler -++ # excessively spills intermediate vector registers to the stack -++ # and makes things run impossibly slowly -++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -++ else() -++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -++ endif() -+ endif() -+ -+ if(NOT USE_SYSTEM_SLEEF) -+- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -+- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -+- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -+- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -+- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) -++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -++ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." FORCE) -+ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") -+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") -+ set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) -+@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+ endif() -+ list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) -+ -+- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) -+- -+- # Set these back. TODO: Use SLEEF_ to pass these instead -+- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) -+- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) -++ if(NOT MSVC) -++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -++ endif() -+ endif() -+ -+ if(USE_CUDA AND NOT USE_ROCM) -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h -+index 800b027e469..c431fa3c605 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h -+@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { -+ } -+ -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) -+ } -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. 
-+ template -+ std::enable_if_t> -+ inline gather(const double* base_addr, const Vectorized& vindex) { -+@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { -+ return _mm256_i32gather_ps(base_addr, vindex, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline mask_gather(const Vectorized& src, const double* base_addr, -+@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, -+ const Vectorized& vindex, Vectorized& mask) { -+ return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+ // Only works for inputs in the range: [-2^51, 2^51] -+@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { -+ return flip8(v); -+ } -+ -+-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // (defined(CPU_CAPABILITY_AVX2) -+ -+ }} // namepsace at::vec::CPU_CAPABILITY -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+index 3e26213d6d2..66557436c70 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -18,7 +19,18 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++ -++#ifndef SLEEF_CONST -++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -++#define SLEEF_CONST const -++#else -++#define SLEEF_CONST -++#endif -++#define SLEEF_CONST_OLD SLEEF_CONST -++#else -++#define SLEEF_CONST_OLD -++#endif -+ -+ // bfloat16 conversion -+ static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { -+@@ -265,7 +277,8 @@ public: -+ } -+ return b; -+ } -+- Vectorized map(const __m256 (*const vop)(__m256)) const { -++ -++ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { -+ __m256 lo, hi; -+ cvt_to_fp32(values, lo, hi); -+ const auto o1 = vop(lo); -+@@ -1026,7 +1039,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_VECTORIZED_INIT(Half, half); -+ -+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX2) -+ -+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ -+ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -+@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_NON_VECTORIZED_INIT(Half, half); -+ -+-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX2) -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ auto values = _mm_loadu_si128(reinterpret_cast(data)); \ -+@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec -+ 
LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); -+ LOAD_FP32_VECTORIZED_INIT(Half, fp16); -+ -+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX2) -+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ __at_align__ float values[Vectorized::size()]; \ -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+index f93ea1e63c3..6c198fb37d3 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+@@ -8,7 +8,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized> { -+ private: -+@@ -145,7 +146,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm256_setzero_pd(); -+ auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm256_div_pd(values, abs); -+ return _mm256_blendv_pd(div, zero, mask); -+ } -+ __m256d real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+index 7c142c04b79..c72d4d49274 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized> { -+ private: -+@@ -180,7 +181,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm256_setzero_ps(); -+ auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm256_div_ps(values, abs); -+ return _mm256_blendv_ps(div, zero, mask); -+ } -+ __m256 real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+index bc82d07edd1..bed6da627af 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace at::vec { -+ inline namespace CPU_CAPABILITY { -+ -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized { -+ private: -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+index 886809a0b8a..0e3664cd37b 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS 
-+ #include -+ #endif -+ -+@@ -14,7 +15,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized { -+ private: -+@@ -226,14 +227,14 @@ public: -+ static __m256 vec_factorial_5 = -+ _mm256_set1_ps(0.00828929059f); // 1/factorial(5) -+ static __m256 vec_exp_log2ef = -+- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) -++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) -+ static __m256 vec_half = _mm256_set1_ps(0.5f); -+ static __m256 vec_one = _mm256_set1_ps(1.f); -+ static __m256 vec_zero = _mm256_set1_ps(0.f); -+ static __m256 vec_two = _mm256_set1_ps(2.f); -+- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) -+- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50); -+- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); -++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) -++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); -++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); -+ static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); -+ static int n_mantissa_bits = 23; -+ -+@@ -266,7 +267,7 @@ public: -+ auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); -+ auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); -+ vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -+- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; -++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); -+ vec_two_pow_n = -+ _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); -+ -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+index 4128841701a..85e099904cd 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+@@ -41,11 +41,17 @@ -+ namespace at::vec { -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -++#ifdef _MSC_VER -++__declspec(align(64)) struct Vectorizedqi { -++ protected: -++ __m256i vals; -++#else -+ struct Vectorizedqi { -+ protected: -+ __m256i vals __attribute__((aligned(64))); -++#endif -+ -+ public: -+ Vectorizedqi() {} -+@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { -+ } -+ -+ template -+-inline void __attribute__((always_inline)) QuantizeAvx2( -++__FORCE_INLINE void QuantizeAvx2( -+ const float* src, -+ T* dst, -+ int len, -+@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V -+ return a.maximum(b); -+ } -+ -+-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // if defined(CPU_CAPABILITY_AVX2) -+ }} // namespace at::vec::CPU_CAPABILITY -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h -+index fe96d123e64..87f723d782c 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h -+@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { -+ } -+ -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) -+ } -+ -+ 
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline gather(const double* base_addr, const Vectorized& vindex) { -+@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { -+ return _mm512_i32gather_ps(vindex, base_addr, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline mask_gather(const Vectorized& src, const double* base_addr, -+@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, -+ auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); -+ return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+ template<> -+@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { -+ return flip8(v); -+ } -+ -+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX512) -+ -+ }}} -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+index f9fc92d52bf..eb3b6a72240 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,18 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++ -++#ifndef SLEEF_CONST -++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -++#define SLEEF_CONST const -++#else -++#define SLEEF_CONST -++#endif -++#define SLEEF_CONST_OLD SLEEF_CONST -++#else -++#define SLEEF_CONST_OLD -++#endif -+ -+ // bfloat16 conversion -+ static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { -+@@ -362,7 +374,8 @@ public: -+ } -+ #pragma clang diagnostic push -+ #pragma clang diagnostic ignored "-Wignored-qualifiers" -+- Vectorized map(const __m512 (*const vop)(__m512)) const { -++ -++ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { -+ __m512 lo, hi; -+ cvt_to_fp32(values, lo, hi); -+ const auto o1 = vop(lo); -+@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_VECTORIZED_INIT(Half, half); -+ -+-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#else //defined(CPU_CAPABILITY_AVX512) -+ -+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ -+ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -+@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_NON_VECTORIZED_INIT(Half, half); -+ -+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX512) -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, 
Vectorized& out) { \ -+ auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ -+@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec -+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); -+ LOAD_FP32_VECTORIZED_INIT(Half, fp16); -+ -+-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX512) -+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ __at_align__ float values[Vectorized::size()]; \ -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+index 02aa3a87cc1..c35204f9da2 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized> { -+ private: -+@@ -203,7 +204,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm512_setzero_pd(); -+ auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm512_div_pd(values, abs); -+ return _mm512_mask_blend_pd(mask, div, zero); -+ } -+ __m512d real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+index a5d790c98b2..2801e484d94 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized> { -+ private: -+@@ -708,7 +709,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm512_setzero_ps(); -+ auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm512_div_ps(values, abs); -+ return _mm512_mask_blend_ps(mask, div, zero); -+ } -+ __m512 real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+index 27b2753c903..508ab257e60 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) -++#if (defined(CPU_CAPABILITY_AVX512)) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized { -+ private: -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+index ba5738687fd..a08df3c141a 100644 -+--- 
a/aten/src/ATen/cpu/vec/vec512/vec512_float.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized { -+ private: -+@@ -246,14 +247,14 @@ public: -+ static __m512 vec_factorial_5 = -+ _mm512_set1_ps(0.00828929059f); // 1/factorial(5) -+ static __m512 vec_exp_log2ef = -+- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) -++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) -+ static __m512 vec_half = _mm512_set1_ps(0.5f); -+ static __m512 vec_one = _mm512_set1_ps(1.f); -+ static __m512 vec_zero = _mm512_set1_ps(0.f); -+ static __m512 vec_two = _mm512_set1_ps(2.f); -+- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) -+- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); -+- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); -++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) -++ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); -++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); -+ static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); -+ static int n_mantissa_bits = 23; -+ -+@@ -288,7 +289,7 @@ public: -+ auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); -+ auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); -+ vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -+- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; -++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); -+ vec_two_pow_n = -+ _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); -+ -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+index e0713d01312..a5671ed4a50 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+@@ -42,11 +42,17 @@ namespace at { -+ namespace vec { -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -++#ifdef _MSC_VER -++__declspec(align(64)) struct Vectorizedqi { -++ protected: -++ __m512i vals; -++#else -+ struct Vectorizedqi { -+ protected: -+ __m512i vals __attribute__((aligned(64))); -++#endif -+ -+ public: -+ Vectorizedqi() {} -+@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { -+ } -+ -+ template -+-inline void __attribute__((always_inline)) QuantizeAvx512( -++__FORCE_INLINE void QuantizeAvx512( -+ const float* src, -+ T* dst, -+ int len, -+@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { -+ Vectorized scale, -+ Vectorized zero_point, -+ Vectorized scale_neg_zp_premul) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = 
_mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -+@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { -+ float_vec_return_type dequantize( -+ Vectorized scale, -+ Vectorized zero_point) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -+@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { -+ } -+ -+ int_vec_return_type widening_subtract(Vectorized b) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512i int32_val0 = cvtepi8_epi32(int_val0); -+ __m512i int32_val1 = cvtepi8_epi32(int_val1); -+ __m512i int32_val2 = cvtepi8_epi32(int_val2); -+ __m512i int32_val3 = cvtepi8_epi32(int_val3); -+ -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -++ #else -+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -++ #endif -+ -+ __m512i int32_b0 = cvtepi8_epi32(int_b0); -+ __m512i int32_b1 = cvtepi8_epi32(int_b1); -+@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { -+ Vectorized scale, -+ Vectorized zero_point, -+ Vectorized scale_zp_premul) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 
= _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -+@@ -746,10 +787,17 @@ struct Vectorized : public Vectorizedqi { -+ float_vec_return_type dequantize( -+ Vectorized scale, -+ Vectorized zero_point) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -+@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { -+ } -+ -+ int_vec_return_type widening_subtract(Vectorized b) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512i int32_val0 = cvtepu8_epi32(int_val0); -+ __m512i int32_val1 = cvtepu8_epi32(int_val1); -+ __m512i int32_val2 = cvtepu8_epi32(int_val2); -+ __m512i int32_val3 = cvtepu8_epi32(int_val3); -+ -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -++ #else -+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -++ #endif -+ -+ __m512i int32_b0 = cvtepu8_epi32(int_b0); -+ __m512i int32_b1 = cvtepu8_epi32(int_b1); -+diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h -+index adf81dd915c..20cb8ef6dbc 100644 -+--- a/aten/src/ATen/cpu/vec/vec_base.h -++++ b/aten/src/ATen/cpu/vec/vec_base.h -+@@ -36,6 +36,12 @@ -+ #include -+ #include -+ -++#if defined(__GNUC__) -++#define __FORCE_INLINE __attribute__((always_inline)) inline -++#elif defined(_MSC_VER) -++#define __FORCE_INLINE __forceinline -++#endif -++ -+ // These macros helped us unify vec_base.h -+ #ifdef CPU_CAPABILITY_AVX512 -+ #if defined(__GNUC__) -+diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -+index a6b6f0f7d1d..15d37cf4861 100644 -+--- a/caffe2/CMakeLists.txt -++++ b/caffe2/CMakeLists.txt -+@@ -1787,7 +1787,7 @@ if(BUILD_TEST) -+ endif() -+ else() -+ 
add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")
-+- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main)
-++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main)
-+ endif()
-+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $)
-+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $)
-+diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD
-+index 573f9c5b54a..f22a6e905e2 100644
-+--- a/third_party/sleef.BUILD
-++++ b/third_party/sleef.BUILD
-+@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [
-+ SLEEF_PRIVATE_INCLUDES = [
-+ "-Iexternal/sleef/src/arch",
-+ "-Iexternal/sleef/src/common",
-++ "-Iexternal/sleef/src/libm",
-+ ]
-+
-+ SLEEF_PUBLIC_INCLUDES = [
-+@@ -201,8 +202,6 @@ cc_library(
-+ srcs = [
-+ "src/libm/rempitab.c",
-+ "src/libm/sleefdp.c",
-+- "src/libm/sleefld.c",
-+- "src/libm/sleefqp.c",
-+ "src/libm/sleefsp.c",
-+ ],
-+ hdrs = SLEEF_PUBLIC_HEADERS,
-+--
-+2.45.1
-+
-diff --git a/python-torch.spec b/python-torch.spec
-index d50687a5174a..63600c2e8c39 100644
---- a/python-torch.spec
-+++ b/python-torch.spec
-@@ -176,6 +176,17 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch
- Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch
- %endif
-
-+# Enable x86 CPU vectorization on windows [submodule sleef] (#118980)
-+# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370
-+#
-+# Despite the title, this patch fixes compatibility with sleef 3.6 by including
-+# a backwards-compatible version of the fix from
-+# https://github.com/pytorch/pytorch/pull/122723.
-+# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef
-+# git submodule (because the release archive contains an actual sleef source
-+# tree instead, so this would not apply.)
-+Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
-+
- %if %{with rocm}
- # ROCm patches
- # https://github.com/pytorch/pytorch/pull/120551
---
-2.45.1
-
diff --git a/0001-Reenable-dim-for-python-3.12.patch b/0001-Reenable-dim-for-python-3.12.patch
deleted file mode 100644
index 138b5d4..0000000
--- a/0001-Reenable-dim-for-python-3.12.patch
+++ /dev/null
@@ -1,115 +0,0 @@
-From ee3fb343a376cdba6f4ce188cac90023f13e2aea Mon Sep 17 00:00:00 2001
-From: Tom Rix
-Date: Thu, 4 Apr 2024 14:21:38 -0600
-Subject: [PATCH] Reenable dim for python 3.12
-
-In 3.12:
-
-_PyArg_Parser added an element to the start of the structure,
-so existing positional initialization is now off by one. Switch to
-designated element initialization.
-
-_Py_CODEUNIT changed from an int to a union, but relevant_op
-is passed an int for the return of decoder.opcode, so the parameter
-type is wrong; switch it to int.
-
-The opcode PRECALL was removed, so its handling is reduced to 3.11.
-
-Signed-off-by: Tom Rix
----
- functorch/csrc/dim/dim.cpp | 24 +++++-------------------
- functorch/csrc/dim/minpybind.h | 4 ++--
- 2 files changed, 7 insertions(+), 21 deletions(-)
-
-diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp
-index 4cc027504c77..e48b0d58081f 100644
----- a/functorch/csrc/dim/dim.cpp
-+++ b/functorch/csrc/dim/dim.cpp
-@@ -6,20 +6,6 @@
-
- #include
-
---
---// Many APIs have changed/don't exist anymore
---#if IS_PYTHON_3_12_PLUS
---
---#include "dim.h"
---
---// Re-enable this some day
---PyObject* Dim_init() {
--- PyErr_SetString(PyExc_RuntimeError, "First class dim doesn't work with python 3.12");
--- return nullptr;
---}
---
---#else
---
- #include "minpybind.h"
- #include
- #include
-@@ -441,7 +427,7 @@ static PyObject* DimList_bind(DimList *self,
- PY_BEGIN
- mpy::handle sizes;
- static const char * const _keywords[] = {"sizes", nullptr};
-- static _PyArg_Parser parser = {"O", _keywords, 0};
-+ static _PyArg_Parser parser = { .format = "O", .keywords = _keywords};
- if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, &sizes)) {
- return nullptr;
- }
-@@ -465,7 +451,7 @@ static PyObject* DimList_bind_len(DimList *self,
- PY_BEGIN
- int size;
- static const char * const _keywords[] = {"N", nullptr};
-- static _PyArg_Parser parser = {"i", _keywords, 0};
-+ static _PyArg_Parser parser = { .format = "i", .keywords = _keywords};
- if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, &size)) {
- return nullptr;
- }
-@@ -1468,7 +1454,7 @@ PyTypeObject Tensor::Type = {
-
- // dim() --------------------
-
---static bool relevant_op(_Py_CODEUNIT c) {
-+static bool relevant_op(int c) {
- switch(c) {
- case STORE_NAME:
- case STORE_GLOBAL:
-@@ -1587,7 +1573,7 @@ static PyObject* _dims(PyObject *self,
- auto c = mpy::obj::steal(PyFrame_GetCode(f.ptr()));
- auto lasti = PyFrame_GetLasti(f.ptr());
- auto decoder = PyInstDecoder(c.ptr(), lasti);
-- #if IS_PYTHON_3_11_PLUS
-+ #if IS_PYTHON_3_11
- // When py3.11 adapts bytecode lasti points to the precall
- // rather than the call instruction after it
- if (decoder.opcode() == PRECALL) {
-@@ -3268,4 +3254,4 @@ PyObject* Dim_init() {
- }
- }
-
---#endif
-+
-diff --git a/functorch/csrc/dim/minpybind.h b/functorch/csrc/dim/minpybind.h
-index de82b5af95a4..d76d4828bf80 100644
----- a/functorch/csrc/dim/minpybind.h
-+++ b/functorch/csrc/dim/minpybind.h
-@@ -621,7 +621,7 @@ struct vector_args {
- PyObject *dummy = NULL;
- _PyArg_ParseStackAndKeywords((PyObject*const*)args, nargs, kwnames.ptr(), _parser, &dummy, &dummy, &dummy, &dummy, &dummy);
- #else
-- _PyArg_Parser* _parser = new _PyArg_Parser{NULL, &names_buf[0], fname_cstr, 0};
-+ _PyArg_Parser* _parser = new _PyArg_Parser{ .keywords = &names_buf[0], .fname = fname_cstr};
- std::unique_ptr buf(new PyObject*[names.size()]);
- _PyArg_UnpackKeywords((PyObject*const*)args, nargs, NULL, kwnames.ptr(), _parser, required, (Py_ssize_t)values.size() - kwonly, 0, &buf[0]);
- #endif
-@@ -706,7 +706,7 @@ inline object handle::call_vector(vector_args args) {
- #define MPY_PARSE_ARGS_KWNAMES(fmt, FORALL_ARGS) \
- static const char * const kwlist[] = { FORALL_ARGS(MPY_ARGS_NAME) nullptr}; \
- FORALL_ARGS(MPY_ARGS_DECLARE) \
-- static _PyArg_Parser parser = {fmt, kwlist, 0}; \
-+ static _PyArg_Parser parser = { .format = fmt, .keywords = kwlist}; \
- if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, FORALL_ARGS(MPY_ARGS_POINTER) nullptr)) { \
- throw mpy::exception_set(); \
- }
---
-2.44.0
-
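As an aside, a minimal sketch (not taken from the patch above) of the
_PyArg_Parser breakage its commit message describes; the designated-member
names are the ones the patch itself uses, and the exact 3.12 struct layout
is assumed here rather than spelled out:

```c
/* CPython 3.12 added a new element at the start of _PyArg_Parser, so
 * positional initialization now fills every field one slot too early. */
static const char * const _keywords[] = {"sizes", NULL};

/* pre-3.12 style: breaks silently on 3.12 */
/* static _PyArg_Parser parser = {"O", _keywords, 0}; */

/* designated initializers keep working on both 3.11 and 3.12 */
static _PyArg_Parser parser = { .format = "O", .keywords = _keywords };
```
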
diff --git a/0001-Regenerate-flatbuffer-header.patch b/0001-Regenerate-flatbuffer-header.patch
deleted file mode 100644
index 4eec491..0000000
--- a/0001-Regenerate-flatbuffer-header.patch
+++ /dev/null
@@ -1,39 +0,0 @@
-From 5b8e51b24513fa851eeff42f23d942bde301e321 Mon Sep 17 00:00:00 2001
-From: Tom Rix
-Date: Fri, 29 Sep 2023 06:19:29 -0700
-Subject: [PATCH] Regenerate flatbuffer header
-
-For this error:
-torch/csrc/jit/serialization/mobile_bytecode_generated.h:12:41:
-error: static assertion failed: Non-compatible flatbuffers version included
- 12 | FLATBUFFERS_VERSION_MINOR == 3 &&
-
-PyTorch is expecting 23.3.3, which is what F38 has.
-Rawhide is at 23.5.26.
-
-Regenerate with
-flatc --cpp --gen-mutable --no-prefix --scoped-enums mobile_bytecode.fbs
-
-Signed-off-by: Tom Rix
----
- torch/csrc/jit/serialization/mobile_bytecode_generated.h | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/torch/csrc/jit/serialization/mobile_bytecode_generated.h b/torch/csrc/jit/serialization/mobile_bytecode_generated.h
-index cffe8bc7a6..83575e4c19 100644
----- a/torch/csrc/jit/serialization/mobile_bytecode_generated.h
-+++ b/torch/csrc/jit/serialization/mobile_bytecode_generated.h
-@@ -9,8 +9,8 @@
- // Ensure the included flatbuffers.h is the same version as when this file was
- // generated, otherwise it may not be compatible.
- static_assert(FLATBUFFERS_VERSION_MAJOR == 23 &&
-- FLATBUFFERS_VERSION_MINOR == 3 &&
-- FLATBUFFERS_VERSION_REVISION == 3,
-+ FLATBUFFERS_VERSION_MINOR == 5 &&
-+ FLATBUFFERS_VERSION_REVISION == 26,
- "Non-compatible flatbuffers version included");
-
- namespace torch {
---
-2.43.0
-
diff --git a/0001-Stub-in-kineto-ActivityType.patch b/0001-Stub-in-kineto-ActivityType.patch
deleted file mode 100644
index f088645..0000000
--- a/0001-Stub-in-kineto-ActivityType.patch
+++ /dev/null
@@ -1,73 +0,0 @@
-From 3ef82b814179da571b2478f61d4279717ab0b23a Mon Sep 17 00:00:00 2001
-From: Tom Rix
-Date: Fri, 29 Sep 2023 06:25:23 -0700
-Subject: [PATCH] Stub in kineto ActivityType
-
-There is an error when kineto is not used: the shim still
-requires the ActivityType.h header to get the ActivityType enum.
-So cut-n-paste just enough of the header in to do this.
-
-Signed-off-by: Tom Rix
----
- torch/csrc/profiler/kineto_shim.h | 44 +++++++++++++++++++++++++++++++
- 1 file changed, 44 insertions(+)
-
-diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h
-index e92cbf003d..68985ab7d0 100644
----- a/torch/csrc/profiler/kineto_shim.h
-+++ b/torch/csrc/profiler/kineto_shim.h
-@@ -12,7 +12,51 @@
- #undef USE_KINETO
- #endif
-
-+#ifdef USE_KINETO
- #include
-+#else
-+namespace libkineto {
-+// copied from header
-+/*
-+ * Copyright (c) Meta Platforms, Inc. and affiliates.
-+ * All rights reserved.
-+ *
-+ * This source code is licensed under the BSD-style license found in the
-+ * LICENSE file in the root directory of this source tree.
-+ */
-+
-+// Note : All activity types are not enabled by default. Please add them
-+// at correct position in the enum
-+enum class ActivityType {
-+ // Activity types enabled by default
-+ CPU_OP = 0, // cpu side ops
-+ USER_ANNOTATION,
-+ GPU_USER_ANNOTATION,
-+ GPU_MEMCPY,
-+ GPU_MEMSET,
-+ CONCURRENT_KERNEL, // on-device kernels
-+ EXTERNAL_CORRELATION,
-+ CUDA_RUNTIME, // host side cuda runtime events
-+ CUDA_DRIVER, // host side cuda driver events
-+ CPU_INSTANT_EVENT, // host side point-like events
-+ PYTHON_FUNCTION,
-+ OVERHEAD, // CUPTI induced overhead events sampled from its overhead API.
-+ -+ // Optional Activity types -+ CUDA_SYNC, // synchronization events between runtime and kernels -+ GLOW_RUNTIME, // host side glow runtime events -+ MTIA_RUNTIME, // host side MTIA runtime events -+ CUDA_PROFILER_RANGE, // CUPTI Profiler range for performance metrics -+ MTIA_CCP_EVENTS, // MTIA ondevice CCP events -+ HPU_OP, // HPU host side runtime event -+ XPU_RUNTIME, // host side xpu runtime events -+ -+ ENUM_COUNT, // This is to add buffer and not used for any profiling logic. Add your new type before it. -+ OPTIONAL_ACTIVITY_TYPE_START = CUDA_SYNC, -+}; -+} -+ -+#endif - - #include - #include --- -2.43.0 - diff --git a/0001-can-not-use-with-c-files.patch b/0001-can-not-use-with-c-files.patch deleted file mode 100644 index 719737c..0000000 --- a/0001-can-not-use-with-c-files.patch +++ /dev/null @@ -1,25 +0,0 @@ -From a5dff521691a17701b5a02ec75e84cfe1bf605f7 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 06:41:49 -0500 -Subject: [PATCH] can not use with c files - ---- - cmake/Dependencies.cmake | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 4dd8042058..5f91f3ffab 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1269,7 +1269,7 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier) - list(APPEND HIP_CXX_FLAGS -DCAFFE2_USE_MIOPEN) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) -- list(APPEND HIP_CXX_FLAGS -std=c++17) -+# list(APPEND HIP_CXX_FLAGS -std=c++17) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "6.0.0") - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) - endif() --- -2.43.0 - diff --git a/0001-cuda-hip-signatures.patch b/0001-cuda-hip-signatures.patch deleted file mode 100644 index a258737..0000000 --- a/0001-cuda-hip-signatures.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 214dc959acc809e1959643272c344ee5335d5a69 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 1 Feb 2024 11:29:47 -0500 -Subject: [PATCH] cuda - hip signatures - ---- - aten/src/ATen/cuda/detail/LazyNVRTC.cpp | 9 +++++++++ - 1 file changed, 9 insertions(+) - -diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -index 1b85e7776e..bb6f88783a 100644 ---- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -+++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -@@ -134,8 +134,13 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, - const char *src, - const char *name, - int numHeaders, -+#if !defined(USE_ROCM) - const char * const *headers, - const char * const *includeNames) { -+#else -+ const char **headers, -+ const char **includeNames) { -+#endif - auto fn = reinterpret_cast(getNVRTCLibrary().sym(__func__)); - if (!fn) - throw std::runtime_error("Can't get nvrtcCreateProgram"); -@@ -150,7 +155,11 @@ NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); - NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *); - NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *); - #endif -+#if !defined(USE_ROCM) - NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *); -+#else -+NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char **); -+#endif - _STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult); - NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*); - NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *); --- -2.43.0 - diff --git a/0001-disable-use-of-aotriton.patch b/0001-disable-use-of-aotriton.patch deleted file mode 100644 index 61ffd1e..0000000 --- a/0001-disable-use-of-aotriton.patch +++ 
/dev/null @@ -1,94 +0,0 @@ -From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 07:06:09 -0700 -Subject: [PATCH] disable use of aotriton - ---- - .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- - 1 file changed, 15 insertions(+), 2 deletions(-) - -diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -index 214b02d8262e..7b3eb9dcd8cd 100644 ---- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -@@ -19,9 +19,12 @@ - #include - #include - -+#ifdef USE_FLASH_ATTENTION - #if USE_ROCM - #include - #endif -+#endif -+ - - /** - * Note [SDPA Runtime Dispatch] -@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { - - bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { - // Check that the gpu is capable of running flash attention -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - using sm80 = SMVersion<8, 0>; - using sm90 = SMVersion<9, 0>; - #if USE_ROCM -@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug - } - #endif - return true; -+#endif - } - - bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - // Mem Efficient attention supports hardware in the range [sm_50, sm_90] - using sm50 = SMVersion<5, 0>; - using sm90 = SMVersion<9, 0>; -@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) - } - #endif - return true; -+#endif - } - - bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( -@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - #ifndef USE_FLASH_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); - return false; --#endif -+#else - - // Define gate functions that determine if a flash kernel can be ran - // Replace with std::to_array when we migrate to c++20 -@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - } - } - return true; -+#endif - } - - bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - #ifndef USE_MEM_EFF_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); - return false; --#endif -+#else - // Constraints specific to mem efficient attention - constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = - array_of(at::kHalf, at::kFloat, at::kBFloat16); -@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - } - #endif - return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); -+#endif - } - - SDPBackend select_sdp_backend(sdp_params const& kernel_params) { --- -2.45.2 - diff --git a/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch b/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch deleted file mode 100644 index 0ce5b1f..0000000 --- a/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch +++ /dev/null @@ -1,226 +0,0 @@ -From b9d45eb1cc90696a4de76676221219e24423c709 Mon Sep 17 00:00:00 2001 -From: William Wen -Date: Wed, 3 Apr 2024 17:58:46 -0700 -Subject: [PATCH] [dynamo, 3.12] enable dynamo on 3.12, enable most dynamo - unittests on 3.12 (#123216) - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/123216 -Approved by: 
https://github.com/jansel, https://github.com/malfet ---- - test/dynamo/test_autograd_function.py | 3 ++ - test/dynamo/test_misc.py | 63 +++++++++++++++++++++++++ - test/functorch/test_eager_transforms.py | 7 ++- - test/run_test.py | 3 -- - torch/__init__.py | 5 +- - torch/_dynamo/eval_frame.py | 4 +- - torch/_dynamo/test_case.py | 8 +--- - 7 files changed, 74 insertions(+), 19 deletions(-) - -diff --git a/test/dynamo/test_autograd_function.py b/test/dynamo/test_autograd_function.py -index d23fec607afa..bc5ebc767038 100644 ---- a/test/dynamo/test_autograd_function.py -+++ b/test/dynamo/test_autograd_function.py -@@ -2,6 +2,8 @@ - - import copy - import math -+import sys -+import unittest - - import torch - -@@ -528,6 +530,7 @@ class AutogradFunctionTests(torch._dynamo.test_case.TestCase): - # I pulled all of these test cases from test_autograd.py - # In the future, we should make the Dynamo test suite actually - # run on test_autograd.py (it's disabled right now) and delete these. -+ @unittest.skipIf(sys.version_info >= (3, 12), "invalid free in 3.12+") - def test_smoke_from_test_autograd(self): - class Func(torch.autograd.Function): - @staticmethod -diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py -index a73de8b1c7e9..8f54e0564e6b 100644 ---- a/test/dynamo/test_misc.py -+++ b/test/dynamo/test_misc.py -@@ -9760,6 +9760,69 @@ fn - lambda mod: mod, - ) - -+ @xfailIfPy311 -+ def test_outside_linear_module_free(self): -+ # Compared to test_linear_module_free, the linear -+ # layer is not the code object that is directly compiled. -+ def model_inp_ctr(): -+ fc = torch.nn.Linear(100, 100) -+ -+ class Mod(torch.nn.Module): -+ def __init__(self): -+ super().__init__() -+ self.fc_ref = fc -+ -+ def forward(self, x): -+ return fc(x[0]) -+ -+ # return fc to keep it alive in _test_compile_model_free -+ return Mod(), (torch.randn(100, 100), fc) -+ -+ self._test_compile_model_free(model_inp_ctr, lambda mod: mod.fc_ref) -+ -+ @unittest.skipIf(sys.version_info >= (3, 12), "leaks in 3.12+") -+ def test_parameter_free(self): -+ def model_inp_ctr(): -+ param = torch.nn.Parameter(torch.randn(100, 100)) -+ -+ class Mod(torch.nn.Module): -+ def __init__(self): -+ super().__init__() -+ self.param = param -+ -+ def forward(self, x): -+ return self.param * x[0] -+ -+ # return param to keep it alive in _test_compile_model_free -+ return Mod(), (torch.randn(100, 100), param) -+ -+ self._test_compile_model_free(model_inp_ctr, lambda mod: mod.param) -+ -+ def test_raises_importerror1(self): -+ @torch.compile(backend="eager") -+ def fn(x): -+ try: -+ import some_module_that_surely_does_not_exist -+ -+ return -+ except ImportError: -+ pass -+ return x.sin() -+ -+ x = torch.randn(8) -+ self.assertEqual(fn(x), x.sin()) -+ -+ def test_raises_importerror2(self): -+ @torch.compile(backend="eager") -+ def fn(x): -+ import some_module_that_surely_does_not_exist -+ -+ return x + 1 -+ -+ x = torch.randn(8) -+ with self.assertRaises(ImportError): -+ fn(x) -+ - def test_dynamo_cache_move_to_front(self): - class Mod(torch.nn.Module): - def __init__(self): -diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py -index 09415cf8f48e..60790ec06059 100644 ---- a/test/functorch/test_eager_transforms.py -+++ b/test/functorch/test_eager_transforms.py -@@ -4762,8 +4762,7 @@ class TestCompileTransforms(TestCase): - # Triton only supports GPU with SM70 or later. 
- @expectedFailureIf((IS_ARM64 and not IS_MACOS) or - IS_WINDOWS or -- (TEST_CUDA and not SM70OrLater) or -- (sys.version_info >= (3, 12))) -+ (TEST_CUDA and not SM70OrLater)) - def test_compile_vmap_hessian(self, device): - # The model and inputs are a smaller version - # of code at benchmark repo: -@@ -4792,8 +4791,8 @@ class TestCompileTransforms(TestCase): - actual = opt_fn(params_and_buffers, x) - self.assertEqual(actual, expected) - -- # torch.compile is not supported on Windows or on Python 3.12+ -- @expectedFailureIf(IS_WINDOWS or (sys.version_info >= (3, 12))) -+ # torch.compile is not supported on Windows -+ @expectedFailureIf(IS_WINDOWS) - @torch._dynamo.config.patch(suppress_errors=False) - @torch._dynamo.config.patch(capture_func_transforms=True) - @skipIfTorchDynamo("Do not test torch.compile on top of torch.compile") -diff --git a/test/run_test.py b/test/run_test.py -index e86af9623042..ebb14df4167d 100755 ---- a/test/run_test.py -+++ b/test/run_test.py -@@ -74,7 +74,6 @@ sys.path.remove(str(REPO_ROOT)) - RERUN_DISABLED_TESTS = os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1" - DISTRIBUTED_TEST_PREFIX = "distributed" - INDUCTOR_TEST_PREFIX = "inductor" --DYNAMO_TEST_PREFIX = "dynamo" - - - # Note [ROCm parallel CI testing] -@@ -324,7 +323,6 @@ JIT_EXECUTOR_TESTS = [ - ] - - INDUCTOR_TESTS = [test for test in TESTS if test.startswith(INDUCTOR_TEST_PREFIX)] --DYNAMO_TESTS = [test for test in TESTS if test.startswith(DYNAMO_TEST_PREFIX)] - DISTRIBUTED_TESTS = [test for test in TESTS if test.startswith(DISTRIBUTED_TEST_PREFIX)] - TORCH_EXPORT_TESTS = [test for test in TESTS if test.startswith("export")] - FUNCTORCH_TESTS = [test for test in TESTS if test.startswith("functorch")] -@@ -1361,7 +1359,6 @@ def get_selected_tests(options) -> List[str]: - # these tests failing in Python 3.12 temporarily disabling - if sys.version_info >= (3, 12): - options.exclude.extend(INDUCTOR_TESTS) -- options.exclude.extend(DYNAMO_TESTS) - options.exclude.extend( - [ - "functorch/test_dims", -diff --git a/torch/__init__.py b/torch/__init__.py -index d381712b4a35..26cdffe81d29 100644 ---- a/torch/__init__.py -+++ b/torch/__init__.py -@@ -1861,9 +1861,8 @@ def compile(model: Optional[Callable] = None, *, - - """ - _C._log_api_usage_once("torch.compile") -- # Temporary until we get proper support for python 3.12 -- if sys.version_info >= (3, 12): -- raise RuntimeError("Dynamo is not supported on Python 3.12+") -+ if sys.version_info >= (3, 13): -+ raise RuntimeError("Dynamo is not supported on Python 3.13+") - - # Decorator mode - if model is None: -diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py -index 53ab0df3a947..0a80eeea99ed 100644 ---- a/torch/_dynamo/eval_frame.py -+++ b/torch/_dynamo/eval_frame.py -@@ -589,8 +589,8 @@ class _NullDecorator(contextlib.nullcontext): # type: ignore[type-arg] - - - def check_if_dynamo_supported(): -- if sys.version_info >= (3, 12): -- raise RuntimeError("Python 3.12+ not yet supported for torch.compile") -+ if sys.version_info >= (3, 13): -+ raise RuntimeError("Python 3.13+ not yet supported for torch.compile") - - - def is_dynamo_supported(): -diff --git a/torch/_dynamo/test_case.py b/torch/_dynamo/test_case.py -index e3cbef09eaae..297ea6e2bc2a 100644 ---- a/torch/_dynamo/test_case.py -+++ b/torch/_dynamo/test_case.py -@@ -1,7 +1,6 @@ - import contextlib - import importlib - import logging --import sys - - import torch - import torch.testing -@@ -20,12 +19,7 @@ log = logging.getLogger(__name__) - def run_tests(needs=()): - from 
torch.testing._internal.common_utils import run_tests - -- if ( -- TEST_WITH_TORCHDYNAMO -- or IS_WINDOWS -- or TEST_WITH_CROSSREF -- or sys.version_info >= (3, 12) -- ): -+ if TEST_WITH_TORCHDYNAMO or IS_WINDOWS or TEST_WITH_CROSSREF: - return # skip testing - - if isinstance(needs, str): --- -2.44.0 - diff --git a/0001-include-fmt-ranges.h-for-using-fmt-join.patch b/0001-include-fmt-ranges.h-for-using-fmt-join.patch deleted file mode 100644 index f7f6c7d..0000000 --- a/0001-include-fmt-ranges.h-for-using-fmt-join.patch +++ /dev/null @@ -1,54 +0,0 @@ -From ba2cf11d1bf1dd5086c8e793198a697d4179cca7 Mon Sep 17 00:00:00 2001 -From: Kefu Chai -Date: Tue, 16 Jul 2024 08:00:22 +0800 -Subject: [PATCH] include fmt/ranges.h for using fmt::join() - -fmt::join() was moved into fmt/ranges.h in fmt 11, so include this -header for using it. - -Signed-off-by: Kefu Chai ---- - torch/csrc/distributed/c10d/socket.cpp | 1 + - torch/csrc/profiler/standalone/execution_trace_observer.cpp | 1 + - torch/csrc/profiler/util.cpp | 1 + - 3 files changed, 3 insertions(+) - -diff --git a/torch/csrc/distributed/c10d/socket.cpp b/torch/csrc/distributed/c10d/socket.cpp -index 5013f2540..cbcd33a19 100644 ---- a/torch/csrc/distributed/c10d/socket.cpp -+++ b/torch/csrc/distributed/c10d/socket.cpp -@@ -31,6 +31,7 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wdeprecated") - #include - C10_DIAGNOSTIC_POP() - #include -+#include - - #include - #include -diff --git a/torch/csrc/profiler/standalone/execution_trace_observer.cpp b/torch/csrc/profiler/standalone/execution_trace_observer.cpp -index 2ef2e5423..fb053e916 100644 ---- a/torch/csrc/profiler/standalone/execution_trace_observer.cpp -+++ b/torch/csrc/profiler/standalone/execution_trace_observer.cpp -@@ -10,6 +10,7 @@ - #endif // _WIN32 - - #include -+#include - #include - #include - #include -diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp -index 896bf606c..c229ce130 100644 ---- a/torch/csrc/profiler/util.cpp -+++ b/torch/csrc/profiler/util.cpp -@@ -5,6 +5,7 @@ - #include - #include - #include -+#include - - #ifdef USE_KINETO - #include --- -2.45.2 - diff --git a/0001-no-third_party-FXdiv.patch b/0001-no-third_party-FXdiv.patch deleted file mode 100644 index 71404e3..0000000 --- a/0001-no-third_party-FXdiv.patch +++ /dev/null @@ -1,54 +0,0 @@ -From b3b307add5724ee5730f161e16594fa702f34a19 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:20:28 -0500 -Subject: [PATCH] no third_party FXdiv - ---- - caffe2/CMakeLists.txt | 24 ++++++++++++------------ - 1 file changed, 12 insertions(+), 12 deletions(-) - -diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index b2f3adbfae..80a5625c8d 100644 ---- a/caffe2/CMakeLists.txt -+++ b/caffe2/CMakeLists.txt -@@ -110,15 +110,15 @@ endif() - # Note: the folders that are being commented out have not been properly - # addressed yet. 
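
The 0001-include-fmt-ranges.h patch deleted above exists because fmt 11 moved fmt::join() into fmt/ranges.h, so that header must be included explicitly once the build links a system fmt of that vintage. A minimal reproducer of the requirement:

    #include <fmt/format.h>
    #include <fmt/ranges.h>  // fmt >= 11: required for fmt::join()
    #include <vector>

    int main() {
        std::vector<int> dims{2, 3, 4};
        // Fails to compile against fmt 11 without the fmt/ranges.h include.
        fmt::print("sizes = [{}]\n", fmt::join(dims, ", "));
    }
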
- --if(NOT MSVC AND USE_XNNPACK) -- if(NOT TARGET fxdiv) -- set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") -- set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") -- add_subdirectory( -- "${FXDIV_SOURCE_DIR}" -- "${CMAKE_BINARY_DIR}/FXdiv") -- endif() --endif() -+#if(NOT MSVC AND USE_XNNPACK) -+# if(NOT TARGET fxdiv) -+# set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") -+# set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") -+# add_subdirectory( -+# "${FXDIV_SOURCE_DIR}" -+# "${CMAKE_BINARY_DIR}/FXdiv") -+# endif() -+#endif() - - add_subdirectory(core) - add_subdirectory(serialize) -@@ -1081,9 +1081,9 @@ if(USE_XPU) - target_compile_definitions(torch_xpu PRIVATE USE_XPU) - endif() - --if(NOT MSVC AND USE_XNNPACK) -- TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) --endif() -+#if(NOT MSVC AND USE_XNNPACK) -+# TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) -+#endif() - - # ========================================================== - # formerly-libtorch flags --- -2.43.0 - diff --git a/0001-no-third_party-fmt.patch b/0001-no-third_party-fmt.patch deleted file mode 100644 index 6e82af2..0000000 --- a/0001-no-third_party-fmt.patch +++ /dev/null @@ -1,65 +0,0 @@ -From 2ce255b75760a0a513fb1706629b416f76a5c822 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:16:04 -0500 -Subject: [PATCH] no third_party fmt - ---- - c10/CMakeLists.txt | 2 +- - cmake/Dependencies.cmake | 6 +++--- - torch/CMakeLists.txt | 2 +- - 3 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt -index 1f742f4c176..4fa08913bdd 100644 ---- a/c10/CMakeLists.txt -+++ b/c10/CMakeLists.txt -@@ -87,7 +87,7 @@ endif() - if(C10_USE_GLOG) - target_link_libraries(c10 PUBLIC glog::glog) - endif() --target_link_libraries(c10 PRIVATE fmt::fmt-header-only) -+target_link_libraries(c10 PRIVATE fmt) - - if(C10_USE_NUMA) - message(STATUS "NUMA paths:") -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 6f5a2d5feff..42fbf80f6e8 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1837,7 +1837,7 @@ endif() - # - set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) - set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE) --add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) -+# add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) - - # Disable compiler feature checks for `fmt`. - # -@@ -1846,9 +1846,9 @@ add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) - # CMAKE_CXX_FLAGS in ways that break feature checks. Since we already know - # `fmt` is compatible with a superset of the compilers that PyTorch is, it - # shouldn't be too bad to just disable the checks. 
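
The fmt hunks above and below unbundle third_party/fmt: the vendored fmt::fmt-header-only target is swapped for a plain fmt library target so the linker resolves against the system copy. A hedged sketch of equivalent wiring, assuming the imported fmt::fmt target that a system fmt-devel package typically exports (the deleted patch itself uses the bare target name fmt):

    find_package(fmt REQUIRED)                  # system fmt, not third_party/fmt
    target_link_libraries(c10 PRIVATE fmt::fmt)
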
--set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") -+# set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") - --list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) -+# list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) - set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) - - # ---[ Kineto -diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt -index 97a72eed55b..9e5014d1980 100644 ---- a/torch/CMakeLists.txt -+++ b/torch/CMakeLists.txt -@@ -80,7 +80,7 @@ set(TORCH_PYTHON_LINK_LIBRARIES - python::python - pybind::pybind11 - shm -- fmt::fmt-header-only -+ fmt - ATEN_CPU_FILES_GEN_LIB) - - if(USE_ASAN AND TARGET Sanitizer::address) --- -2.43.2 - diff --git a/0001-no-third_party-foxi.patch b/0001-no-third_party-foxi.patch deleted file mode 100644 index ba1ec40..0000000 --- a/0001-no-third_party-foxi.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 8cb61cf9282102ac225645fcc9fb4a1bb7cb15a2 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:11:55 -0500 -Subject: [PATCH] no third_party foxi - ---- - cmake/Dependencies.cmake | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 5f91f3ffab..8e1461af81 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1567,7 +1567,7 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - set_target_properties(onnx_proto PROPERTIES CXX_STANDARD 17) - endif() - endif() -- add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL) -+ # add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL) - - add_definitions(-DONNX_NAMESPACE=${ONNX_NAMESPACE}) - if(NOT USE_SYSTEM_ONNX) -@@ -1600,8 +1600,8 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - message("-- Found onnx: ${ONNX_LIBRARY} ${ONNX_PROTO_LIBRARY}") - list(APPEND Caffe2_DEPENDENCY_LIBS onnx_proto onnx) - endif() -- include_directories(${FOXI_INCLUDE_DIRS}) -- list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) -+# include_directories(${FOXI_INCLUDE_DIRS}) -+# list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) - # Recover the build shared libs option. 
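
These no-third_party patches all share one unbundling shape: comment out the vendored add_subdirectory() and keep the link edge so the system library satisfies it. Written as a proper switch — USE_SYSTEM_FOXI is a hypothetical name here; the USE_SYSTEM_FBGEMM patch later in this series does exactly this upstream-style:

    option(USE_SYSTEM_FOXI "Use system-wide foxi" OFF)  # hypothetical option name
    if(NOT USE_SYSTEM_FOXI)
      add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL)
    endif()
    list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)
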
- set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) - endif() --- -2.43.0 - diff --git a/0001-python-torch-disable-ck.patch b/0001-python-torch-disable-ck.patch deleted file mode 100644 index e8fd9c2..0000000 --- a/0001-python-torch-disable-ck.patch +++ /dev/null @@ -1,112 +0,0 @@ -From 027dad1eaed51c1172e2497da611e3267d42d2f0 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 28 Mar 2025 09:16:03 -0700 -Subject: [PATCH] python-torch: disable ck - ---- - aten/src/ATen/CMakeLists.txt | 7 +++---- - aten/src/ATen/Context.cpp | 1 + - aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- - 3 files changed, 9 insertions(+), 9 deletions(-) - -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index 085af373ec22..84808880e51c 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -134,7 +134,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu") - file(GLOB native_cuda_cpp "native/cuda/*.cpp") - file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") - file(GLOB native_cuda_linalg_cpp "native/cuda/linalg/*.cpp") --file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" "native/hip/bgemm_kernels/*.h") -+file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" ) - file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") - file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") - file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") -@@ -145,7 +145,7 @@ file(GLOB native_nested_h "native/nested/*.h") - file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") - file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") - --file(GLOB native_hip_hip "native/hip/*.hip" "native/hip/bgemm_kernels/*.hip") -+file(GLOB native_hip_hip "native/hip/*.hip" ) - file(GLOB native_hip_cpp "native/hip/*.cpp") - file(GLOB native_hip_linalg_cpp "native/hip/linalg/*.cpp") - file(GLOB native_miopen_cpp "native/miopen/*.cpp") -@@ -361,13 +361,12 @@ endif() - ${native_quantized_hip_hip} - ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} - ) -- if(WIN32) # Windows doesn't support Composable Kernels and Triton - file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") - file(GLOB native_hip_ck "native/hip/ck*.hip") - exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" - ${native_hip_bgemm} ${native_hip_ck} - ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) -- endif() -+ - # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) - list(APPEND all_hip_cpp - ${native_nested_hip_cpp} -diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp -index f598fc3a39d3..03dab6ff38fe 100644 ---- a/aten/src/ATen/Context.cpp -+++ b/aten/src/ATen/Context.cpp -@@ -355,6 +355,7 @@ at::BlasBackend Context::blasPreferredBackend() { - } - - void Context::setBlasPreferredBackend(at::BlasBackend b) { -+ return; - #ifdef _MSC_VER - TORCH_WARN_ONCE( - "torch.backends.cuda.preferred_blas_library is an experimental feature. 
" -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index a62b028fd4ff..cba38426ea1f 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); - #endif - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); - } -@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); - } -@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); - } -@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --- -2.48.1 - diff --git a/0001-reenable-foxi-linking.patch b/0001-reenable-foxi-linking.patch deleted file mode 100644 index 8e39795..0000000 --- a/0001-reenable-foxi-linking.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 58ccda271e8f51c3fa5b7518cf6ee52ce204fd37 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 22 Feb 2024 09:28:11 -0500 -Subject: [PATCH] reenable foxi linking - ---- - cmake/Dependencies.cmake | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 42fbf80f6e8..bc3a2dc6fee 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1604,7 +1604,7 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - list(APPEND Caffe2_DEPENDENCY_LIBS onnx_proto onnx) - endif() - # include_directories(${FOXI_INCLUDE_DIRS}) --# list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) -+ list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) - # Recover the build shared libs option. 
- set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) - endif() --- -2.43.2 - diff --git a/0001-silence-an-assert.patch b/0001-silence-an-assert.patch deleted file mode 100644 index 0b20dcf..0000000 --- a/0001-silence-an-assert.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 04dd33db93b852fdfd7ea408813080b2e2026650 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 06:41:20 -0500 -Subject: [PATCH] silence an assert - ---- - aten/src/ATen/native/cuda/IndexKernel.cu | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu -index 657c0c77b3..b406aa6687 100644 ---- a/aten/src/ATen/native/cuda/IndexKernel.cu -+++ b/aten/src/ATen/native/cuda/IndexKernel.cu -@@ -249,7 +249,7 @@ void index_put_kernel_quantized_cuda(TensorIterator& iter, const IntArrayRef ind - - gpu_index_kernel(iter, index_size, index_stride, [inv_scale, zero_point, qmin, qmax]C10_DEVICE(char* const out_data, const char* const in_data, const int64_t offset) { - int64_t qvalue = static_cast(zero_point + nearbyintf(*(float*)in_data * inv_scale)); -- qvalue = std::clamp(qvalue, qmin, qmax); -+ //qvalue = std::clamp(qvalue, qmin, qmax); - *(scalar_t*)(out_data + offset) = static_cast(qvalue); - }); - }); --- -2.43.0 - diff --git a/0001-torch-paper-over-c-assert.patch b/0001-torch-paper-over-c-assert.patch deleted file mode 100644 index b7e55ce..0000000 --- a/0001-torch-paper-over-c-assert.patch +++ /dev/null @@ -1,88 +0,0 @@ -From f646e0f04ae591c8f2d8a0cd24b035725c57659b Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 23 Jan 2025 08:24:22 -0800 -Subject: [PATCH] torch: paper over c++ assert - ---- - aten/src/ATen/native/sparse/FlattenIndicesCommon.h | 2 ++ - .../ATen/native/sparse/SparseBinaryOpIntersectionCommon.h | 5 +++++ - .../src/ATen/native/sparse/ValidateCompressedIndicesCommon.h | 2 ++ - 3 files changed, 9 insertions(+) - -diff --git a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h -index 0e79ed809ae6..a3cec8aaf78b 100644 ---- a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h -+++ b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h -@@ -69,11 +69,13 @@ Tensor _flatten_indices_impl(const Tensor& indices, IntArrayRef size) { - [=] FUNCAPI (int64_t nnz_idx) -> int64_t { - const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; - auto hash = static_cast(0); -+#if 0 - for (int64_t dim = 0; dim < sparse_dim; ++dim) { - const auto dim_hash_coeff = hash_coeffs[dim]; - const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; - hash += dim_index * dim_hash_coeff; - } -+#endif - return hash; - }); - } -diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h -index c0b94bf39d54..8de4900b7a01 100644 ---- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h -+++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h -@@ -279,12 +279,15 @@ void _sparse_binary_op_intersection_kernel_impl( - if (!ptr_indices) { - return hash; - } -+#if 0 -+// /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/array:219:2: error: reference to __host__ function '__glibcxx_assert_fail' in __host__ __device__ function - const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; - for (int64_t dim = 0; dim < sparse_dim; ++dim) { - const auto dim_hash_coeff = hash_coeffs[dim]; - const auto dim_index = ptr_indices_dim[dim * 
indices_dim_stride]; - hash += dim_index * dim_hash_coeff; - } -+#endif - return hash; - }); - } -@@ -364,6 +367,7 @@ void _sparse_binary_op_intersection_kernel_impl( - if (hash_ptr) { - hash = hash_ptr[nnz_idx]; - } else if (sparse_dim) { -+#if 0 - // Compute hash value - const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; - for (int64_t dim = 0; dim < sparse_dim; ++dim) { -@@ -371,6 +375,7 @@ void _sparse_binary_op_intersection_kernel_impl( - const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; - hash += dim_index * dim_hash_coeff; - } -+#endif - } - - // Perform hash values intersection -diff --git a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h -index ec4c084a39cc..9bc9655b0afa 100644 ---- a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h -+++ b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h -@@ -341,6 +341,7 @@ void _validate_compressed_sparse_indices_kernel( - // assuming idx contiguity per batch: - int64_t tmp = batch_idx * nnz; - // `nnz == idx_sizes[idx_ndims - 1]` is checked above as `nnz == idx.size(-1)` -+#if 0 - for (int i = idx_ndims - 1; - i >= 0 && nnz > 0; // break early when nnz==0 - i--) { -@@ -348,6 +349,7 @@ void _validate_compressed_sparse_indices_kernel( - idx_offset += (tmp - div * idx_sizes[i]) * idx_strides[i]; - tmp = div; - } -+#endif - const auto* RESTRICT ptr_idx_batch = ptr_idx + idx_offset; - _check_idx_sorted_distinct_vals_slices_with_cidx< - cdim_name, --- -2.48.1 - diff --git a/0001-use-any-hip.patch b/0001-use-any-hip.patch deleted file mode 100644 index dca86ea..0000000 --- a/0001-use-any-hip.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 4248211ce9a9de81bb3ade5d421ba709b19ead08 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 15:01:28 -0500 -Subject: [PATCH] use any hip - ---- - cmake/public/LoadHIP.cmake | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1abeb06228..28458c4146 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -30,7 +30,7 @@ endif() - message("Building PyTorch for GPU arch: ${PYTORCH_ROCM_ARCH}") - - # Add HIP to the CMAKE Module Path --set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip ${CMAKE_MODULE_PATH}) -+set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib64/cmake/hip ${CMAKE_MODULE_PATH}) - - macro(find_package_and_print_version PACKAGE_NAME) - find_package("${PACKAGE_NAME}" ${ARGN}) -@@ -38,7 +38,7 @@ macro(find_package_and_print_version PACKAGE_NAME) - endmacro() - - # Find the HIP Package --find_package_and_print_version(HIP 1.0) -+find_package_and_print_version(HIP MODULE) - - if(HIP_FOUND) - set(PYTORCH_FOUND_HIP TRUE) --- -2.43.0 - diff --git a/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch deleted file mode 100644 index 413c60d..0000000 --- a/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 20 Jul 2024 05:37:15 -0600 -Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM - -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - cmake/Dependencies.cmake | 3 ++- - 2 files changed, 3 insertions(+), 1 deletion(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index c4cd4b2c2a98..2068f7c6c4f2 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -253,6 +253,7 @@ 
cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF - "USE_CUDNN" OFF) - cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) - option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) -+option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) - option(USE_KINETO "Use Kineto profiling library" ON) - option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) - option(USE_FAKELOWP "Use FakeLowp operators" OFF) -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..192dac46f13b 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -706,6 +706,7 @@ endif() - - # ---[ FBGEMM - if(USE_FBGEMM) -+ if (NOT USE_SYSTEM_FBGEMM) - set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") - if(NOT DEFINED FBGEMM_SOURCE_DIR) - set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") -@@ -746,7 +747,7 @@ if(USE_FBGEMM) - target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) - endif() - endif() -- -+ endif() - if(USE_FBGEMM) - list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) - endif() --- -2.45.1 - diff --git a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch new file mode 100644 index 0000000..1afe692 --- /dev/null +++ b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch @@ -0,0 +1,149 @@ +From 4cc5d88dfe7a45ab245648dc874645d32a24b98b Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 27 Jun 2025 13:52:51 -0700 +Subject: [PATCH] Add cmake variable USE_ROCM_CK + +--- + CMakeLists.txt | 1 + + aten/src/ATen/CMakeLists.txt | 40 ++++++++++++++++----------------- + aten/src/ATen/cuda/CUDABlas.cpp | 10 ++++----- + cmake/Dependencies.cmake | 3 +++ + 4 files changed, 29 insertions(+), 25 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 99c0b9e0ea0c..4c632e42f531 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -240,6 +240,7 @@ cmake_dependent_option( + BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON + "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) + cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) ++cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) + option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) + cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) + cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index c9cfd74b501e..59f6178218ee 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -373,26 +373,26 @@ if(USE_ROCM) + # is header only, so this should be ok, except that the CMake build generates + # a ck/config.h. We just do that part here. Without this, the ck.h from the + # ROCM SDK may get accidentally used instead. 
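
The USE_ROCM_CK patch being added here replaces the USE_ROCM_NO_CK trick with a real switch. For reference, cmake_dependent_option(<option> <doc> <default> <depends> <force>) makes USE_ROCM_CK visible (defaulting ON) only while USE_ROCM is ON; note the force value chosen is ON rather than the conventional OFF. Reduced to its essentials:

    include(CMakeDependentOption)
    cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON)
    if(USE_ROCM_CK)
      list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK)  # as in the Dependencies.cmake hunk below
    endif()
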
+- function(_pytorch_rocm_generate_ck_conf) +- set(CK_ENABLE_INT8 "ON") +- set(CK_ENABLE_FP16 "ON") +- set(CK_ENABLE_FP32 "ON") +- set(CK_ENABLE_FP64 "ON") +- set(CK_ENABLE_BF16 "ON") +- set(CK_ENABLE_FP8 "ON") +- set(CK_ENABLE_BF8 "ON") +- set(CK_USE_XDL "ON") +- set(CK_USE_WMMA "ON") +- configure_file( +- "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" +- "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" +- ) +- endfunction() ++# function(_pytorch_rocm_generate_ck_conf) ++# set(CK_ENABLE_INT8 "ON") ++# set(CK_ENABLE_FP16 "ON") ++# set(CK_ENABLE_FP32 "ON") ++# set(CK_ENABLE_FP64 "ON") ++# set(CK_ENABLE_BF16 "ON") ++# set(CK_ENABLE_FP8 "ON") ++# set(CK_ENABLE_BF8 "ON") ++# set(CK_USE_XDL "ON") ++# set(CK_USE_WMMA "ON") ++# configure_file( ++# "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" ++# "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" ++# ) ++# endfunction() + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) +- _pytorch_rocm_generate_ck_conf() ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) ++# _pytorch_rocm_generate_ck_conf() + + # Next two lines are needed because TunableOp uses third-party/fmt + list(APPEND ATen_HIP_INCLUDE $) +@@ -409,7 +409,7 @@ endif() + ${native_quantized_hip_hip} + ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels ++ if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index 89350a11bea7..33e5f2808057 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -752,7 +752,7 @@ template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support double gemm yet + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); + #else +@@ -1103,7 +1103,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( + void * beta_ptr = &fbeta; + _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); + GEMM_CHECK_ARGVALUES(at::Half); +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + int flag = 0; + #if USE_GEMM_FLAGS_FP16_ALT_IMPL + flag = at::ROCmBackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; +@@ -1270,7 +1270,7 @@ template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support double gemm yet + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); + #else +@@ -1311,7 +1311,7 @@ template <> + void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + #else +@@ -1327,7 +1327,7 @@ template <> + void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + #else +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index a93386c27f8d..be1368999d38 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1031,6 +1031,9 @@ if(USE_ROCM) + if(HIPBLASLT_VEC_EXT) + list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) + endif() ++ if(USE_ROCM_CK) ++ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) ++ endif() + list(APPEND HIP_HIPCC_FLAGS --offload-compress) + if(WIN32) + add_definitions(-DROCM_ON_WINDOWS) +-- +2.49.0 + diff --git a/next/0001-Optionally-use-hipblaslt.patch b/next/0001-Optionally-use-hipblaslt.patch deleted file mode 100644 index 1e5ca4b..0000000 --- a/next/0001-Optionally-use-hipblaslt.patch +++ /dev/null @@ -1,506 +0,0 @@ -From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 04:18:34 -0700 -Subject: [PATCH] Optionally use hipblaslt - ---- - aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ - aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ - aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- - aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- - aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- - cmake/Dependencies.cmake | 3 ++ - cmake/public/LoadHIP.cmake | 2 +- - 7 files changed, 82 insertions(+), 19 deletions(-) - -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index ce991a9bcad4..3f0d17b52778 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -14,7 +14,9 @@ - #include - - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - // until hipblas has an API to accept flags, we must use rocblas here - #include - #include -@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { - static size_t _parseChosenWorkspaceSize() { - const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); - #ifdef USE_ROCM -+#ifndef USE_HIPBLASLT -+ return 0; -+#endif - if (!val) { - // accept either env var - val = getenv("HIPBLASLT_WORKSPACE_SIZE"); -@@ -235,6 +240,7 @@ namespace at::cuda::blas { - } while (0) - - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - namespace { - // Following the pattern of CuSparseDescriptor - // Defined here for now because this is the only place cublas_lt interface is -@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< - }; - } // namespace - -- - template - inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - cudaDataType_t abcType = CUDA_R_32F; -@@ -452,7 +457,7 @@ inline void 
bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - " scaleType ", - scaleType); - } -- -+#endif - - template - inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { -@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); - } - } -@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); - } - } -@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } - } -@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { - } - } - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { - // forward to bgemm implementation but set strides and batches to 0 - bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); - } -+#endif - - template - inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { -@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); - } - } -@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); - } - } -@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } - } -@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { - } - } - -- -+#if !defined(USE_ROCM) || 
(defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - void gemm_and_bias( - bool transpose_mat1, -@@ -1410,7 +1435,7 @@ void scaled_gemm( - ScalarType result_dtype, - void* amax_ptr, - bool use_fast_accum) { --#if CUDA_VERSION >= 11080 || defined(USE_ROCM) -+#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - const auto computeType = CUBLAS_COMPUTE_32F; - const auto scaleType = CUDA_R_32F; - const int8_t fastAccuMode = use_fast_accum ? 1 : 0; -@@ -1681,6 +1706,7 @@ void int8_gemm( - " scaleType ", - scaleType); - } -+#endif - - template <> - void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { -diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h -index f2b657ced51b..f0ee613c4208 100644 ---- a/aten/src/ATen/cuda/CUDAContextLight.h -+++ b/aten/src/ATen/cuda/CUDAContextLight.h -@@ -9,7 +9,9 @@ - - // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also - // added bf16 support -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - #include -+#endif - - #ifdef CUDART_VERSION - #include -@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); - /* Handles */ - TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); - TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); -+#endif - - TORCH_CUDA_CPP_API void clearCublasWorkspaces(); - -diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index 8eac525b3695..abfdf7a23847 100644 ---- a/aten/src/ATen/cuda/CublasHandlePool.cpp -+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp -@@ -29,7 +29,7 @@ namespace at::cuda { - - namespace { - --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - void createCublasLtHandle(cublasLtHandle_t *handle) { - TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); - } -@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { - return handle; - } - --cublasLtHandle_t getCurrentCUDABlasLtHandle() { - #ifdef USE_ROCM -+#if defined(USE_HIPBLASLT) -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - c10::DeviceIndex device = 0; - AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); - -@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { - - auto handle = myPoolWindow->reserve(device); - return handle; -+} -+#endif - #else -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - return reinterpret_cast(getCurrentCUDABlasHandle()); --#endif - } -+#endif - - } // namespace at::cuda -diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h -index 53e6154120c9..fa1d664696db 100644 ---- a/aten/src/ATen/cuda/tunable/TunableGemm.h -+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h -@@ -11,7 +11,9 @@ - - #include - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - #include - #endif - #include -@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class DefaultScaledGemmOp : public Callable> { - public: -@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { - return OK; - } - }; -+#endif - - template - inline bool IsZero(T v) { -@@ -191,6 +195,7 @@ static void AddRocblasValidator() { - } - } - -+#ifdef USE_HIPBLASLT - static void AddHipblasltValidator() { - auto validators = 
getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - if (validators.find("HIPBLASLT_VERSION") == validators.end()) { -@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { - [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); - } - } -+#endif - - static void AddRocmValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); -@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class ScaledGemmTunableOp : public TunableOp, StreamTimer> { - public: -@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - - #if defined(USE_ROCM) -+#ifdef USE_HIPBLASLT - for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { - this->RegisterOp(std::move(name), std::move(op)); - } - AddHipblasltValidator(); -+#endif - AddRocmValidator(); - #endif - } -@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); - } - }; -+#endif - - #undef XSTRINGIFY - #undef STRINGIFY -diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 84c59a4fd0d7..56ad5de3bf2d 100644 ---- a/aten/src/ATen/native/cuda/Blas.cpp -+++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa - } - - static bool getDisableAddmmCudaLt() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); - #ifdef USE_ROCM - // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) -@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { - } - return false; - #endif -+#else -+ return true; -+#endif - } - - #ifdef USE_ROCM - static bool isSupportedHipLtROCmArch(int index) { -+#ifdef USE_HIPBLASLT - hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); - std::string device_arch = prop->gcnArchName; - static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { - } - } - TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); -+#endif - return false; - } - #endif -@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& 
self, const Tensor& ma - at::ScalarType scalar_type = self.scalar_type(); - c10::MaybeOwned self_; - if (&result != &self) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) - // Strangely, if mat2 has only 1 row or column, we get - // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. -@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - scalar_type != at::ScalarType::BFloat16)); - #endif - } -+#endif - #endif - if (!useLtInterface) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - } - self__sizes = self_->sizes(); - } else { --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - useLtInterface = !disable_addmm_cuda_lt && - result.dim() == 2 && result.is_contiguous() && - isSupportedHipLtROCmArch(self.device().index()) && -@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); - - if (useLtInterface) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if defined(USE_ROCM) - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, -@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - activation_epilogue - ); - }); -+#endif - #endif - } else - { -@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { - } - - static bool _scaled_mm_allowed_device() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - auto dprops = at::cuda::getCurrentDeviceProperties(); - #ifdef USE_ROCM - std::string device_arch = dprops->gcnArchName; -@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { - #else - return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); - #endif -+#else -+ return false; -+#endif - } - - // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax -@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - // Check sizes - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); - TORCH_CHECK( -@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 - // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately - amax = at::max(at::abs(out.to(kFloat))); -+#endif - #endif - - return {out, amax}; -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..8d05e834bbc5 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1052,6 +1052,9 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - list(APPEND HIP_CXX_FLAGS -std=c++17) - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) -+ if(hipblast_FOUND) -+ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) -+ endif() - if(HIP_NEW_TYPE_ENUMS) - list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) - endif() -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index fa39156031ff..df4836847fdf 100644 
---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -155,7 +155,7 @@ if(HIP_FOUND) - find_package_and_print_version(hiprand REQUIRED) - find_package_and_print_version(rocblas REQUIRED) - find_package_and_print_version(hipblas REQUIRED) -- find_package_and_print_version(hipblaslt REQUIRED) -+ find_package_and_print_version(hipblaslt) - find_package_and_print_version(miopen REQUIRED) - find_package_and_print_version(hipfft REQUIRED) - find_package_and_print_version(hipsparse REQUIRED) --- -2.45.2 - diff --git a/next/0001-disable-use-of-aotriton.patch b/next/0001-disable-use-of-aotriton.patch deleted file mode 100644 index 61ffd1e..0000000 --- a/next/0001-disable-use-of-aotriton.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 07:06:09 -0700 -Subject: [PATCH] disable use of aotriton - ---- - .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- - 1 file changed, 15 insertions(+), 2 deletions(-) - -diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -index 214b02d8262e..7b3eb9dcd8cd 100644 ---- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -@@ -19,9 +19,12 @@ - #include - #include - -+#ifdef USE_FLASH_ATTENTION - #if USE_ROCM - #include - #endif -+#endif -+ - - /** - * Note [SDPA Runtime Dispatch] -@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { - - bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { - // Check that the gpu is capable of running flash attention -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - using sm80 = SMVersion<8, 0>; - using sm90 = SMVersion<9, 0>; - #if USE_ROCM -@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug - } - #endif - return true; -+#endif - } - - bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - // Mem Efficient attention supports hardware in the range [sm_50, sm_90] - using sm50 = SMVersion<5, 0>; - using sm90 = SMVersion<9, 0>; -@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) - } - #endif - return true; -+#endif - } - - bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( -@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - #ifndef USE_FLASH_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); - return false; --#endif -+#else - - // Define gate functions that determine if a flash kernel can be ran - // Replace with std::to_array when we migrate to c++20 -@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - } - } - return true; -+#endif - } - - bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - #ifndef USE_MEM_EFF_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); - return false; --#endif -+#else - // Constraints specific to mem efficient attention - constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = - array_of(at::kHalf, at::kFloat, at::kBFloat16); -@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - } - #endif - return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, 
debug); -+#endif - } - - SDPBackend select_sdp_backend(sdp_params const& kernel_params) { --- -2.45.2 - diff --git a/python-torch.spec b/python-torch.spec index 90c908c..91a82a9 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.8.0-rc3 -%global commit0 3d53a53e504089a52a149791fd33d7fc898bd055 +# v2.8.0-rc6 +%global commit0 f2b69a083d15e3d0083bb304302a3fd0b5fb8705 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250625 +%global date0 20250718 %global pypi_version 2.8.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 @@ -357,11 +357,9 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPE sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt -%if %{without gitcommit} # https://github.com/pytorch/pytorch/issues/149803 # Tries to checkout nccl -sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py -%endif +sed -i -e 's@ checkout_nccl()@ True@' tools/build_pytorch_libs.py # Disable the use of check_submodule's in the setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -541,6 +539,7 @@ export USE_SYSTEM_EIGEN_INSTALL=ON export USE_SYSTEM_ONNX=ON export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF +export USE_SYSTEM_NCCL=OFF export USE_TENSORPIPE=OFF export USE_XNNPACK=OFF export USE_XPU=OFF From 61ccf033a8aece458e7df6573367bde478952636 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 24 Jul 2025 06:07:03 -0700 Subject: [PATCH 18/38] Update gitcommit to 2.8.0-rc8 Patch problem with 3.14 Start converting over py3 macros Handle new dependency on rocmsmi Signed-off-by: Tom Rix --- .../0001-Add-cmake-variable-USE_ROCM_CK.patch | 85 ++++- ...and-import-torch-issues-for-cpython-.patch | 359 ++++++++++++++++++ next/0001-Use-horrible-dynamo-stub.patch | 85 +++++ python-torch.spec | 60 ++- 4 files changed, 560 insertions(+), 29 deletions(-) create mode 100644 next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch create mode 100644 next/0001-Use-horrible-dynamo-stub.patch diff --git a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch index 1afe692..925e03b 100644 --- a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch +++ b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch @@ -1,17 +1,17 @@ -From 4cc5d88dfe7a45ab245648dc874645d32a24b98b Mon Sep 17 00:00:00 2001 +From 193854993cd939de186de19589c1add4c4b2cf66 Mon Sep 17 00:00:00 2001 From: Tom Rix -Date: Fri, 27 Jun 2025 13:52:51 -0700 +Date: Mon, 21 Jul 2025 11:35:03 -0700 Subject: [PATCH] Add cmake variable USE_ROCM_CK --- CMakeLists.txt | 1 + aten/src/ATen/CMakeLists.txt | 40 ++++++++++++++++----------------- - aten/src/ATen/cuda/CUDABlas.cpp | 10 ++++----- + aten/src/ATen/cuda/CUDABlas.cpp | 22 +++++++++--------- cmake/Dependencies.cmake | 3 +++ - 4 files changed, 29 insertions(+), 25 deletions(-) + 4 files changed, 35 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt -index 99c0b9e0ea0c..4c632e42f531 100644 +index a5d25e6afa0f..afc1b53efa64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -240,6 +240,7 @@ cmake_dependent_option( @@ -82,7 +82,7 @@ index c9cfd74b501e..59f6178218ee 100644 file(GLOB native_hip_ck "native/hip/ck*.hip") exclude(ATen_HIP_SRCS 
"${ATen_HIP_SRCS}" diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index 89350a11bea7..33e5f2808057 100644 +index 89350a11bea7..e5b7960177cf 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -752,7 +752,7 @@ template <> @@ -94,16 +94,16 @@ index 89350a11bea7..33e5f2808057 100644 // hipblaslt does not support double gemm yet bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); #else -@@ -1103,7 +1103,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( - void * beta_ptr = &fbeta; - _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); - GEMM_CHECK_ARGVALUES(at::Half); --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - int flag = 0; - #if USE_GEMM_FLAGS_FP16_ALT_IMPL - flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; -@@ -1270,7 +1270,7 @@ template <> +@@ -836,7 +836,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +@@ -1270,14 +1270,14 @@ template <> void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { @@ -112,6 +112,23 @@ index 89350a11bea7..33e5f2808057 100644 // hipblaslt does not support double gemm yet gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); #else + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); + #endif + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); + } +@@ -1293,7 +1293,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); @@ -1311,7 +1311,7 @@ template <> void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { @@ -130,6 +147,42 @@ index 89350a11bea7..33e5f2808057 100644 // hipblaslt does not support complex gemm yet gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); #else +@@ -1345,7 +1345,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); + } +@@ -1361,7 +1361,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + 
at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +@@ -1382,7 +1382,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } +@@ -1398,7 +1398,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::B + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a93386c27f8d..be1368999d38 100644 --- a/cmake/Dependencies.cmake diff --git a/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch b/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch new file mode 100644 index 0000000..b6a282c --- /dev/null +++ b/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch @@ -0,0 +1,359 @@ +From f2a544b2e3a5bdc04985f6e06223c0c1700120a0 Mon Sep 17 00:00:00 2001 +From: albanD +Date: Sat, 12 Jul 2025 03:42:33 -0400 +Subject: [PATCH] Fix compilation and "import torch" issues for cpython 3.14 + +Imported from +https://github.com/albanD/pytorch/tree/cpython314_build +commit 88bb9cdb72449f4277829e20d94ad8aec1894216 + +Signed-off-by: Tom Rix +--- + torch/_dynamo/bytecode_analysis.py | 2 +- + torch/ao/quantization/__init__.py | 5 +++- + torch/ao/quantization/qconfig.py | 4 ++- + torch/ao/quantization/utils.py | 7 +++-- + torch/csrc/dynamo/cpython_defs.c | 16 +++++++++++ + torch/csrc/dynamo/cpython_includes.h | 17 ++++++++++++ + torch/csrc/dynamo/eval_frame.c | 34 +++++++++++++++-------- + torch/csrc/dynamo/framelocals_mapping.cpp | 14 ++++++++++ + torch/csrc/utils/python_compat.h | 1 + + torch/onnx/__init__.py | 1 - + torch/utils/weak.py | 29 +++++++++++++++++-- + 11 files changed, 111 insertions(+), 19 deletions(-) + +diff --git a/torch/_dynamo/bytecode_analysis.py b/torch/_dynamo/bytecode_analysis.py +index 3252ea91409f..2de74ee5bf8d 100644 +--- a/torch/_dynamo/bytecode_analysis.py ++++ b/torch/_dynamo/bytecode_analysis.py +@@ -33,7 +33,7 @@ if sys.version_info >= (3, 11): + TERMINAL_OPCODES.add(dis.opmap["JUMP_FORWARD"]) + else: + TERMINAL_OPCODES.add(dis.opmap["JUMP_ABSOLUTE"]) +-if sys.version_info >= (3, 12): ++if (3, 12) <= sys.version_info < (3, 14): + TERMINAL_OPCODES.add(dis.opmap["RETURN_CONST"]) + if sys.version_info >= (3, 13): + TERMINAL_OPCODES.add(dis.opmap["JUMP_BACKWARD_NO_INTERRUPT"]) +diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py +index ffc1792fd23f..cf5a8b99a894 100644 +--- a/torch/ao/quantization/__init__.py ++++ b/torch/ao/quantization/__init__.py +@@ -1,5 +1,6 @@ + # mypy: allow-untyped-defs + ++import sys + from typing import Callable, Optional, Union + + import torch +@@ -33,7 +34,9 @@ from .stubs import * # noqa: F403 + + # ensure __module__ is set correctly for public APIs + ObserverOrFakeQuantize = Union[ObserverBase, FakeQuantizeBase] 
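
The hunk continuing below guards the cosmetic __module__ assignment because setting attributes on a typing.Union alias apparently stops working on CPython 3.14; skipping it there costs nothing but the reported module path. Reduced to its essentials (module name illustrative):

    import sys
    from typing import Union

    Alias = Union[int, str]
    if sys.version_info < (3, 14):
        # Cosmetic: make the alias report a public module path.
        Alias.__module__ = "example.public.api"  # assumed to fail on 3.14+
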
+-ObserverOrFakeQuantize.__module__ = "torch.ao.quantization" ++if sys.version_info < (3, 14): ++ ObserverOrFakeQuantize.__module__ = "torch.ao.quantization" ++ + for _f in [ + compare_results, + extract_results_from_loggers, +diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py +index efee5302ad42..d9a8fc78bab4 100644 +--- a/torch/ao/quantization/qconfig.py ++++ b/torch/ao/quantization/qconfig.py +@@ -1,5 +1,6 @@ + # mypy: allow-untyped-defs + import copy ++import sys + import warnings + from collections import namedtuple + from typing import Any, Optional, Union +@@ -568,7 +569,8 @@ def _assert_valid_qconfig(qconfig: Optional[QConfig], mod: torch.nn.Module) -> N + + + QConfigAny = Optional[QConfig] +-QConfigAny.__module__ = "torch.ao.quantization.qconfig" ++if sys.version_info < (3, 14): ++ QConfigAny.__module__ = "torch.ao.quantization.qconfig" + + + def _add_module_to_qconfig_obs_ctr( +diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py +index 4ac3112ec072..3b1503e01701 100644 +--- a/torch/ao/quantization/utils.py ++++ b/torch/ao/quantization/utils.py +@@ -4,6 +4,7 @@ Utils shared by different modes of quantization (eager/graph) + """ + + import functools ++import sys + import warnings + from collections import OrderedDict + from inspect import getfullargspec, signature +@@ -16,7 +17,8 @@ from torch.nn.utils.parametrize import is_parametrized + + + NodePattern = Union[tuple[Node, Node], tuple[Node, tuple[Node, Node]], Any] +-NodePattern.__module__ = "torch.ao.quantization.utils" ++if sys.version_info < (3, 14): ++ NodePattern.__module__ = "torch.ao.quantization.utils" + + # This is the Quantizer class instance from torch/quantization/fx/quantize.py. + # Define separately to prevent circular imports. 
+@@ -31,7 +33,8 @@ QuantizerCls = Any + Pattern = Union[ + Callable, tuple[Callable, Callable], tuple[Callable, tuple[Callable, Callable]], Any + ] +-Pattern.__module__ = "torch.ao.quantization.utils" ++if sys.version_info < (3, 14): ++ Pattern.__module__ = "torch.ao.quantization.utils" + + + # TODO: maybe rename this to MatchInputNode +diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c +index b68ef894aeaa..244d4165d5e8 100644 +--- a/torch/csrc/dynamo/cpython_defs.c ++++ b/torch/csrc/dynamo/cpython_defs.c +@@ -2,6 +2,20 @@ + #include + #include + ++#if IS_PYTHON_3_14_PLUS ++ ++const uint8_t* THP_PyOpcode_Caches = NULL; ++const int THP_PyOpcode_Caches_size = 0; ++ ++void ++THP_PyThreadState_PopFrame(PyThreadState *tstate, _PyInterpreterFrame * frame) ++{} ++void ++THP_PyFrame_Clear(_PyInterpreterFrame *frame) ++{} ++ ++#else ++ + #if IS_PYTHON_3_11_PLUS + + #define Py_BUILD_CORE +@@ -360,3 +374,5 @@ const uint8_t* THP_PyOpcode_Caches = NULL; + const int THP_PyOpcode_Caches_size = 0; + + #endif ++ ++#endif // IS_PYTHON_3_14_PLUS +\ No newline at end of file +diff --git a/torch/csrc/dynamo/cpython_includes.h b/torch/csrc/dynamo/cpython_includes.h +index 6b99c1d5aec8..616be16563cf 100644 +--- a/torch/csrc/dynamo/cpython_includes.h ++++ b/torch/csrc/dynamo/cpython_includes.h +@@ -21,6 +21,14 @@ + + #if IS_PYTHON_3_11_PLUS + #include ++#if IS_PYTHON_3_14_PLUS ++#include ++#include ++#endif ++#endif ++ ++#if IS_PYTHON_3_14_PLUS ++#include + #endif + + #undef Py_BUILD_CORE +@@ -30,6 +38,13 @@ + extern "C" { + #endif + ++#if IS_PYTHON_3_14_PLUS ++ ++#define F_CODE(x) (PyCodeObject*)PyStackRef_AsPyObjectBorrow(x->f_executable) ++#define PREV_INSTR(x) (x)->instr_ptr ++ ++#else ++ + #if IS_PYTHON_3_13_PLUS + #define F_CODE(x) ((PyCodeObject*)(x)->f_executable) + #define PREV_INSTR(x) (x)->instr_ptr +@@ -38,6 +53,8 @@ extern "C" { + #define PREV_INSTR(x) (x)->prev_instr + #endif + ++#endif // IS_PYTHON_3_14_PLUS ++ + #if IS_PYTHON_3_12_PLUS + #define FUNC(x) ((x)->f_funcobj) + #else +diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c +index f413782b2d30..72bb8839bac3 100644 +--- a/torch/csrc/dynamo/eval_frame.c ++++ b/torch/csrc/dynamo/eval_frame.c +@@ -224,17 +224,6 @@ const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) { + return PyUnicode_AsUTF8(F_CODE(frame)->co_name); + } + +-void clear_old_frame_if_python_312_plus( +- PyThreadState* tstate, +- THP_EVAL_API_FRAME_OBJECT* frame) { +-#if IS_PYTHON_3_12_PLUS +- +- THP_PyFrame_Clear(frame); +- THP_PyThreadState_PopFrame(tstate, frame); +- +-#endif +-} +- + static PyObject* dynamo_eval_custom_code_impl( + PyThreadState* tstate, + THP_EVAL_API_FRAME_OBJECT* frame, +@@ -485,6 +474,18 @@ static PyObject* dynamo__custom_eval_frame_shim( + + static void enable_eval_frame_shim(PyThreadState* tstate) {} + static void enable_eval_frame_default(PyThreadState* tstate) {} ++PyObject* dynamo_eval_custom_code( ++ PyThreadState* tstate, ++ THP_EVAL_API_FRAME_OBJECT* frame, ++ PyCodeObject* code, ++ const char* trace_annotation, ++ int throw_flag) {} ++THPPyInterpreterFrame* THPPyInterpreterFrame_New( ++ THP_EVAL_API_FRAME_OBJECT* frame) {} ++PyObject* dynamo_eval_frame_default( ++ PyThreadState* tstate, ++ THP_EVAL_API_FRAME_OBJECT* frame, ++ int throw_flag) {} + + static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; + +@@ -498,6 +499,17 @@ static PyTypeObject THPPyInterpreterFrameType = { + + #endif // !(IS_PYTHON_3_14_PLUS) + ++void clear_old_frame_if_python_312_plus( ++ 
PyThreadState* tstate, ++ THP_EVAL_API_FRAME_OBJECT* frame) { ++#if IS_PYTHON_3_12_PLUS ++ ++ THP_PyFrame_Clear(frame); ++ THP_PyThreadState_PopFrame(tstate, frame); ++ ++#endif ++} ++ + static PyObject* increment_working_threads( + PyThreadState* tstate, + PyObject* module) { +diff --git a/torch/csrc/dynamo/framelocals_mapping.cpp b/torch/csrc/dynamo/framelocals_mapping.cpp +index b839fb26fc91..c4ee36d87767 100644 +--- a/torch/csrc/dynamo/framelocals_mapping.cpp ++++ b/torch/csrc/dynamo/framelocals_mapping.cpp +@@ -26,9 +26,13 @@ FrameLocalsMapping::FrameLocalsMapping(FrameLocalsFrameType* frame) + PyCodeObject* co = F_CODE(frame); + _framelocals.resize(co->co_nlocalsplus, nullptr); + ++#if IS_PYTHON_3_14_PLUS ++ TORCH_CHECK(false, "Python 3.14+ not supported"); ++#else + if (!frame->stacktop) { + return; + } ++#endif + + auto update_framelocals = [&](int i, PyObject* value) { + _PyLocals_Kind kind = _PyLocals_GetKind(co->co_localspluskinds, i); +@@ -53,11 +57,21 @@ FrameLocalsMapping::FrameLocalsMapping(FrameLocalsFrameType* frame) + }; + + auto offset = co->co_nlocalsplus - co->co_nfreevars; ++#if IS_PYTHON_3_14_PLUS ++ TORCH_CHECK(false, "Python 3.14+ not supported"); ++#else + for (int i = 0; i < offset; i++) { + update_framelocals(i, frame->localsplus[i]); + } ++#endif ++ + // Get references to closure variables ++#if IS_PYTHON_3_14_PLUS ++ PyObject* closure; ++ TORCH_CHECK(false, "Python 3.14+ not supported"); ++#else + PyObject* closure = ((PyFunctionObject*)FUNC(frame))->func_closure; ++#endif + for (int i = 0; i < co->co_nfreevars; i++) { + update_framelocals(offset + i, PyTuple_GET_ITEM(closure, i)); + } +diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h +index a1537611cc47..16292e4fd030 100644 +--- a/torch/csrc/utils/python_compat.h ++++ b/torch/csrc/utils/python_compat.h +@@ -13,6 +13,7 @@ extern "C" { + #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 + #define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 + #define IS_PYTHON_3_14_PLUS PY_VERSION_HEX >= 0x030E0000 ++#define IS_PYTHON_3_15_PLUS PY_VERSION_HEX >= 0x030F0000 + + static inline int PyCode_GetNCellvars(PyCodeObject* code) { + // gh-26364 added co_ncellvars to Python 3.11.0rc1 +diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py +index 345ffd2a065b..ceeadde5365b 100644 +--- a/torch/onnx/__init__.py ++++ b/torch/onnx/__init__.py +@@ -104,7 +104,6 @@ ONNXProgram.__module__ = "torch.onnx" + OnnxExporterError.__module__ = "torch.onnx" + _OrtBackend.__module__ = "torch.onnx" + _OrtBackendOptions.__module__ = "torch.onnx" +-_OrtExecutionProvider.__module__ = "torch.onnx" + enable_fake_mode.__module__ = "torch.onnx" + is_onnxrt_backend_supported.__module__ = "torch.onnx" + +diff --git a/torch/utils/weak.py b/torch/utils/weak.py +index 8bf2ba5ed02b..9c7218cb2ad3 100644 +--- a/torch/utils/weak.py ++++ b/torch/utils/weak.py +@@ -3,8 +3,6 @@ from __future__ import annotations + + import collections.abc as _collections_abc + import weakref +- +-from _weakrefset import _IterationGuard # type: ignore[attr-defined] + from collections.abc import Mapping, MutableMapping + from weakref import ref + +@@ -22,6 +20,33 @@ __all__ = [ + ] + + ++# TODO: make weakref properly thread safe following ++# https://github.com/python/cpython/pull/125325 ++class _IterationGuard: ++ # This context manager registers itself in the current iterators of the ++ # weak container, such as to delay all removals until the context manager ++ # exits. 
++    # This technique should be relatively thread-safe (since sets are).
++
++    def __init__(self, weakcontainer):
++        # Don't create cycles
++        self.weakcontainer = ref(weakcontainer)
++
++    def __enter__(self):
++        w = self.weakcontainer()
++        if w is not None:
++            w._iterating.add(self)
++        return self
++
++    def __exit__(self, e, t, b):
++        w = self.weakcontainer()
++        if w is not None:
++            s = w._iterating
++            s.remove(self)
++            if not s:
++                w._commit_removals()
++
++
+ # This file defines a variant of WeakKeyDictionary that overrides the hashing
+ # behavior of the key to use object identity, rather than the builtin
+ # __eq__/__hash__ functions. This is useful for Tensor weak keys, as their
+--
+2.49.0
+
diff --git a/next/0001-Use-horrible-dynamo-stub.patch b/next/0001-Use-horrible-dynamo-stub.patch
new file mode 100644
index 0000000..1900519
--- /dev/null
+++ b/next/0001-Use-horrible-dynamo-stub.patch
@@ -0,0 +1,85 @@
+From fd535f7bf44f2034cca2a66b4cc7d68d962341df Mon Sep 17 00:00:00 2001
+From: Tom Rix
+Date: Sun, 20 Jul 2025 12:47:58 -0700
+Subject: [PATCH] Use horrible dynamo stub
+
+Rawhide's update of Python is too fast for dynamo,
+so paper over the problem with a horrible stub that throws
+runtime exceptions if dynamo is used.
+
+Signed-off-by: Tom Rix
+---
+ build_variables.bzl | 26 ++++++++++++----------
+ torch/csrc/dynamo/horrible_dynamo_stub.cpp | 16 +++++++++++++
+ 2 files changed, 30 insertions(+), 12 deletions(-)
+ create mode 100644 torch/csrc/dynamo/horrible_dynamo_stub.cpp
+
+diff --git a/build_variables.bzl b/build_variables.bzl
+index b266c80e8843..a3be6893349b 100644
+--- a/build_variables.bzl
++++ b/build_variables.bzl
+@@ -140,7 +140,8 @@ core_trainer_sources = [
+ "torch/csrc/autograd/variable.cpp",
+ "torch/csrc/autograd/utils/warnings.cpp",
+ "torch/csrc/autograd/jit_decomp_interface.cpp",
+- "torch/csrc/dynamo/compiled_autograd.cpp",
++# "torch/csrc/dynamo/compiled_autograd.cpp",
++ "torch/csrc/dynamo/horrible_dynamo_stub.cpp",
+ "torch/csrc/jit/frontend/name_mangler.cpp",
+ "torch/csrc/jit/ir/type_hashing.cpp",
+ "torch/csrc/jit/serialization/pickler.cpp",
+@@ -868,17 +869,18 @@ libtorch_python_core_sources = [
+ "torch/csrc/autograd/python_torch_functions_manual.cpp",
+ "torch/csrc/autograd/python_variable.cpp",
+ "torch/csrc/autograd/python_variable_indexing.cpp",
+- "torch/csrc/dynamo/python_compiled_autograd.cpp",
+- "torch/csrc/dynamo/cache_entry.cpp",
+- "torch/csrc/dynamo/cpp_shim.cpp",
+- "torch/csrc/dynamo/cpython_defs.c",
+- "torch/csrc/dynamo/eval_frame.c",
+- "torch/csrc/dynamo/eval_frame_cpp.cpp",
+- "torch/csrc/dynamo/extra_state.cpp",
+- "torch/csrc/dynamo/framelocals_mapping.cpp",
+- "torch/csrc/dynamo/guards.cpp",
+- "torch/csrc/dynamo/utils.cpp",
+- "torch/csrc/dynamo/init.cpp",
++# "torch/csrc/dynamo/python_compiled_autograd.cpp",
++# "torch/csrc/dynamo/cache_entry.cpp",
++# "torch/csrc/dynamo/cpp_shim.cpp",
++# "torch/csrc/dynamo/cpython_defs.c",
++# "torch/csrc/dynamo/eval_frame.c",
++# "torch/csrc/dynamo/eval_frame_cpp.cpp",
++# "torch/csrc/dynamo/extra_state.cpp",
++# "torch/csrc/dynamo/framelocals_mapping.cpp",
++# "torch/csrc/dynamo/guards.cpp",
++# "torch/csrc/dynamo/utils.cpp",
++# "torch/csrc/dynamo/init.cpp",
++ "torch/csrc/dynamo/horrible_dynamo_stub.cpp",
+ "torch/csrc/functorch/init.cpp",
+ "torch/csrc/fx/node.cpp",
+ "torch/csrc/mps/Module.cpp",
+diff --git a/torch/csrc/dynamo/horrible_dynamo_stub.cpp b/torch/csrc/dynamo/horrible_dynamo_stub.cpp
+new file mode 100644
+index 000000000000..3ac1324d4557
+--- /dev/null
++++ b/torch/csrc/dynamo/horrible_dynamo_stub.cpp
+@@ -0,0 +1,16 @@
++#include <torch/csrc/dynamo/compiled_autograd.h>
++#include <stdexcept>
++
++namespace torch::dynamo::autograd {
++const std::unique_ptr<PyCompilerInterface>& getPyCompilerInterface() {
++  throw std::runtime_error("Dynamo not supported");
++  return nullptr;
++}
++std::vector<std::optional<InputMetadata>> get_input_metadata(
++    const edge_list& edges) {
++  std::vector<std::optional<InputMetadata>> r;
++  throw std::runtime_error("Dynamo not supported");
++  return r;
++}
++
++}
+--
+2.49.0
+
diff --git a/python-torch.spec b/python-torch.spec
index 91a82a9..44c1199 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -6,10 +6,10 @@
# So pre releases can be tried
%bcond_with gitcommit
%if %{with gitcommit}
-# v2.8.0-rc6
-%global commit0 f2b69a083d15e3d0083bb304302a3fd0b5fb8705
+# v2.8.0-rc8
+%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7
%global shortcommit0 %(c=%{commit0}; echo ${c:0:7})
-%global date0 20250718
+%global date0 20250723
%global pypi_version 2.8.0
%global flatbuffers_version 24.12.23
%global miniz_version 3.0.2
@@ -33,7 +33,11 @@
%endif

# For testing distributed+rccl etc.
+%if %{with gitcommit}
+%bcond_without rccl
+%else
%bcond_with rccl
+%endif
%bcond_with gloo
%bcond_without mpi
%bcond_without tensorpipe
@@ -103,13 +107,13 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{

%if %{without gitcommit}
# https://github.com/pytorch/pytorch/issues/150187
-# The hack job
-# Patch11: 0001-python-torch-disable-ck.patch
-# Cleaned up hack job
Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch
-
%else
+# https://github.com/pytorch/pytorch/issues/150187
Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch
+# https://github.com/pytorch/pytorch/issues/156595
+# Patch12: 0001-Use-horrible-dynamo-stub.patch
+Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch
%endif

ExclusiveArch: x86_64 aarch64
@@ -153,6 +157,9 @@ BuildRequires: python3dist(filelock)
BuildRequires: python3dist(jinja2)
BuildRequires: python3dist(networkx)
BuildRequires: python3dist(numpy)
+%if %{with gitcommit}
+BuildRequires: python3dist(pip)
+%endif
BuildRequires: python3dist(pyyaml)
BuildRequires: python3dist(setuptools)
BuildRequires: python3dist(sphinx)
@@ -171,6 +178,9 @@ BuildRequires: hipcub-devel
BuildRequires: hipfft-devel
BuildRequires: hiprand-devel
BuildRequires: hipsparse-devel
+%if %{with gitcommit}
+BuildRequires: hipsparselt-devel
+%endif
BuildRequires: hipsolver-devel
BuildRequires: magma-devel
BuildRequires: miopen-devel
@@ -190,6 +200,7 @@ BuildRequires: rocm-core-devel
BuildRequires: rocm-hip-devel
BuildRequires: rocm-runtime-devel
BuildRequires: rocm-rpm-macros
%if %{with gitcommit}
BuildRequires: rocsolver-devel
+BuildRequires: rocm-smi-devel
%endif
BuildRequires: rocthrust-devel
BuildRequires: roctracer-devel
@@ -337,6 +348,10 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unus
sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-deprecated-declarations@' cmake/Dependencies.cmake
# Use parallel jobs
sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel-jobs=4@' cmake/Dependencies.cmake
+%if %{with gitcommit}
+# Need to link with librocm_smi64
+sed -i -e 's@hiprtc::hiprtc@hiprtc::hiprtc rocm_smi64@' cmake/Dependencies.cmake
+%endif

# No third_party fmt, use system
sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt
@@ -590,17 +605,21 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
# pytorch uses clang, not hipcc
export HIP_CLANG_PATH=%{rocmllvm_bindir}
-%if %{?fedora} <= 43
-export PYTORCH_ROCM_ARCH="gfx1100;gfx1201"
-%else
export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default}
+
+%if %{with gitcommit}
+%pyproject_wheel
+%else
+%py3_build
%endif

%else

+%if %{with gitcommit}
+%pyproject_wheel
+%else
%py3_build
+%endif

%endif

@@ -617,17 +636,32 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
# pytorch uses clang, not hipcc
export HIP_CLANG_PATH=%{rocmllvm_bindir}
export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default}
+%if %{with gitcommit}
%pyproject_install
+%else
%py3_install
+%endif

%else

+%if %{with gitcommit}
+%pyproject_install
+%pyproject_save_files torch
+%else
%py3_install
-
+%endif
%endif
+
+
%check
+%if %{with gitcommit}
+# Not working yet
+# pyproject_check_import torch
+%else
%py3_check_import torch
+%endif

# Do not remove the empty files

From 6158e4810ccdce8ccbe332db1ac0b02e87dc4615 Mon Sep 17 00:00:00 2001
From: Fedora Release Engineering
Date: Fri, 25 Jul 2025 10:49:07 +0000
Subject: [PATCH 19/38] Rebuilt for
 https://fedoraproject.org/wiki/Fedora_43_Mass_Rebuild

From 72ad1f0389043a5f26735e3ca2a2a88398daba23 Mon Sep 17 00:00:00 2001
From: Tom Rix
Date: Sat, 26 Jul 2025 17:15:59 -0700
Subject: [PATCH 20/38] Fix some issues with switching to pyproject macros

Signed-off-by: Tom Rix
---
 python-torch.spec | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python-torch.spec b/python-torch.spec
index 44c1199..03fbf30 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -214,6 +214,7 @@ BuildRequires: google-benchmark-devel
%endif

Requires: python3dist(dill)
+Requires: python3dist(yaml)

Obsoletes: caffe = 1.0^git20200212.9b89154

@@ -638,6 +639,7 @@ export HIP_CLANG_PATH=%{rocmllvm_bindir}
export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default}
%if %{with gitcommit}
%pyproject_install
+%pyproject_save_files '*torch*'
%else
%py3_install
%endif

@@ -646,7 +648,7 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default}

%if %{with gitcommit}
%pyproject_install
-%pyproject_save_files torch
+%pyproject_save_files '*torch*'
%else
%py3_install
%endif

@@ -670,10 +672,8 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default}
%doc README.md
%{_bindir}/torchrun
%{_bindir}/torchfrtrace
-%{python3_sitearch}/%{pypi_name}
-%{python3_sitearch}/%{pypi_name}-*.egg-info
+%{python3_sitearch}/%{pypi_name}*
%{python3_sitearch}/functorch
-%{python3_sitearch}/torchgen

%changelog
%autochangelog

From cec8b79644fdf9b63ba227c506c9bccdb36b1618 Mon Sep 17 00:00:00 2001
From: Tom Rix
Date: Thu, 31 Jul 2025 05:52:50 -0700
Subject: [PATCH 21/38] Update to 2.8.0-rc8

Signed-off-by: Tom Rix
---
 .gitignore | 3 +
 0001-Add-cmake-varaible-USE_ROCM_CK.patch | 120 ------------------
 ...
0001-Add-cmake-variable-USE_ROCM_CK.patch | 0 ...and-import-torch-issues-for-cpython-.patch | 0 python-torch.spec | 59 ++------- sources | 3 + 6 files changed, 17 insertions(+), 168 deletions(-) delete mode 100644 0001-Add-cmake-varaible-USE_ROCM_CK.patch rename next/0001-Add-cmake-variable-USE_ROCM_CK.patch => 0001-Add-cmake-variable-USE_ROCM_CK.patch (100%) rename next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch => 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch (100%) diff --git a/.gitignore b/.gitignore index 25abff5..a4ed35b 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,6 @@ /pytorch-v2.5.1.tar.gz /pytorch-v2.7.0.tar.gz /v2.13.6.tar.gz +/pytorch-a1cb3cc.tar.gz +/v24.12.23.tar.gz +/kineto-5e75018.tar.gz diff --git a/0001-Add-cmake-varaible-USE_ROCM_CK.patch b/0001-Add-cmake-varaible-USE_ROCM_CK.patch deleted file mode 100644 index b34e07a..0000000 --- a/0001-Add-cmake-varaible-USE_ROCM_CK.patch +++ /dev/null @@ -1,120 +0,0 @@ -From 0f33e0a7bbd1522ee74f8fc1fbe3af7563318c79 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 28 Mar 2025 15:33:09 -0700 -Subject: [PATCH] Add cmake varaible USE_ROCM_CK - -To control the use of ROCm Composable Kernel usage. - -CK is not compatible with all rocBLAS gpu's, so the user -must explicitly choose to use CK. - -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - aten/src/ATen/CMakeLists.txt | 8 ++++++-- - aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- - cmake/Dependencies.cmake | 3 +++ - 4 files changed, 15 insertions(+), 7 deletions(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index f3fee2f7ffc2..73903acce452 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -249,6 +249,7 @@ cmake_dependent_option( - BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON - "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) - cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) -+cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) - option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) - cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) - cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index 085af373ec22..af268ab88572 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -361,13 +361,17 @@ endif() - ${native_quantized_hip_hip} - ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} - ) -- if(WIN32) # Windows doesn't support Composable Kernels and Triton -+ if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels and Triton - file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") - file(GLOB native_hip_ck "native/hip/ck*.hip") - exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" -- ${native_hip_bgemm} ${native_hip_ck} -+ ${native_hip_bgemm} ${native_hip_ck}) -+ endif() -+ if(WIN32) # Windows doesn't support Composable Kernels and Triton -+ exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" - ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) - endif() -+ - # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) - list(APPEND all_hip_cpp - ${native_nested_hip_cpp} -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index a62b028fd4ff..a3dbf76848ea 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - if 
(at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); - #endif - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); - } -@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); - } -@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); - } -@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 30917bdf39f5..2ca6091030f1 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1046,6 +1046,9 @@ if(USE_ROCM) - if(HIPBLASLT_VEC_EXT) - list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) - endif() -+ if(USE_ROCM_CK) -+ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) -+ endif() - list(APPEND HIP_HIPCC_FLAGS --offload-compress) - if(WIN32) - add_definitions(-DROCM_ON_WINDOWS) --- -2.48.1 - diff --git a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch b/0001-Add-cmake-variable-USE_ROCM_CK.patch similarity index 100% rename from next/0001-Add-cmake-variable-USE_ROCM_CK.patch rename to 0001-Add-cmake-variable-USE_ROCM_CK.patch diff --git a/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch b/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch similarity index 100% rename from next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch rename to 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch diff --git a/python-torch.spec b/python-torch.spec index 03fbf30..1fbad8e 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -15,8 +15,11 @@ %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %else -%global pypi_version 2.7.0 -%global flatbuffers_version 23.3.3 +%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7 +%global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) +%global date0 20250723 +%global pypi_version 2.8.0 +%global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %endif @@ -33,11 +36,7 @@ %endif # For testing distributed+rccl etc. 
-%if %{with gitcommit} %bcond_without rccl -%else -%bcond_with rccl -%endif %bcond_with gloo %bcond_without mpi %bcond_without tensorpipe @@ -56,7 +55,7 @@ Name: python-%{pypi_name} %if %{with gitcommit} Version: %{pypi_version}^git%{date0}.%{shortcommit0} %else -Version: %{pypi_version} +Version: %{pypi_version}.rc8 %endif Release: %autorelease Summary: PyTorch AI/ML framework @@ -68,7 +67,8 @@ URL: https://pytorch.org/ Source0: %{forgeurl}/archive/%{commit0}/pytorch-%{shortcommit0}.tar.gz Source1000: pyproject.toml %else -Source0: %{forgeurl}/releases/download/v%{version}/pytorch-v%{version}.tar.gz +Source0: %{forgeurl}/archive/%{commit0}/pytorch-%{shortcommit0}.tar.gz +Source1000: pyproject.toml %endif Source1: https://github.com/google/flatbuffers/archive/refs/tags/v%{flatbuffers_version}.tar.gz Source2: https://github.com/pybind/pybind11/archive/refs/tags/v%{pybind11_version}.tar.gz @@ -96,25 +96,16 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- %endif %if %{without kineto} -%if %{with gitcommit} %global ki_commit 5e7501833f1021ce6f618572d3baf657b6319658 -%else -%global ki_commit be1317644c68b4bfc4646024a6b221066e430031 -%endif %global ki_scommit %(c=%{ki_commit}; echo ${c:0:7}) Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif -%if %{without gitcommit} -# https://github.com/pytorch/pytorch/issues/150187 -Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch -%else # https://github.com/pytorch/pytorch/issues/150187 Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch # https://github.com/pytorch/pytorch/issues/156595 # Patch12: 0001-Use-horrible-dynamo-stub.patch Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch -%endif ExclusiveArch: x86_64 aarch64 %global toolchain gcc @@ -157,9 +148,7 @@ BuildRequires: python3dist(filelock) BuildRequires: python3dist(jinja2) BuildRequires: python3dist(networkx) BuildRequires: python3dist(numpy) -%if %{with gitcommit} BuildRequires: python3dist(pip) -%endif BuildRequires: python3dist(pyyaml) BuildRequires: python3dist(setuptools) BuildRequires: python3dist(sphinx) @@ -178,9 +167,7 @@ BuildRequires: hipcub-devel BuildRequires: hipfft-devel BuildRequires: hiprand-devel BuildRequires: hipsparse-devel -%if %{with gitcommit} BuildRequires: hipsparselt-devel -%endif BuildRequires: hipsolver-devel BuildRequires: magma-devel BuildRequires: miopen-devel @@ -198,10 +185,8 @@ BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros -%if %{with gitcommit} BuildRequires: rocsolver-devel BuildRequires: rocm-smi-devel -%endif BuildRequires: rocthrust-devel BuildRequires: roctracer-devel @@ -275,7 +260,9 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release} cp %{SOURCE1000} . %else -%autosetup -p1 -n pytorch-v%{version} +%autosetup -p1 -n pytorch-%{commit0} +# Overwrite with a git checkout of the pyproject.toml +cp %{SOURCE1000} . 
%endif # Remove bundled egg-info @@ -349,10 +336,8 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unus sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-deprecated-declarations@' cmake/Dependencies.cmake # Use parallel jobs sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel-jobs=4@' cmake/Dependencies.cmake -%if %{with gitcommit} # Need to link with librocm_smi64 sed -i -e 's@hiprtc::hiprtc@hiprtc::hiprtc rocm_smi64@' cmake/Dependencies.cmake -%endif # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt @@ -449,9 +434,7 @@ mv googletest third_party # # Fake out pocketfft, and system header will be used mkdir third_party/pocketfft -%if %{with gitcommit} cp /usr/include/pocketfft_hdronly.h third_party/pocketfft/ -%endif # # Use the system valgrind headers @@ -608,19 +591,11 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%if %{with gitcommit} %pyproject_wheel -%else -%py3_build -%endif %else -%if %{with gitcommit} %pyproject_wheel -%else -%py3_build -%endif %endif @@ -637,33 +612,21 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%if %{with gitcommit} %pyproject_install %pyproject_save_files '*torch*' -%else -%py3_install -%endif %else -%if %{with gitcommit} %pyproject_install %pyproject_save_files '*torch*' -%else -%py3_install -%endif %endif %check -%if %{with gitcommit} # Not working yet # pyproject_check_import torch -%else -%py3_check_import torch -%endif # Do not remote the empty files diff --git a/sources b/sources index 4021d40..c7eae22 100644 --- a/sources +++ b/sources @@ -7,3 +7,6 @@ SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3 SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7 SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a +SHA512 (pytorch-a1cb3cc.tar.gz) = 92bf8b2c2ef0b459406b60169ecebdc50652c75943e3d6087e4d261f6e308dbad365529561e0f07ea3f0b71790efb68b5e4ab2f44e270462097208d924dc2d95 +SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185c9ef602294b67d73aea7ae371035579a1419cbfbeba7c3e88b31b5a5848db98f5e8a03b982b1 +SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 From eaa9838b3ced49bdfa6d8c094d4e72cbb2406ec4 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 4 Aug 2025 16:23:41 -0700 Subject: [PATCH 22/38] Change a couple cmake mins Signed-off-by: Tom Rix --- python-torch.spec | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python-torch.spec b/python-torch.spec index 1fbad8e..e3cfb6d 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -449,6 +449,11 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@#list(APPEND Caffe2 # cmake version changed sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' 
third_party/tensorpipe/third_party/libuv/CMakeLists.txt
+sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' libuv*/CMakeLists.txt
+%if %{without opentelemetry}
+sed -i -e 's@cmake_minimum_required(VERSION 3.1)@cmake_minimum_required(VERSION 3.5)@' third_party/opentelemetry-cpp/CMakeLists.txt
+%endif
+

%if %{with rocm}
# hipify
./tools/amd_build/build_amd.py

From d67e1e127ac97f1557e8c2ea8b392318ded073f0 Mon Sep 17 00:00:00 2001
From: Tom Rix
Date: Fri, 8 Aug 2025 14:00:11 -0700
Subject: [PATCH 23/38] Update to 2.8.0

Signed-off-by: Tom Rix
---
 .gitignore | 1 +
 python-torch.spec | 16 +++++-----------
 sources | 1 +
 3 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index a4ed35b..7832594 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,4 @@
 /pytorch-a1cb3cc.tar.gz
 /v24.12.23.tar.gz
 /kineto-5e75018.tar.gz
+/pytorch-v2.8.0.tar.gz
diff --git a/python-torch.spec b/python-torch.spec
index e3cfb6d..442480f 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -15,9 +15,6 @@
%global miniz_version 3.0.2
%global pybind11_version 2.13.6
%else
-%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7
-%global shortcommit0 %(c=%{commit0}; echo ${c:0:7})
-%global date0 20250723
%global pypi_version 2.8.0
%global flatbuffers_version 24.12.23
%global miniz_version 3.0.2
@@ -55,7 +52,7 @@
Name: python-%{pypi_name}
%if %{with gitcommit}
Version: %{pypi_version}^git%{date0}.%{shortcommit0}
%else
-Version: %{pypi_version}.rc8
+Version: %{pypi_version}
%endif
Release: %autorelease
Summary: PyTorch AI/ML framework
@@ -67,8 +64,7 @@ URL: https://pytorch.org/
Source0: %{forgeurl}/archive/%{commit0}/pytorch-%{shortcommit0}.tar.gz
Source1000: pyproject.toml
%else
-Source0: %{forgeurl}/archive/%{commit0}/pytorch-%{shortcommit0}.tar.gz
-Source1000: pyproject.toml
+Source0: %{forgeurl}/releases/download/v%{version}/pytorch-v%{version}.tar.gz
%endif
Source1: https://github.com/google/flatbuffers/archive/refs/tags/v%{flatbuffers_version}.tar.gz
Source2: https://github.com/pybind/pybind11/archive/refs/tags/v%{pybind11_version}.tar.gz
@@ -260,9 +256,7 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release}
cp %{SOURCE1000} .

%else
-%autosetup -p1 -n pytorch-%{commit0}
-# Overwrite with a git checkout of the pyproject.toml
-cp %{SOURCE1000} .
+%autosetup -p1 -n pytorch-v%{version} %endif # Remove bundled egg-info @@ -310,8 +304,8 @@ rm -rf third_party/kineto/* cp -r kineto-*/* third_party/kineto/ %endif -# hipblaslt only building with gfx90a -sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a"@' aten/src/ATen/native/cuda/Blas.cpp +# Adjust for the hipblaslt's we build +sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a", "gfx1103", "gfx1150", "gfx1151", "gfx1100", "gfx1101", "gfx1200", "gfx1201"@' aten/src/ATen/native/cuda/Blas.cpp %if 0%{?rhel} # In RHEL but too old diff --git a/sources b/sources index c7eae22..335b8a8 100644 --- a/sources +++ b/sources @@ -10,3 +10,4 @@ SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b SHA512 (pytorch-a1cb3cc.tar.gz) = 92bf8b2c2ef0b459406b60169ecebdc50652c75943e3d6087e4d261f6e308dbad365529561e0f07ea3f0b71790efb68b5e4ab2f44e270462097208d924dc2d95 SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185c9ef602294b67d73aea7ae371035579a1419cbfbeba7c3e88b31b5a5848db98f5e8a03b982b1 SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 +SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 From 1b986b49932fabd64e4dad05277398d9629a57d3 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 14 Aug 2025 09:10:18 -0700 Subject: [PATCH 24/38] Build on EPEL Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 57 ++++++++++++++++++++++++++++++++++------------- sources | 1 + 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 7832594..5fda907 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,4 @@ /v24.12.23.tar.gz /kineto-5e75018.tar.gz /pytorch-v2.8.0.tar.gz +/v1.18.0.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 442480f..6034593 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -48,6 +48,12 @@ %bcond_with httplib %bcond_with kineto +%if 0%{?fedora} +%bcond_without onnx +%else +%bcond_with onnx +%endif + Name: python-%{pypi_name} %if %{with gitcommit} Version: %{pypi_version}^git%{date0}.%{shortcommit0} @@ -97,6 +103,11 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif +%if %{without onnx} +%global ox_ver 1.18.0 +Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz +%endif + # https://github.com/pytorch/pytorch/issues/150187 Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch # https://github.com/pytorch/pytorch/issues/156595 @@ -123,7 +134,9 @@ BuildRequires: json-devel BuildRequires: libomp-devel BuildRequires: numactl-devel BuildRequires: ninja-build +%if %{with onnx} BuildRequires: onnx-devel +%endif %if %{with mpi} BuildRequires: openmpi-devel %endif @@ -304,6 +317,12 @@ rm -rf third_party/kineto/* cp -r kineto-*/* third_party/kineto/ %endif +%if %{without onnx} +tar xf %{SOURCE90} +rm -rf third_party/onnx/* +cp -r onnx-*/* third_party/onnx/ +%endif + # Adjust for the hipblaslt's we build sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a", "gfx1103", "gfx1150", "gfx1151", "gfx1100", "gfx1101", "gfx1200", "gfx1201"@' aten/src/ATen/native/cuda/Blas.cpp @@ -393,6 +412,10 @@ mv third_party/cpp-httplib . mv third_party/kineto . %endif +%if %{without onnx} +mv third_party/onnx . 
+%endif + %if %{with test} mv third_party/googletest . %endif @@ -421,6 +444,10 @@ mv cpp-httplib third_party mv kineto third_party %endif +%if %{without onnx} +mv onnx third_party +%endif + %if %{with test} mv googletest third_party %endif @@ -448,7 +475,6 @@ sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION sed -i -e 's@cmake_minimum_required(VERSION 3.1)@cmake_minimum_required(VERSION 3.5)@' third_party/opentelemetry-cpp/CMakeLists.txt %endif - %if %{with rocm} # hipify ./tools/amd_build/build_amd.py @@ -534,7 +560,9 @@ export USE_PYTORCH_QNNPACK=OFF export USE_ROCM=OFF export USE_SYSTEM_SLEEF=ON export USE_SYSTEM_EIGEN_INSTALL=ON +%if %{with onnx} export USE_SYSTEM_ONNX=ON +%endif export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF export USE_SYSTEM_NCCL=OFF @@ -575,7 +603,6 @@ export BUILD_TEST=ON # # See BZ 2244862 - %if %{with rocm} export USE_ROCM=ON @@ -590,14 +617,15 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%pyproject_wheel - -%else - -%pyproject_wheel - %endif +%if 0%{?fedora} +%pyproject_wheel +%else +%py3_build +%endif + + %install %if %{with rocm} @@ -611,16 +639,15 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%pyproject_install -%pyproject_save_files '*torch*' - -%else - -%pyproject_install -%pyproject_save_files '*torch*' %endif +%if 0%{?fedora} +%pyproject_install +%pyproject_save_files '*torch*' +%else +%py3_install +%endif %check diff --git a/sources b/sources index 335b8a8..5c15ab4 100644 --- a/sources +++ b/sources @@ -11,3 +11,4 @@ SHA512 (pytorch-a1cb3cc.tar.gz) = 92bf8b2c2ef0b459406b60169ecebdc50652c75943e3d6 SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185c9ef602294b67d73aea7ae371035579a1419cbfbeba7c3e88b31b5a5848db98f5e8a03b982b1 SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 +SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d From a6dcc4b8d8fd8903c09cc93f87a6bc2aff357bdf Mon Sep 17 00:00:00 2001 From: Python Maint Date: Fri, 15 Aug 2025 15:02:18 +0200 Subject: [PATCH 25/38] Rebuilt for Python 3.14.0rc2 bytecode From 95f1f6fe22c05bf4a53479100fdfbaf48e1e90c3 Mon Sep 17 00:00:00 2001 From: Python Maint Date: Fri, 19 Sep 2025 14:37:44 +0200 Subject: [PATCH 26/38] Rebuilt for Python 3.14.0rc3 bytecode From 89daf765fd18ce8d6ecf40a8c6f8fe275c542091 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 26 Sep 2025 14:24:46 -0700 Subject: [PATCH 27/38] Disable magma Magma is broken on ROCm 7. 
Signed-off-by: Tom Rix --- python-torch.spec | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 6034593..210aee2 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -178,7 +178,8 @@ BuildRequires: hiprand-devel BuildRequires: hipsparse-devel BuildRequires: hipsparselt-devel BuildRequires: hipsolver-devel -BuildRequires: magma-devel +# Magma is broken on ROCm 7 +# BuildRequires: magma-devel BuildRequires: miopen-devel BuildRequires: rocblas-devel BuildRequires: rocrand-devel @@ -607,7 +608,8 @@ export BUILD_TEST=ON export USE_ROCM=ON export USE_ROCM_CK=OFF -export USE_MAGMA=ON +# Magma is broken on ROCm 7 +# export USE_MAGMA=ON export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` From f29cac5d83e262d22372de0576f7e89b85abab3e Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 30 Sep 2025 07:35:43 -0700 Subject: [PATCH 28/38] Update to 2.9.0-rc4 Work around ROCm 7 build issue in 2.8.0 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 34 ++++++++++++++++++++++++++++------ sources | 1 + 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 5fda907..0986c30 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ /kineto-5e75018.tar.gz /pytorch-v2.8.0.tar.gz /v1.18.0.tar.gz +/pytorch-715dca6.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 210aee2..e25a665 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -4,13 +4,13 @@ %global forgeurl https://github.com/pytorch/pytorch # So pre releases can be tried -%bcond_with gitcommit +%bcond_without gitcommit %if %{with gitcommit} -# v2.8.0-rc8 -%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7 +# v2.9.0-rc4 +%global commit0 715dca672526a20322d07c2e67772cfe4400a20f %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250723 -%global pypi_version 2.8.0 +%global date0 20250923 +%global pypi_version 2.9.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 @@ -108,11 +108,13 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz %endif +%if %{with gitcommit} +%else # https://github.com/pytorch/pytorch/issues/150187 Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch # https://github.com/pytorch/pytorch/issues/156595 -# Patch12: 0001-Use-horrible-dynamo-stub.patch Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch +%endif ExclusiveArch: x86_64 aarch64 %global toolchain gcc @@ -200,6 +202,10 @@ BuildRequires: rocm-smi-devel BuildRequires: rocthrust-devel BuildRequires: roctracer-devel +%if %{with gitcommit} +BuildRequires: moodycamel-concurrentqueue-devel +%endif + Requires: amdsmi %endif @@ -492,6 +498,13 @@ sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake # silence an assert # sed -i -e '/qvalue = std::clamp(qvalue, qmin, qmax);/d' aten/src/ATen/native/cuda/IndexKernel.cu +%endif + +%if %{with gitcommit} +# moodycamel include path needs adjusting to use the system's +sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/concurrentqueue@' cmake/Dependencies.cmake + + %endif %build @@ -607,7 +620,14 @@ export BUILD_TEST=ON %if %{with rocm} export USE_ROCM=ON +%if %{with gitcommit} +export USE_ROCM_CK_SDPA=OFF +export USE_ROCM_CK_GEMM=OFF +export USE_FBGEMM_GENAI=OFF +%else export USE_ROCM_CK=OFF +%endif + # Magma is broken on 
ROCm 7 # export USE_MAGMA=ON export HIP_PATH=`hipconfig -p` @@ -662,7 +682,9 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %license LICENSE %doc README.md %{_bindir}/torchrun +%if %{without gitcommit} %{_bindir}/torchfrtrace +%endif %{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch diff --git a/sources b/sources index 5c15ab4..0fdf299 100644 --- a/sources +++ b/sources @@ -12,3 +12,4 @@ SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185 SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d +SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 From 1509cbcd60c545c5b19d00033c4538f6763bf4cc Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 6 Oct 2025 14:27:41 -0700 Subject: [PATCH 29/38] Update to 2.9.0-rc6 aarch64 is not building, so disable. Signed-off-by: Tom Rix --- .gitignore | 1 + pyproject.toml | 139 ++++++++++++++++++++++++++++++++++------------ python-torch.spec | 11 ++-- sources | 1 + 4 files changed, 112 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index 0986c30..2918194 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ /pytorch-v2.8.0.tar.gz /v1.18.0.tar.gz /pytorch-715dca6.tar.gz +/pytorch-fd36458.tar.gz diff --git a/pyproject.toml b/pyproject.toml index ccf9c2a..925742b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,49 +1,105 @@ +# Package ###################################################################### + +[build-system] +requires = [ + # 70.1.0: min version for integrated bdist_wheel command from wheel package + # 77.0.0: min version for SPDX expression support for project.license + "setuptools>=70.1.0,<80.0", + "cmake>=3.27", + "ninja", + "numpy", + "packaging", + "pyyaml", + "requests", + "six", # dependency chain: NNPACK -> PeachPy -> six + "typing-extensions>=4.10.0", +] +build-backend = "setuptools.build_meta" + +[dependency-groups] +dev = [ + # This list should be kept in sync with the requirements-build.txt + # in PyTorch root until the project fully migrates to pyproject.toml + # after which this can be removed as it is already specified in the + # [build-system] section + "setuptools>=70.1.0,<80.0", # setuptools develop deprecated on 80.0 + "cmake>=3.27", + "ninja", + "numpy", + "packaging", + "pyyaml", + "requests", + "six", # dependency chain: NNPACK -> PeachPy -> six + "typing-extensions>=4.10.0", + + # This list should be kept in sync with the requirements.txt in + # PyTorch root until the project fully migrates to pyproject.toml + "build[uv]", + "expecttest>=0.3.0", + "filelock", + "fsspec>=0.8.5", + "hypothesis", + "jinja2", + "lintrunner; platform_machine != 's390x' and platform_machine != 'riscv64'", + "networkx>=2.5.1", + "optree>=0.13.0", + "psutil", + "sympy>=1.13.3", + "typing-extensions>=4.13.2", + "wheel", +] + [project] name = "torch" -requires-python = ">=3.9" -license = {text = "BSD-3-Clause"} +description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" +readme = "README.md" +requires-python = ">=3.10" 
+# TODO: change to `license = "BSD-3-Clause"` and enable PEP 639 after pinning setuptools>=77 +# FIXME: As of 2025.06.20, it is hard to ensure the minimum version of setuptools in our CI environment. +# TOML-table-based license deprecated in setuptools>=77, and the deprecation warning will be changed +# to an error on 2026.02.18. See also: https://github.com/pypa/setuptools/issues/4903 +license = { text = "BSD-3-Clause" } +authors = [{ name = "PyTorch Team", email = "packages@pytorch.org" }] +keywords = ["pytorch", "machine learning"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: C++", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] dynamic = [ - "authors", - "classifiers", "entry-points", "dependencies", - "description", - "keywords", - "optional-dependencies", - "readme", "scripts", "version", ] [project.urls] -Homepage = "https://pytorch.org/" -Documentation = "https://pytorch.org/docs/" -Source = "https://github.com/pytorch/pytorch" -Forum = "https://discuss.pytorch.org/" +Homepage = "https://pytorch.org" +Repository = "https://github.com/pytorch/pytorch" +Documentation = "https://pytorch.org/docs" +"Issue Tracker" = "https://github.com/pytorch/pytorch/issues" +Forum = "https://discuss.pytorch.org" +[project.optional-dependencies] +optree = ["optree>=0.13.0"] +opt-einsum = ["opt-einsum>=3.3"] +pyyaml = ["pyyaml"] -[build-system] -requires = [ - # After 75.8.2 dropped dep disttools API. Please fix - # API temporarily restored and shim used. Please fix - # Setuptools will drop support for setup.py past 80 - # min version for recursive glob package data support - "setuptools>=62.3.0,<80.0", - "wheel", - "astunparse", - "numpy", - "ninja", - "pyyaml", - "cmake", - "typing-extensions>=4.10.0", - "requests", -] -# Use legacy backend to import local packages in setup.py -build-backend = "setuptools.build_meta:__legacy__" - - -[tool.black] -line-length = 88 +# Linter tools ################################################################# [tool.isort] src_paths = ["caffe2", "torch", "torchgen", "functorch", "test"] @@ -59,12 +115,10 @@ multi_line_output = 3 include_trailing_comma = true combine_as_imports = true - [tool.usort.known] first_party = ["caffe2", "torch", "torchgen", "functorch", "test"] standard_library = ["typing_extensions"] - [tool.ruff] line-length = 88 src = ["caffe2", "torch", "torchgen", "functorch", "test"] @@ -105,6 +159,7 @@ ignore = [ "E741", "EXE001", "F405", + "FURB122", # writelines # these ignores are from flake8-logging-format; please fix! "G101", # these ignores are from ruff NPY; please fix! 
@@ -127,7 +182,15 @@ ignore = [ "SIM117", "SIM118", "UP007", # keep-runtime-typing + "UP045", # keep-runtime-typing "TC006", + # TODO: Remove Python-3.10 specific suppressions + "B905", + "UP035", + "UP036", + "UP038", + "UP041", + "FURB161", ] select = [ "B", @@ -208,6 +271,10 @@ select = [ "YTT", ] +[tool.ruff.lint.pyupgrade] +# Preserve types, even if a file imports `from __future__ import annotations`. +keep-runtime-typing = true + [tool.ruff.lint.per-file-ignores] "__init__.py" = [ "F401", diff --git a/python-torch.spec b/python-torch.spec index e25a665..3dad39d 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,14 +6,15 @@ # So pre releases can be tried %bcond_without gitcommit %if %{with gitcommit} -# v2.9.0-rc4 -%global commit0 715dca672526a20322d07c2e67772cfe4400a20f +# v2.9.0-rc6 +%global commit0 fd364580a94079854f2f32d463c118afaefe62e0 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250923 +%global date0 20251002 %global pypi_version 2.9.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 +%global rc_tag -rc6 %else %global pypi_version 2.8.0 %global flatbuffers_version 24.12.23 @@ -116,7 +117,9 @@ Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch %endif -ExclusiveArch: x86_64 aarch64 +# ExclusiveArch: x86_64 aarch64 +# aarch64 not building on 2.9.0-rc6 +ExclusiveArch: x86_64 %global toolchain gcc %global _lto_cflags %nil diff --git a/sources b/sources index 0fdf299..701e6b4 100644 --- a/sources +++ b/sources @@ -13,3 +13,4 @@ SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab415 SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 +SHA512 (pytorch-fd36458.tar.gz) = acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc From b615a5f89b4a35fc47b600ed1c75ebd5e3a21863 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 13 Oct 2025 10:17:32 -0700 Subject: [PATCH 30/38] Update to 2.9.0-rc9 Introduce pytorch-rpm-macros package. 
Add %pytorch_arches to the macros, set to aarch64 and x86_64 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 43 ++++++++++++++++++++++++++++++++----------- sources | 1 + 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 2918194..2dab732 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ /v1.18.0.tar.gz /pytorch-715dca6.tar.gz /pytorch-fd36458.tar.gz +/pytorch-0fabc3b.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 3dad39d..f7f7f0c 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,15 +6,15 @@ # So pre releases can be tried %bcond_without gitcommit %if %{with gitcommit} -# v2.9.0-rc6 -%global commit0 fd364580a94079854f2f32d463c118afaefe62e0 +# v2.9.0-rc9 +%global commit0 0fabc3ba44823f257e70ce397d989c8de5e362c1 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20251002 +%global date0 20251008 %global pypi_version 2.9.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 -%global rc_tag -rc6 +%global rc_tag -rc9 %else %global pypi_version 2.8.0 %global flatbuffers_version 24.12.23 @@ -117,9 +117,13 @@ Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch %endif -# ExclusiveArch: x86_64 aarch64 -# aarch64 not building on 2.9.0-rc6 -ExclusiveArch: x86_64 +%if 0%{?fedora} >= 45 +# drop aarch64 in 45 +%global pt_arches x86_64 +%else +%global pt_arches x86_64 aarch64 +%endif +ExclusiveArch: %pt_arches %global toolchain gcc %global _lto_cflags %nil @@ -137,6 +141,9 @@ BuildRequires: gloo-devel BuildRequires: json-devel BuildRequires: libomp-devel +%if %{with gitcommit} +BuildRequires: moodycamel-concurrentqueue-devel +%endif BuildRequires: numactl-devel BuildRequires: ninja-build %if %{with onnx} @@ -205,10 +212,6 @@ BuildRequires: rocm-smi-devel BuildRequires: rocthrust-devel BuildRequires: roctracer-devel -%if %{with gitcommit} -BuildRequires: moodycamel-concurrentqueue-devel -%endif - Requires: amdsmi %endif @@ -261,6 +264,14 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. +%package -n pytorch-rpm-macros +Summary: PyTorch RPM macros +BuildArch: noarch + +%description -n pytorch-rpm-macros +This package contains PyTorch related RPM macros. 
+ + %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -512,6 +523,9 @@ sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/conc %build +# Export the arches +echo "%%pytorch_arches %pt_arches" > macros.pytorch + # # Control the number of jobs # @@ -653,6 +667,10 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %install +# pytorch rpm macros +install -Dpm 644 macros.pytorch \ + %{buildroot}%{_rpmmacrodir}/macros.pytorch + %if %{with rocm} export USE_ROCM=ON export USE_ROCM_CK=OFF @@ -691,6 +709,9 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch +%files -n pytorch-rpm-macros +%{_rpmmacrodir}/macros.pytorch + %changelog %autochangelog diff --git a/sources b/sources index 701e6b4..89e2a95 100644 --- a/sources +++ b/sources @@ -14,3 +14,4 @@ SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60 SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 SHA512 (pytorch-fd36458.tar.gz) = acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc +SHA512 (pytorch-0fabc3b.tar.gz) = 2e87975de0bf6f3dcede168b379e1928712bca16170c2a8ee7d63459f53086c01baac05e0763e4d5d28cdaf1c7d8912225ee06adeff96ead4f6f456ee174b341 From ef01976cf4687ab7bce860c23297cf9b6d105940 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 15 Oct 2025 16:27:01 -0700 Subject: [PATCH 31/38] Update to 2.9.0 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 25 ++----------------------- sources | 1 + 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 2dab732..444b9ca 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,4 @@ /pytorch-715dca6.tar.gz /pytorch-fd36458.tar.gz /pytorch-0fabc3b.tar.gz +/pytorch-v2.9.0.tar.gz diff --git a/python-torch.spec b/python-torch.spec index f7f7f0c..0fd7ebd 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -4,7 +4,7 @@ %global forgeurl https://github.com/pytorch/pytorch # So pre releases can be tried -%bcond_without gitcommit +%bcond_with gitcommit %if %{with gitcommit} # v2.9.0-rc9 %global commit0 0fabc3ba44823f257e70ce397d989c8de5e362c1 @@ -16,7 +16,7 @@ %global pybind11_version 2.13.6 %global rc_tag -rc9 %else -%global pypi_version 2.8.0 +%global pypi_version 2.9.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 @@ -109,14 +109,6 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz %endif -%if %{with gitcommit} -%else -# https://github.com/pytorch/pytorch/issues/150187 -Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch -# https://github.com/pytorch/pytorch/issues/156595 -Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch -%endif - %if 0%{?fedora} >= 45 # drop aarch64 in 45 %global pt_arches x86_64 @@ -141,9 +133,7 @@ BuildRequires: gloo-devel BuildRequires: json-devel BuildRequires: libomp-devel -%if %{with gitcommit} BuildRequires: moodycamel-concurrentqueue-devel -%endif BuildRequires: numactl-devel BuildRequires: ninja-build %if %{with onnx} @@ -514,13 +504,9 @@ sed -i -e 's@HIP 1.0@HIP 
MODULE@' cmake/public/LoadHIP.cmake %endif -%if %{with gitcommit} # moodycamel include path needs adjusting to use the system's sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/concurrentqueue@' cmake/Dependencies.cmake - -%endif - %build # Export the arches @@ -637,13 +623,9 @@ export BUILD_TEST=ON %if %{with rocm} export USE_ROCM=ON -%if %{with gitcommit} export USE_ROCM_CK_SDPA=OFF export USE_ROCM_CK_GEMM=OFF export USE_FBGEMM_GENAI=OFF -%else -export USE_ROCM_CK=OFF -%endif # Magma is broken on ROCm 7 # export USE_MAGMA=ON @@ -703,9 +685,6 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %license LICENSE %doc README.md %{_bindir}/torchrun -%if %{without gitcommit} -%{_bindir}/torchfrtrace -%endif %{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch diff --git a/sources b/sources index 89e2a95..05c1f96 100644 --- a/sources +++ b/sources @@ -15,3 +15,4 @@ SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e7 SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 SHA512 (pytorch-fd36458.tar.gz) = acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc SHA512 (pytorch-0fabc3b.tar.gz) = 2e87975de0bf6f3dcede168b379e1928712bca16170c2a8ee7d63459f53086c01baac05e0763e4d5d28cdaf1c7d8912225ee06adeff96ead4f6f456ee174b341 +SHA512 (pytorch-v2.9.0.tar.gz) = ae989e3a7fe30f9ea90944dc25e21ca92f2a94ee40d8de974a168c292d82c16ee8920624eff91a85755469ad05473dce0f85893e3ed7794ec5c6bdd89cbd2023 From 741c412249afb0dabfb37cdc6f3bbd05ce3ec176 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 17 Oct 2025 10:05:50 -0700 Subject: [PATCH 32/38] Remove pytorch-rpm-macros package. This does not work when building on a general arch Signed-off-by: Tom Rix --- python-torch.spec | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 0fd7ebd..b623dc6 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -254,14 +254,6 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. -%package -n pytorch-rpm-macros -Summary: PyTorch RPM macros -BuildArch: noarch - -%description -n pytorch-rpm-macros -This package contains PyTorch related RPM macros. 
- - %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -510,7 +502,7 @@ sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/conc %build # Export the arches -echo "%%pytorch_arches %pt_arches" > macros.pytorch +# echo "%%pytorch_arches %pt_arches" > macros.pytorch # # Control the number of jobs @@ -650,8 +642,8 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %install # pytorch rpm macros -install -Dpm 644 macros.pytorch \ - %{buildroot}%{_rpmmacrodir}/macros.pytorch +# install -Dpm 644 macros.pytorch \ +# %{buildroot}%{_rpmmacrodir}/macros.pytorch %if %{with rocm} export USE_ROCM=ON @@ -688,9 +680,6 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch -%files -n pytorch-rpm-macros -%{_rpmmacrodir}/macros.pytorch - %changelog %autochangelog From e0030b3ec55af789d1db27c223bb58a4c102498b Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 17 Nov 2025 14:11:32 -0800 Subject: [PATCH 33/38] Rebuild for ROCm 7.1 Signed-off-by: Tom Rix From b3977567d226ae906a3f253b8e782d47373dd578 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 24 Nov 2025 07:03:31 -0800 Subject: [PATCH 34/38] Always include onnx src Signed-off-by: Tom Rix --- python-torch.spec | 2 -- 1 file changed, 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index b623dc6..7bd82df 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -104,10 +104,8 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif -%if %{without onnx} %global ox_ver 1.18.0 Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz -%endif %if 0%{?fedora} >= 45 # drop aarch64 in 45 From 7908450a47cf1c4210b8ab7ab0132587580e1d72 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 18 Dec 2025 13:52:00 -0800 Subject: [PATCH 35/38] Update to 2.9.1 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 2 +- sources | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 444b9ca..c424df5 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ /pytorch-fd36458.tar.gz /pytorch-0fabc3b.tar.gz /pytorch-v2.9.0.tar.gz +/pytorch-v2.9.1.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 7bd82df..e493b97 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -16,7 +16,7 @@ %global pybind11_version 2.13.6 %global rc_tag -rc9 %else -%global pypi_version 2.9.0 +%global pypi_version 2.9.1 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 diff --git a/sources b/sources index 05c1f96..9a3681f 100644 --- a/sources +++ b/sources @@ -16,3 +16,4 @@ SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2c SHA512 (pytorch-fd36458.tar.gz) = acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc SHA512 (pytorch-0fabc3b.tar.gz) = 2e87975de0bf6f3dcede168b379e1928712bca16170c2a8ee7d63459f53086c01baac05e0763e4d5d28cdaf1c7d8912225ee06adeff96ead4f6f456ee174b341 SHA512 (pytorch-v2.9.0.tar.gz) = ae989e3a7fe30f9ea90944dc25e21ca92f2a94ee40d8de974a168c292d82c16ee8920624eff91a85755469ad05473dce0f85893e3ed7794ec5c6bdd89cbd2023 +SHA512 (pytorch-v2.9.1.tar.gz) = 88de0289fa2760abd69bef505b5ae3b6d7ff176b415cbb31bbc89ce5476a3800b322a97c4490f270f8b89657aff931bf9a5516202b268e0bb8b1f63dbb87b34a From 
7ddebb112b1931a495e78fba092b77e4f91022df Mon Sep 17 00:00:00 2001
From: "Alexander F. Lent"
Date: Thu, 18 Dec 2025 09:08:09 -0500
Subject: [PATCH 36/38] Improve build times on non-x86 systems

Signed-off-by: Alexander F. Lent
---
 python-torch.spec | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python-torch.spec b/python-torch.spec
index e493b97..abacce9 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -506,10 +506,15 @@ sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/conc
 # Control the number of jobs
 #
 # The build can fail if too many threads exceed the physical memory
-# So count core and and memory and increase the build memory util the build succeeds
+# Run at least one thread, more if CPU & memory resources are available.
 #
+%ifarch x86_64
 # Real cores, No hyperthreading
 COMPILE_JOBS=`cat /proc/cpuinfo | grep -m 1 'cpu cores' | awk '{ print $4 }'`
+%else
+# cpuinfo format varies on other arches, fall back to nproc
+COMPILE_JOBS=`nproc`
+%endif
 if [ ${COMPILE_JOBS}x = x ]; then
     COMPILE_JOBS=1
 fi

From 767d576d1df8b3041f11c86243bcf3503a6d559a Mon Sep 17 00:00:00 2001
From: "Alexander F. Lent"
Date: Sat, 20 Dec 2025 21:24:48 -0500
Subject: [PATCH 37/38] Continue to support aarch64, with myself as maintainer

Signed-off-by: Alexander F. Lent
---
 python-torch.spec | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/python-torch.spec b/python-torch.spec
index abacce9..640adb3 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -107,12 +107,7 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{
 %global ox_ver 1.18.0
 Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz
 
-%if 0%{?fedora} >= 45
-# drop aarch64 in 45
-%global pt_arches x86_64
-%else
 %global pt_arches x86_64 aarch64
-%endif
 ExclusiveArch: %pt_arches
 %global toolchain gcc
 %global _lto_cflags %nil

From 294accd75d5a894c11d48bbea7a5d2dd70bebd15 Mon Sep 17 00:00:00 2001
From: Tom Rix
Date: Mon, 12 Jan 2026 16:38:15 -0800
Subject: [PATCH 38/38] Setting the HIP device lib path is no longer needed

Signed-off-by: Tom Rix
---
 python-torch.spec | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python-torch.spec b/python-torch.spec
index 640adb3..d3c31d7 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -621,8 +621,8 @@ export USE_FBGEMM_GENAI=OFF
 # export USE_MAGMA=ON
 export HIP_PATH=`hipconfig -p`
 export ROCM_PATH=`hipconfig -R`
-RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir`
-export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
+#RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir`
+#export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
 # pytorch uses clang, not hipcc
 export HIP_CLANG_PATH=%{rocmllvm_bindir}
 
@@ -648,8 +648,8 @@ export USE_ROCM=ON
 export USE_ROCM_CK=OFF
 export HIP_PATH=`hipconfig -p`
 export ROCM_PATH=`hipconfig -R`
-RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir`
-export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
+# RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir`
+# export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
 # pytorch uses clang, not hipcc
 export HIP_CLANG_PATH=%{rocmllvm_bindir}
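
---

Editor's note: the job-count logic from PATCH 36 can be exercised outside of rpmbuild with a minimal standalone sketch. RPM evaluates %ifarch at spec-parse time, so the runtime uname check below is an assumption made for illustration only and is not part of the spec itself:

#!/bin/bash
# Pick a parallel job count the way the spec does: physical cores on
# x86_64 (the "cpu cores" field of /proc/cpuinfo is x86-specific),
# nproc everywhere else, and never fewer than one job.
case "$(uname -m)" in
x86_64)
    # Real cores, no hyperthreading
    COMPILE_JOBS=$(grep -m 1 'cpu cores' /proc/cpuinfo | awk '{ print $4 }')
    ;;
*)
    # cpuinfo format varies on other arches, fall back to nproc
    COMPILE_JOBS=$(nproc)
    ;;
esac
# Guard against an empty result, matching the spec's fallback
if [ "${COMPILE_JOBS}x" = "x" ]; then
    COMPILE_JOBS=1
fi
echo "would build with ${COMPILE_JOBS} jobs"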