From 9c39544f2ccbc19b4a11c0c0709e3313d5711cf0 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 17 Feb 2025 05:15:28 -0800 Subject: [PATCH 01/38] Remove rocm loop Signed-off-by: Tom Rix --- python-torch.spec | 64 ++--------------------------------------------- 1 file changed, 2 insertions(+), 62 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 86794c5..7d6d26e 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -25,9 +25,6 @@ %ifarch x86_64 %bcond_without rocm %endif -%bcond_with rocm_loop -%global rocm_default_gpu default -%global rocm_gpu_list gfx9 # For testing distributed+rccl etc. %bcond_with rccl @@ -180,12 +177,10 @@ BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros -BuildRequires: rocm-rpm-macros-modules BuildRequires: rocthrust-devel BuildRequires: roctracer-devel Requires: amdsmi -Requires: rocm-rpm-macros-modules %endif @@ -236,15 +231,6 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. -%if %{with rocm_loop} -%package -n python3-%{pypi_name}-rocm-gfx9 -Summary: %{name} for ROCm gfx9 - -%description -n python3-%{pypi_name}-rocm-gfx9 -%{summary} - -%endif - %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -567,24 +553,8 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} - -gpu=%{rocm_default_gpu} -module load rocm/$gpu -export PYTORCH_ROCM_ARCH=$ROCM_GPUS +export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %py3_build -mv build build-${gpu} -module purge - -%if %{with rocm_loop} -for gpu in %{rocm_gpu_list} -do - module load rocm/$gpu - export PYTORCH_ROCM_ARCH=$ROCM_GPUS - %py3_build - mv build build-${gpu} - module purge -done -%endif %else @@ -603,28 +573,8 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} - -gpu=%{rocm_default_gpu} -module load rocm/$gpu -export PYTORCH_ROCM_ARCH=$ROCM_GPUS -mv build-${gpu} build +export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %py3_install -mv build build-${gpu} -module purge - -%if %{with rocm_loop} -for gpu in %{rocm_gpu_list} -do - module load rocm/$gpu - export PYTORCH_ROCM_ARCH=$ROCM_GPUS - mv build-${gpu} build - # need to customize the install location, so replace py3_install - %{__python3} %{py_setup} %{?py_setup_args} install -O1 --skip-build --root %{buildroot} --prefix /usr/lib64/rocm/${gpu} %{?*} - rm -rfv %{buildroot}/usr/lib/rocm/${gpu}/bin/__pycache__ - mv build build-${gpu} - module purge -done -%endif %else @@ -650,16 +600,6 @@ done %{python3_sitearch}/functorch %{python3_sitearch}/torchgen -%if %{with rocm} -%if %{with rocm_loop} - -%files -n python3-%{pypi_name}-rocm-gfx9 -%{_libdir}/rocm/gfx9/bin/* -%{_libdir}/rocm/gfx9/lib64/* - -%endif -%endif - %changelog %autochangelog From 2508009c1f33f1a5ff742b9bad321afc1d2ddc0b Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 26 Feb 2025 07:32:31 -0800 Subject: [PATCH 02/38] Remove gold linker Signed-off-by: Tom Rix --- python-torch.spec | 2 -- 1 file changed, 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 7d6d26e..6ceceea 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -105,7 +105,6 @@ ExclusiveArch: x86_64 aarch64 %global _lto_cflags %nil BuildRequires: cmake -BuildRequires: binutils-gold BuildRequires: eigen3-devel 
BuildRequires: flexiblas-devel BuildRequires: fmt-devel @@ -483,7 +482,6 @@ export USE_CUDA=OFF export USE_FAKELOWP=OFF export USE_FBGEMM=OFF export USE_FLASH_ATTENTION=OFF -export USE_GOLD_LINKER=ON export USE_GLOO=OFF export USE_ITT=OFF export USE_KINETO=OFF From 7569831b203df17daeb25bc64ca9806449b10bb7 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 1 Mar 2025 07:52:12 -0800 Subject: [PATCH 03/38] cmake version changed Signed-off-by: Tom Rix --- python-torch.spec | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python-torch.spec b/python-torch.spec index 6ceceea..48096a4 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -420,6 +420,9 @@ sed -i -e 's@DESTINATION ${PYTHON_LIB_REL_PATH}@DESTINATION ${CMAKE_INSTALL_PREF # reenable foxi linking sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@#list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@' cmake/Dependencies.cmake +# cmake version changed +sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' third_party/tensorpipe/third_party/libuv/CMakeLists.txt + %if %{with rocm} # hipify ./tools/amd_build/build_amd.py From dd353fd56b4b082675582145e25af4d7568459b0 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 12 Mar 2025 08:07:40 -0700 Subject: [PATCH 04/38] Remove papering over c++ assert problem. Signed-off-by: Tom Rix --- python-torch.spec | 2 -- 1 file changed, 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 48096a4..7787533 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -97,8 +97,6 @@ Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # Patches need to be refactored for ToT # These are ROCm packages Patch101: 0001-cuda-hip-signatures.patch -# https://github.com/pytorch/pytorch/issues/145608 -Patch102: 0001-torch-paper-over-c-assert.patch ExclusiveArch: x86_64 aarch64 %global toolchain gcc From 23f5d1192643bdfc836fb031dac1dcc63e075e99 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 13 Mar 2025 05:07:43 -0700 Subject: [PATCH 05/38] Update gitcommit Signed-off-by: Tom Rix --- python-torch.spec | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 7787533..95584ca 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,13 +6,19 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.5.0-rc9 -%global commit0 417a0763a7d69f6ce80719ac89c1d2deeee78163 +# v2.7.0-rc1 +%global commit0 cdd7a2c72bbf0a72faf6fe4b4903c053f0465a2e %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 2024103 -%global pypi_version 2.5.0 +%global date0 20250412 +%global pypi_version 2.7.0 +%global flatbuffers_version 23.3.3 +%global miniz_version 3.0.2 +%global pybind11_version 2.13.6 %else %global pypi_version 2.5.1 +%global flatbuffers_version 23.3.3 +%global miniz_version 2.1.0 +%global pybind11_version 2.11.1 %endif # For -test subpackage @@ -60,8 +66,8 @@ Source1000: pyproject.toml %else Source0: %{forgeurl}/releases/download/v%{version}/pytorch-v%{version}.tar.gz %endif -Source1: https://github.com/google/flatbuffers/archive/refs/tags/v23.3.3.tar.gz -Source2: https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz +Source1: https://github.com/google/flatbuffers/archive/refs/tags/v%{flatbuffers_version}.tar.gz +Source2: https://github.com/pybind/pybind11/archive/refs/tags/v%{pybind11_version}.tar.gz # Developement on tensorpipe has stopped, repo made read only July 1, 2023, this is the last commit %global tp_commit 
52791a2fd214b2a9dc5759d36725909c1daa7f2e @@ -91,12 +97,14 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif +%if %{without gitcommit} Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # ROCm patches # Patches need to be refactored for ToT # These are ROCm packages Patch101: 0001-cuda-hip-signatures.patch +%endif ExclusiveArch: x86_64 aarch64 %global toolchain gcc @@ -205,10 +213,10 @@ Summary: %{summary} Provides: pytorch # Apache-2.0 -Provides: bundled(flatbuffers) = 22.3.3 +Provides: bundled(flatbuffers) = %{flatbuffers_version} # MIT -Provides: bundled(miniz) = 2.1.0 -Provides: bundled(pybind11) = 2.11.1 +Provides: bundled(miniz) = %{miniz_version} +Provides: bundled(pybind11) = %{pybind11_version} %if %{with tensorpipe} # BSD-3-Clause @@ -254,11 +262,11 @@ rm -rf %{pypi_name}.egg-info tar xf %{SOURCE1} rm -rf third_party/flatbuffers/* -cp -r flatbuffers-23.3.3/* third_party/flatbuffers/ +cp -r flatbuffers-%{flatbuffers_version}/* third_party/flatbuffers/ tar xf %{SOURCE2} rm -rf third_party/pybind11/* -cp -r pybind11-2.11.1/* third_party/pybind11/ +cp -r pybind11-%{pybind11_version}/* third_party/pybind11/ %if %{with tensorpipe} tar xf %{SOURCE20} @@ -345,7 +353,7 @@ sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py # the third_party dir to compile the file. # mimiz is licensed MIT # https://github.com/richgel999/miniz/blob/master/LICENSE -mv third_party/miniz-2.1.0 . +mv third_party/miniz-%{miniz_version} . # # setup.py depends on this script mv third_party/build_bundled.py . @@ -379,7 +387,7 @@ mv third_party/googletest . rm -rf third_party/* # Put stuff back mv build_bundled.py third_party -mv miniz-2.1.0 third_party +mv miniz-%{miniz_version} third_party mv flatbuffers third_party mv pybind11 third_party From bd11f4aa1a27d626aae19b7278b790e753e587bc Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 22 Mar 2025 11:58:26 -0700 Subject: [PATCH 06/38] Update gitcommit to v2.7.0-rc2 Signed-off-by: Tom Rix --- python-torch.spec | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 95584ca..8a9a9f7 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc1 -%global commit0 cdd7a2c72bbf0a72faf6fe4b4903c053f0465a2e +# v2.7.0-rc2 +%global commit0 b1940b5867e40e40ebdce4db76f76d3d0b71d3f4 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250412 +%global date0 20250413 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 @@ -253,6 +253,10 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release} # Overwrite with a git checkout of the pyproject.toml cp %{SOURCE1000} . 
+# https://github.com/pytorch/pytorch/issues/149803 +# Tries to checkout nccl +sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py + %else %autosetup -p1 -n pytorch-v%{version} %endif @@ -329,6 +333,12 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-depr # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt +%if %{with gitcommit} +sed -i -e 's@fmt::fmt-header-only@fmt@' aten/src/ATen/CMakeLists.txt +sed -i -e 's@list(APPEND ATen_HIP_INCLUDE $)@@' aten/src/ATen/CMakeLists.txt + +sed -i -e 's@fmt::fmt-header-only@fmt@' third_party/kineto/libkineto/CMakeLists.txt +%endif sed -i -e 's@fmt::fmt-header-only@fmt@' c10/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' torch/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' cmake/Dependencies.cmake @@ -434,6 +444,10 @@ sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION ./tools/amd_build/build_amd.py # Fedora installs to /usr/include, not /usr/include/rocm-core sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/TunableGemm.h +%if %{with gitcommit} +# https://github.com/pytorch/pytorch/issues/149805 +sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' cmake/public/LoadHIP.cmake +%endif # use any hip, correct CMAKE_MODULE_PATH sed -i -e 's@lib/cmake/hip@lib64/cmake/hip@' cmake/public/LoadHIP.cmake sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake @@ -512,14 +526,22 @@ export USE_SYSTEM_ONNX=ON export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF export USE_TENSORPIPE=OFF +%if %{with gitcommit} +export USE_XNNPACK=OFF +%else export USE_XNNPACK=ON +%endif export USE_XPU=OFF export USE_SYSTEM_PTHREADPOOL=ON export USE_SYSTEM_CPUINFO=ON export USE_SYSTEM_FP16=ON export USE_SYSTEM_FXDIV=ON export USE_SYSTEM_PSIMD=ON +%if %{with gitcommit} +export USE_SYSTEM_XNNPACK=OFF +%else export USE_SYSTEM_XNNPACK=ON +%endif export USE_DISTRIBUTED=ON %if %{with tensorpipe} From e80f34f74dac2bbc813c0914a1b4afd74abbe057 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 29 Mar 2025 05:11:02 -0700 Subject: [PATCH 07/38] Update gitcommit to 2.7-rc3 Signed-off-by: Tom Rix --- 0001-Add-cmake-varaible-USE_ROCM_CK.patch | 120 ++++++++++++++++++++++ 0001-python-torch-disable-ck.patch | 112 ++++++++++++++++++++ python-torch.spec | 25 ++++- 3 files changed, 254 insertions(+), 3 deletions(-) create mode 100644 0001-Add-cmake-varaible-USE_ROCM_CK.patch create mode 100644 0001-python-torch-disable-ck.patch diff --git a/0001-Add-cmake-varaible-USE_ROCM_CK.patch b/0001-Add-cmake-varaible-USE_ROCM_CK.patch new file mode 100644 index 0000000..b34e07a --- /dev/null +++ b/0001-Add-cmake-varaible-USE_ROCM_CK.patch @@ -0,0 +1,120 @@ +From 0f33e0a7bbd1522ee74f8fc1fbe3af7563318c79 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 28 Mar 2025 15:33:09 -0700 +Subject: [PATCH] Add cmake varaible USE_ROCM_CK + +To control the use of ROCm Composable Kernel usage. + +CK is not compatible with all rocBLAS gpu's, so the user +must explicitly choose to use CK. 
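+
+For example, a build can opt in with something like:
+
+  # opt in to CK kernels (USE_* variables are forwarded to CMake
+  # by setup.py, as the spec file's own exports rely on)
+  USE_ROCM=ON USE_ROCM_CK=ON python setup.py develop
+
+or, when invoking CMake directly, with -DUSE_ROCM_CK=ON.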
+ +Signed-off-by: Tom Rix +--- + CMakeLists.txt | 1 + + aten/src/ATen/CMakeLists.txt | 8 ++++++-- + aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- + cmake/Dependencies.cmake | 3 +++ + 4 files changed, 15 insertions(+), 7 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index f3fee2f7ffc2..73903acce452 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -249,6 +249,7 @@ cmake_dependent_option( + BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON + "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) + cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) ++cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) + option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) + cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) + cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index 085af373ec22..af268ab88572 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -361,13 +361,17 @@ endif() + ${native_quantized_hip_hip} + ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels and Triton ++ if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels and Triton + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" +- ${native_hip_bgemm} ${native_hip_ck} ++ ${native_hip_bgemm} ${native_hip_ck}) ++ endif() ++ if(WIN32) # Windows doesn't support Composable Kernels and Triton ++ exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" + ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) + endif() ++ + # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) + list(APPEND all_hip_cpp + ${native_nested_hip_cpp} +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index a62b028fd4ff..a3dbf76848ea 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); + #endif + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); + } +@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); + } +@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if 
(at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); + } +@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 30917bdf39f5..2ca6091030f1 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1046,6 +1046,9 @@ if(USE_ROCM) + if(HIPBLASLT_VEC_EXT) + list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) + endif() ++ if(USE_ROCM_CK) ++ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) ++ endif() + list(APPEND HIP_HIPCC_FLAGS --offload-compress) + if(WIN32) + add_definitions(-DROCM_ON_WINDOWS) +-- +2.48.1 + diff --git a/0001-python-torch-disable-ck.patch b/0001-python-torch-disable-ck.patch new file mode 100644 index 0000000..e8fd9c2 --- /dev/null +++ b/0001-python-torch-disable-ck.patch @@ -0,0 +1,112 @@ +From 027dad1eaed51c1172e2497da611e3267d42d2f0 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 28 Mar 2025 09:16:03 -0700 +Subject: [PATCH] python-torch: disable ck + +--- + aten/src/ATen/CMakeLists.txt | 7 +++---- + aten/src/ATen/Context.cpp | 1 + + aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- + 3 files changed, 9 insertions(+), 9 deletions(-) + +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index 085af373ec22..84808880e51c 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -134,7 +134,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu") + file(GLOB native_cuda_cpp "native/cuda/*.cpp") + file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") + file(GLOB native_cuda_linalg_cpp "native/cuda/linalg/*.cpp") +-file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" "native/hip/bgemm_kernels/*.h") ++file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" ) + file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") + file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") + file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") +@@ -145,7 +145,7 @@ file(GLOB native_nested_h "native/nested/*.h") + file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") + file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") + +-file(GLOB native_hip_hip "native/hip/*.hip" "native/hip/bgemm_kernels/*.hip") ++file(GLOB native_hip_hip "native/hip/*.hip" ) + file(GLOB native_hip_cpp "native/hip/*.cpp") + file(GLOB native_hip_linalg_cpp "native/hip/linalg/*.cpp") + file(GLOB native_miopen_cpp "native/miopen/*.cpp") +@@ -361,13 +361,12 @@ endif() + ${native_quantized_hip_hip} + ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels and Triton + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" + ${native_hip_bgemm} ${native_hip_ck} + ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) +- endif() ++ + # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) + list(APPEND all_hip_cpp + ${native_nested_hip_cpp} +diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp +index f598fc3a39d3..03dab6ff38fe 
100644 +--- a/aten/src/ATen/Context.cpp ++++ b/aten/src/ATen/Context.cpp +@@ -355,6 +355,7 @@ at::BlasBackend Context::blasPreferredBackend() { + } + + void Context::setBlasPreferredBackend(at::BlasBackend b) { ++ return; + #ifdef _MSC_VER + TORCH_WARN_ONCE( + "torch.backends.cuda.preferred_blas_library is an experimental feature. " +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index a62b028fd4ff..cba38426ea1f 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); + #endif + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); + } +@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); + } +@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); + } +@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-- +2.48.1 + diff --git a/python-torch.spec b/python-torch.spec index 8a9a9f7..8f5ed02 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc2 -%global commit0 b1940b5867e40e40ebdce4db76f76d3d0b71d3f4 +# v2.7.0-rc3 +%global commit0 b04d8358d959925bee0adfd67cc17987af9fbb9d %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250413 +%global date0 20250326 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 @@ -104,6 +104,13 @@ Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # Patches need to be refactored for ToT # These are ROCm packages Patch101: 0001-cuda-hip-signatures.patch +%else +# https://github.com/pytorch/pytorch/issues/150187 +# The hack job +# Patch11: 0001-python-torch-disable-ck.patch +# Cleaned up hack job +Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch + %endif ExclusiveArch: x86_64 aarch64 @@ -159,6 +166,9 @@ BuildRequires: python3dist(sympy) %endif %if %{with rocm} +%if %{with 
gitcommit}
-BuildRequires: composable_kernel-devel
-%endif
 BuildRequires: hipblas-devel
 BuildRequires: hipblaslt-devel
 BuildRequires: hipcub-devel

From e3c2449e4fac3865f272f20e24f20948b2d9b208 Mon Sep 17 00:00:00 2001
From: Tom Rix
Date: Thu, 10 Apr 2025 05:27:21 -0700
Subject: [PATCH 09/38] Update gitcommit to 2.7.0-rc8

Signed-off-by: Tom Rix
---
 python-torch.spec | 6 +++---
 1 file changed, 3
insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index a217432..8ea791e 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc6 -%global commit0 06c6a81a987e271d35a5da9501b4a17915bb8206 +# v2.7.0-rc8 +%global commit0 c7ff78dfc0c38847bf5daa78ab8b3669e1734246 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250403 +%global date0 20250408 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 From f0eda9ace1c0f50ab511cfd9fe524b50053e09c1 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 13 Apr 2025 07:11:27 -0700 Subject: [PATCH 10/38] Update gitcommit to 2.7.0-rc9 Signed-off-by: Tom Rix --- python-torch.spec | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 8ea791e..f03837b 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc8 -%global commit0 c7ff78dfc0c38847bf5daa78ab8b3669e1734246 +# v2.7.0-rc9 +%global commit0 073912749d667fcfb2de1c15e1e664dc0ccd3460 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250408 +%global date0 20250410 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 From fb64b28d49af6ca8bd72c63bc83166a0fd737b32 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sat, 19 Apr 2025 08:20:35 -0700 Subject: [PATCH 11/38] Update gitcommit to 2.7.0-rc10 Signed-off-by: Tom Rix --- python-torch.spec | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index f03837b..5a9f08f 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc9 -%global commit0 073912749d667fcfb2de1c15e1e664dc0ccd3460 +# v2.7.0-rc10 +%global commit0 134179474539648ba7dee1317959529fbd0e7f89 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250410 +%global date0 20250415 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 From 2f3d92b7c5b36d8afb0846f4af6871bcb7a7fc1d Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 25 Apr 2025 12:39:04 -0700 Subject: [PATCH 12/38] Update to 2.7.0 Signed-off-by: Tom Rix --- .gitignore | 2 ++ python-torch.spec | 37 +++++++++++-------------------------- sources | 8 +++----- 3 files changed, 16 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index cdf142f..25abff5 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ /pytorch-v2.4.1.tar.gz /pytorch-v2.5.0.tar.gz /pytorch-v2.5.1.tar.gz +/pytorch-v2.7.0.tar.gz +/v2.13.6.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 5a9f08f..bc15924 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -15,10 +15,10 @@ %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %else -%global pypi_version 2.5.1 +%global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 -%global miniz_version 2.1.0 -%global pybind11_version 2.11.1 +%global miniz_version 3.0.2 +%global pybind11_version 2.13.6 %endif # For -test subpackage @@ -98,19 +98,20 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ %endif %if %{without gitcommit} -Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch +# Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # 
ROCm patches # Patches need to be refactored for ToT # These are ROCm packages -Patch101: 0001-cuda-hip-signatures.patch -%else +# Patch101: 0001-cuda-hip-signatures.patch + # https://github.com/pytorch/pytorch/issues/150187 # The hack job # Patch11: 0001-python-torch-disable-ck.patch # Cleaned up hack job Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch +%else %endif ExclusiveArch: x86_64 aarch64 @@ -260,10 +261,6 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release} # Overwrite with a git checkout of the pyproject.toml cp %{SOURCE1000} . -# https://github.com/pytorch/pytorch/issues/149803 -# Tries to checkout nccl -sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py - %else %autosetup -p1 -n pytorch-v%{version} %endif @@ -342,12 +339,10 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt -%if %{with gitcommit} sed -i -e 's@fmt::fmt-header-only@fmt@' aten/src/ATen/CMakeLists.txt sed -i -e 's@list(APPEND ATen_HIP_INCLUDE $)@@' aten/src/ATen/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' third_party/kineto/libkineto/CMakeLists.txt -%endif sed -i -e 's@fmt::fmt-header-only@fmt@' c10/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' torch/CMakeLists.txt sed -i -e 's@fmt::fmt-header-only@fmt@' cmake/Dependencies.cmake @@ -361,6 +356,10 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPE sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt +# https://github.com/pytorch/pytorch/issues/149803 +# Tries to checkout nccl +sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py + # Disable the use of check_submodule's in the setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -453,13 +452,11 @@ sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION ./tools/amd_build/build_amd.py # Fedora installs to /usr/include, not /usr/include/rocm-core sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/TunableGemm.h -%if %{with gitcommit} # https://github.com/pytorch/pytorch/issues/149805 sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' cmake/public/LoadHIP.cmake # Fedora installs to /usr/include, not /usr/include/rocm-core sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/Tunable.cpp sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/cuda/tunable/Tunable.cpp -%endif # use any hip, correct CMAKE_MODULE_PATH sed -i -e 's@lib/cmake/hip@lib64/cmake/hip@' cmake/public/LoadHIP.cmake sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake @@ -538,22 +535,14 @@ export USE_SYSTEM_ONNX=ON export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF export USE_TENSORPIPE=OFF -%if %{with gitcommit} export USE_XNNPACK=OFF -%else -export USE_XNNPACK=ON -%endif export USE_XPU=OFF export USE_SYSTEM_PTHREADPOOL=ON export USE_SYSTEM_CPUINFO=ON export USE_SYSTEM_FP16=ON export USE_SYSTEM_FXDIV=ON export USE_SYSTEM_PSIMD=ON -%if %{with gitcommit} export USE_SYSTEM_XNNPACK=OFF -%else -export USE_SYSTEM_XNNPACK=ON -%endif export USE_DISTRIBUTED=ON %if %{with tensorpipe} @@ -634,10 +623,6 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %files -n python3-%{pypi_name} %license 
LICENSE %doc README.md -%if %{without gitcommit} -%{_bindir}/convert-caffe2-to-onnx -%{_bindir}/convert-onnx-to-caffe2 -%endif %{_bindir}/torchrun %{_bindir}/torchfrtrace %{python3_sitearch}/%{pypi_name} diff --git a/sources b/sources index aa1ed3c..4021d40 100644 --- a/sources +++ b/sources @@ -1,11 +1,9 @@ +SHA512 (pytorch-v2.7.0.tar.gz) = 17e875a66f1669901f5f770c9d829ba5bfa3967296cfb71550e8a92507181db742548eaf7cc9a2c478c4b91e366f27cc480e2e1bbb328db8501d30e1649839e6 SHA512 (v23.3.3.tar.gz) = 4066c94f2473c7ea16917d29a613e16f840a329089c88e0bdbdb999aef3442ba00abfd2aa92266fa9c067e399dc88e6f0ccac40dc151378857e665638e78bbf0 -SHA512 (v2.11.1.tar.gz) = ed1512ff0bca3bc0a45edc2eb8c77f8286ab9389f6ff1d5cb309be24bc608abbe0df6a7f5cb18c8f80a3bfa509058547c13551c3cd6a759af708fd0cdcdd9e95 +SHA512 (v2.13.6.tar.gz) = 497c25b33b09a9c42f67131ab82e35d689e8ce089dd7639be997305ff9a6d502447b79c824508c455d559e61f0186335b54dd2771d903a7c1621833930622d1a SHA512 (tensorpipe-52791a2.tar.gz) = 1e5faf17a7236c5506c08cb28be16069b11bb929bbca64ed9745ce4277d46739186ab7d6597da7437d90ed2d166d4c37ef2f3bceabe8083ef3adbb0e8e5f227e -SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3522470b1ecc4798ff43d344a013cee2f6901e83267c6167072947b754e63f1552ae7044cffe234c36 SHA512 (v1.41.0.tar.gz) = bb08a1970a10e8d9571ffea3d021643de30ec212cd51317b98d6cf0cfe55d6877992921fb01d1188a6d466687335b77885685d924f8cb7200a0bec30eee05c65 +SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3522470b1ecc4798ff43d344a013cee2f6901e83267c6167072947b754e63f1552ae7044cffe234c36 SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7 SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a -SHA512 (pytorch-v2.4.1.tar.gz) = fc364112a88540035f213274b526b4c8ad9be335ff214161091e8a8d3efa89ebef5a5b4d33b20b5b659896d164dcbe739f0d7d2a293d3afc0efcfaa2bf62fc2c -SHA512 (pytorch-v2.5.0.tar.gz) = 6ccf1ac9f191f5bd757ef7fbfc1dcd81d591577f2d3df7313c6ed32790c592aaffd253e18dc778a2fcc707e4533299817dfdf9fae108636ce5c29c1b8ff8bba6 -SHA512 (pytorch-v2.5.1.tar.gz) = a8882608c2ab6467a58d60c6df84c9f1004b43eafeba57db499dbbfdecc09db2e221b9d4c344c8af7c0bea6252e874c400483502dca24a0b474c376b9fef1dd4 From aeb5b118d5303d5cd147d573df87021f0db9ba2a Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 1 May 2025 08:40:21 -0700 Subject: [PATCH 13/38] Turn off kleidai Breaks aarch64 Signed-off-by: Tom Rix --- python-torch.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/python-torch.spec b/python-torch.spec index bc15924..ab500c5 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -517,6 +517,7 @@ export USE_FLASH_ATTENTION=OFF export USE_GLOO=OFF export USE_ITT=OFF export USE_KINETO=OFF +export USE_KLEIDIAI=OFF export USE_LITE_INTERPRETER_PROFILER=OFF export USE_LITE_PROTO=OFF export USE_MAGMA=OFF From e6d73d7c4909bb23cb0dfad74464ecb3a81cb286 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 4 May 2025 07:36:00 -0700 Subject: [PATCH 14/38] Rebuild for magma Signed-off-by: Tom Rix From edfa2c25e3ffc54addd3cc4e18e972f7e2a6cf89 Mon Sep 17 00:00:00 2001 From: Python Maint Date: Fri, 6 Jun 2025 16:14:25 +0200 Subject: [PATCH 15/38] Rebuilt for Python 3.14 From 
27593d78b34de046e4d8ebcf0c242343742bfc95 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 27 Jun 2025 14:44:29 -0700 Subject: [PATCH 16/38] update gitcommit to 2.8-rc3 Signed-off-by: Tom Rix --- pyproject.toml | 184 +++++++++++++++++++++++++++++++++++++++------- python-torch.spec | 35 ++++++--- 2 files changed, 181 insertions(+), 38 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9508ad0..ccf9c2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,41 @@ +[project] +name = "torch" +requires-python = ">=3.9" +license = {text = "BSD-3-Clause"} +dynamic = [ + "authors", + "classifiers", + "entry-points", + "dependencies", + "description", + "keywords", + "optional-dependencies", + "readme", + "scripts", + "version", +] + +[project.urls] +Homepage = "https://pytorch.org/" +Documentation = "https://pytorch.org/docs/" +Source = "https://github.com/pytorch/pytorch" +Forum = "https://discuss.pytorch.org/" + + [build-system] requires = [ - "setuptools", + # After 75.8.2 dropped dep disttools API. Please fix + # API temporarily restored and shim used. Please fix + # Setuptools will drop support for setup.py past 80 + # min version for recursive glob package data support + "setuptools>=62.3.0,<80.0", "wheel", "astunparse", "numpy", "ninja", "pyyaml", "cmake", - "typing-extensions", + "typing-extensions>=4.10.0", "requests", ] # Use legacy backend to import local packages in setup.py @@ -15,32 +43,68 @@ build-backend = "setuptools.build_meta:__legacy__" [tool.black] -# Uncomment if pyproject.toml worked fine to ensure consistency with flake8 -# line-length = 120 -target-version = ["py38", "py39", "py310", "py311"] +line-length = 88 + +[tool.isort] +src_paths = ["caffe2", "torch", "torchgen", "functorch", "test"] +extra_standard_library = ["typing_extensions"] +skip_gitignore = true +skip_glob = ["third_party/*"] +atomic = true +profile = "black" +indent = 4 +line_length = 88 +lines_after_imports = 2 +multi_line_output = 3 +include_trailing_comma = true +combine_as_imports = true + + +[tool.usort.known] +first_party = ["caffe2", "torch", "torchgen", "functorch", "test"] +standard_library = ["typing_extensions"] [tool.ruff] -target-version = "py38" +line-length = 88 +src = ["caffe2", "torch", "torchgen", "functorch", "test"] +[tool.ruff.format] +docstring-code-format = true +quote-style = "double" + +[tool.ruff.lint] # NOTE: Synchoronize the ignores with .flake8 +external = [ + "B001", + "B902", + "B950", + "E121", + "E122", + "E128", + "E131", + "E704", + "E723", + "F723", + "F812", + "P201", + "P204", + "T484", + "TOR901", +] ignore = [ # these ignores are from flake8-bugbear; please fix! "B007", "B008", "B017", "B018", # Useless expression - "B019", "B023", "B028", # No explicit `stacklevel` keyword argument found - "B904", "E402", "C408", # C408 ignored because we like the dict keyword argument syntax "E501", # E501 is not flexible enough, we're using B950 instead "E721", - "E731", # Assign lambda expression "E741", "EXE001", "F405", - "F841", # these ignores are from flake8-logging-format; please fix! "G101", # these ignores are from ruff NPY; please fix! @@ -48,39 +112,41 @@ ignore = [ # these ignores are from ruff PERF; please fix! "PERF203", "PERF401", - "PERF403", # these ignores are from PYI; please fix! - "PYI019", "PYI024", "PYI036", "PYI041", "PYI056", "SIM102", "SIM103", "SIM112", # flake8-simplify code styles "SIM105", # these ignores are from flake8-simplify. 
please fix or ignore with commented reason - "SIM108", + "SIM108", # SIM108 ignored because we prefer if-else-block instead of ternary expression "SIM110", "SIM114", # Combine `if` branches using logical `or` operator "SIM115", "SIM116", # Disable Use a dictionary instead of consecutive `if` statements "SIM117", "SIM118", - "UP006", # keep-runtime-typing "UP007", # keep-runtime-typing + "TC006", ] -line-length = 120 select = [ "B", + "B904", # Re-raised error without specifying the cause via the from keyword "C4", "G", "E", "EXE", "F", "SIM1", + "SIM911", "W", # Not included in flake8 + "FURB", + "LOG", "NPY", "PERF", "PGH004", + "PIE790", "PIE794", "PIE800", "PIE804", @@ -89,40 +155,92 @@ select = [ "PLC0131", # type bivariance "PLC0132", # type param mismatch "PLC0205", # string as __slots__ + "PLC3002", # unnecessary-direct-lambda-call "PLE", "PLR0133", # constant comparison "PLR0206", # property with params "PLR1722", # use sys exit + "PLR1736", # unnecessary list index "PLW0129", # assert on string literal + "PLW0131", # named expr without context + "PLW0133", # useless exception statement + "PLW0245", # super without brackets "PLW0406", # import self "PLW0711", # binary op exception + "PLW1501", # bad open mode + "PLW1507", # shallow copy os.environ "PLW1509", # preexec_fn not safe with threads + "PLW2101", # useless lock statement "PLW3301", # nested min max "PT006", # TODO: enable more PT rules + "PT014", # duplicate parameterize case "PT022", "PT023", "PT024", "PT025", "PT026", "PYI", + "Q003", # avoidable escaped quote + "Q004", # unnecessary escaped quote + "RSE", "RUF008", # mutable dataclass default + "RUF013", # ban implicit optional "RUF015", # access first ele in constant time "RUF016", # type error non-integer index "RUF017", - "TRY200", - "TRY302", + "RUF018", # no assignment in assert + "RUF019", # unnecessary-key-check + "RUF020", # never union + "RUF024", # from keys mutable + "RUF026", # default factory kwarg + "RUF030", # No print statement in assert + "RUF033", # default values __post_init__ dataclass + "RUF041", # simplify nested Literal + "RUF048", # properly parse `__version__` + "RUF200", # validate pyproject.toml + "S324", # for hashlib FIPS compliance + "SLOT", + "TC", + "TRY002", # ban vanilla raise (todo fix NOQAs) + "TRY203", + "TRY401", # verbose-log-message "UP", + "YTT", ] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "__init__.py" = [ "F401", ] +"*.pyi" = [ + "PYI011", # typed-argument-default-in-stub + "PYI021", # docstring-in-stub + "PYI053", # string-or-bytes-too-long +] +"functorch/notebooks/**" = [ + "F401", +] +"test/export/**" = [ + "PGH004" +] +"test/typing/**" = [ + "PGH004" +] "test/typing/reveal/**" = [ "F821", ] "test/torch_np/numpy_tests/**" = [ "F821", + "NPY201", +] +"test/dynamo/test_bytecode_utils.py" = [ + "F821", +] +"test/dynamo/test_debug_utils.py" = [ + "UP037", +] +"test/dynamo/test_misc.py" = [ + "PGH004", ] "test/jit/**" = [ "PLR0133", # tests require this for JIT @@ -136,19 +254,33 @@ select = [ "RUF015", "UP", # We don't want to modify the jit test as they test specify syntax ] - -"torch/onnx/**" = [ - "UP037", # ONNX does runtime type checking +"test/inductor/s429861_repro.py" = [ + "PGH004", +] +"test/inductor/test_torchinductor.py" = [ + "UP037", +] +# autogenerated #TODO figure out why file level noqa is ignored +"torch/_appdirs.py" = ["PGH004"] +"torch/jit/_shape_functions.py" = ["PGH004"] +"torch/_inductor/fx_passes/serialized_patterns/**" = ["F401", "F501"] 
+"torch/_inductor/autoheuristic/artifacts/**" = ["F401", "F501"] +"torch/_inductor/codegen/**" = [ + "PGH004" ] - "torchgen/api/types/__init__.py" = [ "F401", "F403", ] -"torchgen/executorch/api/types/__init__.py" = [ - "F401", - "F403", -] "torch/utils/collect_env.py" = [ "UP", # collect_env.py needs to work with older versions of Python ] +"torch/_vendor/**" = [ + "UP", # No need to mess with _vendor +] +"tools/linter/**" = [ + "LOG015" # please fix +] + +[tool.codespell] +ignore-words = "tools/linter/dictionary.txt" diff --git a/python-torch.spec b/python-torch.spec index ab500c5..90c908c 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,12 +6,12 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc10 -%global commit0 134179474539648ba7dee1317959529fbd0e7f89 +# v2.8.0-rc3 +%global commit0 3d53a53e504089a52a149791fd33d7fc898bd055 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250415 -%global pypi_version 2.7.0 -%global flatbuffers_version 23.3.3 +%global date0 20250625 +%global pypi_version 2.8.0 +%global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %else @@ -92,19 +92,16 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- %endif %if %{without kineto} +%if %{with gitcommit} +%global ki_commit 5e7501833f1021ce6f618572d3baf657b6319658 +%else %global ki_commit be1317644c68b4bfc4646024a6b221066e430031 +%endif %global ki_scommit %(c=%{ki_commit}; echo ${c:0:7}) Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif %if %{without gitcommit} -# Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch - -# ROCm patches -# Patches need to be refactored for ToT -# These are ROCm packages -# Patch101: 0001-cuda-hip-signatures.patch - # https://github.com/pytorch/pytorch/issues/150187 # The hack job # Patch11: 0001-python-torch-disable-ck.patch @@ -112,6 +109,7 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch %else +Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch %endif ExclusiveArch: x86_64 aarch64 @@ -190,6 +188,9 @@ BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros +%if %{with gitcommit} +BuildRequires: rocsolver-devel +%endif BuildRequires: rocthrust-devel BuildRequires: roctracer-devel @@ -356,9 +357,11 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPE sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt +%if %{without gitcommit} # https://github.com/pytorch/pytorch/issues/149803 # Tries to checkout nccl sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py +%endif # Disable the use of check_submodule's in the setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -432,6 +435,9 @@ mv googletest third_party # # Fake out pocketfft, and system header will be used mkdir third_party/pocketfft +%if %{with gitcommit} +cp /usr/include/pocketfft_hdronly.h third_party/pocketfft/ +%endif # # Use the system valgrind headers @@ -585,7 +591,12 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} +%if %{?fedora} <= 43 
+export PYTORCH_ROCM_ARCH="gfx1100;gfx1201" +%else export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} +%endif + %py3_build %else From 42c33b8dcd0de27e96cf0871c8087aa571d63f20 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 20 Jul 2025 12:44:41 -0700 Subject: [PATCH 17/38] Update the next gitcommit to v2.8.0-rc6 Remove old patches. Signed-off-by: Tom Rix --- 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch | 47 - ...-Changes-to-compile-with-3.13-126033.patch | 222 ---- ...ectorization-on-windows-submodule-sl.patch | 910 ----------------- ...finding-and-using-the-rocm_version.h.patch | 142 --- 0001-Optionally-use-hipblaslt.patch | 506 ---------- 0001-Patch-for-sleef-3.6.patch | 952 ------------------ 0001-Reenable-dim-for-python-3.12.patch | 115 --- 0001-Regenerate-flatbuffer-header.patch | 39 - 0001-Stub-in-kineto-ActivityType.patch | 73 -- 0001-can-not-use-with-c-files.patch | 25 - 0001-cuda-hip-signatures.patch | 42 - 0001-disable-use-of-aotriton.patch | 94 -- ...le-dynamo-on-3.12-enable-most-dynamo.patch | 226 ----- ...lude-fmt-ranges.h-for-using-fmt-join.patch | 54 - 0001-no-third_party-FXdiv.patch | 54 - 0001-no-third_party-fmt.patch | 65 -- 0001-no-third_party-foxi.patch | 36 - 0001-python-torch-disable-ck.patch | 112 --- 0001-reenable-foxi-linking.patch | 25 - 0001-silence-an-assert.patch | 25 - 0001-torch-paper-over-c-assert.patch | 88 -- 0001-use-any-hip.patch | 34 - ...1-Add-cmake-option-USE_SYSTEM_FBGEMM.patch | 47 - .../0001-Add-cmake-variable-USE_ROCM_CK.patch | 149 +++ next/0001-Optionally-use-hipblaslt.patch | 506 ---------- next/0001-disable-use-of-aotriton.patch | 94 -- python-torch.spec | 11 +- 27 files changed, 154 insertions(+), 4539 deletions(-) delete mode 100644 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch delete mode 100644 0001-Changes-to-compile-with-3.13-126033.patch delete mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch delete mode 100644 0001-Improve-finding-and-using-the-rocm_version.h.patch delete mode 100644 0001-Optionally-use-hipblaslt.patch delete mode 100644 0001-Patch-for-sleef-3.6.patch delete mode 100644 0001-Reenable-dim-for-python-3.12.patch delete mode 100644 0001-Regenerate-flatbuffer-header.patch delete mode 100644 0001-Stub-in-kineto-ActivityType.patch delete mode 100644 0001-can-not-use-with-c-files.patch delete mode 100644 0001-cuda-hip-signatures.patch delete mode 100644 0001-disable-use-of-aotriton.patch delete mode 100644 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch delete mode 100644 0001-include-fmt-ranges.h-for-using-fmt-join.patch delete mode 100644 0001-no-third_party-FXdiv.patch delete mode 100644 0001-no-third_party-fmt.patch delete mode 100644 0001-no-third_party-foxi.patch delete mode 100644 0001-python-torch-disable-ck.patch delete mode 100644 0001-reenable-foxi-linking.patch delete mode 100644 0001-silence-an-assert.patch delete mode 100644 0001-torch-paper-over-c-assert.patch delete mode 100644 0001-use-any-hip.patch delete mode 100644 next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch create mode 100644 next/0001-Add-cmake-variable-USE_ROCM_CK.patch delete mode 100644 next/0001-Optionally-use-hipblaslt.patch delete mode 100644 next/0001-disable-use-of-aotriton.patch diff --git a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch deleted file mode 100644 index 413c60d..0000000 --- a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 -From: Tom 
Rix -Date: Sat, 20 Jul 2024 05:37:15 -0600 -Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM - -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - cmake/Dependencies.cmake | 3 ++- - 2 files changed, 3 insertions(+), 1 deletion(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index c4cd4b2c2a98..2068f7c6c4f2 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF - "USE_CUDNN" OFF) - cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) - option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) -+option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) - option(USE_KINETO "Use Kineto profiling library" ON) - option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) - option(USE_FAKELOWP "Use FakeLowp operators" OFF) -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..192dac46f13b 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -706,6 +706,7 @@ endif() - - # ---[ FBGEMM - if(USE_FBGEMM) -+ if (NOT USE_SYSTEM_FBGEMM) - set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") - if(NOT DEFINED FBGEMM_SOURCE_DIR) - set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") -@@ -746,7 +747,7 @@ if(USE_FBGEMM) - target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) - endif() - endif() -- -+ endif() - if(USE_FBGEMM) - list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) - endif() --- -2.45.1 - diff --git a/0001-Changes-to-compile-with-3.13-126033.patch b/0001-Changes-to-compile-with-3.13-126033.patch deleted file mode 100644 index ddc0dcf..0000000 --- a/0001-Changes-to-compile-with-3.13-126033.patch +++ /dev/null @@ -1,222 +0,0 @@ -From 655a06444b261cb28e71a0973c0ab67aaa8261ab Mon Sep 17 00:00:00 2001 -From: albanD -Date: Tue, 14 May 2024 02:14:53 +0000 -Subject: [PATCH] Changes to compile with 3.13 (#126033) - -This is mainly: -- Fix refcount access macro -- Hide all the Dynamo code that needs update as usual -- Add _PyWeakref_ClearRef as an extern provided by CPython. Including the pycore header that defines it would require raw c include shenanigans that I don't think are worth it. -This allows to build both with regular and nogil version of cpython. Both - -Note that this requires the 3.13 branch at least past [d3094744d40de2deefbda9b1996d5029c9ebf0b0](https://github.com/python/cpython/commit/d3094744d40de2deefbda9b1996d5029c9ebf0b0) which we need for mimalloc include and weakref function being exposed. - -debug-only issues in pybind11 with PyMem_MALLOC vs PyObject_MALLOC being should be synced either by updating pybind or cpython. @colesbury I can send a PR to ifdef the proper use in pybind if you think that this is the best solution here? 
- -Pull Request resolved: https://github.com/pytorch/pytorch/pull/126033 -Approved by: https://github.com/colesbury ---- - torch/csrc/Storage.cpp | 2 +- - torch/csrc/autograd/python_variable.cpp | 2 +- - torch/csrc/dynamo/cpython_defs.c | 15 +++++- - torch/csrc/dynamo/cpython_defs.h | 2 + - torch/csrc/dynamo/eval_frame.c | 67 ++++++++++++++++++------- - torch/csrc/utils/python_compat.h | 4 ++ - 6 files changed, 70 insertions(+), 22 deletions(-) - -diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp -index 93dbc9c09bb2..b22bbac35981 100644 ---- a/torch/csrc/Storage.cpp -+++ b/torch/csrc/Storage.cpp -@@ -236,7 +236,7 @@ static void THPStorage_subclass_dealloc(PyObject* self) { - if (type->tp_del) { - PyObject_GC_Track(self); - type->tp_del(self); -- if (self->ob_refcnt > 0) { -+ if (Py_REFCNT(self) > 0) { - // Resurrected (see above comment about resurrection from `__del__`) - return; - } -diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp -index 9e85f0026b35..8fd1129da63c 100644 ---- a/torch/csrc/autograd/python_variable.cpp -+++ b/torch/csrc/autograd/python_variable.cpp -@@ -1910,7 +1910,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { - if (type->tp_del) { - PyObject_GC_Track(self); - type->tp_del(self); -- if (self->ob_refcnt > 0) { -+ if (Py_REFCNT(self) > 0) { - /* Resurrected */ - return; - } -diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c -index 4a1dba63009a..5e0945a052ae 100644 ---- a/torch/csrc/dynamo/cpython_defs.c -+++ b/torch/csrc/dynamo/cpython_defs.c -@@ -13,6 +13,17 @@ - } else { \ - } - -+#if IS_PYTHON_3_13_PLUS -+// Gave up after fixing a few of these -+// pycore_opcode.h is gone (new is pycore_opcode_metadata.h ?) -+// f_code is gone (new is f_executable?) 
-+ -+// Fake definitions for what we removed -+const uint8_t* THP_PyOpcode_Caches = NULL; -+const int THP_PyOpcode_Caches_size = 0; -+ -+#else -+ - // NOTE: all `assert`s below are converted to `CHECK`s - - #if IS_PYTHON_3_11_PLUS -@@ -29,8 +40,8 @@ - #define NEED_OPCODE_TABLES // To get _PyOpcode_Deopt - #include - #undef NEED_OPCODE_TABLES --#undef Py_BUILD_CORE - #include -+#undef Py_BUILD_CORE - - // As a simple way to reduce the impact of ABI changes on the CPython side, this check forces - // us to manually re-check that the function didn't change on the next major version -@@ -364,3 +375,5 @@ THP_PyFrame_Clear(_PyInterpreterFrame *frame) - } - - #endif -+ -+#endif // CPython 3.13 -\ No newline at end of file -diff --git a/torch/csrc/dynamo/cpython_defs.h b/torch/csrc/dynamo/cpython_defs.h -index a897c3e6c6e7..3b6c9667f8c9 100644 ---- a/torch/csrc/dynamo/cpython_defs.h -+++ b/torch/csrc/dynamo/cpython_defs.h -@@ -8,7 +8,9 @@ - - #if IS_PYTHON_3_11_PLUS - -+#define Py_BUILD_CORE - #include -+#undef Py_BUILD_CORE - - int THP_PyFrame_FastToLocalsWithError( - _PyInterpreterFrame* frame, -diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c -index c286e821f09d..e13cb5af2a0e 100644 ---- a/torch/csrc/dynamo/eval_frame.c -+++ b/torch/csrc/dynamo/eval_frame.c -@@ -8,6 +8,31 @@ - #include - #include - -+ -+ -+PyObject* guard_error_hook = NULL; -+const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; -+ -+static int active_dynamo_threads = 0; -+ -+static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; -+ -+inline static PyObject* eval_frame_callback_get(void) { -+ void* result = PyThread_tss_get(&eval_frame_callback_key); -+ if (unlikely(result == NULL)) { -+ return (PyObject*)Py_None; -+ } else { -+ return (PyObject*)result; -+ } -+} -+ -+inline static void eval_frame_callback_set(PyObject* obj) { -+ PyThread_tss_set(&eval_frame_callback_key, obj); -+} -+ -+// 3.13 Not supported at all. 
See cpython_defs.c for hints -+#if !(IS_PYTHON_3_13_PLUS) -+ - // Problem in CPython includes when mixing core and non-core build - // The fix was not backported to 3.12 so this is needed here - // https://github.com/python/cpython/issues/105268 -@@ -138,24 +163,6 @@ THP_PyFrame_FastToLocalsWithError(THP_EVAL_API_FRAME_OBJECT *frame, int *free_va - } - #endif - --PyObject* guard_error_hook = NULL; --const char* cache_lookup_profiler_str = "TorchDynamo Cache Lookup"; -- --static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT; -- --inline static PyObject* eval_frame_callback_get(void) { -- void* result = PyThread_tss_get(&eval_frame_callback_key); -- if (unlikely(result == NULL)) { -- return (PyObject*)Py_None; -- } else { -- return (PyObject*)result; -- } --} -- --inline static void eval_frame_callback_set(PyObject* obj) { -- PyThread_tss_set(&eval_frame_callback_key, obj); --} -- - static PyObject* _custom_eval_frame_shim( - PyThreadState* tstate, - THP_EVAL_API_FRAME_OBJECT* frame, -@@ -627,7 +634,29 @@ static PyObject* _custom_eval_frame( - } - } - --static int active_dynamo_threads = 0; -+#else // IS_PYTHON_3_13_PLUS -+ -+// Fake definitions for everything we removed -+ -+typedef struct THPPyInterpreterFrame { -+ PyObject_HEAD -+ _PyInterpreterFrame* frame; // Borrowed reference -+} THPPyInterpreterFrame; -+ -+inline static void enable_eval_frame_shim(PyThreadState* tstate) {} -+inline static void enable_eval_frame_default(PyThreadState* tstate) {} -+ -+static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; -+ -+static PyTypeObject THPPyInterpreterFrameType = { -+ PyVarObject_HEAD_INIT(NULL, 0) -+ .tp_name = "torch._C.dynamo.eval_frame._PyInterpreterFrame", -+ .tp_basicsize = sizeof(THPPyInterpreterFrame), -+ .tp_flags = Py_TPFLAGS_DEFAULT, -+ .tp_getset = THPPyInterpreterFrame_properties, -+}; -+ -+#endif // CPython 3.13 - - static PyObject* increment_working_threads(PyThreadState* tstate) { - active_dynamo_threads = active_dynamo_threads + 1; -diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h -index 73b991cf3fbf..b060db00db73 100644 ---- a/torch/csrc/utils/python_compat.h -+++ b/torch/csrc/utils/python_compat.h -@@ -11,6 +11,7 @@ extern "C" { - - #define IS_PYTHON_3_11_PLUS PY_VERSION_HEX >= 0x030B00C1 - #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 -+#define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 - - PYCAPI_COMPAT_STATIC_INLINE(int) - PyCode_GetNCellvars(PyCodeObject* code) { -@@ -32,6 +33,9 @@ PyCode_GetNFreevars(PyCodeObject* code) { - #endif - } - -+// Provided by CPython but getting the header for them is very hard -+extern void _PyWeakref_ClearRef(PyWeakReference* self); -+ - #ifdef __cplusplus - } - #endif --- -2.45.1 - diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch deleted file mode 100644 index 562f55b..0000000 --- a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch +++ /dev/null @@ -1,910 +0,0 @@ -From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 -From: Xu Han -Date: Sun, 31 Mar 2024 03:07:32 +0000 -Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] - (#118980) - -Enable VEC on Windows OS. -1. Fix some type defination gap between Windows and Linux. -2. Fix some operator not support on Windows, such as [], /. -3. Enable static sleef library build on Windows. -4. Disable unsupported function overloading on MSVC. -5. 
Upgrade submodule sleef lib, which fixed build issue on Windows. -6. Fixed bazel build issues. -7. Fix test app not link to sleef on Windows. - -Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: -```cmd -git submodule sync -git submodule update --init --recursive -``` - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 -Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet ---- - aten/src/ATen/CMakeLists.txt | 48 ++++++-------- - aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- - .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- - .../cpu/vec/vec256/vec256_complex_double.h | 7 +- - .../cpu/vec/vec256/vec256_complex_float.h | 7 +- - aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- - aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- - aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- - aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- - .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- - .../cpu/vec/vec512/vec512_complex_double.h | 7 +- - .../cpu/vec/vec512/vec512_complex_float.h | 7 +- - aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- - aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- - aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- - aten/src/ATen/cpu/vec/vec_base.h | 6 ++ - caffe2/CMakeLists.txt | 2 +- - third_party/sleef.BUILD | 3 +- - 18 files changed, 194 insertions(+), 93 deletions(-) - -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index bf425af5fa9..58d5828e8ca 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") - list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) - endif() - --if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -- # Preserve values for the main build -- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) -- set(__aten_sleef_build_tests ${BUILD_TESTS}) -- -- # Unset our restrictive C++ flags here and reset them later. -- # Remove this once we use proper target_compile_options. 
-- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -- set(CMAKE_CXX_FLAGS) -- -- # Bump up optimization level for sleef to -O1, since at -O0 the compiler -- # excessively spills intermediate vector registers to the stack -- # and makes things run impossibly slowly -- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -- else() -- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -+if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+ if(NOT MSVC) -+ # Bump up optimization level for sleef to -O1, since at -O0 the compiler -+ # excessively spills intermediate vector registers to the stack -+ # and makes things run impossibly slowly -+ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -+ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -+ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+ else() -+ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -+ endif() - endif() - - if(NOT USE_SYSTEM_SLEEF) -- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) -+ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -+ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -+ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -+ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -+ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." FORCE) - if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") - set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) -@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) - endif() - list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) - -- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) -- -- # Set these back. TODO: Use SLEEF_ to pass these instead -- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) -- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) -+ if(NOT MSVC) -+ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+ endif() - endif() - - if(USE_CUDA AND NOT USE_ROCM) -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h -index 800b027e469..c431fa3c605 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h -@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { - } - - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. 
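[Editor's sketch] The `gather`/`mask_gather` overloads that follow stay behind `#ifndef _MSC_VER`: MSVC trips over the templated overload resolution, not the intrinsic itself. For orientation, a reduced sketch of what the float specialization wraps — `gather_f32` is a hypothetical name, and the scale immediate (which must be 1, 2, 4, or 8) is fixed at 4 here, i.e. sizeof(float), since plain C has no template parameter:

```c
#include <immintrin.h>

/* Load eight floats from base[vindex[i]] for the eight int32 indices in
 * vindex; the scale immediate converts indices to byte offsets. */
static inline __m256 gather_f32(const float* base, __m256i vindex) {
  return _mm256_i32gather_ps(base, vindex, 4);
}
```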
- template - std::enable_if_t> - inline gather(const double* base_addr, const Vectorized& vindex) { -@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { - return _mm256_i32gather_ps(base_addr, vindex, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline mask_gather(const Vectorized& src, const double* base_addr, -@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, - const Vectorized& vindex, Vectorized& mask) { - return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - // Only works for inputs in the range: [-2^51, 2^51] -@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { - return flip8(v); - } - --#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // (defined(CPU_CAPABILITY_AVX2) - - }} // namepsace at::vec::CPU_CAPABILITY -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -index 3e26213d6d2..66557436c70 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -@@ -7,7 +7,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -18,7 +19,18 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+ -+#ifndef SLEEF_CONST -+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -+#define SLEEF_CONST const -+#else -+#define SLEEF_CONST -+#endif -+#define SLEEF_CONST_OLD SLEEF_CONST -+#else -+#define SLEEF_CONST_OLD -+#endif - - // bfloat16 conversion - static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { -@@ -265,7 +277,8 @@ public: - } - return b; - } -- Vectorized map(const __m256 (*const vop)(__m256)) const { -+ -+ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { - __m256 lo, hi; - cvt_to_fp32(values, lo, hi); - const auto o1 = vop(lo); -@@ -1026,7 +1039,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_VECTORIZED_INIT(Half, half); - --#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX2) - - #define CONVERT_NON_VECTORIZED_INIT(type, name) \ - inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_NON_VECTORIZED_INIT(Half, half); - --#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX2) - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - #define LOAD_FP32_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - auto values = _mm_loadu_si128(reinterpret_cast(data)); \ -@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec - LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); - LOAD_FP32_VECTORIZED_INIT(Half, fp16); - --#else // 
defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX2) - #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - __at_align__ float values[Vectorized::size()]; \ -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -index f93ea1e63c3..6c198fb37d3 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -@@ -8,7 +8,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized> { - private: -@@ -145,7 +146,7 @@ public: - auto abs = abs_(); - auto zero = _mm256_setzero_pd(); - auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm256_div_pd(values, abs); - return _mm256_blendv_pd(div, zero, mask); - } - __m256d real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -index 7c142c04b79..c72d4d49274 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized> { - private: -@@ -180,7 +181,7 @@ public: - auto abs = abs_(); - auto zero = _mm256_setzero_ps(); - auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm256_div_ps(values, abs); - return _mm256_blendv_ps(div, zero, mask); - } - __m256 real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -index bc82d07edd1..bed6da627af 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace at::vec { - inline namespace CPU_CAPABILITY { - - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized { - private: -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -index 886809a0b8a..0e3664cd37b 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -14,7 +15,7 @@ namespace at::vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if 
defined(CPU_CAPABILITY_AVX2) - - template <> class Vectorized { - private: -@@ -226,14 +227,14 @@ public: - static __m256 vec_factorial_5 = - _mm256_set1_ps(0.00828929059f); // 1/factorial(5) - static __m256 vec_exp_log2ef = -- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) -+ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) - static __m256 vec_half = _mm256_set1_ps(0.5f); - static __m256 vec_one = _mm256_set1_ps(1.f); - static __m256 vec_zero = _mm256_set1_ps(0.f); - static __m256 vec_two = _mm256_set1_ps(2.f); -- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) -- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50); -- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); -+ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) -+ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); -+ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); - static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); - static int n_mantissa_bits = 23; - -@@ -266,7 +267,7 @@ public: - auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); - auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); - vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; -+ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); - vec_two_pow_n = - _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); - -diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -index 4128841701a..85e099904cd 100644 ---- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -@@ -41,11 +41,17 @@ - namespace at::vec { - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX2) - -+#ifdef _MSC_VER -+__declspec(align(64)) struct Vectorizedqi { -+ protected: -+ __m256i vals; -+#else - struct Vectorizedqi { - protected: - __m256i vals __attribute__((aligned(64))); -+#endif - - public: - Vectorizedqi() {} -@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { - } - - template --inline void __attribute__((always_inline)) QuantizeAvx2( -+__FORCE_INLINE void QuantizeAvx2( - const float* src, - T* dst, - int len, -@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V - return a.maximum(b); - } - --#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -+#endif // if defined(CPU_CAPABILITY_AVX2) - }} // namespace at::vec::CPU_CAPABILITY -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h -index fe96d123e64..87f723d782c 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h -@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { - } - - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. 
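[Editor's sketch] Stepping back to the vec256_float.h hunk above: the `(__m256)` casts around `_mm256_set1_epi32` are a GNU extension that MSVC rejects, so they become the zero-cost bit-reinterpret intrinsics (`_mm256_castsi256_ps` here, `_mm512_castsi512_ps` in the vec512 hunks below). A reduced sketch — `ln2_vec` is a hypothetical name; 0x3f317218 is the IEEE-754 bit pattern of ln 2, exactly as in the hunk:

```c
#include <immintrin.h>

/* Reinterpret integer lanes as floats without moving data; compiles on
 * every toolchain, unlike the GNU-style (__m256) cast. */
static inline __m256 ln2_vec(void) {
  return _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218));
}
```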
- template - std::enable_if_t> - inline gather(const double* base_addr, const Vectorized& vindex) { -@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { - return _mm512_i32gather_ps(vindex, base_addr, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -+#ifndef _MSC_VER -+// MSVC is not working well on complex function overload. - template - std::enable_if_t> - inline mask_gather(const Vectorized& src, const double* base_addr, -@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, - auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); - return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); - } -- -+#endif - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - template<> -@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { - return flip8(v); - } - --#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX512) - - }}} -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -index f9fc92d52bf..eb3b6a72240 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -@@ -7,7 +7,8 @@ - #include - #include - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,18 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+ -+#ifndef SLEEF_CONST -+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -+#define SLEEF_CONST const -+#else -+#define SLEEF_CONST -+#endif -+#define SLEEF_CONST_OLD SLEEF_CONST -+#else -+#define SLEEF_CONST_OLD -+#endif - - // bfloat16 conversion - static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { -@@ -362,7 +374,8 @@ public: - } - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wignored-qualifiers" -- Vectorized map(const __m512 (*const vop)(__m512)) const { -+ -+ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { - __m512 lo, hi; - cvt_to_fp32(values, lo, hi); - const auto o1 = vop(lo); -@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_VECTORIZED_INIT(Half, half); - --#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#else //defined(CPU_CAPABILITY_AVX512) - - #define CONVERT_NON_VECTORIZED_INIT(type, name) \ - inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V - CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); - CONVERT_NON_VECTORIZED_INIT(Half, half); - --#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#endif // defined(CPU_CAPABILITY_AVX512) - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - #define LOAD_FP32_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ -@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec - LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); - LOAD_FP32_VECTORIZED_INIT(Half, fp16); - 
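[Editor's sketch] All of these bfloat16 helpers rest on one observation: a bfloat16 is the top 16 bits of a float32, so widening is a zero-extend followed by a 16-bit left shift. A reduced AVX-512 sketch of the idea — `bf16_to_f32` is a hypothetical name, not the `cvtbf16_fp32` from the hunks above:

```c
#include <immintrin.h>

/* Widen 16 bfloat16 values (packed in a __m256i) to 16 floats. */
static inline __m512 bf16_to_f32(__m256i bf16x16) {
  __m512i widened = _mm512_cvtepu16_epi32(bf16x16);   /* zero-extend lanes */
  return _mm512_castsi512_ps(_mm512_slli_epi32(widened, 16));
}
```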
--#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#else // defined(CPU_CAPABILITY_AVX512) - #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ - inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ - __at_align__ float values[Vectorized::size()]; \ -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -index 02aa3a87cc1..c35204f9da2 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized> { - private: -@@ -203,7 +204,7 @@ public: - auto abs = abs_(); - auto zero = _mm512_setzero_pd(); - auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm512_div_pd(values, abs); - return _mm512_mask_blend_pd(mask, div, zero); - } - __m512d real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -index a5d790c98b2..2801e484d94 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -@@ -7,7 +7,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -16,7 +17,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized> { - private: -@@ -708,7 +709,7 @@ public: - auto abs = abs_(); - auto zero = _mm512_setzero_ps(); - auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); -- auto div = values / abs; -+ auto div = _mm512_div_ps(values, abs); - return _mm512_mask_blend_ps(mask, div, zero); - } - __m512 real_() const { -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -index 27b2753c903..508ab257e60 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) -+#if (defined(CPU_CAPABILITY_AVX512)) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized { - private: -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -index ba5738687fd..a08df3c141a 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -@@ -6,7 +6,8 @@ - #include - #include - #include --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) -+#define SLEEF_STATIC_LIBS - #include - #endif - -@@ -15,7 +16,7 @@ namespace vec { - // See Note [CPU_CAPABILITY namespace] - 
inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - - template <> class Vectorized { - private: -@@ -246,14 +247,14 @@ public: - static __m512 vec_factorial_5 = - _mm512_set1_ps(0.00828929059f); // 1/factorial(5) - static __m512 vec_exp_log2ef = -- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) -+ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) - static __m512 vec_half = _mm512_set1_ps(0.5f); - static __m512 vec_one = _mm512_set1_ps(1.f); - static __m512 vec_zero = _mm512_set1_ps(0.f); - static __m512 vec_two = _mm512_set1_ps(2.f); -- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) -- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); -- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); -+ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) -+ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); -+ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); - static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); - static int n_mantissa_bits = 23; - -@@ -288,7 +289,7 @@ public: - auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); - auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); - vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; -+ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); - vec_two_pow_n = - _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); - -diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -index e0713d01312..a5671ed4a50 100644 ---- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -@@ -42,11 +42,17 @@ namespace at { - namespace vec { - inline namespace CPU_CAPABILITY { - --#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -+#if defined(CPU_CAPABILITY_AVX512) - -+#ifdef _MSC_VER -+__declspec(align(64)) struct Vectorizedqi { -+ protected: -+ __m512i vals; -+#else - struct Vectorizedqi { - protected: - __m512i vals __attribute__((aligned(64))); -+#endif - - public: - Vectorizedqi() {} -@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { - } - - template --inline void __attribute__((always_inline)) QuantizeAvx512( -+__FORCE_INLINE void QuantizeAvx512( - const float* src, - T* dst, - int len, -@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { - Vectorized scale, - Vectorized zero_point, - Vectorized scale_neg_zp_premul) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { - float_vec_return_type dequantize( - Vectorized scale, - 
Vectorized zero_point) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { - } - - int_vec_return_type widening_subtract(Vectorized b) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512i int32_val0 = cvtepi8_epi32(int_val0); - __m512i int32_val1 = cvtepi8_epi32(int_val1); - __m512i int32_val2 = cvtepi8_epi32(int_val2); - __m512i int32_val3 = cvtepi8_epi32(int_val3); - -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -+ #else - __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); - __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); - __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); - __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -+ #endif - - __m512i int32_b0 = cvtepi8_epi32(int_b0); - __m512i int32_b1 = cvtepi8_epi32(int_b1); -@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { - Vectorized scale, - Vectorized zero_point, - Vectorized scale_zp_premul) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -@@ -746,10 +787,17 @@ struct Vectorized : public Vectorizedqi { - float_vec_return_type dequantize( - Vectorized scale, - Vectorized zero_point) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = 
_mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); - __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { - } - - int_vec_return_type widening_subtract(Vectorized b) const { -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -+ #else - __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); - __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); - __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); - __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -+ #endif - - __m512i int32_val0 = cvtepu8_epi32(int_val0); - __m512i int32_val1 = cvtepu8_epi32(int_val1); - __m512i int32_val2 = cvtepu8_epi32(int_val2); - __m512i int32_val3 = cvtepu8_epi32(int_val3); - -+ #if defined(_MSC_VER) && !defined(__clang__) -+ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -+ #else - __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); - __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); - __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); - __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -+ #endif - - __m512i int32_b0 = cvtepu8_epi32(int_b0); - __m512i int32_b1 = cvtepu8_epi32(int_b1); -diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h -index adf81dd915c..20cb8ef6dbc 100644 ---- a/aten/src/ATen/cpu/vec/vec_base.h -+++ b/aten/src/ATen/cpu/vec/vec_base.h -@@ -36,6 +36,12 @@ - #include - #include - -+#if defined(__GNUC__) -+#define __FORCE_INLINE __attribute__((always_inline)) inline -+#elif defined(_MSC_VER) -+#define __FORCE_INLINE __forceinline -+#endif -+ - // These macros helped us unify vec_base.h - #ifdef CPU_CAPABILITY_AVX512 - #if defined(__GNUC__) -diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index a6b6f0f7d1d..15d37cf4861 100644 ---- a/caffe2/CMakeLists.txt -+++ b/caffe2/CMakeLists.txt -@@ -1787,7 +1787,7 @@ if(BUILD_TEST) - endif() - else() - add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") -- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) -+ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) - endif() - target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) - target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) -diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD -index 573f9c5b54a..f22a6e905e2 100644 ---- a/third_party/sleef.BUILD -+++ b/third_party/sleef.BUILD -@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ - SLEEF_PRIVATE_INCLUDES = [ - 
"-Iexternal/sleef/src/arch", - "-Iexternal/sleef/src/common", -+ "-Iexternal/sleef/src/libm", - ] - - SLEEF_PUBLIC_INCLUDES = [ -@@ -201,8 +202,6 @@ cc_library( - srcs = [ - "src/libm/rempitab.c", - "src/libm/sleefdp.c", -- "src/libm/sleefld.c", -- "src/libm/sleefqp.c", - "src/libm/sleefsp.c", - ], - hdrs = SLEEF_PUBLIC_HEADERS, --- -2.45.1 - diff --git a/0001-Improve-finding-and-using-the-rocm_version.h.patch b/0001-Improve-finding-and-using-the-rocm_version.h.patch deleted file mode 100644 index b8232c7..0000000 --- a/0001-Improve-finding-and-using-the-rocm_version.h.patch +++ /dev/null @@ -1,142 +0,0 @@ -From 201ac4618a1526e048a0d6c02d9bc4cf30bf0ee1 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Wed, 14 Aug 2024 17:18:38 -0700 -Subject: [PATCH] Improve finding and using the rocm_version.h - -On Fedora, the rocm_version.h's path is /usr/include/rocm_version.h -So we have this build error -pytorch/aten/src/ATen/hip/tunable/Tunable.cpp:40:10: fatal error: - rocm-core/rocm_version.h: No such file or directory - 40 | #include - | ^~~~~~~~~~~~~~~~~~~~~~~~~~ - -In other cases, depending on the rocm release either -/opt/rocm/include or /opt/rocm/include/rocm-core - -Convert the EXISTS() checks into a find_path. -Add a -I${ROCM_VERSION_DIR} to the compile options so it can be -found by Tunable.cpp - -Signed-off-by: Tom Rix ---- - aten/src/ATen/cuda/tunable/Tunable.cpp | 2 +- - cmake/Dependencies.cmake | 1 + - cmake/public/LoadHIP.cmake | 72 ++++++++++---------------- - 3 files changed, 30 insertions(+), 45 deletions(-) - -diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp -index 1b7c89875855..32c1d70f3152 100644 ---- a/aten/src/ATen/cuda/tunable/Tunable.cpp -+++ b/aten/src/ATen/cuda/tunable/Tunable.cpp -@@ -36,7 +36,7 @@ - - // for validators - #ifdef USE_ROCM --#include -+#include - #define ROCBLAS_BETA_FEATURES_API - #include - #include -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 7ef8eabb5162..61bc4d7a54b6 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1063,6 +1063,7 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - list(APPEND HIP_CXX_FLAGS -std=c++17) - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) -+ list(APPEND HIP_CXX_FLAGS -I${ROCM_VERSION_DIR}) - if(HIP_NEW_TYPE_ENUMS) - list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) - endif() -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1c0d3a203991..6a7e3bd163f5 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -42,55 +42,39 @@ find_package_and_print_version(HIP 1.0) - - if(HIP_FOUND) - set(PYTORCH_FOUND_HIP TRUE) -- set(FOUND_ROCM_VERSION_H FALSE) -- - set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}") -- set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") - - # Find ROCM version for checks - # ROCM 5.0 and later will have header api for version management -- if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h) -- set(FOUND_ROCM_VERSION_H TRUE) -- file(WRITE ${file} "" -- "#include \n" -- ) -- elseif(EXISTS ${ROCM_INCLUDE_DIRS}/rocm-core/rocm_version.h) -- set(FOUND_ROCM_VERSION_H TRUE) -- file(WRITE ${file} "" -- "#include \n" -- ) -- else() -- message("********************* rocm_version.h couldnt be found ******************\n") -- endif() -- -- if(FOUND_ROCM_VERSION_H) -- file(APPEND ${file} "" -- "#include \n" -- -- "#ifndef ROCM_VERSION_PATCH\n" -- "#define ROCM_VERSION_PATCH 0\n" -- "#endif\n" -- "#define STRINGIFYHELPER(x) #x\n" -- "#define 
STRINGIFY(x) STRINGIFYHELPER(x)\n" -- "int main() {\n" -- " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" -- " return 0;\n" -- "}\n" -- ) -- -- try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} -- CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}" -- RUN_OUTPUT_VARIABLE rocm_version_from_header -- COMPILE_OUTPUT_VARIABLE output_var -- ) -- # We expect the compile to be successful if the include directory exists. -- if(NOT compile_result) -- message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) -- endif() -- message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) -- set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) -- message("\n***** ROCm version from rocm_version.h ****\n") -+ find_path(ROCM_VERSION_DIR rocm_version.h HINTS ${ROCM_INCLUDE_DIRS} ${ROCM_INCLUDE_DIRS}/rocm-core) -+ set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") -+ file(WRITE ${file} "" -+ "#include \n" -+ "#include \n" -+ -+ "#ifndef ROCM_VERSION_PATCH\n" -+ "#define ROCM_VERSION_PATCH 0\n" -+ "#endif\n" -+ "#define STRINGIFYHELPER(x) #x\n" -+ "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" -+ "int main() {\n" -+ " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" -+ " return 0;\n" -+ "}\n" -+ ) -+ -+ try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} -+ CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_VERSION_DIR}" -+ RUN_OUTPUT_VARIABLE rocm_version_from_header -+ COMPILE_OUTPUT_VARIABLE output_var -+ ) -+ # We expect the compile to be successful if the include directory exists. -+ if(NOT compile_result) -+ message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) - endif() -+ message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) -+ set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) -+ message("\n***** ROCm version from rocm_version.h ****\n") - - string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+).*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) - --- -2.46.0 - diff --git a/0001-Optionally-use-hipblaslt.patch b/0001-Optionally-use-hipblaslt.patch deleted file mode 100644 index 1e5ca4b..0000000 --- a/0001-Optionally-use-hipblaslt.patch +++ /dev/null @@ -1,506 +0,0 @@ -From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 04:18:34 -0700 -Subject: [PATCH] Optionally use hipblaslt - ---- - aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ - aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ - aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- - aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- - aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- - cmake/Dependencies.cmake | 3 ++ - cmake/public/LoadHIP.cmake | 2 +- - 7 files changed, 82 insertions(+), 19 deletions(-) - -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index ce991a9bcad4..3f0d17b52778 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -14,7 +14,9 @@ - #include - - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - // until hipblas has an API to accept flags, we must use rocblas here - #include - #include -@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { - static size_t _parseChosenWorkspaceSize() { - const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); - #ifdef USE_ROCM -+#ifndef USE_HIPBLASLT -+ return 0; -+#endif - if (!val) { - // accept either env var 
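[Editor's sketch] The early `return 0` added above turns the workspace query into a no-op when ROCm is built without hipBLASLt; otherwise execution falls through to the dual environment-variable lookup that follows. Condensed, that lookup amounts to the sketch below — `lt_workspace_env` is a hypothetical name, not the PyTorch function:

```c
#include <stdlib.h>

/* Prefer the cuBLASLt spelling of the variable; accept the hipBLASLt
 * spelling as an alias. */
static const char* lt_workspace_env(void) {
  const char* val = getenv("CUBLASLT_WORKSPACE_SIZE");
  return val ? val : getenv("HIPBLASLT_WORKSPACE_SIZE");
}
```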
- val = getenv("HIPBLASLT_WORKSPACE_SIZE"); -@@ -235,6 +240,7 @@ namespace at::cuda::blas { - } while (0) - - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - namespace { - // Following the pattern of CuSparseDescriptor - // Defined here for now because this is the only place cublas_lt interface is -@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< - }; - } // namespace - -- - template - inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - cudaDataType_t abcType = CUDA_R_32F; -@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - " scaleType ", - scaleType); - } -- -+#endif - - template - inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { -@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); - } - } -@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); - } - } -@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } - } -@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { - } - } - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { - // forward to bgemm implementation but set strides and batches to 0 - bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); - } -+#endif - - template - inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { -@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); - } - } -@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); - } - } -@@ 
-1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } - } -@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { - } - } - -- -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - void gemm_and_bias( - bool transpose_mat1, -@@ -1410,7 +1435,7 @@ void scaled_gemm( - ScalarType result_dtype, - void* amax_ptr, - bool use_fast_accum) { --#if CUDA_VERSION >= 11080 || defined(USE_ROCM) -+#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - const auto computeType = CUBLAS_COMPUTE_32F; - const auto scaleType = CUDA_R_32F; - const int8_t fastAccuMode = use_fast_accum ? 1 : 0; -@@ -1681,6 +1706,7 @@ void int8_gemm( - " scaleType ", - scaleType); - } -+#endif - - template <> - void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { -diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h -index f2b657ced51b..f0ee613c4208 100644 ---- a/aten/src/ATen/cuda/CUDAContextLight.h -+++ b/aten/src/ATen/cuda/CUDAContextLight.h -@@ -9,7 +9,9 @@ - - // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also - // added bf16 support -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - #include -+#endif - - #ifdef CUDART_VERSION - #include -@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); - /* Handles */ - TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); - TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); -+#endif - - TORCH_CUDA_CPP_API void clearCublasWorkspaces(); - -diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index 8eac525b3695..abfdf7a23847 100644 ---- a/aten/src/ATen/cuda/CublasHandlePool.cpp -+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp -@@ -29,7 +29,7 @@ namespace at::cuda { - - namespace { - --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - void createCublasLtHandle(cublasLtHandle_t *handle) { - TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); - } -@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { - return handle; - } - --cublasLtHandle_t getCurrentCUDABlasLtHandle() { - #ifdef USE_ROCM -+#if defined(USE_HIPBLASLT) -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - c10::DeviceIndex device = 0; - AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); - -@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { - - auto handle = myPoolWindow->reserve(device); - return handle; -+} -+#endif - #else -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - return reinterpret_cast(getCurrentCUDABlasHandle()); --#endif - } -+#endif - - } // namespace at::cuda -diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h -index 53e6154120c9..fa1d664696db 100644 ---- a/aten/src/ATen/cuda/tunable/TunableGemm.h -+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h -@@ -11,7 +11,9 @@ - - #include - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - #include - 
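[Editor's sketch] Throughout this patch the hipBLASLt-only paths are bracketed by one condition, spelled in the hunks as `!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))` — logically equivalent to `!defined(USE_ROCM) || defined(USE_HIPBLASLT)`, i.e. "CUDA, or ROCm with hipBLASLt found". Reduced to a skeleton with a hypothetical helper:

```c
/* Resolves to 1 on CUDA builds and on ROCm builds where CMake found
 * hipblaslt (which appends -DUSE_HIPBLASLT to the HIP C++ flags in the
 * Dependencies.cmake hunk below); 0 forces the rocBLAS-only fallback. */
static int lt_backend_available(void) {
#if !defined(USE_ROCM) || defined(USE_HIPBLASLT)
  return 1;
#else
  return 0;
#endif
}
```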
#endif - #include -@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class DefaultScaledGemmOp : public Callable> { - public: -@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { - return OK; - } - }; -+#endif - - template - inline bool IsZero(T v) { -@@ -191,6 +195,7 @@ static void AddRocblasValidator() { - } - } - -+#ifdef USE_HIPBLASLT - static void AddHipblasltValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - if (validators.find("HIPBLASLT_VERSION") == validators.end()) { -@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { - [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); - } - } -+#endif - - static void AddRocmValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); -@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class ScaledGemmTunableOp : public TunableOp, StreamTimer> { - public: -@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - - #if defined(USE_ROCM) -+#ifdef USE_HIPBLASLT - for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { - this->RegisterOp(std::move(name), std::move(op)); - } - AddHipblasltValidator(); -+#endif - AddRocmValidator(); - #endif - } -@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); - } - }; -+#endif - - #undef XSTRINGIFY - #undef STRINGIFY -diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 84c59a4fd0d7..56ad5de3bf2d 100644 ---- a/aten/src/ATen/native/cuda/Blas.cpp -+++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa - } - - static bool getDisableAddmmCudaLt() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); - #ifdef USE_ROCM - // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) -@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { - } - return false; - #endif -+#else -+ return true; -+#endif - } - - #ifdef USE_ROCM - static bool 
isSupportedHipLtROCmArch(int index) { -+#ifdef USE_HIPBLASLT - hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); - std::string device_arch = prop->gcnArchName; - static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { - } - } - TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); -+#endif - return false; - } - #endif -@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - at::ScalarType scalar_type = self.scalar_type(); - c10::MaybeOwned self_; - if (&result != &self) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) - // Strangely, if mat2 has only 1 row or column, we get - // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. -@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - scalar_type != at::ScalarType::BFloat16)); - #endif - } -+#endif - #endif - if (!useLtInterface) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - } - self__sizes = self_->sizes(); - } else { --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - useLtInterface = !disable_addmm_cuda_lt && - result.dim() == 2 && result.is_contiguous() && - isSupportedHipLtROCmArch(self.device().index()) && -@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); - - if (useLtInterface) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if defined(USE_ROCM) - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, -@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - activation_epilogue - ); - }); -+#endif - #endif - } else - { -@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { - } - - static bool _scaled_mm_allowed_device() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - auto dprops = at::cuda::getCurrentDeviceProperties(); - #ifdef USE_ROCM - std::string device_arch = dprops->gcnArchName; -@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { - #else - return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); - #endif -+#else -+ return false; -+#endif - } - - // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax -@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - // Check sizes - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); - TORCH_CHECK( -@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 - // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately - amax = at::max(at::abs(out.to(kFloat))); -+#endif - #endif - - return {out, amax}; -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..8d05e834bbc5 100644 ---- 
a/cmake/Dependencies.cmake
-+++ b/cmake/Dependencies.cmake
-@@ -1052,6 +1052,9 @@ if(USE_ROCM)
- list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
- list(APPEND HIP_CXX_FLAGS -std=c++17)
- list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2)
-+ if(hipblaslt_FOUND)
-+ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT)
-+ endif()
- if(HIP_NEW_TYPE_ENUMS)
- list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS)
- endif()
-diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
-index fa39156031ff..df4836847fdf 100644
----- a/cmake/public/LoadHIP.cmake
-+++ b/cmake/public/LoadHIP.cmake
-@@ -155,7 +155,7 @@ if(HIP_FOUND)
- find_package_and_print_version(hiprand REQUIRED)
- find_package_and_print_version(rocblas REQUIRED)
- find_package_and_print_version(hipblas REQUIRED)
-- find_package_and_print_version(hipblaslt REQUIRED)
-+ find_package_and_print_version(hipblaslt)
- find_package_and_print_version(miopen REQUIRED)
- find_package_and_print_version(hipfft REQUIRED)
- find_package_and_print_version(hipsparse REQUIRED)
---
-2.45.2
-
diff --git a/0001-Patch-for-sleef-3.6.patch b/0001-Patch-for-sleef-3.6.patch
deleted file mode 100644
index 13aa208..0000000
--- a/0001-Patch-for-sleef-3.6.patch
+++ /dev/null
@@ -1,952 +0,0 @@
-From 273f23698c887b52e66c2abec8101b7398f0f9c4 Mon Sep 17 00:00:00 2001
-From: "Benjamin A. Beasley"
-Date: Wed, 5 Jun 2024 11:06:02 -0400
-Subject: [PATCH] Patch for sleef 3.6
-
----
- ...ectorization-on-windows-submodule-sl.patch | 910 ++++++++++++++++++
- python-torch.spec | 11 +
- 2 files changed, 921 insertions(+)
- create mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
-
-diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
-new file mode 100644
-index 000000000000..562f55b742c2
---- /dev/null
-+++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
-@@ -0,0 +1,910 @@
-+From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001
-+From: Xu Han
-+Date: Sun, 31 Mar 2024 03:07:32 +0000
-+Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef]
-+ (#118980)
-+
-+Enable VEC on Windows OS.
-+1. Fix some type definition gaps between Windows and Linux.
-+2. Fix some operators not supported on Windows, such as [] and /.
-+3. Enable static sleef library build on Windows.
-+4. Disable unsupported function overloading on MSVC.
-+5. Upgrade the sleef submodule, which fixes the build issue on Windows.
-+6. Fixed bazel build issues.
-+7. Fix test apps not linking to sleef on Windows.
-+
-+Note: If the rebuild fails after pulling this PR, please sync the `sleef` submodule by running:
-+```cmd
-+git submodule sync
-+git submodule update --init --recursive
-+```
-+
-+Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980
-+Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet
-+---
-+ aten/src/ATen/CMakeLists.txt | 48 ++++++--------
-+ aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++--
-+ .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++--
-+ .../cpu/vec/vec256/vec256_complex_double.h | 7 +-
-+ .../cpu/vec/vec256/vec256_complex_float.h | 7 +-
-+ aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +-
-+ aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++--
-+ aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++-
-+ aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++--
-+ .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++--
-+ .../cpu/vec/vec512/vec512_complex_double.h | 7 +-
-+ .../cpu/vec/vec512/vec512_complex_float.h | 7 +-
-+ aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +-
-+ aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++--
-+ aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++-
-+ aten/src/ATen/cpu/vec/vec_base.h | 6 ++
-+ caffe2/CMakeLists.txt | 2 +-
-+ third_party/sleef.BUILD | 3 +-
-+ 18 files changed, 194 insertions(+), 93 deletions(-)
-+
-+diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
-+index bf425af5fa9..58d5828e8ca 100644
-+--- a/aten/src/ATen/CMakeLists.txt
-++++ b/aten/src/ATen/CMakeLists.txt
-+@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$")
-+ list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo)
-+ endif()
-+
-+-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
-+- # Preserve values for the main build
-+- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS})
-+- set(__aten_sleef_build_tests ${BUILD_TESTS})
-+-
-+- # Unset our restrictive C++ flags here and reset them later.
-+- # Remove this once we use proper target_compile_options.
-+- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -+- set(CMAKE_CXX_FLAGS) -+- -+- # Bump up optimization level for sleef to -O1, since at -O0 the compiler -+- # excessively spills intermediate vector registers to the stack -+- # and makes things run impossibly slowly -+- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -+- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -+- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+- else() -+- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -++ if(NOT MSVC) -++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler -++ # excessively spills intermediate vector registers to the stack -++ # and makes things run impossibly slowly -++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) -++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") -++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -++ else() -++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") -++ endif() -+ endif() -+ -+ if(NOT USE_SYSTEM_SLEEF) -+- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -+- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -+- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -+- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -+- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) -++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) -++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) -++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) -++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) -++ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." FORCE) -+ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") -+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") -+ set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) -+@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) -+ endif() -+ list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) -+ -+- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -+- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) -+- -+- # Set these back. TODO: Use SLEEF_ to pass these instead -+- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) -+- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) -++ if(NOT MSVC) -++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) -++ endif() -+ endif() -+ -+ if(USE_CUDA AND NOT USE_ROCM) -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h -+index 800b027e469..c431fa3c605 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h -+@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { -+ } -+ -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) -+ } -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. 
-+ template -+ std::enable_if_t> -+ inline gather(const double* base_addr, const Vectorized& vindex) { -+@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { -+ return _mm256_i32gather_ps(base_addr, vindex, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline mask_gather(const Vectorized& src, const double* base_addr, -+@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, -+ const Vectorized& vindex, Vectorized& mask) { -+ return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+ // Only works for inputs in the range: [-2^51, 2^51] -+@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { -+ return flip8(v); -+ } -+ -+-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // (defined(CPU_CAPABILITY_AVX2) -+ -+ }} // namepsace at::vec::CPU_CAPABILITY -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+index 3e26213d6d2..66557436c70 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -18,7 +19,18 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++ -++#ifndef SLEEF_CONST -++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -++#define SLEEF_CONST const -++#else -++#define SLEEF_CONST -++#endif -++#define SLEEF_CONST_OLD SLEEF_CONST -++#else -++#define SLEEF_CONST_OLD -++#endif -+ -+ // bfloat16 conversion -+ static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { -+@@ -265,7 +277,8 @@ public: -+ } -+ return b; -+ } -+- Vectorized map(const __m256 (*const vop)(__m256)) const { -++ -++ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { -+ __m256 lo, hi; -+ cvt_to_fp32(values, lo, hi); -+ const auto o1 = vop(lo); -+@@ -1026,7 +1039,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_VECTORIZED_INIT(Half, half); -+ -+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX2) -+ -+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ -+ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -+@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_NON_VECTORIZED_INIT(Half, half); -+ -+-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX2) -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ auto values = _mm_loadu_si128(reinterpret_cast(data)); \ -+@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec -+ 
LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); -+ LOAD_FP32_VECTORIZED_INIT(Half, fp16); -+ -+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX2) -+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ __at_align__ float values[Vectorized::size()]; \ -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+index f93ea1e63c3..6c198fb37d3 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h -+@@ -8,7 +8,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized> { -+ private: -+@@ -145,7 +146,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm256_setzero_pd(); -+ auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm256_div_pd(values, abs); -+ return _mm256_blendv_pd(div, zero, mask); -+ } -+ __m256d real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+index 7c142c04b79..c72d4d49274 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized> { -+ private: -+@@ -180,7 +181,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm256_setzero_ps(); -+ auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm256_div_ps(values, abs); -+ return _mm256_blendv_ps(div, zero, mask); -+ } -+ __m256 real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+index bc82d07edd1..bed6da627af 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace at::vec { -+ inline namespace CPU_CAPABILITY { -+ -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized { -+ private: -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+index 886809a0b8a..0e3664cd37b 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -++#define SLEEF_STATIC_LIBS 
-+ #include -+ #endif -+ -+@@ -14,7 +15,7 @@ namespace at::vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -+ template <> class Vectorized { -+ private: -+@@ -226,14 +227,14 @@ public: -+ static __m256 vec_factorial_5 = -+ _mm256_set1_ps(0.00828929059f); // 1/factorial(5) -+ static __m256 vec_exp_log2ef = -+- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) -++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) -+ static __m256 vec_half = _mm256_set1_ps(0.5f); -+ static __m256 vec_one = _mm256_set1_ps(1.f); -+ static __m256 vec_zero = _mm256_set1_ps(0.f); -+ static __m256 vec_two = _mm256_set1_ps(2.f); -+- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) -+- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50); -+- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); -++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) -++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); -++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); -+ static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); -+ static int n_mantissa_bits = 23; -+ -+@@ -266,7 +267,7 @@ public: -+ auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); -+ auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); -+ vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -+- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; -++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); -+ vec_two_pow_n = -+ _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); -+ -+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+index 4128841701a..85e099904cd 100644 -+--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h -+@@ -41,11 +41,17 @@ -+ namespace at::vec { -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX2) -+ -++#ifdef _MSC_VER -++__declspec(align(64)) struct Vectorizedqi { -++ protected: -++ __m256i vals; -++#else -+ struct Vectorizedqi { -+ protected: -+ __m256i vals __attribute__((aligned(64))); -++#endif -+ -+ public: -+ Vectorizedqi() {} -+@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { -+ } -+ -+ template -+-inline void __attribute__((always_inline)) QuantizeAvx2( -++__FORCE_INLINE void QuantizeAvx2( -+ const float* src, -+ T* dst, -+ int len, -+@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V -+ return a.maximum(b); -+ } -+ -+-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -++#endif // if defined(CPU_CAPABILITY_AVX2) -+ }} // namespace at::vec::CPU_CAPABILITY -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h -+index fe96d123e64..87f723d782c 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h -+@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { -+ } -+ -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) -+ } -+ -+ 
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline gather(const double* base_addr, const Vectorized& vindex) { -+@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { -+ return _mm512_i32gather_ps(vindex, base_addr, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+- -++#ifndef _MSC_VER -++// MSVC is not working well on complex function overload. -+ template -+ std::enable_if_t> -+ inline mask_gather(const Vectorized& src, const double* base_addr, -+@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, -+ auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); -+ return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); -+ } -+- -++#endif -+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -+ -+ template<> -+@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { -+ return flip8(v); -+ } -+ -+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX512) -+ -+ }}} -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+index f9fc92d52bf..eb3b6a72240 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,18 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++ -++#ifndef SLEEF_CONST -++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) -++#define SLEEF_CONST const -++#else -++#define SLEEF_CONST -++#endif -++#define SLEEF_CONST_OLD SLEEF_CONST -++#else -++#define SLEEF_CONST_OLD -++#endif -+ -+ // bfloat16 conversion -+ static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { -+@@ -362,7 +374,8 @@ public: -+ } -+ #pragma clang diagnostic push -+ #pragma clang diagnostic ignored "-Wignored-qualifiers" -+- Vectorized map(const __m512 (*const vop)(__m512)) const { -++ -++ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { -+ __m512 lo, hi; -+ cvt_to_fp32(values, lo, hi); -+ const auto o1 = vop(lo); -+@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_VECTORIZED_INIT(Half, half); -+ -+-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#else //defined(CPU_CAPABILITY_AVX512) -+ -+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ -+ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ -+@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V -+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); -+ CONVERT_NON_VECTORIZED_INIT(Half, half); -+ -+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#endif // defined(CPU_CAPABILITY_AVX512) -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, 
Vectorized& out) { \ -+ auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ -+@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec -+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); -+ LOAD_FP32_VECTORIZED_INIT(Half, fp16); -+ -+-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#else // defined(CPU_CAPABILITY_AVX512) -+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ -+ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ -+ __at_align__ float values[Vectorized::size()]; \ -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+index 02aa3a87cc1..c35204f9da2 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized> { -+ private: -+@@ -203,7 +204,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm512_setzero_pd(); -+ auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm512_div_pd(values, abs); -+ return _mm512_mask_blend_pd(mask, div, zero); -+ } -+ __m512d real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+index a5d790c98b2..2801e484d94 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h -+@@ -7,7 +7,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -16,7 +17,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized> { -+ private: -+@@ -708,7 +709,7 @@ public: -+ auto abs = abs_(); -+ auto zero = _mm512_setzero_ps(); -+ auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); -+- auto div = values / abs; -++ auto div = _mm512_div_ps(values, abs); -+ return _mm512_mask_blend_ps(mask, div, zero); -+ } -+ __m512 real_() const { -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+index 27b2753c903..508ab257e60 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) -++#if (defined(CPU_CAPABILITY_AVX512)) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized { -+ private: -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+index ba5738687fd..a08df3c141a 100644 -+--- 
a/aten/src/ATen/cpu/vec/vec512/vec512_float.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h -+@@ -6,7 +6,8 @@ -+ #include -+ #include -+ #include -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -++#define SLEEF_STATIC_LIBS -+ #include -+ #endif -+ -+@@ -15,7 +16,7 @@ namespace vec { -+ // See Note [CPU_CAPABILITY namespace] -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -+ template <> class Vectorized { -+ private: -+@@ -246,14 +247,14 @@ public: -+ static __m512 vec_factorial_5 = -+ _mm512_set1_ps(0.00828929059f); // 1/factorial(5) -+ static __m512 vec_exp_log2ef = -+- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) -++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) -+ static __m512 vec_half = _mm512_set1_ps(0.5f); -+ static __m512 vec_one = _mm512_set1_ps(1.f); -+ static __m512 vec_zero = _mm512_set1_ps(0.f); -+ static __m512 vec_two = _mm512_set1_ps(2.f); -+- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) -+- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); -+- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); -++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) -++ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); -++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); -+ static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); -+ static int n_mantissa_bits = 23; -+ -+@@ -288,7 +289,7 @@ public: -+ auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); -+ auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); -+ vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); -+- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; -++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); -+ vec_two_pow_n = -+ _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); -+ -+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+index e0713d01312..a5671ed4a50 100644 -+--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h -+@@ -42,11 +42,17 @@ namespace at { -+ namespace vec { -+ inline namespace CPU_CAPABILITY { -+ -+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) -++#if defined(CPU_CAPABILITY_AVX512) -+ -++#ifdef _MSC_VER -++__declspec(align(64)) struct Vectorizedqi { -++ protected: -++ __m512i vals; -++#else -+ struct Vectorizedqi { -+ protected: -+ __m512i vals __attribute__((aligned(64))); -++#endif -+ -+ public: -+ Vectorizedqi() {} -+@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { -+ } -+ -+ template -+-inline void __attribute__((always_inline)) QuantizeAvx512( -++__FORCE_INLINE void QuantizeAvx512( -+ const float* src, -+ T* dst, -+ int len, -+@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { -+ Vectorized scale, -+ Vectorized zero_point, -+ Vectorized scale_neg_zp_premul) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = 
_mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -+@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { -+ float_vec_return_type dequantize( -+ Vectorized scale, -+ Vectorized zero_point) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); -+@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { -+ } -+ -+ int_vec_return_type widening_subtract(Vectorized b) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512i int32_val0 = cvtepi8_epi32(int_val0); -+ __m512i int32_val1 = cvtepi8_epi32(int_val1); -+ __m512i int32_val2 = cvtepi8_epi32(int_val2); -+ __m512i int32_val3 = cvtepi8_epi32(int_val3); -+ -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -++ #else -+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -++ #endif -+ -+ __m512i int32_b0 = cvtepi8_epi32(int_b0); -+ __m512i int32_b1 = cvtepi8_epi32(int_b1); -+@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { -+ Vectorized scale, -+ Vectorized zero_point, -+ Vectorized scale_zp_premul) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 
= _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -+@@ -746,10 +787,17 @@ struct Vectorized : public Vectorizedqi { -+ float_vec_return_type dequantize( -+ Vectorized scale, -+ Vectorized zero_point) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); -+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); -+@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { -+ } -+ -+ int_vec_return_type widening_subtract(Vectorized b) const { -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); -++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); -++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); -++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); -++ #else -+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); -+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); -+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); -+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); -++ #endif -+ -+ __m512i int32_val0 = cvtepu8_epi32(int_val0); -+ __m512i int32_val1 = cvtepu8_epi32(int_val1); -+ __m512i int32_val2 = cvtepu8_epi32(int_val2); -+ __m512i int32_val3 = cvtepu8_epi32(int_val3); -+ -++ #if defined(_MSC_VER) && !defined(__clang__) -++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); -++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); -++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); -++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); -++ #else -+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); -+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); -+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); -+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); -++ #endif -+ -+ __m512i int32_b0 = cvtepu8_epi32(int_b0); -+ __m512i int32_b1 = cvtepu8_epi32(int_b1); -+diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h -+index adf81dd915c..20cb8ef6dbc 100644 -+--- a/aten/src/ATen/cpu/vec/vec_base.h -++++ b/aten/src/ATen/cpu/vec/vec_base.h -+@@ -36,6 +36,12 @@ -+ #include -+ #include -+ -++#if defined(__GNUC__) -++#define __FORCE_INLINE __attribute__((always_inline)) inline -++#elif defined(_MSC_VER) -++#define __FORCE_INLINE __forceinline -++#endif -++ -+ // These macros helped us unify vec_base.h -+ #ifdef CPU_CAPABILITY_AVX512 -+ #if defined(__GNUC__) -+diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -+index a6b6f0f7d1d..15d37cf4861 100644 -+--- a/caffe2/CMakeLists.txt -++++ b/caffe2/CMakeLists.txt -+@@ -1787,7 +1787,7 @@ if(BUILD_TEST) -+ endif() -+ else() -+ 
add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")
-+- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main)
-++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main)
-+ endif()
-+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $)
-+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $)
-+diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD
-+index 573f9c5b54a..f22a6e905e2 100644
-+--- a/third_party/sleef.BUILD
-++++ b/third_party/sleef.BUILD
-+@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [
-+ SLEEF_PRIVATE_INCLUDES = [
-+ "-Iexternal/sleef/src/arch",
-+ "-Iexternal/sleef/src/common",
-++ "-Iexternal/sleef/src/libm",
-+ ]
-+
-+ SLEEF_PUBLIC_INCLUDES = [
-+@@ -201,8 +202,6 @@ cc_library(
-+ srcs = [
-+ "src/libm/rempitab.c",
-+ "src/libm/sleefdp.c",
-+- "src/libm/sleefld.c",
-+- "src/libm/sleefqp.c",
-+ "src/libm/sleefsp.c",
-+ ],
-+ hdrs = SLEEF_PUBLIC_HEADERS,
-+--
-+2.45.1
-+
-diff --git a/python-torch.spec b/python-torch.spec
-index d50687a5174a..63600c2e8c39 100644
---- a/python-torch.spec
-+++ b/python-torch.spec
-@@ -176,6 +176,17 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch
- Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch
- %endif
-
-+# Enable x86 CPU vectorization on windows [submodule sleef] (#118980)
-+# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370
-+#
-+# Despite the title, this patch fixes compatibility with sleef 3.6 by including
-+# a backwards-compatible version of the fix from
-+# https://github.com/pytorch/pytorch/pull/122723.
-+# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef
-+# git submodule (because the release archive contains an actual sleef source
-+# tree instead, so this would not apply.)
-+Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
-+
- %if %{with rocm}
- # ROCm patches
- # https://github.com/pytorch/pytorch/pull/120551
---
-2.45.1
-
diff --git a/0001-Reenable-dim-for-python-3.12.patch b/0001-Reenable-dim-for-python-3.12.patch
deleted file mode 100644
index 138b5d4..0000000
--- a/0001-Reenable-dim-for-python-3.12.patch
+++ /dev/null
@@ -1,115 +0,0 @@
-From ee3fb343a376cdba6f4ce188cac90023f13e2aea Mon Sep 17 00:00:00 2001
-From: Tom Rix
-Date: Thu, 4 Apr 2024 14:21:38 -0600
-Subject: [PATCH] Reenable dim for python 3.12
-
-In 3.12:
-
-_PyArg_Parser added an element to the start of the structure,
-so existing positional initialization is now off by one. Switch to
-designated element initialization.
-
-_Py_CODEUNIT changed from an int to a union, but relevant_op
-is passed an int for the return of decoder.opcode, so the parameter
-type is wrong; switch it to int.
-
-The opcode PRECALL was removed, so its handling is reduced to 3.11.
-
-Signed-off-by: Tom Rix
----
- functorch/csrc/dim/dim.cpp | 24 +++++-------------------
- functorch/csrc/dim/minpybind.h | 4 ++--
- 2 files changed, 7 insertions(+), 21 deletions(-)
-
-diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp
-index 4cc027504c77..e48b0d58081f 100644
----- a/functorch/csrc/dim/dim.cpp
-+++ b/functorch/csrc/dim/dim.cpp
-@@ -6,20 +6,6 @@
-
- #include
-
---
---// Many APIs have changed/don't exist anymore
---#if IS_PYTHON_3_12_PLUS
---
---#include "dim.h"
---
---// Re-enable this some day
---PyObject* Dim_init() {
--- PyErr_SetString(PyExc_RuntimeError, "First class dim doesn't work with python 3.12");
--- return nullptr;
---}
---
---#else
---
- #include "minpybind.h"
- #include
- #include
-@@ -441,7 +427,7 @@ static PyObject* DimList_bind(DimList *self,
- PY_BEGIN
- mpy::handle sizes;
- static const char * const _keywords[] = {"sizes", nullptr};
-- static _PyArg_Parser parser = {"O", _keywords, 0};
-+ static _PyArg_Parser parser = { .format = "O", .keywords = _keywords};
- if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, &sizes)) {
- return nullptr;
- }
-@@ -465,7 +451,7 @@ static PyObject* DimList_bind_len(DimList *self,
- PY_BEGIN
- int size;
- static const char * const _keywords[] = {"N", nullptr};
-- static _PyArg_Parser parser = {"i", _keywords, 0};
-+ static _PyArg_Parser parser = { .format = "i", .keywords = _keywords};
- if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, &size)) {
- return nullptr;
- }
-@@ -1468,7 +1454,7 @@ PyTypeObject Tensor::Type = {
-
- // dim() --------------------
-
---static bool relevant_op(_Py_CODEUNIT c) {
-+static bool relevant_op(int c) {
- switch(c) {
- case STORE_NAME:
- case STORE_GLOBAL:
-@@ -1587,7 +1573,7 @@ static PyObject* _dims(PyObject *self,
- auto c = mpy::obj::steal(PyFrame_GetCode(f.ptr()));
- auto lasti = PyFrame_GetLasti(f.ptr());
- auto decoder = PyInstDecoder(c.ptr(), lasti);
-- #if IS_PYTHON_3_11_PLUS
-+ #if IS_PYTHON_3_11
- // When py3.11 adapts bytecode lasti points to the precall
- // rather than the call instruction after it
- if (decoder.opcode() == PRECALL) {
-@@ -3268,4 +3254,4 @@ PyObject* Dim_init() {
- }
- }
-
---#endif
-+
-diff --git a/functorch/csrc/dim/minpybind.h b/functorch/csrc/dim/minpybind.h
-index de82b5af95a4..d76d4828bf80 100644
----- a/functorch/csrc/dim/minpybind.h
-+++ b/functorch/csrc/dim/minpybind.h
-@@ -621,7 +621,7 @@ struct vector_args {
- PyObject *dummy = NULL;
- _PyArg_ParseStackAndKeywords((PyObject*const*)args, nargs, kwnames.ptr(), _parser, &dummy, &dummy, &dummy, &dummy, &dummy);
- #else
-- _PyArg_Parser* _parser = new _PyArg_Parser{NULL, &names_buf[0], fname_cstr, 0};
-+ _PyArg_Parser* _parser = new _PyArg_Parser{ .keywords = &names_buf[0], .fname = fname_cstr};
- std::unique_ptr buf(new PyObject*[names.size()]);
- _PyArg_UnpackKeywords((PyObject*const*)args, nargs, NULL, kwnames.ptr(), _parser, required, (Py_ssize_t)values.size() - kwonly, 0, &buf[0]);
- #endif
-@@ -706,7 +706,7 @@ inline object handle::call_vector(vector_args args) {
- #define MPY_PARSE_ARGS_KWNAMES(fmt, FORALL_ARGS) \
- static const char * const kwlist[] = { FORALL_ARGS(MPY_ARGS_NAME) nullptr}; \
- FORALL_ARGS(MPY_ARGS_DECLARE) \
-- static _PyArg_Parser parser = {fmt, kwlist, 0}; \
-+ static _PyArg_Parser parser = { .format = fmt, .keywords = kwlist}; \
- if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &parser, FORALL_ARGS(MPY_ARGS_POINTER) nullptr)) { \
- throw mpy::exception_set(); \
- }
---
-2.44.0
-
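As an aside, a minimal sketch (not taken from the patch above) of the
_PyArg_Parser breakage its commit message describes; the designated-member
names are the ones the patch itself uses, and the exact 3.12 struct layout
is assumed here rather than spelled out:

```c
/* CPython 3.12 added a new element at the start of _PyArg_Parser, so
 * positional initialization now fills every field one slot too early. */
static const char * const _keywords[] = {"sizes", NULL};

/* pre-3.12 style: breaks silently on 3.12 */
/* static _PyArg_Parser parser = {"O", _keywords, 0}; */

/* designated initializers keep working on both 3.11 and 3.12 */
static _PyArg_Parser parser = { .format = "O", .keywords = _keywords };
```
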
diff --git a/0001-Regenerate-flatbuffer-header.patch b/0001-Regenerate-flatbuffer-header.patch
deleted file mode 100644
index 4eec491..0000000
--- a/0001-Regenerate-flatbuffer-header.patch
+++ /dev/null
@@ -1,39 +0,0 @@
-From 5b8e51b24513fa851eeff42f23d942bde301e321 Mon Sep 17 00:00:00 2001
-From: Tom Rix
-Date: Fri, 29 Sep 2023 06:19:29 -0700
-Subject: [PATCH] Regenerate flatbuffer header
-
-For this error:
-torch/csrc/jit/serialization/mobile_bytecode_generated.h:12:41:
-error: static assertion failed: Non-compatible flatbuffers version included
- 12 | FLATBUFFERS_VERSION_MINOR == 3 &&
-
-PyTorch is expecting 23.3.3, which is what F38 has.
-Rawhide is at 23.5.26.
-
-Regenerate with
-flatc --cpp --gen-mutable --no-prefix --scoped-enums mobile_bytecode.fbs
-
-Signed-off-by: Tom Rix
----
- torch/csrc/jit/serialization/mobile_bytecode_generated.h | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/torch/csrc/jit/serialization/mobile_bytecode_generated.h b/torch/csrc/jit/serialization/mobile_bytecode_generated.h
-index cffe8bc7a6..83575e4c19 100644
----- a/torch/csrc/jit/serialization/mobile_bytecode_generated.h
-+++ b/torch/csrc/jit/serialization/mobile_bytecode_generated.h
-@@ -9,8 +9,8 @@
- // Ensure the included flatbuffers.h is the same version as when this file was
- // generated, otherwise it may not be compatible.
- static_assert(FLATBUFFERS_VERSION_MAJOR == 23 &&
-- FLATBUFFERS_VERSION_MINOR == 3 &&
-- FLATBUFFERS_VERSION_REVISION == 3,
-+ FLATBUFFERS_VERSION_MINOR == 5 &&
-+ FLATBUFFERS_VERSION_REVISION == 26,
- "Non-compatible flatbuffers version included");
-
- namespace torch {
---
-2.43.0
-
diff --git a/0001-Stub-in-kineto-ActivityType.patch b/0001-Stub-in-kineto-ActivityType.patch
deleted file mode 100644
index f088645..0000000
--- a/0001-Stub-in-kineto-ActivityType.patch
+++ /dev/null
@@ -1,73 +0,0 @@
-From 3ef82b814179da571b2478f61d4279717ab0b23a Mon Sep 17 00:00:00 2001
-From: Tom Rix
-Date: Fri, 29 Sep 2023 06:25:23 -0700
-Subject: [PATCH] Stub in kineto ActivityType
-
-There is an error when kineto is not used: the shim still
-requires the ActivityType.h header to get the ActivityType enum.
-So cut-n-paste just enough of the header in to do this.
-
-Signed-off-by: Tom Rix
----
- torch/csrc/profiler/kineto_shim.h | 44 +++++++++++++++++++++++++++++++
- 1 file changed, 44 insertions(+)
-
-diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h
-index e92cbf003d..68985ab7d0 100644
----- a/torch/csrc/profiler/kineto_shim.h
-+++ b/torch/csrc/profiler/kineto_shim.h
-@@ -12,7 +12,51 @@
- #undef USE_KINETO
- #endif
-
-+#ifdef USE_KINETO
- #include
-+#else
-+namespace libkineto {
-+// copied from header
-+/*
-+ * Copyright (c) Meta Platforms, Inc. and affiliates.
-+ * All rights reserved.
-+ *
-+ * This source code is licensed under the BSD-style license found in the
-+ * LICENSE file in the root directory of this source tree.
-+ */
-+
-+// Note : All activity types are not enabled by default. Please add them
-+// at correct position in the enum
-+enum class ActivityType {
-+ // Activity types enabled by default
-+ CPU_OP = 0, // cpu side ops
-+ USER_ANNOTATION,
-+ GPU_USER_ANNOTATION,
-+ GPU_MEMCPY,
-+ GPU_MEMSET,
-+ CONCURRENT_KERNEL, // on-device kernels
-+ EXTERNAL_CORRELATION,
-+ CUDA_RUNTIME, // host side cuda runtime events
-+ CUDA_DRIVER, // host side cuda driver events
-+ CPU_INSTANT_EVENT, // host side point-like events
-+ PYTHON_FUNCTION,
-+ OVERHEAD, // CUPTI induced overhead events sampled from its overhead API.
-+ -+ // Optional Activity types -+ CUDA_SYNC, // synchronization events between runtime and kernels -+ GLOW_RUNTIME, // host side glow runtime events -+ MTIA_RUNTIME, // host side MTIA runtime events -+ CUDA_PROFILER_RANGE, // CUPTI Profiler range for performance metrics -+ MTIA_CCP_EVENTS, // MTIA ondevice CCP events -+ HPU_OP, // HPU host side runtime event -+ XPU_RUNTIME, // host side xpu runtime events -+ -+ ENUM_COUNT, // This is to add buffer and not used for any profiling logic. Add your new type before it. -+ OPTIONAL_ACTIVITY_TYPE_START = CUDA_SYNC, -+}; -+} -+ -+#endif - - #include - #include --- -2.43.0 - diff --git a/0001-can-not-use-with-c-files.patch b/0001-can-not-use-with-c-files.patch deleted file mode 100644 index 719737c..0000000 --- a/0001-can-not-use-with-c-files.patch +++ /dev/null @@ -1,25 +0,0 @@ -From a5dff521691a17701b5a02ec75e84cfe1bf605f7 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 06:41:49 -0500 -Subject: [PATCH] can not use with c files - ---- - cmake/Dependencies.cmake | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 4dd8042058..5f91f3ffab 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1269,7 +1269,7 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier) - list(APPEND HIP_CXX_FLAGS -DCAFFE2_USE_MIOPEN) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) -- list(APPEND HIP_CXX_FLAGS -std=c++17) -+# list(APPEND HIP_CXX_FLAGS -std=c++17) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "6.0.0") - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) - endif() --- -2.43.0 - diff --git a/0001-cuda-hip-signatures.patch b/0001-cuda-hip-signatures.patch deleted file mode 100644 index a258737..0000000 --- a/0001-cuda-hip-signatures.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 214dc959acc809e1959643272c344ee5335d5a69 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 1 Feb 2024 11:29:47 -0500 -Subject: [PATCH] cuda - hip signatures - ---- - aten/src/ATen/cuda/detail/LazyNVRTC.cpp | 9 +++++++++ - 1 file changed, 9 insertions(+) - -diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -index 1b85e7776e..bb6f88783a 100644 ---- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -+++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp -@@ -134,8 +134,13 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, - const char *src, - const char *name, - int numHeaders, -+#if !defined(USE_ROCM) - const char * const *headers, - const char * const *includeNames) { -+#else -+ const char **headers, -+ const char **includeNames) { -+#endif - auto fn = reinterpret_cast(getNVRTCLibrary().sym(__func__)); - if (!fn) - throw std::runtime_error("Can't get nvrtcCreateProgram"); -@@ -150,7 +155,11 @@ NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); - NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *); - NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *); - #endif -+#if !defined(USE_ROCM) - NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *); -+#else -+NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char **); -+#endif - _STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult); - NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*); - NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *); --- -2.43.0 - diff --git a/0001-disable-use-of-aotriton.patch b/0001-disable-use-of-aotriton.patch deleted file mode 100644 index 61ffd1e..0000000 --- a/0001-disable-use-of-aotriton.patch +++ 
/dev/null @@ -1,94 +0,0 @@ -From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 07:06:09 -0700 -Subject: [PATCH] disable use of aotriton - ---- - .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- - 1 file changed, 15 insertions(+), 2 deletions(-) - -diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -index 214b02d8262e..7b3eb9dcd8cd 100644 ---- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -@@ -19,9 +19,12 @@ - #include - #include - -+#ifdef USE_FLASH_ATTENTION - #if USE_ROCM - #include - #endif -+#endif -+ - - /** - * Note [SDPA Runtime Dispatch] -@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { - - bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { - // Check that the gpu is capable of running flash attention -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - using sm80 = SMVersion<8, 0>; - using sm90 = SMVersion<9, 0>; - #if USE_ROCM -@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug - } - #endif - return true; -+#endif - } - - bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - // Mem Efficient attention supports hardware in the range [sm_50, sm_90] - using sm50 = SMVersion<5, 0>; - using sm90 = SMVersion<9, 0>; -@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) - } - #endif - return true; -+#endif - } - - bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( -@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - #ifndef USE_FLASH_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); - return false; --#endif -+#else - - // Define gate functions that determine if a flash kernel can be ran - // Replace with std::to_array when we migrate to c++20 -@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - } - } - return true; -+#endif - } - - bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - #ifndef USE_MEM_EFF_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); - return false; --#endif -+#else - // Constraints specific to mem efficient attention - constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = - array_of(at::kHalf, at::kFloat, at::kBFloat16); -@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - } - #endif - return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); -+#endif - } - - SDPBackend select_sdp_backend(sdp_params const& kernel_params) { --- -2.45.2 - diff --git a/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch b/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch deleted file mode 100644 index 0ce5b1f..0000000 --- a/0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch +++ /dev/null @@ -1,226 +0,0 @@ -From b9d45eb1cc90696a4de76676221219e24423c709 Mon Sep 17 00:00:00 2001 -From: William Wen -Date: Wed, 3 Apr 2024 17:58:46 -0700 -Subject: [PATCH] [dynamo, 3.12] enable dynamo on 3.12, enable most dynamo - unittests on 3.12 (#123216) - -Pull Request resolved: https://github.com/pytorch/pytorch/pull/123216 -Approved by: 
https://github.com/jansel, https://github.com/malfet ---- - test/dynamo/test_autograd_function.py | 3 ++ - test/dynamo/test_misc.py | 63 +++++++++++++++++++++++++ - test/functorch/test_eager_transforms.py | 7 ++- - test/run_test.py | 3 -- - torch/__init__.py | 5 +- - torch/_dynamo/eval_frame.py | 4 +- - torch/_dynamo/test_case.py | 8 +--- - 7 files changed, 74 insertions(+), 19 deletions(-) - -diff --git a/test/dynamo/test_autograd_function.py b/test/dynamo/test_autograd_function.py -index d23fec607afa..bc5ebc767038 100644 ---- a/test/dynamo/test_autograd_function.py -+++ b/test/dynamo/test_autograd_function.py -@@ -2,6 +2,8 @@ - - import copy - import math -+import sys -+import unittest - - import torch - -@@ -528,6 +530,7 @@ class AutogradFunctionTests(torch._dynamo.test_case.TestCase): - # I pulled all of these test cases from test_autograd.py - # In the future, we should make the Dynamo test suite actually - # run on test_autograd.py (it's disabled right now) and delete these. -+ @unittest.skipIf(sys.version_info >= (3, 12), "invalid free in 3.12+") - def test_smoke_from_test_autograd(self): - class Func(torch.autograd.Function): - @staticmethod -diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py -index a73de8b1c7e9..8f54e0564e6b 100644 ---- a/test/dynamo/test_misc.py -+++ b/test/dynamo/test_misc.py -@@ -9760,6 +9760,69 @@ fn - lambda mod: mod, - ) - -+ @xfailIfPy311 -+ def test_outside_linear_module_free(self): -+ # Compared to test_linear_module_free, the linear -+ # layer is not the code object that is directly compiled. -+ def model_inp_ctr(): -+ fc = torch.nn.Linear(100, 100) -+ -+ class Mod(torch.nn.Module): -+ def __init__(self): -+ super().__init__() -+ self.fc_ref = fc -+ -+ def forward(self, x): -+ return fc(x[0]) -+ -+ # return fc to keep it alive in _test_compile_model_free -+ return Mod(), (torch.randn(100, 100), fc) -+ -+ self._test_compile_model_free(model_inp_ctr, lambda mod: mod.fc_ref) -+ -+ @unittest.skipIf(sys.version_info >= (3, 12), "leaks in 3.12+") -+ def test_parameter_free(self): -+ def model_inp_ctr(): -+ param = torch.nn.Parameter(torch.randn(100, 100)) -+ -+ class Mod(torch.nn.Module): -+ def __init__(self): -+ super().__init__() -+ self.param = param -+ -+ def forward(self, x): -+ return self.param * x[0] -+ -+ # return param to keep it alive in _test_compile_model_free -+ return Mod(), (torch.randn(100, 100), param) -+ -+ self._test_compile_model_free(model_inp_ctr, lambda mod: mod.param) -+ -+ def test_raises_importerror1(self): -+ @torch.compile(backend="eager") -+ def fn(x): -+ try: -+ import some_module_that_surely_does_not_exist -+ -+ return -+ except ImportError: -+ pass -+ return x.sin() -+ -+ x = torch.randn(8) -+ self.assertEqual(fn(x), x.sin()) -+ -+ def test_raises_importerror2(self): -+ @torch.compile(backend="eager") -+ def fn(x): -+ import some_module_that_surely_does_not_exist -+ -+ return x + 1 -+ -+ x = torch.randn(8) -+ with self.assertRaises(ImportError): -+ fn(x) -+ - def test_dynamo_cache_move_to_front(self): - class Mod(torch.nn.Module): - def __init__(self): -diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py -index 09415cf8f48e..60790ec06059 100644 ---- a/test/functorch/test_eager_transforms.py -+++ b/test/functorch/test_eager_transforms.py -@@ -4762,8 +4762,7 @@ class TestCompileTransforms(TestCase): - # Triton only supports GPU with SM70 or later. 
- @expectedFailureIf((IS_ARM64 and not IS_MACOS) or - IS_WINDOWS or -- (TEST_CUDA and not SM70OrLater) or -- (sys.version_info >= (3, 12))) -+ (TEST_CUDA and not SM70OrLater)) - def test_compile_vmap_hessian(self, device): - # The model and inputs are a smaller version - # of code at benchmark repo: -@@ -4792,8 +4791,8 @@ class TestCompileTransforms(TestCase): - actual = opt_fn(params_and_buffers, x) - self.assertEqual(actual, expected) - -- # torch.compile is not supported on Windows or on Python 3.12+ -- @expectedFailureIf(IS_WINDOWS or (sys.version_info >= (3, 12))) -+ # torch.compile is not supported on Windows -+ @expectedFailureIf(IS_WINDOWS) - @torch._dynamo.config.patch(suppress_errors=False) - @torch._dynamo.config.patch(capture_func_transforms=True) - @skipIfTorchDynamo("Do not test torch.compile on top of torch.compile") -diff --git a/test/run_test.py b/test/run_test.py -index e86af9623042..ebb14df4167d 100755 ---- a/test/run_test.py -+++ b/test/run_test.py -@@ -74,7 +74,6 @@ sys.path.remove(str(REPO_ROOT)) - RERUN_DISABLED_TESTS = os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1" - DISTRIBUTED_TEST_PREFIX = "distributed" - INDUCTOR_TEST_PREFIX = "inductor" --DYNAMO_TEST_PREFIX = "dynamo" - - - # Note [ROCm parallel CI testing] -@@ -324,7 +323,6 @@ JIT_EXECUTOR_TESTS = [ - ] - - INDUCTOR_TESTS = [test for test in TESTS if test.startswith(INDUCTOR_TEST_PREFIX)] --DYNAMO_TESTS = [test for test in TESTS if test.startswith(DYNAMO_TEST_PREFIX)] - DISTRIBUTED_TESTS = [test for test in TESTS if test.startswith(DISTRIBUTED_TEST_PREFIX)] - TORCH_EXPORT_TESTS = [test for test in TESTS if test.startswith("export")] - FUNCTORCH_TESTS = [test for test in TESTS if test.startswith("functorch")] -@@ -1361,7 +1359,6 @@ def get_selected_tests(options) -> List[str]: - # these tests failing in Python 3.12 temporarily disabling - if sys.version_info >= (3, 12): - options.exclude.extend(INDUCTOR_TESTS) -- options.exclude.extend(DYNAMO_TESTS) - options.exclude.extend( - [ - "functorch/test_dims", -diff --git a/torch/__init__.py b/torch/__init__.py -index d381712b4a35..26cdffe81d29 100644 ---- a/torch/__init__.py -+++ b/torch/__init__.py -@@ -1861,9 +1861,8 @@ def compile(model: Optional[Callable] = None, *, - - """ - _C._log_api_usage_once("torch.compile") -- # Temporary until we get proper support for python 3.12 -- if sys.version_info >= (3, 12): -- raise RuntimeError("Dynamo is not supported on Python 3.12+") -+ if sys.version_info >= (3, 13): -+ raise RuntimeError("Dynamo is not supported on Python 3.13+") - - # Decorator mode - if model is None: -diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py -index 53ab0df3a947..0a80eeea99ed 100644 ---- a/torch/_dynamo/eval_frame.py -+++ b/torch/_dynamo/eval_frame.py -@@ -589,8 +589,8 @@ class _NullDecorator(contextlib.nullcontext): # type: ignore[type-arg] - - - def check_if_dynamo_supported(): -- if sys.version_info >= (3, 12): -- raise RuntimeError("Python 3.12+ not yet supported for torch.compile") -+ if sys.version_info >= (3, 13): -+ raise RuntimeError("Python 3.13+ not yet supported for torch.compile") - - - def is_dynamo_supported(): -diff --git a/torch/_dynamo/test_case.py b/torch/_dynamo/test_case.py -index e3cbef09eaae..297ea6e2bc2a 100644 ---- a/torch/_dynamo/test_case.py -+++ b/torch/_dynamo/test_case.py -@@ -1,7 +1,6 @@ - import contextlib - import importlib - import logging --import sys - - import torch - import torch.testing -@@ -20,12 +19,7 @@ log = logging.getLogger(__name__) - def run_tests(needs=()): - from 
torch.testing._internal.common_utils import run_tests - -- if ( -- TEST_WITH_TORCHDYNAMO -- or IS_WINDOWS -- or TEST_WITH_CROSSREF -- or sys.version_info >= (3, 12) -- ): -+ if TEST_WITH_TORCHDYNAMO or IS_WINDOWS or TEST_WITH_CROSSREF: - return # skip testing - - if isinstance(needs, str): --- -2.44.0 - diff --git a/0001-include-fmt-ranges.h-for-using-fmt-join.patch b/0001-include-fmt-ranges.h-for-using-fmt-join.patch deleted file mode 100644 index f7f6c7d..0000000 --- a/0001-include-fmt-ranges.h-for-using-fmt-join.patch +++ /dev/null @@ -1,54 +0,0 @@ -From ba2cf11d1bf1dd5086c8e793198a697d4179cca7 Mon Sep 17 00:00:00 2001 -From: Kefu Chai -Date: Tue, 16 Jul 2024 08:00:22 +0800 -Subject: [PATCH] include fmt/ranges.h for using fmt::join() - -fmt::join() was moved into fmt/ranges.h in fmt 11, so include this -header for using it. - -Signed-off-by: Kefu Chai ---- - torch/csrc/distributed/c10d/socket.cpp | 1 + - torch/csrc/profiler/standalone/execution_trace_observer.cpp | 1 + - torch/csrc/profiler/util.cpp | 1 + - 3 files changed, 3 insertions(+) - -diff --git a/torch/csrc/distributed/c10d/socket.cpp b/torch/csrc/distributed/c10d/socket.cpp -index 5013f2540..cbcd33a19 100644 ---- a/torch/csrc/distributed/c10d/socket.cpp -+++ b/torch/csrc/distributed/c10d/socket.cpp -@@ -31,6 +31,7 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wdeprecated") - #include - C10_DIAGNOSTIC_POP() - #include -+#include - - #include - #include -diff --git a/torch/csrc/profiler/standalone/execution_trace_observer.cpp b/torch/csrc/profiler/standalone/execution_trace_observer.cpp -index 2ef2e5423..fb053e916 100644 ---- a/torch/csrc/profiler/standalone/execution_trace_observer.cpp -+++ b/torch/csrc/profiler/standalone/execution_trace_observer.cpp -@@ -10,6 +10,7 @@ - #endif // _WIN32 - - #include -+#include - #include - #include - #include -diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp -index 896bf606c..c229ce130 100644 ---- a/torch/csrc/profiler/util.cpp -+++ b/torch/csrc/profiler/util.cpp -@@ -5,6 +5,7 @@ - #include - #include - #include -+#include - - #ifdef USE_KINETO - #include --- -2.45.2 - diff --git a/0001-no-third_party-FXdiv.patch b/0001-no-third_party-FXdiv.patch deleted file mode 100644 index 71404e3..0000000 --- a/0001-no-third_party-FXdiv.patch +++ /dev/null @@ -1,54 +0,0 @@ -From b3b307add5724ee5730f161e16594fa702f34a19 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:20:28 -0500 -Subject: [PATCH] no third_party FXdiv - ---- - caffe2/CMakeLists.txt | 24 ++++++++++++------------ - 1 file changed, 12 insertions(+), 12 deletions(-) - -diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt -index b2f3adbfae..80a5625c8d 100644 ---- a/caffe2/CMakeLists.txt -+++ b/caffe2/CMakeLists.txt -@@ -110,15 +110,15 @@ endif() - # Note: the folders that are being commented out have not been properly - # addressed yet. 
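
The 0001-include-fmt-ranges.h patch deleted above exists because fmt 11 moved fmt::join() into fmt/ranges.h, so that header must be included explicitly once the build links a system fmt of that vintage. A minimal reproducer of the requirement:

    #include <fmt/format.h>
    #include <fmt/ranges.h>  // fmt >= 11: required for fmt::join()
    #include <vector>

    int main() {
        std::vector<int> dims{2, 3, 4};
        // Fails to compile against fmt 11 without the fmt/ranges.h include.
        fmt::print("sizes = [{}]\n", fmt::join(dims, ", "));
    }
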
- --if(NOT MSVC AND USE_XNNPACK) -- if(NOT TARGET fxdiv) -- set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") -- set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") -- add_subdirectory( -- "${FXDIV_SOURCE_DIR}" -- "${CMAKE_BINARY_DIR}/FXdiv") -- endif() --endif() -+#if(NOT MSVC AND USE_XNNPACK) -+# if(NOT TARGET fxdiv) -+# set(FXDIV_BUILD_TESTS OFF CACHE BOOL "") -+# set(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") -+# add_subdirectory( -+# "${FXDIV_SOURCE_DIR}" -+# "${CMAKE_BINARY_DIR}/FXdiv") -+# endif() -+#endif() - - add_subdirectory(core) - add_subdirectory(serialize) -@@ -1081,9 +1081,9 @@ if(USE_XPU) - target_compile_definitions(torch_xpu PRIVATE USE_XPU) - endif() - --if(NOT MSVC AND USE_XNNPACK) -- TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) --endif() -+#if(NOT MSVC AND USE_XNNPACK) -+# TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) -+#endif() - - # ========================================================== - # formerly-libtorch flags --- -2.43.0 - diff --git a/0001-no-third_party-fmt.patch b/0001-no-third_party-fmt.patch deleted file mode 100644 index 6e82af2..0000000 --- a/0001-no-third_party-fmt.patch +++ /dev/null @@ -1,65 +0,0 @@ -From 2ce255b75760a0a513fb1706629b416f76a5c822 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:16:04 -0500 -Subject: [PATCH] no third_party fmt - ---- - c10/CMakeLists.txt | 2 +- - cmake/Dependencies.cmake | 6 +++--- - torch/CMakeLists.txt | 2 +- - 3 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt -index 1f742f4c176..4fa08913bdd 100644 ---- a/c10/CMakeLists.txt -+++ b/c10/CMakeLists.txt -@@ -87,7 +87,7 @@ endif() - if(C10_USE_GLOG) - target_link_libraries(c10 PUBLIC glog::glog) - endif() --target_link_libraries(c10 PRIVATE fmt::fmt-header-only) -+target_link_libraries(c10 PRIVATE fmt) - - if(C10_USE_NUMA) - message(STATUS "NUMA paths:") -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 6f5a2d5feff..42fbf80f6e8 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1837,7 +1837,7 @@ endif() - # - set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) - set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE) --add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) -+# add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) - - # Disable compiler feature checks for `fmt`. - # -@@ -1846,9 +1846,9 @@ add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt) - # CMAKE_CXX_FLAGS in ways that break feature checks. Since we already know - # `fmt` is compatible with a superset of the compilers that PyTorch is, it - # shouldn't be too bad to just disable the checks. 
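
The fmt hunks above and below unbundle third_party/fmt: the vendored fmt::fmt-header-only target is swapped for a plain fmt library target so the linker resolves against the system copy. A hedged sketch of equivalent wiring, assuming the imported fmt::fmt target that a system fmt-devel package typically exports (the deleted patch itself uses the bare target name fmt):

    find_package(fmt REQUIRED)                  # system fmt, not third_party/fmt
    target_link_libraries(c10 PRIVATE fmt::fmt)
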
--set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") -+# set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") - --list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) -+# list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) - set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) - - # ---[ Kineto -diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt -index 97a72eed55b..9e5014d1980 100644 ---- a/torch/CMakeLists.txt -+++ b/torch/CMakeLists.txt -@@ -80,7 +80,7 @@ set(TORCH_PYTHON_LINK_LIBRARIES - python::python - pybind::pybind11 - shm -- fmt::fmt-header-only -+ fmt - ATEN_CPU_FILES_GEN_LIB) - - if(USE_ASAN AND TARGET Sanitizer::address) --- -2.43.2 - diff --git a/0001-no-third_party-foxi.patch b/0001-no-third_party-foxi.patch deleted file mode 100644 index ba1ec40..0000000 --- a/0001-no-third_party-foxi.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 8cb61cf9282102ac225645fcc9fb4a1bb7cb15a2 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 08:11:55 -0500 -Subject: [PATCH] no third_party foxi - ---- - cmake/Dependencies.cmake | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 5f91f3ffab..8e1461af81 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1567,7 +1567,7 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - set_target_properties(onnx_proto PROPERTIES CXX_STANDARD 17) - endif() - endif() -- add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL) -+ # add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL) - - add_definitions(-DONNX_NAMESPACE=${ONNX_NAMESPACE}) - if(NOT USE_SYSTEM_ONNX) -@@ -1600,8 +1600,8 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - message("-- Found onnx: ${ONNX_LIBRARY} ${ONNX_PROTO_LIBRARY}") - list(APPEND Caffe2_DEPENDENCY_LIBS onnx_proto onnx) - endif() -- include_directories(${FOXI_INCLUDE_DIRS}) -- list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) -+# include_directories(${FOXI_INCLUDE_DIRS}) -+# list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) - # Recover the build shared libs option. 
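
These no-third_party patches all share one unbundling shape: comment out the vendored add_subdirectory() and keep the link edge so the system library satisfies it. Written as a proper switch — USE_SYSTEM_FOXI is a hypothetical name here; the USE_SYSTEM_FBGEMM patch later in this series does exactly this upstream-style:

    option(USE_SYSTEM_FOXI "Use system-wide foxi" OFF)  # hypothetical option name
    if(NOT USE_SYSTEM_FOXI)
      add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL)
    endif()
    list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)
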
- set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) - endif() --- -2.43.0 - diff --git a/0001-python-torch-disable-ck.patch b/0001-python-torch-disable-ck.patch deleted file mode 100644 index e8fd9c2..0000000 --- a/0001-python-torch-disable-ck.patch +++ /dev/null @@ -1,112 +0,0 @@ -From 027dad1eaed51c1172e2497da611e3267d42d2f0 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 28 Mar 2025 09:16:03 -0700 -Subject: [PATCH] python-torch: disable ck - ---- - aten/src/ATen/CMakeLists.txt | 7 +++---- - aten/src/ATen/Context.cpp | 1 + - aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- - 3 files changed, 9 insertions(+), 9 deletions(-) - -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index 085af373ec22..84808880e51c 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -134,7 +134,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu") - file(GLOB native_cuda_cpp "native/cuda/*.cpp") - file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") - file(GLOB native_cuda_linalg_cpp "native/cuda/linalg/*.cpp") --file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" "native/hip/bgemm_kernels/*.h") -+file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" ) - file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") - file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") - file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") -@@ -145,7 +145,7 @@ file(GLOB native_nested_h "native/nested/*.h") - file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") - file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") - --file(GLOB native_hip_hip "native/hip/*.hip" "native/hip/bgemm_kernels/*.hip") -+file(GLOB native_hip_hip "native/hip/*.hip" ) - file(GLOB native_hip_cpp "native/hip/*.cpp") - file(GLOB native_hip_linalg_cpp "native/hip/linalg/*.cpp") - file(GLOB native_miopen_cpp "native/miopen/*.cpp") -@@ -361,13 +361,12 @@ endif() - ${native_quantized_hip_hip} - ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} - ) -- if(WIN32) # Windows doesn't support Composable Kernels and Triton - file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") - file(GLOB native_hip_ck "native/hip/ck*.hip") - exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" - ${native_hip_bgemm} ${native_hip_ck} - ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) -- endif() -+ - # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) - list(APPEND all_hip_cpp - ${native_nested_hip_cpp} -diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp -index f598fc3a39d3..03dab6ff38fe 100644 ---- a/aten/src/ATen/Context.cpp -+++ b/aten/src/ATen/Context.cpp -@@ -355,6 +355,7 @@ at::BlasBackend Context::blasPreferredBackend() { - } - - void Context::setBlasPreferredBackend(at::BlasBackend b) { -+ return; - #ifdef _MSC_VER - TORCH_WARN_ONCE( - "torch.backends.cuda.preferred_blas_library is an experimental feature. 
" -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index a62b028fd4ff..cba38426ea1f 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); - #endif - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); - } -@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); - } -@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); - } -@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_NO_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --- -2.48.1 - diff --git a/0001-reenable-foxi-linking.patch b/0001-reenable-foxi-linking.patch deleted file mode 100644 index 8e39795..0000000 --- a/0001-reenable-foxi-linking.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 58ccda271e8f51c3fa5b7518cf6ee52ce204fd37 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 22 Feb 2024 09:28:11 -0500 -Subject: [PATCH] reenable foxi linking - ---- - cmake/Dependencies.cmake | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 42fbf80f6e8..bc3a2dc6fee 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1604,7 +1604,7 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) - list(APPEND Caffe2_DEPENDENCY_LIBS onnx_proto onnx) - endif() - # include_directories(${FOXI_INCLUDE_DIRS}) --# list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) -+ list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader) - # Recover the build shared libs option. 
- set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) - endif() --- -2.43.2 - diff --git a/0001-silence-an-assert.patch b/0001-silence-an-assert.patch deleted file mode 100644 index 0b20dcf..0000000 --- a/0001-silence-an-assert.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 04dd33db93b852fdfd7ea408813080b2e2026650 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 06:41:20 -0500 -Subject: [PATCH] silence an assert - ---- - aten/src/ATen/native/cuda/IndexKernel.cu | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu -index 657c0c77b3..b406aa6687 100644 ---- a/aten/src/ATen/native/cuda/IndexKernel.cu -+++ b/aten/src/ATen/native/cuda/IndexKernel.cu -@@ -249,7 +249,7 @@ void index_put_kernel_quantized_cuda(TensorIterator& iter, const IntArrayRef ind - - gpu_index_kernel(iter, index_size, index_stride, [inv_scale, zero_point, qmin, qmax]C10_DEVICE(char* const out_data, const char* const in_data, const int64_t offset) { - int64_t qvalue = static_cast(zero_point + nearbyintf(*(float*)in_data * inv_scale)); -- qvalue = std::clamp(qvalue, qmin, qmax); -+ //qvalue = std::clamp(qvalue, qmin, qmax); - *(scalar_t*)(out_data + offset) = static_cast(qvalue); - }); - }); --- -2.43.0 - diff --git a/0001-torch-paper-over-c-assert.patch b/0001-torch-paper-over-c-assert.patch deleted file mode 100644 index b7e55ce..0000000 --- a/0001-torch-paper-over-c-assert.patch +++ /dev/null @@ -1,88 +0,0 @@ -From f646e0f04ae591c8f2d8a0cd24b035725c57659b Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Thu, 23 Jan 2025 08:24:22 -0800 -Subject: [PATCH] torch: paper over c++ assert - ---- - aten/src/ATen/native/sparse/FlattenIndicesCommon.h | 2 ++ - .../ATen/native/sparse/SparseBinaryOpIntersectionCommon.h | 5 +++++ - .../src/ATen/native/sparse/ValidateCompressedIndicesCommon.h | 2 ++ - 3 files changed, 9 insertions(+) - -diff --git a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h -index 0e79ed809ae6..a3cec8aaf78b 100644 ---- a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h -+++ b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h -@@ -69,11 +69,13 @@ Tensor _flatten_indices_impl(const Tensor& indices, IntArrayRef size) { - [=] FUNCAPI (int64_t nnz_idx) -> int64_t { - const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; - auto hash = static_cast(0); -+#if 0 - for (int64_t dim = 0; dim < sparse_dim; ++dim) { - const auto dim_hash_coeff = hash_coeffs[dim]; - const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; - hash += dim_index * dim_hash_coeff; - } -+#endif - return hash; - }); - } -diff --git a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h -index c0b94bf39d54..8de4900b7a01 100644 ---- a/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h -+++ b/aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h -@@ -279,12 +279,15 @@ void _sparse_binary_op_intersection_kernel_impl( - if (!ptr_indices) { - return hash; - } -+#if 0 -+// /usr/lib/gcc/x86_64-redhat-linux/15/../../../../include/c++/15/array:219:2: error: reference to __host__ function '__glibcxx_assert_fail' in __host__ __device__ function - const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; - for (int64_t dim = 0; dim < sparse_dim; ++dim) { - const auto dim_hash_coeff = hash_coeffs[dim]; - const auto dim_index = ptr_indices_dim[dim * 
indices_dim_stride]; - hash += dim_index * dim_hash_coeff; - } -+#endif - return hash; - }); - } -@@ -364,6 +367,7 @@ void _sparse_binary_op_intersection_kernel_impl( - if (hash_ptr) { - hash = hash_ptr[nnz_idx]; - } else if (sparse_dim) { -+#if 0 - // Compute hash value - const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride; - for (int64_t dim = 0; dim < sparse_dim; ++dim) { -@@ -371,6 +375,7 @@ void _sparse_binary_op_intersection_kernel_impl( - const auto dim_index = ptr_indices_dim[dim * indices_dim_stride]; - hash += dim_index * dim_hash_coeff; - } -+#endif - } - - // Perform hash values intersection -diff --git a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h -index ec4c084a39cc..9bc9655b0afa 100644 ---- a/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h -+++ b/aten/src/ATen/native/sparse/ValidateCompressedIndicesCommon.h -@@ -341,6 +341,7 @@ void _validate_compressed_sparse_indices_kernel( - // assuming idx contiguity per batch: - int64_t tmp = batch_idx * nnz; - // `nnz == idx_sizes[idx_ndims - 1]` is checked above as `nnz == idx.size(-1)` -+#if 0 - for (int i = idx_ndims - 1; - i >= 0 && nnz > 0; // break early when nnz==0 - i--) { -@@ -348,6 +349,7 @@ void _validate_compressed_sparse_indices_kernel( - idx_offset += (tmp - div * idx_sizes[i]) * idx_strides[i]; - tmp = div; - } -+#endif - const auto* RESTRICT ptr_idx_batch = ptr_idx + idx_offset; - _check_idx_sorted_distinct_vals_slices_with_cidx< - cdim_name, --- -2.48.1 - diff --git a/0001-use-any-hip.patch b/0001-use-any-hip.patch deleted file mode 100644 index dca86ea..0000000 --- a/0001-use-any-hip.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 4248211ce9a9de81bb3ade5d421ba709b19ead08 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 3 Feb 2024 15:01:28 -0500 -Subject: [PATCH] use any hip - ---- - cmake/public/LoadHIP.cmake | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index 1abeb06228..28458c4146 100644 ---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -30,7 +30,7 @@ endif() - message("Building PyTorch for GPU arch: ${PYTORCH_ROCM_ARCH}") - - # Add HIP to the CMAKE Module Path --set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip ${CMAKE_MODULE_PATH}) -+set(CMAKE_MODULE_PATH ${ROCM_PATH}/lib64/cmake/hip ${CMAKE_MODULE_PATH}) - - macro(find_package_and_print_version PACKAGE_NAME) - find_package("${PACKAGE_NAME}" ${ARGN}) -@@ -38,7 +38,7 @@ macro(find_package_and_print_version PACKAGE_NAME) - endmacro() - - # Find the HIP Package --find_package_and_print_version(HIP 1.0) -+find_package_and_print_version(HIP MODULE) - - if(HIP_FOUND) - set(PYTORCH_FOUND_HIP TRUE) --- -2.43.0 - diff --git a/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch deleted file mode 100644 index 413c60d..0000000 --- a/next/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 20 Jul 2024 05:37:15 -0600 -Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM - -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - cmake/Dependencies.cmake | 3 ++- - 2 files changed, 3 insertions(+), 1 deletion(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index c4cd4b2c2a98..2068f7c6c4f2 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -253,6 +253,7 @@ 
cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF - "USE_CUDNN" OFF) - cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) - option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) -+option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) - option(USE_KINETO "Use Kineto profiling library" ON) - option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) - option(USE_FAKELOWP "Use FakeLowp operators" OFF) -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..192dac46f13b 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -706,6 +706,7 @@ endif() - - # ---[ FBGEMM - if(USE_FBGEMM) -+ if (NOT USE_SYSTEM_FBGEMM) - set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") - if(NOT DEFINED FBGEMM_SOURCE_DIR) - set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") -@@ -746,7 +747,7 @@ if(USE_FBGEMM) - target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable) - endif() - endif() -- -+ endif() - if(USE_FBGEMM) - list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) - endif() --- -2.45.1 - diff --git a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch new file mode 100644 index 0000000..1afe692 --- /dev/null +++ b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch @@ -0,0 +1,149 @@ +From 4cc5d88dfe7a45ab245648dc874645d32a24b98b Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 27 Jun 2025 13:52:51 -0700 +Subject: [PATCH] Add cmake variable USE_ROCM_CK + +--- + CMakeLists.txt | 1 + + aten/src/ATen/CMakeLists.txt | 40 ++++++++++++++++----------------- + aten/src/ATen/cuda/CUDABlas.cpp | 10 ++++----- + cmake/Dependencies.cmake | 3 +++ + 4 files changed, 29 insertions(+), 25 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 99c0b9e0ea0c..4c632e42f531 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -240,6 +240,7 @@ cmake_dependent_option( + BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON + "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) + cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) ++cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) + option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) + cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) + cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index c9cfd74b501e..59f6178218ee 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -373,26 +373,26 @@ if(USE_ROCM) + # is header only, so this should be ok, except that the CMake build generates + # a ck/config.h. We just do that part here. Without this, the ck.h from the + # ROCM SDK may get accidentally used instead. 
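
The USE_ROCM_CK patch being added here replaces the USE_ROCM_NO_CK trick with a real switch. For reference, cmake_dependent_option(<option> <doc> <default> <depends> <force>) makes USE_ROCM_CK visible (defaulting ON) only while USE_ROCM is ON; note the force value chosen is ON rather than the conventional OFF. Reduced to its essentials:

    include(CMakeDependentOption)
    cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON)
    if(USE_ROCM_CK)
      list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK)  # as in the Dependencies.cmake hunk below
    endif()
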
+- function(_pytorch_rocm_generate_ck_conf) +- set(CK_ENABLE_INT8 "ON") +- set(CK_ENABLE_FP16 "ON") +- set(CK_ENABLE_FP32 "ON") +- set(CK_ENABLE_FP64 "ON") +- set(CK_ENABLE_BF16 "ON") +- set(CK_ENABLE_FP8 "ON") +- set(CK_ENABLE_BF8 "ON") +- set(CK_USE_XDL "ON") +- set(CK_USE_WMMA "ON") +- configure_file( +- "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" +- "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" +- ) +- endfunction() ++# function(_pytorch_rocm_generate_ck_conf) ++# set(CK_ENABLE_INT8 "ON") ++# set(CK_ENABLE_FP16 "ON") ++# set(CK_ENABLE_FP32 "ON") ++# set(CK_ENABLE_FP64 "ON") ++# set(CK_ENABLE_BF16 "ON") ++# set(CK_ENABLE_FP8 "ON") ++# set(CK_ENABLE_BF8 "ON") ++# set(CK_USE_XDL "ON") ++# set(CK_USE_WMMA "ON") ++# configure_file( ++# "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in" ++# "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h" ++# ) ++# endfunction() + list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) +- list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) +- _pytorch_rocm_generate_ck_conf() ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include) ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include) ++# list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel) ++# _pytorch_rocm_generate_ck_conf() + + # Next two lines are needed because TunableOp uses third-party/fmt + list(APPEND ATen_HIP_INCLUDE $) +@@ -409,7 +409,7 @@ endif() + ${native_quantized_hip_hip} + ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels ++ if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index 89350a11bea7..33e5f2808057 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -752,7 +752,7 @@ template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support double gemm yet + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); + #else +@@ -1103,7 +1103,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( + void * beta_ptr = &fbeta; + _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); + GEMM_CHECK_ARGVALUES(at::Half); +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + int flag = 0; + #if USE_GEMM_FLAGS_FP16_ALT_IMPL + flag = at::ROCmBackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; +@@ -1270,7 +1270,7 @@ template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support double gemm yet + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); + #else +@@ -1311,7 +1311,7 @@ template <> + void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + #else +@@ -1327,7 +1327,7 @@ template <> + void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) + { + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + #else +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index a93386c27f8d..be1368999d38 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1031,6 +1031,9 @@ if(USE_ROCM) + if(HIPBLASLT_VEC_EXT) + list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) + endif() ++ if(USE_ROCM_CK) ++ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) ++ endif() + list(APPEND HIP_HIPCC_FLAGS --offload-compress) + if(WIN32) + add_definitions(-DROCM_ON_WINDOWS) +-- +2.49.0 + diff --git a/next/0001-Optionally-use-hipblaslt.patch b/next/0001-Optionally-use-hipblaslt.patch deleted file mode 100644 index 1e5ca4b..0000000 --- a/next/0001-Optionally-use-hipblaslt.patch +++ /dev/null @@ -1,506 +0,0 @@ -From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 04:18:34 -0700 -Subject: [PATCH] Optionally use hipblaslt - ---- - aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ - aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ - aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- - aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- - aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- - cmake/Dependencies.cmake | 3 ++ - cmake/public/LoadHIP.cmake | 2 +- - 7 files changed, 82 insertions(+), 19 deletions(-) - -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index ce991a9bcad4..3f0d17b52778 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -14,7 +14,9 @@ - #include - - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - // until hipblas has an API to accept flags, we must use rocblas here - #include - #include -@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { - static size_t _parseChosenWorkspaceSize() { - const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); - #ifdef USE_ROCM -+#ifndef USE_HIPBLASLT -+ return 0; -+#endif - if (!val) { - // accept either env var - val = getenv("HIPBLASLT_WORKSPACE_SIZE"); -@@ -235,6 +240,7 @@ namespace at::cuda::blas { - } while (0) - - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - namespace { - // Following the pattern of CuSparseDescriptor - // Defined here for now because this is the only place cublas_lt interface is -@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< - }; - } // namespace - -- - template - inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - cudaDataType_t abcType = CUDA_R_32F; -@@ -452,7 +457,7 @@ inline void 
bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { - " scaleType ", - scaleType); - } -- -+#endif - - template - inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { -@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); - } - } -@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); - } - } -@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) - template <> - void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } - } -@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { - } - } - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { - // forward to bgemm implementation but set strides and batches to 0 - bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); - } -+#endif - - template - inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { -@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); - } - } -@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); - } - } -@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -- else { -+ else -+#endif -+ { - gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } - } -@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { - } - } - -- -+#if !defined(USE_ROCM) || 
(defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - void gemm_and_bias( - bool transpose_mat1, -@@ -1410,7 +1435,7 @@ void scaled_gemm( - ScalarType result_dtype, - void* amax_ptr, - bool use_fast_accum) { --#if CUDA_VERSION >= 11080 || defined(USE_ROCM) -+#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - const auto computeType = CUBLAS_COMPUTE_32F; - const auto scaleType = CUDA_R_32F; - const int8_t fastAccuMode = use_fast_accum ? 1 : 0; -@@ -1681,6 +1706,7 @@ void int8_gemm( - " scaleType ", - scaleType); - } -+#endif - - template <> - void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { -diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h -index f2b657ced51b..f0ee613c4208 100644 ---- a/aten/src/ATen/cuda/CUDAContextLight.h -+++ b/aten/src/ATen/cuda/CUDAContextLight.h -@@ -9,7 +9,9 @@ - - // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also - // added bf16 support -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - #include -+#endif - - #ifdef CUDART_VERSION - #include -@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); - /* Handles */ - TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); - TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); -+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) - TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); -+#endif - - TORCH_CUDA_CPP_API void clearCublasWorkspaces(); - -diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index 8eac525b3695..abfdf7a23847 100644 ---- a/aten/src/ATen/cuda/CublasHandlePool.cpp -+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp -@@ -29,7 +29,7 @@ namespace at::cuda { - - namespace { - --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - void createCublasLtHandle(cublasLtHandle_t *handle) { - TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); - } -@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { - return handle; - } - --cublasLtHandle_t getCurrentCUDABlasLtHandle() { - #ifdef USE_ROCM -+#if defined(USE_HIPBLASLT) -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - c10::DeviceIndex device = 0; - AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); - -@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { - - auto handle = myPoolWindow->reserve(device); - return handle; -+} -+#endif - #else -+cublasLtHandle_t getCurrentCUDABlasLtHandle() { - return reinterpret_cast(getCurrentCUDABlasHandle()); --#endif - } -+#endif - - } // namespace at::cuda -diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h -index 53e6154120c9..fa1d664696db 100644 ---- a/aten/src/ATen/cuda/tunable/TunableGemm.h -+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h -@@ -11,7 +11,9 @@ - - #include - #ifdef USE_ROCM -+#ifdef USE_HIPBLASLT - #include -+#endif - #include - #endif - #include -@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class DefaultScaledGemmOp : public Callable> { - public: -@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { - return OK; - } - }; -+#endif - - template - inline bool IsZero(T v) { -@@ -191,6 +195,7 @@ static void AddRocblasValidator() { - } - } - -+#ifdef USE_HIPBLASLT - static void AddHipblasltValidator() { - auto validators = 
getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - if (validators.find("HIPBLASLT_VERSION") == validators.end()) { -@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { - [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); - } - } -+#endif - - static void AddRocmValidator() { - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); -@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddRocblasValidator(); - } -- -+#ifdef USE_HIPBLASLT - static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { - rocm_validators = true; -@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - AddHipblasltValidator(); - } -- -+#endif - if (rocm_validators) { - AddRocmValidator(); - } -@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp - } - }; - -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - template - class ScaledGemmTunableOp : public TunableOp, StreamTimer> { - public: -@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); - - #if defined(USE_ROCM) -+#ifdef USE_HIPBLASLT - for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { - this->RegisterOp(std::move(name), std::move(op)); - } - AddHipblasltValidator(); -+#endif - AddRocmValidator(); - #endif - } -@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> - "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); - } - }; -+#endif - - #undef XSTRINGIFY - #undef STRINGIFY -diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 84c59a4fd0d7..56ad5de3bf2d 100644 ---- a/aten/src/ATen/native/cuda/Blas.cpp -+++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa - } - - static bool getDisableAddmmCudaLt() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); - #ifdef USE_ROCM - // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) -@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { - } - return false; - #endif -+#else -+ return true; -+#endif - } - - #ifdef USE_ROCM - static bool isSupportedHipLtROCmArch(int index) { -+#ifdef USE_HIPBLASLT - hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); - std::string device_arch = prop->gcnArchName; - static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { - } - } - TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); -+#endif - return false; - } - #endif -@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& 
self, const Tensor& ma - at::ScalarType scalar_type = self.scalar_type(); - c10::MaybeOwned self_; - if (&result != &self) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) - // Strangely, if mat2 has only 1 row or column, we get - // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. -@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - scalar_type != at::ScalarType::BFloat16)); - #endif - } -+#endif - #endif - if (!useLtInterface) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - } - self__sizes = self_->sizes(); - } else { --#if defined(USE_ROCM) -+#if defined(USE_ROCM) && defined(USE_HIPBLASLT) - useLtInterface = !disable_addmm_cuda_lt && - result.dim() == 2 && result.is_contiguous() && - isSupportedHipLtROCmArch(self.device().index()) && -@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); - - if (useLtInterface) { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - #if defined(USE_ROCM) - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, -@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - activation_epilogue - ); - }); -+#endif - #endif - } else - { -@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { - } - - static bool _scaled_mm_allowed_device() { -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - auto dprops = at::cuda::getCurrentDeviceProperties(); - #ifdef USE_ROCM - std::string device_arch = dprops->gcnArchName; -@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { - #else - return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); - #endif -+#else -+ return false; -+#endif - } - - // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax -@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - // Check sizes - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); -+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); - TORCH_CHECK( -@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 - // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately - amax = at::max(at::abs(out.to(kFloat))); -+#endif - #endif - - return {out, amax}; -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index f1f2eb7cec31..8d05e834bbc5 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1052,6 +1052,9 @@ if(USE_ROCM) - list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) - list(APPEND HIP_CXX_FLAGS -std=c++17) - list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) -+ if(hipblast_FOUND) -+ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) -+ endif() - if(HIP_NEW_TYPE_ENUMS) - list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) - endif() -diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake -index fa39156031ff..df4836847fdf 100644 
---- a/cmake/public/LoadHIP.cmake -+++ b/cmake/public/LoadHIP.cmake -@@ -155,7 +155,7 @@ if(HIP_FOUND) - find_package_and_print_version(hiprand REQUIRED) - find_package_and_print_version(rocblas REQUIRED) - find_package_and_print_version(hipblas REQUIRED) -- find_package_and_print_version(hipblaslt REQUIRED) -+ find_package_and_print_version(hipblaslt) - find_package_and_print_version(miopen REQUIRED) - find_package_and_print_version(hipfft REQUIRED) - find_package_and_print_version(hipsparse REQUIRED) --- -2.45.2 - diff --git a/next/0001-disable-use-of-aotriton.patch b/next/0001-disable-use-of-aotriton.patch deleted file mode 100644 index 61ffd1e..0000000 --- a/next/0001-disable-use-of-aotriton.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Sat, 29 Jun 2024 07:06:09 -0700 -Subject: [PATCH] disable use of aotriton - ---- - .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- - 1 file changed, 15 insertions(+), 2 deletions(-) - -diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -index 214b02d8262e..7b3eb9dcd8cd 100644 ---- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -@@ -19,9 +19,12 @@ - #include - #include - -+#ifdef USE_FLASH_ATTENTION - #if USE_ROCM - #include - #endif -+#endif -+ - - /** - * Note [SDPA Runtime Dispatch] -@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { - - bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { - // Check that the gpu is capable of running flash attention -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - using sm80 = SMVersion<8, 0>; - using sm90 = SMVersion<9, 0>; - #if USE_ROCM -@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug - } - #endif - return true; -+#endif - } - - bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { -+#ifndef USE_FLASH_ATTENTION -+ return false; -+#else - // Mem Efficient attention supports hardware in the range [sm_50, sm_90] - using sm50 = SMVersion<5, 0>; - using sm90 = SMVersion<9, 0>; -@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) - } - #endif - return true; -+#endif - } - - bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( -@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - #ifndef USE_FLASH_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); - return false; --#endif -+#else - - // Define gate functions that determine if a flash kernel can be ran - // Replace with std::to_array when we migrate to c++20 -@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { - } - } - return true; -+#endif - } - - bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - #ifndef USE_MEM_EFF_ATTENTION - TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); - return false; --#endif -+#else - // Constraints specific to mem efficient attention - constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = - array_of(at::kHalf, at::kFloat, at::kBFloat16); -@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { - } - #endif - return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, 
debug); -+#endif - } - - SDPBackend select_sdp_backend(sdp_params const& kernel_params) { --- -2.45.2 - diff --git a/python-torch.spec b/python-torch.spec index 90c908c..91a82a9 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.8.0-rc3 -%global commit0 3d53a53e504089a52a149791fd33d7fc898bd055 +# v2.8.0-rc6 +%global commit0 f2b69a083d15e3d0083bb304302a3fd0b5fb8705 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250625 +%global date0 20250718 %global pypi_version 2.8.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 @@ -357,11 +357,9 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)@#list(APPE sed -i -e 's@if(NOT TARGET fxdiv)@if(MSVC AND USE_XNNPACK)@' caffe2/CMakeLists.txt sed -i -e 's@TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@#TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv)@' caffe2/CMakeLists.txt -%if %{without gitcommit} # https://github.com/pytorch/pytorch/issues/149803 # Tries to checkout nccl -sed -i -e 's@ checkout_nccl()@# checkout_nccl()@' tools/build_pytorch_libs.py -%endif +sed -i -e 's@ checkout_nccl()@ True@' tools/build_pytorch_libs.py # Disable the use of check_submodule's in the setup.py, we are a tarball, not a git repo sed -i -e 's@check_submodules()$@#check_submodules()@' setup.py @@ -541,6 +539,7 @@ export USE_SYSTEM_EIGEN_INSTALL=ON export USE_SYSTEM_ONNX=ON export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF +export USE_SYSTEM_NCCL=OFF export USE_TENSORPIPE=OFF export USE_XNNPACK=OFF export USE_XPU=OFF From 61ccf033a8aece458e7df6573367bde478952636 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 24 Jul 2025 06:07:03 -0700 Subject: [PATCH 18/38] Update gitcommit to 2.8.0-rc8 Patch problem with 3.14 Start converting over py3 macros Handle new dependency on rocmsmi Signed-off-by: Tom Rix --- .../0001-Add-cmake-variable-USE_ROCM_CK.patch | 85 ++++- ...and-import-torch-issues-for-cpython-.patch | 359 ++++++++++++++++++ next/0001-Use-horrible-dynamo-stub.patch | 85 +++++ python-torch.spec | 60 ++- 4 files changed, 560 insertions(+), 29 deletions(-) create mode 100644 next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch create mode 100644 next/0001-Use-horrible-dynamo-stub.patch diff --git a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch index 1afe692..925e03b 100644 --- a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch +++ b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch @@ -1,17 +1,17 @@ -From 4cc5d88dfe7a45ab245648dc874645d32a24b98b Mon Sep 17 00:00:00 2001 +From 193854993cd939de186de19589c1add4c4b2cf66 Mon Sep 17 00:00:00 2001 From: Tom Rix -Date: Fri, 27 Jun 2025 13:52:51 -0700 +Date: Mon, 21 Jul 2025 11:35:03 -0700 Subject: [PATCH] Add cmake variable USE_ROCM_CK --- CMakeLists.txt | 1 + aten/src/ATen/CMakeLists.txt | 40 ++++++++++++++++----------------- - aten/src/ATen/cuda/CUDABlas.cpp | 10 ++++----- + aten/src/ATen/cuda/CUDABlas.cpp | 22 +++++++++--------- cmake/Dependencies.cmake | 3 +++ - 4 files changed, 29 insertions(+), 25 deletions(-) + 4 files changed, 35 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt -index 99c0b9e0ea0c..4c632e42f531 100644 +index a5d25e6afa0f..afc1b53efa64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -240,6 +240,7 @@ cmake_dependent_option( @@ -82,7 +82,7 @@ index c9cfd74b501e..59f6178218ee 100644 file(GLOB native_hip_ck "native/hip/ck*.hip") exclude(ATen_HIP_SRCS 
"${ATen_HIP_SRCS}" diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index 89350a11bea7..33e5f2808057 100644 +index 89350a11bea7..e5b7960177cf 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -752,7 +752,7 @@ template <> @@ -94,16 +94,16 @@ index 89350a11bea7..33e5f2808057 100644 // hipblaslt does not support double gemm yet bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); #else -@@ -1103,7 +1103,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( - void * beta_ptr = &fbeta; - _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); - GEMM_CHECK_ARGVALUES(at::Half); --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - int flag = 0; - #if USE_GEMM_FLAGS_FP16_ALT_IMPL - flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; -@@ -1270,7 +1270,7 @@ template <> +@@ -836,7 +836,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +@@ -1270,14 +1270,14 @@ template <> void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) { if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { @@ -112,6 +112,23 @@ index 89350a11bea7..33e5f2808057 100644 // hipblaslt does not support double gemm yet gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); #else + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); + #endif + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); + } +@@ -1293,7 +1293,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); @@ -1311,7 +1311,7 @@ template <> void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { @@ -130,6 +147,42 @@ index 89350a11bea7..33e5f2808057 100644 // hipblaslt does not support complex gemm yet gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); #else +@@ -1345,7 +1345,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); + } +@@ -1361,7 +1361,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + 
at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +@@ -1382,7 +1382,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } +@@ -1398,7 +1398,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::B + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#if defined(USE_ROCM) && !defined(_MSC_VER) ++#if defined(USE_ROCM) && defined(USE_ROCM_CK) + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); + } diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a93386c27f8d..be1368999d38 100644 --- a/cmake/Dependencies.cmake diff --git a/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch b/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch new file mode 100644 index 0000000..b6a282c --- /dev/null +++ b/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch @@ -0,0 +1,359 @@ +From f2a544b2e3a5bdc04985f6e06223c0c1700120a0 Mon Sep 17 00:00:00 2001 +From: albanD +Date: Sat, 12 Jul 2025 03:42:33 -0400 +Subject: [PATCH] Fix compilation and "import torch" issues for cpython 3.14 + +Imported from +https://github.com/albanD/pytorch/tree/cpython314_build +commit 88bb9cdb72449f4277829e20d94ad8aec1894216 + +Signed-off-by: Tom Rix +--- + torch/_dynamo/bytecode_analysis.py | 2 +- + torch/ao/quantization/__init__.py | 5 +++- + torch/ao/quantization/qconfig.py | 4 ++- + torch/ao/quantization/utils.py | 7 +++-- + torch/csrc/dynamo/cpython_defs.c | 16 +++++++++++ + torch/csrc/dynamo/cpython_includes.h | 17 ++++++++++++ + torch/csrc/dynamo/eval_frame.c | 34 +++++++++++++++-------- + torch/csrc/dynamo/framelocals_mapping.cpp | 14 ++++++++++ + torch/csrc/utils/python_compat.h | 1 + + torch/onnx/__init__.py | 1 - + torch/utils/weak.py | 29 +++++++++++++++++-- + 11 files changed, 111 insertions(+), 19 deletions(-) + +diff --git a/torch/_dynamo/bytecode_analysis.py b/torch/_dynamo/bytecode_analysis.py +index 3252ea91409f..2de74ee5bf8d 100644 +--- a/torch/_dynamo/bytecode_analysis.py ++++ b/torch/_dynamo/bytecode_analysis.py +@@ -33,7 +33,7 @@ if sys.version_info >= (3, 11): + TERMINAL_OPCODES.add(dis.opmap["JUMP_FORWARD"]) + else: + TERMINAL_OPCODES.add(dis.opmap["JUMP_ABSOLUTE"]) +-if sys.version_info >= (3, 12): ++if (3, 12) <= sys.version_info < (3, 14): + TERMINAL_OPCODES.add(dis.opmap["RETURN_CONST"]) + if sys.version_info >= (3, 13): + TERMINAL_OPCODES.add(dis.opmap["JUMP_BACKWARD_NO_INTERRUPT"]) +diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py +index ffc1792fd23f..cf5a8b99a894 100644 +--- a/torch/ao/quantization/__init__.py ++++ b/torch/ao/quantization/__init__.py +@@ -1,5 +1,6 @@ + # mypy: allow-untyped-defs + ++import sys + from typing import Callable, Optional, Union + + import torch +@@ -33,7 +34,9 @@ from .stubs import * # noqa: F403 + + # ensure __module__ is set correctly for public APIs + ObserverOrFakeQuantize = Union[ObserverBase, FakeQuantizeBase] 
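
The hunk continuing below guards the cosmetic __module__ assignment because setting attributes on a typing.Union alias apparently stops working on CPython 3.14; skipping it there costs nothing but the reported module path. Reduced to its essentials (module name illustrative):

    import sys
    from typing import Union

    Alias = Union[int, str]
    if sys.version_info < (3, 14):
        # Cosmetic: make the alias report a public module path.
        Alias.__module__ = "example.public.api"  # assumed to fail on 3.14+
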
+-ObserverOrFakeQuantize.__module__ = "torch.ao.quantization" ++if sys.version_info < (3, 14): ++ ObserverOrFakeQuantize.__module__ = "torch.ao.quantization" ++ + for _f in [ + compare_results, + extract_results_from_loggers, +diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py +index efee5302ad42..d9a8fc78bab4 100644 +--- a/torch/ao/quantization/qconfig.py ++++ b/torch/ao/quantization/qconfig.py +@@ -1,5 +1,6 @@ + # mypy: allow-untyped-defs + import copy ++import sys + import warnings + from collections import namedtuple + from typing import Any, Optional, Union +@@ -568,7 +569,8 @@ def _assert_valid_qconfig(qconfig: Optional[QConfig], mod: torch.nn.Module) -> N + + + QConfigAny = Optional[QConfig] +-QConfigAny.__module__ = "torch.ao.quantization.qconfig" ++if sys.version_info < (3, 14): ++ QConfigAny.__module__ = "torch.ao.quantization.qconfig" + + + def _add_module_to_qconfig_obs_ctr( +diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py +index 4ac3112ec072..3b1503e01701 100644 +--- a/torch/ao/quantization/utils.py ++++ b/torch/ao/quantization/utils.py +@@ -4,6 +4,7 @@ Utils shared by different modes of quantization (eager/graph) + """ + + import functools ++import sys + import warnings + from collections import OrderedDict + from inspect import getfullargspec, signature +@@ -16,7 +17,8 @@ from torch.nn.utils.parametrize import is_parametrized + + + NodePattern = Union[tuple[Node, Node], tuple[Node, tuple[Node, Node]], Any] +-NodePattern.__module__ = "torch.ao.quantization.utils" ++if sys.version_info < (3, 14): ++ NodePattern.__module__ = "torch.ao.quantization.utils" + + # This is the Quantizer class instance from torch/quantization/fx/quantize.py. + # Define separately to prevent circular imports. 
+@@ -31,7 +33,8 @@ QuantizerCls = Any + Pattern = Union[ + Callable, tuple[Callable, Callable], tuple[Callable, tuple[Callable, Callable]], Any + ] +-Pattern.__module__ = "torch.ao.quantization.utils" ++if sys.version_info < (3, 14): ++ Pattern.__module__ = "torch.ao.quantization.utils" + + + # TODO: maybe rename this to MatchInputNode +diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c +index b68ef894aeaa..244d4165d5e8 100644 +--- a/torch/csrc/dynamo/cpython_defs.c ++++ b/torch/csrc/dynamo/cpython_defs.c +@@ -2,6 +2,20 @@ + #include + #include + ++#if IS_PYTHON_3_14_PLUS ++ ++const uint8_t* THP_PyOpcode_Caches = NULL; ++const int THP_PyOpcode_Caches_size = 0; ++ ++void ++THP_PyThreadState_PopFrame(PyThreadState *tstate, _PyInterpreterFrame * frame) ++{} ++void ++THP_PyFrame_Clear(_PyInterpreterFrame *frame) ++{} ++ ++#else ++ + #if IS_PYTHON_3_11_PLUS + + #define Py_BUILD_CORE +@@ -360,3 +374,5 @@ const uint8_t* THP_PyOpcode_Caches = NULL; + const int THP_PyOpcode_Caches_size = 0; + + #endif ++ ++#endif // IS_PYTHON_3_14_PLUS +\ No newline at end of file +diff --git a/torch/csrc/dynamo/cpython_includes.h b/torch/csrc/dynamo/cpython_includes.h +index 6b99c1d5aec8..616be16563cf 100644 +--- a/torch/csrc/dynamo/cpython_includes.h ++++ b/torch/csrc/dynamo/cpython_includes.h +@@ -21,6 +21,14 @@ + + #if IS_PYTHON_3_11_PLUS + #include ++#if IS_PYTHON_3_14_PLUS ++#include ++#include ++#endif ++#endif ++ ++#if IS_PYTHON_3_14_PLUS ++#include + #endif + + #undef Py_BUILD_CORE +@@ -30,6 +38,13 @@ + extern "C" { + #endif + ++#if IS_PYTHON_3_14_PLUS ++ ++#define F_CODE(x) (PyCodeObject*)PyStackRef_AsPyObjectBorrow(x->f_executable) ++#define PREV_INSTR(x) (x)->instr_ptr ++ ++#else ++ + #if IS_PYTHON_3_13_PLUS + #define F_CODE(x) ((PyCodeObject*)(x)->f_executable) + #define PREV_INSTR(x) (x)->instr_ptr +@@ -38,6 +53,8 @@ extern "C" { + #define PREV_INSTR(x) (x)->prev_instr + #endif + ++#endif // IS_PYTHON_3_14_PLUS ++ + #if IS_PYTHON_3_12_PLUS + #define FUNC(x) ((x)->f_funcobj) + #else +diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c +index f413782b2d30..72bb8839bac3 100644 +--- a/torch/csrc/dynamo/eval_frame.c ++++ b/torch/csrc/dynamo/eval_frame.c +@@ -224,17 +224,6 @@ const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) { + return PyUnicode_AsUTF8(F_CODE(frame)->co_name); + } + +-void clear_old_frame_if_python_312_plus( +- PyThreadState* tstate, +- THP_EVAL_API_FRAME_OBJECT* frame) { +-#if IS_PYTHON_3_12_PLUS +- +- THP_PyFrame_Clear(frame); +- THP_PyThreadState_PopFrame(tstate, frame); +- +-#endif +-} +- + static PyObject* dynamo_eval_custom_code_impl( + PyThreadState* tstate, + THP_EVAL_API_FRAME_OBJECT* frame, +@@ -485,6 +474,18 @@ static PyObject* dynamo__custom_eval_frame_shim( + + static void enable_eval_frame_shim(PyThreadState* tstate) {} + static void enable_eval_frame_default(PyThreadState* tstate) {} ++PyObject* dynamo_eval_custom_code( ++ PyThreadState* tstate, ++ THP_EVAL_API_FRAME_OBJECT* frame, ++ PyCodeObject* code, ++ const char* trace_annotation, ++ int throw_flag) {} ++THPPyInterpreterFrame* THPPyInterpreterFrame_New( ++ THP_EVAL_API_FRAME_OBJECT* frame) {} ++PyObject* dynamo_eval_frame_default( ++ PyThreadState* tstate, ++ THP_EVAL_API_FRAME_OBJECT* frame, ++ int throw_flag) {} + + static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL}; + +@@ -498,6 +499,17 @@ static PyTypeObject THPPyInterpreterFrameType = { + + #endif // !(IS_PYTHON_3_14_PLUS) + ++void clear_old_frame_if_python_312_plus( ++ 
PyThreadState* tstate, ++ THP_EVAL_API_FRAME_OBJECT* frame) { ++#if IS_PYTHON_3_12_PLUS ++ ++ THP_PyFrame_Clear(frame); ++ THP_PyThreadState_PopFrame(tstate, frame); ++ ++#endif ++} ++ + static PyObject* increment_working_threads( + PyThreadState* tstate, + PyObject* module) { +diff --git a/torch/csrc/dynamo/framelocals_mapping.cpp b/torch/csrc/dynamo/framelocals_mapping.cpp +index b839fb26fc91..c4ee36d87767 100644 +--- a/torch/csrc/dynamo/framelocals_mapping.cpp ++++ b/torch/csrc/dynamo/framelocals_mapping.cpp +@@ -26,9 +26,13 @@ FrameLocalsMapping::FrameLocalsMapping(FrameLocalsFrameType* frame) + PyCodeObject* co = F_CODE(frame); + _framelocals.resize(co->co_nlocalsplus, nullptr); + ++#if IS_PYTHON_3_14_PLUS ++ TORCH_CHECK(false, "Python 3.14+ not supported"); ++#else + if (!frame->stacktop) { + return; + } ++#endif + + auto update_framelocals = [&](int i, PyObject* value) { + _PyLocals_Kind kind = _PyLocals_GetKind(co->co_localspluskinds, i); +@@ -53,11 +57,21 @@ FrameLocalsMapping::FrameLocalsMapping(FrameLocalsFrameType* frame) + }; + + auto offset = co->co_nlocalsplus - co->co_nfreevars; ++#if IS_PYTHON_3_14_PLUS ++ TORCH_CHECK(false, "Python 3.14+ not supported"); ++#else + for (int i = 0; i < offset; i++) { + update_framelocals(i, frame->localsplus[i]); + } ++#endif ++ + // Get references to closure variables ++#if IS_PYTHON_3_14_PLUS ++ PyObject* closure; ++ TORCH_CHECK(false, "Python 3.14+ not supported"); ++#else + PyObject* closure = ((PyFunctionObject*)FUNC(frame))->func_closure; ++#endif + for (int i = 0; i < co->co_nfreevars; i++) { + update_framelocals(offset + i, PyTuple_GET_ITEM(closure, i)); + } +diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h +index a1537611cc47..16292e4fd030 100644 +--- a/torch/csrc/utils/python_compat.h ++++ b/torch/csrc/utils/python_compat.h +@@ -13,6 +13,7 @@ extern "C" { + #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000 + #define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000 + #define IS_PYTHON_3_14_PLUS PY_VERSION_HEX >= 0x030E0000 ++#define IS_PYTHON_3_15_PLUS PY_VERSION_HEX >= 0x030F0000 + + static inline int PyCode_GetNCellvars(PyCodeObject* code) { + // gh-26364 added co_ncellvars to Python 3.11.0rc1 +diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py +index 345ffd2a065b..ceeadde5365b 100644 +--- a/torch/onnx/__init__.py ++++ b/torch/onnx/__init__.py +@@ -104,7 +104,6 @@ ONNXProgram.__module__ = "torch.onnx" + OnnxExporterError.__module__ = "torch.onnx" + _OrtBackend.__module__ = "torch.onnx" + _OrtBackendOptions.__module__ = "torch.onnx" +-_OrtExecutionProvider.__module__ = "torch.onnx" + enable_fake_mode.__module__ = "torch.onnx" + is_onnxrt_backend_supported.__module__ = "torch.onnx" + +diff --git a/torch/utils/weak.py b/torch/utils/weak.py +index 8bf2ba5ed02b..9c7218cb2ad3 100644 +--- a/torch/utils/weak.py ++++ b/torch/utils/weak.py +@@ -3,8 +3,6 @@ from __future__ import annotations + + import collections.abc as _collections_abc + import weakref +- +-from _weakrefset import _IterationGuard # type: ignore[attr-defined] + from collections.abc import Mapping, MutableMapping + from weakref import ref + +@@ -22,6 +20,33 @@ __all__ = [ + ] + + ++# TODO: make weakref properly thread safe following ++# https://github.com/python/cpython/pull/125325 ++class _IterationGuard: ++ # This context manager registers itself in the current iterators of the ++ # weak container, such as to delay all removals until the context manager ++ # exits. 
++    # This technique should be relatively thread-safe (since sets are).
++
++    def __init__(self, weakcontainer):
++        # Don't create cycles
++        self.weakcontainer = ref(weakcontainer)
++
++    def __enter__(self):
++        w = self.weakcontainer()
++        if w is not None:
++            w._iterating.add(self)
++        return self
++
++    def __exit__(self, e, t, b):
++        w = self.weakcontainer()
++        if w is not None:
++            s = w._iterating
++            s.remove(self)
++            if not s:
++                w._commit_removals()
++
++
+ # This file defines a variant of WeakKeyDictionary that overrides the hashing
+ # behavior of the key to use object identity, rather than the builtin
+ # __eq__/__hash__ functions. This is useful for Tensor weak keys, as their
+--
+2.49.0
+
diff --git a/next/0001-Use-horrible-dynamo-stub.patch b/next/0001-Use-horrible-dynamo-stub.patch
new file mode 100644
index 0000000..1900519
--- /dev/null
+++ b/next/0001-Use-horrible-dynamo-stub.patch
@@ -0,0 +1,85 @@
+From fd535f7bf44f2034cca2a66b4cc7d68d962341df Mon Sep 17 00:00:00 2001
+From: Tom Rix
+Date: Sun, 20 Jul 2025 12:47:58 -0700
+Subject: [PATCH] Use horrible dynamo stub
+
+Rawhide's update of Python is too fast for dynamo,
+so paper over the problem with a horrible stub that throws
+runtime exceptions if dynamo is used.
+
+Signed-off-by: Tom Rix
+---
+ build_variables.bzl | 26 ++++++++++++----------
+ torch/csrc/dynamo/horrible_dynamo_stub.cpp | 16 +++++++++++++
+ 2 files changed, 30 insertions(+), 12 deletions(-)
+ create mode 100644 torch/csrc/dynamo/horrible_dynamo_stub.cpp
+
+diff --git a/build_variables.bzl b/build_variables.bzl
+index b266c80e8843..a3be6893349b 100644
+--- a/build_variables.bzl
++++ b/build_variables.bzl
+@@ -140,7 +140,8 @@ core_trainer_sources = [
+ "torch/csrc/autograd/variable.cpp",
+ "torch/csrc/autograd/utils/warnings.cpp",
+ "torch/csrc/autograd/jit_decomp_interface.cpp",
+- "torch/csrc/dynamo/compiled_autograd.cpp",
++# "torch/csrc/dynamo/compiled_autograd.cpp",
++ "torch/csrc/dynamo/horrible_dynamo_stub.cpp",
+ "torch/csrc/jit/frontend/name_mangler.cpp",
+ "torch/csrc/jit/ir/type_hashing.cpp",
+ "torch/csrc/jit/serialization/pickler.cpp",
+@@ -868,17 +869,18 @@ libtorch_python_core_sources = [
+ "torch/csrc/autograd/python_torch_functions_manual.cpp",
+ "torch/csrc/autograd/python_variable.cpp",
+ "torch/csrc/autograd/python_variable_indexing.cpp",
+- "torch/csrc/dynamo/python_compiled_autograd.cpp",
+- "torch/csrc/dynamo/cache_entry.cpp",
+- "torch/csrc/dynamo/cpp_shim.cpp",
+- "torch/csrc/dynamo/cpython_defs.c",
+- "torch/csrc/dynamo/eval_frame.c",
+- "torch/csrc/dynamo/eval_frame_cpp.cpp",
+- "torch/csrc/dynamo/extra_state.cpp",
+- "torch/csrc/dynamo/framelocals_mapping.cpp",
+- "torch/csrc/dynamo/guards.cpp",
+- "torch/csrc/dynamo/utils.cpp",
+- "torch/csrc/dynamo/init.cpp",
++# "torch/csrc/dynamo/python_compiled_autograd.cpp",
++# "torch/csrc/dynamo/cache_entry.cpp",
++# "torch/csrc/dynamo/cpp_shim.cpp",
++# "torch/csrc/dynamo/cpython_defs.c",
++# "torch/csrc/dynamo/eval_frame.c",
++# "torch/csrc/dynamo/eval_frame_cpp.cpp",
++# "torch/csrc/dynamo/extra_state.cpp",
++# "torch/csrc/dynamo/framelocals_mapping.cpp",
++# "torch/csrc/dynamo/guards.cpp",
++# "torch/csrc/dynamo/utils.cpp",
++# "torch/csrc/dynamo/init.cpp",
++ "torch/csrc/dynamo/horrible_dynamo_stub.cpp",
+ "torch/csrc/functorch/init.cpp",
+ "torch/csrc/fx/node.cpp",
+ "torch/csrc/mps/Module.cpp",
+diff --git a/torch/csrc/dynamo/horrible_dynamo_stub.cpp b/torch/csrc/dynamo/horrible_dynamo_stub.cpp
+new file mode 100644
+index 000000000000..3ac1324d4557
+--- /dev/null
++++ b/torch/csrc/dynamo/horrible_dynamo_stub.cpp
+@@ -0,0 +1,16 @@
++#include <torch/csrc/dynamo/compiled_autograd.h>
++#include <stdexcept>
++
++namespace torch::dynamo::autograd {
++const std::unique_ptr<PyCompilerInterface>& getPyCompilerInterface() {
++  throw std::runtime_error("Dynamo not supported");
++  return nullptr;
++}
++std::vector<std::optional<InputMetadata>> get_input_metadata(
++    const edge_list& edges) {
++  std::vector<std::optional<InputMetadata>> r;
++  throw std::runtime_error("Dynamo not supported");
++  return r;
++}
++
++}
+--
+2.49.0
+
diff --git a/python-torch.spec b/python-torch.spec
index 91a82a9..44c1199 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -6,10 +6,10 @@
# So pre releases can be tried
%bcond_with gitcommit
%if %{with gitcommit}
-# v2.8.0-rc6
-%global commit0 f2b69a083d15e3d0083bb304302a3fd0b5fb8705
+# v2.8.0-rc8
+%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7
%global shortcommit0 %(c=%{commit0}; echo ${c:0:7})
-%global date0 20250718
+%global date0 20250723
%global pypi_version 2.8.0
%global flatbuffers_version 24.12.23
%global miniz_version 3.0.2
@@ -33,7 +33,11 @@
%endif

# For testing distributed+rccl etc.
+%if %{with gitcommit}
+%bcond_without rccl
+%else
%bcond_with rccl
+%endif
%bcond_with gloo
%bcond_without mpi
%bcond_without tensorpipe
@@ -103,13 +107,13 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{

%if %{without gitcommit}
# https://github.com/pytorch/pytorch/issues/150187
-# The hack job
-# Patch11: 0001-python-torch-disable-ck.patch
-# Cleaned up hack job
Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch
-
%else
+# https://github.com/pytorch/pytorch/issues/150187
Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch
+# https://github.com/pytorch/pytorch/issues/156595
+# Patch12: 0001-Use-horrible-dynamo-stub.patch
+Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch
%endif

ExclusiveArch: x86_64 aarch64
@@ -153,6 +157,9 @@ BuildRequires: python3dist(filelock)
BuildRequires: python3dist(jinja2)
BuildRequires: python3dist(networkx)
BuildRequires: python3dist(numpy)
+%if %{with gitcommit}
+BuildRequires: python3dist(pip)
+%endif
BuildRequires: python3dist(pyyaml)
BuildRequires: python3dist(setuptools)
BuildRequires: python3dist(sphinx)
@@ -171,6 +178,9 @@ BuildRequires: hipcub-devel
BuildRequires: hipfft-devel
BuildRequires: hiprand-devel
BuildRequires: hipsparse-devel
+%if %{with gitcommit}
+BuildRequires: hipsparselt-devel
+%endif
BuildRequires: hipsolver-devel
BuildRequires: magma-devel
BuildRequires: miopen-devel
@@ -190,6 +200,7 @@ BuildRequires: rocm-core-devel
BuildRequires: rocm-hip-devel
BuildRequires: rocm-runtime-devel
BuildRequires: rocm-rpm-macros
%if %{with gitcommit}
BuildRequires: rocsolver-devel
+BuildRequires: rocm-smi-devel
%endif
BuildRequires: rocthrust-devel
BuildRequires: roctracer-devel
@@ -337,6 +348,10 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unus
sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-deprecated-declarations@' cmake/Dependencies.cmake
# Use parallel jobs
sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel-jobs=4@' cmake/Dependencies.cmake
+%if %{with gitcommit}
+# Need to link with librocm_smi64
+sed -i -e 's@hiprtc::hiprtc@hiprtc::hiprtc rocm_smi64@' cmake/Dependencies.cmake
+%endif

# No third_party fmt, use system
sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt
@@ -590,17 +605,21 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
# pytorch uses clang, not hipcc
export HIP_CLANG_PATH=%{rocmllvm_bindir}
-%if %{?fedora} <= 43
-export PYTORCH_ROCM_ARCH="gfx1100;gfx1201"
-%else
export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default}
+
+%if %{with gitcommit}
+%pyproject_wheel
+%else
+%py3_build
%endif

%else

+%if %{with gitcommit}
+%pyproject_wheel
+%else
%py3_build
+%endif

%endif

@@ -617,17 +636,32 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
# pytorch uses clang, not hipcc
export HIP_CLANG_PATH=%{rocmllvm_bindir}
export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default}
+%if %{with gitcommit}
%pyproject_install
+%else
%py3_install
+%endif

%else

+%if %{with gitcommit}
+%pyproject_install
+%pyproject_save_files torch
+%else
%py3_install
-
+%endif
%endif
+
+
%check
+%if %{with gitcommit}
+# Not working yet
+# pyproject_check_import torch
+%else
%py3_check_import torch
+%endif

# Do not remove the empty files

From 6158e4810ccdce8ccbe332db1ac0b02e87dc4615 Mon Sep 17 00:00:00 2001
From: Fedora Release Engineering
Date: Fri, 25 Jul 2025 10:49:07 +0000
Subject: [PATCH 19/38] Rebuilt for
 https://fedoraproject.org/wiki/Fedora_43_Mass_Rebuild

From 72ad1f0389043a5f26735e3ca2a2a88398daba23 Mon Sep 17 00:00:00 2001
From: Tom Rix
Date: Sat, 26 Jul 2025 17:15:59 -0700
Subject: [PATCH 20/38] Fix some issues with switching to pyproject macros

Signed-off-by: Tom Rix
---
 python-torch.spec | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python-torch.spec b/python-torch.spec
index 44c1199..03fbf30 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -214,6 +214,7 @@ BuildRequires: google-benchmark-devel
%endif

Requires: python3dist(dill)
+Requires: python3dist(yaml)

Obsoletes: caffe = 1.0^git20200212.9b89154

@@ -638,6 +639,7 @@ export HIP_CLANG_PATH=%{rocmllvm_bindir}
export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default}
%if %{with gitcommit}
%pyproject_install
+%pyproject_save_files '*torch*'
%else
%py3_install
%endif

@@ -646,7 +648,7 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default}

%if %{with gitcommit}
%pyproject_install
-%pyproject_save_files torch
+%pyproject_save_files '*torch*'
%else
%py3_install
%endif

@@ -670,10 +672,8 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default}
%doc README.md
%{_bindir}/torchrun
%{_bindir}/torchfrtrace
-%{python3_sitearch}/%{pypi_name}
-%{python3_sitearch}/%{pypi_name}-*.egg-info
+%{python3_sitearch}/%{pypi_name}*
%{python3_sitearch}/functorch
-%{python3_sitearch}/torchgen

%changelog
%autochangelog

From cec8b79644fdf9b63ba227c506c9bccdb36b1618 Mon Sep 17 00:00:00 2001
From: Tom Rix
Date: Thu, 31 Jul 2025 05:52:50 -0700
Subject: [PATCH 21/38] Update to 2.8.0-rc8

Signed-off-by: Tom Rix
---
 .gitignore | 3 +
 0001-Add-cmake-varaible-USE_ROCM_CK.patch | 120 ------------------
 ...
0001-Add-cmake-variable-USE_ROCM_CK.patch | 0 ...and-import-torch-issues-for-cpython-.patch | 0 python-torch.spec | 59 ++------- sources | 3 + 6 files changed, 17 insertions(+), 168 deletions(-) delete mode 100644 0001-Add-cmake-varaible-USE_ROCM_CK.patch rename next/0001-Add-cmake-variable-USE_ROCM_CK.patch => 0001-Add-cmake-variable-USE_ROCM_CK.patch (100%) rename next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch => 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch (100%) diff --git a/.gitignore b/.gitignore index 25abff5..a4ed35b 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,6 @@ /pytorch-v2.5.1.tar.gz /pytorch-v2.7.0.tar.gz /v2.13.6.tar.gz +/pytorch-a1cb3cc.tar.gz +/v24.12.23.tar.gz +/kineto-5e75018.tar.gz diff --git a/0001-Add-cmake-varaible-USE_ROCM_CK.patch b/0001-Add-cmake-varaible-USE_ROCM_CK.patch deleted file mode 100644 index b34e07a..0000000 --- a/0001-Add-cmake-varaible-USE_ROCM_CK.patch +++ /dev/null @@ -1,120 +0,0 @@ -From 0f33e0a7bbd1522ee74f8fc1fbe3af7563318c79 Mon Sep 17 00:00:00 2001 -From: Tom Rix -Date: Fri, 28 Mar 2025 15:33:09 -0700 -Subject: [PATCH] Add cmake varaible USE_ROCM_CK - -To control the use of ROCm Composable Kernel usage. - -CK is not compatible with all rocBLAS gpu's, so the user -must explicitly choose to use CK. - -Signed-off-by: Tom Rix ---- - CMakeLists.txt | 1 + - aten/src/ATen/CMakeLists.txt | 8 ++++++-- - aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- - cmake/Dependencies.cmake | 3 +++ - 4 files changed, 15 insertions(+), 7 deletions(-) - -diff --git a/CMakeLists.txt b/CMakeLists.txt -index f3fee2f7ffc2..73903acce452 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -249,6 +249,7 @@ cmake_dependent_option( - BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON - "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) - cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) -+cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) - option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) - cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) - cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF -diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt -index 085af373ec22..af268ab88572 100644 ---- a/aten/src/ATen/CMakeLists.txt -+++ b/aten/src/ATen/CMakeLists.txt -@@ -361,13 +361,17 @@ endif() - ${native_quantized_hip_hip} - ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} - ) -- if(WIN32) # Windows doesn't support Composable Kernels and Triton -+ if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels and Triton - file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") - file(GLOB native_hip_ck "native/hip/ck*.hip") - exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" -- ${native_hip_bgemm} ${native_hip_ck} -+ ${native_hip_bgemm} ${native_hip_ck}) -+ endif() -+ if(WIN32) # Windows doesn't support Composable Kernels and Triton -+ exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" - ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) - endif() -+ - # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) - list(APPEND all_hip_cpp - ${native_nested_hip_cpp} -diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index a62b028fd4ff..a3dbf76848ea 100644 ---- a/aten/src/ATen/cuda/CUDABlas.cpp -+++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) - if 
(at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); - } -@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); - #endif - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); - } -@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); - } -@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); - } -@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) - if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { - gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } --#ifdef USE_ROCM -+#ifdef USE_ROCM_CK - else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); - } -diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake -index 30917bdf39f5..2ca6091030f1 100644 ---- a/cmake/Dependencies.cmake -+++ b/cmake/Dependencies.cmake -@@ -1046,6 +1046,9 @@ if(USE_ROCM) - if(HIPBLASLT_VEC_EXT) - list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) - endif() -+ if(USE_ROCM_CK) -+ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) -+ endif() - list(APPEND HIP_HIPCC_FLAGS --offload-compress) - if(WIN32) - add_definitions(-DROCM_ON_WINDOWS) --- -2.48.1 - diff --git a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch b/0001-Add-cmake-variable-USE_ROCM_CK.patch similarity index 100% rename from next/0001-Add-cmake-variable-USE_ROCM_CK.patch rename to 0001-Add-cmake-variable-USE_ROCM_CK.patch diff --git a/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch b/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch similarity index 100% rename from next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch rename to 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch diff --git a/python-torch.spec b/python-torch.spec index 03fbf30..1fbad8e 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -15,8 +15,11 @@ %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %else -%global pypi_version 2.7.0 -%global flatbuffers_version 23.3.3 +%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7 +%global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) +%global date0 20250723 +%global pypi_version 2.8.0 +%global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 %endif @@ -33,11 +36,7 @@ %endif # For testing distributed+rccl etc. 
-%if %{with gitcommit} %bcond_without rccl -%else -%bcond_with rccl -%endif %bcond_with gloo %bcond_without mpi %bcond_without tensorpipe @@ -56,7 +55,7 @@ Name: python-%{pypi_name} %if %{with gitcommit} Version: %{pypi_version}^git%{date0}.%{shortcommit0} %else -Version: %{pypi_version} +Version: %{pypi_version}.rc8 %endif Release: %autorelease Summary: PyTorch AI/ML framework @@ -68,7 +67,8 @@ URL: https://pytorch.org/ Source0: %{forgeurl}/archive/%{commit0}/pytorch-%{shortcommit0}.tar.gz Source1000: pyproject.toml %else -Source0: %{forgeurl}/releases/download/v%{version}/pytorch-v%{version}.tar.gz +Source0: %{forgeurl}/archive/%{commit0}/pytorch-%{shortcommit0}.tar.gz +Source1000: pyproject.toml %endif Source1: https://github.com/google/flatbuffers/archive/refs/tags/v%{flatbuffers_version}.tar.gz Source2: https://github.com/pybind/pybind11/archive/refs/tags/v%{pybind11_version}.tar.gz @@ -96,25 +96,16 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- %endif %if %{without kineto} -%if %{with gitcommit} %global ki_commit 5e7501833f1021ce6f618572d3baf657b6319658 -%else -%global ki_commit be1317644c68b4bfc4646024a6b221066e430031 -%endif %global ki_scommit %(c=%{ki_commit}; echo ${c:0:7}) Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif -%if %{without gitcommit} -# https://github.com/pytorch/pytorch/issues/150187 -Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch -%else # https://github.com/pytorch/pytorch/issues/150187 Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch # https://github.com/pytorch/pytorch/issues/156595 # Patch12: 0001-Use-horrible-dynamo-stub.patch Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch -%endif ExclusiveArch: x86_64 aarch64 %global toolchain gcc @@ -157,9 +148,7 @@ BuildRequires: python3dist(filelock) BuildRequires: python3dist(jinja2) BuildRequires: python3dist(networkx) BuildRequires: python3dist(numpy) -%if %{with gitcommit} BuildRequires: python3dist(pip) -%endif BuildRequires: python3dist(pyyaml) BuildRequires: python3dist(setuptools) BuildRequires: python3dist(sphinx) @@ -178,9 +167,7 @@ BuildRequires: hipcub-devel BuildRequires: hipfft-devel BuildRequires: hiprand-devel BuildRequires: hipsparse-devel -%if %{with gitcommit} BuildRequires: hipsparselt-devel -%endif BuildRequires: hipsolver-devel BuildRequires: magma-devel BuildRequires: miopen-devel @@ -198,10 +185,8 @@ BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros -%if %{with gitcommit} BuildRequires: rocsolver-devel BuildRequires: rocm-smi-devel -%endif BuildRequires: rocthrust-devel BuildRequires: roctracer-devel @@ -275,7 +260,9 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release} cp %{SOURCE1000} . %else -%autosetup -p1 -n pytorch-v%{version} +%autosetup -p1 -n pytorch-%{commit0} +# Overwrite with a git checkout of the pyproject.toml +cp %{SOURCE1000} . 
%endif # Remove bundled egg-info @@ -349,10 +336,8 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unus sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-deprecated-declarations@' cmake/Dependencies.cmake # Use parallel jobs sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel-jobs=4@' cmake/Dependencies.cmake -%if %{with gitcommit} # Need to link with librocm_smi64 sed -i -e 's@hiprtc::hiprtc@hiprtc::hiprtc rocm_smi64@' cmake/Dependencies.cmake -%endif # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt @@ -449,9 +434,7 @@ mv googletest third_party # # Fake out pocketfft, and system header will be used mkdir third_party/pocketfft -%if %{with gitcommit} cp /usr/include/pocketfft_hdronly.h third_party/pocketfft/ -%endif # # Use the system valgrind headers @@ -608,19 +591,11 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%if %{with gitcommit} %pyproject_wheel -%else -%py3_build -%endif %else -%if %{with gitcommit} %pyproject_wheel -%else -%py3_build -%endif %endif @@ -637,33 +612,21 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%if %{with gitcommit} %pyproject_install %pyproject_save_files '*torch*' -%else -%py3_install -%endif %else -%if %{with gitcommit} %pyproject_install %pyproject_save_files '*torch*' -%else -%py3_install -%endif %endif %check -%if %{with gitcommit} # Not working yet # pyproject_check_import torch -%else -%py3_check_import torch -%endif # Do not remote the empty files diff --git a/sources b/sources index 4021d40..c7eae22 100644 --- a/sources +++ b/sources @@ -7,3 +7,6 @@ SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3 SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7 SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a +SHA512 (pytorch-a1cb3cc.tar.gz) = 92bf8b2c2ef0b459406b60169ecebdc50652c75943e3d6087e4d261f6e308dbad365529561e0f07ea3f0b71790efb68b5e4ab2f44e270462097208d924dc2d95 +SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185c9ef602294b67d73aea7ae371035579a1419cbfbeba7c3e88b31b5a5848db98f5e8a03b982b1 +SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 From eaa9838b3ced49bdfa6d8c094d4e72cbb2406ec4 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 4 Aug 2025 16:23:41 -0700 Subject: [PATCH 22/38] Change a couple cmake mins Signed-off-by: Tom Rix --- python-torch.spec | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python-torch.spec b/python-torch.spec index 1fbad8e..e3cfb6d 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -449,6 +449,11 @@ sed -i -e 's@list(APPEND Caffe2_DEPENDENCY_LIBS foxi_loader)@#list(APPEND Caffe2 # cmake version changed sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' 
third_party/tensorpipe/third_party/libuv/CMakeLists.txt
+sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION 3.5)@' libuv*/CMakeLists.txt
+%if %{without opentelemetry}
+sed -i -e 's@cmake_minimum_required(VERSION 3.1)@cmake_minimum_required(VERSION 3.5)@' third_party/opentelemetry-cpp/CMakeLists.txt
+%endif
+

%if %{with rocm}
# hipify
./tools/amd_build/build_amd.py

From d67e1e127ac97f1557e8c2ea8b392318ded073f0 Mon Sep 17 00:00:00 2001
From: Tom Rix
Date: Fri, 8 Aug 2025 14:00:11 -0700
Subject: [PATCH 23/38] Update to 2.8.0

Signed-off-by: Tom Rix
---
 .gitignore | 1 +
 python-torch.spec | 16 +++++-----------
 sources | 1 +
 3 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index a4ed35b..7832594 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,4 @@
 /pytorch-a1cb3cc.tar.gz
 /v24.12.23.tar.gz
 /kineto-5e75018.tar.gz
+/pytorch-v2.8.0.tar.gz
diff --git a/python-torch.spec b/python-torch.spec
index e3cfb6d..442480f 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -15,9 +15,6 @@
%global miniz_version 3.0.2
%global pybind11_version 2.13.6
%else
-%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7
-%global shortcommit0 %(c=%{commit0}; echo ${c:0:7})
-%global date0 20250723
%global pypi_version 2.8.0
%global flatbuffers_version 24.12.23
%global miniz_version 3.0.2
@@ -55,7 +52,7 @@
Name: python-%{pypi_name}
%if %{with gitcommit}
Version: %{pypi_version}^git%{date0}.%{shortcommit0}
%else
-Version: %{pypi_version}.rc8
+Version: %{pypi_version}
%endif
Release: %autorelease
Summary: PyTorch AI/ML framework
@@ -67,8 +64,7 @@ URL: https://pytorch.org/
Source0: %{forgeurl}/archive/%{commit0}/pytorch-%{shortcommit0}.tar.gz
Source1000: pyproject.toml
%else
-Source0: %{forgeurl}/archive/%{commit0}/pytorch-%{shortcommit0}.tar.gz
-Source1000: pyproject.toml
+Source0: %{forgeurl}/releases/download/v%{version}/pytorch-v%{version}.tar.gz
%endif
Source1: https://github.com/google/flatbuffers/archive/refs/tags/v%{flatbuffers_version}.tar.gz
Source2: https://github.com/pybind/pybind11/archive/refs/tags/v%{pybind11_version}.tar.gz
@@ -260,9 +256,7 @@ Requires: python3-%{pypi_name}%{?_isa} = %{version}-%{release}
cp %{SOURCE1000} .

%else
-%autosetup -p1 -n pytorch-%{commit0}
-# Overwrite with a git checkout of the pyproject.toml
-cp %{SOURCE1000} .
+%autosetup -p1 -n pytorch-v%{version} %endif # Remove bundled egg-info @@ -310,8 +304,8 @@ rm -rf third_party/kineto/* cp -r kineto-*/* third_party/kineto/ %endif -# hipblaslt only building with gfx90a -sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a"@' aten/src/ATen/native/cuda/Blas.cpp +# Adjust for the hipblaslt's we build +sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a", "gfx1103", "gfx1150", "gfx1151", "gfx1100", "gfx1101", "gfx1200", "gfx1201"@' aten/src/ATen/native/cuda/Blas.cpp %if 0%{?rhel} # In RHEL but too old diff --git a/sources b/sources index c7eae22..335b8a8 100644 --- a/sources +++ b/sources @@ -10,3 +10,4 @@ SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b SHA512 (pytorch-a1cb3cc.tar.gz) = 92bf8b2c2ef0b459406b60169ecebdc50652c75943e3d6087e4d261f6e308dbad365529561e0f07ea3f0b71790efb68b5e4ab2f44e270462097208d924dc2d95 SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185c9ef602294b67d73aea7ae371035579a1419cbfbeba7c3e88b31b5a5848db98f5e8a03b982b1 SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 +SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 From 1b986b49932fabd64e4dad05277398d9629a57d3 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 14 Aug 2025 09:10:18 -0700 Subject: [PATCH 24/38] Build on EPEL Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 57 ++++++++++++++++++++++++++++++++++------------- sources | 1 + 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 7832594..5fda907 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,4 @@ /v24.12.23.tar.gz /kineto-5e75018.tar.gz /pytorch-v2.8.0.tar.gz +/v1.18.0.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 442480f..6034593 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -48,6 +48,12 @@ %bcond_with httplib %bcond_with kineto +%if 0%{?fedora} +%bcond_without onnx +%else +%bcond_with onnx +%endif + Name: python-%{pypi_name} %if %{with gitcommit} Version: %{pypi_version}^git%{date0}.%{shortcommit0} @@ -97,6 +103,11 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif +%if %{without onnx} +%global ox_ver 1.18.0 +Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz +%endif + # https://github.com/pytorch/pytorch/issues/150187 Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch # https://github.com/pytorch/pytorch/issues/156595 @@ -123,7 +134,9 @@ BuildRequires: json-devel BuildRequires: libomp-devel BuildRequires: numactl-devel BuildRequires: ninja-build +%if %{with onnx} BuildRequires: onnx-devel +%endif %if %{with mpi} BuildRequires: openmpi-devel %endif @@ -304,6 +317,12 @@ rm -rf third_party/kineto/* cp -r kineto-*/* third_party/kineto/ %endif +%if %{without onnx} +tar xf %{SOURCE90} +rm -rf third_party/onnx/* +cp -r onnx-*/* third_party/onnx/ +%endif + # Adjust for the hipblaslt's we build sed -i -e 's@"gfx90a", "gfx940", "gfx941", "gfx942"@"gfx90a", "gfx1103", "gfx1150", "gfx1151", "gfx1100", "gfx1101", "gfx1200", "gfx1201"@' aten/src/ATen/native/cuda/Blas.cpp @@ -393,6 +412,10 @@ mv third_party/cpp-httplib . mv third_party/kineto . %endif +%if %{without onnx} +mv third_party/onnx . 
+%endif + %if %{with test} mv third_party/googletest . %endif @@ -421,6 +444,10 @@ mv cpp-httplib third_party mv kineto third_party %endif +%if %{without onnx} +mv onnx third_party +%endif + %if %{with test} mv googletest third_party %endif @@ -448,7 +475,6 @@ sed -i -e 's@cmake_minimum_required(VERSION 3.4)@cmake_minimum_required(VERSION sed -i -e 's@cmake_minimum_required(VERSION 3.1)@cmake_minimum_required(VERSION 3.5)@' third_party/opentelemetry-cpp/CMakeLists.txt %endif - %if %{with rocm} # hipify ./tools/amd_build/build_amd.py @@ -534,7 +560,9 @@ export USE_PYTORCH_QNNPACK=OFF export USE_ROCM=OFF export USE_SYSTEM_SLEEF=ON export USE_SYSTEM_EIGEN_INSTALL=ON +%if %{with onnx} export USE_SYSTEM_ONNX=ON +%endif export USE_SYSTEM_PYBIND11=OFF export USE_SYSTEM_LIBS=OFF export USE_SYSTEM_NCCL=OFF @@ -575,7 +603,6 @@ export BUILD_TEST=ON # # See BZ 2244862 - %if %{with rocm} export USE_ROCM=ON @@ -590,14 +617,15 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%pyproject_wheel - -%else - -%pyproject_wheel - %endif +%if 0%{?fedora} +%pyproject_wheel +%else +%py3_build +%endif + + %install %if %{with rocm} @@ -611,16 +639,15 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode # pytorch uses clang, not hipcc export HIP_CLANG_PATH=%{rocmllvm_bindir} export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} -%pyproject_install -%pyproject_save_files '*torch*' - -%else - -%pyproject_install -%pyproject_save_files '*torch*' %endif +%if 0%{?fedora} +%pyproject_install +%pyproject_save_files '*torch*' +%else +%py3_install +%endif %check diff --git a/sources b/sources index 335b8a8..5c15ab4 100644 --- a/sources +++ b/sources @@ -11,3 +11,4 @@ SHA512 (pytorch-a1cb3cc.tar.gz) = 92bf8b2c2ef0b459406b60169ecebdc50652c75943e3d6 SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185c9ef602294b67d73aea7ae371035579a1419cbfbeba7c3e88b31b5a5848db98f5e8a03b982b1 SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 +SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d From a6dcc4b8d8fd8903c09cc93f87a6bc2aff357bdf Mon Sep 17 00:00:00 2001 From: Python Maint Date: Fri, 15 Aug 2025 15:02:18 +0200 Subject: [PATCH 25/38] Rebuilt for Python 3.14.0rc2 bytecode From 95f1f6fe22c05bf4a53479100fdfbaf48e1e90c3 Mon Sep 17 00:00:00 2001 From: Python Maint Date: Fri, 19 Sep 2025 14:37:44 +0200 Subject: [PATCH 26/38] Rebuilt for Python 3.14.0rc3 bytecode From 89daf765fd18ce8d6ecf40a8c6f8fe275c542091 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 26 Sep 2025 14:24:46 -0700 Subject: [PATCH 27/38] Disable magma Magma is broken on ROCm 7. 
Signed-off-by: Tom Rix --- python-torch.spec | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 6034593..210aee2 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -178,7 +178,8 @@ BuildRequires: hiprand-devel BuildRequires: hipsparse-devel BuildRequires: hipsparselt-devel BuildRequires: hipsolver-devel -BuildRequires: magma-devel +# Magma is broken on ROCm 7 +# BuildRequires: magma-devel BuildRequires: miopen-devel BuildRequires: rocblas-devel BuildRequires: rocrand-devel @@ -607,7 +608,8 @@ export BUILD_TEST=ON export USE_ROCM=ON export USE_ROCM_CK=OFF -export USE_MAGMA=ON +# Magma is broken on ROCm 7 +# export USE_MAGMA=ON export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` From f29cac5d83e262d22372de0576f7e89b85abab3e Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 30 Sep 2025 07:35:43 -0700 Subject: [PATCH 28/38] Update to 2.9.0-rc4 Work around ROCm 7 build issue in 2.8.0 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 34 ++++++++++++++++++++++++++++------ sources | 1 + 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 5fda907..0986c30 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ /kineto-5e75018.tar.gz /pytorch-v2.8.0.tar.gz /v1.18.0.tar.gz +/pytorch-715dca6.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 210aee2..e25a665 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -4,13 +4,13 @@ %global forgeurl https://github.com/pytorch/pytorch # So pre releases can be tried -%bcond_with gitcommit +%bcond_without gitcommit %if %{with gitcommit} -# v2.8.0-rc8 -%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7 +# v2.9.0-rc4 +%global commit0 715dca672526a20322d07c2e67772cfe4400a20f %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250723 -%global pypi_version 2.8.0 +%global date0 20250923 +%global pypi_version 2.9.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 @@ -108,11 +108,13 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz %endif +%if %{with gitcommit} +%else # https://github.com/pytorch/pytorch/issues/150187 Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch # https://github.com/pytorch/pytorch/issues/156595 -# Patch12: 0001-Use-horrible-dynamo-stub.patch Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch +%endif ExclusiveArch: x86_64 aarch64 %global toolchain gcc @@ -200,6 +202,10 @@ BuildRequires: rocm-smi-devel BuildRequires: rocthrust-devel BuildRequires: roctracer-devel +%if %{with gitcommit} +BuildRequires: moodycamel-concurrentqueue-devel +%endif + Requires: amdsmi %endif @@ -492,6 +498,13 @@ sed -i -e 's@HIP 1.0@HIP MODULE@' cmake/public/LoadHIP.cmake # silence an assert # sed -i -e '/qvalue = std::clamp(qvalue, qmin, qmax);/d' aten/src/ATen/native/cuda/IndexKernel.cu +%endif + +%if %{with gitcommit} +# moodycamel include path needs adjusting to use the system's +sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/concurrentqueue@' cmake/Dependencies.cmake + + %endif %build @@ -607,7 +620,14 @@ export BUILD_TEST=ON %if %{with rocm} export USE_ROCM=ON +%if %{with gitcommit} +export USE_ROCM_CK_SDPA=OFF +export USE_ROCM_CK_GEMM=OFF +export USE_FBGEMM_GENAI=OFF +%else export USE_ROCM_CK=OFF +%endif + # Magma is broken on 
ROCm 7 # export USE_MAGMA=ON export HIP_PATH=`hipconfig -p` @@ -662,7 +682,9 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %license LICENSE %doc README.md %{_bindir}/torchrun +%if %{without gitcommit} %{_bindir}/torchfrtrace +%endif %{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch diff --git a/sources b/sources index 5c15ab4..0fdf299 100644 --- a/sources +++ b/sources @@ -12,3 +12,4 @@ SHA512 (v24.12.23.tar.gz) = f97762ba41b9cfef648e93932fd789324c6bb6ebc5b7aeca8185 SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab41557c5d026ada5d1f6365e4ed0a0c6804057c52e92510749fc58619f554a164c1ba9d8cd13e789bebd0 SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d +SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 From 1509cbcd60c545c5b19d00033c4538f6763bf4cc Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 6 Oct 2025 14:27:41 -0700 Subject: [PATCH 29/38] Update to 2.9.0-rc6 aarch64 is not building, so disable. Signed-off-by: Tom Rix --- .gitignore | 1 + pyproject.toml | 139 ++++++++++++++++++++++++++++++++++------------ python-torch.spec | 11 ++-- sources | 1 + 4 files changed, 112 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index 0986c30..2918194 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ /pytorch-v2.8.0.tar.gz /v1.18.0.tar.gz /pytorch-715dca6.tar.gz +/pytorch-fd36458.tar.gz diff --git a/pyproject.toml b/pyproject.toml index ccf9c2a..925742b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,49 +1,105 @@ +# Package ###################################################################### + +[build-system] +requires = [ + # 70.1.0: min version for integrated bdist_wheel command from wheel package + # 77.0.0: min version for SPDX expression support for project.license + "setuptools>=70.1.0,<80.0", + "cmake>=3.27", + "ninja", + "numpy", + "packaging", + "pyyaml", + "requests", + "six", # dependency chain: NNPACK -> PeachPy -> six + "typing-extensions>=4.10.0", +] +build-backend = "setuptools.build_meta" + +[dependency-groups] +dev = [ + # This list should be kept in sync with the requirements-build.txt + # in PyTorch root until the project fully migrates to pyproject.toml + # after which this can be removed as it is already specified in the + # [build-system] section + "setuptools>=70.1.0,<80.0", # setuptools develop deprecated on 80.0 + "cmake>=3.27", + "ninja", + "numpy", + "packaging", + "pyyaml", + "requests", + "six", # dependency chain: NNPACK -> PeachPy -> six + "typing-extensions>=4.10.0", + + # This list should be kept in sync with the requirements.txt in + # PyTorch root until the project fully migrates to pyproject.toml + "build[uv]", + "expecttest>=0.3.0", + "filelock", + "fsspec>=0.8.5", + "hypothesis", + "jinja2", + "lintrunner; platform_machine != 's390x' and platform_machine != 'riscv64'", + "networkx>=2.5.1", + "optree>=0.13.0", + "psutil", + "sympy>=1.13.3", + "typing-extensions>=4.13.2", + "wheel", +] + [project] name = "torch" -requires-python = ">=3.9" -license = {text = "BSD-3-Clause"} +description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" +readme = "README.md" +requires-python = ">=3.10" 
+# TODO: change to `license = "BSD-3-Clause"` and enable PEP 639 after pinning setuptools>=77 +# FIXME: As of 2025.06.20, it is hard to ensure the minimum version of setuptools in our CI environment. +# TOML-table-based license deprecated in setuptools>=77, and the deprecation warning will be changed +# to an error on 2026.02.18. See also: https://github.com/pypa/setuptools/issues/4903 +license = { text = "BSD-3-Clause" } +authors = [{ name = "PyTorch Team", email = "packages@pytorch.org" }] +keywords = ["pytorch", "machine learning"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: C++", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] dynamic = [ - "authors", - "classifiers", "entry-points", "dependencies", - "description", - "keywords", - "optional-dependencies", - "readme", "scripts", "version", ] [project.urls] -Homepage = "https://pytorch.org/" -Documentation = "https://pytorch.org/docs/" -Source = "https://github.com/pytorch/pytorch" -Forum = "https://discuss.pytorch.org/" +Homepage = "https://pytorch.org" +Repository = "https://github.com/pytorch/pytorch" +Documentation = "https://pytorch.org/docs" +"Issue Tracker" = "https://github.com/pytorch/pytorch/issues" +Forum = "https://discuss.pytorch.org" +[project.optional-dependencies] +optree = ["optree>=0.13.0"] +opt-einsum = ["opt-einsum>=3.3"] +pyyaml = ["pyyaml"] -[build-system] -requires = [ - # After 75.8.2 dropped dep disttools API. Please fix - # API temporarily restored and shim used. Please fix - # Setuptools will drop support for setup.py past 80 - # min version for recursive glob package data support - "setuptools>=62.3.0,<80.0", - "wheel", - "astunparse", - "numpy", - "ninja", - "pyyaml", - "cmake", - "typing-extensions>=4.10.0", - "requests", -] -# Use legacy backend to import local packages in setup.py -build-backend = "setuptools.build_meta:__legacy__" - - -[tool.black] -line-length = 88 +# Linter tools ################################################################# [tool.isort] src_paths = ["caffe2", "torch", "torchgen", "functorch", "test"] @@ -59,12 +115,10 @@ multi_line_output = 3 include_trailing_comma = true combine_as_imports = true - [tool.usort.known] first_party = ["caffe2", "torch", "torchgen", "functorch", "test"] standard_library = ["typing_extensions"] - [tool.ruff] line-length = 88 src = ["caffe2", "torch", "torchgen", "functorch", "test"] @@ -105,6 +159,7 @@ ignore = [ "E741", "EXE001", "F405", + "FURB122", # writelines # these ignores are from flake8-logging-format; please fix! "G101", # these ignores are from ruff NPY; please fix! 
@@ -127,7 +182,15 @@ ignore = [ "SIM117", "SIM118", "UP007", # keep-runtime-typing + "UP045", # keep-runtime-typing "TC006", + # TODO: Remove Python-3.10 specific suppressions + "B905", + "UP035", + "UP036", + "UP038", + "UP041", + "FURB161", ] select = [ "B", @@ -208,6 +271,10 @@ select = [ "YTT", ] +[tool.ruff.lint.pyupgrade] +# Preserve types, even if a file imports `from __future__ import annotations`. +keep-runtime-typing = true + [tool.ruff.lint.per-file-ignores] "__init__.py" = [ "F401", diff --git a/python-torch.spec b/python-torch.spec index e25a665..3dad39d 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,14 +6,15 @@ # So pre releases can be tried %bcond_without gitcommit %if %{with gitcommit} -# v2.9.0-rc4 -%global commit0 715dca672526a20322d07c2e67772cfe4400a20f +# v2.9.0-rc6 +%global commit0 fd364580a94079854f2f32d463c118afaefe62e0 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250923 +%global date0 20251002 %global pypi_version 2.9.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 +%global rc_tag -rc6 %else %global pypi_version 2.8.0 %global flatbuffers_version 24.12.23 @@ -116,7 +117,9 @@ Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch %endif -ExclusiveArch: x86_64 aarch64 +# ExclusiveArch: x86_64 aarch64 +# aarch64 not building on 2.9.0-rc6 +ExclusiveArch: x86_64 %global toolchain gcc %global _lto_cflags %nil diff --git a/sources b/sources index 0fdf299..701e6b4 100644 --- a/sources +++ b/sources @@ -13,3 +13,4 @@ SHA512 (kineto-5e75018.tar.gz) = 921b96a56e01d69895b79e67582d8977ed6f873573ab415 SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60771d9841dfb691f67d905ba1330a800008efe7c938b6c69bdc52232bccfe8d4860e795a532cd69d28 SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 +SHA512 (pytorch-fd36458.tar.gz) = acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc From b615a5f89b4a35fc47b600ed1c75ebd5e3a21863 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 13 Oct 2025 10:17:32 -0700 Subject: [PATCH 30/38] Update to 2.9.0-rc9 Introduce pytorch-rpm-macros package. 
Add %pytorch_arches to the macros, set to aarch64 and x86_64 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 43 ++++++++++++++++++++++++++++++++----------- sources | 1 + 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 2918194..2dab732 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ /v1.18.0.tar.gz /pytorch-715dca6.tar.gz /pytorch-fd36458.tar.gz +/pytorch-0fabc3b.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 3dad39d..f7f7f0c 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,15 +6,15 @@ # So pre releases can be tried %bcond_without gitcommit %if %{with gitcommit} -# v2.9.0-rc6 -%global commit0 fd364580a94079854f2f32d463c118afaefe62e0 +# v2.9.0-rc9 +%global commit0 0fabc3ba44823f257e70ce397d989c8de5e362c1 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20251002 +%global date0 20251008 %global pypi_version 2.9.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 -%global rc_tag -rc6 +%global rc_tag -rc9 %else %global pypi_version 2.8.0 %global flatbuffers_version 24.12.23 @@ -117,9 +117,13 @@ Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch %endif -# ExclusiveArch: x86_64 aarch64 -# aarch64 not building on 2.9.0-rc6 -ExclusiveArch: x86_64 +%if 0%{?fedora} >= 45 +# drop aarch64 in 45 +%global pt_arches x86_64 +%else +%global pt_arches x86_64 aarch64 +%endif +ExclusiveArch: %pt_arches %global toolchain gcc %global _lto_cflags %nil @@ -137,6 +141,9 @@ BuildRequires: gloo-devel BuildRequires: json-devel BuildRequires: libomp-devel +%if %{with gitcommit} +BuildRequires: moodycamel-concurrentqueue-devel +%endif BuildRequires: numactl-devel BuildRequires: ninja-build %if %{with onnx} @@ -205,10 +212,6 @@ BuildRequires: rocm-smi-devel BuildRequires: rocthrust-devel BuildRequires: roctracer-devel -%if %{with gitcommit} -BuildRequires: moodycamel-concurrentqueue-devel -%endif - Requires: amdsmi %endif @@ -261,6 +264,14 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. +%package -n pytorch-rpm-macros +Summary: PyTorch RPM macros +BuildArch: noarch + +%description -n pytorch-rpm-macros +This package contains PyTorch related RPM macros. 
+ + %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -512,6 +523,9 @@ sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/conc %build +# Export the arches +echo "%%pytorch_arches %pt_arches" > macros.pytorch + # # Control the number of jobs # @@ -653,6 +667,10 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %install +# pytorch rpm macros +install -Dpm 644 macros.pytorch \ + %{buildroot}%{_rpmmacrodir}/macros.pytorch + %if %{with rocm} export USE_ROCM=ON export USE_ROCM_CK=OFF @@ -691,6 +709,9 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch +%files -n pytorch-rpm-macros +%{_rpmmacrodir}/macros.pytorch + %changelog %autochangelog diff --git a/sources b/sources index 701e6b4..89e2a95 100644 --- a/sources +++ b/sources @@ -14,3 +14,4 @@ SHA512 (pytorch-v2.8.0.tar.gz) = 791e658eab87fb957f025558cb9f925078d2426ab7b6f60 SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e77c1d697363c26413413ddda8eabc545892fb1bcb43afc7e93e62f0901527524a2727e1ea8d SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 SHA512 (pytorch-fd36458.tar.gz) = acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc +SHA512 (pytorch-0fabc3b.tar.gz) = 2e87975de0bf6f3dcede168b379e1928712bca16170c2a8ee7d63459f53086c01baac05e0763e4d5d28cdaf1c7d8912225ee06adeff96ead4f6f456ee174b341 From ef01976cf4687ab7bce860c23297cf9b6d105940 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Wed, 15 Oct 2025 16:27:01 -0700 Subject: [PATCH 31/38] Update to 2.9.0 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 25 ++----------------------- sources | 1 + 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 2dab732..444b9ca 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,4 @@ /pytorch-715dca6.tar.gz /pytorch-fd36458.tar.gz /pytorch-0fabc3b.tar.gz +/pytorch-v2.9.0.tar.gz diff --git a/python-torch.spec b/python-torch.spec index f7f7f0c..0fd7ebd 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -4,7 +4,7 @@ %global forgeurl https://github.com/pytorch/pytorch # So pre releases can be tried -%bcond_without gitcommit +%bcond_with gitcommit %if %{with gitcommit} # v2.9.0-rc9 %global commit0 0fabc3ba44823f257e70ce397d989c8de5e362c1 @@ -16,7 +16,7 @@ %global pybind11_version 2.13.6 %global rc_tag -rc9 %else -%global pypi_version 2.8.0 +%global pypi_version 2.9.0 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 @@ -109,14 +109,6 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz %endif -%if %{with gitcommit} -%else -# https://github.com/pytorch/pytorch/issues/150187 -Patch11: 0001-Add-cmake-variable-USE_ROCM_CK.patch -# https://github.com/pytorch/pytorch/issues/156595 -Patch12: 0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch -%endif - %if 0%{?fedora} >= 45 # drop aarch64 in 45 %global pt_arches x86_64 @@ -141,9 +133,7 @@ BuildRequires: gloo-devel BuildRequires: json-devel BuildRequires: libomp-devel -%if %{with gitcommit} BuildRequires: moodycamel-concurrentqueue-devel -%endif BuildRequires: numactl-devel BuildRequires: ninja-build %if %{with onnx} @@ -514,13 +504,9 @@ sed -i -e 's@HIP 1.0@HIP 
MODULE@' cmake/public/LoadHIP.cmake %endif -%if %{with gitcommit} # moodycamel include path needs adjusting to use the system's sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/concurrentqueue@' cmake/Dependencies.cmake - -%endif - %build # Export the arches @@ -637,13 +623,9 @@ export BUILD_TEST=ON %if %{with rocm} export USE_ROCM=ON -%if %{with gitcommit} export USE_ROCM_CK_SDPA=OFF export USE_ROCM_CK_GEMM=OFF export USE_FBGEMM_GENAI=OFF -%else -export USE_ROCM_CK=OFF -%endif # Magma is broken on ROCm 7 # export USE_MAGMA=ON @@ -703,9 +685,6 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %license LICENSE %doc README.md %{_bindir}/torchrun -%if %{without gitcommit} -%{_bindir}/torchfrtrace -%endif %{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch diff --git a/sources b/sources index 89e2a95..05c1f96 100644 --- a/sources +++ b/sources @@ -15,3 +15,4 @@ SHA512 (v1.18.0.tar.gz) = 2f38664947c8d1efc40620a7c1b1953d2aa4b0a37b67c4886b86e7 SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2cb066021823abeb8d27c340993088e01d8e55bb37ed5f94334ec31e6c539cddfacbad157abd27c5e907 SHA512 (pytorch-fd36458.tar.gz) = acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc SHA512 (pytorch-0fabc3b.tar.gz) = 2e87975de0bf6f3dcede168b379e1928712bca16170c2a8ee7d63459f53086c01baac05e0763e4d5d28cdaf1c7d8912225ee06adeff96ead4f6f456ee174b341 +SHA512 (pytorch-v2.9.0.tar.gz) = ae989e3a7fe30f9ea90944dc25e21ca92f2a94ee40d8de974a168c292d82c16ee8920624eff91a85755469ad05473dce0f85893e3ed7794ec5c6bdd89cbd2023 From 741c412249afb0dabfb37cdc6f3bbd05ce3ec176 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 17 Oct 2025 10:05:50 -0700 Subject: [PATCH 32/38] Remove pytorch-rpm-macros package. This does not work when building on a general arch Signed-off-by: Tom Rix --- python-torch.spec | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index 0fd7ebd..b623dc6 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -254,14 +254,6 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. -%package -n pytorch-rpm-macros -Summary: PyTorch RPM macros -BuildArch: noarch - -%description -n pytorch-rpm-macros -This package contains PyTorch related RPM macros. 
- - %if %{with test} %package -n python3-%{pypi_name}-test Summary: Tests for %{name} @@ -510,7 +502,7 @@ sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/conc %build # Export the arches -echo "%%pytorch_arches %pt_arches" > macros.pytorch +# echo "%%pytorch_arches %pt_arches" > macros.pytorch # # Control the number of jobs @@ -650,8 +642,8 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %install # pytorch rpm macros -install -Dpm 644 macros.pytorch \ - %{buildroot}%{_rpmmacrodir}/macros.pytorch +# install -Dpm 644 macros.pytorch \ +# %{buildroot}%{_rpmmacrodir}/macros.pytorch %if %{with rocm} export USE_ROCM=ON @@ -688,9 +680,6 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %{python3_sitearch}/%{pypi_name}* %{python3_sitearch}/functorch -%files -n pytorch-rpm-macros -%{_rpmmacrodir}/macros.pytorch - %changelog %autochangelog From e0030b3ec55af789d1db27c223bb58a4c102498b Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 17 Nov 2025 14:11:32 -0800 Subject: [PATCH 33/38] Rebuild for ROCm 7.1 Signed-off-by: Tom Rix From b3977567d226ae906a3f253b8e782d47373dd578 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 24 Nov 2025 07:03:31 -0800 Subject: [PATCH 34/38] Always include onnx src Signed-off-by: Tom Rix --- python-torch.spec | 2 -- 1 file changed, 2 deletions(-) diff --git a/python-torch.spec b/python-torch.spec index b623dc6..7bd82df 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -104,10 +104,8 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif -%if %{without onnx} %global ox_ver 1.18.0 Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz -%endif %if 0%{?fedora} >= 45 # drop aarch64 in 45 From 7908450a47cf1c4210b8ab7ab0132587580e1d72 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 18 Dec 2025 13:52:00 -0800 Subject: [PATCH 35/38] Update to 2.9.1 Signed-off-by: Tom Rix --- .gitignore | 1 + python-torch.spec | 2 +- sources | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 444b9ca..c424df5 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ /pytorch-fd36458.tar.gz /pytorch-0fabc3b.tar.gz /pytorch-v2.9.0.tar.gz +/pytorch-v2.9.1.tar.gz diff --git a/python-torch.spec b/python-torch.spec index 7bd82df..e493b97 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -16,7 +16,7 @@ %global pybind11_version 2.13.6 %global rc_tag -rc9 %else -%global pypi_version 2.9.0 +%global pypi_version 2.9.1 %global flatbuffers_version 24.12.23 %global miniz_version 3.0.2 %global pybind11_version 2.13.6 diff --git a/sources b/sources index 05c1f96..9a3681f 100644 --- a/sources +++ b/sources @@ -16,3 +16,4 @@ SHA512 (pytorch-715dca6.tar.gz) = 09c9aae54fab3eb17901fc3226fece1c13f41cb8e45a2c SHA512 (pytorch-fd36458.tar.gz) = acbb7475b92ad4a8e8d779f3745da22d8438e4c5ef2d6e76d71c987789f2752c8aef7022c87c9a74640fe4f9c1f1a61a3f12a796f63b1e6be24da8e5aacf37dc SHA512 (pytorch-0fabc3b.tar.gz) = 2e87975de0bf6f3dcede168b379e1928712bca16170c2a8ee7d63459f53086c01baac05e0763e4d5d28cdaf1c7d8912225ee06adeff96ead4f6f456ee174b341 SHA512 (pytorch-v2.9.0.tar.gz) = ae989e3a7fe30f9ea90944dc25e21ca92f2a94ee40d8de974a168c292d82c16ee8920624eff91a85755469ad05473dce0f85893e3ed7794ec5c6bdd89cbd2023 +SHA512 (pytorch-v2.9.1.tar.gz) = 88de0289fa2760abd69bef505b5ae3b6d7ff176b415cbb31bbc89ce5476a3800b322a97c4490f270f8b89657aff931bf9a5516202b268e0bb8b1f63dbb87b34a From 
7ddebb112b1931a495e78fba092b77e4f91022df Mon Sep 17 00:00:00 2001
From: "Alexander F. Lent"
Date: Thu, 18 Dec 2025 09:08:09 -0500
Subject: [PATCH 36/38] Improve build times on non-x86 systems

Signed-off-by: Alexander F. Lent
---
 python-torch.spec | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python-torch.spec b/python-torch.spec
index e493b97..abacce9 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -506,10 +506,15 @@ sed -i -e 's@${PROJECT_SOURCE_DIR}/third_party/concurrentqueue@/usr/include/conc
 # Control the number of jobs
 #
 # The build can fail if too many threads exceed the physical memory
-# So count core and and memory and increase the build memory util the build succeeds
+# Run at least one thread, more if CPU & memory resources are available.
 #
+%ifarch x86_64
 # Real cores, No hyperthreading
 COMPILE_JOBS=`cat /proc/cpuinfo | grep -m 1 'cpu cores' | awk '{ print $4 }'`
+%else
+# cpuinfo format varies on other arches, fall back to nproc
+COMPILE_JOBS=`nproc`
+%endif
 if [ ${COMPILE_JOBS}x = x ]; then
     COMPILE_JOBS=1
 fi

From 767d576d1df8b3041f11c86243bcf3503a6d559a Mon Sep 17 00:00:00 2001
From: "Alexander F. Lent"
Date: Sat, 20 Dec 2025 21:24:48 -0500
Subject: [PATCH 37/38] Continue to support aarch64, with myself as maintainer

Signed-off-by: Alexander F. Lent
---
 python-torch.spec | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/python-torch.spec b/python-torch.spec
index abacce9..640adb3 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -107,12 +107,7 @@ Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{
 %global ox_ver 1.18.0
 Source90: https://github.com/onnx/onnx/archive/refs/tags/v%{ox_ver}.tar.gz
 
-%if 0%{?fedora} >= 45
-# drop aarch64 in 45
-%global pt_arches x86_64
-%else
 %global pt_arches x86_64 aarch64
-%endif
 ExclusiveArch: %pt_arches
 %global toolchain gcc
 %global _lto_cflags %nil

From 294accd75d5a894c11d48bbea7a5d2dd70bebd15 Mon Sep 17 00:00:00 2001
From: Tom Rix
Date: Mon, 12 Jan 2026 16:38:15 -0800
Subject: [PATCH 38/38] Setting the HIP device lib path is no longer needed

Signed-off-by: Tom Rix
---
 python-torch.spec | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python-torch.spec b/python-torch.spec
index 640adb3..d3c31d7 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -621,8 +621,8 @@ export USE_FBGEMM_GENAI=OFF
 # export USE_MAGMA=ON
 export HIP_PATH=`hipconfig -p`
 export ROCM_PATH=`hipconfig -R`
-RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir`
-export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
+#RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir`
+#export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
 # pytorch uses clang, not hipcc
 export HIP_CLANG_PATH=%{rocmllvm_bindir}
 
@@ -648,8 +648,8 @@ export USE_ROCM=ON
 export USE_ROCM_CK=OFF
 export HIP_PATH=`hipconfig -p`
 export ROCM_PATH=`hipconfig -R`
-RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir`
-export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
+# RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir`
+# export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
 # pytorch uses clang, not hipcc
 export HIP_CLANG_PATH=%{rocmllvm_bindir}
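
---

Editor's note: the job-count logic from PATCH 36 can be exercised outside of rpmbuild with a minimal standalone sketch. RPM evaluates %ifarch at spec-parse time, so the runtime uname check below is an assumption made for illustration only and is not part of the spec itself:

#!/bin/bash
# Pick a parallel job count the way the spec does: physical cores on
# x86_64 (the "cpu cores" field of /proc/cpuinfo is x86-specific),
# nproc everywhere else, and never fewer than one job.
case "$(uname -m)" in
x86_64)
    # Real cores, no hyperthreading
    COMPILE_JOBS=$(grep -m 1 'cpu cores' /proc/cpuinfo | awk '{ print $4 }')
    ;;
*)
    # cpuinfo format varies on other arches, fall back to nproc
    COMPILE_JOBS=$(nproc)
    ;;
esac
# Guard against an empty result, matching the spec's fallback
if [ "${COMPILE_JOBS}x" = "x" ]; then
    COMPILE_JOBS=1
fi
echo "would build with ${COMPILE_JOBS} jobs"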