diff --git a/0001-Add-cmake-varaible-USE_ROCM_CK.patch b/0001-Add-cmake-varaible-USE_ROCM_CK.patch new file mode 100644 index 0000000..b34e07a --- /dev/null +++ b/0001-Add-cmake-varaible-USE_ROCM_CK.patch @@ -0,0 +1,120 @@ +From 0f33e0a7bbd1522ee74f8fc1fbe3af7563318c79 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 28 Mar 2025 15:33:09 -0700 +Subject: [PATCH] Add cmake varaible USE_ROCM_CK + +To control the use of ROCm Composable Kernel usage. + +CK is not compatible with all rocBLAS gpu's, so the user +must explicitly choose to use CK. + +Signed-off-by: Tom Rix +--- + CMakeLists.txt | 1 + + aten/src/ATen/CMakeLists.txt | 8 ++++++-- + aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- + cmake/Dependencies.cmake | 3 +++ + 4 files changed, 15 insertions(+), 7 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index f3fee2f7ffc2..73903acce452 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -249,6 +249,7 @@ cmake_dependent_option( + BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON + "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) + cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) ++cmake_dependent_option(USE_ROCM_CK "Use ROCm Composable Kernel" ON "USE_ROCM" ON) + option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) + cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) + cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index 085af373ec22..af268ab88572 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -361,13 +361,17 @@ endif() + ${native_quantized_hip_hip} + ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels and Triton ++ if(NOT USE_ROCM_CK) # Windows doesn't support Composable Kernels and Triton + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" +- ${native_hip_bgemm} ${native_hip_ck} ++ ${native_hip_bgemm} ${native_hip_ck}) ++ endif() ++ if(WIN32) # Windows doesn't support Composable Kernels and Triton ++ exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" + ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) + endif() ++ + # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) + list(APPEND all_hip_cpp + ${native_nested_hip_cpp} +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index a62b028fd4ff..a3dbf76848ea 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); + #endif + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); + } +@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); + } +@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); + } +@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index 30917bdf39f5..2ca6091030f1 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -1046,6 +1046,9 @@ if(USE_ROCM) + if(HIPBLASLT_VEC_EXT) + list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_VEC_EXT) + endif() ++ if(USE_ROCM_CK) ++ list(APPEND HIP_CXX_FLAGS -DUSE_ROCM_CK) ++ endif() + list(APPEND HIP_HIPCC_FLAGS --offload-compress) + if(WIN32) + add_definitions(-DROCM_ON_WINDOWS) +-- +2.48.1 + diff --git a/0001-python-torch-disable-ck.patch b/0001-python-torch-disable-ck.patch new file mode 100644 index 0000000..e8fd9c2 --- /dev/null +++ b/0001-python-torch-disable-ck.patch @@ -0,0 +1,112 @@ +From 027dad1eaed51c1172e2497da611e3267d42d2f0 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Fri, 28 Mar 2025 09:16:03 -0700 +Subject: [PATCH] python-torch: disable ck + +--- + aten/src/ATen/CMakeLists.txt | 7 +++---- + aten/src/ATen/Context.cpp | 1 + + aten/src/ATen/cuda/CUDABlas.cpp | 10 +++++----- + 3 files changed, 9 insertions(+), 9 deletions(-) + +diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt +index 085af373ec22..84808880e51c 100644 +--- a/aten/src/ATen/CMakeLists.txt ++++ b/aten/src/ATen/CMakeLists.txt +@@ -134,7 +134,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu") + file(GLOB native_cuda_cpp "native/cuda/*.cpp") + file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") + file(GLOB native_cuda_linalg_cpp "native/cuda/linalg/*.cpp") +-file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" "native/hip/bgemm_kernels/*.h") ++file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh" ) + file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") + file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") + file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") +@@ -145,7 +145,7 @@ file(GLOB native_nested_h "native/nested/*.h") + file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") + file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") + +-file(GLOB native_hip_hip "native/hip/*.hip" "native/hip/bgemm_kernels/*.hip") ++file(GLOB native_hip_hip "native/hip/*.hip" ) + file(GLOB native_hip_cpp "native/hip/*.cpp") + file(GLOB native_hip_linalg_cpp "native/hip/linalg/*.cpp") + file(GLOB native_miopen_cpp "native/miopen/*.cpp") +@@ -361,13 +361,12 @@ endif() + ${native_quantized_hip_hip} + ${native_transformers_hip_hip} ${native_transformers_src_hip_hip} + ) +- if(WIN32) # Windows doesn't support Composable Kernels and Triton + file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip") + file(GLOB native_hip_ck "native/hip/ck*.hip") + exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}" + ${native_hip_bgemm} ${native_hip_ck} + ${native_transformers_hip_hip} ${native_transformers_hip_cpp}) +- endif() ++ + # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) + list(APPEND all_hip_cpp + ${native_nested_hip_cpp} +diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp +index f598fc3a39d3..03dab6ff38fe 100644 +--- a/aten/src/ATen/Context.cpp ++++ b/aten/src/ATen/Context.cpp +@@ -355,6 +355,7 @@ at::BlasBackend Context::blasPreferredBackend() { + } + + void Context::setBlasPreferredBackend(at::BlasBackend b) { ++ return; + #ifdef _MSC_VER + TORCH_WARN_ONCE( + "torch.backends.cuda.preferred_blas_library is an experimental feature. " +diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp +index a62b028fd4ff..cba38426ea1f 100644 +--- a/aten/src/ATen/cuda/CUDABlas.cpp ++++ b/aten/src/ATen/cuda/CUDABlas.cpp +@@ -708,7 +708,7 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::bgemm_internal_ck(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +@@ -1061,7 +1061,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); + #endif + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(double)); + } +@@ -1077,7 +1077,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); + } +@@ -1125,7 +1125,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::Half)); + } +@@ -1141,7 +1141,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-#ifdef USE_ROCM ++#ifdef USE_ROCM_NO_CK + else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { + at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +-- +2.48.1 + diff --git a/python-torch.spec b/python-torch.spec index 8a9a9f7..8f5ed02 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,10 +6,10 @@ # So pre releases can be tried %bcond_with gitcommit %if %{with gitcommit} -# v2.7.0-rc2 -%global commit0 b1940b5867e40e40ebdce4db76f76d3d0b71d3f4 +# v2.7.0-rc3 +%global commit0 b04d8358d959925bee0adfd67cc17987af9fbb9d %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) -%global date0 20250413 +%global date0 20250326 %global pypi_version 2.7.0 %global flatbuffers_version 23.3.3 %global miniz_version 3.0.2 @@ -104,6 +104,13 @@ Patch11: 0001-Improve-finding-and-using-the-rocm_version.h.patch # Patches need to be refactored for ToT # These are ROCm packages Patch101: 0001-cuda-hip-signatures.patch +%else +# https://github.com/pytorch/pytorch/issues/150187 +# The hack job +# Patch11: 0001-python-torch-disable-ck.patch +# Cleaned up hack job +Patch11: 0001-Add-cmake-varaible-USE_ROCM_CK.patch + %endif ExclusiveArch: x86_64 aarch64 @@ -159,6 +166,9 @@ BuildRequires: python3dist(sympy) %endif %if %{with rocm} +%if %{with gitcommit} +BuildRequires: composable_kernel-devel +%endif BuildRequires: hipblas-devel BuildRequires: hipblaslt-devel BuildRequires: hipcub-devel @@ -330,6 +340,8 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-pass sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unused-command-line-argument@' cmake/Dependencies.cmake sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unused-result@' cmake/Dependencies.cmake sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-deprecated-declarations@' cmake/Dependencies.cmake +# Use parallel jobs +sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel-jobs=4@' cmake/Dependencies.cmake # No third_party fmt, use system sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt @@ -447,6 +459,9 @@ sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable %if %{with gitcommit} # https://github.com/pytorch/pytorch/issues/149805 sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' cmake/public/LoadHIP.cmake +# Fedora installs to /usr/include, not /usr/include/rocm-core +sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/hip/tunable/Tunable.cpp +sed -i -e 's@rocm-core/rocm_version.h@rocm_version.h@' aten/src/ATen/cuda/tunable/Tunable.cpp %endif # use any hip, correct CMAKE_MODULE_PATH sed -i -e 's@lib/cmake/hip@lib64/cmake/hip@' cmake/public/LoadHIP.cmake @@ -574,6 +589,7 @@ export BUILD_TEST=ON %if %{with rocm} export USE_ROCM=ON +export USE_ROCM_CK=OFF export USE_MAGMA=ON export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` @@ -595,6 +611,7 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %if %{with rocm} export USE_ROCM=ON +export USE_ROCM_CK=OFF export HIP_PATH=`hipconfig -p` export ROCM_PATH=`hipconfig -R` RESOURCE_DIR=`%{rocmllvm_bindir}/clang -print-resource-dir` @@ -620,8 +637,10 @@ export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default} %files -n python3-%{pypi_name} %license LICENSE %doc README.md +%if %{without gitcommit} %{_bindir}/convert-caffe2-to-onnx %{_bindir}/convert-onnx-to-caffe2 +%endif %{_bindir}/torchrun %{_bindir}/torchfrtrace %{python3_sitearch}/%{pypi_name}