diff --git a/next/0001-Optionally-use-hipblaslt.patch b/next/0001-Optionally-use-hipblaslt.patch index 029d98c..56434a7 100644 --- a/next/0001-Optionally-use-hipblaslt.patch +++ b/next/0001-Optionally-use-hipblaslt.patch @@ -1,4 +1,4 @@ -From 186728c9f4de720547fa3d1b7951c7d5563ad3c7 Mon Sep 17 00:00:00 2001 +From d77e05d90df006322cda021f1a8affdcc2c7eaef Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 23 Feb 2024 08:27:30 -0500 Subject: [PATCH] Optionally use hipblaslt @@ -10,21 +10,30 @@ Convert the checks for ROCM_VERSION >= 507000 to HIPBLASLT checks Signed-off-by: Tom Rix --- - aten/src/ATen/cuda/CUDABlas.cpp | 5 +++-- + aten/src/ATen/cuda/CUDABlas.cpp | 7 ++++--- aten/src/ATen/cuda/CUDABlas.h | 2 +- aten/src/ATen/cuda/CUDAContextLight.h | 4 ++-- aten/src/ATen/cuda/CublasHandlePool.cpp | 4 ++-- aten/src/ATen/cuda/tunable/TunableGemm.h | 6 +++--- - aten/src/ATen/native/cuda/Blas.cpp | 10 ++++++---- + aten/src/ATen/native/cuda/Blas.cpp | 14 ++++++++------ cmake/Dependencies.cmake | 3 +++ cmake/public/LoadHIP.cmake | 4 ++-- - 8 files changed, 22 insertions(+), 16 deletions(-) + 8 files changed, 25 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index 0a3de5f9d77..b67031e88a7 100644 +index d534ec5a178..e815463f630 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -778,7 +778,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { +@@ -14,7 +14,7 @@ + #include + + #ifdef USE_ROCM +-#if ROCM_VERSION >= 60000 ++#ifdef HIPBLASLT + #include + #endif + // until hipblas has an API to accept flags, we must use rocblas here +@@ -781,7 +781,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { } } @@ -33,7 +42,7 @@ index 0a3de5f9d77..b67031e88a7 100644 #if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000 // only for rocm 5.7 where we first supported hipblaslt, it was difficult -@@ -909,6 +909,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< +@@ -912,6 +912,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< }; } // namespace @@ -41,7 +50,7 @@ index 0a3de5f9d77..b67031e88a7 100644 template void gemm_and_bias( bool transpose_mat1, -@@ -1121,7 +1122,7 @@ template void gemm_and_bias( +@@ -1124,7 +1125,7 @@ template void gemm_and_bias( at::BFloat16* result_ptr, int64_t result_ld, GEMMAndBiasActivationEpilogue activation); @@ -86,10 +95,10 @@ index 4ec35f59a21..e28dc42034f 100644 #endif diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index d8ee09b1486..4510193a5cc 100644 +index 6913d2cd95e..3d4276be372 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp -@@ -28,7 +28,7 @@ namespace at::cuda { +@@ -29,7 +29,7 @@ namespace at::cuda { namespace { @@ -98,7 +107,7 @@ index d8ee09b1486..4510193a5cc 100644 void createCublasLtHandle(cublasLtHandle_t *handle) { TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); } -@@ -177,7 +177,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { +@@ -190,7 +190,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { return handle; } @@ -139,10 +148,10 @@ index 3ba0d761277..dde1870cfbf 100644 if (env == nullptr || strcmp(env, "1") == 0) { // disallow tuning of hipblaslt with c10::complex diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 2b7ff1b502b..9ae2ccbd16d 100644 +index 29e5c5e3cf1..df56f3d7f1d 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -153,7 +153,7 @@ enum class Activation { +@@ -155,7 +155,7 @@ enum class Activation { GELU, }; @@ -151,7 +160,7 @@ index 2b7ff1b502b..9ae2ccbd16d 100644 cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activation a) { switch (a) { case Activation::None: -@@ -191,6 +191,7 @@ static bool getDisableAddmmCudaLt() { +@@ -193,6 +193,7 @@ static bool getDisableAddmmCudaLt() { #ifdef USE_ROCM static bool isSupportedHipLtROCmArch(int index) { @@ -159,7 +168,7 @@ index 2b7ff1b502b..9ae2ccbd16d 100644 hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); std::string device_arch = prop->gcnArchName; static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -201,6 +202,7 @@ static bool isSupportedHipLtROCmArch(int index) { +@@ -203,6 +204,7 @@ static bool isSupportedHipLtROCmArch(int index) { } } TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); @@ -167,7 +176,7 @@ index 2b7ff1b502b..9ae2ccbd16d 100644 return false; } #endif -@@ -226,7 +228,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma +@@ -228,7 +230,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma at::ScalarType scalar_type = self.scalar_type(); c10::MaybeOwned self_; if (&result != &self) { @@ -176,7 +185,7 @@ index 2b7ff1b502b..9ae2ccbd16d 100644 // Strangely, if mat2 has only 1 row or column, we get // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] -@@ -269,7 +271,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma +@@ -271,7 +273,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma } self__sizes = self_->sizes(); } else { @@ -185,7 +194,7 @@ index 2b7ff1b502b..9ae2ccbd16d 100644 useLtInterface = !disable_addmm_cuda_lt && result.dim() == 2 && result.is_contiguous() && isSupportedHipLtROCmArch(self.device().index()) && -@@ -320,7 +322,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma +@@ -322,7 +324,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); @@ -194,6 +203,24 @@ index 2b7ff1b502b..9ae2ccbd16d 100644 if (useLtInterface) { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, +@@ -876,7 +878,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]}); + at::native::resize_output(amax, {}); + +-#if !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) && ROCM_VERSION >= 60000) ++#if !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) && defined(HIPBLASLT)) + cublasCommonArgs args(mat1, mat2, out); + const auto out_dtype_ = args.result->scalar_type(); + TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt"); +@@ -906,7 +908,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + TORCH_CHECK(false, "_scaled_mm_out_cuda is not compiled for this platform."); + #endif + +-#if defined(USE_ROCM) && ROCM_VERSION >= 60000 ++#if defined(USE_ROCM) && defined(HIPBLASLT) + // rocm's hipblaslt does not yet support amax, so calculate separately + auto out_float32 = out.to(kFloat); + out_float32.abs_(); diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index b7ffbeb07dc..2b6c3678984 100644 --- a/cmake/Dependencies.cmake diff --git a/python-torch.spec b/python-torch.spec index 5272677..e98624c 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -6,8 +6,8 @@ # So pre releases can be tried %bcond_without gitcommit %if %{with gitcommit} -# The top of tree ~2/22/24 -%global commit0 5c5b71b6eebae76d744261715231093e62f0d090 +# The top of tree ~2/28/24 +%global commit0 3cfed0122829540444911c271ce5480832ea3526 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7}) %global pypi_version 2.3.0 @@ -40,12 +40,18 @@ # For testing openmp %bcond_without openmp +%bcond_with foxi # For testing caffe2 %if 0%{?fedora} +# need foxi-devel +%if %{with foxi} %bcond_without caffe2 %else %bcond_with caffe2 %endif +%else +%bcond_with caffe2 +%endif # For testing distributed %bcond_with distributed @@ -182,23 +188,33 @@ BuildRequires: python3dist(fsspec) BuildRequires: python3dist(sympy) %endif +%if %{with openmp} +BuildRequires: libomp-devel +%endif + %if %{with rocm} +BuildRequires: compiler-rt BuildRequires: hipblas-devel %if %{with hipblaslt} BuildRequires: hipblaslt-devel %endif BuildRequires: hipcub-devel BuildRequires: hipfft-devel +BuildRequires: hiprand-devel BuildRequires: hipsparse-devel BuildRequires: hipsolver-devel +BuildRequires: lld BuildRequires: miopen-devel BuildRequires: rocblas-devel +BuildRequires: rocrand-devel +BuildRequires: rocfft-devel %if %{with distributed} BuildRequires: rccl-devel %endif BuildRequires: rocprim-devel BuildRequires: rocm-cmake BuildRequires: rocm-comgr-devel +BuildRequires: rocm-core-devel BuildRequires: rocm-hip-devel BuildRequires: rocm-runtime-devel BuildRequires: rocm-rpm-macros @@ -208,9 +224,11 @@ BuildRequires: rocthrust-devel Requires: rocm-rpm-macros-modules %endif +%if %{with foxi} %if %{with caffe2} BuildRequires: foxi-devel %endif +%endif %if %{with test} BuildRequires: google-benchmark-devel