diff --git a/.gitignore b/.gitignore index 315fe1c..1ab81b0 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ /pytorch-97ff6cf.tar.gz /pytorch-v2.3.0.tar.gz /pytorch-v2.3.1.tar.gz +/pytorch-v2.4.0.tar.gz +/v1.14.2.tar.gz +/cpp-httplib-3b6597b.tar.gz +/kineto-be13176.tar.gz diff --git a/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch b/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch new file mode 100644 index 0000000..413c60d --- /dev/null +++ b/0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch @@ -0,0 +1,47 @@ +From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sat, 20 Jul 2024 05:37:15 -0600 +Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM + +Signed-off-by: Tom Rix +--- + CMakeLists.txt | 1 + + cmake/Dependencies.cmake | 3 ++- + 2 files changed, 3 insertions(+), 1 deletion(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index c4cd4b2c2a98..2068f7c6c4f2 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF + "USE_CUDNN" OFF) + cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF) + option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) ++option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF) + option(USE_KINETO "Use Kineto profiling library" ON) + option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) + option(USE_FAKELOWP "Use FakeLowp operators" OFF) +diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake +index f1f2eb7cec31..192dac46f13b 100644 +--- a/cmake/Dependencies.cmake ++++ b/cmake/Dependencies.cmake +@@ -706,6 +706,7 @@ endif() + + # ---[ FBGEMM + if(USE_FBGEMM) ++ if (NOT USE_SYSTEM_FBGEMM) + set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") + if(NOT DEFINED FBGEMM_SOURCE_DIR) + set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory") +@@ -746,7 +747,7 @@ if(USE_FBGEMM) + target_compile_options_if_supported(asmjit 
-Wno-unused-but-set-variable) + endif() + endif() +- ++ endif() + if(USE_FBGEMM) + list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm) + endif() +-- +2.45.1 + diff --git a/0001-Optionally-use-hipblaslt.patch b/0001-Optionally-use-hipblaslt.patch index 56434a7..1e5ca4b 100644 --- a/0001-Optionally-use-hipblaslt.patch +++ b/0001-Optionally-use-hipblaslt.patch @@ -1,174 +1,398 @@ -From d77e05d90df006322cda021f1a8affdcc2c7eaef Mon Sep 17 00:00:00 2001 +From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001 From: Tom Rix -Date: Fri, 23 Feb 2024 08:27:30 -0500 +Date: Sat, 29 Jun 2024 04:18:34 -0700 Subject: [PATCH] Optionally use hipblaslt -The hipblaslt package is not available on Fedora. -Instead of requiring the package, make it optional. -If it is found, define the preprocessor variable HIPBLASLT -Convert the checks for ROCM_VERSION >= 507000 to HIPBLASLT checks - -Signed-off-by: Tom Rix --- - aten/src/ATen/cuda/CUDABlas.cpp | 7 ++++--- - aten/src/ATen/cuda/CUDABlas.h | 2 +- - aten/src/ATen/cuda/CUDAContextLight.h | 4 ++-- - aten/src/ATen/cuda/CublasHandlePool.cpp | 4 ++-- - aten/src/ATen/cuda/tunable/TunableGemm.h | 6 +++--- - aten/src/ATen/native/cuda/Blas.cpp | 14 ++++++++------ - cmake/Dependencies.cmake | 3 +++ - cmake/public/LoadHIP.cmake | 4 ++-- - 8 files changed, 25 insertions(+), 19 deletions(-) + aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------ + aten/src/ATen/cuda/CUDAContextLight.h | 4 +++ + aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++-- + aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++--- + aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++- + cmake/Dependencies.cmake | 3 ++ + cmake/public/LoadHIP.cmake | 2 +- + 7 files changed, 82 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp -index d534ec5a178..e815463f630 100644 +index ce991a9bcad4..3f0d17b52778 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp -@@ -14,7 +14,7 @@ +@@ -14,7 
+14,9 @@ #include #ifdef USE_ROCM --#if ROCM_VERSION >= 60000 -+#ifdef HIPBLASLT ++#ifdef USE_HIPBLASLT #include - #endif ++#endif // until hipblas has an API to accept flags, we must use rocblas here -@@ -781,7 +781,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { - } - } + #include + #include +@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) { + static size_t _parseChosenWorkspaceSize() { + const char * val = getenv("CUBLASLT_WORKSPACE_SIZE"); + #ifdef USE_ROCM ++#ifndef USE_HIPBLASLT ++ return 0; ++#endif + if (!val) { + // accept either env var + val = getenv("HIPBLASLT_WORKSPACE_SIZE"); +@@ -235,6 +240,7 @@ namespace at::cuda::blas { + } while (0) --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) - #if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000 - // only for rocm 5.7 where we first supported hipblaslt, it was difficult -@@ -912,6 +912,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + namespace { + // Following the pattern of CuSparseDescriptor + // Defined here for now because this is the only place cublas_lt interface is +@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< }; } // namespace -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) +- + template + inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + cudaDataType_t abcType = CUDA_R_32F; +@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + " scaleType ", + scaleType); + } +- ++#endif + + template + inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { +@@ -608,10 +613,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) + template <> + void 
bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); + } + } +@@ -651,10 +659,13 @@ void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + } + } +@@ -662,10 +673,13 @@ void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) + template <> + void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +- else { ++ else ++#endif ++ { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + } +@@ -781,11 +795,13 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { + } + } + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + // forward to bgemm implementation but set strides and batches to 0 + bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); + } ++#endif + + template + inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { +@@ -1008,10 +1024,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) + template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && 
defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); + } + } +@@ -1051,10 +1070,13 @@ void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); + } + } +@@ -1062,10 +1084,13 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) + template <> + void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) + { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +- else { ++ else ++#endif ++ { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } + } +@@ -1177,7 +1202,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { + } + } + +- ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) template void gemm_and_bias( bool transpose_mat1, -@@ -1124,7 +1125,7 @@ template void gemm_and_bias( - at::BFloat16* result_ptr, - int64_t result_ld, - GEMMAndBiasActivationEpilogue activation); -- +@@ -1410,7 +1435,7 @@ void scaled_gemm( + ScalarType result_dtype, + void* amax_ptr, + bool use_fast_accum) { +-#if CUDA_VERSION >= 11080 || defined(USE_ROCM) ++#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + const auto computeType = CUBLAS_COMPUTE_32F; + const auto scaleType = CUDA_R_32F; + const int8_t fastAccuMode = use_fast_accum ? 
1 : 0; +@@ -1681,6 +1706,7 @@ void int8_gemm( + " scaleType ", + scaleType); + } +#endif - void scaled_gemm( - char transa, - char transb, -diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h -index eb12bb350c5..068607467dd 100644 ---- a/aten/src/ATen/cuda/CUDABlas.h -+++ b/aten/src/ATen/cuda/CUDABlas.h -@@ -82,7 +82,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)); - template <> - void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) - enum GEMMAndBiasActivationEpilogue { - None, - RELU, + template <> + void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h -index 4ec35f59a21..e28dc42034f 100644 +index f2b657ced51b..f0ee613c4208 100644 --- a/aten/src/ATen/cuda/CUDAContextLight.h +++ b/aten/src/ATen/cuda/CUDAContextLight.h -@@ -9,7 +9,7 @@ +@@ -9,7 +9,9 @@ // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also // added bf16 support --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) ++#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) #include - #endif ++#endif -@@ -82,7 +82,7 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); + #ifdef CUDART_VERSION + #include +@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); /* Handles */ TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) 
++#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))) TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); - #endif ++#endif + + TORCH_CUDA_CPP_API void clearCublasWorkspaces(); diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp -index 6913d2cd95e..3d4276be372 100644 +index 8eac525b3695..abfdf7a23847 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -29,7 +29,7 @@ namespace at::cuda { namespace { --#if defined(USE_ROCM) && ROCM_VERSION >= 50700 -+#if defined(USE_ROCM) && defined(HIPBLASLT) +-#if defined(USE_ROCM) ++#if defined(USE_ROCM) && defined(USE_HIPBLASLT) void createCublasLtHandle(cublasLtHandle_t *handle) { TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); } -@@ -190,7 +190,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { +@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() { return handle; } --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) - cublasLtHandle_t getCurrentCUDABlasLtHandle() { +-cublasLtHandle_t getCurrentCUDABlasLtHandle() { #ifdef USE_ROCM ++#if defined(USE_HIPBLASLT) ++cublasLtHandle_t getCurrentCUDABlasLtHandle() { c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + +@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() { + + auto handle = myPoolWindow->reserve(device); + return handle; ++} ++#endif + #else ++cublasLtHandle_t getCurrentCUDABlasLtHandle() { + return reinterpret_cast(getCurrentCUDABlasHandle()); +-#endif + } ++#endif + + } // namespace at::cuda diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h -index 3ba0d761277..dde1870cfbf 100644 +index 53e6154120c9..fa1d664696db 100644 --- a/aten/src/ATen/cuda/tunable/TunableGemm.h +++ b/aten/src/ATen/cuda/tunable/TunableGemm.h 
-@@ -11,7 +11,7 @@ +@@ -11,7 +11,9 @@ #include #ifdef USE_ROCM --#if ROCM_VERSION >= 50700 -+#ifdef HIPBLASLT ++#ifdef USE_HIPBLASLT #include - #endif ++#endif #include -@@ -166,7 +166,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { - } #endif - --#if defined(USE_ROCM) && ROCM_VERSION >= 50700 -+#if defined(USE_ROCM) && defined(HIPBLASLT) - static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env == nullptr || strcmp(env, "1") == 0) { - // disallow tuning of hipblaslt with c10::complex -@@ -240,7 +240,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + #include +@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable> } - #endif - --#if defined(USE_ROCM) && ROCM_VERSION >= 50700 -+#if defined(USE_ROCM) && defined(HIPBLASLT) - static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); - if (env == nullptr || strcmp(env, "1") == 0) { - // disallow tuning of hipblaslt with c10::complex -diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp -index 29e5c5e3cf1..df56f3d7f1d 100644 ---- a/aten/src/ATen/native/cuda/Blas.cpp -+++ b/aten/src/ATen/native/cuda/Blas.cpp -@@ -155,7 +155,7 @@ enum class Activation { - GELU, }; --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) - cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activation a) { - switch (a) { - case Activation::None: -@@ -193,6 +193,7 @@ static bool getDisableAddmmCudaLt() { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + class DefaultScaledGemmOp : public Callable> { + public: +@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable> { + return OK; + } + }; ++#endif + + template + inline bool IsZero(T v) { +@@ -191,6 +195,7 @@ static void AddRocblasValidator() { + } + } + ++#ifdef USE_HIPBLASLT + 
static void AddHipblasltValidator() { + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + if (validators.find("HIPBLASLT_VERSION") == validators.end()) { +@@ -205,6 +210,7 @@ static void AddHipblasltValidator() { + [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); + } + } ++#endif + + static void AddRocmValidator() { + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); +@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { + } + AddRocblasValidator(); + } +- ++#ifdef USE_HIPBLASLT + static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { + rocm_validators = true; +@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp, StreamTimer> { + } + AddHipblasltValidator(); + } +- ++#endif + if (rocm_validators) { + AddRocmValidator(); + } +@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + AddRocblasValidator(); + } +- ++#ifdef USE_HIPBLASLT + static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) { + rocm_validators = true; +@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + AddHipblasltValidator(); + } +- ++#endif + if (rocm_validators) { + AddRocmValidator(); + } +@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp + } + }; + ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + template + class ScaledGemmTunableOp : public TunableOp, StreamTimer> { + public: +@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + + #if defined(USE_ROCM) ++#ifdef USE_HIPBLASLT + for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { + 
this->RegisterOp(std::move(name), std::move(op)); + } + AddHipblasltValidator(); ++#endif + AddRocmValidator(); + #endif + } +@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp, StreamTimer> + "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); + } + }; ++#endif + + #undef XSTRINGIFY + #undef STRINGIFY +diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp +index 84c59a4fd0d7..56ad5de3bf2d 100644 +--- a/aten/src/ATen/native/cuda/Blas.cpp ++++ b/aten/src/ATen/native/cuda/Blas.cpp +@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa + } + + static bool getDisableAddmmCudaLt() { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); + #ifdef USE_ROCM + // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) +@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() { + } + return false; + #endif ++#else ++ return true; ++#endif + } #ifdef USE_ROCM static bool isSupportedHipLtROCmArch(int index) { -+#if defined(HIPBLASLT) ++#ifdef USE_HIPBLASLT hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index); std::string device_arch = prop->gcnArchName; static const std::vector archs = {"gfx90a", "gfx940", "gfx941", "gfx942"}; -@@ -203,6 +204,7 @@ static bool isSupportedHipLtROCmArch(int index) { +@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) { } } TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!"); @@ -176,87 +400,107 @@ index 29e5c5e3cf1..df56f3d7f1d 100644 return false; } #endif -@@ -228,7 +230,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma +@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma at::ScalarType scalar_type = self.scalar_type(); c10::MaybeOwned self_; if (&result != &self) { --#if (defined(CUDA_VERSION) && 
CUDA_VERSION >= 11040 && !defined(_MSC_VER)) || defined(USE_ROCM) && ROCM_VERSION >= 50700 -+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER)) || defined(USE_ROCM) && defined(HIPBLASLT) ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + #if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM) // Strangely, if mat2 has only 1 row or column, we get // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. - // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] -@@ -271,7 +273,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma +@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + scalar_type != at::ScalarType::BFloat16)); + #endif + } ++#endif + #endif + if (!useLtInterface) { + self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); } self__sizes = self_->sizes(); } else { --#if defined(USE_ROCM) && ROCM_VERSION >= 50700 -+#if defined(USE_ROCM) && defined(HIPBLASLT) +-#if defined(USE_ROCM) ++#if defined(USE_ROCM) && defined(USE_HIPBLASLT) useLtInterface = !disable_addmm_cuda_lt && result.dim() == 2 && result.is_contiguous() && isSupportedHipLtROCmArch(self.device().index()) && -@@ -322,7 +324,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma - +@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); --#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) -+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT)) if (useLtInterface) { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + #if defined(USE_ROCM) AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, -@@ -876,7 +878,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const 
Tensor& mat2, - at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]}); - at::native::resize_output(amax, {}); +@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma + activation_epilogue + ); + }); ++#endif + #endif + } else + { +@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { + } --#if !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) && ROCM_VERSION >= 60000) -+#if !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) && defined(HIPBLASLT)) - cublasCommonArgs args(mat1, mat2, out); - const auto out_dtype_ = args.result->scalar_type(); - TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt"); -@@ -906,7 +908,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, - TORCH_CHECK(false, "_scaled_mm_out_cuda is not compiled for this platform."); + static bool _scaled_mm_allowed_device() { ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + auto dprops = at::cuda::getCurrentDeviceProperties(); + #ifdef USE_ROCM + std::string device_arch = dprops->gcnArchName; +@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() { + #else + return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); + #endif ++#else ++ return false; ++#endif + } + + // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax +@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, + // Check sizes + bool allowed_device = _scaled_mm_allowed_device(); + TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); ++#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)) + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + TORCH_CHECK( +@@ 
-1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, #if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately amax = at::max(at::abs(out.to(kFloat))); ++#endif #endif return {out, amax}; diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index f1f2eb7cec31..8d05e834bbc5 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1052,6 +1052,9 @@ if(USE_ROCM) list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP) list(APPEND HIP_CXX_FLAGS -std=c++17) list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2) + if(hipblaslt_FOUND) + list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT) + endif() if(HIP_NEW_TYPE_ENUMS) list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS) endif() diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index fa39156031ff..df4836847fdf 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -155,7 +155,7 @@ if(HIP_FOUND) find_package_and_print_version(hiprand REQUIRED) find_package_and_print_version(rocblas REQUIRED) find_package_and_print_version(hipblas REQUIRED) - 
find_package_and_print_version(hipblaslt REQUIRED) ++ find_package_and_print_version(hipblaslt) find_package_and_print_version(miopen REQUIRED) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0") -@@ -191,7 +191,7 @@ if(HIP_FOUND) - # roctx is part of roctracer - find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib) - -- if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0") -+ if(hipblastlt_FOUND) - # check whether hipblaslt is using its own datatype - set(file "${PROJECT_BINARY_DIR}/hipblaslt_test_data_type.cc") - file(WRITE ${file} "" + find_package_and_print_version(hipfft REQUIRED) + find_package_and_print_version(hipsparse REQUIRED) -- -2.43.2 +2.45.2 diff --git a/0001-Patch-for-sleef-3.6.patch b/0001-Patch-for-sleef-3.6.patch new file mode 100644 index 0000000..13aa208 --- /dev/null +++ b/0001-Patch-for-sleef-3.6.patch @@ -0,0 +1,952 @@ +From 273f23698c887b52e66c2abec8101b7398f0f9c4 Mon Sep 17 00:00:00 2001 +From: "Benjamin A. Beasley" +Date: Wed, 5 Jun 2024 11:06:02 -0400 +Subject: [PATCH] Patch for sleef 3.6 + +--- + ...ectorization-on-windows-submodule-sl.patch | 910 ++++++++++++++++++ + python-torch.spec | 11 + + 2 files changed, 921 insertions(+) + create mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch + +diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch +new file mode 100644 +index 000000000000..562f55b742c2 +--- /dev/null ++++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch +@@ -0,0 +1,910 @@ ++From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001 ++From: Xu Han ++Date: Sun, 31 Mar 2024 03:07:32 +0000 ++Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef] ++ (#118980) ++ ++Enable VEC on Windows OS. ++1. Fix some type defination gap between Windows and Linux. ++2. Fix some operator not support on Windows, such as [], /. ++3. Enable static sleef library build on Windows. ++4. 
Disable unsupported function overloading on MSVC. ++5. Upgrade submodule sleef lib, which fixed build issue on Windows. ++6. Fixed bazel build issues. ++7. Fix test app not link to sleef on Windows. ++ ++Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run: ++```cmd ++git submodule sync ++git submodule update --init --recursive ++``` ++ ++Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980 ++Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet ++--- ++ aten/src/ATen/CMakeLists.txt | 48 ++++++-------- ++ aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++-- ++ .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++-- ++ .../cpu/vec/vec256/vec256_complex_double.h | 7 +- ++ .../cpu/vec/vec256/vec256_complex_float.h | 7 +- ++ aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +- ++ aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++-- ++ aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++- ++ aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++-- ++ .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++-- ++ .../cpu/vec/vec512/vec512_complex_double.h | 7 +- ++ .../cpu/vec/vec512/vec512_complex_float.h | 7 +- ++ aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +- ++ aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++-- ++ aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++- ++ aten/src/ATen/cpu/vec/vec_base.h | 6 ++ ++ caffe2/CMakeLists.txt | 2 +- ++ third_party/sleef.BUILD | 3 +- ++ 18 files changed, 194 insertions(+), 93 deletions(-) ++ ++diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt ++index bf425af5fa9..58d5828e8ca 100644 ++--- a/aten/src/ATen/CMakeLists.txt +++++ b/aten/src/ATen/CMakeLists.txt ++@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") ++ list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) ++ endif() ++ ++-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) ++- # Preserve values for the main build ++- 
set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) ++- set(__aten_sleef_build_tests ${BUILD_TESTS}) ++- ++- # Unset our restrictive C++ flags here and reset them later. ++- # Remove this once we use proper target_compile_options. ++- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) ++- set(CMAKE_CXX_FLAGS) ++- ++- # Bump up optimization level for sleef to -O1, since at -O0 the compiler ++- # excessively spills intermediate vector registers to the stack ++- # and makes things run impossibly slowly ++- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) ++- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") ++- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) ++- else() ++- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") +++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) +++ if(NOT MSVC) +++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler +++ # excessively spills intermediate vector registers to the stack +++ # and makes things run impossibly slowly +++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) +++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") +++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) +++ else() +++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") +++ endif() ++ endif() ++ ++ if(NOT USE_SYSTEM_SLEEF) ++- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) ++- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) ++- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) ++- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) ++- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) +++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) +++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) +++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) +++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) +++ set(SLEEF_BUILD_SCALAR_LIB 
OFF CACHE BOOL "libsleefscalar will be built." FORCE) ++ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") ++ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") ++ set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) ++@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) ++ endif() ++ list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) ++ ++- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) ++- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) ++- ++- # Set these back. TODO: Use SLEEF_ to pass these instead ++- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) ++- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) +++ if(NOT MSVC) +++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) +++ endif() ++ endif() ++ ++ if(USE_CUDA AND NOT USE_ROCM) ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h ++index 800b027e469..c431fa3c605 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h ++@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { ++ } ++ ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++@@ -94,7 +94,8 @@ inline Vectorized cast(const Vectorized& src) ++ } ++ ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++- +++#ifndef _MSC_VER +++// MSVC is not working well on complex function overload. 
++ template ++ std::enable_if_t> ++ inline gather(const double* base_addr, const Vectorized& vindex) { ++@@ -106,9 +107,10 @@ std::enable_if_t& vindex) { ++ return _mm256_i32gather_ps(base_addr, vindex, scale); ++ } ++- +++#endif ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++- +++#ifndef _MSC_VER +++// MSVC is not working well on complex function overload. ++ template ++ std::enable_if_t> ++ inline mask_gather(const Vectorized& src, const double* base_addr, ++@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, ++ const Vectorized& vindex, Vectorized& mask) { ++ return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); ++ } ++- +++#endif ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++ // Only works for inputs in the range: [-2^51, 2^51] ++@@ -302,6 +304,6 @@ inline Vectorized flip(const Vectorized & v) { ++ return flip8(v); ++ } ++ ++-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#endif // (defined(CPU_CAPABILITY_AVX2) ++ ++ }} // namepsace at::vec::CPU_CAPABILITY ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h ++index 3e26213d6d2..66557436c70 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -18,7 +19,18 @@ namespace at::vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++ +++#ifndef SLEEF_CONST +++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +++#define SLEEF_CONST const +++#else +++#define SLEEF_CONST +++#endif +++#define SLEEF_CONST_OLD 
SLEEF_CONST +++#else +++#define SLEEF_CONST_OLD +++#endif ++ ++ // bfloat16 conversion ++ static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { ++@@ -265,7 +277,8 @@ public: ++ } ++ return b; ++ } ++- Vectorized map(const __m256 (*const vop)(__m256)) const { +++ +++ Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { ++ __m256 lo, hi; ++ cvt_to_fp32(values, lo, hi); ++ const auto o1 = vop(lo); ++@@ -1026,7 +1039,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V ++ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); ++ CONVERT_VECTORIZED_INIT(Half, half); ++ ++-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#else // defined(CPU_CAPABILITY_AVX2) ++ ++ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ ++ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ ++@@ -1051,9 +1064,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V ++ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); ++ CONVERT_NON_VECTORIZED_INIT(Half, half); ++ ++-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#endif // defined(CPU_CAPABILITY_AVX2) ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ ++ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ ++ auto values = _mm_loadu_si128(reinterpret_cast(data)); \ ++@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec ++ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); ++ LOAD_FP32_VECTORIZED_INIT(Half, fp16); ++ ++-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#else // defined(CPU_CAPABILITY_AVX2) ++ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ ++ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ ++ __at_align__ float values[Vectorized::size()]; \ ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h 
b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h ++index f93ea1e63c3..6c198fb37d3 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h ++@@ -8,7 +8,8 @@ ++ #include ++ #include ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -16,7 +17,7 @@ namespace at::vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ template <> class Vectorized> { ++ private: ++@@ -145,7 +146,7 @@ public: ++ auto abs = abs_(); ++ auto zero = _mm256_setzero_pd(); ++ auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); ++- auto div = values / abs; +++ auto div = _mm256_div_pd(values, abs); ++ return _mm256_blendv_pd(div, zero, mask); ++ } ++ __m256d real_() const { ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h ++index 7c142c04b79..c72d4d49274 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -15,7 +16,7 @@ namespace at::vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ template <> class Vectorized> { ++ private: ++@@ -180,7 +181,7 @@ public: ++ auto abs = abs_(); ++ auto zero = _mm256_setzero_ps(); ++ auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); ++- auto div = values / abs; +++ auto div = _mm256_div_ps(values, abs); ++ return _mm256_blendv_ps(div, zero, mask); ++ } ++ __m256 real_() 
const { ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h ++index bc82d07edd1..bed6da627af 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h ++@@ -6,7 +6,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -15,7 +16,7 @@ namespace at::vec { ++ inline namespace CPU_CAPABILITY { ++ ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ template <> class Vectorized { ++ private: ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h ++index 886809a0b8a..0e3664cd37b 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h ++@@ -6,7 +6,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -14,7 +15,7 @@ namespace at::vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ ++ template <> class Vectorized { ++ private: ++@@ -226,14 +227,14 @@ public: ++ static __m256 vec_factorial_5 = ++ _mm256_set1_ps(0.00828929059f); // 1/factorial(5) ++ static __m256 vec_exp_log2ef = ++- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) +++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) ++ static __m256 vec_half = _mm256_set1_ps(0.5f); ++ static __m256 vec_one = _mm256_set1_ps(1.f); ++ static __m256 vec_zero = _mm256_set1_ps(0.f); ++ static __m256 vec_two = _mm256_set1_ps(2.f); ++- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) ++- static __m256 vec_ln_flt_min = 
(__m256)_mm256_set1_epi32(0xc2aeac50); ++- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); +++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) +++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); +++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); ++ static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); ++ static int n_mantissa_bits = 23; ++ ++@@ -266,7 +267,7 @@ public: ++ auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); ++ auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); ++ vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); ++- auto vec_two_pow_n = (__m256)vec_two_pow_n_i; +++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); ++ vec_two_pow_n = ++ _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); ++ ++diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h ++index 4128841701a..85e099904cd 100644 ++--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h ++@@ -41,11 +41,17 @@ ++ namespace at::vec { ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX2) ++ +++#ifdef _MSC_VER +++__declspec(align(64)) struct Vectorizedqi { +++ protected: +++ __m256i vals; +++#else ++ struct Vectorizedqi { ++ protected: ++ __m256i vals __attribute__((aligned(64))); +++#endif ++ ++ public: ++ Vectorizedqi() {} ++@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { ++ } ++ ++ template ++-inline void __attribute__((always_inline)) QuantizeAvx2( +++__FORCE_INLINE void QuantizeAvx2( ++ const float* src, ++ T* dst, ++ int len, ++@@ -1331,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V ++ return a.maximum(b); ++ } ++ ++-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) 
+++#endif // if defined(CPU_CAPABILITY_AVX2) ++ }} // namespace at::vec::CPU_CAPABILITY ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h ++index fe96d123e64..87f723d782c 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h ++@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { ++ } ++ ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++@@ -80,7 +80,8 @@ inline Vectorized cast(const Vectorized& src) ++ } ++ ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++- +++#ifndef _MSC_VER +++// MSVC is not working well on complex function overload. ++ template ++ std::enable_if_t> ++ inline gather(const double* base_addr, const Vectorized& vindex) { ++@@ -92,9 +93,10 @@ std::enable_if_t& vindex) { ++ return _mm512_i32gather_ps(vindex, base_addr, scale); ++ } ++- +++#endif ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++- +++#ifndef _MSC_VER +++// MSVC is not working well on complex function overload. 
++ template ++ std::enable_if_t> ++ inline mask_gather(const Vectorized& src, const double* base_addr, ++@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, ++ auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); ++ return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); ++ } ++- +++#endif ++ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++ template<> ++@@ -270,6 +272,6 @@ inline Vectorized flip(const Vectorized & v) { ++ return flip8(v); ++ } ++ ++-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#endif // defined(CPU_CAPABILITY_AVX512) ++ ++ }}} ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h ++index f9fc92d52bf..eb3b6a72240 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -16,7 +17,18 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) +++ +++#ifndef SLEEF_CONST +++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +++#define SLEEF_CONST const +++#else +++#define SLEEF_CONST +++#endif +++#define SLEEF_CONST_OLD SLEEF_CONST +++#else +++#define SLEEF_CONST_OLD +++#endif ++ ++ // bfloat16 conversion ++ static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { ++@@ -362,7 +374,8 @@ public: ++ } ++ #pragma clang diagnostic push ++ #pragma clang diagnostic ignored "-Wignored-qualifiers" ++- Vectorized map(const __m512 (*const vop)(__m512)) const { +++ +++ Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { ++ __m512 lo, hi; ++ 
cvt_to_fp32(values, lo, hi); ++ const auto o1 = vop(lo); ++@@ -1571,7 +1584,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V ++ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); ++ CONVERT_VECTORIZED_INIT(Half, half); ++ ++-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#else //defined(CPU_CAPABILITY_AVX512) ++ ++ #define CONVERT_NON_VECTORIZED_INIT(type, name) \ ++ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ ++@@ -1601,9 +1614,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V ++ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); ++ CONVERT_NON_VECTORIZED_INIT(Half, half); ++ ++-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#endif // defined(CPU_CAPABILITY_AVX512) ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ #define LOAD_FP32_VECTORIZED_INIT(type, name) \ ++ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ ++ auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ ++@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec ++ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); ++ LOAD_FP32_VECTORIZED_INIT(Half, fp16); ++ ++-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#else // defined(CPU_CAPABILITY_AVX512) ++ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ ++ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ ++ __at_align__ float values[Vectorized::size()]; \ ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h ++index 02aa3a87cc1..c35204f9da2 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) 
+++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -16,7 +17,7 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ template <> class Vectorized> { ++ private: ++@@ -203,7 +204,7 @@ public: ++ auto abs = abs_(); ++ auto zero = _mm512_setzero_pd(); ++ auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); ++- auto div = values / abs; +++ auto div = _mm512_div_pd(values, abs); ++ return _mm512_mask_blend_pd(mask, div, zero); ++ } ++ __m512d real_() const { ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h ++index a5d790c98b2..2801e484d94 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h ++@@ -7,7 +7,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -16,7 +17,7 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ template <> class Vectorized> { ++ private: ++@@ -708,7 +709,7 @@ public: ++ auto abs = abs_(); ++ auto zero = _mm512_setzero_ps(); ++ auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); ++- auto div = values / abs; +++ auto div = _mm512_div_ps(values, abs); ++ return _mm512_mask_blend_ps(mask, div, zero); ++ } ++ __m512 real_() const { ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h ++index 27b2753c903..508ab257e60 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h ++@@ -6,7 +6,8 @@ ++ #include ++ #include ++ #include ++-#if 
(defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) +++#if (defined(CPU_CAPABILITY_AVX512)) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -15,7 +16,7 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ template <> class Vectorized { ++ private: ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h ++index ba5738687fd..a08df3c141a 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h ++@@ -6,7 +6,8 @@ ++ #include ++ #include ++ #include ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) +++#define SLEEF_STATIC_LIBS ++ #include ++ #endif ++ ++@@ -15,7 +16,7 @@ namespace vec { ++ // See Note [CPU_CAPABILITY namespace] ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ ++ template <> class Vectorized { ++ private: ++@@ -246,14 +247,14 @@ public: ++ static __m512 vec_factorial_5 = ++ _mm512_set1_ps(0.00828929059f); // 1/factorial(5) ++ static __m512 vec_exp_log2ef = ++- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) +++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) ++ static __m512 vec_half = _mm512_set1_ps(0.5f); ++ static __m512 vec_one = _mm512_set1_ps(1.f); ++ static __m512 vec_zero = _mm512_set1_ps(0.f); ++ static __m512 vec_two = _mm512_set1_ps(2.f); ++- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) ++- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); ++- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); +++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) +++ static __m512 vec_ln_flt_min = 
_mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); +++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); ++ static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); ++ static int n_mantissa_bits = 23; ++ ++@@ -288,7 +289,7 @@ public: ++ auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); ++ auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); ++ vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); ++- auto vec_two_pow_n = (__m512)vec_two_pow_n_i; +++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); ++ vec_two_pow_n = ++ _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); ++ ++diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h ++index e0713d01312..a5671ed4a50 100644 ++--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h ++@@ -42,11 +42,17 @@ namespace at { ++ namespace vec { ++ inline namespace CPU_CAPABILITY { ++ ++-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +++#if defined(CPU_CAPABILITY_AVX512) ++ +++#ifdef _MSC_VER +++__declspec(align(64)) struct Vectorizedqi { +++ protected: +++ __m512i vals; +++#else ++ struct Vectorizedqi { ++ protected: ++ __m512i vals __attribute__((aligned(64))); +++#endif ++ ++ public: ++ Vectorizedqi() {} ++@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized src) { ++ } ++ ++ template ++-inline void __attribute__((always_inline)) QuantizeAvx512( +++__FORCE_INLINE void QuantizeAvx512( ++ const float* src, ++ T* dst, ++ int len, ++@@ -525,10 +531,17 @@ struct Vectorized : public Vectorizedqi { ++ Vectorized scale, ++ Vectorized zero_point, ++ Vectorized scale_neg_zp_premul) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = 
_mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); ++ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); ++@@ -549,10 +562,17 @@ struct Vectorized : public Vectorizedqi { ++ float_vec_return_type dequantize( ++ Vectorized scale, ++ Vectorized zero_point) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); ++ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); ++@@ -598,20 +618,34 @@ struct Vectorized : public Vectorizedqi { ++ } ++ ++ int_vec_return_type widening_subtract(Vectorized b) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = 
_mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512i int32_val0 = cvtepi8_epi32(int_val0); ++ __m512i int32_val1 = cvtepi8_epi32(int_val1); ++ __m512i int32_val2 = cvtepi8_epi32(int_val2); ++ __m512i int32_val3 = cvtepi8_epi32(int_val3); ++ +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); +++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); +++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); +++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); +++ #else ++ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); ++ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); ++ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); ++ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); +++ #endif ++ ++ __m512i int32_b0 = cvtepi8_epi32(int_b0); ++ __m512i int32_b1 = cvtepi8_epi32(int_b1); ++@@ -721,10 +755,17 @@ struct Vectorized : public Vectorizedqi { ++ Vectorized scale, ++ Vectorized zero_point, ++ Vectorized scale_zp_premul) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); ++ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); ++@@ -746,10 +787,17 @@ struct 
Vectorized : public Vectorizedqi { ++ float_vec_return_type dequantize( ++ Vectorized scale, ++ Vectorized zero_point) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); ++ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); ++@@ -796,20 +844,34 @@ struct Vectorized : public Vectorizedqi { ++ } ++ ++ int_vec_return_type widening_subtract(Vectorized b) const { +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); +++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); +++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); +++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); +++ #else ++ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); ++ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); ++ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); ++ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); +++ #endif ++ ++ __m512i int32_val0 = cvtepu8_epi32(int_val0); ++ __m512i int32_val1 = cvtepu8_epi32(int_val1); ++ __m512i int32_val2 = cvtepu8_epi32(int_val2); ++ __m512i int32_val3 = cvtepu8_epi32(int_val3); ++ +++ #if defined(_MSC_VER) && !defined(__clang__) +++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); +++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], 
b.vals.m512i_u64[2]); +++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); +++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); +++ #else ++ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); ++ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); ++ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); ++ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); +++ #endif ++ ++ __m512i int32_b0 = cvtepu8_epi32(int_b0); ++ __m512i int32_b1 = cvtepu8_epi32(int_b1); ++diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h ++index adf81dd915c..20cb8ef6dbc 100644 ++--- a/aten/src/ATen/cpu/vec/vec_base.h +++++ b/aten/src/ATen/cpu/vec/vec_base.h ++@@ -36,6 +36,12 @@ ++ #include ++ #include ++ +++#if defined(__GNUC__) +++#define __FORCE_INLINE __attribute__((always_inline)) inline +++#elif defined(_MSC_VER) +++#define __FORCE_INLINE __forceinline +++#endif +++ ++ // These macros helped us unify vec_base.h ++ #ifdef CPU_CAPABILITY_AVX512 ++ #if defined(__GNUC__) ++diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt ++index a6b6f0f7d1d..15d37cf4861 100644 ++--- a/caffe2/CMakeLists.txt +++++ b/caffe2/CMakeLists.txt ++@@ -1787,7 +1787,7 @@ if(BUILD_TEST) ++ endif() ++ else() ++ add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") ++- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) +++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) ++ endif() ++ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) ++ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) ++diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD ++index 573f9c5b54a..f22a6e905e2 100644 ++--- a/third_party/sleef.BUILD +++++ b/third_party/sleef.BUILD ++@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [ ++ SLEEF_PRIVATE_INCLUDES = [ ++ "-Iexternal/sleef/src/arch", ++ "-Iexternal/sleef/src/common", +++ 
"-Iexternal/sleef/src/libm", ++ ] ++ ++ SLEEF_PUBLIC_INCLUDES = [ ++@@ -201,8 +202,6 @@ cc_library( ++ srcs = [ ++ "src/libm/rempitab.c", ++ "src/libm/sleefdp.c", ++- "src/libm/sleefld.c", ++- "src/libm/sleefqp.c", ++ "src/libm/sleefsp.c", ++ ], ++ hdrs = SLEEF_PUBLIC_HEADERS, ++-- ++2.45.1 ++ +diff --git a/python-torch.spec b/python-torch.spec +index d50687a5174a..63600c2e8c39 100644 +--- a/python-torch.spec ++++ b/python-torch.spec +@@ -176,6 +176,17 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch + Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch + %endif + ++# Enable x86 CPU vectorization on windows [submodule sleef] (#118980) ++# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370 ++# ++# Despite the title, this patch fixes compatibility with sleef 3.6 by including ++# a backwards-compatible version of the fix from ++# https://github.com/pytorch/pytorch/pull/122723. ++# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef ++# git submodule (because the release archive contains an actual sleef source ++# tree instead, so this would not apply.) 
++Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch ++ + %if %{with rocm} + # ROCm patches + # https://github.com/pytorch/pytorch/pull/120551 +-- +2.45.1 + diff --git a/0001-disable-use-of-aotriton.patch b/0001-disable-use-of-aotriton.patch index 34a1704..61ffd1e 100644 --- a/0001-disable-use-of-aotriton.patch +++ b/0001-disable-use-of-aotriton.patch @@ -1,46 +1,94 @@ -From 33d48f71db7530f00dbd8cff281b65aa8b355b2a Mon Sep 17 00:00:00 2001 +From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001 From: Tom Rix -Date: Tue, 19 Mar 2024 11:32:37 -0400 +Date: Sat, 29 Jun 2024 07:06:09 -0700 Subject: [PATCH] disable use of aotriton --- - aten/src/ATen/native/transformers/cuda/sdp_utils.cpp | 6 ++++++ - 1 file changed, 6 insertions(+) + .../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -index 96b839820efd..2d3dd0cb4b0f 100644 +index 214b02d8262e..7b3eb9dcd8cd 100644 --- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp -@@ -21,9 +21,11 @@ - #include - #include +@@ -19,9 +19,12 @@ + #include + #include +#ifdef USE_FLASH_ATTENTION #if USE_ROCM #include #endif +#endif ++ /** * Note [SDPA Runtime Dispatch] -@@ -183,6 +185,7 @@ bool check_sm_version(cudaDeviceProp * dprops) { - } +@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) { bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) { -+#ifdef USE_FLASH_ATTENTION // Check that the gpu is capable of running flash attention ++#ifndef USE_FLASH_ATTENTION ++ return false; ++#else using sm80 = SMVersion<8, 0>; using sm90 = SMVersion<9, 0>; -@@ -211,6 +214,9 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug + #if USE_ROCM +@@ -209,9 +215,13 @@ bool 
check_flash_attention_hardware_support(sdp_params const& params, bool debug } #endif return true; -+#else -+ return false; +#endif } bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) { ++#ifndef USE_FLASH_ATTENTION ++ return false; ++#else + // Mem Efficient attention supports hardware in the range [sm_50, sm_90] + using sm50 = SMVersion<5, 0>; + using sm90 = SMVersion<9, 0>; +@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) + } + #endif + return true; ++#endif + } + + bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89( +@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { + #ifndef USE_FLASH_ATTENTION + TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention."); + return false; +-#endif ++#else + + // Define gate functions that determine if a flash kernel can be ran + // Replace with std::to_array when we migrate to c++20 +@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) { + } + } + return true; ++#endif + } + + bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { + #ifndef USE_MEM_EFF_ATTENTION + TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention."); + return false; +-#endif ++#else + // Constraints specific to mem efficient attention + constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes = + array_of(at::kHalf, at::kFloat, at::kBFloat16); +@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) { + } + #endif + return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug); ++#endif + } + + SDPBackend select_sdp_backend(sdp_params const& kernel_params) { -- -2.44.0 +2.45.2 diff --git a/python-torch.spec b/python-torch.spec index 1dad38f..7bcc4bf 100644 --- a/python-torch.spec +++ b/python-torch.spec @@ -12,7 +12,7 @@ %global date0 20240709 %global pypi_version 
2.4.0 %else -%global pypi_version 2.3.1 +%global pypi_version 2.4.0 %endif # For -test subpackage @@ -63,18 +63,6 @@ %bcond_with distributed %endif -# OpenCV support came in F41 -%if 0%{?fedora} > 40 -%if %{without gitcommit} -%bcond_without opencv -%else -# USE_OPENCV removed in 2.4.0+ -%bcond_with opencv -%endif -%else -%bcond_with opencv -%endif - # Do no confuse xnnpack versions %if 0%{?fedora} > 40 %bcond_without xnnpack @@ -95,14 +83,10 @@ %endif %if 0%{?fedora} > 40 -%if %{with gitcommit} %bcond_without fbgemm %else %bcond_with fbgemm %endif -%else -%bcond_with fbgemm -%endif # For testing cuda %ifarch x86_64 @@ -139,15 +123,9 @@ %endif # These came in 2.4 and not yet in Fedora -%if %{with gitcommit} %bcond_with opentelemetry %bcond_with httplib %bcond_with kineto -%else -%bcond_without opentelemetry -%bcond_without httplib -%bcond_without kineto -%endif Name: python-%{pypi_name} %if %{with gitcommit} @@ -220,7 +198,6 @@ Source40: https://github.com/Maratyszcza/pthreadpool/archive/%{pt_commit}/ Source50: https://github.com/mreineck/pocketfft/archive/%{pf_commit}/pocketfft-%{pf_scommit}.tar.gz %endif -%if %{with gitcommit} %if %{without opentelemetry} %global ot_ver 1.14.2 Source60: https://github.com/open-telemetry/opentelemetry-cpp/archive/refs/tags/v%{ot_ver}.tar.gz @@ -237,50 +214,15 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp- %global ki_scommit %(c=%{ki_commit}; echo ${c:0:7}) Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz %endif -%endif Patch0: 0001-no-third_party-foxi.patch -%if %{with gitcommit} # https://github.com/pytorch/pytorch/pull/131282 Patch1: 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch -%endif - -%if %{without gitcommit} -Patch3: 0001-Stub-in-kineto-ActivityType.patch -%endif %if %{with caffe2} Patch6: 0001-reenable-foxi-linking.patch %endif -# Bring some patches forward -%if %{without gitcommit} -# https://github.com/pytorch/pytorch/pull/123384 -# Breaks on 
python 3.13 -# Patch7: 0001-Reenable-dim-for-python-3.12.patch - -# Dynamo/Inductor on 3.12 -# Fails to apply on 2.3.1 -# Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch -%endif - -%if %{without gitcommit} -# Enable x86 CPU vectorization on windows [submodule sleef] (#118980) -# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370 -# -# Despite the title, this patch fixes compatibility with sleef 3.6 by including -# a backwards-compatible version of the fix from -# https://github.com/pytorch/pytorch/pull/122723. -# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef -# git submodule (because the release archive contains an actual sleef source -# tree instead, so this would not apply.) -Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch - -# For Python 3.13 -# https://github.com/pytorch/pytorch/pull/126033 -Patch10: 0001-Changes-to-compile-with-3.13-126033.patch -%endif - # ROCm patches # Patches need to be refactored for ToT # These are ROCm packages @@ -291,9 +233,6 @@ Patch100: 0001-Optionally-use-hipblaslt.patch %endif Patch101: 0001-cuda-hip-signatures.patch Patch102: 0001-silence-an-assert.patch -%if %{without gitcommit} -Patch103: 0001-can-not-use-with-c-files.patch -%endif Patch105: 0001-disable-use-of-aotriton.patch %endif Patch106: 0001-include-fmt-ranges.h-for-using-fmt-join.patch @@ -416,10 +355,6 @@ BuildRequires: libcurand-devel-%{curand_ver} BuildRequires: libcusparse-devel-%{cusparse_ver} %endif -%if %{with opencv} -BuildRequires: opencv-devel -%endif - %if %{with test} BuildRequires: google-benchmark-devel %endif @@ -627,7 +562,6 @@ rm -rf third_party/pocketfft/* cp -r pocketfft-*/* third_party/pocketfft/ %endif -%if %{with gitcommit} %if %{without opentelemtry} tar xf %{SOURCE60} rm -rf third_party/opentelemetry-cpp/* @@ -645,16 +579,6 @@ tar xf %{SOURCE80} rm -rf third_party/kineto/* cp -r kineto-*/* third_party/kineto/ %endif -%endif - -%if %{with 
opencv} -%if %{without gitcommit} -# Reduce requirements, *FOUND is not set -sed -i -e 's/USE_OPENCV AND OpenCV_FOUND AND USE_FFMPEG AND FFMPEG_FOUND/USE_OPENCV AND USE_FFMPEG/' caffe2/video/CMakeLists.txt -sed -i -e 's/USE_OPENCV AND OpenCV_FOUND/USE_OPENCV/' caffe2/image/CMakeLists.txt -sed -i -e 's/STATUS/FATAL/' caffe2/image/CMakeLists.txt -%endif -%endif # hipblaslt only building with gfx90a %if %{with hipblaslt} @@ -810,18 +734,8 @@ mkdir third_party/pocketfft mkdir third_party/valgrind-headers cp %{_includedir}/valgrind/* third_party/valgrind-headers -%if %{without gitcommit} -# Remove unneeded OpenCL files that confuse the lincense scanner -rm caffe2/contrib/opencl/OpenCL/cl.hpp -rm caffe2/mobile/contrib/libopencl-stub/include/CL/*.h -rm caffe2/mobile/contrib/libopencl-stub/include/CL/*.hpp -%endif - # Fix installing to /usr/lib64 -%if %{with gitcommit} sed -i -e 's@DESTINATION ${PYTHON_LIB_REL_PATH}@DESTINATION ${CMAKE_INSTALL_PREFIX}/${PYTHON_LIB_REL_PATH}@' caffe2/CMakeLists.txt -%endif - %if %{with rocm} # hipify @@ -924,9 +838,6 @@ export USE_NNPACK=OFF export USE_NUMPY=ON export USE_OPENMP=ON export USE_PYTORCH_QNNPACK=OFF -%if %{without gitcommit} -export USE_QNNPACK=OFF -%endif export USE_ROCM=OFF export USE_SYSTEM_SLEEF=ON export USE_SYSTEM_EIGEN_INSTALL=ON @@ -949,12 +860,6 @@ export USE_SYSTEM_PSIMD=ON export USE_SYSTEM_XNNPACK=ON %endif -%if %{with caffe2} -%if %{without gitcommit} -export BUILD_CAFFE2=ON -%endif -%endif - %if %{with cuda} %if %{without rocm} export CPLUS_INCLUDE_PATH=/usr/local/cuda-%{cuda_ver}/include @@ -982,10 +887,6 @@ export USE_MPI=ON %endif %endif -%if %{with opencv} -export USE_OPENCV=ON -%endif - %if %{with test} export BUILD_TEST=ON %endif @@ -1097,11 +998,6 @@ done %{python3_sitearch}/%{pypi_name}-*.egg-info %{python3_sitearch}/functorch %{python3_sitearch}/torchgen -%if %{without gitcommit} -%if %{with caffe2} -%{python3_sitearch}/caffe2 -%endif -%endif %if %{with rocm} %files -n python3-%{pypi_name}-rocm-gfx8 diff 
--git a/sources b/sources index 60cce58..a4dbc9a 100644 --- a/sources +++ b/sources @@ -12,3 +12,7 @@ SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3 SHA512 (pytorch-97ff6cf.tar.gz) = 105ebcba298558fe833f90e7e40b003d35a74609e777f9dc4c47f5668c884f603455113ac0ff252a62b83c81137ae66ceb1a862d351203925dcfc3dcf9f73580 SHA512 (pytorch-v2.3.0.tar.gz) = 0c2ffc7bf2fd86070e9958c34eca1f03a0248a011ac6ffaeb69f65306ff856edd5359986f02af25888433187e6d7f29b60edded092e2ac30c8cec49023166eda SHA512 (pytorch-v2.3.1.tar.gz) = fe132251b2bae87b70ba3d95dc32f6a4545970d11893118b0ebe6ca129732e516ef4d6cc4f380b3db9bb2277d1db8ce78a401c40149bb1dfbab76eab9e3992c4 +SHA512 (pytorch-v2.4.0.tar.gz) = bcfca6aefee00d15d1c0a2456cd9d802d9a8e96816d421d10e3eed5c608bfdf23dfae492ca3638c0bae99ef5bb8c98f4774c0b9f1a8b94d4dc36a52226033314 +SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7 +SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55 +SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a