PyTorch 2.4

Signed-off-by: Tom Rix <trix@redhat.com>
This commit is contained in:
Tom Rix 2024-07-25 16:27:17 -06:00
commit 86185b46a2
7 changed files with 1468 additions and 273 deletions

4
.gitignore vendored
View file

@ -12,3 +12,7 @@
/pytorch-97ff6cf.tar.gz
/pytorch-v2.3.0.tar.gz
/pytorch-v2.3.1.tar.gz
/pytorch-v2.4.0.tar.gz
/v1.14.2.tar.gz
/cpp-httplib-3b6597b.tar.gz
/kineto-be13176.tar.gz

View file

@ -0,0 +1,47 @@
From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Sat, 20 Jul 2024 05:37:15 -0600
Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM
Signed-off-by: Tom Rix <trix@redhat.com>
---
CMakeLists.txt | 1 +
cmake/Dependencies.cmake | 3 ++-
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c4cd4b2c2a98..2068f7c6c4f2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF
"USE_CUDNN" OFF)
cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF)
option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
+option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF)
option(USE_KINETO "Use Kineto profiling library" ON)
option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index f1f2eb7cec31..192dac46f13b 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -706,6 +706,7 @@ endif()
# ---[ FBGEMM
if(USE_FBGEMM)
+ if (NOT USE_SYSTEM_FBGEMM)
set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
if(NOT DEFINED FBGEMM_SOURCE_DIR)
set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory")
@@ -746,7 +747,7 @@ if(USE_FBGEMM)
target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable)
endif()
endif()
-
+ endif()
if(USE_FBGEMM)
list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm)
endif()
--
2.45.1

View file

@ -1,174 +1,398 @@
From d77e05d90df006322cda021f1a8affdcc2c7eaef Mon Sep 17 00:00:00 2001
From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Fri, 23 Feb 2024 08:27:30 -0500
Date: Sat, 29 Jun 2024 04:18:34 -0700
Subject: [PATCH] Optionally use hipblaslt
The hipblaslt package is not available on Fedora.
Instead of requiring the package, make it optional.
If it is found, define the preprocessor variable HIPBLASLT
Convert the checks for ROCM_VERSION >= 507000 to HIPBLASLT checks
Signed-off-by: Tom Rix <trix@redhat.com>
---
aten/src/ATen/cuda/CUDABlas.cpp | 7 ++++---
aten/src/ATen/cuda/CUDABlas.h | 2 +-
aten/src/ATen/cuda/CUDAContextLight.h | 4 ++--
aten/src/ATen/cuda/CublasHandlePool.cpp | 4 ++--
aten/src/ATen/cuda/tunable/TunableGemm.h | 6 +++---
aten/src/ATen/native/cuda/Blas.cpp | 14 ++++++++------
cmake/Dependencies.cmake | 3 +++
cmake/public/LoadHIP.cmake | 4 ++--
8 files changed, 25 insertions(+), 19 deletions(-)
aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------
aten/src/ATen/cuda/CUDAContextLight.h | 4 +++
aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++--
aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++---
aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++-
cmake/Dependencies.cmake | 3 ++
cmake/public/LoadHIP.cmake | 2 +-
7 files changed, 82 insertions(+), 19 deletions(-)
diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp
index d534ec5a178..e815463f630 100644
index ce991a9bcad4..3f0d17b52778 100644
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@@ -14,7 +14,7 @@
@@ -14,7 +14,9 @@
#include <c10/util/irange.h>
#ifdef USE_ROCM
-#if ROCM_VERSION >= 60000
+#ifdef HIPBLASLT
+#ifdef USE_HIPBLASLT
#include <hipblaslt/hipblaslt-ext.hpp>
#endif
+#endif
// until hipblas has an API to accept flags, we must use rocblas here
@@ -781,7 +781,7 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}
}
#include <hipblas/hipblas.h>
#include <rocblas/rocblas.h>
@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) {
static size_t _parseChosenWorkspaceSize() {
const char * val = getenv("CUBLASLT_WORKSPACE_SIZE");
#ifdef USE_ROCM
+#ifndef USE_HIPBLASLT
+ return 0;
+#endif
if (!val) {
// accept either env var
val = getenv("HIPBLASLT_WORKSPACE_SIZE");
@@ -235,6 +240,7 @@ namespace at::cuda::blas {
} while (0)
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
#if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000
// only for rocm 5.7 where we first supported hipblaslt, it was difficult
@@ -912,6 +912,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
namespace {
// Following the pattern of CuSparseDescriptor
// Defined here for now because this is the only place cublas_lt interface is
@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
};
} // namespace
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
-
template <typename Dtype>
inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
cudaDataType_t abcType = CUDA_R_32F;
@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
" scaleType ",
scaleType);
}
-
+#endif
template <typename Dtype>
inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
@@ -608,10 +613,13 @@ void bgemm_internal<double>(CUDABLAS_BGEMM_ARGTYPES(double))
template <>
void bgemm_internal<float>(CUDABLAS_BGEMM_ARGTYPES(float))
{
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
bgemm_internal_cublaslt<float>(CUDABLAS_BGEMM_ARGS(float));
}
- else {
+ else
+#endif
+ {
bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGS(float));
}
}
@@ -651,10 +659,13 @@ void bgemm_internal<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<fl
template <>
void bgemm_internal<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half))
{
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
bgemm_internal_cublaslt<at::Half>(CUDABLAS_BGEMM_ARGS(at::Half));
}
- else {
+ else
+#endif
+ {
bgemm_internal_cublas<at::Half>(CUDABLAS_BGEMM_ARGS(at::Half));
}
}
@@ -662,10 +673,13 @@ void bgemm_internal<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half))
template <>
void bgemm_internal<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16))
{
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
bgemm_internal_cublaslt<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
- else {
+ else
+#endif
+ {
bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
}
@@ -781,11 +795,13 @@ void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
}
}
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
template <typename Dtype>
inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
// forward to bgemm implementation but set strides and batches to 0
bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0);
}
+#endif
template <typename Dtype>
inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
@@ -1008,10 +1024,13 @@ void gemm_internal<double>(CUDABLAS_GEMM_ARGTYPES(double))
template <>
void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
{
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
}
- else {
+ else
+#endif
+ {
gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGS(float));
}
}
@@ -1051,10 +1070,13 @@ void gemm_internal<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<floa
template <>
void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half))
{
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
gemm_internal_cublaslt<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));
}
- else {
+ else
+#endif
+ {
gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));
}
}
@@ -1062,10 +1084,13 @@ void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half))
template <>
void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16))
{
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
gemm_internal_cublaslt<at::BFloat16>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
- else {
+ else
+#endif
+ {
gemm_internal_cublas<at::BFloat16>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
}
@@ -1177,7 +1202,7 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}
}
-
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
template <typename Dtype>
void gemm_and_bias(
bool transpose_mat1,
@@ -1124,7 +1125,7 @@ template void gemm_and_bias(
at::BFloat16* result_ptr,
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation);
-
@@ -1410,7 +1435,7 @@ void scaled_gemm(
ScalarType result_dtype,
void* amax_ptr,
bool use_fast_accum) {
-#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
+#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
const int8_t fastAccuMode = use_fast_accum ? 1 : 0;
@@ -1681,6 +1706,7 @@ void int8_gemm(
" scaleType ",
scaleType);
}
+#endif
void scaled_gemm(
char transa,
char transb,
diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h
index eb12bb350c5..068607467dd 100644
--- a/aten/src/ATen/cuda/CUDABlas.h
+++ b/aten/src/ATen/cuda/CUDABlas.h
@@ -82,7 +82,7 @@ void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
template <>
void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
enum GEMMAndBiasActivationEpilogue {
None,
RELU,
template <>
void trsm<float>(CUDABLAS_TRSM_ARGTYPES(float)) {
diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h
index 4ec35f59a21..e28dc42034f 100644
index f2b657ced51b..f0ee613c4208 100644
--- a/aten/src/ATen/cuda/CUDAContextLight.h
+++ b/aten/src/ATen/cuda/CUDAContextLight.h
@@ -9,7 +9,7 @@
@@ -9,7 +9,9 @@
// cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also
// added bf16 support
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)))
#include <cublasLt.h>
#endif
+#endif
@@ -82,7 +82,7 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator();
#ifdef CUDART_VERSION
#include <cusolverDn.h>
@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator();
/* Handles */
TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle();
TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)))
TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
#endif
+#endif
TORCH_CUDA_CPP_API void clearCublasWorkspaces();
diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp
index 6913d2cd95e..3d4276be372 100644
index 8eac525b3695..abfdf7a23847 100644
--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
@@ -29,7 +29,7 @@ namespace at::cuda {
namespace {
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
+#if defined(USE_ROCM) && defined(HIPBLASLT)
-#if defined(USE_ROCM)
+#if defined(USE_ROCM) && defined(USE_HIPBLASLT)
void createCublasLtHandle(cublasLtHandle_t *handle) {
TORCH_CUDABLAS_CHECK(cublasLtCreate(handle));
}
@@ -190,7 +190,7 @@ cublasHandle_t getCurrentCUDABlasHandle() {
@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() {
return handle;
}
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
cublasLtHandle_t getCurrentCUDABlasLtHandle() {
-cublasLtHandle_t getCurrentCUDABlasLtHandle() {
#ifdef USE_ROCM
+#if defined(USE_HIPBLASLT)
+cublasLtHandle_t getCurrentCUDABlasLtHandle() {
c10::DeviceIndex device = 0;
AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() {
auto handle = myPoolWindow->reserve(device);
return handle;
+}
+#endif
#else
+cublasLtHandle_t getCurrentCUDABlasLtHandle() {
return reinterpret_cast<cublasLtHandle_t>(getCurrentCUDABlasHandle());
-#endif
}
+#endif
} // namespace at::cuda
diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h
index 3ba0d761277..dde1870cfbf 100644
index 53e6154120c9..fa1d664696db 100644
--- a/aten/src/ATen/cuda/tunable/TunableGemm.h
+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h
@@ -11,7 +11,7 @@
@@ -11,7 +11,9 @@
#include <ATen/cuda/tunable/GemmCommon.h>
#ifdef USE_ROCM
-#if ROCM_VERSION >= 50700
+#ifdef HIPBLASLT
+#ifdef USE_HIPBLASLT
#include <ATen/cuda/tunable/GemmHipblaslt.h>
#endif
+#endif
#include <ATen/cuda/tunable/GemmRocblas.h>
@@ -166,7 +166,7 @@ class GemmTunableOp : public TunableOp<GemmParams<T>, StreamTimer> {
}
#endif
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
+#if defined(USE_ROCM) && defined(HIPBLASLT)
static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
if (env == nullptr || strcmp(env, "1") == 0) {
// disallow tuning of hipblaslt with c10::complex
@@ -240,7 +240,7 @@ class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>
#include <ATen/cuda/tunable/StreamTimer.h>
@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>>
}
#endif
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
+#if defined(USE_ROCM) && defined(HIPBLASLT)
static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
if (env == nullptr || strcmp(env, "1") == 0) {
// disallow tuning of hipblaslt with c10::complex
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
index 29e5c5e3cf1..df56f3d7f1d 100644
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@@ -155,7 +155,7 @@ enum class Activation {
GELU,
};
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activation a) {
switch (a) {
case Activation::None:
@@ -193,6 +193,7 @@ static bool getDisableAddmmCudaLt() {
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
template <typename T>
class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
public:
@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
return OK;
}
};
+#endif
template <typename T>
inline bool IsZero(T v) {
@@ -191,6 +195,7 @@ static void AddRocblasValidator() {
}
}
+#ifdef USE_HIPBLASLT
static void AddHipblasltValidator() {
auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators();
if (validators.find("HIPBLASLT_VERSION") == validators.end()) {
@@ -205,6 +210,7 @@ static void AddHipblasltValidator() {
[hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; });
}
}
+#endif
static void AddRocmValidator() {
auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators();
@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp<GemmParams<T>, StreamTimer> {
}
AddRocblasValidator();
}
-
+#ifdef USE_HIPBLASLT
static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) {
rocm_validators = true;
@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp<GemmParams<T>, StreamTimer> {
}
AddHipblasltValidator();
}
-
+#endif
if (rocm_validators) {
AddRocmValidator();
}
@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>
}
AddRocblasValidator();
}
-
+#ifdef USE_HIPBLASLT
static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) {
rocm_validators = true;
@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>
}
AddHipblasltValidator();
}
-
+#endif
if (rocm_validators) {
AddRocmValidator();
}
@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>
}
};
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout>
class ScaledGemmTunableOp : public TunableOp<ScaledGemmParams<CT>, StreamTimer> {
public:
@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp<ScaledGemmParams<CT>, StreamTimer>
auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators();
#if defined(USE_ROCM)
+#ifdef USE_HIPBLASLT
for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps<AT, BT, CT, ALayout, BLayout>()) {
this->RegisterOp(std::move(name), std::move(op));
}
AddHipblasltValidator();
+#endif
AddRocmValidator();
#endif
}
@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp<ScaledGemmParams<CT>, StreamTimer>
"_", BlasOpToString(ALayout), BlasOpToString(BLayout));
}
};
+#endif
#undef XSTRINGIFY
#undef STRINGIFY
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
index 84c59a4fd0d7..56ad5de3bf2d 100644
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa
}
static bool getDisableAddmmCudaLt() {
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT");
#ifdef USE_ROCM
// if we enable tunable op, it'll take priority over just hipblaslt (heuristics)
@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() {
}
return false;
#endif
+#else
+ return true;
+#endif
}
#ifdef USE_ROCM
static bool isSupportedHipLtROCmArch(int index) {
+#if defined(HIPBLASLT)
+#ifdef USE_HIPBLASLT
hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index);
std::string device_arch = prop->gcnArchName;
static const std::vector<std::string> archs = {"gfx90a", "gfx940", "gfx941", "gfx942"};
@@ -203,6 +204,7 @@ static bool isSupportedHipLtROCmArch(int index) {
@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) {
}
}
TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!");
@ -176,87 +400,107 @@ index 29e5c5e3cf1..df56f3d7f1d 100644
return false;
}
#endif
@@ -228,7 +230,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
at::ScalarType scalar_type = self.scalar_type();
c10::MaybeOwned<Tensor> self_;
if (&result != &self) {
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER)) || defined(USE_ROCM) && ROCM_VERSION >= 50700
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER)) || defined(USE_ROCM) && defined(HIPBLASLT)
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM)
// Strangely, if mat2 has only 1 row or column, we get
// CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
// self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
@@ -271,7 +273,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
scalar_type != at::ScalarType::BFloat16));
#endif
}
+#endif
#endif
if (!useLtInterface) {
self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm");
}
self__sizes = self_->sizes();
} else {
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
+#if defined(USE_ROCM) && defined(HIPBLASLT)
-#if defined(USE_ROCM)
+#if defined(USE_ROCM) && defined(USE_HIPBLASLT)
useLtInterface = !disable_addmm_cuda_lt &&
result.dim() == 2 && result.is_contiguous() &&
isSupportedHipLtROCmArch(self.device().index()) &&
@@ -322,7 +324,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj());
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
if (useLtInterface) {
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
#if defined(USE_ROCM)
AT_DISPATCH_FLOATING_TYPES_AND2(
at::ScalarType::Half,
@@ -876,7 +878,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
at::native::resize_output(amax, {});
@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
activation_epilogue
);
});
+#endif
#endif
} else
{
@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
}
-#if !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
+#if !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) && defined(HIPBLASLT))
cublasCommonArgs args(mat1, mat2, out);
const auto out_dtype_ = args.result->scalar_type();
TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
@@ -906,7 +908,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
TORCH_CHECK(false, "_scaled_mm_out_cuda is not compiled for this platform.");
static bool _scaled_mm_allowed_device() {
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
auto dprops = at::cuda::getCurrentDeviceProperties();
#ifdef USE_ROCM
std::string device_arch = dprops->gcnArchName;
@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() {
#else
return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9);
#endif
+#else
+ return false;
+#endif
}
// Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax
@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
// Check sizes
bool allowed_device = _scaled_mm_allowed_device();
TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+");
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix");
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix");
TORCH_CHECK(
@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200
// ROCm's hipBLASLt does not support amax before 6.2, so calculate separately
amax = at::max(at::abs(out.to(kFloat)));
+#endif
#endif
-#if defined(USE_ROCM) && ROCM_VERSION >= 60000
+#if defined(USE_ROCM) && defined(HIPBLASLT)
// rocm's hipblaslt does not yet support amax, so calculate separately
auto out_float32 = out.to(kFloat);
out_float32.abs_();
return {out, amax};
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index b7ffbeb07dc..2b6c3678984 100644
index f1f2eb7cec31..8d05e834bbc5 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1273,6 +1273,9 @@ if(USE_ROCM)
if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "6.0.0")
list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2)
endif()
@@ -1052,6 +1052,9 @@ if(USE_ROCM)
list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
list(APPEND HIP_CXX_FLAGS -std=c++17)
list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2)
+ if(hipblaslt_FOUND)
+ list(APPEND HIP_CXX_FLAGS -DHIPBLASLT)
+ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT)
+ endif()
if(HIPBLASLT_CUSTOM_DATA_TYPE)
list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_CUSTOM_DATA_TYPE)
if(HIP_NEW_TYPE_ENUMS)
list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS)
endif()
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index f6ca263c5e5..53eb0b63c1a 100644
index fa39156031ff..df4836847fdf 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -156,7 +156,7 @@ if(HIP_FOUND)
@@ -155,7 +155,7 @@ if(HIP_FOUND)
find_package_and_print_version(hiprand REQUIRED)
find_package_and_print_version(rocblas REQUIRED)
find_package_and_print_version(hipblas REQUIRED)
if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0")
- find_package_and_print_version(hipblaslt REQUIRED)
+ find_package_and_print_version(hipblaslt)
endif()
- find_package_and_print_version(hipblaslt REQUIRED)
+ find_package_and_print_version(hipblaslt)
find_package_and_print_version(miopen REQUIRED)
if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0")
@@ -191,7 +191,7 @@ if(HIP_FOUND)
# roctx is part of roctracer
find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)
- if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0")
+ if(hipblaslt_FOUND)
# check whether hipblaslt is using its own datatype
set(file "${PROJECT_BINARY_DIR}/hipblaslt_test_data_type.cc")
file(WRITE ${file} ""
find_package_and_print_version(hipfft REQUIRED)
find_package_and_print_version(hipsparse REQUIRED)
--
2.43.2
2.45.2

View file

@ -0,0 +1,952 @@
From 273f23698c887b52e66c2abec8101b7398f0f9c4 Mon Sep 17 00:00:00 2001
From: "Benjamin A. Beasley" <code@musicinmybrain.net>
Date: Wed, 5 Jun 2024 11:06:02 -0400
Subject: [PATCH] Patch for sleef 3.6
---
...ectorization-on-windows-submodule-sl.patch | 910 ++++++++++++++++++
python-torch.spec | 11 +
2 files changed, 921 insertions(+)
create mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
new file mode 100644
index 000000000000..562f55b742c2
--- /dev/null
+++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
@@ -0,0 +1,910 @@
+From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001
+From: Xu Han <xu.han@intel.com>
+Date: Sun, 31 Mar 2024 03:07:32 +0000
+Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef]
+ (#118980)
+
+Enable VEC on Windows OS.
+1. Fix some type defination gap between Windows and Linux.
+2. Fix some operator not support on Windows, such as [], /.
+3. Enable static sleef library build on Windows.
+4. Disable unsupported function overloading on MSVC.
+5. Upgrade submodule sleef lib, which fixed build issue on Windows.
+6. Fixed bazel build issues.
+7. Fix test app not link to sleef on Windows.
+
+Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run:
+```cmd
+git submodule sync
+git submodule update --init --recursive
+```
+
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980
+Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet
+---
+ aten/src/ATen/CMakeLists.txt | 48 ++++++--------
+ aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++--
+ .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++--
+ .../cpu/vec/vec256/vec256_complex_double.h | 7 +-
+ .../cpu/vec/vec256/vec256_complex_float.h | 7 +-
+ aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +-
+ aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++--
+ aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++-
+ aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++--
+ .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++--
+ .../cpu/vec/vec512/vec512_complex_double.h | 7 +-
+ .../cpu/vec/vec512/vec512_complex_float.h | 7 +-
+ aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +-
+ aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++--
+ aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++-
+ aten/src/ATen/cpu/vec/vec_base.h | 6 ++
+ caffe2/CMakeLists.txt | 2 +-
+ third_party/sleef.BUILD | 3 +-
+ 18 files changed, 194 insertions(+), 93 deletions(-)
+
+diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
+index bf425af5fa9..58d5828e8ca 100644
+--- a/aten/src/ATen/CMakeLists.txt
++++ b/aten/src/ATen/CMakeLists.txt
+@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$")
+ list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo)
+ endif()
+
+-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
+- # Preserve values for the main build
+- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS})
+- set(__aten_sleef_build_tests ${BUILD_TESTS})
+-
+- # Unset our restrictive C++ flags here and reset them later.
+- # Remove this once we use proper target_compile_options.
+- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+- set(CMAKE_CXX_FLAGS)
+-
+- # Bump up optimization level for sleef to -O1, since at -O0 the compiler
+- # excessively spills intermediate vector registers to the stack
+- # and makes things run impossibly slowly
+- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
+- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0")
+- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
+- else()
+- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1")
++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
++ if(NOT MSVC)
++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler
++ # excessively spills intermediate vector registers to the stack
++ # and makes things run impossibly slowly
++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0")
++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
++ else()
++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1")
++ endif()
+ endif()
+
+ if(NOT USE_SYSTEM_SLEEF)
+- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE)
+- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE)
+- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE)
+- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE)
+- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE})
++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE)
++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE)
++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE)
++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE)
++ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." FORCE)
+ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+ set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE)
+@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
+ endif()
+ list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef)
+
+- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
+- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
+-
+- # Set these back. TODO: Use SLEEF_ to pass these instead
+- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE)
+- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE)
++ if(NOT MSVC)
++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
++ endif()
+ endif()
+
+ if(USE_CUDA AND NOT USE_ROCM)
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h
+index 800b027e469..c431fa3c605 100644
+--- a/aten/src/ATen/cpu/vec/vec256/vec256.h
++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h
+@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
+ }
+
+
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+@@ -94,7 +94,8 @@ inline Vectorized<double> cast<double, int64_t>(const Vectorized<int64_t>& src)
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-
++#ifndef _MSC_VER
++// MSVC is not working well on complex function overload.
+ template<int64_t scale = 1>
+ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
+ inline gather(const double* base_addr, const Vectorized<int64_t>& vindex) {
+@@ -106,9 +107,10 @@ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorize
+ inline gather(const float* base_addr, const Vectorized<int32_t>& vindex) {
+ return _mm256_i32gather_ps(base_addr, vindex, scale);
+ }
+-
++#endif
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-
++#ifndef _MSC_VER
++// MSVC is not working well on complex function overload.
+ template<int64_t scale = 1>
+ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
+ inline mask_gather(const Vectorized<double>& src, const double* base_addr,
+@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized<float>& src, const float* base_addr,
+ const Vectorized<int32_t>& vindex, Vectorized<float>& mask) {
+ return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale);
+ }
+-
++#endif
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ // Only works for inputs in the range: [-2^51, 2^51]
+@@ -302,6 +304,6 @@ inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
+ return flip8(v);
+ }
+
+-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#endif // (defined(CPU_CAPABILITY_AVX2)
+
+ }} // namepsace at::vec::CPU_CAPABILITY
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
+index 3e26213d6d2..66557436c70 100644
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
+@@ -7,7 +7,8 @@
+ #include <ATen/cpu/vec/vec_base.h>
+ #include <c10/util/irange.h>
+
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
++#define SLEEF_STATIC_LIBS
+ #include <sleef.h>
+ #endif
+
+@@ -18,7 +19,18 @@ namespace at::vec {
+ // See Note [CPU_CAPABILITY namespace]
+ inline namespace CPU_CAPABILITY {
+
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
++
++#ifndef SLEEF_CONST
++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER)
++#define SLEEF_CONST const
++#else
++#define SLEEF_CONST
++#endif
++#define SLEEF_CONST_OLD SLEEF_CONST
++#else
++#define SLEEF_CONST_OLD
++#endif
+
+ // bfloat16 conversion
+ static inline void cvtbf16_fp32(const __m128i& a, __m256& o) {
+@@ -265,7 +277,8 @@ public:
+ }
+ return b;
+ }
+- Vectorized<T> map(const __m256 (*const vop)(__m256)) const {
++
++ Vectorized<T> map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const {
+ __m256 lo, hi;
+ cvt_to_fp32<T>(values, lo, hi);
+ const auto o1 = vop(lo);
+@@ -1026,7 +1039,7 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16);
+ CONVERT_VECTORIZED_INIT(Half, half);
+
+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#else // defined(CPU_CAPABILITY_AVX2)
+
+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \
+ inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
+@@ -1051,9 +1064,9 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
+ CONVERT_NON_VECTORIZED_INIT(Half, half);
+
+-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#endif // defined(CPU_CAPABILITY_AVX2)
+
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \
+ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
+ auto values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data)); \
+@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vec
+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16);
+ LOAD_FP32_VECTORIZED_INIT(Half, fp16);
+
+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#else // defined(CPU_CAPABILITY_AVX2)
+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
+ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
+ __at_align__ float values[Vectorized<float>::size()]; \
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
+index f93ea1e63c3..6c198fb37d3 100644
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
+@@ -8,7 +8,8 @@
+ #include <ATen/cpu/vec/intrinsics.h>
+ #include <ATen/cpu/vec/vec_base.h>
+
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
++#define SLEEF_STATIC_LIBS
+ #include <sleef.h>
+ #endif
+
+@@ -16,7 +17,7 @@ namespace at::vec {
+ // See Note [CPU_CAPABILITY namespace]
+ inline namespace CPU_CAPABILITY {
+
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
+
+ template <> class Vectorized<c10::complex<double>> {
+ private:
+@@ -145,7 +146,7 @@ public:
+ auto abs = abs_();
+ auto zero = _mm256_setzero_pd();
+ auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ);
+- auto div = values / abs;
++ auto div = _mm256_div_pd(values, abs);
+ return _mm256_blendv_pd(div, zero, mask);
+ }
+ __m256d real_() const {
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
+index 7c142c04b79..c72d4d49274 100644
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
+@@ -7,7 +7,8 @@
+ #include <c10/util/irange.h>
+ #include <ATen/cpu/vec/intrinsics.h>
+ #include <ATen/cpu/vec/vec_base.h>
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
++#define SLEEF_STATIC_LIBS
+ #include <sleef.h>
+ #endif
+
+@@ -15,7 +16,7 @@ namespace at::vec {
+ // See Note [CPU_CAPABILITY namespace]
+ inline namespace CPU_CAPABILITY {
+
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
+
+ template <> class Vectorized<c10::complex<float>> {
+ private:
+@@ -180,7 +181,7 @@ public:
+ auto abs = abs_();
+ auto zero = _mm256_setzero_ps();
+ auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ);
+- auto div = values / abs;
++ auto div = _mm256_div_ps(values, abs);
+ return _mm256_blendv_ps(div, zero, mask);
+ }
+ __m256 real_() const {
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h
+index bc82d07edd1..bed6da627af 100644
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h
+@@ -6,7 +6,8 @@
+ #include <ATen/cpu/vec/intrinsics.h>
+ #include <ATen/cpu/vec/vec_base.h>
+ #include <c10/util/irange.h>
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
++#define SLEEF_STATIC_LIBS
+ #include <sleef.h>
+ #endif
+
+@@ -15,7 +16,7 @@ namespace at::vec {
+ inline namespace CPU_CAPABILITY {
+
+
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
+
+ template <> class Vectorized<double> {
+ private:
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h
+index 886809a0b8a..0e3664cd37b 100644
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h
+@@ -6,7 +6,8 @@
+ #include <ATen/cpu/vec/intrinsics.h>
+ #include <ATen/cpu/vec/vec_base.h>
+ #include <c10/util/irange.h>
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
++#define SLEEF_STATIC_LIBS
+ #include <sleef.h>
+ #endif
+
+@@ -14,7 +15,7 @@ namespace at::vec {
+ // See Note [CPU_CAPABILITY namespace]
+ inline namespace CPU_CAPABILITY {
+
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
+
+ template <> class Vectorized<float> {
+ private:
+@@ -226,14 +227,14 @@ public:
+ static __m256 vec_factorial_5 =
+ _mm256_set1_ps(0.00828929059f); // 1/factorial(5)
+ static __m256 vec_exp_log2ef =
+- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e)
++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e)
+ static __m256 vec_half = _mm256_set1_ps(0.5f);
+ static __m256 vec_one = _mm256_set1_ps(1.f);
+ static __m256 vec_zero = _mm256_set1_ps(0.f);
+ static __m256 vec_two = _mm256_set1_ps(2.f);
+- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2)
+- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50);
+- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218);
++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2)
++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50));
++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218));
+ static __m256i vec_127 = _mm256_set1_epi32(0x0000007f);
+ static int n_mantissa_bits = 23;
+
+@@ -266,7 +267,7 @@ public:
+ auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number);
+ auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127);
+ vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits);
+- auto vec_two_pow_n = (__m256)vec_two_pow_n_i;
++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i);
+ vec_two_pow_n =
+ _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask);
+
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
+index 4128841701a..85e099904cd 100644
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
+@@ -41,11 +41,17 @@
+ namespace at::vec {
+ inline namespace CPU_CAPABILITY {
+
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX2)
+
++#ifdef _MSC_VER
++__declspec(align(64)) struct Vectorizedqi {
++ protected:
++ __m256i vals;
++#else
+ struct Vectorizedqi {
+ protected:
+ __m256i vals __attribute__((aligned(64)));
++#endif
+
+ public:
+ Vectorizedqi() {}
+@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized<float> src) {
+ }
+
+ template <typename T>
+-inline void __attribute__((always_inline)) QuantizeAvx2(
++__FORCE_INLINE void QuantizeAvx2(
+ const float* src,
+ T* dst,
+ int len,
+@@ -1331,5 +1337,5 @@ Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const V
+ return a.maximum(b);
+ }
+
+-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
++#endif // if defined(CPU_CAPABILITY_AVX2)
+ }} // namespace at::vec::CPU_CAPABILITY
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h
+index fe96d123e64..87f723d782c 100644
+--- a/aten/src/ATen/cpu/vec/vec512/vec512.h
++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h
+@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
+ }
+
+
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX512)
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+@@ -80,7 +80,8 @@ inline Vectorized<double> cast<double, int64_t>(const Vectorized<int64_t>& src)
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-
++#ifndef _MSC_VER
++// MSVC is not working well on complex function overload.
+ template<int64_t scale = 1>
+ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
+ inline gather(const double* base_addr, const Vectorized<int64_t>& vindex) {
+@@ -92,9 +93,10 @@ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorize
+ inline gather(const float* base_addr, const Vectorized<int32_t>& vindex) {
+ return _mm512_i32gather_ps(vindex, base_addr, scale);
+ }
+-
++#endif
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-
++#ifndef _MSC_VER
++// MSVC is not working well on complex function overload.
+ template<int64_t scale = 1>
+ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
+ inline mask_gather(const Vectorized<double>& src, const double* base_addr,
+@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized<float>& src, const float* base_addr,
+ auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ);
+ return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale);
+ }
+-
++#endif
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ template<>
+@@ -270,6 +272,6 @@ inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
+ return flip8(v);
+ }
+
+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#endif // defined(CPU_CAPABILITY_AVX512)
+
+ }}}
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
+index f9fc92d52bf..eb3b6a72240 100644
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
+@@ -7,7 +7,8 @@
+ #include <ATen/cpu/vec/vec_base.h>
+ #include <c10/util/irange.h>
+
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX512)
++#define SLEEF_STATIC_LIBS
+ #include <sleef.h>
+ #endif
+
+@@ -16,7 +17,18 @@ namespace vec {
+ // See Note [CPU_CAPABILITY namespace]
+ inline namespace CPU_CAPABILITY {
+
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX512)
++
++#ifndef SLEEF_CONST
++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER)
++#define SLEEF_CONST const
++#else
++#define SLEEF_CONST
++#endif
++#define SLEEF_CONST_OLD SLEEF_CONST
++#else
++#define SLEEF_CONST_OLD
++#endif
+
+ // bfloat16 conversion
+ static inline void cvtbf16_fp32(const __m256i& a, __m512& o) {
+@@ -362,7 +374,8 @@ public:
+ }
+ #pragma clang diagnostic push
+ #pragma clang diagnostic ignored "-Wignored-qualifiers"
+- Vectorized<T> map(const __m512 (*const vop)(__m512)) const {
++
++ Vectorized<T> map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const {
+ __m512 lo, hi;
+ cvt_to_fp32<T>(values, lo, hi);
+ const auto o1 = vop(lo);
+@@ -1571,7 +1584,7 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16);
+ CONVERT_VECTORIZED_INIT(Half, half);
+
+-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#else //defined(CPU_CAPABILITY_AVX512)
+
+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \
+ inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
+@@ -1601,9 +1614,9 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
+ CONVERT_NON_VECTORIZED_INIT(Half, half);
+
+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#endif // defined(CPU_CAPABILITY_AVX512)
+
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX512)
+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \
+ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
+ auto values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)); \
+@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vec
+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16);
+ LOAD_FP32_VECTORIZED_INIT(Half, fp16);
+
+-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#else // defined(CPU_CAPABILITY_AVX512)
+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
+ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
+ __at_align__ float values[Vectorized<float>::size()]; \
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
+index 02aa3a87cc1..c35204f9da2 100644
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
+@@ -7,7 +7,8 @@
+ #include <c10/util/irange.h>
+ #include <ATen/cpu/vec/intrinsics.h>
+ #include <ATen/cpu/vec/vec_base.h>
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX512)
++#define SLEEF_STATIC_LIBS
+ #include <sleef.h>
+ #endif
+
+@@ -16,7 +17,7 @@ namespace vec {
+ // See Note [CPU_CAPABILITY namespace]
+ inline namespace CPU_CAPABILITY {
+
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX512)
+
+ template <> class Vectorized<c10::complex<double>> {
+ private:
+@@ -203,7 +204,7 @@ public:
+ auto abs = abs_();
+ auto zero = _mm512_setzero_pd();
+ auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ);
+- auto div = values / abs;
++ auto div = _mm512_div_pd(values, abs);
+ return _mm512_mask_blend_pd(mask, div, zero);
+ }
+ __m512d real_() const {
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
+index a5d790c98b2..2801e484d94 100644
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
+@@ -7,7 +7,8 @@
+ #include <c10/util/irange.h>
+ #include <ATen/cpu/vec/intrinsics.h>
+ #include <ATen/cpu/vec/vec_base.h>
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX512)
++#define SLEEF_STATIC_LIBS
+ #include <sleef.h>
+ #endif
+
+@@ -16,7 +17,7 @@ namespace vec {
+ // See Note [CPU_CAPABILITY namespace]
+ inline namespace CPU_CAPABILITY {
+
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX512)
+
+ template <> class Vectorized<c10::complex<float>> {
+ private:
+@@ -708,7 +709,7 @@ public:
+ auto abs = abs_();
+ auto zero = _mm512_setzero_ps();
+ auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ);
+- auto div = values / abs;
++ auto div = _mm512_div_ps(values, abs);
+ return _mm512_mask_blend_ps(mask, div, zero);
+ }
+ __m512 real_() const {
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
+index 27b2753c903..508ab257e60 100644
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
+@@ -6,7 +6,8 @@
+ #include <ATen/cpu/vec/intrinsics.h>
+ #include <ATen/cpu/vec/vec_base.h>
+ #include <c10/util/irange.h>
+-#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
++#if (defined(CPU_CAPABILITY_AVX512))
++#define SLEEF_STATIC_LIBS
+ #include <sleef.h>
+ #endif
+
+@@ -15,7 +16,7 @@ namespace vec {
+ // See Note [CPU_CAPABILITY namespace]
+ inline namespace CPU_CAPABILITY {
+
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX512)
+
+ template <> class Vectorized<double> {
+ private:
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h
+index ba5738687fd..a08df3c141a 100644
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h
+@@ -6,7 +6,8 @@
+ #include <ATen/cpu/vec/intrinsics.h>
+ #include <ATen/cpu/vec/vec_base.h>
+ #include <c10/util/irange.h>
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX512)
++#define SLEEF_STATIC_LIBS
+ #include <sleef.h>
+ #endif
+
+@@ -15,7 +16,7 @@ namespace vec {
+ // See Note [CPU_CAPABILITY namespace]
+ inline namespace CPU_CAPABILITY {
+
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX512)
+
+ template <> class Vectorized<float> {
+ private:
+@@ -246,14 +247,14 @@ public:
+ static __m512 vec_factorial_5 =
+ _mm512_set1_ps(0.00828929059f); // 1/factorial(5)
+ static __m512 vec_exp_log2ef =
+- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e)
++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
+ static __m512 vec_half = _mm512_set1_ps(0.5f);
+ static __m512 vec_one = _mm512_set1_ps(1.f);
+ static __m512 vec_zero = _mm512_set1_ps(0.f);
+ static __m512 vec_two = _mm512_set1_ps(2.f);
+- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2)
+- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50);
+- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218);
++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2)
++ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
+ static __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
+ static int n_mantissa_bits = 23;
+
+@@ -288,7 +289,7 @@ public:
+ auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number);
+ auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127);
+ vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits);
+- auto vec_two_pow_n = (__m512)vec_two_pow_n_i;
++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i);
+ vec_two_pow_n =
+ _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero);
+
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
+index e0713d01312..a5671ed4a50 100644
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
+@@ -42,11 +42,17 @@ namespace at {
+ namespace vec {
+ inline namespace CPU_CAPABILITY {
+
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
++#if defined(CPU_CAPABILITY_AVX512)
+
++#ifdef _MSC_VER
++__declspec(align(64)) struct Vectorizedqi {
++ protected:
++ __m512i vals;
++#else
+ struct Vectorizedqi {
+ protected:
+ __m512i vals __attribute__((aligned(64)));
++#endif
+
+ public:
+ Vectorizedqi() {}
+@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized<float> src) {
+ }
+
+ template <typename T>
+-inline void __attribute__((always_inline)) QuantizeAvx512(
++__FORCE_INLINE void QuantizeAvx512(
+ const float* src,
+ T* dst,
+ int len,
+@@ -525,10 +531,17 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
+ Vectorized<float> scale,
+ Vectorized<float> zero_point,
+ Vectorized<float> scale_neg_zp_premul) const {
++ #if defined(_MSC_VER) && !defined(__clang__)
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
++ #else
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
++ #endif
+
+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
+@@ -549,10 +562,17 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
+ float_vec_return_type dequantize(
+ Vectorized<float> scale,
+ Vectorized<float> zero_point) const {
++ #if defined(_MSC_VER) && !defined(__clang__)
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
++ #else
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
++ #endif
+
+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
+@@ -598,20 +618,34 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
+ }
+
+ int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
++ #if defined(_MSC_VER) && !defined(__clang__)
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
++ #else
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
++ #endif
+
+ __m512i int32_val0 = cvtepi8_epi32(int_val0);
+ __m512i int32_val1 = cvtepi8_epi32(int_val1);
+ __m512i int32_val2 = cvtepi8_epi32(int_val2);
+ __m512i int32_val3 = cvtepi8_epi32(int_val3);
+
++ #if defined(_MSC_VER) && !defined(__clang__)
++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]);
++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]);
++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]);
++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]);
++ #else
+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
++ #endif
+
+ __m512i int32_b0 = cvtepi8_epi32(int_b0);
+ __m512i int32_b1 = cvtepi8_epi32(int_b1);
+@@ -721,10 +755,17 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
+ Vectorized<float> scale,
+ Vectorized<float> zero_point,
+ Vectorized<float> scale_zp_premul) const {
++ #if defined(_MSC_VER) && !defined(__clang__)
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
++ #else
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
++ #endif
+
+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
+@@ -746,10 +787,17 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
+ float_vec_return_type dequantize(
+ Vectorized<float> scale,
+ Vectorized<float> zero_point) const {
++ #if defined(_MSC_VER) && !defined(__clang__)
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
++ #else
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
++ #endif
+
+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
+@@ -796,20 +844,34 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
+ }
+
+ int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
++ #if defined(_MSC_VER) && !defined(__clang__)
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
++ #else
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
++ #endif
+
+ __m512i int32_val0 = cvtepu8_epi32(int_val0);
+ __m512i int32_val1 = cvtepu8_epi32(int_val1);
+ __m512i int32_val2 = cvtepu8_epi32(int_val2);
+ __m512i int32_val3 = cvtepu8_epi32(int_val3);
+
++ #if defined(_MSC_VER) && !defined(__clang__)
++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]);
++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]);
++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]);
++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]);
++ #else
+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
++ #endif
+
+ __m512i int32_b0 = cvtepu8_epi32(int_b0);
+ __m512i int32_b1 = cvtepu8_epi32(int_b1);
+diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h
+index adf81dd915c..20cb8ef6dbc 100644
+--- a/aten/src/ATen/cpu/vec/vec_base.h
++++ b/aten/src/ATen/cpu/vec/vec_base.h
+@@ -36,6 +36,12 @@
+ #include <c10/util/irange.h>
+ #include <c10/util/Load.h>
+
++#if defined(__GNUC__)
++#define __FORCE_INLINE __attribute__((always_inline)) inline
++#elif defined(_MSC_VER)
++#define __FORCE_INLINE __forceinline
++#endif
++
+ // These macros helped us unify vec_base.h
+ #ifdef CPU_CAPABILITY_AVX512
+ #if defined(__GNUC__)
+diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
+index a6b6f0f7d1d..15d37cf4861 100644
+--- a/caffe2/CMakeLists.txt
++++ b/caffe2/CMakeLists.txt
+@@ -1787,7 +1787,7 @@ if(BUILD_TEST)
+ endif()
+ else()
+ add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")
+- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main)
++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main)
+ endif()
+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $<INSTALL_INTERFACE:include>)
+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include>)
+diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD
+index 573f9c5b54a..f22a6e905e2 100644
+--- a/third_party/sleef.BUILD
++++ b/third_party/sleef.BUILD
+@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [
+ SLEEF_PRIVATE_INCLUDES = [
+ "-Iexternal/sleef/src/arch",
+ "-Iexternal/sleef/src/common",
++ "-Iexternal/sleef/src/libm",
+ ]
+
+ SLEEF_PUBLIC_INCLUDES = [
+@@ -201,8 +202,6 @@ cc_library(
+ srcs = [
+ "src/libm/rempitab.c",
+ "src/libm/sleefdp.c",
+- "src/libm/sleefld.c",
+- "src/libm/sleefqp.c",
+ "src/libm/sleefsp.c",
+ ],
+ hdrs = SLEEF_PUBLIC_HEADERS,
+--
+2.45.1
+
diff --git a/python-torch.spec b/python-torch.spec
index d50687a5174a..63600c2e8c39 100644
--- a/python-torch.spec
+++ b/python-torch.spec
@@ -176,6 +176,17 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch
Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch
%endif
+# Enable x86 CPU vectorization on windows [submodule sleef] (#118980)
+# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370
+#
+# Despite the title, this patch fixes compatibility with sleef 3.6 by including
+# a backwards-compatible version of the fix from
+# https://github.com/pytorch/pytorch/pull/122723.
+# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef
+# git submodule (because the release archive contains an actual sleef source
+# tree instead, so this would not apply.)
+Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
+
%if %{with rocm}
# ROCm patches
# https://github.com/pytorch/pytorch/pull/120551
--
2.45.1

View file

@ -1,46 +1,94 @@
From 33d48f71db7530f00dbd8cff281b65aa8b355b2a Mon Sep 17 00:00:00 2001
From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Tue, 19 Mar 2024 11:32:37 -0400
Date: Sat, 29 Jun 2024 07:06:09 -0700
Subject: [PATCH] disable use of aotriton
---
aten/src/ATen/native/transformers/cuda/sdp_utils.cpp | 6 ++++++
1 file changed, 6 insertions(+)
.../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++--
1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
index 96b839820efd..2d3dd0cb4b0f 100644
index 214b02d8262e..7b3eb9dcd8cd 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
@@ -21,9 +21,11 @@
#include <cmath>
#include <functional>
@@ -19,9 +19,12 @@
#include <c10/core/SymInt.h>
#include <c10/util/string_view.h>
+#ifdef USE_FLASH_ATTENTION
#if USE_ROCM
#include <aotriton/flash.h>
#endif
+#endif
+
/**
* Note [SDPA Runtime Dispatch]
@@ -183,6 +185,7 @@ bool check_sm_version(cudaDeviceProp * dprops) {
}
@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) {
bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) {
+#ifdef USE_FLASH_ATTENTION
// Check that the gpu is capable of running flash attention
+#ifndef USE_FLASH_ATTENTION
+ return false;
+#else
using sm80 = SMVersion<8, 0>;
using sm90 = SMVersion<9, 0>;
@@ -211,6 +214,9 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
#if USE_ROCM
@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
}
#endif
return true;
+#else
+ return false;
+#endif
}
bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) {
+#ifndef USE_FLASH_ATTENTION
+ return false;
+#else
// Mem Efficient attention supports hardware in the range [sm_50, sm_90]
using sm50 = SMVersion<5, 0>;
using sm90 = SMVersion<9, 0>;
@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug)
}
#endif
return true;
+#endif
}
bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89(
@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) {
#ifndef USE_FLASH_ATTENTION
TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention.");
return false;
-#endif
+#else
// Define gate functions that determine if a flash kernel can be ran
// Replace with std::to_array when we migrate to c++20
@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) {
}
}
return true;
+#endif
}
bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) {
#ifndef USE_MEM_EFF_ATTENTION
TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention.");
return false;
-#endif
+#else
// Constraints specific to mem efficient attention
constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes =
array_of<at::ScalarType>(at::kHalf, at::kFloat, at::kBFloat16);
@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) {
}
#endif
return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug);
+#endif
}
SDPBackend select_sdp_backend(sdp_params const& kernel_params) {
--
2.44.0
2.45.2

View file

@ -12,7 +12,7 @@
%global date0 20240709
%global pypi_version 2.4.0
%else
%global pypi_version 2.3.1
%global pypi_version 2.4.0
%endif
# For -test subpackage
@ -63,18 +63,6 @@
%bcond_with distributed
%endif
# OpenCV support came in F41
%if 0%{?fedora} > 40
%if %{without gitcommit}
%bcond_without opencv
%else
# USE_OPENCV removed in 2.4.0+
%bcond_with opencv
%endif
%else
%bcond_with opencv
%endif
# Do not confuse xnnpack versions
%if 0%{?fedora} > 40
%bcond_without xnnpack
@ -95,14 +83,10 @@
%endif
%if 0%{?fedora} > 40
%if %{with gitcommit}
%bcond_without fbgemm
%else
%bcond_with fbgemm
%endif
%else
%bcond_with fbgemm
%endif
# For testing cuda
%ifarch x86_64
@ -139,15 +123,9 @@
%endif
# These came in 2.4 and not yet in Fedora
%if %{with gitcommit}
%bcond_with opentelemetry
%bcond_with httplib
%bcond_with kineto
%else
%bcond_without opentelemetry
%bcond_without httplib
%bcond_without kineto
%endif
Name: python-%{pypi_name}
%if %{with gitcommit}
@ -220,7 +198,6 @@ Source40: https://github.com/Maratyszcza/pthreadpool/archive/%{pt_commit}/
Source50: https://github.com/mreineck/pocketfft/archive/%{pf_commit}/pocketfft-%{pf_scommit}.tar.gz
%endif
%if %{with gitcommit}
%if %{without opentelemetry}
%global ot_ver 1.14.2
Source60: https://github.com/open-telemetry/opentelemetry-cpp/archive/refs/tags/v%{ot_ver}.tar.gz
@ -237,50 +214,15 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp-
%global ki_scommit %(c=%{ki_commit}; echo ${c:0:7})
Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz
%endif
%endif
Patch0: 0001-no-third_party-foxi.patch
%if %{with gitcommit}
# https://github.com/pytorch/pytorch/pull/131282
Patch1: 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch
%endif
%if %{without gitcommit}
Patch3: 0001-Stub-in-kineto-ActivityType.patch
%endif
%if %{with caffe2}
Patch6: 0001-reenable-foxi-linking.patch
%endif
# Bring some patches forward
%if %{without gitcommit}
# https://github.com/pytorch/pytorch/pull/123384
# Breaks on python 3.13
# Patch7: 0001-Reenable-dim-for-python-3.12.patch
# Dynamo/Inductor on 3.12
# Fails to apply on 2.3.1
# Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch
%endif
%if %{without gitcommit}
# Enable x86 CPU vectorization on windows [submodule sleef] (#118980)
# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370
#
# Despite the title, this patch fixes compatibility with sleef 3.6 by including
# a backwards-compatible version of the fix from
# https://github.com/pytorch/pytorch/pull/122723.
# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef
# git submodule (because the release archive contains an actual sleef source
# tree instead, so this would not apply.)
Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
# For Python 3.13
# https://github.com/pytorch/pytorch/pull/126033
Patch10: 0001-Changes-to-compile-with-3.13-126033.patch
%endif
# ROCm patches
# Patches need to be refactored for ToT
# These are ROCm packages
@ -291,9 +233,6 @@ Patch100: 0001-Optionally-use-hipblaslt.patch
%endif
Patch101: 0001-cuda-hip-signatures.patch
Patch102: 0001-silence-an-assert.patch
%if %{without gitcommit}
Patch103: 0001-can-not-use-with-c-files.patch
%endif
Patch105: 0001-disable-use-of-aotriton.patch
%endif
Patch106: 0001-include-fmt-ranges.h-for-using-fmt-join.patch
@ -416,10 +355,6 @@ BuildRequires: libcurand-devel-%{curand_ver}
BuildRequires: libcusparse-devel-%{cusparse_ver}
%endif
%if %{with opencv}
BuildRequires: opencv-devel
%endif
%if %{with test}
BuildRequires: google-benchmark-devel
%endif
@ -627,7 +562,6 @@ rm -rf third_party/pocketfft/*
cp -r pocketfft-*/* third_party/pocketfft/
%endif
%if %{with gitcommit}
%if %{without opentelemetry}
tar xf %{SOURCE60}
rm -rf third_party/opentelemetry-cpp/*
@ -645,16 +579,6 @@ tar xf %{SOURCE80}
rm -rf third_party/kineto/*
cp -r kineto-*/* third_party/kineto/
%endif
%endif
%if %{with opencv}
%if %{without gitcommit}
# Reduce requirements, *FOUND is not set
sed -i -e 's/USE_OPENCV AND OpenCV_FOUND AND USE_FFMPEG AND FFMPEG_FOUND/USE_OPENCV AND USE_FFMPEG/' caffe2/video/CMakeLists.txt
sed -i -e 's/USE_OPENCV AND OpenCV_FOUND/USE_OPENCV/' caffe2/image/CMakeLists.txt
sed -i -e 's/STATUS/FATAL/' caffe2/image/CMakeLists.txt
%endif
%endif
# hipblaslt only building with gfx90a
%if %{with hipblaslt}
@ -810,18 +734,8 @@ mkdir third_party/pocketfft
mkdir third_party/valgrind-headers
cp %{_includedir}/valgrind/* third_party/valgrind-headers
%if %{without gitcommit}
# Remove unneeded OpenCL files that confuse the license scanner
rm caffe2/contrib/opencl/OpenCL/cl.hpp
rm caffe2/mobile/contrib/libopencl-stub/include/CL/*.h
rm caffe2/mobile/contrib/libopencl-stub/include/CL/*.hpp
%endif
# Fix installing to /usr/lib64
%if %{with gitcommit}
sed -i -e 's@DESTINATION ${PYTHON_LIB_REL_PATH}@DESTINATION ${CMAKE_INSTALL_PREFIX}/${PYTHON_LIB_REL_PATH}@' caffe2/CMakeLists.txt
%endif
%if %{with rocm}
# hipify
@ -924,9 +838,6 @@ export USE_NNPACK=OFF
export USE_NUMPY=ON
export USE_OPENMP=ON
export USE_PYTORCH_QNNPACK=OFF
%if %{without gitcommit}
export USE_QNNPACK=OFF
%endif
export USE_ROCM=OFF
export USE_SYSTEM_SLEEF=ON
export USE_SYSTEM_EIGEN_INSTALL=ON
@ -949,12 +860,6 @@ export USE_SYSTEM_PSIMD=ON
export USE_SYSTEM_XNNPACK=ON
%endif
%if %{with caffe2}
%if %{without gitcommit}
export BUILD_CAFFE2=ON
%endif
%endif
%if %{with cuda}
%if %{without rocm}
export CPLUS_INCLUDE_PATH=/usr/local/cuda-%{cuda_ver}/include
@ -982,10 +887,6 @@ export USE_MPI=ON
%endif
%endif
%if %{with opencv}
export USE_OPENCV=ON
%endif
%if %{with test}
export BUILD_TEST=ON
%endif
@ -1097,11 +998,6 @@ done
%{python3_sitearch}/%{pypi_name}-*.egg-info
%{python3_sitearch}/functorch
%{python3_sitearch}/torchgen
%if %{without gitcommit}
%if %{with caffe2}
%{python3_sitearch}/caffe2
%endif
%endif
%if %{with rocm}
%files -n python3-%{pypi_name}-rocm-gfx8

View file

@ -12,3 +12,7 @@ SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3
SHA512 (pytorch-97ff6cf.tar.gz) = 105ebcba298558fe833f90e7e40b003d35a74609e777f9dc4c47f5668c884f603455113ac0ff252a62b83c81137ae66ceb1a862d351203925dcfc3dcf9f73580
SHA512 (pytorch-v2.3.0.tar.gz) = 0c2ffc7bf2fd86070e9958c34eca1f03a0248a011ac6ffaeb69f65306ff856edd5359986f02af25888433187e6d7f29b60edded092e2ac30c8cec49023166eda
SHA512 (pytorch-v2.3.1.tar.gz) = fe132251b2bae87b70ba3d95dc32f6a4545970d11893118b0ebe6ca129732e516ef4d6cc4f380b3db9bb2277d1db8ce78a401c40149bb1dfbab76eab9e3992c4
SHA512 (pytorch-v2.4.0.tar.gz) = bcfca6aefee00d15d1c0a2456cd9d802d9a8e96816d421d10e3eed5c608bfdf23dfae492ca3638c0bae99ef5bb8c98f4774c0b9f1a8b94d4dc36a52226033314
SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7
SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55
SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a