PyTorch 2.4
Signed-off-by: Tom Rix <trix@redhat.com>
This commit is contained in:
parent
2debc89ffd
commit
86185b46a2
7 changed files with 1468 additions and 273 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -12,3 +12,7 @@
|
|||
/pytorch-97ff6cf.tar.gz
|
||||
/pytorch-v2.3.0.tar.gz
|
||||
/pytorch-v2.3.1.tar.gz
|
||||
/pytorch-v2.4.0.tar.gz
|
||||
/v1.14.2.tar.gz
|
||||
/cpp-httplib-3b6597b.tar.gz
|
||||
/kineto-be13176.tar.gz
|
||||
|
|
|
|||
47
0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch
Normal file
47
0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
From 091b7fe1ccbb5e4ff4ac6017d42bacb869f61a27 Mon Sep 17 00:00:00 2001
|
||||
From: Tom Rix <trix@redhat.com>
|
||||
Date: Sat, 20 Jul 2024 05:37:15 -0600
|
||||
Subject: [PATCH] Add cmake option USE_SYSTEM_FBGEMM
|
||||
|
||||
Signed-off-by: Tom Rix <trix@redhat.com>
|
||||
---
|
||||
CMakeLists.txt | 1 +
|
||||
cmake/Dependencies.cmake | 3 ++-
|
||||
2 files changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
||||
index c4cd4b2c2a98..2068f7c6c4f2 100644
|
||||
--- a/CMakeLists.txt
|
||||
+++ b/CMakeLists.txt
|
||||
@@ -253,6 +253,7 @@ cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF
|
||||
"USE_CUDNN" OFF)
|
||||
cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF)
|
||||
option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
|
||||
+option(USE_SYSTEM_FBGEMM "Use system-wide FBGEMM" OFF)
|
||||
option(USE_KINETO "Use Kineto profiling library" ON)
|
||||
option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
|
||||
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
|
||||
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
|
||||
index f1f2eb7cec31..192dac46f13b 100644
|
||||
--- a/cmake/Dependencies.cmake
|
||||
+++ b/cmake/Dependencies.cmake
|
||||
@@ -706,6 +706,7 @@ endif()
|
||||
|
||||
# ---[ FBGEMM
|
||||
if(USE_FBGEMM)
|
||||
+ if (NOT USE_SYSTEM_FBGEMM)
|
||||
set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party")
|
||||
if(NOT DEFINED FBGEMM_SOURCE_DIR)
|
||||
set(FBGEMM_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/fbgemm" CACHE STRING "FBGEMM source directory")
|
||||
@@ -746,7 +747,7 @@ if(USE_FBGEMM)
|
||||
target_compile_options_if_supported(asmjit -Wno-unused-but-set-variable)
|
||||
endif()
|
||||
endif()
|
||||
-
|
||||
+ endif()
|
||||
if(USE_FBGEMM)
|
||||
list(APPEND Caffe2_DEPENDENCY_LIBS fbgemm)
|
||||
endif()
|
||||
--
|
||||
2.45.1
|
||||
|
||||
|
|
@ -1,174 +1,398 @@
|
|||
From d77e05d90df006322cda021f1a8affdcc2c7eaef Mon Sep 17 00:00:00 2001
|
||||
From f1d65e958afa65882dbfea8b392ab847a84d41ed Mon Sep 17 00:00:00 2001
|
||||
From: Tom Rix <trix@redhat.com>
|
||||
Date: Fri, 23 Feb 2024 08:27:30 -0500
|
||||
Date: Sat, 29 Jun 2024 04:18:34 -0700
|
||||
Subject: [PATCH] Optionally use hipblaslt
|
||||
|
||||
The hipblaslt package is not available on Fedora.
|
||||
Instead of requiring the package, make it optional.
|
||||
If it is found, define the preprocessor variable HIPBLASLT
|
||||
Convert the checks for ROCM_VERSION >= 507000 to HIPBLASLT checks
|
||||
|
||||
Signed-off-by: Tom Rix <trix@redhat.com>
|
||||
---
|
||||
aten/src/ATen/cuda/CUDABlas.cpp | 7 ++++---
|
||||
aten/src/ATen/cuda/CUDABlas.h | 2 +-
|
||||
aten/src/ATen/cuda/CUDAContextLight.h | 4 ++--
|
||||
aten/src/ATen/cuda/CublasHandlePool.cpp | 4 ++--
|
||||
aten/src/ATen/cuda/tunable/TunableGemm.h | 6 +++---
|
||||
aten/src/ATen/native/cuda/Blas.cpp | 14 ++++++++------
|
||||
cmake/Dependencies.cmake | 3 +++
|
||||
cmake/public/LoadHIP.cmake | 4 ++--
|
||||
8 files changed, 25 insertions(+), 19 deletions(-)
|
||||
aten/src/ATen/cuda/CUDABlas.cpp | 46 ++++++++++++++++++------
|
||||
aten/src/ATen/cuda/CUDAContextLight.h | 4 +++
|
||||
aten/src/ATen/cuda/CublasHandlePool.cpp | 10 ++++--
|
||||
aten/src/ATen/cuda/tunable/TunableGemm.h | 18 +++++++---
|
||||
aten/src/ATen/native/cuda/Blas.cpp | 18 +++++++++-
|
||||
cmake/Dependencies.cmake | 3 ++
|
||||
cmake/public/LoadHIP.cmake | 2 +-
|
||||
7 files changed, 82 insertions(+), 19 deletions(-)
|
||||
|
||||
diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp
|
||||
index d534ec5a178..e815463f630 100644
|
||||
index ce991a9bcad4..3f0d17b52778 100644
|
||||
--- a/aten/src/ATen/cuda/CUDABlas.cpp
|
||||
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
|
||||
@@ -14,7 +14,7 @@
|
||||
@@ -14,7 +14,9 @@
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#ifdef USE_ROCM
|
||||
-#if ROCM_VERSION >= 60000
|
||||
+#ifdef HIPBLASLT
|
||||
+#ifdef USE_HIPBLASLT
|
||||
#include <hipblaslt/hipblaslt-ext.hpp>
|
||||
#endif
|
||||
+#endif
|
||||
// until hipblas has an API to accept flags, we must use rocblas here
|
||||
@@ -781,7 +781,7 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
|
||||
}
|
||||
}
|
||||
#include <hipblas/hipblas.h>
|
||||
#include <rocblas/rocblas.h>
|
||||
@@ -182,6 +184,9 @@ uint32_t _getAlignment(uintptr_t address) {
|
||||
static size_t _parseChosenWorkspaceSize() {
|
||||
const char * val = getenv("CUBLASLT_WORKSPACE_SIZE");
|
||||
#ifdef USE_ROCM
|
||||
+#ifndef USE_HIPBLASLT
|
||||
+ return 0;
|
||||
+#endif
|
||||
if (!val) {
|
||||
// accept either env var
|
||||
val = getenv("HIPBLASLT_WORKSPACE_SIZE");
|
||||
@@ -235,6 +240,7 @@ namespace at::cuda::blas {
|
||||
} while (0)
|
||||
|
||||
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
|
||||
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
|
||||
|
||||
#if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000
|
||||
// only for rocm 5.7 where we first supported hipblaslt, it was difficult
|
||||
@@ -912,6 +912,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
namespace {
|
||||
// Following the pattern of CuSparseDescriptor
|
||||
// Defined here for now because this is the only place cublas_lt interface is
|
||||
@@ -318,7 +324,6 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
|
||||
};
|
||||
} // namespace
|
||||
|
||||
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
|
||||
-
|
||||
template <typename Dtype>
|
||||
inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
|
||||
cudaDataType_t abcType = CUDA_R_32F;
|
||||
@@ -452,7 +457,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
|
||||
" scaleType ",
|
||||
scaleType);
|
||||
}
|
||||
-
|
||||
+#endif
|
||||
|
||||
template <typename Dtype>
|
||||
inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
|
||||
@@ -608,10 +613,13 @@ void bgemm_internal<double>(CUDABLAS_BGEMM_ARGTYPES(double))
|
||||
template <>
|
||||
void bgemm_internal<float>(CUDABLAS_BGEMM_ARGTYPES(float))
|
||||
{
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
|
||||
bgemm_internal_cublaslt<float>(CUDABLAS_BGEMM_ARGS(float));
|
||||
}
|
||||
- else {
|
||||
+ else
|
||||
+#endif
|
||||
+ {
|
||||
bgemm_internal_cublas<float>(CUDABLAS_BGEMM_ARGS(float));
|
||||
}
|
||||
}
|
||||
@@ -651,10 +659,13 @@ void bgemm_internal<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::complex<fl
|
||||
template <>
|
||||
void bgemm_internal<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half))
|
||||
{
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
|
||||
bgemm_internal_cublaslt<at::Half>(CUDABLAS_BGEMM_ARGS(at::Half));
|
||||
}
|
||||
- else {
|
||||
+ else
|
||||
+#endif
|
||||
+ {
|
||||
bgemm_internal_cublas<at::Half>(CUDABLAS_BGEMM_ARGS(at::Half));
|
||||
}
|
||||
}
|
||||
@@ -662,10 +673,13 @@ void bgemm_internal<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half))
|
||||
template <>
|
||||
void bgemm_internal<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16))
|
||||
{
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
|
||||
bgemm_internal_cublaslt<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
|
||||
}
|
||||
- else {
|
||||
+ else
|
||||
+#endif
|
||||
+ {
|
||||
bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
|
||||
}
|
||||
}
|
||||
@@ -781,11 +795,13 @@ void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
|
||||
}
|
||||
}
|
||||
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
template <typename Dtype>
|
||||
inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
|
||||
// forward to bgemm implementation but set strides and batches to 0
|
||||
bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0);
|
||||
}
|
||||
+#endif
|
||||
|
||||
template <typename Dtype>
|
||||
inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
|
||||
@@ -1008,10 +1024,13 @@ void gemm_internal<double>(CUDABLAS_GEMM_ARGTYPES(double))
|
||||
template <>
|
||||
void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
|
||||
{
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
|
||||
gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
|
||||
}
|
||||
- else {
|
||||
+ else
|
||||
+#endif
|
||||
+ {
|
||||
gemm_internal_cublas<float>(CUDABLAS_GEMM_ARGS(float));
|
||||
}
|
||||
}
|
||||
@@ -1051,10 +1070,13 @@ void gemm_internal<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<floa
|
||||
template <>
|
||||
void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half))
|
||||
{
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
|
||||
gemm_internal_cublaslt<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));
|
||||
}
|
||||
- else {
|
||||
+ else
|
||||
+#endif
|
||||
+ {
|
||||
gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));
|
||||
}
|
||||
}
|
||||
@@ -1062,10 +1084,13 @@ void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half))
|
||||
template <>
|
||||
void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16))
|
||||
{
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
|
||||
gemm_internal_cublaslt<at::BFloat16>(CUDABLAS_GEMM_ARGS(at::BFloat16));
|
||||
}
|
||||
- else {
|
||||
+ else
|
||||
+#endif
|
||||
+ {
|
||||
gemm_internal_cublas<at::BFloat16>(CUDABLAS_GEMM_ARGS(at::BFloat16));
|
||||
}
|
||||
}
|
||||
@@ -1177,7 +1202,7 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
|
||||
}
|
||||
}
|
||||
|
||||
-
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
template <typename Dtype>
|
||||
void gemm_and_bias(
|
||||
bool transpose_mat1,
|
||||
@@ -1124,7 +1125,7 @@ template void gemm_and_bias(
|
||||
at::BFloat16* result_ptr,
|
||||
int64_t result_ld,
|
||||
GEMMAndBiasActivationEpilogue activation);
|
||||
-
|
||||
@@ -1410,7 +1435,7 @@ void scaled_gemm(
|
||||
ScalarType result_dtype,
|
||||
void* amax_ptr,
|
||||
bool use_fast_accum) {
|
||||
-#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
|
||||
+#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
const auto computeType = CUBLAS_COMPUTE_32F;
|
||||
const auto scaleType = CUDA_R_32F;
|
||||
const int8_t fastAccuMode = use_fast_accum ? 1 : 0;
|
||||
@@ -1681,6 +1706,7 @@ void int8_gemm(
|
||||
" scaleType ",
|
||||
scaleType);
|
||||
}
|
||||
+#endif
|
||||
void scaled_gemm(
|
||||
char transa,
|
||||
char transb,
|
||||
diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h
|
||||
index eb12bb350c5..068607467dd 100644
|
||||
--- a/aten/src/ATen/cuda/CUDABlas.h
|
||||
+++ b/aten/src/ATen/cuda/CUDABlas.h
|
||||
@@ -82,7 +82,7 @@ void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
|
||||
template <>
|
||||
void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
|
||||
|
||||
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
|
||||
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
|
||||
enum GEMMAndBiasActivationEpilogue {
|
||||
None,
|
||||
RELU,
|
||||
template <>
|
||||
void trsm<float>(CUDABLAS_TRSM_ARGTYPES(float)) {
|
||||
diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h
|
||||
index 4ec35f59a21..e28dc42034f 100644
|
||||
index f2b657ced51b..f0ee613c4208 100644
|
||||
--- a/aten/src/ATen/cuda/CUDAContextLight.h
|
||||
+++ b/aten/src/ATen/cuda/CUDAContextLight.h
|
||||
@@ -9,7 +9,7 @@
|
||||
@@ -9,7 +9,9 @@
|
||||
|
||||
// cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also
|
||||
// added bf16 support
|
||||
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
|
||||
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
|
||||
+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)))
|
||||
#include <cublasLt.h>
|
||||
#endif
|
||||
+#endif
|
||||
|
||||
@@ -82,7 +82,7 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator();
|
||||
#ifdef CUDART_VERSION
|
||||
#include <cusolverDn.h>
|
||||
@@ -80,7 +82,9 @@ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator();
|
||||
/* Handles */
|
||||
TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle();
|
||||
TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
|
||||
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
|
||||
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
|
||||
+#if (!defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT)))
|
||||
TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
|
||||
#endif
|
||||
+#endif
|
||||
|
||||
TORCH_CUDA_CPP_API void clearCublasWorkspaces();
|
||||
|
||||
diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp
|
||||
index 6913d2cd95e..3d4276be372 100644
|
||||
index 8eac525b3695..abfdf7a23847 100644
|
||||
--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
|
||||
+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
|
||||
@@ -29,7 +29,7 @@ namespace at::cuda {
|
||||
|
||||
namespace {
|
||||
|
||||
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
|
||||
+#if defined(USE_ROCM) && defined(HIPBLASLT)
|
||||
-#if defined(USE_ROCM)
|
||||
+#if defined(USE_ROCM) && defined(USE_HIPBLASLT)
|
||||
void createCublasLtHandle(cublasLtHandle_t *handle) {
|
||||
TORCH_CUDABLAS_CHECK(cublasLtCreate(handle));
|
||||
}
|
||||
@@ -190,7 +190,7 @@ cublasHandle_t getCurrentCUDABlasHandle() {
|
||||
@@ -191,8 +191,9 @@ cublasHandle_t getCurrentCUDABlasHandle() {
|
||||
return handle;
|
||||
}
|
||||
|
||||
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
|
||||
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
|
||||
cublasLtHandle_t getCurrentCUDABlasLtHandle() {
|
||||
-cublasLtHandle_t getCurrentCUDABlasLtHandle() {
|
||||
#ifdef USE_ROCM
|
||||
+#if defined(USE_HIPBLASLT)
|
||||
+cublasLtHandle_t getCurrentCUDABlasLtHandle() {
|
||||
c10::DeviceIndex device = 0;
|
||||
AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
|
||||
|
||||
@@ -213,9 +214,12 @@ cublasLtHandle_t getCurrentCUDABlasLtHandle() {
|
||||
|
||||
auto handle = myPoolWindow->reserve(device);
|
||||
return handle;
|
||||
+}
|
||||
+#endif
|
||||
#else
|
||||
+cublasLtHandle_t getCurrentCUDABlasLtHandle() {
|
||||
return reinterpret_cast<cublasLtHandle_t>(getCurrentCUDABlasHandle());
|
||||
-#endif
|
||||
}
|
||||
+#endif
|
||||
|
||||
} // namespace at::cuda
|
||||
diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h
|
||||
index 3ba0d761277..dde1870cfbf 100644
|
||||
index 53e6154120c9..fa1d664696db 100644
|
||||
--- a/aten/src/ATen/cuda/tunable/TunableGemm.h
|
||||
+++ b/aten/src/ATen/cuda/tunable/TunableGemm.h
|
||||
@@ -11,7 +11,7 @@
|
||||
@@ -11,7 +11,9 @@
|
||||
|
||||
#include <ATen/cuda/tunable/GemmCommon.h>
|
||||
#ifdef USE_ROCM
|
||||
-#if ROCM_VERSION >= 50700
|
||||
+#ifdef HIPBLASLT
|
||||
+#ifdef USE_HIPBLASLT
|
||||
#include <ATen/cuda/tunable/GemmHipblaslt.h>
|
||||
#endif
|
||||
+#endif
|
||||
#include <ATen/cuda/tunable/GemmRocblas.h>
|
||||
@@ -166,7 +166,7 @@ class GemmTunableOp : public TunableOp<GemmParams<T>, StreamTimer> {
|
||||
}
|
||||
#endif
|
||||
|
||||
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
|
||||
+#if defined(USE_ROCM) && defined(HIPBLASLT)
|
||||
static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
|
||||
if (env == nullptr || strcmp(env, "1") == 0) {
|
||||
// disallow tuning of hipblaslt with c10::complex
|
||||
@@ -240,7 +240,7 @@ class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>
|
||||
#include <ATen/cuda/tunable/StreamTimer.h>
|
||||
@@ -65,6 +67,7 @@ class DefaultGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>>
|
||||
}
|
||||
#endif
|
||||
|
||||
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
|
||||
+#if defined(USE_ROCM) && defined(HIPBLASLT)
|
||||
static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
|
||||
if (env == nullptr || strcmp(env, "1") == 0) {
|
||||
// disallow tuning of hipblaslt with c10::complex
|
||||
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
|
||||
index 29e5c5e3cf1..df56f3d7f1d 100644
|
||||
--- a/aten/src/ATen/native/cuda/Blas.cpp
|
||||
+++ b/aten/src/ATen/native/cuda/Blas.cpp
|
||||
@@ -155,7 +155,7 @@ enum class Activation {
|
||||
GELU,
|
||||
};
|
||||
|
||||
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
|
||||
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
|
||||
cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activation a) {
|
||||
switch (a) {
|
||||
case Activation::None:
|
||||
@@ -193,6 +193,7 @@ static bool getDisableAddmmCudaLt() {
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
template <typename T>
|
||||
class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
|
||||
public:
|
||||
@@ -94,6 +97,7 @@ class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
|
||||
return OK;
|
||||
}
|
||||
};
|
||||
+#endif
|
||||
|
||||
template <typename T>
|
||||
inline bool IsZero(T v) {
|
||||
@@ -191,6 +195,7 @@ static void AddRocblasValidator() {
|
||||
}
|
||||
}
|
||||
|
||||
+#ifdef USE_HIPBLASLT
|
||||
static void AddHipblasltValidator() {
|
||||
auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators();
|
||||
if (validators.find("HIPBLASLT_VERSION") == validators.end()) {
|
||||
@@ -205,6 +210,7 @@ static void AddHipblasltValidator() {
|
||||
[hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; });
|
||||
}
|
||||
}
|
||||
+#endif
|
||||
|
||||
static void AddRocmValidator() {
|
||||
auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators();
|
||||
@@ -243,7 +249,7 @@ class GemmTunableOp : public TunableOp<GemmParams<T>, StreamTimer> {
|
||||
}
|
||||
AddRocblasValidator();
|
||||
}
|
||||
-
|
||||
+#ifdef USE_HIPBLASLT
|
||||
static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
|
||||
if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) {
|
||||
rocm_validators = true;
|
||||
@@ -257,7 +263,7 @@ class GemmTunableOp : public TunableOp<GemmParams<T>, StreamTimer> {
|
||||
}
|
||||
AddHipblasltValidator();
|
||||
}
|
||||
-
|
||||
+#endif
|
||||
if (rocm_validators) {
|
||||
AddRocmValidator();
|
||||
}
|
||||
@@ -286,7 +292,7 @@ class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>
|
||||
}
|
||||
AddRocblasValidator();
|
||||
}
|
||||
-
|
||||
+#ifdef USE_HIPBLASLT
|
||||
static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
|
||||
if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) {
|
||||
rocm_validators = true;
|
||||
@@ -300,7 +306,7 @@ class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>
|
||||
}
|
||||
AddHipblasltValidator();
|
||||
}
|
||||
-
|
||||
+#endif
|
||||
if (rocm_validators) {
|
||||
AddRocmValidator();
|
||||
}
|
||||
@@ -312,6 +318,7 @@ class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>
|
||||
}
|
||||
};
|
||||
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout>
|
||||
class ScaledGemmTunableOp : public TunableOp<ScaledGemmParams<CT>, StreamTimer> {
|
||||
public:
|
||||
@@ -321,10 +328,12 @@ class ScaledGemmTunableOp : public TunableOp<ScaledGemmParams<CT>, StreamTimer>
|
||||
auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators();
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
+#ifdef USE_HIPBLASLT
|
||||
for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps<AT, BT, CT, ALayout, BLayout>()) {
|
||||
this->RegisterOp(std::move(name), std::move(op));
|
||||
}
|
||||
AddHipblasltValidator();
|
||||
+#endif
|
||||
AddRocmValidator();
|
||||
#endif
|
||||
}
|
||||
@@ -337,6 +346,7 @@ class ScaledGemmTunableOp : public TunableOp<ScaledGemmParams<CT>, StreamTimer>
|
||||
"_", BlasOpToString(ALayout), BlasOpToString(BLayout));
|
||||
}
|
||||
};
|
||||
+#endif
|
||||
|
||||
#undef XSTRINGIFY
|
||||
#undef STRINGIFY
|
||||
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
|
||||
index 84c59a4fd0d7..56ad5de3bf2d 100644
|
||||
--- a/aten/src/ATen/native/cuda/Blas.cpp
|
||||
+++ b/aten/src/ATen/native/cuda/Blas.cpp
|
||||
@@ -173,6 +173,7 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa
|
||||
}
|
||||
|
||||
static bool getDisableAddmmCudaLt() {
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT");
|
||||
#ifdef USE_ROCM
|
||||
// if we enable tunable op, it'll take priority over just hipblaslt (heuristics)
|
||||
@@ -196,10 +197,14 @@ static bool getDisableAddmmCudaLt() {
|
||||
}
|
||||
return false;
|
||||
#endif
|
||||
+#else
|
||||
+ return true;
|
||||
+#endif
|
||||
}
|
||||
|
||||
#ifdef USE_ROCM
|
||||
static bool isSupportedHipLtROCmArch(int index) {
|
||||
+#if defined(HIPBLASLT)
|
||||
+#ifdef USE_HIPBLASLT
|
||||
hipDeviceProp_t* prop = at::cuda::getDeviceProperties(index);
|
||||
std::string device_arch = prop->gcnArchName;
|
||||
static const std::vector<std::string> archs = {"gfx90a", "gfx940", "gfx941", "gfx942"};
|
||||
@@ -203,6 +204,7 @@ static bool isSupportedHipLtROCmArch(int index) {
|
||||
@@ -210,6 +215,7 @@ static bool isSupportedHipLtROCmArch(int index) {
|
||||
}
|
||||
}
|
||||
TORCH_CHECK(false, "Attempting to use hipBLASLt on a unsupported architecture!");
|
||||
|
|
@ -176,87 +400,107 @@ index 29e5c5e3cf1..df56f3d7f1d 100644
|
|||
return false;
|
||||
}
|
||||
#endif
|
||||
@@ -228,7 +230,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
@@ -235,6 +241,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
at::ScalarType scalar_type = self.scalar_type();
|
||||
c10::MaybeOwned<Tensor> self_;
|
||||
if (&result != &self) {
|
||||
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER)) || defined(USE_ROCM) && ROCM_VERSION >= 50700
|
||||
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER)) || defined(USE_ROCM) && defined(HIPBLASLT)
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM)
|
||||
// Strangely, if mat2 has only 1 row or column, we get
|
||||
// CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
|
||||
// self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
|
||||
@@ -271,7 +273,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
@@ -276,13 +283,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
scalar_type != at::ScalarType::BFloat16));
|
||||
#endif
|
||||
}
|
||||
+#endif
|
||||
#endif
|
||||
if (!useLtInterface) {
|
||||
self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm");
|
||||
}
|
||||
self__sizes = self_->sizes();
|
||||
} else {
|
||||
-#if defined(USE_ROCM) && ROCM_VERSION >= 50700
|
||||
+#if defined(USE_ROCM) && defined(HIPBLASLT)
|
||||
-#if defined(USE_ROCM)
|
||||
+#if defined(USE_ROCM) && defined(USE_HIPBLASLT)
|
||||
useLtInterface = !disable_addmm_cuda_lt &&
|
||||
result.dim() == 2 && result.is_contiguous() &&
|
||||
isSupportedHipLtROCmArch(self.device().index()) &&
|
||||
@@ -322,7 +324,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
|
||||
@@ -334,6 +342,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj());
|
||||
|
||||
-#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
|
||||
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && defined(HIPBLASLT))
|
||||
if (useLtInterface) {
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
#if defined(USE_ROCM)
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(
|
||||
at::ScalarType::Half,
|
||||
@@ -876,7 +878,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
|
||||
at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
|
||||
at::native::resize_output(amax, {});
|
||||
@@ -394,6 +403,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
activation_epilogue
|
||||
);
|
||||
});
|
||||
+#endif
|
||||
#endif
|
||||
} else
|
||||
{
|
||||
@@ -803,6 +813,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
|
||||
}
|
||||
|
||||
-#if !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) && ROCM_VERSION >= 60000)
|
||||
+#if !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) && defined(HIPBLASLT))
|
||||
cublasCommonArgs args(mat1, mat2, out);
|
||||
const auto out_dtype_ = args.result->scalar_type();
|
||||
TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");
|
||||
@@ -906,7 +908,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
|
||||
TORCH_CHECK(false, "_scaled_mm_out_cuda is not compiled for this platform.");
|
||||
static bool _scaled_mm_allowed_device() {
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
#ifdef USE_ROCM
|
||||
std::string device_arch = dprops->gcnArchName;
|
||||
@@ -817,6 +828,9 @@ static bool _scaled_mm_allowed_device() {
|
||||
#else
|
||||
return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9);
|
||||
#endif
|
||||
+#else
|
||||
+ return false;
|
||||
+#endif
|
||||
}
|
||||
|
||||
// Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax
|
||||
@@ -850,6 +864,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
|
||||
// Check sizes
|
||||
bool allowed_device = _scaled_mm_allowed_device();
|
||||
TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+");
|
||||
+#if !defined(USE_ROCM) || (defined(USE_ROCM) && defined(USE_HIPBLASLT))
|
||||
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix");
|
||||
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix");
|
||||
TORCH_CHECK(
|
||||
@@ -1025,6 +1040,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
|
||||
#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200
|
||||
// ROCm's hipBLASLt does not support amax before 6.2, so calculate separately
|
||||
amax = at::max(at::abs(out.to(kFloat)));
|
||||
+#endif
|
||||
#endif
|
||||
|
||||
-#if defined(USE_ROCM) && ROCM_VERSION >= 60000
|
||||
+#if defined(USE_ROCM) && defined(HIPBLASLT)
|
||||
// rocm's hipblaslt does not yet support amax, so calculate separately
|
||||
auto out_float32 = out.to(kFloat);
|
||||
out_float32.abs_();
|
||||
return {out, amax};
|
||||
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
|
||||
index b7ffbeb07dc..2b6c3678984 100644
|
||||
index f1f2eb7cec31..8d05e834bbc5 100644
|
||||
--- a/cmake/Dependencies.cmake
|
||||
+++ b/cmake/Dependencies.cmake
|
||||
@@ -1273,6 +1273,9 @@ if(USE_ROCM)
|
||||
if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "6.0.0")
|
||||
list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2)
|
||||
endif()
|
||||
@@ -1052,6 +1052,9 @@ if(USE_ROCM)
|
||||
list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
|
||||
list(APPEND HIP_CXX_FLAGS -std=c++17)
|
||||
list(APPEND HIP_CXX_FLAGS -DHIPBLAS_V2)
|
||||
+ if(hipblast_FOUND)
|
||||
+ list(APPEND HIP_CXX_FLAGS -DHIPBLASLT)
|
||||
+ list(APPEND HIP_CXX_FLAGS -DUSE_HIPBLASLT)
|
||||
+ endif()
|
||||
if(HIPBLASLT_CUSTOM_DATA_TYPE)
|
||||
list(APPEND HIP_CXX_FLAGS -DHIPBLASLT_CUSTOM_DATA_TYPE)
|
||||
if(HIP_NEW_TYPE_ENUMS)
|
||||
list(APPEND HIP_CXX_FLAGS -DHIP_NEW_TYPE_ENUMS)
|
||||
endif()
|
||||
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
|
||||
index f6ca263c5e5..53eb0b63c1a 100644
|
||||
index fa39156031ff..df4836847fdf 100644
|
||||
--- a/cmake/public/LoadHIP.cmake
|
||||
+++ b/cmake/public/LoadHIP.cmake
|
||||
@@ -156,7 +156,7 @@ if(HIP_FOUND)
|
||||
@@ -155,7 +155,7 @@ if(HIP_FOUND)
|
||||
find_package_and_print_version(hiprand REQUIRED)
|
||||
find_package_and_print_version(rocblas REQUIRED)
|
||||
find_package_and_print_version(hipblas REQUIRED)
|
||||
if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0")
|
||||
- find_package_and_print_version(hipblaslt REQUIRED)
|
||||
+ find_package_and_print_version(hipblaslt)
|
||||
endif()
|
||||
- find_package_and_print_version(hipblaslt REQUIRED)
|
||||
+ find_package_and_print_version(hipblaslt)
|
||||
find_package_and_print_version(miopen REQUIRED)
|
||||
if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0")
|
||||
@@ -191,7 +191,7 @@ if(HIP_FOUND)
|
||||
# roctx is part of roctracer
|
||||
find_library(ROCM_ROCTX_LIB roctx64 HINTS ${ROCM_PATH}/lib)
|
||||
|
||||
- if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.7.0")
|
||||
+ if(hipblastlt_FOUND)
|
||||
# check whether hipblaslt is using its own datatype
|
||||
set(file "${PROJECT_BINARY_DIR}/hipblaslt_test_data_type.cc")
|
||||
file(WRITE ${file} ""
|
||||
find_package_and_print_version(hipfft REQUIRED)
|
||||
find_package_and_print_version(hipsparse REQUIRED)
|
||||
--
|
||||
2.43.2
|
||||
2.45.2
|
||||
|
||||
|
|
|
|||
952
0001-Patch-for-sleef-3.6.patch
Normal file
952
0001-Patch-for-sleef-3.6.patch
Normal file
|
|
@ -0,0 +1,952 @@
|
|||
From 273f23698c887b52e66c2abec8101b7398f0f9c4 Mon Sep 17 00:00:00 2001
|
||||
From: "Benjamin A. Beasley" <code@musicinmybrain.net>
|
||||
Date: Wed, 5 Jun 2024 11:06:02 -0400
|
||||
Subject: [PATCH] Patch for sleef 3.6
|
||||
|
||||
---
|
||||
...ectorization-on-windows-submodule-sl.patch | 910 ++++++++++++++++++
|
||||
python-torch.spec | 11 +
|
||||
2 files changed, 921 insertions(+)
|
||||
create mode 100644 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
|
||||
|
||||
diff --git a/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
|
||||
new file mode 100644
|
||||
index 000000000000..562f55b742c2
|
||||
--- /dev/null
|
||||
+++ b/0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
|
||||
@@ -0,0 +1,910 @@
|
||||
+From 3d1e4b3e5ddcdd2717e590c635097163fef64c83 Mon Sep 17 00:00:00 2001
|
||||
+From: Xu Han <xu.han@intel.com>
|
||||
+Date: Sun, 31 Mar 2024 03:07:32 +0000
|
||||
+Subject: [PATCH] Enable x86 CPU vectorization on windows [submodule sleef]
|
||||
+ (#118980)
|
||||
+
|
||||
+Enable VEC on Windows OS.
|
||||
+1. Fix some type defination gap between Windows and Linux.
|
||||
+2. Fix some operator not support on Windows, such as [], /.
|
||||
+3. Enable static sleef library build on Windows.
|
||||
+4. Disable unsupported function overloading on MSVC.
|
||||
+5. Upgrade submodule sleef lib, which fixed build issue on Windows.
|
||||
+6. Fixed bazel build issues.
|
||||
+7. Fix test app not link to sleef on Windows.
|
||||
+
|
||||
+Note: If rebuild fail after pulled this PR, please sync `sleef` submodule by run:
|
||||
+```cmd
|
||||
+git submodule sync
|
||||
+git submodule update --init --recursive
|
||||
+```
|
||||
+
|
||||
+Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980
|
||||
+Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet
|
||||
+---
|
||||
+ aten/src/ATen/CMakeLists.txt | 48 ++++++--------
|
||||
+ aten/src/ATen/cpu/vec/vec256/vec256.h | 14 ++--
|
||||
+ .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 27 ++++++--
|
||||
+ .../cpu/vec/vec256/vec256_complex_double.h | 7 +-
|
||||
+ .../cpu/vec/vec256/vec256_complex_float.h | 7 +-
|
||||
+ aten/src/ATen/cpu/vec/vec256/vec256_double.h | 5 +-
|
||||
+ aten/src/ATen/cpu/vec/vec256/vec256_float.h | 15 +++--
|
||||
+ aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 12 +++-
|
||||
+ aten/src/ATen/cpu/vec/vec512/vec512.h | 14 ++--
|
||||
+ .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 27 ++++++--
|
||||
+ .../cpu/vec/vec512/vec512_complex_double.h | 7 +-
|
||||
+ .../cpu/vec/vec512/vec512_complex_float.h | 7 +-
|
||||
+ aten/src/ATen/cpu/vec/vec512/vec512_double.h | 5 +-
|
||||
+ aten/src/ATen/cpu/vec/vec512/vec512_float.h | 15 +++--
|
||||
+ aten/src/ATen/cpu/vec/vec512/vec512_qint.h | 66 ++++++++++++++++++-
|
||||
+ aten/src/ATen/cpu/vec/vec_base.h | 6 ++
|
||||
+ caffe2/CMakeLists.txt | 2 +-
|
||||
+ third_party/sleef.BUILD | 3 +-
|
||||
+ 18 files changed, 194 insertions(+), 93 deletions(-)
|
||||
+
|
||||
+diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
|
||||
+index bf425af5fa9..58d5828e8ca 100644
|
||||
+--- a/aten/src/ATen/CMakeLists.txt
|
||||
++++ b/aten/src/ATen/CMakeLists.txt
|
||||
+@@ -419,32 +419,25 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$")
|
||||
+ list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo)
|
||||
+ endif()
|
||||
+
|
||||
+-if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
|
||||
+- # Preserve values for the main build
|
||||
+- set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS})
|
||||
+- set(__aten_sleef_build_tests ${BUILD_TESTS})
|
||||
+-
|
||||
+- # Unset our restrictive C++ flags here and reset them later.
|
||||
+- # Remove this once we use proper target_compile_options.
|
||||
+- set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
|
||||
+- set(CMAKE_CXX_FLAGS)
|
||||
+-
|
||||
+- # Bump up optimization level for sleef to -O1, since at -O0 the compiler
|
||||
+- # excessively spills intermediate vector registers to the stack
|
||||
+- # and makes things run impossibly slowly
|
||||
+- set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
|
||||
+- if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0")
|
||||
+- string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
|
||||
+- else()
|
||||
+- set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1")
|
||||
++if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
|
||||
++ if(NOT MSVC)
|
||||
++ # Bump up optimization level for sleef to -O1, since at -O0 the compiler
|
||||
++ # excessively spills intermediate vector registers to the stack
|
||||
++ # and makes things run impossibly slowly
|
||||
++ set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
|
||||
++ if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0")
|
||||
++ string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
|
||||
++ else()
|
||||
++ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1")
|
||||
++ endif()
|
||||
+ endif()
|
||||
+
|
||||
+ if(NOT USE_SYSTEM_SLEEF)
|
||||
+- set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE)
|
||||
+- set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE)
|
||||
+- set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE)
|
||||
+- set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE)
|
||||
+- set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE})
|
||||
++ set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE)
|
||||
++ set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE)
|
||||
++ set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE)
|
||||
++ set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE)
|
||||
++ set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." FORCE)
|
||||
+ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
|
||||
+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
|
||||
+ set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE)
|
||||
+@@ -465,12 +458,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
|
||||
+ endif()
|
||||
+ list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef)
|
||||
+
|
||||
+- set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
|
||||
+- set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
|
||||
+-
|
||||
+- # Set these back. TODO: Use SLEEF_ to pass these instead
|
||||
+- set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE)
|
||||
+- set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE)
|
||||
++ if(NOT MSVC)
|
||||
++ set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG})
|
||||
++ endif()
|
||||
+ endif()
|
||||
+
|
||||
+ if(USE_CUDA AND NOT USE_ROCM)
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h
|
||||
+index 800b027e469..c431fa3c605 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec256/vec256.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec256/vec256.h
|
||||
+@@ -69,7 +69,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
+ }
|
||||
+
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
+
|
||||
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
+
|
||||
+@@ -94,7 +94,8 @@ inline Vectorized<double> cast<double, int64_t>(const Vectorized<int64_t>& src)
|
||||
+ }
|
||||
+
|
||||
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
+-
|
||||
++#ifndef _MSC_VER
|
||||
++// MSVC is not working well on complex function overload.
|
||||
+ template<int64_t scale = 1>
|
||||
+ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
|
||||
+ inline gather(const double* base_addr, const Vectorized<int64_t>& vindex) {
|
||||
+@@ -106,9 +107,10 @@ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorize
|
||||
+ inline gather(const float* base_addr, const Vectorized<int32_t>& vindex) {
|
||||
+ return _mm256_i32gather_ps(base_addr, vindex, scale);
|
||||
+ }
|
||||
+-
|
||||
++#endif
|
||||
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
+-
|
||||
++#ifndef _MSC_VER
|
||||
++// MSVC is not working well on complex function overload.
|
||||
+ template<int64_t scale = 1>
|
||||
+ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
|
||||
+ inline mask_gather(const Vectorized<double>& src, const double* base_addr,
|
||||
+@@ -122,7 +124,7 @@ inline mask_gather(const Vectorized<float>& src, const float* base_addr,
|
||||
+ const Vectorized<int32_t>& vindex, Vectorized<float>& mask) {
|
||||
+ return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale);
|
||||
+ }
|
||||
+-
|
||||
++#endif
|
||||
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
+
|
||||
+ // Only works for inputs in the range: [-2^51, 2^51]
|
||||
+@@ -302,6 +304,6 @@ inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
|
||||
+ return flip8(v);
|
||||
+ }
|
||||
+
|
||||
+-#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#endif // (defined(CPU_CAPABILITY_AVX2)
|
||||
+
|
||||
+ }} // namepsace at::vec::CPU_CAPABILITY
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
|
||||
+index 3e26213d6d2..66557436c70 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
|
||||
+@@ -7,7 +7,8 @@
|
||||
+ #include <ATen/cpu/vec/vec_base.h>
|
||||
+ #include <c10/util/irange.h>
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
++#define SLEEF_STATIC_LIBS
|
||||
+ #include <sleef.h>
|
||||
+ #endif
|
||||
+
|
||||
+@@ -18,7 +19,18 @@ namespace at::vec {
|
||||
+ // See Note [CPU_CAPABILITY namespace]
|
||||
+ inline namespace CPU_CAPABILITY {
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
++
|
||||
++#ifndef SLEEF_CONST
|
||||
++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER)
|
||||
++#define SLEEF_CONST const
|
||||
++#else
|
||||
++#define SLEEF_CONST
|
||||
++#endif
|
||||
++#define SLEEF_CONST_OLD SLEEF_CONST
|
||||
++#else
|
||||
++#define SLEEF_CONST_OLD
|
||||
++#endif
|
||||
+
|
||||
+ // bfloat16 conversion
|
||||
+ static inline void cvtbf16_fp32(const __m128i& a, __m256& o) {
|
||||
+@@ -265,7 +277,8 @@ public:
|
||||
+ }
|
||||
+ return b;
|
||||
+ }
|
||||
+- Vectorized<T> map(const __m256 (*const vop)(__m256)) const {
|
||||
++
|
||||
++ Vectorized<T> map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const {
|
||||
+ __m256 lo, hi;
|
||||
+ cvt_to_fp32<T>(values, lo, hi);
|
||||
+ const auto o1 = vop(lo);
|
||||
+@@ -1026,7 +1039,7 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
|
||||
+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16);
|
||||
+ CONVERT_VECTORIZED_INIT(Half, half);
|
||||
+
|
||||
+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#else // defined(CPU_CAPABILITY_AVX2)
|
||||
+
|
||||
+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \
|
||||
+ inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
|
||||
+@@ -1051,9 +1064,9 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
|
||||
+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
|
||||
+ CONVERT_NON_VECTORIZED_INIT(Half, half);
|
||||
+
|
||||
+-#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#endif // defined(CPU_CAPABILITY_AVX2)
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \
|
||||
+ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
|
||||
+ auto values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data)); \
|
||||
+@@ -1072,7 +1085,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vec
|
||||
+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16);
|
||||
+ LOAD_FP32_VECTORIZED_INIT(Half, fp16);
|
||||
+
|
||||
+-#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#else // defined(CPU_CAPABILITY_AVX2)
|
||||
+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
|
||||
+ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
|
||||
+ __at_align__ float values[Vectorized<float>::size()]; \
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
|
||||
+index f93ea1e63c3..6c198fb37d3 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
|
||||
+@@ -8,7 +8,8 @@
|
||||
+ #include <ATen/cpu/vec/intrinsics.h>
|
||||
+ #include <ATen/cpu/vec/vec_base.h>
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
++#define SLEEF_STATIC_LIBS
|
||||
+ #include <sleef.h>
|
||||
+ #endif
|
||||
+
|
||||
+@@ -16,7 +17,7 @@ namespace at::vec {
|
||||
+ // See Note [CPU_CAPABILITY namespace]
|
||||
+ inline namespace CPU_CAPABILITY {
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
+
|
||||
+ template <> class Vectorized<c10::complex<double>> {
|
||||
+ private:
|
||||
+@@ -145,7 +146,7 @@ public:
|
||||
+ auto abs = abs_();
|
||||
+ auto zero = _mm256_setzero_pd();
|
||||
+ auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ);
|
||||
+- auto div = values / abs;
|
||||
++ auto div = _mm256_div_pd(values, abs);
|
||||
+ return _mm256_blendv_pd(div, zero, mask);
|
||||
+ }
|
||||
+ __m256d real_() const {
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
|
||||
+index 7c142c04b79..c72d4d49274 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
|
||||
+@@ -7,7 +7,8 @@
|
||||
+ #include <c10/util/irange.h>
|
||||
+ #include <ATen/cpu/vec/intrinsics.h>
|
||||
+ #include <ATen/cpu/vec/vec_base.h>
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
++#define SLEEF_STATIC_LIBS
|
||||
+ #include <sleef.h>
|
||||
+ #endif
|
||||
+
|
||||
+@@ -15,7 +16,7 @@ namespace at::vec {
|
||||
+ // See Note [CPU_CAPABILITY namespace]
|
||||
+ inline namespace CPU_CAPABILITY {
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
+
|
||||
+ template <> class Vectorized<c10::complex<float>> {
|
||||
+ private:
|
||||
+@@ -180,7 +181,7 @@ public:
|
||||
+ auto abs = abs_();
|
||||
+ auto zero = _mm256_setzero_ps();
|
||||
+ auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ);
|
||||
+- auto div = values / abs;
|
||||
++ auto div = _mm256_div_ps(values, abs);
|
||||
+ return _mm256_blendv_ps(div, zero, mask);
|
||||
+ }
|
||||
+ __m256 real_() const {
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h
|
||||
+index bc82d07edd1..bed6da627af 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h
|
||||
+@@ -6,7 +6,8 @@
|
||||
+ #include <ATen/cpu/vec/intrinsics.h>
|
||||
+ #include <ATen/cpu/vec/vec_base.h>
|
||||
+ #include <c10/util/irange.h>
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
++#define SLEEF_STATIC_LIBS
|
||||
+ #include <sleef.h>
|
||||
+ #endif
|
||||
+
|
||||
+@@ -15,7 +16,7 @@ namespace at::vec {
|
||||
+ inline namespace CPU_CAPABILITY {
|
||||
+
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
+
|
||||
+ template <> class Vectorized<double> {
|
||||
+ private:
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h
|
||||
+index 886809a0b8a..0e3664cd37b 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h
|
||||
+@@ -6,7 +6,8 @@
|
||||
+ #include <ATen/cpu/vec/intrinsics.h>
|
||||
+ #include <ATen/cpu/vec/vec_base.h>
|
||||
+ #include <c10/util/irange.h>
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
++#define SLEEF_STATIC_LIBS
|
||||
+ #include <sleef.h>
|
||||
+ #endif
|
||||
+
|
||||
+@@ -14,7 +15,7 @@ namespace at::vec {
|
||||
+ // See Note [CPU_CAPABILITY namespace]
|
||||
+ inline namespace CPU_CAPABILITY {
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
+
|
||||
+ template <> class Vectorized<float> {
|
||||
+ private:
|
||||
+@@ -226,14 +227,14 @@ public:
|
||||
+ static __m256 vec_factorial_5 =
|
||||
+ _mm256_set1_ps(0.00828929059f); // 1/factorial(5)
|
||||
+ static __m256 vec_exp_log2ef =
|
||||
+- (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e)
|
||||
++ _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e)
|
||||
+ static __m256 vec_half = _mm256_set1_ps(0.5f);
|
||||
+ static __m256 vec_one = _mm256_set1_ps(1.f);
|
||||
+ static __m256 vec_zero = _mm256_set1_ps(0.f);
|
||||
+ static __m256 vec_two = _mm256_set1_ps(2.f);
|
||||
+- static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2)
|
||||
+- static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50);
|
||||
+- static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218);
|
||||
++ static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2)
|
||||
++ static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50));
|
||||
++ static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218));
|
||||
+ static __m256i vec_127 = _mm256_set1_epi32(0x0000007f);
|
||||
+ static int n_mantissa_bits = 23;
|
||||
+
|
||||
+@@ -266,7 +267,7 @@ public:
|
||||
+ auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number);
|
||||
+ auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127);
|
||||
+ vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits);
|
||||
+- auto vec_two_pow_n = (__m256)vec_two_pow_n_i;
|
||||
++ auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i);
|
||||
+ vec_two_pow_n =
|
||||
+ _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask);
|
||||
+
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
|
||||
+index 4128841701a..85e099904cd 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h
|
||||
+@@ -41,11 +41,17 @@
|
||||
+ namespace at::vec {
|
||||
+ inline namespace CPU_CAPABILITY {
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX2)
|
||||
+
|
||||
++#ifdef _MSC_VER
|
||||
++__declspec(align(64)) struct Vectorizedqi {
|
||||
++ protected:
|
||||
++ __m256i vals;
|
||||
++#else
|
||||
+ struct Vectorizedqi {
|
||||
+ protected:
|
||||
+ __m256i vals __attribute__((aligned(64)));
|
||||
++#endif
|
||||
+
|
||||
+ public:
|
||||
+ Vectorizedqi() {}
|
||||
+@@ -133,7 +139,7 @@ inline convert_float_to_int8(at::vec::Vectorized<float> src) {
|
||||
+ }
|
||||
+
|
||||
+ template <typename T>
|
||||
+-inline void __attribute__((always_inline)) QuantizeAvx2(
|
||||
++__FORCE_INLINE void QuantizeAvx2(
|
||||
+ const float* src,
|
||||
+ T* dst,
|
||||
+ int len,
|
||||
+@@ -1331,5 +1337,5 @@ Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const V
|
||||
+ return a.maximum(b);
|
||||
+ }
|
||||
+
|
||||
+-#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
|
||||
++#endif // if defined(CPU_CAPABILITY_AVX2)
|
||||
+ }} // namespace at::vec::CPU_CAPABILITY
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h
|
||||
+index fe96d123e64..87f723d782c 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec512/vec512.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec512/vec512.h
|
||||
+@@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
+ }
|
||||
+
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX512)
|
||||
+
|
||||
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
+
|
||||
+@@ -80,7 +80,8 @@ inline Vectorized<double> cast<double, int64_t>(const Vectorized<int64_t>& src)
|
||||
+ }
|
||||
+
|
||||
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
+-
|
||||
++#ifndef _MSC_VER
|
||||
++// MSVC is not working well on complex function overload.
|
||||
+ template<int64_t scale = 1>
|
||||
+ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
|
||||
+ inline gather(const double* base_addr, const Vectorized<int64_t>& vindex) {
|
||||
+@@ -92,9 +93,10 @@ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorize
|
||||
+ inline gather(const float* base_addr, const Vectorized<int32_t>& vindex) {
|
||||
+ return _mm512_i32gather_ps(vindex, base_addr, scale);
|
||||
+ }
|
||||
+-
|
||||
++#endif
|
||||
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
+-
|
||||
++#ifndef _MSC_VER
|
||||
++// MSVC is not working well on complex function overload.
|
||||
+ template<int64_t scale = 1>
|
||||
+ std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
|
||||
+ inline mask_gather(const Vectorized<double>& src, const double* base_addr,
|
||||
+@@ -112,7 +114,7 @@ inline mask_gather(const Vectorized<float>& src, const float* base_addr,
|
||||
+ auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ);
|
||||
+ return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale);
|
||||
+ }
|
||||
+-
|
||||
++#endif
|
||||
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
+
|
||||
+ template<>
|
||||
+@@ -270,6 +272,6 @@ inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
|
||||
+ return flip8(v);
|
||||
+ }
|
||||
+
|
||||
+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#endif // defined(CPU_CAPABILITY_AVX512)
|
||||
+
|
||||
+ }}}
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
|
||||
+index f9fc92d52bf..eb3b6a72240 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h
|
||||
+@@ -7,7 +7,8 @@
|
||||
+ #include <ATen/cpu/vec/vec_base.h>
|
||||
+ #include <c10/util/irange.h>
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX512)
|
||||
++#define SLEEF_STATIC_LIBS
|
||||
+ #include <sleef.h>
|
||||
+ #endif
|
||||
+
|
||||
+@@ -16,7 +17,18 @@ namespace vec {
|
||||
+ // See Note [CPU_CAPABILITY namespace]
|
||||
+ inline namespace CPU_CAPABILITY {
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX512)
|
||||
++
|
||||
++#ifndef SLEEF_CONST
|
||||
++#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER)
|
||||
++#define SLEEF_CONST const
|
||||
++#else
|
||||
++#define SLEEF_CONST
|
||||
++#endif
|
||||
++#define SLEEF_CONST_OLD SLEEF_CONST
|
||||
++#else
|
||||
++#define SLEEF_CONST_OLD
|
||||
++#endif
|
||||
+
|
||||
+ // bfloat16 conversion
|
||||
+ static inline void cvtbf16_fp32(const __m256i& a, __m512& o) {
|
||||
+@@ -362,7 +374,8 @@ public:
|
||||
+ }
|
||||
+ #pragma clang diagnostic push
|
||||
+ #pragma clang diagnostic ignored "-Wignored-qualifiers"
|
||||
+- Vectorized<T> map(const __m512 (*const vop)(__m512)) const {
|
||||
++
|
||||
++ Vectorized<T> map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const {
|
||||
+ __m512 lo, hi;
|
||||
+ cvt_to_fp32<T>(values, lo, hi);
|
||||
+ const auto o1 = vop(lo);
|
||||
+@@ -1571,7 +1584,7 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
|
||||
+ CONVERT_VECTORIZED_INIT(BFloat16, bfloat16);
|
||||
+ CONVERT_VECTORIZED_INIT(Half, half);
|
||||
+
|
||||
+-#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#else //defined(CPU_CAPABILITY_AVX512)
|
||||
+
|
||||
+ #define CONVERT_NON_VECTORIZED_INIT(type, name) \
|
||||
+ inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
|
||||
+@@ -1601,9 +1614,9 @@ inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const V
|
||||
+ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
|
||||
+ CONVERT_NON_VECTORIZED_INIT(Half, half);
|
||||
+
|
||||
+-#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#endif // defined(CPU_CAPABILITY_AVX512)
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX512)
|
||||
+ #define LOAD_FP32_VECTORIZED_INIT(type, name) \
|
||||
+ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
|
||||
+ auto values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)); \
|
||||
+@@ -1622,7 +1635,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vec
|
||||
+ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16);
|
||||
+ LOAD_FP32_VECTORIZED_INIT(Half, fp16);
|
||||
+
|
||||
+-#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#else // defined(CPU_CAPABILITY_AVX512)
|
||||
+ #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
|
||||
+ inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
|
||||
+ __at_align__ float values[Vectorized<float>::size()]; \
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
|
||||
+index 02aa3a87cc1..c35204f9da2 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
|
||||
+@@ -7,7 +7,8 @@
|
||||
+ #include <c10/util/irange.h>
|
||||
+ #include <ATen/cpu/vec/intrinsics.h>
|
||||
+ #include <ATen/cpu/vec/vec_base.h>
|
||||
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX512)
|
||||
++#define SLEEF_STATIC_LIBS
|
||||
+ #include <sleef.h>
|
||||
+ #endif
|
||||
+
|
||||
+@@ -16,7 +17,7 @@ namespace vec {
|
||||
+ // See Note [CPU_CAPABILITY namespace]
|
||||
+ inline namespace CPU_CAPABILITY {
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX512)
|
||||
+
|
||||
+ template <> class Vectorized<c10::complex<double>> {
|
||||
+ private:
|
||||
+@@ -203,7 +204,7 @@ public:
|
||||
+ auto abs = abs_();
|
||||
+ auto zero = _mm512_setzero_pd();
|
||||
+ auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ);
|
||||
+- auto div = values / abs;
|
||||
++ auto div = _mm512_div_pd(values, abs);
|
||||
+ return _mm512_mask_blend_pd(mask, div, zero);
|
||||
+ }
|
||||
+ __m512d real_() const {
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
|
||||
+index a5d790c98b2..2801e484d94 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
|
||||
+@@ -7,7 +7,8 @@
|
||||
+ #include <c10/util/irange.h>
|
||||
+ #include <ATen/cpu/vec/intrinsics.h>
|
||||
+ #include <ATen/cpu/vec/vec_base.h>
|
||||
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX512)
|
||||
++#define SLEEF_STATIC_LIBS
|
||||
+ #include <sleef.h>
|
||||
+ #endif
|
||||
+
|
||||
+@@ -16,7 +17,7 @@ namespace vec {
|
||||
+ // See Note [CPU_CAPABILITY namespace]
|
||||
+ inline namespace CPU_CAPABILITY {
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX512)
|
||||
+
|
||||
+ template <> class Vectorized<c10::complex<float>> {
|
||||
+ private:
|
||||
+@@ -708,7 +709,7 @@ public:
|
||||
+ auto abs = abs_();
|
||||
+ auto zero = _mm512_setzero_ps();
|
||||
+ auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ);
|
||||
+- auto div = values / abs;
|
||||
++ auto div = _mm512_div_ps(values, abs);
|
||||
+ return _mm512_mask_blend_ps(mask, div, zero);
|
||||
+ }
|
||||
+ __m512 real_() const {
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
|
||||
+index 27b2753c903..508ab257e60 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
|
||||
+@@ -6,7 +6,8 @@
|
||||
+ #include <ATen/cpu/vec/intrinsics.h>
|
||||
+ #include <ATen/cpu/vec/vec_base.h>
|
||||
+ #include <c10/util/irange.h>
|
||||
+-#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
|
||||
++#if (defined(CPU_CAPABILITY_AVX512))
|
||||
++#define SLEEF_STATIC_LIBS
|
||||
+ #include <sleef.h>
|
||||
+ #endif
|
||||
+
|
||||
+@@ -15,7 +16,7 @@ namespace vec {
|
||||
+ // See Note [CPU_CAPABILITY namespace]
|
||||
+ inline namespace CPU_CAPABILITY {
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX512)
|
||||
+
|
||||
+ template <> class Vectorized<double> {
|
||||
+ private:
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h
|
||||
+index ba5738687fd..a08df3c141a 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h
|
||||
+@@ -6,7 +6,8 @@
|
||||
+ #include <ATen/cpu/vec/intrinsics.h>
|
||||
+ #include <ATen/cpu/vec/vec_base.h>
|
||||
+ #include <c10/util/irange.h>
|
||||
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX512)
|
||||
++#define SLEEF_STATIC_LIBS
|
||||
+ #include <sleef.h>
|
||||
+ #endif
|
||||
+
|
||||
+@@ -15,7 +16,7 @@ namespace vec {
|
||||
+ // See Note [CPU_CAPABILITY namespace]
|
||||
+ inline namespace CPU_CAPABILITY {
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX512)
|
||||
+
|
||||
+ template <> class Vectorized<float> {
|
||||
+ private:
|
||||
+@@ -246,14 +247,14 @@ public:
|
||||
+ static __m512 vec_factorial_5 =
|
||||
+ _mm512_set1_ps(0.00828929059f); // 1/factorial(5)
|
||||
+ static __m512 vec_exp_log2ef =
|
||||
+- (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e)
|
||||
++ _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
|
||||
+ static __m512 vec_half = _mm512_set1_ps(0.5f);
|
||||
+ static __m512 vec_one = _mm512_set1_ps(1.f);
|
||||
+ static __m512 vec_zero = _mm512_set1_ps(0.f);
|
||||
+ static __m512 vec_two = _mm512_set1_ps(2.f);
|
||||
+- static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2)
|
||||
+- static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50);
|
||||
+- static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218);
|
||||
++ static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2)
|
||||
++ static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
|
||||
++ static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
|
||||
+ static __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
|
||||
+ static int n_mantissa_bits = 23;
|
||||
+
|
||||
+@@ -288,7 +289,7 @@ public:
|
||||
+ auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number);
|
||||
+ auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127);
|
||||
+ vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits);
|
||||
+- auto vec_two_pow_n = (__m512)vec_two_pow_n_i;
|
||||
++ auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i);
|
||||
+ vec_two_pow_n =
|
||||
+ _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero);
|
||||
+
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
|
||||
+index e0713d01312..a5671ed4a50 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h
|
||||
+@@ -42,11 +42,17 @@ namespace at {
|
||||
+ namespace vec {
|
||||
+ inline namespace CPU_CAPABILITY {
|
||||
+
|
||||
+-#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
|
||||
++#if defined(CPU_CAPABILITY_AVX512)
|
||||
+
|
||||
++#ifdef _MSC_VER
|
||||
++__declspec(align(64)) struct Vectorizedqi {
|
||||
++ protected:
|
||||
++ __m512i vals;
|
||||
++#else
|
||||
+ struct Vectorizedqi {
|
||||
+ protected:
|
||||
+ __m512i vals __attribute__((aligned(64)));
|
||||
++#endif
|
||||
+
|
||||
+ public:
|
||||
+ Vectorizedqi() {}
|
||||
+@@ -136,7 +142,7 @@ inline convert_float_to_int8(at::vec::Vectorized<float> src) {
|
||||
+ }
|
||||
+
|
||||
+ template <typename T>
|
||||
+-inline void __attribute__((always_inline)) QuantizeAvx512(
|
||||
++__FORCE_INLINE void QuantizeAvx512(
|
||||
+ const float* src,
|
||||
+ T* dst,
|
||||
+ int len,
|
||||
+@@ -525,10 +531,17 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
|
||||
+ Vectorized<float> scale,
|
||||
+ Vectorized<float> zero_point,
|
||||
+ Vectorized<float> scale_neg_zp_premul) const {
|
||||
++ #if defined(_MSC_VER) && !defined(__clang__)
|
||||
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
|
||||
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
|
||||
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
|
||||
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
|
||||
++ #else
|
||||
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
|
||||
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
|
||||
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
|
||||
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
|
||||
++ #endif
|
||||
+
|
||||
+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
|
||||
+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
|
||||
+@@ -549,10 +562,17 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
|
||||
+ float_vec_return_type dequantize(
|
||||
+ Vectorized<float> scale,
|
||||
+ Vectorized<float> zero_point) const {
|
||||
++ #if defined(_MSC_VER) && !defined(__clang__)
|
||||
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
|
||||
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
|
||||
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
|
||||
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
|
||||
++ #else
|
||||
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
|
||||
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
|
||||
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
|
||||
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
|
||||
++ #endif
|
||||
+
|
||||
+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
|
||||
+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
|
||||
+@@ -598,20 +618,34 @@ struct Vectorized<c10::qint8> : public Vectorizedqi {
|
||||
+ }
|
||||
+
|
||||
+ int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
|
||||
++ #if defined(_MSC_VER) && !defined(__clang__)
|
||||
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
|
||||
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
|
||||
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
|
||||
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
|
||||
++ #else
|
||||
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
|
||||
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
|
||||
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
|
||||
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
|
||||
++ #endif
|
||||
+
|
||||
+ __m512i int32_val0 = cvtepi8_epi32(int_val0);
|
||||
+ __m512i int32_val1 = cvtepi8_epi32(int_val1);
|
||||
+ __m512i int32_val2 = cvtepi8_epi32(int_val2);
|
||||
+ __m512i int32_val3 = cvtepi8_epi32(int_val3);
|
||||
+
|
||||
++ #if defined(_MSC_VER) && !defined(__clang__)
|
||||
++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]);
|
||||
++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]);
|
||||
++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]);
|
||||
++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]);
|
||||
++ #else
|
||||
+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
|
||||
+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
|
||||
+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
|
||||
+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
|
||||
++ #endif
|
||||
+
|
||||
+ __m512i int32_b0 = cvtepi8_epi32(int_b0);
|
||||
+ __m512i int32_b1 = cvtepi8_epi32(int_b1);
|
||||
+@@ -721,10 +755,17 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
|
||||
+ Vectorized<float> scale,
|
||||
+ Vectorized<float> zero_point,
|
||||
+ Vectorized<float> scale_zp_premul) const {
|
||||
++ #if defined(_MSC_VER) && !defined(__clang__)
|
||||
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
|
||||
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
|
||||
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
|
||||
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
|
||||
++ #else
|
||||
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
|
||||
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
|
||||
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
|
||||
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
|
||||
++ #endif
|
||||
+
|
||||
+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
|
||||
+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
|
||||
+@@ -746,10 +787,17 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
|
||||
+ float_vec_return_type dequantize(
|
||||
+ Vectorized<float> scale,
|
||||
+ Vectorized<float> zero_point) const {
|
||||
++ #if defined(_MSC_VER) && !defined(__clang__)
|
||||
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
|
||||
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
|
||||
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
|
||||
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
|
||||
++ #else
|
||||
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
|
||||
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
|
||||
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
|
||||
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
|
||||
++ #endif
|
||||
+
|
||||
+ __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
|
||||
+ __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
|
||||
+@@ -796,20 +844,34 @@ struct Vectorized<c10::quint8> : public Vectorizedqi {
|
||||
+ }
|
||||
+
|
||||
+ int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
|
||||
++ #if defined(_MSC_VER) && !defined(__clang__)
|
||||
++ __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]);
|
||||
++ __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]);
|
||||
++ __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]);
|
||||
++ __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]);
|
||||
++ #else
|
||||
+ __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
|
||||
+ __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
|
||||
+ __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
|
||||
+ __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
|
||||
++ #endif
|
||||
+
|
||||
+ __m512i int32_val0 = cvtepu8_epi32(int_val0);
|
||||
+ __m512i int32_val1 = cvtepu8_epi32(int_val1);
|
||||
+ __m512i int32_val2 = cvtepu8_epi32(int_val2);
|
||||
+ __m512i int32_val3 = cvtepu8_epi32(int_val3);
|
||||
+
|
||||
++ #if defined(_MSC_VER) && !defined(__clang__)
|
||||
++ __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]);
|
||||
++ __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]);
|
||||
++ __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]);
|
||||
++ __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]);
|
||||
++ #else
|
||||
+ __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
|
||||
+ __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
|
||||
+ __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
|
||||
+ __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
|
||||
++ #endif
|
||||
+
|
||||
+ __m512i int32_b0 = cvtepu8_epi32(int_b0);
|
||||
+ __m512i int32_b1 = cvtepu8_epi32(int_b1);
|
||||
+diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h
|
||||
+index adf81dd915c..20cb8ef6dbc 100644
|
||||
+--- a/aten/src/ATen/cpu/vec/vec_base.h
|
||||
++++ b/aten/src/ATen/cpu/vec/vec_base.h
|
||||
+@@ -36,6 +36,12 @@
|
||||
+ #include <c10/util/irange.h>
|
||||
+ #include <c10/util/Load.h>
|
||||
+
|
||||
++#if defined(__GNUC__)
|
||||
++#define __FORCE_INLINE __attribute__((always_inline)) inline
|
||||
++#elif defined(_MSC_VER)
|
||||
++#define __FORCE_INLINE __forceinline
|
||||
++#endif
|
||||
++
|
||||
+ // These macros helped us unify vec_base.h
|
||||
+ #ifdef CPU_CAPABILITY_AVX512
|
||||
+ #if defined(__GNUC__)
|
||||
+diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
|
||||
+index a6b6f0f7d1d..15d37cf4861 100644
|
||||
+--- a/caffe2/CMakeLists.txt
|
||||
++++ b/caffe2/CMakeLists.txt
|
||||
+@@ -1787,7 +1787,7 @@ if(BUILD_TEST)
|
||||
+ endif()
|
||||
+ else()
|
||||
+ add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")
|
||||
+- target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main)
|
||||
++ target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main)
|
||||
+ endif()
|
||||
+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $<INSTALL_INTERFACE:include>)
|
||||
+ target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include>)
|
||||
+diff --git a/third_party/sleef.BUILD b/third_party/sleef.BUILD
|
||||
+index 573f9c5b54a..f22a6e905e2 100644
|
||||
+--- a/third_party/sleef.BUILD
|
||||
++++ b/third_party/sleef.BUILD
|
||||
+@@ -38,6 +38,7 @@ SLEEF_PUBLIC_HEADERS = [
|
||||
+ SLEEF_PRIVATE_INCLUDES = [
|
||||
+ "-Iexternal/sleef/src/arch",
|
||||
+ "-Iexternal/sleef/src/common",
|
||||
++ "-Iexternal/sleef/src/libm",
|
||||
+ ]
|
||||
+
|
||||
+ SLEEF_PUBLIC_INCLUDES = [
|
||||
+@@ -201,8 +202,6 @@ cc_library(
|
||||
+ srcs = [
|
||||
+ "src/libm/rempitab.c",
|
||||
+ "src/libm/sleefdp.c",
|
||||
+- "src/libm/sleefld.c",
|
||||
+- "src/libm/sleefqp.c",
|
||||
+ "src/libm/sleefsp.c",
|
||||
+ ],
|
||||
+ hdrs = SLEEF_PUBLIC_HEADERS,
|
||||
+--
|
||||
+2.45.1
|
||||
+
|
||||
diff --git a/python-torch.spec b/python-torch.spec
|
||||
index d50687a5174a..63600c2e8c39 100644
|
||||
--- a/python-torch.spec
|
||||
+++ b/python-torch.spec
|
||||
@@ -176,6 +176,17 @@ Patch7: 0001-Reenable-dim-for-python-3.12.patch
|
||||
Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch
|
||||
%endif
|
||||
|
||||
+# Enable x86 CPU vectorization on windows [submodule sleef] (#118980)
|
||||
+# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370
|
||||
+#
|
||||
+# Despite the title, this patch fixes compatibility with sleef 3.6 by including
|
||||
+# a backwards-compatible version of the fix from
|
||||
+# https://github.com/pytorch/pytorch/pull/122723.
|
||||
+# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef
|
||||
+# git submodule (because the release archive contains an actual sleef source
|
||||
+# tree instead, so this would not apply.)
|
||||
+Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
|
||||
+
|
||||
%if %{with rocm}
|
||||
# ROCm patches
|
||||
# https://github.com/pytorch/pytorch/pull/120551
|
||||
--
|
||||
2.45.1
|
||||
|
||||
|
|
@ -1,46 +1,94 @@
|
|||
From 33d48f71db7530f00dbd8cff281b65aa8b355b2a Mon Sep 17 00:00:00 2001
|
||||
From 038ce9e44776e23f21c1816daa259bc0ea335088 Mon Sep 17 00:00:00 2001
|
||||
From: Tom Rix <trix@redhat.com>
|
||||
Date: Tue, 19 Mar 2024 11:32:37 -0400
|
||||
Date: Sat, 29 Jun 2024 07:06:09 -0700
|
||||
Subject: [PATCH] disable use of aotriton
|
||||
|
||||
---
|
||||
aten/src/ATen/native/transformers/cuda/sdp_utils.cpp | 6 ++++++
|
||||
1 file changed, 6 insertions(+)
|
||||
.../ATen/native/transformers/cuda/sdp_utils.cpp | 17 +++++++++++++++--
|
||||
1 file changed, 15 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
|
||||
index 96b839820efd..2d3dd0cb4b0f 100644
|
||||
index 214b02d8262e..7b3eb9dcd8cd 100644
|
||||
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
|
||||
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
|
||||
@@ -21,9 +21,11 @@
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
@@ -19,9 +19,12 @@
|
||||
#include <c10/core/SymInt.h>
|
||||
#include <c10/util/string_view.h>
|
||||
|
||||
+#ifdef USE_FLASH_ATTENTION
|
||||
#if USE_ROCM
|
||||
#include <aotriton/flash.h>
|
||||
#endif
|
||||
+#endif
|
||||
+
|
||||
|
||||
/**
|
||||
* Note [SDPA Runtime Dispatch]
|
||||
@@ -183,6 +185,7 @@ bool check_sm_version(cudaDeviceProp * dprops) {
|
||||
}
|
||||
@@ -182,6 +185,9 @@ bool check_sm_version(cudaDeviceProp * dprops) {
|
||||
|
||||
bool check_flash_attention_hardware_support(sdp_params const& params, bool debug) {
|
||||
+#ifdef USE_FLASH_ATTENTION
|
||||
// Check that the gpu is capable of running flash attention
|
||||
+#ifndef USE_FLASH_ATTENTION
|
||||
+ return false;
|
||||
+#else
|
||||
using sm80 = SMVersion<8, 0>;
|
||||
using sm90 = SMVersion<9, 0>;
|
||||
@@ -211,6 +214,9 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
|
||||
#if USE_ROCM
|
||||
@@ -209,9 +215,13 @@ bool check_flash_attention_hardware_support(sdp_params const& params, bool debug
|
||||
}
|
||||
#endif
|
||||
return true;
|
||||
+#else
|
||||
+ return false;
|
||||
+#endif
|
||||
}
|
||||
|
||||
bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug) {
|
||||
+#ifndef USE_FLASH_ATTENTION
|
||||
+ return false;
|
||||
+#else
|
||||
// Mem Efficient attention supports hardware in the range [sm_50, sm_90]
|
||||
using sm50 = SMVersion<5, 0>;
|
||||
using sm90 = SMVersion<9, 0>;
|
||||
@@ -240,6 +250,7 @@ bool check_mem_efficient_hardware_support(sdp_params const& params, bool debug)
|
||||
}
|
||||
#endif
|
||||
return true;
|
||||
+#endif
|
||||
}
|
||||
|
||||
bool check_requires_grad_and_head_dim_gt192_constraints_on_sm86_89(
|
||||
@@ -554,7 +565,7 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) {
|
||||
#ifndef USE_FLASH_ATTENTION
|
||||
TORCH_WARN_ONCE(!debug, "Torch was not compiled with flash attention.");
|
||||
return false;
|
||||
-#endif
|
||||
+#else
|
||||
|
||||
// Define gate functions that determine if a flash kernel can be ran
|
||||
// Replace with std::to_array when we migrate to c++20
|
||||
@@ -597,13 +608,14 @@ bool can_use_flash_attention(sdp_params const& params, bool debug) {
|
||||
}
|
||||
}
|
||||
return true;
|
||||
+#endif
|
||||
}
|
||||
|
||||
bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) {
|
||||
#ifndef USE_MEM_EFF_ATTENTION
|
||||
TORCH_WARN_ONCE(!debug, "Torch was not compiled with memory efficient attention.");
|
||||
return false;
|
||||
-#endif
|
||||
+#else
|
||||
// Constraints specific to mem efficient attention
|
||||
constexpr auto greater_than_or_equal_sm80_mem_efficient_dtypes =
|
||||
array_of<at::ScalarType>(at::kHalf, at::kFloat, at::kBFloat16);
|
||||
@@ -663,6 +675,7 @@ bool can_use_mem_efficient_attention(sdp_params const& params, bool debug) {
|
||||
}
|
||||
#endif
|
||||
return check_tensor_dtype(params, less_than_sm80_mem_efficient_dtypes, debug);
|
||||
+#endif
|
||||
}
|
||||
|
||||
SDPBackend select_sdp_backend(sdp_params const& kernel_params) {
|
||||
--
|
||||
2.44.0
|
||||
2.45.2
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@
|
|||
%global date0 20240709
|
||||
%global pypi_version 2.4.0
|
||||
%else
|
||||
%global pypi_version 2.3.1
|
||||
%global pypi_version 2.4.0
|
||||
%endif
|
||||
|
||||
# For -test subpackage
|
||||
|
|
@ -63,18 +63,6 @@
|
|||
%bcond_with distributed
|
||||
%endif
|
||||
|
||||
# OpenCV support came in F41
|
||||
%if 0%{?fedora} > 40
|
||||
%if %{without gitcommit}
|
||||
%bcond_without opencv
|
||||
%else
|
||||
# USE_OPENCV removed in 2.4.0+
|
||||
%bcond_with opencv
|
||||
%endif
|
||||
%else
|
||||
%bcond_with opencv
|
||||
%endif
|
||||
|
||||
# Do no confuse xnnpack versions
|
||||
%if 0%{?fedora} > 40
|
||||
%bcond_without xnnpack
|
||||
|
|
@ -95,14 +83,10 @@
|
|||
%endif
|
||||
|
||||
%if 0%{?fedora} > 40
|
||||
%if %{with gitcommit}
|
||||
%bcond_without fbgemm
|
||||
%else
|
||||
%bcond_with fbgemm
|
||||
%endif
|
||||
%else
|
||||
%bcond_with fbgemm
|
||||
%endif
|
||||
|
||||
# For testing cuda
|
||||
%ifarch x86_64
|
||||
|
|
@ -139,15 +123,9 @@
|
|||
%endif
|
||||
|
||||
# These came in 2.4 and not yet in Fedora
|
||||
%if %{with gitcommit}
|
||||
%bcond_with opentelemetry
|
||||
%bcond_with httplib
|
||||
%bcond_with kineto
|
||||
%else
|
||||
%bcond_without opentelemetry
|
||||
%bcond_without httplib
|
||||
%bcond_without kineto
|
||||
%endif
|
||||
|
||||
Name: python-%{pypi_name}
|
||||
%if %{with gitcommit}
|
||||
|
|
@ -220,7 +198,6 @@ Source40: https://github.com/Maratyszcza/pthreadpool/archive/%{pt_commit}/
|
|||
Source50: https://github.com/mreineck/pocketfft/archive/%{pf_commit}/pocketfft-%{pf_scommit}.tar.gz
|
||||
%endif
|
||||
|
||||
%if %{with gitcommit}
|
||||
%if %{without opentelemetry}
|
||||
%global ot_ver 1.14.2
|
||||
Source60: https://github.com/open-telemetry/opentelemetry-cpp/archive/refs/tags/v%{ot_ver}.tar.gz
|
||||
|
|
@ -237,50 +214,15 @@ Source70: https://github.com/yhirose/cpp-httplib/archive/%{hl_commit}/cpp-
|
|||
%global ki_scommit %(c=%{ki_commit}; echo ${c:0:7})
|
||||
Source80: https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{ki_scommit}.tar.gz
|
||||
%endif
|
||||
%endif
|
||||
|
||||
Patch0: 0001-no-third_party-foxi.patch
|
||||
%if %{with gitcommit}
|
||||
# https://github.com/pytorch/pytorch/pull/131282
|
||||
Patch1: 0001-Add-cmake-option-USE_SYSTEM_FBGEMM.patch
|
||||
%endif
|
||||
|
||||
%if %{without gitcommit}
|
||||
Patch3: 0001-Stub-in-kineto-ActivityType.patch
|
||||
%endif
|
||||
|
||||
%if %{with caffe2}
|
||||
Patch6: 0001-reenable-foxi-linking.patch
|
||||
%endif
|
||||
|
||||
# Bring some patches forward
|
||||
%if %{without gitcommit}
|
||||
# https://github.com/pytorch/pytorch/pull/123384
|
||||
# Breaks on python 3.13
|
||||
# Patch7: 0001-Reenable-dim-for-python-3.12.patch
|
||||
|
||||
# Dynamo/Inductor on 3.12
|
||||
# Fails to apply on 2.3.1
|
||||
# Patch8: 0001-dynamo-3.12-enable-dynamo-on-3.12-enable-most-dynamo.patch
|
||||
%endif
|
||||
|
||||
%if %{without gitcommit}
|
||||
# Enable x86 CPU vectorization on windows [submodule sleef] (#118980)
|
||||
# https://github.com/pytorch/pytorch/commit/56451cd49d9cf94b49197e09dec13426bb1a5370
|
||||
#
|
||||
# Despite the title, this patch fixes compatibility with sleef 3.6 by including
|
||||
# a backwards-compatible version of the fix from
|
||||
# https://github.com/pytorch/pytorch/pull/122723.
|
||||
# Cherry-picked on v2.3.0, without the commit to update the third_party/sleef
|
||||
# git submodule (because the release archive contains an actual sleef source
|
||||
# tree instead, so this would not apply.)
|
||||
Patch9: 0001-Enable-x86-CPU-vectorization-on-windows-submodule-sl.patch
|
||||
|
||||
# For Python 3.13
|
||||
# https://github.com/pytorch/pytorch/pull/126033
|
||||
Patch10: 0001-Changes-to-compile-with-3.13-126033.patch
|
||||
%endif
|
||||
|
||||
# ROCm patches
|
||||
# Patches need to be refactored for ToT
|
||||
# These are ROCm packages
|
||||
|
|
@ -291,9 +233,6 @@ Patch100: 0001-Optionally-use-hipblaslt.patch
|
|||
%endif
|
||||
Patch101: 0001-cuda-hip-signatures.patch
|
||||
Patch102: 0001-silence-an-assert.patch
|
||||
%if %{without gitcommit}
|
||||
Patch103: 0001-can-not-use-with-c-files.patch
|
||||
%endif
|
||||
Patch105: 0001-disable-use-of-aotriton.patch
|
||||
%endif
|
||||
Patch106: 0001-include-fmt-ranges.h-for-using-fmt-join.patch
|
||||
|
|
@ -416,10 +355,6 @@ BuildRequires: libcurand-devel-%{curand_ver}
|
|||
BuildRequires: libcusparse-devel-%{cusparse_ver}
|
||||
%endif
|
||||
|
||||
%if %{with opencv}
|
||||
BuildRequires: opencv-devel
|
||||
%endif
|
||||
|
||||
%if %{with test}
|
||||
BuildRequires: google-benchmark-devel
|
||||
%endif
|
||||
|
|
@ -627,7 +562,6 @@ rm -rf third_party/pocketfft/*
|
|||
cp -r pocketfft-*/* third_party/pocketfft/
|
||||
%endif
|
||||
|
||||
%if %{with gitcommit}
|
||||
%if %{without opentelemtry}
|
||||
tar xf %{SOURCE60}
|
||||
rm -rf third_party/opentelemetry-cpp/*
|
||||
|
|
@ -645,16 +579,6 @@ tar xf %{SOURCE80}
|
|||
rm -rf third_party/kineto/*
|
||||
cp -r kineto-*/* third_party/kineto/
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%if %{with opencv}
|
||||
%if %{without gitcommit}
|
||||
# Reduce requirements, *FOUND is not set
|
||||
sed -i -e 's/USE_OPENCV AND OpenCV_FOUND AND USE_FFMPEG AND FFMPEG_FOUND/USE_OPENCV AND USE_FFMPEG/' caffe2/video/CMakeLists.txt
|
||||
sed -i -e 's/USE_OPENCV AND OpenCV_FOUND/USE_OPENCV/' caffe2/image/CMakeLists.txt
|
||||
sed -i -e 's/STATUS/FATAL/' caffe2/image/CMakeLists.txt
|
||||
%endif
|
||||
%endif
|
||||
|
||||
# hipblaslt only building with gfx90a
|
||||
%if %{with hipblaslt}
|
||||
|
|
@ -810,18 +734,8 @@ mkdir third_party/pocketfft
|
|||
mkdir third_party/valgrind-headers
|
||||
cp %{_includedir}/valgrind/* third_party/valgrind-headers
|
||||
|
||||
%if %{without gitcommit}
|
||||
# Remove unneeded OpenCL files that confuse the lincense scanner
|
||||
rm caffe2/contrib/opencl/OpenCL/cl.hpp
|
||||
rm caffe2/mobile/contrib/libopencl-stub/include/CL/*.h
|
||||
rm caffe2/mobile/contrib/libopencl-stub/include/CL/*.hpp
|
||||
%endif
|
||||
|
||||
# Fix installing to /usr/lib64
|
||||
%if %{with gitcommit}
|
||||
sed -i -e 's@DESTINATION ${PYTHON_LIB_REL_PATH}@DESTINATION ${CMAKE_INSTALL_PREFIX}/${PYTHON_LIB_REL_PATH}@' caffe2/CMakeLists.txt
|
||||
%endif
|
||||
|
||||
|
||||
%if %{with rocm}
|
||||
# hipify
|
||||
|
|
@ -924,9 +838,6 @@ export USE_NNPACK=OFF
|
|||
export USE_NUMPY=ON
|
||||
export USE_OPENMP=ON
|
||||
export USE_PYTORCH_QNNPACK=OFF
|
||||
%if %{without gitcommit}
|
||||
export USE_QNNPACK=OFF
|
||||
%endif
|
||||
export USE_ROCM=OFF
|
||||
export USE_SYSTEM_SLEEF=ON
|
||||
export USE_SYSTEM_EIGEN_INSTALL=ON
|
||||
|
|
@ -949,12 +860,6 @@ export USE_SYSTEM_PSIMD=ON
|
|||
export USE_SYSTEM_XNNPACK=ON
|
||||
%endif
|
||||
|
||||
%if %{with caffe2}
|
||||
%if %{without gitcommit}
|
||||
export BUILD_CAFFE2=ON
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%if %{with cuda}
|
||||
%if %{without rocm}
|
||||
export CPLUS_INCLUDE_PATH=/usr/local/cuda-%{cuda_ver}/include
|
||||
|
|
@ -982,10 +887,6 @@ export USE_MPI=ON
|
|||
%endif
|
||||
%endif
|
||||
|
||||
%if %{with opencv}
|
||||
export USE_OPENCV=ON
|
||||
%endif
|
||||
|
||||
%if %{with test}
|
||||
export BUILD_TEST=ON
|
||||
%endif
|
||||
|
|
@ -1097,11 +998,6 @@ done
|
|||
%{python3_sitearch}/%{pypi_name}-*.egg-info
|
||||
%{python3_sitearch}/functorch
|
||||
%{python3_sitearch}/torchgen
|
||||
%if %{without gitcommit}
|
||||
%if %{with caffe2}
|
||||
%{python3_sitearch}/caffe2
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%if %{with rocm}
|
||||
%files -n python3-%{pypi_name}-rocm-gfx8
|
||||
|
|
|
|||
4
sources
4
sources
|
|
@ -12,3 +12,7 @@ SHA512 (libnop-910b558.tar.gz) = 74c5324eaa1b6b2ac8dfef94c835b5c5b044625f8e5efe3
|
|||
SHA512 (pytorch-97ff6cf.tar.gz) = 105ebcba298558fe833f90e7e40b003d35a74609e777f9dc4c47f5668c884f603455113ac0ff252a62b83c81137ae66ceb1a862d351203925dcfc3dcf9f73580
|
||||
SHA512 (pytorch-v2.3.0.tar.gz) = 0c2ffc7bf2fd86070e9958c34eca1f03a0248a011ac6ffaeb69f65306ff856edd5359986f02af25888433187e6d7f29b60edded092e2ac30c8cec49023166eda
|
||||
SHA512 (pytorch-v2.3.1.tar.gz) = fe132251b2bae87b70ba3d95dc32f6a4545970d11893118b0ebe6ca129732e516ef4d6cc4f380b3db9bb2277d1db8ce78a401c40149bb1dfbab76eab9e3992c4
|
||||
SHA512 (pytorch-v2.4.0.tar.gz) = bcfca6aefee00d15d1c0a2456cd9d802d9a8e96816d421d10e3eed5c608bfdf23dfae492ca3638c0bae99ef5bb8c98f4774c0b9f1a8b94d4dc36a52226033314
|
||||
SHA512 (v1.14.2.tar.gz) = 97635bbaf6dd567c201451dfaf7815b2052fe50d9bccc97aade86cfa4a92651374d167296a5453031b2681dc302806a289bca011a9e79ddc381a17d6118971d7
|
||||
SHA512 (cpp-httplib-3b6597b.tar.gz) = 8f1090658c498d04f14fec5c2f301847b1f3360bf92b18d82927643ee04ab61a6b274733a01c7850f9c030205120d674d1d961358d49fdd15636736fb8704f55
|
||||
SHA512 (kineto-be13176.tar.gz) = 41a08c7da9eea7d12402f80a5550c9d4df79798719cc52b12a507828c8c896ba28a37c35d8adf809ca72589e1d84965d5ef6dd01f3f8dc1c803c5ed67b03a43a
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue