Update gitcommit to 2.8.0-rc8

Patch problem with Python 3.14.
Start converting over py3 macros.
Handle new dependency on rocm-smi.

Signed-off-by: Tom Rix <Tom.Rix@amd.com>
2025-07-24 06:07:03 -07:00 · 2025-07-24 06:07:03 -07:00 · 61ccf033a8
commit 61ccf033a8
parent 42c33b8dcd
4 changed files with 560 additions and 29 deletions
--- a/next/0001-Add-cmake-variable-USE_ROCM_CK.patch
+++ b/next/0001-Add-cmake-variable-USE_ROCM_CK.patch
@ -1,17 +1,17 @@
-From 4cc5d88dfe7a45ab245648dc874645d32a24b98b Mon Sep 17 00:00:00 2001
+From 193854993cd939de186de19589c1add4c4b2cf66 Mon Sep 17 00:00:00 2001
 From: Tom Rix <Tom.Rix@amd.com>
-Date: Fri, 27 Jun 2025 13:52:51 -0700
+Date: Mon, 21 Jul 2025 11:35:03 -0700
 Subject: [PATCH] Add cmake variable USE_ROCM_CK

 ---
 CMakeLists.txt                  |  1 +
 aten/src/ATen/CMakeLists.txt    | 40 ++++++++++++++++-----------------
- aten/src/ATen/cuda/CUDABlas.cpp | 10 ++++-----
+ aten/src/ATen/cuda/CUDABlas.cpp | 22 +++++++++---------
 cmake/Dependencies.cmake        |  3 +++
- 4 files changed, 29 insertions(+), 25 deletions(-)
+ 4 files changed, 35 insertions(+), 31 deletions(-)

 diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 99c0b9e0ea0c..4c632e42f531 100644
+index a5d25e6afa0f..afc1b53efa64 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
@@ -240,6 +240,7 @@ cmake_dependent_option(
@ -82,7 +82,7 @@ index c9cfd74b501e..59f6178218ee 100644
     file(GLOB native_hip_ck "native/hip/ck*.hip")
     exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}"
 diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp
-index 89350a11bea7..33e5f2808057 100644
+index 89350a11bea7..e5b7960177cf 100644
 --- a/aten/src/ATen/cuda/CUDABlas.cpp
 +++ b/aten/src/ATen/cuda/CUDABlas.cpp
@@ -752,7 +752,7 @@ template <>
@ -94,16 +94,16 @@ index 89350a11bea7..33e5f2808057 100644
     // hipblaslt does not support double gemm yet
     bgemm_internal_cublas<double>(CUDABLAS_BGEMM_ARGS(double));
 #else
-@@ -1103,7 +1103,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
-   void * beta_ptr = &fbeta;
-   _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc);
-   GEMM_CHECK_ARGVALUES(at::Half);
-#ifdef USE_ROCM
-+#ifdef USE_ROCM_CK
-   int flag = 0;
- #if USE_GEMM_FLAGS_FP16_ALT_IMPL
-   flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
-@@ -1270,7 +1270,7 @@ template <>
+@@ -836,7 +836,7 @@ void bgemm_internal<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16))
+       bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
+     }
+   }
+-#if defined(USE_ROCM) && !defined(_MSC_VER)
+#if defined(USE_ROCM) && defined(USE_ROCM_CK)
+   else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
+     at::native::bgemm_internal_ck<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
+   }
+@@ -1270,14 +1270,14 @@ template <>
 void gemm_internal<double>(CUDABLAS_GEMM_ARGTYPES(double))
 {
   if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
@ -112,6 +112,23 @@ index 89350a11bea7..33e5f2808057 100644
     // hipblaslt does not support double gemm yet
     gemm_internal_cublas<double>(CUDABLAS_GEMM_ARGS(double));
 #else
+     gemm_internal_cublaslt<double>(CUDABLAS_GEMM_ARGS(double));
+ #endif
+   }
+-#if defined(USE_ROCM) && !defined(_MSC_VER)
+#if defined(USE_ROCM) && defined(USE_ROCM_CK)
+   else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
+     at::native::gemm_internal_ck<double>(CUDABLAS_GEMM_ARGS(double));
+   }
+@@ -1293,7 +1293,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
+   if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
+     gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
+   }
+-#if defined(USE_ROCM) && !defined(_MSC_VER)
+#if defined(USE_ROCM) && defined(USE_ROCM_CK)
+   else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
+     if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100
+       gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
@@ -1311,7 +1311,7 @@ template <>
 void gemm_internal<c10::complex<double>>(CUDABLAS_GEMM_ARGTYPES(c10::complex<double>))
 {
@ -130,6 +147,42 @@ index 89350a11bea7..33e5f2808057 100644
     // hipblaslt does not support complex gemm yet
     gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGS(c10::complex<float>));
 #else
+@@ -1345,7 +1345,7 @@ void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half))
+   if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
+     gemm_internal_cublaslt<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));
+   }
+-#if defined(USE_ROCM) && !defined(_MSC_VER)
+#if defined(USE_ROCM) && defined(USE_ROCM_CK)
+   else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
+     at::native::gemm_internal_ck<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));
+   }
+@@ -1361,7 +1361,7 @@ void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16))
+   if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
+     gemm_internal_cublaslt<at::BFloat16>(CUDABLAS_GEMM_ARGS(at::BFloat16));
+   }
+-#if defined(USE_ROCM) && !defined(_MSC_VER)
+#if defined(USE_ROCM) && defined(USE_ROCM_CK)
+   else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
+     at::native::gemm_internal_ck<at::BFloat16>(CUDABLAS_GEMM_ARGS(at::BFloat16));
+   }
+@@ -1382,7 +1382,7 @@ void gemm_internal<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half,
+   if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
+     gemm_internal_cublaslt<at::Half, float>(CUDABLAS_GEMM_ARGS(at::Half));
+   }
+-#if defined(USE_ROCM) && !defined(_MSC_VER)
+#if defined(USE_ROCM) && defined(USE_ROCM_CK)
+   else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
+     TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
+   }
+@@ -1398,7 +1398,7 @@ void gemm_internal<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::B
+   if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
+     gemm_internal_cublaslt<at::BFloat16, float>(CUDABLAS_GEMM_ARGS(at::BFloat16));
+   }
+-#if defined(USE_ROCM) && !defined(_MSC_VER)
+#if defined(USE_ROCM) && defined(USE_ROCM_CK)
+   else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
+     TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
+   }
 diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
 index a93386c27f8d..be1368999d38 100644
 --- a/cmake/Dependencies.cmake
--- a/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch
+++ b/next/0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch
@ -0,0 +1,359 @@
+From f2a544b2e3a5bdc04985f6e06223c0c1700120a0 Mon Sep 17 00:00:00 2001
+From: albanD <desmaison.alban@gmail.com>
+Date: Sat, 12 Jul 2025 03:42:33 -0400
+Subject: [PATCH] Fix compilation and "import torch" issues for cpython 3.14
+
+Imported from
+https://github.com/albanD/pytorch/tree/cpython314_build
+commit 88bb9cdb72449f4277829e20d94ad8aec1894216
+
+Signed-off-by: Tom Rix <Tom.Rix@amd.com>
+---
+ torch/_dynamo/bytecode_analysis.py        |  2 +-
+ torch/ao/quantization/__init__.py         |  5 +++-
+ torch/ao/quantization/qconfig.py          |  4 ++-
+ torch/ao/quantization/utils.py            |  7 +++--
+ torch/csrc/dynamo/cpython_defs.c          | 16 +++++++++++
+ torch/csrc/dynamo/cpython_includes.h      | 17 ++++++++++++
+ torch/csrc/dynamo/eval_frame.c            | 34 +++++++++++++++--------
+ torch/csrc/dynamo/framelocals_mapping.cpp | 14 ++++++++++
+ torch/csrc/utils/python_compat.h          |  1 +
+ torch/onnx/__init__.py                    |  1 -
+ torch/utils/weak.py                       | 29 +++++++++++++++++--
+ 11 files changed, 111 insertions(+), 19 deletions(-)
+
+diff --git a/torch/_dynamo/bytecode_analysis.py b/torch/_dynamo/bytecode_analysis.py
+index 3252ea91409f..2de74ee5bf8d 100644
+--- a/torch/_dynamo/bytecode_analysis.py
+++ b/torch/_dynamo/bytecode_analysis.py
+@@ -33,7 +33,7 @@ if sys.version_info >= (3, 11):
+     TERMINAL_OPCODES.add(dis.opmap["JUMP_FORWARD"])
+ else:
+     TERMINAL_OPCODES.add(dis.opmap["JUMP_ABSOLUTE"])
+-if sys.version_info >= (3, 12):
+if (3, 12) <= sys.version_info < (3, 14):
+     TERMINAL_OPCODES.add(dis.opmap["RETURN_CONST"])
+ if sys.version_info >= (3, 13):
+     TERMINAL_OPCODES.add(dis.opmap["JUMP_BACKWARD_NO_INTERRUPT"])
+diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py
+index ffc1792fd23f..cf5a8b99a894 100644
+--- a/torch/ao/quantization/__init__.py
+++ b/torch/ao/quantization/__init__.py
+@@ -1,5 +1,6 @@
+ # mypy: allow-untyped-defs
+ 
+import sys
+ from typing import Callable, Optional, Union
+ 
+ import torch
+@@ -33,7 +34,9 @@ from .stubs import *  # noqa: F403
+ 
+ # ensure __module__ is set correctly for public APIs
+ ObserverOrFakeQuantize = Union[ObserverBase, FakeQuantizeBase]
+-ObserverOrFakeQuantize.__module__ = "torch.ao.quantization"
+if sys.version_info < (3, 14):
+    ObserverOrFakeQuantize.__module__ = "torch.ao.quantization"
+
+ for _f in [
+     compare_results,
+     extract_results_from_loggers,
+diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py
+index efee5302ad42..d9a8fc78bab4 100644
+--- a/torch/ao/quantization/qconfig.py
+++ b/torch/ao/quantization/qconfig.py
+@@ -1,5 +1,6 @@
+ # mypy: allow-untyped-defs
+ import copy
+import sys
+ import warnings
+ from collections import namedtuple
+ from typing import Any, Optional, Union
+@@ -568,7 +569,8 @@ def _assert_valid_qconfig(qconfig: Optional[QConfig], mod: torch.nn.Module) -> N
+ 
+ 
+ QConfigAny = Optional[QConfig]
+-QConfigAny.__module__ = "torch.ao.quantization.qconfig"
+if sys.version_info < (3, 14):
+    QConfigAny.__module__ = "torch.ao.quantization.qconfig"
+ 
+ 
+ def _add_module_to_qconfig_obs_ctr(
+diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py
+index 4ac3112ec072..3b1503e01701 100644
+--- a/torch/ao/quantization/utils.py
+++ b/torch/ao/quantization/utils.py
+@@ -4,6 +4,7 @@ Utils shared by different modes of quantization (eager/graph)
+ """
+ 
+ import functools
+import sys
+ import warnings
+ from collections import OrderedDict
+ from inspect import getfullargspec, signature
+@@ -16,7 +17,8 @@ from torch.nn.utils.parametrize import is_parametrized
+ 
+ 
+ NodePattern = Union[tuple[Node, Node], tuple[Node, tuple[Node, Node]], Any]
+-NodePattern.__module__ = "torch.ao.quantization.utils"
+if sys.version_info < (3, 14):
+    NodePattern.__module__ = "torch.ao.quantization.utils"
+ 
+ # This is the Quantizer class instance from torch/quantization/fx/quantize.py.
+ # Define separately to prevent circular imports.
+@@ -31,7 +33,8 @@ QuantizerCls = Any
+ Pattern = Union[
+     Callable, tuple[Callable, Callable], tuple[Callable, tuple[Callable, Callable]], Any
+ ]
+-Pattern.__module__ = "torch.ao.quantization.utils"
+if sys.version_info < (3, 14):
+    Pattern.__module__ = "torch.ao.quantization.utils"
+ 
+ 
+ # TODO: maybe rename this to MatchInputNode
+diff --git a/torch/csrc/dynamo/cpython_defs.c b/torch/csrc/dynamo/cpython_defs.c
+index b68ef894aeaa..244d4165d5e8 100644
+--- a/torch/csrc/dynamo/cpython_defs.c
+++ b/torch/csrc/dynamo/cpython_defs.c
+@@ -2,6 +2,20 @@
+ #include <torch/csrc/dynamo/cpython_includes.h>
+ #include <torch/csrc/dynamo/debug_macros.h>
+ 
+#if IS_PYTHON_3_14_PLUS
+
+const uint8_t* THP_PyOpcode_Caches = NULL;
+const int THP_PyOpcode_Caches_size = 0;
+
+void
+THP_PyThreadState_PopFrame(PyThreadState *tstate, _PyInterpreterFrame * frame)
+{}
+void
+THP_PyFrame_Clear(_PyInterpreterFrame *frame)
+{}
+
+#else
+
+ #if IS_PYTHON_3_11_PLUS
+ 
+ #define Py_BUILD_CORE
+@@ -360,3 +374,5 @@ const uint8_t* THP_PyOpcode_Caches = NULL;
+ const int THP_PyOpcode_Caches_size = 0;
+ 
+ #endif
+
+#endif // IS_PYTHON_3_14_PLUS
+\ No newline at end of file
+diff --git a/torch/csrc/dynamo/cpython_includes.h b/torch/csrc/dynamo/cpython_includes.h
+index 6b99c1d5aec8..616be16563cf 100644
+--- a/torch/csrc/dynamo/cpython_includes.h
+++ b/torch/csrc/dynamo/cpython_includes.h
+@@ -21,6 +21,14 @@
+ 
+ #if IS_PYTHON_3_11_PLUS
+ #include <internal/pycore_frame.h>
+#if IS_PYTHON_3_14_PLUS
+#include <internal/pycore_interpframe_structs.h>
+#include <internal/pycore_stackref.h>
+#endif
+#endif
+
+#if IS_PYTHON_3_14_PLUS
+#include <internal/pycore_code.h>
+ #endif
+ 
+ #undef Py_BUILD_CORE
+@@ -30,6 +38,13 @@
+ extern "C" {
+ #endif
+ 
+#if IS_PYTHON_3_14_PLUS
+
+#define F_CODE(x) (PyCodeObject*)PyStackRef_AsPyObjectBorrow(x->f_executable)
+#define PREV_INSTR(x) (x)->instr_ptr
+
+#else
+
+ #if IS_PYTHON_3_13_PLUS
+ #define F_CODE(x) ((PyCodeObject*)(x)->f_executable)
+ #define PREV_INSTR(x) (x)->instr_ptr
+@@ -38,6 +53,8 @@ extern "C" {
+ #define PREV_INSTR(x) (x)->prev_instr
+ #endif
+ 
+#endif // IS_PYTHON_3_14_PLUS
+
+ #if IS_PYTHON_3_12_PLUS
+ #define FUNC(x) ((x)->f_funcobj)
+ #else
+diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c
+index f413782b2d30..72bb8839bac3 100644
+--- a/torch/csrc/dynamo/eval_frame.c
+++ b/torch/csrc/dynamo/eval_frame.c
+@@ -224,17 +224,6 @@ const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) {
+   return PyUnicode_AsUTF8(F_CODE(frame)->co_name);
+ }
+ 
+-void clear_old_frame_if_python_312_plus(
+-    PyThreadState* tstate,
+-    THP_EVAL_API_FRAME_OBJECT* frame) {
+-#if IS_PYTHON_3_12_PLUS
+-
+-  THP_PyFrame_Clear(frame);
+-  THP_PyThreadState_PopFrame(tstate, frame);
+-
+-#endif
+-}
+-
+ static PyObject* dynamo_eval_custom_code_impl(
+     PyThreadState* tstate,
+     THP_EVAL_API_FRAME_OBJECT* frame,
+@@ -485,6 +474,18 @@ static PyObject* dynamo__custom_eval_frame_shim(
+ 
+ static void enable_eval_frame_shim(PyThreadState* tstate) {}
+ static void enable_eval_frame_default(PyThreadState* tstate) {}
+PyObject* dynamo_eval_custom_code(
+    PyThreadState* tstate,
+    THP_EVAL_API_FRAME_OBJECT* frame,
+    PyCodeObject* code,
+    const char* trace_annotation,
+    int throw_flag) {}
+THPPyInterpreterFrame* THPPyInterpreterFrame_New(
+    THP_EVAL_API_FRAME_OBJECT* frame) {}
+PyObject* dynamo_eval_frame_default(
+    PyThreadState* tstate,
+    THP_EVAL_API_FRAME_OBJECT* frame,
+    int throw_flag) {}
+ 
+ static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL};
+ 
+@@ -498,6 +499,17 @@ static PyTypeObject THPPyInterpreterFrameType = {
+ 
+ #endif // !(IS_PYTHON_3_14_PLUS)
+ 
+void clear_old_frame_if_python_312_plus(
+    PyThreadState* tstate,
+    THP_EVAL_API_FRAME_OBJECT* frame) {
+#if IS_PYTHON_3_12_PLUS
+
+  THP_PyFrame_Clear(frame);
+  THP_PyThreadState_PopFrame(tstate, frame);
+
+#endif
+}
+
+ static PyObject* increment_working_threads(
+     PyThreadState* tstate,
+     PyObject* module) {
+diff --git a/torch/csrc/dynamo/framelocals_mapping.cpp b/torch/csrc/dynamo/framelocals_mapping.cpp
+index b839fb26fc91..c4ee36d87767 100644
+--- a/torch/csrc/dynamo/framelocals_mapping.cpp
+++ b/torch/csrc/dynamo/framelocals_mapping.cpp
+@@ -26,9 +26,13 @@ FrameLocalsMapping::FrameLocalsMapping(FrameLocalsFrameType* frame)
+   PyCodeObject* co = F_CODE(frame);
+   _framelocals.resize(co->co_nlocalsplus, nullptr);
+ 
+#if IS_PYTHON_3_14_PLUS
+  TORCH_CHECK(false, "Python 3.14+ not supported");
+#else
+   if (!frame->stacktop) {
+     return;
+   }
+#endif
+ 
+   auto update_framelocals = [&](int i, PyObject* value) {
+     _PyLocals_Kind kind = _PyLocals_GetKind(co->co_localspluskinds, i);
+@@ -53,11 +57,21 @@ FrameLocalsMapping::FrameLocalsMapping(FrameLocalsFrameType* frame)
+   };
+ 
+   auto offset = co->co_nlocalsplus - co->co_nfreevars;
+#if IS_PYTHON_3_14_PLUS
+  TORCH_CHECK(false, "Python 3.14+ not supported");
+#else
+   for (int i = 0; i < offset; i++) {
+     update_framelocals(i, frame->localsplus[i]);
+   }
+#endif
+
+   // Get references to closure variables
+#if IS_PYTHON_3_14_PLUS
+  PyObject* closure;
+  TORCH_CHECK(false, "Python 3.14+ not supported");
+#else
+   PyObject* closure = ((PyFunctionObject*)FUNC(frame))->func_closure;
+#endif
+   for (int i = 0; i < co->co_nfreevars; i++) {
+     update_framelocals(offset + i, PyTuple_GET_ITEM(closure, i));
+   }
+diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h
+index a1537611cc47..16292e4fd030 100644
+--- a/torch/csrc/utils/python_compat.h
+++ b/torch/csrc/utils/python_compat.h
+@@ -13,6 +13,7 @@ extern "C" {
+ #define IS_PYTHON_3_12_PLUS PY_VERSION_HEX >= 0x030C0000
+ #define IS_PYTHON_3_13_PLUS PY_VERSION_HEX >= 0x030D0000
+ #define IS_PYTHON_3_14_PLUS PY_VERSION_HEX >= 0x030E0000
+#define IS_PYTHON_3_15_PLUS PY_VERSION_HEX >= 0x030F0000
+ 
+ static inline int PyCode_GetNCellvars(PyCodeObject* code) {
+ // gh-26364 added co_ncellvars to Python 3.11.0rc1
+diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py
+index 345ffd2a065b..ceeadde5365b 100644
+--- a/torch/onnx/__init__.py
+++ b/torch/onnx/__init__.py
+@@ -104,7 +104,6 @@ ONNXProgram.__module__ = "torch.onnx"
+ OnnxExporterError.__module__ = "torch.onnx"
+ _OrtBackend.__module__ = "torch.onnx"
+ _OrtBackendOptions.__module__ = "torch.onnx"
+-_OrtExecutionProvider.__module__ = "torch.onnx"
+ enable_fake_mode.__module__ = "torch.onnx"
+ is_onnxrt_backend_supported.__module__ = "torch.onnx"
+ 
+diff --git a/torch/utils/weak.py b/torch/utils/weak.py
+index 8bf2ba5ed02b..9c7218cb2ad3 100644
+--- a/torch/utils/weak.py
+++ b/torch/utils/weak.py
+@@ -3,8 +3,6 @@ from __future__ import annotations
+ 
+ import collections.abc as _collections_abc
+ import weakref
+-
+-from _weakrefset import _IterationGuard  # type: ignore[attr-defined]
+ from collections.abc import Mapping, MutableMapping
+ from weakref import ref
+ 
+@@ -22,6 +20,33 @@ __all__ = [
+ ]
+ 
+ 
+# TODO: make weakref properly thread safe following
+# https://github.com/python/cpython/pull/125325
+class _IterationGuard:
+    # This context manager registers itself in the current iterators of the
+    # weak container, such as to delay all removals until the context manager
+    # exits.
+    # This technique should be relatively thread-safe (since sets are).
+
+    def __init__(self, weakcontainer):
+        # Don't create cycles
+        self.weakcontainer = ref(weakcontainer)
+
+    def __enter__(self):
+        w = self.weakcontainer()
+        if w is not None:
+            w._iterating.add(self)
+        return self
+
+    def __exit__(self, e, t, b):
+        w = self.weakcontainer()
+        if w is not None:
+            s = w._iterating
+            s.remove(self)
+            if not s:
+                w._commit_removals()
+
+
+ # This file defines a variant of WeakKeyDictionary that overrides the hashing
+ # behavior of the key to use object identity, rather than the builtin
+ # __eq__/__hash__ functions.  This is useful for Tensor weak keys, as their
+-- 
+2.49.0
+
--- a/next/0001-Use-horrible-dynamo-stub.patch
+++ b/next/0001-Use-horrible-dynamo-stub.patch
@ -0,0 +1,85 @@
+From fd535f7bf44f2034cca2a66b4cc7d68d962341df Mon Sep 17 00:00:00 2001
+From: Tom Rix <Tom.Rix@amd.com>
+Date: Sun, 20 Jul 2025 12:47:58 -0700
+Subject: [PATCH] Use horrible dynamo stub
+
+Rawhide's update of python is too fast for dynamo
+So paper over the problem with a horrible stub that throws
+runtime exceptions if dynamo is used.
+
+Signed-off-by: Tom Rix <Tom.Rix@amd.com>
+---
+ build_variables.bzl                        | 26 ++++++++++++----------
+ torch/csrc/dynamo/horrible_dynamo_stub.cpp | 16 +++++++++++++
+ 2 files changed, 30 insertions(+), 12 deletions(-)
+ create mode 100644 torch/csrc/dynamo/horrible_dynamo_stub.cpp
+
+diff --git a/build_variables.bzl b/build_variables.bzl
+index b266c80e8843..a3be6893349b 100644
+--- a/build_variables.bzl
+++ b/build_variables.bzl
+@@ -140,7 +140,8 @@ core_trainer_sources = [
+     "torch/csrc/autograd/variable.cpp",
+     "torch/csrc/autograd/utils/warnings.cpp",
+     "torch/csrc/autograd/jit_decomp_interface.cpp",
+-    "torch/csrc/dynamo/compiled_autograd.cpp",
+#    "torch/csrc/dynamo/compiled_autograd.cpp",
+    "torch/csrc/dynamo/horrible_dynamo_stub.cpp",
+     "torch/csrc/jit/frontend/name_mangler.cpp",
+     "torch/csrc/jit/ir/type_hashing.cpp",
+     "torch/csrc/jit/serialization/pickler.cpp",
+@@ -868,17 +869,18 @@ libtorch_python_core_sources = [
+     "torch/csrc/autograd/python_torch_functions_manual.cpp",
+     "torch/csrc/autograd/python_variable.cpp",
+     "torch/csrc/autograd/python_variable_indexing.cpp",
+-    "torch/csrc/dynamo/python_compiled_autograd.cpp",
+-    "torch/csrc/dynamo/cache_entry.cpp",
+-    "torch/csrc/dynamo/cpp_shim.cpp",
+-    "torch/csrc/dynamo/cpython_defs.c",
+-    "torch/csrc/dynamo/eval_frame.c",
+-    "torch/csrc/dynamo/eval_frame_cpp.cpp",
+-    "torch/csrc/dynamo/extra_state.cpp",
+-    "torch/csrc/dynamo/framelocals_mapping.cpp",
+-    "torch/csrc/dynamo/guards.cpp",
+-    "torch/csrc/dynamo/utils.cpp",
+-    "torch/csrc/dynamo/init.cpp",
+#    "torch/csrc/dynamo/python_compiled_autograd.cpp",
+#    "torch/csrc/dynamo/cache_entry.cpp",
+#    "torch/csrc/dynamo/cpp_shim.cpp",
+#    "torch/csrc/dynamo/cpython_defs.c",
+#    "torch/csrc/dynamo/eval_frame.c",
+#    "torch/csrc/dynamo/eval_frame_cpp.cpp",
+#    "torch/csrc/dynamo/extra_state.cpp",
+#    "torch/csrc/dynamo/framelocals_mapping.cpp",
+#    "torch/csrc/dynamo/guards.cpp",
+#    "torch/csrc/dynamo/utils.cpp",
+#    "torch/csrc/dynamo/init.cpp",
+    "torch/csrc/dynamo/horrible_dynamo_stub.cpp",
+     "torch/csrc/functorch/init.cpp",
+     "torch/csrc/fx/node.cpp",
+     "torch/csrc/mps/Module.cpp",
+diff --git a/torch/csrc/dynamo/horrible_dynamo_stub.cpp b/torch/csrc/dynamo/horrible_dynamo_stub.cpp
+new file mode 100644
+index 000000000000..3ac1324d4557
+--- /dev/null
+++ b/torch/csrc/dynamo/horrible_dynamo_stub.cpp
+@@ -0,0 +1,16 @@
+#include <torch/csrc/autograd/engine.h>
+#include <torch/csrc/dynamo/compiled_autograd.h>
+
+namespace torch::dynamo::autograd {
+const std::unique_ptr<PyCompilerInterface>& getPyCompilerInterface() {
+  throw std::runtime_error("Dynamo not supported");
+  return nullptr;
+}
+std::vector<std::optional<InputMetadata>> get_input_metadata(
+    const edge_list& edges) {
+  std::vector<std::optional<InputMetadata>> r;
+  throw std::runtime_error("Dynamo not supported");
+  return r;
+}
+
+}
+-- 
+2.49.0
+
--- a/python-torch.spec
+++ b/python-torch.spec
@ -6,10 +6,10 @@
 # So pre releases can be tried
 %bcond_with gitcommit
 %if %{with gitcommit}
-# v2.8.0-rc6
-%global commit0 f2b69a083d15e3d0083bb304302a3fd0b5fb8705
+# v2.8.0-rc8
+%global commit0 a1cb3cc05d46d198467bebbb6e8fba50a325d4e7
 %global shortcommit0 %(c=%{commit0}; echo ${c:0:7})
-%global date0 20250718
+%global date0 20250723
 %global pypi_version 2.8.0
 %global flatbuffers_version 24.12.23
 %global miniz_version 3.0.2
@ -33,7 +33,11 @@
 %endif

 # For testing distributed+rccl etc.
+%if %{with gitcommit}
+%bcond_without rccl
+%else
 %bcond_with rccl
+%endif
 %bcond_with gloo
 %bcond_without mpi
 %bcond_without tensorpipe
@ -103,13 +107,13 @@ Source80:       https://github.com/pytorch/kineto/archive/%{ki_commit}/kineto-%{

 %if %{without gitcommit}
 # https://github.com/pytorch/pytorch/issues/150187
-# The hack job
-# Patch11:       0001-python-torch-disable-ck.patch
-# Cleaned up hack job
 Patch11:       0001-Add-cmake-varaible-USE_ROCM_CK.patch
-
 %else
+# https://github.com/pytorch/pytorch/issues/150187
 Patch11:       0001-Add-cmake-variable-USE_ROCM_CK.patch
+# https://github.com/pytorch/pytorch/issues/156595
+# Patch12:       0001-Use-horrible-dynamo-stub.patch
+Patch12:       0001-Fix-compilation-and-import-torch-issues-for-cpython-.patch
 %endif

 ExclusiveArch:  x86_64 aarch64
@ -153,6 +157,9 @@ BuildRequires:  python3dist(filelock)
 BuildRequires:  python3dist(jinja2)
 BuildRequires:  python3dist(networkx)
 BuildRequires:  python3dist(numpy)
+%if %{with gitcommit}
+BuildRequires:  python3dist(pip)
+%endif
 BuildRequires:  python3dist(pyyaml)
 BuildRequires:  python3dist(setuptools)
 BuildRequires:  python3dist(sphinx)
@ -171,6 +178,9 @@ BuildRequires:  hipcub-devel
 BuildRequires:  hipfft-devel
 BuildRequires:  hiprand-devel
 BuildRequires:  hipsparse-devel
+%if %{with gitcommit}
+BuildRequires:  hipsparselt-devel
+%endif
 BuildRequires:  hipsolver-devel
 BuildRequires:  magma-devel
 BuildRequires:  miopen-devel
@ -190,6 +200,7 @@ BuildRequires:  rocm-runtime-devel
 BuildRequires:  rocm-rpm-macros
 %if %{with gitcommit}
 BuildRequires:  rocsolver-devel
+BuildRequires:  rocm-smi-devel
 %endif
 BuildRequires:  rocthrust-devel
 BuildRequires:  roctracer-devel
@ -337,6 +348,10 @@ sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-unus
 sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -Wno-deprecated-declarations@' cmake/Dependencies.cmake
 # Use parallel jobs
 sed -i -e 's@HIP_CLANG_FLAGS -fno-gpu-rdc@HIP_CLANG_FLAGS -fno-gpu-rdc -parallel-jobs=4@' cmake/Dependencies.cmake
+%if %{with gitcommit}
+# Need to link with librocm_smi64
+sed -i -e 's@hiprtc::hiprtc@hiprtc::hiprtc rocm_smi64@' cmake/Dependencies.cmake
+%endif

 # No third_party fmt, use system
 sed -i -e 's@fmt::fmt-header-only@fmt@' CMakeLists.txt
@ -590,17 +605,21 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode

 # pytorch uses clang, not hipcc
 export HIP_CLANG_PATH=%{rocmllvm_bindir}
-%if %{?fedora} <= 43
-export PYTORCH_ROCM_ARCH="gfx1100;gfx1201"
-%else
 export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default}
+
+%if %{with gitcommit}
+%pyproject_wheel
+%else
+%py3_build
 %endif

-%py3_build
-
 %else

+%if %{with gitcommit}
+%pyproject_wheel
+%else
 %py3_build
+%endif

 %endif

@ -617,17 +636,32 @@ export DEVICE_LIB_PATH=${RESOURCE_DIR}/amdgcn/bitcode
 # pytorch uses clang, not hipcc
 export HIP_CLANG_PATH=%{rocmllvm_bindir}
 export PYTORCH_ROCM_ARCH=%{rocm_gpu_list_default}
+%if %{with gitcommit}
+%pyproject_install
+%else
 %py3_install
+%endif

 %else

+%if %{with gitcommit}
+%pyproject_install
+%pyproject_save_files torch
+%else
 %py3_install
-
+%endif

 %endif

+
+
 %check
+%if %{with gitcommit}
+# Not working yet
+# pyproject_check_import torch
+%else
 %py3_check_import torch
+%endif

 # Do not remove the empty files