From 8fae6ec80e4deb7b01782ee70576982ea9ea86b6 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 16 Dec 2025 12:37:19 -0800 Subject: [PATCH 1/7] Add maybe_enable_deferred_ref_count(). If we are specializing to LOAD_GLOBAL_MODULE, set deferred reference counting for the value, if it meets criteria. For now, it's only done for frozenset objects. --- Python/specialize.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Python/specialize.c b/Python/specialize.c index 19433bc7a74319..a4a468e7ad005b 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -1264,6 +1264,24 @@ specialize_attr_loadclassattr(PyObject *owner, _Py_CODEUNIT *instr, return 1; } +#ifdef Py_GIL_DISABLED +static void +maybe_enable_deferred_ref_count(PyObject *globals, PyObject *name) +{ + PyObject *value; + if (PyDict_GetItemRef(globals, name, &value) != 1) { + return; + } + if (!PyType_IS_GC(Py_TYPE(value)) || _PyObject_HasDeferredRefcount(value)) { + Py_DECREF(value); + return; + } + if (PyFrozenSet_Check(value)) { + PyUnstable_Object_EnableDeferredRefcount(value); + } + Py_DECREF(value); +} +#endif static void specialize_load_global_lock_held( @@ -1305,6 +1323,9 @@ specialize_load_global_lock_held( SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_OUT_OF_RANGE); goto fail; } +#ifdef Py_GIL_DISABLED + maybe_enable_deferred_ref_count(globals, name); +#endif cache->index = (uint16_t)index; cache->module_keys_version = (uint16_t)keys_version; specialize(instr, LOAD_GLOBAL_MODULE); From 2372f96bb1ebd4d3d3096cb3a7e87db22154b30f Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 16 Dec 2025 15:08:51 -0800 Subject: [PATCH 2/7] Improve maybe_enable_deferred_ref_count() function. Use it for LOAD_ATTR_MODULE in addition to LOAD_GLOBAL_MODULE. Don't enable deferred ref counts if the object is owned by the current thread. Specialized bytecode is per-thread so this works. Enable for frozensets, tuples and type objects. 
--- Python/specialize.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/Python/specialize.c b/Python/specialize.c index a4a468e7ad005b..53d3c626ebaa90 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -355,6 +355,28 @@ static int function_kind(PyCodeObject *code); static bool function_check_args(PyObject *o, int expected_argcount, int opcode); static uint32_t function_get_version(PyObject *o, int opcode); +#ifdef Py_GIL_DISABLED +static void +maybe_enable_deferred_ref_count(PyObject *dict, PyObject *name) +{ + PyObject *op; + if (PyDict_GetItemRef(dict, name, &op) != 1) { + return; + } + if (_Py_IsOwnedByCurrentThread(op) || + !PyType_IS_GC(Py_TYPE(op)) || + _PyObject_HasDeferredRefcount(op)) { + Py_DECREF(op); + return; + } + if (PyFrozenSet_Check(op) || PyTuple_Check(op) || PyType_Check(op)) { + PyUnstable_Object_EnableDeferredRefcount(op); + } + Py_DECREF(op); +} +#endif + + static int specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, PyObject *name) { @@ -384,6 +406,9 @@ specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, P SPECIALIZATION_FAIL(LOAD_ATTR, SPEC_FAIL_OUT_OF_VERSIONS); return -1; } +#ifdef Py_GIL_DISABLED + maybe_enable_deferred_ref_count((PyObject *)dict, name); +#endif write_u32(cache->version, keys_version); cache->index = (uint16_t)index; specialize(instr, LOAD_ATTR_MODULE); @@ -1264,25 +1289,6 @@ specialize_attr_loadclassattr(PyObject *owner, _Py_CODEUNIT *instr, return 1; } -#ifdef Py_GIL_DISABLED -static void -maybe_enable_deferred_ref_count(PyObject *globals, PyObject *name) -{ - PyObject *value; - if (PyDict_GetItemRef(globals, name, &value) != 1) { - return; - } - if (!PyType_IS_GC(Py_TYPE(value)) || _PyObject_HasDeferredRefcount(value)) { - Py_DECREF(value); - return; - } - if (PyFrozenSet_Check(value)) { - PyUnstable_Object_EnableDeferredRefcount(value); - } - Py_DECREF(value); -} -#endif - static void 
specialize_load_global_lock_held( PyObject *globals, PyObject *builtins, From 03977d9f258f3513307b7e620ead0fe805cf1ea6 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Wed, 17 Dec 2025 16:33:53 -0800 Subject: [PATCH 3/7] Enable deferred counting for more types. Also, avoid PyDict_GetItemRef() call by returning the value when the index is looked up (it was previously discarded). --- Include/internal/pycore_dict.h | 1 + Objects/dictobject.c | 12 +++++++++--- Python/specialize.c | 27 +++++++++++---------------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/Include/internal/pycore_dict.h b/Include/internal/pycore_dict.h index 1193f496da132d..43f4505ff41be4 100644 --- a/Include/internal/pycore_dict.h +++ b/Include/internal/pycore_dict.h @@ -114,6 +114,7 @@ extern Py_ssize_t _Py_dict_lookup_threadsafe_stackref(PyDictObject *mp, PyObject extern int _PyDict_GetMethodStackRef(PyDictObject *dict, PyObject *name, _PyStackRef *method); +extern Py_ssize_t _PyDict_LookupIndexAndValue(PyDictObject *, PyObject *, PyObject **); extern Py_ssize_t _PyDict_LookupIndex(PyDictObject *, PyObject *); extern Py_ssize_t _PyDictKeys_StringLookup(PyDictKeysObject* dictkeys, PyObject *key); diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 49a42a35acb8fd..59103deeeffa47 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -2346,10 +2346,9 @@ dict_unhashable_type(PyObject *key) } Py_ssize_t -_PyDict_LookupIndex(PyDictObject *mp, PyObject *key) +_PyDict_LookupIndexAndValue(PyDictObject *mp, PyObject *key, PyObject **value) { // TODO: Thread safety - PyObject *value; assert(PyDict_CheckExact((PyObject*)mp)); assert(PyUnicode_CheckExact(key)); @@ -2359,7 +2358,14 @@ _PyDict_LookupIndex(PyDictObject *mp, PyObject *key) return -1; } - return _Py_dict_lookup(mp, key, hash, &value); + return _Py_dict_lookup(mp, key, hash, value); +} + +Py_ssize_t +_PyDict_LookupIndex(PyDictObject *mp, PyObject *key) +{ + PyObject *value; // discarded + return
_PyDict_LookupIndexAndValue(mp, key, &value); } /* Same as PyDict_GetItemWithError() but with hash supplied by caller. diff --git a/Python/specialize.c b/Python/specialize.c index 53d3c626ebaa90..6e38bd09ea3101 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -357,22 +357,11 @@ static uint32_t function_get_version(PyObject *o, int opcode); #ifdef Py_GIL_DISABLED static void -maybe_enable_deferred_ref_count(PyObject *dict, PyObject *name) +maybe_enable_deferred_ref_count(PyObject *op) { - PyObject *op; - if (PyDict_GetItemRef(dict, name, &op) != 1) { - return; - } - if (_Py_IsOwnedByCurrentThread(op) || - !PyType_IS_GC(Py_TYPE(op)) || - _PyObject_HasDeferredRefcount(op)) { - Py_DECREF(op); - return; - } - if (PyFrozenSet_Check(op) || PyTuple_Check(op) || PyType_Check(op)) { + if (!_Py_IsOwnedByCurrentThread(op)) { PyUnstable_Object_EnableDeferredRefcount(op); } - Py_DECREF(op); } #endif @@ -391,7 +380,8 @@ specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, P SPECIALIZATION_FAIL(LOAD_ATTR, SPEC_FAIL_ATTR_MODULE_ATTR_NOT_FOUND); return -1; } - index = _PyDict_LookupIndex(dict, name); + PyObject *value; + index = _PyDict_LookupIndexAndValue(dict, name, &value); assert (index != DKIX_ERROR); if (index != (uint16_t)index) { SPECIALIZATION_FAIL(LOAD_ATTR, @@ -407,7 +397,7 @@ specialize_module_load_attr_lock_held(PyDictObject *dict, _Py_CODEUNIT *instr, P return -1; } #ifdef Py_GIL_DISABLED - maybe_enable_deferred_ref_count((PyObject *)dict, name); + maybe_enable_deferred_ref_count(value); #endif write_u32(cache->version, keys_version); cache->index = (uint16_t)index; @@ -1308,7 +1298,12 @@ specialize_load_global_lock_held( SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_LOAD_GLOBAL_NON_STRING_OR_SPLIT); goto fail; } +#ifdef Py_GIL_DISABLED + PyObject *value; + Py_ssize_t index = _PyDict_LookupIndexAndValue((PyDictObject *)globals, name, &value); +#else Py_ssize_t index = _PyDictKeys_StringLookup(globals_keys, name); +#endif if (index == 
DKIX_ERROR) { SPECIALIZATION_FAIL(LOAD_GLOBAL, SPEC_FAIL_EXPECTED_ERROR); goto fail; @@ -1330,7 +1325,7 @@ specialize_load_global_lock_held( goto fail; } #ifdef Py_GIL_DISABLED - maybe_enable_deferred_ref_count(globals, name); + maybe_enable_deferred_ref_count(value); #endif cache->index = (uint16_t)index; cache->module_keys_version = (uint16_t)keys_version; From 5ef77af2fa95dc48fdd9a41f6c63b13e9befc7b3 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Wed, 17 Dec 2025 16:39:45 -0800 Subject: [PATCH 4/7] Add deepcopy benchmark to ftscalingbench. This is taken from the PR GH-132658. --- Tools/ftscalingbench/ftscalingbench.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Tools/ftscalingbench/ftscalingbench.py b/Tools/ftscalingbench/ftscalingbench.py index 097a065f368f30..c2bd7c3880bc90 100644 --- a/Tools/ftscalingbench/ftscalingbench.py +++ b/Tools/ftscalingbench/ftscalingbench.py @@ -21,6 +21,7 @@ # > echo "0" | sudo tee /sys/devices/system/cpu/cpufreq/boost # +import copy import math import os import queue @@ -214,6 +215,14 @@ def instantiate_dataclass(): for _ in range(1000 * WORK_SCALE): obj = MyDataClass(x=1, y=2, z=3) + +@register_benchmark +def deepcopy(): + x = {'list': [1, 2], 'tuple': (1, None)} + for i in range(40 * WORK_SCALE): + copy.deepcopy(x) + + def bench_one_thread(func): t0 = time.perf_counter_ns() func() From 04826f7f82d742854d81ad093d4ba3d4193edea8 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 22 Dec 2025 15:53:47 -0800 Subject: [PATCH 5/7] Add comment for maybe_enable_deferred_ref_count(). 
--- Python/specialize.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Python/specialize.c b/Python/specialize.c index 6e38bd09ea3101..f647bf6e26c117 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -360,6 +360,10 @@ static void maybe_enable_deferred_ref_count(PyObject *op) { if (!_Py_IsOwnedByCurrentThread(op)) { + // For module level variables that are heavily used from multiple + // threads, deferred reference counting provides good scaling + // benefits. The downside is that the object will only be deallocated + // by a GC run. PyUnstable_Object_EnableDeferredRefcount(op); } } From 25192bdc789b2d771f78d4c3ae5caf9b272d868c Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Wed, 24 Dec 2025 13:19:37 -0800 Subject: [PATCH 6/7] Add NEWS. --- .../2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst new file mode 100644 index 00000000000000..650dc50893bc68 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst @@ -0,0 +1,6 @@ +If we are specializing to :opcode:`LOAD_GLOBAL_MODULE` or +:opcode:`LOAD_ATTR_MODULE`, try to enable deferred reference counting for +the value, if the object is owned by a different thread. This applies to +the free-threaded build only and will improve scaling of multi-threaded +programs. Note that when deferred reference counting is enabled, the object +will be deallocated by the GC, rather than by :c:func:`Py_DECREF`. From 9b9d01aa0ca79534789db5f49cd51de537c1e56c Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Wed, 24 Dec 2025 14:40:01 -0800 Subject: [PATCH 7/7] Don't use :opcode: markup. 
--- .../2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst index 650dc50893bc68..bbc9611b748fde 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-12-24-13-19-16.gh-issue-132657._P4DDb.rst @@ -1,6 +1,6 @@ -If we are specializing to :opcode:`LOAD_GLOBAL_MODULE` or -:opcode:`LOAD_ATTR_MODULE`, try to enable deferred reference counting for -the value, if the object is owned by a different thread. This applies to -the free-threaded build only and will improve scaling of multi-threaded -programs. Note that when deferred reference counting is enabled, the object -will be deallocated by the GC, rather than by :c:func:`Py_DECREF`. +If we are specializing to ``LOAD_GLOBAL_MODULE`` or ``LOAD_ATTR_MODULE``, try +to enable deferred reference counting for the value, if the object is owned by +a different thread. This applies to the free-threaded build only and should +improve scaling of multi-threaded programs. Note that when deferred reference +counting is enabled, the object will be deallocated by the GC, rather than by +:c:func:`Py_DECREF`.