diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index fe8d0a54f2af1a..b73dbe123838a4 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -503,6 +503,11 @@ typedef struct { aliased to either operand). Used by the tier 2 optimizer to enable inplace follow-up ops. */ int result_unique; + /* Expected types of the left and right operands. Used by the tier 2 + optimizer to eliminate _GUARD_BINARY_OP_EXTEND when the operand + types are already known. NULL means unknown/don't eliminate. */ + PyTypeObject *lhs_type; + PyTypeObject *rhs_type; } _PyBinaryOpSpecializationDescr; /* Comparison bit masks. */ diff --git a/Include/internal/pycore_list.h b/Include/internal/pycore_list.h index 6b92dc5d111f3b..df0d00f752573b 100644 --- a/Include/internal/pycore_list.h +++ b/Include/internal/pycore_list.h @@ -15,6 +15,7 @@ extern "C" { PyAPI_FUNC(PyObject*) _PyList_Extend(PyListObject *, PyObject *); PyAPI_FUNC(PyObject) *_PyList_SliceSubscript(PyObject*, PyObject*); PyAPI_FUNC(PyObject *) _PyList_BinarySlice(PyObject *, PyObject *, PyObject *); +PyAPI_FUNC(PyObject *) _PyList_Concat(PyObject *, PyObject *); extern void _PyList_DebugMallocStats(FILE *out); // _PyList_GetItemRef should be used only when the object is known as a list // because it doesn't raise TypeError when the object is not a list, whereas PyList_GetItemRef does. 
diff --git a/Include/internal/pycore_tuple.h b/Include/internal/pycore_tuple.h index 9409ec94976d3a..bf80f96396ea4a 100644 --- a/Include/internal/pycore_tuple.h +++ b/Include/internal/pycore_tuple.h @@ -28,6 +28,7 @@ PyAPI_FUNC(void) _PyStolenTuple_Free(PyObject *self); PyAPI_FUNC(PyObject *)_PyTuple_FromStackRefStealOnSuccess(const union _PyStackRef *, Py_ssize_t); PyAPI_FUNC(PyObject *)_PyTuple_FromArraySteal(PyObject *const *, Py_ssize_t); PyAPI_FUNC(PyObject *) _PyTuple_BinarySlice(PyObject *, PyObject *, PyObject *); +PyAPI_FUNC(PyObject *) _PyTuple_Concat(PyObject *, PyObject *); PyAPI_FUNC(PyObject *) _PyTuple_FromPair(PyObject *, PyObject *); PyAPI_FUNC(PyObject *) _PyTuple_FromPairSteal(PyObject *, PyObject *); diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index b31c9f68d01bec..e4050d3db48cb7 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -3836,6 +3836,72 @@ def testfunc(n): # propagates PyFloat_Type. self.assertNotIn("_GUARD_NOS_FLOAT", uops) + def test_binary_op_extend_list_concat_type_propagation(self): + # list + list is specialized via BINARY_OP_EXTEND. The tier 2 optimizer + # should learn that the result is a list and eliminate subsequent + # list-type guards. + def testfunc(n): + a = [1, 2] + b = [3, 4] + x = True + for _ in range(n): + c = a + b + if c[0]: + x = False + return x + + res, ex = self._run_with_optimizer(testfunc, TIER2_THRESHOLD) + self.assertEqual(res, False) + self.assertIsNotNone(ex) + uops = get_opnames(ex) + self.assertIn("_BINARY_OP_EXTEND", uops) + # The c[0] subscript emits _GUARD_NOS_LIST before _BINARY_OP_SUBSCR_LIST_INT; + # since _BINARY_OP_EXTEND now propagates PyList_Type, that guard is gone. + self.assertIn("_BINARY_OP_SUBSCR_LIST_INT", uops) + self.assertNotIn("_GUARD_NOS_LIST", uops) + + def test_binary_op_extend_tuple_concat_type_propagation(self): + # tuple + tuple is specialized via BINARY_OP_EXTEND. 
The tier 2 optimizer + # should learn the result is a tuple and eliminate subsequent tuple guards. + def testfunc(n): + t1 = (1, 2) + t2 = (3, 4) + for _ in range(n): + a, b, c, d = t1 + t2 + return a + b + c + d + + res, ex = self._run_with_optimizer(testfunc, TIER2_THRESHOLD) + self.assertEqual(res, 10) + self.assertIsNotNone(ex) + uops = get_opnames(ex) + self.assertIn("_BINARY_OP_EXTEND", uops) + self.assertIn("_UNPACK_SEQUENCE_TUPLE", uops) + self.assertNotIn("_GUARD_TOS_TUPLE", uops) + + def test_binary_op_extend_guard_elimination(self): + # When both operands have known types (e.g., from a prior + # _BINARY_OP_EXTEND result), the _GUARD_BINARY_OP_EXTEND + # should be eliminated. + def testfunc(n): + a = [1, 2] + b = [3, 4] + total = 0 + for _ in range(n): + c = a + b # first: guard stays, result type = list + d = c + c # second: both operands are list -> guard eliminated + total += d[0] + return total + + res, ex = self._run_with_optimizer(testfunc, TIER2_THRESHOLD) + self.assertEqual(res, TIER2_THRESHOLD) + self.assertIsNotNone(ex) + uops = get_opnames(ex) + # Both list additions use _BINARY_OP_EXTEND + self.assertEqual(uops.count("_BINARY_OP_EXTEND"), 2) + # But the second guard is eliminated because both operands + # are known to be lists from the first _BINARY_OP_EXTEND. 
+ self.assertEqual(uops.count("_GUARD_BINARY_OP_EXTEND"), 1) + def test_unary_invert_long_type(self): def testfunc(n): for _ in range(n): diff --git a/Lib/test/test_opcache.py b/Lib/test/test_opcache.py index 4ca108cd6ca43e..60876080577452 100644 --- a/Lib/test/test_opcache.py +++ b/Lib/test/test_opcache.py @@ -1423,6 +1423,21 @@ def binary_op_add_extend(): self.assert_specialized(binary_op_add_extend, "BINARY_OP_EXTEND") self.assert_no_opcode(binary_op_add_extend, "BINARY_OP") + def binary_op_add_extend_sequences(): + l1 = [1, 2] + l2 = [None] + t1 = (1, 2) + t2 = (None,) + for _ in range(100): + list_sum = l1 + l2 + self.assertEqual(list_sum, [1, 2, None]) + tuple_sum = t1 + t2 + self.assertEqual(tuple_sum, (1, 2, None)) + + binary_op_add_extend_sequences() + self.assert_specialized(binary_op_add_extend_sequences, "BINARY_OP_EXTEND") + self.assert_no_opcode(binary_op_add_extend_sequences, "BINARY_OP") + def binary_op_zero_division(): def compactlong_lhs(arg): 42 / arg diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-01-17-19-48-28.gh-issue-100239.7pbTEA.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-17-19-48-28.gh-issue-100239.7pbTEA.rst new file mode 100644 index 00000000000000..594ef72ac57fae --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-17-19-48-28.gh-issue-100239.7pbTEA.rst @@ -0,0 +1,6 @@ +Specialize ``BINARY_OP`` for concatenation of lists, tuples and bytes, +repetition of strings, bytes and tuples by an integer, and union of +dicts. In the tier 2 optimizer, propagate the result type through +``_BINARY_OP_EXTEND`` so that follow-up type guards can be eliminated, +and remove ``_GUARD_BINARY_OP_EXTEND`` when both operand types are +already known. 
diff --git a/Objects/listobject.c b/Objects/listobject.c index 5c9fd55bab1b22..97869b17cde7a8 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -798,8 +798,8 @@ list_concat_lock_held(PyListObject *a, PyListObject *b) return (PyObject *)np; } -static PyObject * -list_concat(PyObject *aa, PyObject *bb) +PyObject * +_PyList_Concat(PyObject *aa, PyObject *bb) { if (!PyList_Check(bb)) { PyErr_Format(PyExc_TypeError, @@ -3617,7 +3617,7 @@ static PyMethodDef list_methods[] = { static PySequenceMethods list_as_sequence = { list_length, /* sq_length */ - list_concat, /* sq_concat */ + _PyList_Concat, /* sq_concat */ list_repeat, /* sq_repeat */ list_item, /* sq_item */ 0, /* sq_slice */ diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c index ee6320e6ca3cfe..07384acde32e52 100644 --- a/Objects/tupleobject.c +++ b/Objects/tupleobject.c @@ -547,8 +547,8 @@ PyTuple_GetSlice(PyObject *op, Py_ssize_t i, Py_ssize_t j) return tuple_slice((PyTupleObject *)op, i, j); } -static PyObject * -tuple_concat(PyObject *aa, PyObject *bb) +PyObject * +_PyTuple_Concat(PyObject *aa, PyObject *bb) { PyTupleObject *a = _PyTuple_CAST(aa); if (Py_SIZE(a) == 0 && PyTuple_CheckExact(bb)) { @@ -864,7 +864,7 @@ tuple_subtype_new(PyTypeObject *type, PyObject *iterable) static PySequenceMethods tuple_as_sequence = { tuple_length, /* sq_length */ - tuple_concat, /* sq_concat */ + _PyTuple_Concat, /* sq_concat */ tuple_repeat, /* sq_repeat */ tuple_item, /* sq_item */ 0, /* sq_slice */ diff --git a/Python/optimizer_bytecodes.c b/Python/optimizer_bytecodes.c index 58b50707e55cee..0009b5104676ef 100644 --- a/Python/optimizer_bytecodes.c +++ b/Python/optimizer_bytecodes.c @@ -409,6 +409,16 @@ dummy_func(void) { r = right; } + op(_GUARD_BINARY_OP_EXTEND, (descr/4, left, right -- left, right)) { + _PyBinaryOpSpecializationDescr *d = (_PyBinaryOpSpecializationDescr *)descr; + if (d != NULL && d->lhs_type != NULL && d->rhs_type != NULL) { + if (sym_matches_type(left, d->lhs_type) && + 
sym_matches_type(right, d->rhs_type)) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + } + } + op(_BINARY_OP_EXTEND, (descr/4, left, right -- res, l, r)) { _PyBinaryOpSpecializationDescr *d = (_PyBinaryOpSpecializationDescr *)descr; if (d != NULL && d->result_type != NULL) { diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 891887301119d7..c052c63095ad74 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -1156,6 +1156,18 @@ } case _GUARD_BINARY_OP_EXTEND: { + JitOptRef right; + JitOptRef left; + right = stack_pointer[-1]; + left = stack_pointer[-2]; + PyObject *descr = (PyObject *)this_instr->operand0; + _PyBinaryOpSpecializationDescr *d = (_PyBinaryOpSpecializationDescr *)descr; + if (d != NULL && d->lhs_type != NULL && d->rhs_type != NULL) { + if (sym_matches_type(left, d->lhs_type) && + sym_matches_type(right, d->rhs_type)) { + REPLACE_OP(this_instr, _NOP, 0, 0); + } + } break; } diff --git a/Python/specialize.c b/Python/specialize.c index 0fe225dcbb6b5f..ed4d3da6e59b05 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -9,7 +9,8 @@ #include "pycore_function.h" // _PyFunction_GetVersionForCurrentState() #include "pycore_interpframe.h" // FRAME_SPECIALS_SIZE #include "pycore_lazyimportobject.h" // PyLazyImport_CheckExact -#include "pycore_list.h" // _PyListIterObject +#include "pycore_list.h" // _PyListIterObject, _PyList_Concat +#include "pycore_tuple.h" // _PyTuple_Concat #include "pycore_long.h" // _PyLong_IsNonNegativeCompact() #include "pycore_moduleobject.h" #include "pycore_object.h" @@ -2104,6 +2105,166 @@ is_compactlong(PyObject *v) _PyLong_IsCompact((PyLongObject *)v); } +/* list-list */ + +static int +list_list_guard(PyObject *lhs, PyObject *rhs) +{ + return PyList_CheckExact(lhs) && PyList_CheckExact(rhs); +} + +static PyObject * +list_list_add(PyObject *lhs, PyObject *rhs) +{ + return _PyList_Concat(lhs, rhs); +} + +/* tuple-tuple */ + +static int +tuple_tuple_guard(PyObject *lhs, PyObject 
*rhs) +{ + return PyTuple_CheckExact(lhs) && PyTuple_CheckExact(rhs); +} + +static PyObject * +tuple_tuple_add(PyObject *lhs, PyObject *rhs) +{ + return _PyTuple_Concat(lhs, rhs); +} + +/* sequence * int helpers: bypass PyNumber_Multiply dispatch overhead + by calling sq_repeat directly with PyLong_AsSsize_t. */ + +static inline PyObject * +seq_int_multiply(PyObject *seq, PyObject *n, + ssizeargfunc repeat) +{ + Py_ssize_t count = PyLong_AsSsize_t(n); + if (count == -1 && PyErr_Occurred()) { + return NULL; + } + return repeat(seq, count); +} + +/* str-int and int-str */ + +static int +str_int_guard(PyObject *lhs, PyObject *rhs) +{ + return PyUnicode_CheckExact(lhs) && PyLong_CheckExact(rhs); +} + +static int +int_str_guard(PyObject *lhs, PyObject *rhs) +{ + return PyLong_CheckExact(lhs) && PyUnicode_CheckExact(rhs); +} + +static PyObject * +str_int_multiply(PyObject *lhs, PyObject *rhs) +{ + return seq_int_multiply(lhs, rhs, + PyUnicode_Type.tp_as_sequence->sq_repeat); +} + +static PyObject * +int_str_multiply(PyObject *lhs, PyObject *rhs) +{ + return seq_int_multiply(rhs, lhs, + PyUnicode_Type.tp_as_sequence->sq_repeat); +} + +/* bytes-bytes */ + +static int +bytes_bytes_guard(PyObject *lhs, PyObject *rhs) +{ + return PyBytes_CheckExact(lhs) && PyBytes_CheckExact(rhs); +} + +static PyObject * +bytes_bytes_add(PyObject *lhs, PyObject *rhs) +{ + return PyBytes_Type.tp_as_sequence->sq_concat(lhs, rhs); +} + +/* bytes-int and int-bytes */ + +static int +bytes_int_guard(PyObject *lhs, PyObject *rhs) +{ + return PyBytes_CheckExact(lhs) && PyLong_CheckExact(rhs); +} + +static int +int_bytes_guard(PyObject *lhs, PyObject *rhs) +{ + return PyLong_CheckExact(lhs) && PyBytes_CheckExact(rhs); +} + +static PyObject * +bytes_int_multiply(PyObject *lhs, PyObject *rhs) +{ + return seq_int_multiply(lhs, rhs, + PyBytes_Type.tp_as_sequence->sq_repeat); +} + +static PyObject * +int_bytes_multiply(PyObject *lhs, PyObject *rhs) +{ + return seq_int_multiply(rhs, lhs, + 
PyBytes_Type.tp_as_sequence->sq_repeat); +} + +/* tuple-int and int-tuple */ + +static int +tuple_int_guard(PyObject *lhs, PyObject *rhs) +{ + return PyTuple_CheckExact(lhs) && PyLong_CheckExact(rhs); +} + +static int +int_tuple_guard(PyObject *lhs, PyObject *rhs) +{ + return PyLong_CheckExact(lhs) && PyTuple_CheckExact(rhs); +} + +static PyObject * +tuple_int_multiply(PyObject *lhs, PyObject *rhs) +{ + return seq_int_multiply(lhs, rhs, + PyTuple_Type.tp_as_sequence->sq_repeat); +} + +static PyObject * +int_tuple_multiply(PyObject *lhs, PyObject *rhs) +{ + return seq_int_multiply(rhs, lhs, + PyTuple_Type.tp_as_sequence->sq_repeat); +} + +/* dict-dict */ + +static int +dict_dict_guard(PyObject *lhs, PyObject *rhs) +{ + return PyDict_CheckExact(lhs) && PyDict_CheckExact(rhs); +} + +static PyObject * +dict_dict_or(PyObject *lhs, PyObject *rhs) +{ + return PyDict_Type.tp_as_number->nb_or(lhs, rhs); +} + +static PyObject * +dict_dict_ior(PyObject *lhs, PyObject *rhs) +{ + return PyDict_Type.tp_as_number->nb_inplace_or(lhs, rhs); +} + static int compactlongs_guard(PyObject *lhs, PyObject *rhs) { @@ -2194,25 +2355,63 @@ LONG_FLOAT_ACTION(compactlong_float_true_div, /) #undef LONG_FLOAT_ACTION static _PyBinaryOpSpecializationDescr binaryop_extend_descrs[] = { - /* long-long arithmetic */ - {NB_OR, compactlongs_guard, compactlongs_or, &PyLong_Type, 1}, - {NB_AND, compactlongs_guard, compactlongs_and, &PyLong_Type, 1}, - {NB_XOR, compactlongs_guard, compactlongs_xor, &PyLong_Type, 1}, - {NB_INPLACE_OR, compactlongs_guard, compactlongs_or, &PyLong_Type, 1}, - {NB_INPLACE_AND, compactlongs_guard, compactlongs_and, &PyLong_Type, 1}, - {NB_INPLACE_XOR, compactlongs_guard, compactlongs_xor, &PyLong_Type, 1}, - - /* float-long arithemetic */ - {NB_ADD, float_compactlong_guard, float_compactlong_add, &PyFloat_Type, 1}, - {NB_SUBTRACT, float_compactlong_guard, float_compactlong_subtract, &PyFloat_Type, 1}, - {NB_TRUE_DIVIDE, nonzero_float_compactlong_guard, 
float_compactlong_true_div, &PyFloat_Type, 1}, - {NB_MULTIPLY, float_compactlong_guard, float_compactlong_multiply, &PyFloat_Type, 1}, - - /* float-float arithmetic */ - {NB_ADD, compactlong_float_guard, compactlong_float_add, &PyFloat_Type, 1}, - {NB_SUBTRACT, compactlong_float_guard, compactlong_float_subtract, &PyFloat_Type, 1}, - {NB_TRUE_DIVIDE, nonzero_compactlong_float_guard, compactlong_float_true_div, &PyFloat_Type, 1}, - {NB_MULTIPLY, compactlong_float_guard, compactlong_float_multiply, &PyFloat_Type, 1}, + /* long-long arithmetic: guards also check _PyLong_IsCompact, so + type alone is not sufficient to eliminate the guard. */ + {NB_OR, compactlongs_guard, compactlongs_or, &PyLong_Type, 1, NULL, NULL}, + {NB_AND, compactlongs_guard, compactlongs_and, &PyLong_Type, 1, NULL, NULL}, + {NB_XOR, compactlongs_guard, compactlongs_xor, &PyLong_Type, 1, NULL, NULL}, + {NB_INPLACE_OR, compactlongs_guard, compactlongs_or, &PyLong_Type, 1, NULL, NULL}, + {NB_INPLACE_AND, compactlongs_guard, compactlongs_and, &PyLong_Type, 1, NULL, NULL}, + {NB_INPLACE_XOR, compactlongs_guard, compactlongs_xor, &PyLong_Type, 1, NULL, NULL}, + + /* float-long arithmetic: guards also check NaN and compactness. */ + {NB_ADD, float_compactlong_guard, float_compactlong_add, &PyFloat_Type, 1, NULL, NULL}, + {NB_SUBTRACT, float_compactlong_guard, float_compactlong_subtract, &PyFloat_Type, 1, NULL, NULL}, + {NB_TRUE_DIVIDE, nonzero_float_compactlong_guard, float_compactlong_true_div, &PyFloat_Type, 1, NULL, NULL}, + {NB_MULTIPLY, float_compactlong_guard, float_compactlong_multiply, &PyFloat_Type, 1, NULL, NULL}, + + /* long-float arithmetic: guards also check NaN and compactness. 
*/ + {NB_ADD, compactlong_float_guard, compactlong_float_add, &PyFloat_Type, 1, NULL, NULL}, + {NB_SUBTRACT, compactlong_float_guard, compactlong_float_subtract, &PyFloat_Type, 1, NULL, NULL}, + {NB_TRUE_DIVIDE, nonzero_compactlong_float_guard, compactlong_float_true_div, &PyFloat_Type, 1, NULL, NULL}, + {NB_MULTIPLY, compactlong_float_guard, compactlong_float_multiply, &PyFloat_Type, 1, NULL, NULL}, + + /* list-list concatenation: _PyList_Concat always allocates a new list */ + {NB_ADD, list_list_guard, list_list_add, &PyList_Type, 1, &PyList_Type, &PyList_Type}, + /* tuple-tuple concatenation: _PyTuple_Concat has a zero-length shortcut + that can return one of the operands, so the result is not guaranteed + to be a freshly allocated object. */ + {NB_ADD, tuple_tuple_guard, tuple_tuple_add, &PyTuple_Type, 0, &PyTuple_Type, &PyTuple_Type}, + + /* str * int / int * str: call unicode_repeat directly. + unicode_repeat returns the original when n == 1. */ + {NB_MULTIPLY, str_int_guard, str_int_multiply, &PyUnicode_Type, 0, &PyUnicode_Type, &PyLong_Type}, + {NB_MULTIPLY, int_str_guard, int_str_multiply, &PyUnicode_Type, 0, &PyLong_Type, &PyUnicode_Type}, + {NB_INPLACE_MULTIPLY, str_int_guard, str_int_multiply, &PyUnicode_Type, 0, &PyUnicode_Type, &PyLong_Type}, + {NB_INPLACE_MULTIPLY, int_str_guard, int_str_multiply, &PyUnicode_Type, 0, &PyLong_Type, &PyUnicode_Type}, + + /* bytes + bytes: bytes_concat may return an operand when one side + is empty, so result is not always unique. */ + {NB_ADD, bytes_bytes_guard, bytes_bytes_add, &PyBytes_Type, 0, &PyBytes_Type, &PyBytes_Type}, + {NB_INPLACE_ADD, bytes_bytes_guard, bytes_bytes_add, &PyBytes_Type, 0, &PyBytes_Type, &PyBytes_Type}, + + /* bytes * int / int * bytes: call bytes_repeat directly. + bytes_repeat returns the original when n == 1. 
*/ + {NB_MULTIPLY, bytes_int_guard, bytes_int_multiply, &PyBytes_Type, 0, &PyBytes_Type, &PyLong_Type}, + {NB_MULTIPLY, int_bytes_guard, int_bytes_multiply, &PyBytes_Type, 0, &PyLong_Type, &PyBytes_Type}, + {NB_INPLACE_MULTIPLY, bytes_int_guard, bytes_int_multiply, &PyBytes_Type, 0, &PyBytes_Type, &PyLong_Type}, + {NB_INPLACE_MULTIPLY, int_bytes_guard, int_bytes_multiply, &PyBytes_Type, 0, &PyLong_Type, &PyBytes_Type}, + + /* tuple * int / int * tuple: call tuple_repeat directly. + tuple_repeat returns the original when n == 1. */ + {NB_MULTIPLY, tuple_int_guard, tuple_int_multiply, &PyTuple_Type, 0, &PyTuple_Type, &PyLong_Type}, + {NB_MULTIPLY, int_tuple_guard, int_tuple_multiply, &PyTuple_Type, 0, &PyLong_Type, &PyTuple_Type}, + {NB_INPLACE_MULTIPLY, tuple_int_guard, tuple_int_multiply, &PyTuple_Type, 0, &PyTuple_Type, &PyLong_Type}, + {NB_INPLACE_MULTIPLY, int_tuple_guard, int_tuple_multiply, &PyTuple_Type, 0, &PyLong_Type, &PyTuple_Type}, + + /* dict | dict */ + {NB_OR, dict_dict_guard, dict_dict_or, &PyDict_Type, 1, &PyDict_Type, &PyDict_Type}, + {NB_INPLACE_OR, dict_dict_guard, dict_dict_ior, &PyDict_Type, 0, &PyDict_Type, &PyDict_Type}, }; static int