【CPython3.6源码分析】PyListObject

参考

前言

1
2
3
Another generally useful object type is a list of object pointers.
This is a mutable type: the list items can be changed, and items can be
added or removed. Out-of-range indices or non-list objects are ignored.

老套路,开局一段注释:

  • 对象指针列表
  • 可以增删改查
  • 具有索引容错功能

PyListObject

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
// listobject.h.23
typedef struct {
PyObject_VAR_HEAD
PyObject **ob_item;
Py_ssize_t allocated;
} PyListObject;

/* ob_item contains space for 'allocated' elements. The number
* currently in use is ob_size.
* Invariants:
* 0 <= ob_size <= allocated
* len(list) == ob_size
* ob_item == NULL implies ob_size == allocated == 0
* list.sort() temporarily sets allocated to -1 to detect mutations.
*
* Items must normally not be NULL, except during construction when
* the list is not yet visible outside the function that builds it.
*/

如上,其中的注释很有用,其他信息:

  • **ob_item 明确是指针数组
  • allocated 标记了容器大小,决定了内存大小
  • ob_size 标记了元素个数

PyList_Type

1
2
3
4
5
6
7
8
9
10
11
// listobject.c.2624
PyTypeObject PyList_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"list",
sizeof(PyListObject),
0,
(destructor)list_dealloc, /* tp_dealloc */
PyType_GenericAlloc, /* tp_alloc */
PyType_GenericNew, /* tp_new */
(initproc)list_init, /* tp_init */
};

在之前的 PyLong_Type、PyUnicode_Type中,tp_new都是单独定义的函数。
而在PyList_Type中,是通用的 PyType_Generic* 函数。下面,我们就来看下他们的逻辑。

PyType_GenericNew

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
// typeobject.c.958
PyObject *
PyType_GenericNew(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
return type->tp_alloc(type, 0);
}

// typeobject.c.928
PyObject * PyType_GenericAlloc(PyTypeObject *type, Py_ssize_t nitems)
{
PyObject *obj;
const size_t size = _PyObject_VAR_SIZE(type, nitems+1);
/* note that we need to add one, for the sentinel */

if (PyType_IS_GC(type))
obj = _PyObject_GC_Malloc(size);
else
obj = (PyObject *)PyObject_MALLOC(size);

if (obj == NULL));
return PyErr_NoMemory();

memset(obj, '\0', size);

if (type->tp_flags & Py_TPFLAGS_HEAPTYPE)
Py_INCREF(type);

if (type->tp_itemsize == 0)
(void)PyObject_INIT(obj, type);
else
(void) PyObject_INIT_VAR((PyVarObject *)obj, type, nitems);

if (PyType_IS_GC(type))
_PyObject_GC_TRACK(obj);
return obj;
}

// objimpl.h.141
/* Macros trading binary compatibility for speed. See also pymem.h.
Note that these macros expect non-NULL object pointers.*/
#define PyObject_INIT(op, typeobj) \
( Py_TYPE(op) = (typeobj), _Py_NewReference((PyObject *)(op)), (op) )
#define PyObject_INIT_VAR(op, typeobj, size) \
( Py_SIZE(op) = (size), PyObject_INIT((op), (typeobj)) )

如上,具体细节不在深究,能够看出一个轮廓:

  1. 根据 type 计算 size
  2. 根据 size 调用 MALLOC
  3. 调用 INIT 完成初始化

PyList_New

1
2
3
4
5
6
7
8
9
PyObject* PyList_New(Py_ssize_t len)
Return value: New reference.
Return a new list of length len on success, or NULL on failure.

Note: If len is greater than zero,
the returned list object’s items are set to NULL.
Thus you cannot use abstract API functions such as PySequence_SetItem()
or expose the object to Python code before setting all items to a real
object with PyList_SetItem().

上面,是来自来自文档的API介绍,其中介绍了创建列表,设置元素的方法。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
// listobject.c.104
#define PyList_MAXFREELIST 80
static PyListObject *free_list[PyList_MAXFREELIST];
static int numfree = 0;

// listobject.c.140
PyObject * PyList_New(Py_ssize_t size)
{
PyListObject *op;

// asert size >= 0
if (size < 0) {
PyErr_BadInternalCall();
return NULL;
}

// 尝试共享 list 对象指针
if (numfree) {
numfree--;
op = free_list[numfree];
_Py_NewReference((PyObject *)op);
} else {
op = PyObject_GC_New(PyListObject, &PyList_Type);
if (op == NULL)
return NULL;
}

// 获取真实数据地址
if (size <= 0)
op->ob_item = NULL;
else {
op->ob_item = (PyObject **) PyMem_Calloc(size, sizeof(PyObject *));
}

// 赋初值
Py_SIZE(op) = size;
op->allocated = size;
_PyObject_GC_TRACK(op);
return (PyObject *) op;
}

如上,从中我们可以看到,一个数组 free_list,很容易联想到共享机制。通过 numfreefree_list 的配合,实现了列表的共享,却又不同于之前谈到的整数小对象池。此处共享的是对象的指针,但真实数据的地址是不共享的,这点也很容易明白。

共享机制

如上,free_list 与 small_ints 类似,都是提供对象缓冲池,但又不完全一样。

  • 小整数对象 small_ints 是一个数组 [-5, 256]
  • 单字节对象 characters 是一个数组 latin-1 [0 - 255]
  • UnicodeObject interned 是一个弱引用的对象字典
  • ListObject free_list 是一个数组,存放着80个销毁的对象

list_dealloc

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// Objects/listobject.c/313行
static void list_dealloc(PyListObject *op)
{
Py_ssize_t i;
PyObject_GC_UnTrack(op);
Py_TRASHCAN_SAFE_BEGIN(op)
if (op->ob_item != NULL) {
i = Py_SIZE(op);
while (--i >= 0) {
Py_XDECREF(op->ob_item[i]);
}
PyMem_FREE(op->ob_item);
}
if (numfree < PyList_MAXFREELIST && PyList_CheckExact(op))
free_list[numfree++] = op;
else
Py_TYPE(op)->tp_free((PyObject *)op);
Py_TRASHCAN_SAFE_END(op)
}

如上,PyList_Type 中定义了 tp_dealloc 为 list_dealloc,在代码中可以看见:

  • 对元素进行循环处理:减引用
  • 对 ob_item 指针进行释放
  • 尝试加入缓冲池 free_list

CRUD

在 PyList_New 中,很明确的指出创建对象后要先调用 PyList_SetItem

PyList_SetItem

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
int PyList_SetItem(PyObject *list, Py_ssize_t index, PyObject *item)
Set the item at index index in list to item.
Return 0 on success or -1 on failure.

Note: This function “steals” a reference to item and
discards a reference to an item already in the list at
the affected position.

void PyList_SET_ITEM(PyObject *list, Py_ssize_t i, PyObject *o)
Macro form of PyList_SetItem() without error checking.
This is normally only used to fill in new lists where
there is no previous content.

Note: This macro “steals” a reference to item, and, unlike
PyList_SetItem(), does not discard a reference to any item
that is being replaced;
any reference in list at position i will be leaked.

文档中,指出设置元素的方式,并且指明引用计数的处理原则。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#define PyList_SET_ITEM(op, i, v) (((PyListObject *)(op))->ob_item[i] = (v))

// listobject.c.218
int PyList_SetItem(PyObject *op, Py_ssize_t i, PyObject *newitem)
{
PyObject **p;

// op 类型检查
if (!PyList_Check(op)) {
Py_XDECREF(newitem);
PyErr_BadInternalCall();
return -1;
}

// 索引值容错
if (i < 0 || i >= Py_SIZE(op)) {
Py_XDECREF(newitem);
PyErr_SetString(PyExc_IndexError,
"list assignment index out of range");
return -1;
}

// 对象指针引用
p = ((PyListObject *)op) -> ob_item + i;
Py_XSETREF(*p, newitem);
return 0;
}

// object.h.882
#define Py_XSETREF(op, op2) \
do { \
PyObject *_py_tmp = (PyObject *)(op); \
(op) = (op2); \
Py_XDECREF(_py_tmp); \
} while (0)

如上,对元素进行赋值操作,单纯的进行了对象指针赋值操作,时间复杂度O(1)。

需要注意的是,如上文所述,在指针替换的过程中,只减少了原地址的引用,并未增加 newitem 的引用数。

PyList_GetItem

1
2
3
4
5
6
7
8
9
10
11
12
PyObject* PyList_GetItem(PyObject *list, Py_ssize_t index)
Return value: Borrowed reference.

Return the object at position index in the list pointed to by list.
The position must be positive,
indexing from the end of the list is not supported.
If index is out of bounds, return NULL and set an IndexError exception.

PyObject* PyList_GET_ITEM(PyObject *list, Py_ssize_t i)
Return value: Borrowed reference.

Macro form of PyList_GetItem() without error checking.

文档中也指出了获取元素的方式,其中一个是宏实现。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#define PyList_GET_ITEM(op, i) (((PyListObject *)(op))->ob_item[i])

// Objects/listobject.c/198行
PyObject * PyList_GetItem(PyObject *op, Py_ssize_t i)
{
// 类型检查(略)
if (i < 0 || i >= Py_SIZE(op)) {
if (indexerr == NULL) {
indexerr = PyUnicode_FromString(
"list index out of range");
if (indexerr == NULL)
return NULL;
}
PyErr_SetObject(PyExc_IndexError, indexerr);
return NULL;
}
return ((PyListObject *)op) -> ob_item[i];
}

如上,函数PyList_GetItem会进行容错处理。最终按索引进行数组取值,时间复杂度O(1)

PyList_Insert

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// listobject.c.272
int PyList_Insert(PyObject *op, Py_ssize_t where, PyObject *newitem)
{
// 类型检查(略)
return ins1((PyListObject *)op, where, newitem);
}

// listobject.c.239
static int ins1(PyListObject *self, Py_ssize_t where, PyObject *v)
{
Py_ssize_t i, n = Py_SIZE(self);
PyObject **items;

// 容错处理
if (v == NULL) {
PyErr_BadInternalCall();
return -1;
}
if (n == PY_SSIZE_T_MAX) {
PyErr_SetString(PyExc_OverflowError,
"cannot add more objects to list");
return -1;
}

// !!! 可能调整位置 !!!
if (list_resize(self, n+1) < 0)
return -1;

// 索引正负号处理
if (where < 0) {
where += n;
if (where < 0)
where = 0;
}
if (where > n)
where = n;

// 移动元素
items = self->ob_item;
for (i = n; --i >= where; )
items[i+1] = items[i];

// 插入值
Py_INCREF(v);
items[where] = v;
return 0;
}

如上,对数组进行插值大概可以分为几部分,从中可以看出:

  • 索引 where 支持负值,支持大于元素个数的值
  • 调用 list_resize 可能会调整位置
  • 插入值,会产生移动操作。时间复杂度O(n)

list_resize

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// listobject.c.25
static int list_resize(PyListObject *self, Py_ssize_t newsize)
{
PyObject **items;
size_t new_allocated;
Py_ssize_t allocated = self->allocated;

// 如果足够大,就减小
if (allocated >= newsize && newsize >= (allocated >> 1)) {
assert(self->ob_item != NULL || newsize == 0);
Py_SIZE(self) = newsize;
return 0;
}
// 计算新的需要
new_allocated = (newsize >> 3) + (newsize < 9 ? 3 : 6);

/* check for integer overflow */
if (new_allocated > SIZE_MAX - newsize) {
PyErr_NoMemory();
return -1;
} else {
new_allocated += newsize;
}
if (newsize == 0)
new_allocated = 0;
items = self->ob_item;

// 调整大小
if (new_allocated <= (SIZE_MAX / sizeof(PyObject *)))
PyMem_RESIZE(items, PyObject *, new_allocated);
else
items = NULL;
if (items == NULL) {
PyErr_NoMemory();
return -1;
}

// 赋值
self->ob_item = items;
Py_SIZE(self) = newsize;
self->allocated = new_allocated;
return 0;
}

调用链:
if (list_resize(self, n+1) < 0)
return -1;

如上,list_resize 接收一个 list对 象指针以及新的size值,成功返回0,失败返回-1。具体分析,见下文的伪代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
def list_resize(*self, newsize):
allocated = self.allocated

if newsize == 0:
new_allocated = 0
elif allocated /2 <= newsize <= allocated:
self.size = newsize
return 0
else:
new_allocated = newsize + newsize / 8 + (3 if newsize<9 else 6)

PyMem_RESIZE(items, PyObject *, new_allocated);
self->ob_item = items;
Py_SIZE(self) = newsize;
self->allocated = new_allocated;
return 0;

PyMem_RESIZE,调用 PyMem_REALLOC,实现最终的重新分配内存空间。

PyList_Append

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
// Objects/listobject.c/282行
static int
app1(PyListObject *self, PyObject *v)
{
Py_ssize_t n = PyList_GET_SIZE(self);

if (list_resize(self, n+1) < 0)
return -1;

Py_INCREF(v);
PyList_SET_ITEM(self, n, v);
return 0;
}

int
PyList_Append(PyObject *op, PyObject *newitem)
{
if (PyList_Check(op) && (newitem != NULL))
return app1((PyListObject *)op, newitem);
PyErr_BadInternalCall();
return -1;
}

如上,append -> app1 -> list_resize -> PyList_SET_ITEM
若 list_resize 不触发 PyMem_RESIZE(),时间复杂度为 O(1)

PyList_GetSlice(list_slice)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
// listobject.c.458
PyObject *
PyList_GetSlice(PyObject *a, Py_ssize_t ilow, Py_ssize_t ihigh)
{
// 类型检查(略)
return list_slice((PyListObject *)a, ilow, ihigh);
}

// listobject.c.428
static PyObject *
list_slice(PyListObject *a, Py_ssize_t ilow, Py_ssize_t ihigh)
{
PyListObject *np;
PyObject **src, **dest;
Py_ssize_t i, len;

// 边界处理
if (ilow < 0)
ilow = 0;
else if (ilow > Py_SIZE(a))
ilow = Py_SIZE(a);
if (ihigh < ilow)
ihigh = ilow;
else if (ihigh > Py_SIZE(a))
ihigh = Py_SIZE(a);
len = ihigh - ilow;

// 新对象
np = (PyListObject *) PyList_New(len);
if (np == NULL)
return NULL;
src = a->ob_item + ilow;
dest = np->ob_item;

// 拷贝数据指针
for (i = 0; i < len; i++) {
PyObject *v = src[i];
Py_INCREF(v);
dest[i] = v;
}
return (PyObject *)np;
}

如上,对List 切片取值。其中需要注意的是,在数据拷贝过程中,拷贝的是指针,实现的是 浅拷贝。

PyList_SetSlice(list_ass_slice)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
int
PyList_SetSlice(PyObject *a,
Py_ssize_t ilow,
Py_ssize_t ihigh, PyObject *v)
{
... // 类型检查
return list_ass_slice((PyListObject *)a, ilow, ihigh, v);
}

// listobject.c.569
static int
list_ass_slice(PyListObject *a,
Py_ssize_t ilow, Py_ssize_t ihigh,
PyObject *v)
{
PyObject *recycle_on_stack[8];
/* can allocate more if needed */
PyObject **recycle = recycle_on_stack;
PyObject **item;
PyObject **vitem = NULL;
PyObject *v_as_SF = NULL; /* PySequence_Fast(v) */
Py_ssize_t n; /* # of elements in replacement list */
Py_ssize_t norig; /* # of elements in list getting replaced */
Py_ssize_t d; /* Change in size */
Py_ssize_t k;
size_t s;
int result = -1; /* guilty until proved innocent */
#define b ((PyListObject *)v)

// v = NULL,不执行操作
if (v == NULL)
n = 0;
else {
if (a == b) {
/* Special case "a[i:j] = a" -- copy b first */
v = list_slice(b, 0, Py_SIZE(b));
if (v == NULL)
return result;
result = list_ass_slice(a, ilow, ihigh, v);
Py_DECREF(v);
return result;
}
v_as_SF = PySequence_Fast(v, "can only assign an iterable");
n = PySequence_Fast_GET_SIZE(v_as_SF);
vitem = PySequence_Fast_ITEMS(v_as_SF);
}
// 边界处理(略)
ilow = ?
ihigh = ?
norig = ihigh - ilow;
assert(norig>= 0);
d = n - norig;

// a[:] = [] 清空
if (Py_SIZE(a) + d == 0) {
Py_XDECREF(v_as_SF);
return list_clear(a);
}

// 创建临时数据 recycle
item = a->ob_item;
/* recycle the items that we are about to remove */
s = norig * sizeof(PyObject *);
/* If norig == 0, item might be NULL,
in which case we may not memcpy from it. */
if (s) {
if (s > sizeof(recycle_on_stack)) {
recycle = (PyObject **)PyMem_MALLOC(s);
}
memcpy(recycle, &item[ilow], s);
}

// 形如 a[1:10] = [1,2]
if (d < 0) { /* Delete -d items */
Py_ssize_t tail;
tail = (Py_SIZE(a) - ihigh) * sizeof(PyObject *);

// 左移 a[ihigh:],减少空位
memmove(&item[ihigh+d], &item[ihigh], tail);

// 拷贝后,尝试 resieze
if (list_resize(a, Py_SIZE(a) + d) < 0) {
// 失败后恢复 recycle
memmove(&item[ihigh], &item[ihigh+d], tail);
memcpy(&item[ilow], recycle, s);
goto Error;
}
item = a->ob_item;
}

// 形如 a[0:2] =[1,2,3,4]
else if (d > 0) { /* Insert d items */
k = Py_SIZE(a);
if (list_resize(a, k+d) < 0)
goto Error;
item = a->ob_item;

// 右移 a[ihigh:],腾出位置
memmove(&item[ihigh+d],
&item[ihigh],
(k - ihigh)*sizeof(PyObject *));
}

// 拷贝 v 的值指针到 a
for (k = 0; k < n; k++, ilow++) {
PyObject *w = vitem[k];
Py_XINCREF(w);

// 注意,从 ilow 开始
item[ilow] = w;
}

// 释放 recycle
for (k = norig - 1; k >= 0; --k)
Py_XDECREF(recycle[k]);
result = 0;
Error:
if (recycle != recycle_on_stack)
PyMem_FREE(recycle);
Py_XDECREF(v_as_SF);
return result;
#undef b
}

当执行 a[low:high] = v 时,调用 SetSlice 方法,具体执行逻辑见注释内容。

从源码中,能分析出一些执行结果,伪代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/* a[ilow:ihigh] = v if v != NULL.
del a[ilow:ihigh] if v == NULL. */

>>> a
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> del a[1:1]
>>> a
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

v=NULL
n=0
ilow = 1
ihigh = 1
norig = ihigh - ilow = 0
d = n - norig = 0
不满足 if(d) 中的任何条件,结果就是不做任何操作

>>> a
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> a[1:1] = [1]
>>> a
[0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9]

v!=NULL
n=1
ilow = 1
ihigh = 1
norig = ihigh - ilow = 0
d = n - norig = 1
满足 if(d>0),右移腾出位置,并插入a[ilow] = 1

listremove

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// listobject.c.2193
static PyObject * listremove(PyListObject *self, PyObject *v)
{
Py_ssize_t i;

for (i = 0; i < Py_SIZE(self); i++) {
int cmp = PyObject_RichCompareBool(self->ob_item[i], v, Py_EQ);
if (cmp > 0) {
if (list_ass_slice(self, i, i+1,
(PyObject *)NULL) == 0)
Py_RETURN_NONE;
return NULL;
}
else if (cmp < 0)
return NULL;
}
PyErr_SetString(PyExc_ValueError, "list.remove(x): x not in list");
return NULL;
}

由上,可以看出,删除元素是通过 list_ass_slice 实现:

  • 仅删除第一个遇见的元素,删除一个不在其中的元素,会报错
  • 删除是通过遍历实现,时间复杂度 O(n)

总结

PyListObject 也利用了缓冲池机制,只缓冲对象,不缓冲数据。

下面是粗略的时间复杂度( list_resize 不涉及到 移动时):

  • SetItem - O(1)
  • GetItem - O(1)
  • Append - O(1)
  • Insert - O(n),对象移动
  • GetSlice - O(high - low),创建新对象,拷贝指针
  • SetSlice - O(n),对象移动,拷贝指针
  • Remove - O(n),遍历,对象移动