Oct 15, 2024

C for Python

Building a dictionary type

Introduction

I once wondered: how does Python implement its dict type? I went on to look at the CPython source code (Objects/dictobject.c) and thought it would be cool to create my own dictionary type in C for Python. The builtin dict type supports various functionalities that would be copious to implement in a new data type, but it's not that hard to build a simple type that supports key insertion and lookup with the C API. So let's do it.

The resources I followed are the C API documentation, the tutorial about extending Python with C, and the CPython source code. The dictionary type I built is the simplest possible — my intention was to interact with the C API, not to create an optimized data type. Our dictionary will use a linked list and a sequential search algorithm to implement insertion and lookup. You can find the source code in the GitHub repository.

The underlying linked list

Each node of the linked list will correspond to a dictionary entry, i.e. a key and a value. Because our dictionary will accept any hashable Python object as key, the nodes will also hold the hash of the key object. Lastly, because the nodes are linked, each node will hold a pointer to the next one.

Here's the header file node.h for the node type:

#pragma once

#define PY_SSIZE_T_CLEAN
#include <Python.h>

// Node of linked-list used for sequential-search dictionary implementation.
// This struct is not a Python object.
typedef struct _SSDictNode {
  PyObject *key;
  Py_hash_t key_hash; // Any hashable Python object can be a key
  PyObject *value;    // Any Python object can be a value
  struct _SSDictNode *next;
} SSDictNode;

void SSDictNode_dealloc(SSDictNode *self);
SSDictNode *SSDictNode_new();
void SSDictNode_init(SSDictNode *self, PyObject *key, PyObject *value);

SSDictNode is not actually a Python object — we don't need to expose it to Python; it merely serves as an underlying data type. We've also defined signatures for functions to create, initialize, and deallocate SSDictNode structs.

And the corresponding implementation file node.c:

#include "node.h"

void SSDictNode_dealloc(SSDictNode *self) {
  Py_XDECREF(self->key);
  Py_XDECREF(self->value);
}

SSDictNode *SSDictNode_new() {
  SSDictNode *self = (SSDictNode *)PyMem_RawMalloc(sizeof(SSDictNode));
  if (self == NULL) {
    PyErr_SetString(PyExc_MemoryError,
                    "failed to allocate memory for SSDict node");
    return NULL;
  }

  self->key = NULL;
  self->key_hash = -1; // a real hash could never be -1
  self->value = NULL;

  return self;
}

void SSDictNode_init(SSDictNode *self, PyObject *key, PyObject *value) {
  Py_hash_t hash = PyObject_Hash(key);

  if (hash == -1) {
    PyObject_HashNotImplemented(key);
    return;
  }

  self->key = Py_NewRef(key);
  self->key_hash = hash;
  self->value = Py_NewRef(value);
}

SSDictNode_dealloc does not actually free the node because we want to ensure we have the pointer to the next node before doing so. The actual freeing is performed outside the scope of this function, where nodes are freed sequentially.

The dictionary type

Let's move on to the actual dictionary Python object. Here's the header file ssdict.h:

#pragma once

#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include "node.h"

typedef unsigned int dict_size;

typedef struct {
  PyObject_HEAD SSDictNode *head;
  dict_size size;
} SSDict;

PyObject *SSDict__new__(PyTypeObject *type, PyObject *args, PyObject *kwds);
int       SSDict__init__(PyObject *self);
PyObject *SSDict__getitem__(PyObject *self, PyObject *key);
void      SSDict_dealloc(SSDict *self);
int       SSDict_assign_value(PyObject *_self, PyObject *key, PyObject *value);
int       SSDict_len(PyObject *_self);
int       SSDict__contains__(PyObject *_self, PyObject *key);

static void SSDict_replace_node_value(SSDictNode *node, PyObject *value);
static void SSDict_add_new_node(SSDict *self, PyObject *key, PyObject *value);
static int  SSDict_delete_item(SSDict *self, PyObject *key);
static int  SSDict_delete_head_node(SSDict *self);
static int  SSDict_delete_body_node(SSDict *self, Py_hash_t hash);
static int  SSDict_set_item(SSDict *self, PyObject *key, PyObject *value);

SSDict is our dictionary Python object. Its only C attributes are the head of the linked list and the size. Every non-static function will get mapped to a Python method.

The implementation file ssdict.c:

#include "ssdict.h"

PyObject *SSDict__new__(PyTypeObject *type, PyObject *args, PyObject *kwds) {
  SSDict *self = (SSDict *)type->tp_alloc(type, 0);
  return (PyObject *)self;
}

int SSDict__init__(PyObject *self) {
  SSDict *dict = (SSDict *)self;
  dict->head = NULL;
  dict->size = 0;
  return 0;
}

PyObject *SSDict__getitem__(PyObject *self, PyObject *key) {
  Py_hash_t hash = PyObject_Hash(key);
  if (hash == -1) {
    PyObject_HashNotImplemented(key);
    return NULL;
  }

  SSDictNode *node = ((SSDict *)self)->head;
  while (node != NULL) {
    if (node->key_hash == hash) return node->value;
    node = node->next;
  }

  PyErr_SetObject(PyExc_KeyError, key);
  return NULL;
}

void SSDict_dealloc(SSDict *self) {
  SSDictNode *node = self->head;
  while (node != NULL) {
    SSDictNode_dealloc(node);
    SSDictNode *tmp = node;
    node = node->next;
    PyMem_RawFree(tmp);
  }
  Py_TYPE(self)->tp_free((PyObject *)self);
}

int SSDict_assign_value(PyObject *_self, PyObject *key, PyObject *value) {
  SSDict *self = (SSDict *)_self;
  return value == NULL
    ? SSDict_delete_item(self, key)
    : SSDict_set_item(self, key, value);
}

int SSDict_len(PyObject *_self) {
  return ((SSDict *)_self)->size;
}

int SSDict__contains__(PyObject *_self, PyObject *key) {
  Py_hash_t hash = PyObject_Hash(key);
  if (hash == -1) {
    PyObject_HashNotImplemented(key);
    return -1;
  }

  SSDictNode *node = ((SSDict *)_self)->head;
  while (node != NULL) {
    if (node->key_hash == hash) return 1;
    node = node->next;
  }
  return 0;
}

static void SSDict_replace_node_value(SSDictNode *node, PyObject *value) {
  // Use a temporary to avoid decrementing the old value before incrementing
  // the new one — the destructor triggered by decref could do bad things.
  PyObject *tmp = node->value;
  node->value = Py_NewRef(value);
  Py_XDECREF(tmp);
}

static void SSDict_add_new_node(SSDict *self, PyObject *key, PyObject *value) {
  SSDictNode *new = SSDictNode_new();
  SSDictNode_init(new, key, value);
  new->next = self->head;
  self->head = new;
  self->size++;
}

static int SSDict_delete_item(SSDict *self, PyObject *key) {
  Py_hash_t hash = PyObject_Hash(key);
  if (hash == -1) {
    PyObject_HashNotImplemented(key);
    return -1;
  }

  if (self->head->key_hash == hash)
    return SSDict_delete_head_node(self);

  if (SSDict_delete_body_node(self, hash) < 0) {
    PyErr_SetObject(PyExc_KeyError, key);
    return -1;
  }
  return 0;
}

static int SSDict_delete_head_node(SSDict *self) {
  Py_XDECREF(self->head->key);
  Py_XDECREF(self->head->value);

  if (self->head->next == NULL) {
    free(self->head);
    self->head = NULL;
    self->size = 0;
  } else {
    SSDictNode *tmp = self->head;
    self->head = self->head->next;
    free(tmp);
    self->size--;
  }
  return 0;
}

static int SSDict_delete_body_node(SSDict *self, Py_hash_t hash) {
  SSDictNode *before = self->head;
  SSDictNode *current = before->next;

  while (current != NULL) {
    if (current->key_hash == hash) {
      before->next = current->next;
      Py_XDECREF(current->key);
      Py_XDECREF(current->value);
      free(current);
      self->size--;
      return 0;
    }
    before = current;
    current = current->next;
  }
  return -1;
}

static int SSDict_set_item(SSDict *self, PyObject *key, PyObject *value) {
  Py_hash_t hash = PyObject_Hash(key);
  if (hash == -1) {
    PyObject_HashNotImplemented(key);
    return -1;
  }

  SSDictNode *node = self->head;
  while (node != NULL) {
    if (node->key_hash == hash) {
      SSDict_replace_node_value(node, value);
      return 0;
    }
    node = node->next;
  }

  SSDict_add_new_node(self, key, value);
  return 0;
}

Having defined the dictionary type and its methods, we're left with wiring up the module with the C API. For our type to act as a mapping in Python we need a PyMappingMethods struct. For key in dict to work we also need to set sq_contains on PySequenceMethods.

definitions.c:

#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include "ssdict.h"

static PyMethodDef SSDict_methods[] = {
    {"__getitem__", SSDict__getitem__, METH_O | METH_COEXIST,
     "Get an item from the dictionary"},
    {NULL}
};

static PyMappingMethods SSDict_mappingmethods = {
    .mp_length       = (lenfunc)SSDict_len,
    .mp_subscript    = SSDict__getitem__,
    .mp_ass_subscript = SSDict_assign_value,
};

static PySequenceMethods SSDict_sequencemethods = {
    .sq_contains = SSDict__contains__,
};

static PyTypeObject SSDictType = {
    .ob_base    = PyVarObject_HEAD_INIT(NULL, 0),
    .tp_name    = "dictionaries.SSDict",
    .tp_doc     = PyDoc_STR("Dictionary using a sequential-search linked list."),
    .tp_basicsize = sizeof(SSDict),
    .tp_flags   = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,
    .tp_new     = SSDict__new__,
    .tp_init    = (initproc)SSDict__init__,
    .tp_dealloc = (destructor)SSDict_dealloc,
    .tp_methods = SSDict_methods,
    .tp_as_mapping  = &SSDict_mappingmethods,
    .tp_as_sequence = &SSDict_sequencemethods,
    .tp_hash    = PyObject_HashNotImplemented,
};

static PyModuleDef dictionariesmodule = {
    .m_base = PyModuleDef_HEAD_INIT,
    .m_name = "dictionaries",
    .m_doc  = "Dictionary types.",
    .m_size = -1,
};

PyMODINIT_FUNC PyInit_dictionaries(void) {
  if (PyType_Ready(&SSDictType) < 0) return NULL;

  PyObject *module = PyModule_Create(&dictionariesmodule);
  if (module == NULL) return NULL;

  Py_INCREF(&SSDictType);
  if (PyModule_AddObject(module, "SSDict", (PyObject *)&SSDictType) < 0) {
    Py_DECREF(&SSDictType);
    Py_DECREF(module);
    return NULL;
  }

  return module;
}

Building the module

We'll build the module using pip with pyproject.toml and setup.py for configuration. All C files live under src/ssdict/.

pyproject.toml

[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[project]
name = "dictionaries"
version = "0.1.0"

setup.py

from setuptools import Extension, setup

setup(
    ext_modules=[
        Extension(
            "dictionaries",
            sources=[
                "src/ssdict/ssdict.c",
                "src/ssdict/node.c",
                "src/ssdict/definitions.c",
            ],
            include_dirs=["src/ssdict"],
        )
    ]
)

Install the extension from the directory containing pyproject.toml:

pip install -e .

This creates a shared object (.so) file and an .egg-info file in the src directory.

Running an example

After installing, append the path to the .so file to your PYTHONPATH, then try it out:

example.py

from dictionaries import SSDict

d = SSDict()

d[object()] = "123"
d[3] = object()
d[3] = 5
d["mykey"] = "myvalue"

print("\nSet items successfully!\n")
print(f"d[3] = {d[3]}\n")
print(f"dictionary length = {len(d)}\n")
print(f'"mykey" in d = {"mykey" in d}\n')

Output:

Set items successfully!

d[3] = 5

dictionary length = 3

"mykey" in d = True

Testing the dictionary type

Running a quick example is fine, but a proper test suite is better. The example below uses my own unittest-extensions library.

test.py

import unittest
from unittest_extensions import TestCase, args
from dictionaries import SSDict

class TestSSDict(TestCase):
    def dict(self) -> SSDict:
        return self._dict

    def setUp(self):
        self._dict = SSDict()

class TestSSDictSubscript(TestSSDict):
    def subject(self, key):
        return self.dict()[key]

    @args(2)
    def test_unset_item(self):
        self.assertResultRaises(KeyError)

    @args(1)
    def test_set_item(self):
        self.dict().__setitem__(1, 11)
        self.assertResult(11)

    @args({1})
    def test_unset_unhashable_item(self):
        self.assertResultRaises(TypeError)

class TestSSDictAssSubscript(TestSSDict):
    def subject(self, key, value):
        self.dict()[key] = value
        return self.dict()

    @args({1}, 1)
    def test_unhashable_key(self):
        self.assertResultRaises(TypeError)

    @args(12, [1, 2, 3])
    def test_new_key(self):
        self.assertEqual(self.result().__getitem__(12), [1, 2, 3])

    @args("a", "b")
    def test_existent_key(self):
        self.dict().__setitem__("a", "aa")
        self.assertEqual(self.result().__getitem__("a"), "b")

class TestSSDictLength(TestSSDict):
    def subject(self):
        return len(self.dict())

    def add_item(self, key=1, value=1):
        self.dict().__setitem__(key, value)

    def test_empty_dict(self):    self.assertResult(0)
    def test_one_item(self):      self.add_item(); self.assertResult(1)
    def test_add_same_item(self): self.add_item(); self.add_item(); self.assertResult(1)
    def test_add_different_items(self): self.add_item(); self.add_item(2); self.assertResult(2)

class TestSSDictContains(TestSSDict):
    def subject(self, key):
        return key in self.dict()

    @args("a")
    def test_key_not_in_dict(self):
        self.assertResultFalse()

    @args("a")
    def test_key_in_dict(self):
        self.dict().__setitem__("a", 1)
        self.assertResultTrue()

    @args({1})
    def test_unhashable_item(self):
        self.assertResultRaises(TypeError)

if __name__ == "__main__":
    unittest.main()

Conclusion

Implementing a simple data type with C for Python is not that hard. However, making a production-ready type requires consideration of things like thread-safety and reference counting edge cases.

If you want to make something run faster by implementing it in C, think carefully before going down that path. The builtin data structures of Python are heavily optimized — you'd have to put in a lot of effort to build something that actually beats them.