Extending Python with Rust

Optimization

  • Programming productivity
    • “YAGNI” © Yehuda Katz, Let's Talk About Rust @ GoGaRuCo 2014
    • “This stresses power over usability. Right?! You need a PhD here!” Simon Peyton Jones, Adventure with Types in Haskell, 2014
  • Higher level optimization have greater impact
    • Architecture/Design Level
    • High-level, Algorithms and data structures
    • Low-Level

Runtime level optimization

25% better performance than CPython

"maybe like 10% runtime", hot attr

by Kevin Modzelewski @ pyston talk November 24, 2015

Source code level optimization

LST = list(map(''.join, product('abc', repeat=17))) 
def foo():
    return map(str.upper, LST)
def bar():
    res = []
    for i in LST:
        res.append(i.upper())
    return res
def baz():
    return [i.upper() for i in LST]

Tools

  • numba
  • C extension
  • cython

Numba


@jit(int32(int32, int32), nopython=True, nogil=True)
def add_two(a, b):
    acc = 0
    i = 0
    while i < 1000:
        acc += a + b
        i += 1
    return acc

12 SETUP_LOOP              40 (to 55)
15 LOAD_FAST                3 (i)
18 LOAD_CONST               2 (1000)
21 COMPARE_OP               0 (<)
24 POP_JUMP_IF_FALSE       54

27 LOAD_FAST                2 (acc)
30 LOAD_FAST                0 (a)
33 LOAD_FAST                1 (b)
36 BINARY_ADD
37 INPLACE_ADD
38 STORE_FAST               2 (acc)

41 LOAD_FAST                3 (i)
44 LOAD_CONST               3 (1)
47 INPLACE_ADD
48 STORE_FAST               3 (i)
51 JUMP_ABSOLUTE           15
54 POP_BLOCK

___main__.add_two$1.int32.int32:
addl	%r8d, %ecx
imull	$1000, %ecx, %eax
movl	%eax, (%rdi)
xorl	%eax, %eax
retq

add_two.inspect_asm().values()[0].decode('string_escape')

_wrapper.__main__.add_two$1.int32.int32:
	movq	%rdi, %r14
	movabsq	$_.const.add_two, %r10
	movabsq	$_PyArg_UnpackTuple, %r11
    ...
	movabsq	$_PyNumber_Long, %r15
	callq	*%r15
	movq	%rax, %rbx
	xorl	%r14d, %r14d
	testq	%rbx, %rbx
	je	LBB1_8
	movabsq	$_PyLong_AsLongLong, %rax
    ...

+ ~77 Python C API instructions

Rich Wareham, "Creating a toy language with the Python, LLVM and the IPython web notebook" https://www.youtube.com/watch?v=G78cTmgeUxI

Minimal module with C


// sample.c
void initsample(void)
{
    Py_InitModule("sample", NULL);
}

PyMethodDef


static PyObject * add_two(PyObject * self, PyObject * args);

static PyMethodDef SampleMethods[] = {
    {"add_two", add_two, METH_VARARGS, ""},
    {NULL, NULL, 0, NULL}
};

void initsample(void) {
    Py_InitModule("sample", SampleMethods);
}

func


PyObject * add_two(PyObject * self, PyObject * args) {
    int a, b, acc = 0;
    if (!PyArg_ParseTuple(args, "ii", &a, &b)) {
        PyErr_SetNone(PyExc_ValueError);
        return NULL;
    }
    for (int i = 0; i < 1000; i++)
        acc += a + b;
    return Py_BuildValue("i", acc);
}

Module import (dyn)


import sample

IMPORT_NAME  0 (sample)
STORE_FAST   0 (sample)

// ceval.c
...
w = GETITEM(names, oparg);
v = PyDict_GetItemString(f->f_builtins, "__import__");
...
w = PyTuple_Pack(4, w,
        f->f_globals,
        f->f_locals == NULL ? Py_None : f->f_locals,
        v);
...
x = PyEval_CallObject(v, w);
...
SET_TOP(x); if (x != NULL) DISPATCH();
...
  1. builtin___import__
  2. PyImport_ImportModuleLevel
  3. import_module_level
  4. load_next
  5. import_submodule
  6. find_module
  7. load_module

dl_funcptr _PyImport_GetDynLoadFunc(const char *fqname,
    const char *shortname,
    const char *pathname, FILE *fp)
{
    char funcname[258];
    PyOS_snprintf(funcname, sizeof(funcname),
        "init%.200s", shortname);
    return dl_loadmod(Py_GetProgramName(),
        pathname, funcname);
}

cython


def add_two(a, b):
    i = acc = 0
    while i < 1000:
        acc += a + b
    return acc

$ cat sample.c | wc -l 1906


__pyx_t_2 = PyNumber_Add(__pyx_v_a, __pyx_v_b);
if (unlikely(!__pyx_t_2)) {
    __pyx_filename = __pyx_f[0];
    __pyx_lineno = 14; __pyx_clineno = __LINE__;
    goto __pyx_L1_error;
}
__Pyx_GOTREF(__pyx_t_2);
__pyx_t_3 = PyNumber_InPlaceAdd(__pyx_v_acc, __pyx_t_2);
if (unlikely(!__pyx_t_3)) {
    __pyx_filename = __pyx_f[0]; __pyx_lineno = 14;
    __pyx_clineno = __LINE__; goto __pyx_L1_error;
}
  • extract C code
int cadd_two(int a, int b) {
    int32_t acc = 0;
    for (int i = 0; i < 1000; i++)
        acc += a + b;
    return acc;
}
  • wrap
cdef extern from "sample_func.h":
    int cadd_two(int, int)
def add_two(a, b):
    return cadd_two(a, b)
cythonize("sample.pyx", sources=[ 'sample_func.c' ])
  • 2081 lines
  __pyx_t_1 = __Pyx_PyInt_As_int32_t(__pyx_v_a); if (unlikely((__pyx_t_1 == (int32_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_
__pyx_t_2 = __Pyx_PyInt_As_int32_t(__pyx_v_b); if (unlikely((__pyx_t_2 == (int32_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_
__pyx_t_3 = __Pyx_PyInt_From_int32_t(cadd_two(__pyx_t_1, __pyx_t_2)); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 8; _

Rust

#[no_mangle]
pub extern fn initsample() {
    unsafe {
        Py_InitModule4_64(&SAMPLE[0] as *const _,
            &METHODS[0] as *const _,
            0 as *const _,
            0,
            PYTHON_API_VERSION);
    };
}
type PyCFunction = unsafe extern "C"
    fn (slf: *mut isize, args: *mut isize) -> *mut isize;

#[repr(C)]
struct PyMethodDef {
    pub ml_name: *const i8,
    pub ml_meth: Option<PyCFunction>,
    pub ml_flags: i32,
    pub ml_doc: *const i8,
}
unsafe impl Sync for PyMethodDef { }
lazy_static! {
    static ref METHODS: Vec = { vec![
        PyMethodDef {
            ml_name: &ADD_TWO[0] as *const _,
            ml_meth: Some(add_two),
        },
        ...
    ] };
}
#[link(name="python2.7")]
extern {
    fn Py_InitModule4_64(name: *const i8,
        methods: *const PyMethodDef,
        doc: *const i8, s: isize, apiver: usize) -> *mut isize;
    fn PyArg_ParseTuple(arg1: *mut isize,
        arg2: *const i8, ...) -> isize;
    fn Py_BuildValue(arg1: *const i8, ...) -> *mut isize;
}
#[allow(unused_variables)]
unsafe extern "C" fn add_two(slf: *mut isize,
        args: *mut isize) -> *mut isize {
    let mut a: i32 = 0;
    let mut b: i32 = 0;
    if PyArg_ParseTuple(args,
            &II_ARGS[0] as *const _,
            &a as *const i32, &b as *const i32) == 0 {
        return 0 as *mut _;
    }
    let mut acc: i32 = 0;
    for i in 0..1000 { acc += a + b; }
    Py_BuildValue(&I_ARGS[0] as *const _, acc)
}

    let acc: i32 = (0..).take(1000)
                .map(|_| a + b)
                .fold(0, |acc, x| acc + x);

otool -tV ./sample.so

__ZN7add_two20h391818698d43ab0ffcaE:
...
callq   0x7a002      ## symbol stub for: _PyArg_ParseTuple
testq   %rax, %rax
je      0x14e3
movl    -0x8(%rbp), %eax
addl    -0x4(%rbp), %eax
imull   $0x3e8, %eax, %esi      ## imm = 0x3E8
leaq    _ref5540(%rip), %rdi    ## literal pool for: "h"
...

cons

  • CPython API 2.7
  • no_std
  • std::ffi::CString
  • jemalloc (rust) + malloc (malloc)

rust-cpython

https://github.com/dgrunwald/rust-cpython

#![feature(slice_patterns)] #[macro_use] extern crate cpython;
use cpython::{PyObject, PyResult, Python, PyTuple, PyDict};
py_module_initializer!(sample, |py, m| {
    try!(m.add(py, "add_two", py_fn!(add_two))); Ok(())
});

fn add_two(p: Python, args: &PyTuple, kw: Option<&PyDict>) -> PyResult<PyObject> {
    match args.as_slice() {
        [a, b] => {
            let acc: i32 = 0;
            for i in 0..1000 { acc += a.value(p) + b.value(p) }
            Ok(acc.to_py_object())
        },
        _ => Ok(py.None())
    }
}
https://github.com/zolkko/pug-rust.git