Skip to content

bpo-34749: Improved performance of binascii.a2b_base64(). #9444

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
:func:`binascii.a2b_base64` is now up to 2 times faster. Patch by Sergey
Fedoseev.
142 changes: 56 additions & 86 deletions Modules/binascii.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,24 @@ static const unsigned char table_a2b_hqx[256] = {
static const unsigned char table_b2a_hqx[] =
"!\"#$%&'()*+,-012345689@ABCDEFGHIJKLMNPQRSTUVXYZ[`abcdefhijklmpqr";

static const char table_a2b_base64[] = {
static const unsigned char table_a2b_base64[] = {
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1, 0,-1,-1, /* Note PAD->0 */
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,

-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
};

#define BASE64_PAD '='
Expand Down Expand Up @@ -413,32 +422,6 @@ binascii_b2a_uu_impl(PyObject *module, Py_buffer *data, int backtick)
return _PyBytesWriter_Finish(&writer, ascii_data);
}


static int
binascii_find_valid(const unsigned char *s, Py_ssize_t slen, int num)
{
/* Finds & returns the (num+1)th
** valid character for base64, or -1 if none.
*/

int ret = -1;
unsigned char c, b64val;

while ((slen > 0) && (ret == -1)) {
c = *s;
b64val = table_a2b_base64[c & 0x7f];
if ( ((c <= 0x7f) && (b64val != (unsigned char)-1)) ) {
if (num == 0)
ret = *s;
num--;
}

s++;
slen--;
}
return ret;
}

/*[clinic input]
binascii.a2b_base64

Expand All @@ -452,88 +435,74 @@ static PyObject *
binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
/*[clinic end generated code: output=0628223f19fd3f9b input=5872acf6e1cac243]*/
{
const unsigned char *ascii_data;
unsigned char *bin_data;
unsigned char *bin_data_start;
int leftbits = 0;
unsigned char this_ch;
unsigned int leftchar = 0;
Py_ssize_t ascii_len, bin_len;
int quad_pos = 0;
_PyBytesWriter writer;
binascii_state *state;

ascii_data = data->buf;
ascii_len = data->len;
assert(data->len >= 0);

assert(ascii_len >= 0);

if (ascii_len > PY_SSIZE_T_MAX - 3)
return PyErr_NoMemory();

bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */

_PyBytesWriter_Init(&writer);
const unsigned char *ascii_data = data->buf;
size_t ascii_len = data->len;

/* Allocate the buffer */
bin_data = _PyBytesWriter_Alloc(&writer, bin_len);
Py_ssize_t bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */
_PyBytesWriter writer;
_PyBytesWriter_Init(&writer);
unsigned char *bin_data = _PyBytesWriter_Alloc(&writer, bin_len);
if (bin_data == NULL)
return NULL;
bin_data_start = bin_data;

for( ; ascii_len > 0; ascii_len--, ascii_data++) {
this_ch = *ascii_data;
unsigned char *bin_data_start = bin_data;

if (this_ch > 0x7f ||
this_ch == '\r' || this_ch == '\n' || this_ch == ' ')
continue;
int quad_pos = 0;
unsigned char leftchar = 0;
int pads = 0;
for (size_t i = 0; i < ascii_len; i++) {
unsigned char this_ch = ascii_data[i];

/* Check for pad sequences and ignore
** the invalid ones.
*/
if (this_ch == BASE64_PAD) {
if ( (quad_pos < 2) ||
((quad_pos == 2) &&
(binascii_find_valid(ascii_data, ascii_len, 1)
!= BASE64_PAD)) )
{
continue;
}
else {
if (quad_pos >= 2 && quad_pos + ++pads >= 4) {
/* A pad sequence means no more input.
** We've already interpreted the data
** from the quad at this point.
*/
leftbits = 0;
break;
goto done;
}
continue;
}

this_ch = table_a2b_base64[*ascii_data];
if ( this_ch == (unsigned char) -1 )
this_ch = table_a2b_base64[this_ch];
if (this_ch >= 64) {
continue;
}
pads = 0;

/*
** Shift it in on the low end, and see if there's
** a byte ready for output.
*/
quad_pos = (quad_pos + 1) & 0x03;
leftchar = (leftchar << 6) | (this_ch);
leftbits += 6;

if ( leftbits >= 8 ) {
leftbits -= 8;
*bin_data++ = (leftchar >> leftbits) & 0xff;
leftchar &= ((1 << leftbits) - 1);
switch (quad_pos) {
case 0:
quad_pos = 1;
leftchar = this_ch;
break;
case 1:
quad_pos = 2;
*bin_data++ = (leftchar << 2) | (this_ch >> 4);
leftchar = this_ch & 0x0f;
break;
case 2:
quad_pos = 3;
*bin_data++ = (leftchar << 4) | (this_ch >> 2);
leftchar = this_ch & 0x03;
break;
case 3:
quad_pos = 0;
*bin_data++ = (leftchar << 6) | (this_ch);
leftchar = 0;
break;
}
}

if (leftbits != 0) {
state = PyModule_GetState(module);
if (quad_pos != 0) {
binascii_state *state = PyModule_GetState(module);
if (state == NULL) {
return NULL;
}
if (leftbits == 6) {
/* error already set, from PyModule_GetState */
} else if (quad_pos == 1) {
/*
** There is exactly one extra valid, non-padding, base64 character.
** This is an invalid length, as there is no possible input that
Expand All @@ -551,6 +520,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
return NULL;
}

done:
return _PyBytesWriter_Finish(&writer, bin_data);
}

Expand Down