Skip to content

Commit e2dcd38

Browse files
committed
bpo-31530: stop crashes when iterating over a file on multiple threads, round 2
Multiple threads iterating over a file can corrupt the file's internal readahead buffer resulting in crashes. To fix this, cache buffer state thread-locally for the duration of a file_iternext call and only update the file's internal state after reading completes. No attempt is made to define or provide "reasonable" semantics for iterating over a file on multiple threads. (Non-crashing) races are still present. Duplicated, corrupt, and missing data will happen. This was originally fixed by 6401e56, which raised an exception from seek() and next() when concurrent operations were detected. Alas, this simpler solution breaks legitimate use cases such as capturing the standard streams when multiple threads are logging.
1 parent 29f0e33 commit e2dcd38

File tree

2 files changed

+77
-38
lines changed

2 files changed

+77
-38
lines changed

Lib/test/test_file2k.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,16 @@ def io_func():
652652
self.f.writelines('')
653653
self._test_close_open_io(io_func)
654654

655+
def test_iteration_torture(self):
656+
with open(self.filename, "wb") as fp:
657+
for i in xrange(2**20):
658+
fp.write(b"0"*50 + b"\n")
659+
with open(self.filename, "rb") as f:
660+
def it():
661+
for l in f:
662+
pass
663+
self._run_workers(it, 10)
664+
655665

656666
@unittest.skipUnless(os.name == 'posix', 'test requires a posix system.')
657667
class TestFileSignalEINTR(unittest.TestCase):

Objects/fileobject.c

Lines changed: 67 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -609,7 +609,12 @@ err_iterbuffered(void)
609609
return NULL;
610610
}
611611

612-
static void drop_readahead(PyFileObject *);
612+
static void
613+
drop_file_readahead(PyFileObject *f)
614+
{
615+
PyMem_FREE(f->f_buf);
616+
f->f_buf = NULL;
617+
}
613618

614619
/* Methods */
615620

@@ -632,7 +637,7 @@ file_dealloc(PyFileObject *f)
632637
Py_XDECREF(f->f_mode);
633638
Py_XDECREF(f->f_encoding);
634639
Py_XDECREF(f->f_errors);
635-
drop_readahead(f);
640+
drop_file_readahead(f);
636641
Py_TYPE(f)->tp_free((PyObject *)f);
637642
}
638643

@@ -767,7 +772,7 @@ file_seek(PyFileObject *f, PyObject *args)
767772

768773
if (f->f_fp == NULL)
769774
return err_closed();
770-
drop_readahead(f);
775+
drop_file_readahead(f);
771776
whence = 0;
772777
if (!PyArg_ParseTuple(args, "O|i:seek", &offobj, &whence))
773778
return NULL;
@@ -2236,48 +2241,51 @@ static PyGetSetDef file_getsetlist[] = {
22362241
{0},
22372242
};
22382243

2244+
typedef struct {
2245+
char *buf, *bufptr, *bufend;
2246+
} readaheadbuffer;
2247+
22392248
static void
2240-
drop_readahead(PyFileObject *f)
2249+
drop_readaheadbuffer(readaheadbuffer *rab)
22412250
{
2242-
if (f->f_buf != NULL) {
2243-
PyMem_Free(f->f_buf);
2244-
f->f_buf = NULL;
2251+
if (rab->buf != NULL) {
2252+
PyMem_FREE(rab->buf);
2253+
rab->buf = NULL;
22452254
}
22462255
}
22472256

22482257
/* Make sure that file has a readahead buffer with at least one byte
22492258
(unless at EOF) and no more than bufsize. Returns negative value on
22502259
error, will set MemoryError if bufsize bytes cannot be allocated. */
22512260
static int
2252-
readahead(PyFileObject *f, Py_ssize_t bufsize)
2261+
readahead(PyFileObject *f, readaheadbuffer *rab, Py_ssize_t bufsize)
22532262
{
22542263
Py_ssize_t chunksize;
22552264

2256-
if (f->f_buf != NULL) {
2257-
if( (f->f_bufend - f->f_bufptr) >= 1)
2265+
if (rab->buf != NULL) {
2266+
if ((rab->bufend - rab->bufptr) >= 1)
22582267
return 0;
22592268
else
2260-
drop_readahead(f);
2269+
drop_readaheadbuffer(rab);
22612270
}
2262-
if ((f->f_buf = (char *)PyMem_Malloc(bufsize)) == NULL) {
2271+
if ((rab->buf = PyMem_MALLOC(bufsize)) == NULL) {
22632272
PyErr_NoMemory();
22642273
return -1;
22652274
}
22662275
FILE_BEGIN_ALLOW_THREADS(f)
22672276
errno = 0;
2268-
chunksize = Py_UniversalNewlineFread(
2269-
f->f_buf, bufsize, f->f_fp, (PyObject *)f);
2277+
chunksize = Py_UniversalNewlineFread(rab->buf, bufsize, f->f_fp, (PyObject *)f);
22702278
FILE_END_ALLOW_THREADS(f)
22712279
if (chunksize == 0) {
22722280
if (ferror(f->f_fp)) {
22732281
PyErr_SetFromErrno(PyExc_IOError);
22742282
clearerr(f->f_fp);
2275-
drop_readahead(f);
2283+
drop_readaheadbuffer(rab);
22762284
return -1;
22772285
}
22782286
}
2279-
f->f_bufptr = f->f_buf;
2280-
f->f_bufend = f->f_buf + chunksize;
2287+
rab->bufptr = rab->buf;
2288+
rab->bufend = rab->buf + chunksize;
22812289
return 0;
22822290
}
22832291

@@ -2287,45 +2295,43 @@ readahead(PyFileObject *f, Py_ssize_t bufsize)
22872295
logarithmic buffer growth to about 50 even when reading a 1gb line. */
22882296

22892297
static PyStringObject *
2290-
readahead_get_line_skip(PyFileObject *f, Py_ssize_t skip, Py_ssize_t bufsize)
2298+
readahead_get_line_skip(PyFileObject *f, readaheadbuffer *rab, Py_ssize_t skip, Py_ssize_t bufsize)
22912299
{
22922300
PyStringObject* s;
22932301
char *bufptr;
22942302
char *buf;
22952303
Py_ssize_t len;
22962304

2297-
if (f->f_buf == NULL)
2298-
if (readahead(f, bufsize) < 0)
2305+
if (rab->buf == NULL)
2306+
if (readahead(f, rab, bufsize) < 0)
22992307
return NULL;
23002308

2301-
len = f->f_bufend - f->f_bufptr;
2309+
len = rab->bufend - rab->bufptr;
23022310
if (len == 0)
2303-
return (PyStringObject *)
2304-
PyString_FromStringAndSize(NULL, skip);
2305-
bufptr = (char *)memchr(f->f_bufptr, '\n', len);
2311+
return (PyStringObject *)PyString_FromStringAndSize(NULL, skip);
2312+
bufptr = (char *)memchr(rab->bufptr, '\n', len);
23062313
if (bufptr != NULL) {
23072314
bufptr++; /* Count the '\n' */
2308-
len = bufptr - f->f_bufptr;
2309-
s = (PyStringObject *)
2310-
PyString_FromStringAndSize(NULL, skip + len);
2315+
len = bufptr - rab->bufptr;
2316+
s = (PyStringObject *)PyString_FromStringAndSize(NULL, skip + len);
23112317
if (s == NULL)
23122318
return NULL;
2313-
memcpy(PyString_AS_STRING(s) + skip, f->f_bufptr, len);
2314-
f->f_bufptr = bufptr;
2315-
if (bufptr == f->f_bufend)
2316-
drop_readahead(f);
2319+
memcpy(PyString_AS_STRING(s) + skip, rab->bufptr, len);
2320+
rab->bufptr = bufptr;
2321+
if (bufptr == rab->bufend)
2322+
drop_readaheadbuffer(rab);
23172323
} else {
2318-
bufptr = f->f_bufptr;
2319-
buf = f->f_buf;
2320-
f->f_buf = NULL; /* Force new readahead buffer */
2324+
bufptr = rab->bufptr;
2325+
buf = rab->buf;
2326+
rab->buf = NULL; /* Force new readahead buffer */
23212327
assert(len <= PY_SSIZE_T_MAX - skip);
2322-
s = readahead_get_line_skip(f, skip + len, bufsize + (bufsize>>2));
2328+
s = readahead_get_line_skip(f, rab, skip + len, bufsize + (bufsize>>2));
23232329
if (s == NULL) {
2324-
PyMem_Free(buf);
2330+
PyMem_FREE(buf);
23252331
return NULL;
23262332
}
23272333
memcpy(PyString_AS_STRING(s) + skip, bufptr, len);
2328-
PyMem_Free(buf);
2334+
PyMem_FREE(buf);
23292335
}
23302336
return s;
23312337
}
@@ -2343,7 +2349,30 @@ file_iternext(PyFileObject *f)
23432349
if (!f->readable)
23442350
return err_mode("reading");
23452351

2346-
l = readahead_get_line_skip(f, 0, READAHEAD_BUFSIZE);
2352+
{
2353+
/*
2354+
Multiple threads can enter this method while the GIL is released
2355+
during file read and wreak havoc on the file object's readahead
2356+
buffer. To avoid dealing with cross-thread coordination issues, we
2357+
cache the file buffer state locally and only set it back on the file
2358+
object when we're done.
2359+
*/
2360+
readaheadbuffer rab = {f->f_buf, f->f_bufptr, f->f_bufend};
2361+
f->f_buf = NULL;
2362+
l = readahead_get_line_skip(f, &rab, 0, READAHEAD_BUFSIZE);
2363+
/*
2364+
Make sure the file's internal read buffer is cleared out. This will
2365+
only do anything if some other thread interleaved with us during
2366+
readahead. We want to drop any changeling buffer, so we don't leak
2367+
memory. We may lose data, but that's what you get for reading the same
2368+
file object in multiple threads.
2369+
*/
2370+
drop_file_readahead(f);
2371+
f->f_buf = rab.buf;
2372+
f->f_bufptr = rab.bufptr;
2373+
f->f_bufend = rab.bufend;
2374+
}
2375+
23472376
if (l == NULL || PyString_GET_SIZE(l) == 0) {
23482377
Py_XDECREF(l);
23492378
return NULL;

0 commit comments

Comments
 (0)