pandas-dev · jreback · Oct 29, 2013 · Oct 28, 2013 · Oct 29, 2013 · jtratner
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -2745,6 +2745,12 @@ Notes & Caveats
      need to serialize these operations in a single thread in a single
      process. You will corrupt your data otherwise. See the issue
      (:`2397`) for more information.
+   - If serializing all write operations via a single thread in a single
+     process is not an option, an alternative is to use an external
+     distributed lock manager to ensure that there is only one writer at a
+     time and that all readers close the file before a write and re-open it
+     afterwards. In this case you should call ``store.flush(fsync=True)`` before
+     releasing any write lock. See the issue (:issue:`5364`) for more information.
    - ``PyTables`` only supports fixed-width string columns in
      ``tables``. The sizes of a string based indexing column
      (e.g. *columns* or *minor_axis*) are determined as the maximum size

diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -275,6 +275,8 @@ API Changes
     - store `datetime.date` objects as ordinals rather then timetuples to avoid
       timezone issues (:issue:`2852`), thanks @tavistmorph and @numpand
     - ``numexpr`` 2.2.2 fixes incompatiblity in PyTables 2.4 (:issue:`4908`)
+    - ``flush`` now accepts an ``fsync`` parameter, which defaults to ``False``
+      (:issue:`5364`)
   - ``JSON``
 
     - added ``date_unit`` parameter to specify resolution of timestamps.

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -10,6 +10,7 @@
 import copy
 import itertools
 import warnings
+import os
 
 import numpy as np
 from pandas import (Series, TimeSeries, DataFrame, Panel, Panel4D, Index,
@@ -525,12 +526,30 @@ def is_open(self):
             return False
         return bool(self._handle.isopen)
 
-    def flush(self):
+    def flush(self, fsync=False):
         """
-        Force all buffered modifications to be written to disk
+        Force all buffered modifications to be written to disk.
+
+        By default this method requests PyTables to flush, and PyTables in turn
+        requests the HDF5 library to flush any changes to the operating system.
+        There is no guarantee the operating system will actually commit writes
+        to disk.
+
+        To request the operating system to write the file to disk, pass
+        ``fsync=True``. The method will then block until the operating system
+        reports completion. Be aware that there may be other caching layers
+        (e.g. disk controllers or the disks themselves) which further delay
+        durability.
+
+        Parameters
+        ----------
+        fsync : boolean, invoke fsync for the file handle, default False
+
         """
         if self._handle is not None:
             self._handle.flush()
+            if fsync:
+                os.fsync(self._handle.fileno())
 
     def get(self, key):
         """
@@ -4072,5 +4091,4 @@ def timeit(key, df, fn=None, remove=True, **kwargs):
     store.close()
 
     if remove:
-        import os
         os.remove(fn)
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -466,6 +466,12 @@ def test_flush(self):
             store['a'] = tm.makeTimeSeries()
             store.flush()
 
+    def test_flush_fsync(self):
+
+        with ensure_clean(self.path) as store:
+            store['a'] = tm.makeTimeSeries()
+            store.flush(fsync=True)
+
     def test_get(self):
 
         with ensure_clean(self.path) as store: