There's a persistent rumor on the spambayes mailing list that dumbdbm

tim-one · tim-one · commit 7dfd5701b20c · 2003-07-12T20:11:25.000Z
databases are associated with corruption problems, so I studied this code
carefully and ran some brutal stress tests.  I didn't find any bugs,
although it's unclear whether this code *intends* that __setitem__ can
leave the directory file out of synch with the data file (so
if a dumbdbm isn't properly closed, and the value of an existing key
was ever replaced, corruption is almost certain, where "corruption"
means the directory file is out of synch with the data file).

Added many comments and generally modernized the code.  Examples of the
latter:  we have better ways of reading a whole file line-by-line now;
eval() now tolerates a trailing newline; the %r format code can be used
to avoid explicit repr/backtick calls; and the code often broke tuples
into their components when it was clearer and faster to just leave them
as tuples.
diff --git a/Lib/dumbdbm.py b/Lib/dumbdbm.py
@@ -33,11 +33,26 @@
 
 class _Database(UserDict.DictMixin):
 
-    def __init__(self, file, mode):
+    def __init__(self, filebasename, mode):
         self._mode = mode
-        self._dirfile = file + _os.extsep + 'dir'
-        self._datfile = file + _os.extsep + 'dat'
-        self._bakfile = file + _os.extsep + 'bak'
+
+        # The directory file is a text file.  Each line looks like
+        #    "%r, (%d, %d)\n" % (key, pos, siz)
+        # where key is the string key, pos is the offset into the dat
+        # file of the associated value's first byte, and siz is the number
+        # of bytes in the associated value.
+        self._dirfile = filebasename + _os.extsep + 'dir'
+
+        # The data file is a binary file pointed into by the directory
+        # file, and holds the values associated with keys.  Each value
+        # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
+        # binary 8-bit string value.
+        self._datfile = filebasename + _os.extsep + 'dat'
+        self._bakfile = filebasename + _os.extsep + 'bak'
+
+        # The index is an in-memory dict, mirroring the directory file.
+        self._index = None  # maps keys to (pos, siz) pairs
+
         # Mod by Jack: create data file if needed
         try:
             f = _open(self._datfile, 'r')
@@ -46,28 +61,36 @@ def __init__(self, file, mode):
         f.close()
         self._update()
 
+    # Read directory file into the in-memory index dict.
     def _update(self):
         self._index = {}
         try:
             f = _open(self._dirfile)
         except IOError:
             pass
         else:
-            while 1:
-                line = f.readline().rstrip()
-                if not line: break
-                key, (pos, siz) = eval(line)
-                self._index[key] = (pos, siz)
+            for line in f:
+                key, pos_and_siz_pair = eval(line)
+                self._index[key] = pos_and_siz_pair
             f.close()
 
+    # Write the index dict to the directory file.  The original directory
+    # file (if any) is renamed with a .bak extension first.  If a .bak
+    # file currently exists, it's deleted.
     def _commit(self):
-        try: _os.unlink(self._bakfile)
-        except _os.error: pass
-        try: _os.rename(self._dirfile, self._bakfile)
-        except _os.error: pass
+        try:
+            _os.unlink(self._bakfile)
+        except _os.error:
+            pass
+
+        try:
+            _os.rename(self._dirfile, self._bakfile)
+        except _os.error:
+            pass
+
         f = _open(self._dirfile, 'w', self._mode)
         for key, (pos, siz) in self._index.items():
-            f.write("%s, (%s, %s)\n" % (`key`, `pos`, `siz`))
+            f.write("%r, (%d, %d)\n" % (key, pos, siz))
         f.close()
 
     def __getitem__(self, key):
@@ -78,53 +101,71 @@ def __getitem__(self, key):
         f.close()
         return dat
 
+    # Append val to the data file, starting at a _BLOCKSIZE-aligned
+    # offset.  The data file is first padded with NUL bytes (if needed)
+    # to get to an aligned offset.  Return pair
+    #     (starting offset of val, len(val))
     def _addval(self, val):
         f = _open(self._datfile, 'rb+')
         f.seek(0, 2)
         pos = int(f.tell())
-## Does not work under MW compiler
-##              pos = ((pos + _BLOCKSIZE - 1) / _BLOCKSIZE) * _BLOCKSIZE
-##              f.seek(pos)
         npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
         f.write('\0'*(npos-pos))
         pos = npos
-
         f.write(val)
         f.close()
         return (pos, len(val))
 
+    # Write val to the data file, starting at offset pos.  The caller
+    # is responsible for ensuring that there's enough room starting at
+    # pos to hold val, without overwriting some other value.  Return
+    # pair (pos, len(val)).
     def _setval(self, pos, val):
         f = _open(self._datfile, 'rb+')
         f.seek(pos)
         f.write(val)
         f.close()
         return (pos, len(val))
 
-    def _addkey(self, key, (pos, siz)):
-        self._index[key] = (pos, siz)
+    # key is a new key whose associated value starts in the data file
+    # at offset pos and with length size.  Add an index record to
+    # the in-memory index dict, and append one to the index file.
+    def _addkey(self, key, pos_and_siz_pair):
+        self._index[key] = pos_and_siz_pair
         f = _open(self._dirfile, 'a', self._mode)
-        f.write("%s, (%s, %s)\n" % (`key`, `pos`, `siz`))
+        f.write("%r, %r\n" % (key, pos_and_siz_pair))
         f.close()
 
     def __setitem__(self, key, val):
         if not type(key) == type('') == type(val):
             raise TypeError, "keys and values must be strings"
-        if not key in self._index:
-            (pos, siz) = self._addval(val)
-            self._addkey(key, (pos, siz))
+        if key not in self._index:
+            self._addkey(key, self._addval(val))
         else:
+            # See whether the new value is small enough to fit in the
+            # (padded) space currently occupied by the old value.
             pos, siz = self._index[key]
             oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
             newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
             if newblocks <= oldblocks:
-                pos, siz = self._setval(pos, val)
-                self._index[key] = pos, siz
+                self._index[key] = self._setval(pos, val)
             else:
-                pos, siz = self._addval(val)
-                self._index[key] = pos, siz
+                # The new value doesn't fit in the (padded) space used
+                # by the old value.  The blocks used by the old value are
+                # forever lost.
+                self._index[key] = self._addval(val)
+
+            # Note that _index may be out of synch with the directory
+            # file now:  _setval() and _addval() don't update the directory
+            # file.
 
     def __delitem__(self, key):
+        # The blocks used by the associated value are lost.
         del self._index[key]
+        # XXX It's unclear why we do a _commit() here (the code always
+        # XXX has, so I'm not changing it).  _setitem__ doesn't try to
+        # XXX keep the directory file in synch.  Why should we?  Or
+        # XXX why shouldn't __setitem__?
         self._commit()
 
     def keys(self):