Skip to content

Commit 7dfd570

Browse files
committed
There's a persistent rumor on the spambayes mailing list that dumbdbm
databases are associated with corruption problems, so I studied this code carefully and ran some brutal stress tests. I didn't find any bugs, although it's unclear whether this code *intends* that __setitem__ can leave the directory file out of synch with the data file (so if a dumbdbm isn't properly closed, and the value of an existing key was ever replaced, corruption is almost certain, where "corruption" means the directory file is out of synch with the data file). Added many comments and generally modernized the code. Examples of the latter: we have better ways of reading a whole file line-by-line now; eval() now tolerates a trailing newline; the %r format code can be used to avoid explicit repr/backtick calls; and the code often broke tuples into their components when it was clearer and faster to just leave them as tuples.
1 parent 541342f commit 7dfd570

File tree

1 file changed

+69
-28
lines changed

1 file changed

+69
-28
lines changed

Lib/dumbdbm.py

Lines changed: 69 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,26 @@
3333

3434
class _Database(UserDict.DictMixin):
3535

36-
def __init__(self, file, mode):
36+
def __init__(self, filebasename, mode):
3737
self._mode = mode
38-
self._dirfile = file + _os.extsep + 'dir'
39-
self._datfile = file + _os.extsep + 'dat'
40-
self._bakfile = file + _os.extsep + 'bak'
38+
39+
# The directory file is a text file. Each line looks like
40+
# "%r, (%d, %d)\n" % (key, pos, siz)
41+
# where key is the string key, pos is the offset into the dat
42+
# file of the associated value's first byte, and siz is the number
43+
# of bytes in the associated value.
44+
self._dirfile = filebasename + _os.extsep + 'dir'
45+
46+
# The data file is a binary file pointed into by the directory
47+
# file, and holds the values associated with keys. Each value
48+
# begins at a _BLOCKSIZE-aligned byte offset, and is a raw
49+
# binary 8-bit string value.
50+
self._datfile = filebasename + _os.extsep + 'dat'
51+
self._bakfile = filebasename + _os.extsep + 'bak'
52+
53+
# The index is an in-memory dict, mirroring the directory file.
54+
self._index = None # maps keys to (pos, siz) pairs
55+
4156
# Mod by Jack: create data file if needed
4257
try:
4358
f = _open(self._datfile, 'r')
@@ -46,28 +61,36 @@ def __init__(self, file, mode):
4661
f.close()
4762
self._update()
4863

64+
# Read directory file into the in-memory index dict.
4965
def _update(self):
5066
self._index = {}
5167
try:
5268
f = _open(self._dirfile)
5369
except IOError:
5470
pass
5571
else:
56-
while 1:
57-
line = f.readline().rstrip()
58-
if not line: break
59-
key, (pos, siz) = eval(line)
60-
self._index[key] = (pos, siz)
72+
for line in f:
73+
key, pos_and_siz_pair = eval(line)
74+
self._index[key] = pos_and_siz_pair
6175
f.close()
6276

77+
# Write the index dict to the directory file. The original directory
78+
# file (if any) is renamed with a .bak extension first. If a .bak
79+
# file currently exists, it's deleted.
6380
def _commit(self):
64-
try: _os.unlink(self._bakfile)
65-
except _os.error: pass
66-
try: _os.rename(self._dirfile, self._bakfile)
67-
except _os.error: pass
81+
try:
82+
_os.unlink(self._bakfile)
83+
except _os.error:
84+
pass
85+
86+
try:
87+
_os.rename(self._dirfile, self._bakfile)
88+
except _os.error:
89+
pass
90+
6891
f = _open(self._dirfile, 'w', self._mode)
6992
for key, (pos, siz) in self._index.items():
70-
f.write("%s, (%s, %s)\n" % (`key`, `pos`, `siz`))
93+
f.write("%r, (%d, %d)\n" % (key, pos, siz))
7194
f.close()
7295

7396
def __getitem__(self, key):
@@ -78,53 +101,71 @@ def __getitem__(self, key):
78101
f.close()
79102
return dat
80103

104+
# Append val to the data file, starting at a _BLOCKSIZE-aligned
105+
# offset. The data file is first padded with NUL bytes (if needed)
106+
# to get to an aligned offset. Return pair
107+
# (starting offset of val, len(val))
81108
def _addval(self, val):
82109
f = _open(self._datfile, 'rb+')
83110
f.seek(0, 2)
84111
pos = int(f.tell())
85-
## Does not work under MW compiler
86-
## pos = ((pos + _BLOCKSIZE - 1) / _BLOCKSIZE) * _BLOCKSIZE
87-
## f.seek(pos)
88112
npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
89113
f.write('\0'*(npos-pos))
90114
pos = npos
91-
92115
f.write(val)
93116
f.close()
94117
return (pos, len(val))
95118

119+
# Write val to the data file, starting at offset pos. The caller
120+
# is responsible for ensuring that there's enough room starting at
121+
# pos to hold val, without overwriting some other value. Return
122+
# pair (pos, len(val)).
96123
def _setval(self, pos, val):
97124
f = _open(self._datfile, 'rb+')
98125
f.seek(pos)
99126
f.write(val)
100127
f.close()
101128
return (pos, len(val))
102129

103-
def _addkey(self, key, (pos, siz)):
104-
self._index[key] = (pos, siz)
130+
# key is a new key whose associated value starts in the data file
131+
# at offset pos and with length siz.  Add an index record to
132+
# the in-memory index dict, and append one to the directory file.
133+
def _addkey(self, key, pos_and_siz_pair):
134+
self._index[key] = pos_and_siz_pair
105135
f = _open(self._dirfile, 'a', self._mode)
106-
f.write("%s, (%s, %s)\n" % (`key`, `pos`, `siz`))
136+
f.write("%r, %r\n" % (key, pos_and_siz_pair))
107137
f.close()
108138

109139
def __setitem__(self, key, val):
110140
if not type(key) == type('') == type(val):
111141
raise TypeError, "keys and values must be strings"
112-
if not key in self._index:
113-
(pos, siz) = self._addval(val)
114-
self._addkey(key, (pos, siz))
142+
if key not in self._index:
143+
self._addkey(key, self._addval(val))
115144
else:
145+
# See whether the new value is small enough to fit in the
146+
# (padded) space currently occupied by the old value.
116147
pos, siz = self._index[key]
117148
oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
118149
newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
119150
if newblocks <= oldblocks:
120-
pos, siz = self._setval(pos, val)
121-
self._index[key] = pos, siz
151+
self._index[key] = self._setval(pos, val)
122152
else:
123-
pos, siz = self._addval(val)
124-
self._index[key] = pos, siz
153+
# The new value doesn't fit in the (padded) space used
154+
# by the old value. The blocks used by the old value are
155+
# forever lost.
156+
self._index[key] = self._addval(val)
157+
158+
# Note that _index may be out of synch with the directory
159+
# file now: _setval() and _addval() don't update the directory
160+
# file.
125161

126162
def __delitem__(self, key):
163+
# The blocks used by the associated value are lost.
127164
del self._index[key]
165+
# XXX It's unclear why we do a _commit() here (the code always
166+
# XXX has, so I'm not changing it).  __setitem__ doesn't try to
167+
# XXX keep the directory file in synch. Why should we? Or
168+
# XXX why shouldn't __setitem__?
128169
self._commit()
129170

130171
def keys(self):

0 commit comments

Comments
 (0)