Skip to content

Commit fb8cc40

Browse files
committed
greatly simplify syncing of indexed-but-moved dirs
Personal benchmarks show a very significant performance gain when calling get_path(). This is because we are no longer scanning all the contents of every indexed-but-moved directory in _sync_all(), but rather intelligently appending the new paths of the directories to check for dirtiness to a deque used in the body of _sync_all(). Also fixes a few bugs (sorry, couldn't split this commit up): - Indexing symlinks always indexes the real path that symlink refers to - Fixes bug with the way new paths were calculated in recursive moves
1 parent 8da5648 commit fb8cc40

File tree

3 files changed

+245
-86
lines changed

3 files changed

+245
-86
lines changed

jupyter_server/pytest_plugin.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -475,12 +475,13 @@ def delete_fid_db(fid_db_path):
475475

476476

477477
@pytest.fixture
478-
def fid_manager(fid_db_path, tmp_path):
478+
def fid_manager(fid_db_path, jp_root_dir):
479479
"""Fixture returning a test-configured instance of `FileIdManager`."""
480-
fid_manager = FileIdManager(db_path=fid_db_path, root_dir=str(tmp_path))
480+
fid_manager = FileIdManager(db_path=fid_db_path, root_dir=str(jp_root_dir))
481481
# disable journal so no temp journal file is created under `tmp_path`.
482482
# reduces test flakiness since sometimes journal file has same ino and
483483
# crtime as a deleted file, so FID manager detects it wrongly as a move
484+
# also makes tests run faster :)
484485
fid_manager.con.execute("PRAGMA journal_mode = OFF")
485486
return fid_manager
486487

jupyter_server/services/contents/fileidmanager.py

Lines changed: 120 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import os
22
import sqlite3
33
import stat
4-
from typing import Optional
4+
from collections import deque
5+
from typing import Deque, Optional
56

67
from jupyter_core.paths import jupyter_data_dir
78
from traitlets import TraitError, Unicode, validate
@@ -13,6 +14,7 @@ class StatStruct:
1314
crtime: Optional[int]
1415
mtime: int
1516
is_dir: bool
17+
is_symlink: bool
1618

1719

1820
class FileIdManager(LoggingConfigurable):
@@ -100,83 +102,103 @@ def _sync_all(self):
100102
-----
101103
A dirty directory is a directory that is either:
102104
- unindexed
103-
- indexed but moved
104105
- indexed but with different `mtime`
105106
106107
Dirty directories contain possibly indexed but moved files as children.
107108
Hence we need to call _sync_file() on their contents via _sync_dir().
108-
Indexed directories that are dirty solely because of mtime difference
109-
are included in the below SELECT query. Unindexed or indexed-but-moved
110-
dirty directories are not included in the query, and hence must be
111-
handled in _sync_dir().
109+
Indexed directories with mtime difference are handled in this method
110+
body. Unindexed dirty directories are handled immediately when
111+
encountered in _sync_dir().
112+
113+
sync_deque is an additional deque of directories that should be checked
114+
for dirtiness, and is appended to whenever _sync_file() encounters an
115+
indexed directory that was moved out-of-band. This is necessary because
116+
the SELECT query is not guaranteed to include the new paths following
117+
the move.
112118
"""
119+
sync_deque: Deque = deque()
113120
cursor = self.con.execute("SELECT id, path, mtime FROM Files WHERE is_dir = 1")
114-
for dir in cursor:
121+
dir = cursor.fetchone()
122+
while dir:
115123
id, path, old_mtime = dir
116124
stat_info = self._stat(path)
117125

118-
# ignore directories that no longer exist
119-
if stat_info is None:
120-
continue
126+
# ignores directories that no longer exist
127+
if stat_info is not None:
128+
new_mtime = stat_info.mtime
129+
dir_dirty = new_mtime != old_mtime
130+
if dir_dirty:
131+
self._sync_dir(path, sync_deque)
132+
self._update(id, stat_info)
121133

122-
new_mtime = stat_info.mtime
123-
dir_dirty = new_mtime != old_mtime
124-
if dir_dirty:
125-
self._sync_dir(path)
126-
self._update(id, stat_info)
134+
dir = sync_deque.popleft() if sync_deque else cursor.fetchone()
127135

128-
def _sync_dir(self, dir_path):
136+
def _sync_dir(self, dir_path, sync_deque):
129137
"""
130138
Syncs the contents of a directory. If a child directory is dirty because
131-
it is either unindexed or indexed-but-moved, then the contents of that
132-
child directory are synced. See _sync_all() for more on dirty
133-
directories.
139+
it is unindexed, then the contents of that child directory are synced.
140+
See _sync_all() for more on dirty directories.
141+
142+
Parameters
143+
----------
144+
dir_path : string
145+
Path of the directory to sync contents of.
146+
147+
sync_deque: deque
148+
Deque of directory records to be checked for dirtiness in
149+
_sync_all().
134150
"""
135151
with os.scandir(dir_path) as scan_iter:
136152
for entry in scan_iter:
137153
stat_info = self._stat(entry.path)
138-
id, is_dirty_dir = self._sync_file(entry.path, stat_info)
154+
id = self._sync_file(entry.path, stat_info, sync_deque)
139155

140-
# if entry is unindexed directory, create new record
156+
# if entry is unindexed directory, create new record and sync
157+
# contents recursively.
141158
if stat_info.is_dir and id is None:
142159
self._create(entry.path, stat_info)
143-
144-
# sync dirty dir contents if it is either unindexed or
145-
# indexed-but-moved
146-
if is_dirty_dir:
147-
self._sync_dir(entry.path)
160+
self._sync_dir(entry.path, sync_deque)
148161

149162
scan_iter.close()
150163

151-
def _sync_file(self, path, stat_info):
164+
def _sync_file(self, path, stat_info, sync_deque=None):
152165
"""
153166
Syncs the file at `path` with the Files table by detecting whether the
154167
file was previously indexed but moved. Updates the record with the new
155168
path. This ensures that the file at path is associated with the correct
156169
file ID. This method does nothing if the file at `path` was not
157170
previously indexed.
158171
172+
Parameters
173+
----------
174+
path : string
175+
Path of the file to sync.
176+
177+
stat_info : StatStruct
178+
Stat info of the file to sync.
179+
180+
sync_deque : deque, optional
181+
Deque of directory records to be checked for dirtiness in
182+
_sync_all(). If specified, this method appends to sync_deque any
183+
moved indexed directory and all of its children recursively.
184+
159185
Returns
160186
-------
161-
Returns a two-tuple containing the elements
162-
163187
id : int, optional
164-
ID of the file if it was previously indexed. None otherwise.
165-
166-
dir_dirty: bool
167-
Whether the file is a dirty directory and should be traversed by
168-
_sync_dir(). Not necessarily true even if the `mtime` differs, since
169-
directories which are dirty only because of `mtime` difference are
170-
included in the query run by _sync_all(). See _sync_all() for more
171-
on dirty directories.
188+
ID of the file if it is a real file (not a symlink) and it was
189+
previously indexed. None otherwise.
172190
"""
191+
# if file is symlink, do nothing
192+
if stat_info.is_symlink:
193+
return None
194+
173195
src = self.con.execute(
174196
"SELECT id, path, crtime, mtime FROM Files WHERE ino = ?", (stat_info.ino,)
175197
).fetchone()
176198

177199
# if no record with matching ino, then return None
178200
if not src:
179-
return None, stat_info.is_dir
201+
return None
180202

181203
id, old_path, src_crtime, src_mtime = src
182204
src_timestamp = src_crtime if src_crtime is not None else src_mtime
@@ -185,13 +207,20 @@ def _sync_file(self, path, stat_info):
185207
# if record has identical ino and crtime/mtime to an existing record,
186208
# update it with new destination path and stat info, returning its id
187209
if src_timestamp == dst_timestamp:
188-
self._update(id, stat_info, path)
189-
return id, stat_info.is_dir and old_path != path
210+
self._update_with_path(id, stat_info, path)
211+
212+
# update paths of indexed children under moved directories
213+
if stat_info.is_dir and old_path != path:
214+
self._move_recursive(old_path, path, sync_deque)
215+
if sync_deque is not None:
216+
sync_deque.appendleft((id, path, src_mtime))
217+
218+
return id
190219

191220
# otherwise delete the existing record with identical `ino`, since inos
192221
# must be unique. then return None
193222
self.con.execute("DELETE FROM Files WHERE id = ?", (id,))
194-
return None, stat_info.is_dir
223+
return None
195224

196225
def _normalize_path(self, path):
197226
"""Normalizes a given file path."""
@@ -217,14 +246,15 @@ def _parse_raw_stat(self, raw_stat):
217246
)
218247
stat_info.mtime = raw_stat.st_mtime_ns
219248
stat_info.is_dir = stat.S_ISDIR(raw_stat.st_mode)
249+
stat_info.is_symlink = stat.S_ISLNK(raw_stat.st_mode)
220250

221251
return stat_info
222252

223253
def _stat(self, path):
224254
"""Returns stat info on a path in a StatStruct object.Returns None if
225255
file does not exist at path."""
226256
try:
227-
raw_stat = os.stat(path)
257+
raw_stat = os.lstat(path)
228258
except OSError:
229259
return None
230260

@@ -240,23 +270,24 @@ def _create(self, path, stat_info):
240270

241271
return cursor.lastrowid
242272

243-
def _update(self, id, stat_info, path=None):
244-
"""Updates a record given its file ID, stat info, and possibly path."""
245-
if path is not None:
246-
self.con.execute(
247-
"UPDATE Files SET path = ?, ino = ?, crtime = ?, mtime = ? WHERE id = ?",
248-
(path, stat_info.ino, stat_info.crtime, stat_info.mtime, id),
249-
)
250-
else:
251-
self.con.execute(
252-
# updating `ino` and `crtime` is a conscious design decision because
253-
# this method is called by `move()`. these values are only preserved
254-
# by fs moves done via the `rename()` syscall, like `mv`. we don't
255-
# care how the contents manager moves a file; it could be deleting
256-
# and creating a new file (which will change the stat info).
257-
"UPDATE Files SET ino = ?, crtime = ?, mtime = ? WHERE id = ?",
258-
(stat_info.ino, stat_info.crtime, stat_info.mtime, id),
259-
)
273+
def _update_with_path(self, id, stat_info, path):
274+
"""Same as _update(), but accepts and updates path."""
275+
self.con.execute(
276+
"UPDATE Files SET path = ?, ino = ?, crtime = ?, mtime = ? WHERE id = ?",
277+
(path, stat_info.ino, stat_info.crtime, stat_info.mtime, id),
278+
)
279+
280+
def _update(self, id, stat_info):
281+
"""Updates a record given its file ID and stat info."""
282+
# updating `ino` and `crtime` is a conscious design decision because
283+
# this method is called by `move()`. these values are only preserved by
284+
# fs moves done via the `rename()` syscall, like `mv`. we don't care how
285+
# the contents manager moves a file; it could be deleting and creating a
286+
# new file (which will change the stat info).
287+
self.con.execute(
288+
"UPDATE Files SET ino = ?, crtime = ?, mtime = ? WHERE id = ?",
289+
(stat_info.ino, stat_info.crtime, stat_info.mtime, id),
290+
)
260291

261292
def index(self, path, stat_info=None, commit=True):
262293
"""Returns the file ID for the file at `path`, creating a new file ID if
@@ -266,8 +297,12 @@ def index(self, path, stat_info=None, commit=True):
266297
if not stat_info:
267298
return None
268299

300+
# if file is symlink, then index the path it refers to instead
301+
if stat_info.is_symlink:
302+
return self.index(os.path.realpath(path))
303+
269304
# sync file at path and return file ID if it exists
270-
id, _ = self._sync_file(path, stat_info)
305+
id = self._sync_file(path, stat_info)
271306
if id is not None:
272307
return id
273308

@@ -287,7 +322,7 @@ def get_id(self, path):
287322
return None
288323

289324
# then sync file at path and retrieve id, if any
290-
id, _ = self._sync_file(path, stat_info)
325+
id = self._sync_file(path, stat_info)
291326
self.con.commit()
292327
return id
293328

@@ -306,6 +341,27 @@ def get_path(self, id):
306341

307342
return path
308343

344+
def _move_recursive(self, old_path, new_path, sync_deque=None):
345+
"""Updates path of all indexed files prefixed with `old_path` and
346+
replaces the prefix with `new_path`. If `sync_deque` is specified, moved
347+
indexed directories are appended to `sync_deque`."""
348+
old_path_glob = os.path.join(old_path, "*")
349+
records = self.con.execute(
350+
"SELECT id, path, mtime FROM Files WHERE path GLOB ?", (old_path_glob,)
351+
).fetchall()
352+
353+
for record in records:
354+
id, old_recpath, mtime = record
355+
new_recpath = os.path.join(new_path, os.path.relpath(old_recpath, start=old_path))
356+
stat_info = self._stat(new_recpath)
357+
if not stat_info:
358+
continue
359+
360+
self._update_with_path(id, stat_info, new_recpath)
361+
362+
if sync_deque is not None and stat_info.is_dir:
363+
sync_deque.append((id, new_recpath, mtime))
364+
309365
def move(self, old_path, new_path, recursive=False):
310366
"""Handles file moves by updating the file path of the associated file
311367
ID. Returns the file ID. Returns None if file does not exist at new_path."""
@@ -320,22 +376,7 @@ def move(self, old_path, new_path, recursive=False):
320376
self.log.debug(f"FileIdManager : Moving file from ${old_path} to ${new_path}")
321377

322378
if recursive:
323-
old_path_glob = os.path.join(old_path, "*")
324-
records = self.con.execute(
325-
"SELECT id, path FROM Files WHERE path GLOB ?", (old_path_glob,)
326-
).fetchall()
327-
for record in records:
328-
if not record:
329-
continue
330-
id, old_recpath = record
331-
new_recpath = os.path.join(new_path, os.path.basename(old_recpath))
332-
rec_stat_info = self._stat(new_recpath)
333-
if not rec_stat_info:
334-
continue
335-
self.con.execute(
336-
"UPDATE Files SET path = ?, mtime = ? WHERE id = ?",
337-
(new_recpath, rec_stat_info.mtime, id),
338-
)
379+
self._move_recursive(old_path, new_path)
339380

340381
# attempt to fetch ID associated with old path
341382
# we avoid using get_id() here since that will always return None as file no longer exists at old path
@@ -349,7 +390,7 @@ def move(self, old_path, new_path, recursive=False):
349390
# update existing record with new path and stat info
350391
# TODO: make sure is_dir for existing record matches that of file at new_path
351392
id = row[0]
352-
self._update(id, stat_info, new_path)
393+
self._update_with_path(id, stat_info, new_path)
353394
self.con.commit()
354395
return id
355396

@@ -387,8 +428,8 @@ def copy(self, from_path, to_path, recursive=False):
387428
),
388429
)
389430

431+
self.index(from_path, commit=False)
390432
# transaction committed in index()
391-
self.index(from_path)
392433
return self.index(to_path)
393434

394435
def delete(self, path, recursive=False):

0 commit comments

Comments
 (0)