1
1
import os
2
2
import sqlite3
3
3
import stat
4
- from typing import Optional
4
+ from collections import deque
5
+ from typing import Deque , Optional
5
6
6
7
from jupyter_core .paths import jupyter_data_dir
7
8
from traitlets import TraitError , Unicode , validate
@@ -13,6 +14,7 @@ class StatStruct:
13
14
crtime : Optional [int ]
14
15
mtime : int
15
16
is_dir : bool
17
+ is_symlink : bool
16
18
17
19
18
20
class FileIdManager (LoggingConfigurable ):
@@ -100,83 +102,103 @@ def _sync_all(self):
100
102
-----
101
103
A dirty directory is a directory that is either:
102
104
- unindexed
103
- - indexed but moved
104
105
- indexed but with different `mtime`
105
106
106
107
Dirty directories contain possibly indexed but moved files as children.
107
108
Hence we need to call _sync_file() on their contents via _sync_dir().
108
- Indexed directories that are dirty solely because of mtime difference
109
- are included in the below SELECT query. Unindexed or indexed-but-moved
110
- dirty directories are not included in the query, and hence must be
111
- handled in _sync_dir().
109
+ Indexed directories with mtime difference are handled in this method
110
+ body. Unindexed dirty directories are handled immediately when
111
+ encountered in _sync_dir().
112
+
113
+ sync_deque is an additional deque of directories that should be checked
114
+ for dirtiness, and is appended to whenever _sync_file() encounters an
115
+ indexed directory that was moved out-of-band. This is necessary because
116
+ the SELECT query is not guaranteed to include the new paths following
117
+ the move.
112
118
"""
119
+ sync_deque : Deque = deque ()
113
120
cursor = self .con .execute ("SELECT id, path, mtime FROM Files WHERE is_dir = 1" )
114
- for dir in cursor :
121
+ dir = cursor .fetchone ()
122
+ while dir :
115
123
id , path , old_mtime = dir
116
124
stat_info = self ._stat (path )
117
125
118
- # ignore directories that no longer exist
119
- if stat_info is None :
120
- continue
126
+ # ignores directories that no longer exist
127
+ if stat_info is not None :
128
+ new_mtime = stat_info .mtime
129
+ dir_dirty = new_mtime != old_mtime
130
+ if dir_dirty :
131
+ self ._sync_dir (path , sync_deque )
132
+ self ._update (id , stat_info )
121
133
122
- new_mtime = stat_info .mtime
123
- dir_dirty = new_mtime != old_mtime
124
- if dir_dirty :
125
- self ._sync_dir (path )
126
- self ._update (id , stat_info )
134
+ dir = sync_deque .popleft () if sync_deque else cursor .fetchone ()
127
135
128
- def _sync_dir (self , dir_path ):
136
+ def _sync_dir (self , dir_path , sync_deque ):
129
137
"""
130
138
Syncs the contents of a directory. If a child directory is dirty because
131
- it is either unindexed or indexed-but-moved, then the contents of that
132
- child directory are synced. See _sync_all() for more on dirty
133
- directories.
139
+ it is unindexed, then the contents of that child directory are synced.
140
+ See _sync_all() for more on dirty directories.
141
+
142
+ Parameters
143
+ ----------
144
+ dir_path : string
145
+ Path of the directory to sync contents of.
146
+
147
+ sync_deque: deque
148
+ Deque of directory records to be checked for dirtiness in
149
+ _sync_all().
134
150
"""
135
151
with os .scandir (dir_path ) as scan_iter :
136
152
for entry in scan_iter :
137
153
stat_info = self ._stat (entry .path )
138
- id , is_dirty_dir = self ._sync_file (entry .path , stat_info )
154
+ id = self ._sync_file (entry .path , stat_info , sync_deque )
139
155
140
- # if entry is unindexed directory, create new record
156
+ # if entry is unindexed directory, create new record and sync
157
+ # contents recursively.
141
158
if stat_info .is_dir and id is None :
142
159
self ._create (entry .path , stat_info )
143
-
144
- # sync dirty dir contents if it is either unindexed or
145
- # indexed-but-moved
146
- if is_dirty_dir :
147
- self ._sync_dir (entry .path )
160
+ self ._sync_dir (entry .path , sync_deque )
148
161
149
162
scan_iter .close ()
150
163
151
- def _sync_file (self , path , stat_info ):
164
+ def _sync_file (self , path , stat_info , sync_deque = None ):
152
165
"""
153
166
Syncs the file at `path` with the Files table by detecting whether the
154
167
file was previously indexed but moved. Updates the record with the new
155
168
path. This ensures that the file at path is associated with the correct
156
169
file ID. This method does nothing if the file at `path` was not
157
170
previously indexed.
158
171
172
+ Parameters
173
+ ----------
174
+ path : string
175
+ Path of the file to sync.
176
+
177
+ stat_info : StatStruct
178
+ Stat info of the file to sync.
179
+
180
+ sync_deque : deque, optional
181
+ Deque of directory records to be checked for dirtiness in
182
+ _sync_all(). If specified, this method appends to sync_deque any
183
+ moved indexed directory and all of its children recursively.
184
+
159
185
Returns
160
186
-------
161
- Returns a two-tuple containing the elements
162
-
163
187
id : int, optional
164
- ID of the file if it was previously indexed. None otherwise.
165
-
166
- dir_dirty: bool
167
- Whether the file is a dirty directory and should be traversed by
168
- _sync_dir(). Not necessarily true even if the `mtime` differs, since
169
- directories which are dirty only because of `mtime` difference are
170
- included in the query run by _sync_all(). See _sync_all() for more
171
- on dirty directories.
188
+ ID of the file if it is a real file (not a symlink) and it was
189
+ previously indexed. None otherwise.
172
190
"""
191
+ # if file is symlink, do nothing
192
+ if stat_info .is_symlink :
193
+ return None
194
+
173
195
src = self .con .execute (
174
196
"SELECT id, path, crtime, mtime FROM Files WHERE ino = ?" , (stat_info .ino ,)
175
197
).fetchone ()
176
198
177
199
# if no record with matching ino, then return None
178
200
if not src :
179
- return None , stat_info . is_dir
201
+ return None
180
202
181
203
id , old_path , src_crtime , src_mtime = src
182
204
src_timestamp = src_crtime if src_crtime is not None else src_mtime
@@ -185,13 +207,20 @@ def _sync_file(self, path, stat_info):
185
207
# if record has identical ino and crtime/mtime to an existing record,
186
208
# update it with new destination path and stat info, returning its id
187
209
if src_timestamp == dst_timestamp :
188
- self ._update (id , stat_info , path )
189
- return id , stat_info .is_dir and old_path != path
210
+ self ._update_with_path (id , stat_info , path )
211
+
212
+ # update paths of indexed children under moved directories
213
+ if stat_info .is_dir and old_path != path :
214
+ self ._move_recursive (old_path , path , sync_deque )
215
+ if sync_deque is not None :
216
+ sync_deque .appendleft ((id , path , src_mtime ))
217
+
218
+ return id
190
219
191
220
# otherwise delete the existing record with identical `ino`, since inos
192
221
# must be unique. then return None
193
222
self .con .execute ("DELETE FROM Files WHERE id = ?" , (id ,))
194
- return None , stat_info . is_dir
223
+ return None
195
224
196
225
def _normalize_path (self , path ):
197
226
"""Normalizes a given file path."""
@@ -217,14 +246,15 @@ def _parse_raw_stat(self, raw_stat):
217
246
)
218
247
stat_info .mtime = raw_stat .st_mtime_ns
219
248
stat_info .is_dir = stat .S_ISDIR (raw_stat .st_mode )
249
+ stat_info .is_symlink = stat .S_ISLNK (raw_stat .st_mode )
220
250
221
251
return stat_info
222
252
223
253
def _stat (self , path ):
224
254
"""Returns stat info on a path in a StatStruct object.Returns None if
225
255
file does not exist at path."""
226
256
try :
227
- raw_stat = os .stat (path )
257
+ raw_stat = os .lstat (path )
228
258
except OSError :
229
259
return None
230
260
@@ -240,23 +270,24 @@ def _create(self, path, stat_info):
240
270
241
271
return cursor .lastrowid
242
272
243
- def _update (self , id , stat_info , path = None ):
244
- """Updates a record given its file ID, stat info, and possibly path."""
245
- if path is not None :
246
- self .con .execute (
247
- "UPDATE Files SET path = ?, ino = ?, crtime = ?, mtime = ? WHERE id = ?" ,
248
- (path , stat_info .ino , stat_info .crtime , stat_info .mtime , id ),
249
- )
250
- else :
251
- self .con .execute (
252
- # updating `ino` and `crtime` is a conscious design decision because
253
- # this method is called by `move()`. these values are only preserved
254
- # by fs moves done via the `rename()` syscall, like `mv`. we don't
255
- # care how the contents manager moves a file; it could be deleting
256
- # and creating a new file (which will change the stat info).
257
- "UPDATE Files SET ino = ?, crtime = ?, mtime = ? WHERE id = ?" ,
258
- (stat_info .ino , stat_info .crtime , stat_info .mtime , id ),
259
- )
273
+ def _update_with_path (self , id , stat_info , path ):
274
+ """Same as _update(), but accepts and updates path."""
275
+ self .con .execute (
276
+ "UPDATE Files SET path = ?, ino = ?, crtime = ?, mtime = ? WHERE id = ?" ,
277
+ (path , stat_info .ino , stat_info .crtime , stat_info .mtime , id ),
278
+ )
279
+
280
+ def _update (self , id , stat_info ):
281
+ """Updates a record given its file ID and stat info."""
282
+ # updating `ino` and `crtime` is a conscious design decision because
283
+ # this method is called by `move()`. these values are only preserved by
284
+ # fs moves done via the `rename()` syscall, like `mv`. we don't care how
285
+ # the contents manager moves a file; it could be deleting and creating a
286
+ # new file (which will change the stat info).
287
+ self .con .execute (
288
+ "UPDATE Files SET ino = ?, crtime = ?, mtime = ? WHERE id = ?" ,
289
+ (stat_info .ino , stat_info .crtime , stat_info .mtime , id ),
290
+ )
260
291
261
292
def index (self , path , stat_info = None , commit = True ):
262
293
"""Returns the file ID for the file at `path`, creating a new file ID if
@@ -266,8 +297,12 @@ def index(self, path, stat_info=None, commit=True):
266
297
if not stat_info :
267
298
return None
268
299
300
+ # if file is symlink, then index the path it refers to instead
301
+ if stat_info .is_symlink :
302
+ return self .index (os .path .realpath (path ))
303
+
269
304
# sync file at path and return file ID if it exists
270
- id , _ = self ._sync_file (path , stat_info )
305
+ id = self ._sync_file (path , stat_info )
271
306
if id is not None :
272
307
return id
273
308
@@ -287,7 +322,7 @@ def get_id(self, path):
287
322
return None
288
323
289
324
# then sync file at path and retrieve id, if any
290
- id , _ = self ._sync_file (path , stat_info )
325
+ id = self ._sync_file (path , stat_info )
291
326
self .con .commit ()
292
327
return id
293
328
@@ -306,6 +341,27 @@ def get_path(self, id):
306
341
307
342
return path
308
343
344
+ def _move_recursive (self , old_path , new_path , sync_deque = None ):
345
+ """Updates path of all indexed files prefixed with `old_path` and
346
+ replaces the prefix with `new_path`. If `sync_deque` is specified, moved
347
+ indexed directories are appended to `sync_deque`."""
348
+ old_path_glob = os .path .join (old_path , "*" )
349
+ records = self .con .execute (
350
+ "SELECT id, path, mtime FROM Files WHERE path GLOB ?" , (old_path_glob ,)
351
+ ).fetchall ()
352
+
353
+ for record in records :
354
+ id , old_recpath , mtime = record
355
+ new_recpath = os .path .join (new_path , os .path .relpath (old_recpath , start = old_path ))
356
+ stat_info = self ._stat (new_recpath )
357
+ if not stat_info :
358
+ continue
359
+
360
+ self ._update_with_path (id , stat_info , new_recpath )
361
+
362
+ if sync_deque is not None and stat_info .is_dir :
363
+ sync_deque .append ((id , new_recpath , mtime ))
364
+
309
365
def move (self , old_path , new_path , recursive = False ):
310
366
"""Handles file moves by updating the file path of the associated file
311
367
ID. Returns the file ID. Returns None if file does not exist at new_path."""
@@ -320,22 +376,7 @@ def move(self, old_path, new_path, recursive=False):
320
376
self .log .debug (f"FileIdManager : Moving file from ${ old_path } to ${ new_path } " )
321
377
322
378
if recursive :
323
- old_path_glob = os .path .join (old_path , "*" )
324
- records = self .con .execute (
325
- "SELECT id, path FROM Files WHERE path GLOB ?" , (old_path_glob ,)
326
- ).fetchall ()
327
- for record in records :
328
- if not record :
329
- continue
330
- id , old_recpath = record
331
- new_recpath = os .path .join (new_path , os .path .basename (old_recpath ))
332
- rec_stat_info = self ._stat (new_recpath )
333
- if not rec_stat_info :
334
- continue
335
- self .con .execute (
336
- "UPDATE Files SET path = ?, mtime = ? WHERE id = ?" ,
337
- (new_recpath , rec_stat_info .mtime , id ),
338
- )
379
+ self ._move_recursive (old_path , new_path )
339
380
340
381
# attempt to fetch ID associated with old path
341
382
# we avoid using get_id() here since that will always return None as file no longer exists at old path
@@ -349,7 +390,7 @@ def move(self, old_path, new_path, recursive=False):
349
390
# update existing record with new path and stat info
350
391
# TODO: make sure is_dir for existing record matches that of file at new_path
351
392
id = row [0 ]
352
- self ._update (id , stat_info , new_path )
393
+ self ._update_with_path (id , stat_info , new_path )
353
394
self .con .commit ()
354
395
return id
355
396
@@ -387,8 +428,8 @@ def copy(self, from_path, to_path, recursive=False):
387
428
),
388
429
)
389
430
431
+ self .index (from_path , commit = False )
390
432
# transaction committed in index()
391
- self .index (from_path )
392
433
return self .index (to_path )
393
434
394
435
def delete (self , path , recursive = False ):
0 commit comments