Skip to content

Commit 9da7b46

Browse files
committed
add FileIdManager class and assoc. tests
1 parent 1ec1aee commit 9da7b46

File tree

4 files changed

+406
-0
lines changed

4 files changed

+406
-0
lines changed
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import os
2+
import timeit
3+
4+
from jupyter_core.paths import jupyter_data_dir
5+
6+
from jupyter_server.services.contents.fileidmanager import FileIdManager
7+
8+
# SQLite file shared by every benchmark below; it lives in the directory
# returned by `jupyter_data_dir()` and is recreated by each setup callable.
db_path = os.path.join(jupyter_data_dir(), "file_id_manager_perftest.db")
9+
10+
11+
def build_setup(n, insert=True):
    """Build a zero-argument setup callable suitable for `timeit`.

    The returned callable removes any benchmark DB left at `db_path`, opens a
    fresh `FileIdManager` on it and, when `insert` is true, pre-populates the
    Files table with `n` rows named ``abracadabra/<i>.txt`` inside a single
    transaction.

    Parameters:
        n: number of rows to insert when `insert` is true.
        insert: when false, only the empty DB is created.
    """

    def setup():
        try:
            os.remove(db_path)
        except OSError:
            # Best-effort removal: usually the file simply does not exist yet.
            # Only filesystem errors are ignored so that KeyboardInterrupt and
            # programming errors still surface.
            pass
        fid_manager = FileIdManager(db_path=db_path)

        if not insert:
            return

        fid_manager.con.executemany(
            "INSERT INTO Files (path) VALUES (?)",
            ((f"abracadabra/{i}.txt",) for i in range(n)),
        )
        fid_manager.con.commit()

    return setup
29+
30+
31+
# Number of rows placed in each multi-row INSERT statement by the batched
# variant of the index benchmark.
BATCH_SIZE = 100_000
32+
33+
34+
def build_test_index(n, single_transaction, batched=False):
    """Build the zero-argument statement timed by the index benchmarks.

    Parameters:
        n: number of file paths (``abracadabra/<i>.txt``) to insert.
        single_transaction: when false, each path goes through
            `FileIdManager.index()`, which commits per file. When true, rows
            are inserted directly on the connection and committed once.
        batched: only meaningful with `single_transaction`; when true, rows
            are sent as multi-row INSERT statements of at most `BATCH_SIZE`
            rows each instead of one INSERT per row.
    """

    def test_index():
        fid_manager = FileIdManager(db_path=db_path)

        if not single_transaction:
            for i in range(n):
                fid_manager.index(f"abracadabra/{i}.txt")
            return

        if batched:
            for batch_start in range(0, n, BATCH_SIZE):
                # Clamp the final batch to `n`; without this every run
                # inserts a whole multiple of BATCH_SIZE rows, regardless of
                # the requested size.
                batch_end = min(batch_start + BATCH_SIZE, n)
                # Single quotes: in SQL, double quotes denote identifiers and
                # are only accepted as string literals via a legacy fallback.
                values = ",".join(
                    f"('abracadabra/{i}.txt')" for i in range(batch_start, batch_end)
                )
                fid_manager.con.execute("INSERT INTO FILES (path) VALUES " + values)
        else:
            for i in range(n):
                fid_manager.con.execute(
                    "INSERT INTO Files (path) VALUES (?)", (f"abracadabra/{i}.txt",)
                )

        fid_manager.con.commit()

    return test_index
60+
61+
62+
def test_copy():
    """Timed statement: recursively copy the `abracadabra` tree to `shazam`."""
    FileIdManager(db_path=db_path).copy("abracadabra", "shazam", recursive=True)
65+
66+
67+
def test_move():
    """Timed statement: recursively move the `abracadabra` tree to `shazam`."""
    FileIdManager(db_path=db_path).move("abracadabra", "shazam", recursive=True)
70+
71+
72+
def test_delete():
    """Timed statement: recursively delete the `abracadabra` tree."""
    FileIdManager(db_path=db_path).delete("abracadabra", recursive=True)
75+
76+
77+
row_template = "{:<9,d} files | {:<8.4f} s"

_SMALL_SIZES = [100, 1_000]
_ALL_SIZES = [100, 1_000, 10_000, 100_000, 1_000_000]


def _report(title, sizes, make_stmt, make_setup):
    """Print `title`, then one timing row per entry of `sizes`.

    `make_stmt(size)` and `make_setup(size)` produce the callables handed to
    `timeit.timeit`; each measurement is a single run (`number=1`).
    """
    print(title)
    for size in sizes:
        seconds = timeit.timeit(make_stmt(size), make_setup(size), number=1)
        print(row_template.format(size, seconds))


def _empty_db_setup(size):
    """Setup for the index benchmarks: a fresh DB with no rows inserted."""
    return build_setup(size, insert=False)


# too slow for 1k+, so only the small sizes are run
_report(
    "Index benchmark (separate transactions)",
    _SMALL_SIZES,
    lambda size: build_test_index(size, single_transaction=False),
    _empty_db_setup,
)

_report(
    "Index benchmark (single transaction, atomic INSERTs)",
    _ALL_SIZES,
    lambda size: build_test_index(size, single_transaction=True, batched=False),
    _empty_db_setup,
)

# suggested by https://stackoverflow.com/a/72527058/12548458
# asymptotically faster because it reduces work being done by the SQLite VDBE https://www.sqlite.org/opcode.html
# weird constant time factor that makes it sub-optimal for <1M records.
_report(
    "Index benchmark (single transaction, batched INSERTs)",
    _ALL_SIZES,
    lambda size: build_test_index(size, single_transaction=True, batched=True),
    _empty_db_setup,
)

# The recursive benchmarks start from a DB pre-populated with `size` rows.
_report("Recursive move benchmark", _ALL_SIZES, lambda size: test_move, build_setup)

_report("Recursive copy benchmark", _ALL_SIZES, lambda size: test_copy, build_setup)

_report("Recursive delete benchmark", _ALL_SIZES, lambda size: test_delete, build_setup)

jupyter_server/pytest_plugin.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from jupyter_server.extension import serverextension
2020
from jupyter_server.serverapp import ServerApp
21+
from jupyter_server.services.contents.fileidmanager import FileIdManager
2122
from jupyter_server.services.contents.filemanager import FileContentsManager
2223
from jupyter_server.services.contents.largefilemanager import LargeFileManager
2324
from jupyter_server.utils import url_path_join
@@ -457,6 +458,27 @@ def jp_large_contents_manager(tmp_path):
457458
return LargeFileManager(root_dir=str(tmp_path))
458459

459460

461+
@pytest.fixture
def fid_db_path(jp_data_dir):
    """Fixture giving the file ID DB location used by tests, as a string."""
    db_file = jp_data_dir / "fileidmanager_test.db"
    return str(db_file)
465+
466+
467+
@pytest.fixture(autouse=True)
def delete_db(fid_db_path):
    """Autouse fixture that removes the file ID DB file ahead of every test.

    Removal is best-effort: any `OSError` raised by `os.remove` is ignored.
    """
    try:
        os.remove(fid_db_path)
    except OSError:
        # Nothing to clean up (or the file could not be removed); carry on.
        return
474+
475+
476+
@pytest.fixture
def fid_manager(fid_db_path):
    """Fixture yielding a test-configured instance of `FileIdManager`.

    The manager is opened on `fid_db_path`; its SQLite connection is closed
    when the test finishes so the DB file is not left open between tests.
    """
    manager = FileIdManager(db_path=fid_db_path)
    yield manager
    # Close the connection directly instead of calling `_cleanup()`: that
    # method commits first, which raises if a test already closed the
    # connection, whereas a repeated `close()` is harmless.
    manager.con.close()
480+
481+
460482
@pytest.fixture
461483
def jp_create_notebook(jp_root_dir):
462484
"""Creates a notebook in the test's home directory."""
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import os
2+
import sqlite3
3+
4+
from jupyter_core.paths import jupyter_data_dir
5+
from traitlets import Unicode
6+
from traitlets.config.configurable import LoggingConfigurable
7+
8+
9+
class FileIdManager(LoggingConfigurable):
    """Map file paths to stable integer file IDs stored in SQLite.

    Records live in a `Files(id, path)` table inside the database file at
    `db_path`. Every public method normalizes the paths it receives with
    `_normalize_path` and commits its own changes before returning.
    """

    db_path = Unicode(
        default_value=os.path.join(jupyter_data_dir(), "file_id_manager.db"),
        help=(
            "The path of the DB file used by `FileIdManager`. "
            "Defaults to `jupyter_data_dir()/file_id_manager.db`."
        ),
        config=True,
    )

    def __init__(self, *args, **kwargs):
        """Open the database at `db_path` and create the schema if missing."""
        # pass args and kwargs to parent Configurable
        super().__init__(*args, **kwargs)
        # initialize connection with db
        self.con = sqlite3.connect(self.db_path)
        self.log.debug("Creating File ID tables and indices")
        self.con.execute(
            "CREATE TABLE IF NOT EXISTS Files(id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE)"
        )
        self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_path ON FILES (path)")
        self.con.commit()

    def _normalize_path(self, path):
        """Return `path` after `os.path.normcase` and `os.path.normpath`."""
        path = os.path.normcase(path)
        path = os.path.normpath(path)
        return path

    @staticmethod
    def _glob_children(path):
        """Return a GLOB pattern matching every path nested under `path`.

        GLOB metacharacters occurring in `path` itself (`*`, `?`, `[`) are
        wrapped in a one-character class so they only match literally;
        otherwise a directory such as ``data[1]`` would be read as a pattern.
        """
        escaped = "".join(f"[{ch}]" if ch in "*?[" else ch for ch in path)
        return os.path.join(escaped, "*")

    def index(self, path):
        """Add `path` to the Files table and return its file ID.

        If the path is already indexed, the existing file ID is returned and
        nothing is inserted.
        """
        path = self._normalize_path(path)
        existing_id = self.get_id(path)
        if existing_id is not None:
            return existing_id

        cursor = self.con.execute("INSERT INTO Files (path) VALUES (?)", (path,))
        self.con.commit()
        return cursor.lastrowid

    def get_id(self, path):
        """Return the file ID recorded for `path`, or None if not indexed."""
        path = self._normalize_path(path)
        row = self.con.execute("SELECT id FROM Files WHERE path = ?", (path,)).fetchone()
        self.con.commit()
        return row[0] if row else None

    def get_path(self, id):
        """Return the path recorded for file ID `id`, or None if the ID does
        not exist in the Files table."""
        row = self.con.execute("SELECT path FROM Files WHERE id = ?", (id,)).fetchone()
        self.con.commit()
        return row[0] if row else None

    def move(self, old_path, new_path, recursive=False):
        """Handle a file move by pointing the existing file ID at `new_path`.

        With `recursive=True`, every record nested under `old_path` is first
        rewritten to live under `new_path`. If `old_path` itself has no
        record, `new_path` is indexed instead. Returns the file ID now
        associated with `new_path`.
        """
        old_path = self._normalize_path(old_path)
        new_path = self._normalize_path(new_path)
        self.log.debug("Moving file from %s to %s", old_path, new_path)

        if recursive:
            # substr() is 1-based: keep everything after the old prefix
            # (starting at the separator) and graft it onto the new prefix.
            self.con.execute(
                "UPDATE Files SET path = ? || substr(path, ?) WHERE path GLOB ?",
                (new_path, len(old_path) + 1, self._glob_children(old_path)),
            )
            self.con.commit()

        file_id = self.get_id(old_path)
        if file_id is None:
            return self.index(new_path)

        self.con.execute("UPDATE Files SET path = ? WHERE id = ?", (new_path, file_id))
        self.con.commit()
        return file_id

    def copy(self, from_path, to_path, recursive=False):
        """Handle a file copy by creating a new record for `to_path`.

        With `recursive=True`, a new record is also created under `to_path`
        for every record nested under `from_path`. `from_path` is indexed if
        it has no record yet. Returns the file ID associated with `to_path`.

        TODO: emit to event bus to inform client extensions to copy records
        associated with the old file ID to the new file ID.
        """
        from_path = self._normalize_path(from_path)
        to_path = self._normalize_path(to_path)
        self.log.debug("Copying file from %s to %s", from_path, to_path)

        if recursive:
            self.con.execute(
                "INSERT INTO Files (path) SELECT (? || substr(path, ?)) FROM Files WHERE path GLOB ?",
                (to_path, len(from_path) + 1, self._glob_children(from_path)),
            )
            self.con.commit()

        self.index(from_path)
        return self.index(to_path)

    def delete(self, path, recursive=False):
        """Handle a file deletion by removing the record for `path`.

        With `recursive=True`, every record nested under `path` is removed as
        well. Returns None.
        """
        path = self._normalize_path(path)
        self.log.debug("Deleting file %s", path)

        if recursive:
            self.con.execute(
                "DELETE FROM Files WHERE path GLOB ?", (self._glob_children(path),)
            )
            self.con.commit()

        self.con.execute("DELETE FROM Files WHERE path = ?", (path,))
        self.con.commit()

    def _cleanup(self):
        """Commit any pending transaction and close the DB connection."""
        self.con.commit()
        self.con.close()

0 commit comments

Comments
 (0)