@@ -18,7 +18,6 @@ prototypes:
18
18
char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen);
19
19
20
20
locking rules:
21
- none have BKL
22
21
dcache_lock rename_lock ->d_lock may block
23
22
d_revalidate: no no no yes
24
23
d_hash: no no no yes
@@ -42,18 +41,23 @@ ata *);
42
41
int (*rename) (struct inode *, struct dentry *,
43
42
struct inode *, struct dentry *);
44
43
int (*readlink) (struct dentry *, char __user *,int);
45
- int (*follow_link) (struct dentry *, struct nameidata *);
44
+ void * (*follow_link) (struct dentry *, struct nameidata *);
45
+ void (*put_link) (struct dentry *, struct nameidata *, void *);
46
46
void (*truncate) (struct inode *);
47
47
int (*permission) (struct inode *, int, struct nameidata *);
48
+ int (*check_acl)(struct inode *, int);
48
49
int (*setattr) (struct dentry *, struct iattr *);
49
50
int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
50
51
int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
51
52
ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
52
53
ssize_t (*listxattr) (struct dentry *, char *, size_t);
53
54
int (*removexattr) (struct dentry *, const char *);
55
+ void (*truncate_range)(struct inode *, loff_t, loff_t);
56
+ long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len);
57
+ int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
54
58
55
59
locking rules:
56
- all may block, none have BKL
60
+ all may block
57
61
i_mutex(inode)
58
62
lookup: yes
59
63
create: yes
@@ -66,19 +70,24 @@ rmdir: yes (both) (see below)
66
70
rename: yes (all) (see below)
67
71
readlink: no
68
72
follow_link: no
73
+ put_link: no
69
74
truncate: yes (see below)
70
75
setattr: yes
71
76
permission: no
77
+ check_acl: no
72
78
getattr: no
73
79
setxattr: yes
74
80
getxattr: no
75
81
listxattr: no
76
82
removexattr: yes
83
+ truncate_range: yes
84
+ fallocate: no
85
+ fiemap: no
77
86
Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
78
87
victim.
79
88
cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
80
89
->truncate() is never called directly - it's a callback, not a
81
- method. It's called by vmtruncate() - library function normally used by
90
+ method. It's called by vmtruncate() - deprecated library function used by
82
91
->setattr(). Locking information above applies to that call (i.e. is
83
92
inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been
84
93
passed).
@@ -91,7 +100,7 @@ prototypes:
91
100
struct inode *(*alloc_inode)(struct super_block *sb);
92
101
void (*destroy_inode)(struct inode *);
93
102
void (*dirty_inode) (struct inode *);
94
- int (*write_inode) (struct inode *, int );
103
+ int (*write_inode) (struct inode *, struct writeback_control *wbc );
95
104
int (*drop_inode) (struct inode *);
96
105
void (*evict_inode) (struct inode *);
97
106
void (*put_super) (struct super_block *);
@@ -105,10 +114,10 @@ prototypes:
105
114
int (*show_options)(struct seq_file *, struct vfsmount *);
106
115
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
107
116
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
117
+ int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
108
118
109
119
locking rules:
110
120
All may block [not true, see below]
111
- None have BKL
112
121
s_umount
113
122
alloc_inode:
114
123
destroy_inode:
@@ -127,6 +136,7 @@ umount_begin: no
127
136
show_options: no (namespace_sem)
128
137
quota_read: no (see below)
129
138
quota_write: no (see below)
139
+ bdev_try_to_free_page: no (see below)
130
140
131
141
->statfs() has s_umount (shared) when called by ustat(2) (native or
132
142
compat), but that's an accident of bad API; s_umount is used to pin
@@ -139,19 +149,25 @@ be the only ones operating on the quota file by the quota code (via
139
149
dqio_sem) (unless an admin really wants to screw up something and
140
150
writes to quota files with quotas on). For other details about locking
141
151
see also dquot_operations section.
152
+ ->bdev_try_to_free_page is called from the ->releasepage handler of
153
+ the block device inode. See there for more details.
142
154
143
155
--------------------------- file_system_type ---------------------------
144
156
prototypes:
145
157
int (*get_sb) (struct file_system_type *, int,
146
158
const char *, void *, struct vfsmount *);
159
+ struct dentry *(*mount) (struct file_system_type *, int,
160
+ const char *, void *);
147
161
void (*kill_sb) (struct super_block *);
148
162
locking rules:
149
- may block BKL
150
- get_sb yes no
151
- kill_sb yes no
163
+ may block
164
+ get_sb yes
165
+ mount yes
166
+ kill_sb yes
152
167
153
168
->get_sb() returns error or 0 with locked superblock attached to the vfsmount
154
169
(exclusive on ->s_umount).
170
+ ->mount() returns ERR_PTR or the root dentry.
155
171
->kill_sb() takes a write-locked superblock, does all shutdown work on it,
156
172
unlocks and drops the reference.
157
173
@@ -176,27 +192,35 @@ prototypes:
176
192
void (*freepage)(struct page *);
177
193
int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
178
194
loff_t offset, unsigned long nr_segs);
179
- int (*launder_page) (struct page *);
195
+ int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
196
+ unsigned long *);
197
+ int (*migratepage)(struct address_space *, struct page *, struct page *);
198
+ int (*launder_page)(struct page *);
199
+ int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
200
+ int (*error_remove_page)(struct address_space *, struct page *);
180
201
181
202
locking rules:
182
203
All except set_page_dirty and freepage may block
183
204
184
- BKL PageLocked(page) i_mutex
185
- writepage: no yes, unlocks (see below)
186
- readpage: no yes, unlocks
187
- sync_page: no maybe
188
- writepages: no
189
- set_page_dirty no no
190
- readpages: no
191
- write_begin: no locks the page yes
192
- write_end: no yes, unlocks yes
193
- perform_write: no n/a yes
194
- bmap: no
195
- invalidatepage: no yes
196
- releasepage: no yes
197
- freepage: no yes
198
- direct_IO: no
199
- launder_page: no yes
205
+ PageLocked(page) i_mutex
206
+ writepage: yes, unlocks (see below)
207
+ readpage: yes, unlocks
208
+ sync_page: maybe
209
+ writepages:
210
+ set_page_dirty: no
211
+ readpages:
212
+ write_begin: locks the page yes
213
+ write_end: yes, unlocks yes
214
+ bmap:
215
+ invalidatepage: yes
216
+ releasepage: yes
217
+ freepage: yes
218
+ direct_IO:
219
+ get_xip_mem: maybe
220
+ migratepage: yes (both)
221
+ launder_page: yes
222
+ is_partially_uptodate: yes
223
+ error_remove_page: yes
200
224
201
225
->write_begin(), ->write_end(), ->sync_page() and ->readpage()
202
226
may be called from the request handler (/dev/loop).
@@ -276,9 +300,8 @@ under spinlock (it cannot block) and is sometimes called with the page
276
300
not locked.
277
301
278
302
->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some
279
- filesystems and by the swapper. The latter will eventually go away. All
280
- instances do not actually need the BKL. Please, keep it that way and don't
281
- breed new callers.
303
+ filesystems and by the swapper. The latter will eventually go away. Please
304
+ don't breed new callers.
282
305
283
306
->invalidatepage() is called when the filesystem must attempt to drop
284
307
some or all of the buffers from the page when it is being truncated. It
@@ -299,47 +322,37 @@ cleaned, or an error value if not. Note that in order to prevent the page
299
322
getting mapped back in and redirtied, it needs to be kept locked
300
323
across the entire operation.
301
324
302
- Note: currently almost all instances of address_space methods are
303
- using BKL for internal serialization and that's one of the worst sources
304
- of contention. Normally they are calling library functions (in fs/buffer.c)
305
- and pass foo_get_block() as a callback (on local block-based filesystems,
306
- indeed). BKL is not needed for library stuff and is usually taken by
307
- foo_get_block(). It's an overkill, since block bitmaps can be protected by
308
- internal fs locking and real critical areas are much smaller than the areas
309
- filesystems protect now.
310
-
311
325
----------------------- file_lock_operations ------------------------------
312
326
prototypes:
313
- void (*fl_insert)(struct file_lock *); /* lock insertion callback */
314
- void (*fl_remove)(struct file_lock *); /* lock removal callback */
315
327
void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
316
328
void (*fl_release_private)(struct file_lock *);
317
329
318
330
319
331
locking rules:
320
- BKL may block
321
- fl_insert: yes no
322
- fl_remove: yes no
323
- fl_copy_lock: yes no
324
- fl_release_private: yes yes
332
+ file_lock_lock may block
333
+ fl_copy_lock: yes no
334
+ fl_release_private: maybe no
325
335
326
336
----------------------- lock_manager_operations ---------------------------
327
337
prototypes:
328
338
int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
329
339
void (*fl_notify)(struct file_lock *); /* unblock callback */
340
+ int (*fl_grant)(struct file_lock *, struct file_lock *, int);
330
341
void (*fl_release_private)(struct file_lock *);
331
342
void (*fl_break)(struct file_lock *); /* break_lease callback */
343
+ int (*fl_mylease)(struct file_lock *, struct file_lock *);
344
+ int (*fl_change)(struct file_lock **, int);
332
345
333
346
locking rules:
334
- BKL may block
335
- fl_compare_owner: yes no
336
- fl_notify: yes no
337
- fl_release_private: yes yes
338
- fl_break: yes no
339
-
340
- Currently only NFSD and NLM provide instances of this class. None of the
341
- them block. If you have out-of-tree instances - please, show up. Locking
342
- in that area will change.
347
+ file_lock_lock may block
348
+ fl_compare_owner: yes no
349
+ fl_notify: yes no
350
+ fl_grant: no no
351
+ fl_release_private: maybe no
352
+ fl_break: yes no
353
+ fl_mylease: yes no
354
+ fl_change: yes no
355
+
343
356
--------------------------- buffer_head -----------------------------------
344
357
prototypes:
345
358
void (*b_end_io)(struct buffer_head *bh, int uptodate);
@@ -364,17 +377,17 @@ prototypes:
364
377
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
365
378
366
379
locking rules:
367
- BKL bd_mutex
368
- open: no yes
369
- release: no yes
370
- ioctl: no no
371
- compat_ioctl: no no
372
- direct_access: no no
373
- media_changed: no no
374
- unlock_native_capacity: no no
375
- revalidate_disk: no no
376
- getgeo: no no
377
- swap_slot_free_notify: no no (see below)
380
+ bd_mutex
381
+ open: yes
382
+ release: yes
383
+ ioctl: no
384
+ compat_ioctl: no
385
+ direct_access: no
386
+ media_changed: no
387
+ unlock_native_capacity: no
388
+ revalidate_disk: no
389
+ getgeo: no
390
+ swap_slot_free_notify: no (see below)
378
391
379
392
media_changed, unlock_native_capacity and revalidate_disk are called only from
380
393
check_disk_change().
@@ -413,34 +426,21 @@ prototypes:
413
426
unsigned long (*get_unmapped_area)(struct file *, unsigned long,
414
427
unsigned long, unsigned long, unsigned long);
415
428
int (*check_flags)(int);
429
+ int (*flock) (struct file *, int, struct file_lock *);
430
+ ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *,
431
+ size_t, unsigned int);
432
+ ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
433
+ size_t, unsigned int);
434
+ int (*setlease)(struct file *, long, struct file_lock **);
416
435
};
417
436
418
437
locking rules:
419
- All may block.
420
- BKL
421
- llseek: no (see below)
422
- read: no
423
- aio_read: no
424
- write: no
425
- aio_write: no
426
- readdir: no
427
- poll: no
428
- unlocked_ioctl: no
429
- compat_ioctl: no
430
- mmap: no
431
- open: no
432
- flush: no
433
- release: no
434
- fsync: no (see below)
435
- aio_fsync: no
436
- fasync: no
437
- lock: yes
438
- readv: no
439
- writev: no
440
- sendfile: no
441
- sendpage: no
442
- get_unmapped_area: no
443
- check_flags: no
438
+ All may block except for ->setlease.
439
+ No VFS locks held on entry except for ->fsync and ->setlease.
440
+
441
+ ->fsync() has i_mutex on inode.
442
+
443
+ ->setlease has the file_lock_lock held and must not sleep.
444
444
445
445
->llseek() locking has moved from llseek to the individual llseek
446
446
implementations. If your fs is not using generic_file_llseek, you
@@ -450,17 +450,10 @@ mutex or just to use i_size_read() instead.
450
450
Note: this does not protect the file->f_pos against concurrent modifications
451
451
since this is something the userspace has to take care about.
452
452
453
- Note: ext2_release() was *the* source of contention on fs-intensive
454
- loads and dropping BKL on ->release() helps to get rid of that (we still
455
- grab BKL for cases when we close a file that had been opened r/w, but that
456
- can and should be done using the internal locking with smaller critical areas).
457
- Current worst offender is ext2_get_block()...
458
-
459
- ->fasync() is called without BKL protection, and is responsible for
460
- maintaining the FASYNC bit in filp->f_flags. Most instances call
461
- fasync_helper(), which does that maintenance, so it's not normally
462
- something one needs to worry about. Return values > 0 will be mapped to
463
- zero in the VFS layer.
453
+ ->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags.
454
+ Most instances call fasync_helper(), which does that maintenance, so it's
455
+ not normally something one needs to worry about. Return values > 0 will be
456
+ mapped to zero in the VFS layer.
464
457
465
458
->readdir() and ->ioctl() on directories must be changed. Ideally we would
466
459
move ->readdir() to inode_operations and use a separate method for directory
@@ -471,8 +464,6 @@ components. And there are other reasons why the current interface is a mess...
471
464
->read on directories probably must go away - we should just enforce -EISDIR
472
465
in sys_read() and friends.
473
466
474
- ->fsync() has i_mutex on inode.
475
-
476
467
--------------------------- dquot_operations -------------------------------
477
468
prototypes:
478
469
int (*write_dquot) (struct dquot *);
@@ -507,12 +498,12 @@ prototypes:
507
498
int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
508
499
509
500
locking rules:
510
- BKL mmap_sem PageLocked(page)
511
- open: no yes
512
- close: no yes
513
- fault: no yes can return with page locked
514
- page_mkwrite: no yes can return with page locked
515
- access: no yes
501
+ mmap_sem PageLocked(page)
502
+ open: yes
503
+ close: yes
504
+ fault: yes can return with page locked
505
+ page_mkwrite: yes can return with page locked
506
+ access: yes
516
507
517
508
->fault() is called when a previously not present pte is about
518
509
to be faulted in. The filesystem must find and return the page associated
@@ -539,6 +530,3 @@ VM_IO | VM_PFNMAP VMAs.
539
530
540
531
(if you break something or notice that it is broken and do not fix it yourself
541
532
- at least put it here)
542
-
543
- ipc/shm.c::shm_delete() - may need BKL.
544
- ->read() and ->write() in many drivers are (probably) missing BKL.
0 commit comments