Skip to content

Commit 8019ad1

Browse files
author
Peter Zijlstra
committed
futex: Fix inode life-time issue
As reported by Jann, ihold() does not in fact guarantee inode persistence. And instead of making it so, replace the usage of inode pointers with a per boot, machine wide, unique inode identifier. This sequence number is global, but shared (file backed) futexes are rare enough that this should not become a performance issue. Reported-by: Jann Horn <[email protected]> Suggested-by: Linus Torvalds <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
1 parent 98d54f8 commit 8019ad1

File tree

4 files changed

+65
-43
lines changed

4 files changed

+65
-43
lines changed

fs/inode.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
138138
inode->i_sb = sb;
139139
inode->i_blkbits = sb->s_blocksize_bits;
140140
inode->i_flags = 0;
141+
atomic64_set(&inode->i_sequence, 0);
141142
atomic_set(&inode->i_count, 1);
142143
inode->i_op = &empty_iops;
143144
inode->i_fop = &no_open_fops;

include/linux/fs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,7 @@ struct inode {
698698
struct rcu_head i_rcu;
699699
};
700700
atomic64_t i_version;
701+
atomic64_t i_sequence; /* see futex */
701702
atomic_t i_count;
702703
atomic_t i_dio_count;
703704
atomic_t i_writecount;

include/linux/futex.h

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,23 +31,26 @@ struct task_struct;
3131

3232
union futex_key {
3333
struct {
34+
u64 i_seq;
3435
unsigned long pgoff;
35-
struct inode *inode;
36-
int offset;
36+
unsigned int offset;
3737
} shared;
3838
struct {
39+
union {
40+
struct mm_struct *mm;
41+
u64 __tmp;
42+
};
3943
unsigned long address;
40-
struct mm_struct *mm;
41-
int offset;
44+
unsigned int offset;
4245
} private;
4346
struct {
47+
u64 ptr;
4448
unsigned long word;
45-
void *ptr;
46-
int offset;
49+
unsigned int offset;
4750
} both;
4851
};
4952

50-
#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
53+
#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } }
5154

5255
#ifdef CONFIG_FUTEX
5356
enum {

kernel/futex.c

Lines changed: 53 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,7 @@ static void get_futex_key_refs(union futex_key *key)
429429

430430
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
431431
case FUT_OFF_INODE:
432-
ihold(key->shared.inode); /* implies smp_mb(); (B) */
432+
smp_mb(); /* explicit smp_mb(); (B) */
433433
break;
434434
case FUT_OFF_MMSHARED:
435435
futex_get_mm(key); /* implies smp_mb(); (B) */
@@ -463,7 +463,6 @@ static void drop_futex_key_refs(union futex_key *key)
463463

464464
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
465465
case FUT_OFF_INODE:
466-
iput(key->shared.inode);
467466
break;
468467
case FUT_OFF_MMSHARED:
469468
mmdrop(key->private.mm);
@@ -505,6 +504,46 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
505504
return timeout;
506505
}
507506

507+
/*
508+
* Generate a machine wide unique identifier for this inode.
509+
*
510+
* This relies on u64 not wrapping in the life-time of the machine; which with
511+
* 1ns resolution means almost 585 years.
512+
*
513+
* This further relies on the fact that a well formed program will not unmap
514+
* the file while it has a (shared) futex waiting on it. This mapping will have
515+
* a file reference which pins the mount and inode.
516+
*
517+
* If for some reason an inode gets evicted and read back in again, it will get
518+
* a new sequence number and will _NOT_ match, even though it is the exact same
519+
* file.
520+
*
521+
* It is important that match_futex() will never have a false-positive, esp.
522+
* for PI futexes that can mess up the state. The above argues that false-negatives
523+
* are only possible for malformed programs.
524+
*/
525+
static u64 get_inode_sequence_number(struct inode *inode)
526+
{
527+
static atomic64_t i_seq;
528+
u64 old;
529+
530+
/* Does the inode already have a sequence number? */
531+
old = atomic64_read(&inode->i_sequence);
532+
if (likely(old))
533+
return old;
534+
535+
for (;;) {
536+
u64 new = atomic64_add_return(1, &i_seq);
537+
if (WARN_ON_ONCE(!new))
538+
continue;
539+
540+
old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
541+
if (old)
542+
return old;
543+
return new;
544+
}
545+
}
546+
508547
/**
509548
* get_futex_key() - Get parameters which are the keys for a futex
510549
* @uaddr: virtual address of the futex
@@ -517,9 +556,15 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
517556
*
518557
* The key words are stored in @key on success.
519558
*
520-
* For shared mappings, it's (page->index, file_inode(vma->vm_file),
521-
* offset_within_page). For private mappings, it's (uaddr, current->mm).
522-
* We can usually work out the index without swapping in the page.
559+
* For shared mappings (when @fshared), the key is:
560+
* ( inode->i_sequence, page->index, offset_within_page )
561+
* [ also see get_inode_sequence_number() ]
562+
*
563+
* For private mappings (or when !@fshared), the key is:
564+
* ( current->mm, address, 0 )
565+
*
566+
* This allows (cross process, where applicable) identification of the futex
567+
* without keeping the page pinned for the duration of the FUTEX_WAIT.
523568
*
524569
* lock_page() might sleep, the caller should not hold a spinlock.
525570
*/
@@ -659,8 +704,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
659704
key->private.mm = mm;
660705
key->private.address = address;
661706

662-
get_futex_key_refs(key); /* implies smp_mb(); (B) */
663-
664707
} else {
665708
struct inode *inode;
666709

@@ -692,40 +735,14 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
692735
goto again;
693736
}
694737

695-
/*
696-
* Take a reference unless it is about to be freed. Previously
697-
* this reference was taken by ihold under the page lock
698-
* pinning the inode in place so i_lock was unnecessary. The
699-
* only way for this check to fail is if the inode was
700-
* truncated in parallel which is almost certainly an
701-
* application bug. In such a case, just retry.
702-
*
703-
* We are not calling into get_futex_key_refs() in file-backed
704-
* cases, therefore a successful atomic_inc return below will
705-
* guarantee that get_futex_key() will still imply smp_mb(); (B).
706-
*/
707-
if (!atomic_inc_not_zero(&inode->i_count)) {
708-
rcu_read_unlock();
709-
put_page(page);
710-
711-
goto again;
712-
}
713-
714-
/* Should be impossible but lets be paranoid for now */
715-
if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
716-
err = -EFAULT;
717-
rcu_read_unlock();
718-
iput(inode);
719-
720-
goto out;
721-
}
722-
723738
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
724-
key->shared.inode = inode;
739+
key->shared.i_seq = get_inode_sequence_number(inode);
725740
key->shared.pgoff = basepage_index(tail);
726741
rcu_read_unlock();
727742
}
728743

744+
get_futex_key_refs(key); /* implies smp_mb(); (B) */
745+
729746
out:
730747
put_page(page);
731748
return err;

0 commit comments

Comments
 (0)