Skip to content

Commit d2cd9ed

Browse files
Rik van Riel authored and
torvalds committed
mm,fork: introduce MADV_WIPEONFORK
Introduce MADV_WIPEONFORK semantics, which result in a VMA being empty in the child process after fork. This differs from MADV_DONTFORK in one important way.

If a child process accesses memory that was MADV_WIPEONFORK, it will get zeroes. The address ranges are still valid, they are just empty.

If a child process accesses memory that was MADV_DONTFORK, it will get a segmentation fault, since those address ranges are no longer valid in the child after fork.

Since MADV_DONTFORK also seems to be used to allow very large programs to fork in systems with strict memory overcommit restrictions, changing the semantics of MADV_DONTFORK might break existing programs.

MADV_WIPEONFORK only works on private, anonymous VMAs.

The use case is libraries that store or cache information, and want to know that they need to regenerate it in the child process after fork.

Examples of this would be:
- systemd/pulseaudio API checks (fail after fork) (replacing a getpid check, which is too slow without a PID cache)
- PKCS#11 API reinitialization check (mandated by specification)
- glibc's upcoming PRNG (reseed after fork)
- OpenSSL PRNG (reseed after fork)

The security benefits of a forking server having a re-initialized PRNG in every child process are pretty obvious. However, due to libraries having all kinds of internal state, and programs getting compiled with many different versions of each library, it is unreasonable to expect calling programs to re-initialize everything manually after fork.

A further complication is the proliferation of clone flags, programs bypassing glibc's functions to call clone directly, and programs calling unshare, causing the glibc pthread_atfork hook to not get called.

It would be better to have the kernel take care of this automatically.

The patch also adds MADV_KEEPONFORK, to undo the effects of a prior MADV_WIPEONFORK.
This is similar to the OpenBSD minherit syscall with MAP_INHERIT_ZERO: https://man.openbsd.org/minherit.2

[[email protected]: numerically order arch/parisc/include/uapi/asm/mman.h #defines]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Rik van Riel <[email protected]>
Reported-by: Florian Weimer <[email protected]>
Reported-by: Colm MacCártaigh <[email protected]>
Reviewed-by: Mike Kravetz <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: "Kirill A. Shutemov" <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Helge Deller <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Will Drewry <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent df3735c commit d2cd9ed

File tree

10 files changed

+39
-10
lines changed

10 files changed

+39
-10
lines changed

arch/alpha/include/uapi/asm/mman.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@
6464
overrides the coredump filter bits */
6565
#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */
6666

67+
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
68+
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
69+
6770
/* compatibility flags */
6871
#define MAP_FILE 0
6972

arch/mips/include/uapi/asm/mman.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,9 @@
9191
overrides the coredump filter bits */
9292
#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */
9393

94+
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
95+
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
96+
9497
/* compatibility flags */
9598
#define MAP_FILE 0
9699

arch/parisc/include/uapi/asm/mman.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@
5757
overrides the coredump filter bits */
5858
#define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */
5959

60+
#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */
61+
#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */
62+
6063
#define MADV_HWPOISON 100 /* poison a page for testing */
6164
#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
6265

arch/xtensa/include/uapi/asm/mman.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,9 @@
103103
overrides the coredump filter bits */
104104
#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */
105105

106+
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
107+
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
108+
106109
/* compatibility flags */
107110
#define MAP_FILE 0
108111

fs/proc/task_mmu.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -663,6 +663,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
663663
[ilog2(VM_NORESERVE)] = "nr",
664664
[ilog2(VM_HUGETLB)] = "ht",
665665
[ilog2(VM_ARCH_1)] = "ar",
666+
[ilog2(VM_WIPEONFORK)] = "wf",
666667
[ilog2(VM_DONTDUMP)] = "dd",
667668
#ifdef CONFIG_MEM_SOFT_DIRTY
668669
[ilog2(VM_SOFTDIRTY)] = "sd",

include/linux/mm.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ extern unsigned int kobjsize(const void *objp);
189189
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
190190
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
191191
#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
192-
#define VM_ARCH_2 0x02000000
192+
#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
193193
#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
194194

195195
#ifdef CONFIG_MEM_SOFT_DIRTY

include/trace/events/mmflags.h

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -125,12 +125,6 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" )
125125
#define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" }
126126
#endif
127127

128-
#if defined(CONFIG_X86)
129-
#define __VM_ARCH_SPECIFIC_2 {VM_MPX, "mpx" }
130-
#else
131-
#define __VM_ARCH_SPECIFIC_2 {VM_ARCH_2, "arch_2" }
132-
#endif
133-
134128
#ifdef CONFIG_MEM_SOFT_DIRTY
135129
#define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name },
136130
#else
@@ -162,7 +156,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" )
162156
{VM_NORESERVE, "noreserve" }, \
163157
{VM_HUGETLB, "hugetlb" }, \
164158
__VM_ARCH_SPECIFIC_1 , \
165-
__VM_ARCH_SPECIFIC_2 , \
159+
{VM_WIPEONFORK, "wipeonfork" }, \
166160
{VM_DONTDUMP, "dontdump" }, \
167161
IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
168162
{VM_MIXEDMAP, "mixedmap" }, \

include/uapi/asm-generic/mman-common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@
5858
overrides the coredump filter bits */
5959
#define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */
6060

61+
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
62+
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
63+
6164
/* compatibility flags */
6265
#define MAP_FILE 0
6366

kernel/fork.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -657,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
657657
retval = dup_userfaultfd(tmp, &uf);
658658
if (retval)
659659
goto fail_nomem_anon_vma_fork;
660-
if (anon_vma_fork(tmp, mpnt))
660+
if (tmp->vm_flags & VM_WIPEONFORK) {
661+
/* VM_WIPEONFORK gets a clean slate in the child. */
662+
tmp->anon_vma = NULL;
663+
if (anon_vma_prepare(tmp))
664+
goto fail_nomem_anon_vma_fork;
665+
} else if (anon_vma_fork(tmp, mpnt))
661666
goto fail_nomem_anon_vma_fork;
662667
tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
663668
tmp->vm_next = tmp->vm_prev = NULL;
@@ -701,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
701706
rb_parent = &tmp->vm_rb;
702707

703708
mm->map_count++;
704-
retval = copy_page_range(mm, oldmm, mpnt);
709+
if (!(tmp->vm_flags & VM_WIPEONFORK))
710+
retval = copy_page_range(mm, oldmm, mpnt);
705711

706712
if (tmp->vm_ops && tmp->vm_ops->open)
707713
tmp->vm_ops->open(tmp);

mm/madvise.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma,
8080
}
8181
new_flags &= ~VM_DONTCOPY;
8282
break;
83+
case MADV_WIPEONFORK:
84+
/* MADV_WIPEONFORK is only supported on anonymous memory. */
85+
if (vma->vm_file || vma->vm_flags & VM_SHARED) {
86+
error = -EINVAL;
87+
goto out;
88+
}
89+
new_flags |= VM_WIPEONFORK;
90+
break;
91+
case MADV_KEEPONFORK:
92+
new_flags &= ~VM_WIPEONFORK;
93+
break;
8394
case MADV_DONTDUMP:
8495
new_flags |= VM_DONTDUMP;
8596
break;
@@ -696,6 +707,8 @@ madvise_behavior_valid(int behavior)
696707
#endif
697708
case MADV_DONTDUMP:
698709
case MADV_DODUMP:
710+
case MADV_WIPEONFORK:
711+
case MADV_KEEPONFORK:
699712
#ifdef CONFIG_MEMORY_FAILURE
700713
case MADV_SOFT_OFFLINE:
701714
case MADV_HWPOISON:

0 commit comments

Comments
 (0)