Skip to content

Commit 9947f2f

Browse files
hikerockiesSomasundaram Krishnasamy
authored andcommitted
mm: Allow userspace to reserve VA range for use by userspace only
Add support for ELF binaries to reserve address ranges. An address range can be reserved at load time by adding an ELF NOTE section, or at run time by calling mprotect() with the PROT_RESERVED flag. Reserved ranges can be allocated later with mmap(..., MAP_FIXED, ...) and shmat(..., SHM_REMAP). Any reserved address ranges are annotated with "[rsvd]" in /proc/<pid>/maps output. A binary can check whether the kernel supports VA range reservation by checking the value of the auxiliary vector entry AT_VA_RESERVATION.

VA reservation is done by adding a special NOTE section to the binary, using declarations similar to the following:

    .section .note.rsvd_range, "a", @note
    .p2align 2
    .long 1f - 0f           # name size (not including padding)
    .long 3f - 2f           # desc size (not including padding)
    .long 0x07c10001
    0: .asciz "Reserved VA" # name
    1: .p2align 2
    2: .quad 0x7f2000000000
       .quad 0x7f2000e00000
       .quad 0x7f5000200000
       .quad 0x7f500d000000
    3: .p2align 2

Each reserved range is specified as a pair of addresses (start and end). This note section is read by the kernel ELF loader, and the address ranges are reserved for the lifetime of the process. A maximum of 64 such entries can be made in the NOTE section. Execution of a binary file with more than 64 pairs of addresses in this note section will be terminated with ENOEXEC.

NOTE: The kernel cannot guarantee that all VA ranges in the NOTE section will be reserved. If an address range is valid but is already in use (possibly by a shared library loaded earlier), execution of the binary will be terminated with ENOMEM.

NOTE: This feature needs two VMA flag bits. There are no free bits available in the lower 32 bits. As a result, this feature can only be supported on architectures that support high VMA flag bits (bits 32-63).
Orabug: 28438736 Signed-off-by: Khalid Aziz <[email protected]> Reviewed-by: Konrad Rzeszutek Wilk <[email protected]> Reviewed-by: Mike Kravetz <[email protected]> (cherry picked from LUCI 97f7b3403bf1b2c3b4dea2e46e766aa3195a40c4) Signed-off-by: Khalid Aziz <[email protected]> Reviewed-by: Mike Kravetz <[email protected]> Signed-off-by: Somasundaram Krishnasamy <[email protected]>
1 parent 3da3ac1 commit 9947f2f

File tree

7 files changed

+281
-6
lines changed

7 files changed

+281
-6
lines changed

arch/x86/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ config X86_64
2929
select HAVE_ARCH_SOFT_DIRTY
3030
select MODULES_USE_ELF_RELA
3131
select X86_DEV_DMA_OPS
32+
select ARCH_USES_HIGH_VMA_FLAGS
3233

3334
#
3435
# Arch settings

fs/binfmt_elf.c

Lines changed: 125 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@
5151
#define user_siginfo_t siginfo_t
5252
#endif
5353

54+
/*
 * Declared here for reserve_va_range(), which calls it with mm->mmap_sem
 * held for write, prev == NULL, and the page-aligned start address and
 * length of one range; a negative return value is treated as an error.
 * NOTE(review): the definition is not in the visible part of this file;
 * a prototype in a shared header would be preferable to a local extern.
 */
extern int install_rsvd_mapping(struct mm_struct *mm,
55+
struct vm_area_struct *prev, unsigned long addr,
56+
unsigned long len);
57+
58+
5459
static int load_elf_binary(struct linux_binprm *bprm);
5560
static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
5661
int, int, unsigned long);
@@ -77,6 +82,8 @@ static int elf_core_dump(struct coredump_params *cprm);
7782
#define ELF_MIN_ALIGN PAGE_SIZE
7883
#endif
7984

85+
/*
 * Size limit (4 MiB) used by load_elf_binary() to reject a PT_NOTE segment
 * whose p_filesz is larger than this before the segment is read.
 */
#define MAX_FILE_NOTE_SIZE (4*1024*1024)
86+
8087
#ifndef ELF_CORE_EFLAGS
8188
#define ELF_CORE_EFLAGS 0
8289
#endif
@@ -269,6 +276,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
269276
if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
270277
NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
271278
}
279+
NEW_AUX_ENT(AT_VA_RESERVATION, 1);
272280
#undef NEW_AUX_ENT
273281
/* AT_NULL is zero; clear the rest too */
274282
memset(&elf_info[ei_index], 0,
@@ -676,6 +684,105 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
676684
#endif
677685
}
678686

687+
/* Maximum number of (start, end) address pairs accepted in one note */
#define MAX_RSVD_VA_RANGES 64
688+
/* Note name that marks a VA reservation note */
#define RSVD_VA_STRING "Reserved VA"
689+
/* Size of the note name, including its terminating NUL byte */
#define SZ_RSVD_VA_STRING sizeof(RSVD_VA_STRING)
690+
691+
static int reserve_va_range(struct elf_phdr *elf_ppnt,
692+
struct linux_binprm *bprm)
693+
{
694+
char *note_seg = NULL;
695+
struct elf_note *note;
696+
loff_t pos = elf_ppnt->p_offset;
697+
int retval = 0;
698+
size_t note_size = elf_ppnt->p_filesz;
699+
700+
note_seg = kvmalloc(note_size, GFP_KERNEL);
701+
if (!note_seg) {
702+
retval = -ENOMEM;
703+
return retval;
704+
}
705+
706+
retval = kernel_read(bprm->file, note_seg, note_size, &pos);
707+
if (retval != note_size) {
708+
if (retval >= 0)
709+
retval = -EIO;
710+
goto out;
711+
}
712+
713+
note = (struct elf_note *)note_seg;
714+
while ((char *)note + sizeof(struct elf_note) <
715+
(char *)(note_seg + note_size)) {
716+
char *name;
717+
unsigned long *val;
718+
unsigned long nentry, i;
719+
720+
if (note->n_type != 0x07c10001)
721+
goto cont_loop;
722+
723+
/* Sanity check for malformed note entry */
724+
if (note->n_namesz > SZ_RSVD_VA_STRING) {
725+
retval = -ENOEXEC;
726+
goto out;
727+
}
728+
729+
name = (char *)note + sizeof(struct elf_note);
730+
if (strncmp(name, RSVD_VA_STRING, SZ_RSVD_VA_STRING) == 0) {
731+
nentry = note->n_descsz/sizeof(void *);
732+
val = (unsigned long *)(name +
733+
roundup(note->n_namesz, 4));
734+
/*
735+
* Check if right number of address
736+
* entries exist in note section
737+
*/
738+
if (((nentry % 2) != 0) ||
739+
((nentry % 2) > MAX_RSVD_VA_RANGES)) {
740+
retval = -ENOEXEC;
741+
goto out;
742+
}
743+
for (i = 0 ; i < nentry; i += 2) {
744+
unsigned long range1, range2;
745+
struct mm_struct *mm = current->mm;
746+
747+
/*
748+
* Ensure we can access two address entries
749+
* in this note segment safely
750+
*/
751+
if ((char *)(val + 1) >=
752+
((char *)note_seg + note_size)) {
753+
retval = -ENOEXEC;
754+
goto out;
755+
}
756+
range1 = PAGE_ALIGN((*val++) - PAGE_SIZE + 1);
757+
range2 = PAGE_ALIGN(*val++);
758+
759+
/* Validate the address range being reserved */
760+
if ((range2 <= range1) ||
761+
(range2 > user_addr_max())) {
762+
retval = -ENOEXEC;
763+
goto out;
764+
}
765+
766+
down_write(&mm->mmap_sem);
767+
retval = install_rsvd_mapping(mm, NULL, range1,
768+
(range2-range1));
769+
up_write(&mm->mmap_sem);
770+
if (retval < 0)
771+
goto out;
772+
}
773+
}
774+
cont_loop:
775+
note = (struct elf_note *)((char *)note +
776+
sizeof(struct elf_note) +
777+
roundup(note->n_namesz, 4) +
778+
roundup(note->n_descsz, 4));
779+
}
780+
781+
out:
782+
kvfree(note_seg);
783+
return retval;
784+
}
785+
679786
static int load_elf_binary(struct linux_binprm *bprm)
680787
{
681788
struct file *interpreter = NULL; /* to shut gcc up */
@@ -877,6 +984,24 @@ static int load_elf_binary(struct linux_binprm *bprm)
877984

878985
current->mm->start_stack = bprm->p;
879986

987+
/*
988+
* Read the notes segment to find notes to reserve address space
989+
*/
990+
elf_ppnt = elf_phdata;
991+
for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
992+
if (elf_ppnt->p_type == PT_NOTE) {
993+
/* Sanity check for bogus note segment */
994+
if ((elf_ppnt->p_filesz > MAX_FILE_NOTE_SIZE) ||
995+
(elf_ppnt->p_filesz < sizeof(struct elf_note))) {
996+
retval = -ENOEXEC;
997+
goto out_free_ph;
998+
}
999+
retval = reserve_va_range(elf_ppnt, bprm);
1000+
if (retval < 0)
1001+
goto out_free_ph;
1002+
}
1003+
1004+
8801005
/* Now we do a little grungy work by mmapping the ELF image into
8811006
the correct location in memory. */
8821007
for(i = 0, elf_ppnt = elf_phdata;
@@ -1565,7 +1690,6 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
15651690
fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
15661691
}
15671692

1568-
#define MAX_FILE_NOTE_SIZE (4*1024*1024)
15691693
/*
15701694
* Format of NT_FILE note:
15711695
*

include/linux/mm.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,11 +214,15 @@ extern unsigned int kobjsize(const void *objp);
214214
#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
215215
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
216216
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
217+
#define VM_HIGH_ARCH_BIT_16 48 /* bit only usable on 64-bit architectures */
218+
#define VM_HIGH_ARCH_BIT_17 49 /* bit only usable on 64-bit architectures */
217219
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
218220
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
219221
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
220222
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
221223
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
224+
#define VM_HIGH_ARCH_16 BIT(VM_HIGH_ARCH_BIT_16)
225+
#define VM_HIGH_ARCH_17 BIT(VM_HIGH_ARCH_BIT_17)
222226
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
223227

224228
#if defined(CONFIG_X86)
@@ -242,6 +246,17 @@ extern unsigned int kobjsize(const void *objp);
242246
# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */
243247
#endif
244248

249+
#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
250+
# define VM_RSVD_VA VM_HIGH_ARCH_16 /* Reserved VA range */
251+
# define VM_RSVD_NORELINK VM_HIGH_ARCH_17 /* VA range unmapped by
252+
* userspace but still reserved
253+
* for use by userspace only
254+
*/
255+
#else
256+
# define VM_RSVD_VA VM_NONE
257+
# define VM_RSVD_NORELINK VM_NONE
258+
#endif
259+
245260
#if defined(CONFIG_X86_INTEL_MPX)
246261
/* MPX specific bounds table or bounds directory */
247262
# define VM_MPX VM_HIGH_ARCH_4

include/uapi/asm-generic/mman-common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#define PROT_NONE 0x0 /* page can not be accessed */
1515
#define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */
1616
#define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */
17+
#define PROT_RESERVED 0x10000000 /* Reserve this VA range */
1718

1819
#define MAP_SHARED 0x01 /* Share changes */
1920
#define MAP_PRIVATE 0x02 /* Changes are private */

include/uapi/linux/auxvec.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
* differ from AT_PLATFORM. */
3131
#define AT_RANDOM 25 /* address of 16 random bytes */
3232
#define AT_HWCAP2 26 /* extension of AT_HWCAP */
33+
#define AT_VA_RESERVATION 27 /* VA reservation support */
3334

3435
#define AT_EXECFN 31 /* filename of program */
3536

0 commit comments

Comments
 (0)