Skip to content

Commit 6365b84

Browse files
amluto authored and KAGA-KOKO committed
x86/syscalls: Split the x32 syscalls into their own table
For unfortunate historical reasons, the x32 syscalls and the x86_64 syscalls are not all numbered the same. As an example, ioctl() is nr 16 on x86_64 but 514 on x32. This has potentially nasty consequences, since it means that there are two valid RAX values to do ioctl(2) and two invalid RAX values. The valid values are 16 (i.e. ioctl(2) using the x86_64 ABI) and (514 | 0x40000000) (i.e. ioctl(2) using the x32 ABI). The invalid values are 514 and (16 | 0x40000000). 514 will enter the "COMPAT_SYSCALL_DEFINE3(ioctl, ...)" entry point with in_compat_syscall() and in_x32_syscall() returning false, whereas (16 | 0x40000000) will enter the native entry point with in_compat_syscall() and in_x32_syscall() returning true. Both are bogus, and both will exercise code paths in the kernel and in any running seccomp filters that really ought to be unreachable. Splitting out the x32 syscalls into their own table allows both bogus invocations to return -ENOSYS. I've checked glibc, musl, and Bionic, and all of them appear to call syscalls with their correct numbers, so this change should have no effect on them. There is an added benefit going forward: new syscalls that need special handling on x32 can share the same number on x32 and x86_64. This means that the special syscall range 512-547 can be treated as a legacy wart instead of something that may need to be extended in the future. Also add a selftest to verify the new behavior. Signed-off-by: Andy Lutomirski <[email protected]> Signed-off-by: Thomas Gleixner <[email protected]> Link: https://lkml.kernel.org/r/208024256b764312598f014ebfb0a42472c19354.1562185330.git.luto@kernel.org
1 parent f85a857 commit 6365b84

File tree

8 files changed

+163
-27
lines changed

8 files changed

+163
-27
lines changed

arch/x86/entry/common.c

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -285,15 +285,16 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
285285
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
286286
nr = syscall_trace_enter(regs);
287287

288-
/*
289-
* NB: Native and x32 syscalls are dispatched from the same
290-
* table. The only functional difference is the x32 bit in
291-
* regs->orig_ax, which changes the behavior of some syscalls.
292-
*/
293-
nr &= __SYSCALL_MASK;
294288
if (likely(nr < NR_syscalls)) {
295289
nr = array_index_nospec(nr, NR_syscalls);
296290
regs->ax = sys_call_table[nr](regs);
291+
#ifdef CONFIG_X86_X32_ABI
292+
} else if (likely((nr & __X32_SYSCALL_BIT) &&
293+
(nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
294+
nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
295+
X32_NR_syscalls);
296+
regs->ax = x32_sys_call_table[nr](regs);
297+
#endif
297298
}
298299

299300
syscall_return_slowpath(regs);

arch/x86/entry/syscall_64.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,13 @@
1010
/* this is a lie, but it does not hurt as sys_ni_syscall just returns -ENOSYS */
1111
extern asmlinkage long sys_ni_syscall(const struct pt_regs *);
1212
#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *);
13+
#define __SYSCALL_X32(nr, sym, qual) __SYSCALL_64(nr, sym, qual)
1314
#include <asm/syscalls_64.h>
1415
#undef __SYSCALL_64
16+
#undef __SYSCALL_X32
1517

1618
#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
19+
#define __SYSCALL_X32(nr, sym, qual)
1720

1821
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
1922
/*
@@ -23,3 +26,25 @@ asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
2326
[0 ... __NR_syscall_max] = &sys_ni_syscall,
2427
#include <asm/syscalls_64.h>
2528
};
29+
30+
#undef __SYSCALL_64
31+
#undef __SYSCALL_X32
32+
33+
#ifdef CONFIG_X86_X32_ABI
34+
35+
#define __SYSCALL_64(nr, sym, qual)
36+
#define __SYSCALL_X32(nr, sym, qual) [nr] = sym,
37+
38+
asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_syscall_x32_max+1] = {
39+
/*
40+
* Smells like a compiler bug -- it doesn't work
41+
* when the & below is removed.
42+
*/
43+
[0 ... __NR_syscall_x32_max] = &sys_ni_syscall,
44+
#include <asm/syscalls_64.h>
45+
};
46+
47+
#undef __SYSCALL_64
48+
#undef __SYSCALL_X32
49+
50+
#endif

arch/x86/entry/syscalls/syscalltbl.sh

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
1-
#!/bin/sh
1+
#!/bin/bash
22
# SPDX-License-Identifier: GPL-2.0
33

44
in="$1"
55
out="$2"
66

77
syscall_macro() {
8-
abi="$1"
9-
nr="$2"
10-
entry="$3"
8+
local abi="$1"
9+
local nr="$2"
10+
local entry="$3"
1111

1212
# Entry can be either just a function name or "function/qualifier"
1313
real_entry="${entry%%/*}"
@@ -21,11 +21,11 @@ syscall_macro() {
2121
}
2222

2323
emit() {
24-
abi="$1"
25-
nr="$2"
26-
entry="$3"
27-
compat="$4"
28-
umlentry=""
24+
local abi="$1"
25+
local nr="$2"
26+
local entry="$3"
27+
local compat="$4"
28+
local umlentry=""
2929

3030
if [ "$abi" != "I386" -a -n "$compat" ]; then
3131
echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2
@@ -62,14 +62,17 @@ grep '^[0-9]' "$in" | sort -n | (
6262
while read nr abi name entry compat; do
6363
abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
6464
if [ "$abi" = "COMMON" -o "$abi" = "64" ]; then
65-
# COMMON is the same as 64, except that we don't expect X32
66-
# programs to use it. Our expectation has nothing to do with
67-
# any generated code, so treat them the same.
6865
emit 64 "$nr" "$entry" "$compat"
66+
if [ "$abi" = "COMMON" ]; then
67+
# COMMON means that this syscall exists in the same form for
68+
# 64-bit and X32.
69+
echo "#ifdef CONFIG_X86_X32_ABI"
70+
emit X32 "$nr" "$entry" "$compat"
71+
echo "#endif"
72+
fi
6973
elif [ "$abi" = "X32" ]; then
70-
# X32 is equivalent to 64 on an X32-compatible kernel.
7174
echo "#ifdef CONFIG_X86_X32_ABI"
72-
emit 64 "$nr" "$entry" "$compat"
75+
emit X32 "$nr" "$entry" "$compat"
7376
echo "#endif"
7477
elif [ "$abi" = "I386" ]; then
7578
emit "$abi" "$nr" "$entry" "$compat"

arch/x86/include/asm/syscall.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ extern const sys_call_ptr_t sys_call_table[];
3636
extern const sys_call_ptr_t ia32_sys_call_table[];
3737
#endif
3838

39+
#ifdef CONFIG_X86_X32_ABI
40+
extern const sys_call_ptr_t x32_sys_call_table[];
41+
#endif
42+
3943
/*
4044
* Only the low 32 bits of orig_ax are meaningful, so we return int.
4145
* This importantly ignores the high bits on 64-bit, so comparisons

arch/x86/include/asm/unistd.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,6 @@
55
#include <uapi/asm/unistd.h>
66

77

8-
# ifdef CONFIG_X86_X32_ABI
9-
# define __SYSCALL_MASK (~(__X32_SYSCALL_BIT))
10-
# else
11-
# define __SYSCALL_MASK (~0)
12-
# endif
13-
148
# ifdef CONFIG_X86_32
159

1610
# include <asm/unistd_32.h>

arch/x86/kernel/asm-offsets_64.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,28 @@
66
#include <asm/ia32.h>
77

88
#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
9+
#define __SYSCALL_X32(nr, sym, qual)
910
static char syscalls_64[] = {
1011
#include <asm/syscalls_64.h>
1112
};
13+
#undef __SYSCALL_64
14+
#undef __SYSCALL_X32
15+
16+
#ifdef CONFIG_X86_X32_ABI
17+
#define __SYSCALL_64(nr, sym, qual)
18+
#define __SYSCALL_X32(nr, sym, qual) [nr] = 1,
19+
static char syscalls_x32[] = {
20+
#include <asm/syscalls_64.h>
21+
};
22+
#undef __SYSCALL_64
23+
#undef __SYSCALL_X32
24+
#endif
25+
1226
#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
1327
static char syscalls_ia32[] = {
1428
#include <asm/syscalls_32.h>
1529
};
30+
#undef __SYSCALL_I386
1631

1732
#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
1833
#include <asm/kvm_para.h>
@@ -80,6 +95,11 @@ int main(void)
8095
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
8196
DEFINE(NR_syscalls, sizeof(syscalls_64));
8297

98+
#ifdef CONFIG_X86_X32_ABI
99+
DEFINE(__NR_syscall_x32_max, sizeof(syscalls_x32) - 1);
100+
DEFINE(X32_NR_syscalls, sizeof(syscalls_x32));
101+
#endif
102+
83103
DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1);
84104
DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
85105

tools/testing/selftests/x86/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap
1717
TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
1818
test_FCMOV test_FCOMI test_FISTTP \
1919
vdso_restorer
20-
TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip
20+
TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering
2121
# Some selftests require 32bit support enabled also on 64bit systems
2222
TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall
2323

tools/testing/selftests/x86/syscall_numbering.c

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
/*
3+
* syscall_numbering.c - tests that invalid x86-64/x32 syscall numbers return ENOSYS
4+
* Copyright (c) 2018 Andrew Lutomirski
5+
*/
6+
7+
#define _GNU_SOURCE
8+
9+
#include <stdlib.h>
10+
#include <stdio.h>
11+
#include <stdbool.h>
12+
#include <errno.h>
13+
#include <unistd.h>
14+
#include <syscall.h>
15+
16+
static int nerrs;
17+
18+
#define X32_BIT 0x40000000UL
19+
20+
/*
 * Issue raw syscall number @nr with six zero arguments and verify that the
 * call fails with ENOSYS.
 *
 * @nr: syscall number, handed to syscall() unmodified.
 * @ok: set to false when the call does not fail with ENOSYS.  It is never
 *      set back to true, so one flag can accumulate the verdict of many
 *      calls.
 */
static void check_enosys(unsigned long nr, bool *ok)
{
	/*
	 * If this fails, a segfault is reasonably likely, so push out any
	 * buffered output before making the call.
	 */
	fflush(stdout);

	/*
	 * syscall() signals failure only by returning -1 with errno set.
	 * Every other return value (zero or positive) means the call went
	 * through; errno is stale in that case and must not be consulted.
	 */
	errno = 0;
	long ret = syscall(nr, 0, 0, 0, 0, 0, 0);
	if (ret != -1) {
		printf("[FAIL]\tsyscall %lu succeeded, but it should have failed\n", nr);
		*ok = false;
	} else if (errno != ENOSYS) {
		printf("[FAIL]\tsyscall %lu had error code %d, but it should have reported ENOSYS\n", nr, errno);
		*ok = false;
	}
}
34+
35+
static void test_x32_without_x32_bit(void)
36+
{
37+
bool ok = true;
38+
39+
/*
40+
* Syscalls 512-547 are "x32" syscalls. They are intended to be
41+
* called with the x32 (0x40000000) bit set. Calling them without
42+
* the x32 bit set is nonsense and should not work.
43+
*/
44+
printf("[RUN]\tChecking syscalls 512-547\n");
45+
for (int i = 512; i <= 547; i++)
46+
check_enosys(i, &ok);
47+
48+
/*
49+
* Check that a handful of 64-bit-only syscalls are rejected if the x32
50+
* bit is set.
51+
*/
52+
printf("[RUN]\tChecking some 64-bit syscalls in x32 range\n");
53+
check_enosys(16 | X32_BIT, &ok); /* ioctl */
54+
check_enosys(19 | X32_BIT, &ok); /* readv */
55+
check_enosys(20 | X32_BIT, &ok); /* writev */
56+
57+
/*
58+
* Check some syscalls with high bits set.
59+
*/
60+
printf("[RUN]\tChecking numbers above 2^32-1\n");
61+
check_enosys((1UL << 32), &ok);
62+
check_enosys(X32_BIT | (1UL << 32), &ok);
63+
64+
if (!ok)
65+
nerrs++;
66+
else
67+
printf("[OK]\tThey all returned -ENOSYS\n");
68+
}
69+
70+
int main()
71+
{
72+
/*
73+
* Anyone diagnosing a failure will want to know whether the kernel
74+
* supports x32. Tell them.
75+
*/
76+
printf("\tChecking for x32...");
77+
fflush(stdout);
78+
if (syscall(39 | X32_BIT, 0, 0, 0, 0, 0, 0) >= 0) {
79+
printf(" supported\n");
80+
} else if (errno == ENOSYS) {
81+
printf(" not supported\n");
82+
} else {
83+
printf(" confused\n");
84+
}
85+
86+
test_x32_without_x32_bit();
87+
88+
return nerrs ? 1 : 0;
89+
}

0 commit comments

Comments
 (0)