Skip to content

Commit d97b46a

Browse files
Cyrill Gorcunovtorvalds
authored andcommitted
syscalls, x86: add __NR_kcmp syscall
While doing checkpoint-restore in user space one needs to determine whether various kernel objects (like mm_struct-s or file_struct-s) are shared between tasks, and to restore this state. The 2nd step can be solved by using the appropriate CLONE_ flags and the unshare syscall, while there is currently no way of solving the 1st one. One way of checking whether two tasks share e.g. an mm_struct is to expose some mm_struct ID of a task in its proc file, but showing such info is considered to be not that good for security reasons. Thus after some debate we came to the conclusion that a so-called 'comparison' syscall might be the best candidate. So here it is -- __NR_kcmp. It takes up to 5 arguments - the pids of the two tasks (whose characteristics should be compared), the comparison type and (in case of comparison of files) two file descriptors. Lookups for pids are done in the caller's PID namespace only. At the moment only x86 is supported and tested. [[email protected]: fix up selftests, warnings] [[email protected]: include errno.h] [[email protected]: tweak comment text] Signed-off-by: Cyrill Gorcunov <[email protected]> Acked-by: "Eric W. Biederman" <[email protected]> Cc: Pavel Emelyanov <[email protected]> Cc: Andrey Vagin <[email protected]> Cc: KOSAKI Motohiro <[email protected]> Cc: Ingo Molnar <[email protected]> Cc: H. Peter Anvin <[email protected]> Cc: Thomas Gleixner <[email protected]> Cc: Glauber Costa <[email protected]> Cc: Andi Kleen <[email protected]> Cc: Tejun Heo <[email protected]> Cc: Matt Helsley <[email protected]> Cc: Pekka Enberg <[email protected]> Cc: Eric Dumazet <[email protected]> Cc: Vasiliy Kulikov <[email protected]> Cc: Alexey Dobriyan <[email protected]> Cc: [email protected] Cc: Michal Marek <[email protected]> Cc: Frederic Weisbecker <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 8184116 commit d97b46a

File tree

10 files changed

+348
-1
lines changed

10 files changed

+348
-1
lines changed

arch/x86/syscalls/syscall_32.tbl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,3 +355,4 @@
355355
346 i386 setns sys_setns
356356
347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
357357
348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
358+
349 i386 kcmp sys_kcmp

arch/x86/syscalls/syscall_64.tbl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,8 @@
318318
309 common getcpu sys_getcpu
319319
310 64 process_vm_readv sys_process_vm_readv
320320
311 64 process_vm_writev sys_process_vm_writev
321+
312 64 kcmp sys_kcmp
322+
321323
#
322324
# x32-specific system call numbers start at 512 to avoid cache impact
323325
# for native 64-bit operation.

include/linux/kcmp.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#ifndef _LINUX_KCMP_H
2+
#define _LINUX_KCMP_H
3+
4+
/*
 * Comparison type: selects which per-task kernel object sys_kcmp()
 * compares between the two tasks.  The values are part of the syscall
 * interface, so they are spelled out explicitly.
 */
enum kcmp_type {
	KCMP_FILE	= 0,	/* struct file behind fds idx1/idx2 */
	KCMP_VM		= 1,	/* task->mm */
	KCMP_FILES	= 2,	/* task->files */
	KCMP_FS		= 3,	/* task->fs */
	KCMP_SIGHAND	= 4,	/* task->sighand */
	KCMP_IO		= 5,	/* task->io_context */
	KCMP_SYSVSEM	= 6,	/* task->sysvsem.undo_list */

	KCMP_TYPES	= 7,	/* number of comparison types, not a type itself */
};
16+
17+
#endif /* _LINUX_KCMP_H */

include/linux/syscalls.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -858,4 +858,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
858858
unsigned long riovcnt,
859859
unsigned long flags);
860860

861+
asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
862+
unsigned long idx1, unsigned long idx2);
861863
#endif

kernel/Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ endif
2525
obj-y += sched/
2626
obj-y += power/
2727

28+
ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
29+
obj-$(CONFIG_X86) += kcmp.o
30+
endif
2831
obj-$(CONFIG_FREEZER) += freezer.o
2932
obj-$(CONFIG_PROFILING) += profile.o
3033
obj-$(CONFIG_STACKTRACE) += stacktrace.o

kernel/kcmp.c

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
#include <linux/kernel.h>
2+
#include <linux/syscalls.h>
3+
#include <linux/fdtable.h>
4+
#include <linux/string.h>
5+
#include <linux/random.h>
6+
#include <linux/module.h>
7+
#include <linux/init.h>
8+
#include <linux/errno.h>
9+
#include <linux/cache.h>
10+
#include <linux/bug.h>
11+
#include <linux/err.h>
12+
#include <linux/kcmp.h>
13+
14+
#include <asm/unistd.h>
15+
16+
/*
17+
* We don't expose the real in-memory order of objects for security reasons.
18+
* But still the comparison results should be suitable for sorting. So we
19+
* obfuscate kernel pointer values and compare the products instead.
20+
*
21+
* The obfuscation is done in two steps. First we xor the kernel pointer with
22+
* a random value, which puts pointer into a new position in a reordered space.
23+
* Secondly we multiply the xor product with a large odd random number to
24+
* permute its bits even more (the odd multiplier guarantees that the product
25+
* is unique even after the high bits are truncated, since any odd number is
26+
* relatively prime to 2^n).
27+
*
28+
* Note also that the obfuscation itself is invisible to userspace and if needed
29+
* it can be changed to an alternate scheme.
30+
*/
31+
static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
32+
33+
static long kptr_obfuscate(long v, int type)
34+
{
35+
return (v ^ cookies[type][0]) * cookies[type][1];
36+
}
37+
38+
/*
39+
* 0 - equal, i.e. v1 = v2
40+
* 1 - less than, i.e. v1 < v2
41+
* 2 - greater than, i.e. v1 > v2
42+
* 3 - not equal but ordering unavailable (reserved for future)
43+
*/
44+
static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
45+
{
46+
long ret;
47+
48+
ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
49+
50+
return (ret < 0) | ((ret > 0) << 1);
51+
}
52+
53+
/* The caller must have pinned the task */
54+
static struct file *
55+
get_file_raw_ptr(struct task_struct *task, unsigned int idx)
56+
{
57+
struct file *file = NULL;
58+
59+
task_lock(task);
60+
rcu_read_lock();
61+
62+
if (task->files)
63+
file = fcheck_files(task->files, idx);
64+
65+
rcu_read_unlock();
66+
task_unlock(task);
67+
68+
return file;
69+
}
70+
71+
static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
72+
{
73+
if (likely(m2 != m1))
74+
mutex_unlock(m2);
75+
mutex_unlock(m1);
76+
}
77+
78+
static int kcmp_lock(struct mutex *m1, struct mutex *m2)
79+
{
80+
int err;
81+
82+
if (m2 > m1)
83+
swap(m1, m2);
84+
85+
err = mutex_lock_killable(m1);
86+
if (!err && likely(m1 != m2)) {
87+
err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
88+
if (err)
89+
mutex_unlock(m1);
90+
}
91+
92+
return err;
93+
}
94+
95+
SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
96+
unsigned long, idx1, unsigned long, idx2)
97+
{
98+
struct task_struct *task1, *task2;
99+
int ret;
100+
101+
rcu_read_lock();
102+
103+
/*
104+
* Tasks are looked up in caller's PID namespace only.
105+
*/
106+
task1 = find_task_by_vpid(pid1);
107+
task2 = find_task_by_vpid(pid2);
108+
if (!task1 || !task2)
109+
goto err_no_task;
110+
111+
get_task_struct(task1);
112+
get_task_struct(task2);
113+
114+
rcu_read_unlock();
115+
116+
/*
117+
* One should have enough rights to inspect task details.
118+
*/
119+
ret = kcmp_lock(&task1->signal->cred_guard_mutex,
120+
&task2->signal->cred_guard_mutex);
121+
if (ret)
122+
goto err;
123+
if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
124+
!ptrace_may_access(task2, PTRACE_MODE_READ)) {
125+
ret = -EPERM;
126+
goto err_unlock;
127+
}
128+
129+
switch (type) {
130+
case KCMP_FILE: {
131+
struct file *filp1, *filp2;
132+
133+
filp1 = get_file_raw_ptr(task1, idx1);
134+
filp2 = get_file_raw_ptr(task2, idx2);
135+
136+
if (filp1 && filp2)
137+
ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
138+
else
139+
ret = -EBADF;
140+
break;
141+
}
142+
case KCMP_VM:
143+
ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
144+
break;
145+
case KCMP_FILES:
146+
ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
147+
break;
148+
case KCMP_FS:
149+
ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
150+
break;
151+
case KCMP_SIGHAND:
152+
ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
153+
break;
154+
case KCMP_IO:
155+
ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
156+
break;
157+
case KCMP_SYSVSEM:
158+
#ifdef CONFIG_SYSVIPC
159+
ret = kcmp_ptr(task1->sysvsem.undo_list,
160+
task2->sysvsem.undo_list,
161+
KCMP_SYSVSEM);
162+
#else
163+
ret = -EOPNOTSUPP;
164+
#endif
165+
break;
166+
default:
167+
ret = -EINVAL;
168+
break;
169+
}
170+
171+
err_unlock:
172+
kcmp_unlock(&task1->signal->cred_guard_mutex,
173+
&task2->signal->cred_guard_mutex);
174+
err:
175+
put_task_struct(task1);
176+
put_task_struct(task2);
177+
178+
return ret;
179+
180+
err_no_task:
181+
rcu_read_unlock();
182+
return -ESRCH;
183+
}
184+
185+
static __init int kcmp_cookies_init(void)
186+
{
187+
int i;
188+
189+
get_random_bytes(cookies, sizeof(cookies));
190+
191+
for (i = 0; i < KCMP_TYPES; i++)
192+
cookies[i][1] |= (~(~0UL >> 1) | 1);
193+
194+
return 0;
195+
}
196+
arch_initcall(kcmp_cookies_init);

kernel/sys_ni.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark);
203203
cond_syscall(sys_name_to_handle_at);
204204
cond_syscall(sys_open_by_handle_at);
205205
cond_syscall(compat_sys_open_by_handle_at);
206+
207+
/* compare kernel pointers */
208+
cond_syscall(sys_kcmp);

tools/testing/selftests/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
TARGETS = breakpoints mqueue vm
1+
TARGETS = breakpoints kcmp mqueue vm
22

33
all:
44
for TARGET in $(TARGETS); do \

tools/testing/selftests/kcmp/Makefile

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Build and run the kcmp selftest (x86 only).
#
# The binary is named kcmp_test so that the name built by "all" is the
# same one executed by "run-tests", and "clean" removes both the binary
# and the scratch file (kcmp-test-file) that the test creates.

uname_M := $(shell uname -m 2>/dev/null || echo not)
ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
ifeq ($(ARCH),i386)
ARCH := X86
CFLAGS := -DCONFIG_X86_32 -D__i386__
endif
ifeq ($(ARCH),x86_64)
ARCH := X86
CFLAGS := -DCONFIG_X86_64 -D__x86_64__
endif

CFLAGS += -I../../../../arch/x86/include/generated/
CFLAGS += -I../../../../include/
CFLAGS += -I../../../../usr/include/
CFLAGS += -I../../../../arch/x86/include/

.PHONY: all run-tests clean

all:
ifeq ($(ARCH),X86)
	gcc $(CFLAGS) kcmp_test.c -o kcmp_test
else
	echo "Not an x86 target, can't build kcmp selftest"
endif

run-tests: all
	./kcmp_test

clean:
	rm -f ./kcmp_test
	rm -f ./kcmp-test-file
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
#define _GNU_SOURCE
2+
3+
#include <stdio.h>
4+
#include <stdlib.h>
5+
#include <signal.h>
6+
#include <limits.h>
7+
#include <unistd.h>
8+
#include <errno.h>
9+
#include <string.h>
10+
#include <fcntl.h>
11+
12+
#include <linux/unistd.h>
13+
#include <linux/kcmp.h>
14+
15+
#include <sys/syscall.h>
16+
#include <sys/types.h>
17+
#include <sys/stat.h>
18+
#include <sys/wait.h>
19+
20+
/*
 * Issue the raw kcmp system call through syscall() and hand back
 * whatever syscall() returned.
 */
static long sys_kcmp(int pid1, int pid2, int type, int fd1, int fd2)
{
	long rc;

	rc = syscall(__NR_kcmp, pid1, pid2, type, fd1, fd2);

	return rc;
}
24+
25+
int main(int argc, char **argv)
26+
{
27+
const char kpath[] = "kcmp-test-file";
28+
int pid1, pid2;
29+
int fd1, fd2;
30+
int status;
31+
32+
fd1 = open(kpath, O_RDWR | O_CREAT | O_TRUNC, 0644);
33+
pid1 = getpid();
34+
35+
if (fd1 < 0) {
36+
perror("Can't create file");
37+
exit(1);
38+
}
39+
40+
pid2 = fork();
41+
if (pid2 < 0) {
42+
perror("fork failed");
43+
exit(1);
44+
}
45+
46+
if (!pid2) {
47+
int pid2 = getpid();
48+
int ret;
49+
50+
fd2 = open(kpath, O_RDWR, 0644);
51+
if (fd2 < 0) {
52+
perror("Can't open file");
53+
exit(1);
54+
}
55+
56+
/* An example of output and arguments */
57+
printf("pid1: %6d pid2: %6d FD: %2ld FILES: %2ld VM: %2ld "
58+
"FS: %2ld SIGHAND: %2ld IO: %2ld SYSVSEM: %2ld "
59+
"INV: %2ld\n",
60+
pid1, pid2,
61+
sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd2),
62+
sys_kcmp(pid1, pid2, KCMP_FILES, 0, 0),
63+
sys_kcmp(pid1, pid2, KCMP_VM, 0, 0),
64+
sys_kcmp(pid1, pid2, KCMP_FS, 0, 0),
65+
sys_kcmp(pid1, pid2, KCMP_SIGHAND, 0, 0),
66+
sys_kcmp(pid1, pid2, KCMP_IO, 0, 0),
67+
sys_kcmp(pid1, pid2, KCMP_SYSVSEM, 0, 0),
68+
69+
/* This one should fail */
70+
sys_kcmp(pid1, pid2, KCMP_TYPES + 1, 0, 0));
71+
72+
/* This one should return same fd */
73+
ret = sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd1);
74+
if (ret) {
75+
printf("FAIL: 0 expected but %d returned\n", ret);
76+
ret = -1;
77+
} else
78+
printf("PASS: 0 returned as expected\n");
79+
80+
/* Compare with self */
81+
ret = sys_kcmp(pid1, pid1, KCMP_VM, 0, 0);
82+
if (ret) {
83+
printf("FAIL: 0 expected but %li returned\n", ret);
84+
ret = -1;
85+
} else
86+
printf("PASS: 0 returned as expected\n");
87+
88+
exit(ret);
89+
}
90+
91+
waitpid(pid2, &status, P_ALL);
92+
93+
return 0;
94+
}

0 commit comments

Comments
 (0)