
Commit 0c8c0f0

hansendc authored and Ingo Molnar committed
x86/fpu, sched: Dynamically allocate 'struct fpu'
The FPU rewrite removed the dynamic allocations of 'struct fpu'.
But, this potentially wastes massive amounts of memory (2k per task
on systems that do not have AVX-512, for instance).

Instead of having a separate slab, this patch just appends the space
that we need to the 'task_struct', which we dynamically allocate
already. This saves us from doing an extra slab allocation at fork().

The only real downside here is that we have to stick everything at
the end of the task_struct. But I think the BUILD_BUG_ON()s I stuck
in there should keep that from being too fragile.

Signed-off-by: Dave Hansen <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Brian Gerst <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Denys Vlasenko <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
1 parent a97439a commit 0c8c0f0

7 files changed: +104 −43 lines
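The pattern this commit relies on — a structure whose final member has a size known only at run time — can be sketched in plain C. The sketch below is an illustrative userspace model, not kernel code; 'struct fake_task', 'struct regs_state', and the 832-byte runtime size are invented for the example.

#include <stdlib.h>
#include <stddef.h>

/* Stand-in for 'union fpregs_state', declared at a worst-case size. */
struct regs_state { unsigned char buf[4096]; };

/* Stand-in for 'task_struct': the variable-sized member must be last. */
struct fake_task {
	int pid;
	struct regs_state state;	/* nothing may be placed after this */
};

/* The kernel probes this at boot; hard-coded here for the sketch. */
static const size_t runtime_state_size = 832;

static size_t fake_task_size(void)
{
	/* Swap the static worst-case size for the real runtime size. */
	return sizeof(struct fake_task)
		- sizeof(((struct fake_task *)0)->state)
		+ runtime_state_size;
}

int main(void)
{
	/* One allocation covers the struct plus its trailing state. */
	struct fake_task *t = calloc(1, fake_task_size());

	free(t);
	return 0;
}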

arch/x86/include/asm/fpu/types.h

Lines changed: 38 additions & 34 deletions
@@ -189,6 +189,7 @@ union fpregs_state {
 	struct fxregs_state	fxsave;
 	struct swregs_state	soft;
 	struct xregs_state	xsave;
+	u8			__padding[PAGE_SIZE];
 };
 
 /*
@@ -197,40 +198,6 @@ union fpregs_state {
  * state fields:
  */
 struct fpu {
-	/*
-	 * @state:
-	 *
-	 * In-memory copy of all FPU registers that we save/restore
-	 * over context switches. If the task is using the FPU then
-	 * the registers in the FPU are more recent than this state
-	 * copy. If the task context-switches away then they get
-	 * saved here and represent the FPU state.
-	 *
-	 * After context switches there may be a (short) time period
-	 * during which the in-FPU hardware registers are unchanged
-	 * and still perfectly match this state, if the tasks
-	 * scheduled afterwards are not using the FPU.
-	 *
-	 * This is the 'lazy restore' window of optimization, which
-	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
-	 *
-	 * We detect whether a subsequent task uses the FPU via setting
-	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
-	 *
-	 * During this window, if the task gets scheduled again, we
-	 * might be able to skip having to do a restore from this
-	 * memory buffer to the hardware registers - at the cost of
-	 * incurring the overhead of #NM fault traps.
-	 *
-	 * Note that on modern CPUs that support the XSAVEOPT (or other
-	 * optimized XSAVE instructions), we don't use #NM traps anymore,
-	 * as the hardware can track whether FPU registers need saving
-	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
-	 * logic, which unconditionally saves/restores all FPU state
-	 * across context switches. (if FPU state exists.)
-	 */
-	union fpregs_state	state;
-
 	/*
	 * @last_cpu:
	 *
@@ -288,6 +255,43 @@ struct fpu {
 	 * deal with bursty apps that only use the FPU for a short time:
 	 */
 	unsigned char		counter;
+	/*
+	 * @state:
+	 *
+	 * In-memory copy of all FPU registers that we save/restore
+	 * over context switches. If the task is using the FPU then
+	 * the registers in the FPU are more recent than this state
+	 * copy. If the task context-switches away then they get
+	 * saved here and represent the FPU state.
+	 *
+	 * After context switches there may be a (short) time period
+	 * during which the in-FPU hardware registers are unchanged
+	 * and still perfectly match this state, if the tasks
+	 * scheduled afterwards are not using the FPU.
+	 *
+	 * This is the 'lazy restore' window of optimization, which
+	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+	 *
+	 * We detect whether a subsequent task uses the FPU via setting
+	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+	 *
+	 * During this window, if the task gets scheduled again, we
+	 * might be able to skip having to do a restore from this
+	 * memory buffer to the hardware registers - at the cost of
+	 * incurring the overhead of #NM fault traps.
+	 *
+	 * Note that on modern CPUs that support the XSAVEOPT (or other
+	 * optimized XSAVE instructions), we don't use #NM traps anymore,
+	 * as the hardware can track whether FPU registers need saving
+	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+	 * logic, which unconditionally saves/restores all FPU state
+	 * across context switches. (if FPU state exists.)
+	 */
+	union fpregs_state	state;
+	/*
+	 * WARNING: 'state' is dynamically-sized. Do not put
+	 * anything after it here.
+	 */
 };
 
 #endif /* _ASM_X86_FPU_H */
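One subtlety in this hunk: a C union is as large as its largest member, so the new page-sized '__padding' member turns 'sizeof(union fpregs_state)' into a compile-time worst-case bound on any runtime xstate layout; arch_task_struct_size() (in the init.c change below) then trims the allocation back down to what the hardware actually needs. A tiny demonstration of the sizing rule, with hypothetical member sizes:

#include <stdio.h>

union demo_state {
	unsigned char fxsave[512];	/* legacy FXSAVE image          */
	unsigned char xsave[2112];	/* some hypothetical xstate set */
	unsigned char __padding[4096];	/* worst-case, page-sized cap   */
};

int main(void)
{
	/* A union is as large as its largest member: prints 4096. */
	printf("%zu\n", sizeof(union demo_state));
	return 0;
}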

arch/x86/include/asm/processor.h

Lines changed: 7 additions & 3 deletions
@@ -390,9 +390,6 @@ struct thread_struct {
 #endif
 	unsigned long		gs;
 
-	/* Floating point and extended processor state */
-	struct fpu		fpu;
-
 	/* Save middle states of ptrace breakpoints */
 	struct perf_event	*ptrace_bps[HBP_NUM];
 	/* Debug status used for traps, single steps, etc... */
@@ -418,6 +415,13 @@ struct thread_struct {
 	unsigned long		iopl;
 	/* Max allowed port in the bitmap, in bytes: */
 	unsigned		io_bitmap_max;
+
+	/* Floating point and extended processor state */
+	struct fpu		fpu;
+	/*
+	 * WARNING: 'fpu' is dynamically-sized. It *MUST* be at
+	 * the end.
+	 */
 };
 
 /*

arch/x86/kernel/fpu/init.c

Lines changed: 39 additions & 0 deletions
@@ -136,6 +136,45 @@ static void __init fpu__init_system_generic(void)
 unsigned int xstate_size;
 EXPORT_SYMBOL_GPL(xstate_size);
 
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER)		\
+	BUILD_BUG_ON((sizeof(TYPE) -			\
+		      offsetof(TYPE, MEMBER) -		\
+		      sizeof(((TYPE *)0)->MEMBER)) >	\
+		      0)
+
+/*
+ * We append the 'struct fpu' to the task_struct.
+ */
+int __weak arch_task_struct_size(void)
+{
+	int task_size = sizeof(struct task_struct);
+
+	/*
+	 * Subtract off the static size of the register state.
+	 * It potentially has a bunch of padding.
+	 */
+	task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+
+	/*
+	 * Add back the dynamically-calculated register state
+	 * size.
+	 */
+	task_size += xstate_size;
+
+	/*
+	 * We dynamically size 'struct fpu', so we require that
+	 * it be at the end of 'thread_struct' and that
+	 * 'thread_struct' be at the end of 'task_struct'. If
+	 * you hit a compile error here, check the structure to
+	 * see if something got added to the end.
+	 */
+	CHECK_MEMBER_AT_END_OF(struct fpu, state);
+	CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
+	CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
+
+	return task_size;
+}
+
 /*
  * Set up the xstate_size based on the legacy FPU context size.
  *
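CHECK_MEMBER_AT_END_OF() works by computing how many bytes sit after MEMBER inside TYPE and breaking the build if there are any. A standalone userspace rendering of the same check, with C11 _Static_assert standing in for the kernel's BUILD_BUG_ON() and throwaway struct names:

#include <stddef.h>

#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER)			\
	_Static_assert(sizeof(TYPE) -				\
		       offsetof(TYPE, MEMBER) -			\
		       sizeof(((TYPE *)0)->MEMBER) == 0,	\
		       #MEMBER " must be the last member of " #TYPE)

struct ok  { int a; char tail[12]; };	/* 0 bytes after 'tail' */
struct bad { char tail[12]; int a; };	/* 4 bytes after 'tail' */

CHECK_MEMBER_AT_END_OF(struct ok, tail);	/* compiles          */
/* CHECK_MEMBER_AT_END_OF(struct bad, tail);	   breaks the build */

int main(void) { return 0; }

Note that compiler-inserted tail padding counts against the check too, so it can fire even when the member is textually last.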

arch/x86/kernel/process.c

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
  */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	*dst = *src;
+	memcpy(dst, src, arch_task_struct_size());
 
 	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
 }
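The reason for swapping the struct assignment for memcpy(): '*dst = *src' copies exactly sizeof(struct task_struct) bytes, which no longer includes the FPU state living past the struct's compile-time end. A hypothetical miniature of the bug this avoids ('mini_task' and the 832-byte state are invented for illustration):

#include <stdlib.h>
#include <string.h>

struct mini_task {
	long flags;
	unsigned char state[1];	/* really extends 832 bytes at run time */
};

static size_t mini_task_size(void)
{
	return sizeof(struct mini_task) - 1 + 832;
}

static void mini_dup(struct mini_task *dst, const struct mini_task *src)
{
	/* '*dst = *src' would stop at sizeof(struct mini_task) and
	 * silently truncate the trailing state; copying the runtime
	 * size takes all of it. */
	memcpy(dst, src, mini_task_size());
}

int main(void)
{
	struct mini_task *a = calloc(1, mini_task_size());
	struct mini_task *b = calloc(1, mini_task_size());

	mini_dup(b, a);
	free(a);
	free(b);
	return 0;
}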

fs/proc/kcore.c

Lines changed: 2 additions & 2 deletions
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
 			roundup(sizeof(CORE_STR), 4)) +
 			roundup(sizeof(struct elf_prstatus), 4) +
 			roundup(sizeof(struct elf_prpsinfo), 4) +
-			roundup(sizeof(struct task_struct), 4);
+			roundup(arch_task_struct_size(), 4);
 	*elf_buflen = PAGE_ALIGN(*elf_buflen);
 	return size + *elf_buflen;
 }
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	/* set up the task structure */
 	notes[2].name	= CORE_STR;
 	notes[2].type	= NT_TASKSTRUCT;
-	notes[2].datasz	= sizeof(struct task_struct);
+	notes[2].datasz	= arch_task_struct_size();
 	notes[2].data	= current;
 
 	nhdr->p_filesz	+= notesize(&notes[2]);

include/linux/sched.h

Lines changed: 10 additions & 2 deletions
@@ -1522,8 +1522,6 @@ struct task_struct {
 /* hung task detection */
 	unsigned long last_switch_count;
 #endif
-/* CPU-specific state of this task */
-	struct thread_struct thread;
 /* filesystem information */
 	struct fs_struct *fs;
 /* open file information */
@@ -1778,8 +1776,18 @@ struct task_struct {
 	unsigned long task_state_change;
 #endif
 	int pagefault_disabled;
+/* CPU-specific state of this task */
+	struct thread_struct thread;
+/*
+ * WARNING: on x86, 'thread_struct' contains a variable-sized
+ * structure. It *MUST* be at the end of 'task_struct'.
+ *
+ * Do not put anything below here!
+ */
 };
 
+extern int arch_task_struct_size(void);
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 

kernel/fork.c

Lines changed: 7 additions & 1 deletion
@@ -287,15 +287,21 @@ static void set_max_threads(unsigned int max_threads_suggested)
 	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
 }
 
+int __weak arch_task_struct_size(void)
+{
+	return sizeof(struct task_struct);
+}
+
 void __init fork_init(void)
 {
+	int task_struct_size = arch_task_struct_size();
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
-		kmem_cache_create("task_struct", sizeof(struct task_struct),
+		kmem_cache_create("task_struct", task_struct_size,
 			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
 
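The fork.c definition is marked __weak (in the kernel, __weak expands to __attribute__((weak))) so that an architecture can supply its own version and win at link time. A minimal sketch of the weak/strong pattern, with an invented return value:

#include <stdio.h>

/* Weak default: kept only if no strong definition of the same
 * symbol appears anywhere else in the link. */
int __attribute__((weak)) arch_task_struct_size(void)
{
	return 1024;	/* stands in for sizeof(struct task_struct) */
}

int main(void)
{
	/* Prints 1024 when built alone; prints the arch's value when
	 * another object file provides a strong definition. */
	printf("%d\n", arch_task_struct_size());
	return 0;
}

One caveat worth noting: the x86 override in fpu/init.c above is itself declared __weak in this patch, and when two weak definitions meet, the linker is free to keep whichever it encounters first rather than guaranteeing the arch override.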
