
Commit e7b52ff

Alex Shi authored and H. Peter Anvin committed
x86/flush_tlb: try flush_tlb_single one by one in flush_tlb_range
x86 has no flush_tlb_range support at the instruction level. Currently
flush_tlb_range is implemented by flushing the whole page table, which is
not the best solution for all scenarios. In fact, if we just use 'invlpg'
to flush a few lines from the TLB, we can win back performance from the
remaining TLB lines that are accessed later.

But the 'invlpg' instruction costs a lot of time. Its execution time can
compete with a cr3 rewrite, and is even a bit slower on SNB CPUs.

So, on a CPU with 512 4KB TLB entries, the balance point is at:

	(512 - X) * 100ns (assumed TLB refill cost) =
		X (TLB flush entries) * 100ns (assumed invlpg cost)

Here X is 256, i.e. 1/2 of the 512 entries.

But with the mysterious CPU prefetcher and page-miss-handler unit, the
real TLB refill cost is far lower than 100ns for sequential accesses, and
two HT siblings in one core make memory access faster still when they are
accessing the same memory. So in this patch I only make the change when
the number of target entries is below 1/16 of the active TLB entries.
Admittedly, I have no data to support the '1/16' figure, so any
suggestions are welcome.

As for hugetlb, presumably due to the smaller page tables and fewer
active TLB entries, I saw no benefit in my benchmark, so it is left
unoptimized for now.

My micro-benchmark shows that in the ideal scenario read performance
improves by 70 percent, and in the worst scenario read/write performance
is similar to the unpatched 3.4-rc4 kernel.

Here is the read data on my 2P * 4-core * HT NHM-EP machine, with THP
set to 'always'.

Multi-threaded testing; the '-t' parameter is the thread count:

	                        with patch   unpatched 3.4-rc4
	./mprotect -t 1            14ns            24ns
	./mprotect -t 2            13ns            22ns
	./mprotect -t 4            12ns            19ns
	./mprotect -t 8            14ns            16ns
	./mprotect -t 16           28ns            26ns
	./mprotect -t 32           54ns            51ns
	./mprotect -t 128         200ns           199ns

Single process with sequential flushing and memory accesses:

	                                    with patch   unpatched 3.4-rc4
	./mprotect                              7ns            11ns
	./mprotect -p 4096 -l 8 -n 10240       21ns            21ns

[ hpa: http://lkml.kernel.org/r/1B4B44D9196EFF41AE41FDA404FC0A100BFF94@SHSMSX101.ccr.corp.intel.com
  has additional performance numbers. ]

Signed-off-by: Alex Shi <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: H. Peter Anvin <[email protected]>
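As a quick sanity check on the balance-point arithmetic above, here is a
minimal stand-alone user-space sketch. It is not part of the patch; the
100ns costs and the 512-entry TLB size are the assumed figures from the
commit message, and FLUSHALL_BAR mirrors the patch's 1/16 gate.

	#include <stdio.h>

	#define TLB_ENTRIES	512
	#define REFILL_NS	100	/* assumed cost to refill one TLB entry */
	#define INVLPG_NS	100	/* assumed cost of one invlpg */
	#define FLUSHALL_BAR	16	/* patch flushes per-page below 1/16 */

	int main(void)
	{
		int x;

		/* (TLB_ENTRIES - x) * REFILL_NS = x * INVLPG_NS  =>  x = 256 */
		for (x = 0; x <= TLB_ENTRIES; x++)
			if ((TLB_ENTRIES - x) * REFILL_NS <= x * INVLPG_NS)
				break;
		printf("naive break-even: %d of %d entries\n", x, TLB_ENTRIES);

		/* the deliberately conservative bound the patch uses instead */
		printf("FLUSHALL_BAR cut-off: %d entries\n",
		       TLB_ENTRIES / FLUSHALL_BAR);
		return 0;
	}

Because prefetch and HT sharing make real refill much cheaper than the
assumed 100ns, the patch stays well below the naive break-even of 256 and
only does per-page flushes for ranges under 32 entries on such a TLB.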
1 parent e0ba94f commit e7b52ff

File tree: 8 files changed, +114 -49 lines changed

arch/x86/include/asm/paravirt.h
Lines changed: 3 additions & 2 deletions

@@ -397,9 +397,10 @@ static inline void __flush_tlb_single(unsigned long addr)
 
 static inline void flush_tlb_others(const struct cpumask *cpumask,
 				    struct mm_struct *mm,
-				    unsigned long va)
+				    unsigned long start,
+				    unsigned long end)
 {
-	PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va);
+	PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
 }
 
 static inline int paravirt_pgd_alloc(struct mm_struct *mm)

arch/x86/include/asm/paravirt_types.h
Lines changed: 2 additions & 1 deletion

@@ -250,7 +250,8 @@ struct pv_mmu_ops {
 	void (*flush_tlb_single)(unsigned long addr);
 	void (*flush_tlb_others)(const struct cpumask *cpus,
 				 struct mm_struct *mm,
-				 unsigned long va);
+				 unsigned long start,
+				 unsigned long end);
 
 	/* Hooks for allocating and freeing a pagetable top-level */
 	int (*pgd_alloc)(struct mm_struct *mm);

arch/x86/include/asm/tlbflush.h
Lines changed: 9 additions & 14 deletions

@@ -73,14 +73,10 @@ static inline void __flush_tlb_one(unsigned long addr)
  * - flush_tlb_page(vma, vmaddr) flushes one page
  * - flush_tlb_range(vma, start, end) flushes a range of pages
  * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- * - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus
+ * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus
  *
  * ..but the i386 has somewhat limited tlb flushing capabilities,
  * and page-granular flushes are available only on i486 and up.
- *
- * x86-64 can only flush individual pages or full VMs. For a range flush
- * we always do the full VM. Might be worth trying if for a small
- * range a few INVLPGs in a row are a win.
  */
 
 #ifndef CONFIG_SMP
@@ -111,7 +107,8 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 
 static inline void native_flush_tlb_others(const struct cpumask *cpumask,
 					   struct mm_struct *mm,
-					   unsigned long va)
+					   unsigned long start,
+					   unsigned long end)
 {
 }
 
@@ -129,17 +126,14 @@ extern void flush_tlb_all(void);
 extern void flush_tlb_current_task(void);
 extern void flush_tlb_mm(struct mm_struct *);
 extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
+extern void flush_tlb_range(struct vm_area_struct *vma,
+			    unsigned long start, unsigned long end);
 
 #define flush_tlb() flush_tlb_current_task()
 
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-				   unsigned long start, unsigned long end)
-{
-	flush_tlb_mm(vma->vm_mm);
-}
-
 void native_flush_tlb_others(const struct cpumask *cpumask,
-			     struct mm_struct *mm, unsigned long va);
+			     struct mm_struct *mm,
+			     unsigned long start, unsigned long end);
 
 #define TLBSTATE_OK	1
 #define TLBSTATE_LAZY	2
@@ -159,7 +153,8 @@ static inline void reset_lazy_tlbstate(void)
 #endif	/* SMP */
 
 #ifndef CONFIG_PARAVIRT
-#define flush_tlb_others(mask, mm, va)	native_flush_tlb_others(mask, mm, va)
+#define flush_tlb_others(mask, mm, start, end)	\
+	native_flush_tlb_others(mask, mm, start, end)
 #endif
 
 static inline void flush_tlb_kernel_range(unsigned long start,

arch/x86/include/asm/uv/uv.h
Lines changed: 3 additions & 2 deletions

@@ -15,7 +15,8 @@ extern void uv_nmi_init(void);
 extern void uv_system_init(void);
 extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 						 struct mm_struct *mm,
-						 unsigned long va,
+						 unsigned long start,
+						 unsigned end,
 						 unsigned int cpu);
 
 #else	/* X86_UV */
@@ -26,7 +27,7 @@ static inline void uv_cpu_init(void)	{ }
 static inline void uv_system_init(void)	{ }
 static inline const struct cpumask *
 uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
-		    unsigned long va, unsigned int cpu)
+		    unsigned long start, unsigned long end, unsigned int cpu)
 { return cpumask; }
 
 #endif	/* X86_UV */

arch/x86/mm/tlb.c
Lines changed: 81 additions & 16 deletions

@@ -41,7 +41,8 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
 union smp_flush_state {
 	struct {
 		struct mm_struct *flush_mm;
-		unsigned long flush_va;
+		unsigned long flush_start;
+		unsigned long flush_end;
 		raw_spinlock_t tlbstate_lock;
 		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
 	};
@@ -156,10 +157,19 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
 
 	if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) {
 		if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
-			if (f->flush_va == TLB_FLUSH_ALL)
+			if (f->flush_end == TLB_FLUSH_ALL
+					|| !cpu_has_invlpg)
 				local_flush_tlb();
-			else
-				__flush_tlb_one(f->flush_va);
+			else if (!f->flush_end)
+				__flush_tlb_single(f->flush_start);
+			else {
+				unsigned long addr;
+				addr = f->flush_start;
+				while (addr < f->flush_end) {
+					__flush_tlb_single(addr);
+					addr += PAGE_SIZE;
+				}
+			}
 		} else
 			leave_mm(cpu);
 	}
@@ -172,7 +182,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
 }
 
 static void flush_tlb_others_ipi(const struct cpumask *cpumask,
-				 struct mm_struct *mm, unsigned long va)
+				 struct mm_struct *mm, unsigned long start,
+				 unsigned long end)
 {
 	unsigned int sender;
 	union smp_flush_state *f;
@@ -185,7 +196,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
 		raw_spin_lock(&f->tlbstate_lock);
 
 	f->flush_mm = mm;
-	f->flush_va = va;
+	f->flush_start = start;
+	f->flush_end = end;
 	if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
 		/*
 		 * We have to send the IPI only to
@@ -199,24 +211,26 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
 	}
 
 	f->flush_mm = NULL;
-	f->flush_va = 0;
+	f->flush_start = 0;
+	f->flush_end = 0;
 	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
 		raw_spin_unlock(&f->tlbstate_lock);
 }
 
 void native_flush_tlb_others(const struct cpumask *cpumask,
-			     struct mm_struct *mm, unsigned long va)
+			     struct mm_struct *mm, unsigned long start,
+			     unsigned long end)
 {
 	if (is_uv_system()) {
 		unsigned int cpu;
 
 		cpu = smp_processor_id();
-		cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
+		cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
 		if (cpumask)
-			flush_tlb_others_ipi(cpumask, mm, va);
+			flush_tlb_others_ipi(cpumask, mm, start, end);
 		return;
 	}
-	flush_tlb_others_ipi(cpumask, mm, va);
+	flush_tlb_others_ipi(cpumask, mm, start, end);
 }
 
 static void __cpuinit calculate_tlb_offset(void)
@@ -282,7 +296,7 @@ void flush_tlb_current_task(void)
 
 	local_flush_tlb();
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
+		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
 	preempt_enable();
 }
 
@@ -297,26 +311,77 @@ void flush_tlb_mm(struct mm_struct *mm)
 		leave_mm(smp_processor_id());
 	}
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
+		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
+
+	preempt_enable();
+}
+
+#define FLUSHALL_BAR	16
+
+void flush_tlb_range(struct vm_area_struct *vma,
+				   unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm;
+
+	if (!cpu_has_invlpg || vma->vm_flags & VM_HUGETLB) {
+		flush_tlb_mm(vma->vm_mm);
+		return;
+	}
+
+	preempt_disable();
+	mm = vma->vm_mm;
+	if (current->active_mm == mm) {
+		if (current->mm) {
+			unsigned long addr, vmflag = vma->vm_flags;
+			unsigned act_entries, tlb_entries = 0;
+
+			if (vmflag & VM_EXEC)
+				tlb_entries = tlb_lli_4k[ENTRIES];
+			else
+				tlb_entries = tlb_lld_4k[ENTRIES];
+
+			act_entries = tlb_entries > mm->total_vm ?
+					mm->total_vm : tlb_entries;
 
+			if ((end - start)/PAGE_SIZE > act_entries/FLUSHALL_BAR)
+				local_flush_tlb();
+			else {
+				for (addr = start; addr < end;
+						addr += PAGE_SIZE)
+					__flush_tlb_single(addr);
+
+				if (cpumask_any_but(mm_cpumask(mm),
+					smp_processor_id()) < nr_cpu_ids)
+					flush_tlb_others(mm_cpumask(mm), mm,
+								start, end);
+				preempt_enable();
+				return;
+			}
+		} else {
+			leave_mm(smp_processor_id());
+		}
+	}
+	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
 	preempt_enable();
 }
 
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 {
 	struct mm_struct *mm = vma->vm_mm;
 
 	preempt_disable();
 
 	if (current->active_mm == mm) {
 		if (current->mm)
-			__flush_tlb_one(va);
+			__flush_tlb_one(start);
 		else
 			leave_mm(smp_processor_id());
 	}
 
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), mm, va);
+		flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
 
 	preempt_enable();
 }
arch/x86/platform/uv/tlb_uv.c
Lines changed: 3 additions & 3 deletions

@@ -1068,8 +1068,8 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
  * done.  The returned pointer is valid till preemption is re-enabled.
  */
 const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
-					  struct mm_struct *mm, unsigned long va,
-					  unsigned int cpu)
+					  struct mm_struct *mm, unsigned long start,
+					  unsigned end, unsigned int cpu)
 {
 	int locals = 0;
 	int remotes = 0;
@@ -1112,7 +1112,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 
 	record_send_statistics(stat, locals, hubs, remotes, bau_desc);
 
-	bau_desc->payload.address = va;
+	bau_desc->payload.address = start;
 	bau_desc->payload.sending_cpu = cpu;
 	/*
 	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,

arch/x86/xen/mmu.c
Lines changed: 6 additions & 6 deletions

@@ -1244,7 +1244,8 @@ static void xen_flush_tlb_single(unsigned long addr)
 }
 
 static void xen_flush_tlb_others(const struct cpumask *cpus,
-				 struct mm_struct *mm, unsigned long va)
+				 struct mm_struct *mm, unsigned long start,
+				 unsigned long end)
 {
 	struct {
 		struct mmuext_op op;
@@ -1256,7 +1257,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 	} *args;
 	struct multicall_space mcs;
 
-	trace_xen_mmu_flush_tlb_others(cpus, mm, va);
+	trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
 
 	if (cpumask_empty(cpus))
 		return;		/* nothing to do */
@@ -1269,11 +1270,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
 
-	if (va == TLB_FLUSH_ALL) {
-		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-	} else {
+	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+	if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
 		args->op.cmd = MMUEXT_INVLPG_MULTI;
-		args->op.arg1.linear_addr = va;
+		args->op.arg1.linear_addr = start;
 	}
 
 	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);

include/trace/events/xen.h
Lines changed: 7 additions & 5 deletions

@@ -397,18 +397,20 @@ TRACE_EVENT(xen_mmu_flush_tlb_single,
 
 TRACE_EVENT(xen_mmu_flush_tlb_others,
 	    TP_PROTO(const struct cpumask *cpus, struct mm_struct *mm,
-		     unsigned long addr),
-	    TP_ARGS(cpus, mm, addr),
+		     unsigned long addr, unsigned long end),
+	    TP_ARGS(cpus, mm, addr, end),
 	    TP_STRUCT__entry(
 		    __field(unsigned, ncpus)
 		    __field(struct mm_struct *, mm)
 		    __field(unsigned long, addr)
+		    __field(unsigned long, end)
 		    ),
 	    TP_fast_assign(__entry->ncpus = cpumask_weight(cpus);
 			   __entry->mm = mm;
-			   __entry->addr = addr),
-	    TP_printk("ncpus %d mm %p addr %lx",
-		      __entry->ncpus, __entry->mm, __entry->addr)
+			   __entry->addr = addr,
+			   __entry->end = end),
+	    TP_printk("ncpus %d mm %p addr %lx, end %lx",
+		      __entry->ncpus, __entry->mm, __entry->addr, __entry->end)
 	    );
 
 TRACE_EVENT(xen_mmu_write_cr3,
