Commit a9c8a5d

terminusvijay-suman authored and committed
audit: annotate branch direction for audit_in_mask()
With sane audit rules, audit logging would only be triggered
infrequently. Keeping this in mind, annotate audit_in_mask() as
unlikely() to allow the compiler to pessimize the call to
audit_filter_rules().

With this change, generated code for the list_for_each_rcu() loop
goes from:

  # 5.4.17-2136.309.4.el7uek.ctx_major.asm

  bb1/start_iter:
    cmp    %r15,%rbx                 # if (val > 0xffffffff)
    ja     next_iter                 # BR1: never taken
  bb2:
    cmpl   $0x3f,-0x3c(%rbp)         # if (word >= AUDIT_BITMASK_SIZE)
                                     # LD1: computed word val from stack
    jg     next_iter                 # BR2: never taken
  bb3:                               # rule->mask[word] & bit
    mov    -0x50(%rbp),%rax          # LD2: precomputed value
    test   %r14d,0x30(%r13,%rax,4)   # LD3: r13 from LD4 (prev iteration)
                                     # rax from LD1 (current iteration)
    je     next_iter                 # BR3: taken: 32/37 times
  bb4:
    mov    -0x48(%rbp),%rdi          # this bb executes 5 times, falling
    lea    0x20(%r13),%rsi           # through to bb5 each time.
    lea    -0x34(%rbp),%r8
    xor    %ecx,%ecx
    mov    %r12,%rdx
    callq  audit_filter_rules.isra.12
    test   %eax,%eax
    jne    exit_loop
  bb5/next_iter:
    mov    0x0(%r13),%r13            # LD4: r13 = list->next (r13 needed in LD3)
    cmp    $0xffffffff82ac67e0,%r13  # list == list_head
           ffffffff8118b613: R_X86_64_32S audit_filter_list+0x40
    jne    start_iter                # BR4: taken 36/37 times
  exit_loop:
    mov    $0x1,%edx

to:

  # 5.4.17-2136.309.4.el7uek.annotate_ctx_major.asm

  bb1/start_iter:
    cmp    %r13,%r12                 # if (val > 0xffffffff)
    ja     next_iter                 # BR1: never taken.
  bb2:
    cmp    $0x3f,%r9d                # if (word >= AUDIT_BITMASK_SIZE)
    jg     next_iter                 # BR2: never taken.
  bb3:                               # rule->mask[word] & bit
    test   %r10d,0x30(%rbx,%r11,4)   # LD1: rbx from LD2 (prev iteration)
                                     # r11 is precomputed fn(ctx->major)
    jne    bb5                       # BR3: not-taken: 32/37 times.
  bb5/next_iter:                     # bb4, bb5 named out-of-order for consistency
    mov    (%rbx),%rbx               # LD2: rbx = list->next (needed in LD1)
    cmp    $0xffffffff82ac67e0,%rbx  # list == list_head
           ffffffff8118b5ed: R_X86_64_32S audit_filter_list+0x40
    jne    start_iter                # BR4: taken 36/37 times
  loop_exit:
    mov    $0x1,%edx
    jmp    fn_exit
  bb4:
    lea    0x20(%rbx),%rsi
    lea    -0x34(%rbp),%r8
    xor    %ecx,%ecx
    mov    %r14,%rdx
    mov    %r15,%rdi
    mov    %r11,-0x48(%rbp)
    mov    %r10d,-0x40(%rbp)
    mov    %r9d,-0x3c(%rbp)
    callq  audit_filter_rules.isra.12
    test   %eax,%eax
    mov    -0x3c(%rbp),%r9d
    mov    -0x40(%rbp),%r10d
    mov    -0x48(%rbp),%r11
    je     next_iter                 # BR5: taken 5 times
  ...

(Notice the differences in branch direction in bb3, and the register
spills and reloads in the prologue and epilogue in bb4.)

Change in latency (including UEK5 for context):

                 Min     Mean    Median  Max                pstdev      cacheline-spread-stdev
                 (ns)    (ns)    (ns)    (ns)
 (uek5)          177.63  185.16  184.65  198.48 (+- 2.06%)  stdev=1.58  var=2.62
-(uek6.major)    183.50  201.41  198.80  226.93 (+- 4.01%)  stdev=1.71  var=3.06*
+(uek6.major     165.26  188.72  184.25  230.34 (+- 8.98%)  stdev=1.99  var=4.48*
+ +annot)

perf stat -d (aggregated across 12 boot cycles) goes from:

  # 5.4.17-2136.309.4.el7uek.ctx_major.stats
  cycles                  720.63  ( +-  3.97% )
  instructions           1355.10  ( +-  0.00% )
  IPC                       1.88  ( +-  3.72% )
  branches                328.20  ( +-  0.00% )
  branch-misses             1.50  ( +- 12.66% )
  L1-dcache-loads         446.33  ( +-  0.00% )
  L1-dcache-load-misses     4.27  ( +- 71.42% )
  cacheline-spread        stdev=1.71 var=3.06

to:

  # 5.4.17-2136.309.4.el7uek.annotate_ctx_major.stats
  cycles                  689.20  ( +-  8.45% )
  instructions           1340.06  ( +-  0.00% )
  IPC                       1.96  ( +-  8.16% )
  branches                328.19  ( +-  0.00% )
  branch-misses             0.51  ( +-  0.00% )
  L1-dcache-loads         380.32  ( +-  0.00% )
  L1-dcache-load-misses     8.19  ( +- 84.12%* )
  cacheline-spread        stdev=1.99 var=4.48*

cycles: performance improves on average by ~30 cycles/call. IPC
improves commensurately. Two reasons for this improvement:

 * one fewer branch misprediction: there's no real reason that I can
   see for this reduction in branch miss count. There is no
   significant change in basic-block structure (apart from the branch
   inversion.)

 * the direction of the branch in bb3 is now inverted, so it chooses
   the not-taken direction more often (32/37 times). The issue-latency
   for not-taken is 0.5-1 cycles, for taken 1-2 cycles (as per [1].)

L1-dcache-loads: are substantially fewer (-2 each loop iteration).
These loads were earlier coming from the stack and now come from a
register. In effect, the spills and reloads have been shifted away
from the fastpath to bb4, and to the prologue and epilogue of
audit_filter_rules().
(Note that the performance improvement is /not/ coming from this
reduction in loads because these are orphan loads and so were already
free.)

L1-dcache-load-misses: these have increased significantly to about
8/loop-iteration. This is mostly because of a couple of runs with
particularly unbalanced audit_entry allocations. (For this test case,
one particular boot had a single cache-set with 13 (of 37) struct
audit_entry mapping to it, with the next highest having 7 (of 37).
L1-dcache misses for that boot averaged ~20. This was also visible in
a higher stdev.)

[1] https://www.agner.org/optimize/instruction_tables.pdf

Orabug: 34544781

Conflicts:
    kernel/auditsc.c
    [ Calls to audit_in_mask() were unified in __audit_filter_op()
      resulting in a clearer merge. ]

Signed-off-by: Ankur Arora <[email protected]>
Reviewed-by: Boris Ostrovsky <[email protected]>
1 parent 5146460 commit a9c8a5d

File tree

1 file changed: +7 −6 lines changed


kernel/auditsc.c

Lines changed: 7 additions & 6 deletions
@@ -792,7 +792,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
 		return AUDIT_STATE_BUILD;
 }
 
-static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
+static bool audit_in_mask(const struct audit_krule *rule, unsigned long val)
 {
 	int word, bit;
 
@@ -832,11 +832,12 @@ static int __audit_filter_op(struct task_struct *tsk,
 	enum audit_state state;
 
 	list_for_each_entry_rcu(e, list, list) {
-		if (audit_in_mask(&e->rule, op) &&
-		    audit_filter_rules(tsk, &e->rule, ctx, name,
-				       &state, false)) {
-			ctx->current_state = state;
-			return 1;
+		if (unlikely(audit_in_mask(&e->rule, op))) {
+			if (audit_filter_rules(tsk, &e->rule, ctx, name,
+					       &state, false)) {
+				ctx->current_state = state;
+				return 1;
+			}
 		}
 	}
 	return 0;
