Commit 9546b29

workqueue: Add workqueue_attrs->__pod_cpumask
workqueue_attrs has two uses:

* to specify the required unbound workqueue properties by users

* to match worker_pool's properties to workqueues by core code

For example, if the user wants to restrict a workqueue to run only CPUs 0 and 2, and the two CPUs are on different affinity scopes, the workqueue's attrs->cpumask would contain CPUs 0 and 2, and the workqueue would be associated with two worker_pools, one with attrs->cpumask containing just CPU 0 and the other CPU 2.

Workqueue wants to support non-strict affinity scopes where work items are started in their matching affinity scopes but the scheduler is free to migrate them outside the starting scopes, which can enable utilizing the whole machine while maintaining most of the locality benefits from affinity scopes.

To enable that, worker_pools need to distinguish the strict affinity that they have to follow (because that's the restriction coming from the user) and the soft affinity that they want to apply when dispatching work items. Note that two worker_pools with different soft dispatching requirements have to be separate; otherwise, for example, we'd be ping-ponging worker threads across NUMA boundaries constantly.

This patch adds workqueue_attrs->__pod_cpumask. The new field is double underscored as it's only used internally to distinguish worker_pools. A worker_pool's ->cpumask is now always the same as the online subset of allowed CPUs of the associated workqueues, and ->__pod_cpumask is the pod's subset of that ->cpumask. Going back to the example above, both worker_pools would have ->cpumask containing both CPUs 0 and 2 but one's ->__pod_cpumask would contain 0 while the other's 2.

* pool_allowed_cpus() is added. It returns the worker_pool's strict cpumask that the pool's workers must stay within. This is currently always ->__pod_cpumask as all boundaries are still strict.

* As a workqueue_attrs can now track both the associated workqueues' cpumask and its per-pod subset, wq_calc_pod_cpumask() no longer needs an external out-argument. Drop @cpumask and instead store the result in ->__pod_cpumask.

* The above also simplifies apply_wqattrs_prepare() as the same workqueue_attrs can be used to create all pods associated with a workqueue. tmp_attrs is dropped.

* wq_update_pod() is updated to use wqattrs_equal() to test whether a pwq update is needed instead of only comparing ->cpumask so that ->__pod_cpumask is compared too. It could directly compare ->__pod_cpumask but the code is easier to understand and more robust this way.

The only user-visible behavior change is that two workqueues with different cpumasks can no longer share worker_pools even when their pod subsets coincide. Going back to the example, let's say there's another workqueue with cpumask 0, 2, 3, where 2 and 3 are in the same pod. It would be mapped to two worker_pools - one with CPU 0, the other with 2 and 3. The former has the same cpumask as the first pod of the earlier example and would have shared the same worker_pool but that's no longer the case after this patch. The worker_pools would have the same ->__pod_cpumask but their ->cpumask's wouldn't match.

While this is necessary to support non-strict affinity scopes, there can be further optimizations to maintain sharing among strict affinity scopes. However, non-strict affinity scopes are going to be preferable for most use cases and we don't see a very diverse mixture of unbound workqueue cpumasks anyway, so the additional overhead doesn't seem to justify the extra complexity.
v2: - wq_update_pod() was incorrectly comparing target_attrs->__pod_cpumask
      to pool->attrs->cpumask instead of its ->__pod_cpumask. Fix it by
      using wqattrs_equal() for comparison instead.

    - Per-cpu worker pools weren't initializing ->__pod_cpumask which
      caused a subtle problem later on. Set it to cpumask_of(cpu) like
      ->cpumask.

Signed-off-by: Tejun Heo <[email protected]>
Parent: 0219a35
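Editor's note: to make the CPU 0/2 example in the message concrete, here is a minimal userspace sketch (illustration only, not kernel code) of how a single workqueue cpumask is split into per-pod pool attributes. The two-pod layout and all mask values are assumptions made up for the example; the kernel operates on cpumask_var_t, not unsigned long.

#include <stdio.h>

int main(void)
{
	/* assumed pod layout: pod 0 = CPUs 0-1, pod 1 = CPUs 2-3 */
	const unsigned long pod_cpus[] = { 0x3, 0xc };
	/* workqueue restricted to CPUs 0 and 2, as in the example */
	const unsigned long wq_cpumask = 0x5;

	for (int pod = 0; pod < 2; pod++) {
		/* the pod's subset of the workqueue's allowed CPUs */
		unsigned long pod_mask = wq_cpumask & pod_cpus[pod];

		if (!pod_mask)	/* pod has none of the wq's CPUs */
			continue;
		/* each pool keeps the full wq ->cpumask plus its pod subset */
		printf("pool for pod %d: cpumask=0x%lx __pod_cpumask=0x%lx\n",
		       pod, wq_cpumask, pod_mask);
	}
	return 0;
}

Both resulting pools share ->cpumask 0x5 while their ->__pod_cpumask differ (0x1 and 0x4), matching the description in the message.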

File tree: 2 files changed, +53 -37 lines

include/linux/workqueue.h
kernel/workqueue.c


include/linux/workqueue.h: 16 additions, 0 deletions

@@ -150,9 +150,25 @@ struct workqueue_attrs {
 
 	/**
 	 * @cpumask: allowed CPUs
+	 *
+	 * Work items in this workqueue are affine to these CPUs and not allowed
+	 * to execute on other CPUs. A pool serving a workqueue must have the
+	 * same @cpumask.
 	 */
 	cpumask_var_t cpumask;
 
+	/**
+	 * @__pod_cpumask: internal attribute used to create per-pod pools
+	 *
+	 * Internal use only.
+	 *
+	 * Per-pod unbound worker pools are used to improve locality. Always a
+	 * subset of ->cpumask. A workqueue can be associated with multiple
+	 * worker pools with disjoint @__pod_cpumask's. Whether the enforcement
+	 * of a pool's @__pod_cpumask is strict depends on @affn_strict.
+	 */
+	cpumask_var_t __pod_cpumask;
+
 	/*
 	 * Below fields aren't properties of a worker_pool. They only modify how
 	 * :c:func:`apply_workqueue_attrs` select pools and thus don't
kernel/workqueue.c: 37 additions, 37 deletions

@@ -366,7 +366,6 @@ static bool wq_online; /* can kworkers be created yet? */
 
 /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
 static struct workqueue_attrs *wq_update_pod_attrs_buf;
-static cpumask_var_t wq_update_pod_cpumask_buf;
 
 static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
 static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
@@ -2050,6 +2049,11 @@ static struct worker *alloc_worker(int node)
 	return worker;
 }
 
+static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
+{
+	return pool->attrs->__pod_cpumask;
+}
+
 /**
  * worker_attach_to_pool() - attach a worker to a pool
  * @worker: worker to be attached
@@ -2075,7 +2079,7 @@ static void worker_attach_to_pool(struct worker *worker,
 	kthread_set_per_cpu(worker->task, pool->cpu);
 
 	if (worker->rescue_wq)
-		set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+		set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));
 
 	list_add_tail(&worker->node, &pool->workers);
 	worker->pool = pool;
@@ -2167,7 +2171,7 @@ static struct worker *create_worker(struct worker_pool *pool)
 	}
 
 	set_user_nice(worker->task, pool->attrs->nice);
-	kthread_bind_mask(worker->task, pool->attrs->cpumask);
+	kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
 
 	/* successful, attach the worker to the pool */
 	worker_attach_to_pool(worker, pool);
@@ -3672,6 +3676,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
 {
 	if (attrs) {
 		free_cpumask_var(attrs->cpumask);
+		free_cpumask_var(attrs->__pod_cpumask);
 		kfree(attrs);
 	}
 }
@@ -3693,6 +3698,8 @@ struct workqueue_attrs *alloc_workqueue_attrs(void)
 		goto fail;
 	if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
 		goto fail;
+	if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL))
+		goto fail;
 
 	cpumask_copy(attrs->cpumask, cpu_possible_mask);
 	attrs->affn_scope = wq_affn_dfl;
@@ -3707,6 +3714,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
 {
 	to->nice = from->nice;
 	cpumask_copy(to->cpumask, from->cpumask);
+	cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);
 
 	/*
 	 * Unlike hash and equality test, copying shouldn't ignore wq-only
@@ -3735,6 +3743,8 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
 	hash = jhash_1word(attrs->nice, hash);
 	hash = jhash(cpumask_bits(attrs->cpumask),
 		     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+	hash = jhash(cpumask_bits(attrs->__pod_cpumask),
+		     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
 	return hash;
 }
 
@@ -3746,6 +3756,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
 		return false;
 	if (!cpumask_equal(a->cpumask, b->cpumask))
 		return false;
+	if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask))
+		return false;
 	return true;
 }
 
@@ -3998,9 +4010,9 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
 		}
 	}
 
-	/* If cpumask is contained inside a NUMA pod, that's our NUMA node */
+	/* If __pod_cpumask is contained inside a NUMA pod, that's our node */
 	for (pod = 0; pod < pt->nr_pods; pod++) {
-		if (cpumask_subset(attrs->cpumask, pt->pod_cpus[pod])) {
+		if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) {
 			node = pt->pod_node[pod];
 			break;
 		}
@@ -4190,39 +4202,38 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
  * @attrs: the wq_attrs of the default pwq of the target workqueue
  * @cpu: the target CPU
  * @cpu_going_down: if >= 0, the CPU to consider as offline
- * @cpumask: outarg, the resulting cpumask
  *
  * Calculate the cpumask a workqueue with @attrs should use on @pod. If
  * @cpu_going_down is >= 0, that cpu is considered offline during calculation.
- * The result is stored in @cpumask.
+ * The result is stored in @attrs->__pod_cpumask.
  *
  * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled
  * and @pod has online CPUs requested by @attrs, the returned cpumask is the
  * intersection of the possible CPUs of @pod and @attrs->cpumask.
 *
 * The caller is responsible for ensuring that the cpumask of @pod stays stable.
 */
-static void wq_calc_pod_cpumask(const struct workqueue_attrs *attrs, int cpu,
-				int cpu_going_down, cpumask_t *cpumask)
+static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
+				int cpu_going_down)
 {
 	const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
 	int pod = pt->cpu_pod[cpu];
 
 	/* does @pod have any online CPUs @attrs wants? */
-	cpumask_and(cpumask, pt->pod_cpus[pod], attrs->cpumask);
-	cpumask_and(cpumask, cpumask, cpu_online_mask);
+	cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);
+	cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask);
 	if (cpu_going_down >= 0)
-		cpumask_clear_cpu(cpu_going_down, cpumask);
+		cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask);
 
-	if (cpumask_empty(cpumask)) {
-		cpumask_copy(cpumask, attrs->cpumask);
+	if (cpumask_empty(attrs->__pod_cpumask)) {
+		cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
 		return;
 	}
 
 	/* yeap, return possible CPUs in @pod that @attrs wants */
-	cpumask_and(cpumask, attrs->cpumask, pt->pod_cpus[pod]);
+	cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]);
 
-	if (cpumask_empty(cpumask))
+	if (cpumask_empty(attrs->__pod_cpumask))
 		pr_warn_once("WARNING: workqueue cpumask: online intersect > "
 			     "possible intersect\n");
 }
@@ -4276,7 +4287,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 		      const cpumask_var_t unbound_cpumask)
 {
 	struct apply_wqattrs_ctx *ctx;
-	struct workqueue_attrs *new_attrs, *tmp_attrs;
+	struct workqueue_attrs *new_attrs;
 	int cpu;
 
 	lockdep_assert_held(&wq_pool_mutex);
@@ -4288,8 +4299,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 	ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);
 
 	new_attrs = alloc_workqueue_attrs();
-	tmp_attrs = alloc_workqueue_attrs();
-	if (!ctx || !new_attrs || !tmp_attrs)
+	if (!ctx || !new_attrs)
 		goto out_free;
 
 	/*
@@ -4299,23 +4309,18 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 	 */
 	copy_workqueue_attrs(new_attrs, attrs);
 	wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);
+	cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
 	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
 	if (!ctx->dfl_pwq)
 		goto out_free;
 
-	/*
-	 * We may create multiple pwqs with differing cpumasks. Make a copy of
-	 * @new_attrs which will be modified and used to obtain pools.
-	 */
-	copy_workqueue_attrs(tmp_attrs, new_attrs);
-
 	for_each_possible_cpu(cpu) {
 		if (new_attrs->ordered) {
 			ctx->dfl_pwq->refcnt++;
 			ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
 		} else {
-			wq_calc_pod_cpumask(new_attrs, cpu, -1, tmp_attrs->cpumask);
-			ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, tmp_attrs);
+			wq_calc_pod_cpumask(new_attrs, cpu, -1);
+			ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
 			if (!ctx->pwq_tbl[cpu])
 				goto out_free;
 		}
@@ -4324,14 +4329,13 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 	/* save the user configured attrs and sanitize it. */
 	copy_workqueue_attrs(new_attrs, attrs);
 	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+	cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
 	ctx->attrs = new_attrs;
 
 	ctx->wq = wq;
-	free_workqueue_attrs(tmp_attrs);
 	return ctx;
 
 out_free:
-	free_workqueue_attrs(tmp_attrs);
 	free_workqueue_attrs(new_attrs);
 	apply_wqattrs_cleanup(ctx);
 	return ERR_PTR(-ENOMEM);
@@ -4459,7 +4463,6 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
 	int off_cpu = online ? -1 : hotplug_cpu;
 	struct pool_workqueue *old_pwq = NULL, *pwq;
 	struct workqueue_attrs *target_attrs;
-	cpumask_t *cpumask;
 
 	lockdep_assert_held(&wq_pool_mutex);
 
@@ -4472,20 +4475,18 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
 	 * CPU hotplug exclusion.
 	 */
 	target_attrs = wq_update_pod_attrs_buf;
-	cpumask = wq_update_pod_cpumask_buf;
 
 	copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
 	wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);
 
 	/* nothing to do if the target cpumask matches the current pwq */
-	wq_calc_pod_cpumask(target_attrs, cpu, off_cpu, cpumask);
+	wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
 	pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu),
 					lockdep_is_held(&wq_pool_mutex));
-	if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
+	if (wqattrs_equal(target_attrs, pwq->pool->attrs))
 		return;
 
 	/* create a new pwq */
-	cpumask_copy(target_attrs->cpumask, cpumask);
 	pwq = alloc_unbound_pwq(wq, target_attrs);
 	if (!pwq) {
 		pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",
@@ -5409,7 +5410,7 @@ static void rebind_workers(struct worker_pool *pool)
 	for_each_pool_worker(worker, pool) {
 		kthread_set_per_cpu(worker->task, pool->cpu);
 		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
-						  pool->attrs->cpumask) < 0);
+						  pool_allowed_cpus(pool)) < 0);
 	}
 
 	raw_spin_lock_irq(&pool->lock);
@@ -6424,8 +6425,6 @@ void __init workqueue_init_early(void)
 	wq_update_pod_attrs_buf = alloc_workqueue_attrs();
 	BUG_ON(!wq_update_pod_attrs_buf);
 
-	BUG_ON(!alloc_cpumask_var(&wq_update_pod_cpumask_buf, GFP_KERNEL));
-
 	/* initialize WQ_AFFN_SYSTEM pods */
 	pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
 	pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
@@ -6451,6 +6450,7 @@ void __init workqueue_init_early(void)
 		BUG_ON(init_worker_pool(pool));
 		pool->cpu = cpu;
 		cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
+		cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));
 		pool->attrs->nice = std_nice[i++];
 		pool->node = cpu_to_node(cpu);
 