Commit 91ccc6e
workqueue: Introduce struct wq_node_nr_active
Currently, for both percpu and unbound workqueues, max_active applies per-cpu, which is a recent change for unbound workqueues. The change for unbound workqueues was a significant departure from the previous behavior of per-node application. It made some use cases create an undesirable number of concurrent work items and left no good way of fixing them.

To address the problem, workqueue is implementing a NUMA-node-segmented global nr_active mechanism, which will be explained further in the next patch. As a preparation, this patch introduces struct wq_node_nr_active. It's a data structure allocated for each workqueue and NUMA node pair, and it currently only tracks the workqueue's number of active work items on the node. This is split out from the next patch to make it easier to understand and review.

Note that there is an extra wq_node_nr_active allocated for the invalid node nr_node_ids, which is used to track nr_active for pools which don't have a NUMA node associated, such as the default fallback system-wide pool.

This doesn't cause any behavior changes visible to userland yet. The next patch will expand on it to implement the control mechanism on top.

v4: - Fixed out-of-bound access when freeing per-cpu workqueues.

v3: - Use flexible array for wq->node_nr_active as suggested by Lai.

v2: - wq->max_active now uses WRITE/READ_ONCE() as suggested by Lai.
    - Lai pointed out that pwq_tryinc_nr_active() incorrectly dropped the pwq->max_active check. Restored. As the next patch replaces the max_active enforcement mechanism, this doesn't change the end result.

Signed-off-by: Tejun Heo <[email protected]>
Reviewed-by: Lai Jiangshan <[email protected]>
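The indexing rule described above (one counter slot per possible NUMA node, plus an extra fallback slot at index nr_node_ids for pools with no associated node) can be pictured with a minimal sketch. It simply mirrors the wq_node_nr_active() helper added by the patch below; pick_node_slot is a hypothetical name used only for illustration:

/* Illustration only: how a node picks its wq_node_nr_active slot out of the
 * nr_node_ids + 1 entries hung off an unbound workqueue. */
static struct wq_node_nr_active *pick_node_slot(struct workqueue_struct *wq,
                                                int node)
{
        if (!(wq->flags & WQ_UNBOUND))
                return NULL;                     /* per-cpu wqs don't share nr_active */
        if (node == NUMA_NO_NODE)
                node = nr_node_ids;              /* fallback slot for node-less pools */
        return wq->node_nr_active[node];         /* per-node slot otherwise */
}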
1 parent dd6c3c5 commit 91ccc6e

1 file changed: +135 additions, −7 deletions
kernel/workqueue.c

Lines changed: 135 additions & 7 deletions
@@ -284,6 +284,16 @@ struct wq_flusher {
 
 struct wq_device;
 
+/*
+ * Unlike in a per-cpu workqueue where max_active limits its concurrency level
+ * on each CPU, in an unbound workqueue, max_active applies to the whole system.
+ * As sharing a single nr_active across multiple sockets can be very expensive,
+ * the counting and enforcement is per NUMA node.
+ */
+struct wq_node_nr_active {
+        atomic_t        nr;             /* per-node nr_active count */
+};
+
 /*
  * The externally visible workqueue. It relays the issued work items to
  * the appropriate worker_pool through its pool_workqueues.
@@ -330,6 +340,7 @@ struct workqueue_struct {
        /* hot fields used during command issue, aligned to cacheline */
        unsigned int            flags ____cacheline_aligned; /* WQ: WQ_* flags */
        struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
+       struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
 };
 
 static struct kmem_cache *pwq_cache;
@@ -1425,6 +1436,31 @@ work_func_t wq_worker_last_func(struct task_struct *task)
        return worker->last_func;
 }
 
+/**
+ * wq_node_nr_active - Determine wq_node_nr_active to use
+ * @wq: workqueue of interest
+ * @node: NUMA node, can be %NUMA_NO_NODE
+ *
+ * Determine wq_node_nr_active to use for @wq on @node. Returns:
+ *
+ * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
+ *
+ * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
+ *
+ * - Otherwise, node_nr_active[@node].
+ */
+static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
+                                                   int node)
+{
+       if (!(wq->flags & WQ_UNBOUND))
+               return NULL;
+
+       if (node == NUMA_NO_NODE)
+               node = nr_node_ids;
+
+       return wq->node_nr_active[node];
+}
+
 /**
  * get_pwq - get an extra reference on the specified pool_workqueue
  * @pwq: pool_workqueue to get
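The callers added in the hunks below all use this helper in the same pattern: a NULL return (per-cpu workqueue) means only pwq->nr_active is adjusted, while an unbound workqueue also bumps its node's atomic counter. A minimal sketch of that caller pattern, with a hypothetical function name and the same locking assumptions as the real callers:

/* Sketch only, not a function the patch adds; pool->lock is assumed held. */
static void nr_active_inc_sketch(struct pool_workqueue *pwq)
{
        struct wq_node_nr_active *nna =
                wq_node_nr_active(pwq->wq, pwq->pool->node);

        if (nna)                        /* NULL for per-cpu workqueues */
                atomic_inc(&nna->nr);   /* shared per-node count */
        pwq->nr_active++;               /* per-pwq count, maintained either way */
}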
@@ -1506,12 +1542,17 @@ static bool pwq_activate_work(struct pool_workqueue *pwq,
                              struct work_struct *work)
 {
        struct worker_pool *pool = pwq->pool;
+       struct wq_node_nr_active *nna;
 
        lockdep_assert_held(&pool->lock);
 
        if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
                return false;
 
+       nna = wq_node_nr_active(pwq->wq, pool->node);
+       if (nna)
+               atomic_inc(&nna->nr);
+
        pwq->nr_active++;
        __pwq_activate_work(pwq, work);
        return true;
@@ -1528,14 +1569,18 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq)
 {
        struct workqueue_struct *wq = pwq->wq;
        struct worker_pool *pool = pwq->pool;
+       struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
        bool obtained;
 
        lockdep_assert_held(&pool->lock);
 
        obtained = pwq->nr_active < READ_ONCE(wq->max_active);
 
-       if (obtained)
+       if (obtained) {
                pwq->nr_active++;
+               if (nna)
+                       atomic_inc(&nna->nr);
+       }
        return obtained;
 }

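Per the v2 note in the commit message, wq->max_active is read with READ_ONCE() here because its writers were converted to WRITE_ONCE(); the update sites live elsewhere in workqueue.c. A sketch of the pairing, with new_max standing in for whatever value the writer computes:

/* writer side (sketch of the update sites elsewhere in workqueue.c) */
WRITE_ONCE(wq->max_active, new_max);

/* reader side, as in pwq_tryinc_nr_active() above */
obtained = pwq->nr_active < READ_ONCE(wq->max_active);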
@@ -1572,10 +1617,26 @@ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq)
 static void pwq_dec_nr_active(struct pool_workqueue *pwq)
 {
        struct worker_pool *pool = pwq->pool;
+       struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);
 
        lockdep_assert_held(&pool->lock);
 
+       /*
+        * @pwq->nr_active should be decremented for both percpu and unbound
+        * workqueues.
+        */
        pwq->nr_active--;
+
+       /*
+        * For a percpu workqueue, it's simple. Just need to kick the first
+        * inactive work item on @pwq itself.
+        */
+       if (!nna) {
+               pwq_activate_first_inactive(pwq);
+               return;
+       }
+
+       atomic_dec(&nna->nr);
        pwq_activate_first_inactive(pwq);
 }

@@ -4039,11 +4100,63 @@ static void wq_free_lockdep(struct workqueue_struct *wq)
 }
 #endif
 
+static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
+{
+       int node;
+
+       for_each_node(node) {
+               kfree(nna_ar[node]);
+               nna_ar[node] = NULL;
+       }
+
+       kfree(nna_ar[nr_node_ids]);
+       nna_ar[nr_node_ids] = NULL;
+}
+
+static void init_node_nr_active(struct wq_node_nr_active *nna)
+{
+       atomic_set(&nna->nr, 0);
+}
+
+/*
+ * Each node's nr_active counter will be accessed mostly from its own node and
+ * should be allocated in the node.
+ */
+static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
+{
+       struct wq_node_nr_active *nna;
+       int node;
+
+       for_each_node(node) {
+               nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
+               if (!nna)
+                       goto err_free;
+               init_node_nr_active(nna);
+               nna_ar[node] = nna;
+       }
+
+       /* [nr_node_ids] is used as the fallback */
+       nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
+       if (!nna)
+               goto err_free;
+       init_node_nr_active(nna);
+       nna_ar[nr_node_ids] = nna;
+
+       return 0;
+
+err_free:
+       free_node_nr_active(nna_ar);
+       return -ENOMEM;
+}
+
 static void rcu_free_wq(struct rcu_head *rcu)
 {
        struct workqueue_struct *wq =
                container_of(rcu, struct workqueue_struct, rcu);
 
+       if (wq->flags & WQ_UNBOUND)
+               free_node_nr_active(wq->node_nr_active);
+
        wq_free_lockdep(wq);
        free_percpu(wq->cpu_pwq);
        free_workqueue_attrs(wq->unbound_attrs);
@@ -4785,7 +4898,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 {
        va_list args;
        struct workqueue_struct *wq;
-       int len;
+       size_t wq_size;
+       int name_len;
 
        /*
         * Unbound && max_active == 1 used to imply ordered, which is no longer
@@ -4801,7 +4915,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
                flags |= WQ_UNBOUND;
 
        /* allocate wq and format name */
-       wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+       if (flags & WQ_UNBOUND)
+               wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
+       else
+               wq_size = sizeof(*wq);
+
+       wq = kzalloc(wq_size, GFP_KERNEL);
        if (!wq)
                return NULL;

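struct_size() (from <linux/overflow.h>) sizes a structure that ends in a flexible array member and saturates to SIZE_MAX on overflow. For this allocation it is roughly equivalent to the following, minus the overflow checking:

/* header plus nr_node_ids + 1 pointer slots; the extra slot at index
 * nr_node_ids is the NUMA_NO_NODE fallback */
wq_size = sizeof(*wq) +
          (nr_node_ids + 1) * sizeof(struct wq_node_nr_active *);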
@@ -4812,11 +4931,12 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
        }
 
        va_start(args, max_active);
-       len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
+       name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
        va_end(args);
 
-       if (len >= WQ_NAME_LEN)
-               pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", wq->name);
+       if (name_len >= WQ_NAME_LEN)
+               pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
+                            wq->name);
 
        max_active = max_active ?: WQ_DFL_ACTIVE;
        max_active = wq_clamp_max_active(max_active, flags, wq->name);
@@ -4835,8 +4955,13 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
        wq_init_lockdep(wq);
        INIT_LIST_HEAD(&wq->list);
 
+       if (flags & WQ_UNBOUND) {
+               if (alloc_node_nr_active(wq->node_nr_active) < 0)
+                       goto err_unreg_lockdep;
+       }
+
        if (alloc_and_link_pwqs(wq) < 0)
-               goto err_unreg_lockdep;
+               goto err_free_node_nr_active;
 
        if (wq_online && init_rescuer(wq) < 0)
                goto err_destroy;
@@ -4861,6 +4986,9 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 
        return wq;
 
+err_free_node_nr_active:
+       if (wq->flags & WQ_UNBOUND)
+               free_node_nr_active(wq->node_nr_active);
 err_unreg_lockdep:
        wq_unregister_lockdep(wq);
        wq_free_lockdep(wq);
