Skip to content

Commit 380c27c

Browse files
htejun and axboe
authored and committed
writeback: implement wb_domain
Dirtyable memory is distributed to a wb (bdi_writeback) according to the relative bandwidth the wb is writing out in the whole system. This distribution is global - each wb is measured against all other wb's and gets the proportionately sized portion of the memory in the whole system. For cgroup writeback, the amount of dirtyable memory is scoped by memcg and thus each wb would need to be measured and controlled in its memcg. IOW, a wb will belong to two writeback domains - the global and memcg domains. Currently, what constitutes the global writeback domain is scattered across a number of global states. This patch starts collecting them into struct wb_domain. * fprop_global which serves as the basis for proportional bandwidth measurement and its period timer are moved into struct wb_domain. * global_wb_domain hosts the states for the global domain. * While at it, flatten wb_writeout_fraction() into its callers. This thin wrapper doesn't provide any actual benefits while getting in the way. This is pure reorganization and doesn't introduce any behavioral changes. Signed-off-by: Tejun Heo <[email protected]> Cc: Jens Axboe <[email protected]> Cc: Jan Kara <[email protected]> Cc: Wu Fengguang <[email protected]> Cc: Greg Thelen <[email protected]> Signed-off-by: Jens Axboe <[email protected]>
1 parent 8a73179 commit 380c27c

File tree

2 files changed

+59
-45
lines changed

2 files changed

+59
-45
lines changed

include/linux/writeback.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <linux/sched.h>
88
#include <linux/workqueue.h>
99
#include <linux/fs.h>
10+
#include <linux/flex_proportions.h>
1011

1112
DECLARE_PER_CPU(int, dirty_throttle_leaks);
1213

@@ -86,6 +87,36 @@ struct writeback_control {
8687
unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
8788
};
8889

90+
/*
91+
* A wb_domain represents a domain that wb's (bdi_writeback's) belong to
92+
* and are measured against each other in. There always is one global
93+
* domain, global_wb_domain, that every wb in the system is a member of.
94+
* This allows measuring the relative bandwidth of each wb to distribute
95+
* dirtyable memory accordingly.
96+
*/
97+
struct wb_domain {
98+
/*
99+
* Scale the writeback cache size proportional to the relative
100+
* writeout speed.
101+
*
102+
* We do this by keeping a floating proportion between BDIs, based
103+
* on page writeback completions [end_page_writeback()]. Those
104+
* devices that write out pages fastest will get the larger share,
105+
* while the slower will get a smaller share.
106+
*
107+
* We use page writeout completions because we are interested in
108+
* getting rid of dirty pages. Having them written out is the
109+
* primary goal.
110+
*
111+
* We introduce a concept of time, a period over which we measure
112+
* these events, because demand can/will vary over time. The length
113+
* of this period itself is measured in page writeback completions.
114+
*/
115+
struct fprop_global completions;
116+
struct timer_list period_timer; /* timer for aging of completions */
117+
unsigned long period_time;
118+
};
119+
89120
/*
90121
* fs/fs-writeback.c
91122
*/
@@ -120,6 +151,7 @@ static inline void laptop_sync_completion(void) { }
120151
#endif
121152
void throttle_vm_writeout(gfp_t gfp_mask);
122153
bool zone_dirty_ok(struct zone *zone);
154+
int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
123155

124156
extern unsigned long global_dirty_limit;
125157

mm/page-writeback.c

Lines changed: 27 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -124,29 +124,7 @@ EXPORT_SYMBOL(laptop_mode);
124124

125125
unsigned long global_dirty_limit;
126126

127-
/*
128-
* Scale the writeback cache size proportional to the relative writeout speeds.
129-
*
130-
* We do this by keeping a floating proportion between BDIs, based on page
131-
* writeback completions [end_page_writeback()]. Those devices that write out
132-
* pages fastest will get the larger share, while the slower will get a smaller
133-
* share.
134-
*
135-
* We use page writeout completions because we are interested in getting rid of
136-
* dirty pages. Having them written out is the primary goal.
137-
*
138-
* We introduce a concept of time, a period over which we measure these events,
139-
* because demand can/will vary over time. The length of this period itself is
140-
* measured in page writeback completions.
141-
*
142-
*/
143-
static struct fprop_global writeout_completions;
144-
145-
static void writeout_period(unsigned long t);
146-
/* Timer for aging of writeout_completions */
147-
static struct timer_list writeout_period_timer =
148-
TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
149-
static unsigned long writeout_period_time = 0;
127+
static struct wb_domain global_wb_domain;
150128

151129
/*
152130
* Length of period for aging writeout fractions of bdis. This is an
@@ -433,24 +411,26 @@ static unsigned long wp_next_time(unsigned long cur_time)
433411
}
434412

435413
/*
436-
* Increment the BDI's writeout completion count and the global writeout
414+
* Increment the wb's writeout completion count and the global writeout
437415
* completion count. Called from test_clear_page_writeback().
438416
*/
439417
static inline void __wb_writeout_inc(struct bdi_writeback *wb)
440418
{
419+
struct wb_domain *dom = &global_wb_domain;
420+
441421
__inc_wb_stat(wb, WB_WRITTEN);
442-
__fprop_inc_percpu_max(&writeout_completions, &wb->completions,
422+
__fprop_inc_percpu_max(&dom->completions, &wb->completions,
443423
wb->bdi->max_prop_frac);
444424
/* First event after period switching was turned off? */
445-
if (!unlikely(writeout_period_time)) {
425+
if (!unlikely(dom->period_time)) {
446426
/*
447427
* We can race with other __bdi_writeout_inc calls here but
448428
* it does not cause any harm since the resulting time when
449429
* timer will fire and what is in writeout_period_time will be
450430
* roughly the same.
451431
*/
452-
writeout_period_time = wp_next_time(jiffies);
453-
mod_timer(&writeout_period_timer, writeout_period_time);
432+
dom->period_time = wp_next_time(jiffies);
433+
mod_timer(&dom->period_timer, dom->period_time);
454434
}
455435
}
456436

@@ -464,38 +444,38 @@ void wb_writeout_inc(struct bdi_writeback *wb)
464444
}
465445
EXPORT_SYMBOL_GPL(wb_writeout_inc);
466446

467-
/*
468-
* Obtain an accurate fraction of the BDI's portion.
469-
*/
470-
static void wb_writeout_fraction(struct bdi_writeback *wb,
471-
long *numerator, long *denominator)
472-
{
473-
fprop_fraction_percpu(&writeout_completions, &wb->completions,
474-
numerator, denominator);
475-
}
476-
477447
/*
478448
* On idle system, we can be called long after we scheduled because we use
479449
* deferred timers so count with missed periods.
480450
*/
481451
static void writeout_period(unsigned long t)
482452
{
483-
int miss_periods = (jiffies - writeout_period_time) /
453+
struct wb_domain *dom = (void *)t;
454+
int miss_periods = (jiffies - dom->period_time) /
484455
VM_COMPLETIONS_PERIOD_LEN;
485456

486-
if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
487-
writeout_period_time = wp_next_time(writeout_period_time +
457+
if (fprop_new_period(&dom->completions, miss_periods + 1)) {
458+
dom->period_time = wp_next_time(dom->period_time +
488459
miss_periods * VM_COMPLETIONS_PERIOD_LEN);
489-
mod_timer(&writeout_period_timer, writeout_period_time);
460+
mod_timer(&dom->period_timer, dom->period_time);
490461
} else {
491462
/*
492463
* Aging has zeroed all fractions. Stop wasting CPU on period
493464
* updates.
494465
*/
495-
writeout_period_time = 0;
466+
dom->period_time = 0;
496467
}
497468
}
498469

470+
int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
471+
{
472+
memset(dom, 0, sizeof(*dom));
473+
init_timer_deferrable(&dom->period_timer);
474+
dom->period_timer.function = writeout_period;
475+
dom->period_timer.data = (unsigned long)dom;
476+
return fprop_global_init(&dom->completions, gfp);
477+
}
478+
499479
/*
500480
* bdi_min_ratio keeps the sum of the minimum dirty shares of all
501481
* registered backing devices, which, for obvious reasons, can not
@@ -579,14 +559,16 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
579559
*/
580560
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
581561
{
562+
struct wb_domain *dom = &global_wb_domain;
582563
u64 wb_thresh;
583564
long numerator, denominator;
584565
unsigned long wb_min_ratio, wb_max_ratio;
585566

586567
/*
587568
* Calculate this BDI's share of the thresh ratio.
588569
*/
589-
wb_writeout_fraction(wb, &numerator, &denominator);
570+
fprop_fraction_percpu(&dom->completions, &wb->completions,
571+
&numerator, &denominator);
590572

591573
wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
592574
wb_thresh *= numerator;
@@ -1831,7 +1813,7 @@ void __init page_writeback_init(void)
18311813
writeback_set_ratelimit();
18321814
register_cpu_notifier(&ratelimit_nb);
18331815

1834-
fprop_global_init(&writeout_completions, GFP_KERNEL);
1816+
BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
18351817
}
18361818

18371819
/**

0 commit comments

Comments
 (0)