
Commit 0e093d9

Mel Gorman authored and Linus Torvalds committed
writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone
If congestion_wait() is called with no BDI congested, the caller will sleep
for the full timeout and this may be an unnecessary sleep. This patch adds a
wait_iff_congested() that checks congestion and only sleeps if a BDI is
congested; otherwise, it calls cond_resched() to ensure the caller is not
hogging the CPU longer than its quota, but it does not sleep.

This is aimed at reducing some of the major desktop stalls reported during IO.
For example, while kswapd is operating, it calls congestion_wait() but it
could just have been reclaiming clean page cache pages with no congestion.
Without this patch, it would sleep for a full timeout; after this patch, it
will just call schedule() if it has been on the CPU too long. Similar logic
applies to direct reclaimers that are not making enough progress.

Signed-off-by: Mel Gorman <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Wu Fengguang <[email protected]>
Cc: KAMEZAWA Hiroyuki <[email protected]>
Cc: KOSAKI Motohiro <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Jens Axboe <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
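In practice, the change at a typical throttle point looks like this (a condensed illustration only, not part of the diff below; the real call sites are in mm/page_alloc.c and mm/vmscan.c):

	/* Before: sleep for up to the full timeout even if no BDI is congested. */
	congestion_wait(BLK_RW_ASYNC, HZ/10);

	/* After: sleep only if some BDI is congested and the zone being reclaimed
	 * has recently been marked ZONE_CONGESTED; otherwise just cond_resched()
	 * and return.
	 */
	wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);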
1 parent 08fc468 commit 0e093d9

File tree: 6 files changed, +112 −12 lines changed

  include/linux/backing-dev.h
  include/linux/mmzone.h
  include/trace/events/writeback.h
  mm/backing-dev.c
  mm/page_alloc.c
  mm/vmscan.c


include/linux/backing-dev.h

Lines changed: 1 addition & 1 deletion

@@ -285,7 +285,7 @@ enum {
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
 void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
-
+long wait_iff_congested(struct zone *zone, int sync, long timeout);
 
 static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
 {

include/linux/mmzone.h

Lines changed: 8 additions & 0 deletions

@@ -423,6 +423,9 @@ struct zone {
 typedef enum {
 	ZONE_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
 	ZONE_OOM_LOCKED,		/* zone is in OOM killer zonelist */
+	ZONE_CONGESTED,			/* zone has many dirty pages backed by
+					 * a congested BDI
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -440,6 +443,11 @@ static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
 	clear_bit(flag, &zone->flags);
 }
 
+static inline int zone_is_reclaim_congested(const struct zone *zone)
+{
+	return test_bit(ZONE_CONGESTED, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);

include/trace/events/writeback.h

Lines changed: 7 additions & 0 deletions

@@ -179,6 +179,13 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,
 	TP_ARGS(usec_timeout, usec_delayed)
 );
 
+DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
+
+	TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
+
+	TP_ARGS(usec_timeout, usec_delayed)
+);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */

mm/backing-dev.c

Lines changed: 59 additions & 2 deletions

@@ -729,14 +729,16 @@ static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
 	};
+static atomic_t nr_bdi_congested[2];
 
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
 	enum bdi_state bit;
 	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
 	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	clear_bit(bit, &bdi->state);
+	if (test_and_clear_bit(bit, &bdi->state))
+		atomic_dec(&nr_bdi_congested[sync]);
 	smp_mb__after_clear_bit();
 	if (waitqueue_active(wqh))
 		wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 	enum bdi_state bit;
 
 	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	set_bit(bit, &bdi->state);
+	if (!test_and_set_bit(bit, &bdi->state))
+		atomic_inc(&nr_bdi_congested[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
 
@@ -779,3 +782,57 @@ long congestion_wait(int sync, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+	long ret;
+	unsigned long start = jiffies;
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+	/*
+	 * If there is no congestion, or heavy congestion is not being
+	 * encountered in the current zone, yield if necessary instead
+	 * of sleeping on the congestion queue
+	 */
+	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+			!zone_is_reclaim_congested(zone)) {
+		cond_resched();
+
+		/* In case we scheduled, work out time remaining */
+		ret = timeout - (jiffies - start);
+		if (ret < 0)
+			ret = 0;
+
+		goto out;
+	}
+
+	/* Sleep until uncongested or a write happens */
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	ret = io_schedule_timeout(timeout);
+	finish_wait(wqh, &wait);
+
+out:
+	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+					jiffies_to_usecs(jiffies - start));
+
+	return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
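As a usage note, the following sketch shows how a caller could interpret the return value documented in the kernel-doc above; throttle_reclaim() is a hypothetical function invented here for illustration and is not part of this patch:

	/* Hypothetical caller, illustrating the return-value semantics only. */
	static void throttle_reclaim(struct zone *zone)
	{
		long remaining = wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);

		if (remaining == HZ/10) {
			/* Equal to the timeout: the function did not sleep. Either no
			 * BDI was congested or @zone was not marked ZONE_CONGESTED,
			 * so only cond_resched() was called. */
		} else if (remaining > 0) {
			/* Woken early: a BDI left congestion or a write completed
			 * before the timeout expired. */
		} else {
			/* remaining == 0: slept for the full timeout. */
		}
	}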

mm/page_alloc.c

Lines changed: 2 additions & 2 deletions

@@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 				preferred_zone, migratetype);
 
 		if (!page && gfp_mask & __GFP_NOFAIL)
-			congestion_wait(BLK_RW_ASYNC, HZ/50);
+			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 	} while (!page && (gfp_mask & __GFP_NOFAIL));
 
 	return page;
@@ -2095,7 +2095,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	pages_reclaimed += did_some_progress;
 	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
 		/* Wait for some write requests to complete then retry */
-		congestion_wait(BLK_RW_ASYNC, HZ/50);
+		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
 	}
 
mm/vmscan.c

Lines changed: 35 additions & 7 deletions

@@ -401,10 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	}
 	if (mapping->a_ops->writepage == NULL)
 		return PAGE_ACTIVATE;
-	if (!may_write_to_queue(mapping->backing_dev_info, sc)) {
-		disable_lumpy_reclaim_mode(sc);
+	if (!may_write_to_queue(mapping->backing_dev_info, sc))
 		return PAGE_KEEP;
-	}
 
 	if (clear_page_dirty_for_io(page)) {
 		int res;
@@ -681,11 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
+				      struct zone *zone,
 				      struct scan_control *sc)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
 	int pgactivate = 0;
+	unsigned long nr_dirty = 0;
+	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
 
 	cond_resched();
@@ -705,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			goto keep;
 
 		VM_BUG_ON(PageActive(page));
+		VM_BUG_ON(page_zone(page) != zone);
 
 		sc->nr_scanned++;
 
@@ -782,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		}
 
 		if (PageDirty(page)) {
+			nr_dirty++;
+
 			if (references == PAGEREF_RECLAIM_CLEAN)
 				goto keep_locked;
 			if (!may_enter_fs)
@@ -792,6 +796,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			/* Page is dirty, try to write it out here */
 			switch (pageout(page, mapping, sc)) {
 			case PAGE_KEEP:
+				nr_congested++;
 				goto keep_locked;
 			case PAGE_ACTIVATE:
 				goto activate_locked;
@@ -902,6 +907,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 
+	/*
+	 * Tag a zone as congested if all the dirty pages encountered were
+	 * backed by a congested BDI. In this case, reclaimers should just
+	 * back off and wait for congestion to clear because further reclaim
+	 * will encounter the same problem
+	 */
+	if (nr_dirty == nr_congested)
+		zone_set_flag(zone, ZONE_CONGESTED);
+
 	free_page_list(&free_pages);
 
 	list_splice(&ret_pages, page_list);
@@ -1386,12 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
 	spin_unlock_irq(&zone->lru_lock);
 
-	nr_reclaimed = shrink_page_list(&page_list, sc);
+	nr_reclaimed = shrink_page_list(&page_list, zone, sc);
 
 	/* Check if we should synchronously wait for writeback */
 	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
 		set_lumpy_reclaim_mode(priority, sc, true);
-		nr_reclaimed += shrink_page_list(&page_list, sc);
+		nr_reclaimed += shrink_page_list(&page_list, zone, sc);
 	}
 
 	local_irq_disable();
@@ -1982,8 +1996,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 		/* Take a nap, wait for some writeback to complete */
 		if (!sc->hibernation_mode && sc->nr_scanned &&
-		    priority < DEF_PRIORITY - 2)
-			congestion_wait(BLK_RW_ASYNC, HZ/10);
+		    priority < DEF_PRIORITY - 2) {
+			struct zone *preferred_zone;
+
+			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+						NULL, &preferred_zone);
+			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+		}
 	}
 
 out:
@@ -2282,6 +2301,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 				if (!zone_watermark_ok(zone, order,
 					    min_wmark_pages(zone), end_zone, 0))
 					has_under_min_watermark_zone = 1;
+			} else {
+				/*
+				 * If a zone reaches its high watermark,
+				 * consider it to be no longer congested. It's
+				 * possible there are dirty pages backed by
+				 * congested BDIs but as pressure is relieved,
+				 * speculatively avoid congestion waits
+				 */
+				zone_clear_flag(zone, ZONE_CONGESTED);
 			}
 
 		}
