Skip to content

Commit b76b840

Browse files
damien-lemoal and axboe
authored and committed
dm: Fix dm-zoned-reclaim zone write pointer alignment
The zone reclaim processing of the dm-zoned device mapper uses blkdev_issue_zeroout() to align the write pointer of a zone being used for reclaiming another zone, to write the valid data blocks from the zone being reclaimed at the same position relative to the zone start in the reclaim target zone. The first call to blkdev_issue_zeroout() will try to use hardware offload using a REQ_OP_WRITE_ZEROES operation if the device reports a non-zero max_write_zeroes_sectors queue limit. If this operation fails because of the lack of hardware support, blkdev_issue_zeroout() falls back to using a regular write operation with the zero-page as buffer. Currently, such REQ_OP_WRITE_ZEROES failure is automatically handled by the block layer zone write plugging code which will execute a report zones operation to ensure that the write pointer of the target zone of the failed operation has not changed and to "rewind" the zone write pointer offset of the target zone as it was advanced when the write zero operation was submitted. So the REQ_OP_WRITE_ZEROES failure does not cause any issue and blkdev_issue_zeroout() works as expected. However, since the automatic recovery of zone write pointers by the zone write plugging code can potentially cause deadlocks with queue freeze operations, a different recovery must be implemented in preparation for the removal of zone write plugging report zones based recovery. Do this by introducing the new function blk_zone_issue_zeroout(). This function first calls blkdev_issue_zeroout() with the flag BLKDEV_ZERO_NOFALLBACK to intercept failures on the first execution, which attempts to use the device hardware offload with the REQ_OP_WRITE_ZEROES operation. If this attempt fails, a report zone operation is issued to restore the zone write pointer offset of the target zone to the correct position and blkdev_issue_zeroout() is called again without the BLKDEV_ZERO_NOFALLBACK flag. 
The report zones operation performing this recovery is implemented using the helper function disk_zone_sync_wp_offset() which calls the gendisk report_zones file operation with the callback disk_report_zones_cb(). This callback updates the target write pointer offset of the target zone using the new function disk_zone_wplug_sync_wp_offset(). dmz_reclaim_align_wp() is modified to change its call to blkdev_issue_zeroout() to a call to blk_zone_issue_zeroout() without any other change needed as the two functions are functionally equivalent. Fixes: dd291d7 ("block: Introduce zone write plugging") Cc: [email protected] Signed-off-by: Damien Le Moal <[email protected]> Reviewed-by: Christoph Hellwig <[email protected]> Acked-by: Mike Snitzer <[email protected]> Reviewed-by: Martin K. Petersen <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Jens Axboe <[email protected]>
1 parent 5eb3317 commit b76b840

File tree

3 files changed

+124
-26
lines changed

3 files changed

+124
-26
lines changed

block/blk-zoned.c

Lines changed: 118 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,30 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
115115
}
116116
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
117117

118+
struct disk_report_zones_cb_args {
119+
struct gendisk *disk;
120+
report_zones_cb user_cb;
121+
void *user_data;
122+
};
123+
124+
static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
125+
struct blk_zone *zone);
126+
127+
static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx,
128+
void *data)
129+
{
130+
struct disk_report_zones_cb_args *args = data;
131+
struct gendisk *disk = args->disk;
132+
133+
if (disk->zone_wplugs_hash)
134+
disk_zone_wplug_sync_wp_offset(disk, zone);
135+
136+
if (!args->user_cb)
137+
return 0;
138+
139+
return args->user_cb(zone, idx, args->user_data);
140+
}
141+
118142
/**
119143
* blkdev_report_zones - Get zones information
120144
* @bdev: Target block device
@@ -694,6 +718,58 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
694718
spin_unlock_irqrestore(&zwplug->lock, flags);
695719
}
696720

721+
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
722+
{
723+
switch (zone->cond) {
724+
case BLK_ZONE_COND_IMP_OPEN:
725+
case BLK_ZONE_COND_EXP_OPEN:
726+
case BLK_ZONE_COND_CLOSED:
727+
return zone->wp - zone->start;
728+
case BLK_ZONE_COND_FULL:
729+
return zone->len;
730+
case BLK_ZONE_COND_EMPTY:
731+
return 0;
732+
case BLK_ZONE_COND_NOT_WP:
733+
case BLK_ZONE_COND_OFFLINE:
734+
case BLK_ZONE_COND_READONLY:
735+
default:
736+
/*
737+
* Conventional, offline and read-only zones do not have a valid
738+
* write pointer.
739+
*/
740+
return UINT_MAX;
741+
}
742+
}
743+
744+
static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
745+
struct blk_zone *zone)
746+
{
747+
struct blk_zone_wplug *zwplug;
748+
unsigned long flags;
749+
750+
zwplug = disk_get_zone_wplug(disk, zone->start);
751+
if (!zwplug)
752+
return;
753+
754+
spin_lock_irqsave(&zwplug->lock, flags);
755+
if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
756+
disk_zone_wplug_set_wp_offset(disk, zwplug,
757+
blk_zone_wp_offset(zone));
758+
spin_unlock_irqrestore(&zwplug->lock, flags);
759+
760+
disk_put_zone_wplug(zwplug);
761+
}
762+
763+
static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector)
764+
{
765+
struct disk_report_zones_cb_args args = {
766+
.disk = disk,
767+
};
768+
769+
return disk->fops->report_zones(disk, sector, 1,
770+
disk_report_zones_cb, &args);
771+
}
772+
697773
static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
698774
unsigned int wp_offset)
699775
{
@@ -1280,29 +1356,6 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
12801356
disk_put_zone_wplug(zwplug);
12811357
}
12821358

1283-
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
1284-
{
1285-
switch (zone->cond) {
1286-
case BLK_ZONE_COND_IMP_OPEN:
1287-
case BLK_ZONE_COND_EXP_OPEN:
1288-
case BLK_ZONE_COND_CLOSED:
1289-
return zone->wp - zone->start;
1290-
case BLK_ZONE_COND_FULL:
1291-
return zone->len;
1292-
case BLK_ZONE_COND_EMPTY:
1293-
return 0;
1294-
case BLK_ZONE_COND_NOT_WP:
1295-
case BLK_ZONE_COND_OFFLINE:
1296-
case BLK_ZONE_COND_READONLY:
1297-
default:
1298-
/*
1299-
* Conventional, offline and read-only zones do not have a valid
1300-
* write pointer.
1301-
*/
1302-
return UINT_MAX;
1303-
}
1304-
}
1305-
13061359
static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
13071360
unsigned int idx, void *data)
13081361
{
@@ -1866,6 +1919,48 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
18661919
}
18671920
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
18681921

1922+
/**
1923+
* blk_zone_issue_zeroout - zero-fill a block range in a zone
1924+
* @bdev: blockdev to write
1925+
* @sector: start sector
1926+
* @nr_sects: number of sectors to write
1927+
* @gfp_mask: memory allocation flags (for bio_alloc)
1928+
*
1929+
* Description:
1930+
* Zero-fill a block range in a zone (@sector must be equal to the zone write
1931+
* pointer), handling potential errors due to the (initially unknown) lack of
1932+
* hardware offload (See blkdev_issue_zeroout()).
1933+
*/
1934+
int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
1935+
sector_t nr_sects, gfp_t gfp_mask)
1936+
{
1937+
int ret;
1938+
1939+
if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
1940+
return -EIO;
1941+
1942+
ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
1943+
BLKDEV_ZERO_NOFALLBACK);
1944+
if (ret != -EOPNOTSUPP)
1945+
return ret;
1946+
1947+
/*
1948+
* The failed call to blkdev_issue_zeroout() advanced the zone write
1949+
* pointer. Undo this using a report zone to update the zone write
1950+
* pointer to the correct current value.
1951+
*/
1952+
ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector);
1953+
if (ret != 1)
1954+
return ret < 0 ? ret : -EIO;
1955+
1956+
/*
1957+
* Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
1958+
* regular write with zero-pages.
1959+
*/
1960+
return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
1961+
}
1962+
EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
1963+
18691964
#ifdef CONFIG_BLK_DEBUG_FS
18701965

18711966
int queue_zone_wplugs_show(void *data, struct seq_file *m)

drivers/md/dm-zoned-reclaim.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,9 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
7676
* pointer and the requested position.
7777
*/
7878
nr_blocks = block - wp_block;
79-
ret = blkdev_issue_zeroout(dev->bdev,
80-
dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
81-
dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
79+
ret = blk_zone_issue_zeroout(dev->bdev,
80+
dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
81+
dmz_blk2sect(nr_blocks), GFP_NOIO);
8282
if (ret) {
8383
dmz_dev_err(dev,
8484
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",

include/linux/blkdev.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1421,6 +1421,9 @@ static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
14211421
return is_seq;
14221422
}
14231423

1424+
int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
1425+
sector_t nr_sects, gfp_t gfp_mask);
1426+
14241427
static inline unsigned int queue_dma_alignment(const struct request_queue *q)
14251428
{
14261429
return q->limits.dma_alignment;

0 commit comments

Comments
 (0)