@@ -1516,6 +1516,256 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 		raid10_write_request(mddev, bio, r10_bio);
 }
 
+static struct bio *raid10_split_bio(struct r10conf *conf,
+			struct bio *bio, sector_t sectors, bool want_first)
+{
+	struct bio *split;
+
+	split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split);
+	bio_chain(split, bio);
+	allow_barrier(conf);
+	if (want_first) {
+		submit_bio_noacct(bio);
+		bio = split;
+	} else
+		submit_bio_noacct(split);
+	wait_barrier(conf);
+
+	return bio;
+}
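A userspace model of the helper above, as a sketch only: it assumes the usual bio_split() behaviour, where the returned bio covers the first `sectors` sectors and the original bio keeps the remainder, and it ignores the chaining and barrier handling. It just shows which part the caller keeps and which part is resubmitted for each value of want_first.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model only: a "bio" is just a [start, end) sector range. */
struct range { unsigned long long start, end; };

/*
 * Mirrors the decision in raid10_split_bio(): cut off the first 'sectors'
 * sectors; keep that first part when want_first is true, otherwise keep
 * the remainder. The other part is handed back for immediate resubmission.
 */
static struct range split_keep(struct range bio, unsigned long long sectors,
			       bool want_first, struct range *resubmit)
{
	struct range first = { bio.start, bio.start + sectors };
	struct range rest  = { bio.start + sectors, bio.end };

	*resubmit = want_first ? rest : first;
	return want_first ? first : rest;
}

int main(void)
{
	struct range bio = { 100, 1000 }, other;
	struct range kept = split_keep(bio, 28, false, &other);

	printf("kept [%llu, %llu), resubmitted [%llu, %llu)\n",
	       kept.start, kept.end, other.start, other.end);
	return 0;
}

With want_first == false the unaligned head is resubmitted and the aligned remainder is kept, which is how raid10_handle_discard() uses the helper below.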
+
+static void raid10_end_discard_request(struct bio *bio)
+{
+	struct r10bio *r10_bio = bio->bi_private;
+	struct r10conf *conf = r10_bio->mddev->private;
+	struct md_rdev *rdev = NULL;
+	int dev;
+	int slot, repl;
+
+	/*
+	 * We don't care about the return value of the discard bio.
+	 */
+	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+		set_bit(R10BIO_Uptodate, &r10_bio->state);
+
+	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
+	if (repl)
+		rdev = conf->mirrors[dev].replacement;
+	if (!rdev) {
+		/*
+		 * raid10_remove_disk uses smp_mb to make sure rdev is set to
+		 * replacement before setting replacement to NULL. It is safe
+		 * to read rdev first without barrier protection even if
+		 * replacement is NULL.
+		 */
+		smp_rmb();
+		rdev = conf->mirrors[dev].rdev;
+	}
+
+	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		md_write_end(r10_bio->mddev);
+		raid_end_bio_io(r10_bio);
+	}
+
+	rdev_dec_pending(rdev, conf->mddev);
+}
+
+/*
+ * There are some limitations on handling discard bios:
+ * 1st, the discard size must be bigger than stripe_size*2.
+ * 2nd, if the discard bio spans reshape progress, we use the old way to
+ * handle the discard bio.
+ */
+static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+	struct r10conf *conf = mddev->private;
+	struct geom *geo = &conf->geo;
+	struct r10bio *r10_bio;
+
+	int disk;
+	sector_t chunk;
+	unsigned int stripe_size;
+	sector_t split_size;
+
+	sector_t bio_start, bio_end;
+	sector_t first_stripe_index, last_stripe_index;
+	sector_t start_disk_offset;
+	unsigned int start_disk_index;
+	sector_t end_disk_offset;
+	unsigned int end_disk_index;
+	unsigned int remainder;
+
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		return -EAGAIN;
+
+	wait_barrier(conf);
+
+	/*
+	 * Check reshape again to avoid a reshape that starts after checking
+	 * MD_RECOVERY_RESHAPE and before wait_barrier.
+	 */
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		goto out;
+
+	stripe_size = geo->raid_disks << geo->chunk_shift;
+	bio_start = bio->bi_iter.bi_sector;
+	bio_end = bio_end_sector(bio);
+
+	/*
+	 * A discard bio may be smaller than the stripe size, or it may cross
+	 * one stripe while the discard region is larger than one stripe size.
+	 * For the far offset layout, if the discard region is not aligned with
+	 * the stripe size, there are holes when we submit the discard bios to
+	 * the member disks. For simplicity, we only handle discard bios whose
+	 * discard region is bigger than stripe_size*2.
+	 */
+	if (bio_sectors(bio) < stripe_size*2)
+		goto out;
+
+	/*
+	 * For the far offset layout, if the bio is not aligned with the stripe
+	 * size, split off the parts that are not aligned with the stripe size.
+	 */
+	div_u64_rem(bio_start, stripe_size, &remainder);
+	if (geo->far_offset && remainder) {
+		split_size = stripe_size - remainder;
+		bio = raid10_split_bio(conf, bio, split_size, false);
+	}
+	div_u64_rem(bio_end, stripe_size, &remainder);
+	if (geo->far_offset && remainder) {
+		split_size = bio_sectors(bio) - remainder;
+		bio = raid10_split_bio(conf, bio, split_size, true);
+	}
+
+	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
+	r10_bio->mddev = mddev;
+	r10_bio->state = 0;
+	r10_bio->sectors = 0;
+	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
+
+	wait_blocked_dev(mddev, r10_bio);
+
+	r10_bio->master_bio = bio;
+
+	bio_start = bio->bi_iter.bi_sector;
+	bio_end = bio_end_sector(bio);
+
+	/*
+	 * raid10 uses chunks as the unit to store data. It is similar to
+	 * raid0. One stripe contains the chunks from all member disks (one
+	 * chunk from one disk at the same HBA address). For layout details,
+	 * see 'man 4 md'.
+	 */
+	chunk = bio_start >> geo->chunk_shift;
+	chunk *= geo->near_copies;
+	first_stripe_index = chunk;
+	start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
+	if (geo->far_offset)
+		first_stripe_index *= geo->far_copies;
+	start_disk_offset = (bio_start & geo->chunk_mask) +
+			    (first_stripe_index << geo->chunk_shift);
+
+	chunk = bio_end >> geo->chunk_shift;
+	chunk *= geo->near_copies;
+	last_stripe_index = chunk;
+	end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
+	if (geo->far_offset)
+		last_stripe_index *= geo->far_copies;
+	end_disk_offset = (bio_end & geo->chunk_mask) +
+			  (last_stripe_index << geo->chunk_shift);
+
+	rcu_read_lock();
+	for (disk = 0; disk < geo->raid_disks; disk++) {
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		struct md_rdev *rrdev = rcu_dereference(
+			conf->mirrors[disk].replacement);
+
+		r10_bio->devs[disk].bio = NULL;
+		r10_bio->devs[disk].repl_bio = NULL;
+
+		if (rdev && (test_bit(Faulty, &rdev->flags)))
+			rdev = NULL;
+		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
+			rrdev = NULL;
+		if (!rdev && !rrdev)
+			continue;
+
+		if (rdev) {
+			r10_bio->devs[disk].bio = bio;
+			atomic_inc(&rdev->nr_pending);
+		}
+		if (rrdev) {
+			r10_bio->devs[disk].repl_bio = bio;
+			atomic_inc(&rrdev->nr_pending);
+		}
+	}
+	rcu_read_unlock();
+
+	atomic_set(&r10_bio->remaining, 1);
+	for (disk = 0; disk < geo->raid_disks; disk++) {
+		sector_t dev_start, dev_end;
+		struct bio *mbio, *rbio = NULL;
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		struct md_rdev *rrdev = rcu_dereference(
+			conf->mirrors[disk].replacement);
+
+		/*
+		 * Now start to calculate the start and end addresses for each
+		 * disk. The space between dev_start and dev_end is the discard
+		 * region.
+		 *
+		 * For dev_start, there are three conditions to consider:
+		 * 1st, the disk is before start_disk_index; you can imagine
+		 * the disk is in the next stripe, so dev_start is the start
+		 * address of the next stripe.
+		 * 2nd, the disk is after start_disk_index; it means the disk
+		 * is in the same stripe as the first disk.
+		 * 3rd, the disk is the first disk itself; we can use
+		 * start_disk_offset directly.
+		 */
+		if (disk < start_disk_index)
+			dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
+		else if (disk > start_disk_index)
+			dev_start = first_stripe_index * mddev->chunk_sectors;
+		else
+			dev_start = start_disk_offset;
+
+		if (disk < end_disk_index)
+			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+		else if (disk > end_disk_index)
+			dev_end = last_stripe_index * mddev->chunk_sectors;
+		else
+			dev_end = end_disk_offset;
+
+		/*
+		 * This only handles discard bios whose size is >= stripe size,
+		 * so dev_end > dev_start all the time.
+		 */
+		if (r10_bio->devs[disk].bio) {
+			mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+			mbio->bi_end_io = raid10_end_discard_request;
+			mbio->bi_private = r10_bio;
+			r10_bio->devs[disk].bio = mbio;
+			r10_bio->devs[disk].devnum = disk;
+			atomic_inc(&r10_bio->remaining);
+			md_submit_discard_bio(mddev, rdev, mbio,
+					dev_start + choose_data_offset(r10_bio, rdev),
+					dev_end - dev_start);
+			bio_endio(mbio);
+		}
+		if (r10_bio->devs[disk].repl_bio) {
+			rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+			rbio->bi_end_io = raid10_end_discard_request;
+			rbio->bi_private = r10_bio;
+			r10_bio->devs[disk].repl_bio = rbio;
+			r10_bio->devs[disk].devnum = disk;
+			atomic_inc(&r10_bio->remaining);
+			md_submit_discard_bio(mddev, rrdev, rbio,
+					dev_start + choose_data_offset(r10_bio, rrdev),
+					dev_end - dev_start);
+			bio_endio(rbio);
+		}
+	}
+
+	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		md_write_end(r10_bio->mddev);
+		raid_end_bio_io(r10_bio);
+	}
+
+	return 0;
+out:
+	allow_barrier(conf);
+	return -EAGAIN;
+}
+
 static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
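The chunk/stripe arithmetic in raid10_handle_discard() above is easier to follow with concrete numbers. Below is a standalone sketch of that arithmetic for an assumed geometry (4 disks, near_copies = 2, far_copies = 1, no far offset, 1024-sector chunks); the geometry values and the 8MiB discard are made up for illustration, and sector_div() is spelled out as / and %. Only the index math is modelled, not the barrier, RCU, or bio handling.

#include <stdio.h>

/* Illustrative geometry: 4 disks, near_copies=2, chunk = 1024 sectors (512KiB). */
#define RAID_DISKS	4
#define NEAR_COPIES	2
#define CHUNK_SECTORS	1024ULL
#define CHUNK_SHIFT	10
#define CHUNK_MASK	(CHUNK_SECTORS - 1)

int main(void)
{
	unsigned long long bio_start = 0, bio_end = 16384;	/* 8 MiB discard */
	unsigned long long chunk, first_stripe_index, last_stripe_index;
	unsigned long long start_disk_offset, end_disk_offset;
	unsigned int start_disk_index, end_disk_index, disk;

	/* Same arithmetic as raid10_handle_discard(), with sector_div()
	 * written out as division and remainder. */
	chunk = (bio_start >> CHUNK_SHIFT) * NEAR_COPIES;
	start_disk_index = chunk % RAID_DISKS;
	first_stripe_index = chunk / RAID_DISKS;
	start_disk_offset = (bio_start & CHUNK_MASK) +
			    (first_stripe_index << CHUNK_SHIFT);

	chunk = (bio_end >> CHUNK_SHIFT) * NEAR_COPIES;
	end_disk_index = chunk % RAID_DISKS;
	last_stripe_index = chunk / RAID_DISKS;
	end_disk_offset = (bio_end & CHUNK_MASK) +
			  (last_stripe_index << CHUNK_SHIFT);

	for (disk = 0; disk < RAID_DISKS; disk++) {
		unsigned long long dev_start, dev_end;

		if (disk < start_disk_index)
			dev_start = (first_stripe_index + 1) * CHUNK_SECTORS;
		else if (disk > start_disk_index)
			dev_start = first_stripe_index * CHUNK_SECTORS;
		else
			dev_start = start_disk_offset;

		if (disk < end_disk_index)
			dev_end = (last_stripe_index + 1) * CHUNK_SECTORS;
		else if (disk > end_disk_index)
			dev_end = last_stripe_index * CHUNK_SECTORS;
		else
			dev_end = end_disk_offset;

		printf("disk %u: discard sectors [%llu, %llu)\n",
		       disk, dev_start, dev_end);
	}
	return 0;
}

For this geometry every disk ends up discarding sectors [0, 8192), i.e. 4MiB per disk, which matches two copies of the 8MiB logical region.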
@@ -1530,6 +1780,10 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 	if (!md_write_start(mddev, bio))
 		return false;
 
+	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
+		if (!raid10_handle_discard(mddev, bio))
+			return true;
+
 	/*
 	 * If this request crosses a chunk boundary, we need to split
 	 * it.
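For reference, a discard that reaches this hook typically originates from userspace, for example from fstrim on a mounted filesystem or directly via the BLKDISCARD ioctl on the array device. A minimal sketch of the direct route, assuming a hypothetical /dev/md0 raid10 array and a 16MiB discard at offset 0 (the ioctl range is given in bytes):

#include <fcntl.h>
#include <linux/fs.h>	/* BLKDISCARD */
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical raid10 array device; adjust to your setup. */
	int fd = open("/dev/md0", O_RDWR);
	/* range[0] = start offset in bytes, range[1] = length in bytes */
	uint64_t range[2] = { 0, 16ULL << 20 };

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BLKDISCARD, &range) < 0)
		perror("BLKDISCARD");
	close(fd);
	return 0;
}

When raid10_handle_discard() returns -EAGAIN (reshape in progress, or a region smaller than two stripes), the hunk above simply falls through to the existing request path, so the discard still completes, just without the per-disk fast path.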
@@ -3769,7 +4023,7 @@ static int raid10_run(struct mddev *mddev)
 
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
-					      mddev->chunk_sectors);
+					      UINT_MAX);
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
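Raising the limit to UINT_MAX sectors means the block layer no longer has to split discards down to a single chunk before they reach raid10; the driver now splits and fans them out itself. One way to observe the advertised limit (a sketch, assuming a hypothetical array named md0; the sysfs value is reported in bytes and may still be clamped by the member devices' own discard limits):

#include <stdio.h>

int main(void)
{
	/* Hypothetical array name; discard_max_bytes reports the limit in bytes. */
	const char *path = "/sys/block/md0/queue/discard_max_bytes";
	unsigned long long max_bytes = 0;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%llu", &max_bytes) != 1)
		max_bytes = 0;
	fclose(f);
	printf("max discard per request: %llu bytes\n", max_bytes);
	return 0;
}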