
Commit b99fdcd

Ming Lei authored and Mike Snitzer committed
dm: support bio polling
Support bio polling (REQ_POLLED) with the following approach:

1) Only support IO polling on normal READ/WRITE; other abnormal IOs still fall back to IRQ mode, so the target io (and DM's clone bio) is exactly inside the dm_io.

2) Hold one extra refcount on io->io_count after submitting this dm bio with REQ_POLLED.

3) Support DM's native bio splitting: every dm_io instance associated with the current bio is added to a list whose head reuses bio->bi_private, which is restored before ending this bio.

4) Implement the .poll_bio() callback: call bio_poll() on the single target bio inside the dm_io, which is retrieved via the dm_io list stored in bio->bi_private; call dm_io_dec_pending() in .poll_bio() once the target io is done.

5) Enable QUEUE_FLAG_POLL only if all underlying queues enable QUEUE_FLAG_POLL, which is based on Jeffle's previous patch.

These changes yield a 30-35% IOPS improvement for polled IO. For detailed test results please see (Jens, thanks for testing!):

https://listman.redhat.com/archives/dm-devel/2022-March/049868.html
or https://marc.info/?l=linux-block&m=164684246214700&w=2

Tested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
1 parent 69fe0f2 commit b99fdcd
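
For context (not part of this commit): the new dm_poll_bio() path is only exercised when userspace issues high-priority I/O, which the block layer tags with REQ_POLLED. The usual way to generate such I/O is io_uring in IOPOLL mode against an O_DIRECT file descriptor. Below is a minimal sketch, assuming liburing is available; the device path /dev/dm-0, the queue depth, and the buffer size are illustrative placeholders, not anything mandated by the patch.

/*
 * Minimal polled-read sketch (assumes liburing; /dev/dm-0 is a placeholder).
 * Build with something like: gcc poll_read.c -o poll_read -luring
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <liburing.h>

#define BUF_SZ 4096

int main(int argc, char **argv)
{
        const char *dev = argc > 1 ? argv[1] : "/dev/dm-0";
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        void *buf;
        int fd, ret;

        /* IOPOLL mode requires O_DIRECT */
        fd = open(dev, O_RDONLY | O_DIRECT);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        if (posix_memalign(&buf, BUF_SZ, BUF_SZ))
                return 1;

        /* IORING_SETUP_IOPOLL: reads are submitted as REQ_POLLED bios */
        ret = io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL);
        if (ret < 0) {
                fprintf(stderr, "queue_init: %s\n", strerror(-ret));
                return 1;
        }

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_read(sqe, fd, buf, BUF_SZ, 0);
        io_uring_submit(&ring);

        /* completion is reaped by polling; on a dm device this ends up in dm_poll_bio() */
        ret = io_uring_wait_cqe(&ring, &cqe);
        if (!ret) {
                printf("polled read returned %d\n", cqe->res);
                io_uring_cqe_seen(&ring, cqe);
        }

        io_uring_queue_exit(&ring);
        close(fd);
        free(buf);
        return 0;
}

fio's io_uring engine with hipri=1 drives the same polled path.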

3 files changed

Lines changed: 169 additions & 3 deletions


drivers/md/dm-core.h

Lines changed: 2 additions & 0 deletions
@@ -235,6 +235,8 @@ struct dm_io {
         bool start_io_acct:1;
         int was_accounted;
         unsigned long start_time;
+        void *data;
+        struct hlist_node node;
         spinlock_t endio_lock;
         struct dm_stats_aux stats_aux;
         /* last member of dm_target_io is 'struct bio' */

drivers/md/dm-table.c

Lines changed: 27 additions & 0 deletions
@@ -1481,6 +1481,14 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
         return &t->targets[(KEYS_PER_NODE * n) + k];
 }
 
+static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev,
+                                   sector_t start, sector_t len, void *data)
+{
+        struct request_queue *q = bdev_get_queue(dev->bdev);
+
+        return !test_bit(QUEUE_FLAG_POLL, &q->queue_flags);
+}
+
 /*
  * type->iterate_devices() should be called when the sanity check needs to
  * iterate and check all underlying data devices. iterate_devices() will
@@ -1531,6 +1539,11 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
         return 0;
 }
 
+static int dm_table_supports_poll(struct dm_table *t)
+{
+        return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL);
+}
+
 /*
  * Check whether a table has no data devices attached using each
  * target's iterate_devices method.
@@ -2067,6 +2080,20 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
         dm_update_crypto_profile(q, t);
         disk_update_readahead(t->md->disk);
 
+        /*
+         * Check for request-based device is left to
+         * dm_mq_init_request_queue()->blk_mq_init_allocated_queue().
+         *
+         * For bio-based device, only set QUEUE_FLAG_POLL when all
+         * underlying devices supporting polling.
+         */
+        if (__table_type_bio_based(t->type)) {
+                if (dm_table_supports_poll(t))
+                        blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+                else
+                        blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+        }
+
         return 0;
 }
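
Whether QUEUE_FLAG_POLL ended up set on the bio-based device can be checked from userspace through the queue's io_poll sysfs attribute. A small sketch (the device name dm-0 is a placeholder):

#include <stdio.h>

int main(void)
{
        /* "dm-0" is a placeholder; io_poll reflects QUEUE_FLAG_POLL */
        FILE *f = fopen("/sys/block/dm-0/queue/io_poll", "r");
        int enabled = 0;

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fscanf(f, "%d", &enabled) != 1)
                enabled = 0;
        fclose(f);

        printf("bio polling %s\n", enabled ? "enabled" : "disabled");
        return 0;
}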

drivers/md/dm.c

Lines changed: 140 additions & 3 deletions
@@ -40,6 +40,13 @@
 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
 #define DM_COOKIE_LENGTH 24
 
+/*
+ * For REQ_POLLED fs bio, this flag is set if we link mapped underlying
+ * dm_io into one list, and reuse bio->bi_private as the list head. Before
+ * ending this fs bio, we will recover its ->bi_private.
+ */
+#define REQ_DM_POLL_LIST        REQ_DRV
+
 static const char *_name = DM_NAME;
 
 static unsigned int major = 0;
@@ -73,6 +80,7 @@ struct clone_info {
         struct dm_io *io;
         sector_t sector;
         unsigned sector_count;
+        bool submit_as_polled;
 };
 
 #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
@@ -599,6 +607,9 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
                 if (!clone)
                         return NULL;
 
+                /* REQ_DM_POLL_LIST shouldn't be inherited */
+                clone->bi_opf &= ~REQ_DM_POLL_LIST;
+
                 tio = clone_to_tio(clone);
                 tio->inside_dm_io = false;
         }
@@ -888,8 +899,15 @@ void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
                 if (unlikely(wq_has_sleeper(&md->wait)))
                         wake_up(&md->wait);
 
-                if (io_error == BLK_STS_DM_REQUEUE)
+                if (io_error == BLK_STS_DM_REQUEUE) {
+                        /*
+                         * Upper layer won't help us poll split bio, io->orig_bio
+                         * may only reflect a subset of the pre-split original,
+                         * so clear REQ_POLLED in case of requeue
+                         */
+                        bio->bi_opf &= ~REQ_POLLED;
                         return;
+                }
 
                 if (bio_is_flush_with_data(bio)) {
                         /*
@@ -1440,6 +1458,47 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
         return true;
 }
 
+/*
+ * Reuse ->bi_private as hlist head for storing all dm_io instances
+ * associated with this bio, and this bio's bi_private needs to be
+ * stored in dm_io->data before the reuse.
+ *
+ * bio->bi_private is owned by fs or upper layer, so block layer won't
+ * touch it after splitting. Meantime it won't be changed by anyone after
+ * bio is submitted. So this reuse is safe.
+ */
+static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio)
+{
+        return (struct hlist_head *)&bio->bi_private;
+}
+
+static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
+{
+        struct hlist_head *head = dm_get_bio_hlist_head(bio);
+
+        if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
+                bio->bi_opf |= REQ_DM_POLL_LIST;
+                /*
+                 * Save .bi_private into dm_io, so that we can reuse
+                 * .bi_private as hlist head for storing dm_io list
+                 */
+                io->data = bio->bi_private;
+
+                INIT_HLIST_HEAD(head);
+
+                /* tell block layer to poll for completion */
+                bio->bi_cookie = ~BLK_QC_T_NONE;
+        } else {
+                /*
+                 * bio recursed due to split, reuse original poll list,
+                 * and save bio->bi_private too.
+                 */
+                io->data = hlist_entry(head->first, struct dm_io, node)->data;
+        }
+
+        hlist_add_head(&io->node, head);
+}
+
 /*
  * Select the correct strategy for processing a non-flush bio.
  */
@@ -1457,6 +1516,12 @@ static int __split_and_process_bio(struct clone_info *ci)
         if (__process_abnormal_io(ci, ti, &r))
                 return r;
 
+        /*
+         * Only support bio polling for normal IO, and the target io is
+         * exactly inside the dm_io instance (verified in dm_poll_dm_io)
+         */
+        ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED;
+
         len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
         clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
         __map_bio(clone);
@@ -1473,6 +1538,7 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
         ci->map = map;
         ci->io = alloc_io(md, bio);
         ci->bio = bio;
+        ci->submit_as_polled = false;
         ci->sector = bio->bi_iter.bi_sector;
         ci->sector_count = bio_sectors(bio);
 
@@ -1522,8 +1588,17 @@ static void dm_split_and_process_bio(struct mapped_device *md,
         if (ci.io->start_io_acct)
                 dm_start_io_acct(ci.io, NULL);
 
-        /* drop the extra reference count */
-        dm_io_dec_pending(ci.io, errno_to_blk_status(error));
+        /*
+         * Drop the extra reference count for non-POLLED bio, and hold one
+         * reference for POLLED bio, which will be released in dm_poll_bio
+         *
+         * Add every dm_io instance into the hlist_head which is stored in
+         * bio->bi_private, so that dm_poll_bio can poll them all.
+         */
+        if (error || !ci.submit_as_polled)
+                dm_io_dec_pending(ci.io, errno_to_blk_status(error));
+        else
+                dm_queue_poll_io(bio, ci.io);
 }
 
 static void dm_submit_bio(struct bio *bio)
@@ -1558,6 +1633,67 @@ static void dm_submit_bio(struct bio *bio)
         dm_put_live_table(md, srcu_idx);
 }
 
+static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
+                          unsigned int flags)
+{
+        WARN_ON_ONCE(!io->tio.inside_dm_io);
+
+        /* don't poll if the mapped io is done */
+        if (atomic_read(&io->io_count) > 1)
+                bio_poll(&io->tio.clone, iob, flags);
+
+        /* bio_poll holds the last reference */
+        return atomic_read(&io->io_count) == 1;
+}
+
+static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
+                       unsigned int flags)
+{
+        struct hlist_head *head = dm_get_bio_hlist_head(bio);
+        struct hlist_head tmp = HLIST_HEAD_INIT;
+        struct hlist_node *next;
+        struct dm_io *io;
+
+        /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */
+        if (!(bio->bi_opf & REQ_DM_POLL_LIST))
+                return 0;
+
+        WARN_ON_ONCE(hlist_empty(head));
+
+        hlist_move_list(head, &tmp);
+
+        /*
+         * Restore .bi_private before possibly completing dm_io.
+         *
+         * bio_poll() is only possible once @bio has been completely
+         * submitted via submit_bio_noacct()'s depth-first submission.
+         * So there is no dm_queue_poll_io() race associated with
+         * clearing REQ_DM_POLL_LIST here.
+         */
+        bio->bi_opf &= ~REQ_DM_POLL_LIST;
+        bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data;
+
+        hlist_for_each_entry_safe(io, next, &tmp, node) {
+                if (dm_poll_dm_io(io, iob, flags)) {
+                        hlist_del_init(&io->node);
+                        /*
+                         * clone_endio() has already occurred, so passing
+                         * error as 0 here doesn't override io->status
+                         */
+                        dm_io_dec_pending(io, 0);
+                }
+        }
+
+        /* Not done? */
+        if (!hlist_empty(&tmp)) {
+                bio->bi_opf |= REQ_DM_POLL_LIST;
+                /* Reset bio->bi_private to dm_io list head */
+                hlist_move_list(&tmp, head);
+                return 0;
+        }
+        return 1;
+}
+
 /*-----------------------------------------------------------------
  * An IDR is used to keep track of allocated minor numbers.
  *---------------------------------------------------------------*/
@@ -2983,6 +3119,7 @@ static const struct pr_ops dm_pr_ops = {
 
 static const struct block_device_operations dm_blk_dops = {
         .submit_bio = dm_submit_bio,
+        .poll_bio = dm_poll_bio,
         .open = dm_blk_open,
         .release = dm_blk_close,
         .ioctl = dm_blk_ioctl,
