
Commit 216c8f5

Ming Lei authored and axboe committed
ublk: replace monitor with cancelable uring_cmd
Monitor work introduces one extra context for handling abort, which easily causes races and also adds extra delay when handling aborting. Now that cancelable uring_cmd is supported, use it instead:

1) the cancel callback is run either from the uring_cmd submission task context or after the io_uring context has exited, so the callback runs exclusively with ublk_ch_uring_cmd() and __ublk_rq_task_work().

2) the previous patch freezes the request queue when calling ublk_abort_queue(), which is now completely exclusive with ublk_queue_rq() and ublk_ch_uring_cmd()/__ublk_rq_task_work().

3) in the timeout handler, if all IOs are in-flight, then all uring commands have been completed and uring command canceling can't provide forward progress any more, so call ublk_abort_requests() from the timeout handler.

This simplifies aborting the queue, and is helpful for adding new features, such as relaxing the limit of using a single task for handling one queue.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20231009093324.957829-7-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent bd23f6c · commit 216c8f5

1 file changed: drivers/block/ublk_drv.c (119 additions, 89 deletions)
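For orientation before the diff: a driver opting into cancelable uring_cmd does two things at submission time, stash routing data in the command's pdu and mark the command cancelable, then handles the IO_URING_F_CANCEL re-issue in its ->uring_cmd() handler. Below is a minimal sketch of that pattern, not the ublk code itself: the my_* types and lookup helpers are hypothetical stand-ins; io_uring_cmd_mark_cancelable(), io_uring_cmd_done() and IO_URING_F_CANCEL are the real io_uring API this commit relies on, declared in <linux/io_uring.h> in this kernel era.

#include <linux/io_uring.h>

struct my_cmd_pdu {
	struct my_queue *q;	/* hypothetical per-queue state */
	u16 tag;
};

static inline struct my_cmd_pdu *my_pdu(struct io_uring_cmd *cmd)
{
	/* per-command scratch space inside io_uring_cmd */
	return (struct my_cmd_pdu *)&cmd->pdu;
}

static int my_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	/*
	 * Cancel path: invoked from the submission task context or after
	 * the io_uring context has exited, so it never races the normal
	 * path below.
	 */
	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
		io_uring_cmd_done(cmd, -ECANCELED, 0, issue_flags);
		return 0;
	}

	/* Normal path: record who owns the command, then opt in. */
	my_pdu(cmd)->q = my_queue_from_cmd(cmd);	/* hypothetical lookup */
	my_pdu(cmd)->tag = my_tag_from_cmd(cmd);	/* hypothetical lookup */
	io_uring_cmd_mark_cancelable(cmd, issue_flags);
	return -EIOCBQUEUED;
}

ublk follows this shape: ublk_prep_cancel() fills pdu->ubq and pdu->tag before __ublk_ch_uring_cmd() returns -EIOCBQUEUED, and ublk_ch_uring_cmd() routes IO_URING_F_CANCEL to ublk_uring_cmd_cancel_fn(), which completes the command with UBLK_IO_RES_ABORT.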
@@ -75,6 +75,7 @@ struct ublk_rq_data {
 
 struct ublk_uring_cmd_pdu {
 	struct ublk_queue *ubq;
+	u16 tag;
 };
 
 /*
@@ -141,14 +142,13 @@ struct ublk_queue {
 	unsigned int max_io_sz;
 	bool force_abort;
 	bool timeout;
+	bool canceling;
 	unsigned short nr_io_ready;	/* how many ios setup */
 	spinlock_t		cancel_lock;
 	struct ublk_device *dev;
 	struct ublk_io ios[];
 };
 
-#define UBLK_DAEMON_MONITOR_PERIOD	(5 * HZ)
-
 struct ublk_device {
 	struct gendisk		*ub_disk;
 
@@ -179,11 +179,6 @@ struct ublk_device {
 	unsigned int		nr_queues_ready;
 	unsigned int		nr_privileged_daemon;
 
-	/*
-	 * Our ubq->daemon may be killed without any notification, so
-	 * monitor each queue's daemon periodically
-	 */
-	struct delayed_work	monitor_work;
 	struct work_struct	quiesce_work;
 	struct work_struct	stop_work;
 };
@@ -194,10 +189,11 @@ struct ublk_params_header {
 	__u32	types;
 };
 
+static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
+
 static inline unsigned int ublk_req_build_flags(struct request *req);
 static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
 						   int tag);
-
 static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
 {
 	return ub->dev_info.flags & UBLK_F_USER_COPY;
@@ -1123,8 +1119,6 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
 		blk_mq_requeue_request(rq, false);
 	else
 		blk_mq_end_request(rq, BLK_STS_IOERR);
-
-	mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
 }
 
 static inline void __ublk_rq_task_work(struct request *req,
@@ -1245,30 +1239,28 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
 	io = &ubq->ios[rq->tag];
 	/*
 	 * If the check pass, we know that this is a re-issued request aborted
-	 * previously in monitor_work because the ubq_daemon(cmd's task) is
+	 * previously in cancel fn because the ubq_daemon(cmd's task) is
 	 * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
 	 * because this ioucmd's io_uring context may be freed now if no inflight
 	 * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
 	 *
-	 * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
+	 * Note: cancel fn sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
 	 * the tag). Then the request is re-started(allocating the tag) and we are here.
 	 * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
 	 * guarantees that here is a re-issued request aborted previously.
 	 */
 	if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) {
 		ublk_abort_io_cmds(ubq);
 	} else {
-		struct io_uring_cmd *cmd = io->cmd;
-		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
-
-		pdu->ubq = ubq;
-		io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+		io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
 	}
 }
 
 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
 {
 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
+	unsigned int nr_inflight = 0;
+	int i;
 
 	if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
 		if (!ubq->timeout) {
@@ -1279,6 +1271,29 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
 		return BLK_EH_DONE;
 	}
 
+	if (!ubq_daemon_is_dying(ubq))
+		return BLK_EH_RESET_TIMER;
+
+	for (i = 0; i < ubq->q_depth; i++) {
+		struct ublk_io *io = &ubq->ios[i];
+
+		if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
+			nr_inflight++;
+	}
+
+	/* cancelable uring_cmd can't help us if all commands are in-flight */
+	if (nr_inflight == ubq->q_depth) {
+		struct ublk_device *ub = ubq->dev;
+
+		if (ublk_abort_requests(ub, ubq)) {
+			if (ublk_can_use_recovery(ub))
+				schedule_work(&ub->quiesce_work);
+			else
+				schedule_work(&ub->stop_work);
+		}
+		return BLK_EH_DONE;
+	}
+
 	return BLK_EH_RESET_TIMER;
 }
 
@@ -1308,7 +1323,7 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	blk_mq_start_request(bd->rq);
 
-	if (unlikely(ubq_daemon_is_dying(ubq))) {
+	if (unlikely(ubq->canceling)) {
 		__ublk_abort_rq(ubq, rq);
 		return BLK_STS_OK;
 	}
@@ -1416,9 +1431,9 @@ static void ublk_commit_completion(struct ublk_device *ub,
 }
 
 /*
- * When ->ubq_daemon is exiting, either new request is ended immediately,
- * or any queued io command is drained, so it is safe to abort queue
- * lockless
+ * Called from ubq_daemon context via cancel fn, meantime quiesce ublk
+ * blk-mq queue, so we are called exclusively with blk-mq and ubq_daemon
+ * context, so everything is serialized.
  */
 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
 {
@@ -1441,10 +1456,17 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
 	}
 }
 
-static bool ublk_abort_requests(struct ublk_device *ub)
+static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
 {
 	struct gendisk *disk;
-	int i;
+
+	spin_lock(&ubq->cancel_lock);
+	if (ubq->canceling) {
+		spin_unlock(&ubq->cancel_lock);
+		return false;
+	}
+	ubq->canceling = true;
+	spin_unlock(&ubq->cancel_lock);
 
 	spin_lock(&ub->lock);
 	disk = ub->ub_disk;
@@ -1458,53 +1480,69 @@ static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
 
 	/* Now we are serialized with ublk_queue_rq() */
 	blk_mq_quiesce_queue(disk->queue);
-	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
-		struct ublk_queue *ubq = ublk_get_queue(ub, i);
-
-		if (ubq_daemon_is_dying(ubq)) {
-			/* abort queue is for making forward progress */
-			ublk_abort_queue(ub, ubq);
-		}
-	}
+	/* abort queue is for making forward progress */
+	ublk_abort_queue(ub, ubq);
 	blk_mq_unquiesce_queue(disk->queue);
 	put_device(disk_to_dev(disk));
 
 	return true;
 }
 
-static void ublk_daemon_monitor_work(struct work_struct *work)
+static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
+		unsigned int issue_flags)
 {
-	struct ublk_device *ub =
-		container_of(work, struct ublk_device, monitor_work.work);
-	int i;
+	bool done;
 
-	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
-		struct ublk_queue *ubq = ublk_get_queue(ub, i);
+	if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
+		return;
 
-		if (ubq_daemon_is_dying(ubq))
-			goto found;
-	}
-	return;
+	spin_lock(&ubq->cancel_lock);
+	done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
+	if (!done)
+		io->flags |= UBLK_IO_FLAG_CANCELED;
+	spin_unlock(&ubq->cancel_lock);
+
+	if (!done)
+		io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
+}
+
+/*
+ * The ublk char device won't be closed when calling cancel fn, so both
+ * ublk device and queue are guaranteed to be live
+ */
+static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+	struct ublk_queue *ubq = pdu->ubq;
+	struct task_struct *task;
+	struct ublk_device *ub;
+	bool need_schedule;
+	struct ublk_io *io;
 
-found:
-	if (!ublk_abort_requests(ub))
+	if (WARN_ON_ONCE(!ubq))
 		return;
 
-	if (ublk_can_use_recovery(ub))
-		schedule_work(&ub->quiesce_work);
-	else
-		schedule_work(&ub->stop_work);
+	if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
+		return;
 
-	/*
-	 * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE.
-	 * after ublk_remove() or __ublk_quiesce_dev() is started.
-	 *
-	 * No need ub->mutex, monitor work are canceled after state is marked
-	 * as not LIVE, so new state is observed reliably.
-	 */
-	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
-		schedule_delayed_work(&ub->monitor_work,
-				UBLK_DAEMON_MONITOR_PERIOD);
+	task = io_uring_cmd_get_task(cmd);
+	if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
+		return;
+
+	ub = ubq->dev;
+	need_schedule = ublk_abort_requests(ub, ubq);
+
+	io = &ubq->ios[pdu->tag];
+	WARN_ON_ONCE(io->cmd != cmd);
+	ublk_cancel_cmd(ubq, &ubq->ios[pdu->tag], issue_flags);
+
+	if (need_schedule) {
+		if (ublk_can_use_recovery(ub))
+			schedule_work(&ub->quiesce_work);
+		else
+			schedule_work(&ub->stop_work);
+	}
 }
 
 static inline bool ublk_queue_ready(struct ublk_queue *ubq)
@@ -1516,24 +1554,8 @@ static void ublk_cancel_queue(struct ublk_queue *ubq)
 {
 	int i;
 
-	for (i = 0; i < ubq->q_depth; i++) {
-		struct ublk_io *io = &ubq->ios[i];
-
-		if (io->flags & UBLK_IO_FLAG_ACTIVE) {
-			bool done;
-
-			spin_lock(&ubq->cancel_lock);
-			done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
-			if (!done)
-				io->flags |= UBLK_IO_FLAG_CANCELED;
-			spin_unlock(&ubq->cancel_lock);
-
-			if (!done)
-				io_uring_cmd_done(io->cmd,
-						UBLK_IO_RES_ABORT, 0,
-						IO_URING_F_UNLOCKED);
-		}
-	}
+	for (i = 0; i < ubq->q_depth; i++)
+		ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED);
 }
 
 /* Cancel all pending commands, must be called after del_gendisk() returns */
@@ -1580,15 +1602,6 @@ static void __ublk_quiesce_dev(struct ublk_device *ub)
 	blk_mq_quiesce_queue(ub->ub_disk->queue);
 	ublk_wait_tagset_rqs_idle(ub);
 	ub->dev_info.state = UBLK_S_DEV_QUIESCED;
-	/* we are going to release task_struct of ubq_daemon and resets
-	 * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF.
-	 * Besides, monitor_work is not necessary in QUIESCED state since we have
-	 * already scheduled quiesce_work and quiesced all ubqs.
-	 *
-	 * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel
-	 * it here and re-schedule it in END_USER_RECOVERY to avoid UAF.
-	 */
-	cancel_delayed_work_sync(&ub->monitor_work);
 }
 
 static void ublk_quiesce_work_fn(struct work_struct *work)
@@ -1651,7 +1664,6 @@ static void ublk_stop_dev(struct ublk_device *ub)
 unlock:
 	mutex_unlock(&ub->mutex);
 	ublk_cancel_dev(ub);
-	cancel_delayed_work_sync(&ub->monitor_work);
 }
 
 /* device can only be started after all IOs are ready */
@@ -1702,6 +1714,21 @@ static inline void ublk_fill_io_cmd(struct ublk_io *io,
 	io->addr = buf_addr;
 }
 
+static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
+				    unsigned int issue_flags,
+				    struct ublk_queue *ubq, unsigned int tag)
+{
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+	/*
+	 * Safe to refer to @ubq since ublk_queue won't be died until its
+	 * commands are completed
+	 */
+	pdu->ubq = ubq;
+	pdu->tag = tag;
+	io_uring_cmd_mark_cancelable(cmd, issue_flags);
+}
+
 static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 		unsigned int issue_flags,
 		const struct ublksrv_io_cmd *ub_cmd)
@@ -1817,6 +1844,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 	default:
 		goto out;
 	}
+	ublk_prep_cancel(cmd, issue_flags, ubq, tag);
 	return -EIOCBQUEUED;
 
  out:
@@ -1884,6 +1912,11 @@ static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
 
 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
+	if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
+		ublk_uring_cmd_cancel_fn(cmd, issue_flags);
+		return 0;
+	}
+
 	/* well-implemented server won't run into unlocked */
 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
 		io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
@@ -2215,8 +2248,6 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 	if (wait_for_completion_interruptible(&ub->completion) != 0)
 		return -EINTR;
 
-	schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
-
 	mutex_lock(&ub->mutex);
 	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
 	    test_bit(UB_STATE_USED, &ub->state)) {
@@ -2393,7 +2424,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 	spin_lock_init(&ub->lock);
 	INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
 	INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
-	INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
 
 	ret = ublk_alloc_dev_number(ub, header->dev_id);
 	if (ret < 0)
@@ -2647,6 +2677,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
 	/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
 	ubq->ubq_daemon = NULL;
 	ubq->timeout = false;
+	ubq->canceling = false;
 
 	for (i = 0; i < ubq->q_depth; i++) {
 		struct ublk_io *io = &ubq->ios[i];
@@ -2732,7 +2763,6 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
 			__func__, header->dev_id);
 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
 	ub->dev_info.state = UBLK_S_DEV_LIVE;
-	schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
 	ret = 0;
  out_unlock:
 	mutex_unlock(&ub->mutex);
