Skip to content

Commit fcc6eaa

Browse files
damien-lemoal authored
and axboe committed
zloop: introduce the ordered_zone_append configuration parameter
The zone append operation processing for zloop devices is similar to any other command, that is, the operation is processed as a command work item, without any special serialization between the work items (beside the zone mutex for mutually exclusive code sections). This processing is fine and gives excellent performance. However, it has a side effect: zone append operation are very often reordered and processed in a sequence that is very different from their issuing order by the user. This effect is very visible using an XFS file system on top of a zloop device. A simple file write leads to many file extents as the data writes using zone append are reordered and so result in the physical order being different than the file logical order. E.g. executing: $ dd if=/dev/zero of=/mnt/test bs=1M count=10 && sync $ xfs_bmap /mnt/test /mnt/test: 0: [0..4095]: 2162688..2166783 1: [4096..6143]: 2168832..2170879 2: [6144..8191]: 2166784..2168831 3: [8192..10239]: 2170880..2172927 4: [10240..12287]: 2174976..2177023 5: [12288..14335]: 2172928..2174975 6: [14336..20479]: 2177024..2183167 For 10 IOs, 6 extents are created. This is fine and actually allows to exercise XFS zone garbage collection very well. However, this also makes debugging/working on XFS data placement harder as the underlying device will most of the time reorder IOs, resulting in many file extents. Allow a user to mitigate this with the new ordered_zone_append configuration parameter. For a zloop device created with this parameter specified, the sector of a zone append command is set early, when the command is submitted by the block layer with the zloop_queue_rq() function, instead of in the zloop_rw() function which is exectued later in the command work item context. This change ensures that more often than not, zone append operations data end up being written in the same order as the command submission by the user. In the case of XFS, this leads to far less file data extents. 
E.g., for the previous example, we get a single file data extent for the written file. $ dd if=/dev/zero of=/mnt/test bs=1M count=10 && sync $ xfs_bmap /mnt/test /mnt/test: 0: [0..20479]: 2162688..2183167 Since we cannot use a mutex in the context of the zloop_queue_rq() function to atomically set a zone append operation sector to the target zone write pointer location and increment that write pointer, a new per-zone spinlock is introduced to protect zone write pointer accesses and modifications. To check a zone write pointer location and set a zone append operation target sector to that value, the function zloop_set_zone_append_sector() is introduced and called from zloop_queue_rq(). Signed-off-by: Damien Le Moal <dlemoal@kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 9236c5f commit fcc6eaa

1 file changed

Lines changed: 96 additions & 12 deletions

File tree

drivers/block/zloop.c

Lines changed: 96 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ enum {
3333
ZLOOP_OPT_QUEUE_DEPTH = (1 << 7),
3434
ZLOOP_OPT_BUFFERED_IO = (1 << 8),
3535
ZLOOP_OPT_ZONE_APPEND = (1 << 9),
36+
ZLOOP_OPT_ORDERED_ZONE_APPEND = (1 << 10),
3637
};
3738

3839
static const match_table_t zloop_opt_tokens = {
@@ -46,6 +47,7 @@ static const match_table_t zloop_opt_tokens = {
4647
{ ZLOOP_OPT_QUEUE_DEPTH, "queue_depth=%u" },
4748
{ ZLOOP_OPT_BUFFERED_IO, "buffered_io" },
4849
{ ZLOOP_OPT_ZONE_APPEND, "zone_append=%u" },
50+
{ ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" },
4951
{ ZLOOP_OPT_ERR, NULL }
5052
};
5153

@@ -59,6 +61,7 @@ static const match_table_t zloop_opt_tokens = {
5961
#define ZLOOP_DEF_QUEUE_DEPTH 128
6062
#define ZLOOP_DEF_BUFFERED_IO false
6163
#define ZLOOP_DEF_ZONE_APPEND true
64+
#define ZLOOP_DEF_ORDERED_ZONE_APPEND false
6265

6366
/* Arbitrary limit on the zone size (16GB). */
6467
#define ZLOOP_MAX_ZONE_SIZE_MB 16384
@@ -75,6 +78,7 @@ struct zloop_options {
7578
unsigned int queue_depth;
7679
bool buffered_io;
7780
bool zone_append;
81+
bool ordered_zone_append;
7882
};
7983

8084
/*
@@ -96,6 +100,7 @@ struct zloop_zone {
96100

97101
unsigned long flags;
98102
struct mutex lock;
103+
spinlock_t wp_lock;
99104
enum blk_zone_cond cond;
100105
sector_t start;
101106
sector_t wp;
@@ -113,6 +118,7 @@ struct zloop_device {
113118
struct workqueue_struct *workqueue;
114119
bool buffered_io;
115120
bool zone_append;
121+
bool ordered_zone_append;
116122

117123
const char *base_dir;
118124
struct file *data_dir;
@@ -152,6 +158,7 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
152158
struct zloop_zone *zone = &zlo->zones[zone_no];
153159
struct kstat stat;
154160
sector_t file_sectors;
161+
unsigned long flags;
155162
int ret;
156163

157164
lockdep_assert_held(&zone->lock);
@@ -177,6 +184,7 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
177184
return -EINVAL;
178185
}
179186

187+
spin_lock_irqsave(&zone->wp_lock, flags);
180188
if (!file_sectors) {
181189
zone->cond = BLK_ZONE_COND_EMPTY;
182190
zone->wp = zone->start;
@@ -187,6 +195,7 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
187195
zone->cond = BLK_ZONE_COND_CLOSED;
188196
zone->wp = zone->start + file_sectors;
189197
}
198+
spin_unlock_irqrestore(&zone->wp_lock, flags);
190199

191200
return 0;
192201
}
@@ -230,6 +239,7 @@ static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
230239
static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
231240
{
232241
struct zloop_zone *zone = &zlo->zones[zone_no];
242+
unsigned long flags;
233243
int ret = 0;
234244

235245
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
@@ -248,10 +258,12 @@ static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
248258
break;
249259
case BLK_ZONE_COND_IMP_OPEN:
250260
case BLK_ZONE_COND_EXP_OPEN:
261+
spin_lock_irqsave(&zone->wp_lock, flags);
251262
if (zone->wp == zone->start)
252263
zone->cond = BLK_ZONE_COND_EMPTY;
253264
else
254265
zone->cond = BLK_ZONE_COND_CLOSED;
266+
spin_unlock_irqrestore(&zone->wp_lock, flags);
255267
break;
256268
case BLK_ZONE_COND_EMPTY:
257269
case BLK_ZONE_COND_FULL:
@@ -269,6 +281,7 @@ static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
269281
static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
270282
{
271283
struct zloop_zone *zone = &zlo->zones[zone_no];
284+
unsigned long flags;
272285
int ret = 0;
273286

274287
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
@@ -286,9 +299,11 @@ static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
286299
goto unlock;
287300
}
288301

302+
spin_lock_irqsave(&zone->wp_lock, flags);
289303
zone->cond = BLK_ZONE_COND_EMPTY;
290304
zone->wp = zone->start;
291305
clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
306+
spin_unlock_irqrestore(&zone->wp_lock, flags);
292307

293308
unlock:
294309
mutex_unlock(&zone->lock);
@@ -313,6 +328,7 @@ static int zloop_reset_all_zones(struct zloop_device *zlo)
313328
static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
314329
{
315330
struct zloop_zone *zone = &zlo->zones[zone_no];
331+
unsigned long flags;
316332
int ret = 0;
317333

318334
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
@@ -330,9 +346,11 @@ static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
330346
goto unlock;
331347
}
332348

349+
spin_lock_irqsave(&zone->wp_lock, flags);
333350
zone->cond = BLK_ZONE_COND_FULL;
334351
zone->wp = ULLONG_MAX;
335352
clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
353+
spin_unlock_irqrestore(&zone->wp_lock, flags);
336354

337355
unlock:
338356
mutex_unlock(&zone->lock);
@@ -374,6 +392,7 @@ static void zloop_rw(struct zloop_cmd *cmd)
374392
struct zloop_zone *zone;
375393
struct iov_iter iter;
376394
struct bio_vec tmp;
395+
unsigned long flags;
377396
sector_t zone_end;
378397
int nr_bvec = 0;
379398
int ret;
@@ -416,19 +435,30 @@ static void zloop_rw(struct zloop_cmd *cmd)
416435
if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
417436
mutex_lock(&zone->lock);
418437

438+
spin_lock_irqsave(&zone->wp_lock, flags);
439+
419440
/*
420441
* Zone append operations always go at the current write
421442
* pointer, but regular write operations must already be
422443
* aligned to the write pointer when submitted.
423444
*/
424445
if (is_append) {
425-
if (zone->cond == BLK_ZONE_COND_FULL) {
426-
ret = -EIO;
427-
goto unlock;
446+
/*
447+
* If ordered zone append is in use, we already checked
448+
* and set the target sector in zloop_queue_rq().
449+
*/
450+
if (!zlo->ordered_zone_append) {
451+
if (zone->cond == BLK_ZONE_COND_FULL) {
452+
spin_unlock_irqrestore(&zone->wp_lock,
453+
flags);
454+
ret = -EIO;
455+
goto unlock;
456+
}
457+
sector = zone->wp;
428458
}
429-
sector = zone->wp;
430459
cmd->sector = sector;
431460
} else if (sector != zone->wp) {
461+
spin_unlock_irqrestore(&zone->wp_lock, flags);
432462
pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
433463
zone_no, sector, zone->wp);
434464
ret = -EIO;
@@ -441,15 +471,19 @@ static void zloop_rw(struct zloop_cmd *cmd)
441471
zone->cond = BLK_ZONE_COND_IMP_OPEN;
442472

443473
/*
444-
* Advance the write pointer. If the write fails, the write
445-
* pointer position will be corrected when the next I/O starts
446-
* execution.
474+
* Advance the write pointer, unless ordered zone append is in
475+
* use. If the write fails, the write pointer position will be
476+
* corrected when the next I/O starts execution.
447477
*/
448-
zone->wp += nr_sectors;
449-
if (zone->wp == zone_end) {
450-
zone->cond = BLK_ZONE_COND_FULL;
451-
zone->wp = ULLONG_MAX;
478+
if (!is_append || !zlo->ordered_zone_append) {
479+
zone->wp += nr_sectors;
480+
if (zone->wp == zone_end) {
481+
zone->cond = BLK_ZONE_COND_FULL;
482+
zone->wp = ULLONG_MAX;
483+
}
452484
}
485+
486+
spin_unlock_irqrestore(&zone->wp_lock, flags);
453487
}
454488

455489
rq_for_each_bvec(tmp, rq, rq_iter)
@@ -623,6 +657,35 @@ static void zloop_complete_rq(struct request *rq)
623657
blk_mq_end_request(rq, sts);
624658
}
625659

660+
static bool zloop_set_zone_append_sector(struct request *rq)
661+
{
662+
struct zloop_device *zlo = rq->q->queuedata;
663+
unsigned int zone_no = rq_zone_no(rq);
664+
struct zloop_zone *zone = &zlo->zones[zone_no];
665+
sector_t zone_end = zone->start + zlo->zone_capacity;
666+
sector_t nr_sectors = blk_rq_sectors(rq);
667+
unsigned long flags;
668+
669+
spin_lock_irqsave(&zone->wp_lock, flags);
670+
671+
if (zone->cond == BLK_ZONE_COND_FULL ||
672+
zone->wp + nr_sectors > zone_end) {
673+
spin_unlock_irqrestore(&zone->wp_lock, flags);
674+
return false;
675+
}
676+
677+
rq->__sector = zone->wp;
678+
zone->wp += blk_rq_sectors(rq);
679+
if (zone->wp >= zone_end) {
680+
zone->cond = BLK_ZONE_COND_FULL;
681+
zone->wp = ULLONG_MAX;
682+
}
683+
684+
spin_unlock_irqrestore(&zone->wp_lock, flags);
685+
686+
return true;
687+
}
688+
626689
static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
627690
const struct blk_mq_queue_data *bd)
628691
{
@@ -633,6 +696,16 @@ static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
633696
if (zlo->state == Zlo_deleting)
634697
return BLK_STS_IOERR;
635698

699+
/*
700+
* If we need to strongly order zone append operations, set the request
701+
* sector to the zone write pointer location now instead of when the
702+
* command work runs.
703+
*/
704+
if (zlo->ordered_zone_append && req_op(rq) == REQ_OP_ZONE_APPEND) {
705+
if (!zloop_set_zone_append_sector(rq))
706+
return BLK_STS_IOERR;
707+
}
708+
636709
blk_mq_start_request(rq);
637710

638711
INIT_WORK(&cmd->work, zloop_cmd_workfn);
@@ -667,6 +740,7 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector,
667740
struct zloop_device *zlo = disk->private_data;
668741
struct blk_zone blkz = {};
669742
unsigned int first, i;
743+
unsigned long flags;
670744
int ret;
671745

672746
first = disk_zone_no(disk, sector);
@@ -690,7 +764,9 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector,
690764

691765
blkz.start = zone->start;
692766
blkz.len = zlo->zone_size;
767+
spin_lock_irqsave(&zone->wp_lock, flags);
693768
blkz.wp = zone->wp;
769+
spin_unlock_irqrestore(&zone->wp_lock, flags);
694770
blkz.cond = zone->cond;
695771
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
696772
blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
@@ -798,6 +874,7 @@ static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
798874
int ret;
799875

800876
mutex_init(&zone->lock);
877+
spin_lock_init(&zone->wp_lock);
801878
zone->start = (sector_t)zone_no << zlo->zone_shift;
802879

803880
if (!restore)
@@ -951,6 +1028,8 @@ static int zloop_ctl_add(struct zloop_options *opts)
9511028
zlo->nr_conv_zones = opts->nr_conv_zones;
9521029
zlo->buffered_io = opts->buffered_io;
9531030
zlo->zone_append = opts->zone_append;
1031+
if (zlo->zone_append)
1032+
zlo->ordered_zone_append = opts->ordered_zone_append;
9541033

9551034
zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
9561035
opts->nr_queues * opts->queue_depth, zlo->id);
@@ -1037,8 +1116,9 @@ static int zloop_ctl_add(struct zloop_options *opts)
10371116
zlo->id, zlo->nr_zones,
10381117
((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
10391118
zlo->block_size);
1040-
pr_info("zloop%d: using %s zone append\n",
1119+
pr_info("zloop%d: using %s%s zone append\n",
10411120
zlo->id,
1121+
zlo->ordered_zone_append ? "ordered " : "",
10421122
zlo->zone_append ? "native" : "emulated");
10431123

10441124
return 0;
@@ -1127,6 +1207,7 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
11271207
opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
11281208
opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
11291209
opts->zone_append = ZLOOP_DEF_ZONE_APPEND;
1210+
opts->ordered_zone_append = ZLOOP_DEF_ORDERED_ZONE_APPEND;
11301211

11311212
if (!buf)
11321213
return 0;
@@ -1248,6 +1329,9 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
12481329
}
12491330
opts->zone_append = token;
12501331
break;
1332+
case ZLOOP_OPT_ORDERED_ZONE_APPEND:
1333+
opts->ordered_zone_append = true;
1334+
break;
12511335
case ZLOOP_OPT_ERR:
12521336
default:
12531337
pr_warn("unknown parameter or missing value '%s'\n", p);

0 commit comments

Comments
 (0)