Skip to content

Commit e2dd8ac

Browse files
jthornber authored and Mike Snitzer committed
dm bio prison v1: improve concurrent IO performance
Split the bio prison into multiple regions, with a separate rbtree and associated lock for each region. To get fast bio prison locking and not damage the performance of discards too much the bio-prison now stipulates that discards should not cross a BIO_PRISON_MAX_RANGE boundary. Because the range of a key (block_end - block_begin) must not exceed BIO_PRISON_MAX_RANGE: break_up_discard_bio() now ensures the data range reflected in PHYSICAL key doesn't exceed BIO_PRISON_MAX_RANGE. And splitting the thin target's discards (handled with VIRTUAL key) is achieved by updating dm-thin.c to set limits->max_discard_sectors in terms of BIO_PRISON_MAX_RANGE _and_ setting the thin and thin-pool targets' max_discard_granularity to true. Signed-off-by: Joe Thornber <ejt@redhat.com> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
1 parent 06961c4 commit e2dd8ac

3 files changed

Lines changed: 121 additions & 68 deletions

File tree

drivers/md/dm-bio-prison-v1.c

Lines changed: 57 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,17 @@
1616

1717
/*----------------------------------------------------------------*/
1818

19+
#define NR_LOCKS 64
20+
#define LOCK_MASK (NR_LOCKS - 1)
1921
#define MIN_CELLS 1024
2022

21-
struct dm_bio_prison {
23+
struct prison_region {
2224
spinlock_t lock;
23-
struct rb_root cells;
25+
struct rb_root cell;
26+
} ____cacheline_aligned_in_smp;
27+
28+
struct dm_bio_prison {
29+
struct prison_region regions[NR_LOCKS];
2430
mempool_t cell_pool;
2531
};
2632

@@ -34,22 +40,24 @@ static struct kmem_cache *_cell_cache;
3440
*/
3541
struct dm_bio_prison *dm_bio_prison_create(void)
3642
{
37-
struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
3843
int ret;
44+
unsigned i;
45+
struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
3946

4047
if (!prison)
4148
return NULL;
4249

43-
spin_lock_init(&prison->lock);
50+
for (i = 0; i < NR_LOCKS; i++) {
51+
spin_lock_init(&prison->regions[i].lock);
52+
prison->regions[i].cell = RB_ROOT;
53+
}
4454

4555
ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache);
4656
if (ret) {
4757
kfree(prison);
4858
return NULL;
4959
}
5060

51-
prison->cells = RB_ROOT;
52-
5361
return prison;
5462
}
5563
EXPORT_SYMBOL_GPL(dm_bio_prison_create);
@@ -107,14 +115,26 @@ static int cmp_keys(struct dm_cell_key *lhs,
107115
return 0;
108116
}
109117

110-
static int __bio_detain(struct dm_bio_prison *prison,
118+
static unsigned lock_nr(struct dm_cell_key *key)
119+
{
120+
return (key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & LOCK_MASK;
121+
}
122+
123+
static void check_range(struct dm_cell_key *key)
124+
{
125+
BUG_ON(key->block_end - key->block_begin > BIO_PRISON_MAX_RANGE);
126+
BUG_ON((key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) !=
127+
((key->block_end - 1) >> BIO_PRISON_MAX_RANGE_SHIFT));
128+
}
129+
130+
static int __bio_detain(struct rb_root *root,
111131
struct dm_cell_key *key,
112132
struct bio *inmate,
113133
struct dm_bio_prison_cell *cell_prealloc,
114134
struct dm_bio_prison_cell **cell_result)
115135
{
116136
int r;
117-
struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
137+
struct rb_node **new = &root->rb_node, *parent = NULL;
118138

119139
while (*new) {
120140
struct dm_bio_prison_cell *cell =
@@ -139,7 +159,7 @@ static int __bio_detain(struct dm_bio_prison *prison,
139159
*cell_result = cell_prealloc;
140160

141161
rb_link_node(&cell_prealloc->node, parent, new);
142-
rb_insert_color(&cell_prealloc->node, &prison->cells);
162+
rb_insert_color(&cell_prealloc->node, root);
143163

144164
return 0;
145165
}
@@ -151,10 +171,12 @@ static int bio_detain(struct dm_bio_prison *prison,
151171
struct dm_bio_prison_cell **cell_result)
152172
{
153173
int r;
174+
unsigned l = lock_nr(key);
175+
check_range(key);
154176

155-
spin_lock_irq(&prison->lock);
156-
r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
157-
spin_unlock_irq(&prison->lock);
177+
spin_lock_irq(&prison->regions[l].lock);
178+
r = __bio_detain(&prison->regions[l].cell, key, inmate, cell_prealloc, cell_result);
179+
spin_unlock_irq(&prison->regions[l].lock);
158180

159181
return r;
160182
}
@@ -181,11 +203,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell);
181203
/*
182204
* @inmates must have been initialised prior to this call
183205
*/
184-
static void __cell_release(struct dm_bio_prison *prison,
206+
static void __cell_release(struct rb_root *root,
185207
struct dm_bio_prison_cell *cell,
186208
struct bio_list *inmates)
187209
{
188-
rb_erase(&cell->node, &prison->cells);
210+
rb_erase(&cell->node, root);
189211

190212
if (inmates) {
191213
if (cell->holder)
@@ -198,32 +220,35 @@ void dm_cell_release(struct dm_bio_prison *prison,
198220
struct dm_bio_prison_cell *cell,
199221
struct bio_list *bios)
200222
{
201-
spin_lock_irq(&prison->lock);
202-
__cell_release(prison, cell, bios);
203-
spin_unlock_irq(&prison->lock);
223+
unsigned l = lock_nr(&cell->key);
224+
225+
spin_lock_irq(&prison->regions[l].lock);
226+
__cell_release(&prison->regions[l].cell, cell, bios);
227+
spin_unlock_irq(&prison->regions[l].lock);
204228
}
205229
EXPORT_SYMBOL_GPL(dm_cell_release);
206230

207231
/*
208232
* Sometimes we don't want the holder, just the additional bios.
209233
*/
210-
static void __cell_release_no_holder(struct dm_bio_prison *prison,
234+
static void __cell_release_no_holder(struct rb_root *root,
211235
struct dm_bio_prison_cell *cell,
212236
struct bio_list *inmates)
213237
{
214-
rb_erase(&cell->node, &prison->cells);
238+
rb_erase(&cell->node, root);
215239
bio_list_merge(inmates, &cell->bios);
216240
}
217241

218242
void dm_cell_release_no_holder(struct dm_bio_prison *prison,
219243
struct dm_bio_prison_cell *cell,
220244
struct bio_list *inmates)
221245
{
246+
unsigned l = lock_nr(&cell->key);
222247
unsigned long flags;
223248

224-
spin_lock_irqsave(&prison->lock, flags);
225-
__cell_release_no_holder(prison, cell, inmates);
226-
spin_unlock_irqrestore(&prison->lock, flags);
249+
spin_lock_irqsave(&prison->regions[l].lock, flags);
250+
__cell_release_no_holder(&prison->regions[l].cell, cell, inmates);
251+
spin_unlock_irqrestore(&prison->regions[l].lock, flags);
227252
}
228253
EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
229254

@@ -248,18 +273,19 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
248273
void *context,
249274
struct dm_bio_prison_cell *cell)
250275
{
251-
spin_lock_irq(&prison->lock);
276+
unsigned l = lock_nr(&cell->key);
277+
spin_lock_irq(&prison->regions[l].lock);
252278
visit_fn(context, cell);
253-
rb_erase(&cell->node, &prison->cells);
254-
spin_unlock_irq(&prison->lock);
279+
rb_erase(&cell->node, &prison->regions[l].cell);
280+
spin_unlock_irq(&prison->regions[l].lock);
255281
}
256282
EXPORT_SYMBOL_GPL(dm_cell_visit_release);
257283

258-
static int __promote_or_release(struct dm_bio_prison *prison,
284+
static int __promote_or_release(struct rb_root *root,
259285
struct dm_bio_prison_cell *cell)
260286
{
261287
if (bio_list_empty(&cell->bios)) {
262-
rb_erase(&cell->node, &prison->cells);
288+
rb_erase(&cell->node, root);
263289
return 1;
264290
}
265291

@@ -271,10 +297,11 @@ int dm_cell_promote_or_release(struct dm_bio_prison *prison,
271297
struct dm_bio_prison_cell *cell)
272298
{
273299
int r;
300+
unsigned l = lock_nr(&cell->key);
274301

275-
spin_lock_irq(&prison->lock);
276-
r = __promote_or_release(prison, cell);
277-
spin_unlock_irq(&prison->lock);
302+
spin_lock_irq(&prison->regions[l].lock);
303+
r = __promote_or_release(&prison->regions[l].cell, cell);
304+
spin_unlock_irq(&prison->regions[l].lock);
278305

279306
return r;
280307
}

drivers/md/dm-bio-prison-v1.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,16 @@ struct dm_cell_key {
3434
dm_block_t block_begin, block_end;
3535
};
3636

37+
/*
38+
* The range of a key (block_end - block_begin) must not
39+
* exceed BIO_PRISON_MAX_RANGE. Also the range must not
40+
* cross a similarly sized boundary.
41+
*
42+
* Must be a power of 2.
43+
*/
44+
#define BIO_PRISON_MAX_RANGE 1024
45+
#define BIO_PRISON_MAX_RANGE_SHIFT 10
46+
3747
/*
3848
* Treat this as opaque, only in header so callers can manage allocation
3949
* themselves.

drivers/md/dm-thin.c

Lines changed: 54 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1674,54 +1674,69 @@ static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t
16741674
struct dm_cell_key data_key;
16751675
struct dm_bio_prison_cell *data_cell;
16761676
struct dm_thin_new_mapping *m;
1677-
dm_block_t virt_begin, virt_end, data_begin;
1677+
dm_block_t virt_begin, virt_end, data_begin, data_end;
1678+
dm_block_t len, next_boundary;
16781679

16791680
while (begin != end) {
1680-
r = ensure_next_mapping(pool);
1681-
if (r)
1682-
/* we did our best */
1683-
return;
1684-
16851681
r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
16861682
&data_begin, &maybe_shared);
1687-
if (r)
1683+
if (r) {
16881684
/*
16891685
* Silently fail, letting any mappings we've
16901686
* created complete.
16911687
*/
16921688
break;
1693-
1694-
build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
1695-
if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
1696-
/* contention, we'll give up with this range */
1697-
begin = virt_end;
1698-
continue;
16991689
}
17001690

1701-
/*
1702-
* IO may still be going to the destination block. We must
1703-
* quiesce before we can do the removal.
1704-
*/
1705-
m = get_next_mapping(pool);
1706-
m->tc = tc;
1707-
m->maybe_shared = maybe_shared;
1708-
m->virt_begin = virt_begin;
1709-
m->virt_end = virt_end;
1710-
m->data_block = data_begin;
1711-
m->cell = data_cell;
1712-
m->bio = bio;
1691+
data_end = data_begin + (virt_end - virt_begin);
17131692

17141693
/*
1715-
* The parent bio must not complete before sub discard bios are
1716-
* chained to it (see end_discard's bio_chain)!
1717-
*
1718-
* This per-mapping bi_remaining increment is paired with
1719-
* the implicit decrement that occurs via bio_endio() in
1720-
* end_discard().
1694+
* Make sure the data region obeys the bio prison restrictions.
17211695
*/
1722-
bio_inc_remaining(bio);
1723-
if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1724-
pool->process_prepared_discard(m);
1696+
while (data_begin < data_end) {
1697+
r = ensure_next_mapping(pool);
1698+
if (r)
1699+
return; /* we did our best */
1700+
1701+
next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + 1)
1702+
<< BIO_PRISON_MAX_RANGE_SHIFT;
1703+
len = min_t(sector_t, data_end - data_begin, next_boundary - data_begin);
1704+
1705+
build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key);
1706+
if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
1707+
/* contention, we'll give up with this range */
1708+
data_begin += len;
1709+
continue;
1710+
}
1711+
1712+
/*
1713+
* IO may still be going to the destination block. We must
1714+
* quiesce before we can do the removal.
1715+
*/
1716+
m = get_next_mapping(pool);
1717+
m->tc = tc;
1718+
m->maybe_shared = maybe_shared;
1719+
m->virt_begin = virt_begin;
1720+
m->virt_end = virt_begin + len;
1721+
m->data_block = data_begin;
1722+
m->cell = data_cell;
1723+
m->bio = bio;
1724+
1725+
/*
1726+
* The parent bio must not complete before sub discard bios are
1727+
* chained to it (see end_discard's bio_chain)!
1728+
*
1729+
* This per-mapping bi_remaining increment is paired with
1730+
* the implicit decrement that occurs via bio_endio() in
1731+
* end_discard().
1732+
*/
1733+
bio_inc_remaining(bio);
1734+
if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1735+
pool->process_prepared_discard(m);
1736+
1737+
virt_begin += len;
1738+
data_begin += len;
1739+
}
17251740

17261741
begin = virt_end;
17271742
}
@@ -3380,13 +3395,13 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
33803395
*/
33813396
if (pf.discard_enabled && pf.discard_passdown) {
33823397
ti->num_discard_bios = 1;
3383-
33843398
/*
33853399
* Setting 'discards_supported' circumvents the normal
33863400
* stacking of discard limits (this keeps the pool and
33873401
* thin devices' discard limits consistent).
33883402
*/
33893403
ti->discards_supported = true;
3404+
ti->max_discard_granularity = true;
33903405
}
33913406
ti->private = pt;
33923407

@@ -4096,7 +4111,7 @@ static struct target_type pool_target = {
40964111
.name = "thin-pool",
40974112
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
40984113
DM_TARGET_IMMUTABLE,
4099-
.version = {1, 22, 0},
4114+
.version = {1, 23, 0},
41004115
.module = THIS_MODULE,
41014116
.ctr = pool_ctr,
41024117
.dtr = pool_dtr,
@@ -4261,6 +4276,7 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
42614276
if (tc->pool->pf.discard_enabled) {
42624277
ti->discards_supported = true;
42634278
ti->num_discard_bios = 1;
4279+
ti->max_discard_granularity = true;
42644280
}
42654281

42664282
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -4476,12 +4492,12 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
44764492
return;
44774493

44784494
limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
4479-
limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
4495+
limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
44804496
}
44814497

44824498
static struct target_type thin_target = {
44834499
.name = "thin",
4484-
.version = {1, 22, 0},
4500+
.version = {1, 23, 0},
44854501
.module = THIS_MODULE,
44864502
.ctr = thin_ctr,
44874503
.dtr = thin_dtr,

0 commit comments

Comments (0)