Commit 4adc13e
Merge tag 'for-7.0/block-stable-pages-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull bounce buffer dio for stable pages from Jens Axboe:
 "This adds support for bounce buffering of dio for stable pages. This
  was all done by Christoph. In his words:

  This series tries to address the problem that pages under I/O can be
  modified during direct I/O, even when the device or file system
  requires stable pages during I/O to calculate checksums, parity or
  data operations.

  It does so by adding block layer helpers to bounce buffer an iov_iter
  into a bio, then wires that up in iomap and ultimately XFS. The reason
  the file system even needs to know about it is that reads need a user
  context to copy the data back, and the infrastructure to defer ioends
  to a workqueue currently sits in XFS. I'm going to look into moving
  that into ioend and enabling it for other file systems. Additionally,
  btrfs already has its own infrastructure for this, and actually an
  urgent need to bounce buffer, so this should be useful there and could
  be wired up easily. In fact the idea comes from patches by Qu that did
  this in btrfs.

  This fixes all but one xfstests failure on T10 PI capable devices
  (generic/095 seems to still have issues with a mix of mmap and splice,
  which I'm looking into separately), and makes qemu VMs running
  Windows, or Linux with swap enabled, work fine on an XFS file on a
  device using PI.

  Performance numbers on my (not exactly state of the art) NVMe PI test
  setup:

  Sequential reads using io_uring, QD=16. Bandwidth and CPU usage
  (usr/sys):

  | size | zero copy                | bounce                   |
  +------+--------------------------+--------------------------+
  | 4k   | 1316MiB/s (12.65/55.40%) | 1081MiB/s (11.76/49.78%) |
  | 64K  | 3370MiB/s ( 5.46/18.20%) | 3365MiB/s ( 4.47/15.68%) |
  | 1M   | 3401MiB/s ( 0.76/23.05%) | 3400MiB/s ( 0.80/09.06%) |
  +------+--------------------------+--------------------------+

  Sequential writes using io_uring, QD=16. Bandwidth and CPU usage
  (usr/sys):

  | size | zero copy                | bounce                   |
  +------+--------------------------+--------------------------+
  | 4k   |  882MiB/s (11.83/33.88%) |  750MiB/s (10.53/34.08%) |
  | 64K  | 2009MiB/s ( 7.33/15.80%) | 2007MiB/s ( 7.47/24.71%) |
  | 1M   | 1992MiB/s ( 7.26/ 9.13%) | 1992MiB/s ( 9.21/19.11%) |
  +------+--------------------------+--------------------------+

  Note that the 64k read numbers look really odd to me for the baseline
  zero copy case, but they are reproducible over many repeated runs. The
  bounce read numbers should further improve when moving the PI
  validation to the file system and removing the double context switch,
  which I have patches for that will be sent out soon"

* tag 'for-7.0/block-stable-pages-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  xfs: use bounce buffering direct I/O when the device requires stable pages
  iomap: add a flag to bounce buffer direct I/O
  iomap: support ioends for direct reads
  iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED
  iomap: free the bio before completing the dio
  iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct
  iomap: split out the per-bio logic from iomap_dio_bio_iter
  iomap: simplify iomap_dio_bio_iter
  iomap: fix submission side handling of completion side errors
  block: add helpers to bounce buffer an iov_iter into bios
  block: remove bio_release_page
  iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  block: open code bio_add_page and fix handling of mismatching P2P ranges
  block: refactor get_contig_folio_len
  block: add a BIO_MAX_SIZE constant and use it
2 parents 0c00ed3 + 3373503
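As a reader's aid, here is a minimal sketch of how a direct I/O path might pair the two helpers this series adds. It is not part of the commit: example_dio_submit() and example_dio_end_io() are hypothetical names, and the is_error/mark_dirty arguments follow the kerneldoc of bio_iov_iter_unbounce() in the diff below.

	#include <linux/bio.h>
	#include <linux/uio.h>

	static int example_dio_submit(struct bio *bio, struct iov_iter *iter)
	{
		int ret;

		/*
		 * Instead of mapping the user pages zero-copy with
		 * bio_iov_iter_get_pages(), set up a bounce buffer: writes
		 * are copied into it here, reads pin the user pages for the
		 * copy-back on completion.
		 */
		ret = bio_iov_iter_bounce(bio, iter);
		if (ret)
			return ret;

		submit_bio(bio);
		return 0;
	}

	static void example_dio_end_io(struct bio *bio)
	{
		/*
		 * For reads this copies the bounced data back to the pinned
		 * user pages, so it must run in a context that may touch
		 * user memory, e.g. a workqueue-deferred ioend as the iomap
		 * patches in this series do.
		 */
		bio_iov_iter_unbounce(bio, bio->bi_status != BLK_STS_OK,
				bio_data_dir(bio) == READ);
		bio_put(bio);
	}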

13 files changed

Lines changed: 507 additions & 240 deletions

File tree

block/bio.c

Lines changed: 205 additions & 127 deletions
@@ -958,7 +958,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
 {
 	if (bio->bi_vcnt >= bio->bi_max_vecs)
 		return true;
-	if (bio->bi_iter.bi_size > UINT_MAX - len)
+	if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len)
 		return true;
 	return false;
 }
@@ -1064,7 +1064,7 @@ int bio_add_page(struct bio *bio, struct page *page,
 {
 	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
 		return 0;
-	if (bio->bi_iter.bi_size > UINT_MAX - len)
+	if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len)
 		return 0;
 
 	if (bio->bi_vcnt > 0) {
@@ -1091,7 +1091,7 @@ void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
 {
 	unsigned long nr = off / PAGE_SIZE;
 
-	WARN_ON_ONCE(len > UINT_MAX);
+	WARN_ON_ONCE(len > BIO_MAX_SIZE);
 	__bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE);
 }
 EXPORT_SYMBOL_GPL(bio_add_folio_nofail);
@@ -1115,7 +1115,7 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
 {
 	unsigned long nr = off / PAGE_SIZE;
 
-	if (len > UINT_MAX)
+	if (len > BIO_MAX_SIZE)
 		return false;
 	return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0;
 }
@@ -1206,122 +1206,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
 	bio_set_flag(bio, BIO_CLONED);
 }
 
-static unsigned int get_contig_folio_len(unsigned int *num_pages,
-		struct page **pages, unsigned int i,
-		struct folio *folio, size_t left,
-		size_t offset)
-{
-	size_t bytes = left;
-	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes);
-	unsigned int j;
-
-	/*
-	 * We might COW a single page in the middle of
-	 * a large folio, so we have to check that all
-	 * pages belong to the same folio.
-	 */
-	bytes -= contig_sz;
-	for (j = i + 1; j < i + *num_pages; j++) {
-		size_t next = min_t(size_t, PAGE_SIZE, bytes);
-
-		if (page_folio(pages[j]) != folio ||
-		    pages[j] != pages[j - 1] + 1) {
-			break;
-		}
-		contig_sz += next;
-		bytes -= next;
-	}
-	*num_pages = j - i;
-
-	return contig_sz;
-}
-
-#define PAGE_PTRS_PER_BVEC	(sizeof(struct bio_vec) / sizeof(struct page *))
-
-/**
- * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
- * @bio: bio to add pages to
- * @iter: iov iterator describing the region to be mapped
- *
- * Extracts pages from *iter and appends them to @bio's bvec array. The pages
- * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag.
- * For a multi-segment *iter, this function only adds pages from the next
- * non-empty segment of the iov iterator.
- */
-static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
-{
-	iov_iter_extraction_t extraction_flags = 0;
-	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
-	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
-	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
-	struct page **pages = (struct page **)bv;
-	ssize_t size;
-	unsigned int num_pages, i = 0;
-	size_t offset, folio_offset, left, len;
-	int ret = 0;
-
-	/*
-	 * Move page array up in the allocated memory for the bio vecs as far as
-	 * possible so that we can start filling biovecs from the beginning
-	 * without overwriting the temporary page array.
-	 */
-	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
-	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
-
-	if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
-		extraction_flags |= ITER_ALLOW_P2PDMA;
-
-	size = iov_iter_extract_pages(iter, &pages,
-				      UINT_MAX - bio->bi_iter.bi_size,
-				      nr_pages, extraction_flags, &offset);
-	if (unlikely(size <= 0))
-		return size ? size : -EFAULT;
-
-	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
-	for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
-		struct page *page = pages[i];
-		struct folio *folio = page_folio(page);
-		unsigned int old_vcnt = bio->bi_vcnt;
-
-		folio_offset = ((size_t)folio_page_idx(folio, page) <<
-			       PAGE_SHIFT) + offset;
-
-		len = min(folio_size(folio) - folio_offset, left);
-
-		num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
-
-		if (num_pages > 1)
-			len = get_contig_folio_len(&num_pages, pages, i,
-						   folio, left, offset);
-
-		if (!bio_add_folio(bio, folio, len, folio_offset)) {
-			WARN_ON_ONCE(1);
-			ret = -EINVAL;
-			goto out;
-		}
-
-		if (bio_flagged(bio, BIO_PAGE_PINNED)) {
-			/*
-			 * We're adding another fragment of a page that already
-			 * was part of the last segment. Undo our pin as the
-			 * page was pinned when an earlier fragment of it was
-			 * added to the bio and __bio_release_pages expects a
-			 * single pin per page.
-			 */
-			if (offset && bio->bi_vcnt == old_vcnt)
-				unpin_user_folio(folio, 1);
-		}
-		offset = 0;
-	}
-
-	iov_iter_revert(iter, left);
-out:
-	while (i < nr_pages)
-		bio_release_page(bio, pages[i++]);
-
-	return ret;
-}
-
 /*
  * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that
  * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length
@@ -1345,7 +1229,9 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
 			break;
 		}
 
-		bio_release_page(bio, bv->bv_page);
+		if (bio_flagged(bio, BIO_PAGE_PINNED))
+			unpin_user_page(bv->bv_page);
+
 		bio->bi_vcnt--;
 		nbytes -= bv->bv_len;
 	} while (nbytes);
@@ -1379,7 +1265,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
 		unsigned len_align_mask)
 {
-	int ret = 0;
+	iov_iter_extraction_t flags = 0;
 
 	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
 		return -EIO;
@@ -1392,13 +1278,205 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
 
 	if (iov_iter_extract_will_pin(iter))
 		bio_set_flag(bio, BIO_PAGE_PINNED);
+	if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
+		flags |= ITER_ALLOW_P2PDMA;
+
 	do {
-		ret = __bio_iov_iter_get_pages(bio, iter);
-	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
+		ssize_t ret;
+
+		ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec,
+				BIO_MAX_SIZE - bio->bi_iter.bi_size,
+				&bio->bi_vcnt, bio->bi_max_vecs, flags);
+		if (ret <= 0) {
+			if (!bio->bi_vcnt)
+				return ret;
+			break;
+		}
+		bio->bi_iter.bi_size += ret;
+	} while (iov_iter_count(iter) && !bio_full(bio, 0));
 
-	if (bio->bi_vcnt)
-		return bio_iov_iter_align_down(bio, iter, len_align_mask);
-	return ret;
+	if (is_pci_p2pdma_page(bio->bi_io_vec->bv_page))
+		bio->bi_opf |= REQ_NOMERGE;
+	return bio_iov_iter_align_down(bio, iter, len_align_mask);
+}
+
+static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size)
+{
+	struct folio *folio;
+
+	while (*size > PAGE_SIZE) {
+		folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size));
+		if (folio)
+			return folio;
+		*size = rounddown_pow_of_two(*size - 1);
+	}
+
+	return folio_alloc(gfp, get_order(*size));
+}
+
+static void bio_free_folios(struct bio *bio)
+{
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_bvec_all(bv, bio, i) {
+		struct folio *folio = page_folio(bv->bv_page);
+
+		if (!is_zero_folio(folio))
+			folio_put(folio);
+	}
+}
+
+static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter)
+{
+	size_t total_len = iov_iter_count(iter);
+
+	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+		return -EINVAL;
+	if (WARN_ON_ONCE(bio->bi_iter.bi_size))
+		return -EINVAL;
+	if (WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs))
+		return -EINVAL;
+
+	do {
+		size_t this_len = min(total_len, SZ_1M);
+		struct folio *folio;
+
+		if (this_len > PAGE_SIZE * 2)
+			this_len = rounddown_pow_of_two(this_len);
+
+		if (bio->bi_iter.bi_size > BIO_MAX_SIZE - this_len)
+			break;
+
+		folio = folio_alloc_greedy(GFP_KERNEL, &this_len);
+		if (!folio)
+			break;
+		bio_add_folio_nofail(bio, folio, this_len, 0);
+
+		if (copy_from_iter(folio_address(folio), this_len, iter) !=
+				this_len) {
+			bio_free_folios(bio);
+			return -EFAULT;
+		}
+
+		total_len -= this_len;
+	} while (total_len && bio->bi_vcnt < bio->bi_max_vecs);
+
+	if (!bio->bi_iter.bi_size)
+		return -ENOMEM;
+	return 0;
+}
+
+static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter)
+{
+	size_t len = min(iov_iter_count(iter), SZ_1M);
+	struct folio *folio;
+
+	folio = folio_alloc_greedy(GFP_KERNEL, &len);
+	if (!folio)
+		return -ENOMEM;
+
+	do {
+		ssize_t ret;
+
+		ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len,
+				&bio->bi_vcnt, bio->bi_max_vecs - 1, 0);
+		if (ret <= 0) {
+			if (!bio->bi_vcnt)
+				return ret;
+			break;
+		}
+		len -= ret;
+		bio->bi_iter.bi_size += ret;
+	} while (len && bio->bi_vcnt < bio->bi_max_vecs - 1);
+
+	/*
+	 * Set the folio directly here. The above loop has already calculated
+	 * the correct bi_size, and we use bi_vcnt for the user buffers. That
+	 * is safe as bi_vcnt is only used by the submitter and not the actual
+	 * I/O path.
+	 */
+	bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0);
+	if (iov_iter_extract_will_pin(iter))
+		bio_set_flag(bio, BIO_PAGE_PINNED);
+	return 0;
+}
+
+/**
+ * bio_iov_iter_bounce - bounce buffer data from an iter into a bio
+ * @bio: bio to send
+ * @iter: iter to read from / write into
+ *
+ * Helper for direct I/O implementations that need to bounce buffer because
+ * we need to checksum the data or perform other operations that require
+ * consistency. Allocates folios to back the bounce buffer, and for writes
+ * copies the data into it. Needs to be paired with bio_iov_iter_unbounce()
+ * called on completion.
+ */
+int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter)
+{
+	if (op_is_write(bio_op(bio)))
+		return bio_iov_iter_bounce_write(bio, iter);
+	return bio_iov_iter_bounce_read(bio, iter);
+}
+
+static void bvec_unpin(struct bio_vec *bv, bool mark_dirty)
+{
+	struct folio *folio = page_folio(bv->bv_page);
+	size_t nr_pages = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE -
+			bv->bv_offset / PAGE_SIZE + 1;
+
+	if (mark_dirty)
+		folio_mark_dirty_lock(folio);
+	unpin_user_folio(folio, nr_pages);
+}
+
+static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error,
+		bool mark_dirty)
+{
+	unsigned int len = bio->bi_io_vec[0].bv_len;
+
+	if (likely(!is_error)) {
+		void *buf = bvec_virt(&bio->bi_io_vec[0]);
+		struct iov_iter to;
+
+		iov_iter_bvec(&to, ITER_DEST, bio->bi_io_vec + 1, bio->bi_vcnt,
+				len);
+		/* copying to pinned pages should always work */
+		WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len);
+	} else {
+		/* No need to mark folios dirty if never copied to them */
+		mark_dirty = false;
+	}
+
+	if (bio_flagged(bio, BIO_PAGE_PINNED)) {
+		int i;
+
+		for (i = 0; i < bio->bi_vcnt; i++)
+			bvec_unpin(&bio->bi_io_vec[1 + i], mark_dirty);
+	}
+
+	folio_put(page_folio(bio->bi_io_vec[0].bv_page));
+}
+
+/**
+ * bio_iov_iter_unbounce - finish a bounce buffer operation
+ * @bio: completed bio
+ * @is_error: %true if an I/O error occurred and data should not be copied
+ * @mark_dirty: If %true, folios will be marked dirty.
+ *
+ * Helper for direct I/O implementations that need to bounce buffer because
+ * we need to checksum the data or perform other operations that require
+ * consistency. Called to complete a bio set up by bio_iov_iter_bounce().
+ * Copies data back for reads, and marks the original folios dirty if
+ * requested and then frees the bounce buffer.
+ */
+void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty)
+{
+	if (op_is_write(bio_op(bio)))
+		bio_free_folios(bio);
+	else
+		bio_iov_iter_unbounce_read(bio, is_error, mark_dirty);
 }
 
 static void submit_bio_wait_endio(struct bio *bio)
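Two design points worth noting in the new helpers, as read from the diff above: folio_alloc_greedy() opportunistically requests a high-order folio with __GFP_NORETRY and halves the request size (rounddown_pow_of_two(*size - 1)) on failure, so the bounce buffer degrades to smaller allocations under memory pressure instead of stalling in reclaim. And on the read side the bounce folio sits in bi_io_vec[0] while the pinned user bvecs occupy slots 1..bi_vcnt; since bi_iter covers exactly the bounce folio's length, the I/O path only ever sees slot 0, which lets bio_iov_iter_unbounce_read() later build an ITER_DEST bvec iterator over the user slots to copy the data back.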
