Skip to content

Commit 06c5c97

Browse files
Mike Snitzer authored and chucklever committed
NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
When NFSD_IO_DIRECT is selected via the /sys/kernel/debug/nfsd/io_cache_write experimental tunable, split incoming unaligned NFS WRITE requests into a prefix, middle and suffix segment, as needed. The middle segment is now DIO-aligned and the prefix and/or suffix are unaligned. Synchronous buffered IO is used for the unaligned segments, and IOCB_DIRECT is used for the middle DIO-aligned extent.

Although IOCB_DIRECT avoids the use of the page cache, by itself it doesn't guarantee data durability. For UNSTABLE WRITE requests, durability is obtained by a subsequent NFS COMMIT request.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Co-developed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
1 parent e3e8e17 commit 06c5c97

3 files changed

Lines changed: 144 additions & 4 deletions

File tree

fs/nfsd/debugfs.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,7 @@ static int nfsd_io_cache_write_set(void *data, u64 val)
108108
switch (val) {
109109
case NFSD_IO_BUFFERED:
110110
case NFSD_IO_DONTCACHE:
111+
case NFSD_IO_DIRECT:
111112
nfsd_io_cache_write = val;
112113
break;
113114
default:

fs/nfsd/trace.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -469,6 +469,8 @@ DEFINE_NFSD_IO_EVENT(read_io_done);
469469
DEFINE_NFSD_IO_EVENT(read_done);
470470
DEFINE_NFSD_IO_EVENT(write_start);
471471
DEFINE_NFSD_IO_EVENT(write_opened);
472+
DEFINE_NFSD_IO_EVENT(write_direct);
473+
DEFINE_NFSD_IO_EVENT(write_vector);
472474
DEFINE_NFSD_IO_EVENT(write_io_done);
473475
DEFINE_NFSD_IO_EVENT(write_done);
474476
DEFINE_NFSD_IO_EVENT(commit_start);

fs/nfsd/vfs.c

Lines changed: 141 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1254,6 +1254,136 @@ static int wait_for_concurrent_writes(struct file *file)
12541254
return err;
12551255
}
12561256

1257+
struct nfsd_write_dio_seg {
1258+
struct iov_iter iter;
1259+
int flags;
1260+
};
1261+
1262+
static unsigned long
1263+
iov_iter_bvec_offset(const struct iov_iter *iter)
1264+
{
1265+
return (unsigned long)(iter->bvec->bv_offset + iter->iov_offset);
1266+
}
1267+
1268+
static void
1269+
nfsd_write_dio_seg_init(struct nfsd_write_dio_seg *segment,
1270+
struct bio_vec *bvec, unsigned int nvecs,
1271+
unsigned long total, size_t start, size_t len,
1272+
struct kiocb *iocb)
1273+
{
1274+
iov_iter_bvec(&segment->iter, ITER_SOURCE, bvec, nvecs, total);
1275+
if (start)
1276+
iov_iter_advance(&segment->iter, start);
1277+
iov_iter_truncate(&segment->iter, len);
1278+
segment->flags = iocb->ki_flags;
1279+
}
1280+
1281+
static unsigned int
1282+
nfsd_write_dio_iters_init(struct nfsd_file *nf, struct bio_vec *bvec,
1283+
unsigned int nvecs, struct kiocb *iocb,
1284+
unsigned long total,
1285+
struct nfsd_write_dio_seg segments[3])
1286+
{
1287+
u32 offset_align = nf->nf_dio_offset_align;
1288+
loff_t prefix_end, orig_end, middle_end;
1289+
u32 mem_align = nf->nf_dio_mem_align;
1290+
size_t prefix, middle, suffix;
1291+
loff_t offset = iocb->ki_pos;
1292+
unsigned int nsegs = 0;
1293+
1294+
/*
1295+
* Check if direct I/O is feasible for this write request.
1296+
* If alignments are not available, the write is too small,
1297+
* or no alignment can be found, fall back to buffered I/O.
1298+
*/
1299+
if (unlikely(!mem_align || !offset_align) ||
1300+
unlikely(total < max(offset_align, mem_align)))
1301+
goto no_dio;
1302+
1303+
prefix_end = round_up(offset, offset_align);
1304+
orig_end = offset + total;
1305+
middle_end = round_down(orig_end, offset_align);
1306+
1307+
prefix = prefix_end - offset;
1308+
middle = middle_end - prefix_end;
1309+
suffix = orig_end - middle_end;
1310+
1311+
if (!middle)
1312+
goto no_dio;
1313+
1314+
if (prefix)
1315+
nfsd_write_dio_seg_init(&segments[nsegs++], bvec,
1316+
nvecs, total, 0, prefix, iocb);
1317+
1318+
nfsd_write_dio_seg_init(&segments[nsegs], bvec, nvecs,
1319+
total, prefix, middle, iocb);
1320+
1321+
/*
1322+
* Check if the bvec iterator is aligned for direct I/O.
1323+
*
1324+
* bvecs generated from RPC receive buffers are contiguous: After
1325+
* the first bvec, all subsequent bvecs start at bv_offset zero
1326+
* (page-aligned). Therefore, only the first bvec is checked.
1327+
*/
1328+
if (iov_iter_bvec_offset(&segments[nsegs].iter) & (mem_align - 1))
1329+
goto no_dio;
1330+
segments[nsegs].flags |= IOCB_DIRECT;
1331+
nsegs++;
1332+
1333+
if (suffix)
1334+
nfsd_write_dio_seg_init(&segments[nsegs++], bvec, nvecs, total,
1335+
prefix + middle, suffix, iocb);
1336+
1337+
return nsegs;
1338+
1339+
no_dio:
1340+
/* No DIO alignment possible - pack into single non-DIO segment. */
1341+
nfsd_write_dio_seg_init(&segments[0], bvec, nvecs, total, 0,
1342+
total, iocb);
1343+
return 1;
1344+
}
1345+
1346+
static noinline_for_stack int
1347+
nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
1348+
struct nfsd_file *nf, unsigned int nvecs,
1349+
unsigned long *cnt, struct kiocb *kiocb)
1350+
{
1351+
struct nfsd_write_dio_seg segments[3];
1352+
struct file *file = nf->nf_file;
1353+
unsigned int nsegs, i;
1354+
ssize_t host_err;
1355+
1356+
nsegs = nfsd_write_dio_iters_init(nf, rqstp->rq_bvec, nvecs,
1357+
kiocb, *cnt, segments);
1358+
1359+
*cnt = 0;
1360+
for (i = 0; i < nsegs; i++) {
1361+
kiocb->ki_flags = segments[i].flags;
1362+
if (kiocb->ki_flags & IOCB_DIRECT)
1363+
trace_nfsd_write_direct(rqstp, fhp, kiocb->ki_pos,
1364+
segments[i].iter.count);
1365+
else {
1366+
trace_nfsd_write_vector(rqstp, fhp, kiocb->ki_pos,
1367+
segments[i].iter.count);
1368+
/*
1369+
* Mark the I/O buffer as evict-able to reduce
1370+
* memory contention.
1371+
*/
1372+
if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE)
1373+
kiocb->ki_flags |= IOCB_DONTCACHE;
1374+
}
1375+
1376+
host_err = vfs_iocb_iter_write(file, kiocb, &segments[i].iter);
1377+
if (host_err < 0)
1378+
return host_err;
1379+
*cnt += host_err;
1380+
if (host_err < segments[i].iter.count)
1381+
break; /* partial write */
1382+
}
1383+
1384+
return 0;
1385+
}
1386+
12571387
/**
12581388
* nfsd_vfs_write - write data to an already-open file
12591389
* @rqstp: RPC execution context
@@ -1328,25 +1458,32 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
13281458
}
13291459

13301460
nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
1331-
iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
1461+
13321462
since = READ_ONCE(file->f_wb_err);
13331463
if (verf)
13341464
nfsd_copy_write_verifier(verf, nn);
13351465

13361466
switch (nfsd_io_cache_write) {
1337-
case NFSD_IO_BUFFERED:
1467+
case NFSD_IO_DIRECT:
1468+
host_err = nfsd_direct_write(rqstp, fhp, nf, nvecs,
1469+
cnt, &kiocb);
13381470
break;
13391471
case NFSD_IO_DONTCACHE:
13401472
if (file->f_op->fop_flags & FOP_DONTCACHE)
13411473
kiocb.ki_flags |= IOCB_DONTCACHE;
1474+
fallthrough;
1475+
case NFSD_IO_BUFFERED:
1476+
iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
1477+
host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
1478+
if (host_err < 0)
1479+
break;
1480+
*cnt = host_err;
13421481
break;
13431482
}
1344-
host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
13451483
if (host_err < 0) {
13461484
commit_reset_write_verifier(nn, rqstp, host_err);
13471485
goto out_nfserr;
13481486
}
1349-
*cnt = host_err;
13501487
nfsd_stats_io_write_add(nn, exp, *cnt);
13511488
fsnotify_modify(file);
13521489
host_err = filemap_check_wb_err(file->f_mapping, since);

0 commit comments

Comments
 (0)