Skip to content

Commit d686e64

Browse files
committed
NFSD: Implement NFSD_IO_DIRECT for NFS READ
Add an experimental option that forces NFS READ operations to use direct I/O instead of reading through the NFS server's page cache.

There is already at least one other layer of read caching: the page cache on NFS clients. The server's page cache, in many cases, is unlikely to provide additional benefit. Some benchmarks have demonstrated that the server's page cache is actively detrimental for workloads whose working set is larger than the server's available physical memory. For instance, on small NFS servers, cached NFS file content can squeeze out local memory consumers. For large sequential workloads, an enormous amount of data flows into and out of the page cache and is consumed by NFS clients exactly once -- caching that data is expensive to do and totally valueless.

For now this is a hidden option that can be enabled on test systems for benchmarking. In the longer term, this option might be enabled persistently or per-export. When the exported file system does not support direct I/O, NFSD falls back to using either DONTCACHE or buffered I/O to fulfill NFS READ requests.

Suggested-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
1 parent d7de37d commit d686e64

4 files changed

Lines changed: 87 additions & 0 deletions

File tree

fs/nfsd/debugfs.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
4444
* Contents:
4545
* %0: NFS READ will use buffered IO
4646
* %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
47+
* %2: NFS READ will use direct IO
4748
*
4849
* This setting takes immediate effect for all NFS versions,
4950
* all exports, and in all NFSD net namespaces.
@@ -64,6 +65,7 @@ static int nfsd_io_cache_read_set(void *data, u64 val)
6465
nfsd_io_cache_read = NFSD_IO_BUFFERED;
6566
break;
6667
case NFSD_IO_DONTCACHE:
68+
case NFSD_IO_DIRECT:
6769
/*
6870
* Must disable splice_read when enabling
6971
* NFSD_IO_DONTCACHE.

fs/nfsd/nfsd.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ enum {
160160
/* Any new NFSD_IO enum value must be added at the end */
161161
NFSD_IO_BUFFERED,
162162
NFSD_IO_DONTCACHE,
163+
NFSD_IO_DIRECT,
163164
};
164165

165166
extern u64 nfsd_io_cache_read __read_mostly;

fs/nfsd/trace.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,7 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
464464
DEFINE_NFSD_IO_EVENT(read_start);
465465
DEFINE_NFSD_IO_EVENT(read_splice);
466466
DEFINE_NFSD_IO_EVENT(read_vector);
467+
DEFINE_NFSD_IO_EVENT(read_direct);
467468
DEFINE_NFSD_IO_EVENT(read_io_done);
468469
DEFINE_NFSD_IO_EVENT(read_done);
469470
DEFINE_NFSD_IO_EVENT(write_start);

fs/nfsd/vfs.c

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1074,6 +1074,83 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
10741074
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
10751075
}
10761076

1077+
/*
1078+
* The byte range of the client's READ request is expanded on both ends
1079+
* until it meets the underlying file system's direct I/O alignment
1080+
* requirements. After the internal read is complete, the byte range of
1081+
* the NFS READ payload is reduced to the byte range that was originally
1082+
* requested.
1083+
*
1084+
* Note that a direct read can be done only when the xdr_buf containing
1085+
* the NFS READ reply does not already have contents in its .pages array.
1086+
* This is due to potentially restrictive alignment requirements on the
1087+
* read buffer. When .page_len and @base are zero, the .pages array is
1088+
* guaranteed to be page-aligned.
1089+
*/
1090+
static noinline_for_stack __be32
1091+
nfsd_direct_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
1092+
struct nfsd_file *nf, loff_t offset, unsigned long *count,
1093+
u32 *eof)
1094+
{
1095+
u64 dio_start, dio_end;
1096+
unsigned long v, total;
1097+
struct iov_iter iter;
1098+
struct kiocb kiocb;
1099+
ssize_t host_err;
1100+
size_t len;
1101+
1102+
init_sync_kiocb(&kiocb, nf->nf_file);
1103+
kiocb.ki_flags |= IOCB_DIRECT;
1104+
1105+
/* Read a properly-aligned region of bytes into rq_bvec */
1106+
dio_start = round_down(offset, nf->nf_dio_read_offset_align);
1107+
dio_end = round_up((u64)offset + *count, nf->nf_dio_read_offset_align);
1108+
1109+
kiocb.ki_pos = dio_start;
1110+
1111+
v = 0;
1112+
total = dio_end - dio_start;
1113+
while (total && v < rqstp->rq_maxpages &&
1114+
rqstp->rq_next_page < rqstp->rq_page_end) {
1115+
len = min_t(size_t, total, PAGE_SIZE);
1116+
bvec_set_page(&rqstp->rq_bvec[v], *rqstp->rq_next_page,
1117+
len, 0);
1118+
1119+
total -= len;
1120+
++rqstp->rq_next_page;
1121+
++v;
1122+
}
1123+
1124+
trace_nfsd_read_direct(rqstp, fhp, offset, *count - total);
1125+
iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v,
1126+
dio_end - dio_start - total);
1127+
1128+
host_err = vfs_iocb_iter_read(nf->nf_file, &kiocb, &iter);
1129+
if (host_err >= 0) {
1130+
unsigned int pad = offset - dio_start;
1131+
1132+
/* The returned payload starts after the pad */
1133+
rqstp->rq_res.page_base = pad;
1134+
1135+
/* Compute the count of bytes to be returned */
1136+
if (host_err > pad + *count)
1137+
host_err = *count;
1138+
else if (host_err > pad)
1139+
host_err -= pad;
1140+
else
1141+
host_err = 0;
1142+
} else if (unlikely(host_err == -EINVAL)) {
1143+
struct inode *inode = d_inode(fhp->fh_dentry);
1144+
1145+
pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%ld\n",
1146+
inode->i_sb->s_id, inode->i_ino);
1147+
host_err = -ESERVERFAULT;
1148+
}
1149+
1150+
return nfsd_finish_read(rqstp, fhp, nf->nf_file, offset, count,
1151+
eof, host_err);
1152+
}
1153+
10771154
/**
10781155
* nfsd_iter_read - Perform a VFS read using an iterator
10791156
* @rqstp: RPC transaction context
@@ -1106,6 +1183,12 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
11061183
switch (nfsd_io_cache_read) {
11071184
case NFSD_IO_BUFFERED:
11081185
break;
1186+
case NFSD_IO_DIRECT:
1187+
/* When dio_read_offset_align is zero, dio is not supported */
1188+
if (nf->nf_dio_read_offset_align && !rqstp->rq_res.page_len)
1189+
return nfsd_direct_read(rqstp, fhp, nf, offset,
1190+
count, eof);
1191+
fallthrough;
11091192
case NFSD_IO_DONTCACHE:
11101193
if (file->f_op->fop_flags & FOP_DONTCACHE)
11111194
kiocb.ki_flags = IOCB_DONTCACHE;

0 commit comments

Comments
 (0)