Skip to content

Commit bb93c5e

Browse files
committed
Merge tag 'vfs-6.8.rw' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs rw updates from Christian Brauner: "This contains updates from Amir for read-write backing file helpers for stacking filesystems such as overlayfs: - Fanotify is currently in the process of introducing pre content events. Roughly, a new permission event will be added indicating that it is safe to write to the file being accessed. These events are used by hierarchical storage managers to e.g., fill the content of files on first access. During that work we noticed that our current permission checking is inconsistent in rw_verify_area() and remap_verify_area(). Especially in the splice code permission checking is done multiple times. For example, one time for the whole range and then again for partial ranges inside the iterator. In addition, we mostly do permission checking before we call file_start_write() except for a few places where we call it after. For pre-content events we need such permission checking to be done before file_start_write(). So this is a nice reason to clean this all up. After this series, all permission checking is done before file_start_write(). As part of this cleanup we also massaged the splice code a bit. We got rid of a few helpers because we are already drowning in special read-write helpers. We also cleaned up the return types for splice helpers. - Introduce generic read-write helpers for backing files. This lifts some overlayfs code to common code so it can be used by the FUSE passthrough work coming in over the next cycles. 
Make Amir and Miklos the maintainers for this new subsystem of the vfs" * tag 'vfs-6.8.rw' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (30 commits) fs: fix __sb_write_started() kerneldoc formatting fs: factor out backing_file_mmap() helper fs: factor out backing_file_splice_{read,write}() helpers fs: factor out backing_file_{read,write}_iter() helpers fs: prepare for stackable filesystems backing file helpers fsnotify: optionally pass access range in file permission hooks fsnotify: assert that file_start_write() is not held in permission hooks fsnotify: split fsnotify_perm() into two hooks fs: use splice_copy_file_range() inline helper splice: return type ssize_t from all helpers fs: use do_splice_direct() for nfsd/ksmbd server-side-copy fs: move file_start_write() into direct_splice_actor() fs: fork splice_file_range() from do_splice_direct() fs: create {sb,file}_write_not_started() helpers fs: create file_write_started() helper fs: create __sb_write_started() helper fs: move kiocb_start_write() into vfs_iocb_iter_write() fs: move permission hook out of do_iter_read() fs: move permission hook out of do_iter_write() fs: move file_start_write() into vfs_iter_write() ...
2 parents 8c9440f + c39e2ae commit bb93c5e

30 files changed

Lines changed: 941 additions & 567 deletions

File tree

MAINTAINERS

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8143,6 +8143,15 @@ S: Supported
81438143
F: fs/iomap/
81448144
F: include/linux/iomap.h
81458145

8146+
FILESYSTEMS [STACKABLE]
8147+
M: Miklos Szeredi <miklos@szeredi.hu>
8148+
M: Amir Goldstein <amir73il@gmail.com>
8149+
L: linux-fsdevel@vger.kernel.org
8150+
L: linux-unionfs@vger.kernel.org
8151+
S: Maintained
8152+
F: fs/backing-file.c
8153+
F: include/linux/backing-file.h
8154+
81468155
FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
81478156
M: Riku Voipio <riku.voipio@iki.fi>
81488157
L: linux-hwmon@vger.kernel.org

drivers/block/loop.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,9 +245,7 @@ static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
245245

246246
iov_iter_bvec(&i, ITER_SOURCE, bvec, 1, bvec->bv_len);
247247

248-
file_start_write(file);
249248
bw = vfs_iter_write(file, &i, ppos, 0);
250-
file_end_write(file);
251249

252250
if (likely(bw == bvec->bv_len))
253251
return 0;

fs/Kconfig

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ config VALIDATE_FS_PARSER
1818
config FS_IOMAP
1919
bool
2020

21+
# Stackable filesystems
22+
config FS_STACK
23+
bool
24+
2125
config BUFFER_HEAD
2226
bool
2327

fs/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
3939
obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
4040
obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
4141

42+
obj-$(CONFIG_FS_STACK) += backing-file.o
4243
obj-$(CONFIG_FS_MBCACHE) += mbcache.o
4344
obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
4445
obj-$(CONFIG_NFS_COMMON) += nfs_common/

fs/backing-file.c

Lines changed: 336 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
// SPDX-License-Identifier: GPL-2.0-only
2+
/*
3+
* Common helpers for stackable filesystems and backing files.
4+
*
5+
* Forked from fs/overlayfs/file.c.
6+
*
7+
* Copyright (C) 2017 Red Hat, Inc.
8+
* Copyright (C) 2023 CTERA Networks.
9+
*/
10+
11+
#include <linux/fs.h>
12+
#include <linux/backing-file.h>
13+
#include <linux/splice.h>
14+
#include <linux/mm.h>
15+
16+
#include "internal.h"
17+
18+
/**
19+
* backing_file_open - open a backing file for kernel internal use
20+
* @user_path: path that the user reuqested to open
21+
* @flags: open flags
22+
* @real_path: path of the backing file
23+
* @cred: credentials for open
24+
*
25+
* Open a backing file for a stackable filesystem (e.g., overlayfs).
26+
* @user_path may be on the stackable filesystem and @real_path on the
27+
* underlying filesystem. In this case, we want to be able to return the
28+
* @user_path of the stackable filesystem. This is done by embedding the
29+
* returned file into a container structure that also stores the stacked
30+
* file's path, which can be retrieved using backing_file_user_path().
31+
*/
32+
struct file *backing_file_open(const struct path *user_path, int flags,
33+
const struct path *real_path,
34+
const struct cred *cred)
35+
{
36+
struct file *f;
37+
int error;
38+
39+
f = alloc_empty_backing_file(flags, cred);
40+
if (IS_ERR(f))
41+
return f;
42+
43+
path_get(user_path);
44+
*backing_file_user_path(f) = *user_path;
45+
error = vfs_open(real_path, f);
46+
if (error) {
47+
fput(f);
48+
f = ERR_PTR(error);
49+
}
50+
51+
return f;
52+
}
53+
EXPORT_SYMBOL_GPL(backing_file_open);
54+
55+
/*
 * Per-request state for an asynchronous read or write that is forwarded
 * to a backing file.  Allocated from backing_aio_cachep.
 */
struct backing_aio {
56+
struct kiocb iocb; /* iocb submitted to the backing file; container_of() anchor */
57+
refcount_t ref; /* set to 2 at submission; the final put releases the file and frees */
58+
struct kiocb *orig_iocb; /* caller's iocb; receives ki_pos and the completion */
59+
/* used for aio completion */
60+
void (*end_write)(struct file *); /* optional; called with orig_iocb->ki_filp on cleanup */
61+
struct work_struct work; /* runs backing_aio_complete_work() for writes */
62+
long res; /* result saved for the deferred completion */
63+
};
64+
65+
/* Slab cache for struct backing_aio; created in backing_aio_init(). */
static struct kmem_cache *backing_aio_cachep;
66+
67+
/* The iocb flags that iocb_to_rw_flags() keeps; all other bits are dropped. */
#define BACKING_IOCB_MASK \
68+
(IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
69+
70+
static rwf_t iocb_to_rw_flags(int flags)
71+
{
72+
return (__force rwf_t)(flags & BACKING_IOCB_MASK);
73+
}
74+
75+
static void backing_aio_put(struct backing_aio *aio)
76+
{
77+
if (refcount_dec_and_test(&aio->ref)) {
78+
fput(aio->iocb.ki_filp);
79+
kmem_cache_free(backing_aio_cachep, aio);
80+
}
81+
}
82+
83+
static void backing_aio_cleanup(struct backing_aio *aio, long res)
84+
{
85+
struct kiocb *iocb = &aio->iocb;
86+
struct kiocb *orig_iocb = aio->orig_iocb;
87+
88+
if (aio->end_write)
89+
aio->end_write(orig_iocb->ki_filp);
90+
91+
orig_iocb->ki_pos = iocb->ki_pos;
92+
backing_aio_put(aio);
93+
}
94+
95+
static void backing_aio_rw_complete(struct kiocb *iocb, long res)
96+
{
97+
struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb);
98+
struct kiocb *orig_iocb = aio->orig_iocb;
99+
100+
if (iocb->ki_flags & IOCB_WRITE)
101+
kiocb_end_write(iocb);
102+
103+
backing_aio_cleanup(aio, res);
104+
orig_iocb->ki_complete(orig_iocb, res);
105+
}
106+
107+
static void backing_aio_complete_work(struct work_struct *work)
108+
{
109+
struct backing_aio *aio = container_of(work, struct backing_aio, work);
110+
111+
backing_aio_rw_complete(&aio->iocb, aio->res);
112+
}
113+
114+
static void backing_aio_queue_completion(struct kiocb *iocb, long res)
115+
{
116+
struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb);
117+
118+
/*
119+
* Punt to a work queue to serialize updates of mtime/size.
120+
*/
121+
aio->res = res;
122+
INIT_WORK(&aio->work, backing_aio_complete_work);
123+
queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
124+
&aio->work);
125+
}
126+
127+
static int backing_aio_init_wq(struct kiocb *iocb)
128+
{
129+
struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;
130+
131+
if (sb->s_dio_done_wq)
132+
return 0;
133+
134+
return sb_init_dio_done_wq(sb);
135+
}
136+
137+
138+
ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
139+
struct kiocb *iocb, int flags,
140+
struct backing_file_ctx *ctx)
141+
{
142+
struct backing_aio *aio = NULL;
143+
const struct cred *old_cred;
144+
ssize_t ret;
145+
146+
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
147+
return -EIO;
148+
149+
if (!iov_iter_count(iter))
150+
return 0;
151+
152+
if (iocb->ki_flags & IOCB_DIRECT &&
153+
!(file->f_mode & FMODE_CAN_ODIRECT))
154+
return -EINVAL;
155+
156+
old_cred = override_creds(ctx->cred);
157+
if (is_sync_kiocb(iocb)) {
158+
rwf_t rwf = iocb_to_rw_flags(flags);
159+
160+
ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
161+
} else {
162+
ret = -ENOMEM;
163+
aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
164+
if (!aio)
165+
goto out;
166+
167+
aio->orig_iocb = iocb;
168+
kiocb_clone(&aio->iocb, iocb, get_file(file));
169+
aio->iocb.ki_complete = backing_aio_rw_complete;
170+
refcount_set(&aio->ref, 2);
171+
ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
172+
backing_aio_put(aio);
173+
if (ret != -EIOCBQUEUED)
174+
backing_aio_cleanup(aio, ret);
175+
}
176+
out:
177+
revert_creds(old_cred);
178+
179+
if (ctx->accessed)
180+
ctx->accessed(ctx->user_file);
181+
182+
return ret;
183+
}
184+
EXPORT_SYMBOL_GPL(backing_file_read_iter);
185+
186+
ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
187+
struct kiocb *iocb, int flags,
188+
struct backing_file_ctx *ctx)
189+
{
190+
const struct cred *old_cred;
191+
ssize_t ret;
192+
193+
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
194+
return -EIO;
195+
196+
if (!iov_iter_count(iter))
197+
return 0;
198+
199+
ret = file_remove_privs(ctx->user_file);
200+
if (ret)
201+
return ret;
202+
203+
if (iocb->ki_flags & IOCB_DIRECT &&
204+
!(file->f_mode & FMODE_CAN_ODIRECT))
205+
return -EINVAL;
206+
207+
/*
208+
* Stacked filesystems don't support deferred completions, don't copy
209+
* this property in case it is set by the issuer.
210+
*/
211+
flags &= ~IOCB_DIO_CALLER_COMP;
212+
213+
old_cred = override_creds(ctx->cred);
214+
if (is_sync_kiocb(iocb)) {
215+
rwf_t rwf = iocb_to_rw_flags(flags);
216+
217+
ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
218+
if (ctx->end_write)
219+
ctx->end_write(ctx->user_file);
220+
} else {
221+
struct backing_aio *aio;
222+
223+
ret = backing_aio_init_wq(iocb);
224+
if (ret)
225+
goto out;
226+
227+
ret = -ENOMEM;
228+
aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
229+
if (!aio)
230+
goto out;
231+
232+
aio->orig_iocb = iocb;
233+
aio->end_write = ctx->end_write;
234+
kiocb_clone(&aio->iocb, iocb, get_file(file));
235+
aio->iocb.ki_flags = flags;
236+
aio->iocb.ki_complete = backing_aio_queue_completion;
237+
refcount_set(&aio->ref, 2);
238+
ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
239+
backing_aio_put(aio);
240+
if (ret != -EIOCBQUEUED)
241+
backing_aio_cleanup(aio, ret);
242+
}
243+
out:
244+
revert_creds(old_cred);
245+
246+
return ret;
247+
}
248+
EXPORT_SYMBOL_GPL(backing_file_write_iter);
249+
250+
/**
 * backing_file_splice_read - splice from a backing file into a pipe
 * @in:		backing file to read from; must have FMODE_BACKING set
 * @ppos:	file position, passed through to vfs_splice_read()
 * @pipe:	destination pipe
 * @len:	number of bytes requested
 * @flags:	splice flags
 * @ctx:	credentials to use, the user-facing file and an optional
 *		->accessed() callback
 *
 * Calls vfs_splice_read() with @ctx->cred and afterwards invokes
 * @ctx->accessed() (if set) on @ctx->user_file, whatever the result.
 *
 * Return: the result of vfs_splice_read(), or -EIO if @in is not a
 * backing file.
 */
ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags,
				 struct backing_file_ctx *ctx)
{
	const struct cred *saved_cred;
	ssize_t spliced;

	if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
		return -EIO;

	saved_cred = override_creds(ctx->cred);
	spliced = vfs_splice_read(in, ppos, pipe, len, flags);
	revert_creds(saved_cred);

	if (ctx->accessed)
		ctx->accessed(ctx->user_file);

	return spliced;
}
EXPORT_SYMBOL_GPL(backing_file_splice_read);
271+
272+
/**
 * backing_file_splice_write - splice from a pipe into a backing file
 * @pipe:	source pipe
 * @out:	backing file to write to; must have FMODE_BACKING set
 * @ppos:	file position, passed through to iter_file_splice_write()
 * @len:	number of bytes requested
 * @flags:	splice flags
 * @ctx:	credentials to use, the user-facing file and an optional
 *		->end_write() callback
 *
 * Privileges are removed from @ctx->user_file first.  The splice itself
 * runs with @ctx->cred, bracketed by file_start_write()/file_end_write()
 * on @out.  Afterwards @ctx->end_write() (if set) is invoked on
 * @ctx->user_file, whatever the result.
 *
 * Return: the result of iter_file_splice_write(), -EIO if @out is not a
 * backing file, or the error from file_remove_privs().
 */
ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
				  struct file *out, loff_t *ppos, size_t len,
				  unsigned int flags,
				  struct backing_file_ctx *ctx)
{
	const struct cred *saved_cred;
	ssize_t spliced;

	if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING)))
		return -EIO;

	spliced = file_remove_privs(ctx->user_file);
	if (spliced)
		return spliced;

	saved_cred = override_creds(ctx->cred);
	file_start_write(out);
	spliced = iter_file_splice_write(pipe, out, ppos, len, flags);
	file_end_write(out);
	revert_creds(saved_cred);

	if (ctx->end_write)
		ctx->end_write(ctx->user_file);

	return spliced;
}
EXPORT_SYMBOL_GPL(backing_file_splice_write);
299+
300+
int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
301+
struct backing_file_ctx *ctx)
302+
{
303+
const struct cred *old_cred;
304+
int ret;
305+
306+
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) ||
307+
WARN_ON_ONCE(ctx->user_file != vma->vm_file))
308+
return -EIO;
309+
310+
if (!file->f_op->mmap)
311+
return -ENODEV;
312+
313+
vma_set_file(vma, file);
314+
315+
old_cred = override_creds(ctx->cred);
316+
ret = call_mmap(vma->vm_file, vma);
317+
revert_creds(old_cred);
318+
319+
if (ctx->accessed)
320+
ctx->accessed(ctx->user_file);
321+
322+
return ret;
323+
}
324+
EXPORT_SYMBOL_GPL(backing_file_mmap);
325+
326+
/*
 * Create the slab cache that struct backing_aio objects are allocated
 * from.  Registered as an fs_initcall.
 *
 * Return: 0 on success, -ENOMEM if the cache cannot be created.
 */
static int __init backing_aio_init(void)
{
	backing_aio_cachep = kmem_cache_create("backing_aio",
					       sizeof(struct backing_aio),
					       0, SLAB_HWCACHE_ALIGN, NULL);

	return backing_aio_cachep ? 0 : -ENOMEM;
}
fs_initcall(backing_aio_init);

0 commit comments

Comments
 (0)