Skip to content

Commit 0e335a7

Browse files
committed
Merge tag 'vfs-7.0-rc2.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs fixes from Christian Brauner:

 - Fix an uninitialized variable in file_getattr(). The flags_valid field wasn't initialized before calling vfs_fileattr_get(), triggering KMSAN uninit-value reports in fuse
 - Fix writeback wakeup and logging timeouts when DETECT_HUNG_TASK is not enabled. sysctl_hung_task_timeout_secs is 0 in that case causing spurious "waiting for writeback completion for more than 1 seconds" warnings
 - Fix a null-ptr-deref in do_statmount() when the mount is internal
 - Add missing kernel-doc description for the @private parameter in iomap_readahead()
 - Fix mount namespace creation to hold namespace_sem across the mount copy in create_new_namespace(). The previous drop-and-reacquire pattern was fragile and failed to clean up mount propagation links if the real rootfs was a shared or dependent mount
 - Fix /proc mount iteration where m->index wasn't updated when m->show() overflows, causing a restart to repeatedly show the same mount entry in a rapidly expanding mount table
 - Return EFSCORRUPTED instead of ENOSPC in minix_new_inode() when the inode number is out of range
 - Fix unshare(2) when CLONE_NEWNS is set and current->fs isn't shared. copy_mnt_ns() received the live fs_struct so if a subsequent namespace creation failed the rollback would leave pwd and root pointing to detached mounts. Always allocate a new fs_struct when CLONE_NEWNS is requested
 - fserror bug fixes:
    - Remove the unused fsnotify_sb_error() helper now that all callers have been converted to fserror_report_metadata
    - Fix a lockdep splat in fserror_report() where igrab() takes inode::i_lock which can be held in IRQ context. Replace igrab() with a direct i_count bump since filesystems should not report inodes that are about to be freed or not yet exposed
 - Handle error pointer in procfs for try_lookup_noperm()
 - Fix an integer overflow in ep_loop_check_proc() where recursive calls returning INT_MAX would overflow when +1 is added, breaking the recursion depth check
 - Fix a misleading break in pidfs

* tag 'vfs-7.0-rc2.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  pidfs: avoid misleading break
  eventpoll: Fix integer overflow in ep_loop_check_proc()
  proc: Fix pointer error dereference
  fserror: fix lockdep complaint when igrabbing inode
  fsnotify: drop unused helper
  unshare: fix unshare_fs() handling
  minix: Correct errno in minix_new_inode
  namespace: fix proc mount iteration
  mount: hold namespace_sem across copy in create_new_namespace()
  iomap: Describe @private in iomap_readahead()
  statmount: Fix the null-ptr-deref in do_statmount()
  writeback: Fix wakeup and logging timeouts for !DETECT_HUNG_TASK
  fs: init flags_valid before calling vfs_fileattr_get
2 parents bfbc0b5 + 4a1ddb0 commit 0e335a7

11 files changed

Lines changed: 139 additions & 87 deletions

File tree

fs/eventpoll.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2061,7 +2061,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
20612061
* @ep: the &struct eventpoll to be currently checked.
20622062
* @depth: Current depth of the path being checked.
20632063
*
2064-
* Return: depth of the subtree, or INT_MAX if we found a loop or went too deep.
2064+
* Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found
2065+
* a loop or went too deep.
20652066
*/
20662067
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
20672068
{
@@ -2080,7 +2081,7 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
20802081
struct eventpoll *ep_tovisit;
20812082
ep_tovisit = epi->ffd.file->private_data;
20822083
if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
2083-
result = INT_MAX;
2084+
result = EP_MAX_NESTS+1;
20842085
else
20852086
result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
20862087
if (result > EP_MAX_NESTS)

fs/file_attr.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
378378
struct path filepath __free(path_put) = {};
379379
unsigned int lookup_flags = 0;
380380
struct file_attr fattr;
381-
struct file_kattr fa;
381+
struct file_kattr fa = { .flags_valid = true }; /* hint only */
382382
int error;
383383

384384
BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);

fs/fs-writeback.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,11 @@ static void wb_queue_work(struct bdi_writeback *wb,
198198

199199
static bool wb_wait_for_completion_cb(struct wb_completion *done)
200200
{
201+
unsigned long timeout = sysctl_hung_task_timeout_secs;
201202
unsigned long waited_secs = (jiffies - done->wait_start) / HZ;
202203

203204
done->progress_stamp = jiffies;
204-
if (waited_secs > sysctl_hung_task_timeout_secs)
205+
if (timeout && (waited_secs > timeout))
205206
pr_info("INFO: The task %s:%d has been waiting for writeback "
206207
"completion for more than %lu seconds.",
207208
current->comm, current->pid, waited_secs);
@@ -1954,6 +1955,7 @@ static long writeback_sb_inodes(struct super_block *sb,
19541955
.range_end = LLONG_MAX,
19551956
};
19561957
unsigned long start_time = jiffies;
1958+
unsigned long timeout = sysctl_hung_task_timeout_secs;
19571959
long write_chunk;
19581960
long total_wrote = 0; /* count both pages and inodes */
19591961
unsigned long dirtied_before = jiffies;
@@ -2040,9 +2042,8 @@ static long writeback_sb_inodes(struct super_block *sb,
20402042
__writeback_single_inode(inode, &wbc);
20412043

20422044
/* Report progress to inform the hung task detector of the progress. */
2043-
if (work->done && work->done->progress_stamp &&
2044-
(jiffies - work->done->progress_stamp) > HZ *
2045-
sysctl_hung_task_timeout_secs / 2)
2045+
if (work->done && work->done->progress_stamp && timeout &&
2046+
(jiffies - work->done->progress_stamp) > HZ * timeout / 2)
20462047
wake_up_all(work->done->waitq);
20472048

20482049
wbc_detach_inode(&wbc);

fs/iomap/buffered-io.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,7 @@ static int iomap_readahead_iter(struct iomap_iter *iter,
624624
* iomap_readahead - Attempt to read pages from a file.
625625
* @ops: The operations vector for the filesystem.
626626
* @ctx: The ctx used for issuing readahead.
627+
* @private: The filesystem-specific information for issuing iomap_iter.
627628
*
628629
* This function is for filesystems to call to implement their readahead
629630
* address_space operation.

fs/iomap/ioend.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,57 @@ static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
6969
return folio_count;
7070
}
7171

72+
static DEFINE_SPINLOCK(failed_ioend_lock);
73+
static LIST_HEAD(failed_ioend_list);
74+
75+
static void
76+
iomap_fail_ioends(
77+
struct work_struct *work)
78+
{
79+
struct iomap_ioend *ioend;
80+
struct list_head tmp;
81+
unsigned long flags;
82+
83+
spin_lock_irqsave(&failed_ioend_lock, flags);
84+
list_replace_init(&failed_ioend_list, &tmp);
85+
spin_unlock_irqrestore(&failed_ioend_lock, flags);
86+
87+
while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
88+
io_list))) {
89+
list_del_init(&ioend->io_list);
90+
iomap_finish_ioend_buffered(ioend);
91+
cond_resched();
92+
}
93+
}
94+
95+
static DECLARE_WORK(failed_ioend_work, iomap_fail_ioends);
96+
97+
static void iomap_fail_ioend_buffered(struct iomap_ioend *ioend)
98+
{
99+
unsigned long flags;
100+
101+
/*
102+
* Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions
103+
* in the fserror code. The caller no longer owns the ioend reference
104+
* after the spinlock drops.
105+
*/
106+
spin_lock_irqsave(&failed_ioend_lock, flags);
107+
if (list_empty(&failed_ioend_list))
108+
WARN_ON_ONCE(!schedule_work(&failed_ioend_work));
109+
list_add_tail(&ioend->io_list, &failed_ioend_list);
110+
spin_unlock_irqrestore(&failed_ioend_lock, flags);
111+
}
112+
72113
static void ioend_writeback_end_bio(struct bio *bio)
73114
{
74115
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
75116

76117
ioend->io_error = blk_status_to_errno(bio->bi_status);
118+
if (ioend->io_error) {
119+
iomap_fail_ioend_buffered(ioend);
120+
return;
121+
}
122+
77123
iomap_finish_ioend_buffered(ioend);
78124
}
79125

fs/minix/bitmap.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode)
247247
j += i * bits_per_zone;
248248
if (!j || j > sbi->s_ninodes) {
249249
iput(inode);
250-
return ERR_PTR(-ENOSPC);
250+
return ERR_PTR(-EFSCORRUPTED);
251251
}
252252
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
253253
inode->i_ino = j;

fs/namespace.c

Lines changed: 74 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1531,23 +1531,33 @@ static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id
15311531
static void *m_start(struct seq_file *m, loff_t *pos)
15321532
{
15331533
struct proc_mounts *p = m->private;
1534+
struct mount *mnt;
15341535

15351536
down_read(&namespace_sem);
15361537

1537-
return mnt_find_id_at(p->ns, *pos);
1538+
mnt = mnt_find_id_at(p->ns, *pos);
1539+
if (mnt)
1540+
*pos = mnt->mnt_id_unique;
1541+
return mnt;
15381542
}
15391543

15401544
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
15411545
{
1542-
struct mount *next = NULL, *mnt = v;
1546+
struct mount *mnt = v;
15431547
struct rb_node *node = rb_next(&mnt->mnt_node);
15441548

1545-
++*pos;
15461549
if (node) {
1547-
next = node_to_mount(node);
1550+
struct mount *next = node_to_mount(node);
15481551
*pos = next->mnt_id_unique;
1552+
return next;
15491553
}
1550-
return next;
1554+
1555+
/*
1556+
* No more mounts. Set pos past current mount's ID so that if
1557+
* iteration restarts, mnt_find_id_at() returns NULL.
1558+
*/
1559+
*pos = mnt->mnt_id_unique + 1;
1560+
return NULL;
15511561
}
15521562

15531563
static void m_stop(struct seq_file *m, void *v)
@@ -2791,15 +2801,19 @@ static inline void unlock_mount(struct pinned_mountpoint *m)
27912801
}
27922802

27932803
static void lock_mount_exact(const struct path *path,
2794-
struct pinned_mountpoint *mp);
2804+
struct pinned_mountpoint *mp, bool copy_mount,
2805+
unsigned int copy_flags);
27952806

27962807
#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
27972808
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
27982809
do_lock_mount((path), &mp, (beneath))
27992810
#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
28002811
#define LOCK_MOUNT_EXACT(mp, path) \
28012812
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
2802-
lock_mount_exact((path), &mp)
2813+
lock_mount_exact((path), &mp, false, 0)
2814+
#define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \
2815+
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
2816+
lock_mount_exact((path), &mp, true, (copy_flags))
28032817

28042818
static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp)
28052819
{
@@ -3073,16 +3087,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
30733087
return file;
30743088
}
30753089

3076-
DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
3077-
if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
3078-
30793090
static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
30803091
{
3081-
struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
3082-
struct path to_path __free(path_put) = {};
30833092
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
30843093
struct user_namespace *user_ns = current_user_ns();
3085-
struct mount *new_ns_root;
3094+
struct mnt_namespace *new_ns;
3095+
struct mount *new_ns_root, *old_ns_root;
3096+
struct path to_path;
30863097
struct mount *mnt;
30873098
unsigned int copy_flags = 0;
30883099
bool locked = false;
@@ -3094,71 +3105,63 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
30943105
if (IS_ERR(new_ns))
30953106
return ERR_CAST(new_ns);
30963107

3097-
scoped_guard(namespace_excl) {
3098-
new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
3099-
if (IS_ERR(new_ns_root))
3100-
return ERR_CAST(new_ns_root);
3108+
old_ns_root = ns->root;
3109+
to_path.mnt = &old_ns_root->mnt;
3110+
to_path.dentry = old_ns_root->mnt.mnt_root;
31013111

3102-
/*
3103-
* If the real rootfs had a locked mount on top of it somewhere
3104-
* in the stack, lock the new mount tree as well so it can't be
3105-
* exposed.
3106-
*/
3107-
mnt = ns->root;
3108-
while (mnt->overmount) {
3109-
mnt = mnt->overmount;
3110-
if (mnt->mnt.mnt_flags & MNT_LOCKED)
3111-
locked = true;
3112-
}
3112+
VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type);
3113+
3114+
LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags);
3115+
if (IS_ERR(mp.parent)) {
3116+
free_mnt_ns(new_ns);
3117+
return ERR_CAST(mp.parent);
31133118
}
3119+
new_ns_root = mp.parent;
31143120

31153121
/*
3116-
* We dropped the namespace semaphore so we can actually lock
3117-
* the copy for mounting. The copied mount isn't attached to any
3118-
* mount namespace and it is thus excluded from any propagation.
3119-
* So realistically we're isolated and the mount can't be
3120-
* overmounted.
3122+
* If the real rootfs had a locked mount on top of it somewhere
3123+
* in the stack, lock the new mount tree as well so it can't be
3124+
* exposed.
31213125
*/
3122-
3123-
/* Borrow the reference from clone_mnt(). */
3124-
to_path.mnt = &new_ns_root->mnt;
3125-
to_path.dentry = dget(new_ns_root->mnt.mnt_root);
3126-
3127-
/* Now lock for actual mounting. */
3128-
LOCK_MOUNT_EXACT(mp, &to_path);
3129-
if (unlikely(IS_ERR(mp.parent)))
3130-
return ERR_CAST(mp.parent);
3126+
mnt = old_ns_root;
3127+
while (mnt->overmount) {
3128+
mnt = mnt->overmount;
3129+
if (mnt->mnt.mnt_flags & MNT_LOCKED)
3130+
locked = true;
3131+
}
31313132

31323133
/*
3133-
* We don't emulate unshare()ing a mount namespace. We stick to the
3134-
* restrictions of creating detached bind-mounts. It has a lot
3135-
* saner and simpler semantics.
3134+
* We don't emulate unshare()ing a mount namespace. We stick
3135+
* to the restrictions of creating detached bind-mounts. It
3136+
* has a lot saner and simpler semantics.
31363137
*/
31373138
mnt = __do_loopback(path, flags, copy_flags);
3138-
if (IS_ERR(mnt))
3139-
return ERR_CAST(mnt);
3140-
31413139
scoped_guard(mount_writer) {
3140+
if (IS_ERR(mnt)) {
3141+
emptied_ns = new_ns;
3142+
umount_tree(new_ns_root, 0);
3143+
return ERR_CAST(mnt);
3144+
}
3145+
31423146
if (locked)
31433147
mnt->mnt.mnt_flags |= MNT_LOCKED;
31443148
/*
3145-
* Now mount the detached tree on top of the copy of the
3146-
* real rootfs we created.
3149+
* now mount the detached tree on top of the copy
3150+
* of the real rootfs we created.
31473151
*/
31483152
attach_mnt(mnt, new_ns_root, mp.mp);
31493153
if (user_ns != ns->user_ns)
31503154
lock_mnt_tree(new_ns_root);
31513155
}
31523156

3153-
/* Add all mounts to the new namespace. */
3154-
for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
3155-
mnt_add_to_ns(new_ns, p);
3157+
for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) {
3158+
mnt_add_to_ns(new_ns, mnt);
31563159
new_ns->nr_mounts++;
31573160
}
31583161

3159-
new_ns->root = real_mount(no_free_ptr(to_path.mnt));
3162+
new_ns->root = new_ns_root;
31603163
ns_tree_add_raw(new_ns);
3161-
return no_free_ptr(new_ns);
3164+
return new_ns;
31623165
}
31633166

31643167
static struct file *open_new_namespace(struct path *path, unsigned int flags)
@@ -3840,26 +3843,36 @@ static int do_new_mount(const struct path *path, const char *fstype,
38403843
}
38413844

38423845
static void lock_mount_exact(const struct path *path,
3843-
struct pinned_mountpoint *mp)
3846+
struct pinned_mountpoint *mp, bool copy_mount,
3847+
unsigned int copy_flags)
38443848
{
38453849
struct dentry *dentry = path->dentry;
38463850
int err;
38473851

3852+
/* Assert that inode_lock() locked the correct inode. */
3853+
VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path));
3854+
38483855
inode_lock(dentry->d_inode);
38493856
namespace_lock();
38503857
if (unlikely(cant_mount(dentry)))
38513858
err = -ENOENT;
3852-
else if (path_overmounted(path))
3859+
else if (!copy_mount && path_overmounted(path))
38533860
err = -EBUSY;
38543861
else
38553862
err = get_mountpoint(dentry, mp);
38563863
if (unlikely(err)) {
38573864
namespace_unlock();
38583865
inode_unlock(dentry->d_inode);
38593866
mp->parent = ERR_PTR(err);
3860-
} else {
3861-
mp->parent = real_mount(path->mnt);
3867+
return;
38623868
}
3869+
3870+
if (copy_mount)
3871+
mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags);
3872+
else
3873+
mp->parent = real_mount(path->mnt);
3874+
if (unlikely(IS_ERR(mp->parent)))
3875+
__unlock_mount(mp);
38633876
}
38643877

38653878
int finish_automount(struct vfsmount *__m, const struct path *path)
@@ -5678,6 +5691,8 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
56785691

56795692
s->mnt = mnt_file->f_path.mnt;
56805693
ns = real_mount(s->mnt)->mnt_ns;
5694+
if (IS_ERR(ns))
5695+
return PTR_ERR(ns);
56815696
if (!ns)
56825697
/*
56835698
* We can't set mount point and mnt_ns_id since we don't have a

fs/pidfs.c

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -608,9 +608,8 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
608608
struct user_namespace *user_ns;
609609

610610
user_ns = task_cred_xxx(task, user_ns);
611-
if (!ns_ref_get(user_ns))
612-
break;
613-
ns_common = to_ns_common(user_ns);
611+
if (ns_ref_get(user_ns))
612+
ns_common = to_ns_common(user_ns);
614613
}
615614
#endif
616615
break;
@@ -620,9 +619,8 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
620619
struct pid_namespace *pid_ns;
621620

622621
pid_ns = task_active_pid_ns(task);
623-
if (!ns_ref_get(pid_ns))
624-
break;
625-
ns_common = to_ns_common(pid_ns);
622+
if (ns_ref_get(pid_ns))
623+
ns_common = to_ns_common(pid_ns);
626624
}
627625
#endif
628626
break;

0 commit comments

Comments
 (0)