Merge tag 'vfs-7.0-rc2.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

torvalds · torvalds · commit 0e335a7745b0 · 2026-02-25T10:34:23.000-08:00
Pull vfs fixes from Christian Brauner: - Fix an uninitialized variable in file_getattr(). The flags_valid field wasn't initialized before calling vfs_fileattr_get(), triggering KMSAN uninit-value reports in fuse - Fix writeback wakeup and logging timeouts when DETECT_HUNG_TASK is not enabled. sysctl_hung_task_timeout_secs is 0 in that case causing spurious "waiting for writeback completion for more than 1 seconds" warnings - Fix a null-ptr-deref in do_statmount() when the mount is internal - Add missing kernel-doc description for the @Private parameter in iomap_readahead() - Fix mount namespace creation to hold namespace_sem across the mount copy in create_new_namespace(). The previous drop-and-reacquire pattern was fragile and failed to clean up mount propagation links if the real rootfs was a shared or dependent mount - Fix /proc mount iteration where m->index wasn't updated when m->show() overflows, causing a restart to repeatedly show the same mount entry in a rapidly expanding mount table - Return EFSCORRUPTED instead of ENOSPC in minix_new_inode() when the inode number is out of range - Fix unshare(2) when CLONE_NEWNS is set and current->fs isn't shared. copy_mnt_ns() received the live fs_struct so if a subsequent namespace creation failed the rollback would leave pwd and root pointing to detached mounts. Always allocate a new fs_struct when CLONE_NEWNS is requested - fserror bug fixes: - Remove the unused fsnotify_sb_error() helper now that all callers have been converted to fserror_report_metadata - Fix a lockdep splat in fserror_report() where igrab() takes inode::i_lock which can be held in IRQ context. Replace igrab() with a direct i_count bump since filesystems should not report inodes that are about to be freed or not yet exposed - Handle error pointer in procfs for try_lookup_noperm() - Fix an integer overflow in ep_loop_check_proc() where recursive calls returning INT_MAX would overflow when +1 is added, breaking the recursion depth check - Fix a misleading break in pidfs * tag 'vfs-7.0-rc2.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: pidfs: avoid misleading break eventpoll: Fix integer overflow in ep_loop_check_proc() proc: Fix pointer error dereference fserror: fix lockdep complaint when igrabbing inode fsnotify: drop unused helper unshare: fix unshare_fs() handling minix: Correct errno in minix_new_inode namespace: fix proc mount iteration mount: hold namespace_sem across copy in create_new_namespace() iomap: Describe @Private in iomap_readahead() statmount: Fix the null-ptr-deref in do_statmount() writeback: Fix wakeup and logging timeouts for !DETECT_HUNG_TASK fs: init flags_valid before calling vfs_fileattr_get
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
@@ -2061,7 +2061,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
  * @ep: the &struct eventpoll to be currently checked.
  * @depth: Current depth of the path being checked.
  *
- * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep.
+ * Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found
+ * a loop or went too deep.
  */
 static int ep_loop_check_proc(struct eventpoll *ep, int depth)
 {
@@ -2080,7 +2081,7 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
 			struct eventpoll *ep_tovisit;
 			ep_tovisit = epi->ffd.file->private_data;
 			if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
-				result = INT_MAX;
+				result = EP_MAX_NESTS+1;
 			else
 				result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
 			if (result > EP_MAX_NESTS)
diff --git a/fs/file_attr.c b/fs/file_attr.c
@@ -378,7 +378,7 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
 	struct path filepath __free(path_put) = {};
 	unsigned int lookup_flags = 0;
 	struct file_attr fattr;
-	struct file_kattr fa;
+	struct file_kattr fa = { .flags_valid = true }; /* hint only */
 	int error;
 
 	BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
@@ -198,10 +198,11 @@ static void wb_queue_work(struct bdi_writeback *wb,
 
 static bool wb_wait_for_completion_cb(struct wb_completion *done)
 {
+	unsigned long timeout = sysctl_hung_task_timeout_secs;
 	unsigned long waited_secs = (jiffies - done->wait_start) / HZ;
 
 	done->progress_stamp = jiffies;
-	if (waited_secs > sysctl_hung_task_timeout_secs)
+	if (timeout && (waited_secs > timeout))
 		pr_info("INFO: The task %s:%d has been waiting for writeback "
 			"completion for more than %lu seconds.",
 			current->comm, current->pid, waited_secs);
@@ -1954,6 +1955,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		.range_end		= LLONG_MAX,
 	};
 	unsigned long start_time = jiffies;
+	unsigned long timeout = sysctl_hung_task_timeout_secs;
 	long write_chunk;
 	long total_wrote = 0;  /* count both pages and inodes */
 	unsigned long dirtied_before = jiffies;
@@ -2040,9 +2042,8 @@ static long writeback_sb_inodes(struct super_block *sb,
 		__writeback_single_inode(inode, &wbc);
 
 		/* Report progress to inform the hung task detector of the progress. */
-		if (work->done && work->done->progress_stamp &&
-		   (jiffies - work->done->progress_stamp) > HZ *
-		   sysctl_hung_task_timeout_secs / 2)
+		if (work->done && work->done->progress_stamp && timeout &&
+		   (jiffies - work->done->progress_stamp) > HZ * timeout / 2)
 			wake_up_all(work->done->waitq);
 
 		wbc_detach_inode(&wbc);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
@@ -624,6 +624,7 @@ static int iomap_readahead_iter(struct iomap_iter *iter,
  * iomap_readahead - Attempt to read pages from a file.
  * @ops: The operations vector for the filesystem.
  * @ctx: The ctx used for issuing readahead.
+ * @private: The filesystem-specific information for issuing iomap_iter.
  *
  * This function is for filesystems to call to implement their readahead
  * address_space operation.
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
@@ -69,11 +69,57 @@ static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
 	return folio_count;
 }
 
+static DEFINE_SPINLOCK(failed_ioend_lock);
+static LIST_HEAD(failed_ioend_list);
+
+static void
+iomap_fail_ioends(
+	struct work_struct	*work)
+{
+	struct iomap_ioend	*ioend;
+	struct list_head	tmp;
+	unsigned long		flags;
+
+	spin_lock_irqsave(&failed_ioend_lock, flags);
+	list_replace_init(&failed_ioend_list, &tmp);
+	spin_unlock_irqrestore(&failed_ioend_lock, flags);
+
+	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
+			io_list))) {
+		list_del_init(&ioend->io_list);
+		iomap_finish_ioend_buffered(ioend);
+		cond_resched();
+	}
+}
+
+static DECLARE_WORK(failed_ioend_work, iomap_fail_ioends);
+
+static void iomap_fail_ioend_buffered(struct iomap_ioend *ioend)
+{
+	unsigned long flags;
+
+	/*
+	 * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions
+	 * in the fserror code.  The caller no longer owns the ioend reference
+	 * after the spinlock drops.
+	 */
+	spin_lock_irqsave(&failed_ioend_lock, flags);
+	if (list_empty(&failed_ioend_list))
+		WARN_ON_ONCE(!schedule_work(&failed_ioend_work));
+	list_add_tail(&ioend->io_list, &failed_ioend_list);
+	spin_unlock_irqrestore(&failed_ioend_lock, flags);
+}
+
 static void ioend_writeback_end_bio(struct bio *bio)
 {
 	struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
 
 	ioend->io_error = blk_status_to_errno(bio->bi_status);
+	if (ioend->io_error) {
+		iomap_fail_ioend_buffered(ioend);
+		return;
+	}
+
 	iomap_finish_ioend_buffered(ioend);
 }
 
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
@@ -247,7 +247,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode)
 	j += i * bits_per_zone;
 	if (!j || j > sbi->s_ninodes) {
 		iput(inode);
-		return ERR_PTR(-ENOSPC);
+		return ERR_PTR(-EFSCORRUPTED);
 	}
 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
 	inode->i_ino = j;
diff --git a/fs/namespace.c b/fs/namespace.c
@@ -1531,23 +1531,33 @@ static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_mounts *p = m->private;
+	struct mount *mnt;
 
 	down_read(&namespace_sem);
 
-	return mnt_find_id_at(p->ns, *pos);
+	mnt = mnt_find_id_at(p->ns, *pos);
+	if (mnt)
+		*pos = mnt->mnt_id_unique;
+	return mnt;
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct mount *next = NULL, *mnt = v;
+	struct mount *mnt = v;
 	struct rb_node *node = rb_next(&mnt->mnt_node);
 
-	++*pos;
 	if (node) {
-		next = node_to_mount(node);
+		struct mount *next = node_to_mount(node);
 		*pos = next->mnt_id_unique;
+		return next;
 	}
-	return next;
+
+	/*
+	 * No more mounts. Set pos past current mount's ID so that if
+	 * iteration restarts, mnt_find_id_at() returns NULL.
+	 */
+	*pos = mnt->mnt_id_unique + 1;
+	return NULL;
 }
 
 static void m_stop(struct seq_file *m, void *v)
@@ -2791,15 +2801,19 @@ static inline void unlock_mount(struct pinned_mountpoint *m)
 }
 
 static void lock_mount_exact(const struct path *path,
-			     struct pinned_mountpoint *mp);
+			     struct pinned_mountpoint *mp, bool copy_mount,
+			     unsigned int copy_flags);
 
 #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
 	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
 	do_lock_mount((path), &mp, (beneath))
 #define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
 #define LOCK_MOUNT_EXACT(mp, path) \
 	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
-	lock_mount_exact((path), &mp)
+	lock_mount_exact((path), &mp, false, 0)
+#define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \
+	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
+	lock_mount_exact((path), &mp, true, (copy_flags))
 
 static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp)
 {
@@ -3073,16 +3087,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
 	return file;
 }
 
-DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
-	    if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
-
 static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
 {
-	struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
-	struct path to_path __free(path_put) = {};
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	struct user_namespace *user_ns = current_user_ns();
-	struct mount *new_ns_root;
+	struct mnt_namespace *new_ns;
+	struct mount *new_ns_root, *old_ns_root;
+	struct path to_path;
 	struct mount *mnt;
 	unsigned int copy_flags = 0;
 	bool locked = false;
@@ -3094,71 +3105,63 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
 	if (IS_ERR(new_ns))
 		return ERR_CAST(new_ns);
 
-	scoped_guard(namespace_excl) {
-		new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
-		if (IS_ERR(new_ns_root))
-			return ERR_CAST(new_ns_root);
+	old_ns_root = ns->root;
+	to_path.mnt = &old_ns_root->mnt;
+	to_path.dentry = old_ns_root->mnt.mnt_root;
 
-		/*
-		 * If the real rootfs had a locked mount on top of it somewhere
-		 * in the stack, lock the new mount tree as well so it can't be
-		 * exposed.
-		 */
-		mnt = ns->root;
-		while (mnt->overmount) {
-			mnt = mnt->overmount;
-			if (mnt->mnt.mnt_flags & MNT_LOCKED)
-				locked = true;
-		}
+	VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type);
+
+	LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags);
+	if (IS_ERR(mp.parent)) {
+		free_mnt_ns(new_ns);
+		return ERR_CAST(mp.parent);
 	}
+	new_ns_root = mp.parent;
 
 	/*
-	 * We dropped the namespace semaphore so we can actually lock
-	 * the copy for mounting. The copied mount isn't attached to any
-	 * mount namespace and it is thus excluded from any propagation.
-	 * So realistically we're isolated and the mount can't be
-	 * overmounted.
+	 * If the real rootfs had a locked mount on top of it somewhere
+	 * in the stack, lock the new mount tree as well so it can't be
+	 * exposed.
 	 */
-
-	/* Borrow the reference from clone_mnt(). */
-	to_path.mnt = &new_ns_root->mnt;
-	to_path.dentry = dget(new_ns_root->mnt.mnt_root);
-
-	/* Now lock for actual mounting. */
-	LOCK_MOUNT_EXACT(mp, &to_path);
-	if (unlikely(IS_ERR(mp.parent)))
-		return ERR_CAST(mp.parent);
+	mnt = old_ns_root;
+	while (mnt->overmount) {
+		mnt = mnt->overmount;
+		if (mnt->mnt.mnt_flags & MNT_LOCKED)
+			locked = true;
+	}
 
 	/*
-	 * We don't emulate unshare()ing a mount namespace. We stick to the
-	 * restrictions of creating detached bind-mounts. It has a lot
-	 * saner and simpler semantics.
+	 * We don't emulate unshare()ing a mount namespace. We stick
+	 * to the restrictions of creating detached bind-mounts. It
+	 * has a lot saner and simpler semantics.
 	 */
 	mnt = __do_loopback(path, flags, copy_flags);
-	if (IS_ERR(mnt))
-		return ERR_CAST(mnt);
-
 	scoped_guard(mount_writer) {
+		if (IS_ERR(mnt)) {
+			emptied_ns = new_ns;
+			umount_tree(new_ns_root, 0);
+			return ERR_CAST(mnt);
+		}
+
 		if (locked)
 			mnt->mnt.mnt_flags |= MNT_LOCKED;
 		/*
-		 * Now mount the detached tree on top of the copy of the
-		 * real rootfs we created.
+		 * now mount the detached tree on top of the copy
+		 * of the real rootfs we created.
 		 */
 		attach_mnt(mnt, new_ns_root, mp.mp);
 		if (user_ns != ns->user_ns)
 			lock_mnt_tree(new_ns_root);
 	}
 
-	/* Add all mounts to the new namespace. */
-	for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
-		mnt_add_to_ns(new_ns, p);
+	for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) {
+		mnt_add_to_ns(new_ns, mnt);
 		new_ns->nr_mounts++;
 	}
 
-	new_ns->root = real_mount(no_free_ptr(to_path.mnt));
+	new_ns->root = new_ns_root;
 	ns_tree_add_raw(new_ns);
-	return no_free_ptr(new_ns);
+	return new_ns;
 }
 
 static struct file *open_new_namespace(struct path *path, unsigned int flags)
@@ -3840,26 +3843,36 @@ static int do_new_mount(const struct path *path, const char *fstype,
 }
 
 static void lock_mount_exact(const struct path *path,
-			     struct pinned_mountpoint *mp)
+			     struct pinned_mountpoint *mp, bool copy_mount,
+			     unsigned int copy_flags)
 {
 	struct dentry *dentry = path->dentry;
 	int err;
 
+	/* Assert that inode_lock() locked the correct inode. */
+	VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path));
+
 	inode_lock(dentry->d_inode);
 	namespace_lock();
 	if (unlikely(cant_mount(dentry)))
 		err = -ENOENT;
-	else if (path_overmounted(path))
+	else if (!copy_mount && path_overmounted(path))
 		err = -EBUSY;
 	else
 		err = get_mountpoint(dentry, mp);
 	if (unlikely(err)) {
 		namespace_unlock();
 		inode_unlock(dentry->d_inode);
 		mp->parent = ERR_PTR(err);
-	} else {
-		mp->parent = real_mount(path->mnt);
+		return;
 	}
+
+	if (copy_mount)
+		mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags);
+	else
+		mp->parent = real_mount(path->mnt);
+	if (unlikely(IS_ERR(mp->parent)))
+		__unlock_mount(mp);
 }
 
 int finish_automount(struct vfsmount *__m, const struct path *path)
@@ -5678,6 +5691,8 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
 
 		s->mnt = mnt_file->f_path.mnt;
 		ns = real_mount(s->mnt)->mnt_ns;
+		if (IS_ERR(ns))
+			return PTR_ERR(ns);
 		if (!ns)
 			/*
 			 * We can't set mount point and mnt_ns_id since we don't have a
diff --git a/fs/pidfs.c b/fs/pidfs.c
@@ -608,9 +608,8 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			struct user_namespace *user_ns;
 
 			user_ns = task_cred_xxx(task, user_ns);
-			if (!ns_ref_get(user_ns))
-				break;
-			ns_common = to_ns_common(user_ns);
+			if (ns_ref_get(user_ns))
+				ns_common = to_ns_common(user_ns);
 		}
 #endif
 		break;
@@ -620,9 +619,8 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			struct pid_namespace *pid_ns;
 
 			pid_ns = task_active_pid_ns(task);
-			if (!ns_ref_get(pid_ns))
-				break;
-			ns_common = to_ns_common(pid_ns);
+			if (ns_ref_get(pid_ns))
+				ns_common = to_ns_common(pid_ns);
 		}
 #endif
 		break;
diff --git a/fs/proc/base.c b/fs/proc/base.c
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
diff --git a/kernel/fork.c b/kernel/fork.c

Original file line number	Diff line number	Diff line change
`@@ -624,6 +624,7 @@ static int iomap_readahead_iter(struct iomap_iter *iter,`
`624`	`624`	`* iomap_readahead - Attempt to read pages from a file.`
`625`	`625`	`* @ops: The operations vector for the filesystem.`
`626`	`626`	`* @ctx: The ctx used for issuing readahead.`
	`627`	`+ * @private: The filesystem-specific information for issuing iomap_iter.`
`627`	`628`	`*`
`628`	`629`	`* This function is for filesystems to call to implement their readahead`
`629`	`630`	`* address_space operation.`
Original file line number	Diff line number	Diff line change
`@@ -247,7 +247,7 @@ struct inode minix_new_inode(const struct inode dir, umode_t mode)`
`247`	`247`	`j += i * bits_per_zone;`
`248`	`248`	`if (!j \|\| j > sbi->s_ninodes) {`
`249`	`249`	`iput(inode);`
`250`		`- return ERR_PTR(-ENOSPC);`
	`250`	`+ return ERR_PTR(-EFSCORRUPTED);`
`251`	`251`	`}`
`252`	`252`	`inode_init_owner(&nop_mnt_idmap, inode, dir, mode);`
`253`	`253`	`inode->i_ino = j;`