Skip to content

Commit 472e5b0

Browse files
committed
pipe: remove pipe_wait() and fix wakeup race with splice
The pipe splice code still used the old model of waiting for pipe IO by using a non-specific "pipe_wait()" that waited for any pipe event to happen, which depended on all pipe IO being entirely serialized by the pipe lock. So by checking the state you were waiting for, and then adding yourself to the wait queue before dropping the lock, you were guaranteed to see all the wakeups. Strictly speaking, the actual wakeups were not done under the lock, but the pipe_wait() model still worked, because since the waiter held the lock when checking whether it should sleep, it would always see the current state, and the wakeup was always done after updating the state. However, commit 0ddad21 ("pipe: use exclusive waits when reading or writing") split the single wait-queue into two, and in the process also made the "wait for event" code wait for _two_ wait queues, and that then showed a race with the wakers that were not serialized by the pipe lock. It's only splice that used that "pipe_wait()" model, so the problem wasn't obvious, but Josef Bacik reports: "I hit a hang with fstest btrfs/187, which does a btrfs send into /dev/null. This works by creating a pipe, the write side is given to the kernel to write into, and the read side is handed to a thread that splices into a file, in this case /dev/null. The box that was hung had the write side stuck here [pipe_write] and the read side stuck here [splice_from_pipe_next -> pipe_wait]. [ more details about pipe_wait() scenario ] The problem is we're doing the prepare_to_wait, which sets our state each time, however we can be woken up either with reads or writes. In the case above we race with the WRITER waking us up, and re-set our state to INTERRUPTIBLE, and thus never break out of schedule" Josef had a patch that avoided the issue in pipe_wait() by just making it set the state only once, but the deeper problem is that pipe_wait() depends on a level of synchonization by the pipe mutex that it really shouldn't. And the whole "wait for any pipe state change" model really isn't very good to begin with. So rather than trying to work around things in pipe_wait(), remove that legacy model of "wait for arbitrary pipe event" entirely, and actually create functions that wait for the pipe actually being readable or writable, and can do so without depending on the pipe lock serializing everything. Fixes: 0ddad21 ("pipe: use exclusive waits when reading or writing") Link: https://lore.kernel.org/linux-fsdevel/bfa88b5ad6f069b2b679316b9e495a970130416c.1601567868.git.josef@toxicpanda.com/ Reported-by: Josef Bacik <josef@toxicpanda.com> Reviewed-and-tested-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 44b6e23 commit 472e5b0

3 files changed

Lines changed: 48 additions & 27 deletions

File tree

fs/pipe.c

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -106,25 +106,6 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
106106
}
107107
}
108108

109-
/* Drop the inode semaphore and wait for a pipe event, atomically */
110-
void pipe_wait(struct pipe_inode_info *pipe)
111-
{
112-
DEFINE_WAIT(rdwait);
113-
DEFINE_WAIT(wrwait);
114-
115-
/*
116-
* Pipes are system-local resources, so sleeping on them
117-
* is considered a noninteractive wait:
118-
*/
119-
prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
120-
prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
121-
pipe_unlock(pipe);
122-
schedule();
123-
finish_wait(&pipe->rd_wait, &rdwait);
124-
finish_wait(&pipe->wr_wait, &wrwait);
125-
pipe_lock(pipe);
126-
}
127-
128109
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
129110
struct pipe_buffer *buf)
130111
{
@@ -1035,12 +1016,52 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
10351016
return do_pipe2(fildes, 0);
10361017
}
10371018

1019+
/*
1020+
* This is the stupid "wait for pipe to be readable or writable"
1021+
* model.
1022+
*
1023+
* See pipe_read/write() for the proper kind of exclusive wait,
1024+
* but that requires that we wake up any other readers/writers
1025+
* if we then do not end up reading everything (ie the whole
1026+
* "wake_next_reader/writer" logic in pipe_read/write()).
1027+
*/
1028+
void pipe_wait_readable(struct pipe_inode_info *pipe)
1029+
{
1030+
pipe_unlock(pipe);
1031+
wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
1032+
pipe_lock(pipe);
1033+
}
1034+
1035+
void pipe_wait_writable(struct pipe_inode_info *pipe)
1036+
{
1037+
pipe_unlock(pipe);
1038+
wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
1039+
pipe_lock(pipe);
1040+
}
1041+
1042+
/*
1043+
* This depends on both the wait (here) and the wakeup (wake_up_partner)
1044+
* holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
1045+
* race with the count check and waitqueue prep.
1046+
*
1047+
* Normally in order to avoid races, you'd do the prepare_to_wait() first,
1048+
* then check the condition you're waiting for, and only then sleep. But
1049+
* because of the pipe lock, we can check the condition before being on
1050+
* the wait queue.
1051+
*
1052+
* We use the 'rd_wait' waitqueue for pipe partner waiting.
1053+
*/
10381054
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
10391055
{
1056+
DEFINE_WAIT(rdwait);
10401057
int cur = *cnt;
10411058

10421059
while (cur == *cnt) {
1043-
pipe_wait(pipe);
1060+
prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
1061+
pipe_unlock(pipe);
1062+
schedule();
1063+
finish_wait(&pipe->rd_wait, &rdwait);
1064+
pipe_lock(pipe);
10441065
if (signal_pending(current))
10451066
break;
10461067
}
@@ -1050,7 +1071,6 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
10501071
static void wake_up_partner(struct pipe_inode_info *pipe)
10511072
{
10521073
wake_up_interruptible_all(&pipe->rd_wait);
1053-
wake_up_interruptible_all(&pipe->wr_wait);
10541074
}
10551075

10561076
static int fifo_open(struct inode *inode, struct file *filp)

fs/splice.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
563563
sd->need_wakeup = false;
564564
}
565565

566-
pipe_wait(pipe);
566+
pipe_wait_readable(pipe);
567567
}
568568

569569
return 1;
@@ -1077,7 +1077,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
10771077
return -EAGAIN;
10781078
if (signal_pending(current))
10791079
return -ERESTARTSYS;
1080-
pipe_wait(pipe);
1080+
pipe_wait_writable(pipe);
10811081
}
10821082
}
10831083

@@ -1454,7 +1454,7 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
14541454
ret = -EAGAIN;
14551455
break;
14561456
}
1457-
pipe_wait(pipe);
1457+
pipe_wait_readable(pipe);
14581458
}
14591459

14601460
pipe_unlock(pipe);
@@ -1493,7 +1493,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
14931493
ret = -ERESTARTSYS;
14941494
break;
14951495
}
1496-
pipe_wait(pipe);
1496+
pipe_wait_writable(pipe);
14971497
}
14981498

14991499
pipe_unlock(pipe);

include/linux/pipe_fs_i.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,9 @@ extern unsigned int pipe_max_size;
240240
extern unsigned long pipe_user_pages_hard;
241241
extern unsigned long pipe_user_pages_soft;
242242

243-
/* Drop the inode semaphore and wait for a pipe event, atomically */
244-
void pipe_wait(struct pipe_inode_info *pipe);
243+
/* Wait for a pipe to be readable/writable while dropping the pipe lock */
244+
void pipe_wait_readable(struct pipe_inode_info *);
245+
void pipe_wait_writable(struct pipe_inode_info *);
245246

246247
struct pipe_inode_info *alloc_pipe_info(void);
247248
void free_pipe_info(struct pipe_inode_info *);

0 commit comments

Comments
 (0)