Skip to content

Commit c34fc6f

Browse files
npsingamsetty authored and axboe committed
fs: Initial atomic write support
An atomic write is a write issued with torn-write protection, meaning that for a power failure or any other hardware failure, all or none of the data from the write will be stored, but never a mix of old and new data. Userspace may add flag RWF_ATOMIC to pwritev2() to indicate that the write is to be issued with torn-write prevention, according to special alignment and length rules. For any syscall interface utilizing struct iocb, add IOCB_ATOMIC for the iocb->ki_flags field to indicate the same. A call to statx will give the relevant atomic write info for a file: - atomic_write_unit_min - atomic_write_unit_max - atomic_write_segments_max Both min and max values must be a power-of-2. Applications can avail of the atomic write feature by ensuring that the total length of a write is a power-of-2 in size and also sized between atomic_write_unit_min and atomic_write_unit_max, inclusive. Applications must ensure that the write is at a naturally-aligned offset in the file with respect to the total write length. The value in atomic_write_segments_max indicates the upper limit for IOV_ITER iovcnt. Add file mode flag FMODE_CAN_ATOMIC_WRITE, so files which do not have the flag set will have RWF_ATOMIC rejected and not just ignored. Add a type argument to kiocb_set_rw_flags() to allow reads which have RWF_ATOMIC set to be rejected. Helper function generic_atomic_write_valid() can be used by FSes to verify compliant writes. There we check that the iov_iter type is ubuf, which implies iovcnt==1 for pwritev2(), which is an initial restriction for atomic_write_segments_max. Initially the only user will be the bdev file operations write handler. We will rely on the block BIO submission path to ensure write sizes are compliant for the bdev, so we don't need to check atomic write sizes yet. Signed-off-by: Prasad Singamsetty <prasad.singamsetty@oracle.com> jpg: merge into single patch and much rewrite Acked-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Martin K. 
Petersen <martin.petersen@oracle.com> Signed-off-by: John Garry <john.g.garry@oracle.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Link: https://lore.kernel.org/r/20240620125359.2684798-4-john.g.garry@oracle.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent f70167a commit c34fc6f

6 files changed

Lines changed: 45 additions & 14 deletions

File tree

fs/aio.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1516,7 +1516,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
15161516
iocb_put(iocb);
15171517
}
15181518

1519-
static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
1519+
static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type)
15201520
{
15211521
int ret;
15221522

@@ -1542,7 +1542,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
15421542
} else
15431543
req->ki_ioprio = get_current_ioprio();
15441544

1545-
ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
1545+
ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type);
15461546
if (unlikely(ret))
15471547
return ret;
15481548

@@ -1594,7 +1594,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
15941594
struct file *file;
15951595
int ret;
15961596

1597-
ret = aio_prep_rw(req, iocb);
1597+
ret = aio_prep_rw(req, iocb, READ);
15981598
if (ret)
15991599
return ret;
16001600
file = req->ki_filp;
@@ -1621,7 +1621,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
16211621
struct file *file;
16221622
int ret;
16231623

1624-
ret = aio_prep_rw(req, iocb);
1624+
ret = aio_prep_rw(req, iocb, WRITE);
16251625
if (ret)
16261626
return ret;
16271627
file = req->ki_filp;

fs/btrfs/ioctl.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4627,7 +4627,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
46274627
goto out_iov;
46284628

46294629
init_sync_kiocb(&kiocb, file);
4630-
ret = kiocb_set_rw_flags(&kiocb, 0);
4630+
ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
46314631
if (ret)
46324632
goto out_iov;
46334633
kiocb.ki_pos = pos;

fs/read_write.c

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -730,7 +730,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
730730
ssize_t ret;
731731

732732
init_sync_kiocb(&kiocb, filp);
733-
ret = kiocb_set_rw_flags(&kiocb, flags);
733+
ret = kiocb_set_rw_flags(&kiocb, flags, type);
734734
if (ret)
735735
return ret;
736736
kiocb.ki_pos = (ppos ? *ppos : 0);
@@ -1736,3 +1736,19 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
17361736

17371737
return 0;
17381738
}
1739+
1740+
/*
 * generic_atomic_write_valid - generic sanity check for an atomic write.
 * @iter: iterator describing the data to be written
 * @pos:  file offset at which the write would start
 *
 * Helper that filesystems can call to verify that a write requested with
 * RWF_ATOMIC follows the generic rules described in the commit message.
 * Limits such as atomic_write_unit_min/max are not checked here.
 *
 * Returns true only if all three checks below pass, false otherwise.
 */
bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
1741+
{
1742+
/* Length used for the checks is whatever iov_iter_count() reports. */
size_t len = iov_iter_count(iter);
1743+
1744+
/*
 * Only a ubuf iterator is accepted; per the commit message this implies
 * iovcnt == 1 for pwritev2() (the initial atomic_write_segments_max limit).
 */
if (!iter_is_ubuf(iter))
1745+
return false;
1746+
1747+
/* The total write length must be a power of two. */
if (!is_power_of_2(len))
1748+
return false;
1749+
1750+
/* The start offset must be naturally aligned to the write length. */
if (!IS_ALIGNED(pos, len))
1751+
return false;
1752+
1753+
return true;
1754+
}

include/linux/fs.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,10 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
125125
#define FMODE_EXEC ((__force fmode_t)(1 << 5))
126126
/* File writes are restricted (block device specific) */
127127
#define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6))
128+
/* File supports atomic writes */
129+
#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7))
128130

129-
/* FMODE_* bits 7 to 8 */
131+
/* FMODE_* bit 8 */
130132

131133
/* 32bit hashes as llseek() offset (for directories) */
132134
#define FMODE_32BITHASH ((__force fmode_t)(1 << 9))
@@ -317,6 +319,7 @@ struct readahead_control;
317319
#define IOCB_SYNC (__force int) RWF_SYNC
318320
#define IOCB_NOWAIT (__force int) RWF_NOWAIT
319321
#define IOCB_APPEND (__force int) RWF_APPEND
322+
#define IOCB_ATOMIC (__force int) RWF_ATOMIC
320323

321324
/* non-RWF related bits - start at 16 */
322325
#define IOCB_EVENTFD (1 << 16)
@@ -351,6 +354,7 @@ struct readahead_control;
351354
{ IOCB_SYNC, "SYNC" }, \
352355
{ IOCB_NOWAIT, "NOWAIT" }, \
353356
{ IOCB_APPEND, "APPEND" }, \
357+
{ IOCB_ATOMIC, "ATOMIC"}, \
354358
{ IOCB_EVENTFD, "EVENTFD"}, \
355359
{ IOCB_DIRECT, "DIRECT" }, \
356360
{ IOCB_WRITE, "WRITE" }, \
@@ -3403,7 +3407,8 @@ static inline int iocb_flags(struct file *file)
34033407
return res;
34043408
}
34053409

3406-
static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
3410+
static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
3411+
int rw_type)
34073412
{
34083413
int kiocb_flags = 0;
34093414

@@ -3422,6 +3427,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
34223427
return -EOPNOTSUPP;
34233428
kiocb_flags |= IOCB_NOIO;
34243429
}
3430+
if (flags & RWF_ATOMIC) {
3431+
if (rw_type != WRITE)
3432+
return -EOPNOTSUPP;
3433+
if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
3434+
return -EOPNOTSUPP;
3435+
}
34253436
kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
34263437
if (flags & RWF_SYNC)
34273438
kiocb_flags |= IOCB_DSYNC;
@@ -3613,4 +3624,6 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
36133624
extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
36143625
int advice);
36153626

3627+
bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos);
3628+
36163629
#endif /* _LINUX_FS_H */

include/uapi/linux/fs.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,9 +329,12 @@ typedef int __bitwise __kernel_rwf_t;
329329
/* per-IO negation of O_APPEND */
330330
#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020)
331331

332+
/* Atomic Write */
333+
#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040)
334+
332335
/* mask of flags supported by the kernel */
333336
#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
334-
RWF_APPEND | RWF_NOAPPEND)
337+
RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC)
335338

336339
/* Pagemap ioctl */
337340
#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg)

io_uring/rw.c

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,7 @@ static bool need_complete_io(struct io_kiocb *req)
772772
S_ISBLK(file_inode(req->file)->i_mode);
773773
}
774774

775-
static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
775+
static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
776776
{
777777
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
778778
struct kiocb *kiocb = &rw->kiocb;
@@ -787,7 +787,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
787787
req->flags |= io_file_get_flags(file);
788788

789789
kiocb->ki_flags = file->f_iocb_flags;
790-
ret = kiocb_set_rw_flags(kiocb, rw->flags);
790+
ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
791791
if (unlikely(ret))
792792
return ret;
793793
kiocb->ki_flags |= IOCB_ALLOC_CACHE;
@@ -832,8 +832,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
832832
if (unlikely(ret < 0))
833833
return ret;
834834
}
835-
836-
ret = io_rw_init_file(req, FMODE_READ);
835+
ret = io_rw_init_file(req, FMODE_READ, READ);
837836
if (unlikely(ret))
838837
return ret;
839838
req->cqe.res = iov_iter_count(&io->iter);
@@ -1013,7 +1012,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
10131012
ssize_t ret, ret2;
10141013
loff_t *ppos;
10151014

1016-
ret = io_rw_init_file(req, FMODE_WRITE);
1015+
ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
10171016
if (unlikely(ret))
10181017
return ret;
10191018
req->cqe.res = iov_iter_count(&io->iter);

0 commit comments

Comments
 (0)