Skip to content

Commit 61ba89b

Browse files
committed
erofs: add 48-bit block addressing on-disk support
The current 32-bit block addressing limits EROFS to a 16TiB maximum volume size with 4KiB blocks. However, several new use cases now require larger capacity support:
 - Massive datasets for model training in order to boost random sampling performance for each epoch;
 - Object storage clients using EROFS direct passthrough.

This extends core on-disk structures to support 48-bit block addressing, such as inodes, device slots, and inode chunks.

Additionally:
 - Expand superblock root NID to 8-byte `rootnid_8b` to enable full out-of-place update incremental builds;
 - Introduce `epoch` field in the superblock as well as add `mtime` field to 32-byte compact inodes for basic timestamp support.

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Acked-by: Chao Yu <chao@kernel.org>
Link: https://lore.kernel.org/r/20250310095459.2620647-4-hsiangkao@linux.alibaba.com
1 parent 3422dfa commit 61ba89b

5 files changed

Lines changed: 61 additions & 69 deletions

File tree

fs/erofs/data.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
9595

9696
map->m_flags = EROFS_MAP_MAPPED;
9797
if (map->m_la < pos) {
98-
map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la;
98+
map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la;
9999
map->m_llen = pos - map->m_la;
100100
} else {
101101
map->m_pa = erofs_iloc(inode) + vi->inode_isize +
@@ -124,7 +124,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
124124
map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits,
125125
round_up(inode->i_size - map->m_la, blksz));
126126
if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) {
127-
startblk = le32_to_cpu(idx->blkaddr);
127+
startblk = le32_to_cpu(idx->startblk_lo);
128128
if (startblk != EROFS_NULL_ADDR) {
129129
map->m_deviceid = le16_to_cpu(idx->device_id) &
130130
EROFS_SB(sb)->device_id_mask;
@@ -168,7 +168,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
168168
{
169169
struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
170170
struct erofs_device_info *dif;
171-
erofs_off_t startoff, length;
171+
erofs_off_t startoff;
172172
int id;
173173

174174
erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0);
@@ -181,7 +181,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
181181
return -ENODEV;
182182
}
183183
if (devs->flatdev) {
184-
map->m_pa += erofs_pos(sb, dif->mapped_blkaddr);
184+
map->m_pa += erofs_pos(sb, dif->uniaddr);
185185
up_read(&devs->rwsem);
186186
return 0;
187187
}
@@ -190,13 +190,12 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
190190
} else if (devs->extra_devices && !devs->flatdev) {
191191
down_read(&devs->rwsem);
192192
idr_for_each_entry(&devs->tree, dif, id) {
193-
if (!dif->mapped_blkaddr)
193+
if (!dif->uniaddr)
194194
continue;
195195

196-
startoff = erofs_pos(sb, dif->mapped_blkaddr);
197-
length = erofs_pos(sb, dif->blocks);
196+
startoff = erofs_pos(sb, dif->uniaddr);
198197
if (map->m_pa >= startoff &&
199-
map->m_pa < startoff + length) {
198+
map->m_pa < startoff + erofs_pos(sb, dif->blocks)) {
200199
map->m_pa -= startoff;
201200
erofs_fill_from_devinfo(map, sb, dif);
202201
break;

fs/erofs/erofs_fs.h

Lines changed: 42 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -30,25 +30,19 @@
3030
#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020
3131
#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020
3232
#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040
33+
#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080
3334
#define EROFS_ALL_FEATURE_INCOMPAT \
34-
(EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
35-
EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
36-
EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
37-
EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
38-
EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
39-
EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
40-
EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
41-
EROFS_FEATURE_INCOMPAT_FRAGMENTS | \
42-
EROFS_FEATURE_INCOMPAT_DEDUPE | \
43-
EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES)
35+
((EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES << 1) - 1)
4436

4537
#define EROFS_SB_EXTSLOT_SIZE 16
4638

4739
struct erofs_deviceslot {
4840
u8 tag[64]; /* digest(sha256), etc. */
49-
__le32 blocks; /* total fs blocks of this device */
50-
__le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
51-
u8 reserved[56];
41+
__le32 blocks_lo; /* total blocks count of this device */
42+
__le32 uniaddr_lo; /* unified starting block of this device */
43+
__le32 blocks_hi; /* total blocks count MSB */
44+
__le16 uniaddr_hi; /* unified starting block MSB */
45+
u8 reserved[50];
5246
};
5347
#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
5448

@@ -59,13 +53,14 @@ struct erofs_super_block {
5953
__le32 feature_compat;
6054
__u8 blkszbits; /* filesystem block size in bit shift */
6155
__u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */
62-
63-
__le16 root_nid; /* nid of root directory */
56+
union {
57+
__le16 rootnid_2b; /* nid of root directory */
58+
__le16 blocks_hi; /* (48BIT on) blocks count MSB */
59+
} rb;
6460
__le64 inos; /* total valid ino # (== f_files - f_favail) */
65-
66-
__le64 build_time; /* compact inode time derivation */
67-
__le32 build_time_nsec; /* compact inode time derivation in ns scale */
68-
__le32 blocks; /* used for statfs */
61+
__le64 epoch; /* base seconds used for compact inodes */
62+
__le32 fixed_nsec; /* fixed nanoseconds for compact inodes */
63+
__le32 blocks_lo; /* blocks count LSB */
6964
__le32 meta_blkaddr; /* start block address of metadata area */
7065
__le32 xattr_blkaddr; /* start block address of shared xattr area */
7166
__u8 uuid[16]; /* 128-bit uuid for volume */
@@ -84,7 +79,10 @@ struct erofs_super_block {
8479
__le32 xattr_prefix_start; /* start of long xattr prefixes */
8580
__le64 packed_nid; /* nid of the special packed inode */
8681
__u8 xattr_filter_reserved; /* reserved for xattr name filter */
87-
__u8 reserved2[23];
82+
__u8 reserved[3];
83+
__le32 build_time; /* seconds added to epoch for mkfs time */
84+
__le64 rootnid_8b; /* (48BIT on) nid of root directory */
85+
__u8 reserved2[8];
8886
};
8987

9088
/*
@@ -115,19 +113,18 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
115113
#define EROFS_I_VERSION_MASK 0x01
116114
#define EROFS_I_DATALAYOUT_MASK 0x07
117115

118-
#define EROFS_I_VERSION_BIT 0
119-
#define EROFS_I_DATALAYOUT_BIT 1
120-
#define EROFS_I_ALL_BIT 4
121-
122-
#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1)
116+
#define EROFS_I_VERSION_BIT 0
117+
#define EROFS_I_DATALAYOUT_BIT 1
118+
#define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */
119+
#define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1)
123120

124121
/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */
125122
#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F
126-
/* with chunk indexes or just a 4-byte blkaddr array */
123+
/* with chunk indexes or just a 4-byte block array */
127124
#define EROFS_CHUNK_FORMAT_INDEXES 0x0020
125+
#define EROFS_CHUNK_FORMAT_48BIT 0x0040
128126

129-
#define EROFS_CHUNK_FORMAT_ALL \
130-
(EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES)
127+
#define EROFS_CHUNK_FORMAT_ALL ((EROFS_CHUNK_FORMAT_48BIT << 1) - 1)
131128

132129
/* 32-byte on-disk inode */
133130
#define EROFS_INODE_LAYOUT_COMPACT 0
@@ -140,45 +137,40 @@ struct erofs_inode_chunk_info {
140137
};
141138

142139
union erofs_inode_i_u {
143-
/* total compressed blocks for compressed inodes */
144-
__le32 compressed_blocks;
145-
146-
/* block address for uncompressed flat inodes */
147-
__le32 raw_blkaddr;
148-
149-
/* for device files, used to indicate old/new device # */
150-
__le32 rdev;
151-
152-
/* for chunk-based files, it contains the summary info */
140+
__le32 blocks_lo; /* total blocks count (if compressed inodes) */
141+
__le32 startblk_lo; /* starting block number (if flat inodes) */
142+
__le32 rdev; /* device ID (if special inodes) */
153143
struct erofs_inode_chunk_info c;
154144
};
155145

146+
union erofs_inode_i_nb {
147+
__le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */
148+
__le16 blocks_hi; /* total blocks count MSB */
149+
__le16 startblk_hi; /* starting block number MSB */
150+
};
151+
156152
/* 32-byte reduced form of an ondisk inode */
157153
struct erofs_inode_compact {
158154
__le16 i_format; /* inode format hints */
159-
160-
/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
161155
__le16 i_xattr_icount;
162156
__le16 i_mode;
163-
__le16 i_nlink;
157+
union erofs_inode_i_nb i_nb;
164158
__le32 i_size;
165-
__le32 i_reserved;
159+
__le32 i_mtime;
166160
union erofs_inode_i_u i_u;
167161

168162
__le32 i_ino; /* only used for 32-bit stat compatibility */
169163
__le16 i_uid;
170164
__le16 i_gid;
171-
__le32 i_reserved2;
165+
__le32 i_reserved;
172166
};
173167

174168
/* 64-byte complete form of an ondisk inode */
175169
struct erofs_inode_extended {
176170
__le16 i_format; /* inode format hints */
177-
178-
/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
179171
__le16 i_xattr_icount;
180172
__le16 i_mode;
181-
__le16 i_reserved;
173+
union erofs_inode_i_nb i_nb;
182174
__le64 i_size;
183175
union erofs_inode_i_u i_u;
184176

@@ -248,6 +240,7 @@ static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
248240
if (!i_xattr_icount)
249241
return 0;
250242

243+
/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
251244
return sizeof(struct erofs_xattr_ibody_header) +
252245
sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1);
253246
}
@@ -266,11 +259,11 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
266259
/* 4-byte block address array */
267260
#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32)
268261

269-
/* 8-byte inode chunk indexes */
262+
/* 8-byte inode chunk index */
270263
struct erofs_inode_chunk_index {
271-
__le16 advise; /* always 0, don't care for now */
264+
__le16 startblk_hi; /* starting block number MSB */
272265
__le16 device_id; /* back-end storage id (with bits masked) */
273-
__le32 blkaddr; /* start block address of this inode chunk */
266+
__le32 startblk_lo; /* starting block number of this chunk */
274267
};
275268

276269
/* dirent sorts in alphabet order, thus we can do binary search */

fs/erofs/inode.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ static int erofs_read_inode(struct inode *inode)
108108
iu = dic->i_u;
109109
i_uid_write(inode, le16_to_cpu(dic->i_uid));
110110
i_gid_write(inode, le16_to_cpu(dic->i_gid));
111-
set_nlink(inode, le16_to_cpu(dic->i_nlink));
111+
set_nlink(inode, le16_to_cpu(dic->i_nb.nlink));
112112
inode_set_mtime(inode, sbi->build_time, sbi->build_time_nsec);
113113

114114
inode->i_size = le32_to_cpu(dic->i_size);
@@ -129,7 +129,7 @@ static int erofs_read_inode(struct inode *inode)
129129
case S_IFREG:
130130
case S_IFDIR:
131131
case S_IFLNK:
132-
vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr);
132+
vi->startblk = le32_to_cpu(iu.startblk_lo);
133133
if(S_ISLNK(inode->i_mode)) {
134134
err = erofs_fill_symlink(inode, ptr, ofs);
135135
if (err)
@@ -152,7 +152,7 @@ static int erofs_read_inode(struct inode *inode)
152152
}
153153

154154
if (erofs_inode_is_data_compressed(vi->datalayout))
155-
inode->i_blocks = le32_to_cpu(iu.compressed_blocks) <<
155+
inode->i_blocks = le32_to_cpu(iu.blocks_lo) <<
156156
(sb->s_blocksize_bits - 9);
157157
else
158158
inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;

fs/erofs/internal.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ struct erofs_device_info {
4747
struct dax_device *dax_dev;
4848
u64 dax_part_off;
4949

50-
u32 blocks;
51-
u32 mapped_blkaddr;
50+
erofs_blk_t blocks;
51+
erofs_blk_t uniaddr;
5252
};
5353

5454
enum {
@@ -252,7 +252,7 @@ struct erofs_inode {
252252
unsigned int *xattr_shared_xattrs;
253253

254254
union {
255-
erofs_blk_t raw_blkaddr;
255+
erofs_blk_t startblk;
256256
struct {
257257
unsigned short chunkformat;
258258
unsigned char chunkbits;

fs/erofs/super.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
178178
dif->file = file;
179179
}
180180

181-
dif->blocks = le32_to_cpu(dis->blocks);
182-
dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
181+
dif->blocks = le32_to_cpu(dis->blocks_lo);
182+
dif->uniaddr = le32_to_cpu(dis->uniaddr_lo);
183183
sbi->total_blocks += dif->blocks;
184184
*pos += EROFS_DEVT_SLOT_SIZE;
185185
return 0;
@@ -299,7 +299,7 @@ static int erofs_read_superblock(struct super_block *sb)
299299
sbi->sb_size);
300300
goto out;
301301
}
302-
sbi->dif0.blocks = le32_to_cpu(dsb->blocks);
302+
sbi->dif0.blocks = le32_to_cpu(dsb->blocks_lo);
303303
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
304304
#ifdef CONFIG_EROFS_FS_XATTR
305305
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -308,12 +308,12 @@ static int erofs_read_superblock(struct super_block *sb)
308308
sbi->xattr_filter_reserved = dsb->xattr_filter_reserved;
309309
#endif
310310
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
311-
sbi->root_nid = le16_to_cpu(dsb->root_nid);
311+
sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
312312
sbi->packed_nid = le64_to_cpu(dsb->packed_nid);
313313
sbi->inos = le64_to_cpu(dsb->inos);
314314

315-
sbi->build_time = le64_to_cpu(dsb->build_time);
316-
sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
315+
sbi->build_time = le64_to_cpu(dsb->epoch);
316+
sbi->build_time_nsec = le32_to_cpu(dsb->fixed_nsec);
317317

318318
super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
319319

0 commit comments

Comments
 (0)