Skip to content

Commit 5ef3208

Browse files
Hongzhen Luohsiangkao
authored andcommitted
erofs: introduce the page cache share feature
Currently, reading files with different paths (or names) but the same content will consume multiple copies of the page cache, even if the content of these page caches is the same. For example, reading identical files (e.g., *.so files) from two different minor versions of container images will cost multiple copies of the same page cache, since different containers have different mount points. Therefore, sharing the page cache for files with the same content can save memory. This introduces the page cache share feature in erofs. It allocates a shared inode and uses its page cache as the shared one. Reads for files with identical content will ultimately be routed to the page cache of the shared inode. In this way, a single page cache satisfies multiple read requests for different files with the same contents. We introduce a new mount option `inode_share` to enable the page sharing mode during mounting. This option is used in conjunction with `domain_id` to share the page cache within the same trusted domain. Signed-off-by: Hongzhen Luo <hongzhen@linux.alibaba.com> Signed-off-by: Hongbo Li <lihongbo22@huawei.com> Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
1 parent e77762e commit 5ef3208

7 files changed

Lines changed: 301 additions & 2 deletions

File tree

Documentation/filesystems/erofs.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,12 @@ fsid=%s Specify a filesystem image ID for Fscache back-end.
131131
domain_id=%s Specify a trusted domain ID for fscache mode so that
132132
different images with the same blobs, identified by blob IDs,
133133
can share storage within the same trusted domain.
134+
Also used for different filesystems with inode page sharing
135+
enabled to share page cache within the trusted domain.
134136
fsoffset=%llu Specify block-aligned filesystem offset for the primary device.
137+
inode_share Enable inode page sharing for this filesystem. Inodes with
138+
identical content within the same domain ID can share the
139+
page cache.
135140
=================== =========================================================
136141

137142
Sysfs Entries

fs/erofs/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
1010
erofs-$(CONFIG_EROFS_FS_ZIP_ACCEL) += decompressor_crypto.o
1111
erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
1212
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
13+
erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) += ishare.o

fs/erofs/internal.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ struct erofs_sb_info {
176176
#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
177177
#define EROFS_MOUNT_DAX_NEVER 0x00000080
178178
#define EROFS_MOUNT_DIRECT_IO 0x00000100
179+
#define EROFS_MOUNT_INODE_SHARE 0x00000200
179180

180181
#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
181182
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
@@ -266,6 +267,11 @@ static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid)
266267
/* default readahead size of directories */
267268
#define EROFS_DIR_RA_BYTES 16384
268269

270+
/*
 * Content fingerprint used to look up the anonymous shared inode.
 * @opaque is a kfree()-able byte buffer and @size its length in bytes;
 * ishare.c hashes the buffer with xxh32() and compares it with memcmp().
 */
struct erofs_inode_fingerprint {
271+
u8 *opaque;
272+
int size;
273+
};
274+
269275
struct erofs_inode {
270276
erofs_nid_t nid;
271277

@@ -301,6 +307,18 @@ struct erofs_inode {
301307
};
302308
#endif /* CONFIG_EROFS_FS_ZIP */
303309
};
310+
#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
311+
struct list_head ishare_list;
312+
union {
313+
/* for each anon shared inode */
314+
struct {
315+
struct erofs_inode_fingerprint fingerprint;
316+
spinlock_t ishare_lock;
317+
};
318+
/* for each real inode */
319+
struct inode *sharedinode;
320+
};
321+
#endif
304322
/* the corresponding vfs inode */
305323
struct inode vfs_inode;
306324
};
@@ -407,6 +425,7 @@ extern const struct inode_operations erofs_dir_iops;
407425

408426
extern const struct file_operations erofs_file_fops;
409427
extern const struct file_operations erofs_dir_fops;
428+
extern const struct file_operations erofs_ishare_fops;
410429

411430
extern const struct iomap_ops z_erofs_iomap_report_ops;
412431

@@ -560,6 +579,18 @@ static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) {
560579
static inline void erofs_fscache_submit_bio(struct bio *bio) {}
561580
#endif
562581

582+
/*
 * Page cache sharing entry points (implemented in ishare.c). When the
 * feature is compiled out, the stubs below make every call a no-op and
 * erofs_ishare_fill_inode() always reports that no sharing was set up.
 */
#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
583+
int __init erofs_init_ishare(void);
584+
void erofs_exit_ishare(void);
585+
bool erofs_ishare_fill_inode(struct inode *inode);
586+
void erofs_ishare_free_inode(struct inode *inode);
587+
#else
588+
static inline int erofs_init_ishare(void) { return 0; }
589+
static inline void erofs_exit_ishare(void) {}
590+
static inline bool erofs_ishare_fill_inode(struct inode *inode) { return false; }
591+
static inline void erofs_ishare_free_inode(struct inode *inode) {}
592+
#endif
593+
563594
long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
564595
long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
565596
unsigned long arg);

fs/erofs/ishare.c

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
// SPDX-License-Identifier: GPL-2.0-or-later
2+
/*
3+
* Copyright (C) 2024, Alibaba Cloud
4+
*/
5+
#include <linux/xxhash.h>
6+
#include <linux/mount.h>
7+
#include "internal.h"
8+
#include "xattr.h"
9+
10+
#include "../internal.h"
11+
12+
static struct vfsmount *erofs_ishare_mnt;
13+
14+
static int erofs_ishare_iget5_eq(struct inode *inode, void *data)
15+
{
16+
struct erofs_inode_fingerprint *fp1 = &EROFS_I(inode)->fingerprint;
17+
struct erofs_inode_fingerprint *fp2 = data;
18+
19+
return fp1->size == fp2->size &&
20+
!memcmp(fp1->opaque, fp2->opaque, fp2->size);
21+
}
22+
23+
static int erofs_ishare_iget5_set(struct inode *inode, void *data)
24+
{
25+
struct erofs_inode *vi = EROFS_I(inode);
26+
27+
vi->fingerprint = *(struct erofs_inode_fingerprint *)data;
28+
INIT_LIST_HEAD(&vi->ishare_list);
29+
spin_lock_init(&vi->ishare_lock);
30+
return 0;
31+
}
32+
33+
bool erofs_ishare_fill_inode(struct inode *inode)
34+
{
35+
struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
36+
struct erofs_inode *vi = EROFS_I(inode);
37+
struct erofs_inode_fingerprint fp;
38+
struct inode *sharedinode;
39+
unsigned long hash;
40+
41+
if (erofs_xattr_fill_inode_fingerprint(&fp, inode, sbi->domain_id))
42+
return false;
43+
hash = xxh32(fp.opaque, fp.size, 0);
44+
sharedinode = iget5_locked(erofs_ishare_mnt->mnt_sb, hash,
45+
erofs_ishare_iget5_eq, erofs_ishare_iget5_set,
46+
&fp);
47+
if (!sharedinode) {
48+
kfree(fp.opaque);
49+
return false;
50+
}
51+
52+
if (inode_state_read_once(sharedinode) & I_NEW) {
53+
if (erofs_inode_set_aops(sharedinode, inode, true)) {
54+
iget_failed(sharedinode);
55+
kfree(fp.opaque);
56+
return false;
57+
}
58+
sharedinode->i_size = vi->vfs_inode.i_size;
59+
unlock_new_inode(sharedinode);
60+
} else {
61+
kfree(fp.opaque);
62+
if (sharedinode->i_size != vi->vfs_inode.i_size) {
63+
_erofs_printk(inode->i_sb, KERN_WARNING
64+
"size(%lld:%lld) not matches for the same fingerprint\n",
65+
vi->vfs_inode.i_size, sharedinode->i_size);
66+
iput(sharedinode);
67+
return false;
68+
}
69+
}
70+
vi->sharedinode = sharedinode;
71+
INIT_LIST_HEAD(&vi->ishare_list);
72+
spin_lock(&EROFS_I(sharedinode)->ishare_lock);
73+
list_add(&vi->ishare_list, &EROFS_I(sharedinode)->ishare_list);
74+
spin_unlock(&EROFS_I(sharedinode)->ishare_lock);
75+
return true;
76+
}
77+
78+
void erofs_ishare_free_inode(struct inode *inode)
79+
{
80+
struct erofs_inode *vi = EROFS_I(inode);
81+
struct inode *sharedinode = vi->sharedinode;
82+
83+
if (!sharedinode)
84+
return;
85+
spin_lock(&EROFS_I(sharedinode)->ishare_lock);
86+
list_del(&vi->ishare_list);
87+
spin_unlock(&EROFS_I(sharedinode)->ishare_lock);
88+
iput(sharedinode);
89+
vi->sharedinode = NULL;
90+
}
91+
92+
static int erofs_ishare_file_open(struct inode *inode, struct file *file)
93+
{
94+
struct inode *sharedinode = EROFS_I(inode)->sharedinode;
95+
struct file *realfile;
96+
97+
if (file->f_flags & O_DIRECT)
98+
return -EINVAL;
99+
realfile = alloc_empty_backing_file(O_RDONLY|O_NOATIME, current_cred());
100+
if (IS_ERR(realfile))
101+
return PTR_ERR(realfile);
102+
ihold(sharedinode);
103+
realfile->f_op = &erofs_file_fops;
104+
realfile->f_inode = sharedinode;
105+
realfile->f_mapping = sharedinode->i_mapping;
106+
path_get(&file->f_path);
107+
backing_file_set_user_path(realfile, &file->f_path);
108+
109+
file_ra_state_init(&realfile->f_ra, file->f_mapping);
110+
realfile->private_data = EROFS_I(inode);
111+
file->private_data = realfile;
112+
return 0;
113+
}
114+
115+
static int erofs_ishare_file_release(struct inode *inode, struct file *file)
116+
{
117+
struct file *realfile = file->private_data;
118+
119+
iput(realfile->f_inode);
120+
fput(realfile);
121+
file->private_data = NULL;
122+
return 0;
123+
}
124+
125+
static ssize_t erofs_ishare_file_read_iter(struct kiocb *iocb,
126+
struct iov_iter *to)
127+
{
128+
struct file *realfile = iocb->ki_filp->private_data;
129+
struct kiocb dedup_iocb;
130+
ssize_t nread;
131+
132+
if (!iov_iter_count(to))
133+
return 0;
134+
kiocb_clone(&dedup_iocb, iocb, realfile);
135+
nread = filemap_read(&dedup_iocb, to, 0);
136+
iocb->ki_pos = dedup_iocb.ki_pos;
137+
return nread;
138+
}
139+
140+
static int erofs_ishare_mmap(struct file *file, struct vm_area_struct *vma)
141+
{
142+
struct file *realfile = file->private_data;
143+
144+
vma_set_file(vma, realfile);
145+
return generic_file_readonly_mmap(file, vma);
146+
}
147+
148+
const struct file_operations erofs_ishare_fops = {
149+
.open = erofs_ishare_file_open,
150+
.llseek = generic_file_llseek,
151+
.read_iter = erofs_ishare_file_read_iter,
152+
.mmap = erofs_ishare_mmap,
153+
.release = erofs_ishare_file_release,
154+
.get_unmapped_area = thp_get_unmapped_area,
155+
.splice_read = filemap_splice_read,
156+
};
157+
158+
/*
 * Mount the internal anonymous filesystem that hosts the shared inodes.
 * Returns 0 on success or the error encoded in kern_mount()'s result.
 */
int __init erofs_init_ishare(void)
{
	struct vfsmount *mnt = kern_mount(&erofs_anon_fs_type);

	erofs_ishare_mnt = mnt;
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);
	return 0;
}
163+
164+
void erofs_exit_ishare(void)
165+
{
166+
kern_unmount(erofs_ishare_mnt);
167+
}

fs/erofs/super.c

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,7 @@ static void erofs_default_options(struct erofs_sb_info *sbi)
398398
enum {
399399
Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum,
400400
Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_fsoffset,
401+
Opt_inode_share,
401402
};
402403

403404
static const struct constant_table erofs_param_cache_strategy[] = {
@@ -425,6 +426,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
425426
fsparam_string("domain_id", Opt_domain_id),
426427
fsparam_flag_no("directio", Opt_directio),
427428
fsparam_u64("fsoffset", Opt_fsoffset),
429+
fsparam_flag("inode_share", Opt_inode_share),
428430
{}
429431
};
430432

@@ -526,6 +528,8 @@ static int erofs_fc_parse_param(struct fs_context *fc,
526528
if (!sbi->fsid)
527529
return -ENOMEM;
528530
break;
531+
#endif
532+
#if defined(CONFIG_EROFS_FS_ONDEMAND) || defined(CONFIG_EROFS_FS_PAGE_CACHE_SHARE)
529533
case Opt_domain_id:
530534
kfree_sensitive(sbi->domain_id);
531535
sbi->domain_id = no_free_ptr(param->string);
@@ -549,6 +553,13 @@ static int erofs_fc_parse_param(struct fs_context *fc,
549553
case Opt_fsoffset:
550554
sbi->dif0.fsoff = result.uint_64;
551555
break;
556+
case Opt_inode_share:
557+
#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
558+
set_opt(&sbi->opt, INODE_SHARE);
559+
#else
560+
errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
561+
#endif
562+
break;
552563
}
553564
return 0;
554565
}
@@ -647,6 +658,15 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
647658
sb->s_maxbytes = MAX_LFS_FILESIZE;
648659
sb->s_op = &erofs_sops;
649660

661+
if (!sbi->domain_id && test_opt(&sbi->opt, INODE_SHARE)) {
662+
errorfc(fc, "domain_id is needed when inode_ishare is on");
663+
return -EINVAL;
664+
}
665+
if (test_opt(&sbi->opt, DAX_ALWAYS) && test_opt(&sbi->opt, INODE_SHARE)) {
666+
errorfc(fc, "FSDAX is not allowed when inode_ishare is on");
667+
return -EINVAL;
668+
}
669+
650670
sbi->blkszbits = PAGE_SHIFT;
651671
if (!sb->s_bdev) {
652672
/*
@@ -724,6 +744,12 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
724744
erofs_info(sb, "unsupported blocksize for DAX");
725745
clear_opt(&sbi->opt, DAX_ALWAYS);
726746
}
747+
if (test_opt(&sbi->opt, INODE_SHARE) && !erofs_sb_has_ishare_xattrs(sbi)) {
748+
erofs_info(sb, "on-disk ishare xattrs not found. Turning off inode_share.");
749+
clear_opt(&sbi->opt, INODE_SHARE);
750+
}
751+
if (test_opt(&sbi->opt, INODE_SHARE))
752+
erofs_info(sb, "EXPERIMENTAL EROFS page cache share support in use. Use at your own risk!");
727753

728754
sb->s_time_gran = 1;
729755
sb->s_xattr = erofs_xattr_handlers;
@@ -953,10 +979,32 @@ static struct file_system_type erofs_fs_type = {
953979
};
954980
MODULE_ALIAS_FS("erofs");
955981

956-
#if defined(CONFIG_EROFS_FS_ONDEMAND)
982+
#if defined(CONFIG_EROFS_FS_ONDEMAND) || defined(CONFIG_EROFS_FS_PAGE_CACHE_SHARE)
983+
static void erofs_free_anon_inode(struct inode *inode)
984+
{
985+
struct erofs_inode *vi = EROFS_I(inode);
986+
987+
#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
988+
kfree(vi->fingerprint.opaque);
989+
#endif
990+
kmem_cache_free(erofs_inode_cachep, vi);
991+
}
992+
993+
static const struct super_operations erofs_anon_sops = {
994+
.alloc_inode = erofs_alloc_inode,
995+
.drop_inode = inode_just_drop,
996+
.free_inode = erofs_free_anon_inode,
997+
};
998+
957999
/*
 * Set up the fs_context of the anonymous pseudo filesystem.  Returns
 * -ENOMEM when init_pseudo() yields no context; the added lines also
 * install erofs_anon_sops as the superblock operations.
 */
static int erofs_anon_init_fs_context(struct fs_context *fc)
9581000
{
959-
return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
1001+
struct pseudo_fs_context *ctx;
1002+
1003+
ctx = init_pseudo(fc, EROFS_SUPER_MAGIC);
1004+
if (!ctx)
1005+
return -ENOMEM;
1006+
ctx->ops = &erofs_anon_sops;
1007+
return 0;
9601008
}
9611009

9621010
struct file_system_type erofs_anon_fs_type = {
@@ -991,13 +1039,19 @@ static int __init erofs_module_init(void)
9911039
if (err)
9921040
goto sysfs_err;
9931041

1042+
err = erofs_init_ishare();
1043+
if (err)
1044+
goto ishare_err;
1045+
9941046
err = register_filesystem(&erofs_fs_type);
9951047
if (err)
9961048
goto fs_err;
9971049

9981050
return 0;
9991051

10001052
fs_err:
1053+
erofs_exit_ishare();
1054+
ishare_err:
10011055
erofs_exit_sysfs();
10021056
sysfs_err:
10031057
z_erofs_exit_subsystem();
@@ -1015,6 +1069,7 @@ static void __exit erofs_module_exit(void)
10151069
/* Ensure all RCU free inodes / pclusters are safe to be destroyed. */
10161070
rcu_barrier();
10171071

1072+
erofs_exit_ishare();
10181073
erofs_exit_sysfs();
10191074
z_erofs_exit_subsystem();
10201075
erofs_exit_shrinker();
@@ -1069,6 +1124,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
10691124
#endif
10701125
if (sbi->dif0.fsoff)
10711126
seq_printf(seq, ",fsoffset=%llu", sbi->dif0.fsoff);
1127+
if (test_opt(opt, INODE_SHARE))
1128+
seq_puts(seq, ",inode_share");
10721129
return 0;
10731130
}
10741131

@@ -1079,6 +1136,7 @@ static void erofs_evict_inode(struct inode *inode)
10791136
dax_break_layout_final(inode);
10801137
#endif
10811138

1139+
erofs_ishare_free_inode(inode);
10821140
truncate_inode_pages_final(&inode->i_data);
10831141
clear_inode(inode);
10841142
}

0 commit comments

Comments
 (0)