Skip to content

Commit 07de55c

Browse files
chaseyu and Jaegeuk Kim
authored and committed
f2fs: fix lock priority inversion issue
If a userspace thread holds an f2fs rw semaphore and, due to its low priority, stays in a runnable or preempted state for a long time, then during that time it blocks any high-priority thread which is trying to grab the same rw semaphore, e.g. cp_rwsem, io_rwsem...
To fix this issue, let's check the thread's priority when it tries to grab an f2fs_rwsem lock: if the priority is lower than a priority threshold, uplift the priority before the thread enters the critical region of the lock, and restore the priority after it leaves the critical region.
Meanwhile, introduce two new sysfs nodes:
- /sys/fs/f2fs/<disk>/adjust_lock_priority, which is used to control whether the functionality is enabled or not.
========== ==================
Flag_Value Flag_Description
========== ==================
0x00000000 Disabled (default)
0x00000001 cp_rwsem
0x00000002 node_change
0x00000004 node_write
0x00000008 gc_lock
0x00000010 cp_global
0x00000020 io_rwsem
========== ==================
- /sys/fs/f2fs/<disk>/lock_duration_priority, which is used to control the priority threshold.
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
1 parent d860974 commit 07de55c

5 files changed

Lines changed: 120 additions & 2 deletions

File tree

Documentation/ABI/testing/sysfs-fs-f2fs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -963,3 +963,27 @@ Description: This sysfs entry can be used to change type of injected timeout:
963963
0x00000003 Simulate Non-IO type sleep time
964964
0x00000004 Simulate runnable time
965965
========== ===============================
966+
967+
What: /sys/fs/f2fs/<disk>/adjust_lock_priority
968+
Date: January 2026
969+
Contact: "Chao Yu" <chao@kernel.org>
970+
Description: This sysfs entry can be used to enable/disable priority adjustment for a task
971+
which is in a critical region covered by a lock.
972+
========== ==================
973+
Flag_Value Flag_Description
974+
========== ==================
975+
0x00000000 Disabled (default)
976+
0x00000001 cp_rwsem
977+
0x00000002 node_change
978+
0x00000004 node_write
979+
0x00000008 gc_lock
980+
0x00000010 cp_global
981+
0x00000020 io_rwsem
982+
========== ==================
983+
984+
What: /sys/fs/f2fs/<disk>/lock_duration_priority
985+
Date: January 2026
986+
Contact: "Chao Yu" <chao@kernel.org>
987+
Description: f2fs can tune the priority of a thread which has entered a critical region covered by
988+
an f2fs rw semaphore lock. This sysfs entry can be used to control the priority value; the
989+
range is [100,139], and the default value is 120.

fs/f2fs/checkpoint.c

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,43 +90,105 @@ static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
9090
runnable_time, io_sleep_time, other_time);
9191
}
9292

93+
/*
 * Decide whether a task taking @sem should have its priority uplifted.
 *
 * Returns false when the bit for this lock, BIT(sem->name - 1), is not set in
 * sbi->adjust_lock_priority. Otherwise the answer depends on the lock name:
 * LOCK_NAME_CP_RWSEM, LOCK_NAME_NODE_CHANGE and LOCK_NAME_NODE_WRITE uplift
 * readers only (!is_write); LOCK_NAME_GC_LOCK, LOCK_NAME_CP_GLOBAL and
 * LOCK_NAME_IO_RWSEM uplift both readers and writers. Any other name hits
 * f2fs_bug_on() and then falls through to the final "return false".
 *
 * NOTE(review): the bit index is sem->name - 1, so sem->name is presumably
 * always >= 1 when this is called -- confirm against enum f2fs_lock_name.
 */
static bool need_uplift_priority(struct f2fs_rwsem *sem, bool is_write)
94+
{
95+
if (!(sem->sbi->adjust_lock_priority & BIT(sem->name - 1))) /* per-lock enable bit */
96+
return false;
97+
98+
switch (sem->name) {
99+
/*
100+
* writer is checkpoint which has high priority, let's just uplift
101+
* priority for reader
102+
*/
103+
case LOCK_NAME_CP_RWSEM:
104+
case LOCK_NAME_NODE_CHANGE:
105+
case LOCK_NAME_NODE_WRITE:
106+
return !is_write;
107+
case LOCK_NAME_GC_LOCK:
108+
case LOCK_NAME_CP_GLOBAL:
109+
case LOCK_NAME_IO_RWSEM:
110+
return true; /* both readers and writers are uplifted */
111+
default:
112+
f2fs_bug_on(sem->sbi, 1); /* unexpected lock name */
113+
}
114+
return false; /* only reached from the default case */
115+
}
116+
117+
/*
 * Raise the priority of the current task before it takes @sem, if eligible.
 *
 * @sem:      lock about to be taken; sem->sbi supplies the tunables
 * @lc:       per-acquisition context; orig_nice/new_nice/need_restore are
 *            written here so restore_priority() can undo the change
 * @is_write: true for a write acquisition, false for a read acquisition
 *
 * lc->need_restore is always cleared first. Nothing else happens when the
 * feature mask is zero, when the current task is an rt task, or when
 * need_uplift_priority() declines. Otherwise the current nice value is saved
 * and the target nice value is derived from sbi->lock_duration_priority via
 * PRIO_TO_NICE(); the nice value is changed only when the current one is
 * numerically greater than the target.
 */
static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc,
118+
bool is_write)
119+
{
120+
lc->need_restore = false;
121+
if (!sem->sbi->adjust_lock_priority) /* feature disabled for every lock */
122+
return;
123+
if (rt_task(current))
124+
return;
125+
if (!need_uplift_priority(sem, is_write))
126+
return;
127+
lc->orig_nice = task_nice(current);
128+
lc->new_nice = PRIO_TO_NICE(sem->sbi->lock_duration_priority);
129+
if (lc->orig_nice <= lc->new_nice) /* nice already at or below the target */
130+
return;
131+
set_user_nice(current, lc->new_nice);
132+
lc->need_restore = true; /* tells restore_priority() to undo this */
133+
}
}
134+
135+
/*
 * Undo a priority change recorded in @lc by uplift_priority().
 *
 * Does nothing unless lc->need_restore is set. If the current nice value no
 * longer equals lc->new_nice, the value is left alone; otherwise the task is
 * set back to lc->orig_nice. lc->need_restore is not cleared here.
 */
static void restore_priority(struct f2fs_lock_context *lc)
136+
{
137+
if (!lc->need_restore)
138+
return;
139+
/* someone has updated the priority */
140+
if (task_nice(current) != lc->new_nice)
141+
return;
142+
set_user_nice(current, lc->orig_nice);
143+
}
144+
93145
/*
 * Take @sem for read: first call uplift_priority() for a read acquisition,
 * then f2fs_down_read(), then start elapsed-time tracing for @lc.
 */
void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
94146
{
147+
uplift_priority(sem, lc, false); /* is_write = false: read side */
95148
f2fs_down_read(sem);
96149
trace_lock_elapsed_time_start(sem, lc);
97150
}
98151

99152
/*
 * Try to take @sem for read.
 *
 * uplift_priority() is called before the trylock. Returns 0 when
 * f2fs_down_read_trylock() fails, after calling restore_priority(); returns 1
 * on success, after starting elapsed-time tracing for @lc.
 */
int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
100153
{
101-
if (!f2fs_down_read_trylock(sem)) /* removed line: pre-change form of the check */
154+
uplift_priority(sem, lc, false);
155+
if (!f2fs_down_read_trylock(sem)) {
156+
restore_priority(lc); /* lock not taken: undo the uplift */
102157
return 0;
158+
}
103159
trace_lock_elapsed_time_start(sem, lc);
104160
return 1;
105161
}
106162

107163
/*
 * Release a read hold on @sem, then call restore_priority() for @lc, then
 * end elapsed-time tracing (is_write = false).
 */
void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
108164
{
109165
f2fs_up_read(sem);
166+
restore_priority(lc); /* runs only after the lock has been released */
110167
trace_lock_elapsed_time_end(sem, lc, false);
111168
}
112169

113170
/*
 * Take @sem for write: first call uplift_priority() for a write acquisition,
 * then f2fs_down_write(), then start elapsed-time tracing for @lc.
 */
void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
114171
{
172+
uplift_priority(sem, lc, true); /* is_write = true: write side */
115173
f2fs_down_write(sem);
116174
trace_lock_elapsed_time_start(sem, lc);
117175
}
118176

119177
/*
 * Try to take @sem for write.
 *
 * uplift_priority() is called before the trylock. Returns 0 when
 * f2fs_down_write_trylock() fails, after calling restore_priority(); returns
 * 1 on success, after starting elapsed-time tracing for @lc.
 */
int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
120178
{
121-
if (!f2fs_down_write_trylock(sem)) /* removed line: pre-change form of the check */
179+
uplift_priority(sem, lc, true);
180+
if (!f2fs_down_write_trylock(sem)) {
181+
restore_priority(lc); /* lock not taken: undo the uplift */
122182
return 0;
183+
}
123184
trace_lock_elapsed_time_start(sem, lc);
124185
return 1;
125186
}
126187

127188
/*
 * Release a write hold on @sem, then call restore_priority() for @lc, then
 * end elapsed-time tracing (is_write = true).
 */
void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
128189
{
129190
f2fs_up_write(sem);
191+
restore_priority(lc); /* runs only after the lock has been released */
130192
trace_lock_elapsed_time_end(sem, lc, true);
131193
}
132194

fs/f2fs/f2fs.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ enum f2fs_lock_name {
185185
LOCK_NAME_GC_LOCK,
186186
LOCK_NAME_CP_GLOBAL,
187187
LOCK_NAME_IO_RWSEM,
188+
LOCK_NAME_MAX,
188189
};
189190

190191
enum f2fs_timeout_type {
@@ -1447,7 +1448,10 @@ struct f2fs_time_stat {
14471448

14481449
/*
 * Per-acquisition state handed to the f2fs_*_trace() lock helpers.
 * The nice-related fields are written by uplift_priority() and read back by
 * restore_priority() in fs/f2fs/checkpoint.c.
 */
struct f2fs_lock_context {
14491450
struct f2fs_time_stat ts;
1451+
int orig_nice; /* task_nice(current) sampled before the uplift */
1452+
int new_nice; /* PRIO_TO_NICE(sbi->lock_duration_priority) at uplift time */
14501453
bool lock_trace;
1454+
bool need_restore; /* true only when uplift_priority() changed the nice value */
14511455
};
14521456

14531457
struct f2fs_gc_control {
@@ -1588,6 +1592,8 @@ enum node_type {
15881592
/* a threshold of maximum elapsed time in critical region to print tracepoint */
15891593
#define MAX_LOCK_ELAPSED_TIME 500
15901594

1595+
/* initial value of sbi->lock_duration_priority, assigned in init_sb_info() */
#define F2FS_DEFAULT_TASK_PRIORITY (DEFAULT_PRIO)
1596+
15911597
static inline int f2fs_test_bit(unsigned int nr, char *addr);
15921598
static inline void f2fs_set_bit(unsigned int nr, char *addr);
15931599
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1998,6 +2004,12 @@ struct f2fs_sb_info {
19982004
/* max elapsed time threshold in critical region that lock covered */
19992005
unsigned long long max_lock_elapsed_time;
20002006

2007+
/* enable/disable to adjust task priority in critical region covered by lock */
2008+
unsigned int adjust_lock_priority;
2009+
2010+
/* adjust priority for task which is in critical region covered by lock */
2011+
unsigned int lock_duration_priority;
2012+
20012013
#ifdef CONFIG_F2FS_FS_COMPRESSION
20022014
struct kmem_cache *page_array_slab; /* page array entry */
20032015
unsigned int page_array_slab_size; /* default page array slab size */

fs/f2fs/super.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4338,6 +4338,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
43384338
spin_lock_init(&sbi->gc_remaining_trials_lock);
43394339
atomic64_set(&sbi->current_atomic_write, 0);
43404340
sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME;
4341+
sbi->adjust_lock_priority = 0;
4342+
sbi->lock_duration_priority = F2FS_DEFAULT_TASK_PRIORITY;
43414343

43424344
sbi->sum_blocksize = f2fs_sb_has_packed_ssa(sbi) ?
43434345
4096 : sbi->blocksize;

fs/f2fs/sysfs.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -955,6 +955,20 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
955955
return count;
956956
}
957957

958+
if (!strcmp(a->attr.name, "adjust_lock_priority")) {
959+
if (t >= BIT(LOCK_NAME_MAX - 1))
960+
return -EINVAL;
961+
sbi->adjust_lock_priority = t;
962+
return count;
963+
}
964+
965+
if (!strcmp(a->attr.name, "lock_duration_priority")) {
966+
if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE))
967+
return -EINVAL;
968+
sbi->lock_duration_priority = t;
969+
return count;
970+
}
971+
958972
__sbi_store_value(a, sbi, ptr + a->offset, t);
959973

960974
return count;
@@ -1272,6 +1286,8 @@ F2FS_SBI_GENERAL_RW_ATTR(carve_out);
12721286
F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section);
12731287
F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware);
12741288
F2FS_SBI_GENERAL_RW_ATTR(max_lock_elapsed_time);
1289+
F2FS_SBI_GENERAL_RW_ATTR(lock_duration_priority);
1290+
F2FS_SBI_GENERAL_RW_ATTR(adjust_lock_priority);
12751291

12761292
/* STAT_INFO ATTR */
12771293
#ifdef CONFIG_F2FS_STAT_FS
@@ -1478,6 +1494,8 @@ static struct attribute *f2fs_attrs[] = {
14781494
ATTR_LIST(allocate_section_hint),
14791495
ATTR_LIST(allocate_section_policy),
14801496
ATTR_LIST(max_lock_elapsed_time),
1497+
ATTR_LIST(lock_duration_priority),
1498+
ATTR_LIST(adjust_lock_priority),
14811499
NULL,
14821500
};
14831501
ATTRIBUTE_GROUPS(f2fs);

0 commit comments

Comments
 (0)