Skip to content

Commit d927a59

Browse files
MaxKellermann authored and idryomov committed
ceph: add trace points to the MDS client
This patch adds trace points to the Ceph filesystem MDS client: - request submission (CEPH_MSG_CLIENT_REQUEST) and completion (CEPH_MSG_CLIENT_REPLY) - capabilities (CEPH_MSG_CLIENT_CAPS) These are the central pieces that are useful for analyzing MDS latency/performance problems from the client's perspective. In the long run, all doutc() calls should be replaced with tracepoints. This way, the Ceph filesystem can be traced at any time (without spamming the kernel log). Additionally, trace points can be used in BPF programs (which can even dereference the pointer parameters and extract more values). Signed-off-by: Max Kellermann <max.kellermann@ionos.com> Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
1 parent d6c2f41 commit d927a59

4 files changed

Lines changed: 259 additions & 2 deletions

File tree

fs/ceph/caps.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "crypto.h"
1919
#include <linux/ceph/decode.h>
2020
#include <linux/ceph/messenger.h>
21+
#include <trace/events/ceph.h>
2122

2223
/*
2324
* Capability management
@@ -4452,6 +4453,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
44524453
session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode,
44534454
seq, issue_seq, mseq);
44544455

4456+
trace_ceph_handle_caps(mdsc, session, op, &vino, ceph_inode(inode),
4457+
seq, issue_seq, mseq);
4458+
44554459
mutex_lock(&session->s_mutex);
44564460

44574461
if (!inode) {

fs/ceph/mds_client.c

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <linux/ceph/pagelist.h>
2525
#include <linux/ceph/auth.h>
2626
#include <linux/ceph/debugfs.h>
27+
#include <trace/events/ceph.h>
2728

2829
#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
2930

@@ -3288,6 +3289,8 @@ static void complete_request(struct ceph_mds_client *mdsc,
32883289
{
32893290
req->r_end_latency = ktime_get();
32903291

3292+
trace_ceph_mdsc_complete_request(mdsc, req);
3293+
32913294
if (req->r_callback)
32923295
req->r_callback(mdsc, req);
32933296
complete_all(&req->r_completion);
@@ -3419,6 +3422,8 @@ static int __send_request(struct ceph_mds_session *session,
34193422
{
34203423
int err;
34213424

3425+
trace_ceph_mdsc_send_request(session, req);
3426+
34223427
err = __prepare_send_request(session, req, drop_cap_releases);
34233428
if (!err) {
34243429
ceph_msg_get(req->r_request);
@@ -3470,6 +3475,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
34703475
}
34713476
if (mdsc->mdsmap->m_epoch == 0) {
34723477
doutc(cl, "no mdsmap, waiting for map\n");
3478+
trace_ceph_mdsc_suspend_request(mdsc, session, req,
3479+
ceph_mdsc_suspend_reason_no_mdsmap);
34733480
list_add(&req->r_wait, &mdsc->waiting_for_map);
34743481
return;
34753482
}
@@ -3491,6 +3498,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
34913498
goto finish;
34923499
}
34933500
doutc(cl, "no mds or not active, waiting for map\n");
3501+
trace_ceph_mdsc_suspend_request(mdsc, session, req,
3502+
ceph_mdsc_suspend_reason_no_active_mds);
34943503
list_add(&req->r_wait, &mdsc->waiting_for_map);
34953504
return;
34963505
}
@@ -3536,9 +3545,11 @@ static void __do_request(struct ceph_mds_client *mdsc,
35363545
* it to the mdsc queue.
35373546
*/
35383547
if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
3539-
if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER))
3548+
if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) {
3549+
trace_ceph_mdsc_suspend_request(mdsc, session, req,
3550+
ceph_mdsc_suspend_reason_rejected);
35403551
list_add(&req->r_wait, &mdsc->waiting_for_map);
3541-
else
3552+
} else
35423553
err = -EACCES;
35433554
goto out_session;
35443555
}
@@ -3552,6 +3563,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
35523563
if (random)
35533564
req->r_resend_mds = mds;
35543565
}
3566+
trace_ceph_mdsc_suspend_request(mdsc, session, req,
3567+
ceph_mdsc_suspend_reason_session);
35553568
list_add(&req->r_wait, &session->s_waiting);
35563569
goto out_session;
35573570
}
@@ -3652,6 +3665,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
36523665
list_del_init(&req->r_wait);
36533666
doutc(cl, " wake request %p tid %llu\n", req,
36543667
req->r_tid);
3668+
trace_ceph_mdsc_resume_request(mdsc, req);
36553669
__do_request(mdsc, req);
36563670
}
36573671
}
@@ -3678,6 +3692,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
36783692
req->r_session->s_mds == mds) {
36793693
doutc(cl, " kicking tid %llu\n", req->r_tid);
36803694
list_del_init(&req->r_wait);
3695+
trace_ceph_mdsc_resume_request(mdsc, req);
36813696
__do_request(mdsc, req);
36823697
}
36833698
}
@@ -3724,6 +3739,7 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
37243739
doutc(cl, "submit_request on %p for inode %p\n", req, dir);
37253740
mutex_lock(&mdsc->mutex);
37263741
__register_request(mdsc, req, dir);
3742+
trace_ceph_mdsc_submit_request(mdsc, req);
37273743
__do_request(mdsc, req);
37283744
err = req->r_err;
37293745
mutex_unlock(&mdsc->mutex);

fs/ceph/super.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@
3030

3131
#include <uapi/linux/magic.h>
3232

33+
#define CREATE_TRACE_POINTS
34+
#include <trace/events/ceph.h>
35+
3336
static DEFINE_SPINLOCK(ceph_fsc_lock);
3437
static LIST_HEAD(ceph_fsc_list);
3538

include/trace/events/ceph.h

Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
/* SPDX-License-Identifier: GPL-2.0-or-later */
2+
/* Ceph filesystem support module tracepoints
3+
*
4+
* Copyright (C) 2025 IONOS SE. All Rights Reserved.
5+
* Written by Max Kellermann (max.kellermann@ionos.com)
6+
*/
7+
#undef TRACE_SYSTEM
8+
#define TRACE_SYSTEM ceph
9+
10+
#if !defined(_TRACE_CEPH_H) || defined(TRACE_HEADER_MULTI_READ)
11+
#define _TRACE_CEPH_H
12+
13+
#include <linux/tracepoint.h>
14+
15+
/*
 * Reasons recorded by the ceph_mdsc_suspend_request event.  Each entry pairs
 * an enum constant with the string shown for it in the trace output.
 */
#define ceph_mdsc_suspend_reasons						\
	EM(ceph_mdsc_suspend_reason_no_mdsmap,		"no-mdsmap")		\
	EM(ceph_mdsc_suspend_reason_no_active_mds,	"no-active-mds")	\
	EM(ceph_mdsc_suspend_reason_rejected,		"rejected")		\
	E_(ceph_mdsc_suspend_reason_session,		"session")

/*
 * Declare the enum exactly once even when this header is read multiple times
 * (TRACE_HEADER_MULTI_READ).  The guard name is specific to ceph so that
 * another trace header using the same EM()/E_() scheme, included in the same
 * translation unit, cannot suppress this declaration.
 */
#ifndef __CEPH_DECLARE_TRACE_ENUMS_ONCE_ONLY
#define __CEPH_DECLARE_TRACE_ENUMS_ONCE_ONLY

#undef EM
#undef E_
#define EM(a, b) a,
#define E_(a, b) a

enum ceph_mdsc_suspend_reason { ceph_mdsc_suspend_reasons } __mode(byte);

#endif
32+
33+
/*
34+
* Export enum symbols to userspace.
35+
*/
36+
#undef EM
#undef E_
#define EM(a, b)	TRACE_DEFINE_ENUM(a);
#define E_(a, b)	TRACE_DEFINE_ENUM(a);

ceph_mdsc_suspend_reasons;

/*
 * From here on EM() and E_() expand to { value, "string" } pairs, so the
 * list macros above can be handed to __print_symbolic() in TP_printk().
 */
#undef EM
#undef E_
#define EM(a, b)	{ a, b },
#define E_(a, b)	{ a, b }
51+
52+
TRACE_EVENT(ceph_mdsc_submit_request,
53+
TP_PROTO(struct ceph_mds_client *mdsc,
54+
struct ceph_mds_request *req),
55+
56+
TP_ARGS(mdsc, req),
57+
58+
TP_STRUCT__entry(
59+
__field(u64, tid)
60+
__field(int, op)
61+
__field(u64, ino)
62+
__field(u64, snap)
63+
),
64+
65+
TP_fast_assign(
66+
struct inode *inode;
67+
68+
__entry->tid = req->r_tid;
69+
__entry->op = req->r_op;
70+
71+
inode = req->r_inode;
72+
if (inode == NULL && req->r_dentry)
73+
inode = d_inode(req->r_dentry);
74+
75+
if (inode) {
76+
__entry->ino = ceph_ino(inode);
77+
__entry->snap = ceph_snap(inode);
78+
} else {
79+
__entry->ino = __entry->snap = 0;
80+
}
81+
),
82+
83+
TP_printk("R=%llu op=%s ino=%llx,%llx",
84+
__entry->tid,
85+
ceph_mds_op_name(__entry->op),
86+
__entry->ino, __entry->snap)
87+
);
88+
89+
TRACE_EVENT(ceph_mdsc_suspend_request,
90+
TP_PROTO(struct ceph_mds_client *mdsc,
91+
struct ceph_mds_session *session,
92+
struct ceph_mds_request *req,
93+
enum ceph_mdsc_suspend_reason reason),
94+
95+
TP_ARGS(mdsc, session, req, reason),
96+
97+
TP_STRUCT__entry(
98+
__field(u64, tid)
99+
__field(int, op)
100+
__field(int, mds)
101+
__field(enum ceph_mdsc_suspend_reason, reason)
102+
),
103+
104+
TP_fast_assign(
105+
__entry->tid = req->r_tid;
106+
__entry->op = req->r_op;
107+
__entry->mds = session ? session->s_mds : -1;
108+
__entry->reason = reason;
109+
),
110+
111+
TP_printk("R=%llu op=%s reason=%s",
112+
__entry->tid,
113+
ceph_mds_op_name(__entry->op),
114+
__print_symbolic(__entry->reason, ceph_mdsc_suspend_reasons))
115+
);
116+
117+
TRACE_EVENT(ceph_mdsc_resume_request,
118+
TP_PROTO(struct ceph_mds_client *mdsc,
119+
struct ceph_mds_request *req),
120+
121+
TP_ARGS(mdsc, req),
122+
123+
TP_STRUCT__entry(
124+
__field(u64, tid)
125+
__field(int, op)
126+
),
127+
128+
TP_fast_assign(
129+
__entry->tid = req->r_tid;
130+
__entry->op = req->r_op;
131+
),
132+
133+
TP_printk("R=%llu op=%s",
134+
__entry->tid,
135+
ceph_mds_op_name(__entry->op))
136+
);
137+
138+
TRACE_EVENT(ceph_mdsc_send_request,
139+
TP_PROTO(struct ceph_mds_session *session,
140+
struct ceph_mds_request *req),
141+
142+
TP_ARGS(session, req),
143+
144+
TP_STRUCT__entry(
145+
__field(u64, tid)
146+
__field(int, op)
147+
__field(int, mds)
148+
),
149+
150+
TP_fast_assign(
151+
__entry->tid = req->r_tid;
152+
__entry->op = req->r_op;
153+
__entry->mds = session->s_mds;
154+
),
155+
156+
TP_printk("R=%llu op=%s mds=%d",
157+
__entry->tid,
158+
ceph_mds_op_name(__entry->op),
159+
__entry->mds)
160+
);
161+
162+
TRACE_EVENT(ceph_mdsc_complete_request,
163+
TP_PROTO(struct ceph_mds_client *mdsc,
164+
struct ceph_mds_request *req),
165+
166+
TP_ARGS(mdsc, req),
167+
168+
TP_STRUCT__entry(
169+
__field(u64, tid)
170+
__field(int, op)
171+
__field(int, err)
172+
__field(unsigned long, latency_ns)
173+
),
174+
175+
TP_fast_assign(
176+
__entry->tid = req->r_tid;
177+
__entry->op = req->r_op;
178+
__entry->err = req->r_err;
179+
__entry->latency_ns = req->r_end_latency - req->r_start_latency;
180+
),
181+
182+
TP_printk("R=%llu op=%s err=%d latency_ns=%lu",
183+
__entry->tid,
184+
ceph_mds_op_name(__entry->op),
185+
__entry->err,
186+
__entry->latency_ns)
187+
);
188+
189+
TRACE_EVENT(ceph_handle_caps,
190+
TP_PROTO(struct ceph_mds_client *mdsc,
191+
struct ceph_mds_session *session,
192+
int op,
193+
const struct ceph_vino *vino,
194+
struct ceph_inode_info *inode,
195+
u32 seq, u32 mseq, u32 issue_seq),
196+
197+
TP_ARGS(mdsc, session, op, vino, inode, seq, mseq, issue_seq),
198+
199+
TP_STRUCT__entry(
200+
__field(int, mds)
201+
__field(int, op)
202+
__field(u64, ino)
203+
__field(u64, snap)
204+
__field(u32, seq)
205+
__field(u32, mseq)
206+
__field(u32, issue_seq)
207+
),
208+
209+
TP_fast_assign(
210+
__entry->mds = session->s_mds;
211+
__entry->op = op;
212+
__entry->ino = vino->ino;
213+
__entry->snap = vino->snap;
214+
__entry->seq = seq;
215+
__entry->mseq = mseq;
216+
__entry->issue_seq = issue_seq;
217+
),
218+
219+
TP_printk("mds=%d op=%s vino=%llx.%llx seq=%u iseq=%u mseq=%u",
220+
__entry->mds,
221+
ceph_cap_op_name(__entry->op),
222+
__entry->ino,
223+
__entry->snap,
224+
__entry->seq,
225+
__entry->issue_seq,
226+
__entry->mseq)
227+
);
228+
229+
#undef EM
230+
#undef E_
231+
#endif /* _TRACE_CEPH_H */
232+
233+
/* This part must be outside protection */
234+
#include <trace/define_trace.h>

0 commit comments

Comments
 (0)