@@ -2840,6 +2840,18 @@ static void encode_mclientrequest_tail(void **p,
28402840 }
28412841}
28422842
2843+ static struct ceph_mds_request_head_legacy *
2844+ find_legacy_request_head (void * p , u64 features )
2845+ { /* Return the legacy-layout head inside the encoded request head at p. */
2846+ bool legacy = !(features & CEPH_FEATURE_FS_BTIME ); /* peer feature bits lack FS_BTIME */
2847+ struct ceph_mds_request_head_old * ohead ;
2848+
2849+ if (legacy ) /* the buffer starts directly with the legacy head */
2850+ return (struct ceph_mds_request_head_legacy * )p ;
2851+ ohead = (struct ceph_mds_request_head_old * )p ;
2852+ return (struct ceph_mds_request_head_legacy * )& ohead -> oldest_client_tid ; /* legacy view starts at oldest_client_tid */
2853+ }
2854+
28432855/*
28442856 * called under mdsc->mutex
28452857 */
@@ -2850,7 +2862,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
28502862 int mds = session -> s_mds ;
28512863 struct ceph_mds_client * mdsc = session -> s_mdsc ;
28522864 struct ceph_msg * msg ;
2853- struct ceph_mds_request_head_old * head ;
2865+ struct ceph_mds_request_head_legacy * lhead ;
28542866 const char * path1 = NULL ;
28552867 const char * path2 = NULL ;
28562868 u64 ino1 = 0 , ino2 = 0 ;
@@ -2862,6 +2874,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
28622874 void * p , * end ;
28632875 int ret ;
28642876 bool legacy = !(session -> s_con .peer_features & CEPH_FEATURE_FS_BTIME );
2877+ bool old_version = !test_bit (CEPHFS_FEATURE_32BITS_RETRY_FWD ,
2878+ & session -> s_features );
28652879
28662880 ret = set_request_path_attr (req -> r_inode , req -> r_dentry ,
28672881 req -> r_parent , req -> r_path1 , req -> r_ino1 .ino ,
@@ -2893,7 +2907,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
28932907 goto out_free2 ;
28942908 }
28952909
2896- len = legacy ? sizeof (* head ) : sizeof (struct ceph_mds_request_head );
2910+ /*
2911+ * Old cephs that do not support the 32bit retry/fwd feature
2912+ * copy the raw memory directly when decoding the requests,
2913+ * while new cephs decode the head depending on the version
2914+ * member, so we need to make sure the head is compatible
2915+ * with both of them.
2916+ */
2917+ if (legacy )
2918+ len = sizeof (struct ceph_mds_request_head_legacy );
2919+ else if (old_version )
2920+ len = sizeof (struct ceph_mds_request_head_old );
2921+ else
2922+ len = sizeof (struct ceph_mds_request_head );
28972923
28982924 /* filepaths */
28992925 len += 2 * (1 + sizeof (u32 ) + sizeof (u64 ));
@@ -2938,33 +2964,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
29382964
29392965 msg -> hdr .tid = cpu_to_le64 (req -> r_tid );
29402966
2967+ lhead = find_legacy_request_head (msg -> front .iov_base ,
2968+ session -> s_con .peer_features );
2969+
29412970 /*
2942- * The old ceph_mds_request_head didn't contain a version field, and
2971+ * The ceph_mds_request_head_legacy didn't contain a version field, and
29432972 * one was added when we moved the message version from 3->4.
29442973 */
29452974 if (legacy ) {
29462975 msg -> hdr .version = cpu_to_le16 (3 );
2947- head = msg -> front .iov_base ;
2948- p = msg -> front .iov_base + sizeof (* head );
2976+ p = msg -> front .iov_base + sizeof (* lhead );
2977+ } else if (old_version ) {
2978+ struct ceph_mds_request_head_old * ohead = msg -> front .iov_base ;
2979+
2980+ msg -> hdr .version = cpu_to_le16 (4 );
2981+ ohead -> version = cpu_to_le16 (1 );
2982+ p = msg -> front .iov_base + sizeof (* ohead );
29492983 } else {
2950- struct ceph_mds_request_head * new_head = msg -> front .iov_base ;
2984+ struct ceph_mds_request_head * nhead = msg -> front .iov_base ;
29512985
29522986 msg -> hdr .version = cpu_to_le16 (6 );
2953- new_head -> version = cpu_to_le16 (CEPH_MDS_REQUEST_HEAD_VERSION );
2954- head = (struct ceph_mds_request_head_old * )& new_head -> oldest_client_tid ;
2955- p = msg -> front .iov_base + sizeof (* new_head );
2987+ nhead -> version = cpu_to_le16 (CEPH_MDS_REQUEST_HEAD_VERSION );
2988+ p = msg -> front .iov_base + sizeof (* nhead );
29562989 }
29572990
29582991 end = msg -> front .iov_base + msg -> front .iov_len ;
29592992
2960- head -> mdsmap_epoch = cpu_to_le32 (mdsc -> mdsmap -> m_epoch );
2961- head -> op = cpu_to_le32 (req -> r_op );
2962- head -> caller_uid = cpu_to_le32 (from_kuid (& init_user_ns ,
2963- req -> r_cred -> fsuid ));
2964- head -> caller_gid = cpu_to_le32 (from_kgid (& init_user_ns ,
2965- req -> r_cred -> fsgid ));
2966- head -> ino = cpu_to_le64 (req -> r_deleg_ino );
2967- head -> args = req -> r_args ;
2993+ lhead -> mdsmap_epoch = cpu_to_le32 (mdsc -> mdsmap -> m_epoch );
2994+ lhead -> op = cpu_to_le32 (req -> r_op );
2995+ lhead -> caller_uid = cpu_to_le32 (from_kuid (& init_user_ns ,
2996+ req -> r_cred -> fsuid ));
2997+ lhead -> caller_gid = cpu_to_le32 (from_kgid (& init_user_ns ,
2998+ req -> r_cred -> fsgid ));
2999+ lhead -> ino = cpu_to_le64 (req -> r_deleg_ino );
3000+ lhead -> args = req -> r_args ;
29683001
29693002 ceph_encode_filepath (& p , end , ino1 , path1 );
29703003 ceph_encode_filepath (& p , end , ino2 , path2 );
@@ -3006,7 +3039,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
30063039 p = msg -> front .iov_base + req -> r_request_release_offset ;
30073040 }
30083041
3009- head -> num_releases = cpu_to_le16 (releases );
3042+ lhead -> num_releases = cpu_to_le16 (releases );
30103043
30113044 encode_mclientrequest_tail (& p , req );
30123045
@@ -3057,18 +3090,6 @@ static void complete_request(struct ceph_mds_client *mdsc,
30573090 complete_all (& req -> r_completion );
30583091}
30593092
3060- static struct ceph_mds_request_head_old *
3061- find_old_request_head (void * p , u64 features )
3062- {
3063- bool legacy = !(features & CEPH_FEATURE_FS_BTIME );
3064- struct ceph_mds_request_head * new_head ;
3065-
3066- if (legacy )
3067- return (struct ceph_mds_request_head_old * )p ;
3068- new_head = (struct ceph_mds_request_head * )p ;
3069- return (struct ceph_mds_request_head_old * )& new_head -> oldest_client_tid ;
3070- }
3071-
30723093/*
30733094 * called under mdsc->mutex
30743095 */
@@ -3078,29 +3099,28 @@ static int __prepare_send_request(struct ceph_mds_session *session,
30783099{
30793100 int mds = session -> s_mds ;
30803101 struct ceph_mds_client * mdsc = session -> s_mdsc ;
3081- struct ceph_mds_request_head_old * rhead ;
3102+ struct ceph_mds_request_head_legacy * lhead ;
3103+ struct ceph_mds_request_head * nhead ;
30823104 struct ceph_msg * msg ;
3083- int flags = 0 , max_retry ;
3105+ int flags = 0 , old_max_retry ;
3106+ bool old_version = !test_bit (CEPHFS_FEATURE_32BITS_RETRY_FWD ,
3107+ & session -> s_features );
30843108
30853109 /*
3086- * The type of 'r_attempts' in kernel 'ceph_mds_request'
3087- * is 'int', while in 'ceph_mds_request_head' the type of
3088- * 'num_retry' is '__u8'. So in case the request retries
3089- * exceeding 256 times, the MDS will receive a incorrect
3090- * retry seq.
3091- *
3092- * In this case it's ususally a bug in MDS and continue
3093- * retrying the request makes no sense.
3094- *
3095- * In future this could be fixed in ceph code, so avoid
3096- * using the hardcode here.
3110+ * Avoid infinite retrying after overflow. The client keeps
3111+ * increasing the retry count, and if the MDS is an old version
3112+ * its num_retry would overflow, so retry at most 256 times.
30973113 */
3098- max_retry = sizeof_field (struct ceph_mds_request_head , num_retry );
3099- max_retry = 1 << (max_retry * BITS_PER_BYTE );
3100- if (req -> r_attempts >= max_retry ) {
3101- pr_warn_ratelimited ("%s request tid %llu seq overflow\n" ,
3102- __func__ , req -> r_tid );
3103- return - EMULTIHOP ;
3114+ if (req -> r_attempts ) {
3115+ old_max_retry = sizeof_field (struct ceph_mds_request_head_old ,
3116+ num_retry );
3117+ old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE );
3118+ if ((old_version && req -> r_attempts >= old_max_retry ) ||
3119+ ((uint32_t )req -> r_attempts >= U32_MAX )) {
3120+ pr_warn_ratelimited ("%s request tid %llu seq overflow\n" ,
3121+ __func__ , req -> r_tid );
3122+ return - EMULTIHOP ;
3123+ }
31043124 }
31053125
31063126 req -> r_attempts ++ ;
@@ -3126,20 +3146,24 @@ static int __prepare_send_request(struct ceph_mds_session *session,
31263146 * d_move mangles the src name.
31273147 */
31283148 msg = req -> r_request ;
3129- rhead = find_old_request_head (msg -> front .iov_base ,
3130- session -> s_con .peer_features );
3149+ lhead = find_legacy_request_head (msg -> front .iov_base ,
3150+ session -> s_con .peer_features );
31313151
3132- flags = le32_to_cpu (rhead -> flags );
3152+ flags = le32_to_cpu (lhead -> flags );
31333153 flags |= CEPH_MDS_FLAG_REPLAY ;
3134- rhead -> flags = cpu_to_le32 (flags );
3154+ lhead -> flags = cpu_to_le32 (flags );
31353155
31363156 if (req -> r_target_inode )
3137- rhead -> ino = cpu_to_le64 (ceph_ino (req -> r_target_inode ));
3157+ lhead -> ino = cpu_to_le64 (ceph_ino (req -> r_target_inode ));
31383158
3139- rhead -> num_retry = req -> r_attempts - 1 ;
3159+ lhead -> num_retry = req -> r_attempts - 1 ;
3160+ if (!old_version ) {
3161+ nhead = (struct ceph_mds_request_head * )msg -> front .iov_base ;
3162+ nhead -> ext_num_retry = cpu_to_le32 (req -> r_attempts - 1 );
3163+ }
31403164
31413165 /* remove cap/dentry releases from message */
3142- rhead -> num_releases = 0 ;
3166+ lhead -> num_releases = 0 ;
31433167
31443168 p = msg -> front .iov_base + req -> r_request_release_offset ;
31453169 encode_mclientrequest_tail (& p , req );
@@ -3160,18 +3184,23 @@ static int __prepare_send_request(struct ceph_mds_session *session,
31603184 }
31613185 req -> r_request = msg ;
31623186
3163- rhead = find_old_request_head (msg -> front .iov_base ,
3164- session -> s_con .peer_features );
3165- rhead -> oldest_client_tid = cpu_to_le64 (__get_oldest_tid (mdsc ));
3187+ lhead = find_legacy_request_head (msg -> front .iov_base ,
3188+ session -> s_con .peer_features );
3189+ lhead -> oldest_client_tid = cpu_to_le64 (__get_oldest_tid (mdsc ));
31663190 if (test_bit (CEPH_MDS_R_GOT_UNSAFE , & req -> r_req_flags ))
31673191 flags |= CEPH_MDS_FLAG_REPLAY ;
31683192 if (test_bit (CEPH_MDS_R_ASYNC , & req -> r_req_flags ))
31693193 flags |= CEPH_MDS_FLAG_ASYNC ;
31703194 if (req -> r_parent )
31713195 flags |= CEPH_MDS_FLAG_WANT_DENTRY ;
3172- rhead -> flags = cpu_to_le32 (flags );
3173- rhead -> num_fwd = req -> r_num_fwd ;
3174- rhead -> num_retry = req -> r_attempts - 1 ;
3196+ lhead -> flags = cpu_to_le32 (flags );
3197+ lhead -> num_fwd = req -> r_num_fwd ;
3198+ lhead -> num_retry = req -> r_attempts - 1 ;
3199+ if (!old_version ) {
3200+ nhead = (struct ceph_mds_request_head * )msg -> front .iov_base ;
3201+ nhead -> ext_num_fwd = cpu_to_le32 (req -> r_num_fwd );
3202+ nhead -> ext_num_retry = cpu_to_le32 (req -> r_attempts - 1 );
3203+ }
31753204
31763205 dout (" r_parent = %p\n" , req -> r_parent );
31773206 return 0 ;
@@ -3830,33 +3859,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
38303859 if (test_bit (CEPH_MDS_R_ABORTED , & req -> r_req_flags )) {
38313860 dout ("forward tid %llu aborted, unregistering\n" , tid );
38323861 __unregister_request (mdsc , req );
3833- } else if (fwd_seq <= req -> r_num_fwd ) {
3862+ } else if (fwd_seq <= req -> r_num_fwd || ( uint32_t ) fwd_seq >= U32_MAX ) {
38343863 /*
3835- * The type of 'num_fwd' in ceph 'MClientRequestForward'
3836- * is 'int32_t', while in 'ceph_mds_request_head' the
3837- * type is '__u8'. So in case the request bounces between
3838- * MDSes exceeding 256 times, the client will get stuck.
3839- *
3840- * In this case it's ususally a bug in MDS and continue
3841- * bouncing the request makes no sense.
3864+ * Avoid infinite retrying after overflow.
38423865 *
3843- * In future this could be fixed in ceph code, so avoid
3844- * using the hardcode here.
3866+ * The MDS increases the fwd count, so if on the client side
3867+ * the received num_fwd is not greater than the one saved in
3868+ * the request, the MDS is an old version whose 8-bit counter
3869+ * has overflowed.
38453870 */
3846- int max = sizeof_field (struct ceph_mds_request_head , num_fwd );
3847- max = 1 << (max * BITS_PER_BYTE );
3848- if (req -> r_num_fwd >= max ) {
3849- mutex_lock (& req -> r_fill_mutex );
3850- req -> r_err = - EMULTIHOP ;
3851- set_bit (CEPH_MDS_R_ABORTED , & req -> r_req_flags );
3852- mutex_unlock (& req -> r_fill_mutex );
3853- aborted = true;
3854- pr_warn_ratelimited ("forward tid %llu seq overflow\n" ,
3855- tid );
3856- } else {
3857- dout ("forward tid %llu to mds%d - old seq %d <= %d\n" ,
3858- tid , next_mds , req -> r_num_fwd , fwd_seq );
3859- }
3871+ mutex_lock (& req -> r_fill_mutex );
3872+ req -> r_err = - EMULTIHOP ;
3873+ set_bit (CEPH_MDS_R_ABORTED , & req -> r_req_flags );
3874+ mutex_unlock (& req -> r_fill_mutex );
3875+ aborted = true;
3876+ pr_warn_ratelimited ("forward tid %llu seq overflow\n" , tid );
38603877 } else {
38613878 /* resend. forward race not possible; mds would drop */
38623879 dout ("forward tid %llu to mds%d (we resend)\n" , tid , next_mds );
0 commit comments