Skip to content

Commit 24cf78c

Browse files
dtatulea and kuba-moo
authored and committed
net/mlx5e: SHAMPO, Switch to header memcpy
Previously the HW-GRO code was using a separate page_pool for the header buffer. The pages of the header buffer were replenished via UMR. This mechanism has some drawbacks: - Reference counting on the page_pool page frags is not cheap. - UMRs have HW overhead for updating and also for access. Especially for the KLM type which was previously used. - UMR code for headers is complex. This patch switches to using a static memory area (static MTT MKEY) for the header buffer and does a header memcpy. This happens only once per GRO session. The SKB is allocated from the per-cpu NAPI SKB cache. Performance numbers for x86: +---------------------------------------------------------+ | Test | Baseline | Header Copy | Change | |---------------------+------------+-------------+--------| | iperf3 oncpu | 59.5 Gbps | 64.00 Gbps | 7 % | | iperf3 offcpu | 102.5 Gbps | 104.20 Gbps | 2 % | | kperf oncpu | 115.0 Gbps | 130.00 Gbps | 12 % | | XDP_DROP (skb mode) | 3.9 Mpps | 3.9 Mpps | 0 % | +---------------------------------------------------------+ Notes on test: - System: Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz - oncpu: NAPI and application running on same CPU - offcpu: NAPI and application running on different CPUs - MTU: 1500 - iperf3 tests are single stream, 60s with IPv6 (for slightly larger headers) - kperf version [1] [1] git://git.kernel.dk/kperf.git Suggested-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com> Signed-off-by: Tariq Toukan <tariqt@nvidia.com> Reviewed-by: Jacob Keller <jacob.e.keller@intel.com> Link: https://patch.msgid.link/20260204200345.1724098-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 215b530 commit 24cf78c

5 files changed

Lines changed: 188 additions & 484 deletions

File tree

drivers/net/ethernet/mellanox/mlx5/core/en.h

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,10 @@ struct page_pool;
8282

8383
#define MLX5E_RX_MAX_HEAD (256)
8484
#define MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE (8)
85-
#define MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE (9)
86-
#define MLX5E_SHAMPO_WQ_HEADER_PER_PAGE (PAGE_SIZE >> MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE)
87-
#define MLX5E_SHAMPO_LOG_WQ_HEADER_PER_PAGE (PAGE_SHIFT - MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE)
85+
#define MLX5E_SHAMPO_WQ_HEADER_PER_PAGE \
86+
(PAGE_SIZE >> MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE)
87+
#define MLX5E_SHAMPO_LOG_WQ_HEADER_PER_PAGE \
88+
(PAGE_SHIFT - MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE)
8889
#define MLX5E_SHAMPO_WQ_BASE_HEAD_ENTRY_SIZE_SHIFT (6)
8990
#define MLX5E_SHAMPO_WQ_RESRV_SIZE_BASE_SHIFT (12)
9091
#define MLX5E_SHAMPO_WQ_LOG_RESRV_SIZE (16)
@@ -638,16 +639,11 @@ struct mlx5e_dma_info {
638639
};
639640

640641
struct mlx5e_shampo_hd {
641-
struct mlx5e_frag_page *pages;
642642
u32 hd_per_wq;
643-
u32 hd_per_page;
644-
u16 hd_per_wqe;
645-
u8 log_hd_per_page;
646-
u8 log_hd_entry_size;
647-
unsigned long *bitmap;
648-
u16 pi;
649-
u16 ci;
650-
__be32 mkey_be;
643+
u32 hd_buf_size;
644+
u32 mkey;
645+
u32 nentries;
646+
DECLARE_FLEX_ARRAY(struct mlx5e_dma_info, hd_buf_pages);
651647
};
652648

653649
struct mlx5e_hw_gro_data {

drivers/net/ethernet/mellanox/mlx5/core/en/params.c

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,26 +1068,6 @@ u32 mlx5e_shampo_hd_per_wq(struct mlx5_core_dev *mdev,
10681068
return hd_per_wq;
10691069
}
10701070

1071-
static u32 mlx5e_shampo_icosq_sz(struct mlx5_core_dev *mdev,
1072-
struct mlx5e_params *params,
1073-
struct mlx5e_rq_param *rq_param)
1074-
{
1075-
int max_num_of_umr_per_wqe, max_hd_per_wqe, max_ksm_per_umr, rest;
1076-
void *wqc = MLX5_ADDR_OF(rqc, rq_param->rqc, wq);
1077-
int wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz));
1078-
u32 wqebbs;
1079-
1080-
max_ksm_per_umr = MLX5E_MAX_KSM_PER_WQE(mdev);
1081-
max_hd_per_wqe = mlx5e_shampo_hd_per_wqe(mdev, params, rq_param);
1082-
max_num_of_umr_per_wqe = max_hd_per_wqe / max_ksm_per_umr;
1083-
rest = max_hd_per_wqe % max_ksm_per_umr;
1084-
wqebbs = MLX5E_KSM_UMR_WQEBBS(max_ksm_per_umr) * max_num_of_umr_per_wqe;
1085-
if (rest)
1086-
wqebbs += MLX5E_KSM_UMR_WQEBBS(rest);
1087-
wqebbs *= wq_size;
1088-
return wqebbs;
1089-
}
1090-
10911071
#define MLX5E_LRO_TIMEOUT_ARR_SIZE 4
10921072

10931073
u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout)
@@ -1173,9 +1153,6 @@ static u8 mlx5e_build_icosq_log_wq_sz(struct mlx5_core_dev *mdev,
11731153
wqebbs += max_xsk_wqebbs;
11741154
}
11751155

1176-
if (params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO)
1177-
wqebbs += mlx5e_shampo_icosq_sz(mdev, params, rqp);
1178-
11791156
/* UMR WQEs don't cross the page boundary, they are padded with NOPs.
11801157
* This padding is always smaller than the max WQE size. That gives us
11811158
* at least (PAGE_SIZE - (max WQE size - MLX5_SEND_WQE_BB)) useful bytes

drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,6 @@ ktime_t mlx5e_cqe_ts_to_ns(cqe_ts_to_ns func, struct mlx5_clock *clock, u64 cqe_
6565
enum mlx5e_icosq_wqe_type {
6666
MLX5E_ICOSQ_WQE_NOP,
6767
MLX5E_ICOSQ_WQE_UMR_RX,
68-
MLX5E_ICOSQ_WQE_SHAMPO_HD_UMR,
6968
#ifdef CONFIG_MLX5_EN_TLS
7069
MLX5E_ICOSQ_WQE_UMR_TLS,
7170
MLX5E_ICOSQ_WQE_SET_PSV_TLS,

0 commit comments

Comments
 (0)