Skip to content

Commit ab427db

Browse files
LorenzoBianconi authored and ummakynes committed
netfilter: flowtable: Add IPIP rx sw acceleration
Introduce sw acceleration for rx path of IPIP tunnels relying on the netfilter flowtable infrastructure. Subsequent patches will add sw acceleration for IPIP tunnels tx path.
This series introduces basic infrastructure to accelerate other tunnel types (e.g. IP6IP6).
IPIP rx sw acceleration can be tested running the following scenario where the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP tunnel is used to access a remote site (using eth1 as the underlay device):

ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2)

$ip addr show
6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff
    inet 192.168.0.2/24 scope global eth0
       valid_lft forever preferred_lft forever
7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff
    inet 192.168.1.1/24 scope global eth1
       valid_lft forever preferred_lft forever
8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
    link/ipip 192.168.1.1 peer 192.168.1.2
    inet 192.168.100.1/24 scope global tun0
       valid_lft forever preferred_lft forever

$ip route show
default via 192.168.100.2 dev tun0
192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2
192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1
192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1

$nft list ruleset
table inet filter {
    flowtable ft {
        hook ingress priority filter
        devices = { eth0, eth1 }
    }

    chain forward {
        type filter hook forward priority filter; policy accept;
        meta l4proto { tcp, udp } flow add @ft
    }
}

Reproducing the scenario described above using veths I got the following results:
- TCP stream received from the IPIP tunnel:
  - net-next: (baseline) ~ 71Gbps
  - net-next + IPIP flowtable support: ~101Gbps

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
1 parent a0d98b6 commit ab427db

6 files changed

Lines changed: 153 additions & 13 deletions

File tree

include/linux/netdevice.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -877,6 +877,7 @@ enum net_device_path_type {
877877
DEV_PATH_PPPOE,
878878
DEV_PATH_DSA,
879879
DEV_PATH_MTK_WDMA,
880+
DEV_PATH_TUN,
880881
};
881882

882883
struct net_device_path {
@@ -888,6 +889,18 @@ struct net_device_path {
888889
__be16 proto;
889890
u8 h_dest[ETH_ALEN];
890891
} encap;
892+
struct {
893+
union {
894+
struct in_addr src_v4;
895+
struct in6_addr src_v6;
896+
};
897+
union {
898+
struct in_addr dst_v4;
899+
struct in6_addr dst_v6;
900+
};
901+
902+
u8 l3_proto;
903+
} tun;
891904
struct {
892905
enum {
893906
DEV_PATH_BR_VLAN_KEEP,

include/net/netfilter/nf_flow_table.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,19 @@ enum flow_offload_xmit_type {
107107

108108
#define NF_FLOW_TABLE_ENCAP_MAX 2
109109

110+
/*
 * Tunnel description carried by a flow: the two tunnel endpoint addresses
 * and the tunnel protocol. Each address is a union, so the same storage
 * holds either an IPv4 or an IPv6 address.
 */
struct flow_offload_tunnel {
111+
union { /* tunnel source address, IPv4 or IPv6 view of the same storage */
112+
struct in_addr src_v4;
113+
struct in6_addr src_v6;
114+
};
115+
union { /* tunnel destination address, IPv4 or IPv6 view of the same storage */
116+
struct in_addr dst_v4;
117+
struct in6_addr dst_v6;
118+
};
119+
120+
u8 l3_proto; /* tunnel protocol; the IPIP code in this change stores IPPROTO_IPIP */
121+
};
122+
110123
struct flow_offload_tuple {
111124
union {
112125
struct in_addr src_v4;
@@ -130,12 +143,15 @@ struct flow_offload_tuple {
130143
__be16 proto;
131144
} encap[NF_FLOW_TABLE_ENCAP_MAX];
132145

146+
struct flow_offload_tunnel tun;
147+
133148
/* All members above are keys for lookups, see flow_offload_hash(). */
134149
struct { } __hash;
135150

136151
u8 dir:2,
137152
xmit_type:3,
138153
encap_num:2,
154+
tun_num:2,
139155
in_vlan_ingress:2;
140156
u16 mtu;
141157
union {
@@ -206,7 +222,9 @@ struct nf_flow_route {
206222
u16 id;
207223
__be16 proto;
208224
} encap[NF_FLOW_TABLE_ENCAP_MAX];
225+
struct flow_offload_tunnel tun;
209226
u8 num_encaps:2,
227+
num_tuns:2,
210228
ingress_vlans:2;
211229
} in;
212230
struct {

net/ipv4/ipip.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,30 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
353353
return ip_tunnel_ctl(dev, p, cmd);
354354
}
355355

356+
/*
 * ndo_fill_forward_path callback for IPIP tunnel devices.
 *
 * Describes the tunnel hop in @path as a DEV_PATH_TUN entry (tunnel
 * source/destination address taken from the tunnel parameters, protocol
 * IPPROTO_IPIP, device = the tunnel device) and then moves @ctx->dev to
 * the device of the route towards the tunnel destination address.
 *
 * Returns 0 on success, or the error encoded in the route pointer when
 * the route lookup fails (in which case @path and @ctx are left untouched).
 */
static int ipip_fill_forward_path(struct net_device_path_ctx *ctx,
357+
struct net_device_path *path)
358+
{
359+
struct ip_tunnel *tunnel = netdev_priv(ctx->dev);
360+
const struct iphdr *tiph = &tunnel->parms.iph; /* configured outer IPv4 header of the tunnel */
361+
struct rtable *rt;
362+
363+
rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, /* route lookup keyed on the tunnel destination address */
364+
RT_SCOPE_UNIVERSE);
365+
if (IS_ERR(rt))
366+
return PTR_ERR(rt);
367+
368+
path->type = DEV_PATH_TUN;
369+
path->tun.src_v4.s_addr = tiph->saddr;
370+
path->tun.dst_v4.s_addr = tiph->daddr;
371+
path->tun.l3_proto = IPPROTO_IPIP;
372+
path->dev = ctx->dev; /* record the tunnel device before ctx->dev is replaced below */
373+
374+
ctx->dev = rt->dst.dev; /* continue the path walk from the route's output device */
375+
ip_rt_put(rt); /* done with the route obtained above */
376+
377+
return 0;
378+
}
379+
356380
static const struct net_device_ops ipip_netdev_ops = {
357381
.ndo_init = ipip_tunnel_init,
358382
.ndo_uninit = ip_tunnel_uninit,
@@ -362,6 +386,7 @@ static const struct net_device_ops ipip_netdev_ops = {
362386
.ndo_get_stats64 = dev_get_tstats64,
363387
.ndo_get_iflink = ip_tunnel_get_iflink,
364388
.ndo_tunnel_ctl = ipip_tunnel_ctl,
389+
.ndo_fill_forward_path = ipip_fill_forward_path,
365390
};
366391

367392
#define IPIP_FEATURES (NETIF_F_SG | \

net/netfilter/nf_flow_table_core.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,10 @@ static int flow_offload_fill_route(struct flow_offload *flow,
118118
flow_tuple->in_vlan_ingress |= BIT(j);
119119
j++;
120120
}
121+
122+
flow_tuple->tun = route->tuple[dir].in.tun;
121123
flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
124+
flow_tuple->tun_num = route->tuple[dir].in.num_tuns;
122125

123126
switch (route->tuple[dir].xmit_type) {
124127
case FLOW_OFFLOAD_XMIT_DIRECT:

net/netfilter/nf_flow_table_ip.c

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,11 @@ static bool ip_has_options(unsigned int thoff)
145145
static void nf_flow_tuple_encap(struct sk_buff *skb,
146146
struct flow_offload_tuple *tuple)
147147
{
148+
__be16 inner_proto = skb->protocol;
148149
struct vlan_ethhdr *veth;
149150
struct pppoe_hdr *phdr;
151+
struct iphdr *iph;
152+
u16 offset = 0;
150153
int i = 0;
151154

152155
if (skb_vlan_tag_present(skb)) {
@@ -159,13 +162,26 @@ static void nf_flow_tuple_encap(struct sk_buff *skb,
159162
veth = (struct vlan_ethhdr *)skb_mac_header(skb);
160163
tuple->encap[i].id = ntohs(veth->h_vlan_TCI);
161164
tuple->encap[i].proto = skb->protocol;
165+
inner_proto = veth->h_vlan_encapsulated_proto;
166+
offset += VLAN_HLEN;
162167
break;
163168
case htons(ETH_P_PPP_SES):
164169
phdr = (struct pppoe_hdr *)skb_network_header(skb);
165170
tuple->encap[i].id = ntohs(phdr->sid);
166171
tuple->encap[i].proto = skb->protocol;
172+
inner_proto = *((__be16 *)(phdr + 1));
173+
offset += PPPOE_SES_HLEN;
167174
break;
168175
}
176+
177+
if (inner_proto == htons(ETH_P_IP)) {
178+
iph = (struct iphdr *)(skb_network_header(skb) + offset);
179+
if (iph->protocol == IPPROTO_IPIP) {
180+
tuple->tun.dst_v4.s_addr = iph->daddr;
181+
tuple->tun.src_v4.s_addr = iph->saddr;
182+
tuple->tun.l3_proto = IPPROTO_IPIP;
183+
}
184+
}
169185
}
170186

171187
struct nf_flowtable_ctx {
@@ -277,11 +293,46 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
277293
return NF_STOLEN;
278294
}
279295

296+
/*
 * Inspect the IPv4 header found *psize bytes past the skb network header.
 *
 * Returns false when pskb_may_pull() fails for that header, when the
 * packet is a fragment, when ip_has_options() reports true for the header
 * length, or when the TTL is <= 1. Otherwise returns true; in addition,
 * if the header's protocol is IPPROTO_IPIP, *psize is advanced by the
 * header length so that it points past the outer (tunnel) IPv4 header.
 */
static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize)
297+
{
298+
struct iphdr *iph;
299+
u16 size;
300+
301+
if (!pskb_may_pull(skb, sizeof(*iph) + *psize))
302+
return false;
303+
304+
iph = (struct iphdr *)(skb_network_header(skb) + *psize);
305+
size = iph->ihl << 2; /* ihl counts 32-bit words; convert to bytes */
306+
307+
if (ip_is_fragment(iph) || unlikely(ip_has_options(size)))
308+
return false;
309+
310+
if (iph->ttl <= 1)
311+
return false;
312+
313+
if (iph->protocol == IPPROTO_IPIP) /* only an IPIP outer header moves the offset */
314+
*psize += size;
315+
316+
return true;
317+
}
318+
319+
/*
 * Strip an outer IPIP header from @skb.
 *
 * If the IPv4 header at the current network header carries protocol
 * IPPROTO_IPIP, pull that header (ihl * 4 bytes) off the skb and reset the
 * network header to the new data start. Any other protocol leaves the skb
 * unchanged.
 */
static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb)
320+
{
321+
struct iphdr *iph = (struct iphdr *)skb_network_header(skb);
322+
323+
if (iph->protocol != IPPROTO_IPIP)
324+
return;
325+
326+
skb_pull(skb, iph->ihl << 2); /* ihl counts 32-bit words; convert to bytes */
327+
skb_reset_network_header(skb);
328+
}
329+
280330
static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
281331
u32 *offset)
282332
{
333+
__be16 inner_proto = skb->protocol;
283334
struct vlan_ethhdr *veth;
284-
__be16 inner_proto;
335+
bool ret = false;
285336

286337
switch (skb->protocol) {
287338
case htons(ETH_P_8021Q):
@@ -291,19 +342,23 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
291342
veth = (struct vlan_ethhdr *)skb_mac_header(skb);
292343
if (veth->h_vlan_encapsulated_proto == proto) {
293344
*offset += VLAN_HLEN;
294-
return true;
345+
inner_proto = proto;
346+
ret = true;
295347
}
296348
break;
297349
case htons(ETH_P_PPP_SES):
298350
if (nf_flow_pppoe_proto(skb, &inner_proto) &&
299351
inner_proto == proto) {
300352
*offset += PPPOE_SES_HLEN;
301-
return true;
353+
ret = true;
302354
}
303355
break;
304356
}
305357

306-
return false;
358+
if (inner_proto == htons(ETH_P_IP))
359+
ret = nf_flow_ip4_tunnel_proto(skb, offset);
360+
361+
return ret;
307362
}
308363

309364
static void nf_flow_encap_pop(struct sk_buff *skb,
@@ -331,6 +386,9 @@ static void nf_flow_encap_pop(struct sk_buff *skb,
331386
break;
332387
}
333388
}
389+
390+
if (skb->protocol == htons(ETH_P_IP))
391+
nf_flow_ip4_tunnel_pop(skb);
334392
}
335393

336394
struct nf_flow_xmit {
@@ -356,8 +414,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx,
356414
{
357415
struct flow_offload_tuple tuple = {};
358416

359-
if (skb->protocol != htons(ETH_P_IP) &&
360-
!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset))
417+
if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset))
361418
return NULL;
362419

363420
if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0)

net/netfilter/nf_flow_table_path.c

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ struct nft_forward_info {
8080
__be16 proto;
8181
} encap[NF_FLOW_TABLE_ENCAP_MAX];
8282
u8 num_encaps;
83+
struct flow_offload_tunnel tun;
84+
u8 num_tuns;
8385
u8 ingress_vlans;
8486
u8 h_source[ETH_ALEN];
8587
u8 h_dest[ETH_ALEN];
@@ -102,6 +104,7 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
102104
case DEV_PATH_DSA:
103105
case DEV_PATH_VLAN:
104106
case DEV_PATH_PPPOE:
107+
case DEV_PATH_TUN:
105108
info->indev = path->dev;
106109
if (is_zero_ether_addr(info->h_source))
107110
memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
@@ -113,14 +116,27 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
113116
break;
114117
}
115118

116-
/* DEV_PATH_VLAN and DEV_PATH_PPPOE */
117-
if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
118-
info->indev = NULL;
119-
break;
119+
/* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */
120+
if (path->type == DEV_PATH_TUN) {
121+
if (info->num_tuns) {
122+
info->indev = NULL;
123+
break;
124+
}
125+
info->tun.src_v6 = path->tun.src_v6;
126+
info->tun.dst_v6 = path->tun.dst_v6;
127+
info->tun.l3_proto = path->tun.l3_proto;
128+
info->num_tuns++;
129+
} else {
130+
if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
131+
info->indev = NULL;
132+
break;
133+
}
134+
info->encap[info->num_encaps].id =
135+
path->encap.id;
136+
info->encap[info->num_encaps].proto =
137+
path->encap.proto;
138+
info->num_encaps++;
120139
}
121-
info->encap[info->num_encaps].id = path->encap.id;
122-
info->encap[info->num_encaps].proto = path->encap.proto;
123-
info->num_encaps++;
124140
if (path->type == DEV_PATH_PPPOE)
125141
memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
126142
break;
@@ -203,6 +219,14 @@ static void nft_dev_forward_path(struct nf_flow_route *route,
203219
route->tuple[!dir].in.encap[i].id = info.encap[i].id;
204220
route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
205221
}
222+
223+
if (info.num_tuns) {
224+
route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6;
225+
route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6;
226+
route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto;
227+
route->tuple[!dir].in.num_tuns = info.num_tuns;
228+
}
229+
206230
route->tuple[!dir].in.num_encaps = info.num_encaps;
207231
route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
208232
route->tuple[dir].out.ifindex = info.outdev->ifindex;

0 commit comments

Comments
 (0)