Skip to content

Commit 6c77997

Browse files
yanzhai-cf authored and borkmann committed
selftests/bpf: Add lwt_xmit tests for BPF_REROUTE
There is no lwt test case for BPF_REROUTE yet. Add test cases for both normal and abnormal situations. The abnormal situation is set up with an fq qdisc on the reroute target device. Without proper fixes, overflowing this qdisc's queue limit (to trigger a drop) would panic the kernel. Signed-off-by: Yan Zhai <yan@cloudflare.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/bpf/62c8ddc1e924269dcf80d2e8af1a1e632cee0b3a.1692326837.git.yan@cloudflare.com
1 parent 43a7c3e commit 6c77997

3 files changed

Lines changed: 299 additions & 0 deletions

File tree

tools/testing/selftests/bpf/config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ CONFIG_NET_IPGRE=y
6060
CONFIG_NET_IPGRE_DEMUX=y
6161
CONFIG_NET_IPIP=y
6262
CONFIG_NET_MPLS_GSO=y
63+
CONFIG_NET_SCH_FQ=y
6364
CONFIG_NET_SCH_INGRESS=y
6465
CONFIG_NET_SCHED=y
6566
CONFIG_NETDEVSIM=y
Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2+
3+
/*
4+
* Test suite of lwt BPF programs that reroutes packets
5+
* The file tests focus not only if these programs work as expected normally,
6+
* but also if they can handle abnormal situations gracefully. This test
7+
* suite currently only covers lwt_xmit hook. lwt_in tests have not been
8+
* implemented.
9+
*
10+
* WARNING
11+
* -------
12+
* This test suite can crash the kernel, thus should be run in a VM.
13+
*
14+
* Setup:
15+
* ---------
16+
* all tests are performed in a single netns. A lwt encap route is setup for
17+
* each subtest:
18+
*
19+
* ip route add 10.0.0.0/24 encap bpf xmit <obj> sec "<section_N>" dev link_err
20+
*
21+
* Here <obj> is statically defined to test_lwt_reroute.bpf.o, and it contains
22+
* a single test program entry. This program sets packet mark by last byte of
23+
* the IPv4 daddr. For example, a packet going to 1.2.3.4 will receive a skb
24+
* mark 4. A packet will only be marked once, and IP x.x.x.0 will be skipped
25+
* to avoid route loop. We didn't use generated BPF skeleton since the
26+
* attachment of lwt programs is not supported by libbpf yet.
27+
*
28+
* The test program will bring up a tun device, and sets up the following
29+
* routes:
30+
*
31+
* ip rule add pref 100 from all fwmark <tun_index> lookup 100
32+
* ip route add table 100 default dev tun0
33+
*
34+
* For normal testing, a ping command is running in the test netns:
35+
*
36+
* ping 10.0.0.<tun_index> -c 1 -W 1 -s 100
37+
*
38+
* For abnormal testing, fq is used as the qdisc of the tun device. Then a UDP
39+
* socket will try to overflow the fq queue and trigger qdisc drop error.
40+
*
41+
* Scenarios:
42+
* --------------------------------
43+
* 1. Reroute to a running tun device
44+
* 2. Reroute to a device where qdisc drop
45+
*
46+
* For case 1, ping packets should be received by the tun device.
47+
*
48+
* For case 2, force UDP packets to overflow fq limit. As long as kernel
49+
* is not crashed, it is considered successful.
50+
*/
51+
#include "lwt_helpers.h"
52+
#include "network_helpers.h"
53+
#include <linux/net_tstamp.h>
54+
55+
#define BPF_OBJECT "test_lwt_reroute.bpf.o"
56+
#define LOCAL_SRC "10.0.0.1"
57+
#define TEST_CIDR "10.0.0.0/24"
58+
#define XMIT_HOOK "xmit"
59+
#define XMIT_SECTION "lwt_xmit"
60+
#define NSEC_PER_SEC 1000000000ULL
61+
62+
/* send a ping to be rerouted to the target device */
63+
static void ping_once(const char *ip)
64+
{
65+
/* We won't get a reply. Don't fail here */
66+
SYS_NOFAIL("ping %s -c1 -W1 -s %d >/dev/null 2>&1",
67+
ip, ICMP_PAYLOAD_SIZE);
68+
}
69+
70+
/* Send snd_target UDP packets to overflow the fq queue and trigger qdisc drop
71+
* error. This is done via TX tstamp to force buffering delayed packets.
72+
*/
73+
static int overflow_fq(int snd_target, const char *target_ip)
74+
{
75+
struct sockaddr_in addr = {
76+
.sin_family = AF_INET,
77+
.sin_port = htons(1234),
78+
};
79+
80+
char data_buf[8]; /* only #pkts matter, so use a random small buffer */
81+
char control_buf[CMSG_SPACE(sizeof(uint64_t))];
82+
struct iovec iov = {
83+
.iov_base = data_buf,
84+
.iov_len = sizeof(data_buf),
85+
};
86+
int err = -1;
87+
int s = -1;
88+
struct sock_txtime txtime_on = {
89+
.clockid = CLOCK_MONOTONIC,
90+
.flags = 0,
91+
};
92+
struct msghdr msg = {
93+
.msg_name = &addr,
94+
.msg_namelen = sizeof(addr),
95+
.msg_control = control_buf,
96+
.msg_controllen = sizeof(control_buf),
97+
.msg_iovlen = 1,
98+
.msg_iov = &iov,
99+
};
100+
struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
101+
102+
memset(data_buf, 0, sizeof(data_buf));
103+
104+
s = socket(AF_INET, SOCK_DGRAM, 0);
105+
if (!ASSERT_GE(s, 0, "socket"))
106+
goto out;
107+
108+
err = setsockopt(s, SOL_SOCKET, SO_TXTIME, &txtime_on, sizeof(txtime_on));
109+
if (!ASSERT_OK(err, "setsockopt(SO_TXTIME)"))
110+
goto out;
111+
112+
err = inet_pton(AF_INET, target_ip, &addr.sin_addr);
113+
if (!ASSERT_EQ(err, 1, "inet_pton"))
114+
goto out;
115+
116+
while (snd_target > 0) {
117+
struct timespec now;
118+
119+
memset(control_buf, 0, sizeof(control_buf));
120+
cmsg->cmsg_type = SCM_TXTIME;
121+
cmsg->cmsg_level = SOL_SOCKET;
122+
cmsg->cmsg_len = CMSG_LEN(sizeof(uint64_t));
123+
124+
err = clock_gettime(CLOCK_MONOTONIC, &now);
125+
if (!ASSERT_OK(err, "clock_gettime(CLOCK_MONOTONIC)")) {
126+
err = -1;
127+
goto out;
128+
}
129+
130+
*(uint64_t *)CMSG_DATA(cmsg) = (now.tv_nsec + 1) * NSEC_PER_SEC +
131+
now.tv_nsec;
132+
133+
/* we will intentionally send more than fq limit, so ignore
134+
* the error here.
135+
*/
136+
sendmsg(s, &msg, MSG_NOSIGNAL);
137+
snd_target--;
138+
}
139+
140+
/* no kernel crash so far is considered success */
141+
err = 0;
142+
143+
out:
144+
if (s >= 0)
145+
close(s);
146+
147+
return err;
148+
}
149+
150+
static int setup(const char *tun_dev)
151+
{
152+
int target_index = -1;
153+
int tap_fd = -1;
154+
155+
tap_fd = open_tuntap(tun_dev, false);
156+
if (!ASSERT_GE(tap_fd, 0, "open_tun"))
157+
return -1;
158+
159+
target_index = if_nametoindex(tun_dev);
160+
if (!ASSERT_GE(target_index, 0, "if_nametoindex"))
161+
return -1;
162+
163+
SYS(fail, "ip link add link_err type dummy");
164+
SYS(fail, "ip link set lo up");
165+
SYS(fail, "ip addr add dev lo " LOCAL_SRC "/32");
166+
SYS(fail, "ip link set link_err up");
167+
SYS(fail, "ip link set %s up", tun_dev);
168+
169+
SYS(fail, "ip route add %s dev link_err encap bpf xmit obj %s sec lwt_xmit",
170+
TEST_CIDR, BPF_OBJECT);
171+
172+
SYS(fail, "ip rule add pref 100 from all fwmark %d lookup 100",
173+
target_index);
174+
SYS(fail, "ip route add t 100 default dev %s", tun_dev);
175+
176+
return tap_fd;
177+
178+
fail:
179+
if (tap_fd >= 0)
180+
close(tap_fd);
181+
return -1;
182+
}
183+
184+
static void test_lwt_reroute_normal_xmit(void)
185+
{
186+
const char *tun_dev = "tun0";
187+
int tun_fd = -1;
188+
int ifindex = -1;
189+
char ip[256];
190+
struct timeval timeo = {
191+
.tv_sec = 0,
192+
.tv_usec = 250000,
193+
};
194+
195+
tun_fd = setup(tun_dev);
196+
if (!ASSERT_GE(tun_fd, 0, "setup_reroute"))
197+
return;
198+
199+
ifindex = if_nametoindex(tun_dev);
200+
if (!ASSERT_GE(ifindex, 0, "if_nametoindex"))
201+
return;
202+
203+
snprintf(ip, 256, "10.0.0.%d", ifindex);
204+
205+
/* ping packets should be received by the tun device */
206+
ping_once(ip);
207+
208+
if (!ASSERT_EQ(wait_for_packet(tun_fd, __expect_icmp_ipv4, &timeo), 1,
209+
"wait_for_packet"))
210+
log_err("%s xmit", __func__);
211+
}
212+
213+
/*
214+
* Test the failure case when the skb is dropped at the qdisc. This is a
215+
* regression prevention at the xmit hook only.
216+
*/
217+
static void test_lwt_reroute_qdisc_dropped(void)
218+
{
219+
const char *tun_dev = "tun0";
220+
int tun_fd = -1;
221+
int ifindex = -1;
222+
char ip[256];
223+
224+
tun_fd = setup(tun_dev);
225+
if (!ASSERT_GE(tun_fd, 0, "setup_reroute"))
226+
goto fail;
227+
228+
SYS(fail, "tc qdisc replace dev %s root fq limit 5 flow_limit 5", tun_dev);
229+
230+
ifindex = if_nametoindex(tun_dev);
231+
if (!ASSERT_GE(ifindex, 0, "if_nametoindex"))
232+
return;
233+
234+
snprintf(ip, 256, "10.0.0.%d", ifindex);
235+
ASSERT_EQ(overflow_fq(10, ip), 0, "overflow_fq");
236+
237+
fail:
238+
if (tun_fd >= 0)
239+
close(tun_fd);
240+
}
241+
242+
static void *test_lwt_reroute_run(void *arg)
243+
{
244+
netns_delete();
245+
RUN_TEST(lwt_reroute_normal_xmit);
246+
RUN_TEST(lwt_reroute_qdisc_dropped);
247+
return NULL;
248+
}
249+
250+
void test_lwt_reroute(void)
251+
{
252+
pthread_t test_thread;
253+
int err;
254+
255+
/* Run the tests in their own thread to isolate the namespace changes
256+
* so they do not affect the environment of other tests.
257+
* (specifically needed because of unshare(CLONE_NEWNS) in open_netns())
258+
*/
259+
err = pthread_create(&test_thread, NULL, &test_lwt_reroute_run, NULL);
260+
if (ASSERT_OK(err, "pthread_create"))
261+
ASSERT_OK(pthread_join(test_thread, NULL), "pthread_join");
262+
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
#include <inttypes.h>
3+
#include <linux/bpf.h>
4+
#include <bpf/bpf_endian.h>
5+
#include <bpf/bpf_helpers.h>
6+
#include <linux/if_ether.h>
7+
#include <linux/ip.h>
8+
9+
/* This function extracts the last byte of the daddr, and uses it
10+
* as output dev index.
11+
*/
12+
SEC("lwt_xmit")
13+
int test_lwt_reroute(struct __sk_buff *skb)
14+
{
15+
struct iphdr *iph = NULL;
16+
void *start = (void *)(long)skb->data;
17+
void *end = (void *)(long)skb->data_end;
18+
19+
/* set mark at most once */
20+
if (skb->mark != 0)
21+
return BPF_OK;
22+
23+
if (start + sizeof(*iph) > end)
24+
return BPF_DROP;
25+
26+
iph = (struct iphdr *)start;
27+
skb->mark = bpf_ntohl(iph->daddr) & 0xff;
28+
29+
/* do not reroute x.x.x.0 packets */
30+
if (skb->mark == 0)
31+
return BPF_OK;
32+
33+
return BPF_LWT_REROUTE;
34+
}
35+
36+
char _license[] SEC("license") = "GPL";

0 commit comments

Comments
 (0)