Skip to content

Commit 61dc651

Browse files
committed
Merge tag 'nf-next-23-06-26' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next
Pablo Neira Ayuso says: ==================== Netfilter/IPVS updates for net-next 1) Allow slightly larger IPVS connection table size from Kconfig for 64-bit arch, from Abhijeet Rastogi. 2) Since IPVS connection table might be larger than 2^20 after previous patch, allow to limit it depending on the available memory. Moreover, use kvmalloc. From Julian Anastasov. 3) Do not rebuild VLAN header in nft_payload when matching source and destination MAC address. 4) Remove nested rcu read lock side in ip_set_test(), from Florian Westphal. 5) Allow to update set size, also from Florian. 6) Improve NAT tuple selection when connection is closing, from Florian Westphal. 7) Support for resetting set element stateful expression, from Phil Sutter. 8) Use NLA_POLICY_MAX to narrow down maximum attribute value in nf_tables, from Florian Westphal. * tag 'nf-next-23-06-26' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next: netfilter: nf_tables: limit allowed range via nla_policy netfilter: nf_tables: Introduce NFT_MSG_GETSETELEM_RESET netfilter: snat: evict closing tcp entries on reply tuple collision netfilter: nf_tables: permit update of set size netfilter: ipset: remove rcu_read_lock_bh pair from ip_set_test netfilter: nft_payload: rebuild vlan header when needed ipvs: dynamically limit the connection hash table ipvs: increase ip_vs_conn_tab_bits range for 64BIT ==================== Link: https://lore.kernel.org/r/20230626064749.75525-1-pablo@netfilter.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2 parents 771ca3d + a412dbf commit 61dc651

23 files changed

Lines changed: 199 additions & 70 deletions

include/net/netfilter/nf_tables.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1611,6 +1611,7 @@ struct nft_trans_set {
16111611
u64 timeout;
16121612
bool update;
16131613
bool bound;
1614+
u32 size;
16141615
};
16151616

16161617
#define nft_trans_set(trans) \
@@ -1625,6 +1626,8 @@ struct nft_trans_set {
16251626
(((struct nft_trans_set *)trans->data)->timeout)
16261627
#define nft_trans_set_gc_int(trans) \
16271628
(((struct nft_trans_set *)trans->data)->gc_int)
1629+
#define nft_trans_set_size(trans) \
1630+
(((struct nft_trans_set *)trans->data)->size)
16281631

16291632
struct nft_trans_chain {
16301633
struct nft_chain *chain;

include/uapi/linux/netfilter/nf_tables.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ enum nft_verdicts {
105105
* @NFT_MSG_DESTROYSETELEM: destroy a set element (enum nft_set_elem_attributes)
106106
* @NFT_MSG_DESTROYOBJ: destroy a stateful object (enum nft_object_attributes)
107107
* @NFT_MSG_DESTROYFLOWTABLE: destroy flow table (enum nft_flowtable_attributes)
108+
* @NFT_MSG_GETSETELEM_RESET: get set elements and reset attached stateful expressions (enum nft_set_elem_attributes)
108109
*/
109110
enum nf_tables_msg_types {
110111
NFT_MSG_NEWTABLE,
@@ -140,6 +141,7 @@ enum nf_tables_msg_types {
140141
NFT_MSG_DESTROYSETELEM,
141142
NFT_MSG_DESTROYOBJ,
142143
NFT_MSG_DESTROYFLOWTABLE,
144+
NFT_MSG_GETSETELEM_RESET,
143145
NFT_MSG_MAX,
144146
};
145147

net/netfilter/ipset/ip_set_core.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -739,9 +739,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
739739
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
740740
return 0;
741741

742-
rcu_read_lock_bh();
743742
ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
744-
rcu_read_unlock_bh();
745743

746744
if (ret == -EAGAIN) {
747745
/* Type requests element to be completed */

net/netfilter/ipvs/Kconfig

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ config IP_VS_DEBUG
4444

4545
config IP_VS_TAB_BITS
4646
int "IPVS connection table size (the Nth power of 2)"
47-
range 8 20
47+
range 8 20 if !64BIT
48+
range 8 27 if 64BIT
4849
default 12
4950
help
5051
The IPVS connection hash table uses the chaining scheme to handle
@@ -54,24 +55,24 @@ config IP_VS_TAB_BITS
5455

5556
Note the table size must be power of 2. The table size will be the
5657
value of 2 to the power of your input number. The number to choose is
57-
from 8 to 20, the default number is 12, which means the table size
58-
is 4096. Don't input the number too small, otherwise you will lose
59-
performance on it. You can adapt the table size yourself, according
60-
to your virtual server application. It is good to set the table size
61-
not far less than the number of connections per second multiplying
62-
average lasting time of connection in the table. For example, your
63-
virtual server gets 200 connections per second, the connection lasts
64-
for 200 seconds in average in the connection table, the table size
65-
should be not far less than 200x200, it is good to set the table
66-
size 32768 (2**15).
58+
from 8 to 27 for 64BIT (20 otherwise); the default number is 12,
59+
which means the table size is 4096. Don't input the number too
60+
small, otherwise you will lose performance on it. You can adapt the
61+
table size yourself, according to your virtual server application.
62+
It is good to set the table size not far less than the number of
63+
connections per second multiplying average lasting time of
64+
connection in the table. For example, your virtual server gets 200
65+
connections per second, the connection lasts for 200 seconds in
66+
average in the connection table, the table size should be not far
67+
less than 200x200, it is good to set the table size 32768 (2**15).
6768

6869
Also note that each connection occupies 128 bytes effectively and
6970
each hash entry uses 8 bytes, so you can estimate how much memory is
7071
needed for your box.
7172

7273
You can override this number by setting the conn_tab_bits module parameter
73-
or by appending ip_vs.conn_tab_bits=? to the kernel command line
74-
if IP VS was compiled built-in.
74+
or by appending ip_vs.conn_tab_bits=? to the kernel command line if
75+
IP VS was compiled built-in.
7576

7677
comment "IPVS transport protocol load balancing support"
7778

net/netfilter/ipvs/ip_vs_conn.c

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
#include <linux/net.h>
2727
#include <linux/kernel.h>
2828
#include <linux/module.h>
29-
#include <linux/vmalloc.h>
3029
#include <linux/proc_fs.h> /* for proc_net_* */
3130
#include <linux/slab.h>
3231
#include <linux/seq_file.h>
@@ -1482,13 +1481,21 @@ void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
14821481
int __init ip_vs_conn_init(void)
14831482
{
14841483
size_t tab_array_size;
1484+
int max_avail;
1485+
#if BITS_PER_LONG > 32
1486+
int max = 27;
1487+
#else
1488+
int max = 20;
1489+
#endif
1490+
int min = 8;
14851491
int idx;
14861492

1487-
/* Compute size and mask */
1488-
if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) {
1489-
pr_info("conn_tab_bits not in [8, 20]. Using default value\n");
1490-
ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
1491-
}
1493+
max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT;
1494+
max_avail -= 2; /* ~4 in hash row */
1495+
max_avail -= 1; /* IPVS up to 1/2 of mem */
1496+
max_avail -= order_base_2(sizeof(struct ip_vs_conn));
1497+
max = clamp(max, min, max_avail);
1498+
ip_vs_conn_tab_bits = clamp_val(ip_vs_conn_tab_bits, min, max);
14921499
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
14931500
ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
14941501

@@ -1497,7 +1504,8 @@ int __init ip_vs_conn_init(void)
14971504
*/
14981505
tab_array_size = array_size(ip_vs_conn_tab_size,
14991506
sizeof(*ip_vs_conn_tab));
1500-
ip_vs_conn_tab = vmalloc(tab_array_size);
1507+
ip_vs_conn_tab = kvmalloc_array(ip_vs_conn_tab_size,
1508+
sizeof(*ip_vs_conn_tab), GFP_KERNEL);
15011509
if (!ip_vs_conn_tab)
15021510
return -ENOMEM;
15031511

@@ -1506,7 +1514,7 @@ int __init ip_vs_conn_init(void)
15061514
sizeof(struct ip_vs_conn), 0,
15071515
SLAB_HWCACHE_ALIGN, NULL);
15081516
if (!ip_vs_conn_cachep) {
1509-
vfree(ip_vs_conn_tab);
1517+
kvfree(ip_vs_conn_tab);
15101518
return -ENOMEM;
15111519
}
15121520

@@ -1534,5 +1542,5 @@ void ip_vs_conn_cleanup(void)
15341542
rcu_barrier();
15351543
/* Release the empty cache */
15361544
kmem_cache_destroy(ip_vs_conn_cachep);
1537-
vfree(ip_vs_conn_tab);
1545+
kvfree(ip_vs_conn_tab);
15381546
}

net/netfilter/nf_nat_core.c

Lines changed: 88 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727

2828
#include "nf_internals.h"
2929

30+
#define NF_NAT_MAX_ATTEMPTS 128
31+
#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4)
32+
3033
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
3134

3235
static DEFINE_MUTEX(nf_nat_proto_mutex);
@@ -197,6 +200,88 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
197200
return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
198201
}
199202

203+
static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
204+
{
205+
static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
206+
IPS_DYING;
207+
static const unsigned long flags_needed = IPS_SRC_NAT;
208+
enum tcp_conntrack old_state;
209+
210+
old_state = READ_ONCE(ct->proto.tcp.state);
211+
if (old_state < TCP_CONNTRACK_TIME_WAIT)
212+
return false;
213+
214+
if (flags & flags_refuse)
215+
return false;
216+
217+
return (flags & flags_needed) == flags_needed;
218+
}
219+
220+
/* reverse direction will send packets to new source, so
221+
* make sure such packets are invalid.
222+
*/
223+
static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
224+
{
225+
return (__s32)(new->proto.tcp.seen[0].td_end -
226+
old->proto.tcp.seen[0].td_end) > 0;
227+
}
228+
229+
static int
230+
nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
231+
const struct nf_conn *ignored_conntrack,
232+
unsigned int attempts_left)
233+
{
234+
static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
235+
struct nf_conntrack_tuple_hash *thash;
236+
const struct nf_conntrack_zone *zone;
237+
struct nf_conntrack_tuple reply;
238+
unsigned long flags;
239+
struct nf_conn *ct;
240+
bool taken = true;
241+
struct net *net;
242+
243+
nf_ct_invert_tuple(&reply, tuple);
244+
245+
if (attempts_left > NF_NAT_HARDER_THRESH ||
246+
tuple->dst.protonum != IPPROTO_TCP ||
247+
ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
248+
return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
249+
250+
/* Last few attempts to find a free tcp port. Destructive
251+
* action: evict the colliding entry if it is in timewait state and the
252+
* tcp sequence number has advanced past the one used by the
253+
* old entry.
254+
*/
255+
net = nf_ct_net(ignored_conntrack);
256+
zone = nf_ct_zone(ignored_conntrack);
257+
258+
thash = nf_conntrack_find_get(net, zone, &reply);
259+
if (!thash)
260+
return false;
261+
262+
ct = nf_ct_tuplehash_to_ctrack(thash);
263+
264+
if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
265+
goto out;
266+
267+
if (WARN_ON_ONCE(ct == ignored_conntrack))
268+
goto out;
269+
270+
flags = READ_ONCE(ct->status);
271+
if (!nf_nat_may_kill(ct, flags))
272+
goto out;
273+
274+
if (!nf_seq_has_advanced(ct, ignored_conntrack))
275+
goto out;
276+
277+
/* Even if we can evict do not reuse if entry is offloaded. */
278+
if (nf_ct_kill(ct))
279+
taken = flags & flags_offload;
280+
out:
281+
nf_ct_put(ct);
282+
return taken;
283+
}
284+
200285
static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
201286
const struct nf_nat_range2 *range)
202287
{
@@ -385,7 +470,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
385470
unsigned int range_size, min, max, i, attempts;
386471
__be16 *keyptr;
387472
u16 off;
388-
static const unsigned int max_attempts = 128;
389473

390474
switch (tuple->dst.protonum) {
391475
case IPPROTO_ICMP:
@@ -471,8 +555,8 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
471555
off = get_random_u16();
472556

473557
attempts = range_size;
474-
if (attempts > max_attempts)
475-
attempts = max_attempts;
558+
if (attempts > NF_NAT_MAX_ATTEMPTS)
559+
attempts = NF_NAT_MAX_ATTEMPTS;
476560

477561
/* We are in softirq; doing a search of the entire range risks
478562
* soft lockup when all tuples are already used.
@@ -483,7 +567,7 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
483567
another_round:
484568
for (i = 0; i < attempts; i++, off++) {
485569
*keyptr = htons(min + off % range_size);
486-
if (!nf_nat_used_tuple(tuple, ct))
570+
if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
487571
return;
488572
}
489573

0 commit comments

Comments
 (0)