Skip to content

Commit 792aaea

Browse files
committed
Florian Westphal says: ==================== netfilter: updates for net-next The following patchset contains Netfilter updates for *net-next*: 1) Fix net-next-only use-after-free bug in nf_tables rbtree set: Expired elements cannot be released right away after unlink anymore because there is no guarantee that the binary-search blob is going to be updated. Spotted by syzkaller. 2) Fix esoteric bug in nf_queue with udp fraglist gro, broken since 6.11. Patch 3 extends the nfqueue selftest for this. 4) Use dedicated slab for flowtable entries, currently the -512 cache is used, which is wasteful. From Qingfang Deng. 5) Recent net-next update extended existing test for ip6ip6 tunnels, add the required /config entry. Test still passed by accident because the previous test's network setup gets re-used, so also update the test so it will fail in case the ip6ip6 tunnel interface cannot be added. 6) Fix 'nft get element mytable myset { 1.2.3.4 }' on big endian platforms, this was broken since code was added in v5.1. 7) Fix nf_tables counter reset support on 32bit platforms, where counter reset may cause huge values to appear due to wraparound. Broken since reset feature was added in v6.11. From Anders Grahn. 8-11) update nf_tables rbtree set type to detect partial overlaps. This will eventually speed up nftables userspace: at this time userspace does a netlink dump of the set content which slows down incremental updates on interval sets. From Pablo Neira Ayuso. 
* tag 'nf-next-26-02-06' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next: netfilter: nft_set_rbtree: validate open interval overlap netfilter: nft_set_rbtree: validate element belonging to interval netfilter: nft_set_rbtree: check for partial overlaps in anonymous sets netfilter: nft_set_rbtree: fix bogus EEXIST with NLM_F_CREATE with null interval netfilter: nft_counter: fix reset of counters on 32bit archs netfilter: nft_set_hash: fix get operation on big endian selftests: netfilter: add IPV6_TUNNEL to config netfilter: flowtable: dedicated slab for flow entry selftests: netfilter: nft_queue.sh: add udp fraglist gro test case netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation netfilter: nft_set_rbtree: don't gc elements on insert ==================== Link: https://patch.msgid.link/20260206153048.17570-1-fw@strlen.de Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2 parents 3a46873 + 6489469 commit 792aaea

12 files changed

Lines changed: 580 additions & 148 deletions

File tree

include/linux/u64_stats_sync.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,11 @@ static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
9797
local64_add(val, &p->v);
9898
}
9999

100+
static inline void u64_stats_sub(u64_stats_t *p, s64 val)
101+
{
102+
local64_sub(val, &p->v);
103+
}
104+
100105
static inline void u64_stats_inc(u64_stats_t *p)
101106
{
102107
local64_inc(&p->v);
@@ -145,6 +150,11 @@ static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
145150
p->v += val;
146151
}
147152

153+
static inline void u64_stats_sub(u64_stats_t *p, s64 val)
154+
{
155+
p->v -= val;
156+
}
157+
148158
static inline void u64_stats_inc(u64_stats_t *p)
149159
{
150160
p->v++;

include/net/netfilter/nf_queue.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ struct nf_queue_entry {
2121
struct net_device *physout;
2222
#endif
2323
struct nf_hook_state state;
24+
bool nf_ct_is_unconfirmed;
2425
u16 size; /* sizeof(entry) + saved route keys */
2526
u16 queue_num;
2627

include/net/netfilter/nf_tables.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,8 @@ struct nft_userdata {
277277
unsigned char data[];
278278
};
279279

280+
#define NFT_SET_ELEM_INTERNAL_LAST 0x1
281+
280282
/* placeholder structure for opaque set element backend representation. */
281283
struct nft_elem_priv { };
282284

@@ -286,6 +288,7 @@ struct nft_elem_priv { };
286288
* @key: element key
287289
* @key_end: closing element key
288290
* @data: element data
291+
* @flags: flags
289292
* @priv: element private data and extensions
290293
*/
291294
struct nft_set_elem {
@@ -301,6 +304,7 @@ struct nft_set_elem {
301304
u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
302305
struct nft_data val;
303306
} data;
307+
u32 flags;
304308
struct nft_elem_priv *priv;
305309
};
306310

net/netfilter/nf_flow_table_core.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
static DEFINE_MUTEX(flowtable_lock);
1818
static LIST_HEAD(flowtables);
19+
static __read_mostly struct kmem_cache *flow_offload_cachep;
1920

2021
static void
2122
flow_offload_fill_dir(struct flow_offload *flow,
@@ -56,7 +57,7 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
5657
if (unlikely(nf_ct_is_dying(ct)))
5758
return NULL;
5859

59-
flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
60+
flow = kmem_cache_zalloc(flow_offload_cachep, GFP_ATOMIC);
6061
if (!flow)
6162
return NULL;
6263

@@ -812,9 +813,13 @@ static int __init nf_flow_table_module_init(void)
812813
{
813814
int ret;
814815

816+
flow_offload_cachep = KMEM_CACHE(flow_offload, SLAB_HWCACHE_ALIGN);
817+
if (!flow_offload_cachep)
818+
return -ENOMEM;
819+
815820
ret = register_pernet_subsys(&nf_flow_table_net_ops);
816821
if (ret < 0)
817-
return ret;
822+
goto out_pernet;
818823

819824
ret = nf_flow_table_offload_init();
820825
if (ret)
@@ -830,13 +835,16 @@ static int __init nf_flow_table_module_init(void)
830835
nf_flow_table_offload_exit();
831836
out_offload:
832837
unregister_pernet_subsys(&nf_flow_table_net_ops);
838+
out_pernet:
839+
kmem_cache_destroy(flow_offload_cachep);
833840
return ret;
834841
}
835842

836843
static void __exit nf_flow_table_module_exit(void)
837844
{
838845
nf_flow_table_offload_exit();
839846
unregister_pernet_subsys(&nf_flow_table_net_ops);
847+
kmem_cache_destroy(flow_offload_cachep);
840848
}
841849

842850
module_init(nf_flow_table_module_init);

net/netfilter/nf_tables_api.c

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7270,7 +7270,8 @@ static u32 nft_set_maxsize(const struct nft_set *set)
72707270
}
72717271

72727272
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
7273-
const struct nlattr *attr, u32 nlmsg_flags)
7273+
const struct nlattr *attr, u32 nlmsg_flags,
7274+
bool last)
72747275
{
72757276
struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {};
72767277
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
@@ -7556,6 +7557,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
75567557
if (flags)
75577558
*nft_set_ext_flags(ext) = flags;
75587559

7560+
if (last)
7561+
elem.flags = NFT_SET_ELEM_INTERNAL_LAST;
7562+
else
7563+
elem.flags = 0;
7564+
75597565
if (obj)
75607566
*nft_set_ext_obj(ext) = obj;
75617567

@@ -7636,6 +7642,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
76367642
* and an existing one.
76377643
*/
76387644
err = -EEXIST;
7645+
} else if (err == -ECANCELED) {
7646+
/* ECANCELED reports an existing nul-element in
7647+
* interval sets.
7648+
*/
7649+
err = 0;
76397650
}
76407651
goto err_element_clash;
76417652
}
@@ -7714,7 +7725,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
77147725
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
77157726

77167727
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
7717-
err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags);
7728+
err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags,
7729+
nla_is_last(attr, rem));
77187730
if (err < 0) {
77197731
NL_SET_BAD_ATTR(extack, attr);
77207732
return err;
@@ -7838,7 +7850,7 @@ static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx,
78387850
}
78397851

78407852
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
7841-
const struct nlattr *attr)
7853+
const struct nlattr *attr, bool last)
78427854
{
78437855
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
78447856
struct nft_set_ext_tmpl tmpl;
@@ -7906,6 +7918,11 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
79067918
if (flags)
79077919
*nft_set_ext_flags(ext) = flags;
79087920

7921+
if (last)
7922+
elem.flags = NFT_SET_ELEM_INTERNAL_LAST;
7923+
else
7924+
elem.flags = 0;
7925+
79097926
trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
79107927
if (trans == NULL)
79117928
goto fail_trans;
@@ -8053,7 +8070,8 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
80538070
return nft_set_flush(&ctx, set, genmask);
80548071

80558072
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
8056-
err = nft_del_setelem(&ctx, set, attr);
8073+
err = nft_del_setelem(&ctx, set, attr,
8074+
nla_is_last(attr, rem));
80578075
if (err == -ENOENT &&
80588076
NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM)
80598077
continue;

net/netfilter/nfnetlink_queue.c

Lines changed: 74 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,34 @@ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
435435
nf_queue_entry_free(entry);
436436
}
437437

438+
/* return true if the entry has an unconfirmed conntrack attached that isn't owned by us
439+
* exclusively.
440+
*/
441+
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry, bool *is_unconfirmed)
442+
{
443+
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
444+
struct nf_conn *ct = (void *)skb_nfct(entry->skb);
445+
446+
if (!ct || nf_ct_is_confirmed(ct))
447+
return false;
448+
449+
if (is_unconfirmed)
450+
*is_unconfirmed = true;
451+
452+
/* in some cases skb_clone() can occur after initial conntrack
453+
* pickup, but conntrack assumes exclusive skb->_nfct ownership for
454+
* unconfirmed entries.
455+
*
456+
* This happens for br_netfilter and with ip multicast routing.
457+
* This can't be solved with serialization here because one clone
458+
* could have been queued for local delivery or could be transmitted
459+
* in parallel on another CPU.
460+
*/
461+
return refcount_read(&ct->ct_general.use) > 1;
462+
#endif
463+
return false;
464+
}
465+
438466
static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
439467
{
440468
const struct nf_ct_hook *ct_hook;
@@ -462,6 +490,24 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
462490
break;
463491
}
464492
}
493+
494+
if (verdict != NF_DROP && entry->nf_ct_is_unconfirmed) {
495+
/* If first queued segment was already reinjected then
496+
* there is a good chance the ct entry is now confirmed.
497+
*
498+
* Handle the rare cases:
499+
* - out-of-order verdict
500+
* - threaded userspace reinjecting in parallel
501+
* - first segment was dropped
502+
*
503+
* In all of those cases we can't handle this packet
504+
* because we can't be sure that another CPU won't modify
505+
* nf_conn->ext in parallel which isn't allowed.
506+
*/
507+
if (nf_ct_drop_unconfirmed(entry, NULL))
508+
verdict = NF_DROP;
509+
}
510+
465511
nf_reinject(entry, verdict);
466512
}
467513

@@ -891,49 +937,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
891937
return NULL;
892938
}
893939

894-
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
895-
{
896-
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
897-
static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
898-
struct nf_conn *ct = (void *)skb_nfct(entry->skb);
899-
unsigned long status;
900-
unsigned int use;
901-
902-
if (!ct)
903-
return false;
904-
905-
status = READ_ONCE(ct->status);
906-
if ((status & flags) == IPS_DYING)
907-
return true;
908-
909-
if (status & IPS_CONFIRMED)
910-
return false;
911-
912-
/* in some cases skb_clone() can occur after initial conntrack
913-
* pickup, but conntrack assumes exclusive skb->_nfct ownership for
914-
* unconfirmed entries.
915-
*
916-
* This happens for br_netfilter and with ip multicast routing.
917-
* We can't be solved with serialization here because one clone could
918-
* have been queued for local delivery.
919-
*/
920-
use = refcount_read(&ct->ct_general.use);
921-
if (likely(use == 1))
922-
return false;
923-
924-
/* Can't decrement further? Exclusive ownership. */
925-
if (!refcount_dec_not_one(&ct->ct_general.use))
926-
return false;
927-
928-
skb_set_nfct(entry->skb, 0);
929-
/* No nf_ct_put(): we already decremented .use and it cannot
930-
* drop down to 0.
931-
*/
932-
return true;
933-
#endif
934-
return false;
935-
}
936-
937940
static int
938941
__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
939942
struct nf_queue_entry *entry)
@@ -950,9 +953,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
950953
}
951954
spin_lock_bh(&queue->lock);
952955

953-
if (nf_ct_drop_unconfirmed(entry))
954-
goto err_out_free_nskb;
955-
956956
if (queue->queue_total >= queue->queue_maxlen)
957957
goto err_out_queue_drop;
958958

@@ -995,7 +995,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
995995
else
996996
net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
997997
}
998-
err_out_free_nskb:
999998
kfree_skb(nskb);
1000999
err_out_unlock:
10011000
spin_unlock_bh(&queue->lock);
@@ -1074,9 +1073,10 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
10741073
static int
10751074
nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
10761075
{
1077-
unsigned int queued;
1078-
struct nfqnl_instance *queue;
10791076
struct sk_buff *skb, *segs, *nskb;
1077+
bool ct_is_unconfirmed = false;
1078+
struct nfqnl_instance *queue;
1079+
unsigned int queued;
10801080
int err = -ENOBUFS;
10811081
struct net *net = entry->state.net;
10821082
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
@@ -1100,6 +1100,15 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
11001100
break;
11011101
}
11021102

1103+
/* Check if someone already holds another reference to
1104+
* unconfirmed ct. If so, we cannot queue the skb:
1105+
* concurrent modifications of nf_conn->ext are not
1106+
* allowed and we can't know if another CPU isn't
1107+
* processing the same nf_conn entry in parallel.
1108+
*/
1109+
if (nf_ct_drop_unconfirmed(entry, &ct_is_unconfirmed))
1110+
return -EINVAL;
1111+
11031112
if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
11041113
return __nfqnl_enqueue_packet(net, queue, entry);
11051114

@@ -1113,7 +1122,23 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
11131122
goto out_err;
11141123
queued = 0;
11151124
err = 0;
1125+
11161126
skb_list_walk_safe(segs, segs, nskb) {
1127+
if (ct_is_unconfirmed && queued > 0) {
1128+
/* skb_gso_segment() increments the ct refcount.
1129+
* This is a problem for unconfirmed (not in hash)
1130+
* entries, those can race when reinjections happen
1131+
* in parallel.
1132+
*
1133+
* Annotate this for all queued entries except the
1134+
* first one.
1135+
*
1136+
* As long as the first one is reinjected first it
1137+
* will do the confirmation for us.
1138+
*/
1139+
entry->nf_ct_is_unconfirmed = ct_is_unconfirmed;
1140+
}
1141+
11171142
if (err == 0)
11181143
err = __nfqnl_enqueue_packet_gso(net, queue,
11191144
segs, entry);

net/netfilter/nft_counter.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,8 @@ static void nft_counter_reset(struct nft_counter_percpu_priv *priv,
117117
nft_sync = this_cpu_ptr(&nft_counter_sync);
118118

119119
u64_stats_update_begin(nft_sync);
120-
u64_stats_add(&this_cpu->packets, -total->packets);
121-
u64_stats_add(&this_cpu->bytes, -total->bytes);
120+
u64_stats_sub(&this_cpu->packets, total->packets);
121+
u64_stats_sub(&this_cpu->bytes, total->bytes);
122122
u64_stats_update_end(nft_sync);
123123

124124
local_bh_enable();

net/netfilter/nft_set_hash.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -619,15 +619,20 @@ static struct nft_elem_priv *
619619
nft_hash_get(const struct net *net, const struct nft_set *set,
620620
const struct nft_set_elem *elem, unsigned int flags)
621621
{
622+
const u32 *key = (const u32 *)&elem->key.val;
622623
struct nft_hash *priv = nft_set_priv(set);
623624
u8 genmask = nft_genmask_cur(net);
624625
struct nft_hash_elem *he;
625626
u32 hash;
626627

627-
hash = jhash(elem->key.val.data, set->klen, priv->seed);
628+
if (set->klen == 4)
629+
hash = jhash_1word(*key, priv->seed);
630+
else
631+
hash = jhash(key, set->klen, priv->seed);
632+
628633
hash = reciprocal_scale(hash, priv->buckets);
629634
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
630-
if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) &&
635+
if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) &&
631636
nft_set_elem_active(&he->ext, genmask))
632637
return &he->priv;
633638
}

0 commit comments

Comments
 (0)