Skip to content

Commit 207b3eb

Browse files
author
Florian Westphal
committed
netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation
Ulrich reports a regression with nfqueue: If an application did not set the 'F_GSO' capability flag and a gso packet with an unconfirmed nf_conn entry is received, all packets are now dropped instead of queued, because the check happens after skb_gso_segment(). In that case, we did have exclusive ownership of the skb and its associated conntrack entry. The elevated use count is due to skb_clone happening via skb_gso_segment(). Move the check so that it's performed vs. the aggregated packet. Then, annotate the individual segments except the first one so we can do a 2nd check at reinject time. For the normal case, where userspace does in-order reinjects, this avoids packet drops: first reinjected segment continues traversal and confirms entry, remaining segments observe the confirmed entry. While at it, simplify nf_ct_drop_unconfirmed(): We only care about unconfirmed entries with a refcnt > 1, there is no need to special-case dying entries. This only happens with UDP. With TCP, the only unconfirmed packet will be the TCP SYN, those aren't aggregated by GRO. Next patch adds a udpgro test case to cover this scenario. Reported-by: Ulrich Weber <ulrich.weber@gmail.com> Fixes: 7d8dc1c ("netfilter: nf_queue: drop packets with cloned unconfirmed conntracks") Signed-off-by: Florian Westphal <fw@strlen.de>
1 parent 35f83a7 commit 207b3eb

2 files changed

Lines changed: 75 additions & 49 deletions

File tree

include/net/netfilter/nf_queue.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ struct nf_queue_entry {
2121
struct net_device *physout;
2222
#endif
2323
struct nf_hook_state state;
24+
bool nf_ct_is_unconfirmed;
2425
u16 size; /* sizeof(entry) + saved route keys */
2526
u16 queue_num;
2627

net/netfilter/nfnetlink_queue.c

Lines changed: 74 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,34 @@ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
435435
nf_queue_entry_free(entry);
436436
}
437437

438+
/* return true if the entry has an unconfirmed conntrack attached that isn't owned by us
439+
* exclusively.
440+
*/
441+
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry, bool *is_unconfirmed)
442+
{
443+
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
444+
struct nf_conn *ct = (void *)skb_nfct(entry->skb);
445+
446+
if (!ct || nf_ct_is_confirmed(ct))
447+
return false;
448+
449+
if (is_unconfirmed)
450+
*is_unconfirmed = true;
451+
452+
/* in some cases skb_clone() can occur after initial conntrack
453+
* pickup, but conntrack assumes exclusive skb->_nfct ownership for
454+
* unconfirmed entries.
455+
*
456+
* This happens for br_netfilter and with ip multicast routing.
457+
* This can't be solved with serialization here because one clone
458+
* could have been queued for local delivery or could be transmitted
459+
* in parallel on another CPU.
460+
*/
461+
return refcount_read(&ct->ct_general.use) > 1;
462+
#endif
463+
return false;
464+
}
465+
438466
static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
439467
{
440468
const struct nf_ct_hook *ct_hook;
@@ -462,6 +490,24 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
462490
break;
463491
}
464492
}
493+
494+
if (verdict != NF_DROP && entry->nf_ct_is_unconfirmed) {
495+
/* If first queued segment was already reinjected then
496+
* there is a good chance the ct entry is now confirmed.
497+
*
498+
* Handle the rare cases:
499+
* - out-of-order verdict
500+
* - threaded userspace reinjecting in parallel
501+
* - first segment was dropped
502+
*
503+
* In all of those cases we can't handle this packet
504+
* because we can't be sure that another CPU won't modify
505+
* nf_conn->ext in parallel which isn't allowed.
506+
*/
507+
if (nf_ct_drop_unconfirmed(entry, NULL))
508+
verdict = NF_DROP;
509+
}
510+
465511
nf_reinject(entry, verdict);
466512
}
467513

@@ -891,49 +937,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
891937
return NULL;
892938
}
893939

894-
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
895-
{
896-
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
897-
static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
898-
struct nf_conn *ct = (void *)skb_nfct(entry->skb);
899-
unsigned long status;
900-
unsigned int use;
901-
902-
if (!ct)
903-
return false;
904-
905-
status = READ_ONCE(ct->status);
906-
if ((status & flags) == IPS_DYING)
907-
return true;
908-
909-
if (status & IPS_CONFIRMED)
910-
return false;
911-
912-
/* in some cases skb_clone() can occur after initial conntrack
913-
* pickup, but conntrack assumes exclusive skb->_nfct ownership for
914-
* unconfirmed entries.
915-
*
916-
* This happens for br_netfilter and with ip multicast routing.
917-
* We can't be solved with serialization here because one clone could
918-
* have been queued for local delivery.
919-
*/
920-
use = refcount_read(&ct->ct_general.use);
921-
if (likely(use == 1))
922-
return false;
923-
924-
/* Can't decrement further? Exclusive ownership. */
925-
if (!refcount_dec_not_one(&ct->ct_general.use))
926-
return false;
927-
928-
skb_set_nfct(entry->skb, 0);
929-
/* No nf_ct_put(): we already decremented .use and it cannot
930-
* drop down to 0.
931-
*/
932-
return true;
933-
#endif
934-
return false;
935-
}
936-
937940
static int
938941
__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
939942
struct nf_queue_entry *entry)
@@ -950,9 +953,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
950953
}
951954
spin_lock_bh(&queue->lock);
952955

953-
if (nf_ct_drop_unconfirmed(entry))
954-
goto err_out_free_nskb;
955-
956956
if (queue->queue_total >= queue->queue_maxlen)
957957
goto err_out_queue_drop;
958958

@@ -995,7 +995,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
995995
else
996996
net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
997997
}
998-
err_out_free_nskb:
999998
kfree_skb(nskb);
1000999
err_out_unlock:
10011000
spin_unlock_bh(&queue->lock);
@@ -1074,9 +1073,10 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
10741073
static int
10751074
nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
10761075
{
1077-
unsigned int queued;
1078-
struct nfqnl_instance *queue;
10791076
struct sk_buff *skb, *segs, *nskb;
1077+
bool ct_is_unconfirmed = false;
1078+
struct nfqnl_instance *queue;
1079+
unsigned int queued;
10801080
int err = -ENOBUFS;
10811081
struct net *net = entry->state.net;
10821082
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
@@ -1100,6 +1100,15 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
11001100
break;
11011101
}
11021102

1103+
/* Check if someone already holds another reference to
1104+
* unconfirmed ct. If so, we cannot queue the skb:
1105+
* concurrent modifications of nf_conn->ext are not
1106+
* allowed and we can't know if another CPU isn't
1107+
* processing the same nf_conn entry in parallel.
1108+
*/
1109+
if (nf_ct_drop_unconfirmed(entry, &ct_is_unconfirmed))
1110+
return -EINVAL;
1111+
11031112
if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
11041113
return __nfqnl_enqueue_packet(net, queue, entry);
11051114

@@ -1113,7 +1122,23 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
11131122
goto out_err;
11141123
queued = 0;
11151124
err = 0;
1125+
11161126
skb_list_walk_safe(segs, segs, nskb) {
1127+
if (ct_is_unconfirmed && queued > 0) {
1128+
/* skb_gso_segment() increments the ct refcount.
1129+
* This is a problem for unconfirmed (not in hash)
1130+
* entries, those can race when reinjections happen
1131+
* in parallel.
1132+
*
1133+
* Annotate this for all queued entries except the
1134+
* first one.
1135+
*
1136+
* As long as the first one is reinjected first it
1137+
* will do the confirmation for us.
1138+
*/
1139+
entry->nf_ct_is_unconfirmed = ct_is_unconfirmed;
1140+
}
1141+
11171142
if (err == 0)
11181143
err = __nfqnl_enqueue_packet_gso(net, queue,
11191144
segs, entry);

0 commit comments

Comments
 (0)