Commit 35f83a7

Author: Florian Westphal
netfilter: nft_set_rbtree: don't gc elements on insert
During insertion we can queue up expired elements for garbage collection.
In case of later abort, the commit hook will never be called.

Packet path and 'get' requests will find free'd elements in the binary
search blob:

 nft_set_ext_key include/net/netfilter/nf_tables.h:800 [inline]
 nft_array_get_cmp+0x1f6/0x2a0 net/netfilter/nft_set_rbtree.c:133
 __inline_bsearch include/linux/bsearch.h:15 [inline]
 bsearch+0x50/0xc0 lib/bsearch.c:33
 nft_rbtree_get+0x16b/0x400 net/netfilter/nft_set_rbtree.c:169
 nft_setelem_get net/netfilter/nf_tables_api.c:6495 [inline]
 nft_get_set_elem+0x420/0xaa0 net/netfilter/nf_tables_api.c:6543
 nf_tables_getsetelem+0x448/0x5e0 net/netfilter/nf_tables_api.c:6632
 nfnetlink_rcv_msg+0x8ae/0x12c0 net/netfilter/nfnetlink.c:290

Also, when we insert an element that triggers -EEXIST, and that insertion
happens to also zap a timed-out entry, we end up with the same issue:
neither the commit nor the abort hook is called.

Fix this by removing gc api usage during insertion. The blamed commit also
removes concurrency of the rbtree with the packet path, so we can now
safely rb_erase() the element and move it to a new expired list that can
be reaped in the commit hook before building the next blob iteration.

This also avoids the need to rebuild the blob in the abort path: expired
elements seen during insertion attempts are kept around until a
transaction passes.

Reported-by: syzbot+d417922a3e7935517ef6@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d417922a3e7935517ef6
Fixes: 7e43e0a ("netfilter: nft_set_rbtree: translate rbtree to array for binary search")
Signed-off-by: Florian Westphal <fw@strlen.de>
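To make the fix described above concrete, the following is a small userspace sketch of the same two-phase scheme: phase one unlinks timed-out elements and parks them on an "expired" list, and phase two frees them only once the new lookup blob would already have been published. This is not the kernel implementation; the types and helpers (struct elem, scan_expired, reap_expired) are invented for illustration, and a plain singly linked list stands in for both the rbtree and the list_head based expired list.

/* Hypothetical userspace sketch of the two-phase gc described above. */
#include <stdio.h>
#include <stdlib.h>

struct elem {
        struct elem *next;      /* linked either into the live set or onto the expired list */
        unsigned long expiry;
        int key;
};

static struct elem *live;          /* stand-in for the rbtree */
static struct elem *expired_list;  /* stand-in for priv->expired */

/* Phase 1 (gc scan): unlink expired entries, but do not free them yet. */
static void scan_expired(unsigned long now)
{
        struct elem **pp = &live;

        while (*pp) {
                struct elem *e = *pp;

                if (e->expiry > now) {
                        pp = &e->next;
                        continue;
                }
                *pp = e->next;           /* plays the role of rb_erase() */
                e->next = expired_list;  /* plays the role of list_add(&rbe->list, &priv->expired) */
                expired_list = e;
        }
}

/* Phase 2 (gc queue): runs only after the new search blob is visible to readers. */
static void reap_expired(void)
{
        while (expired_list) {
                struct elem *e = expired_list;

                expired_list = e->next;
                printf("freeing key %d\n", e->key);
                free(e);
        }
}

int main(void)
{
        for (int i = 0; i < 4; i++) {
                struct elem *e = calloc(1, sizeof(*e));

                e->key = i;
                e->expiry = (i % 2) ? 100 : 5;  /* even keys are already expired at t=10 */
                e->next = live;
                live = e;
        }

        scan_expired(10);   /* commit path: move timed-out entries aside */
        /* ... rebuild and publish the new blob here ... */
        reap_expired();     /* then free what was collected */
        return 0;
}

The union of rb_node and list_head that the patch adds to struct nft_rbtree_elem works for the same reason a single next pointer suffices here: an element is either linked into the tree or parked on the expired list, never both at once.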
1 parent 24cf78c commit 35f83a7

1 file changed: net/netfilter/nft_set_rbtree.c (68 additions, 68 deletions)

@@ -34,11 +34,15 @@ struct nft_rbtree {
         struct nft_array __rcu  *array;
         struct nft_array        *array_next;
         unsigned long           last_gc;
+        struct list_head        expired;
 };
 
 struct nft_rbtree_elem {
         struct nft_elem_priv    priv;
-        struct rb_node          node;
+        union {
+                struct rb_node          node;
+                struct list_head        list;
+        };
         struct nft_set_ext      ext;
 };
 
@@ -179,13 +183,16 @@ nft_rbtree_get(const struct net *net, const struct nft_set *set,
         return &rbe->priv;
 }
 
-static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
-                                      struct nft_rbtree *priv,
-                                      struct nft_rbtree_elem *rbe)
+static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set,
+                                    struct nft_rbtree *priv,
+                                    struct nft_rbtree_elem *rbe)
 {
         lockdep_assert_held_write(&priv->lock);
         nft_setelem_data_deactivate(net, set, &rbe->priv);
         rb_erase(&rbe->node, &priv->root);
+
+        /* collected later on in commit callback */
+        list_add(&rbe->list, &priv->expired);
 }
 
 static const struct nft_rbtree_elem *
@@ -196,11 +203,6 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
         struct rb_node *prev = rb_prev(&rbe->node);
         struct net *net = read_pnet(&set->net);
         struct nft_rbtree_elem *rbe_prev;
-        struct nft_trans_gc *gc;
-
-        gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
-        if (!gc)
-                return ERR_PTR(-ENOMEM);
 
         /* search for end interval coming before this element.
          * end intervals don't carry a timeout extension, they
@@ -218,28 +220,10 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
         rbe_prev = NULL;
         if (prev) {
                 rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
-                nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev);
-
-                /* There is always room in this trans gc for this element,
-                 * memory allocation never actually happens, hence, the warning
-                 * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
-                 * this is synchronous gc which never fails.
-                 */
-                gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
-                if (WARN_ON_ONCE(!gc))
-                        return ERR_PTR(-ENOMEM);
-
-                nft_trans_gc_elem_add(gc, rbe_prev);
+                nft_rbtree_gc_elem_move(net, set, priv, rbe_prev);
         }
 
-        nft_rbtree_gc_elem_remove(net, set, priv, rbe);
-        gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
-        if (WARN_ON_ONCE(!gc))
-                return ERR_PTR(-ENOMEM);
-
-        nft_trans_gc_elem_add(gc, rbe);
-
-        nft_trans_gc_queue_sync_done(gc);
+        nft_rbtree_gc_elem_move(net, set, priv, rbe);
 
         return rbe_prev;
 }
@@ -675,29 +659,13 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
         }
 }
 
-static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
-                                 struct nft_rbtree *priv,
-                                 struct nft_rbtree_elem *rbe)
-{
-        nft_setelem_data_deactivate(net, set, &rbe->priv);
-        nft_rbtree_erase(priv, rbe);
-}
-
-static void nft_rbtree_gc(struct nft_set *set)
+static void nft_rbtree_gc_scan(struct nft_set *set)
 {
         struct nft_rbtree *priv = nft_set_priv(set);
         struct nft_rbtree_elem *rbe, *rbe_end = NULL;
         struct net *net = read_pnet(&set->net);
         u64 tstamp = nft_net_tstamp(net);
         struct rb_node *node, *next;
-        struct nft_trans_gc *gc;
-
-        set = nft_set_container_of(priv);
-        net = read_pnet(&set->net);
-
-        gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
-        if (!gc)
-                return;
 
         for (node = rb_first(&priv->root); node ; node = next) {
                 next = rb_next(node);
@@ -715,34 +683,46 @@ static void nft_rbtree_gc(struct nft_set *set)
                 if (!__nft_set_elem_expired(&rbe->ext, tstamp))
                         continue;
 
-                gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
-                if (!gc)
-                        goto try_later;
-
                 /* end element needs to be removed first, it has
                  * no timeout extension.
                  */
+                write_lock_bh(&priv->lock);
                 if (rbe_end) {
-                        nft_rbtree_gc_remove(net, set, priv, rbe_end);
-                        nft_trans_gc_elem_add(gc, rbe_end);
+                        nft_rbtree_gc_elem_move(net, set, priv, rbe_end);
                         rbe_end = NULL;
                 }
 
-                gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
-                if (!gc)
-                        goto try_later;
-
-                nft_rbtree_gc_remove(net, set, priv, rbe);
-                nft_trans_gc_elem_add(gc, rbe);
+                nft_rbtree_gc_elem_move(net, set, priv, rbe);
+                write_unlock_bh(&priv->lock);
         }
 
-try_later:
+        priv->last_gc = jiffies;
+}
+
+static void nft_rbtree_gc_queue(struct nft_set *set)
+{
+        struct nft_rbtree *priv = nft_set_priv(set);
+        struct nft_rbtree_elem *rbe, *rbe_end;
+        struct nft_trans_gc *gc;
+
+        if (list_empty(&priv->expired))
+                return;
 
-        if (gc) {
-                gc = nft_trans_gc_catchall_sync(gc);
-                nft_trans_gc_queue_sync_done(gc);
-                priv->last_gc = jiffies;
+        gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+        if (!gc)
+                return;
+
+        list_for_each_entry_safe(rbe, rbe_end, &priv->expired, list) {
+                list_del(&rbe->list);
+                nft_trans_gc_elem_add(gc, rbe);
+
+                gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+                if (!gc)
+                        return;
         }
+
+        gc = nft_trans_gc_catchall_sync(gc);
+        nft_trans_gc_queue_sync_done(gc);
 }
 
 static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
@@ -761,6 +741,7 @@ static int nft_rbtree_init(const struct nft_set *set,
 
         rwlock_init(&priv->lock);
         priv->root = RB_ROOT;
+        INIT_LIST_HEAD(&priv->expired);
 
         priv->array = NULL;
         priv->array_next = NULL;
@@ -778,10 +759,15 @@ static void nft_rbtree_destroy(const struct nft_ctx *ctx,
                                const struct nft_set *set)
 {
         struct nft_rbtree *priv = nft_set_priv(set);
-        struct nft_rbtree_elem *rbe;
+        struct nft_rbtree_elem *rbe, *next;
         struct nft_array *array;
         struct rb_node *node;
 
+        list_for_each_entry_safe(rbe, next, &priv->expired, list) {
+                list_del(&rbe->list);
+                nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
+        }
+
         while ((node = priv->root.rb_node) != NULL) {
                 rb_erase(node, &priv->root);
                 rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -828,13 +814,21 @@ static void nft_rbtree_commit(struct nft_set *set)
         u32 num_intervals = 0;
         struct rb_node *node;
 
-        if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
-                nft_rbtree_gc(set);
-
         /* No changes, skip, eg. elements updates only. */
         if (!priv->array_next)
                 return;
 
+        /* GC can be performed if the binary search blob is going
+         * to be rebuilt. It has to be done in two phases: first
+         * scan tree and move all expired elements to the expired
+         * list.
+         *
+         * Then, after blob has been re-built and published to other
+         * CPUs, queue collected entries for freeing.
+         */
+        if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+                nft_rbtree_gc_scan(set);
+
         /* Reverse walk to create an array from smaller to largest interval. */
         node = rb_last(&priv->root);
         if (node)
@@ -881,10 +875,16 @@ static void nft_rbtree_commit(struct nft_set *set)
                 num_intervals++;
 err_out:
         priv->array_next->num_intervals = num_intervals;
-        old = rcu_replace_pointer(priv->array, priv->array_next, true);
+        old = rcu_replace_pointer(priv->array, priv->array_next,
+                                  lockdep_is_held(&nft_pernet(read_pnet(&set->net))->commit_mutex));
         priv->array_next = NULL;
         if (old)
                 call_rcu(&old->rcu_head, nft_array_free_rcu);
+
+        /* New blob is public, queue collected entries for freeing.
+         * call_rcu ensures elements stay around until readers are done.
+         */
+        nft_rbtree_gc_queue(set);
 }
 
 static void nft_rbtree_abort(const struct nft_set *set)
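The comment added in the commit hook ("New blob is public, queue collected entries for freeing") depends on publishing the new array before reclaiming anything a reader might still reach. Below is a rough userspace analogue of that ordering, not the kernel mechanism: a plain reference count stands in for call_rcu()/RCU, it runs single-threaded, and struct blob, reader_get(), publish() and the other names are made up for illustration.

/* Hypothetical sketch: publish the new blob first, reclaim the old one later. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct blob {
        atomic_int refs;
        int nelems;
        int elems[8];
};

static _Atomic(struct blob *) current_blob;

static struct blob *reader_get(void)
{
        struct blob *b = atomic_load(&current_blob);

        atomic_fetch_add(&b->refs, 1);
        return b;
}

static void reader_put(struct blob *b)
{
        if (atomic_fetch_sub(&b->refs, 1) == 1)
                free(b);        /* last reference gone, safe to reclaim */
}

static void publish(struct blob *next)
{
        struct blob *old = atomic_exchange(&current_blob, next);

        if (old)
                reader_put(old);        /* like call_rcu(): freed once readers are done */
}

int main(void)
{
        struct blob *first = calloc(1, sizeof(*first));

        atomic_init(&first->refs, 1);
        first->nelems = 1;
        first->elems[0] = 42;
        publish(first);

        struct blob *snap = reader_get();       /* a reader still holds the old blob */

        struct blob *next = calloc(1, sizeof(*next));

        atomic_init(&next->refs, 1);
        publish(next);                          /* new blob public, old one not yet freed */

        printf("reader still sees %d\n", snap->elems[0]);
        reader_put(snap);                       /* old blob reclaimed only now */

        reader_put(atomic_load(&current_blob)); /* final cleanup before exit */
        return 0;
}

In the kernel the same ordering is what the added call to nft_rbtree_gc_queue() after rcu_replace_pointer() provides: the collected elements are only handed over for freeing once the new blob is the one readers will find, with call_rcu() keeping the old data around until existing readers finish.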
