1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
4 *
5 * Development of this code funded by Astaro AG (http://www.astaro.com/)
6 */
7
8 #include <linux/kernel.h>
9 #include <linux/init.h>
10 #include <linux/module.h>
11 #include <linux/list.h>
12 #include <linux/rbtree.h>
13 #include <linux/bsearch.h>
14 #include <linux/netlink.h>
15 #include <linux/netfilter.h>
16 #include <linux/netfilter/nf_tables.h>
17 #include <net/netfilter/nf_tables_core.h>
18
/* One entry of the sorted-array lookup blob: pointers into the set_ext
 * areas of the elements that open and close the interval.
 */
struct nft_array_interval {
	struct nft_set_ext *from;	/* interval start element */
	struct nft_set_ext *to;		/* interval end element, NULL if open-ended */
};
23
/* RCU-published binary-search blob, rebuilt from the rbtree at commit time. */
struct nft_array {
	u32 max_intervals;	/* allocated capacity of @intervals */
	u32 num_intervals;	/* populated entries in @intervals */
	struct nft_array_interval *intervals;
	struct rcu_head rcu_head;	/* deferred free of a replaced blob */
};
30
/* Per-set private data: authoritative rbtree plus the array used for lookups. */
struct nft_rbtree {
	struct rb_root root;		/* interval elements, stored in reverse order */
	rwlock_t lock;			/* protects tree walks vs. updates */
	struct nft_array __rcu *array;	/* published lookup blob (datapath) */
	struct nft_array *array_next;	/* blob being built for the next generation */
	unsigned long start_rbe_cookie;	/* start element recorded within one transaction */
	unsigned long last_gc;		/* jiffies of the last GC scan */
	struct list_head expired;	/* GC'ed elements awaiting release at commit */
	u64 last_tstamp;		/* transaction timestamp the cookie belongs to */
};
41
struct nft_rbtree_elem {
	struct nft_elem_priv priv;	/* must stay first, see BUILD_BUG_ON in init */
	union {
		struct rb_node node;	/* linked in the tree while present */
		struct list_head list;	/* linked in ->expired once GC removed it */
	};
	struct nft_set_ext ext;
};
50
nft_rbtree_interval_end(const struct nft_rbtree_elem * rbe)51 static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
52 {
53 return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
54 (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
55 }
56
nft_rbtree_interval_start(const struct nft_rbtree_elem * rbe)57 static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
58 {
59 return !nft_rbtree_interval_end(rbe);
60 }
61
nft_rbtree_interval_null(const struct nft_set * set,const struct nft_rbtree_elem * rbe)62 static bool nft_rbtree_interval_null(const struct nft_set *set,
63 const struct nft_rbtree_elem *rbe)
64 {
65 return (!memchr_inv(nft_set_ext_key(&rbe->ext), 0, set->klen) &&
66 nft_rbtree_interval_end(rbe));
67 }
68
nft_rbtree_cmp(const struct nft_set * set,const struct nft_rbtree_elem * e1,const struct nft_rbtree_elem * e2)69 static int nft_rbtree_cmp(const struct nft_set *set,
70 const struct nft_rbtree_elem *e1,
71 const struct nft_rbtree_elem *e2)
72 {
73 return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext),
74 set->klen);
75 }
76
/* bsearch() key for the datapath lookup: packet key and its length. */
struct nft_array_lookup_ctx {
	const u32 *key;
	u32 klen;
};
81
nft_array_lookup_cmp(const void * pkey,const void * entry)82 static int nft_array_lookup_cmp(const void *pkey, const void *entry)
83 {
84 const struct nft_array_interval *interval = entry;
85 const struct nft_array_lookup_ctx *ctx = pkey;
86 int a, b;
87
88 if (!interval->from)
89 return 1;
90
91 a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen);
92 if (!interval->to)
93 b = -1;
94 else
95 b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen);
96
97 if (a >= 0 && b < 0)
98 return 0;
99
100 if (a < 0)
101 return -1;
102
103 return 1;
104 }
105
/* Datapath lookup: binary search over the RCU-published interval array. */
INDIRECT_CALLABLE_SCOPE
const struct nft_set_ext *
nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
		  const u32 *key)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_array *array = rcu_dereference(priv->array);
	const struct nft_array_interval *interval;
	struct nft_array_lookup_ctx ctx = {
		.key = key,
		.klen = set->klen,
	};

	/* Blob is published at commit time; none yet means nothing matches. */
	if (!array)
		return NULL;

	interval = bsearch(&ctx, array->intervals, array->num_intervals,
			   sizeof(struct nft_array_interval),
			   nft_array_lookup_cmp);
	if (!interval || nft_set_elem_expired(interval->from))
		return NULL;

	return interval->from;
}
130
/* bsearch() key for the control-plane get path; also carries the request
 * flags so NFT_SET_ELEM_INTERVAL_END can match interval end boundaries.
 */
struct nft_array_get_ctx {
	const u32 *key;
	unsigned int flags;
	u32 klen;
};
136
/* bsearch() comparator for nft_rbtree_get(): like nft_array_lookup_cmp(),
 * but when the caller asked for NFT_SET_ELEM_INTERVAL_END a key equal to
 * the interval end boundary (b == 0) also matches.
 */
static int nft_array_get_cmp(const void *pkey, const void *entry)
{
	const struct nft_array_interval *interval = entry;
	const struct nft_array_get_ctx *ctx = pkey;
	int a, b;

	/* Unpopulated slot: continue search to the right. */
	if (!interval->from)
		return 1;

	a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen);
	/* NULL ->to denotes an open-ended interval. */
	if (!interval->to)
		b = -1;
	else
		b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen);

	if (a >= 0) {
		if (ctx->flags & NFT_SET_ELEM_INTERVAL_END && b <= 0)
			return 0;
		else if (b < 0)
			return 0;
	}

	if (a < 0)
		return -1;

	return 1;
}
164
/* Control-plane element lookup: binary-search the published array for the
 * interval containing the key and return the matching start element, or
 * the end element when NFT_SET_ELEM_INTERVAL_END was passed.
 */
static struct nft_elem_priv *
nft_rbtree_get(const struct net *net, const struct nft_set *set,
	       const struct nft_set_elem *elem, unsigned int flags)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_array *array = rcu_dereference(priv->array);
	const struct nft_array_interval *interval;
	struct nft_array_get_ctx ctx = {
		.key = (const u32 *)&elem->key.val,
		.flags = flags,
		.klen = set->klen,
	};
	struct nft_rbtree_elem *rbe;

	/* No array published yet: nothing to report. */
	if (!array)
		return ERR_PTR(-ENOENT);

	interval = bsearch(&ctx, array->intervals, array->num_intervals,
			   sizeof(struct nft_array_interval), nft_array_get_cmp);
	if (!interval || nft_set_elem_expired(interval->from))
		return ERR_PTR(-ENOENT);

	if (flags & NFT_SET_ELEM_INTERVAL_END)
		rbe = container_of(interval->to, struct nft_rbtree_elem, ext);
	else
		rbe = container_of(interval->from, struct nft_rbtree_elem, ext);

	return &rbe->priv;
}
194
/* Unlink an element from the tree and park it on the ->expired list;
 * the element itself is released later, from the commit path.
 */
static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set,
				    struct nft_rbtree *priv,
				    struct nft_rbtree_elem *rbe)
{
	lockdep_assert_held_write(&priv->lock);
	nft_setelem_data_deactivate(net, set, &rbe->priv);
	rb_erase(&rbe->node, &priv->root);

	/* collected later on in commit callback */
	list_add(&rbe->list, &priv->expired);
}
206
/* Garbage-collect expired element @rbe together with the end element that
 * closes its interval, if one exists. Returns the removed end element (or
 * NULL) so callers walking the tree can tell whether a node they already
 * recorded has just been unlinked.
 */
static const struct nft_rbtree_elem *
nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
		   struct nft_rbtree_elem *rbe)
{
	struct nft_set *set = (struct nft_set *)__set;
	struct rb_node *prev = rb_prev(&rbe->node);
	struct net *net = read_pnet(&set->net);
	struct nft_rbtree_elem *rbe_prev;

	/* search for end interval coming before this element.
	 * end intervals don't carry a timeout extension, they
	 * are coupled with the interval start element.
	 */
	while (prev) {
		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
		if (nft_rbtree_interval_end(rbe_prev) &&
		    nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY))
			break;

		prev = rb_prev(prev);
	}

	rbe_prev = NULL;
	if (prev) {
		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
		nft_rbtree_gc_elem_move(net, set, priv, rbe_prev);
	}

	nft_rbtree_gc_elem_move(net, set, priv, rbe);

	return rbe_prev;
}
239
nft_rbtree_update_first(const struct nft_set * set,struct nft_rbtree_elem * rbe,struct rb_node * first)240 static bool nft_rbtree_update_first(const struct nft_set *set,
241 struct nft_rbtree_elem *rbe,
242 struct rb_node *first)
243 {
244 struct nft_rbtree_elem *first_elem;
245
246 first_elem = rb_entry(first, struct nft_rbtree_elem, node);
247 /* this element is closest to where the new element is to be inserted:
248 * update the first element for the node list path.
249 */
250 if (nft_rbtree_cmp(set, rbe, first_elem) < 0)
251 return true;
252
253 return false;
254 }
255
256 /* Only for anonymous sets which do not allow updates, all element are active. */
nft_rbtree_prev_active(struct nft_rbtree_elem * rbe)257 static struct nft_rbtree_elem *nft_rbtree_prev_active(struct nft_rbtree_elem *rbe)
258 {
259 struct rb_node *node;
260
261 node = rb_prev(&rbe->node);
262 if (!node)
263 return NULL;
264
265 return rb_entry(node, struct nft_rbtree_elem, node);
266 }
267
268 static struct nft_rbtree_elem *
__nft_rbtree_next_active(struct rb_node * node,u8 genmask)269 __nft_rbtree_next_active(struct rb_node *node, u8 genmask)
270 {
271 struct nft_rbtree_elem *next_rbe;
272
273 while (node) {
274 next_rbe = rb_entry(node, struct nft_rbtree_elem, node);
275 if (!nft_set_elem_active(&next_rbe->ext, genmask)) {
276 node = rb_next(node);
277 continue;
278 }
279
280 return next_rbe;
281 }
282
283 return NULL;
284 }
285
/* Next tree element after @rbe that is active in @genmask, if any. */
static struct nft_rbtree_elem *
nft_rbtree_next_active(struct nft_rbtree_elem *rbe, u8 genmask)
{
	return __nft_rbtree_next_active(rb_next(&rbe->node), genmask);
}
291
/* The start-element cookie is only meaningful within one transaction;
 * drop it as soon as the transaction timestamp changes.
 */
static void nft_rbtree_maybe_reset_start_cookie(struct nft_rbtree *priv,
						u64 tstamp)
{
	if (priv->last_tstamp == tstamp)
		return;

	priv->start_rbe_cookie = 0;
	priv->last_tstamp = tstamp;
}
300
/* Record the start element just handled in this transaction batch. */
static void nft_rbtree_set_start_cookie(struct nft_rbtree *priv,
					const struct nft_rbtree_elem *rbe)
{
	priv->start_rbe_cookie = (unsigned long)rbe;
}
306
/* True if @rbe is the start element recorded in the cookie. */
static bool nft_rbtree_cmp_start_cookie(struct nft_rbtree *priv,
					const struct nft_rbtree_elem *rbe)
{
	return priv->start_rbe_cookie == (unsigned long)rbe;
}
312
/* Called when a new end element matched an existing end element: check
 * whether it pairs with the start element recorded in the cookie, i.e.
 * both halves of the same interval are being added in this transaction.
 * Returns false to report a partial overlap. The cookie is consumed in
 * either case.
 */
static bool nft_rbtree_insert_same_interval(const struct net *net,
					    struct nft_rbtree *priv,
					    struct nft_rbtree_elem *rbe)
{
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *next_rbe;

	/* No start element recorded in this batch: nothing to validate. */
	if (!priv->start_rbe_cookie)
		return true;

	next_rbe = nft_rbtree_next_active(rbe, genmask);
	if (next_rbe) {
		/* Closest start element differs from last element added. */
		if (nft_rbtree_interval_start(next_rbe) &&
		    nft_rbtree_cmp_start_cookie(priv, next_rbe)) {
			priv->start_rbe_cookie = 0;
			return true;
		}
	}

	priv->start_rbe_cookie = 0;

	return false;
}
337
/* Insert @new into the tree after checking for overlaps against existing
 * active elements. A full overlap sets *elem_priv to the clashing element
 * and returns -EEXIST (cleared by the caller unless NLM_F_EXCL); partial
 * overlaps return -ENOTEMPTY. -EAGAIN asks the caller to retry because
 * inline garbage collection removed a node this walk had already recorded.
 * Caller holds priv->lock for writing.
 */
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_elem_priv **elem_priv, u64 tstamp)
{
	struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL, *rbe_prev;
	struct rb_node *node, *next, *parent, **p, *first = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 cur_genmask = nft_genmask_cur(net);
	u8 genmask = nft_genmask_next(net);
	int d;

	/* Descend the tree to search for an existing element greater than the
	 * key value to insert that is greater than the new element. This is the
	 * first element to walk the ordered elements to find possible overlap.
	 */
	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = nft_rbtree_cmp(set, rbe, new);

		if (d < 0) {
			p = &parent->rb_left;
		} else if (d > 0) {
			if (!first ||
			    nft_rbtree_update_first(set, rbe, first))
				first = &rbe->node;

			p = &parent->rb_right;
		} else {
			if (nft_rbtree_interval_end(rbe))
				p = &parent->rb_left;
			else
				p = &parent->rb_right;
		}
	}

	if (!first)
		first = rb_first(&priv->root);

	/* Detect overlap by going through the list of valid tree nodes.
	 * Values stored in the tree are in reversed order, starting from
	 * highest to lowest value.
	 */
	for (node = first; node != NULL; node = next) {
		next = rb_next(node);

		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (!nft_set_elem_active(&rbe->ext, genmask))
			continue;

		/* perform garbage collection to avoid bogus overlap reports
		 * but skip new elements in this transaction.
		 */
		if (__nft_set_elem_expired(&rbe->ext, tstamp) &&
		    nft_set_elem_active(&rbe->ext, cur_genmask)) {
			const struct nft_rbtree_elem *removed_end;

			removed_end = nft_rbtree_gc_elem(set, priv, rbe);
			if (IS_ERR(removed_end))
				return PTR_ERR(removed_end);

			/* GC removed a node this walk already recorded:
			 * restart the whole insertion attempt.
			 */
			if (removed_end == rbe_le || removed_end == rbe_ge)
				return -EAGAIN;

			continue;
		}

		d = nft_rbtree_cmp(set, rbe, new);
		if (d == 0) {
			/* Matching end element: no need to look for an
			 * overlapping greater or equal element.
			 */
			if (nft_rbtree_interval_end(rbe)) {
				rbe_le = rbe;
				break;
			}

			/* first element that is greater or equal to key value. */
			if (!rbe_ge) {
				rbe_ge = rbe;
				continue;
			}

			/* this is a closer more or equal element, update it. */
			if (nft_rbtree_cmp(set, rbe_ge, new) != 0) {
				rbe_ge = rbe;
				continue;
			}

			/* element is equal to key value, make sure flags are
			 * the same, an existing more or equal start element
			 * must not be replaced by more or equal end element.
			 */
			if ((nft_rbtree_interval_start(new) &&
			     nft_rbtree_interval_start(rbe_ge)) ||
			    (nft_rbtree_interval_end(new) &&
			     nft_rbtree_interval_end(rbe_ge))) {
				rbe_ge = rbe;
				continue;
			}
		} else if (d > 0) {
			/* annotate element greater than the new element. */
			rbe_ge = rbe;
			continue;
		} else if (d < 0) {
			/* annotate element less than the new element. */
			rbe_le = rbe;
			break;
		}
	}

	/* Null intervals and new start elements invalidate any start-element
	 * cookie recorded earlier in this transaction.
	 */
	if (nft_rbtree_interval_null(set, new))
		priv->start_rbe_cookie = 0;
	else if (nft_rbtree_interval_start(new) && priv->start_rbe_cookie)
		priv->start_rbe_cookie = 0;

	/* - new start element matching existing start element: full overlap
	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
	 */
	if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
	    nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
		*elem_priv = &rbe_ge->priv;
		nft_rbtree_set_start_cookie(priv, rbe_ge);
		return -EEXIST;
	}

	/* - new end element matching existing end element: full overlap
	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
	 */
	if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
	    nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
		/* - ignore null interval, otherwise NLM_F_CREATE bogusly
		 *   reports EEXIST.
		 */
		if (nft_rbtree_interval_null(set, new))
			return -ECANCELED;

		*elem_priv = &rbe_le->priv;

		/* - start and end element belong to the same interval. */
		if (!nft_rbtree_insert_same_interval(net, priv, rbe_le))
			return -ENOTEMPTY;

		return -EEXIST;
	}

	/* - new start element with existing closest, less or equal key value
	 *   being a start element: partial overlap, reported as -ENOTEMPTY.
	 *   Anonymous sets allow for two consecutive start element since they
	 *   are constant, but validate that this new start element does not
	 *   sit in between an existing start and end elements: partial overlap,
	 *   reported as -ENOTEMPTY.
	 */
	if (rbe_le &&
	    nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) {
		if (!nft_set_is_anonymous(set))
			return -ENOTEMPTY;

		rbe_prev = nft_rbtree_prev_active(rbe_le);
		if (rbe_prev && nft_rbtree_interval_end(rbe_prev))
			return -ENOTEMPTY;
	}

	/* - new end element with existing closest, less or equal key value
	 *   being a end element: partial overlap, reported as -ENOTEMPTY.
	 */
	if (rbe_le &&
	    nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new))
		return -ENOTEMPTY;

	/* - new end element with existing closest, greater or equal key value
	 *   being an end element: partial overlap, reported as -ENOTEMPTY
	 */
	if (rbe_ge &&
	    nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
		return -ENOTEMPTY;

	/* Accepted element: pick insertion point depending on key value */
	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = nft_rbtree_cmp(set, rbe, new);

		if (d < 0)
			p = &parent->rb_left;
		else if (d > 0)
			p = &parent->rb_right;
		else if (nft_rbtree_interval_end(rbe))
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}

	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}
540
/* (Re)allocate the interval slots of @array to hold @max_intervals
 * zeroed entries, replacing any previous allocation.
 * Returns 0 on success or -ENOMEM.
 */
static int nft_array_intervals_alloc(struct nft_array *array, u32 max_intervals)
{
	struct nft_array_interval *intervals;

	intervals = kvzalloc_objs(struct nft_array_interval, max_intervals,
				  GFP_KERNEL_ACCOUNT);
	if (!intervals)
		return -ENOMEM;

	/* kvfree(NULL) is a no-op, no need to test for a prior allocation. */
	kvfree(array->intervals);

	array->intervals = intervals;
	array->max_intervals = max_intervals;

	return 0;
}
558
/* Allocate a new lookup blob with capacity for @max_intervals intervals.
 * Returns NULL on allocation failure.
 */
static struct nft_array *nft_array_alloc(u32 max_intervals)
{
	struct nft_array *array;

	array = kzalloc_obj(*array, GFP_KERNEL_ACCOUNT);
	if (!array)
		return NULL;

	if (nft_array_intervals_alloc(array, max_intervals) < 0)
		goto err_free_array;

	return array;

err_free_array:
	kfree(array);
	return NULL;
}
574
/* Upper bound on the number of array slots needed for this set.
 *
 * Similar to nft_rbtree_{u,k}size to hide details to userspace, but consider
 * packed representation coming from userspace for anonymous sets too.
 */
static u32 nft_array_elems(const struct nft_set *set)
{
	u32 nelems = atomic_read(&set->nelems) - set->ndeact;

	/* Adjacent intervals are represented with a single start element in
	 * anonymous sets, use the current element counter as is.
	 */
	if (nft_set_is_anonymous(set))
		return nelems;

	/* Add extra room for never matching interval at the beginning and open
	 * interval at the end which only use a single element to represent it.
	 * The conversion to array will compact intervals, this allows reduce
	 * memory consumption.
	 */
	return (nelems / 2) + 2;
}
595
/* Capacity seeds for the interval array. Anonymous sets start small and
 * double until the slot count reaches NFT_ARRAY_INITIAL_ANON_THRESH
 * (8KiB worth of slots); beyond that, growth is by 1.5x as for named sets.
 */
#define NFT_ARRAY_INITIAL_SIZE 1024
#define NFT_ARRAY_INITIAL_ANON_SIZE 16
#define NFT_ARRAY_INITIAL_ANON_THRESH (8192U / sizeof(struct nft_array_interval))

/* Ensure the next-generation interval array has enough capacity for the
 * current element count: grow when needed, shrink named sets when usage
 * drops well below capacity, and reset capacity on set flush.
 * Returns 0 on success or a negative error.
 */
static int nft_array_may_resize(const struct nft_set *set, bool flush)
{
	u32 initial_intervals, max_intervals, new_max_intervals, delta;
	u32 shrinked_max_intervals, nelems = nft_array_elems(set);
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_array *array;

	if (nft_set_is_anonymous(set))
		initial_intervals = NFT_ARRAY_INITIAL_ANON_SIZE;
	else
		initial_intervals = NFT_ARRAY_INITIAL_SIZE;

	/* Base the decision on the blob under construction if one exists,
	 * otherwise on the published blob, otherwise start from scratch.
	 */
	if (priv->array_next) {
		max_intervals = priv->array_next->max_intervals;
		new_max_intervals = priv->array_next->max_intervals;
	} else {
		if (priv->array) {
			max_intervals = priv->array->max_intervals;
			new_max_intervals = priv->array->max_intervals;
		} else {
			max_intervals = 0;
			new_max_intervals = initial_intervals;
		}
	}

	/* Anonymous sets never take the shrink/flush paths below. */
	if (nft_set_is_anonymous(set))
		goto maybe_grow;

	if (flush) {
		/* Set flush just started, nelems still report elements.*/
		nelems = 0;
		new_max_intervals = NFT_ARRAY_INITIAL_SIZE;
		goto realloc_array;
	}

	/* Shrink to 2/3 of current capacity (rounded up) when occupancy has
	 * dropped below that mark, but never below the initial size.
	 */
	if (check_add_overflow(new_max_intervals, new_max_intervals,
			       &shrinked_max_intervals))
		return -EOVERFLOW;

	shrinked_max_intervals = DIV_ROUND_UP(shrinked_max_intervals, 3);

	if (shrinked_max_intervals > NFT_ARRAY_INITIAL_SIZE &&
	    nelems < shrinked_max_intervals) {
		new_max_intervals = shrinked_max_intervals;
		goto realloc_array;
	}
maybe_grow:
	if (nelems > new_max_intervals) {
		if (nft_set_is_anonymous(set) &&
		    new_max_intervals < NFT_ARRAY_INITIAL_ANON_THRESH) {
			new_max_intervals <<= 1;
		} else {
			/* grow by 1.5x, guarding against u32 overflow. */
			delta = new_max_intervals >> 1;
			if (check_add_overflow(new_max_intervals, delta,
					       &new_max_intervals))
				return -EOVERFLOW;
		}
	}

realloc_array:
	if (WARN_ON_ONCE(nelems > new_max_intervals))
		return -ENOMEM;

	if (priv->array_next) {
		if (max_intervals == new_max_intervals)
			return 0;

		if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0)
			return -ENOMEM;
	} else {
		array = nft_array_alloc(new_max_intervals);
		if (!array)
			return -ENOMEM;

		priv->array_next = array;
	}

	return 0;
}
679
/* Insert entry point: pre-size the next-generation lookup array, then
 * retry __nft_rbtree_insert() as long as it reports -EAGAIN (its overlap
 * walk was invalidated by inline garbage collection). Interruptible by
 * fatal signals.
 */
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			     const struct nft_set_elem *elem,
			     struct nft_elem_priv **elem_priv)
{
	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
	struct nft_rbtree *priv = nft_set_priv(set);
	u64 tstamp = nft_net_tstamp(net);
	int err;

	nft_rbtree_maybe_reset_start_cookie(priv, tstamp);

	if (nft_array_may_resize(set, false) < 0)
		return -ENOMEM;

	do {
		if (fatal_signal_pending(current))
			return -EINTR;

		cond_resched();

		write_lock(&priv->lock);
		err = __nft_rbtree_insert(net, set, rbe, elem_priv, tstamp);
		write_unlock(&priv->lock);
	} while (err == -EAGAIN);

	return err;
}
707
/* Unlink the element from the tree under the write lock; the element
 * itself is released by the caller.
 */
static void nft_rbtree_remove(const struct net *net,
			      const struct nft_set *set,
			      struct nft_elem_priv *elem_priv)
{
	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
	struct nft_rbtree *priv = nft_set_priv(set);

	write_lock(&priv->lock);
	rb_erase(&rbe->node, &priv->root);
	write_unlock(&priv->lock);
}
719
/* Re-activate an element via nft_clear() (e.g. on transaction abort). */
static void nft_rbtree_activate(const struct net *net,
				const struct nft_set *set,
				struct nft_elem_priv *elem_priv)
{
	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);

	nft_clear(net, &rbe->ext);
}
728
729 static struct nft_rbtree_elem *
nft_rbtree_next_inactive(struct nft_rbtree_elem * rbe,u8 genmask)730 nft_rbtree_next_inactive(struct nft_rbtree_elem *rbe, u8 genmask)
731 {
732 struct nft_rbtree_elem *next_rbe;
733 struct rb_node *node;
734
735 node = rb_next(&rbe->node);
736 if (node) {
737 next_rbe = rb_entry(node, struct nft_rbtree_elem, node);
738 if (nft_rbtree_interval_start(next_rbe) &&
739 !nft_set_elem_active(&next_rbe->ext, genmask))
740 return next_rbe;
741 }
742
743 return NULL;
744 }
745
/* Removal-side counterpart of nft_rbtree_insert_same_interval(): check
 * whether the end element being deactivated pairs with the start element
 * recorded in the cookie. Returns false to signal an inconsistent
 * interval. The cookie is consumed in either case.
 */
static bool nft_rbtree_deactivate_same_interval(const struct net *net,
						struct nft_rbtree *priv,
						struct nft_rbtree_elem *rbe)
{
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *next_rbe;

	/* No start element recorded in this batch: nothing to validate. */
	if (!priv->start_rbe_cookie)
		return true;

	next_rbe = nft_rbtree_next_inactive(rbe, genmask);
	if (next_rbe) {
		/* Closest start element differs from last element added. */
		if (nft_rbtree_interval_start(next_rbe) &&
		    nft_rbtree_cmp_start_cookie(priv, next_rbe)) {
			priv->start_rbe_cookie = 0;
			return true;
		}
	}

	priv->start_rbe_cookie = 0;

	return false;
}
770
/* Toggle the element's next-generation active state; used by both the
 * flush path and nft_rbtree_deactivate().
 */
static void nft_rbtree_flush(const struct net *net,
			     const struct nft_set *set,
			     struct nft_elem_priv *elem_priv)
{
	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);

	nft_set_elem_change_active(net, set, &rbe->ext);
}
779
/* Find the live element matching @elem and mark it inactive in the next
 * generation. Returns its priv pointer, or NULL when no suitable element
 * exists or the interval bookkeeping detects an inconsistency.
 */
static struct nft_elem_priv *
nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
		      const struct nft_set_elem *elem)
{
	struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv);
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	u8 genmask = nft_genmask_next(net);
	u64 tstamp = nft_net_tstamp(net);
	int d;

	nft_rbtree_maybe_reset_start_cookie(priv, tstamp);

	/* Start and null-interval elements invalidate a recorded cookie. */
	if (nft_rbtree_interval_start(this) ||
	    nft_rbtree_interval_null(set, this))
		priv->start_rbe_cookie = 0;

	/* Resize the next-generation lookup array if needed. */
	if (nft_array_may_resize(set, false) < 0)
		return NULL;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
			   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			/* Equal keys: disambiguate by interval flag, expiry
			 * and generation mask.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(this)) {
				parent = parent->rb_left;
				continue;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (__nft_set_elem_expired(&rbe->ext, tstamp)) {
				break;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = parent->rb_left;
				continue;
			}

			if (nft_rbtree_interval_start(rbe))
				nft_rbtree_set_start_cookie(priv, rbe);
			else if (!nft_rbtree_deactivate_same_interval(net, priv, rbe))
				return NULL;

			nft_rbtree_flush(net, set, &rbe->priv);
			return &rbe->priv;
		}
	}
	return NULL;
}
836
nft_rbtree_do_walk(const struct nft_ctx * ctx,struct nft_set * set,struct nft_set_iter * iter)837 static void nft_rbtree_do_walk(const struct nft_ctx *ctx,
838 struct nft_set *set,
839 struct nft_set_iter *iter)
840 {
841 struct nft_rbtree *priv = nft_set_priv(set);
842 struct nft_rbtree_elem *rbe;
843 struct rb_node *node;
844
845 for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
846 rbe = rb_entry(node, struct nft_rbtree_elem, node);
847
848 if (iter->count < iter->skip)
849 goto cont;
850
851 iter->err = iter->fn(ctx, set, iter, &rbe->priv);
852 if (iter->err < 0)
853 return;
854 cont:
855 iter->count++;
856 }
857 }
858
/* Walk dispatcher: UPDATE walks run under the commit mutex; READ walks
 * take the read lock; UPDATE_CLONE additionally resets the lookup array
 * capacity because a set flush is starting.
 */
static void nft_rbtree_walk(const struct nft_ctx *ctx,
			    struct nft_set *set,
			    struct nft_set_iter *iter)
{
	struct nft_rbtree *priv = nft_set_priv(set);

	switch (iter->type) {
	case NFT_ITER_UPDATE_CLONE:
		if (nft_array_may_resize(set, true) < 0) {
			iter->err = -ENOMEM;
			break;
		}
		fallthrough;
	case NFT_ITER_UPDATE:
		lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);

		nft_rbtree_do_walk(ctx, set, iter);
		break;
	case NFT_ITER_READ:
		read_lock(&priv->lock);
		nft_rbtree_do_walk(ctx, set, iter);
		read_unlock(&priv->lock);
		break;
	default:
		iter->err = -EINVAL;
		WARN_ON_ONCE(1);
		break;
	}
}
888
/* Scan the whole tree and move expired elements (together with their end
 * elements) to the ->expired list under the write lock; they are queued
 * for release later by nft_rbtree_gc_queue().
 */
static void nft_rbtree_gc_scan(struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe, *rbe_end = NULL;
	struct net *net = read_pnet(&set->net);
	u64 tstamp = nft_net_tstamp(net);
	struct rb_node *node, *next;

	for (node = rb_first(&priv->root); node ; node = next) {
		next = rb_next(node);

		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		/* elements are reversed in the rbtree for historical reasons,
		 * from highest to lowest value, that is why end element is
		 * always visited before the start element.
		 */
		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!__nft_set_elem_expired(&rbe->ext, tstamp))
			continue;

		/* end element needs to be removed first, it has
		 * no timeout extension.
		 */
		write_lock(&priv->lock);
		if (rbe_end) {
			nft_rbtree_gc_elem_move(net, set, priv, rbe_end);
			rbe_end = NULL;
		}

		nft_rbtree_gc_elem_move(net, set, priv, rbe);
		write_unlock(&priv->lock);
	}

	priv->last_gc = jiffies;
}
928
/* Hand the elements collected by nft_rbtree_gc_scan() over to the
 * transaction GC infrastructure so they are released after readers of the
 * old lookup blob are done.
 */
static void nft_rbtree_gc_queue(struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe, *rbe_end;
	struct nft_trans_gc *gc;

	if (list_empty(&priv->expired))
		return;

	gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
	if (!gc)
		return;

	/* rbe_end only serves as the safe-iteration lookahead here. */
	list_for_each_entry_safe(rbe, rbe_end, &priv->expired, list) {
		list_del(&rbe->list);
		nft_trans_gc_elem_add(gc, rbe);

		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
		if (!gc)
			return;
	}

	gc = nft_trans_gc_catchall_sync(gc);
	nft_trans_gc_queue_sync_done(gc);
}
954
/* Size of the per-set private area; netlink attributes are not used. */
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	return sizeof(struct nft_rbtree);
}
960
/* Set instantiation: initialize tree, lock and GC state; lookup arrays
 * are allocated lazily on the first insert/commit.
 */
static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	/* priv must stay the first member of nft_rbtree_elem so that the
	 * nft_elem_priv_cast() conversions used throughout this file hold.
	 */
	BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0);

	rwlock_init(&priv->lock);
	priv->root = RB_ROOT;
	INIT_LIST_HEAD(&priv->expired);

	priv->array = NULL;
	priv->array_next = NULL;

	return 0;
}
978
/* Free a lookup blob and its interval slots. */
static void __nft_array_free(struct nft_array *array)
{
	kvfree(array->intervals);
	kfree(array);
}
984
/* Set teardown: release queued-expired elements, every tree element, and
 * both the published and under-construction lookup blobs.
 */
static void nft_rbtree_destroy(const struct nft_ctx *ctx,
			       const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe, *next;
	struct nft_array *array;
	struct rb_node *node;

	list_for_each_entry_safe(rbe, next, &priv->expired, list) {
		list_del(&rbe->list);
		nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
	}

	while ((node = priv->root.rb_node) != NULL) {
		rb_erase(node, &priv->root);
		rbe = rb_entry(node, struct nft_rbtree_elem, node);
		nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
	}

	/* No concurrent users remain at this point, plain access is fine. */
	array = rcu_dereference_protected(priv->array, true);
	if (array)
		__nft_array_free(array);
	if (priv->array_next)
		__nft_array_free(priv->array_next);
}
1010
nft_rbtree_estimate(const struct nft_set_desc * desc,u32 features,struct nft_set_estimate * est)1011 static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
1012 struct nft_set_estimate *est)
1013 {
1014 if (desc->field_count > 1)
1015 return false;
1016
1017 if (desc->size)
1018 est->size = sizeof(struct nft_rbtree) +
1019 desc->size * sizeof(struct nft_rbtree_elem);
1020 else
1021 est->size = ~0;
1022
1023 est->lookup = NFT_SET_CLASS_O_LOG_N;
1024 est->space = NFT_SET_CLASS_O_N;
1025
1026 return true;
1027 }
1028
/* RCU callback: free a replaced lookup blob once readers are done. */
static void nft_array_free_rcu(struct rcu_head *rcu_head)
{
	struct nft_array *array = container_of(rcu_head, struct nft_array, rcu_head);

	__nft_array_free(array);
}
1035
/* Transaction commit: rebuild the binary-search interval array from the
 * rbtree contents and publish it via RCU. Runs under the transaction
 * (commit) mutex, see the lockdep expression on rcu_replace_pointer().
 */
static void nft_rbtree_commit(struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe, *prev_rbe;
	struct nft_array *old;
	u32 num_intervals = 0;
	struct rb_node *node;

	/* No changes, skip, eg. elements updates only. */
	if (!priv->array_next)
		return;

	/* GC can be performed if the binary search blob is going
	 * to be rebuilt. It has to be done in two phases: first
	 * scan tree and move all expired elements to the expired
	 * list.
	 *
	 * Then, after blob has been re-built and published to other
	 * CPUs, queue collected entries for freeing.
	 */
	if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
		nft_rbtree_gc_scan(set);

	/* Reverse walk to create an array from smaller to largest interval. */
	node = rb_last(&priv->root);
	if (node)
		prev_rbe = rb_entry(node, struct nft_rbtree_elem, node);
	else
		prev_rbe = NULL;

	while (prev_rbe) {
		rbe = prev_rbe;

		/* Pair each interval into one slot: the start element fills
		 * .from, the end element fills .to and moves on to the next
		 * slot.
		 */
		if (nft_rbtree_interval_start(rbe))
			priv->array_next->intervals[num_intervals].from = &rbe->ext;
		else if (nft_rbtree_interval_end(rbe))
			priv->array_next->intervals[num_intervals++].to = &rbe->ext;

		/* Bail out before indexing past the preallocated array;
		 * this should only trigger on a malformed tree.
		 */
		if (num_intervals >= priv->array_next->max_intervals) {
			pr_warn_once("malformed interval set from userspace?");
			goto err_out;
		}

		node = rb_prev(node);
		if (!node)
			break;

		prev_rbe = rb_entry(node, struct nft_rbtree_elem, node);

		/* For anonymous sets, when adjacent ranges are found,
		 * the end element is not added to the set to pack the set
		 * representation. Use next start element to complete this
		 * interval.
		 */
		if (nft_rbtree_interval_start(rbe) &&
		    nft_rbtree_interval_start(prev_rbe) &&
		    priv->array_next->intervals[num_intervals].from)
			priv->array_next->intervals[num_intervals++].to = &prev_rbe->ext;

		if (num_intervals >= priv->array_next->max_intervals) {
			pr_warn_once("malformed interval set from userspace?");
			goto err_out;
		}
	}

	/* Count a trailing slot whose start was recorded but never closed
	 * by an end element.
	 * NOTE(review): this relies on intervals[] being zero-initialized
	 * at allocation time so that an unused .from is NULL — confirm at
	 * the allocation site (not visible in this chunk).
	 */
	if (priv->array_next->intervals[num_intervals].from)
		num_intervals++;
err_out:
	/* Publish the rebuilt array; on the error path above this still
	 * publishes whatever was consistently filled in so far.
	 */
	priv->array_next->num_intervals = num_intervals;
	old = rcu_replace_pointer(priv->array, priv->array_next,
				  lockdep_is_held(&nft_pernet(read_pnet(&set->net))->commit_mutex));
	priv->array_next = NULL;
	if (old)
		call_rcu(&old->rcu_head, nft_array_free_rcu);

	/* New blob is public, queue collected entries for freeing.
	 * call_rcu ensures elements stay around until readers are done.
	 */
	nft_rbtree_gc_queue(set);
}
1116
nft_rbtree_abort(const struct nft_set * set)1117 static void nft_rbtree_abort(const struct nft_set *set)
1118 {
1119 struct nft_rbtree *priv = nft_set_priv(set);
1120 struct nft_array *array_next;
1121
1122 if (!priv->array_next)
1123 return;
1124
1125 array_next = priv->array_next;
1126 priv->array_next = NULL;
1127 __nft_array_free(array_next);
1128 }
1129
nft_rbtree_gc_init(const struct nft_set * set)1130 static void nft_rbtree_gc_init(const struct nft_set *set)
1131 {
1132 struct nft_rbtree *priv = nft_set_priv(set);
1133
1134 priv->last_gc = jiffies;
1135 }
1136
1137 /* rbtree stores ranges as singleton elements, each range is composed of two
1138 * elements ...
1139 */
nft_rbtree_ksize(u32 size)1140 static u32 nft_rbtree_ksize(u32 size)
1141 {
1142 return size * 2;
1143 }
1144
1145 /* ... hide this detail to userspace. */
nft_rbtree_usize(u32 size)1146 static u32 nft_rbtree_usize(u32 size)
1147 {
1148 if (!size)
1149 return 0;
1150
1151 return size / 2;
1152 }
1153
nft_rbtree_adjust_maxsize(const struct nft_set * set)1154 static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set)
1155 {
1156 struct nft_rbtree *priv = nft_set_priv(set);
1157 struct nft_rbtree_elem *rbe;
1158 struct rb_node *node;
1159 const void *key;
1160
1161 node = rb_last(&priv->root);
1162 if (!node)
1163 return 0;
1164
1165 rbe = rb_entry(node, struct nft_rbtree_elem, node);
1166 if (!nft_rbtree_interval_end(rbe))
1167 return 0;
1168
1169 key = nft_set_ext_key(&rbe->ext);
1170 if (memchr(key, 1, set->klen))
1171 return 0;
1172
1173 /* this is the all-zero no-match element. */
1174 return 1;
1175 }
1176
/* Interval set backend: an rbtree drives control-plane updates, while
 * commit publishes an RCU-protected sorted interval array used for
 * binary-search lookups in the packet path.
 */
const struct nft_set_type nft_set_rbtree_type = {
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		/* Transaction hooks: rebuild/publish or discard the array. */
		.commit		= nft_rbtree_commit,
		.abort		= nft_rbtree_abort,
		.gc_init	= nft_rbtree_gc_init,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
		/* Each user-visible range maps to two stored elements. */
		.ksize		= nft_rbtree_ksize,
		.usize		= nft_rbtree_usize,
		.adjust_maxsize	= nft_rbtree_adjust_maxsize,
	},
};
1201