xref: /linux/net/netfilter/nft_set_rbtree.c (revision a5210135489ae7bc1ef1cb4a8157361dd7b468cd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
4  *
5  * Development of this code funded by Astaro AG (http://www.astaro.com/)
6  */
7 
8 #include <linux/kernel.h>
9 #include <linux/init.h>
10 #include <linux/module.h>
11 #include <linux/list.h>
12 #include <linux/rbtree.h>
13 #include <linux/bsearch.h>
14 #include <linux/netlink.h>
15 #include <linux/netfilter.h>
16 #include <linux/netfilter/nf_tables.h>
17 #include <net/netfilter/nf_tables_core.h>
18 
/* One interval of the flat lookup array: matches keys in [from, to).
 * A NULL 'from' marks a never-matching sentinel entry; a NULL 'to'
 * marks an open-ended interval (no upper bound).
 */
struct nft_array_interval {
	struct nft_set_ext	*from;	/* start element extension */
	struct nft_set_ext	*to;	/* end element extension, or NULL */
};
23 
/* Flat, sorted interval array used for binary-search lookups. */
struct nft_array {
	u32			max_intervals;	/* allocated capacity */
	u32			num_intervals;	/* entries currently in use */
	struct nft_array_interval *intervals;	/* sorted interval storage */
	struct rcu_head		rcu_head;	/* for deferred release */
};
30 
/* Per-set private data: the rbtree holds the authoritative elements
 * (stored in reverse order, highest to lowest key), the RCU-protected
 * array is the read-side lookup representation.
 */
struct nft_rbtree {
	struct rb_root		root;		/* interval elements, reverse-ordered */
	rwlock_t		lock;		/* protects root and expired list */
	struct nft_array __rcu	*array;		/* current lookup array (datapath) */
	struct nft_array	*array_next;	/* array being prepared for next commit */
	unsigned long		start_rbe_cookie; /* last start element, as opaque cookie */
	unsigned long		last_gc;	/* jiffies of last GC scan */
	struct list_head	expired;	/* elements awaiting GC collection */
	u64			last_tstamp;	/* transaction tstamp, resets the cookie */
};
41 
struct nft_rbtree_elem {
	struct nft_elem_priv	priv;	/* must stay first, see BUILD_BUG_ON in init */
	union {
		struct rb_node	node;	/* linkage while in the tree */
		struct list_head list;	/* linkage on the expired list after rb_erase */
	};
	struct nft_set_ext	ext;
};
50 
nft_rbtree_interval_end(const struct nft_rbtree_elem * rbe)51 static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
52 {
53 	return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
54 	       (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
55 }
56 
nft_rbtree_interval_start(const struct nft_rbtree_elem * rbe)57 static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
58 {
59 	return !nft_rbtree_interval_end(rbe);
60 }
61 
nft_rbtree_interval_null(const struct nft_set * set,const struct nft_rbtree_elem * rbe)62 static bool nft_rbtree_interval_null(const struct nft_set *set,
63 				     const struct nft_rbtree_elem *rbe)
64 {
65 	return (!memchr_inv(nft_set_ext_key(&rbe->ext), 0, set->klen) &&
66 		nft_rbtree_interval_end(rbe));
67 }
68 
/* Compare the keys of two elements, memcmp() semantics (<0, 0, >0). */
static int nft_rbtree_cmp(const struct nft_set *set,
			  const struct nft_rbtree_elem *e1,
			  const struct nft_rbtree_elem *e2)
{
	return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext),
		      set->klen);
}
76 
/* bsearch() key context for the datapath array lookup. */
struct nft_array_lookup_ctx {
	const u32	*key;	/* key to look up */
	u32		klen;	/* key length in bytes */
};
81 
nft_array_lookup_cmp(const void * pkey,const void * entry)82 static int nft_array_lookup_cmp(const void *pkey, const void *entry)
83 {
84 	const struct nft_array_interval *interval = entry;
85 	const struct nft_array_lookup_ctx *ctx = pkey;
86 	int a, b;
87 
88 	if (!interval->from)
89 		return 1;
90 
91 	a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen);
92 	if (!interval->to)
93 		b = -1;
94 	else
95 		b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen);
96 
97 	if (a >= 0 && b < 0)
98 		return 0;
99 
100 	if (a < 0)
101 		return -1;
102 
103 	return 1;
104 }
105 
/* Datapath lookup: binary search over the RCU-protected interval array.
 * Returns the extension of the matching interval's start element, or
 * NULL when there is no match, the array is not (yet) published, or the
 * matching element has expired. Runs under RCU read side.
 */
INDIRECT_CALLABLE_SCOPE
const struct nft_set_ext *
nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
		  const u32 *key)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_array *array = rcu_dereference(priv->array);
	const struct nft_array_interval *interval;
	struct nft_array_lookup_ctx ctx = {
		.key	= key,
		.klen	= set->klen,
	};

	/* no array published yet, e.g. before the first commit. */
	if (!array)
		return NULL;

	interval = bsearch(&ctx, array->intervals, array->num_intervals,
			   sizeof(struct nft_array_interval),
			   nft_array_lookup_cmp);
	if (!interval || nft_set_elem_expired(interval->from))
		return NULL;

	return interval->from;
}
130 
/* bsearch() key context for the control-plane get operation; unlike the
 * lookup context it carries the request flags (NFT_SET_ELEM_INTERVAL_END).
 */
struct nft_array_get_ctx {
	const u32	*key;	/* key to look up */
	unsigned int	flags;	/* NFT_SET_ELEM_* request flags */
	u32		klen;	/* key length in bytes */
};
136 
/* bsearch() comparator for the get operation. Same interval semantics as
 * nft_array_lookup_cmp(), except that when NFT_SET_ELEM_INTERVAL_END is
 * requested a key equal to the end boundary (b == 0) also matches, so the
 * end element itself can be retrieved.
 */
static int nft_array_get_cmp(const void *pkey, const void *entry)
{
	const struct nft_array_interval *interval = entry;
	const struct nft_array_get_ctx *ctx = pkey;
	int a, b;

	/* sentinel entry with no start element never matches. */
	if (!interval->from)
		return 1;

	a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen);
	if (!interval->to)
		b = -1;	/* open-ended interval: key is always below the end. */
	else
		b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen);

	if (a >= 0) {
		if (ctx->flags & NFT_SET_ELEM_INTERVAL_END && b <= 0)
			return 0;
		else if (b < 0)
			return 0;
	}

	if (a < 0)
		return -1;

	return 1;
}
164 
/* Control-plane get: find the interval covering the requested key and
 * return the corresponding element. With NFT_SET_ELEM_INTERVAL_END set,
 * the interval's end element is returned instead of its start element.
 */
static struct nft_elem_priv *
nft_rbtree_get(const struct net *net, const struct nft_set *set,
	       const struct nft_set_elem *elem, unsigned int flags)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_array *array = rcu_dereference(priv->array);
	const struct nft_array_interval *interval;
	struct nft_array_get_ctx ctx = {
		.key	= (const u32 *)&elem->key.val,
		.flags	= flags,
		.klen	= set->klen,
	};
	struct nft_rbtree_elem *rbe;

	if (!array)
		return ERR_PTR(-ENOENT);

	interval = bsearch(&ctx, array->intervals, array->num_intervals,
			   sizeof(struct nft_array_interval), nft_array_get_cmp);
	if (!interval || nft_set_elem_expired(interval->from))
		return ERR_PTR(-ENOENT);

	/* NOTE(review): interval->to may be NULL for an open-ended interval
	 * (nft_array_get_cmp still matches it with the END flag when b <= 0);
	 * container_of() on a NULL 'to' would yield a bogus pointer — confirm
	 * callers cannot request INTERVAL_END on the open interval.
	 */
	if (flags & NFT_SET_ELEM_INTERVAL_END)
		rbe = container_of(interval->to, struct nft_rbtree_elem, ext);
	else
		rbe = container_of(interval->from, struct nft_rbtree_elem, ext);

	return &rbe->priv;
}
194 
/* Unlink an expired element from the tree and park it on the expired list;
 * the element is freed later from the commit path (nft_rbtree_gc_queue).
 * Caller must hold priv->lock for writing.
 */
static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set,
				    struct nft_rbtree *priv,
				    struct nft_rbtree_elem *rbe)
{
	lockdep_assert_held_write(&priv->lock);
	nft_setelem_data_deactivate(net, set, &rbe->priv);
	rb_erase(&rbe->node, &priv->root);

	/* collected later on in commit callback */
	list_add(&rbe->list, &priv->expired);
}
206 
/* Garbage-collect an expired start element together with its matching end
 * element (if any). Returns the removed end element so the caller can
 * detect that its cached walk pointers were invalidated.
 */
static const struct nft_rbtree_elem *
nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
		   struct nft_rbtree_elem *rbe)
{
	struct nft_set *set = (struct nft_set *)__set;
	struct rb_node *prev = rb_prev(&rbe->node);
	struct net *net = read_pnet(&set->net);
	struct nft_rbtree_elem *rbe_prev;

	/* search for end interval coming before this element.
	 * end intervals don't carry a timeout extension, they
	 * are coupled with the interval start element.
	 */
	while (prev) {
		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
		if (nft_rbtree_interval_end(rbe_prev) &&
		    nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY))
			break;

		prev = rb_prev(prev);
	}

	rbe_prev = NULL;
	if (prev) {
		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
		nft_rbtree_gc_elem_move(net, set, priv, rbe_prev);
	}

	nft_rbtree_gc_elem_move(net, set, priv, rbe);

	return rbe_prev;
}
239 
nft_rbtree_update_first(const struct nft_set * set,struct nft_rbtree_elem * rbe,struct rb_node * first)240 static bool nft_rbtree_update_first(const struct nft_set *set,
241 				    struct nft_rbtree_elem *rbe,
242 				    struct rb_node *first)
243 {
244 	struct nft_rbtree_elem *first_elem;
245 
246 	first_elem = rb_entry(first, struct nft_rbtree_elem, node);
247 	/* this element is closest to where the new element is to be inserted:
248 	 * update the first element for the node list path.
249 	 */
250 	if (nft_rbtree_cmp(set, rbe, first_elem) < 0)
251 		return true;
252 
253 	return false;
254 }
255 
256 /* Only for anonymous sets which do not allow updates, all element are active. */
nft_rbtree_prev_active(struct nft_rbtree_elem * rbe)257 static struct nft_rbtree_elem *nft_rbtree_prev_active(struct nft_rbtree_elem *rbe)
258 {
259 	struct rb_node *node;
260 
261 	node = rb_prev(&rbe->node);
262 	if (!node)
263 		return NULL;
264 
265 	return rb_entry(node, struct nft_rbtree_elem, node);
266 }
267 
268 static struct nft_rbtree_elem *
__nft_rbtree_next_active(struct rb_node * node,u8 genmask)269 __nft_rbtree_next_active(struct rb_node *node, u8 genmask)
270 {
271 	struct nft_rbtree_elem *next_rbe;
272 
273 	while (node) {
274 		next_rbe = rb_entry(node, struct nft_rbtree_elem, node);
275 		if (!nft_set_elem_active(&next_rbe->ext, genmask)) {
276 			node = rb_next(node);
277 			continue;
278 		}
279 
280 		return next_rbe;
281 	}
282 
283 	return NULL;
284 }
285 
/* Next element after @rbe that is active in @genmask, or NULL. */
static struct nft_rbtree_elem *
nft_rbtree_next_active(struct nft_rbtree_elem *rbe, u8 genmask)
{
	return __nft_rbtree_next_active(rb_next(&rbe->node), genmask);
}
291 
/* A different transaction timestamp invalidates the start element cookie. */
static void nft_rbtree_maybe_reset_start_cookie(struct nft_rbtree *priv,
						u64 tstamp)
{
	if (priv->last_tstamp == tstamp)
		return;

	priv->start_rbe_cookie = 0;
	priv->last_tstamp = tstamp;
}
300 
/* Remember @rbe (as an opaque cookie) as the last start element seen. */
static void nft_rbtree_set_start_cookie(struct nft_rbtree *priv,
					const struct nft_rbtree_elem *rbe)
{
	priv->start_rbe_cookie = (unsigned long)rbe;
}
306 
/* True if @rbe is the element recorded in the start cookie. */
static bool nft_rbtree_cmp_start_cookie(struct nft_rbtree *priv,
					const struct nft_rbtree_elem *rbe)
{
	return priv->start_rbe_cookie == (unsigned long)rbe;
}
312 
/* Called for an existing end element matching the new one: decide whether
 * it closes the interval opened by the last start element added (tracked
 * via the start cookie). Returns true when it does (or no cookie is set),
 * false otherwise — the caller then reports -ENOTEMPTY. The cookie is
 * consumed either way.
 */
static bool nft_rbtree_insert_same_interval(const struct net *net,
					    struct nft_rbtree *priv,
					    struct nft_rbtree_elem *rbe)
{
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *next_rbe;

	if (!priv->start_rbe_cookie)
		return true;

	next_rbe = nft_rbtree_next_active(rbe, genmask);
	if (next_rbe) {
		/* Closest active start element is the last element added:
		 * this end element belongs to the same interval.
		 */
		if (nft_rbtree_interval_start(next_rbe) &&
		    nft_rbtree_cmp_start_cookie(priv, next_rbe)) {
			priv->start_rbe_cookie = 0;
			return true;
		}
	}

	priv->start_rbe_cookie = 0;

	return false;
}
337 
/* Insert @new into the tree, checking for overlaps against the active
 * elements. Returns 0 on success; -EEXIST on full overlap (*elem_priv set
 * to the clashing element); -ENOTEMPTY on partial overlap; -ECANCELED for
 * a duplicate null interval; -EAGAIN when garbage collection invalidated
 * the walk state and the caller must retry. Runs under priv->lock held
 * for writing (see nft_rbtree_insert).
 */
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_elem_priv **elem_priv, u64 tstamp)
{
	struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL, *rbe_prev;
	struct rb_node *node, *next, *parent, **p, *first = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 cur_genmask = nft_genmask_cur(net);
	u8 genmask = nft_genmask_next(net);
	int d;

	/* Descend the tree to search for an existing element greater than the
	 * key value to insert that is greater than the new element. This is the
	 * first element to walk the ordered elements to find possible overlap.
	 */
	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = nft_rbtree_cmp(set, rbe, new);

		if (d < 0) {
			p = &parent->rb_left;
		} else if (d > 0) {
			if (!first ||
			    nft_rbtree_update_first(set, rbe, first))
				first = &rbe->node;

			p = &parent->rb_right;
		} else {
			/* equal keys: end elements sort before start elements. */
			if (nft_rbtree_interval_end(rbe))
				p = &parent->rb_left;
			else
				p = &parent->rb_right;
		}
	}

	if (!first)
		first = rb_first(&priv->root);

	/* Detect overlap by going through the list of valid tree nodes.
	 * Values stored in the tree are in reversed order, starting from
	 * highest to lowest value.
	 */
	for (node = first; node != NULL; node = next) {
		next = rb_next(node);

		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (!nft_set_elem_active(&rbe->ext, genmask))
			continue;

		/* perform garbage collection to avoid bogus overlap reports
		 * but skip new elements in this transaction.
		 */
		if (__nft_set_elem_expired(&rbe->ext, tstamp) &&
		    nft_set_elem_active(&rbe->ext, cur_genmask)) {
			const struct nft_rbtree_elem *removed_end;

			removed_end = nft_rbtree_gc_elem(set, priv, rbe);
			if (IS_ERR(removed_end))
				return PTR_ERR(removed_end);

			/* GC removed one of our cached walk pointers: retry. */
			if (removed_end == rbe_le || removed_end == rbe_ge)
				return -EAGAIN;

			continue;
		}

		d = nft_rbtree_cmp(set, rbe, new);
		if (d == 0) {
			/* Matching end element: no need to look for an
			 * overlapping greater or equal element.
			 */
			if (nft_rbtree_interval_end(rbe)) {
				rbe_le = rbe;
				break;
			}

			/* first element that is greater or equal to key value. */
			if (!rbe_ge) {
				rbe_ge = rbe;
				continue;
			}

			/* this is a closer more or equal element, update it. */
			if (nft_rbtree_cmp(set, rbe_ge, new) != 0) {
				rbe_ge = rbe;
				continue;
			}

			/* element is equal to key value, make sure flags are
			 * the same, an existing more or equal start element
			 * must not be replaced by more or equal end element.
			 */
			if ((nft_rbtree_interval_start(new) &&
			     nft_rbtree_interval_start(rbe_ge)) ||
			    (nft_rbtree_interval_end(new) &&
			     nft_rbtree_interval_end(rbe_ge))) {
				rbe_ge = rbe;
				continue;
			}
		} else if (d > 0) {
			/* annotate element greater than the new element. */
			rbe_ge = rbe;
			continue;
		} else if (d < 0) {
			/* annotate element less than the new element. */
			rbe_le = rbe;
			break;
		}
	}

	/* null interval or a second start element invalidates the cookie. */
	if (nft_rbtree_interval_null(set, new))
		priv->start_rbe_cookie = 0;
	else if (nft_rbtree_interval_start(new) && priv->start_rbe_cookie)
		priv->start_rbe_cookie = 0;

	/* - new start element matching existing start element: full overlap
	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
	 */
	if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
	    nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
		*elem_priv = &rbe_ge->priv;
		nft_rbtree_set_start_cookie(priv, rbe_ge);
		return -EEXIST;
	}

	/* - new end element matching existing end element: full overlap
	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
	 */
	if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
	    nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
		/* - ignore null interval, otherwise NLM_F_CREATE bogusly
		 *   reports EEXIST.
		 */
		if (nft_rbtree_interval_null(set, new))
			return -ECANCELED;

		*elem_priv = &rbe_le->priv;

		/* - start and end element belong to the same interval. */
		if (!nft_rbtree_insert_same_interval(net, priv, rbe_le))
			return -ENOTEMPTY;

		return -EEXIST;
	}

	/* - new start element with existing closest, less or equal key value
	 *   being a start element: partial overlap, reported as -ENOTEMPTY.
	 *   Anonymous sets allow for two consecutive start element since they
	 *   are constant, but validate that this new start element does not
	 *   sit in between an existing start and end elements: partial overlap,
	 *   reported as -ENOTEMPTY.
	 */
	if (rbe_le &&
	    nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) {
		if (!nft_set_is_anonymous(set))
			return -ENOTEMPTY;

		rbe_prev = nft_rbtree_prev_active(rbe_le);
		if (rbe_prev && nft_rbtree_interval_end(rbe_prev))
			return -ENOTEMPTY;
	}

	/* - new end element with existing closest, less or equal key value
	 *   being a end element: partial overlap, reported as -ENOTEMPTY.
	 */
	if (rbe_le &&
	    nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new))
		return -ENOTEMPTY;

	/* - new end element with existing closest, greater or equal key value
	 *   being an end element: partial overlap, reported as -ENOTEMPTY
	 */
	if (rbe_ge &&
	    nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
		return -ENOTEMPTY;

	/* Accepted element: pick insertion point depending on key value */
	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = nft_rbtree_cmp(set, rbe, new);

		if (d < 0)
			p = &parent->rb_left;
		else if (d > 0)
			p = &parent->rb_right;
		else if (nft_rbtree_interval_end(rbe))
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}

	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}
540 
/* (Re)allocate the interval backing store of @array for @max_intervals
 * entries. On success the previous storage, if any, is released and 0 is
 * returned; on allocation failure -ENOMEM is returned and the old storage
 * is left untouched.
 */
static int nft_array_intervals_alloc(struct nft_array *array, u32 max_intervals)
{
	struct nft_array_interval *intervals;

	intervals = kvzalloc_objs(struct nft_array_interval, max_intervals,
				  GFP_KERNEL_ACCOUNT);
	if (!intervals)
		return -ENOMEM;

	/* kvfree(NULL) is a no-op, no need to check for a previous array. */
	kvfree(array->intervals);

	array->intervals = intervals;
	array->max_intervals = max_intervals;

	return 0;
}
558 
/* Allocate a zeroed nft_array with room for @max_intervals intervals,
 * or NULL on allocation failure.
 */
static struct nft_array *nft_array_alloc(u32 max_intervals)
{
	struct nft_array *array;

	array = kzalloc_obj(*array, GFP_KERNEL_ACCOUNT);
	if (!array)
		return NULL;

	if (nft_array_intervals_alloc(array, max_intervals) >= 0)
		return array;

	kfree(array);
	return NULL;
}
574 
575 /* Similar to nft_rbtree_{u,k}size to hide details to userspace, but consider
576  * packed representation coming from userspace for anonymous sets too.
577  */
nft_array_elems(const struct nft_set * set)578 static u32 nft_array_elems(const struct nft_set *set)
579 {
580 	u32 nelems = atomic_read(&set->nelems) - set->ndeact;
581 
582 	/* Adjacent intervals are represented with a single start element in
583 	 * anonymous sets, use the current element counter as is.
584 	 */
585 	if (nft_set_is_anonymous(set))
586 		return nelems;
587 
588 	/* Add extra room for never matching interval at the beginning and open
589 	 * interval at the end which only use a single element to represent it.
590 	 * The conversion to array will compact intervals, this allows reduce
591 	 * memory consumption.
592 	 */
593 	return (nelems / 2) + 2;
594 }
595 
/* Initial array sizing: regular sets start at 1024 intervals; anonymous
 * sets start small and double up to the threshold (see
 * nft_array_may_resize).
 */
#define NFT_ARRAY_INITIAL_SIZE		1024
#define NFT_ARRAY_INITIAL_ANON_SIZE	16
#define NFT_ARRAY_INITIAL_ANON_THRESH	(8192U / sizeof(struct nft_array_interval))
599 
/* Size (or resize) priv->array_next for the current element count.
 * Non-anonymous sets shrink back towards NFT_ARRAY_INITIAL_SIZE when the
 * element count drops below roughly 2/3 of capacity, and grow by ~1.5x;
 * anonymous sets double while small, then also grow by ~1.5x. @flush
 * resets capacity to the initial size. Returns 0 on success, -ENOMEM on
 * allocation failure or -EOVERFLOW when the capacity math would overflow.
 */
static int nft_array_may_resize(const struct nft_set *set, bool flush)
{
	u32 initial_intervals, max_intervals, new_max_intervals, delta;
	u32 shrinked_max_intervals, nelems = nft_array_elems(set);
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_array *array;

	if (nft_set_is_anonymous(set))
		initial_intervals = NFT_ARRAY_INITIAL_ANON_SIZE;
	else
		initial_intervals = NFT_ARRAY_INITIAL_SIZE;

	/* start from the pending array's capacity if one exists, else from
	 * the published array, else from scratch.
	 */
	if (priv->array_next) {
		max_intervals = priv->array_next->max_intervals;
		new_max_intervals = priv->array_next->max_intervals;
	} else {
		if (priv->array) {
			max_intervals = priv->array->max_intervals;
			new_max_intervals = priv->array->max_intervals;
		} else {
			max_intervals = 0;
			new_max_intervals = initial_intervals;
		}
	}

	if (nft_set_is_anonymous(set))
		goto maybe_grow;

	if (flush) {
		/* Set flush just started, nelems still report elements.*/
		nelems = 0;
		new_max_intervals = NFT_ARRAY_INITIAL_SIZE;
		goto realloc_array;
	}

	/* shrink threshold: ~2/3 of the current capacity. */
	if (check_add_overflow(new_max_intervals, new_max_intervals,
			       &shrinked_max_intervals))
		return -EOVERFLOW;

	shrinked_max_intervals = DIV_ROUND_UP(shrinked_max_intervals, 3);

	if (shrinked_max_intervals > NFT_ARRAY_INITIAL_SIZE &&
	    nelems < shrinked_max_intervals) {
		new_max_intervals = shrinked_max_intervals;
		goto realloc_array;
	}
maybe_grow:
	if (nelems > new_max_intervals) {
		if (nft_set_is_anonymous(set) &&
		    new_max_intervals < NFT_ARRAY_INITIAL_ANON_THRESH) {
			new_max_intervals <<= 1;
		} else {
			/* grow by 1.5x, guarding against u32 overflow. */
			delta = new_max_intervals >> 1;
			if (check_add_overflow(new_max_intervals, delta,
					       &new_max_intervals))
				return -EOVERFLOW;
		}
	}

realloc_array:
	if (WARN_ON_ONCE(nelems > new_max_intervals))
		return -ENOMEM;

	if (priv->array_next) {
		if (max_intervals == new_max_intervals)
			return 0;

		if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0)
			return -ENOMEM;
	} else {
		array = nft_array_alloc(new_max_intervals);
		if (!array)
			return -ENOMEM;

		priv->array_next = array;
	}

	return 0;
}
679 
nft_rbtree_insert(const struct net * net,const struct nft_set * set,const struct nft_set_elem * elem,struct nft_elem_priv ** elem_priv)680 static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
681 			     const struct nft_set_elem *elem,
682 			     struct nft_elem_priv **elem_priv)
683 {
684 	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
685 	struct nft_rbtree *priv = nft_set_priv(set);
686 	u64 tstamp = nft_net_tstamp(net);
687 	int err;
688 
689 	nft_rbtree_maybe_reset_start_cookie(priv, tstamp);
690 
691 	if (nft_array_may_resize(set, false) < 0)
692 		return -ENOMEM;
693 
694 	do {
695 		if (fatal_signal_pending(current))
696 			return -EINTR;
697 
698 		cond_resched();
699 
700 		write_lock(&priv->lock);
701 		err = __nft_rbtree_insert(net, set, rbe, elem_priv, tstamp);
702 		write_unlock(&priv->lock);
703 	} while (err == -EAGAIN);
704 
705 	return err;
706 }
707 
/* Unlink an element from the tree; the element itself is freed by the
 * caller via the transaction machinery.
 */
static void nft_rbtree_remove(const struct net *net,
			      const struct nft_set *set,
			      struct nft_elem_priv *elem_priv)
{
	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
	struct nft_rbtree *priv = nft_set_priv(set);

	write_lock(&priv->lock);
	rb_erase(&rbe->node, &priv->root);
	write_unlock(&priv->lock);
}
719 
/* Make the element visible in the next generation (commit path). */
static void nft_rbtree_activate(const struct net *net,
				const struct nft_set *set,
				struct nft_elem_priv *elem_priv)
{
	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);

	nft_clear(net, &rbe->ext);
}
728 
729 static struct nft_rbtree_elem *
nft_rbtree_next_inactive(struct nft_rbtree_elem * rbe,u8 genmask)730 nft_rbtree_next_inactive(struct nft_rbtree_elem *rbe, u8 genmask)
731 {
732 	struct nft_rbtree_elem *next_rbe;
733 	struct rb_node *node;
734 
735 	node = rb_next(&rbe->node);
736 	if (node) {
737 		next_rbe = rb_entry(node, struct nft_rbtree_elem, node);
738 		if (nft_rbtree_interval_start(next_rbe) &&
739 		    !nft_set_elem_active(&next_rbe->ext, genmask))
740 			return next_rbe;
741 	}
742 
743 	return NULL;
744 }
745 
/* Counterpart of nft_rbtree_insert_same_interval() for removals: decide
 * whether end element @rbe closes the interval of the last start element
 * deactivated (tracked via the start cookie). Returns true when it does
 * (or no cookie is set), false otherwise. The cookie is consumed either
 * way. Note: nft_rbtree_next_inactive() already guarantees a start
 * element, so the interval_start check below is redundant but harmless.
 */
static bool nft_rbtree_deactivate_same_interval(const struct net *net,
						struct nft_rbtree *priv,
						struct nft_rbtree_elem *rbe)
{
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *next_rbe;

	if (!priv->start_rbe_cookie)
		return true;

	next_rbe = nft_rbtree_next_inactive(rbe, genmask);
	if (next_rbe) {
		/* Closest inactive start element is the one recorded in the
		 * cookie: this end element belongs to the same interval.
		 */
		if (nft_rbtree_interval_start(next_rbe) &&
		    nft_rbtree_cmp_start_cookie(priv, next_rbe)) {
			priv->start_rbe_cookie = 0;
			return true;
		}
	}

	priv->start_rbe_cookie = 0;

	return false;
}
770 
/* Toggle the element's active state for the next generation. */
static void nft_rbtree_flush(const struct net *net,
			     const struct nft_set *set,
			     struct nft_elem_priv *elem_priv)
{
	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);

	nft_set_elem_change_active(net, set, &rbe->ext);
}
779 
/* Find the tree element matching @elem (same key and same start/end kind)
 * and deactivate it for the next generation. Returns the deactivated
 * element or NULL when no suitable element exists, an end element does
 * not close the previously deactivated start element, or the array
 * resize fails.
 */
static struct nft_elem_priv *
nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
		      const struct nft_set_elem *elem)
{
	struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv);
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	u8 genmask = nft_genmask_next(net);
	u64 tstamp = nft_net_tstamp(net);
	int d;

	nft_rbtree_maybe_reset_start_cookie(priv, tstamp);

	/* a start or null-interval element starts a fresh cookie sequence. */
	if (nft_rbtree_interval_start(this) ||
	    nft_rbtree_interval_null(set, this))
		priv->start_rbe_cookie = 0;

	/* make sure the pending array can absorb the deactivation. */
	if (nft_array_may_resize(set, false) < 0)
		return NULL;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
					   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			/* equal keys: steer towards the element of the same
			 * kind (end elements sort before start elements).
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(this)) {
				parent = parent->rb_left;
				continue;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (__nft_set_elem_expired(&rbe->ext, tstamp)) {
				break;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = parent->rb_left;
				continue;
			}

			if (nft_rbtree_interval_start(rbe))
				nft_rbtree_set_start_cookie(priv, rbe);
			else if (!nft_rbtree_deactivate_same_interval(net, priv, rbe))
				return NULL;

			nft_rbtree_flush(net, set, &rbe->priv);
			return &rbe->priv;
		}
	}
	return NULL;
}
836 
nft_rbtree_do_walk(const struct nft_ctx * ctx,struct nft_set * set,struct nft_set_iter * iter)837 static void nft_rbtree_do_walk(const struct nft_ctx *ctx,
838 			       struct nft_set *set,
839 			       struct nft_set_iter *iter)
840 {
841 	struct nft_rbtree *priv = nft_set_priv(set);
842 	struct nft_rbtree_elem *rbe;
843 	struct rb_node *node;
844 
845 	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
846 		rbe = rb_entry(node, struct nft_rbtree_elem, node);
847 
848 		if (iter->count < iter->skip)
849 			goto cont;
850 
851 		iter->err = iter->fn(ctx, set, iter, &rbe->priv);
852 		if (iter->err < 0)
853 			return;
854 cont:
855 		iter->count++;
856 	}
857 }
858 
/* Dispatch a set walk according to the iterator type: update walks run
 * under the commit mutex (the clone variant additionally resizes the
 * pending array), read walks take the read lock.
 */
static void nft_rbtree_walk(const struct nft_ctx *ctx,
			    struct nft_set *set,
			    struct nft_set_iter *iter)
{
	struct nft_rbtree *priv = nft_set_priv(set);

	switch (iter->type) {
	case NFT_ITER_UPDATE_CLONE:
		/* flush mode: reset the pending array to its initial size. */
		if (nft_array_may_resize(set, true) < 0) {
			iter->err = -ENOMEM;
			break;
		}
		fallthrough;
	case NFT_ITER_UPDATE:
		lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);

		nft_rbtree_do_walk(ctx, set, iter);
		break;
	case NFT_ITER_READ:
		read_lock(&priv->lock);
		nft_rbtree_do_walk(ctx, set, iter);
		read_unlock(&priv->lock);
		break;
	default:
		iter->err = -EINVAL;
		WARN_ON_ONCE(1);
		break;
	}
}
888 
/* Scan the whole tree for expired elements and move them (together with
 * their coupled end elements) onto the expired list; they are handed to
 * the GC transaction later by nft_rbtree_gc_queue().
 */
static void nft_rbtree_gc_scan(struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe, *rbe_end = NULL;
	struct net *net = read_pnet(&set->net);
	u64 tstamp = nft_net_tstamp(net);
	struct rb_node *node, *next;

	for (node = rb_first(&priv->root); node ; node = next) {
		next = rb_next(node);

		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		/* elements are reversed in the rbtree for historical reasons,
		 * from highest to lowest value, that is why end element is
		 * always visited before the start element.
		 */
		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!__nft_set_elem_expired(&rbe->ext, tstamp))
			continue;

		/* end element needs to be removed first, it has
		 * no timeout extension.
		 */
		write_lock(&priv->lock);
		if (rbe_end) {
			nft_rbtree_gc_elem_move(net, set, priv, rbe_end);
			rbe_end = NULL;
		}

		nft_rbtree_gc_elem_move(net, set, priv, rbe);
		write_unlock(&priv->lock);
	}

	priv->last_gc = jiffies;
}
928 
/* Hand the elements parked on the expired list over to a synchronous GC
 * transaction. On allocation failure the remaining elements simply stay
 * on the expired list for a later attempt.
 */
static void nft_rbtree_gc_queue(struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe, *rbe_end;
	struct nft_trans_gc *gc;

	if (list_empty(&priv->expired))
		return;

	gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
	if (!gc)
		return;

	list_for_each_entry_safe(rbe, rbe_end, &priv->expired, list) {
		list_del(&rbe->list);
		nft_trans_gc_elem_add(gc, rbe);

		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
		if (!gc)
			return;
	}

	gc = nft_trans_gc_catchall_sync(gc);
	nft_trans_gc_queue_sync_done(gc);
}
954 
nft_rbtree_privsize(const struct nlattr * const nla[],const struct nft_set_desc * desc)955 static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
956 			       const struct nft_set_desc *desc)
957 {
958 	return sizeof(struct nft_rbtree);
959 }
960 
/* Initialize the per-set private data; the lookup arrays are allocated
 * lazily on first insertion/resize.
 */
static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	/* nft_elem_priv_cast() relies on priv being the first member. */
	BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0);

	rwlock_init(&priv->lock);
	priv->root = RB_ROOT;
	INIT_LIST_HEAD(&priv->expired);

	priv->array = NULL;
	priv->array_next = NULL;

	return 0;
}
978 
/* Release an nft_array and its interval storage. */
static void __nft_array_free(struct nft_array *array)
{
	kvfree(array->intervals);
	kfree(array);
}
984 
nft_rbtree_destroy(const struct nft_ctx * ctx,const struct nft_set * set)985 static void nft_rbtree_destroy(const struct nft_ctx *ctx,
986 			       const struct nft_set *set)
987 {
988 	struct nft_rbtree *priv = nft_set_priv(set);
989 	struct nft_rbtree_elem *rbe, *next;
990 	struct nft_array *array;
991 	struct rb_node *node;
992 
993 	list_for_each_entry_safe(rbe, next, &priv->expired, list) {
994 		list_del(&rbe->list);
995 		nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
996 	}
997 
998 	while ((node = priv->root.rb_node) != NULL) {
999 		rb_erase(node, &priv->root);
1000 		rbe = rb_entry(node, struct nft_rbtree_elem, node);
1001 		nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
1002 	}
1003 
1004 	array = rcu_dereference_protected(priv->array, true);
1005 	if (array)
1006 		__nft_array_free(array);
1007 	if (priv->array_next)
1008 		__nft_array_free(priv->array_next);
1009 }
1010 
nft_rbtree_estimate(const struct nft_set_desc * desc,u32 features,struct nft_set_estimate * est)1011 static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
1012 				struct nft_set_estimate *est)
1013 {
1014 	if (desc->field_count > 1)
1015 		return false;
1016 
1017 	if (desc->size)
1018 		est->size = sizeof(struct nft_rbtree) +
1019 			    desc->size * sizeof(struct nft_rbtree_elem);
1020 	else
1021 		est->size = ~0;
1022 
1023 	est->lookup = NFT_SET_CLASS_O_LOG_N;
1024 	est->space  = NFT_SET_CLASS_O_N;
1025 
1026 	return true;
1027 }
1028 
nft_array_free_rcu(struct rcu_head * rcu_head)1029 static void nft_array_free_rcu(struct rcu_head *rcu_head)
1030 {
1031 	struct nft_array *array = container_of(rcu_head, struct nft_array, rcu_head);
1032 
1033 	__nft_array_free(array);
1034 }
1035 
nft_rbtree_commit(struct nft_set * set)1036 static void nft_rbtree_commit(struct nft_set *set)
1037 {
1038 	struct nft_rbtree *priv = nft_set_priv(set);
1039 	struct nft_rbtree_elem *rbe, *prev_rbe;
1040 	struct nft_array *old;
1041 	u32 num_intervals = 0;
1042 	struct rb_node *node;
1043 
1044 	/* No changes, skip, eg. elements updates only. */
1045 	if (!priv->array_next)
1046 		return;
1047 
1048 	/* GC can be performed if the binary search blob is going
1049 	 * to be rebuilt.  It has to be done in two phases: first
1050 	 * scan tree and move all expired elements to the expired
1051 	 * list.
1052 	 *
1053 	 * Then, after blob has been re-built and published to other
1054 	 * CPUs, queue collected entries for freeing.
1055 	 */
1056 	if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
1057 		nft_rbtree_gc_scan(set);
1058 
1059 	/* Reverse walk to create an array from smaller to largest interval. */
1060 	node = rb_last(&priv->root);
1061 	if (node)
1062 		prev_rbe = rb_entry(node, struct nft_rbtree_elem, node);
1063 	else
1064 		prev_rbe = NULL;
1065 
1066 	while (prev_rbe) {
1067 		rbe = prev_rbe;
1068 
1069 		if (nft_rbtree_interval_start(rbe))
1070 			priv->array_next->intervals[num_intervals].from = &rbe->ext;
1071 		else if (nft_rbtree_interval_end(rbe))
1072 			priv->array_next->intervals[num_intervals++].to = &rbe->ext;
1073 
1074 		if (num_intervals >= priv->array_next->max_intervals) {
1075 			pr_warn_once("malformed interval set from userspace?");
1076 			goto err_out;
1077 		}
1078 
1079 		node = rb_prev(node);
1080 		if (!node)
1081 			break;
1082 
1083 		prev_rbe = rb_entry(node, struct nft_rbtree_elem, node);
1084 
1085 		/* For anonymous sets, when adjacent ranges are found,
1086 		 * the end element is not added to the set to pack the set
1087 		 * representation. Use next start element to complete this
1088 		 * interval.
1089 		 */
1090 		if (nft_rbtree_interval_start(rbe) &&
1091 		    nft_rbtree_interval_start(prev_rbe) &&
1092 		    priv->array_next->intervals[num_intervals].from)
1093 			priv->array_next->intervals[num_intervals++].to = &prev_rbe->ext;
1094 
1095 		if (num_intervals >= priv->array_next->max_intervals) {
1096 			pr_warn_once("malformed interval set from userspace?");
1097 			goto err_out;
1098 		}
1099 	}
1100 
1101 	if (priv->array_next->intervals[num_intervals].from)
1102 		num_intervals++;
1103 err_out:
1104 	priv->array_next->num_intervals = num_intervals;
1105 	old = rcu_replace_pointer(priv->array, priv->array_next,
1106 				  lockdep_is_held(&nft_pernet(read_pnet(&set->net))->commit_mutex));
1107 	priv->array_next = NULL;
1108 	if (old)
1109 		call_rcu(&old->rcu_head, nft_array_free_rcu);
1110 
1111 	/* New blob is public, queue collected entries for freeing.
1112 	 * call_rcu ensures elements stay around until readers are done.
1113 	 */
1114 	nft_rbtree_gc_queue(set);
1115 }
1116 
nft_rbtree_abort(const struct nft_set * set)1117 static void nft_rbtree_abort(const struct nft_set *set)
1118 {
1119 	struct nft_rbtree *priv = nft_set_priv(set);
1120 	struct nft_array *array_next;
1121 
1122 	if (!priv->array_next)
1123 		return;
1124 
1125 	array_next = priv->array_next;
1126 	priv->array_next = NULL;
1127 	__nft_array_free(array_next);
1128 }
1129 
nft_rbtree_gc_init(const struct nft_set * set)1130 static void nft_rbtree_gc_init(const struct nft_set *set)
1131 {
1132 	struct nft_rbtree *priv = nft_set_priv(set);
1133 
1134 	priv->last_gc = jiffies;
1135 }
1136 
/* rbtree stores ranges as singleton elements, each range is composed of two
 * elements: scale the user-requested size accordingly.
 */
static u32 nft_rbtree_ksize(u32 size)
{
	return size * 2;
}
1144 
1145 /* ... hide this detail to userspace. */
nft_rbtree_usize(u32 size)1146 static u32 nft_rbtree_usize(u32 size)
1147 {
1148 	if (!size)
1149 		return 0;
1150 
1151 	return size / 2;
1152 }
1153 
nft_rbtree_adjust_maxsize(const struct nft_set * set)1154 static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set)
1155 {
1156 	struct nft_rbtree *priv = nft_set_priv(set);
1157 	struct nft_rbtree_elem *rbe;
1158 	struct rb_node *node;
1159 	const void *key;
1160 
1161 	node = rb_last(&priv->root);
1162 	if (!node)
1163 		return 0;
1164 
1165 	rbe = rb_entry(node, struct nft_rbtree_elem, node);
1166 	if (!nft_rbtree_interval_end(rbe))
1167 		return 0;
1168 
1169 	key = nft_set_ext_key(&rbe->ext);
1170 	if (memchr(key, 1, set->klen))
1171 		return 0;
1172 
1173 	/* this is the all-zero no-match element. */
1174 	return 1;
1175 }
1176 
/* Interval set backend: rbtree for updates, plus an RCU-published sorted
 * array rebuilt on commit (see nft_rbtree_commit) for lookups.
 */
const struct nft_set_type nft_set_rbtree_type = {
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.commit		= nft_rbtree_commit,
		.abort		= nft_rbtree_abort,
		.gc_init	= nft_rbtree_gc_init,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
		.ksize		= nft_rbtree_ksize,
		.usize		= nft_rbtree_usize,
		.adjust_maxsize = nft_rbtree_adjust_maxsize,
	},
};
1201