/*
 * net/sched/cls_u32.c	Ugly (or Universal) 32bit key Packet Classifier.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *	The filters are packed into hash tables of key nodes
 *	with a set of 32bit key/mask pairs at every node.
 *	Nodes reference next-level hash tables etc.
 *
 *	This scheme is the best universal classifier I managed to
 *	invent; it is not super-fast, but it is not slow (provided you
 *	program it correctly), and general enough.  And its relative
 *	speed grows as the number of rules becomes larger.
 *
 *	It seems to represent the best middle point between
 *	speed and manageability both by human and by machine.
 *
 *	It is especially useful for link sharing combined with QoS;
 *	pure RSVP doesn't need such a general approach and can use
 *	much simpler (and faster) schemes, such as cls_rsvp.c.
 *
 *	JHS: We should remove the CONFIG_NET_CLS_IND from here
 *	eventually when the meta match extension is made available
 *
 *	nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
 */
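
/* For orientation: a filter handle packs the hash table id, the bucket
 * index and the node id into a single 32-bit word.  The layout below
 * mirrors the TC_U32_* macros in <uapi/linux/pkt_cls.h>:
 *
 *	 31           20 19       12 11           0
 *	+---------------+-----------+-------------+
 *	|     htid      |   hash    |   node id   |
 *	+---------------+-----------+-------------+
 *
 *	TC_U32_HTID(h)	-> (h) & 0xFFF00000
 *	TC_U32_HASH(h)	-> ((h) >> 12) & 0xFF
 *	TC_U32_NODE(h)	-> (h) & 0xFFF
 *	TC_U32_KEY(h)	-> (h) & 0xFFFFF   (zero means "hash table, not key")
 */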

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/bitmap.h>
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
#include <linux/netdevice.h>

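/* A key node: one filter rule.  It owns the selector (the key/mask
 * pairs matched in u32_classify()), the attached actions in 'exts'
 * and, for link nodes, a reference to the next-level hash table via
 * 'ht_down'.
 */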
struct tc_u_knode {
	struct tc_u_knode __rcu	*next;
	u32			handle;
	struct tc_u_hnode __rcu	*ht_up;
	struct tcf_exts		exts;
#ifdef CONFIG_NET_CLS_IND
	int			ifindex;
#endif
	u8			fshift;
	struct tcf_result	res;
	struct tc_u_hnode __rcu	*ht_down;
#ifdef CONFIG_CLS_U32_PERF
	struct tc_u32_pcnt __percpu *pf;
#endif
	u32			flags;
#ifdef CONFIG_CLS_U32_MARK
	u32			val;
	u32			mask;
	u32 __percpu		*pcpu_success;
#endif
	struct tcf_proto	*tp;
	struct rcu_head		rcu;
	/* The 'sel' field MUST be the last field in the structure to allow
	 * for the tc_u32_key entries allocated at the end of the structure.
	 */
	struct tc_u32_sel	sel;
};

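/* A hash table node: 'divisor + 1' buckets of key node chains.  The
 * root table of an instance is created with divisor 0, i.e. a single
 * bucket.
 */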
struct tc_u_hnode {
	struct tc_u_hnode __rcu	*next;
	u32			handle;
	u32			prio;
	struct tc_u_common	*tp_c;
	int			refcnt;
	unsigned int		divisor;
	struct rcu_head		rcu;
	/* The 'ht' field MUST be the last field in the structure to allow
	 * for more entries allocated at the end of the structure.
	 */
	struct tc_u_knode __rcu	*ht[1];
};

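/* State shared by all u32 instances attached to the same qdisc; it is
 * found again through tp->q->u32_node so that filters of different
 * priorities can link into one hash table hierarchy.
 */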
struct tc_u_common {
	struct tc_u_hnode __rcu	*hlist;
	struct Qdisc		*q;
	int			refcnt;
	u32			hgenerator;
	struct rcu_head		rcu;
};

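/* Fold the masked key down to a bucket index.  fshift is precomputed
 * at insertion time as ffs(ntohl(hmask)) - 1, so the shift aligns the
 * lowest set bit of the hash mask with bit 0.
 */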
static inline unsigned int u32_hash_fold(__be32 key,
					 const struct tc_u32_sel *sel,
					 u8 fshift)
{
	unsigned int h = ntohl(key & sel->hmask) >> fshift;

	return h;
}

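/* The fast path.  Starting from the root hash table, each node's
 * key/mask pairs are compared against 32-bit words of packet data; on
 * a full match the walk either terminates with the node's result or
 * descends into ht_down, pushing the current node and offset onto a
 * fixed-size stack (TC_U32_MAXDEPTH).  When a bucket chain is
 * exhausted, the stack is popped and the parent node is re-checked as
 * a terminal.  Runs inside an RCU BH read-side critical section.
 */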
static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp,
			struct tcf_result *res)
{
	struct {
		struct tc_u_knode *knode;
		unsigned int	  off;
	} stack[TC_U32_MAXDEPTH];

	struct tc_u_hnode *ht = rcu_dereference_bh(tp->root);
	unsigned int off = skb_network_offset(skb);
	struct tc_u_knode *n;
	int sdepth = 0;
	int off2 = 0;
	int sel = 0;
#ifdef CONFIG_CLS_U32_PERF
	int j;
#endif
	int i, r;

next_ht:
	n = rcu_dereference_bh(ht->ht[sel]);

next_knode:
	if (n) {
		struct tc_u32_key *key = n->sel.keys;

#ifdef CONFIG_CLS_U32_PERF
		__this_cpu_inc(n->pf->rcnt);
		j = 0;
#endif

		if (tc_skip_sw(n->flags)) {
			n = rcu_dereference_bh(n->next);
			goto next_knode;
		}

#ifdef CONFIG_CLS_U32_MARK
		if ((skb->mark & n->mask) != n->val) {
			n = rcu_dereference_bh(n->next);
			goto next_knode;
		} else {
			__this_cpu_inc(*n->pcpu_success);
		}
#endif

		for (i = n->sel.nkeys; i > 0; i--, key++) {
			int toff = off + key->off + (off2 & key->offmask);
			__be32 *data, hdata;

			if (skb_headroom(skb) + toff > INT_MAX)
				goto out;

			data = skb_header_pointer(skb, toff, 4, &hdata);
			if (!data)
				goto out;
			if ((*data ^ key->val) & key->mask) {
				n = rcu_dereference_bh(n->next);
				goto next_knode;
			}
#ifdef CONFIG_CLS_U32_PERF
			__this_cpu_inc(n->pf->kcnts[j]);
			j++;
#endif
		}

		ht = rcu_dereference_bh(n->ht_down);
		if (!ht) {
check_terminal:
			if (n->sel.flags & TC_U32_TERMINAL) {
				*res = n->res;
#ifdef CONFIG_NET_CLS_IND
				if (!tcf_match_indev(skb, n->ifindex)) {
					n = rcu_dereference_bh(n->next);
					goto next_knode;
				}
#endif
#ifdef CONFIG_CLS_U32_PERF
				__this_cpu_inc(n->pf->rhit);
#endif
				r = tcf_exts_exec(skb, &n->exts, res);
				if (r < 0) {
					n = rcu_dereference_bh(n->next);
					goto next_knode;
				}

				return r;
			}
			n = rcu_dereference_bh(n->next);
			goto next_knode;
		}

		/* PUSH */
		if (sdepth >= TC_U32_MAXDEPTH)
			goto deadloop;
		stack[sdepth].knode = n;
		stack[sdepth].off = off;
		sdepth++;

		ht = rcu_dereference_bh(n->ht_down);
		sel = 0;
		if (ht->divisor) {
			__be32 *data, hdata;

			data = skb_header_pointer(skb, off + n->sel.hoff, 4,
						  &hdata);
			if (!data)
				goto out;
			sel = ht->divisor & u32_hash_fold(*data, &n->sel,
							  n->fshift);
		}
		if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))
			goto next_ht;

		if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
			off2 = n->sel.off + 3;
			if (n->sel.flags & TC_U32_VAROFFSET) {
				__be16 *data, hdata;

				data = skb_header_pointer(skb,
							  off + n->sel.offoff,
							  2, &hdata);
				if (!data)
					goto out;
				off2 += ntohs(n->sel.offmask & *data) >>
					n->sel.offshift;
			}
			off2 &= ~3;
		}
		if (n->sel.flags & TC_U32_EAT) {
			off += off2;
			off2 = 0;
		}

		if (off < skb->len)
			goto next_ht;
	}

	/* POP */
	if (sdepth--) {
		n = stack[sdepth].knode;
		ht = rcu_dereference_bh(n->ht_up);
		off = stack[sdepth].off;
		goto check_terminal;
	}
out:
	return -1;

deadloop:
	net_warn_ratelimited("cls_u32: dead loop\n");
	return -1;
}

static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
	struct tc_u_hnode *ht;

	for (ht = rtnl_dereference(tp_c->hlist);
	     ht;
	     ht = rtnl_dereference(ht->next))
		if (ht->handle == handle)
			break;

	return ht;
}

static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
{
	unsigned int sel;
	struct tc_u_knode *n = NULL;

	sel = TC_U32_HASH(handle);
	if (sel > ht->divisor)
		goto out;

	for (n = rtnl_dereference(ht->ht[sel]);
	     n;
	     n = rtnl_dereference(n->next))
		if (n->handle == handle)
			break;
out:
	return n;
}

static unsigned long u32_get(struct tcf_proto *tp, u32 handle)
{
	struct tc_u_hnode *ht;
	struct tc_u_common *tp_c = tp->data;

	if (TC_U32_HTID(handle) == TC_U32_ROOT)
		ht = rtnl_dereference(tp->root);
	else
		ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));

	if (!ht)
		return 0;

	if (TC_U32_KEY(handle) == 0)
		return (unsigned long)ht;

	return (unsigned long)u32_lookup_key(ht, handle);
}

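/* Allocate a fresh hash table id.  Automatically generated htids have
 * the 0x800 bit set (occupying bits 31:20 of the handle); up to 0x800
 * candidates are probed before giving up and returning 0.
 */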
static u32 gen_new_htid(struct tc_u_common *tp_c)
{
	int i = 0x800;

	/* hgenerator is only used under the rtnl lock, so it is safe to
	 * increment it without read-copy-update semantics.
	 */
	do {
		if (++tp_c->hgenerator == 0x7FF)
			tp_c->hgenerator = 1;
	} while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));

	return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
}

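/* Instantiate a u32 classifier: allocate its root hash table (a
 * single bucket) and create, or attach to, the per-qdisc tc_u_common.
 */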
static int u32_init(struct tcf_proto *tp)
{
	struct tc_u_hnode *root_ht;
	struct tc_u_common *tp_c;

	tp_c = tp->q->u32_node;

	root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
	if (root_ht == NULL)
		return -ENOBUFS;

	root_ht->divisor = 0;
	root_ht->refcnt++;
	root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
	root_ht->prio = tp->prio;

	if (tp_c == NULL) {
		tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL);
		if (tp_c == NULL) {
			kfree(root_ht);
			return -ENOBUFS;
		}
		tp_c->q = tp->q;
		tp->q->u32_node = tp_c;
	}

	tp_c->refcnt++;
	RCU_INIT_POINTER(root_ht->next, tp_c->hlist);
	rcu_assign_pointer(tp_c->hlist, root_ht);
	root_ht->tp_c = tp_c;

	rcu_assign_pointer(tp->root, root_ht);
	tp->data = tp_c;
	return 0;
}

static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
			   bool free_pf)
{
	tcf_exts_destroy(&n->exts);
	if (n->ht_down)
		n->ht_down->refcnt--;
#ifdef CONFIG_CLS_U32_PERF
	if (free_pf)
		free_percpu(n->pf);
#endif
#ifdef CONFIG_CLS_U32_MARK
	if (free_pf)
		free_percpu(n->pcpu_success);
#endif
	kfree(n);
	return 0;
}

/* u32_delete_key_rcu should be called when freeing a copied
 * version of a tc_u_knode obtained from u32_init_knode(). When
 * copies are obtained from u32_init_knode() the statistics are
 * shared between the old and new copies to allow readers to
 * continue to update the statistics during the copy. To support
 * this the u32_delete_key_rcu variant does not free the percpu
 * statistics.
 */
static void u32_delete_key_rcu(struct rcu_head *rcu)
{
	struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);

	u32_destroy_key(key->tp, key, false);
}

/* u32_delete_key_freepf_rcu is the rcu callback variant
 * that frees the entire structure including the statistics
 * percpu variables. Only use this if the key is not a copy
 * returned by u32_init_knode(). See u32_delete_key_rcu()
 * for the variant that should be used with keys returned from
 * u32_init_knode().
 */
static void u32_delete_key_freepf_rcu(struct rcu_head *rcu)
{
	struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);

	u32_destroy_key(key->tp, key, true);
}

static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
{
	struct tc_u_knode __rcu **kp;
	struct tc_u_knode *pkp;
	struct tc_u_hnode *ht = rtnl_dereference(key->ht_up);

	if (ht) {
		kp = &ht->ht[TC_U32_HASH(key->handle)];
		for (pkp = rtnl_dereference(*kp); pkp;
		     kp = &pkp->next, pkp = rtnl_dereference(*kp)) {
			if (pkp == key) {
				RCU_INIT_POINTER(*kp, key->next);

				tcf_unbind_filter(tp, &key->res);
				call_rcu(&key->rcu, u32_delete_key_freepf_rcu);
				return 0;
			}
		}
	}
	WARN_ON(1);
	return 0;
}

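/* Hardware offload helpers.  Each one mirrors a software operation to
 * the device via ndo_setup_tc(TC_SETUP_CLSU32).  When the filter was
 * added with TCA_CLS_FLAGS_SKIP_SW, the driver's verdict is
 * authoritative and errors propagate; otherwise offload is
 * best-effort and failures are ignored.  As an illustrative sketch
 * (iproute2 syntax; the device name is a placeholder), a user asks
 * for hardware-only matching roughly like this:
 *
 *	tc qdisc add dev eth0 ingress
 *	tc filter add dev eth0 parent ffff: protocol ip u32 skip_sw \
 *		match ip dst 192.0.2.1/32 action drop
 */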
static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
{
	struct net_device *dev = tp->q->dev_queue->dev;
	struct tc_cls_u32_offload u32_offload = {0};
	struct tc_to_netdev offload;

	offload.type = TC_SETUP_CLSU32;
	offload.cls_u32 = &u32_offload;

	if (tc_should_offload(dev, tp, 0)) {
		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
		offload.cls_u32->knode.handle = handle;
		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
					      tp->protocol, &offload);
	}
}

static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
				u32 flags)
{
	struct net_device *dev = tp->q->dev_queue->dev;
	struct tc_cls_u32_offload u32_offload = {0};
	struct tc_to_netdev offload;
	int err;

	if (!tc_should_offload(dev, tp, flags))
		return tc_skip_sw(flags) ? -EINVAL : 0;

	offload.type = TC_SETUP_CLSU32;
	offload.cls_u32 = &u32_offload;

	offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
	offload.cls_u32->hnode.divisor = h->divisor;
	offload.cls_u32->hnode.handle = h->handle;
	offload.cls_u32->hnode.prio = h->prio;

	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
					    tp->protocol, &offload);
	if (tc_skip_sw(flags))
		return err;

	return 0;
}

static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
{
	struct net_device *dev = tp->q->dev_queue->dev;
	struct tc_cls_u32_offload u32_offload = {0};
	struct tc_to_netdev offload;

	offload.type = TC_SETUP_CLSU32;
	offload.cls_u32 = &u32_offload;

	if (tc_should_offload(dev, tp, 0)) {
		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
		offload.cls_u32->hnode.divisor = h->divisor;
		offload.cls_u32->hnode.handle = h->handle;
		offload.cls_u32->hnode.prio = h->prio;

		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
					      tp->protocol, &offload);
	}
}

static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
				u32 flags)
{
	struct net_device *dev = tp->q->dev_queue->dev;
	struct tc_cls_u32_offload u32_offload = {0};
	struct tc_to_netdev offload;
	int err;

	offload.type = TC_SETUP_CLSU32;
	offload.cls_u32 = &u32_offload;

	if (!tc_should_offload(dev, tp, flags))
		return tc_skip_sw(flags) ? -EINVAL : 0;

	offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
	offload.cls_u32->knode.handle = n->handle;
	offload.cls_u32->knode.fshift = n->fshift;
#ifdef CONFIG_CLS_U32_MARK
	offload.cls_u32->knode.val = n->val;
	offload.cls_u32->knode.mask = n->mask;
#else
	offload.cls_u32->knode.val = 0;
	offload.cls_u32->knode.mask = 0;
#endif
	offload.cls_u32->knode.sel = &n->sel;
	offload.cls_u32->knode.exts = &n->exts;
	if (n->ht_down)
		offload.cls_u32->knode.link_handle = n->ht_down->handle;

	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
					    tp->protocol, &offload);
	if (tc_skip_sw(flags))
		return err;

	return 0;
}

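/* Teardown helpers.  Key nodes are unlinked under the rtnl lock and
 * freed via call_rcu() so that classifiers still walking them can
 * finish safely.
 */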
static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
	struct tc_u_knode *n;
	unsigned int h;

	for (h = 0; h <= ht->divisor; h++) {
		while ((n = rtnl_dereference(ht->ht[h])) != NULL) {
			RCU_INIT_POINTER(ht->ht[h],
					 rtnl_dereference(n->next));
			tcf_unbind_filter(tp, &n->res);
			u32_remove_hw_knode(tp, n->handle);
			call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
		}
	}
}

static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
	struct tc_u_common *tp_c = tp->data;
	struct tc_u_hnode __rcu **hn;
	struct tc_u_hnode *phn;

	WARN_ON(ht->refcnt);

	u32_clear_hnode(tp, ht);

	hn = &tp_c->hlist;
	for (phn = rtnl_dereference(*hn);
	     phn;
	     hn = &phn->next, phn = rtnl_dereference(*hn)) {
		if (phn == ht) {
			u32_clear_hw_hnode(tp, ht);
			RCU_INIT_POINTER(*hn, ht->next);
			kfree_rcu(ht, rcu);
			return 0;
		}
	}

	return -ENOENT;
}

static bool ht_empty(struct tc_u_hnode *ht)
{
	unsigned int h;

	for (h = 0; h <= ht->divisor; h++)
		if (rcu_access_pointer(ht->ht[h]))
			return false;

	return true;
}

static bool u32_destroy(struct tcf_proto *tp, bool force)
{
	struct tc_u_common *tp_c = tp->data;
	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);

	WARN_ON(root_ht == NULL);

	if (!force) {
		if (root_ht) {
			if (root_ht->refcnt > 1)
				return false;
			if (root_ht->refcnt == 1) {
				if (!ht_empty(root_ht))
					return false;
			}
		}

		if (tp_c->refcnt > 1)
			return false;

		if (tp_c->refcnt == 1) {
			struct tc_u_hnode *ht;

			for (ht = rtnl_dereference(tp_c->hlist);
			     ht;
			     ht = rtnl_dereference(ht->next))
				if (!ht_empty(ht))
					return false;
		}
	}

	if (root_ht && --root_ht->refcnt == 0)
		u32_destroy_hnode(tp, root_ht);

	if (--tp_c->refcnt == 0) {
		struct tc_u_hnode *ht;

		tp->q->u32_node = NULL;

		for (ht = rtnl_dereference(tp_c->hlist);
		     ht;
		     ht = rtnl_dereference(ht->next)) {
			ht->refcnt--;
			u32_clear_hnode(tp, ht);
		}

		while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) {
			RCU_INIT_POINTER(tp_c->hlist, ht->next);
			kfree_rcu(ht, rcu);
		}

		kfree(tp_c);
	}

	tp->data = NULL;
	return true;
}

static int u32_delete(struct tcf_proto *tp, unsigned long arg)
{
	struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;
	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);

	if (ht == NULL)
		return 0;

	if (TC_U32_KEY(ht->handle)) {
		u32_remove_hw_knode(tp, ht->handle);
		return u32_delete_key(tp, (struct tc_u_knode *)ht);
	}

	if (root_ht == ht)
		return -EINVAL;

	if (ht->refcnt == 1) {
		ht->refcnt--;
		u32_destroy_hnode(tp, ht);
	} else {
		return -EBUSY;
	}

	return 0;
}

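/* Pick a free node id within a bucket: build a temporary bitmap of
 * the ids already in use, then search upward from 0x800 first so that
 * automatically assigned ids stay clear of typical hand-picked low
 * ones, falling back to a search from 1 and finally to 0xFFF.
 */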
#define NR_U32_NODE (1<<12)
static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
{
	struct tc_u_knode *n;
	unsigned long i;
	unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long),
					GFP_KERNEL);
	if (!bitmap)
		return handle | 0xFFF;

	for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]);
	     n;
	     n = rtnl_dereference(n->next))
		set_bit(TC_U32_NODE(n->handle), bitmap);

	i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800);
	if (i >= NR_U32_NODE)
		i = find_next_zero_bit(bitmap, NR_U32_NODE, 1);

	kfree(bitmap);
	return handle | (i >= NR_U32_NODE ? 0xFFF : i);
}

static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
	[TCA_U32_CLASSID]	= { .type = NLA_U32 },
	[TCA_U32_HASH]		= { .type = NLA_U32 },
	[TCA_U32_LINK]		= { .type = NLA_U32 },
	[TCA_U32_DIVISOR]	= { .type = NLA_U32 },
	[TCA_U32_SEL]		= { .len = sizeof(struct tc_u32_sel) },
	[TCA_U32_INDEV]		= { .type = NLA_STRING, .len = IFNAMSIZ },
	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },
	[TCA_U32_FLAGS]		= { .type = NLA_U32 },
};

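/* Apply the netlink attributes to a (new or copied) key node:
 * validate and attach actions, resolve a TCA_U32_LINK target table
 * (taking a reference on it), and bind the classid.
 */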
static int u32_set_parms(struct net *net, struct tcf_proto *tp,
			 unsigned long base, struct tc_u_hnode *ht,
			 struct tc_u_knode *n, struct nlattr **tb,
			 struct nlattr *est, bool ovr)
{
	struct tcf_exts e;
	int err;

	err = tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE);
	if (err < 0)
		return err;
	err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	if (tb[TCA_U32_LINK]) {
		u32 handle = nla_get_u32(tb[TCA_U32_LINK]);
		struct tc_u_hnode *ht_down = NULL, *ht_old;

		if (TC_U32_KEY(handle))
			goto errout;

		if (handle) {
			ht_down = u32_lookup_ht(ht->tp_c, handle);

			if (ht_down == NULL)
				goto errout;
			ht_down->refcnt++;
		}

		ht_old = rtnl_dereference(n->ht_down);
		rcu_assign_pointer(n->ht_down, ht_down);

		if (ht_old)
			ht_old->refcnt--;
	}
	if (tb[TCA_U32_CLASSID]) {
		n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]);
		tcf_bind_filter(tp, &n->res, base);
	}

#ifdef CONFIG_NET_CLS_IND
	if (tb[TCA_U32_INDEV]) {
		int ret;

		ret = tcf_change_indev(net, tb[TCA_U32_INDEV]);
		if (ret < 0)
			goto errout;
		n->ifindex = ret;
	}
#endif
	tcf_exts_change(tp, &n->exts, &e);

	return 0;
errout:
	tcf_exts_destroy(&e);
	return err;
}

static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c,
			      struct tc_u_knode *n)
{
	struct tc_u_knode __rcu **ins;
	struct tc_u_knode *pins;
	struct tc_u_hnode *ht;

	if (TC_U32_HTID(n->handle) == TC_U32_ROOT)
		ht = rtnl_dereference(tp->root);
	else
		ht = u32_lookup_ht(tp_c, TC_U32_HTID(n->handle));

	ins = &ht->ht[TC_U32_HASH(n->handle)];

	/* The node must always exist for it to be replaced; if this is
	 * not the case then something went very wrong elsewhere.
	 */
	for (pins = rtnl_dereference(*ins); ;
	     ins = &pins->next, pins = rtnl_dereference(*ins))
		if (pins->handle == n->handle)
			break;

	RCU_INIT_POINTER(n->next, pins->next);
	rcu_assign_pointer(*ins, n);
}

static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
					 struct tc_u_knode *n)
{
	struct tc_u_knode *new;
	struct tc_u32_sel *s = &n->sel;

	new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key),
		      GFP_KERNEL);

	if (!new)
		return NULL;

	RCU_INIT_POINTER(new->next, n->next);
	new->handle = n->handle;
	RCU_INIT_POINTER(new->ht_up, n->ht_up);

#ifdef CONFIG_NET_CLS_IND
	new->ifindex = n->ifindex;
#endif
	new->fshift = n->fshift;
	new->res = n->res;
	new->flags = n->flags;
	RCU_INIT_POINTER(new->ht_down, n->ht_down);

	/* bump the reference count as long as we hold a pointer to the
	 * structure
	 */
	if (new->ht_down)
		new->ht_down->refcnt++;

#ifdef CONFIG_CLS_U32_PERF
	/* Statistics may be incremented by readers during update
	 * so we must keep them intact. When the node is later destroyed
	 * a special destroy call must be made to not free the pf memory.
	 */
	new->pf = n->pf;
#endif

#ifdef CONFIG_CLS_U32_MARK
	new->val = n->val;
	new->mask = n->mask;
	/* Similarly success statistics must be moved as pointers */
	new->pcpu_success = n->pcpu_success;
#endif
	new->tp = tp;
	memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));

	if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) {
		kfree(new);
		return NULL;
	}

	return new;
}

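/* Create or update a filter.  Three cases, in order: *arg points at
 * an existing key node, which is replaced via an RCU copy;
 * TCA_U32_DIVISOR requests a new hash table; otherwise a new key node
 * is built from TCA_U32_SEL and spliced into its bucket in ascending
 * node id order.
 */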
static int u32_change(struct net *net, struct sk_buff *in_skb,
		      struct tcf_proto *tp, unsigned long base, u32 handle,
		      struct nlattr **tca, unsigned long *arg, bool ovr)
{
	struct tc_u_common *tp_c = tp->data;
	struct tc_u_hnode *ht;
	struct tc_u_knode *n;
	struct tc_u32_sel *s;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_U32_MAX + 1];
	u32 htid, flags = 0;
	int err;
#ifdef CONFIG_CLS_U32_PERF
	size_t size;
#endif

	if (opt == NULL)
		return handle ? -EINVAL : 0;

	err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy);
	if (err < 0)
		return err;

	if (tb[TCA_U32_FLAGS]) {
		flags = nla_get_u32(tb[TCA_U32_FLAGS]);
		if (!tc_flags_valid(flags))
			return -EINVAL;
	}

	n = (struct tc_u_knode *)*arg;
	if (n) {
		struct tc_u_knode *new;

		if (TC_U32_KEY(n->handle) == 0)
			return -EINVAL;

		if (n->flags != flags)
			return -EINVAL;

		new = u32_init_knode(tp, n);
		if (!new)
			return -ENOMEM;

		err = u32_set_parms(net, tp, base,
				    rtnl_dereference(n->ht_up), new, tb,
				    tca[TCA_RATE], ovr);

		if (err) {
			u32_destroy_key(tp, new, false);
			return err;
		}

		err = u32_replace_hw_knode(tp, new, flags);
		if (err) {
			u32_destroy_key(tp, new, false);
			return err;
		}

		u32_replace_knode(tp, tp_c, new);
		tcf_unbind_filter(tp, &n->res);
		call_rcu(&n->rcu, u32_delete_key_rcu);
		return 0;
	}

	if (tb[TCA_U32_DIVISOR]) {
		unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);

		if (--divisor > 0x100)
			return -EINVAL;
		if (TC_U32_KEY(handle))
			return -EINVAL;
		if (handle == 0) {
			handle = gen_new_htid(tp->data);
			if (handle == 0)
				return -ENOMEM;
		}
		ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
		if (ht == NULL)
			return -ENOBUFS;
		ht->tp_c = tp_c;
		ht->refcnt = 1;
		ht->divisor = divisor;
		ht->handle = handle;
		ht->prio = tp->prio;

		err = u32_replace_hw_hnode(tp, ht, flags);
		if (err) {
			kfree(ht);
			return err;
		}

		RCU_INIT_POINTER(ht->next, tp_c->hlist);
		rcu_assign_pointer(tp_c->hlist, ht);
		*arg = (unsigned long)ht;

		return 0;
	}

	if (tb[TCA_U32_HASH]) {
		htid = nla_get_u32(tb[TCA_U32_HASH]);
		if (TC_U32_HTID(htid) == TC_U32_ROOT) {
			ht = rtnl_dereference(tp->root);
			htid = ht->handle;
		} else {
			ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
			if (ht == NULL)
				return -EINVAL;
		}
	} else {
		ht = rtnl_dereference(tp->root);
		htid = ht->handle;
	}

	if (ht->divisor < TC_U32_HASH(htid))
		return -EINVAL;

	if (handle) {
		if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
			return -EINVAL;
		handle = htid | TC_U32_NODE(handle);
	} else
		handle = gen_new_kid(ht, htid);

	if (tb[TCA_U32_SEL] == NULL)
		return -EINVAL;

	s = nla_data(tb[TCA_U32_SEL]);

	n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
	if (n == NULL)
		return -ENOBUFS;

#ifdef CONFIG_CLS_U32_PERF
	size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64);
	n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt));
	if (!n->pf) {
		kfree(n);
		return -ENOBUFS;
	}
#endif

	memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
	RCU_INIT_POINTER(n->ht_up, ht);
	n->handle = handle;
	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
	n->flags = flags;
	n->tp = tp;

	err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
	if (err < 0)
		goto errout;

#ifdef CONFIG_CLS_U32_MARK
	n->pcpu_success = alloc_percpu(u32);
	if (!n->pcpu_success) {
		err = -ENOMEM;
		goto errout;
	}

	if (tb[TCA_U32_MARK]) {
		struct tc_u32_mark *mark;

		mark = nla_data(tb[TCA_U32_MARK]);
		n->val = mark->val;
		n->mask = mark->mask;
	}
#endif

	err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr);
	if (err == 0) {
		struct tc_u_knode __rcu **ins;
		struct tc_u_knode *pins;

		err = u32_replace_hw_knode(tp, n, flags);
		if (err)
			goto errhw;

		ins = &ht->ht[TC_U32_HASH(handle)];
		for (pins = rtnl_dereference(*ins); pins;
		     ins = &pins->next, pins = rtnl_dereference(*ins))
			if (TC_U32_NODE(handle) < TC_U32_NODE(pins->handle))
				break;

		RCU_INIT_POINTER(n->next, pins);
		rcu_assign_pointer(*ins, n);
		*arg = (unsigned long)n;
		return 0;
	}

errhw:
#ifdef CONFIG_CLS_U32_MARK
	free_percpu(n->pcpu_success);
#endif

errout:
	tcf_exts_destroy(&n->exts);
#ifdef CONFIG_CLS_U32_PERF
	free_percpu(n->pf);
#endif
	kfree(n);
	return err;
}

static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
	struct tc_u_common *tp_c = tp->data;
	struct tc_u_hnode *ht;
	struct tc_u_knode *n;
	unsigned int h;

	if (arg->stop)
		return;

	for (ht = rtnl_dereference(tp_c->hlist);
	     ht;
	     ht = rtnl_dereference(ht->next)) {
		if (ht->prio != tp->prio)
			continue;
		if (arg->count >= arg->skip) {
			if (arg->fn(tp, (unsigned long)ht, arg) < 0) {
				arg->stop = 1;
				return;
			}
		}
		arg->count++;
		for (h = 0; h <= ht->divisor; h++) {
			for (n = rtnl_dereference(ht->ht[h]);
			     n;
			     n = rtnl_dereference(n->next)) {
				if (arg->count < arg->skip) {
					arg->count++;
					continue;
				}
				if (arg->fn(tp, (unsigned long)n, arg) < 0) {
					arg->stop = 1;
					return;
				}
				arg->count++;
			}
		}
	}
}

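/* Dump one hash table or key node back to userspace, folding the
 * per-cpu counters into a single tc_u32_pcnt when
 * CONFIG_CLS_U32_PERF is enabled.
 */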
static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
		    struct sk_buff *skb, struct tcmsg *t)
{
	struct tc_u_knode *n = (struct tc_u_knode *)fh;
	struct tc_u_hnode *ht_up, *ht_down;
	struct nlattr *nest;

	if (n == NULL)
		return skb->len;

	t->tcm_handle = n->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	if (TC_U32_KEY(n->handle) == 0) {
		struct tc_u_hnode *ht = (struct tc_u_hnode *)fh;
		u32 divisor = ht->divisor + 1;

		if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor))
			goto nla_put_failure;
	} else {
#ifdef CONFIG_CLS_U32_PERF
		struct tc_u32_pcnt *gpf;
		int cpu;
#endif

		if (nla_put(skb, TCA_U32_SEL,
			    sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
			    &n->sel))
			goto nla_put_failure;

		ht_up = rtnl_dereference(n->ht_up);
		if (ht_up) {
			u32 htid = n->handle & 0xFFFFF000;

			if (nla_put_u32(skb, TCA_U32_HASH, htid))
				goto nla_put_failure;
		}
		if (n->res.classid &&
		    nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid))
			goto nla_put_failure;

		ht_down = rtnl_dereference(n->ht_down);
		if (ht_down &&
		    nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
			goto nla_put_failure;

		if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags))
			goto nla_put_failure;

#ifdef CONFIG_CLS_U32_MARK
		if (n->val || n->mask) {
			struct tc_u32_mark mark = {.val = n->val,
						   .mask = n->mask,
						   .success = 0};
			int cpum;

			for_each_possible_cpu(cpum) {
				__u32 cnt = *per_cpu_ptr(n->pcpu_success, cpum);

				mark.success += cnt;
			}

			if (nla_put(skb, TCA_U32_MARK, sizeof(mark), &mark))
				goto nla_put_failure;
		}
#endif

		if (tcf_exts_dump(skb, &n->exts) < 0)
			goto nla_put_failure;

#ifdef CONFIG_NET_CLS_IND
		if (n->ifindex) {
			struct net_device *dev;

			dev = __dev_get_by_index(net, n->ifindex);
			if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name))
				goto nla_put_failure;
		}
#endif
#ifdef CONFIG_CLS_U32_PERF
		gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
			      n->sel.nkeys * sizeof(u64),
			      GFP_KERNEL);
		if (!gpf)
			goto nla_put_failure;

		for_each_possible_cpu(cpu) {
			int i;
			struct tc_u32_pcnt *pf = per_cpu_ptr(n->pf, cpu);

			gpf->rcnt += pf->rcnt;
			gpf->rhit += pf->rhit;
			for (i = 0; i < n->sel.nkeys; i++)
				gpf->kcnts[i] += pf->kcnts[i];
		}

		if (nla_put_64bit(skb, TCA_U32_PCNT,
				  sizeof(struct tc_u32_pcnt) +
				  n->sel.nkeys * sizeof(u64),
				  gpf, TCA_U32_PAD)) {
			kfree(gpf);
			goto nla_put_failure;
		}
		kfree(gpf);
#endif
	}

	nla_nest_end(skb, nest);

	if (TC_U32_KEY(n->handle))
		if (tcf_exts_dump_stats(skb, &n->exts) < 0)
			goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static struct tcf_proto_ops cls_u32_ops __read_mostly = {
	.kind		=	"u32",
	.classify	=	u32_classify,
	.init		=	u32_init,
	.destroy	=	u32_destroy,
	.get		=	u32_get,
	.change		=	u32_change,
	.delete		=	u32_delete,
	.walk		=	u32_walk,
	.dump		=	u32_dump,
	.owner		=	THIS_MODULE,
};

static int __init init_u32(void)
{
	pr_info("u32 classifier\n");
#ifdef CONFIG_CLS_U32_PERF
	pr_info("    Performance counters on\n");
#endif
#ifdef CONFIG_NET_CLS_IND
	pr_info("    input device check on\n");
#endif
#ifdef CONFIG_NET_CLS_ACT
	pr_info("    Actions configured\n");
#endif
	return register_tcf_proto_ops(&cls_u32_ops);
}

static void __exit exit_u32(void)
{
	unregister_tcf_proto_ops(&cls_u32_ops);
}

module_init(init_u32)
module_exit(exit_u32)
MODULE_LICENSE("GPL");