xref: /linux/net/sched/sch_htb.c (revision ecbdf3da7813db00014ff8b091bb66303bba29a0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
4  *
5  * Authors:	Martin Devera, <devik@cdi.cz>
6  *
7  * Credits (in time order) for older HTB versions:
8  *              Stef Coene <stef.coene@docum.org>
9  *			HTB support at LARTC mailing list
10  *		Ondrej Kraus, <krauso@barr.cz>
11  *			found missing INIT_QDISC(htb)
12  *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
13  *			helped a lot to locate nasty class stall bug
14  *		Andi Kleen, Jamal Hadi, Bert Hubert
15  *			code review and helpful comments on shaping
16  *		Tomasz Wrona, <tw@eter.tym.pl>
17  *			created test case so that I was able to fix nasty bug
18  *		Wilfried Weissmann
19  *			spotted bug in dequeue code and helped with fix
20  *		Jiri Fojtasek
21  *			fixed requeue routine
22  *		and many others. thanks.
23  */
24 #include <linux/module.h>
25 #include <linux/moduleparam.h>
26 #include <linux/types.h>
27 #include <linux/kernel.h>
28 #include <linux/string.h>
29 #include <linux/errno.h>
30 #include <linux/skbuff.h>
31 #include <linux/list.h>
32 #include <linux/compiler.h>
33 #include <linux/rbtree.h>
34 #include <linux/workqueue.h>
35 #include <linux/slab.h>
36 #include <net/netlink.h>
37 #include <net/sch_generic.h>
38 #include <net/pkt_sched.h>
39 #include <net/pkt_cls.h>
40 
41 /* HTB algorithm.
42     Author: devik@cdi.cz
43     ========================================================================
44     HTB is like TBF with multiple classes. It is also similar to CBQ because
45     it allows to assign priority to each class in hierarchy.
46     In fact it is another implementation of Floyd's formal sharing.
47 
48     Levels:
49     Each class is assigned level. Leaf has ALWAYS level 0 and root
50     classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
51     one less than their parent.
52 */
53 
54 static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
55 #define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */
56 
57 #if HTB_VER >> 16 != TC_HTB_PROTOVER
58 #error "Mismatched sch_htb.c and pkt_sch.h"
59 #endif
60 
61 /* Module parameter and sysfs export */
62 module_param    (htb_hysteresis, int, 0640);
63 MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
64 
65 static int htb_rate_est = 0; /* htb classes have a default rate estimator */
66 module_param(htb_rate_est, int, 0640);
67 MODULE_PARM_DESC(htb_rate_est, "setup a default rate estimator (4sec 16sec) for htb classes");
68 
69 /* used internaly to keep status of single class */
70 enum htb_cmode {
71 	HTB_CANT_SEND,		/* class can't send and can't borrow */
72 	HTB_MAY_BORROW,		/* class can't send but may borrow */
73 	HTB_CAN_SEND		/* class can send */
74 };
75 
76 struct htb_prio {
77 	union {
78 		struct rb_root	row;
79 		struct rb_root	feed;
80 	};
81 	struct rb_node	*ptr;
82 	/* When class changes from state 1->2 and disconnects from
83 	 * parent's feed then we lost ptr value and start from the
84 	 * first child again. Here we store classid of the
85 	 * last valid ptr (used when ptr is NULL).
86 	 */
87 	u32		last_ptr_id;
88 };
89 
90 /* interior & leaf nodes; props specific to leaves are marked L:
91  * To reduce false sharing, place mostly read fields at beginning,
92  * and mostly written ones at the end.
93  */
94 struct htb_class {
95 	struct Qdisc_class_common common;
96 	struct psched_ratecfg	rate;
97 	struct psched_ratecfg	ceil;
98 	s64			buffer, cbuffer;/* token bucket depth/rate */
99 	s64			mbuffer;	/* max wait time */
100 	u32			prio;		/* these two are used only by leaves... */
101 	int			quantum;	/* but stored for parent-to-leaf return */
102 
103 	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
104 	struct tcf_block	*block;
105 
106 	int			level;		/* our level (see above) */
107 	unsigned int		children;
108 	struct htb_class	*parent;	/* parent class */
109 
110 	struct net_rate_estimator __rcu *rate_est;
111 
112 	/*
113 	 * Written often fields
114 	 */
115 	struct gnet_stats_basic_sync bstats;
116 	struct gnet_stats_basic_sync bstats_bias;
117 	u32			xstats_lends;
118 	u32			xstats_borrows;
119 
120 	/* token bucket parameters */
121 	s64			tokens, ctokens;/* current number of tokens */
122 	s64			t_c;		/* checkpoint time */
123 
124 	union {
125 		struct htb_class_leaf {
126 			int		deficit[TC_HTB_MAXDEPTH];
127 			struct Qdisc	*q;
128 			struct netdev_queue *offload_queue;
129 		} leaf;
130 		struct htb_class_inner {
131 			struct htb_prio clprio[TC_HTB_NUMPRIO];
132 		} inner;
133 	};
134 	s64			pq_key;
135 
136 	int			prio_activity;	/* for which prios are we active */
137 	enum htb_cmode		cmode;		/* current mode of the class */
138 	struct rb_node		pq_node;	/* node for event queue */
139 	struct rb_node		node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
140 
141 	unsigned int drops ____cacheline_aligned_in_smp;
142 	unsigned int		overlimits;
143 };
144 
145 struct htb_level {
146 	struct rb_root	wait_pq;
147 	struct htb_prio hprio[TC_HTB_NUMPRIO];
148 };
149 
150 struct htb_sched {
151 	struct Qdisc_class_hash clhash;
152 	int			defcls;		/* class where unclassified flows go to */
153 	int			rate2quantum;	/* quant = rate / rate2quantum */
154 
155 	/* filters for qdisc itself */
156 	struct tcf_proto __rcu	*filter_list;
157 	struct tcf_block	*block;
158 
159 #define HTB_WARN_TOOMANYEVENTS	0x1
160 	unsigned int		warned;	/* only one warning */
161 	int			direct_qlen;
162 	struct work_struct	work;
163 
164 	/* non shaped skbs; let them go directly thru */
165 	struct qdisc_skb_head	direct_queue;
166 	u32			direct_pkts;
167 	u32			overlimits;
168 
169 	struct qdisc_watchdog	watchdog;
170 
171 	s64			now;	/* cached dequeue time */
172 
173 	/* time of nearest event per level (row) */
174 	s64			near_ev_cache[TC_HTB_MAXDEPTH];
175 
176 	int			row_mask[TC_HTB_MAXDEPTH];
177 
178 	struct htb_level	hlevel[TC_HTB_MAXDEPTH];
179 
180 	struct Qdisc		**direct_qdiscs;
181 	unsigned int            num_direct_qdiscs;
182 
183 	bool			offload;
184 };
185 
186 /* find class in global hash table using given handle */
187 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
188 {
189 	struct htb_sched *q = qdisc_priv(sch);
190 	struct Qdisc_class_common *clc;
191 
192 	clc = qdisc_class_find(&q->clhash, handle);
193 	if (clc == NULL)
194 		return NULL;
195 	return container_of(clc, struct htb_class, common);
196 }
197 
198 static unsigned long htb_search(struct Qdisc *sch, u32 handle)
199 {
200 	return (unsigned long)htb_find(handle, sch);
201 }
202 
203 #define HTB_DIRECT ((struct htb_class *)-1L)
204 
205 /**
206  * htb_classify - classify a packet into class
207  * @skb: the socket buffer
208  * @sch: the active queue discipline
209  * @qerr: pointer for returned status code
210  *
211  * It returns NULL if the packet should be dropped or -1 if the packet
212  * should be passed directly thru. In all other cases leaf class is returned.
213  * We allow direct class selection by classid in priority. The we examine
214  * filters in qdisc and in inner nodes (if higher filter points to the inner
215  * node). If we end up with classid MAJOR:0 we enqueue the skb into special
216  * internal fifo (direct). These packets then go directly thru. If we still
217  * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessful
218  * then finish and return direct queue.
219  */
220 static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
221 				      int *qerr)
222 {
223 	struct htb_sched *q = qdisc_priv(sch);
224 	struct htb_class *cl;
225 	struct tcf_result res;
226 	struct tcf_proto *tcf;
227 	int result;
228 
229 	/* allow to select class by setting skb->priority to valid classid;
230 	 * note that nfmark can be used too by attaching filter fw with no
231 	 * rules in it
232 	 */
233 	if (skb->priority == sch->handle)
234 		return HTB_DIRECT;	/* X:0 (direct flow) selected */
235 	cl = htb_find(skb->priority, sch);
236 	if (cl) {
237 		if (cl->level == 0)
238 			return cl;
239 		/* Start with inner filter chain if a non-leaf class is selected */
240 		tcf = rcu_dereference_bh(cl->filter_list);
241 	} else {
242 		tcf = rcu_dereference_bh(q->filter_list);
243 	}
244 
245 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
246 	while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) {
247 #ifdef CONFIG_NET_CLS_ACT
248 		switch (result) {
249 		case TC_ACT_QUEUED:
250 		case TC_ACT_STOLEN:
251 		case TC_ACT_TRAP:
252 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
253 			fallthrough;
254 		case TC_ACT_SHOT:
255 			return NULL;
256 		}
257 #endif
258 		cl = (void *)res.class;
259 		if (!cl) {
260 			if (res.classid == sch->handle)
261 				return HTB_DIRECT;	/* X:0 (direct flow) */
262 			cl = htb_find(res.classid, sch);
263 			if (!cl)
264 				break;	/* filter selected invalid classid */
265 		}
266 		if (!cl->level)
267 			return cl;	/* we hit leaf; return it */
268 
269 		/* we have got inner class; apply inner filter chain */
270 		tcf = rcu_dereference_bh(cl->filter_list);
271 	}
272 	/* classification failed; try to use default class */
273 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
274 	if (!cl || cl->level)
275 		return HTB_DIRECT;	/* bad default .. this is safe bet */
276 	return cl;
277 }
278 
279 /**
280  * htb_add_to_id_tree - adds class to the round robin list
281  * @root: the root of the tree
282  * @cl: the class to add
283  * @prio: the give prio in class
284  *
285  * Routine adds class to the list (actually tree) sorted by classid.
286  * Make sure that class is not already on such list for given prio.
287  */
288 static void htb_add_to_id_tree(struct rb_root *root,
289 			       struct htb_class *cl, int prio)
290 {
291 	struct rb_node **p = &root->rb_node, *parent = NULL;
292 
293 	while (*p) {
294 		struct htb_class *c;
295 		parent = *p;
296 		c = rb_entry(parent, struct htb_class, node[prio]);
297 
298 		if (cl->common.classid > c->common.classid)
299 			p = &parent->rb_right;
300 		else
301 			p = &parent->rb_left;
302 	}
303 	rb_link_node(&cl->node[prio], parent, p);
304 	rb_insert_color(&cl->node[prio], root);
305 }
306 
307 /**
308  * htb_add_to_wait_tree - adds class to the event queue with delay
309  * @q: the priority event queue
310  * @cl: the class to add
311  * @delay: delay in microseconds
312  *
313  * The class is added to priority event queue to indicate that class will
314  * change its mode in cl->pq_key microseconds. Make sure that class is not
315  * already in the queue.
316  */
317 static void htb_add_to_wait_tree(struct htb_sched *q,
318 				 struct htb_class *cl, s64 delay)
319 {
320 	struct rb_node **p = &q->hlevel[cl->level].wait_pq.rb_node, *parent = NULL;
321 
322 	cl->pq_key = q->now + delay;
323 	if (cl->pq_key == q->now)
324 		cl->pq_key++;
325 
326 	/* update the nearest event cache */
327 	if (q->near_ev_cache[cl->level] > cl->pq_key)
328 		q->near_ev_cache[cl->level] = cl->pq_key;
329 
330 	while (*p) {
331 		struct htb_class *c;
332 		parent = *p;
333 		c = rb_entry(parent, struct htb_class, pq_node);
334 		if (cl->pq_key >= c->pq_key)
335 			p = &parent->rb_right;
336 		else
337 			p = &parent->rb_left;
338 	}
339 	rb_link_node(&cl->pq_node, parent, p);
340 	rb_insert_color(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
341 }
342 
343 /**
344  * htb_next_rb_node - finds next node in binary tree
345  * @n: the current node in binary tree
346  *
347  * When we are past last key we return NULL.
348  * Average complexity is 2 steps per call.
349  */
350 static inline void htb_next_rb_node(struct rb_node **n)
351 {
352 	if (*n)
353 		*n = rb_next(*n);
354 }
355 
356 /**
357  * htb_add_class_to_row - add class to its row
358  * @q: the priority event queue
359  * @cl: the class to add
360  * @mask: the given priorities in class in bitmap
361  *
362  * The class is added to row at priorities marked in mask.
363  * It does nothing if mask == 0.
364  */
365 static inline void htb_add_class_to_row(struct htb_sched *q,
366 					struct htb_class *cl, int mask)
367 {
368 	q->row_mask[cl->level] |= mask;
369 	while (mask) {
370 		int prio = ffz(~mask);
371 		mask &= ~(1 << prio);
372 		htb_add_to_id_tree(&q->hlevel[cl->level].hprio[prio].row, cl, prio);
373 	}
374 }
375 
376 /* If this triggers, it is a bug in this code, but it need not be fatal */
377 static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
378 {
379 	if (RB_EMPTY_NODE(rb)) {
380 		WARN_ON(1);
381 	} else {
382 		rb_erase(rb, root);
383 		RB_CLEAR_NODE(rb);
384 	}
385 }
386 
387 
388 /**
389  * htb_remove_class_from_row - removes class from its row
390  * @q: the priority event queue
391  * @cl: the class to add
392  * @mask: the given priorities in class in bitmap
393  *
394  * The class is removed from row at priorities marked in mask.
395  * It does nothing if mask == 0.
396  */
397 static inline void htb_remove_class_from_row(struct htb_sched *q,
398 						 struct htb_class *cl, int mask)
399 {
400 	int m = 0;
401 	struct htb_level *hlevel = &q->hlevel[cl->level];
402 
403 	while (mask) {
404 		int prio = ffz(~mask);
405 		struct htb_prio *hprio = &hlevel->hprio[prio];
406 
407 		mask &= ~(1 << prio);
408 		if (hprio->ptr == cl->node + prio)
409 			htb_next_rb_node(&hprio->ptr);
410 
411 		htb_safe_rb_erase(cl->node + prio, &hprio->row);
412 		if (!hprio->row.rb_node)
413 			m |= 1 << prio;
414 	}
415 	q->row_mask[cl->level] &= ~m;
416 }
417 
418 /**
419  * htb_activate_prios - creates active classe's feed chain
420  * @q: the priority event queue
421  * @cl: the class to activate
422  *
423  * The class is connected to ancestors and/or appropriate rows
424  * for priorities it is participating on. cl->cmode must be new
425  * (activated) mode. It does nothing if cl->prio_activity == 0.
426  */
427 static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
428 {
429 	struct htb_class *p = cl->parent;
430 	long m, mask = cl->prio_activity;
431 
432 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
433 		m = mask;
434 		while (m) {
435 			unsigned int prio = ffz(~m);
436 
437 			if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio)))
438 				break;
439 			m &= ~(1 << prio);
440 
441 			if (p->inner.clprio[prio].feed.rb_node)
442 				/* parent already has its feed in use so that
443 				 * reset bit in mask as parent is already ok
444 				 */
445 				mask &= ~(1 << prio);
446 
447 			htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
448 		}
449 		p->prio_activity |= mask;
450 		cl = p;
451 		p = cl->parent;
452 
453 	}
454 	if (cl->cmode == HTB_CAN_SEND && mask)
455 		htb_add_class_to_row(q, cl, mask);
456 }
457 
458 /**
459  * htb_deactivate_prios - remove class from feed chain
460  * @q: the priority event queue
461  * @cl: the class to deactivate
462  *
463  * cl->cmode must represent old mode (before deactivation). It does
464  * nothing if cl->prio_activity == 0. Class is removed from all feed
465  * chains and rows.
466  */
467 static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
468 {
469 	struct htb_class *p = cl->parent;
470 	long m, mask = cl->prio_activity;
471 
472 	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
473 		m = mask;
474 		mask = 0;
475 		while (m) {
476 			int prio = ffz(~m);
477 			m &= ~(1 << prio);
478 
479 			if (p->inner.clprio[prio].ptr == cl->node + prio) {
480 				/* we are removing child which is pointed to from
481 				 * parent feed - forget the pointer but remember
482 				 * classid
483 				 */
484 				p->inner.clprio[prio].last_ptr_id = cl->common.classid;
485 				p->inner.clprio[prio].ptr = NULL;
486 			}
487 
488 			htb_safe_rb_erase(cl->node + prio,
489 					  &p->inner.clprio[prio].feed);
490 
491 			if (!p->inner.clprio[prio].feed.rb_node)
492 				mask |= 1 << prio;
493 		}
494 
495 		p->prio_activity &= ~mask;
496 		cl = p;
497 		p = cl->parent;
498 
499 	}
500 	if (cl->cmode == HTB_CAN_SEND && mask)
501 		htb_remove_class_from_row(q, cl, mask);
502 }
503 
504 static inline s64 htb_lowater(const struct htb_class *cl)
505 {
506 	if (htb_hysteresis)
507 		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
508 	else
509 		return 0;
510 }
511 static inline s64 htb_hiwater(const struct htb_class *cl)
512 {
513 	if (htb_hysteresis)
514 		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
515 	else
516 		return 0;
517 }
518 
519 
520 /**
521  * htb_class_mode - computes and returns current class mode
522  * @cl: the target class
523  * @diff: diff time in microseconds
524  *
525  * It computes cl's mode at time cl->t_c+diff and returns it. If mode
526  * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
527  * from now to time when cl will change its state.
528  * Also it is worth to note that class mode doesn't change simply
529  * at cl->{c,}tokens == 0 but there can rather be hysteresis of
530  * 0 .. -cl->{c,}buffer range. It is meant to limit number of
531  * mode transitions per time unit. The speed gain is about 1/6.
532  */
533 static inline enum htb_cmode
534 htb_class_mode(struct htb_class *cl, s64 *diff)
535 {
536 	s64 toks;
537 
538 	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
539 		*diff = -toks;
540 		return HTB_CANT_SEND;
541 	}
542 
543 	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
544 		return HTB_CAN_SEND;
545 
546 	*diff = -toks;
547 	return HTB_MAY_BORROW;
548 }
549 
550 /**
551  * htb_change_class_mode - changes classe's mode
552  * @q: the priority event queue
553  * @cl: the target class
554  * @diff: diff time in microseconds
555  *
556  * This should be the only way how to change classe's mode under normal
557  * circumstances. Routine will update feed lists linkage, change mode
558  * and add class to the wait event queue if appropriate. New mode should
559  * be different from old one and cl->pq_key has to be valid if changing
560  * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
561  */
562 static void
563 htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
564 {
565 	enum htb_cmode new_mode = htb_class_mode(cl, diff);
566 
567 	if (new_mode == cl->cmode)
568 		return;
569 
570 	if (new_mode == HTB_CANT_SEND) {
571 		WRITE_ONCE(cl->overlimits, cl->overlimits + 1);
572 		WRITE_ONCE(q->overlimits, q->overlimits + 1);
573 	}
574 
575 	if (cl->prio_activity) {	/* not necessary: speed optimization */
576 		if (cl->cmode != HTB_CANT_SEND)
577 			htb_deactivate_prios(q, cl);
578 		cl->cmode = new_mode;
579 		if (new_mode != HTB_CANT_SEND)
580 			htb_activate_prios(q, cl);
581 	} else
582 		cl->cmode = new_mode;
583 }
584 
585 /**
586  * htb_activate - inserts leaf cl into appropriate active feeds
587  * @q: the priority event queue
588  * @cl: the target class
589  *
590  * Routine learns (new) priority of leaf and activates feed chain
591  * for the prio. It can be called on already active leaf safely.
592  * It also adds leaf into droplist.
593  */
594 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
595 {
596 	WARN_ON(cl->level || !cl->leaf.q);
597 
598 	if (!cl->prio_activity) {
599 		cl->prio_activity = 1 << cl->prio;
600 		htb_activate_prios(q, cl);
601 	}
602 }
603 
604 /**
605  * htb_deactivate - remove leaf cl from active feeds
606  * @q: the priority event queue
607  * @cl: the target class
608  *
609  * Make sure that leaf is active. In the other words it can't be called
610  * with non-active leaf. It also removes class from the drop list.
611  */
612 static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
613 {
614 	if (!cl->prio_activity)
615 		return;
616 	htb_deactivate_prios(q, cl);
617 	cl->prio_activity = 0;
618 }
619 
620 static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
621 		       struct sk_buff **to_free)
622 {
623 	int ret;
624 	unsigned int len = qdisc_pkt_len(skb);
625 	struct htb_sched *q = qdisc_priv(sch);
626 	struct htb_class *cl = htb_classify(skb, sch, &ret);
627 
628 	if (cl == HTB_DIRECT) {
629 		/* enqueue to helper queue */
630 		if (q->direct_queue.qlen < q->direct_qlen) {
631 			__qdisc_enqueue_tail(skb, &q->direct_queue);
632 			WRITE_ONCE(q->direct_pkts, q->direct_pkts + 1);
633 		} else {
634 			return qdisc_drop(skb, sch, to_free);
635 		}
636 #ifdef CONFIG_NET_CLS_ACT
637 	} else if (!cl) {
638 		if (ret & __NET_XMIT_BYPASS)
639 			qdisc_qstats_drop(sch);
640 		__qdisc_drop(skb, to_free);
641 		return ret;
642 #endif
643 	} else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
644 					to_free)) != NET_XMIT_SUCCESS) {
645 		if (net_xmit_drop_count(ret)) {
646 			qdisc_qstats_drop(sch);
647 			WRITE_ONCE(cl->drops, cl->drops + 1);
648 		}
649 		return ret;
650 	} else {
651 		htb_activate(q, cl);
652 	}
653 
654 	qstats_backlog_add(sch, len);
655 	qdisc_qlen_inc(sch);
656 	return NET_XMIT_SUCCESS;
657 }
658 
659 static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff)
660 {
661 	s64 toks = diff + cl->tokens;
662 
663 	if (toks > cl->buffer)
664 		toks = cl->buffer;
665 	toks -= (s64) psched_l2t_ns(&cl->rate, bytes);
666 	if (toks <= -cl->mbuffer)
667 		toks = 1 - cl->mbuffer;
668 
669 	WRITE_ONCE(cl->tokens, toks);
670 }
671 
672 static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff)
673 {
674 	s64 toks = diff + cl->ctokens;
675 
676 	if (toks > cl->cbuffer)
677 		toks = cl->cbuffer;
678 	toks -= (s64) psched_l2t_ns(&cl->ceil, bytes);
679 	if (toks <= -cl->mbuffer)
680 		toks = 1 - cl->mbuffer;
681 
682 	WRITE_ONCE(cl->ctokens, toks);
683 }
684 
685 /**
686  * htb_charge_class - charges amount "bytes" to leaf and ancestors
687  * @q: the priority event queue
688  * @cl: the class to start iterate
689  * @level: the minimum level to account
690  * @skb: the socket buffer
691  *
692  * Routine assumes that packet "bytes" long was dequeued from leaf cl
693  * borrowing from "level". It accounts bytes to ceil leaky bucket for
694  * leaf and all ancestors and to rate bucket for ancestors at levels
695  * "level" and higher. It also handles possible change of mode resulting
696  * from the update. Note that mode can also increase here (MAY_BORROW to
697  * CAN_SEND) because we can use more precise clock that event queue here.
698  * In such case we remove class from event queue first.
699  */
700 static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
701 			     int level, struct sk_buff *skb)
702 {
703 	int bytes = qdisc_pkt_len(skb);
704 	enum htb_cmode old_mode;
705 	s64 diff;
706 
707 	while (cl) {
708 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
709 		if (cl->level >= level) {
710 			if (cl->level == level)
711 				WRITE_ONCE(cl->xstats_lends, cl->xstats_lends + 1);
712 			htb_accnt_tokens(cl, bytes, diff);
713 		} else {
714 			WRITE_ONCE(cl->xstats_borrows, cl->xstats_borrows + 1);
715 			/* we moved t_c; update tokens */
716 			WRITE_ONCE(cl->tokens, cl->tokens + diff);
717 		}
718 		htb_accnt_ctokens(cl, bytes, diff);
719 		cl->t_c = q->now;
720 
721 		old_mode = cl->cmode;
722 		diff = 0;
723 		htb_change_class_mode(q, cl, &diff);
724 		if (old_mode != cl->cmode) {
725 			if (old_mode != HTB_CAN_SEND)
726 				htb_safe_rb_erase(&cl->pq_node, &q->hlevel[cl->level].wait_pq);
727 			if (cl->cmode != HTB_CAN_SEND)
728 				htb_add_to_wait_tree(q, cl, diff);
729 		}
730 
731 		/* update basic stats except for leaves which are already updated */
732 		if (cl->level)
733 			bstats_update(&cl->bstats, skb);
734 
735 		cl = cl->parent;
736 	}
737 }
738 
739 /**
740  * htb_do_events - make mode changes to classes at the level
741  * @q: the priority event queue
742  * @level: which wait_pq in 'q->hlevel'
743  * @start: start jiffies
744  *
745  * Scans event queue for pending events and applies them. Returns time of
746  * next pending event (0 for no event in pq, q->now for too many events).
747  * Note: Applied are events whose have cl->pq_key <= q->now.
748  */
749 static s64 htb_do_events(struct htb_sched *q, const int level,
750 			 unsigned long start)
751 {
752 	/* don't run for longer than 2 jiffies; 2 is used instead of
753 	 * 1 to simplify things when jiffy is going to be incremented
754 	 * too soon
755 	 */
756 	unsigned long stop_at = start + 2;
757 	struct rb_root *wait_pq = &q->hlevel[level].wait_pq;
758 
759 	while (time_before(jiffies, stop_at)) {
760 		struct htb_class *cl;
761 		s64 diff;
762 		struct rb_node *p = rb_first(wait_pq);
763 
764 		if (!p)
765 			return 0;
766 
767 		cl = rb_entry(p, struct htb_class, pq_node);
768 		if (cl->pq_key > q->now)
769 			return cl->pq_key;
770 
771 		htb_safe_rb_erase(p, wait_pq);
772 		diff = min_t(s64, q->now - cl->t_c, cl->mbuffer);
773 		htb_change_class_mode(q, cl, &diff);
774 		if (cl->cmode != HTB_CAN_SEND)
775 			htb_add_to_wait_tree(q, cl, diff);
776 	}
777 
778 	/* too much load - let's continue after a break for scheduling */
779 	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
780 		pr_warn("htb: too many events!\n");
781 		q->warned |= HTB_WARN_TOOMANYEVENTS;
782 	}
783 
784 	return q->now;
785 }
786 
787 /* Returns class->node+prio from id-tree where classe's id is >= id. NULL
788  * is no such one exists.
789  */
790 static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
791 					      u32 id)
792 {
793 	struct rb_node *r = NULL;
794 	while (n) {
795 		struct htb_class *cl =
796 		    rb_entry(n, struct htb_class, node[prio]);
797 
798 		if (id > cl->common.classid) {
799 			n = n->rb_right;
800 		} else if (id < cl->common.classid) {
801 			r = n;
802 			n = n->rb_left;
803 		} else {
804 			return n;
805 		}
806 	}
807 	return r;
808 }
809 
810 /**
811  * htb_lookup_leaf - returns next leaf class in DRR order
812  * @hprio: the current one
813  * @prio: which prio in class
814  *
815  * Find leaf where current feed pointers points to.
816  */
817 static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
818 {
819 	int i;
820 	struct {
821 		struct rb_node *root;
822 		struct rb_node **pptr;
823 		u32 *pid;
824 	} stk[TC_HTB_MAXDEPTH], *sp = stk;
825 
826 	if (unlikely(!hprio->row.rb_node))
827 		return NULL;
828 
829 	sp->root = hprio->row.rb_node;
830 	sp->pptr = &hprio->ptr;
831 	sp->pid = &hprio->last_ptr_id;
832 
833 	for (i = 0; i < 65535; i++) {
834 		if (!*sp->pptr && *sp->pid) {
835 			/* ptr was invalidated but id is valid - try to recover
836 			 * the original or next ptr
837 			 */
838 			*sp->pptr =
839 			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
840 		}
841 		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
842 				 * can become out of date quickly
843 				 */
844 		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
845 			*sp->pptr = sp->root;
846 			while ((*sp->pptr)->rb_left)
847 				*sp->pptr = (*sp->pptr)->rb_left;
848 			if (sp > stk) {
849 				sp--;
850 				if (!*sp->pptr) {
851 					WARN_ON(1);
852 					return NULL;
853 				}
854 				htb_next_rb_node(sp->pptr);
855 			}
856 		} else {
857 			struct htb_class *cl;
858 			struct htb_prio *clp;
859 
860 			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
861 			if (!cl->level)
862 				return cl;
863 			clp = &cl->inner.clprio[prio];
864 			(++sp)->root = clp->feed.rb_node;
865 			sp->pptr = &clp->ptr;
866 			sp->pid = &clp->last_ptr_id;
867 		}
868 	}
869 	WARN_ON(1);
870 	return NULL;
871 }
872 
873 /* dequeues packet at given priority and level; call only if
874  * you are sure that there is active class at prio/level
875  */
876 static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio,
877 					const int level)
878 {
879 	struct sk_buff *skb = NULL;
880 	struct htb_class *cl, *start;
881 	struct htb_level *hlevel = &q->hlevel[level];
882 	struct htb_prio *hprio = &hlevel->hprio[prio];
883 
884 	/* look initial class up in the row */
885 	start = cl = htb_lookup_leaf(hprio, prio);
886 
887 	do {
888 next:
889 		if (unlikely(!cl))
890 			return NULL;
891 
892 		/* class can be empty - it is unlikely but can be true if leaf
893 		 * qdisc drops packets in enqueue routine or if someone used
894 		 * graft operation on the leaf since last dequeue;
895 		 * simply deactivate and skip such class
896 		 */
897 		if (unlikely(cl->leaf.q->q.qlen == 0)) {
898 			struct htb_class *next;
899 			htb_deactivate(q, cl);
900 
901 			/* row/level might become empty */
902 			if ((q->row_mask[level] & (1 << prio)) == 0)
903 				return NULL;
904 
905 			next = htb_lookup_leaf(hprio, prio);
906 
907 			if (cl == start)	/* fix start if we just deleted it */
908 				start = next;
909 			cl = next;
910 			goto next;
911 		}
912 
913 		skb = cl->leaf.q->dequeue(cl->leaf.q);
914 		if (likely(skb != NULL))
915 			break;
916 
917 		qdisc_warn_nonwc("htb", cl->leaf.q);
918 		htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
919 					 &q->hlevel[0].hprio[prio].ptr);
920 		cl = htb_lookup_leaf(hprio, prio);
921 
922 	} while (cl != start);
923 
924 	if (likely(skb != NULL)) {
925 		bstats_update(&cl->bstats, skb);
926 		cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
927 		if (cl->leaf.deficit[level] < 0) {
928 			cl->leaf.deficit[level] += cl->quantum;
929 			htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
930 						 &q->hlevel[0].hprio[prio].ptr);
931 		}
932 		/* this used to be after charge_class but this constelation
933 		 * gives us slightly better performance
934 		 */
935 		if (!cl->leaf.q->q.qlen)
936 			htb_deactivate(q, cl);
937 		htb_charge_class(q, cl, level, skb);
938 	}
939 	return skb;
940 }
941 
942 static struct sk_buff *htb_dequeue(struct Qdisc *sch)
943 {
944 	struct sk_buff *skb;
945 	struct htb_sched *q = qdisc_priv(sch);
946 	int level;
947 	s64 next_event;
948 	unsigned long start_at;
949 
950 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
951 	skb = __qdisc_dequeue_head(&q->direct_queue);
952 	if (skb != NULL) {
953 ok:
954 		qdisc_bstats_update(sch, skb);
955 		qdisc_qstats_backlog_dec(sch, skb);
956 		qdisc_qlen_dec(sch);
957 		return skb;
958 	}
959 
960 	if (!sch->q.qlen)
961 		goto fin;
962 	q->now = ktime_get_ns();
963 	start_at = jiffies;
964 
965 	next_event = q->now + 5LLU * NSEC_PER_SEC;
966 
967 	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
968 		/* common case optimization - skip event handler quickly */
969 		int m;
970 		s64 event = q->near_ev_cache[level];
971 
972 		if (q->now >= event) {
973 			event = htb_do_events(q, level, start_at);
974 			if (!event)
975 				event = q->now + NSEC_PER_SEC;
976 			q->near_ev_cache[level] = event;
977 		}
978 
979 		if (next_event > event)
980 			next_event = event;
981 
982 		m = ~q->row_mask[level];
983 		while (m != (int)(-1)) {
984 			int prio = ffz(m);
985 
986 			m |= 1 << prio;
987 			skb = htb_dequeue_tree(q, prio, level);
988 			if (likely(skb != NULL))
989 				goto ok;
990 		}
991 	}
992 	if (likely(next_event > q->now))
993 		qdisc_watchdog_schedule_ns(&q->watchdog, next_event);
994 	else
995 		schedule_work(&q->work);
996 fin:
997 	return skb;
998 }
999 
1000 /* reset all classes */
1001 /* always caled under BH & queue lock */
1002 static void htb_reset(struct Qdisc *sch)
1003 {
1004 	struct htb_sched *q = qdisc_priv(sch);
1005 	struct htb_class *cl;
1006 	unsigned int i;
1007 
1008 	for (i = 0; i < q->clhash.hashsize; i++) {
1009 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1010 			if (cl->level)
1011 				memset(&cl->inner, 0, sizeof(cl->inner));
1012 			else {
1013 				if (cl->leaf.q && !q->offload)
1014 					qdisc_reset(cl->leaf.q);
1015 			}
1016 			cl->prio_activity = 0;
1017 			cl->cmode = HTB_CAN_SEND;
1018 		}
1019 	}
1020 	qdisc_watchdog_cancel(&q->watchdog);
1021 	__qdisc_reset_queue(&q->direct_queue);
1022 	memset(q->hlevel, 0, sizeof(q->hlevel));
1023 	memset(q->row_mask, 0, sizeof(q->row_mask));
1024 }
1025 
1026 static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
1027 	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
1028 	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
1029 	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1030 	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1031 	[TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 },
1032 	[TCA_HTB_RATE64] = { .type = NLA_U64 },
1033 	[TCA_HTB_CEIL64] = { .type = NLA_U64 },
1034 	[TCA_HTB_OFFLOAD] = { .type = NLA_FLAG },
1035 };
1036 
1037 static void htb_work_func(struct work_struct *work)
1038 {
1039 	struct htb_sched *q = container_of(work, struct htb_sched, work);
1040 	struct Qdisc *sch = q->watchdog.qdisc;
1041 
1042 	rcu_read_lock();
1043 	__netif_schedule(qdisc_root(sch));
1044 	rcu_read_unlock();
1045 }
1046 
1047 static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
1048 {
1049 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
1050 }
1051 
1052 static int htb_init(struct Qdisc *sch, struct nlattr *opt,
1053 		    struct netlink_ext_ack *extack)
1054 {
1055 	struct net_device *dev = qdisc_dev(sch);
1056 	struct tc_htb_qopt_offload offload_opt;
1057 	struct htb_sched *q = qdisc_priv(sch);
1058 	struct nlattr *tb[TCA_HTB_MAX + 1];
1059 	struct tc_htb_glob *gopt;
1060 	unsigned int ntx;
1061 	bool offload;
1062 	int err;
1063 
1064 	qdisc_watchdog_init(&q->watchdog, sch);
1065 	INIT_WORK(&q->work, htb_work_func);
1066 
1067 	if (!opt)
1068 		return -EINVAL;
1069 
1070 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
1071 	if (err)
1072 		return err;
1073 
1074 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1075 					  NULL);
1076 	if (err < 0)
1077 		return err;
1078 
1079 	if (!tb[TCA_HTB_INIT])
1080 		return -EINVAL;
1081 
1082 	gopt = nla_data(tb[TCA_HTB_INIT]);
1083 	if (gopt->version != HTB_VER >> 16)
1084 		return -EINVAL;
1085 
1086 	offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]);
1087 
1088 	if (offload) {
1089 		if (sch->parent != TC_H_ROOT) {
1090 			NL_SET_ERR_MSG(extack, "HTB must be the root qdisc to use offload");
1091 			return -EOPNOTSUPP;
1092 		}
1093 
1094 		if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) {
1095 			NL_SET_ERR_MSG(extack, "hw-tc-offload ethtool feature flag must be on");
1096 			return -EOPNOTSUPP;
1097 		}
1098 
1099 		q->num_direct_qdiscs = dev->real_num_tx_queues;
1100 		q->direct_qdiscs = kzalloc_objs(*q->direct_qdiscs,
1101 						q->num_direct_qdiscs);
1102 		if (!q->direct_qdiscs)
1103 			return -ENOMEM;
1104 	}
1105 
1106 	err = qdisc_class_hash_init(&q->clhash);
1107 	if (err < 0)
1108 		return err;
1109 
1110 	if (tb[TCA_HTB_DIRECT_QLEN])
1111 		q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]);
1112 	else
1113 		q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1114 
1115 	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1116 		q->rate2quantum = 1;
1117 	q->defcls = gopt->defcls;
1118 
1119 	if (!offload)
1120 		return 0;
1121 
1122 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1123 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1124 		struct Qdisc *qdisc;
1125 
1126 		qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1127 					  TC_H_MAKE(sch->handle, 0), extack);
1128 		if (!qdisc) {
1129 			return -ENOMEM;
1130 		}
1131 
1132 		q->direct_qdiscs[ntx] = qdisc;
1133 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1134 	}
1135 
1136 	sch->flags |= TCQ_F_MQROOT;
1137 
1138 	offload_opt = (struct tc_htb_qopt_offload) {
1139 		.command = TC_HTB_CREATE,
1140 		.parent_classid = TC_H_MAJ(sch->handle) >> 16,
1141 		.classid = TC_H_MIN(q->defcls),
1142 		.extack = extack,
1143 	};
1144 	err = htb_offload(dev, &offload_opt);
1145 	if (err)
1146 		return err;
1147 
1148 	/* Defer this assignment, so that htb_destroy skips offload-related
1149 	 * parts (especially calling ndo_setup_tc) on errors.
1150 	 */
1151 	q->offload = true;
1152 	sch->flags |= TCQ_F_OFFLOADED;
1153 
1154 	return 0;
1155 }
1156 
1157 static void htb_attach_offload(struct Qdisc *sch)
1158 {
1159 	struct net_device *dev = qdisc_dev(sch);
1160 	struct htb_sched *q = qdisc_priv(sch);
1161 	unsigned int ntx;
1162 
1163 	for (ntx = 0; ntx < q->num_direct_qdiscs; ntx++) {
1164 		struct Qdisc *old, *qdisc = q->direct_qdiscs[ntx];
1165 
1166 		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
1167 		qdisc_put(old);
1168 		qdisc_hash_add(qdisc, false);
1169 	}
1170 	for (ntx = q->num_direct_qdiscs; ntx < dev->num_tx_queues; ntx++) {
1171 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1172 		struct Qdisc *old = dev_graft_qdisc(dev_queue, NULL);
1173 
1174 		qdisc_put(old);
1175 	}
1176 
1177 	kfree(q->direct_qdiscs);
1178 	q->direct_qdiscs = NULL;
1179 }
1180 
1181 static void htb_attach_software(struct Qdisc *sch)
1182 {
1183 	struct net_device *dev = qdisc_dev(sch);
1184 	unsigned int ntx;
1185 
1186 	/* Resemble qdisc_graft behavior. */
1187 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
1188 		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
1189 		struct Qdisc *old = dev_graft_qdisc(dev_queue, sch);
1190 
1191 		qdisc_refcount_inc(sch);
1192 
1193 		qdisc_put(old);
1194 	}
1195 }
1196 
1197 static void htb_attach(struct Qdisc *sch)
1198 {
1199 	struct htb_sched *q = qdisc_priv(sch);
1200 
1201 	if (q->offload)
1202 		htb_attach_offload(sch);
1203 	else
1204 		htb_attach_software(sch);
1205 }
1206 
1207 static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1208 {
1209 	struct htb_sched *q = qdisc_priv(sch);
1210 	struct nlattr *nest;
1211 	struct tc_htb_glob gopt;
1212 
1213 	sch->qstats.overlimits = READ_ONCE(q->overlimits);
1214 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1215 	 * no change can happen on the qdisc parameters.
1216 	 */
1217 
1218 	gopt.direct_pkts = READ_ONCE(q->direct_pkts);
1219 	gopt.version = HTB_VER;
1220 	gopt.rate2quantum = q->rate2quantum;
1221 	gopt.defcls = q->defcls;
1222 	gopt.debug = 0;
1223 
1224 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1225 	if (nest == NULL)
1226 		goto nla_put_failure;
1227 	if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt) ||
1228 	    nla_put_u32(skb, TCA_HTB_DIRECT_QLEN, q->direct_qlen))
1229 		goto nla_put_failure;
1230 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1231 		goto nla_put_failure;
1232 
1233 	return nla_nest_end(skb, nest);
1234 
1235 nla_put_failure:
1236 	nla_nest_cancel(skb, nest);
1237 	return -1;
1238 }
1239 
1240 static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1241 			  struct sk_buff *skb, struct tcmsg *tcm)
1242 {
1243 	struct htb_class *cl = (struct htb_class *)arg;
1244 	struct htb_sched *q = qdisc_priv(sch);
1245 	struct nlattr *nest;
1246 	struct tc_htb_opt opt;
1247 
1248 	/* Its safe to not acquire qdisc lock. As we hold RTNL,
1249 	 * no change can happen on the class parameters.
1250 	 */
1251 	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1252 	tcm->tcm_handle = cl->common.classid;
1253 	if (!cl->level && cl->leaf.q)
1254 		tcm->tcm_info = cl->leaf.q->handle;
1255 
1256 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1257 	if (nest == NULL)
1258 		goto nla_put_failure;
1259 
1260 	memset(&opt, 0, sizeof(opt));
1261 
1262 	psched_ratecfg_getrate(&opt.rate, &cl->rate);
1263 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
1264 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
1265 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
1266 	opt.quantum = cl->quantum;
1267 	opt.prio = cl->prio;
1268 	opt.level = cl->level;
1269 	if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt))
1270 		goto nla_put_failure;
1271 	if (q->offload && nla_put_flag(skb, TCA_HTB_OFFLOAD))
1272 		goto nla_put_failure;
1273 	if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) &&
1274 	    nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps,
1275 			      TCA_HTB_PAD))
1276 		goto nla_put_failure;
1277 	if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) &&
1278 	    nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps,
1279 			      TCA_HTB_PAD))
1280 		goto nla_put_failure;
1281 
1282 	return nla_nest_end(skb, nest);
1283 
1284 nla_put_failure:
1285 	nla_nest_cancel(skb, nest);
1286 	return -1;
1287 }
1288 
1289 static void htb_offload_aggregate_stats(struct htb_sched *q,
1290 					struct htb_class *cl)
1291 {
1292 	u64 bytes = 0, packets = 0;
1293 	struct htb_class *c;
1294 	unsigned int i;
1295 
1296 	gnet_stats_basic_sync_init(&cl->bstats);
1297 
1298 	for (i = 0; i < q->clhash.hashsize; i++) {
1299 		hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) {
1300 			struct htb_class *p = c;
1301 
1302 			while (p && p->level < cl->level)
1303 				p = p->parent;
1304 
1305 			if (p != cl)
1306 				continue;
1307 
1308 			bytes += u64_stats_read(&c->bstats_bias.bytes);
1309 			packets += u64_stats_read(&c->bstats_bias.packets);
1310 			if (c->level == 0) {
1311 				bytes += u64_stats_read(&c->leaf.q->bstats.bytes);
1312 				packets += u64_stats_read(&c->leaf.q->bstats.packets);
1313 			}
1314 		}
1315 	}
1316 	_bstats_update(&cl->bstats, bytes, packets);
1317 }
1318 
1319 static int
1320 htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1321 {
1322 	struct htb_class *cl = (struct htb_class *)arg;
1323 	struct htb_sched *q = qdisc_priv(sch);
1324 	struct tc_htb_xstats xstats = {
1325 		.lends = READ_ONCE(cl->xstats_lends),
1326 		.borrows = READ_ONCE(cl->xstats_borrows),
1327 	};
1328 	struct gnet_stats_queue qs = {
1329 		.drops = READ_ONCE(cl->drops),
1330 		.overlimits = READ_ONCE(cl->overlimits),
1331 	};
1332 	__u32 qlen = 0;
1333 
1334 	if (!cl->level && cl->leaf.q)
1335 		qdisc_qstats_qlen_backlog(cl->leaf.q, &qlen, &qs.backlog);
1336 
1337 	xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(READ_ONCE(cl->tokens)),
1338 				INT_MIN, INT_MAX);
1339 	xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(READ_ONCE(cl->ctokens)),
1340 				 INT_MIN, INT_MAX);
1341 
1342 	if (q->offload) {
1343 		if (!cl->level) {
1344 			if (cl->leaf.q)
1345 				cl->bstats = cl->leaf.q->bstats;
1346 			else
1347 				gnet_stats_basic_sync_init(&cl->bstats);
1348 			_bstats_update(&cl->bstats,
1349 				       u64_stats_read(&cl->bstats_bias.bytes),
1350 				       u64_stats_read(&cl->bstats_bias.packets));
1351 		} else {
1352 			htb_offload_aggregate_stats(q, cl);
1353 		}
1354 	}
1355 
1356 	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
1357 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1358 	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
1359 		return -1;
1360 
1361 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
1362 }
1363 
1364 static struct netdev_queue *
1365 htb_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
1366 {
1367 	struct net_device *dev = qdisc_dev(sch);
1368 	struct tc_htb_qopt_offload offload_opt;
1369 	struct htb_sched *q = qdisc_priv(sch);
1370 	int err;
1371 
1372 	if (!q->offload)
1373 		return sch->dev_queue;
1374 
1375 	offload_opt = (struct tc_htb_qopt_offload) {
1376 		.command = TC_HTB_LEAF_QUERY_QUEUE,
1377 		.classid = TC_H_MIN(tcm->tcm_parent),
1378 	};
1379 	err = htb_offload(dev, &offload_opt);
1380 	if (err || offload_opt.qid >= dev->num_tx_queues)
1381 		return NULL;
1382 	return netdev_get_tx_queue(dev, offload_opt.qid);
1383 }
1384 
1385 static struct Qdisc *
1386 htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q)
1387 {
1388 	struct net_device *dev = dev_queue->dev;
1389 	struct Qdisc *old_q;
1390 
1391 	if (dev->flags & IFF_UP)
1392 		dev_deactivate(dev, false);
1393 	old_q = dev_graft_qdisc(dev_queue, new_q);
1394 	if (new_q)
1395 		new_q->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1396 	if (dev->flags & IFF_UP)
1397 		dev_activate(dev);
1398 
1399 	return old_q;
1400 }
1401 
1402 static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl)
1403 {
1404 	struct netdev_queue *queue;
1405 
1406 	queue = cl->leaf.offload_queue;
1407 	if (!(cl->leaf.q->flags & TCQ_F_BUILTIN))
1408 		WARN_ON(cl->leaf.q->dev_queue != queue);
1409 
1410 	return queue;
1411 }
1412 
1413 static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old,
1414 				   struct htb_class *cl_new, bool destroying)
1415 {
1416 	struct netdev_queue *queue_old, *queue_new;
1417 	struct net_device *dev = qdisc_dev(sch);
1418 
1419 	queue_old = htb_offload_get_queue(cl_old);
1420 	queue_new = htb_offload_get_queue(cl_new);
1421 
1422 	if (!destroying) {
1423 		struct Qdisc *qdisc;
1424 
1425 		if (dev->flags & IFF_UP)
1426 			dev_deactivate(dev, false);
1427 		qdisc = dev_graft_qdisc(queue_old, NULL);
1428 		WARN_ON(qdisc != cl_old->leaf.q);
1429 	}
1430 
1431 	if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN))
1432 		cl_old->leaf.q->dev_queue = queue_new;
1433 	cl_old->leaf.offload_queue = queue_new;
1434 
1435 	if (!destroying) {
1436 		struct Qdisc *qdisc;
1437 
1438 		qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q);
1439 		if (dev->flags & IFF_UP)
1440 			dev_activate(dev);
1441 		WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN));
1442 	}
1443 }
1444 
1445 static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1446 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1447 {
1448 	struct netdev_queue *dev_queue = sch->dev_queue;
1449 	struct htb_class *cl = (struct htb_class *)arg;
1450 	struct htb_sched *q = qdisc_priv(sch);
1451 	struct Qdisc *old_q;
1452 
1453 	if (cl->level)
1454 		return -EINVAL;
1455 
1456 	if (q->offload)
1457 		dev_queue = htb_offload_get_queue(cl);
1458 
1459 	if (!new) {
1460 		new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1461 					cl->common.classid, extack);
1462 		if (!new)
1463 			return -ENOBUFS;
1464 	}
1465 
1466 	if (q->offload) {
1467 		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1468 		qdisc_refcount_inc(new);
1469 		old_q = htb_graft_helper(dev_queue, new);
1470 	}
1471 
1472 	*old = qdisc_replace(sch, new, &cl->leaf.q);
1473 
1474 	if (q->offload) {
1475 		WARN_ON(old_q != *old);
1476 		qdisc_put(old_q);
1477 	}
1478 
1479 	return 0;
1480 }
1481 
1482 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1483 {
1484 	struct htb_class *cl = (struct htb_class *)arg;
1485 	return !cl->level ? cl->leaf.q : NULL;
1486 }
1487 
1488 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1489 {
1490 	struct htb_class *cl = (struct htb_class *)arg;
1491 
1492 	htb_deactivate(qdisc_priv(sch), cl);
1493 }
1494 
1495 static inline int htb_parent_last_child(struct htb_class *cl)
1496 {
1497 	if (!cl->parent)
1498 		/* the root class */
1499 		return 0;
1500 	if (cl->parent->children > 1)
1501 		/* not the last child */
1502 		return 0;
1503 	return 1;
1504 }
1505 
1506 static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl,
1507 			       struct Qdisc *new_q)
1508 {
1509 	struct htb_sched *q = qdisc_priv(sch);
1510 	struct htb_class *parent = cl->parent;
1511 
1512 	WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
1513 
1514 	if (parent->cmode != HTB_CAN_SEND)
1515 		htb_safe_rb_erase(&parent->pq_node,
1516 				  &q->hlevel[parent->level].wait_pq);
1517 
1518 	parent->level = 0;
1519 	memset(&parent->inner, 0, sizeof(parent->inner));
1520 	parent->leaf.q = new_q ? new_q : &noop_qdisc;
1521 	WRITE_ONCE(parent->tokens, parent->buffer);
1522 	WRITE_ONCE(parent->ctokens, parent->cbuffer);
1523 	parent->t_c = ktime_get_ns();
1524 	parent->cmode = HTB_CAN_SEND;
1525 	if (q->offload)
1526 		parent->leaf.offload_queue = cl->leaf.offload_queue;
1527 }
1528 
1529 static void htb_parent_to_leaf_offload(struct Qdisc *sch,
1530 				       struct netdev_queue *dev_queue,
1531 				       struct Qdisc *new_q)
1532 {
1533 	struct Qdisc *old_q;
1534 
1535 	/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1536 	if (new_q)
1537 		qdisc_refcount_inc(new_q);
1538 	old_q = htb_graft_helper(dev_queue, new_q);
1539 	WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1540 }
1541 
1542 static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl,
1543 				     bool last_child, bool destroying,
1544 				     struct netlink_ext_ack *extack)
1545 {
1546 	struct tc_htb_qopt_offload offload_opt;
1547 	struct netdev_queue *dev_queue;
1548 	struct Qdisc *q = cl->leaf.q;
1549 	struct Qdisc *old;
1550 	int err;
1551 
1552 	if (cl->level)
1553 		return -EINVAL;
1554 
1555 	WARN_ON(!q);
1556 	dev_queue = htb_offload_get_queue(cl);
1557 	/* When destroying, caller qdisc_graft grafts the new qdisc and invokes
1558 	 * qdisc_put for the qdisc being destroyed. htb_destroy_class_offload
1559 	 * does not need to graft or qdisc_put the qdisc being destroyed.
1560 	 */
1561 	if (!destroying) {
1562 		old = htb_graft_helper(dev_queue, NULL);
1563 		/* Last qdisc grafted should be the same as cl->leaf.q when
1564 		 * calling htb_delete.
1565 		 */
1566 		WARN_ON(old != q);
1567 	}
1568 
1569 	if (cl->parent) {
1570 		_bstats_update(&cl->parent->bstats_bias,
1571 			       u64_stats_read(&q->bstats.bytes),
1572 			       u64_stats_read(&q->bstats.packets));
1573 	}
1574 
1575 	offload_opt = (struct tc_htb_qopt_offload) {
1576 		.command = !last_child ? TC_HTB_LEAF_DEL :
1577 			   destroying ? TC_HTB_LEAF_DEL_LAST_FORCE :
1578 			   TC_HTB_LEAF_DEL_LAST,
1579 		.classid = cl->common.classid,
1580 		.extack = extack,
1581 	};
1582 	err = htb_offload(qdisc_dev(sch), &offload_opt);
1583 
1584 	if (!destroying) {
1585 		if (!err)
1586 			qdisc_put(old);
1587 		else
1588 			htb_graft_helper(dev_queue, old);
1589 	}
1590 
1591 	if (last_child)
1592 		return err;
1593 
1594 	if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) {
1595 		u32 classid = TC_H_MAJ(sch->handle) |
1596 			      TC_H_MIN(offload_opt.classid);
1597 		struct htb_class *moved_cl = htb_find(classid, sch);
1598 
1599 		htb_offload_move_qdisc(sch, moved_cl, cl, destroying);
1600 	}
1601 
1602 	return err;
1603 }
1604 
1605 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1606 {
1607 	if (!cl->level) {
1608 		WARN_ON(!cl->leaf.q);
1609 		qdisc_put(cl->leaf.q);
1610 	}
1611 	gen_kill_estimator(&cl->rate_est);
1612 	tcf_block_put(cl->block);
1613 	kfree(cl);
1614 }
1615 
1616 static void htb_destroy(struct Qdisc *sch)
1617 {
1618 	struct net_device *dev = qdisc_dev(sch);
1619 	struct tc_htb_qopt_offload offload_opt;
1620 	struct htb_sched *q = qdisc_priv(sch);
1621 	struct hlist_node *next;
1622 	bool nonempty, changed;
1623 	struct htb_class *cl;
1624 	unsigned int i;
1625 
1626 	cancel_work_sync(&q->work);
1627 	qdisc_watchdog_cancel(&q->watchdog);
1628 	/* This line used to be after htb_destroy_class call below
1629 	 * and surprisingly it worked in 2.4. But it must precede it
1630 	 * because filter need its target class alive to be able to call
1631 	 * unbind_filter on it (without Oops).
1632 	 */
1633 	tcf_block_put(q->block);
1634 
1635 	for (i = 0; i < q->clhash.hashsize; i++) {
1636 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
1637 			tcf_block_put(cl->block);
1638 			cl->block = NULL;
1639 		}
1640 	}
1641 
1642 	do {
1643 		nonempty = false;
1644 		changed = false;
1645 		for (i = 0; i < q->clhash.hashsize; i++) {
1646 			hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
1647 						  common.hnode) {
1648 				bool last_child;
1649 
1650 				if (!q->offload) {
1651 					htb_destroy_class(sch, cl);
1652 					continue;
1653 				}
1654 
1655 				nonempty = true;
1656 
1657 				if (cl->level)
1658 					continue;
1659 
1660 				changed = true;
1661 
1662 				last_child = htb_parent_last_child(cl);
1663 				htb_destroy_class_offload(sch, cl, last_child,
1664 							  true, NULL);
1665 				qdisc_class_hash_remove(&q->clhash,
1666 							&cl->common);
1667 				if (cl->parent)
1668 					cl->parent->children--;
1669 				if (last_child)
1670 					htb_parent_to_leaf(sch, cl, NULL);
1671 				htb_destroy_class(sch, cl);
1672 			}
1673 		}
1674 	} while (changed);
1675 	WARN_ON(nonempty);
1676 
1677 	qdisc_class_hash_destroy(&q->clhash);
1678 	__qdisc_reset_queue(&q->direct_queue);
1679 
1680 	if (q->offload) {
1681 		offload_opt = (struct tc_htb_qopt_offload) {
1682 			.command = TC_HTB_DESTROY,
1683 		};
1684 		htb_offload(dev, &offload_opt);
1685 	}
1686 
1687 	if (!q->direct_qdiscs)
1688 		return;
1689 	for (i = 0; i < q->num_direct_qdiscs && q->direct_qdiscs[i]; i++)
1690 		qdisc_put(q->direct_qdiscs[i]);
1691 	kfree(q->direct_qdiscs);
1692 }
1693 
1694 static int htb_delete(struct Qdisc *sch, unsigned long arg,
1695 		      struct netlink_ext_ack *extack)
1696 {
1697 	struct htb_sched *q = qdisc_priv(sch);
1698 	struct htb_class *cl = (struct htb_class *)arg;
1699 	struct Qdisc *new_q = NULL;
1700 	int last_child = 0;
1701 	int err;
1702 
1703 	/* TODO: why don't allow to delete subtree ? references ? does
1704 	 * tc subsys guarantee us that in htb_destroy it holds no class
1705 	 * refs so that we can remove children safely there ?
1706 	 */
1707 	if (cl->children || qdisc_class_in_use(&cl->common)) {
1708 		NL_SET_ERR_MSG(extack, "HTB class in use");
1709 		return -EBUSY;
1710 	}
1711 
1712 	if (!cl->level && htb_parent_last_child(cl))
1713 		last_child = 1;
1714 
1715 	if (q->offload) {
1716 		err = htb_destroy_class_offload(sch, cl, last_child, false,
1717 						extack);
1718 		if (err)
1719 			return err;
1720 	}
1721 
1722 	if (last_child) {
1723 		struct netdev_queue *dev_queue = sch->dev_queue;
1724 
1725 		if (q->offload)
1726 			dev_queue = htb_offload_get_queue(cl);
1727 
1728 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1729 					  cl->parent->common.classid,
1730 					  NULL);
1731 		if (q->offload)
1732 			htb_parent_to_leaf_offload(sch, dev_queue, new_q);
1733 	}
1734 
1735 	sch_tree_lock(sch);
1736 
1737 	if (!cl->level)
1738 		qdisc_purge_queue(cl->leaf.q);
1739 
1740 	/* delete from hash and active; remainder in destroy_class */
1741 	qdisc_class_hash_remove(&q->clhash, &cl->common);
1742 	if (cl->parent)
1743 		cl->parent->children--;
1744 
1745 	htb_deactivate(q, cl);
1746 
1747 	if (cl->cmode != HTB_CAN_SEND)
1748 		htb_safe_rb_erase(&cl->pq_node,
1749 				  &q->hlevel[cl->level].wait_pq);
1750 
1751 	if (last_child)
1752 		htb_parent_to_leaf(sch, cl, new_q);
1753 
1754 	sch_tree_unlock(sch);
1755 
1756 	htb_destroy_class(sch, cl);
1757 	return 0;
1758 }
1759 
1760 static int htb_change_class(struct Qdisc *sch, u32 classid,
1761 			    u32 parentid, struct nlattr **tca,
1762 			    unsigned long *arg, struct netlink_ext_ack *extack)
1763 {
1764 	int err = -EINVAL;
1765 	struct htb_sched *q = qdisc_priv(sch);
1766 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1767 	struct tc_htb_qopt_offload offload_opt;
1768 	struct nlattr *opt = tca[TCA_OPTIONS];
1769 	struct nlattr *tb[TCA_HTB_MAX + 1];
1770 	struct Qdisc *parent_qdisc = NULL;
1771 	struct netdev_queue *dev_queue;
1772 	struct tc_htb_opt *hopt;
1773 	u64 rate64, ceil64;
1774 	int warn = 0;
1775 
1776 	/* extract all subattrs from opt attr */
1777 	if (!opt)
1778 		goto failure;
1779 
1780 	err = nla_parse_nested_deprecated(tb, TCA_HTB_MAX, opt, htb_policy,
1781 					  extack);
1782 	if (err < 0)
1783 		goto failure;
1784 
1785 	err = -EINVAL;
1786 	if (tb[TCA_HTB_PARMS] == NULL)
1787 		goto failure;
1788 
1789 	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1790 
1791 	hopt = nla_data(tb[TCA_HTB_PARMS]);
1792 	if (!hopt->rate.rate || !hopt->ceil.rate)
1793 		goto failure;
1794 
1795 	if (q->offload) {
1796 		/* Options not supported by the offload. */
1797 		if (hopt->rate.overhead || hopt->ceil.overhead) {
1798 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter");
1799 			goto failure;
1800 		}
1801 		if (hopt->rate.mpu || hopt->ceil.mpu) {
1802 			NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter");
1803 			goto failure;
1804 		}
1805 	}
1806 
1807 	/* Keeping backward compatible with rate_table based iproute2 tc */
1808 	if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
1809 		qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB],
1810 					      NULL));
1811 
1812 	if (hopt->ceil.linklayer == TC_LINKLAYER_UNAWARE)
1813 		qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB],
1814 					      NULL));
1815 
1816 	rate64 = nla_get_u64_default(tb[TCA_HTB_RATE64], 0);
1817 	ceil64 = nla_get_u64_default(tb[TCA_HTB_CEIL64], 0);
1818 
1819 	if (!cl) {		/* new class */
1820 		struct net_device *dev = qdisc_dev(sch);
1821 		struct Qdisc *new_q, *old_q;
1822 		int prio;
1823 		struct {
1824 			struct nlattr		nla;
1825 			struct gnet_estimator	opt;
1826 		} est = {
1827 			.nla = {
1828 				.nla_len	= nla_attr_size(sizeof(est.opt)),
1829 				.nla_type	= TCA_RATE,
1830 			},
1831 			.opt = {
1832 				/* 4s interval, 16s averaging constant */
1833 				.interval	= 2,
1834 				.ewma_log	= 2,
1835 			},
1836 		};
1837 
1838 		/* check for valid classid */
1839 		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1840 		    htb_find(classid, sch))
1841 			goto failure;
1842 
1843 		/* check maximal depth */
1844 		if (parent && parent->parent && parent->parent->level < 2) {
1845 			NL_SET_ERR_MSG_MOD(extack, "tree is too deep");
1846 			goto failure;
1847 		}
1848 		err = -ENOBUFS;
1849 		cl = kzalloc_obj(*cl);
1850 		if (!cl)
1851 			goto failure;
1852 
1853 		gnet_stats_basic_sync_init(&cl->bstats);
1854 		gnet_stats_basic_sync_init(&cl->bstats_bias);
1855 
1856 		err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1857 		if (err) {
1858 			kfree(cl);
1859 			goto failure;
1860 		}
1861 		if (htb_rate_est || tca[TCA_RATE]) {
1862 			err = gen_new_estimator(&cl->bstats, NULL,
1863 						&cl->rate_est,
1864 						NULL,
1865 						true,
1866 						tca[TCA_RATE] ? : &est.nla);
1867 			if (err)
1868 				goto err_block_put;
1869 		}
1870 
1871 		cl->children = 0;
1872 		RB_CLEAR_NODE(&cl->pq_node);
1873 
1874 		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1875 			RB_CLEAR_NODE(&cl->node[prio]);
1876 
1877 		cl->common.classid = classid;
1878 
1879 		/* Make sure nothing interrupts us in between of two
1880 		 * ndo_setup_tc calls.
1881 		 */
1882 		ASSERT_RTNL();
1883 
1884 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1885 		 * so that can't be used inside of sch_tree_lock
1886 		 * -- thanks to Karlis Peisenieks
1887 		 */
1888 		if (!q->offload) {
1889 			dev_queue = sch->dev_queue;
1890 		} else if (!(parent && !parent->level)) {
1891 			/* Assign a dev_queue to this classid. */
1892 			offload_opt = (struct tc_htb_qopt_offload) {
1893 				.command = TC_HTB_LEAF_ALLOC_QUEUE,
1894 				.classid = cl->common.classid,
1895 				.parent_classid = parent ?
1896 					TC_H_MIN(parent->common.classid) :
1897 					TC_HTB_CLASSID_ROOT,
1898 				.rate = max_t(u64, hopt->rate.rate, rate64),
1899 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1900 				.prio = hopt->prio,
1901 				.quantum = hopt->quantum,
1902 				.extack = extack,
1903 			};
1904 			err = htb_offload(dev, &offload_opt);
1905 			if (err) {
1906 				NL_SET_ERR_MSG_WEAK(extack,
1907 						    "Failed to offload TC_HTB_LEAF_ALLOC_QUEUE");
1908 				goto err_kill_estimator;
1909 			}
1910 			dev_queue = netdev_get_tx_queue(dev, offload_opt.qid);
1911 		} else { /* First child. */
1912 			dev_queue = htb_offload_get_queue(parent);
1913 			old_q = htb_graft_helper(dev_queue, NULL);
1914 			WARN_ON(old_q != parent->leaf.q);
1915 			offload_opt = (struct tc_htb_qopt_offload) {
1916 				.command = TC_HTB_LEAF_TO_INNER,
1917 				.classid = cl->common.classid,
1918 				.parent_classid =
1919 					TC_H_MIN(parent->common.classid),
1920 				.rate = max_t(u64, hopt->rate.rate, rate64),
1921 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
1922 				.prio = hopt->prio,
1923 				.quantum = hopt->quantum,
1924 				.extack = extack,
1925 			};
1926 			err = htb_offload(dev, &offload_opt);
1927 			if (err) {
1928 				NL_SET_ERR_MSG_WEAK(extack,
1929 						    "Failed to offload TC_HTB_LEAF_TO_INNER");
1930 				htb_graft_helper(dev_queue, old_q);
1931 				goto err_kill_estimator;
1932 			}
1933 			_bstats_update(&parent->bstats_bias,
1934 				       u64_stats_read(&old_q->bstats.bytes),
1935 				       u64_stats_read(&old_q->bstats.packets));
1936 			qdisc_put(old_q);
1937 		}
1938 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
1939 					  classid, NULL);
1940 		if (q->offload) {
1941 			/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
1942 			if (new_q)
1943 				qdisc_refcount_inc(new_q);
1944 			old_q = htb_graft_helper(dev_queue, new_q);
1945 			/* No qdisc_put needed. */
1946 			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
1947 		}
1948 		sch_tree_lock(sch);
1949 		if (parent && !parent->level) {
1950 			/* turn parent into inner node */
1951 			qdisc_purge_queue(parent->leaf.q);
1952 			parent_qdisc = parent->leaf.q;
1953 			htb_deactivate(q, parent);
1954 
1955 			/* remove from evt list because of level change */
1956 			if (parent->cmode != HTB_CAN_SEND) {
1957 				htb_safe_rb_erase(&parent->pq_node, &q->hlevel[0].wait_pq);
1958 				parent->cmode = HTB_CAN_SEND;
1959 			}
1960 			parent->level = (parent->parent ? parent->parent->level
1961 					 : TC_HTB_MAXDEPTH) - 1;
1962 			memset(&parent->inner, 0, sizeof(parent->inner));
1963 		}
1964 
1965 		/* leaf (we) needs elementary qdisc */
1966 		cl->leaf.q = new_q ? new_q : &noop_qdisc;
1967 		if (q->offload)
1968 			cl->leaf.offload_queue = dev_queue;
1969 
1970 		cl->parent = parent;
1971 
1972 		/* set class to be in HTB_CAN_SEND state */
1973 		cl->tokens = PSCHED_TICKS2NS(hopt->buffer);
1974 		cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer);
1975 		cl->mbuffer = 60ULL * NSEC_PER_SEC;	/* 1min */
1976 		cl->t_c = ktime_get_ns();
1977 		cl->cmode = HTB_CAN_SEND;
1978 
1979 		/* attach to the hash list and parent's family */
1980 		qdisc_class_hash_insert(&q->clhash, &cl->common);
1981 		if (parent)
1982 			parent->children++;
1983 		if (cl->leaf.q != &noop_qdisc)
1984 			qdisc_hash_add(cl->leaf.q, true);
1985 	} else {
1986 		if (tca[TCA_RATE]) {
1987 			err = gen_replace_estimator(&cl->bstats, NULL,
1988 						    &cl->rate_est,
1989 						    NULL,
1990 						    true,
1991 						    tca[TCA_RATE]);
1992 			if (err)
1993 				return err;
1994 		}
1995 
1996 		if (q->offload) {
1997 			struct net_device *dev = qdisc_dev(sch);
1998 
1999 			offload_opt = (struct tc_htb_qopt_offload) {
2000 				.command = TC_HTB_NODE_MODIFY,
2001 				.classid = cl->common.classid,
2002 				.rate = max_t(u64, hopt->rate.rate, rate64),
2003 				.ceil = max_t(u64, hopt->ceil.rate, ceil64),
2004 				.prio = hopt->prio,
2005 				.quantum = hopt->quantum,
2006 				.extack = extack,
2007 			};
2008 			err = htb_offload(dev, &offload_opt);
2009 			if (err)
2010 				/* Estimator was replaced, and rollback may fail
2011 				 * as well, so we don't try to recover it, and
2012 				 * the estimator won't work property with the
2013 				 * offload anyway, because bstats are updated
2014 				 * only when the stats are queried.
2015 				 */
2016 				return err;
2017 		}
2018 
2019 		sch_tree_lock(sch);
2020 	}
2021 
2022 	psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64);
2023 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
2024 
2025 	/* it used to be a nasty bug here, we have to check that node
2026 	 * is really leaf before changing cl->leaf !
2027 	 */
2028 	if (!cl->level) {
2029 		u64 quantum = cl->rate.rate_bytes_ps;
2030 
2031 		do_div(quantum, q->rate2quantum);
2032 		cl->quantum = min_t(u64, quantum, INT_MAX);
2033 
2034 		if (!hopt->quantum && cl->quantum < 1000) {
2035 			warn = -1;
2036 			cl->quantum = 1000;
2037 		}
2038 		if (!hopt->quantum && cl->quantum > 200000) {
2039 			warn = 1;
2040 			cl->quantum = 200000;
2041 		}
2042 		if (hopt->quantum)
2043 			cl->quantum = hopt->quantum;
2044 		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
2045 			cl->prio = TC_HTB_NUMPRIO - 1;
2046 	}
2047 
2048 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
2049 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
2050 
2051 	sch_tree_unlock(sch);
2052 	qdisc_put(parent_qdisc);
2053 
2054 	if (warn)
2055 		NL_SET_ERR_MSG_FMT_MOD(extack,
2056 				       "quantum of class %X is %s. Consider r2q change.",
2057 				       cl->common.classid, (warn == -1 ? "small" : "big"));
2058 
2059 	qdisc_class_hash_grow(sch, &q->clhash);
2060 
2061 	*arg = (unsigned long)cl;
2062 	return 0;
2063 
2064 err_kill_estimator:
2065 	gen_kill_estimator(&cl->rate_est);
2066 err_block_put:
2067 	tcf_block_put(cl->block);
2068 	kfree(cl);
2069 failure:
2070 	return err;
2071 }
2072 
2073 static struct tcf_block *htb_tcf_block(struct Qdisc *sch, unsigned long arg,
2074 				       struct netlink_ext_ack *extack)
2075 {
2076 	struct htb_sched *q = qdisc_priv(sch);
2077 	struct htb_class *cl = (struct htb_class *)arg;
2078 
2079 	return cl ? cl->block : q->block;
2080 }
2081 
2082 static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
2083 				     u32 classid)
2084 {
2085 	struct htb_class *cl = htb_find(classid, sch);
2086 
2087 	/*if (cl && !cl->level) return 0;
2088 	 * The line above used to be there to prevent attaching filters to
2089 	 * leaves. But at least tc_index filter uses this just to get class
2090 	 * for other reasons so that we have to allow for it.
2091 	 * ----
2092 	 * 19.6.2002 As Werner explained it is ok - bind filter is just
2093 	 * another way to "lock" the class - unlike "get" this lock can
2094 	 * be broken by class during destroy IIUC.
2095 	 */
2096 	if (cl)
2097 		qdisc_class_get(&cl->common);
2098 	return (unsigned long)cl;
2099 }
2100 
2101 static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
2102 {
2103 	struct htb_class *cl = (struct htb_class *)arg;
2104 
2105 	qdisc_class_put(&cl->common);
2106 }
2107 
2108 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2109 {
2110 	struct htb_sched *q = qdisc_priv(sch);
2111 	struct htb_class *cl;
2112 	unsigned int i;
2113 
2114 	if (arg->stop)
2115 		return;
2116 
2117 	for (i = 0; i < q->clhash.hashsize; i++) {
2118 		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
2119 			if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg))
2120 				return;
2121 		}
2122 	}
2123 }
2124 
2125 static const struct Qdisc_class_ops htb_class_ops = {
2126 	.select_queue	=	htb_select_queue,
2127 	.graft		=	htb_graft,
2128 	.leaf		=	htb_leaf,
2129 	.qlen_notify	=	htb_qlen_notify,
2130 	.find		=	htb_search,
2131 	.change		=	htb_change_class,
2132 	.delete		=	htb_delete,
2133 	.walk		=	htb_walk,
2134 	.tcf_block	=	htb_tcf_block,
2135 	.bind_tcf	=	htb_bind_filter,
2136 	.unbind_tcf	=	htb_unbind_filter,
2137 	.dump		=	htb_dump_class,
2138 	.dump_stats	=	htb_dump_class_stats,
2139 };
2140 
2141 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
2142 	.cl_ops		=	&htb_class_ops,
2143 	.id		=	"htb",
2144 	.priv_size	=	sizeof(struct htb_sched),
2145 	.enqueue	=	htb_enqueue,
2146 	.dequeue	=	htb_dequeue,
2147 	.peek		=	qdisc_peek_dequeued,
2148 	.init		=	htb_init,
2149 	.attach		=	htb_attach,
2150 	.reset		=	htb_reset,
2151 	.destroy	=	htb_destroy,
2152 	.dump		=	htb_dump,
2153 	.owner		=	THIS_MODULE,
2154 };
2155 MODULE_ALIAS_NET_SCH("htb");
2156 
2157 static int __init htb_module_init(void)
2158 {
2159 	return register_qdisc(&htb_qdisc_ops);
2160 }
2161 static void __exit htb_module_exit(void)
2162 {
2163 	unregister_qdisc(&htb_qdisc_ops);
2164 }
2165 
2166 module_init(htb_module_init)
2167 module_exit(htb_module_exit)
2168 MODULE_LICENSE("GPL");
2169 MODULE_DESCRIPTION("Hierarchical Token Bucket scheduler");
2170