xref: /linux/net/sched/sch_fq.c (revision 04317b129e4eb5c6f4a58bb899b2019c1545320b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
4  *
5  *  Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com>
6  *
7  *  Meant to be mostly used for locally generated traffic :
8  *  Fast classification depends on skb->sk being set before reaching us.
9  *  If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
10  *  All packets belonging to a socket are considered as a 'flow'.
11  *
12  *  Flows are dynamically allocated and stored in a hash table of RB trees
13  *  They are also part of one Round Robin 'queues' (new or old flows)
14  *
15  *  Burst avoidance (aka pacing) capability :
16  *
17  *  Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
18  *  bunch of packets, and this packet scheduler adds delay between
19  *  packets to respect rate limitation.
20  *
21  *  enqueue() :
22  *   - lookup one RB tree (out of 1024 or more) to find the flow.
23  *     If non existent flow, create it, add it to the tree.
24  *     Add skb to the per flow list of skb (fifo).
25  *   - Use a special fifo for high prio packets
26  *
27  *  dequeue() : serves flows in Round Robin
28  *  Note : When a flow becomes empty, we do not immediately remove it from
29  *  rb trees, for performance reasons (its expected to send additional packets,
30  *  or SLAB cache will reuse socket for another flow)
31  */
32 
33 #include <linux/module.h>
34 #include <linux/types.h>
35 #include <linux/kernel.h>
36 #include <linux/jiffies.h>
37 #include <linux/string.h>
38 #include <linux/in.h>
39 #include <linux/errno.h>
40 #include <linux/init.h>
41 #include <linux/skbuff.h>
42 #include <linux/slab.h>
43 #include <linux/rbtree.h>
44 #include <linux/hash.h>
45 #include <linux/prefetch.h>
46 #include <linux/vmalloc.h>
47 #include <net/netlink.h>
48 #include <net/pkt_sched.h>
49 #include <net/sock.h>
50 #include <net/tcp_states.h>
51 #include <net/tcp.h>
52 
53 struct fq_skb_cb {
54 	u64	time_to_send;
55 	u8	band;
56 };
57 
58 static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
59 {
60 	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
61 	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
62 }
63 
64 /*
65  * Per flow structure, dynamically allocated.
66  * If packets have monotically increasing time_to_send, they are placed in O(1)
67  * in linear list (head,tail), otherwise are placed in a rbtree (t_root).
68  */
69 struct fq_flow {
70 /* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */
71 	struct rb_root	t_root;
72 	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
73 	union {
74 		struct sk_buff *tail;	/* last skb in the list */
75 		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
76 	};
77 	union {
78 		struct rb_node	fq_node;	/* anchor in fq_root[] trees */
79 		/* Following field is only used for q->internal,
80 		 * because q->internal is not hashed in fq_root[]
81 		 */
82 		u64		stat_fastpath_packets;
83 	};
84 	struct sock	*sk;
85 	u32		socket_hash;	/* sk_hash */
86 	int		qlen;		/* number of packets in flow queue */
87 
88 /* Second cache line */
89 	int		credit;
90 	int		band;
91 	struct fq_flow *next;		/* next pointer in RR lists */
92 
93 	struct rb_node  rate_node;	/* anchor in q->delayed tree */
94 	u64		time_next_packet;
95 };
96 
97 struct fq_flow_head {
98 	struct fq_flow *first;
99 	struct fq_flow *last;
100 };
101 
102 struct fq_perband_flows {
103 	struct fq_flow_head new_flows;
104 	struct fq_flow_head old_flows;
105 	int		    credit;
106 	int		    quantum; /* based on band nr : 576KB, 192KB, 64KB */
107 };
108 
109 struct fq_sched_data {
110 /* Read mostly cache line */
111 
112 	u32		quantum;
113 	u32		initial_quantum;
114 	u32		flow_refill_delay;
115 	u32		flow_plimit;	/* max packets per flow */
116 	unsigned long	flow_max_rate;	/* optional max rate per flow */
117 	u64		ce_threshold;
118 	u64		horizon;	/* horizon in ns */
119 	u32		orphan_mask;	/* mask for orphaned skb */
120 	u32		low_rate_threshold;
121 	struct rb_root	*fq_root;
122 	u8		rate_enable;
123 	u8		fq_trees_log;
124 	u8		horizon_drop;
125 	u8		prio2band[(TC_PRIO_MAX + 1) >> 2];
126 	u32		timer_slack; /* hrtimer slack in ns */
127 
128 /* Read/Write fields. */
129 
130 	unsigned int band_nr; /* band being serviced in fq_dequeue() */
131 
132 	struct fq_perband_flows band_flows[FQ_BANDS];
133 
134 	struct fq_flow	internal;	/* fastpath queue. */
135 	struct rb_root	delayed;	/* for rate limited flows */
136 	u64		time_next_delayed_flow;
137 	unsigned long	unthrottle_latency_ns;
138 
139 	u32		band_pkt_count[FQ_BANDS];
140 	u32		flows;
141 	u32		inactive_flows; /* Flows with no packet to send. */
142 	u32		throttled_flows;
143 
144 	u64		stat_throttled;
145 	struct qdisc_watchdog watchdog;
146 	u64		stat_gc_flows;
147 
148 /* Seldom used fields. */
149 
150 	u64		stat_band_drops[FQ_BANDS];
151 	u64		stat_ce_mark;
152 	u64		stat_horizon_drops;
153 	u64		stat_horizon_caps;
154 	u64		stat_flows_plimit;
155 	u64		stat_pkts_too_long;
156 	u64		stat_allocation_errors;
157 };
158 
159 /* return the i-th 2-bit value ("crumb") */
160 static u8 fq_prio2band(const u8 *prio2band, unsigned int prio)
161 {
162 	return (prio2band[prio / 4] >> (2 * (prio & 0x3))) & 0x3;
163 }
164 
165 /*
166  * f->tail and f->age share the same location.
167  * We can use the low order bit to differentiate if this location points
168  * to a sk_buff or contains a jiffies value, if we force this value to be odd.
169  * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2
170  */
171 static void fq_flow_set_detached(struct fq_flow *f)
172 {
173 	f->age = jiffies | 1UL;
174 }
175 
176 static bool fq_flow_is_detached(const struct fq_flow *f)
177 {
178 	return !!(f->age & 1UL);
179 }
180 
181 /* special value to mark a throttled flow (not on old/new list) */
182 static struct fq_flow throttled;
183 
184 static bool fq_flow_is_throttled(const struct fq_flow *f)
185 {
186 	return f->next == &throttled;
187 }
188 
189 enum new_flow {
190 	NEW_FLOW,
191 	OLD_FLOW
192 };
193 
194 static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow,
195 			     enum new_flow list_sel)
196 {
197 	struct fq_perband_flows *pband = &q->band_flows[flow->band];
198 	struct fq_flow_head *head = (list_sel == NEW_FLOW) ?
199 					&pband->new_flows :
200 					&pband->old_flows;
201 
202 	if (head->first)
203 		head->last->next = flow;
204 	else
205 		head->first = flow;
206 	head->last = flow;
207 	flow->next = NULL;
208 }
209 
210 static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
211 {
212 	rb_erase(&f->rate_node, &q->delayed);
213 	q->throttled_flows--;
214 	fq_flow_add_tail(q, f, OLD_FLOW);
215 }
216 
217 static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
218 {
219 	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
220 
221 	while (*p) {
222 		struct fq_flow *aux;
223 
224 		parent = *p;
225 		aux = rb_entry(parent, struct fq_flow, rate_node);
226 		if (f->time_next_packet >= aux->time_next_packet)
227 			p = &parent->rb_right;
228 		else
229 			p = &parent->rb_left;
230 	}
231 	rb_link_node(&f->rate_node, parent, p);
232 	rb_insert_color(&f->rate_node, &q->delayed);
233 	q->throttled_flows++;
234 	q->stat_throttled++;
235 
236 	f->next = &throttled;
237 	if (q->time_next_delayed_flow > f->time_next_packet)
238 		q->time_next_delayed_flow = f->time_next_packet;
239 }
240 
241 
242 static struct kmem_cache *fq_flow_cachep __read_mostly;
243 
244 
245 /* limit number of collected flows per round */
246 #define FQ_GC_MAX 8
247 #define FQ_GC_AGE (3*HZ)
248 
249 static bool fq_gc_candidate(const struct fq_flow *f)
250 {
251 	return fq_flow_is_detached(f) &&
252 	       time_after(jiffies, f->age + FQ_GC_AGE);
253 }
254 
255 static void fq_gc(struct fq_sched_data *q,
256 		  struct rb_root *root,
257 		  struct sock *sk)
258 {
259 	struct rb_node **p, *parent;
260 	void *tofree[FQ_GC_MAX];
261 	struct fq_flow *f;
262 	int i, fcnt = 0;
263 
264 	p = &root->rb_node;
265 	parent = NULL;
266 	while (*p) {
267 		parent = *p;
268 
269 		f = rb_entry(parent, struct fq_flow, fq_node);
270 		if (f->sk == sk)
271 			break;
272 
273 		if (fq_gc_candidate(f)) {
274 			tofree[fcnt++] = f;
275 			if (fcnt == FQ_GC_MAX)
276 				break;
277 		}
278 
279 		if (f->sk > sk)
280 			p = &parent->rb_right;
281 		else
282 			p = &parent->rb_left;
283 	}
284 
285 	if (!fcnt)
286 		return;
287 
288 	for (i = fcnt; i > 0; ) {
289 		f = tofree[--i];
290 		rb_erase(&f->fq_node, root);
291 	}
292 	q->flows -= fcnt;
293 	q->inactive_flows -= fcnt;
294 	q->stat_gc_flows += fcnt;
295 
296 	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
297 }
298 
299 /* Fast path can be used if :
300  * 1) Packet tstamp is in the past.
301  * 2) FQ qlen == 0   OR
302  *   (no flow is currently eligible for transmit,
303  *    AND fast path queue has less than 8 packets)
304  * 3) No SO_MAX_PACING_RATE on the socket (if any).
305  * 4) No @maxrate attribute on this qdisc,
306  *
307  * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
308  */
309 static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb,
310 			      u64 now)
311 {
312 	const struct fq_sched_data *q = qdisc_priv(sch);
313 	const struct sock *sk;
314 
315 	if (fq_skb_cb(skb)->time_to_send > now)
316 		return false;
317 
318 	if (sch->q.qlen != 0) {
319 		/* Even if some packets are stored in this qdisc,
320 		 * we can still enable fast path if all of them are
321 		 * scheduled in the future (ie no flows are eligible)
322 		 * or in the fast path queue.
323 		 */
324 		if (q->flows != q->inactive_flows + q->throttled_flows)
325 			return false;
326 
327 		/* Do not allow fast path queue to explode, we want Fair Queue mode
328 		 * under pressure.
329 		 */
330 		if (q->internal.qlen >= 8)
331 			return false;
332 	}
333 
334 	sk = skb->sk;
335 	if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
336 	    sk->sk_max_pacing_rate != ~0UL)
337 		return false;
338 
339 	if (q->flow_max_rate != ~0UL)
340 		return false;
341 
342 	return true;
343 }
344 
345 static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
346 				   u64 now)
347 {
348 	struct fq_sched_data *q = qdisc_priv(sch);
349 	struct rb_node **p, *parent;
350 	struct sock *sk = skb->sk;
351 	struct rb_root *root;
352 	struct fq_flow *f;
353 
354 	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
355 	 * or a listener (SYNCOOKIE mode)
356 	 * 1) request sockets are not full blown,
357 	 *    they do not contain sk_pacing_rate
358 	 * 2) They are not part of a 'flow' yet
359 	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
360 	 *    especially if the listener set SO_MAX_PACING_RATE
361 	 * 4) We pretend they are orphaned
362 	 */
363 	if (!sk || sk_listener(sk)) {
364 		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
365 
366 		/* By forcing low order bit to 1, we make sure to not
367 		 * collide with a local flow (socket pointers are word aligned)
368 		 */
369 		sk = (struct sock *)((hash << 1) | 1UL);
370 		skb_orphan(skb);
371 	} else if (sk->sk_state == TCP_CLOSE) {
372 		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
373 		/*
374 		 * Sockets in TCP_CLOSE are non connected.
375 		 * Typical use case is UDP sockets, they can send packets
376 		 * with sendto() to many different destinations.
377 		 * We probably could use a generic bit advertising
378 		 * non connected sockets, instead of sk_state == TCP_CLOSE,
379 		 * if we care enough.
380 		 */
381 		sk = (struct sock *)((hash << 1) | 1UL);
382 	}
383 
384 	if (fq_fastpath_check(sch, skb, now)) {
385 		q->internal.stat_fastpath_packets++;
386 		return &q->internal;
387 	}
388 
389 	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
390 
391 	fq_gc(q, root, sk);
392 
393 	p = &root->rb_node;
394 	parent = NULL;
395 	while (*p) {
396 		parent = *p;
397 
398 		f = rb_entry(parent, struct fq_flow, fq_node);
399 		if (f->sk == sk) {
400 			/* socket might have been reallocated, so check
401 			 * if its sk_hash is the same.
402 			 * It not, we need to refill credit with
403 			 * initial quantum
404 			 */
405 			if (unlikely(skb->sk == sk &&
406 				     f->socket_hash != sk->sk_hash)) {
407 				f->credit = q->initial_quantum;
408 				f->socket_hash = sk->sk_hash;
409 				if (q->rate_enable)
410 					smp_store_release(&sk->sk_pacing_status,
411 							  SK_PACING_FQ);
412 				if (fq_flow_is_throttled(f))
413 					fq_flow_unset_throttled(q, f);
414 				f->time_next_packet = 0ULL;
415 			}
416 			return f;
417 		}
418 		if (f->sk > sk)
419 			p = &parent->rb_right;
420 		else
421 			p = &parent->rb_left;
422 	}
423 
424 	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
425 	if (unlikely(!f)) {
426 		q->stat_allocation_errors++;
427 		return &q->internal;
428 	}
429 	/* f->t_root is already zeroed after kmem_cache_zalloc() */
430 
431 	fq_flow_set_detached(f);
432 	f->sk = sk;
433 	if (skb->sk == sk) {
434 		f->socket_hash = sk->sk_hash;
435 		if (q->rate_enable)
436 			smp_store_release(&sk->sk_pacing_status,
437 					  SK_PACING_FQ);
438 	}
439 	f->credit = q->initial_quantum;
440 
441 	rb_link_node(&f->fq_node, parent, p);
442 	rb_insert_color(&f->fq_node, root);
443 
444 	q->flows++;
445 	q->inactive_flows++;
446 	return f;
447 }
448 
449 static struct sk_buff *fq_peek(struct fq_flow *flow)
450 {
451 	struct sk_buff *skb = skb_rb_first(&flow->t_root);
452 	struct sk_buff *head = flow->head;
453 
454 	if (!skb)
455 		return head;
456 
457 	if (!head)
458 		return skb;
459 
460 	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
461 		return skb;
462 	return head;
463 }
464 
465 static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
466 			  struct sk_buff *skb)
467 {
468 	if (skb == flow->head) {
469 		flow->head = skb->next;
470 	} else {
471 		rb_erase(&skb->rbnode, &flow->t_root);
472 		skb->dev = qdisc_dev(sch);
473 	}
474 }
475 
476 /* Remove one skb from flow queue.
477  * This skb must be the return value of prior fq_peek().
478  */
479 static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
480 			   struct sk_buff *skb)
481 {
482 	fq_erase_head(sch, flow, skb);
483 	skb_mark_not_on_list(skb);
484 	qdisc_qstats_backlog_dec(sch, skb);
485 	sch->q.qlen--;
486 }
487 
488 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
489 {
490 	struct rb_node **p, *parent;
491 	struct sk_buff *head, *aux;
492 
493 	head = flow->head;
494 	if (!head ||
495 	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
496 		if (!head)
497 			flow->head = skb;
498 		else
499 			flow->tail->next = skb;
500 		flow->tail = skb;
501 		skb->next = NULL;
502 		return;
503 	}
504 
505 	p = &flow->t_root.rb_node;
506 	parent = NULL;
507 
508 	while (*p) {
509 		parent = *p;
510 		aux = rb_to_skb(parent);
511 		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
512 			p = &parent->rb_right;
513 		else
514 			p = &parent->rb_left;
515 	}
516 	rb_link_node(&skb->rbnode, parent, p);
517 	rb_insert_color(&skb->rbnode, &flow->t_root);
518 }
519 
520 static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
521 				     const struct fq_sched_data *q, u64 now)
522 {
523 	return unlikely((s64)skb->tstamp > (s64)(now + q->horizon));
524 }
525 
526 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
527 		      struct sk_buff **to_free)
528 {
529 	struct fq_sched_data *q = qdisc_priv(sch);
530 	struct fq_flow *f;
531 	u64 now;
532 	u8 band;
533 
534 	band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX);
535 	if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
536 		q->stat_band_drops[band]++;
537 		return qdisc_drop(skb, sch, to_free);
538 	}
539 
540 	now = ktime_get_ns();
541 	if (!skb->tstamp) {
542 		fq_skb_cb(skb)->time_to_send = now;
543 	} else {
544 		/* Check if packet timestamp is too far in the future. */
545 		if (fq_packet_beyond_horizon(skb, q, now)) {
546 			if (q->horizon_drop) {
547 					q->stat_horizon_drops++;
548 					return qdisc_drop(skb, sch, to_free);
549 			}
550 			q->stat_horizon_caps++;
551 			skb->tstamp = now + q->horizon;
552 		}
553 		fq_skb_cb(skb)->time_to_send = skb->tstamp;
554 	}
555 
556 	f = fq_classify(sch, skb, now);
557 
558 	if (f != &q->internal) {
559 		if (unlikely(f->qlen >= q->flow_plimit)) {
560 			q->stat_flows_plimit++;
561 			return qdisc_drop(skb, sch, to_free);
562 		}
563 
564 		if (fq_flow_is_detached(f)) {
565 			fq_flow_add_tail(q, f, NEW_FLOW);
566 			if (time_after(jiffies, f->age + q->flow_refill_delay))
567 				f->credit = max_t(u32, f->credit, q->quantum);
568 		}
569 
570 		f->band = band;
571 		q->band_pkt_count[band]++;
572 		fq_skb_cb(skb)->band = band;
573 		if (f->qlen == 0)
574 			q->inactive_flows--;
575 	}
576 
577 	f->qlen++;
578 	/* Note: this overwrites f->age */
579 	flow_queue_add(f, skb);
580 
581 	qdisc_qstats_backlog_inc(sch, skb);
582 	sch->q.qlen++;
583 
584 	return NET_XMIT_SUCCESS;
585 }
586 
587 static void fq_check_throttled(struct fq_sched_data *q, u64 now)
588 {
589 	unsigned long sample;
590 	struct rb_node *p;
591 
592 	if (q->time_next_delayed_flow > now)
593 		return;
594 
595 	/* Update unthrottle latency EWMA.
596 	 * This is cheap and can help diagnosing timer/latency problems.
597 	 */
598 	sample = (unsigned long)(now - q->time_next_delayed_flow);
599 	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
600 	q->unthrottle_latency_ns += sample >> 3;
601 
602 	q->time_next_delayed_flow = ~0ULL;
603 	while ((p = rb_first(&q->delayed)) != NULL) {
604 		struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
605 
606 		if (f->time_next_packet > now) {
607 			q->time_next_delayed_flow = f->time_next_packet;
608 			break;
609 		}
610 		fq_flow_unset_throttled(q, f);
611 	}
612 }
613 
614 static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband)
615 {
616 	if (pband->credit <= 0)
617 		return NULL;
618 
619 	if (pband->new_flows.first)
620 		return &pband->new_flows;
621 
622 	return pband->old_flows.first ? &pband->old_flows : NULL;
623 }
624 
625 static struct sk_buff *fq_dequeue(struct Qdisc *sch)
626 {
627 	struct fq_sched_data *q = qdisc_priv(sch);
628 	struct fq_perband_flows *pband;
629 	struct fq_flow_head *head;
630 	struct sk_buff *skb;
631 	struct fq_flow *f;
632 	unsigned long rate;
633 	int retry;
634 	u32 plen;
635 	u64 now;
636 
637 	if (!sch->q.qlen)
638 		return NULL;
639 
640 	skb = fq_peek(&q->internal);
641 	if (unlikely(skb)) {
642 		q->internal.qlen--;
643 		fq_dequeue_skb(sch, &q->internal, skb);
644 		goto out;
645 	}
646 
647 	now = ktime_get_ns();
648 	fq_check_throttled(q, now);
649 	retry = 0;
650 	pband = &q->band_flows[q->band_nr];
651 begin:
652 	head = fq_pband_head_select(pband);
653 	if (!head) {
654 		while (++retry < FQ_BANDS) {
655 			if (++q->band_nr == FQ_BANDS)
656 				q->band_nr = 0;
657 			pband = &q->band_flows[q->band_nr];
658 			pband->credit = min(pband->credit + pband->quantum,
659 					    pband->quantum);
660 			goto begin;
661 		}
662 		if (q->time_next_delayed_flow != ~0ULL)
663 			qdisc_watchdog_schedule_range_ns(&q->watchdog,
664 							q->time_next_delayed_flow,
665 							q->timer_slack);
666 		return NULL;
667 	}
668 	f = head->first;
669 	retry = 0;
670 	if (f->credit <= 0) {
671 		f->credit += q->quantum;
672 		head->first = f->next;
673 		fq_flow_add_tail(q, f, OLD_FLOW);
674 		goto begin;
675 	}
676 
677 	skb = fq_peek(f);
678 	if (skb) {
679 		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
680 					     f->time_next_packet);
681 
682 		if (now < time_next_packet) {
683 			head->first = f->next;
684 			f->time_next_packet = time_next_packet;
685 			fq_flow_set_throttled(q, f);
686 			goto begin;
687 		}
688 		prefetch(&skb->end);
689 		if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
690 			INET_ECN_set_ce(skb);
691 			q->stat_ce_mark++;
692 		}
693 		if (--f->qlen == 0)
694 			q->inactive_flows++;
695 		q->band_pkt_count[fq_skb_cb(skb)->band]--;
696 		fq_dequeue_skb(sch, f, skb);
697 	} else {
698 		head->first = f->next;
699 		/* force a pass through old_flows to prevent starvation */
700 		if (head == &pband->new_flows) {
701 			fq_flow_add_tail(q, f, OLD_FLOW);
702 		} else {
703 			fq_flow_set_detached(f);
704 		}
705 		goto begin;
706 	}
707 	plen = qdisc_pkt_len(skb);
708 	f->credit -= plen;
709 	pband->credit -= plen;
710 
711 	if (!q->rate_enable)
712 		goto out;
713 
714 	rate = q->flow_max_rate;
715 
716 	/* If EDT time was provided for this skb, we need to
717 	 * update f->time_next_packet only if this qdisc enforces
718 	 * a flow max rate.
719 	 */
720 	if (!skb->tstamp) {
721 		if (skb->sk)
722 			rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate);
723 
724 		if (rate <= q->low_rate_threshold) {
725 			f->credit = 0;
726 		} else {
727 			plen = max(plen, q->quantum);
728 			if (f->credit > 0)
729 				goto out;
730 		}
731 	}
732 	if (rate != ~0UL) {
733 		u64 len = (u64)plen * NSEC_PER_SEC;
734 
735 		if (likely(rate))
736 			len = div64_ul(len, rate);
737 		/* Since socket rate can change later,
738 		 * clamp the delay to 1 second.
739 		 * Really, providers of too big packets should be fixed !
740 		 */
741 		if (unlikely(len > NSEC_PER_SEC)) {
742 			len = NSEC_PER_SEC;
743 			q->stat_pkts_too_long++;
744 		}
745 		/* Account for schedule/timers drifts.
746 		 * f->time_next_packet was set when prior packet was sent,
747 		 * and current time (@now) can be too late by tens of us.
748 		 */
749 		if (f->time_next_packet)
750 			len -= min(len/2, now - f->time_next_packet);
751 		f->time_next_packet = now + len;
752 	}
753 out:
754 	qdisc_bstats_update(sch, skb);
755 	return skb;
756 }
757 
758 static void fq_flow_purge(struct fq_flow *flow)
759 {
760 	struct rb_node *p = rb_first(&flow->t_root);
761 
762 	while (p) {
763 		struct sk_buff *skb = rb_to_skb(p);
764 
765 		p = rb_next(p);
766 		rb_erase(&skb->rbnode, &flow->t_root);
767 		rtnl_kfree_skbs(skb, skb);
768 	}
769 	rtnl_kfree_skbs(flow->head, flow->tail);
770 	flow->head = NULL;
771 	flow->qlen = 0;
772 }
773 
774 static void fq_reset(struct Qdisc *sch)
775 {
776 	struct fq_sched_data *q = qdisc_priv(sch);
777 	struct rb_root *root;
778 	struct rb_node *p;
779 	struct fq_flow *f;
780 	unsigned int idx;
781 
782 	sch->q.qlen = 0;
783 	sch->qstats.backlog = 0;
784 
785 	fq_flow_purge(&q->internal);
786 
787 	if (!q->fq_root)
788 		return;
789 
790 	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
791 		root = &q->fq_root[idx];
792 		while ((p = rb_first(root)) != NULL) {
793 			f = rb_entry(p, struct fq_flow, fq_node);
794 			rb_erase(p, root);
795 
796 			fq_flow_purge(f);
797 
798 			kmem_cache_free(fq_flow_cachep, f);
799 		}
800 	}
801 	for (idx = 0; idx < FQ_BANDS; idx++) {
802 		q->band_flows[idx].new_flows.first = NULL;
803 		q->band_flows[idx].old_flows.first = NULL;
804 	}
805 	q->delayed		= RB_ROOT;
806 	q->flows		= 0;
807 	q->inactive_flows	= 0;
808 	q->throttled_flows	= 0;
809 }
810 
811 static void fq_rehash(struct fq_sched_data *q,
812 		      struct rb_root *old_array, u32 old_log,
813 		      struct rb_root *new_array, u32 new_log)
814 {
815 	struct rb_node *op, **np, *parent;
816 	struct rb_root *oroot, *nroot;
817 	struct fq_flow *of, *nf;
818 	int fcnt = 0;
819 	u32 idx;
820 
821 	for (idx = 0; idx < (1U << old_log); idx++) {
822 		oroot = &old_array[idx];
823 		while ((op = rb_first(oroot)) != NULL) {
824 			rb_erase(op, oroot);
825 			of = rb_entry(op, struct fq_flow, fq_node);
826 			if (fq_gc_candidate(of)) {
827 				fcnt++;
828 				kmem_cache_free(fq_flow_cachep, of);
829 				continue;
830 			}
831 			nroot = &new_array[hash_ptr(of->sk, new_log)];
832 
833 			np = &nroot->rb_node;
834 			parent = NULL;
835 			while (*np) {
836 				parent = *np;
837 
838 				nf = rb_entry(parent, struct fq_flow, fq_node);
839 				BUG_ON(nf->sk == of->sk);
840 
841 				if (nf->sk > of->sk)
842 					np = &parent->rb_right;
843 				else
844 					np = &parent->rb_left;
845 			}
846 
847 			rb_link_node(&of->fq_node, parent, np);
848 			rb_insert_color(&of->fq_node, nroot);
849 		}
850 	}
851 	q->flows -= fcnt;
852 	q->inactive_flows -= fcnt;
853 	q->stat_gc_flows += fcnt;
854 }
855 
856 static void fq_free(void *addr)
857 {
858 	kvfree(addr);
859 }
860 
861 static int fq_resize(struct Qdisc *sch, u32 log)
862 {
863 	struct fq_sched_data *q = qdisc_priv(sch);
864 	struct rb_root *array;
865 	void *old_fq_root;
866 	u32 idx;
867 
868 	if (q->fq_root && log == q->fq_trees_log)
869 		return 0;
870 
871 	/* If XPS was setup, we can allocate memory on right NUMA node */
872 	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL,
873 			      netdev_queue_numa_node_read(sch->dev_queue));
874 	if (!array)
875 		return -ENOMEM;
876 
877 	for (idx = 0; idx < (1U << log); idx++)
878 		array[idx] = RB_ROOT;
879 
880 	sch_tree_lock(sch);
881 
882 	old_fq_root = q->fq_root;
883 	if (old_fq_root)
884 		fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
885 
886 	q->fq_root = array;
887 	q->fq_trees_log = log;
888 
889 	sch_tree_unlock(sch);
890 
891 	fq_free(old_fq_root);
892 
893 	return 0;
894 }
895 
896 static struct netlink_range_validation iq_range = {
897 	.max = INT_MAX,
898 };
899 
900 static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
901 	[TCA_FQ_UNSPEC]			= { .strict_start_type = TCA_FQ_TIMER_SLACK },
902 
903 	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
904 	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
905 	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
906 	[TCA_FQ_INITIAL_QUANTUM]	= NLA_POLICY_FULL_RANGE(NLA_U32, &iq_range),
907 	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
908 	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
909 	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
910 	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
911 	[TCA_FQ_FLOW_REFILL_DELAY]	= { .type = NLA_U32 },
912 	[TCA_FQ_ORPHAN_MASK]		= { .type = NLA_U32 },
913 	[TCA_FQ_LOW_RATE_THRESHOLD]	= { .type = NLA_U32 },
914 	[TCA_FQ_CE_THRESHOLD]		= { .type = NLA_U32 },
915 	[TCA_FQ_TIMER_SLACK]		= { .type = NLA_U32 },
916 	[TCA_FQ_HORIZON]		= { .type = NLA_U32 },
917 	[TCA_FQ_HORIZON_DROP]		= { .type = NLA_U8 },
918 	[TCA_FQ_PRIOMAP]		= {
919 			.type = NLA_BINARY,
920 			.len = sizeof(struct tc_prio_qopt),
921 		},
922 	[TCA_FQ_WEIGHTS]		= {
923 			.type = NLA_BINARY,
924 			.len = FQ_BANDS * sizeof(s32),
925 		},
926 };
927 
928 /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
929 static void fq_prio2band_compress_crumb(const u8 *in, u8 *out)
930 {
931 	const int num_elems = TC_PRIO_MAX + 1;
932 	int i;
933 
934 	memset(out, 0, num_elems / 4);
935 	for (i = 0; i < num_elems; i++)
936 		out[i / 4] |= in[i] << (2 * (i & 0x3));
937 }
938 
939 static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
940 {
941 	const int num_elems = TC_PRIO_MAX + 1;
942 	int i;
943 
944 	for (i = 0; i < num_elems; i++)
945 		out[i] = fq_prio2band(in, i);
946 }
947 
948 static int fq_load_weights(struct fq_sched_data *q,
949 			   const struct nlattr *attr,
950 			   struct netlink_ext_ack *extack)
951 {
952 	s32 *weights = nla_data(attr);
953 	int i;
954 
955 	for (i = 0; i < FQ_BANDS; i++) {
956 		if (weights[i] < FQ_MIN_WEIGHT) {
957 			NL_SET_ERR_MSG_FMT_MOD(extack, "Weight %d less that minimum allowed %d",
958 					       weights[i], FQ_MIN_WEIGHT);
959 			return -EINVAL;
960 		}
961 	}
962 	for (i = 0; i < FQ_BANDS; i++)
963 		q->band_flows[i].quantum = weights[i];
964 	return 0;
965 }
966 
967 static int fq_load_priomap(struct fq_sched_data *q,
968 			   const struct nlattr *attr,
969 			   struct netlink_ext_ack *extack)
970 {
971 	const struct tc_prio_qopt *map = nla_data(attr);
972 	int i;
973 
974 	if (map->bands != FQ_BANDS) {
975 		NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
976 		return -EINVAL;
977 	}
978 	for (i = 0; i < TC_PRIO_MAX + 1; i++) {
979 		if (map->priomap[i] >= FQ_BANDS) {
980 			NL_SET_ERR_MSG_FMT_MOD(extack, "FQ priomap field %d maps to a too high band %d",
981 					       i, map->priomap[i]);
982 			return -EINVAL;
983 		}
984 	}
985 	fq_prio2band_compress_crumb(map->priomap, q->prio2band);
986 	return 0;
987 }
988 
989 static int fq_change(struct Qdisc *sch, struct nlattr *opt,
990 		     struct netlink_ext_ack *extack)
991 {
992 	struct fq_sched_data *q = qdisc_priv(sch);
993 	struct nlattr *tb[TCA_FQ_MAX + 1];
994 	int err, drop_count = 0;
995 	unsigned drop_len = 0;
996 	u32 fq_log;
997 
998 	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
999 					  NULL);
1000 	if (err < 0)
1001 		return err;
1002 
1003 	sch_tree_lock(sch);
1004 
1005 	fq_log = q->fq_trees_log;
1006 
1007 	if (tb[TCA_FQ_BUCKETS_LOG]) {
1008 		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
1009 
1010 		if (nval >= 1 && nval <= ilog2(256*1024))
1011 			fq_log = nval;
1012 		else
1013 			err = -EINVAL;
1014 	}
1015 	if (tb[TCA_FQ_PLIMIT])
1016 		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
1017 
1018 	if (tb[TCA_FQ_FLOW_PLIMIT])
1019 		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
1020 
1021 	if (tb[TCA_FQ_QUANTUM]) {
1022 		u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
1023 
1024 		if (quantum > 0 && quantum <= (1 << 20)) {
1025 			q->quantum = quantum;
1026 		} else {
1027 			NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
1028 			err = -EINVAL;
1029 		}
1030 	}
1031 
1032 	if (tb[TCA_FQ_INITIAL_QUANTUM])
1033 		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
1034 
1035 	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
1036 		pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
1037 				    nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
1038 
1039 	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
1040 		u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
1041 
1042 		q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
1043 	}
1044 	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
1045 		q->low_rate_threshold =
1046 			nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
1047 
1048 	if (tb[TCA_FQ_RATE_ENABLE]) {
1049 		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
1050 
1051 		if (enable <= 1)
1052 			q->rate_enable = enable;
1053 		else
1054 			err = -EINVAL;
1055 	}
1056 
1057 	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
1058 		u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
1059 
1060 		q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
1061 	}
1062 
1063 	if (!err && tb[TCA_FQ_PRIOMAP])
1064 		err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);
1065 
1066 	if (!err && tb[TCA_FQ_WEIGHTS])
1067 		err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack);
1068 
1069 	if (tb[TCA_FQ_ORPHAN_MASK])
1070 		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
1071 
1072 	if (tb[TCA_FQ_CE_THRESHOLD])
1073 		q->ce_threshold = (u64)NSEC_PER_USEC *
1074 				  nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
1075 
1076 	if (tb[TCA_FQ_TIMER_SLACK])
1077 		q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);
1078 
1079 	if (tb[TCA_FQ_HORIZON])
1080 		q->horizon = (u64)NSEC_PER_USEC *
1081 				  nla_get_u32(tb[TCA_FQ_HORIZON]);
1082 
1083 	if (tb[TCA_FQ_HORIZON_DROP])
1084 		q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]);
1085 
1086 	if (!err) {
1087 
1088 		sch_tree_unlock(sch);
1089 		err = fq_resize(sch, fq_log);
1090 		sch_tree_lock(sch);
1091 	}
1092 	while (sch->q.qlen > sch->limit) {
1093 		struct sk_buff *skb = fq_dequeue(sch);
1094 
1095 		if (!skb)
1096 			break;
1097 		drop_len += qdisc_pkt_len(skb);
1098 		rtnl_kfree_skbs(skb, skb);
1099 		drop_count++;
1100 	}
1101 	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
1102 
1103 	sch_tree_unlock(sch);
1104 	return err;
1105 }
1106 
1107 static void fq_destroy(struct Qdisc *sch)
1108 {
1109 	struct fq_sched_data *q = qdisc_priv(sch);
1110 
1111 	fq_reset(sch);
1112 	fq_free(q->fq_root);
1113 	qdisc_watchdog_cancel(&q->watchdog);
1114 }
1115 
1116 static int fq_init(struct Qdisc *sch, struct nlattr *opt,
1117 		   struct netlink_ext_ack *extack)
1118 {
1119 	struct fq_sched_data *q = qdisc_priv(sch);
1120 	int i, err;
1121 
1122 	sch->limit		= 10000;
1123 	q->flow_plimit		= 100;
1124 	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
1125 	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
1126 	q->flow_refill_delay	= msecs_to_jiffies(40);
1127 	q->flow_max_rate	= ~0UL;
1128 	q->time_next_delayed_flow = ~0ULL;
1129 	q->rate_enable		= 1;
1130 	for (i = 0; i < FQ_BANDS; i++) {
1131 		q->band_flows[i].new_flows.first = NULL;
1132 		q->band_flows[i].old_flows.first = NULL;
1133 	}
1134 	q->band_flows[0].quantum = 9 << 16;
1135 	q->band_flows[1].quantum = 3 << 16;
1136 	q->band_flows[2].quantum = 1 << 16;
1137 	q->delayed		= RB_ROOT;
1138 	q->fq_root		= NULL;
1139 	q->fq_trees_log		= ilog2(1024);
1140 	q->orphan_mask		= 1024 - 1;
1141 	q->low_rate_threshold	= 550000 / 8;
1142 
1143 	q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
1144 
1145 	q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */
1146 	q->horizon_drop = 1; /* by default, drop packets beyond horizon */
1147 
1148 	/* Default ce_threshold of 4294 seconds */
1149 	q->ce_threshold		= (u64)NSEC_PER_USEC * ~0U;
1150 
1151 	fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band);
1152 	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
1153 
1154 	if (opt)
1155 		err = fq_change(sch, opt, extack);
1156 	else
1157 		err = fq_resize(sch, q->fq_trees_log);
1158 
1159 	return err;
1160 }
1161 
1162 static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
1163 {
1164 	struct fq_sched_data *q = qdisc_priv(sch);
1165 	u64 ce_threshold = q->ce_threshold;
1166 	struct tc_prio_qopt prio = {
1167 		.bands = FQ_BANDS,
1168 	};
1169 	u64 horizon = q->horizon;
1170 	struct nlattr *opts;
1171 	s32 weights[3];
1172 
1173 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
1174 	if (opts == NULL)
1175 		goto nla_put_failure;
1176 
1177 	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
1178 
1179 	do_div(ce_threshold, NSEC_PER_USEC);
1180 	do_div(horizon, NSEC_PER_USEC);
1181 
1182 	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
1183 	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
1184 	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
1185 	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
1186 	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
1187 	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
1188 			min_t(unsigned long, q->flow_max_rate, ~0U)) ||
1189 	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
1190 			jiffies_to_usecs(q->flow_refill_delay)) ||
1191 	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
1192 	    nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
1193 			q->low_rate_threshold) ||
1194 	    nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
1195 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
1196 	    nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) ||
1197 	    nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
1198 	    nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
1199 		goto nla_put_failure;
1200 
1201 	fq_prio2band_decompress_crumb(q->prio2band, prio.priomap);
1202 	if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
1203 		goto nla_put_failure;
1204 
1205 	weights[0] = q->band_flows[0].quantum;
1206 	weights[1] = q->band_flows[1].quantum;
1207 	weights[2] = q->band_flows[2].quantum;
1208 	if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights))
1209 		goto nla_put_failure;
1210 
1211 	return nla_nest_end(skb, opts);
1212 
1213 nla_put_failure:
1214 	return -1;
1215 }
1216 
1217 static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1218 {
1219 	struct fq_sched_data *q = qdisc_priv(sch);
1220 	struct tc_fq_qd_stats st;
1221 	int i;
1222 
1223 	st.pad = 0;
1224 
1225 	sch_tree_lock(sch);
1226 
1227 	st.gc_flows		  = q->stat_gc_flows;
1228 	st.highprio_packets	  = 0;
1229 	st.fastpath_packets	  = q->internal.stat_fastpath_packets;
1230 	st.tcp_retrans		  = 0;
1231 	st.throttled		  = q->stat_throttled;
1232 	st.flows_plimit		  = q->stat_flows_plimit;
1233 	st.pkts_too_long	  = q->stat_pkts_too_long;
1234 	st.allocation_errors	  = q->stat_allocation_errors;
1235 	st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
1236 				    ktime_get_ns();
1237 	st.flows		  = q->flows;
1238 	st.inactive_flows	  = q->inactive_flows;
1239 	st.throttled_flows	  = q->throttled_flows;
1240 	st.unthrottle_latency_ns  = min_t(unsigned long,
1241 					  q->unthrottle_latency_ns, ~0U);
1242 	st.ce_mark		  = q->stat_ce_mark;
1243 	st.horizon_drops	  = q->stat_horizon_drops;
1244 	st.horizon_caps		  = q->stat_horizon_caps;
1245 	for (i = 0; i < FQ_BANDS; i++) {
1246 		st.band_drops[i]  = q->stat_band_drops[i];
1247 		st.band_pkt_count[i] = q->band_pkt_count[i];
1248 	}
1249 	sch_tree_unlock(sch);
1250 
1251 	return gnet_stats_copy_app(d, &st, sizeof(st));
1252 }
1253 
1254 static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
1255 	.id		=	"fq",
1256 	.priv_size	=	sizeof(struct fq_sched_data),
1257 
1258 	.enqueue	=	fq_enqueue,
1259 	.dequeue	=	fq_dequeue,
1260 	.peek		=	qdisc_peek_dequeued,
1261 	.init		=	fq_init,
1262 	.reset		=	fq_reset,
1263 	.destroy	=	fq_destroy,
1264 	.change		=	fq_change,
1265 	.dump		=	fq_dump,
1266 	.dump_stats	=	fq_dump_stats,
1267 	.owner		=	THIS_MODULE,
1268 };
1269 
1270 static int __init fq_module_init(void)
1271 {
1272 	int ret;
1273 
1274 	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
1275 					   sizeof(struct fq_flow),
1276 					   0, SLAB_HWCACHE_ALIGN, NULL);
1277 	if (!fq_flow_cachep)
1278 		return -ENOMEM;
1279 
1280 	ret = register_qdisc(&fq_qdisc_ops);
1281 	if (ret)
1282 		kmem_cache_destroy(fq_flow_cachep);
1283 	return ret;
1284 }
1285 
1286 static void __exit fq_module_exit(void)
1287 {
1288 	unregister_qdisc(&fq_qdisc_ops);
1289 	kmem_cache_destroy(fq_flow_cachep);
1290 }
1291 
1292 module_init(fq_module_init)
1293 module_exit(fq_module_exit)
1294 MODULE_AUTHOR("Eric Dumazet");
1295 MODULE_LICENSE("GPL");
1296 MODULE_DESCRIPTION("Fair Queue Packet Scheduler");
1297