xref: /freebsd/sys/net/altq/altq_red.c (revision 0b57cec536236d46e3dba9bd041533462f33dbb7)
1 /*-
2  * Copyright (C) 1997-2003
3  *	Sony Computer Science Laboratories Inc.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  */
27 /*-
28  * Copyright (c) 1990-1994 Regents of the University of California.
29  * All rights reserved.
30  *
31  * Redistribution and use in source and binary forms, with or without
32  * modification, are permitted provided that the following conditions
33  * are met:
34  * 1. Redistributions of source code must retain the above copyright
35  *    notice, this list of conditions and the following disclaimer.
36  * 2. Redistributions in binary form must reproduce the above copyright
37  *    notice, this list of conditions and the following disclaimer in the
38  *    documentation and/or other materials provided with the distribution.
39  * 3. All advertising materials mentioning features or use of this software
40  *    must display the following acknowledgement:
41  *	This product includes software developed by the Computer Systems
42  *	Engineering Group at Lawrence Berkeley Laboratory.
43  * 4. Neither the name of the University nor of the Laboratory may be used
44  *    to endorse or promote products derived from this software without
45  *    specific prior written permission.
46  *
47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57  * SUCH DAMAGE.
58  *
59  * $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $
60  * $FreeBSD$
61  */
62 
63 #include "opt_altq.h"
64 #include "opt_inet.h"
65 #include "opt_inet6.h"
66 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
67 
68 #include <sys/param.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/socket.h>
72 #include <sys/systm.h>
73 #include <sys/errno.h>
74 #if 1 /* ALTQ3_COMPAT */
75 #include <sys/sockio.h>
76 #include <sys/proc.h>
77 #include <sys/kernel.h>
78 #ifdef ALTQ_FLOWVALVE
79 #include <sys/queue.h>
80 #include <sys/time.h>
81 #endif
82 #endif /* ALTQ3_COMPAT */
83 
84 #include <net/if.h>
85 #include <net/if_var.h>
86 
87 #include <netinet/in.h>
88 #include <netinet/in_systm.h>
89 #include <netinet/ip.h>
90 #ifdef INET6
91 #include <netinet/ip6.h>
92 #endif
93 
94 #include <netpfil/pf/pf.h>
95 #include <netpfil/pf/pf_altq.h>
96 #include <netpfil/pf/pf_mtag.h>
97 #include <net/altq/altq.h>
98 #include <net/altq/altq_red.h>
99 
100 /*
101  * ALTQ/RED (Random Early Detection) implementation using 32-bit
102  * fixed-point calculation.
103  *
104  * written by kjc using the ns code as a reference.
105  * you can learn more about red and ns from Sally's home page at
106  * http://www-nrg.ee.lbl.gov/floyd/
107  *
108  * most of the red parameter values are fixed in this implementation
109  * to prevent fixed-point overflow/underflow.
110  * if you change the parameters, watch out for overflow/underflow!
111  *
112  * the parameters used are recommended values by Sally.
113  * the corresponding ns config looks:
114  *	q_weight=0.00195
115  *	minthresh=5 maxthresh=15 queue-size=60
116  *	linterm=30
117  *	dropmech=drop-tail
118  *	bytes=false (can't be handled by 32-bit fixed-point)
119  *	doubleq=false dqthresh=false
120  *	wait=true
121  */
122 /*
123  * alternative red parameters for a slow link.
124  *
125  * assume the queue length becomes from zero to L and keeps L, it takes
126  * N packets for q_avg to reach 63% of L.
127  * when q_weight is 0.002, N is about 500 packets.
128  * for a slow link like dial-up, 500 packets takes more than 1 minute!
129  * when q_weight is 0.008, N is about 127 packets.
130  * when q_weight is 0.016, N is about 63 packets.
131  * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
132  * are allowed for 0.016.
133  * see Sally's paper for more details.
134  */
135 /* normal red parameters */
136 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
137 				/* q_weight = 0.00195 */
138 
139 /* red parameters for a slow link */
140 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
141 				/* q_weight = 0.0078125 */
142 
143 /* red parameters for a very slow link (e.g., dialup) */
144 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
145 				/* q_weight = 0.015625 */
146 
147 /* fixed-point uses 12-bit decimal places */
148 #define	FP_SHIFT	12	/* fixed-point shift */
149 
150 /* red parameters for drop probability */
151 #define	INV_P_MAX	10	/* inverse of max drop probability */
152 #define	TH_MIN		5	/* min threshold */
153 #define	TH_MAX		15	/* max threshold */
154 
155 #define	RED_LIMIT	60	/* default max queue length */
156 #define	RED_STATS		/* collect statistics */
157 
158 /*
159  * our default policy for forced-drop is drop-tail.
160  * (in altq-1.1.2 or earlier, the default was random-drop.
161  * but it makes more sense to punish the cause of the surge.)
162  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
163  */
164 
165 
166 /* default red parameter values */
167 static int default_th_min = TH_MIN;
168 static int default_th_max = TH_MAX;
169 static int default_inv_pmax = INV_P_MAX;
170 
171 
172 /*
173  * red support routines
174  */
175 red_t *
176 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
177    int pkttime)
178 {
179 	red_t	*rp;
180 	int	 w, i;
181 	int	 npkts_per_sec;
182 
183 	rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO);
184 	if (rp == NULL)
185 		return (NULL);
186 
187 	if (weight == 0)
188 		rp->red_weight = W_WEIGHT;
189 	else
190 		rp->red_weight = weight;
191 
192 	/* allocate weight table */
193 	rp->red_wtab = wtab_alloc(rp->red_weight);
194 	if (rp->red_wtab == NULL) {
195 		free(rp, M_DEVBUF);
196 		return (NULL);
197 	}
198 
199 	rp->red_avg = 0;
200 	rp->red_idle = 1;
201 
202 	if (inv_pmax == 0)
203 		rp->red_inv_pmax = default_inv_pmax;
204 	else
205 		rp->red_inv_pmax = inv_pmax;
206 	if (th_min == 0)
207 		rp->red_thmin = default_th_min;
208 	else
209 		rp->red_thmin = th_min;
210 	if (th_max == 0)
211 		rp->red_thmax = default_th_max;
212 	else
213 		rp->red_thmax = th_max;
214 
215 	rp->red_flags = flags;
216 
217 	if (pkttime == 0)
218 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
219 		rp->red_pkttime = 800;
220 	else
221 		rp->red_pkttime = pkttime;
222 
223 	if (weight == 0) {
224 		/* when the link is very slow, adjust red parameters */
225 		npkts_per_sec = 1000000 / rp->red_pkttime;
226 		if (npkts_per_sec < 50) {
227 			/* up to about 400Kbps */
228 			rp->red_weight = W_WEIGHT_2;
229 		} else if (npkts_per_sec < 300) {
230 			/* up to about 2.4Mbps */
231 			rp->red_weight = W_WEIGHT_1;
232 		}
233 	}
234 
235 	/* calculate wshift.  weight must be power of 2 */
236 	w = rp->red_weight;
237 	for (i = 0; w > 1; i++)
238 		w = w >> 1;
239 	rp->red_wshift = i;
240 	w = 1 << rp->red_wshift;
241 	if (w != rp->red_weight) {
242 		printf("invalid weight value %d for red! use %d\n",
243 		       rp->red_weight, w);
244 		rp->red_weight = w;
245 	}
246 
247 	/*
248 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
249 	 * to be compared with avg.
250 	 */
251 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
252 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
253 
254 	/*
255 	 * precompute probability denominator
256 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
257 	 */
258 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
259 			 * rp->red_inv_pmax) << FP_SHIFT;
260 
261 	microtime(&rp->red_last);
262 	return (rp);
263 }
264 
265 void
266 red_destroy(red_t *rp)
267 {
268 	wtab_destroy(rp->red_wtab);
269 	free(rp, M_DEVBUF);
270 }
271 
272 void
273 red_getstats(red_t *rp, struct redstats *sp)
274 {
275 	sp->q_avg		= rp->red_avg >> rp->red_wshift;
276 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
277 	sp->drop_cnt		= rp->red_stats.drop_cnt;
278 	sp->drop_forced		= rp->red_stats.drop_forced;
279 	sp->drop_unforced	= rp->red_stats.drop_unforced;
280 	sp->marked_packets	= rp->red_stats.marked_packets;
281 }
282 
283 int
284 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
285     struct altq_pktattr *pktattr)
286 {
287 	int avg, droptype;
288 	int n;
289 
290 	avg = rp->red_avg;
291 
292 	/*
293 	 * if we were idle, we pretend that n packets arrived during
294 	 * the idle period.
295 	 */
296 	if (rp->red_idle) {
297 		struct timeval now;
298 		int t;
299 
300 		rp->red_idle = 0;
301 		microtime(&now);
302 		t = (now.tv_sec - rp->red_last.tv_sec);
303 		if (t > 60) {
304 			/*
305 			 * being idle for more than 1 minute, set avg to zero.
306 			 * this prevents t from overflow.
307 			 */
308 			avg = 0;
309 		} else {
310 			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
311 			n = t / rp->red_pkttime - 1;
312 
313 			/* the following line does (avg = (1 - Wq)^n * avg) */
314 			if (n > 0)
315 				avg = (avg >> FP_SHIFT) *
316 				    pow_w(rp->red_wtab, n);
317 		}
318 	}
319 
320 	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
321 	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
322 	rp->red_avg = avg;		/* save the new value */
323 
324 	/*
325 	 * red_count keeps a tally of arriving traffic that has not
326 	 * been dropped.
327 	 */
328 	rp->red_count++;
329 
330 	/* see if we drop early */
331 	droptype = DTYPE_NODROP;
332 	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
333 		if (avg >= rp->red_thmax_s) {
334 			/* avg >= th_max: forced drop */
335 			droptype = DTYPE_FORCED;
336 		} else if (rp->red_old == 0) {
337 			/* first exceeds th_min */
338 			rp->red_count = 1;
339 			rp->red_old = 1;
340 		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
341 				      rp->red_probd, rp->red_count)) {
342 			/* mark or drop by red */
343 			if ((rp->red_flags & REDF_ECN) &&
344 			    mark_ecn(m, pktattr, rp->red_flags)) {
345 				/* successfully marked.  do not drop. */
346 				rp->red_count = 0;
347 #ifdef RED_STATS
348 				rp->red_stats.marked_packets++;
349 #endif
350 			} else {
351 				/* unforced drop by red */
352 				droptype = DTYPE_EARLY;
353 			}
354 		}
355 	} else {
356 		/* avg < th_min */
357 		rp->red_old = 0;
358 	}
359 
360 	/*
361 	 * if the queue length hits the hard limit, it's a forced drop.
362 	 */
363 	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
364 		droptype = DTYPE_FORCED;
365 
366 #ifdef RED_RANDOM_DROP
367 	/* if successful or forced drop, enqueue this packet. */
368 	if (droptype != DTYPE_EARLY)
369 		_addq(q, m);
370 #else
371 	/* if successful, enqueue this packet. */
372 	if (droptype == DTYPE_NODROP)
373 		_addq(q, m);
374 #endif
375 	if (droptype != DTYPE_NODROP) {
376 		if (droptype == DTYPE_EARLY) {
377 			/* drop the incoming packet */
378 #ifdef RED_STATS
379 			rp->red_stats.drop_unforced++;
380 #endif
381 		} else {
382 			/* forced drop, select a victim packet in the queue. */
383 #ifdef RED_RANDOM_DROP
384 			m = _getq_random(q);
385 #endif
386 #ifdef RED_STATS
387 			rp->red_stats.drop_forced++;
388 #endif
389 		}
390 #ifdef RED_STATS
391 		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
392 #endif
393 		rp->red_count = 0;
394 		m_freem(m);
395 		return (-1);
396 	}
397 	/* successfully queued */
398 #ifdef RED_STATS
399 	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
400 #endif
401 	return (0);
402 }
403 
404 /*
405  * early-drop probability is calculated as follows:
406  *   prob = p_max * (avg - th_min) / (th_max - th_min)
407  *   prob_a = prob / (2 - count*prob)
408  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
409  * here prob_a increases as successive undrop count increases.
410  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
411  * becomes 1 when (count >= (2 / prob))).
412  */
413 int
414 drop_early(int fp_len, int fp_probd, int count)
415 {
416 	int	d;		/* denominator of drop-probability */
417 
418 	d = fp_probd - count * fp_len;
419 	if (d <= 0)
420 		/* count exceeds the hard limit: drop or mark */
421 		return (1);
422 
423 	/*
424 	 * now the range of d is [1..600] in fixed-point. (when
425 	 * th_max-th_min=10 and p_max=1/30)
426 	 * drop probability = (avg - TH_MIN) / d
427 	 */
428 
429 	if ((arc4random() % d) < fp_len) {
430 		/* drop or mark */
431 		return (1);
432 	}
433 	/* no drop/mark */
434 	return (0);
435 }
436 
437 /*
438  * try to mark CE bit to the packet.
439  *    returns 1 if successfully marked, 0 otherwise.
440  */
441 int
442 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
443 {
444 	struct mbuf	*m0;
445 	struct pf_mtag	*at;
446 	void		*hdr;
447 
448 	at = pf_find_mtag(m);
449 	if (at != NULL) {
450 		hdr = at->hdr;
451 	} else
452 		return (0);
453 
454 	/* verify that pattr_hdr is within the mbuf data */
455 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
456 		if (((caddr_t)hdr >= m0->m_data) &&
457 		    ((caddr_t)hdr < m0->m_data + m0->m_len))
458 			break;
459 	if (m0 == NULL) {
460 		/* ick, tag info is stale */
461 		return (0);
462 	}
463 
464 	switch (((struct ip *)hdr)->ip_v) {
465 	case IPVERSION:
466 		if (flags & REDF_ECN4) {
467 			struct ip *ip = hdr;
468 			u_int8_t otos;
469 			int sum;
470 
471 			if (ip->ip_v != 4)
472 				return (0);	/* version mismatch! */
473 
474 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
475 				return (0);	/* not-ECT */
476 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
477 				return (1);	/* already marked */
478 
479 			/*
480 			 * ecn-capable but not marked,
481 			 * mark CE and update checksum
482 			 */
483 			otos = ip->ip_tos;
484 			ip->ip_tos |= IPTOS_ECN_CE;
485 			/*
486 			 * update checksum (from RFC1624)
487 			 *	   HC' = ~(~HC + ~m + m')
488 			 */
489 			sum = ~ntohs(ip->ip_sum) & 0xffff;
490 			sum += (~otos & 0xffff) + ip->ip_tos;
491 			sum = (sum >> 16) + (sum & 0xffff);
492 			sum += (sum >> 16);  /* add carry */
493 			ip->ip_sum = htons(~sum & 0xffff);
494 			return (1);
495 		}
496 		break;
497 #ifdef INET6
498 	case (IPV6_VERSION >> 4):
499 		if (flags & REDF_ECN6) {
500 			struct ip6_hdr *ip6 = hdr;
501 			u_int32_t flowlabel;
502 
503 			flowlabel = ntohl(ip6->ip6_flow);
504 			if ((flowlabel >> 28) != 6)
505 				return (0);	/* version mismatch! */
506 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
507 			    (IPTOS_ECN_NOTECT << 20))
508 				return (0);	/* not-ECT */
509 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
510 			    (IPTOS_ECN_CE << 20))
511 				return (1);	/* already marked */
512 			/*
513 			 * ecn-capable but not marked,  mark CE
514 			 */
515 			flowlabel |= (IPTOS_ECN_CE << 20);
516 			ip6->ip6_flow = htonl(flowlabel);
517 			return (1);
518 		}
519 		break;
520 #endif  /* INET6 */
521 	}
522 
523 	/* not marked */
524 	return (0);
525 }
526 
527 struct mbuf *
528 red_getq(rp, q)
529 	red_t *rp;
530 	class_queue_t *q;
531 {
532 	struct mbuf *m;
533 
534 	if ((m = _getq(q)) == NULL) {
535 		if (rp->red_idle == 0) {
536 			rp->red_idle = 1;
537 			microtime(&rp->red_last);
538 		}
539 		return NULL;
540 	}
541 
542 	rp->red_idle = 0;
543 	return (m);
544 }
545 
546 /*
547  * helper routine to calibrate avg during idle.
548  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
549  * here Wq = 1/weight and the code assumes Wq is close to zero.
550  *
551  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
552  */
553 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
554 
555 struct wtab *
556 wtab_alloc(int weight)
557 {
558 	struct wtab	*w;
559 	int		 i;
560 
561 	for (w = wtab_list; w != NULL; w = w->w_next)
562 		if (w->w_weight == weight) {
563 			w->w_refcount++;
564 			return (w);
565 		}
566 
567 	w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO);
568 	if (w == NULL)
569 		return (NULL);
570 	w->w_weight = weight;
571 	w->w_refcount = 1;
572 	w->w_next = wtab_list;
573 	wtab_list = w;
574 
575 	/* initialize the weight table */
576 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
577 	for (i = 1; i < 32; i++) {
578 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
579 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
580 			w->w_param_max = 1 << i;
581 	}
582 
583 	return (w);
584 }
585 
586 int
587 wtab_destroy(struct wtab *w)
588 {
589 	struct wtab	*prev;
590 
591 	if (--w->w_refcount > 0)
592 		return (0);
593 
594 	if (wtab_list == w)
595 		wtab_list = w->w_next;
596 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
597 		if (prev->w_next == w) {
598 			prev->w_next = w->w_next;
599 			break;
600 		}
601 
602 	free(w, M_DEVBUF);
603 	return (0);
604 }
605 
606 int32_t
607 pow_w(struct wtab *w, int n)
608 {
609 	int	i, bit;
610 	int32_t	val;
611 
612 	if (n >= w->w_param_max)
613 		return (0);
614 
615 	val = 1 << FP_SHIFT;
616 	if (n <= 0)
617 		return (val);
618 
619 	bit = 1;
620 	i = 0;
621 	while (n) {
622 		if (n & bit) {
623 			val = (val * w->w_tab[i]) >> FP_SHIFT;
624 			n &= ~bit;
625 		}
626 		i++;
627 		bit <<=  1;
628 	}
629 	return (val);
630 }
631 
632 
633 #endif /* ALTQ_RED */
634