xref: /freebsd/sys/net/altq/altq_red.c (revision 525fe93dc7487a1e63a90f6a2b956abc601963c1)
1 /*-
2  * Copyright (C) 1997-2003
3  *	Sony Computer Science Laboratories Inc.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  */
27 /*-
28  * Copyright (c) 1990-1994 Regents of the University of California.
29  * All rights reserved.
30  *
31  * Redistribution and use in source and binary forms, with or without
32  * modification, are permitted provided that the following conditions
33  * are met:
34  * 1. Redistributions of source code must retain the above copyright
35  *    notice, this list of conditions and the following disclaimer.
36  * 2. Redistributions in binary form must reproduce the above copyright
37  *    notice, this list of conditions and the following disclaimer in the
38  *    documentation and/or other materials provided with the distribution.
39  * 3. All advertising materials mentioning features or use of this software
40  *    must display the following acknowledgement:
41  *	This product includes software developed by the Computer Systems
42  *	Engineering Group at Lawrence Berkeley Laboratory.
43  * 4. Neither the name of the University nor of the Laboratory may be used
44  *    to endorse or promote products derived from this software without
45  *    specific prior written permission.
46  *
47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57  * SUCH DAMAGE.
58  *
59  * $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $
60  */
61 
62 #include "opt_altq.h"
63 #include "opt_inet.h"
64 #include "opt_inet6.h"
65 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
66 
67 #include <sys/param.h>
68 #include <sys/malloc.h>
69 #include <sys/mbuf.h>
70 #include <sys/socket.h>
71 #include <sys/systm.h>
72 #include <sys/errno.h>
73 #if 1 /* ALTQ3_COMPAT */
74 #include <sys/sockio.h>
75 #include <sys/proc.h>
76 #include <sys/kernel.h>
77 #ifdef ALTQ_FLOWVALVE
78 #include <sys/queue.h>
79 #include <sys/time.h>
80 #endif
81 #endif /* ALTQ3_COMPAT */
82 
83 #include <net/if.h>
84 #include <net/if_var.h>
85 
86 #include <netinet/in.h>
87 #include <netinet/in_systm.h>
88 #include <netinet/ip.h>
89 #ifdef INET6
90 #include <netinet/ip6.h>
91 #endif
92 
93 #include <netpfil/pf/pf.h>
94 #include <netpfil/pf/pf_altq.h>
95 #include <netpfil/pf/pf_mtag.h>
96 #include <net/altq/altq.h>
97 #include <net/altq/altq_red.h>
98 
99 /*
100  * ALTQ/RED (Random Early Detection) implementation using 32-bit
101  * fixed-point calculation.
102  *
103  * written by kjc using the ns code as a reference.
104  * you can learn more about red and ns from Sally's home page at
105  * http://www-nrg.ee.lbl.gov/floyd/
106  *
107  * most of the red parameter values are fixed in this implementation
108  * to prevent fixed-point overflow/underflow.
109  * if you change the parameters, watch out for overflow/underflow!
110  *
111  * the parameters used are recommended values by Sally.
112  * the corresponding ns config looks:
113  *	q_weight=0.00195
114  *	minthresh=5 maxthresh=15 queue-size=60
115  *	linterm=30
116  *	dropmech=drop-tail
117  *	bytes=false (can't be handled by 32-bit fixed-point)
118  *	doubleq=false dqthresh=false
119  *	wait=true
120  */
121 /*
122  * alternative red parameters for a slow link.
123  *
124  * assume the queue length becomes from zero to L and keeps L, it takes
125  * N packets for q_avg to reach 63% of L.
126  * when q_weight is 0.002, N is about 500 packets.
127  * for a slow link like dial-up, 500 packets takes more than 1 minute!
128  * when q_weight is 0.008, N is about 127 packets.
129  * when q_weight is 0.016, N is about 63 packets.
130  * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
131  * are allowed for 0.016.
132  * see Sally's paper for more details.
133  */
134 /* normal red parameters */
135 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
136 				/* q_weight = 0.00195 */
137 
138 /* red parameters for a slow link */
139 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
140 				/* q_weight = 0.0078125 */
141 
142 /* red parameters for a very slow link (e.g., dialup) */
143 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
144 				/* q_weight = 0.015625 */
145 
146 /* fixed-point uses 12-bit decimal places */
147 #define	FP_SHIFT	12	/* fixed-point shift */
148 
149 /* red parameters for drop probability */
150 #define	INV_P_MAX	10	/* inverse of max drop probability */
151 #define	TH_MIN		5	/* min threshold */
152 #define	TH_MAX		15	/* max threshold */
153 
154 #define	RED_LIMIT	60	/* default max queue length */
155 #define	RED_STATS		/* collect statistics */
156 
157 /*
158  * our default policy for forced-drop is drop-tail.
159  * (in altq-1.1.2 or earlier, the default was random-drop.
160  * but it makes more sense to punish the cause of the surge.)
161  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
162  */
163 
164 /* default red parameter values */
165 static int default_th_min = TH_MIN;
166 static int default_th_max = TH_MAX;
167 static int default_inv_pmax = INV_P_MAX;
168 
169 /*
170  * red support routines
171  */
172 red_t *
173 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
174    int pkttime)
175 {
176 	red_t	*rp;
177 	int	 w, i;
178 	int	 npkts_per_sec;
179 
180 	rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO);
181 	if (rp == NULL)
182 		return (NULL);
183 
184 	if (weight == 0)
185 		rp->red_weight = W_WEIGHT;
186 	else
187 		rp->red_weight = weight;
188 
189 	/* allocate weight table */
190 	rp->red_wtab = wtab_alloc(rp->red_weight);
191 	if (rp->red_wtab == NULL) {
192 		free(rp, M_DEVBUF);
193 		return (NULL);
194 	}
195 
196 	rp->red_avg = 0;
197 	rp->red_idle = 1;
198 
199 	if (inv_pmax == 0)
200 		rp->red_inv_pmax = default_inv_pmax;
201 	else
202 		rp->red_inv_pmax = inv_pmax;
203 	if (th_min == 0)
204 		rp->red_thmin = default_th_min;
205 	else
206 		rp->red_thmin = th_min;
207 	if (th_max == 0)
208 		rp->red_thmax = default_th_max;
209 	else
210 		rp->red_thmax = th_max;
211 
212 	rp->red_flags = flags;
213 
214 	if (pkttime == 0)
215 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
216 		rp->red_pkttime = 800;
217 	else
218 		rp->red_pkttime = pkttime;
219 
220 	if (weight == 0) {
221 		/* when the link is very slow, adjust red parameters */
222 		npkts_per_sec = 1000000 / rp->red_pkttime;
223 		if (npkts_per_sec < 50) {
224 			/* up to about 400Kbps */
225 			rp->red_weight = W_WEIGHT_2;
226 		} else if (npkts_per_sec < 300) {
227 			/* up to about 2.4Mbps */
228 			rp->red_weight = W_WEIGHT_1;
229 		}
230 	}
231 
232 	/* calculate wshift.  weight must be power of 2 */
233 	w = rp->red_weight;
234 	for (i = 0; w > 1; i++)
235 		w = w >> 1;
236 	rp->red_wshift = i;
237 	w = 1 << rp->red_wshift;
238 	if (w != rp->red_weight) {
239 		printf("invalid weight value %d for red! use %d\n",
240 		       rp->red_weight, w);
241 		rp->red_weight = w;
242 	}
243 
244 	/*
245 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
246 	 * to be compared with avg.
247 	 */
248 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
249 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
250 
251 	/*
252 	 * precompute probability denominator
253 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
254 	 */
255 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
256 			 * rp->red_inv_pmax) << FP_SHIFT;
257 
258 	microtime(&rp->red_last);
259 	return (rp);
260 }
261 
262 void
263 red_destroy(red_t *rp)
264 {
265 	wtab_destroy(rp->red_wtab);
266 	free(rp, M_DEVBUF);
267 }
268 
269 void
270 red_getstats(red_t *rp, struct redstats *sp)
271 {
272 	sp->q_avg		= rp->red_avg >> rp->red_wshift;
273 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
274 	sp->drop_cnt		= rp->red_stats.drop_cnt;
275 	sp->drop_forced		= rp->red_stats.drop_forced;
276 	sp->drop_unforced	= rp->red_stats.drop_unforced;
277 	sp->marked_packets	= rp->red_stats.marked_packets;
278 }
279 
280 int
281 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
282     struct altq_pktattr *pktattr)
283 {
284 	int avg, droptype;
285 	int n;
286 
287 	avg = rp->red_avg;
288 
289 	/*
290 	 * if we were idle, we pretend that n packets arrived during
291 	 * the idle period.
292 	 */
293 	if (rp->red_idle) {
294 		struct timeval now;
295 		int t;
296 
297 		rp->red_idle = 0;
298 		microtime(&now);
299 		t = (now.tv_sec - rp->red_last.tv_sec);
300 		if (t > 60) {
301 			/*
302 			 * being idle for more than 1 minute, set avg to zero.
303 			 * this prevents t from overflow.
304 			 */
305 			avg = 0;
306 		} else {
307 			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
308 			n = t / rp->red_pkttime - 1;
309 
310 			/* the following line does (avg = (1 - Wq)^n * avg) */
311 			if (n > 0)
312 				avg = (avg >> FP_SHIFT) *
313 				    pow_w(rp->red_wtab, n);
314 		}
315 	}
316 
317 	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
318 	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
319 	rp->red_avg = avg;		/* save the new value */
320 
321 	/*
322 	 * red_count keeps a tally of arriving traffic that has not
323 	 * been dropped.
324 	 */
325 	rp->red_count++;
326 
327 	/* see if we drop early */
328 	droptype = DTYPE_NODROP;
329 	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
330 		if (avg >= rp->red_thmax_s) {
331 			/* avg >= th_max: forced drop */
332 			droptype = DTYPE_FORCED;
333 		} else if (rp->red_old == 0) {
334 			/* first exceeds th_min */
335 			rp->red_count = 1;
336 			rp->red_old = 1;
337 		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
338 				      rp->red_probd, rp->red_count)) {
339 			/* mark or drop by red */
340 			if ((rp->red_flags & REDF_ECN) &&
341 			    mark_ecn(m, pktattr, rp->red_flags)) {
342 				/* successfully marked.  do not drop. */
343 				rp->red_count = 0;
344 #ifdef RED_STATS
345 				rp->red_stats.marked_packets++;
346 #endif
347 			} else {
348 				/* unforced drop by red */
349 				droptype = DTYPE_EARLY;
350 			}
351 		}
352 	} else {
353 		/* avg < th_min */
354 		rp->red_old = 0;
355 	}
356 
357 	/*
358 	 * if the queue length hits the hard limit, it's a forced drop.
359 	 */
360 	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
361 		droptype = DTYPE_FORCED;
362 
363 #ifdef RED_RANDOM_DROP
364 	/* if successful or forced drop, enqueue this packet. */
365 	if (droptype != DTYPE_EARLY)
366 		_addq(q, m);
367 #else
368 	/* if successful, enqueue this packet. */
369 	if (droptype == DTYPE_NODROP)
370 		_addq(q, m);
371 #endif
372 	if (droptype != DTYPE_NODROP) {
373 		if (droptype == DTYPE_EARLY) {
374 			/* drop the incoming packet */
375 #ifdef RED_STATS
376 			rp->red_stats.drop_unforced++;
377 #endif
378 		} else {
379 			/* forced drop, select a victim packet in the queue. */
380 #ifdef RED_RANDOM_DROP
381 			m = _getq_random(q);
382 #endif
383 #ifdef RED_STATS
384 			rp->red_stats.drop_forced++;
385 #endif
386 		}
387 #ifdef RED_STATS
388 		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
389 #endif
390 		rp->red_count = 0;
391 		m_freem(m);
392 		return (-1);
393 	}
394 	/* successfully queued */
395 #ifdef RED_STATS
396 	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
397 #endif
398 	return (0);
399 }
400 
401 /*
402  * early-drop probability is calculated as follows:
403  *   prob = p_max * (avg - th_min) / (th_max - th_min)
404  *   prob_a = prob / (2 - count*prob)
405  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
406  * here prob_a increases as successive undrop count increases.
407  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
408  * becomes 1 when (count >= (2 / prob))).
409  */
410 int
411 drop_early(int fp_len, int fp_probd, int count)
412 {
413 	int	d;		/* denominator of drop-probability */
414 
415 	d = fp_probd - count * fp_len;
416 	if (d <= 0)
417 		/* count exceeds the hard limit: drop or mark */
418 		return (1);
419 
420 	/*
421 	 * now the range of d is [1..600] in fixed-point. (when
422 	 * th_max-th_min=10 and p_max=1/30)
423 	 * drop probability = (avg - TH_MIN) / d
424 	 */
425 
426 	if ((arc4random() % d) < fp_len) {
427 		/* drop or mark */
428 		return (1);
429 	}
430 	/* no drop/mark */
431 	return (0);
432 }
433 
434 /*
435  * try to mark CE bit to the packet.
436  *    returns 1 if successfully marked, 0 otherwise.
437  */
438 int
439 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
440 {
441 	struct mbuf	*m0;
442 	struct pf_mtag	*at;
443 	void		*hdr;
444 
445 	at = pf_find_mtag(m);
446 	if (at != NULL) {
447 		hdr = at->hdr;
448 	} else
449 		return (0);
450 
451 	/* verify that pattr_hdr is within the mbuf data */
452 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
453 		if (((caddr_t)hdr >= m0->m_data) &&
454 		    ((caddr_t)hdr < m0->m_data + m0->m_len))
455 			break;
456 	if (m0 == NULL) {
457 		/* ick, tag info is stale */
458 		return (0);
459 	}
460 
461 	switch (((struct ip *)hdr)->ip_v) {
462 	case IPVERSION:
463 		if (flags & REDF_ECN4) {
464 			struct ip *ip = hdr;
465 			u_int8_t otos;
466 			int sum;
467 
468 			if (ip->ip_v != 4)
469 				return (0);	/* version mismatch! */
470 
471 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
472 				return (0);	/* not-ECT */
473 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
474 				return (1);	/* already marked */
475 
476 			/*
477 			 * ecn-capable but not marked,
478 			 * mark CE and update checksum
479 			 */
480 			otos = ip->ip_tos;
481 			ip->ip_tos |= IPTOS_ECN_CE;
482 			/*
483 			 * update checksum (from RFC1624)
484 			 *	   HC' = ~(~HC + ~m + m')
485 			 */
486 			sum = ~ntohs(ip->ip_sum) & 0xffff;
487 			sum += (~otos & 0xffff) + ip->ip_tos;
488 			sum = (sum >> 16) + (sum & 0xffff);
489 			sum += (sum >> 16);  /* add carry */
490 			ip->ip_sum = htons(~sum & 0xffff);
491 			return (1);
492 		}
493 		break;
494 #ifdef INET6
495 	case (IPV6_VERSION >> 4):
496 		if (flags & REDF_ECN6) {
497 			struct ip6_hdr *ip6 = hdr;
498 			u_int32_t flowlabel;
499 
500 			flowlabel = ntohl(ip6->ip6_flow);
501 			if ((flowlabel >> 28) != 6)
502 				return (0);	/* version mismatch! */
503 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
504 			    (IPTOS_ECN_NOTECT << 20))
505 				return (0);	/* not-ECT */
506 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
507 			    (IPTOS_ECN_CE << 20))
508 				return (1);	/* already marked */
509 			/*
510 			 * ecn-capable but not marked,  mark CE
511 			 */
512 			flowlabel |= (IPTOS_ECN_CE << 20);
513 			ip6->ip6_flow = htonl(flowlabel);
514 			return (1);
515 		}
516 		break;
517 #endif  /* INET6 */
518 	}
519 
520 	/* not marked */
521 	return (0);
522 }
523 
524 struct mbuf *
525 red_getq(red_t *rp, class_queue_t *q)
526 {
527 	struct mbuf *m;
528 
529 	if ((m = _getq(q)) == NULL) {
530 		if (rp->red_idle == 0) {
531 			rp->red_idle = 1;
532 			microtime(&rp->red_last);
533 		}
534 		return NULL;
535 	}
536 
537 	rp->red_idle = 0;
538 	return (m);
539 }
540 
541 /*
542  * helper routine to calibrate avg during idle.
543  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
544  * here Wq = 1/weight and the code assumes Wq is close to zero.
545  *
546  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
547  */
548 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
549 
550 struct wtab *
551 wtab_alloc(int weight)
552 {
553 	struct wtab	*w;
554 	int		 i;
555 
556 	for (w = wtab_list; w != NULL; w = w->w_next)
557 		if (w->w_weight == weight) {
558 			w->w_refcount++;
559 			return (w);
560 		}
561 
562 	w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO);
563 	if (w == NULL)
564 		return (NULL);
565 	w->w_weight = weight;
566 	w->w_refcount = 1;
567 	w->w_next = wtab_list;
568 	wtab_list = w;
569 
570 	/* initialize the weight table */
571 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
572 	for (i = 1; i < 32; i++) {
573 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
574 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
575 			w->w_param_max = 1 << i;
576 	}
577 
578 	return (w);
579 }
580 
581 int
582 wtab_destroy(struct wtab *w)
583 {
584 	struct wtab	*prev;
585 
586 	if (--w->w_refcount > 0)
587 		return (0);
588 
589 	if (wtab_list == w)
590 		wtab_list = w->w_next;
591 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
592 		if (prev->w_next == w) {
593 			prev->w_next = w->w_next;
594 			break;
595 		}
596 
597 	free(w, M_DEVBUF);
598 	return (0);
599 }
600 
601 int32_t
602 pow_w(struct wtab *w, int n)
603 {
604 	int	i, bit;
605 	int32_t	val;
606 
607 	if (n >= w->w_param_max)
608 		return (0);
609 
610 	val = 1 << FP_SHIFT;
611 	if (n <= 0)
612 		return (val);
613 
614 	bit = 1;
615 	i = 0;
616 	while (n) {
617 		if (n & bit) {
618 			val = (val * w->w_tab[i]) >> FP_SHIFT;
619 			n &= ~bit;
620 		}
621 		i++;
622 		bit <<=  1;
623 	}
624 	return (val);
625 }
626 
627 #endif /* ALTQ_RED */
628