1 /*-
2 * Copyright (C) 1998-2003
3 * Sony Computer Science Laboratories Inc. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26 /*-
27 * Copyright (c) 1990-1994 Regents of the University of California.
28 * All rights reserved.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the Computer Systems
41 * Engineering Group at Lawrence Berkeley Laboratory.
42 * 4. Neither the name of the University nor of the Laboratory may be used
43 * to endorse or promote products derived from this software without
44 * specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * $KAME: altq_rio.c,v 1.17 2003/07/10 12:07:49 kjc Exp $
59 */
60
61 #include "opt_altq.h"
62 #include "opt_inet.h"
63 #include "opt_inet6.h"
64 #ifdef ALTQ_RIO /* rio is enabled by ALTQ_RIO option in opt_altq.h */
65
66 #include <sys/param.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/socket.h>
70 #include <sys/systm.h>
71 #include <sys/errno.h>
72 #if 1 /* ALTQ3_COMPAT */
73 #include <sys/proc.h>
74 #include <sys/sockio.h>
75 #include <sys/kernel.h>
76 #endif
77
78 #include <net/if.h>
79 #include <net/if_var.h>
80
81 #include <netinet/in.h>
82 #include <netinet/in_systm.h>
83 #include <netinet/ip.h>
84 #ifdef INET6
85 #include <netinet/ip6.h>
86 #endif
87
88 #include <netpfil/pf/pf.h>
89 #include <netpfil/pf/pf_altq.h>
90 #include <net/altq/altq.h>
91 #include <net/altq/altq_cdnr.h>
92 #include <net/altq/altq_red.h>
93 #include <net/altq/altq_rio.h>
94
95 /*
96 * RIO: RED with IN/OUT bit
97 * described in
98 * "Explicit Allocation of Best Effort Packet Delivery Service"
99 * David D. Clark and Wenjia Fang, MIT Lab for Computer Science
100 * http://diffserv.lcs.mit.edu/Papers/exp-alloc-ddc-wf.{ps,pdf}
101 *
102 * this implementation is extended to support more than 2 drop precedence
103 * values as described in RFC2597 (Assured Forwarding PHB Group).
104 *
105 */
106 /*
107 * AF DS (differentiated service) codepoints.
108 * (classes can be mapped to CBQ or H-FSC classes.)
109 *
110 * 0 1 2 3 4 5 6 7
111 * +---+---+---+---+---+---+---+---+
112 * | CLASS |DropPre| 0 | CU |
113 * +---+---+---+---+---+---+---+---+
114 *
115 * class 1: 001
116 * class 2: 010
117 * class 3: 011
118 * class 4: 100
119 *
120 * low drop prec: 01
121 * medium drop prec: 10
122 * high drop prec: 01
123 */
124
125 /* normal red parameters */
126 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
127 /* q_weight = 0.00195 */
128
129 /* red parameters for a slow link */
130 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
131 /* q_weight = 0.0078125 */
132
133 /* red parameters for a very slow link (e.g., dialup) */
134 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
135 /* q_weight = 0.015625 */
136
137 /* fixed-point uses 12-bit decimal places */
138 #define FP_SHIFT 12 /* fixed-point shift */
139
140 /* red parameters for drop probability */
141 #define INV_P_MAX 10 /* inverse of max drop probability */
142 #define TH_MIN 5 /* min threshold */
143 #define TH_MAX 15 /* max threshold */
144
145 #define RIO_LIMIT 60 /* default max queue length */
146 #define RIO_STATS /* collect statistics */
147
148 #define TV_DELTA(a, b, delta) { \
149 int xxs; \
150 \
151 delta = (a)->tv_usec - (b)->tv_usec; \
152 if ((xxs = (a)->tv_sec - (b)->tv_sec) != 0) { \
153 if (xxs < 0) { \
154 delta = 60000000; \
155 } else if (xxs > 4) { \
156 if (xxs > 60) \
157 delta = 60000000; \
158 else \
159 delta += xxs * 1000000; \
160 } else while (xxs > 0) { \
161 delta += 1000000; \
162 xxs--; \
163 } \
164 } \
165 }
166
167 /* default rio parameter values */
168 static struct redparams default_rio_params[RIO_NDROPPREC] = {
169 /* th_min, th_max, inv_pmax */
170 { TH_MAX * 2 + TH_MIN, TH_MAX * 3, INV_P_MAX }, /* low drop precedence */
171 { TH_MAX + TH_MIN, TH_MAX * 2, INV_P_MAX }, /* medium drop precedence */
172 { TH_MIN, TH_MAX, INV_P_MAX } /* high drop precedence */
173 };
174
175 /* internal function prototypes */
176 static int dscp2index(u_int8_t);
177
178 rio_t *
rio_alloc(int weight,struct redparams * params,int flags,int pkttime)179 rio_alloc(int weight, struct redparams *params, int flags, int pkttime)
180 {
181 rio_t *rp;
182 int w, i;
183 int npkts_per_sec;
184
185 rp = malloc(sizeof(rio_t), M_DEVBUF, M_NOWAIT | M_ZERO);
186 if (rp == NULL)
187 return (NULL);
188
189 rp->rio_flags = flags;
190 if (pkttime == 0)
191 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
192 rp->rio_pkttime = 800;
193 else
194 rp->rio_pkttime = pkttime;
195
196 if (weight != 0)
197 rp->rio_weight = weight;
198 else {
199 /* use default */
200 rp->rio_weight = W_WEIGHT;
201
202 /* when the link is very slow, adjust red parameters */
203 npkts_per_sec = 1000000 / rp->rio_pkttime;
204 if (npkts_per_sec < 50) {
205 /* up to about 400Kbps */
206 rp->rio_weight = W_WEIGHT_2;
207 } else if (npkts_per_sec < 300) {
208 /* up to about 2.4Mbps */
209 rp->rio_weight = W_WEIGHT_1;
210 }
211 }
212
213 /* calculate wshift. weight must be power of 2 */
214 w = rp->rio_weight;
215 for (i = 0; w > 1; i++)
216 w = w >> 1;
217 rp->rio_wshift = i;
218 w = 1 << rp->rio_wshift;
219 if (w != rp->rio_weight) {
220 printf("invalid weight value %d for red! use %d\n",
221 rp->rio_weight, w);
222 rp->rio_weight = w;
223 }
224
225 /* allocate weight table */
226 rp->rio_wtab = wtab_alloc(rp->rio_weight);
227
228 for (i = 0; i < RIO_NDROPPREC; i++) {
229 struct dropprec_state *prec = &rp->rio_precstate[i];
230
231 prec->avg = 0;
232 prec->idle = 1;
233
234 if (params == NULL || params[i].inv_pmax == 0)
235 prec->inv_pmax = default_rio_params[i].inv_pmax;
236 else
237 prec->inv_pmax = params[i].inv_pmax;
238 if (params == NULL || params[i].th_min == 0)
239 prec->th_min = default_rio_params[i].th_min;
240 else
241 prec->th_min = params[i].th_min;
242 if (params == NULL || params[i].th_max == 0)
243 prec->th_max = default_rio_params[i].th_max;
244 else
245 prec->th_max = params[i].th_max;
246
247 /*
248 * th_min_s and th_max_s are scaled versions of th_min
249 * and th_max to be compared with avg.
250 */
251 prec->th_min_s = prec->th_min << (rp->rio_wshift + FP_SHIFT);
252 prec->th_max_s = prec->th_max << (rp->rio_wshift + FP_SHIFT);
253
254 /*
255 * precompute probability denominator
256 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
257 */
258 prec->probd = (2 * (prec->th_max - prec->th_min)
259 * prec->inv_pmax) << FP_SHIFT;
260
261 microtime(&prec->last);
262 }
263
264 return (rp);
265 }
266
267 void
rio_destroy(rio_t * rp)268 rio_destroy(rio_t *rp)
269 {
270 wtab_destroy(rp->rio_wtab);
271 free(rp, M_DEVBUF);
272 }
273
274 void
rio_getstats(rio_t * rp,struct redstats * sp)275 rio_getstats(rio_t *rp, struct redstats *sp)
276 {
277 int i;
278
279 for (i = 0; i < RIO_NDROPPREC; i++) {
280 bcopy(&rp->q_stats[i], sp, sizeof(struct redstats));
281 sp->q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift;
282 sp++;
283 }
284 }
285
286 #if (RIO_NDROPPREC == 3)
287 /*
288 * internally, a drop precedence value is converted to an index
289 * starting from 0.
290 */
291 static int
dscp2index(u_int8_t dscp)292 dscp2index(u_int8_t dscp)
293 {
294 int dpindex = dscp & AF_DROPPRECMASK;
295
296 if (dpindex == 0)
297 return (0);
298 return ((dpindex >> 3) - 1);
299 }
300 #endif
301
302 #if 1
303 /*
304 * kludge: when a packet is dequeued, we need to know its drop precedence
305 * in order to keep the queue length of each drop precedence.
306 * use m_pkthdr.rcvif to pass this info.
307 */
308 #define RIOM_SET_PRECINDEX(m, idx) \
309 do { (m)->m_pkthdr.rcvif = (void *)((long)(idx)); } while (0)
310 #define RIOM_GET_PRECINDEX(m) \
311 ({ long idx; idx = (long)((m)->m_pkthdr.rcvif); \
312 (m)->m_pkthdr.rcvif = NULL; idx; })
313 #endif
314
315 int
rio_addq(rio_t * rp,class_queue_t * q,struct mbuf * m,struct altq_pktattr * pktattr)316 rio_addq(rio_t *rp, class_queue_t *q, struct mbuf *m,
317 struct altq_pktattr *pktattr)
318 {
319 int avg, droptype;
320 u_int8_t dsfield, odsfield;
321 int dpindex, i, n, t;
322 struct timeval now;
323 struct dropprec_state *prec;
324
325 dsfield = odsfield = read_dsfield(m, pktattr);
326 dpindex = dscp2index(dsfield);
327
328 /*
329 * update avg of the precedence states whose drop precedence
330 * is larger than or equal to the drop precedence of the packet
331 */
332 now.tv_sec = 0;
333 for (i = dpindex; i < RIO_NDROPPREC; i++) {
334 prec = &rp->rio_precstate[i];
335 avg = prec->avg;
336 if (prec->idle) {
337 prec->idle = 0;
338 if (now.tv_sec == 0)
339 microtime(&now);
340 t = (now.tv_sec - prec->last.tv_sec);
341 if (t > 60)
342 avg = 0;
343 else {
344 t = t * 1000000 +
345 (now.tv_usec - prec->last.tv_usec);
346 n = t / rp->rio_pkttime;
347 /* calculate (avg = (1 - Wq)^n * avg) */
348 if (n > 0)
349 avg = (avg >> FP_SHIFT) *
350 pow_w(rp->rio_wtab, n);
351 }
352 }
353
354 /* run estimator. (avg is scaled by WEIGHT in fixed-point) */
355 avg += (prec->qlen << FP_SHIFT) - (avg >> rp->rio_wshift);
356 prec->avg = avg; /* save the new value */
357 /*
358 * count keeps a tally of arriving traffic that has not
359 * been dropped.
360 */
361 prec->count++;
362 }
363
364 prec = &rp->rio_precstate[dpindex];
365 avg = prec->avg;
366
367 /* see if we drop early */
368 droptype = DTYPE_NODROP;
369 if (avg >= prec->th_min_s && prec->qlen > 1) {
370 if (avg >= prec->th_max_s) {
371 /* avg >= th_max: forced drop */
372 droptype = DTYPE_FORCED;
373 } else if (prec->old == 0) {
374 /* first exceeds th_min */
375 prec->count = 1;
376 prec->old = 1;
377 } else if (drop_early((avg - prec->th_min_s) >> rp->rio_wshift,
378 prec->probd, prec->count)) {
379 /* unforced drop by red */
380 droptype = DTYPE_EARLY;
381 }
382 } else {
383 /* avg < th_min */
384 prec->old = 0;
385 }
386
387 /*
388 * if the queue length hits the hard limit, it's a forced drop.
389 */
390 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
391 droptype = DTYPE_FORCED;
392
393 if (droptype != DTYPE_NODROP) {
394 /* always drop incoming packet (as opposed to randomdrop) */
395 for (i = dpindex; i < RIO_NDROPPREC; i++)
396 rp->rio_precstate[i].count = 0;
397 #ifdef RIO_STATS
398 if (droptype == DTYPE_EARLY)
399 rp->q_stats[dpindex].drop_unforced++;
400 else
401 rp->q_stats[dpindex].drop_forced++;
402 PKTCNTR_ADD(&rp->q_stats[dpindex].drop_cnt, m_pktlen(m));
403 #endif
404 m_freem(m);
405 return (-1);
406 }
407
408 for (i = dpindex; i < RIO_NDROPPREC; i++)
409 rp->rio_precstate[i].qlen++;
410
411 /* save drop precedence index in mbuf hdr */
412 RIOM_SET_PRECINDEX(m, dpindex);
413
414 if (rp->rio_flags & RIOF_CLEARDSCP)
415 dsfield &= ~DSCP_MASK;
416
417 if (dsfield != odsfield)
418 write_dsfield(m, pktattr, dsfield);
419
420 _addq(q, m);
421
422 #ifdef RIO_STATS
423 PKTCNTR_ADD(&rp->q_stats[dpindex].xmit_cnt, m_pktlen(m));
424 #endif
425 return (0);
426 }
427
428 struct mbuf *
rio_getq(rio_t * rp,class_queue_t * q)429 rio_getq(rio_t *rp, class_queue_t *q)
430 {
431 struct mbuf *m;
432 int dpindex, i;
433
434 if ((m = _getq(q)) == NULL)
435 return NULL;
436
437 dpindex = RIOM_GET_PRECINDEX(m);
438 for (i = dpindex; i < RIO_NDROPPREC; i++) {
439 if (--rp->rio_precstate[i].qlen == 0) {
440 if (rp->rio_precstate[i].idle == 0) {
441 rp->rio_precstate[i].idle = 1;
442 microtime(&rp->rio_precstate[i].last);
443 }
444 }
445 }
446 return (m);
447 }
448
449 #endif /* ALTQ_RIO */
450