1 /*- 2 * Copyright (C) 1998-2003 3 * Sony Computer Science Laboratories Inc. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 /*- 27 * Copyright (c) 1990-1994 Regents of the University of California. 28 * All rights reserved. 29 * 30 * Redistribution and use in source and binary forms, with or without 31 * modification, are permitted provided that the following conditions 32 * are met: 33 * 1. Redistributions of source code must retain the above copyright 34 * notice, this list of conditions and the following disclaimer. 35 * 2. Redistributions in binary form must reproduce the above copyright 36 * notice, this list of conditions and the following disclaimer in the 37 * documentation and/or other materials provided with the distribution. 38 * 3. All advertising materials mentioning features or use of this software 39 * must display the following acknowledgement: 40 * This product includes software developed by the Computer Systems 41 * Engineering Group at Lawrence Berkeley Laboratory. 42 * 4. Neither the name of the University nor of the Laboratory may be used 43 * to endorse or promote products derived from this software without 44 * specific prior written permission. 45 * 46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 56 * SUCH DAMAGE. 57 * 58 * $KAME: altq_rio.c,v 1.17 2003/07/10 12:07:49 kjc Exp $ 59 * $FreeBSD$ 60 */ 61 62 #include "opt_altq.h" 63 #include "opt_inet.h" 64 #include "opt_inet6.h" 65 #ifdef ALTQ_RIO /* rio is enabled by ALTQ_RIO option in opt_altq.h */ 66 67 #include <sys/param.h> 68 #include <sys/malloc.h> 69 #include <sys/mbuf.h> 70 #include <sys/socket.h> 71 #include <sys/systm.h> 72 #include <sys/errno.h> 73 #if 1 /* ALTQ3_COMPAT */ 74 #include <sys/proc.h> 75 #include <sys/sockio.h> 76 #include <sys/kernel.h> 77 #endif 78 79 #include <net/if.h> 80 #include <net/if_var.h> 81 82 #include <netinet/in.h> 83 #include <netinet/in_systm.h> 84 #include <netinet/ip.h> 85 #ifdef INET6 86 #include <netinet/ip6.h> 87 #endif 88 89 #include <netpfil/pf/pf.h> 90 #include <netpfil/pf/pf_altq.h> 91 #include <net/altq/altq.h> 92 #include <net/altq/altq_cdnr.h> 93 #include <net/altq/altq_red.h> 94 #include <net/altq/altq_rio.h> 95 96 /* 97 * RIO: RED with IN/OUT bit 98 * described in 99 * "Explicit Allocation of Best Effort Packet Delivery Service" 100 * David D. Clark and Wenjia Fang, MIT Lab for Computer Science 101 * http://diffserv.lcs.mit.edu/Papers/exp-alloc-ddc-wf.{ps,pdf} 102 * 103 * this implementation is extended to support more than 2 drop precedence 104 * values as described in RFC2597 (Assured Forwarding PHB Group). 105 * 106 */ 107 /* 108 * AF DS (differentiated service) codepoints. 109 * (classes can be mapped to CBQ or H-FSC classes.) 110 * 111 * 0 1 2 3 4 5 6 7 112 * +---+---+---+---+---+---+---+---+ 113 * | CLASS |DropPre| 0 | CU | 114 * +---+---+---+---+---+---+---+---+ 115 * 116 * class 1: 001 117 * class 2: 010 118 * class 3: 011 119 * class 4: 100 120 * 121 * low drop prec: 01 122 * medium drop prec: 10 123 * high drop prec: 01 124 */ 125 126 /* normal red parameters */ 127 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ 128 /* q_weight = 0.00195 */ 129 130 /* red parameters for a slow link */ 131 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ 132 /* q_weight = 0.0078125 */ 133 134 /* red parameters for a very slow link (e.g., dialup) */ 135 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ 136 /* q_weight = 0.015625 */ 137 138 /* fixed-point uses 12-bit decimal places */ 139 #define FP_SHIFT 12 /* fixed-point shift */ 140 141 /* red parameters for drop probability */ 142 #define INV_P_MAX 10 /* inverse of max drop probability */ 143 #define TH_MIN 5 /* min threshold */ 144 #define TH_MAX 15 /* max threshold */ 145 146 #define RIO_LIMIT 60 /* default max queue length */ 147 #define RIO_STATS /* collect statistics */ 148 149 #define TV_DELTA(a, b, delta) { \ 150 int xxs; \ 151 \ 152 delta = (a)->tv_usec - (b)->tv_usec; \ 153 if ((xxs = (a)->tv_sec - (b)->tv_sec) != 0) { \ 154 if (xxs < 0) { \ 155 delta = 60000000; \ 156 } else if (xxs > 4) { \ 157 if (xxs > 60) \ 158 delta = 60000000; \ 159 else \ 160 delta += xxs * 1000000; \ 161 } else while (xxs > 0) { \ 162 delta += 1000000; \ 163 xxs--; \ 164 } \ 165 } \ 166 } 167 168 /* default rio parameter values */ 169 static struct redparams default_rio_params[RIO_NDROPPREC] = { 170 /* th_min, th_max, inv_pmax */ 171 { TH_MAX * 2 + TH_MIN, TH_MAX * 3, INV_P_MAX }, /* low drop precedence */ 172 { TH_MAX + TH_MIN, TH_MAX * 2, INV_P_MAX }, /* medium drop precedence */ 173 { TH_MIN, TH_MAX, INV_P_MAX } /* high drop precedence */ 174 }; 175 176 /* internal function prototypes */ 177 static int dscp2index(u_int8_t); 178 179 rio_t * 180 rio_alloc(int weight, struct redparams *params, int flags, int pkttime) 181 { 182 rio_t *rp; 183 int w, i; 184 int npkts_per_sec; 185 186 rp = malloc(sizeof(rio_t), M_DEVBUF, M_NOWAIT | M_ZERO); 187 if (rp == NULL) 188 return (NULL); 189 190 rp->rio_flags = flags; 191 if (pkttime == 0) 192 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ 193 rp->rio_pkttime = 800; 194 else 195 rp->rio_pkttime = pkttime; 196 197 if (weight != 0) 198 rp->rio_weight = weight; 199 else { 200 /* use default */ 201 rp->rio_weight = W_WEIGHT; 202 203 /* when the link is very slow, adjust red parameters */ 204 npkts_per_sec = 1000000 / rp->rio_pkttime; 205 if (npkts_per_sec < 50) { 206 /* up to about 400Kbps */ 207 rp->rio_weight = W_WEIGHT_2; 208 } else if (npkts_per_sec < 300) { 209 /* up to about 2.4Mbps */ 210 rp->rio_weight = W_WEIGHT_1; 211 } 212 } 213 214 /* calculate wshift. weight must be power of 2 */ 215 w = rp->rio_weight; 216 for (i = 0; w > 1; i++) 217 w = w >> 1; 218 rp->rio_wshift = i; 219 w = 1 << rp->rio_wshift; 220 if (w != rp->rio_weight) { 221 printf("invalid weight value %d for red! use %d\n", 222 rp->rio_weight, w); 223 rp->rio_weight = w; 224 } 225 226 /* allocate weight table */ 227 rp->rio_wtab = wtab_alloc(rp->rio_weight); 228 229 for (i = 0; i < RIO_NDROPPREC; i++) { 230 struct dropprec_state *prec = &rp->rio_precstate[i]; 231 232 prec->avg = 0; 233 prec->idle = 1; 234 235 if (params == NULL || params[i].inv_pmax == 0) 236 prec->inv_pmax = default_rio_params[i].inv_pmax; 237 else 238 prec->inv_pmax = params[i].inv_pmax; 239 if (params == NULL || params[i].th_min == 0) 240 prec->th_min = default_rio_params[i].th_min; 241 else 242 prec->th_min = params[i].th_min; 243 if (params == NULL || params[i].th_max == 0) 244 prec->th_max = default_rio_params[i].th_max; 245 else 246 prec->th_max = params[i].th_max; 247 248 /* 249 * th_min_s and th_max_s are scaled versions of th_min 250 * and th_max to be compared with avg. 251 */ 252 prec->th_min_s = prec->th_min << (rp->rio_wshift + FP_SHIFT); 253 prec->th_max_s = prec->th_max << (rp->rio_wshift + FP_SHIFT); 254 255 /* 256 * precompute probability denominator 257 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point 258 */ 259 prec->probd = (2 * (prec->th_max - prec->th_min) 260 * prec->inv_pmax) << FP_SHIFT; 261 262 microtime(&prec->last); 263 } 264 265 return (rp); 266 } 267 268 void 269 rio_destroy(rio_t *rp) 270 { 271 wtab_destroy(rp->rio_wtab); 272 free(rp, M_DEVBUF); 273 } 274 275 void 276 rio_getstats(rio_t *rp, struct redstats *sp) 277 { 278 int i; 279 280 for (i = 0; i < RIO_NDROPPREC; i++) { 281 bcopy(&rp->q_stats[i], sp, sizeof(struct redstats)); 282 sp->q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift; 283 sp++; 284 } 285 } 286 287 #if (RIO_NDROPPREC == 3) 288 /* 289 * internally, a drop precedence value is converted to an index 290 * starting from 0. 291 */ 292 static int 293 dscp2index(u_int8_t dscp) 294 { 295 int dpindex = dscp & AF_DROPPRECMASK; 296 297 if (dpindex == 0) 298 return (0); 299 return ((dpindex >> 3) - 1); 300 } 301 #endif 302 303 #if 1 304 /* 305 * kludge: when a packet is dequeued, we need to know its drop precedence 306 * in order to keep the queue length of each drop precedence. 307 * use m_pkthdr.rcvif to pass this info. 308 */ 309 #define RIOM_SET_PRECINDEX(m, idx) \ 310 do { (m)->m_pkthdr.rcvif = (void *)((long)(idx)); } while (0) 311 #define RIOM_GET_PRECINDEX(m) \ 312 ({ long idx; idx = (long)((m)->m_pkthdr.rcvif); \ 313 (m)->m_pkthdr.rcvif = NULL; idx; }) 314 #endif 315 316 int 317 rio_addq(rio_t *rp, class_queue_t *q, struct mbuf *m, 318 struct altq_pktattr *pktattr) 319 { 320 int avg, droptype; 321 u_int8_t dsfield, odsfield; 322 int dpindex, i, n, t; 323 struct timeval now; 324 struct dropprec_state *prec; 325 326 dsfield = odsfield = read_dsfield(m, pktattr); 327 dpindex = dscp2index(dsfield); 328 329 /* 330 * update avg of the precedence states whose drop precedence 331 * is larger than or equal to the drop precedence of the packet 332 */ 333 now.tv_sec = 0; 334 for (i = dpindex; i < RIO_NDROPPREC; i++) { 335 prec = &rp->rio_precstate[i]; 336 avg = prec->avg; 337 if (prec->idle) { 338 prec->idle = 0; 339 if (now.tv_sec == 0) 340 microtime(&now); 341 t = (now.tv_sec - prec->last.tv_sec); 342 if (t > 60) 343 avg = 0; 344 else { 345 t = t * 1000000 + 346 (now.tv_usec - prec->last.tv_usec); 347 n = t / rp->rio_pkttime; 348 /* calculate (avg = (1 - Wq)^n * avg) */ 349 if (n > 0) 350 avg = (avg >> FP_SHIFT) * 351 pow_w(rp->rio_wtab, n); 352 } 353 } 354 355 /* run estimator. (avg is scaled by WEIGHT in fixed-point) */ 356 avg += (prec->qlen << FP_SHIFT) - (avg >> rp->rio_wshift); 357 prec->avg = avg; /* save the new value */ 358 /* 359 * count keeps a tally of arriving traffic that has not 360 * been dropped. 361 */ 362 prec->count++; 363 } 364 365 prec = &rp->rio_precstate[dpindex]; 366 avg = prec->avg; 367 368 /* see if we drop early */ 369 droptype = DTYPE_NODROP; 370 if (avg >= prec->th_min_s && prec->qlen > 1) { 371 if (avg >= prec->th_max_s) { 372 /* avg >= th_max: forced drop */ 373 droptype = DTYPE_FORCED; 374 } else if (prec->old == 0) { 375 /* first exceeds th_min */ 376 prec->count = 1; 377 prec->old = 1; 378 } else if (drop_early((avg - prec->th_min_s) >> rp->rio_wshift, 379 prec->probd, prec->count)) { 380 /* unforced drop by red */ 381 droptype = DTYPE_EARLY; 382 } 383 } else { 384 /* avg < th_min */ 385 prec->old = 0; 386 } 387 388 /* 389 * if the queue length hits the hard limit, it's a forced drop. 390 */ 391 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) 392 droptype = DTYPE_FORCED; 393 394 if (droptype != DTYPE_NODROP) { 395 /* always drop incoming packet (as opposed to randomdrop) */ 396 for (i = dpindex; i < RIO_NDROPPREC; i++) 397 rp->rio_precstate[i].count = 0; 398 #ifdef RIO_STATS 399 if (droptype == DTYPE_EARLY) 400 rp->q_stats[dpindex].drop_unforced++; 401 else 402 rp->q_stats[dpindex].drop_forced++; 403 PKTCNTR_ADD(&rp->q_stats[dpindex].drop_cnt, m_pktlen(m)); 404 #endif 405 m_freem(m); 406 return (-1); 407 } 408 409 for (i = dpindex; i < RIO_NDROPPREC; i++) 410 rp->rio_precstate[i].qlen++; 411 412 /* save drop precedence index in mbuf hdr */ 413 RIOM_SET_PRECINDEX(m, dpindex); 414 415 if (rp->rio_flags & RIOF_CLEARDSCP) 416 dsfield &= ~DSCP_MASK; 417 418 if (dsfield != odsfield) 419 write_dsfield(m, pktattr, dsfield); 420 421 _addq(q, m); 422 423 #ifdef RIO_STATS 424 PKTCNTR_ADD(&rp->q_stats[dpindex].xmit_cnt, m_pktlen(m)); 425 #endif 426 return (0); 427 } 428 429 struct mbuf * 430 rio_getq(rio_t *rp, class_queue_t *q) 431 { 432 struct mbuf *m; 433 int dpindex, i; 434 435 if ((m = _getq(q)) == NULL) 436 return NULL; 437 438 dpindex = RIOM_GET_PRECINDEX(m); 439 for (i = dpindex; i < RIO_NDROPPREC; i++) { 440 if (--rp->rio_precstate[i].qlen == 0) { 441 if (rp->rio_precstate[i].idle == 0) { 442 rp->rio_precstate[i].idle = 1; 443 microtime(&rp->rio_precstate[i].last); 444 } 445 } 446 } 447 return (m); 448 } 449 450 451 #endif /* ALTQ_RIO */ 452