1 /*- 2 * Copyright (c) 2009-2010 3 * Swinburne University of Technology, Melbourne, Australia 4 * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org> 5 * Copyright (c) 2010-2011 The FreeBSD Foundation 6 * All rights reserved. 7 * 8 * This software was developed at the Centre for Advanced Internet 9 * Architectures, Swinburne University of Technology, by David Hayes, made 10 * possible in part by a grant from the Cisco University Research Program Fund 11 * at Community Foundation Silicon Valley. 12 * 13 * Portions of this software were developed at the Centre for Advanced 14 * Internet Architectures, Swinburne University of Technology, Melbourne, 15 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 16 * 17 * Redistribution and use in source and binary forms, with or without 18 * modification, are permitted provided that the following conditions 19 * are met: 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 */ 38 39 #include <sys/cdefs.h> 40 __FBSDID("$FreeBSD$"); 41 42 #include <sys/param.h> 43 #include <sys/kernel.h> 44 #include <sys/mbuf.h> 45 #include <sys/module.h> 46 #include <sys/hhook.h> 47 #include <sys/khelp.h> 48 #include <sys/module_khelp.h> 49 #include <sys/socket.h> 50 #include <sys/sockopt.h> 51 52 #include <net/vnet.h> 53 54 #include <netinet/in.h> 55 #include <netinet/in_pcb.h> 56 #include <netinet/tcp_seq.h> 57 #include <netinet/tcp_var.h> 58 59 #include <netinet/khelp/h_ertt.h> 60 61 #include <vm/uma.h> 62 63 uma_zone_t txseginfo_zone; 64 65 /* Smoothing factor for delayed ack guess. */ 66 #define DLYACK_SMOOTH 5 67 68 /* Max number of time stamp errors allowed in a session. */ 69 #define MAX_TS_ERR 10 70 71 static int ertt_packet_measurement_hook(int hhook_type, int hhook_id, 72 void *udata, void *ctx_data, void *hdata, struct osd *hosd); 73 static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, 74 void *udata, void *ctx_data, void *hdata, struct osd *hosd); 75 static int ertt_mod_init(void); 76 static int ertt_mod_destroy(void); 77 static int ertt_uma_ctor(void *mem, int size, void *arg, int flags); 78 static void ertt_uma_dtor(void *mem, int size, void *arg); 79 80 /* 81 * Contains information about the sent segment for comparison with the 82 * corresponding ack. 83 */ 84 struct txseginfo { 85 /* Segment length. */ 86 long len; 87 /* Segment sequence number. */ 88 tcp_seq seq; 89 /* Time stamp indicating when the packet was sent. */ 90 uint32_t tx_ts; 91 /* Last received receiver ts (if the TCP option is used). */ 92 uint32_t rx_ts; 93 uint32_t flags; 94 TAILQ_ENTRY (txseginfo) txsegi_lnk; 95 }; 96 97 /* Flags for struct txseginfo. */ 98 #define TXSI_TSO 0x01 /* TSO was used for this entry. */ 99 #define TXSI_RTT_MEASURE_START 0x02 /* Start a per RTT measurement. */ 100 #define TXSI_RX_MEASURE_END 0x04 /* Measure the rx rate until this txsi. */ 101 102 struct helper ertt_helper = { 103 .mod_init = ertt_mod_init, 104 .mod_destroy = ertt_mod_destroy, 105 .h_flags = HELPER_NEEDS_OSD, 106 .h_classes = HELPER_CLASS_TCP 107 }; 108 109 /* Define the helper hook info required by ERTT. */ 110 struct hookinfo ertt_hooks[] = { 111 { 112 .hook_type = HHOOK_TYPE_TCP, 113 .hook_id = HHOOK_TCP_EST_IN, 114 .hook_udata = NULL, 115 .hook_func = &ertt_packet_measurement_hook 116 }, 117 { 118 .hook_type = HHOOK_TYPE_TCP, 119 .hook_id = HHOOK_TCP_EST_OUT, 120 .hook_udata = NULL, 121 .hook_func = &ertt_add_tx_segment_info_hook 122 } 123 }; 124 125 /* Flags to indicate how marked_packet_rtt should handle this txsi. */ 126 #define MULTI_ACK 0x01 /* More than this txsi is acked. */ 127 #define OLD_TXSI 0x02 /* TXSI is old according to timestamps. */ 128 #define CORRECT_ACK 0X04 /* Acks this TXSI. */ 129 #define FORCED_MEASUREMENT 0X08 /* Force an RTT measurement. */ 130 131 /* 132 * This fuction measures the RTT of a particular segment/ack pair, or the next 133 * closest if this will yield an inaccurate result due to delayed acking or 134 * other issues. 135 */ 136 static void inline 137 marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp, 138 uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust, 139 int mflag) 140 { 141 142 /* 143 * If we can't measure this one properly due to delayed acking adjust 144 * byte counters and flag to measure next txsi. Note that since the 145 * marked packet's transmitted bytes are measured we need to subtract the 146 * transmitted bytes. Then pretend the next txsi was marked. 147 */ 148 if (mflag & (MULTI_ACK|OLD_TXSI)) { 149 *pmeasurenext = txsi->tx_ts; 150 *pmeasurenext_len = txsi->len; 151 *prtt_bytes_adjust += *pmeasurenext_len; 152 } else { 153 if (mflag & FORCED_MEASUREMENT) { 154 e_t->markedpkt_rtt = ticks - *pmeasurenext + 1; 155 e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt + 156 *pmeasurenext_len - *prtt_bytes_adjust; 157 } else { 158 e_t->markedpkt_rtt = ticks - txsi->tx_ts + 1; 159 e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt - 160 *prtt_bytes_adjust; 161 } 162 e_t->marked_snd_cwnd = tp->snd_cwnd; 163 164 /* 165 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to 166 * add_tx_segment_info that a new measurement should be started. 167 */ 168 e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS; 169 /* 170 * Set ERTT_NEW_MEASUREMENT to tell the congestion control 171 * algorithm that a new marked RTT measurement has has been made 172 * and is available for use. 173 */ 174 e_t->flags |= ERTT_NEW_MEASUREMENT; 175 176 if (tp->t_flags & TF_TSO) { 177 /* Temporarily disable TSO to aid a new measurment. */ 178 tp->t_flags &= ~TF_TSO; 179 /* Keep track that we've disabled it. */ 180 e_t->flags |= ERTT_TSO_DISABLED; 181 } 182 } 183 } 184 185 /* 186 * Ertt_packet_measurements uses a small amount of state kept on each packet 187 * sent to match incoming acknowledgements. This enables more accurate and 188 * secure round trip time measurements. The resulting measurement is used for 189 * congestion control algorithms which require a more accurate time. 190 * Ertt_packet_measurements is called via the helper hook in tcp_input.c 191 */ 192 static int 193 ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata, 194 void *ctx_data, void *hdata, struct osd *hosd) 195 { 196 struct ertt *e_t; 197 struct tcpcb *tp; 198 struct tcphdr *th; 199 struct tcpopt *to; 200 struct tcp_hhook_data *thdp; 201 struct txseginfo *txsi; 202 int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust; 203 uint32_t measurenext, rts; 204 tcp_seq ack; 205 206 KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__)); 207 KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__)); 208 209 e_t = (struct ertt *)hdata; 210 thdp = ctx_data; 211 tp = thdp->tp; 212 th = thdp->th; 213 to = thdp->to; 214 new_sacked_bytes = (tp->sackhint.last_sack_ack != 0); 215 measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0; 216 acked = th->th_ack - tp->snd_una; 217 218 INP_WLOCK_ASSERT(tp->t_inpcb); 219 220 /* Packet has provided new acknowledgements. */ 221 if (acked > 0 || new_sacked_bytes) { 222 if (acked == 0 && new_sacked_bytes) { 223 /* Use last sacked data. */ 224 ack = tp->sackhint.last_sack_ack; 225 } else 226 ack = th->th_ack; 227 228 txsi = TAILQ_FIRST(&e_t->txsegi_q); 229 while (txsi != NULL) { 230 rts = 0; 231 232 /* Acknowledgement is acking more than this txsi. */ 233 if (SEQ_GT(ack, txsi->seq + txsi->len)) { 234 if (txsi->flags & TXSI_RTT_MEASURE_START || 235 measurenext) { 236 marked_packet_rtt(txsi, e_t, tp, 237 &measurenext, &measurenext_len, 238 &rtt_bytes_adjust, MULTI_ACK); 239 } 240 TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); 241 uma_zfree(txseginfo_zone, txsi); 242 txsi = TAILQ_FIRST(&e_t->txsegi_q); 243 continue; 244 } 245 246 /* 247 * Guess if delayed acks are being used by the receiver. 248 * 249 * XXXDH: A simple heuristic that could be improved 250 */ 251 if (!new_sacked_bytes) { 252 if (acked > tp->t_maxseg) { 253 e_t->dlyack_rx += 254 (e_t->dlyack_rx < DLYACK_SMOOTH) ? 255 1 : 0; 256 multiack = 1; 257 } else if (acked > txsi->len) { 258 multiack = 1; 259 e_t->dlyack_rx += 260 (e_t->dlyack_rx < DLYACK_SMOOTH) ? 261 1 : 0; 262 } else if (acked == tp->t_maxseg || 263 acked == txsi->len) { 264 e_t->dlyack_rx -= 265 (e_t->dlyack_rx > 0) ? 1 : 0; 266 } 267 /* Otherwise leave dlyack_rx the way it was. */ 268 } 269 270 /* 271 * Time stamps are only to help match the txsi with the 272 * received acknowledgements. 273 */ 274 if (e_t->timestamp_errors < MAX_TS_ERR && 275 (to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 276 /* 277 * Note: All packets sent with the offload will 278 * have the same time stamp. If we are sending 279 * on a fast interface and the t_maxseg is much 280 * smaller than one tick, this will be fine. The 281 * time stamp would be the same whether we were 282 * using tso or not. However, if the interface 283 * is slow, this will cause problems with the 284 * calculations. If the interface is slow, there 285 * is not reason to be using tso, and it should 286 * be turned off. 287 */ 288 /* 289 * If there are too many time stamp errors, time 290 * stamps won't be trusted 291 */ 292 rts = to->to_tsecr; 293 /* Before this packet. */ 294 if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts)) 295 /* When delayed acking is used, the 296 * reflected time stamp is of the first 297 * packet and thus may be before 298 * txsi->tx_ts. 299 */ 300 break; 301 if (TSTMP_GT(rts, txsi->tx_ts)) { 302 /* 303 * If reflected time stamp is later than 304 * tx_tsi, then this txsi is old. 305 */ 306 if (txsi->flags & TXSI_RTT_MEASURE_START 307 || measurenext) { 308 marked_packet_rtt(txsi, e_t, tp, 309 &measurenext, &measurenext_len, 310 &rtt_bytes_adjust, OLD_TXSI); 311 } 312 TAILQ_REMOVE(&e_t->txsegi_q, txsi, 313 txsegi_lnk); 314 uma_zfree(txseginfo_zone, txsi); 315 txsi = TAILQ_FIRST(&e_t->txsegi_q); 316 continue; 317 } 318 if (rts == txsi->tx_ts && 319 TSTMP_LT(to->to_tsval, txsi->rx_ts)) { 320 /* 321 * Segment received before sent! 322 * Something is wrong with the received 323 * timestamps so increment errors. If 324 * this keeps up we will ignore 325 * timestamps. 326 */ 327 e_t->timestamp_errors++; 328 } 329 } 330 /* 331 * Acknowledging a sequence number before this txsi. 332 * If it is an old txsi that may have had the same seq 333 * numbers, it should have been removed if time stamps 334 * are being used. 335 */ 336 if (SEQ_LEQ(ack, txsi->seq)) 337 break; /* Before first packet in txsi. */ 338 339 /* 340 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len 341 * past this point. 342 * 343 * If delayed acks are being used, an acknowledgement 344 * for a single segment will have been delayed by the 345 * receiver and will yield an inaccurate measurement. In 346 * this case, we only make the measurement if more than 347 * one segment is being acknowledged or sack is 348 * currently being used. 349 */ 350 if (!e_t->dlyack_rx || multiack || new_sacked_bytes) { 351 /* Make an accurate new measurement. */ 352 e_t->rtt = ticks - txsi->tx_ts + 1; 353 354 if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0) 355 e_t->minrtt = e_t->rtt; 356 357 if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0) 358 e_t->maxrtt = e_t->rtt; 359 } 360 361 if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) 362 marked_packet_rtt(txsi, e_t, tp, 363 &measurenext, &measurenext_len, 364 &rtt_bytes_adjust, CORRECT_ACK); 365 366 if (txsi->flags & TXSI_TSO) { 367 txsi->len -= acked; 368 if (txsi->len > 0) { 369 /* 370 * This presumes ack for first bytes in 371 * txsi, this may not be true but it 372 * shouldn't cause problems for the 373 * timing. 374 * 375 * We remeasure RTT even though we only 376 * have a single txsi. The rationale 377 * behind this is that it is better to 378 * have a slightly inaccurate 379 * measurement than no additional 380 * measurement for the rest of the bulk 381 * transfer. Since TSO is only used on 382 * high speed interface cards, so the 383 * packets should be transmitted at line 384 * rate back to back with little 385 * difference in transmission times (in 386 * ticks). 387 */ 388 txsi->seq += acked; 389 /* 390 * Reset txsi measure flag so we don't 391 * use it for another RTT measurement. 392 */ 393 txsi->flags &= ~TXSI_RTT_MEASURE_START; 394 /* 395 * There is still more data to be acked 396 * from tso bulk transmission, so we 397 * won't remove it from the TAILQ yet. 398 */ 399 break; 400 } 401 } 402 403 TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); 404 uma_zfree(txseginfo_zone, txsi); 405 break; 406 } 407 408 if (measurenext) { 409 /* 410 * We need to do a RTT measurement. It won't be the best 411 * if we do it here. 412 */ 413 marked_packet_rtt(txsi, e_t, tp, 414 &measurenext, &measurenext_len, 415 &rtt_bytes_adjust, FORCED_MEASUREMENT); 416 } 417 } 418 419 return (0); 420 } 421 422 /* 423 * Add information about a transmitted segment to a list. 424 * This is called via the helper hook in tcp_output.c 425 */ 426 static int 427 ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata, 428 void *ctx_data, void *hdata, struct osd *hosd) 429 { 430 struct ertt *e_t; 431 struct tcpcb *tp; 432 struct tcphdr *th; 433 struct tcpopt *to; 434 struct tcp_hhook_data *thdp; 435 struct txseginfo *txsi; 436 long len; 437 int tso; 438 439 KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__)); 440 KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__)); 441 442 e_t = (struct ertt *)hdata; 443 thdp = ctx_data; 444 tp = thdp->tp; 445 th = thdp->th; 446 to = thdp->to; 447 len = thdp->len; 448 tso = thdp->tso; 449 450 INP_WLOCK_ASSERT(tp->t_inpcb); 451 452 if (len > 0) { 453 txsi = uma_zalloc(txseginfo_zone, M_NOWAIT); 454 if (txsi != NULL) { 455 /* Construct txsi setting the necessary flags. */ 456 txsi->flags = 0; /* Needs to be initialised. */ 457 txsi->seq = ntohl(th->th_seq); 458 txsi->len = len; 459 if (tso) 460 txsi->flags |= TXSI_TSO; 461 else if (e_t->flags & ERTT_TSO_DISABLED) { 462 tp->t_flags |= TF_TSO; 463 e_t->flags &= ~ERTT_TSO_DISABLED; 464 } 465 466 if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) { 467 e_t->bytes_tx_in_rtt += len; 468 } else { 469 txsi->flags |= TXSI_RTT_MEASURE_START; 470 e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS; 471 e_t->bytes_tx_in_rtt = len; 472 } 473 474 if (((tp->t_flags & TF_NOOPT) == 0) && 475 (to->to_flags & TOF_TS)) { 476 txsi->tx_ts = ntohl(to->to_tsval) - 477 tp->ts_offset; 478 txsi->rx_ts = ntohl(to->to_tsecr); 479 } else { 480 txsi->tx_ts = ticks; 481 txsi->rx_ts = 0; /* No received time stamp. */ 482 } 483 TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk); 484 } 485 } 486 487 return (0); 488 } 489 490 static int 491 ertt_mod_init(void) 492 { 493 494 txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo), 495 NULL, NULL, NULL, NULL, 0, 0); 496 497 return (0); 498 } 499 500 static int 501 ertt_mod_destroy(void) 502 { 503 504 uma_zdestroy(txseginfo_zone); 505 506 return (0); 507 } 508 509 static int 510 ertt_uma_ctor(void *mem, int size, void *arg, int flags) 511 { 512 struct ertt *e_t; 513 514 e_t = mem; 515 516 TAILQ_INIT(&e_t->txsegi_q); 517 e_t->timestamp_errors = 0; 518 e_t->minrtt = 0; 519 e_t->maxrtt = 0; 520 e_t->rtt = 0; 521 e_t->flags = 0; 522 e_t->dlyack_rx = 0; 523 e_t->bytes_tx_in_rtt = 0; 524 e_t->markedpkt_rtt = 0; 525 526 return (0); 527 } 528 529 static void 530 ertt_uma_dtor(void *mem, int size, void *arg) 531 { 532 struct ertt *e_t; 533 struct txseginfo *n_txsi, *txsi; 534 535 e_t = mem; 536 txsi = TAILQ_FIRST(&e_t->txsegi_q); 537 while (txsi != NULL) { 538 n_txsi = TAILQ_NEXT(txsi, txsegi_lnk); 539 uma_zfree(txseginfo_zone, txsi); 540 txsi = n_txsi; 541 } 542 } 543 544 KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt), 545 ertt_uma_ctor, ertt_uma_dtor); 546