1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2009-2010 5 * Swinburne University of Technology, Melbourne, Australia 6 * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org> 7 * Copyright (c) 2010-2011 The FreeBSD Foundation 8 * All rights reserved. 9 * 10 * This software was developed at the Centre for Advanced Internet 11 * Architectures, Swinburne University of Technology, by David Hayes, made 12 * possible in part by a grant from the Cisco University Research Program Fund 13 * at Community Foundation Silicon Valley. 14 * 15 * Portions of this software were developed at the Centre for Advanced 16 * Internet Architectures, Swinburne University of Technology, Melbourne, 17 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 18 * 19 * Redistribution and use in source and binary forms, with or without 20 * modification, are permitted provided that the following conditions 21 * are met: 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 */ 40 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 #include <sys/param.h> 45 #include <sys/kernel.h> 46 #include <sys/mbuf.h> 47 #include <sys/module.h> 48 #include <sys/hhook.h> 49 #include <sys/khelp.h> 50 #include <sys/module_khelp.h> 51 #include <sys/socket.h> 52 #include <sys/sockopt.h> 53 54 #include <net/vnet.h> 55 56 #include <netinet/in.h> 57 #include <netinet/in_pcb.h> 58 #include <netinet/tcp_seq.h> 59 #include <netinet/tcp_var.h> 60 61 #include <netinet/khelp/h_ertt.h> 62 63 #include <vm/uma.h> 64 65 uma_zone_t txseginfo_zone; 66 67 /* Smoothing factor for delayed ack guess. */ 68 #define DLYACK_SMOOTH 5 69 70 /* Max number of time stamp errors allowed in a session. */ 71 #define MAX_TS_ERR 10 72 73 static int ertt_packet_measurement_hook(int hhook_type, int hhook_id, 74 void *udata, void *ctx_data, void *hdata, struct osd *hosd); 75 static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, 76 void *udata, void *ctx_data, void *hdata, struct osd *hosd); 77 static int ertt_mod_init(void); 78 static int ertt_mod_destroy(void); 79 static int ertt_uma_ctor(void *mem, int size, void *arg, int flags); 80 static void ertt_uma_dtor(void *mem, int size, void *arg); 81 82 /* 83 * Contains information about the sent segment for comparison with the 84 * corresponding ack. 85 */ 86 struct txseginfo { 87 /* Segment length. */ 88 uint32_t len; 89 /* Segment sequence number. */ 90 tcp_seq seq; 91 /* Time stamp indicating when the packet was sent. */ 92 uint32_t tx_ts; 93 /* Last received receiver ts (if the TCP option is used). */ 94 uint32_t rx_ts; 95 uint32_t flags; 96 TAILQ_ENTRY (txseginfo) txsegi_lnk; 97 }; 98 99 /* Flags for struct txseginfo. */ 100 #define TXSI_TSO 0x01 /* TSO was used for this entry. */ 101 #define TXSI_RTT_MEASURE_START 0x02 /* Start a per RTT measurement. */ 102 #define TXSI_RX_MEASURE_END 0x04 /* Measure the rx rate until this txsi. */ 103 104 struct helper ertt_helper = { 105 .mod_init = ertt_mod_init, 106 .mod_destroy = ertt_mod_destroy, 107 .h_flags = HELPER_NEEDS_OSD, 108 .h_classes = HELPER_CLASS_TCP 109 }; 110 111 /* Define the helper hook info required by ERTT. */ 112 struct hookinfo ertt_hooks[] = { 113 { 114 .hook_type = HHOOK_TYPE_TCP, 115 .hook_id = HHOOK_TCP_EST_IN, 116 .hook_udata = NULL, 117 .hook_func = &ertt_packet_measurement_hook 118 }, 119 { 120 .hook_type = HHOOK_TYPE_TCP, 121 .hook_id = HHOOK_TCP_EST_OUT, 122 .hook_udata = NULL, 123 .hook_func = &ertt_add_tx_segment_info_hook 124 } 125 }; 126 127 /* Flags to indicate how marked_packet_rtt should handle this txsi. */ 128 #define MULTI_ACK 0x01 /* More than this txsi is acked. */ 129 #define OLD_TXSI 0x02 /* TXSI is old according to timestamps. */ 130 #define CORRECT_ACK 0X04 /* Acks this TXSI. */ 131 #define FORCED_MEASUREMENT 0X08 /* Force an RTT measurement. */ 132 133 /* 134 * This fuction measures the RTT of a particular segment/ack pair, or the next 135 * closest if this will yield an inaccurate result due to delayed acking or 136 * other issues. 137 */ 138 static void inline 139 marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp, 140 uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust, 141 int mflag) 142 { 143 144 /* 145 * If we can't measure this one properly due to delayed acking adjust 146 * byte counters and flag to measure next txsi. Note that since the 147 * marked packet's transmitted bytes are measured we need to subtract the 148 * transmitted bytes. Then pretend the next txsi was marked. 149 */ 150 if (mflag & (MULTI_ACK|OLD_TXSI)) { 151 *pmeasurenext = txsi->tx_ts; 152 *pmeasurenext_len = txsi->len; 153 *prtt_bytes_adjust += *pmeasurenext_len; 154 } else { 155 if (mflag & FORCED_MEASUREMENT) { 156 e_t->markedpkt_rtt = tcp_ts_getticks() - 157 *pmeasurenext + 1; 158 e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt + 159 *pmeasurenext_len - *prtt_bytes_adjust; 160 } else { 161 e_t->markedpkt_rtt = tcp_ts_getticks() - 162 txsi->tx_ts + 1; 163 e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt - 164 *prtt_bytes_adjust; 165 } 166 e_t->marked_snd_cwnd = tp->snd_cwnd; 167 168 /* 169 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to 170 * add_tx_segment_info that a new measurement should be started. 171 */ 172 e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS; 173 /* 174 * Set ERTT_NEW_MEASUREMENT to tell the congestion control 175 * algorithm that a new marked RTT measurement has has been made 176 * and is available for use. 177 */ 178 e_t->flags |= ERTT_NEW_MEASUREMENT; 179 180 if (tp->t_flags & TF_TSO) { 181 /* Temporarily disable TSO to aid a new measurement. */ 182 tp->t_flags &= ~TF_TSO; 183 /* Keep track that we've disabled it. */ 184 e_t->flags |= ERTT_TSO_DISABLED; 185 } 186 } 187 } 188 189 /* 190 * Ertt_packet_measurements uses a small amount of state kept on each packet 191 * sent to match incoming acknowledgements. This enables more accurate and 192 * secure round trip time measurements. The resulting measurement is used for 193 * congestion control algorithms which require a more accurate time. 194 * Ertt_packet_measurements is called via the helper hook in tcp_input.c 195 */ 196 static int 197 ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata, 198 void *ctx_data, void *hdata, struct osd *hosd) 199 { 200 struct ertt *e_t; 201 struct tcpcb *tp; 202 struct tcphdr *th; 203 struct tcpopt *to; 204 struct tcp_hhook_data *thdp; 205 struct txseginfo *txsi; 206 int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust; 207 uint32_t measurenext, rts; 208 tcp_seq ack; 209 210 KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__)); 211 KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__)); 212 213 e_t = (struct ertt *)hdata; 214 thdp = ctx_data; 215 tp = thdp->tp; 216 th = thdp->th; 217 to = thdp->to; 218 new_sacked_bytes = (tp->sackhint.last_sack_ack != 0); 219 measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0; 220 acked = th->th_ack - tp->snd_una; 221 222 INP_WLOCK_ASSERT(tptoinpcb(tp)); 223 224 /* Packet has provided new acknowledgements. */ 225 if (acked > 0 || new_sacked_bytes) { 226 if (acked == 0 && new_sacked_bytes) { 227 /* Use last sacked data. */ 228 ack = tp->sackhint.last_sack_ack; 229 } else 230 ack = th->th_ack; 231 232 txsi = TAILQ_FIRST(&e_t->txsegi_q); 233 while (txsi != NULL) { 234 rts = 0; 235 236 /* Acknowledgement is acking more than this txsi. */ 237 if (SEQ_GT(ack, txsi->seq + txsi->len)) { 238 if (txsi->flags & TXSI_RTT_MEASURE_START || 239 measurenext) { 240 marked_packet_rtt(txsi, e_t, tp, 241 &measurenext, &measurenext_len, 242 &rtt_bytes_adjust, MULTI_ACK); 243 } 244 TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); 245 uma_zfree(txseginfo_zone, txsi); 246 txsi = TAILQ_FIRST(&e_t->txsegi_q); 247 continue; 248 } 249 250 /* 251 * Guess if delayed acks are being used by the receiver. 252 * 253 * XXXDH: A simple heuristic that could be improved 254 */ 255 if (!new_sacked_bytes) { 256 if (acked > tp->t_maxseg) { 257 e_t->dlyack_rx += 258 (e_t->dlyack_rx < DLYACK_SMOOTH) ? 259 1 : 0; 260 multiack = 1; 261 } else if (acked > txsi->len) { 262 multiack = 1; 263 e_t->dlyack_rx += 264 (e_t->dlyack_rx < DLYACK_SMOOTH) ? 265 1 : 0; 266 } else if (acked == tp->t_maxseg || 267 acked == txsi->len) { 268 e_t->dlyack_rx -= 269 (e_t->dlyack_rx > 0) ? 1 : 0; 270 } 271 /* Otherwise leave dlyack_rx the way it was. */ 272 } 273 274 /* 275 * Time stamps are only to help match the txsi with the 276 * received acknowledgements. 277 */ 278 if (e_t->timestamp_errors < MAX_TS_ERR && 279 (to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 280 /* 281 * Note: All packets sent with the offload will 282 * have the same time stamp. If we are sending 283 * on a fast interface and the t_maxseg is much 284 * smaller than one tick, this will be fine. The 285 * time stamp would be the same whether we were 286 * using tso or not. However, if the interface 287 * is slow, this will cause problems with the 288 * calculations. If the interface is slow, there 289 * is not reason to be using tso, and it should 290 * be turned off. 291 */ 292 /* 293 * If there are too many time stamp errors, time 294 * stamps won't be trusted 295 */ 296 rts = to->to_tsecr; 297 /* Before this packet. */ 298 if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts)) 299 /* When delayed acking is used, the 300 * reflected time stamp is of the first 301 * packet and thus may be before 302 * txsi->tx_ts. 303 */ 304 break; 305 if (TSTMP_GT(rts, txsi->tx_ts)) { 306 /* 307 * If reflected time stamp is later than 308 * tx_tsi, then this txsi is old. 309 */ 310 if (txsi->flags & TXSI_RTT_MEASURE_START 311 || measurenext) { 312 marked_packet_rtt(txsi, e_t, tp, 313 &measurenext, &measurenext_len, 314 &rtt_bytes_adjust, OLD_TXSI); 315 } 316 TAILQ_REMOVE(&e_t->txsegi_q, txsi, 317 txsegi_lnk); 318 uma_zfree(txseginfo_zone, txsi); 319 txsi = TAILQ_FIRST(&e_t->txsegi_q); 320 continue; 321 } 322 if (rts == txsi->tx_ts && 323 TSTMP_LT(to->to_tsval, txsi->rx_ts)) { 324 /* 325 * Segment received before sent! 326 * Something is wrong with the received 327 * timestamps so increment errors. If 328 * this keeps up we will ignore 329 * timestamps. 330 */ 331 e_t->timestamp_errors++; 332 } 333 } 334 /* 335 * Acknowledging a sequence number before this txsi. 336 * If it is an old txsi that may have had the same seq 337 * numbers, it should have been removed if time stamps 338 * are being used. 339 */ 340 if (SEQ_LEQ(ack, txsi->seq)) 341 break; /* Before first packet in txsi. */ 342 343 /* 344 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len 345 * past this point. 346 * 347 * If delayed acks are being used, an acknowledgement 348 * for a single segment will have been delayed by the 349 * receiver and will yield an inaccurate measurement. In 350 * this case, we only make the measurement if more than 351 * one segment is being acknowledged or sack is 352 * currently being used. 353 */ 354 if (!e_t->dlyack_rx || multiack || new_sacked_bytes) { 355 /* Make an accurate new measurement. */ 356 e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1; 357 358 if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0) 359 e_t->minrtt = e_t->rtt; 360 361 if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0) 362 e_t->maxrtt = e_t->rtt; 363 } 364 365 if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) 366 marked_packet_rtt(txsi, e_t, tp, 367 &measurenext, &measurenext_len, 368 &rtt_bytes_adjust, CORRECT_ACK); 369 370 if (txsi->flags & TXSI_TSO) { 371 if (txsi->len > acked) { 372 txsi->len -= acked; 373 /* 374 * This presumes ack for first bytes in 375 * txsi, this may not be true but it 376 * shouldn't cause problems for the 377 * timing. 378 * 379 * We remeasure RTT even though we only 380 * have a single txsi. The rationale 381 * behind this is that it is better to 382 * have a slightly inaccurate 383 * measurement than no additional 384 * measurement for the rest of the bulk 385 * transfer. Since TSO is only used on 386 * high speed interface cards, so the 387 * packets should be transmitted at line 388 * rate back to back with little 389 * difference in transmission times (in 390 * ticks). 391 */ 392 txsi->seq += acked; 393 /* 394 * Reset txsi measure flag so we don't 395 * use it for another RTT measurement. 396 */ 397 txsi->flags &= ~TXSI_RTT_MEASURE_START; 398 /* 399 * There is still more data to be acked 400 * from tso bulk transmission, so we 401 * won't remove it from the TAILQ yet. 402 */ 403 break; 404 } 405 txsi->len = 0; 406 } 407 408 TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); 409 uma_zfree(txseginfo_zone, txsi); 410 break; 411 } 412 413 if (measurenext) { 414 /* 415 * We need to do a RTT measurement. It won't be the best 416 * if we do it here. 417 */ 418 marked_packet_rtt(txsi, e_t, tp, 419 &measurenext, &measurenext_len, 420 &rtt_bytes_adjust, FORCED_MEASUREMENT); 421 } 422 } 423 424 return (0); 425 } 426 427 /* 428 * Add information about a transmitted segment to a list. 429 * This is called via the helper hook in tcp_output.c 430 */ 431 static int 432 ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata, 433 void *ctx_data, void *hdata, struct osd *hosd) 434 { 435 struct ertt *e_t; 436 struct tcpcb *tp; 437 struct tcphdr *th; 438 struct tcpopt *to; 439 struct tcp_hhook_data *thdp; 440 struct txseginfo *txsi; 441 uint32_t len; 442 int tso; 443 444 KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__)); 445 KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__)); 446 447 e_t = (struct ertt *)hdata; 448 thdp = ctx_data; 449 tp = thdp->tp; 450 th = thdp->th; 451 to = thdp->to; 452 len = thdp->len; 453 tso = thdp->tso; 454 455 INP_WLOCK_ASSERT(tptoinpcb(tp)); 456 457 if (len > 0) { 458 txsi = uma_zalloc(txseginfo_zone, M_NOWAIT); 459 if (txsi != NULL) { 460 /* Construct txsi setting the necessary flags. */ 461 txsi->flags = 0; /* Needs to be initialised. */ 462 txsi->seq = ntohl(th->th_seq); 463 txsi->len = len; 464 if (tso) 465 txsi->flags |= TXSI_TSO; 466 else if (e_t->flags & ERTT_TSO_DISABLED) { 467 tp->t_flags |= TF_TSO; 468 e_t->flags &= ~ERTT_TSO_DISABLED; 469 } 470 471 if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) { 472 e_t->bytes_tx_in_rtt += len; 473 } else { 474 txsi->flags |= TXSI_RTT_MEASURE_START; 475 e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS; 476 e_t->bytes_tx_in_rtt = len; 477 } 478 479 if (((tp->t_flags & TF_NOOPT) == 0) && 480 (to->to_flags & TOF_TS)) { 481 txsi->tx_ts = ntohl(to->to_tsval) - 482 tp->ts_offset; 483 txsi->rx_ts = ntohl(to->to_tsecr); 484 } else { 485 txsi->tx_ts = tcp_ts_getticks(); 486 txsi->rx_ts = 0; /* No received time stamp. */ 487 } 488 TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk); 489 } 490 } 491 492 return (0); 493 } 494 495 static int 496 ertt_mod_init(void) 497 { 498 499 txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo), 500 NULL, NULL, NULL, NULL, 0, 0); 501 502 return (0); 503 } 504 505 static int 506 ertt_mod_destroy(void) 507 { 508 509 uma_zdestroy(txseginfo_zone); 510 511 return (0); 512 } 513 514 static int 515 ertt_uma_ctor(void *mem, int size, void *arg, int flags) 516 { 517 struct ertt *e_t; 518 519 e_t = mem; 520 521 TAILQ_INIT(&e_t->txsegi_q); 522 e_t->timestamp_errors = 0; 523 e_t->minrtt = 0; 524 e_t->maxrtt = 0; 525 e_t->rtt = 0; 526 e_t->flags = 0; 527 e_t->dlyack_rx = 0; 528 e_t->bytes_tx_in_rtt = 0; 529 e_t->markedpkt_rtt = 0; 530 531 return (0); 532 } 533 534 static void 535 ertt_uma_dtor(void *mem, int size, void *arg) 536 { 537 struct ertt *e_t; 538 struct txseginfo *n_txsi, *txsi; 539 540 e_t = mem; 541 txsi = TAILQ_FIRST(&e_t->txsegi_q); 542 while (txsi != NULL) { 543 n_txsi = TAILQ_NEXT(txsi, txsegi_lnk); 544 uma_zfree(txseginfo_zone, txsi); 545 txsi = n_txsi; 546 } 547 } 548 549 KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt), 550 ertt_uma_ctor, ertt_uma_dtor); 551