1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2009-2010 5 * Swinburne University of Technology, Melbourne, Australia 6 * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org> 7 * Copyright (c) 2010-2011 The FreeBSD Foundation 8 * All rights reserved. 9 * 10 * This software was developed at the Centre for Advanced Internet 11 * Architectures, Swinburne University of Technology, by David Hayes, made 12 * possible in part by a grant from the Cisco University Research Program Fund 13 * at Community Foundation Silicon Valley. 14 * 15 * Portions of this software were developed at the Centre for Advanced 16 * Internet Architectures, Swinburne University of Technology, Melbourne, 17 * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 18 * 19 * Redistribution and use in source and binary forms, with or without 20 * modification, are permitted provided that the following conditions 21 * are met: 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 */ 40 41 #include <sys/param.h> 42 #include <sys/kernel.h> 43 #include <sys/mbuf.h> 44 #include <sys/module.h> 45 #include <sys/hhook.h> 46 #include <sys/khelp.h> 47 #include <sys/module_khelp.h> 48 #include <sys/socket.h> 49 #include <sys/sockopt.h> 50 51 #include <net/vnet.h> 52 53 #include <netinet/in.h> 54 #include <netinet/in_pcb.h> 55 #include <netinet/tcp_seq.h> 56 #include <netinet/tcp_var.h> 57 58 #include <netinet/khelp/h_ertt.h> 59 60 #include <vm/uma.h> 61 62 uma_zone_t txseginfo_zone; 63 64 /* Smoothing factor for delayed ack guess. */ 65 #define DLYACK_SMOOTH 5 66 67 /* Max number of time stamp errors allowed in a session. */ 68 #define MAX_TS_ERR 10 69 70 static int ertt_packet_measurement_hook(int hhook_type, int hhook_id, 71 void *udata, void *ctx_data, void *hdata, struct osd *hosd); 72 static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, 73 void *udata, void *ctx_data, void *hdata, struct osd *hosd); 74 static int ertt_mod_init(void); 75 static int ertt_mod_destroy(void); 76 static int ertt_uma_ctor(void *mem, int size, void *arg, int flags); 77 static void ertt_uma_dtor(void *mem, int size, void *arg); 78 79 /* 80 * Contains information about the sent segment for comparison with the 81 * corresponding ack. 82 */ 83 struct txseginfo { 84 /* Segment length. */ 85 uint32_t len; 86 /* Segment sequence number. */ 87 tcp_seq seq; 88 /* Time stamp indicating when the packet was sent. */ 89 uint32_t tx_ts; 90 /* Last received receiver ts (if the TCP option is used). */ 91 uint32_t rx_ts; 92 uint32_t flags; 93 TAILQ_ENTRY (txseginfo) txsegi_lnk; 94 }; 95 96 /* Flags for struct txseginfo. */ 97 #define TXSI_TSO 0x01 /* TSO was used for this entry. */ 98 #define TXSI_RTT_MEASURE_START 0x02 /* Start a per RTT measurement. */ 99 #define TXSI_RX_MEASURE_END 0x04 /* Measure the rx rate until this txsi. */ 100 101 struct helper ertt_helper = { 102 .mod_init = ertt_mod_init, 103 .mod_destroy = ertt_mod_destroy, 104 .h_flags = HELPER_NEEDS_OSD, 105 .h_classes = HELPER_CLASS_TCP 106 }; 107 108 /* Define the helper hook info required by ERTT. */ 109 struct hookinfo ertt_hooks[] = { 110 { 111 .hook_type = HHOOK_TYPE_TCP, 112 .hook_id = HHOOK_TCP_EST_IN, 113 .hook_udata = NULL, 114 .hook_func = &ertt_packet_measurement_hook 115 }, 116 { 117 .hook_type = HHOOK_TYPE_TCP, 118 .hook_id = HHOOK_TCP_EST_OUT, 119 .hook_udata = NULL, 120 .hook_func = &ertt_add_tx_segment_info_hook 121 } 122 }; 123 124 /* Flags to indicate how marked_packet_rtt should handle this txsi. */ 125 #define MULTI_ACK 0x01 /* More than this txsi is acked. */ 126 #define OLD_TXSI 0x02 /* TXSI is old according to timestamps. */ 127 #define CORRECT_ACK 0X04 /* Acks this TXSI. */ 128 #define FORCED_MEASUREMENT 0X08 /* Force an RTT measurement. */ 129 130 /* 131 * This fuction measures the RTT of a particular segment/ack pair, or the next 132 * closest if this will yield an inaccurate result due to delayed acking or 133 * other issues. 134 */ 135 static void inline 136 marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp, 137 uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust, 138 int mflag) 139 { 140 141 /* 142 * If we can't measure this one properly due to delayed acking adjust 143 * byte counters and flag to measure next txsi. Note that since the 144 * marked packet's transmitted bytes are measured we need to subtract the 145 * transmitted bytes. Then pretend the next txsi was marked. 146 */ 147 if (mflag & (MULTI_ACK|OLD_TXSI)) { 148 *pmeasurenext = txsi->tx_ts; 149 *pmeasurenext_len = txsi->len; 150 *prtt_bytes_adjust += *pmeasurenext_len; 151 } else { 152 if (mflag & FORCED_MEASUREMENT) { 153 e_t->markedpkt_rtt = tcp_ts_getticks() - 154 *pmeasurenext + 1; 155 e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt + 156 *pmeasurenext_len - *prtt_bytes_adjust; 157 } else { 158 e_t->markedpkt_rtt = tcp_ts_getticks() - 159 txsi->tx_ts + 1; 160 e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt - 161 *prtt_bytes_adjust; 162 } 163 e_t->marked_snd_cwnd = tp->snd_cwnd; 164 165 /* 166 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to 167 * add_tx_segment_info that a new measurement should be started. 168 */ 169 e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS; 170 /* 171 * Set ERTT_NEW_MEASUREMENT to tell the congestion control 172 * algorithm that a new marked RTT measurement has has been made 173 * and is available for use. 174 */ 175 e_t->flags |= ERTT_NEW_MEASUREMENT; 176 177 if (tp->t_flags & TF_TSO) { 178 /* Temporarily disable TSO to aid a new measurement. */ 179 tp->t_flags &= ~TF_TSO; 180 /* Keep track that we've disabled it. */ 181 e_t->flags |= ERTT_TSO_DISABLED; 182 } 183 } 184 } 185 186 /* 187 * Ertt_packet_measurements uses a small amount of state kept on each packet 188 * sent to match incoming acknowledgements. This enables more accurate and 189 * secure round trip time measurements. The resulting measurement is used for 190 * congestion control algorithms which require a more accurate time. 191 * Ertt_packet_measurements is called via the helper hook in tcp_input.c 192 */ 193 static int 194 ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata, 195 void *ctx_data, void *hdata, struct osd *hosd) 196 { 197 struct ertt *e_t; 198 struct tcpcb *tp; 199 struct tcphdr *th; 200 struct tcpopt *to; 201 struct tcp_hhook_data *thdp; 202 struct txseginfo *txsi; 203 int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust; 204 uint32_t measurenext, rts; 205 tcp_seq ack; 206 207 KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__)); 208 KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__)); 209 210 e_t = (struct ertt *)hdata; 211 thdp = ctx_data; 212 tp = thdp->tp; 213 th = thdp->th; 214 to = thdp->to; 215 new_sacked_bytes = (tp->sackhint.last_sack_ack != 0); 216 measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0; 217 acked = th->th_ack - tp->snd_una; 218 219 INP_WLOCK_ASSERT(tptoinpcb(tp)); 220 221 /* Packet has provided new acknowledgements. */ 222 if (acked > 0 || new_sacked_bytes) { 223 if (acked == 0 && new_sacked_bytes) { 224 /* Use last sacked data. */ 225 ack = tp->sackhint.last_sack_ack; 226 } else 227 ack = th->th_ack; 228 229 txsi = TAILQ_FIRST(&e_t->txsegi_q); 230 while (txsi != NULL) { 231 rts = 0; 232 233 /* Acknowledgement is acking more than this txsi. */ 234 if (SEQ_GT(ack, txsi->seq + txsi->len)) { 235 if (txsi->flags & TXSI_RTT_MEASURE_START || 236 measurenext) { 237 marked_packet_rtt(txsi, e_t, tp, 238 &measurenext, &measurenext_len, 239 &rtt_bytes_adjust, MULTI_ACK); 240 } 241 TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); 242 uma_zfree(txseginfo_zone, txsi); 243 txsi = TAILQ_FIRST(&e_t->txsegi_q); 244 continue; 245 } 246 247 /* 248 * Guess if delayed acks are being used by the receiver. 249 * 250 * XXXDH: A simple heuristic that could be improved 251 */ 252 if (!new_sacked_bytes) { 253 if (acked > tp->t_maxseg) { 254 e_t->dlyack_rx += 255 (e_t->dlyack_rx < DLYACK_SMOOTH) ? 256 1 : 0; 257 multiack = 1; 258 } else if (acked > txsi->len) { 259 multiack = 1; 260 e_t->dlyack_rx += 261 (e_t->dlyack_rx < DLYACK_SMOOTH) ? 262 1 : 0; 263 } else if (acked == tp->t_maxseg || 264 acked == txsi->len) { 265 e_t->dlyack_rx -= 266 (e_t->dlyack_rx > 0) ? 1 : 0; 267 } 268 /* Otherwise leave dlyack_rx the way it was. */ 269 } 270 271 /* 272 * Time stamps are only to help match the txsi with the 273 * received acknowledgements. 274 */ 275 if (e_t->timestamp_errors < MAX_TS_ERR && 276 (to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 277 /* 278 * Note: All packets sent with the offload will 279 * have the same time stamp. If we are sending 280 * on a fast interface and the t_maxseg is much 281 * smaller than one tick, this will be fine. The 282 * time stamp would be the same whether we were 283 * using tso or not. However, if the interface 284 * is slow, this will cause problems with the 285 * calculations. If the interface is slow, there 286 * is not reason to be using tso, and it should 287 * be turned off. 288 */ 289 /* 290 * If there are too many time stamp errors, time 291 * stamps won't be trusted 292 */ 293 rts = to->to_tsecr; 294 /* Before this packet. */ 295 if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts)) 296 /* When delayed acking is used, the 297 * reflected time stamp is of the first 298 * packet and thus may be before 299 * txsi->tx_ts. 300 */ 301 break; 302 if (TSTMP_GT(rts, txsi->tx_ts)) { 303 /* 304 * If reflected time stamp is later than 305 * tx_tsi, then this txsi is old. 306 */ 307 if (txsi->flags & TXSI_RTT_MEASURE_START 308 || measurenext) { 309 marked_packet_rtt(txsi, e_t, tp, 310 &measurenext, &measurenext_len, 311 &rtt_bytes_adjust, OLD_TXSI); 312 } 313 TAILQ_REMOVE(&e_t->txsegi_q, txsi, 314 txsegi_lnk); 315 uma_zfree(txseginfo_zone, txsi); 316 txsi = TAILQ_FIRST(&e_t->txsegi_q); 317 continue; 318 } 319 if (rts == txsi->tx_ts && 320 TSTMP_LT(to->to_tsval, txsi->rx_ts)) { 321 /* 322 * Segment received before sent! 323 * Something is wrong with the received 324 * timestamps so increment errors. If 325 * this keeps up we will ignore 326 * timestamps. 327 */ 328 e_t->timestamp_errors++; 329 } 330 } 331 /* 332 * Acknowledging a sequence number before this txsi. 333 * If it is an old txsi that may have had the same seq 334 * numbers, it should have been removed if time stamps 335 * are being used. 336 */ 337 if (SEQ_LEQ(ack, txsi->seq)) 338 break; /* Before first packet in txsi. */ 339 340 /* 341 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len 342 * past this point. 343 * 344 * If delayed acks are being used, an acknowledgement 345 * for a single segment will have been delayed by the 346 * receiver and will yield an inaccurate measurement. In 347 * this case, we only make the measurement if more than 348 * one segment is being acknowledged or sack is 349 * currently being used. 350 */ 351 if (!e_t->dlyack_rx || multiack || new_sacked_bytes) { 352 /* Make an accurate new measurement. */ 353 e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1; 354 355 if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0) 356 e_t->minrtt = e_t->rtt; 357 358 if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0) 359 e_t->maxrtt = e_t->rtt; 360 } 361 362 if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) 363 marked_packet_rtt(txsi, e_t, tp, 364 &measurenext, &measurenext_len, 365 &rtt_bytes_adjust, CORRECT_ACK); 366 367 if (txsi->flags & TXSI_TSO) { 368 if (txsi->len > acked) { 369 txsi->len -= acked; 370 /* 371 * This presumes ack for first bytes in 372 * txsi, this may not be true but it 373 * shouldn't cause problems for the 374 * timing. 375 * 376 * We remeasure RTT even though we only 377 * have a single txsi. The rationale 378 * behind this is that it is better to 379 * have a slightly inaccurate 380 * measurement than no additional 381 * measurement for the rest of the bulk 382 * transfer. Since TSO is only used on 383 * high speed interface cards, so the 384 * packets should be transmitted at line 385 * rate back to back with little 386 * difference in transmission times (in 387 * ticks). 388 */ 389 txsi->seq += acked; 390 /* 391 * Reset txsi measure flag so we don't 392 * use it for another RTT measurement. 393 */ 394 txsi->flags &= ~TXSI_RTT_MEASURE_START; 395 /* 396 * There is still more data to be acked 397 * from tso bulk transmission, so we 398 * won't remove it from the TAILQ yet. 399 */ 400 break; 401 } 402 txsi->len = 0; 403 } 404 405 TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); 406 uma_zfree(txseginfo_zone, txsi); 407 break; 408 } 409 410 if (measurenext) { 411 /* 412 * We need to do a RTT measurement. It won't be the best 413 * if we do it here. 414 */ 415 marked_packet_rtt(txsi, e_t, tp, 416 &measurenext, &measurenext_len, 417 &rtt_bytes_adjust, FORCED_MEASUREMENT); 418 } 419 } 420 421 return (0); 422 } 423 424 /* 425 * Add information about a transmitted segment to a list. 426 * This is called via the helper hook in tcp_output.c 427 */ 428 static int 429 ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata, 430 void *ctx_data, void *hdata, struct osd *hosd) 431 { 432 struct ertt *e_t; 433 struct tcpcb *tp; 434 struct tcphdr *th; 435 struct tcpopt *to; 436 struct tcp_hhook_data *thdp; 437 struct txseginfo *txsi; 438 uint32_t len; 439 int tso; 440 441 KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__)); 442 KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__)); 443 444 e_t = (struct ertt *)hdata; 445 thdp = ctx_data; 446 tp = thdp->tp; 447 th = thdp->th; 448 to = thdp->to; 449 len = thdp->len; 450 tso = thdp->tso; 451 452 INP_WLOCK_ASSERT(tptoinpcb(tp)); 453 454 if (len > 0) { 455 txsi = uma_zalloc(txseginfo_zone, M_NOWAIT); 456 if (txsi != NULL) { 457 /* Construct txsi setting the necessary flags. */ 458 txsi->flags = 0; /* Needs to be initialised. */ 459 txsi->seq = ntohl(th->th_seq); 460 txsi->len = len; 461 if (tso) 462 txsi->flags |= TXSI_TSO; 463 else if (e_t->flags & ERTT_TSO_DISABLED) { 464 tp->t_flags |= TF_TSO; 465 e_t->flags &= ~ERTT_TSO_DISABLED; 466 } 467 468 if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) { 469 e_t->bytes_tx_in_rtt += len; 470 } else { 471 txsi->flags |= TXSI_RTT_MEASURE_START; 472 e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS; 473 e_t->bytes_tx_in_rtt = len; 474 } 475 476 if (((tp->t_flags & TF_NOOPT) == 0) && 477 (to->to_flags & TOF_TS)) { 478 txsi->tx_ts = ntohl(to->to_tsval) - 479 tp->ts_offset; 480 txsi->rx_ts = ntohl(to->to_tsecr); 481 } else { 482 txsi->tx_ts = tcp_ts_getticks(); 483 txsi->rx_ts = 0; /* No received time stamp. */ 484 } 485 TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk); 486 } 487 } 488 489 return (0); 490 } 491 492 static int 493 ertt_mod_init(void) 494 { 495 496 txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo), 497 NULL, NULL, NULL, NULL, 0, 0); 498 499 return (0); 500 } 501 502 static int 503 ertt_mod_destroy(void) 504 { 505 506 uma_zdestroy(txseginfo_zone); 507 508 return (0); 509 } 510 511 static int 512 ertt_uma_ctor(void *mem, int size, void *arg, int flags) 513 { 514 struct ertt *e_t; 515 516 e_t = mem; 517 518 TAILQ_INIT(&e_t->txsegi_q); 519 e_t->timestamp_errors = 0; 520 e_t->minrtt = 0; 521 e_t->maxrtt = 0; 522 e_t->rtt = 0; 523 e_t->flags = 0; 524 e_t->dlyack_rx = 0; 525 e_t->bytes_tx_in_rtt = 0; 526 e_t->markedpkt_rtt = 0; 527 528 return (0); 529 } 530 531 static void 532 ertt_uma_dtor(void *mem, int size, void *arg) 533 { 534 struct ertt *e_t; 535 struct txseginfo *n_txsi, *txsi; 536 537 e_t = mem; 538 txsi = TAILQ_FIRST(&e_t->txsegi_q); 539 while (txsi != NULL) { 540 n_txsi = TAILQ_NEXT(txsi, txsegi_lnk); 541 uma_zfree(txseginfo_zone, txsi); 542 txsi = n_txsi; 543 } 544 } 545 546 KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt), 547 ertt_uma_ctor, ertt_uma_dtor); 548