1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2009-2010
5 * Swinburne University of Technology, Melbourne, Australia
6 * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010-2011 The FreeBSD Foundation
8 * All rights reserved.
9 *
10 * This software was developed at the Centre for Advanced Internet
11 * Architectures, Swinburne University of Technology, by David Hayes, made
12 * possible in part by a grant from the Cisco University Research Program Fund
13 * at Community Foundation Silicon Valley.
14 *
15 * Portions of this software were developed at the Centre for Advanced
16 * Internet Architectures, Swinburne University of Technology, Melbourne,
17 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
18 *
19 * Redistribution and use in source and binary forms, with or without
20 * modification, are permitted provided that the following conditions
21 * are met:
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 * notice, this list of conditions and the following disclaimer in the
26 * documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 */
40
41 #include <sys/param.h>
42 #include <sys/kernel.h>
43 #include <sys/mbuf.h>
44 #include <sys/module.h>
45 #include <sys/hhook.h>
46 #include <sys/khelp.h>
47 #include <sys/module_khelp.h>
48 #include <sys/socket.h>
49 #include <sys/sockopt.h>
50
51 #include <net/vnet.h>
52
53 #include <netinet/in.h>
54 #include <netinet/in_pcb.h>
55 #include <netinet/tcp_seq.h>
56 #include <netinet/tcp_var.h>
57
58 #include <netinet/khelp/h_ertt.h>
59
60 #include <vm/uma.h>
61
62 uma_zone_t txseginfo_zone;
63
/*
 * Smoothing factor for the delayed ack guess: the dlyack_rx estimate is
 * never incremented beyond this value.
 */
#define	DLYACK_SMOOTH	5

/*
 * Max number of time stamp errors allowed in a session; once reached, time
 * stamps are no longer used to match acknowledgements to segments.
 */
#define	MAX_TS_ERR	10
69
/* Module load/unload handlers referenced from ertt_helper. */
static int	ertt_mod_init(void);
static int	ertt_mod_destroy(void);

/* Constructor/destructor for struct ertt, passed to KHELP_DECLARE_MOD_UMA. */
static int	ertt_uma_ctor(void *mem, int size, void *arg, int flags);
static void	ertt_uma_dtor(void *mem, int size, void *arg);

/* Helper hook callbacks listed in ertt_hooks[]. */
static int	ertt_packet_measurement_hook(int hhook_type, int hhook_id,
		    void *udata, void *ctx_data, void *hdata, struct osd *hosd);
static int	ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
		    void *udata, void *ctx_data, void *hdata, struct osd *hosd);
78
79 /*
80 * Contains information about the sent segment for comparison with the
81 * corresponding ack.
82 */
83 struct txseginfo {
84 /* Segment length. */
85 uint32_t len;
86 /* Segment sequence number. */
87 tcp_seq seq;
88 /* Time stamp indicating when the packet was sent. */
89 uint32_t tx_ts;
90 /* Last received receiver ts (if the TCP option is used). */
91 uint32_t rx_ts;
92 uint32_t flags;
93 TAILQ_ENTRY (txseginfo) txsegi_lnk;
94 };
95
/* Bit values stored in txseginfo.flags. */
#define	TXSI_TSO		0x01	/* Entry was sent using TSO. */
#define	TXSI_RTT_MEASURE_START	0x02	/* A per RTT measurement starts here. */
#define	TXSI_RX_MEASURE_END	0x04	/* Measure the rx rate until this txsi. */
100
101 struct helper ertt_helper = {
102 .mod_init = ertt_mod_init,
103 .mod_destroy = ertt_mod_destroy,
104 .h_flags = HELPER_NEEDS_OSD,
105 .h_classes = HELPER_CLASS_TCP
106 };
107
108 /* Define the helper hook info required by ERTT. */
109 struct hookinfo ertt_hooks[] = {
110 {
111 .hook_type = HHOOK_TYPE_TCP,
112 .hook_id = HHOOK_TCP_EST_IN,
113 .hook_udata = NULL,
114 .hook_func = &ertt_packet_measurement_hook
115 },
116 {
117 .hook_type = HHOOK_TYPE_TCP,
118 .hook_id = HHOOK_TCP_EST_OUT,
119 .hook_udata = NULL,
120 .hook_func = &ertt_add_tx_segment_info_hook
121 }
122 };
123
/* mflag values telling marked_packet_rtt() how to treat the given txsi. */
#define	MULTI_ACK		0x01	/* The ack covers more than this txsi. */
#define	OLD_TXSI		0x02	/* Time stamps show this txsi is old. */
#define	CORRECT_ACK		0x04	/* The ack is for this txsi. */
#define	FORCED_MEASUREMENT	0x08	/* Make an RTT measurement regardless. */
129
130 /*
131 * This fuction measures the RTT of a particular segment/ack pair, or the next
132 * closest if this will yield an inaccurate result due to delayed acking or
133 * other issues.
134 */
135 static void inline
marked_packet_rtt(struct txseginfo * txsi,struct ertt * e_t,struct tcpcb * tp,uint32_t * pmeasurenext,int * pmeasurenext_len,int * prtt_bytes_adjust,int mflag)136 marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
137 uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
138 int mflag)
139 {
140
141 /*
142 * If we can't measure this one properly due to delayed acking adjust
143 * byte counters and flag to measure next txsi. Note that since the
144 * marked packet's transmitted bytes are measured we need to subtract the
145 * transmitted bytes. Then pretend the next txsi was marked.
146 */
147 if (mflag & (MULTI_ACK|OLD_TXSI)) {
148 *pmeasurenext = txsi->tx_ts;
149 *pmeasurenext_len = txsi->len;
150 *prtt_bytes_adjust += *pmeasurenext_len;
151 } else {
152 if (mflag & FORCED_MEASUREMENT) {
153 e_t->markedpkt_rtt = tcp_ts_getticks() -
154 *pmeasurenext + 1;
155 e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
156 *pmeasurenext_len - *prtt_bytes_adjust;
157 } else {
158 e_t->markedpkt_rtt = tcp_ts_getticks() -
159 txsi->tx_ts + 1;
160 e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
161 *prtt_bytes_adjust;
162 }
163 e_t->marked_snd_cwnd = tp->snd_cwnd;
164
165 /*
166 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
167 * add_tx_segment_info that a new measurement should be started.
168 */
169 e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
170 /*
171 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
172 * algorithm that a new marked RTT measurement has has been made
173 * and is available for use.
174 */
175 e_t->flags |= ERTT_NEW_MEASUREMENT;
176
177 if (tp->t_flags & TF_TSO) {
178 /* Temporarily disable TSO to aid a new measurement. */
179 tp->t_flags &= ~TF_TSO;
180 /* Keep track that we've disabled it. */
181 e_t->flags |= ERTT_TSO_DISABLED;
182 }
183 }
184 }
185
186 /*
187 * Ertt_packet_measurements uses a small amount of state kept on each packet
188 * sent to match incoming acknowledgements. This enables more accurate and
189 * secure round trip time measurements. The resulting measurement is used for
190 * congestion control algorithms which require a more accurate time.
191 * Ertt_packet_measurements is called via the helper hook in tcp_input.c
192 */
193 static int
ertt_packet_measurement_hook(int hhook_type,int hhook_id,void * udata,void * ctx_data,void * hdata,struct osd * hosd)194 ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
195 void *ctx_data, void *hdata, struct osd *hosd)
196 {
197 struct ertt *e_t;
198 struct tcpcb *tp;
199 struct tcphdr *th;
200 struct tcpopt *to;
201 struct tcp_hhook_data *thdp;
202 struct txseginfo *txsi;
203 int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
204 uint32_t measurenext, rts;
205 tcp_seq ack;
206
207 KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
208 KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
209
210 e_t = (struct ertt *)hdata;
211 thdp = ctx_data;
212 tp = thdp->tp;
213 th = thdp->th;
214 to = thdp->to;
215 new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
216 measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
217 acked = th->th_ack - tp->snd_una;
218
219 INP_WLOCK_ASSERT(tptoinpcb(tp));
220
221 /* Packet has provided new acknowledgements. */
222 if (acked > 0 || new_sacked_bytes) {
223 if (acked == 0 && new_sacked_bytes) {
224 /* Use last sacked data. */
225 ack = tp->sackhint.last_sack_ack;
226 } else
227 ack = th->th_ack;
228
229 txsi = TAILQ_FIRST(&e_t->txsegi_q);
230 while (txsi != NULL) {
231 rts = 0;
232
233 /* Acknowledgement is acking more than this txsi. */
234 if (SEQ_GT(ack, txsi->seq + txsi->len)) {
235 if (txsi->flags & TXSI_RTT_MEASURE_START ||
236 measurenext) {
237 marked_packet_rtt(txsi, e_t, tp,
238 &measurenext, &measurenext_len,
239 &rtt_bytes_adjust, MULTI_ACK);
240 }
241 TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
242 uma_zfree(txseginfo_zone, txsi);
243 txsi = TAILQ_FIRST(&e_t->txsegi_q);
244 continue;
245 }
246
247 /*
248 * Guess if delayed acks are being used by the receiver.
249 *
250 * XXXDH: A simple heuristic that could be improved
251 */
252 if (!new_sacked_bytes) {
253 if (acked > tp->t_maxseg) {
254 e_t->dlyack_rx +=
255 (e_t->dlyack_rx < DLYACK_SMOOTH) ?
256 1 : 0;
257 multiack = 1;
258 } else if (acked > txsi->len) {
259 multiack = 1;
260 e_t->dlyack_rx +=
261 (e_t->dlyack_rx < DLYACK_SMOOTH) ?
262 1 : 0;
263 } else if (acked == tp->t_maxseg ||
264 acked == txsi->len) {
265 e_t->dlyack_rx -=
266 (e_t->dlyack_rx > 0) ? 1 : 0;
267 }
268 /* Otherwise leave dlyack_rx the way it was. */
269 }
270
271 /*
272 * Time stamps are only to help match the txsi with the
273 * received acknowledgements.
274 */
275 if (e_t->timestamp_errors < MAX_TS_ERR &&
276 (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
277 /*
278 * Note: All packets sent with the offload will
279 * have the same time stamp. If we are sending
280 * on a fast interface and the t_maxseg is much
281 * smaller than one tick, this will be fine. The
282 * time stamp would be the same whether we were
283 * using tso or not. However, if the interface
284 * is slow, this will cause problems with the
285 * calculations. If the interface is slow, there
286 * is not reason to be using tso, and it should
287 * be turned off.
288 */
289 /*
290 * If there are too many time stamp errors, time
291 * stamps won't be trusted
292 */
293 rts = to->to_tsecr;
294 /* Before this packet. */
295 if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
296 /* When delayed acking is used, the
297 * reflected time stamp is of the first
298 * packet and thus may be before
299 * txsi->tx_ts.
300 */
301 break;
302 if (TSTMP_GT(rts, txsi->tx_ts)) {
303 /*
304 * If reflected time stamp is later than
305 * tx_tsi, then this txsi is old.
306 */
307 if (txsi->flags & TXSI_RTT_MEASURE_START
308 || measurenext) {
309 marked_packet_rtt(txsi, e_t, tp,
310 &measurenext, &measurenext_len,
311 &rtt_bytes_adjust, OLD_TXSI);
312 }
313 TAILQ_REMOVE(&e_t->txsegi_q, txsi,
314 txsegi_lnk);
315 uma_zfree(txseginfo_zone, txsi);
316 txsi = TAILQ_FIRST(&e_t->txsegi_q);
317 continue;
318 }
319 if (rts == txsi->tx_ts &&
320 TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
321 /*
322 * Segment received before sent!
323 * Something is wrong with the received
324 * timestamps so increment errors. If
325 * this keeps up we will ignore
326 * timestamps.
327 */
328 e_t->timestamp_errors++;
329 }
330 }
331 /*
332 * Acknowledging a sequence number before this txsi.
333 * If it is an old txsi that may have had the same seq
334 * numbers, it should have been removed if time stamps
335 * are being used.
336 */
337 if (SEQ_LEQ(ack, txsi->seq))
338 break; /* Before first packet in txsi. */
339
340 /*
341 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
342 * past this point.
343 *
344 * If delayed acks are being used, an acknowledgement
345 * for a single segment will have been delayed by the
346 * receiver and will yield an inaccurate measurement. In
347 * this case, we only make the measurement if more than
348 * one segment is being acknowledged or sack is
349 * currently being used.
350 */
351 if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
352 /* Make an accurate new measurement. */
353 e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
354
355 if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
356 e_t->minrtt = e_t->rtt;
357
358 if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
359 e_t->maxrtt = e_t->rtt;
360 }
361
362 if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
363 marked_packet_rtt(txsi, e_t, tp,
364 &measurenext, &measurenext_len,
365 &rtt_bytes_adjust, CORRECT_ACK);
366
367 if (txsi->flags & TXSI_TSO) {
368 if (txsi->len > acked) {
369 txsi->len -= acked;
370 /*
371 * This presumes ack for first bytes in
372 * txsi, this may not be true but it
373 * shouldn't cause problems for the
374 * timing.
375 *
376 * We remeasure RTT even though we only
377 * have a single txsi. The rationale
378 * behind this is that it is better to
379 * have a slightly inaccurate
380 * measurement than no additional
381 * measurement for the rest of the bulk
382 * transfer. Since TSO is only used on
383 * high speed interface cards, so the
384 * packets should be transmitted at line
385 * rate back to back with little
386 * difference in transmission times (in
387 * ticks).
388 */
389 txsi->seq += acked;
390 /*
391 * Reset txsi measure flag so we don't
392 * use it for another RTT measurement.
393 */
394 txsi->flags &= ~TXSI_RTT_MEASURE_START;
395 /*
396 * There is still more data to be acked
397 * from tso bulk transmission, so we
398 * won't remove it from the TAILQ yet.
399 */
400 break;
401 }
402 txsi->len = 0;
403 }
404
405 TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
406 uma_zfree(txseginfo_zone, txsi);
407 break;
408 }
409
410 if (measurenext) {
411 /*
412 * We need to do a RTT measurement. It won't be the best
413 * if we do it here.
414 */
415 marked_packet_rtt(txsi, e_t, tp,
416 &measurenext, &measurenext_len,
417 &rtt_bytes_adjust, FORCED_MEASUREMENT);
418 }
419 }
420
421 return (0);
422 }
423
424 /*
425 * Add information about a transmitted segment to a list.
426 * This is called via the helper hook in tcp_output.c
427 */
428 static int
ertt_add_tx_segment_info_hook(int hhook_type,int hhook_id,void * udata,void * ctx_data,void * hdata,struct osd * hosd)429 ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
430 void *ctx_data, void *hdata, struct osd *hosd)
431 {
432 struct ertt *e_t;
433 struct tcpcb *tp;
434 struct tcphdr *th;
435 struct tcpopt *to;
436 struct tcp_hhook_data *thdp;
437 struct txseginfo *txsi;
438 uint32_t len;
439 int tso;
440
441 KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
442 KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
443
444 e_t = (struct ertt *)hdata;
445 thdp = ctx_data;
446 tp = thdp->tp;
447 th = thdp->th;
448 to = thdp->to;
449 len = thdp->len;
450 tso = thdp->tso;
451
452 INP_WLOCK_ASSERT(tptoinpcb(tp));
453
454 if (len > 0) {
455 txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
456 if (txsi != NULL) {
457 /* Construct txsi setting the necessary flags. */
458 txsi->flags = 0; /* Needs to be initialised. */
459 txsi->seq = ntohl(th->th_seq);
460 txsi->len = len;
461 if (tso)
462 txsi->flags |= TXSI_TSO;
463 else if (e_t->flags & ERTT_TSO_DISABLED) {
464 tp->t_flags |= TF_TSO;
465 e_t->flags &= ~ERTT_TSO_DISABLED;
466 }
467
468 if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
469 e_t->bytes_tx_in_rtt += len;
470 } else {
471 txsi->flags |= TXSI_RTT_MEASURE_START;
472 e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
473 e_t->bytes_tx_in_rtt = len;
474 }
475
476 if (((tp->t_flags & TF_NOOPT) == 0) &&
477 (to->to_flags & TOF_TS)) {
478 txsi->tx_ts = ntohl(to->to_tsval) -
479 tp->ts_offset;
480 txsi->rx_ts = ntohl(to->to_tsecr);
481 } else {
482 txsi->tx_ts = tcp_ts_getticks();
483 txsi->rx_ts = 0; /* No received time stamp. */
484 }
485 TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
486 }
487 }
488
489 return (0);
490 }
491
492 static int
ertt_mod_init(void)493 ertt_mod_init(void)
494 {
495
496 txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
497 NULL, NULL, NULL, NULL, 0, 0);
498
499 return (0);
500 }
501
502 static int
ertt_mod_destroy(void)503 ertt_mod_destroy(void)
504 {
505
506 uma_zdestroy(txseginfo_zone);
507
508 return (0);
509 }
510
511 static int
ertt_uma_ctor(void * mem,int size,void * arg,int flags)512 ertt_uma_ctor(void *mem, int size, void *arg, int flags)
513 {
514 struct ertt *e_t;
515
516 e_t = mem;
517
518 TAILQ_INIT(&e_t->txsegi_q);
519 e_t->timestamp_errors = 0;
520 e_t->minrtt = 0;
521 e_t->maxrtt = 0;
522 e_t->rtt = 0;
523 e_t->flags = 0;
524 e_t->dlyack_rx = 0;
525 e_t->bytes_tx_in_rtt = 0;
526 e_t->markedpkt_rtt = 0;
527
528 return (0);
529 }
530
531 static void
ertt_uma_dtor(void * mem,int size,void * arg)532 ertt_uma_dtor(void *mem, int size, void *arg)
533 {
534 struct ertt *e_t;
535 struct txseginfo *n_txsi, *txsi;
536
537 e_t = mem;
538 txsi = TAILQ_FIRST(&e_t->txsegi_q);
539 while (txsi != NULL) {
540 n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
541 uma_zfree(txseginfo_zone, txsi);
542 txsi = n_txsi;
543 }
544 }
545
546 KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
547 ertt_uma_ctor, ertt_uma_dtor);
548