xref: /freebsd/sys/netinet/khelp/h_ertt.c (revision c8e7f78a3d28ff6e6223ed136ada8e1e2f34965e)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2009-2010
5  * 	Swinburne University of Technology, Melbourne, Australia
6  * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
7  * Copyright (c) 2010-2011 The FreeBSD Foundation
8  * All rights reserved.
9  *
10  * This software was developed at the Centre for Advanced Internet
11  * Architectures, Swinburne University of Technology, by David Hayes, made
12  * possible in part by a grant from the Cisco University Research Program Fund
13  * at Community Foundation Silicon Valley.
14  *
15  * Portions of this software were developed at the Centre for Advanced
16  * Internet Architectures, Swinburne University of Technology, Melbourne,
17  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
18  *
19  * Redistribution and use in source and binary forms, with or without
20  * modification, are permitted provided that the following conditions
21  * are met:
22  * 1. Redistributions of source code must retain the above copyright
23  *    notice, this list of conditions and the following disclaimer.
24  * 2. Redistributions in binary form must reproduce the above copyright
25  *    notice, this list of conditions and the following disclaimer in the
26  *    documentation and/or other materials provided with the distribution.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 #include <sys/param.h>
42 #include <sys/kernel.h>
43 #include <sys/mbuf.h>
44 #include <sys/module.h>
45 #include <sys/hhook.h>
46 #include <sys/khelp.h>
47 #include <sys/module_khelp.h>
48 #include <sys/socket.h>
49 #include <sys/sockopt.h>
50 
51 #include <net/vnet.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/in_pcb.h>
55 #include <netinet/tcp_seq.h>
56 #include <netinet/tcp_var.h>
57 
58 #include <netinet/khelp/h_ertt.h>
59 
60 #include <vm/uma.h>
61 
62 uma_zone_t txseginfo_zone;
63 
/*
 * Smoothing factor for the delayed ack guess: the per-connection dlyack_rx
 * counter is only incremented while it is below this value.
 */
#define	DLYACK_SMOOTH	5

/*
 * Max number of time stamp errors allowed in a session. Once a connection
 * has recorded this many errors, time stamps are no longer used to match
 * acknowledgements against transmitted segments.
 */
#define	MAX_TS_ERR	10
69 
/* Helper hook handlers, referenced from ertt_hooks[]. */
static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
    void *udata, void *ctx_data, void *hdata, struct osd *hosd);
static int ertt_packet_measurement_hook(int hhook_type, int hhook_id,
    void *udata, void *ctx_data, void *hdata, struct osd *hosd);

/* Module load/unload handlers, referenced from ertt_helper. */
static int ertt_mod_init(void);
static int ertt_mod_destroy(void);

/* Constructor/destructor for the per-connection struct ertt. */
static int ertt_uma_ctor(void *mem, int size, void *arg, int flags);
static void ertt_uma_dtor(void *mem, int size, void *arg);
78 
79 /*
80  * Contains information about the sent segment for comparison with the
81  * corresponding ack.
82  */
83 struct txseginfo {
84 	/* Segment length. */
85 	uint32_t	len;
86 	/* Segment sequence number. */
87 	tcp_seq		seq;
88 	/* Time stamp indicating when the packet was sent. */
89 	uint32_t	tx_ts;
90 	/* Last received receiver ts (if the TCP option is used). */
91 	uint32_t	rx_ts;
92 	uint32_t	flags;
93 	TAILQ_ENTRY (txseginfo) txsegi_lnk;
94 };
95 
/*
 * Bits stored in txseginfo.flags.
 */
/* TSO was used for this entry. */
#define	TXSI_TSO		0x01
/* This entry starts a per RTT measurement. */
#define	TXSI_RTT_MEASURE_START	0x02
/* Measure the rx rate until this txsi. */
#define	TXSI_RX_MEASURE_END	0x04
100 
101 struct helper ertt_helper = {
102 	.mod_init = ertt_mod_init,
103 	.mod_destroy = ertt_mod_destroy,
104 	.h_flags = HELPER_NEEDS_OSD,
105 	.h_classes = HELPER_CLASS_TCP
106 };
107 
108 /* Define the helper hook info required by ERTT. */
109 struct hookinfo ertt_hooks[] = {
110 	{
111 		.hook_type = HHOOK_TYPE_TCP,
112 		.hook_id = HHOOK_TCP_EST_IN,
113 		.hook_udata = NULL,
114 		.hook_func = &ertt_packet_measurement_hook
115 	},
116 	{
117 		.hook_type = HHOOK_TYPE_TCP,
118 		.hook_id = HHOOK_TCP_EST_OUT,
119 		.hook_udata = NULL,
120 		.hook_func = &ertt_add_tx_segment_info_hook
121 	}
122 };
123 
/*
 * Values for the mflag argument of marked_packet_rtt(), telling it how the
 * txsi it is handed relates to the acknowledgement being processed.
 */
/* More than this txsi is acked. */
#define	MULTI_ACK		0x01
/* TXSI is old according to timestamps. */
#define	OLD_TXSI		0x02
/* Acks this TXSI. */
#define	CORRECT_ACK		0x04
/* Force an RTT measurement. */
#define	FORCED_MEASUREMENT	0x08
129 
130 /*
131  * This fuction measures the RTT of a particular segment/ack pair, or the next
132  * closest if this will yield an inaccurate result due to delayed acking or
133  * other issues.
134  */
135 static void inline
136 marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
137     uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
138     int mflag)
139 {
140 
141 	/*
142 	 * If we can't measure this one properly due to delayed acking adjust
143 	 * byte counters and flag to measure next txsi. Note that since the
144 	 * marked packet's transmitted bytes are measured we need to subtract the
145 	 * transmitted bytes. Then pretend the next txsi was marked.
146 	 */
147 	if (mflag & (MULTI_ACK|OLD_TXSI)) {
148 		*pmeasurenext = txsi->tx_ts;
149 		*pmeasurenext_len = txsi->len;
150 		*prtt_bytes_adjust += *pmeasurenext_len;
151 	} else {
152 		if (mflag & FORCED_MEASUREMENT) {
153 			e_t->markedpkt_rtt = tcp_ts_getticks() -
154 			    *pmeasurenext + 1;
155 			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
156 			    *pmeasurenext_len - *prtt_bytes_adjust;
157 		} else {
158 			e_t->markedpkt_rtt = tcp_ts_getticks() -
159 			    txsi->tx_ts + 1;
160 			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
161 			    *prtt_bytes_adjust;
162 		}
163 		e_t->marked_snd_cwnd = tp->snd_cwnd;
164 
165 		/*
166 		 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
167 		 * add_tx_segment_info that a new measurement should be started.
168 		 */
169 		e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
170 		/*
171 		 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
172 		 * algorithm that a new marked RTT measurement has has been made
173 		 * and is available for use.
174 		 */
175 		e_t->flags |= ERTT_NEW_MEASUREMENT;
176 
177 		if (tp->t_flags & TF_TSO) {
178 			/* Temporarily disable TSO to aid a new measurement. */
179 			tp->t_flags &= ~TF_TSO;
180 			/* Keep track that we've disabled it. */
181 			e_t->flags |= ERTT_TSO_DISABLED;
182 		}
183 	}
184 }
185 
186 /*
187  * Ertt_packet_measurements uses a small amount of state kept on each packet
188  * sent to match incoming acknowledgements. This enables more accurate and
189  * secure round trip time measurements. The resulting measurement is used for
190  * congestion control algorithms which require a more accurate time.
191  * Ertt_packet_measurements is called via the helper hook in tcp_input.c
192  */
193 static int
194 ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
195     void *ctx_data, void *hdata, struct osd *hosd)
196 {
197 	struct ertt *e_t;
198 	struct tcpcb *tp;
199 	struct tcphdr *th;
200 	struct tcpopt *to;
201 	struct tcp_hhook_data *thdp;
202 	struct txseginfo *txsi;
203 	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
204 	uint32_t measurenext, rts;
205 	tcp_seq ack;
206 
207 	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
208 	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
209 
210 	e_t = (struct ertt *)hdata;
211 	thdp = ctx_data;
212 	tp = thdp->tp;
213 	th = thdp->th;
214 	to = thdp->to;
215 	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
216 	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
217 	acked = th->th_ack - tp->snd_una;
218 
219 	INP_WLOCK_ASSERT(tptoinpcb(tp));
220 
221 	/* Packet has provided new acknowledgements. */
222 	if (acked > 0 || new_sacked_bytes) {
223 		if (acked == 0 && new_sacked_bytes) {
224 			/* Use last sacked data. */
225 			ack = tp->sackhint.last_sack_ack;
226 		} else
227 			ack = th->th_ack;
228 
229 		txsi = TAILQ_FIRST(&e_t->txsegi_q);
230 		while (txsi != NULL) {
231 			rts = 0;
232 
233 			/* Acknowledgement is acking more than this txsi. */
234 			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
235 				if (txsi->flags & TXSI_RTT_MEASURE_START ||
236 				    measurenext) {
237 					marked_packet_rtt(txsi, e_t, tp,
238 					    &measurenext, &measurenext_len,
239 					    &rtt_bytes_adjust, MULTI_ACK);
240 				}
241 				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
242 				uma_zfree(txseginfo_zone, txsi);
243 				txsi = TAILQ_FIRST(&e_t->txsegi_q);
244 				continue;
245 			}
246 
247 			/*
248 			 * Guess if delayed acks are being used by the receiver.
249 			 *
250 			 * XXXDH: A simple heuristic that could be improved
251 			 */
252 			if (!new_sacked_bytes) {
253 				if (acked > tp->t_maxseg) {
254 					e_t->dlyack_rx +=
255 					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
256 					    1 : 0;
257 					multiack = 1;
258 				} else if (acked > txsi->len) {
259 					multiack = 1;
260 					e_t->dlyack_rx +=
261 					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
262 					    1 : 0;
263 				} else if (acked == tp->t_maxseg ||
264 					   acked == txsi->len) {
265 					e_t->dlyack_rx -=
266 					    (e_t->dlyack_rx > 0) ? 1 : 0;
267 				}
268 				/* Otherwise leave dlyack_rx the way it was. */
269 			}
270 
271 			/*
272 			 * Time stamps are only to help match the txsi with the
273 			 * received acknowledgements.
274 			 */
275 			if (e_t->timestamp_errors < MAX_TS_ERR &&
276 			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
277 				/*
278 				 * Note: All packets sent with the offload will
279 				 * have the same time stamp. If we are sending
280 				 * on a fast interface and the t_maxseg is much
281 				 * smaller than one tick, this will be fine. The
282 				 * time stamp would be the same whether we were
283 				 * using tso or not. However, if the interface
284 				 * is slow, this will cause problems with the
285 				 * calculations. If the interface is slow, there
286 				 * is not reason to be using tso, and it should
287 				 * be turned off.
288 				 */
289 				/*
290 				 * If there are too many time stamp errors, time
291 				 * stamps won't be trusted
292 				 */
293 				rts = to->to_tsecr;
294 				/* Before this packet. */
295 				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
296 					/* When delayed acking is used, the
297 					 * reflected time stamp is of the first
298 					 * packet and thus may be before
299 					 * txsi->tx_ts.
300 					 */
301 					break;
302 				if (TSTMP_GT(rts, txsi->tx_ts)) {
303 					/*
304 					 * If reflected time stamp is later than
305 					 * tx_tsi, then this txsi is old.
306 					 */
307 					if (txsi->flags & TXSI_RTT_MEASURE_START
308 					    || measurenext) {
309 						marked_packet_rtt(txsi, e_t, tp,
310 						    &measurenext, &measurenext_len,
311 						    &rtt_bytes_adjust, OLD_TXSI);
312 					}
313 					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
314 					    txsegi_lnk);
315 					uma_zfree(txseginfo_zone, txsi);
316 					txsi = TAILQ_FIRST(&e_t->txsegi_q);
317 					continue;
318 				}
319 				if (rts == txsi->tx_ts &&
320 				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
321 					/*
322 					 * Segment received before sent!
323 					 * Something is wrong with the received
324 					 * timestamps so increment errors. If
325 					 * this keeps up we will ignore
326 					 * timestamps.
327 					 */
328 					e_t->timestamp_errors++;
329 				}
330 			}
331 			/*
332 			 * Acknowledging a sequence number before this txsi.
333 			 * If it is an old txsi that may have had the same seq
334 			 * numbers, it should have been removed if time stamps
335 			 * are being used.
336 			 */
337 			if (SEQ_LEQ(ack, txsi->seq))
338 				break; /* Before first packet in txsi. */
339 
340 			/*
341 			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
342 			 * past this point.
343 			 *
344 			 * If delayed acks are being used, an acknowledgement
345 			 * for a single segment will have been delayed by the
346 			 * receiver and will yield an inaccurate measurement. In
347 			 * this case, we only make the measurement if more than
348 			 * one segment is being acknowledged or sack is
349 			 * currently being used.
350 			 */
351 			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
352 				/* Make an accurate new measurement. */
353 				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
354 
355 				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
356 					e_t->minrtt = e_t->rtt;
357 
358 				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
359 					e_t->maxrtt = e_t->rtt;
360 			}
361 
362 			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
363 				marked_packet_rtt(txsi, e_t, tp,
364 				    &measurenext, &measurenext_len,
365 				    &rtt_bytes_adjust, CORRECT_ACK);
366 
367 			if (txsi->flags & TXSI_TSO) {
368 				if (txsi->len > acked) {
369 					txsi->len -= acked;
370 					/*
371 					 * This presumes ack for first bytes in
372 					 * txsi, this may not be true but it
373 					 * shouldn't cause problems for the
374 					 * timing.
375 					 *
376 					 * We remeasure RTT even though we only
377 					 * have a single txsi. The rationale
378 					 * behind this is that it is better to
379 					 * have a slightly inaccurate
380 					 * measurement than no additional
381 					 * measurement for the rest of the bulk
382 					 * transfer. Since TSO is only used on
383 					 * high speed interface cards, so the
384 					 * packets should be transmitted at line
385 					 * rate back to back with little
386 					 * difference in transmission times (in
387 					 * ticks).
388 					 */
389 					txsi->seq += acked;
390 					/*
391 					 * Reset txsi measure flag so we don't
392 					 * use it for another RTT measurement.
393 					 */
394 					txsi->flags &= ~TXSI_RTT_MEASURE_START;
395 					/*
396 					 * There is still more data to be acked
397 					 * from tso bulk transmission, so we
398 					 * won't remove it from the TAILQ yet.
399 					 */
400 					break;
401 				}
402 				txsi->len = 0;
403 			}
404 
405 			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
406 			uma_zfree(txseginfo_zone, txsi);
407 			break;
408 		}
409 
410 		if (measurenext) {
411 			/*
412 			 * We need to do a RTT measurement. It won't be the best
413 			 * if we do it here.
414 			 */
415 			marked_packet_rtt(txsi, e_t, tp,
416 			    &measurenext, &measurenext_len,
417 			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
418 		}
419 	}
420 
421 	return (0);
422 }
423 
424 /*
425  * Add information about a transmitted segment to a list.
426  * This is called via the helper hook in tcp_output.c
427  */
428 static int
429 ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
430     void *ctx_data, void *hdata, struct osd *hosd)
431 {
432 	struct ertt *e_t;
433 	struct tcpcb *tp;
434 	struct tcphdr *th;
435 	struct tcpopt *to;
436 	struct tcp_hhook_data *thdp;
437 	struct txseginfo *txsi;
438 	uint32_t len;
439 	int tso;
440 
441 	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
442 	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
443 
444 	e_t = (struct ertt *)hdata;
445 	thdp = ctx_data;
446 	tp = thdp->tp;
447 	th = thdp->th;
448 	to = thdp->to;
449 	len = thdp->len;
450 	tso = thdp->tso;
451 
452 	INP_WLOCK_ASSERT(tptoinpcb(tp));
453 
454 	if (len > 0) {
455 		txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
456 		if (txsi != NULL) {
457 			/* Construct txsi setting the necessary flags. */
458 			txsi->flags = 0; /* Needs to be initialised. */
459 			txsi->seq = ntohl(th->th_seq);
460 			txsi->len = len;
461 			if (tso)
462 				txsi->flags |= TXSI_TSO;
463 			else if (e_t->flags & ERTT_TSO_DISABLED) {
464 				tp->t_flags |= TF_TSO;
465 				e_t->flags &= ~ERTT_TSO_DISABLED;
466 			}
467 
468 			if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
469 				e_t->bytes_tx_in_rtt += len;
470 			} else {
471 				txsi->flags |= TXSI_RTT_MEASURE_START;
472 				e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
473 				e_t->bytes_tx_in_rtt = len;
474 			}
475 
476 			if (((tp->t_flags & TF_NOOPT) == 0) &&
477 			    (to->to_flags & TOF_TS)) {
478 				txsi->tx_ts = ntohl(to->to_tsval) -
479 				    tp->ts_offset;
480 				txsi->rx_ts = ntohl(to->to_tsecr);
481 			} else {
482 				txsi->tx_ts = tcp_ts_getticks();
483 				txsi->rx_ts = 0; /* No received time stamp. */
484 			}
485 			TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
486 		}
487 	}
488 
489 	return (0);
490 }
491 
492 static int
493 ertt_mod_init(void)
494 {
495 
496 	txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
497 	    NULL, NULL, NULL, NULL, 0, 0);
498 
499 	return (0);
500 }
501 
502 static int
503 ertt_mod_destroy(void)
504 {
505 
506 	uma_zdestroy(txseginfo_zone);
507 
508 	return (0);
509 }
510 
511 static int
512 ertt_uma_ctor(void *mem, int size, void *arg, int flags)
513 {
514 	struct ertt *e_t;
515 
516 	e_t = mem;
517 
518 	TAILQ_INIT(&e_t->txsegi_q);
519 	e_t->timestamp_errors = 0;
520 	e_t->minrtt = 0;
521 	e_t->maxrtt = 0;
522 	e_t->rtt = 0;
523 	e_t->flags = 0;
524 	e_t->dlyack_rx = 0;
525 	e_t->bytes_tx_in_rtt = 0;
526 	e_t->markedpkt_rtt = 0;
527 
528 	return (0);
529 }
530 
531 static void
532 ertt_uma_dtor(void *mem, int size, void *arg)
533 {
534 	struct ertt *e_t;
535 	struct txseginfo *n_txsi, *txsi;
536 
537 	e_t = mem;
538 	txsi = TAILQ_FIRST(&e_t->txsegi_q);
539 	while (txsi != NULL) {
540 		n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
541 		uma_zfree(txseginfo_zone, txsi);
542 		txsi = n_txsi;
543 	}
544 }
545 
546 KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
547     ertt_uma_ctor, ertt_uma_dtor);
548