/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent Inc. All rights reserved.
 */

/*
 * This file contains functions related to TCP time wait processing.  Also
 * refer to the time wait handling comments in tcp_impl.h.
 */

#include <sys/types.h>
#include <sys/strsun.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <sys/callo.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/tcp_cluster.h>

static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *);

/*
 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
 * Running it every 5 seconds seems to give the best results.
 */
#define	TCP_TIME_WAIT_DELAY	((hrtime_t)5 * NANOSEC)

/*
 * Remove a connection from the list of detached TIME_WAIT connections.
 * Returns B_FALSE if the connection is no longer on the list, i.e. it was
 * already removed by an earlier call to tcp_time_wait_remove(); otherwise
 * returns B_TRUE.
 */
boolean_t
tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
{
	boolean_t locked = B_FALSE;

	if (tcp_time_wait == NULL) {
		tcp_time_wait = *((tcp_squeue_priv_t **)
		    squeue_getprivate(tcp->tcp_connp->conn_sqp,
		    SQPRIVATE_TCP));
		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
		locked = B_TRUE;
	} else {
		ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
	}

	/* 0 means that the tcp_t has not been added to the time wait list. */
	if (tcp->tcp_time_wait_expire == 0) {
		ASSERT(tcp->tcp_time_wait_next == NULL);
		ASSERT(tcp->tcp_time_wait_prev == NULL);
		if (locked)
			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
		return (B_FALSE);
	}
	ASSERT(TCP_IS_DETACHED(tcp));
	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);

	if (tcp == tcp_time_wait->tcp_time_wait_head) {
		ASSERT(tcp->tcp_time_wait_prev == NULL);
		tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
		if (tcp_time_wait->tcp_time_wait_head != NULL) {
			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
			    NULL;
		} else {
			tcp_time_wait->tcp_time_wait_tail = NULL;
		}
	} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
		ASSERT(tcp->tcp_time_wait_next == NULL);
		tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
	} else {
		ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
		tcp->tcp_time_wait_prev->tcp_time_wait_next =
		    tcp->tcp_time_wait_next;
		tcp->tcp_time_wait_next->tcp_time_wait_prev =
		    tcp->tcp_time_wait_prev;
	}
	tcp->tcp_time_wait_next = NULL;
	tcp->tcp_time_wait_prev = NULL;
	tcp->tcp_time_wait_expire = 0;

	if (locked)
		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
	return (B_TRUE);
}

/* Constants used for fast checking of a localhost address */
#if defined(_BIG_ENDIAN)
#define	IPv4_LOCALHOST	0x7f000000U
#define	IPv4_LH_MASK	0xffffff00U
#else
#define	IPv4_LOCALHOST	0x0000007fU
#define	IPv4_LH_MASK	0x00ffffffU
#endif

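/*
 * IS_LOCAL_HOST() is a cheap loopback test: an IPv4 local address must fall
 * within 127.0.0.0/24 (checked with the mask above), and an IPv6 local
 * address must be ::1.
 */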
#define	IS_LOCAL_HOST(x)	( \
	((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \
	((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \
	((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
	IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))

/*
 * Add a connection to the list of detached TIME_WAIT connections
 * and set its time to expire.
 */
void
tcp_time_wait_append(tcp_t *tcp)
{
	tcp_stack_t *tcps = tcp->tcp_tcps;
	squeue_t *sqp = tcp->tcp_connp->conn_sqp;
	tcp_squeue_priv_t *tcp_time_wait =
	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
	hrtime_t firetime = 0;

	tcp_timers_stop(tcp);

	/* Freed above */
	ASSERT(tcp->tcp_timer_tid == 0);
	ASSERT(tcp->tcp_ack_tid == 0);

	/* must have happened at the time of detaching the tcp */
	ASSERT(tcp->tcp_ptpahn == NULL);
	ASSERT(tcp->tcp_flow_stopped == 0);
	ASSERT(tcp->tcp_time_wait_next == NULL);
	ASSERT(tcp->tcp_time_wait_prev == NULL);
	ASSERT(tcp->tcp_time_wait_expire == 0);
	ASSERT(tcp->tcp_listener == NULL);

	tcp->tcp_time_wait_expire = ddi_get_lbolt64();
	if (IS_LOCAL_HOST(tcp)) {
		/*
		 * This is the fastpath for handling localhost connections.
		 * Since we don't have to worry about packets on the localhost
		 * showing up after a long network delay, we want to expire
		 * these quickly so the port range on the localhost doesn't
		 * get starved by short-running, local apps.
		 *
		 * Leave tcp_time_wait_expire at the current time.  This
		 * essentially means the connection is expired now and it will
		 * clean up the next time tcp_time_wait_collector runs.  We
		 * set firetime to use a short delay so that if we have to
		 * start a tcp_time_wait_collector thread below, it runs soon
		 * instead of after a delay of time_wait_interval.  firetime
		 * being set to a non-0 value is also our indicator that we
		 * should add this connection to the head of the time wait
		 * list (since we are already expired), so that it's sure to
		 * get cleaned up on the next run of tcp_time_wait_collector
		 * (which expects the entries to appear in time-order and
		 * stops when it hits the first non-expired entry).
		 */
		firetime = TCP_TIME_WAIT_DELAY;
	} else {
		/*
		 * Since tcp_time_wait_expire is lbolt64, it should not wrap
		 * around in practice, hence it cannot be 0.  Note that zero
		 * means that the tcp_t is not on the TIME_WAIT list.
		 */
		tcp->tcp_time_wait_expire += MSEC_TO_TICK(
		    tcps->tcps_time_wait_interval);
	}

	ASSERT(TCP_IS_DETACHED(tcp));
	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
	ASSERT(tcp->tcp_time_wait_next == NULL);
	ASSERT(tcp->tcp_time_wait_prev == NULL);
	TCP_DBGSTAT(tcps, tcp_time_wait);

	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
	if (tcp_time_wait->tcp_time_wait_head == NULL) {
		ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
		tcp_time_wait->tcp_time_wait_head = tcp;

		/*
		 * Even if the list was empty before, there may be a timer
		 * running since a tcp_t can be removed from the list
		 * in other places, such as tcp_clean_death().  So check if
		 * a timer is needed.
		 */
		if (tcp_time_wait->tcp_time_wait_tid == 0) {
			if (firetime == 0)
				firetime = (hrtime_t)
				    (tcps->tcps_time_wait_interval + 1) *
				    MICROSEC;

			tcp_time_wait->tcp_time_wait_tid =
			    timeout_generic(CALLOUT_NORMAL,
			    tcp_time_wait_collector, sqp, firetime,
			    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
		}
		tcp_time_wait->tcp_time_wait_tail = tcp;
	} else {
		/*
		 * The list is not empty, so a timer must be running.  If it
		 * is not, tcp_time_wait_collector() must be running on this
		 * tcp_time_wait list at the same time.
		 */
		ASSERT(tcp_time_wait->tcp_time_wait_tid != 0 ||
		    tcp_time_wait->tcp_time_wait_running);
		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
		ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
		    TCPS_TIME_WAIT);

		if (firetime == 0) {
			/* add at end */
			tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next =
			    tcp;
			tcp->tcp_time_wait_prev =
			    tcp_time_wait->tcp_time_wait_tail;
			tcp_time_wait->tcp_time_wait_tail = tcp;
		} else {
			/* add at head */
			tcp->tcp_time_wait_next =
			    tcp_time_wait->tcp_time_wait_head;
			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
			    tcp;
			tcp_time_wait->tcp_time_wait_head = tcp;
		}
	}
	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}

/*
 * Wrapper to call tcp_close_detached() via squeue to clean up a TIME_WAIT
 * tcp_t.  Used in tcp_time_wait_collector().
 */
/* ARGSUSED */
static void
tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t *connp = (conn_t *)arg;
	tcp_t *tcp = connp->conn_tcp;

	ASSERT(tcp != NULL);
	if (tcp->tcp_state == TCPS_CLOSED) {
		return;
	}

	ASSERT((connp->conn_family == AF_INET &&
	    connp->conn_ipversion == IPV4_VERSION) ||
	    (connp->conn_family == AF_INET6 &&
	    (connp->conn_ipversion == IPV4_VERSION ||
	    connp->conn_ipversion == IPV6_VERSION)));
	ASSERT(!tcp->tcp_listener);

	ASSERT(TCP_IS_DETACHED(tcp));

	/*
	 * Because they have no upstream client to rebind or tcp_close()
	 * them later, we axe the connection here and now.
	 */
	tcp_close_detached(tcp);
}

/*
 * Blows away all tcps whose TIME_WAIT has expired.  List traversal is done
 * forwards from the head.  This walks all stack instances since
 * tcp_time_wait remains global across all stacks.
 */
/* ARGSUSED */
void
tcp_time_wait_collector(void *arg)
{
	tcp_t *tcp;
	int64_t now;
	mblk_t *mp;
	conn_t *connp;
	kmutex_t *lock;
	boolean_t removed;
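	/*
	 * Clustering hook; non-NULL only when a clustering subsystem has
	 * registered a disconnect callback, which forces the slow path
	 * below.
	 */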
	extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t,
	    uint8_t *, in_port_t, uint8_t *, in_port_t, void *);

	squeue_t *sqp = (squeue_t *)arg;
	tcp_squeue_priv_t *tcp_time_wait =
	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));

	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
	tcp_time_wait->tcp_time_wait_tid = 0;
#ifdef DEBUG
	tcp_time_wait->tcp_time_wait_running = B_TRUE;
#endif

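	/*
	 * Drain the cached tcp_t's on the free list.  The head's
	 * tcp_in_free_list flag is only set at the end of a collector run
	 * (see below), and new entries are prepended, so the drain happens
	 * only when the list has been idle for a full collector interval.
	 */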
	if (tcp_time_wait->tcp_free_list != NULL &&
	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
		TCP_G_STAT(tcp_freelist_cleanup);
		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
			tcp->tcp_time_wait_next = NULL;
			tcp_time_wait->tcp_free_list_cnt--;
			ASSERT(tcp->tcp_tcps == NULL);
			CONN_DEC_REF(tcp->tcp_connp);
		}
		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
	}

	/*
	 * In order to reap time waits reliably, we should use a
	 * source of time that is not adjustable by the user -- hence
	 * the call to ddi_get_lbolt64().
	 */
	now = ddi_get_lbolt64();
	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
		/*
		 * lbolt64 should not wrap around in practice, so we can
		 * do a direct comparison.
		 */
		if (now < tcp->tcp_time_wait_expire)
			break;

		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
		ASSERT(removed);

		connp = tcp->tcp_connp;
		ASSERT(connp->conn_fanout != NULL);
		lock = &connp->conn_fanout->connf_lock;
		/*
		 * This is essentially a TIME_WAIT reclaim fast path
		 * optimization for performance where the timewait collector
		 * checks under the fanout lock (so that no one else can get
		 * access to the conn_t) that the refcnt is 2, i.e. one for
		 * TCP and one for the classifier hash list.  If the refcnt
		 * is indeed 2, we can just remove the conn under the fanout
		 * lock and avoid cleaning up the conn under the squeue,
		 * provided that clustering callbacks are not enabled.  If
		 * clustering is enabled, we need to make the clustering
		 * callback before setting the CONDEMNED flag and after
		 * dropping all locks, so we forego this optimization and
		 * fall back to the slow path.  Also please see the comments
		 * in tcp_closei_local regarding the refcnt logic.
		 *
		 * Since we are holding the tcp_time_wait_lock, it's better
		 * not to block on the fanout lock: other connections can't
		 * add themselves to the time_wait list while we hold it.
		 * So we do a tryenter instead of mutex_enter.
		 */
		if (mutex_tryenter(lock)) {
			mutex_enter(&connp->conn_lock);
			if ((connp->conn_ref == 2) &&
			    (cl_inet_disconnect == NULL)) {
				ipcl_hash_remove_locked(connp,
				    connp->conn_fanout);
				/*
				 * Set the CONDEMNED flag now itself so that
				 * the refcnt cannot increase due to any
				 * walker.
				 */
				connp->conn_state_flags |= CONN_CONDEMNED;
				mutex_exit(lock);
				mutex_exit(&connp->conn_lock);
				if (tcp_time_wait->tcp_free_list_cnt <
				    tcp_free_list_max_cnt) {
					/* Add to head of tcp_free_list */
					mutex_exit(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp_cleanup(tcp);
					ASSERT(connp->conn_latch == NULL);
					ASSERT(connp->conn_policy == NULL);
					ASSERT(tcp->tcp_tcps == NULL);
					ASSERT(connp->conn_netstack == NULL);

					mutex_enter(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp->tcp_time_wait_next =
					    tcp_time_wait->tcp_free_list;
					tcp_time_wait->tcp_free_list = tcp;
					tcp_time_wait->tcp_free_list_cnt++;
					continue;
				} else {
					/* Do not add to tcp_free_list */
					mutex_exit(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp_bind_hash_remove(tcp);
					ixa_cleanup(tcp->tcp_connp->conn_ixa);
					tcp_ipsec_cleanup(tcp);
					CONN_DEC_REF(tcp->tcp_connp);
				}
			} else {
				CONN_INC_REF_LOCKED(connp);
				mutex_exit(lock);
				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
				mutex_exit(&connp->conn_lock);
				/*
				 * We can reuse the closemp here since the conn
				 * has detached (otherwise we wouldn't even be
				 * on the time_wait list).  tcp_closemp_used
				 * can safely be changed without taking a lock
				 * as no other thread can concurrently access
				 * it at this point in the connection
				 * lifecycle.
				 */

				if (tcp->tcp_closemp.b_prev == NULL)
					tcp->tcp_closemp_used = B_TRUE;
				else
					cmn_err(CE_PANIC,
					    "tcp_timewait_collector: "
					    "concurrent use of tcp_closemp: "
					    "connp %p tcp %p\n", (void *)connp,
					    (void *)tcp);

				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
				mp = &tcp->tcp_closemp;
				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
				    tcp_timewait_close, connp, NULL,
				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
			}
		} else {
			mutex_enter(&connp->conn_lock);
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
			mutex_exit(&connp->conn_lock);
			/*
			 * We can reuse the closemp here since the conn has
			 * detached (otherwise we wouldn't even be on the
			 * time_wait list).  tcp_closemp_used can safely
			 * be changed without taking a lock as no other
			 * thread can concurrently access it at this
			 * point in the connection lifecycle.
			 */

			if (tcp->tcp_closemp.b_prev == NULL)
				tcp->tcp_closemp_used = B_TRUE;
			else
				cmn_err(CE_PANIC, "tcp_timewait_collector: "
				    "concurrent use of tcp_closemp: "
				    "connp %p tcp %p\n", (void *)connp,
				    (void *)tcp);

			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
			mp = &tcp->tcp_closemp;
			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
			    tcp_timewait_close, connp, NULL,
			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
		}
		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
	}

	if (tcp_time_wait->tcp_free_list != NULL)
		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;

	/*
	 * If the time wait list is not empty and there is no timer running,
	 * restart it.
	 */
	if ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL &&
	    tcp_time_wait->tcp_time_wait_tid == 0) {
		hrtime_t firetime;

		/* shouldn't be necessary, but just in case */
		if (tcp->tcp_time_wait_expire < now)
			tcp->tcp_time_wait_expire = now;

		firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now);
		/* This ensures that we won't wake up too often. */
		firetime = MAX(TCP_TIME_WAIT_DELAY, firetime);
		tcp_time_wait->tcp_time_wait_tid =
		    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector,
		    sqp, firetime, CALLOUT_TCP_RESOLUTION,
		    CALLOUT_FLAG_ROUNDUP);
	}
#ifdef DEBUG
	tcp_time_wait->tcp_time_wait_running = B_FALSE;
#endif
	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}

/*
 * tcp_time_wait_processing() handles processing of incoming packets when
 * the tcp_t is in the TIME_WAIT state.
 *
 * A TIME_WAIT tcp_t that has an associated open TCP end point (not in
 * detached state) is never put on the time wait list.
 */
void
tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
    uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira)
{
	int32_t bytes_acked;
	int32_t gap;
	int32_t rgap;
	tcp_opt_t tcpopt;
	uint_t flags;
	uint32_t new_swnd = 0;
	conn_t *nconnp;
	conn_t *connp = tcp->tcp_connp;
	tcp_stack_t *tcps = tcp->tcp_tcps;

	BUMP_LOCAL(tcp->tcp_ibsegs);
	DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);

	flags = (unsigned int)tcpha->tha_flags & 0xFF;
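	/*
	 * Per RFC 7323, the window field of a segment carrying SYN is never
	 * scaled, so the send window scale is applied only to non-SYN
	 * segments.
	 */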
	new_swnd = ntohs(tcpha->tha_win) <<
	    ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);

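	/*
	 * If timestamps were negotiated on this connection, a non-RST
	 * segment must carry a timestamp option that passes the PAWS test
	 * (RFC 7323); otherwise drop it, challenging a PAWS failure with
	 * an ACK.
	 */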
	if (tcp->tcp_snd_ts_ok && !(tcpha->tha_flags & TH_RST)) {
		int options;
		if (tcp->tcp_snd_sack_ok)
			tcpopt.tcp = tcp;
		else
			tcpopt.tcp = NULL;
		options = tcp_parse_options(tcpha, &tcpopt);
		if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
			DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
			goto done;
		} else if (!tcp_paws_check(tcp, &tcpopt)) {
			tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt,
			    TH_ACK);
			goto done;
		}
	}
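	/*
	 * gap is the offset of the segment's starting sequence number from
	 * tcp_rnxt (negative means old, duplicate data); rgap is the
	 * receive window space remaining past the end of the segment
	 * (negative means the segment extends beyond the window).
	 */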
	gap = seg_seq - tcp->tcp_rnxt;
	rgap = tcp->tcp_rwnd - (gap + seg_len);
	if (gap < 0) {
		TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
		TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
		    (seg_len > -gap ? -gap : seg_len));
		seg_len += gap;
		if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
			if (flags & TH_RST) {
				goto done;
			}
			if ((flags & TH_FIN) && seg_len == -1) {
				/*
				 * When TCP receives a duplicate FIN in
				 * TIME_WAIT state, restart the 2 MSL timer.
				 * See page 73 in RFC 793.  Make sure this
				 * TCP is already on the TIME_WAIT list.  If
				 * not, just restart the timer.
				 */
				if (TCP_IS_DETACHED(tcp)) {
					if (tcp_time_wait_remove(tcp, NULL) ==
					    B_TRUE) {
						tcp_time_wait_append(tcp);
						TCP_DBGSTAT(tcps,
						    tcp_rput_time_wait);
					}
				} else {
					ASSERT(tcp != NULL);
					TCP_TIMER_RESTART(tcp,
					    tcps->tcps_time_wait_interval);
				}
				tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
				    tcp->tcp_rnxt, TH_ACK);
				goto done;
			}
			flags |= TH_ACK_NEEDED;
			seg_len = 0;
			goto process_ack;
		}

		/* Fix seg_seq, and chew the gap off the front. */
		seg_seq = tcp->tcp_rnxt;
	}

	if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
		/*
		 * Make sure that when we accept the connection, we pick
		 * an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the
		 * old connection.
		 *
		 * The next ISS generated is equal to tcp_iss_incr_extra
		 * + tcp_iss_incr/2 + other components depending on the
		 * value of tcp_strong_iss.  We pre-calculate the new
		 * ISS here and compare with tcp_snxt to determine if
		 * we need to make an adjustment to tcp_iss_incr_extra.
		 *
		 * The above calculation is ugly and is a
		 * waste of CPU cycles...
		 */
		uint32_t new_iss = tcps->tcps_iss_incr_extra;
		int32_t adj;
		ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;

		switch (tcps->tcps_strong_iss) {
		case 2: {
			/* Add time and MD5 components. */
			uint32_t answer[4];
			struct {
				uint32_t ports;
				in6_addr_t src;
				in6_addr_t dst;
			} arg;
			MD5_CTX context;

			mutex_enter(&tcps->tcps_iss_key_lock);
			context = tcps->tcps_iss_key;
			mutex_exit(&tcps->tcps_iss_key_lock);
			arg.ports = connp->conn_ports;
			/* We use MAPPED addresses in tcp_iss_init */
			arg.src = connp->conn_laddr_v6;
			arg.dst = connp->conn_faddr_v6;
			MD5Update(&context, (uchar_t *)&arg,
			    sizeof (arg));
			MD5Final((uchar_t *)answer, &context);
			answer[0] ^= answer[1] ^ answer[2] ^ answer[3];
			new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0];
			break;
		}
		case 1:
			/* Add time component and min random (i.e. 1). */
			new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1;
			break;
		default:
			/* Add only time component. */
			new_iss += (uint32_t)gethrestime_sec() *
			    tcps->tcps_iss_incr;
			break;
		}
		if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
			/*
			 * New ISS not guaranteed to be tcp_iss_incr/2
			 * ahead of the current tcp_snxt, so add the
			 * difference to tcp_iss_incr_extra.
			 */
			tcps->tcps_iss_incr_extra += adj;
		}
		/*
		 * If tcp_clean_death() cannot perform the task now,
		 * drop the SYN packet and let the other side re-xmit.
		 * Otherwise pass the SYN packet back in, since the
		 * old tcp state has been cleaned up or freed.
		 */
		if (tcp_clean_death(tcp, 0) == -1)
			goto done;
		nconnp = ipcl_classify(mp, ira, ipst);
		if (nconnp != NULL) {
			TCP_STAT(tcps, tcp_time_wait_syn_success);
			/* Drops ref on nconnp */
			tcp_reinput(nconnp, mp, ira, ipst);
			return;
		}
		goto done;
	}

	/*
	 * A negative rgap means the segment extends past the receive
	 * window; -rgap is the number of bytes received out of window.
	 */
	if (rgap < 0) {
		TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
		TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
		/* Fix seg_len and make sure there is something left. */
		seg_len += rgap;
		if (seg_len <= 0) {
			if (flags & TH_RST) {
				goto done;
			}
			flags |= TH_ACK_NEEDED;
			seg_len = 0;
			goto process_ack;
		}
	}
	/*
	 * Check whether we can update tcp_ts_recent.  This test is from
	 * RFC 7323, section 5.3.
	 */
	if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
	    TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
	    SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
		tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
	}

	if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
		/* Always ack out of order packets */
		flags |= TH_ACK_NEEDED;
		seg_len = 0;
	} else if (seg_len > 0) {
		TCPS_BUMP_MIB(tcps, tcpInClosed);
		TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
		TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
	}
	if (flags & TH_RST) {
		(void) tcp_clean_death(tcp, 0);
		goto done;
	}
	if (flags & TH_SYN) {
		tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
		    TH_RST|TH_ACK);
		/*
		 * Do not delete the TCP structure if it is in
		 * TIME_WAIT state.  Refer to RFC 1122, 4.2.2.13.
		 */
		goto done;
	}
process_ack:
	if (flags & TH_ACK) {
		bytes_acked = (int)(seg_ack - tcp->tcp_suna);
		if (bytes_acked <= 0) {
			if (bytes_acked == 0 && seg_len == 0 &&
			    new_swnd == tcp->tcp_swnd)
				TCPS_BUMP_MIB(tcps, tcpInDupAck);
		} else {
			/* Acks something not sent */
			flags |= TH_ACK_NEEDED;
		}
	}
	if (flags & TH_ACK_NEEDED) {
		/*
		 * Time to send an ack for some reason.
		 */
		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
		    tcp->tcp_rnxt, TH_ACK);
	}
done:
	freemsg(mp);
}