1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/strlog.h> 28 #include <sys/policy.h> 29 #include <sys/strsun.h> 30 #include <sys/squeue_impl.h> 31 #include <sys/squeue.h> 32 33 #include <inet/common.h> 34 #include <inet/ip.h> 35 #include <inet/tcp.h> 36 #include <inet/tcp_impl.h> 37 38 /* Control whether TCP can enter defensive mode when under memory pressure. */ 39 static boolean_t tcp_do_reclaim = B_TRUE; 40 41 /* 42 * Routines related to the TCP_IOC_ABORT_CONN ioctl command. 43 * 44 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting 45 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure 46 * (defined in tcp.h) needs to be filled in and passed into the kernel 47 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t 48 * structure contains the four-tuple of a TCP connection and a range of TCP 49 * states (specified by ac_start and ac_end). The use of wildcard addresses 50 * and ports is allowed. 
Connections with a matching four tuple and a state 51 * within the specified range will be aborted. The valid states for the 52 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT, 53 * inclusive. 54 * 55 * An application which has its connection aborted by this ioctl will receive 56 * an error that is dependent on the connection state at the time of the abort. 57 * If the connection state is < TCPS_TIME_WAIT, an application should behave as 58 * though a RST packet has been received. If the connection state is equal to 59 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel 60 * and all resources associated with the connection will be freed. 61 */ 62 static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); 63 static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); 64 static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, 65 ip_recv_attr_t *dummy); 66 static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps); 67 void tcp_ioctl_abort_conn(queue_t *, mblk_t *); 68 static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, 69 boolean_t, tcp_stack_t *); 70 71 /* 72 * Macros used for accessing the different types of sockaddr 73 * structures inside a tcp_ioc_abort_conn_t. 
/*
 * Macros used for accessing the different types of sockaddr
 * structures inside a tcp_ioc_abort_conn_t.
 *
 * The V4 variants view ac_local/ac_remote as a sin_t, the V6 variants as
 * a sin6_t.  All of them expand to lvalues, so they can be used both to
 * read and to fill in a request.
 */
#define	TCP_AC_V4LADDR(acp)	((sin_t *)&(acp)->ac_local)
#define	TCP_AC_V4RADDR(acp)	((sin_t *)&(acp)->ac_remote)
#define	TCP_AC_V4LOCAL(acp)	(TCP_AC_V4LADDR(acp)->sin_addr.s_addr)
#define	TCP_AC_V4REMOTE(acp)	(TCP_AC_V4RADDR(acp)->sin_addr.s_addr)
#define	TCP_AC_V4LPORT(acp)	(TCP_AC_V4LADDR(acp)->sin_port)
#define	TCP_AC_V4RPORT(acp)	(TCP_AC_V4RADDR(acp)->sin_port)
#define	TCP_AC_V6LADDR(acp)	((sin6_t *)&(acp)->ac_local)
#define	TCP_AC_V6RADDR(acp)	((sin6_t *)&(acp)->ac_remote)
#define	TCP_AC_V6LOCAL(acp)	(TCP_AC_V6LADDR(acp)->sin6_addr)
#define	TCP_AC_V6REMOTE(acp)	(TCP_AC_V6RADDR(acp)->sin6_addr)
#define	TCP_AC_V6LPORT(acp)	(TCP_AC_V6LADDR(acp)->sin6_port)
#define	TCP_AC_V6RPORT(acp)	(TCP_AC_V6RADDR(acp)->sin6_port)

/*
 * Return the correct error code to mimic the behavior
 * of a connection reset.
 *
 * 'state' is a TCPS_* value and 'err' is an lvalue that receives the
 * result: ECONNREFUSED for the SYN states, ECONNRESET for the
 * established/half-closed states, 0 for CLOSING, LAST_ACK and TIME_WAIT,
 * and ENXIO for anything else.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a
 * single statement: "if (c) TCP_AC_GET_ERRCODE(s, e); else ..." parses
 * as intended.
 */
#define	TCP_AC_GET_ERRCODE(state, err)					\
	do {								\
		switch ((state)) {					\
		case TCPS_SYN_SENT:					\
		case TCPS_SYN_RCVD:					\
			(err) = ECONNREFUSED;				\
			break;						\
		case TCPS_ESTABLISHED:					\
		case TCPS_FIN_WAIT_1:					\
		case TCPS_FIN_WAIT_2:					\
		case TCPS_CLOSE_WAIT:					\
			(err) = ECONNRESET;				\
			break;						\
		case TCPS_CLOSING:					\
		case TCPS_LAST_ACK:					\
		case TCPS_TIME_WAIT:					\
			(err) = 0;					\
			break;						\
		default:						\
			(err) = ENXIO;					\
			break;						\
		}							\
	} while (0)

/*
 * Check if a tcp structure matches the info in acp.
 *
 * The address family is taken from acp->ac_local.ss_family: AF_INET
 * selects the IPv4 comparison, anything else the IPv6 one.  An
 * unspecified address (INADDR_ANY / the IPv6 unspecified address) or a
 * zero port in acp acts as a wildcard for that field.  In addition the
 * connection's tcp_state must lie in [ac_start, ac_end].
 *
 * Note that the arguments are evaluated more than once.
 */
#define	TCP_AC_ADDR_MATCH(acp, connp, tcp)				\
	(((acp)->ac_local.ss_family == AF_INET) ?			\
	((TCP_AC_V4LOCAL((acp)) == INADDR_ANY ||			\
	TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) &&		\
	(TCP_AC_V4REMOTE((acp)) == INADDR_ANY ||			\
	TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) &&		\
	(TCP_AC_V4LPORT((acp)) == 0 ||					\
	TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) &&		\
	(TCP_AC_V4RPORT((acp)) == 0 ||					\
	TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) &&		\
	(acp)->ac_start <= (tcp)->tcp_state &&				\
	(acp)->ac_end >= (tcp)->tcp_state) :				\
	((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) ||		\
	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)),			\
	&(connp)->conn_laddr_v6)) &&					\
	(IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) ||		\
	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)),			\
	&(connp)->conn_faddr_v6)) &&					\
	(TCP_AC_V6LPORT((acp)) == 0 ||					\
	TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) &&		\
	(TCP_AC_V6RPORT((acp)) == 0 ||					\
	TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) &&		\
	(acp)->ac_start <= (tcp)->tcp_state &&				\
	(acp)->ac_end >= (tcp)->tcp_state))

/*
 * Full match: the zone filter first (ALL_ZONES in acp matches every
 * connection), then the address/port/state comparison above.
 */
#define	TCP_AC_MATCH(acp, connp, tcp)					\
	(((acp)->ac_zoneid == ALL_ZONES ||				\
	(acp)->ac_zoneid == (connp)->conn_zoneid) ?			\
	TCP_AC_ADDR_MATCH(acp, connp, tcp) : 0)
150 */ 151 static mblk_t * 152 tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) 153 { 154 mblk_t *mp; 155 tcp_ioc_abort_conn_t *tacp; 156 157 mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO); 158 if (mp == NULL) 159 return (NULL); 160 161 *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; 162 tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + 163 sizeof (uint32_t)); 164 165 tacp->ac_start = acp->ac_start; 166 tacp->ac_end = acp->ac_end; 167 tacp->ac_zoneid = acp->ac_zoneid; 168 169 if (acp->ac_local.ss_family == AF_INET) { 170 tacp->ac_local.ss_family = AF_INET; 171 tacp->ac_remote.ss_family = AF_INET; 172 TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4; 173 TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4; 174 TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport; 175 TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport; 176 } else { 177 tacp->ac_local.ss_family = AF_INET6; 178 tacp->ac_remote.ss_family = AF_INET6; 179 TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6; 180 TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6; 181 TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport; 182 TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport; 183 } 184 mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); 185 return (mp); 186 } 187 188 /* 189 * Print a tcp_ioc_abort_conn_t structure. 
190 */ 191 static void 192 tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) 193 { 194 char lbuf[128]; 195 char rbuf[128]; 196 sa_family_t af; 197 in_port_t lport, rport; 198 ushort_t logflags; 199 200 af = acp->ac_local.ss_family; 201 202 if (af == AF_INET) { 203 (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp), 204 lbuf, 128); 205 (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp), 206 rbuf, 128); 207 lport = ntohs(TCP_AC_V4LPORT(acp)); 208 rport = ntohs(TCP_AC_V4RPORT(acp)); 209 } else { 210 (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp), 211 lbuf, 128); 212 (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp), 213 rbuf, 128); 214 lport = ntohs(TCP_AC_V6LPORT(acp)); 215 rport = ntohs(TCP_AC_V6RPORT(acp)); 216 } 217 218 logflags = SL_TRACE | SL_NOTE; 219 /* 220 * Don't print this message to the console if the operation was done 221 * to a non-global zone. 222 */ 223 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 224 logflags |= SL_CONSOLE; 225 (void) strlog(TCP_MOD_ID, 0, 1, logflags, 226 "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, " 227 "start = %d, end = %d\n", lbuf, lport, rbuf, rport, 228 acp->ac_start, acp->ac_end); 229 } 230 231 /* 232 * Called using SQ_FILL when a message built using 233 * tcp_ioctl_abort_build_msg is put into a queue. 234 * Note that when we get here there is no wildcard in acp any more. 235 */ 236 /* ARGSUSED2 */ 237 static void 238 tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, 239 ip_recv_attr_t *dummy) 240 { 241 conn_t *connp = (conn_t *)arg; 242 tcp_t *tcp = connp->conn_tcp; 243 tcp_ioc_abort_conn_t *acp; 244 245 /* 246 * Don't accept any input on a closed tcp as this TCP logically does 247 * not exist on the system. Don't proceed further with this TCP. 248 * For eg. this packet could trigger another close of this tcp 249 * which would be disastrous for tcp_refcnt. tcp_close_detached / 250 * tcp_clean_death / tcp_closei_local must be called at most once 251 * on a TCP. 
252 */ 253 if (tcp->tcp_state == TCPS_CLOSED || 254 tcp->tcp_state == TCPS_BOUND) { 255 freemsg(mp); 256 return; 257 } 258 259 acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); 260 if (tcp->tcp_state <= acp->ac_end) { 261 /* 262 * If we get here, we are already on the correct 263 * squeue. This ioctl follows the following path 264 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn 265 * ->tcp_ioctl_abort->squeue_enter (if on a 266 * different squeue) 267 */ 268 int errcode; 269 270 TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode); 271 (void) tcp_clean_death(tcp, errcode); 272 } 273 freemsg(mp); 274 } 275 276 /* 277 * Abort all matching connections on a hash chain. 278 */ 279 static int 280 tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count, 281 boolean_t exact, tcp_stack_t *tcps) 282 { 283 int nmatch, err = 0; 284 tcp_t *tcp; 285 MBLKP mp, last, listhead = NULL; 286 conn_t *tconnp; 287 connf_t *connfp; 288 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 289 290 connfp = &ipst->ips_ipcl_conn_fanout[index]; 291 292 startover: 293 nmatch = 0; 294 295 mutex_enter(&connfp->connf_lock); 296 for (tconnp = connfp->connf_head; tconnp != NULL; 297 tconnp = tconnp->conn_next) { 298 tcp = tconnp->conn_tcp; 299 /* 300 * We are missing a check on sin6_scope_id for linklocals here, 301 * but current usage is just for aborting based on zoneid 302 * for shared-IP zones. 303 */ 304 if (TCP_AC_MATCH(acp, tconnp, tcp)) { 305 CONN_INC_REF(tconnp); 306 mp = tcp_ioctl_abort_build_msg(acp, tcp); 307 if (mp == NULL) { 308 err = ENOMEM; 309 CONN_DEC_REF(tconnp); 310 break; 311 } 312 mp->b_prev = (mblk_t *)tcp; 313 314 if (listhead == NULL) { 315 listhead = mp; 316 last = mp; 317 } else { 318 last->b_next = mp; 319 last = mp; 320 } 321 nmatch++; 322 if (exact) 323 break; 324 } 325 326 /* Avoid holding lock for too long. 
*/ 327 if (nmatch >= 500) 328 break; 329 } 330 mutex_exit(&connfp->connf_lock); 331 332 /* Pass mp into the correct tcp */ 333 while ((mp = listhead) != NULL) { 334 listhead = listhead->b_next; 335 tcp = (tcp_t *)mp->b_prev; 336 mp->b_next = mp->b_prev = NULL; 337 SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, 338 tcp_ioctl_abort_handler, tcp->tcp_connp, NULL, 339 SQ_FILL, SQTAG_TCP_ABORT_BUCKET); 340 } 341 342 *count += nmatch; 343 if (nmatch >= 500 && err == 0) 344 goto startover; 345 return (err); 346 } 347 348 /* 349 * Abort all connections that matches the attributes specified in acp. 350 */ 351 static int 352 tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps) 353 { 354 sa_family_t af; 355 uint32_t ports; 356 uint16_t *pports; 357 int err = 0, count = 0; 358 boolean_t exact = B_FALSE; /* set when there is no wildcard */ 359 int index = -1; 360 ushort_t logflags; 361 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 362 363 af = acp->ac_local.ss_family; 364 365 if (af == AF_INET) { 366 if (TCP_AC_V4REMOTE(acp) != INADDR_ANY && 367 TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) { 368 pports = (uint16_t *)&ports; 369 pports[1] = TCP_AC_V4LPORT(acp); 370 pports[0] = TCP_AC_V4RPORT(acp); 371 exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY); 372 } 373 } else { 374 if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) && 375 TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) { 376 pports = (uint16_t *)&ports; 377 pports[1] = TCP_AC_V6LPORT(acp); 378 pports[0] = TCP_AC_V6RPORT(acp); 379 exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp)); 380 } 381 } 382 383 /* 384 * For cases where remote addr, local port, and remote port are non- 385 * wildcards, tcp_ioctl_abort_bucket will only be called once. 
386 */ 387 if (index != -1) { 388 err = tcp_ioctl_abort_bucket(acp, index, 389 &count, exact, tcps); 390 } else { 391 /* 392 * loop through all entries for wildcard case 393 */ 394 for (index = 0; 395 index < ipst->ips_ipcl_conn_fanout_size; 396 index++) { 397 err = tcp_ioctl_abort_bucket(acp, index, 398 &count, exact, tcps); 399 if (err != 0) 400 break; 401 } 402 } 403 404 logflags = SL_TRACE | SL_NOTE; 405 /* 406 * Don't print this message to the console if the operation was done 407 * to a non-global zone. 408 */ 409 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 410 logflags |= SL_CONSOLE; 411 (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: " 412 "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' ')); 413 if (err == 0 && count == 0) 414 err = ENOENT; 415 return (err); 416 } 417 418 /* 419 * Process the TCP_IOC_ABORT_CONN ioctl request. 420 */ 421 void 422 tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) 423 { 424 int err; 425 IOCP iocp; 426 MBLKP mp1; 427 sa_family_t laf, raf; 428 tcp_ioc_abort_conn_t *acp; 429 zone_t *zptr; 430 conn_t *connp = Q_TO_CONN(q); 431 zoneid_t zoneid = connp->conn_zoneid; 432 tcp_t *tcp = connp->conn_tcp; 433 tcp_stack_t *tcps = tcp->tcp_tcps; 434 435 iocp = (IOCP)mp->b_rptr; 436 437 if ((mp1 = mp->b_cont) == NULL || 438 iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) { 439 err = EINVAL; 440 goto out; 441 } 442 443 /* check permissions */ 444 if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { 445 err = EPERM; 446 goto out; 447 } 448 449 if (mp1->b_cont != NULL) { 450 freemsg(mp1->b_cont); 451 mp1->b_cont = NULL; 452 } 453 454 acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr; 455 laf = acp->ac_local.ss_family; 456 raf = acp->ac_remote.ss_family; 457 458 /* check that a zone with the supplied zoneid exists */ 459 if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) { 460 zptr = zone_find_by_id(zoneid); 461 if (zptr != NULL) { 462 zone_rele(zptr); 463 } else { 464 err = EINVAL; 465 
goto out; 466 } 467 } 468 469 /* 470 * For exclusive stacks we set the zoneid to zero 471 * to make TCP operate as if in the global zone. 472 */ 473 if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID) 474 acp->ac_zoneid = GLOBAL_ZONEID; 475 476 if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT || 477 acp->ac_start > acp->ac_end || laf != raf || 478 (laf != AF_INET && laf != AF_INET6)) { 479 err = EINVAL; 480 goto out; 481 } 482 483 tcp_ioctl_abort_dump(acp); 484 err = tcp_ioctl_abort(acp, tcps); 485 486 out: 487 if (mp1 != NULL) { 488 freemsg(mp1); 489 mp->b_cont = NULL; 490 } 491 492 if (err != 0) 493 miocnak(q, mp, 0, err); 494 else 495 miocack(q, mp, 0, 0); 496 } 497 498 /* 499 * Timeout function to reset the TCP stack variable tcps_reclaim to false. 500 */ 501 void 502 tcp_reclaim_timer(void *arg) 503 { 504 tcp_stack_t *tcps = (tcp_stack_t *)arg; 505 int64_t tot_conn = 0; 506 int i; 507 extern pgcnt_t lotsfree, needfree; 508 509 for (i = 0; i < tcps->tcps_sc_cnt; i++) 510 tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt; 511 512 /* 513 * This happens only when a stack is going away. tcps_reclaim_tid 514 * should not be reset to 0 when returning in this case. 515 */ 516 mutex_enter(&tcps->tcps_reclaim_lock); 517 if (!tcps->tcps_reclaim) { 518 mutex_exit(&tcps->tcps_reclaim_lock); 519 return; 520 } 521 522 if ((freemem >= lotsfree + needfree) || tot_conn < maxusers) { 523 tcps->tcps_reclaim = B_FALSE; 524 tcps->tcps_reclaim_tid = 0; 525 } else { 526 /* Stay in defensive mode and restart the timer */ 527 tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer, 528 tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period)); 529 } 530 mutex_exit(&tcps->tcps_reclaim_lock); 531 } 532 533 /* 534 * Kmem reclaim call back function. When the system is under memory 535 * pressure, we set the TCP stack variable tcps_reclaim to true. This 536 * variable is reset to false after tcps_reclaim_period msecs. 
During this 537 * period, TCP will be more aggressive in aborting connections not making 538 * progress, meaning retransmitting for some time (tcp_early_abort seconds). 539 * TCP will also not accept new connection request for those listeners whose 540 * q or q0 is not empty. 541 */ 542 /* ARGSUSED */ 543 void 544 tcp_conn_reclaim(void *arg) 545 { 546 netstack_handle_t nh; 547 netstack_t *ns; 548 tcp_stack_t *tcps; 549 extern pgcnt_t lotsfree, needfree; 550 551 if (!tcp_do_reclaim) 552 return; 553 554 /* 555 * The reclaim function may be called even when the system is not 556 * really under memory pressure. 557 */ 558 if (freemem >= lotsfree + needfree) 559 return; 560 561 netstack_next_init(&nh); 562 while ((ns = netstack_next(&nh)) != NULL) { 563 int i; 564 int64_t tot_conn = 0; 565 566 /* 567 * During boot time, the first netstack_t is created and 568 * initialized before TCP has registered with the netstack 569 * framework. If this reclaim function is called before TCP 570 * has finished its initialization, netstack_next() will 571 * return the first netstack_t (since its netstack_flags is 572 * not NSF_UNINIT). And its netstack_tcp will be NULL. We 573 * need to catch it. 574 * 575 * All subsequent netstack_t creation will not have this 576 * problem since the initialization is not finished until TCP 577 * has finished its own tcp_stack_t initialization. Hence 578 * netstack_next() will not return one with NULL netstack_tcp. 579 */ 580 if ((tcps = ns->netstack_tcp) == NULL) { 581 netstack_rele(ns); 582 continue; 583 } 584 585 /* 586 * Even if the system is under memory pressure, the reason may 587 * not be because of TCP activity. Check the number of 588 * connections in each stack. If the number exceeds the 589 * threshold (maxusers), turn on defensive mode. 
590 */ 591 for (i = 0; i < tcps->tcps_sc_cnt; i++) 592 tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt; 593 if (tot_conn < maxusers) { 594 netstack_rele(ns); 595 continue; 596 } 597 598 mutex_enter(&tcps->tcps_reclaim_lock); 599 if (!tcps->tcps_reclaim) { 600 tcps->tcps_reclaim = B_TRUE; 601 tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer, 602 tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period)); 603 TCP_STAT(tcps, tcp_reclaim_cnt); 604 } 605 mutex_exit(&tcps->tcps_reclaim_lock); 606 netstack_rele(ns); 607 } 608 netstack_next_fini(&nh); 609 } 610 611 /* 612 * Given a tcp_stack_t and a port (in host byte order), find a listener 613 * configuration for that port and return the ratio. 614 */ 615 uint32_t 616 tcp_find_listener_conf(tcp_stack_t *tcps, in_port_t port) 617 { 618 tcp_listener_t *tl; 619 uint32_t ratio = 0; 620 621 mutex_enter(&tcps->tcps_listener_conf_lock); 622 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; 623 tl = list_next(&tcps->tcps_listener_conf, tl)) { 624 if (tl->tl_port == port) { 625 ratio = tl->tl_ratio; 626 break; 627 } 628 } 629 mutex_exit(&tcps->tcps_listener_conf_lock); 630 return (ratio); 631 } 632 633 /* 634 * To remove all listener limit configuration in a tcp_stack_t. 635 */ 636 void 637 tcp_listener_conf_cleanup(tcp_stack_t *tcps) 638 { 639 tcp_listener_t *tl; 640 641 mutex_enter(&tcps->tcps_listener_conf_lock); 642 while ((tl = list_head(&tcps->tcps_listener_conf)) != NULL) { 643 list_remove(&tcps->tcps_listener_conf, tl); 644 kmem_free(tl, sizeof (tcp_listener_t)); 645 } 646 mutex_destroy(&tcps->tcps_listener_conf_lock); 647 list_destroy(&tcps->tcps_listener_conf); 648 } 649 650 /* 651 * When a CPU is added, we need to allocate the per CPU stats struct. 
652 */ 653 void 654 tcp_stack_cpu_add(tcp_stack_t *tcps, processorid_t cpu_seqid) 655 { 656 int i; 657 658 if (cpu_seqid < tcps->tcps_sc_cnt) 659 return; 660 for (i = tcps->tcps_sc_cnt; i <= cpu_seqid; i++) { 661 ASSERT(tcps->tcps_sc[i] == NULL); 662 tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t), 663 KM_SLEEP); 664 } 665 membar_producer(); 666 tcps->tcps_sc_cnt = cpu_seqid + 1; 667 } 668 669 /* 670 * Diagnostic routine used to return a string associated with the tcp state. 671 * Note that if the caller does not supply a buffer, it will use an internal 672 * static string. This means that if multiple threads call this function at 673 * the same time, output can be corrupted... Note also that this function 674 * does not check the size of the supplied buffer. The caller has to make 675 * sure that it is big enough. 676 */ 677 char * 678 tcp_display(tcp_t *tcp, char *sup_buf, char format) 679 { 680 char buf1[30]; 681 static char priv_buf[INET6_ADDRSTRLEN * 2 + 80]; 682 char *buf; 683 char *cp; 684 in6_addr_t local, remote; 685 char local_addrbuf[INET6_ADDRSTRLEN]; 686 char remote_addrbuf[INET6_ADDRSTRLEN]; 687 conn_t *connp; 688 689 if (sup_buf != NULL) 690 buf = sup_buf; 691 else 692 buf = priv_buf; 693 694 if (tcp == NULL) 695 return ("NULL_TCP"); 696 697 connp = tcp->tcp_connp; 698 switch (tcp->tcp_state) { 699 case TCPS_CLOSED: 700 cp = "TCP_CLOSED"; 701 break; 702 case TCPS_IDLE: 703 cp = "TCP_IDLE"; 704 break; 705 case TCPS_BOUND: 706 cp = "TCP_BOUND"; 707 break; 708 case TCPS_LISTEN: 709 cp = "TCP_LISTEN"; 710 break; 711 case TCPS_SYN_SENT: 712 cp = "TCP_SYN_SENT"; 713 break; 714 case TCPS_SYN_RCVD: 715 cp = "TCP_SYN_RCVD"; 716 break; 717 case TCPS_ESTABLISHED: 718 cp = "TCP_ESTABLISHED"; 719 break; 720 case TCPS_CLOSE_WAIT: 721 cp = "TCP_CLOSE_WAIT"; 722 break; 723 case TCPS_FIN_WAIT_1: 724 cp = "TCP_FIN_WAIT_1"; 725 break; 726 case TCPS_CLOSING: 727 cp = "TCP_CLOSING"; 728 break; 729 case TCPS_LAST_ACK: 730 cp = "TCP_LAST_ACK"; 731 break; 732 case 
TCPS_FIN_WAIT_2: 733 cp = "TCP_FIN_WAIT_2"; 734 break; 735 case TCPS_TIME_WAIT: 736 cp = "TCP_TIME_WAIT"; 737 break; 738 default: 739 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); 740 cp = buf1; 741 break; 742 } 743 switch (format) { 744 case DISP_ADDR_AND_PORT: 745 if (connp->conn_ipversion == IPV4_VERSION) { 746 /* 747 * Note that we use the remote address in the tcp_b 748 * structure. This means that it will print out 749 * the real destination address, not the next hop's 750 * address if source routing is used. 751 */ 752 IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local); 753 IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote); 754 755 } else { 756 local = connp->conn_laddr_v6; 757 remote = connp->conn_faddr_v6; 758 } 759 (void) inet_ntop(AF_INET6, &local, local_addrbuf, 760 sizeof (local_addrbuf)); 761 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, 762 sizeof (remote_addrbuf)); 763 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", 764 local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf, 765 ntohs(connp->conn_fport), cp); 766 break; 767 case DISP_PORT_ONLY: 768 default: 769 (void) mi_sprintf(buf, "[%u, %u] %s", 770 ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp); 771 break; 772 } 773 774 return (buf); 775 } 776