1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/strlog.h> 29 #include <sys/policy.h> 30 #include <sys/strsun.h> 31 #include <sys/squeue_impl.h> 32 #include <sys/squeue.h> 33 34 #include <inet/common.h> 35 #include <inet/ip.h> 36 #include <inet/tcp.h> 37 #include <inet/tcp_impl.h> 38 39 /* Control whether TCP can enter defensive mode when under memory pressure. */ 40 static boolean_t tcp_do_reclaim = B_TRUE; 41 42 /* 43 * Routines related to the TCP_IOC_ABORT_CONN ioctl command. 44 * 45 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting 46 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure 47 * (defined in tcp.h) needs to be filled in and passed into the kernel 48 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t 49 * structure contains the four-tuple of a TCP connection and a range of TCP 50 * states (specified by ac_start and ac_end). The use of wildcard addresses 51 * and ports is allowed. 
Connections with a matching four tuple and a state 52 * within the specified range will be aborted. The valid states for the 53 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT, 54 * inclusive. 55 * 56 * An application which has its connection aborted by this ioctl will receive 57 * an error that is dependent on the connection state at the time of the abort. 58 * If the connection state is < TCPS_TIME_WAIT, an application should behave as 59 * though a RST packet has been received. If the connection state is equal to 60 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel 61 * and all resources associated with the connection will be freed. 62 */ 63 static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); 64 static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); 65 static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, 66 ip_recv_attr_t *dummy); 67 static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps); 68 void tcp_ioctl_abort_conn(queue_t *, mblk_t *); 69 static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, 70 boolean_t, tcp_stack_t *); 71 72 /* 73 * Macros used for accessing the different types of sockaddr 74 * structures inside a tcp_ioc_abort_conn_t. 
 */

/* IPv4 views of the ac_local/ac_remote sockaddr_storage members. */
#define	TCP_AC_V4LADDR(acp) ((sin_t *)&(acp)->ac_local)
#define	TCP_AC_V4RADDR(acp) ((sin_t *)&(acp)->ac_remote)
#define	TCP_AC_V4LOCAL(acp) (TCP_AC_V4LADDR(acp)->sin_addr.s_addr)
#define	TCP_AC_V4REMOTE(acp) (TCP_AC_V4RADDR(acp)->sin_addr.s_addr)
#define	TCP_AC_V4LPORT(acp) (TCP_AC_V4LADDR(acp)->sin_port)
#define	TCP_AC_V4RPORT(acp) (TCP_AC_V4RADDR(acp)->sin_port)

/* IPv6 views of the same storage. */
#define	TCP_AC_V6LADDR(acp) ((sin6_t *)&(acp)->ac_local)
#define	TCP_AC_V6RADDR(acp) ((sin6_t *)&(acp)->ac_remote)
#define	TCP_AC_V6LOCAL(acp) (TCP_AC_V6LADDR(acp)->sin6_addr)
#define	TCP_AC_V6REMOTE(acp) (TCP_AC_V6RADDR(acp)->sin6_addr)
#define	TCP_AC_V6LPORT(acp) (TCP_AC_V6LADDR(acp)->sin6_port)
#define	TCP_AC_V6RPORT(acp) (TCP_AC_V6RADDR(acp)->sin6_port)

/*
 * Return the correct error code to mimic the behavior
 * of a connection reset:
 *   - pre-established states look like a refused connection,
 *   - established/teardown states look like a reset,
 *   - passive-close tail states report no error,
 *   - anything else is ENXIO.
 */
#define	TCP_AC_GET_ERRCODE(state, err) {	\
	switch ((state)) {			\
	case TCPS_SYN_SENT:			\
	case TCPS_SYN_RCVD:			\
		(err) = ECONNREFUSED;		\
		break;				\
	case TCPS_ESTABLISHED:			\
	case TCPS_FIN_WAIT_1:			\
	case TCPS_FIN_WAIT_2:			\
	case TCPS_CLOSE_WAIT:			\
		(err) = ECONNRESET;		\
		break;				\
	case TCPS_CLOSING:			\
	case TCPS_LAST_ACK:			\
	case TCPS_TIME_WAIT:			\
		(err) = 0;			\
		break;				\
	default:				\
		(err) = ENXIO;			\
	}					\
}

/*
 * Check if a tcp structure matches the info in acp.
 * INADDR_ANY / the unspecified IPv6 address and port 0 act as
 * wildcards; the connection's state must also lie within the
 * [ac_start, ac_end] range.
 */
#define	TCP_AC_ADDR_MATCH(acp, connp, tcp)			\
	(((acp)->ac_local.ss_family == AF_INET) ?		\
	((TCP_AC_V4LOCAL((acp)) == INADDR_ANY ||		\
	TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) &&	\
	(TCP_AC_V4REMOTE((acp)) == INADDR_ANY ||		\
	TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) &&	\
	(TCP_AC_V4LPORT((acp)) == 0 ||				\
	TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) &&	\
	(TCP_AC_V4RPORT((acp)) == 0 ||				\
	TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) &&	\
	(acp)->ac_start <= (tcp)->tcp_state &&			\
	(acp)->ac_end >= (tcp)->tcp_state) :			\
	((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) ||	\
	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)),		\
	&(connp)->conn_laddr_v6)) &&				\
	(IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) ||	\
	IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)),		\
	&(connp)->conn_faddr_v6)) &&				\
	(TCP_AC_V6LPORT((acp)) == 0 ||				\
	TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) &&	\
	(TCP_AC_V6RPORT((acp)) == 0 ||				\
	TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) &&	\
	(acp)->ac_start <= (tcp)->tcp_state &&			\
	(acp)->ac_end >= (tcp)->tcp_state))

/*
 * Full match: the zone must match first (ALL_ZONES is a wildcard),
 * then the address/port/state check above is applied.
 */
#define	TCP_AC_MATCH(acp, connp, tcp)				\
	(((acp)->ac_zoneid == ALL_ZONES ||			\
	(acp)->ac_zoneid == (connp)->conn_zoneid) ?		\
	TCP_AC_ADDR_MATCH(acp, connp, tcp) : 0)

/*
 * Build a message containing a tcp_ioc_abort_conn_t structure
 * which is filled in with information from acp and tp.
151 */ 152 static mblk_t * 153 tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) 154 { 155 mblk_t *mp; 156 tcp_ioc_abort_conn_t *tacp; 157 158 mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO); 159 if (mp == NULL) 160 return (NULL); 161 162 *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; 163 tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + 164 sizeof (uint32_t)); 165 166 tacp->ac_start = acp->ac_start; 167 tacp->ac_end = acp->ac_end; 168 tacp->ac_zoneid = acp->ac_zoneid; 169 170 if (acp->ac_local.ss_family == AF_INET) { 171 tacp->ac_local.ss_family = AF_INET; 172 tacp->ac_remote.ss_family = AF_INET; 173 TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4; 174 TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4; 175 TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport; 176 TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport; 177 } else { 178 tacp->ac_local.ss_family = AF_INET6; 179 tacp->ac_remote.ss_family = AF_INET6; 180 TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6; 181 TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6; 182 TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport; 183 TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport; 184 } 185 mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); 186 return (mp); 187 } 188 189 /* 190 * Print a tcp_ioc_abort_conn_t structure. 
191 */ 192 static void 193 tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) 194 { 195 char lbuf[128]; 196 char rbuf[128]; 197 sa_family_t af; 198 in_port_t lport, rport; 199 ushort_t logflags; 200 201 af = acp->ac_local.ss_family; 202 203 if (af == AF_INET) { 204 (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp), 205 lbuf, 128); 206 (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp), 207 rbuf, 128); 208 lport = ntohs(TCP_AC_V4LPORT(acp)); 209 rport = ntohs(TCP_AC_V4RPORT(acp)); 210 } else { 211 (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp), 212 lbuf, 128); 213 (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp), 214 rbuf, 128); 215 lport = ntohs(TCP_AC_V6LPORT(acp)); 216 rport = ntohs(TCP_AC_V6RPORT(acp)); 217 } 218 219 logflags = SL_TRACE | SL_NOTE; 220 /* 221 * Don't print this message to the console if the operation was done 222 * to a non-global zone. 223 */ 224 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 225 logflags |= SL_CONSOLE; 226 (void) strlog(TCP_MOD_ID, 0, 1, logflags, 227 "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, " 228 "start = %d, end = %d\n", lbuf, lport, rbuf, rport, 229 acp->ac_start, acp->ac_end); 230 } 231 232 /* 233 * Called using SQ_FILL when a message built using 234 * tcp_ioctl_abort_build_msg is put into a queue. 235 * Note that when we get here there is no wildcard in acp any more. 236 */ 237 /* ARGSUSED2 */ 238 static void 239 tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, 240 ip_recv_attr_t *dummy) 241 { 242 conn_t *connp = (conn_t *)arg; 243 tcp_t *tcp = connp->conn_tcp; 244 tcp_ioc_abort_conn_t *acp; 245 246 /* 247 * Don't accept any input on a closed tcp as this TCP logically does 248 * not exist on the system. Don't proceed further with this TCP. 249 * For eg. this packet could trigger another close of this tcp 250 * which would be disastrous for tcp_refcnt. tcp_close_detached / 251 * tcp_clean_death / tcp_closei_local must be called at most once 252 * on a TCP. 
253 */ 254 if (tcp->tcp_state == TCPS_CLOSED || 255 tcp->tcp_state == TCPS_BOUND) { 256 freemsg(mp); 257 return; 258 } 259 260 acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); 261 if (tcp->tcp_state <= acp->ac_end) { 262 /* 263 * If we get here, we are already on the correct 264 * squeue. This ioctl follows the following path 265 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn 266 * ->tcp_ioctl_abort->squeue_enter (if on a 267 * different squeue) 268 */ 269 int errcode; 270 271 TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode); 272 (void) tcp_clean_death(tcp, errcode); 273 } 274 freemsg(mp); 275 } 276 277 /* 278 * Abort all matching connections on a hash chain. 279 */ 280 static int 281 tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count, 282 boolean_t exact, tcp_stack_t *tcps) 283 { 284 int nmatch, err = 0; 285 tcp_t *tcp; 286 MBLKP mp, last, listhead = NULL; 287 conn_t *tconnp; 288 connf_t *connfp; 289 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 290 291 connfp = &ipst->ips_ipcl_conn_fanout[index]; 292 293 startover: 294 nmatch = 0; 295 296 mutex_enter(&connfp->connf_lock); 297 for (tconnp = connfp->connf_head; tconnp != NULL; 298 tconnp = tconnp->conn_next) { 299 tcp = tconnp->conn_tcp; 300 /* 301 * We are missing a check on sin6_scope_id for linklocals here, 302 * but current usage is just for aborting based on zoneid 303 * for shared-IP zones. 304 */ 305 if (TCP_AC_MATCH(acp, tconnp, tcp)) { 306 CONN_INC_REF(tconnp); 307 mp = tcp_ioctl_abort_build_msg(acp, tcp); 308 if (mp == NULL) { 309 err = ENOMEM; 310 CONN_DEC_REF(tconnp); 311 break; 312 } 313 mp->b_prev = (mblk_t *)tcp; 314 315 if (listhead == NULL) { 316 listhead = mp; 317 last = mp; 318 } else { 319 last->b_next = mp; 320 last = mp; 321 } 322 nmatch++; 323 if (exact) 324 break; 325 } 326 327 /* Avoid holding lock for too long. 
*/ 328 if (nmatch >= 500) 329 break; 330 } 331 mutex_exit(&connfp->connf_lock); 332 333 /* Pass mp into the correct tcp */ 334 while ((mp = listhead) != NULL) { 335 listhead = listhead->b_next; 336 tcp = (tcp_t *)mp->b_prev; 337 mp->b_next = mp->b_prev = NULL; 338 SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, 339 tcp_ioctl_abort_handler, tcp->tcp_connp, NULL, 340 SQ_FILL, SQTAG_TCP_ABORT_BUCKET); 341 } 342 343 *count += nmatch; 344 if (nmatch >= 500 && err == 0) 345 goto startover; 346 return (err); 347 } 348 349 /* 350 * Abort all connections that matches the attributes specified in acp. 351 */ 352 static int 353 tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps) 354 { 355 sa_family_t af; 356 uint32_t ports; 357 uint16_t *pports; 358 int err = 0, count = 0; 359 boolean_t exact = B_FALSE; /* set when there is no wildcard */ 360 int index = -1; 361 ushort_t logflags; 362 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 363 364 af = acp->ac_local.ss_family; 365 366 if (af == AF_INET) { 367 if (TCP_AC_V4REMOTE(acp) != INADDR_ANY && 368 TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) { 369 pports = (uint16_t *)&ports; 370 pports[1] = TCP_AC_V4LPORT(acp); 371 pports[0] = TCP_AC_V4RPORT(acp); 372 exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY); 373 } 374 } else { 375 if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) && 376 TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) { 377 pports = (uint16_t *)&ports; 378 pports[1] = TCP_AC_V6LPORT(acp); 379 pports[0] = TCP_AC_V6RPORT(acp); 380 exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp)); 381 } 382 } 383 384 /* 385 * For cases where remote addr, local port, and remote port are non- 386 * wildcards, tcp_ioctl_abort_bucket will only be called once. 
387 */ 388 if (index != -1) { 389 err = tcp_ioctl_abort_bucket(acp, index, 390 &count, exact, tcps); 391 } else { 392 /* 393 * loop through all entries for wildcard case 394 */ 395 for (index = 0; 396 index < ipst->ips_ipcl_conn_fanout_size; 397 index++) { 398 err = tcp_ioctl_abort_bucket(acp, index, 399 &count, exact, tcps); 400 if (err != 0) 401 break; 402 } 403 } 404 405 logflags = SL_TRACE | SL_NOTE; 406 /* 407 * Don't print this message to the console if the operation was done 408 * to a non-global zone. 409 */ 410 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 411 logflags |= SL_CONSOLE; 412 (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: " 413 "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' ')); 414 if (err == 0 && count == 0) 415 err = ENOENT; 416 return (err); 417 } 418 419 /* 420 * Process the TCP_IOC_ABORT_CONN ioctl request. 421 */ 422 void 423 tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) 424 { 425 int err; 426 IOCP iocp; 427 MBLKP mp1; 428 sa_family_t laf, raf; 429 tcp_ioc_abort_conn_t *acp; 430 zone_t *zptr; 431 conn_t *connp = Q_TO_CONN(q); 432 zoneid_t zoneid = connp->conn_zoneid; 433 tcp_t *tcp = connp->conn_tcp; 434 tcp_stack_t *tcps = tcp->tcp_tcps; 435 436 iocp = (IOCP)mp->b_rptr; 437 438 if ((mp1 = mp->b_cont) == NULL || 439 iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) { 440 err = EINVAL; 441 goto out; 442 } 443 444 /* check permissions */ 445 if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { 446 err = EPERM; 447 goto out; 448 } 449 450 if (mp1->b_cont != NULL) { 451 freemsg(mp1->b_cont); 452 mp1->b_cont = NULL; 453 } 454 455 acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr; 456 laf = acp->ac_local.ss_family; 457 raf = acp->ac_remote.ss_family; 458 459 /* check that a zone with the supplied zoneid exists */ 460 if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) { 461 zptr = zone_find_by_id(zoneid); 462 if (zptr != NULL) { 463 zone_rele(zptr); 464 } else { 465 err = EINVAL; 466 
goto out; 467 } 468 } 469 470 /* 471 * For exclusive stacks we set the zoneid to zero 472 * to make TCP operate as if in the global zone. 473 */ 474 if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID) 475 acp->ac_zoneid = GLOBAL_ZONEID; 476 477 if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT || 478 acp->ac_start > acp->ac_end || laf != raf || 479 (laf != AF_INET && laf != AF_INET6)) { 480 err = EINVAL; 481 goto out; 482 } 483 484 tcp_ioctl_abort_dump(acp); 485 err = tcp_ioctl_abort(acp, tcps); 486 487 out: 488 if (mp1 != NULL) { 489 freemsg(mp1); 490 mp->b_cont = NULL; 491 } 492 493 if (err != 0) 494 miocnak(q, mp, 0, err); 495 else 496 miocack(q, mp, 0, 0); 497 } 498 499 /* 500 * Timeout function to reset the TCP stack variable tcps_reclaim to false. 501 */ 502 void 503 tcp_reclaim_timer(void *arg) 504 { 505 tcp_stack_t *tcps = (tcp_stack_t *)arg; 506 int64_t tot_conn = 0; 507 int i; 508 extern pgcnt_t lotsfree, needfree; 509 510 for (i = 0; i < tcps->tcps_sc_cnt; i++) 511 tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt; 512 513 /* 514 * This happens only when a stack is going away. tcps_reclaim_tid 515 * should not be reset to 0 when returning in this case. 516 */ 517 mutex_enter(&tcps->tcps_reclaim_lock); 518 if (!tcps->tcps_reclaim) { 519 mutex_exit(&tcps->tcps_reclaim_lock); 520 return; 521 } 522 523 if ((freemem >= lotsfree + needfree) || tot_conn < maxusers) { 524 tcps->tcps_reclaim = B_FALSE; 525 tcps->tcps_reclaim_tid = 0; 526 } else { 527 /* Stay in defensive mode and restart the timer */ 528 tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer, 529 tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period)); 530 } 531 mutex_exit(&tcps->tcps_reclaim_lock); 532 } 533 534 /* 535 * Kmem reclaim call back function. When the system is under memory 536 * pressure, we set the TCP stack variable tcps_reclaim to true. This 537 * variable is reset to false after tcps_reclaim_period msecs. 
During this 538 * period, TCP will be more aggressive in aborting connections not making 539 * progress, meaning retransmitting for some time (tcp_early_abort seconds). 540 * TCP will also not accept new connection request for those listeners whose 541 * q or q0 is not empty. 542 */ 543 /* ARGSUSED */ 544 void 545 tcp_conn_reclaim(void *arg) 546 { 547 netstack_handle_t nh; 548 netstack_t *ns; 549 tcp_stack_t *tcps; 550 extern pgcnt_t lotsfree, needfree; 551 552 if (!tcp_do_reclaim) 553 return; 554 555 /* 556 * The reclaim function may be called even when the system is not 557 * really under memory pressure. 558 */ 559 if (freemem >= lotsfree + needfree) 560 return; 561 562 netstack_next_init(&nh); 563 while ((ns = netstack_next(&nh)) != NULL) { 564 int i; 565 int64_t tot_conn = 0; 566 567 /* 568 * During boot time, the first netstack_t is created and 569 * initialized before TCP has registered with the netstack 570 * framework. If this reclaim function is called before TCP 571 * has finished its initialization, netstack_next() will 572 * return the first netstack_t (since its netstack_flags is 573 * not NSF_UNINIT). And its netstack_tcp will be NULL. We 574 * need to catch it. 575 * 576 * All subsequent netstack_t creation will not have this 577 * problem since the initialization is not finished until TCP 578 * has finished its own tcp_stack_t initialization. Hence 579 * netstack_next() will not return one with NULL netstack_tcp. 580 */ 581 if ((tcps = ns->netstack_tcp) == NULL) { 582 netstack_rele(ns); 583 continue; 584 } 585 586 /* 587 * Even if the system is under memory pressure, the reason may 588 * not be because of TCP activity. Check the number of 589 * connections in each stack. If the number exceeds the 590 * threshold (maxusers), turn on defensive mode. 
591 */ 592 for (i = 0; i < tcps->tcps_sc_cnt; i++) 593 tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt; 594 if (tot_conn < maxusers) { 595 netstack_rele(ns); 596 continue; 597 } 598 599 mutex_enter(&tcps->tcps_reclaim_lock); 600 if (!tcps->tcps_reclaim) { 601 tcps->tcps_reclaim = B_TRUE; 602 tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer, 603 tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period)); 604 TCP_STAT(tcps, tcp_reclaim_cnt); 605 } 606 mutex_exit(&tcps->tcps_reclaim_lock); 607 netstack_rele(ns); 608 } 609 netstack_next_fini(&nh); 610 } 611 612 /* 613 * Given a tcp_stack_t and a port (in host byte order), find a listener 614 * configuration for that port and return the ratio. 615 */ 616 uint32_t 617 tcp_find_listener_conf(tcp_stack_t *tcps, in_port_t port) 618 { 619 tcp_listener_t *tl; 620 uint32_t ratio = 0; 621 622 mutex_enter(&tcps->tcps_listener_conf_lock); 623 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; 624 tl = list_next(&tcps->tcps_listener_conf, tl)) { 625 if (tl->tl_port == port) { 626 ratio = tl->tl_ratio; 627 break; 628 } 629 } 630 mutex_exit(&tcps->tcps_listener_conf_lock); 631 return (ratio); 632 } 633 634 /* 635 * Ndd param helper routine to return the current list of listener limit 636 * configuration. 637 */ 638 /* ARGSUSED */ 639 int 640 tcp_listener_conf_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 641 { 642 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 643 tcp_listener_t *tl; 644 645 mutex_enter(&tcps->tcps_listener_conf_lock); 646 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; 647 tl = list_next(&tcps->tcps_listener_conf, tl)) { 648 (void) mi_mpprintf(mp, "%d:%d ", tl->tl_port, tl->tl_ratio); 649 } 650 mutex_exit(&tcps->tcps_listener_conf_lock); 651 return (0); 652 } 653 654 /* 655 * Ndd param helper routine to add a new listener limit configuration. 
656 */ 657 /* ARGSUSED */ 658 int 659 tcp_listener_conf_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 660 cred_t *cr) 661 { 662 tcp_listener_t *new_tl; 663 tcp_listener_t *tl; 664 long lport; 665 long ratio; 666 char *colon; 667 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 668 669 if (ddi_strtol(value, &colon, 10, &lport) != 0 || lport <= 0 || 670 lport > USHRT_MAX || *colon != ':') { 671 return (EINVAL); 672 } 673 if (ddi_strtol(colon + 1, NULL, 10, &ratio) != 0 || ratio <= 0) 674 return (EINVAL); 675 676 mutex_enter(&tcps->tcps_listener_conf_lock); 677 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; 678 tl = list_next(&tcps->tcps_listener_conf, tl)) { 679 /* There is an existing entry, so update its ratio value. */ 680 if (tl->tl_port == lport) { 681 tl->tl_ratio = ratio; 682 mutex_exit(&tcps->tcps_listener_conf_lock); 683 return (0); 684 } 685 } 686 687 if ((new_tl = kmem_alloc(sizeof (tcp_listener_t), KM_NOSLEEP)) == 688 NULL) { 689 mutex_exit(&tcps->tcps_listener_conf_lock); 690 return (ENOMEM); 691 } 692 693 new_tl->tl_port = lport; 694 new_tl->tl_ratio = ratio; 695 list_insert_tail(&tcps->tcps_listener_conf, new_tl); 696 mutex_exit(&tcps->tcps_listener_conf_lock); 697 return (0); 698 } 699 700 /* 701 * Ndd param helper routine to remove a listener limit configuration. 
702 */ 703 /* ARGSUSED */ 704 int 705 tcp_listener_conf_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 706 cred_t *cr) 707 { 708 tcp_listener_t *tl; 709 long lport; 710 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 711 712 if (ddi_strtol(value, NULL, 10, &lport) != 0 || lport <= 0 || 713 lport > USHRT_MAX) { 714 return (EINVAL); 715 } 716 mutex_enter(&tcps->tcps_listener_conf_lock); 717 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; 718 tl = list_next(&tcps->tcps_listener_conf, tl)) { 719 if (tl->tl_port == lport) { 720 list_remove(&tcps->tcps_listener_conf, tl); 721 mutex_exit(&tcps->tcps_listener_conf_lock); 722 kmem_free(tl, sizeof (tcp_listener_t)); 723 return (0); 724 } 725 } 726 mutex_exit(&tcps->tcps_listener_conf_lock); 727 return (ESRCH); 728 } 729 730 /* 731 * To remove all listener limit configuration in a tcp_stack_t. 732 */ 733 void 734 tcp_listener_conf_cleanup(tcp_stack_t *tcps) 735 { 736 tcp_listener_t *tl; 737 738 mutex_enter(&tcps->tcps_listener_conf_lock); 739 while ((tl = list_head(&tcps->tcps_listener_conf)) != NULL) { 740 list_remove(&tcps->tcps_listener_conf, tl); 741 kmem_free(tl, sizeof (tcp_listener_t)); 742 } 743 mutex_destroy(&tcps->tcps_listener_conf_lock); 744 list_destroy(&tcps->tcps_listener_conf); 745 } 746 747 /* 748 * Call back function for CPU state change. 
 * When a CPU comes online (or joins a partition), make sure every TCP
 * stack has a per-CPU stats structure allocated for its cpu_seqid.
 */
/* ARGSUSED */
int
tcp_cpu_update(cpu_setup_t what, int id, void *arg)
{
	cpu_t *cp;
	netstack_handle_t nh;
	netstack_t *ns;
	tcp_stack_t *tcps;
	int i;

	/* Caller (the CPU framework) must hold cpu_lock. */
	ASSERT(MUTEX_HELD(&cpu_lock));
	cp = cpu[id];

	switch (what) {
	case CPU_CONFIG:
	case CPU_ON:
	case CPU_INIT:
	case CPU_CPUPART_IN:
		/*
		 * Walk every netstack and grow its tcps_sc[] array of
		 * per-CPU stats up to and including this CPU's seqid.
		 *
		 * NOTE(review): unlike tcp_conn_reclaim(), this loop does
		 * not guard against ns->netstack_tcp being NULL during
		 * early boot — confirm callback registration ordering.
		 */
		netstack_next_init(&nh);
		while ((ns = netstack_next(&nh)) != NULL) {
			tcps = ns->netstack_tcp;
			if (cp->cpu_seqid >= tcps->tcps_sc_cnt) {
				for (i = tcps->tcps_sc_cnt; i <= cp->cpu_seqid;
				    i++) {
					ASSERT(tcps->tcps_sc[i] == NULL);
					tcps->tcps_sc[i] = kmem_zalloc(
					    sizeof (tcp_stats_cpu_t), KM_SLEEP);
				}
				/*
				 * All tcps_sc[] slots are stored before the
				 * count is published, so a lock-free reader
				 * that sees the new tcps_sc_cnt also sees
				 * initialized pointers.
				 */
				membar_producer();
				tcps->tcps_sc_cnt = cp->cpu_seqid + 1;
			}
			netstack_rele(ns);
		}
		netstack_next_fini(&nh);
		break;
	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		/* Nothing to do */
		break;
	default:
		break;
	}
	return (0);
}

/*
 * Diagnostic routine used to return a string associated with the tcp state.
 * Note that if the caller does not supply a buffer, it will use an internal
 * static string. This means that if multiple threads call this function at
 * the same time, output can be corrupted... Note also that this function
 * does not check the size of the supplied buffer. The caller has to make
 * sure that it is big enough.
803 */ 804 char * 805 tcp_display(tcp_t *tcp, char *sup_buf, char format) 806 { 807 char buf1[30]; 808 static char priv_buf[INET6_ADDRSTRLEN * 2 + 80]; 809 char *buf; 810 char *cp; 811 in6_addr_t local, remote; 812 char local_addrbuf[INET6_ADDRSTRLEN]; 813 char remote_addrbuf[INET6_ADDRSTRLEN]; 814 conn_t *connp; 815 816 if (sup_buf != NULL) 817 buf = sup_buf; 818 else 819 buf = priv_buf; 820 821 if (tcp == NULL) 822 return ("NULL_TCP"); 823 824 connp = tcp->tcp_connp; 825 switch (tcp->tcp_state) { 826 case TCPS_CLOSED: 827 cp = "TCP_CLOSED"; 828 break; 829 case TCPS_IDLE: 830 cp = "TCP_IDLE"; 831 break; 832 case TCPS_BOUND: 833 cp = "TCP_BOUND"; 834 break; 835 case TCPS_LISTEN: 836 cp = "TCP_LISTEN"; 837 break; 838 case TCPS_SYN_SENT: 839 cp = "TCP_SYN_SENT"; 840 break; 841 case TCPS_SYN_RCVD: 842 cp = "TCP_SYN_RCVD"; 843 break; 844 case TCPS_ESTABLISHED: 845 cp = "TCP_ESTABLISHED"; 846 break; 847 case TCPS_CLOSE_WAIT: 848 cp = "TCP_CLOSE_WAIT"; 849 break; 850 case TCPS_FIN_WAIT_1: 851 cp = "TCP_FIN_WAIT_1"; 852 break; 853 case TCPS_CLOSING: 854 cp = "TCP_CLOSING"; 855 break; 856 case TCPS_LAST_ACK: 857 cp = "TCP_LAST_ACK"; 858 break; 859 case TCPS_FIN_WAIT_2: 860 cp = "TCP_FIN_WAIT_2"; 861 break; 862 case TCPS_TIME_WAIT: 863 cp = "TCP_TIME_WAIT"; 864 break; 865 default: 866 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); 867 cp = buf1; 868 break; 869 } 870 switch (format) { 871 case DISP_ADDR_AND_PORT: 872 if (connp->conn_ipversion == IPV4_VERSION) { 873 /* 874 * Note that we use the remote address in the tcp_b 875 * structure. This means that it will print out 876 * the real destination address, not the next hop's 877 * address if source routing is used. 
878 */ 879 IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local); 880 IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote); 881 882 } else { 883 local = connp->conn_laddr_v6; 884 remote = connp->conn_faddr_v6; 885 } 886 (void) inet_ntop(AF_INET6, &local, local_addrbuf, 887 sizeof (local_addrbuf)); 888 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, 889 sizeof (remote_addrbuf)); 890 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", 891 local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf, 892 ntohs(connp->conn_fport), cp); 893 break; 894 case DISP_PORT_ONLY: 895 default: 896 (void) mi_sprintf(buf, "[%u, %u] %s", 897 ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp); 898 break; 899 } 900 901 return (buf); 902 } 903