1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsubr.h> 31 #include <sys/stropts.h> 32 #include <sys/strsun.h> 33 #include <sys/strlog.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/timod.h> 37 #include <sys/ddi.h> 38 #include <sys/sunddi.h> 39 #include <sys/cmn_err.h> 40 #include <sys/proc.h> 41 #include <sys/suntpi.h> 42 #include <sys/policy.h> 43 #include <sys/zone.h> 44 45 #include <sys/socket.h> 46 #include <netinet/in.h> 47 48 #include <inet/common.h> 49 #include <netinet/ip6.h> 50 #include <inet/ip.h> 51 #include <inet/ipclassifier.h> 52 #include <inet/mi.h> 53 #include <inet/nd.h> 54 #include <inet/optcom.h> 55 #include <netinet/ip_mroute.h> 56 #include <sys/isa_defs.h> 57 #include <net/route.h> 58 59 #include <inet/rts_impl.h> 60 #include <inet/ip_rts.h> 61 62 /* 63 * This is a transport provider for routing sockets. Downstream messages are 64 * wrapped with a IP_IOCTL header, and ip_wput_ioctl calls the appropriate entry 65 * in the ip_ioctl_ftbl callout table to pass the routing socket data into IP. 66 * Upstream messages are generated for listeners of the routing socket as well 67 * as the message sender (unless they have turned off their end using 68 * SO_USELOOPBACK or shutdown(3n)). Upstream messages may also be generated 69 * asynchronously when: 70 * 71 * Interfaces are brought up or down. 72 * Addresses are assigned to interfaces. 73 * ICMP redirects are processed and a IRE_HOST/RTF_DYNAMIC is installed. 74 * No route is found while sending a packet. 75 * When TCP requests IP to remove an IRE_CACHE of a troubled destination. 76 * 77 * Since all we do is reformat the messages between routing socket and 78 * ioctl forms, no synchronization is necessary in this module; all 79 * the dirty work is done down in ip. 80 */ 81 82 /* Default structure copied into T_INFO_ACK messages */ 83 static struct T_info_ack rts_g_t_info_ack = { 84 T_INFO_ACK, 85 T_INFINITE, /* TSDU_size. Maximum size messages. */ 86 T_INVALID, /* ETSDU_size. No expedited data. */ 87 T_INVALID, /* CDATA_size. No connect data. */ 88 T_INVALID, /* DDATA_size. No disconnect data. */ 89 0, /* ADDR_size. */ 90 0, /* OPT_size - not initialized here */ 91 64 * 1024, /* TIDU_size. rts allows maximum size messages. */ 92 T_COTS, /* SERV_type. rts supports connection oriented. */ 93 TS_UNBND, /* CURRENT_state. This is set from rts_state. */ 94 (XPG4_1) /* PROVIDER_flag */ 95 }; 96 97 /* 98 * Table of ND variables supported by rts. These are loaded into rts_g_nd 99 * in rts_open. 100 * All of these are alterable, within the min/max values given, at run time. 101 */ 102 static rtsparam_t lcl_param_arr[] = { 103 /* min max value name */ 104 { 4096, 65536, 8192, "rts_xmit_hiwat"}, 105 { 0, 65536, 1024, "rts_xmit_lowat"}, 106 { 4096, 65536, 8192, "rts_recv_hiwat"}, 107 { 65536, 1024*1024*1024, 256*1024, "rts_max_buf"}, 108 }; 109 #define rtss_xmit_hiwat rtss_params[0].rts_param_value 110 #define rtss_xmit_lowat rtss_params[1].rts_param_value 111 #define rtss_recv_hiwat rtss_params[2].rts_param_value 112 #define rtss_max_buf rtss_params[3].rts_param_value 113 114 static int rts_close(queue_t *q); 115 static void rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, 116 int sys_error); 117 static void rts_input(void *, mblk_t *, void *); 118 static mblk_t *rts_ioctl_alloc(mblk_t *data, cred_t *cr); 119 static int rts_open(queue_t *q, dev_t *devp, int flag, int sflag, 120 cred_t *credp); 121 int rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, 122 uchar_t *ptr); 123 int rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, 124 uchar_t *ptr); 125 int rts_opt_set(queue_t *q, uint_t optset_context, int level, 126 int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, 127 uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk); 128 static int rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); 129 static boolean_t rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt); 130 static int rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 131 cred_t *cr); 132 static void rts_rsrv(queue_t *q); 133 static void *rts_stack_init(netstackid_t stackid, netstack_t *ns); 134 static void rts_stack_fini(netstackid_t stackid, void *arg); 135 static void rts_wput(queue_t *q, mblk_t *mp); 136 static void rts_wput_iocdata(queue_t *q, mblk_t *mp); 137 static void rts_wput_other(queue_t *q, mblk_t *mp); 138 static int rts_wrw(queue_t *q, struiod_t *dp); 139 140 static struct module_info rts_mod_info = { 141 129, "rts", 1, INFPSZ, 512, 128 142 }; 143 144 static struct qinit rtsrinit = { 145 NULL, (pfi_t)rts_rsrv, rts_open, rts_close, NULL, &rts_mod_info 146 }; 147 148 static struct qinit rtswinit = { 149 (pfi_t)rts_wput, NULL, NULL, NULL, NULL, &rts_mod_info, 150 NULL, (pfi_t)rts_wrw, NULL, STRUIOT_STANDARD 151 }; 152 153 struct streamtab rtsinfo = { 154 &rtsrinit, &rtswinit 155 }; 156 157 /* 158 * This routine allocates the necessary 159 * message blocks for IOCTL wrapping the 160 * user data. 161 */ 162 static mblk_t * 163 rts_ioctl_alloc(mblk_t *data, cred_t *cr) 164 { 165 mblk_t *mp = NULL; 166 mblk_t *mp1 = NULL; 167 ipllc_t *ipllc; 168 struct iocblk *ioc; 169 170 mp = allocb_cred(sizeof (ipllc_t), cr); 171 if (mp == NULL) 172 return (NULL); 173 mp1 = allocb_cred(sizeof (struct iocblk), cr); 174 if (mp1 == NULL) { 175 freeb(mp); 176 return (NULL); 177 } 178 179 ipllc = (ipllc_t *)mp->b_rptr; 180 ipllc->ipllc_cmd = IP_IOC_RTS_REQUEST; 181 ipllc->ipllc_name_offset = 0; 182 ipllc->ipllc_name_length = 0; 183 mp->b_wptr += sizeof (ipllc_t); 184 mp->b_cont = data; 185 186 ioc = (struct iocblk *)mp1->b_rptr; 187 ioc->ioc_cmd = IP_IOCTL; 188 ioc->ioc_error = 0; 189 ioc->ioc_cr = NULL; 190 ioc->ioc_count = msgdsize(mp); 191 mp1->b_wptr += sizeof (struct iocblk); 192 mp1->b_datap->db_type = M_IOCTL; 193 mp1->b_cont = mp; 194 195 return (mp1); 196 } 197 198 /* 199 * This routine closes rts stream, by disabling 200 * put/srv routines and freeing the this module 201 * internal datastructure. 202 */ 203 static int 204 rts_close(queue_t *q) 205 { 206 conn_t *connp = Q_TO_CONN(q); 207 208 ASSERT(connp != NULL && IPCL_IS_RTS(connp)); 209 210 ip_rts_unregister(connp); 211 212 ip_quiesce_conn(connp); 213 214 qprocsoff(q); 215 216 /* 217 * Now we are truly single threaded on this stream, and can 218 * delete the things hanging off the connp, and finally the connp. 219 * We removed this connp from the fanout list, it cannot be 220 * accessed thru the fanouts, and we already waited for the 221 * conn_ref to drop to 0. We are already in close, so 222 * there cannot be any other thread from the top. qprocsoff 223 * has completed, and service has completed or won't run in 224 * future. 225 */ 226 ASSERT(connp->conn_ref == 1); 227 228 inet_minor_free(ip_minor_arena, connp->conn_dev); 229 230 connp->conn_ref--; 231 ipcl_conn_destroy(connp); 232 233 q->q_ptr = WR(q)->q_ptr = NULL; 234 return (0); 235 } 236 237 /* 238 * This is the open routine for routing socket. It allocates 239 * rts_t structure for the stream and tells IP that it is a routing socket. 240 */ 241 /* ARGSUSED */ 242 static int 243 rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 244 { 245 rts_t *rts; 246 conn_t *connp; 247 dev_t conn_dev; 248 zoneid_t zoneid; 249 netstack_t *ns; 250 rts_stack_t *rtss; 251 252 /* If the stream is already open, return immediately. */ 253 if (q->q_ptr != NULL) 254 return (0); 255 256 if (sflag == MODOPEN) 257 return (EINVAL); 258 259 ns = netstack_find_by_cred(credp); 260 ASSERT(ns != NULL); 261 rtss = ns->netstack_rts; 262 ASSERT(rtss != NULL); 263 264 /* 265 * For exclusive stacks we set the zoneid to zero 266 * to make RTS operate as if in the global zone. 267 */ 268 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 269 zoneid = GLOBAL_ZONEID; 270 else 271 zoneid = crgetzoneid(credp); 272 273 if ((conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) { 274 netstack_rele(ns); 275 return (EBUSY); 276 } 277 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 278 279 connp = ipcl_conn_create(IPCL_RTSCONN, KM_SLEEP, ns); 280 connp->conn_dev = conn_dev; 281 rts = connp->conn_rts; 282 283 /* 284 * ipcl_conn_create did a netstack_hold. Undo the hold that was 285 * done by netstack_find_by_cred() 286 */ 287 netstack_rele(ns); 288 289 /* 290 * Initialize the rts_t structure for this stream. 291 */ 292 q->q_ptr = connp; 293 WR(q)->q_ptr = connp; 294 connp->conn_rq = q; 295 connp->conn_wq = WR(q); 296 297 rw_enter(&rts->rts_rwlock, RW_WRITER); 298 ASSERT(connp->conn_rts == rts); 299 ASSERT(rts->rts_connp == connp); 300 301 /* Set the initial state of the stream and the privilege status. */ 302 rts->rts_state = TS_UNBND; 303 connp->conn_zoneid = zoneid; 304 305 connp->conn_ulp_labeled = is_system_labeled(); 306 307 rts->rts_rtss = rtss; 308 309 q->q_hiwat = rtss->rtss_recv_hiwat; 310 WR(q)->q_hiwat = rtss->rtss_xmit_hiwat; 311 WR(q)->q_lowat = rtss->rtss_xmit_lowat; 312 313 connp->conn_recv = rts_input; 314 crhold(credp); 315 connp->conn_cred = credp; 316 317 mutex_enter(&connp->conn_lock); 318 connp->conn_state_flags &= ~CONN_INCIPIENT; 319 mutex_exit(&connp->conn_lock); 320 321 qprocson(q); 322 rw_exit(&rts->rts_rwlock); 323 324 /* 325 * Indicate the down IP module that this is a routing socket 326 * client by sending an RTS IOCTL without any user data. Although 327 * this is just a notification message (without any real routing 328 * request), we pass in any credential for correctness sake. 329 */ 330 ip_rts_register(connp); 331 332 return (0); 333 334 } 335 336 /* 337 * This routine creates a T_ERROR_ACK message and passes it upstream. 338 */ 339 static void 340 rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 341 { 342 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 343 qreply(q, mp); 344 } 345 346 /* 347 * This routine creates a T_OK_ACK message and passes it upstream. 348 */ 349 static void 350 rts_ok_ack(queue_t *q, mblk_t *mp) 351 { 352 if ((mp = mi_tpi_ok_ack_alloc(mp)) != NULL) 353 qreply(q, mp); 354 } 355 356 /* 357 * This routine is called by rts_wput to handle T_UNBIND_REQ messages. 358 */ 359 static void 360 rts_unbind(queue_t *q, mblk_t *mp) 361 { 362 conn_t *connp = Q_TO_CONN(q); 363 rts_t *rts = connp->conn_rts; 364 365 /* If a bind has not been done, we can't unbind. */ 366 if (rts->rts_state != TS_IDLE) { 367 rts_err_ack(q, mp, TOUTSTATE, 0); 368 return; 369 } 370 rts->rts_state = TS_UNBND; 371 rts_ok_ack(q, mp); 372 } 373 374 /* 375 * This routine is called to handle each 376 * O_T_BIND_REQ/T_BIND_REQ message passed to 377 * rts_wput. Note: This routine works with both 378 * O_T_BIND_REQ and T_BIND_REQ semantics. 379 */ 380 static void 381 rts_bind(queue_t *q, mblk_t *mp) 382 { 383 conn_t *connp = Q_TO_CONN(q); 384 rts_t *rts = connp->conn_rts; 385 mblk_t *mp1; 386 struct T_bind_req *tbr; 387 388 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 389 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 390 "rts_bind: bad data, %d", rts->rts_state); 391 rts_err_ack(q, mp, TBADADDR, 0); 392 return; 393 } 394 if (rts->rts_state != TS_UNBND) { 395 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 396 "rts_bind: bad state, %d", rts->rts_state); 397 rts_err_ack(q, mp, TOUTSTATE, 0); 398 return; 399 } 400 /* 401 * Reallocate the message to make sure we have enough room for an 402 * address and the protocol type. 403 */ 404 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin_t), 1); 405 if (mp1 == NULL) { 406 rts_err_ack(q, mp, TSYSERR, ENOMEM); 407 return; 408 } 409 mp = mp1; 410 tbr = (struct T_bind_req *)mp->b_rptr; 411 if (tbr->ADDR_length != 0) { 412 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 413 "rts_bind: bad ADDR_length %d", tbr->ADDR_length); 414 rts_err_ack(q, mp, TBADADDR, 0); 415 return; 416 } 417 /* Generic request */ 418 tbr->ADDR_offset = (t_scalar_t)sizeof (struct T_bind_req); 419 tbr->ADDR_length = 0; 420 tbr->PRIM_type = T_BIND_ACK; 421 rts->rts_state = TS_IDLE; 422 qreply(q, mp); 423 } 424 425 static void 426 rts_copy_info(struct T_info_ack *tap, rts_t *rts) 427 { 428 *tap = rts_g_t_info_ack; 429 tap->CURRENT_state = rts->rts_state; 430 tap->OPT_size = rts_max_optsize; 431 } 432 433 /* 434 * This routine responds to T_CAPABILITY_REQ messages. It is called by 435 * rts_wput. Much of the T_CAPABILITY_ACK information is copied from 436 * rts_g_t_info_ack. The current state of the stream is copied from 437 * rts_state. 438 */ 439 static void 440 rts_capability_req(queue_t *q, mblk_t *mp) 441 { 442 conn_t *connp = Q_TO_CONN(q); 443 rts_t *rts = connp->conn_rts; 444 t_uscalar_t cap_bits1; 445 struct T_capability_ack *tcap; 446 447 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 448 449 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 450 mp->b_datap->db_type, T_CAPABILITY_ACK); 451 if (mp == NULL) 452 return; 453 454 tcap = (struct T_capability_ack *)mp->b_rptr; 455 tcap->CAP_bits1 = 0; 456 457 if (cap_bits1 & TC1_INFO) { 458 rts_copy_info(&tcap->INFO_ack, rts); 459 tcap->CAP_bits1 |= TC1_INFO; 460 } 461 462 qreply(q, mp); 463 } 464 465 /* 466 * This routine responds to T_INFO_REQ messages. It is called by rts_wput. 467 * Most of the T_INFO_ACK information is copied from rts_g_t_info_ack. 468 * The current state of the stream is copied from rts_state. 469 */ 470 static void 471 rts_info_req(queue_t *q, mblk_t *mp) 472 { 473 conn_t *connp = Q_TO_CONN(q); 474 rts_t *rts = connp->conn_rts; 475 476 mp = tpi_ack_alloc(mp, sizeof (rts_g_t_info_ack), M_PCPROTO, 477 T_INFO_ACK); 478 if (mp == NULL) 479 return; 480 rts_copy_info((struct T_info_ack *)mp->b_rptr, rts); 481 qreply(q, mp); 482 } 483 484 /* 485 * This routine gets default values of certain options whose default 486 * values are maintained by protcol specific code 487 */ 488 /* ARGSUSED */ 489 int 490 rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 491 { 492 /* no default value processed by protocol specific code currently */ 493 return (-1); 494 } 495 496 /* 497 * This routine retrieves the current status of socket options. 498 * It returns the size of the option retrieved. 499 */ 500 int 501 rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 502 { 503 int *i1 = (int *)ptr; 504 conn_t *connp = Q_TO_CONN(q); 505 rts_t *rts = connp->conn_rts; 506 507 switch (level) { 508 case SOL_SOCKET: 509 switch (name) { 510 case SO_DEBUG: 511 *i1 = rts->rts_debug; 512 break; 513 case SO_REUSEADDR: 514 *i1 = rts->rts_reuseaddr; 515 break; 516 case SO_TYPE: 517 *i1 = SOCK_RAW; 518 break; 519 520 /* 521 * The following three items are available here, 522 * but are only meaningful to IP. 523 */ 524 case SO_DONTROUTE: 525 *i1 = rts->rts_dontroute; 526 break; 527 case SO_USELOOPBACK: 528 *i1 = rts->rts_useloopback; 529 break; 530 case SO_BROADCAST: 531 *i1 = rts->rts_broadcast; 532 break; 533 case SO_PROTOTYPE: 534 *i1 = rts->rts_proto; 535 break; 536 /* 537 * The following two items can be manipulated, 538 * but changing them should do nothing. 539 */ 540 case SO_SNDBUF: 541 ASSERT(q->q_hiwat <= INT_MAX); 542 *i1 = (int)(q->q_hiwat); 543 break; 544 case SO_RCVBUF: 545 ASSERT(q->q_hiwat <= INT_MAX); 546 *i1 = (int)(RD(q)->q_hiwat); 547 break; 548 case SO_DOMAIN: 549 *i1 = PF_ROUTE; 550 break; 551 default: 552 return (-1); 553 } 554 break; 555 default: 556 return (-1); 557 } 558 return ((int)sizeof (int)); 559 } 560 561 562 /* 563 * This routine sets socket options. 564 */ 565 /*ARGSUSED*/ 566 int 567 rts_opt_set(queue_t *q, uint_t optset_context, int level, 568 int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, 569 uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) 570 { 571 int *i1 = (int *)invalp; 572 conn_t *connp = Q_TO_CONN(q); 573 rts_t *rts = connp->conn_rts; 574 boolean_t checkonly; 575 rts_stack_t *rtss = rts->rts_rtss; 576 577 switch (optset_context) { 578 case SETFN_OPTCOM_CHECKONLY: 579 checkonly = B_TRUE; 580 /* 581 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 582 * inlen != 0 implies value supplied and 583 * we have to "pretend" to set it. 584 * inlen == 0 implies that there is no 585 * value part in T_CHECK request and just validation 586 * done elsewhere should be enough, we just return here. 587 */ 588 if (inlen == 0) { 589 *outlenp = 0; 590 return (0); 591 } 592 break; 593 case SETFN_OPTCOM_NEGOTIATE: 594 checkonly = B_FALSE; 595 break; 596 case SETFN_UD_NEGOTIATE: 597 case SETFN_CONN_NEGOTIATE: 598 checkonly = B_FALSE; 599 /* 600 * Negotiating local and "association-related" options 601 * through T_UNITDATA_REQ or T_CONN_{REQ,CON} 602 * Not allowed in this module. 603 */ 604 return (EINVAL); 605 default: 606 /* 607 * We should never get here 608 */ 609 *outlenp = 0; 610 return (EINVAL); 611 } 612 613 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 614 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 615 616 /* 617 * For rts, we should have no ancillary data sent down 618 * (rts_wput doesn't handle options). 619 */ 620 ASSERT(thisdg_attrs == NULL); 621 622 /* 623 * For fixed length options, no sanity check 624 * of passed in length is done. It is assumed *_optcom_req() 625 * routines do the right thing. 626 */ 627 628 switch (level) { 629 case SOL_SOCKET: 630 switch (name) { 631 case SO_REUSEADDR: 632 if (!checkonly) 633 rts->rts_reuseaddr = *i1; 634 break; /* goto sizeof (int) option return */ 635 case SO_DEBUG: 636 if (!checkonly) 637 rts->rts_debug = *i1; 638 break; /* goto sizeof (int) option return */ 639 /* 640 * The following three items are available here, 641 * but are only meaningful to IP. 642 */ 643 case SO_DONTROUTE: 644 if (!checkonly) 645 rts->rts_dontroute = *i1; 646 break; /* goto sizeof (int) option return */ 647 case SO_USELOOPBACK: 648 if (!checkonly) 649 rts->rts_useloopback = *i1; 650 break; /* goto sizeof (int) option return */ 651 case SO_BROADCAST: 652 if (!checkonly) 653 rts->rts_broadcast = *i1; 654 break; /* goto sizeof (int) option return */ 655 case SO_PROTOTYPE: 656 /* 657 * Routing socket applications that call socket() with 658 * a third argument can filter which messages will be 659 * sent upstream thanks to sockfs. so_socket() sends 660 * down the SO_PROTOTYPE and rts_queue_input() 661 * implements the filtering. 662 */ 663 if (*i1 != AF_INET && *i1 != AF_INET6) 664 return (EPROTONOSUPPORT); 665 if (!checkonly) 666 rts->rts_proto = *i1; 667 break; /* goto sizeof (int) option return */ 668 /* 669 * The following two items can be manipulated, 670 * but changing them should do nothing. 671 */ 672 case SO_SNDBUF: 673 if (*i1 > rtss->rtss_max_buf) { 674 *outlenp = 0; 675 return (ENOBUFS); 676 } 677 if (!checkonly) { 678 q->q_hiwat = *i1; 679 } 680 break; /* goto sizeof (int) option return */ 681 case SO_RCVBUF: 682 if (*i1 > rtss->rtss_max_buf) { 683 *outlenp = 0; 684 return (ENOBUFS); 685 } 686 if (!checkonly) { 687 RD(q)->q_hiwat = *i1; 688 (void) mi_set_sth_hiwat(RD(q), *i1); 689 } 690 break; /* goto sizeof (int) option return */ 691 default: 692 *outlenp = 0; 693 return (EINVAL); 694 } 695 break; 696 default: 697 *outlenp = 0; 698 return (EINVAL); 699 } 700 /* 701 * Common case of return from an option that is sizeof (int) 702 */ 703 *(int *)outvalp = *i1; 704 *outlenp = (t_uscalar_t)sizeof (int); 705 return (0); 706 } 707 708 /* 709 * This routine retrieves the value of an ND variable in a rtsparam_t 710 * structure. It is called through nd_getset when a user reads the 711 * variable. 712 */ 713 /* ARGSUSED */ 714 static int 715 rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 716 { 717 rtsparam_t *rtspa = (rtsparam_t *)cp; 718 719 (void) mi_mpprintf(mp, "%u", rtspa->rts_param_value); 720 return (0); 721 } 722 723 /* 724 * Walk through the param array specified registering each element with the 725 * named dispatch (ND) handler. 726 */ 727 static boolean_t 728 rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt) 729 { 730 for (; cnt-- > 0; rtspa++) { 731 if (rtspa->rts_param_name != NULL && rtspa->rts_param_name[0]) { 732 if (!nd_load(ndp, rtspa->rts_param_name, 733 rts_param_get, rts_param_set, (caddr_t)rtspa)) { 734 nd_free(ndp); 735 return (B_FALSE); 736 } 737 } 738 } 739 return (B_TRUE); 740 } 741 742 /* This routine sets an ND variable in a rtsparam_t structure. */ 743 /* ARGSUSED */ 744 static int 745 rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) 746 { 747 ulong_t new_value; 748 rtsparam_t *rtspa = (rtsparam_t *)cp; 749 750 /* 751 * Fail the request if the new value does not lie within the 752 * required bounds. 753 */ 754 if (ddi_strtoul(value, NULL, 10, &new_value) != 0 || 755 new_value < rtspa->rts_param_min || 756 new_value > rtspa->rts_param_max) { 757 return (EINVAL); 758 } 759 760 /* Set the new value */ 761 rtspa->rts_param_value = new_value; 762 return (0); 763 } 764 765 /* 766 * Empty rsrv routine which is used by rts_input to cause a wakeup 767 * of a thread in qwait. 768 */ 769 /*ARGSUSED*/ 770 static void 771 rts_rsrv(queue_t *q) 772 { 773 } 774 775 /* 776 * This routine handles synchronous messages passed downstream. It either 777 * consumes the message or passes it downstream; it never queues a 778 * a message. The data messages that go down are wrapped in an IOCTL 779 * message. 780 * 781 * Since it is synchronous, it waits for the M_IOCACK/M_IOCNAK so that 782 * it can return an immediate error (such as ENETUNREACH when adding a route). 783 * It uses the RTS_WRW_PENDING to ensure that each rts instance has only 784 * one M_IOCTL outstanding at any given time. 785 */ 786 static int 787 rts_wrw(queue_t *q, struiod_t *dp) 788 { 789 mblk_t *mp = dp->d_mp; 790 mblk_t *mp1; 791 int error; 792 rt_msghdr_t *rtm; 793 conn_t *connp = Q_TO_CONN(q); 794 rts_t *rts = connp->conn_rts; 795 796 while (rts->rts_flag & RTS_WRW_PENDING) { 797 if (qwait_rw(q)) { 798 rts->rts_error = EINTR; 799 goto err_ret; 800 } 801 } 802 rts->rts_flag |= RTS_WRW_PENDING; 803 804 if (isuioq(q) && (error = struioget(q, mp, dp, 0))) { 805 /* 806 * Uio error of some sort, so just return the error. 807 */ 808 rts->rts_error = error; 809 goto err_ret; 810 } 811 /* 812 * Pass the mblk (chain) onto wput(). 813 */ 814 dp->d_mp = 0; 815 816 switch (mp->b_datap->db_type) { 817 case M_PROTO: 818 case M_PCPROTO: 819 /* Expedite other than T_DATA_REQ to below the switch */ 820 if (((mp->b_wptr - mp->b_rptr) != 821 sizeof (struct T_data_req)) || 822 (((union T_primitives *)mp->b_rptr)->type != T_DATA_REQ)) 823 break; 824 if ((mp1 = mp->b_cont) == NULL) { 825 rts->rts_error = EINVAL; 826 goto err_ret; 827 } 828 freeb(mp); 829 mp = mp1; 830 /* FALLTHRU */ 831 case M_DATA: 832 /* 833 * The semantics of the routing socket is such that the rtm_pid 834 * field is automatically filled in during requests with the 835 * current process' pid. We do this here (where we still have 836 * user context) after checking we have at least a message the 837 * size of a routing message header. 838 */ 839 if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) { 840 if (!pullupmsg(mp, sizeof (rt_msghdr_t))) { 841 rts->rts_error = EINVAL; 842 goto err_ret; 843 } 844 } 845 rtm = (rt_msghdr_t *)mp->b_rptr; 846 rtm->rtm_pid = curproc->p_pid; 847 break; 848 default: 849 break; 850 } 851 rts->rts_flag |= RTS_WPUT_PENDING; 852 rts_wput(q, mp); 853 while (rts->rts_flag & RTS_WPUT_PENDING) 854 if (qwait_rw(q)) { 855 /* RTS_WPUT_PENDING will be cleared below */ 856 rts->rts_error = EINTR; 857 break; 858 } 859 err_ret: 860 rts->rts_flag &= ~(RTS_WPUT_PENDING | RTS_WRW_PENDING); 861 return (rts->rts_error); 862 } 863 864 /* 865 * This routine handles all messages passed downstream. It either 866 * consumes the message or passes it downstream; it never queues a 867 * a message. The data messages that go down are wrapped in an IOCTL 868 * message. 869 * 870 * FIXME? Should we call IP rts_request directly? Could punt on returning 871 * errno in the case when it defers processing due to 872 * IPIF_CHANGING/ILL_CHANGING??? 873 */ 874 static void 875 rts_wput(queue_t *q, mblk_t *mp) 876 { 877 uchar_t *rptr = mp->b_rptr; 878 mblk_t *mp1; 879 conn_t *connp = Q_TO_CONN(q); 880 rts_t *rts = connp->conn_rts; 881 882 switch (mp->b_datap->db_type) { 883 case M_DATA: 884 break; 885 case M_PROTO: 886 case M_PCPROTO: 887 if ((mp->b_wptr - rptr) == sizeof (struct T_data_req)) { 888 /* Expedite valid T_DATA_REQ to below the switch */ 889 if (((union T_primitives *)rptr)->type == T_DATA_REQ) { 890 mp1 = mp->b_cont; 891 freeb(mp); 892 if (mp1 == NULL) 893 return; 894 mp = mp1; 895 break; 896 } 897 } 898 /* FALLTHRU */ 899 default: 900 rts_wput_other(q, mp); 901 return; 902 } 903 904 905 mp1 = rts_ioctl_alloc(mp, DB_CRED(mp)); 906 if (mp1 == NULL) { 907 ASSERT(rts != NULL); 908 freemsg(mp); 909 if (rts->rts_flag & RTS_WPUT_PENDING) { 910 rts->rts_error = ENOMEM; 911 rts->rts_flag &= ~RTS_WPUT_PENDING; 912 } 913 return; 914 } 915 ip_output(connp, mp1, q, IP_WPUT); 916 } 917 918 919 /* 920 * Handles all the control message, if it 921 * can not understand it, it will 922 * pass down stream. 923 */ 924 static void 925 rts_wput_other(queue_t *q, mblk_t *mp) 926 { 927 conn_t *connp = Q_TO_CONN(q); 928 rts_t *rts = connp->conn_rts; 929 uchar_t *rptr = mp->b_rptr; 930 struct iocblk *iocp; 931 cred_t *cr; 932 rts_stack_t *rtss; 933 934 rtss = rts->rts_rtss; 935 936 cr = DB_CREDDEF(mp, connp->conn_cred); 937 938 switch (mp->b_datap->db_type) { 939 case M_PROTO: 940 case M_PCPROTO: 941 if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) { 942 /* 943 * If the message does not contain a PRIM_type, 944 * throw it away. 945 */ 946 freemsg(mp); 947 return; 948 } 949 switch (((union T_primitives *)rptr)->type) { 950 case T_BIND_REQ: 951 case O_T_BIND_REQ: 952 rts_bind(q, mp); 953 return; 954 case T_UNBIND_REQ: 955 rts_unbind(q, mp); 956 return; 957 case T_CAPABILITY_REQ: 958 rts_capability_req(q, mp); 959 return; 960 case T_INFO_REQ: 961 rts_info_req(q, mp); 962 return; 963 case T_SVR4_OPTMGMT_REQ: 964 (void) svr4_optcom_req(q, mp, cr, &rts_opt_obj, 965 B_TRUE); 966 return; 967 case T_OPTMGMT_REQ: 968 (void) tpi_optcom_req(q, mp, cr, &rts_opt_obj, B_TRUE); 969 return; 970 case O_T_CONN_RES: 971 case T_CONN_RES: 972 case T_DISCON_REQ: 973 /* Not supported by rts. */ 974 rts_err_ack(q, mp, TNOTSUPPORT, 0); 975 return; 976 case T_DATA_REQ: 977 case T_EXDATA_REQ: 978 case T_ORDREL_REQ: 979 /* Illegal for rts. */ 980 freemsg(mp); 981 (void) putnextctl1(RD(q), M_ERROR, EPROTO); 982 return; 983 default: 984 break; 985 } 986 break; 987 case M_IOCTL: 988 iocp = (struct iocblk *)mp->b_rptr; 989 switch (iocp->ioc_cmd) { 990 case ND_SET: 991 case ND_GET: 992 if (nd_getset(q, rtss->rtss_g_nd, mp)) { 993 qreply(q, mp); 994 return; 995 } 996 break; 997 case TI_GETPEERNAME: 998 mi_copyin(q, mp, NULL, 999 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 1000 return; 1001 default: 1002 break; 1003 } 1004 case M_IOCDATA: 1005 rts_wput_iocdata(q, mp); 1006 return; 1007 default: 1008 break; 1009 } 1010 ip_output(connp, mp, q, IP_WPUT); 1011 } 1012 1013 /* 1014 * Called by rts_wput_other to handle all M_IOCDATA messages. 1015 */ 1016 static void 1017 rts_wput_iocdata(queue_t *q, mblk_t *mp) 1018 { 1019 conn_t *connp = Q_TO_CONN(q); 1020 struct sockaddr *rtsaddr; 1021 mblk_t *mp1; 1022 STRUCT_HANDLE(strbuf, sb); 1023 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 1024 1025 /* Make sure it is one of ours. */ 1026 switch (iocp->ioc_cmd) { 1027 case TI_GETPEERNAME: 1028 break; 1029 default: 1030 ip_output(connp, mp, q, IP_WPUT); 1031 return; 1032 } 1033 switch (mi_copy_state(q, mp, &mp1)) { 1034 case -1: 1035 return; 1036 case MI_COPY_CASE(MI_COPY_IN, 1): 1037 break; 1038 case MI_COPY_CASE(MI_COPY_OUT, 1): 1039 /* Copy out the strbuf. */ 1040 mi_copyout(q, mp); 1041 return; 1042 case MI_COPY_CASE(MI_COPY_OUT, 2): 1043 /* All done. */ 1044 mi_copy_done(q, mp, 0); 1045 return; 1046 default: 1047 mi_copy_done(q, mp, EPROTO); 1048 return; 1049 } 1050 STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr); 1051 if (STRUCT_FGET(sb, maxlen) < (int)sizeof (sin_t)) { 1052 mi_copy_done(q, mp, EINVAL); 1053 return; 1054 } 1055 switch (iocp->ioc_cmd) { 1056 case TI_GETPEERNAME: 1057 break; 1058 default: 1059 mi_copy_done(q, mp, EPROTO); 1060 return; 1061 } 1062 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), sizeof (sin_t), 1063 B_TRUE); 1064 if (mp1 == NULL) 1065 return; 1066 STRUCT_FSET(sb, len, (int)sizeof (sin_t)); 1067 rtsaddr = (struct sockaddr *)mp1->b_rptr; 1068 mp1->b_wptr = (uchar_t *)&rtsaddr[1]; 1069 bzero(rtsaddr, sizeof (struct sockaddr)); 1070 rtsaddr->sa_family = AF_ROUTE; 1071 /* Copy out the address */ 1072 mi_copyout(q, mp); 1073 } 1074 1075 /*ARGSUSED2*/ 1076 static void 1077 rts_input(void *arg1, mblk_t *mp, void *arg2) 1078 { 1079 conn_t *connp = (conn_t *)arg1; 1080 rts_t *rts = connp->conn_rts; 1081 struct iocblk *iocp; 1082 mblk_t *mp1; 1083 struct T_data_ind *tdi; 1084 1085 switch (mp->b_datap->db_type) { 1086 case M_IOCACK: 1087 case M_IOCNAK: 1088 iocp = (struct iocblk *)mp->b_rptr; 1089 if (rts->rts_flag & (RTS_WPUT_PENDING)) { 1090 rts->rts_flag &= ~RTS_WPUT_PENDING; 1091 rts->rts_error = iocp->ioc_error; 1092 /* 1093 * Tell rts_wvw/qwait that we are done. 1094 * Note: there is no qwait_wakeup() we can use. 1095 */ 1096 qenable(connp->conn_rq); 1097 freemsg(mp); 1098 return; 1099 } 1100 break; 1101 case M_DATA: 1102 /* 1103 * Prepend T_DATA_IND to prevent the stream head from 1104 * consolidating multiple messages together. 1105 * If the allocation fails just send up the M_DATA. 1106 */ 1107 mp1 = allocb(sizeof (*tdi), BPRI_MED); 1108 if (mp1 != NULL) { 1109 mp1->b_cont = mp; 1110 mp = mp1; 1111 1112 mp->b_datap->db_type = M_PROTO; 1113 mp->b_wptr += sizeof (*tdi); 1114 tdi = (struct T_data_ind *)mp->b_rptr; 1115 tdi->PRIM_type = T_DATA_IND; 1116 tdi->MORE_flag = 0; 1117 } 1118 break; 1119 default: 1120 break; 1121 } 1122 putnext(connp->conn_rq, mp); 1123 } 1124 1125 1126 void 1127 rts_ddi_init(void) 1128 { 1129 rts_max_optsize = optcom_max_optsize(rts_opt_obj.odb_opt_des_arr, 1130 rts_opt_obj.odb_opt_arr_cnt); 1131 1132 /* 1133 * We want to be informed each time a stack is created or 1134 * destroyed in the kernel, so we can maintain the 1135 * set of rts_stack_t's. 1136 */ 1137 netstack_register(NS_RTS, rts_stack_init, NULL, rts_stack_fini); 1138 } 1139 1140 void 1141 rts_ddi_destroy(void) 1142 { 1143 netstack_unregister(NS_RTS); 1144 } 1145 1146 /* 1147 * Initialize the RTS stack instance. 1148 */ 1149 /* ARGSUSED */ 1150 static void * 1151 rts_stack_init(netstackid_t stackid, netstack_t *ns) 1152 { 1153 rts_stack_t *rtss; 1154 rtsparam_t *pa; 1155 1156 rtss = (rts_stack_t *)kmem_zalloc(sizeof (*rtss), KM_SLEEP); 1157 rtss->rtss_netstack = ns; 1158 1159 pa = (rtsparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP); 1160 rtss->rtss_params = pa; 1161 bcopy(lcl_param_arr, rtss->rtss_params, sizeof (lcl_param_arr)); 1162 1163 (void) rts_param_register(&rtss->rtss_g_nd, 1164 rtss->rtss_params, A_CNT(lcl_param_arr)); 1165 return (rtss); 1166 } 1167 1168 /* 1169 * Free the RTS stack instance. 1170 */ 1171 /* ARGSUSED */ 1172 static void 1173 rts_stack_fini(netstackid_t stackid, void *arg) 1174 { 1175 rts_stack_t *rtss = (rts_stack_t *)arg; 1176 1177 nd_free(&rtss->rtss_g_nd); 1178 kmem_free(rtss->rtss_params, sizeof (lcl_param_arr)); 1179 rtss->rtss_params = NULL; 1180 kmem_free(rtss, sizeof (*rtss)); 1181 } 1182