1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsubr.h> 31 #include <sys/stropts.h> 32 #include <sys/strsun.h> 33 #include <sys/strlog.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/timod.h> 37 #include <sys/ddi.h> 38 #include <sys/sunddi.h> 39 #include <sys/cmn_err.h> 40 #include <sys/proc.h> 41 #include <sys/suntpi.h> 42 #include <sys/policy.h> 43 #include <sys/zone.h> 44 45 #include <sys/socket.h> 46 #include <netinet/in.h> 47 48 #include <inet/common.h> 49 #include <netinet/ip6.h> 50 #include <inet/ip.h> 51 #include <inet/ipclassifier.h> 52 #include <inet/mi.h> 53 #include <inet/nd.h> 54 #include <inet/optcom.h> 55 #include <netinet/ip_mroute.h> 56 #include <sys/isa_defs.h> 57 #include <net/route.h> 58 59 #include <inet/rts_impl.h> 60 #include <inet/ip_rts.h> 61 62 /* 63 * This is a transport provider for routing sockets. Downstream messages are 64 * wrapped with a IP_IOCTL header, and ip_wput_ioctl calls the appropriate entry 65 * in the ip_ioctl_ftbl callout table to pass the routing socket data into IP. 66 * Upstream messages are generated for listeners of the routing socket as well 67 * as the message sender (unless they have turned off their end using 68 * SO_USELOOPBACK or shutdown(3n)). Upstream messages may also be generated 69 * asynchronously when: 70 * 71 * Interfaces are brought up or down. 72 * Addresses are assigned to interfaces. 73 * ICMP redirects are processed and a IRE_HOST/RTF_DYNAMIC is installed. 74 * No route is found while sending a packet. 75 * When TCP requests IP to remove an IRE_CACHE of a troubled destination. 76 * 77 * Since all we do is reformat the messages between routing socket and 78 * ioctl forms, no synchronization is necessary in this module; all 79 * the dirty work is done down in ip. 80 */ 81 82 /* Default structure copied into T_INFO_ACK messages */ 83 static struct T_info_ack rts_g_t_info_ack = { 84 T_INFO_ACK, 85 T_INFINITE, /* TSDU_size. Maximum size messages. */ 86 T_INVALID, /* ETSDU_size. No expedited data. */ 87 T_INVALID, /* CDATA_size. No connect data. */ 88 T_INVALID, /* DDATA_size. No disconnect data. */ 89 0, /* ADDR_size. */ 90 0, /* OPT_size - not initialized here */ 91 64 * 1024, /* TIDU_size. rts allows maximum size messages. */ 92 T_COTS, /* SERV_type. rts supports connection oriented. */ 93 TS_UNBND, /* CURRENT_state. This is set from rts_state. */ 94 (XPG4_1) /* PROVIDER_flag */ 95 }; 96 97 /* 98 * Table of ND variables supported by rts. These are loaded into rts_g_nd 99 * in rts_open. 100 * All of these are alterable, within the min/max values given, at run time. 101 */ 102 static rtsparam_t lcl_param_arr[] = { 103 /* min max value name */ 104 { 4096, 65536, 8192, "rts_xmit_hiwat"}, 105 { 0, 65536, 1024, "rts_xmit_lowat"}, 106 { 4096, 65536, 8192, "rts_recv_hiwat"}, 107 { 65536, 1024*1024*1024, 256*1024, "rts_max_buf"}, 108 }; 109 #define rtss_xmit_hiwat rtss_params[0].rts_param_value 110 #define rtss_xmit_lowat rtss_params[1].rts_param_value 111 #define rtss_recv_hiwat rtss_params[2].rts_param_value 112 #define rtss_max_buf rtss_params[3].rts_param_value 113 114 static int rts_close(queue_t *q); 115 static void rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, 116 int sys_error); 117 static void rts_input(void *, mblk_t *, void *); 118 static mblk_t *rts_ioctl_alloc(mblk_t *data, cred_t *cr); 119 static int rts_open(queue_t *q, dev_t *devp, int flag, int sflag, 120 cred_t *credp); 121 int rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, 122 uchar_t *ptr); 123 int rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, 124 uchar_t *ptr); 125 int rts_opt_set(queue_t *q, uint_t optset_context, int level, 126 int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, 127 uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk); 128 static int rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); 129 static boolean_t rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt); 130 static int rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 131 cred_t *cr); 132 static void rts_rsrv(queue_t *q); 133 static void *rts_stack_init(netstackid_t stackid, netstack_t *ns); 134 static void rts_stack_fini(netstackid_t stackid, void *arg); 135 static void rts_wput(queue_t *q, mblk_t *mp); 136 static void rts_wput_iocdata(queue_t *q, mblk_t *mp); 137 static void rts_wput_other(queue_t *q, mblk_t *mp); 138 static int rts_wrw(queue_t *q, struiod_t *dp); 139 140 static struct module_info rts_mod_info = { 141 129, "rts", 1, INFPSZ, 512, 128 142 }; 143 144 static struct qinit rtsrinit = { 145 NULL, (pfi_t)rts_rsrv, rts_open, rts_close, NULL, &rts_mod_info 146 }; 147 148 static struct qinit rtswinit = { 149 (pfi_t)rts_wput, NULL, NULL, NULL, NULL, &rts_mod_info, 150 NULL, (pfi_t)rts_wrw, NULL, STRUIOT_STANDARD 151 }; 152 153 struct streamtab rtsinfo = { 154 &rtsrinit, &rtswinit 155 }; 156 157 /* 158 * This routine allocates the necessary 159 * message blocks for IOCTL wrapping the 160 * user data. 161 */ 162 static mblk_t * 163 rts_ioctl_alloc(mblk_t *data, cred_t *cr) 164 { 165 mblk_t *mp = NULL; 166 mblk_t *mp1 = NULL; 167 ipllc_t *ipllc; 168 struct iocblk *ioc; 169 170 mp = allocb_cred(sizeof (ipllc_t), cr); 171 if (mp == NULL) 172 return (NULL); 173 mp1 = allocb_cred(sizeof (struct iocblk), cr); 174 if (mp1 == NULL) { 175 freeb(mp); 176 return (NULL); 177 } 178 179 ipllc = (ipllc_t *)mp->b_rptr; 180 ipllc->ipllc_cmd = IP_IOC_RTS_REQUEST; 181 ipllc->ipllc_name_offset = 0; 182 ipllc->ipllc_name_length = 0; 183 mp->b_wptr += sizeof (ipllc_t); 184 mp->b_cont = data; 185 186 ioc = (struct iocblk *)mp1->b_rptr; 187 ioc->ioc_cmd = IP_IOCTL; 188 ioc->ioc_error = 0; 189 ioc->ioc_cr = NULL; 190 ioc->ioc_count = msgdsize(mp); 191 mp1->b_wptr += sizeof (struct iocblk); 192 mp1->b_datap->db_type = M_IOCTL; 193 mp1->b_cont = mp; 194 195 return (mp1); 196 } 197 198 /* 199 * This routine closes rts stream, by disabling 200 * put/srv routines and freeing the this module 201 * internal datastructure. 202 */ 203 static int 204 rts_close(queue_t *q) 205 { 206 conn_t *connp = Q_TO_CONN(q); 207 208 ASSERT(connp != NULL && IPCL_IS_RTS(connp)); 209 210 ip_rts_unregister(connp); 211 212 ip_quiesce_conn(connp); 213 214 qprocsoff(q); 215 216 /* 217 * Now we are truly single threaded on this stream, and can 218 * delete the things hanging off the connp, and finally the connp. 219 * We removed this connp from the fanout list, it cannot be 220 * accessed thru the fanouts, and we already waited for the 221 * conn_ref to drop to 0. We are already in close, so 222 * there cannot be any other thread from the top. qprocsoff 223 * has completed, and service has completed or won't run in 224 * future. 225 */ 226 ASSERT(connp->conn_ref == 1); 227 228 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 229 230 connp->conn_ref--; 231 ipcl_conn_destroy(connp); 232 233 q->q_ptr = WR(q)->q_ptr = NULL; 234 return (0); 235 } 236 237 /* 238 * This is the open routine for routing socket. It allocates 239 * rts_t structure for the stream and tells IP that it is a routing socket. 240 */ 241 /* ARGSUSED */ 242 static int 243 rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 244 { 245 rts_t *rts; 246 conn_t *connp; 247 dev_t conn_dev; 248 zoneid_t zoneid; 249 netstack_t *ns; 250 rts_stack_t *rtss; 251 252 /* If the stream is already open, return immediately. */ 253 if (q->q_ptr != NULL) 254 return (0); 255 256 if (sflag == MODOPEN) 257 return (EINVAL); 258 259 ns = netstack_find_by_cred(credp); 260 ASSERT(ns != NULL); 261 rtss = ns->netstack_rts; 262 ASSERT(rtss != NULL); 263 264 /* 265 * For exclusive stacks we set the zoneid to zero 266 * to make RTS operate as if in the global zone. 267 */ 268 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 269 zoneid = GLOBAL_ZONEID; 270 else 271 zoneid = crgetzoneid(credp); 272 273 /* 274 * Since RTS is not used so heavily, allocating from the small 275 * arena should be sufficient. 276 */ 277 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 278 netstack_rele(ns); 279 return (EBUSY); 280 } 281 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 282 283 connp = ipcl_conn_create(IPCL_RTSCONN, KM_SLEEP, ns); 284 connp->conn_dev = conn_dev; 285 connp->conn_minor_arena = ip_minor_arena_sa; 286 rts = connp->conn_rts; 287 288 /* 289 * ipcl_conn_create did a netstack_hold. Undo the hold that was 290 * done by netstack_find_by_cred() 291 */ 292 netstack_rele(ns); 293 294 /* 295 * Initialize the rts_t structure for this stream. 296 */ 297 q->q_ptr = connp; 298 WR(q)->q_ptr = connp; 299 connp->conn_rq = q; 300 connp->conn_wq = WR(q); 301 302 rw_enter(&rts->rts_rwlock, RW_WRITER); 303 ASSERT(connp->conn_rts == rts); 304 ASSERT(rts->rts_connp == connp); 305 306 /* Set the initial state of the stream and the privilege status. */ 307 rts->rts_state = TS_UNBND; 308 connp->conn_zoneid = zoneid; 309 310 connp->conn_ulp_labeled = is_system_labeled(); 311 312 rts->rts_rtss = rtss; 313 314 q->q_hiwat = rtss->rtss_recv_hiwat; 315 WR(q)->q_hiwat = rtss->rtss_xmit_hiwat; 316 WR(q)->q_lowat = rtss->rtss_xmit_lowat; 317 318 connp->conn_recv = rts_input; 319 crhold(credp); 320 connp->conn_cred = credp; 321 322 mutex_enter(&connp->conn_lock); 323 connp->conn_state_flags &= ~CONN_INCIPIENT; 324 mutex_exit(&connp->conn_lock); 325 326 qprocson(q); 327 rw_exit(&rts->rts_rwlock); 328 329 /* 330 * Indicate the down IP module that this is a routing socket 331 * client by sending an RTS IOCTL without any user data. Although 332 * this is just a notification message (without any real routing 333 * request), we pass in any credential for correctness sake. 334 */ 335 ip_rts_register(connp); 336 337 return (0); 338 339 } 340 341 /* 342 * This routine creates a T_ERROR_ACK message and passes it upstream. 343 */ 344 static void 345 rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) 346 { 347 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 348 qreply(q, mp); 349 } 350 351 /* 352 * This routine creates a T_OK_ACK message and passes it upstream. 353 */ 354 static void 355 rts_ok_ack(queue_t *q, mblk_t *mp) 356 { 357 if ((mp = mi_tpi_ok_ack_alloc(mp)) != NULL) 358 qreply(q, mp); 359 } 360 361 /* 362 * This routine is called by rts_wput to handle T_UNBIND_REQ messages. 363 */ 364 static void 365 rts_unbind(queue_t *q, mblk_t *mp) 366 { 367 conn_t *connp = Q_TO_CONN(q); 368 rts_t *rts = connp->conn_rts; 369 370 /* If a bind has not been done, we can't unbind. */ 371 if (rts->rts_state != TS_IDLE) { 372 rts_err_ack(q, mp, TOUTSTATE, 0); 373 return; 374 } 375 rts->rts_state = TS_UNBND; 376 rts_ok_ack(q, mp); 377 } 378 379 /* 380 * This routine is called to handle each 381 * O_T_BIND_REQ/T_BIND_REQ message passed to 382 * rts_wput. Note: This routine works with both 383 * O_T_BIND_REQ and T_BIND_REQ semantics. 384 */ 385 static void 386 rts_bind(queue_t *q, mblk_t *mp) 387 { 388 conn_t *connp = Q_TO_CONN(q); 389 rts_t *rts = connp->conn_rts; 390 mblk_t *mp1; 391 struct T_bind_req *tbr; 392 393 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 394 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 395 "rts_bind: bad data, %d", rts->rts_state); 396 rts_err_ack(q, mp, TBADADDR, 0); 397 return; 398 } 399 if (rts->rts_state != TS_UNBND) { 400 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 401 "rts_bind: bad state, %d", rts->rts_state); 402 rts_err_ack(q, mp, TOUTSTATE, 0); 403 return; 404 } 405 /* 406 * Reallocate the message to make sure we have enough room for an 407 * address and the protocol type. 408 */ 409 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin_t), 1); 410 if (mp1 == NULL) { 411 rts_err_ack(q, mp, TSYSERR, ENOMEM); 412 return; 413 } 414 mp = mp1; 415 tbr = (struct T_bind_req *)mp->b_rptr; 416 if (tbr->ADDR_length != 0) { 417 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, 418 "rts_bind: bad ADDR_length %d", tbr->ADDR_length); 419 rts_err_ack(q, mp, TBADADDR, 0); 420 return; 421 } 422 /* Generic request */ 423 tbr->ADDR_offset = (t_scalar_t)sizeof (struct T_bind_req); 424 tbr->ADDR_length = 0; 425 tbr->PRIM_type = T_BIND_ACK; 426 rts->rts_state = TS_IDLE; 427 qreply(q, mp); 428 } 429 430 static void 431 rts_copy_info(struct T_info_ack *tap, rts_t *rts) 432 { 433 *tap = rts_g_t_info_ack; 434 tap->CURRENT_state = rts->rts_state; 435 tap->OPT_size = rts_max_optsize; 436 } 437 438 /* 439 * This routine responds to T_CAPABILITY_REQ messages. It is called by 440 * rts_wput. Much of the T_CAPABILITY_ACK information is copied from 441 * rts_g_t_info_ack. The current state of the stream is copied from 442 * rts_state. 443 */ 444 static void 445 rts_capability_req(queue_t *q, mblk_t *mp) 446 { 447 conn_t *connp = Q_TO_CONN(q); 448 rts_t *rts = connp->conn_rts; 449 t_uscalar_t cap_bits1; 450 struct T_capability_ack *tcap; 451 452 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 453 454 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 455 mp->b_datap->db_type, T_CAPABILITY_ACK); 456 if (mp == NULL) 457 return; 458 459 tcap = (struct T_capability_ack *)mp->b_rptr; 460 tcap->CAP_bits1 = 0; 461 462 if (cap_bits1 & TC1_INFO) { 463 rts_copy_info(&tcap->INFO_ack, rts); 464 tcap->CAP_bits1 |= TC1_INFO; 465 } 466 467 qreply(q, mp); 468 } 469 470 /* 471 * This routine responds to T_INFO_REQ messages. It is called by rts_wput. 472 * Most of the T_INFO_ACK information is copied from rts_g_t_info_ack. 473 * The current state of the stream is copied from rts_state. 474 */ 475 static void 476 rts_info_req(queue_t *q, mblk_t *mp) 477 { 478 conn_t *connp = Q_TO_CONN(q); 479 rts_t *rts = connp->conn_rts; 480 481 mp = tpi_ack_alloc(mp, sizeof (rts_g_t_info_ack), M_PCPROTO, 482 T_INFO_ACK); 483 if (mp == NULL) 484 return; 485 rts_copy_info((struct T_info_ack *)mp->b_rptr, rts); 486 qreply(q, mp); 487 } 488 489 /* 490 * This routine gets default values of certain options whose default 491 * values are maintained by protcol specific code 492 */ 493 /* ARGSUSED */ 494 int 495 rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 496 { 497 /* no default value processed by protocol specific code currently */ 498 return (-1); 499 } 500 501 /* 502 * This routine retrieves the current status of socket options. 503 * It returns the size of the option retrieved. 504 */ 505 int 506 rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) 507 { 508 int *i1 = (int *)ptr; 509 conn_t *connp = Q_TO_CONN(q); 510 rts_t *rts = connp->conn_rts; 511 512 switch (level) { 513 case SOL_SOCKET: 514 switch (name) { 515 case SO_DEBUG: 516 *i1 = rts->rts_debug; 517 break; 518 case SO_REUSEADDR: 519 *i1 = rts->rts_reuseaddr; 520 break; 521 case SO_TYPE: 522 *i1 = SOCK_RAW; 523 break; 524 525 /* 526 * The following three items are available here, 527 * but are only meaningful to IP. 528 */ 529 case SO_DONTROUTE: 530 *i1 = rts->rts_dontroute; 531 break; 532 case SO_USELOOPBACK: 533 *i1 = rts->rts_useloopback; 534 break; 535 case SO_BROADCAST: 536 *i1 = rts->rts_broadcast; 537 break; 538 case SO_PROTOTYPE: 539 *i1 = rts->rts_proto; 540 break; 541 /* 542 * The following two items can be manipulated, 543 * but changing them should do nothing. 544 */ 545 case SO_SNDBUF: 546 ASSERT(q->q_hiwat <= INT_MAX); 547 *i1 = (int)(q->q_hiwat); 548 break; 549 case SO_RCVBUF: 550 ASSERT(q->q_hiwat <= INT_MAX); 551 *i1 = (int)(RD(q)->q_hiwat); 552 break; 553 case SO_DOMAIN: 554 *i1 = PF_ROUTE; 555 break; 556 default: 557 return (-1); 558 } 559 break; 560 default: 561 return (-1); 562 } 563 return ((int)sizeof (int)); 564 } 565 566 567 /* 568 * This routine sets socket options. 569 */ 570 /*ARGSUSED*/ 571 int 572 rts_opt_set(queue_t *q, uint_t optset_context, int level, 573 int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, 574 uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) 575 { 576 int *i1 = (int *)invalp; 577 conn_t *connp = Q_TO_CONN(q); 578 rts_t *rts = connp->conn_rts; 579 boolean_t checkonly; 580 rts_stack_t *rtss = rts->rts_rtss; 581 582 switch (optset_context) { 583 case SETFN_OPTCOM_CHECKONLY: 584 checkonly = B_TRUE; 585 /* 586 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 587 * inlen != 0 implies value supplied and 588 * we have to "pretend" to set it. 589 * inlen == 0 implies that there is no 590 * value part in T_CHECK request and just validation 591 * done elsewhere should be enough, we just return here. 592 */ 593 if (inlen == 0) { 594 *outlenp = 0; 595 return (0); 596 } 597 break; 598 case SETFN_OPTCOM_NEGOTIATE: 599 checkonly = B_FALSE; 600 break; 601 case SETFN_UD_NEGOTIATE: 602 case SETFN_CONN_NEGOTIATE: 603 checkonly = B_FALSE; 604 /* 605 * Negotiating local and "association-related" options 606 * through T_UNITDATA_REQ or T_CONN_{REQ,CON} 607 * Not allowed in this module. 608 */ 609 return (EINVAL); 610 default: 611 /* 612 * We should never get here 613 */ 614 *outlenp = 0; 615 return (EINVAL); 616 } 617 618 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 619 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 620 621 /* 622 * For rts, we should have no ancillary data sent down 623 * (rts_wput doesn't handle options). 624 */ 625 ASSERT(thisdg_attrs == NULL); 626 627 /* 628 * For fixed length options, no sanity check 629 * of passed in length is done. It is assumed *_optcom_req() 630 * routines do the right thing. 631 */ 632 633 switch (level) { 634 case SOL_SOCKET: 635 switch (name) { 636 case SO_REUSEADDR: 637 if (!checkonly) 638 rts->rts_reuseaddr = *i1; 639 break; /* goto sizeof (int) option return */ 640 case SO_DEBUG: 641 if (!checkonly) 642 rts->rts_debug = *i1; 643 break; /* goto sizeof (int) option return */ 644 /* 645 * The following three items are available here, 646 * but are only meaningful to IP. 647 */ 648 case SO_DONTROUTE: 649 if (!checkonly) 650 rts->rts_dontroute = *i1; 651 break; /* goto sizeof (int) option return */ 652 case SO_USELOOPBACK: 653 if (!checkonly) 654 rts->rts_useloopback = *i1; 655 break; /* goto sizeof (int) option return */ 656 case SO_BROADCAST: 657 if (!checkonly) 658 rts->rts_broadcast = *i1; 659 break; /* goto sizeof (int) option return */ 660 case SO_PROTOTYPE: 661 /* 662 * Routing socket applications that call socket() with 663 * a third argument can filter which messages will be 664 * sent upstream thanks to sockfs. so_socket() sends 665 * down the SO_PROTOTYPE and rts_queue_input() 666 * implements the filtering. 667 */ 668 if (*i1 != AF_INET && *i1 != AF_INET6) 669 return (EPROTONOSUPPORT); 670 if (!checkonly) 671 rts->rts_proto = *i1; 672 break; /* goto sizeof (int) option return */ 673 /* 674 * The following two items can be manipulated, 675 * but changing them should do nothing. 676 */ 677 case SO_SNDBUF: 678 if (*i1 > rtss->rtss_max_buf) { 679 *outlenp = 0; 680 return (ENOBUFS); 681 } 682 if (!checkonly) { 683 q->q_hiwat = *i1; 684 } 685 break; /* goto sizeof (int) option return */ 686 case SO_RCVBUF: 687 if (*i1 > rtss->rtss_max_buf) { 688 *outlenp = 0; 689 return (ENOBUFS); 690 } 691 if (!checkonly) { 692 RD(q)->q_hiwat = *i1; 693 (void) mi_set_sth_hiwat(RD(q), *i1); 694 } 695 break; /* goto sizeof (int) option return */ 696 default: 697 *outlenp = 0; 698 return (EINVAL); 699 } 700 break; 701 default: 702 *outlenp = 0; 703 return (EINVAL); 704 } 705 /* 706 * Common case of return from an option that is sizeof (int) 707 */ 708 *(int *)outvalp = *i1; 709 *outlenp = (t_uscalar_t)sizeof (int); 710 return (0); 711 } 712 713 /* 714 * This routine retrieves the value of an ND variable in a rtsparam_t 715 * structure. It is called through nd_getset when a user reads the 716 * variable. 717 */ 718 /* ARGSUSED */ 719 static int 720 rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 721 { 722 rtsparam_t *rtspa = (rtsparam_t *)cp; 723 724 (void) mi_mpprintf(mp, "%u", rtspa->rts_param_value); 725 return (0); 726 } 727 728 /* 729 * Walk through the param array specified registering each element with the 730 * named dispatch (ND) handler. 731 */ 732 static boolean_t 733 rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt) 734 { 735 for (; cnt-- > 0; rtspa++) { 736 if (rtspa->rts_param_name != NULL && rtspa->rts_param_name[0]) { 737 if (!nd_load(ndp, rtspa->rts_param_name, 738 rts_param_get, rts_param_set, (caddr_t)rtspa)) { 739 nd_free(ndp); 740 return (B_FALSE); 741 } 742 } 743 } 744 return (B_TRUE); 745 } 746 747 /* This routine sets an ND variable in a rtsparam_t structure. */ 748 /* ARGSUSED */ 749 static int 750 rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) 751 { 752 ulong_t new_value; 753 rtsparam_t *rtspa = (rtsparam_t *)cp; 754 755 /* 756 * Fail the request if the new value does not lie within the 757 * required bounds. 758 */ 759 if (ddi_strtoul(value, NULL, 10, &new_value) != 0 || 760 new_value < rtspa->rts_param_min || 761 new_value > rtspa->rts_param_max) { 762 return (EINVAL); 763 } 764 765 /* Set the new value */ 766 rtspa->rts_param_value = new_value; 767 return (0); 768 } 769 770 /* 771 * Empty rsrv routine which is used by rts_input to cause a wakeup 772 * of a thread in qwait. 773 */ 774 /*ARGSUSED*/ 775 static void 776 rts_rsrv(queue_t *q) 777 { 778 } 779 780 /* 781 * This routine handles synchronous messages passed downstream. It either 782 * consumes the message or passes it downstream; it never queues a 783 * a message. The data messages that go down are wrapped in an IOCTL 784 * message. 785 * 786 * Since it is synchronous, it waits for the M_IOCACK/M_IOCNAK so that 787 * it can return an immediate error (such as ENETUNREACH when adding a route). 788 * It uses the RTS_WRW_PENDING to ensure that each rts instance has only 789 * one M_IOCTL outstanding at any given time. 790 */ 791 static int 792 rts_wrw(queue_t *q, struiod_t *dp) 793 { 794 mblk_t *mp = dp->d_mp; 795 mblk_t *mp1; 796 int error; 797 rt_msghdr_t *rtm; 798 conn_t *connp = Q_TO_CONN(q); 799 rts_t *rts = connp->conn_rts; 800 801 while (rts->rts_flag & RTS_WRW_PENDING) { 802 if (qwait_rw(q)) { 803 rts->rts_error = EINTR; 804 goto err_ret; 805 } 806 } 807 rts->rts_flag |= RTS_WRW_PENDING; 808 809 if (isuioq(q) && (error = struioget(q, mp, dp, 0))) { 810 /* 811 * Uio error of some sort, so just return the error. 812 */ 813 rts->rts_error = error; 814 goto err_ret; 815 } 816 /* 817 * Pass the mblk (chain) onto wput(). 818 */ 819 dp->d_mp = 0; 820 821 switch (mp->b_datap->db_type) { 822 case M_PROTO: 823 case M_PCPROTO: 824 /* Expedite other than T_DATA_REQ to below the switch */ 825 if (((mp->b_wptr - mp->b_rptr) != 826 sizeof (struct T_data_req)) || 827 (((union T_primitives *)mp->b_rptr)->type != T_DATA_REQ)) 828 break; 829 if ((mp1 = mp->b_cont) == NULL) { 830 rts->rts_error = EINVAL; 831 goto err_ret; 832 } 833 freeb(mp); 834 mp = mp1; 835 /* FALLTHRU */ 836 case M_DATA: 837 /* 838 * The semantics of the routing socket is such that the rtm_pid 839 * field is automatically filled in during requests with the 840 * current process' pid. We do this here (where we still have 841 * user context) after checking we have at least a message the 842 * size of a routing message header. 843 */ 844 if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) { 845 if (!pullupmsg(mp, sizeof (rt_msghdr_t))) { 846 rts->rts_error = EINVAL; 847 goto err_ret; 848 } 849 } 850 rtm = (rt_msghdr_t *)mp->b_rptr; 851 rtm->rtm_pid = curproc->p_pid; 852 break; 853 default: 854 break; 855 } 856 rts->rts_flag |= RTS_WPUT_PENDING; 857 rts_wput(q, mp); 858 while (rts->rts_flag & RTS_WPUT_PENDING) 859 if (qwait_rw(q)) { 860 /* RTS_WPUT_PENDING will be cleared below */ 861 rts->rts_error = EINTR; 862 break; 863 } 864 err_ret: 865 rts->rts_flag &= ~(RTS_WPUT_PENDING | RTS_WRW_PENDING); 866 return (rts->rts_error); 867 } 868 869 /* 870 * This routine handles all messages passed downstream. It either 871 * consumes the message or passes it downstream; it never queues a 872 * a message. The data messages that go down are wrapped in an IOCTL 873 * message. 874 * 875 * FIXME? Should we call IP rts_request directly? Could punt on returning 876 * errno in the case when it defers processing due to 877 * IPIF_CHANGING/ILL_CHANGING??? 878 */ 879 static void 880 rts_wput(queue_t *q, mblk_t *mp) 881 { 882 uchar_t *rptr = mp->b_rptr; 883 mblk_t *mp1; 884 conn_t *connp = Q_TO_CONN(q); 885 rts_t *rts = connp->conn_rts; 886 887 switch (mp->b_datap->db_type) { 888 case M_DATA: 889 break; 890 case M_PROTO: 891 case M_PCPROTO: 892 if ((mp->b_wptr - rptr) == sizeof (struct T_data_req)) { 893 /* Expedite valid T_DATA_REQ to below the switch */ 894 if (((union T_primitives *)rptr)->type == T_DATA_REQ) { 895 mp1 = mp->b_cont; 896 freeb(mp); 897 if (mp1 == NULL) 898 return; 899 mp = mp1; 900 break; 901 } 902 } 903 /* FALLTHRU */ 904 default: 905 rts_wput_other(q, mp); 906 return; 907 } 908 909 910 mp1 = rts_ioctl_alloc(mp, DB_CRED(mp)); 911 if (mp1 == NULL) { 912 ASSERT(rts != NULL); 913 freemsg(mp); 914 if (rts->rts_flag & RTS_WPUT_PENDING) { 915 rts->rts_error = ENOMEM; 916 rts->rts_flag &= ~RTS_WPUT_PENDING; 917 } 918 return; 919 } 920 ip_output(connp, mp1, q, IP_WPUT); 921 } 922 923 924 /* 925 * Handles all the control message, if it 926 * can not understand it, it will 927 * pass down stream. 928 */ 929 static void 930 rts_wput_other(queue_t *q, mblk_t *mp) 931 { 932 conn_t *connp = Q_TO_CONN(q); 933 rts_t *rts = connp->conn_rts; 934 uchar_t *rptr = mp->b_rptr; 935 struct iocblk *iocp; 936 cred_t *cr; 937 rts_stack_t *rtss; 938 939 rtss = rts->rts_rtss; 940 941 cr = DB_CREDDEF(mp, connp->conn_cred); 942 943 switch (mp->b_datap->db_type) { 944 case M_PROTO: 945 case M_PCPROTO: 946 if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) { 947 /* 948 * If the message does not contain a PRIM_type, 949 * throw it away. 950 */ 951 freemsg(mp); 952 return; 953 } 954 switch (((union T_primitives *)rptr)->type) { 955 case T_BIND_REQ: 956 case O_T_BIND_REQ: 957 rts_bind(q, mp); 958 return; 959 case T_UNBIND_REQ: 960 rts_unbind(q, mp); 961 return; 962 case T_CAPABILITY_REQ: 963 rts_capability_req(q, mp); 964 return; 965 case T_INFO_REQ: 966 rts_info_req(q, mp); 967 return; 968 case T_SVR4_OPTMGMT_REQ: 969 (void) svr4_optcom_req(q, mp, cr, &rts_opt_obj, 970 B_TRUE); 971 return; 972 case T_OPTMGMT_REQ: 973 (void) tpi_optcom_req(q, mp, cr, &rts_opt_obj, B_TRUE); 974 return; 975 case O_T_CONN_RES: 976 case T_CONN_RES: 977 case T_DISCON_REQ: 978 /* Not supported by rts. */ 979 rts_err_ack(q, mp, TNOTSUPPORT, 0); 980 return; 981 case T_DATA_REQ: 982 case T_EXDATA_REQ: 983 case T_ORDREL_REQ: 984 /* Illegal for rts. */ 985 freemsg(mp); 986 (void) putnextctl1(RD(q), M_ERROR, EPROTO); 987 return; 988 default: 989 break; 990 } 991 break; 992 case M_IOCTL: 993 iocp = (struct iocblk *)mp->b_rptr; 994 switch (iocp->ioc_cmd) { 995 case ND_SET: 996 case ND_GET: 997 if (nd_getset(q, rtss->rtss_g_nd, mp)) { 998 qreply(q, mp); 999 return; 1000 } 1001 break; 1002 case TI_GETPEERNAME: 1003 mi_copyin(q, mp, NULL, 1004 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 1005 return; 1006 default: 1007 break; 1008 } 1009 case M_IOCDATA: 1010 rts_wput_iocdata(q, mp); 1011 return; 1012 default: 1013 break; 1014 } 1015 ip_output(connp, mp, q, IP_WPUT); 1016 } 1017 1018 /* 1019 * Called by rts_wput_other to handle all M_IOCDATA messages. 1020 */ 1021 static void 1022 rts_wput_iocdata(queue_t *q, mblk_t *mp) 1023 { 1024 conn_t *connp = Q_TO_CONN(q); 1025 struct sockaddr *rtsaddr; 1026 mblk_t *mp1; 1027 STRUCT_HANDLE(strbuf, sb); 1028 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 1029 1030 /* Make sure it is one of ours. */ 1031 switch (iocp->ioc_cmd) { 1032 case TI_GETPEERNAME: 1033 break; 1034 default: 1035 ip_output(connp, mp, q, IP_WPUT); 1036 return; 1037 } 1038 switch (mi_copy_state(q, mp, &mp1)) { 1039 case -1: 1040 return; 1041 case MI_COPY_CASE(MI_COPY_IN, 1): 1042 break; 1043 case MI_COPY_CASE(MI_COPY_OUT, 1): 1044 /* Copy out the strbuf. */ 1045 mi_copyout(q, mp); 1046 return; 1047 case MI_COPY_CASE(MI_COPY_OUT, 2): 1048 /* All done. */ 1049 mi_copy_done(q, mp, 0); 1050 return; 1051 default: 1052 mi_copy_done(q, mp, EPROTO); 1053 return; 1054 } 1055 STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr); 1056 if (STRUCT_FGET(sb, maxlen) < (int)sizeof (sin_t)) { 1057 mi_copy_done(q, mp, EINVAL); 1058 return; 1059 } 1060 switch (iocp->ioc_cmd) { 1061 case TI_GETPEERNAME: 1062 break; 1063 default: 1064 mi_copy_done(q, mp, EPROTO); 1065 return; 1066 } 1067 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), sizeof (sin_t), 1068 B_TRUE); 1069 if (mp1 == NULL) 1070 return; 1071 STRUCT_FSET(sb, len, (int)sizeof (sin_t)); 1072 rtsaddr = (struct sockaddr *)mp1->b_rptr; 1073 mp1->b_wptr = (uchar_t *)&rtsaddr[1]; 1074 bzero(rtsaddr, sizeof (struct sockaddr)); 1075 rtsaddr->sa_family = AF_ROUTE; 1076 /* Copy out the address */ 1077 mi_copyout(q, mp); 1078 } 1079 1080 /*ARGSUSED2*/ 1081 static void 1082 rts_input(void *arg1, mblk_t *mp, void *arg2) 1083 { 1084 conn_t *connp = (conn_t *)arg1; 1085 rts_t *rts = connp->conn_rts; 1086 struct iocblk *iocp; 1087 mblk_t *mp1; 1088 struct T_data_ind *tdi; 1089 1090 switch (mp->b_datap->db_type) { 1091 case M_IOCACK: 1092 case M_IOCNAK: 1093 iocp = (struct iocblk *)mp->b_rptr; 1094 if (rts->rts_flag & (RTS_WPUT_PENDING)) { 1095 rts->rts_flag &= ~RTS_WPUT_PENDING; 1096 rts->rts_error = iocp->ioc_error; 1097 /* 1098 * Tell rts_wvw/qwait that we are done. 1099 * Note: there is no qwait_wakeup() we can use. 1100 */ 1101 qenable(connp->conn_rq); 1102 freemsg(mp); 1103 return; 1104 } 1105 break; 1106 case M_DATA: 1107 /* 1108 * Prepend T_DATA_IND to prevent the stream head from 1109 * consolidating multiple messages together. 1110 * If the allocation fails just send up the M_DATA. 1111 */ 1112 mp1 = allocb(sizeof (*tdi), BPRI_MED); 1113 if (mp1 != NULL) { 1114 mp1->b_cont = mp; 1115 mp = mp1; 1116 1117 mp->b_datap->db_type = M_PROTO; 1118 mp->b_wptr += sizeof (*tdi); 1119 tdi = (struct T_data_ind *)mp->b_rptr; 1120 tdi->PRIM_type = T_DATA_IND; 1121 tdi->MORE_flag = 0; 1122 } 1123 break; 1124 default: 1125 break; 1126 } 1127 putnext(connp->conn_rq, mp); 1128 } 1129 1130 1131 void 1132 rts_ddi_init(void) 1133 { 1134 rts_max_optsize = optcom_max_optsize(rts_opt_obj.odb_opt_des_arr, 1135 rts_opt_obj.odb_opt_arr_cnt); 1136 1137 /* 1138 * We want to be informed each time a stack is created or 1139 * destroyed in the kernel, so we can maintain the 1140 * set of rts_stack_t's. 1141 */ 1142 netstack_register(NS_RTS, rts_stack_init, NULL, rts_stack_fini); 1143 } 1144 1145 void 1146 rts_ddi_destroy(void) 1147 { 1148 netstack_unregister(NS_RTS); 1149 } 1150 1151 /* 1152 * Initialize the RTS stack instance. 1153 */ 1154 /* ARGSUSED */ 1155 static void * 1156 rts_stack_init(netstackid_t stackid, netstack_t *ns) 1157 { 1158 rts_stack_t *rtss; 1159 rtsparam_t *pa; 1160 1161 rtss = (rts_stack_t *)kmem_zalloc(sizeof (*rtss), KM_SLEEP); 1162 rtss->rtss_netstack = ns; 1163 1164 pa = (rtsparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP); 1165 rtss->rtss_params = pa; 1166 bcopy(lcl_param_arr, rtss->rtss_params, sizeof (lcl_param_arr)); 1167 1168 (void) rts_param_register(&rtss->rtss_g_nd, 1169 rtss->rtss_params, A_CNT(lcl_param_arr)); 1170 return (rtss); 1171 } 1172 1173 /* 1174 * Free the RTS stack instance. 1175 */ 1176 /* ARGSUSED */ 1177 static void 1178 rts_stack_fini(netstackid_t stackid, void *arg) 1179 { 1180 rts_stack_t *rtss = (rts_stack_t *)arg; 1181 1182 nd_free(&rtss->rtss_g_nd); 1183 kmem_free(rtss->rtss_params, sizeof (lcl_param_arr)); 1184 rtss->rtss_params = NULL; 1185 kmem_free(rtss, sizeof (*rtss)); 1186 } 1187