xref: /titanic_41/usr/src/uts/common/inet/ip/rts.c (revision 7800901e60d340b6af88e94a2149805dcfcaaf56)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsubr.h>
31 #include <sys/stropts.h>
32 #include <sys/strsun.h>
33 #include <sys/strlog.h>
34 #define	_SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/timod.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/cmn_err.h>
40 #include <sys/proc.h>
41 #include <sys/suntpi.h>
42 #include <sys/policy.h>
43 #include <sys/zone.h>
44 
45 #include <sys/socket.h>
46 #include <netinet/in.h>
47 
48 #include <inet/common.h>
49 #include <netinet/ip6.h>
50 #include <inet/ip.h>
51 #include <inet/ipclassifier.h>
52 #include <inet/mi.h>
53 #include <inet/nd.h>
54 #include <inet/optcom.h>
55 #include <netinet/ip_mroute.h>
56 #include <sys/isa_defs.h>
57 #include <net/route.h>
58 
59 #include <inet/rts_impl.h>
60 #include <inet/ip_rts.h>
61 
62 /*
 * This is a transport provider for routing sockets.  Downstream messages are
 * wrapped with an IP_IOCTL header, and ip_wput_ioctl calls the appropriate
 * entry in the ip_ioctl_ftbl callout table to pass the routing socket data
 * into IP.
66  * Upstream messages are generated for listeners of the routing socket as well
67  * as the message sender (unless they have turned off their end using
68  * SO_USELOOPBACK or shutdown(3n)).  Upstream messages may also be generated
69  * asynchronously when:
70  *
71  *	Interfaces are brought up or down.
72  *	Addresses are assigned to interfaces.
 *	ICMP redirects are processed and an IRE_HOST/RTF_DYNAMIC is installed.
74  *	No route is found while sending a packet.
75  *	When TCP requests IP to remove an IRE_CACHE of a troubled destination.
76  *
77  * Since all we do is reformat the messages between routing socket and
78  * ioctl forms, no synchronization is necessary in this module; all
79  * the dirty work is done down in ip.
80  */
81 
/*
 * Default structure copied into T_INFO_ACK messages (and into the INFO_ack
 * member of T_CAPABILITY_ACK messages) by rts_copy_info().  That routine
 * overwrites CURRENT_state and OPT_size after the copy, so the values given
 * for those two fields here are only placeholders.
 */
static struct T_info_ack rts_g_t_info_ack = {
	T_INFO_ACK,
	T_INFINITE,	/* TSDU_size. Maximum size messages. */
	T_INVALID,	/* ETSDU_size. No expedited data. */
	T_INVALID,	/* CDATA_size. No connect data. */
	T_INVALID,	/* DDATA_size. No disconnect data. */
	0,		/* ADDR_size. */
	0,		/* OPT_size - not initialized here */
	64 * 1024,	/* TIDU_size. rts allows maximum size messages. */
	T_COTS,		/* SERV_type. rts supports connection oriented. */
	TS_UNBND,	/* CURRENT_state. This is set from rts_state. */
	(XPG4_1)	/* PROVIDER_flag */
};
96 
/*
 * Table of ND variables supported by rts.  This is the template: each
 * netstack gets its own copy in rts_stack_init(), and that copy is what is
 * registered with the stack's named dispatch table (rtss_g_nd).
 * All of these are alterable, within the min/max values given, at run time
 * (see rts_param_set()).
 *
 * The rtss_* macros below index into a stack's private copy and therefore
 * depend on the order of the entries in this table; keep the two in sync.
 */
static rtsparam_t	lcl_param_arr[] = {
	/* min		max		value		name */
	{ 4096,		65536,		8192,		"rts_xmit_hiwat"},
	{ 0,		65536,		1024,		"rts_xmit_lowat"},
	{ 4096,		65536,		8192,		"rts_recv_hiwat"},
	{ 65536,	1024*1024*1024, 256*1024,	"rts_max_buf"},
};
#define	rtss_xmit_hiwat		rtss_params[0].rts_param_value
#define	rtss_xmit_lowat		rtss_params[1].rts_param_value
#define	rtss_recv_hiwat		rtss_params[2].rts_param_value
#define	rtss_max_buf		rtss_params[3].rts_param_value
113 
/*
 * Forward declarations.
 *
 * Everything here is private to this file except rts_opt_default(),
 * rts_opt_get() and rts_opt_set(), which have external linkage
 * (presumably so the option database object, rts_opt_obj, defined
 * elsewhere can reference them -- confirm against its definition).
 */
static int	rts_close(queue_t *q);
static void 	rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    int sys_error);
static void	rts_input(void *, mblk_t *, void *);
static mblk_t	*rts_ioctl_alloc(mblk_t *data, cred_t *cr);
static int	rts_open(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
int		rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
    uchar_t *ptr);
int		rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name,
    uchar_t *ptr);
int		rts_opt_set(queue_t *q, uint_t optset_context, int level,
    int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
    uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
static int	rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt);
static int	rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr);
static void	rts_rsrv(queue_t *q);
static void	*rts_stack_init(netstackid_t stackid, netstack_t *ns);
static void	rts_stack_fini(netstackid_t stackid, void *arg);
static void	rts_wput(queue_t *q, mblk_t *mp);
static void	rts_wput_iocdata(queue_t *q, mblk_t *mp);
static void 	rts_wput_other(queue_t *q, mblk_t *mp);
static int	rts_wrw(queue_t *q, struiod_t *dp);
139 
/*
 * STREAMS linkage for the rts module.  The initializers are positional;
 * see module_info(9S), qinit(9S) and streamtab(9S) for the field order.
 */
static struct module_info rts_mod_info = {
	129, "rts", 1, INFPSZ, 512, 128
};

/*
 * Read side: no put procedure; rts_rsrv is the (empty) service routine,
 * and this qinit also carries the open and close entry points.
 */
static struct qinit rtsrinit = {
	NULL, (pfi_t)rts_rsrv, rts_open, rts_close, NULL, &rts_mod_info
};

/*
 * Write side: rts_wput is the put procedure and rts_wrw is installed as
 * the synchronous read/write entry, with STRUIOT_STANDARD as its type.
 */
static struct qinit rtswinit = {
	(pfi_t)rts_wput, NULL, NULL, NULL, NULL, &rts_mod_info,
	NULL, (pfi_t)rts_wrw, NULL, STRUIOT_STANDARD
};

/* Exported stream table tying the read and write qinit structures together. */
struct streamtab rtsinfo = {
	&rtsrinit, &rtswinit
};
156 
157 /*
158  * This routine allocates the necessary
159  * message blocks for IOCTL wrapping the
160  * user data.
161  */
162 static mblk_t *
163 rts_ioctl_alloc(mblk_t *data, cred_t *cr)
164 {
165 	mblk_t	*mp = NULL;
166 	mblk_t	*mp1 = NULL;
167 	ipllc_t	*ipllc;
168 	struct iocblk	*ioc;
169 
170 	mp = allocb_cred(sizeof (ipllc_t), cr);
171 	if (mp == NULL)
172 		return (NULL);
173 	mp1 = allocb_cred(sizeof (struct iocblk), cr);
174 	if (mp1 == NULL) {
175 		freeb(mp);
176 		return (NULL);
177 	}
178 
179 	ipllc = (ipllc_t *)mp->b_rptr;
180 	ipllc->ipllc_cmd = IP_IOC_RTS_REQUEST;
181 	ipllc->ipllc_name_offset = 0;
182 	ipllc->ipllc_name_length = 0;
183 	mp->b_wptr += sizeof (ipllc_t);
184 	mp->b_cont = data;
185 
186 	ioc = (struct iocblk *)mp1->b_rptr;
187 	ioc->ioc_cmd = IP_IOCTL;
188 	ioc->ioc_error = 0;
189 	ioc->ioc_cr = NULL;
190 	ioc->ioc_count = msgdsize(mp);
191 	mp1->b_wptr += sizeof (struct iocblk);
192 	mp1->b_datap->db_type = M_IOCTL;
193 	mp1->b_cont = mp;
194 
195 	return (mp1);
196 }
197 
/*
 * STREAMS close routine for a routing socket stream.
 *
 * The teardown order matters: first undo the ip_rts_register() done in
 * rts_open(), then quiesce the conn, then turn off the put/service
 * procedures, and only after that release the minor number and destroy the
 * conn and clear the queue private pointers.  Always returns 0.
 */
static int
rts_close(queue_t *q)
{
	conn_t	*connp = Q_TO_CONN(q);

	ASSERT(connp != NULL && IPCL_IS_RTS(connp));

	/* Counterpart of the ip_rts_register() call made at open time. */
	ip_rts_unregister(connp);

	ip_quiesce_conn(connp);

	qprocsoff(q);

	/*
	 * Now we are truly single threaded on this stream, and can
	 * delete the things hanging off the connp, and finally the connp.
	 * We removed this connp from the fanout list, it cannot be
	 * accessed thru the fanouts, and we already waited for the
	 * conn_ref to drop to 0. We are already in close, so
	 * there cannot be any other thread from the top. qprocsoff
	 * has completed, and service has completed or won't run in
	 * future.
	 */
	ASSERT(connp->conn_ref == 1);

	/* Give back the minor number allocated in rts_open(). */
	inet_minor_free(ip_minor_arena, connp->conn_dev);

	/* Drop the last reference by hand before destroying the conn. */
	connp->conn_ref--;
	ipcl_conn_destroy(connp);

	/* connp is gone; make sure neither queue still points at it. */
	q->q_ptr = WR(q)->q_ptr = NULL;
	return (0);
}
236 
/*
 * This is the open routine for routing socket. It creates a conn_t of
 * type IPCL_RTSCONN (whose conn_rts member is the rts_t for the stream),
 * attaches it to both queues, and registers it with IP as a routing socket.
 *
 * Returns 0 on success (including a re-open of an already open stream),
 * EINVAL if opened as a module (sflag == MODOPEN), and EBUSY if no minor
 * number can be allocated.
 */
/* ARGSUSED */
static int
rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
	rts_t	*rts;
	conn_t *connp;
	dev_t	conn_dev;
	zoneid_t zoneid;
	netstack_t *ns;
	rts_stack_t *rtss;

	/* If the stream is already open, return immediately. */
	if (q->q_ptr != NULL)
		return (0);

	if (sflag == MODOPEN)
		return (EINVAL);

	/*
	 * This takes a hold on the netstack which must be released on
	 * every path below (see the netstack_rele() calls).
	 */
	ns = netstack_find_by_cred(credp);
	ASSERT(ns != NULL);
	rtss = ns->netstack_rts;
	ASSERT(rtss != NULL);

	/*
	 * For exclusive stacks we set the zoneid to zero
	 * to make RTS operate as if in the global zone.
	 */
	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = crgetzoneid(credp);

	if ((conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) {
		netstack_rele(ns);
		return (EBUSY);
	}
	/* Hand the newly allocated minor back to the caller (clone open). */
	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);

	/* KM_SLEEP allocation; the result is used without a NULL check. */
	connp = ipcl_conn_create(IPCL_RTSCONN, KM_SLEEP, ns);
	connp->conn_dev = conn_dev;
	rts = connp->conn_rts;

	/*
	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
	 * done by netstack_find_by_cred()
	 */
	netstack_rele(ns);

	/*
	 * Initialize the rts_t structure for this stream.
	 */
	q->q_ptr = connp;
	WR(q)->q_ptr = connp;
	connp->conn_rq = q;
	connp->conn_wq = WR(q);

	/* Held as writer until the stream is fully set up and qprocson'ed. */
	rw_enter(&rts->rts_rwlock, RW_WRITER);
	ASSERT(connp->conn_rts == rts);
	ASSERT(rts->rts_connp == connp);

	/* Set the initial state of the stream and the privilege status. */
	rts->rts_state = TS_UNBND;
	connp->conn_zoneid = zoneid;

	connp->conn_ulp_labeled = is_system_labeled();

	rts->rts_rtss = rtss;

	/* Queue watermarks come from the per-stack tunables. */
	q->q_hiwat = rtss->rtss_recv_hiwat;
	WR(q)->q_hiwat = rtss->rtss_xmit_hiwat;
	WR(q)->q_lowat = rtss->rtss_xmit_lowat;

	/* Upstream messages for this conn are delivered via rts_input(). */
	connp->conn_recv = rts_input;
	crhold(credp);
	connp->conn_cred = credp;

	/* The conn is fully initialized; clear the incipient marker. */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags &= ~CONN_INCIPIENT;
	mutex_exit(&connp->conn_lock);

	qprocson(q);
	rw_exit(&rts->rts_rwlock);

	/*
	 * Tell IP that this conn is a routing socket client.
	 * NOTE(review): an earlier version of this comment described sending
	 * an RTS IOCTL downstream; the code now calls ip_rts_register()
	 * directly, and rts_close() undoes it with ip_rts_unregister().
	 */
	ip_rts_register(connp);

	return (0);

}
335 
336 /*
337  * This routine creates a T_ERROR_ACK message and passes it upstream.
338  */
339 static void
340 rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
341 {
342 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
343 		qreply(q, mp);
344 }
345 
346 /*
347  * This routine creates a T_OK_ACK message and passes it upstream.
348  */
349 static void
350 rts_ok_ack(queue_t *q, mblk_t *mp)
351 {
352 	if ((mp = mi_tpi_ok_ack_alloc(mp)) != NULL)
353 		qreply(q, mp);
354 }
355 
356 /*
357  * This routine is called by rts_wput to handle T_UNBIND_REQ messages.
358  */
359 static void
360 rts_unbind(queue_t *q, mblk_t *mp)
361 {
362 	conn_t	*connp = Q_TO_CONN(q);
363 	rts_t	*rts = connp->conn_rts;
364 
365 	/* If a bind has not been done, we can't unbind. */
366 	if (rts->rts_state != TS_IDLE) {
367 		rts_err_ack(q, mp, TOUTSTATE, 0);
368 		return;
369 	}
370 	rts->rts_state = TS_UNBND;
371 	rts_ok_ack(q, mp);
372 }
373 
374 /*
375  * This routine is called to handle each
376  * O_T_BIND_REQ/T_BIND_REQ message passed to
377  * rts_wput. Note: This routine works with both
378  * O_T_BIND_REQ and T_BIND_REQ semantics.
379  */
380 static void
381 rts_bind(queue_t *q, mblk_t *mp)
382 {
383 	conn_t	*connp = Q_TO_CONN(q);
384 	rts_t	*rts = connp->conn_rts;
385 	mblk_t	*mp1;
386 	struct T_bind_req *tbr;
387 
388 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
389 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
390 		    "rts_bind: bad data, %d", rts->rts_state);
391 		rts_err_ack(q, mp, TBADADDR, 0);
392 		return;
393 	}
394 	if (rts->rts_state != TS_UNBND) {
395 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
396 		    "rts_bind: bad state, %d", rts->rts_state);
397 		rts_err_ack(q, mp, TOUTSTATE, 0);
398 		return;
399 	}
400 	/*
401 	 * Reallocate the message to make sure we have enough room for an
402 	 * address and the protocol type.
403 	 */
404 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin_t), 1);
405 	if (mp1 == NULL) {
406 		rts_err_ack(q, mp, TSYSERR, ENOMEM);
407 		return;
408 	}
409 	mp = mp1;
410 	tbr = (struct T_bind_req *)mp->b_rptr;
411 	if (tbr->ADDR_length != 0) {
412 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
413 		    "rts_bind: bad ADDR_length %d", tbr->ADDR_length);
414 		rts_err_ack(q, mp, TBADADDR, 0);
415 		return;
416 	}
417 	/* Generic request */
418 	tbr->ADDR_offset = (t_scalar_t)sizeof (struct T_bind_req);
419 	tbr->ADDR_length = 0;
420 	tbr->PRIM_type = T_BIND_ACK;
421 	rts->rts_state = TS_IDLE;
422 	qreply(q, mp);
423 }
424 
425 static void
426 rts_copy_info(struct T_info_ack *tap, rts_t *rts)
427 {
428 	*tap = rts_g_t_info_ack;
429 	tap->CURRENT_state = rts->rts_state;
430 	tap->OPT_size = rts_max_optsize;
431 }
432 
433 /*
434  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
435  * rts_wput.  Much of the T_CAPABILITY_ACK information is copied from
436  * rts_g_t_info_ack.  The current state of the stream is copied from
437  * rts_state.
438  */
439 static void
440 rts_capability_req(queue_t *q, mblk_t *mp)
441 {
442 	conn_t	*connp = Q_TO_CONN(q);
443 	rts_t	*rts = connp->conn_rts;
444 	t_uscalar_t		cap_bits1;
445 	struct T_capability_ack	*tcap;
446 
447 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
448 
449 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
450 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
451 	if (mp == NULL)
452 		return;
453 
454 	tcap = (struct T_capability_ack *)mp->b_rptr;
455 	tcap->CAP_bits1 = 0;
456 
457 	if (cap_bits1 & TC1_INFO) {
458 		rts_copy_info(&tcap->INFO_ack, rts);
459 		tcap->CAP_bits1 |= TC1_INFO;
460 	}
461 
462 	qreply(q, mp);
463 }
464 
465 /*
466  * This routine responds to T_INFO_REQ messages.  It is called by rts_wput.
467  * Most of the T_INFO_ACK information is copied from rts_g_t_info_ack.
468  * The current state of the stream is copied from rts_state.
469  */
470 static void
471 rts_info_req(queue_t *q, mblk_t *mp)
472 {
473 	conn_t	*connp = Q_TO_CONN(q);
474 	rts_t	*rts = connp->conn_rts;
475 
476 	mp = tpi_ack_alloc(mp, sizeof (rts_g_t_info_ack), M_PCPROTO,
477 	    T_INFO_ACK);
478 	if (mp == NULL)
479 		return;
480 	rts_copy_info((struct T_info_ack *)mp->b_rptr, rts);
481 	qreply(q, mp);
482 }
483 
/*
 * This routine gets default values of certain options whose default
 * values are maintained by protocol specific code.
 *
 * rts maintains no such defaults, so every (level, name) pair is rejected:
 * the routine ignores all of its arguments and always returns -1.
 */
/* ARGSUSED */
int
rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
{
	/* no default value processed by protocol specific code currently */
	return (-1);
}
495 
496 /*
497  * This routine retrieves the current status of socket options.
498  * It returns the size of the option retrieved.
499  */
500 int
501 rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
502 {
503 	int	*i1 = (int *)ptr;
504 	conn_t	*connp = Q_TO_CONN(q);
505 	rts_t	*rts = connp->conn_rts;
506 
507 	switch (level) {
508 	case SOL_SOCKET:
509 		switch (name) {
510 		case SO_DEBUG:
511 			*i1 = rts->rts_debug;
512 			break;
513 		case SO_REUSEADDR:
514 			*i1 = rts->rts_reuseaddr;
515 			break;
516 		case SO_TYPE:
517 			*i1 = SOCK_RAW;
518 			break;
519 
520 		/*
521 		 * The following three items are available here,
522 		 * but are only meaningful to IP.
523 		 */
524 		case SO_DONTROUTE:
525 			*i1 = rts->rts_dontroute;
526 			break;
527 		case SO_USELOOPBACK:
528 			*i1 = rts->rts_useloopback;
529 			break;
530 		case SO_BROADCAST:
531 			*i1 = rts->rts_broadcast;
532 			break;
533 		case SO_PROTOTYPE:
534 			*i1 = rts->rts_proto;
535 			break;
536 		/*
537 		 * The following two items can be manipulated,
538 		 * but changing them should do nothing.
539 		 */
540 		case SO_SNDBUF:
541 			ASSERT(q->q_hiwat <= INT_MAX);
542 			*i1 = (int)(q->q_hiwat);
543 			break;
544 		case SO_RCVBUF:
545 			ASSERT(q->q_hiwat <= INT_MAX);
546 			*i1 = (int)(RD(q)->q_hiwat);
547 			break;
548 		case SO_DOMAIN:
549 			*i1 = PF_ROUTE;
550 			break;
551 		default:
552 			return (-1);
553 		}
554 		break;
555 	default:
556 		return (-1);
557 	}
558 	return ((int)sizeof (int));
559 }
560 
561 
562 /*
563  * This routine sets socket options.
564  */
565 /*ARGSUSED*/
566 int
567 rts_opt_set(queue_t *q, uint_t optset_context, int level,
568     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
569     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
570 {
571 	int	*i1 = (int *)invalp;
572 	conn_t	*connp = Q_TO_CONN(q);
573 	rts_t	*rts = connp->conn_rts;
574 	boolean_t checkonly;
575 	rts_stack_t	*rtss = rts->rts_rtss;
576 
577 	switch (optset_context) {
578 	case SETFN_OPTCOM_CHECKONLY:
579 		checkonly = B_TRUE;
580 		/*
581 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
582 		 * inlen != 0 implies value supplied and
583 		 * 	we have to "pretend" to set it.
584 		 * inlen == 0 implies that there is no
585 		 * 	value part in T_CHECK request and just validation
586 		 * done elsewhere should be enough, we just return here.
587 		 */
588 		if (inlen == 0) {
589 			*outlenp = 0;
590 			return (0);
591 		}
592 		break;
593 	case SETFN_OPTCOM_NEGOTIATE:
594 		checkonly = B_FALSE;
595 		break;
596 	case SETFN_UD_NEGOTIATE:
597 	case SETFN_CONN_NEGOTIATE:
598 		checkonly = B_FALSE;
599 		/*
600 		 * Negotiating local and "association-related" options
601 		 * through T_UNITDATA_REQ or T_CONN_{REQ,CON}
602 		 * Not allowed in this module.
603 		 */
604 		return (EINVAL);
605 	default:
606 		/*
607 		 * We should never get here
608 		 */
609 		*outlenp = 0;
610 		return (EINVAL);
611 	}
612 
613 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
614 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
615 
616 	/*
617 	 * For rts, we should have no ancillary data sent down
618 	 * (rts_wput doesn't handle options).
619 	 */
620 	ASSERT(thisdg_attrs == NULL);
621 
622 	/*
623 	 * For fixed length options, no sanity check
624 	 * of passed in length is done. It is assumed *_optcom_req()
625 	 * routines do the right thing.
626 	 */
627 
628 	switch (level) {
629 	case SOL_SOCKET:
630 		switch (name) {
631 		case SO_REUSEADDR:
632 			if (!checkonly)
633 				rts->rts_reuseaddr = *i1;
634 			break;	/* goto sizeof (int) option return */
635 		case SO_DEBUG:
636 			if (!checkonly)
637 				rts->rts_debug = *i1;
638 			break;	/* goto sizeof (int) option return */
639 		/*
640 		 * The following three items are available here,
641 		 * but are only meaningful to IP.
642 		 */
643 		case SO_DONTROUTE:
644 			if (!checkonly)
645 				rts->rts_dontroute = *i1;
646 			break;	/* goto sizeof (int) option return */
647 		case SO_USELOOPBACK:
648 			if (!checkonly)
649 				rts->rts_useloopback = *i1;
650 			break;	/* goto sizeof (int) option return */
651 		case SO_BROADCAST:
652 			if (!checkonly)
653 				rts->rts_broadcast = *i1;
654 			break;	/* goto sizeof (int) option return */
655 		case SO_PROTOTYPE:
656 			/*
657 			 * Routing socket applications that call socket() with
658 			 * a third argument can filter which messages will be
659 			 * sent upstream thanks to sockfs.  so_socket() sends
660 			 * down the SO_PROTOTYPE and rts_queue_input()
661 			 * implements the filtering.
662 			 */
663 			if (*i1 != AF_INET && *i1 != AF_INET6)
664 				return (EPROTONOSUPPORT);
665 			if (!checkonly)
666 				rts->rts_proto = *i1;
667 			break;	/* goto sizeof (int) option return */
668 		/*
669 		 * The following two items can be manipulated,
670 		 * but changing them should do nothing.
671 		 */
672 		case SO_SNDBUF:
673 			if (*i1 > rtss->rtss_max_buf) {
674 				*outlenp = 0;
675 				return (ENOBUFS);
676 			}
677 			if (!checkonly) {
678 				q->q_hiwat = *i1;
679 			}
680 			break;	/* goto sizeof (int) option return */
681 		case SO_RCVBUF:
682 			if (*i1 > rtss->rtss_max_buf) {
683 				*outlenp = 0;
684 				return (ENOBUFS);
685 			}
686 			if (!checkonly) {
687 				RD(q)->q_hiwat = *i1;
688 				(void) mi_set_sth_hiwat(RD(q), *i1);
689 			}
690 			break;	/* goto sizeof (int) option return */
691 		default:
692 			*outlenp = 0;
693 			return (EINVAL);
694 		}
695 		break;
696 	default:
697 		*outlenp = 0;
698 		return (EINVAL);
699 	}
700 	/*
701 	 * Common case of return from an option that is sizeof (int)
702 	 */
703 	*(int *)outvalp = *i1;
704 	*outlenp = (t_uscalar_t)sizeof (int);
705 	return (0);
706 }
707 
708 /*
709  * This routine retrieves the value of an ND variable in a rtsparam_t
710  * structure. It is called through nd_getset when a user reads the
711  * variable.
712  */
713 /* ARGSUSED */
714 static int
715 rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
716 {
717 	rtsparam_t	*rtspa = (rtsparam_t *)cp;
718 
719 	(void) mi_mpprintf(mp, "%u", rtspa->rts_param_value);
720 	return (0);
721 }
722 
723 /*
724  * Walk through the param array specified registering each element with the
725  * named dispatch (ND) handler.
726  */
727 static boolean_t
728 rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt)
729 {
730 	for (; cnt-- > 0; rtspa++) {
731 		if (rtspa->rts_param_name != NULL && rtspa->rts_param_name[0]) {
732 			if (!nd_load(ndp, rtspa->rts_param_name,
733 			    rts_param_get, rts_param_set, (caddr_t)rtspa)) {
734 				nd_free(ndp);
735 				return (B_FALSE);
736 			}
737 		}
738 	}
739 	return (B_TRUE);
740 }
741 
742 /* This routine sets an ND variable in a rtsparam_t structure. */
743 /* ARGSUSED */
744 static int
745 rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
746 {
747 	ulong_t	new_value;
748 	rtsparam_t	*rtspa = (rtsparam_t *)cp;
749 
750 	/*
751 	 * Fail the request if the new value does not lie within the
752 	 * required bounds.
753 	 */
754 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
755 	    new_value < rtspa->rts_param_min ||
756 	    new_value > rtspa->rts_param_max) {
757 		return (EINVAL);
758 	}
759 
760 	/* Set the new value */
761 	rtspa->rts_param_value = new_value;
762 	return (0);
763 }
764 
/*
 * Empty rsrv routine which is used by rts_input to cause a wakeup
 * of a thread in qwait.
 *
 * rts_input() calls qenable() on the read queue once an M_IOCACK/M_IOCNAK
 * has been consumed; running this (empty) service routine is what lets the
 * qwait_rw() loop in rts_wrw() re-check its flags.
 */
/*ARGSUSED*/
static void
rts_rsrv(queue_t *q)
{
}
774 
/*
 * This routine handles synchronous messages passed downstream. It either
 * consumes the message or passes it downstream; it never queues a
 * a message. The data messages that go down are wrapped in an IOCTL
 * message.
 *
 * Since it is synchronous, it waits for the M_IOCACK/M_IOCNAK so that
 * it can return an immediate error (such as ENETUNREACH when adding a route).
 * It uses the RTS_WRW_PENDING to ensure that each rts instance has only
 * one M_IOCTL outstanding at any given time.
 *
 * The return value is whatever is in rts->rts_error when the routine
 * finishes: EINTR if a qwait_rw() was interrupted, the struioget() error,
 * EINVAL for a malformed message, ENOMEM if rts_wput() could not allocate
 * the ioctl wrapper, or the ioc_error stored by rts_input() when the
 * M_IOCACK/M_IOCNAK arrives.
 *
 * NOTE(review): rts_error is not reset on entry; this routine relies on
 * one of the paths above overwriting it.
 * NOTE(review): the err_ret path clears both pending flags even when it is
 * reached because the wait for another writer's RTS_WRW_PENDING was
 * interrupted -- confirm that this is intended.
 */
static int
rts_wrw(queue_t *q, struiod_t *dp)
{
	mblk_t	*mp = dp->d_mp;
	mblk_t	*mp1;
	int	error;
	rt_msghdr_t	*rtm;
	conn_t	*connp = Q_TO_CONN(q);
	rts_t	*rts = connp->conn_rts;

	/* Wait for any earlier synchronous write on this stream to finish. */
	while (rts->rts_flag & RTS_WRW_PENDING) {
		if (qwait_rw(q)) {
			rts->rts_error = EINTR;
			goto err_ret;
		}
		}
	rts->rts_flag |= RTS_WRW_PENDING;

	if (isuioq(q) && (error = struioget(q, mp, dp, 0))) {
		/*
		 * Uio error of some sort, so just return the error.
		 */
		rts->rts_error = error;
		goto err_ret;
	}
	/*
	 * Pass the mblk (chain) onto wput().
	 */
	dp->d_mp = 0;

	switch (mp->b_datap->db_type) {
	case M_PROTO:
	case M_PCPROTO:
		/* Expedite other than T_DATA_REQ to below the switch */
		if (((mp->b_wptr - mp->b_rptr) !=
		    sizeof (struct T_data_req)) ||
		    (((union T_primitives *)mp->b_rptr)->type != T_DATA_REQ))
			break;
		/* A T_DATA_REQ without any data attached is an error. */
		if ((mp1 = mp->b_cont) == NULL) {
			rts->rts_error = EINVAL;
			goto err_ret;
		}
		/* Strip the T_DATA_REQ header and treat the rest as M_DATA. */
		freeb(mp);
		mp = mp1;
		/* FALLTHRU */
	case M_DATA:
		/*
		 * The semantics of the routing socket is such that the rtm_pid
		 * field is automatically filled in during requests with the
		 * current process' pid.  We do this here (where we still have
		 * user context) after checking we have at least a message the
		 * size of a routing message header.
		 */
		if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
			if (!pullupmsg(mp, sizeof (rt_msghdr_t))) {
				rts->rts_error = EINVAL;
				goto err_ret;
			}
		}
		rtm = (rt_msghdr_t *)mp->b_rptr;
		rtm->rtm_pid = curproc->p_pid;
		break;
	default:
		break;
	}
	/*
	 * Mark the request outstanding before handing it to rts_wput();
	 * the flag is cleared by rts_input() when the M_IOCACK/M_IOCNAK
	 * comes back, or by rts_wput() itself if it runs out of memory.
	 * NOTE(review): for messages that rts_wput() routes to
	 * rts_wput_other() nothing in this file clears the flag, so the
	 * loop below would then only end on a signal -- confirm such
	 * messages cannot reach this entry point.
	 */
	rts->rts_flag |= RTS_WPUT_PENDING;
	rts_wput(q, mp);
	while (rts->rts_flag & RTS_WPUT_PENDING)
		if (qwait_rw(q)) {
			/* RTS_WPUT_PENDING will be cleared below */
			rts->rts_error = EINTR;
			break;
		}
err_ret:
	rts->rts_flag &= ~(RTS_WPUT_PENDING | RTS_WRW_PENDING);
	return (rts->rts_error);
}
863 
864 /*
865  * This routine handles all messages passed downstream. It either
866  * consumes the message or passes it downstream; it never queues a
867  * a message. The data messages that go down are wrapped in an IOCTL
868  * message.
869  *
870  * FIXME? Should we call IP rts_request directly? Could punt on returning
871  * errno in the case when it defers processing due to
872  * IPIF_CHANGING/ILL_CHANGING???
873  */
874 static void
875 rts_wput(queue_t *q, mblk_t *mp)
876 {
877 	uchar_t	*rptr = mp->b_rptr;
878 	mblk_t	*mp1;
879 	conn_t	*connp = Q_TO_CONN(q);
880 	rts_t	*rts = connp->conn_rts;
881 
882 	switch (mp->b_datap->db_type) {
883 	case M_DATA:
884 		break;
885 	case M_PROTO:
886 	case M_PCPROTO:
887 		if ((mp->b_wptr - rptr) == sizeof (struct T_data_req)) {
888 			/* Expedite valid T_DATA_REQ to below the switch */
889 			if (((union T_primitives *)rptr)->type == T_DATA_REQ) {
890 				mp1 = mp->b_cont;
891 				freeb(mp);
892 				if (mp1 == NULL)
893 					return;
894 				mp = mp1;
895 				break;
896 			}
897 		}
898 		/* FALLTHRU */
899 	default:
900 		rts_wput_other(q, mp);
901 		return;
902 	}
903 
904 
905 	mp1 = rts_ioctl_alloc(mp, DB_CRED(mp));
906 	if (mp1 == NULL) {
907 		ASSERT(rts != NULL);
908 		freemsg(mp);
909 		if (rts->rts_flag & RTS_WPUT_PENDING) {
910 			rts->rts_error = ENOMEM;
911 			rts->rts_flag &= ~RTS_WPUT_PENDING;
912 		}
913 		return;
914 	}
915 	ip_output(connp, mp1, q, IP_WPUT);
916 }
917 
918 
/*
 * Handles all the control messages (everything rts_wput does not treat as
 * routing data).  TPI primitives and the ioctls this module understands
 * are processed here; anything it can not understand is passed down
 * stream to IP via ip_output().
 */
static void
rts_wput_other(queue_t *q, mblk_t *mp)
{
	conn_t	*connp = Q_TO_CONN(q);
	rts_t	*rts = connp->conn_rts;
	uchar_t	*rptr = mp->b_rptr;
	struct iocblk	*iocp;
	cred_t	*cr;
	rts_stack_t	*rtss;

	rtss = rts->rts_rtss;

	/* Credential from the message if it has one, else the opener's. */
	cr = DB_CREDDEF(mp, connp->conn_cred);

	switch (mp->b_datap->db_type) {
	case M_PROTO:
	case M_PCPROTO:
		if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) {
			/*
			 * If the message does not contain a PRIM_type,
			 * throw it away.
			 */
			freemsg(mp);
			return;
		}
		switch (((union T_primitives *)rptr)->type) {
		case T_BIND_REQ:
		case O_T_BIND_REQ:
			rts_bind(q, mp);
			return;
		case T_UNBIND_REQ:
			rts_unbind(q, mp);
			return;
		case T_CAPABILITY_REQ:
			rts_capability_req(q, mp);
			return;
		case T_INFO_REQ:
			rts_info_req(q, mp);
			return;
		case T_SVR4_OPTMGMT_REQ:
			(void) svr4_optcom_req(q, mp, cr, &rts_opt_obj,
			    B_TRUE);
			return;
		case T_OPTMGMT_REQ:
			(void) tpi_optcom_req(q, mp, cr, &rts_opt_obj, B_TRUE);
			return;
		case O_T_CONN_RES:
		case T_CONN_RES:
		case T_DISCON_REQ:
			/* Not supported by rts. */
			rts_err_ack(q, mp, TNOTSUPPORT, 0);
			return;
		case T_DATA_REQ:
		case T_EXDATA_REQ:
		case T_ORDREL_REQ:
			/*
			 * Illegal for rts.  (A well-formed T_DATA_REQ never
			 * gets here; rts_wput handles it.)  Drop the message
			 * and send an M_ERROR with EPROTO upstream.
			 */
			freemsg(mp);
			(void) putnextctl1(RD(q), M_ERROR, EPROTO);
			return;
		default:
			/* Unknown primitive: let IP deal with it below. */
			break;
		}
		break;
	case M_IOCTL:
		iocp = (struct iocblk *)mp->b_rptr;
		switch (iocp->ioc_cmd) {
		case ND_SET:
		case ND_GET:
			/* Tunables are looked up in this stack's ND table. */
			if (nd_getset(q, rtss->rtss_g_nd, mp)) {
				qreply(q, mp);
				return;
			}
			break;
		case TI_GETPEERNAME:
			/*
			 * Start by copying in the caller's strbuf; the rest
			 * is done in rts_wput_iocdata() as M_IOCDATA arrives.
			 */
			mi_copyin(q, mp, NULL,
			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
			return;
		default:
			break;
		}
		/*
		 * FALLTHRU
		 *
		 * There is no break here, so an M_IOCTL that was not consumed
		 * above falls into the M_IOCDATA case.  rts_wput_iocdata()
		 * forwards any command other than TI_GETPEERNAME to
		 * ip_output(), so such ioctls still end up in IP.
		 */
	case M_IOCDATA:
		rts_wput_iocdata(q, mp);
		return;
	default:
		break;
	}
	ip_output(connp, mp, q, IP_WPUT);
}
1012 
/*
 * Called by rts_wput_other to handle all M_IOCDATA messages (and, through
 * the fall-through there, M_IOCTL messages that were not consumed).
 *
 * Only TI_GETPEERNAME is processed here; every other command is forwarded
 * to IP.  For TI_GETPEERNAME the routine is driven by the mi_copy state:
 * once the caller's strbuf has been copied in, a zeroed sockaddr with
 * sa_family set to AF_ROUTE is copied out to the caller's buffer, followed
 * by the updated strbuf, after which the ioctl is completed.
 */
static void
rts_wput_iocdata(queue_t *q, mblk_t *mp)
{
	conn_t *connp = Q_TO_CONN(q);
	struct sockaddr	*rtsaddr;
	mblk_t	*mp1;
	STRUCT_HANDLE(strbuf, sb);
	struct iocblk	*iocp	= (struct iocblk *)mp->b_rptr;

	/* Make sure it is one of ours. */
	switch (iocp->ioc_cmd) {
	case TI_GETPEERNAME:
		break;
	default:
		ip_output(connp, mp, q, IP_WPUT);
		return;
	}
	switch (mi_copy_state(q, mp, &mp1)) {
	case -1:
		/* Nothing more is done with mp on this return value. */
		return;
	case MI_COPY_CASE(MI_COPY_IN, 1):
		/* The strbuf has been copied in; build the reply below. */
		break;
	case MI_COPY_CASE(MI_COPY_OUT, 1):
		/* Copy out the strbuf. */
		mi_copyout(q, mp);
		return;
	case MI_COPY_CASE(MI_COPY_OUT, 2):
		/* All done. */
		mi_copy_done(q, mp, 0);
		return;
	default:
		mi_copy_done(q, mp, EPROTO);
		return;
	}
	/* mp1 holds the copied-in strbuf (native or ILP32, per ioc_flag). */
	STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
	/* The caller's buffer must have room for the address we return. */
	if (STRUCT_FGET(sb, maxlen) < (int)sizeof (sin_t)) {
		mi_copy_done(q, mp, EINVAL);
		return;
	}
	/* Cannot fail: the command was already checked at the top. */
	switch (iocp->ioc_cmd) {
	case TI_GETPEERNAME:
		break;
	default:
		mi_copy_done(q, mp, EPROTO);
		return;
	}
	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), sizeof (sin_t),
	    B_TRUE);
	if (mp1 == NULL)
		return;
	STRUCT_FSET(sb, len, (int)sizeof (sin_t));
	rtsaddr = (struct sockaddr *)mp1->b_rptr;
	/* The block carries exactly one struct sockaddr. */
	mp1->b_wptr = (uchar_t *)&rtsaddr[1];
	bzero(rtsaddr, sizeof (struct sockaddr));
	rtsaddr->sa_family = AF_ROUTE;
	/* Copy out the address */
	mi_copyout(q, mp);
}
1074 
1075 /*ARGSUSED2*/
1076 static void
1077 rts_input(void *arg1, mblk_t *mp, void *arg2)
1078 {
1079 	conn_t *connp = (conn_t *)arg1;
1080 	rts_t	*rts = connp->conn_rts;
1081 	struct iocblk	*iocp;
1082 	mblk_t *mp1;
1083 	struct T_data_ind *tdi;
1084 
1085 	switch (mp->b_datap->db_type) {
1086 	case M_IOCACK:
1087 	case M_IOCNAK:
1088 		iocp = (struct iocblk *)mp->b_rptr;
1089 		if (rts->rts_flag & (RTS_WPUT_PENDING)) {
1090 			rts->rts_flag &= ~RTS_WPUT_PENDING;
1091 			rts->rts_error = iocp->ioc_error;
1092 			/*
1093 			 * Tell rts_wvw/qwait that we are done.
1094 			 * Note: there is no qwait_wakeup() we can use.
1095 			 */
1096 			qenable(connp->conn_rq);
1097 			freemsg(mp);
1098 			return;
1099 		}
1100 		break;
1101 	case M_DATA:
1102 		/*
1103 		 * Prepend T_DATA_IND to prevent the stream head from
1104 		 * consolidating multiple messages together.
1105 		 * If the allocation fails just send up the M_DATA.
1106 		 */
1107 		mp1 = allocb(sizeof (*tdi), BPRI_MED);
1108 		if (mp1 != NULL) {
1109 			mp1->b_cont = mp;
1110 			mp = mp1;
1111 
1112 			mp->b_datap->db_type = M_PROTO;
1113 			mp->b_wptr += sizeof (*tdi);
1114 			tdi = (struct T_data_ind *)mp->b_rptr;
1115 			tdi->PRIM_type = T_DATA_IND;
1116 			tdi->MORE_flag = 0;
1117 		}
1118 		break;
1119 	default:
1120 		break;
1121 	}
1122 	putnext(connp->conn_rq, mp);
1123 }
1124 
1125 
/*
 * One-time module initialization: compute the size reported as OPT_size in
 * T_INFO_ACK messages (rts_max_optsize) from the rts option table, and
 * register the per-netstack create/destroy callbacks.
 */
void
rts_ddi_init(void)
{
	rts_max_optsize = optcom_max_optsize(rts_opt_obj.odb_opt_des_arr,
	    rts_opt_obj.odb_opt_arr_cnt);

	/*
	 * We want to be informed each time a stack is created or
	 * destroyed in the kernel, so we can maintain the
	 * set of rts_stack_t's.
	 */
	netstack_register(NS_RTS, rts_stack_init, NULL, rts_stack_fini);
}
1139 
/*
 * Module teardown: undo the netstack_register() done in rts_ddi_init().
 */
void
rts_ddi_destroy(void)
{
	netstack_unregister(NS_RTS);
}
1145 
1146 /*
1147  * Initialize the RTS stack instance.
1148  */
1149 /* ARGSUSED */
1150 static void *
1151 rts_stack_init(netstackid_t stackid, netstack_t *ns)
1152 {
1153 	rts_stack_t	*rtss;
1154 	rtsparam_t	*pa;
1155 
1156 	rtss = (rts_stack_t *)kmem_zalloc(sizeof (*rtss), KM_SLEEP);
1157 	rtss->rtss_netstack = ns;
1158 
1159 	pa = (rtsparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
1160 	rtss->rtss_params = pa;
1161 	bcopy(lcl_param_arr, rtss->rtss_params, sizeof (lcl_param_arr));
1162 
1163 	(void) rts_param_register(&rtss->rtss_g_nd,
1164 	    rtss->rtss_params, A_CNT(lcl_param_arr));
1165 	return (rtss);
1166 }
1167 
1168 /*
1169  * Free the RTS stack instance.
1170  */
1171 /* ARGSUSED */
1172 static void
1173 rts_stack_fini(netstackid_t stackid, void *arg)
1174 {
1175 	rts_stack_t *rtss = (rts_stack_t *)arg;
1176 
1177 	nd_free(&rtss->rtss_g_nd);
1178 	kmem_free(rtss->rtss_params, sizeof (lcl_param_arr));
1179 	rtss->rtss_params = NULL;
1180 	kmem_free(rtss, sizeof (*rtss));
1181 }
1182