xref: /titanic_52/usr/src/uts/common/inet/ip/rts.c (revision d321a33cdd896e6b211d113a33698dd76e89b861)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsubr.h>
31 #include <sys/stropts.h>
32 #include <sys/strsun.h>
33 #include <sys/strlog.h>
34 #define	_SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/timod.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/cmn_err.h>
40 #include <sys/proc.h>
41 #include <sys/suntpi.h>
42 #include <sys/policy.h>
43 #include <sys/zone.h>
44 
45 #include <sys/socket.h>
46 #include <netinet/in.h>
47 
48 #include <inet/common.h>
49 #include <netinet/ip6.h>
50 #include <inet/ip.h>
51 #include <inet/ipclassifier.h>
52 #include <inet/mi.h>
53 #include <inet/nd.h>
54 #include <inet/optcom.h>
55 #include <netinet/ip_mroute.h>
56 #include <sys/isa_defs.h>
57 #include <net/route.h>
58 
59 #include <inet/rts_impl.h>
60 #include <inet/ip_rts.h>
61 
/*
 * This is a transport provider for routing sockets.  Downstream messages are
 * wrapped with an IP_IOCTL header, and ip_wput_ioctl calls the appropriate
 * entry in the ip_ioctl_ftbl callout table to pass the routing socket data
 * into IP.  Upstream messages are generated for listeners of the routing
 * socket as well as the message sender (unless they have turned off their end
 * using SO_USELOOPBACK or shutdown(3n)).  Upstream messages may also be
 * generated asynchronously when:
 *
 *	Interfaces are brought up or down.
 *	Addresses are assigned to interfaces.
 *	ICMP redirects are processed and an IRE_HOST/RTF_DYNAMIC is installed.
 *	No route is found while sending a packet.
 *	TCP requests IP to remove an IRE_CACHE of a troubled destination.
 *
 * Since all we do is reformat the messages between routing socket and
 * ioctl forms, no synchronization is necessary in this module; all
 * the dirty work is done down in ip.
 */
81 
82 /* Default structure copied into T_INFO_ACK messages */
83 static struct T_info_ack rts_g_t_info_ack = {
84 	T_INFO_ACK,
85 	T_INFINITE,	/* TSDU_size. Maximum size messages. */
86 	T_INVALID,	/* ETSDU_size. No expedited data. */
87 	T_INVALID,	/* CDATA_size. No connect data. */
88 	T_INVALID,	/* DDATA_size. No disconnect data. */
89 	0,		/* ADDR_size. */
90 	0,		/* OPT_size - not initialized here */
91 	64 * 1024,	/* TIDU_size. rts allows maximum size messages. */
92 	T_COTS,		/* SERV_type. rts supports connection oriented. */
93 	TS_UNBND,	/* CURRENT_state. This is set from rts_state. */
94 	(XPG4_1)	/* PROVIDER_flag */
95 };
96 
97 /*
98  * Table of ND variables supported by rts. These are loaded into rts_g_nd
99  * in rts_open.
100  * All of these are alterable, within the min/max values given, at run time.
101  */
102 static rtsparam_t	lcl_param_arr[] = {
103 	/* min		max		value		name */
104 	{ 4096,		65536,		8192,		"rts_xmit_hiwat"},
105 	{ 0,		65536,		1024,		"rts_xmit_lowat"},
106 	{ 4096,		65536,		8192,		"rts_recv_hiwat"},
107 	{ 65536,	1024*1024*1024, 256*1024,	"rts_max_buf"},
108 };
109 #define	rtss_xmit_hiwat		rtss_params[0].rts_param_value
110 #define	rtss_xmit_lowat		rtss_params[1].rts_param_value
111 #define	rtss_recv_hiwat		rtss_params[2].rts_param_value
112 #define	rtss_max_buf		rtss_params[3].rts_param_value
113 
114 static int	rts_close(queue_t *q);
115 static void 	rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
116     int sys_error);
117 static void	rts_input(void *, mblk_t *, void *);
118 static mblk_t	*rts_ioctl_alloc(mblk_t *data, cred_t *cr);
119 static int	rts_open(queue_t *q, dev_t *devp, int flag, int sflag,
120     cred_t *credp);
121 int		rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
122     uchar_t *ptr);
123 int		rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name,
124     uchar_t *ptr);
125 int		rts_opt_set(queue_t *q, uint_t optset_context, int level,
126     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
127     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
128 static int	rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
129 static boolean_t rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt);
130 static int	rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
131     cred_t *cr);
132 static void	rts_rsrv(queue_t *q);
133 static void	*rts_stack_init(netstackid_t stackid, netstack_t *ns);
134 static void	rts_stack_fini(netstackid_t stackid, void *arg);
135 static void	rts_wput(queue_t *q, mblk_t *mp);
136 static void	rts_wput_iocdata(queue_t *q, mblk_t *mp);
137 static void 	rts_wput_other(queue_t *q, mblk_t *mp);
138 static int	rts_wrw(queue_t *q, struiod_t *dp);
139 
140 static struct module_info rts_mod_info = {
141 	129, "rts", 1, INFPSZ, 512, 128
142 };
143 
144 static struct qinit rtsrinit = {
145 	NULL, (pfi_t)rts_rsrv, rts_open, rts_close, NULL, &rts_mod_info
146 };
147 
148 static struct qinit rtswinit = {
149 	(pfi_t)rts_wput, NULL, NULL, NULL, NULL, &rts_mod_info,
150 	NULL, (pfi_t)rts_wrw, NULL, STRUIOT_STANDARD
151 };
152 
153 struct streamtab rtsinfo = {
154 	&rtsrinit, &rtswinit
155 };
156 
157 /*
158  * This routine allocates the necessary
159  * message blocks for IOCTL wrapping the
160  * user data.
161  */
162 static mblk_t *
163 rts_ioctl_alloc(mblk_t *data, cred_t *cr)
164 {
165 	mblk_t	*mp = NULL;
166 	mblk_t	*mp1 = NULL;
167 	ipllc_t	*ipllc;
168 	struct iocblk	*ioc;
169 
170 	mp = allocb_cred(sizeof (ipllc_t), cr);
171 	if (mp == NULL)
172 		return (NULL);
173 	mp1 = allocb_cred(sizeof (struct iocblk), cr);
174 	if (mp1 == NULL) {
175 		freeb(mp);
176 		return (NULL);
177 	}
178 
179 	ipllc = (ipllc_t *)mp->b_rptr;
180 	ipllc->ipllc_cmd = IP_IOC_RTS_REQUEST;
181 	ipllc->ipllc_name_offset = 0;
182 	ipllc->ipllc_name_length = 0;
183 	mp->b_wptr += sizeof (ipllc_t);
184 	mp->b_cont = data;
185 
186 	ioc = (struct iocblk *)mp1->b_rptr;
187 	ioc->ioc_cmd = IP_IOCTL;
188 	ioc->ioc_error = 0;
189 	ioc->ioc_cr = NULL;
190 	ioc->ioc_count = msgdsize(mp);
191 	mp1->b_wptr += sizeof (struct iocblk);
192 	mp1->b_datap->db_type = M_IOCTL;
193 	mp1->b_cont = mp;
194 
195 	return (mp1);
196 }
197 
198 /*
199  * This routine closes rts stream, by disabling
200  * put/srv routines and freeing the this module
201  * internal datastructure.
202  */
203 static int
204 rts_close(queue_t *q)
205 {
206 	conn_t	*connp = Q_TO_CONN(q);
207 
208 	ASSERT(connp != NULL && IPCL_IS_RTS(connp));
209 
210 	ip_rts_unregister(connp);
211 
212 	ip_quiesce_conn(connp);
213 
214 	qprocsoff(q);
215 
216 	/*
217 	 * Now we are truly single threaded on this stream, and can
218 	 * delete the things hanging off the connp, and finally the connp.
219 	 * We removed this connp from the fanout list, it cannot be
220 	 * accessed thru the fanouts, and we already waited for the
221 	 * conn_ref to drop to 0. We are already in close, so
222 	 * there cannot be any other thread from the top. qprocsoff
223 	 * has completed, and service has completed or won't run in
224 	 * future.
225 	 */
226 	ASSERT(connp->conn_ref == 1);
227 
228 	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
229 
230 	connp->conn_ref--;
231 	ipcl_conn_destroy(connp);
232 
233 	q->q_ptr = WR(q)->q_ptr = NULL;
234 	return (0);
235 }
236 
237 /*
238  * This is the open routine for routing socket. It allocates
239  * rts_t structure for the stream and tells IP that it is a routing socket.
240  */
241 /* ARGSUSED */
242 static int
243 rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
244 {
245 	rts_t	*rts;
246 	conn_t *connp;
247 	dev_t	conn_dev;
248 	zoneid_t zoneid;
249 	netstack_t *ns;
250 	rts_stack_t *rtss;
251 
252 	/* If the stream is already open, return immediately. */
253 	if (q->q_ptr != NULL)
254 		return (0);
255 
256 	if (sflag == MODOPEN)
257 		return (EINVAL);
258 
259 	ns = netstack_find_by_cred(credp);
260 	ASSERT(ns != NULL);
261 	rtss = ns->netstack_rts;
262 	ASSERT(rtss != NULL);
263 
264 	/*
265 	 * For exclusive stacks we set the zoneid to zero
266 	 * to make RTS operate as if in the global zone.
267 	 */
268 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
269 		zoneid = GLOBAL_ZONEID;
270 	else
271 		zoneid = crgetzoneid(credp);
272 
273 	/*
274 	 * Since RTS is not used so heavily, allocating from the small
275 	 * arena should be sufficient.
276 	 */
277 	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
278 		netstack_rele(ns);
279 		return (EBUSY);
280 	}
281 	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
282 
283 	connp = ipcl_conn_create(IPCL_RTSCONN, KM_SLEEP, ns);
284 	connp->conn_dev = conn_dev;
285 	connp->conn_minor_arena = ip_minor_arena_sa;
286 	rts = connp->conn_rts;
287 
288 	/*
289 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
290 	 * done by netstack_find_by_cred()
291 	 */
292 	netstack_rele(ns);
293 
294 	/*
295 	 * Initialize the rts_t structure for this stream.
296 	 */
297 	q->q_ptr = connp;
298 	WR(q)->q_ptr = connp;
299 	connp->conn_rq = q;
300 	connp->conn_wq = WR(q);
301 
302 	rw_enter(&rts->rts_rwlock, RW_WRITER);
303 	ASSERT(connp->conn_rts == rts);
304 	ASSERT(rts->rts_connp == connp);
305 
306 	/* Set the initial state of the stream and the privilege status. */
307 	rts->rts_state = TS_UNBND;
308 	connp->conn_zoneid = zoneid;
309 
310 	connp->conn_ulp_labeled = is_system_labeled();
311 
312 	rts->rts_rtss = rtss;
313 
314 	q->q_hiwat = rtss->rtss_recv_hiwat;
315 	WR(q)->q_hiwat = rtss->rtss_xmit_hiwat;
316 	WR(q)->q_lowat = rtss->rtss_xmit_lowat;
317 
318 	connp->conn_recv = rts_input;
319 	crhold(credp);
320 	connp->conn_cred = credp;
321 
322 	mutex_enter(&connp->conn_lock);
323 	connp->conn_state_flags &= ~CONN_INCIPIENT;
324 	mutex_exit(&connp->conn_lock);
325 
326 	qprocson(q);
327 	rw_exit(&rts->rts_rwlock);
328 
329 	/*
330 	 * Indicate the down IP module that this is a routing socket
331 	 * client by sending an RTS IOCTL without any user data. Although
332 	 * this is just a notification message (without any real routing
333 	 * request), we pass in any credential for correctness sake.
334 	 */
335 	ip_rts_register(connp);
336 
337 	return (0);
338 
339 }
340 
341 /*
342  * This routine creates a T_ERROR_ACK message and passes it upstream.
343  */
344 static void
345 rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
346 {
347 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
348 		qreply(q, mp);
349 }
350 
351 /*
352  * This routine creates a T_OK_ACK message and passes it upstream.
353  */
354 static void
355 rts_ok_ack(queue_t *q, mblk_t *mp)
356 {
357 	if ((mp = mi_tpi_ok_ack_alloc(mp)) != NULL)
358 		qreply(q, mp);
359 }
360 
361 /*
362  * This routine is called by rts_wput to handle T_UNBIND_REQ messages.
363  */
364 static void
365 rts_unbind(queue_t *q, mblk_t *mp)
366 {
367 	conn_t	*connp = Q_TO_CONN(q);
368 	rts_t	*rts = connp->conn_rts;
369 
370 	/* If a bind has not been done, we can't unbind. */
371 	if (rts->rts_state != TS_IDLE) {
372 		rts_err_ack(q, mp, TOUTSTATE, 0);
373 		return;
374 	}
375 	rts->rts_state = TS_UNBND;
376 	rts_ok_ack(q, mp);
377 }
378 
379 /*
380  * This routine is called to handle each
381  * O_T_BIND_REQ/T_BIND_REQ message passed to
382  * rts_wput. Note: This routine works with both
383  * O_T_BIND_REQ and T_BIND_REQ semantics.
384  */
385 static void
386 rts_bind(queue_t *q, mblk_t *mp)
387 {
388 	conn_t	*connp = Q_TO_CONN(q);
389 	rts_t	*rts = connp->conn_rts;
390 	mblk_t	*mp1;
391 	struct T_bind_req *tbr;
392 
393 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
394 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
395 		    "rts_bind: bad data, %d", rts->rts_state);
396 		rts_err_ack(q, mp, TBADADDR, 0);
397 		return;
398 	}
399 	if (rts->rts_state != TS_UNBND) {
400 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
401 		    "rts_bind: bad state, %d", rts->rts_state);
402 		rts_err_ack(q, mp, TOUTSTATE, 0);
403 		return;
404 	}
405 	/*
406 	 * Reallocate the message to make sure we have enough room for an
407 	 * address and the protocol type.
408 	 */
409 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin_t), 1);
410 	if (mp1 == NULL) {
411 		rts_err_ack(q, mp, TSYSERR, ENOMEM);
412 		return;
413 	}
414 	mp = mp1;
415 	tbr = (struct T_bind_req *)mp->b_rptr;
416 	if (tbr->ADDR_length != 0) {
417 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
418 		    "rts_bind: bad ADDR_length %d", tbr->ADDR_length);
419 		rts_err_ack(q, mp, TBADADDR, 0);
420 		return;
421 	}
422 	/* Generic request */
423 	tbr->ADDR_offset = (t_scalar_t)sizeof (struct T_bind_req);
424 	tbr->ADDR_length = 0;
425 	tbr->PRIM_type = T_BIND_ACK;
426 	rts->rts_state = TS_IDLE;
427 	qreply(q, mp);
428 }
429 
430 static void
431 rts_copy_info(struct T_info_ack *tap, rts_t *rts)
432 {
433 	*tap = rts_g_t_info_ack;
434 	tap->CURRENT_state = rts->rts_state;
435 	tap->OPT_size = rts_max_optsize;
436 }
437 
438 /*
439  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
440  * rts_wput.  Much of the T_CAPABILITY_ACK information is copied from
441  * rts_g_t_info_ack.  The current state of the stream is copied from
442  * rts_state.
443  */
444 static void
445 rts_capability_req(queue_t *q, mblk_t *mp)
446 {
447 	conn_t	*connp = Q_TO_CONN(q);
448 	rts_t	*rts = connp->conn_rts;
449 	t_uscalar_t		cap_bits1;
450 	struct T_capability_ack	*tcap;
451 
452 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
453 
454 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
455 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
456 	if (mp == NULL)
457 		return;
458 
459 	tcap = (struct T_capability_ack *)mp->b_rptr;
460 	tcap->CAP_bits1 = 0;
461 
462 	if (cap_bits1 & TC1_INFO) {
463 		rts_copy_info(&tcap->INFO_ack, rts);
464 		tcap->CAP_bits1 |= TC1_INFO;
465 	}
466 
467 	qreply(q, mp);
468 }
469 
470 /*
471  * This routine responds to T_INFO_REQ messages.  It is called by rts_wput.
472  * Most of the T_INFO_ACK information is copied from rts_g_t_info_ack.
473  * The current state of the stream is copied from rts_state.
474  */
475 static void
476 rts_info_req(queue_t *q, mblk_t *mp)
477 {
478 	conn_t	*connp = Q_TO_CONN(q);
479 	rts_t	*rts = connp->conn_rts;
480 
481 	mp = tpi_ack_alloc(mp, sizeof (rts_g_t_info_ack), M_PCPROTO,
482 	    T_INFO_ACK);
483 	if (mp == NULL)
484 		return;
485 	rts_copy_info((struct T_info_ack *)mp->b_rptr, rts);
486 	qreply(q, mp);
487 }
488 
489 /*
490  * This routine gets default values of certain options whose default
491  * values are maintained by protcol specific code
492  */
493 /* ARGSUSED */
494 int
495 rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
496 {
497 	/* no default value processed by protocol specific code currently */
498 	return (-1);
499 }
500 
501 /*
502  * This routine retrieves the current status of socket options.
503  * It returns the size of the option retrieved.
504  */
505 int
506 rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
507 {
508 	int	*i1 = (int *)ptr;
509 	conn_t	*connp = Q_TO_CONN(q);
510 	rts_t	*rts = connp->conn_rts;
511 
512 	switch (level) {
513 	case SOL_SOCKET:
514 		switch (name) {
515 		case SO_DEBUG:
516 			*i1 = rts->rts_debug;
517 			break;
518 		case SO_REUSEADDR:
519 			*i1 = rts->rts_reuseaddr;
520 			break;
521 		case SO_TYPE:
522 			*i1 = SOCK_RAW;
523 			break;
524 
525 		/*
526 		 * The following three items are available here,
527 		 * but are only meaningful to IP.
528 		 */
529 		case SO_DONTROUTE:
530 			*i1 = rts->rts_dontroute;
531 			break;
532 		case SO_USELOOPBACK:
533 			*i1 = rts->rts_useloopback;
534 			break;
535 		case SO_BROADCAST:
536 			*i1 = rts->rts_broadcast;
537 			break;
538 		case SO_PROTOTYPE:
539 			*i1 = rts->rts_proto;
540 			break;
541 		/*
542 		 * The following two items can be manipulated,
543 		 * but changing them should do nothing.
544 		 */
545 		case SO_SNDBUF:
546 			ASSERT(q->q_hiwat <= INT_MAX);
547 			*i1 = (int)(q->q_hiwat);
548 			break;
549 		case SO_RCVBUF:
550 			ASSERT(q->q_hiwat <= INT_MAX);
551 			*i1 = (int)(RD(q)->q_hiwat);
552 			break;
553 		case SO_DOMAIN:
554 			*i1 = PF_ROUTE;
555 			break;
556 		default:
557 			return (-1);
558 		}
559 		break;
560 	default:
561 		return (-1);
562 	}
563 	return ((int)sizeof (int));
564 }
565 
566 
567 /*
568  * This routine sets socket options.
569  */
570 /*ARGSUSED*/
571 int
572 rts_opt_set(queue_t *q, uint_t optset_context, int level,
573     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
574     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
575 {
576 	int	*i1 = (int *)invalp;
577 	conn_t	*connp = Q_TO_CONN(q);
578 	rts_t	*rts = connp->conn_rts;
579 	boolean_t checkonly;
580 	rts_stack_t	*rtss = rts->rts_rtss;
581 
582 	switch (optset_context) {
583 	case SETFN_OPTCOM_CHECKONLY:
584 		checkonly = B_TRUE;
585 		/*
586 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
587 		 * inlen != 0 implies value supplied and
588 		 * 	we have to "pretend" to set it.
589 		 * inlen == 0 implies that there is no
590 		 * 	value part in T_CHECK request and just validation
591 		 * done elsewhere should be enough, we just return here.
592 		 */
593 		if (inlen == 0) {
594 			*outlenp = 0;
595 			return (0);
596 		}
597 		break;
598 	case SETFN_OPTCOM_NEGOTIATE:
599 		checkonly = B_FALSE;
600 		break;
601 	case SETFN_UD_NEGOTIATE:
602 	case SETFN_CONN_NEGOTIATE:
603 		checkonly = B_FALSE;
604 		/*
605 		 * Negotiating local and "association-related" options
606 		 * through T_UNITDATA_REQ or T_CONN_{REQ,CON}
607 		 * Not allowed in this module.
608 		 */
609 		return (EINVAL);
610 	default:
611 		/*
612 		 * We should never get here
613 		 */
614 		*outlenp = 0;
615 		return (EINVAL);
616 	}
617 
618 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
619 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
620 
621 	/*
622 	 * For rts, we should have no ancillary data sent down
623 	 * (rts_wput doesn't handle options).
624 	 */
625 	ASSERT(thisdg_attrs == NULL);
626 
627 	/*
628 	 * For fixed length options, no sanity check
629 	 * of passed in length is done. It is assumed *_optcom_req()
630 	 * routines do the right thing.
631 	 */
632 
633 	switch (level) {
634 	case SOL_SOCKET:
635 		switch (name) {
636 		case SO_REUSEADDR:
637 			if (!checkonly)
638 				rts->rts_reuseaddr = *i1;
639 			break;	/* goto sizeof (int) option return */
640 		case SO_DEBUG:
641 			if (!checkonly)
642 				rts->rts_debug = *i1;
643 			break;	/* goto sizeof (int) option return */
644 		/*
645 		 * The following three items are available here,
646 		 * but are only meaningful to IP.
647 		 */
648 		case SO_DONTROUTE:
649 			if (!checkonly)
650 				rts->rts_dontroute = *i1;
651 			break;	/* goto sizeof (int) option return */
652 		case SO_USELOOPBACK:
653 			if (!checkonly)
654 				rts->rts_useloopback = *i1;
655 			break;	/* goto sizeof (int) option return */
656 		case SO_BROADCAST:
657 			if (!checkonly)
658 				rts->rts_broadcast = *i1;
659 			break;	/* goto sizeof (int) option return */
660 		case SO_PROTOTYPE:
661 			/*
662 			 * Routing socket applications that call socket() with
663 			 * a third argument can filter which messages will be
664 			 * sent upstream thanks to sockfs.  so_socket() sends
665 			 * down the SO_PROTOTYPE and rts_queue_input()
666 			 * implements the filtering.
667 			 */
668 			if (*i1 != AF_INET && *i1 != AF_INET6)
669 				return (EPROTONOSUPPORT);
670 			if (!checkonly)
671 				rts->rts_proto = *i1;
672 			break;	/* goto sizeof (int) option return */
673 		/*
674 		 * The following two items can be manipulated,
675 		 * but changing them should do nothing.
676 		 */
677 		case SO_SNDBUF:
678 			if (*i1 > rtss->rtss_max_buf) {
679 				*outlenp = 0;
680 				return (ENOBUFS);
681 			}
682 			if (!checkonly) {
683 				q->q_hiwat = *i1;
684 			}
685 			break;	/* goto sizeof (int) option return */
686 		case SO_RCVBUF:
687 			if (*i1 > rtss->rtss_max_buf) {
688 				*outlenp = 0;
689 				return (ENOBUFS);
690 			}
691 			if (!checkonly) {
692 				RD(q)->q_hiwat = *i1;
693 				(void) mi_set_sth_hiwat(RD(q), *i1);
694 			}
695 			break;	/* goto sizeof (int) option return */
696 		default:
697 			*outlenp = 0;
698 			return (EINVAL);
699 		}
700 		break;
701 	default:
702 		*outlenp = 0;
703 		return (EINVAL);
704 	}
705 	/*
706 	 * Common case of return from an option that is sizeof (int)
707 	 */
708 	*(int *)outvalp = *i1;
709 	*outlenp = (t_uscalar_t)sizeof (int);
710 	return (0);
711 }
712 
713 /*
714  * This routine retrieves the value of an ND variable in a rtsparam_t
715  * structure. It is called through nd_getset when a user reads the
716  * variable.
717  */
718 /* ARGSUSED */
719 static int
720 rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
721 {
722 	rtsparam_t	*rtspa = (rtsparam_t *)cp;
723 
724 	(void) mi_mpprintf(mp, "%u", rtspa->rts_param_value);
725 	return (0);
726 }
727 
728 /*
729  * Walk through the param array specified registering each element with the
730  * named dispatch (ND) handler.
731  */
732 static boolean_t
733 rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt)
734 {
735 	for (; cnt-- > 0; rtspa++) {
736 		if (rtspa->rts_param_name != NULL && rtspa->rts_param_name[0]) {
737 			if (!nd_load(ndp, rtspa->rts_param_name,
738 			    rts_param_get, rts_param_set, (caddr_t)rtspa)) {
739 				nd_free(ndp);
740 				return (B_FALSE);
741 			}
742 		}
743 	}
744 	return (B_TRUE);
745 }
746 
747 /* This routine sets an ND variable in a rtsparam_t structure. */
748 /* ARGSUSED */
749 static int
750 rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
751 {
752 	ulong_t	new_value;
753 	rtsparam_t	*rtspa = (rtsparam_t *)cp;
754 
755 	/*
756 	 * Fail the request if the new value does not lie within the
757 	 * required bounds.
758 	 */
759 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
760 	    new_value < rtspa->rts_param_min ||
761 	    new_value > rtspa->rts_param_max) {
762 		return (EINVAL);
763 	}
764 
765 	/* Set the new value */
766 	rtspa->rts_param_value = new_value;
767 	return (0);
768 }
769 
770 /*
771  * Empty rsrv routine which is used by rts_input to cause a wakeup
772  * of a thread in qwait.
773  */
774 /*ARGSUSED*/
775 static void
776 rts_rsrv(queue_t *q)
777 {
778 }
779 
780 /*
781  * This routine handles synchronous messages passed downstream. It either
782  * consumes the message or passes it downstream; it never queues a
783  * a message. The data messages that go down are wrapped in an IOCTL
784  * message.
785  *
786  * Since it is synchronous, it waits for the M_IOCACK/M_IOCNAK so that
787  * it can return an immediate error (such as ENETUNREACH when adding a route).
788  * It uses the RTS_WRW_PENDING to ensure that each rts instance has only
789  * one M_IOCTL outstanding at any given time.
790  */
791 static int
792 rts_wrw(queue_t *q, struiod_t *dp)
793 {
794 	mblk_t	*mp = dp->d_mp;
795 	mblk_t	*mp1;
796 	int	error;
797 	rt_msghdr_t	*rtm;
798 	conn_t	*connp = Q_TO_CONN(q);
799 	rts_t	*rts = connp->conn_rts;
800 
801 	while (rts->rts_flag & RTS_WRW_PENDING) {
802 		if (qwait_rw(q)) {
803 			rts->rts_error = EINTR;
804 			goto err_ret;
805 		}
806 		}
807 	rts->rts_flag |= RTS_WRW_PENDING;
808 
809 	if (isuioq(q) && (error = struioget(q, mp, dp, 0))) {
810 		/*
811 		 * Uio error of some sort, so just return the error.
812 		 */
813 		rts->rts_error = error;
814 		goto err_ret;
815 	}
816 	/*
817 	 * Pass the mblk (chain) onto wput().
818 	 */
819 	dp->d_mp = 0;
820 
821 	switch (mp->b_datap->db_type) {
822 	case M_PROTO:
823 	case M_PCPROTO:
824 		/* Expedite other than T_DATA_REQ to below the switch */
825 		if (((mp->b_wptr - mp->b_rptr) !=
826 		    sizeof (struct T_data_req)) ||
827 		    (((union T_primitives *)mp->b_rptr)->type != T_DATA_REQ))
828 			break;
829 		if ((mp1 = mp->b_cont) == NULL) {
830 			rts->rts_error = EINVAL;
831 			goto err_ret;
832 		}
833 		freeb(mp);
834 		mp = mp1;
835 		/* FALLTHRU */
836 	case M_DATA:
837 		/*
838 		 * The semantics of the routing socket is such that the rtm_pid
839 		 * field is automatically filled in during requests with the
840 		 * current process' pid.  We do this here (where we still have
841 		 * user context) after checking we have at least a message the
842 		 * size of a routing message header.
843 		 */
844 		if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
845 			if (!pullupmsg(mp, sizeof (rt_msghdr_t))) {
846 				rts->rts_error = EINVAL;
847 				goto err_ret;
848 			}
849 		}
850 		rtm = (rt_msghdr_t *)mp->b_rptr;
851 		rtm->rtm_pid = curproc->p_pid;
852 		break;
853 	default:
854 		break;
855 	}
856 	rts->rts_flag |= RTS_WPUT_PENDING;
857 	rts_wput(q, mp);
858 	while (rts->rts_flag & RTS_WPUT_PENDING)
859 		if (qwait_rw(q)) {
860 			/* RTS_WPUT_PENDING will be cleared below */
861 			rts->rts_error = EINTR;
862 			break;
863 		}
864 err_ret:
865 	rts->rts_flag &= ~(RTS_WPUT_PENDING | RTS_WRW_PENDING);
866 	return (rts->rts_error);
867 }
868 
869 /*
870  * This routine handles all messages passed downstream. It either
871  * consumes the message or passes it downstream; it never queues a
872  * a message. The data messages that go down are wrapped in an IOCTL
873  * message.
874  *
875  * FIXME? Should we call IP rts_request directly? Could punt on returning
876  * errno in the case when it defers processing due to
877  * IPIF_CHANGING/ILL_CHANGING???
878  */
879 static void
880 rts_wput(queue_t *q, mblk_t *mp)
881 {
882 	uchar_t	*rptr = mp->b_rptr;
883 	mblk_t	*mp1;
884 	conn_t	*connp = Q_TO_CONN(q);
885 	rts_t	*rts = connp->conn_rts;
886 
887 	switch (mp->b_datap->db_type) {
888 	case M_DATA:
889 		break;
890 	case M_PROTO:
891 	case M_PCPROTO:
892 		if ((mp->b_wptr - rptr) == sizeof (struct T_data_req)) {
893 			/* Expedite valid T_DATA_REQ to below the switch */
894 			if (((union T_primitives *)rptr)->type == T_DATA_REQ) {
895 				mp1 = mp->b_cont;
896 				freeb(mp);
897 				if (mp1 == NULL)
898 					return;
899 				mp = mp1;
900 				break;
901 			}
902 		}
903 		/* FALLTHRU */
904 	default:
905 		rts_wput_other(q, mp);
906 		return;
907 	}
908 
909 
910 	mp1 = rts_ioctl_alloc(mp, DB_CRED(mp));
911 	if (mp1 == NULL) {
912 		ASSERT(rts != NULL);
913 		freemsg(mp);
914 		if (rts->rts_flag & RTS_WPUT_PENDING) {
915 			rts->rts_error = ENOMEM;
916 			rts->rts_flag &= ~RTS_WPUT_PENDING;
917 		}
918 		return;
919 	}
920 	ip_output(connp, mp1, q, IP_WPUT);
921 }
922 
923 
924 /*
925  * Handles all the control message, if it
926  * can not understand it, it will
927  * pass down stream.
928  */
929 static void
930 rts_wput_other(queue_t *q, mblk_t *mp)
931 {
932 	conn_t	*connp = Q_TO_CONN(q);
933 	rts_t	*rts = connp->conn_rts;
934 	uchar_t	*rptr = mp->b_rptr;
935 	struct iocblk	*iocp;
936 	cred_t	*cr;
937 	rts_stack_t	*rtss;
938 
939 	rtss = rts->rts_rtss;
940 
941 	cr = DB_CREDDEF(mp, connp->conn_cred);
942 
943 	switch (mp->b_datap->db_type) {
944 	case M_PROTO:
945 	case M_PCPROTO:
946 		if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) {
947 			/*
948 			 * If the message does not contain a PRIM_type,
949 			 * throw it away.
950 			 */
951 			freemsg(mp);
952 			return;
953 		}
954 		switch (((union T_primitives *)rptr)->type) {
955 		case T_BIND_REQ:
956 		case O_T_BIND_REQ:
957 			rts_bind(q, mp);
958 			return;
959 		case T_UNBIND_REQ:
960 			rts_unbind(q, mp);
961 			return;
962 		case T_CAPABILITY_REQ:
963 			rts_capability_req(q, mp);
964 			return;
965 		case T_INFO_REQ:
966 			rts_info_req(q, mp);
967 			return;
968 		case T_SVR4_OPTMGMT_REQ:
969 			(void) svr4_optcom_req(q, mp, cr, &rts_opt_obj,
970 			    B_TRUE);
971 			return;
972 		case T_OPTMGMT_REQ:
973 			(void) tpi_optcom_req(q, mp, cr, &rts_opt_obj, B_TRUE);
974 			return;
975 		case O_T_CONN_RES:
976 		case T_CONN_RES:
977 		case T_DISCON_REQ:
978 			/* Not supported by rts. */
979 			rts_err_ack(q, mp, TNOTSUPPORT, 0);
980 			return;
981 		case T_DATA_REQ:
982 		case T_EXDATA_REQ:
983 		case T_ORDREL_REQ:
984 			/* Illegal for rts. */
985 			freemsg(mp);
986 			(void) putnextctl1(RD(q), M_ERROR, EPROTO);
987 			return;
988 		default:
989 			break;
990 		}
991 		break;
992 	case M_IOCTL:
993 		iocp = (struct iocblk *)mp->b_rptr;
994 		switch (iocp->ioc_cmd) {
995 		case ND_SET:
996 		case ND_GET:
997 			if (nd_getset(q, rtss->rtss_g_nd, mp)) {
998 				qreply(q, mp);
999 				return;
1000 			}
1001 			break;
1002 		case TI_GETPEERNAME:
1003 			mi_copyin(q, mp, NULL,
1004 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
1005 			return;
1006 		default:
1007 			break;
1008 		}
1009 	case M_IOCDATA:
1010 		rts_wput_iocdata(q, mp);
1011 		return;
1012 	default:
1013 		break;
1014 	}
1015 	ip_output(connp, mp, q, IP_WPUT);
1016 }
1017 
1018 /*
1019  * Called by rts_wput_other to handle all M_IOCDATA messages.
1020  */
1021 static void
1022 rts_wput_iocdata(queue_t *q, mblk_t *mp)
1023 {
1024 	conn_t *connp = Q_TO_CONN(q);
1025 	struct sockaddr	*rtsaddr;
1026 	mblk_t	*mp1;
1027 	STRUCT_HANDLE(strbuf, sb);
1028 	struct iocblk	*iocp	= (struct iocblk *)mp->b_rptr;
1029 
1030 	/* Make sure it is one of ours. */
1031 	switch (iocp->ioc_cmd) {
1032 	case TI_GETPEERNAME:
1033 		break;
1034 	default:
1035 		ip_output(connp, mp, q, IP_WPUT);
1036 		return;
1037 	}
1038 	switch (mi_copy_state(q, mp, &mp1)) {
1039 	case -1:
1040 		return;
1041 	case MI_COPY_CASE(MI_COPY_IN, 1):
1042 		break;
1043 	case MI_COPY_CASE(MI_COPY_OUT, 1):
1044 		/* Copy out the strbuf. */
1045 		mi_copyout(q, mp);
1046 		return;
1047 	case MI_COPY_CASE(MI_COPY_OUT, 2):
1048 		/* All done. */
1049 		mi_copy_done(q, mp, 0);
1050 		return;
1051 	default:
1052 		mi_copy_done(q, mp, EPROTO);
1053 		return;
1054 	}
1055 	STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
1056 	if (STRUCT_FGET(sb, maxlen) < (int)sizeof (sin_t)) {
1057 		mi_copy_done(q, mp, EINVAL);
1058 		return;
1059 	}
1060 	switch (iocp->ioc_cmd) {
1061 	case TI_GETPEERNAME:
1062 		break;
1063 	default:
1064 		mi_copy_done(q, mp, EPROTO);
1065 		return;
1066 	}
1067 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), sizeof (sin_t),
1068 	    B_TRUE);
1069 	if (mp1 == NULL)
1070 		return;
1071 	STRUCT_FSET(sb, len, (int)sizeof (sin_t));
1072 	rtsaddr = (struct sockaddr *)mp1->b_rptr;
1073 	mp1->b_wptr = (uchar_t *)&rtsaddr[1];
1074 	bzero(rtsaddr, sizeof (struct sockaddr));
1075 	rtsaddr->sa_family = AF_ROUTE;
1076 	/* Copy out the address */
1077 	mi_copyout(q, mp);
1078 }
1079 
1080 /*ARGSUSED2*/
1081 static void
1082 rts_input(void *arg1, mblk_t *mp, void *arg2)
1083 {
1084 	conn_t *connp = (conn_t *)arg1;
1085 	rts_t	*rts = connp->conn_rts;
1086 	struct iocblk	*iocp;
1087 	mblk_t *mp1;
1088 	struct T_data_ind *tdi;
1089 
1090 	switch (mp->b_datap->db_type) {
1091 	case M_IOCACK:
1092 	case M_IOCNAK:
1093 		iocp = (struct iocblk *)mp->b_rptr;
1094 		if (rts->rts_flag & (RTS_WPUT_PENDING)) {
1095 			rts->rts_flag &= ~RTS_WPUT_PENDING;
1096 			rts->rts_error = iocp->ioc_error;
1097 			/*
1098 			 * Tell rts_wvw/qwait that we are done.
1099 			 * Note: there is no qwait_wakeup() we can use.
1100 			 */
1101 			qenable(connp->conn_rq);
1102 			freemsg(mp);
1103 			return;
1104 		}
1105 		break;
1106 	case M_DATA:
1107 		/*
1108 		 * Prepend T_DATA_IND to prevent the stream head from
1109 		 * consolidating multiple messages together.
1110 		 * If the allocation fails just send up the M_DATA.
1111 		 */
1112 		mp1 = allocb(sizeof (*tdi), BPRI_MED);
1113 		if (mp1 != NULL) {
1114 			mp1->b_cont = mp;
1115 			mp = mp1;
1116 
1117 			mp->b_datap->db_type = M_PROTO;
1118 			mp->b_wptr += sizeof (*tdi);
1119 			tdi = (struct T_data_ind *)mp->b_rptr;
1120 			tdi->PRIM_type = T_DATA_IND;
1121 			tdi->MORE_flag = 0;
1122 		}
1123 		break;
1124 	default:
1125 		break;
1126 	}
1127 	putnext(connp->conn_rq, mp);
1128 }
1129 
1130 
1131 void
1132 rts_ddi_init(void)
1133 {
1134 	rts_max_optsize = optcom_max_optsize(rts_opt_obj.odb_opt_des_arr,
1135 	    rts_opt_obj.odb_opt_arr_cnt);
1136 
1137 	/*
1138 	 * We want to be informed each time a stack is created or
1139 	 * destroyed in the kernel, so we can maintain the
1140 	 * set of rts_stack_t's.
1141 	 */
1142 	netstack_register(NS_RTS, rts_stack_init, NULL, rts_stack_fini);
1143 }
1144 
1145 void
1146 rts_ddi_destroy(void)
1147 {
1148 	netstack_unregister(NS_RTS);
1149 }
1150 
1151 /*
1152  * Initialize the RTS stack instance.
1153  */
1154 /* ARGSUSED */
1155 static void *
1156 rts_stack_init(netstackid_t stackid, netstack_t *ns)
1157 {
1158 	rts_stack_t	*rtss;
1159 	rtsparam_t	*pa;
1160 
1161 	rtss = (rts_stack_t *)kmem_zalloc(sizeof (*rtss), KM_SLEEP);
1162 	rtss->rtss_netstack = ns;
1163 
1164 	pa = (rtsparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
1165 	rtss->rtss_params = pa;
1166 	bcopy(lcl_param_arr, rtss->rtss_params, sizeof (lcl_param_arr));
1167 
1168 	(void) rts_param_register(&rtss->rtss_g_nd,
1169 	    rtss->rtss_params, A_CNT(lcl_param_arr));
1170 	return (rtss);
1171 }
1172 
1173 /*
1174  * Free the RTS stack instance.
1175  */
1176 /* ARGSUSED */
1177 static void
1178 rts_stack_fini(netstackid_t stackid, void *arg)
1179 {
1180 	rts_stack_t *rtss = (rts_stack_t *)arg;
1181 
1182 	nd_free(&rtss->rtss_g_nd);
1183 	kmem_free(rtss->rtss_params, sizeof (lcl_param_arr));
1184 	rtss->rtss_params = NULL;
1185 	kmem_free(rtss, sizeof (*rtss));
1186 }
1187