xref: /illumos-gate/usr/src/uts/common/inet/ip/rts.c (revision cb6207858a9fcc2feaee22e626912fba281ac969)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsubr.h>
31 #include <sys/stropts.h>
32 #include <sys/strsun.h>
33 #include <sys/strlog.h>
34 #define	_SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/timod.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/cmn_err.h>
40 #include <sys/proc.h>
41 #include <sys/suntpi.h>
42 #include <sys/policy.h>
43 #include <sys/zone.h>
44 
45 #include <sys/socket.h>
46 #include <netinet/in.h>
47 
48 #include <inet/common.h>
49 #include <netinet/ip6.h>
50 #include <inet/ip.h>
51 #include <inet/mi.h>
52 #include <inet/nd.h>
53 #include <inet/optcom.h>
54 #include <netinet/ip_mroute.h>
55 #include <sys/isa_defs.h>
56 #include <net/route.h>
57 
58 /*
 * This is a transport provider for routing sockets.  Downstream messages are
 * wrapped with an IP_IOCTL header, and ip_wput_ioctl calls the appropriate
 * entry in the ip_ioctl_ftbl callout table to pass the routing socket data
 * into IP.
62  * Upstream messages are generated for listeners of the routing socket as well
63  * as the message sender (unless they have turned off their end using
64  * SO_USELOOPBACK or shutdown(3n)).  Upstream messages may also be generated
65  * asynchronously when:
66  *
67  *	Interfaces are brought up or down.
68  *	Addresses are assigned to interfaces.
 *	ICMP redirects are processed and an IRE_HOST_REDIRECT is installed.
70  *	No route is found while sending a packet.
71  *	When TCP requests IP to remove an IRE_CACHE of a troubled destination.
72  *
73  * Since all we do is reformat the messages between routing socket and
74  * ioctl forms, no synchronization is necessary in this module; all
75  * the dirty work is done down in ip.
76  */
77 
78 /*
79  * RTS stack instances
80  */
81 struct rts_stack {
82 	netstack_t		*rtss_netstack;	/* Common netstack */
83 
84 	caddr_t			rtss_g_nd;
85 	struct rtsparam_s	*rtss_params;
86 };
87 typedef struct rts_stack rts_stack_t;
88 
89 /*
90  * Object to represent database of options to search passed to
91  * {sock,tpi}optcom_req() interface routine to take care of option
92  * management and associated methods.
93  * XXX. These and other externs should really move to a rts header.
94  */
95 extern optdb_obj_t	rts_opt_obj;
96 extern uint_t		rts_max_optsize;
97 
98 /* Internal routing socket stream control structure, one per open stream */
99 typedef	struct rts_s {
100 	cred_t	*rts_credp;		/* Opener's credentials */
101 	uint_t	rts_state;		/* Provider interface state */
102 	uint_t	rts_error;		/* Routing socket error code */
103 	uint_t	rts_flag;		/* Pending I/O state */
104 	uint_t	rts_proto;		/* SO_PROTOTYPE "socket" option. */
105 	uint_t	rts_debug : 1,		/* SO_DEBUG "socket" option. */
106 		rts_dontroute : 1,	/* SO_DONTROUTE "socket" option. */
107 		rts_broadcast : 1,	/* SO_BROADCAST "socket" option. */
108 		rts_reuseaddr : 1,	/* SO_REUSEADDR "socket" option. */
109 		rts_useloopback : 1,	/* SO_USELOOPBACK "socket" option. */
110 		rts_multicast_loop : 1,	/* IP_MULTICAST_LOOP option */
111 		rts_hdrincl : 1,	/* IP_HDRINCL option + RAW and IGMP */
112 
113 		: 0;
114 	rts_stack_t	*rts_rtss;
115 } rts_t;
116 
117 #define	RTS_WPUT_PENDING	0x1	/* Waiting for write-side to complete */
118 #define	RTS_WRW_PENDING		0x2	/* Routing socket write in progress */
119 #define	RTS_OPEN_PENDING	0x4	/* Routing socket open in progress */
120 
121 /* Default structure copied into T_INFO_ACK messages */
122 static struct T_info_ack rts_g_t_info_ack = {
123 	T_INFO_ACK,
124 	T_INFINITE,	/* TSDU_size. Maximum size messages. */
125 	T_INVALID,	/* ETSDU_size. No expedited data. */
126 	T_INVALID,	/* CDATA_size. No connect data. */
127 	T_INVALID,	/* DDATA_size. No disconnect data. */
128 	0,		/* ADDR_size. */
129 	0,		/* OPT_size - not initialized here */
130 	64 * 1024,	/* TIDU_size. rts allows maximum size messages. */
131 	T_COTS,		/* SERV_type. rts supports connection oriented. */
132 	TS_UNBND,	/* CURRENT_state. This is set from rts_state. */
133 	(XPG4_1)	/* PROVIDER_flag */
134 };
135 
136 /* Named Dispatch Parameter Management Structure */
137 typedef struct rtsparam_s {
138 	uint_t	rts_param_min;
139 	uint_t	rts_param_max;
140 	uint_t	rts_param_value;
141 	char	*rts_param_name;
142 } rtsparam_t;
143 
144 /*
145  * Table of ND variables supported by rts. These are loaded into rts_g_nd
146  * in rts_open.
147  * All of these are alterable, within the min/max values given, at run time.
148  */
149 static rtsparam_t	lcl_param_arr[] = {
150 	/* min		max		value		name */
151 	{ 4096,		65536,		8192,		"rts_xmit_hiwat"},
152 	{ 0,		65536,		1024,		"rts_xmit_lowat"},
153 	{ 4096,		65536,		8192,		"rts_recv_hiwat"},
154 	{ 65536,	1024*1024*1024, 256*1024,	"rts_max_buf"},
155 };
156 #define	rtss_xmit_hiwat		rtss_params[0].rts_param_value
157 #define	rtss_xmit_lowat		rtss_params[1].rts_param_value
158 #define	rtss_recv_hiwat		rtss_params[2].rts_param_value
159 #define	rtss_max_buf			rtss_params[3].rts_param_value
160 
161 static int	rts_close(queue_t *q);
162 static void 	rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
163     int sys_error);
164 static mblk_t	*rts_ioctl_alloc(mblk_t *data, cred_t *cr);
165 static int	rts_open(queue_t *q, dev_t *devp, int flag, int sflag,
166     cred_t *credp);
167 int		rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
168     uchar_t *ptr);
169 int		rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name,
170     uchar_t *ptr);
171 int		rts_opt_set(queue_t *q, uint_t optset_context, int level,
172     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
173     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
174 static void	rts_param_cleanup(IDP *ndp);
175 static int	rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
176 static boolean_t rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt);
177 static int	rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
178     cred_t *cr);
179 static void	rts_rput(queue_t *q, mblk_t *mp);
180 static void	*rts_stack_init(netstackid_t stackid, netstack_t *ns);
181 static void	rts_stack_fini(netstackid_t stackid, void *arg);
182 static void	rts_wput(queue_t *q, mblk_t *mp);
183 static void	rts_wput_iocdata(queue_t *q, mblk_t *mp);
184 static void 	rts_wput_other(queue_t *q, mblk_t *mp);
185 static int	rts_wrw(queue_t *q, struiod_t *dp);
186 
187 static struct module_info info = {
188 	129, "rts", 1, INFPSZ, 512, 128
189 };
190 
191 static struct qinit rinit = {
192 	(pfi_t)rts_rput, NULL, rts_open, rts_close, NULL, &info
193 };
194 
195 static struct qinit winit = {
196 	(pfi_t)rts_wput, NULL, NULL, NULL, NULL, &info,
197 	NULL, (pfi_t)rts_wrw, NULL, STRUIOT_STANDARD
198 };
199 
200 struct streamtab rtsinfo = {
201 	&rinit, &winit
202 };
203 
204 /*
205  * This routine allocates the necessary
206  * message blocks for IOCTL wrapping the
207  * user data.
208  */
209 static mblk_t *
210 rts_ioctl_alloc(mblk_t *data, cred_t *cr)
211 {
212 	mblk_t	*mp = NULL;
213 	mblk_t	*mp1 = NULL;
214 	ipllc_t	*ipllc;
215 	struct iocblk	*ioc;
216 
217 	mp = allocb_cred(sizeof (ipllc_t), cr);
218 	if (mp == NULL)
219 		return (NULL);
220 	mp1 = allocb_cred(sizeof (struct iocblk), cr);
221 	if (mp1 == NULL) {
222 		freeb(mp);
223 		return (NULL);
224 	}
225 
226 	ipllc = (ipllc_t *)mp->b_rptr;
227 	ipllc->ipllc_cmd = IP_IOC_RTS_REQUEST;
228 	ipllc->ipllc_name_offset = 0;
229 	ipllc->ipllc_name_length = 0;
230 	mp->b_wptr += sizeof (ipllc_t);
231 	mp->b_cont = data;
232 
233 	ioc = (struct iocblk *)mp1->b_rptr;
234 	ioc->ioc_cmd = IP_IOCTL;
235 	ioc->ioc_error = 0;
236 	ioc->ioc_cr = NULL;
237 	ioc->ioc_count = msgdsize(mp);
238 	mp1->b_wptr += sizeof (struct iocblk);
239 	mp1->b_datap->db_type = M_IOCTL;
240 	mp1->b_cont = mp;
241 
242 	return (mp1);
243 }
244 
245 /*
246  * This routine closes rts stream, by disabling
247  * put/srv routines and freeing the this module
248  * internal datastructure.
249  */
250 static int
251 rts_close(queue_t *q)
252 {
253 	rts_t *rts = (rts_t *)q->q_ptr;
254 
255 	qprocsoff(q);
256 
257 	crfree(rts->rts_credp);
258 	netstack_rele(rts->rts_rtss->rtss_netstack);
259 
260 	mi_free(q->q_ptr);
261 	return (0);
262 }
263 
264 /*
265  * This is the open routine for routing socket. It allocates
266  * rts_t structure for the stream and sends an IOCTL to
267  * the down module to indicate that it is a routing socket
268  * stream.
269  */
270 /* ARGSUSED */
271 static int
272 rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
273 {
274 	mblk_t	*mp = NULL;
275 	rts_t	*rts;
276 	netstack_t *ns;
277 	rts_stack_t *rtss;
278 
279 	/* If the stream is already open, return immediately. */
280 	if (q->q_ptr != NULL)
281 		return (0);
282 
283 	/* If this is not a push of rts as a module, fail. */
284 	if (sflag != MODOPEN)
285 		return (EINVAL);
286 
287 	ns = netstack_find_by_cred(credp);
288 	ASSERT(ns != NULL);
289 	rtss = ns->netstack_rts;
290 	ASSERT(rtss != NULL);
291 
292 	q->q_ptr = mi_zalloc_sleep(sizeof (rts_t));
293 	WR(q)->q_ptr = q->q_ptr;
294 	rts = (rts_t *)q->q_ptr;
295 
296 	rts->rts_rtss = rtss;
297 
298 	rts->rts_credp = credp;
299 	crhold(credp);
300 	/*
301 	 * The receive hiwat is only looked at on the stream head queue.
302 	 * Store in q_hiwat in order to return on SO_RCVBUF getsockopts.
303 	 */
304 	q->q_hiwat = rtss->rtss_recv_hiwat;
305 	/*
306 	 * The transmit hiwat/lowat is only looked at on IP's queue.
307 	 * Store in q_hiwat/q_lowat in order to return on SO_SNDBUF/SO_SNDLOWAT
308 	 * getsockopts.
309 	 */
310 	WR(q)->q_hiwat = rtss->rtss_xmit_hiwat;
311 	WR(q)->q_lowat = rtss->rtss_xmit_lowat;
312 	qprocson(q);
313 	/*
314 	 * Indicate the down IP module that this is a routing socket
315 	 * client by sending an RTS IOCTL without any user data. Although
316 	 * this is just a notification message (without any real routing
317 	 * request), we pass in any credential for correctness sake.
318 	 */
319 	mp = rts_ioctl_alloc(NULL, credp);
320 	if (mp == NULL) {
321 		qprocsoff(q);
322 		ASSERT(q->q_ptr != NULL);
323 		netstack_rele(rtss->rtss_netstack);
324 		mi_free(q->q_ptr);
325 		crfree(credp);
326 		return (ENOMEM);
327 	}
328 	rts->rts_flag |= RTS_OPEN_PENDING;
329 	putnext(WR(q), mp);
330 	while (rts->rts_flag & RTS_OPEN_PENDING) {
331 		if (!qwait_sig(q)) {
332 			(void) rts_close(q);
333 			return (EINTR);
334 		}
335 	}
336 	if (rts->rts_error != 0) {
337 		(void) rts_close(q);
338 		return (ENOTSUP);
339 	}
340 	rts->rts_state = TS_UNBND;
341 	return (0);
342 }
343 
344 /*
345  * This routine creates a T_ERROR_ACK message and passes it upstream.
346  */
347 static void
348 rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
349 {
350 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
351 		qreply(q, mp);
352 }
353 
354 /*
355  * This routine creates a T_OK_ACK message and passes it upstream.
356  */
357 static void
358 rts_ok_ack(queue_t *q, mblk_t *mp)
359 {
360 	if ((mp = mi_tpi_ok_ack_alloc(mp)) != NULL)
361 		qreply(q, mp);
362 }
363 
364 /*
365  * This routine is called by rts_wput to handle T_UNBIND_REQ messages.
366  * After some error checking, the message is passed downstream to ip.
367  */
368 static void
369 rts_unbind(queue_t *q, mblk_t *mp)
370 {
371 	rts_t	*rts;
372 
373 	rts = (rts_t *)q->q_ptr;
374 	/* If a bind has not been done, we can't unbind. */
375 	if (rts->rts_state != TS_IDLE) {
376 		rts_err_ack(q, mp, TOUTSTATE, 0);
377 		return;
378 	}
379 	rts->rts_state = TS_UNBND;
380 	rts_ok_ack(q, mp);
381 }
382 
383 /*
384  * This routine is called to handle each
385  * O_T_BIND_REQ/T_BIND_REQ message passed to
386  * rts_wput. Note: This routine works with both
387  * O_T_BIND_REQ and T_BIND_REQ semantics.
388  */
389 static void
390 rts_bind(queue_t *q, mblk_t *mp)
391 {
392 	mblk_t	*mp1;
393 	struct T_bind_req *tbr;
394 	rts_t	*rts;
395 
396 	rts = (rts_t *)q->q_ptr;
397 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
398 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
399 		    "rts_bind: bad data, %d", rts->rts_state);
400 		rts_err_ack(q, mp, TBADADDR, 0);
401 		return;
402 	}
403 	if (rts->rts_state != TS_UNBND) {
404 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
405 		    "rts_bind: bad state, %d", rts->rts_state);
406 		rts_err_ack(q, mp, TOUTSTATE, 0);
407 		return;
408 	}
409 	/*
410 	 * Reallocate the message to make sure we have enough room for an
411 	 * address and the protocol type.
412 	 */
413 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin_t), 1);
414 	if (mp1 == NULL) {
415 		rts_err_ack(q, mp, TSYSERR, ENOMEM);
416 		return;
417 	}
418 	mp = mp1;
419 	tbr = (struct T_bind_req *)mp->b_rptr;
420 	if (tbr->ADDR_length != 0) {
421 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
422 		    "rts_bind: bad ADDR_length %d", tbr->ADDR_length);
423 		rts_err_ack(q, mp, TBADADDR, 0);
424 		return;
425 	}
426 	/* Generic request */
427 	tbr->ADDR_offset = (t_scalar_t)sizeof (struct T_bind_req);
428 	tbr->ADDR_length = 0;
429 	tbr->PRIM_type = T_BIND_ACK;
430 	rts->rts_state = TS_IDLE;
431 	qreply(q, mp);
432 }
433 
434 static void
435 rts_copy_info(struct T_info_ack *tap, rts_t *rts)
436 {
437 	*tap = rts_g_t_info_ack;
438 	tap->CURRENT_state = rts->rts_state;
439 	tap->OPT_size = rts_max_optsize;
440 }
441 
442 /*
443  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
444  * rts_wput.  Much of the T_CAPABILITY_ACK information is copied from
445  * rts_g_t_info_ack.  The current state of the stream is copied from
446  * rts_state.
447  */
448 static void
449 rts_capability_req(queue_t *q, mblk_t *mp)
450 {
451 	rts_t			*rts = (rts_t *)q->q_ptr;
452 	t_uscalar_t		cap_bits1;
453 	struct T_capability_ack	*tcap;
454 
455 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
456 
457 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
458 		mp->b_datap->db_type, T_CAPABILITY_ACK);
459 	if (mp == NULL)
460 		return;
461 
462 	tcap = (struct T_capability_ack *)mp->b_rptr;
463 	tcap->CAP_bits1 = 0;
464 
465 	if (cap_bits1 & TC1_INFO) {
466 		rts_copy_info(&tcap->INFO_ack, rts);
467 		tcap->CAP_bits1 |= TC1_INFO;
468 	}
469 
470 	qreply(q, mp);
471 }
472 
473 /*
474  * This routine responds to T_INFO_REQ messages.  It is called by rts_wput.
475  * Most of the T_INFO_ACK information is copied from rts_g_t_info_ack.
476  * The current state of the stream is copied from rts_state.
477  */
478 static void
479 rts_info_req(queue_t *q, mblk_t *mp)
480 {
481 	rts_t	*rts = (rts_t *)q->q_ptr;
482 
483 	mp = tpi_ack_alloc(mp, sizeof (rts_g_t_info_ack), M_PCPROTO,
484 	    T_INFO_ACK);
485 	if (mp == NULL)
486 		return;
487 	rts_copy_info((struct T_info_ack *)mp->b_rptr, rts);
488 	qreply(q, mp);
489 }
490 
491 /*
492  * This routine gets default values of certain options whose default
493  * values are maintained by protcol specific code
494  */
495 /* ARGSUSED */
496 int
497 rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
498 {
499 	/* no default value processed by protocol specific code currently */
500 	return (-1);
501 }
502 
503 /*
504  * This routine retrieves the current status of socket options.
505  * It returns the size of the option retrieved.
506  */
507 int
508 rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
509 {
510 	int	*i1 = (int *)ptr;
511 	rts_t	*rts = (rts_t *)q->q_ptr;
512 
513 	switch (level) {
514 	case SOL_SOCKET:
515 		switch (name) {
516 		case SO_DEBUG:
517 			*i1 = rts->rts_debug;
518 			break;
519 		case SO_REUSEADDR:
520 			*i1 = rts->rts_reuseaddr;
521 			break;
522 		case SO_TYPE:
523 			*i1 = SOCK_RAW;
524 			break;
525 
526 		/*
527 		 * The following three items are available here,
528 		 * but are only meaningful to IP.
529 		 */
530 		case SO_DONTROUTE:
531 			*i1 = rts->rts_dontroute;
532 			break;
533 		case SO_USELOOPBACK:
534 			*i1 = rts->rts_useloopback;
535 			break;
536 		case SO_BROADCAST:
537 			*i1 = rts->rts_broadcast;
538 			break;
539 		case SO_PROTOTYPE:
540 			*i1 = rts->rts_proto;
541 			break;
542 		/*
543 		 * The following two items can be manipulated,
544 		 * but changing them should do nothing.
545 		 */
546 		case SO_SNDBUF:
547 			ASSERT(q->q_hiwat <= INT_MAX);
548 			*i1 = (int)(q->q_hiwat);
549 			break;
550 		case SO_RCVBUF:
551 			ASSERT(q->q_hiwat <= INT_MAX);
552 			*i1 = (int)(RD(q)->q_hiwat);
553 			break;
554 		case SO_DOMAIN:
555 			*i1 = PF_ROUTE;
556 			break;
557 		default:
558 			return (-1);
559 		}
560 		break;
561 	default:
562 		return (-1);
563 	}
564 	return ((int)sizeof (int));
565 }
566 
567 
568 /*
569  * This routine sets socket options.
570  */
571 /*ARGSUSED*/
572 int
573 rts_opt_set(queue_t *q, uint_t optset_context, int level,
574     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
575     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
576 {
577 	int	*i1 = (int *)invalp;
578 	rts_t	*rts = (rts_t *)q->q_ptr;
579 	boolean_t checkonly;
580 	rts_stack_t	*rtss = rts->rts_rtss;
581 
582 	switch (optset_context) {
583 	case SETFN_OPTCOM_CHECKONLY:
584 		checkonly = B_TRUE;
585 		/*
586 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
587 		 * inlen != 0 implies value supplied and
588 		 * 	we have to "pretend" to set it.
589 		 * inlen == 0 implies that there is no
590 		 * 	value part in T_CHECK request and just validation
591 		 * done elsewhere should be enough, we just return here.
592 		 */
593 		if (inlen == 0) {
594 			*outlenp = 0;
595 			return (0);
596 		}
597 		break;
598 	case SETFN_OPTCOM_NEGOTIATE:
599 		checkonly = B_FALSE;
600 		break;
601 	case SETFN_UD_NEGOTIATE:
602 	case SETFN_CONN_NEGOTIATE:
603 		checkonly = B_FALSE;
604 		/*
605 		 * Negotiating local and "association-related" options
606 		 * through T_UNITDATA_REQ or T_CONN_{REQ,CON}
607 		 * Not allowed in this module.
608 		 */
609 		return (EINVAL);
610 	default:
611 		/*
612 		 * We should never get here
613 		 */
614 		*outlenp = 0;
615 		return (EINVAL);
616 	}
617 
618 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
619 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
620 
621 	/*
622 	 * For rts, we should have no ancillary data sent down
623 	 * (rts_wput doesn't handle options).
624 	 */
625 	ASSERT(thisdg_attrs == NULL);
626 
627 	/*
628 	 * For fixed length options, no sanity check
629 	 * of passed in length is done. It is assumed *_optcom_req()
630 	 * routines do the right thing.
631 	 */
632 
633 	switch (level) {
634 	case SOL_SOCKET:
635 		switch (name) {
636 		case SO_REUSEADDR:
637 			if (!checkonly)
638 				rts->rts_reuseaddr = *i1;
639 			break;	/* goto sizeof (int) option return */
640 		case SO_DEBUG:
641 			if (!checkonly)
642 				rts->rts_debug = *i1;
643 			break;	/* goto sizeof (int) option return */
644 		/*
645 		 * The following three items are available here,
646 		 * but are only meaningful to IP.
647 		 */
648 		case SO_DONTROUTE:
649 			if (!checkonly)
650 				rts->rts_dontroute = *i1;
651 			break;	/* goto sizeof (int) option return */
652 		case SO_USELOOPBACK:
653 			if (!checkonly)
654 				rts->rts_useloopback = *i1;
655 			break;	/* goto sizeof (int) option return */
656 		case SO_BROADCAST:
657 			if (!checkonly)
658 				rts->rts_broadcast = *i1;
659 			break;	/* goto sizeof (int) option return */
660 		case SO_PROTOTYPE:
661 			/*
662 			 * Routing socket applications that call socket() with
663 			 * a third argument can filter which messages will be
664 			 * sent upstream thanks to sockfs.  so_socket() sends
665 			 * down the SO_PROTOTYPE and rts_queue_input()
666 			 * implements the filtering.
667 			 */
668 			if (*i1 != AF_INET && *i1 != AF_INET6)
669 				return (EPROTONOSUPPORT);
670 			if (!checkonly)
671 				rts->rts_proto = *i1;
672 			break;	/* goto sizeof (int) option return */
673 		/*
674 		 * The following two items can be manipulated,
675 		 * but changing them should do nothing.
676 		 */
677 		case SO_SNDBUF:
678 			if (*i1 > rtss->rtss_max_buf) {
679 				*outlenp = 0;
680 				return (ENOBUFS);
681 			}
682 			if (!checkonly) {
683 				q->q_hiwat = *i1;
684 				q->q_next->q_hiwat = *i1;
685 			}
686 			break;	/* goto sizeof (int) option return */
687 		case SO_RCVBUF:
688 			if (*i1 > rtss->rtss_max_buf) {
689 				*outlenp = 0;
690 				return (ENOBUFS);
691 			}
692 			if (!checkonly) {
693 				RD(q)->q_hiwat = *i1;
694 				(void) mi_set_sth_hiwat(RD(q), *i1);
695 			}
696 			break;	/* goto sizeof (int) option return */
697 		default:
698 			*outlenp = 0;
699 			return (EINVAL);
700 		}
701 		break;
702 	default:
703 		*outlenp = 0;
704 		return (EINVAL);
705 	}
706 	/*
707 	 * Common case of return from an option that is sizeof (int)
708 	 */
709 	*(int *)outvalp = *i1;
710 	*outlenp = (t_uscalar_t)sizeof (int);
711 	return (0);
712 }
713 
714 /*
715  * This routine frees the ND table if all streams have been closed.
716  * It is called by rts_close and rts_open.
717  */
718 static void
719 rts_param_cleanup(IDP *ndp)
720 {
721 	nd_free(ndp);
722 }
723 
724 /*
725  * This routine retrieves the value of an ND variable in a rtsparam_t
726  * structure. It is called through nd_getset when a user reads the
727  * variable.
728  */
729 /* ARGSUSED */
730 static int
731 rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
732 {
733 	rtsparam_t	*rtspa = (rtsparam_t *)cp;
734 
735 	(void) mi_mpprintf(mp, "%u", rtspa->rts_param_value);
736 	return (0);
737 }
738 
739 /*
740  * Walk through the param array specified registering each element with the
741  * named dispatch (ND) handler.
742  */
743 static boolean_t
744 rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt)
745 {
746 	for (; cnt-- > 0; rtspa++) {
747 		if (rtspa->rts_param_name != NULL && rtspa->rts_param_name[0]) {
748 			if (!nd_load(ndp, rtspa->rts_param_name,
749 			    rts_param_get, rts_param_set, (caddr_t)rtspa)) {
750 				nd_free(ndp);
751 				return (B_FALSE);
752 			}
753 		}
754 	}
755 	return (B_TRUE);
756 }
757 
758 /* This routine sets an ND variable in a rtsparam_t structure. */
759 /* ARGSUSED */
760 static int
761 rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
762 {
763 	ulong_t	new_value;
764 	rtsparam_t	*rtspa = (rtsparam_t *)cp;
765 
766 	/*
767 	 * Fail the request if the new value does not lie within the
768 	 * required bounds.
769 	 */
770 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
771 	    new_value < rtspa->rts_param_min ||
772 	    new_value > rtspa->rts_param_max) {
773 		return (EINVAL);
774 	}
775 
776 	/* Set the new value */
777 	rtspa->rts_param_value = new_value;
778 	return (0);
779 }
780 
781 /*
782  * This routine handles synchronous messages passed downstream. It either
783  * consumes the message or passes it downstream; it never queues a
784  * a message. The data messages that go down are wrapped in an IOCTL
785  * message.
786  *
787  * Since it is synchronous, it waits for the M_IOCACK/M_IOCNAK so that
788  * it can return an immediate error (such as ENETUNREACH when adding a route).
789  * It uses the RTS_WRW_PENDING to ensure that each rts instance has only
790  * one M_IOCTL outstanding at any given time.
791  */
792 static int
793 rts_wrw(queue_t *q, struiod_t *dp)
794 {
795 	mblk_t	*mp = dp->d_mp;
796 	mblk_t	*mp1;
797 	int	error;
798 	rt_msghdr_t	*rtm;
799 	rts_t	*rts;
800 
801 	rts = (rts_t *)q->q_ptr;
802 	while (rts->rts_flag & RTS_WRW_PENDING) {
803 		if (qwait_rw(q)) {
804 			rts->rts_error = EINTR;
805 			goto err_ret;
806 		}
807 		}
808 	rts->rts_flag |= RTS_WRW_PENDING;
809 
810 	if (isuioq(q) && (error = struioget(q, mp, dp, 0))) {
811 		/*
812 		 * Uio error of some sort, so just return the error.
813 		 */
814 		rts->rts_error = error;
815 		goto err_ret;
816 	}
817 	/*
818 	 * Pass the mblk (chain) onto wput().
819 	 */
820 	dp->d_mp = 0;
821 
822 	switch (mp->b_datap->db_type) {
823 	case M_PROTO:
824 	case M_PCPROTO:
825 		/* Expedite other than T_DATA_REQ to below the switch */
826 		if (((mp->b_wptr - mp->b_rptr) !=
827 		    sizeof (struct T_data_req)) ||
828 		    (((union T_primitives *)mp->b_rptr)->type != T_DATA_REQ))
829 			break;
830 		if ((mp1 = mp->b_cont) == NULL) {
831 			rts->rts_error = EINVAL;
832 			goto err_ret;
833 		}
834 		freeb(mp);
835 		mp = mp1;
836 		/* FALLTHRU */
837 	case M_DATA:
838 		/*
839 		 * The semantics of the routing socket is such that the rtm_pid
840 		 * field is automatically filled in during requests with the
841 		 * current process' pid.  We do this here (where we still have
842 		 * user context) after checking we have at least a message the
843 		 * size of a routing message header.
844 		 */
845 		if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
846 			if (!pullupmsg(mp, sizeof (rt_msghdr_t))) {
847 				rts->rts_error = EINVAL;
848 				goto err_ret;
849 			}
850 		}
851 		rtm = (rt_msghdr_t *)mp->b_rptr;
852 		rtm->rtm_pid = curproc->p_pid;
853 		break;
854 	default:
855 		break;
856 	}
857 	rts->rts_flag |= RTS_WPUT_PENDING;
858 	rts_wput(q, mp);
859 	while (rts->rts_flag & RTS_WPUT_PENDING)
860 		if (qwait_rw(q)) {
861 			/* RTS_WPUT_PENDING will be cleared below */
862 			rts->rts_error = EINTR;
863 			break;
864 		}
865 err_ret:
866 	rts->rts_flag &= ~(RTS_WPUT_PENDING | RTS_WRW_PENDING);
867 	return (rts->rts_error);
868 }
869 
870 /*
871  * This routine handles all messages passed downstream. It either
872  * consumes the message or passes it downstream; it never queues a
873  * a message. The data messages that go down are wrapped in an IOCTL
874  * message.
875  */
876 static void
877 rts_wput(queue_t *q, mblk_t *mp)
878 {
879 	uchar_t	*rptr = mp->b_rptr;
880 	mblk_t	*mp1;
881 
882 	switch (mp->b_datap->db_type) {
883 	case M_DATA:
884 		break;
885 	case M_PROTO:
886 	case M_PCPROTO:
887 		if ((mp->b_wptr - rptr) == sizeof (struct T_data_req)) {
888 			/* Expedite valid T_DATA_REQ to below the switch */
889 			if (((union T_primitives *)rptr)->type == T_DATA_REQ) {
890 				mp1 = mp->b_cont;
891 				freeb(mp);
892 				if (mp1 == NULL)
893 					return;
894 				mp = mp1;
895 				break;
896 			}
897 		}
898 		/* FALLTHRU */
899 	default:
900 		rts_wput_other(q, mp);
901 		return;
902 	}
903 
904 
905 	mp1 = rts_ioctl_alloc(mp, DB_CRED(mp));
906 	if (mp1 == NULL) {
907 		rts_t	*rts = (rts_t *)q->q_ptr;
908 
909 		ASSERT(rts != NULL);
910 		freemsg(mp);
911 		if (rts->rts_flag & RTS_WPUT_PENDING) {
912 			rts->rts_error = ENOMEM;
913 			rts->rts_flag &= ~RTS_WPUT_PENDING;
914 		}
915 		return;
916 	}
917 	putnext(q, mp1);
918 }
919 
920 
921 /*
922  * Handles all the control message, if it
923  * can not understand it, it will
924  * pass down stream.
925  */
926 static void
927 rts_wput_other(queue_t *q, mblk_t *mp)
928 {
929 	uchar_t	*rptr = mp->b_rptr;
930 	rts_t	*rts;
931 	struct iocblk	*iocp;
932 	cred_t	*cr;
933 	rts_stack_t	*rtss;
934 
935 	rts = (rts_t *)q->q_ptr;
936 	rtss = rts->rts_rtss;
937 
938 	cr = DB_CREDDEF(mp, rts->rts_credp);
939 
940 	switch (mp->b_datap->db_type) {
941 	case M_PROTO:
942 	case M_PCPROTO:
943 		if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) {
944 			/*
945 			 * If the message does not contain a PRIM_type,
946 			 * throw it away.
947 			 */
948 			freemsg(mp);
949 			return;
950 		}
951 		switch (((union T_primitives *)rptr)->type) {
952 		case T_BIND_REQ:
953 		case O_T_BIND_REQ:
954 			rts_bind(q, mp);
955 			return;
956 		case T_UNBIND_REQ:
957 			rts_unbind(q, mp);
958 			return;
959 		case T_CAPABILITY_REQ:
960 			rts_capability_req(q, mp);
961 			return;
962 		case T_INFO_REQ:
963 			rts_info_req(q, mp);
964 			return;
965 		case T_SVR4_OPTMGMT_REQ:
966 			(void) svr4_optcom_req(q, mp, cr, &rts_opt_obj);
967 			return;
968 		case T_OPTMGMT_REQ:
969 			(void) tpi_optcom_req(q, mp, cr, &rts_opt_obj);
970 			return;
971 		case O_T_CONN_RES:
972 		case T_CONN_RES:
973 		case T_DISCON_REQ:
974 			/* Not supported by rts. */
975 			rts_err_ack(q, mp, TNOTSUPPORT, 0);
976 			return;
977 		case T_DATA_REQ:
978 		case T_EXDATA_REQ:
979 		case T_ORDREL_REQ:
980 			/* Illegal for rts. */
981 			freemsg(mp);
982 			(void) putnextctl1(RD(q), M_ERROR, EPROTO);
983 			return;
984 		default:
985 			break;
986 		}
987 		break;
988 	case M_IOCTL:
989 		iocp = (struct iocblk *)mp->b_rptr;
990 		switch (iocp->ioc_cmd) {
991 		case ND_SET:
992 		case ND_GET:
993 			if (nd_getset(q, rtss->rtss_g_nd, mp)) {
994 				qreply(q, mp);
995 				return;
996 			}
997 			break;
998 		case TI_GETPEERNAME:
999 			mi_copyin(q, mp, NULL,
1000 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
1001 			return;
1002 		default:
1003 			break;
1004 		}
1005 	case M_IOCDATA:
1006 		rts_wput_iocdata(q, mp);
1007 		return;
1008 	default:
1009 		break;
1010 	}
1011 	putnext(q, mp);
1012 }
1013 
1014 /*
1015  * Called by rts_wput_other to handle all M_IOCDATA messages.
1016  */
1017 static void
1018 rts_wput_iocdata(queue_t *q, mblk_t *mp)
1019 {
1020 	struct sockaddr	*rtsaddr;
1021 	mblk_t	*mp1;
1022 	STRUCT_HANDLE(strbuf, sb);
1023 	struct iocblk	*iocp	= (struct iocblk *)mp->b_rptr;
1024 
1025 	/* Make sure it is one of ours. */
1026 	switch (iocp->ioc_cmd) {
1027 	case TI_GETPEERNAME:
1028 		break;
1029 	default:
1030 		putnext(q, mp);
1031 		return;
1032 	}
1033 	switch (mi_copy_state(q, mp, &mp1)) {
1034 	case -1:
1035 		return;
1036 	case MI_COPY_CASE(MI_COPY_IN, 1):
1037 		break;
1038 	case MI_COPY_CASE(MI_COPY_OUT, 1):
1039 		/* Copy out the strbuf. */
1040 		mi_copyout(q, mp);
1041 		return;
1042 	case MI_COPY_CASE(MI_COPY_OUT, 2):
1043 		/* All done. */
1044 		mi_copy_done(q, mp, 0);
1045 		return;
1046 	default:
1047 		mi_copy_done(q, mp, EPROTO);
1048 		return;
1049 	}
1050 	STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
1051 	if (STRUCT_FGET(sb, maxlen) < (int)sizeof (sin_t)) {
1052 		mi_copy_done(q, mp, EINVAL);
1053 		return;
1054 	}
1055 	switch (iocp->ioc_cmd) {
1056 	case TI_GETPEERNAME:
1057 		break;
1058 	default:
1059 		mi_copy_done(q, mp, EPROTO);
1060 		return;
1061 	}
1062 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), sizeof (sin_t),
1063 	    B_TRUE);
1064 	if (mp1 == NULL)
1065 		return;
1066 	STRUCT_FSET(sb, len, (int)sizeof (sin_t));
1067 	rtsaddr = (struct sockaddr *)mp1->b_rptr;
1068 	mp1->b_wptr = (uchar_t *)&rtsaddr[1];
1069 	bzero(rtsaddr, sizeof (struct sockaddr));
1070 	rtsaddr->sa_family = AF_ROUTE;
1071 	/* Copy out the address */
1072 	mi_copyout(q, mp);
1073 }
1074 
1075 static void
1076 rts_rput(queue_t *q, mblk_t *mp)
1077 {
1078 	rts_t	*rts;
1079 	struct iocblk	*iocp;
1080 	mblk_t *mp1;
1081 	struct T_data_ind *tdi;
1082 
1083 	rts = (rts_t *)q->q_ptr;
1084 	switch (mp->b_datap->db_type) {
1085 	case M_IOCACK:
1086 	case M_IOCNAK:
1087 		iocp = (struct iocblk *)mp->b_rptr;
1088 		if (rts->rts_flag & (RTS_WPUT_PENDING|RTS_OPEN_PENDING)) {
1089 			if (rts->rts_flag & RTS_WPUT_PENDING)
1090 				rts->rts_flag &= ~RTS_WPUT_PENDING;
1091 			else
1092 				rts->rts_flag &= ~RTS_OPEN_PENDING;
1093 			rts->rts_error = iocp->ioc_error;
1094 			freemsg(mp);
1095 			return;
1096 		}
1097 		break;
1098 	case M_DATA:
1099 		/*
1100 		 * Prepend T_DATA_IND to prevent the stream head from
1101 		 * consolidating multiple messages together.
1102 		 * If the allocation fails just send up the M_DATA.
1103 		 */
1104 		mp1 = allocb(sizeof (*tdi), BPRI_MED);
1105 		if (mp1 != NULL) {
1106 			mp1->b_cont = mp;
1107 			mp = mp1;
1108 
1109 			mp->b_datap->db_type = M_PROTO;
1110 			mp->b_wptr += sizeof (*tdi);
1111 			tdi = (struct T_data_ind *)mp->b_rptr;
1112 			tdi->PRIM_type = T_DATA_IND;
1113 			tdi->MORE_flag = 0;
1114 		}
1115 		break;
1116 	default:
1117 		break;
1118 	}
1119 	putnext(q, mp);
1120 }
1121 
1122 
1123 void
1124 rts_ddi_init(void)
1125 {
1126 	rts_max_optsize = optcom_max_optsize(rts_opt_obj.odb_opt_des_arr,
1127 	    rts_opt_obj.odb_opt_arr_cnt);
1128 
1129 	/*
1130 	 * We want to be informed each time a stack is created or
1131 	 * destroyed in the kernel, so we can maintain the
1132 	 * set of rts_stack_t's.
1133 	 */
1134 	netstack_register(NS_RTS, rts_stack_init, NULL, rts_stack_fini);
1135 }
1136 
1137 void
1138 rts_ddi_destroy(void)
1139 {
1140 	netstack_unregister(NS_RTS);
1141 }
1142 
1143 /*
1144  * Initialize the RTS stack instance.
1145  */
1146 /* ARGSUSED */
1147 static void *
1148 rts_stack_init(netstackid_t stackid, netstack_t *ns)
1149 {
1150 	rts_stack_t	*rtss;
1151 	rtsparam_t	*pa;
1152 
1153 	rtss = (rts_stack_t *)kmem_zalloc(sizeof (*rtss), KM_SLEEP);
1154 	rtss->rtss_netstack = ns;
1155 
1156 	pa = (rtsparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
1157 	rtss->rtss_params = pa;
1158 	bcopy(lcl_param_arr, rtss->rtss_params, sizeof (lcl_param_arr));
1159 
1160 	(void) rts_param_register(&rtss->rtss_g_nd,
1161 	    rtss->rtss_params, A_CNT(lcl_param_arr));
1162 	return (rtss);
1163 }
1164 
1165 /*
1166  * Free the RTS stack instance.
1167  */
1168 /* ARGSUSED */
1169 static void
1170 rts_stack_fini(netstackid_t stackid, void *arg)
1171 {
1172 	rts_stack_t *rtss = (rts_stack_t *)arg;
1173 
1174 	rts_param_cleanup(&rtss->rtss_g_nd);
1175 	kmem_free(rtss->rtss_params, sizeof (lcl_param_arr));
1176 	rtss->rtss_params = NULL;
1177 	kmem_free(rtss, sizeof (*rtss));
1178 }
1179