xref: /titanic_44/usr/src/uts/common/inet/ip/rts.c (revision b250187ecb9698546885f906fc8321a2a399f0e6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/strsubr.h>
32 #include <sys/stropts.h>
33 #include <sys/strsun.h>
34 #include <sys/strlog.h>
35 #define	_SUN_TPI_VERSION 2
36 #include <sys/tihdr.h>
37 #include <sys/timod.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/cmn_err.h>
41 #include <sys/proc.h>
42 #include <sys/suntpi.h>
43 #include <sys/policy.h>
44 
45 #include <sys/socket.h>
46 #include <netinet/in.h>
47 
48 #include <inet/common.h>
49 #include <netinet/ip6.h>
50 #include <inet/ip.h>
51 #include <inet/mi.h>
52 #include <inet/nd.h>
53 #include <inet/optcom.h>
54 #include <netinet/ip_mroute.h>
55 #include <sys/isa_defs.h>
56 #include <net/route.h>
57 
/*
 * This is a transport provider for routing sockets.  Downstream messages are
 * wrapped with an IP_IOCTL header, and ip_wput_ioctl calls the appropriate
 * entry in the ip_ioctl_ftbl callout table to pass the routing socket data
 * into IP.  Upstream messages are generated for listeners of the routing
 * socket as well as the message sender (unless they have turned off their
 * end using SO_USELOOPBACK or shutdown(3n)).  Upstream messages may also be
 * generated asynchronously when:
 *
 *	Interfaces are brought up or down.
 *	Addresses are assigned to interfaces.
 *	ICMP redirects are processed and an IRE_HOST_REDIRECT is installed.
 *	No route is found while sending a packet.
 *	TCP requests IP to remove an IRE_CACHE of a troubled destination.
 *
 * Since all we do is reformat the messages between routing socket and
 * ioctl forms, no synchronization is necessary in this module; all
 * the dirty work is done down in ip.
 */
77 
78 /*
79  * Object to represent database of options to search passed to
80  * {sock,tpi}optcom_req() interface routine to take care of option
81  * management and associated methods.
82  * XXX. These and other externs should really move to a rts header.
83  */
84 extern optdb_obj_t	rts_opt_obj;
85 extern uint_t		rts_max_optsize;
86 
87 /* Internal routing socket stream control structure, one per open stream */
88 typedef	struct rts_s {
89 	cred_t	*rts_credp;		/* Opener's credentials */
90 	uint_t	rts_state;		/* Provider interface state */
91 	uint_t	rts_error;		/* Routing socket error code */
92 	uint_t	rts_flag;		/* Pending I/O state */
93 	uint_t	rts_proto;		/* SO_PROTOTYPE "socket" option. */
94 	uint_t	rts_debug : 1,		/* SO_DEBUG "socket" option. */
95 		rts_dontroute : 1,	/* SO_DONTROUTE "socket" option. */
96 		rts_broadcast : 1,	/* SO_BROADCAST "socket" option. */
97 		rts_reuseaddr : 1,	/* SO_REUSEADDR "socket" option. */
98 		rts_useloopback : 1,	/* SO_USELOOPBACK "socket" option. */
99 		rts_multicast_loop : 1,	/* IP_MULTICAST_LOOP option */
100 		rts_hdrincl : 1,	/* IP_HDRINCL option + RAW and IGMP */
101 
102 		: 0;
103 } rts_t;
104 
/* Bits kept in rts_flag. */
#define	RTS_WPUT_PENDING	0x1	/* rts_wrw awaits the ioctl reply */
#define	RTS_WRW_PENDING		0x2	/* A synchronous write owns the stream */
#define	RTS_OPEN_PENDING	0x4	/* rts_open awaits the ioctl reply */
108 
109 /* Default structure copied into T_INFO_ACK messages */
110 static struct T_info_ack rts_g_t_info_ack = {
111 	T_INFO_ACK,
112 	T_INFINITE,	/* TSDU_size. Maximum size messages. */
113 	T_INVALID,	/* ETSDU_size. No expedited data. */
114 	T_INVALID,	/* CDATA_size. No connect data. */
115 	T_INVALID,	/* DDATA_size. No disconnect data. */
116 	0,		/* ADDR_size. */
117 	0,		/* OPT_size - not initialized here */
118 	64 * 1024,	/* TIDU_size. rts allows maximum size messages. */
119 	T_COTS,		/* SERV_type. rts supports connection oriented. */
120 	TS_UNBND,	/* CURRENT_state. This is set from rts_state. */
121 	(XPG4_1)	/* PROVIDER_flag */
122 };
123 
124 /* Named Dispatch Parameter Management Structure */
125 typedef struct rtspparam_s {
126 	uint_t	rts_param_min;
127 	uint_t	rts_param_max;
128 	uint_t	rts_param_value;
129 	char	*rts_param_name;
130 } rtsparam_t;
131 
132 /*
133  * Table of ND variables supported by rts. These are loaded into rts_g_nd
134  * in rts_open.
135  * All of these are alterable, within the min/max values given, at run time.
136  */
137 static rtsparam_t	rts_param_arr[] = {
138 	/* min		max		value		name */
139 	{ 4096,		65536,		8192,		"rts_xmit_hiwat"},
140 	{ 0,		65536,		1024,		"rts_xmit_lowat"},
141 	{ 4096,		65536,		8192,		"rts_recv_hiwat"},
142 	{ 65536,	1024*1024*1024, 256*1024,	"rts_max_buf"},
143 };
144 #define	rts_xmit_hiwat			rts_param_arr[0].rts_param_value
145 #define	rts_xmit_lowat			rts_param_arr[1].rts_param_value
146 #define	rts_recv_hiwat			rts_param_arr[2].rts_param_value
147 #define	rts_max_buf			rts_param_arr[3].rts_param_value
148 
149 static int	rts_close(queue_t *q);
150 static void 	rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
151     int sys_error);
152 static mblk_t	*rts_ioctl_alloc(mblk_t *data, cred_t *cr);
153 static int	rts_open(queue_t *q, dev_t *devp, int flag, int sflag,
154     cred_t *credp);
155 int		rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
156     uchar_t *ptr);
157 int		rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name,
158     uchar_t *ptr);
159 int		rts_opt_set(queue_t *q, uint_t optset_context, int level,
160     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
161     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
162 static void	rts_param_cleanup(void);
163 static int	rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
164 static boolean_t rts_param_register(rtsparam_t *rtspa, int cnt);
165 static int	rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
166     cred_t *cr);
167 static void	rts_rput(queue_t *q, mblk_t *mp);
168 static void	rts_wput(queue_t *q, mblk_t *mp);
169 static void	rts_wput_iocdata(queue_t *q, mblk_t *mp);
170 static void 	rts_wput_other(queue_t *q, mblk_t *mp);
171 static int	rts_wrw(queue_t *q, struiod_t *dp);
172 
173 static struct module_info info = {
174 	129, "rts", 1, INFPSZ, 512, 128
175 };
176 
177 static struct qinit rinit = {
178 	(pfi_t)rts_rput, NULL, rts_open, rts_close, NULL, &info
179 };
180 
181 static struct qinit winit = {
182 	(pfi_t)rts_wput, NULL, NULL, NULL, NULL, &info,
183 	NULL, (pfi_t)rts_wrw, NULL, STRUIOT_STANDARD
184 };
185 
186 struct streamtab rtsinfo = {
187 	&rinit, &winit
188 };
189 
190 static IDP	rts_g_nd;	/* Points to table of RTS ND variables. */
191 uint_t		rts_open_streams = 0;
192 
193 /*
194  * This routine allocates the necessary
195  * message blocks for IOCTL wrapping the
196  * user data.
197  */
198 static mblk_t *
199 rts_ioctl_alloc(mblk_t *data, cred_t *cr)
200 {
201 	mblk_t	*mp = NULL;
202 	mblk_t	*mp1 = NULL;
203 	ipllc_t	*ipllc;
204 	struct iocblk	*ioc;
205 
206 	mp = allocb_cred(sizeof (ipllc_t), cr);
207 	if (mp == NULL)
208 		return (NULL);
209 	mp1 = allocb_cred(sizeof (struct iocblk), cr);
210 	if (mp1 == NULL) {
211 		freeb(mp);
212 		return (NULL);
213 	}
214 
215 	ipllc = (ipllc_t *)mp->b_rptr;
216 	ipllc->ipllc_cmd = IP_IOC_RTS_REQUEST;
217 	ipllc->ipllc_name_offset = 0;
218 	ipllc->ipllc_name_length = 0;
219 	mp->b_wptr += sizeof (ipllc_t);
220 	mp->b_cont = data;
221 
222 	ioc = (struct iocblk *)mp1->b_rptr;
223 	ioc->ioc_cmd = IP_IOCTL;
224 	ioc->ioc_error = 0;
225 	ioc->ioc_cr = NULL;
226 	ioc->ioc_count = msgdsize(mp);
227 	mp1->b_wptr += sizeof (struct iocblk);
228 	mp1->b_datap->db_type = M_IOCTL;
229 	mp1->b_cont = mp;
230 
231 	return (mp1);
232 }
233 
234 /*
235  * This routine closes rts stream, by disabling
236  * put/srv routines and freeing the this module
237  * internal datastructure.
238  */
239 static int
240 rts_close(queue_t *q)
241 {
242 	qprocsoff(q);
243 
244 	crfree(((rts_t *)q->q_ptr)->rts_credp);
245 
246 	mi_free(q->q_ptr);
247 	rts_open_streams--;
248 	/*
249 	 * Free the ND table if this was
250 	 * the last stream close
251 	 */
252 	rts_param_cleanup();
253 	return (0);
254 }
255 
256 /*
257  * This is the open routine for routing socket. It allocates
258  * rts_t structure for the stream and sends an IOCTL to
259  * the down module to indicate that it is a routing socket
260  * stream.
261  */
262 /* ARGSUSED */
263 static int
264 rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
265 {
266 	mblk_t	*mp = NULL;
267 	rts_t	*rts;
268 
269 	/* If the stream is already open, return immediately. */
270 	if (q->q_ptr != NULL)
271 		return (0);
272 
273 	/* If this is not a push of rts as a module, fail. */
274 	if (sflag != MODOPEN)
275 		return (EINVAL);
276 
277 	/* If this is the first open of rts, create the ND table. */
278 	if (rts_g_nd == NULL) {
279 		if (!rts_param_register(rts_param_arr, A_CNT(rts_param_arr)))
280 			return (ENOMEM);
281 	}
282 	q->q_ptr = mi_zalloc_sleep(sizeof (rts_t));
283 	WR(q)->q_ptr = q->q_ptr;
284 	rts = (rts_t *)q->q_ptr;
285 
286 	rts->rts_credp = credp;
287 	crhold(credp);
288 	/*
289 	 * The receive hiwat is only looked at on the stream head queue.
290 	 * Store in q_hiwat in order to return on SO_RCVBUF getsockopts.
291 	 */
292 	q->q_hiwat = rts_recv_hiwat;
293 	/*
294 	 * The transmit hiwat/lowat is only looked at on IP's queue.
295 	 * Store in q_hiwat/q_lowat in order to return on SO_SNDBUF/SO_SNDLOWAT
296 	 * getsockopts.
297 	 */
298 	WR(q)->q_hiwat = rts_xmit_hiwat;
299 	WR(q)->q_lowat = rts_xmit_lowat;
300 	qprocson(q);
301 	/*
302 	 * Indicate the down IP module that this is a routing socket
303 	 * client by sending an RTS IOCTL without any user data. Although
304 	 * this is just a notification message (without any real routing
305 	 * request), we pass in any credential for correctness sake.
306 	 */
307 	mp = rts_ioctl_alloc(NULL, credp);
308 	if (mp == NULL) {
309 		rts_param_cleanup();
310 		qprocsoff(q);
311 		ASSERT(q->q_ptr != NULL);
312 		mi_free(q->q_ptr);
313 		crfree(credp);
314 		return (ENOMEM);
315 	}
316 	rts_open_streams++;
317 	rts->rts_flag |= RTS_OPEN_PENDING;
318 	putnext(WR(q), mp);
319 	while (rts->rts_flag & RTS_OPEN_PENDING) {
320 		if (!qwait_sig(q)) {
321 			(void) rts_close(q);
322 			return (EINTR);
323 		}
324 	}
325 	if (rts->rts_error != 0) {
326 		(void) rts_close(q);
327 		return (ENOTSUP);
328 	}
329 	rts->rts_state = TS_UNBND;
330 	return (0);
331 }
332 
333 /*
334  * This routine creates a T_ERROR_ACK message and passes it upstream.
335  */
336 static void
337 rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
338 {
339 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
340 		qreply(q, mp);
341 }
342 
343 /*
344  * This routine creates a T_OK_ACK message and passes it upstream.
345  */
346 static void
347 rts_ok_ack(queue_t *q, mblk_t *mp)
348 {
349 	if ((mp = mi_tpi_ok_ack_alloc(mp)) != NULL)
350 		qreply(q, mp);
351 }
352 
353 /*
354  * This routine is called by rts_wput to handle T_UNBIND_REQ messages.
355  * After some error checking, the message is passed downstream to ip.
356  */
357 static void
358 rts_unbind(queue_t *q, mblk_t *mp)
359 {
360 	rts_t	*rts;
361 
362 	rts = (rts_t *)q->q_ptr;
363 	/* If a bind has not been done, we can't unbind. */
364 	if (rts->rts_state != TS_IDLE) {
365 		rts_err_ack(q, mp, TOUTSTATE, 0);
366 		return;
367 	}
368 	rts->rts_state = TS_UNBND;
369 	rts_ok_ack(q, mp);
370 }
371 
372 /*
373  * This routine is called to handle each
374  * O_T_BIND_REQ/T_BIND_REQ message passed to
375  * rts_wput. Note: This routine works with both
376  * O_T_BIND_REQ and T_BIND_REQ semantics.
377  */
378 static void
379 rts_bind(queue_t *q, mblk_t *mp)
380 {
381 	mblk_t	*mp1;
382 	struct T_bind_req *tbr;
383 	rts_t	*rts;
384 
385 	rts = (rts_t *)q->q_ptr;
386 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
387 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
388 		    "rts_bind: bad data, %d", rts->rts_state);
389 		rts_err_ack(q, mp, TBADADDR, 0);
390 		return;
391 	}
392 	if (rts->rts_state != TS_UNBND) {
393 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
394 		    "rts_bind: bad state, %d", rts->rts_state);
395 		rts_err_ack(q, mp, TOUTSTATE, 0);
396 		return;
397 	}
398 	/*
399 	 * Reallocate the message to make sure we have enough room for an
400 	 * address and the protocol type.
401 	 */
402 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin_t), 1);
403 	if (mp1 == NULL) {
404 		rts_err_ack(q, mp, TSYSERR, ENOMEM);
405 		return;
406 	}
407 	mp = mp1;
408 	tbr = (struct T_bind_req *)mp->b_rptr;
409 	if (tbr->ADDR_length != 0) {
410 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
411 		    "rts_bind: bad ADDR_length %d", tbr->ADDR_length);
412 		rts_err_ack(q, mp, TBADADDR, 0);
413 		return;
414 	}
415 	/* Generic request */
416 	tbr->ADDR_offset = (t_scalar_t)sizeof (struct T_bind_req);
417 	tbr->ADDR_length = 0;
418 	tbr->PRIM_type = T_BIND_ACK;
419 	rts->rts_state = TS_IDLE;
420 	qreply(q, mp);
421 }
422 
423 static void
424 rts_copy_info(struct T_info_ack *tap, rts_t *rts)
425 {
426 	*tap = rts_g_t_info_ack;
427 	tap->CURRENT_state = rts->rts_state;
428 	tap->OPT_size = rts_max_optsize;
429 }
430 
431 /*
432  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
433  * rts_wput.  Much of the T_CAPABILITY_ACK information is copied from
434  * rts_g_t_info_ack.  The current state of the stream is copied from
435  * rts_state.
436  */
437 static void
438 rts_capability_req(queue_t *q, mblk_t *mp)
439 {
440 	rts_t			*rts = (rts_t *)q->q_ptr;
441 	t_uscalar_t		cap_bits1;
442 	struct T_capability_ack	*tcap;
443 
444 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
445 
446 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
447 		mp->b_datap->db_type, T_CAPABILITY_ACK);
448 	if (mp == NULL)
449 		return;
450 
451 	tcap = (struct T_capability_ack *)mp->b_rptr;
452 	tcap->CAP_bits1 = 0;
453 
454 	if (cap_bits1 & TC1_INFO) {
455 		rts_copy_info(&tcap->INFO_ack, rts);
456 		tcap->CAP_bits1 |= TC1_INFO;
457 	}
458 
459 	qreply(q, mp);
460 }
461 
462 /*
463  * This routine responds to T_INFO_REQ messages.  It is called by rts_wput.
464  * Most of the T_INFO_ACK information is copied from rts_g_t_info_ack.
465  * The current state of the stream is copied from rts_state.
466  */
467 static void
468 rts_info_req(queue_t *q, mblk_t *mp)
469 {
470 	rts_t	*rts = (rts_t *)q->q_ptr;
471 
472 	mp = tpi_ack_alloc(mp, sizeof (rts_g_t_info_ack), M_PCPROTO,
473 	    T_INFO_ACK);
474 	if (mp == NULL)
475 		return;
476 	rts_copy_info((struct T_info_ack *)mp->b_rptr, rts);
477 	qreply(q, mp);
478 }
479 
480 /*
481  * This routine gets default values of certain options whose default
482  * values are maintained by protcol specific code
483  */
484 /* ARGSUSED */
485 int
486 rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
487 {
488 	/* no default value processed by protocol specific code currently */
489 	return (-1);
490 }
491 
492 /*
493  * This routine retrieves the current status of socket options.
494  * It returns the size of the option retrieved.
495  */
496 int
497 rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
498 {
499 	int	*i1 = (int *)ptr;
500 	rts_t	*rts = (rts_t *)q->q_ptr;
501 
502 	switch (level) {
503 	case SOL_SOCKET:
504 		switch (name) {
505 		case SO_DEBUG:
506 			*i1 = rts->rts_debug;
507 			break;
508 		case SO_REUSEADDR:
509 			*i1 = rts->rts_reuseaddr;
510 			break;
511 		case SO_TYPE:
512 			*i1 = SOCK_RAW;
513 			break;
514 
515 		/*
516 		 * The following three items are available here,
517 		 * but are only meaningful to IP.
518 		 */
519 		case SO_DONTROUTE:
520 			*i1 = rts->rts_dontroute;
521 			break;
522 		case SO_USELOOPBACK:
523 			*i1 = rts->rts_useloopback;
524 			break;
525 		case SO_BROADCAST:
526 			*i1 = rts->rts_broadcast;
527 			break;
528 		case SO_PROTOTYPE:
529 			*i1 = rts->rts_proto;
530 			break;
531 		/*
532 		 * The following two items can be manipulated,
533 		 * but changing them should do nothing.
534 		 */
535 		case SO_SNDBUF:
536 			ASSERT(q->q_hiwat <= INT_MAX);
537 			*i1 = (int)(q->q_hiwat);
538 			break;
539 		case SO_RCVBUF:
540 			ASSERT(q->q_hiwat <= INT_MAX);
541 			*i1 = (int)(RD(q)->q_hiwat);
542 			break;
543 		default:
544 			return (-1);
545 		}
546 		break;
547 	default:
548 		return (-1);
549 	}
550 	return ((int)sizeof (int));
551 }
552 
553 
554 /*
555  * This routine sets socket options.
556  */
557 /*ARGSUSED*/
558 int
559 rts_opt_set(queue_t *q, uint_t optset_context, int level,
560     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
561     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
562 {
563 	int	*i1 = (int *)invalp;
564 	rts_t	*rts = (rts_t *)q->q_ptr;
565 	boolean_t checkonly;
566 
567 	switch (optset_context) {
568 	case SETFN_OPTCOM_CHECKONLY:
569 		checkonly = B_TRUE;
570 		/*
571 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
572 		 * inlen != 0 implies value supplied and
573 		 * 	we have to "pretend" to set it.
574 		 * inlen == 0 implies that there is no
575 		 * 	value part in T_CHECK request and just validation
576 		 * done elsewhere should be enough, we just return here.
577 		 */
578 		if (inlen == 0) {
579 			*outlenp = 0;
580 			return (0);
581 		}
582 		break;
583 	case SETFN_OPTCOM_NEGOTIATE:
584 		checkonly = B_FALSE;
585 		break;
586 	case SETFN_UD_NEGOTIATE:
587 	case SETFN_CONN_NEGOTIATE:
588 		checkonly = B_FALSE;
589 		/*
590 		 * Negotiating local and "association-related" options
591 		 * through T_UNITDATA_REQ or T_CONN_{REQ,CON}
592 		 * Not allowed in this module.
593 		 */
594 		return (EINVAL);
595 	default:
596 		/*
597 		 * We should never get here
598 		 */
599 		*outlenp = 0;
600 		return (EINVAL);
601 	}
602 
603 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
604 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
605 
606 	/*
607 	 * For rts, we should have no ancillary data sent down
608 	 * (rts_wput doesn't handle options).
609 	 */
610 	ASSERT(thisdg_attrs == NULL);
611 
612 	/*
613 	 * For fixed length options, no sanity check
614 	 * of passed in length is done. It is assumed *_optcom_req()
615 	 * routines do the right thing.
616 	 */
617 
618 	switch (level) {
619 	case SOL_SOCKET:
620 		switch (name) {
621 		case SO_REUSEADDR:
622 			if (!checkonly)
623 				rts->rts_reuseaddr = *i1;
624 			break;	/* goto sizeof (int) option return */
625 		case SO_DEBUG:
626 			if (!checkonly)
627 				rts->rts_debug = *i1;
628 			break;	/* goto sizeof (int) option return */
629 		/*
630 		 * The following three items are available here,
631 		 * but are only meaningful to IP.
632 		 */
633 		case SO_DONTROUTE:
634 			if (!checkonly)
635 				rts->rts_dontroute = *i1;
636 			break;	/* goto sizeof (int) option return */
637 		case SO_USELOOPBACK:
638 			if (!checkonly)
639 				rts->rts_useloopback = *i1;
640 			break;	/* goto sizeof (int) option return */
641 		case SO_BROADCAST:
642 			if (!checkonly)
643 				rts->rts_broadcast = *i1;
644 			break;	/* goto sizeof (int) option return */
645 		case SO_PROTOTYPE:
646 			/*
647 			 * Routing socket applications that call socket() with
648 			 * a third argument can filter which messages will be
649 			 * sent upstream thanks to sockfs.  so_socket() sends
650 			 * down the SO_PROTOTYPE and rts_queue_input()
651 			 * implements the filtering.
652 			 */
653 			if (*i1 != AF_INET && *i1 != AF_INET6)
654 				return (EPROTONOSUPPORT);
655 			if (!checkonly)
656 				rts->rts_proto = *i1;
657 			break;	/* goto sizeof (int) option return */
658 		/*
659 		 * The following two items can be manipulated,
660 		 * but changing them should do nothing.
661 		 */
662 		case SO_SNDBUF:
663 			if (*i1 > rts_max_buf) {
664 				*outlenp = 0;
665 				return (ENOBUFS);
666 			}
667 			if (!checkonly) {
668 				q->q_hiwat = *i1;
669 				q->q_next->q_hiwat = *i1;
670 			}
671 			break;	/* goto sizeof (int) option return */
672 		case SO_RCVBUF:
673 			if (*i1 > rts_max_buf) {
674 				*outlenp = 0;
675 				return (ENOBUFS);
676 			}
677 			if (!checkonly) {
678 				RD(q)->q_hiwat = *i1;
679 				(void) mi_set_sth_hiwat(RD(q), *i1);
680 			}
681 			break;	/* goto sizeof (int) option return */
682 		default:
683 			*outlenp = 0;
684 			return (EINVAL);
685 		}
686 		break;
687 	default:
688 		*outlenp = 0;
689 		return (EINVAL);
690 	}
691 	/*
692 	 * Common case of return from an option that is sizeof (int)
693 	 */
694 	*(int *)outvalp = *i1;
695 	*outlenp = (t_uscalar_t)sizeof (int);
696 	return (0);
697 }
698 
699 /*
700  * This routine frees the ND table if all streams have been closed.
701  * It is called by rts_close and rts_open.
702  */
703 static void
704 rts_param_cleanup(void)
705 {
706 	if (!rts_open_streams)
707 		nd_free(&rts_g_nd);
708 }
709 
710 /*
711  * This routine retrieves the value of an ND variable in a rtsparam_t
712  * structure. It is called through nd_getset when a user reads the
713  * variable.
714  */
715 /* ARGSUSED */
716 static int
717 rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
718 {
719 	rtsparam_t	*rtspa = (rtsparam_t *)cp;
720 
721 	(void) mi_mpprintf(mp, "%u", rtspa->rts_param_value);
722 	return (0);
723 }
724 
725 /*
726  * Walk through the param array specified registering each element with the
727  * named dispatch (ND) handler.
728  */
729 static boolean_t
730 rts_param_register(rtsparam_t *rtspa, int cnt)
731 {
732 	for (; cnt-- > 0; rtspa++) {
733 		if (rtspa->rts_param_name != NULL && rtspa->rts_param_name[0]) {
734 			if (!nd_load(&rts_g_nd, rtspa->rts_param_name,
735 			    rts_param_get, rts_param_set, (caddr_t)rtspa)) {
736 				nd_free(&rts_g_nd);
737 				return (B_FALSE);
738 			}
739 		}
740 	}
741 	return (B_TRUE);
742 }
743 
744 /* This routine sets an ND variable in a rtsparam_t structure. */
745 /* ARGSUSED */
746 static int
747 rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
748 {
749 	ulong_t	new_value;
750 	rtsparam_t	*rtspa = (rtsparam_t *)cp;
751 
752 	/*
753 	 * Fail the request if the new value does not lie within the
754 	 * required bounds.
755 	 */
756 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
757 	    new_value < rtspa->rts_param_min ||
758 	    new_value > rtspa->rts_param_max) {
759 		return (EINVAL);
760 	}
761 
762 	/* Set the new value */
763 	rtspa->rts_param_value = new_value;
764 	return (0);
765 }
766 
767 /*
768  * This routine handles synchronous messages passed downstream. It either
769  * consumes the message or passes it downstream; it never queues a
770  * a message. The data messages that go down are wrapped in an IOCTL
771  * message.
772  *
773  * Since it is synchronous, it waits for the M_IOCACK/M_IOCNAK so that
774  * it can return an immediate error (such as ENETUNREACH when adding a route).
775  * It uses the RTS_WRW_PENDING to ensure that each rts instance has only
776  * one M_IOCTL outstanding at any given time.
777  */
778 static int
779 rts_wrw(queue_t *q, struiod_t *dp)
780 {
781 	mblk_t	*mp = dp->d_mp;
782 	mblk_t	*mp1;
783 	int	error;
784 	rt_msghdr_t	*rtm;
785 	rts_t	*rts;
786 
787 	rts = (rts_t *)q->q_ptr;
788 	while (rts->rts_flag & RTS_WRW_PENDING) {
789 		if (qwait_rw(q)) {
790 			rts->rts_error = EINTR;
791 			goto err_ret;
792 		}
793 	}
794 	rts->rts_flag |= RTS_WRW_PENDING;
795 
796 	if (isuioq(q) && (error = struioget(q, mp, dp, 0))) {
797 		/*
798 		 * Uio error of some sort, so just return the error.
799 		 */
800 		rts->rts_error = error;
801 		goto err_ret;
802 	}
803 	/*
804 	 * Pass the mblk (chain) onto wput().
805 	 */
806 	dp->d_mp = 0;
807 
808 	switch (mp->b_datap->db_type) {
809 	case M_PROTO:
810 	case M_PCPROTO:
811 		/* Expedite other than T_DATA_REQ to below the switch */
812 		if (((mp->b_wptr - mp->b_rptr) !=
813 		    sizeof (struct T_data_req)) ||
814 		    (((union T_primitives *)mp->b_rptr)->type != T_DATA_REQ))
815 			break;
816 		if ((mp1 = mp->b_cont) == NULL) {
817 			rts->rts_error = EINVAL;
818 			goto err_ret;
819 		}
820 		freeb(mp);
821 		mp = mp1;
822 		/* FALLTHRU */
823 	case M_DATA:
824 		/*
825 		 * The semantics of the routing socket is such that the rtm_pid
826 		 * field is automatically filled in during requests with the
827 		 * current process' pid.  We do this here (where we still have
828 		 * user context) after checking we have at least a message the
829 		 * size of a routing message header.
830 		 */
831 		if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
832 			if (!pullupmsg(mp, sizeof (rt_msghdr_t))) {
833 				rts->rts_error = EINVAL;
834 				goto err_ret;
835 			}
836 		}
837 		rtm = (rt_msghdr_t *)mp->b_rptr;
838 		rtm->rtm_pid = curproc->p_pid;
839 		break;
840 	default:
841 		break;
842 	}
843 	rts->rts_flag |= RTS_WPUT_PENDING;
844 	rts_wput(q, mp);
845 	while (rts->rts_flag & RTS_WPUT_PENDING)
846 		if (qwait_rw(q)) {
847 			/* RTS_WPUT_PENDING will be cleared below */
848 			rts->rts_error = EINTR;
849 			break;
850 		}
851 err_ret:
852 	rts->rts_flag &= ~(RTS_WPUT_PENDING | RTS_WRW_PENDING);
853 	return (rts->rts_error);
854 }
855 
856 /*
857  * This routine handles all messages passed downstream. It either
858  * consumes the message or passes it downstream; it never queues a
859  * a message. The data messages that go down are wrapped in an IOCTL
860  * message.
861  */
862 static void
863 rts_wput(queue_t *q, mblk_t *mp)
864 {
865 	uchar_t	*rptr = mp->b_rptr;
866 	mblk_t	*mp1;
867 
868 	switch (mp->b_datap->db_type) {
869 	case M_DATA:
870 		break;
871 	case M_PROTO:
872 	case M_PCPROTO:
873 		if ((mp->b_wptr - rptr) == sizeof (struct T_data_req)) {
874 			/* Expedite valid T_DATA_REQ to below the switch */
875 			if (((union T_primitives *)rptr)->type == T_DATA_REQ) {
876 				mp1 = mp->b_cont;
877 				freeb(mp);
878 				if (mp1 == NULL)
879 					return;
880 				mp = mp1;
881 				break;
882 			}
883 		}
884 		/* FALLTHRU */
885 	default:
886 		rts_wput_other(q, mp);
887 		return;
888 	}
889 
890 
891 	mp1 = rts_ioctl_alloc(mp, DB_CRED(mp));
892 	if (mp1 == NULL) {
893 		rts_t	*rts = (rts_t *)q->q_ptr;
894 
895 		ASSERT(rts != NULL);
896 		freemsg(mp);
897 		if (rts->rts_flag & RTS_WPUT_PENDING) {
898 			rts->rts_error = ENOMEM;
899 			rts->rts_flag &= ~RTS_WPUT_PENDING;
900 		}
901 		return;
902 	}
903 	putnext(q, mp1);
904 }
905 
906 
907 /*
908  * Handles all the control message, if it
909  * can not understand it, it will
910  * pass down stream.
911  */
912 static void
913 rts_wput_other(queue_t *q, mblk_t *mp)
914 {
915 	uchar_t	*rptr = mp->b_rptr;
916 	rts_t	*rts;
917 	struct iocblk	*iocp;
918 	cred_t	*cr;
919 
920 	rts = (rts_t *)q->q_ptr;
921 
922 	cr = DB_CREDDEF(mp, rts->rts_credp);
923 
924 	switch (mp->b_datap->db_type) {
925 	case M_PROTO:
926 	case M_PCPROTO:
927 		if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) {
928 			/*
929 			 * If the message does not contain a PRIM_type,
930 			 * throw it away.
931 			 */
932 			freemsg(mp);
933 			return;
934 		}
935 		switch (((union T_primitives *)rptr)->type) {
936 		case T_BIND_REQ:
937 		case O_T_BIND_REQ:
938 			rts_bind(q, mp);
939 			return;
940 		case T_UNBIND_REQ:
941 			rts_unbind(q, mp);
942 			return;
943 		case T_CAPABILITY_REQ:
944 			rts_capability_req(q, mp);
945 			return;
946 		case T_INFO_REQ:
947 			rts_info_req(q, mp);
948 			return;
949 		case T_SVR4_OPTMGMT_REQ:
950 			(void) svr4_optcom_req(q, mp, cr, &rts_opt_obj);
951 			return;
952 		case T_OPTMGMT_REQ:
953 			(void) tpi_optcom_req(q, mp, cr, &rts_opt_obj);
954 			return;
955 		case O_T_CONN_RES:
956 		case T_CONN_RES:
957 		case T_DISCON_REQ:
958 			/* Not supported by rts. */
959 			rts_err_ack(q, mp, TNOTSUPPORT, 0);
960 			return;
961 		case T_DATA_REQ:
962 		case T_EXDATA_REQ:
963 		case T_ORDREL_REQ:
964 			/* Illegal for rts. */
965 			freemsg(mp);
966 			(void) putnextctl1(RD(q), M_ERROR, EPROTO);
967 			return;
968 		default:
969 			break;
970 		}
971 		break;
972 	case M_IOCTL:
973 		iocp = (struct iocblk *)mp->b_rptr;
974 		switch (iocp->ioc_cmd) {
975 		case ND_SET:
976 		case ND_GET:
977 			if (nd_getset(q, rts_g_nd, mp)) {
978 				qreply(q, mp);
979 				return;
980 			}
981 			break;
982 		case TI_GETPEERNAME:
983 			mi_copyin(q, mp, NULL,
984 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
985 			return;
986 		default:
987 			break;
988 		}
989 	case M_IOCDATA:
990 		rts_wput_iocdata(q, mp);
991 		return;
992 	default:
993 		break;
994 	}
995 	putnext(q, mp);
996 }
997 
998 /*
999  * Called by rts_wput_other to handle all M_IOCDATA messages.
1000  */
1001 static void
1002 rts_wput_iocdata(queue_t *q, mblk_t *mp)
1003 {
1004 	struct sockaddr	*rtsaddr;
1005 	mblk_t	*mp1;
1006 	STRUCT_HANDLE(strbuf, sb);
1007 	struct iocblk	*iocp	= (struct iocblk *)mp->b_rptr;
1008 
1009 	/* Make sure it is one of ours. */
1010 	switch (iocp->ioc_cmd) {
1011 	case TI_GETPEERNAME:
1012 		break;
1013 	default:
1014 		putnext(q, mp);
1015 		return;
1016 	}
1017 	switch (mi_copy_state(q, mp, &mp1)) {
1018 	case -1:
1019 		return;
1020 	case MI_COPY_CASE(MI_COPY_IN, 1):
1021 		break;
1022 	case MI_COPY_CASE(MI_COPY_OUT, 1):
1023 		/* Copy out the strbuf. */
1024 		mi_copyout(q, mp);
1025 		return;
1026 	case MI_COPY_CASE(MI_COPY_OUT, 2):
1027 		/* All done. */
1028 		mi_copy_done(q, mp, 0);
1029 		return;
1030 	default:
1031 		mi_copy_done(q, mp, EPROTO);
1032 		return;
1033 	}
1034 	STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
1035 	if (STRUCT_FGET(sb, maxlen) < (int)sizeof (sin_t)) {
1036 		mi_copy_done(q, mp, EINVAL);
1037 		return;
1038 	}
1039 	switch (iocp->ioc_cmd) {
1040 	case TI_GETPEERNAME:
1041 		break;
1042 	default:
1043 		mi_copy_done(q, mp, EPROTO);
1044 		return;
1045 	}
1046 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), sizeof (sin_t),
1047 	    B_TRUE);
1048 	if (mp1 == NULL)
1049 		return;
1050 	STRUCT_FSET(sb, len, (int)sizeof (sin_t));
1051 	rtsaddr = (struct sockaddr *)mp1->b_rptr;
1052 	mp1->b_wptr = (uchar_t *)&rtsaddr[1];
1053 	bzero(rtsaddr, sizeof (struct sockaddr));
1054 	rtsaddr->sa_family = AF_ROUTE;
1055 	/* Copy out the address */
1056 	mi_copyout(q, mp);
1057 }
1058 
1059 static void
1060 rts_rput(queue_t *q, mblk_t *mp)
1061 {
1062 	rts_t	*rts;
1063 	struct iocblk	*iocp;
1064 	mblk_t *mp1;
1065 	struct T_data_ind *tdi;
1066 
1067 	rts = (rts_t *)q->q_ptr;
1068 	switch (mp->b_datap->db_type) {
1069 	case M_IOCACK:
1070 	case M_IOCNAK:
1071 		iocp = (struct iocblk *)mp->b_rptr;
1072 		if (rts->rts_flag & (RTS_WPUT_PENDING|RTS_OPEN_PENDING)) {
1073 			if (rts->rts_flag & RTS_WPUT_PENDING)
1074 				rts->rts_flag &= ~RTS_WPUT_PENDING;
1075 			else
1076 				rts->rts_flag &= ~RTS_OPEN_PENDING;
1077 			rts->rts_error = iocp->ioc_error;
1078 			freemsg(mp);
1079 			return;
1080 		}
1081 		break;
1082 	case M_DATA:
1083 		/*
1084 		 * Prepend T_DATA_IND to prevent the stream head from
1085 		 * consolidating multiple messages together.
1086 		 * If the allocation fails just send up the M_DATA.
1087 		 */
1088 		mp1 = allocb(sizeof (*tdi), BPRI_MED);
1089 		if (mp1 != NULL) {
1090 			mp1->b_cont = mp;
1091 			mp = mp1;
1092 
1093 			mp->b_datap->db_type = M_PROTO;
1094 			mp->b_wptr += sizeof (*tdi);
1095 			tdi = (struct T_data_ind *)mp->b_rptr;
1096 			tdi->PRIM_type = T_DATA_IND;
1097 			tdi->MORE_flag = 0;
1098 		}
1099 		break;
1100 	default:
1101 		break;
1102 	}
1103 	putnext(q, mp);
1104 }
1105 
1106 
1107 void
1108 rts_ddi_init(void)
1109 {
1110 	rts_max_optsize = optcom_max_optsize(rts_opt_obj.odb_opt_des_arr,
1111 	    rts_opt_obj.odb_opt_arr_cnt);
1112 }
1113