xref: /titanic_52/usr/src/uts/common/inet/ip/rts.c (revision 3472f5dc5f31b4385e87145cddb24ae3c94de14b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsubr.h>
31 #include <sys/stropts.h>
32 #include <sys/strsun.h>
33 #include <sys/strlog.h>
34 #define	_SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/timod.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/cmn_err.h>
40 #include <sys/proc.h>
41 #include <sys/suntpi.h>
42 #include <sys/policy.h>
43 
44 #include <sys/socket.h>
45 #include <netinet/in.h>
46 
47 #include <inet/common.h>
48 #include <netinet/ip6.h>
49 #include <inet/ip.h>
50 #include <inet/mi.h>
51 #include <inet/nd.h>
52 #include <inet/optcom.h>
53 #include <netinet/ip_mroute.h>
54 #include <sys/isa_defs.h>
55 #include <net/route.h>
56 
/*
 * This is a transport provider for routing sockets.  Downstream messages are
 * wrapped with an IP_IOCTL header, and ip_wput_ioctl calls the appropriate
 * entry in the ip_ioctl_ftbl callout table to pass the routing socket data
 * into IP.  Upstream messages are generated for listeners of the routing
 * socket as well as the message sender (unless they have turned off their end
 * using SO_USELOOPBACK or shutdown(3n)).  Upstream messages may also be
 * generated asynchronously when:
 *
 *	Interfaces are brought up or down.
 *	Addresses are assigned to interfaces.
 *	ICMP redirects are processed and an IRE_HOST_REDIRECT is installed.
 *	No route is found while sending a packet.
 *	TCP requests IP to remove an IRE_CACHE of a troubled destination.
 *
 * Since all we do is reformat the messages between routing socket and
 * ioctl forms, no synchronization is necessary in this module; all
 * the dirty work is done down in ip.
 */
76 
/*
 * Option database for rts.  It is handed to svr4_optcom_req() and
 * tpi_optcom_req() (see rts_wput_other()), which take care of option
 * management and call back into the rts_opt_*() methods.
 * XXX. These and other externs should really move to an rts header.
 */
extern optdb_obj_t	rts_opt_obj;
/* Largest option size; computed from rts_opt_obj in rts_ddi_init(). */
extern uint_t		rts_max_optsize;
85 
86 /* Internal routing socket stream control structure, one per open stream */
87 typedef	struct rts_s {
88 	cred_t	*rts_credp;		/* Opener's credentials */
89 	uint_t	rts_state;		/* Provider interface state */
90 	uint_t	rts_error;		/* Routing socket error code */
91 	uint_t	rts_flag;		/* Pending I/O state */
92 	uint_t	rts_proto;		/* SO_PROTOTYPE "socket" option. */
93 	uint_t	rts_debug : 1,		/* SO_DEBUG "socket" option. */
94 		rts_dontroute : 1,	/* SO_DONTROUTE "socket" option. */
95 		rts_broadcast : 1,	/* SO_BROADCAST "socket" option. */
96 		rts_reuseaddr : 1,	/* SO_REUSEADDR "socket" option. */
97 		rts_useloopback : 1,	/* SO_USELOOPBACK "socket" option. */
98 		rts_multicast_loop : 1,	/* IP_MULTICAST_LOOP option */
99 		rts_hdrincl : 1,	/* IP_HDRINCL option + RAW and IGMP */
100 
101 		: 0;
102 } rts_t;
103 
/*
 * Bits kept in rts_flag to track requests that are still in progress.
 * RTS_WPUT_PENDING and RTS_OPEN_PENDING are cleared by rts_rput() when the
 * matching M_IOCACK/M_IOCNAK arrives.
 */
#define	RTS_WPUT_PENDING	0x1	/* Waiting for write-side to complete */
#define	RTS_WRW_PENDING		0x2	/* Routing socket write in progress */
#define	RTS_OPEN_PENDING	0x4	/* Routing socket open in progress */
107 
/*
 * Template T_INFO_ACK.  rts_copy_info() copies it into each reply and then
 * overwrites CURRENT_state and OPT_size with the live values.
 */
static struct T_info_ack rts_g_t_info_ack = {
	T_INFO_ACK,
	T_INFINITE,	/* TSDU_size. Maximum size messages. */
	T_INVALID,	/* ETSDU_size. No expedited data. */
	T_INVALID,	/* CDATA_size. No connect data. */
	T_INVALID,	/* DDATA_size. No disconnect data. */
	0,		/* ADDR_size. */
	0,		/* OPT_size - filled in by rts_copy_info() */
	64 * 1024,	/* TIDU_size. rts allows maximum size messages. */
	T_COTS,		/* SERV_type. rts supports connection oriented. */
	TS_UNBND,	/* CURRENT_state. This is set from rts_state. */
	(XPG4_1)	/* PROVIDER_flag */
};
122 
/*
 * Named Dispatch (ND) parameter descriptor: one tunable together with the
 * bounds that rts_param_set() enforces on it.
 */
typedef struct rtspparam_s {
	uint_t	rts_param_min;		/* Smallest value accepted */
	uint_t	rts_param_max;		/* Largest value accepted */
	uint_t	rts_param_value;	/* Current value */
	char	*rts_param_name;	/* Name the variable is loaded under */
} rtsparam_t;
130 
/*
 * Table of ND variables supported by rts.  rts_open() registers the entries
 * with rts_g_nd (through rts_param_register()) when no ND table exists yet.
 * All of these are alterable, within the min/max values given, at run time
 * (see rts_param_set()).
 */
static rtsparam_t	rts_param_arr[] = {
	/* min		max		value		name */
	{ 4096,		65536,		8192,		"rts_xmit_hiwat"},
	{ 0,		65536,		1024,		"rts_xmit_lowat"},
	{ 4096,		65536,		8192,		"rts_recv_hiwat"},
	{ 65536,	1024*1024*1024, 256*1024,	"rts_max_buf"},
};
/* Shorthand for the current value of each tunable in the table above. */
#define	rts_xmit_hiwat			rts_param_arr[0].rts_param_value
#define	rts_xmit_lowat			rts_param_arr[1].rts_param_value
#define	rts_recv_hiwat			rts_param_arr[2].rts_param_value
#define	rts_max_buf			rts_param_arr[3].rts_param_value
147 
/*
 * Forward declarations.  The rts_opt_*() routines are the only ones here
 * that are not static.
 */
static int	rts_close(queue_t *q);
static void 	rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    int sys_error);
static mblk_t	*rts_ioctl_alloc(mblk_t *data, cred_t *cr);
static int	rts_open(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp);
int		rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
    uchar_t *ptr);
int		rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name,
    uchar_t *ptr);
int		rts_opt_set(queue_t *q, uint_t optset_context, int level,
    int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
    uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
static void	rts_param_cleanup(void);
static int	rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t rts_param_register(rtsparam_t *rtspa, int cnt);
static int	rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr);
static void	rts_rput(queue_t *q, mblk_t *mp);
static void	rts_wput(queue_t *q, mblk_t *mp);
static void	rts_wput_iocdata(queue_t *q, mblk_t *mp);
static void 	rts_wput_other(queue_t *q, mblk_t *mp);
static int	rts_wrw(queue_t *q, struiod_t *dp);
171 
/* Module information (id 129, name "rts") shared by both queues. */
static struct module_info info = {
	129, "rts", 1, INFPSZ, 512, 128
};

/* Read side: rts_rput() handles messages; also names the open and close. */
static struct qinit rinit = {
	(pfi_t)rts_rput, NULL, rts_open, rts_close, NULL, &info
};

/* Write side: rts_wput() handles messages, rts_wrw() synchronous writes. */
static struct qinit winit = {
	(pfi_t)rts_wput, NULL, NULL, NULL, NULL, &info,
	NULL, (pfi_t)rts_wrw, NULL, STRUIOT_STANDARD
};

/* Ties the read-side and write-side definitions together. */
struct streamtab rtsinfo = {
	&rinit, &winit
};
188 
static IDP	rts_g_nd;	/* Points to table of RTS ND variables. */
/*
 * Number of rts streams: incremented in rts_open(), decremented in
 * rts_close(), and tested by rts_param_cleanup() before freeing rts_g_nd.
 */
uint_t		rts_open_streams = 0;
191 
192 /*
193  * This routine allocates the necessary
194  * message blocks for IOCTL wrapping the
195  * user data.
196  */
197 static mblk_t *
198 rts_ioctl_alloc(mblk_t *data, cred_t *cr)
199 {
200 	mblk_t	*mp = NULL;
201 	mblk_t	*mp1 = NULL;
202 	ipllc_t	*ipllc;
203 	struct iocblk	*ioc;
204 
205 	mp = allocb_cred(sizeof (ipllc_t), cr);
206 	if (mp == NULL)
207 		return (NULL);
208 	mp1 = allocb_cred(sizeof (struct iocblk), cr);
209 	if (mp1 == NULL) {
210 		freeb(mp);
211 		return (NULL);
212 	}
213 
214 	ipllc = (ipllc_t *)mp->b_rptr;
215 	ipllc->ipllc_cmd = IP_IOC_RTS_REQUEST;
216 	ipllc->ipllc_name_offset = 0;
217 	ipllc->ipllc_name_length = 0;
218 	mp->b_wptr += sizeof (ipllc_t);
219 	mp->b_cont = data;
220 
221 	ioc = (struct iocblk *)mp1->b_rptr;
222 	ioc->ioc_cmd = IP_IOCTL;
223 	ioc->ioc_error = 0;
224 	ioc->ioc_cr = NULL;
225 	ioc->ioc_count = msgdsize(mp);
226 	mp1->b_wptr += sizeof (struct iocblk);
227 	mp1->b_datap->db_type = M_IOCTL;
228 	mp1->b_cont = mp;
229 
230 	return (mp1);
231 }
232 
233 /*
234  * This routine closes rts stream, by disabling
235  * put/srv routines and freeing the this module
236  * internal datastructure.
237  */
238 static int
239 rts_close(queue_t *q)
240 {
241 	qprocsoff(q);
242 
243 	crfree(((rts_t *)q->q_ptr)->rts_credp);
244 
245 	mi_free(q->q_ptr);
246 	rts_open_streams--;
247 	/*
248 	 * Free the ND table if this was
249 	 * the last stream close
250 	 */
251 	rts_param_cleanup();
252 	return (0);
253 }
254 
255 /*
256  * This is the open routine for routing socket. It allocates
257  * rts_t structure for the stream and sends an IOCTL to
258  * the down module to indicate that it is a routing socket
259  * stream.
260  */
261 /* ARGSUSED */
262 static int
263 rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
264 {
265 	mblk_t	*mp = NULL;
266 	rts_t	*rts;
267 
268 	/* If the stream is already open, return immediately. */
269 	if (q->q_ptr != NULL)
270 		return (0);
271 
272 	/* If this is not a push of rts as a module, fail. */
273 	if (sflag != MODOPEN)
274 		return (EINVAL);
275 
276 	/* If this is the first open of rts, create the ND table. */
277 	if (rts_g_nd == NULL) {
278 		if (!rts_param_register(rts_param_arr, A_CNT(rts_param_arr)))
279 			return (ENOMEM);
280 	}
281 	q->q_ptr = mi_zalloc_sleep(sizeof (rts_t));
282 	WR(q)->q_ptr = q->q_ptr;
283 	rts = (rts_t *)q->q_ptr;
284 
285 	rts->rts_credp = credp;
286 	crhold(credp);
287 	/*
288 	 * The receive hiwat is only looked at on the stream head queue.
289 	 * Store in q_hiwat in order to return on SO_RCVBUF getsockopts.
290 	 */
291 	q->q_hiwat = rts_recv_hiwat;
292 	/*
293 	 * The transmit hiwat/lowat is only looked at on IP's queue.
294 	 * Store in q_hiwat/q_lowat in order to return on SO_SNDBUF/SO_SNDLOWAT
295 	 * getsockopts.
296 	 */
297 	WR(q)->q_hiwat = rts_xmit_hiwat;
298 	WR(q)->q_lowat = rts_xmit_lowat;
299 	qprocson(q);
300 	/*
301 	 * Indicate the down IP module that this is a routing socket
302 	 * client by sending an RTS IOCTL without any user data. Although
303 	 * this is just a notification message (without any real routing
304 	 * request), we pass in any credential for correctness sake.
305 	 */
306 	mp = rts_ioctl_alloc(NULL, credp);
307 	if (mp == NULL) {
308 		rts_param_cleanup();
309 		qprocsoff(q);
310 		ASSERT(q->q_ptr != NULL);
311 		mi_free(q->q_ptr);
312 		crfree(credp);
313 		return (ENOMEM);
314 	}
315 	rts_open_streams++;
316 	rts->rts_flag |= RTS_OPEN_PENDING;
317 	putnext(WR(q), mp);
318 	while (rts->rts_flag & RTS_OPEN_PENDING) {
319 		if (!qwait_sig(q)) {
320 			(void) rts_close(q);
321 			return (EINTR);
322 		}
323 	}
324 	if (rts->rts_error != 0) {
325 		(void) rts_close(q);
326 		return (ENOTSUP);
327 	}
328 	rts->rts_state = TS_UNBND;
329 	return (0);
330 }
331 
332 /*
333  * This routine creates a T_ERROR_ACK message and passes it upstream.
334  */
335 static void
336 rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
337 {
338 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
339 		qreply(q, mp);
340 }
341 
342 /*
343  * This routine creates a T_OK_ACK message and passes it upstream.
344  */
345 static void
346 rts_ok_ack(queue_t *q, mblk_t *mp)
347 {
348 	if ((mp = mi_tpi_ok_ack_alloc(mp)) != NULL)
349 		qreply(q, mp);
350 }
351 
352 /*
353  * This routine is called by rts_wput to handle T_UNBIND_REQ messages.
354  * After some error checking, the message is passed downstream to ip.
355  */
356 static void
357 rts_unbind(queue_t *q, mblk_t *mp)
358 {
359 	rts_t	*rts;
360 
361 	rts = (rts_t *)q->q_ptr;
362 	/* If a bind has not been done, we can't unbind. */
363 	if (rts->rts_state != TS_IDLE) {
364 		rts_err_ack(q, mp, TOUTSTATE, 0);
365 		return;
366 	}
367 	rts->rts_state = TS_UNBND;
368 	rts_ok_ack(q, mp);
369 }
370 
371 /*
372  * This routine is called to handle each
373  * O_T_BIND_REQ/T_BIND_REQ message passed to
374  * rts_wput. Note: This routine works with both
375  * O_T_BIND_REQ and T_BIND_REQ semantics.
376  */
377 static void
378 rts_bind(queue_t *q, mblk_t *mp)
379 {
380 	mblk_t	*mp1;
381 	struct T_bind_req *tbr;
382 	rts_t	*rts;
383 
384 	rts = (rts_t *)q->q_ptr;
385 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
386 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
387 		    "rts_bind: bad data, %d", rts->rts_state);
388 		rts_err_ack(q, mp, TBADADDR, 0);
389 		return;
390 	}
391 	if (rts->rts_state != TS_UNBND) {
392 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
393 		    "rts_bind: bad state, %d", rts->rts_state);
394 		rts_err_ack(q, mp, TOUTSTATE, 0);
395 		return;
396 	}
397 	/*
398 	 * Reallocate the message to make sure we have enough room for an
399 	 * address and the protocol type.
400 	 */
401 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin_t), 1);
402 	if (mp1 == NULL) {
403 		rts_err_ack(q, mp, TSYSERR, ENOMEM);
404 		return;
405 	}
406 	mp = mp1;
407 	tbr = (struct T_bind_req *)mp->b_rptr;
408 	if (tbr->ADDR_length != 0) {
409 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
410 		    "rts_bind: bad ADDR_length %d", tbr->ADDR_length);
411 		rts_err_ack(q, mp, TBADADDR, 0);
412 		return;
413 	}
414 	/* Generic request */
415 	tbr->ADDR_offset = (t_scalar_t)sizeof (struct T_bind_req);
416 	tbr->ADDR_length = 0;
417 	tbr->PRIM_type = T_BIND_ACK;
418 	rts->rts_state = TS_IDLE;
419 	qreply(q, mp);
420 }
421 
422 static void
423 rts_copy_info(struct T_info_ack *tap, rts_t *rts)
424 {
425 	*tap = rts_g_t_info_ack;
426 	tap->CURRENT_state = rts->rts_state;
427 	tap->OPT_size = rts_max_optsize;
428 }
429 
430 /*
431  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
432  * rts_wput.  Much of the T_CAPABILITY_ACK information is copied from
433  * rts_g_t_info_ack.  The current state of the stream is copied from
434  * rts_state.
435  */
436 static void
437 rts_capability_req(queue_t *q, mblk_t *mp)
438 {
439 	rts_t			*rts = (rts_t *)q->q_ptr;
440 	t_uscalar_t		cap_bits1;
441 	struct T_capability_ack	*tcap;
442 
443 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
444 
445 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
446 		mp->b_datap->db_type, T_CAPABILITY_ACK);
447 	if (mp == NULL)
448 		return;
449 
450 	tcap = (struct T_capability_ack *)mp->b_rptr;
451 	tcap->CAP_bits1 = 0;
452 
453 	if (cap_bits1 & TC1_INFO) {
454 		rts_copy_info(&tcap->INFO_ack, rts);
455 		tcap->CAP_bits1 |= TC1_INFO;
456 	}
457 
458 	qreply(q, mp);
459 }
460 
461 /*
462  * This routine responds to T_INFO_REQ messages.  It is called by rts_wput.
463  * Most of the T_INFO_ACK information is copied from rts_g_t_info_ack.
464  * The current state of the stream is copied from rts_state.
465  */
466 static void
467 rts_info_req(queue_t *q, mblk_t *mp)
468 {
469 	rts_t	*rts = (rts_t *)q->q_ptr;
470 
471 	mp = tpi_ack_alloc(mp, sizeof (rts_g_t_info_ack), M_PCPROTO,
472 	    T_INFO_ACK);
473 	if (mp == NULL)
474 		return;
475 	rts_copy_info((struct T_info_ack *)mp->b_rptr, rts);
476 	qreply(q, mp);
477 }
478 
/*
 * This routine gets default values of certain options whose default
 * values are maintained by protocol specific code.  rts maintains none,
 * so every request is answered with -1.
 */
/* ARGSUSED */
int
rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
{
	/* no default value processed by protocol specific code currently */
	return (-1);
}
490 
491 /*
492  * This routine retrieves the current status of socket options.
493  * It returns the size of the option retrieved.
494  */
495 int
496 rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
497 {
498 	int	*i1 = (int *)ptr;
499 	rts_t	*rts = (rts_t *)q->q_ptr;
500 
501 	switch (level) {
502 	case SOL_SOCKET:
503 		switch (name) {
504 		case SO_DEBUG:
505 			*i1 = rts->rts_debug;
506 			break;
507 		case SO_REUSEADDR:
508 			*i1 = rts->rts_reuseaddr;
509 			break;
510 		case SO_TYPE:
511 			*i1 = SOCK_RAW;
512 			break;
513 
514 		/*
515 		 * The following three items are available here,
516 		 * but are only meaningful to IP.
517 		 */
518 		case SO_DONTROUTE:
519 			*i1 = rts->rts_dontroute;
520 			break;
521 		case SO_USELOOPBACK:
522 			*i1 = rts->rts_useloopback;
523 			break;
524 		case SO_BROADCAST:
525 			*i1 = rts->rts_broadcast;
526 			break;
527 		case SO_PROTOTYPE:
528 			*i1 = rts->rts_proto;
529 			break;
530 		/*
531 		 * The following two items can be manipulated,
532 		 * but changing them should do nothing.
533 		 */
534 		case SO_SNDBUF:
535 			ASSERT(q->q_hiwat <= INT_MAX);
536 			*i1 = (int)(q->q_hiwat);
537 			break;
538 		case SO_RCVBUF:
539 			ASSERT(q->q_hiwat <= INT_MAX);
540 			*i1 = (int)(RD(q)->q_hiwat);
541 			break;
542 		case SO_DOMAIN:
543 			*i1 = PF_ROUTE;
544 			break;
545 		default:
546 			return (-1);
547 		}
548 		break;
549 	default:
550 		return (-1);
551 	}
552 	return ((int)sizeof (int));
553 }
554 
555 
556 /*
557  * This routine sets socket options.
558  */
559 /*ARGSUSED*/
560 int
561 rts_opt_set(queue_t *q, uint_t optset_context, int level,
562     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
563     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
564 {
565 	int	*i1 = (int *)invalp;
566 	rts_t	*rts = (rts_t *)q->q_ptr;
567 	boolean_t checkonly;
568 
569 	switch (optset_context) {
570 	case SETFN_OPTCOM_CHECKONLY:
571 		checkonly = B_TRUE;
572 		/*
573 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
574 		 * inlen != 0 implies value supplied and
575 		 * 	we have to "pretend" to set it.
576 		 * inlen == 0 implies that there is no
577 		 * 	value part in T_CHECK request and just validation
578 		 * done elsewhere should be enough, we just return here.
579 		 */
580 		if (inlen == 0) {
581 			*outlenp = 0;
582 			return (0);
583 		}
584 		break;
585 	case SETFN_OPTCOM_NEGOTIATE:
586 		checkonly = B_FALSE;
587 		break;
588 	case SETFN_UD_NEGOTIATE:
589 	case SETFN_CONN_NEGOTIATE:
590 		checkonly = B_FALSE;
591 		/*
592 		 * Negotiating local and "association-related" options
593 		 * through T_UNITDATA_REQ or T_CONN_{REQ,CON}
594 		 * Not allowed in this module.
595 		 */
596 		return (EINVAL);
597 	default:
598 		/*
599 		 * We should never get here
600 		 */
601 		*outlenp = 0;
602 		return (EINVAL);
603 	}
604 
605 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
606 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
607 
608 	/*
609 	 * For rts, we should have no ancillary data sent down
610 	 * (rts_wput doesn't handle options).
611 	 */
612 	ASSERT(thisdg_attrs == NULL);
613 
614 	/*
615 	 * For fixed length options, no sanity check
616 	 * of passed in length is done. It is assumed *_optcom_req()
617 	 * routines do the right thing.
618 	 */
619 
620 	switch (level) {
621 	case SOL_SOCKET:
622 		switch (name) {
623 		case SO_REUSEADDR:
624 			if (!checkonly)
625 				rts->rts_reuseaddr = *i1;
626 			break;	/* goto sizeof (int) option return */
627 		case SO_DEBUG:
628 			if (!checkonly)
629 				rts->rts_debug = *i1;
630 			break;	/* goto sizeof (int) option return */
631 		/*
632 		 * The following three items are available here,
633 		 * but are only meaningful to IP.
634 		 */
635 		case SO_DONTROUTE:
636 			if (!checkonly)
637 				rts->rts_dontroute = *i1;
638 			break;	/* goto sizeof (int) option return */
639 		case SO_USELOOPBACK:
640 			if (!checkonly)
641 				rts->rts_useloopback = *i1;
642 			break;	/* goto sizeof (int) option return */
643 		case SO_BROADCAST:
644 			if (!checkonly)
645 				rts->rts_broadcast = *i1;
646 			break;	/* goto sizeof (int) option return */
647 		case SO_PROTOTYPE:
648 			/*
649 			 * Routing socket applications that call socket() with
650 			 * a third argument can filter which messages will be
651 			 * sent upstream thanks to sockfs.  so_socket() sends
652 			 * down the SO_PROTOTYPE and rts_queue_input()
653 			 * implements the filtering.
654 			 */
655 			if (*i1 != AF_INET && *i1 != AF_INET6)
656 				return (EPROTONOSUPPORT);
657 			if (!checkonly)
658 				rts->rts_proto = *i1;
659 			break;	/* goto sizeof (int) option return */
660 		/*
661 		 * The following two items can be manipulated,
662 		 * but changing them should do nothing.
663 		 */
664 		case SO_SNDBUF:
665 			if (*i1 > rts_max_buf) {
666 				*outlenp = 0;
667 				return (ENOBUFS);
668 			}
669 			if (!checkonly) {
670 				q->q_hiwat = *i1;
671 				q->q_next->q_hiwat = *i1;
672 			}
673 			break;	/* goto sizeof (int) option return */
674 		case SO_RCVBUF:
675 			if (*i1 > rts_max_buf) {
676 				*outlenp = 0;
677 				return (ENOBUFS);
678 			}
679 			if (!checkonly) {
680 				RD(q)->q_hiwat = *i1;
681 				(void) mi_set_sth_hiwat(RD(q), *i1);
682 			}
683 			break;	/* goto sizeof (int) option return */
684 		default:
685 			*outlenp = 0;
686 			return (EINVAL);
687 		}
688 		break;
689 	default:
690 		*outlenp = 0;
691 		return (EINVAL);
692 	}
693 	/*
694 	 * Common case of return from an option that is sizeof (int)
695 	 */
696 	*(int *)outvalp = *i1;
697 	*outlenp = (t_uscalar_t)sizeof (int);
698 	return (0);
699 }
700 
701 /*
702  * This routine frees the ND table if all streams have been closed.
703  * It is called by rts_close and rts_open.
704  */
705 static void
706 rts_param_cleanup(void)
707 {
708 	if (!rts_open_streams)
709 		nd_free(&rts_g_nd);
710 }
711 
712 /*
713  * This routine retrieves the value of an ND variable in a rtsparam_t
714  * structure. It is called through nd_getset when a user reads the
715  * variable.
716  */
717 /* ARGSUSED */
718 static int
719 rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
720 {
721 	rtsparam_t	*rtspa = (rtsparam_t *)cp;
722 
723 	(void) mi_mpprintf(mp, "%u", rtspa->rts_param_value);
724 	return (0);
725 }
726 
727 /*
728  * Walk through the param array specified registering each element with the
729  * named dispatch (ND) handler.
730  */
731 static boolean_t
732 rts_param_register(rtsparam_t *rtspa, int cnt)
733 {
734 	for (; cnt-- > 0; rtspa++) {
735 		if (rtspa->rts_param_name != NULL && rtspa->rts_param_name[0]) {
736 			if (!nd_load(&rts_g_nd, rtspa->rts_param_name,
737 			    rts_param_get, rts_param_set, (caddr_t)rtspa)) {
738 				nd_free(&rts_g_nd);
739 				return (B_FALSE);
740 			}
741 		}
742 	}
743 	return (B_TRUE);
744 }
745 
746 /* This routine sets an ND variable in a rtsparam_t structure. */
747 /* ARGSUSED */
748 static int
749 rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
750 {
751 	ulong_t	new_value;
752 	rtsparam_t	*rtspa = (rtsparam_t *)cp;
753 
754 	/*
755 	 * Fail the request if the new value does not lie within the
756 	 * required bounds.
757 	 */
758 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
759 	    new_value < rtspa->rts_param_min ||
760 	    new_value > rtspa->rts_param_max) {
761 		return (EINVAL);
762 	}
763 
764 	/* Set the new value */
765 	rtspa->rts_param_value = new_value;
766 	return (0);
767 }
768 
769 /*
770  * This routine handles synchronous messages passed downstream. It either
771  * consumes the message or passes it downstream; it never queues a
772  * a message. The data messages that go down are wrapped in an IOCTL
773  * message.
774  *
775  * Since it is synchronous, it waits for the M_IOCACK/M_IOCNAK so that
776  * it can return an immediate error (such as ENETUNREACH when adding a route).
777  * It uses the RTS_WRW_PENDING to ensure that each rts instance has only
778  * one M_IOCTL outstanding at any given time.
779  */
780 static int
781 rts_wrw(queue_t *q, struiod_t *dp)
782 {
783 	mblk_t	*mp = dp->d_mp;
784 	mblk_t	*mp1;
785 	int	error;
786 	rt_msghdr_t	*rtm;
787 	rts_t	*rts;
788 
789 	rts = (rts_t *)q->q_ptr;
790 	while (rts->rts_flag & RTS_WRW_PENDING) {
791 		if (qwait_rw(q)) {
792 			rts->rts_error = EINTR;
793 			goto err_ret;
794 		}
795 	}
796 	rts->rts_flag |= RTS_WRW_PENDING;
797 
798 	if (isuioq(q) && (error = struioget(q, mp, dp, 0))) {
799 		/*
800 		 * Uio error of some sort, so just return the error.
801 		 */
802 		rts->rts_error = error;
803 		goto err_ret;
804 	}
805 	/*
806 	 * Pass the mblk (chain) onto wput().
807 	 */
808 	dp->d_mp = 0;
809 
810 	switch (mp->b_datap->db_type) {
811 	case M_PROTO:
812 	case M_PCPROTO:
813 		/* Expedite other than T_DATA_REQ to below the switch */
814 		if (((mp->b_wptr - mp->b_rptr) !=
815 		    sizeof (struct T_data_req)) ||
816 		    (((union T_primitives *)mp->b_rptr)->type != T_DATA_REQ))
817 			break;
818 		if ((mp1 = mp->b_cont) == NULL) {
819 			rts->rts_error = EINVAL;
820 			goto err_ret;
821 		}
822 		freeb(mp);
823 		mp = mp1;
824 		/* FALLTHRU */
825 	case M_DATA:
826 		/*
827 		 * The semantics of the routing socket is such that the rtm_pid
828 		 * field is automatically filled in during requests with the
829 		 * current process' pid.  We do this here (where we still have
830 		 * user context) after checking we have at least a message the
831 		 * size of a routing message header.
832 		 */
833 		if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
834 			if (!pullupmsg(mp, sizeof (rt_msghdr_t))) {
835 				rts->rts_error = EINVAL;
836 				goto err_ret;
837 			}
838 		}
839 		rtm = (rt_msghdr_t *)mp->b_rptr;
840 		rtm->rtm_pid = curproc->p_pid;
841 		break;
842 	default:
843 		break;
844 	}
845 	rts->rts_flag |= RTS_WPUT_PENDING;
846 	rts_wput(q, mp);
847 	while (rts->rts_flag & RTS_WPUT_PENDING)
848 		if (qwait_rw(q)) {
849 			/* RTS_WPUT_PENDING will be cleared below */
850 			rts->rts_error = EINTR;
851 			break;
852 		}
853 err_ret:
854 	rts->rts_flag &= ~(RTS_WPUT_PENDING | RTS_WRW_PENDING);
855 	return (rts->rts_error);
856 }
857 
858 /*
859  * This routine handles all messages passed downstream. It either
860  * consumes the message or passes it downstream; it never queues a
861  * a message. The data messages that go down are wrapped in an IOCTL
862  * message.
863  */
864 static void
865 rts_wput(queue_t *q, mblk_t *mp)
866 {
867 	uchar_t	*rptr = mp->b_rptr;
868 	mblk_t	*mp1;
869 
870 	switch (mp->b_datap->db_type) {
871 	case M_DATA:
872 		break;
873 	case M_PROTO:
874 	case M_PCPROTO:
875 		if ((mp->b_wptr - rptr) == sizeof (struct T_data_req)) {
876 			/* Expedite valid T_DATA_REQ to below the switch */
877 			if (((union T_primitives *)rptr)->type == T_DATA_REQ) {
878 				mp1 = mp->b_cont;
879 				freeb(mp);
880 				if (mp1 == NULL)
881 					return;
882 				mp = mp1;
883 				break;
884 			}
885 		}
886 		/* FALLTHRU */
887 	default:
888 		rts_wput_other(q, mp);
889 		return;
890 	}
891 
892 
893 	mp1 = rts_ioctl_alloc(mp, DB_CRED(mp));
894 	if (mp1 == NULL) {
895 		rts_t	*rts = (rts_t *)q->q_ptr;
896 
897 		ASSERT(rts != NULL);
898 		freemsg(mp);
899 		if (rts->rts_flag & RTS_WPUT_PENDING) {
900 			rts->rts_error = ENOMEM;
901 			rts->rts_flag &= ~RTS_WPUT_PENDING;
902 		}
903 		return;
904 	}
905 	putnext(q, mp1);
906 }
907 
908 
909 /*
910  * Handles all the control message, if it
911  * can not understand it, it will
912  * pass down stream.
913  */
914 static void
915 rts_wput_other(queue_t *q, mblk_t *mp)
916 {
917 	uchar_t	*rptr = mp->b_rptr;
918 	rts_t	*rts;
919 	struct iocblk	*iocp;
920 	cred_t	*cr;
921 
922 	rts = (rts_t *)q->q_ptr;
923 
924 	cr = DB_CREDDEF(mp, rts->rts_credp);
925 
926 	switch (mp->b_datap->db_type) {
927 	case M_PROTO:
928 	case M_PCPROTO:
929 		if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) {
930 			/*
931 			 * If the message does not contain a PRIM_type,
932 			 * throw it away.
933 			 */
934 			freemsg(mp);
935 			return;
936 		}
937 		switch (((union T_primitives *)rptr)->type) {
938 		case T_BIND_REQ:
939 		case O_T_BIND_REQ:
940 			rts_bind(q, mp);
941 			return;
942 		case T_UNBIND_REQ:
943 			rts_unbind(q, mp);
944 			return;
945 		case T_CAPABILITY_REQ:
946 			rts_capability_req(q, mp);
947 			return;
948 		case T_INFO_REQ:
949 			rts_info_req(q, mp);
950 			return;
951 		case T_SVR4_OPTMGMT_REQ:
952 			(void) svr4_optcom_req(q, mp, cr, &rts_opt_obj);
953 			return;
954 		case T_OPTMGMT_REQ:
955 			(void) tpi_optcom_req(q, mp, cr, &rts_opt_obj);
956 			return;
957 		case O_T_CONN_RES:
958 		case T_CONN_RES:
959 		case T_DISCON_REQ:
960 			/* Not supported by rts. */
961 			rts_err_ack(q, mp, TNOTSUPPORT, 0);
962 			return;
963 		case T_DATA_REQ:
964 		case T_EXDATA_REQ:
965 		case T_ORDREL_REQ:
966 			/* Illegal for rts. */
967 			freemsg(mp);
968 			(void) putnextctl1(RD(q), M_ERROR, EPROTO);
969 			return;
970 		default:
971 			break;
972 		}
973 		break;
974 	case M_IOCTL:
975 		iocp = (struct iocblk *)mp->b_rptr;
976 		switch (iocp->ioc_cmd) {
977 		case ND_SET:
978 		case ND_GET:
979 			if (nd_getset(q, rts_g_nd, mp)) {
980 				qreply(q, mp);
981 				return;
982 			}
983 			break;
984 		case TI_GETPEERNAME:
985 			mi_copyin(q, mp, NULL,
986 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
987 			return;
988 		default:
989 			break;
990 		}
991 	case M_IOCDATA:
992 		rts_wput_iocdata(q, mp);
993 		return;
994 	default:
995 		break;
996 	}
997 	putnext(q, mp);
998 }
999 
1000 /*
1001  * Called by rts_wput_other to handle all M_IOCDATA messages.
1002  */
1003 static void
1004 rts_wput_iocdata(queue_t *q, mblk_t *mp)
1005 {
1006 	struct sockaddr	*rtsaddr;
1007 	mblk_t	*mp1;
1008 	STRUCT_HANDLE(strbuf, sb);
1009 	struct iocblk	*iocp	= (struct iocblk *)mp->b_rptr;
1010 
1011 	/* Make sure it is one of ours. */
1012 	switch (iocp->ioc_cmd) {
1013 	case TI_GETPEERNAME:
1014 		break;
1015 	default:
1016 		putnext(q, mp);
1017 		return;
1018 	}
1019 	switch (mi_copy_state(q, mp, &mp1)) {
1020 	case -1:
1021 		return;
1022 	case MI_COPY_CASE(MI_COPY_IN, 1):
1023 		break;
1024 	case MI_COPY_CASE(MI_COPY_OUT, 1):
1025 		/* Copy out the strbuf. */
1026 		mi_copyout(q, mp);
1027 		return;
1028 	case MI_COPY_CASE(MI_COPY_OUT, 2):
1029 		/* All done. */
1030 		mi_copy_done(q, mp, 0);
1031 		return;
1032 	default:
1033 		mi_copy_done(q, mp, EPROTO);
1034 		return;
1035 	}
1036 	STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
1037 	if (STRUCT_FGET(sb, maxlen) < (int)sizeof (sin_t)) {
1038 		mi_copy_done(q, mp, EINVAL);
1039 		return;
1040 	}
1041 	switch (iocp->ioc_cmd) {
1042 	case TI_GETPEERNAME:
1043 		break;
1044 	default:
1045 		mi_copy_done(q, mp, EPROTO);
1046 		return;
1047 	}
1048 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), sizeof (sin_t),
1049 	    B_TRUE);
1050 	if (mp1 == NULL)
1051 		return;
1052 	STRUCT_FSET(sb, len, (int)sizeof (sin_t));
1053 	rtsaddr = (struct sockaddr *)mp1->b_rptr;
1054 	mp1->b_wptr = (uchar_t *)&rtsaddr[1];
1055 	bzero(rtsaddr, sizeof (struct sockaddr));
1056 	rtsaddr->sa_family = AF_ROUTE;
1057 	/* Copy out the address */
1058 	mi_copyout(q, mp);
1059 }
1060 
1061 static void
1062 rts_rput(queue_t *q, mblk_t *mp)
1063 {
1064 	rts_t	*rts;
1065 	struct iocblk	*iocp;
1066 	mblk_t *mp1;
1067 	struct T_data_ind *tdi;
1068 
1069 	rts = (rts_t *)q->q_ptr;
1070 	switch (mp->b_datap->db_type) {
1071 	case M_IOCACK:
1072 	case M_IOCNAK:
1073 		iocp = (struct iocblk *)mp->b_rptr;
1074 		if (rts->rts_flag & (RTS_WPUT_PENDING|RTS_OPEN_PENDING)) {
1075 			if (rts->rts_flag & RTS_WPUT_PENDING)
1076 				rts->rts_flag &= ~RTS_WPUT_PENDING;
1077 			else
1078 				rts->rts_flag &= ~RTS_OPEN_PENDING;
1079 			rts->rts_error = iocp->ioc_error;
1080 			freemsg(mp);
1081 			return;
1082 		}
1083 		break;
1084 	case M_DATA:
1085 		/*
1086 		 * Prepend T_DATA_IND to prevent the stream head from
1087 		 * consolidating multiple messages together.
1088 		 * If the allocation fails just send up the M_DATA.
1089 		 */
1090 		mp1 = allocb(sizeof (*tdi), BPRI_MED);
1091 		if (mp1 != NULL) {
1092 			mp1->b_cont = mp;
1093 			mp = mp1;
1094 
1095 			mp->b_datap->db_type = M_PROTO;
1096 			mp->b_wptr += sizeof (*tdi);
1097 			tdi = (struct T_data_ind *)mp->b_rptr;
1098 			tdi->PRIM_type = T_DATA_IND;
1099 			tdi->MORE_flag = 0;
1100 		}
1101 		break;
1102 	default:
1103 		break;
1104 	}
1105 	putnext(q, mp);
1106 }
1107 
1108 
1109 void
1110 rts_ddi_init(void)
1111 {
1112 	rts_max_optsize = optcom_max_optsize(rts_opt_obj.odb_opt_des_arr,
1113 	    rts_opt_obj.odb_opt_arr_cnt);
1114 }
1115