xref: /titanic_51/usr/src/uts/common/inet/optcom.c (revision 7aec1d6e253b21f9e9b7ef68b4d81ab9859b51fe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 /* Copyright (c) 1990 Mentat Inc. */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 /*
31  * This file contains common code for handling Options Management requests.
32  */
33 
34 #include <sys/types.h>
35 #include <sys/stream.h>
36 #include <sys/stropts.h>
37 #include <sys/strlog.h>
38 #include <sys/strsubr.h>
39 #include <sys/errno.h>
40 #define	_SUN_TPI_VERSION 2
41 #include <sys/tihdr.h>
42 #include <sys/timod.h>
43 #include <sys/socket.h>
44 #include <sys/ddi.h>
45 #include <sys/cmn_err.h>
46 #include <sys/debug.h>		/* for ASSERT */
47 #include <sys/policy.h>
48 
49 #include <inet/common.h>
50 #include <inet/mi.h>
51 #include <inet/nd.h>
52 #include <netinet/ip6.h>
53 #include <inet/ip.h>
54 #include <inet/mib2.h>
55 #include <netinet/in.h>
56 #include <netinet/tcp.h>
57 #include <netinet/ip_mroute.h>
58 #include "optcom.h"
59 
60 #include <inet/optcom.h>
61 
62 /*
63  * Function prototypes
64  */
65 static t_scalar_t process_topthdrs_first_pass(mblk_t *, cred_t *, optdb_obj_t *,
66     boolean_t *, size_t *);
67 static t_scalar_t do_options_second_pass(queue_t *q, mblk_t *reqmp,
68     mblk_t *ack_mp, cred_t *, optdb_obj_t *dbobjp,
69     mblk_t *first_mp, boolean_t is_restart, boolean_t *queued_statusp);
70 static t_uscalar_t get_worst_status(t_uscalar_t, t_uscalar_t);
71 static int do_opt_default(queue_t *, struct T_opthdr *, uchar_t **,
72     t_uscalar_t *, cred_t *, optdb_obj_t *);
73 static void do_opt_current(queue_t *, struct T_opthdr *, uchar_t **,
74     t_uscalar_t *, cred_t *cr, optdb_obj_t *);
75 static int do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
76     uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
77     cred_t *, optdb_obj_t *dbobjp, mblk_t *first_mp);
78 static opdes_t *opt_chk_lookup(t_uscalar_t, t_uscalar_t, opdes_t *, uint_t);
79 static boolean_t opt_level_valid(t_uscalar_t, optlevel_t *, uint_t);
80 static size_t opt_level_allopts_lengths(t_uscalar_t, opdes_t *, uint_t);
81 static boolean_t opt_length_ok(opdes_t *, struct T_opthdr *);
82 static t_uscalar_t optcom_max_optbuf_len(opdes_t *, uint_t);
83 static boolean_t opt_bloated_maxsize(opdes_t *);
84 
85 /* Common code for sending back a T_ERROR_ACK. */
86 void
87 optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
88 {
89 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
90 		qreply(q, mp);
91 }
92 
93 /*
94  * The option management routines svr4_optcom_req() and tpi_optcom_req() use
95  * callback functions as arguments. Here are the interfaces expected of
96  * the callback functions:
97  *
98  *
99  * (1) deffn(q, optlevel, optname, optvalp)
100  *
101  *	- Function only called when default value comes from protocol
102  *	 specific code and not the option database table (indicated by
103  *	  OP_DEF_FN property in option database.)
104  *	- Error return is -1. Valid returns are >=0.
105  *	- When valid, the return value represents the length used for storing
106  *		the default value of the option.
107  *      - Error return implies the called routine did not recognize this
108  *              option. Something downstream could so input is left unchanged
109  *              in request buffer.
110  *
111  * (2) getfn(q, optlevel, optname, optvalp)
112  *
113  *	- Error return is -1. Valid returns are >=0.
114  *	- When valid, the return value represents the length used for storing
115  *		the actual value of the option.
116  *      - Error return implies the called routine did not recognize this
117  *              option. Something downstream could so input is left unchanged
118  *              in request buffer.
119  *
120  * (3) setfn(q, optset_context, optlevel, optname, inlen, invalp,
121  *	outlenp, outvalp, attrp, cr);
122  *
123  *	- OK return is 0, Error code is returned as a non-zero argument.
124  *      - If negative it is ignored by svr4_optcom_req(). If positive, error
125  *        is returned. A negative return implies that option, while handled on
126  *	  this stack is not handled at this level and will be handled further
127  *	  downstream.
128  *	- Both negative and positive errors are treated as errors in an
129  *	  identical manner by tpi_optcom_req(). The errors affect the "status"
130  *	  field of each option's T_opthdr. If successful, an appropriate
131  *	  success result is carried. If error, it is instantiated to "failure"
132  *	  at the topmost level and left unchanged at other levels. (This
133  *	  "failure" can turn to a success at another level).
134  *	- optset_context passed for tpi_optcom_req(). It is interpreted as:
135  *        - SETFN_OPTCOM_CHECKONLY
136  *		semantics are to pretend to set the value and report
137  *		back if it would be successful.
138  *		This is used with T_CHECK semantics in XTI
139  *        - SETFN_OPTCOM_NEGOTIATE
140  *		set the value. Call from option management primitive
141  *		T_OPTMGMT_REQ when T_NEGOTIATE flags is used.
142  *	  - SETFN_UD_NEGOTIATE
143  *		option request came riding on UNITDATA primitive; most often
144  *		has "this datagram" semantics to influence properties
145  *		affecting an outgoing datagram or associated with a received
146  *		datagram
147  *		[ Note: XTI permits this use outside of "this datagram"
148  *		semantics also and permits setting "management related"
149  *		options in this	context and its test suite enforces it ]
150  *	  - SETFN_CONN_NEGOTIATE
151  *		option request came riding on CONN_REQ/RES primitive and
152  *		most often has "this connection" (negotiation during
153  *		"connection establishment") semantics.
154  *		[ Note: XTI permits use of these outside of "this connection"
155  *		semantics and permits "management related" options in this
156  *		context and its test suite enforces it. ]
157  *
158  *	- inlen, invalp is the option length,value requested to be set.
159  *	- outlenp, outvalp represent return parameters which contain the
160  *	  value set and it might be different from one passed on input.
161  *	- attrp points to a data structure that's used by v6 modules to
162  *	  store ancillary data options or sticky options.
163  *	- cr points to the caller's credentials
164  *	- the caller might pass same buffers for input and output and the
165  *	  routine should protect against this case by not updating output
166  *	  buffers until it is done referencing input buffers and any other
167  *	  issues (e.g. not use bcopy() if we do not trust what it does).
168  *      - If option is not known, it returns error. We randomly pick EINVAL.
169  *        It can however get called with options that are handled downstream
170  *        or upstream so for svr4_optcom_req(), it does not return error for
171  *        negative return values.
172  *
173  */
174 
175 /*
176  * Upper Level Protocols call this routine when they receive
177  * a T_SVR4_OPTMGMT_REQ message.  They supply callback functions
178  * for setting a new value for a single option, getting the
179  * current value for a single option, and checking for support
180  * of a single option.  svr4_optcom_req validates the option management
181  * buffer passed in, and calls the appropriate routines to do the
182  * job requested.
183  * XXX Code below needs some restructuring after we have some more
184  * macros to support 'struct opthdr' in the headers.
185  *
186  * IP-MT notes: The option management framework functions svr4_optcom_req() and
187  * tpi_optcom_req() allocate and prepend an M_CTL mblk to the actual
188  * T_optmgmt_req mblk and pass the chain as an additional parameter to the
189  * protocol set functions. If a protocol set function (such as ip_opt_set)
190  * cannot process the option immediately it can return EINPROGRESS. ip_opt_set
191  * enqueues the message in the appropriate sq and returns EINPROGRESS. Later
192  * the sq framework arranges to restart this operation and passes control to
193  * the restart function ip_restart_optmgmt() which in turn calls
194  * svr4_optcom_req() or tpi_optcom_req() to restart the option processing.
195  */
196 int
197 svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp)
198 {
199 	pfi_t	deffn = dbobjp->odb_deffn;
200 	pfi_t	getfn = dbobjp->odb_getfn;
201 	opt_set_fn setfn = dbobjp->odb_setfn;
202 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
203 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
204 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
205 	opt_restart_t *or;
206 	struct opthdr *restart_opt;
207 	boolean_t is_restart = B_FALSE;
208 	mblk_t	*first_mp;
209 
210 	t_uscalar_t max_optbuf_len;
211 	int len;
212 	mblk_t	*mp1 = NULL;
213 	struct opthdr *next_opt;
214 	struct opthdr *opt;
215 	struct opthdr *opt1;
216 	struct opthdr *opt_end;
217 	struct opthdr *opt_start;
218 	opdes_t	*optd;
219 	boolean_t	pass_to_next = B_FALSE;
220 	boolean_t	pass_to_ip = B_FALSE;
221 	boolean_t	is_tcp;
222 	struct T_optmgmt_ack *toa;
223 	struct T_optmgmt_req *tor;
224 
225 	is_tcp = (dbobjp == &tcp_opt_obj);
226 
227 	/*
228 	 * Allocate M_CTL and prepend to the packet for restarting this
229 	 * option if needed. IP may need to queue and restart the option
230 	 * if it cannot obtain exclusive conditions immediately. Please see
231 	 * IP-MT notes before the start of svr4_optcom_req
232 	 */
233 	if (mp->b_datap->db_type == M_CTL) {
234 		is_restart = B_TRUE;
235 		first_mp = mp;
236 		mp = mp->b_cont;
237 		ASSERT(mp->b_wptr - mp->b_rptr >=
238 		    sizeof (struct T_optmgmt_req));
239 		tor = (struct T_optmgmt_req *)mp->b_rptr;
240 		ASSERT(tor->MGMT_flags == T_NEGOTIATE);
241 
242 		or = (opt_restart_t *)first_mp->b_rptr;
243 		opt_start = or->or_start;
244 		opt_end = or->or_end;
245 		restart_opt = or->or_ropt;
246 		goto restart;
247 	}
248 
249 	tor = (struct T_optmgmt_req *)mp->b_rptr;
250 	/* Verify message integrity. */
251 	if (mp->b_wptr - mp->b_rptr < sizeof (struct T_optmgmt_req))
252 		goto bad_opt;
253 	/* Verify MGMT_flags legal */
254 	switch (tor->MGMT_flags) {
255 	case T_DEFAULT:
256 	case T_NEGOTIATE:
257 	case T_CURRENT:
258 	case T_CHECK:
259 		/* OK - legal request flags */
260 		break;
261 	default:
262 		optcom_err_ack(q, mp, TBADFLAG, 0);
263 		return (0);
264 	}
265 	if (tor->MGMT_flags == T_DEFAULT) {
266 		/* Is it a request for default option settings? */
267 
268 		/*
269 		 * Note: XXX TLI and TPI specification was unclear about
270 		 * semantics of T_DEFAULT and the following historical note
271 		 * and its interpretation is incorrect (it implies a request
272 		 * for default values of only the identified options not all.
273 		 * The semantics have been explained better in XTI spec.)
274 		 * However, we do not modify (comment or code) here to keep
275 		 * compatibility.
276 		 * We can rethink this if it ever becomes an issue.
277 		 * ----historical comment start------
278 		 * As we understand it, the input buffer is meaningless
279 		 * so we ditch the message.  A T_DEFAULT request is a
280 		 * request to obtain a buffer containing defaults for
281 		 * all supported options, so we allocate a maximum length
282 		 * reply.
283 		 * ----historical comment end -------
284 		 */
285 		/* T_DEFAULT not passed down */
286 		ASSERT(topmost_tpiprovider == B_TRUE);
287 		freemsg(mp);
288 		max_optbuf_len = optcom_max_optbuf_len(opt_arr,
289 		    opt_arr_cnt);
290 		mp = allocb(max_optbuf_len, BPRI_MED);
291 		if (!mp) {
292 no_mem:;
293 			optcom_err_ack(q, mp, TSYSERR, ENOMEM);
294 			return (0);
295 		}
296 
297 		/* Initialize the T_optmgmt_ack header. */
298 		toa = (struct T_optmgmt_ack *)mp->b_rptr;
299 		bzero((char *)toa, max_optbuf_len);
300 		toa->PRIM_type = T_OPTMGMT_ACK;
301 		toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
302 		/* TODO: Is T_DEFAULT the right thing to put in MGMT_flags? */
303 		toa->MGMT_flags = T_DEFAULT;
304 
305 		/* Now walk the table of options passed in */
306 		opt = (struct opthdr *)&toa[1];
307 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
308 			/*
309 			 * All the options in the table of options passed
310 			 * in are by definition supported by the protocol
311 			 * calling this function.
312 			 */
313 			if (!OA_READ_PERMISSION(optd, cr))
314 				continue;
315 			opt->level = optd->opdes_level;
316 			opt->name = optd->opdes_name;
317 			if (!(optd->opdes_props & OP_DEF_FN) ||
318 			    ((len = (*deffn)(q, opt->level,
319 				opt->name, (uchar_t *)&opt[1])) < 0)) {
320 				/*
321 				 * Fill length and value from table.
322 				 *
323 				 * Default value not instantiated from function
324 				 * (or the protocol specific function failed it;
325 				 * In this interpretation of T_DEFAULT, this is
326 				 * the best we can do)
327 				 */
328 				switch (optd->opdes_size) {
329 				/*
330 				 * Since options are guaranteed aligned only
331 				 * on a 4 byte boundary (t_scalar_t) any
332 				 * option that is greater in size will default
333 				 * to the bcopy below
334 				 */
335 				case sizeof (int32_t):
336 					*(int32_t *)&opt[1] =
337 					    (int32_t)optd->opdes_default;
338 					break;
339 				case sizeof (int16_t):
340 					*(int16_t *)&opt[1] =
341 					    (int16_t)optd->opdes_default;
342 					break;
343 				case sizeof (int8_t):
344 					*(int8_t *)&opt[1] =
345 					    (int8_t)optd->opdes_default;
346 					break;
347 				default:
348 					/*
349 					 * other length but still assume
350 					 * fixed - use bcopy
351 					 */
352 					bcopy(optd->opdes_defbuf,
353 					    &opt[1], optd->opdes_size);
354 					break;
355 				}
356 				opt->len = optd->opdes_size;
357 			}
358 			else
359 				opt->len = (t_uscalar_t)len;
360 			opt = (struct opthdr *)((char *)&opt[1] +
361 			    _TPI_ALIGN_OPT(opt->len));
362 		}
363 
364 		/* Now record the final length. */
365 		toa->OPT_length = (t_scalar_t)((char *)opt - (char *)&toa[1]);
366 		mp->b_wptr = (uchar_t *)opt;
367 		mp->b_datap->db_type = M_PCPROTO;
368 		/* Ship it back. */
369 		qreply(q, mp);
370 		return (0);
371 	}
372 	/* T_DEFAULT processing complete - no more T_DEFAULT */
373 
374 	/*
375 	 * For T_NEGOTIATE, T_CURRENT, and T_CHECK requests, we make a
376 	 * pass through the input buffer validating the details and
377 	 * making sure each option is supported by the protocol.
378 	 */
379 	if ((opt_start = (struct opthdr *)mi_offset_param(mp,
380 	    tor->OPT_offset, tor->OPT_length)) == NULL)
381 		goto bad_opt;
382 	if (!__TPI_OPT_ISALIGNED(opt_start))
383 		goto bad_opt;
384 
385 	opt_end = (struct opthdr *)((uchar_t *)opt_start +
386 	    tor->OPT_length);
387 
388 	for (opt = opt_start; opt < opt_end; opt = next_opt) {
389 		/*
390 		 * Verify we have room to reference the option header
391 		 * fields in the option buffer.
392 		 */
393 		if ((uchar_t *)opt + sizeof (struct opthdr) >
394 		    (uchar_t *)opt_end)
395 			goto bad_opt;
396 		/*
397 		 * We now compute pointer to next option in buffer 'next_opt'
398 		 * The next_opt computation above below 'opt->len' initialized
399 		 * by application which cannot be trusted. The usual value
400 		 * too large will be captured by the loop termination condition
401 		 * above. We check for the following which it will miss.
402 		 * 	-pointer space wraparound arithmetic overflow
403 		 *	-last option in buffer with 'opt->len' being too large
404 		 *	 (only reason 'next_opt' should equal or exceed
405 		 *	 'opt_end' for last option is roundup unless length is
406 		 *	 too-large/invalid)
407 		 */
408 		next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
409 		    _TPI_ALIGN_OPT(opt->len));
410 
411 		if ((uchar_t *)next_opt < (uchar_t *)&opt[1] ||
412 		    ((next_opt >= opt_end) &&
413 			(((uchar_t *)next_opt - (uchar_t *)opt_end) >=
414 			    __TPI_ALIGN_SIZE)))
415 			goto bad_opt;
416 
417 		/* sanity check */
418 		if (opt->name == T_ALLOPT)
419 			goto bad_opt;
420 
421 		/* Find the option in the opt_arr. */
422 		if ((optd = opt_chk_lookup(opt->level, opt->name,
423 		    opt_arr, opt_arr_cnt)) == NULL) {
424 			/*
425 			 * Not found, that is a bad thing if
426 			 * the caller is a tpi provider
427 			 */
428 			if (topmost_tpiprovider)
429 				goto bad_opt;
430 			else
431 				continue; /* skip unmodified */
432 		}
433 
434 		/* Additional checks dependent on operation. */
435 		switch (tor->MGMT_flags) {
436 		case T_NEGOTIATE:
437 			if (!OA_WRITE_OR_EXECUTE(optd, cr)) {
438 				/* can't negotiate option */
439 				if (!(OA_MATCHED_PRIV(optd, cr)) &&
440 				    OA_WX_ANYPRIV(optd)) {
441 					/*
442 					 * not privileged but privilege
443 					 * will help negotiate option.
444 					 */
445 					optcom_err_ack(q, mp, TACCES, 0);
446 					return (0);
447 				} else
448 					goto bad_opt;
449 			}
450 			/*
451 			 * Verify size for options
452 			 * Note: For retaining compatibility with historical
453 			 * behavior, variable lengths options will have their
454 			 * length verified in the setfn() processing.
455 			 * In order to be compatible with SunOS 4.X we return
456 			 * EINVAL errors for bad lengths.
457 			 */
458 			if (!(optd->opdes_props & OP_VARLEN)) {
459 				/* fixed length - size must match */
460 				if (opt->len != optd->opdes_size) {
461 					optcom_err_ack(q, mp, TSYSERR, EINVAL);
462 					return (0);
463 				}
464 			}
465 			break;
466 
467 		case T_CHECK:
468 			if (!OA_RWX_ANYPRIV(optd))
469 				/* any of "rwx" permission but not not none */
470 				goto bad_opt;
471 			/*
472 			 * XXX Since T_CURRENT was not there in TLI and the
473 			 * official TLI inspired TPI standard, getsockopt()
474 			 * API uses T_CHECK (for T_CURRENT semantics)
475 			 * The following fallthru makes sense because of its
476 			 * historical use as semantic equivalent to T_CURRENT.
477 			 */
478 			/* FALLTHRU */
479 		case T_CURRENT:
480 			if (!OA_READ_PERMISSION(optd, cr)) {
481 				/* can't read option value */
482 				if (!(OA_MATCHED_PRIV(optd, cr)) &&
483 				    OA_R_ANYPRIV(optd)) {
484 					/*
485 					 * not privileged but privilege
486 					 * will help in reading option value.
487 					 */
488 					optcom_err_ack(q, mp, TACCES, 0);
489 					return (0);
490 				} else
491 					goto bad_opt;
492 			}
493 			break;
494 
495 		default:
496 			optcom_err_ack(q, mp, TBADFLAG, 0);
497 			return (0);
498 		}
499 		/* We liked it.  Keep going. */
500 	} /* end for loop scanning option buffer */
501 
502 	/* Now complete the operation as required. */
503 	switch (tor->MGMT_flags) {
504 	case T_CHECK:
505 		/*
506 		 * Historically used same as T_CURRENT (which was added to
507 		 * standard later). Code retained for compatibility.
508 		 */
509 		/* FALLTHROUGH */
510 	case T_CURRENT:
511 		/*
512 		 * Allocate a maximum size reply.  Perhaps we are supposed to
513 		 * assume that the input buffer includes space for the answers
514 		 * as well as the opthdrs, but we don't know that for sure.
515 		 * So, instead, we create a new output buffer, using the
516 		 * input buffer only as a list of options.
517 		 */
518 		max_optbuf_len = optcom_max_optbuf_len(opt_arr,
519 		    opt_arr_cnt);
520 		mp1 = allocb_cred(max_optbuf_len, cr);
521 		if (!mp1)
522 			goto no_mem;
523 		/* Initialize the header. */
524 		mp1->b_datap->db_type = M_PCPROTO;
525 		mp1->b_wptr = &mp1->b_rptr[sizeof (struct T_optmgmt_ack)];
526 		toa = (struct T_optmgmt_ack *)mp1->b_rptr;
527 		toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
528 		toa->MGMT_flags = tor->MGMT_flags;
529 		/*
530 		 * Walk through the input buffer again, this time adding
531 		 * entries to the output buffer for each option requested.
532 		 * Note, sanity of option header, last option etc, verified
533 		 * in first pass.
534 		 */
535 		opt1 = (struct opthdr *)&toa[1];
536 
537 		for (opt = opt_start; opt < opt_end; opt = next_opt) {
538 
539 		    next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
540 			_TPI_ALIGN_OPT(opt->len));
541 
542 			opt1->name = opt->name;
543 			opt1->level = opt->level;
544 			len = (*getfn)(q, opt->level,
545 			    opt->name, (uchar_t *)&opt1[1]);
546 			/*
547 			 * Failure means option is not recognized. Copy input
548 			 * buffer as is
549 			 */
550 			if (len < 0) {
551 				opt1->len = opt->len;
552 				bcopy(&opt[1], &opt1[1], opt->len);
553 				/*
554 				 * Pass the option down to IP only
555 				 * if TCP hasn't processed it.
556 				 */
557 				if (is_tcp)
558 					pass_to_ip = B_TRUE;
559 			} else {
560 				opt1->len = (t_uscalar_t)len;
561 			}
562 			opt1 = (struct opthdr *)((uchar_t *)&opt1[1] +
563 			    _TPI_ALIGN_OPT(opt1->len));
564 		} /* end for loop */
565 
566 		/* Record the final length. */
567 		toa->OPT_length = (t_scalar_t)((uchar_t *)opt1 -
568 		    (uchar_t *)&toa[1]);
569 		mp1->b_wptr = (uchar_t *)opt1;
570 		/* Ditch the input buffer. */
571 		freemsg(mp);
572 		mp = mp1;
573 		/* Always let the next module look at the option. */
574 		pass_to_next = B_TRUE;
575 		break;
576 
577 	case T_NEGOTIATE:
578 		first_mp = allocb(sizeof (opt_restart_t), BPRI_LO);
579 		if (first_mp == NULL) {
580 			optcom_err_ack(q, mp, TSYSERR, ENOMEM);
581 			return (0);
582 		}
583 		first_mp->b_datap->db_type = M_CTL;
584 		or = (opt_restart_t *)first_mp->b_rptr;
585 		or->or_start = opt_start;
586 		or->or_end =  opt_end;
587 		or->or_type = T_SVR4_OPTMGMT_REQ;
588 		or->or_private = 0;
589 		first_mp->b_cont = mp;
590 restart:
591 		/*
592 		 * Here we are expecting that the response buffer is exactly
593 		 * the same size as the input buffer.  We pass each opthdr
594 		 * to the protocol's set function.  If the protocol doesn't
595 		 * like it, it can update the value in it return argument.
596 		 */
597 		/*
598 		 * Pass each negotiated option through the protocol set
599 		 * function.
600 		 * Note: sanity check on option header values done in first
601 		 * pass and not repeated here.
602 		 */
603 		toa = (struct T_optmgmt_ack *)tor;
604 
605 		for (opt = is_restart ? restart_opt: opt_start; opt < opt_end;
606 		    opt = next_opt) {
607 			int error;
608 
609 			/*
610 			 * Point to the current option in or, in case this
611 			 * option has to be restarted later on
612 			 */
613 			or->or_ropt = opt;
614 			next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
615 			    _TPI_ALIGN_OPT(opt->len));
616 
617 			error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE,
618 			    opt->level, opt->name,
619 			    opt->len, (uchar_t *)&opt[1],
620 			    &opt->len, (uchar_t *)&opt[1], NULL, cr, first_mp);
621 			/*
622 			 * Treat positive "errors" as real.
623 			 * Note: negative errors are to be treated as
624 			 * non-fatal by svr4_optcom_req() and are
625 			 * returned by setfn() when it is passed an
626 			 * option it does not handle. Since the option
627 			 * passed opt_chk_lookup(), it is implied that
628 			 * it is valid but was either handled upstream
629 			 * or will be handled downstream.
630 			 */
631 			if (error == EINPROGRESS) {
632 				/*
633 				 * The message is queued and will be
634 				 * reprocessed later. Typically ip queued
635 				 * the message to get some exclusive conditions
636 				 * and later on calls this func again.
637 				 */
638 				return (EINPROGRESS);
639 			} else if (error > 0) {
640 				optcom_err_ack(q, mp, TSYSERR, error);
641 				freeb(first_mp);
642 				return (0);
643 			} else if (error < 0 && is_tcp) {
644 				/*
645 				 * Pass the option down to IP only
646 				 * if TCP hasn't processed it.
647 				 */
648 				pass_to_ip = B_TRUE;
649 			}
650 		}
651 		/* Done with the restart control mp. */
652 		freeb(first_mp);
653 		pass_to_next = B_TRUE;
654 		break;
655 	default:
656 		optcom_err_ack(q, mp, TBADFLAG, 0);
657 		return (0);
658 	}
659 
660 	if (pass_to_next && (q->q_next != NULL || pass_to_ip)) {
661 		/* Send it down to the next module and let it reply */
662 		toa->PRIM_type = T_SVR4_OPTMGMT_REQ; /* Changed by IP to ACK */
663 		if (q->q_next != NULL)
664 			putnext(q, mp);
665 		else
666 			ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
667 	} else {
668 		/* Set common fields in the header. */
669 		toa->MGMT_flags = T_SUCCESS;
670 		mp->b_datap->db_type = M_PCPROTO;
671 		toa->PRIM_type = T_OPTMGMT_ACK;
672 		qreply(q, mp);
673 	}
674 	return (0);
675 bad_opt:;
676 	optcom_err_ack(q, mp, TBADOPT, 0);
677 	return (0);
678 }
679 
680 /*
681  * New optcom_req inspired by TPI/XTI semantics
682  */
683 int
684 tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp)
685 {
686 	t_scalar_t t_error;
687 	mblk_t *toa_mp;
688 	boolean_t pass_to_next;
689 	size_t toa_len;
690 	struct T_optmgmt_ack *toa;
691 	struct T_optmgmt_req *tor =
692 	    (struct T_optmgmt_req *)mp->b_rptr;
693 
694 	opt_restart_t *or;
695 	boolean_t is_restart = B_FALSE;
696 	mblk_t	*first_mp = NULL;
697 	t_uscalar_t worst_status;
698 	boolean_t queued_status;
699 
700 	/*
701 	 * Allocate M_CTL and prepend to the packet for restarting this
702 	 * option if needed. IP may need to queue and restart the option
703 	 * if it cannot obtain exclusive conditions immediately. Please see
704 	 * IP-MT notes before the start of svr4_optcom_req
705 	 */
706 	if (mp->b_datap->db_type == M_CTL) {
707 		is_restart = B_TRUE;
708 		first_mp = mp;
709 		toa_mp = mp->b_cont;
710 		mp = toa_mp->b_cont;
711 		ASSERT(mp->b_wptr - mp->b_rptr >=
712 		    sizeof (struct T_optmgmt_req));
713 		tor = (struct T_optmgmt_req *)mp->b_rptr;
714 		ASSERT(tor->MGMT_flags == T_NEGOTIATE);
715 
716 		or = (opt_restart_t *)first_mp->b_rptr;
717 		goto restart;
718 	}
719 
720 	/* Verify message integrity. */
721 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_optmgmt_req)) {
722 		optcom_err_ack(q, mp, TBADOPT, 0);
723 		return (0);
724 	}
725 
726 	/* Verify MGMT_flags legal */
727 	switch (tor->MGMT_flags) {
728 	case T_DEFAULT:
729 	case T_NEGOTIATE:
730 	case T_CURRENT:
731 	case T_CHECK:
732 		/* OK - legal request flags */
733 		break;
734 	default:
735 		optcom_err_ack(q, mp, TBADFLAG, 0);
736 		return (0);
737 	}
738 
739 	/*
740 	 * In this design, there are two passes required on the input buffer
741 	 * mostly to accomodate variable length options and "T_ALLOPT" option
742 	 * which has the semantics "all options of the specified level".
743 	 *
744 	 * For T_DEFAULT, T_NEGOTIATE, T_CURRENT, and T_CHECK requests, we make
745 	 * a pass through the input buffer validating the details and making
746 	 * sure each option is supported by the protocol. We also determine the
747 	 * length of the option buffer to return. (Variable length options and
748 	 * T_ALLOPT mean that length can be different for output buffer).
749 	 */
750 
751 	pass_to_next = B_FALSE;	/* initial value */
752 	toa_len = 0;		/* initial value */
753 
754 	/*
755 	 * First pass, we do the following
756 	 *	- estimate cumulative length needed for results
757 	 *	- set "status" field based on permissions, option header check
758 	 *	  etc.
759 	 *	- determine "pass_to_next" whether we need to send request to
760 	 *	  downstream module/driver.
761 	 */
762 	if ((t_error = process_topthdrs_first_pass(mp, cr, dbobjp,
763 	    &pass_to_next, &toa_len)) != 0) {
764 		optcom_err_ack(q, mp, t_error, 0);
765 		return (0);
766 	}
767 
768 	/*
769 	 * A validation phase of the input buffer is done. We have also
770 	 * obtained the length requirement and and other details about the
771 	 * input and we liked input buffer so far.  We make another scan
772 	 * through the input now and generate the output necessary to complete
773 	 * the operation.
774 	 */
775 
776 	toa_mp = allocb_cred(toa_len, cr);
777 	if (!toa_mp) {
778 		optcom_err_ack(q, mp, TSYSERR, ENOMEM);
779 		return (0);
780 	}
781 
782 	first_mp = allocb(sizeof (opt_restart_t), BPRI_LO);
783 	if (first_mp == NULL) {
784 		freeb(toa_mp);
785 		optcom_err_ack(q, mp, TSYSERR, ENOMEM);
786 		return (0);
787 	}
788 	first_mp->b_datap->db_type = M_CTL;
789 	or = (opt_restart_t *)first_mp->b_rptr;
790 	/*
791 	 * Set initial values for generating output.
792 	 */
793 	or->or_worst_status = T_SUCCESS;
794 	or->or_type = T_OPTMGMT_REQ;
795 	or->or_private = 0;
796 	/* remaining fields fileed in do_options_second_pass */
797 
798 restart:
799 	/*
800 	 * This routine makes another pass through the option buffer this
801 	 * time acting on the request based on "status" result in the
802 	 * first pass. It also performs "expansion" of T_ALLOPT into
803 	 * all options of a certain level and acts on each for this request.
804 	 */
805 	if ((t_error = do_options_second_pass(q, mp, toa_mp, cr, dbobjp,
806 	    first_mp, is_restart, &queued_status)) != 0) {
807 		freemsg(toa_mp);
808 		optcom_err_ack(q, mp, t_error, 0);
809 		return (0);
810 	}
811 	if (queued_status) {
812 		/* Option will be restarted */
813 		return (EINPROGRESS);
814 	}
815 	worst_status = or->or_worst_status;
816 	/* Done with the first mp */
817 	freeb(first_mp);
818 	toa_mp->b_cont = NULL;
819 
820 	/*
821 	 * Following code relies on the coincidence that T_optmgmt_req
822 	 * and T_optmgmt_ack are identical in binary representation
823 	 */
824 	toa = (struct T_optmgmt_ack *)toa_mp->b_rptr;
825 	toa->OPT_length = (t_scalar_t)(toa_mp->b_wptr - (toa_mp->b_rptr +
826 	    sizeof (struct T_optmgmt_ack)));
827 	toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
828 
829 	toa->MGMT_flags = tor->MGMT_flags;
830 
831 
832 	freemsg(mp);		/* free input mblk */
833 
834 	/*
835 	 * If there is atleast one option that requires a downstream
836 	 * forwarding and if it is possible, we forward the message
837 	 * downstream. Else we ack it.
838 	 */
839 	if (pass_to_next && (q->q_next != NULL || dbobjp == &tcp_opt_obj)) {
840 		/*
841 		 * We pass it down as T_OPTMGMT_REQ. This code relies
842 		 * on the happy coincidence that T_optmgmt_req and
843 		 * T_optmgmt_ack are identical data structures
844 		 * at the binary representation level.
845 		 */
846 		toa_mp->b_datap->db_type = M_PROTO;
847 		toa->PRIM_type = T_OPTMGMT_REQ;
848 		if (q->q_next != NULL)
849 			putnext(q, toa_mp);
850 		else
851 			ip_output(Q_TO_CONN(q), toa_mp, q, IP_WPUT);
852 	} else {
853 		toa->PRIM_type = T_OPTMGMT_ACK;
854 		toa_mp->b_datap->db_type = M_PCPROTO;
855 		toa->MGMT_flags |= worst_status; /* XXX "worst" or "OR" TPI ? */
856 		qreply(q, toa_mp);
857 	}
858 	return (0);
859 }
860 
861 
862 /*
863  * Following routine makes a pass through option buffer in mp and performs the
864  * following tasks.
865  *	- estimate cumulative length needed for results
866  *	- set "status" field based on permissions, option header check
867  *	  etc.
868  *	- determine "pass_to_next" whether we need to send request to
869  *	  downstream module/driver.
870  */
871 
872 static t_scalar_t
873 process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
874     boolean_t *pass_to_nextp, size_t *toa_lenp)
875 {
876 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
877 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
878 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
879 	optlevel_t *valid_level_arr = dbobjp->odb_valid_levels_arr;
880 	uint_t valid_level_arr_cnt = dbobjp->odb_valid_levels_arr_cnt;
881 	struct T_opthdr *opt;
882 	struct T_opthdr *opt_start, *opt_end;
883 	opdes_t	*optd;
884 	size_t allopt_len;
885 	struct T_optmgmt_req *tor =
886 	    (struct T_optmgmt_req *)mp->b_rptr;
887 
888 	*toa_lenp = sizeof (struct T_optmgmt_ack); /* initial value */
889 
890 	if ((opt_start = (struct T_opthdr *)
891 	    mi_offset_param(mp, tor->OPT_offset, tor->OPT_length)) == NULL) {
892 		return (TBADOPT);
893 	}
894 	if (!__TPI_TOPT_ISALIGNED(opt_start))
895 		return (TBADOPT);
896 
897 	opt_end = (struct T_opthdr *)((uchar_t *)opt_start + tor->OPT_length);
898 
899 	for (opt = opt_start; opt && (opt < opt_end);
900 	    opt = _TPI_TOPT_NEXTHDR(opt_start, tor->OPT_length, opt)) {
901 		/*
902 		 * Validate the option for length and alignment
903 		 * before accessing anything in it.
904 		 */
905 		if (!(_TPI_TOPT_VALID(opt, opt_start, opt_end)))
906 			return (TBADOPT);
907 
908 		/* Find the option in the opt_arr. */
909 		if (opt->name != T_ALLOPT) {
910 			optd = opt_chk_lookup(opt->level, opt->name,
911 			    opt_arr, opt_arr_cnt);
912 			if (optd == NULL) {
913 				/*
914 				 * Option not found
915 				 *
916 				 * Verify if level is "valid" or not.
917 				 * Note: This check is required by XTI
918 				 *
919 				 * TPI provider always initializes
920 				 * the "not supported" (or whatever) status
921 				 * for the options. Other levels leave status
922 				 * unchanged if they do not understand an
923 				 * option.
924 				 */
925 				if (topmost_tpiprovider) {
926 					if (!opt_level_valid(opt->level,
927 					    valid_level_arr,
928 					    valid_level_arr_cnt))
929 						return (TBADOPT);
930 					/*
931 					 * level is valid - initialize
932 					 * option as not supported
933 					 */
934 					opt->status = T_NOTSUPPORT;
935 				}
936 
937 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
938 				continue;
939 			}
940 		} else {
941 			/*
942 			 * Handle T_ALLOPT case as a special case.
943 			 * Note: T_ALLOPT does not mean anything
944 			 * for T_CHECK operation.
945 			 */
946 			allopt_len = 0;
947 			if (tor->MGMT_flags == T_CHECK ||
948 			    !topmost_tpiprovider ||
949 			    ((allopt_len = opt_level_allopts_lengths(opt->level,
950 				opt_arr, opt_arr_cnt)) == 0)) {
951 				/*
952 				 * This is confusing but correct !
953 				 * It is not valid to to use T_ALLOPT with
954 				 * T_CHECK flag.
955 				 *
956 				 * T_ALLOPT is assumed "expanded" at the
957 				 * topmost_tpiprovider level so it should not
958 				 * be there as an "option name" if this is not
959 				 * a topmost_tpiprovider call and we fail it.
960 				 *
961 				 * opt_level_allopts_lengths() is used to verify
962 				 * that "level" associated with the T_ALLOPT is
963 				 * supported.
964 				 *
965 				 */
966 				opt->status = T_FAILURE;
967 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
968 				continue;
969 			}
970 			ASSERT(allopt_len != 0); /* remove ? */
971 
972 			*toa_lenp += allopt_len;
973 			opt->status = T_SUCCESS;
974 			/* XXX - always set T_ALLOPT 'pass_to_next' for now */
975 			*pass_to_nextp = B_TRUE;
976 			continue;
977 		}
978 		/*
979 		 * Check if option wants to flow downstream
980 		 */
981 		if (optd->opdes_props & OP_PASSNEXT)
982 			*pass_to_nextp = B_TRUE;
983 
984 		/* Additional checks dependent on operation. */
985 		switch (tor->MGMT_flags) {
986 		case T_DEFAULT:
987 		case T_CURRENT:
988 
989 			/*
990 			 * The opt_chk_lookup() routine call above approved of
991 			 * this option so we can work on the status for it
992 			 * based on the permissions for the operation. (This
993 			 * can override any status for it set at higher levels)
994 			 * We assume this override is OK since chkfn at this
995 			 * level approved of this option.
996 			 *
997 			 * T_CURRENT semantics:
998 			 * The read access is required. Else option
999 			 * status is T_NOTSUPPORT.
1000 			 *
1001 			 * T_DEFAULT semantics:
1002 			 * Note: specification is not clear on this but we
1003 			 * interpret T_DEFAULT semantics such that access to
1004 			 * read value is required for access even the default
1005 			 * value. Otherwise the option status is T_NOTSUPPORT.
1006 			 */
1007 			if (!OA_READ_PERMISSION(optd, cr)) {
1008 				opt->status = T_NOTSUPPORT;
1009 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
1010 				/* skip to next */
1011 				continue;
1012 			}
1013 
1014 			/*
1015 			 * T_DEFAULT/T_CURRENT semantics:
1016 			 * We know that read access is set. If no other access
1017 			 * is set, then status is T_READONLY.
1018 			 */
1019 			if (OA_READONLY_PERMISSION(optd, cr))
1020 				opt->status = T_READONLY;
1021 			else
1022 				opt->status = T_SUCCESS;
1023 			/*
1024 			 * Option passes all checks. Make room for it in the
1025 			 * ack. Note: size stored in table does not include
1026 			 * space for option header.
1027 			 */
1028 			*toa_lenp += sizeof (struct T_opthdr) +
1029 			    _TPI_ALIGN_TOPT(optd->opdes_size);
1030 			break;
1031 
1032 		case T_CHECK:
1033 		case T_NEGOTIATE:
1034 
1035 			/*
1036 			 * T_NEGOTIATE semantics:
1037 			 * If for fixed length option value on input is not the
1038 			 * same as value supplied, then status is T_FAILURE.
1039 			 *
1040 			 * T_CHECK semantics:
1041 			 * If value is supplied, semantics same as T_NEGOTIATE.
1042 			 * It is however ok not to supply a value with T_CHECK.
1043 			 */
1044 
1045 			if (tor->MGMT_flags == T_NEGOTIATE ||
1046 			    (opt->len != sizeof (struct T_opthdr))) {
1047 				/*
1048 				 * Implies "value" is specified in T_CHECK or
1049 				 * it is a T_NEGOTIATE request.
1050 				 * Verify size.
1051 				 * Note: This can override anything about this
1052 				 * option request done at a higher level.
1053 				 */
1054 				if (!opt_length_ok(optd, opt)) {
1055 					/* bad size */
1056 					*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
1057 					opt->status = T_FAILURE;
1058 					continue;
1059 				}
1060 			}
1061 			/*
1062 			 * The opt_chk_lookup()  routine above() approved of
1063 			 * this option so we can work on the status for it based
1064 			 * on the permissions for the operation. (This can
1065 			 * override anything set at a higher level).
1066 			 *
1067 			 * T_CHECK/T_NEGOTIATE semantics:
1068 			 * Set status to T_READONLY if read is the only access
1069 			 * permitted
1070 			 */
1071 			if (OA_READONLY_PERMISSION(optd, cr)) {
1072 				opt->status = T_READONLY;
1073 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
1074 				/* skip to next */
1075 				continue;
1076 			}
1077 
1078 			/*
1079 			 * T_CHECK/T_NEGOTIATE semantics:
1080 			 * If write (or execute) access is not set, then status
1081 			 * is T_NOTSUPPORT.
1082 			 */
1083 			if (!OA_WRITE_OR_EXECUTE(optd, cr)) {
1084 				opt->status = T_NOTSUPPORT;
1085 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
1086 				/* skip to next option */
1087 				continue;
1088 			}
1089 			/*
1090 			 * Option passes all checks. Make room for it in the
1091 			 * ack and set success in status.
1092 			 * Note: size stored in table does not include header
1093 			 * length.
1094 			 */
1095 			opt->status = T_SUCCESS;
1096 			*toa_lenp += sizeof (struct T_opthdr) +
1097 			    _TPI_ALIGN_TOPT(optd->opdes_size);
1098 			break;
1099 
1100 		default:
1101 			return (TBADFLAG);
1102 		}
1103 	} /* for loop scanning input buffer */
1104 
1105 	return (0);		/* OK return */
1106 }
1107 
1108 /*
1109  * This routine makes another pass through the option buffer this
1110  * time acting on the request based on "status" result in the
1111  * first pass. It also performs "expansion" of T_ALLOPT into
1112  * all options of a certain level and acts on each for this request.
1113  */
1114 static t_scalar_t
1115 do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr,
1116     optdb_obj_t *dbobjp, mblk_t *first_mp, boolean_t is_restart,
1117     boolean_t *queued_statusp)
1118 {
1119 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1120 	int failed_option;
1121 	struct T_opthdr *opt;
1122 	struct T_opthdr *opt_start, *opt_end, *restart_opt;
1123 	uchar_t *optr;
1124 	uint_t optset_context;
1125 	struct T_optmgmt_req *tor = (struct T_optmgmt_req *)reqmp->b_rptr;
1126 	opt_restart_t	*or;
1127 	t_uscalar_t	*worst_statusp;
1128 	int	err;
1129 
1130 	*queued_statusp = B_FALSE;
1131 	or = (opt_restart_t *)first_mp->b_rptr;
1132 	worst_statusp = &or->or_worst_status;
1133 
1134 	optr = (uchar_t *)ack_mp->b_rptr +
1135 	    sizeof (struct T_optmgmt_ack); /* assumed int32_t aligned */
1136 
1137 	/*
1138 	 * Set initial values for scanning input
1139 	 */
1140 	if (is_restart) {
1141 		opt_start = (struct T_opthdr *)or->or_start;
1142 		opt_end = (struct T_opthdr *)or->or_end;
1143 		restart_opt = (struct T_opthdr *)or->or_ropt;
1144 	} else {
1145 		opt_start = (struct T_opthdr *)mi_offset_param(reqmp,
1146 		    tor->OPT_offset, tor->OPT_length);
1147 		if (opt_start == NULL)
1148 			return (TBADOPT);
1149 		opt_end = (struct T_opthdr *)((uchar_t *)opt_start +
1150 		    tor->OPT_length);
1151 		or->or_start = (struct opthdr *)opt_start;
1152 		or->or_end = (struct opthdr *)opt_end;
1153 		/*
1154 		 * construct the mp chain, in case the setfn needs to
1155 		 * queue this and restart option processing later on.
1156 		 */
1157 		first_mp->b_cont = ack_mp;
1158 		ack_mp->b_cont = reqmp;
1159 	}
1160 	ASSERT(__TPI_TOPT_ISALIGNED(opt_start)); /* verified in first pass */
1161 
1162 	for (opt = is_restart ? restart_opt : opt_start;
1163 	    opt && (opt < opt_end);
1164 	    opt = _TPI_TOPT_NEXTHDR(opt_start, tor->OPT_length, opt)) {
1165 		or->or_ropt = (struct opthdr *)opt;
1166 		/* verified in first pass */
1167 		ASSERT(_TPI_TOPT_VALID(opt, opt_start, opt_end));
1168 
1169 		/*
1170 		 * If the first pass in process_topthdrs_first_pass()
1171 		 * has marked the option as a failure case for the MGMT_flags
1172 		 * semantics then there is not much to do.
1173 		 *
1174 		 * Note: For all practical purposes, T_READONLY status is
1175 		 * a "success" for T_DEFAULT/T_CURRENT and "failure" for
1176 		 * T_CHECK/T_NEGOTIATE
1177 		 */
1178 		failed_option =
1179 		    (opt->status == T_NOTSUPPORT) ||
1180 		    (opt->status == T_FAILURE) ||
1181 		    ((tor->MGMT_flags & (T_NEGOTIATE|T_CHECK)) &&
1182 			(opt->status == T_READONLY));
1183 
1184 		if (failed_option) {
1185 			/*
1186 			 * According to T_DEFAULT/T_CURRENT semantics, the
1187 			 * input values, even if present, are to be ignored.
1188 			 * Note: Specification is not clear on this, but we
1189 			 * interpret that even though we ignore the values, we
1190 			 * can return them as is. So we process them similar to
1191 			 * T_CHECK/T_NEGOTIATE case which has the semantics to
1192 			 * return the values as is. XXX If interpretation is
1193 			 * ever determined incorrect fill in appropriate code
1194 			 * here to treat T_DEFAULT/T_CURRENT differently.
1195 			 *
1196 			 * According to T_CHECK/T_NEGOTIATE semantics,
1197 			 * in the case of T_NOTSUPPORT/T_FAILURE/T_READONLY,
1198 			 * the semantics are to return the "value" part of
1199 			 * option untouched. So here we copy the option
1200 			 * head including value part if any to output.
1201 			 */
1202 
1203 			bcopy(opt, optr, opt->len);
1204 			optr += _TPI_ALIGN_TOPT(opt->len);
1205 
1206 			*worst_statusp = get_worst_status(opt->status,
1207 			    *worst_statusp);
1208 
1209 			/* skip to process next option in buffer */
1210 			continue;
1211 
1212 		} /* end if "failed option" */
1213 		/*
1214 		 * The status is T_SUCCESS or T_READONLY
1215 		 * We process the value part here
1216 		 */
1217 		ASSERT(opt->status == T_SUCCESS || opt->status == T_READONLY);
1218 		switch (tor->MGMT_flags) {
1219 		case T_DEFAULT:
1220 			/*
1221 			 * We fill default value from table or protocol specific
1222 			 * function. If this call fails, we pass input through.
1223 			 */
1224 			if (do_opt_default(q, opt, &optr, worst_statusp,
1225 			    cr, dbobjp) < 0) {
1226 				/* fail or pass transparently */
1227 				if (topmost_tpiprovider)
1228 					opt->status = T_FAILURE;
1229 				bcopy(opt, optr, opt->len);
1230 				optr += _TPI_ALIGN_TOPT(opt->len);
1231 				*worst_statusp = get_worst_status(opt->status,
1232 				    *worst_statusp);
1233 			}
1234 			break;
1235 
1236 		case T_CURRENT:
1237 
1238 			do_opt_current(q, opt, &optr, worst_statusp, cr,
1239 			    dbobjp);
1240 			break;
1241 
1242 		case T_CHECK:
1243 		case T_NEGOTIATE:
1244 			if (tor->MGMT_flags == T_CHECK)
1245 				optset_context = SETFN_OPTCOM_CHECKONLY;
1246 			else	/* T_NEGOTIATE */
1247 				optset_context = SETFN_OPTCOM_NEGOTIATE;
1248 			err = do_opt_check_or_negotiate(q, opt, optset_context,
1249 			    &optr, worst_statusp, cr, dbobjp, first_mp);
1250 			if (err == EINPROGRESS) {
1251 				*queued_statusp = B_TRUE;
1252 				return (0);
1253 			}
1254 			break;
1255 		default:
1256 			return (TBADFLAG);
1257 		}
1258 	} /* end for loop scanning option buffer */
1259 
1260 	ack_mp->b_wptr = optr;
1261 	ASSERT(ack_mp->b_wptr <= ack_mp->b_datap->db_lim);
1262 
1263 	return (0);		/* OK return */
1264 }
1265 
1266 
1267 static t_uscalar_t
1268 get_worst_status(t_uscalar_t status, t_uscalar_t current_worst_status)
1269 {
1270 	/*
1271 	 * Return the "worst" among the arguments "status" and
1272 	 * "current_worst_status".
1273 	 *
1274 	 * Note: Tracking "worst_status" can be made a bit simpler
1275 	 * if we use the property that status codes are bitwise
1276 	 * distinct.
1277 	 *
1278 	 * The pecking order is
1279 	 *
1280 	 * T_SUCCESS ..... best
1281 	 * T_PARTSUCCESS
1282 	 * T_FAILURE
1283 	 * T_READONLY
1284 	 * T_NOTSUPPORT... worst
1285 	 */
1286 	if (status == current_worst_status)
1287 		return (current_worst_status);
1288 	switch (current_worst_status) {
1289 	case T_SUCCESS:
1290 		if (status == T_PARTSUCCESS)
1291 			return (T_PARTSUCCESS);
1292 		/* FALLTHROUGH */
1293 	case T_PARTSUCCESS:
1294 		if (status == T_FAILURE)
1295 			return (T_FAILURE);
1296 		/* FALLTHROUGH */
1297 	case T_FAILURE:
1298 		if (status == T_READONLY)
1299 			return (T_READONLY);
1300 		/* FALLTHROUGH */
1301 	case T_READONLY:
1302 		if (status == T_NOTSUPPORT)
1303 			return (T_NOTSUPPORT);
1304 		/* FALLTHROUGH */
1305 	case T_NOTSUPPORT:
1306 	default:
1307 		return (current_worst_status);
1308 	}
1309 }
1310 
1311 static int
1312 do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
1313     t_uscalar_t *worst_statusp, cred_t *cr, optdb_obj_t *dbobjp)
1314 {
1315 	pfi_t	deffn = dbobjp->odb_deffn;
1316 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
1317 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1318 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1319 
1320 	struct T_opthdr *topth;
1321 	opdes_t *optd;
1322 
1323 	if (reqopt->name != T_ALLOPT) {
1324 		/*
1325 		 * lookup the option in the table and fill default value
1326 		 */
1327 		optd = opt_chk_lookup(reqopt->level, reqopt->name,
1328 		    opt_arr, opt_arr_cnt);
1329 
1330 		if (optd == NULL) {
1331 			/*
1332 			 * not found - fail this one. Should not happen
1333 			 * for topmost_tpiprovider as calling routine
1334 			 * should have verified it.
1335 			 */
1336 			ASSERT(!topmost_tpiprovider);
1337 			return (-1);
1338 		}
1339 
1340 		topth = (struct T_opthdr *)(*resptrp);
1341 		topth->level = reqopt->level;
1342 		topth->name = reqopt->name;
1343 		topth->status = reqopt->status;
1344 
1345 		*worst_statusp = get_worst_status(reqopt->status,
1346 		    *worst_statusp);
1347 
1348 		if (optd->opdes_props & OP_NODEFAULT) {
1349 			/* header only, no default "value" part */
1350 			topth->len = sizeof (struct T_opthdr);
1351 			*resptrp += sizeof (struct T_opthdr);
1352 		} else {
1353 			int deflen;
1354 
1355 			if (optd->opdes_props & OP_DEF_FN) {
1356 				deflen = (*deffn)(q, reqopt->level,
1357 				    reqopt->name, _TPI_TOPT_DATA(topth));
1358 				if (deflen >= 0) {
1359 					topth->len = (t_uscalar_t)
1360 					    (sizeof (struct T_opthdr) + deflen);
1361 				} else {
1362 					/*
1363 					 * return error, this should 'pass
1364 					 * through' the option and maybe some
1365 					 * other level will fill it in or
1366 					 * already did.
1367 					 * (No change in 'resptrp' upto here)
1368 					 */
1369 					return (-1);
1370 				}
1371 			} else {
1372 				/* fill length and value part */
1373 				switch (optd->opdes_size) {
1374 				/*
1375 				 * Since options are guaranteed aligned only
1376 				 * on a 4 byte boundary (t_scalar_t) any
1377 				 * option that is greater in size will default
1378 				 * to the bcopy below
1379 				 */
1380 				case sizeof (int32_t):
1381 					*(int32_t *)_TPI_TOPT_DATA(topth) =
1382 					    (int32_t)optd->opdes_default;
1383 					break;
1384 				case sizeof (int16_t):
1385 					*(int16_t *)_TPI_TOPT_DATA(topth) =
1386 					    (int16_t)optd->opdes_default;
1387 					break;
1388 				case sizeof (int8_t):
1389 					*(int8_t *)_TPI_TOPT_DATA(topth) =
1390 					    (int8_t)optd->opdes_default;
1391 					break;
1392 				default:
1393 					/*
1394 					 * other length but still assume
1395 					 * fixed - use bcopy
1396 					 */
1397 					bcopy(optd->opdes_defbuf,
1398 					    _TPI_TOPT_DATA(topth),
1399 					    optd->opdes_size);
1400 					break;
1401 				}
1402 				topth->len = (t_uscalar_t)(optd->opdes_size +
1403 				    sizeof (struct T_opthdr));
1404 			}
1405 			*resptrp += _TPI_ALIGN_TOPT(topth->len);
1406 		}
1407 		return (0);	/* OK return */
1408 	}
1409 
1410 	/*
1411 	 * T_ALLOPT processing
1412 	 *
1413 	 * lookup and stuff default values of all the options of the
1414 	 * level specified
1415 	 * Note: This expansion of T_ALLOPT should happen in
1416 	 * a topmost_tpiprovider.
1417 	 */
1418 	ASSERT(topmost_tpiprovider);
1419 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
1420 		if (reqopt->level != optd->opdes_level)
1421 			continue;
1422 		/*
1423 		 *
1424 		 * T_DEFAULT semantics:
1425 		 * XXX: we interpret T_DEFAULT semantics such that access to
1426 		 * read value is required for access even the default value.
1427 		 * Else option is ignored for T_ALLOPT request.
1428 		 */
1429 		if (!OA_READ_PERMISSION(optd, cr))
1430 			/* skip this one */
1431 			continue;
1432 
1433 		/*
1434 		 * Found option of same level as T_ALLOPT request
1435 		 * that we can return.
1436 		 */
1437 
1438 		topth = (struct T_opthdr *)(*resptrp);
1439 		topth->level = optd->opdes_level;
1440 		topth->name = optd->opdes_name;
1441 
1442 		/*
1443 		 * T_DEFAULT semantics:
1444 		 * We know that read access is set. If no other access is set,
1445 		 * then status is T_READONLY
1446 		 */
1447 		if (OA_READONLY_PERMISSION(optd, cr)) {
1448 			topth->status = T_READONLY;
1449 			*worst_statusp = get_worst_status(T_READONLY,
1450 			    *worst_statusp);
1451 		} else {
1452 			topth->status = T_SUCCESS;
1453 			/*
1454 			 * Note: *worst_statusp has to be T_SUCCESS or
1455 			 * worse so no need to adjust
1456 			 */
1457 		}
1458 
1459 		if (optd->opdes_props & OP_NODEFAULT) {
1460 			/* header only, no value part */
1461 			topth->len = sizeof (struct T_opthdr);
1462 			*resptrp += sizeof (struct T_opthdr);
1463 		} else {
1464 			int deflen;
1465 
1466 			if (optd->opdes_props & OP_DEF_FN) {
1467 				deflen = (*deffn)(q, reqopt->level,
1468 				    reqopt->name, _TPI_TOPT_DATA(topth));
1469 				if (deflen >= 0) {
1470 					topth->len = (t_uscalar_t)(deflen +
1471 					    sizeof (struct T_opthdr));
1472 				} else {
1473 					/*
1474 					 * deffn failed.
1475 					 * return just the header as T_ALLOPT
1476 					 * expansion.
1477 					 * Some other level deffn may
1478 					 * supply value part.
1479 					 */
1480 					topth->len = sizeof (struct T_opthdr);
1481 					topth->status = T_FAILURE;
1482 					*worst_statusp =
1483 					    get_worst_status(T_FAILURE,
1484 						*worst_statusp);
1485 				}
1486 			} else {
1487 				/*
1488 				 * fill length and value part from
1489 				 * table
1490 				 */
1491 				switch (optd->opdes_size) {
1492 				/*
1493 				 * Since options are guaranteed aligned only
1494 				 * on a 4 byte boundary (t_scalar_t) any
1495 				 * option that is greater in size will default
1496 				 * to the bcopy below
1497 				 */
1498 				case sizeof (int32_t):
1499 					*(int32_t *)_TPI_TOPT_DATA(topth) =
1500 					    (int32_t)optd->opdes_default;
1501 					break;
1502 				case sizeof (int16_t):
1503 					*(int16_t *)_TPI_TOPT_DATA(topth) =
1504 					    (int16_t)optd->opdes_default;
1505 					break;
1506 				case sizeof (int8_t):
1507 					*(int8_t *)_TPI_TOPT_DATA(topth) =
1508 					    (int8_t)optd->opdes_default;
1509 					break;
1510 				default:
1511 					/*
1512 					 * other length but still assume
1513 					 * fixed - use bcopy
1514 					 */
1515 					bcopy(optd->opdes_defbuf,
1516 					    _TPI_TOPT_DATA(topth),
1517 					    optd->opdes_size);
1518 				}
1519 				topth->len = (t_uscalar_t)(optd->opdes_size +
1520 				    sizeof (struct T_opthdr));
1521 			}
1522 			*resptrp += _TPI_ALIGN_TOPT(topth->len);
1523 		}
1524 	}
1525 	return (0);
1526 }
1527 
1528 static void
1529 do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
1530     t_uscalar_t *worst_statusp, cred_t *cr, optdb_obj_t *dbobjp)
1531 {
1532 	pfi_t	getfn = dbobjp->odb_getfn;
1533 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
1534 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1535 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1536 
1537 	struct T_opthdr *topth;
1538 	opdes_t *optd;
1539 	int optlen;
1540 	uchar_t *initptr = *resptrp;
1541 
1542 	/*
1543 	 * We call getfn to get the current value of an option. The call may
1544 	 * fail in which case we copy the values from the input buffer. Maybe
1545 	 * something downstream will fill it in or something upstream did.
1546 	 */
1547 
1548 	if (reqopt->name != T_ALLOPT) {
1549 		topth = (struct T_opthdr *)*resptrp;
1550 		*resptrp += sizeof (struct T_opthdr);
1551 		optlen = (*getfn)(q, reqopt->level, reqopt->name, *resptrp);
1552 		if (optlen >= 0) {
1553 			topth->len = (t_uscalar_t)(optlen +
1554 			    sizeof (struct T_opthdr));
1555 			topth->level = reqopt->level;
1556 			topth->name = reqopt->name;
1557 			topth->status = reqopt->status;
1558 			*resptrp += _TPI_ALIGN_TOPT(optlen);
1559 			*worst_statusp = get_worst_status(topth->status,
1560 			    *worst_statusp);
1561 		} else {
1562 			/* failed - reset "*resptrp" pointer */
1563 			*resptrp -= sizeof (struct T_opthdr);
1564 		}
1565 	} else {		/* T_ALLOPT processing */
1566 		ASSERT(topmost_tpiprovider == B_TRUE);
1567 		/* scan and get all options */
1568 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
1569 			/* skip other levels */
1570 			if (reqopt->level != optd->opdes_level)
1571 				continue;
1572 
1573 			if (!OA_READ_PERMISSION(optd, cr))
1574 				/* skip this one */
1575 				continue;
1576 
1577 			topth = (struct T_opthdr *)*resptrp;
1578 			*resptrp += sizeof (struct T_opthdr);
1579 
1580 			/* get option of this level */
1581 			optlen = (*getfn)(q, reqopt->level, optd->opdes_name,
1582 			    *resptrp);
1583 			if (optlen >= 0) {
1584 				/* success */
1585 				topth->len = (t_uscalar_t)(optlen +
1586 				    sizeof (struct T_opthdr));
1587 				topth->level = reqopt->level;
1588 				topth->name = optd->opdes_name;
1589 				if (OA_READONLY_PERMISSION(optd, cr))
1590 					topth->status = T_READONLY;
1591 				else
1592 					topth->status = T_SUCCESS;
1593 				*resptrp += _TPI_ALIGN_TOPT(optlen);
1594 			} else {
1595 				/*
1596 				 * failed, return as T_FAILURE and null value
1597 				 * part. Maybe something downstream will
1598 				 * handle this one and fill in a value. Here
1599 				 * it is just part of T_ALLOPT expansion.
1600 				 */
1601 				topth->len = sizeof (struct T_opthdr);
1602 				topth->level = reqopt->level;
1603 				topth->name = optd->opdes_name;
1604 				topth->status = T_FAILURE;
1605 			}
1606 			*worst_statusp = get_worst_status(topth->status,
1607 			    *worst_statusp);
1608 		} /* end for loop */
1609 	}
1610 	if (*resptrp == initptr) {
1611 		/*
1612 		 * getfn failed and does not want to handle this option. Maybe
1613 		 * something downstream will or something upstream did. (If
1614 		 * topmost_tpiprovider, initialize "status" to failure which
1615 		 * can possibly change downstream). Copy the input "as is" from
1616 		 * input option buffer if any to maintain transparency.
1617 		 */
1618 		if (topmost_tpiprovider)
1619 			reqopt->status = T_FAILURE;
1620 		bcopy(reqopt, *resptrp, reqopt->len);
1621 		*resptrp += _TPI_ALIGN_TOPT(reqopt->len);
1622 		*worst_statusp = get_worst_status(reqopt->status,
1623 		    *worst_statusp);
1624 	}
1625 }
1626 
1627 
1628 
1629 static int
1630 do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
1631     uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
1632     cred_t *cr, optdb_obj_t *dbobjp, mblk_t *first_mp)
1633 {
1634 	pfi_t	deffn = dbobjp->odb_deffn;
1635 	opt_set_fn setfn = dbobjp->odb_setfn;
1636 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
1637 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1638 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1639 
1640 	struct T_opthdr *topth;
1641 	opdes_t *optd;
1642 	int error;
1643 	t_uscalar_t optlen;
1644 	t_scalar_t optsize;
1645 	uchar_t *initptr = *resptrp;
1646 
1647 	ASSERT(reqopt->status == T_SUCCESS);
1648 
1649 	if (reqopt->name != T_ALLOPT) {
1650 		topth = (struct T_opthdr *)*resptrp;
1651 		*resptrp += sizeof (struct T_opthdr);
1652 		error = (*setfn)(q, optset_context, reqopt->level, reqopt->name,
1653 		    reqopt->len - sizeof (struct T_opthdr),
1654 		    _TPI_TOPT_DATA(reqopt), &optlen, _TPI_TOPT_DATA(topth),
1655 		    NULL, cr, first_mp);
1656 		if (error) {
1657 			/* failed - reset "*resptrp" */
1658 			*resptrp -= sizeof (struct T_opthdr);
1659 			if (error == EINPROGRESS)
1660 				return (error);
1661 		} else {
1662 			/*
1663 			 * success - "value" already filled in setfn()
1664 			 */
1665 			topth->len = (t_uscalar_t)(optlen +
1666 			    sizeof (struct T_opthdr));
1667 			topth->level = reqopt->level;
1668 			topth->name = reqopt->name;
1669 			topth->status = reqopt->status;
1670 			*resptrp += _TPI_ALIGN_TOPT(optlen);
1671 			*worst_statusp = get_worst_status(topth->status,
1672 			    *worst_statusp);
1673 		}
1674 	} else {		/* T_ALLOPT processing */
1675 		/* only for T_NEGOTIATE case */
1676 		ASSERT(optset_context == SETFN_OPTCOM_NEGOTIATE);
1677 		ASSERT(topmost_tpiprovider == B_TRUE);
1678 
1679 		/* scan and set all options to default value */
1680 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
1681 
1682 			/* skip other levels */
1683 			if (reqopt->level != optd->opdes_level)
1684 				continue;
1685 
1686 			if (OA_EXECUTE_PERMISSION(optd, cr) ||
1687 			    OA_NO_PERMISSION(optd, cr)) {
1688 				/*
1689 				 * skip this one too. Does not make sense to
1690 				 * set anything to default value for "execute"
1691 				 * options.
1692 				 */
1693 				continue;
1694 			}
1695 
1696 			if (OA_READONLY_PERMISSION(optd, cr)) {
1697 				/*
1698 				 * Return with T_READONLY status (and no value
1699 				 * part). Note: spec is not clear but
1700 				 * XTI test suite needs this.
1701 				 */
1702 				topth = (struct T_opthdr *)*resptrp;
1703 				topth->len = sizeof (struct T_opthdr);
1704 				*resptrp += topth->len;
1705 				topth->level = reqopt->level;
1706 				topth->name = optd->opdes_name;
1707 				topth->status = T_READONLY;
1708 				*worst_statusp = get_worst_status(topth->status,
1709 				    *worst_statusp);
1710 				continue;
1711 			}
1712 
1713 			/*
1714 			 * It is not read only or execute type
1715 			 * the it must have write permission
1716 			 */
1717 			ASSERT(OA_WRITE_PERMISSION(optd, cr));
1718 
1719 			topth = (struct T_opthdr *)*resptrp;
1720 			*resptrp += sizeof (struct T_opthdr);
1721 
1722 			topth->len = sizeof (struct T_opthdr);
1723 			topth->level = reqopt->level;
1724 			topth->name = optd->opdes_name;
1725 			if (optd->opdes_props & OP_NODEFAULT) {
1726 				/*
1727 				 * Option of "no default value" so it does not
1728 				 * make sense to try to set it. We just return
1729 				 * header with status of T_SUCCESS
1730 				 * XXX should this be failure ?
1731 				 */
1732 				topth->status = T_SUCCESS;
1733 				continue; /* skip setting */
1734 			}
1735 			if (optd->opdes_props & OP_DEF_FN) {
1736 				if ((optd->opdes_props & OP_VARLEN) ||
1737 				    ((optsize = (*deffn)(q, reqopt->level,
1738 					optd->opdes_name,
1739 					(uchar_t *)optd->opdes_defbuf)) < 0)) {
1740 					/* XXX - skip these too */
1741 					topth->status = T_SUCCESS;
1742 					continue; /* skip setting */
1743 				}
1744 			} else {
1745 				optsize = optd->opdes_size;
1746 			}
1747 
1748 
1749 			/* set option of this level */
1750 			error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE,
1751 			    reqopt->level, optd->opdes_name, optsize,
1752 			    (uchar_t *)optd->opdes_defbuf, &optlen,
1753 			    _TPI_TOPT_DATA(topth), NULL, cr, NULL);
1754 			if (error) {
1755 				/*
1756 				 * failed, return as T_FAILURE and null value
1757 				 * part. Maybe something downstream will
1758 				 * handle this one and fill in a value. Here
1759 				 * it is just part of T_ALLOPT expansion.
1760 				 */
1761 				topth->status = T_FAILURE;
1762 				*worst_statusp = get_worst_status(topth->status,
1763 				    *worst_statusp);
1764 			} else {
1765 				/* success */
1766 				topth->len += optlen;
1767 				topth->status = T_SUCCESS;
1768 				*resptrp += _TPI_ALIGN_TOPT(optlen);
1769 			}
1770 		} /* end for loop */
1771 		/* END T_ALLOPT */
1772 	}
1773 
1774 	if (*resptrp == initptr) {
1775 		/*
1776 		 * setfn failed and does not want to handle this option. Maybe
1777 		 * something downstream will or something upstream
1778 		 * did. Copy the input as is from input option buffer if any to
1779 		 * maintain transparency (maybe something at a level above
1780 		 * did something.
1781 		 */
1782 		if (topmost_tpiprovider)
1783 			reqopt->status = T_FAILURE;
1784 		bcopy(reqopt, *resptrp, reqopt->len);
1785 		*resptrp += _TPI_ALIGN_TOPT(reqopt->len);
1786 		*worst_statusp = get_worst_status(reqopt->status,
1787 		    *worst_statusp);
1788 	}
1789 	return (0);
1790 }
1791 
1792 /*
1793  * The following routines process options buffer passed with
1794  * T_CONN_REQ, T_CONN_RES and T_UNITDATA_REQ.
1795  * This routine does the consistency check applied to the
1796  * sanity of formatting of multiple options packed in the
1797  * buffer.
1798  *
1799  * XTI brain damage alert:
1800  * XTI interface adopts the notion of an option being an
1801  * "absolute requirement" from OSI transport service (but applies
1802  * it to all transports including Internet transports).
1803  * The main effect of that is action on failure to "negotiate" a
1804  * requested option to the exact requested value
1805  *
1806  *          - if the option is an "absolute requirement", the primitive
1807  *            is aborted (e.g T_DISCON_REQ or T_UDERR generated)
1808  *          - if the option is NOT an "absolute requirement" it can
1809  *            just be ignored.
1810  *
1811  * We would not support "negotiating" of options on connection
1812  * primitives for Internet transports. However just in case we
1813  * forced to in order to pass strange test suites, the design here
1814  * tries to support these notions.
1815  *
1816  * tpi_optcom_buf(q, mp, opt_lenp, opt_offset, cred, dbobjp, thisdg_attrs,
1817  *	*is_absreq_failurep)
1818  *
1819  * - Verify the option buffer, if formatted badly, return error 1
1820  *
1821  * - If it is a "permissions" failure (read-only), return error 2
1822  *
1823  * - Else, process the option "in place", the following can happen,
1824  *	     - if a "privileged" option, mark it as "ignored".
1825  *	     - if "not supported", mark "ignored"
1826  *	     - if "supported" attempt negotiation and fill result in
1827  *	       the outcome
1828  *			- if "absolute requirement", set "*is_absreq_failurep"
1829  *			- if NOT an "absolute requirement", then our
1830  *			  interpretation is to mark is at ignored if
1831  *			  negotiation fails (Spec allows partial success
1832  *			  as in OSI protocols but not failure)
1833  *
1834  *   Then delete "ignored" options from option buffer and return success.
1835  *
1836  */
1837 
1838 int
1839 tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp,
1840     t_scalar_t opt_offset, cred_t *cr, optdb_obj_t *dbobjp,
1841     void *thisdg_attrs, int *is_absreq_failurep)
1842 {
1843 	opt_set_fn setfn = dbobjp->odb_setfn;
1844 	opdes_t *opt_arr = dbobjp->odb_opt_des_arr;
1845 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1846 	struct T_opthdr *opt, *opt_start, *opt_end;
1847 	mblk_t  *copy_mp_head;
1848 	uchar_t *optr, *init_optr;
1849 	opdes_t *optd;
1850 	uint_t optset_context;
1851 	t_uscalar_t olen;
1852 	int error = 0;
1853 
1854 	ASSERT((uchar_t *)opt_lenp > mp->b_rptr &&
1855 	    (uchar_t *)opt_lenp < mp->b_wptr);
1856 
1857 	copy_mp_head = NULL;
1858 	*is_absreq_failurep = 0;
1859 	switch (((union T_primitives *)mp->b_rptr)->type) {
1860 	case T_CONN_REQ:
1861 	case T_CONN_RES:
1862 		optset_context = SETFN_CONN_NEGOTIATE;
1863 		break;
1864 	case T_UNITDATA_REQ:
1865 		optset_context = SETFN_UD_NEGOTIATE;
1866 		break;
1867 	default:
1868 		/*
1869 		 * should never get here, all possible TPI primitives
1870 		 * where this can be called from should be accounted
1871 		 * for in the cases above
1872 		 */
1873 		return (EINVAL);
1874 	}
1875 
1876 	if ((opt_start = (struct T_opthdr *)
1877 	    mi_offset_param(mp, opt_offset, *opt_lenp)) == NULL) {
1878 		error = ENOPROTOOPT;
1879 		goto error_ret;
1880 	}
1881 	if (!__TPI_TOPT_ISALIGNED(opt_start)) {
1882 		error = ENOPROTOOPT;
1883 		goto error_ret;
1884 	}
1885 
1886 	opt_end = (struct T_opthdr *)((uchar_t *)opt_start
1887 	    + *opt_lenp);
1888 
1889 	if ((copy_mp_head = copyb(mp)) == (mblk_t *)NULL) {
1890 		error = ENOMEM;
1891 		goto error_ret;
1892 	}
1893 
1894 	init_optr = optr = (uchar_t *)&copy_mp_head->b_rptr[opt_offset];
1895 
1896 	for (opt = opt_start; opt && (opt < opt_end);
1897 	    opt = _TPI_TOPT_NEXTHDR(opt_start, *opt_lenp, opt)) {
1898 		/*
1899 		 * Validate the option for length and alignment
1900 		 * before accessing anything in it
1901 		 */
1902 		if (!_TPI_TOPT_VALID(opt, opt_start, opt_end)) {
1903 			error = ENOPROTOOPT;
1904 			goto error_ret;
1905 		}
1906 
1907 		/* Find the option in the opt_arr. */
1908 		optd = opt_chk_lookup(opt->level, opt->name,
1909 		    opt_arr, opt_arr_cnt);
1910 
1911 		if (optd == NULL) {
1912 			/*
1913 			 * Option not found
1914 			 */
1915 			opt->status = T_NOTSUPPORT;
1916 			continue;
1917 		}
1918 
1919 		/*
1920 		 * Weird but as in XTI spec.
1921 		 * Sec 6.3.6 "Privileged and ReadOnly Options"
1922 		 * Permission problems (e.g.readonly) fail with bad access
1923 		 * BUT "privileged" option request from those NOT PRIVILEGED
1924 		 * are to be merely "ignored".
1925 		 * XXX Prevents "probing" of privileged options ?
1926 		 */
1927 		if (OA_READONLY_PERMISSION(optd, cr)) {
1928 			error = EACCES;
1929 			goto error_ret;
1930 		}
1931 		if (OA_MATCHED_PRIV(optd, cr)) {
1932 			/*
1933 			 * For privileged options, we DO perform
1934 			 * access checks as is common sense
1935 			 */
1936 			if (!OA_WX_ANYPRIV(optd)) {
1937 				error = EACCES;
1938 				goto error_ret;
1939 			}
1940 		} else {
1941 			/*
1942 			 * For non privileged, we fail instead following
1943 			 * "ignore" semantics dictated by XTI spec for
1944 			 * permissions problems.
1945 			 * Sec 6.3.6 "Privileged and ReadOnly Options"
1946 			 * XXX Should we do "ignore" semantics ?
1947 			 */
1948 			if (!OA_WX_NOPRIV(optd)) { /* nopriv */
1949 				opt->status = T_FAILURE;
1950 				continue;
1951 			}
1952 		}
1953 		/*
1954 		 *
1955 		 * If the negotiation fails, for options that
1956 		 * are "absolute requirement", it is a fatal error.
1957 		 * For options that are NOT "absolute requirements",
1958 		 * and the value fails to negotiate, the XTI spec
1959 		 * only considers the possibility of partial success
1960 		 * (T_PARTSUCCES - not likely for Internet protocols).
1961 		 * The spec is in denial about complete failure
1962 		 * (T_FAILURE) to negotiate for options that are
1963 		 * carried on T_CONN_REQ/T_CONN_RES/T_UNITDATA
1964 		 * We interpret the T_FAILURE to negotiate an option
1965 		 * that is NOT an absolute requirement that it is safe
1966 		 * to ignore it.
1967 		 */
1968 
1969 		/* verify length */
1970 		if (!opt_length_ok(optd, opt)) {
1971 			/* bad size */
1972 			if ((optd->opdes_props & OP_NOT_ABSREQ) == 0) {
1973 				/* option is absolute requirement */
1974 				*is_absreq_failurep = 1;
1975 				error = EINVAL;
1976 				goto error_ret;
1977 			}
1978 			opt->status = T_FAILURE;
1979 			continue;
1980 		}
1981 
1982 		/*
1983 		 * verified generic attributes. Now call set function.
1984 		 * Note: We assume the following to simplify code.
1985 		 * XXX If this is found not to be valid, this routine
1986 		 * will need to be rewritten. At this point it would
1987 		 * be premature to introduce more complexity than is
1988 		 * needed.
1989 		 * Assumption: For variable length options, we assume
1990 		 * that the value returned will be same or less length
1991 		 * (size does not increase). This makes it OK to pass the
1992 		 * same space for output as it is on input.
1993 		 */
1994 
1995 		error = (*setfn)(q, optset_context, opt->level, opt->name,
1996 		    opt->len - (t_uscalar_t)sizeof (struct T_opthdr),
1997 		    _TPI_TOPT_DATA(opt), &olen, _TPI_TOPT_DATA(opt),
1998 		    thisdg_attrs, cr, NULL);
1999 
2000 		if (olen > (int)(opt->len - sizeof (struct T_opthdr))) {
2001 			/*
2002 			 * Space on output more than space on input. Should
2003 			 * not happen and we consider it a bug/error.
2004 			 * More of a restriction than an error in our
2005 			 * implementation. Will see if we can live with this
2006 			 * otherwise code will get more hairy with multiple
2007 			 * passes.
2008 			 */
2009 			error = EINVAL;
2010 			goto error_ret;
2011 		}
2012 		if (error != 0) {
2013 			if ((optd->opdes_props & OP_NOT_ABSREQ) == 0) {
2014 				/* option is absolute requirement. */
2015 				*is_absreq_failurep = 1;
2016 				goto error_ret;
2017 			}
2018 			/*
2019 			 * failed - but option "not an absolute
2020 			 * requirement"
2021 			 */
2022 			opt->status = T_FAILURE;
2023 			continue;
2024 		}
2025 		/*
2026 		 * Fill in the only possible successful result
2027 		 * (Note: TPI allows for T_PARTSUCCESS - partial
2028 		 * sucess result code which is relevant in OSI world
2029 		 * and not possible in Internet code)
2030 		 */
2031 		opt->status = T_SUCCESS;
2032 
2033 		/*
2034 		 * Add T_SUCCESS result code options to the "output" options.
2035 		 * No T_FAILURES or T_NOTSUPPORT here as they are to be
2036 		 * ignored.
2037 		 * This code assumes output option buffer will
2038 		 * be <= input option buffer.
2039 		 *
2040 		 * Copy option header+value
2041 		 */
2042 		bcopy(opt, optr, opt->len);
2043 		optr +=  _TPI_ALIGN_TOPT(opt->len);
2044 	}
2045 	/*
2046 	 * Overwrite the input mblk option buffer now with the output
2047 	 * and update length, and contents in original mbl
2048 	 * (offset remains unchanged).
2049 	 */
2050 	*opt_lenp = (t_scalar_t)(optr - init_optr);
2051 	if (*opt_lenp > 0) {
2052 		bcopy(init_optr, opt_start, *opt_lenp);
2053 	}
2054 
2055 error_ret:
2056 	if (copy_mp_head != NULL)
2057 		freeb(copy_mp_head);
2058 	return (error);
2059 }
2060 
2061 static opdes_t *
2062 opt_chk_lookup(t_uscalar_t level, t_uscalar_t name, opdes_t *opt_arr,
2063     uint_t opt_arr_cnt)
2064 {
2065 	opdes_t		*optd;
2066 
2067 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt];
2068 	    optd++) {
2069 		if (level == (uint_t)optd->opdes_level &&
2070 		    name == (uint_t)optd->opdes_name)
2071 			return (optd);
2072 	}
2073 	return (NULL);
2074 }
2075 
2076 static boolean_t
2077 opt_level_valid(t_uscalar_t level, optlevel_t *valid_level_arr,
2078     uint_t valid_level_arr_cnt)
2079 {
2080 	optlevel_t		*olp;
2081 
2082 	for (olp = valid_level_arr;
2083 	    olp < &valid_level_arr[valid_level_arr_cnt];
2084 	    olp++) {
2085 		if (level == (uint_t)(*olp))
2086 			return (B_TRUE);
2087 	}
2088 	return (B_FALSE);
2089 }
2090 
2091 
2092 /*
2093  * Compute largest possible size for an option buffer containing
2094  * all options in one buffer.
2095  *
2096  * XXX TBD, investigate use of opt_bloated_maxsize() to avoid
2097  *     wastefully large buffer allocation.
2098  */
2099 static size_t
2100 opt_level_allopts_lengths(t_uscalar_t level, opdes_t *opt_arr,
2101     uint_t opt_arr_cnt)
2102 {
2103 	opdes_t		*optd;
2104 	size_t allopt_len = 0;	/* 0 implies no option at this level */
2105 
2106 	/*
2107 	 * Scan opt_arr computing aggregate length
2108 	 * requirement for storing values of all
2109 	 * options.
2110 	 * Note: we do not filter for permissions
2111 	 * etc. This will be >= the real aggregate
2112 	 * length required (upper bound).
2113 	 */
2114 
2115 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt];
2116 	    optd++) {
2117 		if (level == optd->opdes_level) {
2118 			allopt_len += sizeof (struct T_opthdr) +
2119 			    _TPI_ALIGN_TOPT(optd->opdes_size);
2120 		}
2121 	}
2122 	return (allopt_len);	/* 0 implies level not found */
2123 }
2124 
2125 /*
2126  * Compute largest possible size for an option buffer containing
2127  * all options in one buffer - a (theoretical?) worst case scenario
2128  * for certain cases.
2129  */
2130 t_uscalar_t
2131 optcom_max_optbuf_len(opdes_t *opt_arr, uint_t opt_arr_cnt)
2132 {
2133 	t_uscalar_t max_optbuf_len = sizeof (struct T_info_ack);
2134 	opdes_t		*optd;
2135 
2136 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
2137 		max_optbuf_len += (t_uscalar_t)sizeof (struct T_opthdr) +
2138 		    (t_uscalar_t)_TPI_ALIGN_TOPT(optd->opdes_size);
2139 	}
2140 	return (max_optbuf_len);
2141 }
2142 
2143 /*
2144  * Compute largest possible size for OPT_size for a transport.
2145  * Heuristic used is to add all but certain extremely large
2146  * size options; this is done by calling opt_bloated_maxsize().
2147  * It affects user level allocations in TLI/XTI code using t_alloc()
2148  * and other TLI/XTI implementation instance strucutures.
2149  * The large size options excluded are presumed to be
2150  * never accessed through the (theoretical?) worst case code paths
2151  * through TLI/XTI as they are currently IPv6 specific options.
2152  */
2153 
2154 t_uscalar_t
2155 optcom_max_optsize(opdes_t *opt_arr, uint_t opt_arr_cnt)
2156 {
2157 	t_uscalar_t max_optbuf_len = sizeof (struct T_info_ack);
2158 	opdes_t		*optd;
2159 
2160 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
2161 		if (!opt_bloated_maxsize(optd)) {
2162 			max_optbuf_len +=
2163 			    (t_uscalar_t)sizeof (struct T_opthdr) +
2164 			    (t_uscalar_t)_TPI_ALIGN_TOPT(optd->opdes_size);
2165 		}
2166 	}
2167 	return (max_optbuf_len);
2168 }
2169 
2170 /*
2171  * The theoretical model used in optcom_max_optsize() and
2172  * opt_level_allopts_lengths() accounts for the worst case of all
2173  * possible options for the theoretical cases and results in wasteful
2174  * memory allocations for certain theoretically correct usage scenarios.
2175  * In practice, the "features" they support are rarely, if ever,
2176  * used and even then only by test suites for those features (VSU, VST).
2177  * However, they result in large allocations due to the increased transport
2178  * T_INFO_ACK OPT_size field affecting t_alloc() users and TLI/XTI library
2179  * instance data structures for applications.
2180  *
2181  * The following routine opt_bloated_maxsize() supports a hack that avoids
2182  * paying the tax for the bloated options by excluding them and pretending
2183  * they don't exist for certain features without affecting features that
2184  * do use them.
2185  *
2186  * XXX Currently implemented only for optcom_max_optsize()
2187  *     (to reduce risk late in release).
2188  *     TBD for future, investigate use in optcom_level_allopts_lengths() and
2189  *     all the instances of T_ALLOPT processing to exclude "bloated options".
2190  *     Will not affect VSU/VST tests as they do not test with IPPROTO_IPV6
2191  *     level options which are the only ones that fit the "bloated maxsize"
2192  *     option profile now.
2193  */
2194 static boolean_t
2195 opt_bloated_maxsize(opdes_t *optd)
2196 {
2197 	if (optd->opdes_level != IPPROTO_IPV6)
2198 		return (B_FALSE);
2199 	switch (optd->opdes_name) {
2200 	case IPV6_HOPOPTS:
2201 	case IPV6_DSTOPTS:
2202 	case IPV6_RTHDRDSTOPTS:
2203 	case IPV6_RTHDR:
2204 	case IPV6_PATHMTU:
2205 		return (B_TRUE);
2206 	default:
2207 		break;
2208 	}
2209 	return (B_FALSE);
2210 }
2211 
2212 static boolean_t
2213 opt_length_ok(opdes_t *optd, struct T_opthdr *opt)
2214 {
2215 	/*
2216 	 * Verify length.
2217 	 * Value specified should match length of fixed length option or be
2218 	 * less than maxlen of variable length option.
2219 	 */
2220 	if (optd->opdes_props & OP_VARLEN) {
2221 		if (opt->len <= optd->opdes_size +
2222 		    (t_uscalar_t)sizeof (struct T_opthdr))
2223 			return (B_TRUE);
2224 	} else {
2225 		/* fixed length option */
2226 		if (opt->len == optd->opdes_size +
2227 		    (t_uscalar_t)sizeof (struct T_opthdr))
2228 			return (B_TRUE);
2229 	}
2230 	return (B_FALSE);
2231 }
2232