xref: /titanic_44/usr/src/uts/common/inet/optcom.c (revision 98157a7002f4f2cf7978f3084ca5577f0a1d72b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * This file contains common code for handling Options Management requests.
31  */
32 
33 #include <sys/types.h>
34 #include <sys/stream.h>
35 #include <sys/stropts.h>
36 #include <sys/strsubr.h>
37 #include <sys/errno.h>
38 #define	_SUN_TPI_VERSION 2
39 #include <sys/tihdr.h>
40 #include <sys/socket.h>
41 #include <sys/ddi.h>
42 #include <sys/debug.h>		/* for ASSERT */
43 #include <sys/policy.h>
44 
45 #include <inet/common.h>
46 #include <inet/mi.h>
47 #include <inet/nd.h>
48 #include <netinet/ip6.h>
49 #include <inet/ip.h>
50 #include <inet/mib2.h>
51 #include <netinet/in.h>
52 #include "optcom.h"
53 
54 #include <inet/optcom.h>
55 
56 /*
57  * Function prototypes for the file-local (static) helpers used by the
58  * option management entry points in this file.
59  */
/*
 * First pass of tpi_optcom_req(): walks the T_opthdr list of the request,
 * reports through its out-parameters the reply length and whether the
 * request also has to be passed downstream, and returns a TPI error code
 * (0 when the buffer was acceptable).
 */
59 static t_scalar_t process_topthdrs_first_pass(mblk_t *, cred_t *, optdb_obj_t *,
60     boolean_t *, size_t *);
/*
 * Second pass of tpi_optcom_req(): acts on each option of the request and
 * generates the reply; *queued_statusp tells the caller whether the request
 * was queued for a later restart.
 */
61 static t_scalar_t do_options_second_pass(queue_t *q, mblk_t *reqmp,
62     mblk_t *ack_mp, cred_t *, optdb_obj_t *dbobjp,
63     mblk_t *first_mp, boolean_t is_restart, boolean_t *queued_statusp);
64 static t_uscalar_t get_worst_status(t_uscalar_t, t_uscalar_t);
/*
 * Per-option workers; presumably one per MGMT_flags operation of the second
 * pass (defined further down in this file) -- confirm against their bodies.
 */
65 static int do_opt_default(queue_t *, struct T_opthdr *, uchar_t **,
66     t_uscalar_t *, cred_t *, optdb_obj_t *);
67 static void do_opt_current(queue_t *, struct T_opthdr *, uchar_t **,
68     t_uscalar_t *, cred_t *cr, optdb_obj_t *);
69 static int do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
70     uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
71     cred_t *, optdb_obj_t *dbobjp, mblk_t *first_mp);
/*
 * Called as (level, name, option table, table entry count); callers treat a
 * NULL return as "option not found in the table".
 */
72 static opdes_t *opt_chk_lookup(t_uscalar_t, t_uscalar_t, opdes_t *, uint_t);
/* Called as (level, valid-levels array, array entry count). */
73 static boolean_t opt_level_valid(t_uscalar_t, optlevel_t *, uint_t);
/* Called as (level, option table, table entry count) for T_ALLOPT requests. */
74 static size_t opt_level_allopts_lengths(t_uscalar_t, opdes_t *, uint_t);
75 static boolean_t opt_length_ok(opdes_t *, struct T_opthdr *);
/*
 * Called as (option table, table entry count); svr4_optcom_req() uses the
 * result as the allocation size of its reply buffers.
 */
76 static t_uscalar_t optcom_max_optbuf_len(opdes_t *, uint_t);
77 static boolean_t opt_bloated_maxsize(opdes_t *);
78 
79 /* Common code for sending back a T_ERROR_ACK. */
80 void
81 optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
82 {
83 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
84 		qreply(q, mp);
85 }
86 
87 /*
88  * The option management routines svr4_optcom_req() and tpi_optcom_req() use
89  * callback functions as arguments. Here are the interfaces
90  * expected of the callback functions:
91  *
92  *
93  * (1) deffn(q, optlevel, optname, optvalp)
94  *
95  *	- Function only called when default value comes from protocol
96  *	 specific code and not the option database table (indicated by
97  *	  OP_DEF_FN property in option database.)
98  *	- Error return is -1. Valid returns are >=0.
99  *	- When valid, the return value represents the length used for storing
100  *		the default value of the option.
101  *      - Error return implies the called routine did not recognize this
102  *              option. Something downstream could so input is left unchanged
103  *              in request buffer.
104  *
105  * (2) getfn(q, optlevel, optname, optvalp)
106  *
107  *	- Error return is -1. Valid returns are >=0.
108  *	- When valid, the return value represents the length used for storing
109  *		the actual value of the option.
110  *      - Error return implies the called routine did not recognize this
111  *              option. Something downstream could so input is left unchanged
112  *              in request buffer.
113  *
114  * (3) setfn(q, optset_context, optlevel, optname, inlen, invalp,
115  *	outlenp, outvalp, attrp, cr);
116  *
117  *	- OK return is 0, Error code is returned as a non-zero argument.
118  *      - If negative it is ignored by svr4_optcom_req(). If positive, error
119  *        is returned. A negative return implies that option, while handled on
120  *	  this stack is not handled at this level and will be handled further
121  *	  downstream.
122  *	- Both negative and positive errors are treated as errors in an
123  *	  identical manner by tpi_optcom_req(). The errors affect the "status"
124  *	  field of each option's T_opthdr. If successful, an appropriate success
125  *	  result is carried. If error, it is instantiated to "failure" at the
126  *	  topmost level and left unchanged at other levels. (This "failure" can
127  *	  turn to a success at another level).
128  *	- optset_context passed for tpi_optcom_req(). It is interpreted as:
129  *        - SETFN_OPTCOM_CHECKONLY
130  *		semantics are to pretend to set the value and report
131  *		back if it would be successful.
132  *		This is used with T_CHECK semantics in XTI
133  *        - SETFN_OPTCOM_NEGOTIATE
134  *		set the value. Call from option management primitive
135  *		T_OPTMGMT_REQ when T_NEGOTIATE flags is used.
136  *	  - SETFN_UD_NEGOTIATE
137  *		option request came riding on UNITDATA primitive; most often
138  *		has "this datagram" semantics to influence properties
139  *		affecting an outgoing datagram or associated with a received
140  *		datagram
141  *		[ Note: XTI permits this use outside of "this datagram"
142  *		semantics also and permits setting "management related"
143  *		options in this	context and its test suite enforces it ]
144  *	  - SETFN_CONN_NEGOTIATE
145  *		option request came riding on CONN_REQ/RES primitive and
146  *		most often has "this connection" (negotiation during
147  *		"connection establishment") semantics.
148  *		[ Note: XTI permits use of these outside of "this connection"
149  *		semantics and permits "management related" options in this
150  *		context and its test suite enforces it. ]
151  *
152  *	- inlen, invalp is the option length,value requested to be set.
153  *	- outlenp, outvalp represent return parameters which contain the
154  *	  value set and it might be different from one passed on input.
155  *	- attrp points to a data structure that's used by v6 modules to
156  *	  store ancillary data options or sticky options.
157  *	- cr points to the caller's credentials
158  *	- the caller might pass same buffers for input and output and the
159  *	  routine should protect against this case by not updating output
160  *	  buffers until it is done referencing input buffers and any other
161  *	  issues (e.g. not use bcopy() if we do not trust what it does).
162  *      - If option is not known, it returns error. We randomly pick EINVAL.
163  *        It can however get called with options that are handled downstream
164  *        or upstream so for svr4_optcom_req(), it does not return error for
165  *        negative return values.
166  *
167  */
168 
169 /*
170  * Upper Level Protocols call this routine when they receive
171  * a T_SVR4_OPTMGMT_REQ message.  They supply callback functions
172  * for setting a new value for a single options, getting the
173  * current value for a single option, and checking for support
174  * of a single option.  svr4_optcom_req validates the option management
175  * buffer passed in, and calls the appropriate routines to do the
176  * job requested.
177  * XXX Code below needs some restructuring after we have some more
178  * macros to support 'struct opthdr' in the headers.
179  *
180  * IP-MT notes: The option management framework functions svr4_optcom_req() and
181  * tpi_optcom_req() allocate and prepend an M_CTL mblk to the actual
182  * T_optmgmt_req mblk and pass the chain as an additional parameter to the
183  * protocol set functions. If a protocol set function (such as ip_opt_set)
184  * cannot process the option immediately it can return EINPROGRESS. ip_opt_set
185  * enqueues the message in the appropriate sq and returns EINPROGRESS. Later
186  * the sq framework arranges to restart this operation and passes control to
187  * the restart function ip_restart_optmgmt() which in turn calls
188  * svr4_optcom_req() or tpi_optcom_req() to restart the option processing.
189  */
190 int
191 svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
192     boolean_t pass_to_ip)
193 {
194 	pfi_t	deffn = dbobjp->odb_deffn;
195 	pfi_t	getfn = dbobjp->odb_getfn;
196 	opt_set_fn setfn = dbobjp->odb_setfn;
197 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
198 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
199 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
200 	opt_restart_t *or;
201 	struct opthdr *restart_opt;
202 	boolean_t is_restart = B_FALSE;
203 	mblk_t	*first_mp;
204 
205 	t_uscalar_t max_optbuf_len;
206 	int len;
207 	mblk_t	*mp1 = NULL;
208 	struct opthdr *next_opt;
209 	struct opthdr *opt;
210 	struct opthdr *opt1;
211 	struct opthdr *opt_end;
212 	struct opthdr *opt_start;
213 	opdes_t	*optd;
214 	boolean_t	pass_to_next = B_FALSE;
215 	struct T_optmgmt_ack *toa;
216 	struct T_optmgmt_req *tor;
217 
218 	/*
219 	 * Allocate M_CTL and prepend to the packet for restarting this
220 	 * option if needed. IP may need to queue and restart the option
221 	 * if it cannot obtain exclusive conditions immediately. Please see
222 	 * IP-MT notes before the start of svr4_optcom_req
223 	 */
224 	if (mp->b_datap->db_type == M_CTL) {
225 		is_restart = B_TRUE;
226 		first_mp = mp;
227 		mp = mp->b_cont;
228 		ASSERT(mp->b_wptr - mp->b_rptr >=
229 		    sizeof (struct T_optmgmt_req));
230 		tor = (struct T_optmgmt_req *)mp->b_rptr;
231 		ASSERT(tor->MGMT_flags == T_NEGOTIATE);
232 
233 		or = (opt_restart_t *)first_mp->b_rptr;
234 		opt_start = or->or_start;
235 		opt_end = or->or_end;
236 		restart_opt = or->or_ropt;
237 		goto restart;
238 	}
239 
240 	tor = (struct T_optmgmt_req *)mp->b_rptr;
241 	/* Verify message integrity. */
242 	if (mp->b_wptr - mp->b_rptr < sizeof (struct T_optmgmt_req))
243 		goto bad_opt;
244 	/* Verify MGMT_flags legal */
245 	switch (tor->MGMT_flags) {
246 	case T_DEFAULT:
247 	case T_NEGOTIATE:
248 	case T_CURRENT:
249 	case T_CHECK:
250 		/* OK - legal request flags */
251 		break;
252 	default:
253 		optcom_err_ack(q, mp, TBADFLAG, 0);
254 		return (0);
255 	}
256 	if (tor->MGMT_flags == T_DEFAULT) {
257 		/* Is it a request for default option settings? */
258 
259 		/*
260 		 * Note: XXX TLI and TPI specification was unclear about
261 		 * semantics of T_DEFAULT and the following historical note
262 		 * and its interpretation is incorrect (it implies a request
263 		 * for default values of only the identified options not all.
264 		 * The semantics have been explained better in XTI spec.)
265 		 * However, we do not modify (comment or code) here to keep
266 		 * compatibility.
267 		 * We can rethink this if it ever becomes an issue.
268 		 * ----historical comment start------
269 		 * As we understand it, the input buffer is meaningless
270 		 * so we ditch the message.  A T_DEFAULT request is a
271 		 * request to obtain a buffer containing defaults for
272 		 * all supported options, so we allocate a maximum length
273 		 * reply.
274 		 * ----historical comment end -------
275 		 */
276 		/* T_DEFAULT not passed down */
277 		ASSERT(topmost_tpiprovider == B_TRUE);
278 		freemsg(mp);
279 		max_optbuf_len = optcom_max_optbuf_len(opt_arr,
280 		    opt_arr_cnt);
281 		mp = allocb(max_optbuf_len, BPRI_MED);
282 		if (!mp) {
283 no_mem:;
284 			optcom_err_ack(q, mp, TSYSERR, ENOMEM);
285 			return (0);
286 		}
287 
288 		/* Initialize the T_optmgmt_ack header. */
289 		toa = (struct T_optmgmt_ack *)mp->b_rptr;
290 		bzero((char *)toa, max_optbuf_len);
291 		toa->PRIM_type = T_OPTMGMT_ACK;
292 		toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
293 		/* TODO: Is T_DEFAULT the right thing to put in MGMT_flags? */
294 		toa->MGMT_flags = T_DEFAULT;
295 
296 		/* Now walk the table of options passed in */
297 		opt = (struct opthdr *)&toa[1];
298 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
299 			/*
300 			 * All the options in the table of options passed
301 			 * in are by definition supported by the protocol
302 			 * calling this function.
303 			 */
304 			if (!OA_READ_PERMISSION(optd, cr))
305 				continue;
306 			opt->level = optd->opdes_level;
307 			opt->name = optd->opdes_name;
308 			if (!(optd->opdes_props & OP_DEF_FN) ||
309 			    ((len = (*deffn)(q, opt->level,
310 			    opt->name, (uchar_t *)&opt[1])) < 0)) {
311 				/*
312 				 * Fill length and value from table.
313 				 *
314 				 * Default value not instantiated from function
315 				 * (or the protocol specific function failed it;
316 				 * In this interpretation of T_DEFAULT, this is
317 				 * the best we can do)
318 				 */
319 				switch (optd->opdes_size) {
320 				/*
321 				 * Since options are guaranteed aligned only
322 				 * on a 4 byte boundary (t_scalar_t) any
323 				 * option that is greater in size will default
324 				 * to the bcopy below
325 				 */
326 				case sizeof (int32_t):
327 					*(int32_t *)&opt[1] =
328 					    (int32_t)optd->opdes_default;
329 					break;
330 				case sizeof (int16_t):
331 					*(int16_t *)&opt[1] =
332 					    (int16_t)optd->opdes_default;
333 					break;
334 				case sizeof (int8_t):
335 					*(int8_t *)&opt[1] =
336 					    (int8_t)optd->opdes_default;
337 					break;
338 				default:
339 					/*
340 					 * other length but still assume
341 					 * fixed - use bcopy
342 					 */
343 					bcopy(optd->opdes_defbuf,
344 					    &opt[1], optd->opdes_size);
345 					break;
346 				}
347 				opt->len = optd->opdes_size;
348 			}
349 			else
350 				opt->len = (t_uscalar_t)len;
351 			opt = (struct opthdr *)((char *)&opt[1] +
352 			    _TPI_ALIGN_OPT(opt->len));
353 		}
354 
355 		/* Now record the final length. */
356 		toa->OPT_length = (t_scalar_t)((char *)opt - (char *)&toa[1]);
357 		mp->b_wptr = (uchar_t *)opt;
358 		mp->b_datap->db_type = M_PCPROTO;
359 		/* Ship it back. */
360 		qreply(q, mp);
361 		return (0);
362 	}
363 	/* T_DEFAULT processing complete - no more T_DEFAULT */
364 
365 	/*
366 	 * For T_NEGOTIATE, T_CURRENT, and T_CHECK requests, we make a
367 	 * pass through the input buffer validating the details and
368 	 * making sure each option is supported by the protocol.
369 	 */
370 	if ((opt_start = (struct opthdr *)mi_offset_param(mp,
371 	    tor->OPT_offset, tor->OPT_length)) == NULL)
372 		goto bad_opt;
373 	if (!__TPI_OPT_ISALIGNED(opt_start))
374 		goto bad_opt;
375 
376 	opt_end = (struct opthdr *)((uchar_t *)opt_start +
377 	    tor->OPT_length);
378 
379 	for (opt = opt_start; opt < opt_end; opt = next_opt) {
380 		/*
381 		 * Verify we have room to reference the option header
382 		 * fields in the option buffer.
383 		 */
384 		if ((uchar_t *)opt + sizeof (struct opthdr) >
385 		    (uchar_t *)opt_end)
386 			goto bad_opt;
387 		/*
388 		 * We now compute pointer to next option in buffer 'next_opt'
389 		 * The next_opt computation above below 'opt->len' initialized
390 		 * by application which cannot be trusted. The usual value
391 		 * too large will be captured by the loop termination condition
392 		 * above. We check for the following which it will miss.
393 		 * 	-pointer space wraparound arithmetic overflow
394 		 *	-last option in buffer with 'opt->len' being too large
395 		 *	 (only reason 'next_opt' should equal or exceed
396 		 *	 'opt_end' for last option is roundup unless length is
397 		 *	 too-large/invalid)
398 		 */
399 		next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
400 		    _TPI_ALIGN_OPT(opt->len));
401 
402 		if ((uchar_t *)next_opt < (uchar_t *)&opt[1] ||
403 		    ((next_opt >= opt_end) &&
404 		    (((uchar_t *)next_opt - (uchar_t *)opt_end) >=
405 		    __TPI_ALIGN_SIZE)))
406 			goto bad_opt;
407 
408 		/* sanity check */
409 		if (opt->name == T_ALLOPT)
410 			goto bad_opt;
411 
412 		/* Find the option in the opt_arr. */
413 		if ((optd = opt_chk_lookup(opt->level, opt->name,
414 		    opt_arr, opt_arr_cnt)) == NULL) {
415 			/*
416 			 * Not found, that is a bad thing if
417 			 * the caller is a tpi provider
418 			 */
419 			if (topmost_tpiprovider)
420 				goto bad_opt;
421 			else
422 				continue; /* skip unmodified */
423 		}
424 
425 		/* Additional checks dependent on operation. */
426 		switch (tor->MGMT_flags) {
427 		case T_NEGOTIATE:
428 			if (!OA_WRITE_OR_EXECUTE(optd, cr)) {
429 				/* can't negotiate option */
430 				if (!(OA_MATCHED_PRIV(optd, cr)) &&
431 				    OA_WX_ANYPRIV(optd)) {
432 					/*
433 					 * not privileged but privilege
434 					 * will help negotiate option.
435 					 */
436 					optcom_err_ack(q, mp, TACCES, 0);
437 					return (0);
438 				} else
439 					goto bad_opt;
440 			}
441 			/*
442 			 * Verify size for options
443 			 * Note: For retaining compatibility with historical
444 			 * behavior, variable lengths options will have their
445 			 * length verified in the setfn() processing.
446 			 * In order to be compatible with SunOS 4.X we return
447 			 * EINVAL errors for bad lengths.
448 			 */
449 			if (!(optd->opdes_props & OP_VARLEN)) {
450 				/* fixed length - size must match */
451 				if (opt->len != optd->opdes_size) {
452 					optcom_err_ack(q, mp, TSYSERR, EINVAL);
453 					return (0);
454 				}
455 			}
456 			break;
457 
458 		case T_CHECK:
459 			if (!OA_RWX_ANYPRIV(optd))
460 				/* any of "rwx" permission but not not none */
461 				goto bad_opt;
462 			/*
463 			 * XXX Since T_CURRENT was not there in TLI and the
464 			 * official TLI inspired TPI standard, getsockopt()
465 			 * API uses T_CHECK (for T_CURRENT semantics)
466 			 * The following fallthru makes sense because of its
467 			 * historical use as semantic equivalent to T_CURRENT.
468 			 */
469 			/* FALLTHRU */
470 		case T_CURRENT:
471 			if (!OA_READ_PERMISSION(optd, cr)) {
472 				/* can't read option value */
473 				if (!(OA_MATCHED_PRIV(optd, cr)) &&
474 				    OA_R_ANYPRIV(optd)) {
475 					/*
476 					 * not privileged but privilege
477 					 * will help in reading option value.
478 					 */
479 					optcom_err_ack(q, mp, TACCES, 0);
480 					return (0);
481 				} else
482 					goto bad_opt;
483 			}
484 			break;
485 
486 		default:
487 			optcom_err_ack(q, mp, TBADFLAG, 0);
488 			return (0);
489 		}
490 		/* We liked it.  Keep going. */
491 	} /* end for loop scanning option buffer */
492 
493 	/* Now complete the operation as required. */
494 	switch (tor->MGMT_flags) {
495 	case T_CHECK:
496 		/*
497 		 * Historically used same as T_CURRENT (which was added to
498 		 * standard later). Code retained for compatibility.
499 		 */
500 		/* FALLTHROUGH */
501 	case T_CURRENT:
502 		/*
503 		 * Allocate a maximum size reply.  Perhaps we are supposed to
504 		 * assume that the input buffer includes space for the answers
505 		 * as well as the opthdrs, but we don't know that for sure.
506 		 * So, instead, we create a new output buffer, using the
507 		 * input buffer only as a list of options.
508 		 */
509 		max_optbuf_len = optcom_max_optbuf_len(opt_arr,
510 		    opt_arr_cnt);
511 		mp1 = allocb_cred(max_optbuf_len, cr);
512 		if (!mp1)
513 			goto no_mem;
514 		/* Initialize the header. */
515 		mp1->b_datap->db_type = M_PCPROTO;
516 		mp1->b_wptr = &mp1->b_rptr[sizeof (struct T_optmgmt_ack)];
517 		toa = (struct T_optmgmt_ack *)mp1->b_rptr;
518 		toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
519 		toa->MGMT_flags = tor->MGMT_flags;
520 		/*
521 		 * Walk through the input buffer again, this time adding
522 		 * entries to the output buffer for each option requested.
523 		 * Note, sanity of option header, last option etc, verified
524 		 * in first pass.
525 		 */
526 		opt1 = (struct opthdr *)&toa[1];
527 
528 		for (opt = opt_start; opt < opt_end; opt = next_opt) {
529 
530 			next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
531 			    _TPI_ALIGN_OPT(opt->len));
532 
533 			opt1->name = opt->name;
534 			opt1->level = opt->level;
535 			len = (*getfn)(q, opt->level,
536 			    opt->name, (uchar_t *)&opt1[1]);
537 			/*
538 			 * Failure means option is not recognized. Copy input
539 			 * buffer as is
540 			 */
541 			if (len < 0) {
542 				opt1->len = opt->len;
543 				bcopy(&opt[1], &opt1[1], opt->len);
544 			} else {
545 				opt1->len = (t_uscalar_t)len;
546 			}
547 			opt1 = (struct opthdr *)((uchar_t *)&opt1[1] +
548 			    _TPI_ALIGN_OPT(opt1->len));
549 		} /* end for loop */
550 
551 		/* Record the final length. */
552 		toa->OPT_length = (t_scalar_t)((uchar_t *)opt1 -
553 		    (uchar_t *)&toa[1]);
554 		mp1->b_wptr = (uchar_t *)opt1;
555 		/* Ditch the input buffer. */
556 		freemsg(mp);
557 		mp = mp1;
558 		/* Always let the next module look at the option. */
559 		pass_to_next = B_TRUE;
560 		break;
561 
562 	case T_NEGOTIATE:
563 		first_mp = allocb(sizeof (opt_restart_t), BPRI_LO);
564 		if (first_mp == NULL) {
565 			optcom_err_ack(q, mp, TSYSERR, ENOMEM);
566 			return (0);
567 		}
568 		first_mp->b_datap->db_type = M_CTL;
569 		or = (opt_restart_t *)first_mp->b_rptr;
570 		or->or_start = opt_start;
571 		or->or_end =  opt_end;
572 		or->or_type = T_SVR4_OPTMGMT_REQ;
573 		or->or_private = 0;
574 		first_mp->b_cont = mp;
575 restart:
576 		/*
577 		 * Here we are expecting that the response buffer is exactly
578 		 * the same size as the input buffer.  We pass each opthdr
579 		 * to the protocol's set function.  If the protocol doesn't
580 		 * like it, it can update the value in it return argument.
581 		 */
582 		/*
583 		 * Pass each negotiated option through the protocol set
584 		 * function.
585 		 * Note: sanity check on option header values done in first
586 		 * pass and not repeated here.
587 		 */
588 		toa = (struct T_optmgmt_ack *)tor;
589 
590 		for (opt = is_restart ? restart_opt: opt_start; opt < opt_end;
591 		    opt = next_opt) {
592 			int error;
593 
594 			/*
595 			 * Point to the current option in or, in case this
596 			 * option has to be restarted later on
597 			 */
598 			or->or_ropt = opt;
599 			next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
600 			    _TPI_ALIGN_OPT(opt->len));
601 
602 			error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE,
603 			    opt->level, opt->name,
604 			    opt->len, (uchar_t *)&opt[1],
605 			    &opt->len, (uchar_t *)&opt[1], NULL, cr, first_mp);
606 			/*
607 			 * Treat positive "errors" as real.
608 			 * Note: negative errors are to be treated as
609 			 * non-fatal by svr4_optcom_req() and are
610 			 * returned by setfn() when it is passed an
611 			 * option it does not handle. Since the option
612 			 * passed opt_chk_lookup(), it is implied that
613 			 * it is valid but was either handled upstream
614 			 * or will be handled downstream.
615 			 */
616 			if (error == EINPROGRESS) {
617 				/*
618 				 * The message is queued and will be
619 				 * reprocessed later. Typically ip queued
620 				 * the message to get some exclusive conditions
621 				 * and later on calls this func again.
622 				 */
623 				return (EINPROGRESS);
624 			} else if (error > 0) {
625 				optcom_err_ack(q, mp, TSYSERR, error);
626 				freeb(first_mp);
627 				return (0);
628 			}
629 			/*
630 			 * error < 0 means option is not recognized.
631 			 * But with OP_PASSNEXT the next module
632 			 * might recognize it.
633 			 */
634 		}
635 		/* Done with the restart control mp. */
636 		freeb(first_mp);
637 		pass_to_next = B_TRUE;
638 		break;
639 	default:
640 		optcom_err_ack(q, mp, TBADFLAG, 0);
641 		return (0);
642 	}
643 
644 	if (pass_to_next && (q->q_next != NULL || pass_to_ip)) {
645 		/* Send it down to the next module and let it reply */
646 		toa->PRIM_type = T_SVR4_OPTMGMT_REQ; /* Changed by IP to ACK */
647 		if (q->q_next != NULL)
648 			putnext(q, mp);
649 		else
650 			ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
651 	} else {
652 		/* Set common fields in the header. */
653 		toa->MGMT_flags = T_SUCCESS;
654 		mp->b_datap->db_type = M_PCPROTO;
655 		toa->PRIM_type = T_OPTMGMT_ACK;
656 		qreply(q, mp);
657 	}
658 	return (0);
659 bad_opt:;
660 	optcom_err_ack(q, mp, TBADOPT, 0);
661 	return (0);
662 }
663 
664 /*
665  * New optcom_req inspired by TPI/XTI semantics
666  */
667 int
668 tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
669     boolean_t pass_to_ip)
670 {
671 	t_scalar_t t_error;
672 	mblk_t *toa_mp;
673 	boolean_t pass_to_next;
674 	size_t toa_len;
675 	struct T_optmgmt_ack *toa;
676 	struct T_optmgmt_req *tor =
677 	    (struct T_optmgmt_req *)mp->b_rptr;
678 
679 	opt_restart_t *or;
680 	boolean_t is_restart = B_FALSE;
681 	mblk_t	*first_mp = NULL;
682 	t_uscalar_t worst_status;
683 	boolean_t queued_status;
684 
685 	/*
686 	 * Allocate M_CTL and prepend to the packet for restarting this
687 	 * option if needed. IP may need to queue and restart the option
688 	 * if it cannot obtain exclusive conditions immediately. Please see
689 	 * IP-MT notes before the start of svr4_optcom_req
690 	 */
691 	if (mp->b_datap->db_type == M_CTL) {
692 		is_restart = B_TRUE;
693 		first_mp = mp;
694 		toa_mp = mp->b_cont;
695 		mp = toa_mp->b_cont;
696 		ASSERT(mp->b_wptr - mp->b_rptr >=
697 		    sizeof (struct T_optmgmt_req));
698 		tor = (struct T_optmgmt_req *)mp->b_rptr;
699 		ASSERT(tor->MGMT_flags == T_NEGOTIATE);
700 
701 		or = (opt_restart_t *)first_mp->b_rptr;
702 		goto restart;
703 	}
704 
705 	/* Verify message integrity. */
706 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_optmgmt_req)) {
707 		optcom_err_ack(q, mp, TBADOPT, 0);
708 		return (0);
709 	}
710 
711 	/* Verify MGMT_flags legal */
712 	switch (tor->MGMT_flags) {
713 	case T_DEFAULT:
714 	case T_NEGOTIATE:
715 	case T_CURRENT:
716 	case T_CHECK:
717 		/* OK - legal request flags */
718 		break;
719 	default:
720 		optcom_err_ack(q, mp, TBADFLAG, 0);
721 		return (0);
722 	}
723 
724 	/*
725 	 * In this design, there are two passes required on the input buffer
726 	 * mostly to accomodate variable length options and "T_ALLOPT" option
727 	 * which has the semantics "all options of the specified level".
728 	 *
729 	 * For T_DEFAULT, T_NEGOTIATE, T_CURRENT, and T_CHECK requests, we make
730 	 * a pass through the input buffer validating the details and making
731 	 * sure each option is supported by the protocol. We also determine the
732 	 * length of the option buffer to return. (Variable length options and
733 	 * T_ALLOPT mean that length can be different for output buffer).
734 	 */
735 
736 	pass_to_next = B_FALSE;	/* initial value */
737 	toa_len = 0;		/* initial value */
738 
739 	/*
740 	 * First pass, we do the following
741 	 *	- estimate cumulative length needed for results
742 	 *	- set "status" field based on permissions, option header check
743 	 *	  etc.
744 	 *	- determine "pass_to_next" whether we need to send request to
745 	 *	  downstream module/driver.
746 	 */
747 	if ((t_error = process_topthdrs_first_pass(mp, cr, dbobjp,
748 	    &pass_to_next, &toa_len)) != 0) {
749 		optcom_err_ack(q, mp, t_error, 0);
750 		return (0);
751 	}
752 
753 	/*
754 	 * A validation phase of the input buffer is done. We have also
755 	 * obtained the length requirement and and other details about the
756 	 * input and we liked input buffer so far.  We make another scan
757 	 * through the input now and generate the output necessary to complete
758 	 * the operation.
759 	 */
760 
761 	toa_mp = allocb_cred(toa_len, cr);
762 	if (!toa_mp) {
763 		optcom_err_ack(q, mp, TSYSERR, ENOMEM);
764 		return (0);
765 	}
766 
767 	first_mp = allocb(sizeof (opt_restart_t), BPRI_LO);
768 	if (first_mp == NULL) {
769 		freeb(toa_mp);
770 		optcom_err_ack(q, mp, TSYSERR, ENOMEM);
771 		return (0);
772 	}
773 	first_mp->b_datap->db_type = M_CTL;
774 	or = (opt_restart_t *)first_mp->b_rptr;
775 	/*
776 	 * Set initial values for generating output.
777 	 */
778 	or->or_worst_status = T_SUCCESS;
779 	or->or_type = T_OPTMGMT_REQ;
780 	or->or_private = 0;
781 	/* remaining fields fileed in do_options_second_pass */
782 
783 restart:
784 	/*
785 	 * This routine makes another pass through the option buffer this
786 	 * time acting on the request based on "status" result in the
787 	 * first pass. It also performs "expansion" of T_ALLOPT into
788 	 * all options of a certain level and acts on each for this request.
789 	 */
790 	if ((t_error = do_options_second_pass(q, mp, toa_mp, cr, dbobjp,
791 	    first_mp, is_restart, &queued_status)) != 0) {
792 		freemsg(toa_mp);
793 		optcom_err_ack(q, mp, t_error, 0);
794 		return (0);
795 	}
796 	if (queued_status) {
797 		/* Option will be restarted */
798 		return (EINPROGRESS);
799 	}
800 	worst_status = or->or_worst_status;
801 	/* Done with the first mp */
802 	freeb(first_mp);
803 	toa_mp->b_cont = NULL;
804 
805 	/*
806 	 * Following code relies on the coincidence that T_optmgmt_req
807 	 * and T_optmgmt_ack are identical in binary representation
808 	 */
809 	toa = (struct T_optmgmt_ack *)toa_mp->b_rptr;
810 	toa->OPT_length = (t_scalar_t)(toa_mp->b_wptr - (toa_mp->b_rptr +
811 	    sizeof (struct T_optmgmt_ack)));
812 	toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
813 
814 	toa->MGMT_flags = tor->MGMT_flags;
815 
816 
817 	freemsg(mp);		/* free input mblk */
818 
819 	/*
820 	 * If there is atleast one option that requires a downstream
821 	 * forwarding and if it is possible, we forward the message
822 	 * downstream. Else we ack it.
823 	 */
824 	if (pass_to_next && (q->q_next != NULL || pass_to_ip)) {
825 		/*
826 		 * We pass it down as T_OPTMGMT_REQ. This code relies
827 		 * on the happy coincidence that T_optmgmt_req and
828 		 * T_optmgmt_ack are identical data structures
829 		 * at the binary representation level.
830 		 */
831 		toa_mp->b_datap->db_type = M_PROTO;
832 		toa->PRIM_type = T_OPTMGMT_REQ;
833 		if (q->q_next != NULL)
834 			putnext(q, toa_mp);
835 		else
836 			ip_output(Q_TO_CONN(q), toa_mp, q, IP_WPUT);
837 	} else {
838 		toa->PRIM_type = T_OPTMGMT_ACK;
839 		toa_mp->b_datap->db_type = M_PCPROTO;
840 		toa->MGMT_flags |= worst_status; /* XXX "worst" or "OR" TPI ? */
841 		qreply(q, toa_mp);
842 	}
843 	return (0);
844 }
845 
846 
847 /*
848  * Following routine makes a pass through option buffer in mp and performs the
849  * following tasks.
850  *	- estimate cumulative length needed for results
851  *	- set "status" field based on permissions, option header check
852  *	  etc.
853  *	- determine "pass_to_next" whether we need to send request to
854  *	  downstream module/driver.
855  */
856 
857 static t_scalar_t
858 process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
859     boolean_t *pass_to_nextp, size_t *toa_lenp)
860 {
861 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
862 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
863 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
864 	optlevel_t *valid_level_arr = dbobjp->odb_valid_levels_arr;
865 	uint_t valid_level_arr_cnt = dbobjp->odb_valid_levels_arr_cnt;
866 	struct T_opthdr *opt;
867 	struct T_opthdr *opt_start, *opt_end;
868 	opdes_t	*optd;
869 	size_t allopt_len;
870 	struct T_optmgmt_req *tor =
871 	    (struct T_optmgmt_req *)mp->b_rptr;
872 
873 	*toa_lenp = sizeof (struct T_optmgmt_ack); /* initial value */
874 
875 	if ((opt_start = (struct T_opthdr *)
876 	    mi_offset_param(mp, tor->OPT_offset, tor->OPT_length)) == NULL) {
877 		return (TBADOPT);
878 	}
879 	if (!__TPI_TOPT_ISALIGNED(opt_start))
880 		return (TBADOPT);
881 
882 	opt_end = (struct T_opthdr *)((uchar_t *)opt_start + tor->OPT_length);
883 
884 	for (opt = opt_start; opt && (opt < opt_end);
885 	    opt = _TPI_TOPT_NEXTHDR(opt_start, tor->OPT_length, opt)) {
886 		/*
887 		 * Validate the option for length and alignment
888 		 * before accessing anything in it.
889 		 */
890 		if (!(_TPI_TOPT_VALID(opt, opt_start, opt_end)))
891 			return (TBADOPT);
892 
893 		/* Find the option in the opt_arr. */
894 		if (opt->name != T_ALLOPT) {
895 			optd = opt_chk_lookup(opt->level, opt->name,
896 			    opt_arr, opt_arr_cnt);
897 			if (optd == NULL) {
898 				/*
899 				 * Option not found
900 				 *
901 				 * Verify if level is "valid" or not.
902 				 * Note: This check is required by XTI
903 				 *
904 				 * TPI provider always initializes
905 				 * the "not supported" (or whatever) status
906 				 * for the options. Other levels leave status
907 				 * unchanged if they do not understand an
908 				 * option.
909 				 */
910 				if (topmost_tpiprovider) {
911 					if (!opt_level_valid(opt->level,
912 					    valid_level_arr,
913 					    valid_level_arr_cnt))
914 						return (TBADOPT);
915 					/*
916 					 * level is valid - initialize
917 					 * option as not supported
918 					 */
919 					opt->status = T_NOTSUPPORT;
920 				}
921 
922 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
923 				continue;
924 			}
925 		} else {
926 			/*
927 			 * Handle T_ALLOPT case as a special case.
928 			 * Note: T_ALLOPT does not mean anything
929 			 * for T_CHECK operation.
930 			 */
931 			allopt_len = 0;
932 			if (tor->MGMT_flags == T_CHECK ||
933 			    !topmost_tpiprovider ||
934 			    ((allopt_len = opt_level_allopts_lengths(opt->level,
935 			    opt_arr, opt_arr_cnt)) == 0)) {
936 				/*
937 				 * This is confusing but correct !
938 				 * It is not valid to to use T_ALLOPT with
939 				 * T_CHECK flag.
940 				 *
941 				 * T_ALLOPT is assumed "expanded" at the
942 				 * topmost_tpiprovider level so it should not
943 				 * be there as an "option name" if this is not
944 				 * a topmost_tpiprovider call and we fail it.
945 				 *
946 				 * opt_level_allopts_lengths() is used to verify
947 				 * that "level" associated with the T_ALLOPT is
948 				 * supported.
949 				 *
950 				 */
951 				opt->status = T_FAILURE;
952 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
953 				continue;
954 			}
955 			ASSERT(allopt_len != 0); /* remove ? */
956 
957 			*toa_lenp += allopt_len;
958 			opt->status = T_SUCCESS;
959 			/* XXX - always set T_ALLOPT 'pass_to_next' for now */
960 			*pass_to_nextp = B_TRUE;
961 			continue;
962 		}
963 		/*
964 		 * Check if option wants to flow downstream
965 		 */
966 		if (optd->opdes_props & OP_PASSNEXT)
967 			*pass_to_nextp = B_TRUE;
968 
969 		/* Additional checks dependent on operation. */
970 		switch (tor->MGMT_flags) {
971 		case T_DEFAULT:
972 		case T_CURRENT:
973 
974 			/*
975 			 * The opt_chk_lookup() routine call above approved of
976 			 * this option so we can work on the status for it
977 			 * based on the permissions for the operation. (This
978 			 * can override any status for it set at higher levels)
979 			 * We assume this override is OK since chkfn at this
980 			 * level approved of this option.
981 			 *
982 			 * T_CURRENT semantics:
983 			 * The read access is required. Else option
984 			 * status is T_NOTSUPPORT.
985 			 *
986 			 * T_DEFAULT semantics:
987 			 * Note: specification is not clear on this but we
988 			 * interpret T_DEFAULT semantics such that access to
989 			 * read value is required for access even the default
990 			 * value. Otherwise the option status is T_NOTSUPPORT.
991 			 */
992 			if (!OA_READ_PERMISSION(optd, cr)) {
993 				opt->status = T_NOTSUPPORT;
994 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
995 				/* skip to next */
996 				continue;
997 			}
998 
999 			/*
1000 			 * T_DEFAULT/T_CURRENT semantics:
1001 			 * We know that read access is set. If no other access
1002 			 * is set, then status is T_READONLY.
1003 			 */
1004 			if (OA_READONLY_PERMISSION(optd, cr))
1005 				opt->status = T_READONLY;
1006 			else
1007 				opt->status = T_SUCCESS;
1008 			/*
1009 			 * Option passes all checks. Make room for it in the
1010 			 * ack. Note: size stored in table does not include
1011 			 * space for option header.
1012 			 */
1013 			*toa_lenp += sizeof (struct T_opthdr) +
1014 			    _TPI_ALIGN_TOPT(optd->opdes_size);
1015 			break;
1016 
1017 		case T_CHECK:
1018 		case T_NEGOTIATE:
1019 
1020 			/*
1021 			 * T_NEGOTIATE semantics:
1022 			 * If for fixed length option value on input is not the
1023 			 * same as value supplied, then status is T_FAILURE.
1024 			 *
1025 			 * T_CHECK semantics:
1026 			 * If value is supplied, semantics same as T_NEGOTIATE.
1027 			 * It is however ok not to supply a value with T_CHECK.
1028 			 */
1029 
1030 			if (tor->MGMT_flags == T_NEGOTIATE ||
1031 			    (opt->len != sizeof (struct T_opthdr))) {
1032 				/*
1033 				 * Implies "value" is specified in T_CHECK or
1034 				 * it is a T_NEGOTIATE request.
1035 				 * Verify size.
1036 				 * Note: This can override anything about this
1037 				 * option request done at a higher level.
1038 				 */
1039 				if (!opt_length_ok(optd, opt)) {
1040 					/* bad size */
1041 					*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
1042 					opt->status = T_FAILURE;
1043 					continue;
1044 				}
1045 			}
1046 			/*
1047 			 * The opt_chk_lookup()  routine above() approved of
1048 			 * this option so we can work on the status for it based
1049 			 * on the permissions for the operation. (This can
1050 			 * override anything set at a higher level).
1051 			 *
1052 			 * T_CHECK/T_NEGOTIATE semantics:
1053 			 * Set status to T_READONLY if read is the only access
1054 			 * permitted
1055 			 */
1056 			if (OA_READONLY_PERMISSION(optd, cr)) {
1057 				opt->status = T_READONLY;
1058 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
1059 				/* skip to next */
1060 				continue;
1061 			}
1062 
1063 			/*
1064 			 * T_CHECK/T_NEGOTIATE semantics:
1065 			 * If write (or execute) access is not set, then status
1066 			 * is T_NOTSUPPORT.
1067 			 */
1068 			if (!OA_WRITE_OR_EXECUTE(optd, cr)) {
1069 				opt->status = T_NOTSUPPORT;
1070 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
1071 				/* skip to next option */
1072 				continue;
1073 			}
1074 			/*
1075 			 * Option passes all checks. Make room for it in the
1076 			 * ack and set success in status.
1077 			 * Note: size stored in table does not include header
1078 			 * length.
1079 			 */
1080 			opt->status = T_SUCCESS;
1081 			*toa_lenp += sizeof (struct T_opthdr) +
1082 			    _TPI_ALIGN_TOPT(optd->opdes_size);
1083 			break;
1084 
1085 		default:
1086 			return (TBADFLAG);
1087 		}
1088 	} /* for loop scanning input buffer */
1089 
1090 	return (0);		/* OK return */
1091 }
1092 
1093 /*
1094  * This routine makes another pass through the option buffer this
1095  * time acting on the request based on "status" result in the
1096  * first pass. It also performs "expansion" of T_ALLOPT into
1097  * all options of a certain level and acts on each for this request.
1098  */
1099 static t_scalar_t
1100 do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr,
1101     optdb_obj_t *dbobjp, mblk_t *first_mp, boolean_t is_restart,
1102     boolean_t *queued_statusp)
1103 {
1104 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1105 	int failed_option;
1106 	struct T_opthdr *opt;
1107 	struct T_opthdr *opt_start, *opt_end, *restart_opt;
1108 	uchar_t *optr;
1109 	uint_t optset_context;
1110 	struct T_optmgmt_req *tor = (struct T_optmgmt_req *)reqmp->b_rptr;
1111 	opt_restart_t	*or;
1112 	t_uscalar_t	*worst_statusp;
1113 	int	err;
1114 
1115 	*queued_statusp = B_FALSE;
1116 	or = (opt_restart_t *)first_mp->b_rptr;
1117 	worst_statusp = &or->or_worst_status;
1118 
1119 	optr = (uchar_t *)ack_mp->b_rptr +
1120 	    sizeof (struct T_optmgmt_ack); /* assumed int32_t aligned */
1121 
1122 	/*
1123 	 * Set initial values for scanning input
1124 	 */
1125 	if (is_restart) {
1126 		opt_start = (struct T_opthdr *)or->or_start;
1127 		opt_end = (struct T_opthdr *)or->or_end;
1128 		restart_opt = (struct T_opthdr *)or->or_ropt;
1129 	} else {
1130 		opt_start = (struct T_opthdr *)mi_offset_param(reqmp,
1131 		    tor->OPT_offset, tor->OPT_length);
1132 		if (opt_start == NULL)
1133 			return (TBADOPT);
1134 		opt_end = (struct T_opthdr *)((uchar_t *)opt_start +
1135 		    tor->OPT_length);
1136 		or->or_start = (struct opthdr *)opt_start;
1137 		or->or_end = (struct opthdr *)opt_end;
1138 		/*
1139 		 * construct the mp chain, in case the setfn needs to
1140 		 * queue this and restart option processing later on.
1141 		 */
1142 		first_mp->b_cont = ack_mp;
1143 		ack_mp->b_cont = reqmp;
1144 	}
1145 	ASSERT(__TPI_TOPT_ISALIGNED(opt_start)); /* verified in first pass */
1146 
1147 	for (opt = is_restart ? restart_opt : opt_start;
1148 	    opt && (opt < opt_end);
1149 	    opt = _TPI_TOPT_NEXTHDR(opt_start, tor->OPT_length, opt)) {
1150 		or->or_ropt = (struct opthdr *)opt;
1151 		/* verified in first pass */
1152 		ASSERT(_TPI_TOPT_VALID(opt, opt_start, opt_end));
1153 
1154 		/*
1155 		 * If the first pass in process_topthdrs_first_pass()
1156 		 * has marked the option as a failure case for the MGMT_flags
1157 		 * semantics then there is not much to do.
1158 		 *
1159 		 * Note: For all practical purposes, T_READONLY status is
1160 		 * a "success" for T_DEFAULT/T_CURRENT and "failure" for
1161 		 * T_CHECK/T_NEGOTIATE
1162 		 */
1163 		failed_option =
1164 		    (opt->status == T_NOTSUPPORT) ||
1165 		    (opt->status == T_FAILURE) ||
1166 		    ((tor->MGMT_flags & (T_NEGOTIATE|T_CHECK)) &&
1167 		    (opt->status == T_READONLY));
1168 
1169 		if (failed_option) {
1170 			/*
1171 			 * According to T_DEFAULT/T_CURRENT semantics, the
1172 			 * input values, even if present, are to be ignored.
1173 			 * Note: Specification is not clear on this, but we
1174 			 * interpret that even though we ignore the values, we
1175 			 * can return them as is. So we process them similar to
1176 			 * T_CHECK/T_NEGOTIATE case which has the semantics to
1177 			 * return the values as is. XXX If interpretation is
1178 			 * ever determined incorrect fill in appropriate code
1179 			 * here to treat T_DEFAULT/T_CURRENT differently.
1180 			 *
1181 			 * According to T_CHECK/T_NEGOTIATE semantics,
1182 			 * in the case of T_NOTSUPPORT/T_FAILURE/T_READONLY,
1183 			 * the semantics are to return the "value" part of
1184 			 * option untouched. So here we copy the option
1185 			 * head including value part if any to output.
1186 			 */
1187 
1188 			bcopy(opt, optr, opt->len);
1189 			optr += _TPI_ALIGN_TOPT(opt->len);
1190 
1191 			*worst_statusp = get_worst_status(opt->status,
1192 			    *worst_statusp);
1193 
1194 			/* skip to process next option in buffer */
1195 			continue;
1196 
1197 		} /* end if "failed option" */
1198 		/*
1199 		 * The status is T_SUCCESS or T_READONLY
1200 		 * We process the value part here
1201 		 */
1202 		ASSERT(opt->status == T_SUCCESS || opt->status == T_READONLY);
1203 		switch (tor->MGMT_flags) {
1204 		case T_DEFAULT:
1205 			/*
1206 			 * We fill default value from table or protocol specific
1207 			 * function. If this call fails, we pass input through.
1208 			 */
1209 			if (do_opt_default(q, opt, &optr, worst_statusp,
1210 			    cr, dbobjp) < 0) {
1211 				/* fail or pass transparently */
1212 				if (topmost_tpiprovider)
1213 					opt->status = T_FAILURE;
1214 				bcopy(opt, optr, opt->len);
1215 				optr += _TPI_ALIGN_TOPT(opt->len);
1216 				*worst_statusp = get_worst_status(opt->status,
1217 				    *worst_statusp);
1218 			}
1219 			break;
1220 
1221 		case T_CURRENT:
1222 
1223 			do_opt_current(q, opt, &optr, worst_statusp, cr,
1224 			    dbobjp);
1225 			break;
1226 
1227 		case T_CHECK:
1228 		case T_NEGOTIATE:
1229 			if (tor->MGMT_flags == T_CHECK)
1230 				optset_context = SETFN_OPTCOM_CHECKONLY;
1231 			else	/* T_NEGOTIATE */
1232 				optset_context = SETFN_OPTCOM_NEGOTIATE;
1233 			err = do_opt_check_or_negotiate(q, opt, optset_context,
1234 			    &optr, worst_statusp, cr, dbobjp, first_mp);
1235 			if (err == EINPROGRESS) {
1236 				*queued_statusp = B_TRUE;
1237 				return (0);
1238 			}
1239 			break;
1240 		default:
1241 			return (TBADFLAG);
1242 		}
1243 	} /* end for loop scanning option buffer */
1244 
1245 	ack_mp->b_wptr = optr;
1246 	ASSERT(ack_mp->b_wptr <= ack_mp->b_datap->db_lim);
1247 
1248 	return (0);		/* OK return */
1249 }
1250 
1251 
1252 static t_uscalar_t
1253 get_worst_status(t_uscalar_t status, t_uscalar_t current_worst_status)
1254 {
1255 	/*
1256 	 * Return the "worst" among the arguments "status" and
1257 	 * "current_worst_status".
1258 	 *
1259 	 * Note: Tracking "worst_status" can be made a bit simpler
1260 	 * if we use the property that status codes are bitwise
1261 	 * distinct.
1262 	 *
1263 	 * The pecking order is
1264 	 *
1265 	 * T_SUCCESS ..... best
1266 	 * T_PARTSUCCESS
1267 	 * T_FAILURE
1268 	 * T_READONLY
1269 	 * T_NOTSUPPORT... worst
1270 	 */
1271 	if (status == current_worst_status)
1272 		return (current_worst_status);
1273 	switch (current_worst_status) {
1274 	case T_SUCCESS:
1275 		if (status == T_PARTSUCCESS)
1276 			return (T_PARTSUCCESS);
1277 		/* FALLTHROUGH */
1278 	case T_PARTSUCCESS:
1279 		if (status == T_FAILURE)
1280 			return (T_FAILURE);
1281 		/* FALLTHROUGH */
1282 	case T_FAILURE:
1283 		if (status == T_READONLY)
1284 			return (T_READONLY);
1285 		/* FALLTHROUGH */
1286 	case T_READONLY:
1287 		if (status == T_NOTSUPPORT)
1288 			return (T_NOTSUPPORT);
1289 		/* FALLTHROUGH */
1290 	case T_NOTSUPPORT:
1291 	default:
1292 		return (current_worst_status);
1293 	}
1294 }
1295 
1296 static int
1297 do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
1298     t_uscalar_t *worst_statusp, cred_t *cr, optdb_obj_t *dbobjp)
1299 {
1300 	pfi_t	deffn = dbobjp->odb_deffn;
1301 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
1302 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1303 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1304 
1305 	struct T_opthdr *topth;
1306 	opdes_t *optd;
1307 
1308 	if (reqopt->name != T_ALLOPT) {
1309 		/*
1310 		 * lookup the option in the table and fill default value
1311 		 */
1312 		optd = opt_chk_lookup(reqopt->level, reqopt->name,
1313 		    opt_arr, opt_arr_cnt);
1314 
1315 		if (optd == NULL) {
1316 			/*
1317 			 * not found - fail this one. Should not happen
1318 			 * for topmost_tpiprovider as calling routine
1319 			 * should have verified it.
1320 			 */
1321 			ASSERT(!topmost_tpiprovider);
1322 			return (-1);
1323 		}
1324 
1325 		topth = (struct T_opthdr *)(*resptrp);
1326 		topth->level = reqopt->level;
1327 		topth->name = reqopt->name;
1328 		topth->status = reqopt->status;
1329 
1330 		*worst_statusp = get_worst_status(reqopt->status,
1331 		    *worst_statusp);
1332 
1333 		if (optd->opdes_props & OP_NODEFAULT) {
1334 			/* header only, no default "value" part */
1335 			topth->len = sizeof (struct T_opthdr);
1336 			*resptrp += sizeof (struct T_opthdr);
1337 		} else {
1338 			int deflen;
1339 
1340 			if (optd->opdes_props & OP_DEF_FN) {
1341 				deflen = (*deffn)(q, reqopt->level,
1342 				    reqopt->name, _TPI_TOPT_DATA(topth));
1343 				if (deflen >= 0) {
1344 					topth->len = (t_uscalar_t)
1345 					    (sizeof (struct T_opthdr) + deflen);
1346 				} else {
1347 					/*
1348 					 * return error, this should 'pass
1349 					 * through' the option and maybe some
1350 					 * other level will fill it in or
1351 					 * already did.
1352 					 * (No change in 'resptrp' upto here)
1353 					 */
1354 					return (-1);
1355 				}
1356 			} else {
1357 				/* fill length and value part */
1358 				switch (optd->opdes_size) {
1359 				/*
1360 				 * Since options are guaranteed aligned only
1361 				 * on a 4 byte boundary (t_scalar_t) any
1362 				 * option that is greater in size will default
1363 				 * to the bcopy below
1364 				 */
1365 				case sizeof (int32_t):
1366 					*(int32_t *)_TPI_TOPT_DATA(topth) =
1367 					    (int32_t)optd->opdes_default;
1368 					break;
1369 				case sizeof (int16_t):
1370 					*(int16_t *)_TPI_TOPT_DATA(topth) =
1371 					    (int16_t)optd->opdes_default;
1372 					break;
1373 				case sizeof (int8_t):
1374 					*(int8_t *)_TPI_TOPT_DATA(topth) =
1375 					    (int8_t)optd->opdes_default;
1376 					break;
1377 				default:
1378 					/*
1379 					 * other length but still assume
1380 					 * fixed - use bcopy
1381 					 */
1382 					bcopy(optd->opdes_defbuf,
1383 					    _TPI_TOPT_DATA(topth),
1384 					    optd->opdes_size);
1385 					break;
1386 				}
1387 				topth->len = (t_uscalar_t)(optd->opdes_size +
1388 				    sizeof (struct T_opthdr));
1389 			}
1390 			*resptrp += _TPI_ALIGN_TOPT(topth->len);
1391 		}
1392 		return (0);	/* OK return */
1393 	}
1394 
1395 	/*
1396 	 * T_ALLOPT processing
1397 	 *
1398 	 * lookup and stuff default values of all the options of the
1399 	 * level specified
1400 	 * Note: This expansion of T_ALLOPT should happen in
1401 	 * a topmost_tpiprovider.
1402 	 */
1403 	ASSERT(topmost_tpiprovider);
1404 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
1405 		if (reqopt->level != optd->opdes_level)
1406 			continue;
1407 		/*
1408 		 *
1409 		 * T_DEFAULT semantics:
1410 		 * XXX: we interpret T_DEFAULT semantics such that access to
1411 		 * read value is required for access even the default value.
1412 		 * Else option is ignored for T_ALLOPT request.
1413 		 */
1414 		if (!OA_READ_PERMISSION(optd, cr))
1415 			/* skip this one */
1416 			continue;
1417 
1418 		/*
1419 		 * Found option of same level as T_ALLOPT request
1420 		 * that we can return.
1421 		 */
1422 
1423 		topth = (struct T_opthdr *)(*resptrp);
1424 		topth->level = optd->opdes_level;
1425 		topth->name = optd->opdes_name;
1426 
1427 		/*
1428 		 * T_DEFAULT semantics:
1429 		 * We know that read access is set. If no other access is set,
1430 		 * then status is T_READONLY
1431 		 */
1432 		if (OA_READONLY_PERMISSION(optd, cr)) {
1433 			topth->status = T_READONLY;
1434 			*worst_statusp = get_worst_status(T_READONLY,
1435 			    *worst_statusp);
1436 		} else {
1437 			topth->status = T_SUCCESS;
1438 			/*
1439 			 * Note: *worst_statusp has to be T_SUCCESS or
1440 			 * worse so no need to adjust
1441 			 */
1442 		}
1443 
1444 		if (optd->opdes_props & OP_NODEFAULT) {
1445 			/* header only, no value part */
1446 			topth->len = sizeof (struct T_opthdr);
1447 			*resptrp += sizeof (struct T_opthdr);
1448 		} else {
1449 			int deflen;
1450 
1451 			if (optd->opdes_props & OP_DEF_FN) {
1452 				deflen = (*deffn)(q, reqopt->level,
1453 				    reqopt->name, _TPI_TOPT_DATA(topth));
1454 				if (deflen >= 0) {
1455 					topth->len = (t_uscalar_t)(deflen +
1456 					    sizeof (struct T_opthdr));
1457 				} else {
1458 					/*
1459 					 * deffn failed.
1460 					 * return just the header as T_ALLOPT
1461 					 * expansion.
1462 					 * Some other level deffn may
1463 					 * supply value part.
1464 					 */
1465 					topth->len = sizeof (struct T_opthdr);
1466 					topth->status = T_FAILURE;
1467 					*worst_statusp =
1468 					    get_worst_status(T_FAILURE,
1469 					    *worst_statusp);
1470 				}
1471 			} else {
1472 				/*
1473 				 * fill length and value part from
1474 				 * table
1475 				 */
1476 				switch (optd->opdes_size) {
1477 				/*
1478 				 * Since options are guaranteed aligned only
1479 				 * on a 4 byte boundary (t_scalar_t) any
1480 				 * option that is greater in size will default
1481 				 * to the bcopy below
1482 				 */
1483 				case sizeof (int32_t):
1484 					*(int32_t *)_TPI_TOPT_DATA(topth) =
1485 					    (int32_t)optd->opdes_default;
1486 					break;
1487 				case sizeof (int16_t):
1488 					*(int16_t *)_TPI_TOPT_DATA(topth) =
1489 					    (int16_t)optd->opdes_default;
1490 					break;
1491 				case sizeof (int8_t):
1492 					*(int8_t *)_TPI_TOPT_DATA(topth) =
1493 					    (int8_t)optd->opdes_default;
1494 					break;
1495 				default:
1496 					/*
1497 					 * other length but still assume
1498 					 * fixed - use bcopy
1499 					 */
1500 					bcopy(optd->opdes_defbuf,
1501 					    _TPI_TOPT_DATA(topth),
1502 					    optd->opdes_size);
1503 				}
1504 				topth->len = (t_uscalar_t)(optd->opdes_size +
1505 				    sizeof (struct T_opthdr));
1506 			}
1507 			*resptrp += _TPI_ALIGN_TOPT(topth->len);
1508 		}
1509 	}
1510 	return (0);
1511 }
1512 
1513 static void
1514 do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
1515     t_uscalar_t *worst_statusp, cred_t *cr, optdb_obj_t *dbobjp)
1516 {
1517 	pfi_t	getfn = dbobjp->odb_getfn;
1518 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
1519 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1520 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1521 
1522 	struct T_opthdr *topth;
1523 	opdes_t *optd;
1524 	int optlen;
1525 	uchar_t *initptr = *resptrp;
1526 
1527 	/*
1528 	 * We call getfn to get the current value of an option. The call may
1529 	 * fail in which case we copy the values from the input buffer. Maybe
1530 	 * something downstream will fill it in or something upstream did.
1531 	 */
1532 
1533 	if (reqopt->name != T_ALLOPT) {
1534 		topth = (struct T_opthdr *)*resptrp;
1535 		*resptrp += sizeof (struct T_opthdr);
1536 		optlen = (*getfn)(q, reqopt->level, reqopt->name, *resptrp);
1537 		if (optlen >= 0) {
1538 			topth->len = (t_uscalar_t)(optlen +
1539 			    sizeof (struct T_opthdr));
1540 			topth->level = reqopt->level;
1541 			topth->name = reqopt->name;
1542 			topth->status = reqopt->status;
1543 			*resptrp += _TPI_ALIGN_TOPT(optlen);
1544 			*worst_statusp = get_worst_status(topth->status,
1545 			    *worst_statusp);
1546 		} else {
1547 			/* failed - reset "*resptrp" pointer */
1548 			*resptrp -= sizeof (struct T_opthdr);
1549 		}
1550 	} else {		/* T_ALLOPT processing */
1551 		ASSERT(topmost_tpiprovider == B_TRUE);
1552 		/* scan and get all options */
1553 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
1554 			/* skip other levels */
1555 			if (reqopt->level != optd->opdes_level)
1556 				continue;
1557 
1558 			if (!OA_READ_PERMISSION(optd, cr))
1559 				/* skip this one */
1560 				continue;
1561 
1562 			topth = (struct T_opthdr *)*resptrp;
1563 			*resptrp += sizeof (struct T_opthdr);
1564 
1565 			/* get option of this level */
1566 			optlen = (*getfn)(q, reqopt->level, optd->opdes_name,
1567 			    *resptrp);
1568 			if (optlen >= 0) {
1569 				/* success */
1570 				topth->len = (t_uscalar_t)(optlen +
1571 				    sizeof (struct T_opthdr));
1572 				topth->level = reqopt->level;
1573 				topth->name = optd->opdes_name;
1574 				if (OA_READONLY_PERMISSION(optd, cr))
1575 					topth->status = T_READONLY;
1576 				else
1577 					topth->status = T_SUCCESS;
1578 				*resptrp += _TPI_ALIGN_TOPT(optlen);
1579 			} else {
1580 				/*
1581 				 * failed, return as T_FAILURE and null value
1582 				 * part. Maybe something downstream will
1583 				 * handle this one and fill in a value. Here
1584 				 * it is just part of T_ALLOPT expansion.
1585 				 */
1586 				topth->len = sizeof (struct T_opthdr);
1587 				topth->level = reqopt->level;
1588 				topth->name = optd->opdes_name;
1589 				topth->status = T_FAILURE;
1590 			}
1591 			*worst_statusp = get_worst_status(topth->status,
1592 			    *worst_statusp);
1593 		} /* end for loop */
1594 	}
1595 	if (*resptrp == initptr) {
1596 		/*
1597 		 * getfn failed and does not want to handle this option. Maybe
1598 		 * something downstream will or something upstream did. (If
1599 		 * topmost_tpiprovider, initialize "status" to failure which
1600 		 * can possibly change downstream). Copy the input "as is" from
1601 		 * input option buffer if any to maintain transparency.
1602 		 */
1603 		if (topmost_tpiprovider)
1604 			reqopt->status = T_FAILURE;
1605 		bcopy(reqopt, *resptrp, reqopt->len);
1606 		*resptrp += _TPI_ALIGN_TOPT(reqopt->len);
1607 		*worst_statusp = get_worst_status(reqopt->status,
1608 		    *worst_statusp);
1609 	}
1610 }
1611 
1612 
1613 
1614 static int
1615 do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
1616     uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
1617     cred_t *cr, optdb_obj_t *dbobjp, mblk_t *first_mp)
1618 {
1619 	pfi_t	deffn = dbobjp->odb_deffn;
1620 	opt_set_fn setfn = dbobjp->odb_setfn;
1621 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
1622 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1623 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1624 
1625 	struct T_opthdr *topth;
1626 	opdes_t *optd;
1627 	int error;
1628 	t_uscalar_t optlen;
1629 	t_scalar_t optsize;
1630 	uchar_t *initptr = *resptrp;
1631 
1632 	ASSERT(reqopt->status == T_SUCCESS);
1633 
1634 	if (reqopt->name != T_ALLOPT) {
1635 		topth = (struct T_opthdr *)*resptrp;
1636 		*resptrp += sizeof (struct T_opthdr);
1637 		error = (*setfn)(q, optset_context, reqopt->level, reqopt->name,
1638 		    reqopt->len - sizeof (struct T_opthdr),
1639 		    _TPI_TOPT_DATA(reqopt), &optlen, _TPI_TOPT_DATA(topth),
1640 		    NULL, cr, first_mp);
1641 		if (error) {
1642 			/* failed - reset "*resptrp" */
1643 			*resptrp -= sizeof (struct T_opthdr);
1644 			if (error == EINPROGRESS)
1645 				return (error);
1646 		} else {
1647 			/*
1648 			 * success - "value" already filled in setfn()
1649 			 */
1650 			topth->len = (t_uscalar_t)(optlen +
1651 			    sizeof (struct T_opthdr));
1652 			topth->level = reqopt->level;
1653 			topth->name = reqopt->name;
1654 			topth->status = reqopt->status;
1655 			*resptrp += _TPI_ALIGN_TOPT(optlen);
1656 			*worst_statusp = get_worst_status(topth->status,
1657 			    *worst_statusp);
1658 		}
1659 	} else {		/* T_ALLOPT processing */
1660 		/* only for T_NEGOTIATE case */
1661 		ASSERT(optset_context == SETFN_OPTCOM_NEGOTIATE);
1662 		ASSERT(topmost_tpiprovider == B_TRUE);
1663 
1664 		/* scan and set all options to default value */
1665 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
1666 
1667 			/* skip other levels */
1668 			if (reqopt->level != optd->opdes_level)
1669 				continue;
1670 
1671 			if (OA_EXECUTE_PERMISSION(optd, cr) ||
1672 			    OA_NO_PERMISSION(optd, cr)) {
1673 				/*
1674 				 * skip this one too. Does not make sense to
1675 				 * set anything to default value for "execute"
1676 				 * options.
1677 				 */
1678 				continue;
1679 			}
1680 
1681 			if (OA_READONLY_PERMISSION(optd, cr)) {
1682 				/*
1683 				 * Return with T_READONLY status (and no value
1684 				 * part). Note: spec is not clear but
1685 				 * XTI test suite needs this.
1686 				 */
1687 				topth = (struct T_opthdr *)*resptrp;
1688 				topth->len = sizeof (struct T_opthdr);
1689 				*resptrp += topth->len;
1690 				topth->level = reqopt->level;
1691 				topth->name = optd->opdes_name;
1692 				topth->status = T_READONLY;
1693 				*worst_statusp = get_worst_status(topth->status,
1694 				    *worst_statusp);
1695 				continue;
1696 			}
1697 
1698 			/*
1699 			 * It is not read only or execute type
1700 			 * the it must have write permission
1701 			 */
1702 			ASSERT(OA_WRITE_PERMISSION(optd, cr));
1703 
1704 			topth = (struct T_opthdr *)*resptrp;
1705 			*resptrp += sizeof (struct T_opthdr);
1706 
1707 			topth->len = sizeof (struct T_opthdr);
1708 			topth->level = reqopt->level;
1709 			topth->name = optd->opdes_name;
1710 			if (optd->opdes_props & OP_NODEFAULT) {
1711 				/*
1712 				 * Option of "no default value" so it does not
1713 				 * make sense to try to set it. We just return
1714 				 * header with status of T_SUCCESS
1715 				 * XXX should this be failure ?
1716 				 */
1717 				topth->status = T_SUCCESS;
1718 				continue; /* skip setting */
1719 			}
1720 			if (optd->opdes_props & OP_DEF_FN) {
1721 				if ((optd->opdes_props & OP_VARLEN) ||
1722 				    ((optsize = (*deffn)(q, reqopt->level,
1723 				    optd->opdes_name,
1724 				    (uchar_t *)optd->opdes_defbuf)) < 0)) {
1725 					/* XXX - skip these too */
1726 					topth->status = T_SUCCESS;
1727 					continue; /* skip setting */
1728 				}
1729 			} else {
1730 				optsize = optd->opdes_size;
1731 			}
1732 
1733 
1734 			/* set option of this level */
1735 			error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE,
1736 			    reqopt->level, optd->opdes_name, optsize,
1737 			    (uchar_t *)optd->opdes_defbuf, &optlen,
1738 			    _TPI_TOPT_DATA(topth), NULL, cr, NULL);
1739 			if (error) {
1740 				/*
1741 				 * failed, return as T_FAILURE and null value
1742 				 * part. Maybe something downstream will
1743 				 * handle this one and fill in a value. Here
1744 				 * it is just part of T_ALLOPT expansion.
1745 				 */
1746 				topth->status = T_FAILURE;
1747 				*worst_statusp = get_worst_status(topth->status,
1748 				    *worst_statusp);
1749 			} else {
1750 				/* success */
1751 				topth->len += optlen;
1752 				topth->status = T_SUCCESS;
1753 				*resptrp += _TPI_ALIGN_TOPT(optlen);
1754 			}
1755 		} /* end for loop */
1756 		/* END T_ALLOPT */
1757 	}
1758 
1759 	if (*resptrp == initptr) {
1760 		/*
1761 		 * setfn failed and does not want to handle this option. Maybe
1762 		 * something downstream will or something upstream
1763 		 * did. Copy the input as is from input option buffer if any to
1764 		 * maintain transparency (maybe something at a level above
1765 		 * did something.
1766 		 */
1767 		if (topmost_tpiprovider)
1768 			reqopt->status = T_FAILURE;
1769 		bcopy(reqopt, *resptrp, reqopt->len);
1770 		*resptrp += _TPI_ALIGN_TOPT(reqopt->len);
1771 		*worst_statusp = get_worst_status(reqopt->status,
1772 		    *worst_statusp);
1773 	}
1774 	return (0);
1775 }
1776 
1777 /*
1778  * The following routines process options buffer passed with
1779  * T_CONN_REQ, T_CONN_RES and T_UNITDATA_REQ.
1780  * This routine does the consistency check applied to the
1781  * sanity of formatting of multiple options packed in the
1782  * buffer.
1783  *
1784  * XTI brain damage alert:
1785  * XTI interface adopts the notion of an option being an
1786  * "absolute requirement" from OSI transport service (but applies
1787  * it to all transports including Internet transports).
1788  * The main effect of that is action on failure to "negotiate" a
1789  * requested option to the exact requested value
1790  *
1791  *          - if the option is an "absolute requirement", the primitive
1792  *            is aborted (e.g T_DISCON_REQ or T_UDERR generated)
 *          - if the option is NOT an "absolute requirement" it can
 *            just be ignored.
1795  *
 * We would not support "negotiating" of options on connection
 * primitives for Internet transports. However, just in case we are
 * forced to in order to pass strange test suites, the design here
 * tries to support these notions.
1800  *
1801  * tpi_optcom_buf(q, mp, opt_lenp, opt_offset, cred, dbobjp, thisdg_attrs,
1802  *	*is_absreq_failurep)
1803  *
1804  * - Verify the option buffer, if formatted badly, return error 1
1805  *
1806  * - If it is a "permissions" failure (read-only), return error 2
1807  *
1808  * - Else, process the option "in place", the following can happen,
1809  *	     - if a "privileged" option, mark it as "ignored".
1810  *	     - if "not supported", mark "ignored"
1811  *	     - if "supported" attempt negotiation and fill result in
1812  *	       the outcome
1813  *			- if "absolute requirement", set "*is_absreq_failurep"
 *			- if NOT an "absolute requirement", then our
 *			  interpretation is to mark it as ignored if
 *			  negotiation fails (Spec allows partial success
 *			  as in OSI protocols but not failure)
1818  *
1819  *   Then delete "ignored" options from option buffer and return success.
1820  *
1821  */
1822 
1823 int
1824 tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp,
1825     t_scalar_t opt_offset, cred_t *cr, optdb_obj_t *dbobjp,
1826     void *thisdg_attrs, int *is_absreq_failurep)
1827 {
1828 	opt_set_fn setfn = dbobjp->odb_setfn;
1829 	opdes_t *opt_arr = dbobjp->odb_opt_des_arr;
1830 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1831 	struct T_opthdr *opt, *opt_start, *opt_end;
1832 	mblk_t  *copy_mp_head;
1833 	uchar_t *optr, *init_optr;
1834 	opdes_t *optd;
1835 	uint_t optset_context;
1836 	t_uscalar_t olen;
1837 	int error = 0;
1838 
1839 	ASSERT((uchar_t *)opt_lenp > mp->b_rptr &&
1840 	    (uchar_t *)opt_lenp < mp->b_wptr);
1841 
1842 	copy_mp_head = NULL;
1843 	*is_absreq_failurep = 0;
1844 	switch (((union T_primitives *)mp->b_rptr)->type) {
1845 	case T_CONN_REQ:
1846 	case T_CONN_RES:
1847 		optset_context = SETFN_CONN_NEGOTIATE;
1848 		break;
1849 	case T_UNITDATA_REQ:
1850 		optset_context = SETFN_UD_NEGOTIATE;
1851 		break;
1852 	default:
1853 		/*
1854 		 * should never get here, all possible TPI primitives
1855 		 * where this can be called from should be accounted
1856 		 * for in the cases above
1857 		 */
1858 		return (EINVAL);
1859 	}
1860 
1861 	if ((opt_start = (struct T_opthdr *)
1862 	    mi_offset_param(mp, opt_offset, *opt_lenp)) == NULL) {
1863 		error = ENOPROTOOPT;
1864 		goto error_ret;
1865 	}
1866 	if (!__TPI_TOPT_ISALIGNED(opt_start)) {
1867 		error = ENOPROTOOPT;
1868 		goto error_ret;
1869 	}
1870 
1871 	opt_end = (struct T_opthdr *)((uchar_t *)opt_start
1872 	    + *opt_lenp);
1873 
1874 	if ((copy_mp_head = copyb(mp)) == (mblk_t *)NULL) {
1875 		error = ENOMEM;
1876 		goto error_ret;
1877 	}
1878 
1879 	init_optr = optr = (uchar_t *)&copy_mp_head->b_rptr[opt_offset];
1880 
1881 	for (opt = opt_start; opt && (opt < opt_end);
1882 	    opt = _TPI_TOPT_NEXTHDR(opt_start, *opt_lenp, opt)) {
1883 		/*
1884 		 * Validate the option for length and alignment
1885 		 * before accessing anything in it
1886 		 */
1887 		if (!_TPI_TOPT_VALID(opt, opt_start, opt_end)) {
1888 			error = ENOPROTOOPT;
1889 			goto error_ret;
1890 		}
1891 
1892 		/* Find the option in the opt_arr. */
1893 		optd = opt_chk_lookup(opt->level, opt->name,
1894 		    opt_arr, opt_arr_cnt);
1895 
1896 		if (optd == NULL) {
1897 			/*
1898 			 * Option not found
1899 			 */
1900 			opt->status = T_NOTSUPPORT;
1901 			continue;
1902 		}
1903 
1904 		/*
1905 		 * Weird but as in XTI spec.
1906 		 * Sec 6.3.6 "Privileged and ReadOnly Options"
1907 		 * Permission problems (e.g.readonly) fail with bad access
1908 		 * BUT "privileged" option request from those NOT PRIVILEGED
1909 		 * are to be merely "ignored".
1910 		 * XXX Prevents "probing" of privileged options ?
1911 		 */
1912 		if (OA_READONLY_PERMISSION(optd, cr)) {
1913 			error = EACCES;
1914 			goto error_ret;
1915 		}
1916 		if (OA_MATCHED_PRIV(optd, cr)) {
1917 			/*
1918 			 * For privileged options, we DO perform
1919 			 * access checks as is common sense
1920 			 */
1921 			if (!OA_WX_ANYPRIV(optd)) {
1922 				error = EACCES;
1923 				goto error_ret;
1924 			}
1925 		} else {
1926 			/*
1927 			 * For non privileged, we fail instead following
1928 			 * "ignore" semantics dictated by XTI spec for
1929 			 * permissions problems.
1930 			 * Sec 6.3.6 "Privileged and ReadOnly Options"
1931 			 * XXX Should we do "ignore" semantics ?
1932 			 */
1933 			if (!OA_WX_NOPRIV(optd)) { /* nopriv */
1934 				opt->status = T_FAILURE;
1935 				continue;
1936 			}
1937 		}
1938 		/*
1939 		 *
1940 		 * If the negotiation fails, for options that
1941 		 * are "absolute requirement", it is a fatal error.
1942 		 * For options that are NOT "absolute requirements",
1943 		 * and the value fails to negotiate, the XTI spec
1944 		 * only considers the possibility of partial success
1945 		 * (T_PARTSUCCES - not likely for Internet protocols).
1946 		 * The spec is in denial about complete failure
1947 		 * (T_FAILURE) to negotiate for options that are
1948 		 * carried on T_CONN_REQ/T_CONN_RES/T_UNITDATA
1949 		 * We interpret the T_FAILURE to negotiate an option
1950 		 * that is NOT an absolute requirement that it is safe
1951 		 * to ignore it.
1952 		 */
1953 
1954 		/* verify length */
1955 		if (!opt_length_ok(optd, opt)) {
1956 			/* bad size */
1957 			if ((optd->opdes_props & OP_NOT_ABSREQ) == 0) {
1958 				/* option is absolute requirement */
1959 				*is_absreq_failurep = 1;
1960 				error = EINVAL;
1961 				goto error_ret;
1962 			}
1963 			opt->status = T_FAILURE;
1964 			continue;
1965 		}
1966 
1967 		/*
1968 		 * verified generic attributes. Now call set function.
1969 		 * Note: We assume the following to simplify code.
1970 		 * XXX If this is found not to be valid, this routine
1971 		 * will need to be rewritten. At this point it would
1972 		 * be premature to introduce more complexity than is
1973 		 * needed.
1974 		 * Assumption: For variable length options, we assume
1975 		 * that the value returned will be same or less length
1976 		 * (size does not increase). This makes it OK to pass the
1977 		 * same space for output as it is on input.
1978 		 */
1979 
1980 		error = (*setfn)(q, optset_context, opt->level, opt->name,
1981 		    opt->len - (t_uscalar_t)sizeof (struct T_opthdr),
1982 		    _TPI_TOPT_DATA(opt), &olen, _TPI_TOPT_DATA(opt),
1983 		    thisdg_attrs, cr, NULL);
1984 
1985 		if (olen > (int)(opt->len - sizeof (struct T_opthdr))) {
1986 			/*
1987 			 * Space on output more than space on input. Should
1988 			 * not happen and we consider it a bug/error.
1989 			 * More of a restriction than an error in our
1990 			 * implementation. Will see if we can live with this
1991 			 * otherwise code will get more hairy with multiple
1992 			 * passes.
1993 			 */
1994 			error = EINVAL;
1995 			goto error_ret;
1996 		}
1997 		if (error != 0) {
1998 			if ((optd->opdes_props & OP_NOT_ABSREQ) == 0) {
1999 				/* option is absolute requirement. */
2000 				*is_absreq_failurep = 1;
2001 				goto error_ret;
2002 			}
2003 			/*
2004 			 * failed - but option "not an absolute
2005 			 * requirement"
2006 			 */
2007 			opt->status = T_FAILURE;
2008 			continue;
2009 		}
2010 		/*
2011 		 * Fill in the only possible successful result
2012 		 * (Note: TPI allows for T_PARTSUCCESS - partial
2013 		 * sucess result code which is relevant in OSI world
2014 		 * and not possible in Internet code)
2015 		 */
2016 		opt->status = T_SUCCESS;
2017 
2018 		/*
2019 		 * Add T_SUCCESS result code options to the "output" options.
2020 		 * No T_FAILURES or T_NOTSUPPORT here as they are to be
2021 		 * ignored.
2022 		 * This code assumes output option buffer will
2023 		 * be <= input option buffer.
2024 		 *
2025 		 * Copy option header+value
2026 		 */
2027 		bcopy(opt, optr, opt->len);
2028 		optr +=  _TPI_ALIGN_TOPT(opt->len);
2029 	}
2030 	/*
2031 	 * Overwrite the input mblk option buffer now with the output
2032 	 * and update length, and contents in original mbl
2033 	 * (offset remains unchanged).
2034 	 */
2035 	*opt_lenp = (t_scalar_t)(optr - init_optr);
2036 	if (*opt_lenp > 0) {
2037 		bcopy(init_optr, opt_start, *opt_lenp);
2038 	}
2039 
2040 error_ret:
2041 	if (copy_mp_head != NULL)
2042 		freeb(copy_mp_head);
2043 	return (error);
2044 }
2045 
2046 static opdes_t *
2047 opt_chk_lookup(t_uscalar_t level, t_uscalar_t name, opdes_t *opt_arr,
2048     uint_t opt_arr_cnt)
2049 {
2050 	opdes_t		*optd;
2051 
2052 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt];
2053 	    optd++) {
2054 		if (level == (uint_t)optd->opdes_level &&
2055 		    name == (uint_t)optd->opdes_name)
2056 			return (optd);
2057 	}
2058 	return (NULL);
2059 }
2060 
2061 static boolean_t
2062 opt_level_valid(t_uscalar_t level, optlevel_t *valid_level_arr,
2063     uint_t valid_level_arr_cnt)
2064 {
2065 	optlevel_t		*olp;
2066 
2067 	for (olp = valid_level_arr;
2068 	    olp < &valid_level_arr[valid_level_arr_cnt];
2069 	    olp++) {
2070 		if (level == (uint_t)(*olp))
2071 			return (B_TRUE);
2072 	}
2073 	return (B_FALSE);
2074 }
2075 
2076 
2077 /*
2078  * Compute largest possible size for an option buffer containing
2079  * all options in one buffer.
2080  *
2081  * XXX TBD, investigate use of opt_bloated_maxsize() to avoid
2082  *     wastefully large buffer allocation.
2083  */
2084 static size_t
2085 opt_level_allopts_lengths(t_uscalar_t level, opdes_t *opt_arr,
2086     uint_t opt_arr_cnt)
2087 {
2088 	opdes_t		*optd;
2089 	size_t allopt_len = 0;	/* 0 implies no option at this level */
2090 
2091 	/*
2092 	 * Scan opt_arr computing aggregate length
2093 	 * requirement for storing values of all
2094 	 * options.
2095 	 * Note: we do not filter for permissions
2096 	 * etc. This will be >= the real aggregate
2097 	 * length required (upper bound).
2098 	 */
2099 
2100 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt];
2101 	    optd++) {
2102 		if (level == optd->opdes_level) {
2103 			allopt_len += sizeof (struct T_opthdr) +
2104 			    _TPI_ALIGN_TOPT(optd->opdes_size);
2105 		}
2106 	}
2107 	return (allopt_len);	/* 0 implies level not found */
2108 }
2109 
2110 /*
2111  * Compute largest possible size for an option buffer containing
2112  * all options in one buffer - a (theoretical?) worst case scenario
2113  * for certain cases.
2114  */
2115 t_uscalar_t
2116 optcom_max_optbuf_len(opdes_t *opt_arr, uint_t opt_arr_cnt)
2117 {
2118 	t_uscalar_t max_optbuf_len = sizeof (struct T_info_ack);
2119 	opdes_t		*optd;
2120 
2121 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
2122 		max_optbuf_len += (t_uscalar_t)sizeof (struct T_opthdr) +
2123 		    (t_uscalar_t)_TPI_ALIGN_TOPT(optd->opdes_size);
2124 	}
2125 	return (max_optbuf_len);
2126 }
2127 
2128 /*
2129  * Compute largest possible size for OPT_size for a transport.
2130  * Heuristic used is to add all but certain extremely large
2131  * size options; this is done by calling opt_bloated_maxsize().
2132  * It affects user level allocations in TLI/XTI code using t_alloc()
2133  * and other TLI/XTI implementation instance strucutures.
2134  * The large size options excluded are presumed to be
2135  * never accessed through the (theoretical?) worst case code paths
2136  * through TLI/XTI as they are currently IPv6 specific options.
2137  */
2138 
2139 t_uscalar_t
2140 optcom_max_optsize(opdes_t *opt_arr, uint_t opt_arr_cnt)
2141 {
2142 	t_uscalar_t max_optbuf_len = sizeof (struct T_info_ack);
2143 	opdes_t		*optd;
2144 
2145 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
2146 		if (!opt_bloated_maxsize(optd)) {
2147 			max_optbuf_len +=
2148 			    (t_uscalar_t)sizeof (struct T_opthdr) +
2149 			    (t_uscalar_t)_TPI_ALIGN_TOPT(optd->opdes_size);
2150 		}
2151 	}
2152 	return (max_optbuf_len);
2153 }
2154 
2155 /*
2156  * The theoretical model used in optcom_max_optsize() and
2157  * opt_level_allopts_lengths() accounts for the worst case of all
2158  * possible options for the theoretical cases and results in wasteful
2159  * memory allocations for certain theoretically correct usage scenarios.
2160  * In practice, the "features" they support are rarely, if ever,
2161  * used and even then only by test suites for those features (VSU, VST).
2162  * However, they result in large allocations due to the increased transport
2163  * T_INFO_ACK OPT_size field affecting t_alloc() users and TLI/XTI library
2164  * instance data structures for applications.
2165  *
2166  * The following routine opt_bloated_maxsize() supports a hack that avoids
2167  * paying the tax for the bloated options by excluding them and pretending
2168  * they don't exist for certain features without affecting features that
2169  * do use them.
2170  *
2171  * XXX Currently implemented only for optcom_max_optsize()
2172  *     (to reduce risk late in release).
2173  *     TBD for future, investigate use in optcom_level_allopts_lengths() and
2174  *     all the instances of T_ALLOPT processing to exclude "bloated options".
2175  *     Will not affect VSU/VST tests as they do not test with IPPROTO_IPV6
2176  *     level options which are the only ones that fit the "bloated maxsize"
2177  *     option profile now.
2178  */
2179 static boolean_t
2180 opt_bloated_maxsize(opdes_t *optd)
2181 {
2182 	if (optd->opdes_level != IPPROTO_IPV6)
2183 		return (B_FALSE);
2184 	switch (optd->opdes_name) {
2185 	case IPV6_HOPOPTS:
2186 	case IPV6_DSTOPTS:
2187 	case IPV6_RTHDRDSTOPTS:
2188 	case IPV6_RTHDR:
2189 	case IPV6_PATHMTU:
2190 		return (B_TRUE);
2191 	default:
2192 		break;
2193 	}
2194 	return (B_FALSE);
2195 }
2196 
2197 static boolean_t
2198 opt_length_ok(opdes_t *optd, struct T_opthdr *opt)
2199 {
2200 	/*
2201 	 * Verify length.
2202 	 * Value specified should match length of fixed length option or be
2203 	 * less than maxlen of variable length option.
2204 	 */
2205 	if (optd->opdes_props & OP_VARLEN) {
2206 		if (opt->len <= optd->opdes_size +
2207 		    (t_uscalar_t)sizeof (struct T_opthdr))
2208 			return (B_TRUE);
2209 	} else {
2210 		/* fixed length option */
2211 		if (opt->len == optd->opdes_size +
2212 		    (t_uscalar_t)sizeof (struct T_opthdr))
2213 			return (B_TRUE);
2214 	}
2215 	return (B_FALSE);
2216 }
2217 
2218 /*
2219  * This routine appends a pssed in hop-by-hop option to the existing
2220  * option (in this case a cipso label encoded in HOPOPT option). The
2221  * passed in option is always padded. The 'reservelen' is the
2222  * length of reserved data (label). New memory will be allocated if
2223  * the current buffer is not large enough. Return failure if memory
2224  * can not be allocated.
2225  */
2226 int
2227 optcom_pkt_set(uchar_t *invalp, uint_t inlen, boolean_t sticky,
2228     uchar_t **optbufp, uint_t *optlenp, uint_t reservelen)
2229 {
2230 	uchar_t *optbuf;
2231 	uchar_t	*optp;
2232 
2233 	if (!sticky) {
2234 		*optbufp = invalp;
2235 		*optlenp = inlen;
2236 		return (0);
2237 	}
2238 
2239 	if (inlen == *optlenp - reservelen) {
2240 		/* Unchanged length - no need to reallocate */
2241 		optp = *optbufp + reservelen;
2242 		bcopy(invalp, optp, inlen);
2243 		if (reservelen != 0) {
2244 			/*
2245 			 * Convert the NextHeader and Length of the
2246 			 * passed in hop-by-hop header to pads
2247 			 */
2248 			optp[0] = IP6OPT_PADN;
2249 			optp[1] = 0;
2250 		}
2251 		return (0);
2252 	}
2253 	if (inlen + reservelen > 0) {
2254 		/* Allocate new buffer before free */
2255 		optbuf = kmem_alloc(inlen + reservelen, KM_NOSLEEP);
2256 		if (optbuf == NULL)
2257 			return (ENOMEM);
2258 	} else {
2259 		optbuf = NULL;
2260 	}
2261 
2262 	/* Copy out old reserved data (label) */
2263 	if (reservelen > 0)
2264 		bcopy(*optbufp, optbuf, reservelen);
2265 
2266 	/* Free old buffer */
2267 	if (*optlenp != 0)
2268 		kmem_free(*optbufp, *optlenp);
2269 
2270 	if (inlen > 0)
2271 		bcopy(invalp, optbuf + reservelen, inlen);
2272 
2273 	if (reservelen != 0) {
2274 		/*
2275 		 * Convert the NextHeader and Length of the
2276 		 * passed in hop-by-hop header to pads
2277 		 */
2278 		optbuf[reservelen] = IP6OPT_PADN;
2279 		optbuf[reservelen + 1] = 0;
2280 		/*
2281 		 * Set the Length of the hop-by-hop header, number of 8
2282 		 * byte-words following the 1st 8 bytes
2283 		 */
2284 		optbuf[1] = (reservelen + inlen - 1) >> 3;
2285 	}
2286 	*optbufp = optbuf;
2287 	*optlenp = inlen + reservelen;
2288 	return (0);
2289 }
2290