xref: /titanic_52/usr/src/uts/common/inet/optcom.c (revision 82d33c01b078ed404a986a369750cdb4743773fb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * This file contains common code for handling Options Management requests.
31  */
32 
33 #include <sys/types.h>
34 #include <sys/stream.h>
35 #include <sys/stropts.h>
36 #include <sys/strsubr.h>
37 #include <sys/errno.h>
38 #define	_SUN_TPI_VERSION 2
39 #include <sys/tihdr.h>
40 #include <sys/socket.h>
41 #include <sys/ddi.h>
42 #include <sys/debug.h>		/* for ASSERT */
43 #include <sys/policy.h>
44 
45 #include <inet/common.h>
46 #include <inet/mi.h>
47 #include <inet/nd.h>
48 #include <netinet/ip6.h>
49 #include <inet/ip.h>
50 #include <inet/mib2.h>
51 #include <netinet/in.h>
52 #include "optcom.h"
53 
54 #include <inet/optcom.h>
55 
/*
 * Function prototypes
 *
 * Forward declarations of the file-local (static) helpers.
 */

/* The two passes over a T_OPTMGMT_REQ option buffer (see tpi_optcom_req()). */
static t_scalar_t process_topthdrs_first_pass(mblk_t *, cred_t *, optdb_obj_t *,
    boolean_t *, size_t *);
static t_scalar_t do_options_second_pass(queue_t *q, mblk_t *reqmp,
    mblk_t *ack_mp, cred_t *, optdb_obj_t *dbobjp,
    mblk_t *first_mp, boolean_t is_restart, boolean_t *queued_statusp);

/* Per-option helpers; presumably used by the second pass - TODO confirm. */
static t_uscalar_t get_worst_status(t_uscalar_t, t_uscalar_t);
static int do_opt_default(queue_t *, struct T_opthdr *, uchar_t **,
    t_uscalar_t *, cred_t *, optdb_obj_t *);
static void do_opt_current(queue_t *, struct T_opthdr *, uchar_t **,
    t_uscalar_t *, cred_t *cr, optdb_obj_t *);
static int do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
    uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
    cred_t *, optdb_obj_t *dbobjp, mblk_t *first_mp);

/* Option database (opdes_t / optlevel_t table) lookup and sizing helpers. */
static opdes_t *opt_chk_lookup(t_uscalar_t, t_uscalar_t, opdes_t *, uint_t);
static boolean_t opt_level_valid(t_uscalar_t, optlevel_t *, uint_t);
static size_t opt_level_allopts_lengths(t_uscalar_t, opdes_t *, uint_t);
static boolean_t opt_length_ok(opdes_t *, struct T_opthdr *);
static t_uscalar_t optcom_max_optbuf_len(opdes_t *, uint_t);
static boolean_t opt_bloated_maxsize(opdes_t *);
78 
79 /* Common code for sending back a T_ERROR_ACK. */
80 void
81 optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
82 {
83 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
84 		qreply(q, mp);
85 }
86 
87 /*
88  * The option management routines svr4_optcom_req() and tpi_optcom_req() use
89  * callback functions as arguments. Here are the interfaces
90  * expected of the callback functions:
91  *
92  *
93  * (1) deffn(q, optlevel, optname, optvalp)
94  *
95  *	- Function only called when default value comes from protocol
96  *	 specific code and not the option database table (indicated by
97  *	  OP_DEF_FN property in option database.)
98  *	- Error return is -1. Valid returns are >=0.
99  *	- When valid, the return value represents the length used for storing
100  *		the default value of the option.
101  *      - Error return implies the called routine did not recognize this
102  *              option. Something downstream could so input is left unchanged
103  *              in request buffer.
104  *
105  * (2) getfn(q, optlevel, optname, optvalp)
106  *
107  *	- Error return is -1. Valid returns are >=0.
108  *	- When valid, the return value represents the length used for storing
109  *		the actual value of the option.
110  *      - Error return implies the called routine did not recognize this
111  *              option. Something downstream could so input is left unchanged
112  *              in request buffer.
113  *
114  * (3) setfn(q, optset_context, optlevel, optname, inlen, invalp,
115  *	outlenp, outvalp, attrp, cr);
116  *
117  *	- OK return is 0, Error code is returned as a non-zero argument.
118  *      - If negative it is ignored by svr4_optcom_req(). If positive, error
119  *        is returned. A negative return implies that option, while handled on
120  *	  this stack is not handled at this level and will be handled further
121  *	  downstream.
122  *	- Both negative and positive errors are treated as errors in an
123  *	  identical manner by tpi_optcom_req(). The errors affect "status"
124  *	  field of each option's T_opthdr. If successful, an appropriate success
125  *	  result is carried. If error, it is instantiated to "failure" at the
126  *	  topmost level and left unchanged at other levels. (This "failure" can
127  *	  turn to a success at another level).
128  *	- optset_context passed for tpi_optcom_req(). It is interpreted as:
129  *        - SETFN_OPTCOM_CHECKONLY
130  *		semantics are to pretend to set the value and report
131  *		back if it would be successful.
132  *		This is used with T_CHECK semantics in XTI
133  *        - SETFN_OPTCOM_NEGOTIATE
134  *		set the value. Call from option management primitive
135  *		T_OPTMGMT_REQ when T_NEGOTIATE flags is used.
136  *	  - SETFN_UD_NEGOTIATE
137  *		option request came riding on UNITDATA primitive most often
138  *		has  "this datagram" semantics to influence properties
139  *		affecting an outgoing datagram or associated with received
140  *		datagram
141  *		[ Note: XTI permits this use outside of "this datagram"
142  *		semantics also and permits setting "management related"
143  *		options in this	context and its test suite enforces it ]
144  *	  - SETFN_CONN_NEGOTIATE
145  *		option request came riding on CONN_REQ/RES primitive and
146  *		most often has "this connection" (negotiation during
147  *		"connection establishment") semantics.
148  *		[ Note: XTI permits use of these outside of "this connection"
149  *		semantics and permits "management related" options in this
150  *		context and its test suite enforces it. ]
151  *
152  *	- inlen, invalp is the option length,value requested to be set.
153  *	- outlenp, outvalp represent return parameters which contain the
154  *	  value set and it might be different from one passed on input.
155  *	- attrp points to a data structure that's used by v6 modules to
156  *	  store ancillary data options or sticky options.
157  *	- cr points to the caller's credentials
158  *	- the caller might pass same buffers for input and output and the
159  *	  routine should protect against this case by not updating output
160  *	  buffers until it is done referencing input buffers and any other
161  *	  issues (e.g. not use bcopy() if we do not trust what it does).
162  *      - If option is not known, it returns error. We randomly pick EINVAL.
163  *        It can however get called with options that are handled downstream
164  *        or upstream so for svr4_optcom_req(), it does not return error for
165  *        negative return values.
166  *
167  */
168 
169 /*
170  * Upper Level Protocols call this routine when they receive
171  * a T_SVR4_OPTMGMT_REQ message.  They supply callback functions
172  * for setting a new value for a single options, getting the
173  * current value for a single option, and checking for support
174  * of a single option.  svr4_optcom_req validates the option management
175  * buffer passed in, and calls the appropriate routines to do the
176  * job requested.
177  * XXX Code below needs some restructuring after we have some more
178  * macros to support 'struct opthdr' in the headers.
179  *
180  * IP-MT notes: The option management framework functions svr4_optcom_req() and
181  * tpi_optcom_req() allocate and prepend an M_CTL mblk to the actual
182  * T_optmgmt_req mblk and pass the chain as an additional parameter to the
183  * protocol set functions. If a protocol set function (such as ip_opt_set)
184  * cannot process the option immediately it can return EINPROGRESS. ip_opt_set
185  * enqueues the message in the appropriate sq and returns EINPROGRESS. Later
186  * the sq framework arranges to restart this operation and passes control to
187  * the restart function ip_restart_optmgmt() which in turn calls
188  * svr4_optcom_req() or tpi_optcom_req() to restart the option processing.
189  */
190 int
191 svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp)
192 {
193 	pfi_t	deffn = dbobjp->odb_deffn;
194 	pfi_t	getfn = dbobjp->odb_getfn;
195 	opt_set_fn setfn = dbobjp->odb_setfn;
196 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
197 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
198 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
199 	opt_restart_t *or;
200 	struct opthdr *restart_opt;
201 	boolean_t is_restart = B_FALSE;
202 	mblk_t	*first_mp;
203 
204 	t_uscalar_t max_optbuf_len;
205 	int len;
206 	mblk_t	*mp1 = NULL;
207 	struct opthdr *next_opt;
208 	struct opthdr *opt;
209 	struct opthdr *opt1;
210 	struct opthdr *opt_end;
211 	struct opthdr *opt_start;
212 	opdes_t	*optd;
213 	boolean_t	pass_to_next = B_FALSE;
214 	boolean_t	pass_to_ip = B_FALSE;
215 	boolean_t	is_tcp;
216 	struct T_optmgmt_ack *toa;
217 	struct T_optmgmt_req *tor;
218 
219 	is_tcp = (dbobjp == &tcp_opt_obj);
220 
221 	/*
222 	 * Allocate M_CTL and prepend to the packet for restarting this
223 	 * option if needed. IP may need to queue and restart the option
224 	 * if it cannot obtain exclusive conditions immediately. Please see
225 	 * IP-MT notes before the start of svr4_optcom_req
226 	 */
227 	if (mp->b_datap->db_type == M_CTL) {
228 		is_restart = B_TRUE;
229 		first_mp = mp;
230 		mp = mp->b_cont;
231 		ASSERT(mp->b_wptr - mp->b_rptr >=
232 		    sizeof (struct T_optmgmt_req));
233 		tor = (struct T_optmgmt_req *)mp->b_rptr;
234 		ASSERT(tor->MGMT_flags == T_NEGOTIATE);
235 
236 		or = (opt_restart_t *)first_mp->b_rptr;
237 		opt_start = or->or_start;
238 		opt_end = or->or_end;
239 		restart_opt = or->or_ropt;
240 		goto restart;
241 	}
242 
243 	tor = (struct T_optmgmt_req *)mp->b_rptr;
244 	/* Verify message integrity. */
245 	if (mp->b_wptr - mp->b_rptr < sizeof (struct T_optmgmt_req))
246 		goto bad_opt;
247 	/* Verify MGMT_flags legal */
248 	switch (tor->MGMT_flags) {
249 	case T_DEFAULT:
250 	case T_NEGOTIATE:
251 	case T_CURRENT:
252 	case T_CHECK:
253 		/* OK - legal request flags */
254 		break;
255 	default:
256 		optcom_err_ack(q, mp, TBADFLAG, 0);
257 		return (0);
258 	}
259 	if (tor->MGMT_flags == T_DEFAULT) {
260 		/* Is it a request for default option settings? */
261 
262 		/*
263 		 * Note: XXX TLI and TPI specification was unclear about
264 		 * semantics of T_DEFAULT and the following historical note
265 		 * and its interpretation is incorrect (it implies a request
266 		 * for default values of only the identified options not all.
267 		 * The semantics have been explained better in XTI spec.)
268 		 * However, we do not modify (comment or code) here to keep
269 		 * compatibility.
270 		 * We can rethink this if it ever becomes an issue.
271 		 * ----historical comment start------
272 		 * As we understand it, the input buffer is meaningless
273 		 * so we ditch the message.  A T_DEFAULT request is a
274 		 * request to obtain a buffer containing defaults for
275 		 * all supported options, so we allocate a maximum length
276 		 * reply.
277 		 * ----historical comment end -------
278 		 */
279 		/* T_DEFAULT not passed down */
280 		ASSERT(topmost_tpiprovider == B_TRUE);
281 		freemsg(mp);
282 		max_optbuf_len = optcom_max_optbuf_len(opt_arr,
283 		    opt_arr_cnt);
284 		mp = allocb(max_optbuf_len, BPRI_MED);
285 		if (!mp) {
286 no_mem:;
287 			optcom_err_ack(q, mp, TSYSERR, ENOMEM);
288 			return (0);
289 		}
290 
291 		/* Initialize the T_optmgmt_ack header. */
292 		toa = (struct T_optmgmt_ack *)mp->b_rptr;
293 		bzero((char *)toa, max_optbuf_len);
294 		toa->PRIM_type = T_OPTMGMT_ACK;
295 		toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
296 		/* TODO: Is T_DEFAULT the right thing to put in MGMT_flags? */
297 		toa->MGMT_flags = T_DEFAULT;
298 
299 		/* Now walk the table of options passed in */
300 		opt = (struct opthdr *)&toa[1];
301 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
302 			/*
303 			 * All the options in the table of options passed
304 			 * in are by definition supported by the protocol
305 			 * calling this function.
306 			 */
307 			if (!OA_READ_PERMISSION(optd, cr))
308 				continue;
309 			opt->level = optd->opdes_level;
310 			opt->name = optd->opdes_name;
311 			if (!(optd->opdes_props & OP_DEF_FN) ||
312 			    ((len = (*deffn)(q, opt->level,
313 				opt->name, (uchar_t *)&opt[1])) < 0)) {
314 				/*
315 				 * Fill length and value from table.
316 				 *
317 				 * Default value not instantiated from function
318 				 * (or the protocol specific function failed it;
319 				 * In this interpretation of T_DEFAULT, this is
320 				 * the best we can do)
321 				 */
322 				switch (optd->opdes_size) {
323 				/*
324 				 * Since options are guaranteed aligned only
325 				 * on a 4 byte boundary (t_scalar_t) any
326 				 * option that is greater in size will default
327 				 * to the bcopy below
328 				 */
329 				case sizeof (int32_t):
330 					*(int32_t *)&opt[1] =
331 					    (int32_t)optd->opdes_default;
332 					break;
333 				case sizeof (int16_t):
334 					*(int16_t *)&opt[1] =
335 					    (int16_t)optd->opdes_default;
336 					break;
337 				case sizeof (int8_t):
338 					*(int8_t *)&opt[1] =
339 					    (int8_t)optd->opdes_default;
340 					break;
341 				default:
342 					/*
343 					 * other length but still assume
344 					 * fixed - use bcopy
345 					 */
346 					bcopy(optd->opdes_defbuf,
347 					    &opt[1], optd->opdes_size);
348 					break;
349 				}
350 				opt->len = optd->opdes_size;
351 			}
352 			else
353 				opt->len = (t_uscalar_t)len;
354 			opt = (struct opthdr *)((char *)&opt[1] +
355 			    _TPI_ALIGN_OPT(opt->len));
356 		}
357 
358 		/* Now record the final length. */
359 		toa->OPT_length = (t_scalar_t)((char *)opt - (char *)&toa[1]);
360 		mp->b_wptr = (uchar_t *)opt;
361 		mp->b_datap->db_type = M_PCPROTO;
362 		/* Ship it back. */
363 		qreply(q, mp);
364 		return (0);
365 	}
366 	/* T_DEFAULT processing complete - no more T_DEFAULT */
367 
368 	/*
369 	 * For T_NEGOTIATE, T_CURRENT, and T_CHECK requests, we make a
370 	 * pass through the input buffer validating the details and
371 	 * making sure each option is supported by the protocol.
372 	 */
373 	if ((opt_start = (struct opthdr *)mi_offset_param(mp,
374 	    tor->OPT_offset, tor->OPT_length)) == NULL)
375 		goto bad_opt;
376 	if (!__TPI_OPT_ISALIGNED(opt_start))
377 		goto bad_opt;
378 
379 	opt_end = (struct opthdr *)((uchar_t *)opt_start +
380 	    tor->OPT_length);
381 
382 	for (opt = opt_start; opt < opt_end; opt = next_opt) {
383 		/*
384 		 * Verify we have room to reference the option header
385 		 * fields in the option buffer.
386 		 */
387 		if ((uchar_t *)opt + sizeof (struct opthdr) >
388 		    (uchar_t *)opt_end)
389 			goto bad_opt;
390 		/*
391 		 * We now compute pointer to next option in buffer 'next_opt'
392 		 * The next_opt computation above below 'opt->len' initialized
393 		 * by application which cannot be trusted. The usual value
394 		 * too large will be captured by the loop termination condition
395 		 * above. We check for the following which it will miss.
396 		 * 	-pointer space wraparound arithmetic overflow
397 		 *	-last option in buffer with 'opt->len' being too large
398 		 *	 (only reason 'next_opt' should equal or exceed
399 		 *	 'opt_end' for last option is roundup unless length is
400 		 *	 too-large/invalid)
401 		 */
402 		next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
403 		    _TPI_ALIGN_OPT(opt->len));
404 
405 		if ((uchar_t *)next_opt < (uchar_t *)&opt[1] ||
406 		    ((next_opt >= opt_end) &&
407 			(((uchar_t *)next_opt - (uchar_t *)opt_end) >=
408 			    __TPI_ALIGN_SIZE)))
409 			goto bad_opt;
410 
411 		/* sanity check */
412 		if (opt->name == T_ALLOPT)
413 			goto bad_opt;
414 
415 		/* Find the option in the opt_arr. */
416 		if ((optd = opt_chk_lookup(opt->level, opt->name,
417 		    opt_arr, opt_arr_cnt)) == NULL) {
418 			/*
419 			 * Not found, that is a bad thing if
420 			 * the caller is a tpi provider
421 			 */
422 			if (topmost_tpiprovider)
423 				goto bad_opt;
424 			else
425 				continue; /* skip unmodified */
426 		}
427 
428 		/* Additional checks dependent on operation. */
429 		switch (tor->MGMT_flags) {
430 		case T_NEGOTIATE:
431 			if (!OA_WRITE_OR_EXECUTE(optd, cr)) {
432 				/* can't negotiate option */
433 				if (!(OA_MATCHED_PRIV(optd, cr)) &&
434 				    OA_WX_ANYPRIV(optd)) {
435 					/*
436 					 * not privileged but privilege
437 					 * will help negotiate option.
438 					 */
439 					optcom_err_ack(q, mp, TACCES, 0);
440 					return (0);
441 				} else
442 					goto bad_opt;
443 			}
444 			/*
445 			 * Verify size for options
446 			 * Note: For retaining compatibility with historical
447 			 * behavior, variable lengths options will have their
448 			 * length verified in the setfn() processing.
449 			 * In order to be compatible with SunOS 4.X we return
450 			 * EINVAL errors for bad lengths.
451 			 */
452 			if (!(optd->opdes_props & OP_VARLEN)) {
453 				/* fixed length - size must match */
454 				if (opt->len != optd->opdes_size) {
455 					optcom_err_ack(q, mp, TSYSERR, EINVAL);
456 					return (0);
457 				}
458 			}
459 			break;
460 
461 		case T_CHECK:
462 			if (!OA_RWX_ANYPRIV(optd))
463 				/* any of "rwx" permission but not not none */
464 				goto bad_opt;
465 			/*
466 			 * XXX Since T_CURRENT was not there in TLI and the
467 			 * official TLI inspired TPI standard, getsockopt()
468 			 * API uses T_CHECK (for T_CURRENT semantics)
469 			 * The following fallthru makes sense because of its
470 			 * historical use as semantic equivalent to T_CURRENT.
471 			 */
472 			/* FALLTHRU */
473 		case T_CURRENT:
474 			if (!OA_READ_PERMISSION(optd, cr)) {
475 				/* can't read option value */
476 				if (!(OA_MATCHED_PRIV(optd, cr)) &&
477 				    OA_R_ANYPRIV(optd)) {
478 					/*
479 					 * not privileged but privilege
480 					 * will help in reading option value.
481 					 */
482 					optcom_err_ack(q, mp, TACCES, 0);
483 					return (0);
484 				} else
485 					goto bad_opt;
486 			}
487 			break;
488 
489 		default:
490 			optcom_err_ack(q, mp, TBADFLAG, 0);
491 			return (0);
492 		}
493 		/* We liked it.  Keep going. */
494 	} /* end for loop scanning option buffer */
495 
496 	/* Now complete the operation as required. */
497 	switch (tor->MGMT_flags) {
498 	case T_CHECK:
499 		/*
500 		 * Historically used same as T_CURRENT (which was added to
501 		 * standard later). Code retained for compatibility.
502 		 */
503 		/* FALLTHROUGH */
504 	case T_CURRENT:
505 		/*
506 		 * Allocate a maximum size reply.  Perhaps we are supposed to
507 		 * assume that the input buffer includes space for the answers
508 		 * as well as the opthdrs, but we don't know that for sure.
509 		 * So, instead, we create a new output buffer, using the
510 		 * input buffer only as a list of options.
511 		 */
512 		max_optbuf_len = optcom_max_optbuf_len(opt_arr,
513 		    opt_arr_cnt);
514 		mp1 = allocb_cred(max_optbuf_len, cr);
515 		if (!mp1)
516 			goto no_mem;
517 		/* Initialize the header. */
518 		mp1->b_datap->db_type = M_PCPROTO;
519 		mp1->b_wptr = &mp1->b_rptr[sizeof (struct T_optmgmt_ack)];
520 		toa = (struct T_optmgmt_ack *)mp1->b_rptr;
521 		toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
522 		toa->MGMT_flags = tor->MGMT_flags;
523 		/*
524 		 * Walk through the input buffer again, this time adding
525 		 * entries to the output buffer for each option requested.
526 		 * Note, sanity of option header, last option etc, verified
527 		 * in first pass.
528 		 */
529 		opt1 = (struct opthdr *)&toa[1];
530 
531 		for (opt = opt_start; opt < opt_end; opt = next_opt) {
532 
533 		    next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
534 			_TPI_ALIGN_OPT(opt->len));
535 
536 			opt1->name = opt->name;
537 			opt1->level = opt->level;
538 			len = (*getfn)(q, opt->level,
539 			    opt->name, (uchar_t *)&opt1[1]);
540 			/*
541 			 * Failure means option is not recognized. Copy input
542 			 * buffer as is
543 			 */
544 			if (len < 0) {
545 				opt1->len = opt->len;
546 				bcopy(&opt[1], &opt1[1], opt->len);
547 				/*
548 				 * Pass the option down to IP only
549 				 * if TCP hasn't processed it.
550 				 */
551 				if (is_tcp)
552 					pass_to_ip = B_TRUE;
553 			} else {
554 				opt1->len = (t_uscalar_t)len;
555 			}
556 			opt1 = (struct opthdr *)((uchar_t *)&opt1[1] +
557 			    _TPI_ALIGN_OPT(opt1->len));
558 		} /* end for loop */
559 
560 		/* Record the final length. */
561 		toa->OPT_length = (t_scalar_t)((uchar_t *)opt1 -
562 		    (uchar_t *)&toa[1]);
563 		mp1->b_wptr = (uchar_t *)opt1;
564 		/* Ditch the input buffer. */
565 		freemsg(mp);
566 		mp = mp1;
567 		/* Always let the next module look at the option. */
568 		pass_to_next = B_TRUE;
569 		break;
570 
571 	case T_NEGOTIATE:
572 		first_mp = allocb(sizeof (opt_restart_t), BPRI_LO);
573 		if (first_mp == NULL) {
574 			optcom_err_ack(q, mp, TSYSERR, ENOMEM);
575 			return (0);
576 		}
577 		first_mp->b_datap->db_type = M_CTL;
578 		or = (opt_restart_t *)first_mp->b_rptr;
579 		or->or_start = opt_start;
580 		or->or_end =  opt_end;
581 		or->or_type = T_SVR4_OPTMGMT_REQ;
582 		or->or_private = 0;
583 		first_mp->b_cont = mp;
584 restart:
585 		/*
586 		 * Here we are expecting that the response buffer is exactly
587 		 * the same size as the input buffer.  We pass each opthdr
588 		 * to the protocol's set function.  If the protocol doesn't
589 		 * like it, it can update the value in it return argument.
590 		 */
591 		/*
592 		 * Pass each negotiated option through the protocol set
593 		 * function.
594 		 * Note: sanity check on option header values done in first
595 		 * pass and not repeated here.
596 		 */
597 		toa = (struct T_optmgmt_ack *)tor;
598 
599 		for (opt = is_restart ? restart_opt: opt_start; opt < opt_end;
600 		    opt = next_opt) {
601 			int error;
602 
603 			/*
604 			 * Point to the current option in or, in case this
605 			 * option has to be restarted later on
606 			 */
607 			or->or_ropt = opt;
608 			next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
609 			    _TPI_ALIGN_OPT(opt->len));
610 
611 			error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE,
612 			    opt->level, opt->name,
613 			    opt->len, (uchar_t *)&opt[1],
614 			    &opt->len, (uchar_t *)&opt[1], NULL, cr, first_mp);
615 			/*
616 			 * Treat positive "errors" as real.
617 			 * Note: negative errors are to be treated as
618 			 * non-fatal by svr4_optcom_req() and are
619 			 * returned by setfn() when it is passed an
620 			 * option it does not handle. Since the option
621 			 * passed opt_chk_lookup(), it is implied that
622 			 * it is valid but was either handled upstream
623 			 * or will be handled downstream.
624 			 */
625 			if (error == EINPROGRESS) {
626 				/*
627 				 * The message is queued and will be
628 				 * reprocessed later. Typically ip queued
629 				 * the message to get some exclusive conditions
630 				 * and later on calls this func again.
631 				 */
632 				return (EINPROGRESS);
633 			} else if (error > 0) {
634 				optcom_err_ack(q, mp, TSYSERR, error);
635 				freeb(first_mp);
636 				return (0);
637 			} else if (error < 0 && is_tcp) {
638 				/*
639 				 * Pass the option down to IP only
640 				 * if TCP hasn't processed it.
641 				 */
642 				pass_to_ip = B_TRUE;
643 			}
644 		}
645 		/* Done with the restart control mp. */
646 		freeb(first_mp);
647 		pass_to_next = B_TRUE;
648 		break;
649 	default:
650 		optcom_err_ack(q, mp, TBADFLAG, 0);
651 		return (0);
652 	}
653 
654 	if (pass_to_next && (q->q_next != NULL || pass_to_ip)) {
655 		/* Send it down to the next module and let it reply */
656 		toa->PRIM_type = T_SVR4_OPTMGMT_REQ; /* Changed by IP to ACK */
657 		if (q->q_next != NULL)
658 			putnext(q, mp);
659 		else
660 			ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
661 	} else {
662 		/* Set common fields in the header. */
663 		toa->MGMT_flags = T_SUCCESS;
664 		mp->b_datap->db_type = M_PCPROTO;
665 		toa->PRIM_type = T_OPTMGMT_ACK;
666 		qreply(q, mp);
667 	}
668 	return (0);
669 bad_opt:;
670 	optcom_err_ack(q, mp, TBADOPT, 0);
671 	return (0);
672 }
673 
674 /*
675  * New optcom_req inspired by TPI/XTI semantics
676  */
677 int
678 tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp)
679 {
680 	t_scalar_t t_error;
681 	mblk_t *toa_mp;
682 	boolean_t pass_to_next;
683 	size_t toa_len;
684 	struct T_optmgmt_ack *toa;
685 	struct T_optmgmt_req *tor =
686 	    (struct T_optmgmt_req *)mp->b_rptr;
687 
688 	opt_restart_t *or;
689 	boolean_t is_restart = B_FALSE;
690 	mblk_t	*first_mp = NULL;
691 	t_uscalar_t worst_status;
692 	boolean_t queued_status;
693 
694 	/*
695 	 * Allocate M_CTL and prepend to the packet for restarting this
696 	 * option if needed. IP may need to queue and restart the option
697 	 * if it cannot obtain exclusive conditions immediately. Please see
698 	 * IP-MT notes before the start of svr4_optcom_req
699 	 */
700 	if (mp->b_datap->db_type == M_CTL) {
701 		is_restart = B_TRUE;
702 		first_mp = mp;
703 		toa_mp = mp->b_cont;
704 		mp = toa_mp->b_cont;
705 		ASSERT(mp->b_wptr - mp->b_rptr >=
706 		    sizeof (struct T_optmgmt_req));
707 		tor = (struct T_optmgmt_req *)mp->b_rptr;
708 		ASSERT(tor->MGMT_flags == T_NEGOTIATE);
709 
710 		or = (opt_restart_t *)first_mp->b_rptr;
711 		goto restart;
712 	}
713 
714 	/* Verify message integrity. */
715 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_optmgmt_req)) {
716 		optcom_err_ack(q, mp, TBADOPT, 0);
717 		return (0);
718 	}
719 
720 	/* Verify MGMT_flags legal */
721 	switch (tor->MGMT_flags) {
722 	case T_DEFAULT:
723 	case T_NEGOTIATE:
724 	case T_CURRENT:
725 	case T_CHECK:
726 		/* OK - legal request flags */
727 		break;
728 	default:
729 		optcom_err_ack(q, mp, TBADFLAG, 0);
730 		return (0);
731 	}
732 
733 	/*
734 	 * In this design, there are two passes required on the input buffer
735 	 * mostly to accomodate variable length options and "T_ALLOPT" option
736 	 * which has the semantics "all options of the specified level".
737 	 *
738 	 * For T_DEFAULT, T_NEGOTIATE, T_CURRENT, and T_CHECK requests, we make
739 	 * a pass through the input buffer validating the details and making
740 	 * sure each option is supported by the protocol. We also determine the
741 	 * length of the option buffer to return. (Variable length options and
742 	 * T_ALLOPT mean that length can be different for output buffer).
743 	 */
744 
745 	pass_to_next = B_FALSE;	/* initial value */
746 	toa_len = 0;		/* initial value */
747 
748 	/*
749 	 * First pass, we do the following
750 	 *	- estimate cumulative length needed for results
751 	 *	- set "status" field based on permissions, option header check
752 	 *	  etc.
753 	 *	- determine "pass_to_next" whether we need to send request to
754 	 *	  downstream module/driver.
755 	 */
756 	if ((t_error = process_topthdrs_first_pass(mp, cr, dbobjp,
757 	    &pass_to_next, &toa_len)) != 0) {
758 		optcom_err_ack(q, mp, t_error, 0);
759 		return (0);
760 	}
761 
762 	/*
763 	 * A validation phase of the input buffer is done. We have also
764 	 * obtained the length requirement and and other details about the
765 	 * input and we liked input buffer so far.  We make another scan
766 	 * through the input now and generate the output necessary to complete
767 	 * the operation.
768 	 */
769 
770 	toa_mp = allocb_cred(toa_len, cr);
771 	if (!toa_mp) {
772 		optcom_err_ack(q, mp, TSYSERR, ENOMEM);
773 		return (0);
774 	}
775 
776 	first_mp = allocb(sizeof (opt_restart_t), BPRI_LO);
777 	if (first_mp == NULL) {
778 		freeb(toa_mp);
779 		optcom_err_ack(q, mp, TSYSERR, ENOMEM);
780 		return (0);
781 	}
782 	first_mp->b_datap->db_type = M_CTL;
783 	or = (opt_restart_t *)first_mp->b_rptr;
784 	/*
785 	 * Set initial values for generating output.
786 	 */
787 	or->or_worst_status = T_SUCCESS;
788 	or->or_type = T_OPTMGMT_REQ;
789 	or->or_private = 0;
790 	/* remaining fields fileed in do_options_second_pass */
791 
792 restart:
793 	/*
794 	 * This routine makes another pass through the option buffer this
795 	 * time acting on the request based on "status" result in the
796 	 * first pass. It also performs "expansion" of T_ALLOPT into
797 	 * all options of a certain level and acts on each for this request.
798 	 */
799 	if ((t_error = do_options_second_pass(q, mp, toa_mp, cr, dbobjp,
800 	    first_mp, is_restart, &queued_status)) != 0) {
801 		freemsg(toa_mp);
802 		optcom_err_ack(q, mp, t_error, 0);
803 		return (0);
804 	}
805 	if (queued_status) {
806 		/* Option will be restarted */
807 		return (EINPROGRESS);
808 	}
809 	worst_status = or->or_worst_status;
810 	/* Done with the first mp */
811 	freeb(first_mp);
812 	toa_mp->b_cont = NULL;
813 
814 	/*
815 	 * Following code relies on the coincidence that T_optmgmt_req
816 	 * and T_optmgmt_ack are identical in binary representation
817 	 */
818 	toa = (struct T_optmgmt_ack *)toa_mp->b_rptr;
819 	toa->OPT_length = (t_scalar_t)(toa_mp->b_wptr - (toa_mp->b_rptr +
820 	    sizeof (struct T_optmgmt_ack)));
821 	toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
822 
823 	toa->MGMT_flags = tor->MGMT_flags;
824 
825 
826 	freemsg(mp);		/* free input mblk */
827 
828 	/*
829 	 * If there is atleast one option that requires a downstream
830 	 * forwarding and if it is possible, we forward the message
831 	 * downstream. Else we ack it.
832 	 */
833 	if (pass_to_next && (q->q_next != NULL || dbobjp == &tcp_opt_obj)) {
834 		/*
835 		 * We pass it down as T_OPTMGMT_REQ. This code relies
836 		 * on the happy coincidence that T_optmgmt_req and
837 		 * T_optmgmt_ack are identical data structures
838 		 * at the binary representation level.
839 		 */
840 		toa_mp->b_datap->db_type = M_PROTO;
841 		toa->PRIM_type = T_OPTMGMT_REQ;
842 		if (q->q_next != NULL)
843 			putnext(q, toa_mp);
844 		else
845 			ip_output(Q_TO_CONN(q), toa_mp, q, IP_WPUT);
846 	} else {
847 		toa->PRIM_type = T_OPTMGMT_ACK;
848 		toa_mp->b_datap->db_type = M_PCPROTO;
849 		toa->MGMT_flags |= worst_status; /* XXX "worst" or "OR" TPI ? */
850 		qreply(q, toa_mp);
851 	}
852 	return (0);
853 }
854 
855 
856 /*
857  * Following routine makes a pass through option buffer in mp and performs the
858  * following tasks.
859  *	- estimate cumulative length needed for results
860  *	- set "status" field based on permissions, option header check
861  *	  etc.
862  *	- determine "pass_to_next" whether we need to send request to
863  *	  downstream module/driver.
864  */
865 
866 static t_scalar_t
867 process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
868     boolean_t *pass_to_nextp, size_t *toa_lenp)
869 {
870 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
871 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
872 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
873 	optlevel_t *valid_level_arr = dbobjp->odb_valid_levels_arr;
874 	uint_t valid_level_arr_cnt = dbobjp->odb_valid_levels_arr_cnt;
875 	struct T_opthdr *opt;
876 	struct T_opthdr *opt_start, *opt_end;
877 	opdes_t	*optd;
878 	size_t allopt_len;
879 	struct T_optmgmt_req *tor =
880 	    (struct T_optmgmt_req *)mp->b_rptr;
881 
882 	*toa_lenp = sizeof (struct T_optmgmt_ack); /* initial value */
883 
884 	if ((opt_start = (struct T_opthdr *)
885 	    mi_offset_param(mp, tor->OPT_offset, tor->OPT_length)) == NULL) {
886 		return (TBADOPT);
887 	}
888 	if (!__TPI_TOPT_ISALIGNED(opt_start))
889 		return (TBADOPT);
890 
891 	opt_end = (struct T_opthdr *)((uchar_t *)opt_start + tor->OPT_length);
892 
893 	for (opt = opt_start; opt && (opt < opt_end);
894 	    opt = _TPI_TOPT_NEXTHDR(opt_start, tor->OPT_length, opt)) {
895 		/*
896 		 * Validate the option for length and alignment
897 		 * before accessing anything in it.
898 		 */
899 		if (!(_TPI_TOPT_VALID(opt, opt_start, opt_end)))
900 			return (TBADOPT);
901 
902 		/* Find the option in the opt_arr. */
903 		if (opt->name != T_ALLOPT) {
904 			optd = opt_chk_lookup(opt->level, opt->name,
905 			    opt_arr, opt_arr_cnt);
906 			if (optd == NULL) {
907 				/*
908 				 * Option not found
909 				 *
910 				 * Verify if level is "valid" or not.
911 				 * Note: This check is required by XTI
912 				 *
913 				 * TPI provider always initializes
914 				 * the "not supported" (or whatever) status
915 				 * for the options. Other levels leave status
916 				 * unchanged if they do not understand an
917 				 * option.
918 				 */
919 				if (topmost_tpiprovider) {
920 					if (!opt_level_valid(opt->level,
921 					    valid_level_arr,
922 					    valid_level_arr_cnt))
923 						return (TBADOPT);
924 					/*
925 					 * level is valid - initialize
926 					 * option as not supported
927 					 */
928 					opt->status = T_NOTSUPPORT;
929 				}
930 
931 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
932 				continue;
933 			}
934 		} else {
935 			/*
936 			 * Handle T_ALLOPT case as a special case.
937 			 * Note: T_ALLOPT does not mean anything
938 			 * for T_CHECK operation.
939 			 */
940 			allopt_len = 0;
941 			if (tor->MGMT_flags == T_CHECK ||
942 			    !topmost_tpiprovider ||
943 			    ((allopt_len = opt_level_allopts_lengths(opt->level,
944 				opt_arr, opt_arr_cnt)) == 0)) {
945 				/*
946 				 * This is confusing but correct !
947 				 * It is not valid to to use T_ALLOPT with
948 				 * T_CHECK flag.
949 				 *
950 				 * T_ALLOPT is assumed "expanded" at the
951 				 * topmost_tpiprovider level so it should not
952 				 * be there as an "option name" if this is not
953 				 * a topmost_tpiprovider call and we fail it.
954 				 *
955 				 * opt_level_allopts_lengths() is used to verify
956 				 * that "level" associated with the T_ALLOPT is
957 				 * supported.
958 				 *
959 				 */
960 				opt->status = T_FAILURE;
961 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
962 				continue;
963 			}
964 			ASSERT(allopt_len != 0); /* remove ? */
965 
966 			*toa_lenp += allopt_len;
967 			opt->status = T_SUCCESS;
968 			/* XXX - always set T_ALLOPT 'pass_to_next' for now */
969 			*pass_to_nextp = B_TRUE;
970 			continue;
971 		}
972 		/*
973 		 * Check if option wants to flow downstream
974 		 */
975 		if (optd->opdes_props & OP_PASSNEXT)
976 			*pass_to_nextp = B_TRUE;
977 
978 		/* Additional checks dependent on operation. */
979 		switch (tor->MGMT_flags) {
980 		case T_DEFAULT:
981 		case T_CURRENT:
982 
983 			/*
984 			 * The opt_chk_lookup() routine call above approved of
985 			 * this option so we can work on the status for it
986 			 * based on the permissions for the operation. (This
987 			 * can override any status for it set at higher levels)
988 			 * We assume this override is OK since chkfn at this
989 			 * level approved of this option.
990 			 *
991 			 * T_CURRENT semantics:
992 			 * The read access is required. Else option
993 			 * status is T_NOTSUPPORT.
994 			 *
995 			 * T_DEFAULT semantics:
996 			 * Note: specification is not clear on this but we
997 			 * interpret T_DEFAULT semantics such that access to
998 			 * read value is required for access even the default
999 			 * value. Otherwise the option status is T_NOTSUPPORT.
1000 			 */
1001 			if (!OA_READ_PERMISSION(optd, cr)) {
1002 				opt->status = T_NOTSUPPORT;
1003 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
1004 				/* skip to next */
1005 				continue;
1006 			}
1007 
1008 			/*
1009 			 * T_DEFAULT/T_CURRENT semantics:
1010 			 * We know that read access is set. If no other access
1011 			 * is set, then status is T_READONLY.
1012 			 */
1013 			if (OA_READONLY_PERMISSION(optd, cr))
1014 				opt->status = T_READONLY;
1015 			else
1016 				opt->status = T_SUCCESS;
1017 			/*
1018 			 * Option passes all checks. Make room for it in the
1019 			 * ack. Note: size stored in table does not include
1020 			 * space for option header.
1021 			 */
1022 			*toa_lenp += sizeof (struct T_opthdr) +
1023 			    _TPI_ALIGN_TOPT(optd->opdes_size);
1024 			break;
1025 
1026 		case T_CHECK:
1027 		case T_NEGOTIATE:
1028 
1029 			/*
1030 			 * T_NEGOTIATE semantics:
1031 			 * If for fixed length option value on input is not the
1032 			 * same as value supplied, then status is T_FAILURE.
1033 			 *
1034 			 * T_CHECK semantics:
1035 			 * If value is supplied, semantics same as T_NEGOTIATE.
1036 			 * It is however ok not to supply a value with T_CHECK.
1037 			 */
1038 
1039 			if (tor->MGMT_flags == T_NEGOTIATE ||
1040 			    (opt->len != sizeof (struct T_opthdr))) {
1041 				/*
1042 				 * Implies "value" is specified in T_CHECK or
1043 				 * it is a T_NEGOTIATE request.
1044 				 * Verify size.
1045 				 * Note: This can override anything about this
1046 				 * option request done at a higher level.
1047 				 */
1048 				if (!opt_length_ok(optd, opt)) {
1049 					/* bad size */
1050 					*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
1051 					opt->status = T_FAILURE;
1052 					continue;
1053 				}
1054 			}
1055 			/*
1056 			 * The opt_chk_lookup()  routine above() approved of
1057 			 * this option so we can work on the status for it based
1058 			 * on the permissions for the operation. (This can
1059 			 * override anything set at a higher level).
1060 			 *
1061 			 * T_CHECK/T_NEGOTIATE semantics:
1062 			 * Set status to T_READONLY if read is the only access
1063 			 * permitted
1064 			 */
1065 			if (OA_READONLY_PERMISSION(optd, cr)) {
1066 				opt->status = T_READONLY;
1067 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
1068 				/* skip to next */
1069 				continue;
1070 			}
1071 
1072 			/*
1073 			 * T_CHECK/T_NEGOTIATE semantics:
1074 			 * If write (or execute) access is not set, then status
1075 			 * is T_NOTSUPPORT.
1076 			 */
1077 			if (!OA_WRITE_OR_EXECUTE(optd, cr)) {
1078 				opt->status = T_NOTSUPPORT;
1079 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
1080 				/* skip to next option */
1081 				continue;
1082 			}
1083 			/*
1084 			 * Option passes all checks. Make room for it in the
1085 			 * ack and set success in status.
1086 			 * Note: size stored in table does not include header
1087 			 * length.
1088 			 */
1089 			opt->status = T_SUCCESS;
1090 			*toa_lenp += sizeof (struct T_opthdr) +
1091 			    _TPI_ALIGN_TOPT(optd->opdes_size);
1092 			break;
1093 
1094 		default:
1095 			return (TBADFLAG);
1096 		}
1097 	} /* for loop scanning input buffer */
1098 
1099 	return (0);		/* OK return */
1100 }
1101 
1102 /*
1103  * This routine makes another pass through the option buffer this
1104  * time acting on the request based on "status" result in the
1105  * first pass. It also performs "expansion" of T_ALLOPT into
1106  * all options of a certain level and acts on each for this request.
1107  */
1108 static t_scalar_t
1109 do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr,
1110     optdb_obj_t *dbobjp, mblk_t *first_mp, boolean_t is_restart,
1111     boolean_t *queued_statusp)
1112 {
1113 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1114 	int failed_option;
1115 	struct T_opthdr *opt;
1116 	struct T_opthdr *opt_start, *opt_end, *restart_opt;
1117 	uchar_t *optr;
1118 	uint_t optset_context;
1119 	struct T_optmgmt_req *tor = (struct T_optmgmt_req *)reqmp->b_rptr;
1120 	opt_restart_t	*or;
1121 	t_uscalar_t	*worst_statusp;
1122 	int	err;
1123 
1124 	*queued_statusp = B_FALSE;
1125 	or = (opt_restart_t *)first_mp->b_rptr;
1126 	worst_statusp = &or->or_worst_status;
1127 
1128 	optr = (uchar_t *)ack_mp->b_rptr +
1129 	    sizeof (struct T_optmgmt_ack); /* assumed int32_t aligned */
1130 
1131 	/*
1132 	 * Set initial values for scanning input
1133 	 */
1134 	if (is_restart) {
1135 		opt_start = (struct T_opthdr *)or->or_start;
1136 		opt_end = (struct T_opthdr *)or->or_end;
1137 		restart_opt = (struct T_opthdr *)or->or_ropt;
1138 	} else {
1139 		opt_start = (struct T_opthdr *)mi_offset_param(reqmp,
1140 		    tor->OPT_offset, tor->OPT_length);
1141 		if (opt_start == NULL)
1142 			return (TBADOPT);
1143 		opt_end = (struct T_opthdr *)((uchar_t *)opt_start +
1144 		    tor->OPT_length);
1145 		or->or_start = (struct opthdr *)opt_start;
1146 		or->or_end = (struct opthdr *)opt_end;
1147 		/*
1148 		 * construct the mp chain, in case the setfn needs to
1149 		 * queue this and restart option processing later on.
1150 		 */
1151 		first_mp->b_cont = ack_mp;
1152 		ack_mp->b_cont = reqmp;
1153 	}
1154 	ASSERT(__TPI_TOPT_ISALIGNED(opt_start)); /* verified in first pass */
1155 
1156 	for (opt = is_restart ? restart_opt : opt_start;
1157 	    opt && (opt < opt_end);
1158 	    opt = _TPI_TOPT_NEXTHDR(opt_start, tor->OPT_length, opt)) {
1159 		or->or_ropt = (struct opthdr *)opt;
1160 		/* verified in first pass */
1161 		ASSERT(_TPI_TOPT_VALID(opt, opt_start, opt_end));
1162 
1163 		/*
1164 		 * If the first pass in process_topthdrs_first_pass()
1165 		 * has marked the option as a failure case for the MGMT_flags
1166 		 * semantics then there is not much to do.
1167 		 *
1168 		 * Note: For all practical purposes, T_READONLY status is
1169 		 * a "success" for T_DEFAULT/T_CURRENT and "failure" for
1170 		 * T_CHECK/T_NEGOTIATE
1171 		 */
1172 		failed_option =
1173 		    (opt->status == T_NOTSUPPORT) ||
1174 		    (opt->status == T_FAILURE) ||
1175 		    ((tor->MGMT_flags & (T_NEGOTIATE|T_CHECK)) &&
1176 			(opt->status == T_READONLY));
1177 
1178 		if (failed_option) {
1179 			/*
1180 			 * According to T_DEFAULT/T_CURRENT semantics, the
1181 			 * input values, even if present, are to be ignored.
1182 			 * Note: Specification is not clear on this, but we
1183 			 * interpret that even though we ignore the values, we
1184 			 * can return them as is. So we process them similar to
1185 			 * T_CHECK/T_NEGOTIATE case which has the semantics to
1186 			 * return the values as is. XXX If interpretation is
1187 			 * ever determined incorrect fill in appropriate code
1188 			 * here to treat T_DEFAULT/T_CURRENT differently.
1189 			 *
1190 			 * According to T_CHECK/T_NEGOTIATE semantics,
1191 			 * in the case of T_NOTSUPPORT/T_FAILURE/T_READONLY,
1192 			 * the semantics are to return the "value" part of
1193 			 * option untouched. So here we copy the option
1194 			 * head including value part if any to output.
1195 			 */
1196 
1197 			bcopy(opt, optr, opt->len);
1198 			optr += _TPI_ALIGN_TOPT(opt->len);
1199 
1200 			*worst_statusp = get_worst_status(opt->status,
1201 			    *worst_statusp);
1202 
1203 			/* skip to process next option in buffer */
1204 			continue;
1205 
1206 		} /* end if "failed option" */
1207 		/*
1208 		 * The status is T_SUCCESS or T_READONLY
1209 		 * We process the value part here
1210 		 */
1211 		ASSERT(opt->status == T_SUCCESS || opt->status == T_READONLY);
1212 		switch (tor->MGMT_flags) {
1213 		case T_DEFAULT:
1214 			/*
1215 			 * We fill default value from table or protocol specific
1216 			 * function. If this call fails, we pass input through.
1217 			 */
1218 			if (do_opt_default(q, opt, &optr, worst_statusp,
1219 			    cr, dbobjp) < 0) {
1220 				/* fail or pass transparently */
1221 				if (topmost_tpiprovider)
1222 					opt->status = T_FAILURE;
1223 				bcopy(opt, optr, opt->len);
1224 				optr += _TPI_ALIGN_TOPT(opt->len);
1225 				*worst_statusp = get_worst_status(opt->status,
1226 				    *worst_statusp);
1227 			}
1228 			break;
1229 
1230 		case T_CURRENT:
1231 
1232 			do_opt_current(q, opt, &optr, worst_statusp, cr,
1233 			    dbobjp);
1234 			break;
1235 
1236 		case T_CHECK:
1237 		case T_NEGOTIATE:
1238 			if (tor->MGMT_flags == T_CHECK)
1239 				optset_context = SETFN_OPTCOM_CHECKONLY;
1240 			else	/* T_NEGOTIATE */
1241 				optset_context = SETFN_OPTCOM_NEGOTIATE;
1242 			err = do_opt_check_or_negotiate(q, opt, optset_context,
1243 			    &optr, worst_statusp, cr, dbobjp, first_mp);
1244 			if (err == EINPROGRESS) {
1245 				*queued_statusp = B_TRUE;
1246 				return (0);
1247 			}
1248 			break;
1249 		default:
1250 			return (TBADFLAG);
1251 		}
1252 	} /* end for loop scanning option buffer */
1253 
1254 	ack_mp->b_wptr = optr;
1255 	ASSERT(ack_mp->b_wptr <= ack_mp->b_datap->db_lim);
1256 
1257 	return (0);		/* OK return */
1258 }
1259 
1260 
1261 static t_uscalar_t
1262 get_worst_status(t_uscalar_t status, t_uscalar_t current_worst_status)
1263 {
1264 	/*
1265 	 * Return the "worst" among the arguments "status" and
1266 	 * "current_worst_status".
1267 	 *
1268 	 * Note: Tracking "worst_status" can be made a bit simpler
1269 	 * if we use the property that status codes are bitwise
1270 	 * distinct.
1271 	 *
1272 	 * The pecking order is
1273 	 *
1274 	 * T_SUCCESS ..... best
1275 	 * T_PARTSUCCESS
1276 	 * T_FAILURE
1277 	 * T_READONLY
1278 	 * T_NOTSUPPORT... worst
1279 	 */
1280 	if (status == current_worst_status)
1281 		return (current_worst_status);
1282 	switch (current_worst_status) {
1283 	case T_SUCCESS:
1284 		if (status == T_PARTSUCCESS)
1285 			return (T_PARTSUCCESS);
1286 		/* FALLTHROUGH */
1287 	case T_PARTSUCCESS:
1288 		if (status == T_FAILURE)
1289 			return (T_FAILURE);
1290 		/* FALLTHROUGH */
1291 	case T_FAILURE:
1292 		if (status == T_READONLY)
1293 			return (T_READONLY);
1294 		/* FALLTHROUGH */
1295 	case T_READONLY:
1296 		if (status == T_NOTSUPPORT)
1297 			return (T_NOTSUPPORT);
1298 		/* FALLTHROUGH */
1299 	case T_NOTSUPPORT:
1300 	default:
1301 		return (current_worst_status);
1302 	}
1303 }
1304 
1305 static int
1306 do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
1307     t_uscalar_t *worst_statusp, cred_t *cr, optdb_obj_t *dbobjp)
1308 {
1309 	pfi_t	deffn = dbobjp->odb_deffn;
1310 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
1311 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1312 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1313 
1314 	struct T_opthdr *topth;
1315 	opdes_t *optd;
1316 
1317 	if (reqopt->name != T_ALLOPT) {
1318 		/*
1319 		 * lookup the option in the table and fill default value
1320 		 */
1321 		optd = opt_chk_lookup(reqopt->level, reqopt->name,
1322 		    opt_arr, opt_arr_cnt);
1323 
1324 		if (optd == NULL) {
1325 			/*
1326 			 * not found - fail this one. Should not happen
1327 			 * for topmost_tpiprovider as calling routine
1328 			 * should have verified it.
1329 			 */
1330 			ASSERT(!topmost_tpiprovider);
1331 			return (-1);
1332 		}
1333 
1334 		topth = (struct T_opthdr *)(*resptrp);
1335 		topth->level = reqopt->level;
1336 		topth->name = reqopt->name;
1337 		topth->status = reqopt->status;
1338 
1339 		*worst_statusp = get_worst_status(reqopt->status,
1340 		    *worst_statusp);
1341 
1342 		if (optd->opdes_props & OP_NODEFAULT) {
1343 			/* header only, no default "value" part */
1344 			topth->len = sizeof (struct T_opthdr);
1345 			*resptrp += sizeof (struct T_opthdr);
1346 		} else {
1347 			int deflen;
1348 
1349 			if (optd->opdes_props & OP_DEF_FN) {
1350 				deflen = (*deffn)(q, reqopt->level,
1351 				    reqopt->name, _TPI_TOPT_DATA(topth));
1352 				if (deflen >= 0) {
1353 					topth->len = (t_uscalar_t)
1354 					    (sizeof (struct T_opthdr) + deflen);
1355 				} else {
1356 					/*
1357 					 * return error, this should 'pass
1358 					 * through' the option and maybe some
1359 					 * other level will fill it in or
1360 					 * already did.
1361 					 * (No change in 'resptrp' upto here)
1362 					 */
1363 					return (-1);
1364 				}
1365 			} else {
1366 				/* fill length and value part */
1367 				switch (optd->opdes_size) {
1368 				/*
1369 				 * Since options are guaranteed aligned only
1370 				 * on a 4 byte boundary (t_scalar_t) any
1371 				 * option that is greater in size will default
1372 				 * to the bcopy below
1373 				 */
1374 				case sizeof (int32_t):
1375 					*(int32_t *)_TPI_TOPT_DATA(topth) =
1376 					    (int32_t)optd->opdes_default;
1377 					break;
1378 				case sizeof (int16_t):
1379 					*(int16_t *)_TPI_TOPT_DATA(topth) =
1380 					    (int16_t)optd->opdes_default;
1381 					break;
1382 				case sizeof (int8_t):
1383 					*(int8_t *)_TPI_TOPT_DATA(topth) =
1384 					    (int8_t)optd->opdes_default;
1385 					break;
1386 				default:
1387 					/*
1388 					 * other length but still assume
1389 					 * fixed - use bcopy
1390 					 */
1391 					bcopy(optd->opdes_defbuf,
1392 					    _TPI_TOPT_DATA(topth),
1393 					    optd->opdes_size);
1394 					break;
1395 				}
1396 				topth->len = (t_uscalar_t)(optd->opdes_size +
1397 				    sizeof (struct T_opthdr));
1398 			}
1399 			*resptrp += _TPI_ALIGN_TOPT(topth->len);
1400 		}
1401 		return (0);	/* OK return */
1402 	}
1403 
1404 	/*
1405 	 * T_ALLOPT processing
1406 	 *
1407 	 * lookup and stuff default values of all the options of the
1408 	 * level specified
1409 	 * Note: This expansion of T_ALLOPT should happen in
1410 	 * a topmost_tpiprovider.
1411 	 */
1412 	ASSERT(topmost_tpiprovider);
1413 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
1414 		if (reqopt->level != optd->opdes_level)
1415 			continue;
1416 		/*
1417 		 *
1418 		 * T_DEFAULT semantics:
1419 		 * XXX: we interpret T_DEFAULT semantics such that access to
1420 		 * read value is required for access even the default value.
1421 		 * Else option is ignored for T_ALLOPT request.
1422 		 */
1423 		if (!OA_READ_PERMISSION(optd, cr))
1424 			/* skip this one */
1425 			continue;
1426 
1427 		/*
1428 		 * Found option of same level as T_ALLOPT request
1429 		 * that we can return.
1430 		 */
1431 
1432 		topth = (struct T_opthdr *)(*resptrp);
1433 		topth->level = optd->opdes_level;
1434 		topth->name = optd->opdes_name;
1435 
1436 		/*
1437 		 * T_DEFAULT semantics:
1438 		 * We know that read access is set. If no other access is set,
1439 		 * then status is T_READONLY
1440 		 */
1441 		if (OA_READONLY_PERMISSION(optd, cr)) {
1442 			topth->status = T_READONLY;
1443 			*worst_statusp = get_worst_status(T_READONLY,
1444 			    *worst_statusp);
1445 		} else {
1446 			topth->status = T_SUCCESS;
1447 			/*
1448 			 * Note: *worst_statusp has to be T_SUCCESS or
1449 			 * worse so no need to adjust
1450 			 */
1451 		}
1452 
1453 		if (optd->opdes_props & OP_NODEFAULT) {
1454 			/* header only, no value part */
1455 			topth->len = sizeof (struct T_opthdr);
1456 			*resptrp += sizeof (struct T_opthdr);
1457 		} else {
1458 			int deflen;
1459 
1460 			if (optd->opdes_props & OP_DEF_FN) {
1461 				deflen = (*deffn)(q, reqopt->level,
1462 				    reqopt->name, _TPI_TOPT_DATA(topth));
1463 				if (deflen >= 0) {
1464 					topth->len = (t_uscalar_t)(deflen +
1465 					    sizeof (struct T_opthdr));
1466 				} else {
1467 					/*
1468 					 * deffn failed.
1469 					 * return just the header as T_ALLOPT
1470 					 * expansion.
1471 					 * Some other level deffn may
1472 					 * supply value part.
1473 					 */
1474 					topth->len = sizeof (struct T_opthdr);
1475 					topth->status = T_FAILURE;
1476 					*worst_statusp =
1477 					    get_worst_status(T_FAILURE,
1478 						*worst_statusp);
1479 				}
1480 			} else {
1481 				/*
1482 				 * fill length and value part from
1483 				 * table
1484 				 */
1485 				switch (optd->opdes_size) {
1486 				/*
1487 				 * Since options are guaranteed aligned only
1488 				 * on a 4 byte boundary (t_scalar_t) any
1489 				 * option that is greater in size will default
1490 				 * to the bcopy below
1491 				 */
1492 				case sizeof (int32_t):
1493 					*(int32_t *)_TPI_TOPT_DATA(topth) =
1494 					    (int32_t)optd->opdes_default;
1495 					break;
1496 				case sizeof (int16_t):
1497 					*(int16_t *)_TPI_TOPT_DATA(topth) =
1498 					    (int16_t)optd->opdes_default;
1499 					break;
1500 				case sizeof (int8_t):
1501 					*(int8_t *)_TPI_TOPT_DATA(topth) =
1502 					    (int8_t)optd->opdes_default;
1503 					break;
1504 				default:
1505 					/*
1506 					 * other length but still assume
1507 					 * fixed - use bcopy
1508 					 */
1509 					bcopy(optd->opdes_defbuf,
1510 					    _TPI_TOPT_DATA(topth),
1511 					    optd->opdes_size);
1512 				}
1513 				topth->len = (t_uscalar_t)(optd->opdes_size +
1514 				    sizeof (struct T_opthdr));
1515 			}
1516 			*resptrp += _TPI_ALIGN_TOPT(topth->len);
1517 		}
1518 	}
1519 	return (0);
1520 }
1521 
1522 static void
1523 do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
1524     t_uscalar_t *worst_statusp, cred_t *cr, optdb_obj_t *dbobjp)
1525 {
1526 	pfi_t	getfn = dbobjp->odb_getfn;
1527 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
1528 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1529 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1530 
1531 	struct T_opthdr *topth;
1532 	opdes_t *optd;
1533 	int optlen;
1534 	uchar_t *initptr = *resptrp;
1535 
1536 	/*
1537 	 * We call getfn to get the current value of an option. The call may
1538 	 * fail in which case we copy the values from the input buffer. Maybe
1539 	 * something downstream will fill it in or something upstream did.
1540 	 */
1541 
1542 	if (reqopt->name != T_ALLOPT) {
1543 		topth = (struct T_opthdr *)*resptrp;
1544 		*resptrp += sizeof (struct T_opthdr);
1545 		optlen = (*getfn)(q, reqopt->level, reqopt->name, *resptrp);
1546 		if (optlen >= 0) {
1547 			topth->len = (t_uscalar_t)(optlen +
1548 			    sizeof (struct T_opthdr));
1549 			topth->level = reqopt->level;
1550 			topth->name = reqopt->name;
1551 			topth->status = reqopt->status;
1552 			*resptrp += _TPI_ALIGN_TOPT(optlen);
1553 			*worst_statusp = get_worst_status(topth->status,
1554 			    *worst_statusp);
1555 		} else {
1556 			/* failed - reset "*resptrp" pointer */
1557 			*resptrp -= sizeof (struct T_opthdr);
1558 		}
1559 	} else {		/* T_ALLOPT processing */
1560 		ASSERT(topmost_tpiprovider == B_TRUE);
1561 		/* scan and get all options */
1562 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
1563 			/* skip other levels */
1564 			if (reqopt->level != optd->opdes_level)
1565 				continue;
1566 
1567 			if (!OA_READ_PERMISSION(optd, cr))
1568 				/* skip this one */
1569 				continue;
1570 
1571 			topth = (struct T_opthdr *)*resptrp;
1572 			*resptrp += sizeof (struct T_opthdr);
1573 
1574 			/* get option of this level */
1575 			optlen = (*getfn)(q, reqopt->level, optd->opdes_name,
1576 			    *resptrp);
1577 			if (optlen >= 0) {
1578 				/* success */
1579 				topth->len = (t_uscalar_t)(optlen +
1580 				    sizeof (struct T_opthdr));
1581 				topth->level = reqopt->level;
1582 				topth->name = optd->opdes_name;
1583 				if (OA_READONLY_PERMISSION(optd, cr))
1584 					topth->status = T_READONLY;
1585 				else
1586 					topth->status = T_SUCCESS;
1587 				*resptrp += _TPI_ALIGN_TOPT(optlen);
1588 			} else {
1589 				/*
1590 				 * failed, return as T_FAILURE and null value
1591 				 * part. Maybe something downstream will
1592 				 * handle this one and fill in a value. Here
1593 				 * it is just part of T_ALLOPT expansion.
1594 				 */
1595 				topth->len = sizeof (struct T_opthdr);
1596 				topth->level = reqopt->level;
1597 				topth->name = optd->opdes_name;
1598 				topth->status = T_FAILURE;
1599 			}
1600 			*worst_statusp = get_worst_status(topth->status,
1601 			    *worst_statusp);
1602 		} /* end for loop */
1603 	}
1604 	if (*resptrp == initptr) {
1605 		/*
1606 		 * getfn failed and does not want to handle this option. Maybe
1607 		 * something downstream will or something upstream did. (If
1608 		 * topmost_tpiprovider, initialize "status" to failure which
1609 		 * can possibly change downstream). Copy the input "as is" from
1610 		 * input option buffer if any to maintain transparency.
1611 		 */
1612 		if (topmost_tpiprovider)
1613 			reqopt->status = T_FAILURE;
1614 		bcopy(reqopt, *resptrp, reqopt->len);
1615 		*resptrp += _TPI_ALIGN_TOPT(reqopt->len);
1616 		*worst_statusp = get_worst_status(reqopt->status,
1617 		    *worst_statusp);
1618 	}
1619 }
1620 
1621 
1622 
1623 static int
1624 do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
1625     uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
1626     cred_t *cr, optdb_obj_t *dbobjp, mblk_t *first_mp)
1627 {
1628 	pfi_t	deffn = dbobjp->odb_deffn;
1629 	opt_set_fn setfn = dbobjp->odb_setfn;
1630 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
1631 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1632 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1633 
1634 	struct T_opthdr *topth;
1635 	opdes_t *optd;
1636 	int error;
1637 	t_uscalar_t optlen;
1638 	t_scalar_t optsize;
1639 	uchar_t *initptr = *resptrp;
1640 
1641 	ASSERT(reqopt->status == T_SUCCESS);
1642 
1643 	if (reqopt->name != T_ALLOPT) {
1644 		topth = (struct T_opthdr *)*resptrp;
1645 		*resptrp += sizeof (struct T_opthdr);
1646 		error = (*setfn)(q, optset_context, reqopt->level, reqopt->name,
1647 		    reqopt->len - sizeof (struct T_opthdr),
1648 		    _TPI_TOPT_DATA(reqopt), &optlen, _TPI_TOPT_DATA(topth),
1649 		    NULL, cr, first_mp);
1650 		if (error) {
1651 			/* failed - reset "*resptrp" */
1652 			*resptrp -= sizeof (struct T_opthdr);
1653 			if (error == EINPROGRESS)
1654 				return (error);
1655 		} else {
1656 			/*
1657 			 * success - "value" already filled in setfn()
1658 			 */
1659 			topth->len = (t_uscalar_t)(optlen +
1660 			    sizeof (struct T_opthdr));
1661 			topth->level = reqopt->level;
1662 			topth->name = reqopt->name;
1663 			topth->status = reqopt->status;
1664 			*resptrp += _TPI_ALIGN_TOPT(optlen);
1665 			*worst_statusp = get_worst_status(topth->status,
1666 			    *worst_statusp);
1667 		}
1668 	} else {		/* T_ALLOPT processing */
1669 		/* only for T_NEGOTIATE case */
1670 		ASSERT(optset_context == SETFN_OPTCOM_NEGOTIATE);
1671 		ASSERT(topmost_tpiprovider == B_TRUE);
1672 
1673 		/* scan and set all options to default value */
1674 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
1675 
1676 			/* skip other levels */
1677 			if (reqopt->level != optd->opdes_level)
1678 				continue;
1679 
1680 			if (OA_EXECUTE_PERMISSION(optd, cr) ||
1681 			    OA_NO_PERMISSION(optd, cr)) {
1682 				/*
1683 				 * skip this one too. Does not make sense to
1684 				 * set anything to default value for "execute"
1685 				 * options.
1686 				 */
1687 				continue;
1688 			}
1689 
1690 			if (OA_READONLY_PERMISSION(optd, cr)) {
1691 				/*
1692 				 * Return with T_READONLY status (and no value
1693 				 * part). Note: spec is not clear but
1694 				 * XTI test suite needs this.
1695 				 */
1696 				topth = (struct T_opthdr *)*resptrp;
1697 				topth->len = sizeof (struct T_opthdr);
1698 				*resptrp += topth->len;
1699 				topth->level = reqopt->level;
1700 				topth->name = optd->opdes_name;
1701 				topth->status = T_READONLY;
1702 				*worst_statusp = get_worst_status(topth->status,
1703 				    *worst_statusp);
1704 				continue;
1705 			}
1706 
1707 			/*
1708 			 * It is not read only or execute type
1709 			 * the it must have write permission
1710 			 */
1711 			ASSERT(OA_WRITE_PERMISSION(optd, cr));
1712 
1713 			topth = (struct T_opthdr *)*resptrp;
1714 			*resptrp += sizeof (struct T_opthdr);
1715 
1716 			topth->len = sizeof (struct T_opthdr);
1717 			topth->level = reqopt->level;
1718 			topth->name = optd->opdes_name;
1719 			if (optd->opdes_props & OP_NODEFAULT) {
1720 				/*
1721 				 * Option of "no default value" so it does not
1722 				 * make sense to try to set it. We just return
1723 				 * header with status of T_SUCCESS
1724 				 * XXX should this be failure ?
1725 				 */
1726 				topth->status = T_SUCCESS;
1727 				continue; /* skip setting */
1728 			}
1729 			if (optd->opdes_props & OP_DEF_FN) {
1730 				if ((optd->opdes_props & OP_VARLEN) ||
1731 				    ((optsize = (*deffn)(q, reqopt->level,
1732 					optd->opdes_name,
1733 					(uchar_t *)optd->opdes_defbuf)) < 0)) {
1734 					/* XXX - skip these too */
1735 					topth->status = T_SUCCESS;
1736 					continue; /* skip setting */
1737 				}
1738 			} else {
1739 				optsize = optd->opdes_size;
1740 			}
1741 
1742 
1743 			/* set option of this level */
1744 			error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE,
1745 			    reqopt->level, optd->opdes_name, optsize,
1746 			    (uchar_t *)optd->opdes_defbuf, &optlen,
1747 			    _TPI_TOPT_DATA(topth), NULL, cr, NULL);
1748 			if (error) {
1749 				/*
1750 				 * failed, return as T_FAILURE and null value
1751 				 * part. Maybe something downstream will
1752 				 * handle this one and fill in a value. Here
1753 				 * it is just part of T_ALLOPT expansion.
1754 				 */
1755 				topth->status = T_FAILURE;
1756 				*worst_statusp = get_worst_status(topth->status,
1757 				    *worst_statusp);
1758 			} else {
1759 				/* success */
1760 				topth->len += optlen;
1761 				topth->status = T_SUCCESS;
1762 				*resptrp += _TPI_ALIGN_TOPT(optlen);
1763 			}
1764 		} /* end for loop */
1765 		/* END T_ALLOPT */
1766 	}
1767 
1768 	if (*resptrp == initptr) {
1769 		/*
1770 		 * setfn failed and does not want to handle this option. Maybe
1771 		 * something downstream will or something upstream
1772 		 * did. Copy the input as is from input option buffer if any to
1773 		 * maintain transparency (maybe something at a level above
1774 		 * did something.
1775 		 */
1776 		if (topmost_tpiprovider)
1777 			reqopt->status = T_FAILURE;
1778 		bcopy(reqopt, *resptrp, reqopt->len);
1779 		*resptrp += _TPI_ALIGN_TOPT(reqopt->len);
1780 		*worst_statusp = get_worst_status(reqopt->status,
1781 		    *worst_statusp);
1782 	}
1783 	return (0);
1784 }
1785 
1786 /*
1787  * The following routines process options buffer passed with
1788  * T_CONN_REQ, T_CONN_RES and T_UNITDATA_REQ.
1789  * This routine does the consistency check applied to the
1790  * sanity of formatting of multiple options packed in the
1791  * buffer.
1792  *
1793  * XTI brain damage alert:
1794  * XTI interface adopts the notion of an option being an
1795  * "absolute requirement" from OSI transport service (but applies
1796  * it to all transports including Internet transports).
1797  * The main effect of that is action on failure to "negotiate" a
1798  * requested option to the exact requested value
1799  *
1800  *          - if the option is an "absolute requirement", the primitive
1801  *            is aborted (e.g T_DISCON_REQ or T_UDERR generated)
1802  *          - if the option is NOT an "absolute requirement" it can
1803  *            just be ignored.
1804  *
1805  * We would not support "negotiating" of options on connection
1806  * primitives for Internet transports. However just in case we
1807  * forced to in order to pass strange test suites, the design here
1808  * tries to support these notions.
1809  *
1810  * tpi_optcom_buf(q, mp, opt_lenp, opt_offset, cred, dbobjp, thisdg_attrs,
1811  *	*is_absreq_failurep)
1812  *
1813  * - Verify the option buffer, if formatted badly, return error 1
1814  *
1815  * - If it is a "permissions" failure (read-only), return error 2
1816  *
1817  * - Else, process the option "in place", the following can happen,
1818  *	     - if a "privileged" option, mark it as "ignored".
1819  *	     - if "not supported", mark "ignored"
1820  *	     - if "supported" attempt negotiation and fill result in
1821  *	       the outcome
1822  *			- if "absolute requirement", set "*is_absreq_failurep"
1823  *			- if NOT an "absolute requirement", then our
1824  *			  interpretation is to mark is at ignored if
1825  *			  negotiation fails (Spec allows partial success
1826  *			  as in OSI protocols but not failure)
1827  *
1828  *   Then delete "ignored" options from option buffer and return success.
1829  *
1830  */
1831 
1832 int
1833 tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp,
1834     t_scalar_t opt_offset, cred_t *cr, optdb_obj_t *dbobjp,
1835     void *thisdg_attrs, int *is_absreq_failurep)
1836 {
1837 	opt_set_fn setfn = dbobjp->odb_setfn;
1838 	opdes_t *opt_arr = dbobjp->odb_opt_des_arr;
1839 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1840 	struct T_opthdr *opt, *opt_start, *opt_end;
1841 	mblk_t  *copy_mp_head;
1842 	uchar_t *optr, *init_optr;
1843 	opdes_t *optd;
1844 	uint_t optset_context;
1845 	t_uscalar_t olen;
1846 	int error = 0;
1847 
1848 	ASSERT((uchar_t *)opt_lenp > mp->b_rptr &&
1849 	    (uchar_t *)opt_lenp < mp->b_wptr);
1850 
1851 	copy_mp_head = NULL;
1852 	*is_absreq_failurep = 0;
1853 	switch (((union T_primitives *)mp->b_rptr)->type) {
1854 	case T_CONN_REQ:
1855 	case T_CONN_RES:
1856 		optset_context = SETFN_CONN_NEGOTIATE;
1857 		break;
1858 	case T_UNITDATA_REQ:
1859 		optset_context = SETFN_UD_NEGOTIATE;
1860 		break;
1861 	default:
1862 		/*
1863 		 * should never get here, all possible TPI primitives
1864 		 * where this can be called from should be accounted
1865 		 * for in the cases above
1866 		 */
1867 		return (EINVAL);
1868 	}
1869 
1870 	if ((opt_start = (struct T_opthdr *)
1871 	    mi_offset_param(mp, opt_offset, *opt_lenp)) == NULL) {
1872 		error = ENOPROTOOPT;
1873 		goto error_ret;
1874 	}
1875 	if (!__TPI_TOPT_ISALIGNED(opt_start)) {
1876 		error = ENOPROTOOPT;
1877 		goto error_ret;
1878 	}
1879 
1880 	opt_end = (struct T_opthdr *)((uchar_t *)opt_start
1881 	    + *opt_lenp);
1882 
1883 	if ((copy_mp_head = copyb(mp)) == (mblk_t *)NULL) {
1884 		error = ENOMEM;
1885 		goto error_ret;
1886 	}
1887 
1888 	init_optr = optr = (uchar_t *)&copy_mp_head->b_rptr[opt_offset];
1889 
1890 	for (opt = opt_start; opt && (opt < opt_end);
1891 	    opt = _TPI_TOPT_NEXTHDR(opt_start, *opt_lenp, opt)) {
1892 		/*
1893 		 * Validate the option for length and alignment
1894 		 * before accessing anything in it
1895 		 */
1896 		if (!_TPI_TOPT_VALID(opt, opt_start, opt_end)) {
1897 			error = ENOPROTOOPT;
1898 			goto error_ret;
1899 		}
1900 
1901 		/* Find the option in the opt_arr. */
1902 		optd = opt_chk_lookup(opt->level, opt->name,
1903 		    opt_arr, opt_arr_cnt);
1904 
1905 		if (optd == NULL) {
1906 			/*
1907 			 * Option not found
1908 			 */
1909 			opt->status = T_NOTSUPPORT;
1910 			continue;
1911 		}
1912 
1913 		/*
1914 		 * Weird but as in XTI spec.
1915 		 * Sec 6.3.6 "Privileged and ReadOnly Options"
1916 		 * Permission problems (e.g.readonly) fail with bad access
1917 		 * BUT "privileged" option request from those NOT PRIVILEGED
1918 		 * are to be merely "ignored".
1919 		 * XXX Prevents "probing" of privileged options ?
1920 		 */
1921 		if (OA_READONLY_PERMISSION(optd, cr)) {
1922 			error = EACCES;
1923 			goto error_ret;
1924 		}
1925 		if (OA_MATCHED_PRIV(optd, cr)) {
1926 			/*
1927 			 * For privileged options, we DO perform
1928 			 * access checks as is common sense
1929 			 */
1930 			if (!OA_WX_ANYPRIV(optd)) {
1931 				error = EACCES;
1932 				goto error_ret;
1933 			}
1934 		} else {
1935 			/*
1936 			 * For non privileged, we fail instead following
1937 			 * "ignore" semantics dictated by XTI spec for
1938 			 * permissions problems.
1939 			 * Sec 6.3.6 "Privileged and ReadOnly Options"
1940 			 * XXX Should we do "ignore" semantics ?
1941 			 */
1942 			if (!OA_WX_NOPRIV(optd)) { /* nopriv */
1943 				opt->status = T_FAILURE;
1944 				continue;
1945 			}
1946 		}
1947 		/*
1948 		 *
1949 		 * If the negotiation fails, for options that
1950 		 * are "absolute requirement", it is a fatal error.
1951 		 * For options that are NOT "absolute requirements",
1952 		 * and the value fails to negotiate, the XTI spec
1953 		 * only considers the possibility of partial success
1954 		 * (T_PARTSUCCES - not likely for Internet protocols).
1955 		 * The spec is in denial about complete failure
1956 		 * (T_FAILURE) to negotiate for options that are
1957 		 * carried on T_CONN_REQ/T_CONN_RES/T_UNITDATA
1958 		 * We interpret the T_FAILURE to negotiate an option
1959 		 * that is NOT an absolute requirement that it is safe
1960 		 * to ignore it.
1961 		 */
1962 
1963 		/* verify length */
1964 		if (!opt_length_ok(optd, opt)) {
1965 			/* bad size */
1966 			if ((optd->opdes_props & OP_NOT_ABSREQ) == 0) {
1967 				/* option is absolute requirement */
1968 				*is_absreq_failurep = 1;
1969 				error = EINVAL;
1970 				goto error_ret;
1971 			}
1972 			opt->status = T_FAILURE;
1973 			continue;
1974 		}
1975 
1976 		/*
1977 		 * verified generic attributes. Now call set function.
1978 		 * Note: We assume the following to simplify code.
1979 		 * XXX If this is found not to be valid, this routine
1980 		 * will need to be rewritten. At this point it would
1981 		 * be premature to introduce more complexity than is
1982 		 * needed.
1983 		 * Assumption: For variable length options, we assume
1984 		 * that the value returned will be same or less length
1985 		 * (size does not increase). This makes it OK to pass the
1986 		 * same space for output as it is on input.
1987 		 */
1988 
1989 		error = (*setfn)(q, optset_context, opt->level, opt->name,
1990 		    opt->len - (t_uscalar_t)sizeof (struct T_opthdr),
1991 		    _TPI_TOPT_DATA(opt), &olen, _TPI_TOPT_DATA(opt),
1992 		    thisdg_attrs, cr, NULL);
1993 
1994 		if (olen > (int)(opt->len - sizeof (struct T_opthdr))) {
1995 			/*
1996 			 * Space on output more than space on input. Should
1997 			 * not happen and we consider it a bug/error.
1998 			 * More of a restriction than an error in our
1999 			 * implementation. Will see if we can live with this
2000 			 * otherwise code will get more hairy with multiple
2001 			 * passes.
2002 			 */
2003 			error = EINVAL;
2004 			goto error_ret;
2005 		}
2006 		if (error != 0) {
2007 			if ((optd->opdes_props & OP_NOT_ABSREQ) == 0) {
2008 				/* option is absolute requirement. */
2009 				*is_absreq_failurep = 1;
2010 				goto error_ret;
2011 			}
2012 			/*
2013 			 * failed - but option "not an absolute
2014 			 * requirement"
2015 			 */
2016 			opt->status = T_FAILURE;
2017 			continue;
2018 		}
2019 		/*
2020 		 * Fill in the only possible successful result
2021 		 * (Note: TPI allows for T_PARTSUCCESS - partial
2022 		 * sucess result code which is relevant in OSI world
2023 		 * and not possible in Internet code)
2024 		 */
2025 		opt->status = T_SUCCESS;
2026 
2027 		/*
2028 		 * Add T_SUCCESS result code options to the "output" options.
2029 		 * No T_FAILURES or T_NOTSUPPORT here as they are to be
2030 		 * ignored.
2031 		 * This code assumes output option buffer will
2032 		 * be <= input option buffer.
2033 		 *
2034 		 * Copy option header+value
2035 		 */
2036 		bcopy(opt, optr, opt->len);
2037 		optr +=  _TPI_ALIGN_TOPT(opt->len);
2038 	}
2039 	/*
2040 	 * Overwrite the input mblk option buffer now with the output
2041 	 * and update length, and contents in original mbl
2042 	 * (offset remains unchanged).
2043 	 */
2044 	*opt_lenp = (t_scalar_t)(optr - init_optr);
2045 	if (*opt_lenp > 0) {
2046 		bcopy(init_optr, opt_start, *opt_lenp);
2047 	}
2048 
2049 error_ret:
2050 	if (copy_mp_head != NULL)
2051 		freeb(copy_mp_head);
2052 	return (error);
2053 }
2054 
2055 static opdes_t *
2056 opt_chk_lookup(t_uscalar_t level, t_uscalar_t name, opdes_t *opt_arr,
2057     uint_t opt_arr_cnt)
2058 {
2059 	opdes_t		*optd;
2060 
2061 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt];
2062 	    optd++) {
2063 		if (level == (uint_t)optd->opdes_level &&
2064 		    name == (uint_t)optd->opdes_name)
2065 			return (optd);
2066 	}
2067 	return (NULL);
2068 }
2069 
2070 static boolean_t
2071 opt_level_valid(t_uscalar_t level, optlevel_t *valid_level_arr,
2072     uint_t valid_level_arr_cnt)
2073 {
2074 	optlevel_t		*olp;
2075 
2076 	for (olp = valid_level_arr;
2077 	    olp < &valid_level_arr[valid_level_arr_cnt];
2078 	    olp++) {
2079 		if (level == (uint_t)(*olp))
2080 			return (B_TRUE);
2081 	}
2082 	return (B_FALSE);
2083 }
2084 
2085 
2086 /*
2087  * Compute largest possible size for an option buffer containing
2088  * all options in one buffer.
2089  *
2090  * XXX TBD, investigate use of opt_bloated_maxsize() to avoid
2091  *     wastefully large buffer allocation.
2092  */
2093 static size_t
2094 opt_level_allopts_lengths(t_uscalar_t level, opdes_t *opt_arr,
2095     uint_t opt_arr_cnt)
2096 {
2097 	opdes_t		*optd;
2098 	size_t allopt_len = 0;	/* 0 implies no option at this level */
2099 
2100 	/*
2101 	 * Scan opt_arr computing aggregate length
2102 	 * requirement for storing values of all
2103 	 * options.
2104 	 * Note: we do not filter for permissions
2105 	 * etc. This will be >= the real aggregate
2106 	 * length required (upper bound).
2107 	 */
2108 
2109 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt];
2110 	    optd++) {
2111 		if (level == optd->opdes_level) {
2112 			allopt_len += sizeof (struct T_opthdr) +
2113 			    _TPI_ALIGN_TOPT(optd->opdes_size);
2114 		}
2115 	}
2116 	return (allopt_len);	/* 0 implies level not found */
2117 }
2118 
2119 /*
2120  * Compute largest possible size for an option buffer containing
2121  * all options in one buffer - a (theoretical?) worst case scenario
2122  * for certain cases.
2123  */
2124 t_uscalar_t
2125 optcom_max_optbuf_len(opdes_t *opt_arr, uint_t opt_arr_cnt)
2126 {
2127 	t_uscalar_t max_optbuf_len = sizeof (struct T_info_ack);
2128 	opdes_t		*optd;
2129 
2130 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
2131 		max_optbuf_len += (t_uscalar_t)sizeof (struct T_opthdr) +
2132 		    (t_uscalar_t)_TPI_ALIGN_TOPT(optd->opdes_size);
2133 	}
2134 	return (max_optbuf_len);
2135 }
2136 
2137 /*
2138  * Compute largest possible size for OPT_size for a transport.
2139  * Heuristic used is to add all but certain extremely large
2140  * size options; this is done by calling opt_bloated_maxsize().
2141  * It affects user level allocations in TLI/XTI code using t_alloc()
2142  * and other TLI/XTI implementation instance strucutures.
2143  * The large size options excluded are presumed to be
2144  * never accessed through the (theoretical?) worst case code paths
2145  * through TLI/XTI as they are currently IPv6 specific options.
2146  */
2147 
2148 t_uscalar_t
2149 optcom_max_optsize(opdes_t *opt_arr, uint_t opt_arr_cnt)
2150 {
2151 	t_uscalar_t max_optbuf_len = sizeof (struct T_info_ack);
2152 	opdes_t		*optd;
2153 
2154 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
2155 		if (!opt_bloated_maxsize(optd)) {
2156 			max_optbuf_len +=
2157 			    (t_uscalar_t)sizeof (struct T_opthdr) +
2158 			    (t_uscalar_t)_TPI_ALIGN_TOPT(optd->opdes_size);
2159 		}
2160 	}
2161 	return (max_optbuf_len);
2162 }
2163 
2164 /*
2165  * The theoretical model used in optcom_max_optsize() and
2166  * opt_level_allopts_lengths() accounts for the worst case of all
2167  * possible options for the theoretical cases and results in wasteful
2168  * memory allocations for certain theoretically correct usage scenarios.
2169  * In practice, the "features" they support are rarely, if ever,
2170  * used and even then only by test suites for those features (VSU, VST).
2171  * However, they result in large allocations due to the increased transport
2172  * T_INFO_ACK OPT_size field affecting t_alloc() users and TLI/XTI library
2173  * instance data structures for applications.
2174  *
2175  * The following routine opt_bloated_maxsize() supports a hack that avoids
2176  * paying the tax for the bloated options by excluding them and pretending
2177  * they don't exist for certain features without affecting features that
2178  * do use them.
2179  *
2180  * XXX Currently implemented only for optcom_max_optsize()
2181  *     (to reduce risk late in release).
2182  *     TBD for future, investigate use in optcom_level_allopts_lengths() and
2183  *     all the instances of T_ALLOPT processing to exclude "bloated options".
2184  *     Will not affect VSU/VST tests as they do not test with IPPROTO_IPV6
2185  *     level options which are the only ones that fit the "bloated maxsize"
2186  *     option profile now.
2187  */
2188 static boolean_t
2189 opt_bloated_maxsize(opdes_t *optd)
2190 {
2191 	if (optd->opdes_level != IPPROTO_IPV6)
2192 		return (B_FALSE);
2193 	switch (optd->opdes_name) {
2194 	case IPV6_HOPOPTS:
2195 	case IPV6_DSTOPTS:
2196 	case IPV6_RTHDRDSTOPTS:
2197 	case IPV6_RTHDR:
2198 	case IPV6_PATHMTU:
2199 		return (B_TRUE);
2200 	default:
2201 		break;
2202 	}
2203 	return (B_FALSE);
2204 }
2205 
2206 static boolean_t
2207 opt_length_ok(opdes_t *optd, struct T_opthdr *opt)
2208 {
2209 	/*
2210 	 * Verify length.
2211 	 * Value specified should match length of fixed length option or be
2212 	 * less than maxlen of variable length option.
2213 	 */
2214 	if (optd->opdes_props & OP_VARLEN) {
2215 		if (opt->len <= optd->opdes_size +
2216 		    (t_uscalar_t)sizeof (struct T_opthdr))
2217 			return (B_TRUE);
2218 	} else {
2219 		/* fixed length option */
2220 		if (opt->len == optd->opdes_size +
2221 		    (t_uscalar_t)sizeof (struct T_opthdr))
2222 			return (B_TRUE);
2223 	}
2224 	return (B_FALSE);
2225 }
2226 
2227 /*
2228  * This routine appends a pssed in hop-by-hop option to the existing
2229  * option (in this case a cipso label encoded in HOPOPT option). The
2230  * passed in option is always padded. The 'reservelen' is the
2231  * length of reserved data (label). New memory will be allocated if
2232  * the current buffer is not large enough. Return failure if memory
2233  * can not be allocated.
2234  */
2235 int
2236 optcom_pkt_set(uchar_t *invalp, uint_t inlen, boolean_t sticky,
2237     uchar_t **optbufp, uint_t *optlenp, uint_t reservelen)
2238 {
2239 	uchar_t *optbuf;
2240 	uchar_t	*optp;
2241 
2242 	if (!sticky) {
2243 		*optbufp = invalp;
2244 		*optlenp = inlen;
2245 		return (0);
2246 	}
2247 
2248 	if (inlen == *optlenp - reservelen) {
2249 		/* Unchanged length - no need to reallocate */
2250 		optp = *optbufp + reservelen;
2251 		bcopy(invalp, optp, inlen);
2252 		if (reservelen != 0) {
2253 			/*
2254 			 * Convert the NextHeader and Length of the
2255 			 * passed in hop-by-hop header to pads
2256 			 */
2257 			optp[0] = IP6OPT_PADN;
2258 			optp[1] = 0;
2259 		}
2260 		return (0);
2261 	}
2262 	if (inlen + reservelen > 0) {
2263 		/* Allocate new buffer before free */
2264 		optbuf = kmem_alloc(inlen + reservelen, KM_NOSLEEP);
2265 		if (optbuf == NULL)
2266 			return (ENOMEM);
2267 	} else {
2268 		optbuf = NULL;
2269 	}
2270 
2271 	/* Copy out old reserved data (label) */
2272 	if (reservelen > 0)
2273 		bcopy(*optbufp, optbuf, reservelen);
2274 
2275 	/* Free old buffer */
2276 	if (*optlenp != 0)
2277 		kmem_free(*optbufp, *optlenp);
2278 
2279 	if (inlen > 0)
2280 		bcopy(invalp, optbuf + reservelen, inlen);
2281 
2282 	if (reservelen != 0) {
2283 		/*
2284 		 * Convert the NextHeader and Length of the
2285 		 * passed in hop-by-hop header to pads
2286 		 */
2287 		optbuf[reservelen] = IP6OPT_PADN;
2288 		optbuf[reservelen + 1] = 0;
2289 		/*
2290 		 * Set the Length of the hop-by-hop header, number of 8
2291 		 * byte-words following the 1st 8 bytes
2292 		 */
2293 		optbuf[1] = (reservelen + inlen - 1) >> 3;
2294 	}
2295 	*optbufp = optbuf;
2296 	*optlenp = inlen + reservelen;
2297 	return (0);
2298 }
2299