xref: /illumos-gate/usr/src/uts/common/os/contract.c (revision 533affcbc7fc4d0c8132976ea454aaa715fe2307)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2017 by Delphix. All rights reserved.
27  */
28 
29 /*
30  * Contracts
31  * ---------
32  *
33  * Contracts are a primitive which enrich the relationships between
34  * processes and system resources.  The primary purpose of contracts is
35  * to provide a means for the system to negotiate the departure from a
36  * binding relationship (e.g. pages locked in memory or a thread bound
37  * to processor), but they can also be used as a purely asynchronous
38  * error reporting mechanism as they are with process contracts.
39  *
40  * More information on how one interfaces with contracts and what
41  * contracts can do for you can be found in:
42  *   PSARC 2003/193 Solaris Contracts
43  *   PSARC 2004/460 Contracts addendum
44  *
45  * This file contains the core contracts framework.  By itself it is
46  * useless: it depends on the contracts filesystem (ctfs) to provide an
47  * interface to user processes and individual contract types to
48  * implement the process/resource relationships.
49  *
50  * Data structure overview
51  * -----------------------
52  *
53  * A contract is represented by a contract_t, which itself points to an
54  * encapsulating contract-type specific contract object.  A contract_t
55  * contains the contract's static identity (including its terms), its
56  * linkage to various bookkeeping structures, the contract-specific
57  * event queue, and a reference count.
58  *
59  * A contract template is represented by a ct_template_t, which, like a
60  * contract, points to an encapsulating contract-type specific template
61  * object.  A ct_template_t contains the template's terms.
62  *
63  * An event queue is represented by a ct_equeue_t, and consists of a
64  * list of events, a list of listeners, and a list of listeners who are
65  * waiting for new events (affectionately referred to as "tail
66  * listeners").  There are three queue types, defined by ct_listnum_t
67  * (an enum).  An event may be on one of each type of queue
68  * simultaneously; the list linkage used by a queue is determined by
69  * its type.
70  *
71  * An event is represented by a ct_kevent_t, which contains mostly
72  * static event data (e.g. id, payload).  It also has an array of
73  * ct_member_t structures, each of which contains a list_node_t and
74  * represents the event's linkage in a specific event queue.
75  *
76  * Each open of an event endpoint results in the creation of a new
77  * listener, represented by a ct_listener_t.  In addition to linkage
78  * into the aforementioned lists in the event_queue, a ct_listener_t
79  * contains a pointer to the ct_kevent_t it is currently positioned at
80  * as well as a set of status flags and other administrative data.
81  *
82  * Each process has a list of contracts it owns, p_ct_held; a pointer
83  * to the process contract it is a member of, p_ct_process; the linkage
84  * for that membership, p_ct_member; and an array of event queue
85  * structures representing the process bundle queues.
86  *
87  * Each LWP has an array of its active templates, lwp_ct_active; and
88  * the most recently created contracts, lwp_ct_latest.
89  *
90  * A process contract has a list of member processes and a list of
91  * inherited contracts.
92  *
93  * There is a system-wide list of all contracts, as well as per-type
94  * lists of contracts.
95  *
96  * Lock ordering overview
97  * ----------------------
98  *
99  * Locks at the top are taken first:
100  *
101  *                   ct_evtlock
102  *                   regent ct_lock
103  *                   member ct_lock
104  *                   pidlock
105  *                   p_lock
106  *    contract ctq_lock         contract_lock
107  *    pbundle ctq_lock
108  *    cte_lock
109  *                   ct_reflock
110  *
111  * contract_lock and ctq_lock/cte_lock are not currently taken at the
112  * same time.
113  *
114  * Reference counting and locking
115  * ------------------------------
116  *
117  * A contract has a reference count, protected by ct_reflock.
118  * (ct_reflock is also used in a couple other places where atomic
119  * access to a variable is needed in an innermost context).  A process
120  * maintains a hold on each contract it owns.  A process contract has a
121  * hold on each contract it has inherited.  Each event has a hold on
122  * the contract which generated it.  Process contract templates have
123  * holds on the contracts referred to by their transfer terms.  CTFS
124  * contract directory nodes have holds on contracts.  Lastly, various
125  * code paths may temporarily take holds on contracts to prevent them
126  * from disappearing while other processing is going on.  It is
127  * important to note that the global contract lists do not hold
128  * references on contracts; a contract is removed from these structures
129  * atomically with the release of its last reference.
130  *
131  * At a given point in time, a contract can either be owned by a
132  * process, inherited by a regent process contract, or orphaned.  A
133  * contract_t's  owner and regent pointers, ct_owner and ct_regent, are
134  * protected by its ct_lock.  The linkage in the holder's (holder =
135  * owner or regent) list of contracts, ct_ctlist, is protected by
136  * whatever lock protects the holder's data structure.  In order for
137  * these two directions to remain consistent, changing the holder of a
138  * contract requires that both locks be held.
139  *
140  * Events also have reference counts.  There is one hold on an event
141  * per queue it is present on, in addition to those needed for the
142  * usual sundry reasons.  Individual listeners are associated with
143  * specific queues, and increase a queue-specific reference count
144  * stored in the ct_member_t structure.
145  *
146  * The dynamic contents of an event (reference count and flags) are
147  * protected by its cte_lock, while the contents of the embedded
148  * ct_member_t structures are protected by the locks of the queues they
149  * are linked into.  A ct_listener_t's contents are also protected by
150  * its event queue's ctq_lock.
151  *
152  * Resource controls
153  * -----------------
154  *
155  * Control:      project.max-contracts (rc_project_contract)
156  * Description:  Maximum number of contracts allowed a project.
157  *
158  *   When a contract is created, the project's allocation is tested and
159  *   (assuming success) increased.  When the last reference to a
160  *   contract is released, the creating project's allocation is
161  *   decreased.
162  */
163 
164 #include <sys/mutex.h>
165 #include <sys/debug.h>
166 #include <sys/types.h>
167 #include <sys/param.h>
168 #include <sys/kmem.h>
169 #include <sys/thread.h>
170 #include <sys/id_space.h>
171 #include <sys/avl.h>
172 #include <sys/list.h>
173 #include <sys/sysmacros.h>
174 #include <sys/proc.h>
175 #include <sys/ctfs.h>
176 #include <sys/contract_impl.h>
177 #include <sys/contract/process_impl.h>
178 #include <sys/dditypes.h>
179 #include <sys/contract/device_impl.h>
180 #include <sys/systm.h>
181 #include <sys/atomic.h>
182 #include <sys/cmn_err.h>
183 #include <sys/model.h>
184 #include <sys/policy.h>
185 #include <sys/zone.h>
186 #include <sys/task.h>
187 #include <sys/ddi.h>
188 #include <sys/sunddi.h>
189 
190 extern rctl_hndl_t rc_project_contract;
191 
192 static id_space_t	*contract_ids;
193 static avl_tree_t	contract_avl;
194 static kmutex_t		contract_lock;
195 
196 int			ct_ntypes = CTT_MAXTYPE;
197 static ct_type_t	*ct_types_static[CTT_MAXTYPE];
198 ct_type_t		**ct_types = ct_types_static;
199 int			ct_debug;
200 
201 static void cte_queue_create(ct_equeue_t *, ct_listnum_t, int, int);
202 static void cte_queue_destroy(ct_equeue_t *);
203 static void cte_queue_drain(ct_equeue_t *, int);
204 static void cte_trim(ct_equeue_t *, contract_t *);
205 static void cte_copy(ct_equeue_t *, ct_equeue_t *);
206 
207 /*
208  * contract_compar
209  *
210  * A contract comparator which sorts on contract ID.
211  */
212 int
213 contract_compar(const void *x, const void *y)
214 {
215 	const contract_t *ct1 = x;
216 	const contract_t *ct2 = y;
217 
218 	if (ct1->ct_id < ct2->ct_id)
219 		return (-1);
220 	if (ct1->ct_id > ct2->ct_id)
221 		return (1);
222 	return (0);
223 }
224 
225 /*
226  * contract_init
227  *
228  * Initializes the contract subsystem, the specific contract types, and
229  * process 0.
230  */
231 void
232 contract_init(void)
233 {
234 	/*
235 	 * Initialize contract subsystem.
236 	 */
237 	contract_ids = id_space_create("contracts", 1, INT_MAX);
238 	avl_create(&contract_avl, contract_compar, sizeof (contract_t),
239 	    offsetof(contract_t, ct_ctavl));
240 	mutex_init(&contract_lock, NULL, MUTEX_DEFAULT, NULL);
241 
242 	/*
243 	 * Initialize contract types.
244 	 */
245 	contract_process_init();
246 	contract_device_init();
247 
248 	/*
249 	 * Initialize p0/lwp0 contract state.
250 	 */
251 	avl_create(&p0.p_ct_held, contract_compar, sizeof (contract_t),
252 	    offsetof(contract_t, ct_ctlist));
253 }
254 
255 /*
256  * contract_dtor
257  *
258  * Performs basic destruction of the common portions of a contract.
259  * Called from the failure path of contract_ctor and from
260  * contract_rele.
261  */
262 static void
263 contract_dtor(contract_t *ct)
264 {
265 	cte_queue_destroy(&ct->ct_events);
266 	list_destroy(&ct->ct_vnodes);
267 	mutex_destroy(&ct->ct_reflock);
268 	mutex_destroy(&ct->ct_lock);
269 	mutex_destroy(&ct->ct_evtlock);
270 }
271 
272 /*
273  * contract_ctor
274  *
275  * Called by a contract type to initialize a contract.  Fails if the
276  * max-contract resource control would have been exceeded.  After a
277  * successful call to contract_ctor, the contract is unlocked and
278  * visible in all namespaces; any type-specific initialization should
279  * be completed before calling contract_ctor.  Returns 0 on success.
280  *
281  * Because not all callers can tolerate failure, a 0 value for canfail
282  * instructs contract_ctor to ignore the project.max-contracts resource
283  * control.  Obviously, this "out" should only be employed by callers
284  * who are sufficiently constrained in other ways (e.g. newproc).
285  */
286 int
287 contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data,
288     ctflags_t flags, proc_t *author, int canfail)
289 {
290 	avl_index_t where;
291 	klwp_t *curlwp = ttolwp(curthread);
292 
293 	ASSERT(author == curproc);
294 
295 	mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL);
296 	mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL);
297 	mutex_init(&ct->ct_evtlock, NULL, MUTEX_DEFAULT, NULL);
298 	ct->ct_id = id_alloc(contract_ids);
299 
300 	cte_queue_create(&ct->ct_events, CTEL_CONTRACT, 20, 0);
301 	list_create(&ct->ct_vnodes, sizeof (contract_vnode_t),
302 	    offsetof(contract_vnode_t, ctv_node));
303 
304 	/*
305 	 * Instance data
306 	 */
307 	ct->ct_ref = 2;		/* one for the holder, one for "latest" */
308 	ct->ct_cuid = crgetuid(CRED());
309 	ct->ct_type = type;
310 	ct->ct_data = data;
311 	gethrestime(&ct->ct_ctime);
312 	ct->ct_state = CTS_OWNED;
313 	ct->ct_flags = flags;
314 	ct->ct_regent = author->p_ct_process ?
315 	    &author->p_ct_process->conp_contract : NULL;
316 	ct->ct_ev_info = tmpl->ctmpl_ev_info;
317 	ct->ct_ev_crit = tmpl->ctmpl_ev_crit;
318 	ct->ct_cookie = tmpl->ctmpl_cookie;
319 	ct->ct_owner = author;
320 	ct->ct_ntime.ctm_total = -1;
321 	ct->ct_qtime.ctm_total = -1;
322 	ct->ct_nevent = NULL;
323 
324 	/*
325 	 * Test project.max-contracts.
326 	 */
327 	mutex_enter(&author->p_lock);
328 	mutex_enter(&contract_lock);
329 	if (canfail && rctl_test(rc_project_contract,
330 	    author->p_task->tk_proj->kpj_rctls, author, 1,
331 	    RCA_SAFE) & RCT_DENY) {
332 		id_free(contract_ids, ct->ct_id);
333 		mutex_exit(&contract_lock);
334 		mutex_exit(&author->p_lock);
335 		ct->ct_events.ctq_flags |= CTQ_DEAD;
336 		contract_dtor(ct);
337 		return (1);
338 	}
339 	ct->ct_proj = author->p_task->tk_proj;
340 	ct->ct_proj->kpj_data.kpd_contract++;
341 	(void) project_hold(ct->ct_proj);
342 	mutex_exit(&contract_lock);
343 
344 	/*
345 	 * Insert into holder's avl of contracts.
346 	 * We use an avl not because order is important, but because
347 	 * readdir of /proc/contracts requires we be able to use a
348 	 * scalar as an index into the process's list of contracts
349 	 */
350 	ct->ct_zoneid = author->p_zone->zone_id;
351 	ct->ct_czuniqid = ct->ct_mzuniqid = author->p_zone->zone_uniqid;
352 	VERIFY(avl_find(&author->p_ct_held, ct, &where) == NULL);
353 	avl_insert(&author->p_ct_held, ct, where);
354 	mutex_exit(&author->p_lock);
355 
356 	/*
357 	 * Insert into global contract AVL
358 	 */
359 	mutex_enter(&contract_lock);
360 	VERIFY(avl_find(&contract_avl, ct, &where) == NULL);
361 	avl_insert(&contract_avl, ct, where);
362 	mutex_exit(&contract_lock);
363 
364 	/*
365 	 * Insert into type AVL
366 	 */
367 	mutex_enter(&type->ct_type_lock);
368 	VERIFY(avl_find(&type->ct_type_avl, ct, &where) == NULL);
369 	avl_insert(&type->ct_type_avl, ct, where);
370 	type->ct_type_timestruc = ct->ct_ctime;
371 	mutex_exit(&type->ct_type_lock);
372 
373 	if (curlwp->lwp_ct_latest[type->ct_type_index])
374 		contract_rele(curlwp->lwp_ct_latest[type->ct_type_index]);
375 	curlwp->lwp_ct_latest[type->ct_type_index] = ct;
376 
377 	return (0);
378 }
379 
380 /*
381  * contract_rele
382  *
383  * Releases a reference to a contract.  If the caller had the last
384  * reference, the contract is removed from all namespaces, its
385  * allocation against the max-contracts resource control is released,
386  * and the contract type's free entry point is invoked for any
387  * type-specific deconstruction and to (presumably) free the object.
388  */
389 void
390 contract_rele(contract_t *ct)
391 {
392 	uint64_t nref;
393 
394 	mutex_enter(&ct->ct_reflock);
395 	ASSERT(ct->ct_ref > 0);
396 	nref = --ct->ct_ref;
397 	mutex_exit(&ct->ct_reflock);
398 	if (nref == 0) {
399 		/*
400 		 * ct_owner is cleared when it drops its reference.
401 		 */
402 		ASSERT(ct->ct_owner == NULL);
403 		ASSERT(ct->ct_evcnt == 0);
404 
405 		/*
406 		 * Remove from global contract AVL
407 		 */
408 		mutex_enter(&contract_lock);
409 		avl_remove(&contract_avl, ct);
410 		mutex_exit(&contract_lock);
411 
412 		/*
413 		 * Remove from type AVL
414 		 */
415 		mutex_enter(&ct->ct_type->ct_type_lock);
416 		avl_remove(&ct->ct_type->ct_type_avl, ct);
417 		mutex_exit(&ct->ct_type->ct_type_lock);
418 
419 		/*
420 		 * Release the contract's ID
421 		 */
422 		id_free(contract_ids, ct->ct_id);
423 
424 		/*
425 		 * Release project hold
426 		 */
427 		mutex_enter(&contract_lock);
428 		ct->ct_proj->kpj_data.kpd_contract--;
429 		project_rele(ct->ct_proj);
430 		mutex_exit(&contract_lock);
431 
432 		/*
433 		 * Free the contract
434 		 */
435 		contract_dtor(ct);
436 		ct->ct_type->ct_type_ops->contop_free(ct);
437 	}
438 }
439 
440 /*
441  * contract_hold
442  *
443  * Adds a reference to a contract
444  */
445 void
446 contract_hold(contract_t *ct)
447 {
448 	mutex_enter(&ct->ct_reflock);
449 	ASSERT(ct->ct_ref < UINT64_MAX);
450 	ct->ct_ref++;
451 	mutex_exit(&ct->ct_reflock);
452 }
453 
454 /*
455  * contract_getzuniqid
456  *
457  * Get a contract's zone unique ID.  Needed because 64-bit reads and
458  * writes aren't atomic on x86.  Since there are contexts where we are
459  * unable to take ct_lock, we instead use ct_reflock; in actuality any
460  * lock would do.
461  */
462 uint64_t
463 contract_getzuniqid(contract_t *ct)
464 {
465 	uint64_t zuniqid;
466 
467 	mutex_enter(&ct->ct_reflock);
468 	zuniqid = ct->ct_mzuniqid;
469 	mutex_exit(&ct->ct_reflock);
470 
471 	return (zuniqid);
472 }
473 
474 /*
475  * contract_setzuniqid
476  *
477  * Sets a contract's zone unique ID.   See contract_getzuniqid.
478  */
479 void
480 contract_setzuniqid(contract_t *ct, uint64_t zuniqid)
481 {
482 	mutex_enter(&ct->ct_reflock);
483 	ct->ct_mzuniqid = zuniqid;
484 	mutex_exit(&ct->ct_reflock);
485 }
486 
487 /*
488  * contract_abandon
489  *
490  * Abandons the specified contract.  If "explicit" is clear, the
491  * contract was implicitly abandoned (by process exit) and should be
492  * inherited if its terms allow it and its owner was a member of a
493  * regent contract.  Otherwise, the contract type's abandon entry point
494  * is invoked to either destroy or orphan the contract.
495  */
496 int
497 contract_abandon(contract_t *ct, proc_t *p, int explicit)
498 {
499 	ct_equeue_t *q = NULL;
500 	contract_t *parent = &p->p_ct_process->conp_contract;
501 	int inherit = 0;
502 
503 	VERIFY(p == curproc);
504 
505 	mutex_enter(&ct->ct_lock);
506 
507 	/*
508 	 * Multiple contract locks are taken contract -> subcontract.
509 	 * Check if the contract will be inherited so we can acquire
510 	 * all the necessary locks before making sensitive changes.
511 	 */
512 	if (!explicit && (ct->ct_flags & CTF_INHERIT) &&
513 	    contract_process_accept(parent)) {
514 		mutex_exit(&ct->ct_lock);
515 		mutex_enter(&parent->ct_lock);
516 		mutex_enter(&ct->ct_lock);
517 		inherit = 1;
518 	}
519 
520 	if (ct->ct_owner != p) {
521 		mutex_exit(&ct->ct_lock);
522 		if (inherit)
523 			mutex_exit(&parent->ct_lock);
524 		return (EINVAL);
525 	}
526 
527 	mutex_enter(&p->p_lock);
528 	if (explicit)
529 		avl_remove(&p->p_ct_held, ct);
530 	ct->ct_owner = NULL;
531 	mutex_exit(&p->p_lock);
532 
533 	/*
534 	 * Since we can't call cte_trim with the contract lock held,
535 	 * we grab the queue pointer here.
536 	 */
537 	if (p->p_ct_equeue)
538 		q = p->p_ct_equeue[ct->ct_type->ct_type_index];
539 
540 	/*
541 	 * contop_abandon may destroy the contract so we rely on it to
542 	 * drop ct_lock.  We retain a reference on the contract so that
543 	 * the cte_trim which follows functions properly.  Even though
544 	 * cte_trim doesn't dereference the contract pointer, it is
545 	 * still necessary to retain a reference to the contract so
546 	 * that we don't trim events which are sent by a subsequently
547 	 * allocated contract infortuitously located at the same address.
548 	 */
549 	contract_hold(ct);
550 
551 	if (inherit) {
552 		ct->ct_state = CTS_INHERITED;
553 		VERIFY(ct->ct_regent == parent);
554 		contract_process_take(parent, ct);
555 
556 		/*
557 		 * We are handing off the process's reference to the
558 		 * parent contract.  For this reason, the order in
559 		 * which we drop the contract locks is also important.
560 		 */
561 		mutex_exit(&ct->ct_lock);
562 		mutex_exit(&parent->ct_lock);
563 	} else {
564 		ct->ct_regent = NULL;
565 		ct->ct_type->ct_type_ops->contop_abandon(ct);
566 	}
567 
568 	/*
569 	 * ct_lock has been dropped; we can safely trim the event
570 	 * queue now.
571 	 */
572 	if (q) {
573 		mutex_enter(&q->ctq_lock);
574 		cte_trim(q, ct);
575 		mutex_exit(&q->ctq_lock);
576 	}
577 
578 	contract_rele(ct);
579 
580 	return (0);
581 }
582 
583 int
584 contract_newct(contract_t *ct)
585 {
586 	return (ct->ct_type->ct_type_ops->contop_newct(ct));
587 }
588 
589 /*
590  * contract_adopt
591  *
592  * Adopts a contract.  After a successful call to this routine, the
593  * previously inherited contract will belong to the calling process,
594  * and its events will have been appended to its new owner's process
595  * bundle queue.
596  */
597 int
598 contract_adopt(contract_t *ct, proc_t *p)
599 {
600 	avl_index_t where;
601 	ct_equeue_t *q;
602 	contract_t *parent;
603 
604 	ASSERT(p == curproc);
605 
606 	/*
607 	 * Ensure the process has an event queue.  Checked by ASSERTs
608 	 * below.
609 	 */
610 	(void) contract_type_pbundle(ct->ct_type, p);
611 
612 	mutex_enter(&ct->ct_lock);
613 	parent = ct->ct_regent;
614 	if (ct->ct_state != CTS_INHERITED ||
615 	    &p->p_ct_process->conp_contract != parent ||
616 	    p->p_zone->zone_uniqid != ct->ct_czuniqid) {
617 		mutex_exit(&ct->ct_lock);
618 		return (EINVAL);
619 	}
620 
621 	/*
622 	 * Multiple contract locks are taken contract -> subcontract.
623 	 */
624 	mutex_exit(&ct->ct_lock);
625 	mutex_enter(&parent->ct_lock);
626 	mutex_enter(&ct->ct_lock);
627 
628 	/*
629 	 * It is possible that the contract was adopted by someone else
630 	 * while its lock was dropped.  It isn't possible for the
631 	 * contract to have been inherited by a different regent
632 	 * contract.
633 	 */
634 	if (ct->ct_state != CTS_INHERITED) {
635 		mutex_exit(&parent->ct_lock);
636 		mutex_exit(&ct->ct_lock);
637 		return (EBUSY);
638 	}
639 	ASSERT(ct->ct_regent == parent);
640 
641 	ct->ct_state = CTS_OWNED;
642 
643 	contract_process_adopt(ct, p);
644 
645 	mutex_enter(&p->p_lock);
646 	ct->ct_owner = p;
647 	VERIFY(avl_find(&p->p_ct_held, ct, &where) == NULL);
648 	avl_insert(&p->p_ct_held, ct, where);
649 	mutex_exit(&p->p_lock);
650 
651 	ASSERT(ct->ct_owner->p_ct_equeue);
652 	ASSERT(ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]);
653 	q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index];
654 	cte_copy(&ct->ct_events, q);
655 	mutex_exit(&ct->ct_lock);
656 
657 	return (0);
658 }
659 
660 /*
661  * contract_ack
662  *
663  * Acknowledges receipt of a critical event.
664  */
665 int
666 contract_ack(contract_t *ct, uint64_t evid, int ack)
667 {
668 	ct_kevent_t *ev;
669 	list_t *queue = &ct->ct_events.ctq_events;
670 	int error = ESRCH;
671 	int nego = 0;
672 	uint_t evtype;
673 
674 	ASSERT(ack == CT_ACK || ack == CT_NACK);
675 
676 	mutex_enter(&ct->ct_lock);
677 	mutex_enter(&ct->ct_events.ctq_lock);
678 	/*
679 	 * We are probably ACKing something near the head of the queue.
680 	 */
681 	for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
682 		if (ev->cte_id == evid) {
683 			if (ev->cte_flags & CTE_NEG)
684 				nego = 1;
685 			else if (ack == CT_NACK)
686 				break;
687 			if ((ev->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
688 				ev->cte_flags |= CTE_ACK;
689 				ct->ct_evcnt--;
690 				evtype = ev->cte_type;
691 				error = 0;
692 			}
693 			break;
694 		}
695 	}
696 	mutex_exit(&ct->ct_events.ctq_lock);
697 	mutex_exit(&ct->ct_lock);
698 
699 	/*
700 	 * Not all critical events are negotiation events, however
701 	 * every negotiation event is a critical event. NEGEND events
702 	 * are critical events but are not negotiation events
703 	 */
704 	if (error || !nego)
705 		return (error);
706 
707 	if (ack == CT_ACK)
708 		error = ct->ct_type->ct_type_ops->contop_ack(ct, evtype, evid);
709 	else
710 		error = ct->ct_type->ct_type_ops->contop_nack(ct, evtype, evid);
711 
712 	return (error);
713 }
714 
715 /*ARGSUSED*/
716 int
717 contract_ack_inval(contract_t *ct, uint_t evtype, uint64_t evid)
718 {
719 	cmn_err(CE_PANIC, "contract_ack_inval: unsupported call: ctid: %u",
720 	    ct->ct_id);
721 	return (ENOSYS);
722 }
723 
724 /*ARGSUSED*/
725 int
726 contract_qack_inval(contract_t *ct, uint_t evtype, uint64_t evid)
727 {
728 	cmn_err(CE_PANIC, "contract_ack_inval: unsupported call: ctid: %u",
729 	    ct->ct_id);
730 	return (ENOSYS);
731 }
732 
733 /*ARGSUSED*/
734 int
735 contract_qack_notsup(contract_t *ct, uint_t evtype, uint64_t evid)
736 {
737 	return (ERANGE);
738 }
739 
740 /*
741  * contract_qack
742  *
743  * Asks that negotiations be extended by another time quantum
744  */
745 int
746 contract_qack(contract_t *ct, uint64_t evid)
747 {
748 	ct_kevent_t *ev;
749 	list_t *queue = &ct->ct_events.ctq_events;
750 	int nego = 0;
751 	uint_t evtype;
752 
753 	mutex_enter(&ct->ct_lock);
754 	mutex_enter(&ct->ct_events.ctq_lock);
755 
756 	for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
757 		if (ev->cte_id == evid) {
758 			if ((ev->cte_flags & (CTE_NEG | CTE_ACK)) == CTE_NEG) {
759 				evtype = ev->cte_type;
760 				nego = 1;
761 			}
762 			break;
763 		}
764 	}
765 	mutex_exit(&ct->ct_events.ctq_lock);
766 	mutex_exit(&ct->ct_lock);
767 
768 	/*
769 	 * Only a negotiated event (which is by definition also a critical
770 	 * event) which has not yet been acknowledged can provide
771 	 * time quanta to a negotiating owner process.
772 	 */
773 	if (!nego)
774 		return (ESRCH);
775 
776 	return (ct->ct_type->ct_type_ops->contop_qack(ct, evtype, evid));
777 }
778 
779 /*
780  * contract_orphan
781  *
782  * Icky-poo.  This is a process-contract special, used to ACK all
783  * critical messages when a contract is orphaned.
784  */
785 void
786 contract_orphan(contract_t *ct)
787 {
788 	ct_kevent_t *ev;
789 	list_t *queue = &ct->ct_events.ctq_events;
790 
791 	ASSERT(MUTEX_HELD(&ct->ct_lock));
792 	ASSERT(ct->ct_state != CTS_ORPHAN);
793 
794 	mutex_enter(&ct->ct_events.ctq_lock);
795 	ct->ct_state = CTS_ORPHAN;
796 	for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
797 		if ((ev->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
798 			ev->cte_flags |= CTE_ACK;
799 			ct->ct_evcnt--;
800 		}
801 	}
802 	mutex_exit(&ct->ct_events.ctq_lock);
803 
804 	ASSERT(ct->ct_evcnt == 0);
805 }
806 
807 /*
808  * contract_destroy
809  *
810  * Explicit contract destruction.  Called when contract is empty.
811  * The contract will actually stick around until all of its events are
812  * removed from the bundle and and process bundle queues, and all fds
813  * which refer to it are closed.  See contract_dtor if you are looking
814  * for what destroys the contract structure.
815  */
816 void
817 contract_destroy(contract_t *ct)
818 {
819 	ASSERT(MUTEX_HELD(&ct->ct_lock));
820 	ASSERT(ct->ct_state != CTS_DEAD);
821 	ASSERT(ct->ct_owner == NULL);
822 
823 	ct->ct_state = CTS_DEAD;
824 	cte_queue_drain(&ct->ct_events, 1);
825 	mutex_exit(&ct->ct_lock);
826 	mutex_enter(&ct->ct_type->ct_type_events.ctq_lock);
827 	cte_trim(&ct->ct_type->ct_type_events, ct);
828 	mutex_exit(&ct->ct_type->ct_type_events.ctq_lock);
829 	mutex_enter(&ct->ct_lock);
830 	ct->ct_type->ct_type_ops->contop_destroy(ct);
831 	mutex_exit(&ct->ct_lock);
832 	contract_rele(ct);
833 }
834 
835 /*
836  * contract_vnode_get
837  *
838  * Obtains the contract directory vnode for this contract, if there is
839  * one.  The caller must VN_RELE the vnode when they are through using
840  * it.
841  */
842 vnode_t *
843 contract_vnode_get(contract_t *ct, vfs_t *vfsp)
844 {
845 	contract_vnode_t *ctv;
846 	vnode_t *vp = NULL;
847 
848 	mutex_enter(&ct->ct_lock);
849 	for (ctv = list_head(&ct->ct_vnodes); ctv != NULL;
850 	    ctv = list_next(&ct->ct_vnodes, ctv))
851 		if (ctv->ctv_vnode->v_vfsp == vfsp) {
852 			vp = ctv->ctv_vnode;
853 			VN_HOLD(vp);
854 			break;
855 		}
856 	mutex_exit(&ct->ct_lock);
857 	return (vp);
858 }
859 
860 /*
861  * contract_vnode_set
862  *
863  * Sets the contract directory vnode for this contract.  We don't hold
864  * a reference on the vnode because we don't want to prevent it from
865  * being freed.  The vnode's inactive entry point will take care of
866  * notifying us when it should be removed.
867  */
868 void
869 contract_vnode_set(contract_t *ct, contract_vnode_t *ctv, vnode_t *vnode)
870 {
871 	mutex_enter(&ct->ct_lock);
872 	ctv->ctv_vnode = vnode;
873 	list_insert_head(&ct->ct_vnodes, ctv);
874 	mutex_exit(&ct->ct_lock);
875 }
876 
877 /*
878  * contract_vnode_clear
879  *
880  * Removes this vnode as the contract directory vnode for this
881  * contract.  Called from a contract directory's inactive entry point,
882  * this may return 0 indicating that the vnode gained another reference
883  * because of a simultaneous call to contract_vnode_get.
884  */
885 int
886 contract_vnode_clear(contract_t *ct, contract_vnode_t *ctv)
887 {
888 	vnode_t *vp = ctv->ctv_vnode;
889 	int result;
890 
891 	mutex_enter(&ct->ct_lock);
892 	mutex_enter(&vp->v_lock);
893 	if (vp->v_count == 1) {
894 		list_remove(&ct->ct_vnodes, ctv);
895 		result = 1;
896 	} else {
897 		VN_RELE_LOCKED(vp);
898 		result = 0;
899 	}
900 	mutex_exit(&vp->v_lock);
901 	mutex_exit(&ct->ct_lock);
902 
903 	return (result);
904 }
905 
906 /*
907  * contract_exit
908  *
909  * Abandons all contracts held by process p, and drains process p's
910  * bundle queues.  Called on process exit.
911  */
912 void
913 contract_exit(proc_t *p)
914 {
915 	contract_t *ct;
916 	void *cookie = NULL;
917 	int i;
918 
919 	ASSERT(p == curproc);
920 
921 	/*
922 	 * Abandon held contracts.  contract_abandon knows enough not
923 	 * to remove the contract from the list a second time.  We are
924 	 * exiting, so no locks are needed here.  But because
925 	 * contract_abandon will take p_lock, we need to make sure we
926 	 * aren't holding it.
927 	 */
928 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
929 	while ((ct = avl_destroy_nodes(&p->p_ct_held, &cookie)) != NULL)
930 		VERIFY(contract_abandon(ct, p, 0) == 0);
931 
932 	/*
933 	 * Drain pbundles.  Because a process bundle queue could have
934 	 * been passed to another process, they may not be freed right
935 	 * away.
936 	 */
937 	if (p->p_ct_equeue) {
938 		for (i = 0; i < CTT_MAXTYPE; i++)
939 			if (p->p_ct_equeue[i])
940 				cte_queue_drain(p->p_ct_equeue[i], 0);
941 		kmem_free(p->p_ct_equeue, CTT_MAXTYPE * sizeof (ct_equeue_t *));
942 	}
943 }
944 
945 static int
946 get_time_left(struct ct_time *t)
947 {
948 	clock_t ticks_elapsed;
949 	int secs_elapsed;
950 
951 	if (t->ctm_total == -1)
952 		return (-1);
953 
954 	ticks_elapsed = ddi_get_lbolt() - t->ctm_start;
955 	secs_elapsed = t->ctm_total - (drv_hztousec(ticks_elapsed)/MICROSEC);
956 	return (secs_elapsed > 0 ? secs_elapsed : 0);
957 }
958 
959 /*
960  * contract_status_common
961  *
962  * Populates a ct_status structure.  Used by contract types in their
963  * status entry points and ctfs when only common information is
964  * requested.
965  */
966 void
967 contract_status_common(contract_t *ct, zone_t *zone, void *status,
968     model_t model)
969 {
970 	STRUCT_HANDLE(ct_status, lstatus);
971 
972 	STRUCT_SET_HANDLE(lstatus, model, status);
973 	ASSERT(MUTEX_HELD(&ct->ct_lock));
974 	if (zone->zone_uniqid == GLOBAL_ZONEUNIQID ||
975 	    zone->zone_uniqid == ct->ct_czuniqid) {
976 		zone_t *czone;
977 		zoneid_t zoneid = -1;
978 
979 		/*
980 		 * Contracts don't have holds on the zones they were
981 		 * created by.  If the contract's zone no longer
982 		 * exists, we say its zoneid is -1.
983 		 */
984 		if (zone->zone_uniqid == ct->ct_czuniqid ||
985 		    ct->ct_czuniqid == GLOBAL_ZONEUNIQID) {
986 			zoneid = ct->ct_zoneid;
987 		} else if ((czone = zone_find_by_id(ct->ct_zoneid)) != NULL) {
988 			if (czone->zone_uniqid == ct->ct_mzuniqid)
989 				zoneid = ct->ct_zoneid;
990 			zone_rele(czone);
991 		}
992 
993 		STRUCT_FSET(lstatus, ctst_zoneid, zoneid);
994 		STRUCT_FSET(lstatus, ctst_holder,
995 		    (ct->ct_state == CTS_OWNED) ? ct->ct_owner->p_pid :
996 		    (ct->ct_state == CTS_INHERITED) ? ct->ct_regent->ct_id : 0);
997 		STRUCT_FSET(lstatus, ctst_state, ct->ct_state);
998 	} else {
999 		/*
1000 		 * We are looking at a contract which was created by a
1001 		 * process outside of our zone.  We provide fake zone,
1002 		 * holder, and state information.
1003 		 */
1004 
1005 		STRUCT_FSET(lstatus, ctst_zoneid, zone->zone_id);
1006 		/*
1007 		 * Since "zone" can't disappear until the calling ctfs
1008 		 * is unmounted, zone_zsched must be valid.
1009 		 */
1010 		STRUCT_FSET(lstatus, ctst_holder, (ct->ct_state < CTS_ORPHAN) ?
1011 		    zone->zone_zsched->p_pid : 0);
1012 		STRUCT_FSET(lstatus, ctst_state, (ct->ct_state < CTS_ORPHAN) ?
1013 		    CTS_OWNED : ct->ct_state);
1014 	}
1015 	STRUCT_FSET(lstatus, ctst_nevents, ct->ct_evcnt);
1016 	STRUCT_FSET(lstatus, ctst_ntime, get_time_left(&ct->ct_ntime));
1017 	STRUCT_FSET(lstatus, ctst_qtime, get_time_left(&ct->ct_qtime));
1018 	STRUCT_FSET(lstatus, ctst_nevid,
1019 	    ct->ct_nevent ? ct->ct_nevent->cte_id : 0);
1020 	STRUCT_FSET(lstatus, ctst_critical, ct->ct_ev_crit);
1021 	STRUCT_FSET(lstatus, ctst_informative, ct->ct_ev_info);
1022 	STRUCT_FSET(lstatus, ctst_cookie, ct->ct_cookie);
1023 	STRUCT_FSET(lstatus, ctst_type, ct->ct_type->ct_type_index);
1024 	STRUCT_FSET(lstatus, ctst_id, ct->ct_id);
1025 }
1026 
1027 /*
1028  * contract_checkcred
1029  *
1030  * Determines if the specified contract is owned by a process with the
1031  * same effective uid as the specified credential.  The caller must
1032  * ensure that the uid spaces are the same.  Returns 1 on success.
1033  */
1034 static int
1035 contract_checkcred(contract_t *ct, const cred_t *cr)
1036 {
1037 	proc_t *p;
1038 	int fail = 1;
1039 
1040 	mutex_enter(&ct->ct_lock);
1041 	if ((p = ct->ct_owner) != NULL) {
1042 		mutex_enter(&p->p_crlock);
1043 		fail = crgetuid(cr) != crgetuid(p->p_cred);
1044 		mutex_exit(&p->p_crlock);
1045 	}
1046 	mutex_exit(&ct->ct_lock);
1047 
1048 	return (!fail);
1049 }
1050 
1051 /*
1052  * contract_owned
1053  *
1054  * Determines if the specified credential can view an event generated
1055  * by the specified contract.  If locked is set, the contract's ct_lock
1056  * is held and the caller will need to do additional work to determine
1057  * if they truly can see the event.  Returns 1 on success.
1058  */
1059 int
1060 contract_owned(contract_t *ct, const cred_t *cr, int locked)
1061 {
1062 	int owner, cmatch, zmatch;
1063 	uint64_t zuniqid, mzuniqid;
1064 	uid_t euid;
1065 
1066 	ASSERT(locked || MUTEX_NOT_HELD(&ct->ct_lock));
1067 
1068 	zuniqid = curproc->p_zone->zone_uniqid;
1069 	mzuniqid = contract_getzuniqid(ct);
1070 	euid = crgetuid(cr);
1071 
1072 	/*
1073 	 * owner: we own the contract
1074 	 * cmatch: we are in the creator's (and holder's) zone and our
1075 	 *   uid matches the creator's or holder's
1076 	 * zmatch: we are in the effective zone of a contract created
1077 	 *   in the global zone, and our uid matches that of the
1078 	 *   virtualized holder's (zsched/kcred)
1079 	 */
1080 	owner = (ct->ct_owner == curproc);
1081 	cmatch = (zuniqid == ct->ct_czuniqid) &&
1082 	    ((ct->ct_cuid == euid) || (!locked && contract_checkcred(ct, cr)));
1083 	zmatch = (ct->ct_czuniqid != mzuniqid) && (zuniqid == mzuniqid) &&
1084 	    (crgetuid(kcred) == euid);
1085 
1086 	return (owner || cmatch || zmatch);
1087 }
1088 
1089 
1090 /*
1091  * contract_type_init
1092  *
1093  * Called by contract types to register themselves with the contracts
1094  * framework.
1095  */
1096 ct_type_t *
1097 contract_type_init(ct_typeid_t type, const char *name, contops_t *ops,
1098     ct_f_default_t *dfault)
1099 {
1100 	ct_type_t *result;
1101 
1102 	ASSERT(type < CTT_MAXTYPE);
1103 
1104 	result = kmem_alloc(sizeof (ct_type_t), KM_SLEEP);
1105 
1106 	mutex_init(&result->ct_type_lock, NULL, MUTEX_DEFAULT, NULL);
1107 	avl_create(&result->ct_type_avl, contract_compar, sizeof (contract_t),
1108 	    offsetof(contract_t, ct_cttavl));
1109 	cte_queue_create(&result->ct_type_events, CTEL_BUNDLE, 20, 0);
1110 	result->ct_type_name = name;
1111 	result->ct_type_ops = ops;
1112 	result->ct_type_default = dfault;
1113 	result->ct_type_evid = 0;
1114 	gethrestime(&result->ct_type_timestruc);
1115 	result->ct_type_index = type;
1116 
1117 	ct_types[type] = result;
1118 
1119 	return (result);
1120 }
1121 
1122 /*
1123  * contract_type_count
1124  *
1125  * Obtains the number of contracts of a particular type.
1126  */
1127 int
1128 contract_type_count(ct_type_t *type)
1129 {
1130 	ulong_t count;
1131 
1132 	mutex_enter(&type->ct_type_lock);
1133 	count = avl_numnodes(&type->ct_type_avl);
1134 	mutex_exit(&type->ct_type_lock);
1135 
1136 	return (count);
1137 }
1138 
1139 /*
1140  * contract_type_max
1141  *
1142  * Obtains the maximum contract id of of a particular type.
1143  */
1144 ctid_t
1145 contract_type_max(ct_type_t *type)
1146 {
1147 	contract_t *ct;
1148 	ctid_t res;
1149 
1150 	mutex_enter(&type->ct_type_lock);
1151 	ct = avl_last(&type->ct_type_avl);
1152 	res = ct ? ct->ct_id : -1;
1153 	mutex_exit(&type->ct_type_lock);
1154 
1155 	return (res);
1156 }
1157 
1158 /*
1159  * contract_max
1160  *
1161  * Obtains the maximum contract id.
1162  */
1163 ctid_t
1164 contract_max(void)
1165 {
1166 	contract_t *ct;
1167 	ctid_t res;
1168 
1169 	mutex_enter(&contract_lock);
1170 	ct = avl_last(&contract_avl);
1171 	res = ct ? ct->ct_id : -1;
1172 	mutex_exit(&contract_lock);
1173 
1174 	return (res);
1175 }
1176 
1177 /*
1178  * contract_lookup_common
1179  *
1180  * Common code for contract_lookup and contract_type_lookup.  Takes a
1181  * pointer to an AVL tree to search in.  Should be called with the
1182  * appropriate tree-protecting lock held (unfortunately unassertable).
1183  */
1184 static ctid_t
1185 contract_lookup_common(avl_tree_t *tree, uint64_t zuniqid, ctid_t current)
1186 {
1187 	contract_t template, *ct;
1188 	avl_index_t where;
1189 	ctid_t res;
1190 
1191 	template.ct_id = current;
1192 	ct = avl_find(tree, &template, &where);
1193 	if (ct == NULL)
1194 		ct = avl_nearest(tree, where, AVL_AFTER);
1195 	if (zuniqid != GLOBAL_ZONEUNIQID)
1196 		while (ct && (contract_getzuniqid(ct) != zuniqid))
1197 			ct = AVL_NEXT(tree, ct);
1198 	res = ct ? ct->ct_id : -1;
1199 
1200 	return (res);
1201 }
1202 
1203 /*
1204  * contract_type_lookup
1205  *
1206  * Returns the next type contract after the specified id, visible from
1207  * the specified zone.
1208  */
1209 ctid_t
1210 contract_type_lookup(ct_type_t *type, uint64_t zuniqid, ctid_t current)
1211 {
1212 	ctid_t res;
1213 
1214 	mutex_enter(&type->ct_type_lock);
1215 	res = contract_lookup_common(&type->ct_type_avl, zuniqid, current);
1216 	mutex_exit(&type->ct_type_lock);
1217 
1218 	return (res);
1219 }
1220 
1221 /*
1222  * contract_lookup
1223  *
1224  * Returns the next contract after the specified id, visible from the
1225  * specified zone.
1226  */
1227 ctid_t
1228 contract_lookup(uint64_t zuniqid, ctid_t current)
1229 {
1230 	ctid_t res;
1231 
1232 	mutex_enter(&contract_lock);
1233 	res = contract_lookup_common(&contract_avl, zuniqid, current);
1234 	mutex_exit(&contract_lock);
1235 
1236 	return (res);
1237 }
1238 
1239 /*
1240  * contract_plookup
1241  *
1242  * Returns the next contract held by process p after the specified id,
1243  * visible from the specified zone.  Made complicated by the fact that
1244  * contracts visible in a zone but held by processes outside of the
1245  * zone need to appear as being held by zsched to zone members.
1246  */
1247 ctid_t
1248 contract_plookup(proc_t *p, ctid_t current, uint64_t zuniqid)
1249 {
1250 	contract_t template, *ct;
1251 	avl_index_t where;
1252 	ctid_t res;
1253 
1254 	template.ct_id = current;
1255 	if (zuniqid != GLOBAL_ZONEUNIQID &&
1256 	    (p->p_flag & (SSYS|SZONETOP)) == (SSYS|SZONETOP)) {
1257 		/* This is inelegant. */
1258 		mutex_enter(&contract_lock);
1259 		ct = avl_find(&contract_avl, &template, &where);
1260 		if (ct == NULL)
1261 			ct = avl_nearest(&contract_avl, where, AVL_AFTER);
1262 		while (ct && !(ct->ct_state < CTS_ORPHAN &&
1263 		    contract_getzuniqid(ct) == zuniqid &&
1264 		    ct->ct_czuniqid == GLOBAL_ZONEUNIQID))
1265 			ct = AVL_NEXT(&contract_avl, ct);
1266 		res = ct ? ct->ct_id : -1;
1267 		mutex_exit(&contract_lock);
1268 	} else {
1269 		mutex_enter(&p->p_lock);
1270 		ct = avl_find(&p->p_ct_held, &template, &where);
1271 		if (ct == NULL)
1272 			ct = avl_nearest(&p->p_ct_held, where, AVL_AFTER);
1273 		res = ct ? ct->ct_id : -1;
1274 		mutex_exit(&p->p_lock);
1275 	}
1276 
1277 	return (res);
1278 }
1279 
1280 /*
1281  * contract_ptr_common
1282  *
1283  * Common code for contract_ptr and contract_type_ptr.  Takes a pointer
1284  * to an AVL tree to search in.  Should be called with the appropriate
1285  * tree-protecting lock held (unfortunately unassertable).
1286  */
1287 static contract_t *
1288 contract_ptr_common(avl_tree_t *tree, ctid_t id, uint64_t zuniqid)
1289 {
1290 	contract_t template, *ct;
1291 
1292 	template.ct_id = id;
1293 	ct = avl_find(tree, &template, NULL);
1294 	if (ct == NULL || (zuniqid != GLOBAL_ZONEUNIQID &&
1295 	    contract_getzuniqid(ct) != zuniqid)) {
1296 		return (NULL);
1297 	}
1298 
1299 	/*
1300 	 * Check to see if a thread is in the window in contract_rele
1301 	 * between dropping the reference count and removing the
1302 	 * contract from the type AVL.
1303 	 */
1304 	mutex_enter(&ct->ct_reflock);
1305 	if (ct->ct_ref) {
1306 		ct->ct_ref++;
1307 		mutex_exit(&ct->ct_reflock);
1308 	} else {
1309 		mutex_exit(&ct->ct_reflock);
1310 		ct = NULL;
1311 	}
1312 
1313 	return (ct);
1314 }
1315 
1316 /*
1317  * contract_type_ptr
1318  *
1319  * Returns a pointer to the contract with the specified id.  The
1320  * contract is held, so the caller needs to release the reference when
1321  * it is through with the contract.
1322  */
1323 contract_t *
1324 contract_type_ptr(ct_type_t *type, ctid_t id, uint64_t zuniqid)
1325 {
1326 	contract_t *ct;
1327 
1328 	mutex_enter(&type->ct_type_lock);
1329 	ct = contract_ptr_common(&type->ct_type_avl, id, zuniqid);
1330 	mutex_exit(&type->ct_type_lock);
1331 
1332 	return (ct);
1333 }
1334 
1335 /*
1336  * contract_ptr
1337  *
1338  * Returns a pointer to the contract with the specified id.  The
1339  * contract is held, so the caller needs to release the reference when
1340  * it is through with the contract.
1341  */
1342 contract_t *
1343 contract_ptr(ctid_t id, uint64_t zuniqid)
1344 {
1345 	contract_t *ct;
1346 
1347 	mutex_enter(&contract_lock);
1348 	ct = contract_ptr_common(&contract_avl, id, zuniqid);
1349 	mutex_exit(&contract_lock);
1350 
1351 	return (ct);
1352 }
1353 
1354 /*
1355  * contract_type_time
1356  *
1357  * Obtains the last time a contract of a particular type was created.
1358  */
1359 void
1360 contract_type_time(ct_type_t *type, timestruc_t *time)
1361 {
1362 	mutex_enter(&type->ct_type_lock);
1363 	*time = type->ct_type_timestruc;
1364 	mutex_exit(&type->ct_type_lock);
1365 }
1366 
1367 /*
1368  * contract_type_bundle
1369  *
1370  * Obtains a type's bundle queue.
1371  */
1372 ct_equeue_t *
1373 contract_type_bundle(ct_type_t *type)
1374 {
1375 	return (&type->ct_type_events);
1376 }
1377 
1378 /*
1379  * contract_type_pbundle
1380  *
1381  * Obtain's a process's bundle queue.  If one doesn't exist, one is
1382  * created.  Often used simply to ensure that a bundle queue is
1383  * allocated.
1384  */
1385 ct_equeue_t *
1386 contract_type_pbundle(ct_type_t *type, proc_t *pp)
1387 {
1388 	/*
1389 	 * If there isn't an array of bundle queues, allocate one.
1390 	 */
1391 	if (pp->p_ct_equeue == NULL) {
1392 		size_t size = CTT_MAXTYPE * sizeof (ct_equeue_t *);
1393 		ct_equeue_t **qa = kmem_zalloc(size, KM_SLEEP);
1394 
1395 		mutex_enter(&pp->p_lock);
1396 		if (pp->p_ct_equeue)
1397 			kmem_free(qa, size);
1398 		else
1399 			pp->p_ct_equeue = qa;
1400 		mutex_exit(&pp->p_lock);
1401 	}
1402 
1403 	/*
1404 	 * If there isn't a bundle queue of the required type, allocate
1405 	 * one.
1406 	 */
1407 	if (pp->p_ct_equeue[type->ct_type_index] == NULL) {
1408 		ct_equeue_t *q = kmem_zalloc(sizeof (ct_equeue_t), KM_SLEEP);
1409 		cte_queue_create(q, CTEL_PBUNDLE, 20, 1);
1410 
1411 		mutex_enter(&pp->p_lock);
1412 		if (pp->p_ct_equeue[type->ct_type_index])
1413 			cte_queue_drain(q, 0);
1414 		else
1415 			pp->p_ct_equeue[type->ct_type_index] = q;
1416 		mutex_exit(&pp->p_lock);
1417 	}
1418 
1419 	return (pp->p_ct_equeue[type->ct_type_index]);
1420 }
1421 
1422 /*
1423  * ctparam_copyin
1424  *
1425  * copyin a ct_param_t for CT_TSET or CT_TGET commands.
1426  * If ctparam_copyout() is not called after ctparam_copyin(), then
1427  * the caller must kmem_free() the buffer pointed by kparam->ctpm_kbuf.
1428  *
1429  * The copyin/out of ct_param_t is not done in ctmpl_set() and ctmpl_get()
1430  * because prctioctl() calls ctmpl_set() and ctmpl_get() while holding a
1431  * process lock.
1432  */
1433 int
1434 ctparam_copyin(const void *uaddr, ct_kparam_t *kparam, int flag, int cmd)
1435 {
1436 	uint32_t size;
1437 	void *ubuf;
1438 	ct_param_t *param = &kparam->param;
1439 	STRUCT_DECL(ct_param, uarg);
1440 
1441 	STRUCT_INIT(uarg, flag);
1442 	if (copyin(uaddr, STRUCT_BUF(uarg), STRUCT_SIZE(uarg)))
1443 		return (EFAULT);
1444 	size = STRUCT_FGET(uarg, ctpm_size);
1445 	ubuf = STRUCT_FGETP(uarg, ctpm_value);
1446 
1447 	if (size > CT_PARAM_MAX_SIZE || size == 0)
1448 		return (EINVAL);
1449 
1450 	kparam->ctpm_kbuf = kmem_alloc(size, KM_SLEEP);
1451 	if (cmd == CT_TSET) {
1452 		if (copyin(ubuf, kparam->ctpm_kbuf, size)) {
1453 			kmem_free(kparam->ctpm_kbuf, size);
1454 			return (EFAULT);
1455 		}
1456 	}
1457 	param->ctpm_id = STRUCT_FGET(uarg, ctpm_id);
1458 	param->ctpm_size = size;
1459 	param->ctpm_value = ubuf;
1460 	kparam->ret_size = 0;
1461 
1462 	return (0);
1463 }
1464 
1465 /*
1466  * ctparam_copyout
1467  *
1468  * copyout a ct_kparam_t and frees the buffer pointed by the member
1469  * ctpm_kbuf of ct_kparam_t
1470  */
1471 int
1472 ctparam_copyout(ct_kparam_t *kparam, void *uaddr, int flag)
1473 {
1474 	int r = 0;
1475 	ct_param_t *param = &kparam->param;
1476 	STRUCT_DECL(ct_param, uarg);
1477 
1478 	STRUCT_INIT(uarg, flag);
1479 
1480 	STRUCT_FSET(uarg, ctpm_id, param->ctpm_id);
1481 	STRUCT_FSET(uarg, ctpm_size, kparam->ret_size);
1482 	STRUCT_FSETP(uarg, ctpm_value, param->ctpm_value);
1483 	if (copyout(STRUCT_BUF(uarg), uaddr, STRUCT_SIZE(uarg))) {
1484 		r = EFAULT;
1485 		goto error;
1486 	}
1487 	if (copyout(kparam->ctpm_kbuf, param->ctpm_value,
1488 	    MIN(kparam->ret_size, param->ctpm_size))) {
1489 		r = EFAULT;
1490 	}
1491 
1492 error:
1493 	kmem_free(kparam->ctpm_kbuf, param->ctpm_size);
1494 
1495 	return (r);
1496 }
1497 
1498 /*
1499  * ctmpl_free
1500  *
1501  * Frees a template.
1502  */
1503 void
1504 ctmpl_free(ct_template_t *template)
1505 {
1506 	mutex_destroy(&template->ctmpl_lock);
1507 	template->ctmpl_ops->ctop_free(template);
1508 }
1509 
1510 /*
1511  * ctmpl_dup
1512  *
1513  * Creates a copy of a template.
1514  */
1515 ct_template_t *
1516 ctmpl_dup(ct_template_t *template)
1517 {
1518 	ct_template_t *new;
1519 
1520 	if (template == NULL)
1521 		return (NULL);
1522 
1523 	new = template->ctmpl_ops->ctop_dup(template);
1524 	/*
1525 	 * ctmpl_lock was taken by ctop_dup's call to ctmpl_copy and
1526 	 * should have remain held until now.
1527 	 */
1528 	mutex_exit(&template->ctmpl_lock);
1529 
1530 	return (new);
1531 }
1532 
1533 /*
1534  * ctmpl_set
1535  *
1536  * Sets the requested terms of a template.
1537  */
1538 int
1539 ctmpl_set(ct_template_t *template, ct_kparam_t *kparam, const cred_t *cr)
1540 {
1541 	int result = 0;
1542 	ct_param_t *param = &kparam->param;
1543 	uint64_t param_value;
1544 
1545 	param_value = 0;
1546 	if (param->ctpm_id == CTP_COOKIE ||
1547 	    param->ctpm_id == CTP_EV_INFO ||
1548 	    param->ctpm_id == CTP_EV_CRITICAL) {
1549 		if (param->ctpm_size < sizeof (uint64_t)) {
1550 			return (EINVAL);
1551 		} else {
1552 			param_value = *(uint64_t *)kparam->ctpm_kbuf;
1553 		}
1554 	}
1555 
1556 	mutex_enter(&template->ctmpl_lock);
1557 	switch (param->ctpm_id) {
1558 	case CTP_COOKIE:
1559 		template->ctmpl_cookie = param_value;
1560 		break;
1561 	case CTP_EV_INFO:
1562 		if (param_value & ~(uint64_t)template->ctmpl_ops->allevents)
1563 			result = EINVAL;
1564 		else
1565 			template->ctmpl_ev_info = param_value;
1566 		break;
1567 	case CTP_EV_CRITICAL:
1568 		if (param_value & ~(uint64_t)template->ctmpl_ops->allevents) {
1569 			result = EINVAL;
1570 			break;
1571 		} else if ((~template->ctmpl_ev_crit & param_value) == 0) {
1572 			/*
1573 			 * Assume that a pure reduction of the critical
1574 			 * set is allowed by the contract type.
1575 			 */
1576 			template->ctmpl_ev_crit = param_value;
1577 			break;
1578 		}
1579 		/*
1580 		 * There may be restrictions on what we can make
1581 		 * critical, so we defer to the judgement of the
1582 		 * contract type.
1583 		 */
1584 		/* FALLTHROUGH */
1585 	default:
1586 		result = template->ctmpl_ops->ctop_set(template, kparam, cr);
1587 	}
1588 	mutex_exit(&template->ctmpl_lock);
1589 
1590 	return (result);
1591 }
1592 
1593 /*
1594  * ctmpl_get
1595  *
1596  * Obtains the requested terms from a template.
1597  *
1598  * If the term requested is a variable-sized term and the buffer
1599  * provided is too small for the data, we truncate the data and return
1600  * the buffer size necessary to fit the term in kparam->ret_size. If the
1601  * term requested is fix-sized (uint64_t) and the buffer provided is too
1602  * small, we return EINVAL.  This should never happen if you're using
1603  * libcontract(3LIB), only if you call ioctl with a hand constructed
1604  * ct_param_t argument.
1605  *
1606  * Currently, only contract specific parameters have variable-sized
1607  * parameters.
1608  */
1609 int
1610 ctmpl_get(ct_template_t *template, ct_kparam_t *kparam)
1611 {
1612 	int result = 0;
1613 	ct_param_t *param = &kparam->param;
1614 	uint64_t *param_value;
1615 
1616 	param_value = NULL;
1617 	if (param->ctpm_id == CTP_COOKIE ||
1618 	    param->ctpm_id == CTP_EV_INFO ||
1619 	    param->ctpm_id == CTP_EV_CRITICAL) {
1620 		if (param->ctpm_size < sizeof (uint64_t)) {
1621 			return (EINVAL);
1622 		} else {
1623 			param_value = kparam->ctpm_kbuf;
1624 			kparam->ret_size = sizeof (uint64_t);
1625 		}
1626 	}
1627 
1628 	mutex_enter(&template->ctmpl_lock);
1629 	switch (param->ctpm_id) {
1630 	case CTP_COOKIE:
1631 		if (param_value != NULL)
1632 			*param_value = template->ctmpl_cookie;
1633 		break;
1634 	case CTP_EV_INFO:
1635 		if (param_value != NULL)
1636 			*param_value = template->ctmpl_ev_info;
1637 		break;
1638 	case CTP_EV_CRITICAL:
1639 		if (param_value != NULL)
1640 			*param_value = template->ctmpl_ev_crit;
1641 		break;
1642 	default:
1643 		result = template->ctmpl_ops->ctop_get(template, kparam);
1644 	}
1645 	mutex_exit(&template->ctmpl_lock);
1646 
1647 	return (result);
1648 }
1649 
1650 /*
1651  * ctmpl_makecurrent
1652  *
1653  * Used by ctmpl_activate and ctmpl_clear to set the current thread's
1654  * active template.  Frees the old active template, if there was one.
1655  */
1656 static void
1657 ctmpl_makecurrent(ct_template_t *template, ct_template_t *new)
1658 {
1659 	klwp_t *curlwp = ttolwp(curthread);
1660 	proc_t *p = curproc;
1661 	ct_template_t *old;
1662 
1663 	mutex_enter(&p->p_lock);
1664 	old = curlwp->lwp_ct_active[template->ctmpl_type->ct_type_index];
1665 	curlwp->lwp_ct_active[template->ctmpl_type->ct_type_index] = new;
1666 	mutex_exit(&p->p_lock);
1667 
1668 	if (old)
1669 		ctmpl_free(old);
1670 }
1671 
1672 /*
1673  * ctmpl_activate
1674  *
1675  * Copy the specified template as the current thread's activate
1676  * template of that type.
1677  */
1678 void
1679 ctmpl_activate(ct_template_t *template)
1680 {
1681 	ctmpl_makecurrent(template, ctmpl_dup(template));
1682 }
1683 
1684 /*
1685  * ctmpl_clear
1686  *
1687  * Clears the current thread's activate template of the same type as
1688  * the specified template.
1689  */
1690 void
1691 ctmpl_clear(ct_template_t *template)
1692 {
1693 	ctmpl_makecurrent(template, NULL);
1694 }
1695 
1696 /*
1697  * ctmpl_create
1698  *
1699  * Creates a new contract using the specified template.
1700  */
1701 int
1702 ctmpl_create(ct_template_t *template, ctid_t *ctidp)
1703 {
1704 	return (template->ctmpl_ops->ctop_create(template, ctidp));
1705 }
1706 
1707 /*
1708  * ctmpl_init
1709  *
1710  * Initializes the common portion of a new contract template.
1711  */
1712 void
1713 ctmpl_init(ct_template_t *new, ctmplops_t *ops, ct_type_t *type, void *data)
1714 {
1715 	mutex_init(&new->ctmpl_lock, NULL, MUTEX_DEFAULT, NULL);
1716 	new->ctmpl_ops = ops;
1717 	new->ctmpl_type = type;
1718 	new->ctmpl_data = data;
1719 	new->ctmpl_ev_info = new->ctmpl_ev_crit = 0;
1720 	new->ctmpl_cookie = 0;
1721 }
1722 
1723 /*
1724  * ctmpl_copy
1725  *
1726  * Copies the common portions of a contract template.  Intended for use
1727  * by a contract type's ctop_dup template op.  Returns with the old
1728  * template's lock held, which will should remain held until the
1729  * template op returns (it is dropped by ctmpl_dup).
1730  */
1731 void
1732 ctmpl_copy(ct_template_t *new, ct_template_t *old)
1733 {
1734 	mutex_init(&new->ctmpl_lock, NULL, MUTEX_DEFAULT, NULL);
1735 	mutex_enter(&old->ctmpl_lock);
1736 	new->ctmpl_ops = old->ctmpl_ops;
1737 	new->ctmpl_type = old->ctmpl_type;
1738 	new->ctmpl_ev_crit = old->ctmpl_ev_crit;
1739 	new->ctmpl_ev_info = old->ctmpl_ev_info;
1740 	new->ctmpl_cookie = old->ctmpl_cookie;
1741 }
1742 
1743 /*
1744  * ctmpl_create_inval
1745  *
1746  * Returns EINVAL.  Provided for the convenience of those contract
1747  * types which don't support ct_tmpl_create(3contract) and would
1748  * otherwise need to create their own stub for the ctop_create template
1749  * op.
1750  */
1751 /*ARGSUSED*/
1752 int
1753 ctmpl_create_inval(ct_template_t *template, ctid_t *ctidp)
1754 {
1755 	return (EINVAL);
1756 }
1757 
1758 
1759 /*
1760  * cte_queue_create
1761  *
1762  * Initializes a queue of a particular type.  If dynamic is set, the
1763  * queue is to be freed when its last listener is removed after being
1764  * drained.
1765  */
1766 static void
1767 cte_queue_create(ct_equeue_t *q, ct_listnum_t list, int maxinf, int dynamic)
1768 {
1769 	mutex_init(&q->ctq_lock, NULL, MUTEX_DEFAULT, NULL);
1770 	q->ctq_listno = list;
1771 	list_create(&q->ctq_events, sizeof (ct_kevent_t),
1772 	    offsetof(ct_kevent_t, cte_nodes[list].ctm_node));
1773 	list_create(&q->ctq_listeners, sizeof (ct_listener_t),
1774 	    offsetof(ct_listener_t, ctl_allnode));
1775 	list_create(&q->ctq_tail, sizeof (ct_listener_t),
1776 	    offsetof(ct_listener_t, ctl_tailnode));
1777 	gethrestime(&q->ctq_atime);
1778 	q->ctq_nlisteners = 0;
1779 	q->ctq_nreliable = 0;
1780 	q->ctq_ninf = 0;
1781 	q->ctq_max = maxinf;
1782 
1783 	/*
1784 	 * Bundle queues and contract queues are embedded in other
1785 	 * structures and are implicitly referenced counted by virtue
1786 	 * of their vnodes' indirect hold on their contracts.  Process
1787 	 * bundle queues are dynamically allocated and may persist
1788 	 * after the death of the process, so they must be explicitly
1789 	 * reference counted.
1790 	 */
1791 	q->ctq_flags = dynamic ? CTQ_REFFED : 0;
1792 }
1793 
1794 /*
1795  * cte_queue_destroy
1796  *
1797  * Destroys the specified queue.  The queue is freed if referenced
1798  * counted.
1799  */
1800 static void
1801 cte_queue_destroy(ct_equeue_t *q)
1802 {
1803 	ASSERT(q->ctq_flags & CTQ_DEAD);
1804 	ASSERT(q->ctq_nlisteners == 0);
1805 	ASSERT(q->ctq_nreliable == 0);
1806 	list_destroy(&q->ctq_events);
1807 	list_destroy(&q->ctq_listeners);
1808 	list_destroy(&q->ctq_tail);
1809 	mutex_destroy(&q->ctq_lock);
1810 	if (q->ctq_flags & CTQ_REFFED)
1811 		kmem_free(q, sizeof (ct_equeue_t));
1812 }
1813 
1814 /*
1815  * cte_hold
1816  *
1817  * Takes a hold on the specified event.
1818  */
1819 static void
1820 cte_hold(ct_kevent_t *e)
1821 {
1822 	mutex_enter(&e->cte_lock);
1823 	ASSERT(e->cte_refs > 0);
1824 	e->cte_refs++;
1825 	mutex_exit(&e->cte_lock);
1826 }
1827 
1828 /*
1829  * cte_rele
1830  *
1831  * Releases a hold on the specified event.  If the caller had the last
1832  * reference, frees the event and releases its hold on the contract
1833  * that generated it.
1834  */
1835 static void
1836 cte_rele(ct_kevent_t *e)
1837 {
1838 	mutex_enter(&e->cte_lock);
1839 	ASSERT(e->cte_refs > 0);
1840 	if (--e->cte_refs) {
1841 		mutex_exit(&e->cte_lock);
1842 		return;
1843 	}
1844 
1845 	contract_rele(e->cte_contract);
1846 
1847 	mutex_destroy(&e->cte_lock);
1848 	nvlist_free(e->cte_data);
1849 	nvlist_free(e->cte_gdata);
1850 	kmem_free(e, sizeof (ct_kevent_t));
1851 }
1852 
1853 /*
1854  * cte_qrele
1855  *
1856  * Remove this listener's hold on the specified event, removing and
1857  * releasing the queue's hold on the event if appropriate.
1858  */
1859 static void
1860 cte_qrele(ct_equeue_t *q, ct_listener_t *l, ct_kevent_t *e)
1861 {
1862 	ct_member_t *member = &e->cte_nodes[q->ctq_listno];
1863 
1864 	ASSERT(MUTEX_HELD(&q->ctq_lock));
1865 
1866 	if (l->ctl_flags & CTLF_RELIABLE)
1867 		member->ctm_nreliable--;
1868 	if ((--member->ctm_refs == 0) && member->ctm_trimmed) {
1869 		member->ctm_trimmed = 0;
1870 		list_remove(&q->ctq_events, e);
1871 		cte_rele(e);
1872 	}
1873 }
1874 
1875 /*
1876  * cte_qmove
1877  *
1878  * Move this listener to the specified event in the queue.
1879  */
1880 static ct_kevent_t *
1881 cte_qmove(ct_equeue_t *q, ct_listener_t *l, ct_kevent_t *e)
1882 {
1883 	ct_kevent_t *olde;
1884 
1885 	ASSERT(MUTEX_HELD(&q->ctq_lock));
1886 	ASSERT(l->ctl_equeue == q);
1887 
1888 	if ((olde = l->ctl_position) == NULL)
1889 		list_remove(&q->ctq_tail, l);
1890 
1891 	while (e != NULL && e->cte_nodes[q->ctq_listno].ctm_trimmed)
1892 		e = list_next(&q->ctq_events, e);
1893 
1894 	if (e != NULL) {
1895 		e->cte_nodes[q->ctq_listno].ctm_refs++;
1896 		if (l->ctl_flags & CTLF_RELIABLE)
1897 			e->cte_nodes[q->ctq_listno].ctm_nreliable++;
1898 	} else {
1899 		list_insert_tail(&q->ctq_tail, l);
1900 	}
1901 
1902 	l->ctl_position = e;
1903 	if (olde)
1904 		cte_qrele(q, l, olde);
1905 
1906 	return (e);
1907 }
1908 
1909 /*
1910  * cte_checkcred
1911  *
1912  * Determines if the specified event's contract is owned by a process
1913  * with the same effective uid as the specified credential.  Called
1914  * after a failed call to contract_owned with locked set.  Because it
1915  * drops the queue lock, its caller (cte_qreadable) needs to make sure
1916  * we're still in the same place after we return.  Returns 1 on
1917  * success.
1918  */
1919 static int
1920 cte_checkcred(ct_equeue_t *q, ct_kevent_t *e, const cred_t *cr)
1921 {
1922 	int result;
1923 	contract_t *ct = e->cte_contract;
1924 
1925 	cte_hold(e);
1926 	mutex_exit(&q->ctq_lock);
1927 	result = curproc->p_zone->zone_uniqid == ct->ct_czuniqid &&
1928 	    contract_checkcred(ct, cr);
1929 	mutex_enter(&q->ctq_lock);
1930 	cte_rele(e);
1931 
1932 	return (result);
1933 }
1934 
1935 /*
1936  * cte_qreadable
1937  *
1938  * Ensures that the listener is pointing to a valid event that the
1939  * caller has the credentials to read.  Returns 0 if we can read the
1940  * event we're pointing to.
1941  */
1942 static int
1943 cte_qreadable(ct_equeue_t *q, ct_listener_t *l, const cred_t *cr,
1944     uint64_t zuniqid, int crit)
1945 {
1946 	ct_kevent_t *e, *next;
1947 	contract_t *ct;
1948 
1949 	ASSERT(MUTEX_HELD(&q->ctq_lock));
1950 	ASSERT(l->ctl_equeue == q);
1951 
1952 	if (l->ctl_flags & CTLF_COPYOUT)
1953 		return (1);
1954 
1955 	next = l->ctl_position;
1956 	while (e = cte_qmove(q, l, next)) {
1957 		ct = e->cte_contract;
1958 		/*
1959 		 * Check obvious things first.  If we are looking for a
1960 		 * critical message, is this one?  If we aren't in the
1961 		 * global zone, is this message meant for us?
1962 		 */
1963 		if ((crit && (e->cte_flags & (CTE_INFO | CTE_ACK))) ||
1964 		    (cr != NULL && zuniqid != GLOBAL_ZONEUNIQID &&
1965 		    zuniqid != contract_getzuniqid(ct))) {
1966 
1967 			next = list_next(&q->ctq_events, e);
1968 
1969 		/*
1970 		 * Next, see if our effective uid equals that of owner
1971 		 * or author of the contract.  Since we are holding the
1972 		 * queue lock, contract_owned can't always check if we
1973 		 * have the same effective uid as the contract's
1974 		 * owner.  If it comes to that, it fails and we take
1975 		 * the slow(er) path.
1976 		 */
1977 		} else if (cr != NULL && !contract_owned(ct, cr, B_TRUE)) {
1978 
1979 			/*
1980 			 * At this point we either don't have any claim
1981 			 * to this contract or we match the effective
1982 			 * uid of the owner but couldn't tell.  We
1983 			 * first test for a NULL holder so that events
1984 			 * from orphans and inherited contracts avoid
1985 			 * the penalty phase.
1986 			 */
1987 			if (e->cte_contract->ct_owner == NULL &&
1988 			    !secpolicy_contract_observer_choice(cr))
1989 				next = list_next(&q->ctq_events, e);
1990 
1991 			/*
1992 			 * cte_checkcred will juggle locks to see if we
1993 			 * have the same uid as the event's contract's
1994 			 * current owner.  If it succeeds, we have to
1995 			 * make sure we are in the same point in the
1996 			 * queue.
1997 			 */
1998 			else if (cte_checkcred(q, e, cr) &&
1999 			    l->ctl_position == e)
2000 				break;
2001 
2002 			/*
2003 			 * cte_checkcred failed; see if we're in the
2004 			 * same place.
2005 			 */
2006 			else if (l->ctl_position == e)
2007 				if (secpolicy_contract_observer_choice(cr))
2008 					break;
2009 				else
2010 					next = list_next(&q->ctq_events, e);
2011 
2012 			/*
2013 			 * cte_checkcred failed, and our position was
2014 			 * changed.  Start from there.
2015 			 */
2016 			else
2017 				next = l->ctl_position;
2018 		} else {
2019 			break;
2020 		}
2021 	}
2022 
2023 	/*
2024 	 * We check for CTLF_COPYOUT again in case we dropped the queue
2025 	 * lock in cte_checkcred.
2026 	 */
2027 	return ((l->ctl_flags & CTLF_COPYOUT) || (l->ctl_position == NULL));
2028 }
2029 
2030 /*
2031  * cte_qwakeup
2032  *
2033  * Wakes up any waiting listeners and points them at the specified event.
2034  */
2035 static void
2036 cte_qwakeup(ct_equeue_t *q, ct_kevent_t *e)
2037 {
2038 	ct_listener_t *l;
2039 
2040 	ASSERT(MUTEX_HELD(&q->ctq_lock));
2041 
2042 	while (l = list_head(&q->ctq_tail)) {
2043 		list_remove(&q->ctq_tail, l);
2044 		e->cte_nodes[q->ctq_listno].ctm_refs++;
2045 		if (l->ctl_flags & CTLF_RELIABLE)
2046 			e->cte_nodes[q->ctq_listno].ctm_nreliable++;
2047 		l->ctl_position = e;
2048 		cv_signal(&l->ctl_cv);
2049 		pollwakeup(&l->ctl_pollhead, POLLIN);
2050 	}
2051 }
2052 
2053 /*
2054  * cte_copy
2055  *
2056  * Copies events from the specified contract event queue to the
2057  * end of the specified process bundle queue.  Only called from
2058  * contract_adopt.
2059  *
2060  * We copy to the end of the target queue instead of mixing the events
2061  * in their proper order because otherwise the act of adopting a
2062  * contract would require a process to reset all process bundle
2063  * listeners it needed to see the new events.  This would, in turn,
2064  * require the process to keep track of which preexisting events had
2065  * already been processed.
2066  */
2067 static void
2068 cte_copy(ct_equeue_t *q, ct_equeue_t *newq)
2069 {
2070 	ct_kevent_t *e, *first = NULL;
2071 
2072 	VERIFY(q->ctq_listno == CTEL_CONTRACT);
2073 	VERIFY(newq->ctq_listno == CTEL_PBUNDLE);
2074 
2075 	mutex_enter(&q->ctq_lock);
2076 	mutex_enter(&newq->ctq_lock);
2077 
2078 	/*
2079 	 * For now, only copy critical events.
2080 	 */
2081 	for (e = list_head(&q->ctq_events); e != NULL;
2082 	    e = list_next(&q->ctq_events, e)) {
2083 		if ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
2084 			if (first == NULL)
2085 				first = e;
2086 			/*
2087 			 * It is possible for adoption to race with an owner's
2088 			 * cte_publish_all(); we must only enqueue events that
2089 			 * have not already been enqueued.
2090 			 */
2091 			if (!list_link_active((list_node_t *)
2092 			    ((uintptr_t)e + newq->ctq_events.list_offset))) {
2093 				list_insert_tail(&newq->ctq_events, e);
2094 				cte_hold(e);
2095 			}
2096 		}
2097 	}
2098 
2099 	mutex_exit(&q->ctq_lock);
2100 
2101 	if (first)
2102 		cte_qwakeup(newq, first);
2103 
2104 	mutex_exit(&newq->ctq_lock);
2105 }
2106 
2107 /*
2108  * cte_trim
2109  *
2110  * Trims unneeded events from an event queue.  Algorithm works as
2111  * follows:
2112  *
2113  *   Removes all informative and acknowledged critical events until the
2114  *   first referenced event is found.
2115  *
2116  *   If a contract is specified, removes all events (regardless of
2117  *   acknowledgement) generated by that contract until the first event
2118  *   referenced by a reliable listener is found.  Reference events are
2119  *   removed by marking them "trimmed".  Such events will be removed
2120  *   when the last reference is dropped and will be skipped by future
2121  *   listeners.
2122  *
2123  * This is pretty basic.  Ideally this should remove from the middle of
2124  * the list (i.e. beyond the first referenced event), and even
2125  * referenced events.
2126  */
2127 static void
2128 cte_trim(ct_equeue_t *q, contract_t *ct)
2129 {
2130 	ct_kevent_t *e, *next;
2131 	int flags, stopper;
2132 	int start = 1;
2133 
2134 	VERIFY(MUTEX_HELD(&q->ctq_lock));
2135 
2136 	for (e = list_head(&q->ctq_events); e != NULL; e = next) {
2137 		next = list_next(&q->ctq_events, e);
2138 		flags = e->cte_flags;
2139 		stopper = (q->ctq_listno != CTEL_PBUNDLE) &&
2140 		    (e->cte_nodes[q->ctq_listno].ctm_nreliable > 0);
2141 		if (e->cte_nodes[q->ctq_listno].ctm_refs == 0) {
2142 			if ((start && (flags & (CTE_INFO | CTE_ACK))) ||
2143 			    (e->cte_contract == ct)) {
2144 				/*
2145 				 * Toss informative and ACKed critical messages.
2146 				 */
2147 				list_remove(&q->ctq_events, e);
2148 				cte_rele(e);
2149 			}
2150 		} else if ((e->cte_contract == ct) && !stopper) {
2151 			ASSERT(q->ctq_nlisteners != 0);
2152 			e->cte_nodes[q->ctq_listno].ctm_trimmed = 1;
2153 		} else if (ct && !stopper) {
2154 			start = 0;
2155 		} else {
2156 			/*
2157 			 * Don't free messages past the first reader.
2158 			 */
2159 			break;
2160 		}
2161 	}
2162 }
2163 
2164 /*
2165  * cte_queue_drain
2166  *
2167  * Drain all events from the specified queue, and mark it dead.  If
2168  * "ack" is set, acknowledge any critical events we find along the
2169  * way.
2170  */
2171 static void
2172 cte_queue_drain(ct_equeue_t *q, int ack)
2173 {
2174 	ct_kevent_t *e, *next;
2175 	ct_listener_t *l;
2176 
2177 	mutex_enter(&q->ctq_lock);
2178 
2179 	for (e = list_head(&q->ctq_events); e != NULL; e = next) {
2180 		next = list_next(&q->ctq_events, e);
2181 		if (ack && ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0)) {
2182 			/*
2183 			 * Make sure critical messages are eventually
2184 			 * removed from the bundle queues.
2185 			 */
2186 			mutex_enter(&e->cte_lock);
2187 			e->cte_flags |= CTE_ACK;
2188 			mutex_exit(&e->cte_lock);
2189 			ASSERT(MUTEX_HELD(&e->cte_contract->ct_lock));
2190 			e->cte_contract->ct_evcnt--;
2191 		}
2192 		list_remove(&q->ctq_events, e);
2193 		e->cte_nodes[q->ctq_listno].ctm_refs = 0;
2194 		e->cte_nodes[q->ctq_listno].ctm_nreliable = 0;
2195 		e->cte_nodes[q->ctq_listno].ctm_trimmed = 0;
2196 		cte_rele(e);
2197 	}
2198 
2199 	/*
2200 	 * This is necessary only because of CTEL_PBUNDLE listeners;
2201 	 * the events they point to can move from one pbundle to
2202 	 * another.  Fortunately, this only happens if the contract is
2203 	 * inherited, which (in turn) only happens if the process
2204 	 * exits, which means it's an all-or-nothing deal.  If this
2205 	 * wasn't the case, we would instead need to keep track of
2206 	 * listeners on a per-event basis, not just a per-queue basis.
2207 	 * This would have the side benefit of letting us clean up
2208 	 * trimmed events sooner (i.e. immediately), but would
2209 	 * unfortunately make events even bigger than they already
2210 	 * are.
2211 	 */
2212 	for (l = list_head(&q->ctq_listeners); l;
2213 	    l = list_next(&q->ctq_listeners, l)) {
2214 		l->ctl_flags |= CTLF_DEAD;
2215 		if (l->ctl_position) {
2216 			l->ctl_position = NULL;
2217 			list_insert_tail(&q->ctq_tail, l);
2218 		}
2219 		cv_broadcast(&l->ctl_cv);
2220 	}
2221 
2222 	/*
2223 	 * Disallow events.
2224 	 */
2225 	q->ctq_flags |= CTQ_DEAD;
2226 
2227 	/*
2228 	 * If we represent the last reference to a reference counted
2229 	 * process bundle queue, free it.
2230 	 */
2231 	if ((q->ctq_flags & CTQ_REFFED) && (q->ctq_nlisteners == 0))
2232 		cte_queue_destroy(q);
2233 	else
2234 		mutex_exit(&q->ctq_lock);
2235 }
2236 
2237 /*
2238  * cte_publish
2239  *
2240  * Publishes an event to a specific queue.  Only called by
2241  * cte_publish_all.
2242  */
2243 static void
2244 cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp, boolean_t mayexist)
2245 {
2246 	ASSERT(MUTEX_HELD(&q->ctq_lock));
2247 
2248 	q->ctq_atime = *tsp;
2249 
2250 	/*
2251 	 * If this event may already exist on this queue, check to see if it
2252 	 * is already there and return if so.
2253 	 */
2254 	if (mayexist && list_link_active((list_node_t *)((uintptr_t)e +
2255 	    q->ctq_events.list_offset))) {
2256 		mutex_exit(&q->ctq_lock);
2257 		cte_rele(e);
2258 		return;
2259 	}
2260 
2261 	/*
2262 	 * Don't publish if the event is informative and there aren't
2263 	 * any listeners, or if the queue has been shut down.
2264 	 */
2265 	if (((q->ctq_nlisteners == 0) && (e->cte_flags & (CTE_INFO|CTE_ACK))) ||
2266 	    (q->ctq_flags & CTQ_DEAD)) {
2267 		mutex_exit(&q->ctq_lock);
2268 		cte_rele(e);
2269 		return;
2270 	}
2271 
2272 	/*
2273 	 * Enqueue event
2274 	 */
2275 	VERIFY(!list_link_active((list_node_t *)
2276 	    ((uintptr_t)e + q->ctq_events.list_offset)));
2277 	list_insert_tail(&q->ctq_events, e);
2278 
2279 	/*
2280 	 * Check for waiting listeners
2281 	 */
2282 	cte_qwakeup(q, e);
2283 
2284 	/*
2285 	 * Trim unnecessary events from the queue.
2286 	 */
2287 	cte_trim(q, NULL);
2288 	mutex_exit(&q->ctq_lock);
2289 }
2290 
2291 /*
2292  * cte_publish_all
2293  *
2294  * Publish an event to all necessary event queues.  The event, e, must
2295  * be zallocated by the caller, and the event's flags and type must be
2296  * set.  The rest of the event's fields are initialized here.
2297  */
2298 uint64_t
2299 cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata)
2300 {
2301 	ct_equeue_t *q;
2302 	timespec_t ts;
2303 	uint64_t evid;
2304 	ct_kevent_t *negev;
2305 	int negend;
2306 
2307 	e->cte_contract = ct;
2308 	e->cte_data = data;
2309 	e->cte_gdata = gdata;
2310 	e->cte_refs = 3;
2311 	evid = e->cte_id = atomic_inc_64_nv(&ct->ct_type->ct_type_evid);
2312 	contract_hold(ct);
2313 
2314 	/*
2315 	 * For a negotiation event we set the ct->ct_nevent field of the
2316 	 * contract for the duration of the negotiation
2317 	 */
2318 	negend = 0;
2319 	if (e->cte_flags & CTE_NEG) {
2320 		cte_hold(e);
2321 		ct->ct_nevent = e;
2322 	} else if (e->cte_type == CT_EV_NEGEND) {
2323 		negend = 1;
2324 	}
2325 
2326 	gethrestime(&ts);
2327 
2328 	/*
2329 	 * ct_evtlock simply (and only) ensures that two events sent
2330 	 * from the same contract are delivered to all queues in the
2331 	 * same order.
2332 	 */
2333 	mutex_enter(&ct->ct_evtlock);
2334 
2335 	/*
2336 	 * CTEL_CONTRACT - First deliver to the contract queue, acking
2337 	 * the event if the contract has been orphaned.
2338 	 */
2339 	mutex_enter(&ct->ct_lock);
2340 	mutex_enter(&ct->ct_events.ctq_lock);
2341 	if ((e->cte_flags & CTE_INFO) == 0) {
2342 		if (ct->ct_state >= CTS_ORPHAN)
2343 			e->cte_flags |= CTE_ACK;
2344 		else
2345 			ct->ct_evcnt++;
2346 	}
2347 	mutex_exit(&ct->ct_lock);
2348 	cte_publish(&ct->ct_events, e, &ts, B_FALSE);
2349 
2350 	/*
2351 	 * CTEL_BUNDLE - Next deliver to the contract type's bundle
2352 	 * queue.
2353 	 */
2354 	mutex_enter(&ct->ct_type->ct_type_events.ctq_lock);
2355 	cte_publish(&ct->ct_type->ct_type_events, e, &ts, B_FALSE);
2356 
2357 	/*
2358 	 * CTEL_PBUNDLE - Finally, if the contract has an owner,
2359 	 * deliver to the owner's process bundle queue.
2360 	 */
2361 	mutex_enter(&ct->ct_lock);
2362 	if (ct->ct_owner) {
2363 		/*
2364 		 * proc_exit doesn't free event queues until it has
2365 		 * abandoned all contracts.
2366 		 */
2367 		ASSERT(ct->ct_owner->p_ct_equeue);
2368 		ASSERT(ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]);
2369 		q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index];
2370 		mutex_enter(&q->ctq_lock);
2371 		mutex_exit(&ct->ct_lock);
2372 
2373 		/*
2374 		 * It is possible for this code to race with adoption; we
2375 		 * publish the event indicating that the event may already
2376 		 * be enqueued because adoption beat us to it (in which case
2377 		 * cte_pubish() does nothing).
2378 		 */
2379 		cte_publish(q, e, &ts, B_TRUE);
2380 	} else {
2381 		mutex_exit(&ct->ct_lock);
2382 		cte_rele(e);
2383 	}
2384 
2385 	if (negend) {
2386 		mutex_enter(&ct->ct_lock);
2387 		negev = ct->ct_nevent;
2388 		ct->ct_nevent = NULL;
2389 		cte_rele(negev);
2390 		mutex_exit(&ct->ct_lock);
2391 	}
2392 
2393 	mutex_exit(&ct->ct_evtlock);
2394 
2395 	return (evid);
2396 }
2397 
2398 /*
2399  * cte_add_listener
2400  *
2401  * Add a new listener to an event queue.
2402  */
2403 void
2404 cte_add_listener(ct_equeue_t *q, ct_listener_t *l)
2405 {
2406 	cv_init(&l->ctl_cv, NULL, CV_DEFAULT, NULL);
2407 	l->ctl_equeue = q;
2408 	l->ctl_position = NULL;
2409 	l->ctl_flags = 0;
2410 
2411 	mutex_enter(&q->ctq_lock);
2412 	list_insert_head(&q->ctq_tail, l);
2413 	list_insert_head(&q->ctq_listeners, l);
2414 	q->ctq_nlisteners++;
2415 	mutex_exit(&q->ctq_lock);
2416 }
2417 
2418 /*
2419  * cte_remove_listener
2420  *
2421  * Remove a listener from an event queue.  No other queue activities
2422  * (e.g. cte_get event) may be in progress at this endpoint when this
2423  * is called.
2424  */
2425 void
2426 cte_remove_listener(ct_listener_t *l)
2427 {
2428 	ct_equeue_t *q = l->ctl_equeue;
2429 	ct_kevent_t *e;
2430 
2431 	mutex_enter(&q->ctq_lock);
2432 
2433 	ASSERT((l->ctl_flags & (CTLF_COPYOUT|CTLF_RESET)) == 0);
2434 
2435 	if ((e = l->ctl_position) != NULL)
2436 		cte_qrele(q, l, e);
2437 	else
2438 		list_remove(&q->ctq_tail, l);
2439 	l->ctl_position = NULL;
2440 
2441 	q->ctq_nlisteners--;
2442 	list_remove(&q->ctq_listeners, l);
2443 
2444 	if (l->ctl_flags & CTLF_RELIABLE)
2445 		q->ctq_nreliable--;
2446 
2447 	/*
2448 	 * If we are a the last listener of a dead reference counted
2449 	 * queue (i.e. a process bundle) we free it.  Otherwise we just
2450 	 * trim any events which may have been kept around for our
2451 	 * benefit.
2452 	 */
2453 	if ((q->ctq_flags & CTQ_REFFED) && (q->ctq_flags & CTQ_DEAD) &&
2454 	    (q->ctq_nlisteners == 0)) {
2455 		cte_queue_destroy(q);
2456 	} else {
2457 		cte_trim(q, NULL);
2458 		mutex_exit(&q->ctq_lock);
2459 	}
2460 }
2461 
2462 /*
2463  * cte_reset_listener
2464  *
2465  * Moves a listener's queue pointer to the beginning of the queue.
2466  */
2467 void
2468 cte_reset_listener(ct_listener_t *l)
2469 {
2470 	ct_equeue_t *q = l->ctl_equeue;
2471 
2472 	mutex_enter(&q->ctq_lock);
2473 
2474 	/*
2475 	 * We allow an asynchronous reset because it doesn't make a
2476 	 * whole lot of sense to make reset block or fail.  We already
2477 	 * have most of the mechanism needed thanks to queue trimming,
2478 	 * so implementing it isn't a big deal.
2479 	 */
2480 	if (l->ctl_flags & CTLF_COPYOUT)
2481 		l->ctl_flags |= CTLF_RESET;
2482 
2483 	(void) cte_qmove(q, l, list_head(&q->ctq_events));
2484 
2485 	/*
2486 	 * Inform blocked readers.
2487 	 */
2488 	cv_broadcast(&l->ctl_cv);
2489 	pollwakeup(&l->ctl_pollhead, POLLIN);
2490 	mutex_exit(&q->ctq_lock);
2491 }
2492 
2493 /*
2494  * cte_next_event
2495  *
2496  * Moves the event pointer for the specified listener to the next event
2497  * on the queue.  To avoid races, this movement only occurs if the
2498  * specified event id matches that of the current event.  This is used
2499  * primarily to skip events that have been read but whose extended data
2500  * haven't been copied out.
2501  */
2502 int
2503 cte_next_event(ct_listener_t *l, uint64_t id)
2504 {
2505 	ct_equeue_t *q = l->ctl_equeue;
2506 	ct_kevent_t *old;
2507 
2508 	mutex_enter(&q->ctq_lock);
2509 
2510 	if (l->ctl_flags & CTLF_COPYOUT)
2511 		l->ctl_flags |= CTLF_RESET;
2512 
2513 	if (((old = l->ctl_position) != NULL) && (old->cte_id == id))
2514 		(void) cte_qmove(q, l, list_next(&q->ctq_events, old));
2515 
2516 	mutex_exit(&q->ctq_lock);
2517 
2518 	return (0);
2519 }
2520 
2521 /*
2522  * cte_get_event
2523  *
2524  * Reads an event from an event endpoint.  If "nonblock" is clear, we
2525  * block until a suitable event is ready.  If "crit" is set, we only
2526  * read critical events.  Note that while "cr" is the caller's cred,
2527  * "zuniqid" is the unique id of the zone the calling contract
2528  * filesystem was mounted in.
2529  */
2530 int
2531 cte_get_event(ct_listener_t *l, int nonblock, void *uaddr, const cred_t *cr,
2532     uint64_t zuniqid, int crit)
2533 {
2534 	ct_equeue_t *q = l->ctl_equeue;
2535 	ct_kevent_t *temp;
2536 	int result = 0;
2537 	int partial = 0;
2538 	size_t size, gsize, len;
2539 	model_t mdl = get_udatamodel();
2540 	STRUCT_DECL(ct_event, ev);
2541 	STRUCT_INIT(ev, mdl);
2542 
2543 	/*
2544 	 * cte_qreadable checks for CTLF_COPYOUT as well as ensures
2545 	 * that there exists, and we are pointing to, an appropriate
2546 	 * event.  It may temporarily drop ctq_lock, but that doesn't
2547 	 * really matter to us.
2548 	 */
2549 	mutex_enter(&q->ctq_lock);
2550 	while (cte_qreadable(q, l, cr, zuniqid, crit)) {
2551 		if (nonblock) {
2552 			result = EAGAIN;
2553 			goto error;
2554 		}
2555 		if (q->ctq_flags & CTQ_DEAD) {
2556 			result = EIDRM;
2557 			goto error;
2558 		}
2559 		result = cv_wait_sig(&l->ctl_cv, &q->ctq_lock);
2560 		if (result == 0) {
2561 			result = EINTR;
2562 			goto error;
2563 		}
2564 	}
2565 	temp = l->ctl_position;
2566 	cte_hold(temp);
2567 	l->ctl_flags |= CTLF_COPYOUT;
2568 	mutex_exit(&q->ctq_lock);
2569 
2570 	/*
2571 	 * We now have an event.  Copy in the user event structure to
2572 	 * see how much space we have to work with.
2573 	 */
2574 	result = copyin(uaddr, STRUCT_BUF(ev), STRUCT_SIZE(ev));
2575 	if (result)
2576 		goto copyerr;
2577 
2578 	/*
2579 	 * Determine what data we have and what the user should be
2580 	 * allowed to see.
2581 	 */
2582 	size = gsize = 0;
2583 	if (temp->cte_data) {
2584 		VERIFY(nvlist_size(temp->cte_data, &size,
2585 		    NV_ENCODE_NATIVE) == 0);
2586 		ASSERT(size != 0);
2587 	}
2588 	if (zuniqid == GLOBAL_ZONEUNIQID && temp->cte_gdata) {
2589 		VERIFY(nvlist_size(temp->cte_gdata, &gsize,
2590 		    NV_ENCODE_NATIVE) == 0);
2591 		ASSERT(gsize != 0);
2592 	}
2593 
2594 	/*
2595 	 * If we have enough space, copy out the extended event data.
2596 	 */
2597 	len = size + gsize;
2598 	if (len) {
2599 		if (STRUCT_FGET(ev, ctev_nbytes) >= len) {
2600 			char *buf = kmem_alloc(len, KM_SLEEP);
2601 
2602 			if (size)
2603 				VERIFY(nvlist_pack(temp->cte_data, &buf, &size,
2604 				    NV_ENCODE_NATIVE, KM_SLEEP) == 0);
2605 			if (gsize) {
2606 				char *tmp = buf + size;
2607 
2608 				VERIFY(nvlist_pack(temp->cte_gdata, &tmp,
2609 				    &gsize, NV_ENCODE_NATIVE, KM_SLEEP) == 0);
2610 			}
2611 
2612 			/* This shouldn't have changed */
2613 			ASSERT(size + gsize == len);
2614 			result = copyout(buf, STRUCT_FGETP(ev, ctev_buffer),
2615 			    len);
2616 			kmem_free(buf, len);
2617 			if (result)
2618 				goto copyerr;
2619 		} else {
2620 			partial = 1;
2621 		}
2622 	}
2623 
2624 	/*
2625 	 * Copy out the common event data.
2626 	 */
2627 	STRUCT_FSET(ev, ctev_id, temp->cte_contract->ct_id);
2628 	STRUCT_FSET(ev, ctev_evid, temp->cte_id);
2629 	STRUCT_FSET(ev, ctev_cttype,
2630 	    temp->cte_contract->ct_type->ct_type_index);
2631 	STRUCT_FSET(ev, ctev_flags, temp->cte_flags &
2632 	    (CTE_ACK|CTE_INFO|CTE_NEG));
2633 	STRUCT_FSET(ev, ctev_type, temp->cte_type);
2634 	STRUCT_FSET(ev, ctev_nbytes, len);
2635 	STRUCT_FSET(ev, ctev_goffset, size);
2636 	result = copyout(STRUCT_BUF(ev), uaddr, STRUCT_SIZE(ev));
2637 
2638 copyerr:
2639 	/*
2640 	 * Only move our location in the queue if all copyouts were
2641 	 * successful, the caller provided enough space for the entire
2642 	 * event, and our endpoint wasn't reset or otherwise moved by
2643 	 * another thread.
2644 	 */
2645 	mutex_enter(&q->ctq_lock);
2646 	if (result)
2647 		result = EFAULT;
2648 	else if (!partial && ((l->ctl_flags & CTLF_RESET) == 0) &&
2649 	    (l->ctl_position == temp))
2650 		(void) cte_qmove(q, l, list_next(&q->ctq_events, temp));
2651 	l->ctl_flags &= ~(CTLF_COPYOUT|CTLF_RESET);
2652 	/*
2653 	 * Signal any readers blocked on our CTLF_COPYOUT.
2654 	 */
2655 	cv_signal(&l->ctl_cv);
2656 	cte_rele(temp);
2657 
2658 error:
2659 	mutex_exit(&q->ctq_lock);
2660 	return (result);
2661 }
2662 
2663 /*
2664  * cte_set_reliable
2665  *
2666  * Requests that events be reliably delivered to an event endpoint.
2667  * Unread informative and acknowledged critical events will not be
2668  * removed from the queue until this listener reads or skips them.
2669  * Because a listener could maliciously request reliable delivery and
2670  * then do nothing, this requires that PRIV_CONTRACT_EVENT be in the
2671  * caller's effective set.
2672  */
2673 int
2674 cte_set_reliable(ct_listener_t *l, const cred_t *cr)
2675 {
2676 	ct_equeue_t *q = l->ctl_equeue;
2677 	int error;
2678 
2679 	if ((error = secpolicy_contract_event(cr)) != 0)
2680 		return (error);
2681 
2682 	mutex_enter(&q->ctq_lock);
2683 	if ((l->ctl_flags & CTLF_RELIABLE) == 0) {
2684 		l->ctl_flags |= CTLF_RELIABLE;
2685 		q->ctq_nreliable++;
2686 		if (l->ctl_position != NULL)
2687 			l->ctl_position->cte_nodes[q->ctq_listno].
2688 			    ctm_nreliable++;
2689 	}
2690 	mutex_exit(&q->ctq_lock);
2691 
2692 	return (0);
2693 }
2694