xref: /titanic_41/usr/src/uts/common/os/contract.c (revision cc1a9a89a73172cc2db053635fab3b1b91691657)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Contracts
30  * ---------
31  *
32  * Contracts are a primitive which enrich the relationships between
33  * processes and system resources.  The primary purpose of contracts is
34  * to provide a means for the system to negotiate the departure from a
35  * binding relationship (e.g. pages locked in memory or a thread bound
36  * to processor), but they can also be used as a purely asynchronous
37  * error reporting mechanism as they are with process contracts.
38  *
39  * More information on how one interfaces with contracts and what
40  * contracts can do for you can be found in:
41  *   PSARC 2003/193 Solaris Contracts
42  *   PSARC 2004/460 Contracts addendum
43  *
44  * This file contains the core contracts framework.  By itself it is
45  * useless: it depends on the contracts filesystem (ctfs) to provide an
46  * interface to user processes and individual contract types to
47  * implement the process/resource relationships.
48  *
49  * Data structure overview
50  * -----------------------
51  *
52  * A contract is represented by a contract_t, which itself points to an
53  * encapsulating contract-type specific contract object.  A contract_t
54  * contains the contract's static identity (including its terms), its
55  * linkage to various bookkeeping structures, the contract-specific
56  * event queue, and a reference count.
57  *
58  * A contract template is represented by a ct_template_t, which, like a
59  * contract, points to an encapsulating contract-type specific template
60  * object.  A ct_template_t contains the template's terms.
61  *
62  * An event queue is represented by a ct_equeue_t, and consists of a
63  * list of events, a list of listeners, and a list of listeners who are
64  * waiting for new events (affectionately referred to as "tail
65  * listeners").  There are three queue types, defined by ct_listnum_t
66  * (an enum).  An event may be on one of each type of queue
67  * simultaneously; the list linkage used by a queue is determined by
68  * its type.
69  *
70  * An event is represented by a ct_kevent_t, which contains mostly
71  * static event data (e.g. id, payload).  It also has an array of
72  * ct_member_t structures, each of which contains a list_node_t and
73  * represents the event's linkage in a specific event queue.
74  *
75  * Each open of an event endpoint results in the creation of a new
76  * listener, represented by a ct_listener_t.  In addition to linkage
77  * into the aforementioned lists in the event_queue, a ct_listener_t
78  * contains a pointer to the ct_kevent_t it is currently positioned at
79  * as well as a set of status flags and other administrative data.
80  *
81  * Each process has a list of contracts it owns, p_ct_held; a pointer
82  * to the process contract it is a member of, p_ct_process; the linkage
83  * for that membership, p_ct_member; and an array of event queue
84  * structures representing the process bundle queues.
85  *
86  * Each LWP has an array of its active templates, lwp_ct_active; and
87  * the most recently created contracts, lwp_ct_latest.
88  *
89  * A process contract has a list of member processes and a list of
90  * inherited contracts.
91  *
92  * There is a system-wide list of all contracts, as well as per-type
93  * lists of contracts.
94  *
95  * Lock ordering overview
96  * ----------------------
97  *
98  * Locks at the top are taken first:
99  *
100  *                   ct_evtlock
101  *                   regent ct_lock
102  *                   member ct_lock
103  *                   pidlock
104  *                   p_lock
105  *    contract ctq_lock         contract_lock
106  *    pbundle ctq_lock
107  *    cte_lock
108  *                   ct_reflock
109  *
110  * contract_lock and ctq_lock/cte_lock are not currently taken at the
111  * same time.
112  *
113  * Reference counting and locking
114  * ------------------------------
115  *
116  * A contract has a reference count, protected by ct_reflock.
117  * (ct_reflock is also used in a couple other places where atomic
118  * access to a variable is needed in an innermost context).  A process
119  * maintains a hold on each contract it owns.  A process contract has a
120  * hold on each contract it has inherited.  Each event has a hold on
121  * the contract which generated it.  Process contract templates have
122  * holds on the contracts referred to by their transfer terms.  CTFS
123  * contract directory nodes have holds on contracts.  Lastly, various
124  * code paths may temporarily take holds on contracts to prevent them
125  * from disappearing while other processing is going on.  It is
126  * important to note that the global contract lists do not hold
127  * references on contracts; a contract is removed from these structures
128  * atomically with the release of its last reference.
129  *
130  * At a given point in time, a contract can either be owned by a
131  * process, inherited by a regent process contract, or orphaned.  A
132  * contract_t's  owner and regent pointers, ct_owner and ct_regent, are
133  * protected by its ct_lock.  The linkage in the holder's (holder =
134  * owner or regent) list of contracts, ct_ctlist, is protected by
135  * whatever lock protects the holder's data structure.  In order for
136  * these two directions to remain consistent, changing the holder of a
137  * contract requires that both locks be held.
138  *
139  * Events also have reference counts.  There is one hold on an event
140  * per queue it is present on, in addition to those needed for the
141  * usual sundry reasons.  Individual listeners are associated with
142  * specific queues, and increase a queue-specific reference count
143  * stored in the ct_member_t structure.
144  *
145  * The dynamic contents of an event (reference count and flags) are
146  * protected by its cte_lock, while the contents of the embedded
147  * ct_member_t structures are protected by the locks of the queues they
148  * are linked into.  A ct_listener_t's contents are also protected by
149  * its event queue's ctq_lock.
150  *
151  * Resource controls
152  * -----------------
153  *
154  * Control:      project.max-contracts (rc_project_contract)
155  * Description:  Maximum number of contracts allowed a project.
156  *
157  *   When a contract is created, the project's allocation is tested and
158  *   (assuming success) increased.  When the last reference to a
159  *   contract is released, the creating project's allocation is
160  *   decreased.
161  */
162 
163 #include <sys/mutex.h>
164 #include <sys/debug.h>
165 #include <sys/types.h>
166 #include <sys/param.h>
167 #include <sys/kmem.h>
168 #include <sys/thread.h>
169 #include <sys/id_space.h>
170 #include <sys/avl.h>
171 #include <sys/list.h>
172 #include <sys/sysmacros.h>
173 #include <sys/proc.h>
174 #include <sys/contract_impl.h>
175 #include <sys/contract/process_impl.h>
176 #include <sys/dditypes.h>
177 #include <sys/contract/device_impl.h>
178 #include <sys/systm.h>
179 #include <sys/atomic.h>
180 #include <sys/cmn_err.h>
181 #include <sys/model.h>
182 #include <sys/policy.h>
183 #include <sys/zone.h>
184 #include <sys/task.h>
185 #include <sys/ddi.h>
186 #include <sys/sunddi.h>
187 
/*
 * Handle for the project.max-contracts resource control; tested in
 * contract_ctor.
 */
extern rctl_hndl_t rc_project_contract;

/*
 * Framework-wide state.  contract_ids is the ID space from which
 * contract IDs are allocated.  contract_avl holds every contract,
 * sorted by ID via contract_compar.  contract_lock is held around
 * contract_avl updates and around changes to a project's contract
 * count (kpd_contract).
 */
static id_space_t	*contract_ids;
static avl_tree_t	contract_avl;
static kmutex_t		contract_lock;

/*
 * Table of contract types, sized for CTT_MAXTYPE entries.
 * NOTE(review): nothing visible in this part of the file fills in
 * ct_types or reads ct_debug; confirm their users elsewhere.
 */
int			ct_ntypes = CTT_MAXTYPE;
static ct_type_t	*ct_types_static[CTT_MAXTYPE];
ct_type_t		**ct_types = ct_types_static;
int			ct_debug;

/*
 * Forward declarations for the file-local event queue helpers used by
 * the contract routines below.
 */
static void cte_queue_create(ct_equeue_t *, ct_listnum_t, int, int);
static void cte_queue_destroy(ct_equeue_t *);
static void cte_queue_drain(ct_equeue_t *, int);
static void cte_trim(ct_equeue_t *, contract_t *);
static void cte_copy(ct_equeue_t *, ct_equeue_t *);
204 
205 /*
206  * contract_compar
207  *
208  * A contract comparator which sorts on contract ID.
209  */
210 int
211 contract_compar(const void *x, const void *y)
212 {
213 	const contract_t *ct1 = x;
214 	const contract_t *ct2 = y;
215 
216 	if (ct1->ct_id < ct2->ct_id)
217 		return (-1);
218 	if (ct1->ct_id > ct2->ct_id)
219 		return (1);
220 	return (0);
221 }
222 
223 /*
224  * contract_init
225  *
226  * Initializes the contract subsystem, the specific contract types, and
227  * process 0.
228  */
229 void
230 contract_init(void)
231 {
232 	/*
233 	 * Initialize contract subsystem.
234 	 */
235 	contract_ids = id_space_create("contracts", 1, INT_MAX);
236 	avl_create(&contract_avl, contract_compar, sizeof (contract_t),
237 	    offsetof(contract_t, ct_ctavl));
238 	mutex_init(&contract_lock, NULL, MUTEX_DEFAULT, NULL);
239 
240 	/*
241 	 * Initialize contract types.
242 	 */
243 	contract_process_init();
244 	contract_device_init();
245 
246 	/*
247 	 * Initialize p0/lwp0 contract state.
248 	 */
249 	avl_create(&p0.p_ct_held, contract_compar, sizeof (contract_t),
250 	    offsetof(contract_t, ct_ctlist));
251 }
252 
253 /*
254  * contract_dtor
255  *
256  * Performs basic destruction of the common portions of a contract.
257  * Called from the failure path of contract_ctor and from
258  * contract_rele.
259  */
260 static void
261 contract_dtor(contract_t *ct)
262 {
263 	cte_queue_destroy(&ct->ct_events);
264 	list_destroy(&ct->ct_vnodes);
265 	mutex_destroy(&ct->ct_reflock);
266 	mutex_destroy(&ct->ct_lock);
267 	mutex_destroy(&ct->ct_evtlock);
268 }
269 
/*
 * contract_ctor
 *
 * Called by a contract type to initialize a contract.  Fails if the
 * max-contract resource control would have been exceeded.  After a
 * successful call to contract_ctor, the contract is unlocked and
 * visible in all namespaces; any type-specific initialization should
 * be completed before calling contract_ctor.  Returns 0 on success.
 *
 * Because not all callers can tolerate failure, a 0 value for canfail
 * instructs contract_ctor to ignore the project.max-contracts resource
 * control.  Obviously, this "out" should only be employed by callers
 * who are sufficiently constrained in other ways (e.g. newproc).
 *
 * Arguments:
 *   ct      - the contract to initialize
 *   type    - the contract's type; ct is added to its AVL tree
 *   tmpl    - template supplying the event sets and cookie
 *   data    - type-specific pointer stored in ct_data
 *   flags   - initial ct_flags
 *   author  - the creating process; must be curproc
 *   canfail - nonzero if project.max-contracts should be enforced
 *
 * Returns 1 if the resource control denied the allocation, in which
 * case the common state set up here has already been torn down.
 */
int
contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data,
    ctflags_t flags, proc_t *author, int canfail)
{
	avl_index_t where;
	klwp_t *curlwp = ttolwp(curthread);

	ASSERT(author == curproc);

	mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ct->ct_evtlock, NULL, MUTEX_DEFAULT, NULL);
	ct->ct_id = id_alloc(contract_ids);

	cte_queue_create(&ct->ct_events, CTEL_CONTRACT, 20, 0);
	list_create(&ct->ct_vnodes, sizeof (contract_vnode_t),
	    offsetof(contract_vnode_t, ctv_node));

	/*
	 * Instance data
	 */
	ct->ct_ref = 2;		/* one for the holder, one for "latest" */
	ct->ct_cuid = crgetuid(CRED());
	ct->ct_type = type;
	ct->ct_data = data;
	gethrestime(&ct->ct_ctime);
	ct->ct_state = CTS_OWNED;
	ct->ct_flags = flags;
	/* The regent is the author's process contract, if it has one. */
	ct->ct_regent = author->p_ct_process ?
	    &author->p_ct_process->conp_contract : NULL;
	ct->ct_ev_info = tmpl->ctmpl_ev_info;
	ct->ct_ev_crit = tmpl->ctmpl_ev_crit;
	ct->ct_cookie = tmpl->ctmpl_cookie;
	ct->ct_owner = author;
	/* A total of -1 is what get_time_left treats as "no time set". */
	ct->ct_ntime.ctm_total = -1;
	ct->ct_qtime.ctm_total = -1;
	ct->ct_nevent = NULL;

	/*
	 * Test project.max-contracts.
	 */
	mutex_enter(&author->p_lock);
	mutex_enter(&contract_lock);
	if (canfail && rctl_test(rc_project_contract,
	    author->p_task->tk_proj->kpj_rctls, author, 1,
	    RCA_SAFE) & RCT_DENY) {
		/*
		 * Denied: give back the ID and undo the common setup.
		 * NOTE(review): the queue is flagged CTQ_DEAD before
		 * contract_dtor, presumably because cte_queue_destroy
		 * expects a dead queue -- confirm.
		 */
		id_free(contract_ids, ct->ct_id);
		mutex_exit(&contract_lock);
		mutex_exit(&author->p_lock);
		ct->ct_events.ctq_flags |= CTQ_DEAD;
		contract_dtor(ct);
		return (1);
	}
	/* Charge the contract to the author's project and hold it. */
	ct->ct_proj = author->p_task->tk_proj;
	ct->ct_proj->kpj_data.kpd_contract++;
	(void) project_hold(ct->ct_proj);
	mutex_exit(&contract_lock);

	/*
	 * Insert into holder's avl of contracts.
	 * We use an avl not because order is important, but because
	 * readdir of /proc/contracts requires we be able to use a
	 * scalar as an index into the process's list of contracts
	 */
	ct->ct_zoneid = author->p_zone->zone_id;
	ct->ct_czuniqid = ct->ct_mzuniqid = author->p_zone->zone_uniqid;
	VERIFY(avl_find(&author->p_ct_held, ct, &where) == NULL);
	avl_insert(&author->p_ct_held, ct, where);
	mutex_exit(&author->p_lock);

	/*
	 * Insert into global contract AVL
	 */
	mutex_enter(&contract_lock);
	VERIFY(avl_find(&contract_avl, ct, &where) == NULL);
	avl_insert(&contract_avl, ct, where);
	mutex_exit(&contract_lock);

	/*
	 * Insert into type AVL
	 */
	mutex_enter(&type->ct_type_lock);
	VERIFY(avl_find(&type->ct_type_avl, ct, &where) == NULL);
	avl_insert(&type->ct_type_avl, ct, where);
	type->ct_type_timestruc = ct->ct_ctime;
	mutex_exit(&type->ct_type_lock);

	/*
	 * Record this contract as the LWP's most recent of its type,
	 * dropping the hold on whichever contract it replaces.  The
	 * second of the two initial references belongs to this slot.
	 */
	if (curlwp->lwp_ct_latest[type->ct_type_index])
		contract_rele(curlwp->lwp_ct_latest[type->ct_type_index]);
	curlwp->lwp_ct_latest[type->ct_type_index] = ct;

	return (0);
}
377 
378 /*
379  * contract_rele
380  *
381  * Releases a reference to a contract.  If the caller had the last
382  * reference, the contract is removed from all namespaces, its
383  * allocation against the max-contracts resource control is released,
384  * and the contract type's free entry point is invoked for any
385  * type-specific deconstruction and to (presumably) free the object.
386  */
387 void
388 contract_rele(contract_t *ct)
389 {
390 	uint64_t nref;
391 
392 	mutex_enter(&ct->ct_reflock);
393 	ASSERT(ct->ct_ref > 0);
394 	nref = --ct->ct_ref;
395 	mutex_exit(&ct->ct_reflock);
396 	if (nref == 0) {
397 		/*
398 		 * ct_owner is cleared when it drops its reference.
399 		 */
400 		ASSERT(ct->ct_owner == NULL);
401 		ASSERT(ct->ct_evcnt == 0);
402 
403 		/*
404 		 * Remove from global contract AVL
405 		 */
406 		mutex_enter(&contract_lock);
407 		avl_remove(&contract_avl, ct);
408 		mutex_exit(&contract_lock);
409 
410 		/*
411 		 * Remove from type AVL
412 		 */
413 		mutex_enter(&ct->ct_type->ct_type_lock);
414 		avl_remove(&ct->ct_type->ct_type_avl, ct);
415 		mutex_exit(&ct->ct_type->ct_type_lock);
416 
417 		/*
418 		 * Release the contract's ID
419 		 */
420 		id_free(contract_ids, ct->ct_id);
421 
422 		/*
423 		 * Release project hold
424 		 */
425 		mutex_enter(&contract_lock);
426 		ct->ct_proj->kpj_data.kpd_contract--;
427 		project_rele(ct->ct_proj);
428 		mutex_exit(&contract_lock);
429 
430 		/*
431 		 * Free the contract
432 		 */
433 		contract_dtor(ct);
434 		ct->ct_type->ct_type_ops->contop_free(ct);
435 	}
436 }
437 
438 /*
439  * contract_hold
440  *
441  * Adds a reference to a contract
442  */
443 void
444 contract_hold(contract_t *ct)
445 {
446 	mutex_enter(&ct->ct_reflock);
447 	ASSERT(ct->ct_ref < UINT64_MAX);
448 	ct->ct_ref++;
449 	mutex_exit(&ct->ct_reflock);
450 }
451 
452 /*
453  * contract_getzuniqid
454  *
455  * Get a contract's zone unique ID.  Needed because 64-bit reads and
456  * writes aren't atomic on x86.  Since there are contexts where we are
457  * unable to take ct_lock, we instead use ct_reflock; in actuality any
458  * lock would do.
459  */
460 uint64_t
461 contract_getzuniqid(contract_t *ct)
462 {
463 	uint64_t zuniqid;
464 
465 	mutex_enter(&ct->ct_reflock);
466 	zuniqid = ct->ct_mzuniqid;
467 	mutex_exit(&ct->ct_reflock);
468 
469 	return (zuniqid);
470 }
471 
472 /*
473  * contract_setzuniqid
474  *
475  * Sets a contract's zone unique ID.   See contract_getzuniqid.
476  */
477 void
478 contract_setzuniqid(contract_t *ct, uint64_t zuniqid)
479 {
480 	mutex_enter(&ct->ct_reflock);
481 	ct->ct_mzuniqid = zuniqid;
482 	mutex_exit(&ct->ct_reflock);
483 }
484 
/*
 * contract_abandon
 *
 * Abandons the specified contract.  If "explicit" is clear, the
 * contract was implicitly abandoned (by process exit) and should be
 * inherited if its terms allow it and its owner was a member of a
 * regent contract.  Otherwise, the contract type's abandon entry point
 * is invoked to either destroy or orphan the contract.
 *
 * Arguments:
 *   ct       - the contract being abandoned
 *   p        - the abandoning process; must be curproc
 *   explicit - nonzero for an explicit abandon, zero for process exit
 *
 * Returns EINVAL if p does not own the contract, 0 otherwise.
 */
int
contract_abandon(contract_t *ct, proc_t *p, int explicit)
{
	ct_equeue_t *q = NULL;
	contract_t *parent = &p->p_ct_process->conp_contract;
	int inherit = 0;

	ASSERT(p == curproc);

	mutex_enter(&ct->ct_lock);

	/*
	 * Multiple contract locks are taken contract -> subcontract.
	 * Check if the contract will be inherited so we can acquire
	 * all the necessary locks before making sensitive changes.
	 */
	if (!explicit && (ct->ct_flags & CTF_INHERIT) &&
	    contract_process_accept(parent)) {
		/* Re-take the locks in regent -> member order. */
		mutex_exit(&ct->ct_lock);
		mutex_enter(&parent->ct_lock);
		mutex_enter(&ct->ct_lock);
		inherit = 1;
	}

	/* Only the current owner may abandon the contract. */
	if (ct->ct_owner != p) {
		mutex_exit(&ct->ct_lock);
		if (inherit)
			mutex_exit(&parent->ct_lock);
		return (EINVAL);
	}

	/*
	 * On the implicit (exit) path the caller, contract_exit, is
	 * already dismantling p_ct_held with avl_destroy_nodes, so the
	 * contract is only removed from the tree here when the abandon
	 * is explicit.
	 */
	mutex_enter(&p->p_lock);
	if (explicit)
		avl_remove(&p->p_ct_held, ct);
	ct->ct_owner = NULL;
	mutex_exit(&p->p_lock);

	/*
	 * Since we can't call cte_trim with the contract lock held,
	 * we grab the queue pointer here.
	 */
	if (p->p_ct_equeue)
		q = p->p_ct_equeue[ct->ct_type->ct_type_index];

	/*
	 * contop_abandon may destroy the contract so we rely on it to
	 * drop ct_lock.  We retain a reference on the contract so that
	 * the cte_trim which follows functions properly.  Even though
	 * cte_trim doesn't dereference the contract pointer, it is
	 * still necessary to retain a reference to the contract so
	 * that we don't trim events which are sent by a subsequently
	 * allocated contract infortuitously located at the same address.
	 */
	contract_hold(ct);

	if (inherit) {
		ct->ct_state = CTS_INHERITED;
		ASSERT(ct->ct_regent == parent);
		contract_process_take(parent, ct);

		/*
		 * We are handing off the process's reference to the
		 * parent contract.  For this reason, the order in
		 * which we drop the contract locks is also important.
		 */
		mutex_exit(&ct->ct_lock);
		mutex_exit(&parent->ct_lock);
	} else {
		ct->ct_regent = NULL;
		ct->ct_type->ct_type_ops->contop_abandon(ct);
	}

	/*
	 * ct_lock has been dropped; we can safely trim the event
	 * queue now.
	 */
	if (q) {
		mutex_enter(&q->ctq_lock);
		cte_trim(q, ct);
		mutex_exit(&q->ctq_lock);
	}

	/* Drop the temporary hold taken above for cte_trim's benefit. */
	contract_rele(ct);

	return (0);
}
580 
581 int
582 contract_newct(contract_t *ct)
583 {
584 	return (ct->ct_type->ct_type_ops->contop_newct(ct));
585 }
586 
/*
 * contract_adopt
 *
 * Adopts a contract.  After a successful call to this routine, the
 * previously inherited contract will belong to the calling process,
 * and its events will have been appended to its new owner's process
 * bundle queue.
 *
 * Arguments:
 *   ct - the inherited contract to adopt
 *   p  - the adopting process; must be curproc
 *
 * Returns 0 on success, EINVAL if the contract is not inherited, is
 * not inherited by p's process contract, or was created in a
 * different zone instance than p's, and EBUSY if the contract stopped
 * being inherited while its lock was dropped.
 */
int
contract_adopt(contract_t *ct, proc_t *p)
{
	avl_index_t where;
	ct_equeue_t *q;
	contract_t *parent;

	ASSERT(p == curproc);

	/*
	 * Ensure the process has an event queue.  Checked by ASSERTs
	 * below.
	 */
	(void) contract_type_pbundle(ct->ct_type, p);

	mutex_enter(&ct->ct_lock);
	parent = ct->ct_regent;
	if (ct->ct_state != CTS_INHERITED ||
	    &p->p_ct_process->conp_contract != parent ||
	    p->p_zone->zone_uniqid != ct->ct_czuniqid) {
		mutex_exit(&ct->ct_lock);
		return (EINVAL);
	}

	/*
	 * Multiple contract locks are taken contract -> subcontract.
	 */
	mutex_exit(&ct->ct_lock);
	mutex_enter(&parent->ct_lock);
	mutex_enter(&ct->ct_lock);

	/*
	 * It is possible that the contract was adopted by someone else
	 * while its lock was dropped.  It isn't possible for the
	 * contract to have been inherited by a different regent
	 * contract.
	 */
	if (ct->ct_state != CTS_INHERITED) {
		mutex_exit(&parent->ct_lock);
		mutex_exit(&ct->ct_lock);
		return (EBUSY);
	}
	ASSERT(ct->ct_regent == parent);

	ct->ct_state = CTS_OWNED;

	/*
	 * NOTE(review): parent->ct_lock is not released in this
	 * function after this point; presumably contract_process_adopt
	 * drops it -- confirm against its definition.
	 */
	contract_process_adopt(ct, p);

	/* Make p the owner and link the contract into its held tree. */
	mutex_enter(&p->p_lock);
	ct->ct_owner = p;
	VERIFY(avl_find(&p->p_ct_held, ct, &where) == NULL);
	avl_insert(&p->p_ct_held, ct, where);
	mutex_exit(&p->p_lock);

	/*
	 * Copy the contract's events onto the new owner's process
	 * bundle queue for this contract type.
	 */
	ASSERT(ct->ct_owner->p_ct_equeue);
	ASSERT(ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]);
	q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index];
	cte_copy(&ct->ct_events, q);
	mutex_exit(&ct->ct_lock);

	return (0);
}
657 
658 /*
659  * contract_ack
660  *
661  * Acknowledges receipt of a critical event.
662  */
663 int
664 contract_ack(contract_t *ct, uint64_t evid, int ack)
665 {
666 	ct_kevent_t *ev;
667 	list_t *queue = &ct->ct_events.ctq_events;
668 	int error = ESRCH;
669 	int nego = 0;
670 	uint_t evtype;
671 
672 	ASSERT(ack == CT_ACK || ack == CT_NACK);
673 
674 	mutex_enter(&ct->ct_lock);
675 	mutex_enter(&ct->ct_events.ctq_lock);
676 	/*
677 	 * We are probably ACKing something near the head of the queue.
678 	 */
679 	for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
680 		if (ev->cte_id == evid) {
681 			if (ev->cte_flags & CTE_NEG)
682 				nego = 1;
683 			else if (ack == CT_NACK)
684 				break;
685 			if ((ev->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
686 				ev->cte_flags |= CTE_ACK;
687 				ct->ct_evcnt--;
688 				evtype = ev->cte_type;
689 				error = 0;
690 			}
691 			break;
692 		}
693 	}
694 	mutex_exit(&ct->ct_events.ctq_lock);
695 	mutex_exit(&ct->ct_lock);
696 
697 	/*
698 	 * Not all critical events are negotiation events, however
699 	 * every negotiation event is a critical event. NEGEND events
700 	 * are critical events but are not negotiation events
701 	 */
702 	if (error || !nego)
703 		return (error);
704 
705 	if (ack == CT_ACK)
706 		error = ct->ct_type->ct_type_ops->contop_ack(ct, evtype, evid);
707 	else
708 		error = ct->ct_type->ct_type_ops->contop_nack(ct, evtype, evid);
709 
710 	return (error);
711 }
712 
713 /*ARGSUSED*/
714 int
715 contract_ack_inval(contract_t *ct, uint_t evtype, uint64_t evid)
716 {
717 	cmn_err(CE_PANIC, "contract_ack_inval: unsupported call: ctid: %u",
718 	    ct->ct_id);
719 	return (ENOSYS);
720 }
721 
722 /*ARGSUSED*/
723 int
724 contract_qack_inval(contract_t *ct, uint_t evtype, uint64_t evid)
725 {
726 	cmn_err(CE_PANIC, "contract_ack_inval: unsupported call: ctid: %u",
727 	    ct->ct_id);
728 	return (ENOSYS);
729 }
730 
/*
 * contract_qack_notsup
 *
 * Entry point that ignores all of its arguments and always returns
 * ERANGE.
 * NOTE(review): the name suggests it is the contop_qack handler for
 * types that do not support quantum extension -- confirm at the sites
 * where it is installed.
 */
/*ARGSUSED*/
int
contract_qack_notsup(contract_t *ct, uint_t evtype, uint64_t evid)
{
	return (ERANGE);
}
737 
738 /*
739  * contract_qack
740  *
741  * Asks that negotiations be extended by another time quantum
742  */
743 int
744 contract_qack(contract_t *ct, uint64_t evid)
745 {
746 	ct_kevent_t *ev;
747 	list_t *queue = &ct->ct_events.ctq_events;
748 	int nego = 0;
749 	uint_t evtype;
750 
751 	mutex_enter(&ct->ct_lock);
752 	mutex_enter(&ct->ct_events.ctq_lock);
753 
754 	for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
755 		if (ev->cte_id == evid) {
756 			if ((ev->cte_flags & (CTE_NEG | CTE_ACK)) == CTE_NEG) {
757 				evtype = ev->cte_type;
758 				nego = 1;
759 			}
760 			break;
761 		}
762 	}
763 	mutex_exit(&ct->ct_events.ctq_lock);
764 	mutex_exit(&ct->ct_lock);
765 
766 	/*
767 	 * Only a negotiated event (which is by definition also a critical
768 	 * event) which has not yet been acknowledged can provide
769 	 * time quanta to a negotiating owner process.
770 	 */
771 	if (!nego)
772 		return (ESRCH);
773 
774 	return (ct->ct_type->ct_type_ops->contop_qack(ct, evtype, evid));
775 }
776 
777 /*
778  * contract_orphan
779  *
780  * Icky-poo.  This is a process-contract special, used to ACK all
781  * critical messages when a contract is orphaned.
782  */
783 void
784 contract_orphan(contract_t *ct)
785 {
786 	ct_kevent_t *ev;
787 	list_t *queue = &ct->ct_events.ctq_events;
788 
789 	ASSERT(MUTEX_HELD(&ct->ct_lock));
790 	ASSERT(ct->ct_state != CTS_ORPHAN);
791 
792 	mutex_enter(&ct->ct_events.ctq_lock);
793 	ct->ct_state = CTS_ORPHAN;
794 	for (ev = list_head(queue); ev; ev = list_next(queue, ev)) {
795 		if ((ev->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
796 			ev->cte_flags |= CTE_ACK;
797 			ct->ct_evcnt--;
798 		}
799 	}
800 	mutex_exit(&ct->ct_events.ctq_lock);
801 
802 	ASSERT(ct->ct_evcnt == 0);
803 }
804 
/*
 * contract_destroy
 *
 * Explicit contract destruction.  Called when contract is empty.
 * The contract will actually stick around until all of its events are
 * removed from the bundle and process bundle queues, and all fds
 * which refer to it are closed.  See contract_dtor if you are looking
 * for what destroys the contract structure.
 *
 * The caller must hold ct_lock, and the contract must have no owner.
 * ct_lock is released by the time this function returns, and one
 * reference on the contract is dropped.
 */
void
contract_destroy(contract_t *ct)
{
	ASSERT(MUTEX_HELD(&ct->ct_lock));
	ASSERT(ct->ct_state != CTS_DEAD);
	ASSERT(ct->ct_owner == NULL);

	ct->ct_state = CTS_DEAD;
	cte_queue_drain(&ct->ct_events, 1);
	/*
	 * ct_lock is dropped around the trim of the type's bundle
	 * queue and re-taken for the type's destroy entry point.
	 */
	mutex_exit(&ct->ct_lock);
	mutex_enter(&ct->ct_type->ct_type_events.ctq_lock);
	cte_trim(&ct->ct_type->ct_type_events, ct);
	mutex_exit(&ct->ct_type->ct_type_events.ctq_lock);
	mutex_enter(&ct->ct_lock);
	ct->ct_type->ct_type_ops->contop_destroy(ct);
	mutex_exit(&ct->ct_lock);
	contract_rele(ct);
}
832 
833 /*
834  * contract_vnode_get
835  *
836  * Obtains the contract directory vnode for this contract, if there is
837  * one.  The caller must VN_RELE the vnode when they are through using
838  * it.
839  */
840 vnode_t *
841 contract_vnode_get(contract_t *ct, vfs_t *vfsp)
842 {
843 	contract_vnode_t *ctv;
844 	vnode_t *vp = NULL;
845 
846 	mutex_enter(&ct->ct_lock);
847 	for (ctv = list_head(&ct->ct_vnodes); ctv != NULL;
848 	    ctv = list_next(&ct->ct_vnodes, ctv))
849 		if (ctv->ctv_vnode->v_vfsp == vfsp) {
850 			vp = ctv->ctv_vnode;
851 			VN_HOLD(vp);
852 			break;
853 		}
854 	mutex_exit(&ct->ct_lock);
855 	return (vp);
856 }
857 
858 /*
859  * contract_vnode_set
860  *
861  * Sets the contract directory vnode for this contract.  We don't hold
862  * a reference on the vnode because we don't want to prevent it from
863  * being freed.  The vnode's inactive entry point will take care of
864  * notifying us when it should be removed.
865  */
866 void
867 contract_vnode_set(contract_t *ct, contract_vnode_t *ctv, vnode_t *vnode)
868 {
869 	mutex_enter(&ct->ct_lock);
870 	ctv->ctv_vnode = vnode;
871 	list_insert_head(&ct->ct_vnodes, ctv);
872 	mutex_exit(&ct->ct_lock);
873 }
874 
875 /*
876  * contract_vnode_clear
877  *
878  * Removes this vnode as the contract directory vnode for this
879  * contract.  Called from a contract directory's inactive entry point,
880  * this may return 0 indicating that the vnode gained another reference
881  * because of a simultaneous call to contract_vnode_get.
882  */
883 int
884 contract_vnode_clear(contract_t *ct, contract_vnode_t *ctv)
885 {
886 	vnode_t *vp = ctv->ctv_vnode;
887 	int result;
888 
889 	mutex_enter(&ct->ct_lock);
890 	mutex_enter(&vp->v_lock);
891 	if (vp->v_count == 1) {
892 		list_remove(&ct->ct_vnodes, ctv);
893 		result = 1;
894 	} else {
895 		vp->v_count--;
896 		result = 0;
897 	}
898 	mutex_exit(&vp->v_lock);
899 	mutex_exit(&ct->ct_lock);
900 
901 	return (result);
902 }
903 
904 /*
905  * contract_exit
906  *
907  * Abandons all contracts held by process p, and drains process p's
908  * bundle queues.  Called on process exit.
909  */
910 void
911 contract_exit(proc_t *p)
912 {
913 	contract_t *ct;
914 	void *cookie = NULL;
915 	int i;
916 
917 	ASSERT(p == curproc);
918 
919 	/*
920 	 * Abandon held contracts.  contract_abandon knows enough not
921 	 * to remove the contract from the list a second time.  We are
922 	 * exiting, so no locks are needed here.  But because
923 	 * contract_abandon will take p_lock, we need to make sure we
924 	 * aren't holding it.
925 	 */
926 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
927 	while ((ct = avl_destroy_nodes(&p->p_ct_held, &cookie)) != NULL)
928 		VERIFY(contract_abandon(ct, p, 0) == 0);
929 
930 	/*
931 	 * Drain pbundles.  Because a process bundle queue could have
932 	 * been passed to another process, they may not be freed right
933 	 * away.
934 	 */
935 	if (p->p_ct_equeue) {
936 		for (i = 0; i < CTT_MAXTYPE; i++)
937 			if (p->p_ct_equeue[i])
938 				cte_queue_drain(p->p_ct_equeue[i], 0);
939 		kmem_free(p->p_ct_equeue, CTT_MAXTYPE * sizeof (ct_equeue_t *));
940 	}
941 }
942 
943 static int
944 get_time_left(struct ct_time *t)
945 {
946 	clock_t ticks_elapsed;
947 	int secs_elapsed;
948 
949 	if (t->ctm_total == -1)
950 		return (-1);
951 
952 	ticks_elapsed = ddi_get_lbolt() - t->ctm_start;
953 	secs_elapsed = t->ctm_total - (drv_hztousec(ticks_elapsed)/MICROSEC);
954 	return (secs_elapsed > 0 ? secs_elapsed : 0);
955 }
956 
/*
 * contract_status_common
 *
 * Populates a ct_status structure.  Used by contract types in their
 * status entry points and ctfs when only common information is
 * requested.
 *
 * Arguments:
 *   ct     - the contract being reported on; ct_lock must be held
 *   zone   - the zone from whose point of view the status is reported
 *   status - the ct_status buffer to fill in
 *   model  - data model used to lay out the buffer
 */
void
contract_status_common(contract_t *ct, zone_t *zone, void *status,
    model_t model)
{
	STRUCT_HANDLE(ct_status, lstatus);

	STRUCT_SET_HANDLE(lstatus, model, status);
	ASSERT(MUTEX_HELD(&ct->ct_lock));
	/*
	 * The global zone, and the zone instance that created the
	 * contract, get the real zone, holder, and state.
	 */
	if (zone->zone_uniqid == GLOBAL_ZONEUNIQID ||
	    zone->zone_uniqid == ct->ct_czuniqid) {
		zone_t *czone;
		zoneid_t zoneid = -1;

		/*
		 * Contracts don't have holds on the zones they were
		 * created by.  If the contract's zone no longer
		 * exists, we say its zoneid is -1.
		 */
		if (zone->zone_uniqid == ct->ct_czuniqid ||
		    ct->ct_czuniqid == GLOBAL_ZONEUNIQID) {
			zoneid = ct->ct_zoneid;
		} else if ((czone = zone_find_by_id(ct->ct_zoneid)) != NULL) {
			/*
			 * A zone with this ID exists; report the ID
			 * only if its unique ID matches ct_mzuniqid.
			 */
			if (czone->zone_uniqid == ct->ct_mzuniqid)
				zoneid = ct->ct_zoneid;
			zone_rele(czone);
		}

		STRUCT_FSET(lstatus, ctst_zoneid, zoneid);
		/*
		 * Holder: the owner's pid when owned, the regent
		 * contract's ID when inherited, 0 otherwise.
		 */
		STRUCT_FSET(lstatus, ctst_holder,
		    (ct->ct_state == CTS_OWNED) ? ct->ct_owner->p_pid :
		    (ct->ct_state == CTS_INHERITED) ? ct->ct_regent->ct_id : 0);
		STRUCT_FSET(lstatus, ctst_state, ct->ct_state);
	} else {
		/*
		 * We are looking at a contract which was created by a
		 * process outside of our zone.  We provide fake zone,
		 * holder, and state information.
		 */

		STRUCT_FSET(lstatus, ctst_zoneid, zone->zone_id);
		/*
		 * Since "zone" can't disappear until the calling ctfs
		 * is unmounted, zone_zsched must be valid.
		 */
		STRUCT_FSET(lstatus, ctst_holder, (ct->ct_state < CTS_ORPHAN) ?
		    zone->zone_zsched->p_pid : 0);
		/*
		 * States ordered before CTS_ORPHAN are reported as
		 * CTS_OWNED (by the zone's zsched, above); the rest
		 * are reported as they are.
		 */
		STRUCT_FSET(lstatus, ctst_state, (ct->ct_state < CTS_ORPHAN) ?
		    CTS_OWNED : ct->ct_state);
	}
	/* The remaining fields do not depend on the viewing zone. */
	STRUCT_FSET(lstatus, ctst_nevents, ct->ct_evcnt);
	STRUCT_FSET(lstatus, ctst_ntime, get_time_left(&ct->ct_ntime));
	STRUCT_FSET(lstatus, ctst_qtime, get_time_left(&ct->ct_qtime));
	STRUCT_FSET(lstatus, ctst_nevid,
	    ct->ct_nevent ? ct->ct_nevent->cte_id : 0);
	STRUCT_FSET(lstatus, ctst_critical, ct->ct_ev_crit);
	STRUCT_FSET(lstatus, ctst_informative, ct->ct_ev_info);
	STRUCT_FSET(lstatus, ctst_cookie, ct->ct_cookie);
	STRUCT_FSET(lstatus, ctst_type, ct->ct_type->ct_type_index);
	STRUCT_FSET(lstatus, ctst_id, ct->ct_id);
}
1024 
1025 /*
1026  * contract_checkcred
1027  *
1028  * Determines if the specified contract is owned by a process with the
1029  * same effective uid as the specified credential.  The caller must
1030  * ensure that the uid spaces are the same.  Returns 1 on success.
1031  */
1032 static int
1033 contract_checkcred(contract_t *ct, const cred_t *cr)
1034 {
1035 	proc_t *p;
1036 	int fail = 1;
1037 
1038 	mutex_enter(&ct->ct_lock);
1039 	if ((p = ct->ct_owner) != NULL) {
1040 		mutex_enter(&p->p_crlock);
1041 		fail = crgetuid(cr) != crgetuid(p->p_cred);
1042 		mutex_exit(&p->p_crlock);
1043 	}
1044 	mutex_exit(&ct->ct_lock);
1045 
1046 	return (!fail);
1047 }
1048 
1049 /*
1050  * contract_owned
1051  *
1052  * Determines if the specified credential can view an event generated
1053  * by the specified contract.  If locked is set, the contract's ct_lock
1054  * is held and the caller will need to do additional work to determine
1055  * if they truly can see the event.  Returns 1 on success.
1056  */
1057 int
1058 contract_owned(contract_t *ct, const cred_t *cr, int locked)
1059 {
1060 	int owner, cmatch, zmatch;
1061 	uint64_t zuniqid, mzuniqid;
1062 	uid_t euid;
1063 
1064 	ASSERT(locked || MUTEX_NOT_HELD(&ct->ct_lock));
1065 
1066 	zuniqid = curproc->p_zone->zone_uniqid;
1067 	mzuniqid = contract_getzuniqid(ct);
1068 	euid = crgetuid(cr);
1069 
1070 	/*
1071 	 * owner: we own the contract
1072 	 * cmatch: we are in the creator's (and holder's) zone and our
1073 	 *   uid matches the creator's or holder's
1074 	 * zmatch: we are in the effective zone of a contract created
1075 	 *   in the global zone, and our uid matches that of the
1076 	 *   virtualized holder's (zsched/kcred)
1077 	 */
1078 	owner = (ct->ct_owner == curproc);
1079 	cmatch = (zuniqid == ct->ct_czuniqid) &&
1080 	    ((ct->ct_cuid == euid) || (!locked && contract_checkcred(ct, cr)));
1081 	zmatch = (ct->ct_czuniqid != mzuniqid) && (zuniqid == mzuniqid) &&
1082 	    (crgetuid(kcred) == euid);
1083 
1084 	return (owner || cmatch || zmatch);
1085 }
1086 
1087 
1088 /*
1089  * contract_type_init
1090  *
1091  * Called by contract types to register themselves with the contracts
1092  * framework.
1093  */
1094 ct_type_t *
1095 contract_type_init(ct_typeid_t type, const char *name, contops_t *ops,
1096     ct_f_default_t *dfault)
1097 {
1098 	ct_type_t *result;
1099 
1100 	ASSERT(type < CTT_MAXTYPE);
1101 
1102 	result = kmem_alloc(sizeof (ct_type_t), KM_SLEEP);
1103 
1104 	mutex_init(&result->ct_type_lock, NULL, MUTEX_DEFAULT, NULL);
1105 	avl_create(&result->ct_type_avl, contract_compar, sizeof (contract_t),
1106 	    offsetof(contract_t, ct_cttavl));
1107 	cte_queue_create(&result->ct_type_events, CTEL_BUNDLE, 20, 0);
1108 	result->ct_type_name = name;
1109 	result->ct_type_ops = ops;
1110 	result->ct_type_default = dfault;
1111 	result->ct_type_evid = 0;
1112 	gethrestime(&result->ct_type_timestruc);
1113 	result->ct_type_index = type;
1114 
1115 	ct_types[type] = result;
1116 
1117 	return (result);
1118 }
1119 
/*
 * contract_type_count
 *
 * Obtains the number of contracts of a particular type, i.e. the
 * number of nodes in the type's AVL tree, sampled under ct_type_lock.
 * The count is narrowed from ulong_t to int on return.
 */
int
contract_type_count(ct_type_t *type)
{
	ulong_t count;

	mutex_enter(&type->ct_type_lock);
	count = avl_numnodes(&type->ct_type_avl);
	mutex_exit(&type->ct_type_lock);

	return (count);
}
1136 
1137 /*
1138  * contract_type_max
1139  *
1140  * Obtains the maximum contract id of of a particular type.
1141  */
1142 ctid_t
1143 contract_type_max(ct_type_t *type)
1144 {
1145 	contract_t *ct;
1146 	ctid_t res;
1147 
1148 	mutex_enter(&type->ct_type_lock);
1149 	ct = avl_last(&type->ct_type_avl);
1150 	res = ct ? ct->ct_id : -1;
1151 	mutex_exit(&type->ct_type_lock);
1152 
1153 	return (res);
1154 }
1155 
1156 /*
1157  * contract_max
1158  *
1159  * Obtains the maximum contract id.
1160  */
1161 ctid_t
1162 contract_max(void)
1163 {
1164 	contract_t *ct;
1165 	ctid_t res;
1166 
1167 	mutex_enter(&contract_lock);
1168 	ct = avl_last(&contract_avl);
1169 	res = ct ? ct->ct_id : -1;
1170 	mutex_exit(&contract_lock);
1171 
1172 	return (res);
1173 }
1174 
1175 /*
1176  * contract_lookup_common
1177  *
1178  * Common code for contract_lookup and contract_type_lookup.  Takes a
1179  * pointer to an AVL tree to search in.  Should be called with the
1180  * appropriate tree-protecting lock held (unfortunately unassertable).
1181  */
1182 static ctid_t
1183 contract_lookup_common(avl_tree_t *tree, uint64_t zuniqid, ctid_t current)
1184 {
1185 	contract_t template, *ct;
1186 	avl_index_t where;
1187 	ctid_t res;
1188 
1189 	template.ct_id = current;
1190 	ct = avl_find(tree, &template, &where);
1191 	if (ct == NULL)
1192 		ct = avl_nearest(tree, where, AVL_AFTER);
1193 	if (zuniqid != GLOBAL_ZONEUNIQID)
1194 		while (ct && (contract_getzuniqid(ct) != zuniqid))
1195 			ct = AVL_NEXT(tree, ct);
1196 	res = ct ? ct->ct_id : -1;
1197 
1198 	return (res);
1199 }
1200 
/*
 * contract_type_lookup
 *
 * Returns the id of the first contract of the given type, at or after
 * the specified id, that is visible from the specified zone, or -1 if
 * there is none.  Wraps contract_lookup_common with the type's
 * ct_type_lock held.
 */
ctid_t
contract_type_lookup(ct_type_t *type, uint64_t zuniqid, ctid_t current)
{
	ctid_t res;

	mutex_enter(&type->ct_type_lock);
	res = contract_lookup_common(&type->ct_type_avl, zuniqid, current);
	mutex_exit(&type->ct_type_lock);

	return (res);
}
1218 
/*
 * contract_lookup
 *
 * Returns the id of the first contract, at or after the specified id,
 * that is visible from the specified zone, or -1 if there is none.
 * Wraps contract_lookup_common with contract_lock held.
 */
ctid_t
contract_lookup(uint64_t zuniqid, ctid_t current)
{
	ctid_t res;

	mutex_enter(&contract_lock);
	res = contract_lookup_common(&contract_avl, zuniqid, current);
	mutex_exit(&contract_lock);

	return (res);
}
1236 
1237 /*
1238  * contract_plookup
1239  *
1240  * Returns the next contract held by process p after the specified id,
1241  * visible from the specified zone.  Made complicated by the fact that
1242  * contracts visible in a zone but held by processes outside of the
1243  * zone need to appear as being held by zsched to zone members.
1244  */
1245 ctid_t
1246 contract_plookup(proc_t *p, ctid_t current, uint64_t zuniqid)
1247 {
1248 	contract_t template, *ct;
1249 	avl_index_t where;
1250 	ctid_t res;
1251 
1252 	template.ct_id = current;
1253 	if (zuniqid != GLOBAL_ZONEUNIQID &&
1254 	    (p->p_flag & (SSYS|SZONETOP)) == (SSYS|SZONETOP)) {
1255 		/* This is inelegant. */
1256 		mutex_enter(&contract_lock);
1257 		ct = avl_find(&contract_avl, &template, &where);
1258 		if (ct == NULL)
1259 			ct = avl_nearest(&contract_avl, where, AVL_AFTER);
1260 		while (ct && !(ct->ct_state < CTS_ORPHAN &&
1261 		    contract_getzuniqid(ct) == zuniqid &&
1262 		    ct->ct_czuniqid == GLOBAL_ZONEUNIQID))
1263 			ct = AVL_NEXT(&contract_avl, ct);
1264 		res = ct ? ct->ct_id : -1;
1265 		mutex_exit(&contract_lock);
1266 	} else {
1267 		mutex_enter(&p->p_lock);
1268 		ct = avl_find(&p->p_ct_held, &template, &where);
1269 		if (ct == NULL)
1270 			ct = avl_nearest(&p->p_ct_held, where, AVL_AFTER);
1271 		res = ct ? ct->ct_id : -1;
1272 		mutex_exit(&p->p_lock);
1273 	}
1274 
1275 	return (res);
1276 }
1277 
1278 /*
1279  * contract_ptr_common
1280  *
1281  * Common code for contract_ptr and contract_type_ptr.  Takes a pointer
1282  * to an AVL tree to search in.  Should be called with the appropriate
1283  * tree-protecting lock held (unfortunately unassertable).
1284  */
1285 static contract_t *
1286 contract_ptr_common(avl_tree_t *tree, ctid_t id, uint64_t zuniqid)
1287 {
1288 	contract_t template, *ct;
1289 
1290 	template.ct_id = id;
1291 	ct = avl_find(tree, &template, NULL);
1292 	if (ct == NULL || (zuniqid != GLOBAL_ZONEUNIQID &&
1293 	    contract_getzuniqid(ct) != zuniqid)) {
1294 		return (NULL);
1295 	}
1296 
1297 	/*
1298 	 * Check to see if a thread is in the window in contract_rele
1299 	 * between dropping the reference count and removing the
1300 	 * contract from the type AVL.
1301 	 */
1302 	mutex_enter(&ct->ct_reflock);
1303 	if (ct->ct_ref) {
1304 		ct->ct_ref++;
1305 		mutex_exit(&ct->ct_reflock);
1306 	} else {
1307 		mutex_exit(&ct->ct_reflock);
1308 		ct = NULL;
1309 	}
1310 
1311 	return (ct);
1312 }
1313 
/*
 * contract_type_ptr
 *
 * Returns a pointer to the contract of the given type with the
 * specified id, or NULL if contract_ptr_common finds none that is
 * visible from 'zuniqid'.  The contract is held, so the caller needs
 * to release the reference when it is through with the contract.
 * The lookup is done with the type's ct_type_lock held.
 */
contract_t *
contract_type_ptr(ct_type_t *type, ctid_t id, uint64_t zuniqid)
{
	contract_t *ct;

	mutex_enter(&type->ct_type_lock);
	ct = contract_ptr_common(&type->ct_type_avl, id, zuniqid);
	mutex_exit(&type->ct_type_lock);

	return (ct);
}
1332 
/*
 * contract_ptr
 *
 * Returns a pointer to the contract with the specified id, or NULL if
 * contract_ptr_common finds none that is visible from 'zuniqid'.  The
 * contract is held, so the caller needs to release the reference when
 * it is through with the contract.  The lookup is done with
 * contract_lock held.
 */
contract_t *
contract_ptr(ctid_t id, uint64_t zuniqid)
{
	contract_t *ct;

	mutex_enter(&contract_lock);
	ct = contract_ptr_common(&contract_avl, id, zuniqid);
	mutex_exit(&contract_lock);

	return (ct);
}
1351 
/*
 * contract_type_time
 *
 * Obtains the last time a contract of a particular type was created.
 * Copies the type's ct_type_timestruc into *time while holding
 * ct_type_lock, so the caller gets a consistent snapshot.
 */
void
contract_type_time(ct_type_t *type, timestruc_t *time)
{
	mutex_enter(&type->ct_type_lock);
	*time = type->ct_type_timestruc;
	mutex_exit(&type->ct_type_lock);
}
1364 
/*
 * contract_type_bundle
 *
 * Obtains a type's bundle queue.  The queue is embedded in the
 * ct_type_t, so the returned pointer is the address of the type's
 * ct_type_events member; no lock is taken.
 */
ct_equeue_t *
contract_type_bundle(ct_type_t *type)
{
	return (&type->ct_type_events);
}
1375 
1376 /*
1377  * contract_type_pbundle
1378  *
1379  * Obtain's a process's bundle queue.  If one doesn't exist, one is
1380  * created.  Often used simply to ensure that a bundle queue is
1381  * allocated.
1382  */
1383 ct_equeue_t *
1384 contract_type_pbundle(ct_type_t *type, proc_t *pp)
1385 {
1386 	/*
1387 	 * If there isn't an array of bundle queues, allocate one.
1388 	 */
1389 	if (pp->p_ct_equeue == NULL) {
1390 		size_t size = CTT_MAXTYPE * sizeof (ct_equeue_t *);
1391 		ct_equeue_t **qa = kmem_zalloc(size, KM_SLEEP);
1392 
1393 		mutex_enter(&pp->p_lock);
1394 		if (pp->p_ct_equeue)
1395 			kmem_free(qa, size);
1396 		else
1397 			pp->p_ct_equeue = qa;
1398 		mutex_exit(&pp->p_lock);
1399 	}
1400 
1401 	/*
1402 	 * If there isn't a bundle queue of the required type, allocate
1403 	 * one.
1404 	 */
1405 	if (pp->p_ct_equeue[type->ct_type_index] == NULL) {
1406 		ct_equeue_t *q = kmem_zalloc(sizeof (ct_equeue_t), KM_SLEEP);
1407 		cte_queue_create(q, CTEL_PBUNDLE, 20, 1);
1408 
1409 		mutex_enter(&pp->p_lock);
1410 		if (pp->p_ct_equeue[type->ct_type_index])
1411 			cte_queue_drain(q, 0);
1412 		else
1413 			pp->p_ct_equeue[type->ct_type_index] = q;
1414 		mutex_exit(&pp->p_lock);
1415 	}
1416 
1417 	return (pp->p_ct_equeue[type->ct_type_index]);
1418 }
1419 
/*
 * ctmpl_free
 *
 * Frees a template.  The common portion's lock is destroyed here;
 * everything else is handed to the template's ctop_free operation.
 */
void
ctmpl_free(ct_template_t *template)
{
	mutex_destroy(&template->ctmpl_lock);
	template->ctmpl_ops->ctop_free(template);
}
1431 
1432 /*
1433  * ctmpl_dup
1434  *
1435  * Creates a copy of a template.
1436  */
1437 ct_template_t *
1438 ctmpl_dup(ct_template_t *template)
1439 {
1440 	ct_template_t *new;
1441 
1442 	if (template == NULL)
1443 		return (NULL);
1444 
1445 	new = template->ctmpl_ops->ctop_dup(template);
1446 	/*
1447 	 * ctmpl_lock was taken by ctop_dup's call to ctmpl_copy and
1448 	 * should have remain held until now.
1449 	 */
1450 	mutex_exit(&template->ctmpl_lock);
1451 
1452 	return (new);
1453 }
1454 
1455 /*
1456  * ctmpl_set
1457  *
1458  * Sets the requested terms of a template.
1459  */
1460 int
1461 ctmpl_set(ct_template_t *template, ct_param_t *param, const cred_t *cr)
1462 {
1463 	int result = 0;
1464 	uint64_t param_value;
1465 
1466 	if (param->ctpm_id == CTP_COOKIE ||
1467 	    param->ctpm_id == CTP_EV_INFO ||
1468 	    param->ctpm_id == CTP_EV_CRITICAL) {
1469 		if (param->ctpm_size < sizeof (uint64_t)) {
1470 			return (EINVAL);
1471 		} else {
1472 			param_value = *(uint64_t *)param->ctpm_value;
1473 		}
1474 	}
1475 
1476 	mutex_enter(&template->ctmpl_lock);
1477 	switch (param->ctpm_id) {
1478 	case CTP_COOKIE:
1479 		template->ctmpl_cookie = param_value;
1480 		break;
1481 	case CTP_EV_INFO:
1482 		if (param_value & ~(uint64_t)template->ctmpl_ops->allevents)
1483 			result = EINVAL;
1484 		else
1485 			template->ctmpl_ev_info = param_value;
1486 		break;
1487 	case CTP_EV_CRITICAL:
1488 		if (param_value & ~(uint64_t)template->ctmpl_ops->allevents) {
1489 			result = EINVAL;
1490 			break;
1491 		} else if ((~template->ctmpl_ev_crit & param_value) == 0) {
1492 			/*
1493 			 * Assume that a pure reduction of the critical
1494 			 * set is allowed by the contract type.
1495 			 */
1496 			template->ctmpl_ev_crit = param_value;
1497 			break;
1498 		}
1499 		/*
1500 		 * There may be restrictions on what we can make
1501 		 * critical, so we defer to the judgement of the
1502 		 * contract type.
1503 		 */
1504 		/* FALLTHROUGH */
1505 	default:
1506 		result = template->ctmpl_ops->ctop_set(template, param, cr);
1507 	}
1508 	mutex_exit(&template->ctmpl_lock);
1509 
1510 	return (result);
1511 }
1512 
1513 /*
1514  * ctmpl_get
1515  *
1516  * Obtains the requested terms from a template.
1517  *
1518  * If the term requested is a variable-sized term and the buffer
1519  * provided is too small for the data, we truncate the data and return
1520  * the buffer size necessary to fit the term in param->ctpm_size. If the
1521  * term requested is fix-sized (uint64_t) and the buffer provided is too
1522  * small, we return EINVAL.  This should never happen if you're using
1523  * libcontract(3LIB), only if you call ioctl with a hand constructed
1524  * ct_param_t argument.
1525  *
1526  * Currently, only contract specific parameters have variable-sized
1527  * parameters.
1528  */
1529 int
1530 ctmpl_get(ct_template_t *template, ct_param_t *param)
1531 {
1532 	int result = 0;
1533 	uint64_t *param_value;
1534 
1535 	if (param->ctpm_id == CTP_COOKIE ||
1536 	    param->ctpm_id == CTP_EV_INFO ||
1537 	    param->ctpm_id == CTP_EV_CRITICAL) {
1538 		if (param->ctpm_size < sizeof (uint64_t)) {
1539 			return (EINVAL);
1540 		} else {
1541 			param_value = param->ctpm_value;
1542 			param->ctpm_size = sizeof (uint64_t);
1543 		}
1544 	}
1545 
1546 	mutex_enter(&template->ctmpl_lock);
1547 	switch (param->ctpm_id) {
1548 	case CTP_COOKIE:
1549 		*param_value = template->ctmpl_cookie;
1550 		break;
1551 	case CTP_EV_INFO:
1552 		*param_value = template->ctmpl_ev_info;
1553 		break;
1554 	case CTP_EV_CRITICAL:
1555 		*param_value = template->ctmpl_ev_crit;
1556 		break;
1557 	default:
1558 		result = template->ctmpl_ops->ctop_get(template, param);
1559 	}
1560 	mutex_exit(&template->ctmpl_lock);
1561 
1562 	return (result);
1563 }
1564 
1565 /*
1566  * ctmpl_makecurrent
1567  *
1568  * Used by ctmpl_activate and ctmpl_clear to set the current thread's
1569  * active template.  Frees the old active template, if there was one.
1570  */
1571 static void
1572 ctmpl_makecurrent(ct_template_t *template, ct_template_t *new)
1573 {
1574 	klwp_t *curlwp = ttolwp(curthread);
1575 	proc_t *p = curproc;
1576 	ct_template_t *old;
1577 
1578 	mutex_enter(&p->p_lock);
1579 	old = curlwp->lwp_ct_active[template->ctmpl_type->ct_type_index];
1580 	curlwp->lwp_ct_active[template->ctmpl_type->ct_type_index] = new;
1581 	mutex_exit(&p->p_lock);
1582 
1583 	if (old)
1584 		ctmpl_free(old);
1585 }
1586 
/*
 * ctmpl_activate
 *
 * Copy the specified template as the current thread's active template
 * of that type.  The thread gets its own duplicate (via ctmpl_dup), so
 * the caller's template is not the one installed.
 */
void
ctmpl_activate(ct_template_t *template)
{
	ctmpl_makecurrent(template, ctmpl_dup(template));
}
1598 
/*
 * ctmpl_clear
 *
 * Clears the current thread's active template of the same type as the
 * specified template.  The specified template itself is only used to
 * identify the type.
 */
void
ctmpl_clear(ct_template_t *template)
{
	ctmpl_makecurrent(template, NULL);
}
1610 
/*
 * ctmpl_create
 *
 * Creates a new contract using the specified template by calling the
 * template's ctop_create operation with 'template' and 'ctidp', and
 * returns whatever that operation returns.
 */
int
ctmpl_create(ct_template_t *template, ctid_t *ctidp)
{
	return (template->ctmpl_ops->ctop_create(template, ctidp));
}
1621 
1622 /*
1623  * ctmpl_init
1624  *
1625  * Initializes the common portion of a new contract template.
1626  */
1627 void
1628 ctmpl_init(ct_template_t *new, ctmplops_t *ops, ct_type_t *type, void *data)
1629 {
1630 	mutex_init(&new->ctmpl_lock, NULL, MUTEX_DEFAULT, NULL);
1631 	new->ctmpl_ops = ops;
1632 	new->ctmpl_type = type;
1633 	new->ctmpl_data = data;
1634 	new->ctmpl_ev_info = new->ctmpl_ev_crit = 0;
1635 	new->ctmpl_cookie = 0;
1636 }
1637 
1638 /*
1639  * ctmpl_copy
1640  *
1641  * Copies the common portions of a contract template.  Intended for use
1642  * by a contract type's ctop_dup template op.  Returns with the old
1643  * template's lock held, which will should remain held until the
1644  * template op returns (it is dropped by ctmpl_dup).
1645  */
1646 void
1647 ctmpl_copy(ct_template_t *new, ct_template_t *old)
1648 {
1649 	mutex_init(&new->ctmpl_lock, NULL, MUTEX_DEFAULT, NULL);
1650 	mutex_enter(&old->ctmpl_lock);
1651 	new->ctmpl_ops = old->ctmpl_ops;
1652 	new->ctmpl_type = old->ctmpl_type;
1653 	new->ctmpl_ev_crit = old->ctmpl_ev_crit;
1654 	new->ctmpl_ev_info = old->ctmpl_ev_info;
1655 	new->ctmpl_cookie = old->ctmpl_cookie;
1656 }
1657 
/*
 * ctmpl_create_inval
 *
 * Returns EINVAL unconditionally; both arguments are ignored.
 * Provided for the convenience of those contract types which don't
 * support ct_tmpl_create(3contract) and would otherwise need to
 * create their own stub for the ctop_create template op.
 */
/*ARGSUSED*/
int
ctmpl_create_inval(ct_template_t *template, ctid_t *ctidp)
{
	return (EINVAL);
}
1672 
1673 
1674 /*
1675  * cte_queue_create
1676  *
1677  * Initializes a queue of a particular type.  If dynamic is set, the
1678  * queue is to be freed when its last listener is removed after being
1679  * drained.
1680  */
1681 static void
1682 cte_queue_create(ct_equeue_t *q, ct_listnum_t list, int maxinf, int dynamic)
1683 {
1684 	mutex_init(&q->ctq_lock, NULL, MUTEX_DEFAULT, NULL);
1685 	q->ctq_listno = list;
1686 	list_create(&q->ctq_events, sizeof (ct_kevent_t),
1687 	    offsetof(ct_kevent_t, cte_nodes[list].ctm_node));
1688 	list_create(&q->ctq_listeners, sizeof (ct_listener_t),
1689 	    offsetof(ct_listener_t, ctl_allnode));
1690 	list_create(&q->ctq_tail, sizeof (ct_listener_t),
1691 	    offsetof(ct_listener_t, ctl_tailnode));
1692 	gethrestime(&q->ctq_atime);
1693 	q->ctq_nlisteners = 0;
1694 	q->ctq_nreliable = 0;
1695 	q->ctq_ninf = 0;
1696 	q->ctq_max = maxinf;
1697 
1698 	/*
1699 	 * Bundle queues and contract queues are embedded in other
1700 	 * structures and are implicitly referenced counted by virtue
1701 	 * of their vnodes' indirect hold on their contracts.  Process
1702 	 * bundle queues are dynamically allocated and may persist
1703 	 * after the death of the process, so they must be explicitly
1704 	 * reference counted.
1705 	 */
1706 	q->ctq_flags = dynamic ? CTQ_REFFED : 0;
1707 }
1708 
/*
 * cte_queue_destroy
 *
 * Destroys the specified queue, which must already be marked dead and
 * have no listeners.  Tears down its three lists and its lock.  The
 * queue structure itself is freed only if it is reference counted
 * (CTQ_REFFED), i.e. was dynamically allocated.
 *
 * Note that cte_queue_drain calls this with ctq_lock still held; the
 * lock is destroyed here rather than released.
 */
static void
cte_queue_destroy(ct_equeue_t *q)
{
	ASSERT(q->ctq_flags & CTQ_DEAD);
	ASSERT(q->ctq_nlisteners == 0);
	ASSERT(q->ctq_nreliable == 0);
	list_destroy(&q->ctq_events);
	list_destroy(&q->ctq_listeners);
	list_destroy(&q->ctq_tail);
	mutex_destroy(&q->ctq_lock);
	/* Embedded queues are owned by their containing structure. */
	if (q->ctq_flags & CTQ_REFFED)
		kmem_free(q, sizeof (ct_equeue_t));
}
1728 
/*
 * cte_hold
 *
 * Takes a hold on the specified event by incrementing cte_refs under
 * cte_lock.  The event must already have at least one reference.
 */
static void
cte_hold(ct_kevent_t *e)
{
	mutex_enter(&e->cte_lock);
	ASSERT(e->cte_refs > 0);
	e->cte_refs++;
	mutex_exit(&e->cte_lock);
}
1742 
1743 /*
1744  * cte_rele
1745  *
1746  * Releases a hold on the specified event.  If the caller had the last
1747  * reference, frees the event and releases its hold on the contract
1748  * that generated it.
1749  */
1750 static void
1751 cte_rele(ct_kevent_t *e)
1752 {
1753 	mutex_enter(&e->cte_lock);
1754 	ASSERT(e->cte_refs > 0);
1755 	if (--e->cte_refs) {
1756 		mutex_exit(&e->cte_lock);
1757 		return;
1758 	}
1759 
1760 	contract_rele(e->cte_contract);
1761 
1762 	mutex_destroy(&e->cte_lock);
1763 	if (e->cte_data)
1764 		nvlist_free(e->cte_data);
1765 	if (e->cte_gdata)
1766 		nvlist_free(e->cte_gdata);
1767 	kmem_free(e, sizeof (ct_kevent_t));
1768 }
1769 
1770 /*
1771  * cte_qrele
1772  *
1773  * Remove this listener's hold on the specified event, removing and
1774  * releasing the queue's hold on the event if appropriate.
1775  */
1776 static void
1777 cte_qrele(ct_equeue_t *q, ct_listener_t *l, ct_kevent_t *e)
1778 {
1779 	ct_member_t *member = &e->cte_nodes[q->ctq_listno];
1780 
1781 	ASSERT(MUTEX_HELD(&q->ctq_lock));
1782 
1783 	if (l->ctl_flags & CTLF_RELIABLE)
1784 		member->ctm_nreliable--;
1785 	if ((--member->ctm_refs == 0) && member->ctm_trimmed) {
1786 		member->ctm_trimmed = 0;
1787 		list_remove(&q->ctq_events, e);
1788 		cte_rele(e);
1789 	}
1790 }
1791 
1792 /*
1793  * cte_qmove
1794  *
1795  * Move this listener to the specified event in the queue.
1796  */
1797 static ct_kevent_t *
1798 cte_qmove(ct_equeue_t *q, ct_listener_t *l, ct_kevent_t *e)
1799 {
1800 	ct_kevent_t *olde;
1801 
1802 	ASSERT(MUTEX_HELD(&q->ctq_lock));
1803 	ASSERT(l->ctl_equeue == q);
1804 
1805 	if ((olde = l->ctl_position) == NULL)
1806 		list_remove(&q->ctq_tail, l);
1807 
1808 	while (e != NULL && e->cte_nodes[q->ctq_listno].ctm_trimmed)
1809 		e = list_next(&q->ctq_events, e);
1810 
1811 	if (e != NULL) {
1812 		e->cte_nodes[q->ctq_listno].ctm_refs++;
1813 		if (l->ctl_flags & CTLF_RELIABLE)
1814 			e->cte_nodes[q->ctq_listno].ctm_nreliable++;
1815 	} else {
1816 		list_insert_tail(&q->ctq_tail, l);
1817 	}
1818 
1819 	l->ctl_position = e;
1820 	if (olde)
1821 		cte_qrele(q, l, olde);
1822 
1823 	return (e);
1824 }
1825 
1826 /*
1827  * cte_checkcred
1828  *
1829  * Determines if the specified event's contract is owned by a process
1830  * with the same effective uid as the specified credential.  Called
1831  * after a failed call to contract_owned with locked set.  Because it
1832  * drops the queue lock, its caller (cte_qreadable) needs to make sure
1833  * we're still in the same place after we return.  Returns 1 on
1834  * success.
1835  */
1836 static int
1837 cte_checkcred(ct_equeue_t *q, ct_kevent_t *e, const cred_t *cr)
1838 {
1839 	int result;
1840 	contract_t *ct = e->cte_contract;
1841 
1842 	cte_hold(e);
1843 	mutex_exit(&q->ctq_lock);
1844 	result = curproc->p_zone->zone_uniqid == ct->ct_czuniqid &&
1845 	    contract_checkcred(ct, cr);
1846 	mutex_enter(&q->ctq_lock);
1847 	cte_rele(e);
1848 
1849 	return (result);
1850 }
1851 
/*
 * cte_qreadable
 *
 * Ensures that the listener is pointing to a valid event that the
 * caller has the credentials to read, advancing the listener past
 * events it may not see.
 *
 * 'cr' is the reader's credential; when it is NULL no zone or
 * ownership checks are made.  'zuniqid' is the reader's zone.  If
 * 'crit' is set, informative and acknowledged events are skipped.
 * The caller must hold ctq_lock, which may be dropped and reacquired
 * along the way (see cte_checkcred).
 *
 * Returns 0 if we can read the event we're pointing to.  Returns
 * nonzero if the listener has CTLF_COPYOUT set or has run off the end
 * of the queue.
 */
static int
cte_qreadable(ct_equeue_t *q, ct_listener_t *l, const cred_t *cr,
    uint64_t zuniqid, int crit)
{
	ct_kevent_t *e, *next;
	contract_t *ct;

	ASSERT(MUTEX_HELD(&q->ctq_lock));
	ASSERT(l->ctl_equeue == q);

	if (l->ctl_flags & CTLF_COPYOUT)
		return (1);

	/*
	 * Each pass moves the listener to 'next' (cte_qmove skips
	 * trimmed events) and then either accepts that event (break)
	 * or picks the following candidate.
	 */
	next = l->ctl_position;
	while (e = cte_qmove(q, l, next)) {
		ct = e->cte_contract;
		/*
		 * Check obvious things first.  If we are looking for a
		 * critical message, is this one?  If we aren't in the
		 * global zone, is this message meant for us?
		 */
		if ((crit && (e->cte_flags & (CTE_INFO | CTE_ACK))) ||
		    (cr != NULL && zuniqid != GLOBAL_ZONEUNIQID &&
		    zuniqid != contract_getzuniqid(ct))) {

			next = list_next(&q->ctq_events, e);

		/*
		 * Next, see if our effective uid equals that of owner
		 * or author of the contract.  Since we are holding the
		 * queue lock, contract_owned can't always check if we
		 * have the same effective uid as the contract's
		 * owner.  If it comes to that, it fails and we take
		 * the slow(er) path.
		 */
		} else if (cr != NULL && !contract_owned(ct, cr, B_TRUE)) {

			/*
			 * At this point we either don't have any claim
			 * to this contract or we match the effective
			 * uid of the owner but couldn't tell.  We
			 * first test for a NULL holder so that events
			 * from orphans and inherited contracts avoid
			 * the penalty phase.
			 */
			if (e->cte_contract->ct_owner == NULL &&
			    !secpolicy_contract_observer_choice(cr))
				next = list_next(&q->ctq_events, e);

			/*
			 * cte_checkcred will juggle locks to see if we
			 * have the same uid as the event's contract's
			 * current owner.  If it succeeds, we have to
			 * make sure we are in the same point in the
			 * queue.
			 */
			else if (cte_checkcred(q, e, cr) &&
			    l->ctl_position == e)
				break;

			/*
			 * cte_checkcred failed; see if we're in the
			 * same place.  Note that the inner if/else
			 * pair below binds together, and the final
			 * else belongs to this position test.
			 */
			else if (l->ctl_position == e)
				if (secpolicy_contract_observer_choice(cr))
					break;
				else
					next = list_next(&q->ctq_events, e);

			/*
			 * cte_checkcred failed, and our position was
			 * changed.  Start from there.
			 */
			else
				next = l->ctl_position;
		} else {
			break;
		}
	}

	/*
	 * We check for CTLF_COPYOUT again in case we dropped the queue
	 * lock in cte_checkcred.
	 */
	return ((l->ctl_flags & CTLF_COPYOUT) || (l->ctl_position == NULL));
}
1946 
1947 /*
1948  * cte_qwakeup
1949  *
1950  * Wakes up any waiting listeners and points them at the specified event.
1951  */
1952 static void
1953 cte_qwakeup(ct_equeue_t *q, ct_kevent_t *e)
1954 {
1955 	ct_listener_t *l;
1956 
1957 	ASSERT(MUTEX_HELD(&q->ctq_lock));
1958 
1959 	while (l = list_head(&q->ctq_tail)) {
1960 		list_remove(&q->ctq_tail, l);
1961 		e->cte_nodes[q->ctq_listno].ctm_refs++;
1962 		if (l->ctl_flags & CTLF_RELIABLE)
1963 			e->cte_nodes[q->ctq_listno].ctm_nreliable++;
1964 		l->ctl_position = e;
1965 		cv_signal(&l->ctl_cv);
1966 		pollwakeup(&l->ctl_pollhead, POLLIN);
1967 	}
1968 }
1969 
1970 /*
1971  * cte_copy
1972  *
1973  * Copies events from the specified contract event queue to the
1974  * end of the specified process bundle queue.  Only called from
1975  * contract_adopt.
1976  *
1977  * We copy to the end of the target queue instead of mixing the events
1978  * in their proper order because otherwise the act of adopting a
1979  * contract would require a process to reset all process bundle
1980  * listeners it needed to see the new events.  This would, in turn,
1981  * require the process to keep track of which preexisting events had
1982  * already been processed.
1983  */
1984 static void
1985 cte_copy(ct_equeue_t *q, ct_equeue_t *newq)
1986 {
1987 	ct_kevent_t *e, *first = NULL;
1988 
1989 	ASSERT(q->ctq_listno == CTEL_CONTRACT);
1990 	ASSERT(newq->ctq_listno == CTEL_PBUNDLE);
1991 
1992 	mutex_enter(&q->ctq_lock);
1993 	mutex_enter(&newq->ctq_lock);
1994 
1995 	/*
1996 	 * For now, only copy critical events.
1997 	 */
1998 	for (e = list_head(&q->ctq_events); e != NULL;
1999 	    e = list_next(&q->ctq_events, e)) {
2000 		if ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0) {
2001 			if (first == NULL)
2002 				first = e;
2003 			list_insert_tail(&newq->ctq_events, e);
2004 			cte_hold(e);
2005 		}
2006 	}
2007 
2008 	mutex_exit(&q->ctq_lock);
2009 
2010 	if (first)
2011 		cte_qwakeup(newq, first);
2012 
2013 	mutex_exit(&newq->ctq_lock);
2014 }
2015 
/*
 * cte_trim
 *
 * Trims unneeded events from an event queue.  Algorithm works as
 * follows:
 *
 *   Removes all informative and acknowledged critical events until the
 *   first referenced event is found.
 *
 *   If a contract is specified, removes all events (regardless of
 *   acknowledgement) generated by that contract until the first event
 *   referenced by a reliable listener is found.  Referenced events are
 *   removed by marking them "trimmed".  Such events will be removed
 *   when the last reference is dropped and will be skipped by future
 *   listeners.
 *
 * This is pretty basic.  Ideally this should remove from the middle of
 * the list (i.e. beyond the first referenced event), and even
 * referenced events.
 *
 * 'ct' may be NULL, in which case only the first rule applies.  The
 * caller must hold ctq_lock.
 */
static void
cte_trim(ct_equeue_t *q, contract_t *ct)
{
	ct_kevent_t *e, *next;
	int flags, stopper;
	int start = 1;	/* cleared once a referenced event is passed */

	ASSERT(MUTEX_HELD(&q->ctq_lock));

	for (e = list_head(&q->ctq_events); e != NULL; e = next) {
		/* Fetch the successor first; 'e' may be freed below. */
		next = list_next(&q->ctq_events, e);
		flags = e->cte_flags;
		/*
		 * Outside of process bundle queues, an event with a
		 * reliable listener on it halts the walk.
		 */
		stopper = (q->ctq_listno != CTEL_PBUNDLE) &&
		    (e->cte_nodes[q->ctq_listno].ctm_nreliable > 0);
		if (e->cte_nodes[q->ctq_listno].ctm_refs == 0) {
			if ((start && (flags & (CTE_INFO | CTE_ACK))) ||
			    (e->cte_contract == ct)) {
				/*
				 * Toss informative and ACKed critical messages.
				 */
				list_remove(&q->ctq_events, e);
				cte_rele(e);
			}
		} else if ((e->cte_contract == ct) && !stopper) {
			/*
			 * Referenced event from the contract being
			 * trimmed: defer removal to the last listener.
			 */
			ASSERT(q->ctq_nlisteners != 0);
			e->cte_nodes[q->ctq_listno].ctm_trimmed = 1;
		} else if (ct && !stopper) {
			/*
			 * Referenced event from some other contract:
			 * keep walking for ct's events, but stop
			 * tossing unrelated ones.
			 */
			start = 0;
		} else {
			/*
			 * Don't free messages past the first reader.
			 */
			break;
		}
	}
}
2072 
/*
 * cte_queue_drain
 *
 * Drain all events from the specified queue, and mark it dead.  If
 * "ack" is set, acknowledge any critical events we find along the
 * way; in that case the caller must hold the ct_lock of each such
 * event's contract, since the contract's ct_evcnt is decremented.
 *
 * Takes ctq_lock itself.  Every listener on the queue is marked dead,
 * parked on the tail list, and woken.  If the queue is reference
 * counted and has no listeners it is destroyed (and freed) here;
 * otherwise ctq_lock is released before returning.
 */
static void
cte_queue_drain(ct_equeue_t *q, int ack)
{
	ct_kevent_t *e, *next;
	ct_listener_t *l;

	mutex_enter(&q->ctq_lock);

	for (e = list_head(&q->ctq_events); e != NULL; e = next) {
		/* Fetch the successor first; 'e' may be freed below. */
		next = list_next(&q->ctq_events, e);
		if (ack && ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0)) {
			/*
			 * Make sure critical messages are eventually
			 * removed from the bundle queues.
			 */
			mutex_enter(&e->cte_lock);
			e->cte_flags |= CTE_ACK;
			mutex_exit(&e->cte_lock);
			ASSERT(MUTEX_HELD(&e->cte_contract->ct_lock));
			e->cte_contract->ct_evcnt--;
		}
		/*
		 * Unlink the event and forget any listener references
		 * on this queue before dropping the queue's hold.
		 */
		list_remove(&q->ctq_events, e);
		e->cte_nodes[q->ctq_listno].ctm_refs = 0;
		e->cte_nodes[q->ctq_listno].ctm_nreliable = 0;
		e->cte_nodes[q->ctq_listno].ctm_trimmed = 0;
		cte_rele(e);
	}

	/*
	 * This is necessary only because of CTEL_PBUNDLE listeners;
	 * the events they point to can move from one pbundle to
	 * another.  Fortunately, this only happens if the contract is
	 * inherited, which (in turn) only happens if the process
	 * exits, which means it's an all-or-nothing deal.  If this
	 * wasn't the case, we would instead need to keep track of
	 * listeners on a per-event basis, not just a per-queue basis.
	 * This would have the side benefit of letting us clean up
	 * trimmed events sooner (i.e. immediately), but would
	 * unfortunately make events even bigger than they already
	 * are.
	 */
	for (l = list_head(&q->ctq_listeners); l;
	    l = list_next(&q->ctq_listeners, l)) {
		l->ctl_flags |= CTLF_DEAD;
		/*
		 * Listeners with no position are already on the tail
		 * list; the rest are moved there now.
		 */
		if (l->ctl_position) {
			l->ctl_position = NULL;
			list_insert_tail(&q->ctq_tail, l);
		}
		cv_broadcast(&l->ctl_cv);
	}

	/*
	 * Disallow events.
	 */
	q->ctq_flags |= CTQ_DEAD;

	/*
	 * If we represent the last reference to a reference counted
	 * process bundle queue, free it.  cte_queue_destroy disposes
	 * of ctq_lock, so it is only released on the other path.
	 */
	if ((q->ctq_flags & CTQ_REFFED) && (q->ctq_nlisteners == 0))
		cte_queue_destroy(q);
	else
		mutex_exit(&q->ctq_lock);
}
2145 
2146 /*
2147  * cte_publish
2148  *
2149  * Publishes an event to a specific queue.  Only called by
2150  * cte_publish_all.
2151  */
2152 static void
2153 cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp)
2154 {
2155 	ASSERT(MUTEX_HELD(&q->ctq_lock));
2156 
2157 	q->ctq_atime = *tsp;
2158 
2159 	/*
2160 	 * Don't publish if the event is informative and there aren't
2161 	 * any listeners, or if the queue has been shut down.
2162 	 */
2163 	if (((q->ctq_nlisteners == 0) && (e->cte_flags & (CTE_INFO|CTE_ACK))) ||
2164 	    (q->ctq_flags & CTQ_DEAD)) {
2165 		mutex_exit(&q->ctq_lock);
2166 		cte_rele(e);
2167 		return;
2168 	}
2169 
2170 	/*
2171 	 * Enqueue event
2172 	 */
2173 	list_insert_tail(&q->ctq_events, e);
2174 
2175 	/*
2176 	 * Check for waiting listeners
2177 	 */
2178 	cte_qwakeup(q, e);
2179 
2180 	/*
2181 	 * Trim unnecessary events from the queue.
2182 	 */
2183 	cte_trim(q, NULL);
2184 	mutex_exit(&q->ctq_lock);
2185 }
2186 
2187 /*
2188  * cte_publish_all
2189  *
2190  * Publish an event to all necessary event queues.  The event, e, must
2191  * be zallocated by the caller, and the event's flags and type must be
2192  * set.  The rest of the event's fields are initialized here.
2193  */
2194 uint64_t
2195 cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata)
2196 {
2197 	ct_equeue_t *q;
2198 	timespec_t ts;
2199 	uint64_t evid;
2200 	ct_kevent_t *negev;
2201 	int negend;
2202 
2203 	e->cte_contract = ct;
2204 	e->cte_data = data;
2205 	e->cte_gdata = gdata;
2206 	e->cte_refs = 3;
2207 	evid = e->cte_id = atomic_add_64_nv(&ct->ct_type->ct_type_evid, 1);
2208 	contract_hold(ct);
2209 
2210 	/*
2211 	 * For a negotiation event we set the ct->ct_nevent field of the
2212 	 * contract for the duration of the negotiation
2213 	 */
2214 	negend = 0;
2215 	if (e->cte_flags & CTE_NEG) {
2216 		cte_hold(e);
2217 		ct->ct_nevent = e;
2218 	} else if (e->cte_type == CT_EV_NEGEND) {
2219 		negend = 1;
2220 	}
2221 
2222 	gethrestime(&ts);
2223 
2224 	/*
2225 	 * ct_evtlock simply (and only) ensures that two events sent
2226 	 * from the same contract are delivered to all queues in the
2227 	 * same order.
2228 	 */
2229 	mutex_enter(&ct->ct_evtlock);
2230 
2231 	/*
2232 	 * CTEL_CONTRACT - First deliver to the contract queue, acking
2233 	 * the event if the contract has been orphaned.
2234 	 */
2235 	mutex_enter(&ct->ct_lock);
2236 	mutex_enter(&ct->ct_events.ctq_lock);
2237 	if ((e->cte_flags & CTE_INFO) == 0) {
2238 		if (ct->ct_state >= CTS_ORPHAN)
2239 			e->cte_flags |= CTE_ACK;
2240 		else
2241 			ct->ct_evcnt++;
2242 	}
2243 	mutex_exit(&ct->ct_lock);
2244 	cte_publish(&ct->ct_events, e, &ts);
2245 
2246 	/*
2247 	 * CTEL_BUNDLE - Next deliver to the contract type's bundle
2248 	 * queue.
2249 	 */
2250 	mutex_enter(&ct->ct_type->ct_type_events.ctq_lock);
2251 	cte_publish(&ct->ct_type->ct_type_events, e, &ts);
2252 
2253 	/*
2254 	 * CTEL_PBUNDLE - Finally, if the contract has an owner,
2255 	 * deliver to the owner's process bundle queue.
2256 	 */
2257 	mutex_enter(&ct->ct_lock);
2258 	if (ct->ct_owner) {
2259 		/*
2260 		 * proc_exit doesn't free event queues until it has
2261 		 * abandoned all contracts.
2262 		 */
2263 		ASSERT(ct->ct_owner->p_ct_equeue);
2264 		ASSERT(ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]);
2265 		q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index];
2266 		mutex_enter(&q->ctq_lock);
2267 		mutex_exit(&ct->ct_lock);
2268 		cte_publish(q, e, &ts);
2269 	} else {
2270 		mutex_exit(&ct->ct_lock);
2271 		cte_rele(e);
2272 	}
2273 
2274 	if (negend) {
2275 		mutex_enter(&ct->ct_lock);
2276 		negev = ct->ct_nevent;
2277 		ct->ct_nevent = NULL;
2278 		cte_rele(negev);
2279 		mutex_exit(&ct->ct_lock);
2280 	}
2281 
2282 	mutex_exit(&ct->ct_evtlock);
2283 
2284 	return (evid);
2285 }
2286 
2287 /*
2288  * cte_add_listener
2289  *
2290  * Add a new listener to an event queue.
2291  */
2292 void
2293 cte_add_listener(ct_equeue_t *q, ct_listener_t *l)
2294 {
2295 	cv_init(&l->ctl_cv, NULL, CV_DEFAULT, NULL);
2296 	l->ctl_equeue = q;
2297 	l->ctl_position = NULL;
2298 	l->ctl_flags = 0;
2299 
2300 	mutex_enter(&q->ctq_lock);
2301 	list_insert_head(&q->ctq_tail, l);
2302 	list_insert_head(&q->ctq_listeners, l);
2303 	q->ctq_nlisteners++;
2304 	mutex_exit(&q->ctq_lock);
2305 }
2306 
2307 /*
2308  * cte_remove_listener
2309  *
2310  * Remove a listener from an event queue.  No other queue activities
2311  * (e.g. cte_get event) may be in progress at this endpoint when this
2312  * is called.
2313  */
2314 void
2315 cte_remove_listener(ct_listener_t *l)
2316 {
2317 	ct_equeue_t *q = l->ctl_equeue;
2318 	ct_kevent_t *e;
2319 
2320 	mutex_enter(&q->ctq_lock);
2321 
2322 	ASSERT((l->ctl_flags & (CTLF_COPYOUT|CTLF_RESET)) == 0);
2323 
2324 	if ((e = l->ctl_position) != NULL)
2325 		cte_qrele(q, l, e);
2326 	else
2327 		list_remove(&q->ctq_tail, l);
2328 	l->ctl_position = NULL;
2329 
2330 	q->ctq_nlisteners--;
2331 	list_remove(&q->ctq_listeners, l);
2332 
2333 	if (l->ctl_flags & CTLF_RELIABLE)
2334 		q->ctq_nreliable--;
2335 
2336 	/*
2337 	 * If we are a the last listener of a dead reference counted
2338 	 * queue (i.e. a process bundle) we free it.  Otherwise we just
2339 	 * trim any events which may have been kept around for our
2340 	 * benefit.
2341 	 */
2342 	if ((q->ctq_flags & CTQ_REFFED) && (q->ctq_flags & CTQ_DEAD) &&
2343 	    (q->ctq_nlisteners == 0)) {
2344 		cte_queue_destroy(q);
2345 	} else {
2346 		cte_trim(q, NULL);
2347 		mutex_exit(&q->ctq_lock);
2348 	}
2349 }
2350 
2351 /*
2352  * cte_reset_listener
2353  *
2354  * Moves a listener's queue pointer to the beginning of the queue.
2355  */
2356 void
2357 cte_reset_listener(ct_listener_t *l)
2358 {
2359 	ct_equeue_t *q = l->ctl_equeue;
2360 
2361 	mutex_enter(&q->ctq_lock);
2362 
2363 	/*
2364 	 * We allow an asynchronous reset because it doesn't make a
2365 	 * whole lot of sense to make reset block or fail.  We already
2366 	 * have most of the mechanism needed thanks to queue trimming,
2367 	 * so implementing it isn't a big deal.
2368 	 */
2369 	if (l->ctl_flags & CTLF_COPYOUT)
2370 		l->ctl_flags |= CTLF_RESET;
2371 
2372 	(void) cte_qmove(q, l, list_head(&q->ctq_events));
2373 
2374 	/*
2375 	 * Inform blocked readers.
2376 	 */
2377 	cv_broadcast(&l->ctl_cv);
2378 	pollwakeup(&l->ctl_pollhead, POLLIN);
2379 	mutex_exit(&q->ctq_lock);
2380 }
2381 
2382 /*
2383  * cte_next_event
2384  *
2385  * Moves the event pointer for the specified listener to the next event
2386  * on the queue.  To avoid races, this movement only occurs if the
2387  * specified event id matches that of the current event.  This is used
2388  * primarily to skip events that have been read but whose extended data
2389  * haven't been copied out.
2390  */
2391 int
2392 cte_next_event(ct_listener_t *l, uint64_t id)
2393 {
2394 	ct_equeue_t *q = l->ctl_equeue;
2395 	ct_kevent_t *old;
2396 
2397 	mutex_enter(&q->ctq_lock);
2398 
2399 	if (l->ctl_flags & CTLF_COPYOUT)
2400 		l->ctl_flags |= CTLF_RESET;
2401 
2402 	if (((old = l->ctl_position) != NULL) && (old->cte_id == id))
2403 		(void) cte_qmove(q, l, list_next(&q->ctq_events, old));
2404 
2405 	mutex_exit(&q->ctq_lock);
2406 
2407 	return (0);
2408 }
2409 
2410 /*
2411  * cte_get_event
2412  *
2413  * Reads an event from an event endpoint.  If "nonblock" is clear, we
2414  * block until a suitable event is ready.  If "crit" is set, we only
2415  * read critical events.  Note that while "cr" is the caller's cred,
2416  * "zuniqid" is the unique id of the zone the calling contract
2417  * filesystem was mounted in.
2418  */
2419 int
2420 cte_get_event(ct_listener_t *l, int nonblock, void *uaddr, const cred_t *cr,
2421     uint64_t zuniqid, int crit)
2422 {
2423 	ct_equeue_t *q = l->ctl_equeue;
2424 	ct_kevent_t *temp;
2425 	int result = 0;
2426 	int partial = 0;
2427 	size_t size, gsize, len;
2428 	model_t mdl = get_udatamodel();
2429 	STRUCT_DECL(ct_event, ev);
2430 	STRUCT_INIT(ev, mdl);
2431 
2432 	/*
2433 	 * cte_qreadable checks for CTLF_COPYOUT as well as ensures
2434 	 * that there exists, and we are pointing to, an appropriate
2435 	 * event.  It may temporarily drop ctq_lock, but that doesn't
2436 	 * really matter to us.
2437 	 */
2438 	mutex_enter(&q->ctq_lock);
2439 	while (cte_qreadable(q, l, cr, zuniqid, crit)) {
2440 		if (nonblock) {
2441 			result = EAGAIN;
2442 			goto error;
2443 		}
2444 		if (q->ctq_flags & CTQ_DEAD) {
2445 			result = EIDRM;
2446 			goto error;
2447 		}
2448 		result = cv_wait_sig(&l->ctl_cv, &q->ctq_lock);
2449 		if (result == 0) {
2450 			result = EINTR;
2451 			goto error;
2452 		}
2453 	}
2454 	temp = l->ctl_position;
2455 	cte_hold(temp);
2456 	l->ctl_flags |= CTLF_COPYOUT;
2457 	mutex_exit(&q->ctq_lock);
2458 
2459 	/*
2460 	 * We now have an event.  Copy in the user event structure to
2461 	 * see how much space we have to work with.
2462 	 */
2463 	result = copyin(uaddr, STRUCT_BUF(ev), STRUCT_SIZE(ev));
2464 	if (result)
2465 		goto copyerr;
2466 
2467 	/*
2468 	 * Determine what data we have and what the user should be
2469 	 * allowed to see.
2470 	 */
2471 	size = gsize = 0;
2472 	if (temp->cte_data) {
2473 		VERIFY(nvlist_size(temp->cte_data, &size,
2474 		    NV_ENCODE_NATIVE) == 0);
2475 		ASSERT(size != 0);
2476 	}
2477 	if (zuniqid == GLOBAL_ZONEUNIQID && temp->cte_gdata) {
2478 		VERIFY(nvlist_size(temp->cte_gdata, &gsize,
2479 		    NV_ENCODE_NATIVE) == 0);
2480 		ASSERT(gsize != 0);
2481 	}
2482 
2483 	/*
2484 	 * If we have enough space, copy out the extended event data.
2485 	 */
2486 	len = size + gsize;
2487 	if (len) {
2488 		if (STRUCT_FGET(ev, ctev_nbytes) >= len) {
2489 			char *buf = kmem_alloc(len, KM_SLEEP);
2490 
2491 			if (size)
2492 				VERIFY(nvlist_pack(temp->cte_data, &buf, &size,
2493 				    NV_ENCODE_NATIVE, KM_SLEEP) == 0);
2494 			if (gsize) {
2495 				char *tmp = buf + size;
2496 
2497 				VERIFY(nvlist_pack(temp->cte_gdata, &tmp,
2498 				    &gsize, NV_ENCODE_NATIVE, KM_SLEEP) == 0);
2499 			}
2500 
2501 			/* This shouldn't have changed */
2502 			ASSERT(size + gsize == len);
2503 			result = copyout(buf, STRUCT_FGETP(ev, ctev_buffer),
2504 			    len);
2505 			kmem_free(buf, len);
2506 			if (result)
2507 				goto copyerr;
2508 		} else {
2509 			partial = 1;
2510 		}
2511 	}
2512 
2513 	/*
2514 	 * Copy out the common event data.
2515 	 */
2516 	STRUCT_FSET(ev, ctev_id, temp->cte_contract->ct_id);
2517 	STRUCT_FSET(ev, ctev_evid, temp->cte_id);
2518 	STRUCT_FSET(ev, ctev_cttype,
2519 	    temp->cte_contract->ct_type->ct_type_index);
2520 	STRUCT_FSET(ev, ctev_flags, temp->cte_flags &
2521 	    (CTE_ACK|CTE_INFO|CTE_NEG));
2522 	STRUCT_FSET(ev, ctev_type, temp->cte_type);
2523 	STRUCT_FSET(ev, ctev_nbytes, len);
2524 	STRUCT_FSET(ev, ctev_goffset, size);
2525 	result = copyout(STRUCT_BUF(ev), uaddr, STRUCT_SIZE(ev));
2526 
2527 copyerr:
2528 	/*
2529 	 * Only move our location in the queue if all copyouts were
2530 	 * successful, the caller provided enough space for the entire
2531 	 * event, and our endpoint wasn't reset or otherwise moved by
2532 	 * another thread.
2533 	 */
2534 	mutex_enter(&q->ctq_lock);
2535 	if (result)
2536 		result = EFAULT;
2537 	else if (!partial && ((l->ctl_flags & CTLF_RESET) == 0) &&
2538 	    (l->ctl_position == temp))
2539 		(void) cte_qmove(q, l, list_next(&q->ctq_events, temp));
2540 	l->ctl_flags &= ~(CTLF_COPYOUT|CTLF_RESET);
2541 	/*
2542 	 * Signal any readers blocked on our CTLF_COPYOUT.
2543 	 */
2544 	cv_signal(&l->ctl_cv);
2545 	cte_rele(temp);
2546 
2547 error:
2548 	mutex_exit(&q->ctq_lock);
2549 	return (result);
2550 }
2551 
2552 /*
2553  * cte_set_reliable
2554  *
2555  * Requests that events be reliably delivered to an event endpoint.
2556  * Unread informative and acknowledged critical events will not be
2557  * removed from the queue until this listener reads or skips them.
2558  * Because a listener could maliciously request reliable delivery and
2559  * then do nothing, this requires that PRIV_CONTRACT_EVENT be in the
2560  * caller's effective set.
2561  */
2562 int
2563 cte_set_reliable(ct_listener_t *l, const cred_t *cr)
2564 {
2565 	ct_equeue_t *q = l->ctl_equeue;
2566 	int error;
2567 
2568 	if ((error = secpolicy_contract_event(cr)) != 0)
2569 		return (error);
2570 
2571 	mutex_enter(&q->ctq_lock);
2572 	if ((l->ctl_flags & CTLF_RELIABLE) == 0) {
2573 		l->ctl_flags |= CTLF_RELIABLE;
2574 		q->ctq_nreliable++;
2575 		if (l->ctl_position != NULL)
2576 			l->ctl_position->cte_nodes[q->ctq_listno].
2577 			    ctm_nreliable++;
2578 	}
2579 	mutex_exit(&q->ctq_lock);
2580 
2581 	return (0);
2582 }
2583