xref: /titanic_44/usr/src/uts/common/contract/device.c (revision ddf7fe95b8ad67aa16deb427a0b78f4dd4ff22b1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/mutex.h>
29 #include <sys/debug.h>
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/kmem.h>
33 #include <sys/thread.h>
34 #include <sys/id_space.h>
35 #include <sys/avl.h>
36 #include <sys/list.h>
37 #include <sys/sysmacros.h>
38 #include <sys/proc.h>
39 #include <sys/contract.h>
40 #include <sys/contract_impl.h>
41 #include <sys/contract/device.h>
42 #include <sys/contract/device_impl.h>
43 #include <sys/cmn_err.h>
44 #include <sys/nvpair.h>
45 #include <sys/policy.h>
46 #include <sys/ddi_impldefs.h>
47 #include <sys/ddi_implfuncs.h>
48 #include <sys/systm.h>
49 #include <sys/stat.h>
50 #include <sys/sunddi.h>
51 #include <sys/esunddi.h>
52 #include <sys/ddi.h>
53 #include <sys/fs/dv_node.h>
54 #include <sys/sunndi.h>
55 #undef ct_lock	/* needed because clnt.h defines ct_lock as a macro */
56 
57 /*
58  * Device Contracts
59  * -----------------
60  * This file contains the core code for the device contracts framework.
61  * A device contract is an agreement or a contract between a process and
62  * the kernel regarding the state of the device. A device contract may be
63  * created when a relationship is formed between a device and a process
64  * i.e. at open(2) time, or it may be created at some point after the device
65  * has been opened. A device contract once formed may be broken by either party.
66  * A device contract can be broken by the process by an explicit abandon of the
67  * contract or by an implicit abandon when the process exits. A device contract
68  * can be broken by the kernel either asynchronously (without negotiation) or
69  * synchronously (with negotiation). Exactly which happens depends on the device
70  * state transition. The following state diagram shows the transitions between
71  * device states. Only device state transitions currently supported by device
72  * contracts are shown.
73  *
74  *                              <-- A -->
75  *                       /-----------------> DEGRADED
76  *                       |                      |
77  *                       |                      |
78  *                       |                      | S
79  *                       |                      | |
80  *                       |                      | v
81  *                       v       S -->          v
82  *                      ONLINE ------------> OFFLINE
83  *
84  *
85  * In the figure above, the arrows indicate the direction of transition. The
86  * letter S refers to transitions which are inherently synchronous i.e.
87  * require negotiation and the letter A indicates transitions which are
88  * asynchronous i.e. are done without contract negotiations. A good example
89  * of a synchronous transition is the ONLINE -> OFFLINE transition. This
90  * transition cannot happen as long as there are consumers which have the
91  * device open. Thus some form of negotiation needs to happen between the
92  * consumers and the kernel to ensure that consumers either close devices
93  * or disallow the move to OFFLINE. Certain other transitions such as
94  * ONLINE --> DEGRADED for example, are inherently asynchronous i.e.
95  * non-negotiable. A device that suffers a fault that degrades its
96  * capabilities will become degraded irrespective of what consumers it has,
97  * so a negotiation in this case is pointless.
98  *
99  * The following device states are currently defined for device contracts:
100  *
101  *      CT_DEV_EV_ONLINE
102  *              The device is online and functioning normally
103  *      CT_DEV_EV_DEGRADED
104  *              The device is online but is functioning in a degraded capacity
105  *      CT_DEV_EV_OFFLINE
106  *              The device is offline and is no longer configured
107  *
108  * A typical consumer of device contracts starts out with a contract
109  * template and adds terms to that template. These include the
110  * "acceptable set" (A-set) term, which is a bitset of device states which
111  * are guaranteed by the contract. If the device moves out of a state in
112  * the A-set, the contract is broken. The breaking of the contract can
113  * be asynchronous in which case a critical contract event is sent to the
114  * contract holder but no negotiations take place. If the breaking of the
115  * contract is synchronous, negotiations are opened between the affected
116  * consumer and the kernel. The kernel does this by sending a critical
117  * event to the consumer with the CTE_NEG flag set indicating that this
118  * is a negotiation event. The consumer can accept this change by sending
119  * an ACK message to the kernel. Alternatively, if it has the necessary
120  * privileges, it can send a NACK message to the kernel which will block
121  * the device state change. To NACK a negotiable event, a process must
122  * have the {PRIV_SYS_DEVICES} privilege asserted in its effective set.
123  *
124  * Other terms include the "minor path" term, specified explicitly if the
125  * contract is not being created at open(2) time or specified implicitly
126  * if the contract is being created at open time via an activated template.
127  *
128  * A contract event is sent on any state change to which the contract
129  * owner has subscribed via the informative or critical event sets. Only
130  * critical events are guaranteed to be delivered. Since all device state
131  * changes are controlled by the kernel and cannot be arbitrarily generated
132  * by a non-privileged user, the {PRIV_CONTRACT_EVENT} privilege does not
133  * need to be asserted in a process's effective set to designate an event as
134  * critical. To ensure privacy, a process must either have the same effective
135  * userid as the contract holder or have the {PRIV_CONTRACT_OBSERVER} privilege
136  * asserted in its effective set in order to observe device contract events
137  * off the device contract type specific endpoint.
138  *
139  * Yet another term available with device contracts is the "non-negotiable"
140  * term. This term is used to pre-specify a NACK to any contract negotiation.
141  * This term is ignored for asynchronous state changes. For example, a
142  * process may have the A-set {ONLINE|DEGRADED} and make the contract
143  * non-negotiable. In this case, the device contract framework assumes a
144  * NACK for any transition to OFFLINE and blocks the offline. If the A-set
145  * is {ONLINE} and the non-negotiable term is set, transitions to OFFLINE
146  * are NACKed but transitions to DEGRADE succeed.
147  *
148  * The OFFLINE negotiation (if OFFLINE state is not in the A-set for a contract)
149  * happens just before the I/O framework attempts to offline a device
150  * (i.e. detach a device and set the offline flag so that it cannot be
151  * reattached). A device contract holder is expected to either NACK the offline
152  * (if privileged) or release the device and allow the offline to proceed.
153  *
154  * The DEGRADE contract event (if DEGRADE is not in the A-set for a contract)
155  * is generated just before the I/O framework transitions the device state
156  * to "degraded" (i.e. DEVI_DEVICE_DEGRADED in I/O framework terminology).
157  *
158  * The contract holder is expected to ACK or NACK a negotiation event
159  * within a certain period of time. If the ACK/NACK is not received
160  * within the timeout period, the device contract framework will behave
161  * as if the contract does not exist and will proceed with the event.
162  *
163  * Unlike a process contract a device contract does not need to exist
164  * once it is abandoned, since it does not define a fault boundary. It
165  * merely represents an agreement between a process and the kernel
166  * regarding the state of the device. Once the process has abandoned
167  * the contract (either implicitly via a process exit or explicitly)
168  * the kernel has no reason to retain the contract. As a result
169  * device contracts are neither inheritable nor need to exist in an
170  * orphan state.
171  *
172  * A device unlike a process may exist in multiple contracts and has
173  * a "life" outside a device contract. A device unlike a process
174  * may exist without an associated contract. Unlike a process contract
175  * a device contract may be formed after a binding relationship is
176  * formed between a process and a device.
177  *
178  *	IMPLEMENTATION NOTES
179  *	====================
180  * DATA STRUCTURES
181  * ----------------
182  * 	The heart of the device contracts implementation is the device contract
183  * 	private cont_device_t (or ctd for short) data structure. It encapsulates
184  * 	the generic contract_t data structure and has a number of private
185  *	fields.
186  * 	These include:
187  *		cond_minor: The minor device that is the subject of the contract
188  *		cond_aset:  The bitset of states which are guaranteed by the
189  *			   contract
190  *		cond_noneg: If set, indicates that the result of negotiation has
191  *			    been predefined to be a NACK
192  * 	In addition, there are other device identifiers such as the devinfo
193  * 	node, dev_t and spec_type of the minor node. There are also a few
194  * 	fields that are used during negotiation to maintain state. See
195  *		uts/common/sys/contract/device_impl.h
196  * 	for details.
197  * 	The ctd structure represents the device private part of a contract of
198  * 	type "device"
199  *
200  * 	Another data structure used by device contracts is ctmpl_device. It is
201  * 	the device contracts private part of the contract template structure. It
202  *	encapsulates the generic template structure "ct_template_t" and includes
203  *	the following device contract specific fields
204  *		ctd_aset:   The bitset of states that should be guaranteed by a
205  *			    contract
206  *		ctd_noneg:  If set, indicates that contract should NACK a
207  *			    negotiation
208  *		ctd_minor:  The devfs_path (without the /devices prefix) of the
209  *			    minor node that is the subject of the contract.
210  *
211  * ALGORITHMS
212  * ---------
213  * There are three sets of routines in this file
214  * 	Template related routines
215  * 	-------------------------
216  *	These routines provide support for template related operations initiated
217  *	via the generic template operations. These include routines that dup
218  *	a template, free it, and set various terms in the template
219  *	(such as the minor node path, the acceptable state set (or A-set)
220  *	and the non-negotiable term) as well as a routine to query the
221  *	device specific portion of the template for the abovementioned terms.
222  *	There is also a routine to create (ctmpl_device_create) that is used to
223  *	create a contract from a template. This routine calls (after initial
224  *	setup) the common function used to create a device contract
225  *	(contract_device_create).
226  *
227  *	core device contract implementation
228  *	----------------------------------
229  *	These routines support the generic contract framework to provide
230  *	functionality that allows contracts to be created, managed and
231  *	destroyed. The contract_device_create() routine is a routine used
232  *	to create a contract from a template (either via an explicit create
233  *	operation on a template or implicitly via an open with an
234  *	activated template.). The contract_device_free() routine assists
235  *	in freeing the device contract specific parts. There are routines
236  *	used to abandon (contract_device_abandon) a device contract as well
237  *	as a routine to destroy (which despite its name does not destroy,
238  *	it only moves a contract to a dead state) a contract.
239  *	There is also a routine to return status information about a
240  *	contract - the level of detail depends on what is requested by the
241  *	user. A value of CTD_FIXED only returns fixed length fields such
242  *	as the A-set, state of device and value of the "noneg" term. If
243  *	CTD_ALL is specified, the minor node path is returned as well.
244  *
245  *	In addition there are interfaces (contract_device_ack/nack) which
246  *	are used to support negotiation between userland processes and
247  *	device contracts. These interfaces record the acknowledgement
248  *	or lack thereof for negotiation events and help determine if the
249  *	negotiated event should occur.
250  *
251  *	"backend routines"
252  *	-----------------
253  *	The backend routines form the interface between the I/O framework
254  *	and the device contract subsystem. These routines, allow the I/O
255  *	framework to call into the device contract subsystem to notify it of
256  *	impending changes to a device state as well as to inform of the
257  *	final disposition of such attempted state changes. Routines in this
258  *	class include contract_device_offline() that indicates an attempt to
259  *	offline a device, contract_device_degrade() that indicates that
260  *	a device is moving to the degraded state and contract_device_negend()
261  *	that is used by the I/O framework to inform the contracts subsystem of
262  *	the final disposition of an attempted operation.
263  *
264  *	SUMMARY
265  *	-------
266  *      A contract starts its life as a template. A process allocates a device
267  *	contract template and sets various terms:
268  *		The A-set
269  *		The device minor node
270  *		Critical and informative events
271  *		The noneg i.e. no negotiation term
272  *	Setting of these terms in the template is done via the
273  *	ctmpl_device_set() entry point in this file. A process can query a
274  *	template to determine the terms already set in the template - this is
275  *	facilitated by the ctmpl_device_get() routine.
276  *
277  *	Once all the appropriate terms are set, the contract is instantiated via
278  *	one of two methods
279  *	- via an explicit create operation - this is facilitated by the
280  *	  ctmpl_device_create() entry point
281  *	- synchronously with the open(2) system call - this is achieved via the
282  *	  contract_device_open() routine.
283  *	The core work for both these above functions is done by
284  *	contract_device_create()
285  *
286  *	A contract once created can be queried for its status. Support for
287  *	status info is provided by both the common contracts framework and by
288  *	the "device" contract type. If the level of detail requested is
289  *	CTD_COMMON, only the common contract framework data is used. Higher
290  *	levels of detail result in calls to contract_device_status() to supply
291  *	device contract type specific status information.
292  *
293  *	A contract once created may be abandoned explicitly or implicitly.
294  *	In either case, the contract_device_abandon() function is invoked. This
295  * 	function merely calls contract_destroy() which moves the contract to
296  *	the DEAD state. The device contract portion of destroy processing is
297  *	provided by contract_device_destroy() which merely disassociates the
298  *	contract from its device devinfo node. A contract in the DEAD state is
299  *	not freed. It hangs around until all references to the contract are
300  *	gone. When that happens, the contract is finally deallocated. The
301  *	device contract specific portion of the free is done by
302  *	contract_device_free() which finally frees the device contract specific
303  *	data structure (cont_device_t).
304  *
305  *	When a device undergoes a state change, the I/O framework calls the
306  *	corresponding device contract entry point. For example, when a device
307  *	is about to go OFFLINE, the routine contract_device_offline() is
308  *	invoked. Similarly if a device moves to DEGRADED state, the routine
309  *	contract_device_degrade() function is called. These functions call the
310  *	core routine contract_device_publish(). This function determines via
311  *	the function is_sync_neg() whether an event is a synchronous (i.e.
312  *	negotiable) event or not. In the former case contract_device_publish()
313  *	publishes a CTE_NEG event and then waits in wait_for_acks() for ACKs
314  *	and/or NACKs from contract holders. In the latter case, it simply
315  *	publishes the event and does not wait. In the negotiation case, ACKs or
316  *	NACKs from userland consumers results in contract_device_ack_nack()
317  *	being called where the result of the negotiation is recorded in the
318  *	contract data structure. Once all outstanding contract owners have
319  *	responded, the device contract code in wait_for_acks() determines the
320  *	final result of the negotiation. A single NACK overrides all other ACKs.
321  *	If there is no NACK, then a single ACK will result in an overall ACK
322  *	result. If there are no ACKs or NACKs, then the result CT_NONE is
323  *	returned back to the I/O framework. Once the event is permitted or
324  *	blocked, the I/O framework proceeds or aborts the state change. The
325  *	I/O framework then calls contract_device_negend() with a result code
326  *	indicating final disposition of the event. This call releases the
327  *	barrier and other state associated with the previous negotiation,
328  *	which permits the next event (if any) to come into the device contract
329  *	framework.
330  *
331  *	Finally, a device that has outstanding contracts may be removed from
332  *	the system which results in its devinfo node being freed. The devinfo
333  *	free routine in the I/O framework, calls into the device contract
334  *	function - contract_device_remove_dip(). This routine, disassociates
335  *	the dip from all contracts associated with the devinfo node being
336  *	freed, allowing the devinfo node to be freed.
337  *
338  * LOCKING
339  * ---------
340  * 	There are four sets of data that need to be protected by locks
341  *
342  *	i) device contract specific portion of the contract template - This data
343  *	is protected by the template lock ctmpl_lock.
344  *
345  *	ii) device contract specific portion of the contract - This data is
346  *	protected by the contract lock ct_lock
347  *
348  *	iii) The linked list of contracts hanging off a devinfo node - This
349  *	list is protected by the per-devinfo node lock devi_ct_lock
350  *
351  *	iv) Finally there is a barrier, controlled by devi_ct_lock, devi_ct_cv
352  *	and devi_ct_count that controls state changes to a dip
353  *
354  *	The template lock is independent in that none of the other locks in this
355  *	file may be taken while holding the template lock (and vice versa).
356  *
357  *	The remaining three locks have the following lock order
358  *
359  *	devi_ct_lock  -> ct_count barrier ->  ct_lock
360  *
361  */
362 
363 static cont_device_t *contract_device_create(ctmpl_device_t *dtmpl, dev_t dev,
364     int spec_type, proc_t *owner, int *errorp);
365 
366 /* barrier routines */
367 static void ct_barrier_acquire(dev_info_t *dip);
368 static void ct_barrier_release(dev_info_t *dip);
369 static int ct_barrier_held(dev_info_t *dip);
370 static int ct_barrier_empty(dev_info_t *dip);
371 static void ct_barrier_wait_for_release(dev_info_t *dip);
372 static int ct_barrier_wait_for_empty(dev_info_t *dip, int secs);
373 static void ct_barrier_decr(dev_info_t *dip);
374 static void ct_barrier_incr(dev_info_t *dip);
375 
376 ct_type_t *device_type;
377 
/*
 * Macro predicates for determining when events should be sent and how.
 *
 * EVSENDP(ctd, flag)
 *	Non-zero if any event in "flag" is present in either the informative
 *	or the critical event set of the contract embedded in "ctd".
 *
 * EVINFOP(ctd, flag)
 *	True if none of the events in "flag" is in the critical event set of
 *	the contract embedded in "ctd".
 *
 * "ctd" is a pointer to a structure with a cond_contract member. Both
 * arguments are fully parenthesized so that compound expressions (for
 * example "A | B" as the flag, or "&obj" as the contract) expand with the
 * intended precedence. Note that EVSENDP evaluates "ctd" twice.
 */
#define	EVSENDP(ctd, flag) \
	(((ctd)->cond_contract.ct_ev_info | \
	(ctd)->cond_contract.ct_ev_crit) & (flag))

#define	EVINFOP(ctd, flag) \
	(((ctd)->cond_contract.ct_ev_crit & (flag)) == 0)
386 
387 /*
388  * State transition table showing which transitions are synchronous and which
389  * are not.
390  */
391 struct ct_dev_negtable {
392 	uint_t	st_old;
393 	uint_t	st_new;
394 	uint_t	st_neg;
395 } ct_dev_negtable[] = {
396 	{CT_DEV_EV_ONLINE, CT_DEV_EV_OFFLINE,	1},
397 	{CT_DEV_EV_ONLINE, CT_DEV_EV_DEGRADED,	0},
398 	{CT_DEV_EV_DEGRADED, CT_DEV_EV_ONLINE,	0},
399 	{CT_DEV_EV_DEGRADED, CT_DEV_EV_OFFLINE,	1},
400 	{0}
401 };
402 
403 /*
404  * Device contract template implementation
405  */
406 
407 /*
408  * ctmpl_device_dup
409  *
410  * The device contract template dup entry point.
411  * This simply copies all the fields (generic as well as device contract
412  * specific) fields of the original.
413  */
414 static struct ct_template *
415 ctmpl_device_dup(struct ct_template *template)
416 {
417 	ctmpl_device_t *new;
418 	ctmpl_device_t *old = template->ctmpl_data;
419 	char *buf;
420 	char *minor;
421 
422 	new = kmem_zalloc(sizeof (ctmpl_device_t), KM_SLEEP);
423 	buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
424 
425 	/*
426 	 * copy generic fields.
427 	 * ctmpl_copy returns with old template lock held
428 	 */
429 	ctmpl_copy(&new->ctd_ctmpl, template);
430 
431 	new->ctd_ctmpl.ctmpl_data = new;
432 	new->ctd_aset = old->ctd_aset;
433 	new->ctd_minor = NULL;
434 	new->ctd_noneg = old->ctd_noneg;
435 
436 	if (old->ctd_minor) {
437 		ASSERT(strlen(old->ctd_minor) + 1 <= MAXPATHLEN);
438 		bcopy(old->ctd_minor, buf, strlen(old->ctd_minor) + 1);
439 	} else {
440 		kmem_free(buf, MAXPATHLEN);
441 		buf = NULL;
442 	}
443 
444 	mutex_exit(&template->ctmpl_lock);
445 	if (buf) {
446 		minor = i_ddi_strdup(buf, KM_SLEEP);
447 		kmem_free(buf, MAXPATHLEN);
448 		buf = NULL;
449 	} else {
450 		minor = NULL;
451 	}
452 	mutex_enter(&template->ctmpl_lock);
453 
454 	if (minor) {
455 		new->ctd_minor = minor;
456 	}
457 
458 	ASSERT(buf == NULL);
459 	return (&new->ctd_ctmpl);
460 }
461 
462 /*
463  * ctmpl_device_free
464  *
465  * The device contract template free entry point.  Just
466  * frees the template.
467  */
468 static void
469 ctmpl_device_free(struct ct_template *template)
470 {
471 	ctmpl_device_t *dtmpl = template->ctmpl_data;
472 
473 	if (dtmpl->ctd_minor)
474 		kmem_free(dtmpl->ctd_minor, strlen(dtmpl->ctd_minor) + 1);
475 
476 	kmem_free(dtmpl, sizeof (ctmpl_device_t));
477 }
478 
/*
 * SAFE_EV: the events a non-privileged process may place in its critical
 * set. Since an unprivileged device contract owner cannot influence when
 * a device changes state, every device event qualifies.
 *
 * EXCESS(evset): the subset of the critical event set "evset" that would
 * need additional privilege, i.e. everything outside SAFE_EV. For device
 * contracts this currently evaluates to 0.
 */
#define	SAFE_EV		(CT_DEV_ALLEVENT)
#define	EXCESS(evset)	((evset) & ~(SAFE_EV))
491 
492 
493 /*
494  * ctmpl_device_set
495  *
496  * The device contract template set entry point. Sets various terms in the
497  * template. The non-negotiable  term can only be set if the process has
498  * the {PRIV_SYS_DEVICES} privilege asserted in its effective set.
499  */
500 static int
501 ctmpl_device_set(struct ct_template *tmpl, ct_param_t *param, const cred_t *cr)
502 {
503 	ctmpl_device_t *dtmpl = tmpl->ctmpl_data;
504 	int error;
505 	dev_info_t *dip;
506 	int spec_type;
507 	uint64_t param_value;
508 	char *str_value;
509 
510 	ASSERT(MUTEX_HELD(&tmpl->ctmpl_lock));
511 
512 	if (param->ctpm_id == CTDP_MINOR) {
513 		str_value = (char *)param->ctpm_value;
514 		str_value[param->ctpm_size - 1] = '\0';
515 	} else {
516 		param_value = *(uint64_t *)param->ctpm_value;
517 	}
518 
519 	switch (param->ctpm_id) {
520 	case CTDP_ACCEPT:
521 		if (param_value & ~CT_DEV_ALLEVENT)
522 			return (EINVAL);
523 		if (param_value == 0)
524 			return (EINVAL);
525 		if (param_value == CT_DEV_ALLEVENT)
526 			return (EINVAL);
527 
528 		dtmpl->ctd_aset = param_value;
529 		break;
530 	case CTDP_NONEG:
531 		if (param_value != CTDP_NONEG_SET &&
532 		    param_value != CTDP_NONEG_CLEAR)
533 			return (EINVAL);
534 
535 		/*
536 		 * only privileged processes can designate a contract
537 		 * non-negotiatble.
538 		 */
539 		if (param_value == CTDP_NONEG_SET &&
540 		    (error = secpolicy_sys_devices(cr)) != 0) {
541 			return (error);
542 		}
543 
544 		dtmpl->ctd_noneg = param_value;
545 		break;
546 
547 	case CTDP_MINOR:
548 		if (*str_value != '/' ||
549 		    strncmp(str_value, "/devices/",
550 		    strlen("/devices/")) == 0 ||
551 		    strstr(str_value, "../devices/") != NULL ||
552 		    strchr(str_value, ':') == NULL) {
553 			return (EINVAL);
554 		}
555 
556 		spec_type = 0;
557 		dip = NULL;
558 		if (resolve_pathname(str_value, &dip, NULL, &spec_type) != 0) {
559 			return (ERANGE);
560 		}
561 		ddi_release_devi(dip);
562 
563 		if (spec_type != S_IFCHR && spec_type != S_IFBLK) {
564 			return (EINVAL);
565 		}
566 
567 		if (dtmpl->ctd_minor != NULL) {
568 			kmem_free(dtmpl->ctd_minor,
569 			    strlen(dtmpl->ctd_minor) + 1);
570 		}
571 		dtmpl->ctd_minor = i_ddi_strdup(str_value, KM_SLEEP);
572 		break;
573 	case CTP_EV_CRITICAL:
574 		/*
575 		 * Currently for device contracts, any event
576 		 * may be added to the critical set. We retain the
577 		 * following code however for future enhancements.
578 		 */
579 		if (EXCESS(param_value) &&
580 		    (error = secpolicy_contract_event(cr)) != 0)
581 			return (error);
582 		tmpl->ctmpl_ev_crit = param_value;
583 		break;
584 	default:
585 		return (EINVAL);
586 	}
587 
588 	return (0);
589 }
590 
591 /*
592  * ctmpl_device_get
593  *
594  * The device contract template get entry point.  Simply fetches and
595  * returns the value of the requested term.
596  */
597 static int
598 ctmpl_device_get(struct ct_template *template, ct_param_t *param)
599 {
600 	ctmpl_device_t *dtmpl = template->ctmpl_data;
601 	uint64_t *param_value = param->ctpm_value;
602 
603 	ASSERT(MUTEX_HELD(&template->ctmpl_lock));
604 
605 	switch (param->ctpm_id) {
606 	case CTDP_ACCEPT:
607 		*param_value = dtmpl->ctd_aset;
608 		break;
609 	case CTDP_NONEG:
610 		*param_value = dtmpl->ctd_noneg;
611 		break;
612 	case CTDP_MINOR:
613 		if (dtmpl->ctd_minor) {
614 			param->ctpm_size = strlcpy((char *)param->ctpm_value,
615 			    dtmpl->ctd_minor, param->ctpm_size);
616 			param->ctpm_size++;
617 		} else {
618 			return (ENOENT);
619 		}
620 		break;
621 	default:
622 		return (EINVAL);
623 	}
624 
625 	return (0);
626 }
627 
628 /*
629  * Device contract type specific portion of creating a contract using
630  * a specified template
631  */
632 /*ARGSUSED*/
633 int
634 ctmpl_device_create(ct_template_t *template, ctid_t *ctidp)
635 {
636 	ctmpl_device_t *dtmpl;
637 	char *buf;
638 	dev_t dev;
639 	int spec_type;
640 	int error;
641 	cont_device_t *ctd;
642 
643 	if (ctidp == NULL)
644 		return (EINVAL);
645 
646 	buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
647 
648 	dtmpl = template->ctmpl_data;
649 
650 	mutex_enter(&template->ctmpl_lock);
651 	if (dtmpl->ctd_minor == NULL) {
652 		/* incomplete template */
653 		mutex_exit(&template->ctmpl_lock);
654 		kmem_free(buf, MAXPATHLEN);
655 		return (EINVAL);
656 	} else {
657 		ASSERT(strlen(dtmpl->ctd_minor) < MAXPATHLEN);
658 		bcopy(dtmpl->ctd_minor, buf, strlen(dtmpl->ctd_minor) + 1);
659 	}
660 	mutex_exit(&template->ctmpl_lock);
661 
662 	spec_type = 0;
663 	dev = NODEV;
664 	if (resolve_pathname(buf, NULL, &dev, &spec_type) != 0 ||
665 	    dev == NODEV || dev == DDI_DEV_T_ANY || dev == DDI_DEV_T_NONE ||
666 	    (spec_type != S_IFCHR && spec_type != S_IFBLK)) {
667 		CT_DEBUG((CE_WARN,
668 		    "tmpl_create: failed to find device: %s", buf));
669 		kmem_free(buf, MAXPATHLEN);
670 		return (ERANGE);
671 	}
672 	kmem_free(buf, MAXPATHLEN);
673 
674 	ctd = contract_device_create(template->ctmpl_data,
675 	    dev, spec_type, curproc, &error);
676 
677 	if (ctd == NULL) {
678 		CT_DEBUG((CE_WARN, "Failed to create device contract for "
679 		    "process (%d) with device (devt = %lu, spec_type = %s)",
680 		    curproc->p_pid, dev,
681 		    spec_type == S_IFCHR ? "S_IFCHR" : "S_IFBLK"));
682 		return (error);
683 	}
684 
685 	mutex_enter(&ctd->cond_contract.ct_lock);
686 	*ctidp = ctd->cond_contract.ct_id;
687 	mutex_exit(&ctd->cond_contract.ct_lock);
688 
689 	return (0);
690 }
691 
692 /*
693  * Device contract specific template entry points
694  */
695 static ctmplops_t ctmpl_device_ops = {
696 	ctmpl_device_dup,		/* ctop_dup */
697 	ctmpl_device_free,		/* ctop_free */
698 	ctmpl_device_set,		/* ctop_set */
699 	ctmpl_device_get,		/* ctop_get */
700 	ctmpl_device_create,		/* ctop_create */
701 	CT_DEV_ALLEVENT			/* all device events bitmask */
702 };
703 
704 
705 /*
706  * Device contract implementation
707  */
708 
709 /*
710  * contract_device_default
711  *
712  * The device contract default template entry point.  Creates a
713  * device contract template with a default A-set and no "noneg" ,
714  * with informative degrade events and critical offline events.
715  * There is no default minor path.
716  */
717 static ct_template_t *
718 contract_device_default(void)
719 {
720 	ctmpl_device_t *new;
721 
722 	new = kmem_zalloc(sizeof (ctmpl_device_t), KM_SLEEP);
723 	ctmpl_init(&new->ctd_ctmpl, &ctmpl_device_ops, device_type, new);
724 
725 	new->ctd_aset = CT_DEV_EV_ONLINE | CT_DEV_EV_DEGRADED;
726 	new->ctd_noneg = 0;
727 	new->ctd_ctmpl.ctmpl_ev_info = CT_DEV_EV_DEGRADED;
728 	new->ctd_ctmpl.ctmpl_ev_crit = CT_DEV_EV_OFFLINE;
729 
730 	return (&new->ctd_ctmpl);
731 }
732 
733 /*
734  * contract_device_free
735  *
736  * Destroys the device contract specific portion of a contract and
737  * frees the contract.
738  */
739 static void
740 contract_device_free(contract_t *ct)
741 {
742 	cont_device_t *ctd = ct->ct_data;
743 
744 	ASSERT(ctd->cond_minor);
745 	ASSERT(strlen(ctd->cond_minor) < MAXPATHLEN);
746 	kmem_free(ctd->cond_minor, strlen(ctd->cond_minor) + 1);
747 
748 	ASSERT(ctd->cond_devt != DDI_DEV_T_ANY &&
749 	    ctd->cond_devt != DDI_DEV_T_NONE && ctd->cond_devt != NODEV);
750 
751 	ASSERT(ctd->cond_spec == S_IFBLK || ctd->cond_spec == S_IFCHR);
752 
753 	ASSERT(!(ctd->cond_aset & ~CT_DEV_ALLEVENT));
754 	ASSERT(ctd->cond_noneg == 0 || ctd->cond_noneg == 1);
755 
756 	ASSERT(!(ctd->cond_currev_type & ~CT_DEV_ALLEVENT));
757 	ASSERT(!(ctd->cond_currev_ack & ~(CT_ACK | CT_NACK)));
758 
759 	ASSERT((ctd->cond_currev_id > 0) ^ (ctd->cond_currev_type == 0));
760 	ASSERT((ctd->cond_currev_id > 0) || (ctd->cond_currev_ack == 0));
761 
762 	ASSERT(!list_link_active(&ctd->cond_next));
763 
764 	kmem_free(ctd, sizeof (cont_device_t));
765 }
766 
767 /*
768  * contract_device_abandon
769  *
770  * The device contract abandon entry point.
771  */
772 static void
773 contract_device_abandon(contract_t *ct)
774 {
775 	ASSERT(MUTEX_HELD(&ct->ct_lock));
776 
777 	/*
778 	 * device contracts cannot be inherited or orphaned.
779 	 * Move the contract to the DEAD_STATE. It will be freed
780 	 * once all references to it are gone.
781 	 */
782 	contract_destroy(ct);
783 }
784 
785 /*
786  * contract_device_destroy
787  *
788  * The device contract destroy entry point.
789  * Called from contract_destroy() to do any type specific destroy. Note
790  * that destroy is a misnomer - this does not free the contract, it only
791  * moves it to the dead state. A contract is actually freed via
792  * 	contract_rele() -> contract_dtor(), contop_free()
793  */
794 static void
795 contract_device_destroy(contract_t *ct)
796 {
797 	cont_device_t	*ctd = ct->ct_data;
798 	dev_info_t	*dip = ctd->cond_dip;
799 
800 	ASSERT(MUTEX_HELD(&ct->ct_lock));
801 
802 	if (dip == NULL) {
803 		/*
804 		 * The dip has been removed, this is a dangling contract
805 		 * Check that dip linkages are NULL
806 		 */
807 		ASSERT(!list_link_active(&ctd->cond_next));
808 		CT_DEBUG((CE_NOTE, "contract_device_destroy: contract has no "
809 		    "devinfo node. contract ctid : %d", ct->ct_id));
810 		return;
811 	}
812 
813 	/*
814 	 * Need to have lock order: devi_ct_lock -> ct_count barrier -> ct_lock
815 	 */
816 	mutex_exit(&ct->ct_lock);
817 
818 	/*
819 	 * Waiting for the barrier to be released is strictly speaking not
820 	 * necessary. But it simplifies the implementation of
821 	 * contract_device_publish() by establishing the invariant that
822 	 * device contracts cannot go away during negotiation.
823 	 */
824 	mutex_enter(&(DEVI(dip)->devi_ct_lock));
825 	ct_barrier_wait_for_release(dip);
826 	mutex_enter(&ct->ct_lock);
827 
828 	list_remove(&(DEVI(dip)->devi_ct), ctd);
829 	ctd->cond_dip = NULL; /* no longer linked to dip */
830 	contract_rele(ct);	/* remove hold for dip linkage */
831 
832 	mutex_exit(&ct->ct_lock);
833 	mutex_exit(&(DEVI(dip)->devi_ct_lock));
834 	mutex_enter(&ct->ct_lock);
835 }
836 
837 /*
838  * contract_device_status
839  *
840  * The device contract status entry point. Called when level of "detail"
841  * is either CTD_FIXED or CTD_ALL
842  *
843  */
844 static void
845 contract_device_status(contract_t *ct, zone_t *zone, int detail, nvlist_t *nvl,
846     void *status, model_t model)
847 {
848 	cont_device_t *ctd = ct->ct_data;
849 
850 	ASSERT(detail == CTD_FIXED || detail == CTD_ALL);
851 
852 	mutex_enter(&ct->ct_lock);
853 	contract_status_common(ct, zone, status, model);
854 
855 	/*
856 	 * There's no need to hold the contract lock while accessing static
857 	 * data like aset or noneg. But since we need the lock to access other
858 	 * data like state, we hold it anyway.
859 	 */
860 	VERIFY(nvlist_add_uint32(nvl, CTDS_STATE, ctd->cond_state) == 0);
861 	VERIFY(nvlist_add_uint32(nvl, CTDS_ASET, ctd->cond_aset) == 0);
862 	VERIFY(nvlist_add_uint32(nvl, CTDS_NONEG, ctd->cond_noneg) == 0);
863 
864 	if (detail == CTD_FIXED) {
865 		mutex_exit(&ct->ct_lock);
866 		return;
867 	}
868 
869 	ASSERT(ctd->cond_minor);
870 	VERIFY(nvlist_add_string(nvl, CTDS_MINOR, ctd->cond_minor) == 0);
871 
872 	mutex_exit(&ct->ct_lock);
873 }
874 
875 /*
876  * Converts a result integer into the corresponding string. Used for printing
877  * messages
878  */
879 static char *
880 result_str(uint_t result)
881 {
882 	switch (result) {
883 	case CT_ACK:
884 		return ("CT_ACK");
885 	case CT_NACK:
886 		return ("CT_NACK");
887 	case CT_NONE:
888 		return ("CT_NONE");
889 	default:
890 		return ("UNKNOWN");
891 	}
892 }
893 
894 /*
895  * Converts a device state integer constant into the corresponding string.
896  * Used to print messages.
897  */
898 static char *
899 state_str(uint_t state)
900 {
901 	switch (state) {
902 	case CT_DEV_EV_ONLINE:
903 		return ("ONLINE");
904 	case CT_DEV_EV_DEGRADED:
905 		return ("DEGRADED");
906 	case CT_DEV_EV_OFFLINE:
907 		return ("OFFLINE");
908 	default:
909 		return ("UNKNOWN");
910 	}
911 }
912 
913 /*
914  * Routine that determines if a particular CT_DEV_EV_? event corresponds to a
915  * synchronous state change or not.
916  */
917 static int
918 is_sync_neg(uint_t old, uint_t new)
919 {
920 	int	i;
921 
922 	ASSERT(old & CT_DEV_ALLEVENT);
923 	ASSERT(new & CT_DEV_ALLEVENT);
924 
925 	if (old == new) {
926 		CT_DEBUG((CE_WARN, "is_sync_neg: transition to same state: %s",
927 		    state_str(new)));
928 		return (-2);
929 	}
930 
931 	for (i = 0; ct_dev_negtable[i].st_new != 0; i++) {
932 		if (old == ct_dev_negtable[i].st_old &&
933 		    new == ct_dev_negtable[i].st_new) {
934 			return (ct_dev_negtable[i].st_neg);
935 		}
936 	}
937 
938 	CT_DEBUG((CE_WARN, "is_sync_neg: Unsupported state transition: "
939 	    "old = %s -> new = %s", state_str(old), state_str(new)));
940 
941 	return (-1);
942 }
943 
944 /*
945  * Used to cleanup cached dv_nodes so that when a device is released by
946  * a contract holder, its devinfo node can be successfully detached.
947  */
948 static int
949 contract_device_dvclean(dev_info_t *dip)
950 {
951 	char		*devnm;
952 	dev_info_t	*pdip;
953 	int		error;
954 
955 	ASSERT(dip);
956 
957 	/* pdip can be NULL if we have contracts against the root dip */
958 	pdip = ddi_get_parent(dip);
959 
960 	if (pdip && DEVI_BUSY_OWNED(pdip) || !pdip && DEVI_BUSY_OWNED(dip)) {
961 		char		*path;
962 
963 		path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
964 		(void) ddi_pathname(dip, path);
965 		CT_DEBUG((CE_WARN, "ct_dv_clean: Parent node is busy owned, "
966 		    "device=%s", path));
967 		kmem_free(path, MAXPATHLEN);
968 		return (EDEADLOCK);
969 	}
970 
971 	if (pdip) {
972 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
973 		(void) ddi_deviname(dip, devnm);
974 		error = devfs_clean(pdip, devnm + 1, DV_CLEAN_FORCE);
975 		kmem_free(devnm, MAXNAMELEN + 1);
976 	} else {
977 		error = devfs_clean(dip, NULL, DV_CLEAN_FORCE);
978 	}
979 
980 	return (error);
981 }
982 
983 /*
984  * Endpoint of a ct_ctl_ack() or ct_ctl_nack() call from userland.
985  * Results in the ACK or NACK being recorded on the dip for one particular
986  * contract. The device contracts framework evaluates the ACK/NACKs for all
987  * contracts against a device to determine if a particular device state change
988  * should be allowed.
989  */
990 static int
991 contract_device_ack_nack(contract_t *ct, uint_t evtype, uint64_t evid,
992     uint_t cmd)
993 {
994 	cont_device_t *ctd = ct->ct_data;
995 	dev_info_t *dip;
996 	ctid_t	ctid;
997 	int error;
998 
999 	ctid = ct->ct_id;
1000 
1001 	CT_DEBUG((CE_NOTE, "ack_nack: entered: ctid %d", ctid));
1002 
1003 	mutex_enter(&ct->ct_lock);
1004 	CT_DEBUG((CE_NOTE, "ack_nack: contract lock acquired: %d", ctid));
1005 
1006 	dip = ctd->cond_dip;
1007 
1008 	ASSERT(ctd->cond_minor);
1009 	ASSERT(strlen(ctd->cond_minor) < MAXPATHLEN);
1010 
1011 	/*
1012 	 * Negotiation only if new state is not in A-set
1013 	 */
1014 	ASSERT(!(ctd->cond_aset & evtype));
1015 
1016 	/*
1017 	 * Negotiation only if transition is synchronous
1018 	 */
1019 	ASSERT(is_sync_neg(ctd->cond_state, evtype));
1020 
1021 	/*
1022 	 * We shouldn't be negotiating if the "noneg" flag is set
1023 	 */
1024 	ASSERT(!ctd->cond_noneg);
1025 
1026 	if (dip)
1027 		ndi_hold_devi(dip);
1028 
1029 	mutex_exit(&ct->ct_lock);
1030 
1031 	/*
1032 	 * dv_clean only if !NACK and offline state change
1033 	 */
1034 	if (cmd != CT_NACK && evtype == CT_DEV_EV_OFFLINE && dip) {
1035 		CT_DEBUG((CE_NOTE, "ack_nack: dv_clean: %d", ctid));
1036 		error = contract_device_dvclean(dip);
1037 		if (error != 0) {
1038 			CT_DEBUG((CE_NOTE, "ack_nack: dv_clean: failed: %d",
1039 			    ctid));
1040 			ddi_release_devi(dip);
1041 		}
1042 	}
1043 
1044 	mutex_enter(&ct->ct_lock);
1045 
1046 	if (dip)
1047 		ddi_release_devi(dip);
1048 
1049 	if (dip == NULL) {
1050 		if (ctd->cond_currev_id != evid) {
1051 			CT_DEBUG((CE_WARN, "%sACK for non-current event "
1052 			    "(type=%s, id=%llu) on removed device",
1053 			    cmd == CT_NACK ? "N" : "",
1054 			    state_str(evtype), (unsigned long long)evid));
1055 			CT_DEBUG((CE_NOTE, "ack_nack: error: ESRCH, ctid: %d",
1056 			    ctid));
1057 		} else {
1058 			ASSERT(ctd->cond_currev_type == evtype);
1059 			CT_DEBUG((CE_WARN, "contract_ack: no such device: "
1060 			    "ctid: %d", ctid));
1061 		}
1062 		error = (ct->ct_state == CTS_DEAD) ? ESRCH :
1063 		    ((cmd == CT_NACK) ? ETIMEDOUT : 0);
1064 		mutex_exit(&ct->ct_lock);
1065 		return (error);
1066 	}
1067 
1068 	/*
1069 	 * Must follow lock order: devi_ct_lock -> ct_count barrier - >ct_lock
1070 	 */
1071 	mutex_exit(&ct->ct_lock);
1072 
1073 	mutex_enter(&DEVI(dip)->devi_ct_lock);
1074 	mutex_enter(&ct->ct_lock);
1075 	if (ctd->cond_currev_id != evid) {
1076 		char *buf;
1077 		mutex_exit(&ct->ct_lock);
1078 		mutex_exit(&DEVI(dip)->devi_ct_lock);
1079 		ndi_hold_devi(dip);
1080 		buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1081 		(void) ddi_pathname(dip, buf);
1082 		ddi_release_devi(dip);
1083 		CT_DEBUG((CE_WARN, "%sACK for non-current event"
1084 		    "(type=%s, id=%llu) on device %s",
1085 		    cmd == CT_NACK ? "N" : "",
1086 		    state_str(evtype), (unsigned long long)evid, buf));
1087 		kmem_free(buf, MAXPATHLEN);
1088 		CT_DEBUG((CE_NOTE, "ack_nack: error: %d, ctid: %d",
1089 		    cmd == CT_NACK ? ETIMEDOUT : 0, ctid));
1090 		return (cmd == CT_ACK ? 0 : ETIMEDOUT);
1091 	}
1092 
1093 	ASSERT(ctd->cond_currev_type == evtype);
1094 	ASSERT(cmd == CT_ACK || cmd == CT_NACK);
1095 
1096 	CT_DEBUG((CE_NOTE, "ack_nack: setting %sACK for ctid: %d",
1097 	    cmd == CT_NACK ? "N" : "", ctid));
1098 
1099 	ctd->cond_currev_ack = cmd;
1100 	mutex_exit(&ct->ct_lock);
1101 
1102 	ct_barrier_decr(dip);
1103 	mutex_exit(&DEVI(dip)->devi_ct_lock);
1104 
1105 	CT_DEBUG((CE_NOTE, "ack_nack: normal exit: ctid: %d", ctid));
1106 
1107 	return (0);
1108 }
1109 
1110 /*
1111  * Invoked when a userland contract holder approves (i.e. ACKs) a state change
1112  */
1113 static int
1114 contract_device_ack(contract_t *ct, uint_t evtype, uint64_t evid)
1115 {
1116 	return (contract_device_ack_nack(ct, evtype, evid, CT_ACK));
1117 }
1118 
1119 /*
1120  * Invoked when a userland contract holder blocks (i.e. NACKs) a state change
1121  */
1122 static int
1123 contract_device_nack(contract_t *ct, uint_t evtype, uint64_t evid)
1124 {
1125 	return (contract_device_ack_nack(ct, evtype, evid, CT_NACK));
1126 }
1127 
1128 /*
1129  * Creates a new contract synchronously with the breaking of an existing
1130  * contract. Currently not supported.
1131  */
1132 /*ARGSUSED*/
1133 static int
1134 contract_device_newct(contract_t *ct)
1135 {
1136 	return (ENOTSUP);
1137 }
1138 
1139 /*
1140  * Core device contract implementation entry points
1141  */
1142 static contops_t contract_device_ops = {
1143 	contract_device_free,		/* contop_free */
1144 	contract_device_abandon,	/* contop_abandon */
1145 	contract_device_destroy,	/* contop_destroy */
1146 	contract_device_status,		/* contop_status */
1147 	contract_device_ack,		/* contop_ack */
1148 	contract_device_nack,		/* contop_nack */
1149 	contract_qack_notsup,		/* contop_qack */
1150 	contract_device_newct		/* contop_newct */
1151 };
1152 
1153 /*
1154  * contract_device_init
1155  *
1156  * Initializes the device contract type.
1157  */
1158 void
1159 contract_device_init(void)
1160 {
1161 	device_type = contract_type_init(CTT_DEVICE, "device",
1162 	    &contract_device_ops, contract_device_default);
1163 }
1164 
1165 /*
1166  * contract_device_create
1167  *
1168  * create a device contract given template "tmpl" and the "owner" process.
1169  * May fail and return NULL if project.max-contracts would have been exceeded.
1170  *
1171  * Common device contract creation routine called for both open-time and
1172  * non-open time device contract creation
1173  */
1174 static cont_device_t *
1175 contract_device_create(ctmpl_device_t *dtmpl, dev_t dev, int spec_type,
1176     proc_t *owner, int *errorp)
1177 {
1178 	cont_device_t *ctd;
1179 	char *minor;
1180 	char *path;
1181 	dev_info_t *dip;
1182 
1183 	ASSERT(dtmpl != NULL);
1184 	ASSERT(dev != NODEV && dev != DDI_DEV_T_ANY && dev != DDI_DEV_T_NONE);
1185 	ASSERT(spec_type == S_IFCHR || spec_type == S_IFBLK);
1186 	ASSERT(errorp);
1187 
1188 	*errorp = 0;
1189 
1190 	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1191 
1192 	mutex_enter(&dtmpl->ctd_ctmpl.ctmpl_lock);
1193 	ASSERT(strlen(dtmpl->ctd_minor) < MAXPATHLEN);
1194 	bcopy(dtmpl->ctd_minor, path, strlen(dtmpl->ctd_minor) + 1);
1195 	mutex_exit(&dtmpl->ctd_ctmpl.ctmpl_lock);
1196 
1197 	dip = e_ddi_hold_devi_by_path(path, 0);
1198 	if (dip == NULL) {
1199 		cmn_err(CE_WARN, "contract_create: Cannot find devinfo node "
1200 		    "for device path (%s)", path);
1201 		kmem_free(path, MAXPATHLEN);
1202 		*errorp = ERANGE;
1203 		return (NULL);
1204 	}
1205 
1206 	/*
1207 	 * Lock out any parallel contract negotiations
1208 	 */
1209 	mutex_enter(&(DEVI(dip)->devi_ct_lock));
1210 	ct_barrier_acquire(dip);
1211 	mutex_exit(&(DEVI(dip)->devi_ct_lock));
1212 
1213 	minor = i_ddi_strdup(path, KM_SLEEP);
1214 	kmem_free(path, MAXPATHLEN);
1215 
1216 	(void) contract_type_pbundle(device_type, owner);
1217 
1218 	ctd = kmem_zalloc(sizeof (cont_device_t), KM_SLEEP);
1219 
1220 	/*
1221 	 * Only we hold a refernce to this contract. Safe to access
1222 	 * the fields without a ct_lock
1223 	 */
1224 	ctd->cond_minor = minor;
1225 	/*
1226 	 * It is safe to set the dip pointer in the contract
1227 	 * as the contract will always be destroyed before the dip
1228 	 * is released
1229 	 */
1230 	ctd->cond_dip = dip;
1231 	ctd->cond_devt = dev;
1232 	ctd->cond_spec = spec_type;
1233 
1234 	/*
1235 	 * Since we are able to lookup the device, it is either
1236 	 * online or degraded
1237 	 */
1238 	ctd->cond_state = DEVI_IS_DEVICE_DEGRADED(dip) ?
1239 	    CT_DEV_EV_DEGRADED : CT_DEV_EV_ONLINE;
1240 
1241 	mutex_enter(&dtmpl->ctd_ctmpl.ctmpl_lock);
1242 	ctd->cond_aset = dtmpl->ctd_aset;
1243 	ctd->cond_noneg = dtmpl->ctd_noneg;
1244 
1245 	/*
1246 	 * contract_ctor() initailizes the common portion of a contract
1247 	 * contract_dtor() destroys the common portion of a contract
1248 	 */
1249 	if (contract_ctor(&ctd->cond_contract, device_type, &dtmpl->ctd_ctmpl,
1250 	    ctd, 0, owner, B_TRUE)) {
1251 		mutex_exit(&dtmpl->ctd_ctmpl.ctmpl_lock);
1252 		/*
1253 		 * contract_device_free() destroys the type specific
1254 		 * portion of a contract and frees the contract.
1255 		 * The "minor" path and "cred" is a part of the type specific
1256 		 * portion of the contract and will be freed by
1257 		 * contract_device_free()
1258 		 */
1259 		contract_device_free(&ctd->cond_contract);
1260 
1261 		/* release barrier */
1262 		mutex_enter(&(DEVI(dip)->devi_ct_lock));
1263 		ct_barrier_release(dip);
1264 		mutex_exit(&(DEVI(dip)->devi_ct_lock));
1265 
1266 		ddi_release_devi(dip);
1267 		*errorp = EAGAIN;
1268 		return (NULL);
1269 	}
1270 	mutex_exit(&dtmpl->ctd_ctmpl.ctmpl_lock);
1271 
1272 	mutex_enter(&ctd->cond_contract.ct_lock);
1273 	ctd->cond_contract.ct_ntime.ctm_total = CT_DEV_ACKTIME;
1274 	ctd->cond_contract.ct_qtime.ctm_total = CT_DEV_ACKTIME;
1275 	ctd->cond_contract.ct_ntime.ctm_start = -1;
1276 	ctd->cond_contract.ct_qtime.ctm_start = -1;
1277 	mutex_exit(&ctd->cond_contract.ct_lock);
1278 
1279 	/*
1280 	 * Insert device contract into list hanging off the dip
1281 	 * Bump up the ref-count on the contract to reflect this
1282 	 */
1283 	contract_hold(&ctd->cond_contract);
1284 	mutex_enter(&(DEVI(dip)->devi_ct_lock));
1285 	list_insert_tail(&(DEVI(dip)->devi_ct), ctd);
1286 
1287 	/* release barrier */
1288 	ct_barrier_release(dip);
1289 	mutex_exit(&(DEVI(dip)->devi_ct_lock));
1290 
1291 	ddi_release_devi(dip);
1292 
1293 	return (ctd);
1294 }
1295 
1296 /*
1297  * Called when a device is successfully opened to create an open-time contract
1298  * i.e. synchronously with a device open.
1299  */
1300 int
1301 contract_device_open(dev_t dev, int spec_type, contract_t **ctpp)
1302 {
1303 	ctmpl_device_t *dtmpl;
1304 	ct_template_t  *tmpl;
1305 	cont_device_t *ctd;
1306 	char *path;
1307 	klwp_t *lwp;
1308 	int error;
1309 
1310 	if (ctpp)
1311 		*ctpp = NULL;
1312 
1313 	/*
1314 	 * Check if we are in user-context i.e. if we have an lwp
1315 	 */
1316 	lwp = ttolwp(curthread);
1317 	if (lwp == NULL) {
1318 		CT_DEBUG((CE_NOTE, "contract_open: Not user-context"));
1319 		return (0);
1320 	}
1321 
1322 	tmpl = ctmpl_dup(lwp->lwp_ct_active[device_type->ct_type_index]);
1323 	if (tmpl == NULL) {
1324 		return (0);
1325 	}
1326 	dtmpl = tmpl->ctmpl_data;
1327 
1328 	/*
1329 	 * If the user set a minor path in the template before an open,
1330 	 * ignore it. We use the minor path of the actual minor opened.
1331 	 */
1332 	mutex_enter(&tmpl->ctmpl_lock);
1333 	if (dtmpl->ctd_minor != NULL) {
1334 		CT_DEBUG((CE_NOTE, "contract_device_open(): Process %d: "
1335 		    "ignoring device minor path in active template: %s",
1336 		    curproc->p_pid, dtmpl->ctd_minor));
1337 		/*
1338 		 * This is a copy of the actual activated template.
1339 		 * Safe to make changes such as freeing the minor
1340 		 * path in the template.
1341 		 */
1342 		kmem_free(dtmpl->ctd_minor, strlen(dtmpl->ctd_minor) + 1);
1343 		dtmpl->ctd_minor = NULL;
1344 	}
1345 	mutex_exit(&tmpl->ctmpl_lock);
1346 
1347 	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1348 
1349 	if (ddi_dev_pathname(dev, spec_type, path) != DDI_SUCCESS) {
1350 		CT_DEBUG((CE_NOTE, "contract_device_open(): Failed to derive "
1351 		    "minor path from dev_t,spec {%lu, %d} for process (%d)",
1352 		    dev, spec_type, curproc->p_pid));
1353 		ctmpl_free(tmpl);
1354 		kmem_free(path, MAXPATHLEN);
1355 		return (1);
1356 	}
1357 
1358 	mutex_enter(&tmpl->ctmpl_lock);
1359 	ASSERT(dtmpl->ctd_minor == NULL);
1360 	dtmpl->ctd_minor = path;
1361 	mutex_exit(&tmpl->ctmpl_lock);
1362 
1363 	ctd = contract_device_create(dtmpl, dev, spec_type, curproc, &error);
1364 
1365 	mutex_enter(&tmpl->ctmpl_lock);
1366 	ASSERT(dtmpl->ctd_minor);
1367 	dtmpl->ctd_minor = NULL;
1368 	mutex_exit(&tmpl->ctmpl_lock);
1369 	ctmpl_free(tmpl);
1370 	kmem_free(path, MAXPATHLEN);
1371 
1372 	if (ctd == NULL) {
1373 		cmn_err(CE_NOTE, "contract_device_open(): Failed to "
1374 		    "create device contract for process (%d) holding "
1375 		    "device (devt = %lu, spec_type = %d)",
1376 		    curproc->p_pid, dev, spec_type);
1377 		return (1);
1378 	}
1379 
1380 	if (ctpp) {
1381 		mutex_enter(&ctd->cond_contract.ct_lock);
1382 		*ctpp = &ctd->cond_contract;
1383 		mutex_exit(&ctd->cond_contract.ct_lock);
1384 	}
1385 	return (0);
1386 }
1387 
1388 /*
1389  * Called during contract negotiation by the device contract framework to wait
1390  * for ACKs or NACKs from contract holders. If all responses are not received
1391  * before a specified timeout, this routine times out.
1392  */
1393 static uint_t
1394 wait_for_acks(dev_info_t *dip, dev_t dev, int spec_type, uint_t evtype)
1395 {
1396 	cont_device_t *ctd;
1397 	int timed_out = 0;
1398 	int result = CT_NONE;
1399 	int ack;
1400 	char *f = "wait_for_acks";
1401 
1402 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
1403 	ASSERT(dip);
1404 	ASSERT(evtype & CT_DEV_ALLEVENT);
1405 	ASSERT(dev != NODEV && dev != DDI_DEV_T_NONE);
1406 	ASSERT((dev == DDI_DEV_T_ANY && spec_type == 0) ||
1407 	    (spec_type == S_IFBLK || spec_type == S_IFCHR));
1408 
1409 	CT_DEBUG((CE_NOTE, "%s: entered: dip: %p", f, (void *)dip));
1410 
1411 	if (ct_barrier_wait_for_empty(dip, CT_DEV_ACKTIME) == -1) {
1412 		/*
1413 		 * some contract owner(s) didn't respond in time
1414 		 */
1415 		CT_DEBUG((CE_NOTE, "%s: timed out: %p", f, (void *)dip));
1416 		timed_out = 1;
1417 	}
1418 
1419 	ack = 0;
1420 	for (ctd = list_head(&(DEVI(dip)->devi_ct)); ctd != NULL;
1421 	    ctd = list_next(&(DEVI(dip)->devi_ct), ctd)) {
1422 
1423 		mutex_enter(&ctd->cond_contract.ct_lock);
1424 
1425 		ASSERT(ctd->cond_dip == dip);
1426 
1427 		if (dev != DDI_DEV_T_ANY && dev != ctd->cond_devt) {
1428 			mutex_exit(&ctd->cond_contract.ct_lock);
1429 			continue;
1430 		}
1431 		if (dev != DDI_DEV_T_ANY && spec_type != ctd->cond_spec) {
1432 			mutex_exit(&ctd->cond_contract.ct_lock);
1433 			continue;
1434 		}
1435 
1436 		/* skip if non-negotiable contract */
1437 		if (ctd->cond_noneg) {
1438 			mutex_exit(&ctd->cond_contract.ct_lock);
1439 			continue;
1440 		}
1441 
1442 		ASSERT(ctd->cond_currev_type == evtype);
1443 		if (ctd->cond_currev_ack == CT_NACK) {
1444 			CT_DEBUG((CE_NOTE, "%s: found a NACK,result = NACK: %p",
1445 			    f, (void *)dip));
1446 			mutex_exit(&ctd->cond_contract.ct_lock);
1447 			return (CT_NACK);
1448 		} else if (ctd->cond_currev_ack == CT_ACK) {
1449 			ack = 1;
1450 			CT_DEBUG((CE_NOTE, "%s: found a ACK: %p",
1451 			    f, (void *)dip));
1452 		}
1453 		mutex_exit(&ctd->cond_contract.ct_lock);
1454 	}
1455 
1456 	if (ack) {
1457 		result = CT_ACK;
1458 		CT_DEBUG((CE_NOTE, "%s: result = ACK, dip=%p", f, (void *)dip));
1459 	} else if (timed_out) {
1460 		result = CT_NONE;
1461 		CT_DEBUG((CE_NOTE, "%s: result = NONE (timed-out), dip=%p",
1462 		    f, (void *)dip));
1463 	} else {
1464 		CT_DEBUG((CE_NOTE, "%s: result = NONE, dip=%p",
1465 		    f, (void *)dip));
1466 	}
1467 
1468 
1469 	return (result);
1470 }
1471 
1472 /*
1473  * Determines the current state of a device (i.e a devinfo node
1474  */
1475 static int
1476 get_state(dev_info_t *dip)
1477 {
1478 	if (DEVI_IS_DEVICE_OFFLINE(dip) || DEVI_IS_DEVICE_DOWN(dip))
1479 		return (CT_DEV_EV_OFFLINE);
1480 	else if (DEVI_IS_DEVICE_DEGRADED(dip))
1481 		return (CT_DEV_EV_DEGRADED);
1482 	else
1483 		return (CT_DEV_EV_ONLINE);
1484 }
1485 
1486 /*
1487  * Sets the current state of a device in a device contract
1488  */
1489 static void
1490 set_cond_state(dev_info_t *dip)
1491 {
1492 	uint_t state = get_state(dip);
1493 	cont_device_t *ctd;
1494 
1495 	/* verify that barrier is held */
1496 	ASSERT(ct_barrier_held(dip));
1497 
1498 	for (ctd = list_head(&(DEVI(dip)->devi_ct)); ctd != NULL;
1499 	    ctd = list_next(&(DEVI(dip)->devi_ct), ctd)) {
1500 		mutex_enter(&ctd->cond_contract.ct_lock);
1501 		ASSERT(ctd->cond_dip == dip);
1502 		ctd->cond_state = state;
1503 		mutex_exit(&ctd->cond_contract.ct_lock);
1504 	}
1505 }
1506 
1507 /*
1508  * Core routine called by event-specific routines when an event occurs.
1509  * Determines if an event should be published, and if it is to be
1510  * published, whether a negotiation should take place. Also implements
1511  * NEGEND events which publish the final disposition of an event after
1512  * negotiations are complete.
1513  *
1514  * When an event occurs on a minor node, this routine walks the list of
1515  * contracts hanging off a devinfo node and for each contract on the affected
1516  * dip, evaluates the following cases
1517  *
1518  *	a. an event that is synchronous, breaks the contract and NONEG not set
1519  *		- bumps up the outstanding negotiation counts on the dip
1520  *		- marks the dip as undergoing negotiation (devi_ct_neg)
1521  *		- event of type CTE_NEG is published
1522  *	b. an event that is synchronous, breaks the contract and NONEG is set
1523  *		- sets the final result to CT_NACK, event is blocked
1524  *		- does not publish an event
1525  *	c. event is asynchronous and breaks the contract
1526  *		- publishes a critical event irrespective of whether the NONEG
1527  *		  flag is set, since the contract will be broken and contract
1528  *		  owner needs to be informed.
1529  *	d. No contract breakage but the owner has subscribed to the event
1530  *		- publishes the event irrespective of the NONEG event as the
1531  *		  owner has explicitly subscribed to the event.
1532  *	e. NEGEND event
1533  *		- publishes a critical event. Should only be doing this
1534  *		  if NONEG is not set.
1535  *	f. all other events
1536  *		- Since a contract is not broken and this event has not been
1537  *		  subscribed to, this event does not need to be published
1538  *		  for this contract.
1539  *
1540  *	Once an event is published, what happens next depends on the type of
1541  *	event:
1542  *
1543  *	a. NEGEND event
1544  *		- cleanup all state associated with the preceding negotiation
1545  *		  and return CT_ACK to the caller of contract_device_publish()
1546  *	b. NACKed event
1547  *		- One or more contracts had the NONEG term, so the event was
1548  *		  blocked. Return CT_NACK to the caller.
1549  *	c. Negotiated event
1550  *		- Call wait_for_acks() to wait for responses from contract
1551  *		holders. The end result is either CT_ACK (event is permitted),
1552  *		CT_NACK (event is blocked) or CT_NONE (no contract owner)
1553  *		responded. This result is returned back to the caller.
1554  *	d. All other events
1555  *		- If the event was asynchronous (i.e. not negotiated) or
1556  *		a contract was not broken return CT_ACK to the caller.
1557  */
1558 static uint_t
1559 contract_device_publish(dev_info_t *dip, dev_t dev, int spec_type,
1560     uint_t evtype, nvlist_t *tnvl)
1561 {
1562 	cont_device_t *ctd;
1563 	uint_t result = CT_NONE;
1564 	uint64_t evid = 0;
1565 	uint64_t nevid = 0;
1566 	char *path = NULL;
1567 	int negend;
1568 	int match;
1569 	int sync = 0;
1570 	contract_t *ct;
1571 	ct_kevent_t *event;
1572 	nvlist_t *nvl;
1573 	int broken = 0;
1574 
1575 	ASSERT(dip);
1576 	ASSERT(dev != NODEV && dev != DDI_DEV_T_NONE);
1577 	ASSERT((dev == DDI_DEV_T_ANY && spec_type == 0) ||
1578 	    (spec_type == S_IFBLK || spec_type == S_IFCHR));
1579 	ASSERT(evtype == 0 || (evtype & CT_DEV_ALLEVENT));
1580 
1581 	/* Is this a synchronous state change ? */
1582 	if (evtype != CT_EV_NEGEND) {
1583 		sync = is_sync_neg(get_state(dip), evtype);
1584 		/* NOP if unsupported transition */
1585 		if (sync == -2 || sync == -1) {
1586 			DEVI(dip)->devi_flags |= DEVI_CT_NOP;
1587 			result = (sync == -2) ? CT_ACK : CT_NONE;
1588 			goto out;
1589 		}
1590 		CT_DEBUG((CE_NOTE, "publish: is%s sync state change",
1591 		    sync ? "" : " not"));
1592 	} else if (DEVI(dip)->devi_flags & DEVI_CT_NOP) {
1593 		DEVI(dip)->devi_flags &= ~DEVI_CT_NOP;
1594 		result = CT_ACK;
1595 		goto out;
1596 	}
1597 
1598 	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1599 	(void) ddi_pathname(dip, path);
1600 
1601 	mutex_enter(&(DEVI(dip)->devi_ct_lock));
1602 
1603 	/*
1604 	 * Negotiation end - set the state of the device in the contract
1605 	 */
1606 	if (evtype == CT_EV_NEGEND) {
1607 		CT_DEBUG((CE_NOTE, "publish: negend: setting cond state"));
1608 		set_cond_state(dip);
1609 	}
1610 
1611 	/*
1612 	 * If this device didn't go through negotiation, don't publish
1613 	 * a NEGEND event - simply release the barrier to allow other
1614 	 * device events in.
1615 	 */
1616 	negend = 0;
1617 	if (evtype == CT_EV_NEGEND && !DEVI(dip)->devi_ct_neg) {
1618 		CT_DEBUG((CE_NOTE, "publish: no negend reqd. release barrier"));
1619 		ct_barrier_release(dip);
1620 		mutex_exit(&(DEVI(dip)->devi_ct_lock));
1621 		result = CT_ACK;
1622 		goto out;
1623 	} else if (evtype == CT_EV_NEGEND) {
1624 		/*
1625 		 * There are negotiated contract breakages that
1626 		 * need a NEGEND event
1627 		 */
1628 		ASSERT(ct_barrier_held(dip));
1629 		negend = 1;
1630 		CT_DEBUG((CE_NOTE, "publish: setting negend flag"));
1631 	} else {
1632 		/*
1633 		 * This is a new event, not a NEGEND event. Wait for previous
1634 		 * contract events to complete.
1635 		 */
1636 		ct_barrier_acquire(dip);
1637 	}
1638 
1639 
1640 	match = 0;
1641 	for (ctd = list_head(&(DEVI(dip)->devi_ct)); ctd != NULL;
1642 	    ctd = list_next(&(DEVI(dip)->devi_ct), ctd)) {
1643 
1644 		ctid_t ctid;
1645 		size_t len = strlen(path);
1646 
1647 		mutex_enter(&ctd->cond_contract.ct_lock);
1648 
1649 		ASSERT(ctd->cond_dip == dip);
1650 		ASSERT(ctd->cond_minor);
1651 		ASSERT(strncmp(ctd->cond_minor, path, len) == 0 &&
1652 		    ctd->cond_minor[len] == ':');
1653 
1654 		if (dev != DDI_DEV_T_ANY && dev != ctd->cond_devt) {
1655 			mutex_exit(&ctd->cond_contract.ct_lock);
1656 			continue;
1657 		}
1658 		if (dev != DDI_DEV_T_ANY && spec_type != ctd->cond_spec) {
1659 			mutex_exit(&ctd->cond_contract.ct_lock);
1660 			continue;
1661 		}
1662 
1663 		/* We have a matching contract */
1664 		match = 1;
1665 		ctid = ctd->cond_contract.ct_id;
1666 		CT_DEBUG((CE_NOTE, "publish: found matching contract: %d",
1667 		    ctid));
1668 
1669 		/*
1670 		 * There are 4 possible cases
1671 		 * 1. A contract is broken (dev not in acceptable state) and
1672 		 *    the state change is synchronous - start negotiation
1673 		 *    by sending a CTE_NEG critical event.
1674 		 * 2. A contract is broken and the state change is
1675 		 *    asynchronous - just send a critical event and
1676 		 *    break the contract.
1677 		 * 3. Contract is not broken, but consumer has subscribed
1678 		 *    to the event as a critical or informative event
1679 		 *    - just send the appropriate event
1680 		 * 4. contract waiting for negend event - just send the critical
1681 		 *    NEGEND event.
1682 		 */
1683 		broken = 0;
1684 		if (!negend && !(evtype & ctd->cond_aset)) {
1685 			broken = 1;
1686 			CT_DEBUG((CE_NOTE, "publish: Contract broken: %d",
1687 			    ctid));
1688 		}
1689 
1690 		/*
1691 		 * Don't send event if
1692 		 *	- contract is not broken AND
1693 		 *	- contract holder has not subscribed to this event AND
1694 		 *	- contract not waiting for a NEGEND event
1695 		 */
1696 		if (!broken && !EVSENDP(ctd, evtype) &&
1697 		    !ctd->cond_neg) {
1698 			CT_DEBUG((CE_NOTE, "contract_device_publish(): "
1699 			    "contract (%d): no publish reqd: event %d",
1700 			    ctd->cond_contract.ct_id, evtype));
1701 			mutex_exit(&ctd->cond_contract.ct_lock);
1702 			continue;
1703 		}
1704 
1705 		/*
1706 		 * Note: need to kmem_zalloc() the event so mutexes are
1707 		 * initialized automatically
1708 		 */
1709 		ct = &ctd->cond_contract;
1710 		event = kmem_zalloc(sizeof (ct_kevent_t), KM_SLEEP);
1711 		event->cte_type = evtype;
1712 
1713 		if (broken && sync) {
1714 			CT_DEBUG((CE_NOTE, "publish: broken + sync: "
1715 			    "ctid: %d", ctid));
1716 			ASSERT(!negend);
1717 			ASSERT(ctd->cond_currev_id == 0);
1718 			ASSERT(ctd->cond_currev_type == 0);
1719 			ASSERT(ctd->cond_currev_ack == 0);
1720 			ASSERT(ctd->cond_neg == 0);
1721 			if (ctd->cond_noneg) {
1722 				/* Nothing to publish. Event has been blocked */
1723 				CT_DEBUG((CE_NOTE, "publish: sync and noneg:"
1724 				    "not publishing blocked ev: ctid: %d",
1725 				    ctid));
1726 				result = CT_NACK;
1727 				kmem_free(event, sizeof (ct_kevent_t));
1728 				mutex_exit(&ctd->cond_contract.ct_lock);
1729 				continue;
1730 			}
1731 			event->cte_flags = CTE_NEG; /* critical neg. event */
1732 			ctd->cond_currev_type = event->cte_type;
1733 			ct_barrier_incr(dip);
1734 			DEVI(dip)->devi_ct_neg = 1; /* waiting for negend */
1735 			ctd->cond_neg = 1;
1736 		} else if (broken && !sync) {
1737 			CT_DEBUG((CE_NOTE, "publish: broken + async: ctid: %d",
1738 			    ctid));
1739 			ASSERT(!negend);
1740 			ASSERT(ctd->cond_currev_id == 0);
1741 			ASSERT(ctd->cond_currev_type == 0);
1742 			ASSERT(ctd->cond_currev_ack == 0);
1743 			ASSERT(ctd->cond_neg == 0);
1744 			event->cte_flags = 0; /* critical event */
1745 		} else if (EVSENDP(ctd, event->cte_type)) {
1746 			CT_DEBUG((CE_NOTE, "publish: event suscrib: ctid: %d",
1747 			    ctid));
1748 			ASSERT(!negend);
1749 			ASSERT(ctd->cond_currev_id == 0);
1750 			ASSERT(ctd->cond_currev_type == 0);
1751 			ASSERT(ctd->cond_currev_ack == 0);
1752 			ASSERT(ctd->cond_neg == 0);
1753 			event->cte_flags = EVINFOP(ctd, event->cte_type) ?
1754 			    CTE_INFO : 0;
1755 		} else if (ctd->cond_neg) {
1756 			CT_DEBUG((CE_NOTE, "publish: NEGEND: ctid: %d", ctid));
1757 			ASSERT(negend);
1758 			ASSERT(ctd->cond_noneg == 0);
1759 			nevid = ctd->cond_contract.ct_nevent ?
1760 			    ctd->cond_contract.ct_nevent->cte_id : 0;
1761 			ASSERT(ctd->cond_currev_id == nevid);
1762 			event->cte_flags = 0;	/* NEGEND is always critical */
1763 			ctd->cond_currev_id = 0;
1764 			ctd->cond_currev_type = 0;
1765 			ctd->cond_currev_ack = 0;
1766 			ctd->cond_neg = 0;
1767 		} else {
1768 			CT_DEBUG((CE_NOTE, "publish: not publishing event for "
1769 			    "ctid: %d, evtype: %d",
1770 			    ctd->cond_contract.ct_id, event->cte_type));
1771 			ASSERT(!negend);
1772 			ASSERT(ctd->cond_currev_id == 0);
1773 			ASSERT(ctd->cond_currev_type == 0);
1774 			ASSERT(ctd->cond_currev_ack == 0);
1775 			ASSERT(ctd->cond_neg == 0);
1776 			kmem_free(event, sizeof (ct_kevent_t));
1777 			mutex_exit(&ctd->cond_contract.ct_lock);
1778 			continue;
1779 		}
1780 
1781 		nvl = NULL;
1782 		if (tnvl) {
1783 			VERIFY(nvlist_dup(tnvl, &nvl, 0) == 0);
1784 			if (negend) {
1785 				int32_t newct = 0;
1786 				ASSERT(ctd->cond_noneg == 0);
1787 				VERIFY(nvlist_add_uint64(nvl, CTS_NEVID, nevid)
1788 				    == 0);
1789 				VERIFY(nvlist_lookup_int32(nvl, CTS_NEWCT,
1790 				    &newct) == 0);
1791 				VERIFY(nvlist_add_int32(nvl, CTS_NEWCT,
1792 				    newct == 1 ? 0 :
1793 				    ctd->cond_contract.ct_id) == 0);
1794 				CT_DEBUG((CE_NOTE, "publish: negend: ctid: %d "
1795 				    "CTS_NEVID: %llu, CTS_NEWCT: %s",
1796 				    ctid, (unsigned long long)nevid,
1797 				    newct ? "success" : "failure"));
1798 
1799 			}
1800 		}
1801 
1802 		if (ctd->cond_neg) {
1803 			ASSERT(ctd->cond_contract.ct_ntime.ctm_start == -1);
1804 			ASSERT(ctd->cond_contract.ct_qtime.ctm_start == -1);
1805 			ctd->cond_contract.ct_ntime.ctm_start = ddi_get_lbolt();
1806 			ctd->cond_contract.ct_qtime.ctm_start =
1807 			    ctd->cond_contract.ct_ntime.ctm_start;
1808 		}
1809 
1810 		/*
1811 		 * by holding the dip's devi_ct_lock we ensure that
1812 		 * all ACK/NACKs are held up until we have finished
1813 		 * publishing to all contracts.
1814 		 */
1815 		mutex_exit(&ctd->cond_contract.ct_lock);
1816 		evid = cte_publish_all(ct, event, nvl, NULL);
1817 		mutex_enter(&ctd->cond_contract.ct_lock);
1818 
1819 		if (ctd->cond_neg) {
1820 			ASSERT(!negend);
1821 			ASSERT(broken);
1822 			ASSERT(sync);
1823 			ASSERT(!ctd->cond_noneg);
1824 			CT_DEBUG((CE_NOTE, "publish: sync break, setting evid"
1825 			    ": %d", ctid));
1826 			ctd->cond_currev_id = evid;
1827 		} else if (negend) {
1828 			ctd->cond_contract.ct_ntime.ctm_start = -1;
1829 			ctd->cond_contract.ct_qtime.ctm_start = -1;
1830 		}
1831 		mutex_exit(&ctd->cond_contract.ct_lock);
1832 	}
1833 
1834 	/*
1835 	 * If "negend" set counter back to initial state (-1) so that
1836 	 * other events can be published. Also clear the negotiation flag
1837 	 * on dip.
1838 	 *
1839 	 * 0 .. n are used for counting.
1840 	 * -1 indicates counter is available for use.
1841 	 */
1842 	if (negend) {
1843 		/*
1844 		 * devi_ct_count not necessarily 0. We may have
1845 		 * timed out in which case, count will be non-zero.
1846 		 */
1847 		ct_barrier_release(dip);
1848 		DEVI(dip)->devi_ct_neg = 0;
1849 		CT_DEBUG((CE_NOTE, "publish: negend: reset dip state: dip=%p",
1850 		    (void *)dip));
1851 	} else if (DEVI(dip)->devi_ct_neg) {
1852 		ASSERT(match);
1853 		ASSERT(!ct_barrier_empty(dip));
1854 		CT_DEBUG((CE_NOTE, "publish: sync count=%d, dip=%p",
1855 		    DEVI(dip)->devi_ct_count, (void *)dip));
1856 	} else {
1857 		/*
1858 		 * for non-negotiated events or subscribed events or no
1859 		 * matching contracts
1860 		 */
1861 		ASSERT(ct_barrier_empty(dip));
1862 		ASSERT(DEVI(dip)->devi_ct_neg == 0);
1863 		CT_DEBUG((CE_NOTE, "publish: async/non-nego/subscrib/no-match: "
1864 		    "dip=%p", (void *)dip));
1865 
1866 		/*
1867 		 * only this function when called from contract_device_negend()
1868 		 * can reset the counter to READY state i.e. -1. This function
1869 		 * is so called for every event whether a NEGEND event is needed
1870 		 * or not, but the negend event is only published if the event
1871 		 * whose end they signal is a negotiated event for the contract.
1872 		 */
1873 	}
1874 
1875 	if (!match) {
1876 		/* No matching contracts */
1877 		CT_DEBUG((CE_NOTE, "publish: No matching contract"));
1878 		result = CT_NONE;
1879 	} else if (result == CT_NACK) {
1880 		/* a non-negotiable contract exists and this is a neg. event */
1881 		CT_DEBUG((CE_NOTE, "publish: found 1 or more NONEG contract"));
1882 		(void) wait_for_acks(dip, dev, spec_type, evtype);
1883 	} else if (DEVI(dip)->devi_ct_neg) {
		/* one or more contracts going through negotiations */
1885 		CT_DEBUG((CE_NOTE, "publish: sync contract: waiting"));
1886 		result = wait_for_acks(dip, dev, spec_type, evtype);
1887 	} else {
1888 		/* no negotiated contracts or no broken contracts or NEGEND */
1889 		CT_DEBUG((CE_NOTE, "publish: async/no-break/negend"));
1890 		result = CT_ACK;
1891 	}
1892 
1893 	/*
1894 	 * Release the lock only now so that the only point where we
1895 	 * drop the lock is in wait_for_acks(). This is so that we don't
1896 	 * miss cv_signal/cv_broadcast from contract holders
1897 	 */
1898 	CT_DEBUG((CE_NOTE, "publish: dropping devi_ct_lock"));
1899 	mutex_exit(&(DEVI(dip)->devi_ct_lock));
1900 
1901 out:
1902 	if (tnvl)
1903 		nvlist_free(tnvl);
1904 	if (path)
1905 		kmem_free(path, MAXPATHLEN);
1906 
1907 
1908 	CT_DEBUG((CE_NOTE, "publish: result = %s", result_str(result)));
1909 	return (result);
1910 }
1911 
1912 
1913 /*
1914  * contract_device_offline
1915  *
1916  * Event publishing routine called by I/O framework when a device is offlined.
1917  */
1918 ct_ack_t
1919 contract_device_offline(dev_info_t *dip, dev_t dev, int spec_type)
1920 {
1921 	nvlist_t *nvl;
1922 	uint_t result;
1923 	uint_t evtype;
1924 
1925 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1926 
1927 	evtype = CT_DEV_EV_OFFLINE;
1928 	result = contract_device_publish(dip, dev, spec_type, evtype, nvl);
1929 
1930 	/*
1931 	 * If a contract offline is NACKED, the framework expects us to call
1932 	 * NEGEND ourselves, since we know the final result
1933 	 */
1934 	if (result == CT_NACK) {
1935 		contract_device_negend(dip, dev, spec_type, CT_EV_FAILURE);
1936 	}
1937 
1938 	return (result);
1939 }
1940 
1941 /*
1942  * contract_device_degrade
1943  *
1944  * Event publishing routine called by I/O framework when a device
1945  * moves to degrade state.
1946  */
1947 /*ARGSUSED*/
1948 void
1949 contract_device_degrade(dev_info_t *dip, dev_t dev, int spec_type)
1950 {
1951 	nvlist_t *nvl;
1952 	uint_t evtype;
1953 
1954 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1955 
1956 	evtype = CT_DEV_EV_DEGRADED;
1957 	(void) contract_device_publish(dip, dev, spec_type, evtype, nvl);
1958 }
1959 
1960 /*
1961  * contract_device_undegrade
1962  *
1963  * Event publishing routine called by I/O framework when a device
1964  * moves from degraded state to online state.
1965  */
1966 /*ARGSUSED*/
1967 void
1968 contract_device_undegrade(dev_info_t *dip, dev_t dev, int spec_type)
1969 {
1970 	nvlist_t *nvl;
1971 	uint_t evtype;
1972 
1973 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1974 
1975 	evtype = CT_DEV_EV_ONLINE;
1976 	(void) contract_device_publish(dip, dev, spec_type, evtype, nvl);
1977 }
1978 
1979 /*
1980  * For all contracts which have undergone a negotiation (because the device
1981  * moved out of the acceptable state for that contract and the state
1982  * change is synchronous i.e. requires negotiation) this routine publishes
1983  * a CT_EV_NEGEND event with the final disposition of the event.
1984  *
1985  * This event is always a critical event.
1986  */
1987 void
1988 contract_device_negend(dev_info_t *dip, dev_t dev, int spec_type, int result)
1989 {
1990 	nvlist_t *nvl;
1991 	uint_t evtype;
1992 
1993 	ASSERT(result == CT_EV_SUCCESS || result == CT_EV_FAILURE);
1994 
1995 	CT_DEBUG((CE_NOTE, "contract_device_negend(): entered: result: %d, "
1996 	    "dip: %p", result, (void *)dip));
1997 
1998 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1999 	VERIFY(nvlist_add_int32(nvl, CTS_NEWCT,
2000 	    result == CT_EV_SUCCESS ? 1 : 0) == 0);
2001 
2002 	evtype = CT_EV_NEGEND;
2003 	(void) contract_device_publish(dip, dev, spec_type, evtype, nvl);
2004 
2005 	CT_DEBUG((CE_NOTE, "contract_device_negend(): exit dip: %p",
2006 	    (void *)dip));
2007 }
2008 
2009 /*
2010  * Wrapper routine called by other subsystems (such as LDI) to start
2011  * negotiations when a synchronous device state change occurs.
2012  * Returns CT_ACK or CT_NACK.
2013  */
2014 ct_ack_t
2015 contract_device_negotiate(dev_info_t *dip, dev_t dev, int spec_type,
2016     uint_t evtype)
2017 {
2018 	int	result;
2019 
2020 	ASSERT(dip);
2021 	ASSERT(dev != NODEV);
2022 	ASSERT(dev != DDI_DEV_T_ANY);
2023 	ASSERT(dev != DDI_DEV_T_NONE);
2024 	ASSERT(spec_type == S_IFBLK || spec_type == S_IFCHR);
2025 
2026 	switch (evtype) {
2027 	case CT_DEV_EV_OFFLINE:
2028 		result = contract_device_offline(dip, dev, spec_type);
2029 		break;
2030 	default:
2031 		cmn_err(CE_PANIC, "contract_device_negotiate(): Negotiation "
2032 		    "not supported: event (%d) for dev_t (%lu) and spec (%d), "
2033 		    "dip (%p)", evtype, dev, spec_type, (void *)dip);
2034 		result = CT_NACK;
2035 		break;
2036 	}
2037 
2038 	return (result);
2039 }
2040 
2041 /*
2042  * A wrapper routine called by other subsystems (such as the LDI) to
2043  * finalize event processing for a state change event. For synchronous
2044  * state changes, this publishes NEGEND events. For asynchronous i.e.
2045  * non-negotiable events this publishes the event.
2046  */
2047 void
2048 contract_device_finalize(dev_info_t *dip, dev_t dev, int spec_type,
2049     uint_t evtype, int ct_result)
2050 {
2051 	ASSERT(dip);
2052 	ASSERT(dev != NODEV);
2053 	ASSERT(dev != DDI_DEV_T_ANY);
2054 	ASSERT(dev != DDI_DEV_T_NONE);
2055 	ASSERT(spec_type == S_IFBLK || spec_type == S_IFCHR);
2056 
2057 	switch (evtype) {
2058 	case CT_DEV_EV_OFFLINE:
2059 		contract_device_negend(dip, dev, spec_type, ct_result);
2060 		break;
2061 	case CT_DEV_EV_DEGRADED:
2062 		contract_device_degrade(dip, dev, spec_type);
2063 		contract_device_negend(dip, dev, spec_type, ct_result);
2064 		break;
2065 	case CT_DEV_EV_ONLINE:
2066 		contract_device_undegrade(dip, dev, spec_type);
2067 		contract_device_negend(dip, dev, spec_type, ct_result);
2068 		break;
2069 	default:
2070 		cmn_err(CE_PANIC, "contract_device_finalize(): Unsupported "
2071 		    "event (%d) for dev_t (%lu) and spec (%d), dip (%p)",
2072 		    evtype, dev, spec_type, (void *)dip);
2073 		break;
2074 	}
2075 }
2076 
2077 /*
2078  * Called by I/O framework when a devinfo node is freed to remove the
2079  * association between a devinfo node and its contracts.
2080  */
2081 void
2082 contract_device_remove_dip(dev_info_t *dip)
2083 {
2084 	cont_device_t *ctd;
2085 	cont_device_t *next;
2086 	contract_t *ct;
2087 
2088 	mutex_enter(&(DEVI(dip)->devi_ct_lock));
2089 	ct_barrier_wait_for_release(dip);
2090 
2091 	for (ctd = list_head(&(DEVI(dip)->devi_ct)); ctd != NULL; ctd = next) {
2092 		next = list_next(&(DEVI(dip)->devi_ct), ctd);
2093 		list_remove(&(DEVI(dip)->devi_ct), ctd);
2094 		ct = &ctd->cond_contract;
2095 		/*
2096 		 * Unlink the dip associated with this contract
2097 		 */
2098 		mutex_enter(&ct->ct_lock);
2099 		ASSERT(ctd->cond_dip == dip);
2100 		ctd->cond_dip = NULL; /* no longer linked to dip */
2101 		contract_rele(ct);	/* remove hold for dip linkage */
2102 		CT_DEBUG((CE_NOTE, "ct: remove_dip: removed dip from contract: "
2103 		    "ctid: %d", ct->ct_id));
2104 		mutex_exit(&ct->ct_lock);
2105 	}
2106 	ASSERT(list_is_empty(&(DEVI(dip)->devi_ct)));
2107 	mutex_exit(&(DEVI(dip)->devi_ct_lock));
2108 }
2109 
2110 /*
2111  * Barrier related routines
2112  */
2113 static void
2114 ct_barrier_acquire(dev_info_t *dip)
2115 {
2116 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2117 	CT_DEBUG((CE_NOTE, "ct_barrier_acquire: waiting for barrier"));
2118 	while (DEVI(dip)->devi_ct_count != -1)
2119 		cv_wait(&(DEVI(dip)->devi_ct_cv), &(DEVI(dip)->devi_ct_lock));
2120 	DEVI(dip)->devi_ct_count = 0;
2121 	CT_DEBUG((CE_NOTE, "ct_barrier_acquire: thread owns barrier"));
2122 }
2123 
2124 static void
2125 ct_barrier_release(dev_info_t *dip)
2126 {
2127 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2128 	ASSERT(DEVI(dip)->devi_ct_count != -1);
2129 	DEVI(dip)->devi_ct_count = -1;
2130 	cv_broadcast(&(DEVI(dip)->devi_ct_cv));
2131 	CT_DEBUG((CE_NOTE, "ct_barrier_release: Released barrier"));
2132 }
2133 
2134 static int
2135 ct_barrier_held(dev_info_t *dip)
2136 {
2137 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2138 	return (DEVI(dip)->devi_ct_count != -1);
2139 }
2140 
2141 static int
2142 ct_barrier_empty(dev_info_t *dip)
2143 {
2144 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2145 	ASSERT(DEVI(dip)->devi_ct_count != -1);
2146 	return (DEVI(dip)->devi_ct_count == 0);
2147 }
2148 
2149 static void
2150 ct_barrier_wait_for_release(dev_info_t *dip)
2151 {
2152 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2153 	while (DEVI(dip)->devi_ct_count != -1)
2154 		cv_wait(&(DEVI(dip)->devi_ct_cv), &(DEVI(dip)->devi_ct_lock));
2155 }
2156 
2157 static void
2158 ct_barrier_decr(dev_info_t *dip)
2159 {
2160 	CT_DEBUG((CE_NOTE, "barrier_decr:  ct_count before decr: %d",
2161 	    DEVI(dip)->devi_ct_count));
2162 
2163 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2164 	ASSERT(DEVI(dip)->devi_ct_count > 0);
2165 
2166 	DEVI(dip)->devi_ct_count--;
2167 	if (DEVI(dip)->devi_ct_count == 0) {
2168 		cv_broadcast(&DEVI(dip)->devi_ct_cv);
2169 		CT_DEBUG((CE_NOTE, "barrier_decr: cv_broadcast"));
2170 	}
2171 }
2172 
2173 static void
2174 ct_barrier_incr(dev_info_t *dip)
2175 {
2176 	ASSERT(ct_barrier_held(dip));
2177 	DEVI(dip)->devi_ct_count++;
2178 }
2179 
2180 static int
2181 ct_barrier_wait_for_empty(dev_info_t *dip, int secs)
2182 {
2183 	clock_t abstime;
2184 
2185 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2186 
2187 	abstime = ddi_get_lbolt() + drv_usectohz(secs*1000000);
2188 	while (DEVI(dip)->devi_ct_count) {
2189 		if (cv_timedwait(&(DEVI(dip)->devi_ct_cv),
2190 		    &(DEVI(dip)->devi_ct_lock), abstime) == -1) {
2191 			return (-1);
2192 		}
2193 	}
2194 	return (0);
2195 }
2196