xref: /illumos-gate/usr/src/uts/common/contract/device.c (revision 9b664393d4fdda96221e6ea9ea95790d3c15be70)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2019 Joyent, Inc.
24  */
25 
26 #include <sys/mutex.h>
27 #include <sys/debug.h>
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/kmem.h>
31 #include <sys/thread.h>
32 #include <sys/id_space.h>
33 #include <sys/avl.h>
34 #include <sys/list.h>
35 #include <sys/sysmacros.h>
36 #include <sys/proc.h>
37 #include <sys/contract.h>
38 #include <sys/contract_impl.h>
39 #include <sys/contract/device.h>
40 #include <sys/contract/device_impl.h>
41 #include <sys/cmn_err.h>
42 #include <sys/nvpair.h>
43 #include <sys/policy.h>
44 #include <sys/ddi_impldefs.h>
45 #include <sys/ddi_implfuncs.h>
46 #include <sys/systm.h>
47 #include <sys/stat.h>
48 #include <sys/sunddi.h>
49 #include <sys/esunddi.h>
50 #include <sys/ddi.h>
51 #include <sys/fs/dv_node.h>
52 #include <sys/sunndi.h>
53 #undef ct_lock	/* needed because clnt.h defines ct_lock as a macro */
54 
55 /*
56  * Device Contracts
57  * -----------------
58  * This file contains the core code for the device contracts framework.
59  * A device contract is an agreement or a contract between a process and
60  * the kernel regarding the state of the device. A device contract may be
61  * created when a relationship is formed between a device and a process
62  * i.e. at open(2) time, or it may be created at some point after the device
63  * has been opened. A device contract once formed may be broken by either party.
64  * A device contract can be broken by the process by an explicit abandon of the
65  * contract or by an implicit abandon when the process exits. A device contract
66  * can be broken by the kernel either asynchronously (without negotiation) or
67  * synchronously (with negotiation). Exactly which happens depends on the device
68  * state transition. The following state diagram shows the transitions between
69  * device states. Only device state transitions currently supported by device
70  * contracts are shown.
71  *
72  *                              <-- A -->
73  *                       /-----------------> DEGRADED
74  *                       |                      |
75  *                       |                      |
76  *                       |                      | S
77  *                       |                      | |
78  *                       |                      | v
79  *                       v       S -->          v
80  *                      ONLINE ------------> OFFLINE
81  *
82  *
83  * In the figure above, the arrows indicate the direction of transition. The
84  * letter S refers to transitions which are inherently synchronous i.e.
85  * require negotiation and the letter A indicates transitions which are
86  * asynchronous i.e. are done without contract negotiations. A good example
87  * of a synchronous transition is the ONLINE -> OFFLINE transition. This
88  * transition cannot happen as long as there are consumers which have the
89  * device open. Thus some form of negotiation needs to happen between the
90  * consumers and the kernel to ensure that consumers either close devices
91  * or disallow the move to OFFLINE. Certain other transitions such as
92  * ONLINE --> DEGRADED for example, are inherently asynchronous i.e.
93  * non-negotiable. A device that suffers a fault that degrades its
94  * capabilities will become degraded irrespective of what consumers it has,
95  * so a negotiation in this case is pointless.
96  *
97  * The following device states are currently defined for device contracts:
98  *
99  *      CT_DEV_EV_ONLINE
100  *              The device is online and functioning normally
101  *      CT_DEV_EV_DEGRADED
102  *              The device is online but is functioning in a degraded capacity
103  *      CT_DEV_EV_OFFLINE
104  *              The device is offline and is no longer configured
105  *
106  * A typical consumer of device contracts starts out with a contract
107  * template and adds terms to that template. These include the
108  * "acceptable set" (A-set) term, which is a bitset of device states which
109  * are guaranteed by the contract. If the device moves out of a state in
110  * the A-set, the contract is broken. The breaking of the contract can
111  * be asynchronous in which case a critical contract event is sent to the
112  * contract holder but no negotiations take place. If the breaking of the
113  * contract is synchronous, negotiations are opened between the affected
114  * consumer and the kernel. The kernel does this by sending a critical
115  * event to the consumer with the CTE_NEG flag set indicating that this
116  * is a negotiation event. The consumer can accept this change by sending
117  * an ACK message to the kernel. Alternatively, if it has the necessary
118  * privileges, it can send a NACK message to the kernel which will block
119  * the device state change. To NACK a negotiable event, a process must
120  * have the {PRIV_SYS_DEVICES} privilege asserted in its effective set.
121  *
122  * Other terms include the "minor path" term, specified explicitly if the
123  * contract is not being created at open(2) time or specified implicitly
124  * if the contract is being created at open time via an activated template.
125  *
126  * A contract event is sent on any state change to which the contract
127  * owner has subscribed via the informative or critical event sets. Only
128  * critical events are guaranteed to be delivered. Since all device state
129  * changes are controlled by the kernel and cannot be arbitrarily generated
130  * by a non-privileged user, the {PRIV_CONTRACT_EVENT} privilege does not
131  * need to be asserted in a process's effective set to designate an event as
132  * critical. To ensure privacy, a process must either have the same effective
133  * userid as the contract holder or have the {PRIV_CONTRACT_OBSERVER} privilege
134  * asserted in its effective set in order to observe device contract events
135  * off the device contract type specific endpoint.
136  *
137  * Yet another term available with device contracts is the "non-negotiable"
138  * term. This term is used to pre-specify a NACK to any contract negotiation.
139  * This term is ignored for asynchronous state changes. For example, a
140  * process may have the A-set {ONLINE|DEGRADED} and make the contract
141  * non-negotiable. In this case, the device contract framework assumes a
142  * NACK for any transition to OFFLINE and blocks the offline. If the A-set
143  * is {ONLINE} and the non-negotiable term is set, transitions to OFFLINE
144  * are NACKed but transitions to DEGRADE succeed.
145  *
146  * The OFFLINE negotiation (if OFFLINE state is not in the A-set for a contract)
147  * happens just before the I/O framework attempts to offline a device
148  * (i.e. detach a device and set the offline flag so that it cannot be
149  * reattached). A device contract holder is expected to either NACK the offline
150  * (if privileged) or release the device and allow the offline to proceed.
151  *
152  * The DEGRADE contract event (if DEGRADE is not in the A-set for a contract)
153  * is generated just before the I/O framework transitions the device state
154  * to "degraded" (i.e. DEVI_DEVICE_DEGRADED in I/O framework terminology).
155  *
156  * The contract holder is expected to ACK or NACK a negotiation event
157  * within a certain period of time. If the ACK/NACK is not received
158  * within the timeout period, the device contract framework will behave
159  * as if the contract does not exist and will proceed with the event.
160  *
161  * Unlike a process contract a device contract does not need to exist
162  * once it is abandoned, since it does not define a fault boundary. It
163  * merely represents an agreement between a process and the kernel
164  * regarding the state of the device. Once the process has abandoned
165  * the contract (either implicitly via a process exit or explicitly)
166  * the kernel has no reason to retain the contract. As a result
167  * device contracts are neither inheritable nor need to exist in an
168  * orphan state.
169  *
170  * A device unlike a process may exist in multiple contracts and has
171  * a "life" outside a device contract. A device unlike a process
172  * may exist without an associated contract. Unlike a process contract
173  * a device contract may be formed after a binding relationship is
174  * formed between a process and a device.
175  *
176  *	IMPLEMENTATION NOTES
177  *	====================
178  * DATA STRUCTURES
179  * ----------------
180  * 	The heart of the device contracts implementation is the device contract
181  * 	private cont_device_t (or ctd for short) data structure. It encapsulates
182  * 	the generic contract_t data structure and has a number of private
183  *	fields.
184  * 	These include:
185  *		cond_minor: The minor device that is the subject of the contract
186  *		cond_aset:  The bitset of states which are guaranteed by the
187  *			   contract
188  *		cond_noneg: If set, indicates that the result of negotiation has
189  *			    been predefined to be a NACK
190  * 	In addition, there are other device identifiers such the devinfo node,
191  * 	dev_t and spec_type of the minor node. There are also a few fields that
192  * 	are used during negotiation to maintain state. See
193  *		uts/common/sys/contract/device_impl.h
194  * 	for details.
195  * 	The ctd structure represents the device private part of a contract of
196  * 	type "device"
197  *
198  * 	Another data structure used by device contracts is ctmpl_device. It is
199  * 	the device contracts private part of the contract template structure. It
200  *	encapsulates the generic template structure "ct_template_t" and includes
201  *	the following device contract specific fields
202  *		ctd_aset:   The bitset of states that should be guaranteed by a
203  *			    contract
204  *		ctd_noneg:  If set, indicates that contract should NACK a
205  *			    negotiation
206  *		ctd_minor:  The devfs_path (without the /devices prefix) of the
207  *			    minor node that is the subject of the contract.
208  *
209  * ALGORITHMS
210  * ---------
211  * There are three sets of routines in this file
212  * 	Template related routines
213  * 	-------------------------
214  *	These routines provide support for template related operations initiated
215  *	via the generic template operations. These include routines that dup
216  *	a template, free it, and set various terms in the template
217  *	(such as the minor node path, the acceptable state set (or A-set)
218  *	and the non-negotiable term) as well as a routine to query the
219  *	device specific portion of the template for the abovementioned terms.
220  *	There is also a routine to create (ctmpl_device_create) that is used to
221  *	create a contract from a template. This routine calls (after initial
222  *	setup) the common function used to create a device contract
223  *	(contract_device_create).
224  *
225  *	core device contract implementation
226  *	----------------------------------
227  *	These routines support the generic contract framework to provide
228  *	functionality that allows contracts to be created, managed and
229  *	destroyed. The contract_device_create() routine is a routine used
230  *	to create a contract from a template (either via an explicit create
231  *	operation on a template or implicitly via an open with an
232  *	activated template.). The contract_device_free() routine assists
233  *	in freeing the device contract specific parts. There are routines
234  *	used to abandon (contract_device_abandon) a device contract as well
235  *	as a routine to destroy (which despite its name does not destroy,
236  *	it only moves a contract to a dead state) a contract.
237  *	There is also a routine to return status information about a
238  *	contract - the level of detail depends on what is requested by the
239  *	user. A value of CTD_FIXED only returns fixed length fields such
240  *	as the A-set, state of device and value of the "noneg" term. If
241  *	CTD_ALL is specified, the minor node path is returned as well.
242  *
243  *	In addition there are interfaces (contract_device_ack/nack) which
244  *	are used to support negotiation between userland processes and
245  *	device contracts. These interfaces record the acknowledgement
246  *	or lack thereof for negotiation events and help determine if the
247  *	negotiated event should occur.
248  *
249  *	"backend routines"
250  *	-----------------
251  *	The backend routines form the interface between the I/O framework
252  *	and the device contract subsystem. These routines, allow the I/O
253  *	framework to call into the device contract subsystem to notify it of
254  *	impending changes to a device state as well as to inform of the
255  *	final disposition of such attempted state changes. Routines in this
256  *	class include contract_device_offline() that indicates an attempt to
257  *	offline a device, contract_device_degrade() that indicates that
258  *	a device is moving to the degraded state and contract_device_negend()
259  *	that is used by the I/O framework to inform the contracts subsystem of
260  *	the final disposition of an attempted operation.
261  *
262  *	SUMMARY
263  *	-------
264  *      A contract starts its life as a template. A process allocates a device
265  *	contract template and sets various terms:
266  *		The A-set
267  *		The device minor node
268  *		Critical and informative events
269  *		The noneg i.e. no negotiation term
270  *	Setting of these terms in the template is done via the
271  *	ctmpl_device_set() entry point in this file. A process can query a
272  *	template to determine the terms already set in the template - this is
273  *	facilitated by the ctmpl_device_get() routine.
274  *
275  *	Once all the appropriate terms are set, the contract is instantiated via
276  *	one of two methods
277  *	- via an explicit create operation - this is facilitated by the
278  *	  ctmpl_device_create() entry point
279  *	- synchronously with the open(2) system call - this is achieved via the
280  *	  contract_device_open() routine.
281  *	The core work for both these above functions is done by
282  *	contract_device_create()
283  *
284  *	A contract once created can be queried for its status. Support for
285  *	status info is provided by both the common contracts framework and by
286  *	the "device" contract type. If the level of detail requested is
287  *	CTD_COMMON, only the common contract framework data is used. Higher
288  *	levels of detail result in calls to contract_device_status() to supply
289  *	device contract type specific status information.
290  *
291  *	A contract once created may be abandoned either explicitly or implicitly.
292  *	In either case, the contract_device_abandon() function is invoked. This
293  * 	function merely calls contract_destroy() which moves the contract to
294  *	the DEAD state. The device contract portion of destroy processing is
295  *	provided by contract_device_destroy() which merely disassociates the
296  *	contract from its device devinfo node. A contract in the DEAD state is
297  *	not freed. It hangs around until all references to the contract are
298  *	gone. When that happens, the contract is finally deallocated. The
299  *	device contract specific portion of the free is done by
300  *	contract_device_free() which finally frees the device contract specific
301  *	data structure (cont_device_t).
302  *
303  *	When a device undergoes a state change, the I/O framework calls the
304  *	corresponding device contract entry point. For example, when a device
305  *	is about to go OFFLINE, the routine contract_device_offline() is
306  *	invoked. Similarly if a device moves to DEGRADED state, the routine
307  *	contract_device_degrade() function is called. These functions call the
308  *	core routine contract_device_publish(). This function determines via
309  *	the function is_sync_neg() whether an event is a synchronous (i.e.
310  *	negotiable) event or not. In the former case contract_device_publish()
311  *	publishes a CTE_NEG event and then waits in wait_for_acks() for ACKs
312  *	and/or NACKs from contract holders. In the latter case, it simply
313  *	publishes the event and does not wait. In the negotiation case, ACKs or
314  *	NACKs from userland consumers results in contract_device_ack_nack()
315  *	being called where the result of the negotiation is recorded in the
316  *	contract data structure. Once all outstanding contract owners have
317  *	responded, the device contract code in wait_for_acks() determines the
318  *	final result of the negotiation. A single NACK overrides all other ACKs
319  *	If there is no NACK, then a single ACK will result in an overall ACK
320  *	result. If there are no ACKs or NACKs, then the result CT_NONE is
321  *	returned back to the I/O framework. Once the event is permitted or
322  *	blocked, the I/O framework proceeds or aborts the state change. The
323  *	I/O framework then calls contract_device_negend() with a result code
324  *	indicating final disposition of the event. This call releases the
325  *	barrier and other state associated with the previous negotiation,
326  *	which permits the next event (if any) to come into the device contract
327  *	framework.
328  *
329  *	Finally, a device that has outstanding contracts may be removed from
330  *	the system which results in its devinfo node being freed. The devinfo
331  *	free routine in the I/O framework, calls into the device contract
332  *	function - contract_device_remove_dip(). This routine, disassociates
333  *	the dip from all contracts associated with the devinfo node being freed,
334  *	allowing the devinfo node to be freed.
335  *
336  * LOCKING
337  * ---------
338  * 	There are four sets of data that need to be protected by locks
339  *
340  *	i) device contract specific portion of the contract template - This data
341  *	is protected by the template lock ctmpl_lock.
342  *
343  *	ii) device contract specific portion of the contract - This data is
344  *	protected by the contract lock ct_lock
345  *
346  *	iii) The linked list of contracts hanging off a devinfo node - This
347  *	list is protected by the per-devinfo node lock devi_ct_lock
348  *
349  *	iv) Finally there is a barrier, controlled by devi_ct_lock, devi_ct_cv
350  *	and devi_ct_count that controls state changes to a dip
351  *
352  *	The template lock is independent in that none of the other locks in this
353  *	file may be taken while holding the template lock (and vice versa).
354  *
355  *	The remaining three locks have the following lock order
356  *
357  *	devi_ct_lock  -> ct_count barrier ->  ct_lock
358  *
359  */
360 
361 static cont_device_t *contract_device_create(ctmpl_device_t *dtmpl, dev_t dev,
362     int spec_type, proc_t *owner, int *errorp);
363 
364 /* barrier routines */
365 static void ct_barrier_acquire(dev_info_t *dip);
366 static void ct_barrier_release(dev_info_t *dip);
367 static int ct_barrier_held(dev_info_t *dip);
368 static int ct_barrier_empty(dev_info_t *dip);
369 static void ct_barrier_wait_for_release(dev_info_t *dip);
370 static int ct_barrier_wait_for_empty(dev_info_t *dip, int secs);
371 static void ct_barrier_decr(dev_info_t *dip);
372 static void ct_barrier_incr(dev_info_t *dip);
373 
374 ct_type_t *device_type;
375 
376 /*
377  * Macro predicates for determining when events should be sent and how.
378  */
/*
 * EVSENDP(ctd, flag)
 *	Non-zero if any event bit in "flag" is present in either the
 *	informative or the critical event set of the contract embedded in
 *	"ctd", i.e. if the event should be sent at all.
 *
 * EVINFOP(ctd, flag)
 *	Non-zero if none of the event bits in "flag" are in the critical
 *	event set of the contract embedded in "ctd", i.e. the event is to be
 *	treated as informative rather than critical.
 *
 * Both arguments are fully parenthesized so that compound expressions
 * (e.g. "A | B" for flag, or "&obj" for ctd) expand with the intended
 * precedence.
 */
#define	EVSENDP(ctd, flag) \
	(((ctd)->cond_contract.ct_ev_info | \
	(ctd)->cond_contract.ct_ev_crit) & (flag))

#define	EVINFOP(ctd, flag) \
	(((ctd)->cond_contract.ct_ev_crit & (flag)) == 0)
384 
385 /*
386  * State transition table showing which transitions are synchronous and which
387  * are not.
388  */
389 struct ct_dev_negtable {
390 	uint_t	st_old;
391 	uint_t	st_new;
392 	uint_t	st_neg;
393 } ct_dev_negtable[] = {
394 	{CT_DEV_EV_ONLINE, CT_DEV_EV_OFFLINE,	1},
395 	{CT_DEV_EV_ONLINE, CT_DEV_EV_DEGRADED,	0},
396 	{CT_DEV_EV_DEGRADED, CT_DEV_EV_ONLINE,	0},
397 	{CT_DEV_EV_DEGRADED, CT_DEV_EV_OFFLINE,	1},
398 	{0}
399 };
400 
401 /*
402  * Device contract template implementation
403  */
404 
405 /*
406  * ctmpl_device_dup
407  *
408  * The device contract template dup entry point.
409  * This simply copies all the fields (generic as well as device contract
410  * specific) fields of the original.
411  */
412 static struct ct_template *
413 ctmpl_device_dup(struct ct_template *template)
414 {
415 	ctmpl_device_t *new;
416 	ctmpl_device_t *old = template->ctmpl_data;
417 	char *buf;
418 	char *minor;
419 
420 	new = kmem_zalloc(sizeof (ctmpl_device_t), KM_SLEEP);
421 	buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
422 
423 	/*
424 	 * copy generic fields.
425 	 * ctmpl_copy returns with old template lock held
426 	 */
427 	ctmpl_copy(&new->ctd_ctmpl, template);
428 
429 	new->ctd_ctmpl.ctmpl_data = new;
430 	new->ctd_aset = old->ctd_aset;
431 	new->ctd_minor = NULL;
432 	new->ctd_noneg = old->ctd_noneg;
433 
434 	if (old->ctd_minor) {
435 		ASSERT(strlen(old->ctd_minor) + 1 <= MAXPATHLEN);
436 		bcopy(old->ctd_minor, buf, strlen(old->ctd_minor) + 1);
437 	} else {
438 		kmem_free(buf, MAXPATHLEN);
439 		buf = NULL;
440 	}
441 
442 	mutex_exit(&template->ctmpl_lock);
443 	if (buf) {
444 		minor = i_ddi_strdup(buf, KM_SLEEP);
445 		kmem_free(buf, MAXPATHLEN);
446 		buf = NULL;
447 	} else {
448 		minor = NULL;
449 	}
450 	mutex_enter(&template->ctmpl_lock);
451 
452 	if (minor) {
453 		new->ctd_minor = minor;
454 	}
455 
456 	ASSERT(buf == NULL);
457 	return (&new->ctd_ctmpl);
458 }
459 
460 /*
461  * ctmpl_device_free
462  *
463  * The device contract template free entry point.  Just
464  * frees the template.
465  */
466 static void
467 ctmpl_device_free(struct ct_template *template)
468 {
469 	ctmpl_device_t *dtmpl = template->ctmpl_data;
470 
471 	if (dtmpl->ctd_minor)
472 		kmem_free(dtmpl->ctd_minor, strlen(dtmpl->ctd_minor) + 1);
473 
474 	kmem_free(dtmpl, sizeof (ctmpl_device_t));
475 }
476 
477 /*
478  * SAFE_EV is the set of events which a non-privileged process is
479  * allowed to make critical. An unprivileged device contract owner has
480  * no control over when a device changes state, so all device events
481  * can be in the critical set.
482  *
483  * EXCESS tells us if "value", a critical event set, requires
484  * additional privilege. For device contracts EXCESS currently
485  * evaluates to 0.
486  */
487 #define	SAFE_EV		(CT_DEV_ALLEVENT)
488 #define	EXCESS(value)	((value) & ~SAFE_EV)
489 
490 
491 /*
492  * ctmpl_device_set
493  *
494  * The device contract template set entry point. Sets various terms in the
495  * template. The non-negotiable  term can only be set if the process has
496  * the {PRIV_SYS_DEVICES} privilege asserted in its effective set.
497  */
498 static int
499 ctmpl_device_set(struct ct_template *tmpl, ct_kparam_t *kparam,
500     const cred_t *cr)
501 {
502 	ctmpl_device_t *dtmpl = tmpl->ctmpl_data;
503 	ct_param_t *param = &kparam->param;
504 	int error;
505 	dev_info_t *dip;
506 	int spec_type;
507 	uint64_t param_value;
508 	char *str_value;
509 
510 	ASSERT(MUTEX_HELD(&tmpl->ctmpl_lock));
511 
512 	param_value = SAFE_EV;
513 	if (param->ctpm_id == CTDP_MINOR) {
514 		str_value = (char *)kparam->ctpm_kbuf;
515 		str_value[param->ctpm_size - 1] = '\0';
516 	} else {
517 		if (param->ctpm_size < sizeof (uint64_t))
518 			return (EINVAL);
519 		param_value = *(uint64_t *)kparam->ctpm_kbuf;
520 	}
521 
522 	switch (param->ctpm_id) {
523 	case CTDP_ACCEPT:
524 		if (param_value & ~CT_DEV_ALLEVENT)
525 			return (EINVAL);
526 		if (param_value == 0)
527 			return (EINVAL);
528 		if (param_value == CT_DEV_ALLEVENT)
529 			return (EINVAL);
530 
531 		dtmpl->ctd_aset = param_value;
532 		break;
533 	case CTDP_NONEG:
534 		if (param_value != CTDP_NONEG_SET &&
535 		    param_value != CTDP_NONEG_CLEAR)
536 			return (EINVAL);
537 
538 		/*
539 		 * only privileged processes can designate a contract
540 		 * non-negotiatble.
541 		 */
542 		if (param_value == CTDP_NONEG_SET &&
543 		    (error = secpolicy_sys_devices(cr)) != 0) {
544 			return (error);
545 		}
546 
547 		dtmpl->ctd_noneg = param_value;
548 		break;
549 
550 	case CTDP_MINOR:
551 		if (*str_value != '/' ||
552 		    strncmp(str_value, "/devices/",
553 		    strlen("/devices/")) == 0 ||
554 		    strstr(str_value, "../devices/") != NULL ||
555 		    strchr(str_value, ':') == NULL) {
556 			return (EINVAL);
557 		}
558 
559 		spec_type = 0;
560 		dip = NULL;
561 		if (resolve_pathname(str_value, &dip, NULL, &spec_type) != 0) {
562 			return (ERANGE);
563 		}
564 		ddi_release_devi(dip);
565 
566 		if (spec_type != S_IFCHR && spec_type != S_IFBLK) {
567 			return (EINVAL);
568 		}
569 
570 		if (dtmpl->ctd_minor != NULL) {
571 			kmem_free(dtmpl->ctd_minor,
572 			    strlen(dtmpl->ctd_minor) + 1);
573 		}
574 		dtmpl->ctd_minor = i_ddi_strdup(str_value, KM_SLEEP);
575 		break;
576 	case CTP_EV_CRITICAL:
577 		/*
578 		 * Currently for device contracts, any event
579 		 * may be added to the critical set. We retain the
580 		 * following code however for future enhancements.
581 		 */
582 		if (EXCESS(param_value) &&
583 		    (error = secpolicy_contract_event(cr)) != 0)
584 			return (error);
585 		tmpl->ctmpl_ev_crit = param_value;
586 		break;
587 	default:
588 		return (EINVAL);
589 	}
590 
591 	return (0);
592 }
593 
594 /*
595  * ctmpl_device_get
596  *
597  * The device contract template get entry point.  Simply fetches and
598  * returns the value of the requested term.
599  */
600 static int
601 ctmpl_device_get(struct ct_template *template, ct_kparam_t *kparam)
602 {
603 	ctmpl_device_t *dtmpl = template->ctmpl_data;
604 	ct_param_t *param = &kparam->param;
605 	uint64_t *param_value = kparam->ctpm_kbuf;
606 
607 	ASSERT(MUTEX_HELD(&template->ctmpl_lock));
608 
609 	if (param->ctpm_id == CTDP_ACCEPT ||
610 	    param->ctpm_id == CTDP_NONEG) {
611 		if (param->ctpm_size < sizeof (uint64_t))
612 			return (EINVAL);
613 		kparam->ret_size = sizeof (uint64_t);
614 	}
615 
616 	switch (param->ctpm_id) {
617 	case CTDP_ACCEPT:
618 		*param_value = dtmpl->ctd_aset;
619 		break;
620 	case CTDP_NONEG:
621 		*param_value = dtmpl->ctd_noneg;
622 		break;
623 	case CTDP_MINOR:
624 		if (dtmpl->ctd_minor) {
625 			kparam->ret_size = strlcpy((char *)kparam->ctpm_kbuf,
626 			    dtmpl->ctd_minor, param->ctpm_size);
627 			kparam->ret_size++;
628 		} else {
629 			return (ENOENT);
630 		}
631 		break;
632 	default:
633 		return (EINVAL);
634 	}
635 
636 	return (0);
637 }
638 
639 /*
640  * Device contract type specific portion of creating a contract using
641  * a specified template
642  */
643 /*ARGSUSED*/
644 int
645 ctmpl_device_create(ct_template_t *template, ctid_t *ctidp)
646 {
647 	ctmpl_device_t *dtmpl;
648 	char *buf;
649 	dev_t dev;
650 	int spec_type;
651 	int error;
652 	cont_device_t *ctd;
653 
654 	if (ctidp == NULL)
655 		return (EINVAL);
656 
657 	buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
658 
659 	dtmpl = template->ctmpl_data;
660 
661 	mutex_enter(&template->ctmpl_lock);
662 	if (dtmpl->ctd_minor == NULL) {
663 		/* incomplete template */
664 		mutex_exit(&template->ctmpl_lock);
665 		kmem_free(buf, MAXPATHLEN);
666 		return (EINVAL);
667 	} else {
668 		ASSERT(strlen(dtmpl->ctd_minor) < MAXPATHLEN);
669 		bcopy(dtmpl->ctd_minor, buf, strlen(dtmpl->ctd_minor) + 1);
670 	}
671 	mutex_exit(&template->ctmpl_lock);
672 
673 	spec_type = 0;
674 	dev = NODEV;
675 	if (resolve_pathname(buf, NULL, &dev, &spec_type) != 0 ||
676 	    dev == NODEV || dev == DDI_DEV_T_ANY || dev == DDI_DEV_T_NONE ||
677 	    (spec_type != S_IFCHR && spec_type != S_IFBLK)) {
678 		CT_DEBUG((CE_WARN,
679 		    "tmpl_create: failed to find device: %s", buf));
680 		kmem_free(buf, MAXPATHLEN);
681 		return (ERANGE);
682 	}
683 	kmem_free(buf, MAXPATHLEN);
684 
685 	ctd = contract_device_create(template->ctmpl_data,
686 	    dev, spec_type, curproc, &error);
687 
688 	if (ctd == NULL) {
689 		CT_DEBUG((CE_WARN, "Failed to create device contract for "
690 		    "process (%d) with device (devt = %lu, spec_type = %s)",
691 		    curproc->p_pid, dev,
692 		    spec_type == S_IFCHR ? "S_IFCHR" : "S_IFBLK"));
693 		return (error);
694 	}
695 
696 	mutex_enter(&ctd->cond_contract.ct_lock);
697 	*ctidp = ctd->cond_contract.ct_id;
698 	mutex_exit(&ctd->cond_contract.ct_lock);
699 
700 	return (0);
701 }
702 
703 /*
704  * Device contract specific template entry points
705  */
706 static ctmplops_t ctmpl_device_ops = {
707 	ctmpl_device_dup,		/* ctop_dup */
708 	ctmpl_device_free,		/* ctop_free */
709 	ctmpl_device_set,		/* ctop_set */
710 	ctmpl_device_get,		/* ctop_get */
711 	ctmpl_device_create,		/* ctop_create */
712 	CT_DEV_ALLEVENT			/* all device events bitmask */
713 };
714 
715 
716 /*
717  * Device contract implementation
718  */
719 
720 /*
721  * contract_device_default
722  *
723  * The device contract default template entry point.  Creates a
724  * device contract template with a default A-set and no "noneg" ,
725  * with informative degrade events and critical offline events.
726  * There is no default minor path.
727  */
728 static ct_template_t *
729 contract_device_default(void)
730 {
731 	ctmpl_device_t *new;
732 
733 	new = kmem_zalloc(sizeof (ctmpl_device_t), KM_SLEEP);
734 	ctmpl_init(&new->ctd_ctmpl, &ctmpl_device_ops, device_type, new);
735 
736 	new->ctd_aset = CT_DEV_EV_ONLINE | CT_DEV_EV_DEGRADED;
737 	new->ctd_noneg = 0;
738 	new->ctd_ctmpl.ctmpl_ev_info = CT_DEV_EV_DEGRADED;
739 	new->ctd_ctmpl.ctmpl_ev_crit = CT_DEV_EV_OFFLINE;
740 
741 	return (&new->ctd_ctmpl);
742 }
743 
744 /*
745  * contract_device_free
746  *
747  * Destroys the device contract specific portion of a contract and
748  * frees the contract.
749  */
750 static void
751 contract_device_free(contract_t *ct)
752 {
753 	cont_device_t *ctd = ct->ct_data;
754 
755 	ASSERT(ctd->cond_minor);
756 	ASSERT(strlen(ctd->cond_minor) < MAXPATHLEN);
757 	kmem_free(ctd->cond_minor, strlen(ctd->cond_minor) + 1);
758 
759 	ASSERT(ctd->cond_devt != DDI_DEV_T_ANY &&
760 	    ctd->cond_devt != DDI_DEV_T_NONE && ctd->cond_devt != NODEV);
761 
762 	ASSERT(ctd->cond_spec == S_IFBLK || ctd->cond_spec == S_IFCHR);
763 
764 	ASSERT(!(ctd->cond_aset & ~CT_DEV_ALLEVENT));
765 	ASSERT(ctd->cond_noneg == 0 || ctd->cond_noneg == 1);
766 
767 	ASSERT(!(ctd->cond_currev_type & ~CT_DEV_ALLEVENT));
768 	ASSERT(!(ctd->cond_currev_ack & ~(CT_ACK | CT_NACK)));
769 
770 	ASSERT((ctd->cond_currev_id > 0) ^ (ctd->cond_currev_type == 0));
771 	ASSERT((ctd->cond_currev_id > 0) || (ctd->cond_currev_ack == 0));
772 
773 	ASSERT(!list_link_active(&ctd->cond_next));
774 
775 	kmem_free(ctd, sizeof (cont_device_t));
776 }
777 
778 /*
779  * contract_device_abandon
780  *
781  * The device contract abandon entry point.
782  */
783 static void
784 contract_device_abandon(contract_t *ct)
785 {
786 	ASSERT(MUTEX_HELD(&ct->ct_lock));
787 
788 	/*
789 	 * device contracts cannot be inherited or orphaned.
790 	 * Move the contract to the DEAD_STATE. It will be freed
791 	 * once all references to it are gone.
792 	 */
793 	contract_destroy(ct);
794 }
795 
796 /*
797  * contract_device_destroy
798  *
799  * The device contract destroy entry point.
800  * Called from contract_destroy() to do any type specific destroy. Note
801  * that destroy is a misnomer - this does not free the contract, it only
802  * moves it to the dead state. A contract is actually freed via
803  * 	contract_rele() -> contract_dtor(), contop_free()
804  */
805 static void
806 contract_device_destroy(contract_t *ct)
807 {
808 	cont_device_t	*ctd;
809 	dev_info_t	*dip;
810 
811 	ASSERT(MUTEX_HELD(&ct->ct_lock));
812 
813 	for (;;) {
814 		ctd = ct->ct_data;
815 		dip = ctd->cond_dip;
816 		if (dip == NULL) {
817 			/*
818 			 * The dip has been removed, this is a dangling contract
819 			 * Check that dip linkages are NULL
820 			 */
821 			ASSERT(!list_link_active(&ctd->cond_next));
822 			CT_DEBUG((CE_NOTE, "contract_device_destroy:"
823 			    " contract has no devinfo node. contract ctid : %d",
824 			    ct->ct_id));
825 			return;
826 		}
827 
828 		/*
829 		 * The intended lock order is : devi_ct_lock -> ct_count
830 		 * barrier -> ct_lock.
831 		 * However we can't do this here as dropping the ct_lock allows
832 		 * a race condition with i_ddi_free_node()/
833 		 * contract_device_remove_dip() which may free off dip before
834 		 * we can take devi_ct_lock. So use mutex_tryenter to avoid
835 		 * dropping ct_lock until we have acquired devi_ct_lock.
836 		 */
837 		if (mutex_tryenter(&(DEVI(dip)->devi_ct_lock)) != 0)
838 			break;
839 		mutex_exit(&ct->ct_lock);
840 		delay(drv_usectohz(1000));
841 		mutex_enter(&ct->ct_lock);
842 	}
843 	mutex_exit(&ct->ct_lock);
844 
845 	/*
846 	 * Waiting for the barrier to be released is strictly speaking not
847 	 * necessary. But it simplifies the implementation of
848 	 * contract_device_publish() by establishing the invariant that
849 	 * device contracts cannot go away during negotiation.
850 	 */
851 	ct_barrier_wait_for_release(dip);
852 	mutex_enter(&ct->ct_lock);
853 
854 	list_remove(&(DEVI(dip)->devi_ct), ctd);
855 	ctd->cond_dip = NULL; /* no longer linked to dip */
856 	contract_rele(ct);	/* remove hold for dip linkage */
857 
858 	mutex_exit(&ct->ct_lock);
859 	mutex_exit(&(DEVI(dip)->devi_ct_lock));
860 	mutex_enter(&ct->ct_lock);
861 }
862 
863 /*
864  * contract_device_status
865  *
866  * The device contract status entry point. Called when level of "detail"
867  * is either CTD_FIXED or CTD_ALL
868  *
869  */
870 static void
871 contract_device_status(contract_t *ct, zone_t *zone, int detail, nvlist_t *nvl,
872     void *status, model_t model)
873 {
874 	cont_device_t *ctd = ct->ct_data;
875 
876 	ASSERT(detail == CTD_FIXED || detail == CTD_ALL);
877 
878 	mutex_enter(&ct->ct_lock);
879 	contract_status_common(ct, zone, status, model);
880 
881 	/*
882 	 * There's no need to hold the contract lock while accessing static
883 	 * data like aset or noneg. But since we need the lock to access other
884 	 * data like state, we hold it anyway.
885 	 */
886 	VERIFY(nvlist_add_uint32(nvl, CTDS_STATE, ctd->cond_state) == 0);
887 	VERIFY(nvlist_add_uint32(nvl, CTDS_ASET, ctd->cond_aset) == 0);
888 	VERIFY(nvlist_add_uint32(nvl, CTDS_NONEG, ctd->cond_noneg) == 0);
889 
890 	if (detail == CTD_FIXED) {
891 		mutex_exit(&ct->ct_lock);
892 		return;
893 	}
894 
895 	ASSERT(ctd->cond_minor);
896 	VERIFY(nvlist_add_string(nvl, CTDS_MINOR, ctd->cond_minor) == 0);
897 
898 	mutex_exit(&ct->ct_lock);
899 }
900 
901 /*
902  * Converts a result integer into the corresponding string. Used for printing
903  * messages
904  */
905 static char *
906 result_str(uint_t result)
907 {
908 	switch (result) {
909 	case CT_ACK:
910 		return ("CT_ACK");
911 	case CT_NACK:
912 		return ("CT_NACK");
913 	case CT_NONE:
914 		return ("CT_NONE");
915 	default:
916 		return ("UNKNOWN");
917 	}
918 }
919 
920 /*
921  * Converts a device state integer constant into the corresponding string.
922  * Used to print messages.
923  */
924 static char *
925 state_str(uint_t state)
926 {
927 	switch (state) {
928 	case CT_DEV_EV_ONLINE:
929 		return ("ONLINE");
930 	case CT_DEV_EV_DEGRADED:
931 		return ("DEGRADED");
932 	case CT_DEV_EV_OFFLINE:
933 		return ("OFFLINE");
934 	default:
935 		return ("UNKNOWN");
936 	}
937 }
938 
939 /*
940  * Routine that determines if a particular CT_DEV_EV_? event corresponds to a
941  * synchronous state change or not.
942  */
943 static int
944 is_sync_neg(uint_t old, uint_t new)
945 {
946 	int	i;
947 
948 	ASSERT(old & CT_DEV_ALLEVENT);
949 	ASSERT(new & CT_DEV_ALLEVENT);
950 
951 	if (old == new) {
952 		CT_DEBUG((CE_WARN, "is_sync_neg: transition to same state: %s",
953 		    state_str(new)));
954 		return (-2);
955 	}
956 
957 	for (i = 0; ct_dev_negtable[i].st_new != 0; i++) {
958 		if (old == ct_dev_negtable[i].st_old &&
959 		    new == ct_dev_negtable[i].st_new) {
960 			return (ct_dev_negtable[i].st_neg);
961 		}
962 	}
963 
964 	CT_DEBUG((CE_WARN, "is_sync_neg: Unsupported state transition: "
965 	    "old = %s -> new = %s", state_str(old), state_str(new)));
966 
967 	return (-1);
968 }
969 
970 /*
971  * Used to cleanup cached dv_nodes so that when a device is released by
972  * a contract holder, its devinfo node can be successfully detached.
973  */
974 static int
975 contract_device_dvclean(dev_info_t *dip)
976 {
977 	char		*devnm;
978 	dev_info_t	*pdip;
979 
980 	ASSERT(dip);
981 
982 	/* pdip can be NULL if we have contracts against the root dip */
983 	pdip = ddi_get_parent(dip);
984 
985 	if (pdip && DEVI_BUSY_OWNED(pdip) || !pdip && DEVI_BUSY_OWNED(dip)) {
986 		char		*path;
987 
988 		path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
989 		(void) ddi_pathname(dip, path);
990 		CT_DEBUG((CE_WARN, "ct_dv_clean: Parent node is busy owned, "
991 		    "device=%s", path));
992 		kmem_free(path, MAXPATHLEN);
993 		return (EDEADLOCK);
994 	}
995 
996 	if (pdip) {
997 		devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
998 		(void) ddi_deviname(dip, devnm);
999 		(void) devfs_clean(pdip, devnm + 1, DV_CLEAN_FORCE);
1000 		kmem_free(devnm, MAXNAMELEN + 1);
1001 	} else {
1002 		(void) devfs_clean(dip, NULL, DV_CLEAN_FORCE);
1003 	}
1004 
1005 	return (0);
1006 }
1007 
1008 /*
1009  * Endpoint of a ct_ctl_ack() or ct_ctl_nack() call from userland.
1010  * Results in the ACK or NACK being recorded on the dip for one particular
1011  * contract. The device contracts framework evaluates the ACK/NACKs for all
1012  * contracts against a device to determine if a particular device state change
1013  * should be allowed.
1014  */
1015 static int
1016 contract_device_ack_nack(contract_t *ct, uint_t evtype, uint64_t evid,
1017     uint_t cmd)
1018 {
1019 	cont_device_t *ctd = ct->ct_data;
1020 	dev_info_t *dip;
1021 	ctid_t	ctid;
1022 	int error;
1023 
1024 	ctid = ct->ct_id;
1025 
1026 	CT_DEBUG((CE_NOTE, "ack_nack: entered: ctid %d", ctid));
1027 
1028 	mutex_enter(&ct->ct_lock);
1029 	CT_DEBUG((CE_NOTE, "ack_nack: contract lock acquired: %d", ctid));
1030 
1031 	dip = ctd->cond_dip;
1032 
1033 	ASSERT(ctd->cond_minor);
1034 	ASSERT(strlen(ctd->cond_minor) < MAXPATHLEN);
1035 
1036 	/*
1037 	 * Negotiation only if new state is not in A-set
1038 	 */
1039 	ASSERT(!(ctd->cond_aset & evtype));
1040 
1041 	/*
1042 	 * Negotiation only if transition is synchronous
1043 	 */
1044 	ASSERT(is_sync_neg(ctd->cond_state, evtype));
1045 
1046 	/*
1047 	 * We shouldn't be negotiating if the "noneg" flag is set
1048 	 */
1049 	ASSERT(!ctd->cond_noneg);
1050 
1051 	if (dip)
1052 		ndi_hold_devi(dip);
1053 
1054 	mutex_exit(&ct->ct_lock);
1055 
1056 	/*
1057 	 * dv_clean only if !NACK and offline state change
1058 	 */
1059 	if (cmd != CT_NACK && evtype == CT_DEV_EV_OFFLINE && dip) {
1060 		CT_DEBUG((CE_NOTE, "ack_nack: dv_clean: %d", ctid));
1061 		error = contract_device_dvclean(dip);
1062 		if (error != 0) {
1063 			CT_DEBUG((CE_NOTE, "ack_nack: dv_clean: failed: %d",
1064 			    ctid));
1065 			ddi_release_devi(dip);
1066 		}
1067 	}
1068 
1069 	mutex_enter(&ct->ct_lock);
1070 
1071 	if (dip)
1072 		ddi_release_devi(dip);
1073 
1074 	if (dip == NULL) {
1075 		if (ctd->cond_currev_id != evid) {
1076 			CT_DEBUG((CE_WARN, "%sACK for non-current event "
1077 			    "(type=%s, id=%llu) on removed device",
1078 			    cmd == CT_NACK ? "N" : "",
1079 			    state_str(evtype), (unsigned long long)evid));
1080 			CT_DEBUG((CE_NOTE, "ack_nack: error: ESRCH, ctid: %d",
1081 			    ctid));
1082 		} else {
1083 			ASSERT(ctd->cond_currev_type == evtype);
1084 			CT_DEBUG((CE_WARN, "contract_ack: no such device: "
1085 			    "ctid: %d", ctid));
1086 		}
1087 		error = (ct->ct_state == CTS_DEAD) ? ESRCH :
1088 		    ((cmd == CT_NACK) ? ETIMEDOUT : 0);
1089 		mutex_exit(&ct->ct_lock);
1090 		return (error);
1091 	}
1092 
1093 	/*
1094 	 * Must follow lock order: devi_ct_lock -> ct_count barrier - >ct_lock
1095 	 */
1096 	mutex_exit(&ct->ct_lock);
1097 
1098 	mutex_enter(&DEVI(dip)->devi_ct_lock);
1099 	mutex_enter(&ct->ct_lock);
1100 	if (ctd->cond_currev_id != evid) {
1101 		char *buf;
1102 		mutex_exit(&ct->ct_lock);
1103 		mutex_exit(&DEVI(dip)->devi_ct_lock);
1104 		ndi_hold_devi(dip);
1105 		buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1106 		(void) ddi_pathname(dip, buf);
1107 		ddi_release_devi(dip);
1108 		CT_DEBUG((CE_WARN, "%sACK for non-current event"
1109 		    "(type=%s, id=%llu) on device %s",
1110 		    cmd == CT_NACK ? "N" : "",
1111 		    state_str(evtype), (unsigned long long)evid, buf));
1112 		kmem_free(buf, MAXPATHLEN);
1113 		CT_DEBUG((CE_NOTE, "ack_nack: error: %d, ctid: %d",
1114 		    cmd == CT_NACK ? ETIMEDOUT : 0, ctid));
1115 		return (cmd == CT_ACK ? 0 : ETIMEDOUT);
1116 	}
1117 
1118 	ASSERT(ctd->cond_currev_type == evtype);
1119 	ASSERT(cmd == CT_ACK || cmd == CT_NACK);
1120 
1121 	CT_DEBUG((CE_NOTE, "ack_nack: setting %sACK for ctid: %d",
1122 	    cmd == CT_NACK ? "N" : "", ctid));
1123 
1124 	ctd->cond_currev_ack = cmd;
1125 	mutex_exit(&ct->ct_lock);
1126 
1127 	ct_barrier_decr(dip);
1128 	mutex_exit(&DEVI(dip)->devi_ct_lock);
1129 
1130 	CT_DEBUG((CE_NOTE, "ack_nack: normal exit: ctid: %d", ctid));
1131 
1132 	return (0);
1133 }
1134 
1135 /*
1136  * Invoked when a userland contract holder approves (i.e. ACKs) a state change
1137  */
1138 static int
1139 contract_device_ack(contract_t *ct, uint_t evtype, uint64_t evid)
1140 {
1141 	return (contract_device_ack_nack(ct, evtype, evid, CT_ACK));
1142 }
1143 
1144 /*
1145  * Invoked when a userland contract holder blocks (i.e. NACKs) a state change
1146  */
1147 static int
1148 contract_device_nack(contract_t *ct, uint_t evtype, uint64_t evid)
1149 {
1150 	return (contract_device_ack_nack(ct, evtype, evid, CT_NACK));
1151 }
1152 
1153 /*
1154  * Creates a new contract synchronously with the breaking of an existing
1155  * contract. Currently not supported.
1156  */
1157 /*ARGSUSED*/
1158 static int
1159 contract_device_newct(contract_t *ct)
1160 {
1161 	return (ENOTSUP);
1162 }
1163 
1164 /*
1165  * Core device contract implementation entry points
1166  */
1167 static contops_t contract_device_ops = {
1168 	contract_device_free,		/* contop_free */
1169 	contract_device_abandon,	/* contop_abandon */
1170 	contract_device_destroy,	/* contop_destroy */
1171 	contract_device_status,		/* contop_status */
1172 	contract_device_ack,		/* contop_ack */
1173 	contract_device_nack,		/* contop_nack */
1174 	contract_qack_notsup,		/* contop_qack */
1175 	contract_device_newct		/* contop_newct */
1176 };
1177 
1178 /*
1179  * contract_device_init
1180  *
1181  * Initializes the device contract type.
1182  */
1183 void
1184 contract_device_init(void)
1185 {
1186 	device_type = contract_type_init(CTT_DEVICE, "device",
1187 	    &contract_device_ops, contract_device_default);
1188 }
1189 
1190 /*
1191  * contract_device_create
1192  *
1193  * create a device contract given template "tmpl" and the "owner" process.
1194  * May fail and return NULL if project.max-contracts would have been exceeded.
1195  *
1196  * Common device contract creation routine called for both open-time and
1197  * non-open time device contract creation
1198  */
1199 static cont_device_t *
1200 contract_device_create(ctmpl_device_t *dtmpl, dev_t dev, int spec_type,
1201     proc_t *owner, int *errorp)
1202 {
1203 	cont_device_t *ctd;
1204 	char *minor;
1205 	char *path;
1206 	dev_info_t *dip;
1207 
1208 	ASSERT(dtmpl != NULL);
1209 	ASSERT(dev != NODEV && dev != DDI_DEV_T_ANY && dev != DDI_DEV_T_NONE);
1210 	ASSERT(spec_type == S_IFCHR || spec_type == S_IFBLK);
1211 	ASSERT(errorp);
1212 
1213 	*errorp = 0;
1214 
1215 	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1216 
1217 	mutex_enter(&dtmpl->ctd_ctmpl.ctmpl_lock);
1218 	ASSERT(strlen(dtmpl->ctd_minor) < MAXPATHLEN);
1219 	bcopy(dtmpl->ctd_minor, path, strlen(dtmpl->ctd_minor) + 1);
1220 	mutex_exit(&dtmpl->ctd_ctmpl.ctmpl_lock);
1221 
1222 	dip = e_ddi_hold_devi_by_path(path, 0);
1223 	if (dip == NULL) {
1224 		cmn_err(CE_WARN, "contract_create: Cannot find devinfo node "
1225 		    "for device path (%s)", path);
1226 		kmem_free(path, MAXPATHLEN);
1227 		*errorp = ERANGE;
1228 		return (NULL);
1229 	}
1230 
1231 	/*
1232 	 * Lock out any parallel contract negotiations
1233 	 */
1234 	mutex_enter(&(DEVI(dip)->devi_ct_lock));
1235 	ct_barrier_acquire(dip);
1236 	mutex_exit(&(DEVI(dip)->devi_ct_lock));
1237 
1238 	minor = i_ddi_strdup(path, KM_SLEEP);
1239 	kmem_free(path, MAXPATHLEN);
1240 
1241 	(void) contract_type_pbundle(device_type, owner);
1242 
1243 	ctd = kmem_zalloc(sizeof (cont_device_t), KM_SLEEP);
1244 
1245 	/*
1246 	 * Only we hold a refernce to this contract. Safe to access
1247 	 * the fields without a ct_lock
1248 	 */
1249 	ctd->cond_minor = minor;
1250 	/*
1251 	 * It is safe to set the dip pointer in the contract
1252 	 * as the contract will always be destroyed before the dip
1253 	 * is released
1254 	 */
1255 	ctd->cond_dip = dip;
1256 	ctd->cond_devt = dev;
1257 	ctd->cond_spec = spec_type;
1258 
1259 	/*
1260 	 * Since we are able to lookup the device, it is either
1261 	 * online or degraded
1262 	 */
1263 	ctd->cond_state = DEVI_IS_DEVICE_DEGRADED(dip) ?
1264 	    CT_DEV_EV_DEGRADED : CT_DEV_EV_ONLINE;
1265 
1266 	mutex_enter(&dtmpl->ctd_ctmpl.ctmpl_lock);
1267 	ctd->cond_aset = dtmpl->ctd_aset;
1268 	ctd->cond_noneg = dtmpl->ctd_noneg;
1269 
1270 	/*
1271 	 * contract_ctor() initailizes the common portion of a contract
1272 	 * contract_dtor() destroys the common portion of a contract
1273 	 */
1274 	if (contract_ctor(&ctd->cond_contract, device_type, &dtmpl->ctd_ctmpl,
1275 	    ctd, 0, owner, B_TRUE)) {
1276 		mutex_exit(&dtmpl->ctd_ctmpl.ctmpl_lock);
1277 		/*
1278 		 * contract_device_free() destroys the type specific
1279 		 * portion of a contract and frees the contract.
1280 		 * The "minor" path and "cred" is a part of the type specific
1281 		 * portion of the contract and will be freed by
1282 		 * contract_device_free()
1283 		 */
1284 		contract_device_free(&ctd->cond_contract);
1285 
1286 		/* release barrier */
1287 		mutex_enter(&(DEVI(dip)->devi_ct_lock));
1288 		ct_barrier_release(dip);
1289 		mutex_exit(&(DEVI(dip)->devi_ct_lock));
1290 
1291 		ddi_release_devi(dip);
1292 		*errorp = EAGAIN;
1293 		return (NULL);
1294 	}
1295 	mutex_exit(&dtmpl->ctd_ctmpl.ctmpl_lock);
1296 
1297 	mutex_enter(&ctd->cond_contract.ct_lock);
1298 	ctd->cond_contract.ct_ntime.ctm_total = CT_DEV_ACKTIME;
1299 	ctd->cond_contract.ct_qtime.ctm_total = CT_DEV_ACKTIME;
1300 	ctd->cond_contract.ct_ntime.ctm_start = -1;
1301 	ctd->cond_contract.ct_qtime.ctm_start = -1;
1302 	mutex_exit(&ctd->cond_contract.ct_lock);
1303 
1304 	/*
1305 	 * Insert device contract into list hanging off the dip
1306 	 * Bump up the ref-count on the contract to reflect this
1307 	 */
1308 	contract_hold(&ctd->cond_contract);
1309 	mutex_enter(&(DEVI(dip)->devi_ct_lock));
1310 	list_insert_tail(&(DEVI(dip)->devi_ct), ctd);
1311 
1312 	/* release barrier */
1313 	ct_barrier_release(dip);
1314 	mutex_exit(&(DEVI(dip)->devi_ct_lock));
1315 
1316 	ddi_release_devi(dip);
1317 
1318 	return (ctd);
1319 }
1320 
1321 /*
1322  * Called when a device is successfully opened to create an open-time contract
1323  * i.e. synchronously with a device open.
1324  */
1325 int
1326 contract_device_open(dev_t dev, int spec_type, contract_t **ctpp)
1327 {
1328 	ctmpl_device_t *dtmpl;
1329 	ct_template_t  *tmpl;
1330 	cont_device_t *ctd;
1331 	char *path;
1332 	klwp_t *lwp;
1333 	int error;
1334 
1335 	if (ctpp)
1336 		*ctpp = NULL;
1337 
1338 	/*
1339 	 * Check if we are in user-context i.e. if we have an lwp
1340 	 */
1341 	lwp = ttolwp(curthread);
1342 	if (lwp == NULL) {
1343 		CT_DEBUG((CE_NOTE, "contract_open: Not user-context"));
1344 		return (0);
1345 	}
1346 
1347 	tmpl = ctmpl_dup(lwp->lwp_ct_active[device_type->ct_type_index]);
1348 	if (tmpl == NULL) {
1349 		return (0);
1350 	}
1351 	dtmpl = tmpl->ctmpl_data;
1352 
1353 	/*
1354 	 * If the user set a minor path in the template before an open,
1355 	 * ignore it. We use the minor path of the actual minor opened.
1356 	 */
1357 	mutex_enter(&tmpl->ctmpl_lock);
1358 	if (dtmpl->ctd_minor != NULL) {
1359 		CT_DEBUG((CE_NOTE, "contract_device_open(): Process %d: "
1360 		    "ignoring device minor path in active template: %s",
1361 		    curproc->p_pid, dtmpl->ctd_minor));
1362 		/*
1363 		 * This is a copy of the actual activated template.
1364 		 * Safe to make changes such as freeing the minor
1365 		 * path in the template.
1366 		 */
1367 		kmem_free(dtmpl->ctd_minor, strlen(dtmpl->ctd_minor) + 1);
1368 		dtmpl->ctd_minor = NULL;
1369 	}
1370 	mutex_exit(&tmpl->ctmpl_lock);
1371 
1372 	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1373 
1374 	if (ddi_dev_pathname(dev, spec_type, path) != DDI_SUCCESS) {
1375 		CT_DEBUG((CE_NOTE, "contract_device_open(): Failed to derive "
1376 		    "minor path from dev_t,spec {%lu, %d} for process (%d)",
1377 		    dev, spec_type, curproc->p_pid));
1378 		ctmpl_free(tmpl);
1379 		kmem_free(path, MAXPATHLEN);
1380 		return (1);
1381 	}
1382 
1383 	mutex_enter(&tmpl->ctmpl_lock);
1384 	ASSERT(dtmpl->ctd_minor == NULL);
1385 	dtmpl->ctd_minor = path;
1386 	mutex_exit(&tmpl->ctmpl_lock);
1387 
1388 	ctd = contract_device_create(dtmpl, dev, spec_type, curproc, &error);
1389 
1390 	mutex_enter(&tmpl->ctmpl_lock);
1391 	ASSERT(dtmpl->ctd_minor);
1392 	dtmpl->ctd_minor = NULL;
1393 	mutex_exit(&tmpl->ctmpl_lock);
1394 	ctmpl_free(tmpl);
1395 	kmem_free(path, MAXPATHLEN);
1396 
1397 	if (ctd == NULL) {
1398 		cmn_err(CE_NOTE, "contract_device_open(): Failed to "
1399 		    "create device contract for process (%d) holding "
1400 		    "device (devt = %lu, spec_type = %d)",
1401 		    curproc->p_pid, dev, spec_type);
1402 		return (1);
1403 	}
1404 
1405 	if (ctpp) {
1406 		mutex_enter(&ctd->cond_contract.ct_lock);
1407 		*ctpp = &ctd->cond_contract;
1408 		mutex_exit(&ctd->cond_contract.ct_lock);
1409 	}
1410 	return (0);
1411 }
1412 
1413 /*
1414  * Called during contract negotiation by the device contract framework to wait
1415  * for ACKs or NACKs from contract holders. If all responses are not received
1416  * before a specified timeout, this routine times out.
1417  */
1418 static uint_t
1419 wait_for_acks(dev_info_t *dip, dev_t dev, int spec_type, uint_t evtype)
1420 {
1421 	cont_device_t *ctd;
1422 	int timed_out = 0;
1423 	int result = CT_NONE;
1424 	int ack;
1425 	char *f = "wait_for_acks";
1426 
1427 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
1428 	ASSERT(dip);
1429 	ASSERT(evtype & CT_DEV_ALLEVENT);
1430 	ASSERT(dev != NODEV && dev != DDI_DEV_T_NONE);
1431 	ASSERT((dev == DDI_DEV_T_ANY && spec_type == 0) ||
1432 	    (spec_type == S_IFBLK || spec_type == S_IFCHR));
1433 
1434 	CT_DEBUG((CE_NOTE, "%s: entered: dip: %p", f, (void *)dip));
1435 
1436 	if (ct_barrier_wait_for_empty(dip, CT_DEV_ACKTIME) == -1) {
1437 		/*
1438 		 * some contract owner(s) didn't respond in time
1439 		 */
1440 		CT_DEBUG((CE_NOTE, "%s: timed out: %p", f, (void *)dip));
1441 		timed_out = 1;
1442 	}
1443 
1444 	ack = 0;
1445 	for (ctd = list_head(&(DEVI(dip)->devi_ct)); ctd != NULL;
1446 	    ctd = list_next(&(DEVI(dip)->devi_ct), ctd)) {
1447 
1448 		mutex_enter(&ctd->cond_contract.ct_lock);
1449 
1450 		ASSERT(ctd->cond_dip == dip);
1451 
1452 		if (dev != DDI_DEV_T_ANY && dev != ctd->cond_devt) {
1453 			mutex_exit(&ctd->cond_contract.ct_lock);
1454 			continue;
1455 		}
1456 		if (dev != DDI_DEV_T_ANY && spec_type != ctd->cond_spec) {
1457 			mutex_exit(&ctd->cond_contract.ct_lock);
1458 			continue;
1459 		}
1460 
1461 		/* skip if non-negotiable contract */
1462 		if (ctd->cond_noneg) {
1463 			mutex_exit(&ctd->cond_contract.ct_lock);
1464 			continue;
1465 		}
1466 
1467 		ASSERT(ctd->cond_currev_type == evtype);
1468 		if (ctd->cond_currev_ack == CT_NACK) {
1469 			CT_DEBUG((CE_NOTE, "%s: found a NACK,result = NACK: %p",
1470 			    f, (void *)dip));
1471 			mutex_exit(&ctd->cond_contract.ct_lock);
1472 			return (CT_NACK);
1473 		} else if (ctd->cond_currev_ack == CT_ACK) {
1474 			ack = 1;
1475 			CT_DEBUG((CE_NOTE, "%s: found a ACK: %p",
1476 			    f, (void *)dip));
1477 		}
1478 		mutex_exit(&ctd->cond_contract.ct_lock);
1479 	}
1480 
1481 	if (ack) {
1482 		result = CT_ACK;
1483 		CT_DEBUG((CE_NOTE, "%s: result = ACK, dip=%p", f, (void *)dip));
1484 	} else if (timed_out) {
1485 		result = CT_NONE;
1486 		CT_DEBUG((CE_NOTE, "%s: result = NONE (timed-out), dip=%p",
1487 		    f, (void *)dip));
1488 	} else {
1489 		CT_DEBUG((CE_NOTE, "%s: result = NONE, dip=%p",
1490 		    f, (void *)dip));
1491 	}
1492 
1493 
1494 	return (result);
1495 }
1496 
1497 /*
1498  * Determines the current state of a device (i.e a devinfo node
1499  */
1500 static int
1501 get_state(dev_info_t *dip)
1502 {
1503 	if (DEVI_IS_DEVICE_OFFLINE(dip) || DEVI_IS_DEVICE_DOWN(dip))
1504 		return (CT_DEV_EV_OFFLINE);
1505 	else if (DEVI_IS_DEVICE_DEGRADED(dip))
1506 		return (CT_DEV_EV_DEGRADED);
1507 	else
1508 		return (CT_DEV_EV_ONLINE);
1509 }
1510 
1511 /*
1512  * Sets the current state of a device in a device contract
1513  */
1514 static void
1515 set_cond_state(dev_info_t *dip)
1516 {
1517 	uint_t state = get_state(dip);
1518 	cont_device_t *ctd;
1519 
1520 	/* verify that barrier is held */
1521 	ASSERT(ct_barrier_held(dip));
1522 
1523 	for (ctd = list_head(&(DEVI(dip)->devi_ct)); ctd != NULL;
1524 	    ctd = list_next(&(DEVI(dip)->devi_ct), ctd)) {
1525 		mutex_enter(&ctd->cond_contract.ct_lock);
1526 		ASSERT(ctd->cond_dip == dip);
1527 		ctd->cond_state = state;
1528 		mutex_exit(&ctd->cond_contract.ct_lock);
1529 	}
1530 }
1531 
1532 /*
1533  * Core routine called by event-specific routines when an event occurs.
 * Determines if an event should be published, and if it is to be
1535  * published, whether a negotiation should take place. Also implements
1536  * NEGEND events which publish the final disposition of an event after
1537  * negotiations are complete.
1538  *
1539  * When an event occurs on a minor node, this routine walks the list of
1540  * contracts hanging off a devinfo node and for each contract on the affected
1541  * dip, evaluates the following cases
1542  *
1543  *	a. an event that is synchronous, breaks the contract and NONEG not set
1544  *		- bumps up the outstanding negotiation counts on the dip
1545  *		- marks the dip as undergoing negotiation (devi_ct_neg)
1546  *		- event of type CTE_NEG is published
1547  *	b. an event that is synchronous, breaks the contract and NONEG is set
1548  *		- sets the final result to CT_NACK, event is blocked
1549  *		- does not publish an event
1550  *	c. event is asynchronous and breaks the contract
1551  *		- publishes a critical event irrespect of whether the NONEG
 *		- publishes a critical event irrespective of whether the NONEG
1553  *		  owner needs to be informed.
1554  *	d. No contract breakage but the owner has subscribed to the event
1555  *		- publishes the event irrespective of the NONEG event as the
1556  *		  owner has explicitly subscribed to the event.
1557  *	e. NEGEND event
 *		- publishes a critical event. Should only be doing this
 *		  if NONEG is not set.
1560  *	f. all other events
1561  *		- Since a contract is not broken and this event has not been
 *		  subscribed to, this event does not need to be published
 *		  for this contract.
1564  *
1565  *	Once an event is published, what happens next depends on the type of
1566  *	event:
1567  *
1568  *	a. NEGEND event
1569  *		- cleanup all state associated with the preceding negotiation
1570  *		  and return CT_ACK to the caller of contract_device_publish()
1571  *	b. NACKed event
1572  *		- One or more contracts had the NONEG term, so the event was
1573  *		  blocked. Return CT_NACK to the caller.
1574  *	c. Negotiated event
1575  *		- Call wait_for_acks() to wait for responses from contract
1576  *		holders. The end result is either CT_ACK (event is permitted),
1577  *		CT_NACK (event is blocked) or CT_NONE (no contract owner)
1578  *		responded. This result is returned back to the caller.
1579  *	d. All other events
1580  *		- If the event was asynchronous (i.e. not negotiated) or
1581  *		a contract was not broken return CT_ACK to the caller.
1582  */
1583 static uint_t
1584 contract_device_publish(dev_info_t *dip, dev_t dev, int spec_type,
1585     uint_t evtype, nvlist_t *tnvl)
1586 {
1587 	cont_device_t *ctd;
1588 	uint_t result = CT_NONE;
1589 	uint64_t evid = 0;
1590 	uint64_t nevid = 0;
1591 	char *path = NULL;
1592 	int negend;
1593 	int match;
1594 	int sync = 0;
1595 	contract_t *ct;
1596 	ct_kevent_t *event;
1597 	nvlist_t *nvl;
1598 	int broken = 0;
1599 
1600 	ASSERT(dip);
1601 	ASSERT(dev != NODEV && dev != DDI_DEV_T_NONE);
1602 	ASSERT((dev == DDI_DEV_T_ANY && spec_type == 0) ||
1603 	    (spec_type == S_IFBLK || spec_type == S_IFCHR));
1604 	ASSERT(evtype == 0 || (evtype & CT_DEV_ALLEVENT));
1605 
1606 	/* Is this a synchronous state change ? */
1607 	if (evtype != CT_EV_NEGEND) {
1608 		sync = is_sync_neg(get_state(dip), evtype);
1609 		/* NOP if unsupported transition */
1610 		if (sync == -2 || sync == -1) {
1611 			DEVI(dip)->devi_flags |= DEVI_CT_NOP;
1612 			result = (sync == -2) ? CT_ACK : CT_NONE;
1613 			goto out;
1614 		}
1615 		CT_DEBUG((CE_NOTE, "publish: is%s sync state change",
1616 		    sync ? "" : " not"));
1617 	} else if (DEVI(dip)->devi_flags & DEVI_CT_NOP) {
1618 		DEVI(dip)->devi_flags &= ~DEVI_CT_NOP;
1619 		result = CT_ACK;
1620 		goto out;
1621 	}
1622 
1623 	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1624 	(void) ddi_pathname(dip, path);
1625 
1626 	mutex_enter(&(DEVI(dip)->devi_ct_lock));
1627 
1628 	/*
1629 	 * Negotiation end - set the state of the device in the contract
1630 	 */
1631 	if (evtype == CT_EV_NEGEND) {
1632 		CT_DEBUG((CE_NOTE, "publish: negend: setting cond state"));
1633 		set_cond_state(dip);
1634 	}
1635 
1636 	/*
1637 	 * If this device didn't go through negotiation, don't publish
1638 	 * a NEGEND event - simply release the barrier to allow other
1639 	 * device events in.
1640 	 */
1641 	negend = 0;
1642 	if (evtype == CT_EV_NEGEND && !DEVI(dip)->devi_ct_neg) {
1643 		CT_DEBUG((CE_NOTE, "publish: no negend reqd. release barrier"));
1644 		ct_barrier_release(dip);
1645 		mutex_exit(&(DEVI(dip)->devi_ct_lock));
1646 		result = CT_ACK;
1647 		goto out;
1648 	} else if (evtype == CT_EV_NEGEND) {
1649 		/*
1650 		 * There are negotiated contract breakages that
1651 		 * need a NEGEND event
1652 		 */
1653 		ASSERT(ct_barrier_held(dip));
1654 		negend = 1;
1655 		CT_DEBUG((CE_NOTE, "publish: setting negend flag"));
1656 	} else {
1657 		/*
1658 		 * This is a new event, not a NEGEND event. Wait for previous
1659 		 * contract events to complete.
1660 		 */
1661 		ct_barrier_acquire(dip);
1662 	}
1663 
1664 
1665 	match = 0;
1666 	for (ctd = list_head(&(DEVI(dip)->devi_ct)); ctd != NULL;
1667 	    ctd = list_next(&(DEVI(dip)->devi_ct), ctd)) {
1668 
1669 		ctid_t ctid;
1670 		size_t len = strlen(path);
1671 
1672 		mutex_enter(&ctd->cond_contract.ct_lock);
1673 
1674 		ASSERT(ctd->cond_dip == dip);
1675 		ASSERT(ctd->cond_minor);
1676 		ASSERT(strncmp(ctd->cond_minor, path, len) == 0 &&
1677 		    ctd->cond_minor[len] == ':');
1678 
1679 		if (dev != DDI_DEV_T_ANY && dev != ctd->cond_devt) {
1680 			mutex_exit(&ctd->cond_contract.ct_lock);
1681 			continue;
1682 		}
1683 		if (dev != DDI_DEV_T_ANY && spec_type != ctd->cond_spec) {
1684 			mutex_exit(&ctd->cond_contract.ct_lock);
1685 			continue;
1686 		}
1687 
1688 		/* We have a matching contract */
1689 		match = 1;
1690 		ctid = ctd->cond_contract.ct_id;
1691 		CT_DEBUG((CE_NOTE, "publish: found matching contract: %d",
1692 		    ctid));
1693 
1694 		/*
1695 		 * There are 4 possible cases
1696 		 * 1. A contract is broken (dev not in acceptable state) and
1697 		 *    the state change is synchronous - start negotiation
1698 		 *    by sending a CTE_NEG critical event.
1699 		 * 2. A contract is broken and the state change is
1700 		 *    asynchronous - just send a critical event and
1701 		 *    break the contract.
1702 		 * 3. Contract is not broken, but consumer has subscribed
1703 		 *    to the event as a critical or informative event
1704 		 *    - just send the appropriate event
1705 		 * 4. contract waiting for negend event - just send the critical
1706 		 *    NEGEND event.
1707 		 */
1708 		broken = 0;
1709 		if (!negend && !(evtype & ctd->cond_aset)) {
1710 			broken = 1;
1711 			CT_DEBUG((CE_NOTE, "publish: Contract broken: %d",
1712 			    ctid));
1713 		}
1714 
1715 		/*
1716 		 * Don't send event if
1717 		 *	- contract is not broken AND
1718 		 *	- contract holder has not subscribed to this event AND
1719 		 *	- contract not waiting for a NEGEND event
1720 		 */
1721 		if (!broken && !EVSENDP(ctd, evtype) &&
1722 		    !ctd->cond_neg) {
1723 			CT_DEBUG((CE_NOTE, "contract_device_publish(): "
1724 			    "contract (%d): no publish reqd: event %d",
1725 			    ctd->cond_contract.ct_id, evtype));
1726 			mutex_exit(&ctd->cond_contract.ct_lock);
1727 			continue;
1728 		}
1729 
1730 		/*
1731 		 * Note: need to kmem_zalloc() the event so mutexes are
1732 		 * initialized automatically
1733 		 */
1734 		ct = &ctd->cond_contract;
1735 		event = kmem_zalloc(sizeof (ct_kevent_t), KM_SLEEP);
1736 		event->cte_type = evtype;
1737 
1738 		if (broken && sync) {
1739 			CT_DEBUG((CE_NOTE, "publish: broken + sync: "
1740 			    "ctid: %d", ctid));
1741 			ASSERT(!negend);
1742 			ASSERT(ctd->cond_currev_id == 0);
1743 			ASSERT(ctd->cond_currev_type == 0);
1744 			ASSERT(ctd->cond_currev_ack == 0);
1745 			ASSERT(ctd->cond_neg == 0);
1746 			if (ctd->cond_noneg) {
1747 				/* Nothing to publish. Event has been blocked */
1748 				CT_DEBUG((CE_NOTE, "publish: sync and noneg:"
1749 				    "not publishing blocked ev: ctid: %d",
1750 				    ctid));
1751 				result = CT_NACK;
1752 				kmem_free(event, sizeof (ct_kevent_t));
1753 				mutex_exit(&ctd->cond_contract.ct_lock);
1754 				continue;
1755 			}
1756 			event->cte_flags = CTE_NEG; /* critical neg. event */
1757 			ctd->cond_currev_type = event->cte_type;
1758 			ct_barrier_incr(dip);
1759 			DEVI(dip)->devi_ct_neg = 1; /* waiting for negend */
1760 			ctd->cond_neg = 1;
1761 		} else if (broken && !sync) {
1762 			CT_DEBUG((CE_NOTE, "publish: broken + async: ctid: %d",
1763 			    ctid));
1764 			ASSERT(!negend);
1765 			ASSERT(ctd->cond_currev_id == 0);
1766 			ASSERT(ctd->cond_currev_type == 0);
1767 			ASSERT(ctd->cond_currev_ack == 0);
1768 			ASSERT(ctd->cond_neg == 0);
1769 			event->cte_flags = 0; /* critical event */
1770 		} else if (EVSENDP(ctd, event->cte_type)) {
1771 			CT_DEBUG((CE_NOTE, "publish: event suscrib: ctid: %d",
1772 			    ctid));
1773 			ASSERT(!negend);
1774 			ASSERT(ctd->cond_currev_id == 0);
1775 			ASSERT(ctd->cond_currev_type == 0);
1776 			ASSERT(ctd->cond_currev_ack == 0);
1777 			ASSERT(ctd->cond_neg == 0);
1778 			event->cte_flags = EVINFOP(ctd, event->cte_type) ?
1779 			    CTE_INFO : 0;
1780 		} else if (ctd->cond_neg) {
1781 			CT_DEBUG((CE_NOTE, "publish: NEGEND: ctid: %d", ctid));
1782 			ASSERT(negend);
1783 			ASSERT(ctd->cond_noneg == 0);
1784 			nevid = ctd->cond_contract.ct_nevent ?
1785 			    ctd->cond_contract.ct_nevent->cte_id : 0;
1786 			ASSERT(ctd->cond_currev_id == nevid);
1787 			event->cte_flags = 0;	/* NEGEND is always critical */
1788 			ctd->cond_currev_id = 0;
1789 			ctd->cond_currev_type = 0;
1790 			ctd->cond_currev_ack = 0;
1791 			ctd->cond_neg = 0;
1792 		} else {
1793 			CT_DEBUG((CE_NOTE, "publish: not publishing event for "
1794 			    "ctid: %d, evtype: %d",
1795 			    ctd->cond_contract.ct_id, event->cte_type));
1796 			ASSERT(!negend);
1797 			ASSERT(ctd->cond_currev_id == 0);
1798 			ASSERT(ctd->cond_currev_type == 0);
1799 			ASSERT(ctd->cond_currev_ack == 0);
1800 			ASSERT(ctd->cond_neg == 0);
1801 			kmem_free(event, sizeof (ct_kevent_t));
1802 			mutex_exit(&ctd->cond_contract.ct_lock);
1803 			continue;
1804 		}
1805 
1806 		nvl = NULL;
1807 		if (tnvl) {
1808 			VERIFY(nvlist_dup(tnvl, &nvl, 0) == 0);
1809 			if (negend) {
1810 				int32_t newct = 0;
1811 				ASSERT(ctd->cond_noneg == 0);
1812 				VERIFY(nvlist_add_uint64(nvl, CTS_NEVID, nevid)
1813 				    == 0);
1814 				VERIFY(nvlist_lookup_int32(nvl, CTS_NEWCT,
1815 				    &newct) == 0);
1816 				VERIFY(nvlist_add_int32(nvl, CTS_NEWCT,
1817 				    newct == 1 ? 0 :
1818 				    ctd->cond_contract.ct_id) == 0);
1819 				CT_DEBUG((CE_NOTE, "publish: negend: ctid: %d "
1820 				    "CTS_NEVID: %llu, CTS_NEWCT: %s",
1821 				    ctid, (unsigned long long)nevid,
1822 				    newct ? "success" : "failure"));
1823 
1824 			}
1825 		}
1826 
1827 		if (ctd->cond_neg) {
1828 			ASSERT(ctd->cond_contract.ct_ntime.ctm_start == -1);
1829 			ASSERT(ctd->cond_contract.ct_qtime.ctm_start == -1);
1830 			ctd->cond_contract.ct_ntime.ctm_start = ddi_get_lbolt();
1831 			ctd->cond_contract.ct_qtime.ctm_start =
1832 			    ctd->cond_contract.ct_ntime.ctm_start;
1833 		}
1834 
1835 		/*
1836 		 * by holding the dip's devi_ct_lock we ensure that
1837 		 * all ACK/NACKs are held up until we have finished
1838 		 * publishing to all contracts.
1839 		 */
1840 		mutex_exit(&ctd->cond_contract.ct_lock);
1841 		evid = cte_publish_all(ct, event, nvl, NULL);
1842 		mutex_enter(&ctd->cond_contract.ct_lock);
1843 
1844 		if (ctd->cond_neg) {
1845 			ASSERT(!negend);
1846 			ASSERT(broken);
1847 			ASSERT(sync);
1848 			ASSERT(!ctd->cond_noneg);
1849 			CT_DEBUG((CE_NOTE, "publish: sync break, setting evid"
1850 			    ": %d", ctid));
1851 			ctd->cond_currev_id = evid;
1852 		} else if (negend) {
1853 			ctd->cond_contract.ct_ntime.ctm_start = -1;
1854 			ctd->cond_contract.ct_qtime.ctm_start = -1;
1855 		}
1856 		mutex_exit(&ctd->cond_contract.ct_lock);
1857 	}
1858 
1859 	/*
1860 	 * If "negend" set counter back to initial state (-1) so that
1861 	 * other events can be published. Also clear the negotiation flag
1862 	 * on dip.
1863 	 *
1864 	 * 0 .. n are used for counting.
1865 	 * -1 indicates counter is available for use.
1866 	 */
1867 	if (negend) {
1868 		/*
1869 		 * devi_ct_count not necessarily 0. We may have
1870 		 * timed out in which case, count will be non-zero.
1871 		 */
1872 		ct_barrier_release(dip);
1873 		DEVI(dip)->devi_ct_neg = 0;
1874 		CT_DEBUG((CE_NOTE, "publish: negend: reset dip state: dip=%p",
1875 		    (void *)dip));
1876 	} else if (DEVI(dip)->devi_ct_neg) {
1877 		ASSERT(match);
1878 		ASSERT(!ct_barrier_empty(dip));
1879 		CT_DEBUG((CE_NOTE, "publish: sync count=%d, dip=%p",
1880 		    DEVI(dip)->devi_ct_count, (void *)dip));
1881 	} else {
1882 		/*
1883 		 * for non-negotiated events or subscribed events or no
1884 		 * matching contracts
1885 		 */
1886 		ASSERT(ct_barrier_empty(dip));
1887 		ASSERT(DEVI(dip)->devi_ct_neg == 0);
1888 		CT_DEBUG((CE_NOTE, "publish: async/non-nego/subscrib/no-match: "
1889 		    "dip=%p", (void *)dip));
1890 
1891 		/*
1892 		 * only this function when called from contract_device_negend()
1893 		 * can reset the counter to READY state i.e. -1. This function
1894 		 * is so called for every event whether a NEGEND event is needed
1895 		 * or not, but the negend event is only published if the event
1896 		 * whose end they signal is a negotiated event for the contract.
1897 		 */
1898 	}
1899 
1900 	if (!match) {
1901 		/* No matching contracts */
1902 		CT_DEBUG((CE_NOTE, "publish: No matching contract"));
1903 		result = CT_NONE;
1904 	} else if (result == CT_NACK) {
1905 		/* a non-negotiable contract exists and this is a neg. event */
1906 		CT_DEBUG((CE_NOTE, "publish: found 1 or more NONEG contract"));
1907 		(void) wait_for_acks(dip, dev, spec_type, evtype);
1908 	} else if (DEVI(dip)->devi_ct_neg) {
1909 		/* one or more contracts going through negotiations */
1910 		CT_DEBUG((CE_NOTE, "publish: sync contract: waiting"));
1911 		result = wait_for_acks(dip, dev, spec_type, evtype);
1912 	} else {
1913 		/* no negotiated contracts or no broken contracts or NEGEND */
1914 		CT_DEBUG((CE_NOTE, "publish: async/no-break/negend"));
1915 		result = CT_ACK;
1916 	}
1917 
1918 	/*
1919 	 * Release the lock only now so that the only point where we
1920 	 * drop the lock is in wait_for_acks(). This is so that we don't
1921 	 * miss cv_signal/cv_broadcast from contract holders
1922 	 */
1923 	CT_DEBUG((CE_NOTE, "publish: dropping devi_ct_lock"));
1924 	mutex_exit(&(DEVI(dip)->devi_ct_lock));
1925 
1926 out:
1927 	nvlist_free(tnvl);
1928 	if (path)
1929 		kmem_free(path, MAXPATHLEN);
1930 
1931 
1932 	CT_DEBUG((CE_NOTE, "publish: result = %s", result_str(result)));
1933 	return (result);
1934 }
1935 
1936 
1937 /*
1938  * contract_device_offline
1939  *
1940  * Event publishing routine called by I/O framework when a device is offlined.
1941  */
1942 ct_ack_t
1943 contract_device_offline(dev_info_t *dip, dev_t dev, int spec_type)
1944 {
1945 	nvlist_t *nvl;
1946 	uint_t result;
1947 	uint_t evtype;
1948 
1949 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1950 
1951 	evtype = CT_DEV_EV_OFFLINE;
1952 	result = contract_device_publish(dip, dev, spec_type, evtype, nvl);
1953 
1954 	/*
1955 	 * If a contract offline is NACKED, the framework expects us to call
1956 	 * NEGEND ourselves, since we know the final result
1957 	 */
1958 	if (result == CT_NACK) {
1959 		contract_device_negend(dip, dev, spec_type, CT_EV_FAILURE);
1960 	}
1961 
1962 	return (result);
1963 }
1964 
1965 /*
1966  * contract_device_degrade
1967  *
1968  * Event publishing routine called by I/O framework when a device
1969  * moves to degrade state.
1970  */
1971 /*ARGSUSED*/
1972 void
1973 contract_device_degrade(dev_info_t *dip, dev_t dev, int spec_type)
1974 {
1975 	nvlist_t *nvl;
1976 	uint_t evtype;
1977 
1978 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1979 
1980 	evtype = CT_DEV_EV_DEGRADED;
1981 	(void) contract_device_publish(dip, dev, spec_type, evtype, nvl);
1982 }
1983 
1984 /*
1985  * contract_device_undegrade
1986  *
1987  * Event publishing routine called by I/O framework when a device
1988  * moves from degraded state to online state.
1989  */
1990 /*ARGSUSED*/
1991 void
1992 contract_device_undegrade(dev_info_t *dip, dev_t dev, int spec_type)
1993 {
1994 	nvlist_t *nvl;
1995 	uint_t evtype;
1996 
1997 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1998 
1999 	evtype = CT_DEV_EV_ONLINE;
2000 	(void) contract_device_publish(dip, dev, spec_type, evtype, nvl);
2001 }
2002 
2003 /*
2004  * For all contracts which have undergone a negotiation (because the device
2005  * moved out of the acceptable state for that contract and the state
2006  * change is synchronous i.e. requires negotiation) this routine publishes
2007  * a CT_EV_NEGEND event with the final disposition of the event.
2008  *
2009  * This event is always a critical event.
2010  */
2011 void
2012 contract_device_negend(dev_info_t *dip, dev_t dev, int spec_type, int result)
2013 {
2014 	nvlist_t *nvl;
2015 	uint_t evtype;
2016 
2017 	ASSERT(result == CT_EV_SUCCESS || result == CT_EV_FAILURE);
2018 
2019 	CT_DEBUG((CE_NOTE, "contract_device_negend(): entered: result: %d, "
2020 	    "dip: %p", result, (void *)dip));
2021 
2022 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2023 	VERIFY(nvlist_add_int32(nvl, CTS_NEWCT,
2024 	    result == CT_EV_SUCCESS ? 1 : 0) == 0);
2025 
2026 	evtype = CT_EV_NEGEND;
2027 	(void) contract_device_publish(dip, dev, spec_type, evtype, nvl);
2028 
2029 	CT_DEBUG((CE_NOTE, "contract_device_negend(): exit dip: %p",
2030 	    (void *)dip));
2031 }
2032 
2033 /*
2034  * Wrapper routine called by other subsystems (such as LDI) to start
2035  * negotiations when a synchronous device state change occurs.
2036  * Returns CT_ACK or CT_NACK.
2037  */
2038 ct_ack_t
2039 contract_device_negotiate(dev_info_t *dip, dev_t dev, int spec_type,
2040     uint_t evtype)
2041 {
2042 	int	result;
2043 
2044 	ASSERT(dip);
2045 	ASSERT(dev != NODEV);
2046 	ASSERT(dev != DDI_DEV_T_ANY);
2047 	ASSERT(dev != DDI_DEV_T_NONE);
2048 	ASSERT(spec_type == S_IFBLK || spec_type == S_IFCHR);
2049 
2050 	result = CT_NACK;
2051 	switch (evtype) {
2052 	case CT_DEV_EV_OFFLINE:
2053 		result = contract_device_offline(dip, dev, spec_type);
2054 		break;
2055 	default:
2056 		cmn_err(CE_PANIC, "contract_device_negotiate(): Negotiation "
2057 		    "not supported: event (%d) for dev_t (%lu) and spec (%d), "
2058 		    "dip (%p)", evtype, dev, spec_type, (void *)dip);
2059 		break;
2060 	}
2061 
2062 	return (result);
2063 }
2064 
2065 /*
2066  * A wrapper routine called by other subsystems (such as the LDI) to
2067  * finalize event processing for a state change event. For synchronous
2068  * state changes, this publishes NEGEND events. For asynchronous i.e.
2069  * non-negotiable events this publishes the event.
2070  */
2071 void
2072 contract_device_finalize(dev_info_t *dip, dev_t dev, int spec_type,
2073     uint_t evtype, int ct_result)
2074 {
2075 	ASSERT(dip);
2076 	ASSERT(dev != NODEV);
2077 	ASSERT(dev != DDI_DEV_T_ANY);
2078 	ASSERT(dev != DDI_DEV_T_NONE);
2079 	ASSERT(spec_type == S_IFBLK || spec_type == S_IFCHR);
2080 
2081 	switch (evtype) {
2082 	case CT_DEV_EV_OFFLINE:
2083 		contract_device_negend(dip, dev, spec_type, ct_result);
2084 		break;
2085 	case CT_DEV_EV_DEGRADED:
2086 		contract_device_degrade(dip, dev, spec_type);
2087 		contract_device_negend(dip, dev, spec_type, ct_result);
2088 		break;
2089 	case CT_DEV_EV_ONLINE:
2090 		contract_device_undegrade(dip, dev, spec_type);
2091 		contract_device_negend(dip, dev, spec_type, ct_result);
2092 		break;
2093 	default:
2094 		cmn_err(CE_PANIC, "contract_device_finalize(): Unsupported "
2095 		    "event (%d) for dev_t (%lu) and spec (%d), dip (%p)",
2096 		    evtype, dev, spec_type, (void *)dip);
2097 		break;
2098 	}
2099 }
2100 
2101 /*
2102  * Called by I/O framework when a devinfo node is freed to remove the
2103  * association between a devinfo node and its contracts.
2104  */
2105 void
2106 contract_device_remove_dip(dev_info_t *dip)
2107 {
2108 	cont_device_t *ctd;
2109 	cont_device_t *next;
2110 	contract_t *ct;
2111 
2112 	mutex_enter(&(DEVI(dip)->devi_ct_lock));
2113 	ct_barrier_wait_for_release(dip);
2114 
2115 	for (ctd = list_head(&(DEVI(dip)->devi_ct)); ctd != NULL; ctd = next) {
2116 		next = list_next(&(DEVI(dip)->devi_ct), ctd);
2117 		list_remove(&(DEVI(dip)->devi_ct), ctd);
2118 		ct = &ctd->cond_contract;
2119 		/*
2120 		 * Unlink the dip associated with this contract
2121 		 */
2122 		mutex_enter(&ct->ct_lock);
2123 		ASSERT(ctd->cond_dip == dip);
2124 		ctd->cond_dip = NULL; /* no longer linked to dip */
2125 		contract_rele(ct);	/* remove hold for dip linkage */
2126 		CT_DEBUG((CE_NOTE, "ct: remove_dip: removed dip from contract: "
2127 		    "ctid: %d", ct->ct_id));
2128 		mutex_exit(&ct->ct_lock);
2129 	}
2130 	ASSERT(list_is_empty(&(DEVI(dip)->devi_ct)));
2131 	mutex_exit(&(DEVI(dip)->devi_ct_lock));
2132 }
2133 
2134 /*
2135  * Barrier related routines
2136  */
2137 static void
2138 ct_barrier_acquire(dev_info_t *dip)
2139 {
2140 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2141 	CT_DEBUG((CE_NOTE, "ct_barrier_acquire: waiting for barrier"));
2142 	while (DEVI(dip)->devi_ct_count != -1)
2143 		cv_wait(&(DEVI(dip)->devi_ct_cv), &(DEVI(dip)->devi_ct_lock));
2144 	DEVI(dip)->devi_ct_count = 0;
2145 	CT_DEBUG((CE_NOTE, "ct_barrier_acquire: thread owns barrier"));
2146 }
2147 
2148 static void
2149 ct_barrier_release(dev_info_t *dip)
2150 {
2151 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2152 	ASSERT(DEVI(dip)->devi_ct_count != -1);
2153 	DEVI(dip)->devi_ct_count = -1;
2154 	cv_broadcast(&(DEVI(dip)->devi_ct_cv));
2155 	CT_DEBUG((CE_NOTE, "ct_barrier_release: Released barrier"));
2156 }
2157 
2158 static int
2159 ct_barrier_held(dev_info_t *dip)
2160 {
2161 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2162 	return (DEVI(dip)->devi_ct_count != -1);
2163 }
2164 
2165 static int
2166 ct_barrier_empty(dev_info_t *dip)
2167 {
2168 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2169 	ASSERT(DEVI(dip)->devi_ct_count != -1);
2170 	return (DEVI(dip)->devi_ct_count == 0);
2171 }
2172 
2173 static void
2174 ct_barrier_wait_for_release(dev_info_t *dip)
2175 {
2176 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2177 	while (DEVI(dip)->devi_ct_count != -1)
2178 		cv_wait(&(DEVI(dip)->devi_ct_cv), &(DEVI(dip)->devi_ct_lock));
2179 }
2180 
2181 static void
2182 ct_barrier_decr(dev_info_t *dip)
2183 {
2184 	CT_DEBUG((CE_NOTE, "barrier_decr:  ct_count before decr: %d",
2185 	    DEVI(dip)->devi_ct_count));
2186 
2187 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2188 	ASSERT(DEVI(dip)->devi_ct_count > 0);
2189 
2190 	DEVI(dip)->devi_ct_count--;
2191 	if (DEVI(dip)->devi_ct_count == 0) {
2192 		cv_broadcast(&DEVI(dip)->devi_ct_cv);
2193 		CT_DEBUG((CE_NOTE, "barrier_decr: cv_broadcast"));
2194 	}
2195 }
2196 
2197 static void
2198 ct_barrier_incr(dev_info_t *dip)
2199 {
2200 	ASSERT(ct_barrier_held(dip));
2201 	DEVI(dip)->devi_ct_count++;
2202 }
2203 
2204 static int
2205 ct_barrier_wait_for_empty(dev_info_t *dip, int secs)
2206 {
2207 	clock_t abstime;
2208 
2209 	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_ct_lock)));
2210 
2211 	abstime = ddi_get_lbolt() + drv_usectohz(secs*1000000);
2212 	while (DEVI(dip)->devi_ct_count) {
2213 		if (cv_timedwait(&(DEVI(dip)->devi_ct_cv),
2214 		    &(DEVI(dip)->devi_ct_lock), abstime) == -1) {
2215 			return (-1);
2216 		}
2217 	}
2218 	return (0);
2219 }
2220