xref: /titanic_50/usr/src/uts/i86pc/io/fipe/fipe_pm.c (revision 3d729aecc03ea6ebb9bd5d56b8dccd24f57daa41)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, Intel Corporation.
23  * All rights reserved.
24  */
25 
26 #include <sys/atomic.h>
27 #include <sys/cpuvar.h>
28 #include <sys/cpu.h>
29 #include <sys/cpu_event.h>
30 #include <sys/cmn_err.h>
31 #include <sys/ddi.h>
32 #include <sys/kmem.h>
33 #include <sys/kstat.h>
34 #include <sys/pci.h>
35 #include <sys/sunddi.h>
36 #include <sys/sunndi.h>
37 #include <sys/synch.h>
38 #include <sys/sysmacros.h>
39 #include <sys/fipe.h>
40 #include <vm/hat.h>
41 
42 /* Current PM policy, configurable through /etc/system and fipe.conf. */
43 fipe_pm_policy_t fipe_pm_policy = FIPE_PM_POLICY_BALANCE;
44 int fipe_pm_throttle_level = 1;
45 
46 /* Enable kstat support. */
47 #define	FIPE_KSTAT_SUPPORT		1
48 
49 /* Enable performance relative statistics. */
50 #define	FIPE_KSTAT_DETAIL		1
51 
52 /* Enable builtin IOAT driver if no IOAT driver is available. */
53 #define	FIPE_IOAT_BUILTIN		0
54 #if defined(FIPE_IOAT_BUILTIN) && (FIPE_IOAT_BUILTIN == 0)
55 #undef	FIPE_IOAT_BUILTIN
56 #endif
57 
58 #ifdef	FIPE_IOAT_BUILTIN
59 /* Use IOAT channel 3 to generate memory transactions. */
60 #define	FIPE_IOAT_CHAN_CTRL		0x200
61 #define	FIPE_IOAT_CHAN_STS_LO		0x204
62 #define	FIPE_IOAT_CHAN_STS_HI		0x208
63 #define	FIPE_IOAT_CHAN_ADDR_LO		0x20C
64 #define	FIPE_IOAT_CHAN_ADDR_HI		0x210
65 #define	FIPE_IOAT_CHAN_CMD		0x214
66 #define	FIPE_IOAT_CHAN_ERR		0x228
67 #else	/* FIPE_IOAT_BUILTIN */
68 #include <sys/dcopy.h>
69 #endif	/* FIPE_IOAT_BUILTIN */
70 
/* PCI configuration space offsets of the memory controller registers. */
#define	FIPE_MC_GBLACT			0x60
#define	FIPE_MC_THRTLOW			0x64
#define	FIPE_MC_THRTCTRL		0x67
/* Mode bit in FIPE_MC_THRTCTRL: set for S-CLTT, clear for OLTT. */
#define	FIPE_MC_THRTCTRL_HUNT		0x1

/*
 * Hardware recommended values: distance between the source and the
 * destination of each IOAT copy, and the size of each copy, in bytes.
 */
#define	FIPE_MC_MEMORY_OFFSET		1024
#define	FIPE_MC_MEMORY_SIZE		128

/* How many IOAT commands are posted when entering idle. */
#define	FIPE_IOAT_CMD_NUM		2

/* Interval between resource allocation retries, in microseconds (15s). */
#define	FIPE_IOAT_RETRY_INTERVAL	(15 * 1000 * 1000)

/* Interval between statistics updates, in nanoseconds (10ms). */
#define	FIPE_STAT_INTERVAL		(10 * 1000 * 1000)

/* Accessors for the fields of the currently selected profile. */
#define	FIPE_PROFILE_FIELD(field)	(fipe_profile_curr->field)
#define	FIPE_PROF_IDLE_COUNT		FIPE_PROFILE_FIELD(idle_count)
#define	FIPE_PROF_BUSY_THRESHOLD	FIPE_PROFILE_FIELD(busy_threshold)
#define	FIPE_PROF_INTR_THRESHOLD	FIPE_PROFILE_FIELD(intr_threshold)
#define	FIPE_PROF_INTR_BUSY_THRESHOLD	FIPE_PROFILE_FIELD(intr_busy_threshold)
#define	FIPE_PROF_INTR_BUSY_THROTTLE	FIPE_PROFILE_FIELD(intr_busy_throttle)

/* CPU idle callback priority of the FIPE memory power management driver. */
#define	CPU_IDLE_CB_PRIO_FIPE		(CPU_IDLE_CB_PRIO_LOW_BASE + 0x4000000)
100 
101 /* Structure to support power management profile. */
102 #pragma align CPU_CACHE_COHERENCE_SIZE(fipe_profiles)
103 static struct fipe_profile {
104 	uint32_t			idle_count;
105 	uint32_t			busy_threshold;
106 	uint32_t			intr_threshold;
107 	uint32_t			intr_busy_threshold;
108 	uint32_t			intr_busy_throttle;
109 } fipe_profiles[FIPE_PM_POLICY_MAX] = {
110 	{ 0,	0,	0,	0,	0 },
111 	{ 5,	30,	20,	50,	5 },
112 	{ 10,	40,	40,	75,	4 },
113 	{ 15,	50,	60,	100,	2 },
114 };
115 
116 /* Structure to store memory controller relative data. */
117 #pragma align CPU_CACHE_COHERENCE_SIZE(fipe_mc_ctrl)
118 static struct fipe_mc_ctrl {
119 	ddi_acc_handle_t		mc_pci_hdl;
120 	unsigned char			mc_thrtctrl;
121 	unsigned char			mc_thrtlow;
122 	unsigned char			mc_gblact;
123 	dev_info_t			*mc_dip;
124 	boolean_t			mc_initialized;
125 } fipe_mc_ctrl;
126 
127 /* Structure to store IOAT relative information. */
128 #pragma align CPU_CACHE_COHERENCE_SIZE(fipe_ioat_ctrl)
129 static struct fipe_ioat_control {
130 	kmutex_t			ioat_lock;
131 	boolean_t			ioat_ready;
132 #ifdef	FIPE_IOAT_BUILTIN
133 	boolean_t			ioat_reg_mapped;
134 	ddi_acc_handle_t		ioat_reg_handle;
135 	uint8_t				*ioat_reg_addr;
136 	uint64_t			ioat_cmd_physaddr;
137 #else	/* FIPE_IOAT_BUILTIN */
138 	dcopy_cmd_t			ioat_cmds[FIPE_IOAT_CMD_NUM + 1];
139 	dcopy_handle_t			ioat_handle;
140 #endif	/* FIPE_IOAT_BUILTIN */
141 	dev_info_t			*ioat_dev_info;
142 	uint64_t			ioat_buf_physaddr;
143 	char				*ioat_buf_virtaddr;
144 	char				*ioat_buf_start;
145 	size_t				ioat_buf_size;
146 	timeout_id_t			ioat_timerid;
147 	boolean_t			ioat_failed;
148 	boolean_t			ioat_cancel;
149 	boolean_t			ioat_try_alloc;
150 } fipe_ioat_ctrl;
151 
152 #pragma align CPU_CACHE_COHERENCE_SIZE(fipe_idle_ctrl)
153 static struct fipe_idle_ctrl {
154 	boolean_t			idle_ready;
155 	cpu_idle_callback_handle_t	cb_handle;
156 	cpu_idle_prop_handle_t		prop_enter;
157 	cpu_idle_prop_handle_t		prop_exit;
158 	cpu_idle_prop_handle_t		prop_busy;
159 	cpu_idle_prop_handle_t		prop_idle;
160 	cpu_idle_prop_handle_t		prop_intr;
161 
162 	/* Put here for cache efficiency, it should be in fipe_global_ctrl. */
163 	hrtime_t			tick_interval;
164 } fipe_idle_ctrl;
165 
166 /*
167  * Global control structure.
168  * Solaris idle thread has no reentrance issue, so it's enough to count CPUs
169  * in idle state. Otherwise cpuset_t bitmap should be used to track idle CPUs.
170  */
171 #pragma align CPU_CACHE_COHERENCE_SIZE(fipe_gbl_ctrl)
172 static struct fipe_global_ctrl {
173 	kmutex_t			lock;
174 	boolean_t			pm_enabled;
175 	volatile boolean_t		pm_active;
176 	volatile uint32_t		cpu_count;
177 	volatile uint64_t		io_waiters;
178 	hrtime_t			enter_ts;
179 	hrtime_t			time_in_pm;
180 	size_t				state_size;
181 	char				*state_buf;
182 #ifdef	FIPE_KSTAT_SUPPORT
183 	kstat_t				*fipe_kstat;
184 #endif	/* FIPE_KSTAT_SUPPORT */
185 } fipe_gbl_ctrl;
186 
187 #define	FIPE_CPU_STATE_PAD		(128 - \
188 	2 * sizeof (boolean_t) -  4 * sizeof (hrtime_t) - \
189 	2 * sizeof (uint64_t) - 2 * sizeof (uint32_t))
190 
191 /* Per-CPU status. */
192 #pragma pack(1)
193 typedef struct fipe_cpu_state {
194 	boolean_t			cond_ready;
195 	boolean_t			state_ready;
196 	uint32_t			idle_count;
197 	uint32_t			throttle_cnt;
198 	hrtime_t			throttle_ts;
199 	hrtime_t			next_ts;
200 	hrtime_t			last_busy;
201 	hrtime_t			last_idle;
202 	uint64_t			last_intr;
203 	uint64_t			last_iowait;
204 	char				pad1[FIPE_CPU_STATE_PAD];
205 } fipe_cpu_state_t;
206 #pragma pack()
207 
#ifdef	FIPE_KSTAT_SUPPORT
/*
 * Named kstat counters.  The members guarded by FIPE_KSTAT_DETAIL only
 * exist when detailed statistics are compiled in; the initializer must
 * list the members in declaration order.
 */
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_kstat)
static struct fipe_kstat_s {
	kstat_named_t		fipe_enabled;
	kstat_named_t		fipe_policy;
	kstat_named_t		fipe_pm_time;
#ifdef	FIPE_KSTAT_DETAIL
	kstat_named_t		ioat_ready;
	kstat_named_t		pm_tryenter_cnt;
	kstat_named_t		pm_success_cnt;
	kstat_named_t		pm_race_cnt;
	kstat_named_t		cpu_loop_cnt;
	kstat_named_t		cpu_busy_cnt;
	kstat_named_t		cpu_idle_cnt;
	kstat_named_t		cpu_intr_busy_cnt;
	kstat_named_t		cpu_intr_throttle_cnt;
	kstat_named_t		bio_busy_cnt;
	kstat_named_t		ioat_start_fail_cnt;
	kstat_named_t		ioat_stop_fail_cnt;
#endif	/* FIPE_KSTAT_DETAIL */
} fipe_kstat = {
	{ "fipe_enabled",	KSTAT_DATA_INT32 },
	{ "fipe_policy",	KSTAT_DATA_INT32 },
	{ "fipe_pm_time",	KSTAT_DATA_UINT64 },
#ifdef	FIPE_KSTAT_DETAIL
	{ "ioat_ready",		KSTAT_DATA_INT32 },
	{ "pm_tryenter_cnt",	KSTAT_DATA_UINT64 },
	{ "pm_success_cnt",	KSTAT_DATA_UINT64 },
	{ "pm_race_cnt",	KSTAT_DATA_UINT64 },
	{ "cpu_loop_cnt",	KSTAT_DATA_UINT64 },
	{ "cpu_busy_cnt",	KSTAT_DATA_UINT64 },
	{ "cpu_idle_cnt",	KSTAT_DATA_UINT64 },
	{ "cpu_intr_busy_cnt",	KSTAT_DATA_UINT64 },
	{ "cpu_intr_thrt_cnt",	KSTAT_DATA_UINT64 },
	{ "bio_busy_cnt",	KSTAT_DATA_UINT64 },
	{ "ioat_start_fail_cnt", KSTAT_DATA_UINT64 },
	{ "ioat_stop_fail_cnt",	KSTAT_DATA_UINT64 }
#endif	/* FIPE_KSTAT_DETAIL */
};

/* Atomically bump a counter that is always present. */
#define	FIPE_KSTAT_INC(v)		\
	atomic_inc_64(&fipe_kstat.v.value.ui64)
#ifdef	FIPE_KSTAT_DETAIL
/* Atomically bump a counter that only exists with FIPE_KSTAT_DETAIL. */
#define	FIPE_KSTAT_DETAIL_INC(v)	\
	atomic_inc_64(&fipe_kstat.v.value.ui64)
#else	/* FIPE_KSTAT_DETAIL */
#define	FIPE_KSTAT_DETAIL_INC(v)
#endif	/* FIPE_KSTAT_DETAIL */

#else	/* FIPE_KSTAT_SUPPORT */

/* Statistics are compiled out; both macros expand to nothing. */
#define	FIPE_KSTAT_INC(v)
#define	FIPE_KSTAT_DETAIL_INC(v)

#endif	/* FIPE_KSTAT_SUPPORT */
263 
264 /* Save current power management profile during suspend/resume. */
265 static fipe_pm_policy_t	fipe_pm_policy_saved = FIPE_PM_POLICY_BALANCE;
266 static fipe_cpu_state_t *fipe_cpu_states = NULL;
267 
268 /*
269  * There is no lock to protect fipe_profile_curr, so fipe_profile_curr
270  * could change on threads in fipe_idle_enter.  This is not an issue,
271  * as it always points to a valid profile, and though it might make
272  * an incorrect choice for the new profile, it will still be a valid
273  * selection, and would do the correct operation for the new profile on
274  * next cpu_idle_enter cycle.  Since the selections would always be
275  * valid for some profile, the overhead for the lock is not wasted.
276  */
277 static struct fipe_profile *fipe_profile_curr = NULL;
278 
279 static void fipe_idle_enter(void *arg, cpu_idle_callback_context_t ctx,
280     cpu_idle_check_wakeup_t check_func, void* check_arg);
281 static void fipe_idle_exit(void* arg, cpu_idle_callback_context_t ctx,
282     int flags);
283 static cpu_idle_callback_t fipe_idle_cb = {
284 	CPU_IDLE_CALLBACK_VER0,
285 	fipe_idle_enter,
286 	fipe_idle_exit,
287 };
288 
289 /*
290  * Configure memory controller into power saving mode:
291  * 1) OLTT activation limit is set to unlimited
292  * 2) MC works in S-CLTT mode
293  */
294 static int
295 fipe_mc_change(int throttle)
296 {
297 	/* Enable OLTT/disable S-CLTT mode */
298 	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
299 	    fipe_mc_ctrl.mc_thrtctrl & ~FIPE_MC_THRTCTRL_HUNT);
300 	/* Set OLTT activation limit to unlimited */
301 	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_GBLACT, 0);
302 	/*
303 	 * Set S-CLTT low throttling to desired value. The lower value,
304 	 * the more power saving and the less available memory bandwidth.
305 	 */
306 	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTLOW, throttle);
307 	/* Enable S-CLTT/disable OLTT mode */
308 	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
309 	    fipe_mc_ctrl.mc_thrtctrl | FIPE_MC_THRTCTRL_HUNT);
310 
311 	return (0);
312 }
313 
314 /*
315  * Restore memory controller's original configuration.
316  */
317 static void
318 fipe_mc_restore(void)
319 {
320 	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
321 	    fipe_mc_ctrl.mc_thrtctrl & ~FIPE_MC_THRTCTRL_HUNT);
322 	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_GBLACT,
323 	    fipe_mc_ctrl.mc_gblact);
324 	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTLOW,
325 	    fipe_mc_ctrl.mc_thrtlow);
326 	pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
327 	    fipe_mc_ctrl.mc_thrtctrl);
328 }
329 
330 /*
331  * Initialize memory controller's data structure and status.
332  */
333 static int
334 fipe_mc_init(dev_info_t *dip)
335 {
336 	ddi_acc_handle_t handle;
337 
338 	bzero(&fipe_mc_ctrl, sizeof (fipe_mc_ctrl));
339 
340 	/* Hold one reference count and will be released in fipe_mc_fini. */
341 	ndi_hold_devi(dip);
342 
343 	/* Setup pci configuration handler. */
344 	if (pci_config_setup(dip, &handle) != DDI_SUCCESS) {
345 		cmn_err(CE_WARN,
346 		    "!fipe: failed to setup pcicfg handler in mc_init.");
347 		ndi_rele_devi(dip);
348 		return (-1);
349 	}
350 
351 	/* Save original configuration. */
352 	fipe_mc_ctrl.mc_thrtctrl = pci_config_get8(handle, FIPE_MC_THRTCTRL);
353 	fipe_mc_ctrl.mc_thrtlow = pci_config_get8(handle, FIPE_MC_THRTLOW);
354 	fipe_mc_ctrl.mc_gblact = pci_config_get8(handle, FIPE_MC_GBLACT);
355 	fipe_mc_ctrl.mc_dip = dip;
356 	fipe_mc_ctrl.mc_pci_hdl = handle;
357 	fipe_mc_ctrl.mc_initialized = B_TRUE;
358 
359 	return (0);
360 }
361 
362 /*
363  * Restore memory controller's configuration and release resources.
364  */
365 static void
366 fipe_mc_fini(void)
367 {
368 	if (fipe_mc_ctrl.mc_initialized) {
369 		fipe_mc_restore();
370 		pci_config_teardown(&fipe_mc_ctrl.mc_pci_hdl);
371 		ndi_rele_devi(fipe_mc_ctrl.mc_dip);
372 		fipe_mc_ctrl.mc_initialized = B_FALSE;
373 	}
374 	bzero(&fipe_mc_ctrl, sizeof (fipe_mc_ctrl));
375 }
376 
377 /* Search device with specific pci ids. */
378 struct fipe_pci_ioat_id {
379 	uint16_t		venid;
380 	uint16_t		devid;
381 	uint16_t		subvenid;
382 	uint16_t		subsysid;
383 	char			*unitaddr;
384 };
385 
386 static struct fipe_pci_ioat_id fipe_pci_ioat_ids[] = {
387 	{ 0x8086, 0x1a38, 0xffff, 0xffff, NULL },
388 	{ 0x8086, 0x360b, 0xffff, 0xffff, NULL },
389 };
390 
391 /*ARGSUSED*/
392 static int
393 fipe_search_ioat_dev(dev_info_t *dip, void *arg)
394 {
395 	char *unit;
396 	struct fipe_pci_ioat_id *id;
397 	int i, max, venid, devid, subvenid, subsysid;
398 
399 	/* Query PCI id properties. */
400 	venid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
401 	    "vendor-id", 0xffffffff);
402 	if (venid == 0xffffffff) {
403 		return (DDI_WALK_CONTINUE);
404 	}
405 	devid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
406 	    "device-id", 0xffffffff);
407 	if (devid == 0xffffffff) {
408 		return (DDI_WALK_CONTINUE);
409 	}
410 	subvenid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
411 	    "subsystem-vendor-id", 0xffffffff);
412 	if (subvenid == 0xffffffff) {
413 		return (DDI_WALK_CONTINUE);
414 	}
415 	subsysid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
416 	    "subsystem-id", 0xffffffff);
417 	if (subvenid == 0xffffffff) {
418 		return (DDI_WALK_CONTINUE);
419 	}
420 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
421 	    "unit-address", &unit) != DDI_PROP_SUCCESS) {
422 		return (DDI_WALK_CONTINUE);
423 	}
424 
425 	max = sizeof (fipe_pci_ioat_ids) / sizeof (fipe_pci_ioat_ids[0]);
426 	for (i = 0; i < max; i++) {
427 		id = &fipe_pci_ioat_ids[i];
428 		if ((id->venid == 0xffffu || id->venid == venid) &&
429 		    (id->devid == 0xffffu || id->devid == devid) &&
430 		    (id->subvenid == 0xffffu || id->subvenid == subvenid) &&
431 		    (id->subsysid == 0xffffu || id->subsysid == subsysid) &&
432 		    (id->unitaddr == NULL || strcmp(id->unitaddr, unit) == 0)) {
433 			break;
434 		}
435 	}
436 	ddi_prop_free(unit);
437 	if (i >= max) {
438 		return (DDI_WALK_CONTINUE);
439 	}
440 
441 	/* Found IOAT device, hold one reference count. */
442 	ndi_hold_devi(dip);
443 	fipe_ioat_ctrl.ioat_dev_info = dip;
444 
445 	return (DDI_WALK_TERMINATE);
446 }
447 
448 /*
449  * To enable FBDIMM idle power enhancement mechanism, IOAT will be used to
450  * generate enough memory traffic to trigger memory controller thermal throttle
451  * circuitry.
452  * If dcopy/ioat is available, we will use dcopy interface to communicate
453  * with IOAT. Otherwise the built-in driver will directly talk to IOAT
454  * hardware.
455  */
456 #ifdef	FIPE_IOAT_BUILTIN
457 static int
458 fipe_ioat_trigger(void)
459 {
460 	uint16_t ctrl;
461 	uint32_t err;
462 	uint8_t	*addr = fipe_ioat_ctrl.ioat_reg_addr;
463 	ddi_acc_handle_t handle = fipe_ioat_ctrl.ioat_reg_handle;
464 
465 	/* Check channel in use flag. */
466 	ctrl = ddi_get16(handle, (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL));
467 	if (ctrl & 0x100) {
468 		/*
469 		 * Channel is in use by somebody else. IOAT driver may have
470 		 * been loaded, forbid fipe from accessing IOAT hardware
471 		 * anymore.
472 		 */
473 		fipe_ioat_ctrl.ioat_ready = B_FALSE;
474 		fipe_ioat_ctrl.ioat_failed = B_TRUE;
475 		FIPE_KSTAT_INC(ioat_start_fail_cnt);
476 		return (-1);
477 	} else {
478 		/* Set channel in use flag. */
479 		ddi_put16(handle,
480 		    (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL), 0x100);
481 	}
482 
483 	/* Write command address. */
484 	ddi_put32(handle,
485 	    (uint32_t *)(addr + FIPE_IOAT_CHAN_ADDR_LO),
486 	    (uint32_t)fipe_ioat_ctrl.ioat_cmd_physaddr);
487 	ddi_put32(handle, (uint32_t *)(addr + FIPE_IOAT_CHAN_ADDR_HI),
488 	    (uint32_t)(fipe_ioat_ctrl.ioat_cmd_physaddr >> 32));
489 
490 	/* Check and clear error flags. */
491 	err = ddi_get32(handle, (uint32_t *)(addr + FIPE_IOAT_CHAN_ERR));
492 	if (err != 0) {
493 		ddi_put32(handle, (uint32_t *)(addr + FIPE_IOAT_CHAN_ERR), err);
494 	}
495 
496 	/* Start channel. */
497 	ddi_put8(handle, (uint8_t *)(addr + FIPE_IOAT_CHAN_CMD), 0x1);
498 
499 	return (0);
500 }
501 
502 static void
503 fipe_ioat_cancel(void)
504 {
505 	uint32_t status;
506 	uint8_t	*addr = fipe_ioat_ctrl.ioat_reg_addr;
507 	ddi_acc_handle_t handle = fipe_ioat_ctrl.ioat_reg_handle;
508 
509 	/*
510 	 * Reset channel. Sometimes reset is not reliable,
511 	 * so check completion or abort status after reset.
512 	 */
513 	/* LINTED: constant in conditional context */
514 	while (1) {
515 		/* Issue reset channel command. */
516 		ddi_put8(handle, (uint8_t *)(addr + FIPE_IOAT_CHAN_CMD), 0x20);
517 
518 		/* Query command status. */
519 		status = ddi_get32(handle,
520 		    (uint32_t *)(addr + FIPE_IOAT_CHAN_STS_LO));
521 		if (status & 0x1) {
522 			/* Reset channel completed. */
523 			break;
524 		} else {
525 			SMT_PAUSE();
526 		}
527 	}
528 
529 	/* Put channel into "not in use" state. */
530 	ddi_put16(handle, (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL), 0);
531 }
532 
533 /*ARGSUSED*/
534 static void
535 fipe_ioat_alloc(void *arg)
536 {
537 	int rc = 0, nregs;
538 	dev_info_t *dip;
539 	ddi_device_acc_attr_t attr;
540 	boolean_t fatal = B_FALSE;
541 
542 	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
543 	/*
544 	 * fipe_ioat_alloc() is called in DEVICE ATTACH context when loaded.
545 	 * In DEVICE ATTACH context, it can't call ddi_walk_devs(), so just
546 	 * schedule a timer and exit.
547 	 */
548 	if (fipe_ioat_ctrl.ioat_try_alloc == B_FALSE) {
549 		fipe_ioat_ctrl.ioat_try_alloc = B_TRUE;
550 		goto out_error;
551 	}
552 
553 	/* Check whether has been initialized or encountered permanent error. */
554 	if (fipe_ioat_ctrl.ioat_ready || fipe_ioat_ctrl.ioat_failed ||
555 	    fipe_ioat_ctrl.ioat_cancel) {
556 		fipe_ioat_ctrl.ioat_timerid = 0;
557 		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
558 		return;
559 	}
560 
561 	if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
562 		/* Find dev_info_t for IOAT engine. */
563 		ddi_walk_devs(ddi_root_node(), fipe_search_ioat_dev, NULL);
564 		if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
565 			cmn_err(CE_NOTE,
566 			    "!fipe: no IOAT hardware found, disable pm.");
567 			fatal = B_TRUE;
568 			goto out_error;
569 		}
570 	}
571 
572 	/* Map in IOAT control register window. */
573 	ASSERT(fipe_ioat_ctrl.ioat_dev_info != NULL);
574 	ASSERT(fipe_ioat_ctrl.ioat_reg_mapped == B_FALSE);
575 	dip = fipe_ioat_ctrl.ioat_dev_info;
576 	if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS || nregs < 2) {
577 		cmn_err(CE_WARN, "!fipe: ioat has not enough register bars.");
578 		fatal = B_TRUE;
579 		goto out_error;
580 	}
581 	attr.devacc_attr_version = DDI_DEVICE_ATTR_V0;
582 	attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
583 	attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
584 	rc = ddi_regs_map_setup(dip, 1,
585 	    (caddr_t *)&fipe_ioat_ctrl.ioat_reg_addr,
586 	    0, 0, &attr, &fipe_ioat_ctrl.ioat_reg_handle);
587 	if (rc != DDI_SUCCESS) {
588 		cmn_err(CE_WARN, "!fipe: failed to map IOAT registeres.");
589 		fatal = B_TRUE;
590 		goto out_error;
591 	}
592 
593 	/* Mark IOAT status. */
594 	fipe_ioat_ctrl.ioat_reg_mapped = B_TRUE;
595 	fipe_ioat_ctrl.ioat_ready = B_TRUE;
596 	fipe_ioat_ctrl.ioat_failed = B_FALSE;
597 	fipe_ioat_ctrl.ioat_timerid = 0;
598 	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
599 
600 	return;
601 
602 out_error:
603 	fipe_ioat_ctrl.ioat_timerid = 0;
604 	if (!fipe_ioat_ctrl.ioat_ready && !fipe_ioat_ctrl.ioat_cancel) {
605 		if (fatal) {
606 			/* Mark permanent error and give up. */
607 			fipe_ioat_ctrl.ioat_failed = B_TRUE;
608 			/* Release reference count hold by ddi_find_devinfo. */
609 			if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
610 				ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
611 				fipe_ioat_ctrl.ioat_dev_info = NULL;
612 			}
613 		} else {
614 			/*
615 			 * Schedule another timer to keep on trying.
616 			 * timeout() should always succeed, no need to check
617 			 * return.
618 			 */
619 			fipe_ioat_ctrl.ioat_timerid = timeout(fipe_ioat_alloc,
620 			    NULL, drv_usectohz(FIPE_IOAT_RETRY_INTERVAL));
621 		}
622 	}
623 	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
624 }
625 
626 static void
627 fipe_ioat_free(void)
628 {
629 	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
630 	/* Cancel timeout to avoid race condition. */
631 	if (fipe_ioat_ctrl.ioat_timerid != 0) {
632 		fipe_ioat_ctrl.ioat_cancel = B_TRUE;
633 		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
634 		(void) untimeout(fipe_ioat_ctrl.ioat_timerid);
635 		mutex_enter(&fipe_ioat_ctrl.ioat_lock);
636 		fipe_ioat_ctrl.ioat_timerid = 0;
637 		fipe_ioat_ctrl.ioat_cancel = B_FALSE;
638 	}
639 
640 	if (fipe_ioat_ctrl.ioat_reg_mapped) {
641 		ddi_regs_map_free(&fipe_ioat_ctrl.ioat_reg_handle);
642 		fipe_ioat_ctrl.ioat_reg_mapped = B_FALSE;
643 	}
644 
645 	fipe_ioat_ctrl.ioat_ready = B_FALSE;
646 	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
647 }
648 
649 #else	/* FIPE_IOAT_BUILTIN */
650 
651 /*
652  * Trigger IOAT memory copy operation when entering power saving state.
653  * A group of commands will be posted to IOAT driver and those commands
654  * will be placed into an IOAT ring buffer.
655  */
656 static int
657 fipe_ioat_trigger(void)
658 {
659 	int idx;
660 	dcopy_cmd_t *cmds = fipe_ioat_ctrl.ioat_cmds;
661 
662 	for (idx = FIPE_IOAT_CMD_NUM; idx > 0; idx--) {
663 		if (dcopy_cmd_post(cmds[idx]) == DCOPY_SUCCESS) {
664 			continue;
665 		} else {
666 			/*
667 			 * Don't rollback on failure, it doesn't hurt much more
668 			 * than some small memory copy operations.
669 			 */
670 			FIPE_KSTAT_DETAIL_INC(ioat_start_fail_cnt);
671 			return (-1);
672 		}
673 	}
674 
675 	return (0);
676 }
677 
678 /*
679  * Cancel the memory copy operations posted by fipe_ioat_trigger.
680  * It's achieved by posting a new command which will break the ring
681  * created by fipe_ioat_trigger. If it fails, the best way to recover
682  * is to just let it go. IOAT will recover when posting next command
683  * on the same channel.
684  */
685 static void
686 fipe_ioat_cancel(void)
687 {
688 	if (dcopy_cmd_post(fipe_ioat_ctrl.ioat_cmds[0]) != DCOPY_SUCCESS) {
689 		FIPE_KSTAT_DETAIL_INC(ioat_stop_fail_cnt);
690 	}
691 }
692 
693 /*
694  * This function will be called from allocate IOAT resources.
695  * Allocation may fail due to following reasons:
696  * 1) IOAT driver hasn't been loaded yet. Keep on trying in this case.
697  * 2) IOAT resources are temporarily unavailable.  Keep on trying in this case.
698  * 3) Other no recoverable reasons. Disable power management function.
699  */
700 /*ARGSUSED*/
701 static void
702 fipe_ioat_alloc(void *arg)
703 {
704 	int idx, flags, rc = 0;
705 	uint64_t physaddr;
706 	boolean_t fatal = B_FALSE;
707 	dcopy_query_t info;
708 	dcopy_handle_t handle;
709 	dcopy_cmd_t cmds[FIPE_IOAT_CMD_NUM + 1];
710 
711 	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
712 	/*
713 	 * fipe_ioat_alloc() is called in DEVICE ATTACH context when loaded.
714 	 * In DEVICE ATTACH context, it can't call ddi_walk_devs(), so just
715 	 * schedule a timer and exit.
716 	 */
717 	if (fipe_ioat_ctrl.ioat_try_alloc == B_FALSE) {
718 		fipe_ioat_ctrl.ioat_try_alloc = B_TRUE;
719 		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
720 		goto out_error;
721 	}
722 
723 	/*
724 	 * Check whether device has been initialized or if it encountered
725 	 * some permanent error.
726 	 */
727 	if (fipe_ioat_ctrl.ioat_ready || fipe_ioat_ctrl.ioat_failed ||
728 	    fipe_ioat_ctrl.ioat_cancel) {
729 		fipe_ioat_ctrl.ioat_timerid = 0;
730 		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
731 		return;
732 	}
733 
734 	if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
735 		/* Find dev_info_t for IOAT engine. */
736 		ddi_walk_devs(ddi_root_node(), fipe_search_ioat_dev, NULL);
737 		if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
738 			cmn_err(CE_NOTE,
739 			    "!fipe: no IOAT hardware found, disable pm.");
740 			mutex_exit(&fipe_ioat_ctrl.ioat_lock);
741 			fatal = B_TRUE;
742 			goto out_error;
743 		}
744 	}
745 	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
746 
747 	/* Check, allocate and initialize IOAT resources with lock released. */
748 	dcopy_query(&info);
749 	if (info.dq_version < DCOPY_QUERY_V0) {
750 		/* Permanent error, give up. */
751 		cmn_err(CE_WARN, "!fipe: IOAT driver version mismatch.");
752 		fatal = B_TRUE;
753 		goto out_error;
754 	} else if (info.dq_num_channels == 0) {
755 		/* IOAT driver hasn't been loaded, keep trying. */
756 		goto out_error;
757 	}
758 
759 	/* Allocate IOAT channel. */
760 	rc = dcopy_alloc(DCOPY_NOSLEEP, &handle);
761 	if (rc == DCOPY_NORESOURCES) {
762 		/* Resource temporarily not available, keep trying. */
763 		goto out_error;
764 	} else if (rc != DCOPY_SUCCESS) {
765 		/* Permanent error, give up. */
766 		cmn_err(CE_WARN, "!fipe: failed to allocate IOAT channel.");
767 		fatal = B_TRUE;
768 		goto out_error;
769 	}
770 
771 	/*
772 	 * Allocate multiple IOAT commands and organize them into a ring to
773 	 * loop forever. Commands number is determined by IOAT descriptor size
774 	 * and memory interleave pattern.
775 	 * cmd[0] is used break the loop and disable IOAT operation.
776 	 * cmd[1, FIPE_IOAT_CMD_NUM] are grouped into a ring and cmd[1] is the
777 	 * list head.
778 	 */
779 	bzero(cmds, sizeof (cmds));
780 	physaddr = fipe_ioat_ctrl.ioat_buf_physaddr;
781 	for (idx = FIPE_IOAT_CMD_NUM; idx >= 0; idx--) {
782 		/* Allocate IOAT commands. */
783 		if (idx == 0 || idx == FIPE_IOAT_CMD_NUM) {
784 			flags = DCOPY_NOSLEEP;
785 		} else {
786 			/*
787 			 * To link commands into a list, the initial value of
788 			 * cmd need to be set to next cmd on list.
789 			 */
790 			flags = DCOPY_NOSLEEP | DCOPY_ALLOC_LINK;
791 			cmds[idx] = cmds[idx + 1];
792 		}
793 		rc = dcopy_cmd_alloc(handle, flags, &cmds[idx]);
794 		if (rc == DCOPY_NORESOURCES) {
795 			goto out_freecmd;
796 		} else if (rc != DCOPY_SUCCESS) {
797 			/* Permanent error, give up. */
798 			cmn_err(CE_WARN,
799 			    "!fipe: failed to allocate IOAT command.");
800 			fatal = B_TRUE;
801 			goto out_freecmd;
802 		}
803 
804 		/* Disable src/dst snoop to improve CPU cache efficiency. */
805 		cmds[idx]->dp_flags = DCOPY_CMD_NOSRCSNP | DCOPY_CMD_NODSTSNP;
806 		/* Specially handle commands on the list. */
807 		if (idx != 0) {
808 			/* Disable IOAT status. */
809 			cmds[idx]->dp_flags |= DCOPY_CMD_NOSTAT;
810 			/* Disable waiting for resources. */
811 			cmds[idx]->dp_flags |= DCOPY_CMD_NOWAIT;
812 			if (idx == 1) {
813 				/* The list head, chain command into loop. */
814 				cmds[idx]->dp_flags |= DCOPY_CMD_LOOP;
815 			} else {
816 				/* Queue all other commands except head. */
817 				cmds[idx]->dp_flags |= DCOPY_CMD_QUEUE;
818 			}
819 		}
820 		cmds[idx]->dp_cmd = DCOPY_CMD_COPY;
821 		cmds[idx]->dp.copy.cc_source = physaddr;
822 		cmds[idx]->dp.copy.cc_dest = physaddr + FIPE_MC_MEMORY_OFFSET;
823 		if (idx == 0) {
824 			/*
825 			 * Command 0 is used to cancel memory copy by breaking
826 			 * the ring created in fipe_ioat_trigger().
827 			 * For efficiency, use the smallest memory copy size.
828 			 */
829 			cmds[idx]->dp.copy.cc_size = 1;
830 		} else {
831 			cmds[idx]->dp.copy.cc_size = FIPE_MC_MEMORY_SIZE;
832 		}
833 	}
834 
835 	/* Update IOAT control status if it hasn't been initialized yet. */
836 	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
837 	if (!fipe_ioat_ctrl.ioat_ready && !fipe_ioat_ctrl.ioat_cancel) {
838 		fipe_ioat_ctrl.ioat_handle = handle;
839 		for (idx = 0; idx <= FIPE_IOAT_CMD_NUM; idx++) {
840 			fipe_ioat_ctrl.ioat_cmds[idx] = cmds[idx];
841 		}
842 		fipe_ioat_ctrl.ioat_ready = B_TRUE;
843 		fipe_ioat_ctrl.ioat_failed = B_FALSE;
844 		fipe_ioat_ctrl.ioat_timerid = 0;
845 		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
846 		return;
847 	}
848 	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
849 	/* Initialized by another thread, fall through to free resources. */
850 
851 out_freecmd:
852 	if (cmds[0] != NULL) {
853 		dcopy_cmd_free(&cmds[0]);
854 	}
855 	/* Only need to free head, dcopy will free all commands on the list. */
856 	for (idx = 1; idx <= FIPE_IOAT_CMD_NUM; idx++) {
857 		if (cmds[idx] != NULL) {
858 			dcopy_cmd_free(&cmds[idx]);
859 			break;
860 		}
861 	}
862 	dcopy_free(&handle);
863 
864 out_error:
865 	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
866 	fipe_ioat_ctrl.ioat_timerid = 0;
867 	if (!fipe_ioat_ctrl.ioat_ready && !fipe_ioat_ctrl.ioat_cancel) {
868 		if (fatal) {
869 			/* Mark permanent error and give up. */
870 			fipe_ioat_ctrl.ioat_failed = B_TRUE;
871 			/* Release reference count hold by ddi_find_devinfo. */
872 			if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
873 				ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
874 				fipe_ioat_ctrl.ioat_dev_info = NULL;
875 			}
876 		} else {
877 			/*
878 			 * Schedule another timer to keep on trying.
879 			 * timeout() should always success, no need to check.
880 			 */
881 			fipe_ioat_ctrl.ioat_timerid = timeout(fipe_ioat_alloc,
882 			    NULL, drv_usectohz(FIPE_IOAT_RETRY_INTERVAL));
883 		}
884 	}
885 	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
886 }
887 
888 /*
889  * Free resources allocated in fipe_ioat_alloc.
890  */
891 static void
892 fipe_ioat_free(void)
893 {
894 	int idx = 0;
895 	dcopy_cmd_t *cmds = fipe_ioat_ctrl.ioat_cmds;
896 
897 	mutex_enter(&fipe_ioat_ctrl.ioat_lock);
898 
899 	/* Cancel timeout to avoid race condition. */
900 	if (fipe_ioat_ctrl.ioat_timerid != 0) {
901 		fipe_ioat_ctrl.ioat_cancel = B_TRUE;
902 		mutex_exit(&fipe_ioat_ctrl.ioat_lock);
903 		(void) untimeout(fipe_ioat_ctrl.ioat_timerid);
904 		mutex_enter(&fipe_ioat_ctrl.ioat_lock);
905 		fipe_ioat_ctrl.ioat_timerid = 0;
906 		fipe_ioat_ctrl.ioat_cancel = B_FALSE;
907 	}
908 
909 	/* Free ioat resources. */
910 	if (fipe_ioat_ctrl.ioat_ready) {
911 		if (cmds[0] != NULL) {
912 			dcopy_cmd_free(&cmds[0]);
913 		}
914 		for (idx = 1; idx <= FIPE_IOAT_CMD_NUM; idx++) {
915 			if (cmds[idx] != NULL) {
916 				dcopy_cmd_free(&cmds[idx]);
917 				break;
918 			}
919 		}
920 		bzero(fipe_ioat_ctrl.ioat_cmds,
921 		    sizeof (fipe_ioat_ctrl.ioat_cmds));
922 		dcopy_free(&fipe_ioat_ctrl.ioat_handle);
923 		fipe_ioat_ctrl.ioat_handle = NULL;
924 		fipe_ioat_ctrl.ioat_ready = B_FALSE;
925 	}
926 
927 	/* Release reference count hold by ddi_find_devinfo. */
928 	if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
929 		ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
930 		fipe_ioat_ctrl.ioat_dev_info = NULL;
931 	}
932 
933 	mutex_exit(&fipe_ioat_ctrl.ioat_lock);
934 }
935 #endif	/* FIPE_IOAT_BUILTIN */
936 
937 /*
938  * Initialize IOAT relative resources.
939  */
940 static int
941 fipe_ioat_init(void)
942 {
943 	char *buf;
944 	size_t size;
945 
946 	bzero(&fipe_ioat_ctrl, sizeof (fipe_ioat_ctrl));
947 	mutex_init(&fipe_ioat_ctrl.ioat_lock, NULL, MUTEX_DRIVER, NULL);
948 
949 	/*
950 	 * Allocate memory for IOAT memory copy operation.
951 	 * The allocated memory should be page aligned to achieve better power
952 	 * savings.
953 	 * Don't use ddi_dma_mem_alloc here to keep thing simple.  This also
954 	 * makes quiesce easier.
955 	 */
956 	size = PAGESIZE;
957 	buf = kmem_zalloc(size, KM_SLEEP);
958 	if ((intptr_t)buf & PAGEOFFSET) {
959 		kmem_free(buf, PAGESIZE);
960 		size <<= 1;
961 		buf = kmem_zalloc(size, KM_SLEEP);
962 	}
963 	fipe_ioat_ctrl.ioat_buf_size = size;
964 	fipe_ioat_ctrl.ioat_buf_start = buf;
965 	buf = (char *)P2ROUNDUP((intptr_t)buf, PAGESIZE);
966 	fipe_ioat_ctrl.ioat_buf_virtaddr = buf;
967 	fipe_ioat_ctrl.ioat_buf_physaddr = hat_getpfnum(kas.a_hat, buf);
968 	fipe_ioat_ctrl.ioat_buf_physaddr <<= PAGESHIFT;
969 
970 #ifdef	FIPE_IOAT_BUILTIN
971 	{
972 		uint64_t bufpa;
973 		/* IOAT descriptor data structure copied from ioat.h. */
974 		struct fipe_ioat_cmd_desc {
975 			uint32_t	dd_size;
976 			uint32_t	dd_ctrl;
977 			uint64_t	dd_src_paddr;
978 			uint64_t	dd_dest_paddr;
979 			uint64_t	dd_next_desc;
980 			uint64_t	dd_res4;
981 			uint64_t	dd_res5;
982 			uint64_t	dd_res6;
983 			uint64_t	dd_res7;
984 		} *desc;
985 
986 		/*
987 		 * Build two IOAT command descriptors and chain them into ring.
988 		 * Control flags as below:
989 		 *	0x2: disable source snoop
990 		 *	0x4: disable destination snoop
991 		 *	0x0 << 24: memory copy operation
992 		 * The layout for command descriptors and memory buffers are
993 		 * organized for power saving effect, please don't change it.
994 		 */
995 		buf = fipe_ioat_ctrl.ioat_buf_virtaddr;
996 		bufpa = fipe_ioat_ctrl.ioat_buf_physaddr;
997 		fipe_ioat_ctrl.ioat_cmd_physaddr = bufpa;
998 
999 		/* First command descriptor. */
1000 		desc = (struct fipe_ioat_cmd_desc *)(buf);
1001 		desc->dd_size = 128;
1002 		desc->dd_ctrl = 0x6;
1003 		desc->dd_src_paddr = bufpa + 2048;
1004 		desc->dd_dest_paddr = bufpa + 3072;
1005 		/* Point to second descriptor. */
1006 		desc->dd_next_desc = bufpa + 64;
1007 
1008 		/* Second command descriptor. */
1009 		desc = (struct fipe_ioat_cmd_desc *)(buf + 64);
1010 		desc->dd_size = 128;
1011 		desc->dd_ctrl = 0x6;
1012 		desc->dd_src_paddr = bufpa + 2048;
1013 		desc->dd_dest_paddr = bufpa + 3072;
1014 		/* Point to first descriptor. */
1015 		desc->dd_next_desc = bufpa;
1016 	}
1017 #endif	/* FIPE_IOAT_BUILTIN */
1018 
1019 	return (0);
1020 }
1021 
1022 static void
1023 fipe_ioat_fini(void)
1024 {
1025 	/* Release reference count hold by ddi_find_devinfo. */
1026 	if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
1027 		ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
1028 		fipe_ioat_ctrl.ioat_dev_info = NULL;
1029 	}
1030 
1031 	if (fipe_ioat_ctrl.ioat_buf_start != NULL) {
1032 		ASSERT(fipe_ioat_ctrl.ioat_buf_size != 0);
1033 		kmem_free(fipe_ioat_ctrl.ioat_buf_start,
1034 		    fipe_ioat_ctrl.ioat_buf_size);
1035 	}
1036 
1037 	mutex_destroy(&fipe_ioat_ctrl.ioat_lock);
1038 	bzero(&fipe_ioat_ctrl, sizeof (fipe_ioat_ctrl));
1039 }
1040 
1041 static int
1042 fipe_idle_start(void)
1043 {
1044 	int rc;
1045 
1046 	if (fipe_idle_ctrl.idle_ready) {
1047 		return (0);
1048 	}
1049 
1050 	if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_ENTER_TIMESTAMP,
1051 	    &fipe_idle_ctrl.prop_enter) != 0) {
1052 		cmn_err(CE_WARN, "!fipe: failed to get enter_ts property.");
1053 		return (-1);
1054 	}
1055 	if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_EXIT_TIMESTAMP,
1056 	    &fipe_idle_ctrl.prop_exit) != 0) {
1057 		cmn_err(CE_WARN, "!fipe: failed to get exit_ts property.");
1058 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);
1059 		return (-1);
1060 	}
1061 	if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_TOTAL_IDLE_TIME,
1062 	    &fipe_idle_ctrl.prop_idle) != 0) {
1063 		cmn_err(CE_WARN, "!fipe: failed to get idle_time property.");
1064 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_exit);
1065 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);
1066 		return (-1);
1067 	}
1068 	if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_TOTAL_BUSY_TIME,
1069 	    &fipe_idle_ctrl.prop_busy) != 0) {
1070 		cmn_err(CE_WARN, "!fipe: failed to get busy_time property.");
1071 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_idle);
1072 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_exit);
1073 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);
1074 		return (-1);
1075 	}
1076 	if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_INTERRUPT_COUNT,
1077 	    &fipe_idle_ctrl.prop_intr) != 0) {
1078 		cmn_err(CE_WARN, "!fipe: failed to get intr_count property.");
1079 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_busy);
1080 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_idle);
1081 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_exit);
1082 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);
1083 		return (-1);
1084 	}
1085 
1086 	/* Register idle state notification callback. */
1087 	rc = cpu_idle_register_callback(CPU_IDLE_CB_PRIO_FIPE, &fipe_idle_cb,
1088 	    NULL, &fipe_idle_ctrl.cb_handle);
1089 	if (rc != 0) {
1090 		cmn_err(CE_WARN, "!fipe: failed to register cpuidle callback.");
1091 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_intr);
1092 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_busy);
1093 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_idle);
1094 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_exit);
1095 		(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);
1096 		return (-1);
1097 	}
1098 
1099 	fipe_idle_ctrl.idle_ready = B_TRUE;
1100 
1101 	return (0);
1102 }
1103 
1104 static int
1105 fipe_idle_stop(void)
1106 {
1107 	int rc;
1108 
1109 	if (fipe_idle_ctrl.idle_ready == B_FALSE) {
1110 		return (0);
1111 	}
1112 
1113 	rc = cpu_idle_unregister_callback(fipe_idle_ctrl.cb_handle);
1114 	if (rc != 0) {
1115 		cmn_err(CE_WARN,
1116 		    "!fipe: failed to unregister cpuidle callback.");
1117 		return (-1);
1118 	}
1119 
1120 	(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_intr);
1121 	(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_busy);
1122 	(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_idle);
1123 	(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_exit);
1124 	(void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);
1125 
1126 	fipe_idle_ctrl.idle_ready = B_FALSE;
1127 
1128 	return (0);
1129 }
1130 
1131 #ifdef	FIPE_KSTAT_SUPPORT
1132 static int
1133 fipe_kstat_update(kstat_t *ksp, int rw)
1134 {
1135 	struct fipe_kstat_s *sp;
1136 	hrtime_t hrt;
1137 
1138 	if (rw == KSTAT_WRITE) {
1139 		return (EACCES);
1140 	}
1141 
1142 	sp = ksp->ks_data;
1143 	sp->fipe_enabled.value.i32 = fipe_gbl_ctrl.pm_enabled ? 1 : 0;
1144 	sp->fipe_policy.value.i32 = fipe_pm_policy;
1145 
1146 	hrt = fipe_gbl_ctrl.time_in_pm;
1147 	scalehrtime(&hrt);
1148 	sp->fipe_pm_time.value.ui64 = (uint64_t)hrt;
1149 
1150 #ifdef	FIPE_KSTAT_DETAIL
1151 	sp->ioat_ready.value.i32 = fipe_ioat_ctrl.ioat_ready ? 1 : 0;
1152 #endif	/* FIPE_KSTAT_DETAIL */
1153 
1154 	return (0);
1155 }
1156 #endif	/* FIPE_KSTAT_SUPPORT */
1157 
1158 /*
1159  * Initialize memory power management subsystem.
1160  * Note: This function should only be called from ATTACH.
1161  * Note: caller must ensure exclusive access to all fipe_xxx interfaces.
1162  */
1163 int
1164 fipe_init(dev_info_t *dip)
1165 {
1166 	size_t nsize;
1167 	hrtime_t hrt;
1168 
1169 	/* Initialize global control structure. */
1170 	bzero(&fipe_gbl_ctrl, sizeof (fipe_gbl_ctrl));
1171 	mutex_init(&fipe_gbl_ctrl.lock, NULL, MUTEX_DRIVER, NULL);
1172 
1173 	/* Query power management policy from device property. */
1174 	fipe_pm_policy = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
1175 	    FIPE_PROP_PM_POLICY, fipe_pm_policy);
1176 	if (fipe_pm_policy < 0 || fipe_pm_policy >= FIPE_PM_POLICY_MAX) {
1177 		cmn_err(CE_CONT,
1178 		    "?fipe: invalid power management policy %d.\n",
1179 		    fipe_pm_policy);
1180 		fipe_pm_policy = FIPE_PM_POLICY_BALANCE;
1181 	}
1182 	fipe_profile_curr = &fipe_profiles[fipe_pm_policy];
1183 
1184 	/*
1185 	 * Compute unscaled hrtime value corresponding to FIPE_STAT_INTERVAL.
1186 	 * (1 << 36) should be big enough here.
1187 	 */
1188 	hrt = 1ULL << 36;
1189 	scalehrtime(&hrt);
1190 	fipe_idle_ctrl.tick_interval = FIPE_STAT_INTERVAL * (1ULL << 36) / hrt;
1191 
1192 	if (fipe_mc_init(dip) != 0) {
1193 		cmn_err(CE_WARN, "!fipe: failed to initialize mc state.");
1194 		goto out_mc_error;
1195 	}
1196 	if (fipe_ioat_init() != 0) {
1197 		cmn_err(CE_NOTE, "!fipe: failed to initialize ioat state.");
1198 		goto out_ioat_error;
1199 	}
1200 
1201 	/* Allocate per-CPU structure. */
1202 	nsize = max_ncpus * sizeof (fipe_cpu_state_t);
1203 	nsize += CPU_CACHE_COHERENCE_SIZE;
1204 	fipe_gbl_ctrl.state_buf = kmem_zalloc(nsize, KM_SLEEP);
1205 	fipe_gbl_ctrl.state_size = nsize;
1206 	fipe_cpu_states = (fipe_cpu_state_t *)P2ROUNDUP(
1207 	    (intptr_t)fipe_gbl_ctrl.state_buf, CPU_CACHE_COHERENCE_SIZE);
1208 
1209 #ifdef	FIPE_KSTAT_SUPPORT
1210 	fipe_gbl_ctrl.fipe_kstat = kstat_create("fipe", 0, "fipe-pm", "misc",
1211 	    KSTAT_TYPE_NAMED, sizeof (fipe_kstat) / sizeof (kstat_named_t),
1212 	    KSTAT_FLAG_VIRTUAL);
1213 	if (fipe_gbl_ctrl.fipe_kstat == NULL) {
1214 		cmn_err(CE_CONT, "?fipe: failed to create kstat object.\n");
1215 	} else {
1216 		fipe_gbl_ctrl.fipe_kstat->ks_lock = &fipe_gbl_ctrl.lock;
1217 		fipe_gbl_ctrl.fipe_kstat->ks_data = &fipe_kstat;
1218 		fipe_gbl_ctrl.fipe_kstat->ks_update = fipe_kstat_update;
1219 		kstat_install(fipe_gbl_ctrl.fipe_kstat);
1220 	}
1221 #endif	/* FIPE_KSTAT_SUPPORT */
1222 
1223 	return (0);
1224 
1225 out_ioat_error:
1226 	fipe_mc_fini();
1227 out_mc_error:
1228 	mutex_destroy(&fipe_gbl_ctrl.lock);
1229 	bzero(&fipe_gbl_ctrl, sizeof (fipe_gbl_ctrl));
1230 
1231 	return (-1);
1232 }
1233 
1234 /*
1235  * Destroy memory power management subsystem.
1236  * Note: This function should only be called from DETACH.
1237  * Note: caller must ensure exclusive access to all fipe_xxx interfaces.
1238  */
1239 int
1240 fipe_fini(void)
1241 {
1242 	if (fipe_gbl_ctrl.pm_enabled) {
1243 		cmn_err(CE_NOTE, "!fipe: call fipe_fini without stopping PM.");
1244 		return (EBUSY);
1245 	}
1246 
1247 	ASSERT(!fipe_gbl_ctrl.pm_active);
1248 	fipe_ioat_fini();
1249 	fipe_mc_fini();
1250 
1251 #ifdef	FIPE_KSTAT_SUPPORT
1252 	if (fipe_gbl_ctrl.fipe_kstat != NULL) {
1253 		kstat_delete(fipe_gbl_ctrl.fipe_kstat);
1254 		fipe_gbl_ctrl.fipe_kstat = NULL;
1255 	}
1256 #endif	/* FIPE_KSTAT_SUPPORT */
1257 
1258 	if (fipe_gbl_ctrl.state_buf != NULL) {
1259 		ASSERT(fipe_gbl_ctrl.state_size != 0);
1260 		kmem_free(fipe_gbl_ctrl.state_buf, fipe_gbl_ctrl.state_size);
1261 		fipe_cpu_states = NULL;
1262 	}
1263 
1264 	fipe_profile_curr = NULL;
1265 	mutex_destroy(&fipe_gbl_ctrl.lock);
1266 	bzero(&fipe_gbl_ctrl, sizeof (fipe_gbl_ctrl));
1267 
1268 	return (0);
1269 }
1270 
1271 /*
1272  * Start memory power management subsystem.
1273  * Note: caller must ensure exclusive access to all fipe_xxx interfaces.
1274  */
1275 int
1276 fipe_start(void)
1277 {
1278 	if (fipe_gbl_ctrl.pm_enabled == B_TRUE) {
1279 		return (0);
1280 	}
1281 
1282 	bzero(fipe_cpu_states, max_ncpus * sizeof (fipe_cpu_states[0]));
1283 	fipe_ioat_alloc(NULL);
1284 	if (fipe_idle_start() != 0) {
1285 		cmn_err(CE_NOTE, "!fipe: failed to start PM subsystem.");
1286 		fipe_ioat_free();
1287 		return (-1);
1288 	}
1289 
1290 	fipe_gbl_ctrl.pm_enabled = B_TRUE;
1291 
1292 	return (0);
1293 }
1294 
1295 /*
1296  * Stop memory power management subsystem.
1297  * Note: caller must ensure exclusive access to all fipe_xxx interfaces.
1298  */
1299 int
1300 fipe_stop(void)
1301 {
1302 	if (fipe_gbl_ctrl.pm_enabled) {
1303 		if (fipe_idle_stop() != 0) {
1304 			cmn_err(CE_NOTE,
1305 			    "!fipe: failed to stop PM subsystem.");
1306 			return (-1);
1307 		}
1308 		fipe_ioat_free();
1309 		fipe_gbl_ctrl.pm_enabled = B_FALSE;
1310 	}
1311 	ASSERT(!fipe_gbl_ctrl.pm_active);
1312 
1313 	return (0);
1314 }
1315 
1316 int
1317 fipe_suspend(void)
1318 {
1319 	/* Save current power management policy. */
1320 	fipe_pm_policy_saved = fipe_pm_policy;
1321 	/* Disable PM by setting profile to FIPE_PM_POLICY_DISABLE. */
1322 	fipe_pm_policy = FIPE_PM_POLICY_DISABLE;
1323 	fipe_profile_curr = &fipe_profiles[fipe_pm_policy];
1324 
1325 	return (0);
1326 }
1327 
1328 int
1329 fipe_resume(void)
1330 {
1331 	/* Restore saved power management policy. */
1332 	fipe_pm_policy = fipe_pm_policy_saved;
1333 	fipe_profile_curr = &fipe_profiles[fipe_pm_policy];
1334 
1335 	return (0);
1336 }
1337 
1338 fipe_pm_policy_t
1339 fipe_get_pmpolicy(void)
1340 {
1341 	return (fipe_pm_policy);
1342 }
1343 
1344 int
1345 fipe_set_pmpolicy(fipe_pm_policy_t policy)
1346 {
1347 	if (policy < 0 || policy >= FIPE_PM_POLICY_MAX) {
1348 		return (EINVAL);
1349 	}
1350 	fipe_pm_policy = policy;
1351 	fipe_profile_curr = &fipe_profiles[fipe_pm_policy];
1352 
1353 	return (0);
1354 }
1355 
1356 /*
1357  * Check condition (fipe_gbl_ctrl.cpu_cnt == ncpus) to make sure that
1358  * there is other CPU trying to wake up system from memory power saving state.
1359  * If a CPU is waking up system, fipe_disable() will set
1360  * fipe_gbl_ctrl.pm_active to false as soon as possible and allow other CPU's
1361  * to continue, and it will take the responsibility to recover system from
1362  * memory power saving state.
1363  */
1364 static void
1365 fipe_enable(int throttle, cpu_idle_check_wakeup_t check_func, void* check_arg)
1366 {
1367 	extern void membar_sync(void);
1368 
1369 	FIPE_KSTAT_DETAIL_INC(pm_tryenter_cnt);
1370 
1371 	/*
1372 	 * Check CPU wakeup events.
1373 	 */
1374 	if (check_func != NULL) {
1375 		(*check_func)(check_arg);
1376 	}
1377 
1378 	/*
1379 	 * Try to acquire mutex, which also implicitly has the same effect
1380 	 * of calling membar_sync().
1381 	 * If mutex_tryenter fails, that means other CPU is waking up.
1382 	 */
1383 	if (mutex_tryenter(&fipe_gbl_ctrl.lock) == 0) {
1384 		FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
1385 	/*
1386 	 * Handle a special race condition for the case that a CPU wakes
1387 	 * and then enters into idle state within a short period.
1388 	 * This case can't be reliably detected by cpu_count mechanism.
1389 	 */
1390 	} else if (fipe_gbl_ctrl.pm_active) {
1391 		FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
1392 		mutex_exit(&fipe_gbl_ctrl.lock);
1393 	} else {
1394 		fipe_gbl_ctrl.pm_active = B_TRUE;
1395 		membar_sync();
1396 		if (fipe_gbl_ctrl.cpu_count != ncpus) {
1397 			FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
1398 			fipe_gbl_ctrl.pm_active = B_FALSE;
1399 		} else if (fipe_ioat_trigger() != 0) {
1400 			fipe_gbl_ctrl.pm_active = B_FALSE;
1401 		} else if (fipe_gbl_ctrl.cpu_count != ncpus ||
1402 		    fipe_mc_change(throttle) != 0) {
1403 			fipe_gbl_ctrl.pm_active = B_FALSE;
1404 			fipe_ioat_cancel();
1405 			if (fipe_gbl_ctrl.cpu_count != ncpus) {
1406 				FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
1407 			}
1408 		} else if (fipe_gbl_ctrl.cpu_count != ncpus) {
1409 			fipe_gbl_ctrl.pm_active = B_FALSE;
1410 			fipe_mc_restore();
1411 			fipe_ioat_cancel();
1412 			FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
1413 		} else {
1414 			FIPE_KSTAT_DETAIL_INC(pm_success_cnt);
1415 		}
1416 		mutex_exit(&fipe_gbl_ctrl.lock);
1417 	}
1418 }
1419 
1420 static void
1421 fipe_disable(void)
1422 {
1423 	/*
1424 	 * Try to acquire lock, which also implicitly has the same effect
1425 	 * of calling membar_sync().
1426 	 */
1427 	while (mutex_tryenter(&fipe_gbl_ctrl.lock) == 0) {
1428 		/*
1429 		 * If power saving is inactive, just return and all dirty
1430 		 * house-keeping work will be handled in fipe_enable().
1431 		 */
1432 		if (fipe_gbl_ctrl.pm_active == B_FALSE) {
1433 			return;
1434 		} else {
1435 			(void) SMT_PAUSE();
1436 		}
1437 	}
1438 
1439 	/* Disable power saving if it's active. */
1440 	if (fipe_gbl_ctrl.pm_active) {
1441 		/*
1442 		 * Set pm_active to FALSE as soon as possible to prevent
1443 		 * other CPUs from waiting on pm_active flag.
1444 		 */
1445 		fipe_gbl_ctrl.pm_active = B_FALSE;
1446 		membar_producer();
1447 		fipe_mc_restore();
1448 		fipe_ioat_cancel();
1449 	}
1450 
1451 	mutex_exit(&fipe_gbl_ctrl.lock);
1452 }
1453 
1454 /*ARGSUSED*/
1455 static boolean_t
1456 fipe_check_cpu(struct fipe_cpu_state *sp, cpu_idle_callback_context_t ctx,
1457     hrtime_t ts)
1458 {
1459 	if (cpu_flagged_offline(CPU->cpu_flags)) {
1460 		/* Treat CPU in offline state as ready. */
1461 		sp->cond_ready = B_TRUE;
1462 		return (B_TRUE);
1463 	} else if (sp->next_ts <= ts) {
1464 		uint64_t intr;
1465 		hrtime_t idle, busy, diff;
1466 		cpu_idle_prop_value_t val;
1467 
1468 		/* Set default value. */
1469 		sp->cond_ready = B_TRUE;
1470 		sp->idle_count = 0;
1471 
1472 		/* Calculate idle percent. */
1473 		idle = sp->last_idle;
1474 		sp->last_idle = cpu_idle_prop_get_hrtime(
1475 		    fipe_idle_ctrl.prop_idle, ctx);
1476 		idle = sp->last_idle - idle;
1477 		busy = sp->last_busy;
1478 		sp->last_busy = cpu_idle_prop_get_hrtime(
1479 		    fipe_idle_ctrl.prop_busy, ctx);
1480 		busy = sp->last_busy - busy;
1481 		/* Check idle condition. */
1482 		if (idle > 0 && busy > 0) {
1483 			if (busy * (100 - FIPE_PROF_BUSY_THRESHOLD) >
1484 			    idle * FIPE_PROF_BUSY_THRESHOLD) {
1485 				FIPE_KSTAT_DETAIL_INC(cpu_busy_cnt);
1486 				sp->cond_ready = B_FALSE;
1487 			} else {
1488 				FIPE_KSTAT_DETAIL_INC(cpu_idle_cnt);
1489 			}
1490 		} else {
1491 			FIPE_KSTAT_DETAIL_INC(cpu_busy_cnt);
1492 			sp->cond_ready = B_FALSE;
1493 		}
1494 
1495 		/* Calculate interrupt count. */
1496 		diff = sp->next_ts;
1497 		sp->next_ts = ts + fipe_idle_ctrl.tick_interval;
1498 		diff = sp->next_ts - diff;
1499 		intr = sp->last_intr;
1500 		if (cpu_idle_prop_get_value(fipe_idle_ctrl.prop_intr, ctx,
1501 		    &val) == 0) {
1502 			sp->last_intr = val.cipv_uint64;
1503 			intr = sp->last_intr - intr;
1504 			if (diff != 0) {
1505 				intr = intr * fipe_idle_ctrl.tick_interval;
1506 				intr /= diff;
1507 			} else {
1508 				intr = FIPE_PROF_INTR_THRESHOLD;
1509 			}
1510 		} else {
1511 			intr = FIPE_PROF_INTR_THRESHOLD;
1512 		}
1513 
1514 		/*
1515 		 * System is busy with interrupts, so disable all PM
1516 		 * status checks for INTR_BUSY_THROTTLE ticks.
1517 		 * Interrupts are disabled when FIPE callbacks are called,
1518 		 * so this optimization will help to reduce interrupt
1519 		 * latency.
1520 		 */
1521 		if (intr >= FIPE_PROF_INTR_BUSY_THRESHOLD) {
1522 			FIPE_KSTAT_DETAIL_INC(cpu_intr_busy_cnt);
1523 			sp->throttle_ts = ts + FIPE_PROF_INTR_BUSY_THROTTLE *
1524 			    fipe_idle_ctrl.tick_interval;
1525 			sp->cond_ready = B_FALSE;
1526 		} else if (intr >= FIPE_PROF_INTR_THRESHOLD) {
1527 			FIPE_KSTAT_DETAIL_INC(cpu_intr_throttle_cnt);
1528 			sp->cond_ready = B_FALSE;
1529 		}
1530 	} else if (++sp->idle_count >= FIPE_PROF_IDLE_COUNT) {
1531 		/* Too many idle enter/exit in this tick. */
1532 		FIPE_KSTAT_DETAIL_INC(cpu_loop_cnt);
1533 		sp->throttle_ts = sp->next_ts + fipe_idle_ctrl.tick_interval;
1534 		sp->idle_count = 0;
1535 		sp->cond_ready = B_FALSE;
1536 		return (B_FALSE);
1537 	}
1538 
1539 	return (sp->cond_ready);
1540 }
1541 
1542 /*ARGSUSED*/
1543 static void
1544 fipe_idle_enter(void *arg, cpu_idle_callback_context_t ctx,
1545     cpu_idle_check_wakeup_t check_func, void* check_arg)
1546 {
1547 	hrtime_t ts;
1548 	uint32_t cnt;
1549 	uint64_t iowait;
1550 	cpu_t *cp = CPU;
1551 	struct fipe_cpu_state *sp;
1552 
1553 	sp = &fipe_cpu_states[cp->cpu_id];
1554 	ts = cpu_idle_prop_get_hrtime(fipe_idle_ctrl.prop_enter, ctx);
1555 
1556 	if (fipe_pm_policy != FIPE_PM_POLICY_DISABLE &&
1557 	    fipe_ioat_ctrl.ioat_ready &&
1558 	    sp->state_ready && sp->throttle_ts <= ts) {
1559 		/* Adjust iowait count for local CPU. */
1560 		iowait = CPU_STATS(cp, sys.iowait);
1561 		if (iowait != sp->last_iowait) {
1562 			atomic_add_64(&fipe_gbl_ctrl.io_waiters,
1563 			    iowait - sp->last_iowait);
1564 			sp->last_iowait = iowait;
1565 		}
1566 
1567 		/* Check current CPU status. */
1568 		if (fipe_check_cpu(sp, ctx, ts)) {
1569 			/* Increase count of CPU ready for power saving. */
1570 			do {
1571 				cnt = fipe_gbl_ctrl.cpu_count;
1572 				ASSERT(cnt < ncpus);
1573 			} while (atomic_cas_32(&fipe_gbl_ctrl.cpu_count,
1574 			    cnt, cnt + 1) != cnt);
1575 
1576 			/*
1577 			 * Enable power saving if all CPUs are idle.
1578 			 */
1579 			if (cnt + 1 == ncpus) {
1580 				if (fipe_gbl_ctrl.io_waiters == 0) {
1581 					fipe_gbl_ctrl.enter_ts = ts;
1582 					fipe_enable(fipe_pm_throttle_level,
1583 					    check_func, check_arg);
1584 				/* There are ongoing block io operations. */
1585 				} else {
1586 					FIPE_KSTAT_DETAIL_INC(bio_busy_cnt);
1587 				}
1588 			}
1589 		}
1590 	} else if (fipe_pm_policy == FIPE_PM_POLICY_DISABLE ||
1591 	    fipe_ioat_ctrl.ioat_ready == B_FALSE) {
1592 		if (sp->cond_ready == B_TRUE) {
1593 			sp->cond_ready = B_FALSE;
1594 		}
1595 	} else if (sp->state_ready == B_FALSE) {
1596 		sp->cond_ready = B_FALSE;
1597 		sp->state_ready = B_TRUE;
1598 		sp->throttle_ts = 0;
1599 		sp->next_ts = ts + fipe_idle_ctrl.tick_interval;
1600 		sp->last_busy = cpu_idle_prop_get_hrtime(
1601 		    fipe_idle_ctrl.prop_busy, ctx);
1602 		sp->last_idle = cpu_idle_prop_get_hrtime(
1603 		    fipe_idle_ctrl.prop_idle, ctx);
1604 		sp->last_intr = cpu_idle_prop_get_hrtime(
1605 		    fipe_idle_ctrl.prop_intr, ctx);
1606 		sp->idle_count = 0;
1607 	}
1608 }
1609 
1610 /*ARGSUSED*/
1611 static void
1612 fipe_idle_exit(void* arg, cpu_idle_callback_context_t ctx, int flags)
1613 {
1614 	uint32_t cnt;
1615 	hrtime_t ts;
1616 	struct fipe_cpu_state *sp;
1617 
1618 	sp = &fipe_cpu_states[CPU->cpu_id];
1619 	if (sp->cond_ready) {
1620 		do {
1621 			cnt = fipe_gbl_ctrl.cpu_count;
1622 			ASSERT(cnt > 0);
1623 		} while (atomic_cas_32(&fipe_gbl_ctrl.cpu_count,
1624 		    cnt, cnt - 1) != cnt);
1625 
1626 		/*
1627 		 * Try to disable power saving state.
1628 		 * Only the first CPU waking from idle state will try to
1629 		 * disable power saving state, all other CPUs will just go
1630 		 * on and not try to wait for memory to recover from power
1631 		 * saving state.
1632 		 * So there are possible periods during which some CPUs are in
1633 		 * active state but memory is in power saving state.
1634 		 * This is OK, since it is an uncommon case, and it is
1635 		 * better for performance to let them continue as their
1636 		 * blocking latency is smaller than a mutex, and is only
1637 		 * hit in the uncommon condition.
1638 		 */
1639 		if (cnt == ncpus) {
1640 			fipe_disable();
1641 			ts = cpu_idle_prop_get_hrtime(fipe_idle_ctrl.prop_exit,
1642 			    ctx);
1643 			fipe_gbl_ctrl.time_in_pm += ts - fipe_gbl_ctrl.enter_ts;
1644 		}
1645 	}
1646 }
1647