xref: /titanic_44/usr/src/uts/common/io/pciex/pcie_pwr.c (revision d7bec57c3803769d0e8bf1960016b866617d455c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/ddi.h>
28 #include <sys/kmem.h>
29 #include <sys/sysmacros.h>
30 #include <sys/sunddi.h>
31 #include <sys/sunpm.h>
32 #include <sys/epm.h>
33 #include <sys/sunndi.h>
34 #include <sys/ddi_impldefs.h>
35 #include <sys/ddi_implfuncs.h>
36 #include <sys/pcie.h>
37 #include <sys/pcie_impl.h>
38 #include <sys/promif.h>		/* prom_printf */
39 #include <sys/pcie_pwr.h>
40 
41 /*
42  * This file implements the power management functionality for
43  * pci express switch and pci express-to-pci/pci-x bridge. All the
44  * code in this file is generic and is not specific to a particular chip.
45  * The algorithm, which decides when to go to a lower power is explained
46  * below:
47  *
48  *	1. Initially when no children are attached, the driver is idle from
49  *	PM framework point of view ( PM idle/PM busy).
50  *
51  *	2. Driver is PM busy if either a reference count called pwr_hold is
52  *	greater than zero or driver is already at the lowest possible power
53  *	level. The lowest possible power level for the driver is equal to the
54  *	highest power level among its children. The PM busy condition is
55  *	indicated by PCIE_PM_BUSY bit. At any point, only one pm_busy_component
56  *	call is made for a nexus driver instance.
57  *
58  *	3. Driver is PM idle if the pwr_hold is zero and the lowest
59  *	possible power level is less than the driver's current power level.
60  *	At any point, only one pm_idle_component call is made for a nexus
61  *	driver instance.
62  *
63  *	4. For any events like child attach, it increments pwr_hold and marks
64  *	itself busy, if it is not already done so. This temporary hold is
65  *	removed when the event is complete.
66  *
67  *	5. Any child's power change requires the parent (this driver) to be
68  *	full power. So it raises its power and increments pwr_hold. It also
69  *	marks itself temporarily busy, if it is not already done. This hold
70  *	is removed when the child power change is complete.
71  *
72  *	6. After each child power change, it evaluates what is the lowest
73  *	possible power level. If the lowest possible power level is less than
74  *	the current power level and pwr_hold is zero, then it marks itself
75  *	idle. The lowest power level is equal or greater than the highest level
76  *	among the children. It keeps track of children's power level by
77  *	using counters.
78  *
79  *	7. Any code e.g., which is accessing the driver's own registers should
80  *	place a temporary hold using pcie_pm_hold.
81  */
82 
83 static int pcie_pwr_change(dev_info_t *dip, pcie_pwr_t *pwr_p, int new);
84 static void pwr_update_counters(int *countersp, int olevel, int nlevel);
85 static int pwr_level_allowed(pcie_pwr_t *pwr_p);
86 static void pcie_add_comps(dev_info_t *dip, dev_info_t *cdip,
87     pcie_pwr_t *pwr_p);
88 static void pcie_remove_comps(dev_info_t *dip, dev_info_t *cdip,
89     pcie_pwr_t *pwr_p);
90 static void pcie_pm_subrelease(dev_info_t *dip, pcie_pwr_t *pwr_p);
91 static boolean_t pcie_is_pcie(dev_info_t *dip);
92 #ifdef DEBUG
93 static char *pcie_decode_pwr_op(pm_bus_power_op_t op);
94 #else
95 #define	pcie_decode_pwr_op
96 #endif
97 
98 /*
99  * power entry point.
100  *
101  * This function decides whether the PM request is honorable.
102  * If yes, it then does what's necessary for switch or
103  *    bridge to change its power.
104  */
105 /* ARGSUSED */
106 int
107 pcie_power(dev_info_t *dip, int component, int level)
108 {
109 	pcie_pwr_t *pwr_p = PCIE_NEXUS_PMINFO(dip);
110 	int *counters = pwr_p->pwr_counters;
111 	int pmcaps = pwr_p->pwr_pmcaps;
112 	int ret = DDI_FAILURE;
113 
114 #if defined(__i386) || defined(__amd64)
115 	if (dip)
116 		return (DDI_SUCCESS);
117 #endif /* defined(__i386) || defined(__amd64) */
118 
119 	ASSERT(level != PM_LEVEL_UNKNOWN);
120 	/* PM should not asking for a level, which is unsupported */
121 	ASSERT(level == PM_LEVEL_D0 || level == PM_LEVEL_D3 ||
122 	    (level == PM_LEVEL_D1 && (pmcaps & PCIE_SUPPORTS_D1)) ||
123 	    (level == PM_LEVEL_D2 && (pmcaps & PCIE_SUPPORTS_D2)));
124 
125 	mutex_enter(&pwr_p->pwr_lock);
126 	PCIE_DBG("%s(%d): pcie_power: change from %d to %d\n",
127 	    ddi_driver_name(dip), ddi_get_instance(dip), pwr_p->pwr_func_lvl,
128 	    level);
129 	if (pwr_p->pwr_func_lvl == level) {
130 		PCIE_DBG("%s(%d): pcie_power: already at %d\n",
131 		    ddi_driver_name(dip), ddi_get_instance(dip), level);
132 		ret = DDI_SUCCESS;
133 		goto pcie_pwr_done;
134 	}
135 
136 	if (level < pwr_p->pwr_func_lvl) {
137 		/*
138 		 * Going to lower power. Reject this if we are either busy
139 		 * or there is a hold.
140 		 */
141 		if (pwr_p->pwr_flags & PCIE_PM_BUSY) {
142 			PCIE_DBG("%s(%d): pcie_power: rejecting change to %d "
143 			    "as busy\n", ddi_driver_name(dip),
144 			    ddi_get_instance(dip), level);
145 			goto pcie_pwr_done;
146 		}
147 
148 		/*
149 		 * Now we know that we are neither busy nor there is a hold.
150 		 * At this point none of the children should be at full power.
151 		 * Reject the request if level reqested is lower than the level
152 		 * possible.
153 		 */
154 		ASSERT(!counters[PCIE_D0_INDEX] &&
155 		    !counters[PCIE_UNKNOWN_INDEX]);
156 		if (level < pwr_level_allowed(pwr_p)) {
157 			PCIE_DBG("%s(%d): pcie_power: rejecting level %d as"
158 			    " %d is the lowest possible\n",
159 			    ddi_driver_name(dip), ddi_get_instance(dip), level,
160 			    pwr_level_allowed(pwr_p));
161 			goto pcie_pwr_done;
162 		}
163 	}
164 
165 	if (pcie_pwr_change(dip, pwr_p, level) != DDI_SUCCESS) {
166 		PCIE_DBG("%s(%d): pcie_power: attempt to change to %d "
167 		    " failed \n", ddi_driver_name(dip), ddi_get_instance(dip),
168 		    level);
169 		goto pcie_pwr_done;
170 	}
171 	pwr_p->pwr_func_lvl = level;
172 	PCIE_DBG("%s(%d): pcie_power: level changed to %d \n",
173 	    ddi_driver_name(dip), ddi_get_instance(dip), level);
174 	ret = DDI_SUCCESS;
175 
176 pcie_pwr_done:
177 	mutex_exit(&pwr_p->pwr_lock);
178 	return (ret);
179 }
180 
181 /*
182  * Called by pcie_power() only. Caller holds the pwr_lock.
183  *
184  * dip - dev_info pointer
185  * pwr_p - pm info for the node.
186  * new     - new level
187  */
188 static int
189 pcie_pwr_change(dev_info_t *dip, pcie_pwr_t *pwr_p, int new)
190 {
191 	uint16_t pmcsr;
192 
193 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
194 	ASSERT(new != pwr_p->pwr_func_lvl);
195 	pmcsr = pci_config_get16(pwr_p->pwr_conf_hdl, pwr_p->pwr_pmcsr_offset);
196 	pmcsr &= ~PCI_PMCSR_STATE_MASK;
197 	switch (new) {
198 	case PM_LEVEL_D0:
199 		pmcsr |= PCI_PMCSR_D0;
200 		break;
201 
202 	case PM_LEVEL_D1:
203 		pmcsr |= PCI_PMCSR_D1;
204 		break;
205 
206 	case PM_LEVEL_D2:
207 		pmcsr |= PCI_PMCSR_D2;
208 		break;
209 
210 	case PM_LEVEL_D3:
211 		pmcsr |= PCI_PMCSR_D3HOT;
212 		break;
213 
214 	default:
215 		ASSERT(0);
216 		break;
217 	}
218 	/* Save config space, if going to D3 */
219 	if (new == PM_LEVEL_D3) {
220 		PCIE_DBG("%s(%d): pwr_change: saving config space regs\n",
221 		    ddi_driver_name(dip), ddi_get_instance(dip));
222 		if (pci_save_config_regs(dip) != DDI_SUCCESS) {
223 			PCIE_DBG("%s(%d): pcie_pwr_change: failed to save "
224 			    "config space regs\n", ddi_driver_name(dip),
225 			    ddi_get_instance(dip));
226 			return (DDI_FAILURE);
227 		}
228 	}
229 
230 	pci_config_put16(pwr_p->pwr_conf_hdl, pwr_p->pwr_pmcsr_offset, pmcsr);
231 
232 	/*
233 	 * TBD: Taken from pci_pci driver. Is this required?
234 	 * No bus transactions should occur without waiting for
235 	 * settle time specified in PCI PM spec rev 2.1 sec 5.6.1
236 	 * To make things simple, just use the max time specified for
237 	 * all state transitions.
238 	 */
239 	delay(drv_usectohz(PCI_CLK_SETTLE_TIME));
240 
241 	/*
242 	 * Restore config space if coming out of D3
243 	 */
244 	if (pwr_p->pwr_func_lvl == PM_LEVEL_D3) {
245 		PCIE_DBG("%s(%d): pcie_pwr_change: restoring config space\n",
246 		    ddi_driver_name(dip), ddi_get_instance(dip));
247 		if (pci_restore_config_regs(dip) != DDI_SUCCESS) {
248 			PCIE_DBG("%s(%d): pcie_pwr_change: failed to restore "
249 			    "config space regs\n", ddi_driver_name(dip),
250 			    ddi_get_instance(dip));
251 			return (DDI_FAILURE);
252 		}
253 	}
254 	return (DDI_SUCCESS);
255 }
256 
257 /*
258  * bus_ctlops.bus_power function.
259  *
260  * This function handles PRE_ POST_ change notifications, sent by
261  * PM framework related to child's power level change. It marks itself
262  * idle or busy based on the children's power level.
263  */
264 int
265 pcie_bus_power(dev_info_t *dip, void *impl_arg, pm_bus_power_op_t op,
266     void *arg, void *result)
267 {
268 	pcie_pwr_t *pwr_p = PCIE_NEXUS_PMINFO(dip);
269 	int *counters = pwr_p->pwr_counters; /* nexus counters */
270 	int *child_counters; /* per child dip counters */
271 	pm_bp_child_pwrchg_t *bpc;
272 	pm_bp_has_changed_t *bphc;
273 	dev_info_t *cdip;
274 	int new_level;
275 	int old_level;
276 	int rv = DDI_SUCCESS;
277 	int level_allowed, comp;
278 
279 #if defined(__i386) || defined(__amd64)
280 	if (dip)
281 		return (DDI_SUCCESS);
282 #endif /* defined(__i386) || defined(__amd64) */
283 
284 	switch (op) {
285 	case BUS_POWER_PRE_NOTIFICATION:
286 	case BUS_POWER_POST_NOTIFICATION:
287 		bpc = (pm_bp_child_pwrchg_t *)arg;
288 		cdip = bpc->bpc_dip;
289 		new_level = bpc->bpc_nlevel;
290 		old_level = bpc->bpc_olevel;
291 		comp = bpc->bpc_comp;
292 		break;
293 
294 	case BUS_POWER_HAS_CHANGED:
295 		bphc = (pm_bp_has_changed_t *)arg;
296 		cdip = bphc->bphc_dip;
297 		new_level = bphc->bphc_nlevel;
298 		old_level = bphc->bphc_olevel;
299 		comp = bphc->bphc_comp;
300 		break;
301 
302 	default:
303 		break;
304 
305 	}
306 
307 	ASSERT(pwr_p);
308 	mutex_enter(&pwr_p->pwr_lock);
309 	switch (op) {
310 	case BUS_POWER_PRE_NOTIFICATION:
311 		PCIE_DBG("%s(%d): pcie_bus_power: %s@%d op %s %d->%d\n",
312 		    ddi_driver_name(dip), ddi_get_instance(dip),
313 		    ddi_driver_name(cdip), ddi_get_instance(cdip),
314 		    pcie_decode_pwr_op(op), old_level, new_level);
315 		/*
316 		 * If the nexus doesn't want the child to go into
317 		 * non-D0 state, mark the child busy. This way PM
318 		 * framework will never try to lower the child's power.
319 		 * In case of pm_lower_power, marking busy won't help.
320 		 * So we need to specifically reject the attempt to
321 		 * go to non-D0 state.
322 		 */
323 		if (pwr_p->pwr_flags & PCIE_NO_CHILD_PM) {
324 			if (!PCIE_IS_COMPS_COUNTED(cdip)) {
325 				PCIE_DBG("%s(%d): pcie_bus_power: marking "
326 				    "child busy to disable pm \n",
327 				    ddi_driver_name(dip),
328 				    ddi_get_instance(dip));
329 				(void) pm_busy_component(cdip, 0);
330 			}
331 			if (new_level < PM_LEVEL_D0 && !comp) {
332 				PCIE_DBG("%s(%d): pcie_bus_power: rejecting "
333 				    "child's attempt to go to %d\n",
334 				    ddi_driver_name(dip), ddi_get_instance(dip),
335 				    new_level);
336 				rv = DDI_FAILURE;
337 			}
338 		}
339 		mutex_exit(&pwr_p->pwr_lock);
340 		if (rv == DDI_SUCCESS)
341 			rv = pcie_pm_hold(dip);
342 		return (rv);
343 
344 	case BUS_POWER_HAS_CHANGED:
345 	case BUS_POWER_POST_NOTIFICATION:
346 		PCIE_DBG("%s(%d): pcie_bus_power: %s@%d op %s %d->%d\n",
347 		    ddi_driver_name(dip), ddi_get_instance(dip),
348 		    ddi_driver_name(cdip), ddi_get_instance(cdip),
349 		    pcie_decode_pwr_op(op), old_level, new_level);
350 		/*
351 		 * Child device power changed
352 		 * If pm components of this child aren't accounted for
353 		 * then add the components to the counters. This can't
354 		 * be done in POST_ATTACH ctlop as pm info isn't created
355 		 * by then. Also because a driver can make a pm call during
356 		 * the attach.
357 		 */
358 		if (!PCIE_IS_COMPS_COUNTED(cdip)) {
359 			(void) pcie_pm_add_child(dip, cdip);
360 			if ((pwr_p->pwr_flags & PCIE_NO_CHILD_PM) &&
361 			    (op == BUS_POWER_HAS_CHANGED)) {
362 				PCIE_DBG("%s(%d): pcie_bus_power: marking "
363 				    "child busy to disable pm \n",
364 				    ddi_driver_name(dip),
365 				    ddi_get_instance(dip));
366 				(void) pm_busy_component(cdip, 0);
367 				/*
368 				 * If the driver has already changed to lower
369 				 * power(pm_power_has_changed) on its own,
370 				 * there is nothing we can do other than
371 				 * logging the warning message on the console.
372 				 */
373 				if (new_level < PM_LEVEL_D0)
374 					cmn_err(CE_WARN, "!Downstream device "
375 					    "%s@%d went to non-D0 state: "
376 					    "possible loss of link\n",
377 					    ddi_driver_name(cdip),
378 					    ddi_get_instance(cdip));
379 			}
380 		}
381 
382 
383 		/*
384 		 * If it is POST and device PM is supported, release the
385 		 * hold done in PRE.
386 		 */
387 		if (op == BUS_POWER_POST_NOTIFICATION &&
388 		    PCIE_SUPPORTS_DEVICE_PM(dip)) {
389 			pcie_pm_subrelease(dip, pwr_p);
390 		}
391 
392 		if (*((int *)result) == DDI_FAILURE) {
393 			PCIE_DBG("%s(%d): pcie_bus_power: change for %s%d "
394 			    "failed\n", ddi_driver_name(dip),
395 			    ddi_get_instance(dip), ddi_driver_name(cdip),
396 			    ddi_get_instance(cdip));
397 			break;
398 		}
399 		/* Modify counters appropriately */
400 		pwr_update_counters(counters, old_level, new_level);
401 
402 		child_counters = PCIE_CHILD_COUNTERS(cdip);
403 		pwr_update_counters(child_counters, old_level, new_level);
404 
405 		/* If no device PM, return */
406 		if (!PCIE_SUPPORTS_DEVICE_PM(dip))
407 			break;
408 
409 		level_allowed = pwr_level_allowed(pwr_p);
410 		/*
411 		 * Check conditions for marking busy
412 		 * Check the flag to set this busy only once for multiple
413 		 * busy conditions. Mark busy if our current lowest possible
414 		 * is equal or greater to the current level.
415 		 */
416 		if (level_allowed >= pwr_p->pwr_func_lvl &&
417 		    !(pwr_p->pwr_flags & PCIE_PM_BUSY)) {
418 			PCIE_DBG("%s(%d): pcie_bus_power: marking busy\n",
419 			    ddi_driver_name(dip), ddi_get_instance(dip));
420 			(void) pm_busy_component(dip, 0);
421 			pwr_p->pwr_flags |= PCIE_PM_BUSY;
422 			break;
423 		}
424 		/*
425 		 * Check conditions for marking idle.
426 		 * If our lowest possible level is less than our current
427 		 * level mark idle. Mark idle only if it is not already done.
428 		 */
429 		if ((level_allowed < pwr_p->pwr_func_lvl) &&
430 		    (pwr_p->pwr_hold == 0) &&
431 		    (pwr_p->pwr_flags & PCIE_PM_BUSY)) {
432 			/*
433 			 * For pci express, we should check here whether
434 			 * the link is in L1 state or not.
435 			 */
436 			PCIE_DBG("%s(%d): pcie_bus_power: marking idle\n",
437 			    ddi_driver_name(dip), ddi_get_instance(dip));
438 			(void) pm_idle_component(dip, 0);
439 			pwr_p->pwr_flags &= ~PCIE_PM_BUSY;
440 			break;
441 		}
442 		break;
443 
444 	default:
445 		mutex_exit(&pwr_p->pwr_lock);
446 		return (pm_busop_bus_power(dip, impl_arg, op, arg, result));
447 	}
448 	mutex_exit(&pwr_p->pwr_lock);
449 	return (rv);
450 }
451 
452 /*
453  * Decrement the count of children at olevel by one and increment
454  * count of children at nlevel by one.
455  */
456 static void
457 pwr_update_counters(int *countersp, int olevel, int nlevel)
458 {
459 	uint32_t	index;
460 
461 	ASSERT(olevel >= PM_LEVEL_UNKNOWN && olevel <= PM_LEVEL_D0);
462 	ASSERT(nlevel >= PM_LEVEL_UNKNOWN && nlevel <= PM_LEVEL_D0);
463 
464 	index = (olevel == PM_LEVEL_UNKNOWN ? PCIE_UNKNOWN_INDEX : olevel);
465 	countersp[index]--;
466 	index = (nlevel == PM_LEVEL_UNKNOWN ? PCIE_UNKNOWN_INDEX : nlevel);
467 	countersp[index]++;
468 }
469 
470 /*
471  * Returns the lowest possible power level allowed for nexus
472  * based on children's power level. Lowest possible level is
473  * equal to the highest level among the children. It also checks
474  * for the supported level
475  * UNKNOWN = D0 > D1 > D2 > D3
476  */
477 static int
478 pwr_level_allowed(pcie_pwr_t *pwr_p)
479 {
480 	int *counters = pwr_p->pwr_counters;
481 	int i, j;
482 
483 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
484 	/*
485 	 * Search from UNKNOWN to D2. unknown is same as D0.
486 	 * find the highest level among the children. If that
487 	 * level is supported, return that level. If not,
488 	 * find the next higher supported level and return that
489 	 * level. For example, if the D1 is the highest among
490 	 * children and if D1 isn't supported return D0 as the
491 	 * lowest possible level. We don't need to look at D3
492 	 * as that is the default lowest level and it is always
493 	 * supported.
494 	 */
495 	for (i = PCIE_UNKNOWN_INDEX; i > 0; i--) {
496 		if (counters[i]) {
497 			if (i == PCIE_UNKNOWN_INDEX)
498 				return (PM_LEVEL_D0);
499 			/*
500 			 * i is the highest level among children. If this is
501 			 * supported, return i.
502 			 */
503 			if (PCIE_LEVEL_SUPPORTED(pwr_p->pwr_pmcaps, i))
504 				return (i);
505 			/* find the next higher supported level */
506 			for (j = i + 1; j <= PCIE_D0_INDEX; j++) {
507 				if (PCIE_LEVEL_SUPPORTED(pwr_p->pwr_pmcaps, j))
508 					return (j);
509 			}
510 		}
511 	}
512 
513 	return (PM_LEVEL_D3);
514 }
515 
516 /*
517  * Update the counters with number pm components of the child
518  * all components are assumed to be at UNKNOWN level.
519  */
520 static void
521 pcie_add_comps(dev_info_t *dip, dev_info_t *cdip, pcie_pwr_t *pwr_p)
522 {
523 	int comps = PM_NUMCMPTS(cdip);
524 	pcie_pm_t *pcie_pm_p;
525 	pcie_pwr_child_t *cpwr_p;
526 
527 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
528 	if (!comps)
529 		return;
530 
531 	PCIE_DBG("%s(%d): pcie_add_comps: unknown level counter incremented "
532 	    "from %d by %d because of %s@%d\n",
533 	    ddi_driver_name(dip), ddi_get_instance(dip),
534 	    (pwr_p->pwr_counters)[PCIE_UNKNOWN_INDEX], comps,
535 	    ddi_driver_name(cdip), ddi_get_instance(cdip));
536 	(pwr_p->pwr_counters)[PCIE_UNKNOWN_INDEX] += comps;
537 	/*
538 	 * Allocate counters per child. This is a part of pcie
539 	 * pm info. If there is no pcie pm info, allocate it here.
540 	 * pcie pm info might already be there for pci express nexus
541 	 * driver e.g. pcieb. For all leaf nodes, it is allocated here.
542 	 */
543 	if ((pcie_pm_p = PCIE_PMINFO(cdip)) == NULL) {
544 		pcie_pm_p = (pcie_pm_t *)kmem_zalloc(
545 		    sizeof (pcie_pm_t), KM_SLEEP);
546 		PCIE_SET_PMINFO(cdip, pcie_pm_p);
547 	}
548 	cpwr_p = (pcie_pwr_child_t *)kmem_zalloc(sizeof (pcie_pwr_child_t),
549 	    KM_SLEEP);
550 	pcie_pm_p->pcie_par_pminfo = cpwr_p;
551 	(cpwr_p->pwr_child_counters)[PCIE_UNKNOWN_INDEX] += comps;
552 }
553 
554 /*
555  * Remove the pm components of a child from our counters.
556  */
557 static void
558 pcie_remove_comps(dev_info_t *dip, dev_info_t *cdip, pcie_pwr_t *pwr_p)
559 {
560 	int i;
561 	int *child_counters;
562 
563 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
564 	if (!(PCIE_PMINFO(cdip)) || !PCIE_PAR_PMINFO(cdip)) {
565 		if (PCIE_SUPPORTS_DEVICE_PM(dip)) {
566 			/*
567 			 * Driver never made a PM call and we didn't create
568 			 * any counters for this device. This also means that
569 			 * hold made at the PRE_ATTACH time, still remains.
570 			 * Remove the hold now. The correct thing to do is to
571 			 * stay at full power when a child is at full power
572 			 * whether a driver is there or not. This will be
573 			 * implemented in the future.
574 			 */
575 			pcie_pm_subrelease(dip, pwr_p);
576 		}
577 		return;
578 	}
579 	PCIE_DBG("%s(%d): pcie_remove_comps:counters decremented because of "
580 	    "%s@%d\n", ddi_driver_name(dip), ddi_get_instance(dip),
581 	    ddi_driver_name(cdip), ddi_get_instance(cdip));
582 	child_counters = PCIE_CHILD_COUNTERS(cdip);
583 	/*
584 	 * Adjust the nexus counters. No need to adjust per child dip
585 	 * counters as we are freeing the per child dip info.
586 	 */
587 	for (i = 0; i < PCIE_MAX_PWR_LEVELS; i++) {
588 		ASSERT((pwr_p->pwr_counters)[i] >= child_counters[i]);
589 		(pwr_p->pwr_counters)[i] -= child_counters[i];
590 	}
591 	/* remove both parent pm info and pcie pminfo itself */
592 	kmem_free(PCIE_PAR_PMINFO(cdip), sizeof (pcie_pwr_child_t));
593 	kmem_free(PCIE_PMINFO(cdip), sizeof (pcie_pm_t));
594 	PCIE_RESET_PMINFO(cdip);
595 }
596 
597 /*
598  * Power management related initialization common to px and pcieb
599  */
600 int
601 pwr_common_setup(dev_info_t *dip)
602 {
603 	pcie_pm_t		*pcie_pm_p;
604 	pcie_pwr_t		*pwr_p;
605 	int			pminfo_created = 0;
606 
607 	/* Create pminfo, if it doesn't exist already */
608 	if ((pcie_pm_p = PCIE_PMINFO(dip)) == NULL) {
609 		pcie_pm_p = (pcie_pm_t *)kmem_zalloc(
610 		    sizeof (pcie_pm_t), KM_SLEEP);
611 		PCIE_SET_PMINFO(dip, pcie_pm_p);
612 		pminfo_created = 1;
613 	}
614 	pwr_p = (pcie_pwr_t *)kmem_zalloc(sizeof (pcie_pwr_t), KM_SLEEP);
615 	mutex_init(&pwr_p->pwr_lock, NULL, MUTEX_DRIVER, NULL);
616 	/* Initialize the power level and default level support */
617 	pwr_p->pwr_func_lvl = PM_LEVEL_UNKNOWN;
618 	pwr_p->pwr_pmcaps = PCIE_DEFAULT_LEVEL_SUPPORTED;
619 
620 	if (pcie_plat_pwr_setup(dip) != DDI_SUCCESS)
621 		goto pwr_common_err;
622 
623 	pcie_pm_p->pcie_pwr_p = pwr_p;
624 	return (DDI_SUCCESS);
625 
626 pwr_common_err:
627 	mutex_destroy(&pwr_p->pwr_lock);
628 	kmem_free(pwr_p, sizeof (pcie_pwr_t));
629 	if (pminfo_created) {
630 		PCIE_RESET_PMINFO(dip);
631 		kmem_free(pcie_pm_p, sizeof (pcie_pm_t));
632 	}
633 	return (DDI_FAILURE);
634 
635 }
636 
637 /*
638  * Undo whatever is done in pwr_common_setup. Called by px_detach or pxb_detach
639  */
640 void
641 pwr_common_teardown(dev_info_t *dip)
642 {
643 	pcie_pm_t *pcie_pm_p = PCIE_PMINFO(dip);
644 	pcie_pwr_t *pwr_p;
645 
646 	if (!pcie_pm_p || !(pwr_p = PCIE_NEXUS_PMINFO(dip)))
647 		return;
648 
649 	pcie_plat_pwr_teardown(dip);
650 	mutex_destroy(&pwr_p->pwr_lock);
651 	pcie_pm_p->pcie_pwr_p = NULL;
652 	kmem_free(pwr_p, sizeof (pcie_pwr_t));
653 	/*
654 	 * If the parent didn't store have any pm info about
655 	 * this node, that means parent doesn't need pminfo when it handles
656 	 * POST_DETACH for this node. For example, if dip is the dip of
657 	 * root complex, then there is no parent pm info.
658 	 */
659 	if (!PCIE_PAR_PMINFO(dip)) {
660 		kmem_free(pcie_pm_p, sizeof (pcie_pm_t));
661 		PCIE_RESET_PMINFO(dip);
662 	}
663 }
664 
665 /*
666  * Raises the power and marks itself busy.
667  */
668 int
669 pcie_pm_hold(dev_info_t *dip)
670 {
671 	pcie_pwr_t *pwr_p;
672 
673 	/* If no PM info or no device PM, return */
674 	if (!PCIE_PMINFO(dip) || !(pwr_p = PCIE_NEXUS_PMINFO(dip)) ||
675 	    !(PCIE_SUPPORTS_DEVICE_PM(dip)))
676 		return (DDI_SUCCESS);
677 
678 	/*
679 	 * If we are not at full power, then powerup.
680 	 * Need to be at full power so that link can be
681 	 * at L0. Similarly for PCI/PCI-X bus, it should be
682 	 * at full power.
683 	 */
684 	mutex_enter(&pwr_p->pwr_lock);
685 	ASSERT(pwr_p->pwr_hold >= 0);
686 	PCIE_DBG("%s(%d): pm_hold: incrementing hold \n",
687 	    ddi_driver_name(dip), ddi_get_instance(dip));
688 	pwr_p->pwr_hold++;
689 	/* Mark itself busy, if it is not done already */
690 	if (!(pwr_p->pwr_flags & PCIE_PM_BUSY)) {
691 		PCIE_DBG("%s(%d): pm_hold: marking busy\n",
692 		    ddi_driver_name(dip), ddi_get_instance(dip));
693 		pwr_p->pwr_flags |= PCIE_PM_BUSY;
694 		(void) pm_busy_component(dip, 0);
695 	}
696 	if (pwr_p->pwr_func_lvl == PM_LEVEL_D0) {
697 		mutex_exit(&pwr_p->pwr_lock);
698 		return (DDI_SUCCESS);
699 	}
700 	mutex_exit(&pwr_p->pwr_lock);
701 	if (pm_raise_power(dip, 0, PM_LEVEL_D0) != DDI_SUCCESS) {
702 		PCIE_DBG("%s(%d): pm_hold: attempt to raise power "
703 		    "from %d to %d failed\n", ddi_driver_name(dip),
704 		    ddi_get_instance(dip), pwr_p->pwr_func_lvl,
705 		    PM_LEVEL_D0);
706 		pcie_pm_release(dip);
707 		return (DDI_FAILURE);
708 	}
709 	return (DDI_SUCCESS);
710 }
711 
712 /*
713  * Reverse the things done in pcie_pm_hold
714  */
715 void
716 pcie_pm_release(dev_info_t *dip)
717 {
718 	pcie_pwr_t *pwr_p;
719 
720 	/* If no PM info or no device PM, return */
721 	if (!PCIE_PMINFO(dip) || !(pwr_p = PCIE_NEXUS_PMINFO(dip)) ||
722 	    !(PCIE_SUPPORTS_DEVICE_PM(dip)))
723 		return;
724 
725 	mutex_enter(&pwr_p->pwr_lock);
726 	pcie_pm_subrelease(dip, pwr_p);
727 	mutex_exit(&pwr_p->pwr_lock);
728 }
729 
730 static void
731 pcie_pm_subrelease(dev_info_t *dip, pcie_pwr_t *pwr_p)
732 {
733 	int level;
734 
735 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
736 	ASSERT(pwr_p->pwr_hold > 0);
737 	PCIE_DBG("%s(%d): pm_subrelease: decrementing hold \n",
738 	    ddi_driver_name(dip), ddi_get_instance(dip));
739 	pwr_p->pwr_hold--;
740 	ASSERT(pwr_p->pwr_hold >= 0);
741 	ASSERT(pwr_p->pwr_flags & PCIE_PM_BUSY);
742 	level = pwr_level_allowed(pwr_p);
743 	if (pwr_p->pwr_hold == 0 && level < pwr_p->pwr_func_lvl) {
744 		PCIE_DBG("%s(%d): pm_subrelease: marking idle \n",
745 		    ddi_driver_name(dip), ddi_get_instance(dip));
746 		(void) pm_idle_component(dip, 0);
747 		pwr_p->pwr_flags &= ~PCIE_PM_BUSY;
748 	}
749 }
750 
751 /*
752  * Called when the child makes the first power management call.
753  * sets up the counters. All the components of the child device are
754  * assumed to be at unknown level. It also releases the power hold
755  * 	pwr_p - parent's pwr_t
756  *	cdip   - child's dip
757  */
758 int
759 pcie_pm_add_child(dev_info_t *dip, dev_info_t *cdip)
760 {
761 	pcie_pwr_t *pwr_p;
762 
763 	/* If no PM info, return */
764 	if (!PCIE_PMINFO(dip) || !(pwr_p = PCIE_NEXUS_PMINFO(dip)))
765 		return (DDI_SUCCESS);
766 
767 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
768 	ASSERT(pwr_p->pwr_func_lvl == PM_LEVEL_D0);
769 	pcie_add_comps(dip, cdip, pwr_p);
770 
771 	/* If no device power management then return */
772 	if (!PCIE_SUPPORTS_DEVICE_PM(dip))
773 		return (DDI_SUCCESS);
774 
775 	/*
776 	 * We have informed PM that we are busy at PRE_ATTACH time for
777 	 * this child. Release the hold and but don't clear the busy bit.
778 	 * If a device never changes power, hold will not be released
779 	 * and we stay at full power.
780 	 */
781 	ASSERT(pwr_p->pwr_hold > 0);
782 	PCIE_DBG("%s(%d): pm_add_child: decrementing hold \n",
783 	    ddi_driver_name(dip), ddi_get_instance(dip));
784 	pwr_p->pwr_hold--;
785 	/*
786 	 * We must have made sure that busy bit
787 	 * is set when we put the hold
788 	 */
789 	ASSERT(pwr_p->pwr_flags & PCIE_PM_BUSY);
790 	return (DDI_SUCCESS);
791 }
792 
793 /*
794  * Adjust the counters when a child detaches
795  * Marks itself idle if the idle conditions are met.
796  * Called at POST_DETACH time
797  */
798 int
799 pcie_pm_remove_child(dev_info_t *dip, dev_info_t *cdip)
800 {
801 	int *counters;
802 	int total;
803 	pcie_pwr_t *pwr_p;
804 
805 	/* If no PM info, return */
806 	if (!PCIE_PMINFO(dip) || !(pwr_p = PCIE_NEXUS_PMINFO(dip)))
807 		return (DDI_SUCCESS);
808 
809 	counters = pwr_p->pwr_counters;
810 	mutex_enter(&pwr_p->pwr_lock);
811 	pcie_remove_comps(dip, cdip, pwr_p);
812 	/* If no device power management then return */
813 	if (!PCIE_SUPPORTS_DEVICE_PM(dip)) {
814 		mutex_exit(&pwr_p->pwr_lock);
815 		return (DDI_SUCCESS);
816 	}
817 	total = (counters[PCIE_D0_INDEX] + counters[PCIE_UNKNOWN_INDEX] +
818 	    counters[PCIE_D1_INDEX] + counters[PCIE_D2_INDEX] +
819 	    counters[PCIE_D3_INDEX]);
820 	/*
821 	 * Mark idle if either there are no children or our lowest
822 	 * possible level is less than the current level. Mark idle
823 	 * only if it is not already done.
824 	 */
825 	if ((pwr_p->pwr_hold == 0) &&
826 	    (!total || (pwr_level_allowed(pwr_p) < pwr_p->pwr_func_lvl))) {
827 		if (pwr_p->pwr_flags & PCIE_PM_BUSY) {
828 			PCIE_DBG("%s(%d): pcie_bus_power: marking idle\n",
829 			    ddi_driver_name(dip), ddi_get_instance(dip));
830 			(void) pm_idle_component(dip, 0);
831 			pwr_p->pwr_flags &= ~PCIE_PM_BUSY;
832 		}
833 	}
834 	mutex_exit(&pwr_p->pwr_lock);
835 	return (DDI_SUCCESS);
836 }
837 
838 boolean_t
839 pcie_is_pcie(dev_info_t *dip)
840 {
841 	pcie_bus_t *bus_p = PCIE_DIP2BUS(dip);
842 	ASSERT(bus_p);
843 	return (bus_p->bus_pcie_off != 0);
844 }
845 
846 /*
847  * Called by px_attach or pcieb_attach:: DDI_RESUME
848  */
849 int
850 pcie_pwr_resume(dev_info_t *dip)
851 {
852 	dev_info_t *cdip;
853 	pcie_pwr_t *pwr_p = NULL;
854 
855 #if defined(__i386) || defined(__amd64)
856 	if (dip)
857 		return (DDI_SUCCESS);
858 #endif /* defined(__i386) || defined(__amd64) */
859 
860 	if (PCIE_PMINFO(dip))
861 		pwr_p = PCIE_NEXUS_PMINFO(dip);
862 
863 	if (pwr_p) {
864 		/* Inform the PM framework that dip is at full power */
865 		if (PCIE_SUPPORTS_DEVICE_PM(dip)) {
866 			ASSERT(pwr_p->pwr_func_lvl == PM_LEVEL_D0);
867 			(void) pm_raise_power(dip, 0,
868 			    pwr_p->pwr_func_lvl);
869 		}
870 	}
871 
872 	/*
873 	 * Code taken from pci driver.
874 	 * Restore config registers for children that did not save
875 	 * their own registers.  Children pwr states are UNKNOWN after
876 	 * a resume since it is possible for the PM framework to call
877 	 * resume without an actual power cycle. (ie if suspend fails).
878 	 */
879 	for (cdip = ddi_get_child(dip); cdip != NULL;
880 	    cdip = ddi_get_next_sibling(cdip)) {
881 		boolean_t	is_pcie;
882 
883 		/*
884 		 * Not interested in children who are not already
885 		 * init'ed.  They will be set up by init_child().
886 		 */
887 		if (i_ddi_node_state(cdip) < DS_INITIALIZED) {
888 			PCIE_DBG("%s(%d): "
889 			    "DDI_RESUME: skipping %s%d not in CF1\n",
890 			    ddi_driver_name(dip), ddi_get_instance(dip),
891 			    ddi_driver_name(cdip), ddi_get_instance(cdip));
892 			continue;
893 		}
894 
895 		/*
896 		 * Only restore config registers if saved by nexus.
897 		 */
898 		if (ddi_prop_exists(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
899 		    "nexus-saved-config-regs") != 1)
900 			continue;
901 
902 		PCIE_DBG("%s(%d): "
903 		    "DDI_RESUME: nexus restoring %s%d config regs\n",
904 		    ddi_driver_name(dip), ddi_get_instance(dip),
905 		    ddi_driver_name(cdip), ddi_get_instance(cdip));
906 
907 		/* clear errors left by OBP scrubbing */
908 		pcie_clear_errors(cdip);
909 
910 		/* PCIe workaround: disable errors during 4K config resore */
911 		if (is_pcie = pcie_is_pcie(cdip))
912 			pcie_disable_errors(cdip);
913 		(void) pci_restore_config_regs(cdip);
914 		if (is_pcie) {
915 			pcie_enable_errors(cdip);
916 			(void) pcie_enable_ce(cdip);
917 		}
918 
919 		if (ndi_prop_remove(DDI_DEV_T_NONE, cdip,
920 		    "nexus-saved-config-regs") != DDI_PROP_SUCCESS) {
921 			PCIE_DBG("%s(%d): %s%d can't remove prop %s",
922 			    ddi_driver_name(dip), ddi_get_instance(dip),
923 			    ddi_driver_name(cdip), ddi_get_instance(cdip),
924 			    "nexus-saved-config-regs");
925 		}
926 	}
927 	return (DDI_SUCCESS);
928 }
929 
930 /*
931  * Called by pcie_detach or pcieb_detach:: DDI_SUSPEND
932  */
933 int
934 pcie_pwr_suspend(dev_info_t *dip)
935 {
936 	dev_info_t *cdip;
937 	int i, *counters; /* per nexus counters */
938 	int *child_counters = NULL; /* per child dip counters */
939 	pcie_pwr_t *pwr_p = NULL;
940 
941 #if defined(__i386) || defined(__amd64)
942 	if (dip)
943 		return (DDI_SUCCESS);
944 #endif /* defined(__i386) || defined(__amd64) */
945 
946 	if (PCIE_PMINFO(dip))
947 		pwr_p = PCIE_NEXUS_PMINFO(dip);
948 
949 	/*
950 	 * Mark all children to be unknown and bring our power level
951 	 * to full, if required. This is to avoid any panics while
952 	 * accessing the child's config space.
953 	 */
954 	if (pwr_p) {
955 		mutex_enter(&pwr_p->pwr_lock);
956 		if (PCIE_SUPPORTS_DEVICE_PM(dip) &&
957 		    pwr_p->pwr_func_lvl != PM_LEVEL_D0) {
958 			mutex_exit(&pwr_p->pwr_lock);
959 			if (pm_raise_power(dip, 0, PM_LEVEL_D0) !=
960 			    DDI_SUCCESS) {
961 				PCIE_DBG("%s(%d): pwr_suspend: attempt "
962 				    "to raise power from %d to %d "
963 				    "failed\n", ddi_driver_name(dip),
964 				    ddi_get_instance(dip), pwr_p->pwr_func_lvl,
965 				    PM_LEVEL_D0);
966 				return (DDI_FAILURE);
967 			}
968 			mutex_enter(&pwr_p->pwr_lock);
969 		}
970 		counters = pwr_p->pwr_counters;
971 		/*
972 		 * Update the nexus counters. At the resume time all
973 		 * components are considered to be at unknown level. Use the
974 		 * fact that counters for unknown level are at the end.
975 		 */
976 		for (i = 0; i < PCIE_UNKNOWN_INDEX; i++) {
977 			counters[PCIE_UNKNOWN_INDEX] += counters[i];
978 			counters[i] = 0;
979 		}
980 		mutex_exit(&pwr_p->pwr_lock);
981 	}
982 
983 	/*
984 	 * Code taken from pci driver.
985 	 * Save the state of the configuration headers of child
986 	 * nodes.
987 	 */
988 	for (cdip = ddi_get_child(dip); cdip != NULL;
989 	    cdip = ddi_get_next_sibling(cdip)) {
990 		boolean_t	is_pcie;
991 
992 		/*
993 		 * Not interested in children who are not already
994 		 * init'ed.  They will be set up in init_child().
995 		 */
996 		if (i_ddi_node_state(cdip) < DS_INITIALIZED) {
997 			PCIE_DBG("%s(%d): DDI_SUSPEND: skipping "
998 			    "%s%d not in CF1\n", ddi_driver_name(dip),
999 			    ddi_get_instance(dip), ddi_driver_name(cdip),
1000 			    ddi_get_instance(cdip));
1001 			continue;
1002 		}
1003 		/*
1004 		 * Update per child dip counters, if any. Counters
1005 		 * will not exist if the child is not power manageable
1006 		 * or if its power entry is never invoked.
1007 		 */
1008 		if (PCIE_PMINFO(cdip) && PCIE_PAR_PMINFO(cdip))
1009 			child_counters = PCIE_CHILD_COUNTERS(cdip);
1010 		if (child_counters && pwr_p) {
1011 			mutex_enter(&pwr_p->pwr_lock);
1012 			for (i = 0; i < PCIE_UNKNOWN_INDEX; i++) {
1013 				child_counters[PCIE_UNKNOWN_INDEX] +=
1014 				    child_counters[i];
1015 				child_counters[i] = 0;
1016 			}
1017 			mutex_exit(&pwr_p->pwr_lock);
1018 		}
1019 
1020 		/*
1021 		 * Only save config registers if not already saved by child.
1022 		 */
1023 		if (ddi_prop_exists(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
1024 		    SAVED_CONFIG_REGS) == 1) {
1025 			continue;
1026 		}
1027 
1028 		/*
1029 		 * The nexus needs to save config registers.  Create a property
1030 		 * so it knows to restore on resume.
1031 		 */
1032 		if (ndi_prop_create_boolean(DDI_DEV_T_NONE, cdip,
1033 		    "nexus-saved-config-regs") != DDI_PROP_SUCCESS) {
1034 			PCIE_DBG("%s(%d): %s%d can't update prop %s",
1035 			    ddi_driver_name(dip), ddi_get_instance(dip),
1036 			    ddi_driver_name(cdip), ddi_get_instance(cdip),
1037 			    "nexus-saved-config-regs");
1038 		}
1039 		PCIE_DBG("%s(%d): DDI_SUSPEND: saving config space for"
1040 		    " %s%d\n", ddi_driver_name(dip), ddi_get_instance(dip),
1041 		    ddi_driver_name(cdip), ddi_get_instance(cdip));
1042 
1043 		/* PCIe workaround: disable errors during 4K config save */
1044 		if (is_pcie = pcie_is_pcie(cdip))
1045 			pcie_disable_errors(cdip);
1046 		(void) pci_save_config_regs(cdip);
1047 		if (is_pcie) {
1048 			pcie_enable_errors(cdip);
1049 			(void) pcie_enable_ce(cdip);
1050 		}
1051 	}
1052 	return (DDI_SUCCESS);
1053 }
1054 
1055 #ifdef DEBUG
1056 /*
1057  * Description of bus_power_op.
1058  */
1059 typedef struct pcie_buspwr_desc {
1060 	pm_bus_power_op_t pwr_op;
1061 	char *pwr_desc;
1062 } pcie_buspwr_desc_t;
1063 
1064 static pcie_buspwr_desc_t pcie_buspwr_desc[] = {
1065 	{BUS_POWER_CHILD_PWRCHG, "CHILD_PWRCHG"},
1066 	{BUS_POWER_NEXUS_PWRUP, "NEXUS_PWRUP"},
1067 	{BUS_POWER_PRE_NOTIFICATION, "PRE_NOTIFICATION"},
1068 	{BUS_POWER_POST_NOTIFICATION, "POST_NOTIFICATION"},
1069 	{BUS_POWER_HAS_CHANGED, "HAS_CHANGED"},
1070 	{BUS_POWER_NOINVOL, "NOINVOL"},
1071 	{-1, NULL}
1072 };
1073 
1074 /*
1075  * Returns description of the bus_power_op.
1076  */
1077 static char *
1078 pcie_decode_pwr_op(pm_bus_power_op_t op)
1079 {
1080 	pcie_buspwr_desc_t *descp = pcie_buspwr_desc;
1081 
1082 	for (; descp->pwr_desc; descp++) {
1083 		if (op == descp->pwr_op)
1084 			return (descp->pwr_desc);
1085 	}
1086 	return ("UNKNOWN OP");
1087 }
1088 #endif
1089