xref: /illumos-gate/usr/src/uts/common/io/pciex/pcie_pwr.c (revision 4eaa471005973e11a6110b69fe990530b3b95a38)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/ddi.h>
28 #include <sys/kmem.h>
29 #include <sys/sysmacros.h>
30 #include <sys/sunddi.h>
31 #include <sys/sunpm.h>
32 #include <sys/epm.h>
33 #include <sys/sunndi.h>
34 #include <sys/ddi_impldefs.h>
35 #include <sys/ddi_implfuncs.h>
36 #include <sys/pcie.h>
37 #include <sys/pcie_impl.h>
38 #include <sys/promif.h>		/* prom_printf */
39 #include <sys/pcie_pwr.h>
40 
#ifdef DEBUG

/*
 * Debug builds: DBG() is an alias for pcie_pwr_dbg(), whose output is
 * gated by the pcie_pwr_print tunable.
 */
static uint_t pcie_pwr_print = 0;
static void pcie_pwr_dbg(dev_info_t *dip, char *fmt, ...);
#define	DBG pcie_pwr_dbg

#else /* DEBUG */

/*
 * Non-debug builds: "DBG(args)" expands to "0 && (args)", so the
 * argument list is still parsed but is never evaluated.
 */
#define	DBG 0 &&

#endif /* DEBUG */
52 
53 /*
54  * This file implements the power management functionality for
55  * pci express switch and pci express-to-pci/pci-x bridge. All the
56  * code in this file is generic and is not specific to a particular chip.
57  * The algorithm, which decides when to go to a lower power is explained
58  * below:
59  *
60  *	1. Initially when no children are attached, the driver is idle from
61  *	PM framework point of view ( PM idle/PM busy).
62  *
63  *	2. Driver is PM busy if either a reference count called pwr_hold is
64  *	greater than zero or driver is already at the lowest possible power
65  *	level. The lowest possible power level for the driver is equal to the
66  *	highest power level among its children. The PM busy condition is
67  *	indicated by PCIE_PM_BUSY bit. At any point, only one pm_busy_component
68  *	call is made for a nexus driver instance.
69  *
70  *	3. Driver is PM idle if the pwr_hold is zero and the lowest
71  *	possible power level is less than the driver's current power level.
72  *	At any point, only one pm_idle_component call is made for a nexus
73  *	driver instance.
74  *
 *	4. For any event like child attach, it increments pwr_hold and marks
 *	itself busy, if it has not already done so. This temporary hold is
 *	removed when the event is complete.
78  *
79  *	5. Any child's power change requires the parent (this driver) to be
80  *	full power. So it raises its power and increments pwr_hold. It also
81  *	marks itself temporarily busy, if it is not already done. This hold
82  *	is removed when the child power change is complete.
83  *
84  *	6. After each child power change, it evaluates what is the lowest
85  *	possible power level. If the lowest possible power level is less than
86  *	the current power level and pwr_hold is zero, then it marks itself
87  *	idle. The lowest power level is equal or greater than the highest level
88  *	among the children. It keeps track of children's power level by
89  *	using counters.
90  *
91  *	7. Any code e.g., which is accessing the driver's own registers should
92  *	place a temporary hold using pcie_pm_hold.
93  */
94 
95 static int pcie_pwr_change(dev_info_t *dip, pcie_pwr_t *pwr_p, int new);
96 static void pwr_update_counters(int *countersp, int olevel, int nlevel);
97 static int pwr_level_allowed(pcie_pwr_t *pwr_p);
98 static void pcie_add_comps(dev_info_t *dip, dev_info_t *cdip,
99     pcie_pwr_t *pwr_p);
100 static void pcie_remove_comps(dev_info_t *dip, dev_info_t *cdip,
101     pcie_pwr_t *pwr_p);
102 static void pcie_pm_subrelease(dev_info_t *dip, pcie_pwr_t *pwr_p);
103 static boolean_t pcie_is_pcie(dev_info_t *dip);
104 #ifdef DEBUG
105 static char *pcie_decode_pwr_op(pm_bus_power_op_t op);
106 #else
107 #define	pcie_decode_pwr_op
108 #endif
109 
110 /*
111  * power entry point.
112  *
113  * This function decides whether the PM request is honorable.
114  * If yes, it then does what's necessary for switch or
115  *    bridge to change its power.
116  */
117 /* ARGSUSED */
118 int
119 pcie_power(dev_info_t *dip, int component, int level)
120 {
121 	pcie_pwr_t *pwr_p = PCIE_NEXUS_PMINFO(dip);
122 	int *counters = pwr_p->pwr_counters;
123 	int pmcaps = pwr_p->pwr_pmcaps;
124 	int ret = DDI_FAILURE;
125 
126 #if defined(__i386) || defined(__amd64)
127 	if (dip)
128 		return (DDI_SUCCESS);
129 #endif /* defined(__i386) || defined(__amd64) */
130 
131 	ASSERT(level != PM_LEVEL_UNKNOWN);
132 	/* PM should not asking for a level, which is unsupported */
133 	ASSERT(level == PM_LEVEL_D0 || level == PM_LEVEL_D3 ||
134 	    (level == PM_LEVEL_D1 && (pmcaps & PCIE_SUPPORTS_D1)) ||
135 	    (level == PM_LEVEL_D2 && (pmcaps & PCIE_SUPPORTS_D2)));
136 
137 	mutex_enter(&pwr_p->pwr_lock);
138 	DBG(dip, "pcie_power: change from %d to %d\n",
139 	    pwr_p->pwr_func_lvl, level);
140 	if (pwr_p->pwr_func_lvl == level) {
141 		DBG(dip, "pcie_power: already at %d\n", level);
142 		ret = DDI_SUCCESS;
143 		goto pcie_pwr_done;
144 	}
145 
146 	if (level < pwr_p->pwr_func_lvl) {
147 		/*
148 		 * Going to lower power. Reject this if we are either busy
149 		 * or there is a hold.
150 		 */
151 		if (pwr_p->pwr_flags & PCIE_PM_BUSY) {
152 			DBG(dip, "pcie_power: rejecting change to %d "
153 			    "as busy\n", level);
154 			goto pcie_pwr_done;
155 		}
156 
157 		/*
158 		 * Now we know that we are neither busy nor there is a hold.
159 		 * At this point none of the children should be at full power.
160 		 * Reject the request if level reqested is lower than the level
161 		 * possible.
162 		 */
163 		ASSERT(!counters[PCIE_D0_INDEX] &&
164 		    !counters[PCIE_UNKNOWN_INDEX]);
165 		if (level < pwr_level_allowed(pwr_p)) {
166 			DBG(dip, "pcie_power: rejecting level %d as"
167 			    " %d is the lowest possible\n", level,
168 			    pwr_level_allowed(pwr_p));
169 			goto pcie_pwr_done;
170 		}
171 	}
172 
173 	if (pcie_pwr_change(dip, pwr_p, level) != DDI_SUCCESS) {
174 		DBG(dip, "pcie_power: attempt to change to %d "
175 		    " failed \n", level);
176 		goto pcie_pwr_done;
177 	}
178 	pwr_p->pwr_func_lvl = level;
179 	DBG(dip, "pcie_power: level changed to %d \n", level);
180 	ret = DDI_SUCCESS;
181 
182 pcie_pwr_done:
183 	mutex_exit(&pwr_p->pwr_lock);
184 	return (ret);
185 }
186 
187 /*
188  * Called by pcie_power() only. Caller holds the pwr_lock.
189  *
190  * dip - dev_info pointer
191  * pwr_p - pm info for the node.
192  * new     - new level
193  */
194 static int
195 pcie_pwr_change(dev_info_t *dip, pcie_pwr_t *pwr_p, int new)
196 {
197 	uint16_t pmcsr;
198 
199 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
200 	ASSERT(new != pwr_p->pwr_func_lvl);
201 	pmcsr = pci_config_get16(pwr_p->pwr_conf_hdl, pwr_p->pwr_pmcsr_offset);
202 	pmcsr &= ~PCI_PMCSR_STATE_MASK;
203 	switch (new) {
204 	case PM_LEVEL_D0:
205 		pmcsr |= PCI_PMCSR_D0;
206 		break;
207 
208 	case PM_LEVEL_D1:
209 		pmcsr |= PCI_PMCSR_D1;
210 		break;
211 
212 	case PM_LEVEL_D2:
213 		pmcsr |= PCI_PMCSR_D2;
214 		break;
215 
216 	case PM_LEVEL_D3:
217 		pmcsr |= PCI_PMCSR_D3HOT;
218 		break;
219 
220 	default:
221 		ASSERT(0);
222 		break;
223 	}
224 	/* Save config space, if going to D3 */
225 	if (new == PM_LEVEL_D3) {
226 		DBG(dip, "pwr_change: saving config space regs\n");
227 		if (pci_save_config_regs(dip) != DDI_SUCCESS) {
228 			DBG(dip, "pcie_pwr_change: failed to save "
229 			    "config space regs\n");
230 			return (DDI_FAILURE);
231 		}
232 	}
233 
234 	pci_config_put16(pwr_p->pwr_conf_hdl, pwr_p->pwr_pmcsr_offset, pmcsr);
235 
236 	/*
237 	 * TBD: Taken from pci_pci driver. Is this required?
238 	 * No bus transactions should occur without waiting for
239 	 * settle time specified in PCI PM spec rev 2.1 sec 5.6.1
240 	 * To make things simple, just use the max time specified for
241 	 * all state transitions.
242 	 */
243 	delay(drv_usectohz(PCI_CLK_SETTLE_TIME));
244 
245 	/*
246 	 * Restore config space if coming out of D3
247 	 */
248 	if (pwr_p->pwr_func_lvl == PM_LEVEL_D3) {
249 		DBG(dip, "pcie_pwr_change: restoring config space\n");
250 		if (pci_restore_config_regs(dip) != DDI_SUCCESS) {
251 			DBG(dip, "pcie_pwr_change: failed to restore "
252 			    "config space regs\n");
253 			return (DDI_FAILURE);
254 		}
255 	}
256 	return (DDI_SUCCESS);
257 }
258 
259 /*
260  * bus_ctlops.bus_power function.
261  *
262  * This function handles PRE_ POST_ change notifications, sent by
263  * PM framework related to child's power level change. It marks itself
264  * idle or busy based on the children's power level.
265  */
266 int
267 pcie_bus_power(dev_info_t *dip, void *impl_arg, pm_bus_power_op_t op,
268     void *arg, void *result)
269 {
270 	pcie_pwr_t *pwr_p = PCIE_NEXUS_PMINFO(dip);
271 	int *counters = pwr_p->pwr_counters; /* nexus counters */
272 	int *child_counters; /* per child dip counters */
273 	pm_bp_child_pwrchg_t *bpc;
274 	pm_bp_has_changed_t *bphc;
275 	dev_info_t *cdip;
276 	int new_level;
277 	int old_level;
278 	int rv = DDI_SUCCESS;
279 	int level_allowed, comp;
280 
281 #if defined(__i386) || defined(__amd64)
282 	if (dip)
283 		return (DDI_SUCCESS);
284 #endif /* defined(__i386) || defined(__amd64) */
285 
286 	switch (op) {
287 	case BUS_POWER_PRE_NOTIFICATION:
288 	case BUS_POWER_POST_NOTIFICATION:
289 		bpc = (pm_bp_child_pwrchg_t *)arg;
290 		cdip = bpc->bpc_dip;
291 		new_level = bpc->bpc_nlevel;
292 		old_level = bpc->bpc_olevel;
293 		comp = bpc->bpc_comp;
294 		break;
295 
296 	case BUS_POWER_HAS_CHANGED:
297 		bphc = (pm_bp_has_changed_t *)arg;
298 		cdip = bphc->bphc_dip;
299 		new_level = bphc->bphc_nlevel;
300 		old_level = bphc->bphc_olevel;
301 		comp = bphc->bphc_comp;
302 		break;
303 
304 	default:
305 		break;
306 
307 	}
308 
309 	ASSERT(pwr_p);
310 	mutex_enter(&pwr_p->pwr_lock);
311 	switch (op) {
312 	case BUS_POWER_PRE_NOTIFICATION:
313 		DBG(dip, "pcie_bus_power: %s@%d op %s %d->%d\n",
314 		    ddi_driver_name(cdip), ddi_get_instance(cdip),
315 		    pcie_decode_pwr_op(op), old_level, new_level);
316 		/*
317 		 * If the nexus doesn't want the child to go into
318 		 * non-D0 state, mark the child busy. This way PM
319 		 * framework will never try to lower the child's power.
320 		 * In case of pm_lower_power, marking busy won't help.
321 		 * So we need to specifically reject the attempt to
322 		 * go to non-D0 state.
323 		 */
324 		if (pwr_p->pwr_flags & PCIE_NO_CHILD_PM) {
325 			if (!PCIE_IS_COMPS_COUNTED(cdip)) {
326 				DBG(dip, "pcie_bus_power: marking child "
327 				    "busy to disable pm \n");
328 				(void) pm_busy_component(cdip, 0);
329 			}
330 			if (new_level < PM_LEVEL_D0 && !comp) {
331 				DBG(dip, "pcie_bus_power: rejecting "
332 				    "child's attempt to go to %d\n", new_level);
333 				rv = DDI_FAILURE;
334 			}
335 		}
336 		mutex_exit(&pwr_p->pwr_lock);
337 		if (rv == DDI_SUCCESS)
338 			rv = pcie_pm_hold(dip);
339 		return (rv);
340 
341 	case BUS_POWER_HAS_CHANGED:
342 	case BUS_POWER_POST_NOTIFICATION:
343 		DBG(dip, "pcie_bus_power: %s@%d op %s %d->%d\n",
344 		    ddi_driver_name(cdip), ddi_get_instance(cdip),
345 		    pcie_decode_pwr_op(op), old_level, new_level);
346 		/*
347 		 * Child device power changed
348 		 * If pm components of this child aren't accounted for
349 		 * then add the components to the counters. This can't
350 		 * be done in POST_ATTACH ctlop as pm info isn't created
351 		 * by then. Also because a driver can make a pm call during
352 		 * the attach.
353 		 */
354 		if (!PCIE_IS_COMPS_COUNTED(cdip)) {
355 			(void) pcie_pm_add_child(dip, cdip);
356 			if ((pwr_p->pwr_flags & PCIE_NO_CHILD_PM) &&
357 			    (op == BUS_POWER_HAS_CHANGED)) {
358 				DBG(dip, "pcie_bus_power: marking child "
359 				    "busy to disable pm \n");
360 				(void) pm_busy_component(cdip, 0);
361 				/*
362 				 * If the driver has already changed to lower
363 				 * power(pm_power_has_changed) on its own,
364 				 * there is nothing we can do other than
365 				 * logging the warning message on the console.
366 				 */
367 				if (new_level < PM_LEVEL_D0)
368 					cmn_err(CE_WARN, "!Downstream device "
369 					    "%s@%d went to non-D0 state: "
370 					    "possible loss of link\n",
371 					    ddi_driver_name(cdip),
372 					    ddi_get_instance(cdip));
373 			}
374 		}
375 
376 
377 		/*
378 		 * If it is POST and device PM is supported, release the
379 		 * hold done in PRE.
380 		 */
381 		if (op == BUS_POWER_POST_NOTIFICATION &&
382 		    PCIE_SUPPORTS_DEVICE_PM(dip)) {
383 			pcie_pm_subrelease(dip, pwr_p);
384 		}
385 
386 		if (*((int *)result) == DDI_FAILURE) {
387 			DBG(dip, "pcie_bus_power: change for %s%d failed\n",
388 			    ddi_driver_name(cdip), ddi_get_instance(cdip));
389 			break;
390 		}
391 		/* Modify counters appropriately */
392 		pwr_update_counters(counters, old_level, new_level);
393 
394 		child_counters = PCIE_CHILD_COUNTERS(cdip);
395 		pwr_update_counters(child_counters, old_level, new_level);
396 
397 		/* If no device PM, return */
398 		if (!PCIE_SUPPORTS_DEVICE_PM(dip))
399 			break;
400 
401 		level_allowed = pwr_level_allowed(pwr_p);
402 		/*
403 		 * Check conditions for marking busy
404 		 * Check the flag to set this busy only once for multiple
405 		 * busy conditions. Mark busy if our current lowest possible
406 		 * is equal or greater to the current level.
407 		 */
408 		if (level_allowed >= pwr_p->pwr_func_lvl &&
409 		    !(pwr_p->pwr_flags & PCIE_PM_BUSY)) {
410 			DBG(dip, "pcie_bus_power: marking busy\n");
411 			(void) pm_busy_component(dip, 0);
412 			pwr_p->pwr_flags |= PCIE_PM_BUSY;
413 			break;
414 		}
415 		/*
416 		 * Check conditions for marking idle.
417 		 * If our lowest possible level is less than our current
418 		 * level mark idle. Mark idle only if it is not already done.
419 		 */
420 		if ((level_allowed < pwr_p->pwr_func_lvl) &&
421 		    (pwr_p->pwr_hold == 0) &&
422 		    (pwr_p->pwr_flags & PCIE_PM_BUSY)) {
423 			/*
424 			 * For pci express, we should check here whether
425 			 * the link is in L1 state or not.
426 			 */
427 			DBG(dip, "pcie_bus_power: marking idle\n");
428 			(void) pm_idle_component(dip, 0);
429 			pwr_p->pwr_flags &= ~PCIE_PM_BUSY;
430 			break;
431 		}
432 		break;
433 
434 	default:
435 		mutex_exit(&pwr_p->pwr_lock);
436 		return (pm_busop_bus_power(dip, impl_arg, op, arg, result));
437 	}
438 	mutex_exit(&pwr_p->pwr_lock);
439 	return (rv);
440 }
441 
442 /*
443  * Decrement the count of children at olevel by one and increment
444  * count of children at nlevel by one.
445  */
446 static void
447 pwr_update_counters(int *countersp, int olevel, int nlevel)
448 {
449 	uint32_t	index;
450 
451 	ASSERT(olevel >= PM_LEVEL_UNKNOWN && olevel <= PM_LEVEL_D0);
452 	ASSERT(nlevel >= PM_LEVEL_UNKNOWN && nlevel <= PM_LEVEL_D0);
453 
454 	index = (olevel == PM_LEVEL_UNKNOWN ? PCIE_UNKNOWN_INDEX : olevel);
455 	countersp[index]--;
456 	index = (nlevel == PM_LEVEL_UNKNOWN ? PCIE_UNKNOWN_INDEX : nlevel);
457 	countersp[index]++;
458 }
459 
460 /*
461  * Returns the lowest possible power level allowed for nexus
462  * based on children's power level. Lowest possible level is
463  * equal to the highest level among the children. It also checks
464  * for the supported level
465  * UNKNOWN = D0 > D1 > D2 > D3
466  */
467 static int
468 pwr_level_allowed(pcie_pwr_t *pwr_p)
469 {
470 	int *counters = pwr_p->pwr_counters;
471 	int i, j;
472 
473 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
474 	/*
475 	 * Search from UNKNOWN to D2. unknown is same as D0.
476 	 * find the highest level among the children. If that
477 	 * level is supported, return that level. If not,
478 	 * find the next higher supported level and return that
479 	 * level. For example, if the D1 is the highest among
480 	 * children and if D1 isn't supported return D0 as the
481 	 * lowest possible level. We don't need to look at D3
482 	 * as that is the default lowest level and it is always
483 	 * supported.
484 	 */
485 	for (i = PCIE_UNKNOWN_INDEX; i > 0; i--) {
486 		if (counters[i]) {
487 			if (i == PCIE_UNKNOWN_INDEX)
488 				return (PM_LEVEL_D0);
489 			/*
490 			 * i is the highest level among children. If this is
491 			 * supported, return i.
492 			 */
493 			if (PCIE_LEVEL_SUPPORTED(pwr_p->pwr_pmcaps, i))
494 				return (i);
495 			/* find the next higher supported level */
496 			for (j = i + 1; j <= PCIE_D0_INDEX; j++) {
497 				if (PCIE_LEVEL_SUPPORTED(pwr_p->pwr_pmcaps, j))
498 					return (j);
499 			}
500 		}
501 	}
502 
503 	return (PM_LEVEL_D3);
504 }
505 
506 /*
507  * Update the counters with number pm components of the child
508  * all components are assumed to be at UNKNOWN level.
509  */
510 static void
511 pcie_add_comps(dev_info_t *dip, dev_info_t *cdip, pcie_pwr_t *pwr_p)
512 {
513 	int comps = PM_NUMCMPTS(cdip);
514 	pcie_pm_t *pcie_pm_p;
515 	pcie_pwr_child_t *cpwr_p;
516 
517 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
518 	if (!comps)
519 		return;
520 
521 	DBG(dip, "pcie_add_comps: unknown level counter incremented "
522 	    "from %d by %d because of %s@%d\n",
523 	    (pwr_p->pwr_counters)[PCIE_UNKNOWN_INDEX], comps,
524 	    ddi_driver_name(cdip), ddi_get_instance(cdip));
525 	(pwr_p->pwr_counters)[PCIE_UNKNOWN_INDEX] += comps;
526 	/*
527 	 * Allocate counters per child. This is a part of pcie
528 	 * pm info. If there is no pcie pm info, allocate it here.
529 	 * pcie pm info might already be there for pci express nexus
530 	 * driver e.g. pcieb. For all leaf nodes, it is allocated here.
531 	 */
532 	if ((pcie_pm_p = PCIE_PMINFO(cdip)) == NULL) {
533 		pcie_pm_p = (pcie_pm_t *)kmem_zalloc(
534 		    sizeof (pcie_pm_t), KM_SLEEP);
535 		PCIE_SET_PMINFO(cdip, pcie_pm_p);
536 	}
537 	cpwr_p = (pcie_pwr_child_t *)kmem_zalloc(sizeof (pcie_pwr_child_t),
538 	    KM_SLEEP);
539 	pcie_pm_p->pcie_par_pminfo = cpwr_p;
540 	(cpwr_p->pwr_child_counters)[PCIE_UNKNOWN_INDEX] += comps;
541 }
542 
543 /*
544  * Remove the pm components of a child from our counters.
545  */
546 static void
547 pcie_remove_comps(dev_info_t *dip, dev_info_t *cdip, pcie_pwr_t *pwr_p)
548 {
549 	int i;
550 	int *child_counters;
551 
552 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
553 	if (!(PCIE_PMINFO(cdip)) || !PCIE_PAR_PMINFO(cdip)) {
554 		if (PCIE_SUPPORTS_DEVICE_PM(dip)) {
555 			/*
556 			 * Driver never made a PM call and we didn't create
557 			 * any counters for this device. This also means that
558 			 * hold made at the PRE_ATTACH time, still remains.
559 			 * Remove the hold now. The correct thing to do is to
560 			 * stay at full power when a child is at full power
561 			 * whether a driver is there or not. This will be
562 			 * implemented in the future.
563 			 */
564 			pcie_pm_subrelease(dip, pwr_p);
565 		}
566 		return;
567 	}
568 	DBG(dip, "pcie_remove_comps:counters decremented because of "
569 	    "%s@%d\n", ddi_driver_name(cdip), ddi_get_instance(cdip));
570 	child_counters = PCIE_CHILD_COUNTERS(cdip);
571 	/*
572 	 * Adjust the nexus counters. No need to adjust per child dip
573 	 * counters as we are freeing the per child dip info.
574 	 */
575 	for (i = 0; i < PCIE_MAX_PWR_LEVELS; i++) {
576 		ASSERT((pwr_p->pwr_counters)[i] >= child_counters[i]);
577 		(pwr_p->pwr_counters)[i] -= child_counters[i];
578 	}
579 	/* remove both parent pm info and pcie pminfo itself */
580 	kmem_free(PCIE_PAR_PMINFO(cdip), sizeof (pcie_pwr_child_t));
581 	kmem_free(PCIE_PMINFO(cdip), sizeof (pcie_pm_t));
582 	PCIE_RESET_PMINFO(cdip);
583 }
584 
585 /*
586  * Power management related initialization common to px and pcieb
587  */
588 int
589 pwr_common_setup(dev_info_t *dip)
590 {
591 	pcie_pm_t		*pcie_pm_p;
592 	pcie_pwr_t		*pwr_p;
593 	int			pminfo_created = 0;
594 
595 	/* Create pminfo, if it doesn't exist already */
596 	if ((pcie_pm_p = PCIE_PMINFO(dip)) == NULL) {
597 		pcie_pm_p = (pcie_pm_t *)kmem_zalloc(
598 		    sizeof (pcie_pm_t), KM_SLEEP);
599 		PCIE_SET_PMINFO(dip, pcie_pm_p);
600 		pminfo_created = 1;
601 	}
602 	pwr_p = (pcie_pwr_t *)kmem_zalloc(sizeof (pcie_pwr_t), KM_SLEEP);
603 	mutex_init(&pwr_p->pwr_lock, NULL, MUTEX_DRIVER, NULL);
604 	/* Initialize the power level and default level support */
605 	pwr_p->pwr_func_lvl = PM_LEVEL_UNKNOWN;
606 	pwr_p->pwr_pmcaps = PCIE_DEFAULT_LEVEL_SUPPORTED;
607 
608 	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
609 	    "pm-want-child-notification?", NULL, NULL) != DDI_PROP_SUCCESS) {
610 		DBG(dip, "can't create pm-want-child-notification \n");
611 		goto pwr_common_err;
612 	}
613 	pcie_pm_p->pcie_pwr_p = pwr_p;
614 
615 	return (DDI_SUCCESS);
616 
617 pwr_common_err:
618 	mutex_destroy(&pwr_p->pwr_lock);
619 	kmem_free(pwr_p, sizeof (pcie_pwr_t));
620 	if (pminfo_created) {
621 		PCIE_RESET_PMINFO(dip);
622 		kmem_free(pcie_pm_p, sizeof (pcie_pm_t));
623 	}
624 	return (DDI_FAILURE);
625 
626 }
627 
628 /*
629  * Undo whatever is done in pwr_common_setup. Called by px_detach or pxb_detach
630  */
631 void
632 pwr_common_teardown(dev_info_t *dip)
633 {
634 	pcie_pm_t *pcie_pm_p = PCIE_PMINFO(dip);
635 	pcie_pwr_t *pwr_p;
636 
637 	if (!pcie_pm_p || !(pwr_p = PCIE_NEXUS_PMINFO(dip)))
638 		return;
639 
640 	(void) ddi_prop_remove(DDI_DEV_T_NONE, dip,
641 	    "pm-want-child-notification?");
642 	mutex_destroy(&pwr_p->pwr_lock);
643 	pcie_pm_p->pcie_pwr_p = NULL;
644 	kmem_free(pwr_p, sizeof (pcie_pwr_t));
645 	/*
646 	 * If the parent didn't store have any pm info about
647 	 * this node, that means parent doesn't need pminfo when it handles
648 	 * POST_DETACH for this node. For example, if dip is the dip of
649 	 * root complex, then there is no parent pm info.
650 	 */
651 	if (!PCIE_PAR_PMINFO(dip)) {
652 		kmem_free(pcie_pm_p, sizeof (pcie_pm_t));
653 		PCIE_RESET_PMINFO(dip);
654 	}
655 }
656 
657 /*
658  * Raises the power and marks itself busy.
659  */
660 int
661 pcie_pm_hold(dev_info_t *dip)
662 {
663 	pcie_pwr_t *pwr_p;
664 
665 	/* If no PM info or no device PM, return */
666 	if (!PCIE_PMINFO(dip) || !(pwr_p = PCIE_NEXUS_PMINFO(dip)) ||
667 	    !(PCIE_SUPPORTS_DEVICE_PM(dip)))
668 		return (DDI_SUCCESS);
669 
670 	/*
671 	 * If we are not at full power, then powerup.
672 	 * Need to be at full power so that link can be
673 	 * at L0. Similarly for PCI/PCI-X bus, it should be
674 	 * at full power.
675 	 */
676 	mutex_enter(&pwr_p->pwr_lock);
677 	ASSERT(pwr_p->pwr_hold >= 0);
678 	DBG(dip, "pm_hold: incrementing hold \n");
679 	pwr_p->pwr_hold++;
680 	/* Mark itself busy, if it is not done already */
681 	if (!(pwr_p->pwr_flags & PCIE_PM_BUSY)) {
682 		DBG(dip, "pm_hold: marking busy\n");
683 		pwr_p->pwr_flags |= PCIE_PM_BUSY;
684 		(void) pm_busy_component(dip, 0);
685 	}
686 	if (pwr_p->pwr_func_lvl == PM_LEVEL_D0) {
687 		mutex_exit(&pwr_p->pwr_lock);
688 		return (DDI_SUCCESS);
689 	}
690 	mutex_exit(&pwr_p->pwr_lock);
691 	if (pm_raise_power(dip, 0, PM_LEVEL_D0) != DDI_SUCCESS) {
692 		DBG(dip, "pm_hold: attempt to raise power "
693 		    "from %d to %d failed\n", pwr_p->pwr_func_lvl,
694 		    PM_LEVEL_D0);
695 		pcie_pm_release(dip);
696 		return (DDI_FAILURE);
697 	}
698 	return (DDI_SUCCESS);
699 }
700 
701 /*
702  * Reverse the things done in pcie_pm_hold
703  */
704 void
705 pcie_pm_release(dev_info_t *dip)
706 {
707 	pcie_pwr_t *pwr_p;
708 
709 	/* If no PM info or no device PM, return */
710 	if (!PCIE_PMINFO(dip) || !(pwr_p = PCIE_NEXUS_PMINFO(dip)) ||
711 	    !(PCIE_SUPPORTS_DEVICE_PM(dip)))
712 		return;
713 
714 	mutex_enter(&pwr_p->pwr_lock);
715 	pcie_pm_subrelease(dip, pwr_p);
716 	mutex_exit(&pwr_p->pwr_lock);
717 }
718 
719 static void
720 pcie_pm_subrelease(dev_info_t *dip, pcie_pwr_t *pwr_p)
721 {
722 	int level;
723 
724 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
725 	ASSERT(pwr_p->pwr_hold > 0);
726 	DBG(dip, "pm_subrelease: decrementing hold \n");
727 	pwr_p->pwr_hold--;
728 	ASSERT(pwr_p->pwr_hold >= 0);
729 	ASSERT(pwr_p->pwr_flags & PCIE_PM_BUSY);
730 	level = pwr_level_allowed(pwr_p);
731 	if (pwr_p->pwr_hold == 0 && level < pwr_p->pwr_func_lvl) {
732 		DBG(dip, "pm_subrelease: marking idle \n");
733 		(void) pm_idle_component(dip, 0);
734 		pwr_p->pwr_flags &= ~PCIE_PM_BUSY;
735 	}
736 }
737 
738 /*
739  * Called when the child makes the first power management call.
740  * sets up the counters. All the components of the child device are
741  * assumed to be at unknown level. It also releases the power hold
742  * 	pwr_p - parent's pwr_t
743  *	cdip   - child's dip
744  */
745 int
746 pcie_pm_add_child(dev_info_t *dip, dev_info_t *cdip)
747 {
748 	pcie_pwr_t *pwr_p;
749 
750 	/* If no PM info, return */
751 	if (!PCIE_PMINFO(dip) || !(pwr_p = PCIE_NEXUS_PMINFO(dip)))
752 		return (DDI_SUCCESS);
753 
754 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
755 	ASSERT(pwr_p->pwr_func_lvl == PM_LEVEL_D0);
756 	pcie_add_comps(dip, cdip, pwr_p);
757 
758 	/* If no device power management then return */
759 	if (!PCIE_SUPPORTS_DEVICE_PM(dip))
760 		return (DDI_SUCCESS);
761 
762 	/*
763 	 * We have informed PM that we are busy at PRE_ATTACH time for
764 	 * this child. Release the hold and but don't clear the busy bit.
765 	 * If a device never changes power, hold will not be released
766 	 * and we stay at full power.
767 	 */
768 	ASSERT(pwr_p->pwr_hold > 0);
769 	DBG(dip, "pm_add_child: decrementing hold \n");
770 	pwr_p->pwr_hold--;
771 	/*
772 	 * We must have made sure that busy bit
773 	 * is set when we put the hold
774 	 */
775 	ASSERT(pwr_p->pwr_flags & PCIE_PM_BUSY);
776 	return (DDI_SUCCESS);
777 }
778 
779 /*
780  * Adjust the counters when a child detaches
781  * Marks itself idle if the idle conditions are met.
782  * Called at POST_DETACH time
783  */
784 int
785 pcie_pm_remove_child(dev_info_t *dip, dev_info_t *cdip)
786 {
787 	int *counters;
788 	int total;
789 	pcie_pwr_t *pwr_p;
790 
791 	/* If no PM info, return */
792 	if (!PCIE_PMINFO(dip) || !(pwr_p = PCIE_NEXUS_PMINFO(dip)))
793 		return (DDI_SUCCESS);
794 
795 	counters = pwr_p->pwr_counters;
796 	mutex_enter(&pwr_p->pwr_lock);
797 	pcie_remove_comps(dip, cdip, pwr_p);
798 	/* If no device power management then return */
799 	if (!PCIE_SUPPORTS_DEVICE_PM(dip)) {
800 		mutex_exit(&pwr_p->pwr_lock);
801 		return (DDI_SUCCESS);
802 	}
803 	total = (counters[PCIE_D0_INDEX] + counters[PCIE_UNKNOWN_INDEX] +
804 	    counters[PCIE_D1_INDEX] + counters[PCIE_D2_INDEX] +
805 	    counters[PCIE_D3_INDEX]);
806 	/*
807 	 * Mark idle if either there are no children or our lowest
808 	 * possible level is less than the current level. Mark idle
809 	 * only if it is not already done.
810 	 */
811 	if ((pwr_p->pwr_hold == 0) &&
812 	    (!total || (pwr_level_allowed(pwr_p) < pwr_p->pwr_func_lvl))) {
813 		if (pwr_p->pwr_flags & PCIE_PM_BUSY) {
814 			DBG(dip, "pcie_bus_power: marking idle\n");
815 			(void) pm_idle_component(dip, 0);
816 			pwr_p->pwr_flags &= ~PCIE_PM_BUSY;
817 		}
818 	}
819 	mutex_exit(&pwr_p->pwr_lock);
820 	return (DDI_SUCCESS);
821 }
822 
823 boolean_t
824 pcie_is_pcie(dev_info_t *dip)
825 {
826 	pcie_bus_t *bus_p = PCIE_DIP2BUS(dip);
827 	ASSERT(bus_p);
828 	return (bus_p->bus_pcie_off != 0);
829 }
830 
831 /*
832  * Called by px_attach or pcieb_attach:: DDI_RESUME
833  */
834 int
835 pcie_pwr_resume(dev_info_t *dip)
836 {
837 	dev_info_t *cdip;
838 	pcie_pwr_t *pwr_p = NULL;
839 
840 #if defined(__i386) || defined(__amd64)
841 	if (dip)
842 		return (DDI_SUCCESS);
843 #endif /* defined(__i386) || defined(__amd64) */
844 
845 	if (PCIE_PMINFO(dip))
846 		pwr_p = PCIE_NEXUS_PMINFO(dip);
847 
848 	if (pwr_p) {
849 		/* Inform the PM framework that dip is at full power */
850 		if (PCIE_SUPPORTS_DEVICE_PM(dip)) {
851 			ASSERT(pwr_p->pwr_func_lvl == PM_LEVEL_D0);
852 			(void) pm_raise_power(dip, 0,
853 			    pwr_p->pwr_func_lvl);
854 		}
855 	}
856 
857 	/*
858 	 * Code taken from pci driver.
859 	 * Restore config registers for children that did not save
860 	 * their own registers.  Children pwr states are UNKNOWN after
861 	 * a resume since it is possible for the PM framework to call
862 	 * resume without an actual power cycle. (ie if suspend fails).
863 	 */
864 	for (cdip = ddi_get_child(dip); cdip != NULL;
865 	    cdip = ddi_get_next_sibling(cdip)) {
866 		boolean_t	is_pcie;
867 
868 		/*
869 		 * Not interested in children who are not already
870 		 * init'ed.  They will be set up by init_child().
871 		 */
872 		if (i_ddi_node_state(cdip) < DS_INITIALIZED) {
873 			DBG(dip,
874 			    "DDI_RESUME: skipping %s%d not in CF1\n",
875 			    ddi_driver_name(cdip), ddi_get_instance(cdip));
876 			continue;
877 		}
878 
879 		/*
880 		 * Only restore config registers if saved by nexus.
881 		 */
882 		if (ddi_prop_exists(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
883 		    "nexus-saved-config-regs") != 1)
884 			continue;
885 
886 		DBG(dip,
887 		    "DDI_RESUME: nexus restoring %s%d config regs\n",
888 		    ddi_driver_name(cdip), ddi_get_instance(cdip));
889 
890 		/* clear errors left by OBP scrubbing */
891 		pcie_clear_errors(cdip);
892 
893 		/* PCIe workaround: disable errors during 4K config resore */
894 		if (is_pcie = pcie_is_pcie(cdip))
895 			pcie_disable_errors(cdip);
896 		(void) pci_restore_config_regs(cdip);
897 		if (is_pcie) {
898 			pcie_enable_errors(cdip);
899 			(void) pcie_enable_ce(cdip);
900 		}
901 
902 		if (ndi_prop_remove(DDI_DEV_T_NONE, cdip,
903 		    "nexus-saved-config-regs") != DDI_PROP_SUCCESS) {
904 			DBG(dip, "%s%d can't remove prop %s",
905 			    ddi_driver_name(cdip), ddi_get_instance(cdip),
906 			    "nexus-saved-config-regs");
907 		}
908 	}
909 	return (DDI_SUCCESS);
910 }
911 
912 /*
913  * Called by pcie_detach or pcieb_detach:: DDI_SUSPEND
914  */
915 int
916 pcie_pwr_suspend(dev_info_t *dip)
917 {
918 	dev_info_t *cdip;
919 	int i, *counters; /* per nexus counters */
920 	int *child_counters = NULL; /* per child dip counters */
921 	pcie_pwr_t *pwr_p = NULL;
922 
923 #if defined(__i386) || defined(__amd64)
924 	if (dip)
925 		return (DDI_SUCCESS);
926 #endif /* defined(__i386) || defined(__amd64) */
927 
928 	if (PCIE_PMINFO(dip))
929 		pwr_p = PCIE_NEXUS_PMINFO(dip);
930 
931 	/*
932 	 * Mark all children to be unknown and bring our power level
933 	 * to full, if required. This is to avoid any panics while
934 	 * accessing the child's config space.
935 	 */
936 	if (pwr_p) {
937 		mutex_enter(&pwr_p->pwr_lock);
938 		if (PCIE_SUPPORTS_DEVICE_PM(dip) &&
939 		    pwr_p->pwr_func_lvl != PM_LEVEL_D0) {
940 			mutex_exit(&pwr_p->pwr_lock);
941 			if (pm_raise_power(dip, 0, PM_LEVEL_D0) !=
942 			    DDI_SUCCESS) {
943 				DBG(dip, "pwr_suspend: attempt "
944 				    "to raise power from %d to %d "
945 				    "failed\n", pwr_p->pwr_func_lvl,
946 				    PM_LEVEL_D0);
947 				return (DDI_FAILURE);
948 			}
949 			mutex_enter(&pwr_p->pwr_lock);
950 		}
951 		counters = pwr_p->pwr_counters;
952 		/*
953 		 * Update the nexus counters. At the resume time all
954 		 * components are considered to be at unknown level. Use the
955 		 * fact that counters for unknown level are at the end.
956 		 */
957 		for (i = 0; i < PCIE_UNKNOWN_INDEX; i++) {
958 			counters[PCIE_UNKNOWN_INDEX] += counters[i];
959 			counters[i] = 0;
960 		}
961 		mutex_exit(&pwr_p->pwr_lock);
962 	}
963 
964 	/*
965 	 * Code taken from pci driver.
966 	 * Save the state of the configuration headers of child
967 	 * nodes.
968 	 */
969 	for (cdip = ddi_get_child(dip); cdip != NULL;
970 	    cdip = ddi_get_next_sibling(cdip)) {
971 		boolean_t	is_pcie;
972 
973 		/*
974 		 * Not interested in children who are not already
975 		 * init'ed.  They will be set up in init_child().
976 		 */
977 		if (i_ddi_node_state(cdip) < DS_INITIALIZED) {
978 			DBG(dip, "DDI_SUSPEND: skipping "
979 			    "%s%d not in CF1\n", ddi_driver_name(cdip),
980 			    ddi_get_instance(cdip));
981 			continue;
982 		}
983 		/*
984 		 * Update per child dip counters, if any. Counters
985 		 * will not exist if the child is not power manageable
986 		 * or if its power entry is never invoked.
987 		 */
988 		if (PCIE_PMINFO(cdip) && PCIE_PAR_PMINFO(cdip))
989 			child_counters = PCIE_CHILD_COUNTERS(cdip);
990 		if (child_counters && pwr_p) {
991 			mutex_enter(&pwr_p->pwr_lock);
992 			for (i = 0; i < PCIE_UNKNOWN_INDEX; i++) {
993 				child_counters[PCIE_UNKNOWN_INDEX] +=
994 				    child_counters[i];
995 				child_counters[i] = 0;
996 			}
997 			mutex_exit(&pwr_p->pwr_lock);
998 		}
999 
1000 		/*
1001 		 * Only save config registers if not already saved by child.
1002 		 */
1003 		if (ddi_prop_exists(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
1004 		    SAVED_CONFIG_REGS) == 1) {
1005 			continue;
1006 		}
1007 
1008 		/*
1009 		 * The nexus needs to save config registers.  Create a property
1010 		 * so it knows to restore on resume.
1011 		 */
1012 		if (ndi_prop_create_boolean(DDI_DEV_T_NONE, cdip,
1013 		    "nexus-saved-config-regs") != DDI_PROP_SUCCESS) {
1014 			DBG(dip, "%s%d can't update prop %s",
1015 			    ddi_driver_name(cdip), ddi_get_instance(cdip),
1016 			    "nexus-saved-config-regs");
1017 		}
1018 		DBG(dip, "DDI_SUSPEND: saving config space for"
1019 		    " %s%d\n", ddi_driver_name(cdip), ddi_get_instance(cdip));
1020 
1021 		/* PCIe workaround: disable errors during 4K config save */
1022 		if (is_pcie = pcie_is_pcie(cdip))
1023 			pcie_disable_errors(cdip);
1024 		(void) pci_save_config_regs(cdip);
1025 		if (is_pcie) {
1026 			pcie_enable_errors(cdip);
1027 			(void) pcie_enable_ce(cdip);
1028 		}
1029 	}
1030 	return (DDI_SUCCESS);
1031 }
1032 
1033 #ifdef DEBUG
1034 /*
1035  * Description of bus_power_op.
1036  */
1037 typedef struct pcie_buspwr_desc {
1038 	pm_bus_power_op_t pwr_op;
1039 	char *pwr_desc;
1040 } pcie_buspwr_desc_t;
1041 
1042 static pcie_buspwr_desc_t pcie_buspwr_desc[] = {
1043 	{BUS_POWER_CHILD_PWRCHG, "CHILD_PWRCHG"},
1044 	{BUS_POWER_NEXUS_PWRUP, "NEXUS_PWRUP"},
1045 	{BUS_POWER_PRE_NOTIFICATION, "PRE_NOTIFICATION"},
1046 	{BUS_POWER_POST_NOTIFICATION, "POST_NOTIFICATION"},
1047 	{BUS_POWER_HAS_CHANGED, "HAS_CHANGED"},
1048 	{BUS_POWER_NOINVOL, "NOINVOL"},
1049 	{-1, NULL}
1050 };
1051 
1052 /*
1053  * Returns description of the bus_power_op.
1054  */
1055 static char *
1056 pcie_decode_pwr_op(pm_bus_power_op_t op)
1057 {
1058 	pcie_buspwr_desc_t *descp = pcie_buspwr_desc;
1059 
1060 	for (; descp->pwr_desc; descp++) {
1061 		if (op == descp->pwr_op)
1062 			return (descp->pwr_desc);
1063 	}
1064 	return ("UNKNOWN OP");
1065 }
1066 
1067 static void
1068 pcie_pwr_dbg(dev_info_t *dip, char *fmt, ...)
1069 {
1070 	va_list ap;
1071 	if (!pcie_pwr_print)
1072 		return;
1073 
1074 	if (dip)
1075 		prom_printf("%s(%d): pcie pwr: ", ddi_driver_name(dip),
1076 		    ddi_get_instance(dip));
1077 body:
1078 	va_start(ap, fmt);
1079 	if (ap)
1080 		prom_vprintf(fmt, ap);
1081 	else
1082 		prom_printf(fmt);
1083 
1084 	va_end(ap);
1085 }
1086 
1087 #endif
1088