xref: /freebsd/sys/x86/cpufreq/hwpstate_amd.c (revision 3fdbd8a07a2dcb8fe3cec19fc59ef064453e4755)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2005 Nate Lawson
5  * Copyright (c) 2004 Colin Percival
6  * Copyright (c) 2004-2005 Bruno Durcot
7  * Copyright (c) 2004 FUKUDA Nobuhiko
8  * Copyright (c) 2009 Michael Reifenberger
9  * Copyright (c) 2009 Norikatsu Shigemura
10  * Copyright (c) 2008-2009 Gen Otsuji
11  * Copyright (c) 2025 ShengYi Hung
12  * Copyright (c) 2026 The FreeBSD Foundation
13  *
14  * Portions of this software were developed by Olivier Certner
15  * <olce@FreeBSD.org> at Kumacom SARL under sponsorship from the FreeBSD
16  * Foundation.
17  *
18  * This code is depending on kern_cpu.c, est.c, powernow.c, p4tcc.c, smist.c
19  * in various parts. The authors of these files are Nate Lawson,
20  * Colin Percival, Bruno Durcot, and FUKUDA Nobuhiko.
21  * This code contains patches by Michael Reifenberger and Norikatsu Shigemura.
22  * Thank you.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted providing that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  *
33  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR``AS IS'' AND ANY EXPRESS OR
34  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
35  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
37  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
41  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
42  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
43  * POSSIBILITY OF SUCH DAMAGE.
44  */
45 
46 /*
47  * For more info:
48  * BIOS and Kernel Developer's Guide(BKDG) for AMD Family 10h Processors
49  * 31116 Rev 3.20  February 04, 2009
50  * BIOS and Kernel Developer's Guide(BKDG) for AMD Family 11h Processors
51  * 41256 Rev 3.00 - July 07, 2008
52  * Processor Programming Reference (PPR) for AMD Family 1Ah Model 02h,
53  * Revision C1 Processors Volume 1 of 7 - Sep 29, 2024
54  */
55 
56 #include <sys/param.h>
57 #include <sys/bus.h>
58 #include <sys/cpu.h>
59 #include <sys/kernel.h>
60 #include <sys/malloc.h>
61 #include <sys/module.h>
62 #include <sys/pcpu.h>
63 #include <sys/proc.h>
64 #include <sys/sbuf.h>
65 #include <sys/sched.h>
66 #include <sys/smp.h>
67 
68 #include <machine/_inttypes.h>
69 #include <machine/cputypes.h>
70 #include <machine/md_var.h>
71 #include <machine/specialreg.h>
72 
73 #include <contrib/dev/acpica/include/acpi.h>
74 
75 #include <dev/acpica/acpivar.h>
76 
77 #include <x86/cpufreq/hwpstate_common.h>
78 
79 #include "acpi_if.h"
80 #include "cpufreq_if.h"
81 
82 
/*
 * Legacy P-state MSRs.  In the visible code, LIMIT is read for the P-state
 * count and the current limit, CONTROL is written with the target P-state id,
 * STATUS is read back to find the current P-state, and CONFIG + i is read to
 * decode the definition of P-state 'i'.
 */
#define	MSR_AMD_10H_11H_LIMIT	0xc0010061
#define	MSR_AMD_10H_11H_CONTROL	0xc0010062
#define	MSR_AMD_10H_11H_STATUS	0xc0010063
#define	MSR_AMD_10H_11H_CONFIG	0xc0010064

/*
 * CPPC MSRs.  Only CAPS_1, ENABLE and REQUEST are accessed by the visible
 * code.
 */
#define	MSR_AMD_CPPC_CAPS_1	0xc00102b0
#define	MSR_AMD_CPPC_ENABLE	0xc00102b1
#define	MSR_AMD_CPPC_CAPS_2	0xc00102b2
#define	MSR_AMD_CPPC_REQUEST	0xc00102b3
#define	MSR_AMD_CPPC_STATUS	0xc00102b4

/* Register names as they appear in diagnostic output. */
#define	MSR_AMD_CPPC_CAPS_1_NAME	"CPPC_CAPABILITY_1"
#define	MSR_AMD_CPPC_ENABLE_NAME	"CPPC_ENABLE"
#define	MSR_AMD_CPPC_REQUEST_NAME	"CPPC_REQUEST"

/* NOTE(review): not referenced by the visible code. */
#define	MSR_AMD_PWR_ACC		0xc001007a
#define	MSR_AMD_PWR_ACC_MX	0xc001007b

/* Capacity of the per-softc P-state table. */
#define	AMD_10H_11H_MAX_STATES	16

/* for MSR_AMD_10H_11H_LIMIT C001_0061 */
#define	AMD_10H_11H_GET_PSTATE_MAX_VAL(msr)	(((msr) >> 4) & 0x7)
#define	AMD_10H_11H_GET_PSTATE_LIMIT(msr)	(((msr)) & 0x7)
/* for MSR_AMD_10H_11H_CONFIG 10h:C001_0064:68 / 11h:C001_0064:6B */
#define	AMD_10H_11H_CUR_VID(msr)		(((msr) >> 9) & 0x7F)
#define	AMD_10H_11H_CUR_DID(msr)		(((msr) >> 6) & 0x07)
#define	AMD_10H_11H_CUR_FID(msr)		((msr) & 0x3F)

/*
 * Field extractors for the P-state definition layout that
 * hwpstate_get_info_from_msr() applies to families 17h through 1Ah.
 */
#define	AMD_17H_CUR_IDIV(msr)			(((msr) >> 30) & 0x03)
#define	AMD_17H_CUR_IDD(msr)			(((msr) >> 22) & 0xFF)
#define	AMD_17H_CUR_VID(msr)			(((msr) >> 14) & 0xFF)
#define	AMD_17H_CUR_DID(msr)			(((msr) >> 8) & 0x3F)
#define	AMD_17H_CUR_FID(msr)			((msr) & 0xFF)

/* Family 1Ah frequency id: a 12-bit field instead of the 8-bit one above. */
#define	AMD_1AH_CUR_FID(msr)			((msr) & 0xFFF)

/* Field masks within MSR_AMD_CPPC_CAPS_1, for use with the BITS_* macros. */
#define	AMD_CPPC_CAPS_1_HIGHEST_PERF_BITS	0xff000000
#define	AMD_CPPC_CAPS_1_NOMINAL_PERF_BITS	0x00ff0000
#define	AMD_CPPC_CAPS_1_EFFICIENT_PERF_BITS	0x0000ff00
#define	AMD_CPPC_CAPS_1_LOWEST_PERF_BITS	0x000000ff

/* Field masks within MSR_AMD_CPPC_REQUEST, for use with the BITS_* macros. */
#define	AMD_CPPC_REQUEST_EPP_BITS		0xff000000
#define	AMD_CPPC_REQUEST_DES_PERF_BITS		0x00ff0000
#define	AMD_CPPC_REQUEST_MIN_PERF_BITS		0x0000ff00
#define	AMD_CPPC_REQUEST_MAX_PERF_BITS		0x000000ff

/* Driver name; also the devclass name looked up by set_cppc_request(). */
#define	HWP_AMD_CLASSNAME			"hwpstate_amd"

/*
 * Bit-field helpers working on a mask 'bits'; the field position is taken
 * from the lowest set bit of the mask (ffsll()), so 'bits' must be non-zero.
 *
 * BITS_VALUE:      extract the field from 'val', shifted down to bit 0.
 * BITS_WITH_VALUE: move 'val' into the field's position, truncated to the
 *                  mask.
 * SET_BITS_VALUE:  replace the field inside the lvalue 'var' with 'val'.
 */
#define	BITS_VALUE(bits, val)						\
	(((val) & (bits)) >> (ffsll((bits)) - 1))
#define	BITS_WITH_VALUE(bits, val)					\
	(((uintmax_t)(val) << (ffsll((bits)) - 1)) & (bits))
#define	SET_BITS_VALUE(var, bits, val)					\
	((var) = ((var) & ~(bits)) | BITS_WITH_VALUE((bits), (val)))

/* Print a device-prefixed message, but only when hwpstate_verbose is set. */
#define	HWPSTATE_DEBUG(dev, msg...)			\
	do {						\
		if (hwpstate_verbose)			\
			device_printf(dev, msg);	\
	} while (0)
143 
/*
 * One entry of the P-state table kept in the softc.  Entries are filled either
 * from acpi_perf or by decoding the P-state definition MSRs, and are copied
 * field by field into the cf_setting structures handed to cpufreq.
 */
struct hwpstate_setting {
	int	freq;		/* CPU clock in MHz or 100ths of a percent. */
	int	volts;		/* Voltage in mV. */
	int	power;		/* Power consumed in mW. */
	int	lat;		/* Transition latency in us. */
	int	pstate_id;	/* P-State id */
};
151 
/* Softc flag: the device is operated through CPPC instead of P-state MSRs. */
#define HWPFL_USE_CPPC		(1 << 0)

/*
 * Atomicity is achieved by only modifying a given softc on its associated CPU
 * and with interrupts disabled.
 *
 * XXX - Only the CPPC support complies at the moment.
 */
struct hwpstate_softc {
	device_t	dev;		/* Owning device, set by hwpstate_probe(). */
	u_int		flags;		/* HWPFL_* */
	union {
		/* Legacy P-state mode (HWPFL_USE_CPPC clear). */
		struct {
			struct hwpstate_setting
			hwpstate_settings[AMD_10H_11H_MAX_STATES];
			int cfnum;	/* Number of valid entries above. */
		};
		/* CPPC mode (HWPFL_USE_CPPC set). */
		struct {
			/* Driver's copy of the MSR_AMD_CPPC_REQUEST value. */
			uint64_t request;
		} cppc;
	};
};
174 
/* Device, cpufreq and ACPI method implementations, plus internal helpers. */
static void	hwpstate_identify(driver_t *driver, device_t parent);
static int	hwpstate_probe(device_t dev);
static int	hwpstate_attach(device_t dev);
static int	hwpstate_detach(device_t dev);
static int	hwpstate_set(device_t dev, const struct cf_setting *cf);
static int	hwpstate_get(device_t dev, struct cf_setting *cf);
static int	hwpstate_settings(device_t dev, struct cf_setting *sets, int *count);
static int	hwpstate_type(device_t dev, int *type);
static int	hwpstate_shutdown(device_t dev);
static int	hwpstate_features(driver_t *driver, u_int *features);
static int	hwpstate_get_info_from_acpi_perf(device_t dev, device_t perf_dev);
static int	hwpstate_get_info_from_msr(device_t dev);
static int	hwpstate_goto_pstate(device_t dev, int pstate_id);

/*
 * debug.hwpstate_verify: when non-zero, hwpstate_goto_pstate() polls the
 * status MSR on every CPU after a switch and fails if the P-state was not
 * reached.
 */
static int	hwpstate_verify;
SYSCTL_INT(_debug, OID_AUTO, hwpstate_verify, CTLFLAG_RWTUN,
    &hwpstate_verify, 0, "Verify P-state after setting");

/*
 * debug.hwpstate_pstate_limit: when set, hwpstate_goto_pstate() clamps the
 * requested P-state to the limit read from MSR_AMD_10H_11H_LIMIT.
 */
static bool	hwpstate_pstate_limit;
SYSCTL_BOOL(_debug, OID_AUTO, hwpstate_pstate_limit, CTLFLAG_RWTUN,
    &hwpstate_pstate_limit, 0,
    "If enabled (1), limit administrative control of P-states to the value in "
    "CurPstateLimit");

/*
 * machdep.hwpstate_amd_cppc_enable: consulted by hwpstate_probe() to decide
 * whether CPPC may be used when the CPU advertises it.
 */
static bool	hwpstate_amd_cppc_enable = true;
SYSCTL_BOOL(_machdep, OID_AUTO, hwpstate_amd_cppc_enable, CTLFLAG_RDTUN,
    &hwpstate_amd_cppc_enable, 0,
    "Set 1 (default) to enable AMD CPPC, 0 to disable");
203 
204 static device_method_t hwpstate_methods[] = {
205 	/* Device interface */
206 	DEVMETHOD(device_identify,	hwpstate_identify),
207 	DEVMETHOD(device_probe,		hwpstate_probe),
208 	DEVMETHOD(device_attach,	hwpstate_attach),
209 	DEVMETHOD(device_detach,	hwpstate_detach),
210 	DEVMETHOD(device_shutdown,	hwpstate_shutdown),
211 
212 	/* cpufreq interface */
213 	DEVMETHOD(cpufreq_drv_set,	hwpstate_set),
214 	DEVMETHOD(cpufreq_drv_get,	hwpstate_get),
215 	DEVMETHOD(cpufreq_drv_settings,	hwpstate_settings),
216 	DEVMETHOD(cpufreq_drv_type,	hwpstate_type),
217 
218 	/* ACPI interface */
219 	DEVMETHOD(acpi_get_features,	hwpstate_features),
220 	{0, 0}
221 };
222 
223 static inline void
224 check_cppc_in_use(const struct hwpstate_softc *const sc, const char *const func)
225 {
226 	KASSERT((sc->flags & HWPFL_USE_CPPC) != 0, (HWP_AMD_CLASSNAME
227 	    ": %s() called but HWPFL_USE_CPPC not set", func));
228 }
229 
230 static void
231 print_msr_bits(struct sbuf *const sb, const char *const legend,
232     const uint64_t bits, const uint64_t msr_value)
233 {
234 	sbuf_printf(sb, "\t%s: %" PRIu64 "\n", legend,
235 	    BITS_VALUE(bits, msr_value));
236 }
237 
238 static void
239 print_cppc_caps_1(struct sbuf *const sb, const uint64_t caps)
240 {
241 	sbuf_printf(sb, MSR_AMD_CPPC_CAPS_1_NAME ": %#016" PRIx64 "\n", caps);
242 	print_msr_bits(sb, "Highest Performance",
243 	    AMD_CPPC_CAPS_1_HIGHEST_PERF_BITS, caps);
244 	print_msr_bits(sb, "Guaranteed Performance",
245 	    AMD_CPPC_CAPS_1_NOMINAL_PERF_BITS, caps);
246 	print_msr_bits(sb, "Efficient Performance",
247 	    AMD_CPPC_CAPS_1_EFFICIENT_PERF_BITS, caps);
248 	print_msr_bits(sb, "Lowest Performance",
249 	    AMD_CPPC_CAPS_1_LOWEST_PERF_BITS, caps);
250 }
251 
252 #define MSR_NOT_READ_MSG	"Not read (fault or previous errors)"
253 
254 static void
255 print_cppc_no_caps_1(struct sbuf *const sb)
256 {
257 	sbuf_printf(sb, MSR_AMD_CPPC_CAPS_1_NAME ": " MSR_NOT_READ_MSG "\n");
258 }
259 
260 static void
261 print_cppc_request(struct sbuf *const sb, const uint64_t request)
262 {
263 	sbuf_printf(sb, MSR_AMD_CPPC_REQUEST_NAME ": %#016" PRIx64 "\n",
264 	    request);
265 	print_msr_bits(sb, "Efficiency / Energy Preference",
266 	    AMD_CPPC_REQUEST_EPP_BITS, request);
267 	print_msr_bits(sb, "Desired Performance",
268 	    AMD_CPPC_REQUEST_DES_PERF_BITS, request);
269 	print_msr_bits(sb, "Minimum Performance",
270 	    AMD_CPPC_REQUEST_MIN_PERF_BITS, request);
271 	print_msr_bits(sb, "Maximum Performance",
272 	    AMD_CPPC_REQUEST_MAX_PERF_BITS, request);
273 }
274 
275 static void
276 print_cppc_no_request(struct sbuf *const sb)
277 {
278 	sbuf_printf(sb, MSR_AMD_CPPC_REQUEST_NAME ": " MSR_NOT_READ_MSG "\n");
279 }
280 
281 /*
282  * Internal errors conveyed by code executing on another CPU.
283  */
284 #define HWP_ERROR_CPPC_ENABLE		(1 << 0)
285 #define HWP_ERROR_CPPC_CAPS		(1 << 1)
286 #define HWP_ERROR_CPPC_REQUEST		(1 << 2)
287 #define HWP_ERROR_CPPC_REQUEST_WRITE	(1 << 3)
288 
289 static inline bool
290 hwp_has_error(u_int res, u_int err)
291 {
292 	return ((res & err) != 0);
293 }
294 
295 struct get_cppc_regs_data {
296 	uint64_t enable;
297 	uint64_t caps;
298 	uint64_t req;
299 	/* HWP_ERROR_CPPC_* except HWP_ERROR_*_WRITE */
300 	u_int res;
301 };
302 
303 static void
304 get_cppc_regs_cb(void *args)
305 {
306 	struct get_cppc_regs_data *data = args;
307 	int error;
308 
309 	data->res = 0;
310 
311 	error = rdmsr_safe(MSR_AMD_CPPC_ENABLE, &data->enable);
312 	if (error != 0)
313 		data->res |= HWP_ERROR_CPPC_ENABLE;
314 
315 	error = rdmsr_safe(MSR_AMD_CPPC_CAPS_1, &data->caps);
316 	if (error != 0)
317 		data->res |= HWP_ERROR_CPPC_CAPS;
318 
319 	error = rdmsr_safe(MSR_AMD_CPPC_REQUEST, &data->req);
320 	if (error != 0)
321 		data->res |= HWP_ERROR_CPPC_REQUEST;
322 }
323 
324 static int
325 sysctl_cppc_dump_handler(SYSCTL_HANDLER_ARGS)
326 {
327 	const struct hwpstate_softc *const sc = arg1;
328 	const device_t dev = sc->dev;
329 	const u_int cpuid = cpu_get_pcpu(dev)->pc_cpuid;
330 	struct sbuf *sb;
331 	struct sbuf sbs;
332 	struct get_cppc_regs_data data;
333 	int error;
334 
335 	/* Sysctl knob does not exist if HWPFL_USE_CPPC is not set. */
336 	check_cppc_in_use(sc, __func__);
337 
338 	sb = sbuf_new_for_sysctl(&sbs, NULL, 0, req);
339 
340 	smp_rendezvous_cpu(cpuid, smp_no_rendezvous_barrier, get_cppc_regs_cb,
341 	    smp_no_rendezvous_barrier, &data);
342 
343 	if (hwp_has_error(data.res, HWP_ERROR_CPPC_ENABLE))
344 		sbuf_printf(sb, "CPU%u: " MSR_AMD_CPPC_ENABLE_NAME ": "
345 		    MSR_NOT_READ_MSG "\n", cpuid);
346 	else
347 		sbuf_printf(sb, "CPU%u: HWP %sabled (" MSR_AMD_CPPC_REQUEST_NAME
348 		    ": %#" PRIx64 ")\n", cpuid, data.enable & 1 ? "En" : "Dis",
349 		    data.enable);
350 
351 	if (hwp_has_error(data.res, HWP_ERROR_CPPC_CAPS))
352 		print_cppc_no_caps_1(sb);
353 	else
354 		print_cppc_caps_1(sb, data.caps);
355 
356 	if (hwp_has_error(data.res, HWP_ERROR_CPPC_REQUEST))
357 		print_cppc_no_request(sb);
358 	else
359 		print_cppc_request(sb, data.req);
360 
361 	error = sbuf_finish(sb);
362 	sbuf_delete(sb);
363 
364 	return (error);
365 }
366 
367 
368 struct set_cppc_request_cb {
369 	struct hwpstate_softc	*sc;
370 	uint64_t		 request;
371 	uint64_t		 mask;
372 	int			 res; /* 0 or HWP_ERROR_CPPC_REQUEST_WRITE */
373 };
374 
375 static void
376 set_cppc_request_cb(void *args)
377 {
378 	struct set_cppc_request_cb *const data = args;
379 	uint64_t *const req = &data->sc->cppc.request;
380 	int error;
381 
382 	*req &= ~data->mask;
383 	*req |= data->request & data->mask;
384 
385 	error = wrmsr_safe(MSR_AMD_CPPC_REQUEST, *req);
386 	data->res = error == 0 ? 0 : HWP_ERROR_CPPC_REQUEST_WRITE;
387 }
388 
389 static inline void
390 set_cppc_request_send_one(struct set_cppc_request_cb *const data, device_t dev)
391 {
392 	const u_int cpuid = cpu_get_pcpu(dev)->pc_cpuid;
393 
394 	data->sc = device_get_softc(dev);
395 	smp_rendezvous_cpu(cpuid, smp_no_rendezvous_barrier,
396 	    set_cppc_request_cb, smp_no_rendezvous_barrier, data);
397 }
398 
399 static int
400 set_cppc_request(device_t hwp_dev, uint64_t request, uint64_t mask)
401 {
402 	struct set_cppc_request_cb data = {
403 		.request = request,
404 		.mask = mask,
405 		/* 'sc' filled by set_cppc_request_send_one(). */
406 	};
407 	int error;
408 
409 	if (hwpstate_pkg_ctrl_enable) {
410 		const devclass_t dc = devclass_find(HWP_AMD_CLASSNAME);
411 		const int units = devclass_get_maxunit(dc);
412 
413 		error = 0;
414 		for (int i = 0; i < units; ++i) {
415 			const device_t dev = devclass_get_device(dc, i);
416 
417 			set_cppc_request_send_one(&data, dev);
418 			if (data.res != 0)
419 				/* Note the error, but continue. */
420 				error = EFAULT;
421 		}
422 	} else {
423 		set_cppc_request_send_one(&data, hwp_dev);
424 		error = data.res != 0 ? EFAULT : 0;
425 	}
426 
427 	return (error);
428 }
429 
430 static int
431 sysctl_cppc_request_field_handler(SYSCTL_HANDLER_ARGS)
432 {
433 	const u_int max = BITS_VALUE(arg2, (uint64_t)-1);
434 	const device_t dev = arg1;
435 	struct hwpstate_softc *const sc = device_get_softc(dev);
436 	u_int val;
437 	int error;
438 
439 	/* Sysctl knob does not exist if HWPFL_USE_CPPC is not set. */
440 	check_cppc_in_use(sc, __func__);
441 
442 	val = BITS_VALUE(arg2, sc->cppc.request);
443 
444 	error = sysctl_handle_int(oidp, &val, 0, req);
445 	if (error != 0 || req->newptr == NULL)
446 		return (error);
447 
448 	if (val > max)
449 		return (EINVAL);
450 	error = set_cppc_request(dev, BITS_WITH_VALUE(arg2, val),
451 	    BITS_WITH_VALUE(arg2, -1));
452 	return (error);
453 }
454 
455 static driver_t hwpstate_driver = {
456 	HWP_AMD_CLASSNAME,
457 	hwpstate_methods,
458 	sizeof(struct hwpstate_softc),
459 };
460 
461 DRIVER_MODULE(hwpstate, cpu, hwpstate_driver, 0, 0);
462 
/*
 * Scale 'val' down by 10^'div' using truncating integer division, for 'div'
 * in 1..3.  Any other 'div' (including 0) leaves 'val' unchanged.
 */
static int
hwpstate_amd_iscale(int val, int div)
{
	int steps;

	if (div < 1 || div > 3)
		return (val);

	/* One division by ten per decimal place to drop. */
	for (steps = div; steps > 0; steps--)
		val /= 10;

	return (val);
}
479 
480 /*
481  * Go to Px-state on all cpus, considering the limit register (if so
482  * configured).
483  */
484 static int
485 hwpstate_goto_pstate(device_t dev, int id)
486 {
487 	sbintime_t sbt;
488 	uint64_t msr;
489 	int cpu, i, j, limit;
490 
491 	if (hwpstate_pstate_limit) {
492 		/* get the current pstate limit */
493 		msr = rdmsr(MSR_AMD_10H_11H_LIMIT);
494 		limit = AMD_10H_11H_GET_PSTATE_LIMIT(msr);
495 		if (limit > id) {
496 			HWPSTATE_DEBUG(dev, "Restricting requested P%d to P%d "
497 			    "due to HW limit\n", id, limit);
498 			id = limit;
499 		}
500 	}
501 
502 	cpu = curcpu;
503 	HWPSTATE_DEBUG(dev, "setting P%d-state on cpu%d\n", id, cpu);
504 	/* Go To Px-state */
505 	wrmsr(MSR_AMD_10H_11H_CONTROL, id);
506 
507 	/*
508 	 * We are going to the same Px-state on all cpus.
509 	 * Probably should take _PSD into account.
510 	 */
511 	CPU_FOREACH(i) {
512 		if (i == cpu)
513 			continue;
514 
515 		/* Bind to each cpu. */
516 		thread_lock(curthread);
517 		sched_bind(curthread, i);
518 		thread_unlock(curthread);
519 		HWPSTATE_DEBUG(dev, "setting P%d-state on cpu%d\n", id, i);
520 		/* Go To Px-state */
521 		wrmsr(MSR_AMD_10H_11H_CONTROL, id);
522 	}
523 
524 	/*
525 	 * Verify whether each core is in the requested P-state.
526 	 */
527 	if (hwpstate_verify) {
528 		CPU_FOREACH(i) {
529 			thread_lock(curthread);
530 			sched_bind(curthread, i);
531 			thread_unlock(curthread);
532 			/* wait loop (100*100 usec is enough ?) */
533 			for (j = 0; j < 100; j++) {
534 				/* get the result. not assure msr=id */
535 				msr = rdmsr(MSR_AMD_10H_11H_STATUS);
536 				if (msr == id)
537 					break;
538 				sbt = SBT_1MS / 10;
539 				tsleep_sbt(dev, PZERO, "pstate_goto", sbt,
540 				    sbt >> tc_precexp, 0);
541 			}
542 			HWPSTATE_DEBUG(dev, "result: P%d-state on cpu%d\n",
543 			    (int)msr, i);
544 			if (msr != id) {
545 				HWPSTATE_DEBUG(dev,
546 				    "error: loop is not enough.\n");
547 				return (ENXIO);
548 			}
549 		}
550 	}
551 
552 	return (0);
553 }
554 
555 static int
556 hwpstate_set(device_t dev, const struct cf_setting *cf)
557 {
558 	struct hwpstate_softc *sc;
559 	struct hwpstate_setting *set;
560 	int i;
561 
562 	if (cf == NULL)
563 		return (EINVAL);
564 	sc = device_get_softc(dev);
565 	if ((sc->flags & HWPFL_USE_CPPC) != 0)
566 		return (EOPNOTSUPP);
567 	set = sc->hwpstate_settings;
568 	for (i = 0; i < sc->cfnum; i++)
569 		if (CPUFREQ_CMP(cf->freq, set[i].freq))
570 			break;
571 	if (i == sc->cfnum)
572 		return (EINVAL);
573 
574 	return (hwpstate_goto_pstate(dev, set[i].pstate_id));
575 }
576 
577 static int
578 hwpstate_get(device_t dev, struct cf_setting *cf)
579 {
580 	struct hwpstate_softc *sc;
581 	struct hwpstate_setting set;
582 	struct pcpu *pc;
583 	uint64_t msr;
584 	uint64_t rate;
585 	int ret;
586 
587 	sc = device_get_softc(dev);
588 	if (cf == NULL)
589 		return (EINVAL);
590 
591 	if ((sc->flags & HWPFL_USE_CPPC) != 0) {
592 		pc = cpu_get_pcpu(dev);
593 		if (pc == NULL)
594 			return (ENXIO);
595 
596 		memset(cf, CPUFREQ_VAL_UNKNOWN, sizeof(*cf));
597 		cf->dev = dev;
598 		if ((ret = cpu_est_clockrate(pc->pc_cpuid, &rate)))
599 			return (ret);
600 		cf->freq = rate / 1000000;
601 	} else {
602 		msr = rdmsr(MSR_AMD_10H_11H_STATUS);
603 		if (msr >= sc->cfnum)
604 			return (EINVAL);
605 		set = sc->hwpstate_settings[msr];
606 
607 		cf->freq = set.freq;
608 		cf->volts = set.volts;
609 		cf->power = set.power;
610 		cf->lat = set.lat;
611 		cf->dev = dev;
612 	}
613 
614 	return (0);
615 }
616 
617 static int
618 hwpstate_settings(device_t dev, struct cf_setting *sets, int *count)
619 {
620 	struct hwpstate_softc *sc;
621 	struct hwpstate_setting set;
622 	int i;
623 
624 	if (sets == NULL || count == NULL)
625 		return (EINVAL);
626 	sc = device_get_softc(dev);
627 	if ((sc->flags & HWPFL_USE_CPPC) != 0)
628 		return (EOPNOTSUPP);
629 
630 	if (*count < sc->cfnum)
631 		return (E2BIG);
632 	for (i = 0; i < sc->cfnum; i++, sets++) {
633 		set = sc->hwpstate_settings[i];
634 		sets->freq = set.freq;
635 		sets->volts = set.volts;
636 		sets->power = set.power;
637 		sets->lat = set.lat;
638 		sets->dev = dev;
639 	}
640 	*count = sc->cfnum;
641 
642 	return (0);
643 }
644 
645 static int
646 hwpstate_type(device_t dev, int *type)
647 {
648 	struct hwpstate_softc *sc;
649 
650 	if (type == NULL)
651 		return (EINVAL);
652 	sc = device_get_softc(dev);
653 
654 	*type = CPUFREQ_TYPE_ABSOLUTE;
655 	*type |= (sc->flags & HWPFL_USE_CPPC) != 0 ?
656 	    CPUFREQ_FLAG_INFO_ONLY | CPUFREQ_FLAG_UNCACHED :
657 	    0;
658 	return (0);
659 }
660 
661 static void
662 hwpstate_identify(driver_t *driver, device_t parent)
663 {
664 	if (device_find_child(parent, HWP_AMD_CLASSNAME, DEVICE_UNIT_ANY) !=
665 	    NULL)
666 		return;
667 
668 	if ((cpu_vendor_id != CPU_VENDOR_AMD || CPUID_TO_FAMILY(cpu_id) < 0x10) &&
669 	    cpu_vendor_id != CPU_VENDOR_HYGON)
670 		return;
671 
672 	/*
673 	 * Check if hardware pstate enable bit is set.
674 	 */
675 	if ((amd_pminfo & AMDPM_HW_PSTATE) == 0) {
676 		HWPSTATE_DEBUG(parent, "hwpstate enable bit is not set.\n");
677 		return;
678 	}
679 
680 	if (resource_disabled(HWP_AMD_CLASSNAME, 0))
681 		return;
682 
683 	if (BUS_ADD_CHILD(parent, 10, HWP_AMD_CLASSNAME,
684 		device_get_unit(parent)) == NULL)
685 		device_printf(parent, "hwpstate: add child failed\n");
686 }
687 
688 struct set_autonomous_hwp_data {
689 	/* Inputs */
690 	struct hwpstate_softc *sc;
691 	/* Outputs */
692 	/* HWP_ERROR_CPPC_* */
693 	u_int res;
694 	/* Below fields filled depending on 'res'. */
695 	uint64_t caps;
696 	uint64_t init_request;
697 	uint64_t request;
698 };
699 
700 static void
701 enable_cppc_cb(void *args)
702 {
703 	struct set_autonomous_hwp_data *const data = args;
704 	struct hwpstate_softc *const sc = data->sc;
705 	uint64_t lowest_perf, highest_perf;
706 	int error;
707 
708 	/* We proceed sequentially, so we'll clear out errors on progress. */
709 	data->res = HWP_ERROR_CPPC_ENABLE | HWP_ERROR_CPPC_CAPS |
710 	    HWP_ERROR_CPPC_REQUEST | HWP_ERROR_CPPC_REQUEST_WRITE;
711 
712 	error = wrmsr_safe(MSR_AMD_CPPC_ENABLE, 1);
713 	if (error != 0)
714 		return;
715 	data->res &= ~HWP_ERROR_CPPC_ENABLE;
716 
717 	error = rdmsr_safe(MSR_AMD_CPPC_CAPS_1, &data->caps);
718 	if (error != 0)
719 		return;
720 	data->res &= ~HWP_ERROR_CPPC_CAPS;
721 
722 	error = rdmsr_safe(MSR_AMD_CPPC_REQUEST, &sc->cppc.request);
723 	if (error != 0)
724 		return;
725 	data->res &= ~HWP_ERROR_CPPC_REQUEST;
726 	/* The CPPC_REQUEST value before we tweak it. */
727 	data->init_request = sc->cppc.request;
728 
729 	/*
730 	 * In Intel's reference manual, the default value of EPP is 0x80u which
731 	 * is the balanced mode. For consistency, we set the same value in AMD's
732 	 * CPPC driver.
733 	 */
734 	SET_BITS_VALUE(sc->cppc.request, AMD_CPPC_REQUEST_EPP_BITS, 0x80);
735 
736 	/* Enable autonomous mode by setting desired performance to 0. */
737 	SET_BITS_VALUE(sc->cppc.request, AMD_CPPC_REQUEST_DES_PERF_BITS, 0);
738 
739 	/*
740 	 * When MSR_AMD_CPPC_CAPS_1 stays at its reset value (0) before CPPC
741 	 * activation (not supposed to happen, but happens in the field), we use
742 	 * reasonable default values that are explicitly described by the ACPI
743 	 * spec (all 0s for the minimum value, all 1s for the maximum one).
744 	 * Going further, we actually do the same as long as the minimum and
745 	 * maximum performance levels are not sorted or are equal (in which case
746 	 * CPPC is not supposed to make sense at all), which covers the reset
747 	 * value case.
748 	 */
749 	lowest_perf = BITS_VALUE(AMD_CPPC_CAPS_1_LOWEST_PERF_BITS, data->caps);
750 	highest_perf = BITS_VALUE(AMD_CPPC_CAPS_1_HIGHEST_PERF_BITS, data->caps);
751 	if (lowest_perf >= highest_perf) {
752 		lowest_perf = 0;
753 		highest_perf = -1;
754 	}
755 	SET_BITS_VALUE(sc->cppc.request, AMD_CPPC_REQUEST_MIN_PERF_BITS,
756 	    lowest_perf);
757 	SET_BITS_VALUE(sc->cppc.request, AMD_CPPC_REQUEST_MAX_PERF_BITS,
758 	    highest_perf);
759 
760 	error = wrmsr_safe(MSR_AMD_CPPC_REQUEST, sc->cppc.request);
761 	if (error != 0)
762 		return;
763 	data->res &= ~HWP_ERROR_CPPC_REQUEST_WRITE;
764 	data->request = sc->cppc.request;
765 }
766 
767 static int
768 enable_cppc(struct hwpstate_softc *sc)
769 {
770 	const device_t dev = sc->dev;
771 	const u_int cpuid = cpu_get_pcpu(dev)->pc_cpuid;
772 	struct set_autonomous_hwp_data data;
773 	struct sbuf sbs;
774 	struct sbuf *sb;
775 
776 	data.sc = sc;
777 	smp_rendezvous_cpu(cpuid, smp_no_rendezvous_barrier,
778 	    enable_cppc_cb, smp_no_rendezvous_barrier, &data);
779 
780 	if (hwp_has_error(data.res, HWP_ERROR_CPPC_ENABLE)) {
781 		device_printf(dev, "CPU%u: Failed to enable CPPC!\n", cpuid);
782 		return (ENXIO);
783 	}
784 	device_printf(dev, "CPU%u: CPPC enabled.\n", cpuid);
785 
786 	/*
787 	 * Now that we have enabled CPPC, we can't go back, so we'll attach even
788 	 * in case of further malfunction, allowing the user to retry setting
789 	 * CPPC_REQUEST via the sysctl knobs.
790 	 */
791 
792 	sb = sbuf_new(&sbs, NULL, 0, SBUF_AUTOEXTEND);
793 
794 	if (hwpstate_verbose)
795 		sbuf_printf(sb,
796 		    "CPU%u: Initial MSR values after CPPC enable:\n", cpuid);
797 	if (hwp_has_error(data.res, HWP_ERROR_CPPC_CAPS))
798 		print_cppc_no_caps_1(sb);
799 	else if (hwpstate_verbose)
800 		print_cppc_caps_1(sb, data.caps);
801 	if (hwp_has_error(data.res, HWP_ERROR_CPPC_REQUEST))
802 		print_cppc_no_request(sb);
803 	else if (hwpstate_verbose)
804 		print_cppc_request(sb, data.init_request);
805 	if (hwp_has_error(data.res, HWP_ERROR_CPPC_REQUEST_WRITE))
806 		device_printf(dev, "CPU%u: Could not write into "
807 		    MSR_AMD_CPPC_REQUEST_NAME "!\n",
808 		    cpuid);
809 	else if (hwpstate_verbose) {
810 		sbuf_printf(sb, "CPU%u: Tweaked MSR values:\n", cpuid);
811 		print_cppc_request(sb, data.request);
812 	}
813 
814 	sbuf_finish(sb);
815 	sbuf_putbuf(sb);
816 	sbuf_delete(sb);
817 
818 	return (0);
819 }
820 
821 static int
822 hwpstate_probe(device_t dev)
823 {
824 	struct hwpstate_softc *sc;
825 	device_t perf_dev;
826 	uint64_t msr;
827 	int error, type;
828 
829 	sc = device_get_softc(dev);
830 
831 	if (hwpstate_amd_cppc_enable &&
832 	   (amd_extended_feature_extensions & AMDFEID_CPPC)) {
833 		sc->flags |= HWPFL_USE_CPPC;
834 		device_set_desc(dev,
835 		    "AMD Collaborative Processor Performance Control (CPPC)");
836 	} else {
837 		/*
838 		 * No CPPC support.  Only keep hwpstate0, it goes well with
839 		 * acpi_throttle.
840 		 */
841 		if (device_get_unit(dev) != 0)
842 			return (ENXIO);
843 		device_set_desc(dev, "Cool`n'Quiet 2.0");
844 	}
845 
846 	sc->dev = dev;
847 	if ((sc->flags & HWPFL_USE_CPPC) != 0)
848 		return (0);
849 
850 	/*
851 	 * Check if acpi_perf has INFO only flag.
852 	 */
853 	perf_dev = device_find_child(device_get_parent(dev), "acpi_perf",
854 	    DEVICE_UNIT_ANY);
855 	error = TRUE;
856 	if (perf_dev && device_is_attached(perf_dev)) {
857 		error = CPUFREQ_DRV_TYPE(perf_dev, &type);
858 		if (error == 0) {
859 			if ((type & CPUFREQ_FLAG_INFO_ONLY) == 0) {
860 				/*
861 				 * If acpi_perf doesn't have INFO_ONLY flag,
862 				 * it will take care of pstate transitions.
863 				 */
864 				HWPSTATE_DEBUG(dev, "acpi_perf will take care of pstate transitions.\n");
865 				return (ENXIO);
866 			} else {
867 				/*
868 				 * If acpi_perf has INFO_ONLY flag, (_PCT has FFixedHW)
869 				 * we can get _PSS info from acpi_perf
870 				 * without going into ACPI.
871 				 */
872 				HWPSTATE_DEBUG(dev, "going to fetch info from acpi_perf\n");
873 				error = hwpstate_get_info_from_acpi_perf(dev, perf_dev);
874 			}
875 		}
876 	}
877 
878 	if (error == 0) {
879 		/*
880 		 * Now we get _PSS info from acpi_perf without error.
881 		 * Let's check it.
882 		 */
883 		msr = rdmsr(MSR_AMD_10H_11H_LIMIT);
884 		if (sc->cfnum != 1 + AMD_10H_11H_GET_PSTATE_MAX_VAL(msr)) {
885 			HWPSTATE_DEBUG(dev, "MSR (%jd) and ACPI _PSS (%d)"
886 			    " count mismatch\n", (intmax_t)msr, sc->cfnum);
887 			error = TRUE;
888 		}
889 	}
890 
891 	/*
892 	 * If we cannot get info from acpi_perf,
893 	 * Let's get info from MSRs.
894 	 */
895 	if (error)
896 		error = hwpstate_get_info_from_msr(dev);
897 	if (error)
898 		return (error);
899 
900 	return (0);
901 }
902 
903 static int
904 hwpstate_attach(device_t dev)
905 {
906 	struct hwpstate_softc *sc;
907 	int res;
908 
909 	sc = device_get_softc(dev);
910 	if ((sc->flags & HWPFL_USE_CPPC) != 0) {
911 		if ((res = enable_cppc(sc)) != 0)
912 			return (res);
913 		SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
914 		    SYSCTL_STATIC_CHILDREN(_debug), OID_AUTO,
915 		    device_get_nameunit(dev),
916 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
917 		    sc, 0, sysctl_cppc_dump_handler, "A", "");
918 
919 		SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
920 		    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
921 		    "epp", CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
922 		    dev, AMD_CPPC_REQUEST_EPP_BITS,
923 		    sysctl_cppc_request_field_handler, "IU",
924 		    "Efficiency/Performance Preference (from 0, "
925 		    "most performant, to 255, most efficient)");
926 
927 		SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
928 		    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
929 		    "minimum_performance",
930 		    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
931 		    dev, AMD_CPPC_REQUEST_MIN_PERF_BITS,
932 		    sysctl_cppc_request_field_handler, "IU",
933 		    "Minimum allowed performance level (from 0 to 255; "
934 		    "should be smaller than 'maximum_performance'; "
935 		    "effective range limited by CPU)");
936 
937 		SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
938 		    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
939 		    "maximum_performance",
940 		    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
941 		    dev, AMD_CPPC_REQUEST_MAX_PERF_BITS,
942 		    sysctl_cppc_request_field_handler, "IU",
943 		    "Maximum allowed performance level (from 0 to 255; "
944 		    "should be larger than 'minimum_performance'; "
945 		    "effective range limited by CPU)");
946 
947 		SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
948 		    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
949 		    "desired_performance",
950 		    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
951 		    dev, AMD_CPPC_REQUEST_DES_PERF_BITS,
952 		    sysctl_cppc_request_field_handler, "IU",
953 		    "Desired performance level (from 0 to 255, "
954 		    "0 enables autonomous mode, otherwise value should be "
955 		    "between 'minimum_performance' and 'maximum_performance' "
956 		    "inclusive)");
957 	}
958 	return (cpufreq_register(dev));
959 }
960 
961 static int
962 hwpstate_get_info_from_msr(device_t dev)
963 {
964 	struct hwpstate_softc *sc;
965 	struct hwpstate_setting *hwpstate_set;
966 	uint64_t msr;
967 	int family, i, fid, did;
968 
969 	family = CPUID_TO_FAMILY(cpu_id);
970 	sc = device_get_softc(dev);
971 	/* Get pstate count */
972 	msr = rdmsr(MSR_AMD_10H_11H_LIMIT);
973 	sc->cfnum = 1 + AMD_10H_11H_GET_PSTATE_MAX_VAL(msr);
974 	hwpstate_set = sc->hwpstate_settings;
975 	for (i = 0; i < sc->cfnum; i++) {
976 		msr = rdmsr(MSR_AMD_10H_11H_CONFIG + i);
977 		if ((msr & ((uint64_t)1 << 63)) == 0) {
978 			HWPSTATE_DEBUG(dev, "msr is not valid.\n");
979 			return (ENXIO);
980 		}
981 		did = AMD_10H_11H_CUR_DID(msr);
982 		fid = AMD_10H_11H_CUR_FID(msr);
983 
984 		hwpstate_set[i].volts = CPUFREQ_VAL_UNKNOWN;
985 		hwpstate_set[i].power = CPUFREQ_VAL_UNKNOWN;
986 		hwpstate_set[i].lat = CPUFREQ_VAL_UNKNOWN;
987 		/* Convert fid/did to frequency. */
988 		switch (family) {
989 		case 0x11:
990 			hwpstate_set[i].freq = (100 * (fid + 0x08)) >> did;
991 			break;
992 		case 0x10:
993 		case 0x12:
994 		case 0x15:
995 		case 0x16:
996 			hwpstate_set[i].freq = (100 * (fid + 0x10)) >> did;
997 			break;
998 		case 0x17:
999 		case 0x18:
1000 		case 0x19:
1001 		case 0x1A:
1002 			/* calculate freq */
1003 			if (family == 0x1A) {
1004 				fid = AMD_1AH_CUR_FID(msr);
1005 				/* 1Ah CPU don't use a divisor */
1006 				hwpstate_set[i].freq = fid;
1007 				if (fid > 0x0f)
1008 					hwpstate_set[i].freq *= 5;
1009 				else {
1010 					HWPSTATE_DEBUG(dev,
1011 					    "unexpected fid: %d\n", fid);
1012 					return (ENXIO);
1013 				}
1014 			} else {
1015 				did = AMD_17H_CUR_DID(msr);
1016 				if (did == 0) {
1017 					HWPSTATE_DEBUG(dev,
1018 					    "unexpected did: 0\n");
1019 					did = 1;
1020 				}
1021 				fid = AMD_17H_CUR_FID(msr);
1022 				hwpstate_set[i].freq = (200 * fid) / did;
1023 			}
1024 
1025 			/* Vid step is 6.25mV, so scale by 100. */
1026 			hwpstate_set[i].volts =
1027 			    (155000 - (625 * AMD_17H_CUR_VID(msr))) / 100;
1028 			/*
1029 			 * Calculate current first.
1030 			 * This equation is mentioned in
1031 			 * "BKDG for AMD Family 15h Models 70h-7fh Processors",
1032 			 * section 2.5.2.1.6.
1033 			 */
1034 			hwpstate_set[i].power = AMD_17H_CUR_IDD(msr) * 1000;
1035 			hwpstate_set[i].power = hwpstate_amd_iscale(
1036 			    hwpstate_set[i].power, AMD_17H_CUR_IDIV(msr));
1037 			hwpstate_set[i].power *= hwpstate_set[i].volts;
1038 			/* Milli amps * milli volts to milli watts. */
1039 			hwpstate_set[i].power /= 1000;
1040 			break;
1041 		default:
1042 			HWPSTATE_DEBUG(dev, "get_info_from_msr: %s family"
1043 			    " 0x%02x CPUs are not supported yet\n",
1044 			    cpu_vendor_id == CPU_VENDOR_HYGON ? "Hygon" : "AMD",
1045 			    family);
1046 			return (ENXIO);
1047 		}
1048 		hwpstate_set[i].pstate_id = i;
1049 	}
1050 	return (0);
1051 }
1052 
1053 static int
1054 hwpstate_get_info_from_acpi_perf(device_t dev, device_t perf_dev)
1055 {
1056 	struct hwpstate_softc *sc;
1057 	struct cf_setting *perf_set;
1058 	struct hwpstate_setting *hwpstate_set;
1059 	int count, error, i;
1060 
1061 	perf_set = malloc(MAX_SETTINGS * sizeof(*perf_set), M_TEMP, M_NOWAIT);
1062 	if (perf_set == NULL) {
1063 		HWPSTATE_DEBUG(dev, "nomem\n");
1064 		return (ENOMEM);
1065 	}
1066 	/*
1067 	 * Fetch settings from acpi_perf.
1068 	 * Now it is attached, and has info only flag.
1069 	 */
1070 	count = MAX_SETTINGS;
1071 	error = CPUFREQ_DRV_SETTINGS(perf_dev, perf_set, &count);
1072 	if (error) {
1073 		HWPSTATE_DEBUG(dev, "error: CPUFREQ_DRV_SETTINGS.\n");
1074 		goto out;
1075 	}
1076 	sc = device_get_softc(dev);
1077 	sc->cfnum = count;
1078 	hwpstate_set = sc->hwpstate_settings;
1079 	for (i = 0; i < count; i++) {
1080 		if (i == perf_set[i].spec[0]) {
1081 			hwpstate_set[i].pstate_id = i;
1082 			hwpstate_set[i].freq = perf_set[i].freq;
1083 			hwpstate_set[i].volts = perf_set[i].volts;
1084 			hwpstate_set[i].power = perf_set[i].power;
1085 			hwpstate_set[i].lat = perf_set[i].lat;
1086 		} else {
1087 			HWPSTATE_DEBUG(dev, "ACPI _PSS object mismatch.\n");
1088 			error = ENXIO;
1089 			goto out;
1090 		}
1091 	}
1092 out:
1093 	if (perf_set)
1094 		free(perf_set, M_TEMP);
1095 	return (error);
1096 }
1097 
1098 static int
1099 hwpstate_detach(device_t dev)
1100 {
1101 	struct hwpstate_softc *sc;
1102 
1103 	sc = device_get_softc(dev);
1104 	if ((sc->flags & HWPFL_USE_CPPC) == 0)
1105 		hwpstate_goto_pstate(dev, 0);
1106 	return (cpufreq_unregister(dev));
1107 }
1108 
/*
 * Shutdown method: currently does nothing and always returns 0.  The switch
 * to P-state 0 is present but commented out.
 */
static int
hwpstate_shutdown(device_t dev)
{

	/* hwpstate_goto_pstate(dev, 0); */
	return (0);
}
1116 
/*
 * ACPI 'get_features' method: report ACPI_CAP_PERF_MSRS through 'features'.
 * Always returns 0; 'driver' is not used.
 */
static int
hwpstate_features(driver_t *driver, u_int *features)
{

	/* Notify the ACPI CPU that we support direct access to MSRs */
	*features = ACPI_CAP_PERF_MSRS;
	return (0);
}
1125