xref: /linux/arch/x86/kernel/cpu/mtrr/mtrr.c (revision bfb921b2a9d5d1123d1d10b196a39db629ddef87)
1 /*  Generic MTRR (Memory Type Range Register) driver.
2 
3     Copyright (C) 1997-2000  Richard Gooch
4     Copyright (c) 2002	     Patrick Mochel
5 
6     This library is free software; you can redistribute it and/or
7     modify it under the terms of the GNU Library General Public
8     License as published by the Free Software Foundation; either
9     version 2 of the License, or (at your option) any later version.
10 
11     This library is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14     Library General Public License for more details.
15 
16     You should have received a copy of the GNU Library General Public
17     License along with this library; if not, write to the Free
18     Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 
20     Richard Gooch may be reached by email at  rgooch@atnf.csiro.au
21     The postal address is:
22       Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
23 
24     Source: "Pentium Pro Family Developer's Manual, Volume 3:
25     Operating System Writer's Guide" (Intel document number 242692),
26     section 11.11.7
27 
28     This was cleaned and made readable by Patrick Mochel <mochel@osdl.org>
29     on 6-7 March 2002.
30     Source: Intel Architecture Software Developers Manual, Volume 3:
31     System Programming Guide; Section 9.11. (1997 edition - PPro).
32 */
33 
34 #include <linux/types.h> /* FIXME: kvm_para.h needs this */
35 
36 #include <linux/stop_machine.h>
37 #include <linux/kvm_para.h>
38 #include <linux/uaccess.h>
39 #include <linux/export.h>
40 #include <linux/mutex.h>
41 #include <linux/init.h>
42 #include <linux/sort.h>
43 #include <linux/cpu.h>
44 #include <linux/pci.h>
45 #include <linux/smp.h>
46 #include <linux/syscore_ops.h>
47 #include <linux/rcupdate.h>
48 
49 #include <asm/cacheinfo.h>
50 #include <asm/cpufeature.h>
51 #include <asm/e820/api.h>
52 #include <asm/mtrr.h>
53 #include <asm/msr.h>
54 #include <asm/memtype.h>
55 
56 #include "mtrr.h"
57 
58 /* arch_phys_wc_add() returns an MTRR register index plus this offset; arch_phys_wc_del() ignores handles below 1. */
59 #define MTRR_TO_PHYS_WC_OFFSET 1000
60 
61 u32 num_var_ranges;	/* number of variable-range MTRRs, set in mtrr_bp_init() */
62 
63 unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];	/* per-register use counts */
64 DEFINE_MUTEX(mtrr_mutex);	/* held by mtrr_add_page()/mtrr_del_page() around register and use-count updates */
65 
66 const struct mtrr_ops *mtrr_if;	/* active MTRR backend; reset to NULL in mtrr_bp_init() if get_mtrr_state() fails */
67 
68 /*  Returns non-zero if we have the write-combining memory type  */
69 static int have_wrcomb(void)
70 {
71 	struct pci_dev *dev;
72 
73 	dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
74 	if (dev != NULL) {
75 		/*
76 		 * ServerWorks LE chipsets < rev 6 have problems with
77 		 * write-combining. Don't allow it and leave room for other
78 		 * chipsets to be tagged
79 		 */
80 		if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
81 		    dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
82 		    dev->revision <= 5) {
83 			pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n");
84 			pci_dev_put(dev);
85 			return 0;
86 		}
87 		/*
88 		 * Intel 450NX errata # 23. Non ascending cacheline evictions to
89 		 * write combining memory may resulting in data corruption
90 		 */
91 		if (dev->vendor == PCI_VENDOR_ID_INTEL &&
92 		    dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
93 			pr_info("Intel 450NX MMC detected. Write-combining disabled.\n");
94 			pci_dev_put(dev);
95 			return 0;
96 		}
97 		pci_dev_put(dev);
98 	}
99 	return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
100 }
101 
102 static void __init init_table(void)
103 {
104 	int i, max;
105 
106 	max = num_var_ranges;
107 	for (i = 0; i < max; i++)
108 		mtrr_usage_table[i] = 1;
109 }
110 
/* Arguments handed from set_mtrr() to mtrr_rendezvous_handler() on every CPU. */
111 struct set_mtrr_data {
112 	unsigned long	smp_base;	/* base passed to mtrr_if->set() */
113 	unsigned long	smp_size;	/* size passed to mtrr_if->set() */
114 	unsigned int	smp_reg;	/* register index to program */
115 	mtrr_type	smp_type;	/* memory type to program */
116 };
117 
118 /**
119  * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
120  * by all the CPUs.
121  * @info: pointer to mtrr configuration data
122  *
123  * Returns nothing.
124  */
125 static int mtrr_rendezvous_handler(void *info)
126 {
127 	struct set_mtrr_data *data = info;
128 
129 	mtrr_if->set(data->smp_reg, data->smp_base,
130 		     data->smp_size, data->smp_type);
131 	return 0;
132 }
133 
134 static inline int types_compatible(mtrr_type type1, mtrr_type type2)
135 {
136 	return type1 == MTRR_TYPE_UNCACHABLE ||
137 	       type2 == MTRR_TYPE_UNCACHABLE ||
138 	       (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
139 	       (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
140 }
141 
142 /**
143  * set_mtrr - update mtrrs on all processors
144  * @reg:	mtrr in question
145  * @base:	mtrr base
146  * @size:	mtrr size
147  * @type:	mtrr type
148  *
149  * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
150  *
151  * 1. Queue work to do the following on all processors:
152  * 2. Disable Interrupts
153  * 3. Wait for all procs to do so
154  * 4. Enter no-fill cache mode
155  * 5. Flush caches
156  * 6. Clear PGE bit
157  * 7. Flush all TLBs
158  * 8. Disable all range registers
159  * 9. Update the MTRRs
160  * 10. Enable all range registers
161  * 11. Flush all TLBs and caches again
162  * 12. Enter normal cache mode and reenable caching
163  * 13. Set PGE
164  * 14. Wait for buddies to catch up
165  * 15. Enable interrupts.
166  *
167  * What does that mean for us? Well, stop_machine() will ensure that
168  * the rendezvous handler is started on each CPU. And in lockstep they
169  * do the state transition of disabling interrupts, updating MTRR's
170  * (the CPU vendors may each do it differently, so we call mtrr_if->set()
171  * callback and let them take care of it.) and enabling interrupts.
172  *
173  * Note that the mechanism is the same for UP systems, too; all the SMP stuff
174  * becomes nops.
175  */
176 static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size,
177 		     mtrr_type type)
178 {
179 	struct set_mtrr_data data = { .smp_reg = reg,
180 				      .smp_base = base,
181 				      .smp_size = size,
182 				      .smp_type = type
183 				    };
184 
185 	stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
186 
187 	generic_rebuild_map();
188 }
189 
190 /**
191  * mtrr_add_page - Add a memory type region
192  * @base: Physical base address of region in pages (in units of 4 kB!)
193  * @size: Physical size of region in pages (4 kB)
194  * @type: Type of MTRR desired
195  * @increment: If this is true do usage counting on the region
196  *
197  * Memory type region registers control the caching on newer Intel and
198  * non Intel processors. This function allows drivers to request that an
199  * MTRR be added. The details and hardware specifics of each processor's
200  * implementation are hidden from the caller, but nevertheless the
201  * caller should expect to need to provide a power of two size on an
202  * equivalent power of two boundary.
203  *
204  * If the region cannot be added either because all regions are in use
205  * or the CPU cannot support it a negative value is returned. On success
206  * the register number for this entry is returned, but should be treated
207  * as a cookie only.
208  *
209  * On a multiprocessor machine the changes are made to all processors.
210  * This is required on x86 by the Intel processors.
211  *
212  * The available types are
213  *
214  * %MTRR_TYPE_UNCACHABLE - No caching
215  *
216  * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
217  *
218  * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
219  *
220  * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
221  *
 * Return: the register index on success. Errors: -ENXIO if MTRRs are not
 * enabled; whatever the backend's validate_add_page() hook reports;
 * -EINVAL for an out-of-range type, a zero size, a range beyond the
 * physical address width, a partial overlap or an incompatible type
 * clash with an existing region; -ENOSYS if write-combining was asked
 * for but is not available; the negative value from get_free_region()
 * when no register could be found.
 *
222  * BUGS: Needs a quiet flag for the cases where drivers do not mind
223  * failures and do not wish system log messages to be sent.
224  */
225 int mtrr_add_page(unsigned long base, unsigned long size,
226 		  unsigned int type, bool increment)
227 {
228 	unsigned long lbase, lsize;
229 	int i, replace, error;
230 	mtrr_type ltype;
231 
232 	if (!mtrr_enabled())
233 		return -ENXIO;
234 
	/* Backend-specific validation runs before the generic checks below. */
235 	error = mtrr_if->validate_add_page(base, size, type);
236 	if (error)
237 		return error;
238 
239 	if (type >= MTRR_NUM_TYPES) {
240 		pr_warn("type: %u invalid\n", type);
241 		return -EINVAL;
242 	}
243 
244 	/* If the type is WC, check that this processor supports it */
245 	if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
246 		pr_warn("your processor doesn't support write-combining\n");
247 		return -ENOSYS;
248 	}
249 
250 	if (!size) {
251 		pr_warn("zero sized request\n");
252 		return -EINVAL;
253 	}
254 
	/* Neither the first nor the last page number may have bits above the physical address width. */
255 	if ((base | (base + size - 1)) >>
256 	    (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
257 		pr_warn("base or size exceeds the MTRR width\n");
258 		return -EINVAL;
259 	}
260 
261 	error = -EINVAL;
262 	replace = -1;	/* -1: nothing to replace yet */
263 
264 	/* No CPU hotplug when we change MTRR entries */
265 	cpus_read_lock();
266 
267 	/* Search for an existing MTRR that overlaps the request */
268 	mutex_lock(&mtrr_mutex);
269 	for (i = 0; i < num_var_ranges; ++i) {
270 		mtrr_if->get(i, &lbase, &lsize, &ltype);
		/* Skip unused registers and registers that do not intersect the request. */
271 		if (!lsize || base > lbase + lsize - 1 ||
272 		    base + size - 1 < lbase)
273 			continue;
274 		/*
275 		 * At this point we know there is some kind of
276 		 * overlap/enclosure
277 		 */
278 		if (base < lbase || base + size - 1 > lbase + lsize - 1) {
279 			if (base <= lbase &&
280 			    base + size - 1 >= lbase + lsize - 1) {
281 				/*  New region encloses an existing region  */
282 				if (type == ltype) {
					/* Remember the first same-type match; -2 once there is more than one. */
283 					replace = replace == -1 ? i : -2;
284 					continue;
285 				} else if (types_compatible(type, ltype))
286 					continue;
287 			}
			/* Partial overlap, or enclosure of an incompatible type: refuse. */
288 			pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase,
289 				lsize);
290 			goto out;
291 		}
292 		/* New region is enclosed by an existing region */
293 		if (ltype != type) {
294 			if (types_compatible(type, ltype))
295 				continue;
296 			pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n",
297 				base, size, mtrr_attrib_to_str(ltype),
298 				mtrr_attrib_to_str(type));
299 			goto out;
300 		}
		/* Same type already covers the request: reuse that register. */
301 		if (increment)
302 			++mtrr_usage_table[i];
303 		error = i;
304 		goto out;
305 	}
306 	/* Search for an empty MTRR */
307 	i = mtrr_if->get_free_region(base, size, replace);
308 	if (i >= 0) {
309 		set_mtrr(i, base, size, type);
310 		if (likely(replace < 0)) {
			/* No single register is being replaced: start counting at 1. */
311 			mtrr_usage_table[i] = 1;
312 		} else {
			/* Carry over the use count of the register being replaced. */
313 			mtrr_usage_table[i] = mtrr_usage_table[replace];
314 			if (increment)
315 				mtrr_usage_table[i]++;
316 			if (unlikely(replace != i)) {
				/* A different register was picked: clear the old one. */
317 				set_mtrr(replace, 0, 0, 0);
318 				mtrr_usage_table[replace] = 0;
319 			}
320 		}
321 	} else {
322 		pr_info("no more MTRRs available\n");
323 	}
324 	error = i;
325  out:
326 	mutex_unlock(&mtrr_mutex);
327 	cpus_read_unlock();
328 	return error;
329 }
330 
331 static int mtrr_check(unsigned long base, unsigned long size)
332 {
333 	if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
334 		pr_warn("size and base must be multiples of 4 kiB\n");
335 		Dprintk("size: 0x%lx  base: 0x%lx\n", size, base);
336 		dump_stack();
337 		return -1;
338 	}
339 	return 0;
340 }
341 
342 /**
343  * mtrr_add - Add a memory type region
344  * @base: Physical base address of region
345  * @size: Physical size of region
346  * @type: Type of MTRR desired
347  * @increment: If this is true do usage counting on the region
348  *
349  * Memory type region registers control the caching on newer Intel and
350  * non Intel processors. This function allows drivers to request an
351  * MTRR is added. The details and hardware specifics of each processor's
352  * implementation are hidden from the caller, but nevertheless the
353  * caller should expect to need to provide a power of two size on an
354  * equivalent power of two boundary.
355  *
356  * If the region cannot be added either because all regions are in use
357  * or the CPU cannot support it a negative value is returned. On success
358  * the register number for this entry is returned, but should be treated
359  * as a cookie only.
360  *
361  * On a multiprocessor machine the changes are made to all processors.
362  * This is required on x86 by the Intel processors.
363  *
364  * The available types are
365  *
366  * %MTRR_TYPE_UNCACHABLE - No caching
367  *
368  * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
369  *
370  * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
371  *
372  * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
373  *
374  * BUGS: Needs a quiet flag for the cases where drivers do not mind
375  * failures and do not wish system log messages to be sent.
376  */
377 int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
378 	     bool increment)
379 {
380 	if (!mtrr_enabled())
381 		return -ENODEV;
382 	if (mtrr_check(base, size))
383 		return -EINVAL;
384 	return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
385 			     increment);
386 }
387 
388 /**
389  * mtrr_del_page - delete a memory type region
390  * @reg: Register returned by mtrr_add
391  * @base: Physical base address
392  * @size: Size of region
393  *
394  * If register is supplied then base and size are ignored. This is
395  * how drivers should call it.
396  *
397  * Releases an MTRR region. If the usage count drops to zero the
398  * register is freed and the region returns to default state.
399  * On success the register is returned, on failure a negative error
400  * code.
401  */
402 int mtrr_del_page(int reg, unsigned long base, unsigned long size)
403 {
404 	int i, max;
405 	mtrr_type ltype;
406 	unsigned long lbase, lsize;
407 	int error = -EINVAL;
408 
409 	if (!mtrr_enabled())
410 		return -ENODEV;
411 
412 	max = num_var_ranges;
413 	/* No CPU hotplug when we change MTRR entries */
414 	cpus_read_lock();
415 	mutex_lock(&mtrr_mutex);
416 	if (reg < 0) {
417 		/*  Search for existing MTRR  */
418 		for (i = 0; i < max; ++i) {
419 			mtrr_if->get(i, &lbase, &lsize, &ltype);
420 			if (lbase == base && lsize == size) {
421 				reg = i;
422 				break;
423 			}
424 		}
425 		if (reg < 0) {
426 			Dprintk("no MTRR for %lx000,%lx000 found\n", base, size);
427 			goto out;
428 		}
429 	}
430 	if (reg >= max) {
431 		pr_warn("register: %d too big\n", reg);
432 		goto out;
433 	}
434 	mtrr_if->get(reg, &lbase, &lsize, &ltype);
435 	if (lsize < 1) {
436 		pr_warn("MTRR %d not used\n", reg);
437 		goto out;
438 	}
439 	if (mtrr_usage_table[reg] < 1) {
440 		pr_warn("reg: %d has count=0\n", reg);
441 		goto out;
442 	}
443 	if (--mtrr_usage_table[reg] < 1)
444 		set_mtrr(reg, 0, 0, 0);
445 	error = reg;
446  out:
447 	mutex_unlock(&mtrr_mutex);
448 	cpus_read_unlock();
449 	return error;
450 }
451 
452 /**
453  * mtrr_del - delete a memory type region
454  * @reg: Register returned by mtrr_add
455  * @base: Physical base address
456  * @size: Size of region
457  *
458  * If register is supplied then base and size are ignored. This is
459  * how drivers should call it.
460  *
461  * Releases an MTRR region. If the usage count drops to zero the
462  * register is freed and the region returns to default state.
463  * On success the register is returned, on failure a negative error
464  * code.
465  */
466 int mtrr_del(int reg, unsigned long base, unsigned long size)
467 {
468 	if (!mtrr_enabled())
469 		return -ENODEV;
470 	if (mtrr_check(base, size))
471 		return -EINVAL;
472 	return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
473 }
474 
475 /**
476  * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
477  * @base: Physical base address
478  * @size: Size of region
479  *
480  * If PAT is available, this does nothing.  If PAT is unavailable, it
481  * attempts to add a WC MTRR covering size bytes starting at base and
482  * logs an error if this fails.
483  *
484  * The called should provide a power of two size on an equivalent
485  * power of two boundary.
486  *
487  * Drivers must store the return value to pass to mtrr_del_wc_if_needed,
488  * but drivers should not try to interpret that return value.
489  */
490 int arch_phys_wc_add(unsigned long base, unsigned long size)
491 {
492 	int ret;
493 
494 	if (pat_enabled() || !mtrr_enabled())
495 		return 0;  /* Success!  (We don't need to do anything.) */
496 
497 	ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
498 	if (ret < 0) {
499 		pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.",
500 			(void *)base, (void *)(base + size - 1));
501 		return ret;
502 	}
503 	return ret + MTRR_TO_PHYS_WC_OFFSET;
504 }
505 EXPORT_SYMBOL(arch_phys_wc_add);
506 
507 /*
508  * arch_phys_wc_del - undoes arch_phys_wc_add
509  * @handle: Return value from arch_phys_wc_add
510  *
511  * This cleans up after mtrr_add_wc_if_needed.
512  *
513  * The API guarantees that mtrr_del_wc_if_needed(error code) and
514  * mtrr_del_wc_if_needed(0) do nothing.
515  */
516 void arch_phys_wc_del(int handle)
517 {
518 	if (handle >= 1) {
519 		WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET);
520 		mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0);
521 	}
522 }
523 EXPORT_SYMBOL(arch_phys_wc_del);
524 
525 /*
526  * arch_phys_wc_index - translates arch_phys_wc_add's return value
527  * @handle: Return value from arch_phys_wc_add
528  *
529  * This will turn the return value from arch_phys_wc_add into an mtrr
530  * index suitable for debugging.
531  *
532  * Note: There is no legitimate use for this function, except possibly
533  * in printk line.  Alas there is an illegitimate use in some ancient
534  * drm ioctls.
535  */
536 int arch_phys_wc_index(int handle)
537 {
538 	if (handle < MTRR_TO_PHYS_WC_OFFSET)
539 		return -1;
540 	else
541 		return handle - MTRR_TO_PHYS_WC_OFFSET;
542 }
543 EXPORT_SYMBOL_GPL(arch_phys_wc_index);
544 
/* Result of mtrr_cleanup(), stored by mtrr_bp_init(); when non-zero, mtrr_init_finalize() skips mtrr_state_warn(). */
545 int __initdata changed_by_mtrr_cleanup;
546 
547 /**
548  * mtrr_bp_init - initialize MTRRs on the boot CPU
549  *
550  * This needs to be called early; before any of the other CPUs are
551  * initialized (i.e. before smp_init()).
552  */
553 void __init mtrr_bp_init(void)
554 {
555 	bool generic_mtrrs = cpu_feature_enabled(X86_FEATURE_MTRR);
556 	const char *why = "(not available)";
557 	unsigned long config, dummy;
558 
559 	phys_hi_rsvd = GENMASK(31, boot_cpu_data.x86_phys_bits - 32);
560 
561 	if (!generic_mtrrs && mtrr_state.enabled) {
562 		/*
563 		 * Software overwrite of MTRR state, only for generic case.
564 		 * Note that X86_FEATURE_MTRR has been reset in this case.
565 		 */
566 		init_table();
567 		mtrr_build_map();
568 		pr_info("MTRRs set to read-only\n");
569 
570 		return;
571 	}
572 
573 	if (generic_mtrrs)
574 		mtrr_if = &generic_mtrr_ops;
575 	else
576 		mtrr_set_if();
577 
578 	if (mtrr_enabled()) {
579 		/* Get the number of variable MTRR ranges. */
580 		if (mtrr_if == &generic_mtrr_ops)
581 			rdmsr(MSR_MTRRcap, config, dummy);
582 		else
583 			config = mtrr_if->var_regs;
584 		num_var_ranges = config & MTRR_CAP_VCNT;
585 
586 		init_table();
587 		if (mtrr_if == &generic_mtrr_ops) {
588 			/* BIOS may override */
589 			if (get_mtrr_state()) {
590 				memory_caching_control |= CACHE_MTRR;
591 				changed_by_mtrr_cleanup = mtrr_cleanup();
592 				mtrr_build_map();
593 			} else {
594 				mtrr_if = NULL;
595 				why = "by BIOS";
596 			}
597 		}
598 	}
599 
600 	if (!mtrr_enabled())
601 		pr_info("MTRRs disabled %s\n", why);
602 }
603 
604 /**
605  * mtrr_save_state - Save current fixed-range MTRR state of the first
606  *	cpu in cpu_online_mask.
607  */
608 void mtrr_save_state(void)
609 {
610 	int first_cpu;
611 
612 	if (!mtrr_enabled())
613 		return;
614 
615 	first_cpu = cpumask_first(cpu_online_mask);
616 	smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
617 }
618 
619 static int __init mtrr_init_finalize(void)
620 {
621 	/*
622 	 * Map might exist if mtrr_overwrite_state() has been called or if
623 	 * mtrr_enabled() returns true.
624 	 */
625 	mtrr_copy_map();
626 
627 	if (!mtrr_enabled())
628 		return 0;
629 
630 	if (memory_caching_control & CACHE_MTRR) {
631 		if (!changed_by_mtrr_cleanup)
632 			mtrr_state_warn();
633 		return 0;
634 	}
635 
636 	mtrr_register_syscore();
637 
638 	return 0;
639 }
640 subsys_initcall(mtrr_init_finalize);
641