/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
#include "vdso.h"

/* These are code, but not functions.  Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
#ifdef CONFIG_X86_64
extern asmlinkage void nmi(void);
#endif
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;

/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)
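
/*
 * For example, with a 1 GiB initial allocation (262144 pages of 4 KiB,
 * all below MAXMEM), the clamp in xen_memory_setup() limits the extra
 * space to EXTRA_MEM_RATIO * 262144 pages, i.e. at most 10 GiB of extra
 * memory on top of the 1 GiB base.
 */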

static void __init xen_add_extra_mem(u64 start, u64 size)
{
	unsigned long pfn;
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		/* Add new region. */
		if (xen_extra_mem[i].size == 0) {
			xen_extra_mem[i].start = start;
			xen_extra_mem[i].size = size;
			break;
		}
		/* Append to existing region. */
		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
			xen_extra_mem[i].size += size;
			break;
		}
	}
	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
		printk(KERN_WARNING "Warning: not enough extra memory regions\n");

	memblock_reserve(start, size);

	xen_max_p2m_pfn = PFN_DOWN(start + size);
	for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
			continue;
		WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
			  pfn, mfn);

		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
	}
}

static unsigned long __init xen_do_chunk(unsigned long start,
					 unsigned long end, bool release)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};
	unsigned long len = 0;
	unsigned long pfn;
	int ret;

	for (pfn = start; pfn < end; pfn++) {
		unsigned long frame;
		unsigned long mfn = pfn_to_mfn(pfn);

		if (release) {
			/* Make sure pfn exists to start with */
			if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
				continue;
			frame = mfn;
		} else {
			if (mfn != INVALID_P2M_ENTRY)
				continue;
			frame = pfn;
		}
		set_xen_guest_handle(reservation.extent_start, &frame);
		reservation.nr_extents = 1;

		ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
					   &reservation);
		WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
		     release ? "release" : "populate", pfn, ret);

		if (ret == 1) {
			if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
				if (release)
					break;
				set_xen_guest_handle(reservation.extent_start, &frame);
				reservation.nr_extents = 1;
				ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
							   &reservation);
				break;
			}
			len++;
		} else
			break;
	}
	if (len)
		printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
		       release ? "Freeing" : "Populating",
		       start, end, len,
		       release ? "freed" : "added");

	return len;
}

static unsigned long __init xen_release_chunk(unsigned long start,
					      unsigned long end)
{
	return xen_do_chunk(start, end, true);
}
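
/*
 * Walk the provided e820 map and populate RAM regions that lie above
 * the initial allocation boundary (max_pfn) with frames released
 * earlier, spending at most credits_left pages.  *last_pfn is set one
 * past the highest PFN populated; returns the number of pages actually
 * populated.
 */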
static unsigned long __init xen_populate_chunk(
	const struct e820entry *list, size_t map_size,
	unsigned long max_pfn, unsigned long *last_pfn,
	unsigned long credits_left)
{
	const struct e820entry *entry;
	unsigned int i;
	unsigned long done = 0;
	unsigned long dest_pfn;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		unsigned long s_pfn;
		unsigned long e_pfn;
		unsigned long pfns;
		long capacity;

		if (credits_left <= 0)
			break;

		if (entry->type != E820_RAM)
			continue;

		e_pfn = PFN_DOWN(entry->addr + entry->size);

		/* We only care about E820 after the xen_start_info->nr_pages */
		if (e_pfn <= max_pfn)
			continue;

		s_pfn = PFN_UP(entry->addr);
		/*
		 * If the E820 falls within the nr_pages, we want to start
		 * at the nr_pages PFN.
		 * If that would mean going past the E820 entry, skip it.
		 */
		if (s_pfn <= max_pfn) {
			capacity = e_pfn - max_pfn;
			dest_pfn = max_pfn;
		} else {
			capacity = e_pfn - s_pfn;
			dest_pfn = s_pfn;
		}

		if (credits_left < capacity)
			capacity = credits_left;

		pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
		done += pfns;
		*last_pfn = (dest_pfn + pfns);
		if (pfns < capacity)
			break;
		credits_left -= pfns;
	}
	return done;
}

static void __init xen_set_identity_and_release_chunk(
	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
	unsigned long *released, unsigned long *identity)
{
	unsigned long pfn;

	/*
	 * If the PFNs are currently mapped, clear the mappings
	 * (except for the ISA region which must be 1:1 mapped) to
	 * release the refcounts (in Xen) on the original frames.
	 */
	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
		pte_t pte = __pte_ma(0);

		if (pfn < PFN_UP(ISA_END_ADDRESS))
			pte = mfn_pte(pfn, PAGE_KERNEL_IO);

		(void)HYPERVISOR_update_va_mapping(
			(unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
	}

	if (start_pfn < nr_pages)
		*released += xen_release_chunk(
			start_pfn, min(end_pfn, nr_pages));

	*identity += set_phys_range_identity(start_pfn, end_pfn);
}

static unsigned long __init xen_set_identity_and_release(
	const struct e820entry *list, size_t map_size, unsigned long nr_pages)
{
	phys_addr_t start = 0;
	unsigned long released = 0;
	unsigned long identity = 0;
	const struct e820entry *entry;
	int i;

	/*
	 * Combine non-RAM regions and gaps until a RAM region (or the
	 * end of the map) is reached, then set the 1:1 map and
	 * release the pages (if available) in those non-RAM regions.
	 *
	 * The combined non-RAM regions are rounded to a whole number
	 * of pages so any partial pages are accessible via the 1:1
	 * mapping.  This is needed for some BIOSes that put (for
	 * example) the DMI tables in a reserved region that begins on
	 * a non-page boundary.
	 */
	for (i = 0, entry = list; i < map_size; i++, entry++) {
		phys_addr_t end = entry->addr + entry->size;
		if (entry->type == E820_RAM || i == map_size - 1) {
			unsigned long start_pfn = PFN_DOWN(start);
			unsigned long end_pfn = PFN_UP(end);

			if (entry->type == E820_RAM)
				end_pfn = PFN_UP(entry->addr);

			if (start_pfn < end_pfn)
				xen_set_identity_and_release_chunk(
					start_pfn, end_pfn, nr_pages,
					&released, &identity);

			start = end;
		}
	}

	if (released)
		printk(KERN_INFO "Released %lu pages of unused memory\n", released);
	if (identity)
		printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);

	return released;
}

static unsigned long __init xen_get_max_pages(void)
{
	unsigned long max_pages = MAX_DOMAIN_PAGES;
	domid_t domid = DOMID_SELF;
	int ret;

	/*
	 * For the initial domain we use the maximum reservation as
	 * the maximum page.
	 *
	 * For guest domains the current maximum reservation reflects
	 * the current maximum rather than the static maximum.  In this
	 * case the e820 map provided to us will cover the static
	 * maximum region.
	 */
	if (xen_initial_domain()) {
		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
		if (ret > 0)
			max_pages = ret;
	}

	return min(max_pages, MAX_DOMAIN_PAGES);
}
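
/*
 * Trim RAM regions to whole pages before handing them to e820: the
 * start is rounded up and the end rounded down.  For example, a RAM
 * entry covering 0x1234-0x5800 is added as 0x2000-0x5000 (with 4 KiB
 * pages); non-RAM types are added unmodified.
 */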
static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
{
	u64 end = start + size;

	/* Align RAM regions to page boundaries. */
	if (type == E820_RAM) {
		start = PAGE_ALIGN(start);
		end &= ~((u64)PAGE_SIZE - 1);
	}

	e820_add_region(start, end - start, type);
}

void xen_ignore_unusable(struct e820entry *list, size_t map_size)
{
	struct e820entry *entry;
	unsigned int i;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		if (entry->type == E820_UNUSABLE)
			entry->type = E820_RAM;
	}
}

/**
 * xen_memory_setup - Hook for machine specific memory setup.
 *
 * Builds the kernel's e820 map from the map provided by the hypervisor:
 * non-RAM ranges and gaps are identity-mapped (releasing any RAM pages
 * backing them), released pages are re-populated into RAM regions above
 * the initial allocation, and the extra memory is clamped to
 * EXTRA_MEM_RATIO times the base size.
 **/
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	unsigned long long mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long last_pfn = 0;
	unsigned long extra_pages = 0;
	unsigned long populated;
	int i;
	int op;

	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);

	/*
	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
	 * regions, so if we're using the machine memory map leave the
	 * region as RAM as it is in the pseudo-physical map.
	 *
	 * UNUSABLE regions in domUs are not handled and will need
	 * a patch in the future.
	 */
	if (xen_initial_domain())
		xen_ignore_unusable(map, memmap.nr_entries);

	/* Make sure the Xen-supplied memory map is well-ordered. */
	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);

	max_pages = xen_get_max_pages();
	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Set P2M for all non-RAM pages and E820 gaps to be identity
	 * type PFNs.  Any RAM pages that would be made inaccessible by
	 * this are first released.
	 */
	xen_released_pages = xen_set_identity_and_release(
		map, memmap.nr_entries, max_pfn);

	/*
	 * Populate back the non-RAM pages and E820 gaps that had been
	 * released.
	 */
	populated = xen_populate_chunk(map, memmap.nr_entries,
				       max_pfn, &last_pfn, xen_released_pages);

	xen_released_pages -= populated;
	extra_pages += xen_released_pages;

	if (last_pfn > max_pfn) {
		max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
		mem_end = PFN_PHYS(max_pfn);
	}
	/*
	 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
	 * factor of the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  extra_pages);
	i = 0;
	while (i < memmap.nr_entries) {
		u64 addr = map[i].addr;
		u64 size = map[i].size;
		u32 type = map[i].type;

		if (type == E820_RAM) {
			if (addr < mem_end) {
				size = min(size, mem_end - addr);
			} else if (extra_pages) {
				size = min(size, (u64)extra_pages * PAGE_SIZE);
				extra_pages -= size / PAGE_SIZE;
				xen_add_extra_mem(addr, size);
			} else
				type = E820_UNUSABLE;
		}

		xen_align_and_add_e820_region(addr, size, type);

		map[i].addr += size;
		map[i].size -= size;
		if (map[i].size == 0)
			i++;
	}

	/*
	 * Set the rest as identity mapped, in case PCI BARs are
	 * located here.
	 *
	 * PFNs above MAX_P2M_PFN are considered identity mapped as
	 * well.
	 */
	set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 * We tried to make the memblock_reserve more selective so
	 * that it would be clear what region is reserved.  Sadly we ran
	 * into the problem wherein on a 64-bit hypervisor with a 32-bit
	 * initial domain, the pt_base has the cr3 value which is not
	 * necessarily where the pagetable starts!  As Jan put it: "
	 * Actually, the adjustment turns out to be correct: The page
	 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
	 * "first L2", "first L3", so the offset to the page table base is
	 * indeed 2.  When reading xen/include/public/xen.h's comment
	 * very strictly, this is not a violation (since there nothing is said
	 * that the first thing in the page table space is pointed to by
	 * pt_base; I admit that this seems to be implied though, namely
	 * do I think that it is implied that the page table space is the
	 * range [pt_base, pt_base + nr_pt_frames), whereas that
	 * range here indeed is [pt_base - 2, pt_base - 2 + nr_pt_frames),
	 * which - without a priori knowledge - the kernel would have
	 * difficulty to figure out)." - so let's just fall back to the
	 * easy way and reserve the whole region.
	 */
	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	return "Xen";
}

/*
 * Machine specific memory setup for auto-translated guests.
 */
char * __init xen_auto_xlated_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	struct xen_memory_map memmap;
	int i;
	int rc;

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
	if (rc < 0)
		panic("No memory map (%d)\n", rc);

	sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);

	for (i = 0; i < memmap.nr_entries; i++)
		e820_add_region(map[i].addr, map[i].size, map[i].type);

	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	return "Xen";
}

/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	/*
	 * This could be called before selected_vdso32 is initialized, so
	 * just fiddle with both possible images.  vdso_image_32_syscall
	 * can't be selected, since it only exists on 64-bit systems.
	 */
	u32 *mask;
	mask = vdso_image_32_int80.data +
		vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = vdso_image_32_sysenter.data +
		vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
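
/*
 * Register an entry point with the hypervisor for the given callback
 * type.  CALLBACKF_mask_events requests that event delivery be masked
 * when the callback is invoked.
 */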
static int register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

void xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}

void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}

void __init xen_pvmmu_arch_setup(void)
{
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	HYPERVISOR_vm_assist(VMASST_CMD_enable,
			     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();
}

/* This function is not called for HVM domains */
void __init xen_arch_setup(void)
{
	xen_panic_handler_init();
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_pvmmu_arch_setup();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
	disable_cpuidle();
	disable_cpufreq();
	WARN_ON(xen_set_default_idle());
	fiddle_vdso();
#ifdef CONFIG_NUMA
	numa_off = 1;
#endif
}