/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
#include "vdso.h"

/* These are code, but not functions.  Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;

/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)

static void __init xen_add_extra_mem(u64 start, u64 size)
{
	unsigned long pfn;
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		/* Add new region. */
		if (xen_extra_mem[i].size == 0) {
			xen_extra_mem[i].start = start;
			xen_extra_mem[i].size = size;
			break;
		}
		/* Append to existing region. */
		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
			xen_extra_mem[i].size += size;
			break;
		}
	}
	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
		printk(KERN_WARNING "Warning: not enough extra memory regions\n");

	memblock_reserve(start, size);

	xen_max_p2m_pfn = PFN_DOWN(start + size);

	for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}

static unsigned long __init xen_do_chunk(unsigned long start,
					 unsigned long end, bool release)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid = DOMID_SELF
	};
	unsigned long len = 0;
	unsigned long pfn;
	int ret;

	for (pfn = start; pfn < end; pfn++) {
		unsigned long frame;
		unsigned long mfn = pfn_to_mfn(pfn);

		if (release) {
			/* Make sure pfn exists to start with */
			if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
				continue;
			frame = mfn;
		} else {
			if (mfn != INVALID_P2M_ENTRY)
				continue;
			frame = pfn;
		}
		set_xen_guest_handle(reservation.extent_start, &frame);
		reservation.nr_extents = 1;

		ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
					   &reservation);
		WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
		     release ? "release" : "populate", pfn, ret);

		if (ret == 1) {
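			/*
			 * Update the p2m with the result.  If that
			 * fails while populating, hand the freshly
			 * allocated frame back to the hypervisor so
			 * the p2m and the reservation stay consistent.
			 */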
			if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
				if (release)
					break;
				set_xen_guest_handle(reservation.extent_start, &frame);
				reservation.nr_extents = 1;
				ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
							   &reservation);
				break;
			}
			len++;
		} else
			break;
	}
	if (len)
		printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
		       release ? "Freeing" : "Populating",
		       start, end, len,
		       release ? "freed" : "added");

	return len;
}

static unsigned long __init xen_release_chunk(unsigned long start,
					      unsigned long end)
{
	return xen_do_chunk(start, end, true);
}

static unsigned long __init xen_populate_chunk(
	const struct e820entry *list, size_t map_size,
	unsigned long max_pfn, unsigned long *last_pfn,
	unsigned long credits_left)
{
	const struct e820entry *entry;
	unsigned int i;
	unsigned long done = 0;
	unsigned long dest_pfn;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		unsigned long s_pfn;
		unsigned long e_pfn;
		unsigned long pfns;
		long capacity;

		if (credits_left <= 0)
			break;

		if (entry->type != E820_RAM)
			continue;

		e_pfn = PFN_DOWN(entry->addr + entry->size);

		/* We only care about E820 RAM above xen_start_info->nr_pages */
		if (e_pfn <= max_pfn)
			continue;

		s_pfn = PFN_UP(entry->addr);
		/*
		 * If the E820 entry starts below nr_pages, start
		 * populating at the nr_pages PFN.  Entries that end
		 * at or below nr_pages were already skipped above.
		 */
		if (s_pfn <= max_pfn) {
			capacity = e_pfn - max_pfn;
			dest_pfn = max_pfn;
		} else {
			capacity = e_pfn - s_pfn;
			dest_pfn = s_pfn;
		}

		if (credits_left < capacity)
			capacity = credits_left;

		pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
		done += pfns;
		*last_pfn = (dest_pfn + pfns);
		if (pfns < capacity)
			break;
		credits_left -= pfns;
	}
	return done;
}

static void __init xen_set_identity_and_release_chunk(
	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
	unsigned long *released, unsigned long *identity)
{
	unsigned long pfn;

	/*
	 * If the PFNs are currently mapped, the VA mapping also needs
	 * to be updated to be 1:1.
	 */
	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
		(void)HYPERVISOR_update_va_mapping(
			(unsigned long)__va(pfn << PAGE_SHIFT),
			mfn_pte(pfn, PAGE_KERNEL_IO), 0);

	if (start_pfn < nr_pages)
		*released += xen_release_chunk(
			start_pfn, min(end_pfn, nr_pages));

	*identity += set_phys_range_identity(start_pfn, end_pfn);
}

static unsigned long __init xen_set_identity_and_release(
	const struct e820entry *list, size_t map_size, unsigned long nr_pages)
{
	phys_addr_t start = 0;
	unsigned long released = 0;
	unsigned long identity = 0;
	const struct e820entry *entry;
	int i;

	/*
	 * Combine non-RAM regions and gaps until a RAM region (or the
	 * end of the map) is reached, then set the 1:1 map and
	 * release the pages (if available) in those non-RAM regions.
	 *
	 * The combined non-RAM regions are rounded to a whole number
	 * of pages so any partial pages are accessible via the 1:1
	 * mapping.  This is needed for some BIOSes that put (for
	 * example) the DMI tables in a reserved region that begins on
	 * a non-page boundary.
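	 *
	 * The number of pages released here is returned so that
	 * xen_memory_setup() can populate the same number of pages
	 * back into RAM regions above the initial allocation.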
	 */
	for (i = 0, entry = list; i < map_size; i++, entry++) {
		phys_addr_t end = entry->addr + entry->size;
		if (entry->type == E820_RAM || i == map_size - 1) {
			unsigned long start_pfn = PFN_DOWN(start);
			unsigned long end_pfn = PFN_UP(end);

			if (entry->type == E820_RAM)
				end_pfn = PFN_UP(entry->addr);

			if (start_pfn < end_pfn)
				xen_set_identity_and_release_chunk(
					start_pfn, end_pfn, nr_pages,
					&released, &identity);

			start = end;
		}
	}

	if (released)
		printk(KERN_INFO "Released %lu pages of unused memory\n", released);
	if (identity)
		printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);

	return released;
}

static unsigned long __init xen_get_max_pages(void)
{
	unsigned long max_pages = MAX_DOMAIN_PAGES;
	domid_t domid = DOMID_SELF;
	int ret;

	/*
	 * For the initial domain we use the maximum reservation as
	 * the maximum page.
	 *
	 * For guest domains the current maximum reservation reflects
	 * the current maximum rather than the static maximum.  In this
	 * case the e820 map provided to us will cover the static
	 * maximum region.
	 */
	if (xen_initial_domain()) {
		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
		if (ret > 0)
			max_pages = ret;
	}

	return min(max_pages, MAX_DOMAIN_PAGES);
}

static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
{
	u64 end = start + size;

	/* Align RAM regions to page boundaries. */
	if (type == E820_RAM) {
		start = PAGE_ALIGN(start);
		end &= ~((u64)PAGE_SIZE - 1);
	}

	e820_add_region(start, end - start, type);
}

/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	unsigned long long mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long last_pfn = 0;
	unsigned long extra_pages = 0;
	unsigned long populated;
	int i;
	int op;

	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);

	/* Make sure the Xen-supplied memory map is well-ordered. */
	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);

	max_pages = xen_get_max_pages();
	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Set P2M for all non-RAM pages and E820 gaps to be identity
	 * type PFNs.  Any RAM pages that would be made inaccessible by
	 * this are first released.
	 */
	xen_released_pages = xen_set_identity_and_release(
		map, memmap.nr_entries, max_pfn);

	/*
	 * Populate back the non-RAM pages and E820 gaps that had been
	 * released.
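	 * The pages go into e820 RAM regions above the initial
	 * nr_pages allocation, with xen_released_pages acting as the
	 * upper limit on how many pages are populated.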
	 */
	populated = xen_populate_chunk(map, memmap.nr_entries,
			max_pfn, &last_pfn, xen_released_pages);

	xen_released_pages -= populated;
	extra_pages += xen_released_pages;

	if (last_pfn > max_pfn) {
		max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
		mem_end = PFN_PHYS(max_pfn);
	}
	/*
	 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
	 * factor of the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  extra_pages);

	i = 0;
	while (i < memmap.nr_entries) {
		u64 addr = map[i].addr;
		u64 size = map[i].size;
		u32 type = map[i].type;

		if (type == E820_RAM) {
			if (addr < mem_end) {
				size = min(size, mem_end - addr);
			} else if (extra_pages) {
				size = min(size, (u64)extra_pages * PAGE_SIZE);
				extra_pages -= size / PAGE_SIZE;
				xen_add_extra_mem(addr, size);
			} else
				type = E820_UNUSABLE;
		}

		xen_align_and_add_e820_region(addr, size, type);

		map[i].addr += size;
		map[i].size -= size;
		if (map[i].size == 0)
			i++;
	}

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 */
	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	return "Xen";
}

/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
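 *
 * The bit is set in the vDSO note mask of both the int80 and the
 * sysenter vDSO images, since either one may end up mapped into a
 * process.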
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	u32 *mask;
	mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}

static int __cpuinit register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

void __cpuinit xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}

void __cpuinit xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}

void __init xen_arch_setup(void)
{
	xen_panic_handler_init();

	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	if (!xen_feature(XENFEAT_auto_translated_physmap))
		HYPERVISOR_vm_assist(VMASST_CMD_enable,
				     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
#ifdef CONFIG_X86_32
	boot_cpu_data.hlt_works_ok = 1;
#endif
	disable_cpuidle();
	disable_cpufreq();
	WARN_ON(set_pm_idle_to_default());
	fiddle_vdso();
}