/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>

#include "xen-ops.h"
#include "vdso.h"

/* These are code, but not functions.  Defined in entry.S. */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/* Amount of extra memory space we add to the e820 ranges */
phys_addr_t xen_extra_mem_start, xen_extra_mem_size;

/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)

/*
 * Append 'pages' pages of balloonable memory to the e820 map, reserve
 * them from the kernel's allocators and mark their p2m entries invalid
 * so the balloon driver can populate them later.
 */
static void __init xen_add_extra_mem(unsigned long pages)
{
	unsigned long pfn;

	u64 size = (u64)pages * PAGE_SIZE;
	u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;

	if (!pages)
		return;

	e820_add_region(extra_start, size, E820_RAM);
	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");

	xen_extra_mem_size += size;

	xen_max_p2m_pfn = PFN_DOWN(extra_start + size);

	for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}

/*
 * Hand every populated page in [start_addr, end_addr) back to the
 * hypervisor, one extent at a time.  Returns the number of pages
 * actually released.
 */
static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
					      phys_addr_t end_addr)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};
	unsigned long start, end;
	unsigned long len = 0;
	unsigned long pfn;
	int ret;

	start = PFN_UP(start_addr);
	end = PFN_DOWN(end_addr);

	if (end <= start)
		return 0;

	for (pfn = start; pfn < end; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		/* Make sure pfn exists to start with */
		if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
			continue;

		set_xen_guest_handle(reservation.extent_start, &mfn);
		reservation.nr_extents = 1;

		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
					   &reservation);
		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
		if (ret == 1) {
			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
			len++;
		}
	}
	printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n",
	       start, end, len);

	return len;
}

/*
 * Walk the e820 and release every chunk of memory below max_pfn that
 * is not covered by any map entry.
 */
static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
						     const struct e820map *e820)
{
	phys_addr_t max_addr = PFN_PHYS(max_pfn);
	phys_addr_t last_end = ISA_END_ADDRESS;
	unsigned long released = 0;
	int i;

	/* Free any unused memory above the low 1Mbyte. */
	for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
		phys_addr_t end = e820->map[i].addr;
		end = min(max_addr, end);

		if (last_end < end)
			released += xen_release_chunk(last_end, end);
		last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
	}

	if (last_end < max_addr)
		released += xen_release_chunk(last_end, max_addr);

	printk(KERN_INFO "released %lu pages of unused memory\n", released);
	return released;
}
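/*
 * A worked example of the PFN rounding in xen_release_chunk() above
 * (the addresses are illustrative, not from a real e820 map): with
 * 4 KiB pages, a chunk of [0x1800, 0x5800) gives
 * start = PFN_UP(0x1800) = 2 and end = PFN_DOWN(0x5800) = 5, so only
 * the fully contained pfns 2, 3 and 4 are offered back to the
 * hypervisor; the partial pages at either edge are left alone.
 */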
/*
 * Mark everything in the passed-in e820 that is not RAM or unusable
 * (i.e. non-RAM regions and the gaps between entries) as 1:1 mapped
 * in the p2m.  Returns the number of pfns set to identity.
 */
static unsigned long __init xen_set_identity(const struct e820entry *list,
					     ssize_t map_size)
{
	phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
	phys_addr_t start_pci = last;
	const struct e820entry *entry;
	unsigned long identity = 0;
	int i;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		phys_addr_t start = entry->addr;
		phys_addr_t end = start + entry->size;

		if (start < last)
			start = last;

		if (end <= start)
			continue;

		/* Skip over the 1MB region. */
		if (last > end)
			continue;

		if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
			if (start > start_pci)
				identity += set_phys_range_identity(
						PFN_UP(start_pci), PFN_DOWN(start));

			/* Without saving 'last' we would gobble RAM too
			 * at the end of the loop. */
			last = end;
			start_pci = end;
			continue;
		}
		start_pci = min(start, start_pci);
		last = end;
	}
	if (last > start_pci)
		identity += set_phys_range_identity(
			PFN_UP(start_pci), PFN_DOWN(last));
	return identity;
}
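/*
 * An illustrative walk of xen_set_identity() for dom0 (the map is
 * made up): given RAM at [0, 0x9f000), a reserved hole at
 * [0x9f000, 0x100000) and RAM again from 0x100000 upwards, the hole
 * is accumulated via start_pci and flushed when the next RAM entry
 * is reached, so pfns 0x9f-0xff become identity-mapped while both
 * RAM ranges keep their normal p2m translations.
 */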
/**
 * xen_memory_setup - Hook for machine specific memory setup.
 */
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;
	static struct e820entry map_raw[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	unsigned long long mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long extra_pages = 0;
	unsigned long extra_limit;
	unsigned long identity_pages = 0;
	int i;
	int op;

	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);

	memcpy(map_raw, map, sizeof(map));
	e820.nr_map = 0;
	xen_extra_mem_start = mem_end;
	for (i = 0; i < memmap.nr_entries; i++) {
		unsigned long long end;

		/* Guard against non-page-aligned E820 entries. */
		if (map[i].type == E820_RAM)
			map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;

		end = map[i].addr + map[i].size;
		if (map[i].type == E820_RAM && end > mem_end) {
			/* RAM off the end - may be partially included */
			u64 delta = min(map[i].size, end - mem_end);

			map[i].size -= delta;
			end -= delta;

			extra_pages += PFN_DOWN(delta);
			/*
			 * Mark RAM below 4GB that is not for us as
			 * unusable.  This prevents "System RAM" address
			 * space from being used as a potential resource
			 * for I/O addresses (which happens when
			 * 'allocate_resource' is called).
			 */
			if (delta &&
				(xen_initial_domain() && end < 0x100000000ULL))
				e820_add_region(end, delta, E820_UNUSABLE);
		}

		if (map[i].size > 0 && end > xen_extra_mem_start)
			xen_extra_mem_start = end;

		/* Add region if any remains */
		if (map[i].size > 0)
			e820_add_region(map[i].addr, map[i].size, map[i].type);
	}
	/* Align the balloon area so that max_low_pfn does not get set
	 * to be at the _end_ of the PCI gap at the far end (fee01000).
	 * Note that xen_extra_mem_start gets set in the loop above to be
	 * past the last E820 region. */
	if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32)))
		xen_extra_mem_start = (1ULL<<32);

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 *
	 * In Dom0, the host E820 information can leave gaps in the
	 * ISA range, which would cause us to release those pages.  To
	 * avoid this, we unconditionally reserve them here.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 */
	memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
				   __pa(xen_start_info->pt_base),
				   "XEN START INFO");

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);

	/*
	 * Clamp the amount of extra memory to EXTRA_MEM_RATIO times
	 * the base size.  On non-highmem systems, the base size is
	 * the full initial memory allocation; on highmem it is limited
	 * to the max size of lowmem, so that it doesn't get completely
	 * filled.
	 *
	 * In principle there could be a problem on lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  max_pfn + extra_pages);

	if (extra_limit >= max_pfn)
		extra_pages = extra_limit - max_pfn;
	else
		extra_pages = 0;

	xen_add_extra_mem(extra_pages);

	/*
	 * Set the P2M entries for all non-RAM pages and E820 gaps to
	 * identity type PFNs.  We supply xen_set_identity() with the
	 * non-sanitized version of the E820.
	 */
	identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
	printk(KERN_INFO "Set %lu page(s) to 1-1 mapping.\n", identity_pages);
	return "Xen";
}
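/*
 * A worked example of the clamping in xen_memory_setup() above (the
 * numbers are illustrative): a 64-bit domU started with 512 MiB
 * (max_pfn = 131072) whose hand-back yields extra_pages = 2000000
 * gets extra_limit = min(10 * 131072, 131072 + 2000000) = 1310720,
 * so extra_pages is clamped to 1310720 - 131072 = 1179648 pages
 * (4.5 GiB) of balloonable space; never more than EXTRA_MEM_RATIO
 * times the base allocation.
 */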
/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	u32 *mask;
	mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}

static int __cpuinit register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}

void __cpuinit xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}

void __cpuinit xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}

void __init xen_arch_setup(void)
{
	xen_panic_handler_init();

	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	if (!xen_feature(XENFEAT_auto_translated_physmap))
		HYPERVISOR_vm_assist(VMASST_CMD_enable,
				     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
#ifdef CONFIG_X86_32
	boot_cpu_data.hlt_works_ok = 1;
#endif
	disable_cpuidle();
	boot_option_idle_override = IDLE_HALT;

	fiddle_vdso();
}
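/*
 * For reference, a sketch of how the entry points in this file are
 * wired up at boot; the actual assignments live in enlighten.c's
 * xen_start_kernel():
 *
 *	x86_init.resources.memory_setup = xen_memory_setup;
 *	...
 *	xen_arch_setup();
 *
 * xen_enable_sysenter() and xen_enable_syscall() are additionally
 * re-run as each secondary VCPU is brought up (hence their __cpuinit
 * annotations), so the fast system call paths are registered
 * per-VCPU.
 */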