/*
 * KVM paravirt_ops implementation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <aliguori@us.ibm.com>
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
#include <asm/idle.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>

static int kvmapf = 1;

static int parse_no_kvmapf(char *arg)
{
	kvmapf = 0;
	return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);

static int steal_acc = 1;
static int parse_no_stealacc(char *arg)
{
	steal_acc = 0;
	return 0;
}

early_param("no-steal-acc", parse_no_stealacc);

static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;

/*
 * No need for any "IO delay" on KVM
 */
static void kvm_io_delay(void)
{
}

#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

struct kvm_task_sleep_node {
	struct hlist_node link;
	wait_queue_head_t wq;
	u32 token;
	int cpu;
	bool halted;
};

static struct kvm_task_sleep_head {
	spinlock_t lock;
	struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
						  u32 token)
{
	struct hlist_node *p;

	hlist_for_each(p, &b->list) {
		struct kvm_task_sleep_node *n =
			hlist_entry(p, typeof(*n), link);
		if (n->token == token)
			return n;
	}

	return NULL;
}

void kvm_async_pf_task_wait(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node n, *e;
	DEFINE_WAIT(wait);
	int cpu, idle;

	cpu = get_cpu();
	idle = idle_cpu(cpu);
	put_cpu();

	spin_lock(&b->lock);
	e = _find_apf_task(b, token);
	if (e) {
		/* dummy entry exists -> wake up was delivered ahead of PF */
		hlist_del(&e->link);
		kfree(e);
		spin_unlock(&b->lock);
		return;
	}

	n.token = token;
	n.cpu = smp_processor_id();
	n.halted = idle || preempt_count() > 1;
	init_waitqueue_head(&n.wq);
	hlist_add_head(&n.link, &b->list);
	spin_unlock(&b->lock);

	for (;;) {
		if (!n.halted)
			prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
		if (hlist_unhashed(&n.link))
			break;

		if (!n.halted) {
			local_irq_enable();
			schedule();
			local_irq_disable();
		} else {
			/*
			 * We cannot reschedule. So halt.
			 */
			native_safe_halt();
			local_irq_disable();
		}
	}
	if (!n.halted)
		finish_wait(&n.wq, &wait);

	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);

static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
	hlist_del_init(&n->link);
	if (n->halted)
		smp_send_reschedule(n->cpu);
	else if (waitqueue_active(&n->wq))
		wake_up(&n->wq);
}

static void apf_task_wake_all(void)
{
	int i;

	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
		struct hlist_node *p, *next;
		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
		spin_lock(&b->lock);
		hlist_for_each_safe(p, next, &b->list) {
			struct kvm_task_sleep_node *n =
				hlist_entry(p, typeof(*n), link);
			if (n->cpu == smp_processor_id())
				apf_task_wake_one(n);
		}
		spin_unlock(&b->lock);
	}
}

void kvm_async_pf_task_wake(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node *n;

	if (token == ~0) {
		apf_task_wake_all();
		return;
	}

again:
	spin_lock(&b->lock);
	n = _find_apf_task(b, token);
	if (!n) {
		/*
		 * async PF was not yet handled.
		 * Add dummy entry for the token.
		 */
		n = kzalloc(sizeof(*n), GFP_ATOMIC);
		if (!n) {
			/*
			 * Allocation failed! Busy wait while other cpu
			 * handles async PF.
			 */
			spin_unlock(&b->lock);
			cpu_relax();
			goto again;
		}
		n->token = token;
		n->cpu = smp_processor_id();
		init_waitqueue_head(&n->wq);
		hlist_add_head(&n->link, &b->list);
	} else
		apf_task_wake_one(n);
	spin_unlock(&b->lock);
	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);

u32 kvm_read_and_reset_pf_reason(void)
{
	u32 reason = 0;

	if (__get_cpu_var(apf_reason).enabled) {
		reason = __get_cpu_var(apf_reason).reason;
		__get_cpu_var(apf_reason).reason = 0;
	}

	return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);

dotraplinkage void __kprobes
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	switch (kvm_read_and_reset_pf_reason()) {
	default:
		do_page_fault(regs, error_code);
		break;
	case KVM_PV_REASON_PAGE_NOT_PRESENT:
		/* page is swapped out by the host. */
		kvm_async_pf_task_wait((u32)read_cr2());
		break;
	case KVM_PV_REASON_PAGE_READY:
		rcu_irq_enter();
		exit_idle();
		kvm_async_pf_task_wake((u32)read_cr2());
		rcu_irq_exit();
		break;
	}
}

static void __init paravirt_ops_setup(void)
{
	pv_info.name = "KVM";
	pv_info.paravirt_enabled = 1;

	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
		pv_cpu_ops.io_delay = kvm_io_delay;

#ifdef CONFIG_X86_IO_APIC
	no_timer_check = 1;
#endif
}

static void kvm_register_steal_time(void)
{
	int cpu = smp_processor_id();
	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

	if (!has_steal_clock)
		return;

	memset(st, 0, sizeof(*st));

	wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
	printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
		cpu, __pa(st));
}

static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;

static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
	/*
	 * This relies on __test_and_clear_bit to modify the memory
	 * in a way that is atomic with respect to the local CPU.
	 * The hypervisor only accesses this memory from the local CPU so
	 * there's no need for lock or memory barriers.
	 * An optimization barrier is implied in apic write.
	 */
	if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi)))
		return;
	apic_write(APIC_EOI, APIC_EOI_ACK);
}

void __cpuinit kvm_guest_cpu_init(void)
{
	if (!kvm_para_available())
		return;

	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
		u64 pa = __pa(&__get_cpu_var(apf_reason));

#ifdef CONFIG_PREEMPT
		pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
		__get_cpu_var(apf_reason).enabled = 1;
		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
		       smp_processor_id());
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
		unsigned long pa;
		/* Size alignment is implied but just to make it explicit. */
		BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
		__get_cpu_var(kvm_apic_eoi) = 0;
		pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED;
		wrmsrl(MSR_KVM_PV_EOI_EN, pa);
	}

	if (has_steal_clock)
		kvm_register_steal_time();
}

static void kvm_pv_disable_apf(void)
{
	if (!__get_cpu_var(apf_reason).enabled)
		return;

	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
	__get_cpu_var(apf_reason).enabled = 0;

	printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
	       smp_processor_id());
}

static void kvm_pv_guest_cpu_reboot(void *unused)
{
	/*
	 * We disable PV EOI before we load a new kernel by kexec,
	 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
	 * New kernel can re-enable when it boots.
	 */
	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	kvm_pv_disable_apf();
}

static int kvm_pv_reboot_notify(struct notifier_block *nb,
				unsigned long code, void *unused)
{
	if (code == SYS_RESTART)
		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
	return NOTIFY_DONE;
}

static struct notifier_block kvm_pv_reboot_nb = {
	.notifier_call = kvm_pv_reboot_notify,
};

static u64 kvm_steal_clock(int cpu)
{
	u64 steal;
	struct kvm_steal_time *src;
	int version;

	src = &per_cpu(steal_time, cpu);
	/*
	 * Retry while an update is in progress (odd version) or the version
	 * changed while we were reading, so we never return a torn value.
	 */
	do {
		version = src->version;
		rmb();
		steal = src->steal;
		rmb();
	} while ((version & 1) || (version != src->version));

	return steal;
}

void kvm_disable_steal_time(void)
{
	if (!has_steal_clock)
		return;

	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}

#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
#ifdef CONFIG_KVM_CLOCK
	WARN_ON(kvm_register_clock("primary cpu clock"));
#endif
	kvm_guest_cpu_init();
	native_smp_prepare_boot_cpu();
}

static void __cpuinit kvm_guest_cpu_online(void *dummy)
{
	kvm_guest_cpu_init();
}

static void kvm_guest_cpu_offline(void *dummy)
{
	kvm_disable_steal_time();
	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	kvm_pv_disable_apf();
	apf_task_wake_all();
}

static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
				    unsigned long action, void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	switch (action) {
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
	case CPU_ONLINE_FROZEN:
		smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_notify,
};
#endif

static void __init kvm_apf_trap_init(void)
{
	set_intr_gate(14, &async_page_fault);
}

void __init kvm_guest_init(void)
{
	int i;

	if (!kvm_para_available())
		return;

	paravirt_ops_setup();
	register_reboot_notifier(&kvm_pv_reboot_nb);
	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
		spin_lock_init(&async_pf_sleepers[i].lock);
	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
		x86_init.irqs.trap_init = kvm_apf_trap_init;

	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		has_steal_clock = 1;
		pv_time_ops.steal_clock = kvm_steal_clock;
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		apic_set_eoi_write(kvm_guest_apic_eoi_write);

#ifdef CONFIG_SMP
	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
	register_cpu_notifier(&kvm_cpu_notifier);
#else
	kvm_guest_cpu_init();
#endif
}

static bool __init kvm_detect(void)
{
	if (!kvm_para_available())
		return false;
	return true;
}

const struct hypervisor_x86 x86_hyper_kvm __refconst = {
	.name = "KVM",
	.detect = kvm_detect,
};
EXPORT_SYMBOL_GPL(x86_hyper_kvm);

static __init int activate_jump_labels(void)
{
	if (has_steal_clock) {
		static_key_slow_inc(&paravirt_steal_enabled);
		if (steal_acc)
			static_key_slow_inc(&paravirt_steal_rq_enabled);
	}

	return 0;
}
arch_initcall(activate_jump_labels);