/*
 * Copyright 2007 Andi Kleen, SUSE Labs.
 * Subject to the GPL, v.2
 *
 * This contains most of the x86 vDSO kernel-side code.
 */
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
#include <asm/page.h>
#include <asm/hpet.h>
#include <asm/desc.h>

#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif

void __init init_vdso_image(const struct vdso_image *image)
{
	int i;
	int npages = (image->size) / PAGE_SIZE;

	BUG_ON(image->size % PAGE_SIZE != 0);
	for (i = 0; i < npages; i++)
		image->text_mapping.pages[i] =
			virt_to_page(image->data + i*PAGE_SIZE);

	apply_alternatives((struct alt_instr *)(image->data + image->alt),
			   (struct alt_instr *)(image->data + image->alt +
						image->alt_len));
}

struct linux_binprm;

/*
 * Put the vdso above the (randomized) stack with another randomized
 * offset. This way there is no hole in the middle of address space.
 * To save memory make sure it is still in the same PTE as the stack
 * top. This doesn't give that many random bits.
 *
 * Note that this algorithm is imperfect: the distribution of the vdso
 * start address within a PMD is biased toward the end.
 *
 * Only used for the 64-bit and x32 vdsos.
 */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
#ifdef CONFIG_X86_32
	return 0;
#else
	unsigned long addr, end;
	unsigned offset;

	/*
	 * Round up the start address. It can start out unaligned as a result
	 * of stack start randomization.
	 */
	start = PAGE_ALIGN(start);

	/* Round the lowest possible end address up to a PMD boundary. */
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	if (end >= TASK_SIZE_MAX)
		end = TASK_SIZE_MAX;
	end -= len;

	if (end > start) {
		offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
		addr = start + (offset << PAGE_SHIFT);
	} else {
		addr = start;
	}

	/*
	 * Forcibly align the final address in case we have a hardware
	 * issue that requires alignment for performance reasons.
	 */
	addr = align_vdso_addr(addr);

	return addr;
#endif
}
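
/*
 * A rough bound on the randomness vdso_addr() adds (a sketch, assuming
 * the usual 4 KiB pages and 2 MiB PMDs on x86-64): end - start is always
 * smaller than PMD_SIZE, so there are at most 512 candidate page slots,
 * i.e. roughly 9 extra bits of entropy on top of the stack randomization.
 */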

static int map_vdso(const struct vdso_image *image, bool calculate_addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long addr, text_start;
	int ret = 0;
	static struct page *no_pages[] = {NULL};
	static struct vm_special_mapping vvar_mapping = {
		.name = "[vvar]",
		.pages = no_pages,
	};
	struct pvclock_vsyscall_time_info *pvti;

	if (calculate_addr) {
		addr = vdso_addr(current->mm->start_stack,
				 image->size - image->sym_vvar_start);
	} else {
		addr = 0;
	}

	down_write(&mm->mmap_sem);

	addr = get_unmapped_area(NULL, addr,
				 image->size - image->sym_vvar_start, 0, 0);
	if (IS_ERR_VALUE(addr)) {
		ret = addr;
		goto up_fail;
	}

	text_start = addr - image->sym_vvar_start;
	current->mm->context.vdso = (void __user *)text_start;

	/*
	 * MAYWRITE to allow gdb to COW and set breakpoints
	 */
	vma = _install_special_mapping(mm,
				       text_start,
				       image->size,
				       VM_READ|VM_EXEC|
				       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
				       &image->text_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto up_fail;
	}

	vma = _install_special_mapping(mm,
				       addr,
				       -image->sym_vvar_start,
				       VM_READ|VM_MAYREAD,
				       &vvar_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto up_fail;
	}

	if (image->sym_vvar_page)
		ret = remap_pfn_range(vma,
				      text_start + image->sym_vvar_page,
				      __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
				      PAGE_SIZE,
				      PAGE_READONLY);

	if (ret)
		goto up_fail;

#ifdef CONFIG_HPET_TIMER
	if (hpet_address && image->sym_hpet_page) {
		ret = io_remap_pfn_range(vma,
			text_start + image->sym_hpet_page,
			hpet_address >> PAGE_SHIFT,
			PAGE_SIZE,
			pgprot_noncached(PAGE_READONLY));

		if (ret)
			goto up_fail;
	}
#endif

	pvti = pvclock_pvti_cpu0_va();
	if (pvti && image->sym_pvclock_page) {
		ret = remap_pfn_range(vma,
				      text_start + image->sym_pvclock_page,
				      __pa(pvti) >> PAGE_SHIFT,
				      PAGE_SIZE,
				      PAGE_READONLY);

		if (ret)
			goto up_fail;
	}

up_fail:
	if (ret)
		current->mm->context.vdso = NULL;

	up_write(&mm->mmap_sem);
	return ret;
}
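
/*
 * Sketch of the address-space layout map_vdso() sets up. The sym_*
 * values are negative offsets supplied by the vdso image, so the exact
 * spacing depends on the image being mapped:
 *
 *	addr                            start of the [vvar] area
 *	text_start + sym_vvar_page      __vvar_page (kernel data shared
 *	                                with the vdso, read-only)
 *	text_start + sym_hpet_page      HPET MMIO page (if configured)
 *	text_start + sym_pvclock_page   pvclock time info (if available)
 *	text_start                      vdso text, image->size bytes
 */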

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
	if (vdso32_enabled != 1)  /* Other values all mean "disabled" */
		return 0;

	return map_vdso(&vdso_image_32, false);
}
#endif

#ifdef CONFIG_X86_64
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	if (!vdso64_enabled)
		return 0;

	return map_vdso(&vdso_image_64, true);
}

#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
				       int uses_interp)
{
#ifdef CONFIG_X86_X32_ABI
	if (test_thread_flag(TIF_X32)) {
		if (!vdso64_enabled)
			return 0;

		return map_vdso(&vdso_image_x32, true);
	}
#endif
#ifdef CONFIG_IA32_EMULATION
	return load_vdso32();
#else
	return 0;
#endif
}
#endif
#else
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	return load_vdso32();
}
#endif

#ifdef CONFIG_X86_64
static __init int vdso_setup(char *s)
{
	vdso64_enabled = simple_strtoul(s, NULL, 0);
	return 0;
}
__setup("vdso=", vdso_setup);
#endif

#ifdef CONFIG_X86_64
static void vgetcpu_cpu_init(void *arg)
{
	int cpu = smp_processor_id();
	struct desc_struct d = { };
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/*
	 * Store cpu number in limit so that it can be loaded
	 * quickly in user space in vgetcpu. (12 bits for the CPU
	 * and 8 bits for the node)
	 */
	d.limit0 = cpu | ((node & 0xf) << 12);
	d.limit = node >> 4;
	d.type = 5;	/* RO data, expand down, accessed */
	d.dpl = 3;	/* Visible to user code */
	d.s = 1;	/* Not a system segment */
	d.p = 1;	/* Present */
	d.d = 1;	/* 32-bit */

	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}

static int
vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
	long cpu = (long)arg;

	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);

	return NOTIFY_DONE;
}

static int __init init_vdso(void)
{
	init_vdso_image(&vdso_image_64);

#ifdef CONFIG_X86_X32_ABI
	init_vdso_image(&vdso_image_x32);
#endif

	cpu_notifier_register_begin();

	on_each_cpu(vgetcpu_cpu_init, NULL, 1);
	/* notifier priority > KVM */
	__hotcpu_notifier(vgetcpu_cpu_notifier, 30);

	cpu_notifier_register_done();

	return 0;
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */
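
/*
 * For reference, the user-space side of vgetcpu_cpu_init() above: the
 * vdso's getcpu implementation recovers the stored values by reading the
 * GDT_ENTRY_PER_CPU segment limit, roughly as sketched below (this is an
 * illustration, not the actual vdso source, which lives elsewhere):
 *
 *	unsigned int p;
 *	asm volatile("lsl %1,%0" : "=r" (p) : "r" (GDT_ENTRY_PER_CPU * 8 + 3));
 *	cpu  = p & 0xfff;
 *	node = p >> 12;
 *
 * On RDTSCP-capable CPUs the same (node << 12) | cpu value is also
 * available from the TSC_AUX MSR written above.
 */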