/*
 *    Copyright IBM Corp. 2007,2009
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

struct rcu_table_freelist {
	struct rcu_head rcu;
	struct mm_struct *mm;
	unsigned int pgt_index;
	unsigned int crst_index;
	unsigned long *table[0];
};

#define RCU_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
	  / sizeof(unsigned long))

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);

static void __page_table_free(struct mm_struct *mm, unsigned long *table);
static void __crst_table_free(struct mm_struct *mm, unsigned long *table);

static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
{
	struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist);
	struct rcu_table_freelist *batch = *batchp;

	if (batch)
		return batch;
	batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
	if (batch) {
		batch->mm = mm;
		batch->pgt_index = 0;
		batch->crst_index = RCU_FREELIST_SIZE;
		*batchp = batch;
	}
	return batch;
}

static void rcu_table_freelist_callback(struct rcu_head *head)
{
	struct rcu_table_freelist *batch =
		container_of(head, struct rcu_table_freelist, rcu);

	while (batch->pgt_index > 0)
		__page_table_free(batch->mm, batch->table[--batch->pgt_index]);
	while (batch->crst_index < RCU_FREELIST_SIZE)
		__crst_table_free(batch->mm, batch->table[batch->crst_index++]);
	free_page((unsigned long) batch);
}

void rcu_table_freelist_finish(void)
{
	struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);

	if (!batch)
		return;
	call_rcu(&batch->rcu, rcu_table_freelist_callback);
	__get_cpu_var(rcu_table_freelist) = NULL;
}

static void smp_sync(void *arg)
{
}

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define TABLES_PER_PAGE	4
#define FRAG_MASK	15UL
#define SECOND_HALVES	10UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 256, 0, PAGE_SIZE/4);
	clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 768, 0, PAGE_SIZE/4);
}

#else
#define ALLOC_ORDER	2
#define TABLES_PER_PAGE	2
#define FRAG_MASK	3UL
#define SECOND_HALVES	2UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	memset(table + 256, 0, PAGE_SIZE/2);
}

#endif

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);
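
/*
 * Allocation and freeing of region/segment (crst) tables. A crst table
 * has 2048 entries and spans 1 << ALLOC_ORDER pages. If the mm runs in
 * no-execution mode an additional shadow table is allocated and its
 * physical address is kept in page->index of the primary table. All
 * crst pages of an mm are linked on mm->context.crst_list under
 * mm->context.list_lock.
 */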
unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	page->index = 0;
	if (noexec) {
		struct page *shadow = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
		if (!shadow) {
			__free_pages(page, ALLOC_ORDER);
			return NULL;
		}
		page->index = page_to_phys(shadow);
	}
	spin_lock_bh(&mm->context.list_lock);
	list_add(&page->lru, &mm->context.crst_list);
	spin_unlock_bh(&mm->context.list_lock);
	return (unsigned long *) page_to_phys(page);
}

static void __crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	unsigned long *shadow = get_shadow_table(table);

	if (shadow)
		free_pages((unsigned long) shadow, ALLOC_ORDER);
	free_pages((unsigned long) table, ALLOC_ORDER);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page = virt_to_page(table);

	spin_lock_bh(&mm->context.list_lock);
	list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	__crst_table_free(mm, table);
}

void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
	struct rcu_table_freelist *batch;
	struct page *page = virt_to_page(table);

	spin_lock_bh(&mm->context.list_lock);
	list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	if (atomic_read(&mm->mm_users) < 2 &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
		__crst_table_free(mm, table);
		return;
	}
	batch = rcu_table_freelist_get(mm);
	if (!batch) {
		smp_call_function(smp_sync, NULL, 1);
		__crst_table_free(mm, table);
		return;
	}
	batch->table[--batch->crst_index] = table;
	if (batch->pgt_index >= batch->crst_index)
		rcu_table_freelist_finish();
}

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm, mm->context.noexec);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

/*
 * page table entry allocation/free routines.
 */
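/*
 * A 4K page holds up to TABLES_PER_PAGE page tables of 256 entries each
 * (1K fragments on 31 bit, 2K fragments on 64 bit). The lower FRAG_MASK
 * bits of page->flags record which fragments of the page are in use.
 * Pages with free fragments are kept at the front of
 * mm->context.pgtable_list, fully used pages are moved to the tail.
 * For an mm that uses pgstes or shadow tables (noexec) two consecutive
 * fragments are taken at once; the second one holds the pgste area
 * resp. the shadow page table.
 */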
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;
	unsigned long bits;

	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	spin_lock_bh(&mm->context.list_lock);
	page = NULL;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
			page = NULL;
	}
	if (!page) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		page->flags &= ~FRAG_MASK;
		table = (unsigned long *) page_to_phys(page);
		if (mm->context.has_pgste)
			clear_table_pgstes(table);
		else
			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	}
	table = (unsigned long *) page_to_phys(page);
	while (page->flags & bits) {
		table += 256;
		bits <<= 1;
	}
	page->flags |= bits;
	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
		list_move_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

static void __page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned long bits;

	bits = ((unsigned long) table) & 15;
	table = (unsigned long *)(((unsigned long) table) ^ bits);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	page->flags ^= bits;
	if (!(page->flags & FRAG_MASK)) {
		pgtable_page_dtor(page);
		__free_page(page);
	}
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned long bits;

	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	page->flags ^= bits;
	if (page->flags & FRAG_MASK) {
		/* Page now has some free pgtable fragments. */
		if (!list_empty(&page->lru))
			list_move(&page->lru, &mm->context.pgtable_list);
		page = NULL;
	} else
		/* All fragments of the 4K page have been freed. */
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	if (page) {
		pgtable_page_dtor(page);
		__free_page(page);
	}
}
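
/*
 * Defer the actual freeing of a page table fragment until after an rcu
 * grace period. The fragment bits are stashed in the lower bits of the
 * pointer that is queued on the per-cpu rcu_table_freelist batch;
 * rcu_table_freelist_callback() later hands it to __page_table_free().
 * If the mm is used by a single user on the current cpu only, or if no
 * batch page can be allocated (after an empty smp_call_function()
 * round-trip to all other cpus), the table is freed synchronously.
 */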
void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
	struct rcu_table_freelist *batch;
	struct page *page;
	unsigned long bits;

	if (atomic_read(&mm->mm_users) < 2 &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
		page_table_free(mm, table);
		return;
	}
	batch = rcu_table_freelist_get(mm);
	if (!batch) {
		smp_call_function(smp_sync, NULL, 1);
		page_table_free(mm, table);
		return;
	}
	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	/* Delayed freeing with rcu prevents reuse of pgtable fragments */
	list_del_init(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *)(((unsigned long) table) | bits);
	batch->table[batch->pgt_index++] = table;
	if (batch->pgt_index >= batch->crst_index)
		rcu_table_freelist_finish();
}

void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
{
	struct page *page;

	spin_lock_bh(&mm->context.list_lock);
	/* Free shadow region and segment tables. */
	list_for_each_entry(page, &mm->context.crst_list, lru)
		if (page->index) {
			free_pages((unsigned long) page->index, ALLOC_ORDER);
			page->index = 0;
		}
	/* "Free" second halves of page tables. */
	list_for_each_entry(page, &mm->context.pgtable_list, lru)
		page->flags &= ~SECOND_HALVES;
	spin_unlock_bh(&mm->context.list_lock);
	mm->context.noexec = 0;
	update_mm(mm, tsk);
}

/*
 * switch on pgstes for the userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have switched amode? If no, we cannot do sie */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done */
	if (tsk->mm->context.has_pgste)
		return 0;

	/* Let's check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* We copy the mm and let dup_mm create the page tables with pgstes */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */