/*
 * Copyright IBM Corp. 2007,2009
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define TABLES_PER_PAGE	4
#define FRAG_MASK	15UL
#define SECOND_HALVES	10UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 256, 0, PAGE_SIZE/4);
	clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 768, 0, PAGE_SIZE/4);
}

#else
#define ALLOC_ORDER	2
#define TABLES_PER_PAGE	2
#define FRAG_MASK	3UL
#define SECOND_HALVES	2UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	memset(table + 256, 0, PAGE_SIZE/2);
}

#endif

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);

/*
 * Allocate a region or segment (crst) table. If the execute-protection
 * emulation (noexec) is active, a shadow table is allocated as well and
 * its physical address is kept in page->index.
 */
unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	page->index = 0;
	if (noexec) {
		struct page *shadow = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
		if (!shadow) {
			__free_pages(page, ALLOC_ORDER);
			return NULL;
		}
		page->index = page_to_phys(shadow);
	}
	spin_lock(&mm->context.list_lock);
	list_add(&page->lru, &mm->context.crst_list);
	spin_unlock(&mm->context.list_lock);
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	unsigned long *shadow = get_shadow_table(table);
	struct page *page = virt_to_page(table);

	spin_lock(&mm->context.list_lock);
	list_del(&page->lru);
	spin_unlock(&mm->context.list_lock);
	if (shadow)
		free_pages((unsigned long) shadow, ALLOC_ORDER);
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm, mm->context.noexec);
	if (!table)
		return -ENOMEM;
	spin_lock(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;
	unsigned long bits;

	/*
	 * Each 4K page holds TABLES_PER_PAGE page tables; the low bits of
	 * page->flags record which fragments are in use. A table that needs
	 * pgstes or a noexec shadow occupies two adjacent fragments.
	 */
	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	spin_lock(&mm->context.list_lock);
	page = NULL;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
			page = NULL;
	}
	if (!page) {
		spin_unlock(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		page->flags &= ~FRAG_MASK;
		table = (unsigned long *) page_to_phys(page);
		if (mm->context.has_pgste)
			clear_table_pgstes(table);
		else
			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	}
	table = (unsigned long *) page_to_phys(page);
	while (page->flags & bits) {
		table += 256;
		bits <<= 1;
	}
	page->flags |= bits;
	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
		list_move_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned long bits;

	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock(&mm->context.list_lock);
	page->flags ^= bits;
	if (page->flags & FRAG_MASK) {
		/* Page now has some free pgtable fragments. */
		list_move(&page->lru, &mm->context.pgtable_list);
		page = NULL;
	} else
		/* All fragments of the 4K page have been freed. */
		list_del(&page->lru);
	spin_unlock(&mm->context.list_lock);
	if (page) {
		pgtable_page_dtor(page);
		__free_page(page);
	}
}

void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
{
	struct page *page;

	spin_lock(&mm->context.list_lock);
	/* Free shadow region and segment tables. */
	list_for_each_entry(page, &mm->context.crst_list, lru)
		if (page->index) {
			free_pages((unsigned long) page->index, ALLOC_ORDER);
			page->index = 0;
		}
	/* "Free" second halves of page tables. */
	list_for_each_entry(page, &mm->context.pgtable_list, lru)
		page->flags &= ~SECOND_HALVES;
	spin_unlock(&mm->context.list_lock);
	mm->context.noexec = 0;
	update_mm(mm, tsk);
}

/*
 * Switch on pgstes for the current userspace process (needed for kvm).
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have switched amode? If not, we cannot do sie. */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done. */
	if (tsk->mm->context.has_pgste)
		return 0;

	/* Let's check if we are allowed to replace the mm. */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* We copy the mm and let dup_mm create the page tables with pgstes. */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened in the meantime. */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* Ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */