/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/hugetlb.h>
#include <linux/sched.h>
#include <linux/ksm.h>

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			goto out;
		}
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
				vma->vm_file, pgoff, vma_policy(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto out;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto out;
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;

out:
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;

	if (!file)
		return -EBADF;

	if (file->f_mapping->a_ops->get_xip_mem) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	*prev = vma;
	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
		end = vma->vm_end;
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	force_page_cache_readahead(file->f_mapping, file, start, end - start);
	return 0;
}
/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	*prev = vma;
	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
		return -EINVAL;

	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
		struct zap_details details = {
			.nonlinear_vma = vma,
			.last_index = ULONG_MAX,
		};
		zap_page_range(vma, start, end - start, &details);
	} else
		zap_page_range(vma, start, end - start, NULL);
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 *
 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
 * Other filesystems return -ENOSYS.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	struct address_space *mapping;
	loff_t offset, endoff;
	int error;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
		return -EINVAL;

	if (!vma->vm_file || !vma->vm_file->f_mapping
		|| !vma->vm_file->f_mapping->host) {
			return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	mapping = vma->vm_file->f_mapping;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	endoff = (loff_t)(end - vma->vm_start - 1)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
	up_read(&current->mm->mmap_sem);
	error = vmtruncate_range(mapping->host, offset, endoff);
	down_read(&current->mm->mmap_sem);
	return error;
}
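/*
 * Illustrative userspace sketch (not part of this file): one way an
 * application might reach madvise_remove() above by punching a hole in
 * a tmpfs-backed shared mapping.  The file path and sizes are
 * assumptions made for the example; the mapping must be MAP_SHARED,
 * writable, and backed by shmfs/tmpfs for the advice to succeed.
 *
 *	int fd = open("/dev/shm/example", O_RDWR | O_CREAT, 0600);
 *	size_t len = 16 * 4096;
 *	ftruncate(fd, len);
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	memset(p, 0xff, len);
 *	if (madvise(p + 4 * 4096, 4 * 4096, MADV_REMOVE))
 *		perror("madvise");
 */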
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_DONTNEED:
		return madvise_dontneed(vma, prev, start, end);
	default:
		return madvise_behavior(vma, prev, start, end, behavior);
	}
}

static int
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
		return 1;

	default:
		return 0;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;

	if (!madvise_behavior_valid(behavior))
		return error;

	write = madvise_need_mmap_write(behavior);
	if (write)
		down_write(&current->mm->mmap_sem);
	else
		down_read(&current->mm->mmap_sem);

	if (start & ~PAGE_MASK)
		goto out;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		goto out;

	end = start + len;
	if (end < start)
		goto out;

	error = 0;
	if (end == start)
		goto out;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}
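/*
 * Illustrative userspace sketch (not part of this file): how an
 * application might combine the advice values documented above when it
 * streams once through a large read-only file mapping.  The path and
 * length are assumptions made for the example; the calls are only hints
 * and the kernel is free to ignore them.
 *
 *	int fd = open("/var/tmp/data.bin", O_RDONLY);
 *	size_t len = 64 * 1024 * 1024;
 *	char *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *
 *	madvise(p, len, MADV_SEQUENTIAL);	(aggressive read-ahead; pages
 *						 may be dropped soon after use)
 *
 *	... read through p[0..len) exactly once ...
 *
 *	madvise(p, len, MADV_DONTNEED);		(range no longer needed; the
 *						 kernel may free its pages)
 *	munmap(p, len);
 *	close(fd);
 */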