/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/hugetlb.h>
#include <linux/sched.h>

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
		return 0;
	default:
		/* Be safe: default to 1 and list the exceptions explicitly. */
		return 1;
	}
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct * vma,
		     struct vm_area_struct **prev,
		     unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct * mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	int new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		new_flags &= ~VM_DONTCOPY;
		break;
	}

	if (new_flags == vma->vm_flags) {
		*prev = vma;
		goto out;
	}

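	/*
	 * First see whether an adjacent vma already carries the new flags
	 * and can simply be merged with; only if that fails do we split.
	 */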
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
				vma->vm_file, pgoff, vma_policy(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto out;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto out;
	}

success:
	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 */
	vma->vm_flags = new_flags;

out:
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct * vma,
			     struct vm_area_struct ** prev,
			     unsigned long start, unsigned long end)
{
	struct file *file = vma->vm_file;

	if (!file)
		return -EBADF;

	if (file->f_mapping->a_ops->get_xip_mem) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	*prev = vma;
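	/*
	 * Convert the byte range within the vma into page offsets in the
	 * backing file, clamping end to the vma, before kicking readahead.
	 */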
	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
		end = vma->vm_end;
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	force_page_cache_readahead(file->f_mapping, file, start, end - start);
	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct * vma,
			     struct vm_area_struct ** prev,
			     unsigned long start, unsigned long end)
{
	*prev = vma;
	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
		return -EINVAL;

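	/*
	 * Nonlinear vmas need zap_details identifying the vma so that the
	 * per-pte file offsets are preserved across the zap; a last_index
	 * of ULONG_MAX covers every file offset in the mapping.
	 */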
	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
		struct zap_details details = {
			.nonlinear_vma = vma,
			.last_index = ULONG_MAX,
		};
		zap_page_range(vma, start, end - start, &details);
	} else
		zap_page_range(vma, start, end - start, NULL);
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 *
 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
 * Other filesystems return -ENOSYS.
 */
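/*
 * Illustrative userspace call (not part of this file): on a writable,
 * shared tmpfs mapping, madvise(addr, len, MADV_REMOVE) drops both the
 * pages and their backing store for that range.
 */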
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	struct address_space *mapping;
	loff_t offset, endoff;
	int error;

	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */

	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
		return -EINVAL;

	if (!vma->vm_file || !vma->vm_file->f_mapping
		|| !vma->vm_file->f_mapping->host) {
			return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	mapping = vma->vm_file->f_mapping;

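	/*
	 * Translate the byte range within the vma into byte offsets in the
	 * backing file; endoff is inclusive, hence the "- 1".
	 */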
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	endoff = (loff_t)(end - vma->vm_start - 1)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
	up_read(&current->mm->mmap_sem);
	error = vmtruncate_range(mapping->host, offset, endoff);
	down_read(&current->mm->mmap_sem);
	return error;
}

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	long error;

	switch (behavior) {
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO) {
			error = -EINVAL;
			break;
		}
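		/* Fall through: otherwise handled like the other flag updates. */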
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
		error = madvise_behavior(vma, prev, start, end, behavior);
		break;
	case MADV_REMOVE:
		error = madvise_remove(vma, prev, start, end);
		break;

	case MADV_WILLNEED:
		error = madvise_willneed(vma, prev, start, end);
		break;

	case MADV_DONTNEED:
		error = madvise_dontneed(vma, prev, start, end);
		break;

	default:
		BUG();
		break;
	}
	return error;
}

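/*
 * Reject any advice value this kernel does not implement before taking
 * mmap_sem, so madvise_vma() never sees an unknown behavior.
 */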
static int
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
		return 1;

	default:
		return 0;
	}
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the
 *		application will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - the given range is not copied into the child of a
 *		subsequent fork().
 *  MADV_DOFORK - undo MADV_DONTFORK; copy the range into children again.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len wraps around, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
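/*
 * Illustrative userspace usage (not part of this file): hint that a
 * mapping will be read sequentially, then drop it once finished.
 *
 *	void *buf = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	if (madvise(buf, len, MADV_SEQUENTIAL))
 *		perror("madvise");
 *	...
 *	if (madvise(buf, len, MADV_DONTNEED))
 *		perror("madvise");
 */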
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct * vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;
	size_t len;

	if (!madvise_behavior_valid(behavior))
		return error;

	write = madvise_need_mmap_write(behavior);
	if (write)
		down_write(&current->mm->mmap_sem);
	else
		down_read(&current->mm->mmap_sem);

	if (start & ~PAGE_MASK)
		goto out;
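	/* Round the length up to a whole number of pages. */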
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from a small negative value to zero */
	if (len_in && !len)
		goto out;

	end = start + len;
	if (end < start)
		goto out;

	error = 0;
	if (end == start)
		goto out;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_sem */
			vma = find_vma(current->mm, start);
	}
out:
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}