xref: /linux/tools/testing/selftests/mm/process_madv.c (revision da23ea194db94257123f1534d487f3cdc9b5626d)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 
3 #define _GNU_SOURCE
4 #include "../kselftest_harness.h"
5 #include <errno.h>
6 #include <setjmp.h>
7 #include <signal.h>
8 #include <stdbool.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <linux/mman.h>
13 #include <sys/syscall.h>
14 #include <unistd.h>
15 #include <sched.h>
16 #include "vm_util.h"
17 
18 #include "../pidfd/pidfd.h"
19 
FIXTURE(process_madvise)
{
	/* System page size in bytes, filled in by the fixture setup. */
	unsigned long page_size;

	/* PID of a helper process forked by a test; <= 0 means "none". */
	pid_t child_pid;

	/* pidfd referring to the helper process; negative means "not open". */
	int remote_pidfd;

	/* pidfd the self-targeting tests hand to process_madvise(). */
	int pidfd;
};
27 
FIXTURE_SETUP(process_madvise)28 FIXTURE_SETUP(process_madvise)
29 {
30 	self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
31 	self->pidfd = PIDFD_SELF;
32 	self->remote_pidfd = -1;
33 	self->child_pid = -1;
34 };
35 
FIXTURE_TEARDOWN_PARENT(process_madvise)36 FIXTURE_TEARDOWN_PARENT(process_madvise)
37 {
38 	/* This teardown is guaranteed to run, even if tests SKIP or ASSERT */
39 	if (self->child_pid > 0) {
40 		kill(self->child_pid, SIGKILL);
41 		waitpid(self->child_pid, NULL, 0);
42 	}
43 
44 	if (self->remote_pidfd >= 0)
45 		close(self->remote_pidfd);
46 }
47 
/*
 * Issue the raw process_madvise system call.
 *
 * All arguments are forwarded unchanged to syscall(__NR_process_madvise, ...)
 * and whatever syscall() returns is handed back to the caller.
 */
static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
				   size_t vlen, int advice, unsigned int flags)
{
	const long rc = syscall(__NR_process_madvise, pidfd, iovec, vlen,
				advice, flags);

	return rc;
}
53 
54 /*
55  * This test uses PIDFD_SELF to target the current process. The main
56  * goal is to verify the basic behavior of process_madvise() with
57  * a vector of non-contiguous memory ranges, not its cross-process
58  * capabilities.
59  */
TEST_F(process_madvise,basic)60 TEST_F(process_madvise, basic)
61 {
62 	const unsigned long pagesize = self->page_size;
63 	const int madvise_pages = 4;
64 	struct iovec vec[madvise_pages];
65 	int pidfd = self->pidfd;
66 	ssize_t ret;
67 	char *map;
68 
69 	/*
70 	 * Create a single large mapping. We will pick pages from this
71 	 * mapping to advise on. This ensures we test non-contiguous iovecs.
72 	 */
73 	map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
74 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
75 	if (map == MAP_FAILED)
76 		SKIP(return, "mmap failed, not enough memory.\n");
77 
78 	/* Fill the entire region with a known pattern. */
79 	memset(map, 'A', pagesize * 10);
80 
81 	/*
82 	 * Setup the iovec to point to 4 non-contiguous pages
83 	 * within the mapping.
84 	 */
85 	vec[0].iov_base = &map[0 * pagesize];
86 	vec[0].iov_len = pagesize;
87 	vec[1].iov_base = &map[3 * pagesize];
88 	vec[1].iov_len = pagesize;
89 	vec[2].iov_base = &map[5 * pagesize];
90 	vec[2].iov_len = pagesize;
91 	vec[3].iov_base = &map[8 * pagesize];
92 	vec[3].iov_len = pagesize;
93 
94 	ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
95 	if (ret == -1 && errno == EPERM)
96 		SKIP(return,
97 			   "process_madvise() unsupported or permission denied, try running as root.\n");
98 	else if (errno == EINVAL)
99 		SKIP(return,
100 			   "process_madvise() unsupported or parameter invalid, please check arguments.\n");
101 
102 	/* The call should succeed and report the total bytes processed. */
103 	ASSERT_EQ(ret, madvise_pages * pagesize);
104 
105 	/* Check that advised pages are now zero. */
106 	for (int i = 0; i < madvise_pages; i++) {
107 		char *advised_page = (char *)vec[i].iov_base;
108 
109 		/* Content must be 0, not 'A'. */
110 		ASSERT_EQ(*advised_page, '\0');
111 	}
112 
113 	/* Check that an un-advised page in between is still 'A'. */
114 	char *unadvised_page = &map[1 * pagesize];
115 
116 	for (int i = 0; i < pagesize; i++)
117 		ASSERT_EQ(unadvised_page[i], 'A');
118 
119 	/* Cleanup. */
120 	ASSERT_EQ(munmap(map, pagesize * 10), 0);
121 }
122 
/*
 * This test deterministically validates process_madvise() with MADV_COLLAPSE
 * on a remote process; other advice values are difficult to verify reliably.
 *
 * The test targets a memory region in a child process and focuses on the
 * result of the remote process_madvise() call: it only checks that the
 * address and length are accepted and that the returned byte count matches.
 * The correctness of MADV_COLLAPSE itself is covered by the khugepaged
 * selftests.
 */
TEST_F(process_madvise,remote_collapse)131 TEST_F(process_madvise, remote_collapse)
132 {
133 	const unsigned long pagesize = self->page_size;
134 	long huge_page_size;
135 	int pipe_info[2];
136 	ssize_t ret;
137 	struct iovec vec;
138 
139 	struct child_info {
140 		pid_t pid;
141 		void *map_addr;
142 	} info;
143 
144 	huge_page_size = read_pmd_pagesize();
145 	if (huge_page_size <= 0)
146 		SKIP(return, "Could not determine a valid huge page size.\n");
147 
148 	ASSERT_EQ(pipe(pipe_info), 0);
149 
150 	self->child_pid = fork();
151 	ASSERT_NE(self->child_pid, -1);
152 
153 	if (self->child_pid == 0) {
154 		char *map;
155 		size_t map_size = 2 * huge_page_size;
156 
157 		close(pipe_info[0]);
158 
159 		map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
160 			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
161 		ASSERT_NE(map, MAP_FAILED);
162 
163 		/* Fault in as small pages */
164 		for (size_t i = 0; i < map_size; i += pagesize)
165 			map[i] = 'A';
166 
167 		/* Send info and pause */
168 		info.pid = getpid();
169 		info.map_addr = map;
170 		ret = write(pipe_info[1], &info, sizeof(info));
171 		ASSERT_EQ(ret, sizeof(info));
172 		close(pipe_info[1]);
173 
174 		pause();
175 		exit(0);
176 	}
177 
178 	close(pipe_info[1]);
179 
180 	/* Receive child info */
181 	ret = read(pipe_info[0], &info, sizeof(info));
182 	if (ret <= 0) {
183 		waitpid(self->child_pid, NULL, 0);
184 		SKIP(return, "Failed to read child info from pipe.\n");
185 	}
186 	ASSERT_EQ(ret, sizeof(info));
187 	close(pipe_info[0]);
188 	self->child_pid = info.pid;
189 
190 	self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
191 	ASSERT_GE(self->remote_pidfd, 0);
192 
193 	vec.iov_base = info.map_addr;
194 	vec.iov_len = huge_page_size;
195 
196 	ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE,
197 				  0);
198 	if (ret == -1) {
199 		if (errno == EINVAL)
200 			SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n");
201 		else if (errno == EPERM)
202 			SKIP(return,
203 				   "No process_madvise() permissions, try running as root.\n");
204 		return;
205 	}
206 
207 	ASSERT_EQ(ret, huge_page_size);
208 }
209 
210 /*
211  * Test process_madvise() with a pidfd for a process that has already
212  * exited to ensure correct error handling.
213  */
TEST_F(process_madvise,exited_process_pidfd)214 TEST_F(process_madvise, exited_process_pidfd)
215 {
216 	const unsigned long pagesize = self->page_size;
217 	struct iovec vec;
218 	char *map;
219 	ssize_t ret;
220 
221 	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
222 		   0);
223 	if (map == MAP_FAILED)
224 		SKIP(return, "mmap failed, not enough memory.\n");
225 
226 	vec.iov_base = map;
227 	vec.iov_len = pagesize;
228 
229 	/*
230 	 * Using a pidfd for a process that has already exited should fail
231 	 * with ESRCH.
232 	 */
233 	self->child_pid = fork();
234 	ASSERT_NE(self->child_pid, -1);
235 
236 	if (self->child_pid == 0)
237 		exit(0);
238 
239 	self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
240 	ASSERT_GE(self->remote_pidfd, 0);
241 
242 	/* Wait for the child to ensure it has terminated. */
243 	waitpid(self->child_pid, NULL, 0);
244 
245 	ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED,
246 				  0);
247 	ASSERT_EQ(ret, -1);
248 	ASSERT_EQ(errno, ESRCH);
249 }
250 
251 /*
252  * Test process_madvise() with bad pidfds to ensure correct error
253  * handling.
254  */
TEST_F(process_madvise,bad_pidfd)255 TEST_F(process_madvise, bad_pidfd)
256 {
257 	const unsigned long pagesize = self->page_size;
258 	struct iovec vec;
259 	char *map;
260 	ssize_t ret;
261 
262 	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
263 		   0);
264 	if (map == MAP_FAILED)
265 		SKIP(return, "mmap failed, not enough memory.\n");
266 
267 	vec.iov_base = map;
268 	vec.iov_len = pagesize;
269 
270 	/* Using an invalid fd number (-1) should fail with EBADF. */
271 	ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
272 	ASSERT_EQ(ret, -1);
273 	ASSERT_EQ(errno, EBADF);
274 
275 	/*
276 	 * Using a valid fd that is not a pidfd (e.g. stdin) should fail
277 	 * with EBADF.
278 	 */
279 	ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
280 	ASSERT_EQ(ret, -1);
281 	ASSERT_EQ(errno, EBADF);
282 }
283 
284 /*
285  * Test that process_madvise() rejects vlen > UIO_MAXIOV.
286  * The kernel should return -EINVAL when the number of iovecs exceeds 1024.
287  */
TEST_F(process_madvise,invalid_vlen)288 TEST_F(process_madvise, invalid_vlen)
289 {
290 	const unsigned long pagesize = self->page_size;
291 	int pidfd = self->pidfd;
292 	struct iovec vec;
293 	char *map;
294 	ssize_t ret;
295 
296 	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
297 		   0);
298 	if (map == MAP_FAILED)
299 		SKIP(return, "mmap failed, not enough memory.\n");
300 
301 	vec.iov_base = map;
302 	vec.iov_len = pagesize;
303 
304 	ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0);
305 	ASSERT_EQ(ret, -1);
306 	ASSERT_EQ(errno, EINVAL);
307 
308 	/* Cleanup. */
309 	ASSERT_EQ(munmap(map, pagesize), 0);
310 }
311 
312 /*
313  * Test process_madvise() with an invalid flag value. Currently, only a flag
314  * value of 0 is supported. This test is reserved for the future, e.g., if
315  * synchronous flags are added.
316  */
TEST_F(process_madvise,flag)317 TEST_F(process_madvise, flag)
318 {
319 	const unsigned long pagesize = self->page_size;
320 	unsigned int invalid_flag;
321 	int pidfd = self->pidfd;
322 	struct iovec vec;
323 	char *map;
324 	ssize_t ret;
325 
326 	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
327 		   0);
328 	if (map == MAP_FAILED)
329 		SKIP(return, "mmap failed, not enough memory.\n");
330 
331 	vec.iov_base = map;
332 	vec.iov_len = pagesize;
333 
334 	invalid_flag = 0x80000000;
335 
336 	ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag);
337 	ASSERT_EQ(ret, -1);
338 	ASSERT_EQ(errno, EINVAL);
339 
340 	/* Cleanup. */
341 	ASSERT_EQ(munmap(map, pagesize), 0);
342 }
343 
344 TEST_HARNESS_MAIN
345