xref: /linux/tools/testing/selftests/mm/process_madv.c (revision da23ea194db94257123f1534d487f3cdc9b5626d)
1*b50e3788Swang lian // SPDX-License-Identifier: GPL-2.0-or-later
2*b50e3788Swang lian 
3*b50e3788Swang lian #define _GNU_SOURCE
4*b50e3788Swang lian #include "../kselftest_harness.h"
5*b50e3788Swang lian #include <errno.h>
6*b50e3788Swang lian #include <setjmp.h>
7*b50e3788Swang lian #include <signal.h>
8*b50e3788Swang lian #include <stdbool.h>
9*b50e3788Swang lian #include <stdio.h>
10*b50e3788Swang lian #include <stdlib.h>
11*b50e3788Swang lian #include <string.h>
12*b50e3788Swang lian #include <linux/mman.h>
13*b50e3788Swang lian #include <sys/syscall.h>
14*b50e3788Swang lian #include <unistd.h>
15*b50e3788Swang lian #include <sched.h>
16*b50e3788Swang lian #include "vm_util.h"
17*b50e3788Swang lian 
18*b50e3788Swang lian #include "../pidfd/pidfd.h"
19*b50e3788Swang lian 
/*
 * Per-test state shared by all process_madvise test cases.
 */
FIXTURE(process_madvise)
{
	/* System page size in bytes, filled in by the fixture setup. */
	unsigned long page_size;
	/* PID of a helper child forked by a test; -1 while there is none. */
	pid_t child_pid;
	/* pidfd opened on the helper child; -1 while not open. */
	int remote_pidfd;
	/* pidfd used to target the calling process (PIDFD_SELF). */
	int pidfd;
};
27*b50e3788Swang lian 
FIXTURE_SETUP(process_madvise)28*b50e3788Swang lian FIXTURE_SETUP(process_madvise)
29*b50e3788Swang lian {
30*b50e3788Swang lian 	self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
31*b50e3788Swang lian 	self->pidfd = PIDFD_SELF;
32*b50e3788Swang lian 	self->remote_pidfd = -1;
33*b50e3788Swang lian 	self->child_pid = -1;
34*b50e3788Swang lian };
35*b50e3788Swang lian 
FIXTURE_TEARDOWN_PARENT(process_madvise)36*b50e3788Swang lian FIXTURE_TEARDOWN_PARENT(process_madvise)
37*b50e3788Swang lian {
38*b50e3788Swang lian 	/* This teardown is guaranteed to run, even if tests SKIP or ASSERT */
39*b50e3788Swang lian 	if (self->child_pid > 0) {
40*b50e3788Swang lian 		kill(self->child_pid, SIGKILL);
41*b50e3788Swang lian 		waitpid(self->child_pid, NULL, 0);
42*b50e3788Swang lian 	}
43*b50e3788Swang lian 
44*b50e3788Swang lian 	if (self->remote_pidfd >= 0)
45*b50e3788Swang lian 		close(self->remote_pidfd);
46*b50e3788Swang lian }
47*b50e3788Swang lian 
/*
 * Invoke the process_madvise system call directly through syscall().
 *
 * All arguments are forwarded unchanged. The value returned by syscall()
 * is handed back to the caller as an ssize_t.
 */
static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
				   size_t vlen, int advice, unsigned int flags)
{
	const long rc = syscall(__NR_process_madvise, pidfd, iovec, vlen,
				advice, flags);

	return rc;
}
53*b50e3788Swang lian 
54*b50e3788Swang lian /*
55*b50e3788Swang lian  * This test uses PIDFD_SELF to target the current process. The main
56*b50e3788Swang lian  * goal is to verify the basic behavior of process_madvise() with
57*b50e3788Swang lian  * a vector of non-contiguous memory ranges, not its cross-process
58*b50e3788Swang lian  * capabilities.
59*b50e3788Swang lian  */
TEST_F(process_madvise,basic)60*b50e3788Swang lian TEST_F(process_madvise, basic)
61*b50e3788Swang lian {
62*b50e3788Swang lian 	const unsigned long pagesize = self->page_size;
63*b50e3788Swang lian 	const int madvise_pages = 4;
64*b50e3788Swang lian 	struct iovec vec[madvise_pages];
65*b50e3788Swang lian 	int pidfd = self->pidfd;
66*b50e3788Swang lian 	ssize_t ret;
67*b50e3788Swang lian 	char *map;
68*b50e3788Swang lian 
69*b50e3788Swang lian 	/*
70*b50e3788Swang lian 	 * Create a single large mapping. We will pick pages from this
71*b50e3788Swang lian 	 * mapping to advise on. This ensures we test non-contiguous iovecs.
72*b50e3788Swang lian 	 */
73*b50e3788Swang lian 	map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
74*b50e3788Swang lian 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
75*b50e3788Swang lian 	if (map == MAP_FAILED)
76*b50e3788Swang lian 		SKIP(return, "mmap failed, not enough memory.\n");
77*b50e3788Swang lian 
78*b50e3788Swang lian 	/* Fill the entire region with a known pattern. */
79*b50e3788Swang lian 	memset(map, 'A', pagesize * 10);
80*b50e3788Swang lian 
81*b50e3788Swang lian 	/*
82*b50e3788Swang lian 	 * Setup the iovec to point to 4 non-contiguous pages
83*b50e3788Swang lian 	 * within the mapping.
84*b50e3788Swang lian 	 */
85*b50e3788Swang lian 	vec[0].iov_base = &map[0 * pagesize];
86*b50e3788Swang lian 	vec[0].iov_len = pagesize;
87*b50e3788Swang lian 	vec[1].iov_base = &map[3 * pagesize];
88*b50e3788Swang lian 	vec[1].iov_len = pagesize;
89*b50e3788Swang lian 	vec[2].iov_base = &map[5 * pagesize];
90*b50e3788Swang lian 	vec[2].iov_len = pagesize;
91*b50e3788Swang lian 	vec[3].iov_base = &map[8 * pagesize];
92*b50e3788Swang lian 	vec[3].iov_len = pagesize;
93*b50e3788Swang lian 
94*b50e3788Swang lian 	ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
95*b50e3788Swang lian 	if (ret == -1 && errno == EPERM)
96*b50e3788Swang lian 		SKIP(return,
97*b50e3788Swang lian 			   "process_madvise() unsupported or permission denied, try running as root.\n");
98*b50e3788Swang lian 	else if (errno == EINVAL)
99*b50e3788Swang lian 		SKIP(return,
100*b50e3788Swang lian 			   "process_madvise() unsupported or parameter invalid, please check arguments.\n");
101*b50e3788Swang lian 
102*b50e3788Swang lian 	/* The call should succeed and report the total bytes processed. */
103*b50e3788Swang lian 	ASSERT_EQ(ret, madvise_pages * pagesize);
104*b50e3788Swang lian 
105*b50e3788Swang lian 	/* Check that advised pages are now zero. */
106*b50e3788Swang lian 	for (int i = 0; i < madvise_pages; i++) {
107*b50e3788Swang lian 		char *advised_page = (char *)vec[i].iov_base;
108*b50e3788Swang lian 
109*b50e3788Swang lian 		/* Content must be 0, not 'A'. */
110*b50e3788Swang lian 		ASSERT_EQ(*advised_page, '\0');
111*b50e3788Swang lian 	}
112*b50e3788Swang lian 
113*b50e3788Swang lian 	/* Check that an un-advised page in between is still 'A'. */
114*b50e3788Swang lian 	char *unadvised_page = &map[1 * pagesize];
115*b50e3788Swang lian 
116*b50e3788Swang lian 	for (int i = 0; i < pagesize; i++)
117*b50e3788Swang lian 		ASSERT_EQ(unadvised_page[i], 'A');
118*b50e3788Swang lian 
119*b50e3788Swang lian 	/* Cleanup. */
120*b50e3788Swang lian 	ASSERT_EQ(munmap(map, pagesize * 10), 0);
121*b50e3788Swang lian }
122*b50e3788Swang lian 
123*b50e3788Swang lian /*
124*b50e3788Swang lian  * This test deterministically validates process_madvise() with MADV_COLLAPSE
125*b50e3788Swang lian  * on a remote process, other advices are difficult to verify reliably.
126*b50e3788Swang lian  *
127*b50e3788Swang lian  * The test verifies that a memory region in a child process,
128*b50e3788Swang lian  * focus on process_madv remote result, only check addresses and lengths.
129*b50e3788Swang lian  * The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged.
130*b50e3788Swang lian  */
TEST_F(process_madvise,remote_collapse)131*b50e3788Swang lian TEST_F(process_madvise, remote_collapse)
132*b50e3788Swang lian {
133*b50e3788Swang lian 	const unsigned long pagesize = self->page_size;
134*b50e3788Swang lian 	long huge_page_size;
135*b50e3788Swang lian 	int pipe_info[2];
136*b50e3788Swang lian 	ssize_t ret;
137*b50e3788Swang lian 	struct iovec vec;
138*b50e3788Swang lian 
139*b50e3788Swang lian 	struct child_info {
140*b50e3788Swang lian 		pid_t pid;
141*b50e3788Swang lian 		void *map_addr;
142*b50e3788Swang lian 	} info;
143*b50e3788Swang lian 
144*b50e3788Swang lian 	huge_page_size = read_pmd_pagesize();
145*b50e3788Swang lian 	if (huge_page_size <= 0)
146*b50e3788Swang lian 		SKIP(return, "Could not determine a valid huge page size.\n");
147*b50e3788Swang lian 
148*b50e3788Swang lian 	ASSERT_EQ(pipe(pipe_info), 0);
149*b50e3788Swang lian 
150*b50e3788Swang lian 	self->child_pid = fork();
151*b50e3788Swang lian 	ASSERT_NE(self->child_pid, -1);
152*b50e3788Swang lian 
153*b50e3788Swang lian 	if (self->child_pid == 0) {
154*b50e3788Swang lian 		char *map;
155*b50e3788Swang lian 		size_t map_size = 2 * huge_page_size;
156*b50e3788Swang lian 
157*b50e3788Swang lian 		close(pipe_info[0]);
158*b50e3788Swang lian 
159*b50e3788Swang lian 		map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
160*b50e3788Swang lian 			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
161*b50e3788Swang lian 		ASSERT_NE(map, MAP_FAILED);
162*b50e3788Swang lian 
163*b50e3788Swang lian 		/* Fault in as small pages */
164*b50e3788Swang lian 		for (size_t i = 0; i < map_size; i += pagesize)
165*b50e3788Swang lian 			map[i] = 'A';
166*b50e3788Swang lian 
167*b50e3788Swang lian 		/* Send info and pause */
168*b50e3788Swang lian 		info.pid = getpid();
169*b50e3788Swang lian 		info.map_addr = map;
170*b50e3788Swang lian 		ret = write(pipe_info[1], &info, sizeof(info));
171*b50e3788Swang lian 		ASSERT_EQ(ret, sizeof(info));
172*b50e3788Swang lian 		close(pipe_info[1]);
173*b50e3788Swang lian 
174*b50e3788Swang lian 		pause();
175*b50e3788Swang lian 		exit(0);
176*b50e3788Swang lian 	}
177*b50e3788Swang lian 
178*b50e3788Swang lian 	close(pipe_info[1]);
179*b50e3788Swang lian 
180*b50e3788Swang lian 	/* Receive child info */
181*b50e3788Swang lian 	ret = read(pipe_info[0], &info, sizeof(info));
182*b50e3788Swang lian 	if (ret <= 0) {
183*b50e3788Swang lian 		waitpid(self->child_pid, NULL, 0);
184*b50e3788Swang lian 		SKIP(return, "Failed to read child info from pipe.\n");
185*b50e3788Swang lian 	}
186*b50e3788Swang lian 	ASSERT_EQ(ret, sizeof(info));
187*b50e3788Swang lian 	close(pipe_info[0]);
188*b50e3788Swang lian 	self->child_pid = info.pid;
189*b50e3788Swang lian 
190*b50e3788Swang lian 	self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
191*b50e3788Swang lian 	ASSERT_GE(self->remote_pidfd, 0);
192*b50e3788Swang lian 
193*b50e3788Swang lian 	vec.iov_base = info.map_addr;
194*b50e3788Swang lian 	vec.iov_len = huge_page_size;
195*b50e3788Swang lian 
196*b50e3788Swang lian 	ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE,
197*b50e3788Swang lian 				  0);
198*b50e3788Swang lian 	if (ret == -1) {
199*b50e3788Swang lian 		if (errno == EINVAL)
200*b50e3788Swang lian 			SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n");
201*b50e3788Swang lian 		else if (errno == EPERM)
202*b50e3788Swang lian 			SKIP(return,
203*b50e3788Swang lian 				   "No process_madvise() permissions, try running as root.\n");
204*b50e3788Swang lian 		return;
205*b50e3788Swang lian 	}
206*b50e3788Swang lian 
207*b50e3788Swang lian 	ASSERT_EQ(ret, huge_page_size);
208*b50e3788Swang lian }
209*b50e3788Swang lian 
210*b50e3788Swang lian /*
211*b50e3788Swang lian  * Test process_madvise() with a pidfd for a process that has already
212*b50e3788Swang lian  * exited to ensure correct error handling.
213*b50e3788Swang lian  */
TEST_F(process_madvise,exited_process_pidfd)214*b50e3788Swang lian TEST_F(process_madvise, exited_process_pidfd)
215*b50e3788Swang lian {
216*b50e3788Swang lian 	const unsigned long pagesize = self->page_size;
217*b50e3788Swang lian 	struct iovec vec;
218*b50e3788Swang lian 	char *map;
219*b50e3788Swang lian 	ssize_t ret;
220*b50e3788Swang lian 
221*b50e3788Swang lian 	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
222*b50e3788Swang lian 		   0);
223*b50e3788Swang lian 	if (map == MAP_FAILED)
224*b50e3788Swang lian 		SKIP(return, "mmap failed, not enough memory.\n");
225*b50e3788Swang lian 
226*b50e3788Swang lian 	vec.iov_base = map;
227*b50e3788Swang lian 	vec.iov_len = pagesize;
228*b50e3788Swang lian 
229*b50e3788Swang lian 	/*
230*b50e3788Swang lian 	 * Using a pidfd for a process that has already exited should fail
231*b50e3788Swang lian 	 * with ESRCH.
232*b50e3788Swang lian 	 */
233*b50e3788Swang lian 	self->child_pid = fork();
234*b50e3788Swang lian 	ASSERT_NE(self->child_pid, -1);
235*b50e3788Swang lian 
236*b50e3788Swang lian 	if (self->child_pid == 0)
237*b50e3788Swang lian 		exit(0);
238*b50e3788Swang lian 
239*b50e3788Swang lian 	self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
240*b50e3788Swang lian 	ASSERT_GE(self->remote_pidfd, 0);
241*b50e3788Swang lian 
242*b50e3788Swang lian 	/* Wait for the child to ensure it has terminated. */
243*b50e3788Swang lian 	waitpid(self->child_pid, NULL, 0);
244*b50e3788Swang lian 
245*b50e3788Swang lian 	ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED,
246*b50e3788Swang lian 				  0);
247*b50e3788Swang lian 	ASSERT_EQ(ret, -1);
248*b50e3788Swang lian 	ASSERT_EQ(errno, ESRCH);
249*b50e3788Swang lian }
250*b50e3788Swang lian 
251*b50e3788Swang lian /*
252*b50e3788Swang lian  * Test process_madvise() with bad pidfds to ensure correct error
253*b50e3788Swang lian  * handling.
254*b50e3788Swang lian  */
TEST_F(process_madvise,bad_pidfd)255*b50e3788Swang lian TEST_F(process_madvise, bad_pidfd)
256*b50e3788Swang lian {
257*b50e3788Swang lian 	const unsigned long pagesize = self->page_size;
258*b50e3788Swang lian 	struct iovec vec;
259*b50e3788Swang lian 	char *map;
260*b50e3788Swang lian 	ssize_t ret;
261*b50e3788Swang lian 
262*b50e3788Swang lian 	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
263*b50e3788Swang lian 		   0);
264*b50e3788Swang lian 	if (map == MAP_FAILED)
265*b50e3788Swang lian 		SKIP(return, "mmap failed, not enough memory.\n");
266*b50e3788Swang lian 
267*b50e3788Swang lian 	vec.iov_base = map;
268*b50e3788Swang lian 	vec.iov_len = pagesize;
269*b50e3788Swang lian 
270*b50e3788Swang lian 	/* Using an invalid fd number (-1) should fail with EBADF. */
271*b50e3788Swang lian 	ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
272*b50e3788Swang lian 	ASSERT_EQ(ret, -1);
273*b50e3788Swang lian 	ASSERT_EQ(errno, EBADF);
274*b50e3788Swang lian 
275*b50e3788Swang lian 	/*
276*b50e3788Swang lian 	 * Using a valid fd that is not a pidfd (e.g. stdin) should fail
277*b50e3788Swang lian 	 * with EBADF.
278*b50e3788Swang lian 	 */
279*b50e3788Swang lian 	ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
280*b50e3788Swang lian 	ASSERT_EQ(ret, -1);
281*b50e3788Swang lian 	ASSERT_EQ(errno, EBADF);
282*b50e3788Swang lian }
283*b50e3788Swang lian 
284*b50e3788Swang lian /*
285*b50e3788Swang lian  * Test that process_madvise() rejects vlen > UIO_MAXIOV.
286*b50e3788Swang lian  * The kernel should return -EINVAL when the number of iovecs exceeds 1024.
287*b50e3788Swang lian  */
TEST_F(process_madvise,invalid_vlen)288*b50e3788Swang lian TEST_F(process_madvise, invalid_vlen)
289*b50e3788Swang lian {
290*b50e3788Swang lian 	const unsigned long pagesize = self->page_size;
291*b50e3788Swang lian 	int pidfd = self->pidfd;
292*b50e3788Swang lian 	struct iovec vec;
293*b50e3788Swang lian 	char *map;
294*b50e3788Swang lian 	ssize_t ret;
295*b50e3788Swang lian 
296*b50e3788Swang lian 	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
297*b50e3788Swang lian 		   0);
298*b50e3788Swang lian 	if (map == MAP_FAILED)
299*b50e3788Swang lian 		SKIP(return, "mmap failed, not enough memory.\n");
300*b50e3788Swang lian 
301*b50e3788Swang lian 	vec.iov_base = map;
302*b50e3788Swang lian 	vec.iov_len = pagesize;
303*b50e3788Swang lian 
304*b50e3788Swang lian 	ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0);
305*b50e3788Swang lian 	ASSERT_EQ(ret, -1);
306*b50e3788Swang lian 	ASSERT_EQ(errno, EINVAL);
307*b50e3788Swang lian 
308*b50e3788Swang lian 	/* Cleanup. */
309*b50e3788Swang lian 	ASSERT_EQ(munmap(map, pagesize), 0);
310*b50e3788Swang lian }
311*b50e3788Swang lian 
312*b50e3788Swang lian /*
313*b50e3788Swang lian  * Test process_madvise() with an invalid flag value. Currently, only a flag
314*b50e3788Swang lian  * value of 0 is supported. This test is reserved for the future, e.g., if
315*b50e3788Swang lian  * synchronous flags are added.
316*b50e3788Swang lian  */
TEST_F(process_madvise,flag)317*b50e3788Swang lian TEST_F(process_madvise, flag)
318*b50e3788Swang lian {
319*b50e3788Swang lian 	const unsigned long pagesize = self->page_size;
320*b50e3788Swang lian 	unsigned int invalid_flag;
321*b50e3788Swang lian 	int pidfd = self->pidfd;
322*b50e3788Swang lian 	struct iovec vec;
323*b50e3788Swang lian 	char *map;
324*b50e3788Swang lian 	ssize_t ret;
325*b50e3788Swang lian 
326*b50e3788Swang lian 	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
327*b50e3788Swang lian 		   0);
328*b50e3788Swang lian 	if (map == MAP_FAILED)
329*b50e3788Swang lian 		SKIP(return, "mmap failed, not enough memory.\n");
330*b50e3788Swang lian 
331*b50e3788Swang lian 	vec.iov_base = map;
332*b50e3788Swang lian 	vec.iov_len = pagesize;
333*b50e3788Swang lian 
334*b50e3788Swang lian 	invalid_flag = 0x80000000;
335*b50e3788Swang lian 
336*b50e3788Swang lian 	ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag);
337*b50e3788Swang lian 	ASSERT_EQ(ret, -1);
338*b50e3788Swang lian 	ASSERT_EQ(errno, EINVAL);
339*b50e3788Swang lian 
340*b50e3788Swang lian 	/* Cleanup. */
341*b50e3788Swang lian 	ASSERT_EQ(munmap(map, pagesize), 0);
342*b50e3788Swang lian }
343*b50e3788Swang lian 
344*b50e3788Swang lian TEST_HARNESS_MAIN
345