1*b50e3788Swang lian // SPDX-License-Identifier: GPL-2.0-or-later
2*b50e3788Swang lian
3*b50e3788Swang lian #define _GNU_SOURCE
4*b50e3788Swang lian #include "../kselftest_harness.h"
5*b50e3788Swang lian #include <errno.h>
6*b50e3788Swang lian #include <setjmp.h>
7*b50e3788Swang lian #include <signal.h>
8*b50e3788Swang lian #include <stdbool.h>
9*b50e3788Swang lian #include <stdio.h>
10*b50e3788Swang lian #include <stdlib.h>
11*b50e3788Swang lian #include <string.h>
12*b50e3788Swang lian #include <linux/mman.h>
13*b50e3788Swang lian #include <sys/syscall.h>
14*b50e3788Swang lian #include <unistd.h>
15*b50e3788Swang lian #include <sched.h>
16*b50e3788Swang lian #include "vm_util.h"
17*b50e3788Swang lian
18*b50e3788Swang lian #include "../pidfd/pidfd.h"
19*b50e3788Swang lian
/* Per-test state shared by every process_madvise test case. */
FIXTURE(process_madvise)
{
	unsigned long page_size;	/* sysconf(_SC_PAGESIZE), set in setup */
	pid_t child_pid;		/* forked child for teardown to kill, or -1 */
	int remote_pidfd;		/* pidfd opened on the child, or -1 */
	int pidfd;			/* pidfd naming the caller (PIDFD_SELF) */
};
27*b50e3788Swang lian
FIXTURE_SETUP(process_madvise)28*b50e3788Swang lian FIXTURE_SETUP(process_madvise)
29*b50e3788Swang lian {
30*b50e3788Swang lian self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
31*b50e3788Swang lian self->pidfd = PIDFD_SELF;
32*b50e3788Swang lian self->remote_pidfd = -1;
33*b50e3788Swang lian self->child_pid = -1;
34*b50e3788Swang lian };
35*b50e3788Swang lian
FIXTURE_TEARDOWN_PARENT(process_madvise)36*b50e3788Swang lian FIXTURE_TEARDOWN_PARENT(process_madvise)
37*b50e3788Swang lian {
38*b50e3788Swang lian /* This teardown is guaranteed to run, even if tests SKIP or ASSERT */
39*b50e3788Swang lian if (self->child_pid > 0) {
40*b50e3788Swang lian kill(self->child_pid, SIGKILL);
41*b50e3788Swang lian waitpid(self->child_pid, NULL, 0);
42*b50e3788Swang lian }
43*b50e3788Swang lian
44*b50e3788Swang lian if (self->remote_pidfd >= 0)
45*b50e3788Swang lian close(self->remote_pidfd);
46*b50e3788Swang lian }
47*b50e3788Swang lian
/*
 * Raw process_madvise(2) wrapper: forwards all arguments unchanged to
 * syscall() and returns its result.
 */
static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
				   size_t vlen, int advice, unsigned int flags)
{
	long rc;

	rc = syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);

	return rc;
}
53*b50e3788Swang lian
54*b50e3788Swang lian /*
55*b50e3788Swang lian * This test uses PIDFD_SELF to target the current process. The main
56*b50e3788Swang lian * goal is to verify the basic behavior of process_madvise() with
57*b50e3788Swang lian * a vector of non-contiguous memory ranges, not its cross-process
58*b50e3788Swang lian * capabilities.
59*b50e3788Swang lian */
TEST_F(process_madvise,basic)60*b50e3788Swang lian TEST_F(process_madvise, basic)
61*b50e3788Swang lian {
62*b50e3788Swang lian const unsigned long pagesize = self->page_size;
63*b50e3788Swang lian const int madvise_pages = 4;
64*b50e3788Swang lian struct iovec vec[madvise_pages];
65*b50e3788Swang lian int pidfd = self->pidfd;
66*b50e3788Swang lian ssize_t ret;
67*b50e3788Swang lian char *map;
68*b50e3788Swang lian
69*b50e3788Swang lian /*
70*b50e3788Swang lian * Create a single large mapping. We will pick pages from this
71*b50e3788Swang lian * mapping to advise on. This ensures we test non-contiguous iovecs.
72*b50e3788Swang lian */
73*b50e3788Swang lian map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
74*b50e3788Swang lian MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
75*b50e3788Swang lian if (map == MAP_FAILED)
76*b50e3788Swang lian SKIP(return, "mmap failed, not enough memory.\n");
77*b50e3788Swang lian
78*b50e3788Swang lian /* Fill the entire region with a known pattern. */
79*b50e3788Swang lian memset(map, 'A', pagesize * 10);
80*b50e3788Swang lian
81*b50e3788Swang lian /*
82*b50e3788Swang lian * Setup the iovec to point to 4 non-contiguous pages
83*b50e3788Swang lian * within the mapping.
84*b50e3788Swang lian */
85*b50e3788Swang lian vec[0].iov_base = &map[0 * pagesize];
86*b50e3788Swang lian vec[0].iov_len = pagesize;
87*b50e3788Swang lian vec[1].iov_base = &map[3 * pagesize];
88*b50e3788Swang lian vec[1].iov_len = pagesize;
89*b50e3788Swang lian vec[2].iov_base = &map[5 * pagesize];
90*b50e3788Swang lian vec[2].iov_len = pagesize;
91*b50e3788Swang lian vec[3].iov_base = &map[8 * pagesize];
92*b50e3788Swang lian vec[3].iov_len = pagesize;
93*b50e3788Swang lian
94*b50e3788Swang lian ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
95*b50e3788Swang lian if (ret == -1 && errno == EPERM)
96*b50e3788Swang lian SKIP(return,
97*b50e3788Swang lian "process_madvise() unsupported or permission denied, try running as root.\n");
98*b50e3788Swang lian else if (errno == EINVAL)
99*b50e3788Swang lian SKIP(return,
100*b50e3788Swang lian "process_madvise() unsupported or parameter invalid, please check arguments.\n");
101*b50e3788Swang lian
102*b50e3788Swang lian /* The call should succeed and report the total bytes processed. */
103*b50e3788Swang lian ASSERT_EQ(ret, madvise_pages * pagesize);
104*b50e3788Swang lian
105*b50e3788Swang lian /* Check that advised pages are now zero. */
106*b50e3788Swang lian for (int i = 0; i < madvise_pages; i++) {
107*b50e3788Swang lian char *advised_page = (char *)vec[i].iov_base;
108*b50e3788Swang lian
109*b50e3788Swang lian /* Content must be 0, not 'A'. */
110*b50e3788Swang lian ASSERT_EQ(*advised_page, '\0');
111*b50e3788Swang lian }
112*b50e3788Swang lian
113*b50e3788Swang lian /* Check that an un-advised page in between is still 'A'. */
114*b50e3788Swang lian char *unadvised_page = &map[1 * pagesize];
115*b50e3788Swang lian
116*b50e3788Swang lian for (int i = 0; i < pagesize; i++)
117*b50e3788Swang lian ASSERT_EQ(unadvised_page[i], 'A');
118*b50e3788Swang lian
119*b50e3788Swang lian /* Cleanup. */
120*b50e3788Swang lian ASSERT_EQ(munmap(map, pagesize * 10), 0);
121*b50e3788Swang lian }
122*b50e3788Swang lian
123*b50e3788Swang lian /*
124*b50e3788Swang lian * This test deterministically validates process_madvise() with MADV_COLLAPSE
125*b50e3788Swang lian * on a remote process, other advices are difficult to verify reliably.
126*b50e3788Swang lian *
127*b50e3788Swang lian * The test verifies that a memory region in a child process,
128*b50e3788Swang lian * focus on process_madv remote result, only check addresses and lengths.
129*b50e3788Swang lian * The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged.
130*b50e3788Swang lian */
TEST_F(process_madvise,remote_collapse)131*b50e3788Swang lian TEST_F(process_madvise, remote_collapse)
132*b50e3788Swang lian {
133*b50e3788Swang lian const unsigned long pagesize = self->page_size;
134*b50e3788Swang lian long huge_page_size;
135*b50e3788Swang lian int pipe_info[2];
136*b50e3788Swang lian ssize_t ret;
137*b50e3788Swang lian struct iovec vec;
138*b50e3788Swang lian
139*b50e3788Swang lian struct child_info {
140*b50e3788Swang lian pid_t pid;
141*b50e3788Swang lian void *map_addr;
142*b50e3788Swang lian } info;
143*b50e3788Swang lian
144*b50e3788Swang lian huge_page_size = read_pmd_pagesize();
145*b50e3788Swang lian if (huge_page_size <= 0)
146*b50e3788Swang lian SKIP(return, "Could not determine a valid huge page size.\n");
147*b50e3788Swang lian
148*b50e3788Swang lian ASSERT_EQ(pipe(pipe_info), 0);
149*b50e3788Swang lian
150*b50e3788Swang lian self->child_pid = fork();
151*b50e3788Swang lian ASSERT_NE(self->child_pid, -1);
152*b50e3788Swang lian
153*b50e3788Swang lian if (self->child_pid == 0) {
154*b50e3788Swang lian char *map;
155*b50e3788Swang lian size_t map_size = 2 * huge_page_size;
156*b50e3788Swang lian
157*b50e3788Swang lian close(pipe_info[0]);
158*b50e3788Swang lian
159*b50e3788Swang lian map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
160*b50e3788Swang lian MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
161*b50e3788Swang lian ASSERT_NE(map, MAP_FAILED);
162*b50e3788Swang lian
163*b50e3788Swang lian /* Fault in as small pages */
164*b50e3788Swang lian for (size_t i = 0; i < map_size; i += pagesize)
165*b50e3788Swang lian map[i] = 'A';
166*b50e3788Swang lian
167*b50e3788Swang lian /* Send info and pause */
168*b50e3788Swang lian info.pid = getpid();
169*b50e3788Swang lian info.map_addr = map;
170*b50e3788Swang lian ret = write(pipe_info[1], &info, sizeof(info));
171*b50e3788Swang lian ASSERT_EQ(ret, sizeof(info));
172*b50e3788Swang lian close(pipe_info[1]);
173*b50e3788Swang lian
174*b50e3788Swang lian pause();
175*b50e3788Swang lian exit(0);
176*b50e3788Swang lian }
177*b50e3788Swang lian
178*b50e3788Swang lian close(pipe_info[1]);
179*b50e3788Swang lian
180*b50e3788Swang lian /* Receive child info */
181*b50e3788Swang lian ret = read(pipe_info[0], &info, sizeof(info));
182*b50e3788Swang lian if (ret <= 0) {
183*b50e3788Swang lian waitpid(self->child_pid, NULL, 0);
184*b50e3788Swang lian SKIP(return, "Failed to read child info from pipe.\n");
185*b50e3788Swang lian }
186*b50e3788Swang lian ASSERT_EQ(ret, sizeof(info));
187*b50e3788Swang lian close(pipe_info[0]);
188*b50e3788Swang lian self->child_pid = info.pid;
189*b50e3788Swang lian
190*b50e3788Swang lian self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
191*b50e3788Swang lian ASSERT_GE(self->remote_pidfd, 0);
192*b50e3788Swang lian
193*b50e3788Swang lian vec.iov_base = info.map_addr;
194*b50e3788Swang lian vec.iov_len = huge_page_size;
195*b50e3788Swang lian
196*b50e3788Swang lian ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE,
197*b50e3788Swang lian 0);
198*b50e3788Swang lian if (ret == -1) {
199*b50e3788Swang lian if (errno == EINVAL)
200*b50e3788Swang lian SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n");
201*b50e3788Swang lian else if (errno == EPERM)
202*b50e3788Swang lian SKIP(return,
203*b50e3788Swang lian "No process_madvise() permissions, try running as root.\n");
204*b50e3788Swang lian return;
205*b50e3788Swang lian }
206*b50e3788Swang lian
207*b50e3788Swang lian ASSERT_EQ(ret, huge_page_size);
208*b50e3788Swang lian }
209*b50e3788Swang lian
210*b50e3788Swang lian /*
211*b50e3788Swang lian * Test process_madvise() with a pidfd for a process that has already
212*b50e3788Swang lian * exited to ensure correct error handling.
213*b50e3788Swang lian */
TEST_F(process_madvise,exited_process_pidfd)214*b50e3788Swang lian TEST_F(process_madvise, exited_process_pidfd)
215*b50e3788Swang lian {
216*b50e3788Swang lian const unsigned long pagesize = self->page_size;
217*b50e3788Swang lian struct iovec vec;
218*b50e3788Swang lian char *map;
219*b50e3788Swang lian ssize_t ret;
220*b50e3788Swang lian
221*b50e3788Swang lian map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
222*b50e3788Swang lian 0);
223*b50e3788Swang lian if (map == MAP_FAILED)
224*b50e3788Swang lian SKIP(return, "mmap failed, not enough memory.\n");
225*b50e3788Swang lian
226*b50e3788Swang lian vec.iov_base = map;
227*b50e3788Swang lian vec.iov_len = pagesize;
228*b50e3788Swang lian
229*b50e3788Swang lian /*
230*b50e3788Swang lian * Using a pidfd for a process that has already exited should fail
231*b50e3788Swang lian * with ESRCH.
232*b50e3788Swang lian */
233*b50e3788Swang lian self->child_pid = fork();
234*b50e3788Swang lian ASSERT_NE(self->child_pid, -1);
235*b50e3788Swang lian
236*b50e3788Swang lian if (self->child_pid == 0)
237*b50e3788Swang lian exit(0);
238*b50e3788Swang lian
239*b50e3788Swang lian self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
240*b50e3788Swang lian ASSERT_GE(self->remote_pidfd, 0);
241*b50e3788Swang lian
242*b50e3788Swang lian /* Wait for the child to ensure it has terminated. */
243*b50e3788Swang lian waitpid(self->child_pid, NULL, 0);
244*b50e3788Swang lian
245*b50e3788Swang lian ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED,
246*b50e3788Swang lian 0);
247*b50e3788Swang lian ASSERT_EQ(ret, -1);
248*b50e3788Swang lian ASSERT_EQ(errno, ESRCH);
249*b50e3788Swang lian }
250*b50e3788Swang lian
251*b50e3788Swang lian /*
252*b50e3788Swang lian * Test process_madvise() with bad pidfds to ensure correct error
253*b50e3788Swang lian * handling.
254*b50e3788Swang lian */
TEST_F(process_madvise,bad_pidfd)255*b50e3788Swang lian TEST_F(process_madvise, bad_pidfd)
256*b50e3788Swang lian {
257*b50e3788Swang lian const unsigned long pagesize = self->page_size;
258*b50e3788Swang lian struct iovec vec;
259*b50e3788Swang lian char *map;
260*b50e3788Swang lian ssize_t ret;
261*b50e3788Swang lian
262*b50e3788Swang lian map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
263*b50e3788Swang lian 0);
264*b50e3788Swang lian if (map == MAP_FAILED)
265*b50e3788Swang lian SKIP(return, "mmap failed, not enough memory.\n");
266*b50e3788Swang lian
267*b50e3788Swang lian vec.iov_base = map;
268*b50e3788Swang lian vec.iov_len = pagesize;
269*b50e3788Swang lian
270*b50e3788Swang lian /* Using an invalid fd number (-1) should fail with EBADF. */
271*b50e3788Swang lian ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
272*b50e3788Swang lian ASSERT_EQ(ret, -1);
273*b50e3788Swang lian ASSERT_EQ(errno, EBADF);
274*b50e3788Swang lian
275*b50e3788Swang lian /*
276*b50e3788Swang lian * Using a valid fd that is not a pidfd (e.g. stdin) should fail
277*b50e3788Swang lian * with EBADF.
278*b50e3788Swang lian */
279*b50e3788Swang lian ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
280*b50e3788Swang lian ASSERT_EQ(ret, -1);
281*b50e3788Swang lian ASSERT_EQ(errno, EBADF);
282*b50e3788Swang lian }
283*b50e3788Swang lian
284*b50e3788Swang lian /*
285*b50e3788Swang lian * Test that process_madvise() rejects vlen > UIO_MAXIOV.
286*b50e3788Swang lian * The kernel should return -EINVAL when the number of iovecs exceeds 1024.
287*b50e3788Swang lian */
TEST_F(process_madvise,invalid_vlen)288*b50e3788Swang lian TEST_F(process_madvise, invalid_vlen)
289*b50e3788Swang lian {
290*b50e3788Swang lian const unsigned long pagesize = self->page_size;
291*b50e3788Swang lian int pidfd = self->pidfd;
292*b50e3788Swang lian struct iovec vec;
293*b50e3788Swang lian char *map;
294*b50e3788Swang lian ssize_t ret;
295*b50e3788Swang lian
296*b50e3788Swang lian map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
297*b50e3788Swang lian 0);
298*b50e3788Swang lian if (map == MAP_FAILED)
299*b50e3788Swang lian SKIP(return, "mmap failed, not enough memory.\n");
300*b50e3788Swang lian
301*b50e3788Swang lian vec.iov_base = map;
302*b50e3788Swang lian vec.iov_len = pagesize;
303*b50e3788Swang lian
304*b50e3788Swang lian ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0);
305*b50e3788Swang lian ASSERT_EQ(ret, -1);
306*b50e3788Swang lian ASSERT_EQ(errno, EINVAL);
307*b50e3788Swang lian
308*b50e3788Swang lian /* Cleanup. */
309*b50e3788Swang lian ASSERT_EQ(munmap(map, pagesize), 0);
310*b50e3788Swang lian }
311*b50e3788Swang lian
312*b50e3788Swang lian /*
313*b50e3788Swang lian * Test process_madvise() with an invalid flag value. Currently, only a flag
314*b50e3788Swang lian * value of 0 is supported. This test is reserved for the future, e.g., if
315*b50e3788Swang lian * synchronous flags are added.
316*b50e3788Swang lian */
TEST_F(process_madvise,flag)317*b50e3788Swang lian TEST_F(process_madvise, flag)
318*b50e3788Swang lian {
319*b50e3788Swang lian const unsigned long pagesize = self->page_size;
320*b50e3788Swang lian unsigned int invalid_flag;
321*b50e3788Swang lian int pidfd = self->pidfd;
322*b50e3788Swang lian struct iovec vec;
323*b50e3788Swang lian char *map;
324*b50e3788Swang lian ssize_t ret;
325*b50e3788Swang lian
326*b50e3788Swang lian map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
327*b50e3788Swang lian 0);
328*b50e3788Swang lian if (map == MAP_FAILED)
329*b50e3788Swang lian SKIP(return, "mmap failed, not enough memory.\n");
330*b50e3788Swang lian
331*b50e3788Swang lian vec.iov_base = map;
332*b50e3788Swang lian vec.iov_len = pagesize;
333*b50e3788Swang lian
334*b50e3788Swang lian invalid_flag = 0x80000000;
335*b50e3788Swang lian
336*b50e3788Swang lian ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag);
337*b50e3788Swang lian ASSERT_EQ(ret, -1);
338*b50e3788Swang lian ASSERT_EQ(errno, EINVAL);
339*b50e3788Swang lian
340*b50e3788Swang lian /* Cleanup. */
341*b50e3788Swang lian ASSERT_EQ(munmap(map, pagesize), 0);
342*b50e3788Swang lian }
343*b50e3788Swang lian
344*b50e3788Swang lian TEST_HARNESS_MAIN
345