// SPDX-License-Identifier: GPL-2.0-or-later

#define _GNU_SOURCE
#include "../kselftest_harness.h"
#include <errno.h>
#include <setjmp.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <linux/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <sched.h>
#include "vm_util.h"

#include "../pidfd/pidfd.h"

FIXTURE(process_madvise)
{
	unsigned long page_size;
	pid_t child_pid;
	int remote_pidfd;
	int pidfd;
};

FIXTURE_SETUP(process_madvise)
{
	self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
	self->pidfd = PIDFD_SELF;
	self->remote_pidfd = -1;
	self->child_pid = -1;
};

FIXTURE_TEARDOWN_PARENT(process_madvise)
{
	/* This teardown is guaranteed to run, even if tests SKIP or ASSERT */
	if (self->child_pid > 0) {
		kill(self->child_pid, SIGKILL);
		waitpid(self->child_pid, NULL, 0);
	}

	if (self->remote_pidfd >= 0)
		close(self->remote_pidfd);
}

static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
				   size_t vlen, int advice, unsigned int flags)
{
	return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags);
}

/*
 * This test uses PIDFD_SELF to target the current process. The main
 * goal is to verify the basic behavior of process_madvise() with
 * a vector of non-contiguous memory ranges, not its cross-process
 * capabilities.
 */
TEST_F(process_madvise, basic)
{
	const unsigned long pagesize = self->page_size;
	const int madvise_pages = 4;
	struct iovec vec[madvise_pages];
	int pidfd = self->pidfd;
	ssize_t ret;
	char *map;

	/*
	 * Create a single large mapping. We will pick pages from this
	 * mapping to advise on. This ensures we test non-contiguous iovecs.
	 */
	map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	/* Fill the entire region with a known pattern. */
	memset(map, 'A', pagesize * 10);

	/*
	 * Set up the iovec to point to 4 non-contiguous pages
	 * within the mapping.
	 */
	vec[0].iov_base = &map[0 * pagesize];
	vec[0].iov_len = pagesize;
	vec[1].iov_base = &map[3 * pagesize];
	vec[1].iov_len = pagesize;
	vec[2].iov_base = &map[5 * pagesize];
	vec[2].iov_len = pagesize;
	vec[3].iov_base = &map[8 * pagesize];
	vec[3].iov_len = pagesize;

	ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0);
	if (ret == -1 && errno == EPERM)
		SKIP(return,
		     "process_madvise() unsupported or permission denied, try running as root.\n");
	else if (ret == -1 && errno == EINVAL)
		SKIP(return,
		     "process_madvise() unsupported or parameter invalid, please check arguments.\n");

	/* The call should succeed and report the total bytes processed. */
	ASSERT_EQ(ret, madvise_pages * pagesize);

	/* Check that advised pages are now zero. */
	for (int i = 0; i < madvise_pages; i++) {
		char *advised_page = (char *)vec[i].iov_base;

		/* Content must be 0, not 'A'. */
		ASSERT_EQ(*advised_page, '\0');
	}

	/* Check that an un-advised page in between is still 'A'. */
	char *unadvised_page = &map[1 * pagesize];

	for (int i = 0; i < pagesize; i++)
		ASSERT_EQ(unadvised_page[i], 'A');

	/* Cleanup. */
	ASSERT_EQ(munmap(map, pagesize * 10), 0);
}

/*
 * This test deterministically validates process_madvise() with
 * MADV_COLLAPSE on a remote process; other advice values are difficult
 * to verify reliably.
 *
 * It only checks that the remote call succeeds for the given address
 * and length. The correctness of MADV_COLLAPSE itself is covered by the
 * khugepaged selftests.
 */
TEST_F(process_madvise, remote_collapse)
{
	const unsigned long pagesize = self->page_size;
	long huge_page_size;
	int pipe_info[2];
	ssize_t ret;
	struct iovec vec;

	struct child_info {
		pid_t pid;
		void *map_addr;
	} info;

	huge_page_size = read_pmd_pagesize();
	if (huge_page_size <= 0)
		SKIP(return, "Could not determine a valid huge page size.\n");

	ASSERT_EQ(pipe(pipe_info), 0);

	self->child_pid = fork();
	ASSERT_NE(self->child_pid, -1);

	if (self->child_pid == 0) {
		char *map;
		size_t map_size = 2 * huge_page_size;

		close(pipe_info[0]);

		map = mmap(NULL, map_size, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		ASSERT_NE(map, MAP_FAILED);

		/* Fault in as small pages */
		for (size_t i = 0; i < map_size; i += pagesize)
			map[i] = 'A';

		/* Send info and pause */
		info.pid = getpid();
		info.map_addr = map;
		ret = write(pipe_info[1], &info, sizeof(info));
		ASSERT_EQ(ret, sizeof(info));
		close(pipe_info[1]);

		pause();
		exit(0);
	}

	close(pipe_info[1]);

	/* Receive child info */
	ret = read(pipe_info[0], &info, sizeof(info));
	if (ret <= 0) {
		waitpid(self->child_pid, NULL, 0);
		SKIP(return, "Failed to read child info from pipe.\n");
	}
	ASSERT_EQ(ret, sizeof(info));
	close(pipe_info[0]);
	self->child_pid = info.pid;

	self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
	ASSERT_GE(self->remote_pidfd, 0);

	vec.iov_base = info.map_addr;
	vec.iov_len = huge_page_size;

	ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE,
				  0);
	if (ret == -1) {
		if (errno == EINVAL)
			SKIP(return,
			     "process_madvise() with MADV_COLLAPSE is not supported.\n");
		else if (errno == EPERM)
			SKIP(return,
			     "No process_madvise() permissions, try running as root.\n");
		return;
	}

	ASSERT_EQ(ret, huge_page_size);
}

/*
 * Test process_madvise() with a pidfd for a process that has already
 * exited to ensure correct error handling.
 */
TEST_F(process_madvise, exited_process_pidfd)
{
	const unsigned long pagesize = self->page_size;
	struct iovec vec;
	char *map;
	ssize_t ret;

	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
		   0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	vec.iov_base = map;
	vec.iov_len = pagesize;

	/*
	 * Using a pidfd for a process that has already exited should fail
	 * with ESRCH.
	 */
	self->child_pid = fork();
	ASSERT_NE(self->child_pid, -1);

	if (self->child_pid == 0)
		exit(0);

	self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0);
	ASSERT_GE(self->remote_pidfd, 0);

	/* Wait for the child to ensure it has terminated. */
	waitpid(self->child_pid, NULL, 0);

	ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED,
				  0);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, ESRCH);
}

/*
 * Test process_madvise() with bad pidfds to ensure correct error
 * handling.
 */
TEST_F(process_madvise, bad_pidfd)
{
	const unsigned long pagesize = self->page_size;
	struct iovec vec;
	char *map;
	ssize_t ret;

	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
		   0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	vec.iov_base = map;
	vec.iov_len = pagesize;

	/* Using an invalid fd number (-1) should fail with EBADF. */
	ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EBADF);

	/*
	 * Using a valid fd that is not a pidfd (e.g. stdin) should fail
	 * with EBADF.
	 */
	ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EBADF);
}

/*
 * Test that process_madvise() rejects vlen > UIO_MAXIOV.
 * The kernel should return -EINVAL when the number of iovecs exceeds 1024.
 */
TEST_F(process_madvise, invalid_vlen)
{
	const unsigned long pagesize = self->page_size;
	int pidfd = self->pidfd;
	struct iovec vec;
	char *map;
	ssize_t ret;

	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
		   0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	vec.iov_base = map;
	vec.iov_len = pagesize;

	ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EINVAL);

	/* Cleanup. */
	ASSERT_EQ(munmap(map, pagesize), 0);
}

/*
 * Test process_madvise() with an invalid flag value. Currently, only a flag
 * value of 0 is supported. This test is reserved for the future, e.g., if
 * synchronous flags are added.
 */
TEST_F(process_madvise, flag)
{
	const unsigned long pagesize = self->page_size;
	unsigned int invalid_flag;
	int pidfd = self->pidfd;
	struct iovec vec;
	char *map;
	ssize_t ret;

	map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1,
		   0);
	if (map == MAP_FAILED)
		SKIP(return, "mmap failed, not enough memory.\n");

	vec.iov_base = map;
	vec.iov_len = pagesize;

	invalid_flag = 0x80000000;

	ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag);
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EINVAL);

	/* Cleanup. */
	ASSERT_EQ(munmap(map, pagesize), 0);
}

TEST_HARNESS_MAIN