1*b50e3788Swang lian // SPDX-License-Identifier: GPL-2.0-or-later 2*b50e3788Swang lian 3*b50e3788Swang lian #define _GNU_SOURCE 4*b50e3788Swang lian #include "../kselftest_harness.h" 5*b50e3788Swang lian #include <errno.h> 6*b50e3788Swang lian #include <setjmp.h> 7*b50e3788Swang lian #include <signal.h> 8*b50e3788Swang lian #include <stdbool.h> 9*b50e3788Swang lian #include <stdio.h> 10*b50e3788Swang lian #include <stdlib.h> 11*b50e3788Swang lian #include <string.h> 12*b50e3788Swang lian #include <linux/mman.h> 13*b50e3788Swang lian #include <sys/syscall.h> 14*b50e3788Swang lian #include <unistd.h> 15*b50e3788Swang lian #include <sched.h> 16*b50e3788Swang lian #include "vm_util.h" 17*b50e3788Swang lian 18*b50e3788Swang lian #include "../pidfd/pidfd.h" 19*b50e3788Swang lian 20*b50e3788Swang lian FIXTURE(process_madvise) 21*b50e3788Swang lian { 22*b50e3788Swang lian unsigned long page_size; 23*b50e3788Swang lian pid_t child_pid; 24*b50e3788Swang lian int remote_pidfd; 25*b50e3788Swang lian int pidfd; 26*b50e3788Swang lian }; 27*b50e3788Swang lian 28*b50e3788Swang lian FIXTURE_SETUP(process_madvise) 29*b50e3788Swang lian { 30*b50e3788Swang lian self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); 31*b50e3788Swang lian self->pidfd = PIDFD_SELF; 32*b50e3788Swang lian self->remote_pidfd = -1; 33*b50e3788Swang lian self->child_pid = -1; 34*b50e3788Swang lian }; 35*b50e3788Swang lian 36*b50e3788Swang lian FIXTURE_TEARDOWN_PARENT(process_madvise) 37*b50e3788Swang lian { 38*b50e3788Swang lian /* This teardown is guaranteed to run, even if tests SKIP or ASSERT */ 39*b50e3788Swang lian if (self->child_pid > 0) { 40*b50e3788Swang lian kill(self->child_pid, SIGKILL); 41*b50e3788Swang lian waitpid(self->child_pid, NULL, 0); 42*b50e3788Swang lian } 43*b50e3788Swang lian 44*b50e3788Swang lian if (self->remote_pidfd >= 0) 45*b50e3788Swang lian close(self->remote_pidfd); 46*b50e3788Swang lian } 47*b50e3788Swang lian 48*b50e3788Swang lian static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, 49*b50e3788Swang lian size_t vlen, int advice, unsigned int flags) 50*b50e3788Swang lian { 51*b50e3788Swang lian return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags); 52*b50e3788Swang lian } 53*b50e3788Swang lian 54*b50e3788Swang lian /* 55*b50e3788Swang lian * This test uses PIDFD_SELF to target the current process. The main 56*b50e3788Swang lian * goal is to verify the basic behavior of process_madvise() with 57*b50e3788Swang lian * a vector of non-contiguous memory ranges, not its cross-process 58*b50e3788Swang lian * capabilities. 59*b50e3788Swang lian */ 60*b50e3788Swang lian TEST_F(process_madvise, basic) 61*b50e3788Swang lian { 62*b50e3788Swang lian const unsigned long pagesize = self->page_size; 63*b50e3788Swang lian const int madvise_pages = 4; 64*b50e3788Swang lian struct iovec vec[madvise_pages]; 65*b50e3788Swang lian int pidfd = self->pidfd; 66*b50e3788Swang lian ssize_t ret; 67*b50e3788Swang lian char *map; 68*b50e3788Swang lian 69*b50e3788Swang lian /* 70*b50e3788Swang lian * Create a single large mapping. We will pick pages from this 71*b50e3788Swang lian * mapping to advise on. This ensures we test non-contiguous iovecs. 72*b50e3788Swang lian */ 73*b50e3788Swang lian map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE, 74*b50e3788Swang lian MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 75*b50e3788Swang lian if (map == MAP_FAILED) 76*b50e3788Swang lian SKIP(return, "mmap failed, not enough memory.\n"); 77*b50e3788Swang lian 78*b50e3788Swang lian /* Fill the entire region with a known pattern. */ 79*b50e3788Swang lian memset(map, 'A', pagesize * 10); 80*b50e3788Swang lian 81*b50e3788Swang lian /* 82*b50e3788Swang lian * Setup the iovec to point to 4 non-contiguous pages 83*b50e3788Swang lian * within the mapping. 84*b50e3788Swang lian */ 85*b50e3788Swang lian vec[0].iov_base = &map[0 * pagesize]; 86*b50e3788Swang lian vec[0].iov_len = pagesize; 87*b50e3788Swang lian vec[1].iov_base = &map[3 * pagesize]; 88*b50e3788Swang lian vec[1].iov_len = pagesize; 89*b50e3788Swang lian vec[2].iov_base = &map[5 * pagesize]; 90*b50e3788Swang lian vec[2].iov_len = pagesize; 91*b50e3788Swang lian vec[3].iov_base = &map[8 * pagesize]; 92*b50e3788Swang lian vec[3].iov_len = pagesize; 93*b50e3788Swang lian 94*b50e3788Swang lian ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0); 95*b50e3788Swang lian if (ret == -1 && errno == EPERM) 96*b50e3788Swang lian SKIP(return, 97*b50e3788Swang lian "process_madvise() unsupported or permission denied, try running as root.\n"); 98*b50e3788Swang lian else if (errno == EINVAL) 99*b50e3788Swang lian SKIP(return, 100*b50e3788Swang lian "process_madvise() unsupported or parameter invalid, please check arguments.\n"); 101*b50e3788Swang lian 102*b50e3788Swang lian /* The call should succeed and report the total bytes processed. */ 103*b50e3788Swang lian ASSERT_EQ(ret, madvise_pages * pagesize); 104*b50e3788Swang lian 105*b50e3788Swang lian /* Check that advised pages are now zero. */ 106*b50e3788Swang lian for (int i = 0; i < madvise_pages; i++) { 107*b50e3788Swang lian char *advised_page = (char *)vec[i].iov_base; 108*b50e3788Swang lian 109*b50e3788Swang lian /* Content must be 0, not 'A'. */ 110*b50e3788Swang lian ASSERT_EQ(*advised_page, '\0'); 111*b50e3788Swang lian } 112*b50e3788Swang lian 113*b50e3788Swang lian /* Check that an un-advised page in between is still 'A'. */ 114*b50e3788Swang lian char *unadvised_page = &map[1 * pagesize]; 115*b50e3788Swang lian 116*b50e3788Swang lian for (int i = 0; i < pagesize; i++) 117*b50e3788Swang lian ASSERT_EQ(unadvised_page[i], 'A'); 118*b50e3788Swang lian 119*b50e3788Swang lian /* Cleanup. */ 120*b50e3788Swang lian ASSERT_EQ(munmap(map, pagesize * 10), 0); 121*b50e3788Swang lian } 122*b50e3788Swang lian 123*b50e3788Swang lian /* 124*b50e3788Swang lian * This test deterministically validates process_madvise() with MADV_COLLAPSE 125*b50e3788Swang lian * on a remote process, other advices are difficult to verify reliably. 126*b50e3788Swang lian * 127*b50e3788Swang lian * The test verifies that a memory region in a child process, 128*b50e3788Swang lian * focus on process_madv remote result, only check addresses and lengths. 129*b50e3788Swang lian * The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged. 130*b50e3788Swang lian */ 131*b50e3788Swang lian TEST_F(process_madvise, remote_collapse) 132*b50e3788Swang lian { 133*b50e3788Swang lian const unsigned long pagesize = self->page_size; 134*b50e3788Swang lian long huge_page_size; 135*b50e3788Swang lian int pipe_info[2]; 136*b50e3788Swang lian ssize_t ret; 137*b50e3788Swang lian struct iovec vec; 138*b50e3788Swang lian 139*b50e3788Swang lian struct child_info { 140*b50e3788Swang lian pid_t pid; 141*b50e3788Swang lian void *map_addr; 142*b50e3788Swang lian } info; 143*b50e3788Swang lian 144*b50e3788Swang lian huge_page_size = read_pmd_pagesize(); 145*b50e3788Swang lian if (huge_page_size <= 0) 146*b50e3788Swang lian SKIP(return, "Could not determine a valid huge page size.\n"); 147*b50e3788Swang lian 148*b50e3788Swang lian ASSERT_EQ(pipe(pipe_info), 0); 149*b50e3788Swang lian 150*b50e3788Swang lian self->child_pid = fork(); 151*b50e3788Swang lian ASSERT_NE(self->child_pid, -1); 152*b50e3788Swang lian 153*b50e3788Swang lian if (self->child_pid == 0) { 154*b50e3788Swang lian char *map; 155*b50e3788Swang lian size_t map_size = 2 * huge_page_size; 156*b50e3788Swang lian 157*b50e3788Swang lian close(pipe_info[0]); 158*b50e3788Swang lian 159*b50e3788Swang lian map = mmap(NULL, map_size, PROT_READ | PROT_WRITE, 160*b50e3788Swang lian MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 161*b50e3788Swang lian ASSERT_NE(map, MAP_FAILED); 162*b50e3788Swang lian 163*b50e3788Swang lian /* Fault in as small pages */ 164*b50e3788Swang lian for (size_t i = 0; i < map_size; i += pagesize) 165*b50e3788Swang lian map[i] = 'A'; 166*b50e3788Swang lian 167*b50e3788Swang lian /* Send info and pause */ 168*b50e3788Swang lian info.pid = getpid(); 169*b50e3788Swang lian info.map_addr = map; 170*b50e3788Swang lian ret = write(pipe_info[1], &info, sizeof(info)); 171*b50e3788Swang lian ASSERT_EQ(ret, sizeof(info)); 172*b50e3788Swang lian close(pipe_info[1]); 173*b50e3788Swang lian 174*b50e3788Swang lian pause(); 175*b50e3788Swang lian exit(0); 176*b50e3788Swang lian } 177*b50e3788Swang lian 178*b50e3788Swang lian close(pipe_info[1]); 179*b50e3788Swang lian 180*b50e3788Swang lian /* Receive child info */ 181*b50e3788Swang lian ret = read(pipe_info[0], &info, sizeof(info)); 182*b50e3788Swang lian if (ret <= 0) { 183*b50e3788Swang lian waitpid(self->child_pid, NULL, 0); 184*b50e3788Swang lian SKIP(return, "Failed to read child info from pipe.\n"); 185*b50e3788Swang lian } 186*b50e3788Swang lian ASSERT_EQ(ret, sizeof(info)); 187*b50e3788Swang lian close(pipe_info[0]); 188*b50e3788Swang lian self->child_pid = info.pid; 189*b50e3788Swang lian 190*b50e3788Swang lian self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); 191*b50e3788Swang lian ASSERT_GE(self->remote_pidfd, 0); 192*b50e3788Swang lian 193*b50e3788Swang lian vec.iov_base = info.map_addr; 194*b50e3788Swang lian vec.iov_len = huge_page_size; 195*b50e3788Swang lian 196*b50e3788Swang lian ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE, 197*b50e3788Swang lian 0); 198*b50e3788Swang lian if (ret == -1) { 199*b50e3788Swang lian if (errno == EINVAL) 200*b50e3788Swang lian SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n"); 201*b50e3788Swang lian else if (errno == EPERM) 202*b50e3788Swang lian SKIP(return, 203*b50e3788Swang lian "No process_madvise() permissions, try running as root.\n"); 204*b50e3788Swang lian return; 205*b50e3788Swang lian } 206*b50e3788Swang lian 207*b50e3788Swang lian ASSERT_EQ(ret, huge_page_size); 208*b50e3788Swang lian } 209*b50e3788Swang lian 210*b50e3788Swang lian /* 211*b50e3788Swang lian * Test process_madvise() with a pidfd for a process that has already 212*b50e3788Swang lian * exited to ensure correct error handling. 213*b50e3788Swang lian */ 214*b50e3788Swang lian TEST_F(process_madvise, exited_process_pidfd) 215*b50e3788Swang lian { 216*b50e3788Swang lian const unsigned long pagesize = self->page_size; 217*b50e3788Swang lian struct iovec vec; 218*b50e3788Swang lian char *map; 219*b50e3788Swang lian ssize_t ret; 220*b50e3788Swang lian 221*b50e3788Swang lian map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 222*b50e3788Swang lian 0); 223*b50e3788Swang lian if (map == MAP_FAILED) 224*b50e3788Swang lian SKIP(return, "mmap failed, not enough memory.\n"); 225*b50e3788Swang lian 226*b50e3788Swang lian vec.iov_base = map; 227*b50e3788Swang lian vec.iov_len = pagesize; 228*b50e3788Swang lian 229*b50e3788Swang lian /* 230*b50e3788Swang lian * Using a pidfd for a process that has already exited should fail 231*b50e3788Swang lian * with ESRCH. 232*b50e3788Swang lian */ 233*b50e3788Swang lian self->child_pid = fork(); 234*b50e3788Swang lian ASSERT_NE(self->child_pid, -1); 235*b50e3788Swang lian 236*b50e3788Swang lian if (self->child_pid == 0) 237*b50e3788Swang lian exit(0); 238*b50e3788Swang lian 239*b50e3788Swang lian self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); 240*b50e3788Swang lian ASSERT_GE(self->remote_pidfd, 0); 241*b50e3788Swang lian 242*b50e3788Swang lian /* Wait for the child to ensure it has terminated. */ 243*b50e3788Swang lian waitpid(self->child_pid, NULL, 0); 244*b50e3788Swang lian 245*b50e3788Swang lian ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED, 246*b50e3788Swang lian 0); 247*b50e3788Swang lian ASSERT_EQ(ret, -1); 248*b50e3788Swang lian ASSERT_EQ(errno, ESRCH); 249*b50e3788Swang lian } 250*b50e3788Swang lian 251*b50e3788Swang lian /* 252*b50e3788Swang lian * Test process_madvise() with bad pidfds to ensure correct error 253*b50e3788Swang lian * handling. 254*b50e3788Swang lian */ 255*b50e3788Swang lian TEST_F(process_madvise, bad_pidfd) 256*b50e3788Swang lian { 257*b50e3788Swang lian const unsigned long pagesize = self->page_size; 258*b50e3788Swang lian struct iovec vec; 259*b50e3788Swang lian char *map; 260*b50e3788Swang lian ssize_t ret; 261*b50e3788Swang lian 262*b50e3788Swang lian map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 263*b50e3788Swang lian 0); 264*b50e3788Swang lian if (map == MAP_FAILED) 265*b50e3788Swang lian SKIP(return, "mmap failed, not enough memory.\n"); 266*b50e3788Swang lian 267*b50e3788Swang lian vec.iov_base = map; 268*b50e3788Swang lian vec.iov_len = pagesize; 269*b50e3788Swang lian 270*b50e3788Swang lian /* Using an invalid fd number (-1) should fail with EBADF. */ 271*b50e3788Swang lian ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0); 272*b50e3788Swang lian ASSERT_EQ(ret, -1); 273*b50e3788Swang lian ASSERT_EQ(errno, EBADF); 274*b50e3788Swang lian 275*b50e3788Swang lian /* 276*b50e3788Swang lian * Using a valid fd that is not a pidfd (e.g. stdin) should fail 277*b50e3788Swang lian * with EBADF. 278*b50e3788Swang lian */ 279*b50e3788Swang lian ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0); 280*b50e3788Swang lian ASSERT_EQ(ret, -1); 281*b50e3788Swang lian ASSERT_EQ(errno, EBADF); 282*b50e3788Swang lian } 283*b50e3788Swang lian 284*b50e3788Swang lian /* 285*b50e3788Swang lian * Test that process_madvise() rejects vlen > UIO_MAXIOV. 286*b50e3788Swang lian * The kernel should return -EINVAL when the number of iovecs exceeds 1024. 287*b50e3788Swang lian */ 288*b50e3788Swang lian TEST_F(process_madvise, invalid_vlen) 289*b50e3788Swang lian { 290*b50e3788Swang lian const unsigned long pagesize = self->page_size; 291*b50e3788Swang lian int pidfd = self->pidfd; 292*b50e3788Swang lian struct iovec vec; 293*b50e3788Swang lian char *map; 294*b50e3788Swang lian ssize_t ret; 295*b50e3788Swang lian 296*b50e3788Swang lian map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 297*b50e3788Swang lian 0); 298*b50e3788Swang lian if (map == MAP_FAILED) 299*b50e3788Swang lian SKIP(return, "mmap failed, not enough memory.\n"); 300*b50e3788Swang lian 301*b50e3788Swang lian vec.iov_base = map; 302*b50e3788Swang lian vec.iov_len = pagesize; 303*b50e3788Swang lian 304*b50e3788Swang lian ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0); 305*b50e3788Swang lian ASSERT_EQ(ret, -1); 306*b50e3788Swang lian ASSERT_EQ(errno, EINVAL); 307*b50e3788Swang lian 308*b50e3788Swang lian /* Cleanup. */ 309*b50e3788Swang lian ASSERT_EQ(munmap(map, pagesize), 0); 310*b50e3788Swang lian } 311*b50e3788Swang lian 312*b50e3788Swang lian /* 313*b50e3788Swang lian * Test process_madvise() with an invalid flag value. Currently, only a flag 314*b50e3788Swang lian * value of 0 is supported. This test is reserved for the future, e.g., if 315*b50e3788Swang lian * synchronous flags are added. 316*b50e3788Swang lian */ 317*b50e3788Swang lian TEST_F(process_madvise, flag) 318*b50e3788Swang lian { 319*b50e3788Swang lian const unsigned long pagesize = self->page_size; 320*b50e3788Swang lian unsigned int invalid_flag; 321*b50e3788Swang lian int pidfd = self->pidfd; 322*b50e3788Swang lian struct iovec vec; 323*b50e3788Swang lian char *map; 324*b50e3788Swang lian ssize_t ret; 325*b50e3788Swang lian 326*b50e3788Swang lian map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 327*b50e3788Swang lian 0); 328*b50e3788Swang lian if (map == MAP_FAILED) 329*b50e3788Swang lian SKIP(return, "mmap failed, not enough memory.\n"); 330*b50e3788Swang lian 331*b50e3788Swang lian vec.iov_base = map; 332*b50e3788Swang lian vec.iov_len = pagesize; 333*b50e3788Swang lian 334*b50e3788Swang lian invalid_flag = 0x80000000; 335*b50e3788Swang lian 336*b50e3788Swang lian ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag); 337*b50e3788Swang lian ASSERT_EQ(ret, -1); 338*b50e3788Swang lian ASSERT_EQ(errno, EINVAL); 339*b50e3788Swang lian 340*b50e3788Swang lian /* Cleanup. */ 341*b50e3788Swang lian ASSERT_EQ(munmap(map, pagesize), 0); 342*b50e3788Swang lian } 343*b50e3788Swang lian 344*b50e3788Swang lian TEST_HARNESS_MAIN 345