xref: /linux/tools/testing/selftests/mm/khugepaged.c (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 #define _GNU_SOURCE
2 #include <ctype.h>
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <limits.h>
6 #include <dirent.h>
7 #include <signal.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <stdbool.h>
11 #include <string.h>
12 #include <unistd.h>
13 
14 #include <linux/mman.h>
15 #include <sys/mman.h>
16 #include <sys/wait.h>
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <sys/sysmacros.h>
20 #include <sys/vfs.h>
21 
22 #include "linux/magic.h"
23 
24 #include "vm_util.h"
25 #include "thp_settings.h"
26 
/* Address hint (1UL << 30) at which every test mapping is requested. */
#define BASE_ADDR ((void *)(1UL << 30))
/* Presumably the byte size of one PMD-level huge page; assigned outside this view. */
static unsigned long hpage_pmd_size;
/* Presumably the byte size of one base page; assigned outside this view. */
static unsigned long page_size;
/* Used below as the number of base pages making up one PMD huge page. */
static int hpage_pmd_nr;
/* Page order taken from the -s option; sizes anon faults in collapse_max_ptes_none(). */
static int anon_order;

/* smaps file of the running process, scanned by check_swap(). */
#define PID_SMAPS "/proc/self/smaps"
/* Name of the scratch file created inside the user-supplied directory. */
#define TEST_FILE "collapse_test_file"

/* Size of the line and pattern buffers used by check_swap(). */
#define MAX_LINE_LENGTH 500
37 
/*
 * Kind of memory backing a test area.  get_finfo() stores VMA_SHMEM when the
 * test directory is on tmpfs and VMA_FILE otherwise.
 */
enum vma_type {
	VMA_ANON,
	VMA_FILE,
	VMA_SHMEM,
};
43 
/*
 * Per-memory-type operations used by every test case.
 *
 * setup_area:   create a mapping of nr_hpages huge-page-sized units and
 *               return its address.
 * cleanup_area: tear down a mapping previously returned by setup_area.
 * fault:        populate the byte range [start, end) of the area at p.
 * check_huge:   report whether exactly nr_hpages huge pages back addr
 *               (NOTE(review): exact semantics live in vm_util - confirm).
 * name:         human-readable name of the memory type.
 */
struct mem_ops {
	void *(*setup_area)(int nr_hpages);
	void (*cleanup_area)(void *p, unsigned long size);
	void (*fault)(void *p, unsigned long start, unsigned long end);
	bool (*check_huge)(void *addr, int nr_hpages);
	const char *name;
};
51 
/*
 * Memory types selected on the command line.  Each pointer stays NULL unless
 * parse_test_type() enables the corresponding type.
 */
static struct mem_ops *file_ops;
static struct mem_ops *anon_ops;
static struct mem_ops *shmem_ops;
55 
/*
 * A way of triggering a collapse.
 *
 * collapse:                attempt to collapse nr_hpages at p and report
 *                          success/failure against the expected outcome.
 * enforce_pte_scan_limits: true when this context honours the khugepaged
 *                          max_ptes_* limits; tests use it to pick the
 *                          expected result.
 * name:                    human-readable name of the context.
 */
struct collapse_context {
	void (*collapse)(const char *msg, char *p, int nr_hpages,
			 struct mem_ops *ops, bool expect);
	bool enforce_pte_scan_limits;
	const char *name;
};
62 
/* Collapse contexts selected on the command line; NULL when not requested. */
static struct collapse_context *khugepaged_context;
static struct collapse_context *madvise_context;
65 
/*
 * State describing the file-backed test target.
 *
 * dir:                       directory given on the command line.
 * path:                      "<dir>/" TEST_FILE, the scratch file.
 * type:                      VMA_SHMEM if dir is on tmpfs, else VMA_FILE.
 * fd:                        descriptor of the currently mapped file/memfd.
 * dev_queue_read_ahead_path: sysfs read_ahead_kb path of the block device
 *                            holding dir (filled only for VMA_FILE).
 */
struct file_info {
	const char *dir;
	char path[PATH_MAX];
	enum vma_type type;
	int fd;
	char dev_queue_read_ahead_path[PATH_MAX];
};
73 
/* File-backed test target, filled in by get_finfo(). */
static struct file_info finfo;
/* Set once settings were restored, or in forked children that must not restore. */
static bool skip_settings_restore;
/* Number of failed checks so far; used as the process exit status. */
static int exit_status;
77 
/* Print @msg in green on stdout, terminated by a newline. */
static void success(const char *msg)
{
	fprintf(stdout, " \e[32m%s\e[0m\n", msg);
}
82 
83 static void fail(const char *msg)
84 {
85 	printf(" \e[31m%s\e[0m\n", msg);
86 	exit_status++;
87 }
88 
/* Print @msg in yellow on stdout; the failure counter is left untouched. */
static void skip(const char *msg)
{
	fprintf(stdout, " \e[33m%s\e[0m\n", msg);
}
93 
94 static void restore_settings_atexit(void)
95 {
96 	if (skip_settings_restore)
97 		return;
98 
99 	printf("Restore THP and khugepaged settings...");
100 	thp_restore_settings();
101 	success("OK");
102 
103 	skip_settings_restore = true;
104 }
105 
106 static void restore_settings(int sig)
107 {
108 	/* exit() will invoke the restore_settings_atexit handler. */
109 	exit(sig ? EXIT_FAILURE : exit_status);
110 }
111 
112 static void save_settings(void)
113 {
114 	printf("Save THP and khugepaged settings...");
115 	if (file_ops && finfo.type == VMA_FILE)
116 		thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path);
117 	thp_save_settings();
118 
119 	success("OK");
120 
121 	atexit(restore_settings_atexit);
122 	signal(SIGTERM, restore_settings);
123 	signal(SIGINT, restore_settings);
124 	signal(SIGHUP, restore_settings);
125 	signal(SIGQUIT, restore_settings);
126 }
127 
128 static void get_finfo(const char *dir)
129 {
130 	struct stat path_stat;
131 	struct statfs fs;
132 	char buf[1 << 10];
133 	char path[PATH_MAX];
134 	char *str, *end;
135 
136 	finfo.dir = dir;
137 	stat(finfo.dir, &path_stat);
138 	if (!S_ISDIR(path_stat.st_mode)) {
139 		printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
140 		exit(EXIT_FAILURE);
141 	}
142 	if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
143 		     finfo.dir) >= sizeof(finfo.path)) {
144 		printf("%s: Pathname is too long\n", __func__);
145 		exit(EXIT_FAILURE);
146 	}
147 	if (statfs(finfo.dir, &fs)) {
148 		perror("statfs()");
149 		exit(EXIT_FAILURE);
150 	}
151 	finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
152 	if (finfo.type == VMA_SHMEM)
153 		return;
154 
155 	/* Find owning device's queue/read_ahead_kb control */
156 	if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
157 		     major(path_stat.st_dev), minor(path_stat.st_dev))
158 	    >= sizeof(path)) {
159 		printf("%s: Pathname is too long\n", __func__);
160 		exit(EXIT_FAILURE);
161 	}
162 	if (read_file(path, buf, sizeof(buf)) < 0) {
163 		perror("read_file(read_num)");
164 		exit(EXIT_FAILURE);
165 	}
166 	if (strstr(buf, "DEVTYPE=disk")) {
167 		/* Found it */
168 		if (snprintf(finfo.dev_queue_read_ahead_path,
169 			     sizeof(finfo.dev_queue_read_ahead_path),
170 			     "/sys/dev/block/%d:%d/queue/read_ahead_kb",
171 			     major(path_stat.st_dev), minor(path_stat.st_dev))
172 		    >= sizeof(finfo.dev_queue_read_ahead_path)) {
173 			printf("%s: Pathname is too long\n", __func__);
174 			exit(EXIT_FAILURE);
175 		}
176 		return;
177 	}
178 	if (!strstr(buf, "DEVTYPE=partition")) {
179 		printf("%s: Unknown device type: %s\n", __func__, path);
180 		exit(EXIT_FAILURE);
181 	}
182 	/*
183 	 * Partition of block device - need to find actual device.
184 	 * Using naming convention that devnameN is partition of
185 	 * device devname.
186 	 */
187 	str = strstr(buf, "DEVNAME=");
188 	if (!str) {
189 		printf("%s: Could not read: %s", __func__, path);
190 		exit(EXIT_FAILURE);
191 	}
192 	str += 8;
193 	end = str;
194 	while (*end) {
195 		if (isdigit(*end)) {
196 			*end = '\0';
197 			if (snprintf(finfo.dev_queue_read_ahead_path,
198 				     sizeof(finfo.dev_queue_read_ahead_path),
199 				     "/sys/block/%s/queue/read_ahead_kb",
200 				     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
201 				printf("%s: Pathname is too long\n", __func__);
202 				exit(EXIT_FAILURE);
203 			}
204 			return;
205 		}
206 		++end;
207 	}
208 	printf("%s: Could not read: %s\n", __func__, path);
209 	exit(EXIT_FAILURE);
210 }
211 
212 static bool check_swap(void *addr, unsigned long size)
213 {
214 	bool swap = false;
215 	int ret;
216 	FILE *fp;
217 	char buffer[MAX_LINE_LENGTH];
218 	char addr_pattern[MAX_LINE_LENGTH];
219 
220 	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
221 		       (unsigned long) addr);
222 	if (ret >= MAX_LINE_LENGTH) {
223 		printf("%s: Pattern is too long\n", __func__);
224 		exit(EXIT_FAILURE);
225 	}
226 
227 
228 	fp = fopen(PID_SMAPS, "r");
229 	if (!fp) {
230 		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
231 		exit(EXIT_FAILURE);
232 	}
233 	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
234 		goto err_out;
235 
236 	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
237 		       size >> 10);
238 	if (ret >= MAX_LINE_LENGTH) {
239 		printf("%s: Pattern is too long\n", __func__);
240 		exit(EXIT_FAILURE);
241 	}
242 	/*
243 	 * Fetch the Swap: in the same block and check whether it got
244 	 * the expected number of hugeepages next.
245 	 */
246 	if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
247 		goto err_out;
248 
249 	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
250 		goto err_out;
251 
252 	swap = true;
253 err_out:
254 	fclose(fp);
255 	return swap;
256 }
257 
258 static void *alloc_mapping(int nr)
259 {
260 	void *p;
261 
262 	p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
263 		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
264 	if (p != BASE_ADDR) {
265 		printf("Failed to allocate VMA at %p\n", BASE_ADDR);
266 		exit(EXIT_FAILURE);
267 	}
268 
269 	return p;
270 }
271 
272 static void fill_memory(int *p, unsigned long start, unsigned long end)
273 {
274 	int i;
275 
276 	for (i = start / page_size; i < end / page_size; i++)
277 		p[i * page_size / sizeof(*p)] = i + 0xdead0000;
278 }
279 
280 /*
281  * MADV_COLLAPSE is a best-effort request and may fail if an internal
282  * resource is temporarily unavailable, in which case it will set errno to
283  * EAGAIN.  In such a case, immediately reattempt the operation one more
284  * time.
285  */
286 static int madvise_collapse_retry(void *p, unsigned long size)
287 {
288 	bool retry = true;
289 	int ret;
290 
291 retry:
292 	ret = madvise(p, size, MADV_COLLAPSE);
293 	if (ret && errno == EAGAIN && retry) {
294 		retry = false;
295 		goto retry;
296 	}
297 	return ret;
298 }
299 
300 /*
301  * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
302  * validate_memory()'able contents.
303  */
304 static void *alloc_hpage(struct mem_ops *ops)
305 {
306 	void *p = ops->setup_area(1);
307 
308 	ops->fault(p, 0, hpage_pmd_size);
309 
310 	/*
311 	 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
312 	 * The latter is ineligible for collapse by MADV_COLLAPSE
313 	 * while the former might cause MADV_COLLAPSE to race with
314 	 * khugepaged on low-load system (like a test machine), which
315 	 * would cause MADV_COLLAPSE to fail with EAGAIN.
316 	 */
317 	printf("Allocate huge page...");
318 	if (madvise_collapse_retry(p, hpage_pmd_size)) {
319 		perror("madvise(MADV_COLLAPSE)");
320 		exit(EXIT_FAILURE);
321 	}
322 	if (!ops->check_huge(p, 1)) {
323 		perror("madvise(MADV_COLLAPSE)");
324 		exit(EXIT_FAILURE);
325 	}
326 	if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
327 		perror("madvise(MADV_HUGEPAGE)");
328 		exit(EXIT_FAILURE);
329 	}
330 	success("OK");
331 	return p;
332 }
333 
334 static void validate_memory(int *p, unsigned long start, unsigned long end)
335 {
336 	int i;
337 
338 	for (i = start / page_size; i < end / page_size; i++) {
339 		if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
340 			printf("Page %d is corrupted: %#x\n",
341 					i, p[i * page_size / sizeof(*p)]);
342 			exit(EXIT_FAILURE);
343 		}
344 	}
345 }
346 
/* mem_ops.setup_area for anonymous memory: @nr_hpages huge-page-sized units at BASE_ADDR. */
static void *anon_setup_area(int nr_hpages)
{
	void *area = alloc_mapping(nr_hpages);

	return area;
}
351 
/* mem_ops.cleanup_area for anonymous memory: unmap @size bytes at @p. */
static void anon_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
}
356 
/*
 * mem_ops.fault for writable mappings: populate [@start, @end) of @p by
 * writing the fill_memory() marker pattern.  Also used by the shmem ops.
 */
static void anon_fault(void *p, unsigned long start, unsigned long end)
{
	fill_memory(p, start, end);
}
361 
362 static bool anon_check_huge(void *addr, int nr_hpages)
363 {
364 	return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
365 }
366 
367 static void *file_setup_area(int nr_hpages)
368 {
369 	int fd;
370 	void *p;
371 	unsigned long size;
372 
373 	unlink(finfo.path);  /* Cleanup from previous failed tests */
374 	printf("Creating %s for collapse%s...", finfo.path,
375 	       finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
376 	fd = open(finfo.path, O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
377 		  777);
378 	if (fd < 0) {
379 		perror("open()");
380 		exit(EXIT_FAILURE);
381 	}
382 
383 	size = nr_hpages * hpage_pmd_size;
384 	if (ftruncate(fd, size)) {
385 		perror("ftruncate()");
386 		exit(EXIT_FAILURE);
387 	}
388 	p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE,
389 		MAP_SHARED, fd, 0);
390 	if (p != BASE_ADDR) {
391 		perror("mmap()");
392 		exit(EXIT_FAILURE);
393 	}
394 	fill_memory(p, 0, size);
395 	if (msync(p, size, MS_SYNC)) {
396 		perror("msync()");
397 		exit(EXIT_FAILURE);
398 	}
399 	close(fd);
400 	munmap(p, size);
401 	success("OK");
402 
403 	printf("Opening %s read only for collapse...", finfo.path);
404 	finfo.fd = open(finfo.path, O_RDONLY, 777);
405 	if (finfo.fd < 0) {
406 		perror("open()");
407 		exit(EXIT_FAILURE);
408 	}
409 	p = mmap(BASE_ADDR, size, PROT_READ,
410 		 MAP_PRIVATE, finfo.fd, 0);
411 	if (p == MAP_FAILED || p != BASE_ADDR) {
412 		perror("mmap()");
413 		exit(EXIT_FAILURE);
414 	}
415 
416 	/* Drop page cache */
417 	write_file("/proc/sys/vm/drop_caches", "3", 2);
418 	success("OK");
419 	return p;
420 }
421 
/*
 * mem_ops.cleanup_area for file-backed memory: unmap @size bytes at @p,
 * close the descriptor kept in finfo.fd and remove the scratch file.
 */
static void file_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
	close(finfo.fd);
	unlink(finfo.path);
}
428 
429 static void file_fault(void *p, unsigned long start, unsigned long end)
430 {
431 	if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
432 		perror("madvise(MADV_POPULATE_READ");
433 		exit(EXIT_FAILURE);
434 	}
435 }
436 
437 static bool file_check_huge(void *addr, int nr_hpages)
438 {
439 	switch (finfo.type) {
440 	case VMA_FILE:
441 		return check_huge_file(addr, nr_hpages, hpage_pmd_size);
442 	case VMA_SHMEM:
443 		return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
444 	default:
445 		exit(EXIT_FAILURE);
446 		return false;
447 	}
448 }
449 
450 static void *shmem_setup_area(int nr_hpages)
451 {
452 	void *p;
453 	unsigned long size = nr_hpages * hpage_pmd_size;
454 
455 	finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
456 	if (finfo.fd < 0)  {
457 		perror("memfd_create()");
458 		exit(EXIT_FAILURE);
459 	}
460 	if (ftruncate(finfo.fd, size)) {
461 		perror("ftruncate()");
462 		exit(EXIT_FAILURE);
463 	}
464 	p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
465 		 0);
466 	if (p != BASE_ADDR) {
467 		perror("mmap()");
468 		exit(EXIT_FAILURE);
469 	}
470 	return p;
471 }
472 
/*
 * mem_ops.cleanup_area for shmem: unmap @size bytes at @p and close the
 * memfd kept in finfo.fd.
 */
static void shmem_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
	close(finfo.fd);
}
478 
479 static bool shmem_check_huge(void *addr, int nr_hpages)
480 {
481 	return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
482 }
483 
/* Operations for private anonymous memory. */
static struct mem_ops __anon_ops = {
	.setup_area = &anon_setup_area,
	.cleanup_area = &anon_cleanup_area,
	.fault = &anon_fault,
	.check_huge = &anon_check_huge,
	.name = "anon",
};
491 
/* Operations for a read-only private mapping of a file in the user-supplied directory. */
static struct mem_ops __file_ops = {
	.setup_area = &file_setup_area,
	.cleanup_area = &file_cleanup_area,
	.fault = &file_fault,
	.check_huge = &file_check_huge,
	.name = "file",
};
499 
/*
 * Operations for memfd-backed shared memory.  The mapping is writable, so
 * faulting reuses anon_fault().
 */
static struct mem_ops __shmem_ops = {
	.setup_area = &shmem_setup_area,
	.cleanup_area = &shmem_cleanup_area,
	.fault = &anon_fault,
	.check_huge = &shmem_check_huge,
	.name = "shmem",
};
507 
508 static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
509 			       struct mem_ops *ops, bool expect)
510 {
511 	int ret;
512 	struct thp_settings settings = *thp_current_settings();
513 
514 	printf("%s...", msg);
515 
516 	/*
517 	 * Prevent khugepaged interference and tests that MADV_COLLAPSE
518 	 * ignores /sys/kernel/mm/transparent_hugepage/enabled
519 	 */
520 	settings.thp_enabled = THP_NEVER;
521 	settings.shmem_enabled = SHMEM_NEVER;
522 	thp_push_settings(&settings);
523 
524 	/* Clear VM_NOHUGEPAGE */
525 	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
526 	ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
527 	if (((bool)ret) == expect)
528 		fail("Fail: Bad return value");
529 	else if (!ops->check_huge(p, expect ? nr_hpages : 0))
530 		fail("Fail: check_huge()");
531 	else
532 		success("OK");
533 
534 	thp_pop_settings();
535 }
536 
537 static void madvise_collapse(const char *msg, char *p, int nr_hpages,
538 			     struct mem_ops *ops, bool expect)
539 {
540 	/* Sanity check */
541 	if (!ops->check_huge(p, 0)) {
542 		printf("Unexpected huge page\n");
543 		exit(EXIT_FAILURE);
544 	}
545 	__madvise_collapse(msg, p, nr_hpages, ops, expect);
546 }
547 
548 #define TICK 500000
549 static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
550 			  struct mem_ops *ops)
551 {
552 	int full_scans;
553 	int timeout = 6; /* 3 seconds */
554 
555 	/* Sanity check */
556 	if (!ops->check_huge(p, 0)) {
557 		printf("Unexpected huge page\n");
558 		exit(EXIT_FAILURE);
559 	}
560 
561 	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
562 
563 	/* Wait until the second full_scan completed */
564 	full_scans = thp_read_num("khugepaged/full_scans") + 2;
565 
566 	printf("%s...", msg);
567 	while (timeout--) {
568 		if (ops->check_huge(p, nr_hpages))
569 			break;
570 		if (thp_read_num("khugepaged/full_scans") >= full_scans)
571 			break;
572 		printf(".");
573 		usleep(TICK);
574 	}
575 
576 	return timeout == -1;
577 }
578 
579 static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
580 				struct mem_ops *ops, bool expect)
581 {
582 	if (wait_for_scan(msg, p, nr_hpages, ops)) {
583 		if (expect)
584 			fail("Timeout");
585 		else
586 			success("OK");
587 		return;
588 	}
589 
590 	/*
591 	 * For file and shmem memory, khugepaged only retracts pte entries after
592 	 * putting the new hugepage in the page cache. The hugepage must be
593 	 * subsequently refaulted to install the pmd mapping for the mm.
594 	 */
595 	if (ops != &__anon_ops)
596 		ops->fault(p, 0, nr_hpages * hpage_pmd_size);
597 
598 	if (ops->check_huge(p, expect ? nr_hpages : 0))
599 		success("OK");
600 	else
601 		fail("Fail");
602 }
603 
/* Collapse through the khugepaged daemon; tests expect it to honour the max_ptes_* limits. */
static struct collapse_context __khugepaged_context = {
	.collapse = &khugepaged_collapse,
	.enforce_pte_scan_limits = true,
	.name = "khugepaged",
};
609 
/* Collapse through MADV_COLLAPSE; tests expect it to ignore the max_ptes_* limits. */
static struct collapse_context __madvise_context = {
	.collapse = &madvise_collapse,
	.enforce_pte_scan_limits = false,
	.name = "madvise",
};
615 
616 static bool is_tmpfs(struct mem_ops *ops)
617 {
618 	return ops == &__file_ops && finfo.type == VMA_SHMEM;
619 }
620 
/* True when @ops is the anonymous-memory op table. */
static bool is_anon(struct mem_ops *ops)
{
	return ops == &__anon_ops;
}
625 
626 static void alloc_at_fault(void)
627 {
628 	struct thp_settings settings = *thp_current_settings();
629 	char *p;
630 
631 	settings.thp_enabled = THP_ALWAYS;
632 	thp_push_settings(&settings);
633 
634 	p = alloc_mapping(1);
635 	*p = 1;
636 	printf("Allocate huge page on fault...");
637 	if (check_huge_anon(p, 1, hpage_pmd_size))
638 		success("OK");
639 	else
640 		fail("Fail");
641 
642 	thp_pop_settings();
643 
644 	madvise(p, page_size, MADV_DONTNEED);
645 	printf("Split huge PMD on MADV_DONTNEED...");
646 	if (check_huge_anon(p, 0, hpage_pmd_size))
647 		success("OK");
648 	else
649 		fail("Fail");
650 	munmap(p, hpage_pmd_size);
651 }
652 
653 static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
654 {
655 	void *p;
656 	int nr_hpages = 4;
657 	unsigned long size = nr_hpages * hpage_pmd_size;
658 
659 	p = ops->setup_area(nr_hpages);
660 	ops->fault(p, 0, size);
661 	c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
662 		    ops, true);
663 	validate_memory(p, 0, size);
664 	ops->cleanup_area(p, size);
665 }
666 
667 static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
668 {
669 	void *p;
670 
671 	p = ops->setup_area(1);
672 	c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
673 	ops->cleanup_area(p, hpage_pmd_size);
674 }
675 
676 static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
677 {
678 	void *p;
679 
680 	p = ops->setup_area(1);
681 	ops->fault(p, 0, page_size);
682 	c->collapse("Collapse PTE table with single PTE entry present", p,
683 		    1, ops, true);
684 	ops->cleanup_area(p, hpage_pmd_size);
685 }
686 
687 static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
688 {
689 	int max_ptes_none = hpage_pmd_nr / 2;
690 	struct thp_settings settings = *thp_current_settings();
691 	void *p;
692 	int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1;
693 
694 	settings.khugepaged.max_ptes_none = max_ptes_none;
695 	thp_push_settings(&settings);
696 
697 	p = ops->setup_area(1);
698 
699 	if (is_tmpfs(ops)) {
700 		/* shmem pages always in the page cache */
701 		printf("tmpfs...");
702 		skip("Skip");
703 		goto skip;
704 	}
705 
706 	ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);
707 	c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
708 		    ops, !c->enforce_pte_scan_limits);
709 	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);
710 
711 	if (c->enforce_pte_scan_limits) {
712 		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
713 		c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
714 			    true);
715 		validate_memory(p, 0,
716 				(hpage_pmd_nr - max_ptes_none) * page_size);
717 	}
718 skip:
719 	ops->cleanup_area(p, hpage_pmd_size);
720 	thp_pop_settings();
721 }
722 
723 static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
724 {
725 	void *p;
726 
727 	p = ops->setup_area(1);
728 	ops->fault(p, 0, hpage_pmd_size);
729 
730 	printf("Swapout one page...");
731 	if (madvise(p, page_size, MADV_PAGEOUT)) {
732 		perror("madvise(MADV_PAGEOUT)");
733 		exit(EXIT_FAILURE);
734 	}
735 	if (check_swap(p, page_size)) {
736 		success("OK");
737 	} else {
738 		fail("Fail");
739 		goto out;
740 	}
741 
742 	c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
743 		    true);
744 	validate_memory(p, 0, hpage_pmd_size);
745 out:
746 	ops->cleanup_area(p, hpage_pmd_size);
747 }
748 
749 static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
750 {
751 	int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap");
752 	void *p;
753 
754 	p = ops->setup_area(1);
755 	ops->fault(p, 0, hpage_pmd_size);
756 
757 	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
758 	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
759 		perror("madvise(MADV_PAGEOUT)");
760 		exit(EXIT_FAILURE);
761 	}
762 	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
763 		success("OK");
764 	} else {
765 		fail("Fail");
766 		goto out;
767 	}
768 
769 	c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
770 		    !c->enforce_pte_scan_limits);
771 	validate_memory(p, 0, hpage_pmd_size);
772 
773 	if (c->enforce_pte_scan_limits) {
774 		ops->fault(p, 0, hpage_pmd_size);
775 		printf("Swapout %d of %d pages...", max_ptes_swap,
776 		       hpage_pmd_nr);
777 		if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
778 			perror("madvise(MADV_PAGEOUT)");
779 			exit(EXIT_FAILURE);
780 		}
781 		if (check_swap(p, max_ptes_swap * page_size)) {
782 			success("OK");
783 		} else {
784 			fail("Fail");
785 			goto out;
786 		}
787 
788 		c->collapse("Collapse with max_ptes_swap pages swapped out", p,
789 			    1, ops, true);
790 		validate_memory(p, 0, hpage_pmd_size);
791 	}
792 out:
793 	ops->cleanup_area(p, hpage_pmd_size);
794 }
795 
796 static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
797 {
798 	void *p;
799 
800 	p = alloc_hpage(ops);
801 
802 	if (is_tmpfs(ops)) {
803 		/* MADV_DONTNEED won't evict tmpfs pages */
804 		printf("tmpfs...");
805 		skip("Skip");
806 		goto skip;
807 	}
808 
809 	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
810 	printf("Split huge page leaving single PTE mapping compound page...");
811 	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
812 	if (ops->check_huge(p, 0))
813 		success("OK");
814 	else
815 		fail("Fail");
816 
817 	c->collapse("Collapse PTE table with single PTE mapping compound page",
818 		    p, 1, ops, true);
819 	validate_memory(p, 0, page_size);
820 skip:
821 	ops->cleanup_area(p, hpage_pmd_size);
822 }
823 
824 static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
825 {
826 	void *p;
827 
828 	p = alloc_hpage(ops);
829 	printf("Split huge page leaving single PTE page table full of compound pages...");
830 	madvise(p, page_size, MADV_NOHUGEPAGE);
831 	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
832 	if (ops->check_huge(p, 0))
833 		success("OK");
834 	else
835 		fail("Fail");
836 
837 	c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
838 		    true);
839 	validate_memory(p, 0, hpage_pmd_size);
840 	ops->cleanup_area(p, hpage_pmd_size);
841 }
842 
/*
 * Build a page table in which every PTE maps a base page taken from a
 * different compound page, then expect that range to collapse.
 *
 * Each of the hpage_pmd_nr iterations faults a huge page in at BASE_ADDR
 * and then uses two mremap() calls to keep one more base page in the run
 * that sits directly below BASE_ADDR.
 */
static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;
	int i;

	p = ops->setup_area(1);
	for (i = 0; i < hpage_pmd_nr; i++) {
		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
				i + 1, hpage_pmd_nr);

		/* Get a fresh huge page at BASE_ADDR, then forbid huge pages again. */
		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
		ops->fault(BASE_ADDR, 0, hpage_pmd_size);
		if (!ops->check_huge(BASE_ADDR, 1)) {
			printf("Failed to allocate huge page\n");
			exit(EXIT_FAILURE);
		}
		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);

		/*
		 * Shrink the run of i collected pages plus the new huge page
		 * to (i + 1) pages while moving it out of the way, to
		 * BASE_ADDR + 2 * hpage_pmd_size.
		 */
		p = mremap(BASE_ADDR - i * page_size,
				i * page_size + hpage_pmd_size,
				(i + 1) * page_size,
				MREMAP_MAYMOVE | MREMAP_FIXED,
				BASE_ADDR + 2 * hpage_pmd_size);
		if (p == MAP_FAILED) {
			perror("mremap+unmap");
			exit(EXIT_FAILURE);
		}

		/*
		 * Move the (i + 1) pages back so that they end right at
		 * BASE_ADDR, growing the mapping by hpage_pmd_size so that
		 * the next iteration can fault at BASE_ADDR again.
		 */
		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
				(i + 1) * page_size,
				(i + 1) * page_size + hpage_pmd_size,
				MREMAP_MAYMOVE | MREMAP_FIXED,
				BASE_ADDR - (i + 1) * page_size);
		if (p == MAP_FAILED) {
			perror("mremap+alloc");
			exit(EXIT_FAILURE);
		}
	}

	/*
	 * Drop the spare range at BASE_ADDR; p now refers to the collected
	 * pages below it (presumably exactly one huge-page-sized range,
	 * assuming hpage_pmd_nr * page_size == hpage_pmd_size).
	 */
	ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
	ops->fault(p, 0, hpage_pmd_size);
	/*
	 * NOTE(review): success is reported when check_huge(p, 1) is false,
	 * i.e. when the range is not yet backed by one huge page.
	 */
	if (!ops->check_huge(p, 1))
		success("OK");
	else
		fail("Fail");

	c->collapse("Collapse PTE table full of different compound pages", p, 1,
		    ops, true);

	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
895 
896 static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
897 {
898 	int wstatus;
899 	void *p;
900 
901 	p = ops->setup_area(1);
902 
903 	printf("Allocate small page...");
904 	ops->fault(p, 0, page_size);
905 	if (ops->check_huge(p, 0))
906 		success("OK");
907 	else
908 		fail("Fail");
909 
910 	printf("Share small page over fork()...");
911 	if (!fork()) {
912 		/* Do not touch settings on child exit */
913 		skip_settings_restore = true;
914 		exit_status = 0;
915 
916 		if (ops->check_huge(p, 0))
917 			success("OK");
918 		else
919 			fail("Fail");
920 
921 		ops->fault(p, page_size, 2 * page_size);
922 		c->collapse("Collapse PTE table with single page shared with parent process",
923 			    p, 1, ops, true);
924 
925 		validate_memory(p, 0, page_size);
926 		ops->cleanup_area(p, hpage_pmd_size);
927 		exit(exit_status);
928 	}
929 
930 	wait(&wstatus);
931 	exit_status += WEXITSTATUS(wstatus);
932 
933 	printf("Check if parent still has small page...");
934 	if (ops->check_huge(p, 0))
935 		success("OK");
936 	else
937 		fail("Fail");
938 	validate_memory(p, 0, page_size);
939 	ops->cleanup_area(p, hpage_pmd_size);
940 }
941 
942 static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
943 {
944 	int wstatus;
945 	void *p;
946 
947 	p = alloc_hpage(ops);
948 	printf("Share huge page over fork()...");
949 	if (!fork()) {
950 		/* Do not touch settings on child exit */
951 		skip_settings_restore = true;
952 		exit_status = 0;
953 
954 		if (ops->check_huge(p, 1))
955 			success("OK");
956 		else
957 			fail("Fail");
958 
959 		printf("Split huge page PMD in child process...");
960 		madvise(p, page_size, MADV_NOHUGEPAGE);
961 		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
962 		if (ops->check_huge(p, 0))
963 			success("OK");
964 		else
965 			fail("Fail");
966 		ops->fault(p, 0, page_size);
967 
968 		thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
969 		c->collapse("Collapse PTE table full of compound pages in child",
970 			    p, 1, ops, true);
971 		thp_write_num("khugepaged/max_ptes_shared",
972 			  thp_current_settings()->khugepaged.max_ptes_shared);
973 
974 		validate_memory(p, 0, hpage_pmd_size);
975 		ops->cleanup_area(p, hpage_pmd_size);
976 		exit(exit_status);
977 	}
978 
979 	wait(&wstatus);
980 	exit_status += WEXITSTATUS(wstatus);
981 
982 	printf("Check if parent still has huge page...");
983 	if (ops->check_huge(p, 1))
984 		success("OK");
985 	else
986 		fail("Fail");
987 	validate_memory(p, 0, hpage_pmd_size);
988 	ops->cleanup_area(p, hpage_pmd_size);
989 }
990 
991 static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
992 {
993 	int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared");
994 	int wstatus;
995 	void *p;
996 
997 	p = alloc_hpage(ops);
998 	printf("Share huge page over fork()...");
999 	if (!fork()) {
1000 		/* Do not touch settings on child exit */
1001 		skip_settings_restore = true;
1002 		exit_status = 0;
1003 
1004 		if (ops->check_huge(p, 1))
1005 			success("OK");
1006 		else
1007 			fail("Fail");
1008 
1009 		printf("Trigger CoW on page %d of %d...",
1010 				hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
1011 		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
1012 		if (ops->check_huge(p, 0))
1013 			success("OK");
1014 		else
1015 			fail("Fail");
1016 
1017 		c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
1018 			    1, ops, !c->enforce_pte_scan_limits);
1019 
1020 		if (c->enforce_pte_scan_limits) {
1021 			printf("Trigger CoW on page %d of %d...",
1022 			       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
1023 			ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
1024 				    page_size);
1025 			if (ops->check_huge(p, 0))
1026 				success("OK");
1027 			else
1028 				fail("Fail");
1029 
1030 			c->collapse("Collapse with max_ptes_shared PTEs shared",
1031 				    p, 1, ops, true);
1032 		}
1033 
1034 		validate_memory(p, 0, hpage_pmd_size);
1035 		ops->cleanup_area(p, hpage_pmd_size);
1036 		exit(exit_status);
1037 	}
1038 
1039 	wait(&wstatus);
1040 	exit_status += WEXITSTATUS(wstatus);
1041 
1042 	printf("Check if parent still has huge page...");
1043 	if (ops->check_huge(p, 1))
1044 		success("OK");
1045 	else
1046 		fail("Fail");
1047 	validate_memory(p, 0, hpage_pmd_size);
1048 	ops->cleanup_area(p, hpage_pmd_size);
1049 }
1050 
1051 static void madvise_collapse_existing_thps(struct collapse_context *c,
1052 					   struct mem_ops *ops)
1053 {
1054 	void *p;
1055 
1056 	p = ops->setup_area(1);
1057 	ops->fault(p, 0, hpage_pmd_size);
1058 	c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
1059 	validate_memory(p, 0, hpage_pmd_size);
1060 
1061 	/* c->collapse() will find a hugepage and complain - call directly. */
1062 	__madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
1063 	validate_memory(p, 0, hpage_pmd_size);
1064 	ops->cleanup_area(p, hpage_pmd_size);
1065 }
1066 
1067 /*
1068  * Test race with khugepaged where page tables have been retracted and
1069  * pmd cleared.
1070  */
1071 static void madvise_retracted_page_tables(struct collapse_context *c,
1072 					  struct mem_ops *ops)
1073 {
1074 	void *p;
1075 	int nr_hpages = 1;
1076 	unsigned long size = nr_hpages * hpage_pmd_size;
1077 
1078 	p = ops->setup_area(nr_hpages);
1079 	ops->fault(p, 0, size);
1080 
1081 	/* Let khugepaged collapse and leave pmd cleared */
1082 	if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
1083 			  ops)) {
1084 		fail("Timeout");
1085 		return;
1086 	}
1087 	success("OK");
1088 	c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
1089 		    true);
1090 	validate_memory(p, 0, size);
1091 	ops->cleanup_area(p, size);
1092 }
1093 
/* Print the command-line help on stderr and exit with status 1. */
static void usage(void)
{
	static const char *const help[] = {
		"\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n",
		"\t<test type>\t: <context>:<mem_type>\n",
		"\t<context>\t: [all|khugepaged|madvise]\n",
		"\t<mem_type>\t: [all|anon|file|shmem]\n",
		"\n\t\"file,all\" mem_type requires [dir] argument\n",
		"\n\t\"file,all\" mem_type requires kernel built with\n",
		"\tCONFIG_READ_ONLY_THP_FOR_FS=y\n",
		"\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n",
		"\tmounted with huge=advise option for khugepaged tests to work\n",
		"\n\tSupported Options:\n",
		"\t\t-h: This help message.\n",
		"\t\t-s: mTHP size, expressed as page order.\n",
		"\t\t    Defaults to 0. Use this size for anon or shmem allocations.\n",
	};
	size_t line;

	for (line = 0; line < sizeof(help) / sizeof(help[0]); line++)
		fputs(help[line], stderr);

	exit(1);
}
1111 
1112 static void parse_test_type(int argc, char **argv)
1113 {
1114 	int opt;
1115 	char *buf;
1116 	const char *token;
1117 
1118 	while ((opt = getopt(argc, argv, "s:h")) != -1) {
1119 		switch (opt) {
1120 		case 's':
1121 			anon_order = atoi(optarg);
1122 			break;
1123 		case 'h':
1124 		default:
1125 			usage();
1126 		}
1127 	}
1128 
1129 	argv += optind;
1130 	argc -= optind;
1131 
1132 	if (argc == 0) {
1133 		/* Backwards compatibility */
1134 		khugepaged_context =  &__khugepaged_context;
1135 		madvise_context =  &__madvise_context;
1136 		anon_ops = &__anon_ops;
1137 		return;
1138 	}
1139 
1140 	buf = strdup(argv[0]);
1141 	token = strsep(&buf, ":");
1142 
1143 	if (!strcmp(token, "all")) {
1144 		khugepaged_context =  &__khugepaged_context;
1145 		madvise_context =  &__madvise_context;
1146 	} else if (!strcmp(token, "khugepaged")) {
1147 		khugepaged_context =  &__khugepaged_context;
1148 	} else if (!strcmp(token, "madvise")) {
1149 		madvise_context =  &__madvise_context;
1150 	} else {
1151 		usage();
1152 	}
1153 
1154 	if (!buf)
1155 		usage();
1156 
1157 	if (!strcmp(buf, "all")) {
1158 		file_ops =  &__file_ops;
1159 		anon_ops = &__anon_ops;
1160 		shmem_ops = &__shmem_ops;
1161 	} else if (!strcmp(buf, "anon")) {
1162 		anon_ops = &__anon_ops;
1163 	} else if (!strcmp(buf, "file")) {
1164 		file_ops =  &__file_ops;
1165 	} else if (!strcmp(buf, "shmem")) {
1166 		shmem_ops = &__shmem_ops;
1167 	} else {
1168 		usage();
1169 	}
1170 
1171 	if (!file_ops)
1172 		return;
1173 
1174 	if (argc != 2)
1175 		usage();
1176 
1177 	get_finfo(argv[1]);
1178 }
1179 
1180 int main(int argc, char **argv)
1181 {
1182 	int hpage_pmd_order;
1183 	struct thp_settings default_settings = {
1184 		.thp_enabled = THP_MADVISE,
1185 		.thp_defrag = THP_DEFRAG_ALWAYS,
1186 		.shmem_enabled = SHMEM_ADVISE,
1187 		.use_zero_page = 0,
1188 		.khugepaged = {
1189 			.defrag = 1,
1190 			.alloc_sleep_millisecs = 10,
1191 			.scan_sleep_millisecs = 10,
1192 		},
1193 		/*
1194 		 * When testing file-backed memory, the collapse path
1195 		 * looks at how many pages are found in the page cache, not
1196 		 * what pages are mapped. Disable read ahead optimization so
1197 		 * pages don't find their way into the page cache unless
1198 		 * we mem_ops->fault() them in.
1199 		 */
1200 		.read_ahead_kb = 0,
1201 	};
1202 
1203 	if (!thp_is_enabled()) {
1204 		printf("Transparent Hugepages not available\n");
1205 		return KSFT_SKIP;
1206 	}
1207 
1208 	parse_test_type(argc, argv);
1209 
1210 	setbuf(stdout, NULL);
1211 
1212 	page_size = getpagesize();
1213 	hpage_pmd_size = read_pmd_pagesize();
1214 	if (!hpage_pmd_size) {
1215 		printf("Reading PMD pagesize failed");
1216 		exit(EXIT_FAILURE);
1217 	}
1218 	hpage_pmd_nr = hpage_pmd_size / page_size;
1219 	hpage_pmd_order = __builtin_ctz(hpage_pmd_nr);
1220 
1221 	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
1222 	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
1223 	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
1224 	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
1225 	default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT;
1226 	default_settings.hugepages[anon_order].enabled = THP_ALWAYS;
1227 	default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT;
1228 	default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS;
1229 
1230 	save_settings();
1231 	thp_push_settings(&default_settings);
1232 
1233 	alloc_at_fault();
1234 
1235 #define TEST(t, c, o) do { \
1236 	if (c && o) { \
1237 		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
1238 		t(c, o); \
1239 	} \
1240 	} while (0)
1241 
1242 	TEST(collapse_full, khugepaged_context, anon_ops);
1243 	TEST(collapse_full, khugepaged_context, file_ops);
1244 	TEST(collapse_full, khugepaged_context, shmem_ops);
1245 	TEST(collapse_full, madvise_context, anon_ops);
1246 	TEST(collapse_full, madvise_context, file_ops);
1247 	TEST(collapse_full, madvise_context, shmem_ops);
1248 
1249 	TEST(collapse_empty, khugepaged_context, anon_ops);
1250 	TEST(collapse_empty, madvise_context, anon_ops);
1251 
1252 	TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
1253 	TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
1254 	TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
1255 	TEST(collapse_single_pte_entry, madvise_context, anon_ops);
1256 	TEST(collapse_single_pte_entry, madvise_context, file_ops);
1257 	TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
1258 
1259 	TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
1260 	TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
1261 	TEST(collapse_max_ptes_none, madvise_context, anon_ops);
1262 	TEST(collapse_max_ptes_none, madvise_context, file_ops);
1263 
1264 	TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
1265 	TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
1266 	TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
1267 	TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
1268 
1269 	TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
1270 	TEST(collapse_full_of_compound, khugepaged_context, file_ops);
1271 	TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
1272 	TEST(collapse_full_of_compound, madvise_context, anon_ops);
1273 	TEST(collapse_full_of_compound, madvise_context, file_ops);
1274 	TEST(collapse_full_of_compound, madvise_context, shmem_ops);
1275 
1276 	TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
1277 	TEST(collapse_compound_extreme, madvise_context, anon_ops);
1278 
1279 	TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
1280 	TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
1281 
1282 	TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
1283 	TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
1284 
1285 	TEST(collapse_fork, khugepaged_context, anon_ops);
1286 	TEST(collapse_fork, madvise_context, anon_ops);
1287 
1288 	TEST(collapse_fork_compound, khugepaged_context, anon_ops);
1289 	TEST(collapse_fork_compound, madvise_context, anon_ops);
1290 
1291 	TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
1292 	TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
1293 
1294 	TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
1295 	TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
1296 	TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
1297 
1298 	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
1299 	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
1300 
1301 	restore_settings(0);
1302 }
1303