xref: /linux/arch/um/kernel/skas/stub_exe.c (revision 7f81907b7e3f93dfed2e903af52659baa4944341)
1 #include <sys/ptrace.h>
2 #include <sys/prctl.h>
3 #include <sys/fcntl.h>
4 #include <asm/unistd.h>
5 #include <sysdep/stub.h>
6 #include <stub-data.h>
7 #include <linux/filter.h>
8 #include <linux/seccomp.h>
9 #include <generated/asm-offsets.h>
10 
/* Forward declaration of the entry point; the definition is at the end of this file. */
11 void _start(void);
12 
/*
 * real_init() - set up the stub process after _start() has moved the stack.
 *
 * Steps, in order:
 *  - set the process name, the parent-death signal and no_new_privs;
 *  - read a struct stub_init_data from FD 0;
 *  - mmap one page of stub code and STUB_DATA_PAGES pages of stub data at
 *    init_data.stub_start (MAP_FIXED | MAP_SHARED);
 *  - put the alternate signal stack at the start of the stub data area;
 *  - register the signal handler(s) supplied in init_data;
 *  - either install the SECCOMP filter (init_data.seccomp set) or request
 *    tracing via PTRACE_TRACEME and stop with SIGSTOP.
 *
 * Takes no arguments and does not return. Each checked failure exits with
 * its own status code (10-21) so the failing step can be identified; the
 * final exit uses status 30.
 */
13 noinline static void real_init(void)
14 {
15 	struct stub_init_data init_data;
16 	unsigned long res;
	/* Local layout handed to the raw sigaltstack syscall */
17 	struct {
18 		void  *ss_sp;
19 		int    ss_flags;
20 		size_t ss_size;
21 	} stack = {
22 		.ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE,
23 	};
	/* Local layout handed to the raw rt_sigaction syscall */
24 	struct {
25 		void *sa_handler_;
26 		unsigned long sa_flags;
27 		void *sa_restorer;
28 		unsigned long long sa_mask;
29 	} sa = {
30 		/* Need to set SA_RESTORER, open-coded as 0x04000000 (in ptrace mode the handler never returns) */
31 		.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000,
32 	};
33 
34 	/* set a nice name */
35 	stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace");
36 
37 	/* Make sure this process dies if the kernel dies */
38 	stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL);
39 
40 	/* Needed in SECCOMP mode (and safe to do anyway) */
41 	stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
42 
43 	/* read information from STDIN and close it; a short read or error exits with 10 */
44 	res = stub_syscall3(__NR_read, 0,
45 			    (unsigned long)&init_data, sizeof(init_data));
46 	if (res != sizeof(init_data))
47 		stub_syscall1(__NR_exit, 10);
48 
49 	/* In SECCOMP mode, FD 0 is a socket and is later used for FD passing */
50 	if (!init_data.seccomp)
51 		stub_syscall1(__NR_close, 0);
52 	else
53 		stub_syscall3(__NR_fcntl, 0, F_SETFL, O_NONBLOCK);
54 
55 	/* map stub code + data: one read/exec code page, then the read/write data pages */
56 	res = stub_syscall6(STUB_MMAP_NR,
57 			    init_data.stub_start, UM_KERN_PAGE_SIZE,
58 			    PROT_READ | PROT_EXEC, MAP_FIXED | MAP_SHARED,
59 			    init_data.stub_code_fd, init_data.stub_code_offset);
60 	if (res != init_data.stub_start)
61 		stub_syscall1(__NR_exit, 11);
62 
63 	res = stub_syscall6(STUB_MMAP_NR,
64 			    init_data.stub_start + UM_KERN_PAGE_SIZE,
65 			    STUB_DATA_PAGES * UM_KERN_PAGE_SIZE,
66 			    PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED,
67 			    init_data.stub_data_fd, init_data.stub_data_offset);
68 	if (res != init_data.stub_start + UM_KERN_PAGE_SIZE)
69 		stub_syscall1(__NR_exit, 12);
70 
71 	/* In SECCOMP mode, we only need the signalling FD from now on: close every FD from 1 upwards */
72 	if (init_data.seccomp) {
73 		res = stub_syscall3(__NR_close_range, 1, ~0U, 0);
74 		if (res != 0)
75 			stub_syscall1(__NR_exit, 13);
76 	}
77 
78 	/* setup signal stack inside stub data (return value is not checked) */
79 	stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
80 	stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
81 
82 	/* register signal handlers; the same sa is reused for every signal */
83 	sa.sa_handler_ = (void *) init_data.signal_handler;
84 	sa.sa_restorer = (void *) init_data.signal_restorer;
85 	if (!init_data.seccomp) {
86 		/* In ptrace mode, the SIGSEGV handler never returns */
87 		sa.sa_mask = 0;
88 
89 		res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
90 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
91 		if (res != 0)
92 			stub_syscall1(__NR_exit, 14);
93 	} else {
94 		/* SECCOMP mode uses rt_sigreturn, need to mask all signals */
95 		sa.sa_mask = ~0ULL;
96 
97 		res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
98 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
99 		if (res != 0)
100 			stub_syscall1(__NR_exit, 15);
101 
102 		res = stub_syscall4(__NR_rt_sigaction, SIGSYS,
103 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
104 		if (res != 0)
105 			stub_syscall1(__NR_exit, 16);
106 
107 		res = stub_syscall4(__NR_rt_sigaction, SIGALRM,
108 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
109 		if (res != 0)
110 			stub_syscall1(__NR_exit, 17);
111 
112 		res = stub_syscall4(__NR_rt_sigaction, SIGTRAP,
113 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
114 		if (res != 0)
115 			stub_syscall1(__NR_exit, 18);
116 
117 		res = stub_syscall4(__NR_rt_sigaction, SIGILL,
118 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
119 		if (res != 0)
120 			stub_syscall1(__NR_exit, 19);
121 
122 		res = stub_syscall4(__NR_rt_sigaction, SIGFPE,
123 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
124 		if (res != 0)
125 			stub_syscall1(__NR_exit, 20);
126 	}
127 
128 	/*
129 	 * If in seccomp mode, install the SECCOMP filter and trigger a syscall.
130 	 * Otherwise set PTRACE_TRACEME and do a SIGSTOP.
131 	 */
132 	if (init_data.seccomp) {
		/*
		 * The [n] indices in the comments below assume the 64-bit
		 * build, where instructions [0] and [1] are present. All
		 * jump offsets are relative, so the program is also
		 * consistent without them.
		 */
133 		struct sock_filter filter[] = {
134 #if __BITS_PER_LONG > 32
135 			/* [0] Load upper 32bit of instruction pointer from seccomp_data */
136 			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
137 				 (offsetof(struct seccomp_data, instruction_pointer) + 4)),
138 
139 			/* [1] Jump forward 3 instructions (to the trap at [5]) if the upper address is not identical */
140 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) >> 32, 0, 3),
141 #endif
142 			/* [2] Load lower 32bit of instruction pointer from seccomp_data */
143 			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
144 				 (offsetof(struct seccomp_data, instruction_pointer))),
145 
146 			/* [3] Mask out lower bits */
147 			BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000),
148 
149 			/* [4] Jump to [6] if the lower bits are on the expected (stub) page, else fall through to [5] */
150 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) & 0xfffff000, 1, 0),
151 
152 			/* [5] Syscall was not issued from the stub page: return SECCOMP_RET_TRAP */
153 			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
154 
155 			/* [6,7] Check architecture; skip the kill at [8] when it is the native one */
156 			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
157 				 offsetof(struct seccomp_data, arch)),
158 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,
159 				 UM_SECCOMP_ARCH_NATIVE, 1, 0),
160 
161 			/* [8] Kill (for architecture check) */
162 			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
163 
164 			/* [9] Load syscall number */
165 			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
166 				 offsetof(struct seccomp_data, nr)),
167 
168 			/* [10-16] Check against permitted syscalls; every match jumps to the allow at [18] */
169 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex,
170 				 7, 0),
171 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_recvmsg,
172 				 6, 0),
173 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_close,
174 				 5, 0),
175 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
176 				 4, 0),
177 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap,
178 				 3, 0),
179 #ifdef __i386__
180 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area,
181 				 2, 0),
182 #else
183 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl,
184 				 2, 0),
185 #endif
186 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn,
187 				 1, 0),
188 
189 			/* [17] Not one of the permitted syscalls */
190 			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
191 
192 			/* [18] Permitted call for the stub */
193 			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
194 		};
195 		struct sock_fprog prog = {
196 			.len = sizeof(filter) / sizeof(filter[0]),
197 			.filter = filter,
198 		};
199 
200 		if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
201 				  SECCOMP_FILTER_FLAG_TSYNC,
202 				  (unsigned long)&prog) != 0)
203 			stub_syscall1(__NR_exit, 21);
204 
205 		/* Fall through, the exit syscall will cause SIGSYS */
206 	} else {
207 		stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
208 
		/* Send SIGSTOP to ourselves */
209 		stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
210 	}
211 
212 	stub_syscall1(__NR_exit, 30);
213 
214 	__builtin_unreachable();
215 }
216 
/*
 * _start() - entry point of the stub executable.
 *
 * Declared naked; its body is only the stub_start() invocation, which is
 * handed real_init() as the function to run (see the comment inside for
 * why the stack has to be moved first).
 */
217 __attribute__((naked)) void _start(void)
218 {
219 	/*
220 	 * Since the stack after exec() starts at the top-most address,
221 	 * but that's exactly where we also want to map the stub data
222 	 * and code, this must:
223 	 *  - push the stack by 1 code and STUB_DATA_PAGES data pages
224 	 *  - call real_init()
225 	 * This way, real_init() can use the stack normally, while the
226 	 * original stack further down (higher address) will become
227 	 * inaccessible after the mmap() calls in real_init().
228 	 */
229 	stub_start(real_init);
230 }
231