xref: /illumos-gate/usr/src/uts/common/os/subr.c (revision 17a5fa85fe0c34b1146222e40a80b42f2aae8500)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2019 Joyent, Inc.
28  */
29 
30 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
31 /*	  All Rights Reserved	*/
32 
33 #include <sys/types.h>
34 #include <sys/sysmacros.h>
35 #include <sys/param.h>
36 #include <sys/vmparam.h>
37 #include <sys/systm.h>
38 #include <sys/cred.h>
39 #include <sys/user.h>
40 #include <sys/proc.h>
41 #include <sys/conf.h>
42 #include <sys/tuneable.h>
43 #include <sys/cpuvar.h>
44 #include <sys/archsystm.h>
45 #include <sys/vmem.h>
46 #include <vm/seg_kmem.h>
47 #include <sys/errno.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/atomic.h>
51 #include <sys/model.h>
52 #include <sys/kmem.h>
53 #include <sys/memlist.h>
54 #include <sys/autoconf.h>
55 #include <sys/ontrap.h>
56 #include <sys/utsname.h>
57 #include <sys/zone.h>
58 
59 #ifdef __sparc
60 #include <sys/membar.h>
61 #endif
62 
63 /*
64  * Routine which sets a user error; placed in
65  * illegal entries in the bdevsw and cdevsw tables.
66  */
67 
68 int
69 nodev()
70 {
71 	return (curthread->t_lwp ?
72 	    ttolwp(curthread)->lwp_error = ENXIO : ENXIO);
73 }
74 
/*
 * Do-nothing stub placed in insignificant entries of the bdevsw and
 * cdevsw tables.  It has no side effects and always returns 0.
 */

int
nulldev()
{
	return (0);
}
85 
86 static kmutex_t udevlock;
87 
88 /*
89  * Generate an unused major device number.
90  */
91 major_t
92 getudev()
93 {
94 	static major_t next = 0;
95 	major_t ret;
96 
97 	/*
98 	 * Ensure that we start allocating major numbers above the 'devcnt'
99 	 * count.  The only limit we place on the number is that it should be a
100 	 * legal 32-bit SVR4 major number and be greater than or equal to devcnt
101 	 * in the current system).
102 	 */
103 	mutex_enter(&udevlock);
104 	if (next == 0)
105 		next = devcnt;
106 	if (next <= L_MAXMAJ32 && next >= devcnt)
107 		ret = next++;
108 	else {
109 		/*
110 		 * If we fail to allocate a major number because devcnt has
111 		 * reached L_MAXMAJ32, we may be the victim of a sparsely
112 		 * populated devnames array.  We scan the array backwards
113 		 * looking for an empty slot;  if we find one, mark it as
114 		 * DN_GETUDEV so it doesn't get taken by subsequent consumers
115 		 * users of the devnames array, and issue a warning.
116 		 * It is vital for this routine to take drastic measures to
117 		 * succeed, since the kernel really needs it to boot.
118 		 */
119 		int i;
120 		for (i = devcnt - 1; i >= 0; i--) {
121 			LOCK_DEV_OPS(&devnamesp[i].dn_lock);
122 			if (devnamesp[i].dn_name == NULL &&
123 			    ((devnamesp[i].dn_flags & DN_TAKEN_GETUDEV) == 0))
124 				break;
125 			UNLOCK_DEV_OPS(&devnamesp[i].dn_lock);
126 		}
127 		if (i != -1) {
128 			cmn_err(CE_WARN, "Reusing device major number %d.", i);
129 			ASSERT(i >= 0 && i < devcnt);
130 			devnamesp[i].dn_flags |= DN_TAKEN_GETUDEV;
131 			UNLOCK_DEV_OPS(&devnamesp[i].dn_lock);
132 			ret = (major_t)i;
133 		} else {
134 			ret = DDI_MAJOR_T_NONE;
135 		}
136 	}
137 	mutex_exit(&udevlock);
138 	return (ret);
139 }
140 
141 
142 /*
143  * Compress 'long' device number encoding to 32-bit device number
144  * encoding.  If it won't fit, we return failure, but set the
145  * device number to 32-bit NODEV for the sake of our callers.
146  */
147 int
148 cmpldev(dev32_t *dst, dev_t dev)
149 {
150 #if defined(_LP64)
151 	if (dev == NODEV) {
152 		*dst = NODEV32;
153 	} else {
154 		major_t major = dev >> L_BITSMINOR;
155 		minor_t minor = dev & L_MAXMIN;
156 
157 		if (major > L_MAXMAJ32 || minor > L_MAXMIN32) {
158 			*dst = NODEV32;
159 			return (0);
160 		}
161 
162 		*dst = (dev32_t)((major << L_BITSMINOR32) | minor);
163 	}
164 #else
165 	*dst = (dev32_t)dev;
166 #endif
167 	return (1);
168 }
169 
170 /*
171  * Expand 32-bit dev_t's to long dev_t's.  Expansion always "fits"
172  * into the return type, but we're careful to expand NODEV explicitly.
173  */
174 dev_t
175 expldev(dev32_t dev32)
176 {
177 #ifdef _LP64
178 	if (dev32 == NODEV32)
179 		return (NODEV);
180 	return (makedevice((dev32 >> L_BITSMINOR32) & L_MAXMAJ32,
181 	    dev32 & L_MAXMIN32));
182 #else
183 	return ((dev_t)dev32);
184 #endif
185 }
186 
187 #ifndef _LP64
188 /*
189  * Keep these entry points for 32-bit systems but enforce the use
190  * of MIN/MAX macros on 64-bit systems.  The DDI header files already
191  * define min/max as macros so drivers shouldn't need these functions.
192  */
193 
/* Return the smaller of two signed ints; b is returned when they are equal. */
int
min(int a, int b)
{
	if (a < b)
		return (a);
	return (b);
}
199 
/* Return the larger of two signed ints; b is returned when they are equal. */
int
max(int a, int b)
{
	if (a > b)
		return (a);
	return (b);
}
205 
206 uint_t
207 umin(uint_t a, uint_t b)
208 {
209 	return (a < b ? a : b);
210 }
211 
212 uint_t
213 umax(uint_t a, uint_t b)
214 {
215 	return (a > b ? a : b);
216 }
217 
218 #endif /* !_LP64 */
219 
/*
 * Parse the next suboption from a comma-separated option string.
 * Same as getsubopt(3C).
 *
 * The suboption at *optionsp is terminated in place (its trailing ','
 * is overwritten with a null byte) and *optionsp is advanced to the
 * following suboption, or to the terminating null byte if there is none.
 *
 * Returns the index in 'tokens' of the entry matching the suboption name,
 * with *valuep pointing just past the '=' (or NULL when there is no '=').
 * Returns -1 with *valuep set to NULL when the input is empty, and -1 with
 * *valuep pointing at the whole unrecognized suboption when nothing in
 * 'tokens' matches.
 */
int
getsubopt(char **optionsp, char * const *tokens, char **valuep)
{
	char *opt = *optionsp;
	char *comma, *equals;
	size_t namelen;
	int idx;

	*valuep = NULL;
	if (opt[0] == '\0')
		return (-1);

	/* Split off this suboption and record where the next one starts. */
	comma = strchr(opt, ',');
	if (comma != NULL) {
		*comma = '\0';
		*optionsp = comma + 1;
	} else {
		*optionsp = opt + strlen(opt);
	}

	/* The name runs up to the '=' if present, else to the end. */
	equals = strchr(opt, '=');
	if (equals != NULL) {
		namelen = equals - opt;
		*valuep = equals + 1;
	} else {
		namelen = strlen(opt);
	}

	/* Exact match only: the lengths and the bytes must both agree. */
	for (idx = 0; tokens[idx] != NULL; idx++) {
		if (strlen(tokens[idx]) != namelen)
			continue;
		if (strncmp(opt, tokens[idx], namelen) == 0)
			return (idx);
	}

	/* no match, point value at option and return error */
	*valuep = opt;
	return (-1);
}
258 
/*
 * Append the suboption string 'opt' starting at the position 'str'
 * within the buffer defined by 'buf' and 'len'.  If 'buf' does not hold
 * an empty string, a comma is written first as a separator.
 *
 * The space check is made against strlen(buf), so 'str' is expected to
 * point at the null byte that currently terminates 'buf'.
 *
 * Return a pointer to the end of the resulting string (the null byte).
 * Return NULL, leaving the buffer untouched, if there isn't enough space
 * left to append 'opt'.
 */
char *
append_subopt(const char *buf, size_t len, char *str, const char *opt)
{
	const size_t optlen = strlen(opt);
	/* One byte for the ',' unless this is the first option. */
	const size_t seplen = (buf[0] != '\0') ? 1 : 0;

	/* Existing text + separator + option + terminating null byte. */
	if (strlen(buf) + seplen + optlen + 1 > len)
		return (NULL);

	if (seplen != 0) {
		*str = ',';
		str++;
	}
	(void) strcpy(str, opt);
	return (str + optlen);
}
283 
284 /*
285  * Tables to convert a single byte to/from binary-coded decimal (BCD).
286  */
287 uchar_t byte_to_bcd[256] = {
288 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
289 	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
290 	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
291 	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
292 	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
293 	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
294 	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
295 	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
296 	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
297 	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
298 };
299 
300 uchar_t bcd_to_byte[256] = {		/* CSTYLED */
301 	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  0,  0,  0,  0,  0,
302 	10, 11, 12, 13, 14, 15, 16, 17, 18, 19,  0,  0,  0,  0,  0,  0,
303 	20, 21, 22, 23, 24, 25, 26, 27, 28, 29,  0,  0,  0,  0,  0,  0,
304 	30, 31, 32, 33, 34, 35, 36, 37, 38, 39,  0,  0,  0,  0,  0,  0,
305 	40, 41, 42, 43, 44, 45, 46, 47, 48, 49,  0,  0,  0,  0,  0,  0,
306 	50, 51, 52, 53, 54, 55, 56, 57, 58, 59,  0,  0,  0,  0,  0,  0,
307 	60, 61, 62, 63, 64, 65, 66, 67, 68, 69,  0,  0,  0,  0,  0,  0,
308 	70, 71, 72, 73, 74, 75, 76, 77, 78, 79,  0,  0,  0,  0,  0,  0,
309 	80, 81, 82, 83, 84, 85, 86, 87, 88, 89,  0,  0,  0,  0,  0,  0,
310 	90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
311 };
312 
313 /*
314  * Hot-patch a single instruction in the kernel's text.
315  *
316  * If you want to patch multiple instructions you must arrange to do it so that
317  * all intermediate stages are sane -- we don't stop other cpus while doing
318  * this.
319  *
320  * Size must be 1, 2, or 4 bytes with iaddr aligned accordingly.
321  *
322  * The instruction itself might straddle a page boundary, so we have to account
323  * for that.
324  */
325 void
326 hot_patch_kernel_text(caddr_t iaddr, uint32_t new_instr, uint_t size)
327 {
328 	const uintptr_t pageoff = (uintptr_t)iaddr & PAGEOFFSET;
329 	const boolean_t straddles = (pageoff + size > PAGESIZE);
330 	const size_t mapsize = straddles ? PAGESIZE * 2 : PAGESIZE;
331 	caddr_t ipageaddr = iaddr - pageoff;
332 	caddr_t vaddr;
333 	page_t **ppp;
334 
335 	vaddr = vmem_alloc(heap_arena, mapsize, VM_SLEEP);
336 
337 	(void) as_pagelock(&kas, &ppp, ipageaddr, mapsize, S_WRITE);
338 
339 	hat_devload(kas.a_hat, vaddr, PAGESIZE,
340 	    hat_getpfnum(kas.a_hat, ipageaddr), PROT_READ | PROT_WRITE,
341 	    HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
342 
343 	if (straddles) {
344 		hat_devload(kas.a_hat, vaddr + PAGESIZE, PAGESIZE,
345 		    hat_getpfnum(kas.a_hat, ipageaddr + PAGESIZE),
346 		    PROT_READ | PROT_WRITE, HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
347 	}
348 
349 	switch (size) {
350 	case 1:
351 		*(uint8_t *)(vaddr + pageoff) = new_instr;
352 		break;
353 	case 2:
354 		*(uint16_t *)(vaddr + pageoff) = new_instr;
355 		break;
356 	case 4:
357 		*(uint32_t *)(vaddr + pageoff) = new_instr;
358 		break;
359 	default:
360 		panic("illegal hot-patch");
361 	}
362 
363 	membar_enter();
364 	sync_icache(vaddr + pageoff, size);
365 	sync_icache(iaddr, size);
366 	as_pageunlock(&kas, ppp, ipageaddr, mapsize, S_WRITE);
367 	hat_unload(kas.a_hat, vaddr, mapsize, HAT_UNLOAD_UNLOCK);
368 	vmem_free(heap_arena, vaddr, mapsize);
369 }
370 
371 /*
372  * Routine to report an attempt to execute non-executable data.  If the
373  * address executed lies in the stack, explicitly say so.
374  */
375 void
376 report_stack_exec(proc_t *p, caddr_t addr)
377 {
378 	if (!noexec_user_stack_log)
379 		return;
380 
381 	if (addr < p->p_usrstack && addr >= (p->p_usrstack - p->p_stksize)) {
382 		cmn_err(CE_NOTE, "%s[%d] attempt to execute code "
383 		    "on stack by uid %d", p->p_user.u_comm,
384 		    p->p_pid, crgetruid(p->p_cred));
385 	} else {
386 		cmn_err(CE_NOTE, "%s[%d] attempt to execute non-executable "
387 		    "data at 0x%p by uid %d", p->p_user.u_comm,
388 		    p->p_pid, (void *) addr, crgetruid(p->p_cred));
389 	}
390 
391 	delay(hz / 50);
392 }
393 
394 /*
395  * Determine whether the address range [addr, addr + len) is in memlist mp.
396  */
397 int
398 address_in_memlist(struct memlist *mp, uint64_t addr, size_t len)
399 {
400 	while (mp != 0)	 {
401 		if ((addr >= mp->ml_address) &&
402 		    (addr + len <= mp->ml_address + mp->ml_size))
403 			return (1);	 /* TRUE */
404 		mp = mp->ml_next;
405 	}
406 	return (0);	/* FALSE */
407 }
408 
409 /*
410  * Pop the topmost element from the t_ontrap stack, removing the current set of
411  * on_trap() protections.  Refer to <sys/ontrap.h> for more info.  If the
412  * stack is already empty, no_trap() just returns.
413  */
414 void
415 no_trap(void)
416 {
417 	if (curthread->t_ontrap != NULL) {
418 #ifdef __sparc
419 		membar_sync(); /* deferred error barrier (see sparcv9_subr.s) */
420 #endif
421 		curthread->t_ontrap = curthread->t_ontrap->ot_prev;
422 	}
423 }
424 
425 /*
426  * Return utsname.nodename outside a zone, or the zone name within.
427  */
428 char *
429 uts_nodename(void)
430 {
431 	if (curproc == NULL)
432 		return (utsname.nodename);
433 	return (curproc->p_zone->zone_nodename);
434 }
435