xref: /illumos-gate/usr/src/lib/libumem/i386/umem_genasm.c (revision 5328fc53d11d7151861fa272e4fb0248b8f0e145)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2019 Joyent, Inc.  All rights reserved.
23  */
24 
25 /*
26  * Don't Panic! If you find the blocks of assembly that follow confusing and
27  * you're questioning why they exist, please go read section 8 of the umem.c big
28  * theory statement. Next familiarize yourself with the malloc and free
29  * implementations in libumem's malloc.c.
30  *
31  * What follows is the i386 implementation of the thread caching automatic
32  * assembly generation. With i386 a function only has three registers it's
33  * allowed to change without restoring them: eax, ecx, and edx. All others have
34  * to be preserved. Since the set of registers we have available is so small, we
35  * have to make use of esi, ebx, and edi and save their original values to the
36  * stack.
37  *
38  * Malloc register usage:
39  * 	o. esi: Size of the malloc (passed into us and modified)
40  * 	o. edi: Size of the cache
41  * 	o. eax: Buffer to return
42  * 	o. ebx: Scratch space and temporary values
43  * 	o. ecx: Pointer to the tmem_t in the ulwp_t.
44  * 	o. edx: Pointer to the tmem_t array of roots
45  *
46  * Free register usage:
47  * 	o. esi: Size of the malloc (passed into us and modified)
48  * 	o. edi: Size of the cache
49  * 	o. eax: Buffer to free
50  * 	o. ebx: Scratch space and temporary values
51  * 	o. ecx: Pointer to the tmem_t in the ulwp_t.
52  * 	o. edx: Pointer to the tmem_t array of roots
53  *
54  * Once we determine what cache we are using, we increment %edx to the
55  * appropriate offset and set %edi with the size of the cache. This means that
56  * when we break out to the normal buffer allocation point %edx contains the
57  * head of the linked list and %edi is the amount that we have to adjust the
58  * total amount cached by the thread.
59  *
 * Each block of assembly has pseudocode that describes its purpose.
61  */
62 
63 /*
64  * umem_base must be first.
65  */
66 #include "umem_base.h"
67 
68 #include <inttypes.h>
69 #include <strings.h>
70 #include <umem_impl.h>
71 #include <atomic.h>
72 #include <sys/mman.h>
73 #include <errno.h>
74 
/*
 * umem_genasm_supported is set to 1 in this file.
 * NOTE(review): presumably the rest of libumem consults it to decide whether
 * umem_genasm() is usable on this platform -- confirm against the callers.
 *
 * The remaining variables describe what umem_genasm() patches:
 *  o umem_genasm_mptr / umem_genasm_fptr hold the addresses of _malloc and
 *    _free, the regions that the generated code is written into.
 *  o umem_genasm_msize / umem_genasm_fsize are the sizes, in bytes, that
 *    umem_genasm() assumes for those two regions.
 *  o umem_genasm_omptr / umem_genasm_ofptr hold the addresses of umem_malloc
 *    and umem_malloc_free; the generated epilogues jump to them whenever the
 *    per-thread cache cannot satisfy the request.
 */
const int umem_genasm_supported = 1;
static uintptr_t umem_genasm_mptr = (uintptr_t)&_malloc;
static size_t umem_genasm_msize = 512;
static uintptr_t umem_genasm_fptr = (uintptr_t)&_free;
static size_t umem_genasm_fsize = 512;
static uintptr_t umem_genasm_omptr = (uintptr_t)umem_malloc;
static uintptr_t umem_genasm_ofptr = (uintptr_t)umem_malloc_free;
/*
 * The maximum number of caches we can support. We use a single byte addl so
 * this is 255 (UINT8_MAX) / sizeof (uintptr_t). In this case 63
 *
 * NOTE(review): the addl in gencache[] and fincache[] is the 0x83 (imm8) form,
 * whose immediate the processor sign-extends, so a root offset of 128 or more
 * would be applied as a negative value. Confirm that the limit imposed by
 * libc keeps the number of entries well below that point.
 */
#define	UMEM_GENASM_MAX32	63

/*
 * PTC_JMPADDR computes the relative displacement to store in a 4-byte jump
 * operand located at src so that the jump lands on dest; the displacement is
 * measured from the end of the operand, hence the + 4. Both arguments and the
 * whole expansion are parenthesized so that the macro is safe to use with
 * arbitrary expressions.
 *
 * PTC_ROOT_SIZE is the size of one entry in the per-thread array of roots.
 *
 * MULTINOP holds, in its low five bytes, the sequence 0f 1f 44 00 00 (a
 * five-byte nop) that umem_genasm() writes over the head of _malloc and
 * _free.
 */
#define	PTC_JMPADDR(dest, src)	((dest) - ((src) + 4))
#define	PTC_ROOT_SIZE	(sizeof (uintptr_t))
#define	MULTINOP	0x0000441f0f
91 
92 /*
93  * void *ptcmalloc(size_t orig_size);
94  *
95  * size_t size = orig_size + 8;
96  *
97  * if (size < orig_size)
98  * 	goto tomalloc;		! This is overflow
99  *
100  * if (size > cache_size)
101  * 	goto tomalloc;
102  *
103  * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
104  * void **roots = t->tm_roots;
105  */
/*
 * Byte offsets into malinit[] of the 32-bit operands that genasm_malinit()
 * fills in after copying the template:
 *  o PTC_MALINIT_JOUT: displacement of the jc taken when size + 8 wraps
 *  o PTC_MALINIT_MCS: immediate of the cmpl against the largest cache size
 *  o PTC_MALINIT_JOV: displacement of the ja taken when the size is too big
 *  o PTC_MALINIT_SOFF: immediate of the addl that offsets %ecx to the tmem_t
 */
#define	PTC_MALINIT_JOUT	0x0e
#define	PTC_MALINIT_MCS	0x14
#define	PTC_MALINIT_JOV	0x1a
#define	PTC_MALINIT_SOFF	0x27
static const uint8_t malinit[] = {
	/* Frame setup; %edi, %esi and %ebx are saved because we reuse them */
	0x55,					/* pushl %ebp */
	0x89, 0xe5,				/* movl %esp, %ebp */
	0x57,					/* pushl %edi */
	0x56,					/* pushl %esi */
	0x53,					/* pushl %ebx */
	/* size = orig_size + 8; carry set means the addition overflowed */
	0x8b, 0x75, 0x08,			/* movl 0x8(%ebp), %esi */
	0x83, 0xc6, 0x08,			/* addl $0x8,%esi */
	0x0f, 0x82, 0x00, 0x00, 0x00, 0x00, 	/* jc +$JMP (errout) */
	/* Anything larger than the largest cache goes to the real malloc */
	0x81, 0xfe, 0x00, 0x00, 0x00, 0x00, 	/* cmpl sizeof ($CLAST), %esi */
	0x0f, 0x87, 0x00, 0x00, 0x00, 0x00,	/* ja +$JMP (errout) */
	/* %ecx = tmem_t of this thread, %edx = start of its roots array */
	0x65, 0x8b, 0x0d, 0x00, 0x00, 0x00, 0x00, 	/* movl %gs:0x0,%ecx */
	0x81, 0xc1, 0x00, 0x00,	0x00, 0x00, 	/* addl $OFF, %ecx */
	0x8d, 0x51, 0x04			/* leal 0x4(%ecx), %edx */
};
125 
126 /*
127  * void ptcfree(void *buf);
128  *
129  * if (buf == NULL)
130  * 	return;
131  *
132  * malloc_data_t *tag = buf;
133  * tag--;
134  * int size = tag->malloc_size;
 * int tagval = UMEM_MALLOC_DECODE(tag->malloc_tag, size);
136  *
137  * if (tagval != MALLOC_MAGIC)
138  * 	goto tofree;
139  *
140  * if (size > cache_max)
141  * 	goto tofree;
142  *
143  * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
144  * void **roots = t->tm_roots;
145  */
/*
 * Byte offsets into freeinit[] of the 32-bit operands that genasm_frinit()
 * fills in after copying the template:
 *  o PTC_FRINI_JDONE: displacement of the je taken when buf is NULL
 *  o PTC_FRINI_JFREE: displacement of the jne taken when the tag is not valid
 *  o PTC_FRINI_MCS: immediate of the cmpl against the largest cache size
 *  o PTC_FRINI_JOV: displacement of the ja taken when the size is too big
 *  o PTC_FRINI_SOFF: immediate of the addl that offsets %ecx to the tmem_t
 */
#define	PTC_FRINI_JDONE	0x0d
#define	PTC_FRINI_JFREE	0x23
#define	PTC_FRINI_MCS	0x29
#define	PTC_FRINI_JOV	0x2f
#define	PTC_FRINI_SOFF	0x3c
static const uint8_t freeinit[] = {
	/* Frame setup; %edi, %esi and %ebx are saved because we reuse them */
	0x55,					/* pushl %ebp */
	0x89, 0xe5,				/* movl %esp, %ebp */
	0x57,					/* pushl %edi */
	0x56,					/* pushl %esi */
	0x53,					/* pushl %ebx */
	/* free(NULL) has nothing to do */
	0x8b, 0x45, 0x08,			/* movl 0x8(%ebp), %eax */
	0x85, 0xc0,				/* testl %eax, %eax */
	0x0f, 0x84, 0x00, 0x00, 0x00, 0x00,	/* je $JDONE (done) */
	/* Step back to the 8-byte tag; size + second word must be the magic */
	0x83, 0xe8, 0x08,			/* subl $0x8,%eax */
	0x8b, 0x30,				/* movl (%eax),%esi */
	0x8b, 0x50, 0x04,			/* movl 0x4(%eax),%edx */
	0x01, 0xf2,				/* addl %esi,%edx */
	0x81, 0xfa, 0x00, 0xc0, 0x10, 0x3a,	/* cmpl MAGIC32, %edx */
	0x0f, 0x85, 0x00, 0x00, 0x00, 0x00,	/* jne +JFREE (realfree) */

	/* Anything larger than the largest cache goes to the real free */
	0x81, 0xfe, 0x00, 0x00, 0x00, 0x00, 	/* cmpl sizeof ($CLAST), %esi */
	0x0f, 0x87, 0x00, 0x00, 0x00, 0x00,	/* ja +$JMP (realfree) */
	/* %ecx = tmem_t of this thread, %edx = start of its roots array */
	0x65, 0x8b, 0x0d, 0x00, 0x0, 0x00, 0x00, /* movl %gs:0x0,%ecx */
	0x81, 0xc1, 0x00, 0x00,	0x00, 0x00,	/* addl $0xOFF, %ecx */
	0x8d, 0x51, 0x04			/* leal 0x4(%ecx),%edx */
};
173 
174 /*
175  * if (size <= $CACHE_SIZE) {
176  *	csize = $CACHE_SIZE;
177  * } else ...				! goto next cache
178  */
/*
 * Byte offsets into inicache[] of the 32-bit operands that
 * genasm_firstcache() fills in:
 *  o PTC_INICACHE_CMP: immediate of the cmpl (size of the first cache)
 *  o PTC_INICACHE_SIZE: immediate of the movl (size of the first cache)
 *  o PTC_INICACHE_JMP: displacement of the jmp to the allocbuf/freebuf label
 *
 * The first cache uses roots[0], so unlike gencache[] there is no addl to
 * advance %edx.
 */
#define	PTC_INICACHE_CMP	0x02
#define	PTC_INICACHE_SIZE 0x09
#define	PTC_INICACHE_JMP	0x0e
static const uint8_t inicache[] = {
	0x81, 0xfe, 0xff, 0x00, 0x00, 0x00, 	/* cmpl sizeof ($C0), %esi */
	/* Too big for this cache: skip the 10 bytes below to the next check */
	0x77, 0x0a,				/* ja +0xa */
	0xbf, 0xff, 0x00, 0x00, 0x00, 		/* movl sizeof ($C0), %edi */
	0xe9, 0x00, 0x00, 0x00, 0x00		/* jmp +$JMP (allocbuf) */
};
188 
189 /*
190  * if (size <= $CACHE_SIZE) {
191  *	csize = $CACHE_SIZE;
192  *	roots += $CACHE_NUM;
193  * } else ...				! goto next cache
194  */
/*
 * Byte offsets into gencache[] of the operands that genasm_gencache() fills
 * in:
 *  o PTC_GENCACHE_CMP: 32-bit immediate of the cmpl (size of this cache)
 *  o PTC_GENCACHE_NUM: 8-bit immediate of the addl (byte offset of this
 *    cache's entry in the roots array)
 *  o PTC_GENCACHE_SIZE: 32-bit immediate of the movl (size of this cache)
 *  o PTC_GENCACHE_JMP: 32-bit displacement of the jmp to the
 *    allocbuf/freebuf label
 */
#define	PTC_GENCACHE_CMP	0x02
#define	PTC_GENCACHE_NUM	0x0a
#define	PTC_GENCACHE_SIZE 0x0c
#define	PTC_GENCACHE_JMP	0x11
static const uint8_t gencache[] = {
	0x81, 0xfe, 0x00, 0x00, 0x00, 0x00, 	/* cmpl sizeof ($CACHE), %esi */
	/* Too big for this cache: skip the 13 bytes below to the next check */
	0x77, 0x0d,				/* ja +0xd (next cache) */
	0x83, 0xc2, 0x00,			/* addl $4*$ii, %edx */
	0xbf, 0x00, 0x00, 0x00, 0x00, 		/* movl sizeof ($CACHE), %edi */
	0xe9, 0x00, 0x00, 0x00, 0x00 		/* jmp +$JMP (allocbuf) */
};
206 
207 /*
208  * else if (size <= $CACHE_SIZE) {
209  *	csize = $CACHE_SIZE;
210  *	roots += $CACHE_NUM;
211  * } else {
212  *	goto tofunc; 			! goto tomalloc if ptcmalloc.
213  * }					! goto tofree if ptcfree.
214  */
/*
 * Byte offsets into fincache[] of the operands that genasm_lastcache() fills
 * in:
 *  o PTC_FINCACHE_CMP: 32-bit immediate of the cmpl (size of the last cache)
 *  o PTC_FINCACHE_JMP: 8-bit displacement of the ja to the error label
 *  o PTC_FINCACHE_NUM: 8-bit immediate of the addl (byte offset of the last
 *    cache's entry in the roots array)
 *  o PTC_FINCACHE_SIZE: 32-bit immediate of the movl (size of the last cache)
 *
 * There is no trailing jmp: this block is emitted immediately before the
 * epilogue, so execution falls through to the allocbuf/freebuf label.
 */
#define	PTC_FINCACHE_CMP 0x02
#define	PTC_FINCACHE_JMP	0x07
#define	PTC_FINCACHE_NUM 0x0a
#define	PTC_FINCACHE_SIZE 0x0c
static const uint8_t fincache[] = {
	0x81, 0xfe, 0xff, 0x00, 0x00, 0x00,	/* cmpl sizeof ($CLAST), %esi */
	0x77, 0x00,				/* ja +$JMP (to errout) */
	0x83, 0xc2, 0x00,			/* addl $4*($NCACHES-1), %edx */
	0xbf, 0x00, 0x00, 0x00, 0x00, 		/* movl sizeof ($CLAST), %edi */
};
225 
226 /*
227  * if (*root == NULL)
228  * 	goto tomalloc;
229  *
230  * malloc_data_t *ret = *root;
231  * *root = *(void **)ret;
 * t->tm_size -= csize;
233  * ret->malloc_size = size;
234  *
235  * ret->malloc_data = UMEM_MALLOC_ENCODE(MALLOC_SECOND_MAGIC, size);
236  * ret++;
237  *
238  * return ((void *)ret);
239  * tomalloc:
240  * 	return (malloc(orig_size));
241  */
/*
 * Offsets into malfini[]:
 *  o PTC_MALFINI_ALLABEL: the allocbuf label, target of the cache blocks
 *  o PTC_MALFINI_JMLABEL: the errout label, target of every bail-out jump
 *  o PTC_MALFINI_JMADDR: 32-bit displacement of the final jmp, filled in by
 *    genasm_malfini() with the address of the fallback malloc
 *
 * On entry to allocbuf %edx points at the root for the chosen cache, %edi
 * holds that cache's size, %esi the adjusted request size and %ecx the
 * tmem_t.
 */
#define	PTC_MALFINI_ALLABEL	0x00
#define	PTC_MALFINI_JMLABEL	0x20
#define	PTC_MALFINI_JMADDR	0x25
static const uint8_t malfini[] = {
	/* allocbuf: */
	0x8b, 0x02,			/* movl (%edx), %eax */
	0x85, 0xc0,			/* testl %eax, %eax */
	0x74, 0x1a,			/* je +0x1a (errout) */
	/* Unlink the head buffer: *root = next, via scratch register %ebx */
	0x8b, 0x18,			/* movl (%eax), %ebx */
	0x89, 0x1a,			/* movl %ebx, (%edx) */
	/* The buffer leaves the thread cache, so the cached total shrinks */
	0x29, 0x39,			/* subl %edi, (%ecx) */
	/* Write the tag: the size, then the magic minus the size */
	0x89, 0x30,			/* movl %esi, (%eax) */
	0xba, 0x00, 0xc0, 0x10, 0x3a,	/* movl $0x3a10c000,%edx */
	0x29, 0xf2,			/* subl %esi, %edx */
	0x89, 0x50, 0x04,		/* movl %edx, 0x4(%eax) */
	/* Return the address just past the 8-byte tag */
	0x83, 0xc0, 0x08,		/* addl $0x8, %eax */
	0x5b,				/* popl %ebx */
	0x5e,				/* popl %esi */
	0x5f,				/* popl %edi */
	0xc9,				/* leave */
	0xc3,				/* ret */
	/* errout: */
	/* Undo the prologue and tail-jump so the fallback sees our caller */
	0x5b,				/* popl %ebx */
	0x5e,				/* popl %esi */
	0x5f,				/* popl %edi */
	0xc9,				/* leave */
	0xe9, 0x00, 0x00, 0x00, 0x00	/* jmp $malloc */
};
270 
271 /*
 * if (t->tm_size + csize >= umem_ptc_size)
273  * 	goto tofree;
274  *
275  * t->tm_size += csize
276  * *(void **)tag = *root;
277  * *root = tag;
278  * return;
279  * tofree:
280  * 	free(buf);
281  * 	return;
282  */
/*
 * Offsets into freefini[]:
 *  o PTC_FRFINI_RBUFLABEL: the freebuf label, target of the cache blocks
 *  o PTC_FRFINI_CACHEMAX: 32-bit immediate of the cmpl, filled in by
 *    genasm_frfini() with the per-thread cache limit
 *  o PTC_FRFINI_DONELABEL: the done label (restore registers and return)
 *  o PTC_FRFINI_JFLABEL: the realfree label, target of every bail-out jump
 *  o PTC_FRFINI_JFADDR: 32-bit displacement of the final jmp, filled in by
 *    genasm_frfini() with the address of the fallback free
 *
 * On entry to freebuf %eax points at the buffer's tag, %edx at the root for
 * the chosen cache, %edi holds that cache's size and %ecx the tmem_t.
 */
#define	PTC_FRFINI_RBUFLABEL	0x00
#define	PTC_FRFINI_CACHEMAX	0x06
#define	PTC_FRFINI_DONELABEL	0x14
#define	PTC_FRFINI_JFLABEL	0x19
#define	PTC_FRFINI_JFADDR	0x1e
static const uint8_t freefini[] = {
	/* freebuf: */
	/* Would caching this buffer reach the per-thread limit? */
	0x8b, 0x19,				/* movl (%ecx),%ebx */
	0x01, 0xfb,				/* addl %edi,%ebx */
	0x81, 0xfb, 0x00, 0x00, 0x00, 0x00, 	/* cmpl maxsize, %ebx */
	0x73, 0x0d,				/* jae +0xd <tofree> */
	/* Account for the buffer and push it on the head of the list */
	0x01, 0x39,				/* addl %edi,(%ecx) */
	0x8b, 0x3a,				/* movl (%edx),%edi */
	0x89, 0x38,				/* movl %edi,(%eax) */
	0x89, 0x02,				/* movl %eax,(%edx) */
	/* done: */
	0x5b,					/* popl %ebx */
	0x5e,					/* popl %esi */
	0x5f,					/* popl %edi */
	0xc9,					/* leave */
	0xc3,					/* ret */
	/* realfree: */
	/* Undo the prologue and tail-jump so the fallback sees our caller */
	0x5b,					/* popl %ebx */
	0x5e,					/* popl %esi */
	0x5f,					/* popl %edi */
	0xc9,					/* leave */
	0xe9, 0x00, 0x00, 0x00, 0x00		/* jmp free */
};
311 
312 /*
313  * Construct the initial part of malloc. off contains the offset from curthread
314  * to the root of the tmem structure. ep is the address of the label to error
315  * and jump to free. csize is the size of the largest umem_cache in ptcumem.
316  */
317 static int
318 genasm_malinit(uint8_t *bp, uint32_t off, uint32_t ep, uint32_t csize)
319 {
320 	uint32_t addr;
321 
322 	bcopy(malinit, bp, sizeof (malinit));
323 	addr = PTC_JMPADDR(ep, PTC_MALINIT_JOUT);
324 	bcopy(&addr, bp + PTC_MALINIT_JOUT, sizeof (addr));
325 	bcopy(&csize, bp + PTC_MALINIT_MCS, sizeof (csize));
326 	addr = PTC_JMPADDR(ep, PTC_MALINIT_JOV);
327 	bcopy(&addr, bp + PTC_MALINIT_JOV, sizeof (addr));
328 	bcopy(&off, bp + PTC_MALINIT_SOFF, sizeof (off));
329 
330 	return (sizeof (malinit));
331 }
332 
333 static int
334 genasm_frinit(uint8_t *bp, uint32_t off, uint32_t dp, uint32_t ep, uint32_t mc)
335 {
336 	uint32_t addr;
337 
338 	bcopy(freeinit, bp, sizeof (freeinit));
339 	addr = PTC_JMPADDR(dp, PTC_FRINI_JDONE);
340 	bcopy(&addr, bp + PTC_FRINI_JDONE, sizeof (addr));
341 	addr = PTC_JMPADDR(ep, PTC_FRINI_JFREE);
342 	bcopy(&addr, bp + PTC_FRINI_JFREE, sizeof (addr));
343 	bcopy(&mc, bp + PTC_FRINI_MCS, sizeof (mc));
344 	addr = PTC_JMPADDR(ep, PTC_FRINI_JOV);
345 	bcopy(&addr, bp + PTC_FRINI_JOV, sizeof (addr));
346 	bcopy(&off, bp + PTC_FRINI_SOFF, sizeof (off));
347 	return (sizeof (freeinit));
348 }
349 
350 /*
351  * Create the initial cache entry of the specified size. The value of ap tells
352  * us what the address of the label to try and allocate a buffer. This value is
353  * an offset from the current base to that value.
354  */
355 static int
356 genasm_firstcache(uint8_t *bp, uint32_t csize, uint32_t ap)
357 {
358 	uint32_t addr;
359 
360 	bcopy(inicache, bp, sizeof (inicache));
361 	bcopy(&csize, bp + PTC_INICACHE_CMP, sizeof (csize));
362 	bcopy(&csize, bp + PTC_INICACHE_SIZE, sizeof (csize));
363 	addr = PTC_JMPADDR(ap, PTC_INICACHE_JMP);
364 	ASSERT(addr != 0);
365 	bcopy(&addr, bp + PTC_INICACHE_JMP, sizeof (addr));
366 
367 	return (sizeof (inicache));
368 }
369 
370 static int
371 genasm_gencache(uint8_t *bp, int num, uint32_t csize, uint32_t ap)
372 {
373 	uint32_t addr;
374 	uint8_t	coff;
375 
376 	ASSERT(256 / PTC_ROOT_SIZE > num);
377 	ASSERT(num != 0);
378 	bcopy(gencache, bp, sizeof (gencache));
379 	bcopy(&csize, bp + PTC_GENCACHE_CMP, sizeof (csize));
380 	bcopy(&csize, bp + PTC_GENCACHE_SIZE, sizeof (csize));
381 	coff = num * PTC_ROOT_SIZE;
382 	bcopy(&coff, bp + PTC_GENCACHE_NUM, sizeof (coff));
383 	addr = PTC_JMPADDR(ap, PTC_GENCACHE_JMP);
384 	bcopy(&addr, bp + PTC_GENCACHE_JMP, sizeof (addr));
385 
386 	return (sizeof (gencache));
387 }
388 
389 static int
390 genasm_lastcache(uint8_t *bp, int num, uint32_t csize, uint32_t ep)
391 {
392 	uint8_t addr;
393 
394 	ASSERT(ep <= 0xff && ep > 7);
395 	ASSERT(256 / PTC_ROOT_SIZE > num);
396 	bcopy(fincache, bp, sizeof (fincache));
397 	bcopy(&csize, bp + PTC_FINCACHE_CMP, sizeof (csize));
398 	bcopy(&csize, bp + PTC_FINCACHE_SIZE, sizeof (csize));
399 	addr = num * PTC_ROOT_SIZE;
400 	bcopy(&addr, bp + PTC_FINCACHE_NUM, sizeof (addr));
401 	addr = ep - PTC_FINCACHE_JMP - 1;
402 	bcopy(&addr, bp + PTC_FINCACHE_JMP, sizeof (addr));
403 
404 	return (sizeof (fincache));
405 }
406 
407 static int
408 genasm_malfini(uint8_t *bp, uintptr_t mptr)
409 {
410 	uint32_t addr;
411 
412 	bcopy(malfini, bp, sizeof (malfini));
413 	addr = PTC_JMPADDR(mptr, ((uintptr_t)bp + PTC_MALFINI_JMADDR));
414 	bcopy(&addr, bp + PTC_MALFINI_JMADDR, sizeof (addr));
415 
416 	return (sizeof (malfini));
417 }
418 
419 static int
420 genasm_frfini(uint8_t *bp, uint32_t maxthr, uintptr_t fptr)
421 {
422 	uint32_t addr;
423 
424 	bcopy(freefini, bp, sizeof (freefini));
425 	bcopy(&maxthr, bp + PTC_FRFINI_CACHEMAX, sizeof (maxthr));
426 	addr = PTC_JMPADDR(fptr, ((uintptr_t)bp + PTC_FRFINI_JFADDR));
427 	bcopy(&addr, bp + PTC_FRFINI_JFADDR, sizeof (addr));
428 
429 	return (sizeof (freefini));
430 }
431 
432 /*
433  * The malloc inline assembly is constructed as follows:
434  *
435  * o Malloc prologue assembly
436  * o Generic first-cache check
437  * o n Generic cache checks (where n = _tmem_get_entries() - 2)
438  * o Generic last-cache check
439  * o Malloc epilogue assembly
440  *
441  * Generally there are at least three caches. When there is only one cache we
442  * only use the generic last-cache. In the case where there are two caches, we
443  * just leave out the middle ones.
444  */
445 static int
446 genasm_malloc(void *base, size_t len, int nents, int *umem_alloc_sizes)
447 {
448 	int ii, off;
449 	uint8_t *bp;
450 	size_t total;
451 	uint32_t allocoff, erroff;
452 
453 	total = sizeof (malinit) + sizeof (malfini) + sizeof (fincache);
454 
455 	if (nents >= 2)
456 		total += sizeof (inicache) + sizeof (gencache) * (nents - 2);
457 
458 	if (total > len)
459 		return (1);
460 
461 	erroff = total - sizeof (malfini) + PTC_MALFINI_JMLABEL;
462 	allocoff = total - sizeof (malfini) + PTC_MALFINI_ALLABEL;
463 
464 	bp = base;
465 
466 	off = genasm_malinit(bp, umem_tmem_off, erroff,
467 	    umem_alloc_sizes[nents-1]);
468 	bp += off;
469 	allocoff -= off;
470 	erroff -= off;
471 
472 	if (nents > 1) {
473 		off = genasm_firstcache(bp, umem_alloc_sizes[0], allocoff);
474 		bp += off;
475 		allocoff -= off;
476 		erroff -= off;
477 	}
478 
479 	for (ii = 1; ii < nents - 1; ii++) {
480 		off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], allocoff);
481 		bp += off;
482 		allocoff -= off;
483 		erroff -= off;
484 	}
485 
486 	bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1],
487 	    erroff);
488 	bp += genasm_malfini(bp, umem_genasm_omptr);
489 	ASSERT(((uintptr_t)bp - total) == (uintptr_t)base);
490 
491 	return (0);
492 }
493 
494 static int
495 genasm_free(void *base, size_t len, int nents, int *umem_alloc_sizes)
496 {
497 	uint8_t *bp;
498 	int ii, off;
499 	size_t total;
500 	uint32_t rbufoff, retoff, erroff;
501 
502 	/* Assume that nents has already been audited for us */
503 	total = sizeof (freeinit) + sizeof (freefini) + sizeof (fincache);
504 	if (nents >= 2)
505 		total += sizeof (inicache) + sizeof (gencache) * (nents - 2);
506 
507 	if (total > len)
508 		return (1);
509 
510 	erroff = total - (sizeof (freefini) - PTC_FRFINI_JFLABEL);
511 	rbufoff = total - (sizeof (freefini) - PTC_FRFINI_RBUFLABEL);
512 	retoff = total - (sizeof (freefini) - PTC_FRFINI_DONELABEL);
513 
514 	bp = base;
515 
516 	off = genasm_frinit(bp, umem_tmem_off, retoff, erroff,
517 	    umem_alloc_sizes[nents - 1]);
518 	bp += off;
519 	erroff -= off;
520 	rbufoff -= off;
521 
522 	if (nents > 1) {
523 		off = genasm_firstcache(bp, umem_alloc_sizes[0], rbufoff);
524 		bp += off;
525 		erroff -= off;
526 		rbufoff -= off;
527 	}
528 
529 	for (ii = 1; ii < nents - 1; ii++) {
530 		off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], rbufoff);
531 		bp += off;
532 		rbufoff -= off;
533 		erroff -= off;
534 	}
535 
536 	bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1],
537 	    erroff);
538 	bp += genasm_frfini(bp, umem_ptc_size, umem_genasm_ofptr);
539 	ASSERT(((uintptr_t)bp - total) == (uintptr_t)base);
540 
541 	return (0);
542 }
543 
544 boolean_t
545 umem_genasm(int *alloc_sizes, umem_cache_t **caches, int ncaches)
546 {
547 	int nents, i;
548 	uint8_t *mptr;
549 	uint8_t *fptr;
550 	uint64_t v, *vptr;
551 	size_t mplen, fplen;
552 	uintptr_t mpbase, fpbase;
553 	boolean_t ret = B_FALSE;
554 
555 	mptr = (void *)((uintptr_t)umem_genasm_mptr + 5);
556 	fptr = (void *)((uintptr_t)umem_genasm_fptr + 5);
557 	if (umem_genasm_mptr == 0 || umem_genasm_msize == 0 ||
558 	    umem_genasm_fptr == 0 || umem_genasm_fsize == 0) {
559 		return (B_FALSE);
560 	}
561 
562 	mplen = P2ROUNDUP(umem_genasm_msize, pagesize);
563 	mpbase = P2ALIGN((uintptr_t)umem_genasm_mptr, pagesize);
564 	fplen = P2ROUNDUP(umem_genasm_fsize, pagesize);
565 	fpbase = P2ALIGN((uintptr_t)umem_genasm_mptr, pagesize);
566 
567 	/*
568 	 * If the values straddle a page boundary, then we might need to
569 	 * actually remap two pages.
570 	 */
571 	if (P2ALIGN(umem_genasm_msize + (uintptr_t)umem_genasm_mptr,
572 	    pagesize) != mpbase) {
573 		mplen += pagesize;
574 	}
575 
576 	if (P2ALIGN(umem_genasm_fsize + (uintptr_t)umem_genasm_fptr,
577 	    pagesize) != fpbase) {
578 		fplen += pagesize;
579 	}
580 
581 	if (mprotect((void *)mpbase, mplen, PROT_READ | PROT_WRITE |
582 	    PROT_EXEC) != 0) {
583 		return (B_FALSE);
584 	}
585 
586 	if (mprotect((void *)fpbase, fplen, PROT_READ | PROT_WRITE |
587 	    PROT_EXEC) != 0) {
588 		if (mprotect((void *)mpbase, mplen, PROT_READ | PROT_EXEC) !=
589 		    0) {
590 			umem_panic("genasm failed to restore memory "
591 			    "protection: %d", errno);
592 		}
593 		return (B_FALSE);
594 	}
595 
596 	/*
597 	 * The total number of caches that we can service is the minimum of:
598 	 *  o the amount supported by libc
599 	 *  o the total number of umem caches
600 	 *  o we use a single byte addl, so it's 255 / sizeof (uintptr_t). For
601 	 *    32-bit, this is 63.
602 	 */
603 	nents = _tmem_get_nentries();
604 
605 	if (UMEM_GENASM_MAX32 < nents)
606 		nents = UMEM_GENASM_MAX32;
607 
608 	if (ncaches < nents)
609 		nents = ncaches;
610 
611 	/*
612 	 * If the number of per-thread caches has been set to zero or the
613 	 * per-thread cache size has been set to zero, don't bother trying to
614 	 * write any assembly and just use the default malloc and free. When we
615 	 * return, indicate that there is no PTC support.
616 	 */
617 	if (nents == 0 || umem_ptc_size == 0) {
618 		goto out;
619 	}
620 
621 	/* Take into account the jump */
622 	if (genasm_malloc(mptr, umem_genasm_msize, nents,
623 	    alloc_sizes) != 0) {
624 		goto out;
625 	}
626 
627 	if (genasm_free(fptr, umem_genasm_fsize, nents,
628 	    alloc_sizes) != 0) {
629 		goto out;
630 	}
631 
632 	/* nop out the jump with a multibyte jump */
633 	vptr = (void *)umem_genasm_mptr;
634 	v = MULTINOP;
635 	v |= *vptr & (0xffffffULL << 40);
636 	(void) atomic_swap_64(vptr, v);
637 	vptr = (void *)umem_genasm_fptr;
638 	v = MULTINOP;
639 	v |= *vptr & (0xffffffULL << 40);
640 	(void) atomic_swap_64(vptr, v);
641 
642 	for (i = 0; i < nents; i++)
643 		caches[i]->cache_flags |= UMF_PTC;
644 
645 	ret = B_TRUE;
646 out:
647 	if (mprotect((void *)mpbase, mplen, PROT_READ | PROT_EXEC) != 0) {
648 		umem_panic("genasm failed to restore memory protection: %d",
649 		    errno);
650 	}
651 
652 	if (mprotect((void *)fpbase, fplen, PROT_READ | PROT_EXEC) != 0) {
653 		umem_panic("genasm failed to restore memory protection: %d",
654 		    errno);
655 	}
656 
657 	return (ret);
658 }
659