xref: /illumos-gate/usr/src/lib/libumem/amd64/umem_genasm.c (revision 9d6ca3965c3358c32eb68544fe91ff8ad9c3fcde)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2019 Joyent, Inc.  All rights reserved.
23  */
24 
25 /*
26  * Don't Panic! If you find the blocks of assembly that follow confusing and
27  * you're questioning why they exist, please go read section 8 of the umem.c big
28  * theory statement. Next familiarize yourself with the malloc and free
29  * implementations in libumem's malloc.c.
30  *
31  * What follows is the amd64 implementation of the thread caching automatic
32  * assembly generation. The amd64 calling conventions are documented in the
33  * 64-bit System V ABI. For our purposes what matters is that our first argument
34  * will come in rdi. Our functions have to preserve rbp, rbx, and r12->r15. We
35  * are free to do whatever we want with rax, rcx, rdx, rsi, rdi, and r8->r11.
36  *
37  * For both our implementation of malloc and free we only use the registers we
38  * don't have to preserve.
39  *
40  * Malloc register usage:
41  * 	o. rdi: Original size to malloc. This never changes and is preserved.
42  * 	o. rsi: Adjusted malloc size for malloc_data_tag(s).
43  * 	o. rcx: Pointer to the tmem_t in the ulwp_t.
44  * 	o. rdx: Pointer to the tmem_t array of roots
45  * 	o. r8:  Size of the cache
46  * 	o. r9:  Scratch register
47  *
48  * Free register usage:
49  *	o. rdi: Original buffer to free. This never changes and is preserved.
50  *	o. rax: The actual buffer, adjusted for the hidden malloc_data_t(s).
51  * 	o. rcx: Pointer to the tmem_t in the ulwp_t.
52  * 	o. rdx: Pointer to the tmem_t array of roots
53  * 	o. r8:  Size of the cache
54  * 	o. r9:  Scratch register
55  *
56  * Once we determine what cache we are using, we increment %rdx to the
57  * appropriate offset and set %r8 with the size of the cache. This means that
58  * when we break out to the normal buffer allocation point %rdx contains the
59  * head of the linked list and %r8 is the amount that we have to adjust the
60  * thread's cached amount by.
61  *
62  * Each block of assembly has pseudocode that describes its purpose.
63  */
64 
65 /*
66  * umem_base must be first.
67  */
68 #include "umem_base.h"
69 
70 #include <inttypes.h>
71 #include <strings.h>
72 #include <umem_impl.h>
73 #include <atomic.h>
74 #include <sys/mman.h>
75 #include <errno.h>
76 
77 
78 #include <stdio.h>
79 
80 const int umem_genasm_supported = 1;
81 static uintptr_t umem_genasm_mptr = (uintptr_t)&_malloc;
82 static size_t umem_genasm_msize = 576;
83 static uintptr_t umem_genasm_fptr = (uintptr_t)&_free;
84 static size_t umem_genasm_fsize = 576;
85 static uintptr_t umem_genasm_omptr = (uintptr_t)umem_malloc;
86 static uintptr_t umem_genasm_ofptr = (uintptr_t)umem_malloc_free;
87 
/*
 * UMEM_GENASM_MAX64: largest root index whose byte offset still fits in the
 *	32-bit immediate used by the generated addq.
 * PTC_JMPADDR: value to store in a rel32 field located at offset src so that
 *	the jump lands on dest. A rel32 is relative to the end of the four byte
 *	field, hence the + 4. Arguments are parenthesized so that expressions
 *	may be passed safely.
 * PTC_ROOT_SIZE: size in bytes of one entry in the per-thread roots array.
 * MULTINOP: the five byte nop (0f 1f 44 00 00) as a little-endian integer.
 */
#define	UMEM_GENASM_MAX64	(UINT32_MAX / sizeof (uintptr_t))
#define	PTC_JMPADDR(dest, src)	((dest) - ((src) + 4))
#define	PTC_ROOT_SIZE	(sizeof (uintptr_t))
#define	MULTINOP	0x0000441f0f
92 
/*
 * Prologue of the generated malloc.
 *
 * void *ptcmalloc(size_t orig_size);
 *
 * size_t size = orig_size + 8;
 * if (size > UMEM_SECOND_ALIGN)		! encoded below as 0x10
 * 	size = orig_size + 16;
 *
 * if (size < orig_size)
 * 	goto tomalloc;			! the addition wrapped
 *
 * if (size > cache_max)
 * 	goto tomalloc;
 *
 * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
 * void **roots = t->tm_roots;
 *
 * The PTC_MALINIT_* constants are byte offsets into malinit[] of the 32-bit
 * fields that genasm_malinit() fills in.
 */
#define	PTC_MALINIT_JOUT	0x13	/* rel32 of the overflow jb */
#define	PTC_MALINIT_MCS	0x1a	/* imm32: largest cached size */
#define	PTC_MALINIT_JOV	0x20	/* rel32 of the too-large ja */
#define	PTC_MALINIT_SOFF	0x30	/* imm32: offset added to %fs:0 */
static const uint8_t malinit[] =  {
	/* leaq 0x8(%rdi),%rsi */
	0x48, 0x8d, 0x77, 0x08,
	/* cmpq $0x10,%rsi */
	0x48, 0x83, 0xfe, 0x10,
	/* jbe +0x4 (skip the second leaq) */
	0x76, 0x04,
	/* leaq 0x10(%rdi),%rsi */
	0x48, 0x8d, 0x77, 0x10,
	/* cmpq %rdi,%rsi */
	0x48, 0x39, 0xfe,
	/* jb +errout */
	0x0f, 0x82, 0x00, 0x00, 0x00, 0x00,
	/* cmpq sizeof ($CACHE),%rsi */
	0x48, 0x81, 0xfe, 0x00, 0x00, 0x00, 0x00,
	/* ja +errout */
	0x0f, 0x87, 0x00, 0x00, 0x00, 0x00,
	/* movq %fs:0x0,%rcx */
	0x64, 0x48, 0x8b, 0x0c, 0x25, 0x00, 0x00, 0x00, 0x00,
	/* addq $SOFF,%rcx */
	0x48, 0x81, 0xc1, 0x00, 0x00, 0x00, 0x00,
	/* leaq 0x8(%rcx),%rdx */
	0x48, 0x8d, 0x51, 0x08,
};
129 
/*
 * Prologue of the generated free.
 *
 * void ptcfree(void *buf);
 *
 * if (buf == NULL)
 * 	return;
 *
 * malloc_data_t *tag = buf;
 * tag--;
 * int size = tag->malloc_size;
 * int tagval = UMEM_MALLOC_DECODE(tag->malloc_tag, size);
 * if (tagval == MALLOC_SECOND_MAGIC) {
 * 	tag--;
 * } else if (tagval != MALLOC_MAGIC) {
 * 	goto tofree;
 * }
 *
 * if (size > cache_max)
 * 	goto tofree;
 *
 * tmem_t *t = (uintptr_t)curthread() + umem_thr_offset;
 * void **roots = t->tm_roots;
 *
 * The PTC_FRINI_* constants are byte offsets into freeinit[] of the 32-bit
 * fields that genasm_frinit() fills in.
 */
#define	PTC_FRINI_JDONE	0x05	/* rel32 of the NULL-buffer je */
#define	PTC_FRINI_JFREE	0x25	/* rel32 of the bad-tag jne */
#define	PTC_FRINI_MCS	0x30	/* imm32: largest cached size */
#define	PTC_FRINI_JOV	0x36	/* rel32 of the too-large ja */
#define	PTC_FRINI_SOFF	0x46	/* imm32: offset added to %fs:0 */
static const uint8_t freeinit[] = {
	/* testq %rdi,%rdi */
	0x48, 0x85, 0xff,
	/* je $JDONE (done) */
	0x0f, 0x84, 0x00, 0x00, 0x00, 0x00,
	/* movl -0x8(%rdi),%esi */
	0x8b, 0x77, 0xf8,
	/* movl -0x4(%rdi),%eax */
	0x8b, 0x47, 0xfc,
	/* addl %esi,%eax */
	0x01, 0xf0,
	/* cmpl $MALLOC_2_MAGIC,%eax */
	0x3d, 0x00, 0x70, 0xba, 0x16,
	/* jne +0x6 (checkover) */
	0x75, 0x06,
	/* leaq -0x10(%rdi),%rax */
	0x48, 0x8d, 0x47, 0xf0,
	/* jmp +0xf (freebuf) */
	0xeb, 0x0f,
	/* checkover: cmpl $MALLOC_MAGIC,%eax */
	0x3d, 0x00, 0xc0, 0x10, 0x3a,
	/* jne +JFREE (goto tofree) */
	0x0f, 0x85, 0x00, 0x00, 0x00, 0x00,
	/* leaq -0x8(%rdi),%rax */
	0x48, 0x8d, 0x47, 0xf8,
	/* freebuf: cmpq sizeof ($CACHE),%rsi */
	0x48, 0x81, 0xfe, 0x00, 0x00, 0x00, 0x00,
	/* ja +errout */
	0x0f, 0x87, 0x00, 0x00, 0x00, 0x00,
	/* movq %fs:0x0,%rcx */
	0x64, 0x48, 0x8b, 0x0c, 0x25, 0x00, 0x00, 0x00, 0x00,
	/* addq $SOFF,%rcx */
	0x48, 0x81, 0xc1, 0x00, 0x00, 0x00, 0x00,
	/* leaq 0x8(%rcx),%rdx */
	0x48, 0x8d, 0x51, 0x08,
};
179 
/*
 * Check against the first (smallest) cache. The roots pointer already refers
 * to entry zero, so no adjustment of %rdx is needed here.
 *
 * if (size <= $CACHE_SIZE) {
 *	csize = $CACHE_SIZE;
 * } else ...				! goto next cache
 *
 * The PTC_INICACHE_* constants are byte offsets into inicache[] of the 32-bit
 * fields that genasm_firstcache() fills in.
 */
#define	PTC_INICACHE_CMP	0x03	/* imm32 of the cmpq */
#define	PTC_INICACHE_SIZE	0x0c	/* imm32 of the movq */
#define	PTC_INICACHE_JMP	0x11	/* rel32 of the jmp */
static const uint8_t inicache[] = {
	/* cmpq sizeof ($CACHE),%rsi */
	0x48, 0x81, 0xfe, 0x00, 0x00, 0x00, 0x00,
	/* ja +0xc (next cache) */
	0x77, 0x0c,
	/* movq sizeof ($CACHE),%r8 */
	0x49, 0xc7, 0xc0, 0x00, 0x00, 0x00, 0x00,
	/* jmp $JMP (allocbuf) */
	0xe9, 0x00, 0x00, 0x00, 0x00,
};
196 
/*
 * Check against one of the middle caches.
 *
 * if (size <= $CACHE_SIZE) {
 *	csize = $CACHE_SIZE;
 *	roots += $CACHE_NUM;
 * } else ...				! goto next cache
 *
 * The PTC_GENCACHE_* constants are byte offsets into gencache[] of the 32-bit
 * fields that genasm_gencache() fills in.
 *
 * The ja displacement must equal the number of bytes that follow it in this
 * template (7 + 7 + 5 = 0x13) so that the jump lands on the first byte of
 * whichever template is emitted next. A value of 0x14 lands one byte further,
 * past the REX prefix of the next cmpq; the remaining bytes decode as a
 * 32-bit cmpl, which only appears to work.
 */
#define	PTC_GENCACHE_CMP	0x03	/* imm32 of the cmpq */
#define	PTC_GENCACHE_SIZE	0x0c	/* imm32 of the movq */
#define	PTC_GENCACHE_NUM	0x13	/* imm32 of the addq */
#define	PTC_GENCACHE_JMP	0x18	/* rel32 of the jmp */
static const uint8_t gencache[] = {
	0x48, 0x81, 0xfe,
	0x00, 0x00, 0x00, 0x00,		/* cmpq sizeof ($CACHE), %rsi */
	0x77, 0x13,			/* ja +0x13 (next cache) */
	0x49, 0xc7, 0xc0,
	0x00, 0x00, 0x00, 0x00,		/* movq sizeof ($CACHE), %r8 */
	0x48, 0x81, 0xc2,
	0x00, 0x00, 0x00, 0x00,		/* addq $8*ii, %rdx */
	0xe9, 0x00, 0x00, 0x00, 0x00	/* jmp +$JMP (allocbuf) */
};
217 
/*
 * Check against the last (largest) cache. There is no jump to the allocation
 * label here: the template that follows this one is entered by falling
 * through.
 *
 * else if (size <= $CACHE_SIZE) {
 *	csize = $CACHE_SIZE;
 *	roots += $CACHE_NUM;
 * } else {
 *	goto tofunc; 			! goto tomalloc if ptcmalloc.
 * }					! goto tofree if ptcfree.
 *
 * The PTC_FINCACHE_* constants are byte offsets into fincache[] of the fields
 * that genasm_lastcache() fills in. Note that the jump is a rel8, not a rel32.
 */
#define	PTC_FINCACHE_CMP	0x03	/* imm32 of the cmpq */
#define	PTC_FINCACHE_JMP	0x08	/* rel8 of the ja */
#define	PTC_FINCACHE_SIZE	0x0c	/* imm32 of the movq */
#define	PTC_FINCACHE_NUM	0x13	/* imm32 of the addq */
static const uint8_t fincache[] = {
	/* cmpq sizeof ($CACHE),%rsi */
	0x48, 0x81, 0xfe, 0x00, 0x00, 0x00, 0x00,
	/* ja +JMP (to real malloc or free) */
	0x77, 0x00,
	/* movq sizeof ($CACHE),%r8 */
	0x49, 0xc7, 0xc0, 0x00, 0x00, 0x00, 0x00,
	/* addq $8*ii,%rdx */
	0x48, 0x81, 0xc2, 0x00, 0x00, 0x00, 0x00,
};
240 
/*
 * Epilogue of the generated malloc: pop a buffer off the chosen root, tag it
 * and return it, or hand the request to the real malloc.
 *
 * if (*root == NULL)
 * 	goto tomalloc;
 *
 * uint32_t *ret = *root;
 * *root = *(void **)ret;
 * t->tm_size -= csize;
 *
 * if (size > UMEM_SECOND_ALIGN) {		! encoded below as 0x10
 *	ret[2] = size;
 *	ret[3] = MALLOC_SECOND_MAGIC - size;	! UMEM_MALLOC_ENCODE
 *	ret += 4;				! 16 bytes
 * } else {
 *	ret[0] = size;
 *	ret[1] = MALLOC_MAGIC - size;		! UMEM_MALLOC_ENCODE
 *	ret += 2;				! 8 bytes
 * }
 *
 * return ((void *)ret);
 * tomalloc:
 * 	return (malloc(orig_size));
 *
 * PTC_MALFINI_ALLABEL is the offset of the allocation label (the start of the
 * template), PTC_MALFINI_JMLABEL the offset of the tomalloc jmp instruction,
 * and PTC_MALFINI_JMADDR the offset of that jmp's rel32, which
 * genasm_malfini() fills in.
 */
#define	PTC_MALFINI_ALLABEL	0x00
#define	PTC_MALFINI_JMLABEL	0x40
#define	PTC_MALFINI_JMADDR	0x41
static const uint8_t malfini[] = {
	/* movq (%rdx),%rax */
	0x48, 0x8b, 0x02,
	/* testq %rax,%rax */
	0x48, 0x85, 0xc0,
	/* je +0x38 (errout) */
	0x74, 0x38,
	/* movq (%rax),%r9 */
	0x4c, 0x8b, 0x08,
	/* movq %r9,(%rdx) */
	0x4c, 0x89, 0x0a,
	/* subq %r8,(%rcx) */
	0x4c, 0x29, 0x01,
	/* cmpq $0x10,%rsi */
	0x48, 0x83, 0xfe, 0x10,
	/* jbe +0x15 (single tag) */
	0x76, 0x15,
	/* movl $MALLOC_MAGIC_2,%r9d */
	0x41, 0xb9, 0x00, 0x70, 0xba, 0x16,
	/* movl %esi,0x8(%rax) */
	0x89, 0x70, 0x08,
	/* subl %esi,%r9d */
	0x41, 0x29, 0xf1,
	/* movl %r9d,0xc(%rax) */
	0x44, 0x89, 0x48, 0x0c,
	/* addq $0x10,%rax */
	0x48, 0x83, 0xc0, 0x10,
	/* ret */
	0xc3,
	/* movl $MALLOC_MAGIC,%r9d */
	0x41, 0xb9, 0x00, 0xc0, 0x10, 0x3a,
	/* movl %esi,(%rax) */
	0x89, 0x30,
	/* subl %esi,%r9d */
	0x41, 0x29, 0xf1,
	/* movl %r9d,0x4(%rax) */
	0x44, 0x89, 0x48, 0x04,
	/* addq $0x8,%rax */
	0x48, 0x83, 0xc0, 0x08,
	/* ret */
	0xc3,
	/* errout: jmp $MALLOC */
	0xe9, 0x00, 0x00, 0x00, 0x00
};
288 
/*
 * Epilogue of the generated free: push the buffer onto the chosen root if the
 * thread's cache has room, otherwise hand it to the real free.
 *
 * if (t->tm_size + csize > umem_ptc_size)
 * 	goto tofree;
 *
 * t->tm_size += csize
 * *(void **)tag = *root;
 * *root = tag;
 * return;
 * tofree:
 * 	free(buf);
 * 	return;
 *
 * PTC_FRFINI_RBUFLABEL is the offset of the entry label (the start of the
 * template), PTC_FRFINI_CACHEMAX the offset of the cmpq imm32,
 * PTC_FRFINI_DONELABEL the offset of the ret, PTC_FRFINI_JFLABEL the offset of
 * the tofree jmp instruction and PTC_FRFINI_JFADDR the offset of its rel32.
 */
#define	PTC_FRFINI_RBUFLABEL	0x00
#define	PTC_FRFINI_CACHEMAX	0x09
#define	PTC_FRFINI_DONELABEL	0x1b
#define	PTC_FRFINI_JFLABEL	0x1c
#define	PTC_FRFINI_JFADDR	0x1d
static const uint8_t freefini[] = {
	/* movq (%rcx),%r9 */
	0x4c, 0x8b, 0x09,
	/* addq %r8,%r9 */
	0x4d, 0x01, 0xc1,
	/* cmpq $THR_CACHE_MAX,%r9 */
	0x49, 0x81, 0xf9, 0x00, 0x00, 0x00, 0x00,
	/* ja +0xd (tofree) */
	0x77, 0x0d,
	/* addq %r8,(%rcx) */
	0x4c, 0x01, 0x01,
	/* movq (%rdx),%r9 */
	0x4c, 0x8b, 0x0a,
	/* movq %r9,(%rax) */
	0x4c, 0x89, 0x08,
	/* movq %rax,(%rdx) */
	0x48, 0x89, 0x02,
	/* done: ret */
	0xc3,
	/* tofree: jmp free */
	0xe9, 0x00, 0x00, 0x00, 0x00
};
319 
320 /*
321  * Construct the initial part of malloc. off contains the offset from curthread
322  * to the root of the tmem structure. ep is the address of the label to error
323  * and jump to free. csize is the size of the largest umem_cache in ptcumem.
324  */
325 static int
326 genasm_malinit(uint8_t *bp, uint32_t off, uint32_t ep, uint32_t csize)
327 {
328 	uint32_t addr;
329 
330 	bcopy(malinit, bp, sizeof (malinit));
331 	addr = PTC_JMPADDR(ep, PTC_MALINIT_JOUT);
332 	bcopy(&addr, bp + PTC_MALINIT_JOUT, sizeof (addr));
333 	bcopy(&csize, bp + PTC_MALINIT_MCS, sizeof (csize));
334 	addr = PTC_JMPADDR(ep, PTC_MALINIT_JOV);
335 	bcopy(&addr, bp + PTC_MALINIT_JOV, sizeof (addr));
336 	bcopy(&off, bp + PTC_MALINIT_SOFF, sizeof (off));
337 
338 	return (sizeof (malinit));
339 }
340 
341 static int
342 genasm_frinit(uint8_t *bp, uint32_t off, uint32_t dp, uint32_t ep, uint32_t mcs)
343 {
344 	uint32_t addr;
345 
346 	bcopy(freeinit, bp, sizeof (freeinit));
347 	addr = PTC_JMPADDR(dp, PTC_FRINI_JDONE);
348 	bcopy(&addr, bp + PTC_FRINI_JDONE, sizeof (addr));
349 	addr = PTC_JMPADDR(ep, PTC_FRINI_JFREE);
350 	bcopy(&addr, bp + PTC_FRINI_JFREE, sizeof (addr));
351 	bcopy(&mcs, bp + PTC_FRINI_MCS, sizeof (mcs));
352 	addr = PTC_JMPADDR(ep, PTC_FRINI_JOV);
353 	bcopy(&addr, bp + PTC_FRINI_JOV, sizeof (addr));
354 	bcopy(&off, bp + PTC_FRINI_SOFF, sizeof (off));
355 	return (sizeof (freeinit));
356 }
357 
358 
359 /*
360  * Create the initial cache entry of the specified size. The value of ap tells
361  * us what the address of the label to try and allocate a buffer. This value is
362  * an offset from the current base to that value.
363  */
364 static int
365 genasm_firstcache(uint8_t *bp, uint32_t csize, uint32_t ap)
366 {
367 	uint32_t addr;
368 
369 	bcopy(inicache, bp, sizeof (inicache));
370 	bcopy(&csize, bp + PTC_INICACHE_CMP, sizeof (csize));
371 	bcopy(&csize, bp + PTC_INICACHE_SIZE, sizeof (csize));
372 	addr = PTC_JMPADDR(ap, PTC_INICACHE_JMP);
373 	ASSERT(addr != 0);
374 	bcopy(&addr, bp + PTC_INICACHE_JMP, sizeof (addr));
375 
376 	return (sizeof (inicache));
377 }
378 
379 static int
380 genasm_gencache(uint8_t *bp, int num, uint32_t csize, uint32_t ap)
381 {
382 	uint32_t addr;
383 	uint32_t coff;
384 
385 	ASSERT(UINT32_MAX / PTC_ROOT_SIZE > num);
386 	ASSERT(num != 0);
387 	bcopy(gencache, bp, sizeof (gencache));
388 	bcopy(&csize, bp + PTC_GENCACHE_CMP, sizeof (csize));
389 	bcopy(&csize, bp + PTC_GENCACHE_SIZE, sizeof (csize));
390 	coff = num * PTC_ROOT_SIZE;
391 	bcopy(&coff, bp + PTC_GENCACHE_NUM, sizeof (coff));
392 	addr = PTC_JMPADDR(ap, PTC_GENCACHE_JMP);
393 	bcopy(&addr, bp + PTC_GENCACHE_JMP, sizeof (addr));
394 
395 	return (sizeof (gencache));
396 }
397 
398 static int
399 genasm_lastcache(uint8_t *bp, int num, uint32_t csize, uint32_t ep)
400 {
401 	uint8_t eap;
402 	uint32_t coff;
403 
404 	ASSERT(ep <= 0xff && ep > 7);
405 	ASSERT(UINT32_MAX / PTC_ROOT_SIZE > num);
406 	bcopy(fincache, bp, sizeof (fincache));
407 	bcopy(&csize, bp + PTC_FINCACHE_CMP, sizeof (csize));
408 	bcopy(&csize, bp + PTC_FINCACHE_SIZE, sizeof (csize));
409 	coff = num * PTC_ROOT_SIZE;
410 	bcopy(&coff, bp + PTC_FINCACHE_NUM, sizeof (coff));
411 	eap = ep - PTC_FINCACHE_JMP - 1;
412 	bcopy(&eap, bp + PTC_FINCACHE_JMP, sizeof (eap));
413 
414 	return (sizeof (fincache));
415 }
416 
417 static int
418 genasm_malfini(uint8_t *bp, uintptr_t mptr)
419 {
420 	uint32_t addr;
421 
422 	bcopy(malfini, bp, sizeof (malfini));
423 	addr = PTC_JMPADDR(mptr, ((uintptr_t)bp + PTC_MALFINI_JMADDR));
424 	bcopy(&addr, bp + PTC_MALFINI_JMADDR, sizeof (addr));
425 
426 	return (sizeof (malfini));
427 }
428 
429 static int
430 genasm_frfini(uint8_t *bp, uint32_t maxthr, uintptr_t fptr)
431 {
432 	uint32_t addr;
433 
434 	bcopy(freefini, bp, sizeof (freefini));
435 	bcopy(&maxthr, bp + PTC_FRFINI_CACHEMAX, sizeof (maxthr));
436 	addr = PTC_JMPADDR(fptr, ((uintptr_t)bp + PTC_FRFINI_JFADDR));
437 	bcopy(&addr, bp + PTC_FRFINI_JFADDR, sizeof (addr));
438 
439 	return (sizeof (freefini));
440 }
441 
442 /*
443  * The malloc inline assembly is constructed as follows:
444  *
445  * o Malloc prologue assembly
446  * o Generic first-cache check
447  * o n Generic cache checks (where n = _tmem_get_entries() - 2)
448  * o Generic last-cache check
449  * o Malloc epilogue assembly
450  *
451  * Generally there are at least three caches. When there is only one cache we
452  * only use the generic last-cache. In the case where there are two caches, we
453  * just leave out the middle ones.
454  */
455 static int
456 genasm_malloc(void *base, size_t len, int nents, int *umem_alloc_sizes)
457 {
458 	int ii, off;
459 	uint8_t *bp;
460 	size_t total;
461 	uint32_t allocoff, erroff;
462 
463 	total = sizeof (malinit) + sizeof (malfini) + sizeof (fincache);
464 
465 	if (nents >= 2)
466 		total += sizeof (inicache) + sizeof (gencache) * (nents - 2);
467 
468 	if (total > len)
469 		return (1);
470 
471 	erroff = total - sizeof (malfini) + PTC_MALFINI_JMLABEL;
472 	allocoff = total - sizeof (malfini) + PTC_MALFINI_ALLABEL;
473 
474 	bp = base;
475 
476 	off = genasm_malinit(bp, umem_tmem_off, erroff,
477 	    umem_alloc_sizes[nents-1]);
478 	bp += off;
479 	allocoff -= off;
480 	erroff -= off;
481 
482 	if (nents > 1) {
483 		off = genasm_firstcache(bp, umem_alloc_sizes[0], allocoff);
484 		bp += off;
485 		allocoff -= off;
486 		erroff -= off;
487 	}
488 
489 	for (ii = 1; ii < nents - 1; ii++) {
490 		off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], allocoff);
491 		bp += off;
492 		allocoff -= off;
493 		erroff -= off;
494 	}
495 
496 	bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1],
497 	    erroff);
498 	bp += genasm_malfini(bp, umem_genasm_omptr);
499 	ASSERT(((uintptr_t)bp - total) == (uintptr_t)base);
500 
501 	return (0);
502 }
503 
504 static int
505 genasm_free(void *base, size_t len, int nents, int *umem_alloc_sizes)
506 {
507 	uint8_t *bp;
508 	int ii, off;
509 	size_t total;
510 	uint32_t rbufoff, retoff, erroff;
511 
512 	/* Assume that nents has already been audited for us */
513 	total = sizeof (freeinit) + sizeof (freefini) + sizeof (fincache);
514 	if (nents >= 2)
515 		total += sizeof (inicache) + sizeof (gencache) * (nents - 2);
516 
517 	if (total > len)
518 		return (1);
519 
520 	erroff = total - (sizeof (freefini) - PTC_FRFINI_JFLABEL);
521 	rbufoff = total - (sizeof (freefini) - PTC_FRFINI_RBUFLABEL);
522 	retoff = total - (sizeof (freefini) - PTC_FRFINI_DONELABEL);
523 
524 	bp = base;
525 
526 	off = genasm_frinit(bp, umem_tmem_off, retoff, erroff,
527 	    umem_alloc_sizes[nents - 1]);
528 	bp += off;
529 	erroff -= off;
530 	rbufoff -= off;
531 
532 	if (nents > 1) {
533 		off = genasm_firstcache(bp, umem_alloc_sizes[0], rbufoff);
534 		bp += off;
535 		erroff -= off;
536 		rbufoff -= off;
537 	}
538 
539 	for (ii = 1; ii < nents - 1; ii++) {
540 		off = genasm_gencache(bp, ii, umem_alloc_sizes[ii], rbufoff);
541 		bp += off;
542 		rbufoff -= off;
543 		erroff -= off;
544 	}
545 
546 	bp += genasm_lastcache(bp, nents - 1, umem_alloc_sizes[nents - 1],
547 	    erroff);
548 	bp += genasm_frfini(bp, umem_ptc_size, umem_genasm_ofptr);
549 	ASSERT(((uintptr_t)bp - total) == (uintptr_t)base);
550 
551 	return (0);
552 }
553 
554 boolean_t
555 umem_genasm(int *cp, umem_cache_t **caches, int nc)
556 {
557 	int nents, i;
558 	uint8_t *mptr;
559 	uint8_t *fptr;
560 	uint64_t v, *vptr;
561 	size_t mplen, fplen;
562 	uintptr_t mpbase, fpbase;
563 	boolean_t ret = B_FALSE;
564 
565 	mptr = (void *)((uintptr_t)umem_genasm_mptr + 5);
566 	fptr = (void *)((uintptr_t)umem_genasm_fptr + 5);
567 	if (umem_genasm_mptr == 0 || umem_genasm_msize == 0 ||
568 	    umem_genasm_fptr == 0 || umem_genasm_fsize == 0) {
569 		return (B_FALSE);
570 	}
571 
572 	mplen = P2ROUNDUP(umem_genasm_msize, pagesize);
573 	mpbase = P2ALIGN((uintptr_t)umem_genasm_mptr, pagesize);
574 	fplen = P2ROUNDUP(umem_genasm_fsize, pagesize);
575 	fpbase = P2ALIGN((uintptr_t)umem_genasm_mptr, pagesize);
576 
577 	/*
578 	 * If the values straddle a page boundary, then we might need to
579 	 * actually remap two pages.
580 	 */
581 	if (P2ALIGN(umem_genasm_msize + (uintptr_t)umem_genasm_mptr,
582 	    pagesize) != mpbase) {
583 		mplen += pagesize;
584 	}
585 
586 	if (P2ALIGN(umem_genasm_fsize + (uintptr_t)umem_genasm_fptr,
587 	    pagesize) != fpbase) {
588 		fplen += pagesize;
589 	}
590 
591 	if (mprotect((void *)mpbase, mplen, PROT_READ | PROT_WRITE |
592 	    PROT_EXEC) != 0) {
593 		return (B_FALSE);
594 	}
595 
596 	if (mprotect((void *)fpbase, fplen, PROT_READ | PROT_WRITE |
597 	    PROT_EXEC) != 0) {
598 		if (mprotect((void *)mpbase, mplen, PROT_READ | PROT_EXEC) !=
599 		    0) {
600 			umem_panic("genasm failed to restore memory "
601 			    "protection: %d", errno);
602 		}
603 		return (B_FALSE);
604 	}
605 
606 	/*
607 	 * The total number of caches that we can service is the minimum of:
608 	 *  o the amount supported by libc
609 	 *  o the total number of umem caches
610 	 *  o we use a single byte addl, so it's MAX_UINT32 / sizeof (uintptr_t)
611 	 *    For 64-bit, this is MAX_UINT32 >> 3, a lot.
612 	 */
613 	nents = _tmem_get_nentries();
614 
615 	if (UMEM_GENASM_MAX64 < nents)
616 		nents = UMEM_GENASM_MAX64;
617 
618 	if (nc < nents)
619 		nents = nc;
620 
621 	/*
622 	 * If the number of per-thread caches has been set to zero or the
623 	 * per-thread cache size has been set to zero, don't bother trying to
624 	 * write any assembly and just use the default malloc and free. When we
625 	 * return, indicate that there is no PTC support.
626 	 */
627 	if (nents == 0 || umem_ptc_size == 0) {
628 		goto out;
629 	}
630 
631 	/* Take into account the jump */
632 	if (genasm_malloc(mptr, umem_genasm_msize, nents, cp) != 0) {
633 		goto out;
634 	}
635 
636 	if (genasm_free(fptr, umem_genasm_fsize, nents, cp) != 0) {
637 		goto out;
638 	}
639 
640 	/* nop out the jump with a multibyte jump */
641 	vptr = (void *)umem_genasm_mptr;
642 	v = MULTINOP;
643 	v |= *vptr & (0xffffffULL << 40);
644 	(void) atomic_swap_64(vptr, v);
645 	vptr = (void *)umem_genasm_fptr;
646 	v = MULTINOP;
647 	v |= *vptr & (0xffffffULL << 40);
648 	(void) atomic_swap_64(vptr, v);
649 
650 	for (i = 0; i < nents; i++)
651 		caches[i]->cache_flags |= UMF_PTC;
652 
653 	ret = B_TRUE;
654 out:
655 	if (mprotect((void *)mpbase, mplen, PROT_READ | PROT_EXEC) != 0) {
656 		umem_panic("genasm failed to restore memory protection: %d",
657 		    errno);
658 	}
659 
660 	if (mprotect((void *)fpbase, fplen, PROT_READ | PROT_EXEC) != 0) {
661 		umem_panic("genasm failed to restore memory protection: %d",
662 		    errno);
663 	}
664 
665 	return (ret);
666 }
667