/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/asm_linkage.h>
#include <sys/segments.h>
#include <sys/time_impl.h>
#include <sys/tsc.h>
#include <cp_offsets.h>

#define	GETCPU_GDT_OFFSET	SEL_GDT(GDT_CPUID, SEL_UPL)

	.file	"cp_subr.s"

/*
 * These are cloned from TSC and time related code in the kernel.  They should
 * be kept in sync in the case that the source values are changed.
 * See: uts/i86pc/os/timestamp.c
 */
#define	NSEC_SHIFT	5
#define	ADJ_SHIFT	4
#define	NANOSEC		0x3b9aca00

/*
 * For __cp_tsc_read calls which incur looping retries due to CPU migration,
 * this represents the maximum number of tries before bailing out.
 */
#define	TSC_READ_MAXLOOP	0x4
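
/*
 * As an illustration of the logic implemented by __cp_tsc_read below, a rough
 * C sketch follows.  It is illustrative only: rdtscp(), serialized_rdtsc(),
 * and getcpu() are hypothetical stand-ins for the rdtscp, lfence/cpuid+rdtsc,
 * and GDT-based CPU ID sequences, and the field names are meant to mirror the
 * CP_* offsets used in the assembly.
 *
 *	hrtime_t
 *	cp_tsc_read_sketch(comm_page_t *cp)
 *	{
 *		uint32_t cpu, recheck;
 *		hrtime_t tsc;
 *		uint_t i = 0;
 *
 *		if (cp->cp_tsc_type == TSC_TSCP) {
 *			tsc = rdtscp(&cpu);
 *			if (cp->cp_tsc_ncpu != 0)
 *				tsc += cp->cp_tsc_sync_tick_delta[cpu];
 *			return (tsc);
 *		}
 *		if (cp->cp_tsc_ncpu == 0)
 *			return (serialized_rdtsc(cp->cp_tsc_type));
 *		do {
 *			cpu = getcpu();
 *			tsc = serialized_rdtsc(cp->cp_tsc_type);
 *			recheck = getcpu();
 *			if (cpu == recheck)
 *				return (tsc + cp->cp_tsc_sync_tick_delta[cpu]);
 *		} while (i++ < TSC_READ_MAXLOOP);
 *		return (0);
 *	}
 *
 * A return value of 0 signals failure, allowing callers such as
 * __cp_gethrtime to fall back to the fasttrap path.
 */
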
/*
 * hrtime_t
 * __cp_tsc_read(comm_page_t *cp)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_tsc_read)
	movl	CP_TSC_TYPE(%rdi), %esi
	movl	CP_TSC_NCPU(%rdi), %r8d

	cmpl	$TSC_TSCP, %esi
	jne	2f
	rdtscp
	/*
	 * When the TSC is read, the low 32 bits are placed in %eax while the
	 * high 32 bits are placed in %edx.  They are shifted and ORed together
	 * to obtain the full 64-bit value.
	 */
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * A zeroed cp_tsc_ncpu (currently held in r8d) indicates that no
	 * per-CPU TSC offsets are required.
	 */
	testl	%r8d, %r8d
	jnz	1f
	ret

1:
	/*
	 * A non-zero cp_tsc_ncpu indicates the array length of
	 * cp_tsc_sync_tick_delta containing per-CPU offsets which are applied
	 * to TSC readings.  The CPU ID furnished by the IA32_TSC_AUX register
	 * via rdtscp (placed in rcx) is used to look up an offset value in
	 * that array and apply it to the TSC value.
	 */
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
	movq	(%r9, %rcx, 8), %rdx
	addq	%rdx, %rax
	ret

2:
	/*
	 * TSC reading without RDTSCP
	 *
	 * Check if handling for per-CPU TSC offsets is required.  If not,
	 * immediately skip to the appropriate steps to perform a rdtsc.
	 *
	 * If per-CPU offsets are present, the TSC reading process is more
	 * complicated.  Without rdtscp, there is no way to simultaneously read
	 * the TSC and query the current CPU.  In order to "catch" migrations
	 * during execution, the CPU ID is queried before and after rdtsc.  The
	 * execution is repeated if results differ, subject to a loop limit.
	 */
	xorq	%r9, %r9
	testl	%r8d, %r8d
	jz	3f

	/*
	 * Load the address of the per-CPU offset array, since it is needed.
	 * The attempted loop count is kept in r8.
	 */
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
	xorl	%r8d, %r8d

	/* Query the CPU ID and stash it in r10 for later comparison */
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%edx, %edx
	movl	%edx, %r10d

3:
	/*
	 * TSC_RDTSC_MFENCE was used in the past for AMD chips, but has been
	 * supplanted by TSC_RDTSC_LFENCE, which works on Intel and AMD (when
	 * lfence can be confirmed as serializing).
	 */

4:
	cmpl	$TSC_RDTSC_LFENCE, %esi
	jne	5f
	lfence
	rdtsc
	jmp	7f

5:
	cmpl	$TSC_RDTSC_CPUID, %esi
	jne	6f
	/*
	 * Since the amd64 ABI dictates that %rbx is callee-saved, it must be
	 * preserved here.  Its contents will be overwritten when cpuid is used
	 * as a serializing instruction.
	 */
	movq	%rbx, %r11
	xorl	%eax, %eax
	cpuid
	rdtsc
	movq	%r11, %rbx
	jmp	7f

6:
	/*
	 * Other protections should have prevented this function from being
	 * called in the first place.  Since callers must handle a failure from
	 * CPU migration looping, yield the same result as a bail-out: 0
	 */
	xorl	%eax, %eax
	ret

7:
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * With the TSC reading in hand, check if any per-CPU offset handling
	 * is required.  The address of the array of deltas (r9) will not have
	 * been populated if offset handling is unnecessary.
	 */
	testq	%r9, %r9
	jnz	8f
	ret

8:
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%edx, %edx
	cmpl	%edx, %r10d
	jne	9f
	movq	(%r9, %rdx, 8), %rdx
	addq	%rdx, %rax
	ret

9:
	/*
	 * It appears that a migration has occurred between the first CPU ID
	 * query and now.  Check if the loop limit has been broken and retry if
	 * that's not the case.
	 */
	cmpl	$TSC_READ_MAXLOOP, %r8d
	jge	10f
	incl	%r8d
	movl	%edx, %r10d
	jmp	3b

10:
	/* Loop limit was reached.  Return bail-out value of 0. */
	xorl	%eax, %eax
	ret

	SET_SIZE(__cp_tsc_read)


/*
 * uint_t
 * __cp_getcpu(comm_page_t *)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_getcpu)
	movl	CP_TSC_TYPE(%rdi), %edi
	/*
	 * If RDTSCP is available, it is a quick way to grab the cpu_id which
	 * is stored in the TSC_AUX MSR by the kernel.
	 */
	cmpl	$TSC_TSCP, %edi
	jne	1f
	rdtscp
	movl	%ecx, %eax
	ret
1:
	mov	$GETCPU_GDT_OFFSET, %eax
	lsl	%eax, %eax
	ret
	SET_SIZE(__cp_getcpu)
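
/*
 * The flow of __cp_gethrtime below can be summarized by the following C
 * sketch.  It is illustrative only: the field names are meant to mirror the
 * CP_* offsets used in the assembly, gethrtime_fasttrap() is a hypothetical
 * stand-in for the T_GETHRTIME fasttrap, and the multiply/shift is shown in
 * 64-bit C even though the assembly performs it at 128-bit width.
 *
 *	hrtime_t
 *	cp_gethrtime_sketch(comm_page_t *cp)
 *	{
 *		hrtime_t tsc, tsc_last, base, delta;
 *		uint32_t lock;
 *
 *		do {
 *			lock = cp->cp_hres_lock;
 *			tsc_last = cp->cp_tsc_last;
 *			base = cp->cp_tsc_hrtime_base;
 *			tsc = __cp_tsc_read(cp);
 *			if (tsc == 0)
 *				return (gethrtime_fasttrap());
 *		} while ((lock & ~1) != cp->cp_hres_lock);
 *
 *		delta = tsc - tsc_last;
 *		if (delta < 0) {
 *			delta += (cp->cp_tsc_max_delta << 1);
 *			if (delta >= 0)
 *				delta = 0;
 *			else
 *				delta = MIN(tsc, cp->cp_tsc_resume_cap);
 *		}
 *		return (base + ((delta * cp->cp_nsec_scale) >>
 *		    (32 - NSEC_SHIFT)));
 *	}
 */
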
/*
 * hrtime_t
 * __cp_gethrtime(comm_page_t *cp)
 *
 * Stack usage: 0x20 local + 0x8 call = 0x28 bytes
 *
 * %rsp+0x00 - hrtime_t tsc_last
 * %rsp+0x08 - hrtime_t hrtime_base
 * %rsp+0x10 - commpage_t *cp
 * %rsp+0x18 - int hres_lock
 */
	ENTRY_NP(__cp_gethrtime)
	subq	$0x20, %rsp
	movq	%rdi, 0x10(%rsp)
1:
	movl	CP_HRES_LOCK(%rdi), %r9d
	movl	%r9d, 0x18(%rsp)

	movq	CP_TSC_LAST(%rdi), %rax
	movq	CP_TSC_HRTIME_BASE(%rdi), %rdx
	movq	%rax, (%rsp)
	movq	%rdx, 0x8(%rsp)

	call	__cp_tsc_read

	/*
	 * Failure is inferred from a TSC reading of 0.  The normal fasttrap
	 * mechanism can be used as a fallback in such cases.
	 */
	testq	%rax, %rax
	jz	6f

	movq	0x10(%rsp), %rdi
	movl	0x18(%rsp), %r9d
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %r9d
	cmpl	%r9d, %edx
	jne	1b

	/*
	 * The in-kernel logic for calculating hrtime performs several checks
	 * to protect against edge cases.  That logic is summarized as:
	 * if (tsc >= tsc_last) {
	 *         delta -= tsc_last;
	 * } else if (tsc >= tsc_last - 2*tsc_max_delta) {
	 *         delta = 0;
	 * } else {
	 *         delta = MIN(tsc, tsc_resume_cap);
	 * }
	 *
	 * The below implementation achieves the same result, although it is
	 * structured for speed and optimized for the fast path:
	 *
	 * delta = tsc - tsc_last;
	 * if (delta < 0) {
	 *         delta += (tsc_max_delta << 1);
	 *         if (delta >= 0) {
	 *                 delta = 0;
	 *         } else {
	 *                 delta = MIN(tsc, tsc_resume_cap);
	 *         }
	 * }
	 */
	movq	(%rsp), %rdx
	subq	%rdx, %rax		/* delta = tsc - tsc_last */
	jbe	3f			/* if (delta < 0) */

2:
	/*
	 * Optimized TSC_CONVERT_AND_ADD:
	 * hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
	 *
	 * Since the multiply and shift are done in 128-bit, there is no need
	 * to worry about overflow.
	 */
	movl	CP_NSEC_SCALE(%rdi), %ecx
	mulq	%rcx
	shrdq	$_CONST(32 - NSEC_SHIFT), %rdx, %rax
	movq	0x8(%rsp), %r8
	addq	%r8, %rax

	addq	$0x20, %rsp
	ret

3:
	movq	%rax, %r9		/* save (tsc - tsc_last) in r9 */
	movl	CP_TSC_MAX_DELTA(%rdi), %ecx
	sall	$1, %ecx
	addq	%rcx, %rax		/* delta += (tsc_max_delta << 1) */
	jae	4f			/* delta < 0 */
	xorq	%rax, %rax
	jmp	2b

4:
	/*
	 * Repopulate %rax with the TSC reading by adding tsc_last to %r9
	 * (which holds tsc - tsc_last)
	 */
	movq	(%rsp), %rax
	addq	%r9, %rax

	/* delta = MIN(tsc, resume_cap) */
	movq	CP_TSC_RESUME_CAP(%rdi), %rcx
	cmpq	%rcx, %rax
	jbe	5f
	movq	%rcx, %rax
5:
	jmp	2b

6:
	movl	$T_GETHRTIME, %eax
	int	$T_FASTTRAP
	addq	$0x20, %rsp
	ret

	SET_SIZE(__cp_gethrtime)

/*
 * int
 * __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
 *
 * %rsp+0x00 - timespec_t *tsp
 */
	ENTRY_NP(__cp_clock_gettime_monotonic)
	subq	$0x8, %rsp
	movq	%rsi, (%rsp)

	call	__cp_gethrtime

	/*
	 * Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
	 * This uses the same approach as hrt2ts, although it has been updated
	 * to utilize 64-bit math.
	 * 1 / 1,000,000,000 =
	 *    1000100101110000010111110100000100110110101101001010110110011B-26
	 *    = 0x112e0be826d694b3 * 2^-26
	 *
	 * secs = (nsecs * 0x112e0be826d694b3) >> 26
	 *
	 * In order to account for the two's complement of negative inputs, a
	 * final operation completes the process:
	 *
	 * secs -= (nsecs >> 63)
	 */
	movq	%rax, %r11
	movq	$0x112e0be826d694b3, %rdx
	imulq	%rdx
	sarq	$0x1a, %rdx
	movq	%r11, %rax
	sarq	$0x3f, %rax
	subq	%rax, %rdx
	movq	(%rsp), %rsi
	movq	%rdx, (%rsi)

	/*
	 * Populating tv_nsec is easier:
	 * tv_nsec = nsecs - (secs * NANOSEC)
	 */
	imulq	$NANOSEC, %rdx, %rdx
	subq	%rdx, %r11
	movq	%r11, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x8, %rsp
	ret
	SET_SIZE(__cp_clock_gettime_monotonic)

/*
 * int
 * __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
 *
 * %rsp+0x00 - commpage_t *cp
 * %rsp+0x08 - timespec_t *tsp
 * %rsp+0x10 - int hres_lock
 */
	ENTRY_NP(__cp_clock_gettime_realtime)
	subq	$0x18, %rsp
	movq	%rdi, (%rsp)
	movq	%rsi, 0x8(%rsp)

1:
	movl	CP_HRES_LOCK(%rdi), %eax
	movl	%eax, 0x10(%rsp)

	call	__cp_gethrtime
	movq	(%rsp), %rdi
	movq	CP_HRES_LAST_TICK(%rdi), %rdx
	subq	%rdx, %rax			/* nslt = hrtime - last_tick */
	jb	1b
	movq	CP_HRESTIME(%rdi), %r9
	movq	_CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
	movl	CP_HRESTIME_ADJ(%rdi), %r11d

	addq	%rax, %r10			/* now.tv_nsec += nslt */

	cmpl	$0, %r11d
	jb	4f				/* hres_adj > 0 */
	ja	6f				/* hres_adj < 0 */

2:
	cmpq	$NANOSEC, %r10
	jae	8f				/* tv_nsec >= NANOSEC */

3:
	movl	0x10(%rsp), %eax
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %edx
	cmpl	%eax, %edx
	jne	1b

	movq	0x8(%rsp), %rsi
	movq	%r9, (%rsi)
	movq	%r10, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x18, %rsp
	ret

4:						/* hres_adj > 0 */
	sarq	$ADJ_SHIFT, %rax
	cmpl	%r11d, %eax
	jbe	5f
	movl	%r11d, %eax
5:
	addq	%rax, %r10
	jmp	2b

6:						/* hres_adj < 0 */
	sarq	$ADJ_SHIFT, %rax
	negl	%r11d
	cmpl	%r11d, %eax
	jbe	7f
	movl	%r11d, %eax
7:
	subq	%rax, %r10
	jmp	2b

8:						/* tv_nsec >= NANOSEC */
	subq	$NANOSEC, %r10
	incq	%r9
	cmpq	$NANOSEC, %r10
	jae	8b
	jmp	3b

	SET_SIZE(__cp_clock_gettime_realtime)
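
/*
 * For reference, the hrtime-to-timespec conversion performed above in
 * __cp_clock_gettime_monotonic corresponds roughly to this C sketch.  It is
 * illustrative only; the __int128 arithmetic mirrors the signed 64x64->128
 * imulq followed by the arithmetic shift of the high half.
 *
 *	void
 *	hrt2ts_sketch(hrtime_t nsecs, timespec_t *tsp)
 *	{
 *		int64_t secs;
 *
 *		// 0x112e0be826d694b3 is (approximately) 2^90 / NANOSEC, so
 *		// taking the high 64 bits of the 128-bit product and shifting
 *		// right by 26 (a total shift of 90) divides by NANOSEC.
 *		secs = (int64_t)
 *		    (((__int128)nsecs * 0x112e0be826d694b3LL) >> 90);
 *		// Adjust for two's-complement behavior on negative inputs.
 *		secs -= (nsecs >> 63);
 *
 *		tsp->tv_sec = secs;
 *		tsp->tv_nsec = nsecs - (secs * NANOSEC);
 *	}
 */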