/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/asm_linkage.h>
#include <sys/segments.h>
#include <sys/time_impl.h>
#include <sys/tsc.h>
#include <cp_offsets.h>

#define	GETCPU_GDT_OFFSET	SEL_GDT(GDT_CPUID, SEL_UPL)

	.file	"cp_subr.s"

/*
 * These are cloned from TSC and time related code in the kernel. They should
 * be kept in sync in the case that the source values are changed.
 * See: uts/i86pc/os/timestamp.c
 */
#define	NSEC_SHIFT	5
#define	ADJ_SHIFT	4
#define	NANOSEC		0x3b9aca00

/*
 * For __cp_tsc_read calls which incur looping retries due to CPU migration,
 * this represents the maximum number of tries before bailing out.
 */
#define	TSC_READ_MAXLOOP	0x4

/*
 * hrtime_t
 * __cp_tsc_read(comm_page_t *cp)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_tsc_read)
	movl	CP_TSC_TYPE(%rdi), %esi
	movl	CP_TSC_NCPU(%rdi), %r8d

	cmpl	$TSC_TSCP, %esi
	jne	2f
	rdtscp
	/*
	 * When the TSC is read, the low 32 bits are placed in %eax while the
	 * high 32 bits are placed in %edx. They are shifted and ORed together
	 * to obtain the full 64-bit value.
	 */
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * A zeroed cp_tsc_ncpu (currently held in r8d) indicates that no
	 * per-CPU TSC offsets are required.
	 */
	testl	%r8d, %r8d
	jnz	1f
	ret

1:
	/*
	 * A non-zero cp_tsc_ncpu indicates the array length of
	 * cp_tsc_sync_tick_delta containing per-CPU offsets which are applied
	 * to TSC readings. The CPU ID furnished by the IA32_TSC_AUX register
	 * via rdtscp (placed in rcx) is used to look up an offset value in
	 * that array and apply it to the TSC value.
	 */
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
	movq	(%r9, %rcx, 8), %rdx
	addq	%rdx, %rax
	ret

2:
	/*
	 * TSC reading without RDTSCP
	 *
	 * Check if handling for per-CPU TSC offsets is required. If not,
	 * immediately skip to the appropriate steps to perform a rdtsc.
	 *
	 * If per-CPU offsets are present, the TSC reading process is more
	 * complicated. Without rdtscp, there is no way to simultaneously read
	 * the TSC and query the current CPU. In order to "catch" migrations
	 * during execution, the CPU ID is queried before and after rdtsc. The
	 * execution is repeated if results differ, subject to a loop limit.
	 */
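	/*
	 * The per-CPU-offset path below can be sketched in C-like pseudocode
	 * (added for illustration only; getcpu() and fenced_rdtsc() are
	 * stand-ins for the lsl and lfence/cpuid+rdtsc sequences used here,
	 * not real functions):
	 *
	 *	cpu = getcpu();
	 *	for (tries = 0; ; ) {
	 *		tsc = fenced_rdtsc();
	 *		if ((now = getcpu()) == cpu)
	 *			return (tsc + cp_tsc_sync_tick_delta[cpu]);
	 *		if (tries++ >= TSC_READ_MAXLOOP)
	 *			return (0);
	 *		cpu = now;
	 *	}
	 *
	 * When no per-CPU offsets are present, the fenced rdtsc value is
	 * returned directly without any CPU ID checks.
	 */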
	xorq	%r9, %r9
	testl	%r8d, %r8d
	jz	3f

	/*
	 * Load the address of the per-CPU offset array, since it is needed.
	 * The attempted loop count is kept in r8.
	 */
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
	xorl	%r8d, %r8d

	/* Query the CPU ID and stash it in r10 for later comparison */
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%edx, %edx
	movl	%edx, %r10d

3:
	/*
	 * TSC_RDTSC_MFENCE was used in the past for AMD chips, but has been
	 * supplanted by TSC_RDTSC_LFENCE, which works on Intel and AMD (when
	 * lfence can be confirmed as serializing).
	 */

4:
	cmpl	$TSC_RDTSC_LFENCE, %esi
	jne	5f
	lfence
	rdtsc
	jmp	7f

5:
	cmpl	$TSC_RDTSC_CPUID, %esi
	jne	6f
	/*
	 * Since the amd64 ABI dictates that %rbx is callee-saved, it must be
	 * preserved here. Its contents will be overwritten when cpuid is used
	 * as a serializing instruction.
	 */
	movq	%rbx, %r11
	xorl	%eax, %eax
	cpuid
	rdtsc
	movq	%r11, %rbx
	jmp	7f

6:
	/*
	 * Other protections should have prevented this function from being
	 * called in the first place. Since callers must handle a failure from
	 * CPU migration looping, yield the same result as a bail-out: 0
	 */
	xorl	%eax, %eax
	ret

7:
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * With the TSC reading in-hand, check if any per-CPU offset handling
	 * is required. The address to the array of deltas (r9) will not have
	 * been populated if offset handling is unnecessary.
	 */
	testq	%r9, %r9
	jnz	8f
	ret

8:
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%edx, %edx
	cmpl	%edx, %r10d
	jne	9f
	movq	(%r9, %rdx, 8), %rdx
	addq	%rdx, %rax
	ret

9:
	/*
	 * It appears that a migration has occurred between the first CPU ID
	 * query and now. Check if the loop limit has been broken and retry if
	 * that's not the case.
	 */
	cmpl	$TSC_READ_MAXLOOP, %r8d
	jge	10f
	incl	%r8d
	movl	%edx, %r10d
	jmp	3b

10:
	/* Loop limit was reached. Return bail-out value of 0. */
	xorl	%eax, %eax
	ret

	SET_SIZE(__cp_tsc_read)


/*
 * uint_t
 * __cp_getcpu(comm_page_t *)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_getcpu)
	movl	CP_TSC_TYPE(%rdi), %edi
	/*
	 * If RDTSCP is available, it is a quick way to grab the cpu_id which
	 * is stored in the TSC_AUX MSR by the kernel.
	 */
	cmpl	$TSC_TSCP, %edi
	jne	1f
	rdtscp
	movl	%ecx, %eax
	ret
1:
	mov	$GETCPU_GDT_OFFSET, %eax
	lsl	%eax, %eax
	ret
	SET_SIZE(__cp_getcpu)

/*
 * hrtime_t
 * __cp_gethrtime(comm_page_t *cp)
 *
 * Stack usage: 0x20 local + 0x8 call = 0x28 bytes
 *
 * %rsp+0x00 - hrtime_t tsc_last
 * %rsp+0x08 - hrtime_t hrtime_base
 * %rsp+0x10 - commpage_t *cp
 * %rsp+0x18 - int hres_lock
 */
	ENTRY_NP(__cp_gethrtime)
	subq	$0x20, %rsp
	movq	%rdi, 0x10(%rsp)
1:
	movl	CP_HRES_LOCK(%rdi), %r9d
	movl	%r9d, 0x18(%rsp)

	movq	CP_TSC_LAST(%rdi), %rax
	movq	CP_TSC_HRTIME_BASE(%rdi), %rdx
	movq	%rax, (%rsp)
	movq	%rdx, 0x8(%rsp)

	call	__cp_tsc_read

	/*
	 * Failure is inferred from a TSC reading of 0. The normal fasttrap
	 * mechanism can be used as a fallback in such cases.
	 */
	testq	%rax, %rax
	jz	6f

	movq	0x10(%rsp), %rdi
	movl	0x18(%rsp), %r9d
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %r9d
	cmpl	%r9d, %edx
	jne	1b

	/*
	 * The in-kernel logic for calculating hrtime performs several checks
	 * to protect against edge cases. That logic is summarized as:
	 *
	 *	if (tsc >= tsc_last) {
	 *		delta = tsc - tsc_last;
	 *	} else if (tsc >= tsc_last - 2*tsc_max_delta) {
	 *		delta = 0;
	 *	} else {
	 *		delta = MIN(tsc, tsc_resume_cap);
	 *	}
	 *
	 * The below implementation achieves the same result, although it is
	 * structured for speed and optimized for the fast path:
	 *
	 *	delta = tsc - tsc_last;
	 *	if (delta < 0) {
	 *		delta += (tsc_max_delta << 1);
	 *		if (delta >= 0) {
	 *			delta = 0;
	 *		} else {
	 *			delta = MIN(tsc, tsc_resume_cap);
	 *		}
	 *	}
	 */
	movq	(%rsp), %rdx
	subq	%rdx, %rax		/* delta = tsc - tsc_last */
	jbe	3f			/* if (delta < 0) */

2:
	/*
	 * Optimized TSC_CONVERT_AND_ADD:
	 *	hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
	 *
	 * Since the multiply and shift are done with 128-bit precision, there
	 * is no need to worry about overflow.
	 */
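	/*
	 * Worked example of the formula above (added for illustration): on a
	 * hypothetical 1GHz TSC, one tick equals one nanosecond, so the
	 * kernel-supplied nsec_scale works out to (1 << 27) and a delta of
	 * 1000 ticks contributes (1000 * (1 << 27)) >> 27 == 1000ns to
	 * hrtime_base.
	 */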
	movl	CP_NSEC_SCALE(%rdi), %ecx
	mulq	%rcx
	shrdq	$_CONST(32 - NSEC_SHIFT), %rdx, %rax
	movq	0x8(%rsp), %r8
	addq	%r8, %rax

	addq	$0x20, %rsp
	ret

3:
	movq	%rax, %r9		/* save (tsc - tsc_last) in r9 */
	movl	CP_TSC_MAX_DELTA(%rdi), %ecx
	sall	$1, %ecx
	addq	%rcx, %rax		/* delta += (tsc_max_delta << 1) */
	jae	4f			/* delta < 0 */
	xorq	%rax, %rax
	jmp	2b

4:
	/*
	 * Repopulate %rax with the TSC reading by adding tsc_last to %r9
	 * (which holds tsc - tsc_last)
	 */
	movq	(%rsp), %rax
	addq	%r9, %rax

	/* delta = MIN(tsc, resume_cap) */
	movq	CP_TSC_RESUME_CAP(%rdi), %rcx
	cmpq	%rcx, %rax
	jbe	5f
	movq	%rcx, %rax
5:
	jmp	2b

6:
	movl	$T_GETHRTIME, %eax
	int	$T_FASTTRAP
	addq	$0x20, %rsp
	ret

	SET_SIZE(__cp_gethrtime)

/*
 * int
 * __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
 *
 * %rsp+0x00 - timespec_t *tsp
 */
	ENTRY_NP(__cp_clock_gettime_monotonic)
	subq	$0x8, %rsp
	movq	%rsi, (%rsp)

	call	__cp_gethrtime

	/*
	 * Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
	 * This uses the same approach as hrt2ts, although it has been updated
	 * to utilize 64-bit math.
	 * 1 / 1,000,000,000 =
	 *	1000100101110000010111110100000100110110101101001010110110011B-26
	 *	= 0x112e0be826d694b3 * 2^-26
	 *
	 * secs = (nsecs * 0x112e0be826d694b3) >> 26
	 *
	 * Note that the multiply produces a full 128-bit product and the
	 * shift by 26 is applied to its upper 64 bits, so the effective
	 * scaling is 0x112e0be826d694b3 * 2^-(64+26), which approximates
	 * 1/NANOSEC.
	 *
	 * In order to account for the two's complement representation of
	 * negative inputs, a final operation completes the process:
	 *
	 * secs -= (nsecs >> 63)
	 */
	movq	%rax, %r11
	movq	$0x112e0be826d694b3, %rdx
	imulq	%rdx
	sarq	$0x1a, %rdx
	movq	%r11, %rax
	sarq	$0x3f, %rax
	subq	%rax, %rdx
	movq	(%rsp), %rsi
	movq	%rdx, (%rsi)
	/*
	 * Populating tv_nsec is easier:
	 *	tv_nsec = nsecs - (secs * NANOSEC)
	 */
	imulq	$NANOSEC, %rdx, %rdx
	subq	%rdx, %r11
	movq	%r11, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x8, %rsp
	ret
	SET_SIZE(__cp_clock_gettime_monotonic)

/*
 * int
 * __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
 *
 * %rsp+0x00 - commpage_t *cp
 * %rsp+0x08 - timespec_t *tsp
 * %rsp+0x10 - int hres_lock
 */
	ENTRY_NP(__cp_clock_gettime_realtime)
	subq	$0x18, %rsp
	movq	%rdi, (%rsp)
	movq	%rsi, 0x8(%rsp)

1:
	movl	CP_HRES_LOCK(%rdi), %eax
	movl	%eax, 0x10(%rsp)

	call	__cp_gethrtime
	movq	(%rsp), %rdi
	movq	CP_HRES_LAST_TICK(%rdi), %rdx
	subq	%rdx, %rax			/* nslt = hrtime - last_tick */
	jb	1b
	movq	CP_HRESTIME(%rdi), %r9
	movq	_CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
	movl	CP_HRESTIME_ADJ(%rdi), %r11d

	addq	%rax, %r10			/* now.tv_nsec += nslt */

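	/*
	 * The remainder applies the kernel-maintained hrestime_adj and then
	 * normalizes tv_nsec, roughly equivalent to this C-like pseudocode
	 * (added for illustration; not lifted from the kernel source):
	 *
	 *	if (hres_adj > 0)
	 *		now.tv_nsec += MIN(nslt >> ADJ_SHIFT, hres_adj);
	 *	else if (hres_adj < 0)
	 *		now.tv_nsec -= MIN(nslt >> ADJ_SHIFT, -hres_adj);
	 *	while (now.tv_nsec >= NANOSEC) {
	 *		now.tv_sec++;
	 *		now.tv_nsec -= NANOSEC;
	 *	}
	 */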
	cmpl	$0, %r11d
	jb	4f				/* hres_adj > 0 */
	ja	6f				/* hres_adj < 0 */

2:
	cmpq	$NANOSEC, %r10
	jae	8f				/* tv_nsec >= NANOSEC */

3:
	movl	0x10(%rsp), %eax
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %edx
	cmpl	%eax, %edx
	jne	1b

	movq	0x8(%rsp), %rsi
	movq	%r9, (%rsi)
	movq	%r10, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x18, %rsp
	ret


4:						/* hres_adj > 0 */
	sarq	$ADJ_SHIFT, %rax
	cmpl	%r11d, %eax
	jbe	5f
	movl	%r11d, %eax
5:
	addq	%rax, %r10
	jmp	2b

6:						/* hres_adj < 0 */
	sarq	$ADJ_SHIFT, %rax
	negl	%r11d
	cmpl	%r11d, %eax
	jbe	7f
	movl	%r11d, %eax
7:
	subq	%rax, %r10
	jmp	2b

8:						/* tv_nsec >= NANOSEC */
	subq	$NANOSEC, %r10
	incq	%r9
	cmpq	$NANOSEC, %r10
	jae	8b
	jmp	3b

	SET_SIZE(__cp_clock_gettime_realtime)