xref: /titanic_52/usr/src/uts/sun4v/os/mach_startup.c (revision 02b4e56ca3a4e4a4fe9e52fca9c2972101f0e57f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/machsystm.h>
27 #include <sys/archsystm.h>
28 #include <sys/prom_plat.h>
29 #include <sys/promif.h>
30 #include <sys/vm.h>
31 #include <sys/cpu.h>
32 #include <sys/bitset.h>
33 #include <sys/cpupart.h>
34 #include <sys/disp.h>
35 #include <sys/hypervisor_api.h>
36 #include <sys/traptrace.h>
37 #include <sys/modctl.h>
38 #include <sys/ldoms.h>
39 #include <sys/cpu_module.h>
40 #include <sys/mutex_impl.h>
41 #include <sys/rwlock.h>
42 #include <sys/sdt.h>
43 #include <sys/cmt.h>
44 #include <vm/vm_dep.h>
45 
46 #ifdef TRAPTRACE
47 int mach_htraptrace_enable = 1;
48 #else
49 int mach_htraptrace_enable = 0;
50 #endif
51 int htrap_tr0_inuse = 0;
52 extern char htrap_tr0[];	/* prealloc buf for boot cpu */
53 
54 caddr_t	mmu_fault_status_area;
55 
56 extern void sfmmu_set_tsbs(void);
57 /*
58  * CPU IDLE optimization variables/routines
59  */
60 static int enable_halt_idle_cpus = 1;
61 
62 /*
63  * Defines for the idle_state_transition DTrace probe
64  *
65  * The probe fires when the CPU undergoes an idle state change (e.g. hv yield)
66  * The agument passed is the state to which the CPU is transitioning.
67  *
68  * The states are defined here.
69  */
70 #define	IDLE_STATE_NORMAL 0
71 #define	IDLE_STATE_YIELDED 1
72 
73 #define	SUN4V_CLOCK_TICK_THRESHOLD	64
74 #define	SUN4V_CLOCK_TICK_NCPUS		64
75 
76 extern int	clock_tick_threshold;
77 extern int	clock_tick_ncpus;
78 
79 uint_t cp_haltset_fanout = 3;
80 
81 void
82 setup_trap_table(void)
83 {
84 	caddr_t mmfsa_va;
85 	extern	 caddr_t mmu_fault_status_area;
86 	mmfsa_va =
87 	    mmu_fault_status_area + (MMFSA_SIZE * CPU->cpu_id);
88 
89 	intr_init(CPU);		/* init interrupt request free list */
90 	setwstate(WSTATE_KERN);
91 	set_mmfsa_scratchpad(mmfsa_va);
92 	prom_set_mmfsa_traptable(&trap_table, va_to_pa(mmfsa_va));
93 	sfmmu_set_tsbs();
94 }
95 
/*
 * Hook invoked when the installed physical memory list changes.
 * Nothing needs to be done on sun4v.
 */
void
phys_install_has_changed(void)
{
	/* intentionally empty */
}
101 
102 /*
103  * Halt the present CPU until awoken via an interrupt
104  */
105 static void
106 cpu_halt(void)
107 {
108 	cpu_t *cpup = CPU;
109 	processorid_t cpu_sid = cpup->cpu_seqid;
110 	cpupart_t *cp = cpup->cpu_part;
111 	int hset_update = 1;
112 	volatile int *p = &cpup->cpu_disp->disp_nrunnable;
113 	uint_t s;
114 
115 	/*
116 	 * If this CPU is online then we should notate our halting
117 	 * by adding ourselves to the partition's halted CPU
118 	 * bitset. This allows other CPUs to find/awaken us when
119 	 * work becomes available.
120 	 */
121 	if (CPU->cpu_flags & CPU_OFFLINE)
122 		hset_update = 0;
123 
124 	/*
125 	 * Add ourselves to the partition's halted CPUs bitset
126 	 * and set our HALTED flag, if necessary.
127 	 *
128 	 * When a thread becomes runnable, it is placed on the queue
129 	 * and then the halted cpu bitset is checked to determine who
130 	 * (if anyone) should be awoken. We therefore need to first
131 	 * add ourselves to the halted bitset, and then check if there
132 	 * is any work available.  The order is important to prevent a race
133 	 * that can lead to work languishing on a run queue somewhere while
134 	 * this CPU remains halted.
135 	 *
136 	 * Either the producing CPU will see we're halted and will awaken us,
137 	 * or this CPU will see the work available in disp_anywork()
138 	 */
139 	if (hset_update) {
140 		cpup->cpu_disp_flags |= CPU_DISP_HALTED;
141 		membar_producer();
142 		bitset_atomic_add(&cp->cp_haltset, cpu_sid);
143 	}
144 
145 	/*
146 	 * Check to make sure there's really nothing to do.
147 	 * Work destined for this CPU may become available after
148 	 * this check. We'll be notified through the clearing of our
149 	 * bit in the halted CPU bitset, and a poke.
150 	 */
151 	if (disp_anywork()) {
152 		if (hset_update) {
153 			cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
154 			bitset_atomic_del(&cp->cp_haltset, cpu_sid);
155 		}
156 		return;
157 	}
158 
159 	/*
160 	 * We're on our way to being halted.  Wait until something becomes
161 	 * runnable locally or we are awaken (i.e. removed from the halt set).
162 	 * Note that the call to hv_cpu_yield() can return even if we have
163 	 * nothing to do.
164 	 *
165 	 * Disable interrupts now, so that we'll awaken immediately
166 	 * after halting if someone tries to poke us between now and
167 	 * the time we actually halt.
168 	 *
169 	 * We check for the presence of our bit after disabling interrupts.
170 	 * If it's cleared, we'll return. If the bit is cleared after
171 	 * we check then the poke will pop us out of the halted state.
172 	 * Also, if the offlined CPU has been brought back on-line, then
173 	 * we return as well.
174 	 *
175 	 * The ordering of the poke and the clearing of the bit by cpu_wakeup
176 	 * is important.
177 	 * cpu_wakeup() must clear, then poke.
178 	 * cpu_halt() must disable interrupts, then check for the bit.
179 	 *
180 	 * The check for anything locally runnable is here for performance
181 	 * and isn't needed for correctness. disp_nrunnable ought to be
182 	 * in our cache still, so it's inexpensive to check, and if there
183 	 * is anything runnable we won't have to wait for the poke.
184 	 *
185 	 * Any interrupt will awaken the cpu from halt. Looping here
186 	 * will filter spurious interrupts that wake us up, but don't
187 	 * represent a need for us to head back out to idle().  This
188 	 * will enable the idle loop to be more efficient and sleep in
189 	 * the processor pipeline for a larger percent of the time,
190 	 * which returns useful cycles to the peer hardware strand
191 	 * that shares the pipeline.
192 	 */
193 	s = disable_vec_intr();
194 	while (*p == 0 &&
195 	    ((hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid)) ||
196 	    (!hset_update && (CPU->cpu_flags & CPU_OFFLINE)))) {
197 
198 		DTRACE_PROBE1(idle__state__transition,
199 		    uint_t, IDLE_STATE_YIELDED);
200 		(void) hv_cpu_yield();
201 		DTRACE_PROBE1(idle__state__transition,
202 		    uint_t, IDLE_STATE_NORMAL);
203 
204 		enable_vec_intr(s);
205 		s = disable_vec_intr();
206 	}
207 
208 	/*
209 	 * We're no longer halted
210 	 */
211 	enable_vec_intr(s);
212 	if (hset_update) {
213 		cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
214 		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
215 	}
216 }
217 
218 /*
219  * If "cpu" is halted, then wake it up clearing its halted bit in advance.
220  * Otherwise, see if other CPUs in the cpu partition are halted and need to
221  * be woken up so that they can steal the thread we placed on this CPU.
222  * This function is only used on MP systems.
223  */
224 static void
225 cpu_wakeup(cpu_t *cpu, int bound)
226 {
227 	uint_t		cpu_found;
228 	processorid_t	cpu_sid;
229 	cpupart_t	*cp;
230 
231 	cp = cpu->cpu_part;
232 	cpu_sid = cpu->cpu_seqid;
233 	if (bitset_in_set(&cp->cp_haltset, cpu_sid)) {
234 		/*
235 		 * Clear the halted bit for that CPU since it will be
236 		 * poked in a moment.
237 		 */
238 		bitset_atomic_del(&cp->cp_haltset, cpu_sid);
239 		/*
240 		 * We may find the current CPU present in the halted cpu bitset
241 		 * if we're in the context of an interrupt that occurred
242 		 * before we had a chance to clear our bit in cpu_halt().
243 		 * Poking ourself is obviously unnecessary, since if
244 		 * we're here, we're not halted.
245 		 */
246 		if (cpu != CPU)
247 			poke_cpu(cpu->cpu_id);
248 		return;
249 	} else {
250 		/*
251 		 * This cpu isn't halted, but it's idle or undergoing a
252 		 * context switch. No need to awaken anyone else.
253 		 */
254 		if (cpu->cpu_thread == cpu->cpu_idle_thread ||
255 		    cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL)
256 			return;
257 	}
258 
259 	/*
260 	 * No need to wake up other CPUs if this is for a bound thread.
261 	 */
262 	if (bound)
263 		return;
264 
265 	/*
266 	 * The CPU specified for wakeup isn't currently halted, so check
267 	 * to see if there are any other halted CPUs in the partition,
268 	 * and if there are then awaken one.
269 	 */
270 	do {
271 		cpu_found = bitset_find(&cp->cp_haltset);
272 		if (cpu_found == (uint_t)-1)
273 			return;
274 	} while (bitset_atomic_test_and_del(&cp->cp_haltset, cpu_found) < 0);
275 
276 	if (cpu_found != CPU->cpu_seqid)
277 		poke_cpu(cpu_seq[cpu_found]->cpu_id);
278 }
279 
280 void
281 mach_cpu_halt_idle(void)
282 {
283 	if (enable_halt_idle_cpus) {
284 		idle_cpu = cpu_halt;
285 		disp_enq_thread = cpu_wakeup;
286 	}
287 }
288 
289 int
290 ndata_alloc_mmfsa(struct memlist *ndata)
291 {
292 	size_t	size;
293 
294 	size = MMFSA_SIZE * max_ncpus;
295 	mmu_fault_status_area = ndata_alloc(ndata, size, ecache_alignsize);
296 	if (mmu_fault_status_area == NULL)
297 		return (-1);
298 	return (0);
299 }
300 
/*
 * Memory scrubber hook.  sun4v has no memscrub support for now, so this
 * is a deliberate no-op.
 */
void
mach_memscrub(void)
{
}
306 
/*
 * fpras hook.  sun4v has no fpras support for now, so this is a
 * deliberate no-op.
 *
 * The parameter list is spelled (void) so that the definition is a real
 * prototype, matching every other function in this file; an empty ()
 * list would leave the arguments unchecked at call sites.
 */
void
mach_fpras(void)
{
	/* no fpras support for sun4v for now */
}
312 
/*
 * Hardware copy limit hook.  On sun4v the limits are set by the
 * individual CPU module, so there is nothing to do here.
 */
void
mach_hw_copy_limit(void)
{
}
318 
319 /*
320  * We need to enable soft ring functionality on Niagara platforms since
321  * one strand can't handle interrupts for a 1Gb NIC. So set the tunable
322  * mac_soft_ring_enable by default on this platform.
323  * mac_soft_ring_enable variable is defined in space.c and used by MAC
324  * module. This tunable in concert with mac_soft_ring_count (declared
325  * in mac.h) will configure the number of fanout soft rings for a link.
326  */
327 extern boolean_t mac_soft_ring_enable;
328 void
329 startup_platform(void)
330 {
331 	mac_soft_ring_enable = B_TRUE;
332 	if (clock_tick_threshold == 0)
333 		clock_tick_threshold = SUN4V_CLOCK_TICK_THRESHOLD;
334 	if (clock_tick_ncpus == 0)
335 		clock_tick_ncpus = SUN4V_CLOCK_TICK_NCPUS;
336 	/* set per-platform constants for mutex_backoff */
337 	mutex_backoff_base = 1;
338 	mutex_cap_factor = 4;
339 	if (l2_cache_node_count() > 1) {
340 		/* VF for example */
341 		mutex_backoff_base = 2;
342 		mutex_cap_factor = 64;
343 	}
344 	rw_lock_backoff = default_lock_backoff;
345 	rw_lock_delay = default_lock_delay;
346 }
347 
348 /*
349  * This function sets up hypervisor traptrace buffer
350  * This routine is called by the boot cpu only
351  */
352 void
353 mach_htraptrace_setup(int cpuid)
354 {
355 	TRAP_TRACE_CTL	*ctlp;
356 	int bootcpuid = getprocessorid(); /* invoked on boot cpu only */
357 
358 	if (mach_htraptrace_enable && ((cpuid != bootcpuid) ||
359 	    !htrap_tr0_inuse)) {
360 		ctlp = &trap_trace_ctl[cpuid];
361 		ctlp->d.hvaddr_base = (cpuid == bootcpuid) ? htrap_tr0 :
362 		    contig_mem_alloc_align(HTRAP_TSIZE, HTRAP_TSIZE);
363 		if (ctlp->d.hvaddr_base == NULL) {
364 			ctlp->d.hlimit = 0;
365 			ctlp->d.hpaddr_base = NULL;
366 			cmn_err(CE_WARN, "!cpu%d: failed to allocate HV "
367 			    "traptrace buffer", cpuid);
368 		} else {
369 			ctlp->d.hlimit = HTRAP_TSIZE;
370 			ctlp->d.hpaddr_base = va_to_pa(ctlp->d.hvaddr_base);
371 		}
372 	}
373 }
374 
375 /*
376  * This function enables or disables the hypervisor traptracing
377  */
378 void
379 mach_htraptrace_configure(int cpuid)
380 {
381 	uint64_t ret;
382 	uint64_t prev_buf, prev_bufsize;
383 	uint64_t prev_enable;
384 	uint64_t size;
385 	TRAP_TRACE_CTL	*ctlp;
386 
387 	ctlp = &trap_trace_ctl[cpuid];
388 	if (mach_htraptrace_enable) {
389 		if ((ctlp->d.hvaddr_base != NULL) &&
390 		    ((ctlp->d.hvaddr_base != htrap_tr0) ||
391 		    (!htrap_tr0_inuse))) {
392 			ret = hv_ttrace_buf_info(&prev_buf, &prev_bufsize);
393 			if ((ret == H_EOK) && (prev_bufsize != 0)) {
394 				cmn_err(CE_CONT,
395 				    "!cpu%d: previous HV traptrace buffer of "
396 				    "size 0x%lx at address 0x%lx", cpuid,
397 				    prev_bufsize, prev_buf);
398 			}
399 
400 			ret = hv_ttrace_buf_conf(ctlp->d.hpaddr_base,
401 			    ctlp->d.hlimit /
402 			    (sizeof (struct htrap_trace_record)), &size);
403 			if (ret == H_EOK) {
404 				ret = hv_ttrace_enable(\
405 				    (uint64_t)TRAP_TENABLE_ALL, &prev_enable);
406 				if (ret != H_EOK) {
407 					cmn_err(CE_WARN,
408 					    "!cpu%d: HV traptracing not "
409 					    "enabled, ta: 0x%x returned error: "
410 					    "%ld", cpuid, TTRACE_ENABLE, ret);
411 				} else {
412 					if (ctlp->d.hvaddr_base == htrap_tr0)
413 						htrap_tr0_inuse = 1;
414 				}
415 			} else {
416 				cmn_err(CE_WARN,
417 				    "!cpu%d: HV traptrace buffer not "
418 				    "configured, ta: 0x%x returned error: %ld",
419 				    cpuid, TTRACE_BUF_CONF, ret);
420 			}
421 			/*
422 			 * set hvaddr_base to NULL when traptrace buffer
423 			 * registration fails
424 			 */
425 			if (ret != H_EOK) {
426 				ctlp->d.hvaddr_base = NULL;
427 				ctlp->d.hlimit = 0;
428 				ctlp->d.hpaddr_base = NULL;
429 			}
430 		}
431 	} else {
432 		ret = hv_ttrace_buf_info(&prev_buf, &prev_bufsize);
433 		if ((ret == H_EOK) && (prev_bufsize != 0)) {
434 			ret = hv_ttrace_enable((uint64_t)TRAP_TDISABLE_ALL,
435 			    &prev_enable);
436 			if (ret == H_EOK) {
437 				if (ctlp->d.hvaddr_base == htrap_tr0)
438 					htrap_tr0_inuse = 0;
439 				ctlp->d.hvaddr_base = NULL;
440 				ctlp->d.hlimit = 0;
441 				ctlp->d.hpaddr_base = NULL;
442 			} else
443 				cmn_err(CE_WARN,
444 				    "!cpu%d: HV traptracing is not disabled, "
445 				    "ta: 0x%x returned error: %ld",
446 				    cpuid, TTRACE_ENABLE, ret);
447 		}
448 	}
449 }
450 
451 /*
452  * This function cleans up the hypervisor traptrace buffer
453  */
454 void
455 mach_htraptrace_cleanup(int cpuid)
456 {
457 	if (mach_htraptrace_enable) {
458 		TRAP_TRACE_CTL *ctlp;
459 		caddr_t httrace_buf_va;
460 
461 		ASSERT(cpuid < max_ncpus);
462 		ctlp = &trap_trace_ctl[cpuid];
463 		httrace_buf_va = ctlp->d.hvaddr_base;
464 		if (httrace_buf_va == htrap_tr0) {
465 			bzero(httrace_buf_va, HTRAP_TSIZE);
466 		} else if (httrace_buf_va != NULL) {
467 			contig_mem_free(httrace_buf_va, HTRAP_TSIZE);
468 		}
469 		ctlp->d.hvaddr_base = NULL;
470 		ctlp->d.hlimit = 0;
471 		ctlp->d.hpaddr_base = NULL;
472 	}
473 }
474 
475 /*
476  * Load any required machine class (sun4v) specific drivers.
477  */
478 void
479 load_mach_drivers(void)
480 {
481 	/*
482 	 * We don't want to load these LDOMs-specific
483 	 * modules if domaining is not supported.  Also,
484 	 * we must be able to run on non-LDOMs firmware.
485 	 */
486 	if (!domaining_supported())
487 		return;
488 
489 	/*
490 	 * Load the core domain services module
491 	 */
492 	if (modload("misc", "ds") == -1)
493 		cmn_err(CE_NOTE, "!'ds' module failed to load");
494 
495 	/*
496 	 * Load the rest of the domain services
497 	 */
498 	if (modload("misc", "fault_iso") == -1)
499 		cmn_err(CE_NOTE, "!'fault_iso' module failed to load");
500 
501 	if (modload("misc", "platsvc") == -1)
502 		cmn_err(CE_NOTE, "!'platsvc' module failed to load");
503 
504 	if (domaining_enabled() && modload("misc", "dr_cpu") == -1)
505 		cmn_err(CE_NOTE, "!'dr_cpu' module failed to load");
506 
507 	if (modload("misc", "dr_io") == -1)
508 		cmn_err(CE_NOTE, "!'dr_io' module failed to load");
509 
510 	if (modload("misc", "dr_mem") == -1)
511 		cmn_err(CE_NOTE, "!'dr_mem' module failed to load");
512 
513 	/*
514 	 * Attempt to attach any virtual device servers. These
515 	 * drivers must be loaded at start of day so that they
516 	 * can respond to any updates to the machine description.
517 	 *
518 	 * Since it is quite likely that a domain will not support
519 	 * one or more of these servers, failures are ignored.
520 	 */
521 
522 	/* virtual disk server */
523 	(void) i_ddi_attach_hw_nodes("vds");
524 
525 	/* virtual network switch */
526 	(void) i_ddi_attach_hw_nodes("vsw");
527 
528 	/* virtual console concentrator */
529 	(void) i_ddi_attach_hw_nodes("vcc");
530 }
531 
532 void
533 set_platform_defaults(void)
534 {
535 	/*
536 	 * Allow at most one context domain per 8 CPUs, which is ample for
537 	 * good performance.  Do not make this too large, because it
538 	 * increases the space consumed in the per-process sfmmu structure.
539 	 */
540 	if (max_mmu_ctxdoms == 0)
541 		max_mmu_ctxdoms = (NCPU + 7) / 8;
542 }
543