xref: /linux/arch/powerpc/platforms/pseries/vas.c (revision bf36793fa260cb68cc817f311f1f683788261796)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright 2020-21 IBM Corp.
4  */
5 
6 #define pr_fmt(fmt) "vas: " fmt
7 
8 #include <linux/module.h>
9 #include <linux/kernel.h>
10 #include <linux/export.h>
11 #include <linux/types.h>
12 #include <linux/delay.h>
13 #include <linux/slab.h>
14 #include <linux/interrupt.h>
15 #include <linux/irqdomain.h>
16 #include <asm/machdep.h>
17 #include <asm/hvcall.h>
18 #include <asm/plpar_wrappers.h>
19 #include <asm/firmware.h>
20 #include <asm/vphn.h>
21 #include <asm/vas.h>
22 #include "vas.h"
23 
/*
 * Window (paste) address value that h_allocate_vas_window() treats as
 * "COPY/PASTE is not supported".
 */
#define VAS_INVALID_WIN_ADDRESS	0xFFFFFFFFFFFFFFFFul
/*
 * Domain value (-1) passed to the allocate window hcall when the
 * hypervisor should select the VAS instance itself.
 */
#define VAS_DEFAULT_DOMAIN_ID	0xFFFFFFFFFFFFFFFFul
/* The hypervisor allows one credit per window right now */
#define DEF_WIN_CREDS		1

/*
 * NOTE(review): caps_all is not referenced in this part of the file;
 * presumably it holds the overall capabilities returned for query
 * type 0 - confirm against the init code.
 */
static struct vas_all_caps caps_all;
/*
 * Set to true by get_vas_capabilities() once a feature's capabilities
 * were read and registered. API registration, DLPAR and migration
 * handling all bail out early while this is false.
 */
static bool copypaste_feat;
/*
 * Result buffer handed (by physical address) to the
 * H_QUERY_VAS_CAPABILITIES hcall in the DLPAR and migration paths.
 */
static struct hv_vas_cop_feat_caps hv_cop_caps;

/* Per feature type state: capabilities, window list and window counters */
static struct vas_caps vascaps[VAS_MAX_FEAT_TYPE];
/*
 * Serializes window open / close (hcall plus IRQ setup), the window
 * lists and counters in vascaps[], and the credit reconfiguration.
 */
static DEFINE_MUTEX(vas_pseries_mutex);
/*
 * Set by vas_migration_handler() for VAS_SUSPEND and cleared for any
 * other action. vas_allocate_window() refuses to hand out new windows
 * while it is set.
 */
static bool migration_in_progress;
36 
37 static long hcall_return_busy_check(long rc)
38 {
39 	/* Check if we are stalled for some time */
40 	if (H_IS_LONG_BUSY(rc)) {
41 		unsigned int ms;
42 		/*
43 		 * Allocate, Modify and Deallocate HCALLs returns
44 		 * H_LONG_BUSY_ORDER_1_MSEC or H_LONG_BUSY_ORDER_10_MSEC
45 		 * for the long delay. So the sleep time should always
46 		 * be either 1 or 10msecs, but in case if the HCALL
47 		 * returns the long delay > 10 msecs, clamp the sleep
48 		 * time to 10msecs.
49 		 */
50 		ms = clamp(get_longbusy_msecs(rc), 1, 10);
51 
52 		/*
53 		 * msleep() will often sleep at least 20 msecs even
54 		 * though the hypervisor suggests that the OS reissue
55 		 * HCALLs after 1 or 10msecs. Also the delay hint from
56 		 * the HCALL is just a suggestion. So OK to pause for
57 		 * less time than the hinted delay. Use usleep_range()
58 		 * to ensure we don't sleep much longer than actually
59 		 * needed.
60 		 */
61 		usleep_range(ms * (USEC_PER_MSEC / 10), ms * USEC_PER_MSEC);
62 		rc = H_BUSY;
63 	} else if (rc == H_BUSY) {
64 		cond_resched();
65 	}
66 
67 	return rc;
68 }
69 
70 /*
71  * Allocate VAS window hcall
72  */
73 static int h_allocate_vas_window(struct pseries_vas_window *win, u64 *domain,
74 				     u8 wintype, u16 credits)
75 {
76 	long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
77 	long rc;
78 
79 	do {
80 		rc = plpar_hcall9(H_ALLOCATE_VAS_WINDOW, retbuf, wintype,
81 				  credits, domain[0], domain[1], domain[2],
82 				  domain[3], domain[4], domain[5]);
83 
84 		rc = hcall_return_busy_check(rc);
85 	} while (rc == H_BUSY);
86 
87 	if (rc == H_SUCCESS) {
88 		if (win->win_addr == VAS_INVALID_WIN_ADDRESS) {
89 			pr_err("H_ALLOCATE_VAS_WINDOW: COPY/PASTE is not supported\n");
90 			return -ENOTSUPP;
91 		}
92 		win->vas_win.winid = retbuf[0];
93 		win->win_addr = retbuf[1];
94 		win->complete_irq = retbuf[2];
95 		win->fault_irq = retbuf[3];
96 		return 0;
97 	}
98 
99 	pr_err("H_ALLOCATE_VAS_WINDOW error: %ld, wintype: %u, credits: %u\n",
100 		rc, wintype, credits);
101 
102 	return -EIO;
103 }
104 
105 /*
106  * Deallocate VAS window hcall.
107  */
108 static int h_deallocate_vas_window(u64 winid)
109 {
110 	long rc;
111 
112 	do {
113 		rc = plpar_hcall_norets(H_DEALLOCATE_VAS_WINDOW, winid);
114 
115 		rc = hcall_return_busy_check(rc);
116 	} while (rc == H_BUSY);
117 
118 	if (rc == H_SUCCESS)
119 		return 0;
120 
121 	pr_err("H_DEALLOCATE_VAS_WINDOW error: %ld, winid: %llu\n",
122 		rc, winid);
123 	return -EIO;
124 }
125 
126 /*
127  * Modify VAS window.
128  * After the window is opened with allocate window hcall, configure it
129  * with flags and LPAR PID before using.
130  */
131 static int h_modify_vas_window(struct pseries_vas_window *win)
132 {
133 	long rc;
134 
135 	/*
136 	 * AMR value is not supported in Linux VAS implementation.
137 	 * The hypervisor ignores it if 0 is passed.
138 	 */
139 	do {
140 		rc = plpar_hcall_norets(H_MODIFY_VAS_WINDOW,
141 					win->vas_win.winid, win->pid, 0,
142 					VAS_MOD_WIN_FLAGS, 0);
143 
144 		rc = hcall_return_busy_check(rc);
145 	} while (rc == H_BUSY);
146 
147 	if (rc == H_SUCCESS)
148 		return 0;
149 
150 	pr_err("H_MODIFY_VAS_WINDOW error: %ld, winid %u pid %u\n",
151 			rc, win->vas_win.winid, win->pid);
152 	return -EIO;
153 }
154 
155 /*
156  * This hcall is used to determine the capabilities from the hypervisor.
157  * @hcall: H_QUERY_VAS_CAPABILITIES or H_QUERY_NX_CAPABILITIES
158  * @query_type: If 0 is passed, the hypervisor returns the overall
159  *		capabilities which provides all feature(s) that are
160  *		available. Then query the hypervisor to get the
161  *		corresponding capabilities for the specific feature.
162  *		Example: H_QUERY_VAS_CAPABILITIES provides VAS GZIP QoS
163  *			and VAS GZIP Default capabilities.
164  *			H_QUERY_NX_CAPABILITIES provides NX GZIP
165  *			capabilities.
166  * @result: Return buffer to save capabilities.
167  */
168 int h_query_vas_capabilities(const u64 hcall, u8 query_type, u64 result)
169 {
170 	long rc;
171 
172 	rc = plpar_hcall_norets(hcall, query_type, result);
173 
174 	if (rc == H_SUCCESS)
175 		return 0;
176 
177 	/* H_FUNCTION means HV does not support VAS so don't print an error */
178 	if (rc != H_FUNCTION) {
179 		pr_err("%s error %ld, query_type %u, result buffer 0x%llx\n",
180 			(hcall == H_QUERY_VAS_CAPABILITIES) ?
181 				"H_QUERY_VAS_CAPABILITIES" :
182 				"H_QUERY_NX_CAPABILITIES",
183 			rc, query_type, result);
184 	}
185 
186 	return -EIO;
187 }
188 EXPORT_SYMBOL_GPL(h_query_vas_capabilities);
189 
190 /*
191  * hcall to get fault CRB from the hypervisor.
192  */
193 static int h_get_nx_fault(u32 winid, u64 buffer)
194 {
195 	long rc;
196 
197 	rc = plpar_hcall_norets(H_GET_NX_FAULT, winid, buffer);
198 
199 	if (rc == H_SUCCESS)
200 		return 0;
201 
202 	pr_err("H_GET_NX_FAULT error: %ld, winid %u, buffer 0x%llx\n",
203 		rc, winid, buffer);
204 	return -EIO;
205 
206 }
207 
208 /*
209  * Handle the fault interrupt.
210  * When the fault interrupt is received for each window, query the
211  * hypervisor to get the fault CRB on the specific fault. Then
212  * process the CRB by updating CSB or send signal if the user space
213  * CSB is invalid.
214  * Note: The hypervisor forwards an interrupt for each fault request.
215  *	So one fault CRB to process for each H_GET_NX_FAULT hcall.
216  */
217 static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data)
218 {
219 	struct pseries_vas_window *txwin = data;
220 	struct coprocessor_request_block crb;
221 	struct vas_user_win_ref *tsk_ref;
222 	int rc;
223 
224 	while (atomic_read(&txwin->pending_faults)) {
225 		rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb));
226 		if (!rc) {
227 			tsk_ref = &txwin->vas_win.task_ref;
228 			vas_dump_crb(&crb);
229 			vas_update_csb(&crb, tsk_ref);
230 		}
231 		atomic_dec(&txwin->pending_faults);
232 	}
233 
234 	return IRQ_HANDLED;
235 }
236 
237 /*
238  * irq_default_primary_handler() can be used only with IRQF_ONESHOT
239  * which disables IRQ before executing the thread handler and enables
240  * it after. But this disabling interrupt sets the VAS IRQ OFF
241  * state in the hypervisor. If the NX generates fault interrupt
242  * during this window, the hypervisor will not deliver this
243  * interrupt to the LPAR. So use VAS specific IRQ handler instead
244  * of calling the default primary handler.
245  */
246 static irqreturn_t pseries_vas_irq_handler(int irq, void *data)
247 {
248 	struct pseries_vas_window *txwin = data;
249 
250 	/*
251 	 * The thread handler will process this interrupt if it is
252 	 * already running.
253 	 */
254 	atomic_inc(&txwin->pending_faults);
255 
256 	return IRQ_WAKE_THREAD;
257 }
258 
259 /*
260  * Allocate window and setup IRQ mapping.
261  */
262 static int allocate_setup_window(struct pseries_vas_window *txwin,
263 				 u64 *domain, u8 wintype)
264 {
265 	int rc;
266 
267 	rc = h_allocate_vas_window(txwin, domain, wintype, DEF_WIN_CREDS);
268 	if (rc)
269 		return rc;
270 	/*
271 	 * On PowerVM, the hypervisor setup and forwards the fault
272 	 * interrupt per window. So the IRQ setup and fault handling
273 	 * will be done for each open window separately.
274 	 */
275 	txwin->fault_virq = irq_create_mapping(NULL, txwin->fault_irq);
276 	if (!txwin->fault_virq) {
277 		pr_err("Failed irq mapping %d\n", txwin->fault_irq);
278 		rc = -EINVAL;
279 		goto out_win;
280 	}
281 
282 	txwin->name = kasprintf(GFP_KERNEL, "vas-win-%d",
283 				txwin->vas_win.winid);
284 	if (!txwin->name) {
285 		rc = -ENOMEM;
286 		goto out_irq;
287 	}
288 
289 	rc = request_threaded_irq(txwin->fault_virq,
290 				  pseries_vas_irq_handler,
291 				  pseries_vas_fault_thread_fn, 0,
292 				  txwin->name, txwin);
293 	if (rc) {
294 		pr_err("VAS-Window[%d]: Request IRQ(%u) failed with %d\n",
295 		       txwin->vas_win.winid, txwin->fault_virq, rc);
296 		goto out_free;
297 	}
298 
299 	txwin->vas_win.wcreds_max = DEF_WIN_CREDS;
300 
301 	return 0;
302 out_free:
303 	kfree(txwin->name);
304 out_irq:
305 	irq_dispose_mapping(txwin->fault_virq);
306 out_win:
307 	h_deallocate_vas_window(txwin->vas_win.winid);
308 	return rc;
309 }
310 
311 static inline void free_irq_setup(struct pseries_vas_window *txwin)
312 {
313 	free_irq(txwin->fault_virq, txwin);
314 	kfree(txwin->name);
315 	irq_dispose_mapping(txwin->fault_virq);
316 }
317 
318 static struct vas_window *vas_allocate_window(int vas_id, u64 flags,
319 					      enum vas_cop_type cop_type)
320 {
321 	long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID};
322 	struct vas_cop_feat_caps *cop_feat_caps;
323 	struct vas_caps *caps;
324 	struct pseries_vas_window *txwin;
325 	int rc;
326 
327 	txwin = kzalloc(sizeof(*txwin), GFP_KERNEL);
328 	if (!txwin)
329 		return ERR_PTR(-ENOMEM);
330 
331 	/*
332 	 * A VAS window can have many credits which means that many
333 	 * requests can be issued simultaneously. But the hypervisor
334 	 * restricts one credit per window.
335 	 * The hypervisor introduces 2 different types of credits:
336 	 * Default credit type (Uses normal priority FIFO):
337 	 *	A limited number of credits are assigned to partitions
338 	 *	based on processor entitlement. But these credits may be
339 	 *	over-committed on a system depends on whether the CPUs
340 	 *	are in shared or dedicated modes - that is, more requests
341 	 *	may be issued across the system than NX can service at
342 	 *	once which can result in paste command failure (RMA_busy).
343 	 *	Then the process has to resend requests or fall-back to
344 	 *	SW compression.
345 	 * Quality of Service (QoS) credit type (Uses high priority FIFO):
346 	 *	To avoid NX HW contention, the system admins can assign
347 	 *	QoS credits for each LPAR so that this partition is
348 	 *	guaranteed access to NX resources. These credits are
349 	 *	assigned to partitions via the HMC.
350 	 *	Refer PAPR for more information.
351 	 *
352 	 * Allocate window with QoS credits if user requested. Otherwise
353 	 * default credits are used.
354 	 */
355 	if (flags & VAS_TX_WIN_FLAG_QOS_CREDIT)
356 		caps = &vascaps[VAS_GZIP_QOS_FEAT_TYPE];
357 	else
358 		caps = &vascaps[VAS_GZIP_DEF_FEAT_TYPE];
359 
360 	cop_feat_caps = &caps->caps;
361 
362 	if (atomic_inc_return(&cop_feat_caps->nr_used_credits) >
363 			atomic_read(&cop_feat_caps->nr_total_credits)) {
364 		pr_err_ratelimited("Credits are not available to allocate window\n");
365 		rc = -EINVAL;
366 		goto out;
367 	}
368 
369 	if (vas_id == -1) {
370 		/*
371 		 * The user space is requesting to allocate a window on
372 		 * a VAS instance where the process is executing.
373 		 * On PowerVM, domain values are passed to the hypervisor
374 		 * to select VAS instance. Useful if the process is
375 		 * affinity to NUMA node.
376 		 * The hypervisor selects VAS instance if
377 		 * VAS_DEFAULT_DOMAIN_ID (-1) is passed for domain values.
378 		 * The h_allocate_vas_window hcall is defined to take a
379 		 * domain values as specified by h_home_node_associativity,
380 		 * So no unpacking needs to be done.
381 		 */
382 		rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, domain,
383 				  VPHN_FLAG_VCPU, hard_smp_processor_id());
384 		if (rc != H_SUCCESS) {
385 			pr_err("H_HOME_NODE_ASSOCIATIVITY error: %d\n", rc);
386 			goto out;
387 		}
388 	}
389 
390 	txwin->pid = mfspr(SPRN_PID);
391 
392 	/*
393 	 * Allocate / Deallocate window hcalls and setup / free IRQs
394 	 * have to be protected with mutex.
395 	 * Open VAS window: Allocate window hcall and setup IRQ
396 	 * Close VAS window: Deallocate window hcall and free IRQ
397 	 *	The hypervisor waits until all NX requests are
398 	 *	completed before closing the window. So expects OS
399 	 *	to handle NX faults, means IRQ can be freed only
400 	 *	after the deallocate window hcall is returned.
401 	 * So once the window is closed with deallocate hcall before
402 	 * the IRQ is freed, it can be assigned to new allocate
403 	 * hcall with the same fault IRQ by the hypervisor. It can
404 	 * result in setup IRQ fail for the new window since the
405 	 * same fault IRQ is not freed by the OS before.
406 	 */
407 	mutex_lock(&vas_pseries_mutex);
408 	if (migration_in_progress) {
409 		rc = -EBUSY;
410 	} else {
411 		rc = allocate_setup_window(txwin, (u64 *)&domain[0],
412 				   cop_feat_caps->win_type);
413 		if (!rc)
414 			caps->nr_open_wins_progress++;
415 	}
416 
417 	mutex_unlock(&vas_pseries_mutex);
418 	if (rc)
419 		goto out;
420 
421 	/*
422 	 * Modify window and it is ready to use.
423 	 */
424 	rc = h_modify_vas_window(txwin);
425 	if (!rc)
426 		rc = get_vas_user_win_ref(&txwin->vas_win.task_ref);
427 	if (rc)
428 		goto out_free;
429 
430 	txwin->win_type = cop_feat_caps->win_type;
431 
432 	/*
433 	 * The migration SUSPEND thread sets migration_in_progress and
434 	 * closes all open windows from the list. But the window is
435 	 * added to the list after open and modify HCALLs. So possible
436 	 * that migration_in_progress is set before modify HCALL which
437 	 * may cause some windows are still open when the hypervisor
438 	 * initiates the migration.
439 	 * So checks the migration_in_progress flag again and close all
440 	 * open windows.
441 	 *
442 	 * Possible to lose the acquired credit with DLPAR core
443 	 * removal after the window is opened. So if there are any
444 	 * closed windows (means with lost credits), do not give new
445 	 * window to user space. New windows will be opened only
446 	 * after the existing windows are reopened when credits are
447 	 * available.
448 	 */
449 	mutex_lock(&vas_pseries_mutex);
450 	if (!caps->nr_close_wins && !migration_in_progress) {
451 		list_add(&txwin->win_list, &caps->list);
452 		caps->nr_open_windows++;
453 		caps->nr_open_wins_progress--;
454 		mutex_unlock(&vas_pseries_mutex);
455 		vas_user_win_add_mm_context(&txwin->vas_win.task_ref);
456 		return &txwin->vas_win;
457 	}
458 	mutex_unlock(&vas_pseries_mutex);
459 
460 	put_vas_user_win_ref(&txwin->vas_win.task_ref);
461 	rc = -EBUSY;
462 	pr_err_ratelimited("No credit is available to allocate window\n");
463 
464 out_free:
465 	/*
466 	 * Window is not operational. Free IRQ before closing
467 	 * window so that do not have to hold mutex.
468 	 */
469 	free_irq_setup(txwin);
470 	h_deallocate_vas_window(txwin->vas_win.winid);
471 	/*
472 	 * Hold mutex and reduce nr_open_wins_progress counter.
473 	 */
474 	mutex_lock(&vas_pseries_mutex);
475 	caps->nr_open_wins_progress--;
476 	mutex_unlock(&vas_pseries_mutex);
477 out:
478 	atomic_dec(&cop_feat_caps->nr_used_credits);
479 	kfree(txwin);
480 	return ERR_PTR(rc);
481 }
482 
483 static u64 vas_paste_address(struct vas_window *vwin)
484 {
485 	struct pseries_vas_window *win;
486 
487 	win = container_of(vwin, struct pseries_vas_window, vas_win);
488 	return win->win_addr;
489 }
490 
491 static int deallocate_free_window(struct pseries_vas_window *win)
492 {
493 	int rc = 0;
494 
495 	/*
496 	 * The hypervisor waits for all requests including faults
497 	 * are processed before closing the window - Means all
498 	 * credits have to be returned. In the case of fault
499 	 * request, a credit is returned after OS issues
500 	 * H_GET_NX_FAULT hcall.
501 	 * So free IRQ after executing H_DEALLOCATE_VAS_WINDOW
502 	 * hcall.
503 	 */
504 	rc = h_deallocate_vas_window(win->vas_win.winid);
505 	if (!rc)
506 		free_irq_setup(win);
507 
508 	return rc;
509 }
510 
511 static int vas_deallocate_window(struct vas_window *vwin)
512 {
513 	struct pseries_vas_window *win;
514 	struct vas_cop_feat_caps *caps;
515 	int rc = 0;
516 
517 	if (!vwin)
518 		return -EINVAL;
519 
520 	win = container_of(vwin, struct pseries_vas_window, vas_win);
521 
522 	/* Should not happen */
523 	if (win->win_type >= VAS_MAX_FEAT_TYPE) {
524 		pr_err("Window (%u): Invalid window type %u\n",
525 				vwin->winid, win->win_type);
526 		return -EINVAL;
527 	}
528 
529 	caps = &vascaps[win->win_type].caps;
530 	mutex_lock(&vas_pseries_mutex);
531 	/*
532 	 * VAS window is already closed in the hypervisor when
533 	 * lost the credit or with migration. So just remove the entry
534 	 * from the list, remove task references and free vas_window
535 	 * struct.
536 	 */
537 	if (!(win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) &&
538 		!(win->vas_win.status & VAS_WIN_MIGRATE_CLOSE)) {
539 		rc = deallocate_free_window(win);
540 		if (rc) {
541 			mutex_unlock(&vas_pseries_mutex);
542 			return rc;
543 		}
544 	} else
545 		vascaps[win->win_type].nr_close_wins--;
546 
547 	list_del(&win->win_list);
548 	atomic_dec(&caps->nr_used_credits);
549 	vascaps[win->win_type].nr_open_windows--;
550 	mutex_unlock(&vas_pseries_mutex);
551 
552 	mm_context_remove_vas_window(vwin->task_ref.mm);
553 	put_vas_user_win_ref(&vwin->task_ref);
554 
555 	kfree(win);
556 	return 0;
557 }
558 
559 static const struct vas_user_win_ops vops_pseries = {
560 	.open_win	= vas_allocate_window,	/* Open and configure window */
561 	.paste_addr	= vas_paste_address,	/* To do copy/paste */
562 	.close_win	= vas_deallocate_window, /* Close window */
563 };
564 
565 /*
566  * Supporting only nx-gzip coprocessor type now, but this API code
567  * extended to other coprocessor types later.
568  */
569 int vas_register_api_pseries(struct module *mod, enum vas_cop_type cop_type,
570 			     const char *name)
571 {
572 	if (!copypaste_feat)
573 		return -ENOTSUPP;
574 
575 	return vas_register_coproc_api(mod, cop_type, name, &vops_pseries);
576 }
577 EXPORT_SYMBOL_GPL(vas_register_api_pseries);
578 
/*
 * Counterpart of vas_register_api_pseries(): forwards to
 * vas_unregister_coproc_api().
 */
void vas_unregister_api_pseries(void)
{
	vas_unregister_coproc_api();
}
EXPORT_SYMBOL_GPL(vas_unregister_api_pseries);
584 
585 /*
586  * Get the specific capabilities based on the feature type.
587  * Right now supports GZIP default and GZIP QoS capabilities.
588  */
589 static int __init get_vas_capabilities(u8 feat, enum vas_cop_feat_type type,
590 				struct hv_vas_cop_feat_caps *hv_caps)
591 {
592 	struct vas_cop_feat_caps *caps;
593 	struct vas_caps *vcaps;
594 	int rc = 0;
595 
596 	vcaps = &vascaps[type];
597 	memset(vcaps, 0, sizeof(*vcaps));
598 	INIT_LIST_HEAD(&vcaps->list);
599 
600 	vcaps->feat = feat;
601 	caps = &vcaps->caps;
602 
603 	rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, feat,
604 					  (u64)virt_to_phys(hv_caps));
605 	if (rc)
606 		return rc;
607 
608 	caps->user_mode = hv_caps->user_mode;
609 	if (!(caps->user_mode & VAS_COPY_PASTE_USER_MODE)) {
610 		pr_err("User space COPY/PASTE is not supported\n");
611 		return -ENOTSUPP;
612 	}
613 
614 	caps->descriptor = be64_to_cpu(hv_caps->descriptor);
615 	caps->win_type = hv_caps->win_type;
616 	if (caps->win_type >= VAS_MAX_FEAT_TYPE) {
617 		pr_err("Unsupported window type %u\n", caps->win_type);
618 		return -EINVAL;
619 	}
620 	caps->max_lpar_creds = be16_to_cpu(hv_caps->max_lpar_creds);
621 	caps->max_win_creds = be16_to_cpu(hv_caps->max_win_creds);
622 	atomic_set(&caps->nr_total_credits,
623 		   be16_to_cpu(hv_caps->target_lpar_creds));
624 	if (feat == VAS_GZIP_DEF_FEAT) {
625 		caps->def_lpar_creds = be16_to_cpu(hv_caps->def_lpar_creds);
626 
627 		if (caps->max_win_creds < DEF_WIN_CREDS) {
628 			pr_err("Window creds(%u) > max allowed window creds(%u)\n",
629 			       DEF_WIN_CREDS, caps->max_win_creds);
630 			return -EINVAL;
631 		}
632 	}
633 
634 	rc = sysfs_add_vas_caps(caps);
635 	if (rc)
636 		return rc;
637 
638 	copypaste_feat = true;
639 
640 	return 0;
641 }
642 
643 /*
644  * VAS windows can be closed due to lost credits when the core is
645  * removed. So reopen them if credits are available due to DLPAR
646  * core add and set the window active status. When NX sees the page
647  * fault on the unmapped paste address, the kernel handles the fault
648  * by setting the remapping to new paste address if the window is
649  * active.
650  */
651 static int reconfig_open_windows(struct vas_caps *vcaps, int creds,
652 				 bool migrate)
653 {
654 	long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID};
655 	struct vas_cop_feat_caps *caps = &vcaps->caps;
656 	struct pseries_vas_window *win = NULL, *tmp;
657 	int rc, mv_ents = 0;
658 	int flag;
659 
660 	/*
661 	 * Nothing to do if there are no closed windows.
662 	 */
663 	if (!vcaps->nr_close_wins)
664 		return 0;
665 
666 	/*
667 	 * For the core removal, the hypervisor reduces the credits
668 	 * assigned to the LPAR and the kernel closes VAS windows
669 	 * in the hypervisor depends on reduced credits. The kernel
670 	 * uses LIFO (the last windows that are opened will be closed
671 	 * first) and expects to open in the same order when credits
672 	 * are available.
673 	 * For example, 40 windows are closed when the LPAR lost 2 cores
674 	 * (dedicated). If 1 core is added, this LPAR can have 20 more
675 	 * credits. It means the kernel can reopen 20 windows. So move
676 	 * 20 entries in the VAS windows lost and reopen next 20 windows.
677 	 * For partition migration, reopen all windows that are closed
678 	 * during resume.
679 	 */
680 	if ((vcaps->nr_close_wins > creds) && !migrate)
681 		mv_ents = vcaps->nr_close_wins - creds;
682 
683 	list_for_each_entry_safe(win, tmp, &vcaps->list, win_list) {
684 		if (!mv_ents)
685 			break;
686 
687 		mv_ents--;
688 	}
689 
690 	/*
691 	 * Open windows if they are closed only with migration or
692 	 * DLPAR (lost credit) before.
693 	 */
694 	if (migrate)
695 		flag = VAS_WIN_MIGRATE_CLOSE;
696 	else
697 		flag = VAS_WIN_NO_CRED_CLOSE;
698 
699 	list_for_each_entry_safe_from(win, tmp, &vcaps->list, win_list) {
700 		/*
701 		 * This window is closed with DLPAR and migration events.
702 		 * So reopen the window with the last event.
703 		 * The user space is not suspended with the current
704 		 * migration notifier. So the user space can issue DLPAR
705 		 * CPU hotplug while migration in progress. In this case
706 		 * this window will be opened with the last event.
707 		 */
708 		if ((win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) &&
709 			(win->vas_win.status & VAS_WIN_MIGRATE_CLOSE)) {
710 			win->vas_win.status &= ~flag;
711 			continue;
712 		}
713 
714 		/*
715 		 * Nothing to do on this window if it is not closed
716 		 * with this flag
717 		 */
718 		if (!(win->vas_win.status & flag))
719 			continue;
720 
721 		rc = allocate_setup_window(win, (u64 *)&domain[0],
722 					   caps->win_type);
723 		if (rc)
724 			return rc;
725 
726 		rc = h_modify_vas_window(win);
727 		if (rc)
728 			goto out;
729 
730 		mutex_lock(&win->vas_win.task_ref.mmap_mutex);
731 		/*
732 		 * Set window status to active
733 		 */
734 		win->vas_win.status &= ~flag;
735 		mutex_unlock(&win->vas_win.task_ref.mmap_mutex);
736 		win->win_type = caps->win_type;
737 		if (!--vcaps->nr_close_wins)
738 			break;
739 	}
740 
741 	return 0;
742 out:
743 	/*
744 	 * Window modify HCALL failed. So close the window to the
745 	 * hypervisor and return.
746 	 */
747 	free_irq_setup(win);
748 	h_deallocate_vas_window(win->vas_win.winid);
749 	return rc;
750 }
751 
752 /*
753  * The hypervisor reduces the available credits if the LPAR lost core. It
754  * means the excessive windows should not be active and the user space
755  * should not be using these windows to send compression requests to NX.
756  * So the kernel closes the excessive windows and unmap the paste address
757  * such that the user space receives paste instruction failure. Then up to
758  * the user space to fall back to SW compression and manage with the
759  * existing windows.
760  */
761 static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds,
762 									bool migrate)
763 {
764 	struct pseries_vas_window *win, *tmp;
765 	struct vas_user_win_ref *task_ref;
766 	struct vm_area_struct *vma;
767 	int rc = 0, flag;
768 
769 	if (migrate)
770 		flag = VAS_WIN_MIGRATE_CLOSE;
771 	else
772 		flag = VAS_WIN_NO_CRED_CLOSE;
773 
774 	list_for_each_entry_safe(win, tmp, &vcap->list, win_list) {
775 		/*
776 		 * This window is already closed due to lost credit
777 		 * or for migration before. Go for next window.
778 		 * For migration, nothing to do since this window
779 		 * closed for DLPAR and will be reopened even on
780 		 * the destination system with other DLPAR operation.
781 		 */
782 		if ((win->vas_win.status & VAS_WIN_MIGRATE_CLOSE) ||
783 			(win->vas_win.status & VAS_WIN_NO_CRED_CLOSE)) {
784 			win->vas_win.status |= flag;
785 			continue;
786 		}
787 
788 		task_ref = &win->vas_win.task_ref;
789 		/*
790 		 * VAS mmap (coproc_mmap()) and its fault handler
791 		 * (vas_mmap_fault()) are called after holding mmap lock.
792 		 * So hold mmap mutex after mmap_lock to avoid deadlock.
793 		 */
794 		mmap_write_lock(task_ref->mm);
795 		mutex_lock(&task_ref->mmap_mutex);
796 		vma = task_ref->vma;
797 		/*
798 		 * Number of available credits are reduced, So select
799 		 * and close windows.
800 		 */
801 		win->vas_win.status |= flag;
802 
803 		/*
804 		 * vma is set in the original mapping. But this mapping
805 		 * is done with mmap() after the window is opened with ioctl.
806 		 * so we may not see the original mapping if the core remove
807 		 * is done before the original mmap() and after the ioctl.
808 		 */
809 		if (vma)
810 			zap_vma_pages(vma);
811 
812 		mutex_unlock(&task_ref->mmap_mutex);
813 		mmap_write_unlock(task_ref->mm);
814 		/*
815 		 * Close VAS window in the hypervisor, but do not
816 		 * free vas_window struct since it may be reused
817 		 * when the credit is available later (DLPAR with
818 		 * adding cores). This struct will be used
819 		 * later when the process issued with close(FD).
820 		 */
821 		rc = deallocate_free_window(win);
822 		/*
823 		 * This failure is from the hypervisor.
824 		 * No way to stop migration for these failures.
825 		 * So ignore error and continue closing other windows.
826 		 */
827 		if (rc && !migrate)
828 			return rc;
829 
830 		vcap->nr_close_wins++;
831 
832 		/*
833 		 * For migration, do not depend on lpar_creds in case if
834 		 * mismatch with the hypervisor value (should not happen).
835 		 * So close all active windows in the list and will be
836 		 * reopened windows based on the new lpar_creds on the
837 		 * destination system during resume.
838 		 */
839 		if (!migrate && !--excess_creds)
840 			break;
841 	}
842 
843 	return 0;
844 }
845 
846 /*
847  * Get new VAS capabilities when the core add/removal configuration
848  * changes. Reconfig window configurations based on the credits
849  * availability from this new capabilities.
850  */
851 int vas_reconfig_capabilties(u8 type, int new_nr_creds)
852 {
853 	struct vas_cop_feat_caps *caps;
854 	int old_nr_creds;
855 	struct vas_caps *vcaps;
856 	int rc = 0, nr_active_wins;
857 
858 	if (type >= VAS_MAX_FEAT_TYPE) {
859 		pr_err("Invalid credit type %d\n", type);
860 		return -EINVAL;
861 	}
862 
863 	vcaps = &vascaps[type];
864 	caps = &vcaps->caps;
865 
866 	mutex_lock(&vas_pseries_mutex);
867 
868 	old_nr_creds = atomic_read(&caps->nr_total_credits);
869 
870 	atomic_set(&caps->nr_total_credits, new_nr_creds);
871 	/*
872 	 * The total number of available credits may be decreased or
873 	 * increased with DLPAR operation. Means some windows have to be
874 	 * closed / reopened. Hold the vas_pseries_mutex so that the
875 	 * user space can not open new windows.
876 	 */
877 	if (old_nr_creds <  new_nr_creds) {
878 		/*
879 		 * If the existing target credits is less than the new
880 		 * target, reopen windows if they are closed due to
881 		 * the previous DLPAR (core removal).
882 		 */
883 		rc = reconfig_open_windows(vcaps, new_nr_creds - old_nr_creds,
884 					   false);
885 	} else {
886 		/*
887 		 * # active windows is more than new LPAR available
888 		 * credits. So close the excessive windows.
889 		 * On pseries, each window will have 1 credit.
890 		 */
891 		nr_active_wins = vcaps->nr_open_windows - vcaps->nr_close_wins;
892 		if (nr_active_wins > new_nr_creds)
893 			rc = reconfig_close_windows(vcaps,
894 					nr_active_wins - new_nr_creds,
895 					false);
896 	}
897 
898 	mutex_unlock(&vas_pseries_mutex);
899 	return rc;
900 }
901 
902 int pseries_vas_dlpar_cpu(void)
903 {
904 	int new_nr_creds, rc;
905 
906 	/*
907 	 * NX-GZIP is not enabled. Nothing to do for DLPAR event
908 	 */
909 	if (!copypaste_feat)
910 		return 0;
911 
912 
913 	rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES,
914 				      vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat,
915 				      (u64)virt_to_phys(&hv_cop_caps));
916 	if (!rc) {
917 		new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds);
918 		rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, new_nr_creds);
919 	}
920 
921 	if (rc)
922 		pr_err("Failed reconfig VAS capabilities with DLPAR\n");
923 
924 	return rc;
925 }
926 
927 /*
928  * Total number of default credits available (target_credits)
929  * in LPAR depends on number of cores configured. It varies based on
930  * whether processors are in shared mode or dedicated mode.
931  * Get the notifier when CPU configuration is changed with DLPAR
932  * operation so that get the new target_credits (vas default capabilities)
933  * and then update the existing windows usage if needed.
934  */
935 static int pseries_vas_notifier(struct notifier_block *nb,
936 				unsigned long action, void *data)
937 {
938 	struct of_reconfig_data *rd = data;
939 	struct device_node *dn = rd->dn;
940 	const __be32 *intserv = NULL;
941 	int len;
942 
943 	/*
944 	 * For shared CPU partition, the hypervisor assigns total credits
945 	 * based on entitled core capacity. So updating VAS windows will
946 	 * be called from lparcfg_write().
947 	 */
948 	if (is_shared_processor())
949 		return NOTIFY_OK;
950 
951 	if ((action == OF_RECONFIG_ATTACH_NODE) ||
952 		(action == OF_RECONFIG_DETACH_NODE))
953 		intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s",
954 					  &len);
955 	/*
956 	 * Processor config is not changed
957 	 */
958 	if (!intserv)
959 		return NOTIFY_OK;
960 
961 	return pseries_vas_dlpar_cpu();
962 }
963 
964 static struct notifier_block pseries_vas_nb = {
965 	.notifier_call = pseries_vas_notifier,
966 };
967 
968 /*
969  * For LPM, all windows have to be closed on the source partition
970  * before migration and reopen them on the destination partition
971  * after migration. So closing windows during suspend and
972  * reopen them during resume.
973  */
974 int vas_migration_handler(int action)
975 {
976 	struct vas_cop_feat_caps *caps;
977 	int old_nr_creds, new_nr_creds = 0;
978 	struct vas_caps *vcaps;
979 	int i, rc = 0;
980 
981 	pr_info("VAS migration event %d\n", action);
982 
983 	/*
984 	 * NX-GZIP is not enabled. Nothing to do for migration.
985 	 */
986 	if (!copypaste_feat)
987 		return rc;
988 
989 	if (action == VAS_SUSPEND)
990 		migration_in_progress = true;
991 	else
992 		migration_in_progress = false;
993 
994 	for (i = 0; i < VAS_MAX_FEAT_TYPE; i++) {
995 		vcaps = &vascaps[i];
996 		caps = &vcaps->caps;
997 		old_nr_creds = atomic_read(&caps->nr_total_credits);
998 
999 		rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES,
1000 					      vcaps->feat,
1001 					      (u64)virt_to_phys(&hv_cop_caps));
1002 		if (!rc) {
1003 			new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds);
1004 			/*
1005 			 * Should not happen. But incase print messages, close
1006 			 * all windows in the list during suspend and reopen
1007 			 * windows based on new lpar_creds on the destination
1008 			 * system.
1009 			 */
1010 			if (old_nr_creds != new_nr_creds) {
1011 				pr_err("Target credits mismatch with the hypervisor\n");
1012 				pr_err("state(%d): lpar creds: %d HV lpar creds: %d\n",
1013 					action, old_nr_creds, new_nr_creds);
1014 				pr_err("Used creds: %d, Active creds: %d\n",
1015 					atomic_read(&caps->nr_used_credits),
1016 					vcaps->nr_open_windows - vcaps->nr_close_wins);
1017 			}
1018 		} else {
1019 			pr_err("state(%d): Get VAS capabilities failed with %d\n",
1020 				action, rc);
1021 			/*
1022 			 * We can not stop migration with the current lpm
1023 			 * implementation. So continue closing all windows in
1024 			 * the list (during suspend) and return without
1025 			 * opening windows (during resume) if VAS capabilities
1026 			 * HCALL failed.
1027 			 */
1028 			if (action == VAS_RESUME)
1029 				goto out;
1030 		}
1031 
1032 		switch (action) {
1033 		case VAS_SUSPEND:
1034 			mutex_lock(&vas_pseries_mutex);
1035 			rc = reconfig_close_windows(vcaps, vcaps->nr_open_windows,
1036 							true);
1037 			/*
1038 			 * Windows are included in the list after successful
1039 			 * open. So wait for closing these in-progress open
1040 			 * windows in vas_allocate_window() which will be
1041 			 * done if the migration_in_progress is set.
1042 			 */
1043 			while (vcaps->nr_open_wins_progress) {
1044 				mutex_unlock(&vas_pseries_mutex);
1045 				msleep(10);
1046 				mutex_lock(&vas_pseries_mutex);
1047 			}
1048 			mutex_unlock(&vas_pseries_mutex);
1049 			break;
1050 		case VAS_RESUME:
1051 			mutex_lock(&vas_pseries_mutex);
1052 			atomic_set(&caps->nr_total_credits, new_nr_creds);
1053 			rc = reconfig_open_windows(vcaps, new_nr_creds, true);
1054 			mutex_unlock(&vas_pseries_mutex);
1055 			break;
1056 		default:
1057 			/* should not happen */
1058 			pr_err("Invalid migration action %d\n", action);
1059 			rc = -EINVAL;
1060 			goto out;
1061 		}
1062 
1063 		/*
1064 		 * Ignore errors during suspend and return for resume.
1065 		 */
1066 		if (rc && (action == VAS_RESUME))
1067 			goto out;
1068 	}
1069 
1070 	pr_info("VAS migration event (%d) successful\n", action);
1071 
1072 out:
1073 	return rc;
1074 }
1075 
1076 static int __init pseries_vas_init(void)
1077 {
1078 	struct hv_vas_all_caps *hv_caps;
1079 	int rc = 0;
1080 
1081 	/*
1082 	 * Linux supports user space COPY/PASTE only with Radix
1083 	 */
1084 	if (!radix_enabled()) {
1085 		copypaste_feat = false;
1086 		pr_err("API is supported only with radix page tables\n");
1087 		return -ENOTSUPP;
1088 	}
1089 
1090 	hv_caps = kmalloc(sizeof(*hv_caps), GFP_KERNEL);
1091 	if (!hv_caps)
1092 		return -ENOMEM;
1093 	/*
1094 	 * Get VAS overall capabilities by passing 0 to feature type.
1095 	 */
1096 	rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, 0,
1097 					  (u64)virt_to_phys(hv_caps));
1098 	if (rc)
1099 		goto out;
1100 
1101 	caps_all.descriptor = be64_to_cpu(hv_caps->descriptor);
1102 	caps_all.feat_type = be64_to_cpu(hv_caps->feat_type);
1103 
1104 	sysfs_pseries_vas_init(&caps_all);
1105 
1106 	/*
1107 	 * QOS capabilities available
1108 	 */
1109 	if (caps_all.feat_type & VAS_GZIP_QOS_FEAT_BIT) {
1110 		rc = get_vas_capabilities(VAS_GZIP_QOS_FEAT,
1111 					  VAS_GZIP_QOS_FEAT_TYPE, &hv_cop_caps);
1112 
1113 		if (rc)
1114 			goto out;
1115 	}
1116 	/*
1117 	 * Default capabilities available
1118 	 */
1119 	if (caps_all.feat_type & VAS_GZIP_DEF_FEAT_BIT)
1120 		rc = get_vas_capabilities(VAS_GZIP_DEF_FEAT,
1121 					  VAS_GZIP_DEF_FEAT_TYPE, &hv_cop_caps);
1122 
1123 	if (!rc && copypaste_feat) {
1124 		if (firmware_has_feature(FW_FEATURE_LPAR))
1125 			of_reconfig_notifier_register(&pseries_vas_nb);
1126 
1127 		pr_info("GZIP feature is available\n");
1128 	} else {
1129 		/*
1130 		 * Should not happen, but only when get default
1131 		 * capabilities HCALL failed. So disable copy paste
1132 		 * feature.
1133 		 */
1134 		copypaste_feat = false;
1135 	}
1136 
1137 out:
1138 	kfree(hv_caps);
1139 	return rc;
1140 }
1141 machine_device_initcall(pseries, pseries_vas_init);
1142