xref: /linux/drivers/hv/vmbus_drv.c (revision 86c48271e0d60c82665e9fd61277002391efcef7)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2009, Microsoft Corporation.
4  *
5  * Authors:
6  *   Haiyang Zhang <haiyangz@microsoft.com>
7  *   Hank Janssen  <hjanssen@microsoft.com>
8  *   K. Y. Srinivasan <kys@microsoft.com>
9  */
10 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11 
12 #include <linux/init.h>
13 #include <linux/module.h>
14 #include <linux/device.h>
15 #include <linux/platform_device.h>
16 #include <linux/interrupt.h>
17 #include <linux/sysctl.h>
18 #include <linux/slab.h>
19 #include <linux/acpi.h>
20 #include <linux/completion.h>
21 #include <linux/hyperv.h>
22 #include <linux/kernel_stat.h>
23 #include <linux/of_address.h>
24 #include <linux/clockchips.h>
25 #include <linux/cpu.h>
26 #include <linux/sched/isolation.h>
27 #include <linux/sched/task_stack.h>
28 
29 #include <linux/delay.h>
30 #include <linux/panic_notifier.h>
31 #include <linux/ptrace.h>
32 #include <linux/screen_info.h>
33 #include <linux/efi.h>
34 #include <linux/random.h>
35 #include <linux/kernel.h>
36 #include <linux/syscore_ops.h>
37 #include <linux/dma-map-ops.h>
38 #include <linux/pci.h>
39 #include <clocksource/hyperv_timer.h>
40 #include <asm/mshyperv.h>
41 #include "hyperv_vmbus.h"
42 
/*
 * A dynamically added device ID (via the driver's sysfs "new_id" file),
 * kept on hv_driver.dynids.list and matched before the static id_table.
 */
struct vmbus_dynid {
	struct list_head node;
	struct hv_vmbus_device_id id;
};
47 
/* VMBus Root Device */
static struct device  *vmbus_root_device;

/* CPU hotplug state; set up elsewhere in this file — not visible in this chunk */
static int hyperv_cpuhp_online;

/* Per-CPU VMBus event counter — presumably for interrupt accounting; confirm at use sites */
static long __percpu *vmbus_evt;

/* Values parsed from ACPI DSDT */
int vmbus_irq;
int vmbus_interrupt;
58 
/*
 * The panic notifier below is responsible solely for unloading the
 * vmbus connection, which is necessary in a panic event.
 *
 * Notice an intrincate relation of this notifier with Hyper-V
 * framebuffer panic notifier exists - we need vmbus connection alive
 * there in order to succeed, so we need to order both with each other
 * [see hvfb_on_panic()] - this is done using notifiers' priorities.
 */
static int hv_panic_vmbus_unload(struct notifier_block *nb, unsigned long val,
			      void *args)
{
	/* Tell the host to tear down the VMBus connection (crash path: true) */
	vmbus_initiate_unload(true);
	return NOTIFY_DONE;
}
static struct notifier_block hyperv_panic_vmbus_unload_block = {
	.notifier_call	= hv_panic_vmbus_unload,
	.priority	= INT_MIN + 1, /* almost the latest one to execute */
};
78 
/* Framebuffer MMIO range claimed out of the VMBus MMIO space */
static const char *fb_mmio_name = "fb_range";
static struct resource *fb_mmio;
/* Root of the hypervisor-provided MMIO resource tree; guarded by hyperv_mmio_lock */
static struct resource *hyperv_mmio;
static DEFINE_MUTEX(hyperv_mmio_lock);
83 
/* Return the VMBus root device, or NULL if the bus is not initialized yet. */
struct device *hv_get_vmbus_root_device(void)
{
	return vmbus_root_device;
}
EXPORT_SYMBOL_GPL(hv_get_vmbus_root_device);
89 
90 static int vmbus_exists(void)
91 {
92 	if (vmbus_root_device == NULL)
93 		return -ENODEV;
94 
95 	return 0;
96 }
97 
98 static u8 channel_monitor_group(const struct vmbus_channel *channel)
99 {
100 	return (u8)channel->offermsg.monitorid / 32;
101 }
102 
103 static u8 channel_monitor_offset(const struct vmbus_channel *channel)
104 {
105 	return (u8)channel->offermsg.monitorid % 32;
106 }
107 
/* Pending-trigger word for this channel's monitor group on the given monitor page. */
static u32 channel_pending(const struct vmbus_channel *channel,
			   const struct hv_monitor_page *monitor_page)
{
	u8 monitor_group = channel_monitor_group(channel);

	return monitor_page->trigger_group[monitor_group].pending;
}

/* Latency value recorded for this channel's monitor slot on the given monitor page. */
static u32 channel_latency(const struct vmbus_channel *channel,
			   const struct hv_monitor_page *monitor_page)
{
	u8 monitor_group = channel_monitor_group(channel);
	u8 monitor_offset = channel_monitor_offset(channel);

	return monitor_page->latency[monitor_group][monitor_offset];
}

/* Connection ID stored in this channel's monitor parameter slot. */
static u32 channel_conn_id(struct vmbus_channel *channel,
			   struct hv_monitor_page *monitor_page)
{
	u8 monitor_group = channel_monitor_group(channel);
	u8 monitor_offset = channel_monitor_offset(channel);

	return monitor_page->parameter[monitor_group][monitor_offset].connectionid.u.id;
}
133 
/* Sysfs: child relid assigned by the host in the channel offer. */
static ssize_t id_show(struct device *dev, struct device_attribute *dev_attr,
		       char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.child_relid);
}
static DEVICE_ATTR_RO(id);

/* Sysfs: current channel state (numeric enum value). */
static ssize_t state_show(struct device *dev, struct device_attribute *dev_attr,
			  char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", hv_dev->channel->state);
}
static DEVICE_ATTR_RO(state);

/* Sysfs: monitor ID from the channel offer. */
static ssize_t monitor_id_show(struct device *dev,
			       struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.monitorid);
}
static DEVICE_ATTR_RO(monitor_id);
166 
/* Sysfs: interface type GUID ("class") from the channel offer. */
static ssize_t class_id_show(struct device *dev,
			       struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "{%pUl}\n",
			  &hv_dev->channel->offermsg.offer.if_type);
}
static DEVICE_ATTR_RO(class_id);

/* Sysfs: interface instance GUID (per-device) from the channel offer. */
static ssize_t device_id_show(struct device *dev,
			      struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "{%pUl}\n",
			  &hv_dev->channel->offermsg.offer.if_instance);
}
static DEVICE_ATTR_RO(device_id);

/* Sysfs: modalias string ("vmbus:<guid bytes in hex>") used for module autoload. */
static ssize_t modalias_show(struct device *dev,
			     struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	return sysfs_emit(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type);
}
static DEVICE_ATTR_RO(modalias);
199 
#ifdef CONFIG_NUMA
/* Sysfs: NUMA node of the CPU this channel's interrupts currently target. */
static ssize_t numa_node_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;

	return sysfs_emit(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu));
}
static DEVICE_ATTR_RO(numa_node);
#endif
213 
/*
 * The monitor_pages pair is shared with the host: index 0 is the
 * guest->host (server) page, index 1 is the host->guest (client) page.
 * These attributes are hidden when the channel does not use the monitor
 * mechanism — see vmbus_dev_attr_is_visible().
 */

/* Sysfs: pending-trigger word for this channel's group on the server page. */
static ssize_t server_monitor_pending_show(struct device *dev,
					   struct device_attribute *dev_attr,
					   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel,
			  vmbus_connection.monitor_pages[0]));
}
static DEVICE_ATTR_RO(server_monitor_pending);

/* Sysfs: pending-trigger word for this channel's group on the client page. */
static ssize_t client_monitor_pending_show(struct device *dev,
					   struct device_attribute *dev_attr,
					   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel,
			  vmbus_connection.monitor_pages[1]));
}
static DEVICE_ATTR_RO(client_monitor_pending);

/* Sysfs: monitor latency for this channel's slot on the server page. */
static ssize_t server_monitor_latency_show(struct device *dev,
					   struct device_attribute *dev_attr,
					   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel,
			  vmbus_connection.monitor_pages[0]));
}
static DEVICE_ATTR_RO(server_monitor_latency);

/* Sysfs: monitor latency for this channel's slot on the client page. */
static ssize_t client_monitor_latency_show(struct device *dev,
					   struct device_attribute *dev_attr,
					   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel,
			  vmbus_connection.monitor_pages[1]));
}
static DEVICE_ATTR_RO(client_monitor_latency);

/* Sysfs: connection ID in this channel's parameter slot on the server page. */
static ssize_t server_monitor_conn_id_show(struct device *dev,
					   struct device_attribute *dev_attr,
					   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel,
			  vmbus_connection.monitor_pages[0]));
}
static DEVICE_ATTR_RO(server_monitor_conn_id);

/* Sysfs: connection ID in this channel's parameter slot on the client page. */
static ssize_t client_monitor_conn_id_show(struct device *dev,
					   struct device_attribute *dev_attr,
					   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel,
			  vmbus_connection.monitor_pages[1]));
}
static DEVICE_ATTR_RO(client_monitor_conn_id);
291 
/*
 * Outbound (guest->host) ring buffer attributes. Each one takes a
 * point-in-time snapshot via hv_ringbuffer_get_debuginfo() and emits a
 * single field from it.
 */

/* Sysfs: current interrupt mask of the outbound ring. */
static ssize_t out_intr_mask_show(struct device *dev,
				  struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info outbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
					  &outbound);
	if (ret < 0)
		return ret;

	return sysfs_emit(buf, "%d\n", outbound.current_interrupt_mask);
}
static DEVICE_ATTR_RO(out_intr_mask);

/* Sysfs: read index of the outbound ring. */
static ssize_t out_read_index_show(struct device *dev,
				   struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info outbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
					  &outbound);
	if (ret < 0)
		return ret;
	return sysfs_emit(buf, "%d\n", outbound.current_read_index);
}
static DEVICE_ATTR_RO(out_read_index);

/* Sysfs: write index of the outbound ring. */
static ssize_t out_write_index_show(struct device *dev,
				    struct device_attribute *dev_attr,
				    char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info outbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
					  &outbound);
	if (ret < 0)
		return ret;
	return sysfs_emit(buf, "%d\n", outbound.current_write_index);
}
static DEVICE_ATTR_RO(out_write_index);

/* Sysfs: bytes available to read from the outbound ring. */
static ssize_t out_read_bytes_avail_show(struct device *dev,
					 struct device_attribute *dev_attr,
					 char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info outbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
					  &outbound);
	if (ret < 0)
		return ret;
	return sysfs_emit(buf, "%d\n", outbound.bytes_avail_toread);
}
static DEVICE_ATTR_RO(out_read_bytes_avail);

/* Sysfs: bytes available to write into the outbound ring. */
static ssize_t out_write_bytes_avail_show(struct device *dev,
					  struct device_attribute *dev_attr,
					  char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info outbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
					  &outbound);
	if (ret < 0)
		return ret;
	return sysfs_emit(buf, "%d\n", outbound.bytes_avail_towrite);
}
static DEVICE_ATTR_RO(out_write_bytes_avail);
385 
/*
 * Inbound (host->guest) ring buffer attributes — mirror images of the
 * out_* attributes above, reading the channel's inbound ring snapshot.
 */

/* Sysfs: current interrupt mask of the inbound ring. */
static ssize_t in_intr_mask_show(struct device *dev,
				 struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info inbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
	if (ret < 0)
		return ret;

	return sysfs_emit(buf, "%d\n", inbound.current_interrupt_mask);
}
static DEVICE_ATTR_RO(in_intr_mask);

/* Sysfs: read index of the inbound ring. */
static ssize_t in_read_index_show(struct device *dev,
				  struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info inbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
	if (ret < 0)
		return ret;

	return sysfs_emit(buf, "%d\n", inbound.current_read_index);
}
static DEVICE_ATTR_RO(in_read_index);

/* Sysfs: write index of the inbound ring. */
static ssize_t in_write_index_show(struct device *dev,
				   struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info inbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
	if (ret < 0)
		return ret;

	return sysfs_emit(buf, "%d\n", inbound.current_write_index);
}
static DEVICE_ATTR_RO(in_write_index);

/* Sysfs: bytes available to read from the inbound ring. */
static ssize_t in_read_bytes_avail_show(struct device *dev,
					struct device_attribute *dev_attr,
					char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info inbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
	if (ret < 0)
		return ret;

	return sysfs_emit(buf, "%d\n", inbound.bytes_avail_toread);
}
static DEVICE_ATTR_RO(in_read_bytes_avail);

/* Sysfs: bytes available to write into the inbound ring. */
static ssize_t in_write_bytes_avail_show(struct device *dev,
					 struct device_attribute *dev_attr,
					 char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info inbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
	if (ret < 0)
		return ret;

	return sysfs_emit(buf, "%d\n", inbound.bytes_avail_towrite);
}
static DEVICE_ATTR_RO(in_write_bytes_avail);
477 
478 static ssize_t channel_vp_mapping_show(struct device *dev,
479 				       struct device_attribute *dev_attr,
480 				       char *buf)
481 {
482 	struct hv_device *hv_dev = device_to_hv_device(dev);
483 	struct vmbus_channel *channel = hv_dev->channel, *cur_sc;
484 	int n_written;
485 	struct list_head *cur;
486 
487 	if (!channel)
488 		return -ENODEV;
489 
490 	mutex_lock(&vmbus_connection.channel_mutex);
491 
492 	n_written = sysfs_emit(buf, "%u:%u\n",
493 			       channel->offermsg.child_relid,
494 			       channel->target_cpu);
495 
496 	list_for_each(cur, &channel->sc_list) {
497 
498 		cur_sc = list_entry(cur, struct vmbus_channel, sc_list);
499 		n_written += sysfs_emit_at(buf, n_written, "%u:%u\n",
500 					  cur_sc->offermsg.child_relid,
501 					  cur_sc->target_cpu);
502 	}
503 
504 	mutex_unlock(&vmbus_connection.channel_mutex);
505 
506 	return n_written;
507 }
508 static DEVICE_ATTR_RO(channel_vp_mapping);
509 
/* Sysfs: PCI-style vendor ID carried by the hv_device (hex). */
static ssize_t vendor_show(struct device *dev,
			   struct device_attribute *dev_attr,
			   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	return sysfs_emit(buf, "0x%x\n", hv_dev->vendor_id);
}
static DEVICE_ATTR_RO(vendor);

/* Sysfs: PCI-style device ID carried by the hv_device (hex). */
static ssize_t device_show(struct device *dev,
			   struct device_attribute *dev_attr,
			   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	return sysfs_emit(buf, "0x%x\n", hv_dev->device_id);
}
static DEVICE_ATTR_RO(device);
529 
/*
 * Sysfs: set (or clear) the driver name this device must bind to;
 * driver_set_override() handles allocation, validation and locking.
 */
static ssize_t driver_override_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf, size_t count)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	int ret;

	ret = driver_set_override(dev, &hv_dev->driver_override, buf, count);
	if (ret)
		return ret;

	return count;
}

/* Sysfs: read the current driver_override string (under the device lock). */
static ssize_t driver_override_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	ssize_t len;

	/* device_lock serializes against driver_override_store() updates */
	device_lock(dev);
	len = sysfs_emit(buf, "%s\n", hv_dev->driver_override);
	device_unlock(dev);

	return len;
}
static DEVICE_ATTR_RW(driver_override);
557 
/* Set up per device attributes in /sys/bus/vmbus/devices/<bus device> */
static struct attribute *vmbus_dev_attrs[] = {
	/* Identity */
	&dev_attr_id.attr,
	&dev_attr_state.attr,
	&dev_attr_monitor_id.attr,
	&dev_attr_class_id.attr,
	&dev_attr_device_id.attr,
	&dev_attr_modalias.attr,
#ifdef CONFIG_NUMA
	&dev_attr_numa_node.attr,
#endif
	/* Monitor-page state (hidden if the monitor mechanism is unused) */
	&dev_attr_server_monitor_pending.attr,
	&dev_attr_client_monitor_pending.attr,
	&dev_attr_server_monitor_latency.attr,
	&dev_attr_client_monitor_latency.attr,
	&dev_attr_server_monitor_conn_id.attr,
	&dev_attr_client_monitor_conn_id.attr,
	/* Ring buffer snapshots */
	&dev_attr_out_intr_mask.attr,
	&dev_attr_out_read_index.attr,
	&dev_attr_out_write_index.attr,
	&dev_attr_out_read_bytes_avail.attr,
	&dev_attr_out_write_bytes_avail.attr,
	&dev_attr_in_intr_mask.attr,
	&dev_attr_in_read_index.attr,
	&dev_attr_in_write_index.attr,
	&dev_attr_in_read_bytes_avail.attr,
	&dev_attr_in_write_bytes_avail.attr,
	&dev_attr_channel_vp_mapping.attr,
	&dev_attr_vendor.attr,
	&dev_attr_device.attr,
	&dev_attr_driver_override.attr,
	NULL,
};
591 
/*
 * Device-level attribute_group callback function. Returns the permission for
 * each attribute, and returns 0 if an attribute is not visible.
 */
static umode_t vmbus_dev_attr_is_visible(struct kobject *kobj,
					 struct attribute *attr, int idx)
{
	struct device *dev = kobj_to_dev(kobj);
	const struct hv_device *hv_dev = device_to_hv_device(dev);

	/*
	 * Hide the monitor attributes if the monitor mechanism is not used.
	 *
	 * NOTE(review): hv_dev->channel is dereferenced without a NULL check
	 * here, unlike the *_show() handlers — presumably the channel is
	 * always set when group visibility is evaluated at device
	 * registration; confirm against the device creation path.
	 */
	if (!hv_dev->channel->offermsg.monitor_allocated &&
	    (attr == &dev_attr_monitor_id.attr ||
	     attr == &dev_attr_server_monitor_pending.attr ||
	     attr == &dev_attr_client_monitor_pending.attr ||
	     attr == &dev_attr_server_monitor_latency.attr ||
	     attr == &dev_attr_client_monitor_latency.attr ||
	     attr == &dev_attr_server_monitor_conn_id.attr ||
	     attr == &dev_attr_client_monitor_conn_id.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group vmbus_dev_group = {
	.attrs = vmbus_dev_attrs,
	.is_visible = vmbus_dev_attr_is_visible
};
__ATTRIBUTE_GROUPS(vmbus_dev);
621 
622 /* Set up the attribute for /sys/bus/vmbus/hibernation */
623 static ssize_t hibernation_show(const struct bus_type *bus, char *buf)
624 {
625 	return sprintf(buf, "%d\n", !!hv_is_hibernation_supported());
626 }
627 
628 static BUS_ATTR_RO(hibernation);
629 
/* Bus-level attributes exposed under /sys/bus/vmbus/ */
static struct attribute *vmbus_bus_attrs[] = {
	&bus_attr_hibernation.attr,
	NULL,
};
static const struct attribute_group vmbus_bus_group = {
	.attrs = vmbus_bus_attrs,
};
__ATTRIBUTE_GROUPS(vmbus_bus);
638 
/*
 * vmbus_uevent - add uevent for our device
 *
 * This routine is invoked when a device is added or removed on the vmbus to
 * generate a uevent to udev in the userspace. The udev will then look at its
 * rule and the uevent generated here to load the appropriate driver
 *
 * The alias string will be of the form vmbus:guid where guid is the string
 * representation of the device guid (each byte of the guid will be
 * represented with two hex characters.
 */
static int vmbus_uevent(const struct device *device, struct kobj_uevent_env *env)
{
	const struct hv_device *dev = device_to_hv_device(device);
	/* Must stay in sync with the format used by modalias_show() above */
	const char *format = "MODALIAS=vmbus:%*phN";

	return add_uevent_var(env, format, UUID_SIZE, &dev->dev_type);
}
657 
658 static const struct hv_vmbus_device_id *
659 hv_vmbus_dev_match(const struct hv_vmbus_device_id *id, const guid_t *guid)
660 {
661 	if (id == NULL)
662 		return NULL; /* empty device table */
663 
664 	for (; !guid_is_null(&id->guid); id++)
665 		if (guid_equal(&id->guid, guid))
666 			return id;
667 
668 	return NULL;
669 }
670 
/*
 * Search the driver's dynamically added IDs (sysfs "new_id") for @guid.
 * Returns the matching entry or NULL. The returned pointer is only
 * protected by dynids.lock while held here; see callers for lifetime.
 */
static const struct hv_vmbus_device_id *
hv_vmbus_dynid_match(struct hv_driver *drv, const guid_t *guid)
{
	const struct hv_vmbus_device_id *id = NULL;
	struct vmbus_dynid *dynid;

	spin_lock(&drv->dynids.lock);
	list_for_each_entry(dynid, &drv->dynids.list, node) {
		if (guid_equal(&dynid->id.guid, guid)) {
			id = &dynid->id;
			break;
		}
	}
	spin_unlock(&drv->dynids.lock);

	return id;
}
688 
/* All-zero sentinel id returned when only driver_override forces a match */
static const struct hv_vmbus_device_id vmbus_device_null;

/*
 * Return a matching hv_vmbus_device_id pointer.
 * If there is no match, return NULL.
 */
static const struct hv_vmbus_device_id *hv_vmbus_get_id(const struct hv_driver *drv,
							struct hv_device *dev)
{
	const guid_t *guid = &dev->dev_type;
	const struct hv_vmbus_device_id *id;

	/* When driver_override is set, only bind to the matching driver */
	if (dev->driver_override && strcmp(dev->driver_override, drv->name))
		return NULL;

	/* Look at the dynamic ids first, before the static ones */
	id = hv_vmbus_dynid_match((struct hv_driver *)drv, guid);
	if (!id)
		id = hv_vmbus_dev_match(drv->id_table, guid);

	/* driver_override will always match, send a dummy id */
	if (!id && dev->driver_override)
		id = &vmbus_device_null;

	return id;
}
716 
/* vmbus_add_dynid - add a new device ID to this driver and re-probe devices */
static int vmbus_add_dynid(struct hv_driver *drv, guid_t *guid)
{
	struct vmbus_dynid *dynid;

	dynid = kzalloc(sizeof(*dynid), GFP_KERNEL);
	if (!dynid)
		return -ENOMEM;

	dynid->id.guid = *guid;

	spin_lock(&drv->dynids.lock);
	list_add_tail(&dynid->node, &drv->dynids.list);
	spin_unlock(&drv->dynids.lock);

	/* Re-run matching so unbound devices can pick up the new ID */
	return driver_attach(&drv->driver);
}

/* Release every dynamically added ID; called when the driver is unregistered. */
static void vmbus_free_dynids(struct hv_driver *drv)
{
	struct vmbus_dynid *dynid, *n;

	spin_lock(&drv->dynids.lock);
	list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) {
		list_del(&dynid->node);
		kfree(dynid);
	}
	spin_unlock(&drv->dynids.lock);
}
746 
/*
 * new_id_store - sysfs frontend to vmbus_add_dynid()
 *
 * Allow GUIDs to be added to an existing driver via sysfs.
 */
static ssize_t new_id_store(struct device_driver *driver, const char *buf,
			    size_t count)
{
	struct hv_driver *drv = drv_to_hv_drv(driver);
	guid_t guid;
	ssize_t retval;

	retval = guid_parse(buf, &guid);
	if (retval)
		return retval;

	/* Reject duplicates rather than accumulating identical entries */
	if (hv_vmbus_dynid_match(drv, &guid))
		return -EEXIST;

	retval = vmbus_add_dynid(drv, &guid);
	if (retval)
		return retval;
	return count;
}
static DRIVER_ATTR_WO(new_id);
772 
/*
 * remove_id_store - remove a vmbus device ID from this driver
 *
 * Removes a dynamically added vmbus device ID from this driver.
 */
static ssize_t remove_id_store(struct device_driver *driver, const char *buf,
			       size_t count)
{
	struct hv_driver *drv = drv_to_hv_drv(driver);
	struct vmbus_dynid *dynid, *n;
	guid_t guid;
	ssize_t retval;

	retval = guid_parse(buf, &guid);
	if (retval)
		return retval;

	/* -ENODEV unless the GUID is found below */
	retval = -ENODEV;
	spin_lock(&drv->dynids.lock);
	list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) {
		struct hv_vmbus_device_id *id = &dynid->id;

		if (guid_equal(&id->guid, &guid)) {
			list_del(&dynid->node);
			kfree(dynid);
			retval = count;
			break;
		}
	}
	spin_unlock(&drv->dynids.lock);

	return retval;
}
static DRIVER_ATTR_WO(remove_id);
807 
/* Per-driver sysfs files: new_id / remove_id for dynamic GUID binding */
static struct attribute *vmbus_drv_attrs[] = {
	&driver_attr_new_id.attr,
	&driver_attr_remove_id.attr,
	NULL,
};
ATTRIBUTE_GROUPS(vmbus_drv);
814 

/*
 * vmbus_match - Attempt to match the specified device to the specified driver
 */
static int vmbus_match(struct device *device, const struct device_driver *driver)
{
	const struct hv_driver *drv = drv_to_hv_drv(driver);
	struct hv_device *hv_dev = device_to_hv_device(device);

	/* The hv_sock driver handles all hv_sock offers. */
	if (is_hvsock_channel(hv_dev->channel))
		return drv->hvsock;

	/* Otherwise match by GUID: dynamic ids, static table, or override */
	if (hv_vmbus_get_id(drv, hv_dev))
		return 1;

	return 0;
}
833 
834 /*
835  * vmbus_probe - Add the new vmbus's child device
836  */
837 static int vmbus_probe(struct device *child_device)
838 {
839 	int ret = 0;
840 	struct hv_driver *drv =
841 			drv_to_hv_drv(child_device->driver);
842 	struct hv_device *dev = device_to_hv_device(child_device);
843 	const struct hv_vmbus_device_id *dev_id;
844 
845 	dev_id = hv_vmbus_get_id(drv, dev);
846 	if (drv->probe) {
847 		ret = drv->probe(dev, dev_id);
848 		if (ret != 0)
849 			pr_err("probe failed for device %s (%d)\n",
850 			       dev_name(child_device), ret);
851 
852 	} else {
853 		pr_err("probe not set for driver %s\n",
854 		       dev_name(child_device));
855 		ret = -ENODEV;
856 	}
857 	return ret;
858 }
859 
/*
 * vmbus_dma_configure -- Configure DMA coherence for VMbus device
 */
static int vmbus_dma_configure(struct device *child_device)
{
	/*
	 * On ARM64, propagate the DMA coherence setting from the top level
	 * VMbus ACPI device to the child VMbus device being added here.
	 * On x86/x64 coherence is assumed and these calls have no effect.
	 */
	hv_setup_dma_ops(child_device,
		device_get_dma_attr(vmbus_root_device) == DEV_DMA_COHERENT);
	return 0;
}
874 
875 /*
876  * vmbus_remove - Remove a vmbus device
877  */
878 static void vmbus_remove(struct device *child_device)
879 {
880 	struct hv_driver *drv;
881 	struct hv_device *dev = device_to_hv_device(child_device);
882 
883 	if (child_device->driver) {
884 		drv = drv_to_hv_drv(child_device->driver);
885 		if (drv->remove)
886 			drv->remove(dev);
887 	}
888 }
889 
/*
 * vmbus_shutdown - Shutdown a vmbus device
 *
 * Invoked on system shutdown; forwards to the bound driver's optional
 * shutdown() callback.
 */
static void vmbus_shutdown(struct device *child_device)
{
	struct hv_driver *drv;
	struct hv_device *dev = device_to_hv_device(child_device);


	/* The device may not be attached yet */
	if (!child_device->driver)
		return;

	drv = drv_to_hv_drv(child_device->driver);

	if (drv->shutdown)
		drv->shutdown(dev);
}
908 
#ifdef CONFIG_PM_SLEEP
/*
 * vmbus_suspend - Suspend a vmbus device
 *
 * Used only on the hibernation paths (freeze/poweroff); see vmbus_pm below.
 * A bound driver must implement suspend for hibernation to proceed.
 */
static int vmbus_suspend(struct device *child_device)
{
	struct hv_driver *drv;
	struct hv_device *dev = device_to_hv_device(child_device);

	/* The device may not be attached yet */
	if (!child_device->driver)
		return 0;

	drv = drv_to_hv_drv(child_device->driver);
	if (!drv->suspend)
		return -EOPNOTSUPP;

	return drv->suspend(dev);
}

/*
 * vmbus_resume - Resume a vmbus device
 *
 * Counterpart of vmbus_suspend() for the thaw/restore hibernation paths.
 */
static int vmbus_resume(struct device *child_device)
{
	struct hv_driver *drv;
	struct hv_device *dev = device_to_hv_device(child_device);

	/* The device may not be attached yet */
	if (!child_device->driver)
		return 0;

	drv = drv_to_hv_drv(child_device->driver);
	if (!drv->resume)
		return -EOPNOTSUPP;

	return drv->resume(dev);
}
#else
#define vmbus_suspend NULL
#define vmbus_resume NULL
#endif /* CONFIG_PM_SLEEP */
951 
/*
 * vmbus_device_release - Final callback release of the vmbus child device
 *
 * Runs when the last reference to the struct device is dropped: removes
 * the debugfs entries, detaches the channel under channel_mutex, and
 * frees the hv_device itself.
 */
static void vmbus_device_release(struct device *device)
{
	struct hv_device *hv_dev = device_to_hv_device(device);
	struct vmbus_channel *channel = hv_dev->channel;

	hv_debug_rm_dev_dir(hv_dev);

	mutex_lock(&vmbus_connection.channel_mutex);
	hv_process_channel_removal(channel);
	mutex_unlock(&vmbus_connection.channel_mutex);
	kfree(hv_dev);
}
967 
/*
 * Note: we must use the "noirq" ops: see the comment before vmbus_bus_pm.
 *
 * suspend_noirq/resume_noirq are set to NULL to support Suspend-to-Idle: we
 * shouldn't suspend the vmbus devices upon Suspend-to-Idle, otherwise there
 * is no way to wake up a Generation-2 VM.
 *
 * The other 4 ops are for hibernation.
 */

static const struct dev_pm_ops vmbus_pm = {
	.suspend_noirq	= NULL,
	.resume_noirq	= NULL,
	.freeze_noirq	= vmbus_suspend,
	.thaw_noirq	= vmbus_resume,
	.poweroff_noirq	= vmbus_suspend,
	.restore_noirq	= vmbus_resume,
};

/* The one and only one */
static const struct bus_type  hv_bus = {
	.name =		"vmbus",
	.match =		vmbus_match,
	.shutdown =		vmbus_shutdown,
	.remove =		vmbus_remove,
	.probe =		vmbus_probe,
	.uevent =		vmbus_uevent,
	.dma_configure =	vmbus_dma_configure,
	.dev_groups =		vmbus_dev_groups,
	.drv_groups =		vmbus_drv_groups,
	.bus_groups =		vmbus_bus_groups,
	.pm =			&vmbus_pm,
};
1001 
/*
 * Deferred-work wrapper around a copied host message: the header plus a
 * variable-length payload (flexible array member), sized at allocation
 * time by the actual payload length.
 */
struct onmessage_work_context {
	struct work_struct work;
	struct {
		struct hv_message_header header;
		u8 payload[];
	} msg;
};

/*
 * Workqueue handler for "blocking" channel messages queued by
 * vmbus_on_msg_dpc(). Frees the context after dispatching.
 */
static void vmbus_onmessage_work(struct work_struct *work)
{
	struct onmessage_work_context *ctx;

	/* Do not process messages if we're in DISCONNECTED state */
	if (vmbus_connection.conn_state == DISCONNECTED)
		return;

	ctx = container_of(work, struct onmessage_work_context,
			   work);
	vmbus_onmessage((struct vmbus_channel_message_header *)
			&ctx->msg.payload);
	kfree(ctx);
}
1024 
/*
 * vmbus_on_msg_dpc - tasklet handler for VMBus control-plane messages.
 * @data: this CPU's struct hv_per_cpu_context, cast to unsigned long.
 *
 * Copies the pending message out of the host-shared SynIC message page,
 * validates the host-supplied fields, and either runs the handler inline
 * or defers it to a workqueue (for blocking handlers).  End-of-message is
 * signaled to the hypervisor via vmbus_signal_eom() when handling is done.
 */
void vmbus_on_msg_dpc(unsigned long data)
{
	struct hv_per_cpu_context *hv_cpu = (void *)data;
	void *page_addr = hv_cpu->synic_message_page;
	struct hv_message msg_copy, *msg = (struct hv_message *)page_addr +
				  VMBUS_MESSAGE_SINT;
	struct vmbus_channel_message_header *hdr;
	enum vmbus_channel_message_type msgtype;
	const struct vmbus_channel_message_table_entry *entry;
	struct onmessage_work_context *ctx;
	__u8 payload_size;
	u32 message_type;

	/*
	 * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
	 * it is being used in 'struct vmbus_channel_message_header' definition
	 * which is supposed to match hypervisor ABI.
	 */
	BUILD_BUG_ON(sizeof(enum vmbus_channel_message_type) != sizeof(u32));

	/*
	 * Since the message is in memory shared with the host, an erroneous or
	 * malicious Hyper-V could modify the message while vmbus_on_msg_dpc()
	 * or individual message handlers are executing; to prevent this, copy
	 * the message into private memory.
	 */
	memcpy(&msg_copy, msg, sizeof(struct hv_message));

	message_type = msg_copy.header.message_type;
	if (message_type == HVMSG_NONE)
		/* no msg */
		return;

	hdr = (struct vmbus_channel_message_header *)msg_copy.u.payload;
	msgtype = hdr->msgtype;

	trace_vmbus_on_msg_dpc(hdr);

	/*
	 * Validate the host-supplied msgtype and payload_size before they are
	 * used as a table index and a copy length below.
	 */
	if (msgtype >= CHANNELMSG_COUNT) {
		WARN_ONCE(1, "unknown msgtype=%d\n", msgtype);
		goto msg_handled;
	}

	payload_size = msg_copy.header.payload_size;
	if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) {
		WARN_ONCE(1, "payload size is too large (%d)\n", payload_size);
		goto msg_handled;
	}

	entry = &channel_message_table[msgtype];

	if (!entry->message_handler)
		goto msg_handled;

	if (payload_size < entry->min_payload_len) {
		WARN_ONCE(1, "message too short: msgtype=%d len=%d\n", msgtype, payload_size);
		goto msg_handled;
	}

	if (entry->handler_type	== VMHT_BLOCKING) {
		/* Blocking handlers run from a workqueue; copy the message into ctx. */
		ctx = kmalloc(struct_size(ctx, msg.payload, payload_size), GFP_ATOMIC);
		if (ctx == NULL)
			return;

		INIT_WORK(&ctx->work, vmbus_onmessage_work);
		ctx->msg.header = msg_copy.header;
		memcpy(&ctx->msg.payload, msg_copy.u.payload, payload_size);

		/*
		 * The host can generate a rescind message while we
		 * may still be handling the original offer. We deal with
		 * this condition by relying on the synchronization provided
		 * by offer_in_progress and by channel_mutex.  See also the
		 * inline comments in vmbus_onoffer_rescind().
		 */
		switch (msgtype) {
		case CHANNELMSG_RESCIND_CHANNELOFFER:
			/*
			 * If we are handling the rescind message;
			 * schedule the work on the global work queue.
			 *
			 * The OFFER message and the RESCIND message should
			 * not be handled by the same serialized work queue,
			 * because the OFFER handler may call vmbus_open(),
			 * which tries to open the channel by sending an
			 * OPEN_CHANNEL message to the host and waits for
			 * the host's response; however, if the host has
			 * rescinded the channel before it receives the
			 * OPEN_CHANNEL message, the host just silently
			 * ignores the OPEN_CHANNEL message; as a result,
			 * the guest's OFFER handler hangs for ever, if we
			 * handle the RESCIND message in the same serialized
			 * work queue: the RESCIND handler can not start to
			 * run before the OFFER handler finishes.
			 */
			if (vmbus_connection.ignore_any_offer_msg)
				break;
			queue_work(vmbus_connection.rescind_work_queue, &ctx->work);
			break;

		case CHANNELMSG_OFFERCHANNEL:
			/*
			 * The host sends the offer message of a given channel
			 * before sending the rescind message of the same
			 * channel.  These messages are sent to the guest's
			 * connect CPU; the guest then starts processing them
			 * in the tasklet handler on this CPU:
			 *
			 * VMBUS_CONNECT_CPU
			 *
			 * [vmbus_on_msg_dpc()]
			 * atomic_inc()  // CHANNELMSG_OFFERCHANNEL
			 * queue_work()
			 * ...
			 * [vmbus_on_msg_dpc()]
			 * schedule_work()  // CHANNELMSG_RESCIND_CHANNELOFFER
			 *
			 * We rely on the memory-ordering properties of the
			 * queue_work() and schedule_work() primitives, which
			 * guarantee that the atomic increment will be visible
			 * to the CPUs which will execute the offer & rescind
			 * works by the time these works will start execution.
			 */
			if (vmbus_connection.ignore_any_offer_msg)
				break;
			atomic_inc(&vmbus_connection.offer_in_progress);
			fallthrough;

		default:
			queue_work(vmbus_connection.work_queue, &ctx->work);
		}
	} else
		/* Non-blocking handlers run directly in tasklet context. */
		entry->message_handler(hdr);

msg_handled:
	vmbus_signal_eom(msg, message_type);
}
1162 
1163 #ifdef CONFIG_PM_SLEEP
1164 /*
1165  * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for
1166  * hibernation, because hv_sock connections can not persist across hibernation.
1167  */
1168 static void vmbus_force_channel_rescinded(struct vmbus_channel *channel)
1169 {
1170 	struct onmessage_work_context *ctx;
1171 	struct vmbus_channel_rescind_offer *rescind;
1172 
1173 	WARN_ON(!is_hvsock_channel(channel));
1174 
1175 	/*
1176 	 * Allocation size is small and the allocation should really not fail,
1177 	 * otherwise the state of the hv_sock connections ends up in limbo.
1178 	 */
1179 	ctx = kzalloc(sizeof(*ctx) + sizeof(*rescind),
1180 		      GFP_KERNEL | __GFP_NOFAIL);
1181 
1182 	/*
1183 	 * So far, these are not really used by Linux. Just set them to the
1184 	 * reasonable values conforming to the definitions of the fields.
1185 	 */
1186 	ctx->msg.header.message_type = 1;
1187 	ctx->msg.header.payload_size = sizeof(*rescind);
1188 
1189 	/* These values are actually used by Linux. */
1190 	rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.payload;
1191 	rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER;
1192 	rescind->child_relid = channel->offermsg.child_relid;
1193 
1194 	INIT_WORK(&ctx->work, vmbus_onmessage_work);
1195 
1196 	queue_work(vmbus_connection.work_queue, &ctx->work);
1197 }
1198 #endif /* CONFIG_PM_SLEEP */
1199 
1200 /*
1201  * Schedule all channels with events pending
1202  */
static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
{
	unsigned long *recv_int_page;
	u32 maxbits, relid;

	/*
	 * The event page can be directly checked to get the id of
	 * the channel that has the interrupt pending.
	 */
	void *page_addr = hv_cpu->synic_event_page;
	union hv_synic_event_flags *event
		= (union hv_synic_event_flags *)page_addr +
					 VMBUS_MESSAGE_SINT;

	maxbits = HV_EVENT_FLAGS_COUNT;
	recv_int_page = event->flags;

	if (unlikely(!recv_int_page))
		return;

	for_each_set_bit(relid, recv_int_page, maxbits) {
		void (*callback_fn)(void *context);
		struct vmbus_channel *channel;

		/* The event page is shared with the host: claim the bit atomically. */
		if (!sync_test_and_clear_bit(relid, recv_int_page))
			continue;

		/* Special case - vmbus channel protocol msg */
		if (relid == 0)
			continue;

		/*
		 * Pairs with the kfree_rcu() in vmbus_chan_release().
		 * Guarantees that the channel data structure doesn't
		 * get freed while the channel pointer below is being
		 * dereferenced.
		 */
		rcu_read_lock();

		/* Find channel based on relid */
		channel = relid2channel(relid);
		if (channel == NULL)
			goto sched_unlock_rcu;

		/* Rescinded channels no longer take interrupts. */
		if (channel->rescind)
			goto sched_unlock_rcu;

		/*
		 * Make sure that the ring buffer data structure doesn't get
		 * freed while we dereference the ring buffer pointer.  Test
		 * for the channel's onchannel_callback being NULL within a
		 * sched_lock critical section.  See also the inline comments
		 * in vmbus_reset_channel_cb().
		 */
		spin_lock(&channel->sched_lock);

		callback_fn = channel->onchannel_callback;
		if (unlikely(callback_fn == NULL))
			goto sched_unlock;

		trace_vmbus_chan_sched(channel);

		++channel->interrupts;

		/* Run the callback inline or via the channel's tasklet. */
		switch (channel->callback_mode) {
		case HV_CALL_ISR:
			(*callback_fn)(channel->channel_callback_context);
			break;

		case HV_CALL_BATCHED:
			hv_begin_read(&channel->inbound);
			fallthrough;
		case HV_CALL_DIRECT:
			tasklet_schedule(&channel->callback_event);
		}

sched_unlock:
		spin_unlock(&channel->sched_lock);
sched_unlock_rcu:
		rcu_read_unlock();
	}
}
1285 
/*
 * vmbus_isr - common VMBus interrupt body, run on the interrupted CPU.
 *
 * First schedules callbacks for channels with pending events, then inspects
 * the SynIC message page: timer messages are handled inline, anything else
 * is deferred to the per-CPU message tasklet (vmbus_on_msg_dpc).
 */
static void vmbus_isr(void)
{
	struct hv_per_cpu_context *hv_cpu
		= this_cpu_ptr(hv_context.cpu_context);
	void *page_addr;
	struct hv_message *msg;

	vmbus_chan_sched(hv_cpu);

	page_addr = hv_cpu->synic_message_page;
	msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;

	/* Check if there are actual msgs to be processed */
	if (msg->header.message_type != HVMSG_NONE) {
		if (msg->header.message_type == HVMSG_TIMER_EXPIRED) {
			hv_stimer0_isr();
			vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED);
		} else
			tasklet_schedule(&hv_cpu->msg_dpc);
	}

	/* Feed the interrupt into the kernel's entropy pool. */
	add_interrupt_randomness(vmbus_interrupt);
}
1309 
/*
 * Per-cpu IRQ handler, used when VMBus interrupts are delivered as a Linux
 * per-cpu IRQ (vmbus_irq != -1); irq and dev_id are unused.
 */
static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
{
	vmbus_isr();
	return IRQ_HANDLED;
}
1315 
/* Work item body: initialize the SynIC on whichever CPU this work runs on. */
static void vmbus_percpu_work(struct work_struct *work)
{
	hv_synic_init(smp_processor_id());
}
1322 
1323 /*
1324  * vmbus_bus_init -Main vmbus driver initialization routine.
1325  *
1326  * Here, we
1327  *	- initialize the vmbus driver context
1328  *	- invoke the vmbus hv main init routine
1329  *	- retrieve the channel offers
1330  */
static int vmbus_bus_init(void)
{
	int ret, cpu;
	struct work_struct __percpu *works;

	ret = hv_init();
	if (ret != 0) {
		pr_err("Unable to initialize the hypervisor - 0x%x\n", ret);
		return ret;
	}

	ret = bus_register(&hv_bus);
	if (ret)
		return ret;

	/*
	 * VMbus interrupts are best modeled as per-cpu interrupts. If
	 * on an architecture with support for per-cpu IRQs (e.g. ARM64),
	 * allocate a per-cpu IRQ using standard Linux kernel functionality.
	 * If not on such an architecture (e.g., x86/x64), then rely on
	 * code in the arch-specific portion of the code tree to connect
	 * the VMbus interrupt handler.
	 */

	if (vmbus_irq == -1) {
		hv_setup_vmbus_handler(vmbus_isr);
	} else {
		vmbus_evt = alloc_percpu(long);
		ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr,
				"Hyper-V VMbus", vmbus_evt);
		if (ret) {
			pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d",
					vmbus_irq, ret);
			free_percpu(vmbus_evt);
			goto err_setup;
		}
	}

	ret = hv_synic_alloc();
	if (ret)
		goto err_alloc;

	/* Per-CPU work items used to run vmbus_percpu_work() on each online CPU. */
	works = alloc_percpu(struct work_struct);
	if (!works) {
		ret = -ENOMEM;
		goto err_alloc;
	}

	/*
	 * Initialize the per-cpu interrupt state and stimer state.
	 * Then connect to the host.
	 */
	cpus_read_lock();
	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(works, cpu);

		INIT_WORK(work, vmbus_percpu_work);
		schedule_work_on(cpu, work);
	}

	/* Wait until every online CPU has run its init work. */
	for_each_online_cpu(cpu)
		flush_work(per_cpu_ptr(works, cpu));

	/* Register the callbacks for possible CPU online/offline'ing */
	ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
						   hv_synic_init, hv_synic_cleanup);
	cpus_read_unlock();
	free_percpu(works);
	if (ret < 0)
		goto err_alloc;
	hyperv_cpuhp_online = ret;

	ret = vmbus_connect();
	if (ret)
		goto err_connect;

	/*
	 * Always register the vmbus unload panic notifier because we
	 * need to shut the VMbus channel connection on panic.
	 */
	atomic_notifier_chain_register(&panic_notifier_list,
			       &hyperv_panic_vmbus_unload_block);

	vmbus_request_offers();

	return 0;

err_connect:
	cpuhp_remove_state(hyperv_cpuhp_online);
err_alloc:
	hv_synic_free();
	if (vmbus_irq == -1) {
		hv_remove_vmbus_handler();
	} else {
		free_percpu_irq(vmbus_irq, vmbus_evt);
		free_percpu(vmbus_evt);
	}
err_setup:
	bus_unregister(&hv_bus);
	return ret;
}
1432 
1433 /**
1434  * __vmbus_driver_register() - Register a vmbus's driver
1435  * @hv_driver: Pointer to driver structure you want to register
1436  * @owner: owner module of the drv
1437  * @mod_name: module name string
1438  *
1439  * Registers the given driver with Linux through the 'driver_register()' call
1440  * and sets up the hyper-v vmbus handling for this driver.
1441  * It will return the state of the 'driver_register()' call.
1442  *
1443  */
1444 int __vmbus_driver_register(struct hv_driver *hv_driver, struct module *owner, const char *mod_name)
1445 {
1446 	int ret;
1447 
1448 	pr_info("registering driver %s\n", hv_driver->name);
1449 
1450 	ret = vmbus_exists();
1451 	if (ret < 0)
1452 		return ret;
1453 
1454 	hv_driver->driver.name = hv_driver->name;
1455 	hv_driver->driver.owner = owner;
1456 	hv_driver->driver.mod_name = mod_name;
1457 	hv_driver->driver.bus = &hv_bus;
1458 
1459 	spin_lock_init(&hv_driver->dynids.lock);
1460 	INIT_LIST_HEAD(&hv_driver->dynids.list);
1461 
1462 	ret = driver_register(&hv_driver->driver);
1463 
1464 	return ret;
1465 }
1466 EXPORT_SYMBOL_GPL(__vmbus_driver_register);
1467 
1468 /**
1469  * vmbus_driver_unregister() - Unregister a vmbus's driver
1470  * @hv_driver: Pointer to driver structure you want to
1471  *             un-register
1472  *
1473  * Un-register the given driver that was previous registered with a call to
1474  * vmbus_driver_register()
1475  */
void vmbus_driver_unregister(struct hv_driver *hv_driver)
{
	pr_info("unregistering driver %s\n", hv_driver->name);

	/* vmbus_exists() returns 0 when the VMBus root device is present. */
	if (!vmbus_exists()) {
		driver_unregister(&hv_driver->driver);
		vmbus_free_dynids(hv_driver);
	}
}
EXPORT_SYMBOL_GPL(vmbus_driver_unregister);
1486 
1487 
1488 /*
1489  * Called when last reference to channel is gone.
1490  */
static void vmbus_chan_release(struct kobject *kobj)
{
	struct vmbus_channel *channel
		= container_of(kobj, struct vmbus_channel, kobj);

	/* Defer the free past an RCU grace period; pairs with the
	 * rcu_read_lock() in vmbus_chan_sched(). */
	kfree_rcu(channel, rcu);
}
1498 
/*
 * Per-channel sysfs attribute: like struct device_attribute, but the
 * show/store callbacks are handed the vmbus_channel directly.
 */
struct vmbus_chan_attribute {
	struct attribute attr;
	ssize_t (*show)(struct vmbus_channel *chan, char *buf);
	ssize_t (*store)(struct vmbus_channel *chan,
			 const char *buf, size_t count);
};
/* Declaration helpers mirroring DEVICE_ATTR{,_RW,_RO,_WO}. */
#define VMBUS_CHAN_ATTR(_name, _mode, _show, _store) \
	struct vmbus_chan_attribute chan_attr_##_name \
		= __ATTR(_name, _mode, _show, _store)
#define VMBUS_CHAN_ATTR_RW(_name) \
	struct vmbus_chan_attribute chan_attr_##_name = __ATTR_RW(_name)
#define VMBUS_CHAN_ATTR_RO(_name) \
	struct vmbus_chan_attribute chan_attr_##_name = __ATTR_RO(_name)
#define VMBUS_CHAN_ATTR_WO(_name) \
	struct vmbus_chan_attribute chan_attr_##_name = __ATTR_WO(_name)
1514 
1515 static ssize_t vmbus_chan_attr_show(struct kobject *kobj,
1516 				    struct attribute *attr, char *buf)
1517 {
1518 	const struct vmbus_chan_attribute *attribute
1519 		= container_of(attr, struct vmbus_chan_attribute, attr);
1520 	struct vmbus_channel *chan
1521 		= container_of(kobj, struct vmbus_channel, kobj);
1522 
1523 	if (!attribute->show)
1524 		return -EIO;
1525 
1526 	return attribute->show(chan, buf);
1527 }
1528 
1529 static ssize_t vmbus_chan_attr_store(struct kobject *kobj,
1530 				     struct attribute *attr, const char *buf,
1531 				     size_t count)
1532 {
1533 	const struct vmbus_chan_attribute *attribute
1534 		= container_of(attr, struct vmbus_chan_attribute, attr);
1535 	struct vmbus_channel *chan
1536 		= container_of(kobj, struct vmbus_channel, kobj);
1537 
1538 	if (!attribute->store)
1539 		return -EIO;
1540 
1541 	return attribute->store(chan, buf, count);
1542 }
1543 
/* sysfs_ops wired into vmbus_chan_ktype for the per-channel kobjects. */
static const struct sysfs_ops vmbus_chan_sysfs_ops = {
	.show = vmbus_chan_attr_show,
	.store = vmbus_chan_attr_store,
};
1548 
1549 static ssize_t out_mask_show(struct vmbus_channel *channel, char *buf)
1550 {
1551 	struct hv_ring_buffer_info *rbi = &channel->outbound;
1552 	ssize_t ret;
1553 
1554 	mutex_lock(&rbi->ring_buffer_mutex);
1555 	if (!rbi->ring_buffer) {
1556 		mutex_unlock(&rbi->ring_buffer_mutex);
1557 		return -EINVAL;
1558 	}
1559 
1560 	ret = sprintf(buf, "%u\n", rbi->ring_buffer->interrupt_mask);
1561 	mutex_unlock(&rbi->ring_buffer_mutex);
1562 	return ret;
1563 }
1564 static VMBUS_CHAN_ATTR_RO(out_mask);
1565 
1566 static ssize_t in_mask_show(struct vmbus_channel *channel, char *buf)
1567 {
1568 	struct hv_ring_buffer_info *rbi = &channel->inbound;
1569 	ssize_t ret;
1570 
1571 	mutex_lock(&rbi->ring_buffer_mutex);
1572 	if (!rbi->ring_buffer) {
1573 		mutex_unlock(&rbi->ring_buffer_mutex);
1574 		return -EINVAL;
1575 	}
1576 
1577 	ret = sprintf(buf, "%u\n", rbi->ring_buffer->interrupt_mask);
1578 	mutex_unlock(&rbi->ring_buffer_mutex);
1579 	return ret;
1580 }
1581 static VMBUS_CHAN_ATTR_RO(in_mask);
1582 
1583 static ssize_t read_avail_show(struct vmbus_channel *channel, char *buf)
1584 {
1585 	struct hv_ring_buffer_info *rbi = &channel->inbound;
1586 	ssize_t ret;
1587 
1588 	mutex_lock(&rbi->ring_buffer_mutex);
1589 	if (!rbi->ring_buffer) {
1590 		mutex_unlock(&rbi->ring_buffer_mutex);
1591 		return -EINVAL;
1592 	}
1593 
1594 	ret = sprintf(buf, "%u\n", hv_get_bytes_to_read(rbi));
1595 	mutex_unlock(&rbi->ring_buffer_mutex);
1596 	return ret;
1597 }
1598 static VMBUS_CHAN_ATTR_RO(read_avail);
1599 
1600 static ssize_t write_avail_show(struct vmbus_channel *channel, char *buf)
1601 {
1602 	struct hv_ring_buffer_info *rbi = &channel->outbound;
1603 	ssize_t ret;
1604 
1605 	mutex_lock(&rbi->ring_buffer_mutex);
1606 	if (!rbi->ring_buffer) {
1607 		mutex_unlock(&rbi->ring_buffer_mutex);
1608 		return -EINVAL;
1609 	}
1610 
1611 	ret = sprintf(buf, "%u\n", hv_get_bytes_to_write(rbi));
1612 	mutex_unlock(&rbi->ring_buffer_mutex);
1613 	return ret;
1614 }
1615 static VMBUS_CHAN_ATTR_RO(write_avail);
1616 
1617 static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf)
1618 {
1619 	return sprintf(buf, "%u\n", channel->target_cpu);
1620 }
1621 
/*
 * vmbus_channel_set_cpu - re-target a channel's interrupts to another CPU.
 * @channel: channel to modify; must be in CHANNEL_OPENED_STATE.
 * @target_cpu: an online, housekeeping (HK_TYPE_MANAGED_IRQ) CPU.
 *
 * Caller must hold cpus_read_lock() and vmbus_connection.channel_mutex
 * (both asserted below).  Requires host protocol >= VERSION_WIN10_V4_1.
 *
 * Returns 0 on success, -EIO or -EINVAL on failure.
 */
int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu)
{
	u32 origin_cpu;
	int ret = 0;

	lockdep_assert_cpus_held();
	lockdep_assert_held(&vmbus_connection.channel_mutex);

	if (vmbus_proto_version < VERSION_WIN10_V4_1)
		return -EIO;

	/* Validate target_cpu for the cpumask_test_cpu() operation below. */
	if (target_cpu >= nr_cpumask_bits)
		return -EINVAL;

	if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
		return -EINVAL;

	if (!cpu_online(target_cpu))
		return -EINVAL;

	/*
	 * Synchronizes vmbus_channel_set_cpu() and channel closure:
	 *
	 * { Initially: state = CHANNEL_OPENED }
	 *
	 * CPU1				CPU2
	 *
	 * [vmbus_channel_set_cpu()]	[vmbus_disconnect_ring()]
	 *
	 * LOCK channel_mutex		LOCK channel_mutex
	 * LOAD r1 = state		LOAD r2 = state
	 * IF (r1 == CHANNEL_OPENED)	IF (r2 == CHANNEL_OPENED)
	 *   SEND MODIFYCHANNEL		  STORE state = CHANNEL_OPEN
	 *   [...]			  SEND CLOSECHANNEL
	 * UNLOCK channel_mutex		UNLOCK channel_mutex
	 *
	 * Forbids: r1 == r2 == CHANNEL_OPENED (i.e., CPU1's LOCK precedes
	 * 		CPU2's LOCK) && CPU2's SEND precedes CPU1's SEND
	 *
	 * Note.  The host processes the channel messages "sequentially", in
	 * the order in which they are received on a per-partition basis.
	 */

	/*
	 * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels;
	 * avoid sending the message and fail here for such channels.
	 */
	if (channel->state != CHANNEL_OPENED_STATE) {
		ret = -EIO;
		goto end;
	}

	origin_cpu = channel->target_cpu;
	if (target_cpu == origin_cpu)
		goto end;

	if (vmbus_send_modifychannel(channel,
				     hv_cpu_number_to_vp_number(target_cpu))) {
		ret = -EIO;
		goto end;
	}

	/*
	 * For version before VERSION_WIN10_V5_3, the following warning holds:
	 *
	 * Warning.  At this point, there is *no* guarantee that the host will
	 * have successfully processed the vmbus_send_modifychannel() request.
	 * See the header comment of vmbus_send_modifychannel() for more info.
	 *
	 * Lags in the processing of the above vmbus_send_modifychannel() can
	 * result in missed interrupts if the "old" target CPU is taken offline
	 * before Hyper-V starts sending interrupts to the "new" target CPU.
	 * But apart from this offlining scenario, the code tolerates such
	 * lags.  It will function correctly even if a channel interrupt comes
	 * in on a CPU that is different from the channel target_cpu value.
	 */

	channel->target_cpu = target_cpu;

	/* See init_vp_index(). */
	if (hv_is_perf_channel(channel))
		hv_update_allocated_cpus(origin_cpu, target_cpu);

	/* Currently set only for storvsc channels. */
	if (channel->change_target_cpu_callback) {
		(*channel->change_target_cpu_callback)(channel,
				origin_cpu, target_cpu);
	}

end:
	return ret;
}
1715 
1716 static ssize_t target_cpu_store(struct vmbus_channel *channel,
1717 				const char *buf, size_t count)
1718 {
1719 	u32 target_cpu;
1720 	ssize_t ret;
1721 
1722 	if (sscanf(buf, "%uu", &target_cpu) != 1)
1723 		return -EIO;
1724 
1725 	cpus_read_lock();
1726 	mutex_lock(&vmbus_connection.channel_mutex);
1727 	ret = vmbus_channel_set_cpu(channel, target_cpu);
1728 	mutex_unlock(&vmbus_connection.channel_mutex);
1729 	cpus_read_unlock();
1730 
1731 	return ret ?: count;
1732 }
1733 static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store);
1734 
1735 static ssize_t channel_pending_show(struct vmbus_channel *channel,
1736 				    char *buf)
1737 {
1738 	return sprintf(buf, "%d\n",
1739 		       channel_pending(channel,
1740 				       vmbus_connection.monitor_pages[1]));
1741 }
1742 static VMBUS_CHAN_ATTR(pending, 0444, channel_pending_show, NULL);
1743 
1744 static ssize_t channel_latency_show(struct vmbus_channel *channel,
1745 				    char *buf)
1746 {
1747 	return sprintf(buf, "%d\n",
1748 		       channel_latency(channel,
1749 				       vmbus_connection.monitor_pages[1]));
1750 }
1751 static VMBUS_CHAN_ATTR(latency, 0444, channel_latency_show, NULL);
1752 
1753 static ssize_t channel_interrupts_show(struct vmbus_channel *channel, char *buf)
1754 {
1755 	return sprintf(buf, "%llu\n", channel->interrupts);
1756 }
1757 static VMBUS_CHAN_ATTR(interrupts, 0444, channel_interrupts_show, NULL);
1758 
1759 static ssize_t channel_events_show(struct vmbus_channel *channel, char *buf)
1760 {
1761 	return sprintf(buf, "%llu\n", channel->sig_events);
1762 }
1763 static VMBUS_CHAN_ATTR(events, 0444, channel_events_show, NULL);
1764 
1765 static ssize_t channel_intr_in_full_show(struct vmbus_channel *channel,
1766 					 char *buf)
1767 {
1768 	return sprintf(buf, "%llu\n",
1769 		       (unsigned long long)channel->intr_in_full);
1770 }
1771 static VMBUS_CHAN_ATTR(intr_in_full, 0444, channel_intr_in_full_show, NULL);
1772 
1773 static ssize_t channel_intr_out_empty_show(struct vmbus_channel *channel,
1774 					   char *buf)
1775 {
1776 	return sprintf(buf, "%llu\n",
1777 		       (unsigned long long)channel->intr_out_empty);
1778 }
1779 static VMBUS_CHAN_ATTR(intr_out_empty, 0444, channel_intr_out_empty_show, NULL);
1780 
1781 static ssize_t channel_out_full_first_show(struct vmbus_channel *channel,
1782 					   char *buf)
1783 {
1784 	return sprintf(buf, "%llu\n",
1785 		       (unsigned long long)channel->out_full_first);
1786 }
1787 static VMBUS_CHAN_ATTR(out_full_first, 0444, channel_out_full_first_show, NULL);
1788 
1789 static ssize_t channel_out_full_total_show(struct vmbus_channel *channel,
1790 					   char *buf)
1791 {
1792 	return sprintf(buf, "%llu\n",
1793 		       (unsigned long long)channel->out_full_total);
1794 }
1795 static VMBUS_CHAN_ATTR(out_full_total, 0444, channel_out_full_total_show, NULL);
1796 
1797 static ssize_t subchannel_monitor_id_show(struct vmbus_channel *channel,
1798 					  char *buf)
1799 {
1800 	return sprintf(buf, "%u\n", channel->offermsg.monitorid);
1801 }
1802 static VMBUS_CHAN_ATTR(monitor_id, 0444, subchannel_monitor_id_show, NULL);
1803 
1804 static ssize_t subchannel_id_show(struct vmbus_channel *channel,
1805 				  char *buf)
1806 {
1807 	return sprintf(buf, "%u\n",
1808 		       channel->offermsg.offer.sub_channel_index);
1809 }
1810 static VMBUS_CHAN_ATTR_RO(subchannel_id);
1811 
1812 static int hv_mmap_ring_buffer_wrapper(struct file *filp, struct kobject *kobj,
1813 				       const struct bin_attribute *attr,
1814 				       struct vm_area_struct *vma)
1815 {
1816 	struct vmbus_channel *channel = container_of(kobj, struct vmbus_channel, kobj);
1817 
1818 	/*
1819 	 * hv_(create|remove)_ring_sysfs implementation ensures that mmap_ring_buffer
1820 	 * is not NULL.
1821 	 */
1822 	return channel->mmap_ring_buffer(channel, vma);
1823 }
1824 
/*
 * Binary "ring" attribute; its visibility is gated by ring_sysfs_visible
 * (see vmbus_chan_bin_attr_is_visible / hv_create_ring_sysfs).
 */
static struct bin_attribute chan_attr_ring_buffer = {
	.attr = {
		.name = "ring",
		.mode = 0600,
	},
	.mmap = hv_mmap_ring_buffer_wrapper,
};
/* Text attributes exposed under each channel's sysfs directory. */
static struct attribute *vmbus_chan_attrs[] = {
	&chan_attr_out_mask.attr,
	&chan_attr_in_mask.attr,
	&chan_attr_read_avail.attr,
	&chan_attr_write_avail.attr,
	&chan_attr_cpu.attr,
	&chan_attr_pending.attr,
	&chan_attr_latency.attr,
	&chan_attr_interrupts.attr,
	&chan_attr_events.attr,
	&chan_attr_intr_in_full.attr,
	&chan_attr_intr_out_empty.attr,
	&chan_attr_out_full_first.attr,
	&chan_attr_out_full_total.attr,
	&chan_attr_monitor_id.attr,
	&chan_attr_subchannel_id.attr,
	NULL
};
1850 
/* Binary attributes (currently only the "ring" node) for each channel. */
static struct bin_attribute *vmbus_chan_bin_attrs[] = {
	&chan_attr_ring_buffer,
	NULL
};
1855 
1856 /*
1857  * Channel-level attribute_group callback function. Returns the permission for
1858  * each attribute, and returns 0 if an attribute is not visible.
1859  */
1860 static umode_t vmbus_chan_attr_is_visible(struct kobject *kobj,
1861 					  struct attribute *attr, int idx)
1862 {
1863 	const struct vmbus_channel *channel =
1864 		container_of(kobj, struct vmbus_channel, kobj);
1865 
1866 	/* Hide the monitor attributes if the monitor mechanism is not used. */
1867 	if (!channel->offermsg.monitor_allocated &&
1868 	    (attr == &chan_attr_pending.attr ||
1869 	     attr == &chan_attr_latency.attr ||
1870 	     attr == &chan_attr_monitor_id.attr))
1871 		return 0;
1872 
1873 	return attr->mode;
1874 }
1875 
1876 static umode_t vmbus_chan_bin_attr_is_visible(struct kobject *kobj,
1877 					      const struct bin_attribute *attr, int idx)
1878 {
1879 	const struct vmbus_channel *channel =
1880 		container_of(kobj, struct vmbus_channel, kobj);
1881 
1882 	/* Hide ring attribute if channel's ring_sysfs_visible is set to false */
1883 	if (attr ==  &chan_attr_ring_buffer && !channel->ring_sysfs_visible)
1884 		return 0;
1885 
1886 	return attr->attr.mode;
1887 }
1888 
1889 static size_t vmbus_chan_bin_size(struct kobject *kobj,
1890 				  const struct bin_attribute *bin_attr, int a)
1891 {
1892 	const struct vmbus_channel *channel =
1893 		container_of(kobj, struct vmbus_channel, kobj);
1894 
1895 	return channel->ringbuffer_pagecount << PAGE_SHIFT;
1896 }
1897 
/* Attribute group installed on every channel kobject (see vmbus_add_channel_kobj). */
static const struct attribute_group vmbus_chan_group = {
	.attrs = vmbus_chan_attrs,
	.bin_attrs = vmbus_chan_bin_attrs,
	.is_visible = vmbus_chan_attr_is_visible,
	.is_bin_visible = vmbus_chan_bin_attr_is_visible,
	.bin_size = vmbus_chan_bin_size,
};
1905 
/* kobj_type for channel kobjects; release defers the free via kfree_rcu(). */
static const struct kobj_type vmbus_chan_ktype = {
	.sysfs_ops = &vmbus_chan_sysfs_ops,
	.release = vmbus_chan_release,
};
1910 
1911 /**
1912  * hv_create_ring_sysfs() - create "ring" sysfs entry corresponding to ring buffers for a channel.
1913  * @channel: Pointer to vmbus_channel structure
1914  * @hv_mmap_ring_buffer: function pointer for initializing the function to be called on mmap of
1915  *                       channel's "ring" sysfs node, which is for the ring buffer of that channel.
1916  *                       Function pointer is of below type:
1917  *                       int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel,
1918  *                                                  struct vm_area_struct *vma))
1919  *                       This has a pointer to the channel and a pointer to vm_area_struct,
1920  *                       used for mmap, as arguments.
1921  *
1922  * Sysfs node for ring buffer of a channel is created along with other fields, however its
1923  * visibility is disabled by default. Sysfs creation needs to be controlled when the use-case
1924  * is running.
1925  * For example, HV_NIC device is used either by uio_hv_generic or hv_netvsc at any given point of
1926  * time, and "ring" sysfs is needed only when uio_hv_generic is bound to that device. To avoid
1927  * exposing the ring buffer by default, this function is reponsible to enable visibility of
1928  * ring for userspace to use.
1929  * Note: Race conditions can happen with userspace and it is not encouraged to create new
1930  * use-cases for this. This was added to maintain backward compatibility, while solving
1931  * one of the race conditions in uio_hv_generic while creating sysfs.
1932  *
1933  * Returns 0 on success or error code on failure.
1934  */
1935 int hv_create_ring_sysfs(struct vmbus_channel *channel,
1936 			 int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel,
1937 						    struct vm_area_struct *vma))
1938 {
1939 	struct kobject *kobj = &channel->kobj;
1940 
1941 	channel->mmap_ring_buffer = hv_mmap_ring_buffer;
1942 	channel->ring_sysfs_visible = true;
1943 
1944 	return sysfs_update_group(kobj, &vmbus_chan_group);
1945 }
1946 EXPORT_SYMBOL_GPL(hv_create_ring_sysfs);
1947 
1948 /**
1949  * hv_remove_ring_sysfs() - remove ring sysfs entry corresponding to ring buffers for a channel.
1950  * @channel: Pointer to vmbus_channel structure
1951  *
1952  * Hide "ring" sysfs for a channel by changing its is_visible attribute and updating sysfs group.
1953  *
1954  * Returns 0 on success or error code on failure.
1955  */
1956 int hv_remove_ring_sysfs(struct vmbus_channel *channel)
1957 {
1958 	struct kobject *kobj = &channel->kobj;
1959 	int ret;
1960 
1961 	channel->ring_sysfs_visible = false;
1962 	ret = sysfs_update_group(kobj, &vmbus_chan_group);
1963 	channel->mmap_ring_buffer = NULL;
1964 	return ret;
1965 }
1966 EXPORT_SYMBOL_GPL(hv_remove_ring_sysfs);
1967 
1968 /*
1969  * vmbus_add_channel_kobj - setup a sub-directory under device/channels
1970  */
int vmbus_add_channel_kobj(struct hv_device *dev, struct vmbus_channel *channel)
{
	const struct device *device = &dev->device;
	struct kobject *kobj = &channel->kobj;
	u32 relid = channel->offermsg.child_relid;
	int ret;

	/* The directory is named after the channel's relid, under "channels/". */
	kobj->kset = dev->channels_kset;
	ret = kobject_init_and_add(kobj, &vmbus_chan_ktype, NULL,
				   "%u", relid);
	if (ret) {
		/* kobject_put() releases the reference taken by kobject_init(). */
		kobject_put(kobj);
		return ret;
	}

	ret = sysfs_create_group(kobj, &vmbus_chan_group);

	if (ret) {
		/*
		 * The calling functions' error handling paths will cleanup the
		 * empty channel directory.
		 */
		kobject_put(kobj);
		dev_err(device, "Unable to set up channel sysfs files\n");
		return ret;
	}

	kobject_uevent(kobj, KOBJ_ADD);

	return 0;
}
2002 
2003 /*
2004  * vmbus_remove_channel_attr_group - remove the channel's attribute group
2005  */
void vmbus_remove_channel_attr_group(struct vmbus_channel *channel)
{
	/* Counterpart of the sysfs_create_group() in vmbus_add_channel_kobj(). */
	sysfs_remove_group(&channel->kobj, &vmbus_chan_group);
}
2010 
2011 /*
2012  * vmbus_device_create - Creates and registers a new child device
2013  * on the vmbus.
2014  */
2015 struct hv_device *vmbus_device_create(const guid_t *type,
2016 				      const guid_t *instance,
2017 				      struct vmbus_channel *channel)
2018 {
2019 	struct hv_device *child_device_obj;
2020 
2021 	child_device_obj = kzalloc(sizeof(struct hv_device), GFP_KERNEL);
2022 	if (!child_device_obj) {
2023 		pr_err("Unable to allocate device object for child device\n");
2024 		return NULL;
2025 	}
2026 
2027 	child_device_obj->channel = channel;
2028 	guid_copy(&child_device_obj->dev_type, type);
2029 	guid_copy(&child_device_obj->dev_instance, instance);
2030 	child_device_obj->vendor_id = PCI_VENDOR_ID_MICROSOFT;
2031 
2032 	return child_device_obj;
2033 }
2034 
2035 /*
2036  * vmbus_device_register - Register the child device
2037  */
2038 int vmbus_device_register(struct hv_device *child_device_obj)
2039 {
2040 	struct kobject *kobj = &child_device_obj->device.kobj;
2041 	int ret;
2042 
2043 	dev_set_name(&child_device_obj->device, "%pUl",
2044 		     &child_device_obj->channel->offermsg.offer.if_instance);
2045 
2046 	child_device_obj->device.bus = &hv_bus;
2047 	child_device_obj->device.parent = vmbus_root_device;
2048 	child_device_obj->device.release = vmbus_device_release;
2049 
2050 	child_device_obj->device.dma_parms = &child_device_obj->dma_parms;
2051 	child_device_obj->device.dma_mask = &child_device_obj->dma_mask;
2052 	dma_set_mask(&child_device_obj->device, DMA_BIT_MASK(64));
2053 
2054 	/*
2055 	 * Register with the LDM. This will kick off the driver/device
2056 	 * binding...which will eventually call vmbus_match() and vmbus_probe()
2057 	 */
2058 	ret = device_register(&child_device_obj->device);
2059 	if (ret) {
2060 		pr_err("Unable to register child device\n");
2061 		put_device(&child_device_obj->device);
2062 		return ret;
2063 	}
2064 
2065 	child_device_obj->channels_kset = kset_create_and_add("channels",
2066 							      NULL, kobj);
2067 	if (!child_device_obj->channels_kset) {
2068 		ret = -ENOMEM;
2069 		goto err_dev_unregister;
2070 	}
2071 
2072 	ret = vmbus_add_channel_kobj(child_device_obj,
2073 				     child_device_obj->channel);
2074 	if (ret) {
2075 		pr_err("Unable to register primary channeln");
2076 		goto err_kset_unregister;
2077 	}
2078 	hv_debug_add_dev_dir(child_device_obj);
2079 
2080 	return 0;
2081 
2082 err_kset_unregister:
2083 	kset_unregister(child_device_obj->channels_kset);
2084 
2085 err_dev_unregister:
2086 	device_unregister(&child_device_obj->device);
2087 	return ret;
2088 }
2089 
2090 /*
2091  * vmbus_device_unregister - Remove the specified child device
2092  * from the vmbus.
2093  */
2094 void vmbus_device_unregister(struct hv_device *device_obj)
2095 {
2096 	pr_debug("child device %s unregistered\n",
2097 		dev_name(&device_obj->device));
2098 
2099 	kset_unregister(device_obj->channels_kset);
2100 
2101 	/*
2102 	 * Kick off the process of unregistering the device.
2103 	 * This will call vmbus_remove() and eventually vmbus_device_release()
2104 	 */
2105 	device_unregister(&device_obj->device);
2106 }
2107 EXPORT_SYMBOL_GPL(vmbus_device_unregister);
2108 
#ifdef CONFIG_ACPI
/*
 * VMBUS is an acpi enumerated device. Get the information we
 * need from DSDT.
 */
static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
{
	resource_size_t start = 0;
	resource_size_t end = 0;
	struct resource *new_res;
	/* Cursor/trailer pointers into the sorted hyperv_mmio sibling list. */
	struct resource **old_res = &hyperv_mmio;
	struct resource **prev_res = NULL;
	struct resource r;

	switch (res->type) {

	/*
	 * "Address" descriptors are for bus windows. Ignore
	 * "memory" descriptors, which are for registers on
	 * devices.
	 */
	case ACPI_RESOURCE_TYPE_ADDRESS32:
		start = res->data.address32.address.minimum;
		end = res->data.address32.address.maximum;
		break;

	case ACPI_RESOURCE_TYPE_ADDRESS64:
		start = res->data.address64.address.minimum;
		end = res->data.address64.address.maximum;
		break;

	/*
	 * The IRQ information is needed only on ARM64, which Hyper-V
	 * sets up in the extended format. IRQ information is present
	 * on x86/x64 in the non-extended format but it is not used by
	 * Linux. So don't bother checking for the non-extended format.
	 */
	case ACPI_RESOURCE_TYPE_EXTENDED_IRQ:
		if (!acpi_dev_resource_interrupt(res, 0, &r)) {
			pr_err("Unable to parse Hyper-V ACPI interrupt\n");
			return AE_ERROR;
		}
		/* ARM64 INTID for VMbus */
		vmbus_interrupt = res->data.extended_irq.interrupts[0];
		/* Linux IRQ number */
		vmbus_irq = r.start;
		return AE_OK;

	default:
		/* Unused resource type */
		return AE_OK;

	}
	/*
	 * Ignore ranges that are below 1MB, as they're not
	 * necessary or useful here.
	 */
	if (end < 0x100000)
		return AE_OK;

	/*
	 * NOTE(review): GFP_ATOMIC presumably because this ACPI walk
	 * callback must not sleep -- confirm the calling context.
	 */
	new_res = kzalloc(sizeof(*new_res), GFP_ATOMIC);
	if (!new_res)
		return AE_NO_MEMORY;

	/* If this range overlaps the virtual TPM, truncate it. */
	if (end > VTPM_BASE_ADDRESS && start < VTPM_BASE_ADDRESS)
		end = VTPM_BASE_ADDRESS;

	new_res->name = "hyperv mmio";
	new_res->flags = IORESOURCE_MEM;
	new_res->start = start;
	new_res->end = end;

	/*
	 * If two ranges are adjacent, merge them.
	 */
	do {
		/* End of list: append the new range. */
		if (!*old_res) {
			*old_res = new_res;
			break;
		}

		/* New range begins right after the current one: extend up. */
		if (((*old_res)->end + 1) == new_res->start) {
			(*old_res)->end = new_res->end;
			kfree(new_res);
			break;
		}

		/* New range ends right before the current one: extend down. */
		if ((*old_res)->start == new_res->end + 1) {
			(*old_res)->start = new_res->start;
			kfree(new_res);
			break;
		}

		/* Current entry lies beyond the new range: insert before it. */
		if ((*old_res)->start > new_res->end) {
			new_res->sibling = *old_res;
			if (prev_res)
				(*prev_res)->sibling = new_res;
			*old_res = new_res;
			break;
		}

		prev_res = old_res;
		old_res = &(*old_res)->sibling;

	} while (1);

	return AE_OK;
}
#endif
2219 
2220 static void vmbus_mmio_remove(void)
2221 {
2222 	struct resource *cur_res;
2223 	struct resource *next_res;
2224 
2225 	if (hyperv_mmio) {
2226 		if (fb_mmio) {
2227 			__release_region(hyperv_mmio, fb_mmio->start,
2228 					 resource_size(fb_mmio));
2229 			fb_mmio = NULL;
2230 		}
2231 
2232 		for (cur_res = hyperv_mmio; cur_res; cur_res = next_res) {
2233 			next_res = cur_res->sibling;
2234 			kfree(cur_res);
2235 		}
2236 	}
2237 }
2238 
/*
 * vmbus_reserve_fb - claim the firmware framebuffer region inside the
 * hyperv_mmio pool so vmbus_allocate_mmio() does not hand it out except
 * to callers that pass fb_overlap_ok.
 */
static void __maybe_unused vmbus_reserve_fb(void)
{
	resource_size_t start = 0, size;
	struct pci_dev *pdev;

	if (efi_enabled(EFI_BOOT)) {
		/* Gen2 VM: get FB base from EFI framebuffer */
		if (IS_ENABLED(CONFIG_SYSFB)) {
			start = screen_info.lfb_base;
			size = max_t(__u32, screen_info.lfb_size, 0x800000);
		}
	} else {
		/* Gen1 VM: get FB base from PCI */
		pdev = pci_get_device(PCI_VENDOR_ID_MICROSOFT,
				      PCI_DEVICE_ID_HYPERV_VIDEO, NULL);
		if (!pdev)
			return;

		if (pdev->resource[0].flags & IORESOURCE_MEM) {
			start = pci_resource_start(pdev, 0);
			size = pci_resource_len(pdev, 0);
		}

		/*
		 * Release the PCI device so hyperv_drm or hyperv_fb driver can
		 * grab it later.
		 */
		pci_dev_put(pdev);
	}

	/* No framebuffer found; note 'size' is set only when 'start' is. */
	if (!start)
		return;

	/*
	 * Make a claim for the frame buffer in the resource tree under the
	 * first node, which will be the one below 4GB.  The length seems to
	 * be underreported, particularly in a Generation 1 VM.  So start out
	 * reserving a larger area and make it smaller until it succeeds.
	 */
	for (; !fb_mmio && (size >= 0x100000); size >>= 1)
		fb_mmio = __request_region(hyperv_mmio, start, size, fb_mmio_name, 0);
}
2281 
2282 /**
2283  * vmbus_allocate_mmio() - Pick a memory-mapped I/O range.
2284  * @new:		If successful, supplied a pointer to the
2285  *			allocated MMIO space.
2286  * @device_obj:		Identifies the caller
2287  * @min:		Minimum guest physical address of the
2288  *			allocation
2289  * @max:		Maximum guest physical address
2290  * @size:		Size of the range to be allocated
2291  * @align:		Alignment of the range to be allocated
2292  * @fb_overlap_ok:	Whether this allocation can be allowed
2293  *			to overlap the video frame buffer.
2294  *
2295  * This function walks the resources granted to VMBus by the
2296  * _CRS object in the ACPI namespace underneath the parent
2297  * "bridge" whether that's a root PCI bus in the Generation 1
2298  * case or a Module Device in the Generation 2 case.  It then
2299  * attempts to allocate from the global MMIO pool in a way that
2300  * matches the constraints supplied in these parameters and by
2301  * that _CRS.
2302  *
2303  * Return: 0 on success, -errno on failure
2304  */
2305 int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
2306 			resource_size_t min, resource_size_t max,
2307 			resource_size_t size, resource_size_t align,
2308 			bool fb_overlap_ok)
2309 {
2310 	struct resource *iter, *shadow;
2311 	resource_size_t range_min, range_max, start, end;
2312 	const char *dev_n = dev_name(&device_obj->device);
2313 	int retval;
2314 
2315 	retval = -ENXIO;
2316 	mutex_lock(&hyperv_mmio_lock);
2317 
2318 	/*
2319 	 * If overlaps with frame buffers are allowed, then first attempt to
2320 	 * make the allocation from within the reserved region.  Because it
2321 	 * is already reserved, no shadow allocation is necessary.
2322 	 */
2323 	if (fb_overlap_ok && fb_mmio && !(min > fb_mmio->end) &&
2324 	    !(max < fb_mmio->start)) {
2325 
2326 		range_min = fb_mmio->start;
2327 		range_max = fb_mmio->end;
2328 		start = (range_min + align - 1) & ~(align - 1);
2329 		for (; start + size - 1 <= range_max; start += align) {
2330 			*new = request_mem_region_exclusive(start, size, dev_n);
2331 			if (*new) {
2332 				retval = 0;
2333 				goto exit;
2334 			}
2335 		}
2336 	}
2337 
2338 	for (iter = hyperv_mmio; iter; iter = iter->sibling) {
2339 		if ((iter->start >= max) || (iter->end <= min))
2340 			continue;
2341 
2342 		range_min = iter->start;
2343 		range_max = iter->end;
2344 		start = (range_min + align - 1) & ~(align - 1);
2345 		for (; start + size - 1 <= range_max; start += align) {
2346 			end = start + size - 1;
2347 
2348 			/* Skip the whole fb_mmio region if not fb_overlap_ok */
2349 			if (!fb_overlap_ok && fb_mmio &&
2350 			    (((start >= fb_mmio->start) && (start <= fb_mmio->end)) ||
2351 			     ((end >= fb_mmio->start) && (end <= fb_mmio->end))))
2352 				continue;
2353 
2354 			shadow = __request_region(iter, start, size, NULL,
2355 						  IORESOURCE_BUSY);
2356 			if (!shadow)
2357 				continue;
2358 
2359 			*new = request_mem_region_exclusive(start, size, dev_n);
2360 			if (*new) {
2361 				shadow->name = (char *)*new;
2362 				retval = 0;
2363 				goto exit;
2364 			}
2365 
2366 			__release_region(iter, start, size);
2367 		}
2368 	}
2369 
2370 exit:
2371 	mutex_unlock(&hyperv_mmio_lock);
2372 	return retval;
2373 }
2374 EXPORT_SYMBOL_GPL(vmbus_allocate_mmio);
2375 
2376 /**
2377  * vmbus_free_mmio() - Free a memory-mapped I/O range.
2378  * @start:		Base address of region to release.
2379  * @size:		Size of the range to be allocated
2380  *
2381  * This function releases anything requested by
2382  * vmbus_mmio_allocate().
2383  */
2384 void vmbus_free_mmio(resource_size_t start, resource_size_t size)
2385 {
2386 	struct resource *iter;
2387 
2388 	mutex_lock(&hyperv_mmio_lock);
2389 
2390 	/*
2391 	 * If all bytes of the MMIO range to be released are within the
2392 	 * special case fb_mmio shadow region, skip releasing the shadow
2393 	 * region since no corresponding __request_region() was done
2394 	 * in vmbus_allocate_mmio().
2395 	 */
2396 	if (fb_mmio && start >= fb_mmio->start &&
2397 	    (start + size - 1 <= fb_mmio->end))
2398 		goto skip_shadow_release;
2399 
2400 	for (iter = hyperv_mmio; iter; iter = iter->sibling) {
2401 		if ((iter->start >= start + size) || (iter->end <= start))
2402 			continue;
2403 
2404 		__release_region(iter, start, size);
2405 	}
2406 
2407 skip_shadow_release:
2408 	release_mem_region(start, size);
2409 	mutex_unlock(&hyperv_mmio_lock);
2410 
2411 }
2412 EXPORT_SYMBOL_GPL(vmbus_free_mmio);
2413 
2414 #ifdef CONFIG_ACPI
/*
 * vmbus_acpi_add - ACPI probe path: record the VMBus root device, fix up
 * DMA coherence info, and collect IRQ/MMIO resources from _CRS objects.
 */
static int vmbus_acpi_add(struct platform_device *pdev)
{
	acpi_status result;
	int ret_val = -ENODEV;
	struct acpi_device *ancestor;
	struct acpi_device *device = ACPI_COMPANION(&pdev->dev);

	vmbus_root_device = &device->dev;

	/*
	 * Older versions of Hyper-V for ARM64 fail to include the _CCA
	 * method on the top level VMbus device in the DSDT. But devices
	 * are hardware coherent in all current Hyper-V use cases, so fix
	 * up the ACPI device to behave as if _CCA is present and indicates
	 * hardware coherence.
	 */
	ACPI_COMPANION_SET(&device->dev, device);
	if (IS_ENABLED(CONFIG_ACPI_CCA_REQUIRED) &&
	    device_get_dma_attr(&device->dev) == DEV_DMA_NOT_SUPPORTED) {
		pr_info("No ACPI _CCA found; assuming coherent device I/O\n");
		device->flags.cca_seen = true;
		device->flags.coherent_dma = true;
	}

	/* Walk the VMBus device's own _CRS; a failure here is fatal. */
	result = acpi_walk_resources(device->handle, METHOD_NAME__CRS,
					vmbus_walk_resources, NULL);

	if (ACPI_FAILURE(result))
		goto acpi_walk_err;
	/*
	 * Some ancestor of the vmbus acpi device (Gen1 or Gen2
	 * firmware) is the VMOD that has the mmio ranges. Get that.
	 */
	for (ancestor = acpi_dev_parent(device);
	     ancestor && ancestor->handle != ACPI_ROOT_OBJECT;
	     ancestor = acpi_dev_parent(ancestor)) {
		result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS,
					     vmbus_walk_resources, NULL);

		/* Failures while walking ancestors are tolerated; keep going. */
		if (ACPI_FAILURE(result))
			continue;
		if (hyperv_mmio) {
			vmbus_reserve_fb();
			break;
		}
	}
	ret_val = 0;

acpi_walk_err:
	/* On error, free any MMIO ranges collected before the failure. */
	if (ret_val)
		vmbus_mmio_remove();
	return ret_val;
}
2468 #else
/* Stub for !CONFIG_ACPI builds; resources come from DT via vmbus_device_add(). */
static int vmbus_acpi_add(struct platform_device *pdev)
{
	return 0;
}
2473 #endif
2474 
2475 static int vmbus_set_irq(struct platform_device *pdev)
2476 {
2477 	struct irq_data *data;
2478 	int irq;
2479 	irq_hw_number_t hwirq;
2480 
2481 	irq = platform_get_irq(pdev, 0);
2482 	/* platform_get_irq() may not return 0. */
2483 	if (irq < 0)
2484 		return irq;
2485 
2486 	data = irq_get_irq_data(irq);
2487 	if (!data) {
2488 		pr_err("No interrupt data for VMBus virq %d\n", irq);
2489 		return -ENODEV;
2490 	}
2491 	hwirq = irqd_to_hwirq(data);
2492 
2493 	vmbus_irq = irq;
2494 	vmbus_interrupt = hwirq;
2495 	pr_debug("VMBus virq %d, hwirq %d\n", vmbus_irq, vmbus_interrupt);
2496 
2497 	return 0;
2498 }
2499 
/*
 * vmbus_device_add - DT probe path: record the VMBus root device,
 * optionally obtain the VMBus interrupt, and collect the MMIO ranges
 * from the device tree "ranges" property into the hyperv_mmio list.
 */
static int vmbus_device_add(struct platform_device *pdev)
{
	/* Tail pointer for appending to the singly-linked sibling list. */
	struct resource **cur_res = &hyperv_mmio;
	struct of_range range;
	struct of_range_parser parser;
	struct device_node *np = pdev->dev.of_node;
	int ret;

	vmbus_root_device = &pdev->dev;

	ret = of_range_parser_init(&parser, np);
	if (ret)
		return ret;

	/*
	 * Architectures with a hardcoded hypervisor callback vector do
	 * not take the IRQ from the platform device; others must.
	 */
	if (!__is_defined(HYPERVISOR_CALLBACK_VECTOR))
		ret = vmbus_set_irq(pdev);
	if (ret)
		return ret;

	for_each_of_range(&parser, &range) {
		struct resource *res;

		res = kzalloc(sizeof(*res), GFP_KERNEL);
		if (!res) {
			vmbus_mmio_remove();
			return -ENOMEM;
		}

		res->name = "hyperv mmio";
		res->flags = range.flags;
		res->start = range.cpu_addr;
		/*
		 * NOTE(review): struct resource 'end' is conventionally
		 * inclusive, which would make this cpu_addr + size - 1.
		 * Confirm whether consumers compensate before changing.
		 */
		res->end = range.cpu_addr + range.size;

		*cur_res = res;
		cur_res = &res->sibling;
	}

	return ret;
}
2539 
2540 static int vmbus_platform_driver_probe(struct platform_device *pdev)
2541 {
2542 	if (acpi_disabled)
2543 		return vmbus_device_add(pdev);
2544 	else
2545 		return vmbus_acpi_add(pdev);
2546 }
2547 
static void vmbus_platform_driver_remove(struct platform_device *pdev)
{
	/* Free the MMIO ranges (and fb claim) gathered at probe time. */
	vmbus_mmio_remove();
}
2552 
2553 #ifdef CONFIG_PM_SLEEP
/*
 * vmbus_bus_suspend - quiesce the VMBus connection for hibernation:
 * stop accepting channel offers, drain the channel work queues, force
 * hv_sock channels to be rescinded, wait for sub-channels to close,
 * invalidate every channel's relid, and send an unload to the host.
 */
static int vmbus_bus_suspend(struct device *dev)
{
	struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(
			hv_context.cpu_context, VMBUS_CONNECT_CPU);
	struct vmbus_channel *channel, *sc;

	/*
	 * Set the flag with the message DPC disabled so it cannot be
	 * mid-run while the flag changes.
	 */
	tasklet_disable(&hv_cpu->msg_dpc);
	vmbus_connection.ignore_any_offer_msg = true;
	/* The tasklet_enable() takes care of providing a memory barrier */
	tasklet_enable(&hv_cpu->msg_dpc);

	/* Drain all the workqueues as we are in suspend */
	drain_workqueue(vmbus_connection.rescind_work_queue);
	drain_workqueue(vmbus_connection.work_queue);
	drain_workqueue(vmbus_connection.handle_primary_chan_wq);
	drain_workqueue(vmbus_connection.handle_sub_chan_wq);

	mutex_lock(&vmbus_connection.channel_mutex);
	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		if (!is_hvsock_channel(channel))
			continue;

		vmbus_force_channel_rescinded(channel);
	}
	mutex_unlock(&vmbus_connection.channel_mutex);

	/*
	 * Wait until all the sub-channels and hv_sock channels have been
	 * cleaned up. Sub-channels should be destroyed upon suspend, otherwise
	 * they would conflict with the new sub-channels that will be created
	 * in the resume path. hv_sock channels should also be destroyed, but
	 * a hv_sock channel of an established hv_sock connection can not be
	 * really destroyed since it may still be referenced by the userspace
	 * application, so we just force the hv_sock channel to be rescinded
	 * by vmbus_force_channel_rescinded(), and the userspace application
	 * will thoroughly destroy the channel after hibernation.
	 *
	 * Note: the counter nr_chan_close_on_suspend may never go above 0 if
	 * the VM has no sub-channel and hv_sock channel, e.g. a 1-vCPU VM.
	 */
	if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0)
		wait_for_completion(&vmbus_connection.ready_for_suspend_event);

	mutex_lock(&vmbus_connection.channel_mutex);

	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		/*
		 * Remove the channel from the array of channels and invalidate
		 * the channel's relid.  Upon resume, vmbus_onoffer() will fix
		 * up the relid (and other fields, if necessary) and add the
		 * channel back to the array.
		 */
		vmbus_channel_unmap_relid(channel);
		channel->offermsg.child_relid = INVALID_RELID;

		/* hv_sock channels survive suspend but must be rescinded. */
		if (is_hvsock_channel(channel)) {
			if (!channel->rescind) {
				pr_err("hv_sock channel not rescinded!\n");
				WARN_ON_ONCE(1);
			}
			continue;
		}

		/* Any remaining sub-channel at this point is a bug. */
		list_for_each_entry(sc, &channel->sc_list, sc_list) {
			pr_err("Sub-channel not deleted!\n");
			WARN_ON_ONCE(1);
		}
	}

	mutex_unlock(&vmbus_connection.channel_mutex);

	vmbus_initiate_unload(false);

	return 0;
}
2629 
2630 static int vmbus_bus_resume(struct device *dev)
2631 {
2632 	struct vmbus_channel *channel;
2633 	struct vmbus_channel_msginfo *msginfo;
2634 	size_t msgsize;
2635 	int ret;
2636 
2637 	vmbus_connection.ignore_any_offer_msg = false;
2638 
2639 	/*
2640 	 * We only use the 'vmbus_proto_version', which was in use before
2641 	 * hibernation, to re-negotiate with the host.
2642 	 */
2643 	if (!vmbus_proto_version) {
2644 		pr_err("Invalid proto version = 0x%x\n", vmbus_proto_version);
2645 		return -EINVAL;
2646 	}
2647 
2648 	msgsize = sizeof(*msginfo) +
2649 		  sizeof(struct vmbus_channel_initiate_contact);
2650 
2651 	msginfo = kzalloc(msgsize, GFP_KERNEL);
2652 
2653 	if (msginfo == NULL)
2654 		return -ENOMEM;
2655 
2656 	ret = vmbus_negotiate_version(msginfo, vmbus_proto_version);
2657 
2658 	kfree(msginfo);
2659 
2660 	if (ret != 0)
2661 		return ret;
2662 
2663 	vmbus_request_offers();
2664 
2665 	mutex_lock(&vmbus_connection.channel_mutex);
2666 	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
2667 		if (channel->offermsg.child_relid != INVALID_RELID)
2668 			continue;
2669 
2670 		/* hvsock channels are not expected to be present. */
2671 		if (is_hvsock_channel(channel))
2672 			continue;
2673 
2674 		pr_err("channel %pUl/%pUl not present after resume.\n",
2675 		       &channel->offermsg.offer.if_type,
2676 		       &channel->offermsg.offer.if_instance);
2677 		/* ToDo: Cleanup these channels here */
2678 	}
2679 	mutex_unlock(&vmbus_connection.channel_mutex);
2680 
2681 	/* Reset the event for the next suspend. */
2682 	reinit_completion(&vmbus_connection.ready_for_suspend_event);
2683 
2684 	return 0;
2685 }
2686 #else
2687 #define vmbus_bus_suspend NULL
2688 #define vmbus_bus_resume NULL
2689 #endif /* CONFIG_PM_SLEEP */
2690 
/* Device tree match table, used when the VMBus node is DT-enumerated. */
static const __maybe_unused struct of_device_id vmbus_of_match[] = {
	{
		.compatible = "microsoft,vmbus",
	},
	{
		/* sentinel */
	},
};
MODULE_DEVICE_TABLE(of, vmbus_of_match);
2700 
/* ACPI match table; both "VMBUS" and "VMBus" hardware IDs are matched. */
static const __maybe_unused struct acpi_device_id vmbus_acpi_device_ids[] = {
	{"VMBUS", 0},
	{"VMBus", 0},
	{"", 0},
};
MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids);
2707 
2708 /*
2709  * Note: we must use the "no_irq" ops, otherwise hibernation can not work with
2710  * PCI device assignment, because "pci_dev_pm_ops" uses the "noirq" ops: in
2711  * the resume path, the pci "noirq" restore op runs before "non-noirq" op (see
2712  * resume_target_kernel() -> dpm_resume_start(), and hibernation_restore() ->
2713  * dpm_resume_end()). This means vmbus_bus_resume() and the pci-hyperv's
2714  * resume callback must also run via the "noirq" ops.
2715  *
2716  * Set suspend_noirq/resume_noirq to NULL for Suspend-to-Idle: see the comment
2717  * earlier in this file before vmbus_pm.
2718  */
2719 
static const struct dev_pm_ops vmbus_bus_pm = {
	/* Suspend-to-Idle deliberately unhandled; see the comment above. */
	.suspend_noirq	= NULL,
	.resume_noirq	= NULL,
	/* Hibernation uses the "noirq" phases; see the comment above. */
	.freeze_noirq	= vmbus_bus_suspend,
	.thaw_noirq	= vmbus_bus_resume,
	.poweroff_noirq	= vmbus_bus_suspend,
	.restore_noirq	= vmbus_bus_resume
};
2728 
static struct platform_driver vmbus_platform_driver = {
	.probe = vmbus_platform_driver_probe,
	.remove = vmbus_platform_driver_remove,
	.driver = {
		.name = "vmbus",
		.acpi_match_table = ACPI_PTR(vmbus_acpi_device_ids),
		.of_match_table = of_match_ptr(vmbus_of_match),
		.pm = &vmbus_bus_pm,
		/*
		 * Probe synchronously: hv_acpi_init() checks
		 * vmbus_root_device right after driver registration.
		 */
		.probe_type = PROBE_FORCE_SYNCHRONOUS,
	}
};
2740 
2741 static void hv_kexec_handler(void)
2742 {
2743 	hv_stimer_global_cleanup();
2744 	vmbus_initiate_unload(false);
2745 	/* Make sure conn_state is set as hv_synic_cleanup checks for it */
2746 	mb();
2747 	cpuhp_remove_state(hyperv_cpuhp_online);
2748 };
2749 
2750 static void hv_crash_handler(struct pt_regs *regs)
2751 {
2752 	int cpu;
2753 
2754 	vmbus_initiate_unload(true);
2755 	/*
2756 	 * In crash handler we can't schedule synic cleanup for all CPUs,
2757 	 * doing the cleanup for current CPU only. This should be sufficient
2758 	 * for kdump.
2759 	 */
2760 	cpu = smp_processor_id();
2761 	hv_stimer_cleanup(cpu);
2762 	hv_synic_disable_regs(cpu);
2763 };
2764 
/* Syscore suspend: disable the SynIC registers on CPU0, the only CPU left. */
static int hv_synic_suspend(void)
{
	/*
	 * When we reach here, all the non-boot CPUs have been offlined.
	 * If we're in a legacy configuration where stimer Direct Mode is
	 * not enabled, the stimers on the non-boot CPUs have been unbound
	 * in hv_synic_cleanup() -> hv_stimer_legacy_cleanup() ->
	 * hv_stimer_cleanup() -> clockevents_unbind_device().
	 *
	 * hv_synic_suspend() only runs on CPU0 with interrupts disabled.
	 * Here we do not call hv_stimer_legacy_cleanup() on CPU0 because:
	 * 1) it's unnecessary as interrupts remain disabled between
	 * syscore_suspend() and syscore_resume(): see create_image() and
	 * resume_target_kernel()
	 * 2) the stimer on CPU0 is automatically disabled later by
	 * syscore_suspend() -> timekeeping_suspend() -> tick_suspend() -> ...
	 * -> clockevents_shutdown() -> ... -> hv_ce_shutdown()
	 * 3) a warning would be triggered if we call
	 * clockevents_unbind_device(), which may sleep, in an
	 * interrupts-disabled context.
	 */

	hv_synic_disable_regs(0);

	return 0;
}
2791 
/* Syscore resume counterpart: re-enable the SynIC registers on CPU0. */
static void hv_synic_resume(void)
{
	hv_synic_enable_regs(0);

	/*
	 * Note: we don't need to call hv_stimer_init(0), because the timer
	 * on CPU0 is not unbound in hv_synic_suspend(), and the timer is
	 * automatically re-enabled in timekeeping_resume().
	 */
}
2802 
/*
 * The callbacks run only on CPU0, with irqs_disabled. Registered in
 * hv_acpi_init() and unregistered in vmbus_exit().
 */
static struct syscore_ops hv_synic_syscore_ops = {
	.suspend = hv_synic_suspend,
	.resume = hv_synic_resume,
};
2808 
/*
 * hv_acpi_init - driver entry point: register the platform driver
 * (which discovers ACPI/DT resources), initialize the VMBus, and
 * install the kexec/crash/syscore handlers.
 *
 * Returns -ENODEV when not running on Hyper-V; does nothing in the
 * root partition unless running nested.
 */
static int __init hv_acpi_init(void)
{
	int ret;

	if (!hv_is_hyperv_initialized())
		return -ENODEV;

	if (hv_root_partition() && !hv_nested)
		return 0;

	/*
	 * Get ACPI resources first.
	 */
	ret = platform_driver_register(&vmbus_platform_driver);
	if (ret)
		return ret;

	/* The probe callback sets vmbus_root_device; bail if it never ran. */
	if (!vmbus_root_device) {
		ret = -ENODEV;
		goto cleanup;
	}

	/*
	 * If we're on an architecture with a hardcoded hypervisor
	 * vector (i.e. x86/x64), override the VMbus interrupt found
	 * in the ACPI tables. Ensure vmbus_irq is not set since the
	 * normal Linux IRQ mechanism is not used in this case.
	 */
#ifdef HYPERVISOR_CALLBACK_VECTOR
	vmbus_interrupt = HYPERVISOR_CALLBACK_VECTOR;
	vmbus_irq = -1;
#endif

	hv_debug_init();

	ret = vmbus_bus_init();
	if (ret)
		goto cleanup;

	hv_setup_kexec_handler(hv_kexec_handler);
	hv_setup_crash_handler(hv_crash_handler);

	register_syscore_ops(&hv_synic_syscore_ops);

	return 0;

cleanup:
	platform_driver_unregister(&vmbus_platform_driver);
	vmbus_root_device = NULL;
	return ret;
}
2860 
/* Module exit: unwind everything set up by hv_acpi_init()/vmbus_bus_init(). */
static void __exit vmbus_exit(void)
{
	int cpu;

	unregister_syscore_ops(&hv_synic_syscore_ops);

	hv_remove_kexec_handler();
	hv_remove_crash_handler();
	vmbus_connection.conn_state = DISCONNECTED;
	hv_stimer_global_cleanup();
	vmbus_disconnect();
	/* vmbus_irq == -1 means the hardcoded hypervisor vector is in use. */
	if (vmbus_irq == -1) {
		hv_remove_vmbus_handler();
	} else {
		free_percpu_irq(vmbus_irq, vmbus_evt);
		free_percpu(vmbus_evt);
	}
	/* Stop per-CPU message processing before freeing channel state. */
	for_each_online_cpu(cpu) {
		struct hv_per_cpu_context *hv_cpu
			= per_cpu_ptr(hv_context.cpu_context, cpu);

		tasklet_kill(&hv_cpu->msg_dpc);
	}
	hv_debug_rm_all_dir();

	vmbus_free_channels();
	kfree(vmbus_connection.channels);

	/*
	 * The vmbus panic notifier is always registered, hence we should
	 * also unconditionally unregister it here as well.
	 */
	atomic_notifier_chain_unregister(&panic_notifier_list,
					&hyperv_panic_vmbus_unload_block);

	bus_unregister(&hv_bus);

	cpuhp_remove_state(hyperv_cpuhp_online);
	hv_synic_free();
	platform_driver_unregister(&vmbus_platform_driver);
}
2902 
2903 
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V VMBus Driver");

/* Initialize at subsys_initcall time; see hv_acpi_init() for setup. */
subsys_initcall(hv_acpi_init);
module_exit(vmbus_exit);
2909